From 1913a5b1053c66cee3be51322fc9f354d00f93cd Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Wed, 11 Dec 2019 15:04:52 +0800 Subject: [PATCH 001/578] create v2.0.0-release branch --- src/backend/utils/adt/version.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/utils/adt/version.c b/src/backend/utils/adt/version.c index 479682c5..9b7f49d3 100644 --- a/src/backend/utils/adt/version.c +++ b/src/backend/utils/adt/version.c @@ -78,7 +78,7 @@ #include "utils/builtins.h" -#define TBASE_VERSION_STR "TBase_V2.0.0" +#define TBASE_VERSION_STR "TBase_V2.0.0_release" Datum pgsql_version(PG_FUNCTION_ARGS) From ea325d947fcd5a4037fc2d41d45ffc5d95585599 Mon Sep 17 00:00:00 2001 From: youngxie Date: Mon, 22 Mar 2021 15:37:37 +0800 Subject: [PATCH 002/578] Revoke pgxc_node public accessed priviledge when init. --- src/bin/initdb/initdb.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c index e7186d78..1e644e09 100644 --- a/src/bin/initdb/initdb.c +++ b/src/bin/initdb/initdb.c @@ -275,6 +275,7 @@ static void test_config_settings(void); static void setup_config(void); static void bootstrap_template1(void); static void setup_auth(FILE *cmdfd); +static void setup_pgxc_node(FILE *cmdfd); static void get_su_pwd(void); static void setup_depend(FILE *cmdfd); static void setup_sysviews(FILE *cmdfd); @@ -1518,6 +1519,30 @@ setup_auth(FILE *cmdfd) username, escape_quotes(superuser_password)); } +/* + * set up the pgxc_node table + */ +static void +setup_pgxc_node(FILE *cmdfd) +{ + const char *const *line; + static const char *const pgxc_node_setup[] = { + /* + * Grant all priviledge except node_host and node_port + */ + "REVOKE ALL on pgxc_node FROM public;\n\n", + + "GRANT ALL (xmin_gts, xmax_gts, shardid, xc_node_id , " + " tableoid, cmax, xmax, cmin, xmin, oid, ctid, node_name," + " node_type, nodeis_primary, nodeis_preferred, node_id, " + " node_cluster_name) ON pgxc_node TO public;\n\n", + NULL + }; + + for (line = pgxc_node_setup; *line != NULL; line++) + PG_CMD_PUTS(*line); +} + /* * get the superuser password if required */ @@ -3122,6 +3147,8 @@ initialize_data_directory(void) setup_auth(cmdfd); + setup_pgxc_node(cmdfd); + setup_depend(cmdfd); /* From f059816cf80be8f8879368430fb125856c64dec9 Mon Sep 17 00:00:00 2001 From: ericxwu Date: Tue, 23 Jun 2020 22:54:19 +0800 Subject: [PATCH 003/578] fix regress tests for join and some others --- src/test/regress/expected/join_3.out | 509 +++++++++++--------- src/test/regress/expected/mls_check.out | 6 +- src/test/regress/expected/rowsecurity_1.out | 4 +- src/test/regress/expected/tsrf_1.out | 4 +- src/test/regress/sql/join.sql | 20 + src/test/regress/sql/mls_check.sql | 2 +- src/test/regress/sql/rowsecurity.sql | 2 +- src/test/regress/sql/tsrf.sql | 4 +- 8 files changed, 309 insertions(+), 242 deletions(-) diff --git a/src/test/regress/expected/join_3.out b/src/test/regress/expected/join_3.out index 040c4e20..8e9360aa 100644 --- a/src/test/regress/expected/join_3.out +++ b/src/test/regress/expected/join_3.out @@ -2223,6 +2223,7 @@ select aa, bb, unique1, unique1 -- -- regression test: check handling of empty-FROM subquery underneath outer join -- +set enable_nestloop to off; explain (costs off) select * from int8_tbl i1 left join (int8_tbl i2 join (select 123 as x) ss on i2.q1 = x) on i1.q2 = i2.q2 @@ -2256,6 +2257,7 @@ order by 1, 2; 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 | 123 (5 rows) +reset enable_nestloop; -- -- regression 
test: check a case where join_clause_is_movable_into() gives -- an imprecise result, causing an assertion failure @@ -3336,8 +3338,8 @@ select b.unique1 from join int4_tbl i1 on b.thousand = f1 right join int4_tbl i2 on i2.f1 = b.tenthous order by 1; - QUERY PLAN -------------------------------------------------------------------------------------------------------------- + QUERY PLAN +----------------------------------------------------------------------------------------------------------------------- Remote Subquery Scan on all -> Sort Sort Key: b.unique1 @@ -3348,23 +3350,25 @@ select b.unique1 from -> Materialize -> Remote Subquery Scan on all Distribute results by H: tenthous - -> Nested Loop - Join Filter: (b.thousand = i1.f1) - -> Nested Loop Left Join - Join Filter: (b.unique1 = 42) + -> Nested Loop Left Join + Join Filter: (b.unique1 = 42) + -> Remote Subquery Scan on all + Distribute results by H: unique1 -> Nested Loop -> Remote Subquery Scan on all Distribute results by H: unique2 - -> Index Scan using tenk1_thous_tenthous on tenk1 b - Index Cond: (i2.f1 = tenthous) + -> Nested Loop + -> Seq Scan on int4_tbl i1 + -> Index Scan using tenk1_thous_tenthous on tenk1 b + Index Cond: ((thousand = i1.f1) AND (i2.f1 = tenthous)) -> Index Scan using tenk1_unique1 on tenk1 a Index Cond: (unique1 = b.unique2) - -> Materialize - -> Remote Subquery Scan on all - -> Index Only Scan using tenk1_thous_tenthous on tenk1 c - Index Cond: (thousand = a.thousand) - -> Seq Scan on int4_tbl i1 -(26 rows) + -> Materialize + -> Remote Subquery Scan on all + Distribute results by H: 42 + -> Index Only Scan using tenk1_thous_tenthous on tenk1 c + Index Cond: (thousand = a.thousand) +(28 rows) select b.unique1 from tenk1 a join tenk1 b on a.unique1 = b.unique2 @@ -3573,6 +3577,9 @@ using (join_key); -- -- test successful handling of nested outer joins with degenerate join quals -- +set enable_nestloop to on; +set enable_hashjoin to off; +set enable_mergejoin to off; explain (verbose, costs off) select t1.* from text_tbl t1 @@ -3584,13 +3591,13 @@ select t1.* from on (t1.f1 = b1.d1) left join int4_tbl i4 on (i8.q2 = i4.f1); - QUERY PLAN ----------------------------------------------------------------------------- + QUERY PLAN +---------------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) Output: t1.f1 - -> Hash Left Join + -> Nested Loop Left Join Output: t1.f1 - Hash Cond: (i8.q2 = i4.f1) + Join Filter: (i8.q2 = i4.f1) -> Nested Loop Left Join Output: t1.f1, i8.q2 Join Filter: (t1.f1 = '***'::text) @@ -3598,23 +3605,23 @@ select t1.* from Output: t1.f1 -> Materialize Output: i8.q2 - -> Hash Right Join + -> Nested Loop Left Join Output: i8.q2 - Hash Cond: ((NULL::integer) = i8b1.q2) - -> Hash Left Join + Join Filter: ((NULL::integer) = i8b1.q2) + -> Seq Scan on public.int8_tbl i8b1 + Output: i8b1.q1, i8b1.q2 + -> Materialize Output: i8.q2, (NULL::integer) - Hash Cond: (i8.q1 = i8b2.q1) - -> Seq Scan on public.int8_tbl i8 - Output: i8.q1, i8.q2 - -> Hash - Output: i8b2.q1, (NULL::integer) - -> Seq Scan on public.int8_tbl i8b2 - Output: i8b2.q1, NULL::integer - -> Hash - Output: i8b1.q2 - -> Seq Scan on public.int8_tbl i8b1 - Output: i8b1.q2 - -> Hash + -> Nested Loop Left Join + Output: i8.q2, (NULL::integer) + Join Filter: (i8.q1 = i8b2.q1) + -> Seq Scan on public.int8_tbl i8 + Output: i8.q1, i8.q2 + -> Materialize + Output: i8b2.q1, (NULL::integer) + -> Seq Scan on public.int8_tbl i8b2 + Output: i8b2.q1, NULL::integer + -> 
Materialize Output: i4.f1 -> Seq Scan on public.int4_tbl i4 Output: i4.f1 @@ -3647,13 +3654,13 @@ select t1.* from on (t1.f1 = b1.d1) left join int4_tbl i4 on (i8.q2 = i4.f1); - QUERY PLAN ----------------------------------------------------------------------------------- + QUERY PLAN +---------------------------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) Output: t1.f1 - -> Hash Left Join + -> Nested Loop Left Join Output: t1.f1 - Hash Cond: (i8.q2 = i4.f1) + Join Filter: (i8.q2 = i4.f1) -> Nested Loop Left Join Output: t1.f1, i8.q2 Join Filter: (t1.f1 = '***'::text) @@ -3661,27 +3668,27 @@ select t1.* from Output: t1.f1 -> Materialize Output: i8.q2 - -> Hash Right Join + -> Nested Loop Left Join Output: i8.q2 - Hash Cond: ((NULL::integer) = i8b1.q2) - -> Hash Right Join + Join Filter: ((NULL::integer) = i8b1.q2) + -> Seq Scan on public.int8_tbl i8b1 + Output: i8b1.q1, i8b1.q2 + -> Materialize Output: i8.q2, (NULL::integer) - Hash Cond: (i8b2.q1 = i8.q1) - -> Nested Loop - Output: i8b2.q1, NULL::integer - -> Seq Scan on public.int8_tbl i8b2 - Output: i8b2.q1, i8b2.q2 - -> Materialize - -> Seq Scan on public.int4_tbl i4b2 - -> Hash - Output: i8.q1, i8.q2 + -> Nested Loop Left Join + Output: i8.q2, (NULL::integer) + Join Filter: (i8.q1 = i8b2.q1) -> Seq Scan on public.int8_tbl i8 Output: i8.q1, i8.q2 - -> Hash - Output: i8b1.q2 - -> Seq Scan on public.int8_tbl i8b1 - Output: i8b1.q2 - -> Hash + -> Materialize + Output: i8b2.q1, (NULL::integer) + -> Nested Loop + Output: i8b2.q1, NULL::integer + -> Seq Scan on public.int8_tbl i8b2 + Output: i8b2.q1, i8b2.q2 + -> Materialize + -> Seq Scan on public.int4_tbl i4b2 + -> Materialize Output: i4.f1 -> Seq Scan on public.int4_tbl i4 Output: i4.f1 @@ -3715,13 +3722,13 @@ select t1.* from on (t1.f1 = b1.d1) left join int4_tbl i4 on (i8.q2 = i4.f1); - QUERY PLAN ----------------------------------------------------------------------------------- + QUERY PLAN +---------------------------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) Output: t1.f1 - -> Hash Left Join + -> Nested Loop Left Join Output: t1.f1 - Hash Cond: (i8.q2 = i4.f1) + Join Filter: (i8.q2 = i4.f1) -> Nested Loop Left Join Output: t1.f1, i8.q2 Join Filter: (t1.f1 = '***'::text) @@ -3729,30 +3736,30 @@ select t1.* from Output: t1.f1 -> Materialize Output: i8.q2 - -> Hash Right Join + -> Nested Loop Left Join Output: i8.q2 - Hash Cond: ((NULL::integer) = i8b1.q2) - -> Hash Right Join + Join Filter: ((NULL::integer) = i8b1.q2) + -> Seq Scan on public.int8_tbl i8b1 + Output: i8b1.q1, i8b1.q2 + -> Materialize Output: i8.q2, (NULL::integer) - Hash Cond: (i8b2.q1 = i8.q1) - -> Hash Join - Output: i8b2.q1, NULL::integer - Hash Cond: (i8b2.q1 = i4b2.f1) - -> Seq Scan on public.int8_tbl i8b2 - Output: i8b2.q1, i8b2.q2 - -> Hash - Output: i4b2.f1 - -> Seq Scan on public.int4_tbl i4b2 - Output: i4b2.f1 - -> Hash - Output: i8.q1, i8.q2 + -> Nested Loop Left Join + Output: i8.q2, (NULL::integer) + Join Filter: (i8.q1 = i8b2.q1) -> Seq Scan on public.int8_tbl i8 Output: i8.q1, i8.q2 - -> Hash - Output: i8b1.q2 - -> Seq Scan on public.int8_tbl i8b1 - Output: i8b1.q2 - -> Hash + -> Materialize + Output: i8b2.q1, (NULL::integer) + -> Nested Loop + Output: i8b2.q1, NULL::integer + Join Filter: (i8b2.q1 = i4b2.f1) + -> Seq Scan on public.int8_tbl i8b2 + Output: i8b2.q1, i8b2.q2 + -> Materialize + Output: i4b2.f1 + -> Seq Scan on public.int4_tbl i4b2 + Output: 
i4b2.f1 + -> Materialize Output: i4.f1 -> Seq Scan on public.int4_tbl i4 Output: i4.f1 @@ -3825,6 +3832,9 @@ select * from doh! | 123 | 456 | doh! | (2 rows) +reset enable_nestloop; +reset enable_hashjoin; +reset enable_mergejoin; -- -- test for appropriate join order in the presence of lateral references -- @@ -3835,8 +3845,8 @@ select * from on i8.q2 = 123, lateral (select i8.q1, t2.f1 from text_tbl t2 limit 1) as ss where t1.f1 = ss.f1; - QUERY PLAN ------------------------------------------------------------------ + QUERY PLAN +----------------------------------------------------------------------- Nested Loop Output: t1.f1, i8.q1, i8.q2, (i8.q1), t2.f1 Join Filter: (t1.f1 = t2.f1) @@ -3851,15 +3861,17 @@ where t1.f1 = ss.f1; -> Seq Scan on public.int8_tbl i8 Output: i8.q1, i8.q2 Filter: (i8.q2 = 123) - -> Limit + -> Materialize Output: (i8.q1), t2.f1 - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Output: i8.q1, t2.f1 - -> Limit - Output: (i8.q1), t2.f1 - -> Seq Scan on public.text_tbl t2 - Output: i8.q1, t2.f1 -(22 rows) + -> Limit + Output: (i8.q1), t2.f1 + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: i8.q1, t2.f1 + -> Limit + Output: (i8.q1), t2.f1 + -> Seq Scan on public.text_tbl t2 + Output: i8.q1, t2.f1 +(24 rows) select * from text_tbl t1 @@ -3880,8 +3892,8 @@ select * from lateral (select i8.q1, t2.f1 from text_tbl t2 limit 1) as ss1, lateral (select ss1.* from text_tbl t3 limit 1) as ss2 where t1.f1 = ss2.f1; - QUERY PLAN ------------------------------------------------------------------------ + QUERY PLAN +----------------------------------------------------------------------------- Nested Loop Output: t1.f1, i8.q1, i8.q2, (i8.q1), t2.f1, ((i8.q1)), (t2.f1) Join Filter: (t1.f1 = (t2.f1)) @@ -3898,23 +3910,27 @@ where t1.f1 = ss2.f1; -> Seq Scan on public.int8_tbl i8 Output: i8.q1, i8.q2 Filter: (i8.q2 = 123) - -> Limit + -> Materialize Output: (i8.q1), t2.f1 + -> Limit + Output: (i8.q1), t2.f1 + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: i8.q1, t2.f1 + -> Limit + Output: (i8.q1), t2.f1 + -> Seq Scan on public.text_tbl t2 + Output: i8.q1, t2.f1 + -> Materialize + Output: ((i8.q1)), (t2.f1) + -> Limit + Output: ((i8.q1)), (t2.f1) -> Remote Subquery Scan on all (datanode_1,datanode_2) - Output: i8.q1, t2.f1 + Output: (i8.q1), t2.f1 -> Limit - Output: (i8.q1), t2.f1 - -> Seq Scan on public.text_tbl t2 - Output: i8.q1, t2.f1 - -> Limit - Output: ((i8.q1)), (t2.f1) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Output: (i8.q1), t2.f1 - -> Limit - Output: ((i8.q1)), (t2.f1) - -> Seq Scan on public.text_tbl t3 - Output: (i8.q1), t2.f1 -(32 rows) + Output: ((i8.q1)), (t2.f1) + -> Seq Scan on public.text_tbl t3 + Output: (i8.q1), t2.f1 +(36 rows) select * from text_tbl t1 @@ -3965,18 +3981,20 @@ where tt1.f1 = ss1.c0; -> Seq Scan on public.text_tbl tt4 Output: tt4.f1 Filter: (tt4.f1 = 'foo'::text) - -> Subquery Scan on ss1 + -> Materialize Output: ss1.c0 - Filter: (ss1.c0 = 'foo'::text) - -> Limit - Output: (tt4.f1) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Output: tt4.f1 - -> Limit - Output: (tt4.f1) - -> Seq Scan on public.text_tbl tt5 - Output: tt4.f1 -(38 rows) + -> Subquery Scan on ss1 + Output: ss1.c0 + Filter: (ss1.c0 = 'foo'::text) + -> Limit + Output: (tt4.f1) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: tt4.f1 + -> Limit + Output: (tt4.f1) + -> Seq Scan on public.text_tbl tt5 + Output: tt4.f1 +(40 rows) select 1 from text_tbl as tt1 @@ -4002,8 +4020,8 @@ select ss2.* from 
on i41.f1 = ss1.c1, lateral (select i41.*, i8.*, ss1.* from text_tbl limit 1) ss2 where ss1.c2 = 0; - QUERY PLAN ------------------------------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------------------------------------------ Nested Loop Output: (i41.f1), (i8.q1), (i8.q2), (i42.f1), (i43.f1), ((42)) -> Remote Subquery Scan on all (datanode_1) @@ -4027,15 +4045,17 @@ where ss1.c2 = 0; Output: i42.f1 -> Seq Scan on public.int4_tbl i42 Output: i42.f1 - -> Limit + -> Materialize Output: (i41.f1), (i8.q1), (i8.q2), (i42.f1), (i43.f1), ((42)) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Output: i41.f1, i8.q1, i8.q2, i42.f1, i43.f1, (42) - -> Limit - Output: (i41.f1), (i8.q1), (i8.q2), (i42.f1), (i43.f1), ((42)) - -> Seq Scan on public.text_tbl - Output: i41.f1, i8.q1, i8.q2, i42.f1, i43.f1, (42) -(31 rows) + -> Limit + Output: (i41.f1), (i8.q1), (i8.q2), (i42.f1), (i43.f1), ((42)) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: i41.f1, i8.q1, i8.q2, i42.f1, i43.f1, (42) + -> Limit + Output: (i41.f1), (i8.q1), (i8.q2), (i42.f1), (i43.f1), ((42)) + -> Seq Scan on public.text_tbl + Output: i41.f1, i8.q1, i8.q2, i42.f1, i43.f1, (42) +(33 rows) select ss2.* from int4_tbl i41 @@ -4059,18 +4079,19 @@ select * from left join (tenk1 as a1 full join (select 1 as id) as yy on (a1.unique1 = yy.id)) on (xx.id = coalesce(yy.id)); - QUERY PLAN ------------------------------------------------------------------ + QUERY PLAN +----------------------------------------------------------------------- Nested Loop Left Join Join Filter: ((1) = COALESCE((1))) -> Result - -> Hash Full Join - Hash Cond: (a1.unique1 = (1)) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Seq Scan on tenk1 a1 - -> Hash - -> Result -(9 rows) + -> Materialize + -> Hash Full Join + Hash Cond: (a1.unique1 = (1)) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on tenk1 a1 + -> Hash + -> Result +(10 rows) select * from (select 1 as id) as xx @@ -4699,8 +4720,8 @@ select * from generate_series(100,200) g, explain (num_nodes off, nodes off, costs off) select count(*) from tenk1 a, tenk1 b join lateral (values(a.unique1)) ss(x) on b.unique2 = ss.x; - QUERY PLAN ------------------------------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------------------ Finalize Aggregate -> Remote Subquery Scan on all -> Partial Aggregate @@ -4710,7 +4731,7 @@ explain (num_nodes off, nodes off, costs off) -> Hash -> Remote Subquery Scan on all Distribute results by H: unique2 - -> Index Only Scan using tenk1_unique2 on tenk1 b + -> Seq Scan on tenk1 b (10 rows) select count(*) from tenk1 a, @@ -4724,8 +4745,8 @@ select count(*) from tenk1 a, explain (num_nodes off, nodes off, costs off) select count(*) from tenk1 a, tenk1 b join lateral (values(a.unique1),(-1)) ss(x) on b.unique2 = ss.x; - QUERY PLAN ------------------------------------------------------------------------- + QUERY PLAN +----------------------------------------------------- Aggregate -> Hash Join Hash Cond: ("*VALUES*".column1 = b.unique2) @@ -4735,7 +4756,7 @@ explain (num_nodes off, nodes off, costs off) -> Values Scan on "*VALUES*" -> Hash -> Remote Subquery Scan on all - -> Index Only Scan using tenk1_unique2 on tenk1 b + -> Seq Scan on tenk1 b (10 rows) select count(*) from tenk1 a, @@ -4746,6 +4767,8 @@ select count(*) from tenk1 a, (1 row) -- lateral injecting a strange outer 
join condition +set enable_hashjoin to off; +set enable_mergejoin to off; explain (num_nodes off, nodes off, costs off) select * from int8_tbl a, int8_tbl x left join lateral (select a.q1 from int4_tbl y) ss(z) @@ -4758,11 +4781,11 @@ explain (num_nodes off, nodes off, costs off) Sort Key: a.q1, a.q2, x.q1, x.q2, (a.q1) -> Nested Loop -> Seq Scan on int8_tbl a - -> Hash Right Join - Hash Cond: ((a.q1) = x.q2) - -> Seq Scan on int4_tbl y - -> Hash - -> Seq Scan on int8_tbl x + -> Nested Loop Left Join + Join Filter: (x.q2 = (a.q1)) + -> Seq Scan on int8_tbl x + -> Materialize + -> Seq Scan on int4_tbl y (10 rows) select * from int8_tbl a, @@ -4830,27 +4853,29 @@ select * from int8_tbl a, 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 (57 rows) +reset enable_hashjoin; +reset enable_mergejoin; -- lateral reference to a join alias variable select * from (select f1/2 as x from int4_tbl) ss1 join int4_tbl i4 on x = f1, - lateral (select x) ss2(y); + lateral (select x) ss2(y) order by 1,2,3; x | f1 | y ---+----+--- 0 | 0 | 0 (1 row) select * from (select f1 as x from int4_tbl) ss1 join int4_tbl i4 on x = f1, - lateral (values(x)) ss2(y); + lateral (values(x)) ss2(y) order by 1,2,3; x | f1 | y -------------+-------------+------------- + -2147483647 | -2147483647 | -2147483647 + -123456 | -123456 | -123456 0 | 0 | 0 123456 | 123456 | 123456 - -123456 | -123456 | -123456 2147483647 | 2147483647 | 2147483647 - -2147483647 | -2147483647 | -2147483647 (5 rows) select * from ((select f1/2 as x from int4_tbl) ss1 join int4_tbl i4 on x = f1) j, - lateral (select x) ss2(y); + lateral (select x) ss2(y) order by 1,2,3; x | f1 | y ---+----+--- 0 | 0 | 0 @@ -4858,7 +4883,7 @@ select * from ((select f1/2 as x from int4_tbl) ss1 join int4_tbl i4 on x = f1) -- lateral references requiring pullup select * from (values(1)) x(lb), - lateral generate_series(lb,4) x4; + lateral generate_series(lb,4) x4 order by 1,2; lb | x4 ----+---- 1 | 1 @@ -4868,38 +4893,38 @@ select * from (values(1)) x(lb), (4 rows) select * from (select f1/1000000000 from int4_tbl) x(lb), - lateral generate_series(lb,4) x4; + lateral generate_series(lb,4) x4 order by 1,2; lb | x4 ----+---- + -2 | -2 + -2 | -1 + -2 | 0 + -2 | 1 + -2 | 2 + -2 | 3 + -2 | 4 0 | 0 - 0 | 1 - 0 | 2 - 0 | 3 - 0 | 4 0 | 0 - 0 | 1 - 0 | 2 - 0 | 3 - 0 | 4 0 | 0 0 | 1 + 0 | 1 + 0 | 1 + 0 | 2 0 | 2 + 0 | 2 + 0 | 3 0 | 3 + 0 | 3 + 0 | 4 + 0 | 4 0 | 4 2 | 2 2 | 3 2 | 4 - -2 | -2 - -2 | -1 - -2 | 0 - -2 | 1 - -2 | 2 - -2 | 3 - -2 | 4 (25 rows) select * from (values(1)) x(lb), - lateral (values(lb)) y(lbcopy); + lateral (values(lb)) y(lbcopy) order by 1,2; lb | lbcopy ----+-------- 1 | 1 @@ -5176,6 +5201,8 @@ select * from int4_tbl i left join 2147483647 | (5 rows) +set enable_hashjoin to off; +set enable_mergejoin to off; explain (num_nodes off, nodes off, verbose, costs off) select * from int4_tbl a, lateral ( @@ -5189,12 +5216,12 @@ select * from int4_tbl a, Output: a.f1, b.f1, c.q1, c.q2 -> Seq Scan on public.int4_tbl a Output: a.f1 - -> Hash Left Join + -> Nested Loop Left Join Output: b.f1, c.q1, c.q2 - Hash Cond: (b.f1 = c.q1) + Join Filter: (b.f1 = c.q1) -> Seq Scan on public.int4_tbl b Output: b.f1 - -> Hash + -> Materialize Output: c.q1, c.q2 -> Seq Scan on public.int8_tbl c Output: c.q1, c.q2 @@ -5204,36 +5231,38 @@ select * from int4_tbl a, select * from int4_tbl a, lateral ( select * from int4_tbl b left join int8_tbl c on (b.f1 = q1 and a.f1 = q2) - ) ss; + ) ss order by 1,2,3,4; f1 | f1 | q1 | q2 
-------------+-------------+----+---- + -2147483647 | -2147483647 | | + -2147483647 | -123456 | | + -2147483647 | 0 | | + -2147483647 | 123456 | | + -2147483647 | 2147483647 | | + -123456 | -2147483647 | | + -123456 | -123456 | | + -123456 | 0 | | + -123456 | 123456 | | + -123456 | 2147483647 | | + 0 | -2147483647 | | + 0 | -123456 | | 0 | 0 | | 0 | 123456 | | - 0 | -123456 | | 0 | 2147483647 | | - 0 | -2147483647 | | + 123456 | -2147483647 | | + 123456 | -123456 | | 123456 | 0 | | 123456 | 123456 | | - 123456 | -123456 | | 123456 | 2147483647 | | - 123456 | -2147483647 | | - -123456 | 0 | | - -123456 | 123456 | | - -123456 | -123456 | | - -123456 | 2147483647 | | - -123456 | -2147483647 | | + 2147483647 | -2147483647 | | + 2147483647 | -123456 | | 2147483647 | 0 | | 2147483647 | 123456 | | - 2147483647 | -123456 | | 2147483647 | 2147483647 | | - 2147483647 | -2147483647 | | - -2147483647 | 0 | | - -2147483647 | 123456 | | - -2147483647 | -123456 | | - -2147483647 | 2147483647 | | - -2147483647 | -2147483647 | | (25 rows) +reset enable_hashjoin; +reset enable_mergejoin; -- lateral reference in a PlaceHolderVar evaluated at join level explain (num_nodes off, nodes off, verbose, costs off) select * from @@ -5476,44 +5505,46 @@ lateral (select * from int8_tbl t1, where q2 = (select greatest(t1.q1,t2.q2)) and (select v.id=0)) offset 0) ss2) ss where t1.q1 = ss.q2) ss0; - QUERY PLAN ------------------------------------------------------------------------------------ + QUERY PLAN +----------------------------------------------------------------------------------------- Nested Loop Output: "*VALUES*".column1, t1.q1, t1.q2, ss2.q1, ss2.q2 -> Remote Subquery Scan on all (datanode_1) Output: t1.q1, t1.q2 -> Seq Scan on public.int8_tbl t1 Output: t1.q1, t1.q2 - -> Nested Loop + -> Materialize Output: "*VALUES*".column1, ss2.q1, ss2.q2 - -> Values Scan on "*VALUES*" - Output: "*VALUES*".column1 - -> Materialize - Output: ss2.q1, ss2.q2 - -> Remote Subquery Scan on all (datanode_1) + -> Nested Loop + Output: "*VALUES*".column1, ss2.q1, ss2.q2 + -> Values Scan on "*VALUES*" + Output: "*VALUES*".column1 + -> Materialize Output: ss2.q1, ss2.q2 - -> Subquery Scan on ss2 + -> Remote Subquery Scan on all (datanode_1) Output: ss2.q1, ss2.q2 - Filter: (t1.q1 = ss2.q2) - -> Seq Scan on public.int8_tbl t2 - Output: t2.q1, t2.q2 - Filter: (SubPlan 3) - SubPlan 3 - -> Remote Subquery Scan on all (datanode_1) - Output: t3.q2 - -> Result + -> Subquery Scan on ss2 + Output: ss2.q1, ss2.q2 + Filter: (t1.q1 = ss2.q2) + -> Seq Scan on public.int8_tbl t2 + Output: t2.q1, t2.q2 + Filter: (SubPlan 3) + SubPlan 3 + -> Remote Subquery Scan on all (datanode_1) Output: t3.q2 - One-Time Filter: $4 - InitPlan 1 (returns $2) - -> Result - Output: GREATEST($0, t2.q2) - InitPlan 2 (returns $4) - -> Result - Output: ($3 = 0) - -> Seq Scan on public.int8_tbl t3 - Output: t3.q1, t3.q2 - Filter: (t3.q2 = $2) -(35 rows) + -> Result + Output: t3.q2 + One-Time Filter: $4 + InitPlan 1 (returns $2) + -> Result + Output: GREATEST($0, t2.q2) + InitPlan 2 (returns $4) + -> Result + Output: ($3 = 0) + -> Seq Scan on public.int8_tbl t3 + Output: t3.q1, t3.q2 + Filter: (t3.q2 = $2) +(37 rows) select * from (values (0), (1)) v(id), lateral (select * from int8_tbl t1, @@ -6064,34 +6095,48 @@ from onek t1, tenk1 t2 where exists (select 1 from tenk1 t3 where t3.thousand = t1.unique1 and t3.tenthous = t2.hundred) and t1.unique1 < 1; - QUERY PLAN 
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) Output: t1.unique1, t2.hundred -> Nested Loop Output: t1.unique1, t2.hundred -> Remote Subquery Scan on all (datanode_1,datanode_2) Output: t1.unique1, t3.tenthous + Distribute results by H: tenthous -> Hash Join Output: t1.unique1, t3.tenthous Hash Cond: (t3.thousand = t1.unique1) -> HashAggregate Output: t3.thousand, t3.tenthous Group Key: t3.thousand, t3.tenthous - -> Seq Scan on public.tenk1 t3 + -> Remote Subquery Scan on all (datanode_1,datanode_2) Output: t3.unique1, t3.unique2, t3.two, t3.four, t3.ten, t3.twenty, t3.hundred, t3.thousand, t3.twothousand, t3.fivethous, t3.tenthous, t3.odd, t3.even, t3.stringu1, t3.stringu2, t3.string4 + Distribute results by H: thousand + -> HashAggregate + Output: t3.unique1, t3.unique2, t3.two, t3.four, t3.ten, t3.twenty, t3.hundred, t3.thousand, t3.twothousand, t3.fivethous, t3.tenthous, t3.odd, t3.even, t3.stringu1, t3.stringu2, t3.string4 + Group Key: t3.thousand, t3.tenthous + -> Seq Scan on public.tenk1 t3 + Output: t3.unique1, t3.unique2, t3.two, t3.four, t3.ten, t3.twenty, t3.hundred, t3.thousand, t3.twothousand, t3.fivethous, t3.tenthous, t3.odd, t3.even, t3.stringu1, t3.stringu2, t3.string4 -> Hash Output: t1.unique1 -> Remote Subquery Scan on all (datanode_1,datanode_2) Output: t1.unique1 - Sort Key: t1.unique1 - -> Index Only Scan using onek_unique1 on public.onek t1 + -> Bitmap Heap Scan on public.onek t1 Output: t1.unique1 - Index Cond: (t1.unique1 < 1) - -> Index Only Scan using tenk1_hundred on public.tenk1 t2 + Recheck Cond: (t1.unique1 < 1) + -> Bitmap Index Scan on onek_unique1 + Index Cond: (t1.unique1 < 1) + -> Materialize Output: t2.hundred - Index Cond: (t2.hundred = t3.tenthous) -(25 rows) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: t2.hundred + Distribute results by H: hundred + Sort Key: t2.hundred + -> Index Only Scan using tenk1_hundred on public.tenk1 t2 + Output: t2.hundred + Index Cond: (t2.hundred = t3.tenthous) +(39 rows) -- ... 
unless it actually is unique create table j3 as select unique1, tenthous from onek; @@ -6103,8 +6148,8 @@ from onek t1, tenk1 t2 where exists (select 1 from j3 where j3.unique1 = t1.unique1 and j3.tenthous = t2.hundred) and t1.unique1 < 1; - QUERY PLAN ------------------------------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) Output: t1.unique1, t2.hundred -> Nested Loop @@ -6113,15 +6158,17 @@ where exists (select 1 from j3 Output: t1.unique1, j3.tenthous -> Nested Loop Output: t1.unique1, j3.tenthous - -> Index Only Scan using onek_unique1 on public.onek t1 - Output: t1.unique1 - Index Cond: (t1.unique1 < 1) + -> Bitmap Heap Scan on public.onek t1 + Output: t1.unique1, t1.unique2, t1.two, t1.four, t1.ten, t1.twenty, t1.hundred, t1.thousand, t1.twothousand, t1.fivethous, t1.tenthous, t1.odd, t1.even, t1.stringu1, t1.stringu2, t1.string4 + Recheck Cond: (t1.unique1 < 1) + -> Bitmap Index Scan on onek_unique1 + Index Cond: (t1.unique1 < 1) -> Index Only Scan using j3_unique1_tenthous_idx on public.j3 Output: j3.unique1, j3.tenthous Index Cond: (j3.unique1 = t1.unique1) -> Index Only Scan using tenk1_hundred on public.tenk1 t2 Output: t2.hundred Index Cond: (t2.hundred = j3.tenthous) -(17 rows) +(19 rows) drop table j3; diff --git a/src/test/regress/expected/mls_check.out b/src/test/regress/expected/mls_check.out index cd1b996d..0e5b955d 100644 --- a/src/test/regress/expected/mls_check.out +++ b/src/test/regress/expected/mls_check.out @@ -4557,14 +4557,14 @@ select * from xixi where i = 11; --case: insert into select from join \c - godlike insert into lala2 select a.i,a.j,b._cls from lala a, lala3 b where a.i = b.i; -select * from lala2; +select * from lala2 order by i; i | j | _cls ----+----+--------- + 11 | 11 | 99:1026 12 | 12 | 99:1026 13 | 13 | 99:1026 - 15 | 15 | 99:1026 - 11 | 11 | 99:1026 14 | 14 | 99:1026 + 15 | 15 | 99:1026 (5 rows) --ROUND1. end diff --git a/src/test/regress/expected/rowsecurity_1.out b/src/test/regress/expected/rowsecurity_1.out index eb1a95cb..fb6327f3 100644 --- a/src/test/regress/expected/rowsecurity_1.out +++ b/src/test/regress/expected/rowsecurity_1.out @@ -737,13 +737,13 @@ EXPLAIN (COSTS OFF) SELECT * FROM t1 WHERE f_leak(b) FOR SHARE; (9 rows) -- union all query -SELECT a, b, oid FROM t2 UNION ALL SELECT a, b, oid FROM t3; +SELECT a, b, oid FROM t2 UNION ALL SELECT a, b, oid FROM t3 order by oid; a | b | oid ---+-----+----- 1 | abc | 201 + 3 | cde | 203 1 | xxx | 301 2 | yyy | 302 - 3 | cde | 203 3 | zzz | 303 (5 rows) diff --git a/src/test/regress/expected/tsrf_1.out b/src/test/regress/expected/tsrf_1.out index e312b9dc..1831d4d6 100644 --- a/src/test/regress/expected/tsrf_1.out +++ b/src/test/regress/expected/tsrf_1.out @@ -233,7 +233,7 @@ LINE 1: SELECT min(generate_series(1, 3)) OVER() FROM few; ^ HINT: You might be able to move the set-returning function into a LATERAL FROM item. 
-- SRFs are normally computed after window functions -SELECT id,lag(id) OVER(), count(*) OVER(), generate_series(1,3) FROM few; +SELECT id,lag(id) OVER(), count(*) OVER(), generate_series(1,3) FROM few ORDER BY 1, 4; id | lag | count | generate_series ----+-----+-------+----------------- 1 | | 3 | 1 @@ -561,7 +561,7 @@ ERROR: set-returning functions are not allowed in LIMIT LINE 1: SELECT 1 LIMIT generate_series(1,3); ^ -- tSRF in correlated subquery, referencing table outside -SELECT (SELECT generate_series(1,3) LIMIT 1 OFFSET few.id) FROM few; +SELECT (SELECT generate_series(1,3) LIMIT 1 OFFSET few.id) FROM few order by 1; generate_series ----------------- 2 diff --git a/src/test/regress/sql/join.sql b/src/test/regress/sql/join.sql index c30b6703..e6c695c9 100644 --- a/src/test/regress/sql/join.sql +++ b/src/test/regress/sql/join.sql @@ -404,6 +404,8 @@ select aa, bb, unique1, unique1 -- -- regression test: check handling of empty-FROM subquery underneath outer join -- +set enable_nestloop to off; + explain (costs off) select * from int8_tbl i1 left join (int8_tbl i2 join (select 123 as x) ss on i2.q1 = x) on i1.q2 = i2.q2 @@ -413,6 +415,8 @@ select * from int8_tbl i1 left join (int8_tbl i2 join (select 123 as x) ss on i2.q1 = x) on i1.q2 = i2.q2 order by 1, 2; +reset enable_nestloop; + -- -- regression test: check a case where join_clause_is_movable_into() gives -- an imprecise result, causing an assertion failure @@ -1097,6 +1101,9 @@ using (join_key); -- -- test successful handling of nested outer joins with degenerate join quals -- +set enable_nestloop to on; +set enable_hashjoin to off; +set enable_mergejoin to off; explain (verbose, costs off) select t1.* from @@ -1188,6 +1195,9 @@ select * from left join int4_tbl i4 on i8.q1 = i4.f1; +reset enable_nestloop; +reset enable_hashjoin; +reset enable_mergejoin; -- -- test for appropriate join order in the presence of lateral references -- @@ -1576,6 +1586,9 @@ select count(*) from tenk1 a, tenk1 b join lateral (values(a.unique1),(-1)) ss(x) on b.unique2 = ss.x; -- lateral injecting a strange outer join condition +set enable_hashjoin to off; +set enable_mergejoin to off; + explain (num_nodes off, nodes off, costs off) select * from int8_tbl a, int8_tbl x left join lateral (select a.q1 from int4_tbl y) ss(z) @@ -1586,6 +1599,9 @@ select * from int8_tbl a, on x.q2 = ss.z order by a.q1, a.q2, x.q1, x.q2, ss.z; +reset enable_hashjoin; +reset enable_mergejoin; + -- lateral reference to a join alias variable select * from (select f1/2 as x from int4_tbl) ss1 join int4_tbl i4 on x = f1, lateral (select x) ss2(y) order by 1,2,3; @@ -1658,6 +1674,8 @@ select * from int4_tbl i left join lateral (select coalesce(i) from int2_tbl j where i.f1 = j.f1) k on true; select * from int4_tbl i left join lateral (select coalesce(i) from int2_tbl j where i.f1 = j.f1) k on true order by 1; +set enable_hashjoin to off; +set enable_mergejoin to off; explain (num_nodes off, nodes off, verbose, costs off) select * from int4_tbl a, lateral ( @@ -1667,6 +1685,8 @@ select * from int4_tbl a, lateral ( select * from int4_tbl b left join int8_tbl c on (b.f1 = q1 and a.f1 = q2) ) ss order by 1,2,3,4; +reset enable_hashjoin; +reset enable_mergejoin; -- lateral reference in a PlaceHolderVar evaluated at join level explain (num_nodes off, nodes off, verbose, costs off) diff --git a/src/test/regress/sql/mls_check.sql b/src/test/regress/sql/mls_check.sql index 532c7894..208fd38b 100644 --- a/src/test/regress/sql/mls_check.sql +++ b/src/test/regress/sql/mls_check.sql @@ 
-1732,7 +1732,7 @@ select * from xixi where i = 11; --case: insert into select from join \c - godlike insert into lala2 select a.i,a.j,b._cls from lala a, lala3 b where a.i = b.i; -select * from lala2; +select * from lala2 order by i; --ROUND1. end truncate table xixi; diff --git a/src/test/regress/sql/rowsecurity.sql b/src/test/regress/sql/rowsecurity.sql index baf951ef..a010dc72 100644 --- a/src/test/regress/sql/rowsecurity.sql +++ b/src/test/regress/sql/rowsecurity.sql @@ -299,7 +299,7 @@ SELECT * FROM t1 WHERE f_leak(b) ORDER BY a FOR SHARE; EXPLAIN (COSTS OFF) SELECT * FROM t1 WHERE f_leak(b) FOR SHARE; -- union all query -SELECT a, b, oid FROM t2 UNION ALL SELECT a, b, oid FROM t3; +SELECT a, b, oid FROM t2 UNION ALL SELECT a, b, oid FROM t3 order by oid; EXPLAIN (COSTS OFF) SELECT a, b, oid FROM t2 UNION ALL SELECT a, b, oid FROM t3; -- superuser is allowed to bypass RLS checks diff --git a/src/test/regress/sql/tsrf.sql b/src/test/regress/sql/tsrf.sql index a6e58c5b..65f5ab61 100644 --- a/src/test/regress/sql/tsrf.sql +++ b/src/test/regress/sql/tsrf.sql @@ -71,7 +71,7 @@ SELECT sum((3 = ANY(SELECT lag(x) over(order by x) SELECT min(generate_series(1, 3)) OVER() FROM few; -- SRFs are normally computed after window functions -SELECT id,lag(id) OVER(), count(*) OVER(), generate_series(1,3) FROM few; +SELECT id,lag(id) OVER(), count(*) OVER(), generate_series(1,3) FROM few ORDER BY 1, 4; -- unless referencing SRFs SELECT SUM(count(*)) OVER(PARTITION BY generate_series(1,3) ORDER BY generate_series(1,3)), generate_series(1,3) g FROM few GROUP BY g; @@ -142,7 +142,7 @@ SELECT a, generate_series(1,2) FROM (VALUES(1),(2),(3)) r(a) LIMIT 2 OFFSET 2; SELECT 1 LIMIT generate_series(1,3); -- tSRF in correlated subquery, referencing table outside -SELECT (SELECT generate_series(1,3) LIMIT 1 OFFSET few.id) FROM few; +SELECT (SELECT generate_series(1,3) LIMIT 1 OFFSET few.id) FROM few order by 1; -- tSRF in correlated subquery, referencing SRF outside SELECT (SELECT generate_series(1,3) LIMIT 1 OFFSET g.i) FROM generate_series(0,3) g(i); From 960f9c967042ebf18a760f9b7adb10a576cd201c Mon Sep 17 00:00:00 2001 From: ericxwu Date: Wed, 24 Jun 2020 16:33:32 +0800 Subject: [PATCH 004/578] fix regress test unstable cases --- src/test/regress/expected/join_3.out | 47 ++++++++++++---------------- src/test/regress/expected/tsrf_1.out | 2 +- src/test/regress/sql/join.sql | 6 ++++ src/test/regress/sql/tsrf.sql | 2 +- 4 files changed, 28 insertions(+), 29 deletions(-) diff --git a/src/test/regress/expected/join_3.out b/src/test/regress/expected/join_3.out index 8e9360aa..f45e67d5 100644 --- a/src/test/regress/expected/join_3.out +++ b/src/test/regress/expected/join_3.out @@ -5139,25 +5139,28 @@ select * from -- lateral can result in join conditions appearing below their -- real semantic level +set enable_nestloop to on; +set enable_hashjoin to off; +set enable_mergejoin to off; explain (num_nodes off, nodes off, verbose, costs off) select * from int4_tbl i left join lateral (select * from int2_tbl j where i.f1 = j.f1) k on true; - QUERY PLAN -------------------------------------------------------- + QUERY PLAN +------------------------------------------------- Remote Subquery Scan on all Output: i.f1, j.f1 - -> Hash Right Join + -> Nested Loop Left Join Output: i.f1, j.f1 - Hash Cond: (j.f1 = i.f1) - -> Seq Scan on public.int2_tbl j - Output: j.f1 - -> Hash + Join Filter: (i.f1 = j.f1) + -> Remote Subquery Scan on all Output: i.f1 - -> Remote Subquery Scan on all + Distribute results by H: f1 + -> Seq Scan on 
public.int4_tbl i Output: i.f1 - Distribute results by H: f1 - -> Seq Scan on public.int4_tbl i - Output: i.f1 + -> Materialize + Output: j.f1 + -> Seq Scan on public.int2_tbl j + Output: j.f1 (14 rows) select * from int4_tbl i left join @@ -5171,25 +5174,15 @@ select * from int4_tbl i left join 2147483647 | (5 rows) +reset enable_nestloop; +reset enable_hashjoin; +reset enable_mergejoin explain (num_nodes off, nodes off, verbose, costs off) select * from int4_tbl i left join lateral (select coalesce(i) from int2_tbl j where i.f1 = j.f1) k on true; - QUERY PLAN -------------------------------------------------- - Remote Subquery Scan on all - Output: i.f1, COALESCE(i.*) - -> Nested Loop Left Join - Output: i.f1, (COALESCE(i.*)) - -> Remote Subquery Scan on all - Output: i.f1, i.* - Distribute results by H: f1 - -> Seq Scan on public.int4_tbl i - Output: i.f1, i.* - -> Seq Scan on public.int2_tbl j - Output: j.f1, COALESCE(i.*) - Filter: (i.f1 = j.f1) -(12 rows) - +ERROR: syntax error at or near "explain" +LINE 2: explain (num_nodes off, nodes off, verbose, costs off) + ^ select * from int4_tbl i left join lateral (select coalesce(i) from int2_tbl j where i.f1 = j.f1) k on true order by 1; f1 | coalesce diff --git a/src/test/regress/expected/tsrf_1.out b/src/test/regress/expected/tsrf_1.out index 1831d4d6..a2bb9fa6 100644 --- a/src/test/regress/expected/tsrf_1.out +++ b/src/test/regress/expected/tsrf_1.out @@ -233,7 +233,7 @@ LINE 1: SELECT min(generate_series(1, 3)) OVER() FROM few; ^ HINT: You might be able to move the set-returning function into a LATERAL FROM item. -- SRFs are normally computed after window functions -SELECT id,lag(id) OVER(), count(*) OVER(), generate_series(1,3) FROM few ORDER BY 1, 4; +SELECT id,lag(id) OVER(), count(*) OVER(), generate_series(1,3) FROM few ORDER BY 1, 2, 4; id | lag | count | generate_series ----+-----+-------+----------------- 1 | | 3 | 1 diff --git a/src/test/regress/sql/join.sql b/src/test/regress/sql/join.sql index e6c695c9..dceca27f 100644 --- a/src/test/regress/sql/join.sql +++ b/src/test/regress/sql/join.sql @@ -1664,11 +1664,17 @@ select * from -- lateral can result in join conditions appearing below their -- real semantic level +set enable_nestloop to on; +set enable_hashjoin to off; +set enable_mergejoin to off; explain (num_nodes off, nodes off, verbose, costs off) select * from int4_tbl i left join lateral (select * from int2_tbl j where i.f1 = j.f1) k on true; select * from int4_tbl i left join lateral (select * from int2_tbl j where i.f1 = j.f1) k on true order by 1; +reset enable_nestloop; +reset enable_hashjoin; +reset enable_mergejoin explain (num_nodes off, nodes off, verbose, costs off) select * from int4_tbl i left join lateral (select coalesce(i) from int2_tbl j where i.f1 = j.f1) k on true; diff --git a/src/test/regress/sql/tsrf.sql b/src/test/regress/sql/tsrf.sql index 65f5ab61..0833b221 100644 --- a/src/test/regress/sql/tsrf.sql +++ b/src/test/regress/sql/tsrf.sql @@ -71,7 +71,7 @@ SELECT sum((3 = ANY(SELECT lag(x) over(order by x) SELECT min(generate_series(1, 3)) OVER() FROM few; -- SRFs are normally computed after window functions -SELECT id,lag(id) OVER(), count(*) OVER(), generate_series(1,3) FROM few ORDER BY 1, 4; +SELECT id,lag(id) OVER(), count(*) OVER(), generate_series(1,3) FROM few ORDER BY 1, 2, 4; -- unless referencing SRFs SELECT SUM(count(*)) OVER(PARTITION BY generate_series(1,3) ORDER BY generate_series(1,3)), generate_series(1,3) g FROM few GROUP BY g; From cccb332c80cad1a1176f313e2a5caf7be846119b Mon Sep 
17 00:00:00 2001 From: ericxwu Date: Wed, 24 Jun 2020 16:39:47 +0800 Subject: [PATCH 005/578] parallel hashagg support group by column with all types of expression http://tapd.oa.com/pgxz/bugtrace/bugs/view?bug_id=1010092131080395083 --- src/backend/executor/nodeAgg.c | 78 ++++++++++--------- .../regress/expected/select_parallel_4.out | 24 ++++++ src/test/regress/sql/select_parallel.sql | 6 ++ 3 files changed, 72 insertions(+), 36 deletions(-) diff --git a/src/backend/executor/nodeAgg.c b/src/backend/executor/nodeAgg.c index 3d203081..d0610f2a 100644 --- a/src/backend/executor/nodeAgg.c +++ b/src/backend/executor/nodeAgg.c @@ -3394,65 +3394,71 @@ agg_retrieve_direct(AggState *aggstate) */ static void agg_fill_hash_table(AggState *aggstate) -{// #lizard forgives +{ TupleTableSlot *outerslot; ExprContext *tmpcontext = aggstate->tmpcontext; #ifdef __TBASE__ - AttrNumber varattno = 0; - Oid dataType = 0; - aggstate->tmpcxt = NULL; - + AttrNumber varattno = InvalidAttrNumber; + Oid dataType = InvalidOid; + + aggstate->tmpcxt = NULL; + + /* get the redistribution hashfunc for parallel execution */ if (IsParallelWorker() && aggstate->state) { - AttrNumber group_col = 0; - TargetEntry *en = NULL; + AttrNumber group_col = InvalidAttrNumber; + TargetEntry *tle = NULL; if (aggstate->aggstrategy != AGG_HASHED || list_length(aggstate->all_grouped_cols) == 0) { - elog(ERROR, "mismatch plan while ReDistribute-Data."); + elog(ERROR, "plan mismatched while redistributing data across " + "parallel workers."); } - /* get first groupby column in targetlist */ + /* + * all_grouped_cols was sorted by AttributeNum in descending order, get + * first group-by column in targetlist . + * + * TODO: choose column with better distribution to avoid data skew + * within parallel workers + */ group_col = llast_int(aggstate->all_grouped_cols); if (group_col < 1) { - elog(ERROR, "group column AttrNumber is smaller than 1."); + elog(ERROR, "invalid group by AttrNumber %d found while " + "redistributing data across parallel workers.", group_col); } - /* get the groupby column's datatype and AttrNumber of input from outer plan */ - en = (TargetEntry *)lfirst(list_nth_cell(aggstate->ss.ps.plan->lefttree->targetlist, group_col - 1)); - - if (IsA(en->expr, Var)) - { - Var *var = (Var *)en->expr; + /* + * get DataType and AttrNumber of the redistribution group-by column + * from outer plan + */ + tle = (TargetEntry *) lfirst(list_nth_cell( + aggstate->ss.ps.plan->lefttree->targetlist, group_col - 1)); - dataType = var->vartype; - varattno = group_col; + dataType = exprType((Node*) tle->expr); + varattno = group_col; - aggstate->hashfunc = hash_func_ptr(dataType); - aggstate->dataType = dataType; + aggstate->hashfunc = hash_func_ptr(dataType); + aggstate->dataType = dataType; - /* could not find hash function for given data type */ - if (!aggstate->hashfunc) - { - elog(ERROR, "could not find hash function for given data type:%u", dataType); - } - } - else - { - elog(ERROR, "could not get AttrNumber and data type of group column."); - } + /* could not find hash function for given data type */ + if (!aggstate->hashfunc) + { + elog(ERROR, "could not find hash function for given data type:%u", + dataType); + } - /* initialize resources */ - InitializeReDistribute(aggstate->state, &aggstate->file); + /* initialize resources */ + InitializeReDistribute(aggstate->state, &aggstate->file); - aggstate->tmpcxt = AllocSetContextCreate(CurrentMemoryContext, - "ExecAgg temp memoryContext", - ALLOCSET_DEFAULT_SIZES); + aggstate->tmpcxt = 
AllocSetContextCreate(CurrentMemoryContext, + "ExecAgg temp memoryContext", + ALLOCSET_DEFAULT_SIZES); - elog(LOG, "worker:%d redistributed in HashAgg.", ParallelWorkerNumber); + elog(LOG, "worker:%d redistributed in HashAgg.", ParallelWorkerNumber); } #endif diff --git a/src/test/regress/expected/select_parallel_4.out b/src/test/regress/expected/select_parallel_4.out index fd99f499..3ae6bc47 100644 --- a/src/test/regress/expected/select_parallel_4.out +++ b/src/test/regress/expected/select_parallel_4.out @@ -95,6 +95,30 @@ explain (costs off) -> Parallel Seq Scan on tenk1 (10 rows) +explain (costs off) + select count(stringu1) as num, (CASE WHEN length(stringu1) > 5 THEN 'LONG' ELSE 'SHORT' END) as islong + from tenk1 group by islong order by num; + QUERY PLAN +------------------------------------------------------------------------------------------------------------------------ + Sort + Sort Key: (count(stringu1)) + -> Finalize HashAggregate + Group Key: CASE WHEN (length((stringu1)::text) > 5) THEN 'LONG'::text ELSE 'SHORT'::text END + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Gather + Workers Planned: 4 + -> Partial HashAggregate + Group Key: CASE WHEN (length((stringu1)::text) > 5) THEN 'LONG'::text ELSE 'SHORT'::text END + -> Parallel Seq Scan on tenk1 +(10 rows) + +select count(stringu1) as num, (CASE WHEN length(stringu1) > 5 THEN 'LONG' ELSE 'SHORT' END) as islong + from tenk1 group by islong order by num; + num | islong +-------+-------- + 10000 | LONG +(1 row) + -- test that parallel plan for aggregates is not selected when -- target list contains parallel restricted clause. explain (costs off) diff --git a/src/test/regress/sql/select_parallel.sql b/src/test/regress/sql/select_parallel.sql index 461abaeb..70d0f0fb 100644 --- a/src/test/regress/sql/select_parallel.sql +++ b/src/test/regress/sql/select_parallel.sql @@ -34,6 +34,12 @@ select length(stringu1) from tenk1 group by length(stringu1); explain (costs off) select stringu1, count(*) from tenk1 group by stringu1 order by stringu1; +explain (costs off) + select count(stringu1) as num, (CASE WHEN length(stringu1) > 5 THEN 'LONG' ELSE 'SHORT' END) as islong + from tenk1 group by islong order by num; +select count(stringu1) as num, (CASE WHEN length(stringu1) > 5 THEN 'LONG' ELSE 'SHORT' END) as islong + from tenk1 group by islong order by num; + -- test that parallel plan for aggregates is not selected when -- target list contains parallel restricted clause. 
explain (costs off) From a4ed26a29463fe0fc93dd2d3989b8d770913d8ac Mon Sep 17 00:00:00 2001 From: ericxwu Date: Fri, 26 Jun 2020 22:18:05 +0800 Subject: [PATCH 006/578] recalculate nestloop/hash/merge join cost caused by redistribution --- src/backend/optimizer/util/pathnode.c | 80 +++- src/backend/optimizer/util/pgxcship.c | 4 + src/test/regress/expected/equivclass.out | 8 +- src/test/regress/expected/join_3.out | 346 +++++++++--------- src/test/regress/expected/privileges.out | 18 +- src/test/regress/expected/stats_ext_2.out | 18 +- src/test/regress/expected/stats_ext_3.out | 92 +++-- src/test/regress/expected/subselect_1.out | 25 +- src/test/regress/expected/xc_FQS_join_1.out | 91 ++--- src/test/regress/expected/xc_for_update_1.out | 64 ++-- src/test/regress/expected/xc_groupby_1.out | 80 ++-- src/test/regress/expected/xc_having_1.out | 22 +- src/test/regress/expected/xl_join.out | 24 +- 13 files changed, 456 insertions(+), 416 deletions(-) diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index ee237fa7..586a595c 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -5088,6 +5088,19 @@ create_nestloop_path(PlannerInfo *root, alternate = set_joinpath_distribution(root, pathnode); #endif + +#ifdef __TBASE__ + /* + * Since set_joinpath_distribution() could add additional pathnode such as + * RemoteSubplan, the result of initial_cost_nestloop() needs to be + * recalculated. + */ + initial_cost_nestloop(root, workspace, jointype, + pathnode->outerjoinpath, + pathnode->innerjoinpath, + extra); +#endif + final_cost_nestloop(root, pathnode, workspace, extra); #ifdef XCP @@ -5097,6 +5110,17 @@ create_nestloop_path(PlannerInfo *root, foreach(lc, alternate) { NestPath *altpath = (NestPath *) lfirst(lc); + +#ifdef __TBASE__ + /* + * Recalculate the initial cost of alternate path + */ + initial_cost_nestloop(root, workspace, jointype, + altpath->outerjoinpath, + altpath->innerjoinpath, + extra); +#endif + final_cost_nestloop(root, altpath, workspace, extra); if (altpath->path.total_cost < pathnode->path.total_cost) pathnode = altpath; @@ -5180,6 +5204,19 @@ create_mergejoin_path(PlannerInfo *root, /* pathnode->skip_mark_restore will be set by final_cost_mergejoin */ /* pathnode->materialize_inner will be set by final_cost_mergejoin */ +#ifdef __TBASE__ + /* + * Since set_joinpath_distribution() could add additional pathnode such as + * RemoteSubplan, the result of initial_cost_mergejoin() needs to be + * recalculated. 
+ */ + initial_cost_mergejoin(root, workspace, jointype, mergeclauses, + pathnode->jpath.outerjoinpath, + pathnode->jpath.innerjoinpath, + outersortkeys, innersortkeys, + extra); +#endif + final_cost_mergejoin(root, pathnode, workspace, extra); #ifdef XCP @@ -5189,6 +5226,18 @@ create_mergejoin_path(PlannerInfo *root, foreach(lc, alternate) { MergePath *altpath = (MergePath *) lfirst(lc); + +#ifdef __TBASE__ + /* + * Recalculate the initial cost of alternate path + */ + initial_cost_mergejoin(root, workspace, jointype, mergeclauses, + altpath->jpath.outerjoinpath, + altpath->jpath.innerjoinpath, + outersortkeys, innersortkeys, + extra); +#endif + final_cost_mergejoin(root, altpath, workspace, extra); if (altpath->jpath.path.total_cost < pathnode->jpath.path.total_cost) pathnode = altpath; @@ -5277,8 +5326,23 @@ create_hashjoin_path(PlannerInfo *root, #ifdef XCP alternate = set_joinpath_distribution(root, (JoinPath *) pathnode); #endif - /* final_cost_hashjoin will fill in pathnode->num_batches */ +#ifdef __TBASE__ + /* + * Since set_joinpath_distribution() could add additional pathnode such as + * RemoteSubplan, the result of initial_cost_hashjoin() needs to be + * recalculated. + */ + initial_cost_hashjoin(root, + workspace, + jointype, + hashclauses, + pathnode->jpath.outerjoinpath, + pathnode->jpath.innerjoinpath, + extra); +#endif + + /* final_cost_hashjoin will fill in pathnode->num_batches */ final_cost_hashjoin(root, pathnode, workspace, extra); #ifdef XCP @@ -5288,6 +5352,20 @@ create_hashjoin_path(PlannerInfo *root, foreach(lc, alternate) { HashPath *altpath = (HashPath *) lfirst(lc); + +#ifdef __TBASE__ + /* + * Recalculate the initial cost of alternate path + */ + initial_cost_hashjoin(root, + workspace, + jointype, + hashclauses, + altpath->jpath.outerjoinpath, + altpath->jpath.innerjoinpath, + extra); +#endif + final_cost_hashjoin(root, altpath, workspace, extra); if (altpath->jpath.path.total_cost < pathnode->jpath.path.total_cost) pathnode = altpath; diff --git a/src/backend/optimizer/util/pgxcship.c b/src/backend/optimizer/util/pgxcship.c index 8bf2e0e2..f79eb3bd 100644 --- a/src/backend/optimizer/util/pgxcship.c +++ b/src/backend/optimizer/util/pgxcship.c @@ -1189,6 +1189,10 @@ pgxc_shippability_walker(Node *node, Shippability_context *sc_context) */ if (query->commandType != CMD_SELECT && list_length(query->rtable) > 1) { + /* + * Try to shipping insert with multiple rtables. Skip FQS if it + * contains subquery. 
+ */ if(query->commandType == CMD_INSERT && query->onConflict) { ListCell *cell; diff --git a/src/test/regress/expected/equivclass.out b/src/test/regress/expected/equivclass.out index cfa96c42..d5868e69 100644 --- a/src/test/regress/expected/equivclass.out +++ b/src/test/regress/expected/equivclass.out @@ -207,13 +207,13 @@ explain (costs off) QUERY PLAN ----------------------------------------------------------------- Nested Loop + Join Filter: (ec1.ff = ec2.x1) -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Seq Scan on ec2 - Filter: (x1 = '42'::int8alias2) + -> Seq Scan on ec1 -> Materialize -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Index Scan using ec1_pkey on ec1 - Index Cond: (ff = ec2.x1) + -> Seq Scan on ec2 + Filter: (x1 = '42'::int8alias2) (8 rows) create unique index ec1_expr1 on ec1((ff + 1)); diff --git a/src/test/regress/expected/join_3.out b/src/test/regress/expected/join_3.out index f45e67d5..9d08f4b2 100644 --- a/src/test/regress/expected/join_3.out +++ b/src/test/regress/expected/join_3.out @@ -2478,8 +2478,8 @@ where not exists ( ) a1 on t3.c2 = a1.c1 where t1.c1 = t2.c2 ); - QUERY PLAN ------------------------------------------------------------------------------------------------------ + QUERY PLAN +----------------------------------------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) -> Hash Anti Join Hash Cond: (t1.c1 = t2.c2) @@ -2487,36 +2487,33 @@ where not exists ( -> Hash -> Remote Subquery Scan on all (datanode_1,datanode_2) Distribute results by H: c2 - -> Merge Left Join - Merge Cond: (t3.c2 = t5.c1) + -> Merge Right Join + Merge Cond: (t5.c1 = t3.c2) -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: c2 - -> Sort - Sort Key: t3.c2 - -> Merge Left Join - Merge Cond: (t2.c3 = t3.c1) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: c3 - -> Sort - Sort Key: t2.c3 - -> Seq Scan on tt4x t2 + Distribute results by H: c1 + -> Merge Join + Merge Cond: (t4.c2 = t5.c1) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: c2 -> Sort - Sort Key: t3.c1 - -> Seq Scan on tt4x t3 + Sort Key: t4.c2 + -> Seq Scan on tt4x t4 + -> Sort + Sort Key: t5.c1 + -> Seq Scan on tt4x t5 -> Materialize -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: c1 - -> Merge Join - Merge Cond: (t4.c2 = t5.c1) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: c2 - -> Sort - Sort Key: t4.c2 - -> Seq Scan on tt4x t4 - -> Sort - Sort Key: t5.c1 - -> Seq Scan on tt4x t5 -(36 rows) + Distribute results by H: c2 + -> Sort + Sort Key: t3.c2 + -> Hash Left Join + Hash Cond: (t2.c3 = t3.c1) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: c3 + -> Seq Scan on tt4x t2 + -> Hash + -> Seq Scan on tt4x t3 +(33 rows) -- -- regression test for problems of the sort depicted in bug #3494 @@ -3084,24 +3081,24 @@ select t1.unique2, t1.stringu1, t2.unique1, t2.stringu2 from left join tenk1 t2 on (subq1.y1 = t2.unique1) where t1.unique2 < 42 and t1.stringu1 > t2.stringu2; - QUERY PLAN ------------------------------------------------------------------------------------ + QUERY PLAN +----------------------------------------------------------------------------- Nested Loop Join Filter: (t1.stringu1 > t2.stringu2) -> Nested Loop - Join Filter: ((0) = i1.f1) -> Nested Loop - -> Nested Loop - Join Filter: ((1) = (1)) - -> Result - 
-> Result - -> Materialize + Join Filter: ((1) = (1)) + -> Hash Join + Hash Cond: (i1.f1 = (0)) -> Remote Subquery Scan on all - -> Index Scan using tenk1_unique2 on tenk1 t1 - Index Cond: ((unique2 = (11)) AND (unique2 < 42)) + -> Seq Scan on int4_tbl i1 + -> Hash + -> Result + -> Result -> Materialize -> Remote Subquery Scan on all - -> Seq Scan on int4_tbl i1 + -> Index Scan using tenk1_unique2 on tenk1 t1 + Index Cond: ((unique2 = (11)) AND (unique2 < 42)) -> Materialize -> Remote Subquery Scan on all -> Index Scan using tenk1_unique1 on tenk1 t2 @@ -3244,19 +3241,19 @@ where t1.unique1 = 1; -> Materialize -> Remote Subquery Scan on all Distribute results by H: hundred - -> Nested Loop + -> Hash Join + Hash Cond: (t3.unique2 = t2.thousand) Join Filter: (t1.ten = t3.ten) -> Remote Subquery Scan on all - Distribute results by H: thousand - -> Bitmap Heap Scan on tenk1 t2 - Recheck Cond: (t1.hundred = hundred) - -> Bitmap Index Scan on tenk1_hundred - Index Cond: (t1.hundred = hundred) - -> Materialize + Distribute results by H: unique2 + -> Seq Scan on tenk1 t3 + -> Hash -> Remote Subquery Scan on all - Distribute results by H: unique2 - -> Index Scan using tenk1_unique2 on tenk1 t3 - Index Cond: (unique2 = t2.thousand) + Distribute results by H: thousand + -> Bitmap Heap Scan on tenk1 t2 + Recheck Cond: (t1.hundred = hundred) + -> Bitmap Index Scan on tenk1_hundred + Index Cond: (t1.hundred = hundred) (22 rows) explain (num_nodes off, nodes off, costs off) @@ -3275,19 +3272,19 @@ where t1.unique1 = 1; -> Materialize -> Remote Subquery Scan on all Distribute results by H: hundred - -> Nested Loop + -> Hash Join + Hash Cond: (t3.unique2 = t2.thousand) Join Filter: ((t1.ten + t2.ten) = t3.ten) -> Remote Subquery Scan on all - Distribute results by H: thousand - -> Bitmap Heap Scan on tenk1 t2 - Recheck Cond: (t1.hundred = hundred) - -> Bitmap Index Scan on tenk1_hundred - Index Cond: (t1.hundred = hundred) - -> Materialize + Distribute results by H: unique2 + -> Seq Scan on tenk1 t3 + -> Hash -> Remote Subquery Scan on all - Distribute results by H: unique2 - -> Index Scan using tenk1_unique2 on tenk1 t3 - Index Cond: (unique2 = t2.thousand) + Distribute results by H: thousand + -> Bitmap Heap Scan on tenk1 t2 + Recheck Cond: (t1.hundred = hundred) + -> Bitmap Index Scan on tenk1_hundred + Index Cond: (t1.hundred = hundred) (22 rows) explain (num_nodes off, nodes off, costs off) @@ -3295,31 +3292,31 @@ select count(*) from tenk1 a join tenk1 b on a.unique1 = b.unique2 left join tenk1 c on a.unique2 = b.unique1 and c.thousand = a.thousand join int4_tbl on b.thousand = f1; - QUERY PLAN -------------------------------------------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------------------------------------------------------- Finalize Aggregate -> Remote Subquery Scan on all -> Partial Aggregate - -> Nested Loop Left Join + -> Hash Right Join + Hash Cond: (c.thousand = a.thousand) Join Filter: (a.unique2 = b.unique1) -> Remote Subquery Scan on all Distribute results by H: thousand - -> Nested Loop - -> Remote Subquery Scan on all - Distribute results by H: unique2 - -> Nested Loop - -> Seq Scan on int4_tbl - -> Bitmap Heap Scan on tenk1 b - Recheck Cond: (thousand = int4_tbl.f1) - -> Bitmap Index Scan on tenk1_thous_tenthous - Index Cond: (thousand = int4_tbl.f1) - -> Index Scan using tenk1_unique1 on tenk1 a - Index Cond: (unique1 = b.unique2) - -> Materialize + -> Seq Scan on tenk1 c + -> Hash -> Remote Subquery Scan 
on all Distribute results by H: thousand - -> Index Only Scan using tenk1_thous_tenthous on tenk1 c - Index Cond: (thousand = a.thousand) + -> Nested Loop + -> Remote Subquery Scan on all + Distribute results by H: unique2 + -> Nested Loop + -> Seq Scan on int4_tbl + -> Bitmap Heap Scan on tenk1 b + Recheck Cond: (thousand = int4_tbl.f1) + -> Bitmap Index Scan on tenk1_thous_tenthous + Index Cond: (thousand = int4_tbl.f1) + -> Index Scan using tenk1_unique1 on tenk1 a + Index Cond: (unique1 = b.unique2) (23 rows) select count(*) from @@ -3338,20 +3335,22 @@ select b.unique1 from join int4_tbl i1 on b.thousand = f1 right join int4_tbl i2 on i2.f1 = b.tenthous order by 1; - QUERY PLAN ------------------------------------------------------------------------------------------------------------------------ + QUERY PLAN +------------------------------------------------------------------------------------------------------------- Remote Subquery Scan on all -> Sort Sort Key: b.unique1 - -> Nested Loop Left Join + -> Hash Right Join + Hash Cond: (b.tenthous = i2.f1) -> Remote Subquery Scan on all - Distribute results by H: f1 - -> Seq Scan on int4_tbl i2 - -> Materialize - -> Remote Subquery Scan on all - Distribute results by H: tenthous - -> Nested Loop Left Join - Join Filter: (b.unique1 = 42) + Distribute results by H: tenthous + -> Hash Right Join + Hash Cond: (c.thousand = a.thousand) + Join Filter: (b.unique1 = 42) + -> Remote Subquery Scan on all + Distribute results by H: 42 + -> Seq Scan on tenk1 c + -> Hash -> Remote Subquery Scan on all Distribute results by H: unique1 -> Nested Loop @@ -3359,16 +3358,17 @@ select b.unique1 from Distribute results by H: unique2 -> Nested Loop -> Seq Scan on int4_tbl i1 - -> Index Scan using tenk1_thous_tenthous on tenk1 b - Index Cond: ((thousand = i1.f1) AND (i2.f1 = tenthous)) + -> Bitmap Heap Scan on tenk1 b + Recheck Cond: (thousand = i1.f1) + -> Bitmap Index Scan on tenk1_thous_tenthous + Index Cond: (thousand = i1.f1) -> Index Scan using tenk1_unique1 on tenk1 a Index Cond: (unique1 = b.unique2) - -> Materialize - -> Remote Subquery Scan on all - Distribute results by H: 42 - -> Index Only Scan using tenk1_thous_tenthous on tenk1 c - Index Cond: (thousand = a.thousand) -(28 rows) + -> Hash + -> Remote Subquery Scan on all + Distribute results by H: f1 + -> Seq Scan on int4_tbl i2 +(31 rows) select b.unique1 from tenk1 a join tenk1 b on a.unique1 = b.unique2 @@ -3487,22 +3487,20 @@ select a.unique1, b.unique1, c.unique1, coalesce(b.twothousand, a.twothousand) --------------------------------------------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) -> Nested Loop Left Join - -> Nested Loop Left Join + -> Hash Right Join + Hash Cond: (b.thousand = a.unique1) Filter: (COALESCE(b.twothousand, a.twothousand) = 44) - -> Index Scan using tenk1_unique2 on tenk1 a - Index Cond: (unique2 < 10) - -> Materialize - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: thousand - -> Bitmap Heap Scan on tenk1 b - Recheck Cond: (thousand = a.unique1) - -> Bitmap Index Scan on tenk1_thous_tenthous - Index Cond: (thousand = a.unique1) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: thousand + -> Seq Scan on tenk1 b + -> Hash + -> Index Scan using tenk1_unique2 on tenk1 a + Index Cond: (unique2 < 10) -> Materialize -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Index Scan using tenk1_unique2 on tenk1 c Index Cond: 
((unique2 = COALESCE(b.twothousand, a.twothousand)) AND (unique2 = 44)) -(17 rows) +(15 rows) select a.unique1, b.unique1, c.unique1, coalesce(b.twothousand, a.twothousand) from tenk1 a left join tenk1 b on b.thousand = a.unique1 left join tenk1 c on c.unique2 = coalesce(b.twothousand, a.twothousand) @@ -3527,33 +3525,33 @@ left join using (join_key) ) foo3 using (join_key); - QUERY PLAN --------------------------------------------------------------------------------------------- - Nested Loop Left Join + QUERY PLAN +-------------------------------------------------------------------------------------- + Hash Right Join Output: "*VALUES*".column1, i1.f1, (666) - Join Filter: ("*VALUES*".column1 = i1.f1) - -> Values Scan on "*VALUES*" - Output: "*VALUES*".column1 - -> Materialize - Output: i1.f1, (666) - -> Remote Subquery Scan on all (datanode_1,datanode_2) + Hash Cond: (i1.f1 = "*VALUES*".column1) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: i1.f1, 666 + -> Nested Loop Left Join Output: i1.f1, 666 - -> Nested Loop Left Join - Output: i1.f1, 666 - -> Remote Subquery Scan on all (datanode_1) + -> Remote Subquery Scan on all (datanode_1) + Output: i1.f1 + Distribute results by H: f1 + -> Seq Scan on public.int4_tbl i1 Output: i1.f1 - Distribute results by H: f1 - -> Seq Scan on public.int4_tbl i1 - Output: i1.f1 - -> Materialize + -> Materialize + Output: i2.unique2 + -> Remote Subquery Scan on all (datanode_1,datanode_2) Output: i2.unique2 - -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: unique2 + Sort Key: i2.unique2 + -> Index Only Scan using tenk1_unique2 on public.tenk1 i2 Output: i2.unique2 - Distribute results by H: unique2 - Sort Key: i2.unique2 - -> Index Only Scan using tenk1_unique2 on public.tenk1 i2 - Output: i2.unique2 - Index Cond: (i2.unique2 = i1.f1) + Index Cond: (i2.unique2 = i1.f1) + -> Hash + Output: "*VALUES*".column1 + -> Values Scan on "*VALUES*" + Output: "*VALUES*".column1 (25 rows) select foo1.join_key as foo1_id, foo3.join_key AS foo3_id, bug_field from @@ -4720,18 +4718,18 @@ select * from generate_series(100,200) g, explain (num_nodes off, nodes off, costs off) select count(*) from tenk1 a, tenk1 b join lateral (values(a.unique1)) ss(x) on b.unique2 = ss.x; - QUERY PLAN ------------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------------ Finalize Aggregate -> Remote Subquery Scan on all -> Partial Aggregate -> Hash Join - Hash Cond: (a.unique1 = b.unique2) - -> Seq Scan on tenk1 a + Hash Cond: (b.unique2 = a.unique1) + -> Remote Subquery Scan on all + Distribute results by H: unique2 + -> Seq Scan on tenk1 b -> Hash - -> Remote Subquery Scan on all - Distribute results by H: unique2 - -> Seq Scan on tenk1 b + -> Seq Scan on tenk1 a (10 rows) select count(*) from tenk1 a, @@ -5462,20 +5460,18 @@ select * from Output: int4_tbl.f1 -> Remote Subquery Scan on all Output: int4_tbl.f1 - -> Nested Loop Semi Join + -> Nested Loop Output: int4_tbl.f1 Join Filter: (int4_tbl.f1 = tenk1.unique1) - -> Remote Subquery Scan on all - Output: int4_tbl.f1 - Distribute results by H: f1 - -> Seq Scan on public.int4_tbl - Output: int4_tbl.f1 - -> Materialize + -> HashAggregate Output: tenk1.unique1 + Group Key: tenk1.unique1 -> Index Scan using tenk1_unique2 on public.tenk1 Output: tenk1.unique1 Index Cond: (tenk1.unique2 = "*VALUES*".column2) -(21 rows) + -> Seq Scan on public.int4_tbl + Output: int4_tbl.f1 +(19 rows) select * from (values (0,9998), 
(1,1000)) v(id,x), @@ -5498,46 +5494,42 @@ lateral (select * from int8_tbl t1, where q2 = (select greatest(t1.q1,t2.q2)) and (select v.id=0)) offset 0) ss2) ss where t1.q1 = ss.q2) ss0; - QUERY PLAN ------------------------------------------------------------------------------------------ + QUERY PLAN +----------------------------------------------------------------------------------- Nested Loop Output: "*VALUES*".column1, t1.q1, t1.q2, ss2.q1, ss2.q2 - -> Remote Subquery Scan on all (datanode_1) - Output: t1.q1, t1.q2 - -> Seq Scan on public.int8_tbl t1 - Output: t1.q1, t1.q2 + -> Values Scan on "*VALUES*" + Output: "*VALUES*".column1 -> Materialize - Output: "*VALUES*".column1, ss2.q1, ss2.q2 - -> Nested Loop - Output: "*VALUES*".column1, ss2.q1, ss2.q2 - -> Values Scan on "*VALUES*" - Output: "*VALUES*".column1 - -> Materialize - Output: ss2.q1, ss2.q2 - -> Remote Subquery Scan on all (datanode_1) + Output: t1.q1, t1.q2, ss2.q1, ss2.q2 + -> Remote Subquery Scan on all (datanode_1) + Output: t1.q1, t1.q2, ss2.q1, ss2.q2 + -> Nested Loop + Output: t1.q1, t1.q2, ss2.q1, ss2.q2 + -> Seq Scan on public.int8_tbl t1 + Output: t1.q1, t1.q2 + -> Subquery Scan on ss2 Output: ss2.q1, ss2.q2 - -> Subquery Scan on ss2 - Output: ss2.q1, ss2.q2 - Filter: (t1.q1 = ss2.q2) - -> Seq Scan on public.int8_tbl t2 - Output: t2.q1, t2.q2 - Filter: (SubPlan 3) - SubPlan 3 - -> Remote Subquery Scan on all (datanode_1) + Filter: (t1.q1 = ss2.q2) + -> Seq Scan on public.int8_tbl t2 + Output: t2.q1, t2.q2 + Filter: (SubPlan 3) + SubPlan 3 + -> Remote Subquery Scan on all (datanode_1) + Output: t3.q2 + -> Result Output: t3.q2 - -> Result - Output: t3.q2 - One-Time Filter: $4 - InitPlan 1 (returns $2) - -> Result - Output: GREATEST($0, t2.q2) - InitPlan 2 (returns $4) - -> Result - Output: ($3 = 0) - -> Seq Scan on public.int8_tbl t3 - Output: t3.q1, t3.q2 - Filter: (t3.q2 = $2) -(37 rows) + One-Time Filter: $4 + InitPlan 1 (returns $2) + -> Result + Output: GREATEST($0, t2.q2) + InitPlan 2 (returns $4) + -> Result + Output: ($3 = 0) + -> Seq Scan on public.int8_tbl t3 + Output: t3.q1, t3.q2 + Filter: (t3.q2 = $2) +(33 rows) select * from (values (0), (1)) v(id), lateral (select * from int8_tbl t1, diff --git a/src/test/regress/expected/privileges.out b/src/test/regress/expected/privileges.out index 090e4122..85aea9c7 100644 --- a/src/test/regress/expected/privileges.out +++ b/src/test/regress/expected/privileges.out @@ -244,18 +244,18 @@ ERROR: permission denied for relation atest12 -- This plan should use hashjoin, as it will expect many rows to be selected. 
SET random_page_cost = 8.5; EXPLAIN (COSTS OFF) SELECT * FROM atest12v x, atest12v y WHERE x.a = y.b; - QUERY PLAN ------------------------------------------------------------------------ + QUERY PLAN +----------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) -> Hash Join - Hash Cond: (atest12.a = atest12_1.b) - -> Seq Scan on atest12 - Filter: (b <<< 5) + Hash Cond: (atest12_1.b = atest12.a) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Seq Scan on atest12 atest12_1 + Filter: (b <<< 5) -> Hash - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: b - -> Seq Scan on atest12 atest12_1 - Filter: (b <<< 5) + -> Seq Scan on atest12 + Filter: (b <<< 5) (10 rows) RESET random_page_cost; diff --git a/src/test/regress/expected/stats_ext_2.out b/src/test/regress/expected/stats_ext_2.out index 175fe9ba..3581037d 100644 --- a/src/test/regress/expected/stats_ext_2.out +++ b/src/test/regress/expected/stats_ext_2.out @@ -79,6 +79,7 @@ ALTER TABLE ab1 ALTER a SET STATISTICS 0; INSERT INTO ab1 SELECT a, a%23 FROM generate_series(1, 1000) a; CREATE STATISTICS ab1_a_b_stats ON a, b FROM ab1; ANALYZE ab1; +WARNING: statistics object "public.ab1_a_b_stats" could not be computed for relation "public.ab1" SELECT (stxndistinct IS NOT NULL) AS ndistinct, (stxdependencies IS NOT NULL) AS dependencies @@ -91,6 +92,7 @@ FROM pg_statistic_ext WHERE stxname = 'ab1_a_b_stats'; ALTER TABLE ab1 ALTER a SET STATISTICS -1; -- partial analyze doesn't build stats either ANALYZE ab1 (a); +WARNING: statistics object "public.ab1_a_b_stats" could not be computed for relation "public.ab1" SELECT (stxndistinct IS NOT NULL) AS ndistinct, (stxdependencies IS NOT NULL) AS dependencies @@ -226,7 +228,7 @@ EXPLAIN (COSTS off) -> Finalize HashAggregate Group Key: a, b, c, d -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: d + Distribute results by H: a -> Partial GroupAggregate Group Key: a, b, c, d -> Sort @@ -242,7 +244,7 @@ EXPLAIN (COSTS off) -> Finalize HashAggregate Group Key: b, c, d -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: d + Distribute results by H: b -> Partial GroupAggregate Group Key: b, c, d -> Sort @@ -257,7 +259,7 @@ SELECT stxkind, stxndistinct FROM pg_statistic_ext WHERE stxrelid = 'ndistinct'::regclass; stxkind | stxndistinct ---------+--------------------------------------------------------- - {d,f} | {"3, 4": 161, "3, 6": 161, "4, 6": 161, "3, 4, 6": 161} + {d,f} | {"3, 4": 301, "3, 6": 301, "4, 6": 301, "3, 4, 6": 301} (1 row) -- Hash Aggregate, thanks to estimates improved by the statistic @@ -313,7 +315,7 @@ EXPLAIN (COSTS off) -> Finalize HashAggregate Group Key: a, b, c, d -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: d + Distribute results by H: a -> Partial GroupAggregate Group Key: a, b, c, d -> Sort @@ -329,7 +331,7 @@ EXPLAIN (COSTS off) -> Finalize HashAggregate Group Key: b, c, d -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: d + Distribute results by H: b -> Partial GroupAggregate Group Key: b, c, d -> Sort @@ -346,9 +348,9 @@ INSERT INTO ndistinct (a, b, c, filler1) ANALYZE ndistinct; SELECT stxkind, stxndistinct FROM pg_statistic_ext WHERE stxrelid = 'ndistinct'::regclass; - stxkind | stxndistinct ----------+------------------------------------------------------------ - {d,f} | {"3, 4": 2378, "3, 6": 800, "4, 6": 1632, "3, 4, 
6": 6060} + stxkind | stxndistinct +---------+------------------------------------------------------------- + {d,f} | {"3, 4": 2550, "3, 6": 800, "4, 6": 1632, "3, 4, 6": 10000} (1 row) -- plans using Group Aggregate, thanks to using correct esimates diff --git a/src/test/regress/expected/stats_ext_3.out b/src/test/regress/expected/stats_ext_3.out index 3581037d..e69852b6 100644 --- a/src/test/regress/expected/stats_ext_3.out +++ b/src/test/regress/expected/stats_ext_3.out @@ -206,19 +206,21 @@ EXPLAIN (COSTS off) EXPLAIN (COSTS off) SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c; - QUERY PLAN ------------------------------------------------------------------ + QUERY PLAN +----------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) - -> Finalize HashAggregate + -> Finalize GroupAggregate Group Key: a, b, c - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: a - -> Partial GroupAggregate - Group Key: a, b, c - -> Sort - Sort Key: a, b, c - -> Seq Scan on ndistinct -(10 rows) + -> Sort + Sort Key: a, b, c + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Partial GroupAggregate + Group Key: a, b, c + -> Sort + Sort Key: a, b, c + -> Seq Scan on ndistinct +(12 rows) EXPLAIN (COSTS off) SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c, d; @@ -238,19 +240,21 @@ EXPLAIN (COSTS off) EXPLAIN (COSTS off) SELECT COUNT(*) FROM ndistinct GROUP BY b, c, d; - QUERY PLAN ------------------------------------------------------------------ + QUERY PLAN +----------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) - -> Finalize HashAggregate + -> Finalize GroupAggregate Group Key: b, c, d - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: b - -> Partial GroupAggregate - Group Key: b, c, d - -> Sort - Sort Key: b, c, d - -> Seq Scan on ndistinct -(10 rows) + -> Sort + Sort Key: b, c, d + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Partial GroupAggregate + Group Key: b, c, d + -> Sort + Sort Key: b, c, d + -> Seq Scan on ndistinct +(12 rows) -- correct command CREATE STATISTICS s10 ON a, b, c FROM ndistinct; @@ -455,17 +459,21 @@ EXPLAIN (COSTS off) EXPLAIN (COSTS off) SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c; - QUERY PLAN ------------------------------------------------------------------ + QUERY PLAN +----------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) - -> Finalize HashAggregate + -> Finalize GroupAggregate Group Key: a, b, c - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: b - -> Partial HashAggregate - Group Key: a, b, c - -> Seq Scan on ndistinct -(8 rows) + -> Sort + Sort Key: a, b, c + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Partial GroupAggregate + Group Key: a, b, c + -> Sort + Sort Key: a, b, c + -> Seq Scan on ndistinct +(12 rows) EXPLAIN (COSTS off) SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c, d; @@ -483,17 +491,21 @@ EXPLAIN (COSTS off) EXPLAIN (COSTS off) SELECT COUNT(*) FROM ndistinct GROUP BY b, c, d; - QUERY PLAN ------------------------------------------------------------------ + QUERY PLAN +----------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) - -> Finalize HashAggregate + -> Finalize 
GroupAggregate Group Key: b, c, d - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: d - -> Partial HashAggregate - Group Key: b, c, d - -> Seq Scan on ndistinct -(8 rows) + -> Sort + Sort Key: b, c, d + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: d + -> Partial GroupAggregate + Group Key: b, c, d + -> Sort + Sort Key: b, c, d + -> Seq Scan on ndistinct +(12 rows) EXPLAIN (COSTS off) SELECT COUNT(*) FROM ndistinct GROUP BY a, d; diff --git a/src/test/regress/expected/subselect_1.out b/src/test/regress/expected/subselect_1.out index a63fb3c4..50633a31 100644 --- a/src/test/regress/expected/subselect_1.out +++ b/src/test/regress/expected/subselect_1.out @@ -853,11 +853,11 @@ explain (verbose, costs off) select * from int4_tbl where (case when f1 in (select unique1 from tenk1 a) then f1 else null end) in (select ten from tenk1 b); - QUERY PLAN ---------------------------------------------------------------------------------------------------------------- + QUERY PLAN +--------------------------------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) Output: int4_tbl.f1 - -> Nested Loop Semi Join + -> Hash Join Output: int4_tbl.f1 Join Filter: ((CASE WHEN (hashed SubPlan 1) THEN int4_tbl.f1 ELSE NULL::integer END) = b.ten) -> Remote Subquery Scan on all (datanode_1) @@ -872,12 +872,23 @@ select * from int4_tbl where Output: a.unique1 -> Materialize Output: b.ten - -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> HashAggregate Output: b.ten - Distribute results by H: ten - -> Seq Scan on public.tenk1 b + Group Key: b.ten + -> Remote Subquery Scan on all (datanode_1,datanode_2) Output: b.ten -(22 rows) + Distribute results by H: ten + -> HashAggregate + Output: b.ten + Group Key: b.ten + -> Seq Scan on public.tenk1 b + Output: b.ten + SubPlan 1 + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: a.unique1 + -> Seq Scan on public.tenk1 a + Output: a.unique1 +(26 rows) select * from int4_tbl where (case when f1 in (select unique1 from tenk1 a) then f1 else null end) in diff --git a/src/test/regress/expected/xc_FQS_join_1.out b/src/test/regress/expected/xc_FQS_join_1.out index 4cc96cde..c80fb0f2 100644 --- a/src/test/regress/expected/xc_FQS_join_1.out +++ b/src/test/regress/expected/xc_FQS_join_1.out @@ -313,30 +313,24 @@ select * from tab3_rep natural join tab4_rep explain (num_nodes on, nodes off, costs off, verbose on) select * from tab3_rep natural join tab4_rep where tab3_rep.val > 2 and tab4_rep.val < 5; - QUERY PLAN ------------------------------------------------------------------------------------ - Merge Join + QUERY PLAN +---------------------------------------------------------------------------------- + Hash Join Output: tab3_rep.val, tab3_rep.val2 - Merge Cond: ((tab3_rep.val = tab4_rep.val) AND (tab3_rep.val2 = tab4_rep.val2)) + Hash Cond: ((tab3_rep.val = tab4_rep.val) AND (tab3_rep.val2 = tab4_rep.val2)) -> Remote Subquery Scan on all Output: tab3_rep.val, tab3_rep.val2 - -> Sort + -> Seq Scan on public.tab3_rep Output: tab3_rep.val, tab3_rep.val2 - Sort Key: tab3_rep.val, tab3_rep.val2 - -> Seq Scan on public.tab3_rep - Output: tab3_rep.val, tab3_rep.val2 - Filter: (tab3_rep.val > 2) - -> Materialize + Filter: (tab3_rep.val > 2) + -> Hash Output: tab4_rep.val, tab4_rep.val2 -> Remote Subquery Scan on all Output: tab4_rep.val, tab4_rep.val2 - -> Sort + -> Seq Scan on public.tab4_rep Output: tab4_rep.val, tab4_rep.val2 
- Sort Key: tab4_rep.val, tab4_rep.val2 - -> Seq Scan on public.tab4_rep - Output: tab4_rep.val, tab4_rep.val2 - Filter: (tab4_rep.val < 5) -(21 rows) + Filter: (tab4_rep.val < 5) +(15 rows) -- Join involving one distributed and one replicated table, with replicated -- table existing on all nodes where distributed table exists. should be @@ -392,31 +386,24 @@ select * from tab1_mod natural join tab4_rep explain (verbose on, nodes off, costs off) select * from tab1_mod natural join tab4_rep where tab1_mod.val > 2 and tab4_rep.val < 4; - QUERY PLAN ------------------------------------------------------------------------------------ - Merge Join + QUERY PLAN +---------------------------------------------------------------------------------- + Hash Join Output: tab1_mod.val, tab1_mod.val2 - Merge Cond: ((tab1_mod.val = tab4_rep.val) AND (tab1_mod.val2 = tab4_rep.val2)) + Hash Cond: ((tab1_mod.val = tab4_rep.val) AND (tab1_mod.val2 = tab4_rep.val2)) -> Remote Subquery Scan on all Output: tab1_mod.val, tab1_mod.val2 - Sort Key: tab1_mod.val, tab1_mod.val2 - -> Sort + -> Seq Scan on public.tab1_mod Output: tab1_mod.val, tab1_mod.val2 - Sort Key: tab1_mod.val, tab1_mod.val2 - -> Seq Scan on public.tab1_mod - Output: tab1_mod.val, tab1_mod.val2 - Filter: (tab1_mod.val > 2) - -> Materialize + Filter: (tab1_mod.val > 2) + -> Hash Output: tab4_rep.val, tab4_rep.val2 -> Remote Subquery Scan on all Output: tab4_rep.val, tab4_rep.val2 - -> Sort + -> Seq Scan on public.tab4_rep Output: tab4_rep.val, tab4_rep.val2 - Sort Key: tab4_rep.val, tab4_rep.val2 - -> Seq Scan on public.tab4_rep - Output: tab4_rep.val, tab4_rep.val2 - Filter: (tab4_rep.val < 4) -(22 rows) + Filter: (tab4_rep.val < 4) +(15 rows) -- Join involving two distributed tables, never shipped select * from tab1_mod natural join tab2_mod @@ -432,31 +419,25 @@ select * from tab1_mod natural join tab2_mod explain (verbose on, nodes off, costs off) select * from tab1_mod natural join tab2_mod where tab1_mod.val > 2 and tab2_mod.val < 4; - QUERY PLAN ------------------------------------------------------------------------------------------ + QUERY PLAN +---------------------------------------------------------------------------------------- Remote Subquery Scan on all Output: tab1_mod.val, tab1_mod.val2 - -> Merge Join + -> Hash Join Output: tab1_mod.val, tab1_mod.val2 - Merge Cond: ((tab1_mod.val = tab2_mod.val) AND (tab1_mod.val2 = tab2_mod.val2)) - -> Sort + Hash Cond: ((tab1_mod.val = tab2_mod.val) AND (tab1_mod.val2 = tab2_mod.val2)) + -> Seq Scan on public.tab1_mod Output: tab1_mod.val, tab1_mod.val2 - Sort Key: tab1_mod.val, tab1_mod.val2 - -> Seq Scan on public.tab1_mod - Output: tab1_mod.val, tab1_mod.val2 - Filter: (tab1_mod.val > 2) - -> Materialize + Filter: (tab1_mod.val > 2) + -> Hash Output: tab2_mod.val, tab2_mod.val2 -> Remote Subquery Scan on all Output: tab2_mod.val, tab2_mod.val2 Distribute results by M: val - -> Sort + -> Seq Scan on public.tab2_mod Output: tab2_mod.val, tab2_mod.val2 - Sort Key: tab2_mod.val, tab2_mod.val2 - -> Seq Scan on public.tab2_mod - Output: tab2_mod.val, tab2_mod.val2 - Filter: (tab2_mod.val < 4) -(22 rows) + Filter: (tab2_mod.val < 4) +(16 rows) -- Join involving a distributed table and two replicated tables, such that the -- distributed table exists only on nodes common to replicated tables, try few @@ -605,15 +586,15 @@ explain (verbose on, nodes off, costs off, num_nodes on) select * from tab1_mod Sort Output: tab1_mod.val, tab1_mod.val2, tab1_mod.val2 Sort Key: tab1_mod.val2 - -> Hash Join + -> 
Nested Loop Output: tab1_mod.val, tab1_mod.val2, tab1_mod.val2 - Hash Cond: (tab1_mod.val2 = tab4_rep.val2) + Join Filter: (tab1_mod.val2 = tab4_rep.val2) -> Remote Subquery Scan on all Output: tab1_mod.val, tab1_mod.val2 -> Seq Scan on public.tab1_mod Output: tab1_mod.val, tab1_mod.val2 Filter: (tab1_mod.val = 1) - -> Hash + -> Materialize Output: tab4_rep.val, tab4_rep.val2 -> Remote Subquery Scan on all Output: tab4_rep.val, tab4_rep.val2 @@ -641,15 +622,15 @@ explain (verbose on, nodes off, costs off, num_nodes on) select * from tab1_mod ------------------------------------------------------------------------- Remote Subquery Scan on all Output: tab1_mod.val2, tab1_mod.val, tab2_mod.val, tab1_mod.val - -> Hash Join + -> Nested Loop Output: tab1_mod.val2, tab1_mod.val, tab2_mod.val, tab1_mod.val - Hash Cond: (tab1_mod.val2 = tab2_mod.val2) + Join Filter: (tab1_mod.val2 = tab2_mod.val2) -> Remote Subquery Scan on all Output: tab1_mod.val2, tab1_mod.val -> Seq Scan on public.tab1_mod Output: tab1_mod.val2, tab1_mod.val Filter: (tab1_mod.val = 1) - -> Hash + -> Materialize Output: tab2_mod.val, tab2_mod.val2 -> Seq Scan on public.tab2_mod Output: tab2_mod.val, tab2_mod.val2 diff --git a/src/test/regress/expected/xc_for_update_1.out b/src/test/regress/expected/xc_for_update_1.out index 8f2b3800..69bd0130 100644 --- a/src/test/regress/expected/xc_for_update_1.out +++ b/src/test/regress/expected/xc_for_update_1.out @@ -209,47 +209,47 @@ ERROR: FOR UPDATE is not allowed with joins explain (costs off, num_nodes off, nodes off, verbose on) select * from t1, t2, t3 for share of t1,t2 nowait; ERROR: FOR SHARE is not allowed with joins explain (costs off, num_nodes off, nodes off, verbose on) select * from t1 join t2 on (t1.val2 = t2.val2) join t3 on (t1.val2 = t3.val2); - QUERY PLAN ---------------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------------------- Remote Subquery Scan on all Output: t1.val, t1.val2, t2.val, t2.val2, t3.val, t3.val2 -> Merge Join Output: t1.val, t1.val2, t2.val, t2.val2, t3.val, t3.val2 - Merge Cond: (t3.val2 = t1.val2) - -> Remote Subquery Scan on all - Output: t3.val, t3.val2 - Distribute results by H: val2 - Sort Key: t3.val2 - -> Sort - Output: t3.val, t3.val2 - Sort Key: t3.val2 - -> Seq Scan on public.t3 - Output: t3.val, t3.val2 - -> Materialize + Merge Cond: (t1.val2 = t3.val2) + -> Merge Join Output: t1.val, t1.val2, t2.val, t2.val2 - -> Merge Join - Output: t1.val, t1.val2, t2.val, t2.val2 - Merge Cond: (t1.val2 = t2.val2) - -> Remote Subquery Scan on all + Merge Cond: (t1.val2 = t2.val2) + -> Remote Subquery Scan on all + Output: t1.val, t1.val2 + Distribute results by H: val2 + Sort Key: t1.val2 + -> Sort Output: t1.val, t1.val2 - Distribute results by H: val2 Sort Key: t1.val2 - -> Sort + -> Seq Scan on public.t1 Output: t1.val, t1.val2 - Sort Key: t1.val2 - -> Seq Scan on public.t1 - Output: t1.val, t1.val2 - -> Materialize + -> Materialize + Output: t2.val, t2.val2 + -> Remote Subquery Scan on all Output: t2.val, t2.val2 - -> Remote Subquery Scan on all + Distribute results by H: val2 + Sort Key: t2.val2 + -> Sort Output: t2.val, t2.val2 - Distribute results by H: val2 Sort Key: t2.val2 - -> Sort + -> Seq Scan on public.t2 Output: t2.val, t2.val2 - Sort Key: t2.val2 - -> Seq Scan on public.t2 - Output: t2.val, t2.val2 + -> Materialize + Output: t3.val, t3.val2 + -> Remote Subquery Scan on all + Output: t3.val, t3.val2 + Distribute results by H: val2 + Sort Key: t3.val2 + -> Sort + 
Output: t3.val, t3.val2 + Sort Key: t3.val2 + -> Seq Scan on public.t3 + Output: t3.val, t3.val2 (39 rows) explain (costs off, num_nodes off, nodes off, verbose on) select * from t1 join t2 on (t1.val2 = t2.val2) join t3 on (t1.val2 = t3.val2) for update; @@ -262,12 +262,12 @@ select * from t1 join t2 on (t1.val2 = t2.val2) join t3 on (t1.val2 = t3.val2); val | val2 | val | val2 | val | val2 -----+------+-----+------+-----+------ 1 | 11 | 3 | 11 | 5 | 11 - 1 | 11 | 4 | 11 | 5 | 11 - 2 | 11 | 3 | 11 | 5 | 11 - 2 | 11 | 4 | 11 | 5 | 11 1 | 11 | 3 | 11 | 6 | 11 + 1 | 11 | 4 | 11 | 5 | 11 1 | 11 | 4 | 11 | 6 | 11 + 2 | 11 | 3 | 11 | 5 | 11 2 | 11 | 3 | 11 | 6 | 11 + 2 | 11 | 4 | 11 | 5 | 11 2 | 11 | 4 | 11 | 6 | 11 (8 rows) diff --git a/src/test/regress/expected/xc_groupby_1.out b/src/test/regress/expected/xc_groupby_1.out index cbc7d0c4..8db42b7f 100644 --- a/src/test/regress/expected/xc_groupby_1.out +++ b/src/test/regress/expected/xc_groupby_1.out @@ -85,30 +85,22 @@ explain (verbose true, costs false, nodes false) select count(*), sum(xc_groupby -> Partial HashAggregate Output: xc_groupby_tab1.val2, xc_groupby_tab2.val2, PARTIAL count(*), PARTIAL sum((xc_groupby_tab1.val * xc_groupby_tab2.val)), PARTIAL avg((xc_groupby_tab1.val * xc_groupby_tab2.val)) Group Key: xc_groupby_tab1.val2, xc_groupby_tab2.val2 - -> Merge Full Join + -> Hash Full Join Output: xc_groupby_tab1.val2, xc_groupby_tab2.val2, xc_groupby_tab1.val, xc_groupby_tab2.val - Merge Cond: (xc_groupby_tab1.val2 = xc_groupby_tab2.val2) + Hash Cond: (xc_groupby_tab1.val2 = xc_groupby_tab2.val2) -> Remote Subquery Scan on all Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 Distribute results by H: val2 - Sort Key: xc_groupby_tab1.val2 - -> Sort + -> Seq Scan on public.xc_groupby_tab1 Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 - Sort Key: xc_groupby_tab1.val2 - -> Seq Scan on public.xc_groupby_tab1 - Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 - -> Materialize + -> Hash Output: xc_groupby_tab2.val, xc_groupby_tab2.val2 -> Remote Subquery Scan on all Output: xc_groupby_tab2.val, xc_groupby_tab2.val2 Distribute results by H: val2 - Sort Key: xc_groupby_tab2.val2 - -> Sort + -> Seq Scan on public.xc_groupby_tab2 Output: xc_groupby_tab2.val, xc_groupby_tab2.val2 - Sort Key: xc_groupby_tab2.val2 - -> Seq Scan on public.xc_groupby_tab2 - Output: xc_groupby_tab2.val, xc_groupby_tab2.val2 -(34 rows) +(26 rows) -- aggregates over aggregates select sum(y) from (select sum(val) y, val2%2 x from xc_groupby_tab1 group by val2) q1 group by x order by 1; @@ -2125,30 +2117,22 @@ explain (verbose true, costs false, nodes false) select count(*), sum(xc_groupby -> Sort Output: xc_groupby_tab1.val2, xc_groupby_tab2.val2, xc_groupby_tab1.val, xc_groupby_tab2.val Sort Key: xc_groupby_tab1.val2, xc_groupby_tab2.val2 - -> Merge Full Join + -> Hash Full Join Output: xc_groupby_tab1.val2, xc_groupby_tab2.val2, xc_groupby_tab1.val, xc_groupby_tab2.val - Merge Cond: (xc_groupby_tab1.val2 = xc_groupby_tab2.val2) + Hash Cond: (xc_groupby_tab1.val2 = xc_groupby_tab2.val2) -> Remote Subquery Scan on all Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 Distribute results by H: val2 - Sort Key: xc_groupby_tab1.val2 - -> Sort + -> Seq Scan on public.xc_groupby_tab1 Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 - Sort Key: xc_groupby_tab1.val2 - -> Seq Scan on public.xc_groupby_tab1 - Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 - -> Materialize + -> Hash Output: xc_groupby_tab2.val, xc_groupby_tab2.val2 -> Remote Subquery Scan on all Output: 
xc_groupby_tab2.val, xc_groupby_tab2.val2 Distribute results by H: val2 - Sort Key: xc_groupby_tab2.val2 - -> Sort + -> Seq Scan on public.xc_groupby_tab2 Output: xc_groupby_tab2.val, xc_groupby_tab2.val2 - Sort Key: xc_groupby_tab2.val2 - -> Seq Scan on public.xc_groupby_tab2 - Output: xc_groupby_tab2.val, xc_groupby_tab2.val2 -(40 rows) +(32 rows) -- aggregates over aggregates select sum(y) from (select sum(val) y, val2%2 x from xc_groupby_tab1 group by val2) q1 group by x; @@ -3736,30 +3720,22 @@ explain (verbose true, costs false, nodes false) select count(*), sum(xc_groupby -> Partial HashAggregate Output: xc_groupby_tab1.val2, xc_groupby_tab2.val2, PARTIAL count(*), PARTIAL sum((xc_groupby_tab1.val * xc_groupby_tab2.val)), PARTIAL avg((xc_groupby_tab1.val * xc_groupby_tab2.val)) Group Key: xc_groupby_tab1.val2, xc_groupby_tab2.val2 - -> Merge Full Join + -> Hash Full Join Output: xc_groupby_tab1.val2, xc_groupby_tab2.val2, xc_groupby_tab1.val, xc_groupby_tab2.val - Merge Cond: (xc_groupby_tab1.val2 = xc_groupby_tab2.val2) + Hash Cond: (xc_groupby_tab1.val2 = xc_groupby_tab2.val2) -> Remote Subquery Scan on all Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 Distribute results by H: val2 - Sort Key: xc_groupby_tab1.val2 - -> Sort + -> Seq Scan on public.xc_groupby_tab1 Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 - Sort Key: xc_groupby_tab1.val2 - -> Seq Scan on public.xc_groupby_tab1 - Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 - -> Materialize + -> Hash Output: xc_groupby_tab2.val, xc_groupby_tab2.val2 -> Remote Subquery Scan on all Output: xc_groupby_tab2.val, xc_groupby_tab2.val2 Distribute results by H: val2 - Sort Key: xc_groupby_tab2.val2 - -> Sort + -> Seq Scan on public.xc_groupby_tab2 Output: xc_groupby_tab2.val, xc_groupby_tab2.val2 - Sort Key: xc_groupby_tab2.val2 - -> Seq Scan on public.xc_groupby_tab2 - Output: xc_groupby_tab2.val, xc_groupby_tab2.val2 -(34 rows) +(26 rows) -- aggregates over aggregates select sum(y) from (select sum(val) y, val2%2 x from xc_groupby_tab1 group by val2) q1 group by x order by 1; @@ -5824,30 +5800,22 @@ explain (verbose true, costs false, nodes false) select count(*), sum(xc_groupby -> Sort Output: xc_groupby_tab1.val2, xc_groupby_tab2.val2, xc_groupby_tab1.val, xc_groupby_tab2.val Sort Key: xc_groupby_tab1.val2, xc_groupby_tab2.val2 - -> Merge Full Join + -> Hash Full Join Output: xc_groupby_tab1.val2, xc_groupby_tab2.val2, xc_groupby_tab1.val, xc_groupby_tab2.val - Merge Cond: (xc_groupby_tab1.val2 = xc_groupby_tab2.val2) + Hash Cond: (xc_groupby_tab1.val2 = xc_groupby_tab2.val2) -> Remote Subquery Scan on all Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 Distribute results by H: val2 - Sort Key: xc_groupby_tab1.val2 - -> Sort + -> Seq Scan on public.xc_groupby_tab1 Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 - Sort Key: xc_groupby_tab1.val2 - -> Seq Scan on public.xc_groupby_tab1 - Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 - -> Materialize + -> Hash Output: xc_groupby_tab2.val, xc_groupby_tab2.val2 -> Remote Subquery Scan on all Output: xc_groupby_tab2.val, xc_groupby_tab2.val2 Distribute results by H: val2 - Sort Key: xc_groupby_tab2.val2 - -> Sort + -> Seq Scan on public.xc_groupby_tab2 Output: xc_groupby_tab2.val, xc_groupby_tab2.val2 - Sort Key: xc_groupby_tab2.val2 - -> Seq Scan on public.xc_groupby_tab2 - Output: xc_groupby_tab2.val, xc_groupby_tab2.val2 -(40 rows) +(32 rows) -- aggregates over aggregates select sum(y) from (select sum(val) y, val2%2 x from xc_groupby_tab1 group by val2) q1 group by 
x; diff --git a/src/test/regress/expected/xc_having_1.out b/src/test/regress/expected/xc_having_1.out index f12d97f9..93469960 100644 --- a/src/test/regress/expected/xc_having_1.out +++ b/src/test/regress/expected/xc_having_1.out @@ -151,34 +151,26 @@ explain (verbose true, costs false, nodes false) select count(*), sum(xc_having_ ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- Remote Subquery Scan on all Output: count(*), sum((xc_having_tab1.val * xc_having_tab2.val)), avg((xc_having_tab1.val * xc_having_tab2.val)), ((sum((xc_having_tab1.val * xc_having_tab2.val)))::double precision / (count(*))::double precision), xc_having_tab1.val2, xc_having_tab2.val2 - -> GroupAggregate + -> HashAggregate Output: count(*), sum((xc_having_tab1.val * xc_having_tab2.val)), avg((xc_having_tab1.val * xc_having_tab2.val)), ((sum((xc_having_tab1.val * xc_having_tab2.val)))::double precision / (count(*))::double precision), xc_having_tab1.val2, xc_having_tab2.val2 Group Key: xc_having_tab1.val2, xc_having_tab2.val2 - -> Merge Join + -> Hash Join Output: xc_having_tab1.val2, xc_having_tab2.val2, xc_having_tab1.val, xc_having_tab2.val - Merge Cond: (xc_having_tab1.val2 = xc_having_tab2.val2) + Hash Cond: (xc_having_tab1.val2 = xc_having_tab2.val2) Join Filter: ((xc_having_tab1.val2 + xc_having_tab2.val2) > 2) -> Remote Subquery Scan on all Output: xc_having_tab1.val, xc_having_tab1.val2 Distribute results by H: val2 - Sort Key: xc_having_tab1.val2 - -> Sort + -> Seq Scan on public.xc_having_tab1 Output: xc_having_tab1.val, xc_having_tab1.val2 - Sort Key: xc_having_tab1.val2 - -> Seq Scan on public.xc_having_tab1 - Output: xc_having_tab1.val, xc_having_tab1.val2 - -> Materialize + -> Hash Output: xc_having_tab2.val, xc_having_tab2.val2 -> Remote Subquery Scan on all Output: xc_having_tab2.val, xc_having_tab2.val2 Distribute results by H: val2 - Sort Key: xc_having_tab2.val2 - -> Sort + -> Seq Scan on public.xc_having_tab2 Output: xc_having_tab2.val, xc_having_tab2.val2 - Sort Key: xc_having_tab2.val2 - -> Seq Scan on public.xc_having_tab2 - Output: xc_having_tab2.val, xc_having_tab2.val2 -(29 rows) +(21 rows) -- group by and having, without aggregate in the target list select val2 from xc_having_tab1 group by val2 having sum(val) > 8; diff --git a/src/test/regress/expected/xl_join.out b/src/test/regress/expected/xl_join.out index 463e1baa..6369183d 100644 --- a/src/test/regress/expected/xl_join.out +++ b/src/test/regress/expected/xl_join.out @@ -8,25 +8,25 @@ EXPLAIN (COSTS OFF) SELECT * FROM xl_join_t1 INNER JOIN xl_join_t2 ON xl_join_t1.val1 = xl_join_t2.val2 INNER JOIN xl_join_t3 ON xl_join_t1.val1 = xl_join_t3.val1; - QUERY PLAN ------------------------------------------------------------------------------ + QUERY PLAN +--------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) -> Merge Join - Merge Cond: (xl_join_t3.val1 = xl_join_t1.val1) - -> Sort - Sort Key: xl_join_t3.val1 - -> Seq Scan on xl_join_t3 + Merge Cond: (xl_join_t2.val2 = xl_join_t1.val1) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: val2 + -> Sort + Sort Key: xl_join_t2.val2 + -> Seq Scan on xl_join_t2 -> Materialize -> Merge Join - Merge Cond: (xl_join_t2.val2 = xl_join_t1.val1) - -> Remote Subquery Scan on all 
(datanode_1,datanode_2) - Distribute results by H: val2 - -> Sort - Sort Key: xl_join_t2.val2 - -> Seq Scan on xl_join_t2 + Merge Cond: (xl_join_t1.val1 = xl_join_t3.val1) -> Sort Sort Key: xl_join_t1.val1 -> Seq Scan on xl_join_t1 + -> Sort + Sort Key: xl_join_t3.val1 + -> Seq Scan on xl_join_t3 (17 rows) SELECT * FROM xl_join_t1 From 84c9dda7b6de41ff2e2e28acac00222c91af2c39 Mon Sep 17 00:00:00 2001 From: ericxwu Date: Mon, 29 Jun 2020 11:09:35 +0800 Subject: [PATCH 007/578] adjust set_join_distribution to include replicate small rel cost --- src/backend/optimizer/util/pathnode.c | 36 +++++++++++++++++++++------ 1 file changed, 29 insertions(+), 7 deletions(-) diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index 586a595c..c1c50654 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -1422,6 +1422,13 @@ redistribute_path(PlannerInfo *root, Path *subpath, List *pathkeys, Distribution *distribution = NULL; RelOptInfo *rel = subpath->parent; RemoteSubPath *pathnode; +#ifdef __TBASE__ + int num_replication; + + /* IsLocatorNone() also indicates we are replicating through input nodes */ + num_replication = (IsLocatorReplicated(distributionType) || + IsLocatorNone(distributionType)) ? bms_num_members(nodes) : 1; +#endif if (distributionType != LOCATOR_TYPE_NONE) { @@ -1467,8 +1474,12 @@ redistribute_path(PlannerInfo *root, Path *subpath, List *pathkeys, /* (re)calculate costs */ cost_remote_subplan((Path *) pathnode, subpath->startup_cost, subpath->total_cost, subpath->rows, rel->reltarget->width, +#ifdef __TBASE__ + num_replication); +#else IsLocatorReplicated(distributionType) ? bms_num_members(nodes) : 1); +#endif mpath->subpath = (Path *) pathnode; cost_material(&mpath->path, pathnode->path.startup_cost, @@ -1532,8 +1543,12 @@ redistribute_path(PlannerInfo *root, Path *subpath, List *pathkeys, cost_remote_subplan((Path *) pathnode, input_startup_cost, input_total_cost, subpath->rows, rel->reltarget->width, +#ifdef __TBASE__ + num_replication); +#else IsLocatorReplicated(distributionType) ? bms_num_members(nodes) : 1); +#endif return (Path *) pathnode; } } @@ -2256,6 +2271,7 @@ set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) nodes = bms_add_member(nodes, i); #ifdef __TBASE__ + /* check if we can distribute by shard */ if (OidIsValid(group)) { int node_index; @@ -2275,9 +2291,9 @@ set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) } /* - * if one of both is smaller enough, - * replicate the small one instead of redistribute both - */ + * if any side is smaller enough, replicate the smaller one + * instead of redistribute both of them. + */ if(inner_size * outer_nodes < inner_size + outer_size && (pathnode->jointype != JOIN_RIGHT && pathnode->jointype != JOIN_FULL) && outerd->distributionType != LOCATOR_TYPE_REPLICATED && !redistribute_inner && @@ -2373,7 +2389,10 @@ set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) if (new_inner_key) { #ifdef __TBASE__ - /* replicate outer rel */ + /* + * replicate outer rel, just set LOCATOR_TYPE_NONE to remove + * the path distribution. 
+ */ if(replicate_outer) { pathnode->outerjoinpath = redistribute_path( @@ -2382,7 +2401,7 @@ set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) outerpathkeys, LOCATOR_TYPE_NONE, NULL, - NULL, + innerd->nodes, NULL); if (IsA(pathnode, MergePath)) @@ -2414,7 +2433,10 @@ set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) if (new_outer_key) { #ifdef __TBASE__ - /* replicate inner rel */ + /* + * replicate inner rel, just set LOCATOR_TYPE_NONE to remove + * the path distribution. + */ if(replicate_inner) { pathnode->innerjoinpath = redistribute_path( @@ -2423,7 +2445,7 @@ set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) innerpathkeys, LOCATOR_TYPE_NONE, NULL, - NULL, + outerd->nodes, NULL); if (IsA(pathnode, MergePath)) From 4360b39b684291a520ca358761a1e3480e9d6577 Mon Sep 17 00:00:00 2001 From: branwu Date: Mon, 29 Jun 2020 14:49:03 +0800 Subject: [PATCH 008/578] fix bugs on tbase online upgrade.ID80501281 --- .../tbase_upgrade_spec/2.15.12_after_start.sh | 39 ++++++++++++++----- .../tbase_upgrade_spec/2.15.12_before_stop.sh | 36 +++++++++++++---- 2 files changed, 59 insertions(+), 16 deletions(-) diff --git a/src/backend/utils/tbase_upgrade_spec/2.15.12_after_start.sh b/src/backend/utils/tbase_upgrade_spec/2.15.12_after_start.sh index e4aaa208..1e7d4c45 100644 --- a/src/backend/utils/tbase_upgrade_spec/2.15.12_after_start.sh +++ b/src/backend/utils/tbase_upgrade_spec/2.15.12_after_start.sh @@ -13,18 +13,39 @@ execSql() { local sql="$1" local db=$2 - export LD_LIBRARY_PATH=${bin_dir}/lib:${LD_LIBRARY_PATH} && export PATH=${bin_dir}/bin:${PATH} && $bin_dir/bin/psql -h $host -p $port -d $db -U $user -t -c "$sql" | sed '/^\s*$/d' + local node_host="$3" + local node_port=$4 + export LD_LIBRARY_PATH=${bin_dir}/lib:${LD_LIBRARY_PATH} + export PATH=${bin_dir}/bin:${PATH} + $bin_dir/bin/psql -h $node_host -p $node_port -d $db -U $user -t -c "$sql" | sed '/^\s*$/d' } -dbs=$(execSql "select datname from pg_database where datname !='template0'" $dbname) -for db in ${dbs} -do - echo $db - execSql "create extension pg_stat_log" $db +getSeg() +{ + line="$1" + segNum="$2" + + seg=$(echo "$line" | awk -F'|' '{print $segNum}' "segNum=$segNum") + seg=$(echo $seg) + echo $seg +} + +nodeinfo=$(execSql "select node_host, node_port from pgxc_node where node_type='C' order by node_port asc limit 1" $dbname $host $port) + +num=$(echo "$nodeinfo" | wc -l) +for ((i=1; i<=num; ++i)); do + node_host=$(getSeg "$nodeinfo" 1) + node_port=$(getSeg "$nodeinfo" 2) + dbs=$(execSql "select datname from pg_database where datname !='template0'" $dbname $node_host $node_port) + for db in ${dbs} + do + execSql "create extension if not exists pg_stat_log" $db $node_host $node_port if [ $? 
-eq 0 ] then - echo "create pg_stat_log on $host:$port:$db success" + echo "create pg_stat_log on $node_host:$node_port:$db success" else - echo "create pg_stat_log on $host:$port:$db failed" + echo "create pg_stat_log on $node_host:$node_port:$db failed" fi -done \ No newline at end of file + done +done + diff --git a/src/backend/utils/tbase_upgrade_spec/2.15.12_before_stop.sh b/src/backend/utils/tbase_upgrade_spec/2.15.12_before_stop.sh index a110a5a3..58e6b62c 100644 --- a/src/backend/utils/tbase_upgrade_spec/2.15.12_before_stop.sh +++ b/src/backend/utils/tbase_upgrade_spec/2.15.12_before_stop.sh @@ -13,17 +13,39 @@ execSql() { local sql="$1" local db=$2 - export LD_LIBRARY_PATH=${bin_dir}/lib:${LD_LIBRARY_PATH} && export PATH=${bin_dir}/bin:${PATH} && $bin_dir/bin/psql -h $host -p $port -d $db -U $user -t -c "$sql" | sed '/^\s*$/d' + local node_host="$3" + local node_port=$4 + export LD_LIBRARY_PATH=${bin_dir}/lib:${LD_LIBRARY_PATH} + export PATH=${bin_dir}/bin:${PATH} + $bin_dir/bin/psql -h $node_host -p $node_port -d $db -U $user -t -c "$sql" | sed '/^\s*$/d' } -dbs=$(execSql "select datname from pg_database where datname !='template0'" $dbname) -for db in ${dbs} -do - execSql "drop extension pg_stat_log" $db +getSeg() +{ + line="$1" + segNum="$2" + + seg=$(echo "$line" | awk -F'|' '{print $segNum}' "segNum=$segNum") + seg=$(echo $seg) + echo $seg +} + +nodeinfo=$(execSql "select node_host, node_port from pgxc_node where node_type='C' order by node_port asc limit 1" $dbname $host $port) + +num=$(echo "$nodeinfo" | wc -l) +for ((i=1; i<=num; ++i)); do + node_host=$(getSeg "$nodeinfo" 1) + node_port=$(getSeg "$nodeinfo" 2) + dbs=$(execSql "select datname from pg_database where datname !='template0'" $dbname $node_host $node_port) + for db in ${dbs} + do + execSql "drop extension if exists pg_stat_log" $db $node_host $node_port if [ $? 
-eq 0 ] then - echo "drop pg_stat_log on $host:$port:$db success" + echo "drop pg_stat_log on $node_host:$node_port:$db success" else - echo "drop pg_stat_log on $host:$port:$db failed" + echo "drop pg_stat_log on $node_host:$node_port:$db failed" fi + done done + From 52a3acd2afcbc5897a01c023013116df836cb4c7 Mon Sep 17 00:00:00 2001 From: aidenma Date: Mon, 29 Jun 2020 21:24:02 +0800 Subject: [PATCH 009/578] Fallback slave phyiscal replication fail problem tapd:http://tapd.tencent.com/pgxz/bugtrace/bugs/view?bug_id=1110092131078795210 --- src/bin/pg_basebackup/pg_basebackup.c | 167 +++----------------------- 1 file changed, 18 insertions(+), 149 deletions(-) diff --git a/src/bin/pg_basebackup/pg_basebackup.c b/src/bin/pg_basebackup/pg_basebackup.c index fa586130..2f5717e2 100644 --- a/src/bin/pg_basebackup/pg_basebackup.c +++ b/src/bin/pg_basebackup/pg_basebackup.c @@ -1751,25 +1751,21 @@ WriteRecoveryConf(void) static void BaseBackup(void) -{// #lizard forgives - PGresult *res; - char *sysidentifier; - TimeLineID latesttli; - TimeLineID starttli; - char *basebkp; - char escaped_label[MAXPGPATH]; - char *maxrate_clause = NULL; - int i; - char xlogstart[64]; - char xlogend[64]; - int minServerMajor, - maxServerMajor; - int serverVersion, - serverMajor; - PGconn *connDev = NULL; - PGresult *resDev = NULL; - char connInfo[MAXPGPATH]; - char *default_dbname = " dbname=postgres"; +{ + PGresult *res; + char *sysidentifier; + TimeLineID latesttli; + TimeLineID starttli; + char *basebkp; + char escaped_label[MAXPGPATH]; + char *maxrate_clause = NULL; + int i; + char xlogstart[64]; + char xlogend[64]; + int minServerMajor, + maxServerMajor; + int serverVersion, + serverMajor; Assert(conn != NULL); @@ -1867,136 +1863,9 @@ BaseBackup(void) disconnect_and_exit(1); } - /* - * found connstr is contain 'dbname' in pg_basebackup use -d parmas - */ - memset(connInfo, '\0', sizeof(connInfo)); - if (NULL != connection_string) - { - if (NULL != strstr(connection_string,"dbname")) - { - snprintf(connInfo, sizeof(connInfo), "%s",connection_string); - - } - else - { - snprintf(connInfo, sizeof(connInfo), "%s %s",connection_string, default_dbname); - } - } - /* - * found connstr is contain 'dbname' in pg_basebackup not use -d parmas| use -U -h -p parmas - */ - else if ((NULL != dbname) || (NULL != dbport) || (NULL != dbuser)) - { - if(NULL == dbname) - { - snprintf(connInfo, sizeof(connInfo), "host=%s port=%s user=%s %s", dbhost, dbport, dbuser, default_dbname); - } - else - { - snprintf(connInfo, sizeof(connInfo), "host=%s port=%s user=%s dbname=%s", dbhost, dbport, dbuser, dbname); - } - } - connDev = PQconnectdb(connInfo); - - if (PQstatus(connDev) != CONNECTION_OK) - { - fprintf(stderr, "Connection to database failed: %s\n", - PQerrorMessage(connDev)); - disconnect_and_exit(1); - } - resDev = PQexec(connDev, "select restart_lsn from pg_replication_slots order by restart_lsn asc limit 1"); - memset(xlogstart, '\0', sizeof(xlogstart)); - if (PQntuples(resDev) == 0) - { - strlcpy(xlogstart, PQgetvalue(res, 0, 0), sizeof(xlogstart)); - } - else - { - strlcpy(xlogstart, PQgetvalue(resDev, 0, 0), sizeof(xlogstart)); - fprintf(stderr, _("%s: In pg_replication_slots restartlsn exchange write-ahead log start point: %s\n"), - progname, xlogstart); - } - - if (NULL != resDev) - { - PQclear(resDev); - } - - if (NULL != connDev) - { - PQfinish(connDev); - } - - - if (verbose) - fprintf(stderr, _("%s: checkpoint completed\n"), progname); - - /* - * 9.3 and later sends the TLI of the starting point. 
With older servers, - * assume it's the same as the latest timeline reported by - * IDENTIFY_SYSTEM. - */ - if (PQnfields(res) >= 2) - starttli = atoi(PQgetvalue(res, 0, 1)); - else - starttli = latesttli; - PQclear(res); - MemSet(xlogend, 0, sizeof(xlogend)); - - if (verbose && includewal != NO_WAL) - fprintf(stderr, _("%s: write-ahead log start point: %s on timeline %u\n"), - progname, xlogstart, starttli); - - /* - * Get the header - */ - res = PQgetResult(conn); - if (PQresultStatus(res) != PGRES_TUPLES_OK) - { - fprintf(stderr, _("%s: could not get backup header: %s"), - progname, PQerrorMessage(conn)); - disconnect_and_exit(1); - } - if (PQntuples(res) < 1) - { - fprintf(stderr, _("%s: no data returned from server\n"), progname); - disconnect_and_exit(1); - } - - /* - * Sum up the total size, for progress reporting - */ - totalsize = totaldone = 0; - tablespacecount = PQntuples(res); - for (i = 0; i < PQntuples(res); i++) - { - totalsize += atol(PQgetvalue(res, i, 2)); - - /* - * Verify tablespace directories are empty. Don't bother with the - * first once since it can be relocated, and it will be checked before - * we do anything anyway. - */ - if (format == 'p' && !PQgetisnull(res, i, 1)) - { - char *path = (char *) get_tablespace_mapping(PQgetvalue(res, i, 1)); - - verify_dir_is_empty_or_create(path, &made_tablespace_dirs, &found_tablespace_dirs); - } - } - - /* - * When writing to stdout, require a single tablespace - */ - if (format == 't' && strcmp(basedir, "-") == 0 && PQntuples(res) > 1) - { - fprintf(stderr, - _("%s: can only write single tablespace to stdout, database has %d\n"), - progname, PQntuples(res)); - disconnect_and_exit(1); - } - + /* start_point: get last checkpoint point position from master */ + strlcpy(xlogstart, PQgetvalue(res, 0, 0), sizeof(xlogstart)); + /* * If we're streaming WAL, start the streaming session before we start * receiving the actual data chunks. From 467725d7a84d1f533cc80702585936945d3add7b Mon Sep 17 00:00:00 2001 From: ericxwu Date: Mon, 29 Jun 2020 21:54:07 +0800 Subject: [PATCH 010/578] consider nestloop inner plan materialization cost Nestloop join will add material plannode atop of inner subplan. This is checked and added during create plan phase. But we did not consider the cost in cost modle. I think the code was originally copied from merge join, but forgot to copy the cost modle change in final_cost_mergejoin. --- src/backend/optimizer/path/costsize.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index 64872e08..7de5eaa4 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -2345,6 +2345,25 @@ final_cost_nestloop(PlannerInfo *root, NestPath *path, startup_cost += path->path.pathtarget->cost.startup; run_cost += path->path.pathtarget->cost.per_tuple * path->path.rows; +#ifdef __TBASE__ + /* + * While NestLoop is executed it rescans inner plan. We do not want to + * rescan RemoteSubplan and do not support it. So if inner_plan is a + * RemoteSubplan, materialize it. + * + * We add materialize plannode during the create plan phase to avoid + * other optimizer side affect. But we still need to add the cost here + * just like mergejoin did when considering materialize_inner flag. + * During join reordering phase, there should be no other node between + * current nestloop and RemoteSubPath. Thus we do not need to traverse + * the whole subpath to find RemoteSubPath. 
+ */ + if (IsA(inner_path, RemoteSubPath)) + { + run_cost += cpu_operator_cost * inner_path_rows; + } +#endif + path->path.startup_cost = startup_cost; path->path.total_cost = startup_cost + run_cost; } From aadbcd3569cc9410bd7dadc25d24ced0e2bf30a7 Mon Sep 17 00:00:00 2001 From: ericxwu Date: Tue, 30 Jun 2020 15:07:48 +0800 Subject: [PATCH 011/578] Fix ReinitializeParallelDSM to tolerate finding no error queues.(Merge Postgres) Commit d4663350646ca0c069a36d906155a0f7e3372eb7 changed things so that shm_toc_lookup would fail with an error rather than silently returning NULL in the hope that such failures would be reported in a useful way rather than via a system crash. However, it overlooked the fact that the lookup of PARALLEL_KEY_ERROR_QUEUE in ReinitializeParallelDSM is expected to fail when no DSM segment was created in the first place; in that case, we end up with a backend-private memory segment that still contains an entry for PARALLEL_KEY_FIXED but no others. Consequently a benign failure to initialize parallelism can escalate into an elog(ERROR); repair. --- src/backend/access/transam/parallel.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/backend/access/transam/parallel.c b/src/backend/access/transam/parallel.c index 8cccbfb0..4cffc98a 100644 --- a/src/backend/access/transam/parallel.c +++ b/src/backend/access/transam/parallel.c @@ -420,9 +420,10 @@ ReinitializeParallelDSM(ParallelContext *pcxt) fps = shm_toc_lookup(pcxt->toc, PARALLEL_KEY_FIXED, false); fps->last_xlog_end = 0; - /* Recreate error queues. */ + /* Recreate error queues (if they exist). */ error_queue_space = - shm_toc_lookup(pcxt->toc, PARALLEL_KEY_ERROR_QUEUE, false); + shm_toc_lookup(pcxt->toc, PARALLEL_KEY_ERROR_QUEUE, true); + Assert(pcxt->nworkers == 0 || error_queue_space != NULL); for (i = 0; i < pcxt->nworkers; ++i) { char *start; From 0368d60dc3ce55c32c508a9a2aeeb94599abc605 Mon Sep 17 00:00:00 2001 From: ericxwu Date: Tue, 30 Jun 2020 15:12:19 +0800 Subject: [PATCH 012/578] Be more wary about shm_toc_lookup failure.(Merge Postgres) Commit 445dbd82a basically missed the point of commit d46633506, which was that we shouldn't allow shm_toc_lookup() failure to lead to a core dump or assertion crash, because the odds of such a failure should never be considered negligible. It's correct that we can't expect the PARALLEL_KEY_ERROR_QUEUE TOC entry to be there if we have no workers. But if we have no workers, we're not going to do anything in this function with the lookup result anyway, so let's just skip it. That lets the code use the easy-to-prove-safe noError=false case, rather than anything requiring effort to review. Back-patch to v10, like the previous commit. Discussion: https://postgr.es/m/3647.1517601675@sss.pgh.pa.us --- src/backend/access/transam/parallel.c | 29 +++++++++++++++------------ 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/src/backend/access/transam/parallel.c b/src/backend/access/transam/parallel.c index 4cffc98a..84256fe1 100644 --- a/src/backend/access/transam/parallel.c +++ b/src/backend/access/transam/parallel.c @@ -405,8 +405,6 @@ void ReinitializeParallelDSM(ParallelContext *pcxt) { FixedParallelState *fps; - char *error_queue_space; - int i; /* Wait for any old workers to exit. */ if (pcxt->nworkers_launched > 0) @@ -421,18 +419,23 @@ ReinitializeParallelDSM(ParallelContext *pcxt) fps->last_xlog_end = 0; /* Recreate error queues (if they exist). 
*/ - error_queue_space = - shm_toc_lookup(pcxt->toc, PARALLEL_KEY_ERROR_QUEUE, true); - Assert(pcxt->nworkers == 0 || error_queue_space != NULL); - for (i = 0; i < pcxt->nworkers; ++i) + if (pcxt->nworkers > 0) { - char *start; - shm_mq *mq; - - start = error_queue_space + i * PARALLEL_ERROR_QUEUE_SIZE; - mq = shm_mq_create(start, PARALLEL_ERROR_QUEUE_SIZE); - shm_mq_set_receiver(mq, MyProc); - pcxt->worker[i].error_mqh = shm_mq_attach(mq, pcxt->seg, NULL); + char *error_queue_space; + int i; + + error_queue_space = + shm_toc_lookup(pcxt->toc, PARALLEL_KEY_ERROR_QUEUE, false); + for (i = 0; i < pcxt->nworkers; ++i) + { + char *start; + shm_mq *mq; + + start = error_queue_space + i * PARALLEL_ERROR_QUEUE_SIZE; + mq = shm_mq_create(start, PARALLEL_ERROR_QUEUE_SIZE); + shm_mq_set_receiver(mq, MyProc); + pcxt->worker[i].error_mqh = shm_mq_attach(mq, pcxt->seg, NULL); + } } } From 28ae14ba9851b828f8d5ca56f6af28050ace68c5 Mon Sep 17 00:00:00 2001 From: ericxwu Date: Mon, 13 Jul 2020 20:21:02 +0800 Subject: [PATCH 013/578] refactor distributed transaction related functions 1. extract SetPlpgsqlTransactionBegin() 2. extract tuple visibility debugging functions 3. mask debugging functions in tuple visibility core path 4. other code format refactor --- src/backend/access/transam/gtm.c | 4 +- src/backend/access/transam/twophase.c | 25 +- src/backend/access/transam/xact.c | 43 +- src/backend/pgxc/pool/execRemote.c | 956 +++++----- src/backend/pgxc/pool/pgxcnode.c | 7 +- src/backend/storage/lmgr/nodelock.c | 8 +- src/backend/tcop/postgres.c | 98 +- src/backend/utils/cache/syscache.c | 2 +- src/backend/utils/time/tqual.c | 2463 +++++++++++-------------- src/include/utils/snapshot.h | 17 +- 10 files changed, 1668 insertions(+), 1955 deletions(-) diff --git a/src/backend/access/transam/gtm.c b/src/backend/access/transam/gtm.c index 47b698d4..4545592b 100644 --- a/src/backend/access/transam/gtm.c +++ b/src/backend/access/transam/gtm.c @@ -200,7 +200,9 @@ void RegisterRenameSequence(char *new, char *old) rename_info = (RenameInfo *) lfirst(cell); if (0 == strncmp(rename_info->new, old, GTM_NAME_LEN)) { - elog(LOG, "Combine requence seq:%s ->:%s, %s->%s to old:%s latest new:%s", rename_info->new, rename_info->old, new, old, rename_info->old, new); + elog(LOG, "Combine requence seq:%s ->:%s, %s->%s to old:%s latest " + "new:%s", rename_info->new, rename_info->old, new, old, + rename_info->old, new); snprintf(rename_info->new, GTM_NAME_LEN, "%s", new); return; } diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c index b2e35941..204e9edd 100644 --- a/src/backend/access/transam/twophase.c +++ b/src/backend/access/transam/twophase.c @@ -406,7 +406,8 @@ PostPrepare_Twophase(void) #ifdef __SUPPORT_DISTRIBUTED_TRANSACTION__ -static void RecoverEndGlobalPrepare(GlobalTransaction gxact) +static void +RecoverEndGlobalPrepare(GlobalTransaction gxact) { volatile PGXACT *pgxact = &ProcGlobal->allPgXact[gxact->pgprocno]; @@ -416,17 +417,17 @@ static void RecoverEndGlobalPrepare(GlobalTransaction gxact) } - -void EndGlobalPrepare(GlobalTransaction gxact, bool isImplicit) -{ - volatile PGXACT *pgxact = &ProcGlobal->allPgXact[gxact->pgprocno]; - - pg_atomic_write_u64(&pgxact->prepare_timestamp, GetGlobalPrepareTimestamp()); - if(enable_distri_print) - { - elog(LOG, "proc no %d prepare timestamp " INT64_FORMAT " xid %d.", gxact->pgprocno, - GetGlobalPrepareTimestamp(), pgxact->xid); - } +void +EndGlobalPrepare(GlobalTransaction gxact, bool isImplicit) +{ + volatile PGXACT *pgxact = 
&ProcGlobal->allPgXact[gxact->pgprocno]; + + pg_atomic_write_u64(&pgxact->prepare_timestamp, GetGlobalPrepareTimestamp()); + if(enable_distri_print) + { + elog(LOG, "proc no %d prepare timestamp " INT64_FORMAT " xid %d.", gxact->pgprocno, + GetGlobalPrepareTimestamp(), pgxact->xid); + } if(isImplicit && !GlobalTimestampIsValid(pg_atomic_read_u64(&pgxact->prepare_timestamp))) { diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index 993d0c52..5369958d 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -353,6 +353,14 @@ static TimestampTz xactStartTimestamp; static TimestampTz stmtStartTimestamp; static TimestampTz xactStopTimestamp; + +#ifdef __SUPPORT_DISTRIBUTED_TRANSACTION__ +static GlobalTimestamp XactGlobalCommitTimestamp = 0; +static GlobalTimestamp XactGlobalPrepareTimestamp = 0; +static GlobalTimestamp XactLocalCommitTimestamp = 0; +static GlobalTimestamp XactLocalPrepareTimestamp = 0; +#endif + /* * PGXC receives from GTM a timestamp value at the same time as a GXID * This one is set as GTMxactStartTimestamp and is a return value of now(), current_transaction(). @@ -361,14 +369,6 @@ static TimestampTz xactStopTimestamp; * during a transaction. Delta can have a different value through the nodes of the cluster * but its uniqueness in the cluster is maintained thanks to the global value GTMxactStartTimestamp. */ -#ifdef __SUPPORT_DISTRIBUTED_TRANSACTION__ -static GlobalTimestamp XactGlobalCommitTimestamp = 0; -static GlobalTimestamp XactGlobalPrepareTimestamp = 0; -static GlobalTimestamp XactLocalCommitTimestamp = 0; -static GlobalTimestamp XactLocalPrepareTimestamp = 0; - -#endif - #ifdef PGXC static TimestampTz GTMxactStartTimestamp = 0; static TimestampTz GTMdeltaTimestamp = 0; @@ -7969,7 +7969,6 @@ NeedBeginTxn(void) return ret; } - bool NeedBeginSubTxn(void) { @@ -7987,19 +7986,21 @@ NeedBeginSubTxn(void) void SetNodeBeginTxn(Oid nodeoid) { - TransactionState s = &TopTransactionStateData; - MemoryContext oldcontext = NULL; - - if (!InPlpgsqlFunc() || s->nestingLevel != 1) - { - elog(PANIC,"SetNodeBeginTxn should only called in plpgsql exec env and TopmostTxn"); - } + TransactionState s = &TopTransactionStateData; + MemoryContext oldcontext = NULL; + + if (!InPlpgsqlFunc() || s->nestingLevel != 1) + { + elog(PANIC,"SetNodeBeginTxn should only called in plpgsql exec env and " + "TopmostTxn"); + } - oldcontext = MemoryContextSwitchTo(TopTransactionContext); - - s->node_has_begin_txn_list = list_append_unique_oid(s->node_has_begin_txn_list, nodeoid); - - MemoryContextSwitchTo(oldcontext); + oldcontext = MemoryContextSwitchTo(TopTransactionContext); + + s->node_has_begin_txn_list = + list_append_unique_oid(s->node_has_begin_txn_list, nodeoid); + + MemoryContextSwitchTo(oldcontext); } void diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 4183be8b..72aa55f1 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -3425,11 +3425,13 @@ pgxc_node_begin(int conn_count, PGXCNodeHandle **connections, } /* Send timestamp and check for errors */ - if (GlobalTimestampIsValid(timestamp) && pgxc_node_send_timestamp(connections[i], timestamp)) - { - elog(WARNING, "pgxc_node_begin sending timestamp fails: local start timestamp" INT64_FORMAT, timestamp); - return EOF; - } + if (GlobalTimestampIsValid(timestamp) && + pgxc_node_send_timestamp(connections[i], timestamp)) + { + elog(WARNING, "pgxc_node_begin sending timestamp fails: local start" + " 
timestamp" INT64_FORMAT, timestamp); + return EOF; + } if (IS_PGXC_DATANODE && GlobalTransactionIdIsValid(gxid)) { need_tran_block = true; @@ -3445,64 +3447,76 @@ pgxc_node_begin(int conn_count, PGXCNodeHandle **connections, need_send_begin = true; } - if (connections[i]->plpgsql_need_begin_txn && connections[i]->plpgsql_need_begin_sub_txn && 'I' == connections[i]->transaction_status) - { - need_send_begin = true; - cmd = begin_both_cmd; - connections[i]->plpgsql_need_begin_txn = false; - connections[i]->plpgsql_need_begin_sub_txn = false; - if (PlpgsqlDebugPrint) - { - elog(LOG, "[PLPGSQL] pgxc_node_begin cmd:%s conn->plpgsql_need_begin_txn was true, and conn->plpgsql_need_begin_sub_txn was true. in_plpgsql_exec_fun:%d", - cmd, g_in_plpgsql_exec_fun); - } - } - else if (connections[i]->plpgsql_need_begin_txn && 'I' == connections[i]->transaction_status) - { - need_send_begin = true; - connections[i]->plpgsql_need_begin_txn = false; - if (PlpgsqlDebugPrint) - { - elog(LOG, "[PLPGSQL] pgxc_node_begin cmd:%s conn->plpgsql_need_begin_txn was true, g_in_plpgsql_exec_fun:%d, conn->plpgsql_need_begin_sub_txn:%d", - cmd, g_in_plpgsql_exec_fun, connections[i]->plpgsql_need_begin_sub_txn); - } - } - else if (connections[i]->plpgsql_need_begin_sub_txn) - { - need_send_begin = true; - cmd = begin_subtxn_cmd; - connections[i]->plpgsql_need_begin_sub_txn = false; - if (PlpgsqlDebugPrint) - { - elog(LOG, "[PLPGSQL] pgxc_node_begin cmd:%s conn->plpgsql_need_begin_sub_txn was true, g_in_plpgsql_exec_fun:%d, conn->plpgsql_need_begin_txn:%d", - cmd, g_in_plpgsql_exec_fun, connections[i]->plpgsql_need_begin_txn); - } - if ('T' != connections[i]->transaction_status) - { - elog(PANIC, "[PLPGSQL] pgxc_node_begin need_begin_sub_txn wrong transaction_status"); - } - } - + if (connections[i]->plpgsql_need_begin_txn && + connections[i]->plpgsql_need_begin_sub_txn && + 'I' == connections[i]->transaction_status) + { + need_send_begin = true; + cmd = begin_both_cmd; + connections[i]->plpgsql_need_begin_txn = false; + connections[i]->plpgsql_need_begin_sub_txn = false; + if (PlpgsqlDebugPrint) + { + elog(LOG, "[PLPGSQL] pgxc_node_begin cmd:%s conn->plpgsql_need_begin_txn " + "was true, and conn->plpgsql_need_begin_sub_txn was true. 
" + "in_plpgsql_exec_fun:%d", cmd, g_in_plpgsql_exec_fun); + } + } + else if (connections[i]->plpgsql_need_begin_txn && + 'I' == connections[i]->transaction_status) + { + need_send_begin = true; + connections[i]->plpgsql_need_begin_txn = false; + if (PlpgsqlDebugPrint) + { + elog(LOG, "[PLPGSQL] pgxc_node_begin cmd:%s conn->plpgsql_need_begin_txn " + "was true, g_in_plpgsql_exec_fun:%d, conn->plpgsql_need_begin_sub_txn:%d", + cmd, g_in_plpgsql_exec_fun, connections[i]->plpgsql_need_begin_sub_txn); + } + } + else if (connections[i]->plpgsql_need_begin_sub_txn) + { + need_send_begin = true; + cmd = begin_subtxn_cmd; + connections[i]->plpgsql_need_begin_sub_txn = false; + if (PlpgsqlDebugPrint) + { + elog(LOG, "[PLPGSQL] pgxc_node_begin cmd:%s conn->plpgsql_need_begin_sub_txn was" + " true, g_in_plpgsql_exec_fun:%d, conn->plpgsql_need_begin_txn:%d", + cmd, g_in_plpgsql_exec_fun, connections[i]->plpgsql_need_begin_txn); + } + if ('T' != connections[i]->transaction_status) + { + elog(PANIC, "[PLPGSQL] pgxc_node_begin need_begin_sub_txn wrong" + "transaction_status"); + } + } - /* If exec savepoint command, we make sure begin should send(NB:can be sent only once) before send savepoint */ - if ('I' == connections[i]->transaction_status && SavepointDefined()) - { - need_send_begin = true; - } + /* + * If exec savepoint command, we make sure begin should send(NB:can be + * sent only once) before send savepoint + */ + if ('I' == connections[i]->transaction_status && SavepointDefined()) + { + need_send_begin = true; + } /* - * Send the Coordinator info down to the PGXC node at the beginning of transaction, - * In this way, Datanode can print this Coordinator info into logfile, - * and those infos can be found in Datanode logifile if needed during debugging + * Send the Coordinator info down to the PGXC node at the beginning of + * transaction, In this way, Datanode can print this Coordinator info + * into logfile, and those infos can be found in Datanode logifile if + * needed during debugging */ if (need_send_begin && IS_PGXC_COORDINATOR) { pgxc_node_send_coord_info(connections[i], MyProcPid, MyProc->lxid); } -#endif +#endif - elog(DEBUG5, "[PLPGSQL] pgxc_node_begin need_tran_block %d, connections[%d]->transaction_status %c need_send_begin:%d", - need_tran_block, i, connections[i]->transaction_status, need_send_begin); + elog(DEBUG5, "[PLPGSQL] pgxc_node_begin need_tran_block %d," + "connections[%d]->transaction_status %c need_send_begin:%d", + need_tran_block, i, connections[i]->transaction_status, + need_send_begin); /* Send BEGIN if not already in transaction */ if (need_send_begin) @@ -3513,8 +3527,9 @@ pgxc_node_begin(int conn_count, PGXCNodeHandle **connections, return EOF; } - elog(DEBUG5, "pgxc_node_begin send %s to node %s, pid:%d", cmd, connections[i]->nodename, connections[i]->backend_pid); - new_connections[new_count++] = connections[i]; + elog(DEBUG5, "pgxc_node_begin send %s to node %s, pid:%d", cmd, + connections[i]->nodename, connections[i]->backend_pid); + new_connections[new_count++] = connections[i]; } #if 0 @@ -4877,6 +4892,40 @@ pgxc_node_remote_commit(TranscationType txn_type, bool need_release_handle) } } +/* + * Set the node begein transaction in plpgsql function + */ +static void +SetPlpgsqlTransactionBegin(PGXCNodeHandle *conn) +{ + Oid nodeOid = conn->nodeoid; + + if (NeedBeginTxn() && !NodeHasBeginTxn(nodeOid)) + { + conn->plpgsql_need_begin_txn = true; + SetNodeBeginTxn(nodeOid); + if (PlpgsqlDebugPrint) + { + elog(LOG, "[PLPGSQL] ExecRemoteUtility conn nodename:%s " + 
"backendpid:%d sock:%d nodeoid:%u need_begin_txn", + conn->nodename, conn->backend_pid, conn->sock, + conn->nodeoid); + } + } + if (NeedBeginSubTxn() && !NodeHasBeginSubTxn(nodeOid)) + { + conn->plpgsql_need_begin_sub_txn = true; + SetNodeBeginSubTxn(nodeOid); + if (PlpgsqlDebugPrint) + { + elog(LOG, "[PLPGSQL] ExecRemoteUtility conn nodename:%s " + "backendpid:%d sock:%d nodeoid:%u need_begin_sub_txn", + conn->nodename, conn->backend_pid, conn->sock, + conn->nodeoid); + } + } +} + #ifdef __TWO_PHASE_TRANS__ void InitLocalTwoPhaseState(void) { @@ -6496,7 +6545,7 @@ ExecRemoteUtility(RemoteQuery *node) {// #lizard forgives RemoteQueryState *remotestate; ResponseCombiner *combiner; - bool force_autocommit = node->force_autocommit; + bool force_autocommit = node->force_autocommit; RemoteQueryExecType exec_type = node->exec_type; GlobalTransactionId gxid = InvalidGlobalTransactionId; Snapshot snapshot = NULL; @@ -6524,6 +6573,7 @@ ExecRemoteUtility(RemoteQuery *node) dn_conn_count = pgxc_connections->dn_conn_count; co_conn_count = pgxc_connections->co_conn_count; + /* exit right away if no nodes to run command on */ if (dn_conn_count == 0 && co_conn_count == 0) { @@ -6536,16 +6586,20 @@ ExecRemoteUtility(RemoteQuery *node) else need_tran_block = true; - /* Commands launched through EXECUTE DIRECT do not need start a transaction */ + /* + * Commands launched through EXECUTE DIRECT do not need start a + * transaction + */ if (exec_direct_type == EXEC_DIRECT_UTILITY) { need_tran_block = false; /* This check is not done when analyzing to limit dependencies */ if (IsTransactionBlock()) - ereport(ERROR, - (errcode(ERRCODE_ACTIVE_SQL_TRANSACTION), - errmsg("cannot run EXECUTE DIRECT with utility inside a transaction block"))); + ereport(ERROR, + (errcode(ERRCODE_ACTIVE_SQL_TRANSACTION), + errmsg("cannot run EXECUTE DIRECT with utility inside a " + "transaction block"))); } #ifdef __TBASE__ @@ -6558,150 +6612,89 @@ ExecRemoteUtility(RemoteQuery *node) if (ActiveSnapshotSet()) snapshot = GetActiveSnapshot(); + #ifdef __TBASE__ if (!ExecDDLWithoutAcquireXid(node->parsetree)) #endif { if (!GlobalTransactionIdIsValid(gxid)) - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Failed to get next transaction ID"))); + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to get next transaction ID"))); } #ifdef __SUPPORT_DISTRIBUTED_TRANSACTION__ if(!IS_PGXC_LOCAL_COORDINATOR) { - /* - * Global xid is not needed to send to remote nodes - * for connections from coord and datanode as - * normal DDLs except for set_config_option are all single level - * connections from Coords executing distributed DDLs. - */ + /* + * Distributed DDLs only dispatch from the requested coordinator, thus + * we skip sending gxid to avoid cycling. + * + * Note: except for 'set_config_option'. 
+ */ gxid = InvalidTransactionId; } #endif #ifdef __TBASE__ + /* Set node begin transaction in plpgsql function for CN/DN */ + for (i = 0; i < dn_conn_count; i++) { - Oid nodeoid = InvalidOid; - PGXCNodeHandle *conn = NULL; - for (i = 0; i < dn_conn_count; i++) - { - conn = pgxc_connections->datanode_handles[i]; - nodeoid = conn->nodeoid; - if (NeedBeginTxn() && !NodeHasBeginTxn(nodeoid)) - { - conn->plpgsql_need_begin_txn = true; - SetNodeBeginTxn(nodeoid); - if (PlpgsqlDebugPrint) - { - elog(LOG, "[PLPGSQL] ExecRemoteUtility conn nodename:%s backendpid:%d sock:%d nodeoid:%u need_begin_txn", - conn->nodename, conn->backend_pid, conn->sock, conn->nodeoid); - } - } - if (NeedBeginSubTxn() && !NodeHasBeginSubTxn(nodeoid)) - { - conn->plpgsql_need_begin_sub_txn = true; - SetNodeBeginSubTxn(nodeoid); - if (PlpgsqlDebugPrint) - { - elog(LOG, "[PLPGSQL] ExecRemoteUtility conn nodename:%s backendpid:%d sock:%d nodeoid:%u need_begin_sub_txn", - conn->nodename, conn->backend_pid, conn->sock, conn->nodeoid); - } - } - } - - for (i = 0; i < co_conn_count; i++) - { - conn = pgxc_connections->coord_handles[i]; - nodeoid = conn->nodeoid; - if (NeedBeginTxn() && !NodeHasBeginTxn(nodeoid)) - { - conn->plpgsql_need_begin_txn = true; - SetNodeBeginTxn(nodeoid); - if (PlpgsqlDebugPrint) - { - elog(LOG, "[PLPGSQL] ExecRemoteUtility conn nodename:%s backendpid:%d sock:%d nodeoid:%u need_begin_txn", - conn->nodename, conn->backend_pid, conn->sock, conn->nodeoid); - } - } - if (NeedBeginSubTxn() && !NodeHasBeginSubTxn(nodeoid)) - { - conn->plpgsql_need_begin_sub_txn = true; - SetNodeBeginSubTxn(nodeoid); - if (PlpgsqlDebugPrint) - { - elog(LOG, "[PLPGSQL] ExecRemoteUtility conn nodename:%s backendpid:%d sock:%d nodeoid:%u need_begin_sub_txn", - conn->nodename, conn->backend_pid, conn->sock, conn->nodeoid); - } - } - } - } + SetPlpgsqlTransactionBegin(pgxc_connections->datanode_handles[i]); + } + + for (i = 0; i < co_conn_count; i++) + { + SetPlpgsqlTransactionBegin(pgxc_connections->coord_handles[i]); + } #endif + /* + * DDL will firstly be executed on coordinators then datanodes + * which will avoid deadlocks in cluster. + * Let us assume that user sql and ddl hold conflict locks, + * then there will be two situations: + * 1. The coordinator is not locked, user sql will see datanodes with no lock. + * 2. The coordinator is locked, user sql will wait for ddl to complete. 
+ * + * Send BEGIN control command to all coordinator nodes + */ + if (pgxc_node_begin(co_conn_count, + pgxc_connections->coord_handles, + gxid, + need_tran_block, + false, + PGXC_NODE_COORDINATOR)) { - if (pgxc_node_begin(dn_conn_count, pgxc_connections->datanode_handles, - gxid, need_tran_block, false, PGXC_NODE_DATANODE)) - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Could not begin transaction on Datanodes"))); - for (i = 0; i < dn_conn_count; i++) - { - PGXCNodeHandle *conn = pgxc_connections->datanode_handles[i]; - - if (conn->state == DN_CONNECTION_STATE_QUERY) - BufferConnection(conn); - if (snapshot && pgxc_node_send_snapshot(conn, snapshot)) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Failed to send snapshot to Datanodes"))); - } - if (pgxc_node_send_cmd_id(conn, cid) < 0) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Failed to send command ID to Datanodes"))); - } - - if (pgxc_node_send_query(conn, node->sql_statement) != 0) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Failed to send command to Datanodes"))); - } - } + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Could not begin transaction on coordinators"))); } + /* Send other txn related messages to coordinator nodes */ + for (i = 0; i < co_conn_count; i++) { - if (pgxc_node_begin(co_conn_count, pgxc_connections->coord_handles, - gxid, need_tran_block, false, PGXC_NODE_COORDINATOR)) + PGXCNodeHandle *conn = pgxc_connections->coord_handles[i]; + + if (snapshot && pgxc_node_send_snapshot(conn, snapshot)) + { ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Could not begin transaction on coordinators"))); - /* Now send it to Coordinators if necessary */ - for (i = 0; i < co_conn_count; i++) + errmsg("Failed to send command to coordinators"))); + } + if (pgxc_node_send_cmd_id(conn, cid) < 0) { - if (snapshot && pgxc_node_send_snapshot(pgxc_connections->coord_handles[i], snapshot)) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Failed to send command to coordinators"))); - } - if (pgxc_node_send_cmd_id(pgxc_connections->coord_handles[i], cid) < 0) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Failed to send command ID to Datanodes"))); - } + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to send command ID to Datanodes"))); + } - if (pgxc_node_send_query(pgxc_connections->coord_handles[i], node->sql_statement) != 0) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Failed to send command to coordinators"))); - } + if (pgxc_node_send_query(conn, node->sql_statement) != 0) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to send command to coordinators"))); } } @@ -6709,125 +6702,188 @@ ExecRemoteUtility(RemoteQuery *node) * Stop if all commands are completed or we got a data row and * initialized state node for subsequent invocations */ + while (co_conn_count > 0) { - while (dn_conn_count > 0) - { - int i = 0; + int i = 0; - if (pgxc_node_receive(dn_conn_count, pgxc_connections->datanode_handles, NULL)) - break; + /* Wait until one of the connections has data available */ + if (pgxc_node_receive(co_conn_count, + pgxc_connections->coord_handles, + NULL)) + { /* - * Handle input from the Datanodes. - * We do not expect Datanodes returning tuples when running utility - * command. - * If we got EOF, move to the next connection, will receive more - * data on the next iteration. 
+ * Got error + * TODO(Tbase): How do we check the error here? */ - while (i < dn_conn_count) - { - PGXCNodeHandle *conn = pgxc_connections->datanode_handles[i]; - int res = handle_response(conn, combiner); - if (res == RESPONSE_EOF) - { - i++; - } - else if (res == RESPONSE_COMPLETE) - { - /* Ignore, wait for ReadyForQuery */ - if (conn->state == DN_CONNECTION_STATE_ERROR_FATAL) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Unexpected FATAL ERROR on Connection to Datanode %s pid %d", - conn->nodename, conn->backend_pid))); - } - } - else if (res == RESPONSE_ERROR) - { - /* Ignore, wait for ReadyForQuery */ - } - else if (res == RESPONSE_READY) - { - if (i < --dn_conn_count) - pgxc_connections->datanode_handles[i] = - pgxc_connections->datanode_handles[dn_conn_count]; - } - else if (res == RESPONSE_TUPDESC) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Unexpected response from Datanode"))); - } - else if (res == RESPONSE_DATAROW) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Unexpected response from Datanode"))); - } - } + break; } - } - /* Make the same for Coordinators */ - { - while (co_conn_count > 0) + while (i < co_conn_count) { - int i = 0; - - if (pgxc_node_receive(co_conn_count, pgxc_connections->coord_handles, NULL)) - break; + PGXCNodeHandle *conn = pgxc_connections->coord_handles[i]; + int res = handle_response(conn, combiner); - while (i < co_conn_count) + if (res == RESPONSE_EOF) { - int res = handle_response(pgxc_connections->coord_handles[i], combiner); - if (res == RESPONSE_EOF) - { - i++; - } - else if (res == RESPONSE_COMPLETE) - { - /* Ignore, wait for ReadyForQuery */ - if (pgxc_connections->coord_handles[i]->state == DN_CONNECTION_STATE_ERROR_FATAL) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Unexpected FATAL ERROR on Connection to Coordinator %s pid %d", - pgxc_connections->coord_handles[i]->nodename, pgxc_connections->coord_handles[i]->backend_pid))); - } - } - else if (res == RESPONSE_ERROR) - { - /* Ignore, wait for ReadyForQuery */ - } - else if (res == RESPONSE_READY) - { - if (i < --co_conn_count) - pgxc_connections->coord_handles[i] = - pgxc_connections->coord_handles[co_conn_count]; - } - else if (res == RESPONSE_TUPDESC) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Unexpected response from coordinator"))); - } - else if (res == RESPONSE_DATAROW) + i++; + } + else if (res == RESPONSE_COMPLETE) + { + /* Ignore, wait for ReadyForQuery */ + if (conn->state == DN_CONNECTION_STATE_ERROR_FATAL) { ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Unexpected response from coordinator"))); + errmsg("Unexpected FATAL ERROR on Connection to " + "Coordinator %s pid %d", + pgxc_connections->coord_handles[i]->nodename, + pgxc_connections->coord_handles[i]->backend_pid))); } } + else if (res == RESPONSE_ERROR) + { + /* Ignore, wait for ReadyForQuery */ + } + else if (res == RESPONSE_READY) + { + if (i < --co_conn_count) + pgxc_connections->coord_handles[i] = + pgxc_connections->coord_handles[co_conn_count]; + } + else if (res == RESPONSE_TUPDESC) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Unexpected response from coordinator"))); + } + else if (res == RESPONSE_DATAROW) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Unexpected response from coordinator"))); + } } } - /* - * We have processed all responses from nodes and if we have - * error message pending we can report it. 
All connections should be in - * consistent state now and so they can be released to the pool after ROLLBACK. - */ - pfree_pgxc_all_handles(pgxc_connections); - pgxc_node_report_error(combiner); + /* + * Send BEGIN control command to all data nodes + */ + if (pgxc_node_begin(dn_conn_count, + pgxc_connections->datanode_handles, + gxid, + need_tran_block, + false, + PGXC_NODE_DATANODE)) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Could not begin transaction on Datanodes"))); + } + + /* Send other txn related messages to data nodes */ + for (i = 0; i < dn_conn_count; i++) + { + PGXCNodeHandle *conn = pgxc_connections->datanode_handles[i]; + + if (conn->state == DN_CONNECTION_STATE_QUERY) + BufferConnection(conn); + if (snapshot && pgxc_node_send_snapshot(conn, snapshot)) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to send snapshot to Datanodes"))); + } + if (pgxc_node_send_cmd_id(conn, cid) < 0) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to send command ID to Datanodes"))); + } + + if (pgxc_node_send_query(conn, node->sql_statement) != 0) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to send command to Datanodes"))); + } + } + + + /* Make the same for data nodes */ + while (dn_conn_count > 0) + { + int i = 0; + + /* Wait until one of the connections has data available */ + if (pgxc_node_receive(dn_conn_count, + pgxc_connections->datanode_handles, + NULL)) + { + /* + * Got error + * TODO(Tbase): How do we check the error here? + */ + break; + } + + /* + * Handle input from the data nodes. We do not expect data nodes + * returning tuples when running utility command. If we got EOF, move + * to the next connection, will receive more data on the next + * iteration. + */ + while (i < dn_conn_count) + { + PGXCNodeHandle *conn = pgxc_connections->datanode_handles[i]; + int res = handle_response(conn, combiner); + if (res == RESPONSE_EOF) + { + i++; + } + else if (res == RESPONSE_COMPLETE) + { + /* Ignore, wait for ReadyForQuery */ + if (conn->state == DN_CONNECTION_STATE_ERROR_FATAL) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Unexpected FATAL ERROR on Connection to " + "Datanode %s pid %d", + conn->nodename, conn->backend_pid))); + } + } + else if (res == RESPONSE_ERROR) + { + /* Ignore, wait for ReadyForQuery */ + } + else if (res == RESPONSE_READY) + { + if (i < --dn_conn_count) + pgxc_connections->datanode_handles[i] = + pgxc_connections->datanode_handles[dn_conn_count]; + } + else if (res == RESPONSE_TUPDESC) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Unexpected response from Datanode"))); + } + else if (res == RESPONSE_DATAROW) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Unexpected response from Datanode"))); + } + } + } + + /* + * We have processed all responses from nodes and if we have error message + * pending we can report it. All connections should be in consistent state + * now and so they can be released to the pool after ROLLBACK. 
+ */ + pfree_pgxc_all_handles(pgxc_connections); + pgxc_node_report_error(combiner); } @@ -8583,25 +8639,24 @@ ExecRemoteQuery(PlanState *pstate) connections = pgxc_connections->datanode_handles; total_conn_count = regular_conn_count = pgxc_connections->dn_conn_count; #ifdef __TBASE__ - if (regular_conn_count > 1) - { - need_global_snapshot = true; - } - else if (regular_conn_count == 1 && !need_global_snapshot) - { - MemoryContext old; - - int nodeid = PGXCNodeGetNodeId(connections[0]->nodeoid, NULL); - - old = MemoryContextSwitchTo(TopTransactionContext); - executed_node_list = list_append_unique_int(executed_node_list, nodeid); - MemoryContextSwitchTo(old); - - if (list_length(executed_node_list) > 1) - { - need_global_snapshot = true; - } - } + if (regular_conn_count > 1) + { + need_global_snapshot = true; + } + else if (regular_conn_count == 1 && !need_global_snapshot) + { + int nodeid = PGXCNodeGetNodeId(connections[0]->nodeoid, NULL); + MemoryContext old = MemoryContextSwitchTo(TopTransactionContext); + + executed_node_list = list_append_unique_int(executed_node_list, nodeid); + + MemoryContextSwitchTo(old); + + if (list_length(executed_node_list) > 1) + { + need_global_snapshot = true; + } + } #endif } else if (step->exec_type == EXEC_ON_COORDS) @@ -8655,58 +8710,16 @@ ExecRemoteQuery(PlanState *pstate) (TransactionBlockStatusCode() == 'T'); #ifdef __TBASE__ -{ - Oid nodeoid = InvalidOid; - if (primaryconnection) - { - nodeoid = primaryconnection->nodeoid; - if (NeedBeginTxn() && !NodeHasBeginTxn(nodeoid)) - { - primaryconnection->plpgsql_need_begin_txn = true; - SetNodeBeginTxn(nodeoid ); - if (PlpgsqlDebugPrint) - { - elog(LOG, "[PLPGSQL] ExecRemoteQuery conn nodename:%s backendpid:%d sock:%d nodeoid:%u need_begin_txn", - primaryconnection->nodename, primaryconnection->backend_pid, primaryconnection->sock, primaryconnection->nodeoid); - } - } - if (NeedBeginSubTxn() && !NodeHasBeginSubTxn(nodeoid)) - { - primaryconnection->plpgsql_need_begin_sub_txn = true; - SetNodeBeginSubTxn(nodeoid); - if (PlpgsqlDebugPrint) - { - elog(LOG, "[PLPGSQL] ExecRemoteQuery conn nodename:%s backendpid:%d sock:%d nodeoid:%u need_begin_sub_txn", - primaryconnection->nodename, primaryconnection->backend_pid, primaryconnection->sock, primaryconnection->nodeoid); - } - } - } + /* Set plpgsql transaction begin for all connections */ + if (primaryconnection) + { + SetPlpgsqlTransactionBegin(primaryconnection); + } - for (i = 0; i < regular_conn_count; i++) - { - nodeoid = connections[i]->nodeoid; - if (NeedBeginTxn() && !NodeHasBeginTxn(nodeoid)) - { - connections[i]->plpgsql_need_begin_txn = true; - SetNodeBeginTxn(nodeoid); - if (PlpgsqlDebugPrint) - { - elog(LOG, "[PLPGSQL] ExecRemoteQuery conn nodename:%s backendpid:%d sock:%d nodeoid:%u need_begin_txn", - connections[i]->nodename, connections[i]->backend_pid, connections[i]->sock, connections[i]->nodeoid); - } - } - if (NeedBeginSubTxn() && !NodeHasBeginSubTxn(nodeoid)) - { - connections[i]->plpgsql_need_begin_sub_txn = true; - SetNodeBeginSubTxn(nodeoid); - if (PlpgsqlDebugPrint) - { - elog(LOG, "[PLPGSQL] ExecRemoteQuery conn nodename:%s backendpid:%d sock:%d nodeoid:%u need_begin_sub_txn", - connections[i]->nodename, connections[i]->backend_pid, connections[i]->sock, connections[i]->nodeoid); - } - } - } -} + for (i = 0; i < regular_conn_count; i++) + { + SetPlpgsqlTransactionBegin(connections[i]); + } #endif stat_statement(); stat_transaction(total_conn_count); @@ -8718,59 +8731,65 @@ ExecRemoteQuery(PlanState *pstate) //elog(LOG, 
"[PLPGSQL]ExecRemoteQuery has primaryconnection"); //primaryconnection->read_only = true; #ifdef __TBASE__ - combiner->connections = &primaryconnection; - combiner->conn_count = 1; - combiner->current_conn = 0; -#endif - if (pgxc_node_begin(1, &primaryconnection, gxid, need_tran_block, - step->read_only, PGXC_NODE_DATANODE)) - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Could not begin transaction on data node:%s.", primaryconnection->nodename))); - - /* If explicit transaction is needed gxid is already sent */ - if (!pgxc_start_command_on_connection(primaryconnection, node, snapshot)) - { - pgxc_node_remote_abort(TXN_TYPE_RollbackTxn, true); - pfree_pgxc_all_handles(pgxc_connections); - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Failed to send command to data nodes"))); - } - Assert(combiner->combine_type == COMBINE_TYPE_SAME); - - pgxc_node_receive(1, &primaryconnection, NULL); - /* Make sure the command is completed on the primary node */ - while (true) - { - int res = handle_response(primaryconnection, combiner); - if (res == RESPONSE_READY) - break; - else if (res == RESPONSE_EOF) - pgxc_node_receive(1, &primaryconnection, NULL); - else if (res == RESPONSE_COMPLETE || res == RESPONSE_ERROR) - { - if (res == RESPONSE_COMPLETE && primaryconnection->state == DN_CONNECTION_STATE_ERROR_FATAL) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Unexpected FATAL ERROR on Connection to Datanode %s pid %d", - primaryconnection->nodename, primaryconnection->backend_pid))); - - } - /* Get ReadyForQuery */ - continue; - } - else if (res == RESPONSE_ASSIGN_GXID) - continue; - else - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Unexpected response from data node"))); - } - if (combiner->errorMessage) - pgxc_node_report_error(combiner); - } + combiner->connections = &primaryconnection; + combiner->conn_count = 1; + combiner->current_conn = 0; +#endif + if (pgxc_node_begin(1, &primaryconnection, gxid, need_tran_block, + step->read_only, PGXC_NODE_DATANODE)) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Could not begin transaction on data node:%s.", + primaryconnection->nodename))); + + /* If explicit transaction is needed gxid is already sent */ + if (!pgxc_start_command_on_connection(primaryconnection, + node, + snapshot)) + { + pgxc_node_remote_abort(TXN_TYPE_RollbackTxn, true); + pfree_pgxc_all_handles(pgxc_connections); + + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to send command to data nodes"))); + } + Assert(combiner->combine_type == COMBINE_TYPE_SAME); + + pgxc_node_receive(1, &primaryconnection, NULL); + /* Make sure the command is completed on the primary node */ + while (true) + { + int res = handle_response(primaryconnection, combiner); + if (res == RESPONSE_READY) + break; + else if (res == RESPONSE_EOF) + pgxc_node_receive(1, &primaryconnection, NULL); + else if (res == RESPONSE_COMPLETE || res == RESPONSE_ERROR) + { + if (res == RESPONSE_COMPLETE && + primaryconnection->state == DN_CONNECTION_STATE_ERROR_FATAL) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Unexpected FATAL ERROR on Connection to Datanode %s pid %d", + primaryconnection->nodename, + primaryconnection->backend_pid))); + + } + /* Get ReadyForQuery */ + continue; + } + else if (res == RESPONSE_ASSIGN_GXID) + continue; + else + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Unexpected response from data node"))); + } + if (combiner->errorMessage) + pgxc_node_report_error(combiner); 
+ } #ifdef __TBASE__ if (regular_conn_count > 0) @@ -8784,33 +8803,35 @@ ExecRemoteQuery(PlanState *pstate) { //connections[i]->read_only = true; #ifdef __TBASE__ - connections[i]->recv_datarows = 0; -#endif - if (pgxc_node_begin(1, &connections[i], gxid, need_tran_block, - step->read_only, PGXC_NODE_DATANODE)) - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Could not begin transaction on data node:%s.", connections[i]->nodename))); - - /* If explicit transaction is needed gxid is already sent */ - if (!pgxc_start_command_on_connection(connections[i], node, snapshot)) - { - pgxc_node_remote_abort(TXN_TYPE_RollbackTxn, true); - pfree_pgxc_all_handles(pgxc_connections); - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Failed to send command to data nodes"))); - } - connections[i]->combiner = combiner; - } + connections[i]->recv_datarows = 0; +#endif + if (pgxc_node_begin(1, &connections[i], gxid, need_tran_block, + step->read_only, PGXC_NODE_DATANODE)) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Could not begin transaction on data node:%s.", + connections[i]->nodename))); + + /* If explicit transaction is needed gxid is already sent */ + if (!pgxc_start_command_on_connection(connections[i], node, snapshot)) + { + pgxc_node_remote_abort(TXN_TYPE_RollbackTxn, true); + pfree_pgxc_all_handles(pgxc_connections); + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to send command to data nodes"))); + } + connections[i]->combiner = combiner; + } - if (step->cursor) - { - combiner->cursor = step->cursor; - combiner->cursor_count = regular_conn_count; - combiner->cursor_connections = (PGXCNodeHandle **) palloc(regular_conn_count * sizeof(PGXCNodeHandle *)); - memcpy(combiner->cursor_connections, connections, regular_conn_count * sizeof(PGXCNodeHandle *)); - } + if (step->cursor) + { + int conn_size = regular_conn_count * sizeof(PGXCNodeHandle *); + combiner->cursor = step->cursor; + combiner->cursor_count = regular_conn_count; + combiner->cursor_connections = (PGXCNodeHandle **)palloc(conn_size); + memcpy(combiner->cursor_connections, connections, conn_size); + } combiner->connections = connections; combiner->conn_count = regular_conn_count; @@ -10170,37 +10191,13 @@ ExecFinishInitRemoteSubplan(RemoteSubplanState *node) !IsA(outerPlan(plan), ModifyTable); #ifdef __TBASE__ -{ - for (i = 0; i < combiner->conn_count; i++) - { - PGXCNodeHandle *connection_tmp = combiner->connections[i]; - Oid nodeoid = connection_tmp->nodeoid; - if (NeedBeginTxn() && !NodeHasBeginTxn(nodeoid)) - { - connection_tmp->plpgsql_need_begin_txn = true; - SetNodeBeginTxn(nodeoid ); - if (PlpgsqlDebugPrint) - { - elog(LOG, "[PLPGSQL] ExecFinishInitRemoteSubplan conn nodename:%s backendpid:%d sock:%d nodeoid:%u need_begin_txn", - connection_tmp->nodename, connection_tmp->backend_pid, connection_tmp->sock, connection_tmp->nodeoid); - } - } - if (NeedBeginSubTxn() && !NodeHasBeginSubTxn(nodeoid)) - { - connection_tmp->plpgsql_need_begin_sub_txn = true; - SetNodeBeginSubTxn(nodeoid); - if (PlpgsqlDebugPrint) - { - elog(LOG, "[PLPGSQL] ExecFinishInitRemoteSubplan conn nodename:%s backendpid:%d sock:%d nodeoid:%u need_begin_sub_txn", - connection_tmp->nodename, connection_tmp->backend_pid, connection_tmp->sock, connection_tmp->nodeoid); - } - } - } -} + /* Set plpgsql transaction begin for all connections */ + for (i = 0; i < combiner->conn_count; i++) + { + SetPlpgsqlTransactionBegin(combiner->connections[i]); + } #endif - - #if 0 for (i = 0; i < combiner->conn_count; 
i++) { @@ -10235,11 +10232,12 @@ ExecFinishInitRemoteSubplan(RemoteSubplanState *node) { PGXCNodeHandle *connection = combiner->connections[i]; - if (pgxc_node_begin(1, &connection, gxid, true, - is_read_only, PGXC_NODE_DATANODE)) - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Could not begin transaction on data node:%s.", connection->nodename))); + if (pgxc_node_begin(1, &connection, gxid, true, + is_read_only, PGXC_NODE_DATANODE)) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Could not begin transaction on data node:%s.", + connection->nodename))); if (pgxc_node_send_timestamp(connection, timestamp)) { @@ -10268,21 +10266,23 @@ ExecFinishInitRemoteSubplan(RemoteSubplanState *node) pgxc_node_send_plan(connection, cursor, "Remote Subplan", node->subplanstr, node->nParamRemote, paramtypes); - if (enable_statistic) - { - elog(LOG, "Plan Message:pid:%d,remote_pid:%d,remote_ip:%s,remote_port:%d,fd:%d,cursor:%s", - MyProcPid, connection->backend_pid, connection->nodehost, connection->nodeport, connection->sock, cursor); - } - - if (pgxc_node_flush(connection)) - { - combiner->conn_count = 0; - pfree(combiner->connections); - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Failed to send subplan to data nodes"))); - } - } + if (enable_statistic) + { + elog(LOG, "Plan Message:pid:%d,remote_pid:%d,remote_ip:%s," + "remote_port:%d,fd:%d,cursor:%s", + MyProcPid, connection->backend_pid, connection->nodehost, + connection->nodeport, connection->sock, cursor); + } + + if (pgxc_node_flush(connection)) + { + combiner->conn_count = 0; + pfree(combiner->connections); + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to send subplan to data nodes"))); + } + } } diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c index 7ca99de4..4279a325 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -708,10 +708,15 @@ pgxc_node_init(PGXCNodeHandle *handle, int sock, bool global_session, int pid) #endif } - /* * Wait while at least one of specified connections has data available and read * the data into the buffer + * + * Returning state code + * DNStatus_OK = 0, + * DNStatus_ERR = 1, + * DNStatus_EXPIRED = 2, + * DNStatus_BUTTY */ #ifdef __TBASE__ int diff --git a/src/backend/storage/lmgr/nodelock.c b/src/backend/storage/lmgr/nodelock.c index 25e44b5f..060e0a0b 100644 --- a/src/backend/storage/lmgr/nodelock.c +++ b/src/backend/storage/lmgr/nodelock.c @@ -905,10 +905,10 @@ bool NodeLock(char *lockActions, char objectType, char *param1, char *param2, in } /* check to see whether running transactions exist or not. - * if checkTimes is given, we will wait for checkTimes seconds at most. - * before time's up, if no running transactions, keep going; else fail to - * lock node - */ + * if checkTimes is given, we will wait for checkTimes seconds at most. 
+ * before time's up, if no running transactions, keep going; else fail to + * lock node + */ if (ret) { RunningTransactions running = NULL; diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index 0af8acfa..ce95d95d 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -5743,56 +5743,56 @@ PostgresMain(int argc, char *argv[], SetGlobalTimestamp(gts, SNAPSHOT_COORDINATOR); break; #ifdef __SUPPORT_DISTRIBUTED_TRANSACTION__ - case 'Z': /* global prepare timestamp */ - timestamp = (GlobalTimestamp) pq_getmsgint64(&input_message); - pq_getmsgend(&input_message); - - /* - * Set Xact global prepare timestamp - */ - if(enable_distri_print) - { - elog(LOG, "set global prepare gts " INT64_FORMAT, timestamp); - } - SetGlobalPrepareTimestamp(timestamp); - - break; - - - case 'T': /* global timestamp */ - timestamp = (GlobalTimestamp) pq_getmsgint64(&input_message); - pq_getmsgend(&input_message); - - /* - * Set Xact global commit timestamp - */ - if(enable_distri_print) - { - elog(LOG, "set global commit gts " INT64_FORMAT, timestamp); - } - SetGlobalCommitTimestamp(timestamp); - break; + case 'Z': /* global prepare timestamp */ + timestamp = (GlobalTimestamp) pq_getmsgint64(&input_message); + pq_getmsgend(&input_message); + + /* + * Set Xact global prepare timestamp + */ + if(enable_distri_print) + { + elog(LOG, "set global prepare gts " INT64_FORMAT, timestamp); + } + SetGlobalPrepareTimestamp(timestamp); + + break; - case 'G': /* Explicit prepared gid */ - { - const char *gid; - gid = pq_getmsgstring(&input_message); - pq_getmsgend(&input_message); - remotePrepareGID = MemoryContextStrdup(TopMemoryContext, gid); - elog(DEBUG8, "receive remote prepare gid %s", remotePrepareGID); - } - break; - case 'W': /* Prefinish phase */ - timestamp = (GlobalTimestamp) pq_getmsgint64(&input_message); - pq_getmsgend(&input_message); - elog(DEBUG8, "get prefinish timestamp " INT64_FORMAT "for gid %s", timestamp, remotePrepareGID); - SetGlobalPrepareTimestamp(timestamp); - EndExplicitGlobalPrepare(remotePrepareGID); - pfree(remotePrepareGID); - remotePrepareGID = NULL; - ReadyForCommit(whereToSendOutput); - - break; + case 'T': /* global timestamp */ + timestamp = (GlobalTimestamp) pq_getmsgint64(&input_message); + pq_getmsgend(&input_message); + + /* + * Set Xact global commit timestamp + */ + if(enable_distri_print) + { + elog(LOG, "set global commit gts " INT64_FORMAT, timestamp); + } + SetGlobalCommitTimestamp(timestamp); + break; + + case 'G': /* Explicit prepared gid */ + { + const char *gid; + gid = pq_getmsgstring(&input_message); + pq_getmsgend(&input_message); + remotePrepareGID = MemoryContextStrdup(TopMemoryContext, gid); + elog(DEBUG8, "receive remote prepare gid %s", remotePrepareGID); + } + break; + + case 'W': /* Prefinish phase */ + timestamp = (GlobalTimestamp) pq_getmsgint64(&input_message); + pq_getmsgend(&input_message); + elog(DEBUG8, "get prefinish timestamp " INT64_FORMAT "for gid %s", timestamp, remotePrepareGID); + SetGlobalPrepareTimestamp(timestamp); + EndExplicitGlobalPrepare(remotePrepareGID); + pfree(remotePrepareGID); + remotePrepareGID = NULL; + ReadyForCommit(whereToSendOutput); + + break; #endif case 't': /* timestamp */ diff --git a/src/backend/utils/cache/syscache.c b/src/backend/utils/cache/syscache.c index 4c12eb81..dbe9b893 100644 --- a/src/backend/utils/cache/syscache.c +++ b/src/backend/utils/cache/syscache.c @@ -1948,4 +1948,4 @@ void GetSysCacheInfo(int32 cacheid, *nkeys = cacheinfo[cacheid].nkeys; } } -#endif \ No newline at 
end of file +#endif diff --git a/src/backend/utils/time/tqual.c b/src/backend/utils/time/tqual.c index c95b3fa4..505028eb 100644 --- a/src/backend/utils/time/tqual.c +++ b/src/backend/utils/time/tqual.c @@ -86,18 +86,70 @@ SnapshotData SnapshotSelfData = {HeapTupleSatisfiesSelf}; SnapshotData SnapshotAnyData = {HeapTupleSatisfiesAny}; #ifdef __SUPPORT_DISTRIBUTED_TRANSACTION__ -static bool XidInMVCCSnapshotDistri(HeapTupleHeader tuple, TransactionId xid, Snapshot snapshot, Buffer buffer, bool *need_retry, uint16 infomask); -static bool -XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot); -#ifdef __SNAPSHOT_CHECK__ -static bool SnapshotCheck(TransactionId xid, Snapshot snapshot, int target_res, GlobalTimestamp target_committs); +static bool XidInMVCCSnapshotDistri(HeapTupleHeader tuple, TransactionId xid, + Snapshot snapshot, Buffer buffer, + bool *need_retry, uint16 infomask); +static bool XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot); + +/* Debugging.... */ + +#ifdef DIST_TXN_DEBUG +#define DEBUG_MVCC_XMIN(state, msg) \ + if(enable_distri_visibility_print && \ + TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) \ + { \ + elog(LOG, "MVCC ts " INT64_FORMAT " %s xmin %d %s.", \ + state? "true":"false", snapshot->start_ts, \ + HeapTupleHeaderGetRawXmin(tuple), msg); \ + } #else +#define DEBUG_MVCC_XMIN(state, msg) \ + ((void) 0) +#endif -#define SnapshotCheck(xid, snapshot, target_res, target_committs) +#ifdef DIST_TXN_DEBUG +#define DEBUG_MVCC_XMINXMAX(state, xmax, msg) \ + if(enable_distri_visibility_print && \ + TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) \ + { \ + elog(LOG, "MVCC ts " INT64_FORMAT " %s xmin %d xmax %d %s.", \ + state? "true":"false", snapshot->start_ts, \ + HeapTupleHeaderGetRawXmin(tuple), xmax, msg); \ + } +#else +#define DEBUG_MVCC_XMINXMAX(state, xmax, msg) \ + ((void) 0) +#endif +#ifdef DIST_TXN_DEBUG +#define DEBUG_SNAPSHOT(A) \ + do { \ + int _debug_snapshot_save_errno = errno; \ + if (enable_distri_visibility_print) \ + { \ + A; \ + } \ + errno = _debug_snapshot_save_errno; \ + } while (0) +#else +#define DEBUG_SNAPSHOT(A) \ + ((void) 0) #endif +#define DEBUG_INCREASE_VISIBLE_TUPLE \ + if(enable_distri_debug) \ + { \ + snapshot->number_visible_tuples++; \ + } + +#ifdef __SNAPSHOT_CHECK__ +static bool SnapshotCheck(TransactionId xid, Snapshot snapshot, int target_res, GlobalTimestamp target_committs); +#else +#define SnapshotCheck(xid, snapshot, target_res, target_committs) #endif + +#endif // __SUPPORT_DISTRIBUTED_TRANSACTION__ + /* #ifdef _MIGRATE_ SnapshotData SnapshotNowData = {HeapTupleSatisfiesNow}; @@ -1089,196 +1141,186 @@ HeapTupleSatisfiesDirty(HeapTuple htup, Snapshot snapshot, #ifdef __SUPPORT_DISTRIBUTED_TRANSACTION__ static bool -XminInMVCCSnapshotByTimestamp(HeapTupleHeader tuple, Snapshot snapshot, Buffer buffer, - bool *need_retry) -{// #lizard forgives - - - GlobalTimestamp global_committs; - TransactionId xid = HeapTupleHeaderGetRawXmin(tuple); - bool res; - - global_committs = HeapTupleHderGetXminTimestapAtomic(tuple); - - if(!GlobalTimestampIsValid(global_committs)) - { - elog(DEBUG12, "invalid time xmin snapshot ts " INT64_FORMAT " xid %d.", snapshot->start_ts, xid); - return XidInMVCCSnapshotDistri(tuple, xid, snapshot, buffer, need_retry, HEAP_XMIN_COMMITTED); - } - else if (snapshot->local || CommitTimestampIsLocal(global_committs)) - { - res = XidInMVCCSnapshot(xid, snapshot); - SnapshotCheck(xid, snapshot, res, 0); - if(enable_distri_visibility_print) - { - elog(DEBUG12, "xmin local snapshot ts " 
INT64_FORMAT " res %d xid %d committs " INT64_FORMAT, - snapshot->start_ts, res, xid, global_committs); - } - return res; - } - else - { - if(enable_distri_debug) - { - snapshot->scanned_tuples_after_committed++; - } - - *need_retry = false; - if(!GlobalTimestampIsValid(snapshot->start_ts)) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("transaction %d does not have valid timestamp. snapshot start ts " - INT64_FORMAT ", autovacuum %d in recovery %d", - xid, snapshot->start_ts, IsAutoVacuumWorkerProcess(), snapshot->takenDuringRecovery))); - } - elog(DEBUG12, "outer xmin snapshot ts " INT64_FORMAT " global committs " INT64_FORMAT " xid %d.", - snapshot->start_ts, global_committs, xid); - - if(enable_distri_visibility_print) - { - if(!TransactionIdDidCommit(xid)) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("xmin transaction %d should commit but not. snapshot start ts " INT64_FORMAT - " commit %d abort %d in-progress %d active %d recentxmin %d" - " start ts " INT64_FORMAT " committs " INT64_FORMAT, - xid, - snapshot->start_ts, - TransactionIdDidCommit(xid), - TransactionIdDidAbort(xid), - TransactionIdIsInProgress(xid), - TransactionIdIsActive(xid), - RecentXmin, - snapshot->start_ts, - global_committs))); - } - } - - - if(snapshot->start_ts > global_committs) - { - if(enable_distri_visibility_print) - { - elog(LOG, "snapshot ts " INT64_FORMAT " false xid %d committs "INT64_FORMAT" 21.", - snapshot->start_ts, xid, global_committs); - } - SnapshotCheck(xid, snapshot, false, global_committs); - return false; - } - else - { - if(enable_distri_visibility_print) - { - elog(LOG, "snapshot ts " INT64_FORMAT " true xid %d committs "INT64_FORMAT" 22.", - snapshot->start_ts, xid, global_committs); - } - SnapshotCheck(xid, snapshot, true, global_committs); - return true; - } - } - - +XminInMVCCSnapshotByTimestamp(HeapTupleHeader tuple, Snapshot snapshot, + Buffer buffer, bool *need_retry) +{ + GlobalTimestamp global_committs; + TransactionId xid = HeapTupleHeaderGetRawXmin(tuple); + bool res; + + global_committs = HeapTupleHderGetXminTimestapAtomic(tuple); + + if(!GlobalTimestampIsValid(global_committs)) + { + DEBUG_SNAPSHOT(elog(LOG, "invalid time xmin snapshot ts " INT64_FORMAT + " xid %d.", snapshot->start_ts, xid)); + return XidInMVCCSnapshotDistri(tuple, xid, snapshot, buffer, + need_retry, HEAP_XMIN_COMMITTED); + } + else if (snapshot->local || CommitTimestampIsLocal(global_committs)) + { + res = XidInMVCCSnapshot(xid, snapshot); + SnapshotCheck(xid, snapshot, res, 0); + + DEBUG_SNAPSHOT(elog(LOG, "xmin local snapshot ts " INT64_FORMAT + " res %d xid %d committs " INT64_FORMAT, snapshot->start_ts, + res, xid, global_committs)); + return res; + } + else + { + if(enable_distri_debug) + { + snapshot->scanned_tuples_after_committed++; + } + + *need_retry = false; + if(!GlobalTimestampIsValid(snapshot->start_ts)) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("transaction %d does not have valid timestamp. " + "snapshot start ts " INT64_FORMAT ", autovacuum %d" + " in recovery %d", + xid, snapshot->start_ts, + IsAutoVacuumWorkerProcess(), + snapshot->takenDuringRecovery))); + } + + DEBUG_SNAPSHOT(elog(LOG, "outer xmin snapshot ts " INT64_FORMAT " global" + " committs " INT64_FORMAT " xid %d.", snapshot->start_ts, + global_committs, xid)); + DEBUG_SNAPSHOT( + if(!TransactionIdDidCommit(xid)) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("xmin transaction %d should commit but not. 
" + "snapshot start ts " INT64_FORMAT " commit %d " + "abort %d in-progress %d active %d recentxmin %d " + "start ts " INT64_FORMAT " committs " INT64_FORMAT, + xid, + snapshot->start_ts, + TransactionIdDidCommit(xid), + TransactionIdDidAbort(xid), + TransactionIdIsInProgress(xid), + TransactionIdIsActive(xid), + RecentXmin, + snapshot->start_ts, + global_committs))); + }); + + if(snapshot->start_ts > global_committs) + { + SnapshotCheck(xid, snapshot, false, global_committs); + + DEBUG_SNAPSHOT(elog(LOG, "snapshot ts " INT64_FORMAT " false xid %d " + "committs " INT64_FORMAT " 21.", snapshot->start_ts, xid, + global_committs)); + return false; + } + else + { + SnapshotCheck(xid, snapshot, true, global_committs); + + DEBUG_SNAPSHOT(elog(LOG, "snapshot ts " INT64_FORMAT " true xid %d" + " committs " INT64_FORMAT " 22.", snapshot->start_ts, xid, + global_committs)); + return true; + } + } } - static bool -XmaxInMVCCSnapshotByTimestamp(HeapTupleHeader tuple, Snapshot snapshot, Buffer buffer, - bool *need_retry) -{// #lizard forgives - - - GlobalTimestamp global_committs; - TransactionId xid = HeapTupleHeaderGetRawXmax(tuple); - bool res; - - global_committs = HeapTupleHderGetXmaxTimestapAtomic(tuple); - - if(!GlobalTimestampIsValid(global_committs)) - { - elog(DEBUG12, "invalid time xmax snapshot ts " INT64_FORMAT " xid %d.", snapshot->start_ts, xid); - return XidInMVCCSnapshotDistri(tuple, xid, snapshot, buffer, need_retry, HEAP_XMAX_COMMITTED); - } - else if (snapshot->local || CommitTimestampIsLocal(global_committs)) - { - res = XidInMVCCSnapshot(xid, snapshot); - SnapshotCheck(xid, snapshot, res, 0); - if(enable_distri_visibility_print) - { - elog(DEBUG12, "xmax local snapshot ts " INT64_FORMAT " res %d xid %d.", snapshot->start_ts, res, xid); - } - return res; - } - else - { - if(enable_distri_debug) - { - snapshot->scanned_tuples_after_committed++; - } - - *need_retry = false; - if(!GlobalTimestampIsValid(snapshot->start_ts)) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("transaction %d does not have valid timestamp. snapshot start ts " - INT64_FORMAT ", autovacuum %d in recovery %d", - xid, snapshot->start_ts, IsAutoVacuumWorkerProcess(), snapshot->takenDuringRecovery))); - } - elog(DEBUG12, "outer xmax snapshot ts " INT64_FORMAT " global committs " INT64_FORMAT " xid %d.", - snapshot->start_ts, global_committs, xid); - - - if(enable_distri_visibility_print) - { - if(!TransactionIdDidCommit(xid)) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("xmax transaction %d should commit but not. 
snapshot start ts " INT64_FORMAT - " commit %d abort %d in-progress %d active %d recentxmin %d" - " start ts " INT64_FORMAT " committs " INT64_FORMAT, - xid, - snapshot->start_ts, - TransactionIdDidCommit(xid), - TransactionIdDidAbort(xid), - TransactionIdIsInProgress(xid), - TransactionIdIsActive(xid), - RecentXmin, - snapshot->start_ts, - global_committs))); - } - } - - - - if(snapshot->start_ts > global_committs) - { - if(enable_distri_visibility_print) - { - elog(LOG, "snapshot ts " INT64_FORMAT " false xid %d committs "INT64_FORMAT" 11.", - snapshot->start_ts, xid, global_committs); - } - SnapshotCheck(xid, snapshot, false, global_committs); - return false; - } - else - { - if(enable_distri_visibility_print) - { - elog(LOG, "snapshot ts " INT64_FORMAT " true xid %d committs "INT64_FORMAT" 12.", - snapshot->start_ts, xid, global_committs); - } - SnapshotCheck(xid, snapshot, true, global_committs); - return true; - } - } - - +XmaxInMVCCSnapshotByTimestamp(HeapTupleHeader tuple, Snapshot snapshot, + Buffer buffer, bool *need_retry) +{ + GlobalTimestamp global_committs; + TransactionId xid = HeapTupleHeaderGetRawXmax(tuple); + bool res; + + global_committs = HeapTupleHderGetXmaxTimestapAtomic(tuple); + + if(!GlobalTimestampIsValid(global_committs)) + { + DEBUG_SNAPSHOT(elog(LOG, "invalid time xmax snapshot ts " INT64_FORMAT + " xid %d.", snapshot->start_ts, xid)); + return XidInMVCCSnapshotDistri(tuple, xid, snapshot, buffer, + need_retry, HEAP_XMAX_COMMITTED); + } + else if (snapshot->local || CommitTimestampIsLocal(global_committs)) + { + res = XidInMVCCSnapshot(xid, snapshot); + SnapshotCheck(xid, snapshot, res, 0); + + DEBUG_SNAPSHOT(elog(LOG, "xmax local snapshot ts " INT64_FORMAT " res " + "%d xid %d.", snapshot->start_ts, res, xid)); + return res; + } + else + { + if(enable_distri_debug) + { + snapshot->scanned_tuples_after_committed++; + } + + *need_retry = false; + if(!GlobalTimestampIsValid(snapshot->start_ts)) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("transaction %d does not have valid timestamp. " + "snapshot start ts " INT64_FORMAT ", autovacuum %d " + "in recovery %d", + xid, snapshot->start_ts, IsAutoVacuumWorkerProcess(), + snapshot->takenDuringRecovery))); + } + + DEBUG_SNAPSHOT(elog(LOG, "outer xmax snapshot ts " INT64_FORMAT "global" + " committs " INT64_FORMAT "xid %d.", snapshot->start_ts, + global_committs, xid)); + DEBUG_SNAPSHOT( + if(!TransactionIdDidCommit(xid)) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("xmax transaction %d should commit but not. 
" + "snapshot start ts " INT64_FORMAT " commit %d " + "abort %d in-progress %d active %d recentxmin %d " + "start ts " INT64_FORMAT " committs " INT64_FORMAT, + xid, + snapshot->start_ts, + TransactionIdDidCommit(xid), + TransactionIdDidAbort(xid), + TransactionIdIsInProgress(xid), + TransactionIdIsActive(xid), + RecentXmin, + snapshot->start_ts, + global_committs))); + } + ); + + if(snapshot->start_ts > global_committs) + { + SnapshotCheck(xid, snapshot, false, global_committs); + + DEBUG_SNAPSHOT(elog(LOG, "snapshot ts " INT64_FORMAT "false xid %d" + " committs" INT64_FORMAT "11.", snapshot->start_ts, xid, + global_committs)); + return false; + } + else + { + SnapshotCheck(xid, snapshot, true, global_committs); + + DEBUG_SNAPSHOT(elog(LOG, "snapshot ts " INT64_FORMAT "true xid %d " + "committs " INT64_FORMAT "12.", snapshot->start_ts, xid, + global_committs)); + return true; + } + } } - /* * HeapTupleSatisfiesMVCC * True iff heap tuple is valid for the given MVCC snapshot. @@ -1312,976 +1354,651 @@ XmaxInMVCCSnapshotByTimestamp(HeapTupleHeader tuple, Snapshot snapshot, Buffer b bool HeapTupleSatisfiesMVCC(HeapTuple htup, Snapshot snapshot, Buffer buffer) -{// #lizard forgives - HeapTupleHeader tuple = htup->t_data; - bool need_retry; - -retry: - need_retry = false; - Assert(ItemPointerIsValid(&htup->t_self)); - Assert(htup->t_tableOid != InvalidOid); - -#ifdef _MIGRATE_ - if(IS_PGXC_DATANODE && ShardIDIsValid(tuple->t_shardid) && SnapshotGetShardTable(snapshot)) - { - bool shard_is_visible = bms_is_member(tuple->t_shardid/snapshot->groupsize, - SnapshotGetShardTable(snapshot)); - - if(!IsConnFromApp()) - { - if(!shard_is_visible) - return false; - } - else if(g_ShardVisibleMode != SHARD_VISIBLE_MODE_ALL) - { - if((!shard_is_visible && g_ShardVisibleMode == SHARD_VISIBLE_MODE_VISIBLE) - || (shard_is_visible && g_ShardVisibleMode == SHARD_VISIBLE_MODE_HIDDEN)) - { - return false; - } - } - } -#endif - if (!HeapTupleHeaderXminCommitted(tuple)) - { - if (HeapTupleHeaderXminInvalid(tuple)) - { - //elog(DEBUG11, "heap invalid xmin"); - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " false xmin %d xmin invalid.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - return false; - } - - /* Used by pre-9.0 binary upgrades */ - if (tuple->t_infomask & HEAP_MOVED_OFF) - { - TransactionId xvac = HeapTupleHeaderGetXvac(tuple); - - if (TransactionIdIsCurrentTransactionId(xvac)) - { - //elog(DEBUG11, "heap moved off current transaction"); - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " false xmin %d move off.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - return false; - } - if (!XidInMVCCSnapshotDistri(tuple, xvac, snapshot, buffer, &need_retry, HEAP_XMIN_INVALID)) - { - if (TransactionIdDidCommit(xvac)) - { - SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, - InvalidTransactionId); - //elog(DEBUG11, "heap moved off"); - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " false xmin %d move off 1.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - return false; - } - SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, - InvalidTransactionId); - } - if(need_retry) - { - goto retry; - } - - } - /* Used by pre-9.0 binary upgrades */ - else if (tuple->t_infomask & HEAP_MOVED_IN) - { - TransactionId xvac = 
HeapTupleHeaderGetXvac(tuple); - - if (!TransactionIdIsCurrentTransactionId(xvac)) - { - if (XidInMVCCSnapshotDistri(tuple, xvac, snapshot, buffer, &need_retry, HEAP_XMIN_INVALID)) - { - //elog(DEBUG11, "heap moved in in snapshot"); - if(need_retry) - { - goto retry; - } - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " false xmin %d move in.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - return false; - } - if (TransactionIdDidCommit(xvac)) - SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, - InvalidTransactionId); - else - { - SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, - InvalidTransactionId); - //elog(DEBUG11, "heap moved in"); - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " false xmin %d move in.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - return false; - } - } - } - else if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmin(tuple))) - { - - if (HeapTupleHeaderGetCmin(tuple) >= snapshot->curcid) - { - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " false xmin %d current 1.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - return false; /* inserted after scan started */ - } - if (tuple->t_infomask & HEAP_XMAX_INVALID) /* xid invalid */ - { - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " true xmin %d current 2.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - if(enable_distri_debug) - { - snapshot->number_visible_tuples++; - } - return true; - } - if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) /* not deleter */ - { - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " true xmin %d current 3.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - if(enable_distri_debug) - { - snapshot->number_visible_tuples++; - } - return true; - } - if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) - { - TransactionId xmax; - - xmax = HeapTupleGetUpdateXid(tuple); - - /* not LOCKED_ONLY, so it has to have an xmax */ - Assert(TransactionIdIsValid(xmax)); - - /* updating subtransaction must have aborted */ - if (!TransactionIdIsCurrentTransactionId(xmax)) - { - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " true xmin %d current 3.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - if(enable_distri_debug) - { - snapshot->number_visible_tuples++; - } - return true; - } - else if (HeapTupleHeaderGetCmax(tuple) >= snapshot->curcid) - { - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " true xmin %d current 4.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - if(enable_distri_debug) - { - snapshot->number_visible_tuples++; - } - return true; /* updated after scan started */ - } - else - { - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " false xmin %d current 5.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - return false; /* updated before scan started */ - } - } - - if 
(!TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmax(tuple))) - { - /* deleting subtransaction must have aborted */ - SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, - InvalidTransactionId); - //elog(DEBUG11, "heap deleting subtransaction"); - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " true xmin %d current 6.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - if(enable_distri_debug) - { - snapshot->number_visible_tuples++; - } - return true; - } - - if (HeapTupleHeaderGetCmax(tuple) >= snapshot->curcid) - { - //elog(DEBUG11, "heap xmin deleted after scan"); - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " true xmin %d current 7.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - if(enable_distri_debug) - { - snapshot->number_visible_tuples++; - } - return true; /* deleted after scan started */ - } - else - { - //elog(DEBUG11, "heap xmin deleted before scan"); - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " false xmin %d current 8.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - return false; /* deleted before scan started */ - } - } - else if (XminInMVCCSnapshotByTimestamp(tuple, snapshot, buffer, &need_retry)) - { - //elog(DEBUG11, "heap xmin in snapshot"); - if(need_retry) - { - goto retry; - } - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " false xmin %d.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - return false; - } - else if (TransactionIdDidCommit(HeapTupleHeaderGetRawXmin(tuple))) - SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, - HeapTupleHeaderGetRawXmin(tuple)); - else - { - /* it must have aborted or crashed */ - SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, - InvalidTransactionId); - //elog(DEBUG11, "heap xmin aborted"); - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " false xmin %d xmin abort.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - return false; - } - } - else - { - /* xmin is committed, but maybe not according to our snapshot */ - if (!HeapTupleHeaderXminFrozen(tuple) && - XminInMVCCSnapshotByTimestamp(tuple, snapshot, buffer, &need_retry)) - { - if(need_retry) - { - goto retry; - } - //elog(DEBUG11, "heap xmin not committed according to snapshot"); - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " false xmin %d according to snapshot.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - return false; /* treat as still in progress */ - } - } - - /* by here, the inserting transaction has committed */ - - if (tuple->t_infomask & HEAP_XMAX_INVALID) /* xid invalid or aborted */ - { - //elog(DEBUG11, "heap invalid xmax"); - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " true xmin %d.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - if(enable_distri_debug) - { - snapshot->number_visible_tuples++; - } - return true; - } - if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)){ - //elog(DEBUG11, "heap xmax locked"); - if(enable_distri_visibility_print && 
TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " true xmin %d.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - if(enable_distri_debug) - { - snapshot->number_visible_tuples++; - } - return true; - } - - if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) - { - TransactionId xmax; - - /* already checked above */ - Assert(!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)); - - xmax = HeapTupleGetUpdateXid(tuple); - - /* not LOCKED_ONLY, so it has to have an xmax */ - Assert(TransactionIdIsValid(xmax)); - - if (TransactionIdIsCurrentTransactionId(xmax)) - { - if (HeapTupleHeaderGetCmax(tuple) >= snapshot->curcid) - { - //elog(DEBUG11, "heap multi xmax deleted after scan"); - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " true xmin %d.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - if(enable_distri_debug) - { - snapshot->number_visible_tuples++; - } - return true; /* deleted after scan started */ - } - else - { - //elog(DEBUG11, "heap multi xmax deleted before scan"); - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " false xmin %d xmax %d deleted after scan.", - snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple), xmax); - } - return false; /* deleted before scan started */ - } - } - if (XidInMVCCSnapshotDistri(tuple, xmax, snapshot, buffer, &need_retry, HEAP_XMAX_INVALID)) - { - if(need_retry) - { - goto retry; - } - //elog(DEBUG11, "heap multi xmax in snapshot"); - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " true xmin %d.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - if(enable_distri_debug) - { - snapshot->number_visible_tuples++; - } - return true; - } - if (TransactionIdDidCommit(xmax)) - { - //elog(DEBUG11, "heap multi xmax committed"); - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " false xmin %d xmax %d committed .", - snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple), xmax); - } - return false; /* updating transaction committed */ - } - /* it must have aborted or crashed */ - //elog(DEBUG11, "heap multi xmax aborted"); - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " true xmin %d.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - if(enable_distri_debug) - { - snapshot->number_visible_tuples++; - } - return true; - } - - if (!(tuple->t_infomask & HEAP_XMAX_COMMITTED)) - { - if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmax(tuple))) - { - if (HeapTupleHeaderGetCmax(tuple) >= snapshot->curcid) - { - //elog(DEBUG11, "heap xmax deleted after scan"); - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " true xmin %d.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - if(enable_distri_debug) - { - snapshot->number_visible_tuples++; - } - return true; /* deleted after scan started */ - } - else - { - //elog(DEBUG11, "heap xmax deleted before scan"); - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " false xmin %d xmax deleted before scan.", - 
snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - return false; /* deleted before scan started */ - } - } - - if (XmaxInMVCCSnapshotByTimestamp(tuple, snapshot, buffer, &need_retry)) - { - if(need_retry) - { - goto retry; - } - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " true xmin %d.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - if(enable_distri_debug) - { - snapshot->number_visible_tuples++; - } - return true; - } - if (!TransactionIdDidCommit(HeapTupleHeaderGetRawXmax(tuple))) - { - /* it must have aborted or crashed */ - - SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, - InvalidTransactionId); - //elog(DEBUG11, "heap xmax aborted"); - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " true xmin %d.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - if(enable_distri_debug) - { - snapshot->number_visible_tuples++; - } - return true; - } - - /* xmax transaction committed */ - SetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED, - HeapTupleHeaderGetRawXmax(tuple)); - } - else - { - /* xmax is committed, but maybe not according to our snapshot */ - if (XmaxInMVCCSnapshotByTimestamp(tuple, snapshot, buffer, &need_retry)) - { - if(need_retry) - { - goto retry; - } - //elog(DEBUG11, "heap xmax not committed"); - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " true xmin %d.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - if(enable_distri_debug) - { - snapshot->number_visible_tuples++; - } - return true; /* treat as still in progress */ - } - } - - /* xmax transaction committed */ - //elog(DEBUG11, "heap xmax committed"); - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " false xmin %d xmax %d committed last.", - snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple), HeapTupleHeaderGetRawXmax(tuple)); - } - return false; -} - -#ifdef __STORAGE_SCALABLE__ -bool -HeapTupleSatisfiesUnshard(HeapTuple htup, Snapshot snapshot, Buffer buffer) -{// #lizard forgives - HeapTupleHeader tuple = htup->t_data; - bool need_retry; - -retry: - need_retry = false; - Assert(ItemPointerIsValid(&htup->t_self)); - Assert(htup->t_tableOid != InvalidOid); - - if(IS_PGXC_DATANODE && tuple->t_shardid < 0) - return false; - - if(IS_PGXC_DATANODE && tuple->t_shardid >= 0) - { - if(g_DatanodeShardgroupBitmap == NULL) - { - elog(ERROR, "shard map in share memory has not been initialized yet."); - } - LWLockAcquire(ShardMapLock, LW_SHARED); - if(bms_is_member(tuple->t_shardid, g_DatanodeShardgroupBitmap)) - { - LWLockRelease(ShardMapLock); - return false; - } - LWLockRelease(ShardMapLock); - } - - if (!HeapTupleHeaderXminCommitted(tuple)) - { - if (HeapTupleHeaderXminInvalid(tuple)) - { - //elog(DEBUG11, "heap invalid xmin"); - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " false xmin %d xmin invalid.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - return false; - } - - /* Used by pre-9.0 binary upgrades */ - if (tuple->t_infomask & HEAP_MOVED_OFF) - { - TransactionId xvac = HeapTupleHeaderGetXvac(tuple); - - if (TransactionIdIsCurrentTransactionId(xvac)) - { - //elog(DEBUG11, "heap moved off current transaction"); - 
if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " false xmin %d move off.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - return false; - } - if (!XidInMVCCSnapshotDistri(tuple, xvac, snapshot, buffer, &need_retry, HEAP_XMIN_INVALID)) - { - if (TransactionIdDidCommit(xvac)) - { - SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, - InvalidTransactionId); - //elog(DEBUG11, "heap moved off"); - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " false xmin %d move off 1.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - return false; - } - SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, - InvalidTransactionId); - } - if(need_retry) - { - goto retry; - } - - } - /* Used by pre-9.0 binary upgrades */ - else if (tuple->t_infomask & HEAP_MOVED_IN) - { - TransactionId xvac = HeapTupleHeaderGetXvac(tuple); - - if (!TransactionIdIsCurrentTransactionId(xvac)) - { - if (XidInMVCCSnapshotDistri(tuple, xvac, snapshot, buffer, &need_retry, HEAP_XMIN_INVALID)) - { - //elog(DEBUG11, "heap moved in in snapshot"); - if(need_retry) - { - goto retry; - } - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " false xmin %d move in.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - return false; - } - if (TransactionIdDidCommit(xvac)) - SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, - InvalidTransactionId); - else - { - SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, - InvalidTransactionId); - //elog(DEBUG11, "heap moved in"); - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " false xmin %d move in.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - return false; - } - } - } - else if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmin(tuple))) - { - - if (HeapTupleHeaderGetCmin(tuple) >= snapshot->curcid) - { - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " false xmin %d current 1.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - return false; /* inserted after scan started */ - } - if (tuple->t_infomask & HEAP_XMAX_INVALID) /* xid invalid */ - { - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " true xmin %d current 2.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - if(enable_distri_debug) - { - snapshot->number_visible_tuples++; - } - return true; - } - if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) /* not deleter */ - { - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " true xmin %d current 3.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - if(enable_distri_debug) - { - snapshot->number_visible_tuples++; - } - return true; - } - if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) - { - TransactionId xmax; - - xmax = HeapTupleGetUpdateXid(tuple); - - /* not LOCKED_ONLY, so it has to have an xmax */ - Assert(TransactionIdIsValid(xmax)); - - /* updating subtransaction must have aborted */ - if (!TransactionIdIsCurrentTransactionId(xmax)) - { - if(enable_distri_visibility_print && 
TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " true xmin %d current 3.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - if(enable_distri_debug) - { - snapshot->number_visible_tuples++; - } - return true; - } - else if (HeapTupleHeaderGetCmax(tuple) >= snapshot->curcid) - { - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " true xmin %d current 4.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - if(enable_distri_debug) - { - snapshot->number_visible_tuples++; - } - return true; /* updated after scan started */ - } - else - { - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " false xmin %d current 5.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - return false; /* updated before scan started */ - } - } - - if (!TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmax(tuple))) - { - /* deleting subtransaction must have aborted */ - SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, - InvalidTransactionId); - //elog(DEBUG11, "heap deleting subtransaction"); - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " true xmin %d current 6.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - if(enable_distri_debug) - { - snapshot->number_visible_tuples++; - } - return true; - } - - if (HeapTupleHeaderGetCmax(tuple) >= snapshot->curcid) - { - //elog(DEBUG11, "heap xmin deleted after scan"); - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " true xmin %d current 7.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - if(enable_distri_debug) - { - snapshot->number_visible_tuples++; - } - return true; /* deleted after scan started */ - } - else - { - //elog(DEBUG11, "heap xmin deleted before scan"); - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " false xmin %d current 8.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - return false; /* deleted before scan started */ - } - } - else if (XminInMVCCSnapshotByTimestamp(tuple, snapshot, buffer, &need_retry)) - { - //elog(DEBUG11, "heap xmin in snapshot"); - if(need_retry) - { - goto retry; - } - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " false xmin %d.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - return false; - } - else if (TransactionIdDidCommit(HeapTupleHeaderGetRawXmin(tuple))) - SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, - HeapTupleHeaderGetRawXmin(tuple)); - else - { - /* it must have aborted or crashed */ - SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, - InvalidTransactionId); - //elog(DEBUG11, "heap xmin aborted"); - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " false xmin %d xmin abort.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - return false; - } - } - else - { - /* xmin is committed, but maybe not according to our snapshot */ - if (!HeapTupleHeaderXminFrozen(tuple) && - XminInMVCCSnapshotByTimestamp(tuple, snapshot, buffer, &need_retry)) - { - if(need_retry) - { - goto 
retry; - } - //elog(DEBUG11, "heap xmin not committed according to snapshot"); - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " false xmin %d according to snapshot.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - return false; /* treat as still in progress */ - } - } - - /* by here, the inserting transaction has committed */ - - if (tuple->t_infomask & HEAP_XMAX_INVALID) /* xid invalid or aborted */ - { - //elog(DEBUG11, "heap invalid xmax"); - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " true xmin %d.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - if(enable_distri_debug) - { - snapshot->number_visible_tuples++; - } - return true; - } - if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)){ - //elog(DEBUG11, "heap xmax locked"); - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " true xmin %d.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - if(enable_distri_debug) - { - snapshot->number_visible_tuples++; - } - return true; - } - - if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) - { - TransactionId xmax; - - /* already checked above */ - Assert(!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)); +{// #lizard forgives + HeapTupleHeader tuple = htup->t_data; + bool need_retry; - xmax = HeapTupleGetUpdateXid(tuple); +retry: + need_retry = false; + Assert(ItemPointerIsValid(&htup->t_self)); + Assert(htup->t_tableOid != InvalidOid); - /* not LOCKED_ONLY, so it has to have an xmax */ - Assert(TransactionIdIsValid(xmax)); +#ifdef _MIGRATE_ + if(IS_PGXC_DATANODE && ShardIDIsValid(tuple->t_shardid) && SnapshotGetShardTable(snapshot)) + { + bool shard_is_visible = bms_is_member(tuple->t_shardid/snapshot->groupsize, + SnapshotGetShardTable(snapshot)); - if (TransactionIdIsCurrentTransactionId(xmax)) - { - if (HeapTupleHeaderGetCmax(tuple) >= snapshot->curcid) - { - //elog(DEBUG11, "heap multi xmax deleted after scan"); - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " true xmin %d.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - if(enable_distri_debug) - { - snapshot->number_visible_tuples++; - } - return true; /* deleted after scan started */ - } - else - { - //elog(DEBUG11, "heap multi xmax deleted before scan"); - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " false xmin %d xmax %d deleted after scan.", - snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple), xmax); - } - return false; /* deleted before scan started */ - } - } - if (XidInMVCCSnapshotDistri(tuple, xmax, snapshot, buffer, &need_retry, HEAP_XMAX_INVALID)) + if(!IsConnFromApp()) { - if(need_retry) - { - goto retry; - } - //elog(DEBUG11, "heap multi xmax in snapshot"); - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " true xmin %d.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - if(enable_distri_debug) - { - snapshot->number_visible_tuples++; - } - return true; + if(!shard_is_visible) + return false; } - if (TransactionIdDidCommit(xmax)) + else if(g_ShardVisibleMode != SHARD_VISIBLE_MODE_ALL) { - //elog(DEBUG11, "heap multi xmax committed"); - 
if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) + if((!shard_is_visible && g_ShardVisibleMode == SHARD_VISIBLE_MODE_VISIBLE) + || (shard_is_visible && g_ShardVisibleMode == SHARD_VISIBLE_MODE_HIDDEN)) { - elog(LOG, "MVCC ts " INT64_FORMAT " false xmin %d xmax %d committed .", - snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple), xmax); + return false; } - return false; /* updating transaction committed */ } - /* it must have aborted or crashed */ - //elog(DEBUG11, "heap multi xmax aborted"); - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " true xmin %d.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - if(enable_distri_debug) - { - snapshot->number_visible_tuples++; - } - return true; } +#endif + if (!HeapTupleHeaderXminCommitted(tuple)) + { + if (HeapTupleHeaderXminInvalid(tuple)) + { + DEBUG_MVCC_XMIN(false, "xmin invalid"); + return false; + } + + /* Used by pre-9.0 binary upgrades */ + if (tuple->t_infomask & HEAP_MOVED_OFF) + { + TransactionId xvac = HeapTupleHeaderGetXvac(tuple); + + if (TransactionIdIsCurrentTransactionId(xvac)) + { + DEBUG_MVCC_XMIN(false, "move off"); + return false; + } + if (!XidInMVCCSnapshotDistri(tuple, xvac, snapshot, buffer, &need_retry, HEAP_XMIN_INVALID)) + { + if (TransactionIdDidCommit(xvac)) + { + SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, + InvalidTransactionId); + DEBUG_MVCC_XMIN(false, "move off 1"); + return false; + } + SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, + InvalidTransactionId); + } + if(need_retry) + { + goto retry; + } + + } + /* Used by pre-9.0 binary upgrades */ + else if (tuple->t_infomask & HEAP_MOVED_IN) + { + TransactionId xvac = HeapTupleHeaderGetXvac(tuple); + + if (!TransactionIdIsCurrentTransactionId(xvac)) + { + if (XidInMVCCSnapshotDistri(tuple, xvac, snapshot, buffer, &need_retry, HEAP_XMIN_INVALID)) + { + if(need_retry) + { + goto retry; + } + DEBUG_MVCC_XMIN(false, " move in"); + return false; + } + if (TransactionIdDidCommit(xvac)) + SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, + InvalidTransactionId); + else + { + SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, + InvalidTransactionId); + DEBUG_MVCC_XMIN(false, "move in"); + return false; + } + } + } + else if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmin(tuple))) + { + + if (HeapTupleHeaderGetCmin(tuple) >= snapshot->curcid) + { + DEBUG_MVCC_XMIN(false, "current 1"); + return false; /* inserted after scan started */ + } + if (tuple->t_infomask & HEAP_XMAX_INVALID) /* xid invalid */ + { + DEBUG_MVCC_XMIN(true, "current 2"); + DEBUG_INCREASE_VISIBLE_TUPLE; + return true; + } + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) /* not deleter */ + { + DEBUG_MVCC_XMIN(true, "current 3"); + DEBUG_INCREASE_VISIBLE_TUPLE; + return true; + } + if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) + { + TransactionId xmax; + + xmax = HeapTupleGetUpdateXid(tuple); + + /* not LOCKED_ONLY, so it has to have an xmax */ + Assert(TransactionIdIsValid(xmax)); + + /* updating subtransaction must have aborted */ + if (!TransactionIdIsCurrentTransactionId(xmax)) + { + DEBUG_MVCC_XMIN(true, "current 3"); + DEBUG_INCREASE_VISIBLE_TUPLE; + return true; + } + else if (HeapTupleHeaderGetCmax(tuple) >= snapshot->curcid) + { + DEBUG_MVCC_XMIN(true, "current 4"); + DEBUG_INCREASE_VISIBLE_TUPLE; + return true; /* updated after scan started */ + } + else + { + DEBUG_MVCC_XMIN(false, "current 5"); + return false; /* updated before scan 
started */ + } + } + + if (!TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmax(tuple))) + { + /* deleting subtransaction must have aborted */ + SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, + InvalidTransactionId); + DEBUG_MVCC_XMIN(true, "current 6"); + DEBUG_INCREASE_VISIBLE_TUPLE; + return true; + } + + if (HeapTupleHeaderGetCmax(tuple) >= snapshot->curcid) + { + DEBUG_MVCC_XMIN(true, "current 7"); + DEBUG_INCREASE_VISIBLE_TUPLE; + return true; /* deleted after scan started */ + } + else + { + DEBUG_MVCC_XMIN(false, "current 8"); + return false; /* deleted before scan started */ + } + } + else if (XminInMVCCSnapshotByTimestamp(tuple, snapshot, buffer, &need_retry)) + { + //elog(DEBUG11, "heap xmin in snapshot"); + if(need_retry) + { + goto retry; + } + DEBUG_MVCC_XMIN(false, ""); + return false; + } + else if (TransactionIdDidCommit(HeapTupleHeaderGetRawXmin(tuple))) + SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, + HeapTupleHeaderGetRawXmin(tuple)); + else + { + /* it must have aborted or crashed */ + SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, + InvalidTransactionId); + DEBUG_MVCC_XMIN(false, "xmin abort"); + return false; + } + } + else + { + /* xmin is committed, but maybe not according to our snapshot */ + if (!HeapTupleHeaderXminFrozen(tuple) && + XminInMVCCSnapshotByTimestamp(tuple, snapshot, buffer, &need_retry)) + { + if(need_retry) + { + goto retry; + } + DEBUG_MVCC_XMIN(false, "according to snapshot"); + return false; /* treat as still in progress */ + } + } + + /* by here, the inserting transaction has committed */ + + if (tuple->t_infomask & HEAP_XMAX_INVALID) /* xid invalid or aborted */ + { + DEBUG_MVCC_XMIN(true, ""); + DEBUG_INCREASE_VISIBLE_TUPLE; + return true; + } + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)){ + DEBUG_MVCC_XMIN(true, "xmax locked"); + DEBUG_INCREASE_VISIBLE_TUPLE; + return true; + } + + if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) + { + TransactionId xmax; + + /* already checked above */ + Assert(!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)); + + xmax = HeapTupleGetUpdateXid(tuple); + + /* not LOCKED_ONLY, so it has to have an xmax */ + Assert(TransactionIdIsValid(xmax)); + + if (TransactionIdIsCurrentTransactionId(xmax)) + { + if (HeapTupleHeaderGetCmax(tuple) >= snapshot->curcid) + { + DEBUG_MVCC_XMIN(true, "heap multi xmax deleted after scan"); + DEBUG_INCREASE_VISIBLE_TUPLE + return true; /* deleted after scan started */ + } + else + { + DEBUG_MVCC_XMINXMAX(true, xmax, "deleted after scan"); + return false; /* deleted before scan started */ + } + } + if (XidInMVCCSnapshotDistri(tuple, xmax, snapshot, buffer, &need_retry, HEAP_XMAX_INVALID)) + { + if(need_retry) + { + goto retry; + } + DEBUG_MVCC_XMIN(true, ""); + DEBUG_INCREASE_VISIBLE_TUPLE + return true; + } + if (TransactionIdDidCommit(xmax)) + { + DEBUG_MVCC_XMINXMAX(false, xmax, "committed"); + return false; /* updating transaction committed */ + } + /* it must have aborted or crashed */ + DEBUG_MVCC_XMIN(true, ""); + DEBUG_INCREASE_VISIBLE_TUPLE; + return true; + } + + if (!(tuple->t_infomask & HEAP_XMAX_COMMITTED)) + { + if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmax(tuple))) + { + if (HeapTupleHeaderGetCmax(tuple) >= snapshot->curcid) + { + DEBUG_MVCC_XMIN(true, ""); + DEBUG_INCREASE_VISIBLE_TUPLE; + return true; /* deleted after scan started */ + } + else + { + DEBUG_MVCC_XMIN(false, "xmax deleted before scan"); + return false; /* deleted before scan started */ + } + } + + if (XmaxInMVCCSnapshotByTimestamp(tuple, snapshot, buffer, &need_retry)) + { + 
if(need_retry) + { + goto retry; + } + DEBUG_MVCC_XMIN(true, ""); + DEBUG_INCREASE_VISIBLE_TUPLE; + return true; + } + if (!TransactionIdDidCommit(HeapTupleHeaderGetRawXmax(tuple))) + { + /* it must have aborted or crashed */ + + SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, + InvalidTransactionId); + DEBUG_MVCC_XMIN(true, "heap xmax aborted"); + DEBUG_INCREASE_VISIBLE_TUPLE; + return true; + } + + /* xmax transaction committed */ + SetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED, + HeapTupleHeaderGetRawXmax(tuple)); + } + else + { + /* xmax is committed, but maybe not according to our snapshot */ + if (XmaxInMVCCSnapshotByTimestamp(tuple, snapshot, buffer, &need_retry)) + { + if(need_retry) + { + goto retry; + } + DEBUG_MVCC_XMIN(true, "heap xmax not committed"); + DEBUG_INCREASE_VISIBLE_TUPLE; + return true; /* treat as still in progress */ + } + } + + /* xmax transaction committed */ + DEBUG_MVCC_XMINXMAX(true, HeapTupleHeaderGetRawXmax(tuple), "committed last"); + return false; +} - if (!(tuple->t_infomask & HEAP_XMAX_COMMITTED)) - { - if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmax(tuple))) - { - if (HeapTupleHeaderGetCmax(tuple) >= snapshot->curcid) - { - //elog(DEBUG11, "heap xmax deleted after scan"); - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " true xmin %d.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - if(enable_distri_debug) - { - snapshot->number_visible_tuples++; - } - return true; /* deleted after scan started */ - } - else - { - //elog(DEBUG11, "heap xmax deleted before scan"); - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " false xmin %d xmax deleted before scan.", - snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - return false; /* deleted before scan started */ - } - } +#ifdef __STORAGE_SCALABLE__ +bool +HeapTupleSatisfiesUnshard(HeapTuple htup, Snapshot snapshot, Buffer buffer) +{// #lizard forgives + HeapTupleHeader tuple = htup->t_data; + bool need_retry; + +retry: + need_retry = false; + Assert(ItemPointerIsValid(&htup->t_self)); + Assert(htup->t_tableOid != InvalidOid); + + if(IS_PGXC_DATANODE && tuple->t_shardid < 0) + return false; - if (XmaxInMVCCSnapshotByTimestamp(tuple, snapshot, buffer, &need_retry)) + if(IS_PGXC_DATANODE && tuple->t_shardid >= 0) + { + if(g_DatanodeShardgroupBitmap == NULL) { - if(need_retry) - { - goto retry; - } - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " true xmin %d.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - if(enable_distri_debug) - { - snapshot->number_visible_tuples++; - } - return true; + elog(ERROR, "shard map in share memory has not been initialized yet."); } - if (!TransactionIdDidCommit(HeapTupleHeaderGetRawXmax(tuple))) + LWLockAcquire(ShardMapLock, LW_SHARED); + if(bms_is_member(tuple->t_shardid, g_DatanodeShardgroupBitmap)) { - /* it must have aborted or crashed */ - - SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, - InvalidTransactionId); - //elog(DEBUG11, "heap xmax aborted"); - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " true xmin %d.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - if(enable_distri_debug) - { - snapshot->number_visible_tuples++; - } - return true; - } - - /* xmax 
transaction committed */ - SetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED, - HeapTupleHeaderGetRawXmax(tuple)); - } - else - { - /* xmax is committed, but maybe not according to our snapshot */ - if (XmaxInMVCCSnapshotByTimestamp(tuple, snapshot, buffer, &need_retry)) - { - if(need_retry) - { - goto retry; - } - //elog(DEBUG11, "heap xmax not committed"); - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " true xmin %d.", snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple)); - } - if(enable_distri_debug) - { - snapshot->number_visible_tuples++; - } - return true; /* treat as still in progress */ + LWLockRelease(ShardMapLock); + return false; } - } - - /* xmax transaction committed */ - //elog(DEBUG11, "heap xmax committed"); - if(enable_distri_visibility_print && TransactionIdIsNormal(HeapTupleHeaderGetRawXmin(tuple))) - { - elog(LOG, "MVCC ts " INT64_FORMAT " false xmin %d xmax %d committed last.", - snapshot->start_ts, HeapTupleHeaderGetRawXmin(tuple), HeapTupleHeaderGetRawXmax(tuple)); - } - return false; + LWLockRelease(ShardMapLock); + } + + if (!HeapTupleHeaderXminCommitted(tuple)) + { + if (HeapTupleHeaderXminInvalid(tuple)) + { + DEBUG_MVCC_XMIN(false, "xmin invalid"); + return false; + } + + /* Used by pre-9.0 binary upgrades */ + if (tuple->t_infomask & HEAP_MOVED_OFF) + { + TransactionId xvac = HeapTupleHeaderGetXvac(tuple); + + if (TransactionIdIsCurrentTransactionId(xvac)) + { + DEBUG_MVCC_XMIN(false, "move off"); + return false; + } + if (!XidInMVCCSnapshotDistri(tuple, xvac, snapshot, buffer, &need_retry, HEAP_XMIN_INVALID)) + { + if (TransactionIdDidCommit(xvac)) + { + SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, + InvalidTransactionId); + DEBUG_MVCC_XMIN(false, "move off 1"); + return false; + } + SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, + InvalidTransactionId); + } + if(need_retry) + { + goto retry; + } + + } + /* Used by pre-9.0 binary upgrades */ + else if (tuple->t_infomask & HEAP_MOVED_IN) + { + TransactionId xvac = HeapTupleHeaderGetXvac(tuple); + + if (!TransactionIdIsCurrentTransactionId(xvac)) + { + if (XidInMVCCSnapshotDistri(tuple, xvac, snapshot, buffer, &need_retry, HEAP_XMIN_INVALID)) + { + if(need_retry) + { + goto retry; + } + DEBUG_MVCC_XMIN(false, "move in"); + return false; + } + if (TransactionIdDidCommit(xvac)) + SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, + InvalidTransactionId); + else + { + SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, + InvalidTransactionId); + DEBUG_MVCC_XMIN(false, "move in"); + return false; + } + } + } + else if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmin(tuple))) + { + if (HeapTupleHeaderGetCmin(tuple) >= snapshot->curcid) + { + DEBUG_MVCC_XMIN(false, "current 1"); + return false; /* inserted after scan started */ + } + if (tuple->t_infomask & HEAP_XMAX_INVALID) /* xid invalid */ + { + DEBUG_MVCC_XMIN(true, "current 2"); + DEBUG_INCREASE_VISIBLE_TUPLE; + return true; + } + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) /* not deleter */ + { + DEBUG_MVCC_XMIN(true, "current 3"); + DEBUG_INCREASE_VISIBLE_TUPLE; + return true; + } + if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) + { + TransactionId xmax; + + xmax = HeapTupleGetUpdateXid(tuple); + + /* not LOCKED_ONLY, so it has to have an xmax */ + Assert(TransactionIdIsValid(xmax)); + + /* updating subtransaction must have aborted */ + if (!TransactionIdIsCurrentTransactionId(xmax)) + { + DEBUG_MVCC_XMIN(true, "current 3"); + DEBUG_INCREASE_VISIBLE_TUPLE; + return 
true; + } + else if (HeapTupleHeaderGetCmax(tuple) >= snapshot->curcid) + { + DEBUG_MVCC_XMIN(true, "current 4"); + DEBUG_INCREASE_VISIBLE_TUPLE; + return true; /* updated after scan started */ + } + else + { + DEBUG_MVCC_XMIN(false, "current 5"); + return false; /* updated before scan started */ + } + } + + if (!TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmax(tuple))) + { + /* deleting subtransaction must have aborted */ + SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, + InvalidTransactionId); + DEBUG_MVCC_XMIN(true, "current 6"); + DEBUG_INCREASE_VISIBLE_TUPLE; + return true; + } + + if (HeapTupleHeaderGetCmax(tuple) >= snapshot->curcid) + { + DEBUG_MVCC_XMIN(true, "current 7"); + DEBUG_INCREASE_VISIBLE_TUPLE; + return true; /* deleted after scan started */ + } + else + { + DEBUG_MVCC_XMIN(false, "current 8"); + return false; /* deleted before scan started */ + } + } + else if (XminInMVCCSnapshotByTimestamp(tuple, snapshot, buffer, &need_retry)) + { + if(need_retry) + { + goto retry; + } + DEBUG_MVCC_XMIN(false, "xmin in snapshot"); + return false; + } + else if (TransactionIdDidCommit(HeapTupleHeaderGetRawXmin(tuple))) + SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, + HeapTupleHeaderGetRawXmin(tuple)); + else + { + /* it must have aborted or crashed */ + SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, + InvalidTransactionId); + DEBUG_MVCC_XMIN(false, "xmin aborted"); + return false; + } + } + else + { + /* xmin is committed, but maybe not according to our snapshot */ + if (!HeapTupleHeaderXminFrozen(tuple) && + XminInMVCCSnapshotByTimestamp(tuple, snapshot, buffer, &need_retry)) + { + if(need_retry) + { + goto retry; + } + DEBUG_MVCC_XMIN(false, " xmin not committed according to snapshot"); + return false; /* treat as still in progress */ + } + } + + /* by here, the inserting transaction has committed */ + + if (tuple->t_infomask & HEAP_XMAX_INVALID) /* xid invalid or aborted */ + { + DEBUG_MVCC_XMIN(true, "invalid xmax"); + DEBUG_INCREASE_VISIBLE_TUPLE; + return true; + } + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)){ + DEBUG_MVCC_XMIN(true, "xmax locked"); + DEBUG_INCREASE_VISIBLE_TUPLE; + return true; + } + + if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) + { + TransactionId xmax; + + /* already checked above */ + Assert(!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)); + + xmax = HeapTupleGetUpdateXid(tuple); + + /* not LOCKED_ONLY, so it has to have an xmax */ + Assert(TransactionIdIsValid(xmax)); + + if (TransactionIdIsCurrentTransactionId(xmax)) + { + if (HeapTupleHeaderGetCmax(tuple) >= snapshot->curcid) + { + DEBUG_MVCC_XMIN(true, "multi xmax deleted after scan"); + DEBUG_INCREASE_VISIBLE_TUPLE; + return true; /* deleted after scan started */ + } + else + { + DEBUG_MVCC_XMINXMAX(false, xmax, "deleted before scan"); + return false; /* deleted before scan started */ + } + } + if (XidInMVCCSnapshotDistri(tuple, xmax, snapshot, buffer, &need_retry, HEAP_XMAX_INVALID)) + { + if(need_retry) + { + goto retry; + } + DEBUG_MVCC_XMIN(true, "multi xmax in snapshot"); + DEBUG_INCREASE_VISIBLE_TUPLE; + return true; + } + if (TransactionIdDidCommit(xmax)) + { + DEBUG_MVCC_XMINXMAX(false, xmax, "committed"); + return false; /* updating transaction committed */ + } + /* it must have aborted or crashed */ + DEBUG_MVCC_XMIN(true, "xmax aborted"); + DEBUG_INCREASE_VISIBLE_TUPLE; + return true; + } + + if (!(tuple->t_infomask & HEAP_XMAX_COMMITTED)) + { + if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmax(tuple))) + { + if (HeapTupleHeaderGetCmax(tuple) >= 
snapshot->curcid) + { + DEBUG_MVCC_XMIN(true, "xmax deleted after scan started"); + DEBUG_INCREASE_VISIBLE_TUPLE; + return true; /* deleted after scan started */ + } + else + { + DEBUG_MVCC_XMIN(false, "xmax deleted before scan started"); + return false; /* deleted before scan started */ + } + } + + if (XmaxInMVCCSnapshotByTimestamp(tuple, snapshot, buffer, &need_retry)) + { + if(need_retry) + { + goto retry; + } + DEBUG_MVCC_XMIN(true, "xmax in mvcc snapshot"); + DEBUG_INCREASE_VISIBLE_TUPLE; + return true; + } + if (!TransactionIdDidCommit(HeapTupleHeaderGetRawXmax(tuple))) + { + /* it must have aborted or crashed */ + SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, + InvalidTransactionId); + DEBUG_MVCC_XMIN(true, "xmax aborted"); + DEBUG_INCREASE_VISIBLE_TUPLE; + return true; + } + + /* xmax transaction committed */ + SetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED, + HeapTupleHeaderGetRawXmax(tuple)); + } + else + { + /* xmax is committed, but maybe not according to our snapshot */ + if (XmaxInMVCCSnapshotByTimestamp(tuple, snapshot, buffer, &need_retry)) + { + if(need_retry) + { + goto retry; + } + DEBUG_MVCC_XMIN(true, "xmax not committed"); + DEBUG_INCREASE_VISIBLE_TUPLE; + return true; /* treat as still in progress */ + } + } + + /* xmax transaction committed */ + DEBUG_MVCC_XMINXMAX(true, HeapTupleHeaderGetRawXmax(tuple), "xmax committed"); + return false; } #endif @@ -3685,243 +3402,229 @@ XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot) return false; } - #ifdef __SUPPORT_DISTRIBUTED_TRANSACTION__ static bool -XidInMVCCSnapshotDistri(HeapTupleHeader tuple, TransactionId xid, Snapshot snapshot, Buffer buffer, bool *need_retry, - uint16 infomask) -{// #lizard forgives - int res = false; - GlobalTimestamp prepare_ts; - GlobalTimestamp global_committs = 0; - - *need_retry = false; - /* - * For Tbase, we propose a concurrency control mechanism - * based on global timestamp to maintain distributed transaction consistency. - * - * Rule: T2 can see T1's modification only if T2.start > T1.commit. - * For read-committed isolation, T2.start is the executing statement's start timestmap. - * - */ - - - if (snapshot->local || !TransactionIdIsNormal(xid)) - { - - res = XidInMVCCSnapshot(xid, snapshot); - SnapshotCheck(xid, snapshot, res, 0); - if(enable_distri_visibility_print && snapshot->local) - { - elog(DEBUG12, "local: snapshot ts " INT64_FORMAT " xid %d res %d.", snapshot->start_ts, xid, res); - } - return res; - } - - if(TransactionIdGetCommitTsData(xid, &global_committs, NULL)) - { - if(!GlobalTimestampIsValid(snapshot->start_ts)) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("transaction %d does not have valid timestamp. 
snapshot start ts " INT64_FORMAT - ", autovacuum %d in recovery %d", - xid, snapshot->start_ts, IsAutoVacuumWorkerProcess(), snapshot->takenDuringRecovery))); - } - Assert(GlobalTimestampIsValid(snapshot->start_ts)); - - if(enable_distri_debug) - { - snapshot->scanned_tuples_after_committed++; - } - - if(CommitTimestampIsLocal(global_committs)) - { - res = XidInMVCCSnapshot(xid, snapshot); - SnapshotCheck(xid, snapshot, res, 0); - elog(DEBUG12, "local snapshot ts " INT64_FORMAT " res %d xid %d after wait.", snapshot->start_ts, res, xid); - return res; - } - - - if(snapshot->start_ts > global_committs) - { - SnapshotCheck(xid, snapshot, false, global_committs); - if(enable_distri_visibility_print) - { - elog(LOG, "snapshot ts " INT64_FORMAT " false xid %d committs "INT64_FORMAT" 1.", - snapshot->start_ts, xid, global_committs); - } - return false; - } - else - { - SnapshotCheck(xid, snapshot, true, global_committs); - if(enable_distri_visibility_print) - { - elog(LOG, "snapshot ts " INT64_FORMAT " true xid %d committs "INT64_FORMAT" 2.", - snapshot->start_ts, xid, global_committs); - } - SetTimestamp(tuple, xid, buffer, infomask); - return true; - } - } - - prepare_ts = InvalidGlobalTimestamp; - /* - * If xid has passed the prepare phase, - * we should wait for it to complete. - */ - if(XidIsPrepared(xid, snapshot, &prepare_ts)) - { - - if(enable_distri_debug) - { - snapshot->scanned_tuples_after_prepare++; - } - - if(!GlobalTimestampIsValid(snapshot->start_ts)) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("transaction %d does not have valid timestamp. snapshot start ts " INT64_FORMAT ", autovacuum %d in recovery %d", - xid, snapshot->start_ts, IsAutoVacuumWorkerProcess(), snapshot->takenDuringRecovery))); - } - - if(GlobalTimestampIsValid(prepare_ts) && !GlobalTimestampIsFrozen(prepare_ts) && - (snapshot->start_ts < prepare_ts)) - { - SnapshotCheck(xid, snapshot, true, 0); - if(enable_distri_visibility_print) - { - elog(LOG, "snapshot ts " INT64_FORMAT " true xid %d prep "INT64_FORMAT".", snapshot->start_ts, xid, prepare_ts); - } - elog(DEBUG12, "xid %d, start_ts " INT64_FORMAT ", prepare " INT64_FORMAT " after wait true.", xid, snapshot->start_ts, prepare_ts); - return true; - } - - if(GlobalTimestampIsValid(prepare_ts)) - { - BufferDesc *buf; - int lock_type = -1; - - buf = GetBufferDescriptor(buffer - 1); - - if(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(buf), - LW_EXCLUSIVE)) - { - lock_type = BUFFER_LOCK_EXCLUSIVE; - } - else if(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(buf), - LW_SHARED)) - { - lock_type = BUFFER_LOCK_SHARE; - } - - XactLockTableWait(xid, NULL, NULL, XLTW_None); - if(lock_type != -1) - { - /* Avoid deadlock */ - if(TransactionIdDidAbort(xid)) - { - if(enable_distri_visibility_print) - { - elog(LOG, "abort snapshot ts " INT64_FORMAT " false xid %d .", snapshot->start_ts, xid); - } - if(enable_distri_debug) - { - snapshot->scanned_tuples_after_abort++; - } - - *need_retry = false; - return false; - } - else - { - *need_retry = true; - return true; - } - } - } - - - - if(TransactionIdGetCommitTsData(xid, &global_committs, NULL)) - { - - if(enable_distri_debug) - { - snapshot->scanned_tuples_after_committed++; - } - - if(CommitTimestampIsLocal(global_committs)) - { - res = XidInMVCCSnapshot(xid, snapshot); - SnapshotCheck(xid, snapshot, res, 0); - elog(DEBUG12, "local snapshot ts " INT64_FORMAT " res %d xid %d after wait.", snapshot->start_ts, res, xid); - return res; - } - elog(DEBUG12, "snapshot ts " INT64_FORMAT " global 
committs " INT64_FORMAT " xid %d after wait.", snapshot->start_ts, global_committs, xid); - - - if(snapshot->start_ts > global_committs) - { - if(enable_distri_visibility_print) - { - elog(LOG, "snapshot ts " INT64_FORMAT " false xid %d committs "INT64_FORMAT" 3.", - snapshot->start_ts, xid, global_committs); - } - SnapshotCheck(xid, snapshot, false, global_committs); - return false; - } - else - { - if(enable_distri_visibility_print) - { - elog(LOG, "snapshot ts " INT64_FORMAT " true xid %d committs "INT64_FORMAT" 4.", - snapshot->start_ts, xid, global_committs); - } - SnapshotCheck(xid, snapshot, true, global_committs); - SetTimestamp(tuple, xid, buffer, infomask); - return true; - - } - } - else - {/* Abort or crashed */ - - if(enable_distri_debug) - { - snapshot->scanned_tuples_after_abort++; - } - elog(DEBUG12, "abort: snapshot ts " INT64_FORMAT " xid %d.", snapshot->start_ts, xid); - - SnapshotCheck(xid, snapshot, false, 0); - if(enable_distri_visibility_print) - { - elog(LOG, "abort snapshot ts " INT64_FORMAT " false xid %d .", snapshot->start_ts, xid); - } - return false; - } - } - - if(enable_distri_debug) - { - snapshot->scanned_tuples_before_prepare++; - } - /* - * For non-prepared transaction, its commit timestamp must be larger than - * the current running transaction/statement's start timestamp. - * This is because that as T1's commit timestamp has not yet been aquired on CN, - * T2.start < T1.commit is always being held. - */ - SnapshotCheck(xid, snapshot, true, 0); - if(enable_distri_visibility_print) - { - elog(LOG, "snapshot ts " INT64_FORMAT " true xid %d 5.", snapshot->start_ts, xid); - } - return true; - +XidInMVCCSnapshotDistri(HeapTupleHeader tuple, TransactionId xid, + Snapshot snapshot, Buffer buffer, + bool *need_retry, uint16 infomask) +{ + int res = false; + GlobalTimestamp prepare_ts; + GlobalTimestamp global_committs = 0; + + *need_retry = false; + /* + * For Tbase, we propose a concurrency control mechanism based on global + * timestamp to maintain distributed transaction consistency. + * + * Rule: T2 can see T1's modification only if T2.start > T1.commit. + * For read-committed isolation, T2.start is the executing statement's + * start timestmap. + */ + if (snapshot->local || !TransactionIdIsNormal(xid)) + { + res = XidInMVCCSnapshot(xid, snapshot); + SnapshotCheck(xid, snapshot, res, 0); + + DEBUG_SNAPSHOT(elog(DEBUG12, "local: snapshot ts " INT64_FORMAT "xid %d" + " res %d.", snapshot->start_ts, xid, res)); + return res; + } + + if(TransactionIdGetCommitTsData(xid, &global_committs, NULL)) + { + if(!GlobalTimestampIsValid(snapshot->start_ts)) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("transaction %d does not have valid timestamp." 
+ "snapshot start ts " INT64_FORMAT ", autovacuum" + " %d in recovery %d", + xid, snapshot->start_ts, + IsAutoVacuumWorkerProcess(), + snapshot->takenDuringRecovery))); + } + Assert(GlobalTimestampIsValid(snapshot->start_ts)); + + if(enable_distri_debug) + { + snapshot->scanned_tuples_after_committed++; + } + + if(CommitTimestampIsLocal(global_committs)) + { + res = XidInMVCCSnapshot(xid, snapshot); + SnapshotCheck(xid, snapshot, res, 0); + + DEBUG_SNAPSHOT(elog(DEBUG12, "local snapshot ts " INT64_FORMAT "res" + " %d xid %d after wait.", snapshot->start_ts, res, xid)); + return res; + } + + if(snapshot->start_ts > global_committs) + { + SnapshotCheck(xid, snapshot, false, global_committs); + + DEBUG_SNAPSHOT(elog(LOG, "snapshot ts " INT64_FORMAT "false xid %d" + " committs " INT64_FORMAT "1.", snapshot->start_ts, xid, + global_committs)); + return false; + } + else + { + SnapshotCheck(xid, snapshot, true, global_committs); + SetTimestamp(tuple, xid, buffer, infomask); + + DEBUG_SNAPSHOT(elog(LOG, "snapshot ts " INT64_FORMAT "true xid %d " + "committs " INT64_FORMAT "2.", + snapshot->start_ts, xid, global_committs)); + return true; + } + } + + prepare_ts = InvalidGlobalTimestamp; + /* + * If xid has passed the prepare phase, we should wait for it to complete. + */ + if(XidIsPrepared(xid, snapshot, &prepare_ts)) + { + if(enable_distri_debug) + { + snapshot->scanned_tuples_after_prepare++; + } + + if(!GlobalTimestampIsValid(snapshot->start_ts)) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("transaction %d does not have valid timestamp. " + "snapshot start ts " INT64_FORMAT ", autovacuum" + " %d in recovery %d", + xid, snapshot->start_ts, + IsAutoVacuumWorkerProcess(), + snapshot->takenDuringRecovery))); + } + + if(GlobalTimestampIsValid(prepare_ts) && !GlobalTimestampIsFrozen(prepare_ts) && + (snapshot->start_ts < prepare_ts)) + { + SnapshotCheck(xid, snapshot, true, 0); + + DEBUG_SNAPSHOT(elog(LOG, "snapshot ts " INT64_FORMAT " true xid %d" + " prep " INT64_FORMAT, snapshot->start_ts, xid, prepare_ts)); + return true; + } + + if(GlobalTimestampIsValid(prepare_ts)) + { + BufferDesc *buf; + int lock_type = -1; + + buf = GetBufferDescriptor(buffer - 1); + + if(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(buf), + LW_EXCLUSIVE)) + { + lock_type = BUFFER_LOCK_EXCLUSIVE; + } + else if(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(buf), + LW_SHARED)) + { + lock_type = BUFFER_LOCK_SHARE; + } + + XactLockTableWait(xid, NULL, NULL, XLTW_None); + if(lock_type != -1) + { + /* Avoid deadlock */ + if(TransactionIdDidAbort(xid)) + { + DEBUG_SNAPSHOT(elog(LOG, "abort snapshot ts " INT64_FORMAT + "false xid %d .", snapshot->start_ts, xid)); + if(enable_distri_debug) + { + snapshot->scanned_tuples_after_abort++; + } + + *need_retry = false; + return false; + } + else + { + *need_retry = true; + return true; + } + } + } + + if(TransactionIdGetCommitTsData(xid, &global_committs, NULL)) + { + if(enable_distri_debug) + { + snapshot->scanned_tuples_after_committed++; + } + + if(CommitTimestampIsLocal(global_committs)) + { + res = XidInMVCCSnapshot(xid, snapshot); + SnapshotCheck(xid, snapshot, res, 0); + + DEBUG_SNAPSHOT(elog(DEBUG12, "local snapshot ts " INT64_FORMAT + "res %d xid %d after wait.", + snapshot->start_ts, res,xid)); + return res; + } + + if(snapshot->start_ts > global_committs) + { + SnapshotCheck(xid, snapshot, false, global_committs); + + DEBUG_SNAPSHOT(elog(LOG, "snapshot ts " INT64_FORMAT " false " + "xid %d commit_ts " INT64_FORMAT " 3.", + snapshot->start_ts, 
xid, global_committs)); + return false; + } + else + { + SnapshotCheck(xid, snapshot, true, global_committs); + SetTimestamp(tuple, xid, buffer, infomask); + + DEBUG_SNAPSHOT(elog(LOG, "snapshot ts " INT64_FORMAT " true xid" + " %d committs" INT64_FORMAT " 4.", snapshot->start_ts, + xid, global_committs)); + return true; + } + } + else + {/* Abort or crashed */ + if(enable_distri_debug) + { + snapshot->scanned_tuples_after_abort++; + } + SnapshotCheck(xid, snapshot, false, 0); + + DEBUG_SNAPSHOT(elog(LOG, "abort snapshot ts " INT64_FORMAT " false" + " xid %d .", snapshot->start_ts, xid)); + return false; + } + } + + if(enable_distri_debug) + { + snapshot->scanned_tuples_before_prepare++; + } + + /* + * For non-prepared transaction, its commit timestamp must be larger than + * the current running transaction/statement's start timestamp. This is + * because that as T1's commit timestamp has not yet been aquired on CN, + * T2.start < T1.commit is always being held. + */ + SnapshotCheck(xid, snapshot, true, 0); + + DEBUG_SNAPSHOT(elog(LOG, "snapshot ts " INT64_FORMAT " true xid %d 5.", + snapshot->start_ts, xid)); + return true; } #endif diff --git a/src/include/utils/snapshot.h b/src/include/utils/snapshot.h index 94e82799..4ba6f96b 100644 --- a/src/include/utils/snapshot.h +++ b/src/include/utils/snapshot.h @@ -146,19 +146,20 @@ typedef struct SnapshotData #endif #ifdef __SUPPORT_DISTRIBUTED_TRANSACTION__ - GlobalTimestamp start_ts; /* global timestamp at which the statement/transaction starts */ + /* + * global timestamp at which the statement/transaction starts + */ + GlobalTimestamp start_ts; - bool local; /* local snapshot */ + bool local; /* local snapshot */ - TransactionId *prepare_xip; - - GlobalTimestamp *prepare_xip_ts; + TransactionId *prepare_xip; + GlobalTimestamp *prepare_xip_ts; uint32 prepare_xcnt; - TransactionId *prepare_subxip; - - GlobalTimestamp *prepare_subxip_ts; + TransactionId *prepare_subxip; + GlobalTimestamp *prepare_subxip_ts; uint32 prepare_subxcnt; From 684e779135021ffbc89def45e1a16849ebbec3ef Mon Sep 17 00:00:00 2001 From: ericxwu Date: Tue, 14 Jul 2020 20:00:08 +0800 Subject: [PATCH 014/578] print debugging info before SnapshotCheck --- src/backend/utils/time/tqual.c | 54 ++++++++++++++++------------------ 1 file changed, 25 insertions(+), 29 deletions(-) diff --git a/src/backend/utils/time/tqual.c b/src/backend/utils/time/tqual.c index 505028eb..6b01aa3e 100644 --- a/src/backend/utils/time/tqual.c +++ b/src/backend/utils/time/tqual.c @@ -1160,11 +1160,11 @@ XminInMVCCSnapshotByTimestamp(HeapTupleHeader tuple, Snapshot snapshot, else if (snapshot->local || CommitTimestampIsLocal(global_committs)) { res = XidInMVCCSnapshot(xid, snapshot); - SnapshotCheck(xid, snapshot, res, 0); DEBUG_SNAPSHOT(elog(LOG, "xmin local snapshot ts " INT64_FORMAT " res %d xid %d committs " INT64_FORMAT, snapshot->start_ts, res, xid, global_committs)); + SnapshotCheck(xid, snapshot, res, 0); return res; } else @@ -1212,20 +1212,19 @@ XminInMVCCSnapshotByTimestamp(HeapTupleHeader tuple, Snapshot snapshot, if(snapshot->start_ts > global_committs) { - SnapshotCheck(xid, snapshot, false, global_committs); - DEBUG_SNAPSHOT(elog(LOG, "snapshot ts " INT64_FORMAT " false xid %d " "committs " INT64_FORMAT " 21.", snapshot->start_ts, xid, global_committs)); + SnapshotCheck(xid, snapshot, false, global_committs); return false; } else - { - SnapshotCheck(xid, snapshot, true, global_committs); - + { DEBUG_SNAPSHOT(elog(LOG, "snapshot ts " INT64_FORMAT " true xid %d" " committs " INT64_FORMAT " 
22.", snapshot->start_ts, xid, global_committs)); + + SnapshotCheck(xid, snapshot, true, global_committs); return true; } } @@ -1251,10 +1250,11 @@ XmaxInMVCCSnapshotByTimestamp(HeapTupleHeader tuple, Snapshot snapshot, else if (snapshot->local || CommitTimestampIsLocal(global_committs)) { res = XidInMVCCSnapshot(xid, snapshot); - SnapshotCheck(xid, snapshot, res, 0); DEBUG_SNAPSHOT(elog(LOG, "xmax local snapshot ts " INT64_FORMAT " res " "%d xid %d.", snapshot->start_ts, res, xid)); + + SnapshotCheck(xid, snapshot, res, 0); return res; } else @@ -1302,20 +1302,20 @@ XmaxInMVCCSnapshotByTimestamp(HeapTupleHeader tuple, Snapshot snapshot, if(snapshot->start_ts > global_committs) { - SnapshotCheck(xid, snapshot, false, global_committs); - DEBUG_SNAPSHOT(elog(LOG, "snapshot ts " INT64_FORMAT "false xid %d" " committs" INT64_FORMAT "11.", snapshot->start_ts, xid, global_committs)); + + SnapshotCheck(xid, snapshot, false, global_committs); return false; } else { - SnapshotCheck(xid, snapshot, true, global_committs); - DEBUG_SNAPSHOT(elog(LOG, "snapshot ts " INT64_FORMAT "true xid %d " "committs " INT64_FORMAT "12.", snapshot->start_ts, xid, global_committs)); + + SnapshotCheck(xid, snapshot, true, global_committs); return true; } } @@ -3424,10 +3424,10 @@ XidInMVCCSnapshotDistri(HeapTupleHeader tuple, TransactionId xid, if (snapshot->local || !TransactionIdIsNormal(xid)) { res = XidInMVCCSnapshot(xid, snapshot); - SnapshotCheck(xid, snapshot, res, 0); DEBUG_SNAPSHOT(elog(DEBUG12, "local: snapshot ts " INT64_FORMAT "xid %d" " res %d.", snapshot->start_ts, xid, res)); + SnapshotCheck(xid, snapshot, res, 0); return res; } @@ -3454,30 +3454,28 @@ XidInMVCCSnapshotDistri(HeapTupleHeader tuple, TransactionId xid, if(CommitTimestampIsLocal(global_committs)) { res = XidInMVCCSnapshot(xid, snapshot); - SnapshotCheck(xid, snapshot, res, 0); DEBUG_SNAPSHOT(elog(DEBUG12, "local snapshot ts " INT64_FORMAT "res" " %d xid %d after wait.", snapshot->start_ts, res, xid)); + SnapshotCheck(xid, snapshot, res, 0); return res; } if(snapshot->start_ts > global_committs) { - SnapshotCheck(xid, snapshot, false, global_committs); - DEBUG_SNAPSHOT(elog(LOG, "snapshot ts " INT64_FORMAT "false xid %d" " committs " INT64_FORMAT "1.", snapshot->start_ts, xid, global_committs)); + SnapshotCheck(xid, snapshot, false, global_committs); return false; } else { - SnapshotCheck(xid, snapshot, true, global_committs); - SetTimestamp(tuple, xid, buffer, infomask); - DEBUG_SNAPSHOT(elog(LOG, "snapshot ts " INT64_FORMAT "true xid %d " "committs " INT64_FORMAT "2.", snapshot->start_ts, xid, global_committs)); + SnapshotCheck(xid, snapshot, true, global_committs); + SetTimestamp(tuple, xid, buffer, infomask); return true; } } @@ -3508,10 +3506,9 @@ XidInMVCCSnapshotDistri(HeapTupleHeader tuple, TransactionId xid, if(GlobalTimestampIsValid(prepare_ts) && !GlobalTimestampIsFrozen(prepare_ts) && (snapshot->start_ts < prepare_ts)) { - SnapshotCheck(xid, snapshot, true, 0); - DEBUG_SNAPSHOT(elog(LOG, "snapshot ts " INT64_FORMAT " true xid %d" " prep " INT64_FORMAT, snapshot->start_ts, xid, prepare_ts)); + SnapshotCheck(xid, snapshot, true, 0); return true; } @@ -3567,31 +3564,30 @@ XidInMVCCSnapshotDistri(HeapTupleHeader tuple, TransactionId xid, if(CommitTimestampIsLocal(global_committs)) { res = XidInMVCCSnapshot(xid, snapshot); - SnapshotCheck(xid, snapshot, res, 0); DEBUG_SNAPSHOT(elog(DEBUG12, "local snapshot ts " INT64_FORMAT "res %d xid %d after wait.", snapshot->start_ts, res,xid)); + SnapshotCheck(xid, snapshot, res, 0); return res; 
} if(snapshot->start_ts > global_committs) { - SnapshotCheck(xid, snapshot, false, global_committs); - DEBUG_SNAPSHOT(elog(LOG, "snapshot ts " INT64_FORMAT " false " "xid %d commit_ts " INT64_FORMAT " 3.", snapshot->start_ts, xid, global_committs)); + SnapshotCheck(xid, snapshot, false, global_committs); return false; } else { - SnapshotCheck(xid, snapshot, true, global_committs); - SetTimestamp(tuple, xid, buffer, infomask); - DEBUG_SNAPSHOT(elog(LOG, "snapshot ts " INT64_FORMAT " true xid" " %d committs" INT64_FORMAT " 4.", snapshot->start_ts, xid, global_committs)); + SnapshotCheck(xid, snapshot, true, global_committs); + + SetTimestamp(tuple, xid, buffer, infomask); return true; } } @@ -3601,10 +3597,10 @@ XidInMVCCSnapshotDistri(HeapTupleHeader tuple, TransactionId xid, { snapshot->scanned_tuples_after_abort++; } - SnapshotCheck(xid, snapshot, false, 0); DEBUG_SNAPSHOT(elog(LOG, "abort snapshot ts " INT64_FORMAT " false" " xid %d .", snapshot->start_ts, xid)); + SnapshotCheck(xid, snapshot, false, 0); return false; } } @@ -3620,10 +3616,10 @@ XidInMVCCSnapshotDistri(HeapTupleHeader tuple, TransactionId xid, * because that as T1's commit timestamp has not yet been aquired on CN, * T2.start < T1.commit is always being held. */ - SnapshotCheck(xid, snapshot, true, 0); - DEBUG_SNAPSHOT(elog(LOG, "snapshot ts " INT64_FORMAT " true xid %d 5.", snapshot->start_ts, xid)); + SnapshotCheck(xid, snapshot, true, 0); + return true; } From 97226a8dfc427a01fb7b52ef7ea6f3d3036710d9 Mon Sep 17 00:00:00 2001 From: aidenma Date: Tue, 28 Jul 2020 11:13:02 +0800 Subject: [PATCH 015/578] fix warning pg_basebackup --- src/bin/pg_basebackup/pg_basebackup.c | 601 ++++++++++++++------------ 1 file changed, 314 insertions(+), 287 deletions(-) diff --git a/src/bin/pg_basebackup/pg_basebackup.c b/src/bin/pg_basebackup/pg_basebackup.c index 2f5717e2..91af5b93 100644 --- a/src/bin/pg_basebackup/pg_basebackup.c +++ b/src/bin/pg_basebackup/pg_basebackup.c @@ -1766,302 +1766,329 @@ BaseBackup(void) maxServerMajor; int serverVersion, serverMajor; - - Assert(conn != NULL); - - /* - * Check server version. BASE_BACKUP command was introduced in 9.1, so we - * can't work with servers older than 9.1. - */ - minServerMajor = 901; - maxServerMajor = PG_VERSION_NUM / 100; - serverVersion = PQserverVersion(conn); - serverMajor = serverVersion / 100; - if (serverMajor < minServerMajor || serverMajor > maxServerMajor) - { - const char *serverver = PQparameterStatus(conn, "server_version"); - - fprintf(stderr, _("%s: incompatible server version %s\n"), - progname, serverver ? serverver : "'unknown'"); - disconnect_and_exit(1); - } - - /* - * If WAL streaming was requested, also check that the server is new - * enough for that. - */ - if (includewal == STREAM_WAL && !CheckServerVersionForStreaming(conn)) - { - /* - * Error message already written in CheckServerVersionForStreaming(), - * but add a hint about using -X none. 
- */ - fprintf(stderr, _("HINT: use -X none or -X fetch to disable log streaming\n")); - disconnect_and_exit(1); - } - - /* - * Build contents of recovery.conf if requested - */ - if (writerecoveryconf) - GenerateRecoveryConf(conn); - - /* - * Run IDENTIFY_SYSTEM so we can get the timeline - */ - if (!RunIdentifySystem(conn, &sysidentifier, &latesttli, NULL, NULL)) - disconnect_and_exit(1); - - /* - * Start the actual backup - */ - PQescapeStringConn(conn, escaped_label, label, sizeof(escaped_label), &i); - - if (maxrate > 0) - maxrate_clause = psprintf("MAX_RATE %u", maxrate); - - if (verbose) - fprintf(stderr, - _("%s: initiating base backup, waiting for checkpoint to complete\n"), - progname); - - if (showprogress && !verbose) - fprintf(stderr, "waiting for checkpoint\r"); - - basebkp = - psprintf("BASE_BACKUP LABEL '%s' %s %s %s %s %s %s", - escaped_label, - showprogress ? "PROGRESS" : "", - includewal == FETCH_WAL ? "WAL" : "", - fastcheckpoint ? "FAST" : "", - includewal == NO_WAL ? "" : "NOWAIT", - maxrate_clause ? maxrate_clause : "", - format == 't' ? "TABLESPACE_MAP" : ""); - - if (PQsendQuery(conn, basebkp) == 0) - { - fprintf(stderr, _("%s: could not send replication command \"%s\": %s"), - progname, "BASE_BACKUP", PQerrorMessage(conn)); - disconnect_and_exit(1); - } - - /* - * Get the starting WAL location - */ - res = PQgetResult(conn); - if (PQresultStatus(res) != PGRES_TUPLES_OK) - { - fprintf(stderr, _("%s: could not initiate base backup: %s"), - progname, PQerrorMessage(conn)); - disconnect_and_exit(1); - } - if (PQntuples(res) != 1) - { - fprintf(stderr, - _("%s: server returned unexpected response to BASE_BACKUP command; got %d rows and %d fields, expected %d rows and %d fields\n"), - progname, PQntuples(res), PQnfields(res), 1, 2); - disconnect_and_exit(1); - } - - /* start_point: get last checkpoint point position from master */ - strlcpy(xlogstart, PQgetvalue(res, 0, 0), sizeof(xlogstart)); - - /* - * If we're streaming WAL, start the streaming session before we start - * receiving the actual data chunks. - */ - if (includewal == STREAM_WAL) - { - if (verbose) - fprintf(stderr, _("%s: starting background WAL receiver\n"), - progname); - StartLogStreamer(xlogstart, starttli, sysidentifier); - } - - /* - * Start receiving chunks - */ - for (i = 0; i < PQntuples(res); i++) - { - if (format == 't') - ReceiveTarFile(conn, res, i); - else - ReceiveAndUnpackTarFile(conn, res, i); - } /* Loop over all tablespaces */ - - if (showprogress) - { - progress_report(PQntuples(res), NULL, true); - fprintf(stderr, "\n"); /* Need to move to next line */ - } - - PQclear(res); - - /* - * Get the stop position - */ - res = PQgetResult(conn); - if (PQresultStatus(res) != PGRES_TUPLES_OK) - { - fprintf(stderr, - _("%s: could not get write-ahead log end position from server: %s"), - progname, PQerrorMessage(conn)); - disconnect_and_exit(1); - } - if (PQntuples(res) != 1) - { - fprintf(stderr, - _("%s: no write-ahead log end position returned from server\n"), - progname); - disconnect_and_exit(1); - } - strlcpy(xlogend, PQgetvalue(res, 0, 0), sizeof(xlogend)); - if (verbose && includewal != NO_WAL) - fprintf(stderr, _("%s: write-ahead log end point: %s\n"), progname, xlogend); - PQclear(res); - - res = PQgetResult(conn); - if (PQresultStatus(res) != PGRES_COMMAND_OK) - { - fprintf(stderr, _("%s: final receive failed: %s"), - progname, PQerrorMessage(conn)); - disconnect_and_exit(1); - } - - if (bgchild > 0) - { + Assert(conn != NULL); + /* + * Check server version. 
BASE_BACKUP command was introduced in 9.1, so we + * can't work with servers older than 9.1. + */ + minServerMajor = 901; + maxServerMajor = PG_VERSION_NUM / 100; + serverVersion = PQserverVersion(conn); + serverMajor = serverVersion / 100; + if (serverMajor < minServerMajor || serverMajor > maxServerMajor) + { + const char *serverver = PQparameterStatus(conn, "server_version"); + fprintf(stderr, _("%s: incompatible server version %s\n"), + progname, serverver ? serverver : "'unknown'"); + disconnect_and_exit(1); + } + /* + * If WAL streaming was requested, also check that the server is new + * enough for that. + */ + if (includewal == STREAM_WAL && !CheckServerVersionForStreaming(conn)) + { + /* + * Error message already written in CheckServerVersionForStreaming(), + * but add a hint about using -X none. + */ + fprintf(stderr, _("HINT: use -X none or -X fetch to disable log streaming\n")); + disconnect_and_exit(1); + } + /* + * Build contents of recovery.conf if requested + */ + if (writerecoveryconf) + GenerateRecoveryConf(conn); + /* + * Run IDENTIFY_SYSTEM so we can get the timeline + */ + if (!RunIdentifySystem(conn, &sysidentifier, &latesttli, NULL, NULL)) + disconnect_and_exit(1); + /* + * Start the actual backup + */ + PQescapeStringConn(conn, escaped_label, label, sizeof(escaped_label), &i); + if (maxrate > 0) + maxrate_clause = psprintf("MAX_RATE %u", maxrate); + if (verbose) + fprintf(stderr, + _("%s: initiating base backup, waiting for checkpoint to complete\n"), + progname); + if (showprogress && !verbose) + fprintf(stderr, "waiting for checkpoint\r"); + basebkp = + psprintf("BASE_BACKUP LABEL '%s' %s %s %s %s %s %s", + escaped_label, + showprogress ? "PROGRESS" : "", + includewal == FETCH_WAL ? "WAL" : "", + fastcheckpoint ? "FAST" : "", + includewal == NO_WAL ? "" : "NOWAIT", + maxrate_clause ? maxrate_clause : "", + format == 't' ? "TABLESPACE_MAP" : ""); + if (PQsendQuery(conn, basebkp) == 0) + { + fprintf(stderr, _("%s: could not send replication command \"%s\": %s"), + progname, "BASE_BACKUP", PQerrorMessage(conn)); + disconnect_and_exit(1); + } + /* + * Get the starting WAL location + */ + res = PQgetResult(conn); + if (PQresultStatus(res) != PGRES_TUPLES_OK) + { + fprintf(stderr, _("%s: could not initiate base backup: %s"), + progname, PQerrorMessage(conn)); + disconnect_and_exit(1); + } + if (PQntuples(res) != 1) + { + fprintf(stderr, + _("%s: server returned unexpected response to BASE_BACKUP command; got %d rows and %d fields, expected %d rows and %d fields\n"), + progname, PQntuples(res), PQnfields(res), 1, 2); + disconnect_and_exit(1); + } + strlcpy(xlogstart, PQgetvalue(res, 0, 0), sizeof(xlogstart)); + if (verbose) + fprintf(stderr, _("%s: checkpoint completed\n"), progname); + /* + * 9.3 and later sends the TLI of the starting point. With older servers, + * assume it's the same as the latest timeline reported by + * IDENTIFY_SYSTEM. 
+ */ + if (PQnfields(res) >= 2) + starttli = atoi(PQgetvalue(res, 0, 1)); + else + starttli = latesttli; + PQclear(res); + MemSet(xlogend, 0, sizeof(xlogend)); + if (verbose && includewal != NO_WAL) + fprintf(stderr, _("%s: write-ahead log start point: %s on timeline %u\n"), + progname, xlogstart, starttli); + /* + * Get the header + */ + res = PQgetResult(conn); + if (PQresultStatus(res) != PGRES_TUPLES_OK) + { + fprintf(stderr, _("%s: could not get backup header: %s"), + progname, PQerrorMessage(conn)); + disconnect_and_exit(1); + } + if (PQntuples(res) < 1) + { + fprintf(stderr, _("%s: no data returned from server\n"), progname); + disconnect_and_exit(1); + } + /* + * Sum up the total size, for progress reporting + */ + totalsize = totaldone = 0; + tablespacecount = PQntuples(res); + for (i = 0; i < PQntuples(res); i++) + { + totalsize += atol(PQgetvalue(res, i, 2)); + /* + * Verify tablespace directories are empty. Don't bother with the + * first once since it can be relocated, and it will be checked before + * we do anything anyway. + */ + if (format == 'p' && !PQgetisnull(res, i, 1)) + { + char *path = (char *) get_tablespace_mapping(PQgetvalue(res, i, 1)); + verify_dir_is_empty_or_create(path, &made_tablespace_dirs, &found_tablespace_dirs); + } + } + /* + * When writing to stdout, require a single tablespace + */ + if (format == 't' && strcmp(basedir, "-") == 0 && PQntuples(res) > 1) + { + fprintf(stderr, + _("%s: can only write single tablespace to stdout, database has %d\n"), + progname, PQntuples(res)); + disconnect_and_exit(1); + } + /* + * If we're streaming WAL, start the streaming session before we start + * receiving the actual data chunks. + */ + if (includewal == STREAM_WAL) + { + if (verbose) + fprintf(stderr, _("%s: starting background WAL receiver\n"), + progname); + StartLogStreamer(xlogstart, starttli, sysidentifier); + } + /* + * Start receiving chunks + */ + for (i = 0; i < PQntuples(res); i++) + { + if (format == 't') + ReceiveTarFile(conn, res, i); + else + ReceiveAndUnpackTarFile(conn, res, i); + } /* Loop over all tablespaces */ + if (showprogress) + { + progress_report(PQntuples(res), NULL, true); + fprintf(stderr, "\n"); /* Need to move to next line */ + } + PQclear(res); + /* + * Get the stop position + */ + res = PQgetResult(conn); + if (PQresultStatus(res) != PGRES_TUPLES_OK) + { + fprintf(stderr, + _("%s: could not get write-ahead log end position from server: %s"), + progname, PQerrorMessage(conn)); + disconnect_and_exit(1); + } + if (PQntuples(res) != 1) + { + fprintf(stderr, + _("%s: no write-ahead log end position returned from server\n"), + progname); + disconnect_and_exit(1); + } + strlcpy(xlogend, PQgetvalue(res, 0, 0), sizeof(xlogend)); + if (verbose && includewal != NO_WAL) + fprintf(stderr, _("%s: write-ahead log end point: %s\n"), progname, xlogend); + PQclear(res); + res = PQgetResult(conn); + if (PQresultStatus(res) != PGRES_COMMAND_OK) + { + fprintf(stderr, _("%s: final receive failed: %s"), + progname, PQerrorMessage(conn)); + disconnect_and_exit(1); + } + if (bgchild > 0) + { #ifndef WIN32 int status; int r; #else - DWORD status; - - /* - * get a pointer sized version of bgchild to avoid warnings about - * casting to a different size on WIN64. - */ - intptr_t bgchild_handle = bgchild; - uint32 hi, - lo; + DWORD status; + /* + * get a pointer sized version of bgchild to avoid warnings about + * casting to a different size on WIN64. 
+ */ + intptr_t bgchild_handle = bgchild; + uint32 hi, + lo; #endif - - if (verbose) - fprintf(stderr, - _("%s: waiting for background process to finish streaming ...\n"), progname); - + if (verbose) + fprintf(stderr, + _("%s: waiting for background process to finish streaming ...\n"), progname); #ifndef WIN32 - if (write(bgpipe[1], xlogend, strlen(xlogend)) != strlen(xlogend)) - { - fprintf(stderr, - _("%s: could not send command to background pipe: %s\n"), - progname, strerror(errno)); - disconnect_and_exit(1); - } - - /* Just wait for the background process to exit */ - r = waitpid(bgchild, &status, 0); - if (r == -1) - { - fprintf(stderr, _("%s: could not wait for child process: %s\n"), - progname, strerror(errno)); - disconnect_and_exit(1); - } - if (r != bgchild) - { - fprintf(stderr, _("%s: child %d died, expected %d\n"), - progname, r, (int) bgchild); - disconnect_and_exit(1); - } - if (!WIFEXITED(status)) - { - fprintf(stderr, _("%s: child process did not exit normally\n"), - progname); - disconnect_and_exit(1); - } - if (WEXITSTATUS(status) != 0) - { - fprintf(stderr, _("%s: child process exited with error %d\n"), - progname, WEXITSTATUS(status)); - disconnect_and_exit(1); - } - /* Exited normally, we're happy! */ -#else /* WIN32 */ - - /* - * On Windows, since we are in the same process, we can just store the - * value directly in the variable, and then set the flag that says - * it's there. - */ - if (sscanf(xlogend, "%X/%X", &hi, &lo) != 2) - { - fprintf(stderr, - _("%s: could not parse write-ahead log location \"%s\"\n"), - progname, xlogend); - disconnect_and_exit(1); - } - xlogendptr = ((uint64) hi) << 32 | lo; - InterlockedIncrement(&has_xlogendptr); - - /* First wait for the thread to exit */ - if (WaitForSingleObjectEx((HANDLE) bgchild_handle, INFINITE, FALSE) != - WAIT_OBJECT_0) - { - _dosmaperr(GetLastError()); - fprintf(stderr, _("%s: could not wait for child thread: %s\n"), - progname, strerror(errno)); - disconnect_and_exit(1); - } - if (GetExitCodeThread((HANDLE) bgchild_handle, &status) == 0) - { - _dosmaperr(GetLastError()); - fprintf(stderr, _("%s: could not get child thread exit status: %s\n"), - progname, strerror(errno)); - disconnect_and_exit(1); - } - if (status != 0) - { - fprintf(stderr, _("%s: child thread exited with error %u\n"), - progname, (unsigned int) status); - disconnect_and_exit(1); - } - /* Exited normally, we're happy */ + if (write(bgpipe[1], xlogend, strlen(xlogend)) != strlen(xlogend)) + { + fprintf(stderr, + _("%s: could not send command to background pipe: %s\n"), + progname, strerror(errno)); + disconnect_and_exit(1); + } + /* Just wait for the background process to exit */ + r = waitpid(bgchild, &status, 0); + if (r == -1) + { + fprintf(stderr, _("%s: could not wait for child process: %s\n"), + progname, strerror(errno)); + disconnect_and_exit(1); + } + if (r != bgchild) + { + fprintf(stderr, _("%s: child %d died, expected %d\n"), + progname, r, (int) bgchild); + disconnect_and_exit(1); + } + if (!WIFEXITED(status)) + { + fprintf(stderr, _("%s: child process did not exit normally\n"), + progname); + disconnect_and_exit(1); + } + if (WEXITSTATUS(status) != 0) + { + fprintf(stderr, _("%s: child process exited with error %d\n"), + progname, WEXITSTATUS(status)); + disconnect_and_exit(1); + } + /* Exited normally, we're happy! */ +#else /* WIN32 */ + /* + * On Windows, since we are in the same process, we can just store the + * value directly in the variable, and then set the flag that says + * it's there. 
+ */ + if (sscanf(xlogend, "%X/%X", &hi, &lo) != 2) + { + fprintf(stderr, + _("%s: could not parse write-ahead log location \"%s\"\n"), + progname, xlogend); + disconnect_and_exit(1); + } + xlogendptr = ((uint64) hi) << 32 | lo; + InterlockedIncrement(&has_xlogendptr); + /* First wait for the thread to exit */ + if (WaitForSingleObjectEx((HANDLE) bgchild_handle, INFINITE, FALSE) != + WAIT_OBJECT_0) + { + _dosmaperr(GetLastError()); + fprintf(stderr, _("%s: could not wait for child thread: %s\n"), + progname, strerror(errno)); + disconnect_and_exit(1); + } + if (GetExitCodeThread((HANDLE) bgchild_handle, &status) == 0) + { + _dosmaperr(GetLastError()); + fprintf(stderr, _("%s: could not get child thread exit status: %s\n"), + progname, strerror(errno)); + disconnect_and_exit(1); + } + if (status != 0) + { + fprintf(stderr, _("%s: child thread exited with error %u\n"), + progname, (unsigned int) status); + disconnect_and_exit(1); + } + /* Exited normally, we're happy */ #endif - } - - /* Free the recovery.conf contents */ - destroyPQExpBuffer(recoveryconfcontents); - - /* - * End of copy data. Final result is already checked inside the loop. - */ - PQclear(res); - PQfinish(conn); - - /* - * Make data persistent on disk once backup is completed. For tar format - * once syncing the parent directory is fine, each tar file created per - * tablespace has been already synced. In plain format, all the data of - * the base directory is synced, taking into account all the tablespaces. - * Errors are not considered fatal. - */ - if (do_sync) - { - if (format == 't') - { - if (strcmp(basedir, "-") != 0) - (void) fsync_fname(basedir, true, progname); - } - else - { - (void) fsync_pgdata(basedir, progname, serverVersion); - } - } - - if (verbose) - fprintf(stderr, _("%s: base backup completed\n"), progname); + } + /* Free the recovery.conf contents */ + destroyPQExpBuffer(recoveryconfcontents); + /* + * End of copy data. Final result is already checked inside the loop. + */ + PQclear(res); + PQfinish(conn); + /* + * Make data persistent on disk once backup is completed. For tar format + * once syncing the parent directory is fine, each tar file created per + * tablespace has been already synced. In plain format, all the data of + * the base directory is synced, taking into account all the tablespaces. + * Errors are not considered fatal. + */ + if (do_sync) + { + if (format == 't') + { + if (strcmp(basedir, "-") != 0) + (void) fsync_fname(basedir, true, progname); + } + else + { + (void) fsync_pgdata(basedir, progname, serverVersion); + } + } + if (verbose) + fprintf(stderr, _("%s: base backup completed\n"), progname); } - int main(int argc, char **argv) {// #lizard forgives From 44bb70bfd302ececd9f07c907f1c233c973b06a8 Mon Sep 17 00:00:00 2001 From: youngxie Date: Tue, 28 Jul 2020 12:40:20 +0800 Subject: [PATCH 016/578] fix coordinator gets error under readonly plane with coldhot seperation. 
--- src/backend/pgxc/nodemgr/groupmgr.c | 78 +++++++++++++++-------------- 1 file changed, 40 insertions(+), 38 deletions(-) diff --git a/src/backend/pgxc/nodemgr/groupmgr.c b/src/backend/pgxc/nodemgr/groupmgr.c index b1e4a339..be034bed 100644 --- a/src/backend/pgxc/nodemgr/groupmgr.c +++ b/src/backend/pgxc/nodemgr/groupmgr.c @@ -538,44 +538,46 @@ Oid RemoveNodeFromGroup(Oid nodeoid) Oid GetGroupOidByNode(Oid nodeoid) { - Relation relation; - SysScanDesc scan; - HeapTuple tup; - Form_pgxc_group group; - int i; - Oid groupoid = InvalidOid; - - relation = heap_open(PgxcGroupRelationId, AccessShareLock); - - scan = systable_beginscan(relation, InvalidOid, false, NULL, 0, NULL); - - tup = systable_getnext(scan); - - while(HeapTupleIsValid(tup)) - { - group = (Form_pgxc_group)GETSTRUCT(tup); - - for (i = 0; i < group->group_members.dim1; i++) - { - if (group->group_members.values[i] == nodeoid) - { - groupoid = HeapTupleGetOid(tup); - break; - } - } - - if (OidIsValid(groupoid)) - { - break; - } - - tup = systable_getnext(scan); - } - - systable_endscan(scan); - heap_close(relation, AccessShareLock); - - return groupoid; + Relation relation; + SysScanDesc scan; + HeapTuple tup; + Form_pgxc_group group; + int i; + Oid groupoid = InvalidOid; + + nodeoid = PGXCGetMainNodeOid(nodeoid); + + relation = heap_open(PgxcGroupRelationId, AccessShareLock); + + scan = systable_beginscan(relation, InvalidOid, false, NULL, 0, NULL); + + tup = systable_getnext(scan); + + while(HeapTupleIsValid(tup)) + { + group = (Form_pgxc_group)GETSTRUCT(tup); + + for (i = 0; i < group->group_members.dim1; i++) + { + if (group->group_members.values[i] == nodeoid) + { + groupoid = HeapTupleGetOid(tup); + break; + } + } + + if (OidIsValid(groupoid)) + { + break; + } + + tup = systable_getnext(scan); + } + + systable_endscan(scan); + heap_close(relation, AccessShareLock); + + return groupoid; } List * From 34ef3de8a9b94f5c883bbb1397365052ab10d311 Mon Sep 17 00:00:00 2001 From: ericxwu Date: Fri, 7 Aug 2020 12:42:39 +0800 Subject: [PATCH 017/578] Support complex UDPATE/DELETE when distribution key not matching Previously we just throw error if we failed the distribution check in group_planner. To support such cases, we need to improve the set_joinpath_distribution, make it aware of result relation location. Which means, if we know one side of the join path contains result relation, then we need to keep it not redistributed. 
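As a hypothetical illustration only (the table names and the DISTRIBUTE BY
syntax below are assumed for the sketch, not taken from this patch or its
regression tests), this is the class of statement the change is meant to
plan: the join clause does not touch the result relation's distribution
key, so the planner has to keep the result relation where it is and move
the other side, instead of redistributing both and failing the check:

    -- sketch: two tables sharded on different columns
    CREATE TABLE orders    (order_id int, customer_id int, status text)
        DISTRIBUTE BY SHARD (order_id);
    CREATE TABLE blacklist (customer_id int)
        DISTRIBUTE BY SHARD (customer_id);

    -- complex UPDATE: the join key (customer_id) is not orders'
    -- distribution key, so blacklist must be moved, not orders
    UPDATE orders o
       SET status = 'cancelled'
      FROM blacklist b
     WHERE o.customer_id = b.customer_id;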
TAPD: http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696859222691 http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696859323617 --- src/backend/nodes/outfuncs.c | 9 +- src/backend/optimizer/plan/planner.c | 59 +- src/backend/optimizer/util/Makefile | 3 +- src/backend/optimizer/util/distribution.c | 125 +++ src/backend/optimizer/util/pathnode.c | 856 +++++++++++------- src/backend/optimizer/util/relnode.c | 358 ++++---- src/include/nodes/relation.h | 23 +- src/include/optimizer/distribution.h | 28 + src/test/regress/expected/foreign_key_2.out | 23 +- src/test/regress/expected/join_3.out | 13 +- src/test/regress/expected/matview_1.out | 4 +- src/test/regress/expected/returning.out | 4 +- src/test/regress/expected/rowsecurity_1.out | 150 ++- src/test/regress/expected/rowtypes_1.out | 10 +- src/test/regress/expected/rules.out | 132 ++- src/test/regress/expected/subselect_1.out | 8 +- .../regress/expected/xl_limitations_1.out | 24 +- src/test/regress/output/misc.source | 4 - 18 files changed, 1151 insertions(+), 682 deletions(-) create mode 100644 src/backend/optimizer/util/distribution.c create mode 100644 src/include/optimizer/distribution.h diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c index dee8b834..fb063aa2 100644 --- a/src/backend/nodes/outfuncs.c +++ b/src/backend/nodes/outfuncs.c @@ -3604,10 +3604,11 @@ _outRelOptInfo(StringInfo str, const RelOptInfo *node) WRITE_BOOL_FIELD(has_eclass_joins); WRITE_BITMAPSET_FIELD(top_parent_relids); #ifdef __TBASE__ - WRITE_BOOL_FIELD(intervalparent); - WRITE_BOOL_FIELD(isdefault); - WRITE_BITMAPSET_FIELD(childs); - WRITE_INT_FIELD(estimate_partidx); + WRITE_BOOL_FIELD(intervalparent); + WRITE_BOOL_FIELD(isdefault); + WRITE_BITMAPSET_FIELD(childs); + WRITE_INT_FIELD(estimate_partidx); + WRITE_ENUM_FIELD(resultRelLoc, ResultRelLocation); #endif } diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index f41ef5a1..4495ef70 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -63,7 +63,9 @@ #include "utils/selfuncs.h" #include "utils/lsyscache.h" #include "utils/syscache.h" - +#ifdef __TBASE__ +#include "optimizer/distribution.h" +#endif /* GUC parameters */ double cursor_tuple_fraction = DEFAULT_CURSOR_TUPLE_FRACTION; @@ -189,10 +191,8 @@ static PathTarget *make_window_input_target(PlannerInfo *root, static List *make_pathkeys_for_window(PlannerInfo *root, WindowClause *wc, List *tlist); static PathTarget *make_sort_input_target(PlannerInfo *root, - PathTarget *final_target, - bool *have_postponed_srfs); -static bool equal_distributions(PlannerInfo *root, Distribution *dst1, - Distribution *dst2); + PathTarget *final_target, + bool *have_postponed_srfs); static bool grouping_distribution_match(PlannerInfo *root, Query *parse, Path *path, List *clauses); static bool groupingsets_distribution_match(PlannerInfo *root, Query *parse, @@ -7856,55 +7856,6 @@ groupingsets_distribution_match(PlannerInfo *root, Query *parse, Path *path) return false; } -/* - * equal_distributions - * Check that two distributions are equal. - * - * Distributions are considered equal if they are of the same type, on the - * same set of nodes, and if the distribution expressions are known to be equal - * (either the same expressions or members of the same equivalence class). 
- */ -static bool -equal_distributions(PlannerInfo *root, Distribution *dst1, - Distribution *dst2) -{// #lizard forgives - /* fast path */ - if (dst1 == dst2) - return true; - - if (dst1 == NULL || dst2 == NULL) - return false; - - /* conditions easier to check go first */ - if (dst1->distributionType != dst2->distributionType) - return false; - - if (!bms_equal(dst1->nodes, dst2->nodes)) - return false; - - if (equal(dst1->distributionExpr, dst2->distributionExpr)) - return true; - - /* - * For more thorough expression check we need to ensure they both are - * defined - */ - if (dst1->distributionExpr == NULL || dst2->distributionExpr == NULL) - return false; - - /* - * More thorough check, but allows some important cases, like if - * distribution column is not updated (implicit set distcol=distcol) or - * set distcol = CONST, ... WHERE distcol = CONST - pattern used by many - * applications. - */ - if (exprs_known_equal(root, dst1->distributionExpr, dst2->distributionExpr)) - return true; - - /* The restrictNodes field does not matter for distribution equality */ - return false; -} - /* * adjust_path_distribution * Adjust distribution of the path to match what's expected by ModifyTable. diff --git a/src/backend/optimizer/util/Makefile b/src/backend/optimizer/util/Makefile index 2455d933..e625e51b 100644 --- a/src/backend/optimizer/util/Makefile +++ b/src/backend/optimizer/util/Makefile @@ -13,6 +13,7 @@ top_builddir = ../../../.. include $(top_builddir)/src/Makefile.global OBJS = clauses.o joininfo.o orclauses.o pathnode.o placeholder.o \ - plancat.o predtest.o relnode.o restrictinfo.o tlist.o var.o pgxcship.o + plancat.o predtest.o relnode.o restrictinfo.o tlist.o var.o pgxcship.o \ + distribution.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/optimizer/util/distribution.c b/src/backend/optimizer/util/distribution.c new file mode 100644 index 00000000..3746b2c8 --- /dev/null +++ b/src/backend/optimizer/util/distribution.c @@ -0,0 +1,125 @@ +/*------------------------------------------------------------------------- + * + * distribution.c + * Routines related to adjust path distribution + * + * Copyright (c) 2020-Present TBase development team, Tencent + * + * + * IDENTIFICATION + * src/backend/optimizer/util/distribution.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "nodes/bitmapset.h" +#include "nodes/nodes.h" +#include "optimizer/distribution.h" +#include "optimizer/paths.h" + +/* + * equal_distributions + * Check that two distributions are equal. + * + * Distributions are considered equal if they are of the same type, on the + * same set of nodes, and if the distribution expressions are known to be equal + * (either the same expressions or members of the same equivalence class). 
+ */ +bool +equal_distributions(PlannerInfo *root, Distribution *dst1, + Distribution *dst2) +{ + /* fast path */ + if (dst1 == dst2) + return true; + + if (dst1 == NULL || dst2 == NULL) + return false; + + /* conditions easier to check go first */ + if (dst1->distributionType != dst2->distributionType) + return false; + + if (!bms_equal(dst1->nodes, dst2->nodes)) + return false; + + if (equal(dst1->distributionExpr, dst2->distributionExpr)) + return true; + + /* + * For more thorough expression check we need to ensure they both are + * defined + */ + if (dst1->distributionExpr == NULL || dst2->distributionExpr == NULL) + return false; + + /* + * More thorough check, but allows some important cases, like if + * distribution column is not updated (implicit set distcol=distcol) or + * set distcol = CONST, ... WHERE distcol = CONST - pattern used by many + * applications. + */ + if (exprs_known_equal(root, dst1->distributionExpr, dst2->distributionExpr)) + return true; + + /* The restrictNodes field does not matter for distribution equality */ + return false; +} + +/* + * Get the location of DML result relation if it appears in either subpath + */ +ResultRelLocation +getResultRelLocation(int resultRel, Relids inner, Relids outer) +{ + ResultRelLocation location = RESULT_REL_NONE; + + if (bms_is_member(resultRel, inner)) + { + location = RESULT_REL_INNER; + } + else if (bms_is_member(resultRel, outer)) + { + location = RESULT_REL_OUTER; + } + + return location; +} + +/* + * Check if the path distribution satisfy the result relation distribution. + */ +bool +SatisfyResultRelDist(PlannerInfo *root, Path *path) +{ + PlannerInfo *top_root = root; + bool equal = false; + + /* Get top root */ + while(top_root->parent_root) + { + top_root = top_root->parent_root; + } + + /* + * Check the UPDATE/DELETE command, make sure the path distribution equals the + * result relation distribution. + * We only invalidate the check if the result relation appears in one of + * the left/right subpath. + */ + if ((top_root->parse->commandType == CMD_UPDATE || + top_root->parse->commandType == CMD_DELETE) && + path->parent->resultRelLoc != RESULT_REL_NONE) + { + equal = equal_distributions(top_root, + top_root->distribution, + path->distribution); + + if (!equal) + return false; + } + + return true; +} diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index c1c50654..9bbf6040 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -37,13 +37,14 @@ #include "pgxc/nodemgr.h" #include "utils/rel.h" #ifdef __TBASE__ +#include "catalog/pgxc_key_values.h" +#include "executor/nodeAgg.h" +#include "optimizer/distribution.h" #include "optimizer/tlist.h" #include "optimizer/planner.h" +#include "optimizer/pgxcship.h" #include "pgxc/groupmgr.h" -#include "catalog/pgxc_key_values.h" #include "pgxc/pgxcnode.h" -#include "optimizer/pgxcship.h" -#include "executor/nodeAgg.h" #endif #ifdef _MIGRATE_ @@ -287,6 +288,32 @@ set_cheapest(RelOptInfo *parent_rel) Assert(IsA(parent_rel, RelOptInfo)); +#ifdef __TBASE__ + /* + * When set_joinpath_distribution() adjusted the strategy for complex + * UPDATE/DELETE, the original paths could be give up caused by no proper + * distribution found. Which lead to an early error pop up here, thus + * we need to provide more accurate error message here. (Before the + * complex delete enhancement, this will pop up in group_planner at + * final stage.) 
+ */ + if (parent_rel->pathlist == NIL && + parent_rel->resultRelLoc != RESULT_REL_NONE) + { +#ifdef _PG_REGRESS_ + ereport(ERROR, + (errcode(ERRCODE_STATEMENT_TOO_COMPLEX), + errmsg("could not plan this distributed UPDATE/DELETE"), + errdetail("correlated or complex UPDATE/DELETE is currently not supported in Postgres-XL."))); +#else + ereport(ERROR, + (errcode(ERRCODE_STATEMENT_TOO_COMPLEX), + errmsg("could not plan this distributed UPDATE/DELETE"), + errdetail("correlated or complex UPDATE/DELETE is currently not supported in TBase."))); +#endif + } +#endif + if (parent_rel->pathlist == NIL) elog(ERROR, "could not devise a query plan for the given query"); @@ -470,6 +497,15 @@ add_path(RelOptInfo *parent_rel, Path *new_path) */ CHECK_FOR_INTERRUPTS(); +#ifdef __TBASE__ + /* + * In case we skipped the join paths caused by invalid result rel + * distribution. + */ + if (!new_path) + return; +#endif + /* Pretend parameterized paths have no pathkeys, per comment above */ new_path_pathkeys = new_path->param_info ? NIL : new_path->pathkeys; @@ -1572,18 +1608,41 @@ set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) List *innerpathkeys = pathnode->innerjoinpath->pathkeys; List *outerpathkeys = pathnode->outerjoinpath->pathkeys; #ifdef __TBASE__ - bool dml = false; - PlannerInfo *top_root = root; + bool dml = false; + bool keepResultRelLoc = false; + PlannerInfo *top_root = root; + ResultRelLocation resultRelLoc = RESULT_REL_NONE; - while(top_root->parent_root) - { - top_root = top_root->parent_root; - } + while(top_root->parent_root) + { + top_root = top_root->parent_root; + } - if (top_root->parse->commandType == CMD_UPDATE || - top_root->parse->commandType == CMD_DELETE) - dml = true; - + if (top_root->parse->commandType == CMD_UPDATE || + top_root->parse->commandType == CMD_DELETE) + { + dml = true; + } + + /* + * Only top root will consider more restrict rules to make sure + * UPDATE/DELETE result relation does not redistributed. + */ + if (top_root->parse->commandType == CMD_UPDATE || + top_root->parse->commandType == CMD_DELETE) + { + /* Set the result relation location */ + resultRelLoc = getResultRelLocation(top_root->parse->resultRelation, + pathnode->innerjoinpath->parent->relids, + pathnode->outerjoinpath->parent->relids); + + pathnode->path.parent->resultRelLoc = resultRelLoc; + + if (resultRelLoc != RESULT_REL_NONE) + { + keepResultRelLoc = true; + } + } #endif @@ -1604,9 +1663,17 @@ set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) return NIL; #ifdef __TBASE__ /* - * dml may need to push down to datanodes, such as, - * 'delete from geocode_settings as gc using geocode_settings_default as gf where gf.name = gc.name and gf.setting = gc.setting;' - * prefer_olap means pulling query up to coordinator node, in case data re-distribute in TPC-C test case. + * DML may need to push down to datanodes, for example: + * DELETE FROM + * geocode_settings as gc + * USING geocode_settings_default AS gf + * WHERE + * gf.name = gc.name and gf.setting = gc.setting; + * prefer_olap means pulling query up to coordinator node, in case data + * re-distribute in TPC-C test case. + * + * TODO: We need to automatically determine whether we need to pull it up, + * but not using GUC. 
*/ if(!prefer_olap && false == dml) { @@ -1712,22 +1779,18 @@ set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) restrictClauses = list_concat(restrictClauses, pathnode->movedrestrictinfo); - /* - * This join is still allowed if inner and outer paths have - * equivalent distribution and joined along the distribution keys. - */ - if (innerd && outerd && - innerd->distributionType == outerd->distributionType && - innerd->distributionExpr && - outerd->distributionExpr && - bms_equal(innerd->nodes, outerd->nodes)) - { - ListCell *lc; - - /* - * Make sure distribution functions are the same, for now they depend - * on data type - */ + /* + * This join is still allowed if inner and outer paths have equivalent + * distribution and joined along the distribution keys. Make sure + * distribution functions are the same, for now they depend on data type. + */ + if (innerd && outerd && + innerd->distributionType == outerd->distributionType && + innerd->distributionExpr && + outerd->distributionExpr && + bms_equal(innerd->nodes, outerd->nodes)) + { + ListCell *lc; /* * Planner already did necessary work and if there is a join @@ -1739,9 +1802,10 @@ set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) */ foreach(lc, restrictClauses) { - RestrictInfo *ri = (RestrictInfo *) lfirst(lc); - ListCell *emc; - bool found_outer, found_inner; + RestrictInfo *ri = (RestrictInfo *) lfirst(lc); + ListCell *emc = NULL; + bool found_outer = false; + bool found_inner = false; /* * Restriction operator is not equality operator ? @@ -1762,9 +1826,6 @@ set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) if (!OidIsValid(ri->hashjoinoperator)) continue; - found_outer = false; - found_inner = false; - /* * If parts belong to the same equivalence member check * if both distribution keys are members of the class. @@ -1773,8 +1834,9 @@ set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) { foreach(emc, ri->left_ec->ec_members) { - EquivalenceMember *em = (EquivalenceMember *) lfirst(emc); - Expr *var = (Expr *)em->em_expr; + EquivalenceMember *em = (EquivalenceMember *) lfirst(emc); + Expr *var = (Expr *)em->em_expr; + if (IsA(var, RelabelType)) var = ((RelabelType *) var)->arg; if (!found_outer) @@ -1813,12 +1875,41 @@ set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) if (equal(var, emvar)) { targetd->distributionExpr = (Node *) var; +#ifdef __TBASE__ + /* + * For UPDATE/DELETE, make sure we are distributing by + * the result relation. + */ + if (keepResultRelLoc && + !equal_distributions(top_root, + top_root->distribution, + targetd)) + { + continue; + } +#endif return alternate; } } } /* Not found, take any */ targetd->distributionExpr = innerd->distributionExpr; + +#ifdef __TBASE__ + /* + * For UPDATE/DELETE, make sure we are distributing by + * the result relation. + */ + if (keepResultRelLoc && + !equal_distributions(top_root, + top_root->distribution, + targetd)) + { + pfree(targetd); + targetd = NULL; + continue; + } +#endif return alternate; } } @@ -1874,47 +1965,80 @@ set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) targetd->restrictNodes = bms_union(outerd->restrictNodes, innerd->restrictNodes); } #endif - /* - * In case of outer join distribution key should not refer - * distribution key of nullable part. 
- */ - if (pathnode->jointype == JOIN_FULL) - /* both parts are nullable */ - targetd->distributionExpr = NULL; - else if (pathnode->jointype == JOIN_RIGHT) - targetd->distributionExpr = innerd->distributionExpr; - else - targetd->distributionExpr = outerd->distributionExpr; + /* + * In case of outer join distribution key should not refer + * distribution key of nullable part. + */ + if (pathnode->jointype == JOIN_FULL) + /* both parts are nullable */ + targetd->distributionExpr = NULL; + else if (pathnode->jointype == JOIN_RIGHT) + targetd->distributionExpr = innerd->distributionExpr; + else + targetd->distributionExpr = outerd->distributionExpr; - return alternate; - } - } - } +#ifdef __TBASE__ + /* + * For UPDATE/DELETE, make sure we are distributing by + * the result relation. + */ + if (keepResultRelLoc && + !equal_distributions(top_root, + top_root->distribution, + targetd)) + { + pfree(targetd); + targetd = NULL; + continue; + } +#endif + return alternate; + } + } + } #ifndef _PG_REGRESS_ - if (bms_equal(innerd->restrictNodes, outerd->restrictNodes) && - bms_num_members(innerd->restrictNodes) == 1 && restrict_query && - pathnode->jointype != JOIN_FULL) - { - targetd = makeNode(Distribution); - targetd->distributionType = innerd->distributionType; - targetd->nodes = bms_copy(innerd->nodes); - targetd->restrictNodes = bms_copy(innerd->restrictNodes); - pathnode->path.distribution = targetd; - - /* - * In case of outer join distribution key should not refer - * distribution key of nullable part. - */ - if (pathnode->jointype == JOIN_FULL) - /* both parts are nullable */ - targetd->distributionExpr = NULL; - else if (pathnode->jointype == JOIN_RIGHT) - targetd->distributionExpr = innerd->distributionExpr; - else - targetd->distributionExpr = outerd->distributionExpr; + if (bms_equal(innerd->restrictNodes, outerd->restrictNodes) && + bms_num_members(innerd->restrictNodes) == 1 && restrict_query && + pathnode->jointype != JOIN_FULL) + { + targetd = makeNode(Distribution); + targetd->distributionType = innerd->distributionType; + targetd->nodes = bms_copy(innerd->nodes); + targetd->restrictNodes = bms_copy(innerd->restrictNodes); + pathnode->path.distribution = targetd; + + /* + * In case of outer join distribution key should not refer + * distribution key of nullable part. + */ + if (pathnode->jointype == JOIN_FULL) + /* both parts are nullable */ + targetd->distributionExpr = NULL; + else if (pathnode->jointype == JOIN_RIGHT) + targetd->distributionExpr = innerd->distributionExpr; + else + targetd->distributionExpr = outerd->distributionExpr; - return alternate; - } +#ifdef __TBASE__ + /* + * For UPDATE/DELETE, make sure we are distributing by + * the result relation. 
+ */ + if (!keepResultRelLoc || equal_distributions(top_root, + top_root->distribution, + targetd)) + { + return alternate; + } + else + { + pfree(targetd); + targetd = NULL; + } +#else + return alternate; +#endif + } #endif } @@ -2002,7 +2126,11 @@ set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) char distType = LOCATOR_TYPE_NONE; ListCell *lc; #ifdef __TBASE__ - Oid group; + Oid group; + int nRemotePlans_outer = 0; + int nRemotePlans_inner = 0; + bool redistribute_outer = false; + bool redistribute_inner = false; #endif /* @@ -2070,108 +2198,121 @@ set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) } } #endif - /* - * Evaluation cost will be needed to choose preferred - * distribution - */ - cost_qual_eval_node(&cost, (Node *) ri, root); + /* + * Evaluation cost will be needed to choose preferred + * distribution + */ + cost_qual_eval_node(&cost, (Node *) ri, root); - if (outerd->distributionExpr) - { - /* - * If left side is distribution key of outer subquery - * and right expression refers only inner subquery - */ - if (equal(outerd->distributionExpr, left_expr) && - bms_is_subset(ri->right_relids, inner_rels)) - { - if (!preferred || /* no preferred restriction yet found */ - (new_inner_key && new_outer_key) || /* preferred restriction require redistribution of both parts */ - (cost.per_tuple < preferred->eval_cost.per_tuple)) /* current restriction is cheaper */ - { - /* set new preferred restriction */ - preferred = ri; - new_inner_key = right; - new_outer_key = NULL; /* no need to change */ - distType = outerd->distributionType; - } - continue; - } - /* - * If right side is distribution key of outer subquery - * and left expression refers only inner subquery - */ - if (equal(outerd->distributionExpr, right_expr) && - bms_is_subset(ri->left_relids, inner_rels)) - { - if (!preferred || /* no preferred restriction yet found */ - (new_inner_key && new_outer_key) || /* preferred restriction require redistribution of both parts */ - (cost.per_tuple < preferred->eval_cost.per_tuple)) /* current restriction is cheaper */ - { - /* set new preferred restriction */ - preferred = ri; - new_inner_key = left; - new_outer_key = NULL; /* no need to change */ - distType = outerd->distributionType; - } - continue; - } - } - if (innerd->distributionExpr) - { - /* - * If left side is distribution key of inner subquery - * and right expression refers only outer subquery - */ - if (equal(innerd->distributionExpr, left_expr) && - bms_is_subset(ri->right_relids, outer_rels)) - { - if (!preferred || /* no preferred restriction yet found */ - (new_inner_key && new_outer_key) || /* preferred restriction require redistribution of both parts */ - (cost.per_tuple < preferred->eval_cost.per_tuple)) /* current restriction is cheaper */ - { - /* set new preferred restriction */ - preferred = ri; - new_inner_key = NULL; /* no need to change */ - new_outer_key = right; - distType = innerd->distributionType; - } - continue; - } - /* - * If right side is distribution key of inner subquery - * and left expression refers only outer subquery - */ - if (equal(innerd->distributionExpr, right_expr) && - bms_is_subset(ri->left_relids, outer_rels)) - { - if (!preferred || /* no preferred restriction yet found */ - (new_inner_key && new_outer_key) || /* preferred restriction require redistribution of both parts */ - (cost.per_tuple < preferred->eval_cost.per_tuple)) /* current restriction is cheaper */ - { - /* set new preferred restriction */ - preferred = ri; - new_inner_key = NULL; /* no need 
to change */ - new_outer_key = left; - distType = innerd->distributionType; - } - continue; - } - } - /* - * Current restriction recuire redistribution of both parts. - * If preferred restriction require redistribution of one, - * keep it. - */ - if (preferred && - (new_inner_key == NULL || new_outer_key == NULL)) - continue; - - /* - * Skip this condition if the data type of the expressions - * does not allow either HASH or MODULO distribution. - * HASH distribution is preferrable. - */ + if (outerd->distributionExpr) + { +#ifdef __TBASE__ + /* + * For UPDATE/DELETE, make sure outer rel does not need + * to distribute + */ + if (keepResultRelLoc && resultRelLoc == RESULT_REL_INNER) + continue; +#endif + /* + * If left side is distribution key of outer subquery + * and right expression refers only inner subquery + */ + if (equal(outerd->distributionExpr, left_expr) && + bms_is_subset(ri->right_relids, inner_rels)) + { + if (!preferred || /* no preferred restriction yet found */ + (new_inner_key && new_outer_key) || /* preferred restriction require redistribution of both parts */ + (cost.per_tuple < preferred->eval_cost.per_tuple)) /* current restriction is cheaper */ + { + /* set new preferred restriction */ + preferred = ri; + new_inner_key = right; + new_outer_key = NULL; /* no need to change */ + distType = outerd->distributionType; + } + continue; + } + /* + * If right side is distribution key of outer subquery + * and left expression refers only inner subquery + */ + if (equal(outerd->distributionExpr, right_expr) && + bms_is_subset(ri->left_relids, inner_rels)) + { + if (!preferred || /* no preferred restriction yet found */ + (new_inner_key && new_outer_key) || /* preferred restriction require redistribution of both parts */ + (cost.per_tuple < preferred->eval_cost.per_tuple)) /* current restriction is cheaper */ + { + /* set new preferred restriction */ + preferred = ri; + new_inner_key = left; + new_outer_key = NULL; /* no need to change */ + distType = outerd->distributionType; + } + continue; + } + } + if (innerd->distributionExpr) + { +#ifdef __TBASE__ + /* For UPDATE/DELETE, make sure inner rel does not need to distribute */ + if (keepResultRelLoc && resultRelLoc == RESULT_REL_OUTER) + continue; +#endif + /* + * If left side is distribution key of inner subquery + * and right expression refers only outer subquery + */ + if (equal(innerd->distributionExpr, left_expr) && + bms_is_subset(ri->right_relids, outer_rels)) + { + if (!preferred || /* no preferred restriction yet found */ + (new_inner_key && new_outer_key) || /* preferred restriction require redistribution of both parts */ + (cost.per_tuple < preferred->eval_cost.per_tuple)) /* current restriction is cheaper */ + { + /* set new preferred restriction */ + preferred = ri; + new_inner_key = NULL; /* no need to change */ + new_outer_key = right; + distType = innerd->distributionType; + } + continue; + } + /* + * If right side is distribution key of inner subquery + * and left expression refers only outer subquery + */ + if (equal(innerd->distributionExpr, right_expr) && + bms_is_subset(ri->left_relids, outer_rels)) + { + if (!preferred || /* no preferred restriction yet found */ + (new_inner_key && new_outer_key) || /* preferred restriction require redistribution of both parts */ + (cost.per_tuple < preferred->eval_cost.per_tuple)) /* current restriction is cheaper */ + { + /* set new preferred restriction */ + preferred = ri; + new_inner_key = NULL; /* no need to change */ + new_outer_key = left; + distType = 
innerd->distributionType; + } + continue; + } + } + /* + * Current restriction recuire redistribution of both parts. + * If preferred restriction require redistribution of one, + * keep it. + */ + if (preferred && + (new_inner_key == NULL || new_outer_key == NULL)) + continue; + + /* + * Skip this condition if the data type of the expressions + * does not allow either HASH or MODULO distribution. + * HASH distribution is preferrable. + */ #ifdef __TBASE__ if (groupOids) { @@ -2196,71 +2337,76 @@ set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) else continue; #ifdef __TBASE__ - } + } + + /* + * Skip redistribute both side, which will redistribute the + * result relation + */ + if (keepResultRelLoc) + continue; #endif - /* - * If this restriction the first or easier to calculate - * then preferred, try to store it as new preferred - * restriction to redistribute along it. - */ - if (preferred == NULL || - (cost.per_tuple < preferred->eval_cost.per_tuple)) - { - /* - * Left expression depends only on outer subpath and - * right expression depends only on inner subpath, so - * we can redistribute both and make left expression the - * distribution key of outer subplan and right - * expression the distribution key of inner subplan - */ - if (bms_is_subset(ri->left_relids, outer_rels) && - bms_is_subset(ri->right_relids, inner_rels)) - { - preferred = ri; - new_outer_key = left; - new_inner_key = right; - } - /* - * Left expression depends only on inner subpath and - * right expression depends only on outer subpath, so - * we can redistribute both and make left expression the - * distribution key of inner subplan and right - * expression the distribution key of outer subplan - */ - if (bms_is_subset(ri->left_relids, inner_rels) && - bms_is_subset(ri->right_relids, outer_rels)) - { - preferred = ri; - new_inner_key = left; - new_outer_key = right; - } - } - } - } - } - /* If we have suitable restriction we can repartition accordingly */ - if (preferred) - { - Bitmapset *nodes = NULL; - Bitmapset *restrictNodes = NULL; + /* + * If this restriction the first or easier to calculate + * then preferred, try to store it as new preferred + * restriction to redistribute along it. 
+ */ + if (preferred == NULL || + (cost.per_tuple < preferred->eval_cost.per_tuple)) + { + /* + * Left expression depends only on outer subpath and + * right expression depends only on inner subpath, so + * we can redistribute both and make left expression the + * distribution key of outer subplan and right + * expression the distribution key of inner subplan + */ + if (bms_is_subset(ri->left_relids, outer_rels) && + bms_is_subset(ri->right_relids, inner_rels)) + { + preferred = ri; + new_outer_key = left; + new_inner_key = right; + } + /* + * Left expression depends only on inner subpath and + * right expression depends only on outer subpath, so + * we can redistribute both and make left expression the + * distribution key of inner subplan and right + * expression the distribution key of outer subplan + */ + if (bms_is_subset(ri->left_relids, inner_rels) && + bms_is_subset(ri->right_relids, outer_rels)) + { + preferred = ri; + new_inner_key = left; + new_outer_key = right; + } + } + } + } + } + #ifdef __TBASE__ - /* consider the outer/inner size when make the redistribute plan */ - bool replicate_inner = false; - bool replicate_outer = false; - RelOptInfo *outer_rel = pathnode->outerjoinpath->parent; - RelOptInfo *inner_rel = pathnode->innerjoinpath->parent; - double outer_size = outer_rel->rows * outer_rel->reltarget->width; - double inner_size = inner_rel->rows * inner_rel->reltarget->width; - int outer_nodes = bms_num_members(outerd->nodes); - int inner_nodes = bms_num_members(innerd->nodes); - - int nRemotePlans_outer = 0; - int nRemotePlans_inner = 0; - bool redistribute_outer = false; - bool redistribute_inner = false; - - contains_remotesubplan(pathnode->outerjoinpath, &nRemotePlans_outer, &redistribute_outer); - contains_remotesubplan(pathnode->innerjoinpath, &nRemotePlans_inner, &redistribute_inner); + contains_remotesubplan(pathnode->outerjoinpath, &nRemotePlans_outer, &redistribute_outer); + contains_remotesubplan(pathnode->innerjoinpath, &nRemotePlans_inner, &redistribute_inner); +#endif + + /* If we have suitable restriction we can repartition accordingly */ + if (preferred) + { + Bitmapset *nodes = NULL; + Bitmapset *restrictNodes = NULL; +#ifdef __TBASE__ + /* consider the outer/inner size when make the redistribute plan */ + bool replicate_inner = false; + bool replicate_outer = false; + RelOptInfo *outer_rel = pathnode->outerjoinpath->parent; + RelOptInfo *inner_rel = pathnode->innerjoinpath->parent; + double outer_size = outer_rel->rows * outer_rel->reltarget->width; + double inner_size = inner_rel->rows * inner_rel->reltarget->width; + int outer_nodes = bms_num_members(outerd->nodes); + int inner_nodes = bms_num_members(innerd->nodes); #endif /* If we redistribute both parts do join on all nodes ... 
*/ @@ -2271,28 +2417,33 @@ set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) nodes = bms_add_member(nodes, i); #ifdef __TBASE__ - /* check if we can distribute by shard */ - if (OidIsValid(group)) - { - int node_index; - int32 dn_num; - int32 *datanodes; - //List *nodelist = GetGroupNodeList(group); - - GetShardNodes(group, &datanodes, &dn_num, NULL); + /* check if we can distribute by shard */ + if (OidIsValid(group)) + { + int node_index; + int32 dn_num; + int32 *datanodes; + + GetShardNodes(group, &datanodes, &dn_num, NULL); + + bms_free(nodes); + nodes = NULL; + + for(node_index = 0; node_index < dn_num; node_index++) + { + nodes = bms_add_member(nodes, datanodes[node_index]); + } + } - bms_free(nodes); - nodes = NULL; - - for(node_index = 0; node_index < dn_num; node_index++) - { - nodes = bms_add_member(nodes, datanodes[node_index]); - } - } + /* + * We should not get both new_inner_key & new_outer_key for + * UPDATE/DELETE + */ + Assert(!keepResultRelLoc); - /* - * if any side is smaller enough, replicate the smaller one - * instead of redistribute both of them. + /* + * if any side is smaller enough, replicate the smaller one + * instead of redistribute both of them. */ if(inner_size * outer_nodes < inner_size + outer_size && (pathnode->jointype != JOIN_RIGHT && pathnode->jointype != JOIN_FULL) && @@ -2325,29 +2476,31 @@ set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) else if (new_inner_key) { #ifdef __TBASE__ - /* - * if inner is smaller than outer, redistribute inner - * if inner is bigger than outer (inner > inner->nodes * outer), - * replicate outer; else redistribute inner - */ - if(inner_size > outer_size * inner_nodes && - (pathnode->jointype != JOIN_LEFT && pathnode->jointype != JOIN_FULL && - pathnode->jointype != JOIN_SEMI && pathnode->jointype != JOIN_ANTI) && - innerd->distributionType != LOCATOR_TYPE_REPLICATED && !redistribute_outer && - get_num_connections(inner_nodes, nRemotePlans_outer + 1) < MaxConnections * REPLICATION_FACTOR && + /* + * If inner is smaller than outer, redistribute inner as the + * preferred key we picked. + * If inner is bigger than outer (inner > inner->nodes * outer), + * replicate outer as an optimization to save network costs. 
+ */ + if(inner_size > outer_size * inner_nodes && + (pathnode->jointype != JOIN_LEFT && pathnode->jointype != JOIN_FULL && + pathnode->jointype != JOIN_SEMI && pathnode->jointype != JOIN_ANTI) && + innerd->distributionType != LOCATOR_TYPE_REPLICATED && !redistribute_outer && + get_num_connections(inner_nodes, nRemotePlans_outer + 1) < MaxConnections * REPLICATION_FACTOR && !dml && nRemotePlans_outer < replication_level && !pathnode->inner_unique) - { - replicate_outer = true; + { + replicate_outer = true; - /* replicate outer to all inner nodes */ - nodes = bms_copy(innerd->nodes); - restrictNodes = bms_copy(innerd->restrictNodes); - } - else - { + /* replicate outer to all inner nodes */ + nodes = bms_copy(innerd->nodes); + restrictNodes = bms_copy(innerd->restrictNodes); + } + else + { + Assert(!keepResultRelLoc || resultRelLoc != RESULT_REL_INNER); #endif - nodes = bms_copy(outerd->nodes); - restrictNodes = bms_copy(outerd->restrictNodes); + nodes = bms_copy(outerd->nodes); + restrictNodes = bms_copy(outerd->restrictNodes); #ifdef __TBASE__ } #endif @@ -2355,28 +2508,30 @@ set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) else /*if (new_outer_key)*/ { #ifdef __TBASE__ - /* - * if outer is smaller than inner, redistribute outer - * if outer is bigger than inner (outer > outer->nodes * inner), - * replicate inner; else redistribute outer - */ - if(outer_size > inner_size * outer_nodes && - (pathnode->jointype != JOIN_RIGHT && pathnode->jointype != JOIN_FULL) && - outerd->distributionType != LOCATOR_TYPE_REPLICATED && !redistribute_inner && - get_num_connections(outer_nodes, nRemotePlans_inner + 1) < MaxConnections * REPLICATION_FACTOR && + /* + * If outer is smaller than inner, redistribute outer as the + * preferred key we picked. + * If outer is bigger than inner (outer > outer->nodes * inner), + * replicate inner as an optimization to save network costs. + */ + if (outer_size > inner_size * outer_nodes && + (pathnode->jointype != JOIN_RIGHT && pathnode->jointype != JOIN_FULL) && + outerd->distributionType != LOCATOR_TYPE_REPLICATED && !redistribute_inner && + get_num_connections(outer_nodes, nRemotePlans_inner + 1) < MaxConnections * REPLICATION_FACTOR && !dml && nRemotePlans_inner < replication_level && !pathnode->inner_unique) - { - replicate_inner = true; + { + replicate_inner = true; - /* replicate inner to all outer nodes */ - nodes = bms_copy(outerd->nodes); - restrictNodes = bms_copy(outerd->restrictNodes); - } - else - { + /* replicate inner to all outer nodes */ + nodes = bms_copy(outerd->nodes); + restrictNodes = bms_copy(outerd->restrictNodes); + } + else + { + Assert(!keepResultRelLoc || resultRelLoc != RESULT_REL_OUTER); #endif - nodes = bms_copy(innerd->nodes); - restrictNodes = bms_copy(innerd->restrictNodes); + nodes = bms_copy(innerd->nodes); + restrictNodes = bms_copy(innerd->restrictNodes); #ifdef __TBASE__ } #endif @@ -2510,9 +2665,82 @@ set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) targetd->distributionExpr = pathnode->outerjoinpath->distribution->distributionExpr; - return alternate; - } - } + return alternate; + } + +#ifdef __TBASE__ + if (keepResultRelLoc) + { + /* + * We didn't got the preferred redistribution plan for UPDATE/DELETE. + * Thus, to keeping result relation not redistributed, we replicate + * the other subpath. 
+ */ + if (resultRelLoc == RESULT_REL_INNER && + pathnode->jointype != JOIN_LEFT && pathnode->jointype != JOIN_FULL && + pathnode->jointype != JOIN_SEMI && pathnode->jointype != JOIN_ANTI && + nRemotePlans_outer < replication_level && !pathnode->inner_unique) + { + /* Replicate outer */ + pathnode->outerjoinpath = redistribute_path( + root, + pathnode->outerjoinpath, + outerpathkeys, + LOCATOR_TYPE_NONE, + NULL, + innerd->nodes, + NULL); + pathnode->path.distribution = innerd; + + if (IsA(pathnode, MergePath)) + ((MergePath*)pathnode)->outersortkeys = NIL; + } + else if (resultRelLoc == RESULT_REL_OUTER && + pathnode->jointype != JOIN_RIGHT && pathnode->jointype != JOIN_FULL && + nRemotePlans_outer < replication_level && !pathnode->inner_unique) + { + /* Replicate inner */ + pathnode->innerjoinpath = redistribute_path( + root, + pathnode->innerjoinpath, + innerpathkeys, + LOCATOR_TYPE_NONE, + NULL, + outerd->nodes, + NULL); + pathnode->path.distribution = outerd; + + if (IsA(pathnode, MergePath)) + ((MergePath*)pathnode)->innersortkeys = NIL; + } + + return alternate; + } + } + + /* + * For DELETE/UPDATE, If the other side already been replicated, we directly + * inherit the resultRelLoc side distribution. + */ + if (keepResultRelLoc) + { + if (innerd &&resultRelLoc == RESULT_REL_INNER && + pathnode->jointype != JOIN_LEFT && pathnode->jointype != JOIN_FULL && + pathnode->jointype != JOIN_SEMI && pathnode->jointype != JOIN_ANTI && + !pathnode->inner_unique) + { + pathnode->path.distribution = innerd; + return alternate; + } + else if (outerd && resultRelLoc == RESULT_REL_OUTER && + pathnode->jointype != JOIN_RIGHT && pathnode->jointype != JOIN_FULL && + !pathnode->inner_unique) + { + pathnode->path.distribution = outerd; + return alternate; + } +#endif + } /* * Build cartesian product, if no hasheable restrictions is found. 
@@ -5149,7 +5377,11 @@ create_nestloop_path(PlannerInfo *root, } #endif - return pathnode; + /* For DELETE, check if the path distribution satisfy resultRel distribution */ + if (!SatisfyResultRelDist(root, &pathnode->path)) + return NULL; + + return pathnode; } /* @@ -5266,7 +5498,11 @@ create_mergejoin_path(PlannerInfo *root, } #endif - return pathnode; + /* For DELETE, check if the path distribution satisfy resultRel distribution */ + if (!SatisfyResultRelDist(root, &pathnode->jpath.path)) + return NULL; + + return pathnode; } /* @@ -5394,7 +5630,11 @@ create_hashjoin_path(PlannerInfo *root, } #endif - return pathnode; + /* For DELETE, check if the path distribution satisfy resultRel distribution */ + if (!SatisfyResultRelDist(root, &pathnode->jpath.path)) + return NULL; + + return pathnode; } /* diff --git a/src/backend/optimizer/util/relnode.c b/src/backend/optimizer/util/relnode.c index 5ca3723f..153d3d36 100644 --- a/src/backend/optimizer/util/relnode.c +++ b/src/backend/optimizer/util/relnode.c @@ -32,9 +32,10 @@ #endif #ifdef __TBASE__ #include "access/heapam.h" -#include "utils/rel.h" -#include "utils/lsyscache.h" #include "access/sysattr.h" +#include "optimizer/distribution.h" +#include "utils/lsyscache.h" +#include "utils/rel.h" #endif typedef struct JoinHashEntry { @@ -155,10 +156,11 @@ build_simple_rel(PlannerInfo *root, int relid, RelOptInfo *parent) rel->joininfo = NIL; rel->has_eclass_joins = false; #ifdef __TBASE__ - rel->intervalparent = false; - rel->isdefault = rte->isdefault; - rel->estimate_partidx = -1; - rel->childs = NULL; + rel->intervalparent = false; + rel->isdefault = rte->isdefault; + rel->estimate_partidx = -1; + rel->childs = NULL; + rel->resultRelLoc = RESULT_REL_NONE; #endif /* @@ -497,177 +499,183 @@ add_join_rel(PlannerInfo *root, RelOptInfo *joinrel) */ RelOptInfo * build_join_rel(PlannerInfo *root, - Relids joinrelids, - RelOptInfo *outer_rel, - RelOptInfo *inner_rel, - SpecialJoinInfo *sjinfo, - List **restrictlist_ptr) -{// #lizard forgives - RelOptInfo *joinrel; - List *restrictlist; - - /* - * See if we already have a joinrel for this set of base rels. - */ - joinrel = find_join_rel(root, joinrelids); - - if (joinrel) - { - /* - * Yes, so we only need to figure the restrictlist for this particular - * pair of component relations. - */ - if (restrictlist_ptr) - *restrictlist_ptr = build_joinrel_restrictlist(root, - joinrel, - outer_rel, - inner_rel); - return joinrel; - } - - /* - * Nope, so make one. 
- */ - joinrel = makeNode(RelOptInfo); - joinrel->reloptkind = RELOPT_JOINREL; - joinrel->relids = bms_copy(joinrelids); - joinrel->rows = 0; - /* cheap startup cost is interesting iff not all tuples to be retrieved */ - joinrel->consider_startup = (root->tuple_fraction > 0); - joinrel->consider_param_startup = false; - joinrel->consider_parallel = false; - joinrel->reltarget = create_empty_pathtarget(); - joinrel->pathlist = NIL; - joinrel->ppilist = NIL; - joinrel->partial_pathlist = NIL; - joinrel->cheapest_startup_path = NULL; - joinrel->cheapest_total_path = NULL; - joinrel->cheapest_unique_path = NULL; - joinrel->cheapest_parameterized_paths = NIL; - /* init direct_lateral_relids from children; we'll finish it up below */ - joinrel->direct_lateral_relids = - bms_union(outer_rel->direct_lateral_relids, - inner_rel->direct_lateral_relids); - joinrel->lateral_relids = min_join_parameterization(root, joinrel->relids, - outer_rel, inner_rel); - joinrel->relid = 0; /* indicates not a baserel */ - joinrel->rtekind = RTE_JOIN; - joinrel->min_attr = 0; - joinrel->max_attr = 0; - joinrel->attr_needed = NULL; - joinrel->attr_widths = NULL; - joinrel->lateral_vars = NIL; - joinrel->lateral_referencers = NULL; - joinrel->indexlist = NIL; - joinrel->statlist = NIL; - joinrel->pages = 0; - joinrel->tuples = 0; - joinrel->allvisfrac = 0; - joinrel->subroot = NULL; - joinrel->subplan_params = NIL; - joinrel->rel_parallel_workers = -1; - joinrel->serverid = InvalidOid; - joinrel->userid = InvalidOid; - joinrel->useridiscurrent = false; - joinrel->fdwroutine = NULL; - joinrel->fdw_private = NULL; - joinrel->unique_for_rels = NIL; - joinrel->non_unique_for_rels = NIL; - joinrel->baserestrictinfo = NIL; - joinrel->baserestrictcost.startup = 0; - joinrel->baserestrictcost.per_tuple = 0; - joinrel->baserestrict_min_security = UINT_MAX; - joinrel->joininfo = NIL; - joinrel->has_eclass_joins = false; - joinrel->top_parent_relids = NULL; - - /* Compute information relevant to the foreign relations. */ - set_foreign_rel_properties(joinrel, outer_rel, inner_rel); - - /* - * Create a new tlist containing just the vars that need to be output from - * this join (ie, are needed for higher joinclauses or final output). - * - * NOTE: the tlist order for a join rel will depend on which pair of outer - * and inner rels we first try to build it from. But the contents should - * be the same regardless. - */ - build_joinrel_tlist(root, joinrel, outer_rel); - build_joinrel_tlist(root, joinrel, inner_rel); - add_placeholders_to_joinrel(root, joinrel, outer_rel, inner_rel); - - /* - * add_placeholders_to_joinrel also took care of adding the ph_lateral - * sets of any PlaceHolderVars computed here to direct_lateral_relids, so - * now we can finish computing that. This is much like the computation of - * the transitively-closed lateral_relids in min_join_parameterization, - * except that here we *do* have to consider the added PHVs. - */ - joinrel->direct_lateral_relids = - bms_del_members(joinrel->direct_lateral_relids, joinrel->relids); - if (bms_is_empty(joinrel->direct_lateral_relids)) - joinrel->direct_lateral_relids = NULL; - - /* - * Construct restrict and join clause lists for the new joinrel. (The - * caller might or might not need the restrictlist, but I need it anyway - * for set_joinrel_size_estimates().) 
- */ - restrictlist = build_joinrel_restrictlist(root, joinrel, - outer_rel, inner_rel); - if (restrictlist_ptr) - *restrictlist_ptr = restrictlist; - build_joinrel_joinlist(joinrel, outer_rel, inner_rel); - - /* - * This is also the right place to check whether the joinrel has any - * pending EquivalenceClass joins. - */ - joinrel->has_eclass_joins = has_relevant_eclass_joinclause(root, joinrel); - - /* - * Set estimates of the joinrel's size. - */ - set_joinrel_size_estimates(root, joinrel, outer_rel, inner_rel, - sjinfo, restrictlist); - - /* - * Set the consider_parallel flag if this joinrel could potentially be - * scanned within a parallel worker. If this flag is false for either - * inner_rel or outer_rel, then it must be false for the joinrel also. - * Even if both are true, there might be parallel-restricted expressions - * in the targetlist or quals. - * - * Note that if there are more than two rels in this relation, they could - * be divided between inner_rel and outer_rel in any arbitrary way. We - * assume this doesn't matter, because we should hit all the same baserels - * and joinclauses while building up to this joinrel no matter which we - * take; therefore, we should make the same decision here however we get - * here. - */ - if (inner_rel->consider_parallel && outer_rel->consider_parallel && - is_parallel_safe(root, (Node *) restrictlist) && - is_parallel_safe(root, (Node *) joinrel->reltarget->exprs)) - joinrel->consider_parallel = true; - - /* Add the joinrel to the PlannerInfo. */ - add_join_rel(root, joinrel); + Relids joinrelids, + RelOptInfo *outer_rel, + RelOptInfo *inner_rel, + SpecialJoinInfo *sjinfo, + List **restrictlist_ptr) +{ + RelOptInfo *joinrel; + List *restrictlist; +#ifdef __TBASE__ + PlannerInfo *top_root = root; +#endif - /* - * Also, if dynamic-programming join search is active, add the new joinrel - * to the appropriate sublist. Note: you might think the Assert on number - * of members should be for equality, but some of the level 1 rels might - * have been joinrels already, so we can only assert <=. - */ - if (root->join_rel_level) - { - Assert(root->join_cur_level > 0); - Assert(root->join_cur_level <= bms_num_members(joinrel->relids)); - root->join_rel_level[root->join_cur_level] = - lappend(root->join_rel_level[root->join_cur_level], joinrel); - } + /* + * See if we already have a joinrel for this set of base rels. + */ + joinrel = find_join_rel(root, joinrelids); + + if (joinrel) + { + /* + * Yes, so we only need to figure the restrictlist for this particular + * pair of component relations. + */ + if (restrictlist_ptr) + *restrictlist_ptr = build_joinrel_restrictlist(root, + joinrel, + outer_rel, + inner_rel); + return joinrel; + } + + /* + * Nope, so make one. 
+ */ + joinrel = makeNode(RelOptInfo); + joinrel->reloptkind = RELOPT_JOINREL; + joinrel->relids = bms_copy(joinrelids); + joinrel->rows = 0; + /* cheap startup cost is interesting iff not all tuples to be retrieved */ + joinrel->consider_startup = (root->tuple_fraction > 0); + joinrel->consider_param_startup = false; + joinrel->consider_parallel = false; + joinrel->reltarget = create_empty_pathtarget(); + joinrel->pathlist = NIL; + joinrel->ppilist = NIL; + joinrel->partial_pathlist = NIL; + joinrel->cheapest_startup_path = NULL; + joinrel->cheapest_total_path = NULL; + joinrel->cheapest_unique_path = NULL; + joinrel->cheapest_parameterized_paths = NIL; + /* init direct_lateral_relids from children; we'll finish it up below */ + joinrel->direct_lateral_relids = + bms_union(outer_rel->direct_lateral_relids, + inner_rel->direct_lateral_relids); + joinrel->lateral_relids = min_join_parameterization(root, joinrel->relids, + outer_rel, inner_rel); + joinrel->relid = 0; /* indicates not a baserel */ + joinrel->rtekind = RTE_JOIN; + joinrel->min_attr = 0; + joinrel->max_attr = 0; + joinrel->attr_needed = NULL; + joinrel->attr_widths = NULL; + joinrel->lateral_vars = NIL; + joinrel->lateral_referencers = NULL; + joinrel->indexlist = NIL; + joinrel->statlist = NIL; + joinrel->pages = 0; + joinrel->tuples = 0; + joinrel->allvisfrac = 0; + joinrel->subroot = NULL; + joinrel->subplan_params = NIL; + joinrel->rel_parallel_workers = -1; + joinrel->serverid = InvalidOid; + joinrel->userid = InvalidOid; + joinrel->useridiscurrent = false; + joinrel->fdwroutine = NULL; + joinrel->fdw_private = NULL; + joinrel->unique_for_rels = NIL; + joinrel->non_unique_for_rels = NIL; + joinrel->baserestrictinfo = NIL; + joinrel->baserestrictcost.startup = 0; + joinrel->baserestrictcost.per_tuple = 0; + joinrel->baserestrict_min_security = UINT_MAX; + joinrel->joininfo = NIL; + joinrel->has_eclass_joins = false; + joinrel->top_parent_relids = NULL; +#ifdef __TBASE__ + joinrel->resultRelLoc = RESULT_REL_NONE; +#endif - return joinrel; + /* Compute information relevant to the foreign relations. */ + set_foreign_rel_properties(joinrel, outer_rel, inner_rel); + + /* + * Create a new tlist containing just the vars that need to be output from + * this join (ie, are needed for higher joinclauses or final output). + * + * NOTE: the tlist order for a join rel will depend on which pair of outer + * and inner rels we first try to build it from. But the contents should + * be the same regardless. + */ + build_joinrel_tlist(root, joinrel, outer_rel); + build_joinrel_tlist(root, joinrel, inner_rel); + add_placeholders_to_joinrel(root, joinrel, outer_rel, inner_rel); + + /* + * add_placeholders_to_joinrel also took care of adding the ph_lateral + * sets of any PlaceHolderVars computed here to direct_lateral_relids, so + * now we can finish computing that. This is much like the computation of + * the transitively-closed lateral_relids in min_join_parameterization, + * except that here we *do* have to consider the added PHVs. + */ + joinrel->direct_lateral_relids = + bms_del_members(joinrel->direct_lateral_relids, joinrel->relids); + if (bms_is_empty(joinrel->direct_lateral_relids)) + joinrel->direct_lateral_relids = NULL; + + /* + * Construct restrict and join clause lists for the new joinrel. (The + * caller might or might not need the restrictlist, but I need it anyway + * for set_joinrel_size_estimates().) 
+ */ + restrictlist = build_joinrel_restrictlist(root, joinrel, + outer_rel, inner_rel); + if (restrictlist_ptr) + *restrictlist_ptr = restrictlist; + build_joinrel_joinlist(joinrel, outer_rel, inner_rel); + + /* + * This is also the right place to check whether the joinrel has any + * pending EquivalenceClass joins. + */ + joinrel->has_eclass_joins = has_relevant_eclass_joinclause(root, joinrel); + + /* + * Set estimates of the joinrel's size. + */ + set_joinrel_size_estimates(root, joinrel, outer_rel, inner_rel, + sjinfo, restrictlist); + + /* + * Set the consider_parallel flag if this joinrel could potentially be + * scanned within a parallel worker. If this flag is false for either + * inner_rel or outer_rel, then it must be false for the joinrel also. + * Even if both are true, there might be parallel-restricted expressions + * in the targetlist or quals. + * + * Note that if there are more than two rels in this relation, they could + * be divided between inner_rel and outer_rel in any arbitrary way. We + * assume this doesn't matter, because we should hit all the same baserels + * and joinclauses while building up to this joinrel no matter which we + * take; therefore, we should make the same decision here however we get + * here. + */ + if (inner_rel->consider_parallel && outer_rel->consider_parallel && + is_parallel_safe(root, (Node *) restrictlist) && + is_parallel_safe(root, (Node *) joinrel->reltarget->exprs)) + joinrel->consider_parallel = true; + + /* Add the joinrel to the PlannerInfo. */ + add_join_rel(root, joinrel); + + /* + * Also, if dynamic-programming join search is active, add the new joinrel + * to the appropriate sublist. Note: you might think the Assert on number + * of members should be for equality, but some of the level 1 rels might + * have been joinrels already, so we can only assert <=. 
+ */ + if (root->join_rel_level) + { + Assert(root->join_cur_level > 0); + Assert(root->join_cur_level <= bms_num_members(joinrel->relids)); + root->join_rel_level[root->join_cur_level] = + lappend(root->join_rel_level[root->join_cur_level], joinrel); + } + + return joinrel; } /* diff --git a/src/include/nodes/relation.h b/src/include/nodes/relation.h index d2d0ec0a..99a6325c 100644 --- a/src/include/nodes/relation.h +++ b/src/include/nodes/relation.h @@ -100,6 +100,16 @@ typedef struct Distribution } Distribution; #endif +#ifdef __TBASE__ +/* + * The location of DML result relation in JOINREL + */ +typedef enum ResultRelLocation { + RESULT_REL_NONE, /* Not found */ + RESULT_REL_INNER, /* Appears in inner subpath */ + RESULT_REL_OUTER /* Appears in outer subpath */ +} ResultRelLocation; +#endif /* * Relids @@ -697,11 +707,14 @@ typedef struct RelOptInfo /* used by "other" relations */ Relids top_parent_relids; /* Relids of topmost parents */ #ifdef __TBASE__ - /* used for interval partition */ - bool intervalparent; /* is interval partition */ - bool isdefault; /* is default partition table */ - Bitmapset *childs; /* child tables to query */ - int estimate_partidx; /* */ + /* used for interval partition */ + bool intervalparent; /* is interval partition */ + bool isdefault; /* is default partition table */ + Bitmapset *childs; /* child tables to query */ + int estimate_partidx; /* */ + + /* used for complex delete */ + ResultRelLocation resultRelLoc; #endif } RelOptInfo; diff --git a/src/include/optimizer/distribution.h b/src/include/optimizer/distribution.h new file mode 100644 index 00000000..fe4d2aeb --- /dev/null +++ b/src/include/optimizer/distribution.h @@ -0,0 +1,28 @@ +/*------------------------------------------------------------------------- + * + * distribution.h + * Routines related to adjust distribution + * + * Copyright (c) 2020-Present TBase development team, Tencent + * + * + * IDENTIFICATION + * src/include/optimizer/distribution.h + * + *------------------------------------------------------------------------- + */ +#ifndef DISTRIBUTION_H +#define DISTRIBUTION_H + +#include "postgres.h" + +#include "nodes/relation.h" + +/* TODO(TBase): Move all plan/path distribution routines to this file */ + +extern bool equal_distributions(PlannerInfo *root, Distribution *dst1, + Distribution *dst2); +extern ResultRelLocation getResultRelLocation(int resultRel, Relids inner, + Relids outer); +extern bool SatisfyResultRelDist(PlannerInfo *root, Path *path); +#endif /* DISTRIBUTION_H */ diff --git a/src/test/regress/expected/foreign_key_2.out b/src/test/regress/expected/foreign_key_2.out index 1dfe6663..8b8ac8ac 100644 --- a/src/test/regress/expected/foreign_key_2.out +++ b/src/test/regress/expected/foreign_key_2.out @@ -1373,11 +1373,26 @@ create temp table t1 (a integer primary key, b text); create temp table t2 (a integer, b integer references t1) distribute by hash (b); create rule r1 as on delete to t1 do delete from t2 where t2.b = old.a; explain (costs off) delete from t1 where a = 1; -ERROR: could not plan this distributed delete -DETAIL: correlated or complex DELETE is currently not supported in Postgres-XL. 
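The new optimizer/distribution.h interface above is what the join-path changes rely on: getResultRelLocation() reports where the UPDATE/DELETE target relation sits relative to a join, and SatisfyResultRelDist() lets the create_*join_path functions reject paths that would move its rows. The sketch below is only a plausible reading of the declaration and of the ResultRelLocation enum added to relation.h; the real implementation is not shown in these hunks, and the function is renamed here to make that explicit.

    /* assumes nodes/bitmapset.h and nodes/relation.h, as in the real header */
    static ResultRelLocation
    result_rel_location_sketch(int resultRel, Relids inner, Relids outer)
    {
        /* resultRel is the range-table index of the UPDATE/DELETE target */
        if (resultRel > 0 && bms_is_member(resultRel, inner))
            return RESULT_REL_INNER;    /* target under the inner subpath */
        if (resultRel > 0 && bms_is_member(resultRel, outer))
            return RESULT_REL_OUTER;    /* target under the outer subpath */
        return RESULT_REL_NONE;         /* target not part of this join */
    }

With that location in hand, set_joinpath_distribution() can keep the result relation's side un-redistributed (the keepResultRelLoc branches earlier in this patch), and, as the updated expected output shows, correlated DELETEs such as the one routed through rule r1 now get a plan instead of the old error.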
+ QUERY PLAN +------------------------------------------------------------------ + Remote Subquery Scan on all (datanode_1) + -> Delete on t2 + -> Nested Loop + -> Seq Scan on t2 + Filter: (b = 1) + -> Materialize + -> Remote Subquery Scan on all (datanode_1) + -> Index Scan using t1_pkey on t1 + Index Cond: (a = 1) + + Remote Fast Query Execution + Node/s: datanode_1 + -> Delete on t1 + -> Index Scan using t1_pkey on t1 + Index Cond: (a = 1) +(15 rows) + delete from t1 where a = 1; -ERROR: could not plan this distributed delete -DETAIL: correlated or complex DELETE is currently not supported in Postgres-XL. drop rule r1 on t1; explain (costs off, nodes off) delete from t1 where a = 1; QUERY PLAN diff --git a/src/test/regress/expected/join_3.out b/src/test/regress/expected/join_3.out index 9d08f4b2..f151b912 100644 --- a/src/test/regress/expected/join_3.out +++ b/src/test/regress/expected/join_3.out @@ -2341,15 +2341,12 @@ SELECT * FROM t3 ORDER By x, y; (3 rows) DELETE FROM t3 USING t1 JOIN t2 USING (a) WHERE t3.x > t1.a; -ERROR: could not plan this distributed delete -DETAIL: correlated or complex DELETE is currently not supported in Postgres-XL. SELECT * FROM t3 ORDER By x, y; - x | y ------+----- - 6 | 7 - 7 | 8 - 500 | 100 -(3 rows) + x | y +---+--- + 6 | 7 + 7 | 8 +(2 rows) DELETE FROM t3 USING t3 t3_other WHERE t3.x = t3_other.x AND t3.y = t3_other.y; SELECT * FROM t3 ORDER By x, y; diff --git a/src/test/regress/expected/matview_1.out b/src/test/regress/expected/matview_1.out index 4bd7e9dc..ec7d3220 100644 --- a/src/test/regress/expected/matview_1.out +++ b/src/test/regress/expected/matview_1.out @@ -542,8 +542,8 @@ drop materialized view mvtest_error; CREATE TABLE mvtest_v AS SELECT generate_series(1,10) AS a; CREATE MATERIALIZED VIEW mvtest_mv_v AS SELECT a FROM mvtest_v WHERE a <= 5; DELETE FROM mvtest_v WHERE EXISTS ( SELECT * FROM mvtest_mv_v WHERE mvtest_mv_v.a = mvtest_v.a ); -ERROR: could not plan this distributed delete -DETAIL: correlated or complex DELETE is currently not supported in Postgres-XL. +ERROR: materialized view "mvtest_mv_v" has not been populated +HINT: Use the REFRESH MATERIALIZED VIEW command. SELECT * FROM mvtest_v order by 1; a ---- diff --git a/src/test/regress/expected/returning.out b/src/test/regress/expected/returning.out index 0e32cc37..5f667e4f 100644 --- a/src/test/regress/expected/returning.out +++ b/src/test/regress/expected/returning.out @@ -313,8 +313,8 @@ CREATE RULE joinview_u AS ON UPDATE TO joinview DO INSTEAD FROM joinme WHERE f2 = f2j AND f2 = old.f2 RETURNING foo.*, other; UPDATE joinview SET f1 = f1 + 1 WHERE f3 = 57 RETURNING *, other + 1; -ERROR: could not plan this distributed update -DETAIL: correlated UPDATE or updating distribution column currently not supported in Postgres-XL. +ERROR: could not plan this distributed UPDATE/DELETE +DETAIL: correlated or complex UPDATE/DELETE is currently not supported in Postgres-XL. 
SELECT * FROM joinview ORDER BY f1; f1 | f2 | f3 | f4 | other ----+------+----+-----+------- diff --git a/src/test/regress/expected/rowsecurity_1.out b/src/test/regress/expected/rowsecurity_1.out index fb6327f3..670e9a06 100644 --- a/src/test/regress/expected/rowsecurity_1.out +++ b/src/test/regress/expected/rowsecurity_1.out @@ -1572,49 +1572,159 @@ UPDATE t1 SET b = b WHERE f_leak(b); -- updates with from clause EXPLAIN (COSTS OFF) UPDATE t2 SET b=t2.b FROM t3 WHERE t2.a = 3 and t3.a = 2 AND f_leak(t2.b) AND f_leak(t3.b); -ERROR: could not plan this distributed update -DETAIL: correlated UPDATE or updating distribution column currently not supported in Postgres-XL. + QUERY PLAN +----------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_2) + -> Update on t2 + -> Nested Loop + -> Seq Scan on t2 + Filter: ((a = 3) AND ((a % 2) = 1) AND f_leak(b)) + -> Materialize + -> Remote Subquery Scan on all (datanode_1) + -> Seq Scan on t3 + Filter: ((a = 2) AND f_leak(b)) +(9 rows) + UPDATE t2 SET b=t2.b FROM t3 WHERE t2.a = 3 and t3.a = 2 AND f_leak(t2.b) AND f_leak(t3.b); -ERROR: could not plan this distributed update -DETAIL: correlated UPDATE or updating distribution column currently not supported in Postgres-XL. EXPLAIN (COSTS OFF) UPDATE t1 SET b=t1.b FROM t2 WHERE t1.a = 3 and t2.a = 3 AND f_leak(t1.b) AND f_leak(t2.b); -ERROR: could not plan this distributed update -DETAIL: correlated UPDATE or updating distribution column currently not supported in Postgres-XL. + QUERY PLAN +----------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_2) + -> Update on t1 + Update on t1 + Update on t2 t2_1 + Update on t3 + -> Nested Loop + -> Remote Subquery Scan on all (datanode_2) + -> Seq Scan on t2 + Filter: ((a = 3) AND ((a % 2) = 1) AND f_leak(b)) + -> Seq Scan on t1 + Filter: ((a = 3) AND ((a % 2) = 0) AND f_leak(b)) + -> Nested Loop + -> Remote Subquery Scan on all (datanode_2) + -> Seq Scan on t2 + Filter: ((a = 3) AND ((a % 2) = 1) AND f_leak(b)) + -> Seq Scan on t2 t2_1 + Filter: ((a = 3) AND ((a % 2) = 0) AND f_leak(b)) + -> Nested Loop + -> Remote Subquery Scan on all (datanode_2) + -> Seq Scan on t2 + Filter: ((a = 3) AND ((a % 2) = 1) AND f_leak(b)) + -> Seq Scan on t3 + Filter: ((a = 3) AND ((a % 2) = 0) AND f_leak(b)) +(23 rows) + UPDATE t1 SET b=t1.b FROM t2 WHERE t1.a = 3 and t2.a = 3 AND f_leak(t1.b) AND f_leak(t2.b); -ERROR: could not plan this distributed update -DETAIL: correlated UPDATE or updating distribution column currently not supported in Postgres-XL. EXPLAIN (COSTS OFF) UPDATE t2 SET b=t2.b FROM t1 WHERE t1.a = 3 and t2.a = 3 AND f_leak(t1.b) AND f_leak(t2.b); -ERROR: could not plan this distributed update -DETAIL: correlated UPDATE or updating distribution column currently not supported in Postgres-XL. 
+ QUERY PLAN +----------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_2) + -> Update on t2 + -> Nested Loop + -> Seq Scan on t2 + Filter: ((a = 3) AND ((a % 2) = 1) AND f_leak(b)) + -> Materialize + -> Remote Subquery Scan on all (datanode_2) + -> Append + -> Seq Scan on t1 + Filter: ((a = 3) AND ((a % 2) = 0) AND f_leak(b)) + -> Seq Scan on t2 t2_1 + Filter: ((a = 3) AND ((a % 2) = 0) AND f_leak(b)) + -> Seq Scan on t3 + Filter: ((a = 3) AND ((a % 2) = 0) AND f_leak(b)) +(14 rows) + UPDATE t2 SET b=t2.b FROM t1 WHERE t1.a = 3 and t2.a = 3 AND f_leak(t1.b) AND f_leak(t2.b); -ERROR: could not plan this distributed update -DETAIL: correlated UPDATE or updating distribution column currently not supported in Postgres-XL. -- updates with from clause self join EXPLAIN (COSTS OFF) UPDATE t2 t2_1 SET b = t2_2.b FROM t2 t2_2 WHERE t2_1.a = 3 AND t2_2.a = t2_1.a AND t2_2.b = t2_1.b AND f_leak(t2_1.b) AND f_leak(t2_2.b) RETURNING *, t2_1, t2_2; -ERROR: could not plan this distributed update -DETAIL: correlated UPDATE or updating distribution column currently not supported in Postgres-XL. + QUERY PLAN +----------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_2) + -> Update on t2 t2_1 + -> Nested Loop + Join Filter: (t2_1.b = t2_2.b) + -> Remote Subquery Scan on all (datanode_2) + -> Seq Scan on t2 t2_2 + Filter: ((a = 3) AND ((a % 2) = 1) AND f_leak(b)) + -> Seq Scan on t2 t2_1 + Filter: ((a = 3) AND ((a % 2) = 1) AND f_leak(b)) +(9 rows) + UPDATE t2 t2_1 SET b = t2_2.b FROM t2 t2_2 WHERE t2_1.a = 3 AND t2_2.a = t2_1.a AND t2_2.b = t2_1.b AND f_leak(t2_1.b) AND f_leak(t2_2.b) RETURNING *, t2_1, t2_2; -ERROR: could not plan this distributed update -DETAIL: correlated UPDATE or updating distribution column currently not supported in Postgres-XL. + a | b | c | a | b | c | t2_1 | t2_2 +---+-----+-----+---+-----+-----+-------------+------------- + 3 | cde | 3.3 | 3 | cde | 3.3 | (3,cde,3.3) | (3,cde,3.3) +(1 row) + EXPLAIN (COSTS OFF) UPDATE t1 t1_1 SET b = t1_2.b FROM t1 t1_2 WHERE t1_1.a = 4 AND t1_2.a = t1_1.a AND t1_2.b = t1_1.b AND f_leak(t1_1.b) AND f_leak(t1_2.b) RETURNING *, t1_1, t1_2; -ERROR: could not plan this distributed update -DETAIL: correlated UPDATE or updating distribution column currently not supported in Postgres-XL. 
+ QUERY PLAN +----------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_2) + -> Update on t1 t1_1 + Update on t1 t1_1 + Update on t2 t1_1_1 + Update on t3 t1_1_2 + -> Hash Join + Hash Cond: (b = t1_1.b) + -> Remote Subquery Scan on all (datanode_2) + -> Append + -> Seq Scan on t1 t1_2 + Filter: ((a = 4) AND ((a % 2) = 0) AND f_leak(b)) + -> Seq Scan on t2 t1_2_1 + Filter: ((a = 4) AND ((a % 2) = 0) AND f_leak(b)) + -> Seq Scan on t3 t1_2_2 + Filter: ((a = 4) AND ((a % 2) = 0) AND f_leak(b)) + -> Hash + -> Seq Scan on t1 t1_1 + Filter: ((a = 4) AND ((a % 2) = 0) AND f_leak(b)) + -> Nested Loop + Join Filter: (t1_1_1.b = b) + -> Seq Scan on t2 t1_1_1 + Filter: ((a = 4) AND ((a % 2) = 0) AND f_leak(b)) + -> Materialize + -> Remote Subquery Scan on all (datanode_2) + -> Append + -> Seq Scan on t1 t1_2 + Filter: ((a = 4) AND ((a % 2) = 0) AND f_leak(b)) + -> Seq Scan on t2 t1_2_1 + Filter: ((a = 4) AND ((a % 2) = 0) AND f_leak(b)) + -> Seq Scan on t3 t1_2_2 + Filter: ((a = 4) AND ((a % 2) = 0) AND f_leak(b)) + -> Nested Loop + Join Filter: (t1_1_2.b = b) + -> Seq Scan on t3 t1_1_2 + Filter: ((a = 4) AND ((a % 2) = 0) AND f_leak(b)) + -> Materialize + -> Remote Subquery Scan on all (datanode_2) + -> Append + -> Seq Scan on t1 t1_2 + Filter: ((a = 4) AND ((a % 2) = 0) AND f_leak(b)) + -> Seq Scan on t2 t1_2_1 + Filter: ((a = 4) AND ((a % 2) = 0) AND f_leak(b)) + -> Seq Scan on t3 t1_2_2 + Filter: ((a = 4) AND ((a % 2) = 0) AND f_leak(b)) +(44 rows) + UPDATE t1 t1_1 SET b = t1_2.b FROM t1 t1_2 WHERE t1_1.a = 4 AND t1_2.a = t1_1.a AND t1_2.b = t1_1.b AND f_leak(t1_1.b) AND f_leak(t1_2.b) RETURNING *, t1_1, t1_2; -ERROR: could not plan this distributed update -DETAIL: correlated UPDATE or updating distribution column currently not supported in Postgres-XL. + a | b | a | b | t1_1 | t1_2 +---+-------------+---+-------------+-----------------+----------------- + 4 | daddad_updt | 4 | daddad_updt | (4,daddad_updt) | (4,daddad_updt) + 4 | defdef | 4 | defdef | (4,defdef) | (4,defdef) +(2 rows) + RESET SESSION AUTHORIZATION; SET row_security TO OFF; SELECT * FROM t1 ORDER BY a,b; diff --git a/src/test/regress/expected/rowtypes_1.out b/src/test/regress/expected/rowtypes_1.out index 37ec7dc0..57671100 100644 --- a/src/test/regress/expected/rowtypes_1.out +++ b/src/test/regress/expected/rowtypes_1.out @@ -398,10 +398,14 @@ UPDATE price SET active = true, price = input_prices.price FROM unnest(ARRAY[(10, 123.00), (11, 99.99)]::price_input[]) input_prices WHERE price_key_from_table(price.*) = price_key_from_input(input_prices.*); -ERROR: could not plan this distributed update -DETAIL: correlated UPDATE or updating distribution column currently not supported in Postgres-XL. 
select * from price; -ERROR: current transaction is aborted, commands ignored until end of transaction block + id | active | price +----+--------+-------- + 1 | f | 42 + 10 | t | 123.00 + 11 | t | 99.99 +(3 rows) + rollback; -- -- Test case derived from bug #9085: check * qualification of composite diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out index d9f4a310..a169bf4a 100644 --- a/src/test/regress/expected/rules.out +++ b/src/test/regress/expected/rules.out @@ -291,8 +291,8 @@ select * from rtest_v1 order by a, b; -- updates in a mergejoin update rtest_v1 set b = rtest_t2.b from rtest_t2 where rtest_v1.a = rtest_t2.a; -ERROR: could not plan this distributed update -DETAIL: correlated UPDATE or updating distribution column currently not supported in Postgres-XL. +ERROR: could not plan this distributed UPDATE/DELETE +DETAIL: correlated or complex UPDATE/DELETE is currently not supported in Postgres-XL. select * from rtest_v1 order by a, b; a | b ---+---- @@ -330,8 +330,8 @@ select * from rtest_v1 order by a, b; (8 rows) update rtest_v1 set a = rtest_t3.a + 20 from rtest_t3 where rtest_v1.b = rtest_t3.b; -ERROR: could not plan this distributed update -DETAIL: correlated UPDATE or updating distribution column currently not supported in Postgres-XL. +ERROR: could not plan this distributed UPDATE/DELETE +DETAIL: correlated or complex UPDATE/DELETE is currently not supported in Postgres-XL. select * from rtest_v1 order by a, b; a | b ----+---- @@ -361,58 +361,49 @@ insert into rtest_admin values ('jw', 'orion'); insert into rtest_admin values ('jw', 'notjw'); insert into rtest_admin values ('bm', 'neptun'); update rtest_system set sysname = 'pluto' where sysname = 'neptun'; -ERROR: could not plan this distributed update -DETAIL: correlated UPDATE or updating distribution column currently not supported in Postgres-XL. select * from rtest_interface order by sysname, ifname; sysname | ifname ---------+-------- - neptun | eth0 notjw | eth0 orion | eth0 orion | eth1 + pluto | eth0 (4 rows) select * from rtest_admin order by pname, sysname; pname | sysname -------+--------- - bm | neptun + bm | pluto jw | notjw jw | orion (3 rows) update rtest_person set pname = 'jwieck' where pdesc = 'Jan Wieck'; -ERROR: could not plan this distributed update -DETAIL: correlated UPDATE or updating distribution column currently not supported in Postgres-XL. -- Note: use ORDER BY here to ensure consistent output across all systems. -- The above UPDATE affects two rows with equal keys, so they could be -- updated in either order depending on the whim of the local qsort(). select * from rtest_admin order by pname, sysname; - pname | sysname --------+--------- - bm | neptun - jw | notjw - jw | orion + pname | sysname +--------+--------- + bm | pluto + jwieck | notjw + jwieck | orion (3 rows) delete from rtest_system where sysname = 'orion'; -ERROR: could not plan this distributed delete -DETAIL: correlated or complex DELETE is currently not supported in Postgres-XL. 
select * from rtest_interface order by sysname, ifname; sysname | ifname ---------+-------- - neptun | eth0 notjw | eth0 - orion | eth0 - orion | eth1 -(4 rows) + pluto | eth0 +(2 rows) select * from rtest_admin order by pname, sysname; - pname | sysname --------+--------- - bm | neptun - jw | notjw - jw | orion -(3 rows) + pname | sysname +--------+--------- + bm | pluto + jwieck | notjw +(2 rows) -- -- Rule qualification test @@ -452,36 +443,41 @@ select ename, who = current_user as "matches user", action, newsal, oldsal from update rtest_empmass set salary = salary + '1000.00'; update rtest_emp set salary = rtest_empmass.salary from rtest_empmass where rtest_emp.ename = rtest_empmass.ename; -ERROR: could not plan this distributed update -DETAIL: correlated UPDATE or updating distribution column currently not supported in Postgres-XL. select ename, who = current_user as "matches user", action, newsal, oldsal from rtest_emplog order by ename, action, newsal; ename | matches user | action | newsal | oldsal ----------------------+--------------+------------+------------+------------ gates | t | fired | $0.00 | $80,000.00 gates | t | hired | $80,000.00 | $0.00 maier | t | hired | $5,000.00 | $0.00 + maier | t | honored | $6,000.00 | $5,000.00 mayr | t | hired | $6,000.00 | $0.00 + mayr | t | honored | $7,000.00 | $6,000.00 meyer | t | hired | $4,000.00 | $0.00 + meyer | t | honored | $5,000.00 | $4,000.00 wiecc | t | hired | $5,000.00 | $0.00 wieck | t | honored | $6,000.00 | $5,000.00 wieck | t | honored | $7,000.00 | $6,000.00 -(8 rows) +(11 rows) delete from rtest_emp using rtest_empmass where rtest_emp.ename = rtest_empmass.ename; -ERROR: could not plan this distributed delete -DETAIL: correlated or complex DELETE is currently not supported in Postgres-XL. select ename, who = current_user as "matches user", action, newsal, oldsal from rtest_emplog order by ename, action, newsal; ename | matches user | action | newsal | oldsal ----------------------+--------------+------------+------------+------------ gates | t | fired | $0.00 | $80,000.00 gates | t | hired | $80,000.00 | $0.00 + maier | t | fired | $0.00 | $6,000.00 maier | t | hired | $5,000.00 | $0.00 + maier | t | honored | $6,000.00 | $5,000.00 + mayr | t | fired | $0.00 | $7,000.00 mayr | t | hired | $6,000.00 | $0.00 + mayr | t | honored | $7,000.00 | $6,000.00 + meyer | t | fired | $0.00 | $5,000.00 meyer | t | hired | $4,000.00 | $0.00 + meyer | t | honored | $5,000.00 | $4,000.00 wiecc | t | hired | $5,000.00 | $0.00 wieck | t | honored | $6,000.00 | $5,000.00 wieck | t | honored | $7,000.00 | $6,000.00 -(8 rows) +(14 rows) -- -- Multiple cascaded qualified instead rule test @@ -1103,26 +1099,27 @@ SELECT * FROM shoelace ORDER BY sl_name; (8 rows) insert into shoelace_ok select * from shoelace_arrive; -ERROR: could not plan this distributed update -DETAIL: correlated UPDATE or updating distribution column currently not supported in Postgres-XL. 
SELECT * FROM shoelace ORDER BY sl_name; sl_name | sl_avail | sl_color | sl_len | sl_unit | sl_len_cm ------------+----------+------------+--------+----------+----------- sl1 | 5 | black | 80 | cm | 80 sl2 | 6 | black | 100 | cm | 100 - sl3 | 0 | black | 35 | inch | 88.9 + sl3 | 10 | black | 35 | inch | 88.9 sl4 | 8 | black | 40 | inch | 101.6 sl5 | 4 | brown | 1 | m | 100 - sl6 | 0 | brown | 0.9 | m | 90 + sl6 | 20 | brown | 0.9 | m | 90 sl7 | 6 | brown | 60 | cm | 60 - sl8 | 1 | brown | 40 | inch | 101.6 + sl8 | 21 | brown | 40 | inch | 101.6 (8 rows) SELECT * FROM shoelace_log ORDER BY sl_name; sl_name | sl_avail | log_who | log_when ------------+----------+----------+-------------------------- + sl3 | 10 | Al Bundy | Thu Jan 01 00:00:00 1970 + sl6 | 20 | Al Bundy | Thu Jan 01 00:00:00 1970 sl7 | 6 | Al Bundy | Thu Jan 01 00:00:00 1970 -(1 row) + sl8 | 21 | Al Bundy | Thu Jan 01 00:00:00 1970 +(4 rows) CREATE VIEW shoelace_obsolete AS SELECT * FROM shoelace WHERE NOT EXISTS @@ -1151,22 +1148,19 @@ SELECT * FROM shoelace_candelete; DELETE FROM shoelace WHERE EXISTS (SELECT * FROM shoelace_candelete WHERE sl_name = shoelace.sl_name); -ERROR: could not plan this distributed delete -DETAIL: correlated or complex DELETE is currently not supported in Postgres-XL. SELECT * FROM shoelace ORDER BY sl_name; sl_name | sl_avail | sl_color | sl_len | sl_unit | sl_len_cm ------------+----------+------------+--------+----------+----------- sl1 | 5 | black | 80 | cm | 80 sl10 | 1000 | magenta | 40 | inch | 101.6 sl2 | 6 | black | 100 | cm | 100 - sl3 | 0 | black | 35 | inch | 88.9 + sl3 | 10 | black | 35 | inch | 88.9 sl4 | 8 | black | 40 | inch | 101.6 sl5 | 4 | brown | 1 | m | 100 - sl6 | 0 | brown | 0.9 | m | 90 + sl6 | 20 | brown | 0.9 | m | 90 sl7 | 6 | brown | 60 | cm | 60 - sl8 | 1 | brown | 40 | inch | 101.6 - sl9 | 0 | pink | 35 | inch | 88.9 -(10 rows) + sl8 | 21 | brown | 40 | inch | 101.6 +(9 rows) SELECT * FROM shoe ORDER BY shoename; shoename | sh_avail | slcolor | slminlen | slminlen_cm | slmaxlen | slmaxlen_cm | slunit @@ -1254,40 +1248,35 @@ select * from vview order by pid; (2 rows) update vview set descrip='test1' where pid=1; -ERROR: could not plan this distributed update -DETAIL: correlated UPDATE or updating distribution column currently not supported in Postgres-XL. select * from vview order by pid; - pid | txt | descrip ------+---------+---------- - 1 | parent1 | descrip1 + pid | txt | descrip +-----+---------+--------- + 1 | parent1 | test1 2 | parent2 | (2 rows) update vview set descrip='test2' where pid=2; -ERROR: could not plan this distributed update -DETAIL: correlated UPDATE or updating distribution column currently not supported in Postgres-XL. select * from vview order by pid; - pid | txt | descrip ------+---------+---------- - 1 | parent1 | descrip1 - 2 | parent2 | + pid | txt | descrip +-----+---------+--------- + 1 | parent1 | test1 + 2 | parent2 | test2 (2 rows) update vview set descrip='test3' where pid=3; -ERROR: could not plan this distributed update -DETAIL: correlated UPDATE or updating distribution column currently not supported in Postgres-XL. 
select * from vview order by pid; - pid | txt | descrip ------+---------+---------- - 1 | parent1 | descrip1 - 2 | parent2 | + pid | txt | descrip +-----+---------+--------- + 1 | parent1 | test1 + 2 | parent2 | test2 (2 rows) select * from cchild order by pid; - pid | descrip ------+---------- - 1 | descrip1 -(1 row) + pid | descrip +-----+--------- + 1 | test1 + 2 | test2 +(2 rows) drop rule rrule on vview; drop view vview; @@ -2678,14 +2667,11 @@ select * from id_ordered order by id; (6 rows) update id_ordered set name = 'update 2' where id = 2; -ERROR: could not plan this distributed update -DETAIL: correlated UPDATE or updating distribution column currently not supported in Postgres-XL. +ERROR: input of anonymous composite types is not implemented update id_ordered set name = 'update 4' where id = 4; -ERROR: could not plan this distributed update -DETAIL: correlated UPDATE or updating distribution column currently not supported in Postgres-XL. +ERROR: input of anonymous composite types is not implemented update id_ordered set name = 'update 5' where id = 5; -ERROR: could not plan this distributed update -DETAIL: correlated UPDATE or updating distribution column currently not supported in Postgres-XL. +ERROR: input of anonymous composite types is not implemented select * from id_ordered order by id; id | name ----+-------- diff --git a/src/test/regress/expected/subselect_1.out b/src/test/regress/expected/subselect_1.out index 50633a31..e8cd553a 100644 --- a/src/test/regress/expected/subselect_1.out +++ b/src/test/regress/expected/subselect_1.out @@ -530,12 +530,10 @@ update shipped_view set value = 11 from int4_tbl a join int4_tbl b on (a.f1 = (select f1 from int4_tbl c where c.f1=b.f1)) where ordnum = a.f1; -ERROR: could not plan this distributed update -DETAIL: correlated UPDATE or updating distribution column currently not supported in Postgres-XL. select * from shipped_view; - ttype | ordnum | partnum | value --------+--------+---------+--------- - wt | 0 | 1 | 1234.56 + ttype | ordnum | partnum | value +-------+--------+---------+------- + wt | 0 | 1 | 11 (1 row) select f1, ss1 as relabel from diff --git a/src/test/regress/expected/xl_limitations_1.out b/src/test/regress/expected/xl_limitations_1.out index e408b62a..c44f0d64 100644 --- a/src/test/regress/expected/xl_limitations_1.out +++ b/src/test/regress/expected/xl_limitations_1.out @@ -552,8 +552,6 @@ where xl_t.no = T1.no1; update xl_t1 set name1 = T1.name1 from (select name,name1 from xl_names) T1 where xl_t1.name1 = T1.name; -ERROR: could not plan this distributed update -DETAIL: correlated UPDATE or updating distribution column currently not supported in Postgres-XL. select xl_nodename_from_id(xc_node_id), * from xl_t order by 1; xl_nodename_from_id | no | name ---------------------+----+------ @@ -566,10 +564,10 @@ select xl_nodename_from_id(xc_node_id), * from xl_t order by 1; select xl_nodename_from_id(xc_node_id), * from xl_t1 order by 1; xl_nodename_from_id | no1 | name1 ---------------------+-----+------- - datanode_1 | 1 | Z - datanode_1 | 2 | Y - datanode_2 | 3 | X - datanode_2 | 4 | W + datanode_1 | 2 | Y1 + datanode_1 | 1 | Z1 + datanode_2 | 4 | W1 + datanode_2 | 3 | X1 (4 rows) --testing correlated delete: @@ -580,23 +578,21 @@ where xl_t.no in (select no1 from xl_t1 where name1 in ('Z', 'X')) delete from xl_t1 where xl_t1.name1 in (select name1 from xl_names where name in ('Z', 'X')) ; -ERROR: could not plan this distributed delete -DETAIL: correlated or complex DELETE is currently not supported in Postgres-XL. 
select xl_nodename_from_id(xc_node_id), * from xl_t order by 1; xl_nodename_from_id | no | name ---------------------+----+------ + datanode_1 | 1 | Z datanode_1 | 2 | Y + datanode_2 | 3 | X datanode_2 | 4 | W -(2 rows) +(4 rows) select xl_nodename_from_id(xc_node_id), * from xl_t1 order by 1; xl_nodename_from_id | no1 | name1 ---------------------+-----+------- - datanode_1 | 1 | Z - datanode_1 | 2 | Y - datanode_2 | 3 | X - datanode_2 | 4 | W -(4 rows) + datanode_1 | 2 | Y1 + datanode_2 | 4 | W1 +(2 rows) drop table xl_t; drop table xl_t1; diff --git a/src/test/regress/output/misc.source b/src/test/regress/output/misc.source index c0658578..2e43a09a 100644 --- a/src/test/regress/output/misc.source +++ b/src/test/regress/output/misc.source @@ -29,15 +29,11 @@ UPDATE tmp FROM onek WHERE onek.stringu1 = 'JBAAAA' and onek.stringu1 = tmp.stringu1; -ERROR: could not plan this distributed update -DETAIL: correlated UPDATE or updating distribution column currently not supported in Postgres-XL. UPDATE tmp SET stringu1 = reverse_name(onek2.stringu1) FROM onek2 WHERE onek2.stringu1 = 'JCAAAA' and onek2.stringu1 = tmp.stringu1; -ERROR: could not plan this distributed update -DETAIL: correlated UPDATE or updating distribution column currently not supported in Postgres-XL. DROP TABLE tmp; --UPDATE person* -- SET age = age + 1; From 8a688d1d0d254917a83c9f5159bed930547ab741 Mon Sep 17 00:00:00 2001 From: ericxwu Date: Fri, 7 Aug 2020 14:14:20 +0800 Subject: [PATCH 018/578] Remove the duplicate estate free in ExecEndModifyTable http://tapd.oa.com/TBase_Oracle_Migration/bugtrace/bugs/view?bug_id=1020421696081354083 --- src/backend/executor/nodeModifyTable.c | 39 ++++++++++++-------------- src/backend/optimizer/plan/planner.c | 38 +++++++++++++++++++++++++ src/include/nodes/parsenodes.h | 12 ++++---- 3 files changed, 62 insertions(+), 27 deletions(-) diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c index 9d6aaae4..659f15b1 100644 --- a/src/backend/executor/nodeModifyTable.c +++ b/src/backend/executor/nodeModifyTable.c @@ -3204,30 +3204,27 @@ ExecEndModifyTable(ModifyTableState *node) } #ifdef __TBASE__ - if (IS_PGXC_COORDINATOR) - { - EState *state = NULL; - ResponseCombiner *combiner; - ModifyTable *plan = (ModifyTable *)node->ps.plan; + if (IS_PGXC_COORDINATOR) + { + ResponseCombiner *combiner; + ModifyTable *plan = (ModifyTable *)node->ps.plan; - if (plan->remote_plans) - { - int nremote_plans = list_length(plan->remote_plans); - - for (i = 0; i < nremote_plans; i++) - { - RemoteQuery *rq = (RemoteQuery *)list_nth(plan->remote_plans, i); - - combiner = (ResponseCombiner *) node->mt_remoterels[i]; - state = combiner->ss.ps.state; - ExecEndNode(node->mt_remoterels[i]); + if (plan->remote_plans) + { + int nremote_plans = list_length(plan->remote_plans); - DropRemoteDMLStatement(rq->statement, rq->update_cursor); - } + for (i = 0; i < nremote_plans; i++) + { + RemoteQuery *rq = (RemoteQuery *)list_nth(plan->remote_plans, i); - FreeExecutorState(state); - } - } + combiner = (ResponseCombiner *) node->mt_remoterels[i]; + + ExecEndNode(node->mt_remoterels[i]); + + DropRemoteDMLStatement(rq->statement, rq->update_cursor); + } + } + } #endif /* diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index 4495ef70..9fd805a8 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -2312,6 +2312,7 @@ grouping_planner(PlannerInfo *root, bool inheritance_update, path = adjust_path_distribution(root, 
parse, path); #ifdef __TBASE__ +<<<<<<< HEAD /* * unshippable triggers found on target relation, we have to do DML * on coordinator. @@ -2324,6 +2325,43 @@ grouping_planner(PlannerInfo *root, bool inheritance_update, } } #endif +======= + /* + * unshippable triggers found on target relation, we have to do DML + * on coordinator. + */ + if (parse->hasUnshippableTriggers) + { + if (path->distribution) + { + path = adjust_modifytable_subpath(root, parse, path); + } + } +#endif + + path = (Path *) + create_modifytable_path(root, final_rel, + parse->commandType, + parse->canSetTag, + parse->resultRelation, + NIL, + list_make1_int(parse->resultRelation), + list_make1(path), + list_make1(root), + withCheckOptionLists, + returningLists, + rowMarks, + parse->onConflict, + SS_assign_special_param(root)); + } + else + /* Adjust path by injecting a remote subplan, if appropriate. */ + path = adjust_path_distribution(root, parse, path); + + /* And shove it into final_rel */ + add_path(final_rel, path); + } +>>>>>>> d1855902... Remove the duplicate estate free in ExecEndModifyTable path = (Path *) create_modifytable_path(root, final_rel, diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h index 71e853ba..5ab4e1b5 100644 --- a/src/include/nodes/parsenodes.h +++ b/src/include/nodes/parsenodes.h @@ -146,12 +146,12 @@ typedef struct Query bool hasForUpdate; /* FOR [KEY] UPDATE/SHARE was specified */ bool hasRowSecurity; /* rewriter has applied some RLS policy */ #ifdef __TBASE__ - bool isSingleValues; /*for interval partition insert */ - bool isMultiValues; /* is simple insert into values (),(),()...();? */ - bool hasUnshippableTriggers; /* has unshippable triggers on resultRelation, - * only used for DML. Will be set at the plan phase - * in shippability check. - */ + bool isSingleValues; /*for interval partition insert */ + bool isMultiValues; /* is simple insert into values (),(),()...();? */ + bool hasUnshippableTriggers; /* has unshippable triggers on resultRelation, + * only used for DML. Will be set at the plan phase + * in shippability check. 
+ */ char *copy_filename; /* fake filename for copy from */ Bitmapset *conflict_cols; #endif From 7a65ce134c0927ed978f5261624f3ac6a66aeb10 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Wed, 2 Jun 2021 11:24:04 +0800 Subject: [PATCH 019/578] revert 8a688d1d0d254917a83c9f5159bed930547ab741 --- src/backend/executor/nodeModifyTable.c | 39 ++++++++++++++------------ src/backend/optimizer/plan/planner.c | 38 ------------------------- src/include/nodes/parsenodes.h | 12 ++++---- 3 files changed, 27 insertions(+), 62 deletions(-) diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c index 659f15b1..9d6aaae4 100644 --- a/src/backend/executor/nodeModifyTable.c +++ b/src/backend/executor/nodeModifyTable.c @@ -3204,27 +3204,30 @@ ExecEndModifyTable(ModifyTableState *node) } #ifdef __TBASE__ - if (IS_PGXC_COORDINATOR) - { - ResponseCombiner *combiner; - ModifyTable *plan = (ModifyTable *)node->ps.plan; - - if (plan->remote_plans) - { - int nremote_plans = list_length(plan->remote_plans); - - for (i = 0; i < nremote_plans; i++) - { - RemoteQuery *rq = (RemoteQuery *)list_nth(plan->remote_plans, i); + if (IS_PGXC_COORDINATOR) + { + EState *state = NULL; + ResponseCombiner *combiner; + ModifyTable *plan = (ModifyTable *)node->ps.plan; - combiner = (ResponseCombiner *) node->mt_remoterels[i]; + if (plan->remote_plans) + { + int nremote_plans = list_length(plan->remote_plans); + + for (i = 0; i < nremote_plans; i++) + { + RemoteQuery *rq = (RemoteQuery *)list_nth(plan->remote_plans, i); + + combiner = (ResponseCombiner *) node->mt_remoterels[i]; + state = combiner->ss.ps.state; + ExecEndNode(node->mt_remoterels[i]); - ExecEndNode(node->mt_remoterels[i]); + DropRemoteDMLStatement(rq->statement, rq->update_cursor); + } - DropRemoteDMLStatement(rq->statement, rq->update_cursor); - } - } - } + FreeExecutorState(state); + } + } #endif /* diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index 9fd805a8..4495ef70 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -2312,7 +2312,6 @@ grouping_planner(PlannerInfo *root, bool inheritance_update, path = adjust_path_distribution(root, parse, path); #ifdef __TBASE__ -<<<<<<< HEAD /* * unshippable triggers found on target relation, we have to do DML * on coordinator. @@ -2325,43 +2324,6 @@ grouping_planner(PlannerInfo *root, bool inheritance_update, } } #endif -======= - /* - * unshippable triggers found on target relation, we have to do DML - * on coordinator. - */ - if (parse->hasUnshippableTriggers) - { - if (path->distribution) - { - path = adjust_modifytable_subpath(root, parse, path); - } - } -#endif - - path = (Path *) - create_modifytable_path(root, final_rel, - parse->commandType, - parse->canSetTag, - parse->resultRelation, - NIL, - list_make1_int(parse->resultRelation), - list_make1(path), - list_make1(root), - withCheckOptionLists, - returningLists, - rowMarks, - parse->onConflict, - SS_assign_special_param(root)); - } - else - /* Adjust path by injecting a remote subplan, if appropriate. */ - path = adjust_path_distribution(root, parse, path); - - /* And shove it into final_rel */ - add_path(final_rel, path); - } ->>>>>>> d1855902... 
Remove the duplicate estate free in ExecEndModifyTable path = (Path *) create_modifytable_path(root, final_rel, diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h index 5ab4e1b5..71e853ba 100644 --- a/src/include/nodes/parsenodes.h +++ b/src/include/nodes/parsenodes.h @@ -146,12 +146,12 @@ typedef struct Query bool hasForUpdate; /* FOR [KEY] UPDATE/SHARE was specified */ bool hasRowSecurity; /* rewriter has applied some RLS policy */ #ifdef __TBASE__ - bool isSingleValues; /*for interval partition insert */ - bool isMultiValues; /* is simple insert into values (),(),()...();? */ - bool hasUnshippableTriggers; /* has unshippable triggers on resultRelation, - * only used for DML. Will be set at the plan phase - * in shippability check. - */ + bool isSingleValues; /*for interval partition insert */ + bool isMultiValues; /* is simple insert into values (),(),()...();? */ + bool hasUnshippableTriggers; /* has unshippable triggers on resultRelation, + * only used for DML. Will be set at the plan phase + * in shippability check. + */ char *copy_filename; /* fake filename for copy from */ Bitmapset *conflict_cols; #endif From ab1b2607568bfae1ebf99b4aeae59d6f9e904609 Mon Sep 17 00:00:00 2001 From: ericxwu Date: Fri, 7 Aug 2020 14:14:20 +0800 Subject: [PATCH 020/578] Remove the duplicate estate free in ExecEndModifyTable http://tapd.oa.com/TBase_Oracle_Migration/bugtrace/bugs/view?bug_id=1020421696081354083 --- src/backend/executor/nodeModifyTable.c | 39 ++++++++++++------------- src/backend/optimizer/plan/createplan.c | 16 +++++----- src/backend/optimizer/plan/planner.c | 6 ++-- 3 files changed, 29 insertions(+), 32 deletions(-) diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c index 9d6aaae4..659f15b1 100644 --- a/src/backend/executor/nodeModifyTable.c +++ b/src/backend/executor/nodeModifyTable.c @@ -3204,30 +3204,27 @@ ExecEndModifyTable(ModifyTableState *node) } #ifdef __TBASE__ - if (IS_PGXC_COORDINATOR) - { - EState *state = NULL; - ResponseCombiner *combiner; - ModifyTable *plan = (ModifyTable *)node->ps.plan; + if (IS_PGXC_COORDINATOR) + { + ResponseCombiner *combiner; + ModifyTable *plan = (ModifyTable *)node->ps.plan; - if (plan->remote_plans) - { - int nremote_plans = list_length(plan->remote_plans); - - for (i = 0; i < nremote_plans; i++) - { - RemoteQuery *rq = (RemoteQuery *)list_nth(plan->remote_plans, i); - - combiner = (ResponseCombiner *) node->mt_remoterels[i]; - state = combiner->ss.ps.state; - ExecEndNode(node->mt_remoterels[i]); + if (plan->remote_plans) + { + int nremote_plans = list_length(plan->remote_plans); - DropRemoteDMLStatement(rq->statement, rq->update_cursor); - } + for (i = 0; i < nremote_plans; i++) + { + RemoteQuery *rq = (RemoteQuery *)list_nth(plan->remote_plans, i); - FreeExecutorState(state); - } - } + combiner = (ResponseCombiner *) node->mt_remoterels[i]; + + ExecEndNode(node->mt_remoterels[i]); + + DropRemoteDMLStatement(rq->statement, rq->update_cursor); + } + } + } #endif /* diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index d4e31d51..23d04153 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -3026,14 +3026,14 @@ create_modifytable_plan(PlannerInfo *root, ModifyTablePath *best_path) copy_generic_path_info(&plan->plan, &best_path->path); #ifdef __TBASE__ - /* - * If we have unshippable triggers, we have to do DML on coordinators, - * generate remote_dml plan now. 
- */ - if (root->parse->hasUnshippableTriggers) - { - create_remotedml_plan(root, (Plan *)plan, plan->operation); - } + /* + * If we have unshippable triggers, we have to do DML on coordinators, + * generate remote_dml plan now. + */ + if (root->parse->hasUnshippableTriggers) + { + create_remotedml_plan(root, (Plan *)plan, plan->operation); + } #endif return plan; } diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index 4495ef70..b5e70cc0 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -2313,9 +2313,9 @@ grouping_planner(PlannerInfo *root, bool inheritance_update, #ifdef __TBASE__ /* - * unshippable triggers found on target relation, we have to do DML - * on coordinator. - */ + * unshippable triggers found on target relation, we have to do DML + * on coordinator. + */ if (parse->hasUnshippableTriggers) { if (path->distribution) From 952a2a4f440a9a823ac2bcd5dc57567d3cb1edb1 Mon Sep 17 00:00:00 2001 From: ericxwu Date: Tue, 11 Aug 2020 15:41:34 +0800 Subject: [PATCH 021/578] Split QTW_EXAMINE_RTES flag into QTW_EXAMINE_RTES_BEFORE/_AFTER. This change allows callers of query_tree_walker() to choose whether to visit an RTE before or after visiting the contents of the RTE (i.e., prefix or postfix tree order). All existing users of QTW_EXAMINE_RTES want the QTW_EXAMINE_RTES_BEFORE behavior, but an upcoming patch will want QTW_EXAMINE_RTES_AFTER, and it seems like a potentially useful change on its own. Andreas Karlsson (extracted from CTE inlining patch) Discussion: https://postgr.es/m/8810.1542402910@sss.pgh.pa.us TBASE: Reset the definitions of MLS query_tree_walker flag to avoid conflicts with postgres --- src/backend/nodes/nodeFuncs.c | 112 +++++++++-------- src/backend/optimizer/plan/setrefs.c | 52 ++++---- src/backend/rewrite/rewriteManip.c | 172 +++++++++++++-------------- src/include/nodes/nodeFuncs.h | 21 ++-- 4 files changed, 184 insertions(+), 173 deletions(-) diff --git a/src/backend/nodes/nodeFuncs.c b/src/backend/nodes/nodeFuncs.c index 25186782..47ebb30d 100644 --- a/src/backend/nodes/nodeFuncs.c +++ b/src/backend/nodes/nodeFuncs.c @@ -2258,7 +2258,7 @@ expression_tree_walker(Node *node, * Some callers want to suppress visitation of certain items in the sub-Query, * typically because they need to process them specially, or don't actually * want to recurse into subqueries. This is supported by the flags argument, - * which is the bitwise OR of flag values to suppress visitation of + * which is the bitwise OR of flag values to add or suppress visitation of * indicated items. (More flag bits may be added as needed.) 
*/ bool @@ -2320,53 +2320,57 @@ query_tree_walker(Query *query, */ bool range_table_walker(List *rtable, - bool (*walker) (), - void *context, - int flags) -{// #lizard forgives - ListCell *rt; - - foreach(rt, rtable) - { - RangeTblEntry *rte = (RangeTblEntry *) lfirst(rt); - - /* For historical reasons, visiting RTEs is not the default */ - if (flags & QTW_EXAMINE_RTES) - if (walker(rte, context)) - return true; - - switch (rte->rtekind) - { - case RTE_RELATION: - if (walker(rte->tablesample, context)) - return true; - break; - case RTE_CTE: - case RTE_NAMEDTUPLESTORE: - /* nothing to do */ - break; - case RTE_SUBQUERY: - if (!(flags & QTW_IGNORE_RT_SUBQUERIES)) - if (walker(rte->subquery, context)) - return true; - break; - case RTE_JOIN: - if (!(flags & QTW_IGNORE_JOINALIASES)) - if (walker(rte->joinaliasvars, context)) - return true; - break; - case RTE_FUNCTION: - if (walker(rte->functions, context)) - return true; - break; - case RTE_TABLEFUNC: - if (walker(rte->tablefunc, context)) - return true; - break; - case RTE_VALUES: - if (walker(rte->values_lists, context)) - return true; - break; + bool (*walker) (), + void *context, + int flags) +{ + ListCell *rt; + + foreach(rt, rtable) + { + RangeTblEntry *rte = (RangeTblEntry *) lfirst(rt); + + /* + * Walkers might need to examine the RTE node itself either before or + * after visiting its contents (or, conceivably, both). Note that if + * you specify neither flag, the walker won't visit the RTE at all. + */ + if (flags & QTW_EXAMINE_RTES_BEFORE) + if (walker(rte, context)) + return true; + + switch (rte->rtekind) + { + case RTE_RELATION: + if (walker(rte->tablesample, context)) + return true; + break; + case RTE_CTE: + case RTE_NAMEDTUPLESTORE: + /* nothing to do */ + break; + case RTE_SUBQUERY: + if (!(flags & QTW_IGNORE_RT_SUBQUERIES)) + if (walker(rte->subquery, context)) + return true; + break; + case RTE_JOIN: + if (!(flags & QTW_IGNORE_JOINALIASES)) + if (walker(rte->joinaliasvars, context)) + return true; + break; + case RTE_FUNCTION: + if (walker(rte->functions, context)) + return true; + break; + case RTE_TABLEFUNC: + if (walker(rte->tablefunc, context)) + return true; + break; + case RTE_VALUES: + if (walker(rte->values_lists, context)) + return true; + break; #ifdef PGXC case RTE_REMOTE_DUMMY: elog(ERROR, "Invalid RTE found."); @@ -2374,10 +2378,14 @@ range_table_walker(List *rtable, #endif /* PGXC */ } - if (walker(rte->securityQuals, context)) - return true; - } - return false; + if (walker(rte->securityQuals, context)) + return true; + + if (flags & QTW_EXAMINE_RTES_AFTER) + if (walker(rte, context)) + return true; + } + return false; } diff --git a/src/backend/optimizer/plan/setrefs.c b/src/backend/optimizer/plan/setrefs.c index 460a458b..175058f9 100644 --- a/src/backend/optimizer/plan/setrefs.c +++ b/src/backend/optimizer/plan/setrefs.c @@ -352,37 +352,37 @@ add_rtes_to_flat_rtable(PlannerInfo *root, bool recursing) static void flatten_unplanned_rtes(PlannerGlobal *glob, RangeTblEntry *rte) { - /* Use query_tree_walker to find all RTEs in the parse tree */ - (void) query_tree_walker(rte->subquery, - flatten_rtes_walker, - (void *) glob, - QTW_EXAMINE_RTES); + /* Use query_tree_walker to find all RTEs in the parse tree */ + (void) query_tree_walker(rte->subquery, + flatten_rtes_walker, + (void *) glob, + QTW_EXAMINE_RTES_BEFORE); } static bool flatten_rtes_walker(Node *node, PlannerGlobal *glob) { - if (node == NULL) - return false; - if (IsA(node, RangeTblEntry)) - { - RangeTblEntry *rte = (RangeTblEntry *) node; - - 
/* As above, we need only save relation RTEs */ - if (rte->rtekind == RTE_RELATION) - add_rte_to_flat_rtable(glob, rte); - return false; - } - if (IsA(node, Query)) - { - /* Recurse into subselects */ - return query_tree_walker((Query *) node, - flatten_rtes_walker, - (void *) glob, - QTW_EXAMINE_RTES); - } - return expression_tree_walker(node, flatten_rtes_walker, - (void *) glob); + if (node == NULL) + return false; + if (IsA(node, RangeTblEntry)) + { + RangeTblEntry *rte = (RangeTblEntry *) node; + + /* As above, we need only save relation RTEs */ + if (rte->rtekind == RTE_RELATION) + add_rte_to_flat_rtable(glob, rte); + return false; + } + if (IsA(node, Query)) + { + /* Recurse into subselects */ + return query_tree_walker((Query *) node, + flatten_rtes_walker, + (void *) glob, + QTW_EXAMINE_RTES_BEFORE); + } + return expression_tree_walker(node, flatten_rtes_walker, + (void *) glob); } /* diff --git a/src/backend/rewrite/rewriteManip.c b/src/backend/rewrite/rewriteManip.c index 099fe968..09b14d4e 100644 --- a/src/backend/rewrite/rewriteManip.c +++ b/src/backend/rewrite/rewriteManip.c @@ -698,94 +698,94 @@ typedef struct static bool IncrementVarSublevelsUp_walker(Node *node, - IncrementVarSublevelsUp_context *context) -{// #lizard forgives - if (node == NULL) - return false; - if (IsA(node, Var)) - { - Var *var = (Var *) node; - - if (var->varlevelsup >= context->min_sublevels_up) - var->varlevelsup += context->delta_sublevels_up; - return false; /* done here */ - } - if (IsA(node, CurrentOfExpr)) - { - /* this should not happen */ - if (context->min_sublevels_up == 0) - elog(ERROR, "cannot push down CurrentOfExpr"); - return false; - } - if (IsA(node, Aggref)) - { - Aggref *agg = (Aggref *) node; - - if (agg->agglevelsup >= context->min_sublevels_up) - agg->agglevelsup += context->delta_sublevels_up; - /* fall through to recurse into argument */ - } - if (IsA(node, GroupingFunc)) - { - GroupingFunc *grp = (GroupingFunc *) node; - - if (grp->agglevelsup >= context->min_sublevels_up) - grp->agglevelsup += context->delta_sublevels_up; - /* fall through to recurse into argument */ - } - if (IsA(node, PlaceHolderVar)) - { - PlaceHolderVar *phv = (PlaceHolderVar *) node; - - if (phv->phlevelsup >= context->min_sublevels_up) - phv->phlevelsup += context->delta_sublevels_up; - /* fall through to recurse into argument */ - } - if (IsA(node, RangeTblEntry)) - { - RangeTblEntry *rte = (RangeTblEntry *) node; - - if (rte->rtekind == RTE_CTE) - { - if (rte->ctelevelsup >= context->min_sublevels_up) - rte->ctelevelsup += context->delta_sublevels_up; - } - return false; /* allow range_table_walker to continue */ - } - if (IsA(node, Query)) - { - /* Recurse into subselects */ - bool result; - - context->min_sublevels_up++; - result = query_tree_walker((Query *) node, - IncrementVarSublevelsUp_walker, - (void *) context, - QTW_EXAMINE_RTES); - context->min_sublevels_up--; - return result; - } - return expression_tree_walker(node, IncrementVarSublevelsUp_walker, - (void *) context); + IncrementVarSublevelsUp_context *context) +{ + if (node == NULL) + return false; + if (IsA(node, Var)) + { + Var *var = (Var *) node; + + if (var->varlevelsup >= context->min_sublevels_up) + var->varlevelsup += context->delta_sublevels_up; + return false; /* done here */ + } + if (IsA(node, CurrentOfExpr)) + { + /* this should not happen */ + if (context->min_sublevels_up == 0) + elog(ERROR, "cannot push down CurrentOfExpr"); + return false; + } + if (IsA(node, Aggref)) + { + Aggref *agg = (Aggref *) node; + + if 
(agg->agglevelsup >= context->min_sublevels_up) + agg->agglevelsup += context->delta_sublevels_up; + /* fall through to recurse into argument */ + } + if (IsA(node, GroupingFunc)) + { + GroupingFunc *grp = (GroupingFunc *) node; + + if (grp->agglevelsup >= context->min_sublevels_up) + grp->agglevelsup += context->delta_sublevels_up; + /* fall through to recurse into argument */ + } + if (IsA(node, PlaceHolderVar)) + { + PlaceHolderVar *phv = (PlaceHolderVar *) node; + + if (phv->phlevelsup >= context->min_sublevels_up) + phv->phlevelsup += context->delta_sublevels_up; + /* fall through to recurse into argument */ + } + if (IsA(node, RangeTblEntry)) + { + RangeTblEntry *rte = (RangeTblEntry *) node; + + if (rte->rtekind == RTE_CTE) + { + if (rte->ctelevelsup >= context->min_sublevels_up) + rte->ctelevelsup += context->delta_sublevels_up; + } + return false; /* allow range_table_walker to continue */ + } + if (IsA(node, Query)) + { + /* Recurse into subselects */ + bool result; + + context->min_sublevels_up++; + result = query_tree_walker((Query *) node, + IncrementVarSublevelsUp_walker, + (void *) context, + QTW_EXAMINE_RTES_BEFORE); + context->min_sublevels_up--; + return result; + } + return expression_tree_walker(node, IncrementVarSublevelsUp_walker, + (void *) context); } void IncrementVarSublevelsUp(Node *node, int delta_sublevels_up, int min_sublevels_up) { - IncrementVarSublevelsUp_context context; - - context.delta_sublevels_up = delta_sublevels_up; - context.min_sublevels_up = min_sublevels_up; - - /* - * Must be prepared to start with a Query or a bare expression tree; if - * it's a Query, we don't want to increment sublevels_up. - */ - query_or_expression_tree_walker(node, - IncrementVarSublevelsUp_walker, - (void *) &context, - QTW_EXAMINE_RTES); + IncrementVarSublevelsUp_context context; + + context.delta_sublevels_up = delta_sublevels_up; + context.min_sublevels_up = min_sublevels_up; + + /* + * Must be prepared to start with a Query or a bare expression tree; if + * it's a Query, we don't want to increment sublevels_up. 
+ */ + query_or_expression_tree_walker(node, + IncrementVarSublevelsUp_walker, + (void *) &context, + QTW_EXAMINE_RTES_BEFORE); } /* @@ -801,10 +801,10 @@ IncrementVarSublevelsUp_rtable(List *rtable, int delta_sublevels_up, context.delta_sublevels_up = delta_sublevels_up; context.min_sublevels_up = min_sublevels_up; - range_table_walker(rtable, - IncrementVarSublevelsUp_walker, - (void *) &context, - QTW_EXAMINE_RTES); + range_table_walker(rtable, + IncrementVarSublevelsUp_walker, + (void *) &context, + QTW_EXAMINE_RTES_BEFORE); } diff --git a/src/include/nodes/nodeFuncs.h b/src/include/nodes/nodeFuncs.h index 8d5c5000..cfb41c3c 100644 --- a/src/include/nodes/nodeFuncs.h +++ b/src/include/nodes/nodeFuncs.h @@ -77,16 +77,19 @@ /* flags bits for query_tree_walker and query_tree_mutator */ -#define QTW_IGNORE_RT_SUBQUERIES 0x01 /* subqueries in rtable */ -#define QTW_IGNORE_CTE_SUBQUERIES 0x02 /* subqueries in cteList */ -#define QTW_IGNORE_RC_SUBQUERIES 0x03 /* both of above */ -#define QTW_IGNORE_JOINALIASES 0x04 /* JOIN alias var lists */ -#define QTW_IGNORE_RANGE_TABLE 0x08 /* skip rangetable entirely */ -#define QTW_EXAMINE_RTES 0x10 /* examine RTEs */ -#define QTW_DONT_COPY_QUERY 0x20 /* do not copy top Query */ +#define QTW_IGNORE_RT_SUBQUERIES 0x01 /* subqueries in rtable */ +#define QTW_IGNORE_CTE_SUBQUERIES 0x02 /* subqueries in cteList */ +#define QTW_IGNORE_RC_SUBQUERIES 0x03 /* both of above */ +#define QTW_IGNORE_JOINALIASES 0x04 /* JOIN alias var lists */ +#define QTW_IGNORE_RANGE_TABLE 0x08 /* skip rangetable entirely */ +#define QTW_EXAMINE_RTES_BEFORE 0x10 /* examine RTE nodes before their + * contents */ +#define QTW_EXAMINE_RTES_AFTER 0x20 /* examine RTE nodes after their + * contents */ +#define QTW_DONT_COPY_QUERY 0x40 /* do not copy top Query */ #ifdef _MLS_ -#define QTW_IGNORE_TARGET_LIST 0x40 /* skip target list */ -#define QTW_IGNORE_RETURNING_LIST 0x80 /* skip returning list */ +#define QTW_IGNORE_TARGET_LIST 0x0100 /* skip target list */ +#define QTW_IGNORE_RETURNING_LIST 0x0200 /* skip returning list */ #endif From 97ba77f9d3185ebd21ce59c6c3283ff591650ea5 Mon Sep 17 00:00:00 2001 From: ericxwu Date: Thu, 13 Aug 2020 12:48:14 +0800 Subject: [PATCH 022/578] Allow user control of CTE materialization, and change the default behavior. Historically we've always materialized the full output of a CTE query, treating WITH as an optimization fence (so that, for example, restrictions from the outer query cannot be pushed into it). This is appropriate when the CTE query is INSERT/UPDATE/DELETE, or is recursive; but when the CTE query is non-recursive and side-effect-free, there's no hazard of changing the query results by pushing restrictions down. Another argument for materialization is that it can avoid duplicate computation of an expensive WITH query --- but that only applies if the WITH query is called more than once in the outer query. Even then it could still be a net loss, if each call has restrictions that would allow just a small part of the WITH query to be computed. Hence, let's change the behavior for WITH queries that are non-recursive and side-effect-free. By default, we will inline them into the outer query (removing the optimization fence) if they are called just once. If they are called more than once, we will keep the old behavior by default, but the user can override this and force inlining by specifying NOT MATERIALIZED. 
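
For example, assuming a hypothetical table t with columns key and ref and an
index on key (a sketch of the new syntax only, not taken from the regression
tests in this patch):

    -- referenced once: folded into the outer query by default,
    -- so the index on key can be used for key = 123
    WITH w AS (SELECT * FROM t)
    SELECT * FROM w WHERE key = 123;

    -- referenced more than once: materialized by default, but
    -- NOT MATERIALIZED forces it to be inlined into both references
    WITH w AS NOT MATERIALIZED (SELECT * FROM t)
    SELECT * FROM w AS w1 JOIN w AS w2 ON w1.key = w2.ref
    WHERE w2.key = 123;

In the second query, inlining lets the restriction on w2.key reach the scans
of t, at the cost of computing the WITH query once per reference.
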
Lastly, the user can force the old behavior by specifying MATERIALIZED; this would mainly be useful when the query had deliberately been employing WITH as an optimization fence to prevent a poor choice of plan. Andreas Karlsson, Andrew Gierth, David Fetter Discussion: https://postgr.es/m/87sh48ffhb.fsf@news-spur.riddles.org.uk --- .../postgres_fdw/expected/postgres_fdw.out | 4 +- contrib/postgres_fdw/sql/postgres_fdw.sql | 4 +- doc/src/sgml/queries.sgml | 84 +- doc/src/sgml/ref/select.sgml | 55 +- src/backend/nodes/copyfuncs.c | 21 +- src/backend/nodes/equalfuncs.c | 21 +- src/backend/nodes/outfuncs.c | 21 +- src/backend/nodes/readfuncs.c | 21 +- src/backend/optimizer/plan/planner.c | 314 +- src/backend/optimizer/plan/subselect.c | 293 +- src/backend/parser/gram.y | 14 +- src/backend/utils/adt/ruleutils.c | 25864 ++++++++-------- src/include/nodes/parsenodes.h | 36 +- src/test/regress/expected/foreign_key_2.out | 13 +- src/test/regress/expected/rowsecurity.out | 4 +- src/test/regress/expected/rowsecurity_1.out | 9 +- src/test/regress/expected/rowtypes.out | 4 +- src/test/regress/expected/rowtypes_1.out | 4 +- src/test/regress/expected/rules.out | 5 +- src/test/regress/expected/subselect.out | 270 +- src/test/regress/expected/xc_for_update_1.out | 34 +- src/test/regress/sql/rowsecurity.sql | 9 +- src/test/regress/sql/rowtypes.sql | 4 +- src/test/regress/sql/rules.sql | 5 +- src/test/regress/sql/subselect.sql | 93 + 25 files changed, 13936 insertions(+), 13270 deletions(-) diff --git a/contrib/postgres_fdw/expected/postgres_fdw.out b/contrib/postgres_fdw/expected/postgres_fdw.out index c19b3318..77a6e2ce 100644 --- a/contrib/postgres_fdw/expected/postgres_fdw.out +++ b/contrib/postgres_fdw/expected/postgres_fdw.out @@ -1868,7 +1868,7 @@ SELECT t1.c1, t2.c1 FROM ft1 t1 JOIN ft2 t2 ON (t1.c1 = t2.c1) ORDER BY t1.c3, t -- join in CTE EXPLAIN (VERBOSE, COSTS OFF) -WITH t (c1_1, c1_3, c2_1) AS (SELECT t1.c1, t1.c3, t2.c1 FROM ft1 t1 JOIN ft2 t2 ON (t1.c1 = t2.c1)) SELECT c1_1, c2_1 FROM t ORDER BY c1_3, c1_1 OFFSET 100 LIMIT 10; +WITH t (c1_1, c1_3, c2_1) AS MATERIALIZED (SELECT t1.c1, t1.c3, t2.c1 FROM ft1 t1 JOIN ft2 t2 ON (t1.c1 = t2.c1)) SELECT c1_1, c2_1 FROM t ORDER BY c1_3, c1_1 OFFSET 100 LIMIT 10; QUERY PLAN ------------------------------------------------------------------------------------------------------------------------------------- Limit @@ -1885,7 +1885,7 @@ WITH t (c1_1, c1_3, c2_1) AS (SELECT t1.c1, t1.c3, t2.c1 FROM ft1 t1 JOIN ft2 t2 Output: t.c1_1, t.c2_1, t.c1_3 (12 rows) -WITH t (c1_1, c1_3, c2_1) AS (SELECT t1.c1, t1.c3, t2.c1 FROM ft1 t1 JOIN ft2 t2 ON (t1.c1 = t2.c1)) SELECT c1_1, c2_1 FROM t ORDER BY c1_3, c1_1 OFFSET 100 LIMIT 10; +WITH t (c1_1, c1_3, c2_1) AS MATERIALIZED (SELECT t1.c1, t1.c3, t2.c1 FROM ft1 t1 JOIN ft2 t2 ON (t1.c1 = t2.c1)) SELECT c1_1, c2_1 FROM t ORDER BY c1_3, c1_1 OFFSET 100 LIMIT 10; c1_1 | c2_1 ------+------ 101 | 101 diff --git a/contrib/postgres_fdw/sql/postgres_fdw.sql b/contrib/postgres_fdw/sql/postgres_fdw.sql index 5f65d9d9..5048bff6 100644 --- a/contrib/postgres_fdw/sql/postgres_fdw.sql +++ b/contrib/postgres_fdw/sql/postgres_fdw.sql @@ -495,8 +495,8 @@ SELECT t1.c1, t2.c1 FROM ft1 t1 JOIN ft2 t2 ON (t1.c1 = t2.c1) ORDER BY t1.c3, t SELECT t1.c1, t2.c1 FROM ft1 t1 JOIN ft2 t2 ON (t1.c1 = t2.c1) ORDER BY t1.c3, t1.c1 OFFSET 100 LIMIT 10 FOR SHARE; -- join in CTE EXPLAIN (VERBOSE, COSTS OFF) -WITH t (c1_1, c1_3, c2_1) AS (SELECT t1.c1, t1.c3, t2.c1 FROM ft1 t1 JOIN ft2 t2 ON (t1.c1 = t2.c1)) SELECT c1_1, c2_1 FROM t ORDER BY c1_3, c1_1 
OFFSET 100 LIMIT 10; -WITH t (c1_1, c1_3, c2_1) AS (SELECT t1.c1, t1.c3, t2.c1 FROM ft1 t1 JOIN ft2 t2 ON (t1.c1 = t2.c1)) SELECT c1_1, c2_1 FROM t ORDER BY c1_3, c1_1 OFFSET 100 LIMIT 10; +WITH t (c1_1, c1_3, c2_1) AS MATERIALIZED (SELECT t1.c1, t1.c3, t2.c1 FROM ft1 t1 JOIN ft2 t2 ON (t1.c1 = t2.c1)) SELECT c1_1, c2_1 FROM t ORDER BY c1_3, c1_1 OFFSET 100 LIMIT 10; +WITH t (c1_1, c1_3, c2_1) AS MATERIALIZED (SELECT t1.c1, t1.c3, t2.c1 FROM ft1 t1 JOIN ft2 t2 ON (t1.c1 = t2.c1)) SELECT c1_1, c2_1 FROM t ORDER BY c1_3, c1_1 OFFSET 100 LIMIT 10; -- ctid with whole-row reference EXPLAIN (VERBOSE, COSTS OFF) SELECT t1.ctid, t1, t2, t1.c1 FROM ft1 t1 JOIN ft2 t2 ON (t1.c1 = t2.c1) ORDER BY t1.c3, t1.c1 OFFSET 100 LIMIT 10; diff --git a/doc/src/sgml/queries.sgml b/doc/src/sgml/queries.sgml index 0588da29..0ba7085f 100644 --- a/doc/src/sgml/queries.sgml +++ b/doc/src/sgml/queries.sgml @@ -2195,22 +2195,94 @@ SELECT n FROM t LIMIT 100; - A useful property of WITH queries is that they are evaluated - only once per execution of the parent query, even if they are referred to - more than once by the parent query or sibling WITH queries. + A useful property of WITH queries is that they are + normally evaluated only once per execution of the parent query, even if + they are referred to more than once by the parent query or + sibling WITH queries. Thus, expensive calculations that are needed in multiple places can be placed within a WITH query to avoid redundant work. Another possible application is to prevent unwanted multiple evaluations of functions with side-effects. - However, the other side of this coin is that the optimizer is less able to - push restrictions from the parent query down into a WITH query - than an ordinary subquery. The WITH query will generally be + However, the other side of this coin is that the optimizer is not able to + push restrictions from the parent query down into a multiply-referenced + WITH query, since that might affect all uses of the + WITH query's output when it should affect only one. + The multiply-referenced WITH query will be evaluated as written, without suppression of rows that the parent query might discard afterwards. (But, as mentioned above, evaluation might stop early if the reference(s) to the query demand only a limited number of rows.) + + However, if a WITH query is non-recursive and + side-effect-free (that is, it is a SELECT containing + no volatile functions) then it can be folded into the parent query, + allowing joint optimization of the two query levels. By default, this + happens if the parent query references the WITH query + just once, but not if it references the WITH query + more than once. You can override that decision by + specifying MATERIALIZED to force separate calculation + of the WITH query, or by specifying NOT + MATERIALIZED to force it to be merged into the parent query. + The latter choice risks duplicate computation of + the WITH query, but it can still give a net savings if + each usage of the WITH query needs only a small part + of the WITH query's full output. + + + + A simple example of these rules is + +WITH w AS ( + SELECT * FROM big_table +) +SELECT * FROM w WHERE key = 123; + + This WITH query will be folded, producing the same + execution plan as + +SELECT * FROM big_table WHERE key = 123; + + In particular, if there's an index on key, + it will probably be used to fetch just the rows having key = + 123. 
On the other hand, in + +WITH w AS ( + SELECT * FROM big_table +) +SELECT * FROM w AS w1 JOIN w AS w2 ON w1.key = w2.ref +WHERE w2.key = 123; + + the WITH query will be materialized, producing a + temporary copy of big_table that is then + joined with itself — without benefit of any index. This query + will be executed much more efficiently if written as + +WITH w AS NOT MATERIALIZED ( + SELECT * FROM big_table +) +SELECT * FROM w AS w1 JOIN w AS w2 ON w1.key = w2.ref +WHERE w2.key = 123; + + so that the parent query's restrictions can be applied directly + to scans of big_table. + + + + An example where NOT MATERIALIZED could be + undesirable is + +WITH w AS ( + SELECT key, very_expensive_function(val) as f FROM some_table +) +SELECT * FROM w AS w1 JOIN w AS w2 ON w1.f = w2.f; + + Here, materialization of the WITH query ensures + that very_expensive_function is evaluated only + once per table row, not twice. + + The examples above only show WITH being used with SELECT, but it can be attached in the same way to diff --git a/doc/src/sgml/ref/select.sgml b/doc/src/sgml/ref/select.sgml index 57f11e66..17172c05 100644 --- a/doc/src/sgml/ref/select.sgml +++ b/doc/src/sgml/ref/select.sgml @@ -72,7 +72,7 @@ SELECT [ ALL | DISTINCT [ ON ( expressionand with_query is: - with_query_name [ ( column_name [, ...] ) ] AS ( select | values | insert | update | delete ) + with_query_name [ ( column_name [, ...] ) ] AS [ [ NOT ] MATERIALIZED ] ( select | values | insert | update | delete ) TABLE [ ONLY ] table_name [ * ] @@ -94,6 +94,7 @@ TABLE [ ONLY ] table_name [ * ] in the FROM list. A WITH query that is referenced more than once in FROM is computed only once. + unless specified otherwise with NOT MATERIALIZED. (See below.) @@ -272,9 +273,18 @@ TABLE [ ONLY ] table_name [ * ] that are earlier in the WITH list. + + The primary query and the WITH queries are all + (notionally) executed at the same time. This implies that the effects of + a data-modifying statement in WITH cannot be seen from + other parts of the query, other than by reading its RETURNING + output. If two such data-modifying statements attempt to modify the same + row, the results are unspecified. + + A key property of WITH queries is that they - are evaluated only once per execution of the primary query, + are normally evaluated only once per execution of the primary query, even if the primary query refers to them more than once. In particular, data-modifying statements are guaranteed to be executed once and only once, regardless of whether the primary query @@ -282,12 +292,35 @@ TABLE [ ONLY ] table_name [ * ] - The primary query and the WITH queries are all - (notionally) executed at the same time. This implies that the effects of - a data-modifying statement in WITH cannot be seen from - other parts of the query, other than by reading its RETURNING - output. If two such data-modifying statements attempt to modify the same - row, the results are unspecified. + However, a WITH query can be marked + NOT MATERIALIZED to remove this guarantee. In that + case, the WITH query can be folded into the primary + query much as though it were a simple sub-SELECT in + the primary query's FROM clause. This results in + duplicate computations if the primary query refers to + that WITH query more than once; but if each such use + requires only a few rows of the WITH query's total + output, NOT MATERIALIZED can provide a net savings by + allowing the queries to be optimized jointly. 
+ NOT MATERIALIZED is ignored if it is attached to + a WITH query that is recursive or is not + side-effect-free (i.e., is not a plain SELECT + containing no volatile functions). + + + + By default, a side-effect-free WITH query is folded + into the primary query if it is used exactly once in the primary + query's FROM clause. This allows joint optimization + of the two query levels in situations where that should be semantically + invisible. However, such folding can be prevented by marking the + WITH query as MATERIALIZED. + That might be useful, for example, if the WITH query + is being used as an optimization fence to prevent the planner from + choosing a bad plan. + PostgreSQL versions before v12 never did + such folding, so queries written for older versions might rely on + WITH to act as an optimization fence. @@ -2046,6 +2079,12 @@ SELECT distributors.* WHERE distributors.name = 'Westward'; ROWS FROM( ... ) is an extension of the SQL standard. + + + The MATERIALIZED and NOT + MATERIALIZED options of WITH are extensions + of the SQL standard. + diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c index 06c7bdf4..8a447982 100644 --- a/src/backend/nodes/copyfuncs.c +++ b/src/backend/nodes/copyfuncs.c @@ -2747,16 +2747,17 @@ _copyCommonTableExpr(const CommonTableExpr *from) { CommonTableExpr *newnode = makeNode(CommonTableExpr); - COPY_STRING_FIELD(ctename); - COPY_NODE_FIELD(aliascolnames); - COPY_NODE_FIELD(ctequery); - COPY_LOCATION_FIELD(location); - COPY_SCALAR_FIELD(cterecursive); - COPY_SCALAR_FIELD(cterefcount); - COPY_NODE_FIELD(ctecolnames); - COPY_NODE_FIELD(ctecoltypes); - COPY_NODE_FIELD(ctecoltypmods); - COPY_NODE_FIELD(ctecolcollations); + COPY_STRING_FIELD(ctename); + COPY_NODE_FIELD(aliascolnames); + COPY_SCALAR_FIELD(ctematerialized); + COPY_NODE_FIELD(ctequery); + COPY_LOCATION_FIELD(location); + COPY_SCALAR_FIELD(cterecursive); + COPY_SCALAR_FIELD(cterefcount); + COPY_NODE_FIELD(ctecolnames); + COPY_NODE_FIELD(ctecoltypes); + COPY_NODE_FIELD(ctecoltypmods); + COPY_NODE_FIELD(ctecolcollations); return newnode; } diff --git a/src/backend/nodes/equalfuncs.c b/src/backend/nodes/equalfuncs.c index ccd20de5..3dbcb393 100644 --- a/src/backend/nodes/equalfuncs.c +++ b/src/backend/nodes/equalfuncs.c @@ -2850,16 +2850,17 @@ _equalOnConflictClause(const OnConflictClause *a, const OnConflictClause *b) static bool _equalCommonTableExpr(const CommonTableExpr *a, const CommonTableExpr *b) { - COMPARE_STRING_FIELD(ctename); - COMPARE_NODE_FIELD(aliascolnames); - COMPARE_NODE_FIELD(ctequery); - COMPARE_LOCATION_FIELD(location); - COMPARE_SCALAR_FIELD(cterecursive); - COMPARE_SCALAR_FIELD(cterefcount); - COMPARE_NODE_FIELD(ctecolnames); - COMPARE_NODE_FIELD(ctecoltypes); - COMPARE_NODE_FIELD(ctecoltypmods); - COMPARE_NODE_FIELD(ctecolcollations); + COMPARE_STRING_FIELD(ctename); + COMPARE_NODE_FIELD(aliascolnames); + COMPARE_SCALAR_FIELD(ctematerialized); + COMPARE_NODE_FIELD(ctequery); + COMPARE_LOCATION_FIELD(location); + COMPARE_SCALAR_FIELD(cterecursive); + COMPARE_SCALAR_FIELD(cterefcount); + COMPARE_NODE_FIELD(ctecolnames); + COMPARE_NODE_FIELD(ctecoltypes); + COMPARE_NODE_FIELD(ctecoltypmods); + COMPARE_NODE_FIELD(ctecolcollations); return true; } diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c index fb063aa2..7d7a9704 100644 --- a/src/backend/nodes/outfuncs.c +++ b/src/backend/nodes/outfuncs.c @@ -4377,16 +4377,17 @@ _outCommonTableExpr(StringInfo str, const CommonTableExpr *node) { WRITE_NODE_TYPE("COMMONTABLEEXPR"); - 
WRITE_STRING_FIELD(ctename); - WRITE_NODE_FIELD(aliascolnames); - WRITE_NODE_FIELD(ctequery); - WRITE_LOCATION_FIELD(location); - WRITE_BOOL_FIELD(cterecursive); - WRITE_INT_FIELD(cterefcount); - WRITE_NODE_FIELD(ctecolnames); - WRITE_NODE_FIELD(ctecoltypes); - WRITE_NODE_FIELD(ctecoltypmods); - WRITE_NODE_FIELD(ctecolcollations); + WRITE_STRING_FIELD(ctename); + WRITE_NODE_FIELD(aliascolnames); + WRITE_ENUM_FIELD(ctematerialized, CTEMaterialize); + WRITE_NODE_FIELD(ctequery); + WRITE_LOCATION_FIELD(location); + WRITE_BOOL_FIELD(cterecursive); + WRITE_INT_FIELD(cterefcount); + WRITE_NODE_FIELD(ctecolnames); + WRITE_NODE_FIELD(ctecoltypes); + WRITE_NODE_FIELD(ctecoltypmods); + WRITE_NODE_FIELD(ctecolcollations); } static void diff --git a/src/backend/nodes/readfuncs.c b/src/backend/nodes/readfuncs.c index 2c886d9f..7207a98c 100644 --- a/src/backend/nodes/readfuncs.c +++ b/src/backend/nodes/readfuncs.c @@ -724,16 +724,17 @@ _readCommonTableExpr(void) { READ_LOCALS(CommonTableExpr); - READ_STRING_FIELD(ctename); - READ_NODE_FIELD(aliascolnames); - READ_NODE_FIELD(ctequery); - READ_LOCATION_FIELD(location); - READ_BOOL_FIELD(cterecursive); - READ_INT_FIELD(cterefcount); - READ_NODE_FIELD(ctecolnames); - READ_NODE_FIELD(ctecoltypes); - READ_NODE_FIELD(ctecoltypmods); - READ_NODE_FIELD(ctecolcollations); + READ_STRING_FIELD(ctename); + READ_NODE_FIELD(aliascolnames); + READ_ENUM_FIELD(ctematerialized, CTEMaterialize); + READ_NODE_FIELD(ctequery); + READ_LOCATION_FIELD(location); + READ_BOOL_FIELD(cterecursive); + READ_INT_FIELD(cterefcount); + READ_NODE_FIELD(ctecolnames); + READ_NODE_FIELD(ctecoltypes); + READ_NODE_FIELD(ctecoltypmods); + READ_NODE_FIELD(ctecolcollations); READ_DONE(); } diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index b5e70cc0..6ed3f131 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -626,173 +626,173 @@ subquery_planner(PlannerGlobal *glob, Query *parse, #ifdef _MLS_ root->hasClsPolicy = false; #endif - root->hasInheritedTarget = false; - root->hasRecursion = hasRecursion; - if (hasRecursion) - root->wt_param_id = SS_assign_special_param(root); - else - root->wt_param_id = -1; - root->non_recursive_path = NULL; - - /* - * If there is a WITH list, process each WITH query and build an initplan - * SubPlan structure for it. - */ - if (parse->cteList) - SS_process_ctes(root); - - /* - * Look for ANY and EXISTS SubLinks in WHERE and JOIN/ON clauses, and try - * to transform them into joins. Note that this step does not descend - * into subqueries; if we pull up any subqueries below, their SubLinks are - * processed just before pulling them up. - */ - if (parse->hasSubLinks) - pull_up_sublinks(root); - - /* - * Scan the rangetable for set-returning functions, and inline them if - * possible (producing subqueries that might get pulled up next). - * Recursion issues here are handled in the same way as for SubLinks. - */ - inline_set_returning_functions(root); - - /* - * Check to see if any subqueries in the jointree can be merged into this - * query. - */ - pull_up_subqueries(root); - - /* - * If this is a simple UNION ALL query, flatten it into an appendrel. We - * do this now because it requires applying pull_up_subqueries to the leaf - * queries of the UNION ALL, which weren't touched above because they - * weren't referenced by the jointree (they will be after we do this). 
- */ - if (parse->setOperations) - flatten_simple_union_all(root); - - /* - * Detect whether any rangetable entries are RTE_JOIN kind; if not, we can - * avoid the expense of doing flatten_join_alias_vars(). Also check for - * outer joins --- if none, we can skip reduce_outer_joins(). And check - * for LATERAL RTEs, too. This must be done after we have done - * pull_up_subqueries(), of course. - */ - root->hasJoinRTEs = false; - root->hasLateralRTEs = false; - hasOuterJoins = false; - foreach(l, parse->rtable) - { - RangeTblEntry *rte = (RangeTblEntry *) lfirst(l); - - if (rte->rtekind == RTE_JOIN) - { - root->hasJoinRTEs = true; - if (IS_OUTER_JOIN(rte->jointype)) - hasOuterJoins = true; - } - if (rte->lateral) - root->hasLateralRTEs = true; - } - - /* - * Preprocess RowMark information. We need to do this after subquery - * pullup (so that all non-inherited RTEs are present) and before - * inheritance expansion (so that the info is available for - * expand_inherited_tables to examine and modify). - */ - preprocess_rowmarks(root); - - /* - * Expand any rangetable entries that are inheritance sets into "append - * relations". This can add entries to the rangetable, but they must be - * plain base relations not joins, so it's OK (and marginally more - * efficient) to do it after checking for join RTEs. We must do it after - * pulling up subqueries, else we'd fail to handle inherited tables in - * subqueries. - */ - expand_inherited_tables(root); - - /* - * Set hasHavingQual to remember if HAVING clause is present. Needed - * because preprocess_expression will reduce a constant-true condition to - * an empty qual list ... but "HAVING TRUE" is not a semantic no-op. - */ - root->hasHavingQual = (parse->havingQual != NULL); - - /* Clear this flag; might get set in distribute_qual_to_rels */ - root->hasPseudoConstantQuals = false; - - /* - * Do expression preprocessing on targetlist and quals, as well as other - * random expressions in the querytree. Note that we do not need to - * handle sort/group expressions explicitly, because they are actually - * part of the targetlist. - */ - parse->targetList = (List *) - preprocess_expression(root, (Node *) parse->targetList, - EXPRKIND_TARGET); + root->hasInheritedTarget = false; + root->hasRecursion = hasRecursion; + if (hasRecursion) + root->wt_param_id = SS_assign_special_param(root); + else + root->wt_param_id = -1; + root->non_recursive_path = NULL; + + /* + * If there is a WITH list, process each WITH query and either convert it + * to RTE_SUBQUERY RTE(s) or build an initplan SubPlan structure for it. + */ + if (parse->cteList) + SS_process_ctes(root); + + /* + * Look for ANY and EXISTS SubLinks in WHERE and JOIN/ON clauses, and try + * to transform them into joins. Note that this step does not descend + * into subqueries; if we pull up any subqueries below, their SubLinks are + * processed just before pulling them up. + */ + if (parse->hasSubLinks) + pull_up_sublinks(root); + + /* + * Scan the rangetable for set-returning functions, and inline them if + * possible (producing subqueries that might get pulled up next). + * Recursion issues here are handled in the same way as for SubLinks. + */ + inline_set_returning_functions(root); + + /* + * Check to see if any subqueries in the jointree can be merged into this + * query. + */ + pull_up_subqueries(root); + + /* + * If this is a simple UNION ALL query, flatten it into an appendrel. 
We + * do this now because it requires applying pull_up_subqueries to the leaf + * queries of the UNION ALL, which weren't touched above because they + * weren't referenced by the jointree (they will be after we do this). + */ + if (parse->setOperations) + flatten_simple_union_all(root); + + /* + * Detect whether any rangetable entries are RTE_JOIN kind; if not, we can + * avoid the expense of doing flatten_join_alias_vars(). Also check for + * outer joins --- if none, we can skip reduce_outer_joins(). And check + * for LATERAL RTEs, too. This must be done after we have done + * pull_up_subqueries(), of course. + */ + root->hasJoinRTEs = false; + root->hasLateralRTEs = false; + hasOuterJoins = false; + foreach(l, parse->rtable) + { + RangeTblEntry *rte = (RangeTblEntry *) lfirst(l); - /* Constant-folding might have removed all set-returning functions */ - if (parse->hasTargetSRFs) - parse->hasTargetSRFs = expression_returns_set((Node *) parse->targetList); + if (rte->rtekind == RTE_JOIN) + { + root->hasJoinRTEs = true; + if (IS_OUTER_JOIN(rte->jointype)) + hasOuterJoins = true; + } + if (rte->lateral) + root->hasLateralRTEs = true; + } - newWithCheckOptions = NIL; - foreach(l, parse->withCheckOptions) - { - WithCheckOption *wco = (WithCheckOption *) lfirst(l); + /* + * Preprocess RowMark information. We need to do this after subquery + * pullup (so that all non-inherited RTEs are present) and before + * inheritance expansion (so that the info is available for + * expand_inherited_tables to examine and modify). + */ + preprocess_rowmarks(root); + + /* + * Expand any rangetable entries that are inheritance sets into "append + * relations". This can add entries to the rangetable, but they must be + * plain base relations not joins, so it's OK (and marginally more + * efficient) to do it after checking for join RTEs. We must do it after + * pulling up subqueries, else we'd fail to handle inherited tables in + * subqueries. + */ + expand_inherited_tables(root); + + /* + * Set hasHavingQual to remember if HAVING clause is present. Needed + * because preprocess_expression will reduce a constant-true condition to + * an empty qual list ... but "HAVING TRUE" is not a semantic no-op. + */ + root->hasHavingQual = (parse->havingQual != NULL); + + /* Clear this flag; might get set in distribute_qual_to_rels */ + root->hasPseudoConstantQuals = false; + + /* + * Do expression preprocessing on targetlist and quals, as well as other + * random expressions in the querytree. Note that we do not need to + * handle sort/group expressions explicitly, because they are actually + * part of the targetlist. 
+ */ + parse->targetList = (List *) + preprocess_expression(root, (Node *) parse->targetList, + EXPRKIND_TARGET); + + /* Constant-folding might have removed all set-returning functions */ + if (parse->hasTargetSRFs) + parse->hasTargetSRFs = expression_returns_set((Node *) parse->targetList); + + newWithCheckOptions = NIL; + foreach(l, parse->withCheckOptions) + { + WithCheckOption *wco = (WithCheckOption *) lfirst(l); - wco->qual = preprocess_expression(root, wco->qual, - EXPRKIND_QUAL); - if (wco->qual != NULL) - newWithCheckOptions = lappend(newWithCheckOptions, wco); - } - parse->withCheckOptions = newWithCheckOptions; + wco->qual = preprocess_expression(root, wco->qual, + EXPRKIND_QUAL); + if (wco->qual != NULL) + newWithCheckOptions = lappend(newWithCheckOptions, wco); + } + parse->withCheckOptions = newWithCheckOptions; - parse->returningList = (List *) - preprocess_expression(root, (Node *) parse->returningList, - EXPRKIND_TARGET); + parse->returningList = (List *) + preprocess_expression(root, (Node *) parse->returningList, + EXPRKIND_TARGET); - preprocess_qual_conditions(root, (Node *) parse->jointree); + preprocess_qual_conditions(root, (Node *) parse->jointree); - parse->havingQual = preprocess_expression(root, parse->havingQual, - EXPRKIND_QUAL); + parse->havingQual = preprocess_expression(root, parse->havingQual, + EXPRKIND_QUAL); - foreach(l, parse->windowClause) - { - WindowClause *wc = (WindowClause *) lfirst(l); + foreach(l, parse->windowClause) + { + WindowClause *wc = (WindowClause *) lfirst(l); - /* partitionClause/orderClause are sort/group expressions */ - wc->startOffset = preprocess_expression(root, wc->startOffset, - EXPRKIND_LIMIT); - wc->endOffset = preprocess_expression(root, wc->endOffset, - EXPRKIND_LIMIT); - } + /* partitionClause/orderClause are sort/group expressions */ + wc->startOffset = preprocess_expression(root, wc->startOffset, + EXPRKIND_LIMIT); + wc->endOffset = preprocess_expression(root, wc->endOffset, + EXPRKIND_LIMIT); + } - parse->limitOffset = preprocess_expression(root, parse->limitOffset, - EXPRKIND_LIMIT); - parse->limitCount = preprocess_expression(root, parse->limitCount, - EXPRKIND_LIMIT); + parse->limitOffset = preprocess_expression(root, parse->limitOffset, + EXPRKIND_LIMIT); + parse->limitCount = preprocess_expression(root, parse->limitCount, + EXPRKIND_LIMIT); - if (parse->onConflict) - { - parse->onConflict->arbiterElems = (List *) - preprocess_expression(root, - (Node *) parse->onConflict->arbiterElems, - EXPRKIND_ARBITER_ELEM); - parse->onConflict->arbiterWhere = - preprocess_expression(root, - parse->onConflict->arbiterWhere, - EXPRKIND_QUAL); - parse->onConflict->onConflictSet = (List *) - preprocess_expression(root, - (Node *) parse->onConflict->onConflictSet, - EXPRKIND_TARGET); - parse->onConflict->onConflictWhere = - preprocess_expression(root, - parse->onConflict->onConflictWhere, - EXPRKIND_QUAL); + if (parse->onConflict) + { + parse->onConflict->arbiterElems = (List *) + preprocess_expression(root, + (Node *) parse->onConflict->arbiterElems, + EXPRKIND_ARBITER_ELEM); + parse->onConflict->arbiterWhere = + preprocess_expression(root, + parse->onConflict->arbiterWhere, + EXPRKIND_QUAL); + parse->onConflict->onConflictSet = (List *) + preprocess_expression(root, + (Node *) parse->onConflict->onConflictSet, + EXPRKIND_TARGET); + parse->onConflict->onConflictWhere = + preprocess_expression(root, + parse->onConflict->onConflictWhere, + EXPRKIND_QUAL); #ifdef _MLS_ { int rt_index; diff --git 
a/src/backend/optimizer/plan/subselect.c b/src/backend/optimizer/plan/subselect.c index 4c357f3e..3a7f8ccf 100644 --- a/src/backend/optimizer/plan/subselect.c +++ b/src/backend/optimizer/plan/subselect.c @@ -126,6 +126,14 @@ typedef struct finalize_primnode_context Bitmapset *paramids; /* Non-local PARAM_EXEC paramids found */ } finalize_primnode_context; +typedef struct inline_cte_walker_context +{ + const char *ctename; /* name and relative level of target CTE */ + int levelsup; + int refcount; /* number of remaining references */ + Query *ctequery; /* query to substitute */ +} inline_cte_walker_context; + static Node *build_subplan(PlannerInfo *root, Plan *plan, PlannerInfo *subroot, List *plan_params, @@ -144,6 +152,10 @@ static Node *convert_testexpr_mutator(Node *node, static bool subplan_is_hashable(Plan *plan); static bool testexpr_is_hashable(Node *testexpr); static bool hash_ok_operator(OpExpr *expr); +static bool contain_dml(Node *node); +static bool contain_dml_walker(Node *node, void *context); +static void inline_cte(PlannerInfo *root, CommonTableExpr *cte); +static bool inline_cte_walker(Node *node, inline_cte_walker_context *context); static bool simplify_EXISTS_query(PlannerInfo *root, Query *query); static Query *convert_EXISTS_to_ANY(PlannerInfo *root, Query *subselect, Node **testexpr, List **paramIds); @@ -1235,76 +1247,117 @@ hash_ok_operator(OpExpr *expr) /* * SS_process_ctes: process a query's WITH list * - * We plan each interesting WITH item and convert it to an initplan. + * Consider each CTE in the WITH list and either ignore it (if it's an + * unreferenced SELECT), "inline" it to create a regular sub-SELECT-in-FROM, + * or convert it to an initplan. + * * A side effect is to fill in root->cte_plan_ids with a list that * parallels root->parse->cteList and provides the subplan ID for - * each CTE's initplan. + * each CTE's initplan, or a dummy ID (-1) if we didn't make an initplan. */ void SS_process_ctes(PlannerInfo *root) { - ListCell *lc; - - Assert(root->cte_plan_ids == NIL); + ListCell *lc; - foreach(lc, root->parse->cteList) - { - CommonTableExpr *cte = (CommonTableExpr *) lfirst(lc); - CmdType cmdType = ((Query *) cte->ctequery)->commandType; - Query *subquery; - PlannerInfo *subroot; - RelOptInfo *final_rel; - Path *best_path; - Plan *plan; - SubPlan *splan; - int paramid; - - /* - * Ignore SELECT CTEs that are not actually referenced anywhere. - */ - if (cte->cterefcount == 0 && cmdType == CMD_SELECT) - { - /* Make a dummy entry in cte_plan_ids */ - root->cte_plan_ids = lappend_int(root->cte_plan_ids, -1); - continue; - } - - /* - * Copy the source Query node. Probably not necessary, but let's keep - * this similar to make_subplan. - */ - subquery = (Query *) copyObject(cte->ctequery); - - /* plan_params should not be in use in current query level */ - Assert(root->plan_params == NIL); - - /* - * Generate Paths for the CTE query. Always plan for full retrieval - * --- we don't have enough info to predict otherwise. - */ - subroot = subquery_planner(root->glob, subquery, - root, - cte->cterecursive, 0.0); - - /* - * Since the current query level doesn't yet contain any RTEs, it - * should not be possible for the CTE to have requested parameters of - * this level. - */ - if (root->plan_params) - elog(ERROR, "unexpected outer reference in CTE query"); + Assert(root->cte_plan_ids == NIL); - /* - * Select best Path and turn it into a Plan. At least for now, there - * seems no reason to postpone doing that. 
- */ - final_rel = fetch_upper_rel(subroot, UPPERREL_FINAL, NULL); - best_path = final_rel->cheapest_total_path; + foreach(lc, root->parse->cteList) + { + CommonTableExpr *cte = (CommonTableExpr *) lfirst(lc); + CmdType cmdType = ((Query *) cte->ctequery)->commandType; + Query *subquery; + PlannerInfo *subroot; + RelOptInfo *final_rel; + Path *best_path; + Plan *plan; + SubPlan *splan; + int paramid; + + /* + * Ignore SELECT CTEs that are not actually referenced anywhere. + */ + if (cte->cterefcount == 0 && cmdType == CMD_SELECT) + { + /* Make a dummy entry in cte_plan_ids */ + root->cte_plan_ids = lappend_int(root->cte_plan_ids, -1); + continue; + } - if (!subroot->distribution) - subroot->distribution = best_path->distribution; + /* + * Consider inlining the CTE (creating RTE_SUBQUERY RTE(s)) instead of + * implementing it as a separately-planned CTE. + * + * We cannot inline if any of these conditions hold: + * + * 1. The user said not to (the CTEMaterializeAlways option). + * + * 2. The CTE is recursive. + * + * 3. The CTE has side-effects; this includes either not being a plain + * SELECT, or containing volatile functions. Inlining might change + * the side-effects, which would be bad. + * + * Otherwise, we have an option whether to inline or not. That should + * always be a win if there's just a single reference, but if the CTE + * is multiply-referenced then it's unclear: inlining adds duplicate + * computations, but the ability to absorb restrictions from the outer + * query level could outweigh that. We do not have nearly enough + * information at this point to tell whether that's true, so we let + * the user express a preference. Our default behavior is to inline + * only singly-referenced CTEs, but a CTE marked CTEMaterializeNever + * will be inlined even if multiply referenced. + */ + if ((cte->ctematerialized == CTEMaterializeNever || + (cte->ctematerialized == CTEMaterializeDefault && + cte->cterefcount == 1)) && + !cte->cterecursive && + cmdType == CMD_SELECT && + !contain_dml(cte->ctequery) && + !contain_volatile_functions(cte->ctequery)) + { + inline_cte(root, cte); + /* Make a dummy entry in cte_plan_ids */ + root->cte_plan_ids = lappend_int(root->cte_plan_ids, -1); + continue; + } - plan = create_plan(subroot, best_path); + /* + * Copy the source Query node. Probably not necessary, but let's keep + * this similar to make_subplan. + */ + subquery = (Query *) copyObject(cte->ctequery); + + /* plan_params should not be in use in current query level */ + Assert(root->plan_params == NIL); + + /* + * Generate Paths for the CTE query. Always plan for full retrieval + * --- we don't have enough info to predict otherwise. + */ + subroot = subquery_planner(root->glob, subquery, + root, + cte->cterecursive, 0.0); + + /* + * Since the current query level doesn't yet contain any RTEs, it + * should not be possible for the CTE to have requested parameters of + * this level. + */ + if (root->plan_params) + elog(ERROR, "unexpected outer reference in CTE query"); + + /* + * Select best Path and turn it into a Plan. At least for now, there + * seems no reason to postpone doing that. + */ + final_rel = fetch_upper_rel(subroot, UPPERREL_FINAL, NULL); + best_path = final_rel->cheapest_total_path; + + if (!subroot->distribution) + subroot->distribution = best_path->distribution; + + plan = create_plan(subroot, best_path); #ifdef XCP /* Add a remote subplan, if redistribution is needed. 
*/ @@ -1651,6 +1704,126 @@ add_vars_to_subquery_targetlist(Node *whereClause, Query *subselect, int rti #endif +/* + * contain_dml: is any subquery not a plain SELECT? + * + * We reject SELECT FOR UPDATE/SHARE as well as INSERT etc. + */ +static bool +contain_dml(Node *node) +{ + return contain_dml_walker(node, NULL); +} + +static bool +contain_dml_walker(Node *node, void *context) +{ + if (node == NULL) + return false; + if (IsA(node, Query)) + { + Query *query = (Query *) node; + + if (query->commandType != CMD_SELECT || + query->rowMarks != NIL) + return true; + + return query_tree_walker(query, contain_dml_walker, context, 0); + } + return expression_tree_walker(node, contain_dml_walker, context); +} + +/* + * inline_cte: convert RTE_CTE references to given CTE into RTE_SUBQUERYs + */ +static void +inline_cte(PlannerInfo *root, CommonTableExpr *cte) +{ + struct inline_cte_walker_context context; + + context.ctename = cte->ctename; + /* Start at levelsup = -1 because we'll immediately increment it */ + context.levelsup = -1; + context.refcount = cte->cterefcount; + context.ctequery = castNode(Query, cte->ctequery); + + (void) inline_cte_walker((Node *) root->parse, &context); + + /* Assert we replaced all references */ + Assert(context.refcount == 0); +} + +static bool +inline_cte_walker(Node *node, inline_cte_walker_context *context) +{ + if (node == NULL) + return false; + if (IsA(node, Query)) + { + Query *query = (Query *) node; + + context->levelsup++; + + /* + * Visit the query's RTE nodes after their contents; otherwise + * query_tree_walker would descend into the newly inlined CTE query, + * which we don't want. + */ + (void) query_tree_walker(query, inline_cte_walker, context, + QTW_EXAMINE_RTES_AFTER); + + context->levelsup--; + + return false; + } + else if (IsA(node, RangeTblEntry)) + { + RangeTblEntry *rte = (RangeTblEntry *) node; + + if (rte->rtekind == RTE_CTE && + strcmp(rte->ctename, context->ctename) == 0 && + rte->ctelevelsup == context->levelsup) + { + /* + * Found a reference to replace. Generate a copy of the CTE query + * with appropriate level adjustment for outer references (e.g., + * to other CTEs). + */ + Query *newquery = copyObject(context->ctequery); + + if (context->levelsup > 0) + IncrementVarSublevelsUp((Node *) newquery, context->levelsup, 1); + + /* + * Convert the RTE_CTE RTE into a RTE_SUBQUERY. + * + * Historically, a FOR UPDATE clause has been treated as extending + * into views and subqueries, but not into CTEs. We preserve this + * distinction by not trying to push rowmarks into the new + * subquery. 
+ */ + rte->rtekind = RTE_SUBQUERY; + rte->subquery = newquery; + rte->security_barrier = false; + + /* Zero out CTE-specific fields */ + rte->ctename = NULL; + rte->ctelevelsup = 0; + rte->self_reference = false; + rte->coltypes = NIL; + rte->coltypmods = NIL; + rte->colcollations = NIL; + + /* Count the number of replacements we've done */ + context->refcount--; + } + + return false; + } + + return expression_tree_walker(node, inline_cte_walker, context); +} + /* * convert_ANY_sublink_to_join: try to convert an ANY SubLink to a join * diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y index 4c3768f9..ad22456b 100644 --- a/src/backend/parser/gram.y +++ b/src/backend/parser/gram.y @@ -507,7 +507,7 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); %type row explicit_row implicit_row type_list array_expr_list %type case_expr case_arg when_clause case_default %type when_clause_list -%type sub_type +%type sub_type opt_materialized %type NumericOnly %type NumericOnly_list %type alias_clause opt_alias_clause @@ -12750,17 +12750,24 @@ cte_list: | cte_list ',' common_table_expr { $$ = lappend($1, $3); } ; -common_table_expr: name opt_name_list AS '(' PreparableStmt ')' +common_table_expr: name opt_name_list AS opt_materialized '(' PreparableStmt ')' { CommonTableExpr *n = makeNode(CommonTableExpr); n->ctename = $1; n->aliascolnames = $2; - n->ctequery = $5; + n->ctematerialized = $4; + n->ctequery = $6; n->location = @1; $$ = (Node *) n; } ; +opt_materialized: + MATERIALIZED { $$ = CTEMaterializeAlways; } + | NOT MATERIALIZED { $$ = CTEMaterializeNever; } + | /*EMPTY*/ { $$ = CTEMaterializeDefault; } + ; + opt_with_clause: with_clause { $$ = $1; } | /*EMPTY*/ { $$ = NULL; } @@ -17827,6 +17834,7 @@ makeRecursiveViewSelect(char *relname, List *aliases, Node *query) /* create common table expression */ cte->ctename = relname; cte->aliascolnames = aliases; + cte->ctematerialized = CTEMaterializeDefault; cte->ctequery = query; cte->location = -1; diff --git a/src/backend/utils/adt/ruleutils.c b/src/backend/utils/adt/ruleutils.c index 2a661f67..feb22b86 100644 --- a/src/backend/utils/adt/ruleutils.c +++ b/src/backend/utils/adt/ruleutils.c @@ -1,12926 +1,12938 @@ -/*------------------------------------------------------------------------- - * - * ruleutils.c - * Functions to convert stored expressions/querytrees back to - * source text - * - * Portions Copyright (c) 2012-2014, TransLattice, Inc. 
- * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group - * Portions Copyright (c) 1994, Regents of the University of California - * - * - * IDENTIFICATION - * src/backend/utils/adt/ruleutils.c - * - *------------------------------------------------------------------------- - */ -#include "postgres.h" - -#include -#include -#include - -#ifdef PGXC -#include "access/reloptions.h" -#endif /* PGXC */ -#include "access/amapi.h" -#include "access/htup_details.h" -#include "access/sysattr.h" -#include "catalog/dependency.h" -#include "catalog/indexing.h" -#include "catalog/partition.h" -#include "catalog/pg_aggregate.h" -#include "catalog/pg_am.h" -#include "catalog/pg_authid.h" -#ifdef PGXC -#include "catalog/pg_aggregate.h" -#endif /* PGXC */ -#include "catalog/pg_collation.h" -#include "catalog/pg_constraint.h" -#include "catalog/pg_depend.h" -#include "catalog/pg_language.h" -#include "catalog/pg_opclass.h" -#include "catalog/pg_operator.h" -#include "catalog/pg_partitioned_table.h" -#include "catalog/pg_proc.h" -#include "catalog/pg_statistic_ext.h" -#include "catalog/pg_trigger.h" -#include "catalog/pg_type.h" -#include "commands/defrem.h" -#include "commands/tablespace.h" -#include "common/keywords.h" -#include "executor/spi.h" -#include "funcapi.h" -#ifdef PGXC -#include "nodes/execnodes.h" -#endif -#include "mb/pg_wchar.h" -#include "miscadmin.h" -#include "nodes/makefuncs.h" -#include "nodes/nodeFuncs.h" -#include "optimizer/tlist.h" -#include "parser/parse_node.h" -#include "parser/parse_agg.h" -#include "parser/parse_func.h" -#include "parser/parse_oper.h" -#include "parser/parse_type.h" -#include "parser/parser.h" -#include "parser/parsetree.h" -#ifdef PGXC -#include "pgxc/pgxc.h" -#include "pgxc/planner.h" -#endif -#include "rewrite/rewriteHandler.h" -#include "rewrite/rewriteManip.h" -#include "rewrite/rewriteSupport.h" -#include "utils/array.h" -#include "utils/builtins.h" -#include "utils/fmgroids.h" -#include "utils/hsearch.h" -#include "utils/lsyscache.h" -#include "utils/rel.h" -#include "utils/ruleutils.h" -#include "utils/snapmgr.h" -#include "utils/syscache.h" -#include "utils/tqual.h" -#include "utils/typcache.h" -#include "utils/varlena.h" -#include "utils/xml.h" -#ifdef __TBASE__ -#include "optimizer/planmain.h" -#endif -#ifdef __COLD_HOT__ -#include "postmaster/postmaster.h" -#endif - -/* ---------- - * Pretty formatting constants - * ---------- - */ - -/* Indent counts */ -#define PRETTYINDENT_STD 8 -#define PRETTYINDENT_JOIN 4 -#define PRETTYINDENT_VAR 4 - -#define PRETTYINDENT_LIMIT 40 /* wrap limit */ - -/* Pretty flags */ -#define PRETTYFLAG_PAREN 1 -#define PRETTYFLAG_INDENT 2 - -/* Default line length for pretty-print wrapping: 0 means wrap always */ -#define WRAP_COLUMN_DEFAULT 0 - -/* macro to test if pretty action needed */ -#define PRETTY_PAREN(context) ((context)->prettyFlags & PRETTYFLAG_PAREN) -#define PRETTY_INDENT(context) ((context)->prettyFlags & PRETTYFLAG_INDENT) - - -#ifdef __TBASE__ -static int daysofmonth[13] = {0,31,29,31,30,31,30,31,31,30,31,30,31}; - -static struct pg_tm g_partition_base_time = { 0, - 0, - 0, - 1, - 1, /* origin 0, not 1 */ - 1970, /* relative to 1900 */ - 1, - 1, - 0, - 0, - NULL - }; -#endif - -/* ---------- - * Local data types - * ---------- - */ - -/* Context info needed for invoking a recursive querytree display routine */ -typedef struct -{ - StringInfo buf; /* output buffer to append to */ - List *namespaces; /* List of deparse_namespace nodes */ - List *windowClause; /* Current query level's WINDOW 
clause */ - List *windowTList; /* targetlist for resolving WINDOW clause */ - int prettyFlags; /* enabling of pretty-print functions */ - int wrapColumn; /* max line length, or -1 for no limit */ - int indentLevel; /* current indent level for prettyprint */ - bool varprefix; /* TRUE to print prefixes on Vars */ - ParseExprKind special_exprkind; /* set only for exprkinds needing special - * handling */ -#ifdef PGXC - bool finalise_aggs; /* should Datanode finalise the aggregates? */ - bool sortgroup_colno;/* instead of expression use resno for - * sortgrouprefs. - */ -#endif /* PGXC */ -} deparse_context; - -/* - * Each level of query context around a subtree needs a level of Var namespace. - * A Var having varlevelsup=N refers to the N'th item (counting from 0) in - * the current context's namespaces list. - * - * The rangetable is the list of actual RTEs from the query tree, and the - * cte list is the list of actual CTEs. - * - * rtable_names holds the alias name to be used for each RTE (either a C - * string, or NULL for nameless RTEs such as unnamed joins). - * rtable_columns holds the column alias names to be used for each RTE. - * - * In some cases we need to make names of merged JOIN USING columns unique - * across the whole query, not only per-RTE. If so, unique_using is TRUE - * and using_names is a list of C strings representing names already assigned - * to USING columns. - * - * When deparsing plan trees, there is always just a single item in the - * deparse_namespace list (since a plan tree never contains Vars with - * varlevelsup > 0). We store the PlanState node that is the immediate - * parent of the expression to be deparsed, as well as a list of that - * PlanState's ancestors. In addition, we store its outer and inner subplan - * state nodes, as well as their plan nodes' targetlists, and the index tlist - * if the current plan node might contain INDEX_VAR Vars. (These fields could - * be derived on-the-fly from the current PlanState, but it seems notationally - * clearer to set them up as separate fields.) - */ -typedef struct -{ - List *rtable; /* List of RangeTblEntry nodes */ - List *rtable_names; /* Parallel list of names for RTEs */ - List *rtable_columns; /* Parallel list of deparse_columns structs */ - List *ctes; /* List of CommonTableExpr nodes */ - /* Workspace for column alias assignment: */ - bool unique_using; /* Are we making USING names globally unique */ - List *using_names; /* List of assigned names for USING columns */ - /* Remaining fields are used only when deparsing a Plan tree: */ - PlanState *planstate; /* immediate parent of current expression */ - List *ancestors; /* ancestors of planstate */ - PlanState *outer_planstate; /* outer subplan state, or NULL if none */ - PlanState *inner_planstate; /* inner subplan state, or NULL if none */ - List *outer_tlist; /* referent for OUTER_VAR Vars */ - List *inner_tlist; /* referent for INNER_VAR Vars */ - List *index_tlist; /* referent for INDEX_VAR Vars */ -} deparse_namespace; - -/* - * Per-relation data about column alias names. - * - * Selecting aliases is unreasonably complicated because of the need to dump - * rules/views whose underlying tables may have had columns added, deleted, or - * renamed since the query was parsed. We must nonetheless print the rule/view - * in a form that can be reloaded and will produce the same results as before. - * - * For each RTE used in the query, we must assign column aliases that are - * unique within that RTE. 
SQL does not require this of the original query, - * but due to factors such as *-expansion we need to be able to uniquely - * reference every column in a decompiled query. As long as we qualify all - * column references, per-RTE uniqueness is sufficient for that. - * - * However, we can't ensure per-column name uniqueness for unnamed join RTEs, - * since they just inherit column names from their input RTEs, and we can't - * rename the columns at the join level. Most of the time this isn't an issue - * because we don't need to reference the join's output columns as such; we - * can reference the input columns instead. That approach can fail for merged - * JOIN USING columns, however, so when we have one of those in an unnamed - * join, we have to make that column's alias globally unique across the whole - * query to ensure it can be referenced unambiguously. - * - * Another problem is that a JOIN USING clause requires the columns to be - * merged to have the same aliases in both input RTEs, and that no other - * columns in those RTEs or their children conflict with the USING names. - * To handle that, we do USING-column alias assignment in a recursive - * traversal of the query's jointree. When descending through a JOIN with - * USING, we preassign the USING column names to the child columns, overriding - * other rules for column alias assignment. We also mark each RTE with a list - * of all USING column names selected for joins containing that RTE, so that - * when we assign other columns' aliases later, we can avoid conflicts. - * - * Another problem is that if a JOIN's input tables have had columns added or - * deleted since the query was parsed, we must generate a column alias list - * for the join that matches the current set of input columns --- otherwise, a - * change in the number of columns in the left input would throw off matching - * of aliases to columns of the right input. Thus, positions in the printable - * column alias list are not necessarily one-for-one with varattnos of the - * JOIN, so we need a separate new_colnames[] array for printing purposes. - */ -typedef struct -{ - /* - * colnames is an array containing column aliases to use for columns that - * existed when the query was parsed. Dropped columns have NULL entries. - * This array can be directly indexed by varattno to get a Var's name. - * - * Non-NULL entries are guaranteed unique within the RTE, *except* when - * this is for an unnamed JOIN RTE. In that case we merely copy up names - * from the two input RTEs. - * - * During the recursive descent in set_using_names(), forcible assignment - * of a child RTE's column name is represented by pre-setting that element - * of the child's colnames array. So at that stage, NULL entries in this - * array just mean that no name has been preassigned, not necessarily that - * the column is dropped. - */ - int num_cols; /* length of colnames[] array */ - char **colnames; /* array of C strings and NULLs */ - - /* - * new_colnames is an array containing column aliases to use for columns - * that would exist if the query was re-parsed against the current - * definitions of its base tables. This is what to print as the column - * alias list for the RTE. This array does not include dropped columns, - * but it will include columns added since original parsing. Indexes in - * it therefore have little to do with current varattno values. As above, - * entries are unique unless this is for an unnamed JOIN RTE. 
(In such an - * RTE, we never actually print this array, but we must compute it anyway - * for possible use in computing column names of upper joins.) The - * parallel array is_new_col marks which of these columns are new since - * original parsing. Entries with is_new_col false must match the - * non-NULL colnames entries one-for-one. - */ - int num_new_cols; /* length of new_colnames[] array */ - char **new_colnames; /* array of C strings */ - bool *is_new_col; /* array of bool flags */ - - /* This flag tells whether we should actually print a column alias list */ - bool printaliases; - - /* This list has all names used as USING names in joins above this RTE */ - List *parentUsing; /* names assigned to parent merged columns */ - - /* - * If this struct is for a JOIN RTE, we fill these fields during the - * set_using_names() pass to describe its relationship to its child RTEs. - * - * leftattnos and rightattnos are arrays with one entry per existing - * output column of the join (hence, indexable by join varattno). For a - * simple reference to a column of the left child, leftattnos[i] is the - * child RTE's attno and rightattnos[i] is zero; and conversely for a - * column of the right child. But for merged columns produced by JOIN - * USING/NATURAL JOIN, both leftattnos[i] and rightattnos[i] are nonzero. - * Also, if the column has been dropped, both are zero. - * - * If it's a JOIN USING, usingNames holds the alias names selected for the - * merged columns (these might be different from the original USING list, - * if we had to modify names to achieve uniqueness). - */ - int leftrti; /* rangetable index of left child */ - int rightrti; /* rangetable index of right child */ - int *leftattnos; /* left-child varattnos of join cols, or 0 */ - int *rightattnos; /* right-child varattnos of join cols, or 0 */ - List *usingNames; /* names assigned to merged columns */ -} deparse_columns; - -/* This macro is analogous to rt_fetch(), but for deparse_columns structs */ -#define deparse_columns_fetch(rangetable_index, dpns) \ - ((deparse_columns *) list_nth((dpns)->rtable_columns, (rangetable_index)-1)) - -/* - * Entry in set_rtable_names' hash table - */ -typedef struct -{ - char name[NAMEDATALEN]; /* Hash key --- must be first */ - int counter; /* Largest addition used so far for name */ -} NameHashEntry; - - -/* ---------- - * Global data - * ---------- - */ -static SPIPlanPtr plan_getrulebyoid = NULL; -static const char *query_getrulebyoid = "SELECT * FROM pg_catalog.pg_rewrite WHERE oid = $1"; -static SPIPlanPtr plan_getviewrule = NULL; -static const char *query_getviewrule = "SELECT * FROM pg_catalog.pg_rewrite WHERE ev_class = $1 AND rulename = $2"; - -/* GUC parameters */ -bool quote_all_identifiers = false; - - -/* ---------- - * Local functions - * - * Most of these functions used to use fixed-size buffers to build their - * results. Now, they take an (already initialized) StringInfo object - * as a parameter, and append their text output to its contents. 
- * ---------- - */ -static char *deparse_expression_pretty(Node *expr, List *dpcontext, - bool forceprefix, bool showimplicit, - int prettyFlags, int startIndent); -static char *pg_get_viewdef_worker(Oid viewoid, - int prettyFlags, int wrapColumn); -static char *pg_get_triggerdef_worker(Oid trigid, bool pretty); -static void decompile_column_index_array(Datum column_index_array, Oid relId, - StringInfo buf); -static char *pg_get_ruledef_worker(Oid ruleoid, int prettyFlags); -static char *pg_get_indexdef_worker(Oid indexrelid, int colno, - const Oid *excludeOps, - bool attrsOnly, bool showTblSpc, - int prettyFlags, bool missing_ok); -static char *pg_get_statisticsobj_worker(Oid statextid, bool missing_ok); -static char *pg_get_partkeydef_worker(Oid relid, int prettyFlags, - bool attrsOnly, bool missing_ok); -static char *pg_get_constraintdef_worker(Oid constraintId, bool fullCommand, - int prettyFlags, bool missing_ok); -static text *pg_get_expr_worker(text *expr, Oid relid, const char *relname, - int prettyFlags); -static int print_function_arguments(StringInfo buf, HeapTuple proctup, - bool print_table_args, bool print_defaults); -static void print_function_rettype(StringInfo buf, HeapTuple proctup); -static void print_function_trftypes(StringInfo buf, HeapTuple proctup); -static void set_rtable_names(deparse_namespace *dpns, List *parent_namespaces, - Bitmapset *rels_used); -static void set_deparse_for_query(deparse_namespace *dpns, Query *query, - List *parent_namespaces); -static void set_simple_column_names(deparse_namespace *dpns); -static bool has_dangerous_join_using(deparse_namespace *dpns, Node *jtnode); -static void set_using_names(deparse_namespace *dpns, Node *jtnode, - List *parentUsing); -static void set_relation_column_names(deparse_namespace *dpns, - RangeTblEntry *rte, - deparse_columns *colinfo); -static void set_join_column_names(deparse_namespace *dpns, RangeTblEntry *rte, - deparse_columns *colinfo); -static bool colname_is_unique(char *colname, deparse_namespace *dpns, - deparse_columns *colinfo); -static char *make_colname_unique(char *colname, deparse_namespace *dpns, - deparse_columns *colinfo); -static void expand_colnames_array_to(deparse_columns *colinfo, int n); -static void identify_join_columns(JoinExpr *j, RangeTblEntry *jrte, - deparse_columns *colinfo); -static void flatten_join_using_qual(Node *qual, - List **leftvars, List **rightvars); -static char *get_rtable_name(int rtindex, deparse_context *context); -static void set_deparse_planstate(deparse_namespace *dpns, PlanState *ps); -#ifdef PGXC -static void set_deparse_plan(deparse_namespace *dpns, Plan *plan); -#endif -static void push_child_plan(deparse_namespace *dpns, PlanState *ps, - deparse_namespace *save_dpns); -static void pop_child_plan(deparse_namespace *dpns, - deparse_namespace *save_dpns); -static void push_ancestor_plan(deparse_namespace *dpns, ListCell *ancestor_cell, - deparse_namespace *save_dpns); -static void pop_ancestor_plan(deparse_namespace *dpns, - deparse_namespace *save_dpns); -static void make_ruledef(StringInfo buf, HeapTuple ruletup, TupleDesc rulettc, - int prettyFlags); -static void make_viewdef(StringInfo buf, HeapTuple ruletup, TupleDesc rulettc, - int prettyFlags, int wrapColumn); -static void get_query_def(Query *query, StringInfo buf, List *parentnamespace, - TupleDesc resultDesc, - int prettyFlags, int wrapColumn, int startIndent -#ifdef PGXC - , bool finalise_aggregates, bool sortgroup_colno -#endif /* PGXC */ - ); -static void get_values_def(List *values_lists, 
deparse_context *context); -static void get_with_clause(Query *query, deparse_context *context); -static void get_select_query_def(Query *query, deparse_context *context, - TupleDesc resultDesc); -static void get_insert_query_def(Query *query, deparse_context *context); -static void get_update_query_def(Query *query, deparse_context *context); -static void get_update_query_targetlist_def(Query *query, List *targetList, - deparse_context *context, - RangeTblEntry *rte); -static void get_delete_query_def(Query *query, deparse_context *context); -static void get_utility_query_def(Query *query, deparse_context *context); -static void get_basic_select_query(Query *query, deparse_context *context, - TupleDesc resultDesc); -static void get_target_list(List *targetList, deparse_context *context, - TupleDesc resultDesc); -static void get_setop_query(Node *setOp, Query *query, - deparse_context *context, - TupleDesc resultDesc); -static Node *get_rule_sortgroupclause(Index ref, List *tlist, - bool force_colno, - deparse_context *context); -static void get_rule_groupingset(GroupingSet *gset, List *targetlist, - bool omit_parens, deparse_context *context); -static void get_rule_orderby(List *orderList, List *targetList, - bool force_colno, deparse_context *context); -static void get_rule_windowclause(Query *query, deparse_context *context); -static void get_rule_windowspec(WindowClause *wc, List *targetList, - deparse_context *context); -static char *get_variable(Var *var, int levelsup, bool istoplevel, - deparse_context *context); -static void get_special_variable(Node *node, deparse_context *context, - void *private); -static void resolve_special_varno(Node *node, deparse_context *context, - void *private, - void (*callback) (Node *, deparse_context *, void *)); -static Node *find_param_referent(Param *param, deparse_context *context, - deparse_namespace **dpns_p, ListCell **ancestor_cell_p); -static void get_parameter(Param *param, deparse_context *context); -static const char *get_simple_binary_op_name(OpExpr *expr); -static bool isSimpleNode(Node *node, Node *parentNode, int prettyFlags); -static void appendContextKeyword(deparse_context *context, const char *str, - int indentBefore, int indentAfter, int indentPlus); -static void removeStringInfoSpaces(StringInfo str); -static void get_rule_expr(Node *node, deparse_context *context, - bool showimplicit); -static void get_rule_expr_toplevel(Node *node, deparse_context *context, - bool showimplicit); -static void get_rule_expr_funccall(Node *node, deparse_context *context, - bool showimplicit); -static bool looks_like_function(Node *node); -static void get_oper_expr(OpExpr *expr, deparse_context *context); -static void get_func_expr(FuncExpr *expr, deparse_context *context, - bool showimplicit); -static void get_agg_expr(Aggref *aggref, deparse_context *context, - Aggref *original_aggref); -static void get_agg_combine_expr(Node *node, deparse_context *context, - void *private); -static void get_windowfunc_expr(WindowFunc *wfunc, deparse_context *context); -static void get_coercion_expr(Node *arg, deparse_context *context, - Oid resulttype, int32 resulttypmod, - Node *parentNode); -static void get_const_expr(Const *constval, deparse_context *context, - int showtype); -static void get_const_collation(Const *constval, deparse_context *context); -static void simple_quote_literal(StringInfo buf, const char *val); -static void get_sublink_expr(SubLink *sublink, deparse_context *context); -static void get_tablefunc(TableFunc *tf, deparse_context *context, 
- bool showimplicit); -static void get_from_clause(Query *query, const char *prefix, - deparse_context *context); -static void get_from_clause_item(Node *jtnode, Query *query, - deparse_context *context); -static void get_column_alias_list(deparse_columns *colinfo, - deparse_context *context); -static void get_from_clause_coldeflist(RangeTblFunction *rtfunc, - deparse_columns *colinfo, - deparse_context *context); -static void get_tablesample_def(TableSampleClause *tablesample, - deparse_context *context); -static void get_opclass_name(Oid opclass, Oid actual_datatype, - StringInfo buf); -static Node *processIndirection(Node *node, deparse_context *context); -static void printSubscripts(ArrayRef *aref, deparse_context *context); -static char *get_relation_name(Oid relid); -static char *generate_relation_name(Oid relid, List *namespaces); -static char *generate_qualified_relation_name(Oid relid); -static char *generate_function_name(Oid funcid, int nargs, - List *argnames, Oid *argtypes, - bool has_variadic, bool *use_variadic_p, - ParseExprKind special_exprkind); -static char *generate_operator_name(Oid operid, Oid arg1, Oid arg2); -static text *string_to_text(char *str); -static char *flatten_reloptions(Oid relid); - -#ifdef __TBASE__ -static Bitmapset *pruning_walker(Relation rel, Node *expr); -static Bitmapset *pruning_opexpr(Relation rel, OpExpr *expr); -static Bitmapset *get_full_pruning_result(Relation rel); -static int get_daysofmonth(int startmonth, int startday, - int endmonth, int endday); -#endif -#define only_marker(rte) ((rte)->inh ? "" : "ONLY ") - - -/* ---------- - * get_ruledef - Do it all and return a text - * that could be used as a statement - * to recreate the rule - * ---------- - */ -Datum -pg_get_ruledef(PG_FUNCTION_ARGS) -{ - Oid ruleoid = PG_GETARG_OID(0); - int prettyFlags; - char *res; - - prettyFlags = PRETTYFLAG_INDENT; - - res = pg_get_ruledef_worker(ruleoid, prettyFlags); - - if (res == NULL) - PG_RETURN_NULL(); - - PG_RETURN_TEXT_P(string_to_text(res)); -} - - -Datum -pg_get_ruledef_ext(PG_FUNCTION_ARGS) -{ - Oid ruleoid = PG_GETARG_OID(0); - bool pretty = PG_GETARG_BOOL(1); - int prettyFlags; - char *res; - - prettyFlags = pretty ? PRETTYFLAG_PAREN | PRETTYFLAG_INDENT : PRETTYFLAG_INDENT; - - res = pg_get_ruledef_worker(ruleoid, prettyFlags); - - if (res == NULL) - PG_RETURN_NULL(); - - PG_RETURN_TEXT_P(string_to_text(res)); -} - - -static char * -pg_get_ruledef_worker(Oid ruleoid, int prettyFlags) -{// #lizard forgives - Datum args[1]; - char nulls[1]; - int spirc; - HeapTuple ruletup; - TupleDesc rulettc; - StringInfoData buf; - - /* - * Do this first so that string is alloc'd in outer context not SPI's. - */ - initStringInfo(&buf); - - /* - * Connect to SPI manager - */ - if (SPI_connect() != SPI_OK_CONNECT) - elog(ERROR, "SPI_connect failed"); - - /* - * On the first call prepare the plan to lookup pg_rewrite. We read - * pg_rewrite over the SPI manager instead of using the syscache to be - * checked for read access on pg_rewrite. 
- */ - if (plan_getrulebyoid == NULL) - { - Oid argtypes[1]; - SPIPlanPtr plan; - - argtypes[0] = OIDOID; - plan = SPI_prepare(query_getrulebyoid, 1, argtypes); - if (plan == NULL) - elog(ERROR, "SPI_prepare failed for \"%s\"", query_getrulebyoid); - SPI_keepplan(plan); - plan_getrulebyoid = plan; - } - - /* - * Get the pg_rewrite tuple for this rule - */ - args[0] = ObjectIdGetDatum(ruleoid); - nulls[0] = ' '; - spirc = SPI_execute_plan(plan_getrulebyoid, args, nulls, true, 0); - if (spirc != SPI_OK_SELECT) - elog(ERROR, "failed to get pg_rewrite tuple for rule %u", ruleoid); - if (SPI_processed != 1) - { - /* - * There is no tuple data available here, just keep the output buffer - * empty. - */ - } - else - { - /* - * Get the rule's definition and put it into executor's memory - */ - ruletup = SPI_tuptable->vals[0]; - rulettc = SPI_tuptable->tupdesc; - make_ruledef(&buf, ruletup, rulettc, prettyFlags); - } - - /* - * Disconnect from SPI manager - */ - if (SPI_finish() != SPI_OK_FINISH) - elog(ERROR, "SPI_finish failed"); - - if (buf.len == 0) - return NULL; - - return buf.data; -} - - -/* ---------- - * get_viewdef - Mainly the same thing, but we - * only return the SELECT part of a view - * ---------- - */ -Datum -pg_get_viewdef(PG_FUNCTION_ARGS) -{ - /* By OID */ - Oid viewoid = PG_GETARG_OID(0); - int prettyFlags; - char *res; - - prettyFlags = PRETTYFLAG_INDENT; - - res = pg_get_viewdef_worker(viewoid, prettyFlags, WRAP_COLUMN_DEFAULT); - - if (res == NULL) - PG_RETURN_NULL(); - - PG_RETURN_TEXT_P(string_to_text(res)); -} - - -Datum -pg_get_viewdef_ext(PG_FUNCTION_ARGS) -{ - /* By OID */ - Oid viewoid = PG_GETARG_OID(0); - bool pretty = PG_GETARG_BOOL(1); - int prettyFlags; - char *res; - - prettyFlags = pretty ? PRETTYFLAG_PAREN | PRETTYFLAG_INDENT : PRETTYFLAG_INDENT; - - res = pg_get_viewdef_worker(viewoid, prettyFlags, WRAP_COLUMN_DEFAULT); - - if (res == NULL) - PG_RETURN_NULL(); - - PG_RETURN_TEXT_P(string_to_text(res)); -} - -Datum -pg_get_viewdef_wrap(PG_FUNCTION_ARGS) -{ - /* By OID */ - Oid viewoid = PG_GETARG_OID(0); - int wrap = PG_GETARG_INT32(1); - int prettyFlags; - char *res; - - /* calling this implies we want pretty printing */ - prettyFlags = PRETTYFLAG_PAREN | PRETTYFLAG_INDENT; - - res = pg_get_viewdef_worker(viewoid, prettyFlags, wrap); - - if (res == NULL) - PG_RETURN_NULL(); - - PG_RETURN_TEXT_P(string_to_text(res)); -} - -Datum -pg_get_viewdef_name(PG_FUNCTION_ARGS) -{ - /* By qualified name */ - text *viewname = PG_GETARG_TEXT_PP(0); - int prettyFlags; - RangeVar *viewrel; - Oid viewoid; - char *res; - - prettyFlags = PRETTYFLAG_INDENT; - - /* Look up view name. Can't lock it - we might not have privileges. */ - viewrel = makeRangeVarFromNameList(textToQualifiedNameList(viewname)); - viewoid = RangeVarGetRelid(viewrel, NoLock, false); - - res = pg_get_viewdef_worker(viewoid, prettyFlags, WRAP_COLUMN_DEFAULT); - - if (res == NULL) - PG_RETURN_NULL(); - - PG_RETURN_TEXT_P(string_to_text(res)); -} - - -Datum -pg_get_viewdef_name_ext(PG_FUNCTION_ARGS) -{ - /* By qualified name */ - text *viewname = PG_GETARG_TEXT_PP(0); - bool pretty = PG_GETARG_BOOL(1); - int prettyFlags; - RangeVar *viewrel; - Oid viewoid; - char *res; - - prettyFlags = pretty ? PRETTYFLAG_PAREN | PRETTYFLAG_INDENT : PRETTYFLAG_INDENT; - - /* Look up view name. Can't lock it - we might not have privileges. 
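The by-name lookup above backs the text-argument variant of pg_get_viewdef; both it and the by-OID form are callable from SQL. A hedged example, using a hypothetical view name:

    SELECT pg_get_viewdef('public.my_view', true);       -- by qualified name, pretty-printed
    SELECT pg_get_viewdef('public.my_view'::regclass);   -- by OID, default formatting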
*/ - viewrel = makeRangeVarFromNameList(textToQualifiedNameList(viewname)); - viewoid = RangeVarGetRelid(viewrel, NoLock, false); - - res = pg_get_viewdef_worker(viewoid, prettyFlags, WRAP_COLUMN_DEFAULT); - - if (res == NULL) - PG_RETURN_NULL(); - - PG_RETURN_TEXT_P(string_to_text(res)); -} - -/* - * Common code for by-OID and by-name variants of pg_get_viewdef - */ -static char * -pg_get_viewdef_worker(Oid viewoid, int prettyFlags, int wrapColumn) -{// #lizard forgives - Datum args[2]; - char nulls[2]; - int spirc; - HeapTuple ruletup; - TupleDesc rulettc; - StringInfoData buf; - - /* - * Do this first so that string is alloc'd in outer context not SPI's. - */ - initStringInfo(&buf); - - /* - * Connect to SPI manager - */ - if (SPI_connect() != SPI_OK_CONNECT) - elog(ERROR, "SPI_connect failed"); - - /* - * On the first call prepare the plan to lookup pg_rewrite. We read - * pg_rewrite over the SPI manager instead of using the syscache to be - * checked for read access on pg_rewrite. - */ - if (plan_getviewrule == NULL) - { - Oid argtypes[2]; - SPIPlanPtr plan; - - argtypes[0] = OIDOID; - argtypes[1] = NAMEOID; - plan = SPI_prepare(query_getviewrule, 2, argtypes); - if (plan == NULL) - elog(ERROR, "SPI_prepare failed for \"%s\"", query_getviewrule); - SPI_keepplan(plan); - plan_getviewrule = plan; - } - - /* - * Get the pg_rewrite tuple for the view's SELECT rule - */ - args[0] = ObjectIdGetDatum(viewoid); - args[1] = DirectFunctionCall1(namein, CStringGetDatum(ViewSelectRuleName)); - nulls[0] = ' '; - nulls[1] = ' '; - spirc = SPI_execute_plan(plan_getviewrule, args, nulls, true, 0); - if (spirc != SPI_OK_SELECT) - elog(ERROR, "failed to get pg_rewrite tuple for view %u", viewoid); - if (SPI_processed != 1) - { - /* - * There is no tuple data available here, just keep the output buffer - * empty. 
- */ - } - else - { - /* - * Get the rule's definition and put it into executor's memory - */ - ruletup = SPI_tuptable->vals[0]; - rulettc = SPI_tuptable->tupdesc; - make_viewdef(&buf, ruletup, rulettc, prettyFlags, wrapColumn); - } - - /* - * Disconnect from SPI manager - */ - if (SPI_finish() != SPI_OK_FINISH) - elog(ERROR, "SPI_finish failed"); - - if (buf.len == 0) - return NULL; - - return buf.data; -} - -/* ---------- - * get_triggerdef - Get the definition of a trigger - * ---------- - */ -Datum -pg_get_triggerdef(PG_FUNCTION_ARGS) -{ - Oid trigid = PG_GETARG_OID(0); - char *res; - - res = pg_get_triggerdef_worker(trigid, false); - - if (res == NULL) - PG_RETURN_NULL(); - - PG_RETURN_TEXT_P(string_to_text(res)); -} - -Datum -pg_get_triggerdef_ext(PG_FUNCTION_ARGS) -{ - Oid trigid = PG_GETARG_OID(0); - bool pretty = PG_GETARG_BOOL(1); - char *res; - - res = pg_get_triggerdef_worker(trigid, pretty); - - if (res == NULL) - PG_RETURN_NULL(); - - PG_RETURN_TEXT_P(string_to_text(res)); -} - -static char * -pg_get_triggerdef_worker(Oid trigid, bool pretty) -{// #lizard forgives - HeapTuple ht_trig; - Form_pg_trigger trigrec; - StringInfoData buf; - Relation tgrel; - ScanKeyData skey[1]; - SysScanDesc tgscan; - int findx = 0; - char *tgname; - char *tgoldtable; - char *tgnewtable; - Oid argtypes[1]; /* dummy */ - Datum value; - bool isnull; - - /* - * Fetch the pg_trigger tuple by the Oid of the trigger - */ - tgrel = heap_open(TriggerRelationId, AccessShareLock); - - ScanKeyInit(&skey[0], - ObjectIdAttributeNumber, - BTEqualStrategyNumber, F_OIDEQ, - ObjectIdGetDatum(trigid)); - - tgscan = systable_beginscan(tgrel, TriggerOidIndexId, true, - NULL, 1, skey); - - ht_trig = systable_getnext(tgscan); - - if (!HeapTupleIsValid(ht_trig)) - { - systable_endscan(tgscan); - heap_close(tgrel, AccessShareLock); - return NULL; - } - - trigrec = (Form_pg_trigger) GETSTRUCT(ht_trig); - - /* - * Start the trigger definition. Note that the trigger's name should never - * be schema-qualified, but the trigger rel's name may be. - */ - initStringInfo(&buf); - - tgname = NameStr(trigrec->tgname); - appendStringInfo(&buf, "CREATE %sTRIGGER %s ", - OidIsValid(trigrec->tgconstraint) ? 
"CONSTRAINT " : "", - quote_identifier(tgname)); - - if (TRIGGER_FOR_BEFORE(trigrec->tgtype)) - appendStringInfoString(&buf, "BEFORE"); - else if (TRIGGER_FOR_AFTER(trigrec->tgtype)) - appendStringInfoString(&buf, "AFTER"); - else if (TRIGGER_FOR_INSTEAD(trigrec->tgtype)) - appendStringInfoString(&buf, "INSTEAD OF"); - else - elog(ERROR, "unexpected tgtype value: %d", trigrec->tgtype); - - if (TRIGGER_FOR_INSERT(trigrec->tgtype)) - { - appendStringInfoString(&buf, " INSERT"); - findx++; - } - if (TRIGGER_FOR_DELETE(trigrec->tgtype)) - { - if (findx > 0) - appendStringInfoString(&buf, " OR DELETE"); - else - appendStringInfoString(&buf, " DELETE"); - findx++; - } - if (TRIGGER_FOR_UPDATE(trigrec->tgtype)) - { - if (findx > 0) - appendStringInfoString(&buf, " OR UPDATE"); - else - appendStringInfoString(&buf, " UPDATE"); - findx++; - /* tgattr is first var-width field, so OK to access directly */ - if (trigrec->tgattr.dim1 > 0) - { - int i; - - appendStringInfoString(&buf, " OF "); - for (i = 0; i < trigrec->tgattr.dim1; i++) - { - char *attname; - - if (i > 0) - appendStringInfoString(&buf, ", "); - attname = get_relid_attribute_name(trigrec->tgrelid, - trigrec->tgattr.values[i]); - appendStringInfoString(&buf, quote_identifier(attname)); - } - } - } - if (TRIGGER_FOR_TRUNCATE(trigrec->tgtype)) - { - if (findx > 0) - appendStringInfoString(&buf, " OR TRUNCATE"); - else - appendStringInfoString(&buf, " TRUNCATE"); - findx++; - } - appendStringInfo(&buf, " ON %s ", - generate_relation_name(trigrec->tgrelid, NIL)); - - if (OidIsValid(trigrec->tgconstraint)) - { - if (OidIsValid(trigrec->tgconstrrelid)) - appendStringInfo(&buf, "FROM %s ", - generate_relation_name(trigrec->tgconstrrelid, NIL)); - if (!trigrec->tgdeferrable) - appendStringInfoString(&buf, "NOT "); - appendStringInfoString(&buf, "DEFERRABLE INITIALLY "); - if (trigrec->tginitdeferred) - appendStringInfoString(&buf, "DEFERRED "); - else - appendStringInfoString(&buf, "IMMEDIATE "); - } - - value = fastgetattr(ht_trig, Anum_pg_trigger_tgoldtable, - tgrel->rd_att, &isnull); - if (!isnull) - tgoldtable = NameStr(*((NameData *) DatumGetPointer(value))); - else - tgoldtable = NULL; - value = fastgetattr(ht_trig, Anum_pg_trigger_tgnewtable, - tgrel->rd_att, &isnull); - if (!isnull) - tgnewtable = NameStr(*((NameData *) DatumGetPointer(value))); - else - tgnewtable = NULL; - if (tgoldtable != NULL || tgnewtable != NULL) - { - appendStringInfoString(&buf, "REFERENCING "); - if (tgoldtable != NULL) - appendStringInfo(&buf, "OLD TABLE AS %s ", tgoldtable); - if (tgnewtable != NULL) - appendStringInfo(&buf, "NEW TABLE AS %s ", tgnewtable); - } - - if (TRIGGER_FOR_ROW(trigrec->tgtype)) - appendStringInfoString(&buf, "FOR EACH ROW "); - else - appendStringInfoString(&buf, "FOR EACH STATEMENT "); - - /* If the trigger has a WHEN qualification, add that */ - value = fastgetattr(ht_trig, Anum_pg_trigger_tgqual, - tgrel->rd_att, &isnull); - if (!isnull) - { - Node *qual; - char relkind; - deparse_context context; - deparse_namespace dpns; - RangeTblEntry *oldrte; - RangeTblEntry *newrte; - - appendStringInfoString(&buf, "WHEN ("); - - qual = stringToNode(TextDatumGetCString(value)); - - relkind = get_rel_relkind(trigrec->tgrelid); - - /* Build minimal OLD and NEW RTEs for the rel */ - oldrte = makeNode(RangeTblEntry); - oldrte->rtekind = RTE_RELATION; - oldrte->relid = trigrec->tgrelid; - oldrte->relkind = relkind; - oldrte->alias = makeAlias("old", NIL); - oldrte->eref = oldrte->alias; - oldrte->lateral = false; - oldrte->inh = false; - 
oldrte->inFromCl = true; - - newrte = makeNode(RangeTblEntry); - newrte->rtekind = RTE_RELATION; - newrte->relid = trigrec->tgrelid; - newrte->relkind = relkind; - newrte->alias = makeAlias("new", NIL); - newrte->eref = newrte->alias; - newrte->lateral = false; - newrte->inh = false; - newrte->inFromCl = true; - - /* Build two-element rtable */ - memset(&dpns, 0, sizeof(dpns)); - dpns.rtable = list_make2(oldrte, newrte); - dpns.ctes = NIL; - set_rtable_names(&dpns, NIL, NULL); - set_simple_column_names(&dpns); - - /* Set up context with one-deep namespace stack */ - context.buf = &buf; - context.namespaces = list_make1(&dpns); - context.windowClause = NIL; - context.windowTList = NIL; - context.varprefix = true; - context.prettyFlags = pretty ? PRETTYFLAG_PAREN | PRETTYFLAG_INDENT : PRETTYFLAG_INDENT; - context.wrapColumn = WRAP_COLUMN_DEFAULT; - context.indentLevel = PRETTYINDENT_STD; - context.special_exprkind = EXPR_KIND_NONE; - - get_rule_expr(qual, &context, false); - - appendStringInfoString(&buf, ") "); - } - - appendStringInfo(&buf, "EXECUTE PROCEDURE %s(", - generate_function_name(trigrec->tgfoid, 0, - NIL, argtypes, - false, NULL, EXPR_KIND_NONE)); - - if (trigrec->tgnargs > 0) - { - char *p; - int i; - - value = fastgetattr(ht_trig, Anum_pg_trigger_tgargs, - tgrel->rd_att, &isnull); - if (isnull) - elog(ERROR, "tgargs is null for trigger %u", trigid); - p = (char *) VARDATA_ANY(DatumGetByteaPP(value)); - for (i = 0; i < trigrec->tgnargs; i++) - { - if (i > 0) - appendStringInfoString(&buf, ", "); - simple_quote_literal(&buf, p); - /* advance p to next string embedded in tgargs */ - while (*p) - p++; - p++; - } - } - - /* We deliberately do not put semi-colon at end */ - appendStringInfoChar(&buf, ')'); - - /* Clean up */ - systable_endscan(tgscan); - - heap_close(tgrel, AccessShareLock); - - return buf.data; -} - -/* ---------- - * get_indexdef - Get the definition of an index - * - * In the extended version, there is a colno argument as well as pretty bool. - * if colno == 0, we want a complete index definition. - * if colno > 0, we only want the Nth index key's variable or expression. - * - * Note that the SQL-function versions of this omit any info about the - * index tablespace; this is intentional because pg_dump wants it that way. - * However pg_get_indexdef_string() includes the index tablespace. - * ---------- - */ -Datum -pg_get_indexdef(PG_FUNCTION_ARGS) -{ - Oid indexrelid = PG_GETARG_OID(0); - int prettyFlags; - char *res; - - prettyFlags = PRETTYFLAG_INDENT; - - res = pg_get_indexdef_worker(indexrelid, 0, NULL, false, false, - prettyFlags, true); - - if (res == NULL) - PG_RETURN_NULL(); - - PG_RETURN_TEXT_P(string_to_text(res)); -} - -Datum -pg_get_indexdef_ext(PG_FUNCTION_ARGS) -{ - Oid indexrelid = PG_GETARG_OID(0); - int32 colno = PG_GETARG_INT32(1); - bool pretty = PG_GETARG_BOOL(2); - int prettyFlags; - char *res; - - prettyFlags = pretty ? PRETTYFLAG_PAREN | PRETTYFLAG_INDENT : PRETTYFLAG_INDENT; - - res = pg_get_indexdef_worker(indexrelid, colno, NULL, colno != 0, false, - prettyFlags, true); - - if (res == NULL) - PG_RETURN_NULL(); - - PG_RETURN_TEXT_P(string_to_text(res)); -} - -/* - * Internal version for use by ALTER TABLE. - * Includes a tablespace clause in the result. - * Returns a palloc'd C string; no pretty-printing. 
- */ -char * -pg_get_indexdef_string(Oid indexrelid) -{ - return pg_get_indexdef_worker(indexrelid, 0, NULL, false, true, 0, false); -} - -/* Internal version that just reports the column definitions */ -char * -pg_get_indexdef_columns(Oid indexrelid, bool pretty) -{ - int prettyFlags; - - prettyFlags = pretty ? PRETTYFLAG_PAREN | PRETTYFLAG_INDENT : PRETTYFLAG_INDENT; - return pg_get_indexdef_worker(indexrelid, 0, NULL, true, false, - prettyFlags, false); -} - -/* - * Internal workhorse to decompile an index definition. - * - * This is now used for exclusion constraints as well: if excludeOps is not - * NULL then it points to an array of exclusion operator OIDs. - */ -static char * -pg_get_indexdef_worker(Oid indexrelid, int colno, - const Oid *excludeOps, - bool attrsOnly, bool showTblSpc, - int prettyFlags, bool missing_ok) -{// #lizard forgives - /* might want a separate isConstraint parameter later */ - bool isConstraint = (excludeOps != NULL); - HeapTuple ht_idx; - HeapTuple ht_idxrel; - HeapTuple ht_am; - Form_pg_index idxrec; - Form_pg_class idxrelrec; - Form_pg_am amrec; - IndexAmRoutine *amroutine; - List *indexprs; - ListCell *indexpr_item; - List *context; - Oid indrelid; - int keyno; - Datum indcollDatum; - Datum indclassDatum; - Datum indoptionDatum; - bool isnull; - oidvector *indcollation; - oidvector *indclass; - int2vector *indoption; - StringInfoData buf; - char *str; - char *sep; -#ifdef __TBASE__ - bool is_interval_child = false; - HeapTuple ht_parent_idx; -#endif - /* - * Fetch the pg_index tuple by the Oid of the index - */ - ht_idx = SearchSysCache1(INDEXRELID, ObjectIdGetDatum(indexrelid)); - if (!HeapTupleIsValid(ht_idx)) - { - if (missing_ok) - return NULL; - elog(ERROR, "cache lookup failed for index %u", indexrelid); - } - idxrec = (Form_pg_index) GETSTRUCT(ht_idx); - - indrelid = idxrec->indrelid; - Assert(indexrelid == idxrec->indexrelid); - - /* Must get indcollation, indclass, and indoption the hard way */ - indcollDatum = SysCacheGetAttr(INDEXRELID, ht_idx, - Anum_pg_index_indcollation, &isnull); - Assert(!isnull); - indcollation = (oidvector *) DatumGetPointer(indcollDatum); - - indclassDatum = SysCacheGetAttr(INDEXRELID, ht_idx, - Anum_pg_index_indclass, &isnull); - Assert(!isnull); - indclass = (oidvector *) DatumGetPointer(indclassDatum); - - indoptionDatum = SysCacheGetAttr(INDEXRELID, ht_idx, - Anum_pg_index_indoption, &isnull); - Assert(!isnull); - indoption = (int2vector *) DatumGetPointer(indoptionDatum); - - /* - * Fetch the pg_class tuple of the index relation - */ - ht_idxrel = SearchSysCache1(RELOID, ObjectIdGetDatum(indexrelid)); - if (!HeapTupleIsValid(ht_idxrel)) - elog(ERROR, "cache lookup failed for relation %u", indexrelid); - idxrelrec = (Form_pg_class) GETSTRUCT(ht_idxrel); - - /* - * Fetch the pg_am tuple of the index' access method - */ - ht_am = SearchSysCache1(AMOID, ObjectIdGetDatum(idxrelrec->relam)); - if (!HeapTupleIsValid(ht_am)) - elog(ERROR, "cache lookup failed for access method %u", - idxrelrec->relam); - amrec = (Form_pg_am) GETSTRUCT(ht_am); - - /* Fetch the index AM's API struct */ - amroutine = GetIndexAmRoutine(amrec->amhandler); - - /* - * Get the index expressions, if any. (NOTE: we do not use the relcache - * versions of the expressions and predicate, because we want to display - * non-const-folded expressions.) 
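For context, the worker being assembled here ultimately serves the SQL-callable pg_get_indexdef functions; typical calls, using a hypothetical index name, look like:

    SELECT pg_get_indexdef('my_index'::regclass);            -- full CREATE INDEX statement
    SELECT pg_get_indexdef('my_index'::regclass, 1, true);   -- only the first key column, pretty-printed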
- */ - if (!heap_attisnull(ht_idx, Anum_pg_index_indexprs, NULL)) - { - Datum exprsDatum; - bool isnull; - char *exprsString; - - exprsDatum = SysCacheGetAttr(INDEXRELID, ht_idx, - Anum_pg_index_indexprs, &isnull); - Assert(!isnull); - exprsString = TextDatumGetCString(exprsDatum); - indexprs = (List *) stringToNode(exprsString); - pfree(exprsString); - } - else - indexprs = NIL; - - indexpr_item = list_head(indexprs); - - context = deparse_context_for(get_relation_name(indrelid), indrelid); - - /* - * Start the index definition. Note that the index's name should never be - * schema-qualified, but the indexed rel's name may be. - */ - initStringInfo(&buf); - - if (!attrsOnly) - { - if (!isConstraint) - appendStringInfo(&buf, "CREATE %sINDEX %s ON %s USING %s (", - idxrec->indisunique ? "UNIQUE " : "", - quote_identifier(NameStr(idxrelrec->relname)), - generate_relation_name(indrelid, NIL), - quote_identifier(NameStr(amrec->amname))); - else /* currently, must be EXCLUDE constraint */ - appendStringInfo(&buf, "EXCLUDE USING %s (", - quote_identifier(NameStr(amrec->amname))); - } - - /* - * Report the indexed attributes - */ -#ifdef __TBASE__ - { - Relation rel = relation_open(indrelid, NoLock); - if (rel->rd_rel->relkind == RELKIND_RELATION && RELATION_IS_CHILD(rel)) - { - Oid parentIndexId = get_interval_parent_relid(indexrelid); - Oid parentId = get_interval_parent_relid(indrelid); - if (!OidIsValid(parentId)) - { - elog(ERROR, "could not get interval parent for relation %u", - indrelid); - } - indrelid = parentId; - - if (OidIsValid(parentIndexId)) - { - ht_parent_idx = SearchSysCache1(INDEXRELID, ObjectIdGetDatum(parentIndexId)); - if (!HeapTupleIsValid(ht_parent_idx)) - { - if (missing_ok) - return NULL; - elog(ERROR, "cache lookup failed for index %u", parentIndexId); - } - idxrec = (Form_pg_index) GETSTRUCT(ht_parent_idx); - is_interval_child = true; - } - } - heap_close(rel, NoLock); - } -#endif - sep = ""; - for (keyno = 0; keyno < idxrec->indnatts; keyno++) - { - AttrNumber attnum = idxrec->indkey.values[keyno]; - int16 opt = indoption->values[keyno]; - Oid keycoltype; - Oid keycolcollation; - - if (!colno) - appendStringInfoString(&buf, sep); - sep = ", "; - - if (attnum != 0) - { - /* Simple index column */ - char *attname; - int32 keycoltypmod; - - attname = get_relid_attribute_name(indrelid, attnum); - if (!colno || colno == keyno + 1) - appendStringInfoString(&buf, quote_identifier(attname)); - get_atttypetypmodcoll(indrelid, attnum, - &keycoltype, &keycoltypmod, - &keycolcollation); - } - else - { - /* expressional index */ - Node *indexkey; - - if (indexpr_item == NULL) - elog(ERROR, "too few entries in indexprs list"); - indexkey = (Node *) lfirst(indexpr_item); - indexpr_item = lnext(indexpr_item); - /* Deparse */ - str = deparse_expression_pretty(indexkey, context, false, false, - prettyFlags, 0); - if (!colno || colno == keyno + 1) - { - /* Need parens if it's not a bare function call */ - if (looks_like_function(indexkey)) - appendStringInfoString(&buf, str); - else - appendStringInfo(&buf, "(%s)", str); - } - keycoltype = exprType(indexkey); - keycolcollation = exprCollation(indexkey); - } - - if (!attrsOnly && (!colno || colno == keyno + 1)) - { - Oid indcoll; - - /* Add collation, if not default for column */ - indcoll = indcollation->values[keyno]; - if (OidIsValid(indcoll) && indcoll != keycolcollation) - appendStringInfo(&buf, " COLLATE %s", - generate_collation_name((indcoll))); - - /* Add the operator class name, if not default */ - 
get_opclass_name(indclass->values[keyno], keycoltype, &buf); - - /* Add options if relevant */ - if (amroutine->amcanorder) - { - /* if it supports sort ordering, report DESC and NULLS opts */ - if (opt & INDOPTION_DESC) - { - appendStringInfoString(&buf, " DESC"); - /* NULLS FIRST is the default in this case */ - if (!(opt & INDOPTION_NULLS_FIRST)) - appendStringInfoString(&buf, " NULLS LAST"); - } - else - { - if (opt & INDOPTION_NULLS_FIRST) - appendStringInfoString(&buf, " NULLS FIRST"); - } - } - - /* Add the exclusion operator if relevant */ - if (excludeOps != NULL) - appendStringInfo(&buf, " WITH %s", - generate_operator_name(excludeOps[keyno], - keycoltype, - keycoltype)); - } - } - - if (!attrsOnly) - { - appendStringInfoChar(&buf, ')'); - - /* - * If it has options, append "WITH (options)" - */ - str = flatten_reloptions(indexrelid); - if (str) - { - appendStringInfo(&buf, " WITH (%s)", str); - pfree(str); - } - - /* - * Print tablespace, but only if requested - */ - if (showTblSpc) - { - Oid tblspc; - - tblspc = get_rel_tablespace(indexrelid); - if (!OidIsValid(tblspc)) - tblspc = MyDatabaseTableSpace; - if (isConstraint) - appendStringInfoString(&buf, " USING INDEX"); - appendStringInfo(&buf, " TABLESPACE %s", - quote_identifier(get_tablespace_name(tblspc))); - } - - /* - * If it's a partial index, decompile and append the predicate - */ - if (!heap_attisnull(ht_idx, Anum_pg_index_indpred, NULL)) - { - Node *node; - Datum predDatum; - bool isnull; - char *predString; - - /* Convert text string to node tree */ - predDatum = SysCacheGetAttr(INDEXRELID, ht_idx, - Anum_pg_index_indpred, &isnull); - Assert(!isnull); - predString = TextDatumGetCString(predDatum); - node = (Node *) stringToNode(predString); - pfree(predString); - - /* Deparse */ - str = deparse_expression_pretty(node, context, false, false, - prettyFlags, 0); - if (isConstraint) - appendStringInfo(&buf, " WHERE (%s)", str); - else - appendStringInfo(&buf, " WHERE %s", str); - } - } - - /* Clean up */ - ReleaseSysCache(ht_idx); - ReleaseSysCache(ht_idxrel); - ReleaseSysCache(ht_am); -#ifdef __TBASE__ - if (is_interval_child) - { - ReleaseSysCache(ht_parent_idx); - } -#endif - return buf.data; -} - -/* - * pg_get_statisticsobjdef - * Get the definition of an extended statistics object - */ -Datum -pg_get_statisticsobjdef(PG_FUNCTION_ARGS) -{ - Oid statextid = PG_GETARG_OID(0); - char *res; - - res = pg_get_statisticsobj_worker(statextid, true); - - if (res == NULL) - PG_RETURN_NULL(); - - PG_RETURN_TEXT_P(string_to_text(res)); -} - -/* - * Internal workhorse to decompile an extended statistics object. - */ -static char * -pg_get_statisticsobj_worker(Oid statextid, bool missing_ok) -{// #lizard forgives - Form_pg_statistic_ext statextrec; - HeapTuple statexttup; - StringInfoData buf; - int colno; - char *nsp; - ArrayType *arr; - char *enabled; - Datum datum; - bool isnull; - bool ndistinct_enabled; - bool dependencies_enabled; - int i; - - statexttup = SearchSysCache1(STATEXTOID, ObjectIdGetDatum(statextid)); - - if (!HeapTupleIsValid(statexttup)) - { - if (missing_ok) - return NULL; - elog(ERROR, "cache lookup failed for statistics object %u", statextid); - } - - statextrec = (Form_pg_statistic_ext) GETSTRUCT(statexttup); - - initStringInfo(&buf); - - nsp = get_namespace_name(statextrec->stxnamespace); - appendStringInfo(&buf, "CREATE STATISTICS %s", - quote_qualified_identifier(nsp, - NameStr(statextrec->stxname))); - - /* - * Decode the stxkind column so that we know which stats types to print. 
- */ - datum = SysCacheGetAttr(STATEXTOID, statexttup, - Anum_pg_statistic_ext_stxkind, &isnull); - Assert(!isnull); - arr = DatumGetArrayTypeP(datum); - if (ARR_NDIM(arr) != 1 || - ARR_HASNULL(arr) || - ARR_ELEMTYPE(arr) != CHAROID) - elog(ERROR, "stxkind is not a 1-D char array"); - enabled = (char *) ARR_DATA_PTR(arr); - - ndistinct_enabled = false; - dependencies_enabled = false; - - for (i = 0; i < ARR_DIMS(arr)[0]; i++) - { - if (enabled[i] == STATS_EXT_NDISTINCT) - ndistinct_enabled = true; - if (enabled[i] == STATS_EXT_DEPENDENCIES) - dependencies_enabled = true; - } - - /* - * If any option is disabled, then we'll need to append the types clause - * to show which options are enabled. We omit the types clause on purpose - * when all options are enabled, so a pg_dump/pg_restore will create all - * statistics types on a newer postgres version, if the statistics had all - * options enabled on the original version. - */ - if (!ndistinct_enabled || !dependencies_enabled) - { - appendStringInfoString(&buf, " ("); - if (ndistinct_enabled) - appendStringInfoString(&buf, "ndistinct"); - else if (dependencies_enabled) - appendStringInfoString(&buf, "dependencies"); - appendStringInfoChar(&buf, ')'); - } - - appendStringInfoString(&buf, " ON "); - - for (colno = 0; colno < statextrec->stxkeys.dim1; colno++) - { - AttrNumber attnum = statextrec->stxkeys.values[colno]; - char *attname; - - if (colno > 0) - appendStringInfoString(&buf, ", "); - - attname = get_relid_attribute_name(statextrec->stxrelid, attnum); - - appendStringInfoString(&buf, quote_identifier(attname)); - } - - appendStringInfo(&buf, " FROM %s", - generate_relation_name(statextrec->stxrelid, NIL)); - - ReleaseSysCache(statexttup); - - return buf.data; -} - -/* - * pg_get_partkeydef - * - * Returns the partition key specification, ie, the following: - * - * PARTITION BY { RANGE | LIST } (column opt_collation opt_opclass [, ...]) - */ -Datum -pg_get_partkeydef(PG_FUNCTION_ARGS) -{ - Oid relid = PG_GETARG_OID(0); - char *res; - - res = pg_get_partkeydef_worker(relid, PRETTYFLAG_INDENT, false, true); - - if (res == NULL) - PG_RETURN_NULL(); - - PG_RETURN_TEXT_P(string_to_text(res)); -} - -/* Internal version that just reports the column definitions */ -char * -pg_get_partkeydef_columns(Oid relid, bool pretty) -{ - int prettyFlags; - - prettyFlags = pretty ? PRETTYFLAG_PAREN | PRETTYFLAG_INDENT : PRETTYFLAG_INDENT; - return pg_get_partkeydef_worker(relid, prettyFlags, true, false); -} - -/* - * Internal workhorse to decompile a partition key definition. 
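The partition-key worker that follows backs pg_get_partkeydef(); a sketch of its use, with a hypothetical partitioned table:

    SELECT pg_get_partkeydef('my_parted_table'::regclass);
    -- yields text of the form documented above, e.g. PARTITION BY RANGE (some_column)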
- */ -static char * -pg_get_partkeydef_worker(Oid relid, int prettyFlags, - bool attrsOnly, bool missing_ok) -{// #lizard forgives - Form_pg_partitioned_table form; - HeapTuple tuple; - oidvector *partclass; - oidvector *partcollation; - List *partexprs; - ListCell *partexpr_item; - List *context; - Datum datum; - bool isnull; - StringInfoData buf; - int keyno; - char *str; - char *sep; - - tuple = SearchSysCache1(PARTRELID, ObjectIdGetDatum(relid)); - if (!HeapTupleIsValid(tuple)) - { - if (missing_ok) - return NULL; - elog(ERROR, "cache lookup failed for partition key of %u", relid); - } - - form = (Form_pg_partitioned_table) GETSTRUCT(tuple); - - Assert(form->partrelid == relid); - - /* Must get partclass and partcollation the hard way */ - datum = SysCacheGetAttr(PARTRELID, tuple, - Anum_pg_partitioned_table_partclass, &isnull); - Assert(!isnull); - partclass = (oidvector *) DatumGetPointer(datum); - - datum = SysCacheGetAttr(PARTRELID, tuple, - Anum_pg_partitioned_table_partcollation, &isnull); - Assert(!isnull); - partcollation = (oidvector *) DatumGetPointer(datum); - - - /* - * Get the expressions, if any. (NOTE: we do not use the relcache - * versions of the expressions, because we want to display - * non-const-folded expressions.) - */ - if (!heap_attisnull(tuple, Anum_pg_partitioned_table_partexprs, NULL)) - { - Datum exprsDatum; - bool isnull; - char *exprsString; - - exprsDatum = SysCacheGetAttr(PARTRELID, tuple, - Anum_pg_partitioned_table_partexprs, &isnull); - Assert(!isnull); - exprsString = TextDatumGetCString(exprsDatum); - partexprs = (List *) stringToNode(exprsString); - - if (!IsA(partexprs, List)) - elog(ERROR, "unexpected node type found in partexprs: %d", - (int) nodeTag(partexprs)); - - pfree(exprsString); - } - else - partexprs = NIL; - - partexpr_item = list_head(partexprs); - context = deparse_context_for(get_relation_name(relid), relid); - - initStringInfo(&buf); - - switch (form->partstrat) - { - case PARTITION_STRATEGY_LIST: - if (!attrsOnly) - appendStringInfo(&buf, "LIST"); - break; - case PARTITION_STRATEGY_RANGE: - if (!attrsOnly) - appendStringInfo(&buf, "RANGE"); - break; - default: - elog(ERROR, "unexpected partition strategy: %d", - (int) form->partstrat); - } - - if (!attrsOnly) - appendStringInfo(&buf, " ("); - sep = ""; - for (keyno = 0; keyno < form->partnatts; keyno++) - { - AttrNumber attnum = form->partattrs.values[keyno]; - Oid keycoltype; - Oid keycolcollation; - Oid partcoll; - - appendStringInfoString(&buf, sep); - sep = ", "; - if (attnum != 0) - { - /* Simple attribute reference */ - char *attname; - int32 keycoltypmod; - - attname = get_relid_attribute_name(relid, attnum); - appendStringInfoString(&buf, quote_identifier(attname)); - get_atttypetypmodcoll(relid, attnum, - &keycoltype, &keycoltypmod, - &keycolcollation); - } - else - { - /* Expression */ - Node *partkey; - - if (partexpr_item == NULL) - elog(ERROR, "too few entries in partexprs list"); - partkey = (Node *) lfirst(partexpr_item); - partexpr_item = lnext(partexpr_item); - - /* Deparse */ - str = deparse_expression_pretty(partkey, context, false, false, - prettyFlags, 0); - /* Need parens if it's not a bare function call */ - if (looks_like_function(partkey)) - appendStringInfoString(&buf, str); - else - appendStringInfo(&buf, "(%s)", str); - - keycoltype = exprType(partkey); - keycolcollation = exprCollation(partkey); - } - - /* Add collation, if not default for column */ - partcoll = partcollation->values[keyno]; - if (!attrsOnly && OidIsValid(partcoll) && partcoll != 
keycolcollation) - appendStringInfo(&buf, " COLLATE %s", - generate_collation_name((partcoll))); - - /* Add the operator class name, if not default */ - if (!attrsOnly) - get_opclass_name(partclass->values[keyno], keycoltype, &buf); - } - - if (!attrsOnly) - appendStringInfoChar(&buf, ')'); - - /* Clean up */ - ReleaseSysCache(tuple); - - return buf.data; -} - -/* - * pg_get_partition_constraintdef - * - * Returns partition constraint expression as a string for the input relation - */ -Datum -pg_get_partition_constraintdef(PG_FUNCTION_ARGS) -{ - Oid relationId = PG_GETARG_OID(0); - Expr *constr_expr; - int prettyFlags; - List *context; - char *consrc; - - constr_expr = get_partition_qual_relid(relationId); - - /* Quick exit if not a partition */ - if (constr_expr == NULL) - PG_RETURN_NULL(); - - /* - * Deparse and return the constraint expression. - */ - prettyFlags = PRETTYFLAG_INDENT; - context = deparse_context_for(get_relation_name(relationId), relationId); - consrc = deparse_expression_pretty((Node *) constr_expr, context, false, - false, prettyFlags, 0); - - PG_RETURN_TEXT_P(string_to_text(consrc)); -} - -/* - * pg_get_constraintdef - * - * Returns the definition for the constraint, ie, everything that needs to - * appear after "ALTER TABLE ... ADD CONSTRAINT ". - */ -Datum -pg_get_constraintdef(PG_FUNCTION_ARGS) -{ - Oid constraintId = PG_GETARG_OID(0); - int prettyFlags; - char *res; - - prettyFlags = PRETTYFLAG_INDENT; - - res = pg_get_constraintdef_worker(constraintId, false, prettyFlags, true); - - if (res == NULL) - PG_RETURN_NULL(); - - PG_RETURN_TEXT_P(string_to_text(res)); -} - -Datum -pg_get_constraintdef_ext(PG_FUNCTION_ARGS) -{ - Oid constraintId = PG_GETARG_OID(0); - bool pretty = PG_GETARG_BOOL(1); - int prettyFlags; - char *res; - - prettyFlags = pretty ? PRETTYFLAG_PAREN | PRETTYFLAG_INDENT : PRETTYFLAG_INDENT; - - res = pg_get_constraintdef_worker(constraintId, false, prettyFlags, true); - - if (res == NULL) - PG_RETURN_NULL(); - - PG_RETURN_TEXT_P(string_to_text(res)); -} - -/* - * Internal version that returns a full ALTER TABLE ... ADD CONSTRAINT command - */ -char * -pg_get_constraintdef_command(Oid constraintId) -{ - return pg_get_constraintdef_worker(constraintId, true, 0, false); -} - -/* - * As of 9.4, we now use an MVCC snapshot for this. - */ -static char * -pg_get_constraintdef_worker(Oid constraintId, bool fullCommand, - int prettyFlags, bool missing_ok) -{// #lizard forgives - HeapTuple tup; - Form_pg_constraint conForm; - StringInfoData buf; - SysScanDesc scandesc; - ScanKeyData scankey[1]; - Snapshot snapshot = RegisterSnapshot(GetTransactionSnapshot()); - Relation relation = heap_open(ConstraintRelationId, AccessShareLock); - - ScanKeyInit(&scankey[0], - ObjectIdAttributeNumber, - BTEqualStrategyNumber, F_OIDEQ, - ObjectIdGetDatum(constraintId)); - - scandesc = systable_beginscan(relation, - ConstraintOidIndexId, - true, - snapshot, - 1, - scankey); - - /* - * We later use the tuple with SysCacheGetAttr() as if we had obtained it - * via SearchSysCache, which works fine. 
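The constraint decompiler being set up here is what a catalog query like the following exercises (table name hypothetical):

    SELECT conname, pg_get_constraintdef(oid, true)
    FROM   pg_constraint
    WHERE  conrelid = 'my_table'::regclass;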
- */ - tup = systable_getnext(scandesc); - - UnregisterSnapshot(snapshot); - - if (!HeapTupleIsValid(tup)) - { - if (missing_ok) - { - systable_endscan(scandesc); - heap_close(relation, AccessShareLock); - return NULL; - } - elog(ERROR, "could not find tuple for constraint %u", constraintId); - } - - conForm = (Form_pg_constraint) GETSTRUCT(tup); - - initStringInfo(&buf); - - if (fullCommand) - { - /* - * Currently, callers want ALTER TABLE (without ONLY) for CHECK - * constraints, and other types of constraints don't inherit anyway so - * it doesn't matter whether we say ONLY or not. Someday we might - * need to let callers specify whether to put ONLY in the command. - */ - appendStringInfo(&buf, "ALTER TABLE %s ADD CONSTRAINT %s ", - generate_qualified_relation_name(conForm->conrelid), - quote_identifier(NameStr(conForm->conname))); - } - - switch (conForm->contype) - { - case CONSTRAINT_FOREIGN: - { - Datum val; - bool isnull; - const char *string; - - /* Start off the constraint definition */ - appendStringInfoString(&buf, "FOREIGN KEY ("); - - /* Fetch and build referencing-column list */ - val = SysCacheGetAttr(CONSTROID, tup, - Anum_pg_constraint_conkey, &isnull); - if (isnull) - elog(ERROR, "null conkey for constraint %u", - constraintId); - - decompile_column_index_array(val, conForm->conrelid, &buf); - - /* add foreign relation name */ - appendStringInfo(&buf, ") REFERENCES %s(", - generate_relation_name(conForm->confrelid, - NIL)); - - /* Fetch and build referenced-column list */ - val = SysCacheGetAttr(CONSTROID, tup, - Anum_pg_constraint_confkey, &isnull); - if (isnull) - elog(ERROR, "null confkey for constraint %u", - constraintId); - - decompile_column_index_array(val, conForm->confrelid, &buf); - - appendStringInfoChar(&buf, ')'); - - /* Add match type */ - switch (conForm->confmatchtype) - { - case FKCONSTR_MATCH_FULL: - string = " MATCH FULL"; - break; - case FKCONSTR_MATCH_PARTIAL: - string = " MATCH PARTIAL"; - break; - case FKCONSTR_MATCH_SIMPLE: - string = ""; - break; - default: - elog(ERROR, "unrecognized confmatchtype: %d", - conForm->confmatchtype); - string = ""; /* keep compiler quiet */ - break; - } - appendStringInfoString(&buf, string); - - /* Add ON UPDATE and ON DELETE clauses, if needed */ - switch (conForm->confupdtype) - { - case FKCONSTR_ACTION_NOACTION: - string = NULL; /* suppress default */ - break; - case FKCONSTR_ACTION_RESTRICT: - string = "RESTRICT"; - break; - case FKCONSTR_ACTION_CASCADE: - string = "CASCADE"; - break; - case FKCONSTR_ACTION_SETNULL: - string = "SET NULL"; - break; - case FKCONSTR_ACTION_SETDEFAULT: - string = "SET DEFAULT"; - break; - default: - elog(ERROR, "unrecognized confupdtype: %d", - conForm->confupdtype); - string = NULL; /* keep compiler quiet */ - break; - } - if (string) - appendStringInfo(&buf, " ON UPDATE %s", string); - - switch (conForm->confdeltype) - { - case FKCONSTR_ACTION_NOACTION: - string = NULL; /* suppress default */ - break; - case FKCONSTR_ACTION_RESTRICT: - string = "RESTRICT"; - break; - case FKCONSTR_ACTION_CASCADE: - string = "CASCADE"; - break; - case FKCONSTR_ACTION_SETNULL: - string = "SET NULL"; - break; - case FKCONSTR_ACTION_SETDEFAULT: - string = "SET DEFAULT"; - break; - default: - elog(ERROR, "unrecognized confdeltype: %d", - conForm->confdeltype); - string = NULL; /* keep compiler quiet */ - break; - } - if (string) - appendStringInfo(&buf, " ON DELETE %s", string); - - break; - } - case CONSTRAINT_PRIMARY: - case CONSTRAINT_UNIQUE: - { - Datum val; - bool isnull; - Oid indexId; - - /* 
Start off the constraint definition */ - if (conForm->contype == CONSTRAINT_PRIMARY) - appendStringInfoString(&buf, "PRIMARY KEY ("); - else - appendStringInfoString(&buf, "UNIQUE ("); - - /* Fetch and build target column list */ - val = SysCacheGetAttr(CONSTROID, tup, - Anum_pg_constraint_conkey, &isnull); - if (isnull) - elog(ERROR, "null conkey for constraint %u", - constraintId); - - decompile_column_index_array(val, conForm->conrelid, &buf); - - appendStringInfoChar(&buf, ')'); - - indexId = get_constraint_index(constraintId); - - /* XXX why do we only print these bits if fullCommand? */ - if (fullCommand && OidIsValid(indexId)) - { - char *options = flatten_reloptions(indexId); - Oid tblspc; - - if (options) - { - appendStringInfo(&buf, " WITH (%s)", options); - pfree(options); - } - - tblspc = get_rel_tablespace(indexId); - if (OidIsValid(tblspc)) - appendStringInfo(&buf, " USING INDEX TABLESPACE %s", - quote_identifier(get_tablespace_name(tblspc))); - } - - break; - } - case CONSTRAINT_CHECK: - { - Datum val; - bool isnull; - char *conbin; - char *consrc; - Node *expr; - List *context; - - /* Fetch constraint expression in parsetree form */ - val = SysCacheGetAttr(CONSTROID, tup, - Anum_pg_constraint_conbin, &isnull); - if (isnull) - elog(ERROR, "null conbin for constraint %u", - constraintId); - - conbin = TextDatumGetCString(val); - expr = stringToNode(conbin); - - /* Set up deparsing context for Var nodes in constraint */ - if (conForm->conrelid != InvalidOid) - { - /* relation constraint */ - context = deparse_context_for(get_relation_name(conForm->conrelid), - conForm->conrelid); - } - else - { - /* domain constraint --- can't have Vars */ - context = NIL; - } - - consrc = deparse_expression_pretty(expr, context, false, false, - prettyFlags, 0); - - /* - * Now emit the constraint definition, adding NO INHERIT if - * necessary. - * - * There are cases where the constraint expression will be - * fully parenthesized and we don't need the outer parens ... - * but there are other cases where we do need 'em. Be - * conservative for now. - * - * Note that simply checking for leading '(' and trailing ')' - * would NOT be good enough, consider "(x > 0) AND (y > 0)". - */ - appendStringInfo(&buf, "CHECK (%s)%s", - consrc, - conForm->connoinherit ? " NO INHERIT" : ""); - break; - } - case CONSTRAINT_TRIGGER: - - /* - * There isn't an ALTER TABLE syntax for creating a user-defined - * constraint trigger, but it seems better to print something than - * throw an error; if we throw error then this function couldn't - * safely be applied to all rows of pg_constraint. 
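For reference, the text assembled by this worker is what the SQL-callable wrappers return. A quick way to inspect it from psql might look like the following; the relation names my_table and my_partition are only illustrative assumptions:

    -- my_table / my_partition are placeholders, not objects defined by this patch
    SELECT conname, pg_get_constraintdef(oid, true)
      FROM pg_constraint
     WHERE conrelid = 'my_table'::regclass;

    SELECT pg_get_partition_constraintdef('my_partition'::regclass);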
- */ - appendStringInfoString(&buf, "TRIGGER"); - break; - case CONSTRAINT_EXCLUSION: - { - Oid indexOid = conForm->conindid; - Datum val; - bool isnull; - Datum *elems; - int nElems; - int i; - Oid *operators; - - /* Extract operator OIDs from the pg_constraint tuple */ - val = SysCacheGetAttr(CONSTROID, tup, - Anum_pg_constraint_conexclop, - &isnull); - if (isnull) - elog(ERROR, "null conexclop for constraint %u", - constraintId); - - deconstruct_array(DatumGetArrayTypeP(val), - OIDOID, sizeof(Oid), true, 'i', - &elems, NULL, &nElems); - - operators = (Oid *) palloc(nElems * sizeof(Oid)); - for (i = 0; i < nElems; i++) - operators[i] = DatumGetObjectId(elems[i]); - - /* pg_get_indexdef_worker does the rest */ - /* suppress tablespace because pg_dump wants it that way */ - appendStringInfoString(&buf, - pg_get_indexdef_worker(indexOid, - 0, - operators, - false, - false, - prettyFlags, - false)); - break; - } - default: - elog(ERROR, "invalid constraint type \"%c\"", conForm->contype); - break; - } - - if (conForm->condeferrable) - appendStringInfoString(&buf, " DEFERRABLE"); - if (conForm->condeferred) - appendStringInfoString(&buf, " INITIALLY DEFERRED"); - if (!conForm->convalidated) - appendStringInfoString(&buf, " NOT VALID"); - - /* Cleanup */ - systable_endscan(scandesc); - heap_close(relation, AccessShareLock); - - return buf.data; -} - - -/* - * Convert an int16[] Datum into a comma-separated list of column names - * for the indicated relation; append the list to buf. - */ -static void -decompile_column_index_array(Datum column_index_array, Oid relId, - StringInfo buf) -{ - Datum *keys; - int nKeys; - int j; - - /* Extract data from array of int16 */ - deconstruct_array(DatumGetArrayTypeP(column_index_array), - INT2OID, 2, true, 's', - &keys, NULL, &nKeys); - - for (j = 0; j < nKeys; j++) - { - char *colName; - - colName = get_relid_attribute_name(relId, DatumGetInt16(keys[j])); - - if (j == 0) - appendStringInfoString(buf, quote_identifier(colName)); - else - appendStringInfo(buf, ", %s", quote_identifier(colName)); - } -} - - -/* ---------- - * get_expr - Decompile an expression tree - * - * Input: an expression tree in nodeToString form, and a relation OID - * - * Output: reverse-listed expression - * - * Currently, the expression can only refer to a single relation, namely - * the one specified by the second parameter. This is sufficient for - * partial indexes, column default expressions, etc. We also support - * Var-free expressions, for which the OID can be InvalidOid. - * ---------- - */ -Datum -pg_get_expr(PG_FUNCTION_ARGS) -{ - text *expr = PG_GETARG_TEXT_PP(0); - Oid relid = PG_GETARG_OID(1); - int prettyFlags; - char *relname; - - prettyFlags = PRETTYFLAG_INDENT; - - if (OidIsValid(relid)) - { - /* Get the name for the relation */ - relname = get_rel_name(relid); - - /* - * If the OID isn't actually valid, don't throw an error, just return - * NULL. This is a bit questionable, but it's what we've done - * historically, and it can help avoid unwanted failures when - * examining catalog entries for just-deleted relations. - */ - if (relname == NULL) - PG_RETURN_NULL(); - } - else - relname = NULL; - - PG_RETURN_TEXT_P(pg_get_expr_worker(expr, relid, relname, prettyFlags)); -} - -Datum -pg_get_expr_ext(PG_FUNCTION_ARGS) -{ - text *expr = PG_GETARG_TEXT_PP(0); - Oid relid = PG_GETARG_OID(1); - bool pretty = PG_GETARG_BOOL(2); - int prettyFlags; - char *relname; - - prettyFlags = pretty ? 
PRETTYFLAG_PAREN | PRETTYFLAG_INDENT : PRETTYFLAG_INDENT; - - if (OidIsValid(relid)) - { - /* Get the name for the relation */ - relname = get_rel_name(relid); - /* See notes above */ - if (relname == NULL) - PG_RETURN_NULL(); - } - else - relname = NULL; - - PG_RETURN_TEXT_P(pg_get_expr_worker(expr, relid, relname, prettyFlags)); -} - -static text * -pg_get_expr_worker(text *expr, Oid relid, const char *relname, int prettyFlags) -{ - Node *node; - List *context; - char *exprstr; - char *str; - - /* Convert input TEXT object to C string */ - exprstr = text_to_cstring(expr); - - /* Convert expression to node tree */ - node = (Node *) stringToNode(exprstr); - - pfree(exprstr); - - /* Prepare deparse context if needed */ - if (OidIsValid(relid)) - context = deparse_context_for(relname, relid); - else - context = NIL; - - /* Deparse */ - str = deparse_expression_pretty(node, context, false, false, - prettyFlags, 0); - - return string_to_text(str); -} - - -/* ---------- - * get_userbyid - Get a user name by roleid and - * fallback to 'unknown (OID=n)' - * ---------- - */ -Datum -pg_get_userbyid(PG_FUNCTION_ARGS) -{ - Oid roleid = PG_GETARG_OID(0); - Name result; - HeapTuple roletup; - Form_pg_authid role_rec; - - /* - * Allocate space for the result - */ - result = (Name) palloc(NAMEDATALEN); - memset(NameStr(*result), 0, NAMEDATALEN); - - /* - * Get the pg_authid entry and print the result - */ - roletup = SearchSysCache1(AUTHOID, ObjectIdGetDatum(roleid)); - if (HeapTupleIsValid(roletup)) - { - role_rec = (Form_pg_authid) GETSTRUCT(roletup); - StrNCpy(NameStr(*result), NameStr(role_rec->rolname), NAMEDATALEN); - ReleaseSysCache(roletup); - } - else - sprintf(NameStr(*result), "unknown (OID=%u)", roleid); - - PG_RETURN_NAME(result); -} - - -/* - * pg_get_serial_sequence - * Get the name of the sequence used by a serial column, - * formatted suitably for passing to setval, nextval or currval. - * First parameter is not treated as double-quoted, second parameter - * is --- see documentation for reason. - */ -Datum -pg_get_serial_sequence(PG_FUNCTION_ARGS) -{// #lizard forgives - text *tablename = PG_GETARG_TEXT_PP(0); - text *columnname = PG_GETARG_TEXT_PP(1); - RangeVar *tablerv; - Oid tableOid; - char *column; - AttrNumber attnum; - Oid sequenceId = InvalidOid; - Relation depRel; - ScanKeyData key[3]; - SysScanDesc scan; - HeapTuple tup; - - /* Look up table name. Can't lock it - we might not have privileges. 
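pg_get_expr and pg_get_userbyid are typically combined with catalog queries; a hedged sketch that should run against any TBase/PostgreSQL database is:

    -- deparse stored column-default expressions and show relation owners
    SELECT adrelid::regclass AS tab,
           adnum,
           pg_get_expr(adbin, adrelid) AS default_expr
      FROM pg_attrdef;

    SELECT relname, pg_get_userbyid(relowner) AS owner
      FROM pg_class
     LIMIT 5;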
*/ - tablerv = makeRangeVarFromNameList(textToQualifiedNameList(tablename)); - tableOid = RangeVarGetRelid(tablerv, NoLock, false); - - /* Get the number of the column */ - column = text_to_cstring(columnname); - - attnum = get_attnum(tableOid, column); - if (attnum == InvalidAttrNumber) - ereport(ERROR, - (errcode(ERRCODE_UNDEFINED_COLUMN), - errmsg("column \"%s\" of relation \"%s\" does not exist", - column, tablerv->relname))); - - /* Search the dependency table for the dependent sequence */ - depRel = heap_open(DependRelationId, AccessShareLock); - - ScanKeyInit(&key[0], - Anum_pg_depend_refclassid, - BTEqualStrategyNumber, F_OIDEQ, - ObjectIdGetDatum(RelationRelationId)); - ScanKeyInit(&key[1], - Anum_pg_depend_refobjid, - BTEqualStrategyNumber, F_OIDEQ, - ObjectIdGetDatum(tableOid)); - ScanKeyInit(&key[2], - Anum_pg_depend_refobjsubid, - BTEqualStrategyNumber, F_INT4EQ, - Int32GetDatum(attnum)); - - scan = systable_beginscan(depRel, DependReferenceIndexId, true, - NULL, 3, key); - - while (HeapTupleIsValid(tup = systable_getnext(scan))) - { - Form_pg_depend deprec = (Form_pg_depend) GETSTRUCT(tup); - - /* - * We assume any auto dependency of a sequence on a column must be - * what we are looking for. (We need the relkind test because indexes - * can also have auto dependencies on columns.) - */ - if (deprec->classid == RelationRelationId && - deprec->objsubid == 0 && - deprec->deptype == DEPENDENCY_AUTO && - get_rel_relkind(deprec->objid) == RELKIND_SEQUENCE) - { - sequenceId = deprec->objid; - break; - } - } - - systable_endscan(scan); - heap_close(depRel, AccessShareLock); - - if (OidIsValid(sequenceId)) - { - char *result; - - result = generate_qualified_relation_name(sequenceId); - - PG_RETURN_TEXT_P(string_to_text(result)); - } - - PG_RETURN_NULL(); -} - - -/* - * pg_get_functiondef - * Returns the complete "CREATE OR REPLACE FUNCTION ..." statement for - * the specified function. - * - * Note: if you change the output format of this function, be careful not - * to break psql's rules (in \ef and \sf) for identifying the start of the - * function body. To wit: the function body starts on a line that begins - * with "AS ", and no preceding line will look like that. - */ -Datum -pg_get_functiondef(PG_FUNCTION_ARGS) -{// #lizard forgives - Oid funcid = PG_GETARG_OID(0); - StringInfoData buf; - StringInfoData dq; - HeapTuple proctup; - Form_pg_proc proc; - Datum tmp; - bool isnull; - const char *prosrc; - const char *name; - const char *nsp; - float4 procost; - int oldlen; - - initStringInfo(&buf); - - /* Look up the function */ - proctup = SearchSysCache1(PROCOID, ObjectIdGetDatum(funcid)); - if (!HeapTupleIsValid(proctup)) - PG_RETURN_NULL(); - - proc = (Form_pg_proc) GETSTRUCT(proctup); - name = NameStr(proc->proname); - - if (proc->proisagg) - ereport(ERROR, - (errcode(ERRCODE_WRONG_OBJECT_TYPE), - errmsg("\"%s\" is an aggregate function", name))); - - /* - * We always qualify the function name, to ensure the right function gets - * replaced. 
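A common use of pg_get_serial_sequence is resynchronizing a serial column's sequence after a bulk load. A sketch, assuming a table my_table with a serial column id (both names are illustrative):

    -- my_table and id are assumed example names
    SELECT pg_get_serial_sequence('my_table', 'id');

    SELECT setval(pg_get_serial_sequence('my_table', 'id')::regclass,
                  (SELECT coalesce(max(id), 1) FROM my_table));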
- */ - nsp = get_namespace_name(proc->pronamespace); - appendStringInfo(&buf, "CREATE OR REPLACE FUNCTION %s(", - quote_qualified_identifier(nsp, name)); - (void) print_function_arguments(&buf, proctup, false, true); - appendStringInfoString(&buf, ")\n RETURNS "); - print_function_rettype(&buf, proctup); - - print_function_trftypes(&buf, proctup); - - appendStringInfo(&buf, "\n LANGUAGE %s\n", - quote_identifier(get_language_name(proc->prolang, false))); - - /* Emit some miscellaneous options on one line */ - oldlen = buf.len; - - if (proc->proiswindow) - appendStringInfoString(&buf, " WINDOW"); - switch (proc->provolatile) - { - case PROVOLATILE_IMMUTABLE: - appendStringInfoString(&buf, " IMMUTABLE"); - break; - case PROVOLATILE_STABLE: - appendStringInfoString(&buf, " STABLE"); - break; - case PROVOLATILE_VOLATILE: - break; - } - - switch (proc->proparallel) - { - case PROPARALLEL_SAFE: - appendStringInfoString(&buf, " PARALLEL SAFE"); - break; - case PROPARALLEL_RESTRICTED: - appendStringInfoString(&buf, " PARALLEL RESTRICTED"); - break; - case PROPARALLEL_UNSAFE: - break; - } - - if (proc->proisstrict) - appendStringInfoString(&buf, " STRICT"); - if (proc->prosecdef) - appendStringInfoString(&buf, " SECURITY DEFINER"); - if (proc->proleakproof) - appendStringInfoString(&buf, " LEAKPROOF"); - - /* This code for the default cost and rows should match functioncmds.c */ - if (proc->prolang == INTERNALlanguageId || - proc->prolang == ClanguageId) - procost = 1; - else - procost = 100; - if (proc->procost != procost) - appendStringInfo(&buf, " COST %g", proc->procost); - - if (proc->prorows > 0 && proc->prorows != 1000) - appendStringInfo(&buf, " ROWS %g", proc->prorows); - - if (oldlen != buf.len) - appendStringInfoChar(&buf, '\n'); - - /* Emit any proconfig options, one per line */ - tmp = SysCacheGetAttr(PROCOID, proctup, Anum_pg_proc_proconfig, &isnull); - if (!isnull) - { - ArrayType *a = DatumGetArrayTypeP(tmp); - int i; - - Assert(ARR_ELEMTYPE(a) == TEXTOID); - Assert(ARR_NDIM(a) == 1); - Assert(ARR_LBOUND(a)[0] == 1); - - for (i = 1; i <= ARR_DIMS(a)[0]; i++) - { - Datum d; - - d = array_ref(a, 1, &i, - -1 /* varlenarray */ , - -1 /* TEXT's typlen */ , - false /* TEXT's typbyval */ , - 'i' /* TEXT's typalign */ , - &isnull); - if (!isnull) - { - char *configitem = TextDatumGetCString(d); - char *pos; - - pos = strchr(configitem, '='); - if (pos == NULL) - continue; - *pos++ = '\0'; - - appendStringInfo(&buf, " SET %s TO ", - quote_identifier(configitem)); - - /* - * Some GUC variable names are 'LIST' type and hence must not - * be quoted. - */ - if (pg_strcasecmp(configitem, "DateStyle") == 0 - || pg_strcasecmp(configitem, "search_path") == 0) - appendStringInfoString(&buf, pos); - else - simple_quote_literal(&buf, pos); - appendStringInfoChar(&buf, '\n'); - } - } - } - - /* And finally the function definition ... */ - appendStringInfoString(&buf, "AS "); - - tmp = SysCacheGetAttr(PROCOID, proctup, Anum_pg_proc_probin, &isnull); - if (!isnull) - { - simple_quote_literal(&buf, TextDatumGetCString(tmp)); - appendStringInfoString(&buf, ", "); /* assume prosrc isn't null */ - } - - tmp = SysCacheGetAttr(PROCOID, proctup, Anum_pg_proc_prosrc, &isnull); - if (isnull) - elog(ERROR, "null prosrc"); - prosrc = TextDatumGetCString(tmp); - - /* - * We always use dollar quoting. Figure out a suitable delimiter. - * - * Since the user is likely to be editing the function body string, we - * shouldn't use a short delimiter that he might easily create a conflict - * with. 
Hence prefer "$function$", but extend if needed. - */ - initStringInfo(&dq); - appendStringInfoString(&dq, "$function"); - while (strstr(prosrc, dq.data) != NULL) - appendStringInfoChar(&dq, 'x'); - appendStringInfoChar(&dq, '$'); - - appendStringInfoString(&buf, dq.data); - appendStringInfoString(&buf, prosrc); - appendStringInfoString(&buf, dq.data); - - appendStringInfoChar(&buf, '\n'); - - ReleaseSysCache(proctup); - - PG_RETURN_TEXT_P(string_to_text(buf.data)); -} - -/* - * pg_get_function_arguments - * Get a nicely-formatted list of arguments for a function. - * This is everything that would go between the parentheses in - * CREATE FUNCTION. - */ -Datum -pg_get_function_arguments(PG_FUNCTION_ARGS) -{ - Oid funcid = PG_GETARG_OID(0); - StringInfoData buf; - HeapTuple proctup; - - proctup = SearchSysCache1(PROCOID, ObjectIdGetDatum(funcid)); - if (!HeapTupleIsValid(proctup)) - PG_RETURN_NULL(); - - initStringInfo(&buf); - - (void) print_function_arguments(&buf, proctup, false, true); - - ReleaseSysCache(proctup); - - PG_RETURN_TEXT_P(string_to_text(buf.data)); -} - -/* - * pg_get_function_identity_arguments - * Get a formatted list of arguments for a function. - * This is everything that would go between the parentheses in - * ALTER FUNCTION, etc. In particular, don't print defaults. - */ -Datum -pg_get_function_identity_arguments(PG_FUNCTION_ARGS) -{ - Oid funcid = PG_GETARG_OID(0); - StringInfoData buf; - HeapTuple proctup; - - proctup = SearchSysCache1(PROCOID, ObjectIdGetDatum(funcid)); - if (!HeapTupleIsValid(proctup)) - PG_RETURN_NULL(); - - initStringInfo(&buf); - - (void) print_function_arguments(&buf, proctup, false, false); - - ReleaseSysCache(proctup); - - PG_RETURN_TEXT_P(string_to_text(buf.data)); -} - -/* - * pg_get_function_result - * Get a nicely-formatted version of the result type of a function. - * This is what would appear after RETURNS in CREATE FUNCTION. - */ -Datum -pg_get_function_result(PG_FUNCTION_ARGS) -{ - Oid funcid = PG_GETARG_OID(0); - StringInfoData buf; - HeapTuple proctup; - - proctup = SearchSysCache1(PROCOID, ObjectIdGetDatum(funcid)); - if (!HeapTupleIsValid(proctup)) - PG_RETURN_NULL(); - - initStringInfo(&buf); - - print_function_rettype(&buf, proctup); - - ReleaseSysCache(proctup); - - PG_RETURN_TEXT_P(string_to_text(buf.data)); -} - -/* - * Guts of pg_get_function_result: append the function's return type - * to the specified buffer. - */ -static void -print_function_rettype(StringInfo buf, HeapTuple proctup) -{ - Form_pg_proc proc = (Form_pg_proc) GETSTRUCT(proctup); - int ntabargs = 0; - StringInfoData rbuf; - - initStringInfo(&rbuf); - - if (proc->proretset) - { - /* It might be a table function; try to print the arguments */ - appendStringInfoString(&rbuf, "TABLE("); - ntabargs = print_function_arguments(&rbuf, proctup, true, false); - if (ntabargs > 0) - appendStringInfoChar(&rbuf, ')'); - else - resetStringInfo(&rbuf); - } - - if (ntabargs == 0) - { - /* Not a table function, so do the normal thing */ - if (proc->proretset) - appendStringInfoString(&rbuf, "SETOF "); - appendStringInfoString(&rbuf, format_type_be(proc->prorettype)); - } - - appendStringInfoString(buf, rbuf.data); -} - -/* - * Common code for pg_get_function_arguments and pg_get_function_result: - * append the desired subset of arguments to buf. We print only TABLE - * arguments when print_table_args is true, and all the others when it's false. - * We print argument defaults only if print_defaults is true. - * Function return value is the number of arguments printed. 
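The generated CREATE OR REPLACE FUNCTION text can be fetched directly; my_func(integer) below is a placeholder signature, not a function shipped by this patch:

    -- my_func(integer) is an assumed example function
    SELECT pg_get_functiondef('my_func(integer)'::regprocedure);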
- */ -static int -print_function_arguments(StringInfo buf, HeapTuple proctup, - bool print_table_args, bool print_defaults) -{// #lizard forgives - Form_pg_proc proc = (Form_pg_proc) GETSTRUCT(proctup); - int numargs; - Oid *argtypes; - char **argnames; - char *argmodes; - int insertorderbyat = -1; - int argsprinted; - int inputargno; - int nlackdefaults; - ListCell *nextargdefault = NULL; - int i; - - numargs = get_func_arg_info(proctup, - &argtypes, &argnames, &argmodes); - - nlackdefaults = numargs; - if (print_defaults && proc->pronargdefaults > 0) - { - Datum proargdefaults; - bool isnull; - - proargdefaults = SysCacheGetAttr(PROCOID, proctup, - Anum_pg_proc_proargdefaults, - &isnull); - if (!isnull) - { - char *str; - List *argdefaults; - - str = TextDatumGetCString(proargdefaults); - argdefaults = castNode(List, stringToNode(str)); - pfree(str); - nextargdefault = list_head(argdefaults); - /* nlackdefaults counts only *input* arguments lacking defaults */ - nlackdefaults = proc->pronargs - list_length(argdefaults); - } - } - - /* Check for special treatment of ordered-set aggregates */ - if (proc->proisagg) - { - HeapTuple aggtup; - Form_pg_aggregate agg; - - aggtup = SearchSysCache1(AGGFNOID, - ObjectIdGetDatum(HeapTupleGetOid(proctup))); - if (!HeapTupleIsValid(aggtup)) - elog(ERROR, "cache lookup failed for aggregate %u", - HeapTupleGetOid(proctup)); - agg = (Form_pg_aggregate) GETSTRUCT(aggtup); - if (AGGKIND_IS_ORDERED_SET(agg->aggkind)) - insertorderbyat = agg->aggnumdirectargs; - ReleaseSysCache(aggtup); - } - - argsprinted = 0; - inputargno = 0; - for (i = 0; i < numargs; i++) - { - Oid argtype = argtypes[i]; - char *argname = argnames ? argnames[i] : NULL; - char argmode = argmodes ? argmodes[i] : PROARGMODE_IN; - const char *modename; - bool isinput; - - switch (argmode) - { - case PROARGMODE_IN: - modename = ""; - isinput = true; - break; - case PROARGMODE_INOUT: - modename = "INOUT "; - isinput = true; - break; - case PROARGMODE_OUT: - modename = "OUT "; - isinput = false; - break; - case PROARGMODE_VARIADIC: - modename = "VARIADIC "; - isinput = true; - break; - case PROARGMODE_TABLE: - modename = ""; - isinput = false; - break; - default: - elog(ERROR, "invalid parameter mode '%c'", argmode); - modename = NULL; /* keep compiler quiet */ - isinput = false; - break; - } - if (isinput) - inputargno++; /* this is a 1-based counter */ - - if (print_table_args != (argmode == PROARGMODE_TABLE)) - continue; - - if (argsprinted == insertorderbyat) - { - if (argsprinted) - appendStringInfoChar(buf, ' '); - appendStringInfoString(buf, "ORDER BY "); - } - else if (argsprinted) - appendStringInfoString(buf, ", "); - - appendStringInfoString(buf, modename); - if (argname && argname[0]) - appendStringInfo(buf, "%s ", quote_identifier(argname)); - appendStringInfoString(buf, format_type_be(argtype)); - if (print_defaults && isinput && inputargno > nlackdefaults) - { - Node *expr; - - Assert(nextargdefault != NULL); - expr = (Node *) lfirst(nextargdefault); - nextargdefault = lnext(nextargdefault); - - appendStringInfo(buf, " DEFAULT %s", - deparse_expression(expr, NIL, false, false)); - } - argsprinted++; - - /* nasty hack: print the last arg twice for variadic ordered-set agg */ - if (argsprinted == insertorderbyat && i == numargs - 1) - { - i--; - /* aggs shouldn't have defaults anyway, but just to be sure ... 
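These helpers back the three SQL-level argument/result functions; for an ordered-set aggregate the argument list is expected to include the ORDER BY keyword selected above. An illustrative query, with my_func assumed to exist:

    -- my_func is an assumed example name
    SELECT p.proname,
           pg_get_function_arguments(p.oid)          AS create_args,
           pg_get_function_identity_arguments(p.oid) AS identity_args,
           pg_get_function_result(p.oid)             AS result_type
      FROM pg_proc p
     WHERE p.proname = 'my_func';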
*/ - print_defaults = false; - } - } - - return argsprinted; -} - -static bool -is_input_argument(int nth, const char *argmodes) -{ - return (!argmodes - || argmodes[nth] == PROARGMODE_IN - || argmodes[nth] == PROARGMODE_INOUT - || argmodes[nth] == PROARGMODE_VARIADIC); -} - -/* - * Append used transformed types to specified buffer - */ -static void -print_function_trftypes(StringInfo buf, HeapTuple proctup) -{ - Oid *trftypes; - int ntypes; - - ntypes = get_func_trftypes(proctup, &trftypes); - if (ntypes > 0) - { - int i; - - appendStringInfoString(buf, "\n TRANSFORM "); - for (i = 0; i < ntypes; i++) - { - if (i != 0) - appendStringInfoString(buf, ", "); - appendStringInfo(buf, "FOR TYPE %s", format_type_be(trftypes[i])); - } - } -} - -/* - * Get textual representation of a function argument's default value. The - * second argument of this function is the argument number among all arguments - * (i.e. proallargtypes, *not* proargtypes), starting with 1, because that's - * how information_schema.sql uses it. - */ -Datum -pg_get_function_arg_default(PG_FUNCTION_ARGS) -{// #lizard forgives - Oid funcid = PG_GETARG_OID(0); - int32 nth_arg = PG_GETARG_INT32(1); - HeapTuple proctup; - Form_pg_proc proc; - int numargs; - Oid *argtypes; - char **argnames; - char *argmodes; - int i; - List *argdefaults; - Node *node; - char *str; - int nth_inputarg; - Datum proargdefaults; - bool isnull; - int nth_default; - - proctup = SearchSysCache1(PROCOID, ObjectIdGetDatum(funcid)); - if (!HeapTupleIsValid(proctup)) - PG_RETURN_NULL(); - - numargs = get_func_arg_info(proctup, &argtypes, &argnames, &argmodes); - if (nth_arg < 1 || nth_arg > numargs || !is_input_argument(nth_arg - 1, argmodes)) - { - ReleaseSysCache(proctup); - PG_RETURN_NULL(); - } - - nth_inputarg = 0; - for (i = 0; i < nth_arg; i++) - if (is_input_argument(i, argmodes)) - nth_inputarg++; - - proargdefaults = SysCacheGetAttr(PROCOID, proctup, - Anum_pg_proc_proargdefaults, - &isnull); - if (isnull) - { - ReleaseSysCache(proctup); - PG_RETURN_NULL(); - } - - str = TextDatumGetCString(proargdefaults); - argdefaults = castNode(List, stringToNode(str)); - pfree(str); - - proc = (Form_pg_proc) GETSTRUCT(proctup); - - /* - * Calculate index into proargdefaults: proargdefaults corresponds to the - * last N input arguments, where N = pronargdefaults. - */ - nth_default = nth_inputarg - 1 - (proc->pronargs - proc->pronargdefaults); - - if (nth_default < 0 || nth_default >= list_length(argdefaults)) - { - ReleaseSysCache(proctup); - PG_RETURN_NULL(); - } - node = list_nth(argdefaults, nth_default); - str = deparse_expression(node, NIL, false, false); - - ReleaseSysCache(proctup); - - PG_RETURN_TEXT_P(string_to_text(str)); -} - - -/* - * deparse_expression - General utility for deparsing expressions - * - * calls deparse_expression_pretty with all prettyPrinting disabled - */ -char * -deparse_expression(Node *expr, List *dpcontext, - bool forceprefix, bool showimplicit) -{ - return deparse_expression_pretty(expr, dpcontext, forceprefix, - showimplicit, 0, 0); -} - -/* ---------- - * deparse_expression_pretty - General utility for deparsing expressions - * - * expr is the node tree to be deparsed. It must be a transformed expression - * tree (ie, not the raw output of gram.y). - * - * dpcontext is a list of deparse_namespace nodes representing the context - * for interpreting Vars in the node tree. It can be NIL if no Vars are - * expected. - * - * forceprefix is TRUE to force all Vars to be prefixed with their table names. 
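The position passed to pg_get_function_arg_default counts all arguments (proallargtypes order), 1-based, which is how information_schema consumes it. A hedged example against an assumed two-argument function:

    -- my_func(integer, integer) is an assumed example signature
    SELECT pg_get_function_arg_default('my_func(integer, integer)'::regprocedure, 2);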
- * - * showimplicit is TRUE to force all implicit casts to be shown explicitly. - * - * Tries to pretty up the output according to prettyFlags and startIndent. - * - * The result is a palloc'd string. - * ---------- - */ -static char * -deparse_expression_pretty(Node *expr, List *dpcontext, - bool forceprefix, bool showimplicit, - int prettyFlags, int startIndent) -{ - StringInfoData buf; - deparse_context context; - - initStringInfo(&buf); - context.buf = &buf; - context.namespaces = dpcontext; - context.windowClause = NIL; - context.windowTList = NIL; - context.varprefix = forceprefix; - context.prettyFlags = prettyFlags; - context.wrapColumn = WRAP_COLUMN_DEFAULT; - context.indentLevel = startIndent; - context.special_exprkind = EXPR_KIND_NONE; - - get_rule_expr(expr, &context, showimplicit); - - return buf.data; -} - -/* ---------- - * deparse_context_for - Build deparse context for a single relation - * - * Given the reference name (alias) and OID of a relation, build deparsing - * context for an expression referencing only that relation (as varno 1, - * varlevelsup 0). This is sufficient for many uses of deparse_expression. - * ---------- - */ -List * -deparse_context_for(const char *aliasname, Oid relid) -{ - deparse_namespace *dpns; - RangeTblEntry *rte; - - dpns = (deparse_namespace *) palloc0(sizeof(deparse_namespace)); - - /* Build a minimal RTE for the rel */ - rte = makeNode(RangeTblEntry); - rte->rtekind = RTE_RELATION; - rte->relid = relid; - rte->relkind = RELKIND_RELATION; /* no need for exactness here */ - rte->alias = makeAlias(aliasname, NIL); - rte->eref = rte->alias; - rte->lateral = false; - rte->inh = false; - rte->inFromCl = true; - - /* Build one-element rtable */ - dpns->rtable = list_make1(rte); - dpns->ctes = NIL; - set_rtable_names(dpns, NIL, NULL); - set_simple_column_names(dpns); - - /* Return a one-deep namespace stack */ - return list_make1(dpns); -} - -/* - * deparse_context_for_plan_rtable - Build deparse context for a plan's rtable - * - * When deparsing an expression in a Plan tree, we use the plan's rangetable - * to resolve names of simple Vars. The initialization of column names for - * this is rather expensive if the rangetable is large, and it'll be the same - * for every expression in the Plan tree; so we do it just once and re-use - * the result of this function for each expression. (Note that the result - * is not usable until set_deparse_context_planstate() is applied to it.) - * - * In addition to the plan's rangetable list, pass the per-RTE alias names - * assigned by a previous call to select_rtable_names_for_explain. - */ -List * -deparse_context_for_plan_rtable(List *rtable, List *rtable_names) -{ - deparse_namespace *dpns; - - dpns = (deparse_namespace *) palloc0(sizeof(deparse_namespace)); - - /* Initialize fields that stay the same across the whole plan tree */ - dpns->rtable = rtable; - dpns->rtable_names = rtable_names; - dpns->ctes = NIL; - - /* - * Set up column name aliases. We will get rather bogus results for join - * RTEs, but that doesn't matter because plan trees don't contain any join - * alias Vars. - */ - set_simple_column_names(dpns); - - /* Return a one-deep namespace stack */ - return list_make1(dpns); -} - -/* - * set_deparse_context_planstate - Specify Plan node containing expression - * - * When deparsing an expression in a Plan tree, we might have to resolve - * OUTER_VAR, INNER_VAR, or INDEX_VAR references. To do this, the caller must - * provide the parent PlanState node. 
Then OUTER_VAR and INNER_VAR references - * can be resolved by drilling down into the left and right child plans. - * Similarly, INDEX_VAR references can be resolved by reference to the - * indextlist given in a parent IndexOnlyScan node, or to the scan tlist in - * ForeignScan and CustomScan nodes. (Note that we don't currently support - * deparsing of indexquals in regular IndexScan or BitmapIndexScan nodes; - * for those, we can only deparse the indexqualorig fields, which won't - * contain INDEX_VAR Vars.) - * - * Note: planstate really ought to be declared as "PlanState *", but we use - * "Node *" to avoid having to include execnodes.h in ruleutils.h. - * - * The ancestors list is a list of the PlanState's parent PlanStates, the - * most-closely-nested first. This is needed to resolve PARAM_EXEC Params. - * Note we assume that all the PlanStates share the same rtable. - * - * Once this function has been called, deparse_expression() can be called on - * subsidiary expression(s) of the specified PlanState node. To deparse - * expressions of a different Plan node in the same Plan tree, re-call this - * function to identify the new parent Plan node. - * - * The result is the same List passed in; this is a notational convenience. - */ -List * -set_deparse_context_planstate(List *dpcontext, - Node *planstate, List *ancestors) -{ - deparse_namespace *dpns; - - - /* Should always have one-entry namespace list for Plan deparsing */ - Assert(list_length(dpcontext) == 1); - dpns = (deparse_namespace *) linitial(dpcontext); - - /* Set our attention on the specific plan node passed in */ - set_deparse_planstate(dpns, (PlanState *) planstate); - dpns->ancestors = ancestors; - - return dpcontext; -} - -/* - * select_rtable_names_for_explain - Select RTE aliases for EXPLAIN - * - * Determine the relation aliases we'll use during an EXPLAIN operation. - * This is just a frontend to set_rtable_names. We have to expose the aliases - * to EXPLAIN because EXPLAIN needs to know the right alias names to print. - */ -List * -select_rtable_names_for_explain(List *rtable, Bitmapset *rels_used) -{ - deparse_namespace dpns; - - memset(&dpns, 0, sizeof(dpns)); - dpns.rtable = rtable; - dpns.ctes = NIL; - set_rtable_names(&dpns, NIL, rels_used); - /* We needn't bother computing column aliases yet */ - - return dpns.rtable_names; -} - -#ifdef PGXC -/* - * This is a special case deparse context to be used at the planning time to - * generate query strings and expressions for remote shipping. - * - * XXX We should be careful while using this since the support is quite - * limited. The only supported use case at this point is for remote join - * reduction and some simple plan trees rooted by Agg node having a single - * RemoteQuery node as leftree. - */ -List * -deparse_context_for_plan(Node *plan, List *ancestors, - List *rtable) -{ - deparse_namespace *dpns; - - dpns = (deparse_namespace *) palloc0(sizeof(deparse_namespace)); - - /* Initialize fields that stay the same across the whole plan tree */ - dpns->rtable = rtable; - dpns->ctes = NIL; - - /* Set our attention on the specific plan node passed in */ - set_deparse_plan(dpns, (Plan *) plan); - dpns->ancestors = ancestors; - - /* Return a one-deep namespace stack */ - return list_make1(dpns); -} - -/* - * Set deparse context for Plan. Only those plan nodes which are immediate (or - * through simple nodes) parents of RemoteQuery nodes are supported right now. 
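EXPLAIN is the main consumer of these plan-deparsing entry points, so any verbose plan exercises them. Sketch only; my_table and column a are assumptions:

    -- my_table is an assumed example table with a column a
    EXPLAIN (VERBOSE, COSTS OFF)
    SELECT t1.a
      FROM my_table t1
      JOIN my_table t2 USING (a);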
- * - * This is a kind of work-around since the new deparse interface (since 9.1) - * expects a PlanState node. But planstates are instantiated only at execution - * time when InitPlan is called. But we are required to deparse the query - * during planning time, so we hand-cook these dummy PlanState nodes instead of - * init-ing the plan. Another approach could have been to delay the query - * generation to the execution time, but we are not yet sure if this can be - * safely done, especially for remote join reduction. - */ -static void -set_deparse_plan(deparse_namespace *dpns, Plan *plan) -{// #lizard forgives - - if (IsA(plan, NestLoop)) - { - NestLoop *nestloop = (NestLoop *) plan; - - dpns->planstate = (PlanState *) makeNode(NestLoopState); - dpns->planstate->plan = plan; - - dpns->outer_planstate = (PlanState *) makeNode(PlanState); - dpns->outer_planstate->plan = nestloop->join.plan.lefttree; - - dpns->inner_planstate = (PlanState *) makeNode(PlanState); - dpns->inner_planstate->plan = nestloop->join.plan.righttree; - } - else if (IsA(plan, RemoteQuery)) - { - dpns->planstate = (PlanState *) makeNode(PlanState); - dpns->planstate->plan = plan; - } - else if (IsA(plan, Agg) || IsA(plan, Group)) - { - /* - * We expect plan tree as Group/Agg->Sort->Result->Material->RemoteQuery, - * Result, Material nodes are optional. Sort is compulsory for Group but not - * for Agg. - * anything else is not handled right now. - */ - Plan *temp_plan = plan->lefttree; - Plan *remote_scan = NULL; - - if (temp_plan && IsA(temp_plan, Sort)) - temp_plan = temp_plan->lefttree; - if (temp_plan && IsA(temp_plan, Result)) - temp_plan = temp_plan->lefttree; - if (temp_plan && IsA(temp_plan, Material)) - temp_plan = temp_plan->lefttree; - if (temp_plan && IsA(temp_plan, RemoteQuery)) - remote_scan = temp_plan; - - if (!remote_scan) - elog(ERROR, "Deparse of this query at planning is not supported yet"); - - dpns->planstate = (PlanState *) makeNode(PlanState); - dpns->planstate->plan = plan; - } - else - elog(ERROR, "Deparse of this query at planning not supported yet"); -} - -#endif -/* - * set_rtable_names: select RTE aliases to be used in printing a query - * - * We fill in dpns->rtable_names with a list of names that is one-for-one with - * the already-filled dpns->rtable list. Each RTE name is unique among those - * in the new namespace plus any ancestor namespaces listed in - * parent_namespaces. - * - * If rels_used isn't NULL, only RTE indexes listed in it are given aliases. - * - * Note that this function is only concerned with relation names, not column - * names. - */ -static void -set_rtable_names(deparse_namespace *dpns, List *parent_namespaces, - Bitmapset *rels_used) -{// #lizard forgives - HASHCTL hash_ctl; - HTAB *names_hash; - NameHashEntry *hentry; - bool found; - int rtindex; - ListCell *lc; - - dpns->rtable_names = NIL; - /* nothing more to do if empty rtable */ - if (dpns->rtable == NIL) - return; - - /* - * We use a hash table to hold known names, so that this process is O(N) - * not O(N^2) for N names. 
- */ - MemSet(&hash_ctl, 0, sizeof(hash_ctl)); - hash_ctl.keysize = NAMEDATALEN; - hash_ctl.entrysize = sizeof(NameHashEntry); - hash_ctl.hcxt = CurrentMemoryContext; - names_hash = hash_create("set_rtable_names names", - list_length(dpns->rtable), - &hash_ctl, - HASH_ELEM | HASH_CONTEXT); - /* Preload the hash table with names appearing in parent_namespaces */ - foreach(lc, parent_namespaces) - { - deparse_namespace *olddpns = (deparse_namespace *) lfirst(lc); - ListCell *lc2; - - foreach(lc2, olddpns->rtable_names) - { - char *oldname = (char *) lfirst(lc2); - - if (oldname == NULL) - continue; - hentry = (NameHashEntry *) hash_search(names_hash, - oldname, - HASH_ENTER, - &found); - /* we do not complain about duplicate names in parent namespaces */ - hentry->counter = 0; - } - } - - /* Now we can scan the rtable */ - rtindex = 1; - foreach(lc, dpns->rtable) - { - RangeTblEntry *rte = (RangeTblEntry *) lfirst(lc); - char *refname; - - /* Just in case this takes an unreasonable amount of time ... */ - CHECK_FOR_INTERRUPTS(); - - if (rels_used && !bms_is_member(rtindex, rels_used)) - { - /* Ignore unreferenced RTE */ - refname = NULL; - } - else if (rte->alias) - { - /* If RTE has a user-defined alias, prefer that */ - refname = rte->alias->aliasname; - } - else if (rte->rtekind == RTE_RELATION) - { - /* Use the current actual name of the relation */ - refname = get_rel_name(rte->relid); - } - else if (rte->rtekind == RTE_JOIN) - { - /* Unnamed join has no refname */ - refname = NULL; - } - else - { - /* Otherwise use whatever the parser assigned */ - refname = rte->eref->aliasname; - } - - /* - * If the selected name isn't unique, append digits to make it so, and - * make a new hash entry for it once we've got a unique name. For a - * very long input name, we might have to truncate to stay within - * NAMEDATALEN. - */ - if (refname) - { - hentry = (NameHashEntry *) hash_search(names_hash, - refname, - HASH_ENTER, - &found); - if (found) - { - /* Name already in use, must choose a new one */ - int refnamelen = strlen(refname); - char *modname = (char *) palloc(refnamelen + 16); - NameHashEntry *hentry2; - - do - { - hentry->counter++; - for (;;) - { - /* - * We avoid using %.*s here because it can misbehave - * if the data is not valid in what libc thinks is the - * prevailing encoding. - */ - memcpy(modname, refname, refnamelen); - sprintf(modname + refnamelen, "_%d", hentry->counter); - if (strlen(modname) < NAMEDATALEN) - break; - /* drop chars from refname to keep all the digits */ - refnamelen = pg_mbcliplen(refname, refnamelen, - refnamelen - 1); - } - hentry2 = (NameHashEntry *) hash_search(names_hash, - modname, - HASH_ENTER, - &found); - } while (found); - hentry2->counter = 0; /* init new hash entry */ - refname = modname; - } - else - { - /* Name not previously used, need only initialize hentry */ - hentry->counter = 0; - } - } - - dpns->rtable_names = lappend(dpns->rtable_names, refname); - rtindex++; - } - - hash_destroy(names_hash); -} - -/* - * set_deparse_for_query: set up deparse_namespace for deparsing a Query tree - * - * For convenience, this is defined to initialize the deparse_namespace struct - * from scratch. 
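When two range-table entries would otherwise print under the same name, this uniquification appends a numeric suffix, which can be observed in EXPLAIN VERBOSE output when the same relation is referenced twice without user aliases. Behavior sketched, not exact output; my_table is an assumed name:

    -- my_table is an assumed example table with a column a
    EXPLAIN (VERBOSE, COSTS OFF)
    SELECT a
      FROM my_table
     WHERE a IN (SELECT a FROM my_table);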
- */ -static void -set_deparse_for_query(deparse_namespace *dpns, Query *query, - List *parent_namespaces) -{ - ListCell *lc; - ListCell *lc2; - - /* Initialize *dpns and fill rtable/ctes links */ - memset(dpns, 0, sizeof(deparse_namespace)); - dpns->rtable = query->rtable; - dpns->ctes = query->cteList; - - /* Assign a unique relation alias to each RTE */ - set_rtable_names(dpns, parent_namespaces, NULL); - - /* Initialize dpns->rtable_columns to contain zeroed structs */ - dpns->rtable_columns = NIL; - while (list_length(dpns->rtable_columns) < list_length(dpns->rtable)) - dpns->rtable_columns = lappend(dpns->rtable_columns, - palloc0(sizeof(deparse_columns))); - - /* If it's a utility query, it won't have a jointree */ - if (query->jointree) - { - /* Detect whether global uniqueness of USING names is needed */ - dpns->unique_using = - has_dangerous_join_using(dpns, (Node *) query->jointree); - - /* - * Select names for columns merged by USING, via a recursive pass over - * the query jointree. - */ - set_using_names(dpns, (Node *) query->jointree, NIL); - } - - /* - * Now assign remaining column aliases for each RTE. We do this in a - * linear scan of the rtable, so as to process RTEs whether or not they - * are in the jointree (we mustn't miss NEW.*, INSERT target relations, - * etc). JOIN RTEs must be processed after their children, but this is - * okay because they appear later in the rtable list than their children - * (cf Asserts in identify_join_columns()). - */ - forboth(lc, dpns->rtable, lc2, dpns->rtable_columns) - { - RangeTblEntry *rte = (RangeTblEntry *) lfirst(lc); - deparse_columns *colinfo = (deparse_columns *) lfirst(lc2); - - if (rte->rtekind == RTE_JOIN) - set_join_column_names(dpns, rte, colinfo); - else - set_relation_column_names(dpns, rte, colinfo); - } -} - -/* - * set_simple_column_names: fill in column aliases for non-query situations - * - * This handles EXPLAIN and cases where we only have relation RTEs. Without - * a join tree, we can't do anything smart about join RTEs, but we don't - * need to (note that EXPLAIN should never see join alias Vars anyway). - * If we do hit a join RTE we'll just process it like a non-table base RTE. - */ -static void -set_simple_column_names(deparse_namespace *dpns) -{ - ListCell *lc; - ListCell *lc2; - - /* Initialize dpns->rtable_columns to contain zeroed structs */ - dpns->rtable_columns = NIL; - while (list_length(dpns->rtable_columns) < list_length(dpns->rtable)) - dpns->rtable_columns = lappend(dpns->rtable_columns, - palloc0(sizeof(deparse_columns))); - - /* Assign unique column aliases within each RTE */ - forboth(lc, dpns->rtable, lc2, dpns->rtable_columns) - { - RangeTblEntry *rte = (RangeTblEntry *) lfirst(lc); - deparse_columns *colinfo = (deparse_columns *) lfirst(lc2); - - set_relation_column_names(dpns, rte, colinfo); - } -} - -/* - * has_dangerous_join_using: search jointree for unnamed JOIN USING - * - * Merged columns of a JOIN USING may act differently from either of the input - * columns, either because they are merged with COALESCE (in a FULL JOIN) or - * because an implicit coercion of the underlying input column is required. - * In such a case the column must be referenced as a column of the JOIN not as - * a column of either input. And this is problematic if the join is unnamed - * (alias-less): we cannot qualify the column's name with an RTE name, since - * there is none. (Forcibly assigning an alias to the join is not a solution, - * since that will prevent legal references to tables below the join.) 
- * To ensure that every column in the query is unambiguously referenceable, - * we must assign such merged columns names that are globally unique across - * the whole query, aliasing other columns out of the way as necessary. - * - * Because the ensuing re-aliasing is fairly damaging to the readability of - * the query, we don't do this unless we have to. So, we must pre-scan - * the join tree to see if we have to, before starting set_using_names(). - */ -static bool -has_dangerous_join_using(deparse_namespace *dpns, Node *jtnode) -{// #lizard forgives - if (IsA(jtnode, RangeTblRef)) - { - /* nothing to do here */ - } - else if (IsA(jtnode, FromExpr)) - { - FromExpr *f = (FromExpr *) jtnode; - ListCell *lc; - - foreach(lc, f->fromlist) - { - if (has_dangerous_join_using(dpns, (Node *) lfirst(lc))) - return true; - } - } - else if (IsA(jtnode, JoinExpr)) - { - JoinExpr *j = (JoinExpr *) jtnode; - - /* Is it an unnamed JOIN with USING? */ - if (j->alias == NULL && j->usingClause) - { - /* - * Yes, so check each join alias var to see if any of them are not - * simple references to underlying columns. If so, we have a - * dangerous situation and must pick unique aliases. - */ - RangeTblEntry *jrte = rt_fetch(j->rtindex, dpns->rtable); - ListCell *lc; - - foreach(lc, jrte->joinaliasvars) - { - Var *aliasvar = (Var *) lfirst(lc); - - if (aliasvar != NULL && !IsA(aliasvar, Var)) - return true; - } - } - - /* Nope, but inspect children */ - if (has_dangerous_join_using(dpns, j->larg)) - return true; - if (has_dangerous_join_using(dpns, j->rarg)) - return true; - } - else - elog(ERROR, "unrecognized node type: %d", - (int) nodeTag(jtnode)); - return false; -} - -/* - * set_using_names: select column aliases to be used for merged USING columns - * - * We do this during a recursive descent of the query jointree. - * dpns->unique_using must already be set to determine the global strategy. - * - * Column alias info is saved in the dpns->rtable_columns list, which is - * assumed to be filled with pre-zeroed deparse_columns structs. - * - * parentUsing is a list of all USING aliases assigned in parent joins of - * the current jointree node. (The passed-in list must not be modified.) - */ -static void -set_using_names(deparse_namespace *dpns, Node *jtnode, List *parentUsing) -{// #lizard forgives - if (IsA(jtnode, RangeTblRef)) - { - /* nothing to do now */ - } - else if (IsA(jtnode, FromExpr)) - { - FromExpr *f = (FromExpr *) jtnode; - ListCell *lc; - - foreach(lc, f->fromlist) - set_using_names(dpns, (Node *) lfirst(lc), parentUsing); - } - else if (IsA(jtnode, JoinExpr)) - { - JoinExpr *j = (JoinExpr *) jtnode; - RangeTblEntry *rte = rt_fetch(j->rtindex, dpns->rtable); - deparse_columns *colinfo = deparse_columns_fetch(j->rtindex, dpns); - int *leftattnos; - int *rightattnos; - deparse_columns *leftcolinfo; - deparse_columns *rightcolinfo; - int i; - ListCell *lc; - - /* Get info about the shape of the join */ - identify_join_columns(j, rte, colinfo); - leftattnos = colinfo->leftattnos; - rightattnos = colinfo->rightattnos; - - /* Look up the not-yet-filled-in child deparse_columns structs */ - leftcolinfo = deparse_columns_fetch(colinfo->leftrti, dpns); - rightcolinfo = deparse_columns_fetch(colinfo->rightrti, dpns); - - /* - * If this join is unnamed, then we cannot substitute new aliases at - * this level, so any name requirements pushed down to here must be - * pushed down again to the children. 
- */ - if (rte->alias == NULL) - { - for (i = 0; i < colinfo->num_cols; i++) - { - char *colname = colinfo->colnames[i]; - - if (colname == NULL) - continue; - - /* Push down to left column, unless it's a system column */ - if (leftattnos[i] > 0) - { - expand_colnames_array_to(leftcolinfo, leftattnos[i]); - leftcolinfo->colnames[leftattnos[i] - 1] = colname; - } - - /* Same on the righthand side */ - if (rightattnos[i] > 0) - { - expand_colnames_array_to(rightcolinfo, rightattnos[i]); - rightcolinfo->colnames[rightattnos[i] - 1] = colname; - } - } - } - - /* - * If there's a USING clause, select the USING column names and push - * those names down to the children. We have two strategies: - * - * If dpns->unique_using is TRUE, we force all USING names to be - * unique across the whole query level. In principle we'd only need - * the names of dangerous USING columns to be globally unique, but to - * safely assign all USING names in a single pass, we have to enforce - * the same uniqueness rule for all of them. However, if a USING - * column's name has been pushed down from the parent, we should use - * it as-is rather than making a uniqueness adjustment. This is - * necessary when we're at an unnamed join, and it creates no risk of - * ambiguity. Also, if there's a user-written output alias for a - * merged column, we prefer to use that rather than the input name; - * this simplifies the logic and seems likely to lead to less aliasing - * overall. - * - * If dpns->unique_using is FALSE, we only need USING names to be - * unique within their own join RTE. We still need to honor - * pushed-down names, though. - * - * Though significantly different in results, these two strategies are - * implemented by the same code, with only the difference of whether - * to put assigned names into dpns->using_names. 
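The USING column names chosen here become user visible when a stored rule or view is later deparsed. A hedged illustration, where my_t1, my_t2 and my_view are assumed names:

    -- my_t1, my_t2 and my_view are assumed example objects sharing a column a
    CREATE VIEW my_view AS
      SELECT *
        FROM my_t1
        JOIN my_t2 USING (a);

    SELECT pg_get_viewdef('my_view'::regclass, true);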
- */ - if (j->usingClause) - { - /* Copy the input parentUsing list so we don't modify it */ - parentUsing = list_copy(parentUsing); - - /* USING names must correspond to the first join output columns */ - expand_colnames_array_to(colinfo, list_length(j->usingClause)); - i = 0; - foreach(lc, j->usingClause) - { - char *colname = strVal(lfirst(lc)); - - /* Assert it's a merged column */ - Assert(leftattnos[i] != 0 && rightattnos[i] != 0); - - /* Adopt passed-down name if any, else select unique name */ - if (colinfo->colnames[i] != NULL) - colname = colinfo->colnames[i]; - else - { - /* Prefer user-written output alias if any */ - if (rte->alias && i < list_length(rte->alias->colnames)) - colname = strVal(list_nth(rte->alias->colnames, i)); - /* Make it appropriately unique */ - colname = make_colname_unique(colname, dpns, colinfo); - if (dpns->unique_using) - dpns->using_names = lappend(dpns->using_names, - colname); - /* Save it as output column name, too */ - colinfo->colnames[i] = colname; - } - - /* Remember selected names for use later */ - colinfo->usingNames = lappend(colinfo->usingNames, colname); - parentUsing = lappend(parentUsing, colname); - - /* Push down to left column, unless it's a system column */ - if (leftattnos[i] > 0) - { - expand_colnames_array_to(leftcolinfo, leftattnos[i]); - leftcolinfo->colnames[leftattnos[i] - 1] = colname; - } - - /* Same on the righthand side */ - if (rightattnos[i] > 0) - { - expand_colnames_array_to(rightcolinfo, rightattnos[i]); - rightcolinfo->colnames[rightattnos[i] - 1] = colname; - } - - i++; - } - } - - /* Mark child deparse_columns structs with correct parentUsing info */ - leftcolinfo->parentUsing = parentUsing; - rightcolinfo->parentUsing = parentUsing; - - /* Now recursively assign USING column names in children */ - set_using_names(dpns, j->larg, parentUsing); - set_using_names(dpns, j->rarg, parentUsing); - } - else - elog(ERROR, "unrecognized node type: %d", - (int) nodeTag(jtnode)); -} - -/* - * set_relation_column_names: select column aliases for a non-join RTE - * - * Column alias info is saved in *colinfo, which is assumed to be pre-zeroed. - * If any colnames entries are already filled in, those override local - * choices. - */ -static void -set_relation_column_names(deparse_namespace *dpns, RangeTblEntry *rte, - deparse_columns *colinfo) -{// #lizard forgives - int ncolumns; - char **real_colnames; - bool changed_any; - int noldcolumns; - int i; - int j; - - /* - * Extract the RTE's "real" column names. This is comparable to - * get_rte_attribute_name, except that it's important to disregard dropped - * columns. We put NULL into the array for a dropped column. 
- */ - if (rte->rtekind == RTE_RELATION) - { - /* Relation --- look to the system catalogs for up-to-date info */ - Relation rel; - TupleDesc tupdesc; - - rel = relation_open(rte->relid, AccessShareLock); - tupdesc = RelationGetDescr(rel); - - ncolumns = tupdesc->natts; - real_colnames = (char **) palloc(ncolumns * sizeof(char *)); - - for (i = 0; i < ncolumns; i++) - { - if (tupdesc->attrs[i]->attisdropped) - real_colnames[i] = NULL; - else - real_colnames[i] = pstrdup(NameStr(tupdesc->attrs[i]->attname)); - } - relation_close(rel, AccessShareLock); - } - else - { - /* Otherwise use the column names from eref */ - ListCell *lc; - - ncolumns = list_length(rte->eref->colnames); - real_colnames = (char **) palloc(ncolumns * sizeof(char *)); - - i = 0; - foreach(lc, rte->eref->colnames) - { - /* - * If the column name shown in eref is an empty string, then it's - * a column that was dropped at the time of parsing the query, so - * treat it as dropped. - */ - char *cname = strVal(lfirst(lc)); - - if (cname[0] == '\0') - cname = NULL; - real_colnames[i] = cname; - i++; - } - } - - /* - * Ensure colinfo->colnames has a slot for each column. (It could be long - * enough already, if we pushed down a name for the last column.) Note: - * it's possible that there are now more columns than there were when the - * query was parsed, ie colnames could be longer than rte->eref->colnames. - * We must assign unique aliases to the new columns too, else there could - * be unresolved conflicts when the view/rule is reloaded. - */ - expand_colnames_array_to(colinfo, ncolumns); - Assert(colinfo->num_cols == ncolumns); - - /* - * Make sufficiently large new_colnames and is_new_col arrays, too. - * - * Note: because we leave colinfo->num_new_cols zero until after the loop, - * colname_is_unique will not consult that array, which is fine because it - * would only be duplicate effort. - */ - colinfo->new_colnames = (char **) palloc(ncolumns * sizeof(char *)); - colinfo->is_new_col = (bool *) palloc(ncolumns * sizeof(bool)); - - /* - * Scan the columns, select a unique alias for each one, and store it in - * colinfo->colnames and colinfo->new_colnames. The former array has NULL - * entries for dropped columns, the latter omits them. Also mark - * new_colnames entries as to whether they are new since parse time; this - * is the case for entries beyond the length of rte->eref->colnames. 
- */ - noldcolumns = list_length(rte->eref->colnames); - changed_any = false; - j = 0; - for (i = 0; i < ncolumns; i++) - { - char *real_colname = real_colnames[i]; - char *colname = colinfo->colnames[i]; - - /* Skip dropped columns */ - if (real_colname == NULL) - { - Assert(colname == NULL); /* colnames[i] is already NULL */ - continue; - } - - /* If alias already assigned, that's what to use */ - if (colname == NULL) - { - /* If user wrote an alias, prefer that over real column name */ - if (rte->alias && i < list_length(rte->alias->colnames)) - colname = strVal(list_nth(rte->alias->colnames, i)); - else - colname = real_colname; - - /* Unique-ify and insert into colinfo */ - colname = make_colname_unique(colname, dpns, colinfo); - - colinfo->colnames[i] = colname; - } - - /* Put names of non-dropped columns in new_colnames[] too */ - colinfo->new_colnames[j] = colname; - /* And mark them as new or not */ - colinfo->is_new_col[j] = (i >= noldcolumns); - j++; - - /* Remember if any assigned aliases differ from "real" name */ - if (!changed_any && strcmp(colname, real_colname) != 0) - changed_any = true; - } - - /* - * Set correct length for new_colnames[] array. (Note: if columns have - * been added, colinfo->num_cols includes them, which is not really quite - * right but is harmless, since any new columns must be at the end where - * they won't affect varattnos of pre-existing columns.) - */ - colinfo->num_new_cols = j; - - /* - * For a relation RTE, we need only print the alias column names if any - * are different from the underlying "real" names. For a function RTE, - * always emit a complete column alias list; this is to protect against - * possible instability of the default column names (eg, from altering - * parameter names). For tablefunc RTEs, we never print aliases, because - * the column names are part of the clause itself. For other RTE types, - * print if we changed anything OR if there were user-written column - * aliases (since the latter would be part of the underlying "reality"). - */ - if (rte->rtekind == RTE_RELATION) - colinfo->printaliases = changed_any; - else if (rte->rtekind == RTE_FUNCTION) - colinfo->printaliases = true; - else if (rte->rtekind == RTE_TABLEFUNC) - colinfo->printaliases = false; - else if (rte->alias && rte->alias->colnames != NIL) - colinfo->printaliases = true; - else - colinfo->printaliases = changed_any; -} - -/* - * set_join_column_names: select column aliases for a join RTE - * - * Column alias info is saved in *colinfo, which is assumed to be pre-zeroed. - * If any colnames entries are already filled in, those override local - * choices. Also, names for USING columns were already chosen by - * set_using_names(). We further expect that column alias selection has been - * completed for both input RTEs. - */ -static void -set_join_column_names(deparse_namespace *dpns, RangeTblEntry *rte, - deparse_columns *colinfo) -{// #lizard forgives - deparse_columns *leftcolinfo; - deparse_columns *rightcolinfo; - bool changed_any; - int noldcolumns; - int nnewcolumns; - Bitmapset *leftmerged = NULL; - Bitmapset *rightmerged = NULL; - int i; - int j; - int ic; - int jc; - - /* Look up the previously-filled-in child deparse_columns structs */ - leftcolinfo = deparse_columns_fetch(colinfo->leftrti, dpns); - rightcolinfo = deparse_columns_fetch(colinfo->rightrti, dpns); - - /* - * Ensure colinfo->colnames has a slot for each column. (It could be long - * enough already, if we pushed down a name for the last column.) 
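Columns added to a base relation after a view was parsed are exactly the "new" entries this bookkeeping accounts for; the stored definition must still deparse with its original column references intact. Continuing the assumed example above (extra_col is likewise illustrative):

    -- extra_col is an assumed example column
    ALTER TABLE my_t1 ADD COLUMN extra_col integer;

    SELECT pg_get_viewdef('my_view'::regclass, true);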
Note: - * it's possible that one or both inputs now have more columns than there - * were when the query was parsed, but we'll deal with that below. We - * only need entries in colnames for pre-existing columns. - */ - noldcolumns = list_length(rte->eref->colnames); - expand_colnames_array_to(colinfo, noldcolumns); - Assert(colinfo->num_cols == noldcolumns); - - /* - * Scan the join output columns, select an alias for each one, and store - * it in colinfo->colnames. If there are USING columns, set_using_names() - * already selected their names, so we can start the loop at the first - * non-merged column. - */ - changed_any = false; - for (i = list_length(colinfo->usingNames); i < noldcolumns; i++) - { - char *colname = colinfo->colnames[i]; - char *real_colname; - - /* Ignore dropped column (only possible for non-merged column) */ - if (colinfo->leftattnos[i] == 0 && colinfo->rightattnos[i] == 0) - { - Assert(colname == NULL); - continue; - } - - /* Get the child column name */ - if (colinfo->leftattnos[i] > 0) - real_colname = leftcolinfo->colnames[colinfo->leftattnos[i] - 1]; - else if (colinfo->rightattnos[i] > 0) - real_colname = rightcolinfo->colnames[colinfo->rightattnos[i] - 1]; - else - { - /* We're joining system columns --- use eref name */ - real_colname = strVal(list_nth(rte->eref->colnames, i)); - } - Assert(real_colname != NULL); - - /* In an unnamed join, just report child column names as-is */ - if (rte->alias == NULL) - { - colinfo->colnames[i] = real_colname; - continue; - } - - /* If alias already assigned, that's what to use */ - if (colname == NULL) - { - /* If user wrote an alias, prefer that over real column name */ - if (rte->alias && i < list_length(rte->alias->colnames)) - colname = strVal(list_nth(rte->alias->colnames, i)); - else - colname = real_colname; - - /* Unique-ify and insert into colinfo */ - colname = make_colname_unique(colname, dpns, colinfo); - - colinfo->colnames[i] = colname; - } - - /* Remember if any assigned aliases differ from "real" name */ - if (!changed_any && strcmp(colname, real_colname) != 0) - changed_any = true; - } - - /* - * Calculate number of columns the join would have if it were re-parsed - * now, and create storage for the new_colnames and is_new_col arrays. - * - * Note: colname_is_unique will be consulting new_colnames[] during the - * loops below, so its not-yet-filled entries must be zeroes. - */ - nnewcolumns = leftcolinfo->num_new_cols + rightcolinfo->num_new_cols - - list_length(colinfo->usingNames); - colinfo->num_new_cols = nnewcolumns; - colinfo->new_colnames = (char **) palloc0(nnewcolumns * sizeof(char *)); - colinfo->is_new_col = (bool *) palloc0(nnewcolumns * sizeof(bool)); - - /* - * Generating the new_colnames array is a bit tricky since any new columns - * added since parse time must be inserted in the right places. This code - * must match the parser, which will order a join's columns as merged - * columns first (in USING-clause order), then non-merged columns from the - * left input (in attnum order), then non-merged columns from the right - * input (ditto). If one of the inputs is itself a join, its columns will - * be ordered according to the same rule, which means newly-added columns - * might not be at the end. We can figure out what's what by consulting - * the leftattnos and rightattnos arrays plus the input is_new_col arrays. 
- * - * In these loops, i indexes leftattnos/rightattnos (so it's join varattno - * less one), j indexes new_colnames/is_new_col, and ic/jc have similar - * meanings for the current child RTE. - */ - - /* Handle merged columns; they are first and can't be new */ - i = j = 0; - while (i < noldcolumns && - colinfo->leftattnos[i] != 0 && - colinfo->rightattnos[i] != 0) - { - /* column name is already determined and known unique */ - colinfo->new_colnames[j] = colinfo->colnames[i]; - colinfo->is_new_col[j] = false; - - /* build bitmapsets of child attnums of merged columns */ - if (colinfo->leftattnos[i] > 0) - leftmerged = bms_add_member(leftmerged, colinfo->leftattnos[i]); - if (colinfo->rightattnos[i] > 0) - rightmerged = bms_add_member(rightmerged, colinfo->rightattnos[i]); - - i++, j++; - } - - /* Handle non-merged left-child columns */ - ic = 0; - for (jc = 0; jc < leftcolinfo->num_new_cols; jc++) - { - char *child_colname = leftcolinfo->new_colnames[jc]; - - if (!leftcolinfo->is_new_col[jc]) - { - /* Advance ic to next non-dropped old column of left child */ - while (ic < leftcolinfo->num_cols && - leftcolinfo->colnames[ic] == NULL) - ic++; - Assert(ic < leftcolinfo->num_cols); - ic++; - /* If it is a merged column, we already processed it */ - if (bms_is_member(ic, leftmerged)) - continue; - /* Else, advance i to the corresponding existing join column */ - while (i < colinfo->num_cols && - colinfo->colnames[i] == NULL) - i++; - Assert(i < colinfo->num_cols); - Assert(ic == colinfo->leftattnos[i]); - /* Use the already-assigned name of this column */ - colinfo->new_colnames[j] = colinfo->colnames[i]; - i++; - } - else - { - /* - * Unique-ify the new child column name and assign, unless we're - * in an unnamed join, in which case just copy - */ - if (rte->alias != NULL) - { - colinfo->new_colnames[j] = - make_colname_unique(child_colname, dpns, colinfo); - if (!changed_any && - strcmp(colinfo->new_colnames[j], child_colname) != 0) - changed_any = true; - } - else - colinfo->new_colnames[j] = child_colname; - } - - colinfo->is_new_col[j] = leftcolinfo->is_new_col[jc]; - j++; - } - - /* Handle non-merged right-child columns in exactly the same way */ - ic = 0; - for (jc = 0; jc < rightcolinfo->num_new_cols; jc++) - { - char *child_colname = rightcolinfo->new_colnames[jc]; - - if (!rightcolinfo->is_new_col[jc]) - { - /* Advance ic to next non-dropped old column of right child */ - while (ic < rightcolinfo->num_cols && - rightcolinfo->colnames[ic] == NULL) - ic++; - Assert(ic < rightcolinfo->num_cols); - ic++; - /* If it is a merged column, we already processed it */ - if (bms_is_member(ic, rightmerged)) - continue; - /* Else, advance i to the corresponding existing join column */ - while (i < colinfo->num_cols && - colinfo->colnames[i] == NULL) - i++; - Assert(i < colinfo->num_cols); - Assert(ic == colinfo->rightattnos[i]); - /* Use the already-assigned name of this column */ - colinfo->new_colnames[j] = colinfo->colnames[i]; - i++; - } - else - { - /* - * Unique-ify the new child column name and assign, unless we're - * in an unnamed join, in which case just copy - */ - if (rte->alias != NULL) - { - colinfo->new_colnames[j] = - make_colname_unique(child_colname, dpns, colinfo); - if (!changed_any && - strcmp(colinfo->new_colnames[j], child_colname) != 0) - changed_any = true; - } - else - colinfo->new_colnames[j] = child_colname; - } - - colinfo->is_new_col[j] = rightcolinfo->is_new_col[jc]; - j++; - } - - /* Assert we processed the right number of columns */ -#ifdef USE_ASSERT_CHECKING - 
while (i < colinfo->num_cols && colinfo->colnames[i] == NULL) - i++; - Assert(i == colinfo->num_cols); - Assert(j == nnewcolumns); -#endif - - /* - * For a named join, print column aliases if we changed any from the child - * names. Unnamed joins cannot print aliases. - */ - if (rte->alias != NULL) - colinfo->printaliases = changed_any; - else - colinfo->printaliases = false; -} - -/* - * colname_is_unique: is colname distinct from already-chosen column names? - * - * dpns is query-wide info, colinfo is for the column's RTE - */ -static bool -colname_is_unique(char *colname, deparse_namespace *dpns, - deparse_columns *colinfo) -{// #lizard forgives - int i; - ListCell *lc; - - /* Check against already-assigned column aliases within RTE */ - for (i = 0; i < colinfo->num_cols; i++) - { - char *oldname = colinfo->colnames[i]; - - if (oldname && strcmp(oldname, colname) == 0) - return false; - } - - /* - * If we're building a new_colnames array, check that too (this will be - * partially but not completely redundant with the previous checks) - */ - for (i = 0; i < colinfo->num_new_cols; i++) - { - char *oldname = colinfo->new_colnames[i]; - - if (oldname && strcmp(oldname, colname) == 0) - return false; - } - - /* Also check against USING-column names that must be globally unique */ - foreach(lc, dpns->using_names) - { - char *oldname = (char *) lfirst(lc); - - if (strcmp(oldname, colname) == 0) - return false; - } - - /* Also check against names already assigned for parent-join USING cols */ - foreach(lc, colinfo->parentUsing) - { - char *oldname = (char *) lfirst(lc); - - if (strcmp(oldname, colname) == 0) - return false; - } - - return true; -} - -/* - * make_colname_unique: modify colname if necessary to make it unique - * - * dpns is query-wide info, colinfo is for the column's RTE - */ -static char * -make_colname_unique(char *colname, deparse_namespace *dpns, - deparse_columns *colinfo) -{ - /* - * If the selected name isn't unique, append digits to make it so. For a - * very long input name, we might have to truncate to stay within - * NAMEDATALEN. - */ - if (!colname_is_unique(colname, dpns, colinfo)) - { - int colnamelen = strlen(colname); - char *modname = (char *) palloc(colnamelen + 16); - int i = 0; - - do - { - i++; - for (;;) - { - /* - * We avoid using %.*s here because it can misbehave if the - * data is not valid in what libc thinks is the prevailing - * encoding. - */ - memcpy(modname, colname, colnamelen); - sprintf(modname + colnamelen, "_%d", i); - if (strlen(modname) < NAMEDATALEN) - break; - /* drop chars from colname to keep all the digits */ - colnamelen = pg_mbcliplen(colname, colnamelen, - colnamelen - 1); - } - } while (!colname_is_unique(modname, dpns, colinfo)); - colname = modname; - } - return colname; -} - -/* - * expand_colnames_array_to: make colinfo->colnames at least n items long - * - * Any added array entries are initialized to zero. - */ -static void -expand_colnames_array_to(deparse_columns *colinfo, int n) -{ - if (n > colinfo->num_cols) - { - if (colinfo->colnames == NULL) - colinfo->colnames = (char **) palloc0(n * sizeof(char *)); - else - { - colinfo->colnames = (char **) repalloc(colinfo->colnames, - n * sizeof(char *)); - memset(colinfo->colnames + colinfo->num_cols, 0, - (n - colinfo->num_cols) * sizeof(char *)); - } - colinfo->num_cols = n; - } -} - -/* - * identify_join_columns: figure out where columns of a join come from - * - * Fills the join-specific fields of the colinfo struct, except for - * usingNames which is filled later. 
- */ -static void -identify_join_columns(JoinExpr *j, RangeTblEntry *jrte, - deparse_columns *colinfo) -{// #lizard forgives - int numjoincols; - int i; - ListCell *lc; - - /* Extract left/right child RT indexes */ - if (IsA(j->larg, RangeTblRef)) - colinfo->leftrti = ((RangeTblRef *) j->larg)->rtindex; - else if (IsA(j->larg, JoinExpr)) - colinfo->leftrti = ((JoinExpr *) j->larg)->rtindex; - else - elog(ERROR, "unrecognized node type in jointree: %d", - (int) nodeTag(j->larg)); - if (IsA(j->rarg, RangeTblRef)) - colinfo->rightrti = ((RangeTblRef *) j->rarg)->rtindex; - else if (IsA(j->rarg, JoinExpr)) - colinfo->rightrti = ((JoinExpr *) j->rarg)->rtindex; - else - elog(ERROR, "unrecognized node type in jointree: %d", - (int) nodeTag(j->rarg)); - - /* Assert children will be processed earlier than join in second pass */ - Assert(colinfo->leftrti < j->rtindex); - Assert(colinfo->rightrti < j->rtindex); - - /* Initialize result arrays with zeroes */ - numjoincols = list_length(jrte->joinaliasvars); - Assert(numjoincols == list_length(jrte->eref->colnames)); - colinfo->leftattnos = (int *) palloc0(numjoincols * sizeof(int)); - colinfo->rightattnos = (int *) palloc0(numjoincols * sizeof(int)); - - /* Scan the joinaliasvars list to identify simple column references */ - i = 0; - foreach(lc, jrte->joinaliasvars) - { - Var *aliasvar = (Var *) lfirst(lc); - - /* get rid of any implicit coercion above the Var */ - aliasvar = (Var *) strip_implicit_coercions((Node *) aliasvar); - - if (aliasvar == NULL) - { - /* It's a dropped column; nothing to do here */ - } - else if (IsA(aliasvar, Var)) - { - Assert(aliasvar->varlevelsup == 0); - Assert(aliasvar->varattno != 0); - if (aliasvar->varno == colinfo->leftrti) - colinfo->leftattnos[i] = aliasvar->varattno; - else if (aliasvar->varno == colinfo->rightrti) - colinfo->rightattnos[i] = aliasvar->varattno; - else - elog(ERROR, "unexpected varno %d in JOIN RTE", - aliasvar->varno); - } - else if (IsA(aliasvar, CoalesceExpr)) - { - /* - * It's a merged column in FULL JOIN USING. Ignore it for now and - * let the code below identify the merged columns. - */ - } - else - elog(ERROR, "unrecognized node type in join alias vars: %d", - (int) nodeTag(aliasvar)); - - i++; - } - - /* - * If there's a USING clause, deconstruct the join quals to identify the - * merged columns. This is a tad painful but if we cannot rely on the - * column names, there is no other representation of which columns were - * joined by USING. (Unless the join type is FULL, we can't tell from the - * joinaliasvars list which columns are merged.) Note: we assume that the - * merged columns are the first output column(s) of the join. 
- */ - if (j->usingClause) - { - List *leftvars = NIL; - List *rightvars = NIL; - ListCell *lc2; - - /* Extract left- and right-side Vars from the qual expression */ - flatten_join_using_qual(j->quals, &leftvars, &rightvars); - Assert(list_length(leftvars) == list_length(j->usingClause)); - Assert(list_length(rightvars) == list_length(j->usingClause)); - - /* Mark the output columns accordingly */ - i = 0; - forboth(lc, leftvars, lc2, rightvars) - { - Var *leftvar = (Var *) lfirst(lc); - Var *rightvar = (Var *) lfirst(lc2); - - Assert(leftvar->varlevelsup == 0); - Assert(leftvar->varattno != 0); - if (leftvar->varno != colinfo->leftrti) - elog(ERROR, "unexpected varno %d in JOIN USING qual", - leftvar->varno); - colinfo->leftattnos[i] = leftvar->varattno; - - Assert(rightvar->varlevelsup == 0); - Assert(rightvar->varattno != 0); - if (rightvar->varno != colinfo->rightrti) - elog(ERROR, "unexpected varno %d in JOIN USING qual", - rightvar->varno); - colinfo->rightattnos[i] = rightvar->varattno; - - i++; - } - } -} - -/* - * flatten_join_using_qual: extract Vars being joined from a JOIN/USING qual - * - * We assume that transformJoinUsingClause won't have produced anything except - * AND nodes, equality operator nodes, and possibly implicit coercions, and - * that the AND node inputs match left-to-right with the original USING list. - * - * Caller must initialize the result lists to NIL. - */ -static void -flatten_join_using_qual(Node *qual, List **leftvars, List **rightvars) -{ - if (IsA(qual, BoolExpr)) - { - /* Handle AND nodes by recursion */ - BoolExpr *b = (BoolExpr *) qual; - ListCell *lc; - - Assert(b->boolop == AND_EXPR); - foreach(lc, b->args) - { - flatten_join_using_qual((Node *) lfirst(lc), - leftvars, rightvars); - } - } - else if (IsA(qual, OpExpr)) - { - /* Otherwise we should have an equality operator */ - OpExpr *op = (OpExpr *) qual; - Var *var; - - if (list_length(op->args) != 2) - elog(ERROR, "unexpected unary operator in JOIN/USING qual"); - /* Arguments should be Vars with perhaps implicit coercions */ - var = (Var *) strip_implicit_coercions((Node *) linitial(op->args)); - if (!IsA(var, Var)) - elog(ERROR, "unexpected node type in JOIN/USING qual: %d", - (int) nodeTag(var)); - *leftvars = lappend(*leftvars, var); - var = (Var *) strip_implicit_coercions((Node *) lsecond(op->args)); - if (!IsA(var, Var)) - elog(ERROR, "unexpected node type in JOIN/USING qual: %d", - (int) nodeTag(var)); - *rightvars = lappend(*rightvars, var); - } - else - { - /* Perhaps we have an implicit coercion to boolean? */ - Node *q = strip_implicit_coercions(qual); - - if (q != qual) - flatten_join_using_qual(q, leftvars, rightvars); - else - elog(ERROR, "unexpected node type in JOIN/USING qual: %d", - (int) nodeTag(qual)); - } -} - -/* - * get_rtable_name: convenience function to get a previously assigned RTE alias - * - * The RTE must belong to the topmost namespace level in "context". - */ -static char * -get_rtable_name(int rtindex, deparse_context *context) -{ - deparse_namespace *dpns = (deparse_namespace *) linitial(context->namespaces); - - Assert(rtindex > 0 && rtindex <= list_length(dpns->rtable_names)); - return (char *) list_nth(dpns->rtable_names, rtindex - 1); -} - -/* - * set_deparse_planstate: set up deparse_namespace to parse subexpressions - * of a given PlanState node - * - * This sets the planstate, outer_planstate, inner_planstate, outer_tlist, - * inner_tlist, and index_tlist fields. Caller is responsible for adjusting - * the ancestors list if necessary. 
Note that the rtable and ctes fields do - * not need to change when shifting attention to different plan nodes in a - * single plan tree. - */ -static void -set_deparse_planstate(deparse_namespace *dpns, PlanState *ps) -{// #lizard forgives - dpns->planstate = ps; - - /* - * We special-case Append and MergeAppend to pretend that the first child - * plan is the OUTER referent; we have to interpret OUTER Vars in their - * tlists according to one of the children, and the first one is the most - * natural choice. Likewise special-case ModifyTable to pretend that the - * first child plan is the OUTER referent; this is to support RETURNING - * lists containing references to non-target relations. - */ - if (IsA(ps, AppendState)) - dpns->outer_planstate = ((AppendState *) ps)->appendplans[0]; - else if (IsA(ps, MergeAppendState)) - dpns->outer_planstate = ((MergeAppendState *) ps)->mergeplans[0]; - else if (IsA(ps, ModifyTableState)) - dpns->outer_planstate = ((ModifyTableState *) ps)->mt_plans[0]; - else - dpns->outer_planstate = outerPlanState(ps); - - if (dpns->outer_planstate) - dpns->outer_tlist = dpns->outer_planstate->plan->targetlist; - else - dpns->outer_tlist = NIL; - - /* - * For a SubqueryScan, pretend the subplan is INNER referent. (We don't - * use OUTER because that could someday conflict with the normal meaning.) - * Likewise, for a CteScan, pretend the subquery's plan is INNER referent. - * For ON CONFLICT .. UPDATE we just need the inner tlist to point to the - * excluded expression's tlist. (Similar to the SubqueryScan we don't want - * to reuse OUTER, it's used for RETURNING in some modify table cases, - * although not INSERT .. CONFLICT). - */ - if (IsA(ps, SubqueryScanState)) - dpns->inner_planstate = ((SubqueryScanState *) ps)->subplan; - else if (IsA(ps, CteScanState)) - dpns->inner_planstate = ((CteScanState *) ps)->cteplanstate; - else if (IsA(ps, ModifyTableState)) - dpns->inner_planstate = ps; - else - dpns->inner_planstate = innerPlanState(ps); - - if (IsA(ps, ModifyTableState)) - dpns->inner_tlist = ((ModifyTableState *) ps)->mt_excludedtlist; - else if (dpns->inner_planstate) - dpns->inner_tlist = dpns->inner_planstate->plan->targetlist; - else - dpns->inner_tlist = NIL; - - /* Set up referent for INDEX_VAR Vars, if needed */ - if (IsA(ps->plan, IndexOnlyScan)) - dpns->index_tlist = ((IndexOnlyScan *) ps->plan)->indextlist; - else if (IsA(ps->plan, ForeignScan)) - dpns->index_tlist = ((ForeignScan *) ps->plan)->fdw_scan_tlist; - else if (IsA(ps->plan, CustomScan)) - dpns->index_tlist = ((CustomScan *) ps->plan)->custom_scan_tlist; - else - dpns->index_tlist = NIL; -} - -/* - * push_child_plan: temporarily transfer deparsing attention to a child plan - * - * When expanding an OUTER_VAR or INNER_VAR reference, we must adjust the - * deparse context in case the referenced expression itself uses - * OUTER_VAR/INNER_VAR. We modify the top stack entry in-place to avoid - * affecting levelsup issues (although in a Plan tree there really shouldn't - * be any). - * - * Caller must provide a local deparse_namespace variable to save the - * previous state for pop_child_plan. 
- */ -static void -push_child_plan(deparse_namespace *dpns, PlanState *ps, - deparse_namespace *save_dpns) -{ - /* Save state for restoration later */ - *save_dpns = *dpns; - - /* Link current plan node into ancestors list */ - dpns->ancestors = lcons(dpns->planstate, dpns->ancestors); - - /* Set attention on selected child */ - set_deparse_planstate(dpns, ps); -} - -/* - * pop_child_plan: undo the effects of push_child_plan - */ -static void -pop_child_plan(deparse_namespace *dpns, deparse_namespace *save_dpns) -{ - List *ancestors; - - /* Get rid of ancestors list cell added by push_child_plan */ - ancestors = list_delete_first(dpns->ancestors); - - /* Restore fields changed by push_child_plan */ - *dpns = *save_dpns; - - /* Make sure dpns->ancestors is right (may be unnecessary) */ - dpns->ancestors = ancestors; -} - -/* - * push_ancestor_plan: temporarily transfer deparsing attention to an - * ancestor plan - * - * When expanding a Param reference, we must adjust the deparse context - * to match the plan node that contains the expression being printed; - * otherwise we'd fail if that expression itself contains a Param or - * OUTER_VAR/INNER_VAR/INDEX_VAR variable. - * - * The target ancestor is conveniently identified by the ListCell holding it - * in dpns->ancestors. - * - * Caller must provide a local deparse_namespace variable to save the - * previous state for pop_ancestor_plan. - */ -static void -push_ancestor_plan(deparse_namespace *dpns, ListCell *ancestor_cell, - deparse_namespace *save_dpns) -{ - PlanState *ps = (PlanState *) lfirst(ancestor_cell); - List *ancestors; - - /* Save state for restoration later */ - *save_dpns = *dpns; - - /* Build a new ancestor list with just this node's ancestors */ - ancestors = NIL; - while ((ancestor_cell = lnext(ancestor_cell)) != NULL) - ancestors = lappend(ancestors, lfirst(ancestor_cell)); - dpns->ancestors = ancestors; - - /* Set attention on selected ancestor */ - set_deparse_planstate(dpns, ps); -} - -/* - * pop_ancestor_plan: undo the effects of push_ancestor_plan - */ -static void -pop_ancestor_plan(deparse_namespace *dpns, deparse_namespace *save_dpns) -{ - /* Free the ancestor list made in push_ancestor_plan */ - list_free(dpns->ancestors); - - /* Restore fields changed by push_ancestor_plan */ - *dpns = *save_dpns; -} - - -/* ---------- - * make_ruledef - reconstruct the CREATE RULE command - * for a given pg_rewrite tuple - * ---------- - */ -static void -make_ruledef(StringInfo buf, HeapTuple ruletup, TupleDesc rulettc, - int prettyFlags) -{// #lizard forgives - char *rulename; - char ev_type; - Oid ev_class; - bool is_instead; - char *ev_qual; - char *ev_action; - List *actions = NIL; - Relation ev_relation; - TupleDesc viewResultDesc = NULL; - int fno; - Datum dat; - bool isnull; - - /* - * Get the attribute values from the rules tuple - */ - fno = SPI_fnumber(rulettc, "rulename"); - dat = SPI_getbinval(ruletup, rulettc, fno, &isnull); - Assert(!isnull); - rulename = NameStr(*(DatumGetName(dat))); - - fno = SPI_fnumber(rulettc, "ev_type"); - dat = SPI_getbinval(ruletup, rulettc, fno, &isnull); - Assert(!isnull); - ev_type = DatumGetChar(dat); - - fno = SPI_fnumber(rulettc, "ev_class"); - dat = SPI_getbinval(ruletup, rulettc, fno, &isnull); - Assert(!isnull); - ev_class = DatumGetObjectId(dat); - - fno = SPI_fnumber(rulettc, "is_instead"); - dat = SPI_getbinval(ruletup, rulettc, fno, &isnull); - Assert(!isnull); - is_instead = DatumGetBool(dat); - - /* these could be nulls */ - fno = SPI_fnumber(rulettc, "ev_qual"); - ev_qual = 
SPI_getvalue(ruletup, rulettc, fno); - - fno = SPI_fnumber(rulettc, "ev_action"); - ev_action = SPI_getvalue(ruletup, rulettc, fno); - if (ev_action != NULL) - actions = (List *) stringToNode(ev_action); - - ev_relation = heap_open(ev_class, AccessShareLock); - - /* - * Build the rules definition text - */ - appendStringInfo(buf, "CREATE RULE %s AS", - quote_identifier(rulename)); - - if (prettyFlags & PRETTYFLAG_INDENT) - appendStringInfoString(buf, "\n ON "); - else - appendStringInfoString(buf, " ON "); - - /* The event the rule is fired for */ - switch (ev_type) - { - case '1': - appendStringInfoString(buf, "SELECT"); - viewResultDesc = RelationGetDescr(ev_relation); - break; - - case '2': - appendStringInfoString(buf, "UPDATE"); - break; - - case '3': - appendStringInfoString(buf, "INSERT"); - break; - - case '4': - appendStringInfoString(buf, "DELETE"); - break; - - default: - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("rule \"%s\" has unsupported event type %d", - rulename, ev_type))); - break; - } - - /* The relation the rule is fired on */ - appendStringInfo(buf, " TO %s", generate_relation_name(ev_class, NIL)); - - /* If the rule has an event qualification, add it */ - if (ev_qual == NULL) - ev_qual = ""; - if (strlen(ev_qual) > 0 && strcmp(ev_qual, "<>") != 0) - { - Node *qual; - Query *query; - deparse_context context; - deparse_namespace dpns; - - if (prettyFlags & PRETTYFLAG_INDENT) - appendStringInfoString(buf, "\n "); - appendStringInfoString(buf, " WHERE "); - - qual = stringToNode(ev_qual); - - /* - * We need to make a context for recognizing any Vars in the qual - * (which can only be references to OLD and NEW). Use the rtable of - * the first query in the action list for this purpose. - */ - query = (Query *) linitial(actions); - - /* - * If the action is INSERT...SELECT, OLD/NEW have been pushed down - * into the SELECT, and that's what we need to look at. (Ugly kluge - * ... try to fix this when we redesign querytrees.) 
- */ - query = getInsertSelectQuery(query, NULL); - - /* Must acquire locks right away; see notes in get_query_def() */ - AcquireRewriteLocks(query, false, false); - - context.buf = buf; - context.namespaces = list_make1(&dpns); - context.windowClause = NIL; - context.windowTList = NIL; - context.varprefix = (list_length(query->rtable) != 1); - context.prettyFlags = prettyFlags; - context.wrapColumn = WRAP_COLUMN_DEFAULT; - context.indentLevel = PRETTYINDENT_STD; - context.special_exprkind = EXPR_KIND_NONE; - - set_deparse_for_query(&dpns, query, NIL); - - get_rule_expr(qual, &context, false); - } - - appendStringInfoString(buf, " DO "); - - /* The INSTEAD keyword (if so) */ - if (is_instead) - appendStringInfoString(buf, "INSTEAD "); - - /* Finally the rules actions */ - if (list_length(actions) > 1) - { - ListCell *action; - Query *query; - - appendStringInfoChar(buf, '('); - foreach(action, actions) - { - query = (Query *) lfirst(action); - get_query_def(query, buf, NIL, viewResultDesc, - prettyFlags, WRAP_COLUMN_DEFAULT, 0 -#ifdef PGXC - , false, false -#endif /* PGXC */ - ); - if (prettyFlags) - appendStringInfoString(buf, ";\n"); - else - appendStringInfoString(buf, "; "); - } - appendStringInfoString(buf, ");"); - } - else if (list_length(actions) == 0) - { - appendStringInfoString(buf, "NOTHING;"); - } - else - { - Query *query; - - query = (Query *) linitial(actions); - get_query_def(query, buf, NIL, viewResultDesc, - prettyFlags, WRAP_COLUMN_DEFAULT, 0 -#ifdef PGXC - , false, false -#endif /* PGXC */ - ); - appendStringInfo(buf, ";"); - } - - heap_close(ev_relation, AccessShareLock); -} - - -/* ---------- - * make_viewdef - reconstruct the SELECT part of a - * view rewrite rule - * ---------- - */ -static void -make_viewdef(StringInfo buf, HeapTuple ruletup, TupleDesc rulettc, - int prettyFlags, int wrapColumn) -{// #lizard forgives - Query *query; - char ev_type; - Oid ev_class; - bool is_instead; - char *ev_qual; - char *ev_action; - List *actions = NIL; - Relation ev_relation; - int fno; - Datum dat; - bool isnull; - - /* - * Get the attribute values from the rules tuple - */ - fno = SPI_fnumber(rulettc, "ev_type"); - dat = SPI_getbinval(ruletup, rulettc, fno, &isnull); - Assert(!isnull); - ev_type = DatumGetChar(dat); - - fno = SPI_fnumber(rulettc, "ev_class"); - dat = SPI_getbinval(ruletup, rulettc, fno, &isnull); - Assert(!isnull); - ev_class = DatumGetObjectId(dat); - - fno = SPI_fnumber(rulettc, "is_instead"); - dat = SPI_getbinval(ruletup, rulettc, fno, &isnull); - Assert(!isnull); - is_instead = DatumGetBool(dat); - - /* these could be nulls */ - fno = SPI_fnumber(rulettc, "ev_qual"); - ev_qual = SPI_getvalue(ruletup, rulettc, fno); - - fno = SPI_fnumber(rulettc, "ev_action"); - ev_action = SPI_getvalue(ruletup, rulettc, fno); - if (ev_action != NULL) - actions = (List *) stringToNode(ev_action); - - if (list_length(actions) != 1) - { - /* keep output buffer empty and leave */ - return; - } - - query = (Query *) linitial(actions); - - if (ev_type != '1' || !is_instead || - strcmp(ev_qual, "<>") != 0 || query->commandType != CMD_SELECT) - { - /* keep output buffer empty and leave */ - return; - } - - ev_relation = heap_open(ev_class, AccessShareLock); - - get_query_def(query, buf, NIL, RelationGetDescr(ev_relation), - prettyFlags, wrapColumn, 0 -#ifdef PGXC - , false, false -#endif /* PGXC */ - ); - appendStringInfo(buf, ";"); - - heap_close(ev_relation, AccessShareLock); -} - -#ifdef PGXC -/* ---------- - * deparse_query - Parse back one query parsetree - * - * Purpose 
of this function is to build up statement for a RemoteQuery - * It just calls get_query_def without pretty print flags - * ---------- - */ -void -deparse_query(Query *query, StringInfo buf, List *parentnamespace, - bool finalise_aggs, bool sortgroup_colno) -{ - get_query_def(query, buf, parentnamespace, NULL, 0, 0, 0, finalise_aggs, - sortgroup_colno); -} - -/* code borrowed from get_insert_query_def */ -void -get_query_def_from_valuesList(Query *query, StringInfo buf) -{// #lizard forgives - - RangeTblEntry *select_rte = NULL; - RangeTblEntry *values_rte = NULL; - RangeTblEntry *rte; - char *sep; - ListCell *values_cell; - ListCell *l; - List *strippedexprs; - deparse_context context; - deparse_namespace dpns; - - /* - * Before we begin to examine the query, acquire locks on referenced - * relations, and fix up deleted columns in JOIN RTEs. This ensures - * consistent results. Note we assume it's OK to scribble on the passed - * querytree! - */ - AcquireRewriteLocks(query, false, false); - - context.buf = buf; - context.namespaces = NIL; - context.windowClause = NIL; - context.windowTList = NIL; - context.varprefix = (list_length(query->rtable) != 1); - context.prettyFlags = 0; - context.indentLevel = 0; - context.wrapColumn = 0; - - dpns.rtable = query->rtable; - dpns.ctes = query->cteList; - dpns.planstate = NULL; - dpns.ancestors = NIL; - dpns.outer_planstate = dpns.inner_planstate = NULL; - - /* - * If it's an INSERT ... SELECT or VALUES (...), (...), ... there will be - * a single RTE for the SELECT or VALUES. - */ - foreach(l, query->rtable) - { - rte = (RangeTblEntry *) lfirst(l); - - if (rte->rtekind == RTE_SUBQUERY) - { - if (select_rte) - elog(ERROR, "too many subquery RTEs in INSERT"); - select_rte = rte; - } - - if (rte->rtekind == RTE_VALUES) - { - if (values_rte) - elog(ERROR, "too many values RTEs in INSERT"); - values_rte = rte; - } - } - if (select_rte && values_rte) - elog(ERROR, "both subquery and values RTEs in INSERT"); - - /* - * Start the query with INSERT INTO relname - */ - rte = rt_fetch(query->resultRelation, query->rtable); - Assert(rte->rtekind == RTE_RELATION); - - appendStringInfo(buf, "INSERT INTO %s (", - generate_relation_name(rte->relid, NIL)); - - /* - * Add the insert-column-names list. To handle indirection properly, we - * need to look for indirection nodes in the top targetlist (if it's - * INSERT ... SELECT or INSERT ... single VALUES), or in the first - * expression list of the VALUES RTE (if it's INSERT ... multi VALUES). We - * assume that all the expression lists will have similar indirection in - * the latter case. - */ - if (values_rte) - values_cell = list_head((List *) linitial(values_rte->values_lists)); - else - values_cell = NULL; - strippedexprs = NIL; - sep = ""; - foreach(l, query->targetList) - { - TargetEntry *tle = (TargetEntry *) lfirst(l); - - elog(DEBUG1, "targetEntry type is %d\n)", tle->expr->type); - if (tle->resjunk || !IsA(tle->expr, Var)) - continue; /* ignore junk entries */ - - appendStringInfoString(buf, sep); - sep = ", "; - - /* - * Put out name of target column; look in the catalogs, not at - * tle->resname, since resname will fail to track RENAME. - */ - appendStringInfoString(buf,quote_identifier(get_relid_attribute_name(rte->relid, tle->resno))); - - /* - * Print any indirection needed (subfields or subscripts), and strip - * off the top-level nodes representing the indirection assignments. 
- */ - if (values_cell) - { - /* we discard the stripped expression in this case */ - processIndirection((Node *) lfirst(values_cell), &context); - values_cell = lnext(values_cell); - } - else - { - /* we keep a list of the stripped expressions in this case */ - strippedexprs = lappend(strippedexprs, processIndirection((Node *) tle->expr, &context)); - } - } - appendStringInfo(buf, ") "); - - if (select_rte) - { - /* Add the SELECT */ - get_query_def(select_rte->subquery, buf, NIL, NULL, - context.prettyFlags, context.wrapColumn, - context.indentLevel, - context.finalise_aggs, context.sortgroup_colno); - } - else if (values_rte) - { - /* A WITH clause is possible here */ - get_with_clause(query, &context); - /* Add the multi-VALUES expression lists */ - get_values_def(values_rte->values_lists, &context); - } - else - { - /* A WITH clause is possible here */ - get_with_clause(query, &context); - /* Add the single-VALUES expression list */ - appendContextKeyword(&context, "VALUES (", - -PRETTYINDENT_STD, PRETTYINDENT_STD, 2); - get_rule_expr((Node *) strippedexprs, &context, false); - appendStringInfoChar(buf, ')'); - } - - /* Add RETURNING if present */ - if (query->returningList) - { - appendContextKeyword(&context, " RETURNING", - -PRETTYINDENT_STD, PRETTYINDENT_STD, 1); - get_target_list(query->returningList, &context, NULL); - } -} -#endif -/* ---------- - * get_query_def - Parse back one query parsetree - * - * If resultDesc is not NULL, then it is the output tuple descriptor for - * the view represented by a SELECT query. - * ---------- - */ -static void -get_query_def(Query *query, StringInfo buf, List *parentnamespace, - TupleDesc resultDesc, - int prettyFlags, int wrapColumn, int startIndent, - bool finalise_aggs, bool sortgroup_colno) -{// #lizard forgives - deparse_context context; - deparse_namespace dpns; - - /* Guard against excessively long or deeply-nested queries */ - CHECK_FOR_INTERRUPTS(); - check_stack_depth(); - - /* - * Before we begin to examine the query, acquire locks on referenced - * relations, and fix up deleted columns in JOIN RTEs. This ensures - * consistent results. Note we assume it's OK to scribble on the passed - * querytree! - * - * We are only deparsing the query (we are not about to execute it), so we - * only need AccessShareLock on the relations it mentions. 
- */ - AcquireRewriteLocks(query, false, false); - - context.buf = buf; - context.namespaces = lcons(&dpns, list_copy(parentnamespace)); - context.windowClause = NIL; - context.windowTList = NIL; - context.varprefix = (parentnamespace != NIL || - list_length(query->rtable) != 1); - context.prettyFlags = prettyFlags; - context.wrapColumn = wrapColumn; - context.indentLevel = startIndent; - context.special_exprkind = EXPR_KIND_NONE; - context.finalise_aggs = finalise_aggs; - context.sortgroup_colno = sortgroup_colno; - - set_deparse_for_query(&dpns, query, parentnamespace); - - switch (query->commandType) - { - case CMD_SELECT: - get_select_query_def(query, &context, resultDesc); - break; - - case CMD_UPDATE: - get_update_query_def(query, &context); - break; - - case CMD_INSERT: - get_insert_query_def(query, &context); - break; - - case CMD_DELETE: - get_delete_query_def(query, &context); - break; - - case CMD_NOTHING: - appendStringInfoString(buf, "NOTHING"); - break; - - case CMD_UTILITY: - get_utility_query_def(query, &context); - break; - - default: - elog(ERROR, "unrecognized query command type: %d", - query->commandType); - break; - } -} - -/* ---------- - * get_values_def - Parse back a VALUES list - * ---------- - */ -static void -get_values_def(List *values_lists, deparse_context *context) -{ - StringInfo buf = context->buf; - bool first_list = true; - ListCell *vtl; - - appendStringInfoString(buf, "VALUES "); - - foreach(vtl, values_lists) - { - List *sublist = (List *) lfirst(vtl); - bool first_col = true; - ListCell *lc; - - if (first_list) - first_list = false; - else - appendStringInfoString(buf, ", "); - - appendStringInfoChar(buf, '('); - foreach(lc, sublist) - { - Node *col = (Node *) lfirst(lc); - - if (first_col) - first_col = false; - else - appendStringInfoChar(buf, ','); - - /* - * Print the value. Whole-row Vars need special treatment. 
- */ - get_rule_expr_toplevel(col, context, false); - } - appendStringInfoChar(buf, ')'); - } -} - -/* ---------- - * get_with_clause - Parse back a WITH clause - * ---------- - */ -static void -get_with_clause(Query *query, deparse_context *context) -{// #lizard forgives - StringInfo buf = context->buf; - const char *sep; - ListCell *l; - - if (query->cteList == NIL) - return; - - if (PRETTY_INDENT(context)) - { - context->indentLevel += PRETTYINDENT_STD; - appendStringInfoChar(buf, ' '); - } - - if (query->hasRecursive) - sep = "WITH RECURSIVE "; - else - sep = "WITH "; - foreach(l, query->cteList) - { - CommonTableExpr *cte = (CommonTableExpr *) lfirst(l); - - appendStringInfoString(buf, sep); - appendStringInfoString(buf, quote_identifier(cte->ctename)); - if (cte->aliascolnames) - { - bool first = true; - ListCell *col; - - appendStringInfoChar(buf, '('); - foreach(col, cte->aliascolnames) - { - if (first) - first = false; - else - appendStringInfoString(buf, ", "); - appendStringInfoString(buf, - quote_identifier(strVal(lfirst(col)))); - } - appendStringInfoChar(buf, ')'); - } - appendStringInfoString(buf, " AS ("); - if (PRETTY_INDENT(context)) - appendContextKeyword(context, "", 0, 0, 0); - get_query_def((Query *) cte->ctequery, buf, context->namespaces, NULL, - context->prettyFlags, context->wrapColumn, - context->indentLevel, - context->finalise_aggs, - context->sortgroup_colno); - if (PRETTY_INDENT(context)) - appendContextKeyword(context, "", 0, 0, 0); - appendStringInfoChar(buf, ')'); - sep = ", "; - } - - if (PRETTY_INDENT(context)) - { - context->indentLevel -= PRETTYINDENT_STD; - appendContextKeyword(context, "", 0, 0, 0); - } - else - appendStringInfoChar(buf, ' '); -} - -/* ---------- - * get_select_query_def - Parse back a SELECT parsetree - * ---------- - */ -static void -get_select_query_def(Query *query, deparse_context *context, - TupleDesc resultDesc) -{// #lizard forgives - StringInfo buf = context->buf; - List *save_windowclause; - List *save_windowtlist; - bool force_colno; - ListCell *l; - - /* Insert the WITH clause if given */ - get_with_clause(query, context); - - /* Set up context for possible window functions */ - save_windowclause = context->windowClause; - context->windowClause = query->windowClause; - save_windowtlist = context->windowTList; - context->windowTList = query->targetList; - - /* - * If the Query node has a setOperations tree, then it's the top level of - * a UNION/INTERSECT/EXCEPT query; only the WITH, ORDER BY and LIMIT - * fields are interesting in the top query itself. 
- */ - if (query->setOperations) - { - get_setop_query(query->setOperations, query, context, resultDesc); - /* ORDER BY clauses must be simple in this case */ - force_colno = true; - } - else - { - get_basic_select_query(query, context, resultDesc); - force_colno = false; - } - - /* Add the ORDER BY clause if given */ - if (query->sortClause != NIL) - { - appendContextKeyword(context, " ORDER BY ", - -PRETTYINDENT_STD, PRETTYINDENT_STD, 1); - get_rule_orderby(query->sortClause, query->targetList, - force_colno, context); - } - - /* Add the LIMIT clause if given */ - if (query->limitOffset != NULL) - { - appendContextKeyword(context, " OFFSET ", - -PRETTYINDENT_STD, PRETTYINDENT_STD, 0); - get_rule_expr(query->limitOffset, context, false); - } - if (query->limitCount != NULL) - { - appendContextKeyword(context, " LIMIT ", - -PRETTYINDENT_STD, PRETTYINDENT_STD, 0); - if (IsA(query->limitCount, Const) && - ((Const *) query->limitCount)->constisnull) - appendStringInfoString(buf, "ALL"); - else - get_rule_expr(query->limitCount, context, false); - } - - /* Add FOR [KEY] UPDATE/SHARE clauses if present */ - if (query->hasForUpdate) - { - foreach(l, query->rowMarks) - { - RowMarkClause *rc = (RowMarkClause *) lfirst(l); - - /* don't print implicit clauses */ - if (rc->pushedDown) - continue; - - switch (rc->strength) - { - case LCS_NONE: - /* we intentionally throw an error for LCS_NONE */ - elog(ERROR, "unrecognized LockClauseStrength %d", - (int) rc->strength); - break; - case LCS_FORKEYSHARE: - appendContextKeyword(context, " FOR KEY SHARE", - -PRETTYINDENT_STD, PRETTYINDENT_STD, 0); - break; - case LCS_FORSHARE: - appendContextKeyword(context, " FOR SHARE", - -PRETTYINDENT_STD, PRETTYINDENT_STD, 0); - break; - case LCS_FORNOKEYUPDATE: - appendContextKeyword(context, " FOR NO KEY UPDATE", - -PRETTYINDENT_STD, PRETTYINDENT_STD, 0); - break; - case LCS_FORUPDATE: - appendContextKeyword(context, " FOR UPDATE", - -PRETTYINDENT_STD, PRETTYINDENT_STD, 0); - break; - } - - appendStringInfo(buf, " OF %s", - quote_identifier(get_rtable_name(rc->rti, - context))); - if (rc->waitPolicy == LockWaitError) - appendStringInfoString(buf, " NOWAIT"); - else if (rc->waitPolicy == LockWaitSkip) - appendStringInfoString(buf, " SKIP LOCKED"); - } - } - - context->windowClause = save_windowclause; - context->windowTList = save_windowtlist; -} - -/* - * Detect whether query looks like SELECT ... FROM VALUES(); - * if so, return the VALUES RTE. Otherwise return NULL. - */ -static RangeTblEntry * -get_simple_values_rte(Query *query) -{// #lizard forgives - RangeTblEntry *result = NULL; - ListCell *lc; - - /* - * We want to return TRUE even if the Query also contains OLD or NEW rule - * RTEs. So the idea is to scan the rtable and see if there is only one - * inFromCl RTE that is a VALUES RTE. - */ - foreach(lc, query->rtable) - { - RangeTblEntry *rte = (RangeTblEntry *) lfirst(lc); - - if (rte->rtekind == RTE_VALUES && rte->inFromCl) - { - if (result) - return NULL; /* multiple VALUES (probably not possible) */ - result = rte; - } - else if (rte->rtekind == RTE_RELATION && !rte->inFromCl) - continue; /* ignore rule entries */ - else - return NULL; /* something else -> not simple VALUES */ - } - - /* - * We don't need to check the targetlist in any great detail, because - * parser/analyze.c will never generate a "bare" VALUES RTE --- they only - * appear inside auto-generated sub-queries with very restricted - * structure. 
However, DefineView might have modified the tlist by - * injecting new column aliases; so compare tlist resnames against the - * RTE's names to detect that. - */ - if (result) - { - ListCell *lcn; - - if (list_length(query->targetList) != list_length(result->eref->colnames)) - return NULL; /* this probably cannot happen */ - forboth(lc, query->targetList, lcn, result->eref->colnames) - { - TargetEntry *tle = (TargetEntry *) lfirst(lc); - char *cname = strVal(lfirst(lcn)); - - if (tle->resjunk) - return NULL; /* this probably cannot happen */ - if (tle->resname == NULL || strcmp(tle->resname, cname) != 0) - return NULL; /* column name has been changed */ - } - } - - return result; -} - -static void -get_basic_select_query(Query *query, deparse_context *context, - TupleDesc resultDesc) -{// #lizard forgives - StringInfo buf = context->buf; - RangeTblEntry *values_rte; - char *sep; - ListCell *l; - - if (PRETTY_INDENT(context)) - { - context->indentLevel += PRETTYINDENT_STD; - appendStringInfoChar(buf, ' '); - } - - /* - * If the query looks like SELECT * FROM (VALUES ...), then print just the - * VALUES part. This reverses what transformValuesClause() did at parse - * time. - */ - values_rte = get_simple_values_rte(query); - if (values_rte) - { - get_values_def(values_rte->values_lists, context); - return; - } - - /* - * Build up the query string - first we say SELECT - */ - appendStringInfoString(buf, "SELECT"); - - /* Add the DISTINCT clause if given */ - if (query->distinctClause != NIL) - { - if (query->hasDistinctOn) - { - appendStringInfoString(buf, " DISTINCT ON ("); - sep = ""; - foreach(l, query->distinctClause) - { - SortGroupClause *srt = (SortGroupClause *) lfirst(l); - - appendStringInfoString(buf, sep); - get_rule_sortgroupclause(srt->tleSortGroupRef, query->targetList, - false, context); - sep = ", "; - } - appendStringInfoChar(buf, ')'); - } - else - appendStringInfoString(buf, " DISTINCT"); - } - - /* Then we tell what to select (the targetlist) */ - get_target_list(query->targetList, context, resultDesc); - - /* Add the FROM clause if needed */ - get_from_clause(query, " FROM ", context); - - /* Add the WHERE clause if given */ - if (query->jointree->quals != NULL) - { - appendContextKeyword(context, " WHERE ", - -PRETTYINDENT_STD, PRETTYINDENT_STD, 1); - get_rule_expr(query->jointree->quals, context, false); - } - - /* Add the GROUP BY clause if given */ - if (query->groupClause != NULL || query->groupingSets != NULL) - { - ParseExprKind save_exprkind; - - appendContextKeyword(context, " GROUP BY ", - -PRETTYINDENT_STD, PRETTYINDENT_STD, 1); - - save_exprkind = context->special_exprkind; - context->special_exprkind = EXPR_KIND_GROUP_BY; - - if (query->groupingSets == NIL) - { - sep = ""; - foreach(l, query->groupClause) - { - SortGroupClause *grp = (SortGroupClause *) lfirst(l); - - appendStringInfoString(buf, sep); - get_rule_sortgroupclause(grp->tleSortGroupRef, query->targetList, - false, context); - sep = ", "; - } - } - else - { - sep = ""; - foreach(l, query->groupingSets) - { - GroupingSet *grp = lfirst(l); - - appendStringInfoString(buf, sep); - get_rule_groupingset(grp, query->targetList, true, context); - sep = ", "; - } - } - - context->special_exprkind = save_exprkind; - } - - /* Add the HAVING clause if given */ - if (query->havingQual != NULL) - { - appendContextKeyword(context, " HAVING ", - -PRETTYINDENT_STD, PRETTYINDENT_STD, 0); - get_rule_expr(query->havingQual, context, false); - } - - /* Add the WINDOW clause if needed */ - if (query->windowClause != 
NIL) - get_rule_windowclause(query, context); -} - -/* ---------- - * get_target_list - Parse back a SELECT target list - * - * This is also used for RETURNING lists in INSERT/UPDATE/DELETE. - * ---------- - */ -static void -get_target_list(List *targetList, deparse_context *context, - TupleDesc resultDesc) -{// #lizard forgives - StringInfo buf = context->buf; - StringInfoData targetbuf; - bool last_was_multiline = false; - char *sep; - int colno; - ListCell *l; -#ifdef PGXC - bool no_targetlist = true; -#endif - - /* we use targetbuf to hold each TLE's text temporarily */ - initStringInfo(&targetbuf); - - sep = " "; - colno = 0; - foreach(l, targetList) - { - TargetEntry *tle = (TargetEntry *) lfirst(l); - char *colname; - char *attname; - - if (tle->resjunk) - continue; /* ignore junk entries */ - -#ifdef PGXC - /* Found at least one element in the target list */ - if (no_targetlist) - no_targetlist = false; -#endif - - appendStringInfoString(buf, sep); - sep = ", "; - colno++; - - /* - * Put the new field text into targetbuf so we can decide after we've - * got it whether or not it needs to go on a new line. - */ - resetStringInfo(&targetbuf); - context->buf = &targetbuf; - - /* - * We special-case Var nodes rather than using get_rule_expr. This is - * needed because get_rule_expr will display a whole-row Var as - * "foo.*", which is the preferred notation in most contexts, but at - * the top level of a SELECT list it's not right (the parser will - * expand that notation into multiple columns, yielding behavior - * different from a whole-row Var). We need to call get_variable - * directly so that we can tell it to do the right thing, and so that - * we can get the attribute name which is the default AS label. - */ - if (tle->expr && (IsA(tle->expr, Var))) - { - attname = get_variable((Var *) tle->expr, 0, true, context); - } - else - { - get_rule_expr((Node *) tle->expr, context, true); - /* We'll show the AS name unless it's this: */ - attname = "?column?"; - } - - /* - * Figure out what the result column should be called. In the context - * of a view, use the view's tuple descriptor (so as to pick up the - * effects of any column RENAME that's been done on the view). - * Otherwise, just use what we can find in the TLE. - */ - if (resultDesc && colno <= resultDesc->natts) - colname = NameStr(resultDesc->attrs[colno - 1]->attname); - else - colname = tle->resname; - - /* Show AS unless the column's name is correct as-is */ - if (colname) /* resname could be NULL */ - { - if (attname == NULL || strcmp(attname, colname) != 0) - appendStringInfo(&targetbuf, " AS %s", quote_identifier(colname)); - } - - /* Restore context's output buffer */ - context->buf = buf; - - /* Consider line-wrapping if enabled */ - if (PRETTY_INDENT(context) && context->wrapColumn >= 0) - { - int leading_nl_pos; - - /* Does the new field start with a new line? 
*/ - if (targetbuf.len > 0 && targetbuf.data[0] == '\n') - leading_nl_pos = 0; - else - leading_nl_pos = -1; - - /* If so, we shouldn't add anything */ - if (leading_nl_pos >= 0) - { - /* instead, remove any trailing spaces currently in buf */ - removeStringInfoSpaces(buf); - } - else - { - char *trailing_nl; - - /* Locate the start of the current line in the output buffer */ - trailing_nl = strrchr(buf->data, '\n'); - if (trailing_nl == NULL) - trailing_nl = buf->data; - else - trailing_nl++; - - /* - * Add a newline, plus some indentation, if the new field is - * not the first and either the new field would cause an - * overflow or the last field used more than one line. - */ - if (colno > 1 && - ((strlen(trailing_nl) + targetbuf.len > context->wrapColumn) || - last_was_multiline)) - appendContextKeyword(context, "", -PRETTYINDENT_STD, - PRETTYINDENT_STD, PRETTYINDENT_VAR); - } - - /* Remember this field's multiline status for next iteration */ - last_was_multiline = - (strchr(targetbuf.data + leading_nl_pos + 1, '\n') != NULL); - } - - /* Add the new field */ - appendStringInfoString(buf, targetbuf.data); - } - -#ifdef PGXC - /* - * Because the empty target list can generate invalid SQL - * clause. Here, just fill a '*' to process a table without - * any columns, this statement will be sent to Datanodes - * and treated correctly on remote nodes. - */ - if (no_targetlist) - appendStringInfo(buf, " *"); -#endif - /* clean up */ - pfree(targetbuf.data); -} - -static void -get_setop_query(Node *setOp, Query *query, deparse_context *context, - TupleDesc resultDesc) -{// #lizard forgives - StringInfo buf = context->buf; - bool need_paren; - - /* Guard against excessively long or deeply-nested queries */ - CHECK_FOR_INTERRUPTS(); - check_stack_depth(); - - if (IsA(setOp, RangeTblRef)) - { - RangeTblRef *rtr = (RangeTblRef *) setOp; - RangeTblEntry *rte = rt_fetch(rtr->rtindex, query->rtable); - Query *subquery = rte->subquery; - - Assert(subquery != NULL); - Assert(subquery->setOperations == NULL); - /* Need parens if WITH, ORDER BY, FOR UPDATE, or LIMIT; see gram.y */ - need_paren = (subquery->cteList || - subquery->sortClause || - subquery->rowMarks || - subquery->limitOffset || - subquery->limitCount); - if (need_paren) - appendStringInfoChar(buf, '('); - get_query_def(subquery, buf, context->namespaces, resultDesc, - context->prettyFlags, context->wrapColumn, - context->indentLevel, - context->finalise_aggs, - context->sortgroup_colno); - if (need_paren) - appendStringInfoChar(buf, ')'); - } - else if (IsA(setOp, SetOperationStmt)) - { - SetOperationStmt *op = (SetOperationStmt *) setOp; - int subindent; - - /* - * We force parens when nesting two SetOperationStmts, except when the - * lefthand input is another setop of the same kind. Syntactically, - * we could omit parens in rather more cases, but it seems best to use - * parens to flag cases where the setop operator changes. If we use - * parens, we also increase the indentation level for the child query. - * - * There are some cases in which parens are needed around a leaf query - * too, but those are more easily handled at the next level down (see - * code above). 
- */ - if (IsA(op->larg, SetOperationStmt)) - { - SetOperationStmt *lop = (SetOperationStmt *) op->larg; - - if (op->op == lop->op && op->all == lop->all) - need_paren = false; - else - need_paren = true; - } - else - need_paren = false; - - if (need_paren) - { - appendStringInfoChar(buf, '('); - subindent = PRETTYINDENT_STD; - appendContextKeyword(context, "", subindent, 0, 0); - } - else - subindent = 0; - - get_setop_query(op->larg, query, context, resultDesc); - - if (need_paren) - appendContextKeyword(context, ") ", -subindent, 0, 0); - else if (PRETTY_INDENT(context)) - appendContextKeyword(context, "", -subindent, 0, 0); - else - appendStringInfoChar(buf, ' '); - - switch (op->op) - { - case SETOP_UNION: - appendStringInfoString(buf, "UNION "); - break; - case SETOP_INTERSECT: - appendStringInfoString(buf, "INTERSECT "); - break; - case SETOP_EXCEPT: - appendStringInfoString(buf, "EXCEPT "); - break; - default: - elog(ERROR, "unrecognized set op: %d", - (int) op->op); - } - if (op->all) - appendStringInfoString(buf, "ALL "); - - /* Always parenthesize if RHS is another setop */ - need_paren = IsA(op->rarg, SetOperationStmt); - - /* - * The indentation code here is deliberately a bit different from that - * for the lefthand input, because we want the line breaks in - * different places. - */ - if (need_paren) - { - appendStringInfoChar(buf, '('); - subindent = PRETTYINDENT_STD; - } - else - subindent = 0; - appendContextKeyword(context, "", subindent, 0, 0); - - get_setop_query(op->rarg, query, context, resultDesc); - - if (PRETTY_INDENT(context)) - context->indentLevel -= subindent; - if (need_paren) - appendContextKeyword(context, ")", 0, 0, 0); - } - else - { - elog(ERROR, "unrecognized node type: %d", - (int) nodeTag(setOp)); - } -} - -/* - * Display a sort/group clause. - * - * Also returns the expression tree, so caller need not find it again. - */ -static Node * -get_rule_sortgroupclause(Index ref, List *tlist, bool force_colno, - deparse_context *context) -{// #lizard forgives - StringInfo buf = context->buf; - TargetEntry *tle; - Node *expr; - - tle = get_sortgroupref_tle(ref, tlist); - expr = (Node *) tle->expr; - - /* - * Use column-number form if requested by caller. Otherwise, if - * expression is a constant, force it to be dumped with an explicit cast - * as decoration --- this is because a simple integer constant is - * ambiguous (and will be misinterpreted by findTargetlistEntry()) if we - * dump it without any decoration. If it's anything more complex than a - * simple Var, then force extra parens around it, to ensure it can't be - * misinterpreted as a cube() or rollup() construct. - */ - if (force_colno) - { - Assert(!tle->resjunk); - appendStringInfo(buf, "%d", tle->resno); - } - else if (expr && IsA(expr, Const)) - get_const_expr((Const *) expr, context, 1); - else if (!expr || IsA(expr, Var)) - get_rule_expr(expr, context, true); - else - { - /* - * We must force parens for function-like expressions even if - * PRETTY_PAREN is off, since those are the ones in danger of - * misparsing. For other expressions we need to force them only if - * PRETTY_PAREN is on, since otherwise the expression will output them - * itself. (We can't skip the parens.) 
- */ - bool need_paren = (PRETTY_PAREN(context) - || IsA(expr, FuncExpr) - ||IsA(expr, Aggref) - ||IsA(expr, WindowFunc)); - - if (need_paren) - appendStringInfoString(context->buf, "("); - get_rule_expr(expr, context, true); - if (need_paren) - appendStringInfoString(context->buf, ")"); - } - - return expr; -} - -/* - * Display a GroupingSet - */ -static void -get_rule_groupingset(GroupingSet *gset, List *targetlist, - bool omit_parens, deparse_context *context) -{// #lizard forgives - ListCell *l; - StringInfo buf = context->buf; - bool omit_child_parens = true; - char *sep = ""; - - switch (gset->kind) - { - case GROUPING_SET_EMPTY: - appendStringInfoString(buf, "()"); - return; - - case GROUPING_SET_SIMPLE: - { - if (!omit_parens || list_length(gset->content) != 1) - appendStringInfoString(buf, "("); - - foreach(l, gset->content) - { - Index ref = lfirst_int(l); - - appendStringInfoString(buf, sep); - get_rule_sortgroupclause(ref, targetlist, - false, context); - sep = ", "; - } - - if (!omit_parens || list_length(gset->content) != 1) - appendStringInfoString(buf, ")"); - } - return; - - case GROUPING_SET_ROLLUP: - appendStringInfoString(buf, "ROLLUP("); - break; - case GROUPING_SET_CUBE: - appendStringInfoString(buf, "CUBE("); - break; - case GROUPING_SET_SETS: - appendStringInfoString(buf, "GROUPING SETS ("); - omit_child_parens = false; - break; - } - - foreach(l, gset->content) - { - appendStringInfoString(buf, sep); - get_rule_groupingset(lfirst(l), targetlist, omit_child_parens, context); - sep = ", "; - } - - appendStringInfoString(buf, ")"); -} - -/* - * Display an ORDER BY list. - */ -static void -get_rule_orderby(List *orderList, List *targetList, - bool force_colno, deparse_context *context) -{ - StringInfo buf = context->buf; - const char *sep; - ListCell *l; - - sep = ""; - foreach(l, orderList) - { - SortGroupClause *srt = (SortGroupClause *) lfirst(l); - Node *sortexpr; - Oid sortcoltype; - TypeCacheEntry *typentry; - - appendStringInfoString(buf, sep); - sortexpr = get_rule_sortgroupclause(srt->tleSortGroupRef, targetList, - force_colno, context); - sortcoltype = exprType(sortexpr); - /* See whether operator is default < or > for datatype */ - typentry = lookup_type_cache(sortcoltype, - TYPECACHE_LT_OPR | TYPECACHE_GT_OPR); - if (srt->sortop == typentry->lt_opr) - { - /* ASC is default, so emit nothing for it */ - if (srt->nulls_first) - appendStringInfoString(buf, " NULLS FIRST"); - } - else if (srt->sortop == typentry->gt_opr) - { - appendStringInfoString(buf, " DESC"); - /* DESC defaults to NULLS FIRST */ - if (!srt->nulls_first) - appendStringInfoString(buf, " NULLS LAST"); - } - else - { - appendStringInfo(buf, " USING %s", - generate_operator_name(srt->sortop, - sortcoltype, - sortcoltype)); - /* be specific to eliminate ambiguity */ - if (srt->nulls_first) - appendStringInfoString(buf, " NULLS FIRST"); - else - appendStringInfoString(buf, " NULLS LAST"); - } - sep = ", "; - } -} - -/* - * Display a WINDOW clause. - * - * Note that the windowClause list might contain only anonymous window - * specifications, in which case we should print nothing here. 
- */ -static void -get_rule_windowclause(Query *query, deparse_context *context) -{ - StringInfo buf = context->buf; - const char *sep; - ListCell *l; - - sep = NULL; - foreach(l, query->windowClause) - { - WindowClause *wc = (WindowClause *) lfirst(l); - - if (wc->name == NULL) - continue; /* ignore anonymous windows */ - - if (sep == NULL) - appendContextKeyword(context, " WINDOW ", - -PRETTYINDENT_STD, PRETTYINDENT_STD, 1); - else - appendStringInfoString(buf, sep); - - appendStringInfo(buf, "%s AS ", quote_identifier(wc->name)); - - get_rule_windowspec(wc, query->targetList, context); - - sep = ", "; - } -} - -/* - * Display a window definition - */ -static void -get_rule_windowspec(WindowClause *wc, List *targetList, - deparse_context *context) -{// #lizard forgives - StringInfo buf = context->buf; - bool needspace = false; - const char *sep; - ListCell *l; - - appendStringInfoChar(buf, '('); - if (wc->refname) - { - appendStringInfoString(buf, quote_identifier(wc->refname)); - needspace = true; - } - /* partition clauses are always inherited, so only print if no refname */ - if (wc->partitionClause && !wc->refname) - { - if (needspace) - appendStringInfoChar(buf, ' '); - appendStringInfoString(buf, "PARTITION BY "); - sep = ""; - foreach(l, wc->partitionClause) - { - SortGroupClause *grp = (SortGroupClause *) lfirst(l); - - appendStringInfoString(buf, sep); - get_rule_sortgroupclause(grp->tleSortGroupRef, targetList, - false, context); - sep = ", "; - } - needspace = true; - } - /* print ordering clause only if not inherited */ - if (wc->orderClause && !wc->copiedOrder) - { - if (needspace) - appendStringInfoChar(buf, ' '); - appendStringInfoString(buf, "ORDER BY "); - get_rule_orderby(wc->orderClause, targetList, false, context); - needspace = true; - } - /* framing clause is never inherited, so print unless it's default */ - if (wc->frameOptions & FRAMEOPTION_NONDEFAULT) - { - if (needspace) - appendStringInfoChar(buf, ' '); - if (wc->frameOptions & FRAMEOPTION_RANGE) - appendStringInfoString(buf, "RANGE "); - else if (wc->frameOptions & FRAMEOPTION_ROWS) - appendStringInfoString(buf, "ROWS "); - else - Assert(false); - if (wc->frameOptions & FRAMEOPTION_BETWEEN) - appendStringInfoString(buf, "BETWEEN "); - if (wc->frameOptions & FRAMEOPTION_START_UNBOUNDED_PRECEDING) - appendStringInfoString(buf, "UNBOUNDED PRECEDING "); - else if (wc->frameOptions & FRAMEOPTION_START_CURRENT_ROW) - appendStringInfoString(buf, "CURRENT ROW "); - else if (wc->frameOptions & FRAMEOPTION_START_VALUE) - { - get_rule_expr(wc->startOffset, context, false); - if (wc->frameOptions & FRAMEOPTION_START_VALUE_PRECEDING) - appendStringInfoString(buf, " PRECEDING "); - else if (wc->frameOptions & FRAMEOPTION_START_VALUE_FOLLOWING) - appendStringInfoString(buf, " FOLLOWING "); - else - Assert(false); - } - else - Assert(false); - if (wc->frameOptions & FRAMEOPTION_BETWEEN) - { - appendStringInfoString(buf, "AND "); - if (wc->frameOptions & FRAMEOPTION_END_UNBOUNDED_FOLLOWING) - appendStringInfoString(buf, "UNBOUNDED FOLLOWING "); - else if (wc->frameOptions & FRAMEOPTION_END_CURRENT_ROW) - appendStringInfoString(buf, "CURRENT ROW "); - else if (wc->frameOptions & FRAMEOPTION_END_VALUE) - { - get_rule_expr(wc->endOffset, context, false); - if (wc->frameOptions & FRAMEOPTION_END_VALUE_PRECEDING) - appendStringInfoString(buf, " PRECEDING "); - else if (wc->frameOptions & FRAMEOPTION_END_VALUE_FOLLOWING) - appendStringInfoString(buf, " FOLLOWING "); - else - Assert(false); - } - else - Assert(false); - } - /* we 
will now have a trailing space; remove it */ - buf->len--; - } - appendStringInfoChar(buf, ')'); -} - -/* ---------- - * get_insert_query_def - Parse back an INSERT parsetree - * ---------- - */ -static void -get_insert_query_def(Query *query, deparse_context *context) -{// #lizard forgives - StringInfo buf = context->buf; - RangeTblEntry *select_rte = NULL; - RangeTblEntry *values_rte = NULL; - RangeTblEntry *rte; - char *sep; - ListCell *l; - List *strippedexprs; - - /* Insert the WITH clause if given */ - get_with_clause(query, context); - -#ifdef __TBASE__ - /* - * If query has unshippable triggers, we have to do INSERT on coordinator, - * and we do not need select_rte and values_rte. - * Hence we keep both select_rte and values_rte NULL. - */ - if (!query->hasUnshippableTriggers) - { -#endif - /* - * If it's an INSERT ... SELECT or multi-row VALUES, there will be a - * single RTE for the SELECT or VALUES. Plain VALUES has neither. - */ - foreach(l, query->rtable) - { - rte = (RangeTblEntry *) lfirst(l); - - if (rte->rtekind == RTE_SUBQUERY) - { - if (select_rte) - elog(ERROR, "too many subquery RTEs in INSERT"); - select_rte = rte; - } - - if (rte->rtekind == RTE_VALUES) - { - if (values_rte) - elog(ERROR, "too many values RTEs in INSERT"); - values_rte = rte; - } - } -#ifdef __TBASE__ - } -#endif - if (select_rte && values_rte) - elog(ERROR, "both subquery and values RTEs in INSERT"); - - /* - * Start the query with INSERT INTO relname - */ - rte = rt_fetch(query->resultRelation, query->rtable); - Assert(rte->rtekind == RTE_RELATION); - - if (PRETTY_INDENT(context)) - { - context->indentLevel += PRETTYINDENT_STD; - appendStringInfoChar(buf, ' '); - } - appendStringInfo(buf, "INSERT INTO %s ", - generate_relation_name(rte->relid, NIL)); - /* INSERT requires AS keyword for target alias */ - if (rte->alias != NULL) - appendStringInfo(buf, "AS %s ", - quote_identifier(rte->alias->aliasname)); - - /* - * Add the insert-column-names list. Any indirection decoration needed on - * the column names can be inferred from the top targetlist. - */ - strippedexprs = NIL; - sep = ""; - if (query->targetList) - appendStringInfoChar(buf, '('); - foreach(l, query->targetList) - { - TargetEntry *tle = (TargetEntry *) lfirst(l); - - if (tle->resjunk) - continue; /* ignore junk entries */ - - appendStringInfoString(buf, sep); - sep = ", "; - - /* - * Put out name of target column; look in the catalogs, not at - * tle->resname, since resname will fail to track RENAME. - */ - appendStringInfoString(buf, - quote_identifier(get_relid_attribute_name(rte->relid, - tle->resno))); - - /* - * Print any indirection needed (subfields or subscripts), and strip - * off the top-level nodes representing the indirection assignments. - * Add the stripped expressions to strippedexprs. (If it's a - * single-VALUES statement, the stripped expressions are the VALUES to - * print below. Otherwise they're just Vars and not really - * interesting.) 
- */ - strippedexprs = lappend(strippedexprs, - processIndirection((Node *) tle->expr, - context)); - } - if (query->targetList) - appendStringInfoString(buf, ") "); - - if (query->override) - { - if (query->override == OVERRIDING_SYSTEM_VALUE) - appendStringInfoString(buf, "OVERRIDING SYSTEM VALUE "); - else if (query->override == OVERRIDING_USER_VALUE) - appendStringInfoString(buf, "OVERRIDING USER VALUE "); - } - - if (select_rte) - { - /* Add the SELECT */ - get_query_def(select_rte->subquery, buf, NIL, NULL, - context->prettyFlags, context->wrapColumn, - context->indentLevel, - context->finalise_aggs, - context->sortgroup_colno); - } - else if (values_rte) - { - /* Add the multi-VALUES expression lists */ - get_values_def(values_rte->values_lists, context); - } - else if (strippedexprs) - { - /* Add the single-VALUES expression list */ - appendContextKeyword(context, "VALUES (", - -PRETTYINDENT_STD, PRETTYINDENT_STD, 2); - get_rule_expr((Node *) strippedexprs, context, false); - appendStringInfoChar(buf, ')'); - } - else - { - /* No expressions, so it must be DEFAULT VALUES */ - appendStringInfoString(buf, "DEFAULT VALUES"); - } - - /* Add ON CONFLICT if present */ - if (query->onConflict) - { - OnConflictExpr *confl = query->onConflict; - - appendStringInfoString(buf, " ON CONFLICT"); - - if (confl->arbiterElems) - { - /* Add the single-VALUES expression list */ - appendStringInfoChar(buf, '('); - get_rule_expr((Node *) confl->arbiterElems, context, false); - appendStringInfoChar(buf, ')'); - - /* Add a WHERE clause (for partial indexes) if given */ - if (confl->arbiterWhere != NULL) - { - bool save_varprefix; - - /* - * Force non-prefixing of Vars, since parser assumes that they - * belong to target relation. WHERE clause does not use - * InferenceElem, so this is separately required. 
- */ - save_varprefix = context->varprefix; - context->varprefix = false; - - appendContextKeyword(context, " WHERE ", - -PRETTYINDENT_STD, PRETTYINDENT_STD, 1); - get_rule_expr(confl->arbiterWhere, context, false); - - context->varprefix = save_varprefix; - } - } - else if (OidIsValid(confl->constraint)) - { - char *constraint = get_constraint_name(confl->constraint); - - if (!constraint) - elog(ERROR, "cache lookup failed for constraint %u", - confl->constraint); - appendStringInfo(buf, " ON CONSTRAINT %s", - quote_identifier(constraint)); - } - - if (confl->action == ONCONFLICT_NOTHING) - { - appendStringInfoString(buf, " DO NOTHING"); - } - else - { - appendStringInfoString(buf, " DO UPDATE SET "); - /* Deparse targetlist */ - get_update_query_targetlist_def(query, confl->onConflictSet, - context, rte); - - /* Add a WHERE clause if given */ - if (confl->onConflictWhere != NULL) - { - appendContextKeyword(context, " WHERE ", - -PRETTYINDENT_STD, PRETTYINDENT_STD, 1); - get_rule_expr(confl->onConflictWhere, context, false); - } - } - } - - /* Add RETURNING if present */ - if (query->returningList) - { - appendContextKeyword(context, " RETURNING", - -PRETTYINDENT_STD, PRETTYINDENT_STD, 1); - get_target_list(query->returningList, context, NULL); - } -} - - -/* ---------- - * get_update_query_def - Parse back an UPDATE parsetree - * ---------- - */ -static void -get_update_query_def(Query *query, deparse_context *context) -{ - StringInfo buf = context->buf; - RangeTblEntry *rte; - - /* Insert the WITH clause if given */ - get_with_clause(query, context); - - /* - * Start the query with UPDATE relname SET - */ - rte = rt_fetch(query->resultRelation, query->rtable); - Assert(rte->rtekind == RTE_RELATION); - if (PRETTY_INDENT(context)) - { - appendStringInfoChar(buf, ' '); - context->indentLevel += PRETTYINDENT_STD; - } - appendStringInfo(buf, "UPDATE %s%s", - only_marker(rte), - generate_relation_name(rte->relid, NIL)); - if (rte->alias != NULL) - appendStringInfo(buf, " %s", - quote_identifier(rte->alias->aliasname)); - appendStringInfoString(buf, " SET "); - - /* Deparse targetlist */ - get_update_query_targetlist_def(query, query->targetList, context, rte); - - /* Add the FROM clause if needed */ - get_from_clause(query, " FROM ", context); - - /* Add a WHERE clause if given */ - if (query->jointree->quals != NULL) - { - appendContextKeyword(context, " WHERE ", - -PRETTYINDENT_STD, PRETTYINDENT_STD, 1); - get_rule_expr(query->jointree->quals, context, false); - } - - /* Add RETURNING if present */ - if (query->returningList) - { - appendContextKeyword(context, " RETURNING", - -PRETTYINDENT_STD, PRETTYINDENT_STD, 1); - get_target_list(query->returningList, context, NULL); - } -} - - -/* ---------- - * get_update_query_targetlist_def - Parse back an UPDATE targetlist - * ---------- - */ -static void -get_update_query_targetlist_def(Query *query, List *targetList, - deparse_context *context, RangeTblEntry *rte) -{// #lizard forgives - StringInfo buf = context->buf; - ListCell *l; - ListCell *next_ma_cell; - int remaining_ma_columns; - const char *sep; - SubLink *cur_ma_sublink; - List *ma_sublinks; - - /* - * Prepare to deal with MULTIEXPR assignments: collect the source SubLinks - * into a list. We expect them to appear, in ID order, in resjunk tlist - * entries. 
- */ - ma_sublinks = NIL; - if (query->hasSubLinks) /* else there can't be any */ - { - foreach(l, targetList) - { - TargetEntry *tle = (TargetEntry *) lfirst(l); - - if (tle->resjunk && IsA(tle->expr, SubLink)) - { - SubLink *sl = (SubLink *) tle->expr; - - if (sl->subLinkType == MULTIEXPR_SUBLINK) - { - ma_sublinks = lappend(ma_sublinks, sl); - Assert(sl->subLinkId == list_length(ma_sublinks)); - } - } - } - } - next_ma_cell = list_head(ma_sublinks); - cur_ma_sublink = NULL; - remaining_ma_columns = 0; - - /* Add the comma separated list of 'attname = value' */ - sep = ""; - foreach(l, targetList) - { - TargetEntry *tle = (TargetEntry *) lfirst(l); - Node *expr; - - if (tle->resjunk) - continue; /* ignore junk entries */ - - /* Emit separator (OK whether we're in multiassignment or not) */ - appendStringInfoString(buf, sep); - sep = ", "; - - /* - * Check to see if we're starting a multiassignment group: if so, - * output a left paren. - */ - if (next_ma_cell != NULL && cur_ma_sublink == NULL) - { - /* - * We must dig down into the expr to see if it's a PARAM_MULTIEXPR - * Param. That could be buried under FieldStores and ArrayRefs - * and CoerceToDomains (cf processIndirection()), and underneath - * those there could be an implicit type coercion. Because we - * would ignore implicit type coercions anyway, we don't need to - * be as careful as processIndirection() is about descending past - * implicit CoerceToDomains. - */ - expr = (Node *) tle->expr; - while (expr) - { - if (IsA(expr, FieldStore)) - { - FieldStore *fstore = (FieldStore *) expr; - - expr = (Node *) linitial(fstore->newvals); - } - else if (IsA(expr, ArrayRef)) - { - ArrayRef *aref = (ArrayRef *) expr; - - if (aref->refassgnexpr == NULL) - break; - expr = (Node *) aref->refassgnexpr; - } - else if (IsA(expr, CoerceToDomain)) - { - CoerceToDomain *cdomain = (CoerceToDomain *) expr; - - if (cdomain->coercionformat != COERCE_IMPLICIT_CAST) - break; - expr = (Node *) cdomain->arg; - } - else - break; - } - expr = strip_implicit_coercions(expr); - - if (expr && IsA(expr, Param) && - ((Param *) expr)->paramkind == PARAM_MULTIEXPR) - { - cur_ma_sublink = (SubLink *) lfirst(next_ma_cell); - next_ma_cell = lnext(next_ma_cell); - remaining_ma_columns = count_nonjunk_tlist_entries( - ((Query *) cur_ma_sublink->subselect)->targetList); - Assert(((Param *) expr)->paramid == - ((cur_ma_sublink->subLinkId << 16) | 1)); - appendStringInfoChar(buf, '('); - } - } - - /* - * Put out name of target column; look in the catalogs, not at - * tle->resname, since resname will fail to track RENAME. - */ - appendStringInfoString(buf, - quote_identifier(get_relid_attribute_name(rte->relid, - tle->resno))); - - /* - * Print any indirection needed (subfields or subscripts), and strip - * off the top-level nodes representing the indirection assignments. - */ - expr = processIndirection((Node *) tle->expr, context); - - /* - * If we're in a multiassignment, skip printing anything more, unless - * this is the last column; in which case, what we print should be the - * sublink, not the Param. 
- */ - if (cur_ma_sublink != NULL) - { - if (--remaining_ma_columns > 0) - continue; /* not the last column of multiassignment */ - appendStringInfoChar(buf, ')'); - expr = (Node *) cur_ma_sublink; - cur_ma_sublink = NULL; - } - - appendStringInfoString(buf, " = "); - - get_rule_expr(expr, context, false); - } -} - - -/* ---------- - * get_delete_query_def - Parse back a DELETE parsetree - * ---------- - */ -static void -get_delete_query_def(Query *query, deparse_context *context) -{ - StringInfo buf = context->buf; - RangeTblEntry *rte; - - /* Insert the WITH clause if given */ - get_with_clause(query, context); - - /* - * Start the query with DELETE FROM relname - */ - rte = rt_fetch(query->resultRelation, query->rtable); - Assert(rte->rtekind == RTE_RELATION); - if (PRETTY_INDENT(context)) - { - appendStringInfoChar(buf, ' '); - context->indentLevel += PRETTYINDENT_STD; - } - appendStringInfo(buf, "DELETE FROM %s%s", - only_marker(rte), - generate_relation_name(rte->relid, NIL)); - if (rte->alias != NULL) - appendStringInfo(buf, " %s", - quote_identifier(rte->alias->aliasname)); - - /* Add the USING clause if given */ - get_from_clause(query, " USING ", context); - - /* Add a WHERE clause if given */ - if (query->jointree->quals != NULL) - { - appendContextKeyword(context, " WHERE ", - -PRETTYINDENT_STD, PRETTYINDENT_STD, 1); - get_rule_expr(query->jointree->quals, context, false); - } - - /* Add RETURNING if present */ - if (query->returningList) - { - appendContextKeyword(context, " RETURNING", - -PRETTYINDENT_STD, PRETTYINDENT_STD, 1); - get_target_list(query->returningList, context, NULL); - } -} - - -/* ---------- - * get_utility_query_def - Parse back a UTILITY parsetree - * ---------- - */ -static void -get_utility_query_def(Query *query, deparse_context *context) -{// #lizard forgives - StringInfo buf = context->buf; - - if (query->utilityStmt && IsA(query->utilityStmt, NotifyStmt)) - { - NotifyStmt *stmt = (NotifyStmt *) query->utilityStmt; - - appendContextKeyword(context, "", - 0, PRETTYINDENT_STD, 1); - appendStringInfo(buf, "NOTIFY %s", - quote_identifier(stmt->conditionname)); - if (stmt->payload) - { - appendStringInfoString(buf, ", "); - simple_quote_literal(buf, stmt->payload); - } - } -#ifdef PGXC - else if (query->utilityStmt && IsA(query->utilityStmt, CreateStmt)) - { - CreateStmt *stmt = (CreateStmt *) query->utilityStmt; - ListCell *column; - const char *delimiter = ""; - RangeVar *relation = stmt->relation; - bool istemp = (relation->relpersistence == RELPERSISTENCE_TEMP); - bool isunlogged = (relation->relpersistence == RELPERSISTENCE_UNLOGGED); - - appendStringInfo(buf, "CREATE %s %s %s TABLE %s ", - stmt->islocal ? "LOCAL" : "", - istemp ? "TEMP" : "", - isunlogged ? "UNLOGGED" : "", - stmt->if_not_exists ? 
"IF NOT EXISTS " : ""); - - if (!istemp && relation->schemaname && relation->schemaname[0]) - appendStringInfo(buf, "%s.", quote_identifier(relation->schemaname)); - appendStringInfo(buf, "%s", quote_identifier(relation->relname)); - - appendStringInfo(buf, "("); - foreach(column, stmt->tableElts) - { - Node *node = (Node *) lfirst(column); - - appendStringInfo(buf, "%s", delimiter); - delimiter = ", "; - - if (IsA(node, ColumnDef)) - { - ColumnDef *coldef = (ColumnDef *) node; - TypeName *typename = coldef->typeName; -#ifdef XCP - appendStringInfo(buf, "%s %s", - quote_identifier(coldef->colname), - format_type_with_typemod(typename->typeOid, - typename->typemod)); -#else - - /* error out if we have no recourse at all */ - if (!OidIsValid(typename->typeOid)) - ereport(ERROR, - (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("improper type oid: \"%u\"", typename->typeOid))); - - /* get typename from the oid */ - type = typeidType(typename->typeOid); - - if (!HeapTupleIsValid(type)) - ereport(ERROR, - (errcode(ERRCODE_UNDEFINED_OBJECT), - errmsg("type \"%u\" does not exist", - typename->typeOid))); - appendStringInfo(buf, "%s %s", quote_identifier(coldef->colname), - typeTypeName(type)); - ReleaseSysCache(type); -#endif - } - else - elog(ERROR, "Invalid table column definition."); - } - appendStringInfo(buf, ")"); - - /* Append storage parameters, like for instance WITH (OIDS) */ - if (list_length(stmt->options) > 0) - { - Datum reloptions; - static char *validnsps[] = HEAP_RELOPT_NAMESPACES; - - reloptions = transformRelOptions((Datum) 0, stmt->options, NULL, validnsps, - false, false); - - if (reloptions) - { - Datum sep, txt; - /* Below is inspired from flatten_reloptions() */ - sep = CStringGetTextDatum(", "); - txt = OidFunctionCall2(F_ARRAY_TO_TEXT, reloptions, sep); - appendStringInfo(buf, " WITH (%s)", TextDatumGetCString(txt)); - } - } - - /* add the on commit clauses for temporary tables */ - switch (stmt->oncommit) - { - case ONCOMMIT_NOOP: - /* do nothing */ - break; - - case ONCOMMIT_PRESERVE_ROWS: - appendStringInfo(buf, " ON COMMIT PRESERVE ROWS"); - break; - - case ONCOMMIT_DELETE_ROWS: - appendStringInfo(buf, " ON COMMIT DELETE ROWS"); - break; - - case ONCOMMIT_DROP: - appendStringInfo(buf, " ON COMMIT DROP"); - break; - } - - if (stmt->distributeby) - { - /* add the on commit clauses for temporary tables */ - switch (stmt->distributeby->disttype) - { - case DISTTYPE_REPLICATION: - appendStringInfo(buf, " DISTRIBUTE BY REPLICATION"); - break; - - case DISTTYPE_HASH: -#ifdef __COLD_HOT__ - appendStringInfo(buf, " DISTRIBUTE BY HASH(%s)", strVal(linitial(stmt->distributeby->colname))); -#else - appendStringInfo(buf, " DISTRIBUTE BY HASH(%s)", stmt->distributeby->colname); -#endif - break; - - case DISTTYPE_ROUNDROBIN: - appendStringInfo(buf, " DISTRIBUTE BY ROUNDROBIN"); - break; - - case DISTTYPE_MODULO: -#ifdef __COLD_HOT__ - appendStringInfo(buf, " DISTRIBUTE BY MODULO(%s)", - quote_identifier(strVal(linitial(stmt->distributeby->colname)))); -#else - appendStringInfo(buf, " DISTRIBUTE BY MODULO(%s)", - quote_identifier(stmt->distributeby->colname)); -#endif - break; - - default: - ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("Invalid distribution type"))); - - } - } - - if (stmt->subcluster) - { - ListCell *cell; - - switch (stmt->subcluster->clustertype) - { - case SUBCLUSTER_NODE: - appendStringInfo(buf, " TO NODE ("); - - /* Add node members */ - Assert(stmt->subcluster->members); - foreach(cell, stmt->subcluster->members) - { - appendStringInfo(buf, " %s", - 
quote_identifier(strVal(lfirst(cell)))); - if (cell->next) - appendStringInfo(buf, ","); - } - appendStringInfo(buf, ")"); - break; - - case SUBCLUSTER_GROUP: - appendStringInfo(buf, " TO GROUP"); - - /* Add group members */ - Assert(stmt->subcluster->members); - foreach(cell, stmt->subcluster->members) - { - appendStringInfo(buf, " %s", - quote_identifier(strVal(lfirst(cell)))); - if (cell->next) - appendStringInfo(buf, ","); - } - break; - - case SUBCLUSTER_NONE: - default: - /* Nothing to do */ - break; - } - } - } -#endif - else - { - /* Currently only NOTIFY utility commands can appear in rules */ - elog(ERROR, "unexpected utility statement type"); - } -} - -/* - * Display a Var appropriately. - * - * In some cases (currently only when recursing into an unnamed join) - * the Var's varlevelsup has to be interpreted with respect to a context - * above the current one; levelsup indicates the offset. - * - * If istoplevel is TRUE, the Var is at the top level of a SELECT's - * targetlist, which means we need special treatment of whole-row Vars. - * Instead of the normal "tab.*", we'll print "tab.*::typename", which is a - * dirty hack to prevent "tab.*" from being expanded into multiple columns. - * (The parser will strip the useless coercion, so no inefficiency is added in - * dump and reload.) We used to print just "tab" in such cases, but that is - * ambiguous and will yield the wrong result if "tab" is also a plain column - * name in the query. - * - * Returns the attname of the Var, or NULL if the Var has no attname (because - * it is a whole-row Var or a subplan output reference). - */ -static char * -get_variable(Var *var, int levelsup, bool istoplevel, deparse_context *context) -{// #lizard forgives - StringInfo buf = context->buf; - RangeTblEntry *rte; - AttrNumber attnum; - int netlevelsup; - deparse_namespace *dpns; - deparse_columns *colinfo; - char *refname; - char *attname; - - /* Find appropriate nesting depth */ - netlevelsup = var->varlevelsup + levelsup; - if (netlevelsup >= list_length(context->namespaces)) - elog(ERROR, "bogus varlevelsup: %d offset %d", - var->varlevelsup, levelsup); - dpns = (deparse_namespace *) list_nth(context->namespaces, - netlevelsup); - - /* - * Try to find the relevant RTE in this rtable. In a plan tree, it's - * likely that varno is OUTER_VAR or INNER_VAR, in which case we must dig - * down into the subplans, or INDEX_VAR, which is resolved similarly. Also - * find the aliases previously assigned for this RTE. - */ - if (var->varno >= 1 && var->varno <= list_length(dpns->rtable)) - { - rte = rt_fetch(var->varno, dpns->rtable); - refname = (char *) list_nth(dpns->rtable_names, var->varno - 1); - colinfo = deparse_columns_fetch(var->varno, dpns); - attnum = var->varattno; - } - else - { - resolve_special_varno((Node *) var, context, NULL, - get_special_variable); - return NULL; - } - - /* - * The planner will sometimes emit Vars referencing resjunk elements of a - * subquery's target list (this is currently only possible if it chooses - * to generate a "physical tlist" for a SubqueryScan or CteScan node). - * Although we prefer to print subquery-referencing Vars using the - * subquery's alias, that's not possible for resjunk items since they have - * no alias. So in that case, drill down to the subplan and print the - * contents of the referenced tlist item. This works because in a plan - * tree, such Vars can only occur in a SubqueryScan or CteScan node, and - * we'll have set dpns->inner_planstate to reference the child plan node. 
- */ - if ((rte->rtekind == RTE_SUBQUERY || rte->rtekind == RTE_CTE) && - attnum > list_length(rte->eref->colnames) && - dpns->inner_planstate) - { - TargetEntry *tle; - deparse_namespace save_dpns; - - tle = get_tle_by_resno(dpns->inner_tlist, var->varattno); - if (!tle) - elog(ERROR, "invalid attnum %d for relation \"%s\"", - var->varattno, rte->eref->aliasname); - - Assert(netlevelsup == 0); - push_child_plan(dpns, dpns->inner_planstate, &save_dpns); - - /* - * Force parentheses because our caller probably assumed a Var is a - * simple expression. - */ - if (!IsA(tle->expr, Var)) - appendStringInfoChar(buf, '('); - get_rule_expr((Node *) tle->expr, context, true); - if (!IsA(tle->expr, Var)) - appendStringInfoChar(buf, ')'); - - pop_child_plan(dpns, &save_dpns); - return NULL; - } - -#ifdef PGXC - if (rte->rtekind == RTE_REMOTE_DUMMY && - attnum > list_length(rte->eref->colnames) && - dpns->planstate) - { - TargetEntry *tle; - RemoteQuery *rqplan; - Assert(IsA(dpns->planstate, RemoteQueryState)); - Assert(netlevelsup == 0); - - /* - * Get the expression representing the given Var from base_tlist of the - * RemoteQuery - */ - rqplan = (RemoteQuery *)dpns->planstate->plan; - Assert(IsA(rqplan, RemoteQuery)); - tle = get_tle_by_resno(rqplan->base_tlist, var->varattno); - if (!tle) - elog(ERROR, "bogus varattno for remotequery var: %d", var->varattno); - /* - * Force parentheses because our caller probably assumed a Var is a - * simple expression. - */ - if (!IsA(tle->expr, Var)) - appendStringInfoChar(buf, '('); - get_rule_expr((Node *) tle->expr, context, true); - if (!IsA(tle->expr, Var)) - appendStringInfoChar(buf, ')'); - - return NULL; - } -#endif /* PGXC */ - - /* - * If it's an unnamed join, look at the expansion of the alias variable. - * If it's a simple reference to one of the input vars, then recursively - * print the name of that var instead. When it's not a simple reference, - * we have to just print the unqualified join column name. (This can only - * happen with "dangerous" merged columns in a JOIN USING; we took pains - * previously to make the unqualified column name unique in such cases.) - * - * This wouldn't work in decompiling plan trees, because we don't store - * joinaliasvars lists after planning; but a plan tree should never - * contain a join alias variable. - */ - if (rte->rtekind == RTE_JOIN && rte->alias == NULL) - { - if (rte->joinaliasvars == NIL) - elog(ERROR, "cannot decompile join alias var in plan tree"); - if (attnum > 0) - { - Var *aliasvar; - - aliasvar = (Var *) list_nth(rte->joinaliasvars, attnum - 1); - /* we intentionally don't strip implicit coercions here */ - if (aliasvar && IsA(aliasvar, Var)) - { - return get_variable(aliasvar, var->varlevelsup + levelsup, - istoplevel, context); - } - } - - /* - * Unnamed join has no refname. (Note: since it's unnamed, there is - * no way the user could have referenced it to create a whole-row Var - * for it. So we don't have to cover that case below.) - */ - Assert(refname == NULL); - } - - if (attnum == InvalidAttrNumber) - attname = NULL; - else if (attnum > 0) - { - /* Get column name to use from the colinfo struct */ - if (attnum > colinfo->num_cols) - elog(ERROR, "invalid attnum %d for relation \"%s\"", - attnum, rte->eref->aliasname); - attname = colinfo->colnames[attnum - 1]; - if (attname == NULL) /* dropped column? 
*/ - elog(ERROR, "invalid attnum %d for relation \"%s\"", - attnum, rte->eref->aliasname); - } - else - { - /* System column - name is fixed, get it from the catalog */ - attname = get_rte_attribute_name(rte, attnum); - } - - if (refname && (context->varprefix || attname == NULL)) - { - appendStringInfoString(buf, quote_identifier(refname)); - appendStringInfoChar(buf, '.'); - } - if (attname) - appendStringInfoString(buf, quote_identifier(attname)); - else - { - appendStringInfoChar(buf, '*'); - if (istoplevel) - appendStringInfo(buf, "::%s", - format_type_with_typemod(var->vartype, - var->vartypmod)); - } - - return attname; -} - -/* - * Deparse a Var which references OUTER_VAR, INNER_VAR, or INDEX_VAR. This - * routine is actually a callback for get_special_varno, which handles finding - * the correct TargetEntry. We get the expression contained in that - * TargetEntry and just need to deparse it, a job we can throw back on - * get_rule_expr. - */ -static void -get_special_variable(Node *node, deparse_context *context, void *private) -{ - StringInfo buf = context->buf; - - /* - * Force parentheses because our caller probably assumed a Var is a simple - * expression. - */ - if (!IsA(node, Var)) - appendStringInfoChar(buf, '('); - get_rule_expr(node, context, true); - if (!IsA(node, Var)) - appendStringInfoChar(buf, ')'); -} - -/* - * Chase through plan references to special varnos (OUTER_VAR, INNER_VAR, - * INDEX_VAR) until we find a real Var or some kind of non-Var node; then, - * invoke the callback provided. - */ -static void -resolve_special_varno(Node *node, deparse_context *context, void *private, - void (*callback) (Node *, deparse_context *, void *)) -{// #lizard forgives - Var *var; - deparse_namespace *dpns; - - /* If it's not a Var, invoke the callback. */ - if (!IsA(node, Var)) - { - callback(node, context, private); - return; - } - - /* Find appropriate nesting depth */ - var = (Var *) node; - dpns = (deparse_namespace *) list_nth(context->namespaces, - var->varlevelsup); - - /* - * It's a special RTE, so recurse. - */ - if (var->varno == OUTER_VAR && dpns->outer_tlist) - { - TargetEntry *tle; - deparse_namespace save_dpns; - - tle = get_tle_by_resno(dpns->outer_tlist, var->varattno); - if (!tle) - elog(ERROR, "bogus varattno for OUTER_VAR var: %d", var->varattno); - - push_child_plan(dpns, dpns->outer_planstate, &save_dpns); - resolve_special_varno((Node *) tle->expr, context, private, callback); - pop_child_plan(dpns, &save_dpns); - return; - } - else if (var->varno == INNER_VAR && dpns->inner_tlist) - { - TargetEntry *tle; - deparse_namespace save_dpns; - - tle = get_tle_by_resno(dpns->inner_tlist, var->varattno); - if (!tle) - elog(ERROR, "bogus varattno for INNER_VAR var: %d", var->varattno); - - push_child_plan(dpns, dpns->inner_planstate, &save_dpns); - resolve_special_varno((Node *) tle->expr, context, private, callback); - pop_child_plan(dpns, &save_dpns); - return; - } - else if (var->varno == INDEX_VAR && dpns->index_tlist) - { - TargetEntry *tle; - - tle = get_tle_by_resno(dpns->index_tlist, var->varattno); - if (!tle) - elog(ERROR, "bogus varattno for INDEX_VAR var: %d", var->varattno); - - resolve_special_varno((Node *) tle->expr, context, private, callback); - return; - } - else if (var->varno < 1 || var->varno > list_length(dpns->rtable)) - elog(ERROR, "bogus varno: %d", var->varno); - - /* Not special. Just invoke the callback. */ - callback(node, context, private); -} - -/* - * Get the name of a field of an expression of composite type. 
The - * expression is usually a Var, but we handle other cases too. - * - * levelsup is an extra offset to interpret the Var's varlevelsup correctly. - * - * This is fairly straightforward when the expression has a named composite - * type; we need only look up the type in the catalogs. However, the type - * could also be RECORD. Since no actual table or view column is allowed to - * have type RECORD, a Var of type RECORD must refer to a JOIN or FUNCTION RTE - * or to a subquery output. We drill down to find the ultimate defining - * expression and attempt to infer the field name from it. We ereport if we - * can't determine the name. - * - * Similarly, a PARAM of type RECORD has to refer to some expression of - * a determinable composite type. - */ -static const char * -get_name_for_var_field(Var *var, int fieldno, - int levelsup, deparse_context *context) -{// #lizard forgives - RangeTblEntry *rte; - AttrNumber attnum; - int netlevelsup; - deparse_namespace *dpns; - TupleDesc tupleDesc; - Node *expr; - - /* - * If it's a RowExpr that was expanded from a whole-row Var, use the - * column names attached to it. - */ - if (IsA(var, RowExpr)) - { - RowExpr *r = (RowExpr *) var; - - if (fieldno > 0 && fieldno <= list_length(r->colnames)) - return strVal(list_nth(r->colnames, fieldno - 1)); - } - - /* - * If it's a Param of type RECORD, try to find what the Param refers to. - */ - if (IsA(var, Param)) - { - Param *param = (Param *) var; - ListCell *ancestor_cell; - - expr = find_param_referent(param, context, &dpns, &ancestor_cell); - if (expr) - { - /* Found a match, so recurse to decipher the field name */ - deparse_namespace save_dpns; - const char *result; - - push_ancestor_plan(dpns, ancestor_cell, &save_dpns); - result = get_name_for_var_field((Var *) expr, fieldno, - 0, context); - pop_ancestor_plan(dpns, &save_dpns); - return result; - } - } - - /* - * If it's a Var of type RECORD, we have to find what the Var refers to; - * if not, we can use get_expr_result_type. If that fails, we try - * lookup_rowtype_tupdesc, which will probably fail too, but will ereport - * an acceptable message. - */ - if (!IsA(var, Var) || - var->vartype != RECORDOID) - { - if (get_expr_result_type((Node *) var, NULL, &tupleDesc) != TYPEFUNC_COMPOSITE) - tupleDesc = lookup_rowtype_tupdesc_copy(exprType((Node *) var), - exprTypmod((Node *) var)); - Assert(tupleDesc); - /* Got the tupdesc, so we can extract the field name */ - Assert(fieldno >= 1 && fieldno <= tupleDesc->natts); - return NameStr(tupleDesc->attrs[fieldno - 1]->attname); - } - - /* Find appropriate nesting depth */ - netlevelsup = var->varlevelsup + levelsup; - if (netlevelsup >= list_length(context->namespaces)) - elog(ERROR, "bogus varlevelsup: %d offset %d", - var->varlevelsup, levelsup); - dpns = (deparse_namespace *) list_nth(context->namespaces, - netlevelsup); - - /* - * Try to find the relevant RTE in this rtable. In a plan tree, it's - * likely that varno is OUTER_VAR or INNER_VAR, in which case we must dig - * down into the subplans, or INDEX_VAR, which is resolved similarly. 
- */ - if (var->varno >= 1 && var->varno <= list_length(dpns->rtable)) - { - rte = rt_fetch(var->varno, dpns->rtable); - attnum = var->varattno; - } - else if (var->varno == OUTER_VAR && dpns->outer_tlist) - { - TargetEntry *tle; - deparse_namespace save_dpns; - const char *result; - - tle = get_tle_by_resno(dpns->outer_tlist, var->varattno); - if (!tle) - elog(ERROR, "bogus varattno for OUTER_VAR var: %d", var->varattno); - - Assert(netlevelsup == 0); - push_child_plan(dpns, dpns->outer_planstate, &save_dpns); - - result = get_name_for_var_field((Var *) tle->expr, fieldno, - levelsup, context); - - pop_child_plan(dpns, &save_dpns); - return result; - } - else if (var->varno == INNER_VAR && dpns->inner_tlist) - { - TargetEntry *tle; - deparse_namespace save_dpns; - const char *result; - - tle = get_tle_by_resno(dpns->inner_tlist, var->varattno); - if (!tle) - elog(ERROR, "bogus varattno for INNER_VAR var: %d", var->varattno); - - Assert(netlevelsup == 0); - push_child_plan(dpns, dpns->inner_planstate, &save_dpns); - - result = get_name_for_var_field((Var *) tle->expr, fieldno, - levelsup, context); - - pop_child_plan(dpns, &save_dpns); - return result; - } - else if (var->varno == INDEX_VAR && dpns->index_tlist) - { - TargetEntry *tle; - const char *result; - - tle = get_tle_by_resno(dpns->index_tlist, var->varattno); - if (!tle) - elog(ERROR, "bogus varattno for INDEX_VAR var: %d", var->varattno); - - Assert(netlevelsup == 0); - - result = get_name_for_var_field((Var *) tle->expr, fieldno, - levelsup, context); - - return result; - } - else - { - elog(ERROR, "bogus varno: %d", var->varno); - return NULL; /* keep compiler quiet */ - } - - if (attnum == InvalidAttrNumber) - { - /* Var is whole-row reference to RTE, so select the right field */ - return get_rte_attribute_name(rte, fieldno); - } - - /* - * This part has essentially the same logic as the parser's - * expandRecordVariable() function, but we are dealing with a different - * representation of the input context, and we only need one field name - * not a TupleDesc. Also, we need special cases for finding subquery and - * CTE subplans when deparsing Plan trees. - */ - expr = (Node *) var; /* default if we can't drill down */ - - switch (rte->rtekind) - { - case RTE_RELATION: - case RTE_VALUES: - case RTE_NAMEDTUPLESTORE: - - /* - * This case should not occur: a column of a table or values list - * shouldn't have type RECORD. Fall through and fail (most - * likely) at the bottom. - */ - break; - case RTE_SUBQUERY: - /* Subselect-in-FROM: examine sub-select's output expr */ - { - if (rte->subquery) - { - TargetEntry *ste = get_tle_by_resno(rte->subquery->targetList, - attnum); - - if (ste == NULL || ste->resjunk) - elog(ERROR, "subquery %s does not have attribute %d", - rte->eref->aliasname, attnum); - expr = (Node *) ste->expr; - if (IsA(expr, Var)) - { - /* - * Recurse into the sub-select to see what its Var - * refers to. We have to build an additional level of - * namespace to keep in step with varlevelsup in the - * subselect. 
- */ - deparse_namespace mydpns; - const char *result; - - set_deparse_for_query(&mydpns, rte->subquery, - context->namespaces); - - context->namespaces = lcons(&mydpns, - context->namespaces); - - result = get_name_for_var_field((Var *) expr, fieldno, - 0, context); - - context->namespaces = - list_delete_first(context->namespaces); - - return result; - } - /* else fall through to inspect the expression */ - } - else - { - /* - * We're deparsing a Plan tree so we don't have complete - * RTE entries (in particular, rte->subquery is NULL). But - * the only place we'd see a Var directly referencing a - * SUBQUERY RTE is in a SubqueryScan plan node, and we can - * look into the child plan's tlist instead. - */ - TargetEntry *tle; - deparse_namespace save_dpns; - const char *result; - - if (!dpns->inner_planstate) - elog(ERROR, "failed to find plan for subquery %s", - rte->eref->aliasname); - tle = get_tle_by_resno(dpns->inner_tlist, attnum); - if (!tle) - elog(ERROR, "bogus varattno for subquery var: %d", - attnum); - Assert(netlevelsup == 0); - push_child_plan(dpns, dpns->inner_planstate, &save_dpns); - - result = get_name_for_var_field((Var *) tle->expr, fieldno, - levelsup, context); - - pop_child_plan(dpns, &save_dpns); - return result; - } - } - break; - case RTE_JOIN: - /* Join RTE --- recursively inspect the alias variable */ - if (rte->joinaliasvars == NIL) - elog(ERROR, "cannot decompile join alias var in plan tree"); - Assert(attnum > 0 && attnum <= list_length(rte->joinaliasvars)); - expr = (Node *) list_nth(rte->joinaliasvars, attnum - 1); - Assert(expr != NULL); - /* we intentionally don't strip implicit coercions here */ - if (IsA(expr, Var)) - return get_name_for_var_field((Var *) expr, fieldno, - var->varlevelsup + levelsup, - context); - /* else fall through to inspect the expression */ - break; - case RTE_FUNCTION: - case RTE_TABLEFUNC: - - /* - * We couldn't get here unless a function is declared with one of - * its result columns as RECORD, which is not allowed. - */ - break; - case RTE_CTE: - /* CTE reference: examine subquery's output expr */ - { - CommonTableExpr *cte = NULL; - Index ctelevelsup; - ListCell *lc; - - /* - * Try to find the referenced CTE using the namespace stack. - */ - ctelevelsup = rte->ctelevelsup + netlevelsup; - if (ctelevelsup >= list_length(context->namespaces)) - lc = NULL; - else - { - deparse_namespace *ctedpns; - - ctedpns = (deparse_namespace *) - list_nth(context->namespaces, ctelevelsup); - foreach(lc, ctedpns->ctes) - { - cte = (CommonTableExpr *) lfirst(lc); - if (strcmp(cte->ctename, rte->ctename) == 0) - break; - } - } - if (lc != NULL) - { - Query *ctequery = (Query *) cte->ctequery; - TargetEntry *ste = get_tle_by_resno(GetCTETargetList(cte), - attnum); - - if (ste == NULL || ste->resjunk) - elog(ERROR, "subquery %s does not have attribute %d", - rte->eref->aliasname, attnum); - expr = (Node *) ste->expr; - if (IsA(expr, Var)) - { - /* - * Recurse into the CTE to see what its Var refers to. - * We have to build an additional level of namespace - * to keep in step with varlevelsup in the CTE. - * Furthermore it could be an outer CTE, so we may - * have to delete some levels of namespace. 
- */ - List *save_nslist = context->namespaces; - List *new_nslist; - deparse_namespace mydpns; - const char *result; - - set_deparse_for_query(&mydpns, ctequery, - context->namespaces); - - new_nslist = list_copy_tail(context->namespaces, - ctelevelsup); - context->namespaces = lcons(&mydpns, new_nslist); - - result = get_name_for_var_field((Var *) expr, fieldno, - 0, context); - - context->namespaces = save_nslist; - - return result; - } - /* else fall through to inspect the expression */ - } - else - { - /* - * We're deparsing a Plan tree so we don't have a CTE - * list. But the only place we'd see a Var directly - * referencing a CTE RTE is in a CteScan plan node, and we - * can look into the subplan's tlist instead. - */ - TargetEntry *tle; - deparse_namespace save_dpns; - const char *result; - - if (!dpns->inner_planstate) - elog(ERROR, "failed to find plan for CTE %s", - rte->eref->aliasname); - tle = get_tle_by_resno(dpns->inner_tlist, attnum); - if (!tle) - elog(ERROR, "bogus varattno for subquery var: %d", - attnum); - Assert(netlevelsup == 0); - push_child_plan(dpns, dpns->inner_planstate, &save_dpns); - - result = get_name_for_var_field((Var *) tle->expr, fieldno, - levelsup, context); - - pop_child_plan(dpns, &save_dpns); - return result; - } - } - break; -#ifdef PGXC - case RTE_REMOTE_DUMMY: - elog(ERROR, "Invalid RTE found"); - break; -#endif /* PGXC */ - } - - /* - * We now have an expression we can't expand any more, so see if - * get_expr_result_type() can do anything with it. If not, pass to - * lookup_rowtype_tupdesc() which will probably fail, but will give an - * appropriate error message while failing. - */ - if (get_expr_result_type(expr, NULL, &tupleDesc) != TYPEFUNC_COMPOSITE) - tupleDesc = lookup_rowtype_tupdesc_copy(exprType(expr), - exprTypmod(expr)); - Assert(tupleDesc); - /* Got the tupdesc, so we can extract the field name */ - Assert(fieldno >= 1 && fieldno <= tupleDesc->natts); - return NameStr(tupleDesc->attrs[fieldno - 1]->attname); -} - -/* - * Try to find the referenced expression for a PARAM_EXEC Param that might - * reference a parameter supplied by an upper NestLoop or SubPlan plan node. - * - * If successful, return the expression and set *dpns_p and *ancestor_cell_p - * appropriately for calling push_ancestor_plan(). If no referent can be - * found, return NULL. - */ -static Node * -find_param_referent(Param *param, deparse_context *context, - deparse_namespace **dpns_p, ListCell **ancestor_cell_p) -{// #lizard forgives - /* Initialize output parameters to prevent compiler warnings */ - *dpns_p = NULL; - *ancestor_cell_p = NULL; - - /* - * If it's a PARAM_EXEC parameter, look for a matching NestLoopParam or - * SubPlan argument. This will necessarily be in some ancestor of the - * current expression's PlanState. - */ - if (param->paramkind == PARAM_EXEC) - { - deparse_namespace *dpns; - PlanState *child_ps; - bool in_same_plan_level; - ListCell *lc; - - dpns = (deparse_namespace *) linitial(context->namespaces); - child_ps = dpns->planstate; - in_same_plan_level = true; - - foreach(lc, dpns->ancestors) - { - PlanState *ps = (PlanState *) lfirst(lc); - ListCell *lc2; - - /* - * NestLoops transmit params to their inner child only; also, once - * we've crawled up out of a subplan, this couldn't possibly be - * the right match. 
- */ - if (IsA(ps, NestLoopState) && - child_ps == innerPlanState(ps) && - in_same_plan_level) - { - NestLoop *nl = (NestLoop *) ps->plan; - - foreach(lc2, nl->nestParams) - { - NestLoopParam *nlp = (NestLoopParam *) lfirst(lc2); - - if (nlp->paramno == param->paramid) - { - /* Found a match, so return it */ - *dpns_p = dpns; - *ancestor_cell_p = lc; - return (Node *) nlp->paramval; - } - } - } - - /* - * Check to see if we're crawling up from a subplan. - */ - foreach(lc2, ps->subPlan) - { - SubPlanState *sstate = (SubPlanState *) lfirst(lc2); - SubPlan *subplan = sstate->subplan; - ListCell *lc3; - ListCell *lc4; - - if (child_ps != sstate->planstate) - continue; - - /* Matched subplan, so check its arguments */ - forboth(lc3, subplan->parParam, lc4, subplan->args) - { - int paramid = lfirst_int(lc3); - Node *arg = (Node *) lfirst(lc4); - - if (paramid == param->paramid) - { - /* Found a match, so return it */ - *dpns_p = dpns; - *ancestor_cell_p = lc; - return arg; - } - } - - /* Keep looking, but we are emerging from a subplan. */ - in_same_plan_level = false; - break; - } - - /* - * Likewise check to see if we're emerging from an initplan. - * Initplans never have any parParams, so no need to search that - * list, but we need to know if we should reset - * in_same_plan_level. - */ - foreach(lc2, ps->initPlan) - { - SubPlanState *sstate = (SubPlanState *) lfirst(lc2); - - if (child_ps != sstate->planstate) - continue; - - /* No parameters to be had here. */ - Assert(sstate->subplan->parParam == NIL); - - /* Keep looking, but we are emerging from an initplan. */ - in_same_plan_level = false; - break; - } - - /* No luck, crawl up to next ancestor */ - child_ps = ps; - } - } - - /* No referent found */ - return NULL; -} - -/* - * Display a Param appropriately. - */ -static void -get_parameter(Param *param, deparse_context *context) -{// #lizard forgives - Node *expr; - deparse_namespace *dpns; - ListCell *ancestor_cell; - - /* - * If it's a PARAM_EXEC parameter, try to locate the expression from which - * the parameter was computed. Note that failing to find a referent isn't - * an error, since the Param might well be a subplan output rather than an - * input. - */ - expr = find_param_referent(param, context, &dpns, &ancestor_cell); - if (expr) - { - /* Found a match, so print it */ - deparse_namespace save_dpns; - bool save_varprefix; - bool need_paren; - - /* Switch attention to the ancestor plan node */ - push_ancestor_plan(dpns, ancestor_cell, &save_dpns); - - /* - * Force prefixing of Vars, since they won't belong to the relation - * being scanned in the original plan node. - */ - save_varprefix = context->varprefix; - context->varprefix = true; - - /* - * A Param's expansion is typically a Var, Aggref, or upper-level - * Param, which wouldn't need extra parentheses. Otherwise, insert - * parens to ensure the expression looks atomic. - */ - need_paren = !(IsA(expr, Var) || - IsA(expr, Aggref) || - IsA(expr, Param)); - if (need_paren) - appendStringInfoChar(context->buf, '('); - - get_rule_expr(expr, context, false); - - if (need_paren) - appendStringInfoChar(context->buf, ')'); - - context->varprefix = save_varprefix; - - pop_ancestor_plan(dpns, &save_dpns); - - return; - } - - /* - * Not PARAM_EXEC, or couldn't find referent: just print $N. 
- */ - appendStringInfo(context->buf, "$%d", param->paramid); - -#ifdef __TBASE__ - /* param need explicit cast */ - if (param->explicit_cast) - { - appendStringInfo(context->buf, "::%s", - format_type_with_typemod(param->paramtype, param->paramtypmod)); - } -#endif -} - -/* - * get_simple_binary_op_name - * - * helper function for isSimpleNode - * will return single char binary operator name, or NULL if it's not - */ -static const char * -get_simple_binary_op_name(OpExpr *expr) -{ - List *args = expr->args; - - if (list_length(args) == 2) - { - /* binary operator */ - Node *arg1 = (Node *) linitial(args); - Node *arg2 = (Node *) lsecond(args); - const char *op; - - op = generate_operator_name(expr->opno, exprType(arg1), exprType(arg2)); - if (strlen(op) == 1) - return op; - } - return NULL; -} - - -/* - * isSimpleNode - check if given node is simple (doesn't need parenthesizing) - * - * true : simple in the context of parent node's type - * false : not simple - */ -static bool -isSimpleNode(Node *node, Node *parentNode, int prettyFlags) -{// #lizard forgives - if (!node) - return false; - - switch (nodeTag(node)) - { - case T_Var: - case T_Const: - case T_Param: - case T_CoerceToDomainValue: - case T_SetToDefault: - case T_CurrentOfExpr: - /* single words: always simple */ - return true; - - case T_ArrayRef: - case T_ArrayExpr: - case T_RowExpr: - case T_CoalesceExpr: - case T_MinMaxExpr: - case T_SQLValueFunction: - case T_XmlExpr: - case T_NextValueExpr: - case T_NullIfExpr: - case T_Aggref: - case T_WindowFunc: - case T_FuncExpr: - /* function-like: name(..) or name[..] */ - return true; - - /* CASE keywords act as parentheses */ - case T_CaseExpr: - return true; - - case T_FieldSelect: - - /* - * appears simple since . has top precedence, unless parent is - * T_FieldSelect itself! - */ - return (IsA(parentNode, FieldSelect) ? false : true); - - case T_FieldStore: - - /* - * treat like FieldSelect (probably doesn't matter) - */ - return (IsA(parentNode, FieldStore) ? 
false : true); - - case T_CoerceToDomain: - /* maybe simple, check args */ - return isSimpleNode((Node *) ((CoerceToDomain *) node)->arg, - node, prettyFlags); - case T_RelabelType: - return isSimpleNode((Node *) ((RelabelType *) node)->arg, - node, prettyFlags); - case T_CoerceViaIO: - return isSimpleNode((Node *) ((CoerceViaIO *) node)->arg, - node, prettyFlags); - case T_ArrayCoerceExpr: - return isSimpleNode((Node *) ((ArrayCoerceExpr *) node)->arg, - node, prettyFlags); - case T_ConvertRowtypeExpr: - return isSimpleNode((Node *) ((ConvertRowtypeExpr *) node)->arg, - node, prettyFlags); - - case T_OpExpr: - { - /* depends on parent node type; needs further checking */ - if (prettyFlags & PRETTYFLAG_PAREN && IsA(parentNode, OpExpr)) - { - const char *op; - const char *parentOp; - bool is_lopriop; - bool is_hipriop; - bool is_lopriparent; - bool is_hipriparent; - - op = get_simple_binary_op_name((OpExpr *) node); - if (!op) - return false; - - /* We know only the basic operators + - and * / % */ - is_lopriop = (strchr("+-", *op) != NULL); - is_hipriop = (strchr("*/%", *op) != NULL); - if (!(is_lopriop || is_hipriop)) - return false; - - parentOp = get_simple_binary_op_name((OpExpr *) parentNode); - if (!parentOp) - return false; - - is_lopriparent = (strchr("+-", *parentOp) != NULL); - is_hipriparent = (strchr("*/%", *parentOp) != NULL); - if (!(is_lopriparent || is_hipriparent)) - return false; - - if (is_hipriop && is_lopriparent) - return true; /* op binds tighter than parent */ - - if (is_lopriop && is_hipriparent) - return false; - - /* - * Operators are same priority --- can skip parens only if - * we have (a - b) - c, not a - (b - c). - */ - if (node == (Node *) linitial(((OpExpr *) parentNode)->args)) - return true; - - return false; - } - /* else do the same stuff as for T_SubLink et al. 
*/ - /* FALL THROUGH */ - } - - case T_SubLink: - case T_NullTest: - case T_BooleanTest: - case T_DistinctExpr: - switch (nodeTag(parentNode)) - { - case T_FuncExpr: - { - /* special handling for casts */ - CoercionForm type = ((FuncExpr *) parentNode)->funcformat; - - if (type == COERCE_EXPLICIT_CAST || - type == COERCE_IMPLICIT_CAST) - return false; - return true; /* own parentheses */ - } - case T_BoolExpr: /* lower precedence */ - case T_ArrayRef: /* other separators */ - case T_ArrayExpr: /* other separators */ - case T_RowExpr: /* other separators */ - case T_CoalesceExpr: /* own parentheses */ - case T_MinMaxExpr: /* own parentheses */ - case T_XmlExpr: /* own parentheses */ - case T_NullIfExpr: /* other separators */ - case T_Aggref: /* own parentheses */ - case T_WindowFunc: /* own parentheses */ - case T_CaseExpr: /* other separators */ - return true; - default: - return false; - } - - case T_BoolExpr: - switch (nodeTag(parentNode)) - { - case T_BoolExpr: - if (prettyFlags & PRETTYFLAG_PAREN) - { - BoolExprType type; - BoolExprType parentType; - - type = ((BoolExpr *) node)->boolop; - parentType = ((BoolExpr *) parentNode)->boolop; - switch (type) - { - case NOT_EXPR: - case AND_EXPR: - if (parentType == AND_EXPR || parentType == OR_EXPR) - return true; - break; - case OR_EXPR: - if (parentType == OR_EXPR) - return true; - break; - } - } - return false; - case T_FuncExpr: - { - /* special handling for casts */ - CoercionForm type = ((FuncExpr *) parentNode)->funcformat; - - if (type == COERCE_EXPLICIT_CAST || - type == COERCE_IMPLICIT_CAST) - return false; - return true; /* own parentheses */ - } - case T_ArrayRef: /* other separators */ - case T_ArrayExpr: /* other separators */ - case T_RowExpr: /* other separators */ - case T_CoalesceExpr: /* own parentheses */ - case T_MinMaxExpr: /* own parentheses */ - case T_XmlExpr: /* own parentheses */ - case T_NullIfExpr: /* other separators */ - case T_Aggref: /* own parentheses */ - case T_WindowFunc: /* own parentheses */ - case T_CaseExpr: /* other separators */ - return true; - default: - return false; - } - - default: - break; - } - /* those we don't know: in dubio complexo */ - return false; -} - - -/* - * appendContextKeyword - append a keyword to buffer - * - * If prettyPrint is enabled, perform a line break, and adjust indentation. - * Otherwise, just append the keyword. - */ -static void -appendContextKeyword(deparse_context *context, const char *str, - int indentBefore, int indentAfter, int indentPlus) -{ - StringInfo buf = context->buf; - - if (PRETTY_INDENT(context)) - { - int indentAmount; - - context->indentLevel += indentBefore; - - /* remove any trailing spaces currently in the buffer ... */ - removeStringInfoSpaces(buf); - /* ... then add a newline and some spaces */ - appendStringInfoChar(buf, '\n'); - - if (context->indentLevel < PRETTYINDENT_LIMIT) - indentAmount = Max(context->indentLevel, 0) + indentPlus; - else - { - /* - * If we're indented more than PRETTYINDENT_LIMIT characters, try - * to conserve horizontal space by reducing the per-level - * indentation. For best results the scale factor here should - * divide all the indent amounts that get added to indentLevel - * (PRETTYINDENT_STD, etc). It's important that the indentation - * not grow unboundedly, else deeply-nested trees use O(N^2) - * whitespace; so we also wrap modulo PRETTYINDENT_LIMIT. 
- */
-            indentAmount = PRETTYINDENT_LIMIT +
-                (context->indentLevel - PRETTYINDENT_LIMIT) /
-                (PRETTYINDENT_STD / 2);
-            indentAmount %= PRETTYINDENT_LIMIT;
-            /* scale/wrap logic affects indentLevel, but not indentPlus */
-            indentAmount += indentPlus;
-        }
-        appendStringInfoSpaces(buf, indentAmount);
-
-        appendStringInfoString(buf, str);
-
-        context->indentLevel += indentAfter;
-        if (context->indentLevel < 0)
-            context->indentLevel = 0;
-    }
-    else
-        appendStringInfoString(buf, str);
-}
-
-/*
- * removeStringInfoSpaces - delete trailing spaces from a buffer.
- *
- * Possibly this should move to stringinfo.c at some point.
- */
-static void
-removeStringInfoSpaces(StringInfo str)
-{
-    while (str->len > 0 && str->data[str->len - 1] == ' ')
-        str->data[--(str->len)] = '\0';
-}
-
-
-/*
- * get_rule_expr_paren - deparse expr using get_rule_expr,
- *      embracing the string with parentheses if necessary for prettyPrint.
- *
- * Never embrace if prettyFlags=0, because it's done in the calling node.
- *
- * Any node that does *not* embrace its argument node by sql syntax (with
- * parentheses, non-operator keywords like CASE/WHEN/ON, or comma etc) should
- * use get_rule_expr_paren instead of get_rule_expr so parentheses can be
- * added.
- */
-static void
-get_rule_expr_paren(Node *node, deparse_context *context,
-                    bool showimplicit, Node *parentNode)
-{
-    bool need_paren;
-
-    need_paren = PRETTY_PAREN(context) &&
-        !isSimpleNode(node, parentNode, context->prettyFlags);
-
-    if (need_paren)
-        appendStringInfoChar(context->buf, '(');
-
-    get_rule_expr(node, context, showimplicit);
-
-    if (need_paren)
-        appendStringInfoChar(context->buf, ')');
-}
-
-
-/* ----------
- * get_rule_expr - Parse back an expression
- *
- * Note: showimplicit determines whether we display any implicit cast that
- * is present at the top of the expression tree. It is a passed argument,
- * not a field of the context struct, because we change the value as we
- * recurse down into the expression. In general we suppress implicit casts
- * when the result type is known with certainty (eg, the arguments of an
- * OR must be boolean). We display implicit casts for arguments of functions
- * and operators, since this is needed to be certain that the same function
- * or operator will be chosen when the expression is re-parsed.
- * ----------
- */
-static void
-get_rule_expr(Node *node, deparse_context *context,
-              bool showimplicit)
-{// #lizard forgives
-    StringInfo buf = context->buf;
-
-    if (node == NULL)
-        return;
-
-    /* Guard against excessively long or deeply-nested queries */
-    CHECK_FOR_INTERRUPTS();
-    check_stack_depth();
-
-    /*
-     * Each level of get_rule_expr must emit an indivisible term
-     * (parenthesized if necessary) to ensure result is reparsed into the same
-     * expression tree. The only exception is that when the input is a List,
-     * we emit the component items comma-separated with no surrounding
-     * decoration; this is convenient for most callers.
- */ - switch (nodeTag(node)) - { - case T_Var: - (void) get_variable((Var *) node, 0, false, context); - break; - - case T_Const: - get_const_expr((Const *) node, context, 0); - break; - - case T_Param: - get_parameter((Param *) node, context); - break; - - case T_Aggref: - get_agg_expr((Aggref *) node, context, (Aggref *) node); - break; - - case T_GroupingFunc: - { - GroupingFunc *gexpr = (GroupingFunc *) node; - - appendStringInfoString(buf, "GROUPING("); - get_rule_expr((Node *) gexpr->args, context, true); - appendStringInfoChar(buf, ')'); - } - break; - - case T_WindowFunc: - get_windowfunc_expr((WindowFunc *) node, context); - break; - - case T_ArrayRef: - { - ArrayRef *aref = (ArrayRef *) node; - bool need_parens; - - /* - * If the argument is a CaseTestExpr, we must be inside a - * FieldStore, ie, we are assigning to an element of an array - * within a composite column. Since we already punted on - * displaying the FieldStore's target information, just punt - * here too, and display only the assignment source - * expression. - */ - if (IsA(aref->refexpr, CaseTestExpr)) - { - Assert(aref->refassgnexpr); - get_rule_expr((Node *) aref->refassgnexpr, - context, showimplicit); - break; - } - - /* - * Parenthesize the argument unless it's a simple Var or a - * FieldSelect. (In particular, if it's another ArrayRef, we - * *must* parenthesize to avoid confusion.) - */ - need_parens = !IsA(aref->refexpr, Var) && - !IsA(aref->refexpr, FieldSelect); - if (need_parens) - appendStringInfoChar(buf, '('); - get_rule_expr((Node *) aref->refexpr, context, showimplicit); - if (need_parens) - appendStringInfoChar(buf, ')'); - - /* - * If there's a refassgnexpr, we want to print the node in the - * format "array[subscripts] := refassgnexpr". This is not - * legal SQL, so decompilation of INSERT or UPDATE statements - * should always use processIndirection as part of the - * statement-level syntax. We should only see this when - * EXPLAIN tries to print the targetlist of a plan resulting - * from such a statement. - */ - if (aref->refassgnexpr) - { - Node *refassgnexpr; - - /* - * Use processIndirection to print this node's subscripts - * as well as any additional field selections or - * subscripting in immediate descendants. It returns the - * RHS expr that is actually being "assigned". 
- */ - refassgnexpr = processIndirection(node, context); - appendStringInfoString(buf, " := "); - get_rule_expr(refassgnexpr, context, showimplicit); - } - else - { - /* Just an ordinary array fetch, so print subscripts */ - printSubscripts(aref, context); - } - } - break; - - case T_FuncExpr: - get_func_expr((FuncExpr *) node, context, showimplicit); - break; - - case T_NamedArgExpr: - { - NamedArgExpr *na = (NamedArgExpr *) node; - - appendStringInfo(buf, "%s => ", quote_identifier(na->name)); - get_rule_expr((Node *) na->arg, context, showimplicit); - } - break; - - case T_OpExpr: - get_oper_expr((OpExpr *) node, context); - break; - - case T_DistinctExpr: - { - DistinctExpr *expr = (DistinctExpr *) node; - List *args = expr->args; - Node *arg1 = (Node *) linitial(args); - Node *arg2 = (Node *) lsecond(args); - - if (!PRETTY_PAREN(context)) - appendStringInfoChar(buf, '('); - get_rule_expr_paren(arg1, context, true, node); - appendStringInfoString(buf, " IS DISTINCT FROM "); - get_rule_expr_paren(arg2, context, true, node); - if (!PRETTY_PAREN(context)) - appendStringInfoChar(buf, ')'); - } - break; - - case T_NullIfExpr: - { - NullIfExpr *nullifexpr = (NullIfExpr *) node; - - appendStringInfoString(buf, "NULLIF("); - get_rule_expr((Node *) nullifexpr->args, context, true); - appendStringInfoChar(buf, ')'); - } - break; - - case T_ScalarArrayOpExpr: - { - ScalarArrayOpExpr *expr = (ScalarArrayOpExpr *) node; - List *args = expr->args; - Node *arg1 = (Node *) linitial(args); - Node *arg2 = (Node *) lsecond(args); - - if (!PRETTY_PAREN(context)) - appendStringInfoChar(buf, '('); - get_rule_expr_paren(arg1, context, true, node); - appendStringInfo(buf, " %s %s (", - generate_operator_name(expr->opno, - exprType(arg1), - get_base_element_type(exprType(arg2))), - expr->useOr ? "ANY" : "ALL"); - get_rule_expr_paren(arg2, context, true, node); - - /* - * There's inherent ambiguity in "x op ANY/ALL (y)" when y is - * a bare sub-SELECT. Since we're here, the sub-SELECT must - * be meant as a scalar sub-SELECT yielding an array value to - * be used in ScalarArrayOpExpr; but the grammar will - * preferentially interpret such a construct as an ANY/ALL - * SubLink. To prevent misparsing the output that way, insert - * a dummy coercion (which will be stripped by parse analysis, - * so no inefficiency is added in dump and reload). This is - * indeed most likely what the user wrote to get the construct - * accepted in the first place. 
-                 */
-                if (IsA(arg2, SubLink) &&
-                    ((SubLink *) arg2)->subLinkType == EXPR_SUBLINK)
-                    appendStringInfo(buf, "::%s",
-                                     format_type_with_typemod(exprType(arg2),
-                                                              exprTypmod(arg2)));
-                appendStringInfoChar(buf, ')');
-                if (!PRETTY_PAREN(context))
-                    appendStringInfoChar(buf, ')');
-            }
-            break;
-
-        case T_BoolExpr:
-            {
-                BoolExpr *expr = (BoolExpr *) node;
-                Node *first_arg = linitial(expr->args);
-                ListCell *arg = lnext(list_head(expr->args));
-
-                switch (expr->boolop)
-                {
-                    case AND_EXPR:
-                        if (!PRETTY_PAREN(context))
-                            appendStringInfoChar(buf, '(');
-                        get_rule_expr_paren(first_arg, context,
-                                            false, node);
-                        while (arg)
-                        {
-                            appendStringInfoString(buf, " AND ");
-                            get_rule_expr_paren((Node *) lfirst(arg), context,
-                                                false, node);
-                            arg = lnext(arg);
-                        }
-                        if (!PRETTY_PAREN(context))
-                            appendStringInfoChar(buf, ')');
-                        break;
-
-                    case OR_EXPR:
-                        if (!PRETTY_PAREN(context))
-                            appendStringInfoChar(buf, '(');
-                        get_rule_expr_paren(first_arg, context,
-                                            false, node);
-                        while (arg)
-                        {
-                            appendStringInfoString(buf, " OR ");
-                            get_rule_expr_paren((Node *) lfirst(arg), context,
-                                                false, node);
-                            arg = lnext(arg);
-                        }
-                        if (!PRETTY_PAREN(context))
-                            appendStringInfoChar(buf, ')');
-                        break;
-
-                    case NOT_EXPR:
-                        if (!PRETTY_PAREN(context))
-                            appendStringInfoChar(buf, '(');
-                        appendStringInfoString(buf, "NOT ");
-                        get_rule_expr_paren(first_arg, context,
-                                            false, node);
-                        if (!PRETTY_PAREN(context))
-                            appendStringInfoChar(buf, ')');
-                        break;
-
-                    default:
-                        elog(ERROR, "unrecognized boolop: %d",
-                             (int) expr->boolop);
-                }
-            }
-            break;
-
-        case T_SubLink:
-            get_sublink_expr((SubLink *) node, context);
-            break;
-
-        case T_SubPlan:
-            {
-                SubPlan *subplan = (SubPlan *) node;
-
-                /*
-                 * We cannot see an already-planned subplan in rule deparsing,
-                 * only while EXPLAINing a query plan. We don't try to
-                 * reconstruct the original SQL, just reference the subplan
-                 * that appears elsewhere in EXPLAIN's result.
-                 */
-                if (subplan->useHashTable)
-                    appendStringInfo(buf, "(hashed %s)", subplan->plan_name);
-                else
-                    appendStringInfo(buf, "(%s)", subplan->plan_name);
-            }
-            break;
-
-        case T_AlternativeSubPlan:
-            {
-                AlternativeSubPlan *asplan = (AlternativeSubPlan *) node;
-                ListCell *lc;
-
-                /* As above, this can only happen during EXPLAIN */
-                appendStringInfoString(buf, "(alternatives: ");
-                foreach(lc, asplan->subplans)
-                {
-                    SubPlan *splan = lfirst_node(SubPlan, lc);
-
-                    if (splan->useHashTable)
-                        appendStringInfo(buf, "hashed %s", splan->plan_name);
-                    else
-                        appendStringInfoString(buf, splan->plan_name);
-                    if (lnext(lc))
-                        appendStringInfoString(buf, " or ");
-                }
-                appendStringInfoChar(buf, ')');
-            }
-            break;
-
-        case T_FieldSelect:
-            {
-                FieldSelect *fselect = (FieldSelect *) node;
-                Node *arg = (Node *) fselect->arg;
-                int fno = fselect->fieldnum;
-                const char *fieldname;
-                bool need_parens;
-
-                /*
-                 * Parenthesize the argument unless it's an ArrayRef or
-                 * another FieldSelect. Note in particular that it would be
-                 * WRONG to not parenthesize a Var argument; simplicity is not
-                 * the issue here, having the right number of names is.
-                 */
-                need_parens = !IsA(arg, ArrayRef) &&!IsA(arg, FieldSelect);
-                if (need_parens)
-                    appendStringInfoChar(buf, '(');
-                get_rule_expr(arg, context, true);
-                if (need_parens)
-                    appendStringInfoChar(buf, ')');
-
-                /*
-                 * Get and print the field name.
- */ - fieldname = get_name_for_var_field((Var *) arg, fno, - 0, context); - appendStringInfo(buf, ".%s", quote_identifier(fieldname)); - } - break; - - case T_FieldStore: - { - FieldStore *fstore = (FieldStore *) node; - bool need_parens; - - /* - * There is no good way to represent a FieldStore as real SQL, - * so decompilation of INSERT or UPDATE statements should - * always use processIndirection as part of the - * statement-level syntax. We should only get here when - * EXPLAIN tries to print the targetlist of a plan resulting - * from such a statement. The plan case is even harder than - * ordinary rules would be, because the planner tries to - * collapse multiple assignments to the same field or subfield - * into one FieldStore; so we can see a list of target fields - * not just one, and the arguments could be FieldStores - * themselves. We don't bother to try to print the target - * field names; we just print the source arguments, with a - * ROW() around them if there's more than one. This isn't - * terribly complete, but it's probably good enough for - * EXPLAIN's purposes; especially since anything more would be - * either hopelessly confusing or an even poorer - * representation of what the plan is actually doing. - */ - need_parens = (list_length(fstore->newvals) != 1); - if (need_parens) - appendStringInfoString(buf, "ROW("); - get_rule_expr((Node *) fstore->newvals, context, showimplicit); - if (need_parens) - appendStringInfoChar(buf, ')'); - } - break; - - case T_RelabelType: - { - RelabelType *relabel = (RelabelType *) node; - Node *arg = (Node *) relabel->arg; - - if (relabel->relabelformat == COERCE_IMPLICIT_CAST && - !showimplicit) - { - /* don't show the implicit cast */ - get_rule_expr_paren(arg, context, false, node); - } - else - { - get_coercion_expr(arg, context, - relabel->resulttype, - relabel->resulttypmod, - node); - } - } - break; - - case T_CoerceViaIO: - { - CoerceViaIO *iocoerce = (CoerceViaIO *) node; - Node *arg = (Node *) iocoerce->arg; - - if (iocoerce->coerceformat == COERCE_IMPLICIT_CAST && - !showimplicit) - { - /* don't show the implicit cast */ - get_rule_expr_paren(arg, context, false, node); - } - else - { - get_coercion_expr(arg, context, - iocoerce->resulttype, - -1, - node); - } - } - break; - - case T_ArrayCoerceExpr: - { - ArrayCoerceExpr *acoerce = (ArrayCoerceExpr *) node; - Node *arg = (Node *) acoerce->arg; - - if (acoerce->coerceformat == COERCE_IMPLICIT_CAST && - !showimplicit) - { - /* don't show the implicit cast */ - get_rule_expr_paren(arg, context, false, node); - } - else - { - get_coercion_expr(arg, context, - acoerce->resulttype, - acoerce->resulttypmod, - node); - } - } - break; - - case T_ConvertRowtypeExpr: - { - ConvertRowtypeExpr *convert = (ConvertRowtypeExpr *) node; - Node *arg = (Node *) convert->arg; - - if (convert->convertformat == COERCE_IMPLICIT_CAST && - !showimplicit) - { - /* don't show the implicit cast */ - get_rule_expr_paren(arg, context, false, node); - } - else - { - get_coercion_expr(arg, context, - convert->resulttype, -1, - node); - } - } - break; - - case T_CollateExpr: - { - CollateExpr *collate = (CollateExpr *) node; - Node *arg = (Node *) collate->arg; - - if (!PRETTY_PAREN(context)) - appendStringInfoChar(buf, '('); - get_rule_expr_paren(arg, context, showimplicit, node); - appendStringInfo(buf, " COLLATE %s", - generate_collation_name(collate->collOid)); - if (!PRETTY_PAREN(context)) - appendStringInfoChar(buf, ')'); - } - break; - - case T_CaseExpr: - { - CaseExpr *caseexpr = (CaseExpr *) 
node; - ListCell *temp; - - appendContextKeyword(context, "CASE", - 0, PRETTYINDENT_VAR, 0); - if (caseexpr->arg) - { - appendStringInfoChar(buf, ' '); - get_rule_expr((Node *) caseexpr->arg, context, true); - } - foreach(temp, caseexpr->args) - { - CaseWhen *when = (CaseWhen *) lfirst(temp); - Node *w = (Node *) when->expr; - - if (caseexpr->arg) - { - /* - * The parser should have produced WHEN clauses of the - * form "CaseTestExpr = RHS", possibly with an - * implicit coercion inserted above the CaseTestExpr. - * For accurate decompilation of rules it's essential - * that we show just the RHS. However in an - * expression that's been through the optimizer, the - * WHEN clause could be almost anything (since the - * equality operator could have been expanded into an - * inline function). If we don't recognize the form - * of the WHEN clause, just punt and display it as-is. - */ - if (IsA(w, OpExpr)) - { - List *args = ((OpExpr *) w)->args; - - if (list_length(args) == 2 && - IsA(strip_implicit_coercions(linitial(args)), - CaseTestExpr)) - w = (Node *) lsecond(args); - } - } - - if (!PRETTY_INDENT(context)) - appendStringInfoChar(buf, ' '); - appendContextKeyword(context, "WHEN ", - 0, 0, 0); - get_rule_expr(w, context, false); - appendStringInfoString(buf, " THEN "); - get_rule_expr((Node *) when->result, context, true); - } - if (!PRETTY_INDENT(context)) - appendStringInfoChar(buf, ' '); - appendContextKeyword(context, "ELSE ", - 0, 0, 0); - get_rule_expr((Node *) caseexpr->defresult, context, true); - if (!PRETTY_INDENT(context)) - appendStringInfoChar(buf, ' '); - appendContextKeyword(context, "END", - -PRETTYINDENT_VAR, 0, 0); - } - break; - - case T_CaseTestExpr: - { - /* - * Normally we should never get here, since for expressions - * that can contain this node type we attempt to avoid - * recursing to it. But in an optimized expression we might - * be unable to avoid that (see comments for CaseExpr). If we - * do see one, print it as CASE_TEST_EXPR. - */ - appendStringInfoString(buf, "CASE_TEST_EXPR"); - } - break; - - case T_ArrayExpr: - { - ArrayExpr *arrayexpr = (ArrayExpr *) node; - - appendStringInfoString(buf, "ARRAY["); - get_rule_expr((Node *) arrayexpr->elements, context, true); - appendStringInfoChar(buf, ']'); - - /* - * If the array isn't empty, we assume its elements are - * coerced to the desired type. If it's empty, though, we - * need an explicit coercion to the array type. - */ - if (arrayexpr->elements == NIL) - appendStringInfo(buf, "::%s", - format_type_with_typemod(arrayexpr->array_typeid, -1)); - } - break; - - case T_RowExpr: - { - RowExpr *rowexpr = (RowExpr *) node; - TupleDesc tupdesc = NULL; - ListCell *arg; - int i; - char *sep; - - /* - * If it's a named type and not RECORD, we may have to skip - * dropped columns and/or claim there are NULLs for added - * columns. - */ - if (rowexpr->row_typeid != RECORDOID) - { - tupdesc = lookup_rowtype_tupdesc(rowexpr->row_typeid, -1); - Assert(list_length(rowexpr->args) <= tupdesc->natts); - } - - /* - * SQL99 allows "ROW" to be omitted when there is more than - * one column, but for simplicity we always print it. 
- */ - appendStringInfoString(buf, "ROW("); - sep = ""; - i = 0; - foreach(arg, rowexpr->args) - { - Node *e = (Node *) lfirst(arg); - - if (tupdesc == NULL || - !tupdesc->attrs[i]->attisdropped) - { - appendStringInfoString(buf, sep); - /* Whole-row Vars need special treatment here */ - get_rule_expr_toplevel(e, context, true); - sep = ", "; - } - i++; - } - if (tupdesc != NULL) - { - while (i < tupdesc->natts) - { - if (!tupdesc->attrs[i]->attisdropped) - { - appendStringInfoString(buf, sep); - appendStringInfoString(buf, "NULL"); - sep = ", "; - } - i++; - } - - ReleaseTupleDesc(tupdesc); - } - appendStringInfoChar(buf, ')'); - if (rowexpr->row_format == COERCE_EXPLICIT_CAST) - appendStringInfo(buf, "::%s", - format_type_with_typemod(rowexpr->row_typeid, -1)); - } - break; - - case T_RowCompareExpr: - { - RowCompareExpr *rcexpr = (RowCompareExpr *) node; - ListCell *arg; - char *sep; - - /* - * SQL99 allows "ROW" to be omitted when there is more than - * one column, but for simplicity we always print it. - */ - appendStringInfoString(buf, "(ROW("); - sep = ""; - foreach(arg, rcexpr->largs) - { - Node *e = (Node *) lfirst(arg); - - appendStringInfoString(buf, sep); - get_rule_expr(e, context, true); - sep = ", "; - } - - /* - * We assume that the name of the first-column operator will - * do for all the rest too. This is definitely open to - * failure, eg if some but not all operators were renamed - * since the construct was parsed, but there seems no way to - * be perfect. - */ - appendStringInfo(buf, ") %s ROW(", - generate_operator_name(linitial_oid(rcexpr->opnos), - exprType(linitial(rcexpr->largs)), - exprType(linitial(rcexpr->rargs)))); - sep = ""; - foreach(arg, rcexpr->rargs) - { - Node *e = (Node *) lfirst(arg); - - appendStringInfoString(buf, sep); - get_rule_expr(e, context, true); - sep = ", "; - } - appendStringInfoString(buf, "))"); - } - break; - - case T_CoalesceExpr: - { - CoalesceExpr *coalesceexpr = (CoalesceExpr *) node; - - appendStringInfoString(buf, "COALESCE("); - get_rule_expr((Node *) coalesceexpr->args, context, true); - appendStringInfoChar(buf, ')'); - } - break; - - case T_MinMaxExpr: - { - MinMaxExpr *minmaxexpr = (MinMaxExpr *) node; - - switch (minmaxexpr->op) - { - case IS_GREATEST: - appendStringInfoString(buf, "GREATEST("); - break; - case IS_LEAST: - appendStringInfoString(buf, "LEAST("); - break; - } - get_rule_expr((Node *) minmaxexpr->args, context, true); - appendStringInfoChar(buf, ')'); - } - break; - - case T_SQLValueFunction: - { - SQLValueFunction *svf = (SQLValueFunction *) node; - - /* - * Note: this code knows that typmod for time, timestamp, and - * timestamptz just prints as integer. 
- */ - switch (svf->op) - { - case SVFOP_CURRENT_DATE: - appendStringInfoString(buf, "CURRENT_DATE"); - break; - case SVFOP_CURRENT_TIME: - appendStringInfoString(buf, "CURRENT_TIME"); - break; - case SVFOP_CURRENT_TIME_N: - appendStringInfo(buf, "CURRENT_TIME(%d)", svf->typmod); - break; - case SVFOP_CURRENT_TIMESTAMP: - appendStringInfoString(buf, "CURRENT_TIMESTAMP"); - break; - case SVFOP_CURRENT_TIMESTAMP_N: - appendStringInfo(buf, "CURRENT_TIMESTAMP(%d)", - svf->typmod); - break; - case SVFOP_LOCALTIME: - appendStringInfoString(buf, "LOCALTIME"); - break; - case SVFOP_LOCALTIME_N: - appendStringInfo(buf, "LOCALTIME(%d)", svf->typmod); - break; - case SVFOP_LOCALTIMESTAMP: - appendStringInfoString(buf, "LOCALTIMESTAMP"); - break; - case SVFOP_LOCALTIMESTAMP_N: - appendStringInfo(buf, "LOCALTIMESTAMP(%d)", - svf->typmod); - break; - case SVFOP_CURRENT_ROLE: - appendStringInfoString(buf, "CURRENT_ROLE"); - break; - case SVFOP_CURRENT_USER: - appendStringInfoString(buf, "CURRENT_USER"); - break; - case SVFOP_USER: - appendStringInfoString(buf, "USER"); - break; - case SVFOP_SESSION_USER: - appendStringInfoString(buf, "SESSION_USER"); - break; - case SVFOP_CURRENT_CATALOG: - appendStringInfoString(buf, "CURRENT_CATALOG"); - break; - case SVFOP_CURRENT_SCHEMA: - appendStringInfoString(buf, "CURRENT_SCHEMA"); - break; - } - } - break; - - case T_NextValueExpr: - { - /* - * This gets invoked by Fast Query Shipping code to deparse a - * query. It seems enough to just generate a "DEFAULT" clause - * and let the remote datanode handle finding the correct - * sequence for replica identity. - * - * XXX PG10MERGE: If we do see issues with this, it might be - * worthwhile to consider generating an expression such as, - * nextval('sequence_name'::regclass) - */ - appendStringInfoString(buf, "DEFAULT"); - } - break; - - case T_XmlExpr: - { - XmlExpr *xexpr = (XmlExpr *) node; - bool needcomma = false; - ListCell *arg; - ListCell *narg; - Const *con; - - switch (xexpr->op) - { - case IS_XMLCONCAT: - appendStringInfoString(buf, "XMLCONCAT("); - break; - case IS_XMLELEMENT: - appendStringInfoString(buf, "XMLELEMENT("); - break; - case IS_XMLFOREST: - appendStringInfoString(buf, "XMLFOREST("); - break; - case IS_XMLPARSE: - appendStringInfoString(buf, "XMLPARSE("); - break; - case IS_XMLPI: - appendStringInfoString(buf, "XMLPI("); - break; - case IS_XMLROOT: - appendStringInfoString(buf, "XMLROOT("); - break; - case IS_XMLSERIALIZE: - appendStringInfoString(buf, "XMLSERIALIZE("); - break; - case IS_DOCUMENT: - break; - } - if (xexpr->op == IS_XMLPARSE || xexpr->op == IS_XMLSERIALIZE) - { - if (xexpr->xmloption == XMLOPTION_DOCUMENT) - appendStringInfoString(buf, "DOCUMENT "); - else - appendStringInfoString(buf, "CONTENT "); - } - if (xexpr->name) - { - appendStringInfo(buf, "NAME %s", - quote_identifier(map_xml_name_to_sql_identifier(xexpr->name))); - needcomma = true; - } - if (xexpr->named_args) - { - if (xexpr->op != IS_XMLFOREST) - { - if (needcomma) - appendStringInfoString(buf, ", "); - appendStringInfoString(buf, "XMLATTRIBUTES("); - needcomma = false; - } - forboth(arg, xexpr->named_args, narg, xexpr->arg_names) - { - Node *e = (Node *) lfirst(arg); - char *argname = strVal(lfirst(narg)); - - if (needcomma) - appendStringInfoString(buf, ", "); - get_rule_expr((Node *) e, context, true); - appendStringInfo(buf, " AS %s", - quote_identifier(map_xml_name_to_sql_identifier(argname))); - needcomma = true; - } - if (xexpr->op != IS_XMLFOREST) - appendStringInfoChar(buf, ')'); - } - if (xexpr->args) - 
{ - if (needcomma) - appendStringInfoString(buf, ", "); - switch (xexpr->op) - { - case IS_XMLCONCAT: - case IS_XMLELEMENT: - case IS_XMLFOREST: - case IS_XMLPI: - case IS_XMLSERIALIZE: - /* no extra decoration needed */ - get_rule_expr((Node *) xexpr->args, context, true); - break; - case IS_XMLPARSE: - Assert(list_length(xexpr->args) == 2); - - get_rule_expr((Node *) linitial(xexpr->args), - context, true); - - con = lsecond_node(Const, xexpr->args); - Assert(!con->constisnull); - if (DatumGetBool(con->constvalue)) - appendStringInfoString(buf, - " PRESERVE WHITESPACE"); - else - appendStringInfoString(buf, - " STRIP WHITESPACE"); - break; - case IS_XMLROOT: - Assert(list_length(xexpr->args) == 3); - - get_rule_expr((Node *) linitial(xexpr->args), - context, true); - - appendStringInfoString(buf, ", VERSION "); - con = (Const *) lsecond(xexpr->args); - if (IsA(con, Const) && - con->constisnull) - appendStringInfoString(buf, "NO VALUE"); - else - get_rule_expr((Node *) con, context, false); - - con = lthird_node(Const, xexpr->args); - if (con->constisnull) - /* suppress STANDALONE NO VALUE */ ; - else - { - switch (DatumGetInt32(con->constvalue)) - { - case XML_STANDALONE_YES: - appendStringInfoString(buf, - ", STANDALONE YES"); - break; - case XML_STANDALONE_NO: - appendStringInfoString(buf, - ", STANDALONE NO"); - break; - case XML_STANDALONE_NO_VALUE: - appendStringInfoString(buf, - ", STANDALONE NO VALUE"); - break; - default: - break; - } - } - break; - case IS_DOCUMENT: - get_rule_expr_paren((Node *) xexpr->args, context, false, node); - break; - } - - } - if (xexpr->op == IS_XMLSERIALIZE) - appendStringInfo(buf, " AS %s", - format_type_with_typemod(xexpr->type, - xexpr->typmod)); - if (xexpr->op == IS_DOCUMENT) - appendStringInfoString(buf, " IS DOCUMENT"); - else - appendStringInfoChar(buf, ')'); - } - break; - - case T_NullTest: - { - NullTest *ntest = (NullTest *) node; - - if (!PRETTY_PAREN(context)) - appendStringInfoChar(buf, '('); - get_rule_expr_paren((Node *) ntest->arg, context, true, node); - - /* - * For scalar inputs, we prefer to print as IS [NOT] NULL, - * which is shorter and traditional. If it's a rowtype input - * but we're applying a scalar test, must print IS [NOT] - * DISTINCT FROM NULL to be semantically correct. 
- */ - if (ntest->argisrow || - !type_is_rowtype(exprType((Node *) ntest->arg))) - { - switch (ntest->nulltesttype) - { - case IS_NULL: - appendStringInfoString(buf, " IS NULL"); - break; - case IS_NOT_NULL: - appendStringInfoString(buf, " IS NOT NULL"); - break; - default: - elog(ERROR, "unrecognized nulltesttype: %d", - (int) ntest->nulltesttype); - } - } - else - { - switch (ntest->nulltesttype) - { - case IS_NULL: - appendStringInfoString(buf, " IS NOT DISTINCT FROM NULL"); - break; - case IS_NOT_NULL: - appendStringInfoString(buf, " IS DISTINCT FROM NULL"); - break; - default: - elog(ERROR, "unrecognized nulltesttype: %d", - (int) ntest->nulltesttype); - } - } - if (!PRETTY_PAREN(context)) - appendStringInfoChar(buf, ')'); - } - break; - - case T_BooleanTest: - { - BooleanTest *btest = (BooleanTest *) node; - - if (!PRETTY_PAREN(context)) - appendStringInfoChar(buf, '('); - get_rule_expr_paren((Node *) btest->arg, context, false, node); - switch (btest->booltesttype) - { - case IS_TRUE: - appendStringInfoString(buf, " IS TRUE"); - break; - case IS_NOT_TRUE: - appendStringInfoString(buf, " IS NOT TRUE"); - break; - case IS_FALSE: - appendStringInfoString(buf, " IS FALSE"); - break; - case IS_NOT_FALSE: - appendStringInfoString(buf, " IS NOT FALSE"); - break; - case IS_UNKNOWN: - appendStringInfoString(buf, " IS UNKNOWN"); - break; - case IS_NOT_UNKNOWN: - appendStringInfoString(buf, " IS NOT UNKNOWN"); - break; - default: - elog(ERROR, "unrecognized booltesttype: %d", - (int) btest->booltesttype); - } - if (!PRETTY_PAREN(context)) - appendStringInfoChar(buf, ')'); - } - break; - - case T_CoerceToDomain: - { - CoerceToDomain *ctest = (CoerceToDomain *) node; - Node *arg = (Node *) ctest->arg; - - if (ctest->coercionformat == COERCE_IMPLICIT_CAST && - !showimplicit) - { - /* don't show the implicit cast */ - get_rule_expr(arg, context, false); - } - else - { - get_coercion_expr(arg, context, - ctest->resulttype, - ctest->resulttypmod, - node); - } - } - break; - - case T_CoerceToDomainValue: - appendStringInfoString(buf, "VALUE"); - break; - - case T_SetToDefault: - appendStringInfoString(buf, "DEFAULT"); - break; - - case T_CurrentOfExpr: - { - CurrentOfExpr *cexpr = (CurrentOfExpr *) node; - - if (cexpr->cursor_name) - appendStringInfo(buf, "CURRENT OF %s", - quote_identifier(cexpr->cursor_name)); - else - appendStringInfo(buf, "CURRENT OF $%d", - cexpr->cursor_param); - } - break; - - case T_InferenceElem: - { - InferenceElem *iexpr = (InferenceElem *) node; - bool save_varprefix; - bool need_parens; - - /* - * InferenceElem can only refer to target relation, so a - * prefix is not useful, and indeed would cause parse errors. - */ - save_varprefix = context->varprefix; - context->varprefix = false; - - /* - * Parenthesize the element unless it's a simple Var or a bare - * function call. Follows pg_get_indexdef_worker(). 
- */ - need_parens = !IsA(iexpr->expr, Var); - if (IsA(iexpr->expr, FuncExpr) && - ((FuncExpr *) iexpr->expr)->funcformat == - COERCE_EXPLICIT_CALL) - need_parens = false; - - if (need_parens) - appendStringInfoChar(buf, '('); - get_rule_expr((Node *) iexpr->expr, - context, false); - if (need_parens) - appendStringInfoChar(buf, ')'); - - context->varprefix = save_varprefix; - - if (iexpr->infercollid) - appendStringInfo(buf, " COLLATE %s", - generate_collation_name(iexpr->infercollid)); - - /* Add the operator class name, if not default */ - if (iexpr->inferopclass) - { - Oid inferopclass = iexpr->inferopclass; - Oid inferopcinputtype = get_opclass_input_type(iexpr->inferopclass); - - get_opclass_name(inferopclass, inferopcinputtype, buf); - } - } - break; - - case T_PartitionBoundSpec: - { - PartitionBoundSpec *spec = (PartitionBoundSpec *) node; - ListCell *cell; - char *sep; - - switch (spec->strategy) - { - case PARTITION_STRATEGY_LIST: - Assert(spec->listdatums != NIL); - - appendStringInfoString(buf, "FOR VALUES IN ("); - sep = ""; - foreach(cell, spec->listdatums) - { - Const *val = castNode(Const, lfirst(cell)); - - appendStringInfoString(buf, sep); - get_const_expr(val, context, -1); - sep = ", "; - } - - appendStringInfoString(buf, ")"); - break; - - case PARTITION_STRATEGY_RANGE: - Assert(spec->lowerdatums != NIL && - spec->upperdatums != NIL && - list_length(spec->lowerdatums) == - list_length(spec->upperdatums)); - - appendStringInfo(buf, "FOR VALUES FROM %s TO %s", - get_range_partbound_string(spec->lowerdatums), - get_range_partbound_string(spec->upperdatums)); - break; - - default: - elog(ERROR, "unrecognized partition strategy: %d", - (int) spec->strategy); - break; - } - } - break; - - case T_List: - { - char *sep; - ListCell *l; - - sep = ""; - foreach(l, (List *) node) - { - appendStringInfoString(buf, sep); - get_rule_expr((Node *) lfirst(l), context, showimplicit); - sep = ", "; - } - } - break; - - case T_TableFunc: - get_tablefunc((TableFunc *) node, context, showimplicit); - break; - - default: - elog(ERROR, "unrecognized node type: %d", (int) nodeTag(node)); - break; - } -} - -/* - * get_rule_expr_toplevel - Parse back a toplevel expression - * - * Same as get_rule_expr(), except that if the expr is just a Var, we pass - * istoplevel = true not false to get_variable(). This causes whole-row Vars - * to get printed with decoration that will prevent expansion of "*". - * We need to use this in contexts such as ROW() and VALUES(), where the - * parser would expand "foo.*" appearing at top level. (In principle we'd - * use this in get_target_list() too, but that has additional worries about - * whether to print AS, so it needs to invoke get_variable() directly anyway.) - */ -static void -get_rule_expr_toplevel(Node *node, deparse_context *context, - bool showimplicit) -{ - if (node && IsA(node, Var)) - (void) get_variable((Var *) node, 0, true, context); - else - get_rule_expr(node, context, showimplicit); -} - -/* - * get_rule_expr_funccall - Parse back a function-call expression - * - * Same as get_rule_expr(), except that we guarantee that the output will - * look like a function call, or like one of the things the grammar treats as - * equivalent to a function call (see the func_expr_windowless production). - * This is needed in places where the grammar uses func_expr_windowless and - * you can't substitute a parenthesized a_expr. 
If what we have isn't going - * to look like a function call, wrap it in a dummy CAST() expression, which - * will satisfy the grammar --- and, indeed, is likely what the user wrote to - * produce such a thing. - */ -static void -get_rule_expr_funccall(Node *node, deparse_context *context, - bool showimplicit) -{ - if (looks_like_function(node)) - get_rule_expr(node, context, showimplicit); - else - { - StringInfo buf = context->buf; - - appendStringInfoString(buf, "CAST("); - /* no point in showing any top-level implicit cast */ - get_rule_expr(node, context, false); - appendStringInfo(buf, " AS %s)", - format_type_with_typemod(exprType(node), - exprTypmod(node))); - } -} - -/* - * Helper function to identify node types that satisfy func_expr_windowless. - * If in doubt, "false" is always a safe answer. - */ -static bool -looks_like_function(Node *node) -{// #lizard forgives - if (node == NULL) - return false; /* probably shouldn't happen */ - switch (nodeTag(node)) - { - case T_FuncExpr: - /* OK, unless it's going to deparse as a cast */ - return (((FuncExpr *) node)->funcformat == COERCE_EXPLICIT_CALL); - case T_NullIfExpr: - case T_CoalesceExpr: - case T_MinMaxExpr: - case T_SQLValueFunction: - case T_XmlExpr: - /* these are all accepted by func_expr_common_subexpr */ - return true; - default: - break; - } - return false; -} - - -/* - * get_oper_expr - Parse back an OpExpr node - */ -static void -get_oper_expr(OpExpr *expr, deparse_context *context) -{ - StringInfo buf = context->buf; - Oid opno = expr->opno; - List *args = expr->args; - - if (!PRETTY_PAREN(context)) - appendStringInfoChar(buf, '('); - if (list_length(args) == 2) - { - /* binary operator */ - Node *arg1 = (Node *) linitial(args); - Node *arg2 = (Node *) lsecond(args); - - get_rule_expr_paren(arg1, context, true, (Node *) expr); - appendStringInfo(buf, " %s ", - generate_operator_name(opno, - exprType(arg1), - exprType(arg2))); - get_rule_expr_paren(arg2, context, true, (Node *) expr); - } - else - { - /* unary operator --- but which side? */ - Node *arg = (Node *) linitial(args); - HeapTuple tp; - Form_pg_operator optup; - - tp = SearchSysCache1(OPEROID, ObjectIdGetDatum(opno)); - if (!HeapTupleIsValid(tp)) - elog(ERROR, "cache lookup failed for operator %u", opno); - optup = (Form_pg_operator) GETSTRUCT(tp); - switch (optup->oprkind) - { - case 'l': - appendStringInfo(buf, "%s ", - generate_operator_name(opno, - InvalidOid, - exprType(arg))); - get_rule_expr_paren(arg, context, true, (Node *) expr); - break; - case 'r': - get_rule_expr_paren(arg, context, true, (Node *) expr); - appendStringInfo(buf, " %s", - generate_operator_name(opno, - exprType(arg), - InvalidOid)); - break; - default: - elog(ERROR, "bogus oprkind: %d", optup->oprkind); - } - ReleaseSysCache(tp); - } - if (!PRETTY_PAREN(context)) - appendStringInfoChar(buf, ')'); -} - -/* - * get_func_expr - Parse back a FuncExpr node - */ -static void -get_func_expr(FuncExpr *expr, deparse_context *context, - bool showimplicit) -{// #lizard forgives - StringInfo buf = context->buf; - Oid funcoid = expr->funcid; - Oid argtypes[FUNC_MAX_ARGS]; - int nargs; - List *argnames; - bool use_variadic; - ListCell *l; - - /* - * If the function call came from an implicit coercion, then just show the - * first argument --- unless caller wants to see implicit coercions. 
- */ - if (expr->funcformat == COERCE_IMPLICIT_CAST && !showimplicit) - { - get_rule_expr_paren((Node *) linitial(expr->args), context, - false, (Node *) expr); - return; - } - - /* - * If the function call came from a cast, then show the first argument - * plus an explicit cast operation. - */ - if (expr->funcformat == COERCE_EXPLICIT_CAST || - expr->funcformat == COERCE_IMPLICIT_CAST) - { - Node *arg = linitial(expr->args); - Oid rettype = expr->funcresulttype; - int32 coercedTypmod; - - /* Get the typmod if this is a length-coercion function */ - (void) exprIsLengthCoercion((Node *) expr, &coercedTypmod); - - get_coercion_expr(arg, context, - rettype, coercedTypmod, - (Node *) expr); - - return; - } - - /* - * Normal function: display as proname(args). First we need to extract - * the argument datatypes. - */ - if (list_length(expr->args) > FUNC_MAX_ARGS) - ereport(ERROR, - (errcode(ERRCODE_TOO_MANY_ARGUMENTS), - errmsg("too many arguments"))); - nargs = 0; - argnames = NIL; - foreach(l, expr->args) - { - Node *arg = (Node *) lfirst(l); - - if (IsA(arg, NamedArgExpr)) - argnames = lappend(argnames, ((NamedArgExpr *) arg)->name); - argtypes[nargs] = exprType(arg); - nargs++; - } - - appendStringInfo(buf, "%s(", - generate_function_name(funcoid, nargs, - argnames, argtypes, - expr->funcvariadic, - &use_variadic, - context->special_exprkind)); - nargs = 0; - foreach(l, expr->args) - { - if (nargs++ > 0) - appendStringInfoString(buf, ", "); - if (use_variadic && lnext(l) == NULL) - appendStringInfoString(buf, "VARIADIC "); - get_rule_expr((Node *) lfirst(l), context, true); - } - appendStringInfoChar(buf, ')'); -} - -/* - * get_agg_expr - Parse back an Aggref node - */ -static void -get_agg_expr(Aggref *aggref, deparse_context *context, - Aggref *original_aggref) -{// #lizard forgives - StringInfo buf = context->buf; - Oid argtypes[FUNC_MAX_ARGS]; - int nargs; -#ifdef PGXC -// bool added_finalfn = false; -#endif /* PGXC */ - - bool use_variadic; - - /* - * For a combining aggregate, we look up and deparse the corresponding - * partial aggregate instead. This is necessary because our input - * argument list has been replaced; the new argument list always has just - * one element, which will point to a partial Aggref that supplies us with - * transition states to combine. - */ - if (DO_AGGSPLIT_COMBINE(aggref->aggsplit)) - { - TargetEntry *tle = linitial_node(TargetEntry, aggref->args); - - Assert(list_length(aggref->args) == 1); - resolve_special_varno((Node *) tle->expr, context, original_aggref, - get_agg_combine_expr); - return; - } - - /* - * Mark as PARTIAL, if appropriate. We look to the original aggref so as - * to avoid printing this when recursing from the code just above. - */ - if (DO_AGGSPLIT_SKIPFINAL(original_aggref->aggsplit)) - appendStringInfoString(buf, "PARTIAL "); - - /* Extract the argument types as seen by the parser */ - nargs = get_aggregate_argtypes(aggref, argtypes); - - /* Print the aggregate name, schema-qualified if needed */ - appendStringInfo(buf, "%s(%s", - generate_function_name(aggref->aggfnoid, nargs, - NIL, argtypes, - aggref->aggvariadic, - &use_variadic, - context->special_exprkind), - (aggref->aggdistinct != NIL) ? "DISTINCT " : ""); - - if (AGGKIND_IS_ORDERED_SET(aggref->aggkind)) - { - /* - * Ordered-set aggregates do not use "*" syntax. Also, we needn't - * worry about inserting VARIADIC. So we can just dump the direct - * args as-is. 
- */ - Assert(!aggref->aggvariadic); - get_rule_expr((Node *) aggref->aggdirectargs, context, true); - Assert(aggref->aggorder != NIL); - appendStringInfoString(buf, ") WITHIN GROUP (ORDER BY "); - get_rule_orderby(aggref->aggorder, aggref->args, false, context); - } - else - { - /* aggstar can be set only in zero-argument aggregates */ - if (aggref->aggstar) - appendStringInfoChar(buf, '*'); - else - { - ListCell *l; - int i; - - i = 0; - foreach(l, aggref->args) - { - TargetEntry *tle = (TargetEntry *) lfirst(l); - Node *arg = (Node *) tle->expr; - - Assert(!IsA(arg, NamedArgExpr)); - if (tle->resjunk) - continue; - if (i++ > 0) - appendStringInfoString(buf, ", "); - if (use_variadic && i == nargs) - appendStringInfoString(buf, "VARIADIC "); - get_rule_expr(arg, context, true); - } - } - - if (aggref->aggorder != NIL) - { - appendStringInfoString(buf, " ORDER BY "); - get_rule_orderby(aggref->aggorder, aggref->args, false, context); - } - } - - if (aggref->aggfilter != NULL) - { - appendStringInfoString(buf, ") FILTER (WHERE "); - get_rule_expr((Node *) aggref->aggfilter, context, false); - } - - appendStringInfoChar(buf, ')'); - -} - -/* - * This is a helper function for get_agg_expr(). It's used when we deparse - * a combining Aggref; resolve_special_varno locates the corresponding partial - * Aggref and then calls this. - */ -static void -get_agg_combine_expr(Node *node, deparse_context *context, void *private) -{ - Aggref *aggref; - Aggref *original_aggref = private; - - if (!IsA(node, Aggref)) - elog(ERROR, "combining Aggref does not point to an Aggref"); - - aggref = (Aggref *) node; - get_agg_expr(aggref, context, original_aggref); -} - -/* - * get_windowfunc_expr - Parse back a WindowFunc node - */ -static void -get_windowfunc_expr(WindowFunc *wfunc, deparse_context *context) -{// #lizard forgives - StringInfo buf = context->buf; - Oid argtypes[FUNC_MAX_ARGS]; - int nargs; - List *argnames; - ListCell *l; - - if (list_length(wfunc->args) > FUNC_MAX_ARGS) - ereport(ERROR, - (errcode(ERRCODE_TOO_MANY_ARGUMENTS), - errmsg("too many arguments"))); - nargs = 0; - argnames = NIL; - foreach(l, wfunc->args) - { - Node *arg = (Node *) lfirst(l); - - if (IsA(arg, NamedArgExpr)) - argnames = lappend(argnames, ((NamedArgExpr *) arg)->name); - argtypes[nargs] = exprType(arg); - nargs++; - } - - appendStringInfo(buf, "%s(", - generate_function_name(wfunc->winfnoid, nargs, - argnames, argtypes, - false, NULL, - context->special_exprkind)); - /* winstar can be set only in zero-argument aggregates */ - if (wfunc->winstar) - appendStringInfoChar(buf, '*'); - else - get_rule_expr((Node *) wfunc->args, context, true); - - if (wfunc->aggfilter != NULL) - { - appendStringInfoString(buf, ") FILTER (WHERE "); - get_rule_expr((Node *) wfunc->aggfilter, context, false); - } - - appendStringInfoString(buf, ") OVER "); - - foreach(l, context->windowClause) - { - WindowClause *wc = (WindowClause *) lfirst(l); - - if (wc->winref == wfunc->winref) - { - if (wc->name) - appendStringInfoString(buf, quote_identifier(wc->name)); - else - get_rule_windowspec(wc, context->windowTList, context); - break; - } - } - if (l == NULL) - { - if (context->windowClause) - elog(ERROR, "could not find window clause for winref %u", - wfunc->winref); - - /* - * In EXPLAIN, we don't have window context information available, so - * we have to settle for this: - */ - appendStringInfoString(buf, "(?)"); - } -} - -/* ---------- - * get_coercion_expr - * - * Make a string representation of a value coerced to a specific type - * 
---------- - */ -static void -get_coercion_expr(Node *arg, deparse_context *context, - Oid resulttype, int32 resulttypmod, - Node *parentNode) -{ - StringInfo buf = context->buf; - - /* - * Since parse_coerce.c doesn't immediately collapse application of - * length-coercion functions to constants, what we'll typically see in - * such cases is a Const with typmod -1 and a length-coercion function - * right above it. Avoid generating redundant output. However, beware of - * suppressing casts when the user actually wrote something like - * 'foo'::text::char(3). - * - * Note: it might seem that we are missing the possibility of needing to - * print a COLLATE clause for such a Const. However, a Const could only - * have nondefault collation in a post-constant-folding tree, in which the - * length coercion would have been folded too. See also the special - * handling of CollateExpr in coerce_to_target_type(): any collation - * marking will be above the coercion node, not below it. - */ - if (arg && IsA(arg, Const) && - ((Const *) arg)->consttype == resulttype && - ((Const *) arg)->consttypmod == -1) - { - /* Show the constant without normal ::typename decoration */ - get_const_expr((Const *) arg, context, -1); - } - else - { - if (!PRETTY_PAREN(context)) - appendStringInfoChar(buf, '('); - get_rule_expr_paren(arg, context, false, parentNode); - if (!PRETTY_PAREN(context)) - appendStringInfoChar(buf, ')'); - } - appendStringInfo(buf, "::%s", - format_type_with_typemod(resulttype, resulttypmod)); -} - -/* ---------- - * get_const_expr - * - * Make a string representation of a Const - * - * showtype can be -1 to never show "::typename" decoration, or +1 to always - * show it, or 0 to show it only if the constant wouldn't be assumed to be - * the right type by default. - * - * If the Const's collation isn't default for its type, show that too. - * We mustn't do this when showtype is -1 (since that means the caller will - * print "::typename", and we can't put a COLLATE clause in between). It's - * caller's responsibility that collation isn't missed in such cases. - * ---------- - */ -static void -get_const_expr(Const *constval, deparse_context *context, int showtype) -{// #lizard forgives - StringInfo buf = context->buf; - Oid typoutput; - bool typIsVarlena; - char *extval; - bool needlabel = false; - - if (constval->constisnull) - { - /* - * Always label the type of a NULL constant to prevent misdecisions - * about type when reparsing. - */ - appendStringInfoString(buf, "NULL"); - if (showtype >= 0) - { - appendStringInfo(buf, "::%s", - format_type_with_typemod(constval->consttype, - constval->consttypmod)); - get_const_collation(constval, context); - } - return; - } - - getTypeOutputInfo(constval->consttype, - &typoutput, &typIsVarlena); - - extval = OidOutputFunctionCall(typoutput, constval->constvalue); - - switch (constval->consttype) - { - case INT4OID: - - /* - * INT4 can be printed without any decoration, unless it is - * negative; in that case print it as '-nnn'::integer to ensure - * that the output will re-parse as a constant, not as a constant - * plus operator. In most cases we could get away with printing - * (-nnn) instead, because of the way that gram.y handles negative - * literals; but that doesn't work for INT_MIN, and it doesn't - * seem that much prettier anyway. 
- */ - if (extval[0] != '-') - appendStringInfoString(buf, extval); - else - { - appendStringInfo(buf, "'%s'", extval); - needlabel = true; /* we must attach a cast */ - } - break; - - case NUMERICOID: - - /* - * NUMERIC can be printed without quotes if it looks like a float - * constant (not an integer, and not Infinity or NaN) and doesn't - * have a leading sign (for the same reason as for INT4). - */ - if (isdigit((unsigned char) extval[0]) && - strcspn(extval, "eE.") != strlen(extval)) - { - appendStringInfoString(buf, extval); - } - else - { - appendStringInfo(buf, "'%s'", extval); - needlabel = true; /* we must attach a cast */ - } - break; - - case BITOID: - case VARBITOID: - appendStringInfo(buf, "B'%s'", extval); - break; - - case BOOLOID: - if (strcmp(extval, "t") == 0) - appendStringInfoString(buf, "true"); - else - appendStringInfoString(buf, "false"); - break; - - default: - simple_quote_literal(buf, extval); - break; - } - - pfree(extval); - - if (showtype < 0) - return; - - /* - * For showtype == 0, append ::typename unless the constant will be - * implicitly typed as the right type when it is read in. - * - * XXX this code has to be kept in sync with the behavior of the parser, - * especially make_const. - */ - switch (constval->consttype) - { - case BOOLOID: - case UNKNOWNOID: - /* These types can be left unlabeled */ - needlabel = false; - break; - case INT4OID: - /* We determined above whether a label is needed */ - break; - case NUMERICOID: - - /* - * Float-looking constants will be typed as numeric, which we - * checked above; but if there's a nondefault typmod we need to - * show it. - */ - needlabel |= (constval->consttypmod >= 0); - break; - default: - needlabel = true; - break; - } - if (needlabel || showtype > 0) - appendStringInfo(buf, "::%s", - format_type_with_typemod(constval->consttype, - constval->consttypmod)); - - get_const_collation(constval, context); -} - -/* - * helper for get_const_expr: append COLLATE if needed - */ -static void -get_const_collation(Const *constval, deparse_context *context) -{ - StringInfo buf = context->buf; - - if (OidIsValid(constval->constcollid)) - { - Oid typcollation = get_typcollation(constval->consttype); - - if (constval->constcollid != typcollation) - { - appendStringInfo(buf, " COLLATE %s", - generate_collation_name(constval->constcollid)); - } - } -} - -/* - * simple_quote_literal - Format a string as a SQL literal, append to buf - */ -static void -simple_quote_literal(StringInfo buf, const char *val) -{ - const char *valptr; - - /* - * We form the string literal according to the prevailing setting of - * standard_conforming_strings; we never use E''. User is responsible for - * making sure result is used correctly. - */ - appendStringInfoChar(buf, '\''); - for (valptr = val; *valptr; valptr++) - { - char ch = *valptr; - - if (SQL_STR_DOUBLE(ch, !standard_conforming_strings)) - appendStringInfoChar(buf, ch); - appendStringInfoChar(buf, ch); - } - appendStringInfoChar(buf, '\''); -} - - -/* ---------- - * get_sublink_expr - Parse back a sublink - * ---------- - */ -static void -get_sublink_expr(SubLink *sublink, deparse_context *context) -{// #lizard forgives - StringInfo buf = context->buf; - Query *query = (Query *) (sublink->subselect); - char *opname = NULL; - bool need_paren; - - if (sublink->subLinkType == ARRAY_SUBLINK) - appendStringInfoString(buf, "ARRAY("); - else - appendStringInfoChar(buf, '('); - - /* - * Note that we print the name of only the first operator, when there are - * multiple combining operators. 
This is an approximation that could go - * wrong in various scenarios (operators in different schemas, renamed - * operators, etc) but there is not a whole lot we can do about it, since - * the syntax allows only one operator to be shown. - */ - if (sublink->testexpr) - { - if (IsA(sublink->testexpr, OpExpr)) - { - /* single combining operator */ - OpExpr *opexpr = (OpExpr *) sublink->testexpr; - - get_rule_expr(linitial(opexpr->args), context, true); - opname = generate_operator_name(opexpr->opno, - exprType(linitial(opexpr->args)), - exprType(lsecond(opexpr->args))); - } - else if (IsA(sublink->testexpr, BoolExpr)) - { - /* multiple combining operators, = or <> cases */ - char *sep; - ListCell *l; - - appendStringInfoChar(buf, '('); - sep = ""; - foreach(l, ((BoolExpr *) sublink->testexpr)->args) - { - OpExpr *opexpr = lfirst_node(OpExpr, l); - - appendStringInfoString(buf, sep); - get_rule_expr(linitial(opexpr->args), context, true); - if (!opname) - opname = generate_operator_name(opexpr->opno, - exprType(linitial(opexpr->args)), - exprType(lsecond(opexpr->args))); - sep = ", "; - } - appendStringInfoChar(buf, ')'); - } - else if (IsA(sublink->testexpr, RowCompareExpr)) - { - /* multiple combining operators, < <= > >= cases */ - RowCompareExpr *rcexpr = (RowCompareExpr *) sublink->testexpr; - - appendStringInfoChar(buf, '('); - get_rule_expr((Node *) rcexpr->largs, context, true); - opname = generate_operator_name(linitial_oid(rcexpr->opnos), - exprType(linitial(rcexpr->largs)), - exprType(linitial(rcexpr->rargs))); - appendStringInfoChar(buf, ')'); - } - else - elog(ERROR, "unrecognized testexpr type: %d", - (int) nodeTag(sublink->testexpr)); - } - - need_paren = true; - - switch (sublink->subLinkType) - { - case EXISTS_SUBLINK: - appendStringInfoString(buf, "EXISTS "); - break; - - case ANY_SUBLINK: - if (strcmp(opname, "=") == 0) /* Represent = ANY as IN */ - appendStringInfoString(buf, " IN "); - else - appendStringInfo(buf, " %s ANY ", opname); - break; - - case ALL_SUBLINK: - appendStringInfo(buf, " %s ALL ", opname); - break; - - case ROWCOMPARE_SUBLINK: - appendStringInfo(buf, " %s ", opname); - break; - - case EXPR_SUBLINK: - case MULTIEXPR_SUBLINK: - case ARRAY_SUBLINK: - need_paren = false; - break; - - case CTE_SUBLINK: /* shouldn't occur in a SubLink */ - default: - elog(ERROR, "unrecognized sublink type: %d", - (int) sublink->subLinkType); - break; - } - - if (need_paren) - appendStringInfoChar(buf, '('); - - get_query_def(query, buf, context->namespaces, NULL, - context->prettyFlags, context->wrapColumn, - context->indentLevel, - context->finalise_aggs, - context->sortgroup_colno); - - if (need_paren) - appendStringInfoString(buf, "))"); - else - appendStringInfoChar(buf, ')'); -} - - -/* ---------- - * get_tablefunc - Parse back a table function - * ---------- - */ -static void -get_tablefunc(TableFunc *tf, deparse_context *context, bool showimplicit) -{// #lizard forgives - StringInfo buf = context->buf; - - /* XMLTABLE is the only existing implementation. 
*/ - - appendStringInfoString(buf, "XMLTABLE("); - - if (tf->ns_uris != NIL) - { - ListCell *lc1, - *lc2; - bool first = true; - - appendStringInfoString(buf, "XMLNAMESPACES ("); - forboth(lc1, tf->ns_uris, lc2, tf->ns_names) - { - Node *expr = (Node *) lfirst(lc1); - char *name = strVal(lfirst(lc2)); - - if (!first) - appendStringInfoString(buf, ", "); - else - first = false; - - if (name != NULL) - { - get_rule_expr(expr, context, showimplicit); - appendStringInfo(buf, " AS %s", name); - } - else - { - appendStringInfoString(buf, "DEFAULT "); - get_rule_expr(expr, context, showimplicit); - } - } - appendStringInfoString(buf, "), "); - } - - appendStringInfoChar(buf, '('); - get_rule_expr((Node *) tf->rowexpr, context, showimplicit); - appendStringInfoString(buf, ") PASSING ("); - get_rule_expr((Node *) tf->docexpr, context, showimplicit); - appendStringInfoChar(buf, ')'); - - if (tf->colexprs != NIL) - { - ListCell *l1; - ListCell *l2; - ListCell *l3; - ListCell *l4; - ListCell *l5; - int colnum = 0; - - l2 = list_head(tf->coltypes); - l3 = list_head(tf->coltypmods); - l4 = list_head(tf->colexprs); - l5 = list_head(tf->coldefexprs); - - appendStringInfoString(buf, " COLUMNS "); - foreach(l1, tf->colnames) - { - char *colname = strVal(lfirst(l1)); - Oid typid; - int32 typmod; - Node *colexpr; - Node *coldefexpr; - bool ordinality = tf->ordinalitycol == colnum; - bool notnull = bms_is_member(colnum, tf->notnulls); - - typid = lfirst_oid(l2); - l2 = lnext(l2); - typmod = lfirst_int(l3); - l3 = lnext(l3); - colexpr = (Node *) lfirst(l4); - l4 = lnext(l4); - coldefexpr = (Node *) lfirst(l5); - l5 = lnext(l5); - - if (colnum > 0) - appendStringInfoString(buf, ", "); - colnum++; - - appendStringInfo(buf, "%s %s", quote_identifier(colname), - ordinality ? "FOR ORDINALITY" : - format_type_with_typemod(typid, typmod)); - if (ordinality) - continue; - - if (coldefexpr != NULL) - { - appendStringInfoString(buf, " DEFAULT ("); - get_rule_expr((Node *) coldefexpr, context, showimplicit); - appendStringInfoChar(buf, ')'); - } - if (colexpr != NULL) - { - appendStringInfoString(buf, " PATH ("); - get_rule_expr((Node *) colexpr, context, showimplicit); - appendStringInfoChar(buf, ')'); - } - if (notnull) - appendStringInfoString(buf, " NOT NULL"); - } - } - - appendStringInfoChar(buf, ')'); -} - -/* ---------- - * get_from_clause - Parse back a FROM clause - * - * "prefix" is the keyword that denotes the start of the list of FROM - * elements. It is FROM when used to parse back SELECT and UPDATE, but - * is USING when parsing back DELETE. - * ---------- - */ -static void -get_from_clause(Query *query, const char *prefix, deparse_context *context) -{// #lizard forgives - StringInfo buf = context->buf; - bool first = true; - ListCell *l; - - /* - * We use the query's jointree as a guide to what to print. However, we - * must ignore auto-added RTEs that are marked not inFromCl. (These can - * only appear at the top level of the jointree, so it's sufficient to - * check here.) This check also ensures we ignore the rule pseudo-RTEs - * for NEW and OLD. 
- */ - foreach(l, query->jointree->fromlist) - { - Node *jtnode = (Node *) lfirst(l); - - if (IsA(jtnode, RangeTblRef)) - { - int varno = ((RangeTblRef *) jtnode)->rtindex; - RangeTblEntry *rte = rt_fetch(varno, query->rtable); - - if (!rte->inFromCl) - continue; - } - - if (first) - { - appendContextKeyword(context, prefix, - -PRETTYINDENT_STD, PRETTYINDENT_STD, 2); - first = false; - - get_from_clause_item(jtnode, query, context); - } - else - { - StringInfoData itembuf; - - appendStringInfoString(buf, ", "); - - /* - * Put the new FROM item's text into itembuf so we can decide - * after we've got it whether or not it needs to go on a new line. - */ - initStringInfo(&itembuf); - context->buf = &itembuf; - - get_from_clause_item(jtnode, query, context); - - /* Restore context's output buffer */ - context->buf = buf; - - /* Consider line-wrapping if enabled */ - if (PRETTY_INDENT(context) && context->wrapColumn >= 0) - { - /* Does the new item start with a new line? */ - if (itembuf.len > 0 && itembuf.data[0] == '\n') - { - /* If so, we shouldn't add anything */ - /* instead, remove any trailing spaces currently in buf */ - removeStringInfoSpaces(buf); - } - else - { - char *trailing_nl; - - /* Locate the start of the current line in the buffer */ - trailing_nl = strrchr(buf->data, '\n'); - if (trailing_nl == NULL) - trailing_nl = buf->data; - else - trailing_nl++; - - /* - * Add a newline, plus some indentation, if the new item - * would cause an overflow. - */ - if (strlen(trailing_nl) + itembuf.len > context->wrapColumn) - appendContextKeyword(context, "", -PRETTYINDENT_STD, - PRETTYINDENT_STD, - PRETTYINDENT_VAR); - } - } - - /* Add the new item */ - appendStringInfoString(buf, itembuf.data); - - /* clean up */ - pfree(itembuf.data); - } - } -} - -static void -get_from_clause_item(Node *jtnode, Query *query, deparse_context *context) -{// #lizard forgives - StringInfo buf = context->buf; - deparse_namespace *dpns = (deparse_namespace *) linitial(context->namespaces); - - if (IsA(jtnode, RangeTblRef)) - { - int varno = ((RangeTblRef *) jtnode)->rtindex; - RangeTblEntry *rte = rt_fetch(varno, query->rtable); - char *refname = get_rtable_name(varno, context); - deparse_columns *colinfo = deparse_columns_fetch(varno, dpns); - RangeTblFunction *rtfunc1 = NULL; - bool printalias; - - if (rte->lateral) - appendStringInfoString(buf, "LATERAL "); - - /* Print the FROM item proper */ - switch (rte->rtekind) - { - case RTE_RELATION: - /* Normal relation RTE */ - appendStringInfo(buf, "%s%s", - only_marker(rte), - generate_relation_name(rte->relid, - context->namespaces)); -#ifdef __TBASE__ - /* print for default partition */ - if (rte->intervalparent && rte->isdefault) - { - appendStringInfoString(buf, " PARTITION For Default "); - } -#endif - break; - case RTE_SUBQUERY: - /* Subquery RTE */ - appendStringInfoChar(buf, '('); - get_query_def(rte->subquery, buf, context->namespaces, NULL, - context->prettyFlags, context->wrapColumn, - context->indentLevel, - context->finalise_aggs, - context->sortgroup_colno); - appendStringInfoChar(buf, ')'); - break; - case RTE_FUNCTION: - /* Function RTE */ - rtfunc1 = (RangeTblFunction *) linitial(rte->functions); - - /* - * Omit ROWS FROM() syntax for just one function, unless it - * has both a coldeflist and WITH ORDINALITY. If it has both, - * we must use ROWS FROM() syntax to avoid ambiguity about - * whether the coldeflist includes the ordinality column. 
- */ - if (list_length(rte->functions) == 1 && - (rtfunc1->funccolnames == NIL || !rte->funcordinality)) - { - get_rule_expr_funccall(rtfunc1->funcexpr, context, true); - /* we'll print the coldeflist below, if it has one */ - } - else - { - bool all_unnest; - ListCell *lc; - - /* - * If all the function calls in the list are to unnest, - * and none need a coldeflist, then collapse the list back - * down to UNNEST(args). (If we had more than one - * built-in unnest function, this would get more - * difficult.) - * - * XXX This is pretty ugly, since it makes not-terribly- - * future-proof assumptions about what the parser would do - * with the output; but the alternative is to emit our - * nonstandard ROWS FROM() notation for what might have - * been a perfectly spec-compliant multi-argument - * UNNEST(). - */ - all_unnest = true; - foreach(lc, rte->functions) - { - RangeTblFunction *rtfunc = (RangeTblFunction *) lfirst(lc); - - if (!IsA(rtfunc->funcexpr, FuncExpr) || - ((FuncExpr *) rtfunc->funcexpr)->funcid != F_ARRAY_UNNEST || - rtfunc->funccolnames != NIL) - { - all_unnest = false; - break; - } - } - - if (all_unnest) - { - List *allargs = NIL; - - foreach(lc, rte->functions) - { - RangeTblFunction *rtfunc = (RangeTblFunction *) lfirst(lc); - List *args = ((FuncExpr *) rtfunc->funcexpr)->args; - - allargs = list_concat(allargs, list_copy(args)); - } - - appendStringInfoString(buf, "UNNEST("); - get_rule_expr((Node *) allargs, context, true); - appendStringInfoChar(buf, ')'); - } - else - { - int funcno = 0; - - appendStringInfoString(buf, "ROWS FROM("); - foreach(lc, rte->functions) - { - RangeTblFunction *rtfunc = (RangeTblFunction *) lfirst(lc); - - if (funcno > 0) - appendStringInfoString(buf, ", "); - get_rule_expr_funccall(rtfunc->funcexpr, context, true); - if (rtfunc->funccolnames != NIL) - { - /* Reconstruct the column definition list */ - appendStringInfoString(buf, " AS "); - get_from_clause_coldeflist(rtfunc, - NULL, - context); - } - funcno++; - } - appendStringInfoChar(buf, ')'); - } - /* prevent printing duplicate coldeflist below */ - rtfunc1 = NULL; - } - if (rte->funcordinality) - appendStringInfoString(buf, " WITH ORDINALITY"); - break; - case RTE_TABLEFUNC: - get_tablefunc(rte->tablefunc, context, true); - break; - case RTE_VALUES: - /* Values list RTE */ - appendStringInfoChar(buf, '('); - get_values_def(rte->values_lists, context); - appendStringInfoChar(buf, ')'); - break; - case RTE_CTE: - appendStringInfoString(buf, quote_identifier(rte->ctename)); - break; - default: - elog(ERROR, "unrecognized RTE kind: %d", (int) rte->rtekind); - break; - } - - /* Print the relation alias, if needed */ - printalias = false; - if (rte->alias != NULL) - { - /* Always print alias if user provided one */ - printalias = true; - } - else if (colinfo->printaliases) - { - /* Always print alias if we need to print column aliases */ - printalias = true; - } - else if (rte->rtekind == RTE_RELATION) - { - /* - * No need to print alias if it's same as relation name (this - * would normally be the case, but not if set_rtable_names had to - * resolve a conflict). - */ - if (strcmp(refname, get_relation_name(rte->relid)) != 0) - printalias = true; - } -#ifdef PGXC - else if (rte->rtekind == RTE_SUBQUERY && rte->eref->aliasname) - { - /* - * - * This condition arises when the from clause is a view. The - * corresponding subquery RTE has its eref set to view name. 
- * The remote query generated has this subquery of which the - * columns can be referred to as view_name.col1, so it should - * be possible to refer to this subquery object. - */ - appendStringInfo(buf, " %s", - quote_identifier(rte->eref->aliasname)); - printalias = true; - } -#endif - else if (rte->rtekind == RTE_FUNCTION) - { - /* - * For a function RTE, always print alias. This covers possible - * renaming of the function and/or instability of the - * FigureColname rules for things that aren't simple functions. - * Note we'd need to force it anyway for the columndef list case. - */ - printalias = true; - } - else if (rte->rtekind == RTE_VALUES) - { - /* Alias is syntactically required for VALUES */ - printalias = true; - } - else if (rte->rtekind == RTE_CTE) - { - /* - * No need to print alias if it's same as CTE name (this would - * normally be the case, but not if set_rtable_names had to - * resolve a conflict). - */ - if (strcmp(refname, rte->ctename) != 0) - printalias = true; - } - if (printalias) - appendStringInfo(buf, " %s", quote_identifier(refname)); - - /* Print the column definitions or aliases, if needed */ - if (rtfunc1 && rtfunc1->funccolnames != NIL) - { - /* Reconstruct the columndef list, which is also the aliases */ - get_from_clause_coldeflist(rtfunc1, colinfo, context); - } - else - { - /* Else print column aliases as needed */ - get_column_alias_list(colinfo, context); - } - - /* Tablesample clause must go after any alias */ - if (rte->rtekind == RTE_RELATION && rte->tablesample) - get_tablesample_def(rte->tablesample, context); - } - else if (IsA(jtnode, JoinExpr)) - { - JoinExpr *j = (JoinExpr *) jtnode; - deparse_columns *colinfo = deparse_columns_fetch(j->rtindex, dpns); - bool need_paren_on_right; - - need_paren_on_right = PRETTY_PAREN(context) && - !IsA(j->rarg, RangeTblRef) && - !(IsA(j->rarg, JoinExpr) &&((JoinExpr *) j->rarg)->alias != NULL); - - if (!PRETTY_PAREN(context) || j->alias != NULL) - appendStringInfoChar(buf, '('); - - get_from_clause_item(j->larg, query, context); - - switch (j->jointype) - { - case JOIN_INNER: - if (j->quals) - appendContextKeyword(context, " JOIN ", - -PRETTYINDENT_STD, - PRETTYINDENT_STD, - PRETTYINDENT_JOIN); - else - appendContextKeyword(context, " CROSS JOIN ", - -PRETTYINDENT_STD, - PRETTYINDENT_STD, - PRETTYINDENT_JOIN); - break; - case JOIN_LEFT: - appendContextKeyword(context, " LEFT JOIN ", - -PRETTYINDENT_STD, - PRETTYINDENT_STD, - PRETTYINDENT_JOIN); - break; - case JOIN_FULL: - appendContextKeyword(context, " FULL JOIN ", - -PRETTYINDENT_STD, - PRETTYINDENT_STD, - PRETTYINDENT_JOIN); - break; - case JOIN_RIGHT: - appendContextKeyword(context, " RIGHT JOIN ", - -PRETTYINDENT_STD, - PRETTYINDENT_STD, - PRETTYINDENT_JOIN); - break; - default: - elog(ERROR, "unrecognized join type: %d", - (int) j->jointype); - } - - if (need_paren_on_right) - appendStringInfoChar(buf, '('); - get_from_clause_item(j->rarg, query, context); - if (need_paren_on_right) - appendStringInfoChar(buf, ')'); - - if (j->usingClause) - { - ListCell *lc; - bool first = true; - - appendStringInfoString(buf, " USING ("); - /* Use the assigned names, not what's in usingClause */ - foreach(lc, colinfo->usingNames) - { - char *colname = (char *) lfirst(lc); - - if (first) - first = false; - else - appendStringInfoString(buf, ", "); - appendStringInfoString(buf, quote_identifier(colname)); - } - appendStringInfoChar(buf, ')'); - } - else if (j->quals) - { - appendStringInfoString(buf, " ON "); - if (!PRETTY_PAREN(context)) - appendStringInfoChar(buf, 
'('); - get_rule_expr(j->quals, context, false); - if (!PRETTY_PAREN(context)) - appendStringInfoChar(buf, ')'); - } - else if (j->jointype != JOIN_INNER) - { - /* If we didn't say CROSS JOIN above, we must provide an ON */ - appendStringInfoString(buf, " ON TRUE"); - } - - if (!PRETTY_PAREN(context) || j->alias != NULL) - appendStringInfoChar(buf, ')'); - - /* Yes, it's correct to put alias after the right paren ... */ - if (j->alias != NULL) - { - appendStringInfo(buf, " %s", - quote_identifier(j->alias->aliasname)); - get_column_alias_list(colinfo, context); - } - } - else - elog(ERROR, "unrecognized node type: %d", - (int) nodeTag(jtnode)); -} - -/* - * get_column_alias_list - print column alias list for an RTE - * - * Caller must already have printed the relation's alias name. - */ -static void -get_column_alias_list(deparse_columns *colinfo, deparse_context *context) -{ - StringInfo buf = context->buf; - int i; - bool first = true; - - /* Don't print aliases if not needed */ - if (!colinfo->printaliases) - return; - - for (i = 0; i < colinfo->num_new_cols; i++) - { - char *colname = colinfo->new_colnames[i]; - - if (first) - { - appendStringInfoChar(buf, '('); - first = false; - } - else - appendStringInfoString(buf, ", "); - appendStringInfoString(buf, quote_identifier(colname)); - } - if (!first) - appendStringInfoChar(buf, ')'); -} - -/* - * get_from_clause_coldeflist - reproduce FROM clause coldeflist - * - * When printing a top-level coldeflist (which is syntactically also the - * relation's column alias list), use column names from colinfo. But when - * printing a coldeflist embedded inside ROWS FROM(), we prefer to use the - * original coldeflist's names, which are available in rtfunc->funccolnames. - * Pass NULL for colinfo to select the latter behavior. - * - * The coldeflist is appended immediately (no space) to buf. Caller is - * responsible for ensuring that an alias or AS is present before it. - */ -static void -get_from_clause_coldeflist(RangeTblFunction *rtfunc, - deparse_columns *colinfo, - deparse_context *context) -{ - StringInfo buf = context->buf; - ListCell *l1; - ListCell *l2; - ListCell *l3; - ListCell *l4; - int i; - - appendStringInfoChar(buf, '('); - - /* there's no forfour(), so must chase one list the hard way */ - i = 0; - l4 = list_head(rtfunc->funccolnames); - forthree(l1, rtfunc->funccoltypes, - l2, rtfunc->funccoltypmods, - l3, rtfunc->funccolcollations) - { - Oid atttypid = lfirst_oid(l1); - int32 atttypmod = lfirst_int(l2); - Oid attcollation = lfirst_oid(l3); - char *attname; - - if (colinfo) - attname = colinfo->colnames[i]; - else - attname = strVal(lfirst(l4)); - - Assert(attname); /* shouldn't be any dropped columns here */ - - if (i > 0) - appendStringInfoString(buf, ", "); - appendStringInfo(buf, "%s %s", - quote_identifier(attname), - format_type_with_typemod(atttypid, atttypmod)); - if (OidIsValid(attcollation) && - attcollation != get_typcollation(atttypid)) - appendStringInfo(buf, " COLLATE %s", - generate_collation_name(attcollation)); - - l4 = lnext(l4); - i++; - } - - appendStringInfoChar(buf, ')'); -} - -/* - * get_tablesample_def - print a TableSampleClause - */ -static void -get_tablesample_def(TableSampleClause *tablesample, deparse_context *context) -{ - StringInfo buf = context->buf; - Oid argtypes[1]; - int nargs; - ListCell *l; - - /* - * We should qualify the handler's function name if it wouldn't be - * resolved by lookup in the current search path. 
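The clause being reassembled here has the shape TABLESAMPLE method(arg, ...) with an optional REPEATABLE (seed) tail, and the method name is qualified only when a bare lookup would not find it. A minimal standalone sketch of the output shape, with the sampling method, its arguments and the seed reduced to plain strings (none of these names come from the patch itself):

    #include <stdio.h>

    static void
    print_tablesample(const char *method, const char *const *args, int nargs,
                      const char *repeatable)
    {
        int i;

        printf(" TABLESAMPLE %s (", method);
        for (i = 0; i < nargs; i++)
            printf("%s%s", i > 0 ? ", " : "", args[i]);
        printf(")");
        if (repeatable)
            printf(" REPEATABLE (%s)", repeatable);
        printf("\n");
    }

    int
    main(void)
    {
        static const char *const args[] = {"10"};

        print_tablesample("bernoulli", args, 1, "42");
        /* prints:  TABLESAMPLE bernoulli (10) REPEATABLE (42) */
        return 0;
    }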
- */ - argtypes[0] = INTERNALOID; - appendStringInfo(buf, " TABLESAMPLE %s (", - generate_function_name(tablesample->tsmhandler, 1, - NIL, argtypes, - false, NULL, EXPR_KIND_NONE)); - - nargs = 0; - foreach(l, tablesample->args) - { - if (nargs++ > 0) - appendStringInfoString(buf, ", "); - get_rule_expr((Node *) lfirst(l), context, false); - } - appendStringInfoChar(buf, ')'); - - if (tablesample->repeatable != NULL) - { - appendStringInfoString(buf, " REPEATABLE ("); - get_rule_expr((Node *) tablesample->repeatable, context, false); - appendStringInfoChar(buf, ')'); - } -} - -/* - * get_opclass_name - fetch name of an index operator class - * - * The opclass name is appended (after a space) to buf. - * - * Output is suppressed if the opclass is the default for the given - * actual_datatype. (If you don't want this behavior, just pass - * InvalidOid for actual_datatype.) - */ -static void -get_opclass_name(Oid opclass, Oid actual_datatype, - StringInfo buf) -{ - HeapTuple ht_opc; - Form_pg_opclass opcrec; - char *opcname; - char *nspname; - - ht_opc = SearchSysCache1(CLAOID, ObjectIdGetDatum(opclass)); - if (!HeapTupleIsValid(ht_opc)) - elog(ERROR, "cache lookup failed for opclass %u", opclass); - opcrec = (Form_pg_opclass) GETSTRUCT(ht_opc); - - if (!OidIsValid(actual_datatype) || - GetDefaultOpClass(actual_datatype, opcrec->opcmethod) != opclass) - { - /* Okay, we need the opclass name. Do we need to qualify it? */ - opcname = NameStr(opcrec->opcname); - if (OpclassIsVisible(opclass)) - appendStringInfo(buf, " %s", quote_identifier(opcname)); - else - { - nspname = get_namespace_name(opcrec->opcnamespace); - appendStringInfo(buf, " %s.%s", - quote_identifier(nspname), - quote_identifier(opcname)); - } - } - ReleaseSysCache(ht_opc); -} - -/* - * processIndirection - take care of array and subfield assignment - * - * We strip any top-level FieldStore or assignment ArrayRef nodes that - * appear in the input, printing them as decoration for the base column - * name (which we assume the caller just printed). We might also need to - * strip CoerceToDomain nodes, but only ones that appear above assignment - * nodes. - * - * Returns the subexpression that's to be assigned. - */ -static Node * -processIndirection(Node *node, deparse_context *context) -{// #lizard forgives - StringInfo buf = context->buf; - CoerceToDomain *cdomain = NULL; - - for (;;) - { - if (node == NULL) - break; - if (IsA(node, FieldStore)) - { - FieldStore *fstore = (FieldStore *) node; - Oid typrelid; - char *fieldname; - - /* lookup tuple type */ - typrelid = get_typ_typrelid(fstore->resulttype); - if (!OidIsValid(typrelid)) - elog(ERROR, "argument type %s of FieldStore is not a tuple type", - format_type_be(fstore->resulttype)); - - /* - * Print the field name. There should only be one target field in - * stored rules. There could be more than that in executable - * target lists, but this function cannot be used for that case. - */ - Assert(list_length(fstore->fieldnums) == 1); - fieldname = get_relid_attribute_name(typrelid, - linitial_int(fstore->fieldnums)); - appendStringInfo(buf, ".%s", quote_identifier(fieldname)); - - /* - * We ignore arg since it should be an uninteresting reference to - * the target column or subcolumn. 
- */ - node = (Node *) linitial(fstore->newvals); - } - else if (IsA(node, ArrayRef)) - { - ArrayRef *aref = (ArrayRef *) node; - - if (aref->refassgnexpr == NULL) - break; - printSubscripts(aref, context); - - /* - * We ignore refexpr since it should be an uninteresting reference - * to the target column or subcolumn. - */ - node = (Node *) aref->refassgnexpr; - } - else if (IsA(node, CoerceToDomain)) - { - cdomain = (CoerceToDomain *) node; - /* If it's an explicit domain coercion, we're done */ - if (cdomain->coercionformat != COERCE_IMPLICIT_CAST) - break; - /* Tentatively descend past the CoerceToDomain */ - node = (Node *) cdomain->arg; - } - else - break; - } - - /* - * If we descended past a CoerceToDomain whose argument turned out not to - * be a FieldStore or array assignment, back up to the CoerceToDomain. - * (This is not enough to be fully correct if there are nested implicit - * CoerceToDomains, but such cases shouldn't ever occur.) - */ - if (cdomain && node == (Node *) cdomain->arg) - node = (Node *) cdomain; - - return node; -} - -static void -printSubscripts(ArrayRef *aref, deparse_context *context) -{ - StringInfo buf = context->buf; - ListCell *lowlist_item; - ListCell *uplist_item; - - lowlist_item = list_head(aref->reflowerindexpr); /* could be NULL */ - foreach(uplist_item, aref->refupperindexpr) - { - appendStringInfoChar(buf, '['); - if (lowlist_item) - { - /* If subexpression is NULL, get_rule_expr prints nothing */ - get_rule_expr((Node *) lfirst(lowlist_item), context, false); - appendStringInfoChar(buf, ':'); - lowlist_item = lnext(lowlist_item); - } - /* If subexpression is NULL, get_rule_expr prints nothing */ - get_rule_expr((Node *) lfirst(uplist_item), context, false); - appendStringInfoChar(buf, ']'); - } -} - -/* - * quote_identifier - Quote an identifier only if needed - * - * When quotes are needed, we palloc the required space; slightly - * space-wasteful but well worth it for notational simplicity. - */ -const char * -quote_identifier(const char *ident) -{// #lizard forgives - /* - * Can avoid quoting if ident starts with a lowercase letter or underscore - * and contains only lowercase letters, digits, and underscores, *and* is - * not any SQL keyword. Otherwise, supply quotes. - */ - int nquotes = 0; - bool safe; - const char *ptr; - char *result; - char *optr; - - /* - * would like to use macros here, but they might yield unwanted - * locale-specific results... - */ - safe = ((ident[0] >= 'a' && ident[0] <= 'z') || ident[0] == '_'); - - for (ptr = ident; *ptr; ptr++) - { - char ch = *ptr; - - if ((ch >= 'a' && ch <= 'z') || - (ch >= '0' && ch <= '9') || - (ch == '_')) - { - /* okay */ - } - else - { - safe = false; - if (ch == '"') - nquotes++; - } - } - - if (quote_all_identifiers) - safe = false; - - if (safe) - { - /* - * Check for keyword. We quote keywords except for unreserved ones. - * (In some cases we could avoid quoting a col_name or type_func_name - * keyword, but it seems much harder than it's worth to tell that.) - * - * Note: ScanKeywordLookup() does case-insensitive comparison, but - * that's fine, since we already know we have all-lower-case. 
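Taken together with the keyword test and the quoting step that follow below, the scan above amounts to a compact rule: an identifier passes through bare only if it starts with a lower-case letter or underscore, contains nothing but lower-case letters, digits and underscores, and is not a reserved word; everything else is wrapped in double quotes with embedded quotes doubled. A self-contained distillation, with keyword lookup reduced to a tiny hard-coded list for illustration:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* stand-in for ScanKeywordLookup(): a few reserved words only */
    static int
    is_reserved(const char *s)
    {
        static const char *const kw[] = {"select", "from", "where", "table", NULL};
        int i;

        for (i = 0; kw[i]; i++)
            if (strcmp(s, kw[i]) == 0)
                return 1;
        return 0;
    }

    /* return s unchanged if it can be printed bare, else a malloc'd quoted copy */
    static const char *
    quote_ident(const char *s)
    {
        int         safe = (s[0] >= 'a' && s[0] <= 'z') || s[0] == '_';
        int         nquotes = 0;
        const char *p;
        char       *out, *o;

        for (p = s; *p; p++)
        {
            char ch = *p;

            if ((ch >= 'a' && ch <= 'z') || (ch >= '0' && ch <= '9') || ch == '_')
                continue;
            safe = 0;
            if (ch == '"')
                nquotes++;
        }

        if (safe && !is_reserved(s))
            return s;

        out = o = malloc(strlen(s) + nquotes + 3);  /* 2 quotes + NUL */
        *o++ = '"';
        for (p = s; *p; p++)
        {
            if (*p == '"')
                *o++ = '"';             /* double embedded quotes */
            *o++ = *p;
        }
        *o++ = '"';
        *o = '\0';
        return out;
    }

    int
    main(void)
    {
        printf("%s %s %s\n",
               quote_ident("foo_1"), quote_ident("Foo"), quote_ident("from"));
        /* prints: foo_1 "Foo" "from" */
        return 0;
    }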
- */ - const ScanKeyword *keyword = ScanKeywordLookup(ident, - ScanKeywords, - NumScanKeywords); - - if (keyword != NULL && keyword->category != UNRESERVED_KEYWORD) - safe = false; - } - - if (safe) - return ident; /* no change needed */ - - result = (char *) palloc(strlen(ident) + nquotes + 2 + 1); - - optr = result; - *optr++ = '"'; - for (ptr = ident; *ptr; ptr++) - { - char ch = *ptr; - - if (ch == '"') - *optr++ = '"'; - *optr++ = ch; - } - *optr++ = '"'; - *optr = '\0'; - - return result; -} - -/* - * quote_qualified_identifier - Quote a possibly-qualified identifier - * - * Return a name of the form qualifier.ident, or just ident if qualifier - * is NULL, quoting each component if necessary. The result is palloc'd. - */ -char * -quote_qualified_identifier(const char *qualifier, - const char *ident) -{ - StringInfoData buf; - - initStringInfo(&buf); - if (qualifier) - appendStringInfo(&buf, "%s.", quote_identifier(qualifier)); - appendStringInfoString(&buf, quote_identifier(ident)); - return buf.data; -} - -/* - * get_relation_name - * Get the unqualified name of a relation specified by OID - * - * This differs from the underlying get_rel_name() function in that it will - * throw error instead of silently returning NULL if the OID is bad. - */ -static char * -get_relation_name(Oid relid) -{ - char *relname = get_rel_name(relid); - - if (!relname) - elog(ERROR, "cache lookup failed for relation %u", relid); - return relname; -} - -/* - * generate_relation_name - * Compute the name to display for a relation specified by OID - * - * The result includes all necessary quoting and schema-prefixing. - * - * If namespaces isn't NIL, it must be a list of deparse_namespace nodes. - * We will forcibly qualify the relation name if it equals any CTE name - * visible in the namespace list. - */ -static char * -generate_relation_name(Oid relid, List *namespaces) -{ - HeapTuple tp; - Form_pg_class reltup; - bool need_qual; - ListCell *nslist; - char *relname; - char *nspname; - char *result; - - tp = SearchSysCache1(RELOID, ObjectIdGetDatum(relid)); - if (!HeapTupleIsValid(tp)) - elog(ERROR, "cache lookup failed for relation %u", relid); - reltup = (Form_pg_class) GETSTRUCT(tp); - relname = NameStr(reltup->relname); - - /* Check for conflicting CTE name */ - need_qual = false; - foreach(nslist, namespaces) - { - deparse_namespace *dpns = (deparse_namespace *) lfirst(nslist); - ListCell *ctlist; - - foreach(ctlist, dpns->ctes) - { - CommonTableExpr *cte = (CommonTableExpr *) lfirst(ctlist); - - if (strcmp(cte->ctename, relname) == 0) - { - need_qual = true; - break; - } - } - if (need_qual) - break; - } - - /* Otherwise, qualify the name if not visible in search path */ - if (!need_qual) - need_qual = !RelationIsVisible(relid); - - if (need_qual) - nspname = get_namespace_name(reltup->relnamespace); - else - nspname = NULL; - - result = quote_qualified_identifier(nspname, relname); - - ReleaseSysCache(tp); - - return result; -} - -/* - * generate_qualified_relation_name - * Compute the name to display for a relation specified by OID - * - * As above, but unconditionally schema-qualify the name. 
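generate_relation_name, just above, adds the schema prefix only when it has to: when the relation is not visible in the current search_path, or when a CTE of the same name is in scope and would capture the bare reference. The variant that starts here qualifies unconditionally. A small sketch of the conditional decision, with visibility and the CTE list reduced to plain arguments:

    #include <stdio.h>
    #include <string.h>

    /*
     * ctes is a NULL-terminated array of CTE names currently in scope;
     * visible says whether a bare lookup in search_path finds the relation.
     */
    static void
    print_relation(const char *nspname, const char *relname,
                   const char *const *ctes, int visible)
    {
        int need_qual = !visible;
        int i;

        for (i = 0; !need_qual && ctes[i]; i++)
            if (strcmp(ctes[i], relname) == 0)
                need_qual = 1;          /* a CTE would shadow the bare name */

        if (need_qual)
            printf("%s.%s\n", nspname, relname);
        else
            printf("%s\n", relname);
    }

    int
    main(void)
    {
        static const char *const ctes[] = {"t", NULL};

        print_relation("public", "t", ctes, 1);      /* public.t (CTE conflict) */
        print_relation("public", "orders", ctes, 1); /* orders */
        print_relation("audit", "orders", ctes, 0);  /* audit.orders */
        return 0;
    }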
- */ -static char * -generate_qualified_relation_name(Oid relid) -{ - HeapTuple tp; - Form_pg_class reltup; - char *relname; - char *nspname; - char *result; - - tp = SearchSysCache1(RELOID, ObjectIdGetDatum(relid)); - if (!HeapTupleIsValid(tp)) - elog(ERROR, "cache lookup failed for relation %u", relid); - reltup = (Form_pg_class) GETSTRUCT(tp); - relname = NameStr(reltup->relname); - - nspname = get_namespace_name(reltup->relnamespace); - if (!nspname) - elog(ERROR, "cache lookup failed for namespace %u", - reltup->relnamespace); - - result = quote_qualified_identifier(nspname, relname); - - ReleaseSysCache(tp); - - return result; -} - -/* - * generate_function_name - * Compute the name to display for a function specified by OID, - * given that it is being called with the specified actual arg names and - * types. (Those matter because of ambiguous-function resolution rules.) - * - * If we're dealing with a potentially variadic function (in practice, this - * means a FuncExpr or Aggref, not some other way of calling a function), then - * has_variadic must specify whether variadic arguments have been merged, - * and *use_variadic_p will be set to indicate whether to print VARIADIC in - * the output. For non-FuncExpr cases, has_variadic should be FALSE and - * use_variadic_p can be NULL. - * - * The result includes all necessary quoting and schema-prefixing. - */ -static char * -generate_function_name(Oid funcid, int nargs, List *argnames, Oid *argtypes, - bool has_variadic, bool *use_variadic_p, - ParseExprKind special_exprkind) -{// #lizard forgives - char *result; - HeapTuple proctup; - Form_pg_proc procform; - char *proname; - bool use_variadic; - char *nspname; - FuncDetailCode p_result; - Oid p_funcid; - Oid p_rettype; - bool p_retset; - int p_nvargs; - Oid p_vatype; - Oid *p_true_typeids; - bool force_qualify = false; - - proctup = SearchSysCache1(PROCOID, ObjectIdGetDatum(funcid)); - if (!HeapTupleIsValid(proctup)) - elog(ERROR, "cache lookup failed for function %u", funcid); - procform = (Form_pg_proc) GETSTRUCT(proctup); - proname = NameStr(procform->proname); - - /* - * Due to parser hacks to avoid needing to reserve CUBE, we need to force - * qualification in some special cases. - */ - if (special_exprkind == EXPR_KIND_GROUP_BY) - { - if (strcmp(proname, "cube") == 0 || strcmp(proname, "rollup") == 0) - force_qualify = true; - } - - /* - * Determine whether VARIADIC should be printed. We must do this first - * since it affects the lookup rules in func_get_detail(). - * - * Currently, we always print VARIADIC if the function has a merged - * variadic-array argument. Note that this is always the case for - * functions taking a VARIADIC argument type other than VARIADIC ANY. - * - * In principle, if VARIADIC wasn't originally specified and the array - * actual argument is deconstructable, we could print the array elements - * separately and not print VARIADIC, thus more nearly reproducing the - * original input. For the moment that seems like too much complication - * for the benefit, and anyway we do not know whether VARIADIC was - * originally specified if it's a non-ANY type. 
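The other choice this function makes, whether to schema-qualify the name, follows the same principle as relation names: redo the lookup with the bare name and the actual argument types, and qualify only if that lookup would not land on the function being printed (or if qualification is forced, as for cube and rollup in GROUP BY). A toy version with resolution reduced to a flat, first-match table; the names and oids below are invented for illustration:

    #include <stdio.h>
    #include <string.h>

    typedef struct { const char *name; int nargs; int oid; } FuncEnt;

    /* stand-in for func_get_detail(): first match by (name, nargs) wins */
    static int
    lookup(const FuncEnt *tab, int ntab, const char *name, int nargs)
    {
        int i;

        for (i = 0; i < ntab; i++)
            if (strcmp(tab[i].name, name) == 0 && tab[i].nargs == nargs)
                return tab[i].oid;
        return 0;
    }

    static void
    print_func(const FuncEnt *path, int npath, const char *nsp,
               const char *name, int nargs, int oid)
    {
        /* qualify only when the bare name would resolve to something else */
        if (lookup(path, npath, name, nargs) == oid)
            printf("%s(...)\n", name);
        else
            printf("%s.%s(...)\n", nsp, name);
    }

    int
    main(void)
    {
        /* two schemas define f with one argument; the first entry wins bare lookup */
        FuncEnt path[] = { {"f", 1, 100}, {"f", 1, 200} };

        print_func(path, 2, "pg_catalog", "f", 1, 100);  /* f(...) */
        print_func(path, 2, "myschema",   "f", 1, 200);  /* myschema.f(...) */
        return 0;
    }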
- */ - if (use_variadic_p) - { - /* Parser should not have set funcvariadic unless fn is variadic */ - Assert(!has_variadic || OidIsValid(procform->provariadic)); - use_variadic = has_variadic; - *use_variadic_p = use_variadic; - } - else - { - Assert(!has_variadic); - use_variadic = false; - } - - /* - * The idea here is to schema-qualify only if the parser would fail to - * resolve the correct function given the unqualified func name with the - * specified argtypes and VARIADIC flag. But if we already decided to - * force qualification, then we can skip the lookup and pretend we didn't - * find it. - */ - if (!force_qualify) - p_result = func_get_detail(list_make1(makeString(proname)), - NIL, argnames, nargs, argtypes, - !use_variadic, true, - &p_funcid, &p_rettype, - &p_retset, &p_nvargs, &p_vatype, - &p_true_typeids, NULL); - else - { - p_result = FUNCDETAIL_NOTFOUND; - p_funcid = InvalidOid; - } - - if ((p_result == FUNCDETAIL_NORMAL || - p_result == FUNCDETAIL_AGGREGATE || - p_result == FUNCDETAIL_WINDOWFUNC) && - p_funcid == funcid) - nspname = NULL; - else - nspname = get_namespace_name(procform->pronamespace); - - result = quote_qualified_identifier(nspname, proname); - - ReleaseSysCache(proctup); - - return result; -} - -/* - * generate_operator_name - * Compute the name to display for an operator specified by OID, - * given that it is being called with the specified actual arg types. - * (Arg types matter because of ambiguous-operator resolution rules. - * Pass InvalidOid for unused arg of a unary operator.) - * - * The result includes all necessary quoting and schema-prefixing, - * plus the OPERATOR() decoration needed to use a qualified operator name - * in an expression. - */ -static char * -generate_operator_name(Oid operid, Oid arg1, Oid arg2) -{// #lizard forgives - StringInfoData buf; - HeapTuple opertup; - Form_pg_operator operform; - char *oprname; - char *nspname; - Operator p_result; - - initStringInfo(&buf); - - opertup = SearchSysCache1(OPEROID, ObjectIdGetDatum(operid)); - if (!HeapTupleIsValid(opertup)) - elog(ERROR, "cache lookup failed for operator %u", operid); - operform = (Form_pg_operator) GETSTRUCT(opertup); - oprname = NameStr(operform->oprname); - - /* - * The idea here is to schema-qualify only if the parser would fail to - * resolve the correct operator given the unqualified op name with the - * specified argtypes. - */ - switch (operform->oprkind) - { - case 'b': - p_result = oper(NULL, list_make1(makeString(oprname)), arg1, arg2, - true, -1); - break; - case 'l': - p_result = left_oper(NULL, list_make1(makeString(oprname)), arg2, - true, -1); - break; - case 'r': - p_result = right_oper(NULL, list_make1(makeString(oprname)), arg1, - true, -1); - break; - default: - elog(ERROR, "unrecognized oprkind: %d", operform->oprkind); - p_result = NULL; /* keep compiler quiet */ - break; - } - - if (p_result != NULL && oprid(p_result) == operid) - nspname = NULL; - else - { - nspname = get_namespace_name(operform->oprnamespace); - appendStringInfo(&buf, "OPERATOR(%s.", quote_identifier(nspname)); - } - - appendStringInfoString(&buf, oprname); - - if (nspname) - appendStringInfoChar(&buf, ')'); - - if (p_result != NULL) - ReleaseSysCache(p_result); - - ReleaseSysCache(opertup); - - return buf.data; -} - -/* - * generate_collation_name - * Compute the name to display for a collation specified by OID - * - * The result includes all necessary quoting and schema-prefixing. 
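Unlike a function, a qualified operator cannot be written as schema.op in an expression; SQL requires the OPERATOR(schema.op) wrapper, which is the decoration mentioned here. A short illustration of the two output forms (the operand names x and y are placeholders):

    #include <stdio.h>

    /* print a binary operator use, qualified only when necessary */
    static void
    print_oper(const char *nspname, const char *oprname, int needs_qual)
    {
        if (needs_qual)
            printf("x OPERATOR(%s.%s) y\n", nspname, oprname);
        else
            printf("x %s y\n", oprname);
    }

    int
    main(void)
    {
        print_oper("pg_catalog", "+", 0);   /* x + y */
        print_oper("myschema",   "+", 1);   /* x OPERATOR(myschema.+) y */
        return 0;
    }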
- */ -char * -generate_collation_name(Oid collid) -{ - HeapTuple tp; - Form_pg_collation colltup; - char *collname; - char *nspname; - char *result; - - tp = SearchSysCache1(COLLOID, ObjectIdGetDatum(collid)); - if (!HeapTupleIsValid(tp)) - elog(ERROR, "cache lookup failed for collation %u", collid); - colltup = (Form_pg_collation) GETSTRUCT(tp); - collname = NameStr(colltup->collname); - - if (!CollationIsVisible(collid)) - nspname = get_namespace_name(colltup->collnamespace); - else - nspname = NULL; - - result = quote_qualified_identifier(nspname, collname); - - ReleaseSysCache(tp); - - return result; -} - -/* - * Given a C string, produce a TEXT datum. - * - * We assume that the input was palloc'd and may be freed. - */ -static text * -string_to_text(char *str) -{ - text *result; - - result = cstring_to_text(str); - pfree(str); - return result; -} - -/* - * Generate a C string representing a relation's reloptions, or NULL if none. - */ -static char * -flatten_reloptions(Oid relid) -{ - char *result = NULL; - HeapTuple tuple; - Datum reloptions; - bool isnull; - - tuple = SearchSysCache1(RELOID, ObjectIdGetDatum(relid)); - if (!HeapTupleIsValid(tuple)) - elog(ERROR, "cache lookup failed for relation %u", relid); - - reloptions = SysCacheGetAttr(RELOID, tuple, - Anum_pg_class_reloptions, &isnull); - if (!isnull) - { - StringInfoData buf; - Datum *options; - int noptions; - int i; - - initStringInfo(&buf); - - deconstruct_array(DatumGetArrayTypeP(reloptions), - TEXTOID, -1, false, 'i', - &options, NULL, &noptions); - - for (i = 0; i < noptions; i++) - { - char *option = TextDatumGetCString(options[i]); - char *name; - char *separator; - char *value; - - /* - * Each array element should have the form name=value. If the "=" - * is missing for some reason, treat it like an empty value. - */ - name = option; - separator = strchr(option, '='); - if (separator) - { - *separator = '\0'; - value = separator + 1; - } - else - value = ""; - - if (i > 0) - appendStringInfoString(&buf, ", "); - appendStringInfo(&buf, "%s=", quote_identifier(name)); - - /* - * In general we need to quote the value; but to avoid unnecessary - * clutter, do not quote if it is an identifier that would not - * need quoting. (We could also allow numbers, but that is a bit - * trickier than it looks --- for example, are leading zeroes - * significant? We don't want to assume very much here about what - * custom reloptions might mean.) 
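In outline, flatten_reloptions splits each array element at its first '=', joins the pairs with commas, and quotes the value only when it could not stand on its own as an identifier. A standalone sketch of that formatting, with the bare-value test simplified to lower-case letters, digits and underscores, and quoting shown as a plain SQL string literal:

    #include <stdio.h>
    #include <string.h>

    static int
    value_is_bare(const char *v)
    {
        const char *p;

        if (!((v[0] >= 'a' && v[0] <= 'z') || v[0] == '_'))
            return 0;
        for (p = v; *p; p++)
            if (!((*p >= 'a' && *p <= 'z') || (*p >= '0' && *p <= '9') || *p == '_'))
                return 0;
        return 1;
    }

    static void
    print_reloptions(const char *const *opts, int nopts)
    {
        int i;

        for (i = 0; i < nopts; i++)
        {
            char  buf[128];
            char *eq;

            strncpy(buf, opts[i], sizeof(buf) - 1);
            buf[sizeof(buf) - 1] = '\0';

            eq = strchr(buf, '=');      /* missing '=' means empty value */
            if (eq)
                *eq = '\0';

            if (i > 0)
                printf(", ");
            if (value_is_bare(eq ? eq + 1 : ""))
                printf("%s=%s", buf, eq ? eq + 1 : "");
            else
                printf("%s='%s'", buf, eq ? eq + 1 : "");
        }
        printf("\n");
    }

    int
    main(void)
    {
        static const char *const opts[] = {"fillfactor=70", "autovacuum_enabled=true"};

        print_reloptions(opts, 2);   /* fillfactor='70', autovacuum_enabled=true */
        return 0;
    }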
- */ - if (quote_identifier(value) == value) - appendStringInfoString(&buf, value); - else - simple_quote_literal(&buf, value); - - pfree(option); - } - - result = buf.data; - } - - ReleaseSysCache(tuple); - - return result; -} - -/* - * get_one_range_partition_bound_string - * A C string representation of one range partition bound - */ -char * -get_range_partbound_string(List *bound_datums) -{ - deparse_context context; - StringInfo buf = makeStringInfo(); - ListCell *cell; - char *sep; - - memset(&context, 0, sizeof(deparse_context)); - context.buf = buf; - - appendStringInfoString(buf, "("); - sep = ""; - foreach(cell, bound_datums) - { - PartitionRangeDatum *datum = - castNode(PartitionRangeDatum, lfirst(cell)); - - appendStringInfoString(buf, sep); - if (datum->kind == PARTITION_RANGE_DATUM_MINVALUE) - appendStringInfoString(buf, "MINVALUE"); - else if (datum->kind == PARTITION_RANGE_DATUM_MAXVALUE) - appendStringInfoString(buf, "MAXVALUE"); - else - { - Const *val = castNode(Const, datum->value); - - get_const_expr(val, &context, -1); - } - sep = ", "; - } - appendStringInfoString(buf, ")"); - - return buf->data; -} - -#ifdef __TBASE__ -/* form interval partition child table/index name */ -char * -GetPartitionName(Oid parentrelid, int partidx, bool isindex) -{ - char *partname; - char relname[NAMEDATALEN]; - char *parentname = get_rel_name(parentrelid); - - StrNCpy(relname, parentname, NAMEDATALEN - 12); - - partname = (char *)palloc0(NAMEDATALEN); - - snprintf(partname, NAMEDATALEN, - "%s_part_%d", relname, partidx); - -#if 0 - if(!isindex) - snprintf(partname, NAMEDATALEN, - "part_%d_%d", parentrelid, partidx); - else - snprintf(partname, NAMEDATALEN, - "idx_%d_%d", parentrelid, partidx); -#endif - - return partname; -} - -static int -find_partidx_by_int(int64 start, int step, int partitions, - int64 value, QulificationType qualtype) -{// #lizard forgives - int partidx = -1; - int gap = -1; - int align = -1; - - if(value < start || value >= start + step*partitions) - { - return PARTITION_ROUTER_RESULT_NULL; - } - - gap = (int32)((value - start)/step); - - align = (int32)((value - start)%step); - - switch(qualtype) - { - case QULIFICATION_TYPE_LS: - if(align == 0) gap--; - case QULIFICATION_TYPE_LE: - { - if(gap >= partitions) - partidx = PARTITION_ROUTER_RESULT_FULL; - else if(gap < 0) - partidx = PARTITION_ROUTER_RESULT_NULL; - else - partidx = gap; - } - break; - - case QULIFICATION_TYPE_EQUAL: - { - if(gap >= partitions || gap < 0 ) - partidx = PARTITION_ROUTER_RESULT_NULL; - else - partidx = gap; - } - break; - - case QULIFICATION_TYPE_GE: - case QULIFICATION_TYPE_GT: - { - if(gap >= partitions) - partidx = PARTITION_ROUTER_RESULT_NULL; - else if(gap < 0) - partidx = PARTITION_ROUTER_RESULT_FULL; - else - partidx = gap; - } - break; - default: - elog(ERROR, "not supported Qulification Type[%d]", qualtype); - } - - return partidx; -} - -static int get_daysofyear(int startyear, int startmonth, int startday, - int endyear, int endmonth, int endday) -{// #lizard forgives - int result; - - result = 0; - - if(startyear > endyear - || (startyear == endyear && startmonth > endmonth) - || (startyear == endyear && startmonth == endmonth && startday > endday)) - return -1; - - if(startyear == endyear) - { - result = get_daysofmonth(startmonth, startday, endmonth, endday); - } - else - { - result += get_daysofmonth(startmonth,startday, 12, 31); - result += (endyear - startyear - 1)*366; - result += get_daysofmonth(1, 1, endmonth, endday); - } - - return result; -} - -static int 
get_daysofmonth(int startmonth, int startday, - int endmonth, int endday) -{// #lizard forgives - int result; - - if(startmonth <=0 || startmonth > 12 - || startday <= 0 || startday > 31 - || endmonth <=0 || endmonth > 12 - || endday <= 0 || endday > 31) - { - elog(ERROR, "internal error: getdaysofmonth: parameters is invalid"); - } - - result = 0; - - if(startmonth > endmonth || (startmonth == endmonth && startday > endday)) - return -1; - - if(startmonth == endmonth) - { - result = endday - startday; - } - else - { - int monidx = 0; - - result += daysofmonth[startmonth] - startday; - - monidx = startmonth + 1; - while(monidx < endmonth) - result += daysofmonth[monidx++]; - - result += endday; - } - - return result; -} - -static int get_monthesofyear(int startyear, int startmonth, - int endyear, int endmonth) -{ - int32 gap; - if(endyear < startyear || (endyear == startyear && endmonth < startmonth)) - { - gap = -1; - } - else - { - gap = (endyear - startyear) * 12 + (endmonth - startmonth); - } - return gap; -} - - -static int -find_partidx_by_timestamp(TimestampTz start, int step, int steptype, int partitions, - TimestampTz value, QulificationType qualtype) -{// #lizard forgives - int partidx = -1; - int gap; - struct pg_tm start_time; - fsec_t start_sec; - struct pg_tm current_time; - fsec_t current_sec; - bool isalign = false; - - - /* timestamp convert to posix struct */ - if(timestamp2tm(start, NULL, &start_time, &start_sec, NULL, NULL) != 0) - ereport(ERROR, - (errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE), - errmsg("timestamp out of range"))); - - if(timestamp2tm(value, NULL, ¤t_time, ¤t_sec, NULL, NULL) != 0) - ereport(ERROR, - (errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE), - errmsg("timestamp out of range"))); - - if(current_time.tm_hour == 0 && current_time.tm_min == 0 && current_time.tm_sec == 0 && current_sec == 0) - { - isalign = true; - } - - if(isalign && steptype == IntervalType_Month) - { - isalign = (current_time.tm_mday == 1); - } - - /* computer gap*/ - if(steptype == IntervalType_Month) - { - if(current_time.tm_year < start_time.tm_year - || (current_time.tm_year == start_time.tm_year && current_time.tm_mon < start_time.tm_mon)) - { - gap = -1; - } - else - { - gap = (current_time.tm_year - start_time.tm_year)*12 + (current_time.tm_mon - start_time.tm_mon); - } - } - else if(steptype == IntervalType_Day) - { - gap = get_daysofyear(start_time.tm_year, start_time.tm_mon, start_time.tm_mday, - current_time.tm_year, current_time.tm_mon, current_time.tm_mday); - } - else - { - elog(ERROR,"step type[%d] is invalid", steptype); - } - - if(gap >= 0) - { - if(isalign) - isalign = (gap % step == 0); - gap = gap/step; - } - else - { - gap = -1; - isalign = false; - } - - switch(qualtype) - { - case QULIFICATION_TYPE_LS: - if(isalign) - { - if (!(is_first_day_from_start(step, steptype, &start_time, ¤t_time))) - { - gap--; - } - } - case QULIFICATION_TYPE_LE: - { - if(gap >= partitions) - partidx = PARTITION_ROUTER_RESULT_FULL; /* all partitions*/ - else if(gap < 0) - partidx = PARTITION_ROUTER_RESULT_NULL; - else - partidx = gap; - } - break; - - case QULIFICATION_TYPE_EQUAL: - { - if(gap >= partitions || gap < 0) - partidx = PARTITION_ROUTER_RESULT_NULL; - else - partidx = gap; - } - break; - - case QULIFICATION_TYPE_GE: - case QULIFICATION_TYPE_GT: - { - if(gap >= partitions) - partidx = PARTITION_ROUTER_RESULT_NULL; - else if(gap < 0) - partidx = PARTITION_ROUTER_RESULT_FULL; - else - partidx = gap; - } - break; - default: - elog(ERROR, "not supported Qulification Type[%d]", 
qualtype); - } - - return partidx; -} - -int -RelationGetPartitionIdxByValue(Relation rel, Datum value) -{ - int partidx = -1; - Form_pg_partition_interval routerinfo = NULL; - - routerinfo = rel->rd_partitions_info; - - if(!routerinfo) - { - elog(ERROR, "relation[%s] is not a partitioned table.", RelationGetRelationName(rel)); - } - - switch(routerinfo->partdatatype) - { - case INT2OID: /* int2 */ - { - int value_int16; - value_int16 = DatumGetInt16(value); - partidx = find_partidx_by_int(routerinfo->partstartvalue_int, routerinfo->partinterval_int, - routerinfo->partnparts, (int64)value_int16, QULIFICATION_TYPE_EQUAL); - } - break; - case INT4OID: /* int4 */ - { - int value_int32; - value_int32 = DatumGetInt32(value); - partidx = find_partidx_by_int(routerinfo->partstartvalue_int, routerinfo->partinterval_int, - routerinfo->partnparts, (int64)value_int32, QULIFICATION_TYPE_EQUAL); - } - break; - case INT8OID: /* int8 */ - { - partidx = find_partidx_by_int(routerinfo->partstartvalue_int, routerinfo->partinterval_int, - routerinfo->partnparts, DatumGetInt64(value), QULIFICATION_TYPE_EQUAL); - } - break; - case TIMESTAMPOID: /* timestamp */ - partidx = find_partidx_by_timestamp(routerinfo->partstartvalue_ts, routerinfo->partinterval_int, - routerinfo->partinterval_type, - routerinfo->partnparts, DatumGetTimestamp(value), QULIFICATION_TYPE_EQUAL); - break; - default: - elog(ERROR, "unsupported interval type:[%d]", routerinfo->partinterval_type); - } - - return partidx; - -} - -Bitmapset * -RelationGetPartitionByValue(Relation rel, Const *value) -{ - //TODO: - int partidx = -1; - AttrNumber partkey = InvalidAttrNumber; - Form_pg_attribute attr = NULL; - Bitmapset * bms = NULL; - char *partname = NULL; - Oid partoid = InvalidOid; - - partkey = RelationGetPartitionColumnIndex(rel); - attr = rel->rd_att->attrs[partkey-1]; - - if(attr->atttypid != value->consttype) - { - elog(ERROR, "internal error: RelationGetPartitionByValue: data type of parameter is not same as relation definition"); - } - - partidx = RelationGetPartitionIdxByValue(rel,value->constvalue); - - partname = GetPartitionName(RelationGetRelid(rel), partidx, false); - partoid = get_relname_relid(partname, RelationGetNamespace(rel)); - - if(partidx >= 0 && partoid) - bms = bms_make_singleton(partidx); - else - bms = NULL; - - return bms; -} - -List * -RelationGetAllPartitions(Relation rel) -{ - int nparts = 0; - char *partname = NULL; - Oid partoid = InvalidOid; - int partidx = 0; - List * result = NULL; - - nparts = RelationGetNParts(rel); - - for(partidx = 0; partidx < nparts; partidx++) - { - partname = GetPartitionName(RelationGetRelid(rel), partidx, false); - partoid = get_relname_relid(partname, RelationGetNamespace(rel)); - - if(partname) - pfree(partname); - partname = NULL; - - if (InvalidOid == partoid) - { - continue; - } - - result = lappend_oid(result, partoid); - } - - return result; -} - -int -RelationGetChildIndex(Relation rel, Oid childoid) -{ - int nparts = 0; - char *partname = NULL; - Oid partoid = InvalidOid; - int partidx = 0; - int result = -1; - - if (childoid) - { - nparts = RelationGetNParts(rel); - - for(partidx = 0; partidx < nparts; partidx++) - { - partname = GetPartitionName(RelationGetRelid(rel), partidx, false); - partoid = get_relname_relid(partname, RelationGetNamespace(rel)); - - if (partoid == childoid) - { - result = partidx; - - if(partname) - pfree(partname); - partname = NULL; - - break; - } - - if(partname) - pfree(partname); - partname = NULL; - } - } - - return result; -} - -Oid 
-RelationGetPartitionIndex(Relation rel, Oid indexOid, int partidx) -{ - char *partidxname = NULL; - Oid partidxoid = InvalidOid; - partidxname = GetPartitionName(indexOid,partidx,true); - partidxoid = get_relname_relid(partidxname,RelationGetNamespace(rel)); - - pfree(partidxname); - partidxname = NULL; - return partidxoid; -} - -Oid -RelationGetPartition(Relation rel, int partidx, bool isindex) -{ - char *partname = NULL; - Oid partoid = InvalidOid; - - partname = GetPartitionName(RelationGetRelid(rel), partidx, isindex); - - partoid = get_relname_relid(partname, RelationGetNamespace(rel)); - - if(partname) - pfree(partname); - partname = NULL; - return partoid; -} - -Bitmapset * -RelationGetPartitionsByQuals(Relation rel, List *strictinfos) -{ - Bitmapset * result; - Bitmapset * temp_bms; - Bitmapset * temp_result; - - ListCell *cell; - RestrictInfo *ele; - result = NULL; - temp_bms = NULL; - temp_result = NULL; - - if(list_length(strictinfos) == 0) - return get_full_pruning_result(rel); - - foreach(cell, strictinfos) - { - ele = (RestrictInfo*)lfirst(cell); - temp_bms = pruning_walker(rel,(Node*)ele); - if(result) - temp_result = bms_intersect(result, temp_bms); - else - temp_result = bms_copy(temp_bms); - bms_free(result); - bms_free(temp_bms); - temp_bms = NULL; - result = temp_result; - } - - return result; -} - -static Bitmapset * -pruning_walker(Relation rel, Node *expr) -{ - Bitmapset * result; - result = NULL; - - switch(nodeTag(expr)) - { - case T_OpExpr: - { - result = pruning_opexpr(rel,(OpExpr*)expr); - } - break; - case T_RestrictInfo: - { - RestrictInfo *restricted = (RestrictInfo *)expr; - result = pruning_walker(rel, (Node *)restricted->clause); - } - break; - case T_BoolExpr: - { - BoolExpr *boolexpr = (BoolExpr*)expr; - switch(boolexpr->boolop) - { - ListCell * cell; - Bitmapset * temp_bms; - Bitmapset * temp_result; - Node *ele; - - temp_bms = NULL; - temp_result = NULL; - case AND_EXPR: - { - foreach(cell,boolexpr->args) - { - ele = (Node*)lfirst(cell); - temp_bms = pruning_walker(rel,ele); - if(result) - temp_result = bms_intersect(result, temp_bms); - else - temp_result = bms_copy(temp_bms); - bms_free(result); - bms_free(temp_bms); - temp_bms = NULL; - result = temp_result; - } - } - break; - case OR_EXPR: - { - foreach(cell,boolexpr->args) - { - ele = (Node*)lfirst(cell); - temp_bms = pruning_walker(rel,ele); - temp_result = bms_union(result, temp_bms); - bms_free(result); - bms_free(temp_bms); - temp_bms = NULL; - result = temp_result; - } - } - break; - case NOT_EXPR: - default: - result = get_full_pruning_result(rel); - break; - } - } - break; - default: - result = get_full_pruning_result(rel); - break; - } - - return result; -} - -static Bitmapset * -pruning_opexpr(Relation rel, OpExpr *expr) -{// #lizard forgives - Bitmapset *result = NULL; - char *opname = NULL; - Node *leftarg = NULL; - Node *rightarg = NULL; - Var *arg_var = NULL; - Const *arg_const = NULL; - bool isswap = false; - int npart; - int partidx; - AttrNumber partkey; - //Oid parttype; - QulificationType qualtype = QULIFICATION_TYPE_EQUAL; - Form_pg_partition_interval routerinfo; - - partkey = RelationGetPartitionColumnIndex(rel); - - //parttype = rel->rd_att->attrs[partkey - 1]->atttypid; - - if(list_length(expr->args) != 2) - return get_full_pruning_result(rel); - - leftarg = (Node *)list_nth(expr->args,0); - rightarg = (Node *)list_nth(expr->args,1); - - if(IsA(leftarg,Var) && IsA(rightarg,Const)) - { - arg_var = (Var *)leftarg; - arg_const = (Const *)rightarg; - } - else 
if(IsA(leftarg,Const) && IsA(rightarg,Var)) - { - arg_var = (Var *)rightarg; - arg_const = (Const *)leftarg; - isswap = true; - } - else - { - return get_full_pruning_result(rel); - } - - if(arg_var->varattno != partkey) - { - return get_full_pruning_result(rel); - } - - opname = get_opname(expr->opno); - - if(strcmp("<",opname) == 0) - { - if(!isswap) - qualtype = QULIFICATION_TYPE_LS; - else - qualtype = QULIFICATION_TYPE_GT; - } - else if(strcmp("<=",opname) == 0) - { - if(!isswap) - qualtype = QULIFICATION_TYPE_LE; - else - qualtype = QULIFICATION_TYPE_GE; - } - else if(strcmp("=",opname) == 0) - { - qualtype = QULIFICATION_TYPE_EQUAL; - } - else if(strcmp(">=",opname) == 0) - { - if(!isswap) - qualtype = QULIFICATION_TYPE_GE; - else - qualtype = QULIFICATION_TYPE_LE; - } - else if(strcmp(">",opname) == 0) - { - if(!isswap) - qualtype = QULIFICATION_TYPE_GT; - else - qualtype = QULIFICATION_TYPE_LS; - } - else - { - /* any other case, get full partitions */ - return get_full_pruning_result(rel); - } - - routerinfo = rel->rd_partitions_info; - - if(!routerinfo) - { - elog(ERROR, "relation[%s] is not a partitioned table", RelationGetRelationName(rel)); - } - - switch(arg_const->consttype) - { - case INT2OID: /* int2 */ - { - int value_int16; - value_int16 = DatumGetInt16(arg_const->constvalue); - partidx = find_partidx_by_int(routerinfo->partstartvalue_int, routerinfo->partinterval_int, - routerinfo->partnparts, (int64)value_int16, qualtype); - } - break; - case INT4OID: /* int4 */ - { - int value_int32; - value_int32 = DatumGetInt32(arg_const->constvalue); - partidx = find_partidx_by_int(routerinfo->partstartvalue_int, routerinfo->partinterval_int, - routerinfo->partnparts, (int64)value_int32, qualtype); - } - break; - case INT8OID: /* int8 */ - { - partidx = find_partidx_by_int(routerinfo->partstartvalue_int, routerinfo->partinterval_int, - routerinfo->partnparts, DatumGetInt64(arg_const->constvalue), qualtype); - } - break; - case TIMESTAMPOID: /* timestamp */ - partidx = find_partidx_by_timestamp(routerinfo->partstartvalue_ts, routerinfo->partinterval_int, - routerinfo->partinterval_type, - routerinfo->partnparts, DatumGetTimestamp(arg_const->constvalue), qualtype); - break; - default: - elog(ERROR, "unsupported const type:[%u]", arg_const->consttype); - } - - npart = RelationGetNParts(rel); - if(npart <= 0) - { - elog(ERROR, "internal error: pruning_opexpr:partitioned table has no partitions"); - } - - if(partidx == PARTITION_ROUTER_RESULT_FULL) - return get_full_pruning_result(rel); - else if(partidx == PARTITION_ROUTER_RESULT_NULL) - return NULL; - else if(partidx >= 0) - { - char *partname = NULL; - Oid partoid = InvalidOid; - - switch(qualtype) - { - case QULIFICATION_TYPE_LS: - case QULIFICATION_TYPE_LE: - { - int i; - for(i = 0; i <= partidx; i++) - { - partname = GetPartitionName(RelationGetRelid(rel), i, false); - partoid = get_relname_relid(partname, RelationGetNamespace(rel)); - if(partoid) - { - result = bms_add_member(result, i); - } - } - } - break; - case QULIFICATION_TYPE_EQUAL: - { - partname = GetPartitionName(RelationGetRelid(rel), partidx, false); - partoid = get_relname_relid(partname, RelationGetNamespace(rel)); - if(partoid) - { - result = bms_make_singleton(partidx); - } - } - break; - case QULIFICATION_TYPE_GE: - case QULIFICATION_TYPE_GT: - { - int i; - for(i = partidx; i < npart; i++) - { - partname = GetPartitionName(RelationGetRelid(rel), i, false); - partoid = get_relname_relid(partname, RelationGetNamespace(rel)); - if(partoid) - { - result = 
bms_add_member(result, i); - } - } - } - break; - default: - //nerver occur - elog(ERROR, "internal error: pruning_opexpr: invalid QulificationType[%d]", qualtype); - } - } - - return result; -} - -static Bitmapset * -get_full_pruning_result(Relation rel) -{ - Bitmapset *result = NULL; - int i = 0; - int nparts = RelationGetNParts(rel); - char *partname = NULL; - Oid partoid = InvalidOid; - - Assert(nparts > 0); - - for(i=0; ibitmapplans; - replace_target_relation((Node *)planlist,targetrel,partitionparent,partidx); - } - break; - case T_BitmapOr: - { - List *planlist; - planlist = ((BitmapOr*)node)->bitmapplans; - replace_target_relation((Node *)planlist,targetrel,partitionparent,partidx); - } - break; - - /* - * scan nodes - */ - case T_TidScan: - case T_SeqScan: - { - SeqScan *seqscan; - seqscan = (SeqScan*)node; - - if(seqscan->ispartchild) - break; - if(seqscan->scanrelid != targetrel) - break; - seqscan->ispartchild = true; - seqscan->childidx = partidx; - } - break; - - case T_IndexScan: - { - IndexScan *indexscan; - indexscan = (IndexScan*)node; - - if(indexscan->scan.ispartchild) - break; - if(indexscan->scan.scanrelid != targetrel) - break; - indexscan->scan.ispartchild = true; - indexscan->scan.childidx = partidx; - indexscan->indexid = RelationGetPartitionIndex(partitionparent,indexscan->indexid,partidx); - } - break; - - case T_IndexOnlyScan: - { - IndexOnlyScan *indexscan; - indexscan = (IndexOnlyScan*)node; - - if(indexscan->scan.ispartchild) - return; - if(indexscan->scan.scanrelid != targetrel) - return; - indexscan->scan.ispartchild = true; - indexscan->scan.childidx = partidx; - indexscan->indexid = RelationGetPartitionIndex(partitionparent,indexscan->indexid,partidx); - } - break; - - case T_BitmapIndexScan: - { - BitmapIndexScan *indexscan; - indexscan = (BitmapIndexScan*)node; - - if(indexscan->scan.ispartchild) - break; - if(indexscan->scan.scanrelid != targetrel) - break; - indexscan->scan.ispartchild = true; - indexscan->scan.childidx = partidx; - indexscan->indexid = RelationGetPartitionIndex(partitionparent,indexscan->indexid,partidx); - } - break; - - case T_BitmapHeapScan: - { - Scan *scan; - scan = (Scan*)node; - - if(scan->ispartchild) - break; - if(scan->scanrelid != targetrel) - break; - - scan->ispartchild = true; - scan->childidx = partidx; - replace_partidx_bitmapheapscan(partitionparent,(Node*)scan->plan.lefttree,partidx); - //replace_target_relation((Node*)scan->scan.plan.lefttree,targetrel,partitionparent,partidx); - } - break; - - case T_SubqueryScan: - break; - - case T_FunctionScan: - case T_ValuesScan: - case T_CteScan: - case T_WorkTableScan: - case T_ForeignScan: - break; - - /* - * join nodes - */ - case T_NestLoop: - case T_MergeJoin: - case T_HashJoin: - { - Plan *join; - join = (Plan*)node; - replace_target_relation((Node*)join->lefttree,targetrel,partitionparent,partidx); - replace_target_relation((Node*)join->righttree,targetrel,partitionparent,partidx); - } - break; - - /* - * materialization nodes - */ - case T_Material: - case T_Sort: - case T_Hash: - { - Plan *mat = (Plan*)node; - replace_target_relation((Node*)mat->lefttree,targetrel,partitionparent,partidx); - } - break; - case T_Group: - case T_Agg: - case T_WindowAgg: - case T_Unique: - case T_SetOp: - case T_LockRows: - case T_Limit: - break; - case T_List: - { - List * list; - ListCell *cell; - Node *element; - - list = (List *)node; - foreach(cell,list) - { - element = (Node*)lfirst(cell); - replace_target_relation(element,targetrel,partitionparent,partidx); - } - } - break; - 
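Stepping back to the pruning code above: for integer interval partitioning, routing a value is pure arithmetic, partidx = (value - start) / step, and pruning_opexpr then widens one index into a set of candidate partitions, everything at or below it for < and <=, exactly it for =, everything at or above it for >= and >. A standalone sketch of that widening, assuming integer keys and at most 64 partitions so a plain bitmask can stand in for the Bitmapset (the real code has no such limit):

    #include <stdio.h>
    #include <stdint.h>

    enum { OP_LT, OP_LE, OP_EQ, OP_GE, OP_GT };

    /*
     * Bitmask of partitions that may hold rows satisfying "key <op> value",
     * for partitions [start, start+step), [start+step, start+2*step), ...
     * Assumes nparts <= 64.
     */
    static uint64_t
    prune_int(int64_t start, int step, int nparts, int op, int64_t value)
    {
        int64_t  gap;
        int      idx, i;
        uint64_t res = 0;

        if (value < start)
            gap = -1;                   /* below the first partition */
        else
            gap = (value - start) / step;

        /* "key < boundary" does not reach the partition starting at boundary */
        if (op == OP_LT && gap >= 0 && (value - start) % step == 0)
            gap--;

        if (op == OP_EQ)
            return (gap >= 0 && gap < nparts) ? (UINT64_C(1) << gap) : 0;

        if (op == OP_LT || op == OP_LE)
        {
            if (gap < 0)
                return 0;               /* nothing can lie below the range */
            idx = (gap >= nparts) ? nparts - 1 : (int) gap;
            for (i = 0; i <= idx; i++)
                res |= UINT64_C(1) << i;
            return res;
        }

        /* OP_GE / OP_GT */
        if (gap >= nparts)
            return 0;                   /* nothing can lie above the range */
        idx = (gap < 0) ? 0 : (int) gap;
        for (i = idx; i < nparts; i++)
            res |= UINT64_C(1) << i;
        return res;
    }

    int
    main(void)
    {
        /* four partitions of width 100 starting at 0: [0,100) [100,200) ... */
        printf("%#llx\n", (unsigned long long) prune_int(0, 100, 4, OP_LE, 150)); /* 0x3 */
        printf("%#llx\n", (unsigned long long) prune_int(0, 100, 4, OP_EQ, 250)); /* 0x4 */
        printf("%#llx\n", (unsigned long long) prune_int(0, 100, 4, OP_GT, 199)); /* 0xe */
        return 0;
    }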
case T_RemoteSubplan: - { - RemoteSubplan *plan = (RemoteSubplan *)node; - - plan->cursor = get_internal_cursor(); - - replace_target_relation((Node*)((Plan *)plan)->lefttree,targetrel,partitionparent,partidx); - } - break; - case T_RemoteQuery: - elog(ERROR,"internal error: update partitioned parent table is forbidden in coordinator"); - break; - default: - elog(ERROR, "unrecognized node type: %d", (int) nodeTag(node)); - break; - } -} - -void -replace_partidx_bitmapheapscan(Relation relation, Node *plan, int partidx) -{ - switch(nodeTag(plan)) - { - case T_BitmapAnd: - { - List *planlist; - planlist = ((BitmapAnd*)plan)->bitmapplans; - replace_partidx_bitmapheapscan(relation,(Node*)planlist, partidx); - } - break; - case T_BitmapOr: - { - List *planlist; - planlist = ((BitmapOr*)plan)->bitmapplans; - replace_partidx_bitmapheapscan(relation,(Node*)planlist, partidx); - } - break; - case T_BitmapIndexScan: - { - Scan *sscan; - BitmapIndexScan *idxscan_child; - - sscan = (Scan *)plan; - sscan->ispartchild = true; - sscan->childidx = partidx; - - idxscan_child = (BitmapIndexScan *)plan; - idxscan_child->indexid = RelationGetPartitionIndex(relation,idxscan_child->indexid,partidx); - } - break; - case T_List: - { - List * list; - ListCell *cell; - Node *scan; - - list = (List *)plan; - foreach(cell,list) - { - scan = (Node*)lfirst(cell); - replace_partidx_bitmapheapscan(relation, scan, partidx); - } - } - break; - default: - elog(ERROR, "internal error: BitmapHeapScan cannot have this subplan[%d]", nodeTag(plan)); - break; - } -} - -int32 -get_timestamptz_gap(TimestampTz value, int32 interval) -{ - int32 gap; - fsec_t fsec; - struct pg_tm user_time; - - if(timestamp2tm(value, NULL, &user_time, &fsec, NULL, NULL) != 0) - ereport(ERROR, - (errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE), - errmsg("timestamp out of range"))); - - - switch (interval) - { - case IntervalType_Year: - { - gap = get_monthesofyear(g_partition_base_time.tm_year, g_partition_base_time.tm_mon, - user_time.tm_year, 1); - break; - } - - case IntervalType_Month: - { - gap = get_monthesofyear(g_partition_base_time.tm_year, g_partition_base_time.tm_mon, - user_time.tm_year, user_time.tm_mon); - break; - } - - case IntervalType_Day: - { - gap = get_daysofyear(g_partition_base_time.tm_year, g_partition_base_time.tm_mon, g_partition_base_time.tm_mday, - user_time.tm_year, user_time.tm_mon, user_time.tm_mday); - break; - } - - default: - { - ereport(ERROR, - (errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE), - errmsg("partition interval %d not support hot and cold seperation", interval))); - } - } - return gap; -} - -int32 -get_timestamptz_diff(TimestampTz value, int32 interval) -{ - int32 gap1; - int32 gap2; - TimestampTz current_tmstamp; - fsec_t fsec; - struct pg_tm current_time; - struct pg_tm user_time; - - if(timestamp2tm(value, NULL, &user_time, &fsec, NULL, NULL) != 0) - { - ereport(ERROR, - (errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE), - errmsg("timestamp out of range"))); - } - - current_tmstamp = GetCurrentTimestamp(); - if(timestamp2tm(current_tmstamp, NULL, ¤t_time, &fsec, NULL, NULL) != 0) - { - ereport(ERROR, - (errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE), - errmsg("timestamp out of range"))); - } - - switch (interval) - { - case IntervalType_Month: - { - gap1 = get_monthesofyear(g_partition_base_time.tm_year, g_partition_base_time.tm_mon, - current_time.tm_year, current_time.tm_mon); - - gap2 = get_monthesofyear(g_partition_base_time.tm_year, g_partition_base_time.tm_mon, - user_time.tm_year, user_time.tm_mon); - break; 
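Both gaps computed above are month counts measured from the fixed 1970-01 base in g_partition_base_time; dividing such a gap by the partition step yields the partition index, and the difference between the gap for the current time and the gap for a row's timestamp drives the hot/cold decision. A minimal sketch of the month arithmetic:

    #include <stdio.h>

    /* months from (y1, m1) to (y2, m2); -1 if the end lies before the start */
    static int
    month_gap(int y1, int m1, int y2, int m2)
    {
        if (y2 < y1 || (y2 == y1 && m2 < m1))
            return -1;
        return (y2 - y1) * 12 + (m2 - m1);
    }

    int
    main(void)
    {
        int base_year = 1970, base_month = 1;    /* the g_partition_base_time origin */
        int step = 1;                            /* one partition per month */
        int gap = month_gap(base_year, base_month, 2021, 3);

        printf("gap=%d partidx=%d\n", gap, gap / step);   /* gap=614 partidx=614 */
        return 0;
    }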
- } - - case IntervalType_Day: - { - gap1 = get_daysofyear(g_partition_base_time.tm_year, g_partition_base_time.tm_mon, g_partition_base_time.tm_mday, - current_time.tm_year, current_time.tm_mon, current_time.tm_mday); - gap2 = get_daysofyear(g_partition_base_time.tm_year, g_partition_base_time.tm_mon, g_partition_base_time.tm_mday, - user_time.tm_year, user_time.tm_mon, user_time.tm_mday); - break; - } - - default: - { - ereport(ERROR, - (errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE), - errmsg("partition interval %d not support hot and cold seperation", interval))); - } - } - - - return gap1 - gap2; -} - -int32 -date_diff(struct pg_tm *user_time) -{ - int32 gap1; - int32 gap2; - TimestampTz current_tmstamp; - fsec_t fsec; - struct pg_tm current_time; - - current_tmstamp = GetCurrentTimestamp(); - if(timestamp2tm(current_tmstamp, NULL, ¤t_time, &fsec, NULL, NULL) != 0) - { - ereport(ERROR, - (errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE), - errmsg("timestamp out of range"))); - } - - gap1 = get_monthesofyear(g_partition_base_time.tm_year, g_partition_base_time.tm_mon, - current_time.tm_year, current_time.tm_mon); - - gap2 = get_monthesofyear(g_partition_base_time.tm_year, g_partition_base_time.tm_mon, - user_time->tm_year, user_time->tm_mon); - - - - return gap1 - gap2; -} - -int32 -date_diff_indays(struct pg_tm *user_time) -{ - int32 gap1; - int32 gap2; - TimestampTz current_tmstamp; - fsec_t fsec; - struct pg_tm current_time; - - current_tmstamp = GetCurrentTimestamp(); - if(timestamp2tm(current_tmstamp, NULL, ¤t_time, &fsec, NULL, NULL) != 0) - { - ereport(ERROR, - (errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE), - errmsg("timestamp out of range"))); - } - - gap1 = get_daysofyear(g_partition_base_time.tm_year, g_partition_base_time.tm_mon, g_partition_base_time.tm_mday, - current_time.tm_year, current_time.tm_mon, current_time.tm_mday); - - - gap2 = get_daysofyear(g_partition_base_time.tm_year, g_partition_base_time.tm_mon, g_partition_base_time.tm_mday, - user_time->tm_year, user_time->tm_mon, user_time->tm_mday); - - return gap1 - gap2; -} - -int get_months_away_from_base(struct pg_tm * user_tm) -{ - return get_monthesofyear(g_partition_base_time.tm_year, g_partition_base_time.tm_mon, - user_tm->tm_year, user_tm->tm_mon); -} - -int get_days_away_from_base(struct pg_tm * user_tm) -{ - return get_daysofyear(g_partition_base_time.tm_year, g_partition_base_time.tm_mon, g_partition_base_time.tm_mday, - user_tm->tm_year, user_tm->tm_mon, user_tm->tm_mday); -} - -bool is_sec_meet_temp_cold_date(TimestampTz secvalue, int32 interval, int step, TimestampTz startValue) -{// #lizard forgives - bool ret; - fsec_t fsec; - struct pg_tm sec_time; - - if(timestamp2tm(secvalue, NULL, &sec_time, &fsec, NULL, NULL) != 0) - { - ereport(ERROR, - (errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE), - errmsg("timestamp out of range"))); - } - - switch (interval) - { - case IntervalType_Year: - { - ret = (g_TempColdDataTime.tm_year == sec_time.tm_year); - break; - } - case IntervalType_Month: - { - ret = (g_TempColdDataTime.tm_year == sec_time.tm_year) - && (g_TempColdDataTime.tm_mon == sec_time.tm_mon); - break; - } - - case IntervalType_Day: - { - ret = (g_TempColdDataTime.tm_year == sec_time.tm_year) - && (g_TempColdDataTime.tm_mon == sec_time.tm_mon) - && (g_TempColdDataTime.tm_mday == sec_time.tm_mday); - if (!ret) - { - struct pg_tm start_time; - - if(timestamp2tm(startValue, NULL, &start_time, &fsec, NULL, NULL) != 0) - { - ereport(ERROR, - (errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE), - errmsg("timestamp out 
of range"))); - } - - ret = is_first_day_from_start(step, interval, &start_time, &sec_time); - if (ret) - { - if (g_TempColdDataTime.tm_year + 1 == sec_time.tm_year && - g_TempColdDataTime.tm_mon == 12 && - g_TempColdDataTime.tm_mday == 31) - { - ret = true; - } - else - { - ret = false; - } - } - } - - break; - } - - default: - { - ereport(ERROR, - (errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE), - errmsg("partition interval %d not support hot and cold seperation", interval))); - } - } - - return ret; -} - -int32 GetPartitionIndex(TimestampTz start, int step, int steptype, int partitions, TimestampTz value) -{ - return find_partidx_by_timestamp(start, step, steptype, partitions, value, QULIFICATION_TYPE_EQUAL); -} - -/* is the first day of next year from start year */ -bool -is_first_day_from_start(int step, int steptype, struct pg_tm *start_time, struct pg_tm *current_time) -{ - bool result = false; - - /* partition by one day */ - if (step == 1 && steptype == IntervalType_Day) - { - if (current_time->tm_year == start_time->tm_year + 1 && current_time->tm_mon == 1 && - current_time->tm_mday == 1) - { - result = true; - } - } - - return result; -} -#endif +/*------------------------------------------------------------------------- + * + * ruleutils.c + * Functions to convert stored expressions/querytrees back to + * source text + * + * Portions Copyright (c) 2012-2014, TransLattice, Inc. + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/utils/adt/ruleutils.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include +#include +#include + +#ifdef PGXC +#include "access/reloptions.h" +#endif /* PGXC */ +#include "access/amapi.h" +#include "access/htup_details.h" +#include "access/sysattr.h" +#include "catalog/dependency.h" +#include "catalog/indexing.h" +#include "catalog/partition.h" +#include "catalog/pg_aggregate.h" +#include "catalog/pg_am.h" +#include "catalog/pg_authid.h" +#ifdef PGXC +#include "catalog/pg_aggregate.h" +#endif /* PGXC */ +#include "catalog/pg_collation.h" +#include "catalog/pg_constraint.h" +#include "catalog/pg_depend.h" +#include "catalog/pg_language.h" +#include "catalog/pg_opclass.h" +#include "catalog/pg_operator.h" +#include "catalog/pg_partitioned_table.h" +#include "catalog/pg_proc.h" +#include "catalog/pg_statistic_ext.h" +#include "catalog/pg_trigger.h" +#include "catalog/pg_type.h" +#include "commands/defrem.h" +#include "commands/tablespace.h" +#include "common/keywords.h" +#include "executor/spi.h" +#include "funcapi.h" +#ifdef PGXC +#include "nodes/execnodes.h" +#endif +#include "mb/pg_wchar.h" +#include "miscadmin.h" +#include "nodes/makefuncs.h" +#include "nodes/nodeFuncs.h" +#include "optimizer/tlist.h" +#include "parser/parse_node.h" +#include "parser/parse_agg.h" +#include "parser/parse_func.h" +#include "parser/parse_oper.h" +#include "parser/parse_type.h" +#include "parser/parser.h" +#include "parser/parsetree.h" +#ifdef PGXC +#include "pgxc/pgxc.h" +#include "pgxc/planner.h" +#endif +#include "rewrite/rewriteHandler.h" +#include "rewrite/rewriteManip.h" +#include "rewrite/rewriteSupport.h" +#include "utils/array.h" +#include "utils/builtins.h" +#include "utils/fmgroids.h" +#include "utils/hsearch.h" +#include "utils/lsyscache.h" +#include "utils/rel.h" +#include "utils/ruleutils.h" +#include "utils/snapmgr.h" +#include "utils/syscache.h" +#include 
"utils/tqual.h" +#include "utils/typcache.h" +#include "utils/varlena.h" +#include "utils/xml.h" +#ifdef __TBASE__ +#include "optimizer/planmain.h" +#endif +#ifdef __COLD_HOT__ +#include "postmaster/postmaster.h" +#endif + +/* ---------- + * Pretty formatting constants + * ---------- + */ + +/* Indent counts */ +#define PRETTYINDENT_STD 8 +#define PRETTYINDENT_JOIN 4 +#define PRETTYINDENT_VAR 4 + +#define PRETTYINDENT_LIMIT 40 /* wrap limit */ + +/* Pretty flags */ +#define PRETTYFLAG_PAREN 1 +#define PRETTYFLAG_INDENT 2 + +/* Default line length for pretty-print wrapping: 0 means wrap always */ +#define WRAP_COLUMN_DEFAULT 0 + +/* macro to test if pretty action needed */ +#define PRETTY_PAREN(context) ((context)->prettyFlags & PRETTYFLAG_PAREN) +#define PRETTY_INDENT(context) ((context)->prettyFlags & PRETTYFLAG_INDENT) + + +#ifdef __TBASE__ +static int daysofmonth[13] = {0,31,29,31,30,31,30,31,31,30,31,30,31}; + +static struct pg_tm g_partition_base_time = { 0, + 0, + 0, + 1, + 1, /* origin 0, not 1 */ + 1970, /* relative to 1900 */ + 1, + 1, + 0, + 0, + NULL + }; +#endif + +/* ---------- + * Local data types + * ---------- + */ + +/* Context info needed for invoking a recursive querytree display routine */ +typedef struct +{ + StringInfo buf; /* output buffer to append to */ + List *namespaces; /* List of deparse_namespace nodes */ + List *windowClause; /* Current query level's WINDOW clause */ + List *windowTList; /* targetlist for resolving WINDOW clause */ + int prettyFlags; /* enabling of pretty-print functions */ + int wrapColumn; /* max line length, or -1 for no limit */ + int indentLevel; /* current indent level for prettyprint */ + bool varprefix; /* TRUE to print prefixes on Vars */ + ParseExprKind special_exprkind; /* set only for exprkinds needing special + * handling */ +#ifdef PGXC + bool finalise_aggs; /* should Datanode finalise the aggregates? */ + bool sortgroup_colno;/* instead of expression use resno for + * sortgrouprefs. + */ +#endif /* PGXC */ +} deparse_context; + +/* + * Each level of query context around a subtree needs a level of Var namespace. + * A Var having varlevelsup=N refers to the N'th item (counting from 0) in + * the current context's namespaces list. + * + * The rangetable is the list of actual RTEs from the query tree, and the + * cte list is the list of actual CTEs. + * + * rtable_names holds the alias name to be used for each RTE (either a C + * string, or NULL for nameless RTEs such as unnamed joins). + * rtable_columns holds the column alias names to be used for each RTE. + * + * In some cases we need to make names of merged JOIN USING columns unique + * across the whole query, not only per-RTE. If so, unique_using is TRUE + * and using_names is a list of C strings representing names already assigned + * to USING columns. + * + * When deparsing plan trees, there is always just a single item in the + * deparse_namespace list (since a plan tree never contains Vars with + * varlevelsup > 0). We store the PlanState node that is the immediate + * parent of the expression to be deparsed, as well as a list of that + * PlanState's ancestors. In addition, we store its outer and inner subplan + * state nodes, as well as their plan nodes' targetlists, and the index tlist + * if the current plan node might contain INDEX_VAR Vars. (These fields could + * be derived on-the-fly from the current PlanState, but it seems notationally + * clearer to set them up as separate fields.) 
+ */ +typedef struct +{ + List *rtable; /* List of RangeTblEntry nodes */ + List *rtable_names; /* Parallel list of names for RTEs */ + List *rtable_columns; /* Parallel list of deparse_columns structs */ + List *ctes; /* List of CommonTableExpr nodes */ + /* Workspace for column alias assignment: */ + bool unique_using; /* Are we making USING names globally unique */ + List *using_names; /* List of assigned names for USING columns */ + /* Remaining fields are used only when deparsing a Plan tree: */ + PlanState *planstate; /* immediate parent of current expression */ + List *ancestors; /* ancestors of planstate */ + PlanState *outer_planstate; /* outer subplan state, or NULL if none */ + PlanState *inner_planstate; /* inner subplan state, or NULL if none */ + List *outer_tlist; /* referent for OUTER_VAR Vars */ + List *inner_tlist; /* referent for INNER_VAR Vars */ + List *index_tlist; /* referent for INDEX_VAR Vars */ +} deparse_namespace; + +/* + * Per-relation data about column alias names. + * + * Selecting aliases is unreasonably complicated because of the need to dump + * rules/views whose underlying tables may have had columns added, deleted, or + * renamed since the query was parsed. We must nonetheless print the rule/view + * in a form that can be reloaded and will produce the same results as before. + * + * For each RTE used in the query, we must assign column aliases that are + * unique within that RTE. SQL does not require this of the original query, + * but due to factors such as *-expansion we need to be able to uniquely + * reference every column in a decompiled query. As long as we qualify all + * column references, per-RTE uniqueness is sufficient for that. + * + * However, we can't ensure per-column name uniqueness for unnamed join RTEs, + * since they just inherit column names from their input RTEs, and we can't + * rename the columns at the join level. Most of the time this isn't an issue + * because we don't need to reference the join's output columns as such; we + * can reference the input columns instead. That approach can fail for merged + * JOIN USING columns, however, so when we have one of those in an unnamed + * join, we have to make that column's alias globally unique across the whole + * query to ensure it can be referenced unambiguously. + * + * Another problem is that a JOIN USING clause requires the columns to be + * merged to have the same aliases in both input RTEs, and that no other + * columns in those RTEs or their children conflict with the USING names. + * To handle that, we do USING-column alias assignment in a recursive + * traversal of the query's jointree. When descending through a JOIN with + * USING, we preassign the USING column names to the child columns, overriding + * other rules for column alias assignment. We also mark each RTE with a list + * of all USING column names selected for joins containing that RTE, so that + * when we assign other columns' aliases later, we can avoid conflicts. + * + * Another problem is that if a JOIN's input tables have had columns added or + * deleted since the query was parsed, we must generate a column alias list + * for the join that matches the current set of input columns --- otherwise, a + * change in the number of columns in the left input would throw off matching + * of aliases to columns of the right input. Thus, positions in the printable + * column alias list are not necessarily one-for-one with varattnos of the + * JOIN, so we need a separate new_colnames[] array for printing purposes. 
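The alias-selection rules described above come down to: propose a name, and if it is already taken within the relevant scope, append a numeric suffix until it is unique, which is what colname_is_unique/make_colname_unique and the NameHashEntry counter further down implement against a hash table. A self-contained sketch of that uniquification loop, using a plain string array instead of the real hash table; all helper names here are invented for the demo:

#include <stdio.h>
#include <string.h>

#define MAX_NAMES 16
#define NAME_LEN  64

/* demo-sized registry of names already assigned; no overflow checks */
static char used_names[MAX_NAMES][NAME_LEN];
static int  n_used = 0;

static int
name_in_use(const char *name)
{
    int i;

    for (i = 0; i < n_used; i++)
        if (strcmp(used_names[i], name) == 0)
            return 1;
    return 0;
}

/* append "_N" with increasing N until the name is unique, then remember it */
static const char *
make_unique(const char *colname)
{
    static char buf[NAME_LEN];
    int         counter = 0;

    strncpy(buf, colname, sizeof(buf) - 1);
    buf[sizeof(buf) - 1] = '\0';
    while (name_in_use(buf))
        snprintf(buf, sizeof(buf), "%s_%d", colname, ++counter);

    strcpy(used_names[n_used++], buf);
    return buf;
}

int
main(void)
{
    printf("%s\n", make_unique("f1"));   /* f1   */
    printf("%s\n", make_unique("f1"));   /* f1_1 */
    printf("%s\n", make_unique("f1"));   /* f1_2 */
    return 0;
}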
+ */ +typedef struct +{ + /* + * colnames is an array containing column aliases to use for columns that + * existed when the query was parsed. Dropped columns have NULL entries. + * This array can be directly indexed by varattno to get a Var's name. + * + * Non-NULL entries are guaranteed unique within the RTE, *except* when + * this is for an unnamed JOIN RTE. In that case we merely copy up names + * from the two input RTEs. + * + * During the recursive descent in set_using_names(), forcible assignment + * of a child RTE's column name is represented by pre-setting that element + * of the child's colnames array. So at that stage, NULL entries in this + * array just mean that no name has been preassigned, not necessarily that + * the column is dropped. + */ + int num_cols; /* length of colnames[] array */ + char **colnames; /* array of C strings and NULLs */ + + /* + * new_colnames is an array containing column aliases to use for columns + * that would exist if the query was re-parsed against the current + * definitions of its base tables. This is what to print as the column + * alias list for the RTE. This array does not include dropped columns, + * but it will include columns added since original parsing. Indexes in + * it therefore have little to do with current varattno values. As above, + * entries are unique unless this is for an unnamed JOIN RTE. (In such an + * RTE, we never actually print this array, but we must compute it anyway + * for possible use in computing column names of upper joins.) The + * parallel array is_new_col marks which of these columns are new since + * original parsing. Entries with is_new_col false must match the + * non-NULL colnames entries one-for-one. + */ + int num_new_cols; /* length of new_colnames[] array */ + char **new_colnames; /* array of C strings */ + bool *is_new_col; /* array of bool flags */ + + /* This flag tells whether we should actually print a column alias list */ + bool printaliases; + + /* This list has all names used as USING names in joins above this RTE */ + List *parentUsing; /* names assigned to parent merged columns */ + + /* + * If this struct is for a JOIN RTE, we fill these fields during the + * set_using_names() pass to describe its relationship to its child RTEs. + * + * leftattnos and rightattnos are arrays with one entry per existing + * output column of the join (hence, indexable by join varattno). For a + * simple reference to a column of the left child, leftattnos[i] is the + * child RTE's attno and rightattnos[i] is zero; and conversely for a + * column of the right child. But for merged columns produced by JOIN + * USING/NATURAL JOIN, both leftattnos[i] and rightattnos[i] are nonzero. + * Also, if the column has been dropped, both are zero. + * + * If it's a JOIN USING, usingNames holds the alias names selected for the + * merged columns (these might be different from the original USING list, + * if we had to modify names to achieve uniqueness). 
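Per the leftattnos/rightattnos field comments above, each join output column is classified by its (leftattnos[i], rightattnos[i]) pair: only the left entry set, only the right entry set, both set for a merged USING/NATURAL column, or both zero for a dropped column. A standalone sketch of that classification; the sample attno arrays are invented for the demo:

#include <stdio.h>

static const char *
classify_join_col(int leftattno, int rightattno)
{
    if (leftattno != 0 && rightattno != 0)
        return "merged USING/NATURAL column";
    if (leftattno != 0)
        return "plain reference to left child";
    if (rightattno != 0)
        return "plain reference to right child";
    return "dropped column";
}

int
main(void)
{
    /* one entry per join output column, as in leftattnos[]/rightattnos[] */
    int leftattnos[]  = {1, 0, 2, 0};
    int rightattnos[] = {1, 3, 0, 0};
    int i;

    for (i = 0; i < 4; i++)
        printf("join attno %d: %s\n", i + 1,
               classify_join_col(leftattnos[i], rightattnos[i]));
    return 0;
}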
+ */ + int leftrti; /* rangetable index of left child */ + int rightrti; /* rangetable index of right child */ + int *leftattnos; /* left-child varattnos of join cols, or 0 */ + int *rightattnos; /* right-child varattnos of join cols, or 0 */ + List *usingNames; /* names assigned to merged columns */ +} deparse_columns; + +/* This macro is analogous to rt_fetch(), but for deparse_columns structs */ +#define deparse_columns_fetch(rangetable_index, dpns) \ + ((deparse_columns *) list_nth((dpns)->rtable_columns, (rangetable_index)-1)) + +/* + * Entry in set_rtable_names' hash table + */ +typedef struct +{ + char name[NAMEDATALEN]; /* Hash key --- must be first */ + int counter; /* Largest addition used so far for name */ +} NameHashEntry; + + +/* ---------- + * Global data + * ---------- + */ +static SPIPlanPtr plan_getrulebyoid = NULL; +static const char *query_getrulebyoid = "SELECT * FROM pg_catalog.pg_rewrite WHERE oid = $1"; +static SPIPlanPtr plan_getviewrule = NULL; +static const char *query_getviewrule = "SELECT * FROM pg_catalog.pg_rewrite WHERE ev_class = $1 AND rulename = $2"; + +/* GUC parameters */ +bool quote_all_identifiers = false; + + +/* ---------- + * Local functions + * + * Most of these functions used to use fixed-size buffers to build their + * results. Now, they take an (already initialized) StringInfo object + * as a parameter, and append their text output to its contents. + * ---------- + */ +static char *deparse_expression_pretty(Node *expr, List *dpcontext, + bool forceprefix, bool showimplicit, + int prettyFlags, int startIndent); +static char *pg_get_viewdef_worker(Oid viewoid, + int prettyFlags, int wrapColumn); +static char *pg_get_triggerdef_worker(Oid trigid, bool pretty); +static void decompile_column_index_array(Datum column_index_array, Oid relId, + StringInfo buf); +static char *pg_get_ruledef_worker(Oid ruleoid, int prettyFlags); +static char *pg_get_indexdef_worker(Oid indexrelid, int colno, + const Oid *excludeOps, + bool attrsOnly, bool showTblSpc, + int prettyFlags, bool missing_ok); +static char *pg_get_statisticsobj_worker(Oid statextid, bool missing_ok); +static char *pg_get_partkeydef_worker(Oid relid, int prettyFlags, + bool attrsOnly, bool missing_ok); +static char *pg_get_constraintdef_worker(Oid constraintId, bool fullCommand, + int prettyFlags, bool missing_ok); +static text *pg_get_expr_worker(text *expr, Oid relid, const char *relname, + int prettyFlags); +static int print_function_arguments(StringInfo buf, HeapTuple proctup, + bool print_table_args, bool print_defaults); +static void print_function_rettype(StringInfo buf, HeapTuple proctup); +static void print_function_trftypes(StringInfo buf, HeapTuple proctup); +static void set_rtable_names(deparse_namespace *dpns, List *parent_namespaces, + Bitmapset *rels_used); +static void set_deparse_for_query(deparse_namespace *dpns, Query *query, + List *parent_namespaces); +static void set_simple_column_names(deparse_namespace *dpns); +static bool has_dangerous_join_using(deparse_namespace *dpns, Node *jtnode); +static void set_using_names(deparse_namespace *dpns, Node *jtnode, + List *parentUsing); +static void set_relation_column_names(deparse_namespace *dpns, + RangeTblEntry *rte, + deparse_columns *colinfo); +static void set_join_column_names(deparse_namespace *dpns, RangeTblEntry *rte, + deparse_columns *colinfo); +static bool colname_is_unique(char *colname, deparse_namespace *dpns, + deparse_columns *colinfo); +static char *make_colname_unique(char *colname, deparse_namespace *dpns, + 
deparse_columns *colinfo); +static void expand_colnames_array_to(deparse_columns *colinfo, int n); +static void identify_join_columns(JoinExpr *j, RangeTblEntry *jrte, + deparse_columns *colinfo); +static void flatten_join_using_qual(Node *qual, + List **leftvars, List **rightvars); +static char *get_rtable_name(int rtindex, deparse_context *context); +static void set_deparse_planstate(deparse_namespace *dpns, PlanState *ps); +#ifdef PGXC +static void set_deparse_plan(deparse_namespace *dpns, Plan *plan); +#endif +static void push_child_plan(deparse_namespace *dpns, PlanState *ps, + deparse_namespace *save_dpns); +static void pop_child_plan(deparse_namespace *dpns, + deparse_namespace *save_dpns); +static void push_ancestor_plan(deparse_namespace *dpns, ListCell *ancestor_cell, + deparse_namespace *save_dpns); +static void pop_ancestor_plan(deparse_namespace *dpns, + deparse_namespace *save_dpns); +static void make_ruledef(StringInfo buf, HeapTuple ruletup, TupleDesc rulettc, + int prettyFlags); +static void make_viewdef(StringInfo buf, HeapTuple ruletup, TupleDesc rulettc, + int prettyFlags, int wrapColumn); +static void get_query_def(Query *query, StringInfo buf, List *parentnamespace, + TupleDesc resultDesc, + int prettyFlags, int wrapColumn, int startIndent +#ifdef PGXC + , bool finalise_aggregates, bool sortgroup_colno +#endif /* PGXC */ + ); +static void get_values_def(List *values_lists, deparse_context *context); +static void get_with_clause(Query *query, deparse_context *context); +static void get_select_query_def(Query *query, deparse_context *context, + TupleDesc resultDesc); +static void get_insert_query_def(Query *query, deparse_context *context); +static void get_update_query_def(Query *query, deparse_context *context); +static void get_update_query_targetlist_def(Query *query, List *targetList, + deparse_context *context, + RangeTblEntry *rte); +static void get_delete_query_def(Query *query, deparse_context *context); +static void get_utility_query_def(Query *query, deparse_context *context); +static void get_basic_select_query(Query *query, deparse_context *context, + TupleDesc resultDesc); +static void get_target_list(List *targetList, deparse_context *context, + TupleDesc resultDesc); +static void get_setop_query(Node *setOp, Query *query, + deparse_context *context, + TupleDesc resultDesc); +static Node *get_rule_sortgroupclause(Index ref, List *tlist, + bool force_colno, + deparse_context *context); +static void get_rule_groupingset(GroupingSet *gset, List *targetlist, + bool omit_parens, deparse_context *context); +static void get_rule_orderby(List *orderList, List *targetList, + bool force_colno, deparse_context *context); +static void get_rule_windowclause(Query *query, deparse_context *context); +static void get_rule_windowspec(WindowClause *wc, List *targetList, + deparse_context *context); +static char *get_variable(Var *var, int levelsup, bool istoplevel, + deparse_context *context); +static void get_special_variable(Node *node, deparse_context *context, + void *private); +static void resolve_special_varno(Node *node, deparse_context *context, + void *private, + void (*callback) (Node *, deparse_context *, void *)); +static Node *find_param_referent(Param *param, deparse_context *context, + deparse_namespace **dpns_p, ListCell **ancestor_cell_p); +static void get_parameter(Param *param, deparse_context *context); +static const char *get_simple_binary_op_name(OpExpr *expr); +static bool isSimpleNode(Node *node, Node *parentNode, int prettyFlags); +static void 
appendContextKeyword(deparse_context *context, const char *str, + int indentBefore, int indentAfter, int indentPlus); +static void removeStringInfoSpaces(StringInfo str); +static void get_rule_expr(Node *node, deparse_context *context, + bool showimplicit); +static void get_rule_expr_toplevel(Node *node, deparse_context *context, + bool showimplicit); +static void get_rule_expr_funccall(Node *node, deparse_context *context, + bool showimplicit); +static bool looks_like_function(Node *node); +static void get_oper_expr(OpExpr *expr, deparse_context *context); +static void get_func_expr(FuncExpr *expr, deparse_context *context, + bool showimplicit); +static void get_agg_expr(Aggref *aggref, deparse_context *context, + Aggref *original_aggref); +static void get_agg_combine_expr(Node *node, deparse_context *context, + void *private); +static void get_windowfunc_expr(WindowFunc *wfunc, deparse_context *context); +static void get_coercion_expr(Node *arg, deparse_context *context, + Oid resulttype, int32 resulttypmod, + Node *parentNode); +static void get_const_expr(Const *constval, deparse_context *context, + int showtype); +static void get_const_collation(Const *constval, deparse_context *context); +static void simple_quote_literal(StringInfo buf, const char *val); +static void get_sublink_expr(SubLink *sublink, deparse_context *context); +static void get_tablefunc(TableFunc *tf, deparse_context *context, + bool showimplicit); +static void get_from_clause(Query *query, const char *prefix, + deparse_context *context); +static void get_from_clause_item(Node *jtnode, Query *query, + deparse_context *context); +static void get_column_alias_list(deparse_columns *colinfo, + deparse_context *context); +static void get_from_clause_coldeflist(RangeTblFunction *rtfunc, + deparse_columns *colinfo, + deparse_context *context); +static void get_tablesample_def(TableSampleClause *tablesample, + deparse_context *context); +static void get_opclass_name(Oid opclass, Oid actual_datatype, + StringInfo buf); +static Node *processIndirection(Node *node, deparse_context *context); +static void printSubscripts(ArrayRef *aref, deparse_context *context); +static char *get_relation_name(Oid relid); +static char *generate_relation_name(Oid relid, List *namespaces); +static char *generate_qualified_relation_name(Oid relid); +static char *generate_function_name(Oid funcid, int nargs, + List *argnames, Oid *argtypes, + bool has_variadic, bool *use_variadic_p, + ParseExprKind special_exprkind); +static char *generate_operator_name(Oid operid, Oid arg1, Oid arg2); +static text *string_to_text(char *str); +static char *flatten_reloptions(Oid relid); + +#ifdef __TBASE__ +static Bitmapset *pruning_walker(Relation rel, Node *expr); +static Bitmapset *pruning_opexpr(Relation rel, OpExpr *expr); +static Bitmapset *get_full_pruning_result(Relation rel); +static int get_daysofmonth(int startmonth, int startday, + int endmonth, int endday); +#endif +#define only_marker(rte) ((rte)->inh ? 
"" : "ONLY ") + + +/* ---------- + * get_ruledef - Do it all and return a text + * that could be used as a statement + * to recreate the rule + * ---------- + */ +Datum +pg_get_ruledef(PG_FUNCTION_ARGS) +{ + Oid ruleoid = PG_GETARG_OID(0); + int prettyFlags; + char *res; + + prettyFlags = PRETTYFLAG_INDENT; + + res = pg_get_ruledef_worker(ruleoid, prettyFlags); + + if (res == NULL) + PG_RETURN_NULL(); + + PG_RETURN_TEXT_P(string_to_text(res)); +} + + +Datum +pg_get_ruledef_ext(PG_FUNCTION_ARGS) +{ + Oid ruleoid = PG_GETARG_OID(0); + bool pretty = PG_GETARG_BOOL(1); + int prettyFlags; + char *res; + + prettyFlags = pretty ? PRETTYFLAG_PAREN | PRETTYFLAG_INDENT : PRETTYFLAG_INDENT; + + res = pg_get_ruledef_worker(ruleoid, prettyFlags); + + if (res == NULL) + PG_RETURN_NULL(); + + PG_RETURN_TEXT_P(string_to_text(res)); +} + + +static char * +pg_get_ruledef_worker(Oid ruleoid, int prettyFlags) +{// #lizard forgives + Datum args[1]; + char nulls[1]; + int spirc; + HeapTuple ruletup; + TupleDesc rulettc; + StringInfoData buf; + + /* + * Do this first so that string is alloc'd in outer context not SPI's. + */ + initStringInfo(&buf); + + /* + * Connect to SPI manager + */ + if (SPI_connect() != SPI_OK_CONNECT) + elog(ERROR, "SPI_connect failed"); + + /* + * On the first call prepare the plan to lookup pg_rewrite. We read + * pg_rewrite over the SPI manager instead of using the syscache to be + * checked for read access on pg_rewrite. + */ + if (plan_getrulebyoid == NULL) + { + Oid argtypes[1]; + SPIPlanPtr plan; + + argtypes[0] = OIDOID; + plan = SPI_prepare(query_getrulebyoid, 1, argtypes); + if (plan == NULL) + elog(ERROR, "SPI_prepare failed for \"%s\"", query_getrulebyoid); + SPI_keepplan(plan); + plan_getrulebyoid = plan; + } + + /* + * Get the pg_rewrite tuple for this rule + */ + args[0] = ObjectIdGetDatum(ruleoid); + nulls[0] = ' '; + spirc = SPI_execute_plan(plan_getrulebyoid, args, nulls, true, 0); + if (spirc != SPI_OK_SELECT) + elog(ERROR, "failed to get pg_rewrite tuple for rule %u", ruleoid); + if (SPI_processed != 1) + { + /* + * There is no tuple data available here, just keep the output buffer + * empty. + */ + } + else + { + /* + * Get the rule's definition and put it into executor's memory + */ + ruletup = SPI_tuptable->vals[0]; + rulettc = SPI_tuptable->tupdesc; + make_ruledef(&buf, ruletup, rulettc, prettyFlags); + } + + /* + * Disconnect from SPI manager + */ + if (SPI_finish() != SPI_OK_FINISH) + elog(ERROR, "SPI_finish failed"); + + if (buf.len == 0) + return NULL; + + return buf.data; +} + + +/* ---------- + * get_viewdef - Mainly the same thing, but we + * only return the SELECT part of a view + * ---------- + */ +Datum +pg_get_viewdef(PG_FUNCTION_ARGS) +{ + /* By OID */ + Oid viewoid = PG_GETARG_OID(0); + int prettyFlags; + char *res; + + prettyFlags = PRETTYFLAG_INDENT; + + res = pg_get_viewdef_worker(viewoid, prettyFlags, WRAP_COLUMN_DEFAULT); + + if (res == NULL) + PG_RETURN_NULL(); + + PG_RETURN_TEXT_P(string_to_text(res)); +} + + +Datum +pg_get_viewdef_ext(PG_FUNCTION_ARGS) +{ + /* By OID */ + Oid viewoid = PG_GETARG_OID(0); + bool pretty = PG_GETARG_BOOL(1); + int prettyFlags; + char *res; + + prettyFlags = pretty ? 
PRETTYFLAG_PAREN | PRETTYFLAG_INDENT : PRETTYFLAG_INDENT; + + res = pg_get_viewdef_worker(viewoid, prettyFlags, WRAP_COLUMN_DEFAULT); + + if (res == NULL) + PG_RETURN_NULL(); + + PG_RETURN_TEXT_P(string_to_text(res)); +} + +Datum +pg_get_viewdef_wrap(PG_FUNCTION_ARGS) +{ + /* By OID */ + Oid viewoid = PG_GETARG_OID(0); + int wrap = PG_GETARG_INT32(1); + int prettyFlags; + char *res; + + /* calling this implies we want pretty printing */ + prettyFlags = PRETTYFLAG_PAREN | PRETTYFLAG_INDENT; + + res = pg_get_viewdef_worker(viewoid, prettyFlags, wrap); + + if (res == NULL) + PG_RETURN_NULL(); + + PG_RETURN_TEXT_P(string_to_text(res)); +} + +Datum +pg_get_viewdef_name(PG_FUNCTION_ARGS) +{ + /* By qualified name */ + text *viewname = PG_GETARG_TEXT_PP(0); + int prettyFlags; + RangeVar *viewrel; + Oid viewoid; + char *res; + + prettyFlags = PRETTYFLAG_INDENT; + + /* Look up view name. Can't lock it - we might not have privileges. */ + viewrel = makeRangeVarFromNameList(textToQualifiedNameList(viewname)); + viewoid = RangeVarGetRelid(viewrel, NoLock, false); + + res = pg_get_viewdef_worker(viewoid, prettyFlags, WRAP_COLUMN_DEFAULT); + + if (res == NULL) + PG_RETURN_NULL(); + + PG_RETURN_TEXT_P(string_to_text(res)); +} + + +Datum +pg_get_viewdef_name_ext(PG_FUNCTION_ARGS) +{ + /* By qualified name */ + text *viewname = PG_GETARG_TEXT_PP(0); + bool pretty = PG_GETARG_BOOL(1); + int prettyFlags; + RangeVar *viewrel; + Oid viewoid; + char *res; + + prettyFlags = pretty ? PRETTYFLAG_PAREN | PRETTYFLAG_INDENT : PRETTYFLAG_INDENT; + + /* Look up view name. Can't lock it - we might not have privileges. */ + viewrel = makeRangeVarFromNameList(textToQualifiedNameList(viewname)); + viewoid = RangeVarGetRelid(viewrel, NoLock, false); + + res = pg_get_viewdef_worker(viewoid, prettyFlags, WRAP_COLUMN_DEFAULT); + + if (res == NULL) + PG_RETURN_NULL(); + + PG_RETURN_TEXT_P(string_to_text(res)); +} + +/* + * Common code for by-OID and by-name variants of pg_get_viewdef + */ +static char * +pg_get_viewdef_worker(Oid viewoid, int prettyFlags, int wrapColumn) +{// #lizard forgives + Datum args[2]; + char nulls[2]; + int spirc; + HeapTuple ruletup; + TupleDesc rulettc; + StringInfoData buf; + + /* + * Do this first so that string is alloc'd in outer context not SPI's. + */ + initStringInfo(&buf); + + /* + * Connect to SPI manager + */ + if (SPI_connect() != SPI_OK_CONNECT) + elog(ERROR, "SPI_connect failed"); + + /* + * On the first call prepare the plan to lookup pg_rewrite. We read + * pg_rewrite over the SPI manager instead of using the syscache to be + * checked for read access on pg_rewrite. + */ + if (plan_getviewrule == NULL) + { + Oid argtypes[2]; + SPIPlanPtr plan; + + argtypes[0] = OIDOID; + argtypes[1] = NAMEOID; + plan = SPI_prepare(query_getviewrule, 2, argtypes); + if (plan == NULL) + elog(ERROR, "SPI_prepare failed for \"%s\"", query_getviewrule); + SPI_keepplan(plan); + plan_getviewrule = plan; + } + + /* + * Get the pg_rewrite tuple for the view's SELECT rule + */ + args[0] = ObjectIdGetDatum(viewoid); + args[1] = DirectFunctionCall1(namein, CStringGetDatum(ViewSelectRuleName)); + nulls[0] = ' '; + nulls[1] = ' '; + spirc = SPI_execute_plan(plan_getviewrule, args, nulls, true, 0); + if (spirc != SPI_OK_SELECT) + elog(ERROR, "failed to get pg_rewrite tuple for view %u", viewoid); + if (SPI_processed != 1) + { + /* + * There is no tuple data available here, just keep the output buffer + * empty. 
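Both the rule and view workers in this file rely on the same SPI idiom: prepare the pg_rewrite lookup on first use, promote the plan out of SPI's procedure memory with SPI_keepplan(), and cache the SPIPlanPtr in a static variable for later calls. A minimal sketch of that idiom in isolation, assuming a backend (extension) compilation context; the query text and variable names here are illustrative, not the ones used above:

#include "postgres.h"

#include "catalog/pg_type.h"
#include "executor/spi.h"

static SPIPlanPtr cached_plan = NULL;

/* run "SELECT ... WHERE oid = $1" through a plan prepared exactly once */
static void
run_cached_lookup(Oid target)
{
    Datum   values[1];
    char    nulls[1] = {' '};

    if (SPI_connect() != SPI_OK_CONNECT)
        elog(ERROR, "SPI_connect failed");

    if (cached_plan == NULL)
    {
        Oid         argtypes[1] = {OIDOID};
        SPIPlanPtr  plan;

        plan = SPI_prepare("SELECT * FROM pg_catalog.pg_class WHERE oid = $1",
                           1, argtypes);
        if (plan == NULL)
            elog(ERROR, "SPI_prepare failed");
        SPI_keepplan(plan);     /* keeps the plan alive across SPI_finish() */
        cached_plan = plan;
    }

    values[0] = ObjectIdGetDatum(target);
    if (SPI_execute_plan(cached_plan, values, nulls, true, 0) != SPI_OK_SELECT)
        elog(ERROR, "SPI_execute_plan failed");

    /* SPI_processed / SPI_tuptable would be inspected here */

    if (SPI_finish() != SPI_OK_FINISH)
        elog(ERROR, "SPI_finish failed");
}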
+ */ + } + else + { + /* + * Get the rule's definition and put it into executor's memory + */ + ruletup = SPI_tuptable->vals[0]; + rulettc = SPI_tuptable->tupdesc; + make_viewdef(&buf, ruletup, rulettc, prettyFlags, wrapColumn); + } + + /* + * Disconnect from SPI manager + */ + if (SPI_finish() != SPI_OK_FINISH) + elog(ERROR, "SPI_finish failed"); + + if (buf.len == 0) + return NULL; + + return buf.data; +} + +/* ---------- + * get_triggerdef - Get the definition of a trigger + * ---------- + */ +Datum +pg_get_triggerdef(PG_FUNCTION_ARGS) +{ + Oid trigid = PG_GETARG_OID(0); + char *res; + + res = pg_get_triggerdef_worker(trigid, false); + + if (res == NULL) + PG_RETURN_NULL(); + + PG_RETURN_TEXT_P(string_to_text(res)); +} + +Datum +pg_get_triggerdef_ext(PG_FUNCTION_ARGS) +{ + Oid trigid = PG_GETARG_OID(0); + bool pretty = PG_GETARG_BOOL(1); + char *res; + + res = pg_get_triggerdef_worker(trigid, pretty); + + if (res == NULL) + PG_RETURN_NULL(); + + PG_RETURN_TEXT_P(string_to_text(res)); +} + +static char * +pg_get_triggerdef_worker(Oid trigid, bool pretty) +{// #lizard forgives + HeapTuple ht_trig; + Form_pg_trigger trigrec; + StringInfoData buf; + Relation tgrel; + ScanKeyData skey[1]; + SysScanDesc tgscan; + int findx = 0; + char *tgname; + char *tgoldtable; + char *tgnewtable; + Oid argtypes[1]; /* dummy */ + Datum value; + bool isnull; + + /* + * Fetch the pg_trigger tuple by the Oid of the trigger + */ + tgrel = heap_open(TriggerRelationId, AccessShareLock); + + ScanKeyInit(&skey[0], + ObjectIdAttributeNumber, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(trigid)); + + tgscan = systable_beginscan(tgrel, TriggerOidIndexId, true, + NULL, 1, skey); + + ht_trig = systable_getnext(tgscan); + + if (!HeapTupleIsValid(ht_trig)) + { + systable_endscan(tgscan); + heap_close(tgrel, AccessShareLock); + return NULL; + } + + trigrec = (Form_pg_trigger) GETSTRUCT(ht_trig); + + /* + * Start the trigger definition. Note that the trigger's name should never + * be schema-qualified, but the trigger rel's name may be. + */ + initStringInfo(&buf); + + tgname = NameStr(trigrec->tgname); + appendStringInfo(&buf, "CREATE %sTRIGGER %s ", + OidIsValid(trigrec->tgconstraint) ? 
"CONSTRAINT " : "", + quote_identifier(tgname)); + + if (TRIGGER_FOR_BEFORE(trigrec->tgtype)) + appendStringInfoString(&buf, "BEFORE"); + else if (TRIGGER_FOR_AFTER(trigrec->tgtype)) + appendStringInfoString(&buf, "AFTER"); + else if (TRIGGER_FOR_INSTEAD(trigrec->tgtype)) + appendStringInfoString(&buf, "INSTEAD OF"); + else + elog(ERROR, "unexpected tgtype value: %d", trigrec->tgtype); + + if (TRIGGER_FOR_INSERT(trigrec->tgtype)) + { + appendStringInfoString(&buf, " INSERT"); + findx++; + } + if (TRIGGER_FOR_DELETE(trigrec->tgtype)) + { + if (findx > 0) + appendStringInfoString(&buf, " OR DELETE"); + else + appendStringInfoString(&buf, " DELETE"); + findx++; + } + if (TRIGGER_FOR_UPDATE(trigrec->tgtype)) + { + if (findx > 0) + appendStringInfoString(&buf, " OR UPDATE"); + else + appendStringInfoString(&buf, " UPDATE"); + findx++; + /* tgattr is first var-width field, so OK to access directly */ + if (trigrec->tgattr.dim1 > 0) + { + int i; + + appendStringInfoString(&buf, " OF "); + for (i = 0; i < trigrec->tgattr.dim1; i++) + { + char *attname; + + if (i > 0) + appendStringInfoString(&buf, ", "); + attname = get_relid_attribute_name(trigrec->tgrelid, + trigrec->tgattr.values[i]); + appendStringInfoString(&buf, quote_identifier(attname)); + } + } + } + if (TRIGGER_FOR_TRUNCATE(trigrec->tgtype)) + { + if (findx > 0) + appendStringInfoString(&buf, " OR TRUNCATE"); + else + appendStringInfoString(&buf, " TRUNCATE"); + findx++; + } + appendStringInfo(&buf, " ON %s ", + generate_relation_name(trigrec->tgrelid, NIL)); + + if (OidIsValid(trigrec->tgconstraint)) + { + if (OidIsValid(trigrec->tgconstrrelid)) + appendStringInfo(&buf, "FROM %s ", + generate_relation_name(trigrec->tgconstrrelid, NIL)); + if (!trigrec->tgdeferrable) + appendStringInfoString(&buf, "NOT "); + appendStringInfoString(&buf, "DEFERRABLE INITIALLY "); + if (trigrec->tginitdeferred) + appendStringInfoString(&buf, "DEFERRED "); + else + appendStringInfoString(&buf, "IMMEDIATE "); + } + + value = fastgetattr(ht_trig, Anum_pg_trigger_tgoldtable, + tgrel->rd_att, &isnull); + if (!isnull) + tgoldtable = NameStr(*((NameData *) DatumGetPointer(value))); + else + tgoldtable = NULL; + value = fastgetattr(ht_trig, Anum_pg_trigger_tgnewtable, + tgrel->rd_att, &isnull); + if (!isnull) + tgnewtable = NameStr(*((NameData *) DatumGetPointer(value))); + else + tgnewtable = NULL; + if (tgoldtable != NULL || tgnewtable != NULL) + { + appendStringInfoString(&buf, "REFERENCING "); + if (tgoldtable != NULL) + appendStringInfo(&buf, "OLD TABLE AS %s ", tgoldtable); + if (tgnewtable != NULL) + appendStringInfo(&buf, "NEW TABLE AS %s ", tgnewtable); + } + + if (TRIGGER_FOR_ROW(trigrec->tgtype)) + appendStringInfoString(&buf, "FOR EACH ROW "); + else + appendStringInfoString(&buf, "FOR EACH STATEMENT "); + + /* If the trigger has a WHEN qualification, add that */ + value = fastgetattr(ht_trig, Anum_pg_trigger_tgqual, + tgrel->rd_att, &isnull); + if (!isnull) + { + Node *qual; + char relkind; + deparse_context context; + deparse_namespace dpns; + RangeTblEntry *oldrte; + RangeTblEntry *newrte; + + appendStringInfoString(&buf, "WHEN ("); + + qual = stringToNode(TextDatumGetCString(value)); + + relkind = get_rel_relkind(trigrec->tgrelid); + + /* Build minimal OLD and NEW RTEs for the rel */ + oldrte = makeNode(RangeTblEntry); + oldrte->rtekind = RTE_RELATION; + oldrte->relid = trigrec->tgrelid; + oldrte->relkind = relkind; + oldrte->alias = makeAlias("old", NIL); + oldrte->eref = oldrte->alias; + oldrte->lateral = false; + oldrte->inh = false; + 
oldrte->inFromCl = true; + + newrte = makeNode(RangeTblEntry); + newrte->rtekind = RTE_RELATION; + newrte->relid = trigrec->tgrelid; + newrte->relkind = relkind; + newrte->alias = makeAlias("new", NIL); + newrte->eref = newrte->alias; + newrte->lateral = false; + newrte->inh = false; + newrte->inFromCl = true; + + /* Build two-element rtable */ + memset(&dpns, 0, sizeof(dpns)); + dpns.rtable = list_make2(oldrte, newrte); + dpns.ctes = NIL; + set_rtable_names(&dpns, NIL, NULL); + set_simple_column_names(&dpns); + + /* Set up context with one-deep namespace stack */ + context.buf = &buf; + context.namespaces = list_make1(&dpns); + context.windowClause = NIL; + context.windowTList = NIL; + context.varprefix = true; + context.prettyFlags = pretty ? PRETTYFLAG_PAREN | PRETTYFLAG_INDENT : PRETTYFLAG_INDENT; + context.wrapColumn = WRAP_COLUMN_DEFAULT; + context.indentLevel = PRETTYINDENT_STD; + context.special_exprkind = EXPR_KIND_NONE; + + get_rule_expr(qual, &context, false); + + appendStringInfoString(&buf, ") "); + } + + appendStringInfo(&buf, "EXECUTE PROCEDURE %s(", + generate_function_name(trigrec->tgfoid, 0, + NIL, argtypes, + false, NULL, EXPR_KIND_NONE)); + + if (trigrec->tgnargs > 0) + { + char *p; + int i; + + value = fastgetattr(ht_trig, Anum_pg_trigger_tgargs, + tgrel->rd_att, &isnull); + if (isnull) + elog(ERROR, "tgargs is null for trigger %u", trigid); + p = (char *) VARDATA_ANY(DatumGetByteaPP(value)); + for (i = 0; i < trigrec->tgnargs; i++) + { + if (i > 0) + appendStringInfoString(&buf, ", "); + simple_quote_literal(&buf, p); + /* advance p to next string embedded in tgargs */ + while (*p) + p++; + p++; + } + } + + /* We deliberately do not put semi-colon at end */ + appendStringInfoChar(&buf, ')'); + + /* Clean up */ + systable_endscan(tgscan); + + heap_close(tgrel, AccessShareLock); + + return buf.data; +} + +/* ---------- + * get_indexdef - Get the definition of an index + * + * In the extended version, there is a colno argument as well as pretty bool. + * if colno == 0, we want a complete index definition. + * if colno > 0, we only want the Nth index key's variable or expression. + * + * Note that the SQL-function versions of this omit any info about the + * index tablespace; this is intentional because pg_dump wants it that way. + * However pg_get_indexdef_string() includes the index tablespace. + * ---------- + */ +Datum +pg_get_indexdef(PG_FUNCTION_ARGS) +{ + Oid indexrelid = PG_GETARG_OID(0); + int prettyFlags; + char *res; + + prettyFlags = PRETTYFLAG_INDENT; + + res = pg_get_indexdef_worker(indexrelid, 0, NULL, false, false, + prettyFlags, true); + + if (res == NULL) + PG_RETURN_NULL(); + + PG_RETURN_TEXT_P(string_to_text(res)); +} + +Datum +pg_get_indexdef_ext(PG_FUNCTION_ARGS) +{ + Oid indexrelid = PG_GETARG_OID(0); + int32 colno = PG_GETARG_INT32(1); + bool pretty = PG_GETARG_BOOL(2); + int prettyFlags; + char *res; + + prettyFlags = pretty ? PRETTYFLAG_PAREN | PRETTYFLAG_INDENT : PRETTYFLAG_INDENT; + + res = pg_get_indexdef_worker(indexrelid, colno, NULL, colno != 0, false, + prettyFlags, true); + + if (res == NULL) + PG_RETURN_NULL(); + + PG_RETURN_TEXT_P(string_to_text(res)); +} + +/* + * Internal version for use by ALTER TABLE. + * Includes a tablespace clause in the result. + * Returns a palloc'd C string; no pretty-printing. 
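pg_get_triggerdef_worker above walks tgargs as a packed sequence of NUL-terminated strings: print the current one, then step past its terminator to reach the next. The same walk in a standalone form; the byte buffer below is a fabricated stand-in for the tgargs bytea payload:

#include <stdio.h>

int
main(void)
{
    /* three trigger arguments packed back-to-back, each NUL-terminated */
    const char  args[] = "10\0name\0t\0";
    int         nargs = 3;
    const char *p = args;
    int         i;

    for (i = 0; i < nargs; i++)
    {
        if (i > 0)
            printf(", ");
        printf("'%s'", p);

        /* advance p to the next string embedded in the buffer */
        while (*p)
            p++;
        p++;
    }
    printf("\n");       /* prints: '10', 'name', 't' */
    return 0;
}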
+ */ +char * +pg_get_indexdef_string(Oid indexrelid) +{ + return pg_get_indexdef_worker(indexrelid, 0, NULL, false, true, 0, false); +} + +/* Internal version that just reports the column definitions */ +char * +pg_get_indexdef_columns(Oid indexrelid, bool pretty) +{ + int prettyFlags; + + prettyFlags = pretty ? PRETTYFLAG_PAREN | PRETTYFLAG_INDENT : PRETTYFLAG_INDENT; + return pg_get_indexdef_worker(indexrelid, 0, NULL, true, false, + prettyFlags, false); +} + +/* + * Internal workhorse to decompile an index definition. + * + * This is now used for exclusion constraints as well: if excludeOps is not + * NULL then it points to an array of exclusion operator OIDs. + */ +static char * +pg_get_indexdef_worker(Oid indexrelid, int colno, + const Oid *excludeOps, + bool attrsOnly, bool showTblSpc, + int prettyFlags, bool missing_ok) +{// #lizard forgives + /* might want a separate isConstraint parameter later */ + bool isConstraint = (excludeOps != NULL); + HeapTuple ht_idx; + HeapTuple ht_idxrel; + HeapTuple ht_am; + Form_pg_index idxrec; + Form_pg_class idxrelrec; + Form_pg_am amrec; + IndexAmRoutine *amroutine; + List *indexprs; + ListCell *indexpr_item; + List *context; + Oid indrelid; + int keyno; + Datum indcollDatum; + Datum indclassDatum; + Datum indoptionDatum; + bool isnull; + oidvector *indcollation; + oidvector *indclass; + int2vector *indoption; + StringInfoData buf; + char *str; + char *sep; +#ifdef __TBASE__ + bool is_interval_child = false; + HeapTuple ht_parent_idx; +#endif + /* + * Fetch the pg_index tuple by the Oid of the index + */ + ht_idx = SearchSysCache1(INDEXRELID, ObjectIdGetDatum(indexrelid)); + if (!HeapTupleIsValid(ht_idx)) + { + if (missing_ok) + return NULL; + elog(ERROR, "cache lookup failed for index %u", indexrelid); + } + idxrec = (Form_pg_index) GETSTRUCT(ht_idx); + + indrelid = idxrec->indrelid; + Assert(indexrelid == idxrec->indexrelid); + + /* Must get indcollation, indclass, and indoption the hard way */ + indcollDatum = SysCacheGetAttr(INDEXRELID, ht_idx, + Anum_pg_index_indcollation, &isnull); + Assert(!isnull); + indcollation = (oidvector *) DatumGetPointer(indcollDatum); + + indclassDatum = SysCacheGetAttr(INDEXRELID, ht_idx, + Anum_pg_index_indclass, &isnull); + Assert(!isnull); + indclass = (oidvector *) DatumGetPointer(indclassDatum); + + indoptionDatum = SysCacheGetAttr(INDEXRELID, ht_idx, + Anum_pg_index_indoption, &isnull); + Assert(!isnull); + indoption = (int2vector *) DatumGetPointer(indoptionDatum); + + /* + * Fetch the pg_class tuple of the index relation + */ + ht_idxrel = SearchSysCache1(RELOID, ObjectIdGetDatum(indexrelid)); + if (!HeapTupleIsValid(ht_idxrel)) + elog(ERROR, "cache lookup failed for relation %u", indexrelid); + idxrelrec = (Form_pg_class) GETSTRUCT(ht_idxrel); + + /* + * Fetch the pg_am tuple of the index' access method + */ + ht_am = SearchSysCache1(AMOID, ObjectIdGetDatum(idxrelrec->relam)); + if (!HeapTupleIsValid(ht_am)) + elog(ERROR, "cache lookup failed for access method %u", + idxrelrec->relam); + amrec = (Form_pg_am) GETSTRUCT(ht_am); + + /* Fetch the index AM's API struct */ + amroutine = GetIndexAmRoutine(amrec->amhandler); + + /* + * Get the index expressions, if any. (NOTE: we do not use the relcache + * versions of the expressions and predicate, because we want to display + * non-const-folded expressions.) 
+ */ + if (!heap_attisnull(ht_idx, Anum_pg_index_indexprs, NULL)) + { + Datum exprsDatum; + bool isnull; + char *exprsString; + + exprsDatum = SysCacheGetAttr(INDEXRELID, ht_idx, + Anum_pg_index_indexprs, &isnull); + Assert(!isnull); + exprsString = TextDatumGetCString(exprsDatum); + indexprs = (List *) stringToNode(exprsString); + pfree(exprsString); + } + else + indexprs = NIL; + + indexpr_item = list_head(indexprs); + + context = deparse_context_for(get_relation_name(indrelid), indrelid); + + /* + * Start the index definition. Note that the index's name should never be + * schema-qualified, but the indexed rel's name may be. + */ + initStringInfo(&buf); + + if (!attrsOnly) + { + if (!isConstraint) + appendStringInfo(&buf, "CREATE %sINDEX %s ON %s USING %s (", + idxrec->indisunique ? "UNIQUE " : "", + quote_identifier(NameStr(idxrelrec->relname)), + generate_relation_name(indrelid, NIL), + quote_identifier(NameStr(amrec->amname))); + else /* currently, must be EXCLUDE constraint */ + appendStringInfo(&buf, "EXCLUDE USING %s (", + quote_identifier(NameStr(amrec->amname))); + } + + /* + * Report the indexed attributes + */ +#ifdef __TBASE__ + { + Relation rel = relation_open(indrelid, NoLock); + if (rel->rd_rel->relkind == RELKIND_RELATION && RELATION_IS_CHILD(rel)) + { + Oid parentIndexId = get_interval_parent_relid(indexrelid); + Oid parentId = get_interval_parent_relid(indrelid); + if (!OidIsValid(parentId)) + { + elog(ERROR, "could not get interval parent for relation %u", + indrelid); + } + indrelid = parentId; + + if (OidIsValid(parentIndexId)) + { + ht_parent_idx = SearchSysCache1(INDEXRELID, ObjectIdGetDatum(parentIndexId)); + if (!HeapTupleIsValid(ht_parent_idx)) + { + if (missing_ok) + return NULL; + elog(ERROR, "cache lookup failed for index %u", parentIndexId); + } + idxrec = (Form_pg_index) GETSTRUCT(ht_parent_idx); + is_interval_child = true; + } + } + heap_close(rel, NoLock); + } +#endif + sep = ""; + for (keyno = 0; keyno < idxrec->indnatts; keyno++) + { + AttrNumber attnum = idxrec->indkey.values[keyno]; + int16 opt = indoption->values[keyno]; + Oid keycoltype; + Oid keycolcollation; + + if (!colno) + appendStringInfoString(&buf, sep); + sep = ", "; + + if (attnum != 0) + { + /* Simple index column */ + char *attname; + int32 keycoltypmod; + + attname = get_relid_attribute_name(indrelid, attnum); + if (!colno || colno == keyno + 1) + appendStringInfoString(&buf, quote_identifier(attname)); + get_atttypetypmodcoll(indrelid, attnum, + &keycoltype, &keycoltypmod, + &keycolcollation); + } + else + { + /* expressional index */ + Node *indexkey; + + if (indexpr_item == NULL) + elog(ERROR, "too few entries in indexprs list"); + indexkey = (Node *) lfirst(indexpr_item); + indexpr_item = lnext(indexpr_item); + /* Deparse */ + str = deparse_expression_pretty(indexkey, context, false, false, + prettyFlags, 0); + if (!colno || colno == keyno + 1) + { + /* Need parens if it's not a bare function call */ + if (looks_like_function(indexkey)) + appendStringInfoString(&buf, str); + else + appendStringInfo(&buf, "(%s)", str); + } + keycoltype = exprType(indexkey); + keycolcollation = exprCollation(indexkey); + } + + if (!attrsOnly && (!colno || colno == keyno + 1)) + { + Oid indcoll; + + /* Add collation, if not default for column */ + indcoll = indcollation->values[keyno]; + if (OidIsValid(indcoll) && indcoll != keycolcollation) + appendStringInfo(&buf, " COLLATE %s", + generate_collation_name((indcoll))); + + /* Add the operator class name, if not default */ + 
get_opclass_name(indclass->values[keyno], keycoltype, &buf); + + /* Add options if relevant */ + if (amroutine->amcanorder) + { + /* if it supports sort ordering, report DESC and NULLS opts */ + if (opt & INDOPTION_DESC) + { + appendStringInfoString(&buf, " DESC"); + /* NULLS FIRST is the default in this case */ + if (!(opt & INDOPTION_NULLS_FIRST)) + appendStringInfoString(&buf, " NULLS LAST"); + } + else + { + if (opt & INDOPTION_NULLS_FIRST) + appendStringInfoString(&buf, " NULLS FIRST"); + } + } + + /* Add the exclusion operator if relevant */ + if (excludeOps != NULL) + appendStringInfo(&buf, " WITH %s", + generate_operator_name(excludeOps[keyno], + keycoltype, + keycoltype)); + } + } + + if (!attrsOnly) + { + appendStringInfoChar(&buf, ')'); + + /* + * If it has options, append "WITH (options)" + */ + str = flatten_reloptions(indexrelid); + if (str) + { + appendStringInfo(&buf, " WITH (%s)", str); + pfree(str); + } + + /* + * Print tablespace, but only if requested + */ + if (showTblSpc) + { + Oid tblspc; + + tblspc = get_rel_tablespace(indexrelid); + if (!OidIsValid(tblspc)) + tblspc = MyDatabaseTableSpace; + if (isConstraint) + appendStringInfoString(&buf, " USING INDEX"); + appendStringInfo(&buf, " TABLESPACE %s", + quote_identifier(get_tablespace_name(tblspc))); + } + + /* + * If it's a partial index, decompile and append the predicate + */ + if (!heap_attisnull(ht_idx, Anum_pg_index_indpred, NULL)) + { + Node *node; + Datum predDatum; + bool isnull; + char *predString; + + /* Convert text string to node tree */ + predDatum = SysCacheGetAttr(INDEXRELID, ht_idx, + Anum_pg_index_indpred, &isnull); + Assert(!isnull); + predString = TextDatumGetCString(predDatum); + node = (Node *) stringToNode(predString); + pfree(predString); + + /* Deparse */ + str = deparse_expression_pretty(node, context, false, false, + prettyFlags, 0); + if (isConstraint) + appendStringInfo(&buf, " WHERE (%s)", str); + else + appendStringInfo(&buf, " WHERE %s", str); + } + } + + /* Clean up */ + ReleaseSysCache(ht_idx); + ReleaseSysCache(ht_idxrel); + ReleaseSysCache(ht_am); +#ifdef __TBASE__ + if (is_interval_child) + { + ReleaseSysCache(ht_parent_idx); + } +#endif + return buf.data; +} + +/* + * pg_get_statisticsobjdef + * Get the definition of an extended statistics object + */ +Datum +pg_get_statisticsobjdef(PG_FUNCTION_ARGS) +{ + Oid statextid = PG_GETARG_OID(0); + char *res; + + res = pg_get_statisticsobj_worker(statextid, true); + + if (res == NULL) + PG_RETURN_NULL(); + + PG_RETURN_TEXT_P(string_to_text(res)); +} + +/* + * Internal workhorse to decompile an extended statistics object. + */ +static char * +pg_get_statisticsobj_worker(Oid statextid, bool missing_ok) +{// #lizard forgives + Form_pg_statistic_ext statextrec; + HeapTuple statexttup; + StringInfoData buf; + int colno; + char *nsp; + ArrayType *arr; + char *enabled; + Datum datum; + bool isnull; + bool ndistinct_enabled; + bool dependencies_enabled; + int i; + + statexttup = SearchSysCache1(STATEXTOID, ObjectIdGetDatum(statextid)); + + if (!HeapTupleIsValid(statexttup)) + { + if (missing_ok) + return NULL; + elog(ERROR, "cache lookup failed for statistics object %u", statextid); + } + + statextrec = (Form_pg_statistic_ext) GETSTRUCT(statexttup); + + initStringInfo(&buf); + + nsp = get_namespace_name(statextrec->stxnamespace); + appendStringInfo(&buf, "CREATE STATISTICS %s", + quote_qualified_identifier(nsp, + NameStr(statextrec->stxname))); + + /* + * Decode the stxkind column so that we know which stats types to print. 
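The stxkind decode that follows only emits a parenthesized types clause when at least one statistics kind is disabled, so fully-default statistics objects dump as plain CREATE STATISTICS. A standalone reduction of that decision; the kind letters are defined locally here as stand-ins for the STATS_EXT_* constants:

#include <stdio.h>
#include <string.h>

/* local stand-ins for the kind letters stored in stxkind */
#define KIND_NDISTINCT    'd'
#define KIND_DEPENDENCIES 'f'

/* build the optional "(kinds)" clause the same way the worker below does */
static void
build_types_clause(const char *kinds, int nkinds, char *out)
{
    int i;
    int ndistinct_enabled = 0;
    int dependencies_enabled = 0;

    for (i = 0; i < nkinds; i++)
    {
        if (kinds[i] == KIND_NDISTINCT)
            ndistinct_enabled = 1;
        if (kinds[i] == KIND_DEPENDENCIES)
            dependencies_enabled = 1;
    }

    out[0] = '\0';

    /* omit the clause entirely when every kind is enabled */
    if (!ndistinct_enabled || !dependencies_enabled)
    {
        strcat(out, " (");
        if (ndistinct_enabled)
            strcat(out, "ndistinct");
        else if (dependencies_enabled)
            strcat(out, "dependencies");
        strcat(out, ")");
    }
}

int
main(void)
{
    char clause[32];

    build_types_clause("df", 2, clause);
    printf("all kinds enabled: CREATE STATISTICS s1%s ON a, b FROM t\n", clause);

    build_types_clause("d", 1, clause);
    printf("ndistinct only:    CREATE STATISTICS s2%s ON a, b FROM t\n", clause);
    return 0;
}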
+ */ + datum = SysCacheGetAttr(STATEXTOID, statexttup, + Anum_pg_statistic_ext_stxkind, &isnull); + Assert(!isnull); + arr = DatumGetArrayTypeP(datum); + if (ARR_NDIM(arr) != 1 || + ARR_HASNULL(arr) || + ARR_ELEMTYPE(arr) != CHAROID) + elog(ERROR, "stxkind is not a 1-D char array"); + enabled = (char *) ARR_DATA_PTR(arr); + + ndistinct_enabled = false; + dependencies_enabled = false; + + for (i = 0; i < ARR_DIMS(arr)[0]; i++) + { + if (enabled[i] == STATS_EXT_NDISTINCT) + ndistinct_enabled = true; + if (enabled[i] == STATS_EXT_DEPENDENCIES) + dependencies_enabled = true; + } + + /* + * If any option is disabled, then we'll need to append the types clause + * to show which options are enabled. We omit the types clause on purpose + * when all options are enabled, so a pg_dump/pg_restore will create all + * statistics types on a newer postgres version, if the statistics had all + * options enabled on the original version. + */ + if (!ndistinct_enabled || !dependencies_enabled) + { + appendStringInfoString(&buf, " ("); + if (ndistinct_enabled) + appendStringInfoString(&buf, "ndistinct"); + else if (dependencies_enabled) + appendStringInfoString(&buf, "dependencies"); + appendStringInfoChar(&buf, ')'); + } + + appendStringInfoString(&buf, " ON "); + + for (colno = 0; colno < statextrec->stxkeys.dim1; colno++) + { + AttrNumber attnum = statextrec->stxkeys.values[colno]; + char *attname; + + if (colno > 0) + appendStringInfoString(&buf, ", "); + + attname = get_relid_attribute_name(statextrec->stxrelid, attnum); + + appendStringInfoString(&buf, quote_identifier(attname)); + } + + appendStringInfo(&buf, " FROM %s", + generate_relation_name(statextrec->stxrelid, NIL)); + + ReleaseSysCache(statexttup); + + return buf.data; +} + +/* + * pg_get_partkeydef + * + * Returns the partition key specification, ie, the following: + * + * PARTITION BY { RANGE | LIST } (column opt_collation opt_opclass [, ...]) + */ +Datum +pg_get_partkeydef(PG_FUNCTION_ARGS) +{ + Oid relid = PG_GETARG_OID(0); + char *res; + + res = pg_get_partkeydef_worker(relid, PRETTYFLAG_INDENT, false, true); + + if (res == NULL) + PG_RETURN_NULL(); + + PG_RETURN_TEXT_P(string_to_text(res)); +} + +/* Internal version that just reports the column definitions */ +char * +pg_get_partkeydef_columns(Oid relid, bool pretty) +{ + int prettyFlags; + + prettyFlags = pretty ? PRETTYFLAG_PAREN | PRETTYFLAG_INDENT : PRETTYFLAG_INDENT; + return pg_get_partkeydef_worker(relid, prettyFlags, true, false); +} + +/* + * Internal workhorse to decompile a partition key definition. 
+ */ +static char * +pg_get_partkeydef_worker(Oid relid, int prettyFlags, + bool attrsOnly, bool missing_ok) +{// #lizard forgives + Form_pg_partitioned_table form; + HeapTuple tuple; + oidvector *partclass; + oidvector *partcollation; + List *partexprs; + ListCell *partexpr_item; + List *context; + Datum datum; + bool isnull; + StringInfoData buf; + int keyno; + char *str; + char *sep; + + tuple = SearchSysCache1(PARTRELID, ObjectIdGetDatum(relid)); + if (!HeapTupleIsValid(tuple)) + { + if (missing_ok) + return NULL; + elog(ERROR, "cache lookup failed for partition key of %u", relid); + } + + form = (Form_pg_partitioned_table) GETSTRUCT(tuple); + + Assert(form->partrelid == relid); + + /* Must get partclass and partcollation the hard way */ + datum = SysCacheGetAttr(PARTRELID, tuple, + Anum_pg_partitioned_table_partclass, &isnull); + Assert(!isnull); + partclass = (oidvector *) DatumGetPointer(datum); + + datum = SysCacheGetAttr(PARTRELID, tuple, + Anum_pg_partitioned_table_partcollation, &isnull); + Assert(!isnull); + partcollation = (oidvector *) DatumGetPointer(datum); + + + /* + * Get the expressions, if any. (NOTE: we do not use the relcache + * versions of the expressions, because we want to display + * non-const-folded expressions.) + */ + if (!heap_attisnull(tuple, Anum_pg_partitioned_table_partexprs, NULL)) + { + Datum exprsDatum; + bool isnull; + char *exprsString; + + exprsDatum = SysCacheGetAttr(PARTRELID, tuple, + Anum_pg_partitioned_table_partexprs, &isnull); + Assert(!isnull); + exprsString = TextDatumGetCString(exprsDatum); + partexprs = (List *) stringToNode(exprsString); + + if (!IsA(partexprs, List)) + elog(ERROR, "unexpected node type found in partexprs: %d", + (int) nodeTag(partexprs)); + + pfree(exprsString); + } + else + partexprs = NIL; + + partexpr_item = list_head(partexprs); + context = deparse_context_for(get_relation_name(relid), relid); + + initStringInfo(&buf); + + switch (form->partstrat) + { + case PARTITION_STRATEGY_LIST: + if (!attrsOnly) + appendStringInfo(&buf, "LIST"); + break; + case PARTITION_STRATEGY_RANGE: + if (!attrsOnly) + appendStringInfo(&buf, "RANGE"); + break; + default: + elog(ERROR, "unexpected partition strategy: %d", + (int) form->partstrat); + } + + if (!attrsOnly) + appendStringInfo(&buf, " ("); + sep = ""; + for (keyno = 0; keyno < form->partnatts; keyno++) + { + AttrNumber attnum = form->partattrs.values[keyno]; + Oid keycoltype; + Oid keycolcollation; + Oid partcoll; + + appendStringInfoString(&buf, sep); + sep = ", "; + if (attnum != 0) + { + /* Simple attribute reference */ + char *attname; + int32 keycoltypmod; + + attname = get_relid_attribute_name(relid, attnum); + appendStringInfoString(&buf, quote_identifier(attname)); + get_atttypetypmodcoll(relid, attnum, + &keycoltype, &keycoltypmod, + &keycolcollation); + } + else + { + /* Expression */ + Node *partkey; + + if (partexpr_item == NULL) + elog(ERROR, "too few entries in partexprs list"); + partkey = (Node *) lfirst(partexpr_item); + partexpr_item = lnext(partexpr_item); + + /* Deparse */ + str = deparse_expression_pretty(partkey, context, false, false, + prettyFlags, 0); + /* Need parens if it's not a bare function call */ + if (looks_like_function(partkey)) + appendStringInfoString(&buf, str); + else + appendStringInfo(&buf, "(%s)", str); + + keycoltype = exprType(partkey); + keycolcollation = exprCollation(partkey); + } + + /* Add collation, if not default for column */ + partcoll = partcollation->values[keyno]; + if (!attrsOnly && OidIsValid(partcoll) && partcoll != 
keycolcollation) + appendStringInfo(&buf, " COLLATE %s", + generate_collation_name((partcoll))); + + /* Add the operator class name, if not default */ + if (!attrsOnly) + get_opclass_name(partclass->values[keyno], keycoltype, &buf); + } + + if (!attrsOnly) + appendStringInfoChar(&buf, ')'); + + /* Clean up */ + ReleaseSysCache(tuple); + + return buf.data; +} + +/* + * pg_get_partition_constraintdef + * + * Returns partition constraint expression as a string for the input relation + */ +Datum +pg_get_partition_constraintdef(PG_FUNCTION_ARGS) +{ + Oid relationId = PG_GETARG_OID(0); + Expr *constr_expr; + int prettyFlags; + List *context; + char *consrc; + + constr_expr = get_partition_qual_relid(relationId); + + /* Quick exit if not a partition */ + if (constr_expr == NULL) + PG_RETURN_NULL(); + + /* + * Deparse and return the constraint expression. + */ + prettyFlags = PRETTYFLAG_INDENT; + context = deparse_context_for(get_relation_name(relationId), relationId); + consrc = deparse_expression_pretty((Node *) constr_expr, context, false, + false, prettyFlags, 0); + + PG_RETURN_TEXT_P(string_to_text(consrc)); +} + +/* + * pg_get_constraintdef + * + * Returns the definition for the constraint, ie, everything that needs to + * appear after "ALTER TABLE ... ADD CONSTRAINT ". + */ +Datum +pg_get_constraintdef(PG_FUNCTION_ARGS) +{ + Oid constraintId = PG_GETARG_OID(0); + int prettyFlags; + char *res; + + prettyFlags = PRETTYFLAG_INDENT; + + res = pg_get_constraintdef_worker(constraintId, false, prettyFlags, true); + + if (res == NULL) + PG_RETURN_NULL(); + + PG_RETURN_TEXT_P(string_to_text(res)); +} + +Datum +pg_get_constraintdef_ext(PG_FUNCTION_ARGS) +{ + Oid constraintId = PG_GETARG_OID(0); + bool pretty = PG_GETARG_BOOL(1); + int prettyFlags; + char *res; + + prettyFlags = pretty ? PRETTYFLAG_PAREN | PRETTYFLAG_INDENT : PRETTYFLAG_INDENT; + + res = pg_get_constraintdef_worker(constraintId, false, prettyFlags, true); + + if (res == NULL) + PG_RETURN_NULL(); + + PG_RETURN_TEXT_P(string_to_text(res)); +} + +/* + * Internal version that returns a full ALTER TABLE ... ADD CONSTRAINT command + */ +char * +pg_get_constraintdef_command(Oid constraintId) +{ + return pg_get_constraintdef_worker(constraintId, true, 0, false); +} + +/* + * As of 9.4, we now use an MVCC snapshot for this. + */ +static char * +pg_get_constraintdef_worker(Oid constraintId, bool fullCommand, + int prettyFlags, bool missing_ok) +{// #lizard forgives + HeapTuple tup; + Form_pg_constraint conForm; + StringInfoData buf; + SysScanDesc scandesc; + ScanKeyData scankey[1]; + Snapshot snapshot = RegisterSnapshot(GetTransactionSnapshot()); + Relation relation = heap_open(ConstraintRelationId, AccessShareLock); + + ScanKeyInit(&scankey[0], + ObjectIdAttributeNumber, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(constraintId)); + + scandesc = systable_beginscan(relation, + ConstraintOidIndexId, + true, + snapshot, + 1, + scankey); + + /* + * We later use the tuple with SysCacheGetAttr() as if we had obtained it + * via SearchSysCache, which works fine. 
+ */ + tup = systable_getnext(scandesc); + + UnregisterSnapshot(snapshot); + + if (!HeapTupleIsValid(tup)) + { + if (missing_ok) + { + systable_endscan(scandesc); + heap_close(relation, AccessShareLock); + return NULL; + } + elog(ERROR, "could not find tuple for constraint %u", constraintId); + } + + conForm = (Form_pg_constraint) GETSTRUCT(tup); + + initStringInfo(&buf); + + if (fullCommand) + { + /* + * Currently, callers want ALTER TABLE (without ONLY) for CHECK + * constraints, and other types of constraints don't inherit anyway so + * it doesn't matter whether we say ONLY or not. Someday we might + * need to let callers specify whether to put ONLY in the command. + */ + appendStringInfo(&buf, "ALTER TABLE %s ADD CONSTRAINT %s ", + generate_qualified_relation_name(conForm->conrelid), + quote_identifier(NameStr(conForm->conname))); + } + + switch (conForm->contype) + { + case CONSTRAINT_FOREIGN: + { + Datum val; + bool isnull; + const char *string; + + /* Start off the constraint definition */ + appendStringInfoString(&buf, "FOREIGN KEY ("); + + /* Fetch and build referencing-column list */ + val = SysCacheGetAttr(CONSTROID, tup, + Anum_pg_constraint_conkey, &isnull); + if (isnull) + elog(ERROR, "null conkey for constraint %u", + constraintId); + + decompile_column_index_array(val, conForm->conrelid, &buf); + + /* add foreign relation name */ + appendStringInfo(&buf, ") REFERENCES %s(", + generate_relation_name(conForm->confrelid, + NIL)); + + /* Fetch and build referenced-column list */ + val = SysCacheGetAttr(CONSTROID, tup, + Anum_pg_constraint_confkey, &isnull); + if (isnull) + elog(ERROR, "null confkey for constraint %u", + constraintId); + + decompile_column_index_array(val, conForm->confrelid, &buf); + + appendStringInfoChar(&buf, ')'); + + /* Add match type */ + switch (conForm->confmatchtype) + { + case FKCONSTR_MATCH_FULL: + string = " MATCH FULL"; + break; + case FKCONSTR_MATCH_PARTIAL: + string = " MATCH PARTIAL"; + break; + case FKCONSTR_MATCH_SIMPLE: + string = ""; + break; + default: + elog(ERROR, "unrecognized confmatchtype: %d", + conForm->confmatchtype); + string = ""; /* keep compiler quiet */ + break; + } + appendStringInfoString(&buf, string); + + /* Add ON UPDATE and ON DELETE clauses, if needed */ + switch (conForm->confupdtype) + { + case FKCONSTR_ACTION_NOACTION: + string = NULL; /* suppress default */ + break; + case FKCONSTR_ACTION_RESTRICT: + string = "RESTRICT"; + break; + case FKCONSTR_ACTION_CASCADE: + string = "CASCADE"; + break; + case FKCONSTR_ACTION_SETNULL: + string = "SET NULL"; + break; + case FKCONSTR_ACTION_SETDEFAULT: + string = "SET DEFAULT"; + break; + default: + elog(ERROR, "unrecognized confupdtype: %d", + conForm->confupdtype); + string = NULL; /* keep compiler quiet */ + break; + } + if (string) + appendStringInfo(&buf, " ON UPDATE %s", string); + + switch (conForm->confdeltype) + { + case FKCONSTR_ACTION_NOACTION: + string = NULL; /* suppress default */ + break; + case FKCONSTR_ACTION_RESTRICT: + string = "RESTRICT"; + break; + case FKCONSTR_ACTION_CASCADE: + string = "CASCADE"; + break; + case FKCONSTR_ACTION_SETNULL: + string = "SET NULL"; + break; + case FKCONSTR_ACTION_SETDEFAULT: + string = "SET DEFAULT"; + break; + default: + elog(ERROR, "unrecognized confdeltype: %d", + conForm->confdeltype); + string = NULL; /* keep compiler quiet */ + break; + } + if (string) + appendStringInfo(&buf, " ON DELETE %s", string); + + break; + } + case CONSTRAINT_PRIMARY: + case CONSTRAINT_UNIQUE: + { + Datum val; + bool isnull; + Oid indexId; + + /* 
Start off the constraint definition */ + if (conForm->contype == CONSTRAINT_PRIMARY) + appendStringInfoString(&buf, "PRIMARY KEY ("); + else + appendStringInfoString(&buf, "UNIQUE ("); + + /* Fetch and build target column list */ + val = SysCacheGetAttr(CONSTROID, tup, + Anum_pg_constraint_conkey, &isnull); + if (isnull) + elog(ERROR, "null conkey for constraint %u", + constraintId); + + decompile_column_index_array(val, conForm->conrelid, &buf); + + appendStringInfoChar(&buf, ')'); + + indexId = get_constraint_index(constraintId); + + /* XXX why do we only print these bits if fullCommand? */ + if (fullCommand && OidIsValid(indexId)) + { + char *options = flatten_reloptions(indexId); + Oid tblspc; + + if (options) + { + appendStringInfo(&buf, " WITH (%s)", options); + pfree(options); + } + + tblspc = get_rel_tablespace(indexId); + if (OidIsValid(tblspc)) + appendStringInfo(&buf, " USING INDEX TABLESPACE %s", + quote_identifier(get_tablespace_name(tblspc))); + } + + break; + } + case CONSTRAINT_CHECK: + { + Datum val; + bool isnull; + char *conbin; + char *consrc; + Node *expr; + List *context; + + /* Fetch constraint expression in parsetree form */ + val = SysCacheGetAttr(CONSTROID, tup, + Anum_pg_constraint_conbin, &isnull); + if (isnull) + elog(ERROR, "null conbin for constraint %u", + constraintId); + + conbin = TextDatumGetCString(val); + expr = stringToNode(conbin); + + /* Set up deparsing context for Var nodes in constraint */ + if (conForm->conrelid != InvalidOid) + { + /* relation constraint */ + context = deparse_context_for(get_relation_name(conForm->conrelid), + conForm->conrelid); + } + else + { + /* domain constraint --- can't have Vars */ + context = NIL; + } + + consrc = deparse_expression_pretty(expr, context, false, false, + prettyFlags, 0); + + /* + * Now emit the constraint definition, adding NO INHERIT if + * necessary. + * + * There are cases where the constraint expression will be + * fully parenthesized and we don't need the outer parens ... + * but there are other cases where we do need 'em. Be + * conservative for now. + * + * Note that simply checking for leading '(' and trailing ')' + * would NOT be good enough, consider "(x > 0) AND (y > 0)". + */ + appendStringInfo(&buf, "CHECK (%s)%s", + consrc, + conForm->connoinherit ? " NO INHERIT" : ""); + break; + } + case CONSTRAINT_TRIGGER: + + /* + * There isn't an ALTER TABLE syntax for creating a user-defined + * constraint trigger, but it seems better to print something than + * throw an error; if we throw error then this function couldn't + * safely be applied to all rows of pg_constraint. 
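The ON UPDATE / ON DELETE handling earlier in pg_get_constraintdef_worker maps each single-character action code to a clause and suppresses the clause entirely for NO ACTION, since that is the default. The same mapping as a standalone helper; the action letters are defined locally as stand-ins for the FKCONSTR_ACTION_* codes:

#include <stdio.h>

/* local stand-ins for the action codes stored in pg_constraint */
#define ACTION_NOACTION   'a'
#define ACTION_RESTRICT   'r'
#define ACTION_CASCADE    'c'
#define ACTION_SETNULL    'n'
#define ACTION_SETDEFAULT 'd'

/* returns NULL for NO ACTION so the caller can omit the clause (the default) */
static const char *
fk_action_clause(char code)
{
    switch (code)
    {
        case ACTION_NOACTION:
            return NULL;
        case ACTION_RESTRICT:
            return "RESTRICT";
        case ACTION_CASCADE:
            return "CASCADE";
        case ACTION_SETNULL:
            return "SET NULL";
        case ACTION_SETDEFAULT:
            return "SET DEFAULT";
        default:
            return NULL;
    }
}

int
main(void)
{
    const char *upd = fk_action_clause(ACTION_CASCADE);
    const char *del = fk_action_clause(ACTION_NOACTION);

    printf("FOREIGN KEY (f) REFERENCES t(k)");
    if (upd)
        printf(" ON UPDATE %s", upd);
    if (del)
        printf(" ON DELETE %s", del);
    printf("\n");       /* ON DELETE clause suppressed for NO ACTION */
    return 0;
}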
+ */ + appendStringInfoString(&buf, "TRIGGER"); + break; + case CONSTRAINT_EXCLUSION: + { + Oid indexOid = conForm->conindid; + Datum val; + bool isnull; + Datum *elems; + int nElems; + int i; + Oid *operators; + + /* Extract operator OIDs from the pg_constraint tuple */ + val = SysCacheGetAttr(CONSTROID, tup, + Anum_pg_constraint_conexclop, + &isnull); + if (isnull) + elog(ERROR, "null conexclop for constraint %u", + constraintId); + + deconstruct_array(DatumGetArrayTypeP(val), + OIDOID, sizeof(Oid), true, 'i', + &elems, NULL, &nElems); + + operators = (Oid *) palloc(nElems * sizeof(Oid)); + for (i = 0; i < nElems; i++) + operators[i] = DatumGetObjectId(elems[i]); + + /* pg_get_indexdef_worker does the rest */ + /* suppress tablespace because pg_dump wants it that way */ + appendStringInfoString(&buf, + pg_get_indexdef_worker(indexOid, + 0, + operators, + false, + false, + prettyFlags, + false)); + break; + } + default: + elog(ERROR, "invalid constraint type \"%c\"", conForm->contype); + break; + } + + if (conForm->condeferrable) + appendStringInfoString(&buf, " DEFERRABLE"); + if (conForm->condeferred) + appendStringInfoString(&buf, " INITIALLY DEFERRED"); + if (!conForm->convalidated) + appendStringInfoString(&buf, " NOT VALID"); + + /* Cleanup */ + systable_endscan(scandesc); + heap_close(relation, AccessShareLock); + + return buf.data; +} + + +/* + * Convert an int16[] Datum into a comma-separated list of column names + * for the indicated relation; append the list to buf. + */ +static void +decompile_column_index_array(Datum column_index_array, Oid relId, + StringInfo buf) +{ + Datum *keys; + int nKeys; + int j; + + /* Extract data from array of int16 */ + deconstruct_array(DatumGetArrayTypeP(column_index_array), + INT2OID, 2, true, 's', + &keys, NULL, &nKeys); + + for (j = 0; j < nKeys; j++) + { + char *colName; + + colName = get_relid_attribute_name(relId, DatumGetInt16(keys[j])); + + if (j == 0) + appendStringInfoString(buf, quote_identifier(colName)); + else + appendStringInfo(buf, ", %s", quote_identifier(colName)); + } +} + + +/* ---------- + * get_expr - Decompile an expression tree + * + * Input: an expression tree in nodeToString form, and a relation OID + * + * Output: reverse-listed expression + * + * Currently, the expression can only refer to a single relation, namely + * the one specified by the second parameter. This is sufficient for + * partial indexes, column default expressions, etc. We also support + * Var-free expressions, for which the OID can be InvalidOid. + * ---------- + */ +Datum +pg_get_expr(PG_FUNCTION_ARGS) +{ + text *expr = PG_GETARG_TEXT_PP(0); + Oid relid = PG_GETARG_OID(1); + int prettyFlags; + char *relname; + + prettyFlags = PRETTYFLAG_INDENT; + + if (OidIsValid(relid)) + { + /* Get the name for the relation */ + relname = get_rel_name(relid); + + /* + * If the OID isn't actually valid, don't throw an error, just return + * NULL. This is a bit questionable, but it's what we've done + * historically, and it can help avoid unwanted failures when + * examining catalog entries for just-deleted relations. + */ + if (relname == NULL) + PG_RETURN_NULL(); + } + else + relname = NULL; + + PG_RETURN_TEXT_P(pg_get_expr_worker(expr, relid, relname, prettyFlags)); +} + +Datum +pg_get_expr_ext(PG_FUNCTION_ARGS) +{ + text *expr = PG_GETARG_TEXT_PP(0); + Oid relid = PG_GETARG_OID(1); + bool pretty = PG_GETARG_BOOL(2); + int prettyFlags; + char *relname; + + prettyFlags = pretty ? 
PRETTYFLAG_PAREN | PRETTYFLAG_INDENT : PRETTYFLAG_INDENT; + + if (OidIsValid(relid)) + { + /* Get the name for the relation */ + relname = get_rel_name(relid); + /* See notes above */ + if (relname == NULL) + PG_RETURN_NULL(); + } + else + relname = NULL; + + PG_RETURN_TEXT_P(pg_get_expr_worker(expr, relid, relname, prettyFlags)); +} + +static text * +pg_get_expr_worker(text *expr, Oid relid, const char *relname, int prettyFlags) +{ + Node *node; + List *context; + char *exprstr; + char *str; + + /* Convert input TEXT object to C string */ + exprstr = text_to_cstring(expr); + + /* Convert expression to node tree */ + node = (Node *) stringToNode(exprstr); + + pfree(exprstr); + + /* Prepare deparse context if needed */ + if (OidIsValid(relid)) + context = deparse_context_for(relname, relid); + else + context = NIL; + + /* Deparse */ + str = deparse_expression_pretty(node, context, false, false, + prettyFlags, 0); + + return string_to_text(str); +} + + +/* ---------- + * get_userbyid - Get a user name by roleid and + * fallback to 'unknown (OID=n)' + * ---------- + */ +Datum +pg_get_userbyid(PG_FUNCTION_ARGS) +{ + Oid roleid = PG_GETARG_OID(0); + Name result; + HeapTuple roletup; + Form_pg_authid role_rec; + + /* + * Allocate space for the result + */ + result = (Name) palloc(NAMEDATALEN); + memset(NameStr(*result), 0, NAMEDATALEN); + + /* + * Get the pg_authid entry and print the result + */ + roletup = SearchSysCache1(AUTHOID, ObjectIdGetDatum(roleid)); + if (HeapTupleIsValid(roletup)) + { + role_rec = (Form_pg_authid) GETSTRUCT(roletup); + StrNCpy(NameStr(*result), NameStr(role_rec->rolname), NAMEDATALEN); + ReleaseSysCache(roletup); + } + else + sprintf(NameStr(*result), "unknown (OID=%u)", roleid); + + PG_RETURN_NAME(result); +} + + +/* + * pg_get_serial_sequence + * Get the name of the sequence used by a serial column, + * formatted suitably for passing to setval, nextval or currval. + * First parameter is not treated as double-quoted, second parameter + * is --- see documentation for reason. + */ +Datum +pg_get_serial_sequence(PG_FUNCTION_ARGS) +{// #lizard forgives + text *tablename = PG_GETARG_TEXT_PP(0); + text *columnname = PG_GETARG_TEXT_PP(1); + RangeVar *tablerv; + Oid tableOid; + char *column; + AttrNumber attnum; + Oid sequenceId = InvalidOid; + Relation depRel; + ScanKeyData key[3]; + SysScanDesc scan; + HeapTuple tup; + + /* Look up table name. Can't lock it - we might not have privileges. 
*/ + tablerv = makeRangeVarFromNameList(textToQualifiedNameList(tablename)); + tableOid = RangeVarGetRelid(tablerv, NoLock, false); + + /* Get the number of the column */ + column = text_to_cstring(columnname); + + attnum = get_attnum(tableOid, column); + if (attnum == InvalidAttrNumber) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_COLUMN), + errmsg("column \"%s\" of relation \"%s\" does not exist", + column, tablerv->relname))); + + /* Search the dependency table for the dependent sequence */ + depRel = heap_open(DependRelationId, AccessShareLock); + + ScanKeyInit(&key[0], + Anum_pg_depend_refclassid, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(RelationRelationId)); + ScanKeyInit(&key[1], + Anum_pg_depend_refobjid, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(tableOid)); + ScanKeyInit(&key[2], + Anum_pg_depend_refobjsubid, + BTEqualStrategyNumber, F_INT4EQ, + Int32GetDatum(attnum)); + + scan = systable_beginscan(depRel, DependReferenceIndexId, true, + NULL, 3, key); + + while (HeapTupleIsValid(tup = systable_getnext(scan))) + { + Form_pg_depend deprec = (Form_pg_depend) GETSTRUCT(tup); + + /* + * We assume any auto dependency of a sequence on a column must be + * what we are looking for. (We need the relkind test because indexes + * can also have auto dependencies on columns.) + */ + if (deprec->classid == RelationRelationId && + deprec->objsubid == 0 && + deprec->deptype == DEPENDENCY_AUTO && + get_rel_relkind(deprec->objid) == RELKIND_SEQUENCE) + { + sequenceId = deprec->objid; + break; + } + } + + systable_endscan(scan); + heap_close(depRel, AccessShareLock); + + if (OidIsValid(sequenceId)) + { + char *result; + + result = generate_qualified_relation_name(sequenceId); + + PG_RETURN_TEXT_P(string_to_text(result)); + } + + PG_RETURN_NULL(); +} + + +/* + * pg_get_functiondef + * Returns the complete "CREATE OR REPLACE FUNCTION ..." statement for + * the specified function. + * + * Note: if you change the output format of this function, be careful not + * to break psql's rules (in \ef and \sf) for identifying the start of the + * function body. To wit: the function body starts on a line that begins + * with "AS ", and no preceding line will look like that. + */ +Datum +pg_get_functiondef(PG_FUNCTION_ARGS) +{// #lizard forgives + Oid funcid = PG_GETARG_OID(0); + StringInfoData buf; + StringInfoData dq; + HeapTuple proctup; + Form_pg_proc proc; + Datum tmp; + bool isnull; + const char *prosrc; + const char *name; + const char *nsp; + float4 procost; + int oldlen; + + initStringInfo(&buf); + + /* Look up the function */ + proctup = SearchSysCache1(PROCOID, ObjectIdGetDatum(funcid)); + if (!HeapTupleIsValid(proctup)) + PG_RETURN_NULL(); + + proc = (Form_pg_proc) GETSTRUCT(proctup); + name = NameStr(proc->proname); + + if (proc->proisagg) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("\"%s\" is an aggregate function", name))); + + /* + * We always qualify the function name, to ensure the right function gets + * replaced. 
+ */ + nsp = get_namespace_name(proc->pronamespace); + appendStringInfo(&buf, "CREATE OR REPLACE FUNCTION %s(", + quote_qualified_identifier(nsp, name)); + (void) print_function_arguments(&buf, proctup, false, true); + appendStringInfoString(&buf, ")\n RETURNS "); + print_function_rettype(&buf, proctup); + + print_function_trftypes(&buf, proctup); + + appendStringInfo(&buf, "\n LANGUAGE %s\n", + quote_identifier(get_language_name(proc->prolang, false))); + + /* Emit some miscellaneous options on one line */ + oldlen = buf.len; + + if (proc->proiswindow) + appendStringInfoString(&buf, " WINDOW"); + switch (proc->provolatile) + { + case PROVOLATILE_IMMUTABLE: + appendStringInfoString(&buf, " IMMUTABLE"); + break; + case PROVOLATILE_STABLE: + appendStringInfoString(&buf, " STABLE"); + break; + case PROVOLATILE_VOLATILE: + break; + } + + switch (proc->proparallel) + { + case PROPARALLEL_SAFE: + appendStringInfoString(&buf, " PARALLEL SAFE"); + break; + case PROPARALLEL_RESTRICTED: + appendStringInfoString(&buf, " PARALLEL RESTRICTED"); + break; + case PROPARALLEL_UNSAFE: + break; + } + + if (proc->proisstrict) + appendStringInfoString(&buf, " STRICT"); + if (proc->prosecdef) + appendStringInfoString(&buf, " SECURITY DEFINER"); + if (proc->proleakproof) + appendStringInfoString(&buf, " LEAKPROOF"); + + /* This code for the default cost and rows should match functioncmds.c */ + if (proc->prolang == INTERNALlanguageId || + proc->prolang == ClanguageId) + procost = 1; + else + procost = 100; + if (proc->procost != procost) + appendStringInfo(&buf, " COST %g", proc->procost); + + if (proc->prorows > 0 && proc->prorows != 1000) + appendStringInfo(&buf, " ROWS %g", proc->prorows); + + if (oldlen != buf.len) + appendStringInfoChar(&buf, '\n'); + + /* Emit any proconfig options, one per line */ + tmp = SysCacheGetAttr(PROCOID, proctup, Anum_pg_proc_proconfig, &isnull); + if (!isnull) + { + ArrayType *a = DatumGetArrayTypeP(tmp); + int i; + + Assert(ARR_ELEMTYPE(a) == TEXTOID); + Assert(ARR_NDIM(a) == 1); + Assert(ARR_LBOUND(a)[0] == 1); + + for (i = 1; i <= ARR_DIMS(a)[0]; i++) + { + Datum d; + + d = array_ref(a, 1, &i, + -1 /* varlenarray */ , + -1 /* TEXT's typlen */ , + false /* TEXT's typbyval */ , + 'i' /* TEXT's typalign */ , + &isnull); + if (!isnull) + { + char *configitem = TextDatumGetCString(d); + char *pos; + + pos = strchr(configitem, '='); + if (pos == NULL) + continue; + *pos++ = '\0'; + + appendStringInfo(&buf, " SET %s TO ", + quote_identifier(configitem)); + + /* + * Some GUC variable names are 'LIST' type and hence must not + * be quoted. + */ + if (pg_strcasecmp(configitem, "DateStyle") == 0 + || pg_strcasecmp(configitem, "search_path") == 0) + appendStringInfoString(&buf, pos); + else + simple_quote_literal(&buf, pos); + appendStringInfoChar(&buf, '\n'); + } + } + } + + /* And finally the function definition ... */ + appendStringInfoString(&buf, "AS "); + + tmp = SysCacheGetAttr(PROCOID, proctup, Anum_pg_proc_probin, &isnull); + if (!isnull) + { + simple_quote_literal(&buf, TextDatumGetCString(tmp)); + appendStringInfoString(&buf, ", "); /* assume prosrc isn't null */ + } + + tmp = SysCacheGetAttr(PROCOID, proctup, Anum_pg_proc_prosrc, &isnull); + if (isnull) + elog(ERROR, "null prosrc"); + prosrc = TextDatumGetCString(tmp); + + /* + * We always use dollar quoting. Figure out a suitable delimiter. + * + * Since the user is likely to be editing the function body string, we + * shouldn't use a short delimiter that he might easily create a conflict + * with. 
Hence prefer "$function$", but extend if needed. + */ + initStringInfo(&dq); + appendStringInfoString(&dq, "$function"); + while (strstr(prosrc, dq.data) != NULL) + appendStringInfoChar(&dq, 'x'); + appendStringInfoChar(&dq, '$'); + + appendStringInfoString(&buf, dq.data); + appendStringInfoString(&buf, prosrc); + appendStringInfoString(&buf, dq.data); + + appendStringInfoChar(&buf, '\n'); + + ReleaseSysCache(proctup); + + PG_RETURN_TEXT_P(string_to_text(buf.data)); +} + +/* + * pg_get_function_arguments + * Get a nicely-formatted list of arguments for a function. + * This is everything that would go between the parentheses in + * CREATE FUNCTION. + */ +Datum +pg_get_function_arguments(PG_FUNCTION_ARGS) +{ + Oid funcid = PG_GETARG_OID(0); + StringInfoData buf; + HeapTuple proctup; + + proctup = SearchSysCache1(PROCOID, ObjectIdGetDatum(funcid)); + if (!HeapTupleIsValid(proctup)) + PG_RETURN_NULL(); + + initStringInfo(&buf); + + (void) print_function_arguments(&buf, proctup, false, true); + + ReleaseSysCache(proctup); + + PG_RETURN_TEXT_P(string_to_text(buf.data)); +} + +/* + * pg_get_function_identity_arguments + * Get a formatted list of arguments for a function. + * This is everything that would go between the parentheses in + * ALTER FUNCTION, etc. In particular, don't print defaults. + */ +Datum +pg_get_function_identity_arguments(PG_FUNCTION_ARGS) +{ + Oid funcid = PG_GETARG_OID(0); + StringInfoData buf; + HeapTuple proctup; + + proctup = SearchSysCache1(PROCOID, ObjectIdGetDatum(funcid)); + if (!HeapTupleIsValid(proctup)) + PG_RETURN_NULL(); + + initStringInfo(&buf); + + (void) print_function_arguments(&buf, proctup, false, false); + + ReleaseSysCache(proctup); + + PG_RETURN_TEXT_P(string_to_text(buf.data)); +} + +/* + * pg_get_function_result + * Get a nicely-formatted version of the result type of a function. + * This is what would appear after RETURNS in CREATE FUNCTION. + */ +Datum +pg_get_function_result(PG_FUNCTION_ARGS) +{ + Oid funcid = PG_GETARG_OID(0); + StringInfoData buf; + HeapTuple proctup; + + proctup = SearchSysCache1(PROCOID, ObjectIdGetDatum(funcid)); + if (!HeapTupleIsValid(proctup)) + PG_RETURN_NULL(); + + initStringInfo(&buf); + + print_function_rettype(&buf, proctup); + + ReleaseSysCache(proctup); + + PG_RETURN_TEXT_P(string_to_text(buf.data)); +} + +/* + * Guts of pg_get_function_result: append the function's return type + * to the specified buffer. + */ +static void +print_function_rettype(StringInfo buf, HeapTuple proctup) +{ + Form_pg_proc proc = (Form_pg_proc) GETSTRUCT(proctup); + int ntabargs = 0; + StringInfoData rbuf; + + initStringInfo(&rbuf); + + if (proc->proretset) + { + /* It might be a table function; try to print the arguments */ + appendStringInfoString(&rbuf, "TABLE("); + ntabargs = print_function_arguments(&rbuf, proctup, true, false); + if (ntabargs > 0) + appendStringInfoChar(&rbuf, ')'); + else + resetStringInfo(&rbuf); + } + + if (ntabargs == 0) + { + /* Not a table function, so do the normal thing */ + if (proc->proretset) + appendStringInfoString(&rbuf, "SETOF "); + appendStringInfoString(&rbuf, format_type_be(proc->prorettype)); + } + + appendStringInfoString(buf, rbuf.data); +} + +/* + * Common code for pg_get_function_arguments and pg_get_function_result: + * append the desired subset of arguments to buf. We print only TABLE + * arguments when print_table_args is true, and all the others when it's false. + * We print argument defaults only if print_defaults is true. + * Function return value is the number of arguments printed. 
+ */ +static int +print_function_arguments(StringInfo buf, HeapTuple proctup, + bool print_table_args, bool print_defaults) +{// #lizard forgives + Form_pg_proc proc = (Form_pg_proc) GETSTRUCT(proctup); + int numargs; + Oid *argtypes; + char **argnames; + char *argmodes; + int insertorderbyat = -1; + int argsprinted; + int inputargno; + int nlackdefaults; + ListCell *nextargdefault = NULL; + int i; + + numargs = get_func_arg_info(proctup, + &argtypes, &argnames, &argmodes); + + nlackdefaults = numargs; + if (print_defaults && proc->pronargdefaults > 0) + { + Datum proargdefaults; + bool isnull; + + proargdefaults = SysCacheGetAttr(PROCOID, proctup, + Anum_pg_proc_proargdefaults, + &isnull); + if (!isnull) + { + char *str; + List *argdefaults; + + str = TextDatumGetCString(proargdefaults); + argdefaults = castNode(List, stringToNode(str)); + pfree(str); + nextargdefault = list_head(argdefaults); + /* nlackdefaults counts only *input* arguments lacking defaults */ + nlackdefaults = proc->pronargs - list_length(argdefaults); + } + } + + /* Check for special treatment of ordered-set aggregates */ + if (proc->proisagg) + { + HeapTuple aggtup; + Form_pg_aggregate agg; + + aggtup = SearchSysCache1(AGGFNOID, + ObjectIdGetDatum(HeapTupleGetOid(proctup))); + if (!HeapTupleIsValid(aggtup)) + elog(ERROR, "cache lookup failed for aggregate %u", + HeapTupleGetOid(proctup)); + agg = (Form_pg_aggregate) GETSTRUCT(aggtup); + if (AGGKIND_IS_ORDERED_SET(agg->aggkind)) + insertorderbyat = agg->aggnumdirectargs; + ReleaseSysCache(aggtup); + } + + argsprinted = 0; + inputargno = 0; + for (i = 0; i < numargs; i++) + { + Oid argtype = argtypes[i]; + char *argname = argnames ? argnames[i] : NULL; + char argmode = argmodes ? argmodes[i] : PROARGMODE_IN; + const char *modename; + bool isinput; + + switch (argmode) + { + case PROARGMODE_IN: + modename = ""; + isinput = true; + break; + case PROARGMODE_INOUT: + modename = "INOUT "; + isinput = true; + break; + case PROARGMODE_OUT: + modename = "OUT "; + isinput = false; + break; + case PROARGMODE_VARIADIC: + modename = "VARIADIC "; + isinput = true; + break; + case PROARGMODE_TABLE: + modename = ""; + isinput = false; + break; + default: + elog(ERROR, "invalid parameter mode '%c'", argmode); + modename = NULL; /* keep compiler quiet */ + isinput = false; + break; + } + if (isinput) + inputargno++; /* this is a 1-based counter */ + + if (print_table_args != (argmode == PROARGMODE_TABLE)) + continue; + + if (argsprinted == insertorderbyat) + { + if (argsprinted) + appendStringInfoChar(buf, ' '); + appendStringInfoString(buf, "ORDER BY "); + } + else if (argsprinted) + appendStringInfoString(buf, ", "); + + appendStringInfoString(buf, modename); + if (argname && argname[0]) + appendStringInfo(buf, "%s ", quote_identifier(argname)); + appendStringInfoString(buf, format_type_be(argtype)); + if (print_defaults && isinput && inputargno > nlackdefaults) + { + Node *expr; + + Assert(nextargdefault != NULL); + expr = (Node *) lfirst(nextargdefault); + nextargdefault = lnext(nextargdefault); + + appendStringInfo(buf, " DEFAULT %s", + deparse_expression(expr, NIL, false, false)); + } + argsprinted++; + + /* nasty hack: print the last arg twice for variadic ordered-set agg */ + if (argsprinted == insertorderbyat && i == numargs - 1) + { + i--; + /* aggs shouldn't have defaults anyway, but just to be sure ... 
*/ + print_defaults = false; + } + } + + return argsprinted; +} + +static bool +is_input_argument(int nth, const char *argmodes) +{ + return (!argmodes + || argmodes[nth] == PROARGMODE_IN + || argmodes[nth] == PROARGMODE_INOUT + || argmodes[nth] == PROARGMODE_VARIADIC); +} + +/* + * Append used transformed types to specified buffer + */ +static void +print_function_trftypes(StringInfo buf, HeapTuple proctup) +{ + Oid *trftypes; + int ntypes; + + ntypes = get_func_trftypes(proctup, &trftypes); + if (ntypes > 0) + { + int i; + + appendStringInfoString(buf, "\n TRANSFORM "); + for (i = 0; i < ntypes; i++) + { + if (i != 0) + appendStringInfoString(buf, ", "); + appendStringInfo(buf, "FOR TYPE %s", format_type_be(trftypes[i])); + } + } +} + +/* + * Get textual representation of a function argument's default value. The + * second argument of this function is the argument number among all arguments + * (i.e. proallargtypes, *not* proargtypes), starting with 1, because that's + * how information_schema.sql uses it. + */ +Datum +pg_get_function_arg_default(PG_FUNCTION_ARGS) +{// #lizard forgives + Oid funcid = PG_GETARG_OID(0); + int32 nth_arg = PG_GETARG_INT32(1); + HeapTuple proctup; + Form_pg_proc proc; + int numargs; + Oid *argtypes; + char **argnames; + char *argmodes; + int i; + List *argdefaults; + Node *node; + char *str; + int nth_inputarg; + Datum proargdefaults; + bool isnull; + int nth_default; + + proctup = SearchSysCache1(PROCOID, ObjectIdGetDatum(funcid)); + if (!HeapTupleIsValid(proctup)) + PG_RETURN_NULL(); + + numargs = get_func_arg_info(proctup, &argtypes, &argnames, &argmodes); + if (nth_arg < 1 || nth_arg > numargs || !is_input_argument(nth_arg - 1, argmodes)) + { + ReleaseSysCache(proctup); + PG_RETURN_NULL(); + } + + nth_inputarg = 0; + for (i = 0; i < nth_arg; i++) + if (is_input_argument(i, argmodes)) + nth_inputarg++; + + proargdefaults = SysCacheGetAttr(PROCOID, proctup, + Anum_pg_proc_proargdefaults, + &isnull); + if (isnull) + { + ReleaseSysCache(proctup); + PG_RETURN_NULL(); + } + + str = TextDatumGetCString(proargdefaults); + argdefaults = castNode(List, stringToNode(str)); + pfree(str); + + proc = (Form_pg_proc) GETSTRUCT(proctup); + + /* + * Calculate index into proargdefaults: proargdefaults corresponds to the + * last N input arguments, where N = pronargdefaults. + */ + nth_default = nth_inputarg - 1 - (proc->pronargs - proc->pronargdefaults); + + if (nth_default < 0 || nth_default >= list_length(argdefaults)) + { + ReleaseSysCache(proctup); + PG_RETURN_NULL(); + } + node = list_nth(argdefaults, nth_default); + str = deparse_expression(node, NIL, false, false); + + ReleaseSysCache(proctup); + + PG_RETURN_TEXT_P(string_to_text(str)); +} + + +/* + * deparse_expression - General utility for deparsing expressions + * + * calls deparse_expression_pretty with all prettyPrinting disabled + */ +char * +deparse_expression(Node *expr, List *dpcontext, + bool forceprefix, bool showimplicit) +{ + return deparse_expression_pretty(expr, dpcontext, forceprefix, + showimplicit, 0, 0); +} + +/* ---------- + * deparse_expression_pretty - General utility for deparsing expressions + * + * expr is the node tree to be deparsed. It must be a transformed expression + * tree (ie, not the raw output of gram.y). + * + * dpcontext is a list of deparse_namespace nodes representing the context + * for interpreting Vars in the node tree. It can be NIL if no Vars are + * expected. + * + * forceprefix is TRUE to force all Vars to be prefixed with their table names. 
+ * + * showimplicit is TRUE to force all implicit casts to be shown explicitly. + * + * Tries to pretty up the output according to prettyFlags and startIndent. + * + * The result is a palloc'd string. + * ---------- + */ +static char * +deparse_expression_pretty(Node *expr, List *dpcontext, + bool forceprefix, bool showimplicit, + int prettyFlags, int startIndent) +{ + StringInfoData buf; + deparse_context context; + + initStringInfo(&buf); + context.buf = &buf; + context.namespaces = dpcontext; + context.windowClause = NIL; + context.windowTList = NIL; + context.varprefix = forceprefix; + context.prettyFlags = prettyFlags; + context.wrapColumn = WRAP_COLUMN_DEFAULT; + context.indentLevel = startIndent; + context.special_exprkind = EXPR_KIND_NONE; + + get_rule_expr(expr, &context, showimplicit); + + return buf.data; +} + +/* ---------- + * deparse_context_for - Build deparse context for a single relation + * + * Given the reference name (alias) and OID of a relation, build deparsing + * context for an expression referencing only that relation (as varno 1, + * varlevelsup 0). This is sufficient for many uses of deparse_expression. + * ---------- + */ +List * +deparse_context_for(const char *aliasname, Oid relid) +{ + deparse_namespace *dpns; + RangeTblEntry *rte; + + dpns = (deparse_namespace *) palloc0(sizeof(deparse_namespace)); + + /* Build a minimal RTE for the rel */ + rte = makeNode(RangeTblEntry); + rte->rtekind = RTE_RELATION; + rte->relid = relid; + rte->relkind = RELKIND_RELATION; /* no need for exactness here */ + rte->alias = makeAlias(aliasname, NIL); + rte->eref = rte->alias; + rte->lateral = false; + rte->inh = false; + rte->inFromCl = true; + + /* Build one-element rtable */ + dpns->rtable = list_make1(rte); + dpns->ctes = NIL; + set_rtable_names(dpns, NIL, NULL); + set_simple_column_names(dpns); + + /* Return a one-deep namespace stack */ + return list_make1(dpns); +} + +/* + * deparse_context_for_plan_rtable - Build deparse context for a plan's rtable + * + * When deparsing an expression in a Plan tree, we use the plan's rangetable + * to resolve names of simple Vars. The initialization of column names for + * this is rather expensive if the rangetable is large, and it'll be the same + * for every expression in the Plan tree; so we do it just once and re-use + * the result of this function for each expression. (Note that the result + * is not usable until set_deparse_context_planstate() is applied to it.) + * + * In addition to the plan's rangetable list, pass the per-RTE alias names + * assigned by a previous call to select_rtable_names_for_explain. + */ +List * +deparse_context_for_plan_rtable(List *rtable, List *rtable_names) +{ + deparse_namespace *dpns; + + dpns = (deparse_namespace *) palloc0(sizeof(deparse_namespace)); + + /* Initialize fields that stay the same across the whole plan tree */ + dpns->rtable = rtable; + dpns->rtable_names = rtable_names; + dpns->ctes = NIL; + + /* + * Set up column name aliases. We will get rather bogus results for join + * RTEs, but that doesn't matter because plan trees don't contain any join + * alias Vars. + */ + set_simple_column_names(dpns); + + /* Return a one-deep namespace stack */ + return list_make1(dpns); +} + +/* + * set_deparse_context_planstate - Specify Plan node containing expression + * + * When deparsing an expression in a Plan tree, we might have to resolve + * OUTER_VAR, INNER_VAR, or INDEX_VAR references. To do this, the caller must + * provide the parent PlanState node. 
Then OUTER_VAR and INNER_VAR references + * can be resolved by drilling down into the left and right child plans. + * Similarly, INDEX_VAR references can be resolved by reference to the + * indextlist given in a parent IndexOnlyScan node, or to the scan tlist in + * ForeignScan and CustomScan nodes. (Note that we don't currently support + * deparsing of indexquals in regular IndexScan or BitmapIndexScan nodes; + * for those, we can only deparse the indexqualorig fields, which won't + * contain INDEX_VAR Vars.) + * + * Note: planstate really ought to be declared as "PlanState *", but we use + * "Node *" to avoid having to include execnodes.h in ruleutils.h. + * + * The ancestors list is a list of the PlanState's parent PlanStates, the + * most-closely-nested first. This is needed to resolve PARAM_EXEC Params. + * Note we assume that all the PlanStates share the same rtable. + * + * Once this function has been called, deparse_expression() can be called on + * subsidiary expression(s) of the specified PlanState node. To deparse + * expressions of a different Plan node in the same Plan tree, re-call this + * function to identify the new parent Plan node. + * + * The result is the same List passed in; this is a notational convenience. + */ +List * +set_deparse_context_planstate(List *dpcontext, + Node *planstate, List *ancestors) +{ + deparse_namespace *dpns; + + + /* Should always have one-entry namespace list for Plan deparsing */ + Assert(list_length(dpcontext) == 1); + dpns = (deparse_namespace *) linitial(dpcontext); + + /* Set our attention on the specific plan node passed in */ + set_deparse_planstate(dpns, (PlanState *) planstate); + dpns->ancestors = ancestors; + + return dpcontext; +} + +/* + * select_rtable_names_for_explain - Select RTE aliases for EXPLAIN + * + * Determine the relation aliases we'll use during an EXPLAIN operation. + * This is just a frontend to set_rtable_names. We have to expose the aliases + * to EXPLAIN because EXPLAIN needs to know the right alias names to print. + */ +List * +select_rtable_names_for_explain(List *rtable, Bitmapset *rels_used) +{ + deparse_namespace dpns; + + memset(&dpns, 0, sizeof(dpns)); + dpns.rtable = rtable; + dpns.ctes = NIL; + set_rtable_names(&dpns, NIL, rels_used); + /* We needn't bother computing column aliases yet */ + + return dpns.rtable_names; +} + +#ifdef PGXC +/* + * This is a special case deparse context to be used at the planning time to + * generate query strings and expressions for remote shipping. + * + * XXX We should be careful while using this since the support is quite + * limited. The only supported use case at this point is for remote join + * reduction and some simple plan trees rooted by Agg node having a single + * RemoteQuery node as leftree. + */ +List * +deparse_context_for_plan(Node *plan, List *ancestors, + List *rtable) +{ + deparse_namespace *dpns; + + dpns = (deparse_namespace *) palloc0(sizeof(deparse_namespace)); + + /* Initialize fields that stay the same across the whole plan tree */ + dpns->rtable = rtable; + dpns->ctes = NIL; + + /* Set our attention on the specific plan node passed in */ + set_deparse_plan(dpns, (Plan *) plan); + dpns->ancestors = ancestors; + + /* Return a one-deep namespace stack */ + return list_make1(dpns); +} + +/* + * Set deparse context for Plan. Only those plan nodes which are immediate (or + * through simple nodes) parents of RemoteQuery nodes are supported right now. 
+ * + * This is a kind of work-around since the new deparse interface (since 9.1) + * expects a PlanState node. But planstates are instantiated only at execution + * time when InitPlan is called. But we are required to deparse the query + * during planning time, so we hand-cook these dummy PlanState nodes instead of + * init-ing the plan. Another approach could have been to delay the query + * generation to the execution time, but we are not yet sure if this can be + * safely done, especially for remote join reduction. + */ +static void +set_deparse_plan(deparse_namespace *dpns, Plan *plan) +{// #lizard forgives + + if (IsA(plan, NestLoop)) + { + NestLoop *nestloop = (NestLoop *) plan; + + dpns->planstate = (PlanState *) makeNode(NestLoopState); + dpns->planstate->plan = plan; + + dpns->outer_planstate = (PlanState *) makeNode(PlanState); + dpns->outer_planstate->plan = nestloop->join.plan.lefttree; + + dpns->inner_planstate = (PlanState *) makeNode(PlanState); + dpns->inner_planstate->plan = nestloop->join.plan.righttree; + } + else if (IsA(plan, RemoteQuery)) + { + dpns->planstate = (PlanState *) makeNode(PlanState); + dpns->planstate->plan = plan; + } + else if (IsA(plan, Agg) || IsA(plan, Group)) + { + /* + * We expect plan tree as Group/Agg->Sort->Result->Material->RemoteQuery, + * Result, Material nodes are optional. Sort is compulsory for Group but not + * for Agg. + * anything else is not handled right now. + */ + Plan *temp_plan = plan->lefttree; + Plan *remote_scan = NULL; + + if (temp_plan && IsA(temp_plan, Sort)) + temp_plan = temp_plan->lefttree; + if (temp_plan && IsA(temp_plan, Result)) + temp_plan = temp_plan->lefttree; + if (temp_plan && IsA(temp_plan, Material)) + temp_plan = temp_plan->lefttree; + if (temp_plan && IsA(temp_plan, RemoteQuery)) + remote_scan = temp_plan; + + if (!remote_scan) + elog(ERROR, "Deparse of this query at planning is not supported yet"); + + dpns->planstate = (PlanState *) makeNode(PlanState); + dpns->planstate->plan = plan; + } + else + elog(ERROR, "Deparse of this query at planning not supported yet"); +} + +#endif +/* + * set_rtable_names: select RTE aliases to be used in printing a query + * + * We fill in dpns->rtable_names with a list of names that is one-for-one with + * the already-filled dpns->rtable list. Each RTE name is unique among those + * in the new namespace plus any ancestor namespaces listed in + * parent_namespaces. + * + * If rels_used isn't NULL, only RTE indexes listed in it are given aliases. + * + * Note that this function is only concerned with relation names, not column + * names. + */ +static void +set_rtable_names(deparse_namespace *dpns, List *parent_namespaces, + Bitmapset *rels_used) +{// #lizard forgives + HASHCTL hash_ctl; + HTAB *names_hash; + NameHashEntry *hentry; + bool found; + int rtindex; + ListCell *lc; + + dpns->rtable_names = NIL; + /* nothing more to do if empty rtable */ + if (dpns->rtable == NIL) + return; + + /* + * We use a hash table to hold known names, so that this process is O(N) + * not O(N^2) for N names. 
+ */ + MemSet(&hash_ctl, 0, sizeof(hash_ctl)); + hash_ctl.keysize = NAMEDATALEN; + hash_ctl.entrysize = sizeof(NameHashEntry); + hash_ctl.hcxt = CurrentMemoryContext; + names_hash = hash_create("set_rtable_names names", + list_length(dpns->rtable), + &hash_ctl, + HASH_ELEM | HASH_CONTEXT); + /* Preload the hash table with names appearing in parent_namespaces */ + foreach(lc, parent_namespaces) + { + deparse_namespace *olddpns = (deparse_namespace *) lfirst(lc); + ListCell *lc2; + + foreach(lc2, olddpns->rtable_names) + { + char *oldname = (char *) lfirst(lc2); + + if (oldname == NULL) + continue; + hentry = (NameHashEntry *) hash_search(names_hash, + oldname, + HASH_ENTER, + &found); + /* we do not complain about duplicate names in parent namespaces */ + hentry->counter = 0; + } + } + + /* Now we can scan the rtable */ + rtindex = 1; + foreach(lc, dpns->rtable) + { + RangeTblEntry *rte = (RangeTblEntry *) lfirst(lc); + char *refname; + + /* Just in case this takes an unreasonable amount of time ... */ + CHECK_FOR_INTERRUPTS(); + + if (rels_used && !bms_is_member(rtindex, rels_used)) + { + /* Ignore unreferenced RTE */ + refname = NULL; + } + else if (rte->alias) + { + /* If RTE has a user-defined alias, prefer that */ + refname = rte->alias->aliasname; + } + else if (rte->rtekind == RTE_RELATION) + { + /* Use the current actual name of the relation */ + refname = get_rel_name(rte->relid); + } + else if (rte->rtekind == RTE_JOIN) + { + /* Unnamed join has no refname */ + refname = NULL; + } + else + { + /* Otherwise use whatever the parser assigned */ + refname = rte->eref->aliasname; + } + + /* + * If the selected name isn't unique, append digits to make it so, and + * make a new hash entry for it once we've got a unique name. For a + * very long input name, we might have to truncate to stay within + * NAMEDATALEN. + */ + if (refname) + { + hentry = (NameHashEntry *) hash_search(names_hash, + refname, + HASH_ENTER, + &found); + if (found) + { + /* Name already in use, must choose a new one */ + int refnamelen = strlen(refname); + char *modname = (char *) palloc(refnamelen + 16); + NameHashEntry *hentry2; + + do + { + hentry->counter++; + for (;;) + { + /* + * We avoid using %.*s here because it can misbehave + * if the data is not valid in what libc thinks is the + * prevailing encoding. + */ + memcpy(modname, refname, refnamelen); + sprintf(modname + refnamelen, "_%d", hentry->counter); + if (strlen(modname) < NAMEDATALEN) + break; + /* drop chars from refname to keep all the digits */ + refnamelen = pg_mbcliplen(refname, refnamelen, + refnamelen - 1); + } + hentry2 = (NameHashEntry *) hash_search(names_hash, + modname, + HASH_ENTER, + &found); + } while (found); + hentry2->counter = 0; /* init new hash entry */ + refname = modname; + } + else + { + /* Name not previously used, need only initialize hentry */ + hentry->counter = 0; + } + } + + dpns->rtable_names = lappend(dpns->rtable_names, refname); + rtindex++; + } + + hash_destroy(names_hash); +} + +/* + * set_deparse_for_query: set up deparse_namespace for deparsing a Query tree + * + * For convenience, this is defined to initialize the deparse_namespace struct + * from scratch. 
+ */ +static void +set_deparse_for_query(deparse_namespace *dpns, Query *query, + List *parent_namespaces) +{ + ListCell *lc; + ListCell *lc2; + + /* Initialize *dpns and fill rtable/ctes links */ + memset(dpns, 0, sizeof(deparse_namespace)); + dpns->rtable = query->rtable; + dpns->ctes = query->cteList; + + /* Assign a unique relation alias to each RTE */ + set_rtable_names(dpns, parent_namespaces, NULL); + + /* Initialize dpns->rtable_columns to contain zeroed structs */ + dpns->rtable_columns = NIL; + while (list_length(dpns->rtable_columns) < list_length(dpns->rtable)) + dpns->rtable_columns = lappend(dpns->rtable_columns, + palloc0(sizeof(deparse_columns))); + + /* If it's a utility query, it won't have a jointree */ + if (query->jointree) + { + /* Detect whether global uniqueness of USING names is needed */ + dpns->unique_using = + has_dangerous_join_using(dpns, (Node *) query->jointree); + + /* + * Select names for columns merged by USING, via a recursive pass over + * the query jointree. + */ + set_using_names(dpns, (Node *) query->jointree, NIL); + } + + /* + * Now assign remaining column aliases for each RTE. We do this in a + * linear scan of the rtable, so as to process RTEs whether or not they + * are in the jointree (we mustn't miss NEW.*, INSERT target relations, + * etc). JOIN RTEs must be processed after their children, but this is + * okay because they appear later in the rtable list than their children + * (cf Asserts in identify_join_columns()). + */ + forboth(lc, dpns->rtable, lc2, dpns->rtable_columns) + { + RangeTblEntry *rte = (RangeTblEntry *) lfirst(lc); + deparse_columns *colinfo = (deparse_columns *) lfirst(lc2); + + if (rte->rtekind == RTE_JOIN) + set_join_column_names(dpns, rte, colinfo); + else + set_relation_column_names(dpns, rte, colinfo); + } +} + +/* + * set_simple_column_names: fill in column aliases for non-query situations + * + * This handles EXPLAIN and cases where we only have relation RTEs. Without + * a join tree, we can't do anything smart about join RTEs, but we don't + * need to (note that EXPLAIN should never see join alias Vars anyway). + * If we do hit a join RTE we'll just process it like a non-table base RTE. + */ +static void +set_simple_column_names(deparse_namespace *dpns) +{ + ListCell *lc; + ListCell *lc2; + + /* Initialize dpns->rtable_columns to contain zeroed structs */ + dpns->rtable_columns = NIL; + while (list_length(dpns->rtable_columns) < list_length(dpns->rtable)) + dpns->rtable_columns = lappend(dpns->rtable_columns, + palloc0(sizeof(deparse_columns))); + + /* Assign unique column aliases within each RTE */ + forboth(lc, dpns->rtable, lc2, dpns->rtable_columns) + { + RangeTblEntry *rte = (RangeTblEntry *) lfirst(lc); + deparse_columns *colinfo = (deparse_columns *) lfirst(lc2); + + set_relation_column_names(dpns, rte, colinfo); + } +} + +/* + * has_dangerous_join_using: search jointree for unnamed JOIN USING + * + * Merged columns of a JOIN USING may act differently from either of the input + * columns, either because they are merged with COALESCE (in a FULL JOIN) or + * because an implicit coercion of the underlying input column is required. + * In such a case the column must be referenced as a column of the JOIN not as + * a column of either input. And this is problematic if the join is unnamed + * (alias-less): we cannot qualify the column's name with an RTE name, since + * there is none. (Forcibly assigning an alias to the join is not a solution, + * since that will prevent legal references to tables below the join.) 
+ * To ensure that every column in the query is unambiguously referenceable, + * we must assign such merged columns names that are globally unique across + * the whole query, aliasing other columns out of the way as necessary. + * + * Because the ensuing re-aliasing is fairly damaging to the readability of + * the query, we don't do this unless we have to. So, we must pre-scan + * the join tree to see if we have to, before starting set_using_names(). + */ +static bool +has_dangerous_join_using(deparse_namespace *dpns, Node *jtnode) +{// #lizard forgives + if (IsA(jtnode, RangeTblRef)) + { + /* nothing to do here */ + } + else if (IsA(jtnode, FromExpr)) + { + FromExpr *f = (FromExpr *) jtnode; + ListCell *lc; + + foreach(lc, f->fromlist) + { + if (has_dangerous_join_using(dpns, (Node *) lfirst(lc))) + return true; + } + } + else if (IsA(jtnode, JoinExpr)) + { + JoinExpr *j = (JoinExpr *) jtnode; + + /* Is it an unnamed JOIN with USING? */ + if (j->alias == NULL && j->usingClause) + { + /* + * Yes, so check each join alias var to see if any of them are not + * simple references to underlying columns. If so, we have a + * dangerous situation and must pick unique aliases. + */ + RangeTblEntry *jrte = rt_fetch(j->rtindex, dpns->rtable); + ListCell *lc; + + foreach(lc, jrte->joinaliasvars) + { + Var *aliasvar = (Var *) lfirst(lc); + + if (aliasvar != NULL && !IsA(aliasvar, Var)) + return true; + } + } + + /* Nope, but inspect children */ + if (has_dangerous_join_using(dpns, j->larg)) + return true; + if (has_dangerous_join_using(dpns, j->rarg)) + return true; + } + else + elog(ERROR, "unrecognized node type: %d", + (int) nodeTag(jtnode)); + return false; +} + +/* + * set_using_names: select column aliases to be used for merged USING columns + * + * We do this during a recursive descent of the query jointree. + * dpns->unique_using must already be set to determine the global strategy. + * + * Column alias info is saved in the dpns->rtable_columns list, which is + * assumed to be filled with pre-zeroed deparse_columns structs. + * + * parentUsing is a list of all USING aliases assigned in parent joins of + * the current jointree node. (The passed-in list must not be modified.) + */ +static void +set_using_names(deparse_namespace *dpns, Node *jtnode, List *parentUsing) +{// #lizard forgives + if (IsA(jtnode, RangeTblRef)) + { + /* nothing to do now */ + } + else if (IsA(jtnode, FromExpr)) + { + FromExpr *f = (FromExpr *) jtnode; + ListCell *lc; + + foreach(lc, f->fromlist) + set_using_names(dpns, (Node *) lfirst(lc), parentUsing); + } + else if (IsA(jtnode, JoinExpr)) + { + JoinExpr *j = (JoinExpr *) jtnode; + RangeTblEntry *rte = rt_fetch(j->rtindex, dpns->rtable); + deparse_columns *colinfo = deparse_columns_fetch(j->rtindex, dpns); + int *leftattnos; + int *rightattnos; + deparse_columns *leftcolinfo; + deparse_columns *rightcolinfo; + int i; + ListCell *lc; + + /* Get info about the shape of the join */ + identify_join_columns(j, rte, colinfo); + leftattnos = colinfo->leftattnos; + rightattnos = colinfo->rightattnos; + + /* Look up the not-yet-filled-in child deparse_columns structs */ + leftcolinfo = deparse_columns_fetch(colinfo->leftrti, dpns); + rightcolinfo = deparse_columns_fetch(colinfo->rightrti, dpns); + + /* + * If this join is unnamed, then we cannot substitute new aliases at + * this level, so any name requirements pushed down to here must be + * pushed down again to the children. 
+ */ + if (rte->alias == NULL) + { + for (i = 0; i < colinfo->num_cols; i++) + { + char *colname = colinfo->colnames[i]; + + if (colname == NULL) + continue; + + /* Push down to left column, unless it's a system column */ + if (leftattnos[i] > 0) + { + expand_colnames_array_to(leftcolinfo, leftattnos[i]); + leftcolinfo->colnames[leftattnos[i] - 1] = colname; + } + + /* Same on the righthand side */ + if (rightattnos[i] > 0) + { + expand_colnames_array_to(rightcolinfo, rightattnos[i]); + rightcolinfo->colnames[rightattnos[i] - 1] = colname; + } + } + } + + /* + * If there's a USING clause, select the USING column names and push + * those names down to the children. We have two strategies: + * + * If dpns->unique_using is TRUE, we force all USING names to be + * unique across the whole query level. In principle we'd only need + * the names of dangerous USING columns to be globally unique, but to + * safely assign all USING names in a single pass, we have to enforce + * the same uniqueness rule for all of them. However, if a USING + * column's name has been pushed down from the parent, we should use + * it as-is rather than making a uniqueness adjustment. This is + * necessary when we're at an unnamed join, and it creates no risk of + * ambiguity. Also, if there's a user-written output alias for a + * merged column, we prefer to use that rather than the input name; + * this simplifies the logic and seems likely to lead to less aliasing + * overall. + * + * If dpns->unique_using is FALSE, we only need USING names to be + * unique within their own join RTE. We still need to honor + * pushed-down names, though. + * + * Though significantly different in results, these two strategies are + * implemented by the same code, with only the difference of whether + * to put assigned names into dpns->using_names. 
+ */ + if (j->usingClause) + { + /* Copy the input parentUsing list so we don't modify it */ + parentUsing = list_copy(parentUsing); + + /* USING names must correspond to the first join output columns */ + expand_colnames_array_to(colinfo, list_length(j->usingClause)); + i = 0; + foreach(lc, j->usingClause) + { + char *colname = strVal(lfirst(lc)); + + /* Assert it's a merged column */ + Assert(leftattnos[i] != 0 && rightattnos[i] != 0); + + /* Adopt passed-down name if any, else select unique name */ + if (colinfo->colnames[i] != NULL) + colname = colinfo->colnames[i]; + else + { + /* Prefer user-written output alias if any */ + if (rte->alias && i < list_length(rte->alias->colnames)) + colname = strVal(list_nth(rte->alias->colnames, i)); + /* Make it appropriately unique */ + colname = make_colname_unique(colname, dpns, colinfo); + if (dpns->unique_using) + dpns->using_names = lappend(dpns->using_names, + colname); + /* Save it as output column name, too */ + colinfo->colnames[i] = colname; + } + + /* Remember selected names for use later */ + colinfo->usingNames = lappend(colinfo->usingNames, colname); + parentUsing = lappend(parentUsing, colname); + + /* Push down to left column, unless it's a system column */ + if (leftattnos[i] > 0) + { + expand_colnames_array_to(leftcolinfo, leftattnos[i]); + leftcolinfo->colnames[leftattnos[i] - 1] = colname; + } + + /* Same on the righthand side */ + if (rightattnos[i] > 0) + { + expand_colnames_array_to(rightcolinfo, rightattnos[i]); + rightcolinfo->colnames[rightattnos[i] - 1] = colname; + } + + i++; + } + } + + /* Mark child deparse_columns structs with correct parentUsing info */ + leftcolinfo->parentUsing = parentUsing; + rightcolinfo->parentUsing = parentUsing; + + /* Now recursively assign USING column names in children */ + set_using_names(dpns, j->larg, parentUsing); + set_using_names(dpns, j->rarg, parentUsing); + } + else + elog(ERROR, "unrecognized node type: %d", + (int) nodeTag(jtnode)); +} + +/* + * set_relation_column_names: select column aliases for a non-join RTE + * + * Column alias info is saved in *colinfo, which is assumed to be pre-zeroed. + * If any colnames entries are already filled in, those override local + * choices. + */ +static void +set_relation_column_names(deparse_namespace *dpns, RangeTblEntry *rte, + deparse_columns *colinfo) +{// #lizard forgives + int ncolumns; + char **real_colnames; + bool changed_any; + int noldcolumns; + int i; + int j; + + /* + * Extract the RTE's "real" column names. This is comparable to + * get_rte_attribute_name, except that it's important to disregard dropped + * columns. We put NULL into the array for a dropped column. 
+ */ + if (rte->rtekind == RTE_RELATION) + { + /* Relation --- look to the system catalogs for up-to-date info */ + Relation rel; + TupleDesc tupdesc; + + rel = relation_open(rte->relid, AccessShareLock); + tupdesc = RelationGetDescr(rel); + + ncolumns = tupdesc->natts; + real_colnames = (char **) palloc(ncolumns * sizeof(char *)); + + for (i = 0; i < ncolumns; i++) + { + if (tupdesc->attrs[i]->attisdropped) + real_colnames[i] = NULL; + else + real_colnames[i] = pstrdup(NameStr(tupdesc->attrs[i]->attname)); + } + relation_close(rel, AccessShareLock); + } + else + { + /* Otherwise use the column names from eref */ + ListCell *lc; + + ncolumns = list_length(rte->eref->colnames); + real_colnames = (char **) palloc(ncolumns * sizeof(char *)); + + i = 0; + foreach(lc, rte->eref->colnames) + { + /* + * If the column name shown in eref is an empty string, then it's + * a column that was dropped at the time of parsing the query, so + * treat it as dropped. + */ + char *cname = strVal(lfirst(lc)); + + if (cname[0] == '\0') + cname = NULL; + real_colnames[i] = cname; + i++; + } + } + + /* + * Ensure colinfo->colnames has a slot for each column. (It could be long + * enough already, if we pushed down a name for the last column.) Note: + * it's possible that there are now more columns than there were when the + * query was parsed, ie colnames could be longer than rte->eref->colnames. + * We must assign unique aliases to the new columns too, else there could + * be unresolved conflicts when the view/rule is reloaded. + */ + expand_colnames_array_to(colinfo, ncolumns); + Assert(colinfo->num_cols == ncolumns); + + /* + * Make sufficiently large new_colnames and is_new_col arrays, too. + * + * Note: because we leave colinfo->num_new_cols zero until after the loop, + * colname_is_unique will not consult that array, which is fine because it + * would only be duplicate effort. + */ + colinfo->new_colnames = (char **) palloc(ncolumns * sizeof(char *)); + colinfo->is_new_col = (bool *) palloc(ncolumns * sizeof(bool)); + + /* + * Scan the columns, select a unique alias for each one, and store it in + * colinfo->colnames and colinfo->new_colnames. The former array has NULL + * entries for dropped columns, the latter omits them. Also mark + * new_colnames entries as to whether they are new since parse time; this + * is the case for entries beyond the length of rte->eref->colnames. 
+ */ + noldcolumns = list_length(rte->eref->colnames); + changed_any = false; + j = 0; + for (i = 0; i < ncolumns; i++) + { + char *real_colname = real_colnames[i]; + char *colname = colinfo->colnames[i]; + + /* Skip dropped columns */ + if (real_colname == NULL) + { + Assert(colname == NULL); /* colnames[i] is already NULL */ + continue; + } + + /* If alias already assigned, that's what to use */ + if (colname == NULL) + { + /* If user wrote an alias, prefer that over real column name */ + if (rte->alias && i < list_length(rte->alias->colnames)) + colname = strVal(list_nth(rte->alias->colnames, i)); + else + colname = real_colname; + + /* Unique-ify and insert into colinfo */ + colname = make_colname_unique(colname, dpns, colinfo); + + colinfo->colnames[i] = colname; + } + + /* Put names of non-dropped columns in new_colnames[] too */ + colinfo->new_colnames[j] = colname; + /* And mark them as new or not */ + colinfo->is_new_col[j] = (i >= noldcolumns); + j++; + + /* Remember if any assigned aliases differ from "real" name */ + if (!changed_any && strcmp(colname, real_colname) != 0) + changed_any = true; + } + + /* + * Set correct length for new_colnames[] array. (Note: if columns have + * been added, colinfo->num_cols includes them, which is not really quite + * right but is harmless, since any new columns must be at the end where + * they won't affect varattnos of pre-existing columns.) + */ + colinfo->num_new_cols = j; + + /* + * For a relation RTE, we need only print the alias column names if any + * are different from the underlying "real" names. For a function RTE, + * always emit a complete column alias list; this is to protect against + * possible instability of the default column names (eg, from altering + * parameter names). For tablefunc RTEs, we never print aliases, because + * the column names are part of the clause itself. For other RTE types, + * print if we changed anything OR if there were user-written column + * aliases (since the latter would be part of the underlying "reality"). + */ + if (rte->rtekind == RTE_RELATION) + colinfo->printaliases = changed_any; + else if (rte->rtekind == RTE_FUNCTION) + colinfo->printaliases = true; + else if (rte->rtekind == RTE_TABLEFUNC) + colinfo->printaliases = false; + else if (rte->alias && rte->alias->colnames != NIL) + colinfo->printaliases = true; + else + colinfo->printaliases = changed_any; +} + +/* + * set_join_column_names: select column aliases for a join RTE + * + * Column alias info is saved in *colinfo, which is assumed to be pre-zeroed. + * If any colnames entries are already filled in, those override local + * choices. Also, names for USING columns were already chosen by + * set_using_names(). We further expect that column alias selection has been + * completed for both input RTEs. + */ +static void +set_join_column_names(deparse_namespace *dpns, RangeTblEntry *rte, + deparse_columns *colinfo) +{// #lizard forgives + deparse_columns *leftcolinfo; + deparse_columns *rightcolinfo; + bool changed_any; + int noldcolumns; + int nnewcolumns; + Bitmapset *leftmerged = NULL; + Bitmapset *rightmerged = NULL; + int i; + int j; + int ic; + int jc; + + /* Look up the previously-filled-in child deparse_columns structs */ + leftcolinfo = deparse_columns_fetch(colinfo->leftrti, dpns); + rightcolinfo = deparse_columns_fetch(colinfo->rightrti, dpns); + + /* + * Ensure colinfo->colnames has a slot for each column. (It could be long + * enough already, if we pushed down a name for the last column.) 
Note: + * it's possible that one or both inputs now have more columns than there + * were when the query was parsed, but we'll deal with that below. We + * only need entries in colnames for pre-existing columns. + */ + noldcolumns = list_length(rte->eref->colnames); + expand_colnames_array_to(colinfo, noldcolumns); + Assert(colinfo->num_cols == noldcolumns); + + /* + * Scan the join output columns, select an alias for each one, and store + * it in colinfo->colnames. If there are USING columns, set_using_names() + * already selected their names, so we can start the loop at the first + * non-merged column. + */ + changed_any = false; + for (i = list_length(colinfo->usingNames); i < noldcolumns; i++) + { + char *colname = colinfo->colnames[i]; + char *real_colname; + + /* Ignore dropped column (only possible for non-merged column) */ + if (colinfo->leftattnos[i] == 0 && colinfo->rightattnos[i] == 0) + { + Assert(colname == NULL); + continue; + } + + /* Get the child column name */ + if (colinfo->leftattnos[i] > 0) + real_colname = leftcolinfo->colnames[colinfo->leftattnos[i] - 1]; + else if (colinfo->rightattnos[i] > 0) + real_colname = rightcolinfo->colnames[colinfo->rightattnos[i] - 1]; + else + { + /* We're joining system columns --- use eref name */ + real_colname = strVal(list_nth(rte->eref->colnames, i)); + } + Assert(real_colname != NULL); + + /* In an unnamed join, just report child column names as-is */ + if (rte->alias == NULL) + { + colinfo->colnames[i] = real_colname; + continue; + } + + /* If alias already assigned, that's what to use */ + if (colname == NULL) + { + /* If user wrote an alias, prefer that over real column name */ + if (rte->alias && i < list_length(rte->alias->colnames)) + colname = strVal(list_nth(rte->alias->colnames, i)); + else + colname = real_colname; + + /* Unique-ify and insert into colinfo */ + colname = make_colname_unique(colname, dpns, colinfo); + + colinfo->colnames[i] = colname; + } + + /* Remember if any assigned aliases differ from "real" name */ + if (!changed_any && strcmp(colname, real_colname) != 0) + changed_any = true; + } + + /* + * Calculate number of columns the join would have if it were re-parsed + * now, and create storage for the new_colnames and is_new_col arrays. + * + * Note: colname_is_unique will be consulting new_colnames[] during the + * loops below, so its not-yet-filled entries must be zeroes. + */ + nnewcolumns = leftcolinfo->num_new_cols + rightcolinfo->num_new_cols - + list_length(colinfo->usingNames); + colinfo->num_new_cols = nnewcolumns; + colinfo->new_colnames = (char **) palloc0(nnewcolumns * sizeof(char *)); + colinfo->is_new_col = (bool *) palloc0(nnewcolumns * sizeof(bool)); + + /* + * Generating the new_colnames array is a bit tricky since any new columns + * added since parse time must be inserted in the right places. This code + * must match the parser, which will order a join's columns as merged + * columns first (in USING-clause order), then non-merged columns from the + * left input (in attnum order), then non-merged columns from the right + * input (ditto). If one of the inputs is itself a join, its columns will + * be ordered according to the same rule, which means newly-added columns + * might not be at the end. We can figure out what's what by consulting + * the leftattnos and rightattnos arrays plus the input is_new_col arrays. 
+ * + * In these loops, i indexes leftattnos/rightattnos (so it's join varattno + * less one), j indexes new_colnames/is_new_col, and ic/jc have similar + * meanings for the current child RTE. + */ + + /* Handle merged columns; they are first and can't be new */ + i = j = 0; + while (i < noldcolumns && + colinfo->leftattnos[i] != 0 && + colinfo->rightattnos[i] != 0) + { + /* column name is already determined and known unique */ + colinfo->new_colnames[j] = colinfo->colnames[i]; + colinfo->is_new_col[j] = false; + + /* build bitmapsets of child attnums of merged columns */ + if (colinfo->leftattnos[i] > 0) + leftmerged = bms_add_member(leftmerged, colinfo->leftattnos[i]); + if (colinfo->rightattnos[i] > 0) + rightmerged = bms_add_member(rightmerged, colinfo->rightattnos[i]); + + i++, j++; + } + + /* Handle non-merged left-child columns */ + ic = 0; + for (jc = 0; jc < leftcolinfo->num_new_cols; jc++) + { + char *child_colname = leftcolinfo->new_colnames[jc]; + + if (!leftcolinfo->is_new_col[jc]) + { + /* Advance ic to next non-dropped old column of left child */ + while (ic < leftcolinfo->num_cols && + leftcolinfo->colnames[ic] == NULL) + ic++; + Assert(ic < leftcolinfo->num_cols); + ic++; + /* If it is a merged column, we already processed it */ + if (bms_is_member(ic, leftmerged)) + continue; + /* Else, advance i to the corresponding existing join column */ + while (i < colinfo->num_cols && + colinfo->colnames[i] == NULL) + i++; + Assert(i < colinfo->num_cols); + Assert(ic == colinfo->leftattnos[i]); + /* Use the already-assigned name of this column */ + colinfo->new_colnames[j] = colinfo->colnames[i]; + i++; + } + else + { + /* + * Unique-ify the new child column name and assign, unless we're + * in an unnamed join, in which case just copy + */ + if (rte->alias != NULL) + { + colinfo->new_colnames[j] = + make_colname_unique(child_colname, dpns, colinfo); + if (!changed_any && + strcmp(colinfo->new_colnames[j], child_colname) != 0) + changed_any = true; + } + else + colinfo->new_colnames[j] = child_colname; + } + + colinfo->is_new_col[j] = leftcolinfo->is_new_col[jc]; + j++; + } + + /* Handle non-merged right-child columns in exactly the same way */ + ic = 0; + for (jc = 0; jc < rightcolinfo->num_new_cols; jc++) + { + char *child_colname = rightcolinfo->new_colnames[jc]; + + if (!rightcolinfo->is_new_col[jc]) + { + /* Advance ic to next non-dropped old column of right child */ + while (ic < rightcolinfo->num_cols && + rightcolinfo->colnames[ic] == NULL) + ic++; + Assert(ic < rightcolinfo->num_cols); + ic++; + /* If it is a merged column, we already processed it */ + if (bms_is_member(ic, rightmerged)) + continue; + /* Else, advance i to the corresponding existing join column */ + while (i < colinfo->num_cols && + colinfo->colnames[i] == NULL) + i++; + Assert(i < colinfo->num_cols); + Assert(ic == colinfo->rightattnos[i]); + /* Use the already-assigned name of this column */ + colinfo->new_colnames[j] = colinfo->colnames[i]; + i++; + } + else + { + /* + * Unique-ify the new child column name and assign, unless we're + * in an unnamed join, in which case just copy + */ + if (rte->alias != NULL) + { + colinfo->new_colnames[j] = + make_colname_unique(child_colname, dpns, colinfo); + if (!changed_any && + strcmp(colinfo->new_colnames[j], child_colname) != 0) + changed_any = true; + } + else + colinfo->new_colnames[j] = child_colname; + } + + colinfo->is_new_col[j] = rightcolinfo->is_new_col[jc]; + j++; + } + + /* Assert we processed the right number of columns */ +#ifdef USE_ASSERT_CHECKING + 
while (i < colinfo->num_cols && colinfo->colnames[i] == NULL) + i++; + Assert(i == colinfo->num_cols); + Assert(j == nnewcolumns); +#endif + + /* + * For a named join, print column aliases if we changed any from the child + * names. Unnamed joins cannot print aliases. + */ + if (rte->alias != NULL) + colinfo->printaliases = changed_any; + else + colinfo->printaliases = false; +} + +/* + * colname_is_unique: is colname distinct from already-chosen column names? + * + * dpns is query-wide info, colinfo is for the column's RTE + */ +static bool +colname_is_unique(char *colname, deparse_namespace *dpns, + deparse_columns *colinfo) +{// #lizard forgives + int i; + ListCell *lc; + + /* Check against already-assigned column aliases within RTE */ + for (i = 0; i < colinfo->num_cols; i++) + { + char *oldname = colinfo->colnames[i]; + + if (oldname && strcmp(oldname, colname) == 0) + return false; + } + + /* + * If we're building a new_colnames array, check that too (this will be + * partially but not completely redundant with the previous checks) + */ + for (i = 0; i < colinfo->num_new_cols; i++) + { + char *oldname = colinfo->new_colnames[i]; + + if (oldname && strcmp(oldname, colname) == 0) + return false; + } + + /* Also check against USING-column names that must be globally unique */ + foreach(lc, dpns->using_names) + { + char *oldname = (char *) lfirst(lc); + + if (strcmp(oldname, colname) == 0) + return false; + } + + /* Also check against names already assigned for parent-join USING cols */ + foreach(lc, colinfo->parentUsing) + { + char *oldname = (char *) lfirst(lc); + + if (strcmp(oldname, colname) == 0) + return false; + } + + return true; +} + +/* + * make_colname_unique: modify colname if necessary to make it unique + * + * dpns is query-wide info, colinfo is for the column's RTE + */ +static char * +make_colname_unique(char *colname, deparse_namespace *dpns, + deparse_columns *colinfo) +{ + /* + * If the selected name isn't unique, append digits to make it so. For a + * very long input name, we might have to truncate to stay within + * NAMEDATALEN. + */ + if (!colname_is_unique(colname, dpns, colinfo)) + { + int colnamelen = strlen(colname); + char *modname = (char *) palloc(colnamelen + 16); + int i = 0; + + do + { + i++; + for (;;) + { + /* + * We avoid using %.*s here because it can misbehave if the + * data is not valid in what libc thinks is the prevailing + * encoding. + */ + memcpy(modname, colname, colnamelen); + sprintf(modname + colnamelen, "_%d", i); + if (strlen(modname) < NAMEDATALEN) + break; + /* drop chars from colname to keep all the digits */ + colnamelen = pg_mbcliplen(colname, colnamelen, + colnamelen - 1); + } + } while (!colname_is_unique(modname, dpns, colinfo)); + colname = modname; + } + return colname; +} + +/* + * expand_colnames_array_to: make colinfo->colnames at least n items long + * + * Any added array entries are initialized to zero. + */ +static void +expand_colnames_array_to(deparse_columns *colinfo, int n) +{ + if (n > colinfo->num_cols) + { + if (colinfo->colnames == NULL) + colinfo->colnames = (char **) palloc0(n * sizeof(char *)); + else + { + colinfo->colnames = (char **) repalloc(colinfo->colnames, + n * sizeof(char *)); + memset(colinfo->colnames + colinfo->num_cols, 0, + (n - colinfo->num_cols) * sizeof(char *)); + } + colinfo->num_cols = n; + } +} + +/* + * identify_join_columns: figure out where columns of a join come from + * + * Fills the join-specific fields of the colinfo struct, except for + * usingNames which is filled later. 
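+ * A zero entry in leftattnos[] or rightattnos[] means that the join output
+ * column does not come from that side of the join; dropped columns end up
+ * with zeroes on both sides.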
+ */ +static void +identify_join_columns(JoinExpr *j, RangeTblEntry *jrte, + deparse_columns *colinfo) +{// #lizard forgives + int numjoincols; + int i; + ListCell *lc; + + /* Extract left/right child RT indexes */ + if (IsA(j->larg, RangeTblRef)) + colinfo->leftrti = ((RangeTblRef *) j->larg)->rtindex; + else if (IsA(j->larg, JoinExpr)) + colinfo->leftrti = ((JoinExpr *) j->larg)->rtindex; + else + elog(ERROR, "unrecognized node type in jointree: %d", + (int) nodeTag(j->larg)); + if (IsA(j->rarg, RangeTblRef)) + colinfo->rightrti = ((RangeTblRef *) j->rarg)->rtindex; + else if (IsA(j->rarg, JoinExpr)) + colinfo->rightrti = ((JoinExpr *) j->rarg)->rtindex; + else + elog(ERROR, "unrecognized node type in jointree: %d", + (int) nodeTag(j->rarg)); + + /* Assert children will be processed earlier than join in second pass */ + Assert(colinfo->leftrti < j->rtindex); + Assert(colinfo->rightrti < j->rtindex); + + /* Initialize result arrays with zeroes */ + numjoincols = list_length(jrte->joinaliasvars); + Assert(numjoincols == list_length(jrte->eref->colnames)); + colinfo->leftattnos = (int *) palloc0(numjoincols * sizeof(int)); + colinfo->rightattnos = (int *) palloc0(numjoincols * sizeof(int)); + + /* Scan the joinaliasvars list to identify simple column references */ + i = 0; + foreach(lc, jrte->joinaliasvars) + { + Var *aliasvar = (Var *) lfirst(lc); + + /* get rid of any implicit coercion above the Var */ + aliasvar = (Var *) strip_implicit_coercions((Node *) aliasvar); + + if (aliasvar == NULL) + { + /* It's a dropped column; nothing to do here */ + } + else if (IsA(aliasvar, Var)) + { + Assert(aliasvar->varlevelsup == 0); + Assert(aliasvar->varattno != 0); + if (aliasvar->varno == colinfo->leftrti) + colinfo->leftattnos[i] = aliasvar->varattno; + else if (aliasvar->varno == colinfo->rightrti) + colinfo->rightattnos[i] = aliasvar->varattno; + else + elog(ERROR, "unexpected varno %d in JOIN RTE", + aliasvar->varno); + } + else if (IsA(aliasvar, CoalesceExpr)) + { + /* + * It's a merged column in FULL JOIN USING. Ignore it for now and + * let the code below identify the merged columns. + */ + } + else + elog(ERROR, "unrecognized node type in join alias vars: %d", + (int) nodeTag(aliasvar)); + + i++; + } + + /* + * If there's a USING clause, deconstruct the join quals to identify the + * merged columns. This is a tad painful but if we cannot rely on the + * column names, there is no other representation of which columns were + * joined by USING. (Unless the join type is FULL, we can't tell from the + * joinaliasvars list which columns are merged.) Note: we assume that the + * merged columns are the first output column(s) of the join. 
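+	 * For example, "t1 JOIN t2 USING (a, b)" has quals of the form
+	 * "t1.a = t2.a AND t1.b = t2.b"; flatten_join_using_qual() below pulls
+	 * out the t1 and t2 Vars in USING-list order, letting us mark both
+	 * leftattnos and rightattnos for each merged column.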
+ */ + if (j->usingClause) + { + List *leftvars = NIL; + List *rightvars = NIL; + ListCell *lc2; + + /* Extract left- and right-side Vars from the qual expression */ + flatten_join_using_qual(j->quals, &leftvars, &rightvars); + Assert(list_length(leftvars) == list_length(j->usingClause)); + Assert(list_length(rightvars) == list_length(j->usingClause)); + + /* Mark the output columns accordingly */ + i = 0; + forboth(lc, leftvars, lc2, rightvars) + { + Var *leftvar = (Var *) lfirst(lc); + Var *rightvar = (Var *) lfirst(lc2); + + Assert(leftvar->varlevelsup == 0); + Assert(leftvar->varattno != 0); + if (leftvar->varno != colinfo->leftrti) + elog(ERROR, "unexpected varno %d in JOIN USING qual", + leftvar->varno); + colinfo->leftattnos[i] = leftvar->varattno; + + Assert(rightvar->varlevelsup == 0); + Assert(rightvar->varattno != 0); + if (rightvar->varno != colinfo->rightrti) + elog(ERROR, "unexpected varno %d in JOIN USING qual", + rightvar->varno); + colinfo->rightattnos[i] = rightvar->varattno; + + i++; + } + } +} + +/* + * flatten_join_using_qual: extract Vars being joined from a JOIN/USING qual + * + * We assume that transformJoinUsingClause won't have produced anything except + * AND nodes, equality operator nodes, and possibly implicit coercions, and + * that the AND node inputs match left-to-right with the original USING list. + * + * Caller must initialize the result lists to NIL. + */ +static void +flatten_join_using_qual(Node *qual, List **leftvars, List **rightvars) +{ + if (IsA(qual, BoolExpr)) + { + /* Handle AND nodes by recursion */ + BoolExpr *b = (BoolExpr *) qual; + ListCell *lc; + + Assert(b->boolop == AND_EXPR); + foreach(lc, b->args) + { + flatten_join_using_qual((Node *) lfirst(lc), + leftvars, rightvars); + } + } + else if (IsA(qual, OpExpr)) + { + /* Otherwise we should have an equality operator */ + OpExpr *op = (OpExpr *) qual; + Var *var; + + if (list_length(op->args) != 2) + elog(ERROR, "unexpected unary operator in JOIN/USING qual"); + /* Arguments should be Vars with perhaps implicit coercions */ + var = (Var *) strip_implicit_coercions((Node *) linitial(op->args)); + if (!IsA(var, Var)) + elog(ERROR, "unexpected node type in JOIN/USING qual: %d", + (int) nodeTag(var)); + *leftvars = lappend(*leftvars, var); + var = (Var *) strip_implicit_coercions((Node *) lsecond(op->args)); + if (!IsA(var, Var)) + elog(ERROR, "unexpected node type in JOIN/USING qual: %d", + (int) nodeTag(var)); + *rightvars = lappend(*rightvars, var); + } + else + { + /* Perhaps we have an implicit coercion to boolean? */ + Node *q = strip_implicit_coercions(qual); + + if (q != qual) + flatten_join_using_qual(q, leftvars, rightvars); + else + elog(ERROR, "unexpected node type in JOIN/USING qual: %d", + (int) nodeTag(qual)); + } +} + +/* + * get_rtable_name: convenience function to get a previously assigned RTE alias + * + * The RTE must belong to the topmost namespace level in "context". + */ +static char * +get_rtable_name(int rtindex, deparse_context *context) +{ + deparse_namespace *dpns = (deparse_namespace *) linitial(context->namespaces); + + Assert(rtindex > 0 && rtindex <= list_length(dpns->rtable_names)); + return (char *) list_nth(dpns->rtable_names, rtindex - 1); +} + +/* + * set_deparse_planstate: set up deparse_namespace to parse subexpressions + * of a given PlanState node + * + * This sets the planstate, outer_planstate, inner_planstate, outer_tlist, + * inner_tlist, and index_tlist fields. Caller is responsible for adjusting + * the ancestors list if necessary. 
Note that the rtable and ctes fields do + * not need to change when shifting attention to different plan nodes in a + * single plan tree. + */ +static void +set_deparse_planstate(deparse_namespace *dpns, PlanState *ps) +{// #lizard forgives + dpns->planstate = ps; + + /* + * We special-case Append and MergeAppend to pretend that the first child + * plan is the OUTER referent; we have to interpret OUTER Vars in their + * tlists according to one of the children, and the first one is the most + * natural choice. Likewise special-case ModifyTable to pretend that the + * first child plan is the OUTER referent; this is to support RETURNING + * lists containing references to non-target relations. + */ + if (IsA(ps, AppendState)) + dpns->outer_planstate = ((AppendState *) ps)->appendplans[0]; + else if (IsA(ps, MergeAppendState)) + dpns->outer_planstate = ((MergeAppendState *) ps)->mergeplans[0]; + else if (IsA(ps, ModifyTableState)) + dpns->outer_planstate = ((ModifyTableState *) ps)->mt_plans[0]; + else + dpns->outer_planstate = outerPlanState(ps); + + if (dpns->outer_planstate) + dpns->outer_tlist = dpns->outer_planstate->plan->targetlist; + else + dpns->outer_tlist = NIL; + + /* + * For a SubqueryScan, pretend the subplan is INNER referent. (We don't + * use OUTER because that could someday conflict with the normal meaning.) + * Likewise, for a CteScan, pretend the subquery's plan is INNER referent. + * For ON CONFLICT .. UPDATE we just need the inner tlist to point to the + * excluded expression's tlist. (Similar to the SubqueryScan we don't want + * to reuse OUTER, it's used for RETURNING in some modify table cases, + * although not INSERT .. CONFLICT). + */ + if (IsA(ps, SubqueryScanState)) + dpns->inner_planstate = ((SubqueryScanState *) ps)->subplan; + else if (IsA(ps, CteScanState)) + dpns->inner_planstate = ((CteScanState *) ps)->cteplanstate; + else if (IsA(ps, ModifyTableState)) + dpns->inner_planstate = ps; + else + dpns->inner_planstate = innerPlanState(ps); + + if (IsA(ps, ModifyTableState)) + dpns->inner_tlist = ((ModifyTableState *) ps)->mt_excludedtlist; + else if (dpns->inner_planstate) + dpns->inner_tlist = dpns->inner_planstate->plan->targetlist; + else + dpns->inner_tlist = NIL; + + /* Set up referent for INDEX_VAR Vars, if needed */ + if (IsA(ps->plan, IndexOnlyScan)) + dpns->index_tlist = ((IndexOnlyScan *) ps->plan)->indextlist; + else if (IsA(ps->plan, ForeignScan)) + dpns->index_tlist = ((ForeignScan *) ps->plan)->fdw_scan_tlist; + else if (IsA(ps->plan, CustomScan)) + dpns->index_tlist = ((CustomScan *) ps->plan)->custom_scan_tlist; + else + dpns->index_tlist = NIL; +} + +/* + * push_child_plan: temporarily transfer deparsing attention to a child plan + * + * When expanding an OUTER_VAR or INNER_VAR reference, we must adjust the + * deparse context in case the referenced expression itself uses + * OUTER_VAR/INNER_VAR. We modify the top stack entry in-place to avoid + * affecting levelsup issues (although in a Plan tree there really shouldn't + * be any). + * + * Caller must provide a local deparse_namespace variable to save the + * previous state for pop_child_plan. 
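+ *
+ * A typical call sequence (as in get_variable() and similar routines) is:
+ *
+ *		deparse_namespace save_dpns;
+ *
+ *		push_child_plan(dpns, dpns->outer_planstate, &save_dpns);
+ *		... deparse the expression found in the child's targetlist ...
+ *		pop_child_plan(dpns, &save_dpns);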
+ */ +static void +push_child_plan(deparse_namespace *dpns, PlanState *ps, + deparse_namespace *save_dpns) +{ + /* Save state for restoration later */ + *save_dpns = *dpns; + + /* Link current plan node into ancestors list */ + dpns->ancestors = lcons(dpns->planstate, dpns->ancestors); + + /* Set attention on selected child */ + set_deparse_planstate(dpns, ps); +} + +/* + * pop_child_plan: undo the effects of push_child_plan + */ +static void +pop_child_plan(deparse_namespace *dpns, deparse_namespace *save_dpns) +{ + List *ancestors; + + /* Get rid of ancestors list cell added by push_child_plan */ + ancestors = list_delete_first(dpns->ancestors); + + /* Restore fields changed by push_child_plan */ + *dpns = *save_dpns; + + /* Make sure dpns->ancestors is right (may be unnecessary) */ + dpns->ancestors = ancestors; +} + +/* + * push_ancestor_plan: temporarily transfer deparsing attention to an + * ancestor plan + * + * When expanding a Param reference, we must adjust the deparse context + * to match the plan node that contains the expression being printed; + * otherwise we'd fail if that expression itself contains a Param or + * OUTER_VAR/INNER_VAR/INDEX_VAR variable. + * + * The target ancestor is conveniently identified by the ListCell holding it + * in dpns->ancestors. + * + * Caller must provide a local deparse_namespace variable to save the + * previous state for pop_ancestor_plan. + */ +static void +push_ancestor_plan(deparse_namespace *dpns, ListCell *ancestor_cell, + deparse_namespace *save_dpns) +{ + PlanState *ps = (PlanState *) lfirst(ancestor_cell); + List *ancestors; + + /* Save state for restoration later */ + *save_dpns = *dpns; + + /* Build a new ancestor list with just this node's ancestors */ + ancestors = NIL; + while ((ancestor_cell = lnext(ancestor_cell)) != NULL) + ancestors = lappend(ancestors, lfirst(ancestor_cell)); + dpns->ancestors = ancestors; + + /* Set attention on selected ancestor */ + set_deparse_planstate(dpns, ps); +} + +/* + * pop_ancestor_plan: undo the effects of push_ancestor_plan + */ +static void +pop_ancestor_plan(deparse_namespace *dpns, deparse_namespace *save_dpns) +{ + /* Free the ancestor list made in push_ancestor_plan */ + list_free(dpns->ancestors); + + /* Restore fields changed by push_ancestor_plan */ + *dpns = *save_dpns; +} + + +/* ---------- + * make_ruledef - reconstruct the CREATE RULE command + * for a given pg_rewrite tuple + * ---------- + */ +static void +make_ruledef(StringInfo buf, HeapTuple ruletup, TupleDesc rulettc, + int prettyFlags) +{// #lizard forgives + char *rulename; + char ev_type; + Oid ev_class; + bool is_instead; + char *ev_qual; + char *ev_action; + List *actions = NIL; + Relation ev_relation; + TupleDesc viewResultDesc = NULL; + int fno; + Datum dat; + bool isnull; + + /* + * Get the attribute values from the rules tuple + */ + fno = SPI_fnumber(rulettc, "rulename"); + dat = SPI_getbinval(ruletup, rulettc, fno, &isnull); + Assert(!isnull); + rulename = NameStr(*(DatumGetName(dat))); + + fno = SPI_fnumber(rulettc, "ev_type"); + dat = SPI_getbinval(ruletup, rulettc, fno, &isnull); + Assert(!isnull); + ev_type = DatumGetChar(dat); + + fno = SPI_fnumber(rulettc, "ev_class"); + dat = SPI_getbinval(ruletup, rulettc, fno, &isnull); + Assert(!isnull); + ev_class = DatumGetObjectId(dat); + + fno = SPI_fnumber(rulettc, "is_instead"); + dat = SPI_getbinval(ruletup, rulettc, fno, &isnull); + Assert(!isnull); + is_instead = DatumGetBool(dat); + + /* these could be nulls */ + fno = SPI_fnumber(rulettc, "ev_qual"); + ev_qual = 
SPI_getvalue(ruletup, rulettc, fno); + + fno = SPI_fnumber(rulettc, "ev_action"); + ev_action = SPI_getvalue(ruletup, rulettc, fno); + if (ev_action != NULL) + actions = (List *) stringToNode(ev_action); + + ev_relation = heap_open(ev_class, AccessShareLock); + + /* + * Build the rules definition text + */ + appendStringInfo(buf, "CREATE RULE %s AS", + quote_identifier(rulename)); + + if (prettyFlags & PRETTYFLAG_INDENT) + appendStringInfoString(buf, "\n ON "); + else + appendStringInfoString(buf, " ON "); + + /* The event the rule is fired for */ + switch (ev_type) + { + case '1': + appendStringInfoString(buf, "SELECT"); + viewResultDesc = RelationGetDescr(ev_relation); + break; + + case '2': + appendStringInfoString(buf, "UPDATE"); + break; + + case '3': + appendStringInfoString(buf, "INSERT"); + break; + + case '4': + appendStringInfoString(buf, "DELETE"); + break; + + default: + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("rule \"%s\" has unsupported event type %d", + rulename, ev_type))); + break; + } + + /* The relation the rule is fired on */ + appendStringInfo(buf, " TO %s", generate_relation_name(ev_class, NIL)); + + /* If the rule has an event qualification, add it */ + if (ev_qual == NULL) + ev_qual = ""; + if (strlen(ev_qual) > 0 && strcmp(ev_qual, "<>") != 0) + { + Node *qual; + Query *query; + deparse_context context; + deparse_namespace dpns; + + if (prettyFlags & PRETTYFLAG_INDENT) + appendStringInfoString(buf, "\n "); + appendStringInfoString(buf, " WHERE "); + + qual = stringToNode(ev_qual); + + /* + * We need to make a context for recognizing any Vars in the qual + * (which can only be references to OLD and NEW). Use the rtable of + * the first query in the action list for this purpose. + */ + query = (Query *) linitial(actions); + + /* + * If the action is INSERT...SELECT, OLD/NEW have been pushed down + * into the SELECT, and that's what we need to look at. (Ugly kluge + * ... try to fix this when we redesign querytrees.) 
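+	 * getInsertSelectQuery() returns that contained SELECT when it exists,
+	 * and the original query otherwise, so the OLD/NEW RTEs we need are in
+	 * the rtable of whatever query it hands back.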
+ */ + query = getInsertSelectQuery(query, NULL); + + /* Must acquire locks right away; see notes in get_query_def() */ + AcquireRewriteLocks(query, false, false); + + context.buf = buf; + context.namespaces = list_make1(&dpns); + context.windowClause = NIL; + context.windowTList = NIL; + context.varprefix = (list_length(query->rtable) != 1); + context.prettyFlags = prettyFlags; + context.wrapColumn = WRAP_COLUMN_DEFAULT; + context.indentLevel = PRETTYINDENT_STD; + context.special_exprkind = EXPR_KIND_NONE; + + set_deparse_for_query(&dpns, query, NIL); + + get_rule_expr(qual, &context, false); + } + + appendStringInfoString(buf, " DO "); + + /* The INSTEAD keyword (if so) */ + if (is_instead) + appendStringInfoString(buf, "INSTEAD "); + + /* Finally the rules actions */ + if (list_length(actions) > 1) + { + ListCell *action; + Query *query; + + appendStringInfoChar(buf, '('); + foreach(action, actions) + { + query = (Query *) lfirst(action); + get_query_def(query, buf, NIL, viewResultDesc, + prettyFlags, WRAP_COLUMN_DEFAULT, 0 +#ifdef PGXC + , false, false +#endif /* PGXC */ + ); + if (prettyFlags) + appendStringInfoString(buf, ";\n"); + else + appendStringInfoString(buf, "; "); + } + appendStringInfoString(buf, ");"); + } + else if (list_length(actions) == 0) + { + appendStringInfoString(buf, "NOTHING;"); + } + else + { + Query *query; + + query = (Query *) linitial(actions); + get_query_def(query, buf, NIL, viewResultDesc, + prettyFlags, WRAP_COLUMN_DEFAULT, 0 +#ifdef PGXC + , false, false +#endif /* PGXC */ + ); + appendStringInfo(buf, ";"); + } + + heap_close(ev_relation, AccessShareLock); +} + + +/* ---------- + * make_viewdef - reconstruct the SELECT part of a + * view rewrite rule + * ---------- + */ +static void +make_viewdef(StringInfo buf, HeapTuple ruletup, TupleDesc rulettc, + int prettyFlags, int wrapColumn) +{// #lizard forgives + Query *query; + char ev_type; + Oid ev_class; + bool is_instead; + char *ev_qual; + char *ev_action; + List *actions = NIL; + Relation ev_relation; + int fno; + Datum dat; + bool isnull; + + /* + * Get the attribute values from the rules tuple + */ + fno = SPI_fnumber(rulettc, "ev_type"); + dat = SPI_getbinval(ruletup, rulettc, fno, &isnull); + Assert(!isnull); + ev_type = DatumGetChar(dat); + + fno = SPI_fnumber(rulettc, "ev_class"); + dat = SPI_getbinval(ruletup, rulettc, fno, &isnull); + Assert(!isnull); + ev_class = DatumGetObjectId(dat); + + fno = SPI_fnumber(rulettc, "is_instead"); + dat = SPI_getbinval(ruletup, rulettc, fno, &isnull); + Assert(!isnull); + is_instead = DatumGetBool(dat); + + /* these could be nulls */ + fno = SPI_fnumber(rulettc, "ev_qual"); + ev_qual = SPI_getvalue(ruletup, rulettc, fno); + + fno = SPI_fnumber(rulettc, "ev_action"); + ev_action = SPI_getvalue(ruletup, rulettc, fno); + if (ev_action != NULL) + actions = (List *) stringToNode(ev_action); + + if (list_length(actions) != 1) + { + /* keep output buffer empty and leave */ + return; + } + + query = (Query *) linitial(actions); + + if (ev_type != '1' || !is_instead || + strcmp(ev_qual, "<>") != 0 || query->commandType != CMD_SELECT) + { + /* keep output buffer empty and leave */ + return; + } + + ev_relation = heap_open(ev_class, AccessShareLock); + + get_query_def(query, buf, NIL, RelationGetDescr(ev_relation), + prettyFlags, wrapColumn, 0 +#ifdef PGXC + , false, false +#endif /* PGXC */ + ); + appendStringInfo(buf, ";"); + + heap_close(ev_relation, AccessShareLock); +} + +#ifdef PGXC +/* ---------- + * deparse_query - Parse back one query parsetree + * + * Purpose 
of this function is to build up statement for a RemoteQuery + * It just calls get_query_def without pretty print flags + * ---------- + */ +void +deparse_query(Query *query, StringInfo buf, List *parentnamespace, + bool finalise_aggs, bool sortgroup_colno) +{ + get_query_def(query, buf, parentnamespace, NULL, 0, 0, 0, finalise_aggs, + sortgroup_colno); +} + +/* code borrowed from get_insert_query_def */ +void +get_query_def_from_valuesList(Query *query, StringInfo buf) +{// #lizard forgives + + RangeTblEntry *select_rte = NULL; + RangeTblEntry *values_rte = NULL; + RangeTblEntry *rte; + char *sep; + ListCell *values_cell; + ListCell *l; + List *strippedexprs; + deparse_context context; + deparse_namespace dpns; + + /* + * Before we begin to examine the query, acquire locks on referenced + * relations, and fix up deleted columns in JOIN RTEs. This ensures + * consistent results. Note we assume it's OK to scribble on the passed + * querytree! + */ + AcquireRewriteLocks(query, false, false); + + context.buf = buf; + context.namespaces = NIL; + context.windowClause = NIL; + context.windowTList = NIL; + context.varprefix = (list_length(query->rtable) != 1); + context.prettyFlags = 0; + context.indentLevel = 0; + context.wrapColumn = 0; + + dpns.rtable = query->rtable; + dpns.ctes = query->cteList; + dpns.planstate = NULL; + dpns.ancestors = NIL; + dpns.outer_planstate = dpns.inner_planstate = NULL; + + /* + * If it's an INSERT ... SELECT or VALUES (...), (...), ... there will be + * a single RTE for the SELECT or VALUES. + */ + foreach(l, query->rtable) + { + rte = (RangeTblEntry *) lfirst(l); + + if (rte->rtekind == RTE_SUBQUERY) + { + if (select_rte) + elog(ERROR, "too many subquery RTEs in INSERT"); + select_rte = rte; + } + + if (rte->rtekind == RTE_VALUES) + { + if (values_rte) + elog(ERROR, "too many values RTEs in INSERT"); + values_rte = rte; + } + } + if (select_rte && values_rte) + elog(ERROR, "both subquery and values RTEs in INSERT"); + + /* + * Start the query with INSERT INTO relname + */ + rte = rt_fetch(query->resultRelation, query->rtable); + Assert(rte->rtekind == RTE_RELATION); + + appendStringInfo(buf, "INSERT INTO %s (", + generate_relation_name(rte->relid, NIL)); + + /* + * Add the insert-column-names list. To handle indirection properly, we + * need to look for indirection nodes in the top targetlist (if it's + * INSERT ... SELECT or INSERT ... single VALUES), or in the first + * expression list of the VALUES RTE (if it's INSERT ... multi VALUES). We + * assume that all the expression lists will have similar indirection in + * the latter case. + */ + if (values_rte) + values_cell = list_head((List *) linitial(values_rte->values_lists)); + else + values_cell = NULL; + strippedexprs = NIL; + sep = ""; + foreach(l, query->targetList) + { + TargetEntry *tle = (TargetEntry *) lfirst(l); + + elog(DEBUG1, "targetEntry type is %d\n)", tle->expr->type); + if (tle->resjunk || !IsA(tle->expr, Var)) + continue; /* ignore junk entries */ + + appendStringInfoString(buf, sep); + sep = ", "; + + /* + * Put out name of target column; look in the catalogs, not at + * tle->resname, since resname will fail to track RENAME. + */ + appendStringInfoString(buf,quote_identifier(get_relid_attribute_name(rte->relid, tle->resno))); + + /* + * Print any indirection needed (subfields or subscripts), and strip + * off the top-level nodes representing the indirection assignments. 
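+	 * (processIndirection() prints any ".subfield" or "[subscript]"
+	 * decoration after the column name and returns the expression with the
+	 * corresponding FieldStore/ArrayRef nodes stripped off.)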
+ */ + if (values_cell) + { + /* we discard the stripped expression in this case */ + processIndirection((Node *) lfirst(values_cell), &context); + values_cell = lnext(values_cell); + } + else + { + /* we keep a list of the stripped expressions in this case */ + strippedexprs = lappend(strippedexprs, processIndirection((Node *) tle->expr, &context)); + } + } + appendStringInfo(buf, ") "); + + if (select_rte) + { + /* Add the SELECT */ + get_query_def(select_rte->subquery, buf, NIL, NULL, + context.prettyFlags, context.wrapColumn, + context.indentLevel, + context.finalise_aggs, context.sortgroup_colno); + } + else if (values_rte) + { + /* A WITH clause is possible here */ + get_with_clause(query, &context); + /* Add the multi-VALUES expression lists */ + get_values_def(values_rte->values_lists, &context); + } + else + { + /* A WITH clause is possible here */ + get_with_clause(query, &context); + /* Add the single-VALUES expression list */ + appendContextKeyword(&context, "VALUES (", + -PRETTYINDENT_STD, PRETTYINDENT_STD, 2); + get_rule_expr((Node *) strippedexprs, &context, false); + appendStringInfoChar(buf, ')'); + } + + /* Add RETURNING if present */ + if (query->returningList) + { + appendContextKeyword(&context, " RETURNING", + -PRETTYINDENT_STD, PRETTYINDENT_STD, 1); + get_target_list(query->returningList, &context, NULL); + } +} +#endif +/* ---------- + * get_query_def - Parse back one query parsetree + * + * If resultDesc is not NULL, then it is the output tuple descriptor for + * the view represented by a SELECT query. + * ---------- + */ +static void +get_query_def(Query *query, StringInfo buf, List *parentnamespace, + TupleDesc resultDesc, + int prettyFlags, int wrapColumn, int startIndent, + bool finalise_aggs, bool sortgroup_colno) +{// #lizard forgives + deparse_context context; + deparse_namespace dpns; + + /* Guard against excessively long or deeply-nested queries */ + CHECK_FOR_INTERRUPTS(); + check_stack_depth(); + + /* + * Before we begin to examine the query, acquire locks on referenced + * relations, and fix up deleted columns in JOIN RTEs. This ensures + * consistent results. Note we assume it's OK to scribble on the passed + * querytree! + * + * We are only deparsing the query (we are not about to execute it), so we + * only need AccessShareLock on the relations it mentions. 
+ */ + AcquireRewriteLocks(query, false, false); + + context.buf = buf; + context.namespaces = lcons(&dpns, list_copy(parentnamespace)); + context.windowClause = NIL; + context.windowTList = NIL; + context.varprefix = (parentnamespace != NIL || + list_length(query->rtable) != 1); + context.prettyFlags = prettyFlags; + context.wrapColumn = wrapColumn; + context.indentLevel = startIndent; + context.special_exprkind = EXPR_KIND_NONE; + context.finalise_aggs = finalise_aggs; + context.sortgroup_colno = sortgroup_colno; + + set_deparse_for_query(&dpns, query, parentnamespace); + + switch (query->commandType) + { + case CMD_SELECT: + get_select_query_def(query, &context, resultDesc); + break; + + case CMD_UPDATE: + get_update_query_def(query, &context); + break; + + case CMD_INSERT: + get_insert_query_def(query, &context); + break; + + case CMD_DELETE: + get_delete_query_def(query, &context); + break; + + case CMD_NOTHING: + appendStringInfoString(buf, "NOTHING"); + break; + + case CMD_UTILITY: + get_utility_query_def(query, &context); + break; + + default: + elog(ERROR, "unrecognized query command type: %d", + query->commandType); + break; + } +} + +/* ---------- + * get_values_def - Parse back a VALUES list + * ---------- + */ +static void +get_values_def(List *values_lists, deparse_context *context) +{ + StringInfo buf = context->buf; + bool first_list = true; + ListCell *vtl; + + appendStringInfoString(buf, "VALUES "); + + foreach(vtl, values_lists) + { + List *sublist = (List *) lfirst(vtl); + bool first_col = true; + ListCell *lc; + + if (first_list) + first_list = false; + else + appendStringInfoString(buf, ", "); + + appendStringInfoChar(buf, '('); + foreach(lc, sublist) + { + Node *col = (Node *) lfirst(lc); + + if (first_col) + first_col = false; + else + appendStringInfoChar(buf, ','); + + /* + * Print the value. Whole-row Vars need special treatment. 
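+			 * get_rule_expr_toplevel() prints a whole-row Var with an
+			 * explicit cast to its composite type so that it is not
+			 * expanded into individual columns when the VALUES list is
+			 * re-parsed.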
+ */ + get_rule_expr_toplevel(col, context, false); + } + appendStringInfoChar(buf, ')'); + } +} + +/* ---------- + * get_with_clause - Parse back a WITH clause + * ---------- + */ +static void +get_with_clause(Query *query, deparse_context *context) +{// #lizard forgives + StringInfo buf = context->buf; + const char *sep; + ListCell *l; + + if (query->cteList == NIL) + return; + + if (PRETTY_INDENT(context)) + { + context->indentLevel += PRETTYINDENT_STD; + appendStringInfoChar(buf, ' '); + } + + if (query->hasRecursive) + sep = "WITH RECURSIVE "; + else + sep = "WITH "; + foreach(l, query->cteList) + { + CommonTableExpr *cte = (CommonTableExpr *) lfirst(l); + + appendStringInfoString(buf, sep); + appendStringInfoString(buf, quote_identifier(cte->ctename)); + if (cte->aliascolnames) + { + bool first = true; + ListCell *col; + + appendStringInfoChar(buf, '('); + foreach(col, cte->aliascolnames) + { + if (first) + first = false; + else + appendStringInfoString(buf, ", "); + appendStringInfoString(buf, + quote_identifier(strVal(lfirst(col)))); + } + appendStringInfoChar(buf, ')'); + } + appendStringInfoString(buf, " AS "); + switch (cte->ctematerialized) + { + case CTEMaterializeDefault: + break; + case CTEMaterializeAlways: + appendStringInfoString(buf, "MATERIALIZED "); + break; + case CTEMaterializeNever: + appendStringInfoString(buf, "NOT MATERIALIZED "); + break; + } + appendStringInfoChar(buf, '('); + if (PRETTY_INDENT(context)) + appendContextKeyword(context, "", 0, 0, 0); + get_query_def((Query *) cte->ctequery, buf, context->namespaces, NULL, + context->prettyFlags, context->wrapColumn, + context->indentLevel, + context->finalise_aggs, + context->sortgroup_colno); + if (PRETTY_INDENT(context)) + appendContextKeyword(context, "", 0, 0, 0); + appendStringInfoChar(buf, ')'); + sep = ", "; + } + + if (PRETTY_INDENT(context)) + { + context->indentLevel -= PRETTYINDENT_STD; + appendContextKeyword(context, "", 0, 0, 0); + } + else + appendStringInfoChar(buf, ' '); +} + +/* ---------- + * get_select_query_def - Parse back a SELECT parsetree + * ---------- + */ +static void +get_select_query_def(Query *query, deparse_context *context, + TupleDesc resultDesc) +{// #lizard forgives + StringInfo buf = context->buf; + List *save_windowclause; + List *save_windowtlist; + bool force_colno; + ListCell *l; + + /* Insert the WITH clause if given */ + get_with_clause(query, context); + + /* Set up context for possible window functions */ + save_windowclause = context->windowClause; + context->windowClause = query->windowClause; + save_windowtlist = context->windowTList; + context->windowTList = query->targetList; + + /* + * If the Query node has a setOperations tree, then it's the top level of + * a UNION/INTERSECT/EXCEPT query; only the WITH, ORDER BY and LIMIT + * fields are interesting in the top query itself. 
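+	 * For example, in "(SELECT a FROM t1 UNION SELECT a FROM t2)
+	 * ORDER BY 1 LIMIT 5" the UNION lives in setOperations; we print it via
+	 * get_setop_query() and then add the ORDER BY using column numbers
+	 * (force_colno), since ORDER BY entries above a set operation must be
+	 * simple references to output columns.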
+ */ + if (query->setOperations) + { + get_setop_query(query->setOperations, query, context, resultDesc); + /* ORDER BY clauses must be simple in this case */ + force_colno = true; + } + else + { + get_basic_select_query(query, context, resultDesc); + force_colno = false; + } + + /* Add the ORDER BY clause if given */ + if (query->sortClause != NIL) + { + appendContextKeyword(context, " ORDER BY ", + -PRETTYINDENT_STD, PRETTYINDENT_STD, 1); + get_rule_orderby(query->sortClause, query->targetList, + force_colno, context); + } + + /* Add the LIMIT clause if given */ + if (query->limitOffset != NULL) + { + appendContextKeyword(context, " OFFSET ", + -PRETTYINDENT_STD, PRETTYINDENT_STD, 0); + get_rule_expr(query->limitOffset, context, false); + } + if (query->limitCount != NULL) + { + appendContextKeyword(context, " LIMIT ", + -PRETTYINDENT_STD, PRETTYINDENT_STD, 0); + if (IsA(query->limitCount, Const) && + ((Const *) query->limitCount)->constisnull) + appendStringInfoString(buf, "ALL"); + else + get_rule_expr(query->limitCount, context, false); + } + + /* Add FOR [KEY] UPDATE/SHARE clauses if present */ + if (query->hasForUpdate) + { + foreach(l, query->rowMarks) + { + RowMarkClause *rc = (RowMarkClause *) lfirst(l); + + /* don't print implicit clauses */ + if (rc->pushedDown) + continue; + + switch (rc->strength) + { + case LCS_NONE: + /* we intentionally throw an error for LCS_NONE */ + elog(ERROR, "unrecognized LockClauseStrength %d", + (int) rc->strength); + break; + case LCS_FORKEYSHARE: + appendContextKeyword(context, " FOR KEY SHARE", + -PRETTYINDENT_STD, PRETTYINDENT_STD, 0); + break; + case LCS_FORSHARE: + appendContextKeyword(context, " FOR SHARE", + -PRETTYINDENT_STD, PRETTYINDENT_STD, 0); + break; + case LCS_FORNOKEYUPDATE: + appendContextKeyword(context, " FOR NO KEY UPDATE", + -PRETTYINDENT_STD, PRETTYINDENT_STD, 0); + break; + case LCS_FORUPDATE: + appendContextKeyword(context, " FOR UPDATE", + -PRETTYINDENT_STD, PRETTYINDENT_STD, 0); + break; + } + + appendStringInfo(buf, " OF %s", + quote_identifier(get_rtable_name(rc->rti, + context))); + if (rc->waitPolicy == LockWaitError) + appendStringInfoString(buf, " NOWAIT"); + else if (rc->waitPolicy == LockWaitSkip) + appendStringInfoString(buf, " SKIP LOCKED"); + } + } + + context->windowClause = save_windowclause; + context->windowTList = save_windowtlist; +} + +/* + * Detect whether query looks like SELECT ... FROM VALUES(); + * if so, return the VALUES RTE. Otherwise return NULL. + */ +static RangeTblEntry * +get_simple_values_rte(Query *query) +{// #lizard forgives + RangeTblEntry *result = NULL; + ListCell *lc; + + /* + * We want to return TRUE even if the Query also contains OLD or NEW rule + * RTEs. So the idea is to scan the rtable and see if there is only one + * inFromCl RTE that is a VALUES RTE. + */ + foreach(lc, query->rtable) + { + RangeTblEntry *rte = (RangeTblEntry *) lfirst(lc); + + if (rte->rtekind == RTE_VALUES && rte->inFromCl) + { + if (result) + return NULL; /* multiple VALUES (probably not possible) */ + result = rte; + } + else if (rte->rtekind == RTE_RELATION && !rte->inFromCl) + continue; /* ignore rule entries */ + else + return NULL; /* something else -> not simple VALUES */ + } + + /* + * We don't need to check the targetlist in any great detail, because + * parser/analyze.c will never generate a "bare" VALUES RTE --- they only + * appear inside auto-generated sub-queries with very restricted + * structure. 
However, DefineView might have modified the tlist by + * injecting new column aliases; so compare tlist resnames against the + * RTE's names to detect that. + */ + if (result) + { + ListCell *lcn; + + if (list_length(query->targetList) != list_length(result->eref->colnames)) + return NULL; /* this probably cannot happen */ + forboth(lc, query->targetList, lcn, result->eref->colnames) + { + TargetEntry *tle = (TargetEntry *) lfirst(lc); + char *cname = strVal(lfirst(lcn)); + + if (tle->resjunk) + return NULL; /* this probably cannot happen */ + if (tle->resname == NULL || strcmp(tle->resname, cname) != 0) + return NULL; /* column name has been changed */ + } + } + + return result; +} + +static void +get_basic_select_query(Query *query, deparse_context *context, + TupleDesc resultDesc) +{// #lizard forgives + StringInfo buf = context->buf; + RangeTblEntry *values_rte; + char *sep; + ListCell *l; + + if (PRETTY_INDENT(context)) + { + context->indentLevel += PRETTYINDENT_STD; + appendStringInfoChar(buf, ' '); + } + + /* + * If the query looks like SELECT * FROM (VALUES ...), then print just the + * VALUES part. This reverses what transformValuesClause() did at parse + * time. + */ + values_rte = get_simple_values_rte(query); + if (values_rte) + { + get_values_def(values_rte->values_lists, context); + return; + } + + /* + * Build up the query string - first we say SELECT + */ + appendStringInfoString(buf, "SELECT"); + + /* Add the DISTINCT clause if given */ + if (query->distinctClause != NIL) + { + if (query->hasDistinctOn) + { + appendStringInfoString(buf, " DISTINCT ON ("); + sep = ""; + foreach(l, query->distinctClause) + { + SortGroupClause *srt = (SortGroupClause *) lfirst(l); + + appendStringInfoString(buf, sep); + get_rule_sortgroupclause(srt->tleSortGroupRef, query->targetList, + false, context); + sep = ", "; + } + appendStringInfoChar(buf, ')'); + } + else + appendStringInfoString(buf, " DISTINCT"); + } + + /* Then we tell what to select (the targetlist) */ + get_target_list(query->targetList, context, resultDesc); + + /* Add the FROM clause if needed */ + get_from_clause(query, " FROM ", context); + + /* Add the WHERE clause if given */ + if (query->jointree->quals != NULL) + { + appendContextKeyword(context, " WHERE ", + -PRETTYINDENT_STD, PRETTYINDENT_STD, 1); + get_rule_expr(query->jointree->quals, context, false); + } + + /* Add the GROUP BY clause if given */ + if (query->groupClause != NULL || query->groupingSets != NULL) + { + ParseExprKind save_exprkind; + + appendContextKeyword(context, " GROUP BY ", + -PRETTYINDENT_STD, PRETTYINDENT_STD, 1); + + save_exprkind = context->special_exprkind; + context->special_exprkind = EXPR_KIND_GROUP_BY; + + if (query->groupingSets == NIL) + { + sep = ""; + foreach(l, query->groupClause) + { + SortGroupClause *grp = (SortGroupClause *) lfirst(l); + + appendStringInfoString(buf, sep); + get_rule_sortgroupclause(grp->tleSortGroupRef, query->targetList, + false, context); + sep = ", "; + } + } + else + { + sep = ""; + foreach(l, query->groupingSets) + { + GroupingSet *grp = lfirst(l); + + appendStringInfoString(buf, sep); + get_rule_groupingset(grp, query->targetList, true, context); + sep = ", "; + } + } + + context->special_exprkind = save_exprkind; + } + + /* Add the HAVING clause if given */ + if (query->havingQual != NULL) + { + appendContextKeyword(context, " HAVING ", + -PRETTYINDENT_STD, PRETTYINDENT_STD, 0); + get_rule_expr(query->havingQual, context, false); + } + + /* Add the WINDOW clause if needed */ + if (query->windowClause != 
NIL) + get_rule_windowclause(query, context); +} + +/* ---------- + * get_target_list - Parse back a SELECT target list + * + * This is also used for RETURNING lists in INSERT/UPDATE/DELETE. + * ---------- + */ +static void +get_target_list(List *targetList, deparse_context *context, + TupleDesc resultDesc) +{// #lizard forgives + StringInfo buf = context->buf; + StringInfoData targetbuf; + bool last_was_multiline = false; + char *sep; + int colno; + ListCell *l; +#ifdef PGXC + bool no_targetlist = true; +#endif + + /* we use targetbuf to hold each TLE's text temporarily */ + initStringInfo(&targetbuf); + + sep = " "; + colno = 0; + foreach(l, targetList) + { + TargetEntry *tle = (TargetEntry *) lfirst(l); + char *colname; + char *attname; + + if (tle->resjunk) + continue; /* ignore junk entries */ + +#ifdef PGXC + /* Found at least one element in the target list */ + if (no_targetlist) + no_targetlist = false; +#endif + + appendStringInfoString(buf, sep); + sep = ", "; + colno++; + + /* + * Put the new field text into targetbuf so we can decide after we've + * got it whether or not it needs to go on a new line. + */ + resetStringInfo(&targetbuf); + context->buf = &targetbuf; + + /* + * We special-case Var nodes rather than using get_rule_expr. This is + * needed because get_rule_expr will display a whole-row Var as + * "foo.*", which is the preferred notation in most contexts, but at + * the top level of a SELECT list it's not right (the parser will + * expand that notation into multiple columns, yielding behavior + * different from a whole-row Var). We need to call get_variable + * directly so that we can tell it to do the right thing, and so that + * we can get the attribute name which is the default AS label. + */ + if (tle->expr && (IsA(tle->expr, Var))) + { + attname = get_variable((Var *) tle->expr, 0, true, context); + } + else + { + get_rule_expr((Node *) tle->expr, context, true); + /* We'll show the AS name unless it's this: */ + attname = "?column?"; + } + + /* + * Figure out what the result column should be called. In the context + * of a view, use the view's tuple descriptor (so as to pick up the + * effects of any column RENAME that's been done on the view). + * Otherwise, just use what we can find in the TLE. + */ + if (resultDesc && colno <= resultDesc->natts) + colname = NameStr(resultDesc->attrs[colno - 1]->attname); + else + colname = tle->resname; + + /* Show AS unless the column's name is correct as-is */ + if (colname) /* resname could be NULL */ + { + if (attname == NULL || strcmp(attname, colname) != 0) + appendStringInfo(&targetbuf, " AS %s", quote_identifier(colname)); + } + + /* Restore context's output buffer */ + context->buf = buf; + + /* Consider line-wrapping if enabled */ + if (PRETTY_INDENT(context) && context->wrapColumn >= 0) + { + int leading_nl_pos; + + /* Does the new field start with a new line? 
*/ + if (targetbuf.len > 0 && targetbuf.data[0] == '\n') + leading_nl_pos = 0; + else + leading_nl_pos = -1; + + /* If so, we shouldn't add anything */ + if (leading_nl_pos >= 0) + { + /* instead, remove any trailing spaces currently in buf */ + removeStringInfoSpaces(buf); + } + else + { + char *trailing_nl; + + /* Locate the start of the current line in the output buffer */ + trailing_nl = strrchr(buf->data, '\n'); + if (trailing_nl == NULL) + trailing_nl = buf->data; + else + trailing_nl++; + + /* + * Add a newline, plus some indentation, if the new field is + * not the first and either the new field would cause an + * overflow or the last field used more than one line. + */ + if (colno > 1 && + ((strlen(trailing_nl) + targetbuf.len > context->wrapColumn) || + last_was_multiline)) + appendContextKeyword(context, "", -PRETTYINDENT_STD, + PRETTYINDENT_STD, PRETTYINDENT_VAR); + } + + /* Remember this field's multiline status for next iteration */ + last_was_multiline = + (strchr(targetbuf.data + leading_nl_pos + 1, '\n') != NULL); + } + + /* Add the new field */ + appendStringInfoString(buf, targetbuf.data); + } + +#ifdef PGXC + /* + * Because the empty target list can generate invalid SQL + * clause. Here, just fill a '*' to process a table without + * any columns, this statement will be sent to Datanodes + * and treated correctly on remote nodes. + */ + if (no_targetlist) + appendStringInfo(buf, " *"); +#endif + /* clean up */ + pfree(targetbuf.data); +} + +static void +get_setop_query(Node *setOp, Query *query, deparse_context *context, + TupleDesc resultDesc) +{// #lizard forgives + StringInfo buf = context->buf; + bool need_paren; + + /* Guard against excessively long or deeply-nested queries */ + CHECK_FOR_INTERRUPTS(); + check_stack_depth(); + + if (IsA(setOp, RangeTblRef)) + { + RangeTblRef *rtr = (RangeTblRef *) setOp; + RangeTblEntry *rte = rt_fetch(rtr->rtindex, query->rtable); + Query *subquery = rte->subquery; + + Assert(subquery != NULL); + Assert(subquery->setOperations == NULL); + /* Need parens if WITH, ORDER BY, FOR UPDATE, or LIMIT; see gram.y */ + need_paren = (subquery->cteList || + subquery->sortClause || + subquery->rowMarks || + subquery->limitOffset || + subquery->limitCount); + if (need_paren) + appendStringInfoChar(buf, '('); + get_query_def(subquery, buf, context->namespaces, resultDesc, + context->prettyFlags, context->wrapColumn, + context->indentLevel, + context->finalise_aggs, + context->sortgroup_colno); + if (need_paren) + appendStringInfoChar(buf, ')'); + } + else if (IsA(setOp, SetOperationStmt)) + { + SetOperationStmt *op = (SetOperationStmt *) setOp; + int subindent; + + /* + * We force parens when nesting two SetOperationStmts, except when the + * lefthand input is another setop of the same kind. Syntactically, + * we could omit parens in rather more cases, but it seems best to use + * parens to flag cases where the setop operator changes. If we use + * parens, we also increase the indentation level for the child query. + * + * There are some cases in which parens are needed around a leaf query + * too, but those are more easily handled at the next level down (see + * code above). 
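+		 * For example, the lefthand input of "x UNION y UNION z" gets no
+		 * parens, but in "(x UNION y) INTERSECT z" or "(x UNION ALL y)
+		 * UNION z" it does, because the operator or its ALL flag changes.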
+ */ + if (IsA(op->larg, SetOperationStmt)) + { + SetOperationStmt *lop = (SetOperationStmt *) op->larg; + + if (op->op == lop->op && op->all == lop->all) + need_paren = false; + else + need_paren = true; + } + else + need_paren = false; + + if (need_paren) + { + appendStringInfoChar(buf, '('); + subindent = PRETTYINDENT_STD; + appendContextKeyword(context, "", subindent, 0, 0); + } + else + subindent = 0; + + get_setop_query(op->larg, query, context, resultDesc); + + if (need_paren) + appendContextKeyword(context, ") ", -subindent, 0, 0); + else if (PRETTY_INDENT(context)) + appendContextKeyword(context, "", -subindent, 0, 0); + else + appendStringInfoChar(buf, ' '); + + switch (op->op) + { + case SETOP_UNION: + appendStringInfoString(buf, "UNION "); + break; + case SETOP_INTERSECT: + appendStringInfoString(buf, "INTERSECT "); + break; + case SETOP_EXCEPT: + appendStringInfoString(buf, "EXCEPT "); + break; + default: + elog(ERROR, "unrecognized set op: %d", + (int) op->op); + } + if (op->all) + appendStringInfoString(buf, "ALL "); + + /* Always parenthesize if RHS is another setop */ + need_paren = IsA(op->rarg, SetOperationStmt); + + /* + * The indentation code here is deliberately a bit different from that + * for the lefthand input, because we want the line breaks in + * different places. + */ + if (need_paren) + { + appendStringInfoChar(buf, '('); + subindent = PRETTYINDENT_STD; + } + else + subindent = 0; + appendContextKeyword(context, "", subindent, 0, 0); + + get_setop_query(op->rarg, query, context, resultDesc); + + if (PRETTY_INDENT(context)) + context->indentLevel -= subindent; + if (need_paren) + appendContextKeyword(context, ")", 0, 0, 0); + } + else + { + elog(ERROR, "unrecognized node type: %d", + (int) nodeTag(setOp)); + } +} + +/* + * Display a sort/group clause. + * + * Also returns the expression tree, so caller need not find it again. + */ +static Node * +get_rule_sortgroupclause(Index ref, List *tlist, bool force_colno, + deparse_context *context) +{// #lizard forgives + StringInfo buf = context->buf; + TargetEntry *tle; + Node *expr; + + tle = get_sortgroupref_tle(ref, tlist); + expr = (Node *) tle->expr; + + /* + * Use column-number form if requested by caller. Otherwise, if + * expression is a constant, force it to be dumped with an explicit cast + * as decoration --- this is because a simple integer constant is + * ambiguous (and will be misinterpreted by findTargetlistEntry()) if we + * dump it without any decoration. If it's anything more complex than a + * simple Var, then force extra parens around it, to ensure it can't be + * misinterpreted as a cube() or rollup() construct. + */ + if (force_colno) + { + Assert(!tle->resjunk); + appendStringInfo(buf, "%d", tle->resno); + } + else if (expr && IsA(expr, Const)) + get_const_expr((Const *) expr, context, 1); + else if (!expr || IsA(expr, Var)) + get_rule_expr(expr, context, true); + else + { + /* + * We must force parens for function-like expressions even if + * PRETTY_PAREN is off, since those are the ones in danger of + * misparsing. For other expressions we need to force them only if + * PRETTY_PAREN is on, since otherwise the expression will output them + * itself. (We can't skip the parens.) 
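+		 * For example, a GROUP BY expression that is literally a call to a
+		 * function named cube() or rollup() must be printed as "(cube(x))";
+		 * without the parens it would be re-read as the CUBE grouping
+		 * construct.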
+ */ + bool need_paren = (PRETTY_PAREN(context) + || IsA(expr, FuncExpr) + ||IsA(expr, Aggref) + ||IsA(expr, WindowFunc)); + + if (need_paren) + appendStringInfoString(context->buf, "("); + get_rule_expr(expr, context, true); + if (need_paren) + appendStringInfoString(context->buf, ")"); + } + + return expr; +} + +/* + * Display a GroupingSet + */ +static void +get_rule_groupingset(GroupingSet *gset, List *targetlist, + bool omit_parens, deparse_context *context) +{// #lizard forgives + ListCell *l; + StringInfo buf = context->buf; + bool omit_child_parens = true; + char *sep = ""; + + switch (gset->kind) + { + case GROUPING_SET_EMPTY: + appendStringInfoString(buf, "()"); + return; + + case GROUPING_SET_SIMPLE: + { + if (!omit_parens || list_length(gset->content) != 1) + appendStringInfoString(buf, "("); + + foreach(l, gset->content) + { + Index ref = lfirst_int(l); + + appendStringInfoString(buf, sep); + get_rule_sortgroupclause(ref, targetlist, + false, context); + sep = ", "; + } + + if (!omit_parens || list_length(gset->content) != 1) + appendStringInfoString(buf, ")"); + } + return; + + case GROUPING_SET_ROLLUP: + appendStringInfoString(buf, "ROLLUP("); + break; + case GROUPING_SET_CUBE: + appendStringInfoString(buf, "CUBE("); + break; + case GROUPING_SET_SETS: + appendStringInfoString(buf, "GROUPING SETS ("); + omit_child_parens = false; + break; + } + + foreach(l, gset->content) + { + appendStringInfoString(buf, sep); + get_rule_groupingset(lfirst(l), targetlist, omit_child_parens, context); + sep = ", "; + } + + appendStringInfoString(buf, ")"); +} + +/* + * Display an ORDER BY list. + */ +static void +get_rule_orderby(List *orderList, List *targetList, + bool force_colno, deparse_context *context) +{ + StringInfo buf = context->buf; + const char *sep; + ListCell *l; + + sep = ""; + foreach(l, orderList) + { + SortGroupClause *srt = (SortGroupClause *) lfirst(l); + Node *sortexpr; + Oid sortcoltype; + TypeCacheEntry *typentry; + + appendStringInfoString(buf, sep); + sortexpr = get_rule_sortgroupclause(srt->tleSortGroupRef, targetList, + force_colno, context); + sortcoltype = exprType(sortexpr); + /* See whether operator is default < or > for datatype */ + typentry = lookup_type_cache(sortcoltype, + TYPECACHE_LT_OPR | TYPECACHE_GT_OPR); + if (srt->sortop == typentry->lt_opr) + { + /* ASC is default, so emit nothing for it */ + if (srt->nulls_first) + appendStringInfoString(buf, " NULLS FIRST"); + } + else if (srt->sortop == typentry->gt_opr) + { + appendStringInfoString(buf, " DESC"); + /* DESC defaults to NULLS FIRST */ + if (!srt->nulls_first) + appendStringInfoString(buf, " NULLS LAST"); + } + else + { + appendStringInfo(buf, " USING %s", + generate_operator_name(srt->sortop, + sortcoltype, + sortcoltype)); + /* be specific to eliminate ambiguity */ + if (srt->nulls_first) + appendStringInfoString(buf, " NULLS FIRST"); + else + appendStringInfoString(buf, " NULLS LAST"); + } + sep = ", "; + } +} + +/* + * Display a WINDOW clause. + * + * Note that the windowClause list might contain only anonymous window + * specifications, in which case we should print nothing here. 
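+ * For example, "count(*) OVER (PARTITION BY x)" creates an unnamed entry in
+ * windowClause; only windows declared with an explicit "WINDOW w AS (...)"
+ * clause carry a name and are printed here, while anonymous specifications
+ * are emitted inline in the OVER clause of the window function itself.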
+ */ +static void +get_rule_windowclause(Query *query, deparse_context *context) +{ + StringInfo buf = context->buf; + const char *sep; + ListCell *l; + + sep = NULL; + foreach(l, query->windowClause) + { + WindowClause *wc = (WindowClause *) lfirst(l); + + if (wc->name == NULL) + continue; /* ignore anonymous windows */ + + if (sep == NULL) + appendContextKeyword(context, " WINDOW ", + -PRETTYINDENT_STD, PRETTYINDENT_STD, 1); + else + appendStringInfoString(buf, sep); + + appendStringInfo(buf, "%s AS ", quote_identifier(wc->name)); + + get_rule_windowspec(wc, query->targetList, context); + + sep = ", "; + } +} + +/* + * Display a window definition + */ +static void +get_rule_windowspec(WindowClause *wc, List *targetList, + deparse_context *context) +{// #lizard forgives + StringInfo buf = context->buf; + bool needspace = false; + const char *sep; + ListCell *l; + + appendStringInfoChar(buf, '('); + if (wc->refname) + { + appendStringInfoString(buf, quote_identifier(wc->refname)); + needspace = true; + } + /* partition clauses are always inherited, so only print if no refname */ + if (wc->partitionClause && !wc->refname) + { + if (needspace) + appendStringInfoChar(buf, ' '); + appendStringInfoString(buf, "PARTITION BY "); + sep = ""; + foreach(l, wc->partitionClause) + { + SortGroupClause *grp = (SortGroupClause *) lfirst(l); + + appendStringInfoString(buf, sep); + get_rule_sortgroupclause(grp->tleSortGroupRef, targetList, + false, context); + sep = ", "; + } + needspace = true; + } + /* print ordering clause only if not inherited */ + if (wc->orderClause && !wc->copiedOrder) + { + if (needspace) + appendStringInfoChar(buf, ' '); + appendStringInfoString(buf, "ORDER BY "); + get_rule_orderby(wc->orderClause, targetList, false, context); + needspace = true; + } + /* framing clause is never inherited, so print unless it's default */ + if (wc->frameOptions & FRAMEOPTION_NONDEFAULT) + { + if (needspace) + appendStringInfoChar(buf, ' '); + if (wc->frameOptions & FRAMEOPTION_RANGE) + appendStringInfoString(buf, "RANGE "); + else if (wc->frameOptions & FRAMEOPTION_ROWS) + appendStringInfoString(buf, "ROWS "); + else + Assert(false); + if (wc->frameOptions & FRAMEOPTION_BETWEEN) + appendStringInfoString(buf, "BETWEEN "); + if (wc->frameOptions & FRAMEOPTION_START_UNBOUNDED_PRECEDING) + appendStringInfoString(buf, "UNBOUNDED PRECEDING "); + else if (wc->frameOptions & FRAMEOPTION_START_CURRENT_ROW) + appendStringInfoString(buf, "CURRENT ROW "); + else if (wc->frameOptions & FRAMEOPTION_START_VALUE) + { + get_rule_expr(wc->startOffset, context, false); + if (wc->frameOptions & FRAMEOPTION_START_VALUE_PRECEDING) + appendStringInfoString(buf, " PRECEDING "); + else if (wc->frameOptions & FRAMEOPTION_START_VALUE_FOLLOWING) + appendStringInfoString(buf, " FOLLOWING "); + else + Assert(false); + } + else + Assert(false); + if (wc->frameOptions & FRAMEOPTION_BETWEEN) + { + appendStringInfoString(buf, "AND "); + if (wc->frameOptions & FRAMEOPTION_END_UNBOUNDED_FOLLOWING) + appendStringInfoString(buf, "UNBOUNDED FOLLOWING "); + else if (wc->frameOptions & FRAMEOPTION_END_CURRENT_ROW) + appendStringInfoString(buf, "CURRENT ROW "); + else if (wc->frameOptions & FRAMEOPTION_END_VALUE) + { + get_rule_expr(wc->endOffset, context, false); + if (wc->frameOptions & FRAMEOPTION_END_VALUE_PRECEDING) + appendStringInfoString(buf, " PRECEDING "); + else if (wc->frameOptions & FRAMEOPTION_END_VALUE_FOLLOWING) + appendStringInfoString(buf, " FOLLOWING "); + else + Assert(false); + } + else + Assert(false); + } + /* we 
will now have a trailing space; remove it */ + buf->len--; + } + appendStringInfoChar(buf, ')'); +} + +/* ---------- + * get_insert_query_def - Parse back an INSERT parsetree + * ---------- + */ +static void +get_insert_query_def(Query *query, deparse_context *context) +{// #lizard forgives + StringInfo buf = context->buf; + RangeTblEntry *select_rte = NULL; + RangeTblEntry *values_rte = NULL; + RangeTblEntry *rte; + char *sep; + ListCell *l; + List *strippedexprs; + + /* Insert the WITH clause if given */ + get_with_clause(query, context); + +#ifdef __TBASE__ + /* + * If query has unshippable triggers, we have to do INSERT on coordinator, + * and we do not need select_rte and values_rte. + * Hence we keep both select_rte and values_rte NULL. + */ + if (!query->hasUnshippableTriggers) + { +#endif + /* + * If it's an INSERT ... SELECT or multi-row VALUES, there will be a + * single RTE for the SELECT or VALUES. Plain VALUES has neither. + */ + foreach(l, query->rtable) + { + rte = (RangeTblEntry *) lfirst(l); + + if (rte->rtekind == RTE_SUBQUERY) + { + if (select_rte) + elog(ERROR, "too many subquery RTEs in INSERT"); + select_rte = rte; + } + + if (rte->rtekind == RTE_VALUES) + { + if (values_rte) + elog(ERROR, "too many values RTEs in INSERT"); + values_rte = rte; + } + } +#ifdef __TBASE__ + } +#endif + if (select_rte && values_rte) + elog(ERROR, "both subquery and values RTEs in INSERT"); + + /* + * Start the query with INSERT INTO relname + */ + rte = rt_fetch(query->resultRelation, query->rtable); + Assert(rte->rtekind == RTE_RELATION); + + if (PRETTY_INDENT(context)) + { + context->indentLevel += PRETTYINDENT_STD; + appendStringInfoChar(buf, ' '); + } + appendStringInfo(buf, "INSERT INTO %s ", + generate_relation_name(rte->relid, NIL)); + /* INSERT requires AS keyword for target alias */ + if (rte->alias != NULL) + appendStringInfo(buf, "AS %s ", + quote_identifier(rte->alias->aliasname)); + + /* + * Add the insert-column-names list. Any indirection decoration needed on + * the column names can be inferred from the top targetlist. + */ + strippedexprs = NIL; + sep = ""; + if (query->targetList) + appendStringInfoChar(buf, '('); + foreach(l, query->targetList) + { + TargetEntry *tle = (TargetEntry *) lfirst(l); + + if (tle->resjunk) + continue; /* ignore junk entries */ + + appendStringInfoString(buf, sep); + sep = ", "; + + /* + * Put out name of target column; look in the catalogs, not at + * tle->resname, since resname will fail to track RENAME. + */ + appendStringInfoString(buf, + quote_identifier(get_relid_attribute_name(rte->relid, + tle->resno))); + + /* + * Print any indirection needed (subfields or subscripts), and strip + * off the top-level nodes representing the indirection assignments. + * Add the stripped expressions to strippedexprs. (If it's a + * single-VALUES statement, the stripped expressions are the VALUES to + * print below. Otherwise they're just Vars and not really + * interesting.) 
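+ * For example, in "INSERT INTO t (arr[1], comp.f) VALUES (10, 20)" the
+ * "[1]" and ".f" decoration is printed here, while the stripped
+ * expressions 10 and 20 become the VALUES list below.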
+ */ + strippedexprs = lappend(strippedexprs, + processIndirection((Node *) tle->expr, + context)); + } + if (query->targetList) + appendStringInfoString(buf, ") "); + + if (query->override) + { + if (query->override == OVERRIDING_SYSTEM_VALUE) + appendStringInfoString(buf, "OVERRIDING SYSTEM VALUE "); + else if (query->override == OVERRIDING_USER_VALUE) + appendStringInfoString(buf, "OVERRIDING USER VALUE "); + } + + if (select_rte) + { + /* Add the SELECT */ + get_query_def(select_rte->subquery, buf, NIL, NULL, + context->prettyFlags, context->wrapColumn, + context->indentLevel, + context->finalise_aggs, + context->sortgroup_colno); + } + else if (values_rte) + { + /* Add the multi-VALUES expression lists */ + get_values_def(values_rte->values_lists, context); + } + else if (strippedexprs) + { + /* Add the single-VALUES expression list */ + appendContextKeyword(context, "VALUES (", + -PRETTYINDENT_STD, PRETTYINDENT_STD, 2); + get_rule_expr((Node *) strippedexprs, context, false); + appendStringInfoChar(buf, ')'); + } + else + { + /* No expressions, so it must be DEFAULT VALUES */ + appendStringInfoString(buf, "DEFAULT VALUES"); + } + + /* Add ON CONFLICT if present */ + if (query->onConflict) + { + OnConflictExpr *confl = query->onConflict; + + appendStringInfoString(buf, " ON CONFLICT"); + + if (confl->arbiterElems) + { + /* Add the single-VALUES expression list */ + appendStringInfoChar(buf, '('); + get_rule_expr((Node *) confl->arbiterElems, context, false); + appendStringInfoChar(buf, ')'); + + /* Add a WHERE clause (for partial indexes) if given */ + if (confl->arbiterWhere != NULL) + { + bool save_varprefix; + + /* + * Force non-prefixing of Vars, since parser assumes that they + * belong to target relation. WHERE clause does not use + * InferenceElem, so this is separately required. 
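+ * For example, an arbiter over a partial index deparses as
+ * "ON CONFLICT (f1) WHERE f1 > 0 ..." with the column left unqualified.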
+ */ + save_varprefix = context->varprefix; + context->varprefix = false; + + appendContextKeyword(context, " WHERE ", + -PRETTYINDENT_STD, PRETTYINDENT_STD, 1); + get_rule_expr(confl->arbiterWhere, context, false); + + context->varprefix = save_varprefix; + } + } + else if (OidIsValid(confl->constraint)) + { + char *constraint = get_constraint_name(confl->constraint); + + if (!constraint) + elog(ERROR, "cache lookup failed for constraint %u", + confl->constraint); + appendStringInfo(buf, " ON CONSTRAINT %s", + quote_identifier(constraint)); + } + + if (confl->action == ONCONFLICT_NOTHING) + { + appendStringInfoString(buf, " DO NOTHING"); + } + else + { + appendStringInfoString(buf, " DO UPDATE SET "); + /* Deparse targetlist */ + get_update_query_targetlist_def(query, confl->onConflictSet, + context, rte); + + /* Add a WHERE clause if given */ + if (confl->onConflictWhere != NULL) + { + appendContextKeyword(context, " WHERE ", + -PRETTYINDENT_STD, PRETTYINDENT_STD, 1); + get_rule_expr(confl->onConflictWhere, context, false); + } + } + } + + /* Add RETURNING if present */ + if (query->returningList) + { + appendContextKeyword(context, " RETURNING", + -PRETTYINDENT_STD, PRETTYINDENT_STD, 1); + get_target_list(query->returningList, context, NULL); + } +} + + +/* ---------- + * get_update_query_def - Parse back an UPDATE parsetree + * ---------- + */ +static void +get_update_query_def(Query *query, deparse_context *context) +{ + StringInfo buf = context->buf; + RangeTblEntry *rte; + + /* Insert the WITH clause if given */ + get_with_clause(query, context); + + /* + * Start the query with UPDATE relname SET + */ + rte = rt_fetch(query->resultRelation, query->rtable); + Assert(rte->rtekind == RTE_RELATION); + if (PRETTY_INDENT(context)) + { + appendStringInfoChar(buf, ' '); + context->indentLevel += PRETTYINDENT_STD; + } + appendStringInfo(buf, "UPDATE %s%s", + only_marker(rte), + generate_relation_name(rte->relid, NIL)); + if (rte->alias != NULL) + appendStringInfo(buf, " %s", + quote_identifier(rte->alias->aliasname)); + appendStringInfoString(buf, " SET "); + + /* Deparse targetlist */ + get_update_query_targetlist_def(query, query->targetList, context, rte); + + /* Add the FROM clause if needed */ + get_from_clause(query, " FROM ", context); + + /* Add a WHERE clause if given */ + if (query->jointree->quals != NULL) + { + appendContextKeyword(context, " WHERE ", + -PRETTYINDENT_STD, PRETTYINDENT_STD, 1); + get_rule_expr(query->jointree->quals, context, false); + } + + /* Add RETURNING if present */ + if (query->returningList) + { + appendContextKeyword(context, " RETURNING", + -PRETTYINDENT_STD, PRETTYINDENT_STD, 1); + get_target_list(query->returningList, context, NULL); + } +} + + +/* ---------- + * get_update_query_targetlist_def - Parse back an UPDATE targetlist + * ---------- + */ +static void +get_update_query_targetlist_def(Query *query, List *targetList, + deparse_context *context, RangeTblEntry *rte) +{// #lizard forgives + StringInfo buf = context->buf; + ListCell *l; + ListCell *next_ma_cell; + int remaining_ma_columns; + const char *sep; + SubLink *cur_ma_sublink; + List *ma_sublinks; + + /* + * Prepare to deal with MULTIEXPR assignments: collect the source SubLinks + * into a list. We expect them to appear, in ID order, in resjunk tlist + * entries. 
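+ * For example, "UPDATE t SET (a, b) = (SELECT x, y FROM s)" carries a
+ * single MULTIEXPR SubLink covering both target columns.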
+ */ + ma_sublinks = NIL; + if (query->hasSubLinks) /* else there can't be any */ + { + foreach(l, targetList) + { + TargetEntry *tle = (TargetEntry *) lfirst(l); + + if (tle->resjunk && IsA(tle->expr, SubLink)) + { + SubLink *sl = (SubLink *) tle->expr; + + if (sl->subLinkType == MULTIEXPR_SUBLINK) + { + ma_sublinks = lappend(ma_sublinks, sl); + Assert(sl->subLinkId == list_length(ma_sublinks)); + } + } + } + } + next_ma_cell = list_head(ma_sublinks); + cur_ma_sublink = NULL; + remaining_ma_columns = 0; + + /* Add the comma separated list of 'attname = value' */ + sep = ""; + foreach(l, targetList) + { + TargetEntry *tle = (TargetEntry *) lfirst(l); + Node *expr; + + if (tle->resjunk) + continue; /* ignore junk entries */ + + /* Emit separator (OK whether we're in multiassignment or not) */ + appendStringInfoString(buf, sep); + sep = ", "; + + /* + * Check to see if we're starting a multiassignment group: if so, + * output a left paren. + */ + if (next_ma_cell != NULL && cur_ma_sublink == NULL) + { + /* + * We must dig down into the expr to see if it's a PARAM_MULTIEXPR + * Param. That could be buried under FieldStores and ArrayRefs + * and CoerceToDomains (cf processIndirection()), and underneath + * those there could be an implicit type coercion. Because we + * would ignore implicit type coercions anyway, we don't need to + * be as careful as processIndirection() is about descending past + * implicit CoerceToDomains. + */ + expr = (Node *) tle->expr; + while (expr) + { + if (IsA(expr, FieldStore)) + { + FieldStore *fstore = (FieldStore *) expr; + + expr = (Node *) linitial(fstore->newvals); + } + else if (IsA(expr, ArrayRef)) + { + ArrayRef *aref = (ArrayRef *) expr; + + if (aref->refassgnexpr == NULL) + break; + expr = (Node *) aref->refassgnexpr; + } + else if (IsA(expr, CoerceToDomain)) + { + CoerceToDomain *cdomain = (CoerceToDomain *) expr; + + if (cdomain->coercionformat != COERCE_IMPLICIT_CAST) + break; + expr = (Node *) cdomain->arg; + } + else + break; + } + expr = strip_implicit_coercions(expr); + + if (expr && IsA(expr, Param) && + ((Param *) expr)->paramkind == PARAM_MULTIEXPR) + { + cur_ma_sublink = (SubLink *) lfirst(next_ma_cell); + next_ma_cell = lnext(next_ma_cell); + remaining_ma_columns = count_nonjunk_tlist_entries( + ((Query *) cur_ma_sublink->subselect)->targetList); + Assert(((Param *) expr)->paramid == + ((cur_ma_sublink->subLinkId << 16) | 1)); + appendStringInfoChar(buf, '('); + } + } + + /* + * Put out name of target column; look in the catalogs, not at + * tle->resname, since resname will fail to track RENAME. + */ + appendStringInfoString(buf, + quote_identifier(get_relid_attribute_name(rte->relid, + tle->resno))); + + /* + * Print any indirection needed (subfields or subscripts), and strip + * off the top-level nodes representing the indirection assignments. + */ + expr = processIndirection((Node *) tle->expr, context); + + /* + * If we're in a multiassignment, skip printing anything more, unless + * this is the last column; in which case, what we print should be the + * sublink, not the Param. 
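+ * That way the whole group deparses back as "(a, b) = (SELECT ...)"
+ * rather than as individual PARAM_MULTIEXPR references.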
+ */ + if (cur_ma_sublink != NULL) + { + if (--remaining_ma_columns > 0) + continue; /* not the last column of multiassignment */ + appendStringInfoChar(buf, ')'); + expr = (Node *) cur_ma_sublink; + cur_ma_sublink = NULL; + } + + appendStringInfoString(buf, " = "); + + get_rule_expr(expr, context, false); + } +} + + +/* ---------- + * get_delete_query_def - Parse back a DELETE parsetree + * ---------- + */ +static void +get_delete_query_def(Query *query, deparse_context *context) +{ + StringInfo buf = context->buf; + RangeTblEntry *rte; + + /* Insert the WITH clause if given */ + get_with_clause(query, context); + + /* + * Start the query with DELETE FROM relname + */ + rte = rt_fetch(query->resultRelation, query->rtable); + Assert(rte->rtekind == RTE_RELATION); + if (PRETTY_INDENT(context)) + { + appendStringInfoChar(buf, ' '); + context->indentLevel += PRETTYINDENT_STD; + } + appendStringInfo(buf, "DELETE FROM %s%s", + only_marker(rte), + generate_relation_name(rte->relid, NIL)); + if (rte->alias != NULL) + appendStringInfo(buf, " %s", + quote_identifier(rte->alias->aliasname)); + + /* Add the USING clause if given */ + get_from_clause(query, " USING ", context); + + /* Add a WHERE clause if given */ + if (query->jointree->quals != NULL) + { + appendContextKeyword(context, " WHERE ", + -PRETTYINDENT_STD, PRETTYINDENT_STD, 1); + get_rule_expr(query->jointree->quals, context, false); + } + + /* Add RETURNING if present */ + if (query->returningList) + { + appendContextKeyword(context, " RETURNING", + -PRETTYINDENT_STD, PRETTYINDENT_STD, 1); + get_target_list(query->returningList, context, NULL); + } +} + + +/* ---------- + * get_utility_query_def - Parse back a UTILITY parsetree + * ---------- + */ +static void +get_utility_query_def(Query *query, deparse_context *context) +{// #lizard forgives + StringInfo buf = context->buf; + + if (query->utilityStmt && IsA(query->utilityStmt, NotifyStmt)) + { + NotifyStmt *stmt = (NotifyStmt *) query->utilityStmt; + + appendContextKeyword(context, "", + 0, PRETTYINDENT_STD, 1); + appendStringInfo(buf, "NOTIFY %s", + quote_identifier(stmt->conditionname)); + if (stmt->payload) + { + appendStringInfoString(buf, ", "); + simple_quote_literal(buf, stmt->payload); + } + } +#ifdef PGXC + else if (query->utilityStmt && IsA(query->utilityStmt, CreateStmt)) + { + CreateStmt *stmt = (CreateStmt *) query->utilityStmt; + ListCell *column; + const char *delimiter = ""; + RangeVar *relation = stmt->relation; + bool istemp = (relation->relpersistence == RELPERSISTENCE_TEMP); + bool isunlogged = (relation->relpersistence == RELPERSISTENCE_UNLOGGED); + + appendStringInfo(buf, "CREATE %s %s %s TABLE %s ", + stmt->islocal ? "LOCAL" : "", + istemp ? "TEMP" : "", + isunlogged ? "UNLOGGED" : "", + stmt->if_not_exists ? 
"IF NOT EXISTS " : ""); + + if (!istemp && relation->schemaname && relation->schemaname[0]) + appendStringInfo(buf, "%s.", quote_identifier(relation->schemaname)); + appendStringInfo(buf, "%s", quote_identifier(relation->relname)); + + appendStringInfo(buf, "("); + foreach(column, stmt->tableElts) + { + Node *node = (Node *) lfirst(column); + + appendStringInfo(buf, "%s", delimiter); + delimiter = ", "; + + if (IsA(node, ColumnDef)) + { + ColumnDef *coldef = (ColumnDef *) node; + TypeName *typename = coldef->typeName; +#ifdef XCP + appendStringInfo(buf, "%s %s", + quote_identifier(coldef->colname), + format_type_with_typemod(typename->typeOid, + typename->typemod)); +#else + + /* error out if we have no recourse at all */ + if (!OidIsValid(typename->typeOid)) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("improper type oid: \"%u\"", typename->typeOid))); + + /* get typename from the oid */ + type = typeidType(typename->typeOid); + + if (!HeapTupleIsValid(type)) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("type \"%u\" does not exist", + typename->typeOid))); + appendStringInfo(buf, "%s %s", quote_identifier(coldef->colname), + typeTypeName(type)); + ReleaseSysCache(type); +#endif + } + else + elog(ERROR, "Invalid table column definition."); + } + appendStringInfo(buf, ")"); + + /* Append storage parameters, like for instance WITH (OIDS) */ + if (list_length(stmt->options) > 0) + { + Datum reloptions; + static char *validnsps[] = HEAP_RELOPT_NAMESPACES; + + reloptions = transformRelOptions((Datum) 0, stmt->options, NULL, validnsps, + false, false); + + if (reloptions) + { + Datum sep, txt; + /* Below is inspired from flatten_reloptions() */ + sep = CStringGetTextDatum(", "); + txt = OidFunctionCall2(F_ARRAY_TO_TEXT, reloptions, sep); + appendStringInfo(buf, " WITH (%s)", TextDatumGetCString(txt)); + } + } + + /* add the on commit clauses for temporary tables */ + switch (stmt->oncommit) + { + case ONCOMMIT_NOOP: + /* do nothing */ + break; + + case ONCOMMIT_PRESERVE_ROWS: + appendStringInfo(buf, " ON COMMIT PRESERVE ROWS"); + break; + + case ONCOMMIT_DELETE_ROWS: + appendStringInfo(buf, " ON COMMIT DELETE ROWS"); + break; + + case ONCOMMIT_DROP: + appendStringInfo(buf, " ON COMMIT DROP"); + break; + } + + if (stmt->distributeby) + { + /* add the on commit clauses for temporary tables */ + switch (stmt->distributeby->disttype) + { + case DISTTYPE_REPLICATION: + appendStringInfo(buf, " DISTRIBUTE BY REPLICATION"); + break; + + case DISTTYPE_HASH: +#ifdef __COLD_HOT__ + appendStringInfo(buf, " DISTRIBUTE BY HASH(%s)", strVal(linitial(stmt->distributeby->colname))); +#else + appendStringInfo(buf, " DISTRIBUTE BY HASH(%s)", stmt->distributeby->colname); +#endif + break; + + case DISTTYPE_ROUNDROBIN: + appendStringInfo(buf, " DISTRIBUTE BY ROUNDROBIN"); + break; + + case DISTTYPE_MODULO: +#ifdef __COLD_HOT__ + appendStringInfo(buf, " DISTRIBUTE BY MODULO(%s)", + quote_identifier(strVal(linitial(stmt->distributeby->colname)))); +#else + appendStringInfo(buf, " DISTRIBUTE BY MODULO(%s)", + quote_identifier(stmt->distributeby->colname)); +#endif + break; + + default: + ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("Invalid distribution type"))); + + } + } + + if (stmt->subcluster) + { + ListCell *cell; + + switch (stmt->subcluster->clustertype) + { + case SUBCLUSTER_NODE: + appendStringInfo(buf, " TO NODE ("); + + /* Add node members */ + Assert(stmt->subcluster->members); + foreach(cell, stmt->subcluster->members) + { + appendStringInfo(buf, " %s", + 
quote_identifier(strVal(lfirst(cell)))); + if (cell->next) + appendStringInfo(buf, ","); + } + appendStringInfo(buf, ")"); + break; + + case SUBCLUSTER_GROUP: + appendStringInfo(buf, " TO GROUP"); + + /* Add group members */ + Assert(stmt->subcluster->members); + foreach(cell, stmt->subcluster->members) + { + appendStringInfo(buf, " %s", + quote_identifier(strVal(lfirst(cell)))); + if (cell->next) + appendStringInfo(buf, ","); + } + break; + + case SUBCLUSTER_NONE: + default: + /* Nothing to do */ + break; + } + } + } +#endif + else + { + /* Currently only NOTIFY utility commands can appear in rules */ + elog(ERROR, "unexpected utility statement type"); + } +} + +/* + * Display a Var appropriately. + * + * In some cases (currently only when recursing into an unnamed join) + * the Var's varlevelsup has to be interpreted with respect to a context + * above the current one; levelsup indicates the offset. + * + * If istoplevel is TRUE, the Var is at the top level of a SELECT's + * targetlist, which means we need special treatment of whole-row Vars. + * Instead of the normal "tab.*", we'll print "tab.*::typename", which is a + * dirty hack to prevent "tab.*" from being expanded into multiple columns. + * (The parser will strip the useless coercion, so no inefficiency is added in + * dump and reload.) We used to print just "tab" in such cases, but that is + * ambiguous and will yield the wrong result if "tab" is also a plain column + * name in the query. + * + * Returns the attname of the Var, or NULL if the Var has no attname (because + * it is a whole-row Var or a subplan output reference). + */ +static char * +get_variable(Var *var, int levelsup, bool istoplevel, deparse_context *context) +{// #lizard forgives + StringInfo buf = context->buf; + RangeTblEntry *rte; + AttrNumber attnum; + int netlevelsup; + deparse_namespace *dpns; + deparse_columns *colinfo; + char *refname; + char *attname; + + /* Find appropriate nesting depth */ + netlevelsup = var->varlevelsup + levelsup; + if (netlevelsup >= list_length(context->namespaces)) + elog(ERROR, "bogus varlevelsup: %d offset %d", + var->varlevelsup, levelsup); + dpns = (deparse_namespace *) list_nth(context->namespaces, + netlevelsup); + + /* + * Try to find the relevant RTE in this rtable. In a plan tree, it's + * likely that varno is OUTER_VAR or INNER_VAR, in which case we must dig + * down into the subplans, or INDEX_VAR, which is resolved similarly. Also + * find the aliases previously assigned for this RTE. + */ + if (var->varno >= 1 && var->varno <= list_length(dpns->rtable)) + { + rte = rt_fetch(var->varno, dpns->rtable); + refname = (char *) list_nth(dpns->rtable_names, var->varno - 1); + colinfo = deparse_columns_fetch(var->varno, dpns); + attnum = var->varattno; + } + else + { + resolve_special_varno((Node *) var, context, NULL, + get_special_variable); + return NULL; + } + + /* + * The planner will sometimes emit Vars referencing resjunk elements of a + * subquery's target list (this is currently only possible if it chooses + * to generate a "physical tlist" for a SubqueryScan or CteScan node). + * Although we prefer to print subquery-referencing Vars using the + * subquery's alias, that's not possible for resjunk items since they have + * no alias. So in that case, drill down to the subplan and print the + * contents of the referenced tlist item. This works because in a plan + * tree, such Vars can only occur in a SubqueryScan or CteScan node, and + * we'll have set dpns->inner_planstate to reference the child plan node. 
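+ * (One way to reach this path is EXPLAIN VERBOSE of a plan whose
+ * SubqueryScan keeps a resjunk sort column in its physical tlist.)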
+ */ + if ((rte->rtekind == RTE_SUBQUERY || rte->rtekind == RTE_CTE) && + attnum > list_length(rte->eref->colnames) && + dpns->inner_planstate) + { + TargetEntry *tle; + deparse_namespace save_dpns; + + tle = get_tle_by_resno(dpns->inner_tlist, var->varattno); + if (!tle) + elog(ERROR, "invalid attnum %d for relation \"%s\"", + var->varattno, rte->eref->aliasname); + + Assert(netlevelsup == 0); + push_child_plan(dpns, dpns->inner_planstate, &save_dpns); + + /* + * Force parentheses because our caller probably assumed a Var is a + * simple expression. + */ + if (!IsA(tle->expr, Var)) + appendStringInfoChar(buf, '('); + get_rule_expr((Node *) tle->expr, context, true); + if (!IsA(tle->expr, Var)) + appendStringInfoChar(buf, ')'); + + pop_child_plan(dpns, &save_dpns); + return NULL; + } + +#ifdef PGXC + if (rte->rtekind == RTE_REMOTE_DUMMY && + attnum > list_length(rte->eref->colnames) && + dpns->planstate) + { + TargetEntry *tle; + RemoteQuery *rqplan; + Assert(IsA(dpns->planstate, RemoteQueryState)); + Assert(netlevelsup == 0); + + /* + * Get the expression representing the given Var from base_tlist of the + * RemoteQuery + */ + rqplan = (RemoteQuery *)dpns->planstate->plan; + Assert(IsA(rqplan, RemoteQuery)); + tle = get_tle_by_resno(rqplan->base_tlist, var->varattno); + if (!tle) + elog(ERROR, "bogus varattno for remotequery var: %d", var->varattno); + /* + * Force parentheses because our caller probably assumed a Var is a + * simple expression. + */ + if (!IsA(tle->expr, Var)) + appendStringInfoChar(buf, '('); + get_rule_expr((Node *) tle->expr, context, true); + if (!IsA(tle->expr, Var)) + appendStringInfoChar(buf, ')'); + + return NULL; + } +#endif /* PGXC */ + + /* + * If it's an unnamed join, look at the expansion of the alias variable. + * If it's a simple reference to one of the input vars, then recursively + * print the name of that var instead. When it's not a simple reference, + * we have to just print the unqualified join column name. (This can only + * happen with "dangerous" merged columns in a JOIN USING; we took pains + * previously to make the unqualified column name unique in such cases.) + * + * This wouldn't work in decompiling plan trees, because we don't store + * joinaliasvars lists after planning; but a plan tree should never + * contain a join alias variable. + */ + if (rte->rtekind == RTE_JOIN && rte->alias == NULL) + { + if (rte->joinaliasvars == NIL) + elog(ERROR, "cannot decompile join alias var in plan tree"); + if (attnum > 0) + { + Var *aliasvar; + + aliasvar = (Var *) list_nth(rte->joinaliasvars, attnum - 1); + /* we intentionally don't strip implicit coercions here */ + if (aliasvar && IsA(aliasvar, Var)) + { + return get_variable(aliasvar, var->varlevelsup + levelsup, + istoplevel, context); + } + } + + /* + * Unnamed join has no refname. (Note: since it's unnamed, there is + * no way the user could have referenced it to create a whole-row Var + * for it. So we don't have to cover that case below.) + */ + Assert(refname == NULL); + } + + if (attnum == InvalidAttrNumber) + attname = NULL; + else if (attnum > 0) + { + /* Get column name to use from the colinfo struct */ + if (attnum > colinfo->num_cols) + elog(ERROR, "invalid attnum %d for relation \"%s\"", + attnum, rte->eref->aliasname); + attname = colinfo->colnames[attnum - 1]; + if (attname == NULL) /* dropped column? 
*/ + elog(ERROR, "invalid attnum %d for relation \"%s\"", + attnum, rte->eref->aliasname); + } + else + { + /* System column - name is fixed, get it from the catalog */ + attname = get_rte_attribute_name(rte, attnum); + } + + if (refname && (context->varprefix || attname == NULL)) + { + appendStringInfoString(buf, quote_identifier(refname)); + appendStringInfoChar(buf, '.'); + } + if (attname) + appendStringInfoString(buf, quote_identifier(attname)); + else + { + appendStringInfoChar(buf, '*'); + if (istoplevel) + appendStringInfo(buf, "::%s", + format_type_with_typemod(var->vartype, + var->vartypmod)); + } + + return attname; +} + +/* + * Deparse a Var which references OUTER_VAR, INNER_VAR, or INDEX_VAR. This + * routine is actually a callback for get_special_varno, which handles finding + * the correct TargetEntry. We get the expression contained in that + * TargetEntry and just need to deparse it, a job we can throw back on + * get_rule_expr. + */ +static void +get_special_variable(Node *node, deparse_context *context, void *private) +{ + StringInfo buf = context->buf; + + /* + * Force parentheses because our caller probably assumed a Var is a simple + * expression. + */ + if (!IsA(node, Var)) + appendStringInfoChar(buf, '('); + get_rule_expr(node, context, true); + if (!IsA(node, Var)) + appendStringInfoChar(buf, ')'); +} + +/* + * Chase through plan references to special varnos (OUTER_VAR, INNER_VAR, + * INDEX_VAR) until we find a real Var or some kind of non-Var node; then, + * invoke the callback provided. + */ +static void +resolve_special_varno(Node *node, deparse_context *context, void *private, + void (*callback) (Node *, deparse_context *, void *)) +{// #lizard forgives + Var *var; + deparse_namespace *dpns; + + /* If it's not a Var, invoke the callback. */ + if (!IsA(node, Var)) + { + callback(node, context, private); + return; + } + + /* Find appropriate nesting depth */ + var = (Var *) node; + dpns = (deparse_namespace *) list_nth(context->namespaces, + var->varlevelsup); + + /* + * It's a special RTE, so recurse. + */ + if (var->varno == OUTER_VAR && dpns->outer_tlist) + { + TargetEntry *tle; + deparse_namespace save_dpns; + + tle = get_tle_by_resno(dpns->outer_tlist, var->varattno); + if (!tle) + elog(ERROR, "bogus varattno for OUTER_VAR var: %d", var->varattno); + + push_child_plan(dpns, dpns->outer_planstate, &save_dpns); + resolve_special_varno((Node *) tle->expr, context, private, callback); + pop_child_plan(dpns, &save_dpns); + return; + } + else if (var->varno == INNER_VAR && dpns->inner_tlist) + { + TargetEntry *tle; + deparse_namespace save_dpns; + + tle = get_tle_by_resno(dpns->inner_tlist, var->varattno); + if (!tle) + elog(ERROR, "bogus varattno for INNER_VAR var: %d", var->varattno); + + push_child_plan(dpns, dpns->inner_planstate, &save_dpns); + resolve_special_varno((Node *) tle->expr, context, private, callback); + pop_child_plan(dpns, &save_dpns); + return; + } + else if (var->varno == INDEX_VAR && dpns->index_tlist) + { + TargetEntry *tle; + + tle = get_tle_by_resno(dpns->index_tlist, var->varattno); + if (!tle) + elog(ERROR, "bogus varattno for INDEX_VAR var: %d", var->varattno); + + resolve_special_varno((Node *) tle->expr, context, private, callback); + return; + } + else if (var->varno < 1 || var->varno > list_length(dpns->rtable)) + elog(ERROR, "bogus varno: %d", var->varno); + + /* Not special. Just invoke the callback. */ + callback(node, context, private); +} + +/* + * Get the name of a field of an expression of composite type. 
The + * expression is usually a Var, but we handle other cases too. + * + * levelsup is an extra offset to interpret the Var's varlevelsup correctly. + * + * This is fairly straightforward when the expression has a named composite + * type; we need only look up the type in the catalogs. However, the type + * could also be RECORD. Since no actual table or view column is allowed to + * have type RECORD, a Var of type RECORD must refer to a JOIN or FUNCTION RTE + * or to a subquery output. We drill down to find the ultimate defining + * expression and attempt to infer the field name from it. We ereport if we + * can't determine the name. + * + * Similarly, a PARAM of type RECORD has to refer to some expression of + * a determinable composite type. + */ +static const char * +get_name_for_var_field(Var *var, int fieldno, + int levelsup, deparse_context *context) +{// #lizard forgives + RangeTblEntry *rte; + AttrNumber attnum; + int netlevelsup; + deparse_namespace *dpns; + TupleDesc tupleDesc; + Node *expr; + + /* + * If it's a RowExpr that was expanded from a whole-row Var, use the + * column names attached to it. + */ + if (IsA(var, RowExpr)) + { + RowExpr *r = (RowExpr *) var; + + if (fieldno > 0 && fieldno <= list_length(r->colnames)) + return strVal(list_nth(r->colnames, fieldno - 1)); + } + + /* + * If it's a Param of type RECORD, try to find what the Param refers to. + */ + if (IsA(var, Param)) + { + Param *param = (Param *) var; + ListCell *ancestor_cell; + + expr = find_param_referent(param, context, &dpns, &ancestor_cell); + if (expr) + { + /* Found a match, so recurse to decipher the field name */ + deparse_namespace save_dpns; + const char *result; + + push_ancestor_plan(dpns, ancestor_cell, &save_dpns); + result = get_name_for_var_field((Var *) expr, fieldno, + 0, context); + pop_ancestor_plan(dpns, &save_dpns); + return result; + } + } + + /* + * If it's a Var of type RECORD, we have to find what the Var refers to; + * if not, we can use get_expr_result_type. If that fails, we try + * lookup_rowtype_tupdesc, which will probably fail too, but will ereport + * an acceptable message. + */ + if (!IsA(var, Var) || + var->vartype != RECORDOID) + { + if (get_expr_result_type((Node *) var, NULL, &tupleDesc) != TYPEFUNC_COMPOSITE) + tupleDesc = lookup_rowtype_tupdesc_copy(exprType((Node *) var), + exprTypmod((Node *) var)); + Assert(tupleDesc); + /* Got the tupdesc, so we can extract the field name */ + Assert(fieldno >= 1 && fieldno <= tupleDesc->natts); + return NameStr(tupleDesc->attrs[fieldno - 1]->attname); + } + + /* Find appropriate nesting depth */ + netlevelsup = var->varlevelsup + levelsup; + if (netlevelsup >= list_length(context->namespaces)) + elog(ERROR, "bogus varlevelsup: %d offset %d", + var->varlevelsup, levelsup); + dpns = (deparse_namespace *) list_nth(context->namespaces, + netlevelsup); + + /* + * Try to find the relevant RTE in this rtable. In a plan tree, it's + * likely that varno is OUTER_VAR or INNER_VAR, in which case we must dig + * down into the subplans, or INDEX_VAR, which is resolved similarly. 
+ */ + if (var->varno >= 1 && var->varno <= list_length(dpns->rtable)) + { + rte = rt_fetch(var->varno, dpns->rtable); + attnum = var->varattno; + } + else if (var->varno == OUTER_VAR && dpns->outer_tlist) + { + TargetEntry *tle; + deparse_namespace save_dpns; + const char *result; + + tle = get_tle_by_resno(dpns->outer_tlist, var->varattno); + if (!tle) + elog(ERROR, "bogus varattno for OUTER_VAR var: %d", var->varattno); + + Assert(netlevelsup == 0); + push_child_plan(dpns, dpns->outer_planstate, &save_dpns); + + result = get_name_for_var_field((Var *) tle->expr, fieldno, + levelsup, context); + + pop_child_plan(dpns, &save_dpns); + return result; + } + else if (var->varno == INNER_VAR && dpns->inner_tlist) + { + TargetEntry *tle; + deparse_namespace save_dpns; + const char *result; + + tle = get_tle_by_resno(dpns->inner_tlist, var->varattno); + if (!tle) + elog(ERROR, "bogus varattno for INNER_VAR var: %d", var->varattno); + + Assert(netlevelsup == 0); + push_child_plan(dpns, dpns->inner_planstate, &save_dpns); + + result = get_name_for_var_field((Var *) tle->expr, fieldno, + levelsup, context); + + pop_child_plan(dpns, &save_dpns); + return result; + } + else if (var->varno == INDEX_VAR && dpns->index_tlist) + { + TargetEntry *tle; + const char *result; + + tle = get_tle_by_resno(dpns->index_tlist, var->varattno); + if (!tle) + elog(ERROR, "bogus varattno for INDEX_VAR var: %d", var->varattno); + + Assert(netlevelsup == 0); + + result = get_name_for_var_field((Var *) tle->expr, fieldno, + levelsup, context); + + return result; + } + else + { + elog(ERROR, "bogus varno: %d", var->varno); + return NULL; /* keep compiler quiet */ + } + + if (attnum == InvalidAttrNumber) + { + /* Var is whole-row reference to RTE, so select the right field */ + return get_rte_attribute_name(rte, fieldno); + } + + /* + * This part has essentially the same logic as the parser's + * expandRecordVariable() function, but we are dealing with a different + * representation of the input context, and we only need one field name + * not a TupleDesc. Also, we need special cases for finding subquery and + * CTE subplans when deparsing Plan trees. + */ + expr = (Node *) var; /* default if we can't drill down */ + + switch (rte->rtekind) + { + case RTE_RELATION: + case RTE_VALUES: + case RTE_NAMEDTUPLESTORE: + + /* + * This case should not occur: a column of a table or values list + * shouldn't have type RECORD. Fall through and fail (most + * likely) at the bottom. + */ + break; + case RTE_SUBQUERY: + /* Subselect-in-FROM: examine sub-select's output expr */ + { + if (rte->subquery) + { + TargetEntry *ste = get_tle_by_resno(rte->subquery->targetList, + attnum); + + if (ste == NULL || ste->resjunk) + elog(ERROR, "subquery %s does not have attribute %d", + rte->eref->aliasname, attnum); + expr = (Node *) ste->expr; + if (IsA(expr, Var)) + { + /* + * Recurse into the sub-select to see what its Var + * refers to. We have to build an additional level of + * namespace to keep in step with varlevelsup in the + * subselect. 
+ */ + deparse_namespace mydpns; + const char *result; + + set_deparse_for_query(&mydpns, rte->subquery, + context->namespaces); + + context->namespaces = lcons(&mydpns, + context->namespaces); + + result = get_name_for_var_field((Var *) expr, fieldno, + 0, context); + + context->namespaces = + list_delete_first(context->namespaces); + + return result; + } + /* else fall through to inspect the expression */ + } + else + { + /* + * We're deparsing a Plan tree so we don't have complete + * RTE entries (in particular, rte->subquery is NULL). But + * the only place we'd see a Var directly referencing a + * SUBQUERY RTE is in a SubqueryScan plan node, and we can + * look into the child plan's tlist instead. + */ + TargetEntry *tle; + deparse_namespace save_dpns; + const char *result; + + if (!dpns->inner_planstate) + elog(ERROR, "failed to find plan for subquery %s", + rte->eref->aliasname); + tle = get_tle_by_resno(dpns->inner_tlist, attnum); + if (!tle) + elog(ERROR, "bogus varattno for subquery var: %d", + attnum); + Assert(netlevelsup == 0); + push_child_plan(dpns, dpns->inner_planstate, &save_dpns); + + result = get_name_for_var_field((Var *) tle->expr, fieldno, + levelsup, context); + + pop_child_plan(dpns, &save_dpns); + return result; + } + } + break; + case RTE_JOIN: + /* Join RTE --- recursively inspect the alias variable */ + if (rte->joinaliasvars == NIL) + elog(ERROR, "cannot decompile join alias var in plan tree"); + Assert(attnum > 0 && attnum <= list_length(rte->joinaliasvars)); + expr = (Node *) list_nth(rte->joinaliasvars, attnum - 1); + Assert(expr != NULL); + /* we intentionally don't strip implicit coercions here */ + if (IsA(expr, Var)) + return get_name_for_var_field((Var *) expr, fieldno, + var->varlevelsup + levelsup, + context); + /* else fall through to inspect the expression */ + break; + case RTE_FUNCTION: + case RTE_TABLEFUNC: + + /* + * We couldn't get here unless a function is declared with one of + * its result columns as RECORD, which is not allowed. + */ + break; + case RTE_CTE: + /* CTE reference: examine subquery's output expr */ + { + CommonTableExpr *cte = NULL; + Index ctelevelsup; + ListCell *lc; + + /* + * Try to find the referenced CTE using the namespace stack. + */ + ctelevelsup = rte->ctelevelsup + netlevelsup; + if (ctelevelsup >= list_length(context->namespaces)) + lc = NULL; + else + { + deparse_namespace *ctedpns; + + ctedpns = (deparse_namespace *) + list_nth(context->namespaces, ctelevelsup); + foreach(lc, ctedpns->ctes) + { + cte = (CommonTableExpr *) lfirst(lc); + if (strcmp(cte->ctename, rte->ctename) == 0) + break; + } + } + if (lc != NULL) + { + Query *ctequery = (Query *) cte->ctequery; + TargetEntry *ste = get_tle_by_resno(GetCTETargetList(cte), + attnum); + + if (ste == NULL || ste->resjunk) + elog(ERROR, "subquery %s does not have attribute %d", + rte->eref->aliasname, attnum); + expr = (Node *) ste->expr; + if (IsA(expr, Var)) + { + /* + * Recurse into the CTE to see what its Var refers to. + * We have to build an additional level of namespace + * to keep in step with varlevelsup in the CTE. + * Furthermore it could be an outer CTE, so we may + * have to delete some levels of namespace. 
+ */ + List *save_nslist = context->namespaces; + List *new_nslist; + deparse_namespace mydpns; + const char *result; + + set_deparse_for_query(&mydpns, ctequery, + context->namespaces); + + new_nslist = list_copy_tail(context->namespaces, + ctelevelsup); + context->namespaces = lcons(&mydpns, new_nslist); + + result = get_name_for_var_field((Var *) expr, fieldno, + 0, context); + + context->namespaces = save_nslist; + + return result; + } + /* else fall through to inspect the expression */ + } + else + { + /* + * We're deparsing a Plan tree so we don't have a CTE + * list. But the only place we'd see a Var directly + * referencing a CTE RTE is in a CteScan plan node, and we + * can look into the subplan's tlist instead. + */ + TargetEntry *tle; + deparse_namespace save_dpns; + const char *result; + + if (!dpns->inner_planstate) + elog(ERROR, "failed to find plan for CTE %s", + rte->eref->aliasname); + tle = get_tle_by_resno(dpns->inner_tlist, attnum); + if (!tle) + elog(ERROR, "bogus varattno for subquery var: %d", + attnum); + Assert(netlevelsup == 0); + push_child_plan(dpns, dpns->inner_planstate, &save_dpns); + + result = get_name_for_var_field((Var *) tle->expr, fieldno, + levelsup, context); + + pop_child_plan(dpns, &save_dpns); + return result; + } + } + break; +#ifdef PGXC + case RTE_REMOTE_DUMMY: + elog(ERROR, "Invalid RTE found"); + break; +#endif /* PGXC */ + } + + /* + * We now have an expression we can't expand any more, so see if + * get_expr_result_type() can do anything with it. If not, pass to + * lookup_rowtype_tupdesc() which will probably fail, but will give an + * appropriate error message while failing. + */ + if (get_expr_result_type(expr, NULL, &tupleDesc) != TYPEFUNC_COMPOSITE) + tupleDesc = lookup_rowtype_tupdesc_copy(exprType(expr), + exprTypmod(expr)); + Assert(tupleDesc); + /* Got the tupdesc, so we can extract the field name */ + Assert(fieldno >= 1 && fieldno <= tupleDesc->natts); + return NameStr(tupleDesc->attrs[fieldno - 1]->attname); +} + +/* + * Try to find the referenced expression for a PARAM_EXEC Param that might + * reference a parameter supplied by an upper NestLoop or SubPlan plan node. + * + * If successful, return the expression and set *dpns_p and *ancestor_cell_p + * appropriately for calling push_ancestor_plan(). If no referent can be + * found, return NULL. + */ +static Node * +find_param_referent(Param *param, deparse_context *context, + deparse_namespace **dpns_p, ListCell **ancestor_cell_p) +{// #lizard forgives + /* Initialize output parameters to prevent compiler warnings */ + *dpns_p = NULL; + *ancestor_cell_p = NULL; + + /* + * If it's a PARAM_EXEC parameter, look for a matching NestLoopParam or + * SubPlan argument. This will necessarily be in some ancestor of the + * current expression's PlanState. + */ + if (param->paramkind == PARAM_EXEC) + { + deparse_namespace *dpns; + PlanState *child_ps; + bool in_same_plan_level; + ListCell *lc; + + dpns = (deparse_namespace *) linitial(context->namespaces); + child_ps = dpns->planstate; + in_same_plan_level = true; + + foreach(lc, dpns->ancestors) + { + PlanState *ps = (PlanState *) lfirst(lc); + ListCell *lc2; + + /* + * NestLoops transmit params to their inner child only; also, once + * we've crawled up out of a subplan, this couldn't possibly be + * the right match. 
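+ * This is what lets EXPLAIN show a parameterized inner qual as, say,
+ * "Index Cond: (t2.x = t1.x)" rather than a bare "$0" reference.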
+ */ + if (IsA(ps, NestLoopState) && + child_ps == innerPlanState(ps) && + in_same_plan_level) + { + NestLoop *nl = (NestLoop *) ps->plan; + + foreach(lc2, nl->nestParams) + { + NestLoopParam *nlp = (NestLoopParam *) lfirst(lc2); + + if (nlp->paramno == param->paramid) + { + /* Found a match, so return it */ + *dpns_p = dpns; + *ancestor_cell_p = lc; + return (Node *) nlp->paramval; + } + } + } + + /* + * Check to see if we're crawling up from a subplan. + */ + foreach(lc2, ps->subPlan) + { + SubPlanState *sstate = (SubPlanState *) lfirst(lc2); + SubPlan *subplan = sstate->subplan; + ListCell *lc3; + ListCell *lc4; + + if (child_ps != sstate->planstate) + continue; + + /* Matched subplan, so check its arguments */ + forboth(lc3, subplan->parParam, lc4, subplan->args) + { + int paramid = lfirst_int(lc3); + Node *arg = (Node *) lfirst(lc4); + + if (paramid == param->paramid) + { + /* Found a match, so return it */ + *dpns_p = dpns; + *ancestor_cell_p = lc; + return arg; + } + } + + /* Keep looking, but we are emerging from a subplan. */ + in_same_plan_level = false; + break; + } + + /* + * Likewise check to see if we're emerging from an initplan. + * Initplans never have any parParams, so no need to search that + * list, but we need to know if we should reset + * in_same_plan_level. + */ + foreach(lc2, ps->initPlan) + { + SubPlanState *sstate = (SubPlanState *) lfirst(lc2); + + if (child_ps != sstate->planstate) + continue; + + /* No parameters to be had here. */ + Assert(sstate->subplan->parParam == NIL); + + /* Keep looking, but we are emerging from an initplan. */ + in_same_plan_level = false; + break; + } + + /* No luck, crawl up to next ancestor */ + child_ps = ps; + } + } + + /* No referent found */ + return NULL; +} + +/* + * Display a Param appropriately. + */ +static void +get_parameter(Param *param, deparse_context *context) +{// #lizard forgives + Node *expr; + deparse_namespace *dpns; + ListCell *ancestor_cell; + + /* + * If it's a PARAM_EXEC parameter, try to locate the expression from which + * the parameter was computed. Note that failing to find a referent isn't + * an error, since the Param might well be a subplan output rather than an + * input. + */ + expr = find_param_referent(param, context, &dpns, &ancestor_cell); + if (expr) + { + /* Found a match, so print it */ + deparse_namespace save_dpns; + bool save_varprefix; + bool need_paren; + + /* Switch attention to the ancestor plan node */ + push_ancestor_plan(dpns, ancestor_cell, &save_dpns); + + /* + * Force prefixing of Vars, since they won't belong to the relation + * being scanned in the original plan node. + */ + save_varprefix = context->varprefix; + context->varprefix = true; + + /* + * A Param's expansion is typically a Var, Aggref, or upper-level + * Param, which wouldn't need extra parentheses. Otherwise, insert + * parens to ensure the expression looks atomic. + */ + need_paren = !(IsA(expr, Var) || + IsA(expr, Aggref) || + IsA(expr, Param)); + if (need_paren) + appendStringInfoChar(context->buf, '('); + + get_rule_expr(expr, context, false); + + if (need_paren) + appendStringInfoChar(context->buf, ')'); + + context->varprefix = save_varprefix; + + pop_ancestor_plan(dpns, &save_dpns); + + return; + } + + /* + * Not PARAM_EXEC, or couldn't find referent: just print $N. 
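+ * An external parameter thus comes out simply as "$1"; the __TBASE__
+ * case below may additionally append an explicit cast, e.g. "$1::integer".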
+ */ + appendStringInfo(context->buf, "$%d", param->paramid); + +#ifdef __TBASE__ + /* param need explicit cast */ + if (param->explicit_cast) + { + appendStringInfo(context->buf, "::%s", + format_type_with_typemod(param->paramtype, param->paramtypmod)); + } +#endif +} + +/* + * get_simple_binary_op_name + * + * helper function for isSimpleNode + * will return single char binary operator name, or NULL if it's not + */ +static const char * +get_simple_binary_op_name(OpExpr *expr) +{ + List *args = expr->args; + + if (list_length(args) == 2) + { + /* binary operator */ + Node *arg1 = (Node *) linitial(args); + Node *arg2 = (Node *) lsecond(args); + const char *op; + + op = generate_operator_name(expr->opno, exprType(arg1), exprType(arg2)); + if (strlen(op) == 1) + return op; + } + return NULL; +} + + +/* + * isSimpleNode - check if given node is simple (doesn't need parenthesizing) + * + * true : simple in the context of parent node's type + * false : not simple + */ +static bool +isSimpleNode(Node *node, Node *parentNode, int prettyFlags) +{// #lizard forgives + if (!node) + return false; + + switch (nodeTag(node)) + { + case T_Var: + case T_Const: + case T_Param: + case T_CoerceToDomainValue: + case T_SetToDefault: + case T_CurrentOfExpr: + /* single words: always simple */ + return true; + + case T_ArrayRef: + case T_ArrayExpr: + case T_RowExpr: + case T_CoalesceExpr: + case T_MinMaxExpr: + case T_SQLValueFunction: + case T_XmlExpr: + case T_NextValueExpr: + case T_NullIfExpr: + case T_Aggref: + case T_WindowFunc: + case T_FuncExpr: + /* function-like: name(..) or name[..] */ + return true; + + /* CASE keywords act as parentheses */ + case T_CaseExpr: + return true; + + case T_FieldSelect: + + /* + * appears simple since . has top precedence, unless parent is + * T_FieldSelect itself! + */ + return (IsA(parentNode, FieldSelect) ? false : true); + + case T_FieldStore: + + /* + * treat like FieldSelect (probably doesn't matter) + */ + return (IsA(parentNode, FieldStore) ? 
false : true); + + case T_CoerceToDomain: + /* maybe simple, check args */ + return isSimpleNode((Node *) ((CoerceToDomain *) node)->arg, + node, prettyFlags); + case T_RelabelType: + return isSimpleNode((Node *) ((RelabelType *) node)->arg, + node, prettyFlags); + case T_CoerceViaIO: + return isSimpleNode((Node *) ((CoerceViaIO *) node)->arg, + node, prettyFlags); + case T_ArrayCoerceExpr: + return isSimpleNode((Node *) ((ArrayCoerceExpr *) node)->arg, + node, prettyFlags); + case T_ConvertRowtypeExpr: + return isSimpleNode((Node *) ((ConvertRowtypeExpr *) node)->arg, + node, prettyFlags); + + case T_OpExpr: + { + /* depends on parent node type; needs further checking */ + if (prettyFlags & PRETTYFLAG_PAREN && IsA(parentNode, OpExpr)) + { + const char *op; + const char *parentOp; + bool is_lopriop; + bool is_hipriop; + bool is_lopriparent; + bool is_hipriparent; + + op = get_simple_binary_op_name((OpExpr *) node); + if (!op) + return false; + + /* We know only the basic operators + - and * / % */ + is_lopriop = (strchr("+-", *op) != NULL); + is_hipriop = (strchr("*/%", *op) != NULL); + if (!(is_lopriop || is_hipriop)) + return false; + + parentOp = get_simple_binary_op_name((OpExpr *) parentNode); + if (!parentOp) + return false; + + is_lopriparent = (strchr("+-", *parentOp) != NULL); + is_hipriparent = (strchr("*/%", *parentOp) != NULL); + if (!(is_lopriparent || is_hipriparent)) + return false; + + if (is_hipriop && is_lopriparent) + return true; /* op binds tighter than parent */ + + if (is_lopriop && is_hipriparent) + return false; + + /* + * Operators are same priority --- can skip parens only if + * we have (a - b) - c, not a - (b - c). + */ + if (node == (Node *) linitial(((OpExpr *) parentNode)->args)) + return true; + + return false; + } + /* else do the same stuff as for T_SubLink et al. 
*/ + /* FALL THROUGH */ + } + + case T_SubLink: + case T_NullTest: + case T_BooleanTest: + case T_DistinctExpr: + switch (nodeTag(parentNode)) + { + case T_FuncExpr: + { + /* special handling for casts */ + CoercionForm type = ((FuncExpr *) parentNode)->funcformat; + + if (type == COERCE_EXPLICIT_CAST || + type == COERCE_IMPLICIT_CAST) + return false; + return true; /* own parentheses */ + } + case T_BoolExpr: /* lower precedence */ + case T_ArrayRef: /* other separators */ + case T_ArrayExpr: /* other separators */ + case T_RowExpr: /* other separators */ + case T_CoalesceExpr: /* own parentheses */ + case T_MinMaxExpr: /* own parentheses */ + case T_XmlExpr: /* own parentheses */ + case T_NullIfExpr: /* other separators */ + case T_Aggref: /* own parentheses */ + case T_WindowFunc: /* own parentheses */ + case T_CaseExpr: /* other separators */ + return true; + default: + return false; + } + + case T_BoolExpr: + switch (nodeTag(parentNode)) + { + case T_BoolExpr: + if (prettyFlags & PRETTYFLAG_PAREN) + { + BoolExprType type; + BoolExprType parentType; + + type = ((BoolExpr *) node)->boolop; + parentType = ((BoolExpr *) parentNode)->boolop; + switch (type) + { + case NOT_EXPR: + case AND_EXPR: + if (parentType == AND_EXPR || parentType == OR_EXPR) + return true; + break; + case OR_EXPR: + if (parentType == OR_EXPR) + return true; + break; + } + } + return false; + case T_FuncExpr: + { + /* special handling for casts */ + CoercionForm type = ((FuncExpr *) parentNode)->funcformat; + + if (type == COERCE_EXPLICIT_CAST || + type == COERCE_IMPLICIT_CAST) + return false; + return true; /* own parentheses */ + } + case T_ArrayRef: /* other separators */ + case T_ArrayExpr: /* other separators */ + case T_RowExpr: /* other separators */ + case T_CoalesceExpr: /* own parentheses */ + case T_MinMaxExpr: /* own parentheses */ + case T_XmlExpr: /* own parentheses */ + case T_NullIfExpr: /* other separators */ + case T_Aggref: /* own parentheses */ + case T_WindowFunc: /* own parentheses */ + case T_CaseExpr: /* other separators */ + return true; + default: + return false; + } + + default: + break; + } + /* those we don't know: in dubio complexo */ + return false; +} + + +/* + * appendContextKeyword - append a keyword to buffer + * + * If prettyPrint is enabled, perform a line break, and adjust indentation. + * Otherwise, just append the keyword. + */ +static void +appendContextKeyword(deparse_context *context, const char *str, + int indentBefore, int indentAfter, int indentPlus) +{ + StringInfo buf = context->buf; + + if (PRETTY_INDENT(context)) + { + int indentAmount; + + context->indentLevel += indentBefore; + + /* remove any trailing spaces currently in the buffer ... */ + removeStringInfoSpaces(buf); + /* ... then add a newline and some spaces */ + appendStringInfoChar(buf, '\n'); + + if (context->indentLevel < PRETTYINDENT_LIMIT) + indentAmount = Max(context->indentLevel, 0) + indentPlus; + else + { + /* + * If we're indented more than PRETTYINDENT_LIMIT characters, try + * to conserve horizontal space by reducing the per-level + * indentation. For best results the scale factor here should + * divide all the indent amounts that get added to indentLevel + * (PRETTYINDENT_STD, etc). It's important that the indentation + * not grow unboundedly, else deeply-nested trees use O(N^2) + * whitespace; so we also wrap modulo PRETTYINDENT_LIMIT. 
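+ * As a worked example, assuming the usual PRETTYINDENT_STD = 8 and
+ * PRETTYINDENT_LIMIT = 40, an indentLevel of 120 gives
+ * 40 + (120 - 40) / 4 = 60, which wraps to 20 columns before indentPlus
+ * is added.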
+ */ + indentAmount = PRETTYINDENT_LIMIT + + (context->indentLevel - PRETTYINDENT_LIMIT) / + (PRETTYINDENT_STD / 2); + indentAmount %= PRETTYINDENT_LIMIT; + /* scale/wrap logic affects indentLevel, but not indentPlus */ + indentAmount += indentPlus; + } + appendStringInfoSpaces(buf, indentAmount); + + appendStringInfoString(buf, str); + + context->indentLevel += indentAfter; + if (context->indentLevel < 0) + context->indentLevel = 0; + } + else + appendStringInfoString(buf, str); +} + +/* + * removeStringInfoSpaces - delete trailing spaces from a buffer. + * + * Possibly this should move to stringinfo.c at some point. + */ +static void +removeStringInfoSpaces(StringInfo str) +{ + while (str->len > 0 && str->data[str->len - 1] == ' ') + str->data[--(str->len)] = '\0'; +} + + +/* + * get_rule_expr_paren - deparse expr using get_rule_expr, + * embracing the string with parentheses if necessary for prettyPrint. + * + * Never embrace if prettyFlags=0, because it's done in the calling node. + * + * Any node that does *not* embrace its argument node by sql syntax (with + * parentheses, non-operator keywords like CASE/WHEN/ON, or comma etc) should + * use get_rule_expr_paren instead of get_rule_expr so parentheses can be + * added. + */ +static void +get_rule_expr_paren(Node *node, deparse_context *context, + bool showimplicit, Node *parentNode) +{ + bool need_paren; + + need_paren = PRETTY_PAREN(context) && + !isSimpleNode(node, parentNode, context->prettyFlags); + + if (need_paren) + appendStringInfoChar(context->buf, '('); + + get_rule_expr(node, context, showimplicit); + + if (need_paren) + appendStringInfoChar(context->buf, ')'); +} + + +/* ---------- + * get_rule_expr - Parse back an expression + * + * Note: showimplicit determines whether we display any implicit cast that + * is present at the top of the expression tree. It is a passed argument, + * not a field of the context struct, because we change the value as we + * recurse down into the expression. In general we suppress implicit casts + * when the result type is known with certainty (eg, the arguments of an + * OR must be boolean). We display implicit casts for arguments of functions + * and operators, since this is needed to be certain that the same function + * or operator will be chosen when the expression is re-parsed. + * ---------- + */ +static void +get_rule_expr(Node *node, deparse_context *context, + bool showimplicit) +{// #lizard forgives + StringInfo buf = context->buf; + + if (node == NULL) + return; + + /* Guard against excessively long or deeply-nested queries */ + CHECK_FOR_INTERRUPTS(); + check_stack_depth(); + + /* + * Each level of get_rule_expr must emit an indivisible term + * (parenthesized if necessary) to ensure result is reparsed into the same + * expression tree. The only exception is that when the input is a List, + * we emit the component items comma-separated with no surrounding + * decoration; this is convenient for most callers. 
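+ * The GroupingFunc case below relies on this: it hands gexpr->args (a
+ * List) straight to get_rule_expr, so "GROUPING(a, b)" comes out with a
+ * plain comma-separated argument list.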
+ */ + switch (nodeTag(node)) + { + case T_Var: + (void) get_variable((Var *) node, 0, false, context); + break; + + case T_Const: + get_const_expr((Const *) node, context, 0); + break; + + case T_Param: + get_parameter((Param *) node, context); + break; + + case T_Aggref: + get_agg_expr((Aggref *) node, context, (Aggref *) node); + break; + + case T_GroupingFunc: + { + GroupingFunc *gexpr = (GroupingFunc *) node; + + appendStringInfoString(buf, "GROUPING("); + get_rule_expr((Node *) gexpr->args, context, true); + appendStringInfoChar(buf, ')'); + } + break; + + case T_WindowFunc: + get_windowfunc_expr((WindowFunc *) node, context); + break; + + case T_ArrayRef: + { + ArrayRef *aref = (ArrayRef *) node; + bool need_parens; + + /* + * If the argument is a CaseTestExpr, we must be inside a + * FieldStore, ie, we are assigning to an element of an array + * within a composite column. Since we already punted on + * displaying the FieldStore's target information, just punt + * here too, and display only the assignment source + * expression. + */ + if (IsA(aref->refexpr, CaseTestExpr)) + { + Assert(aref->refassgnexpr); + get_rule_expr((Node *) aref->refassgnexpr, + context, showimplicit); + break; + } + + /* + * Parenthesize the argument unless it's a simple Var or a + * FieldSelect. (In particular, if it's another ArrayRef, we + * *must* parenthesize to avoid confusion.) + */ + need_parens = !IsA(aref->refexpr, Var) && + !IsA(aref->refexpr, FieldSelect); + if (need_parens) + appendStringInfoChar(buf, '('); + get_rule_expr((Node *) aref->refexpr, context, showimplicit); + if (need_parens) + appendStringInfoChar(buf, ')'); + + /* + * If there's a refassgnexpr, we want to print the node in the + * format "array[subscripts] := refassgnexpr". This is not + * legal SQL, so decompilation of INSERT or UPDATE statements + * should always use processIndirection as part of the + * statement-level syntax. We should only see this when + * EXPLAIN tries to print the targetlist of a plan resulting + * from such a statement. + */ + if (aref->refassgnexpr) + { + Node *refassgnexpr; + + /* + * Use processIndirection to print this node's subscripts + * as well as any additional field selections or + * subscripting in immediate descendants. It returns the + * RHS expr that is actually being "assigned". 
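+ * In EXPLAIN output for such a plan this looks like, e.g.,
+ * "arr[2] := 42".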
+ */ + refassgnexpr = processIndirection(node, context); + appendStringInfoString(buf, " := "); + get_rule_expr(refassgnexpr, context, showimplicit); + } + else + { + /* Just an ordinary array fetch, so print subscripts */ + printSubscripts(aref, context); + } + } + break; + + case T_FuncExpr: + get_func_expr((FuncExpr *) node, context, showimplicit); + break; + + case T_NamedArgExpr: + { + NamedArgExpr *na = (NamedArgExpr *) node; + + appendStringInfo(buf, "%s => ", quote_identifier(na->name)); + get_rule_expr((Node *) na->arg, context, showimplicit); + } + break; + + case T_OpExpr: + get_oper_expr((OpExpr *) node, context); + break; + + case T_DistinctExpr: + { + DistinctExpr *expr = (DistinctExpr *) node; + List *args = expr->args; + Node *arg1 = (Node *) linitial(args); + Node *arg2 = (Node *) lsecond(args); + + if (!PRETTY_PAREN(context)) + appendStringInfoChar(buf, '('); + get_rule_expr_paren(arg1, context, true, node); + appendStringInfoString(buf, " IS DISTINCT FROM "); + get_rule_expr_paren(arg2, context, true, node); + if (!PRETTY_PAREN(context)) + appendStringInfoChar(buf, ')'); + } + break; + + case T_NullIfExpr: + { + NullIfExpr *nullifexpr = (NullIfExpr *) node; + + appendStringInfoString(buf, "NULLIF("); + get_rule_expr((Node *) nullifexpr->args, context, true); + appendStringInfoChar(buf, ')'); + } + break; + + case T_ScalarArrayOpExpr: + { + ScalarArrayOpExpr *expr = (ScalarArrayOpExpr *) node; + List *args = expr->args; + Node *arg1 = (Node *) linitial(args); + Node *arg2 = (Node *) lsecond(args); + + if (!PRETTY_PAREN(context)) + appendStringInfoChar(buf, '('); + get_rule_expr_paren(arg1, context, true, node); + appendStringInfo(buf, " %s %s (", + generate_operator_name(expr->opno, + exprType(arg1), + get_base_element_type(exprType(arg2))), + expr->useOr ? "ANY" : "ALL"); + get_rule_expr_paren(arg2, context, true, node); + + /* + * There's inherent ambiguity in "x op ANY/ALL (y)" when y is + * a bare sub-SELECT. Since we're here, the sub-SELECT must + * be meant as a scalar sub-SELECT yielding an array value to + * be used in ScalarArrayOpExpr; but the grammar will + * preferentially interpret such a construct as an ANY/ALL + * SubLink. To prevent misparsing the output that way, insert + * a dummy coercion (which will be stripped by parse analysis, + * so no inefficiency is added in dump and reload). This is + * indeed most likely what the user wrote to get the construct + * accepted in the first place. 
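+ * The deparsed form therefore reads like
+ * "x = ANY ((SELECT ...)::integer[])" rather than the ambiguous
+ * "x = ANY (SELECT ...)".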
+ */ + if (IsA(arg2, SubLink) && + ((SubLink *) arg2)->subLinkType == EXPR_SUBLINK) + appendStringInfo(buf, "::%s", + format_type_with_typemod(exprType(arg2), + exprTypmod(arg2))); + appendStringInfoChar(buf, ')'); + if (!PRETTY_PAREN(context)) + appendStringInfoChar(buf, ')'); + } + break; + + case T_BoolExpr: + { + BoolExpr *expr = (BoolExpr *) node; + Node *first_arg = linitial(expr->args); + ListCell *arg = lnext(list_head(expr->args)); + + switch (expr->boolop) + { + case AND_EXPR: + if (!PRETTY_PAREN(context)) + appendStringInfoChar(buf, '('); + get_rule_expr_paren(first_arg, context, + false, node); + while (arg) + { + appendStringInfoString(buf, " AND "); + get_rule_expr_paren((Node *) lfirst(arg), context, + false, node); + arg = lnext(arg); + } + if (!PRETTY_PAREN(context)) + appendStringInfoChar(buf, ')'); + break; + + case OR_EXPR: + if (!PRETTY_PAREN(context)) + appendStringInfoChar(buf, '('); + get_rule_expr_paren(first_arg, context, + false, node); + while (arg) + { + appendStringInfoString(buf, " OR "); + get_rule_expr_paren((Node *) lfirst(arg), context, + false, node); + arg = lnext(arg); + } + if (!PRETTY_PAREN(context)) + appendStringInfoChar(buf, ')'); + break; + + case NOT_EXPR: + if (!PRETTY_PAREN(context)) + appendStringInfoChar(buf, '('); + appendStringInfoString(buf, "NOT "); + get_rule_expr_paren(first_arg, context, + false, node); + if (!PRETTY_PAREN(context)) + appendStringInfoChar(buf, ')'); + break; + + default: + elog(ERROR, "unrecognized boolop: %d", + (int) expr->boolop); + } + } + break; + + case T_SubLink: + get_sublink_expr((SubLink *) node, context); + break; + + case T_SubPlan: + { + SubPlan *subplan = (SubPlan *) node; + + /* + * We cannot see an already-planned subplan in rule deparsing, + * only while EXPLAINing a query plan. We don't try to + * reconstruct the original SQL, just reference the subplan + * that appears elsewhere in EXPLAIN's result. + */ + if (subplan->useHashTable) + appendStringInfo(buf, "(hashed %s)", subplan->plan_name); + else + appendStringInfo(buf, "(%s)", subplan->plan_name); + } + break; + + case T_AlternativeSubPlan: + { + AlternativeSubPlan *asplan = (AlternativeSubPlan *) node; + ListCell *lc; + + /* As above, this can only happen during EXPLAIN */ + appendStringInfoString(buf, "(alternatives: "); + foreach(lc, asplan->subplans) + { + SubPlan *splan = lfirst_node(SubPlan, lc); + + if (splan->useHashTable) + appendStringInfo(buf, "hashed %s", splan->plan_name); + else + appendStringInfoString(buf, splan->plan_name); + if (lnext(lc)) + appendStringInfoString(buf, " or "); + } + appendStringInfoChar(buf, ')'); + } + break; + + case T_FieldSelect: + { + FieldSelect *fselect = (FieldSelect *) node; + Node *arg = (Node *) fselect->arg; + int fno = fselect->fieldnum; + const char *fieldname; + bool need_parens; + + /* + * Parenthesize the argument unless it's an ArrayRef or + * another FieldSelect. Note in particular that it would be + * WRONG to not parenthesize a Var argument; simplicity is not + * the issue here, having the right number of names is. + */ + need_parens = !IsA(arg, ArrayRef) &&!IsA(arg, FieldSelect); + if (need_parens) + appendStringInfoChar(buf, '('); + get_rule_expr(arg, context, true); + if (need_parens) + appendStringInfoChar(buf, ')'); + + /* + * Get and print the field name. 
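/*
 * Illustrative sketch only (hypothetical helper, plain C strings): the
 * AND/OR branches above walk the flattened argument list and join the
 * already-deparsed operands with the boolean keyword, wrapping the result
 * in parentheses unless pretty-printing suppresses them.  The join loop in
 * isolation:
 */
#include <stdio.h>
#include <string.h>

static void sketch_join_bool_args(char *out, size_t outsz,
                                  const char *const *args, int nargs,
                                  const char *keyword, int add_parens)
{
    out[0] = '\0';
    if (add_parens)
        strncat(out, "(", outsz - strlen(out) - 1);
    for (int i = 0; i < nargs; i++)
    {
        if (i > 0)
        {
            strncat(out, " ", outsz - strlen(out) - 1);
            strncat(out, keyword, outsz - strlen(out) - 1);
            strncat(out, " ", outsz - strlen(out) - 1);
        }
        strncat(out, args[i], outsz - strlen(out) - 1);
    }
    if (add_parens)
        strncat(out, ")", outsz - strlen(out) - 1);
}

int main(void)
{
    const char *args[] = { "a > 1", "b IS NOT NULL", "c = 'x'" };
    char buf[256];

    sketch_join_bool_args(buf, sizeof(buf), args, 3, "AND", 1);
    printf("%s\n", buf);    /* (a > 1 AND b IS NOT NULL AND c = 'x') */
    return 0;
}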
+ */ + fieldname = get_name_for_var_field((Var *) arg, fno, + 0, context); + appendStringInfo(buf, ".%s", quote_identifier(fieldname)); + } + break; + + case T_FieldStore: + { + FieldStore *fstore = (FieldStore *) node; + bool need_parens; + + /* + * There is no good way to represent a FieldStore as real SQL, + * so decompilation of INSERT or UPDATE statements should + * always use processIndirection as part of the + * statement-level syntax. We should only get here when + * EXPLAIN tries to print the targetlist of a plan resulting + * from such a statement. The plan case is even harder than + * ordinary rules would be, because the planner tries to + * collapse multiple assignments to the same field or subfield + * into one FieldStore; so we can see a list of target fields + * not just one, and the arguments could be FieldStores + * themselves. We don't bother to try to print the target + * field names; we just print the source arguments, with a + * ROW() around them if there's more than one. This isn't + * terribly complete, but it's probably good enough for + * EXPLAIN's purposes; especially since anything more would be + * either hopelessly confusing or an even poorer + * representation of what the plan is actually doing. + */ + need_parens = (list_length(fstore->newvals) != 1); + if (need_parens) + appendStringInfoString(buf, "ROW("); + get_rule_expr((Node *) fstore->newvals, context, showimplicit); + if (need_parens) + appendStringInfoChar(buf, ')'); + } + break; + + case T_RelabelType: + { + RelabelType *relabel = (RelabelType *) node; + Node *arg = (Node *) relabel->arg; + + if (relabel->relabelformat == COERCE_IMPLICIT_CAST && + !showimplicit) + { + /* don't show the implicit cast */ + get_rule_expr_paren(arg, context, false, node); + } + else + { + get_coercion_expr(arg, context, + relabel->resulttype, + relabel->resulttypmod, + node); + } + } + break; + + case T_CoerceViaIO: + { + CoerceViaIO *iocoerce = (CoerceViaIO *) node; + Node *arg = (Node *) iocoerce->arg; + + if (iocoerce->coerceformat == COERCE_IMPLICIT_CAST && + !showimplicit) + { + /* don't show the implicit cast */ + get_rule_expr_paren(arg, context, false, node); + } + else + { + get_coercion_expr(arg, context, + iocoerce->resulttype, + -1, + node); + } + } + break; + + case T_ArrayCoerceExpr: + { + ArrayCoerceExpr *acoerce = (ArrayCoerceExpr *) node; + Node *arg = (Node *) acoerce->arg; + + if (acoerce->coerceformat == COERCE_IMPLICIT_CAST && + !showimplicit) + { + /* don't show the implicit cast */ + get_rule_expr_paren(arg, context, false, node); + } + else + { + get_coercion_expr(arg, context, + acoerce->resulttype, + acoerce->resulttypmod, + node); + } + } + break; + + case T_ConvertRowtypeExpr: + { + ConvertRowtypeExpr *convert = (ConvertRowtypeExpr *) node; + Node *arg = (Node *) convert->arg; + + if (convert->convertformat == COERCE_IMPLICIT_CAST && + !showimplicit) + { + /* don't show the implicit cast */ + get_rule_expr_paren(arg, context, false, node); + } + else + { + get_coercion_expr(arg, context, + convert->resulttype, -1, + node); + } + } + break; + + case T_CollateExpr: + { + CollateExpr *collate = (CollateExpr *) node; + Node *arg = (Node *) collate->arg; + + if (!PRETTY_PAREN(context)) + appendStringInfoChar(buf, '('); + get_rule_expr_paren(arg, context, showimplicit, node); + appendStringInfo(buf, " COLLATE %s", + generate_collation_name(collate->collOid)); + if (!PRETTY_PAREN(context)) + appendStringInfoChar(buf, ')'); + } + break; + + case T_CaseExpr: + { + CaseExpr *caseexpr = (CaseExpr *) 
node; + ListCell *temp; + + appendContextKeyword(context, "CASE", + 0, PRETTYINDENT_VAR, 0); + if (caseexpr->arg) + { + appendStringInfoChar(buf, ' '); + get_rule_expr((Node *) caseexpr->arg, context, true); + } + foreach(temp, caseexpr->args) + { + CaseWhen *when = (CaseWhen *) lfirst(temp); + Node *w = (Node *) when->expr; + + if (caseexpr->arg) + { + /* + * The parser should have produced WHEN clauses of the + * form "CaseTestExpr = RHS", possibly with an + * implicit coercion inserted above the CaseTestExpr. + * For accurate decompilation of rules it's essential + * that we show just the RHS. However in an + * expression that's been through the optimizer, the + * WHEN clause could be almost anything (since the + * equality operator could have been expanded into an + * inline function). If we don't recognize the form + * of the WHEN clause, just punt and display it as-is. + */ + if (IsA(w, OpExpr)) + { + List *args = ((OpExpr *) w)->args; + + if (list_length(args) == 2 && + IsA(strip_implicit_coercions(linitial(args)), + CaseTestExpr)) + w = (Node *) lsecond(args); + } + } + + if (!PRETTY_INDENT(context)) + appendStringInfoChar(buf, ' '); + appendContextKeyword(context, "WHEN ", + 0, 0, 0); + get_rule_expr(w, context, false); + appendStringInfoString(buf, " THEN "); + get_rule_expr((Node *) when->result, context, true); + } + if (!PRETTY_INDENT(context)) + appendStringInfoChar(buf, ' '); + appendContextKeyword(context, "ELSE ", + 0, 0, 0); + get_rule_expr((Node *) caseexpr->defresult, context, true); + if (!PRETTY_INDENT(context)) + appendStringInfoChar(buf, ' '); + appendContextKeyword(context, "END", + -PRETTYINDENT_VAR, 0, 0); + } + break; + + case T_CaseTestExpr: + { + /* + * Normally we should never get here, since for expressions + * that can contain this node type we attempt to avoid + * recursing to it. But in an optimized expression we might + * be unable to avoid that (see comments for CaseExpr). If we + * do see one, print it as CASE_TEST_EXPR. + */ + appendStringInfoString(buf, "CASE_TEST_EXPR"); + } + break; + + case T_ArrayExpr: + { + ArrayExpr *arrayexpr = (ArrayExpr *) node; + + appendStringInfoString(buf, "ARRAY["); + get_rule_expr((Node *) arrayexpr->elements, context, true); + appendStringInfoChar(buf, ']'); + + /* + * If the array isn't empty, we assume its elements are + * coerced to the desired type. If it's empty, though, we + * need an explicit coercion to the array type. + */ + if (arrayexpr->elements == NIL) + appendStringInfo(buf, "::%s", + format_type_with_typemod(arrayexpr->array_typeid, -1)); + } + break; + + case T_RowExpr: + { + RowExpr *rowexpr = (RowExpr *) node; + TupleDesc tupdesc = NULL; + ListCell *arg; + int i; + char *sep; + + /* + * If it's a named type and not RECORD, we may have to skip + * dropped columns and/or claim there are NULLs for added + * columns. + */ + if (rowexpr->row_typeid != RECORDOID) + { + tupdesc = lookup_rowtype_tupdesc(rowexpr->row_typeid, -1); + Assert(list_length(rowexpr->args) <= tupdesc->natts); + } + + /* + * SQL99 allows "ROW" to be omitted when there is more than + * one column, but for simplicity we always print it. 
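/*
 * Illustrative sketch only (hypothetical names, already-deparsed strings):
 * the CaseExpr branch above prints "CASE [arg] WHEN ... THEN ... ELSE ...
 * END", and when a test argument is present it shows only the RHS of the
 * parser-generated "CaseTestExpr = RHS" comparison.  String assembly in
 * isolation:
 */
#include <stdio.h>
#include <string.h>

static void sketch_case(char *out, size_t outsz, const char *arg,
                        const char *const *whens, const char *const *thens,
                        int nclauses, const char *elseval)
{
    snprintf(out, outsz, "CASE%s%s", arg ? " " : "", arg ? arg : "");
    for (int i = 0; i < nclauses; i++)
    {
        size_t used = strlen(out);

        snprintf(out + used, outsz - used, " WHEN %s THEN %s",
                 whens[i], thens[i]);
    }
    size_t used = strlen(out);

    snprintf(out + used, outsz - used, " ELSE %s END", elseval);
}

int main(void)
{
    const char *whens[] = { "1", "2" };
    const char *thens[] = { "'one'", "'two'" };
    char buf[256];

    sketch_case(buf, sizeof(buf), "x", whens, thens, 2, "'many'");
    /* CASE x WHEN 1 THEN 'one' WHEN 2 THEN 'two' ELSE 'many' END */
    printf("%s\n", buf);
    return 0;
}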
+ */ + appendStringInfoString(buf, "ROW("); + sep = ""; + i = 0; + foreach(arg, rowexpr->args) + { + Node *e = (Node *) lfirst(arg); + + if (tupdesc == NULL || + !tupdesc->attrs[i]->attisdropped) + { + appendStringInfoString(buf, sep); + /* Whole-row Vars need special treatment here */ + get_rule_expr_toplevel(e, context, true); + sep = ", "; + } + i++; + } + if (tupdesc != NULL) + { + while (i < tupdesc->natts) + { + if (!tupdesc->attrs[i]->attisdropped) + { + appendStringInfoString(buf, sep); + appendStringInfoString(buf, "NULL"); + sep = ", "; + } + i++; + } + + ReleaseTupleDesc(tupdesc); + } + appendStringInfoChar(buf, ')'); + if (rowexpr->row_format == COERCE_EXPLICIT_CAST) + appendStringInfo(buf, "::%s", + format_type_with_typemod(rowexpr->row_typeid, -1)); + } + break; + + case T_RowCompareExpr: + { + RowCompareExpr *rcexpr = (RowCompareExpr *) node; + ListCell *arg; + char *sep; + + /* + * SQL99 allows "ROW" to be omitted when there is more than + * one column, but for simplicity we always print it. + */ + appendStringInfoString(buf, "(ROW("); + sep = ""; + foreach(arg, rcexpr->largs) + { + Node *e = (Node *) lfirst(arg); + + appendStringInfoString(buf, sep); + get_rule_expr(e, context, true); + sep = ", "; + } + + /* + * We assume that the name of the first-column operator will + * do for all the rest too. This is definitely open to + * failure, eg if some but not all operators were renamed + * since the construct was parsed, but there seems no way to + * be perfect. + */ + appendStringInfo(buf, ") %s ROW(", + generate_operator_name(linitial_oid(rcexpr->opnos), + exprType(linitial(rcexpr->largs)), + exprType(linitial(rcexpr->rargs)))); + sep = ""; + foreach(arg, rcexpr->rargs) + { + Node *e = (Node *) lfirst(arg); + + appendStringInfoString(buf, sep); + get_rule_expr(e, context, true); + sep = ", "; + } + appendStringInfoString(buf, "))"); + } + break; + + case T_CoalesceExpr: + { + CoalesceExpr *coalesceexpr = (CoalesceExpr *) node; + + appendStringInfoString(buf, "COALESCE("); + get_rule_expr((Node *) coalesceexpr->args, context, true); + appendStringInfoChar(buf, ')'); + } + break; + + case T_MinMaxExpr: + { + MinMaxExpr *minmaxexpr = (MinMaxExpr *) node; + + switch (minmaxexpr->op) + { + case IS_GREATEST: + appendStringInfoString(buf, "GREATEST("); + break; + case IS_LEAST: + appendStringInfoString(buf, "LEAST("); + break; + } + get_rule_expr((Node *) minmaxexpr->args, context, true); + appendStringInfoChar(buf, ')'); + } + break; + + case T_SQLValueFunction: + { + SQLValueFunction *svf = (SQLValueFunction *) node; + + /* + * Note: this code knows that typmod for time, timestamp, and + * timestamptz just prints as integer. 
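/*
 * Illustrative sketch only (hypothetical struct standing in for the tuple
 * descriptor): the RowExpr branch above skips arguments whose attributes
 * have been dropped from the named row type and prints NULL for attributes
 * added after the expression was stored, so the printed column count
 * matches the current rowtype.
 */
#include <stdio.h>
#include <string.h>

struct sketch_att { int isdropped; };

static void sketch_row(char *out, size_t outsz,
                       const char *const *args, int nargs,
                       const struct sketch_att *atts, int natts)
{
    const char *sep = "";
    int i = 0;

    snprintf(out, outsz, "ROW(");
    for (; i < nargs; i++)
    {
        if (!atts[i].isdropped)
        {
            size_t used = strlen(out);

            snprintf(out + used, outsz - used, "%s%s", sep, args[i]);
            sep = ", ";
        }
    }
    for (; i < natts; i++)          /* attributes added since args was built */
    {
        if (!atts[i].isdropped)
        {
            size_t used = strlen(out);

            snprintf(out + used, outsz - used, "%sNULL", sep);
            sep = ", ";
        }
    }
    strncat(out, ")", outsz - strlen(out) - 1);
}

int main(void)
{
    const char *args[] = { "1", "'dropped'", "t.x" };
    struct sketch_att atts[] = { {0}, {1}, {0}, {0} };   /* 2nd dropped, 4th new */
    char buf[128];

    sketch_row(buf, sizeof(buf), args, 3, atts, 4);
    printf("%s\n", buf);            /* ROW(1, t.x, NULL) */
    return 0;
}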
+ */ + switch (svf->op) + { + case SVFOP_CURRENT_DATE: + appendStringInfoString(buf, "CURRENT_DATE"); + break; + case SVFOP_CURRENT_TIME: + appendStringInfoString(buf, "CURRENT_TIME"); + break; + case SVFOP_CURRENT_TIME_N: + appendStringInfo(buf, "CURRENT_TIME(%d)", svf->typmod); + break; + case SVFOP_CURRENT_TIMESTAMP: + appendStringInfoString(buf, "CURRENT_TIMESTAMP"); + break; + case SVFOP_CURRENT_TIMESTAMP_N: + appendStringInfo(buf, "CURRENT_TIMESTAMP(%d)", + svf->typmod); + break; + case SVFOP_LOCALTIME: + appendStringInfoString(buf, "LOCALTIME"); + break; + case SVFOP_LOCALTIME_N: + appendStringInfo(buf, "LOCALTIME(%d)", svf->typmod); + break; + case SVFOP_LOCALTIMESTAMP: + appendStringInfoString(buf, "LOCALTIMESTAMP"); + break; + case SVFOP_LOCALTIMESTAMP_N: + appendStringInfo(buf, "LOCALTIMESTAMP(%d)", + svf->typmod); + break; + case SVFOP_CURRENT_ROLE: + appendStringInfoString(buf, "CURRENT_ROLE"); + break; + case SVFOP_CURRENT_USER: + appendStringInfoString(buf, "CURRENT_USER"); + break; + case SVFOP_USER: + appendStringInfoString(buf, "USER"); + break; + case SVFOP_SESSION_USER: + appendStringInfoString(buf, "SESSION_USER"); + break; + case SVFOP_CURRENT_CATALOG: + appendStringInfoString(buf, "CURRENT_CATALOG"); + break; + case SVFOP_CURRENT_SCHEMA: + appendStringInfoString(buf, "CURRENT_SCHEMA"); + break; + } + } + break; + + case T_NextValueExpr: + { + /* + * This gets invoked by Fast Query Shipping code to deparse a + * query. It seems enough to just generate a "DEFAULT" clause + * and let the remote datanode handle finding the correct + * sequence for replica identity. + * + * XXX PG10MERGE: If we do see issues with this, it might be + * worthwhile to consider generating an expression such as, + * nextval('sequence_name'::regclass) + */ + appendStringInfoString(buf, "DEFAULT"); + } + break; + + case T_XmlExpr: + { + XmlExpr *xexpr = (XmlExpr *) node; + bool needcomma = false; + ListCell *arg; + ListCell *narg; + Const *con; + + switch (xexpr->op) + { + case IS_XMLCONCAT: + appendStringInfoString(buf, "XMLCONCAT("); + break; + case IS_XMLELEMENT: + appendStringInfoString(buf, "XMLELEMENT("); + break; + case IS_XMLFOREST: + appendStringInfoString(buf, "XMLFOREST("); + break; + case IS_XMLPARSE: + appendStringInfoString(buf, "XMLPARSE("); + break; + case IS_XMLPI: + appendStringInfoString(buf, "XMLPI("); + break; + case IS_XMLROOT: + appendStringInfoString(buf, "XMLROOT("); + break; + case IS_XMLSERIALIZE: + appendStringInfoString(buf, "XMLSERIALIZE("); + break; + case IS_DOCUMENT: + break; + } + if (xexpr->op == IS_XMLPARSE || xexpr->op == IS_XMLSERIALIZE) + { + if (xexpr->xmloption == XMLOPTION_DOCUMENT) + appendStringInfoString(buf, "DOCUMENT "); + else + appendStringInfoString(buf, "CONTENT "); + } + if (xexpr->name) + { + appendStringInfo(buf, "NAME %s", + quote_identifier(map_xml_name_to_sql_identifier(xexpr->name))); + needcomma = true; + } + if (xexpr->named_args) + { + if (xexpr->op != IS_XMLFOREST) + { + if (needcomma) + appendStringInfoString(buf, ", "); + appendStringInfoString(buf, "XMLATTRIBUTES("); + needcomma = false; + } + forboth(arg, xexpr->named_args, narg, xexpr->arg_names) + { + Node *e = (Node *) lfirst(arg); + char *argname = strVal(lfirst(narg)); + + if (needcomma) + appendStringInfoString(buf, ", "); + get_rule_expr((Node *) e, context, true); + appendStringInfo(buf, " AS %s", + quote_identifier(map_xml_name_to_sql_identifier(argname))); + needcomma = true; + } + if (xexpr->op != IS_XMLFOREST) + appendStringInfoChar(buf, ')'); + } + if (xexpr->args) + 
{ + if (needcomma) + appendStringInfoString(buf, ", "); + switch (xexpr->op) + { + case IS_XMLCONCAT: + case IS_XMLELEMENT: + case IS_XMLFOREST: + case IS_XMLPI: + case IS_XMLSERIALIZE: + /* no extra decoration needed */ + get_rule_expr((Node *) xexpr->args, context, true); + break; + case IS_XMLPARSE: + Assert(list_length(xexpr->args) == 2); + + get_rule_expr((Node *) linitial(xexpr->args), + context, true); + + con = lsecond_node(Const, xexpr->args); + Assert(!con->constisnull); + if (DatumGetBool(con->constvalue)) + appendStringInfoString(buf, + " PRESERVE WHITESPACE"); + else + appendStringInfoString(buf, + " STRIP WHITESPACE"); + break; + case IS_XMLROOT: + Assert(list_length(xexpr->args) == 3); + + get_rule_expr((Node *) linitial(xexpr->args), + context, true); + + appendStringInfoString(buf, ", VERSION "); + con = (Const *) lsecond(xexpr->args); + if (IsA(con, Const) && + con->constisnull) + appendStringInfoString(buf, "NO VALUE"); + else + get_rule_expr((Node *) con, context, false); + + con = lthird_node(Const, xexpr->args); + if (con->constisnull) + /* suppress STANDALONE NO VALUE */ ; + else + { + switch (DatumGetInt32(con->constvalue)) + { + case XML_STANDALONE_YES: + appendStringInfoString(buf, + ", STANDALONE YES"); + break; + case XML_STANDALONE_NO: + appendStringInfoString(buf, + ", STANDALONE NO"); + break; + case XML_STANDALONE_NO_VALUE: + appendStringInfoString(buf, + ", STANDALONE NO VALUE"); + break; + default: + break; + } + } + break; + case IS_DOCUMENT: + get_rule_expr_paren((Node *) xexpr->args, context, false, node); + break; + } + + } + if (xexpr->op == IS_XMLSERIALIZE) + appendStringInfo(buf, " AS %s", + format_type_with_typemod(xexpr->type, + xexpr->typmod)); + if (xexpr->op == IS_DOCUMENT) + appendStringInfoString(buf, " IS DOCUMENT"); + else + appendStringInfoChar(buf, ')'); + } + break; + + case T_NullTest: + { + NullTest *ntest = (NullTest *) node; + + if (!PRETTY_PAREN(context)) + appendStringInfoChar(buf, '('); + get_rule_expr_paren((Node *) ntest->arg, context, true, node); + + /* + * For scalar inputs, we prefer to print as IS [NOT] NULL, + * which is shorter and traditional. If it's a rowtype input + * but we're applying a scalar test, must print IS [NOT] + * DISTINCT FROM NULL to be semantically correct. 
+ */ + if (ntest->argisrow || + !type_is_rowtype(exprType((Node *) ntest->arg))) + { + switch (ntest->nulltesttype) + { + case IS_NULL: + appendStringInfoString(buf, " IS NULL"); + break; + case IS_NOT_NULL: + appendStringInfoString(buf, " IS NOT NULL"); + break; + default: + elog(ERROR, "unrecognized nulltesttype: %d", + (int) ntest->nulltesttype); + } + } + else + { + switch (ntest->nulltesttype) + { + case IS_NULL: + appendStringInfoString(buf, " IS NOT DISTINCT FROM NULL"); + break; + case IS_NOT_NULL: + appendStringInfoString(buf, " IS DISTINCT FROM NULL"); + break; + default: + elog(ERROR, "unrecognized nulltesttype: %d", + (int) ntest->nulltesttype); + } + } + if (!PRETTY_PAREN(context)) + appendStringInfoChar(buf, ')'); + } + break; + + case T_BooleanTest: + { + BooleanTest *btest = (BooleanTest *) node; + + if (!PRETTY_PAREN(context)) + appendStringInfoChar(buf, '('); + get_rule_expr_paren((Node *) btest->arg, context, false, node); + switch (btest->booltesttype) + { + case IS_TRUE: + appendStringInfoString(buf, " IS TRUE"); + break; + case IS_NOT_TRUE: + appendStringInfoString(buf, " IS NOT TRUE"); + break; + case IS_FALSE: + appendStringInfoString(buf, " IS FALSE"); + break; + case IS_NOT_FALSE: + appendStringInfoString(buf, " IS NOT FALSE"); + break; + case IS_UNKNOWN: + appendStringInfoString(buf, " IS UNKNOWN"); + break; + case IS_NOT_UNKNOWN: + appendStringInfoString(buf, " IS NOT UNKNOWN"); + break; + default: + elog(ERROR, "unrecognized booltesttype: %d", + (int) btest->booltesttype); + } + if (!PRETTY_PAREN(context)) + appendStringInfoChar(buf, ')'); + } + break; + + case T_CoerceToDomain: + { + CoerceToDomain *ctest = (CoerceToDomain *) node; + Node *arg = (Node *) ctest->arg; + + if (ctest->coercionformat == COERCE_IMPLICIT_CAST && + !showimplicit) + { + /* don't show the implicit cast */ + get_rule_expr(arg, context, false); + } + else + { + get_coercion_expr(arg, context, + ctest->resulttype, + ctest->resulttypmod, + node); + } + } + break; + + case T_CoerceToDomainValue: + appendStringInfoString(buf, "VALUE"); + break; + + case T_SetToDefault: + appendStringInfoString(buf, "DEFAULT"); + break; + + case T_CurrentOfExpr: + { + CurrentOfExpr *cexpr = (CurrentOfExpr *) node; + + if (cexpr->cursor_name) + appendStringInfo(buf, "CURRENT OF %s", + quote_identifier(cexpr->cursor_name)); + else + appendStringInfo(buf, "CURRENT OF $%d", + cexpr->cursor_param); + } + break; + + case T_InferenceElem: + { + InferenceElem *iexpr = (InferenceElem *) node; + bool save_varprefix; + bool need_parens; + + /* + * InferenceElem can only refer to target relation, so a + * prefix is not useful, and indeed would cause parse errors. + */ + save_varprefix = context->varprefix; + context->varprefix = false; + + /* + * Parenthesize the element unless it's a simple Var or a bare + * function call. Follows pg_get_indexdef_worker(). 
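/*
 * Illustrative sketch only (plain C, hypothetical helper): the NullTest
 * branch above prints IS [NOT] NULL for scalar inputs, but when a scalar
 * test is applied to a row-typed input it must print IS [NOT] DISTINCT
 * FROM NULL to keep the same semantics, since a row with some NULL fields
 * is neither IS NULL nor IS NOT NULL.
 */
#include <stdio.h>

static const char *sketch_nulltest_suffix(int test_is_null,
                                          int argisrow, int arg_is_rowtype)
{
    if (argisrow || !arg_is_rowtype)
        return test_is_null ? " IS NULL" : " IS NOT NULL";
    return test_is_null ? " IS NOT DISTINCT FROM NULL"
                        : " IS DISTINCT FROM NULL";
}

int main(void)
{
    printf("scalar: x%s\n", sketch_nulltest_suffix(1, 0, 0));
    printf("rowtype, scalar test: r%s\n", sketch_nulltest_suffix(1, 0, 1));
    return 0;
}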
+ */ + need_parens = !IsA(iexpr->expr, Var); + if (IsA(iexpr->expr, FuncExpr) && + ((FuncExpr *) iexpr->expr)->funcformat == + COERCE_EXPLICIT_CALL) + need_parens = false; + + if (need_parens) + appendStringInfoChar(buf, '('); + get_rule_expr((Node *) iexpr->expr, + context, false); + if (need_parens) + appendStringInfoChar(buf, ')'); + + context->varprefix = save_varprefix; + + if (iexpr->infercollid) + appendStringInfo(buf, " COLLATE %s", + generate_collation_name(iexpr->infercollid)); + + /* Add the operator class name, if not default */ + if (iexpr->inferopclass) + { + Oid inferopclass = iexpr->inferopclass; + Oid inferopcinputtype = get_opclass_input_type(iexpr->inferopclass); + + get_opclass_name(inferopclass, inferopcinputtype, buf); + } + } + break; + + case T_PartitionBoundSpec: + { + PartitionBoundSpec *spec = (PartitionBoundSpec *) node; + ListCell *cell; + char *sep; + + switch (spec->strategy) + { + case PARTITION_STRATEGY_LIST: + Assert(spec->listdatums != NIL); + + appendStringInfoString(buf, "FOR VALUES IN ("); + sep = ""; + foreach(cell, spec->listdatums) + { + Const *val = castNode(Const, lfirst(cell)); + + appendStringInfoString(buf, sep); + get_const_expr(val, context, -1); + sep = ", "; + } + + appendStringInfoString(buf, ")"); + break; + + case PARTITION_STRATEGY_RANGE: + Assert(spec->lowerdatums != NIL && + spec->upperdatums != NIL && + list_length(spec->lowerdatums) == + list_length(spec->upperdatums)); + + appendStringInfo(buf, "FOR VALUES FROM %s TO %s", + get_range_partbound_string(spec->lowerdatums), + get_range_partbound_string(spec->upperdatums)); + break; + + default: + elog(ERROR, "unrecognized partition strategy: %d", + (int) spec->strategy); + break; + } + } + break; + + case T_List: + { + char *sep; + ListCell *l; + + sep = ""; + foreach(l, (List *) node) + { + appendStringInfoString(buf, sep); + get_rule_expr((Node *) lfirst(l), context, showimplicit); + sep = ", "; + } + } + break; + + case T_TableFunc: + get_tablefunc((TableFunc *) node, context, showimplicit); + break; + + default: + elog(ERROR, "unrecognized node type: %d", (int) nodeTag(node)); + break; + } +} + +/* + * get_rule_expr_toplevel - Parse back a toplevel expression + * + * Same as get_rule_expr(), except that if the expr is just a Var, we pass + * istoplevel = true not false to get_variable(). This causes whole-row Vars + * to get printed with decoration that will prevent expansion of "*". + * We need to use this in contexts such as ROW() and VALUES(), where the + * parser would expand "foo.*" appearing at top level. (In principle we'd + * use this in get_target_list() too, but that has additional worries about + * whether to print AS, so it needs to invoke get_variable() directly anyway.) + */ +static void +get_rule_expr_toplevel(Node *node, deparse_context *context, + bool showimplicit) +{ + if (node && IsA(node, Var)) + (void) get_variable((Var *) node, 0, true, context); + else + get_rule_expr(node, context, showimplicit); +} + +/* + * get_rule_expr_funccall - Parse back a function-call expression + * + * Same as get_rule_expr(), except that we guarantee that the output will + * look like a function call, or like one of the things the grammar treats as + * equivalent to a function call (see the func_expr_windowless production). + * This is needed in places where the grammar uses func_expr_windowless and + * you can't substitute a parenthesized a_expr. 
If what we have isn't going + * to look like a function call, wrap it in a dummy CAST() expression, which + * will satisfy the grammar --- and, indeed, is likely what the user wrote to + * produce such a thing. + */ +static void +get_rule_expr_funccall(Node *node, deparse_context *context, + bool showimplicit) +{ + if (looks_like_function(node)) + get_rule_expr(node, context, showimplicit); + else + { + StringInfo buf = context->buf; + + appendStringInfoString(buf, "CAST("); + /* no point in showing any top-level implicit cast */ + get_rule_expr(node, context, false); + appendStringInfo(buf, " AS %s)", + format_type_with_typemod(exprType(node), + exprTypmod(node))); + } +} + +/* + * Helper function to identify node types that satisfy func_expr_windowless. + * If in doubt, "false" is always a safe answer. + */ +static bool +looks_like_function(Node *node) +{// #lizard forgives + if (node == NULL) + return false; /* probably shouldn't happen */ + switch (nodeTag(node)) + { + case T_FuncExpr: + /* OK, unless it's going to deparse as a cast */ + return (((FuncExpr *) node)->funcformat == COERCE_EXPLICIT_CALL); + case T_NullIfExpr: + case T_CoalesceExpr: + case T_MinMaxExpr: + case T_SQLValueFunction: + case T_XmlExpr: + /* these are all accepted by func_expr_common_subexpr */ + return true; + default: + break; + } + return false; +} + + +/* + * get_oper_expr - Parse back an OpExpr node + */ +static void +get_oper_expr(OpExpr *expr, deparse_context *context) +{ + StringInfo buf = context->buf; + Oid opno = expr->opno; + List *args = expr->args; + + if (!PRETTY_PAREN(context)) + appendStringInfoChar(buf, '('); + if (list_length(args) == 2) + { + /* binary operator */ + Node *arg1 = (Node *) linitial(args); + Node *arg2 = (Node *) lsecond(args); + + get_rule_expr_paren(arg1, context, true, (Node *) expr); + appendStringInfo(buf, " %s ", + generate_operator_name(opno, + exprType(arg1), + exprType(arg2))); + get_rule_expr_paren(arg2, context, true, (Node *) expr); + } + else + { + /* unary operator --- but which side? */ + Node *arg = (Node *) linitial(args); + HeapTuple tp; + Form_pg_operator optup; + + tp = SearchSysCache1(OPEROID, ObjectIdGetDatum(opno)); + if (!HeapTupleIsValid(tp)) + elog(ERROR, "cache lookup failed for operator %u", opno); + optup = (Form_pg_operator) GETSTRUCT(tp); + switch (optup->oprkind) + { + case 'l': + appendStringInfo(buf, "%s ", + generate_operator_name(opno, + InvalidOid, + exprType(arg))); + get_rule_expr_paren(arg, context, true, (Node *) expr); + break; + case 'r': + get_rule_expr_paren(arg, context, true, (Node *) expr); + appendStringInfo(buf, " %s", + generate_operator_name(opno, + exprType(arg), + InvalidOid)); + break; + default: + elog(ERROR, "bogus oprkind: %d", optup->oprkind); + } + ReleaseSysCache(tp); + } + if (!PRETTY_PAREN(context)) + appendStringInfoChar(buf, ')'); +} + +/* + * get_func_expr - Parse back a FuncExpr node + */ +static void +get_func_expr(FuncExpr *expr, deparse_context *context, + bool showimplicit) +{// #lizard forgives + StringInfo buf = context->buf; + Oid funcoid = expr->funcid; + Oid argtypes[FUNC_MAX_ARGS]; + int nargs; + List *argnames; + bool use_variadic; + ListCell *l; + + /* + * If the function call came from an implicit coercion, then just show the + * first argument --- unless caller wants to see implicit coercions. 
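/*
 * Illustrative sketch only: get_rule_expr_funccall() above wraps anything
 * that does not already look like a function call in CAST(expr AS type) so
 * the output still satisfies the func_expr_windowless grammar rule.  The
 * boolean below is a hypothetical stand-in for looks_like_function(),
 * since this sketch works on already-deparsed text rather than parse nodes.
 */
#include <stdio.h>

static void sketch_funccall_form(char *out, size_t outsz,
                                 const char *expr, const char *typname,
                                 int looks_like_function_call)
{
    if (looks_like_function_call)
        snprintf(out, outsz, "%s", expr);
    else
        snprintf(out, outsz, "CAST(%s AS %s)", expr, typname);
}

int main(void)
{
    char buf[128];

    sketch_funccall_form(buf, sizeof(buf), "a + b", "integer", 0);
    printf("%s\n", buf);            /* CAST(a + b AS integer) */
    return 0;
}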
+ */ + if (expr->funcformat == COERCE_IMPLICIT_CAST && !showimplicit) + { + get_rule_expr_paren((Node *) linitial(expr->args), context, + false, (Node *) expr); + return; + } + + /* + * If the function call came from a cast, then show the first argument + * plus an explicit cast operation. + */ + if (expr->funcformat == COERCE_EXPLICIT_CAST || + expr->funcformat == COERCE_IMPLICIT_CAST) + { + Node *arg = linitial(expr->args); + Oid rettype = expr->funcresulttype; + int32 coercedTypmod; + + /* Get the typmod if this is a length-coercion function */ + (void) exprIsLengthCoercion((Node *) expr, &coercedTypmod); + + get_coercion_expr(arg, context, + rettype, coercedTypmod, + (Node *) expr); + + return; + } + + /* + * Normal function: display as proname(args). First we need to extract + * the argument datatypes. + */ + if (list_length(expr->args) > FUNC_MAX_ARGS) + ereport(ERROR, + (errcode(ERRCODE_TOO_MANY_ARGUMENTS), + errmsg("too many arguments"))); + nargs = 0; + argnames = NIL; + foreach(l, expr->args) + { + Node *arg = (Node *) lfirst(l); + + if (IsA(arg, NamedArgExpr)) + argnames = lappend(argnames, ((NamedArgExpr *) arg)->name); + argtypes[nargs] = exprType(arg); + nargs++; + } + + appendStringInfo(buf, "%s(", + generate_function_name(funcoid, nargs, + argnames, argtypes, + expr->funcvariadic, + &use_variadic, + context->special_exprkind)); + nargs = 0; + foreach(l, expr->args) + { + if (nargs++ > 0) + appendStringInfoString(buf, ", "); + if (use_variadic && lnext(l) == NULL) + appendStringInfoString(buf, "VARIADIC "); + get_rule_expr((Node *) lfirst(l), context, true); + } + appendStringInfoChar(buf, ')'); +} + +/* + * get_agg_expr - Parse back an Aggref node + */ +static void +get_agg_expr(Aggref *aggref, deparse_context *context, + Aggref *original_aggref) +{// #lizard forgives + StringInfo buf = context->buf; + Oid argtypes[FUNC_MAX_ARGS]; + int nargs; +#ifdef PGXC +// bool added_finalfn = false; +#endif /* PGXC */ + + bool use_variadic; + + /* + * For a combining aggregate, we look up and deparse the corresponding + * partial aggregate instead. This is necessary because our input + * argument list has been replaced; the new argument list always has just + * one element, which will point to a partial Aggref that supplies us with + * transition states to combine. + */ + if (DO_AGGSPLIT_COMBINE(aggref->aggsplit)) + { + TargetEntry *tle = linitial_node(TargetEntry, aggref->args); + + Assert(list_length(aggref->args) == 1); + resolve_special_varno((Node *) tle->expr, context, original_aggref, + get_agg_combine_expr); + return; + } + + /* + * Mark as PARTIAL, if appropriate. We look to the original aggref so as + * to avoid printing this when recursing from the code just above. + */ + if (DO_AGGSPLIT_SKIPFINAL(original_aggref->aggsplit)) + appendStringInfoString(buf, "PARTIAL "); + + /* Extract the argument types as seen by the parser */ + nargs = get_aggregate_argtypes(aggref, argtypes); + + /* Print the aggregate name, schema-qualified if needed */ + appendStringInfo(buf, "%s(%s", + generate_function_name(aggref->aggfnoid, nargs, + NIL, argtypes, + aggref->aggvariadic, + &use_variadic, + context->special_exprkind), + (aggref->aggdistinct != NIL) ? "DISTINCT " : ""); + + if (AGGKIND_IS_ORDERED_SET(aggref->aggkind)) + { + /* + * Ordered-set aggregates do not use "*" syntax. Also, we needn't + * worry about inserting VARIADIC. So we can just dump the direct + * args as-is. 
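/*
 * Illustrative sketch only (plain strings, hypothetical helper): the
 * normal-function path above prints "name(arg, arg, ...)" and, when the
 * call used an explicit VARIADIC array, prefixes the last argument with
 * the VARIADIC keyword.  The argument-join loop in isolation:
 */
#include <stdio.h>
#include <string.h>

static void sketch_func_call(char *out, size_t outsz, const char *name,
                             const char *const *args, int nargs,
                             int use_variadic)
{
    snprintf(out, outsz, "%s(", name);
    for (int i = 0; i < nargs; i++)
    {
        size_t used = strlen(out);

        snprintf(out + used, outsz - used, "%s%s%s",
                 i > 0 ? ", " : "",
                 (use_variadic && i == nargs - 1) ? "VARIADIC " : "",
                 args[i]);
    }
    strncat(out, ")", outsz - strlen(out) - 1);
}

int main(void)
{
    const char *args[] = { "fmt", "arr" };
    char buf[128];

    sketch_func_call(buf, sizeof(buf), "format", args, 2, 1);
    printf("%s\n", buf);            /* format(fmt, VARIADIC arr) */
    return 0;
}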
+ */ + Assert(!aggref->aggvariadic); + get_rule_expr((Node *) aggref->aggdirectargs, context, true); + Assert(aggref->aggorder != NIL); + appendStringInfoString(buf, ") WITHIN GROUP (ORDER BY "); + get_rule_orderby(aggref->aggorder, aggref->args, false, context); + } + else + { + /* aggstar can be set only in zero-argument aggregates */ + if (aggref->aggstar) + appendStringInfoChar(buf, '*'); + else + { + ListCell *l; + int i; + + i = 0; + foreach(l, aggref->args) + { + TargetEntry *tle = (TargetEntry *) lfirst(l); + Node *arg = (Node *) tle->expr; + + Assert(!IsA(arg, NamedArgExpr)); + if (tle->resjunk) + continue; + if (i++ > 0) + appendStringInfoString(buf, ", "); + if (use_variadic && i == nargs) + appendStringInfoString(buf, "VARIADIC "); + get_rule_expr(arg, context, true); + } + } + + if (aggref->aggorder != NIL) + { + appendStringInfoString(buf, " ORDER BY "); + get_rule_orderby(aggref->aggorder, aggref->args, false, context); + } + } + + if (aggref->aggfilter != NULL) + { + appendStringInfoString(buf, ") FILTER (WHERE "); + get_rule_expr((Node *) aggref->aggfilter, context, false); + } + + appendStringInfoChar(buf, ')'); + +} + +/* + * This is a helper function for get_agg_expr(). It's used when we deparse + * a combining Aggref; resolve_special_varno locates the corresponding partial + * Aggref and then calls this. + */ +static void +get_agg_combine_expr(Node *node, deparse_context *context, void *private) +{ + Aggref *aggref; + Aggref *original_aggref = private; + + if (!IsA(node, Aggref)) + elog(ERROR, "combining Aggref does not point to an Aggref"); + + aggref = (Aggref *) node; + get_agg_expr(aggref, context, original_aggref); +} + +/* + * get_windowfunc_expr - Parse back a WindowFunc node + */ +static void +get_windowfunc_expr(WindowFunc *wfunc, deparse_context *context) +{// #lizard forgives + StringInfo buf = context->buf; + Oid argtypes[FUNC_MAX_ARGS]; + int nargs; + List *argnames; + ListCell *l; + + if (list_length(wfunc->args) > FUNC_MAX_ARGS) + ereport(ERROR, + (errcode(ERRCODE_TOO_MANY_ARGUMENTS), + errmsg("too many arguments"))); + nargs = 0; + argnames = NIL; + foreach(l, wfunc->args) + { + Node *arg = (Node *) lfirst(l); + + if (IsA(arg, NamedArgExpr)) + argnames = lappend(argnames, ((NamedArgExpr *) arg)->name); + argtypes[nargs] = exprType(arg); + nargs++; + } + + appendStringInfo(buf, "%s(", + generate_function_name(wfunc->winfnoid, nargs, + argnames, argtypes, + false, NULL, + context->special_exprkind)); + /* winstar can be set only in zero-argument aggregates */ + if (wfunc->winstar) + appendStringInfoChar(buf, '*'); + else + get_rule_expr((Node *) wfunc->args, context, true); + + if (wfunc->aggfilter != NULL) + { + appendStringInfoString(buf, ") FILTER (WHERE "); + get_rule_expr((Node *) wfunc->aggfilter, context, false); + } + + appendStringInfoString(buf, ") OVER "); + + foreach(l, context->windowClause) + { + WindowClause *wc = (WindowClause *) lfirst(l); + + if (wc->winref == wfunc->winref) + { + if (wc->name) + appendStringInfoString(buf, quote_identifier(wc->name)); + else + get_rule_windowspec(wc, context->windowTList, context); + break; + } + } + if (l == NULL) + { + if (context->windowClause) + elog(ERROR, "could not find window clause for winref %u", + wfunc->winref); + + /* + * In EXPLAIN, we don't have window context information available, so + * we have to settle for this: + */ + appendStringInfoString(buf, "(?)"); + } +} + +/* ---------- + * get_coercion_expr + * + * Make a string representation of a value coerced to a specific type + * 
---------- + */ +static void +get_coercion_expr(Node *arg, deparse_context *context, + Oid resulttype, int32 resulttypmod, + Node *parentNode) +{ + StringInfo buf = context->buf; + + /* + * Since parse_coerce.c doesn't immediately collapse application of + * length-coercion functions to constants, what we'll typically see in + * such cases is a Const with typmod -1 and a length-coercion function + * right above it. Avoid generating redundant output. However, beware of + * suppressing casts when the user actually wrote something like + * 'foo'::text::char(3). + * + * Note: it might seem that we are missing the possibility of needing to + * print a COLLATE clause for such a Const. However, a Const could only + * have nondefault collation in a post-constant-folding tree, in which the + * length coercion would have been folded too. See also the special + * handling of CollateExpr in coerce_to_target_type(): any collation + * marking will be above the coercion node, not below it. + */ + if (arg && IsA(arg, Const) && + ((Const *) arg)->consttype == resulttype && + ((Const *) arg)->consttypmod == -1) + { + /* Show the constant without normal ::typename decoration */ + get_const_expr((Const *) arg, context, -1); + } + else + { + if (!PRETTY_PAREN(context)) + appendStringInfoChar(buf, '('); + get_rule_expr_paren(arg, context, false, parentNode); + if (!PRETTY_PAREN(context)) + appendStringInfoChar(buf, ')'); + } + appendStringInfo(buf, "::%s", + format_type_with_typemod(resulttype, resulttypmod)); +} + +/* ---------- + * get_const_expr + * + * Make a string representation of a Const + * + * showtype can be -1 to never show "::typename" decoration, or +1 to always + * show it, or 0 to show it only if the constant wouldn't be assumed to be + * the right type by default. + * + * If the Const's collation isn't default for its type, show that too. + * We mustn't do this when showtype is -1 (since that means the caller will + * print "::typename", and we can't put a COLLATE clause in between). It's + * caller's responsibility that collation isn't missed in such cases. + * ---------- + */ +static void +get_const_expr(Const *constval, deparse_context *context, int showtype) +{// #lizard forgives + StringInfo buf = context->buf; + Oid typoutput; + bool typIsVarlena; + char *extval; + bool needlabel = false; + + if (constval->constisnull) + { + /* + * Always label the type of a NULL constant to prevent misdecisions + * about type when reparsing. + */ + appendStringInfoString(buf, "NULL"); + if (showtype >= 0) + { + appendStringInfo(buf, "::%s", + format_type_with_typemod(constval->consttype, + constval->consttypmod)); + get_const_collation(constval, context); + } + return; + } + + getTypeOutputInfo(constval->consttype, + &typoutput, &typIsVarlena); + + extval = OidOutputFunctionCall(typoutput, constval->constvalue); + + switch (constval->consttype) + { + case INT4OID: + + /* + * INT4 can be printed without any decoration, unless it is + * negative; in that case print it as '-nnn'::integer to ensure + * that the output will re-parse as a constant, not as a constant + * plus operator. In most cases we could get away with printing + * (-nnn) instead, because of the way that gram.y handles negative + * literals; but that doesn't work for INT_MIN, and it doesn't + * seem that much prettier anyway. 
+ */ + if (extval[0] != '-') + appendStringInfoString(buf, extval); + else + { + appendStringInfo(buf, "'%s'", extval); + needlabel = true; /* we must attach a cast */ + } + break; + + case NUMERICOID: + + /* + * NUMERIC can be printed without quotes if it looks like a float + * constant (not an integer, and not Infinity or NaN) and doesn't + * have a leading sign (for the same reason as for INT4). + */ + if (isdigit((unsigned char) extval[0]) && + strcspn(extval, "eE.") != strlen(extval)) + { + appendStringInfoString(buf, extval); + } + else + { + appendStringInfo(buf, "'%s'", extval); + needlabel = true; /* we must attach a cast */ + } + break; + + case BITOID: + case VARBITOID: + appendStringInfo(buf, "B'%s'", extval); + break; + + case BOOLOID: + if (strcmp(extval, "t") == 0) + appendStringInfoString(buf, "true"); + else + appendStringInfoString(buf, "false"); + break; + + default: + simple_quote_literal(buf, extval); + break; + } + + pfree(extval); + + if (showtype < 0) + return; + + /* + * For showtype == 0, append ::typename unless the constant will be + * implicitly typed as the right type when it is read in. + * + * XXX this code has to be kept in sync with the behavior of the parser, + * especially make_const. + */ + switch (constval->consttype) + { + case BOOLOID: + case UNKNOWNOID: + /* These types can be left unlabeled */ + needlabel = false; + break; + case INT4OID: + /* We determined above whether a label is needed */ + break; + case NUMERICOID: + + /* + * Float-looking constants will be typed as numeric, which we + * checked above; but if there's a nondefault typmod we need to + * show it. + */ + needlabel |= (constval->consttypmod >= 0); + break; + default: + needlabel = true; + break; + } + if (needlabel || showtype > 0) + appendStringInfo(buf, "::%s", + format_type_with_typemod(constval->consttype, + constval->consttypmod)); + + get_const_collation(constval, context); +} + +/* + * helper for get_const_expr: append COLLATE if needed + */ +static void +get_const_collation(Const *constval, deparse_context *context) +{ + StringInfo buf = context->buf; + + if (OidIsValid(constval->constcollid)) + { + Oid typcollation = get_typcollation(constval->consttype); + + if (constval->constcollid != typcollation) + { + appendStringInfo(buf, " COLLATE %s", + generate_collation_name(constval->constcollid)); + } + } +} + +/* + * simple_quote_literal - Format a string as a SQL literal, append to buf + */ +static void +simple_quote_literal(StringInfo buf, const char *val) +{ + const char *valptr; + + /* + * We form the string literal according to the prevailing setting of + * standard_conforming_strings; we never use E''. User is responsible for + * making sure result is used correctly. + */ + appendStringInfoChar(buf, '\''); + for (valptr = val; *valptr; valptr++) + { + char ch = *valptr; + + if (SQL_STR_DOUBLE(ch, !standard_conforming_strings)) + appendStringInfoChar(buf, ch); + appendStringInfoChar(buf, ch); + } + appendStringInfoChar(buf, '\''); +} + + +/* ---------- + * get_sublink_expr - Parse back a sublink + * ---------- + */ +static void +get_sublink_expr(SubLink *sublink, deparse_context *context) +{// #lizard forgives + StringInfo buf = context->buf; + Query *query = (Query *) (sublink->subselect); + char *opname = NULL; + bool need_paren; + + if (sublink->subLinkType == ARRAY_SUBLINK) + appendStringInfoString(buf, "ARRAY("); + else + appendStringInfoChar(buf, '('); + + /* + * Note that we print the name of only the first operator, when there are + * multiple combining operators. 
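/*
 * Illustrative sketch only: simple_quote_literal() above wraps a value in
 * single quotes and doubles any character that needs escaping; when
 * standard_conforming_strings is off, backslashes must be doubled as well.
 * This standalone version takes that setting as an explicit flag instead
 * of reading the GUC.
 */
#include <stdio.h>

static void sketch_quote_literal(const char *val, int std_strings,
                                 char *out, size_t outsz)
{
    size_t n = 0;

    if (n < outsz - 1)
        out[n++] = '\'';
    for (const char *p = val; *p && n + 2 < outsz; p++)
    {
        if (*p == '\'' || (!std_strings && *p == '\\'))
            out[n++] = *p;          /* double the character */
        out[n++] = *p;
    }
    if (n < outsz - 1)
        out[n++] = '\'';
    out[n] = '\0';
}

int main(void)
{
    char buf[64];

    sketch_quote_literal("it's", 1, buf, sizeof(buf));
    printf("%s\n", buf);            /* 'it''s' */
    return 0;
}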
This is an approximation that could go + * wrong in various scenarios (operators in different schemas, renamed + * operators, etc) but there is not a whole lot we can do about it, since + * the syntax allows only one operator to be shown. + */ + if (sublink->testexpr) + { + if (IsA(sublink->testexpr, OpExpr)) + { + /* single combining operator */ + OpExpr *opexpr = (OpExpr *) sublink->testexpr; + + get_rule_expr(linitial(opexpr->args), context, true); + opname = generate_operator_name(opexpr->opno, + exprType(linitial(opexpr->args)), + exprType(lsecond(opexpr->args))); + } + else if (IsA(sublink->testexpr, BoolExpr)) + { + /* multiple combining operators, = or <> cases */ + char *sep; + ListCell *l; + + appendStringInfoChar(buf, '('); + sep = ""; + foreach(l, ((BoolExpr *) sublink->testexpr)->args) + { + OpExpr *opexpr = lfirst_node(OpExpr, l); + + appendStringInfoString(buf, sep); + get_rule_expr(linitial(opexpr->args), context, true); + if (!opname) + opname = generate_operator_name(opexpr->opno, + exprType(linitial(opexpr->args)), + exprType(lsecond(opexpr->args))); + sep = ", "; + } + appendStringInfoChar(buf, ')'); + } + else if (IsA(sublink->testexpr, RowCompareExpr)) + { + /* multiple combining operators, < <= > >= cases */ + RowCompareExpr *rcexpr = (RowCompareExpr *) sublink->testexpr; + + appendStringInfoChar(buf, '('); + get_rule_expr((Node *) rcexpr->largs, context, true); + opname = generate_operator_name(linitial_oid(rcexpr->opnos), + exprType(linitial(rcexpr->largs)), + exprType(linitial(rcexpr->rargs))); + appendStringInfoChar(buf, ')'); + } + else + elog(ERROR, "unrecognized testexpr type: %d", + (int) nodeTag(sublink->testexpr)); + } + + need_paren = true; + + switch (sublink->subLinkType) + { + case EXISTS_SUBLINK: + appendStringInfoString(buf, "EXISTS "); + break; + + case ANY_SUBLINK: + if (strcmp(opname, "=") == 0) /* Represent = ANY as IN */ + appendStringInfoString(buf, " IN "); + else + appendStringInfo(buf, " %s ANY ", opname); + break; + + case ALL_SUBLINK: + appendStringInfo(buf, " %s ALL ", opname); + break; + + case ROWCOMPARE_SUBLINK: + appendStringInfo(buf, " %s ", opname); + break; + + case EXPR_SUBLINK: + case MULTIEXPR_SUBLINK: + case ARRAY_SUBLINK: + need_paren = false; + break; + + case CTE_SUBLINK: /* shouldn't occur in a SubLink */ + default: + elog(ERROR, "unrecognized sublink type: %d", + (int) sublink->subLinkType); + break; + } + + if (need_paren) + appendStringInfoChar(buf, '('); + + get_query_def(query, buf, context->namespaces, NULL, + context->prettyFlags, context->wrapColumn, + context->indentLevel, + context->finalise_aggs, + context->sortgroup_colno); + + if (need_paren) + appendStringInfoString(buf, "))"); + else + appendStringInfoChar(buf, ')'); +} + + +/* ---------- + * get_tablefunc - Parse back a table function + * ---------- + */ +static void +get_tablefunc(TableFunc *tf, deparse_context *context, bool showimplicit) +{// #lizard forgives + StringInfo buf = context->buf; + + /* XMLTABLE is the only existing implementation. 
*/ + + appendStringInfoString(buf, "XMLTABLE("); + + if (tf->ns_uris != NIL) + { + ListCell *lc1, + *lc2; + bool first = true; + + appendStringInfoString(buf, "XMLNAMESPACES ("); + forboth(lc1, tf->ns_uris, lc2, tf->ns_names) + { + Node *expr = (Node *) lfirst(lc1); + char *name = strVal(lfirst(lc2)); + + if (!first) + appendStringInfoString(buf, ", "); + else + first = false; + + if (name != NULL) + { + get_rule_expr(expr, context, showimplicit); + appendStringInfo(buf, " AS %s", name); + } + else + { + appendStringInfoString(buf, "DEFAULT "); + get_rule_expr(expr, context, showimplicit); + } + } + appendStringInfoString(buf, "), "); + } + + appendStringInfoChar(buf, '('); + get_rule_expr((Node *) tf->rowexpr, context, showimplicit); + appendStringInfoString(buf, ") PASSING ("); + get_rule_expr((Node *) tf->docexpr, context, showimplicit); + appendStringInfoChar(buf, ')'); + + if (tf->colexprs != NIL) + { + ListCell *l1; + ListCell *l2; + ListCell *l3; + ListCell *l4; + ListCell *l5; + int colnum = 0; + + l2 = list_head(tf->coltypes); + l3 = list_head(tf->coltypmods); + l4 = list_head(tf->colexprs); + l5 = list_head(tf->coldefexprs); + + appendStringInfoString(buf, " COLUMNS "); + foreach(l1, tf->colnames) + { + char *colname = strVal(lfirst(l1)); + Oid typid; + int32 typmod; + Node *colexpr; + Node *coldefexpr; + bool ordinality = tf->ordinalitycol == colnum; + bool notnull = bms_is_member(colnum, tf->notnulls); + + typid = lfirst_oid(l2); + l2 = lnext(l2); + typmod = lfirst_int(l3); + l3 = lnext(l3); + colexpr = (Node *) lfirst(l4); + l4 = lnext(l4); + coldefexpr = (Node *) lfirst(l5); + l5 = lnext(l5); + + if (colnum > 0) + appendStringInfoString(buf, ", "); + colnum++; + + appendStringInfo(buf, "%s %s", quote_identifier(colname), + ordinality ? "FOR ORDINALITY" : + format_type_with_typemod(typid, typmod)); + if (ordinality) + continue; + + if (coldefexpr != NULL) + { + appendStringInfoString(buf, " DEFAULT ("); + get_rule_expr((Node *) coldefexpr, context, showimplicit); + appendStringInfoChar(buf, ')'); + } + if (colexpr != NULL) + { + appendStringInfoString(buf, " PATH ("); + get_rule_expr((Node *) colexpr, context, showimplicit); + appendStringInfoChar(buf, ')'); + } + if (notnull) + appendStringInfoString(buf, " NOT NULL"); + } + } + + appendStringInfoChar(buf, ')'); +} + +/* ---------- + * get_from_clause - Parse back a FROM clause + * + * "prefix" is the keyword that denotes the start of the list of FROM + * elements. It is FROM when used to parse back SELECT and UPDATE, but + * is USING when parsing back DELETE. + * ---------- + */ +static void +get_from_clause(Query *query, const char *prefix, deparse_context *context) +{// #lizard forgives + StringInfo buf = context->buf; + bool first = true; + ListCell *l; + + /* + * We use the query's jointree as a guide to what to print. However, we + * must ignore auto-added RTEs that are marked not inFromCl. (These can + * only appear at the top level of the jointree, so it's sufficient to + * check here.) This check also ensures we ignore the rule pseudo-RTEs + * for NEW and OLD. 
+ */ + foreach(l, query->jointree->fromlist) + { + Node *jtnode = (Node *) lfirst(l); + + if (IsA(jtnode, RangeTblRef)) + { + int varno = ((RangeTblRef *) jtnode)->rtindex; + RangeTblEntry *rte = rt_fetch(varno, query->rtable); + + if (!rte->inFromCl) + continue; + } + + if (first) + { + appendContextKeyword(context, prefix, + -PRETTYINDENT_STD, PRETTYINDENT_STD, 2); + first = false; + + get_from_clause_item(jtnode, query, context); + } + else + { + StringInfoData itembuf; + + appendStringInfoString(buf, ", "); + + /* + * Put the new FROM item's text into itembuf so we can decide + * after we've got it whether or not it needs to go on a new line. + */ + initStringInfo(&itembuf); + context->buf = &itembuf; + + get_from_clause_item(jtnode, query, context); + + /* Restore context's output buffer */ + context->buf = buf; + + /* Consider line-wrapping if enabled */ + if (PRETTY_INDENT(context) && context->wrapColumn >= 0) + { + /* Does the new item start with a new line? */ + if (itembuf.len > 0 && itembuf.data[0] == '\n') + { + /* If so, we shouldn't add anything */ + /* instead, remove any trailing spaces currently in buf */ + removeStringInfoSpaces(buf); + } + else + { + char *trailing_nl; + + /* Locate the start of the current line in the buffer */ + trailing_nl = strrchr(buf->data, '\n'); + if (trailing_nl == NULL) + trailing_nl = buf->data; + else + trailing_nl++; + + /* + * Add a newline, plus some indentation, if the new item + * would cause an overflow. + */ + if (strlen(trailing_nl) + itembuf.len > context->wrapColumn) + appendContextKeyword(context, "", -PRETTYINDENT_STD, + PRETTYINDENT_STD, + PRETTYINDENT_VAR); + } + } + + /* Add the new item */ + appendStringInfoString(buf, itembuf.data); + + /* clean up */ + pfree(itembuf.data); + } + } +} + +static void +get_from_clause_item(Node *jtnode, Query *query, deparse_context *context) +{// #lizard forgives + StringInfo buf = context->buf; + deparse_namespace *dpns = (deparse_namespace *) linitial(context->namespaces); + + if (IsA(jtnode, RangeTblRef)) + { + int varno = ((RangeTblRef *) jtnode)->rtindex; + RangeTblEntry *rte = rt_fetch(varno, query->rtable); + char *refname = get_rtable_name(varno, context); + deparse_columns *colinfo = deparse_columns_fetch(varno, dpns); + RangeTblFunction *rtfunc1 = NULL; + bool printalias; + + if (rte->lateral) + appendStringInfoString(buf, "LATERAL "); + + /* Print the FROM item proper */ + switch (rte->rtekind) + { + case RTE_RELATION: + /* Normal relation RTE */ + appendStringInfo(buf, "%s%s", + only_marker(rte), + generate_relation_name(rte->relid, + context->namespaces)); +#ifdef __TBASE__ + /* print for default partition */ + if (rte->intervalparent && rte->isdefault) + { + appendStringInfoString(buf, " PARTITION For Default "); + } +#endif + break; + case RTE_SUBQUERY: + /* Subquery RTE */ + appendStringInfoChar(buf, '('); + get_query_def(rte->subquery, buf, context->namespaces, NULL, + context->prettyFlags, context->wrapColumn, + context->indentLevel, + context->finalise_aggs, + context->sortgroup_colno); + appendStringInfoChar(buf, ')'); + break; + case RTE_FUNCTION: + /* Function RTE */ + rtfunc1 = (RangeTblFunction *) linitial(rte->functions); + + /* + * Omit ROWS FROM() syntax for just one function, unless it + * has both a coldeflist and WITH ORDINALITY. If it has both, + * we must use ROWS FROM() syntax to avoid ambiguity about + * whether the coldeflist includes the ordinality column. 
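/*
 * Illustrative sketch only: when pretty-printing, get_from_clause() above
 * deparses each FROM item into a scratch buffer first and then checks
 * whether appending it to the current output line would exceed the wrap
 * column, measuring from the last newline already in the output.  The
 * length check on its own:
 */
#include <stdio.h>
#include <string.h>

static int sketch_needs_wrap(const char *output_so_far,
                             const char *next_item, int wrap_column)
{
    const char *trailing_nl = strrchr(output_so_far, '\n');

    /* locate the start of the current (last) line */
    trailing_nl = trailing_nl ? trailing_nl + 1 : output_so_far;

    return strlen(trailing_nl) + strlen(next_item) > (size_t) wrap_column;
}

int main(void)
{
    const char *buf = "SELECT *\n  FROM some_fairly_long_relation_name r";

    /* prints 1 (wrap) then 0 (fits) */
    printf("%d\n", sketch_needs_wrap(buf, " JOIN other_table o ON r.id = o.id", 60));
    printf("%d\n", sketch_needs_wrap(buf, ", t2", 60));
    return 0;
}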
+ */ + if (list_length(rte->functions) == 1 && + (rtfunc1->funccolnames == NIL || !rte->funcordinality)) + { + get_rule_expr_funccall(rtfunc1->funcexpr, context, true); + /* we'll print the coldeflist below, if it has one */ + } + else + { + bool all_unnest; + ListCell *lc; + + /* + * If all the function calls in the list are to unnest, + * and none need a coldeflist, then collapse the list back + * down to UNNEST(args). (If we had more than one + * built-in unnest function, this would get more + * difficult.) + * + * XXX This is pretty ugly, since it makes not-terribly- + * future-proof assumptions about what the parser would do + * with the output; but the alternative is to emit our + * nonstandard ROWS FROM() notation for what might have + * been a perfectly spec-compliant multi-argument + * UNNEST(). + */ + all_unnest = true; + foreach(lc, rte->functions) + { + RangeTblFunction *rtfunc = (RangeTblFunction *) lfirst(lc); + + if (!IsA(rtfunc->funcexpr, FuncExpr) || + ((FuncExpr *) rtfunc->funcexpr)->funcid != F_ARRAY_UNNEST || + rtfunc->funccolnames != NIL) + { + all_unnest = false; + break; + } + } + + if (all_unnest) + { + List *allargs = NIL; + + foreach(lc, rte->functions) + { + RangeTblFunction *rtfunc = (RangeTblFunction *) lfirst(lc); + List *args = ((FuncExpr *) rtfunc->funcexpr)->args; + + allargs = list_concat(allargs, list_copy(args)); + } + + appendStringInfoString(buf, "UNNEST("); + get_rule_expr((Node *) allargs, context, true); + appendStringInfoChar(buf, ')'); + } + else + { + int funcno = 0; + + appendStringInfoString(buf, "ROWS FROM("); + foreach(lc, rte->functions) + { + RangeTblFunction *rtfunc = (RangeTblFunction *) lfirst(lc); + + if (funcno > 0) + appendStringInfoString(buf, ", "); + get_rule_expr_funccall(rtfunc->funcexpr, context, true); + if (rtfunc->funccolnames != NIL) + { + /* Reconstruct the column definition list */ + appendStringInfoString(buf, " AS "); + get_from_clause_coldeflist(rtfunc, + NULL, + context); + } + funcno++; + } + appendStringInfoChar(buf, ')'); + } + /* prevent printing duplicate coldeflist below */ + rtfunc1 = NULL; + } + if (rte->funcordinality) + appendStringInfoString(buf, " WITH ORDINALITY"); + break; + case RTE_TABLEFUNC: + get_tablefunc(rte->tablefunc, context, true); + break; + case RTE_VALUES: + /* Values list RTE */ + appendStringInfoChar(buf, '('); + get_values_def(rte->values_lists, context); + appendStringInfoChar(buf, ')'); + break; + case RTE_CTE: + appendStringInfoString(buf, quote_identifier(rte->ctename)); + break; + default: + elog(ERROR, "unrecognized RTE kind: %d", (int) rte->rtekind); + break; + } + + /* Print the relation alias, if needed */ + printalias = false; + if (rte->alias != NULL) + { + /* Always print alias if user provided one */ + printalias = true; + } + else if (colinfo->printaliases) + { + /* Always print alias if we need to print column aliases */ + printalias = true; + } + else if (rte->rtekind == RTE_RELATION) + { + /* + * No need to print alias if it's same as relation name (this + * would normally be the case, but not if set_rtable_names had to + * resolve a conflict). + */ + if (strcmp(refname, get_relation_name(rte->relid)) != 0) + printalias = true; + } +#ifdef PGXC + else if (rte->rtekind == RTE_SUBQUERY && rte->eref->aliasname) + { + /* + * + * This condition arises when the from clause is a view. The + * corresponding subquery RTE has its eref set to view name. 
+ * The remote query generated has this subquery of which the + * columns can be referred to as view_name.col1, so it should + * be possible to refer to this subquery object. + */ + appendStringInfo(buf, " %s", + quote_identifier(rte->eref->aliasname)); + printalias = true; + } +#endif + else if (rte->rtekind == RTE_FUNCTION) + { + /* + * For a function RTE, always print alias. This covers possible + * renaming of the function and/or instability of the + * FigureColname rules for things that aren't simple functions. + * Note we'd need to force it anyway for the columndef list case. + */ + printalias = true; + } + else if (rte->rtekind == RTE_VALUES) + { + /* Alias is syntactically required for VALUES */ + printalias = true; + } + else if (rte->rtekind == RTE_CTE) + { + /* + * No need to print alias if it's same as CTE name (this would + * normally be the case, but not if set_rtable_names had to + * resolve a conflict). + */ + if (strcmp(refname, rte->ctename) != 0) + printalias = true; + } + if (printalias) + appendStringInfo(buf, " %s", quote_identifier(refname)); + + /* Print the column definitions or aliases, if needed */ + if (rtfunc1 && rtfunc1->funccolnames != NIL) + { + /* Reconstruct the columndef list, which is also the aliases */ + get_from_clause_coldeflist(rtfunc1, colinfo, context); + } + else + { + /* Else print column aliases as needed */ + get_column_alias_list(colinfo, context); + } + + /* Tablesample clause must go after any alias */ + if (rte->rtekind == RTE_RELATION && rte->tablesample) + get_tablesample_def(rte->tablesample, context); + } + else if (IsA(jtnode, JoinExpr)) + { + JoinExpr *j = (JoinExpr *) jtnode; + deparse_columns *colinfo = deparse_columns_fetch(j->rtindex, dpns); + bool need_paren_on_right; + + need_paren_on_right = PRETTY_PAREN(context) && + !IsA(j->rarg, RangeTblRef) && + !(IsA(j->rarg, JoinExpr) &&((JoinExpr *) j->rarg)->alias != NULL); + + if (!PRETTY_PAREN(context) || j->alias != NULL) + appendStringInfoChar(buf, '('); + + get_from_clause_item(j->larg, query, context); + + switch (j->jointype) + { + case JOIN_INNER: + if (j->quals) + appendContextKeyword(context, " JOIN ", + -PRETTYINDENT_STD, + PRETTYINDENT_STD, + PRETTYINDENT_JOIN); + else + appendContextKeyword(context, " CROSS JOIN ", + -PRETTYINDENT_STD, + PRETTYINDENT_STD, + PRETTYINDENT_JOIN); + break; + case JOIN_LEFT: + appendContextKeyword(context, " LEFT JOIN ", + -PRETTYINDENT_STD, + PRETTYINDENT_STD, + PRETTYINDENT_JOIN); + break; + case JOIN_FULL: + appendContextKeyword(context, " FULL JOIN ", + -PRETTYINDENT_STD, + PRETTYINDENT_STD, + PRETTYINDENT_JOIN); + break; + case JOIN_RIGHT: + appendContextKeyword(context, " RIGHT JOIN ", + -PRETTYINDENT_STD, + PRETTYINDENT_STD, + PRETTYINDENT_JOIN); + break; + default: + elog(ERROR, "unrecognized join type: %d", + (int) j->jointype); + } + + if (need_paren_on_right) + appendStringInfoChar(buf, '('); + get_from_clause_item(j->rarg, query, context); + if (need_paren_on_right) + appendStringInfoChar(buf, ')'); + + if (j->usingClause) + { + ListCell *lc; + bool first = true; + + appendStringInfoString(buf, " USING ("); + /* Use the assigned names, not what's in usingClause */ + foreach(lc, colinfo->usingNames) + { + char *colname = (char *) lfirst(lc); + + if (first) + first = false; + else + appendStringInfoString(buf, ", "); + appendStringInfoString(buf, quote_identifier(colname)); + } + appendStringInfoChar(buf, ')'); + } + else if (j->quals) + { + appendStringInfoString(buf, " ON "); + if (!PRETTY_PAREN(context)) + appendStringInfoChar(buf, 
'('); + get_rule_expr(j->quals, context, false); + if (!PRETTY_PAREN(context)) + appendStringInfoChar(buf, ')'); + } + else if (j->jointype != JOIN_INNER) + { + /* If we didn't say CROSS JOIN above, we must provide an ON */ + appendStringInfoString(buf, " ON TRUE"); + } + + if (!PRETTY_PAREN(context) || j->alias != NULL) + appendStringInfoChar(buf, ')'); + + /* Yes, it's correct to put alias after the right paren ... */ + if (j->alias != NULL) + { + appendStringInfo(buf, " %s", + quote_identifier(j->alias->aliasname)); + get_column_alias_list(colinfo, context); + } + } + else + elog(ERROR, "unrecognized node type: %d", + (int) nodeTag(jtnode)); +} + +/* + * get_column_alias_list - print column alias list for an RTE + * + * Caller must already have printed the relation's alias name. + */ +static void +get_column_alias_list(deparse_columns *colinfo, deparse_context *context) +{ + StringInfo buf = context->buf; + int i; + bool first = true; + + /* Don't print aliases if not needed */ + if (!colinfo->printaliases) + return; + + for (i = 0; i < colinfo->num_new_cols; i++) + { + char *colname = colinfo->new_colnames[i]; + + if (first) + { + appendStringInfoChar(buf, '('); + first = false; + } + else + appendStringInfoString(buf, ", "); + appendStringInfoString(buf, quote_identifier(colname)); + } + if (!first) + appendStringInfoChar(buf, ')'); +} + +/* + * get_from_clause_coldeflist - reproduce FROM clause coldeflist + * + * When printing a top-level coldeflist (which is syntactically also the + * relation's column alias list), use column names from colinfo. But when + * printing a coldeflist embedded inside ROWS FROM(), we prefer to use the + * original coldeflist's names, which are available in rtfunc->funccolnames. + * Pass NULL for colinfo to select the latter behavior. + * + * The coldeflist is appended immediately (no space) to buf. Caller is + * responsible for ensuring that an alias or AS is present before it. + */ +static void +get_from_clause_coldeflist(RangeTblFunction *rtfunc, + deparse_columns *colinfo, + deparse_context *context) +{ + StringInfo buf = context->buf; + ListCell *l1; + ListCell *l2; + ListCell *l3; + ListCell *l4; + int i; + + appendStringInfoChar(buf, '('); + + /* there's no forfour(), so must chase one list the hard way */ + i = 0; + l4 = list_head(rtfunc->funccolnames); + forthree(l1, rtfunc->funccoltypes, + l2, rtfunc->funccoltypmods, + l3, rtfunc->funccolcollations) + { + Oid atttypid = lfirst_oid(l1); + int32 atttypmod = lfirst_int(l2); + Oid attcollation = lfirst_oid(l3); + char *attname; + + if (colinfo) + attname = colinfo->colnames[i]; + else + attname = strVal(lfirst(l4)); + + Assert(attname); /* shouldn't be any dropped columns here */ + + if (i > 0) + appendStringInfoString(buf, ", "); + appendStringInfo(buf, "%s %s", + quote_identifier(attname), + format_type_with_typemod(atttypid, atttypmod)); + if (OidIsValid(attcollation) && + attcollation != get_typcollation(atttypid)) + appendStringInfo(buf, " COLLATE %s", + generate_collation_name(attcollation)); + + l4 = lnext(l4); + i++; + } + + appendStringInfoChar(buf, ')'); +} + +/* + * get_tablesample_def - print a TableSampleClause + */ +static void +get_tablesample_def(TableSampleClause *tablesample, deparse_context *context) +{ + StringInfo buf = context->buf; + Oid argtypes[1]; + int nargs; + ListCell *l; + + /* + * We should qualify the handler's function name if it wouldn't be + * resolved by lookup in the current search path. 
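+	 * Tablesample handler functions are declared to take a single INTERNAL
+	 * argument, so that is the signature we look up here (hence the
+	 * one-element argtypes array set up just below).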
+ */ + argtypes[0] = INTERNALOID; + appendStringInfo(buf, " TABLESAMPLE %s (", + generate_function_name(tablesample->tsmhandler, 1, + NIL, argtypes, + false, NULL, EXPR_KIND_NONE)); + + nargs = 0; + foreach(l, tablesample->args) + { + if (nargs++ > 0) + appendStringInfoString(buf, ", "); + get_rule_expr((Node *) lfirst(l), context, false); + } + appendStringInfoChar(buf, ')'); + + if (tablesample->repeatable != NULL) + { + appendStringInfoString(buf, " REPEATABLE ("); + get_rule_expr((Node *) tablesample->repeatable, context, false); + appendStringInfoChar(buf, ')'); + } +} + +/* + * get_opclass_name - fetch name of an index operator class + * + * The opclass name is appended (after a space) to buf. + * + * Output is suppressed if the opclass is the default for the given + * actual_datatype. (If you don't want this behavior, just pass + * InvalidOid for actual_datatype.) + */ +static void +get_opclass_name(Oid opclass, Oid actual_datatype, + StringInfo buf) +{ + HeapTuple ht_opc; + Form_pg_opclass opcrec; + char *opcname; + char *nspname; + + ht_opc = SearchSysCache1(CLAOID, ObjectIdGetDatum(opclass)); + if (!HeapTupleIsValid(ht_opc)) + elog(ERROR, "cache lookup failed for opclass %u", opclass); + opcrec = (Form_pg_opclass) GETSTRUCT(ht_opc); + + if (!OidIsValid(actual_datatype) || + GetDefaultOpClass(actual_datatype, opcrec->opcmethod) != opclass) + { + /* Okay, we need the opclass name. Do we need to qualify it? */ + opcname = NameStr(opcrec->opcname); + if (OpclassIsVisible(opclass)) + appendStringInfo(buf, " %s", quote_identifier(opcname)); + else + { + nspname = get_namespace_name(opcrec->opcnamespace); + appendStringInfo(buf, " %s.%s", + quote_identifier(nspname), + quote_identifier(opcname)); + } + } + ReleaseSysCache(ht_opc); +} + +/* + * processIndirection - take care of array and subfield assignment + * + * We strip any top-level FieldStore or assignment ArrayRef nodes that + * appear in the input, printing them as decoration for the base column + * name (which we assume the caller just printed). We might also need to + * strip CoerceToDomain nodes, but only ones that appear above assignment + * nodes. + * + * Returns the subexpression that's to be assigned. + */ +static Node * +processIndirection(Node *node, deparse_context *context) +{// #lizard forgives + StringInfo buf = context->buf; + CoerceToDomain *cdomain = NULL; + + for (;;) + { + if (node == NULL) + break; + if (IsA(node, FieldStore)) + { + FieldStore *fstore = (FieldStore *) node; + Oid typrelid; + char *fieldname; + + /* lookup tuple type */ + typrelid = get_typ_typrelid(fstore->resulttype); + if (!OidIsValid(typrelid)) + elog(ERROR, "argument type %s of FieldStore is not a tuple type", + format_type_be(fstore->resulttype)); + + /* + * Print the field name. There should only be one target field in + * stored rules. There could be more than that in executable + * target lists, but this function cannot be used for that case. + */ + Assert(list_length(fstore->fieldnums) == 1); + fieldname = get_relid_attribute_name(typrelid, + linitial_int(fstore->fieldnums)); + appendStringInfo(buf, ".%s", quote_identifier(fieldname)); + + /* + * We ignore arg since it should be an uninteresting reference to + * the target column or subcolumn. 
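+			 * Instead, descend into the value being assigned, which (after
+			 * any further indirection) is what gets returned to the caller.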
+ */ + node = (Node *) linitial(fstore->newvals); + } + else if (IsA(node, ArrayRef)) + { + ArrayRef *aref = (ArrayRef *) node; + + if (aref->refassgnexpr == NULL) + break; + printSubscripts(aref, context); + + /* + * We ignore refexpr since it should be an uninteresting reference + * to the target column or subcolumn. + */ + node = (Node *) aref->refassgnexpr; + } + else if (IsA(node, CoerceToDomain)) + { + cdomain = (CoerceToDomain *) node; + /* If it's an explicit domain coercion, we're done */ + if (cdomain->coercionformat != COERCE_IMPLICIT_CAST) + break; + /* Tentatively descend past the CoerceToDomain */ + node = (Node *) cdomain->arg; + } + else + break; + } + + /* + * If we descended past a CoerceToDomain whose argument turned out not to + * be a FieldStore or array assignment, back up to the CoerceToDomain. + * (This is not enough to be fully correct if there are nested implicit + * CoerceToDomains, but such cases shouldn't ever occur.) + */ + if (cdomain && node == (Node *) cdomain->arg) + node = (Node *) cdomain; + + return node; +} + +static void +printSubscripts(ArrayRef *aref, deparse_context *context) +{ + StringInfo buf = context->buf; + ListCell *lowlist_item; + ListCell *uplist_item; + + lowlist_item = list_head(aref->reflowerindexpr); /* could be NULL */ + foreach(uplist_item, aref->refupperindexpr) + { + appendStringInfoChar(buf, '['); + if (lowlist_item) + { + /* If subexpression is NULL, get_rule_expr prints nothing */ + get_rule_expr((Node *) lfirst(lowlist_item), context, false); + appendStringInfoChar(buf, ':'); + lowlist_item = lnext(lowlist_item); + } + /* If subexpression is NULL, get_rule_expr prints nothing */ + get_rule_expr((Node *) lfirst(uplist_item), context, false); + appendStringInfoChar(buf, ']'); + } +} + +/* + * quote_identifier - Quote an identifier only if needed + * + * When quotes are needed, we palloc the required space; slightly + * space-wasteful but well worth it for notational simplicity. + */ +const char * +quote_identifier(const char *ident) +{// #lizard forgives + /* + * Can avoid quoting if ident starts with a lowercase letter or underscore + * and contains only lowercase letters, digits, and underscores, *and* is + * not any SQL keyword. Otherwise, supply quotes. + */ + int nquotes = 0; + bool safe; + const char *ptr; + char *result; + char *optr; + + /* + * would like to use macros here, but they might yield unwanted + * locale-specific results... + */ + safe = ((ident[0] >= 'a' && ident[0] <= 'z') || ident[0] == '_'); + + for (ptr = ident; *ptr; ptr++) + { + char ch = *ptr; + + if ((ch >= 'a' && ch <= 'z') || + (ch >= '0' && ch <= '9') || + (ch == '_')) + { + /* okay */ + } + else + { + safe = false; + if (ch == '"') + nquotes++; + } + } + + if (quote_all_identifiers) + safe = false; + + if (safe) + { + /* + * Check for keyword. We quote keywords except for unreserved ones. + * (In some cases we could avoid quoting a col_name or type_func_name + * keyword, but it seems much harder than it's worth to tell that.) + * + * Note: ScanKeywordLookup() does case-insensitive comparison, but + * that's fine, since we already know we have all-lower-case. 
+ */ + const ScanKeyword *keyword = ScanKeywordLookup(ident, + ScanKeywords, + NumScanKeywords); + + if (keyword != NULL && keyword->category != UNRESERVED_KEYWORD) + safe = false; + } + + if (safe) + return ident; /* no change needed */ + + result = (char *) palloc(strlen(ident) + nquotes + 2 + 1); + + optr = result; + *optr++ = '"'; + for (ptr = ident; *ptr; ptr++) + { + char ch = *ptr; + + if (ch == '"') + *optr++ = '"'; + *optr++ = ch; + } + *optr++ = '"'; + *optr = '\0'; + + return result; +} + +/* + * quote_qualified_identifier - Quote a possibly-qualified identifier + * + * Return a name of the form qualifier.ident, or just ident if qualifier + * is NULL, quoting each component if necessary. The result is palloc'd. + */ +char * +quote_qualified_identifier(const char *qualifier, + const char *ident) +{ + StringInfoData buf; + + initStringInfo(&buf); + if (qualifier) + appendStringInfo(&buf, "%s.", quote_identifier(qualifier)); + appendStringInfoString(&buf, quote_identifier(ident)); + return buf.data; +} + +/* + * get_relation_name + * Get the unqualified name of a relation specified by OID + * + * This differs from the underlying get_rel_name() function in that it will + * throw error instead of silently returning NULL if the OID is bad. + */ +static char * +get_relation_name(Oid relid) +{ + char *relname = get_rel_name(relid); + + if (!relname) + elog(ERROR, "cache lookup failed for relation %u", relid); + return relname; +} + +/* + * generate_relation_name + * Compute the name to display for a relation specified by OID + * + * The result includes all necessary quoting and schema-prefixing. + * + * If namespaces isn't NIL, it must be a list of deparse_namespace nodes. + * We will forcibly qualify the relation name if it equals any CTE name + * visible in the namespace list. + */ +static char * +generate_relation_name(Oid relid, List *namespaces) +{ + HeapTuple tp; + Form_pg_class reltup; + bool need_qual; + ListCell *nslist; + char *relname; + char *nspname; + char *result; + + tp = SearchSysCache1(RELOID, ObjectIdGetDatum(relid)); + if (!HeapTupleIsValid(tp)) + elog(ERROR, "cache lookup failed for relation %u", relid); + reltup = (Form_pg_class) GETSTRUCT(tp); + relname = NameStr(reltup->relname); + + /* Check for conflicting CTE name */ + need_qual = false; + foreach(nslist, namespaces) + { + deparse_namespace *dpns = (deparse_namespace *) lfirst(nslist); + ListCell *ctlist; + + foreach(ctlist, dpns->ctes) + { + CommonTableExpr *cte = (CommonTableExpr *) lfirst(ctlist); + + if (strcmp(cte->ctename, relname) == 0) + { + need_qual = true; + break; + } + } + if (need_qual) + break; + } + + /* Otherwise, qualify the name if not visible in search path */ + if (!need_qual) + need_qual = !RelationIsVisible(relid); + + if (need_qual) + nspname = get_namespace_name(reltup->relnamespace); + else + nspname = NULL; + + result = quote_qualified_identifier(nspname, relname); + + ReleaseSysCache(tp); + + return result; +} + +/* + * generate_qualified_relation_name + * Compute the name to display for a relation specified by OID + * + * As above, but unconditionally schema-qualify the name. 
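+ * This makes the result independent of the current search_path setting.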
+ */ +static char * +generate_qualified_relation_name(Oid relid) +{ + HeapTuple tp; + Form_pg_class reltup; + char *relname; + char *nspname; + char *result; + + tp = SearchSysCache1(RELOID, ObjectIdGetDatum(relid)); + if (!HeapTupleIsValid(tp)) + elog(ERROR, "cache lookup failed for relation %u", relid); + reltup = (Form_pg_class) GETSTRUCT(tp); + relname = NameStr(reltup->relname); + + nspname = get_namespace_name(reltup->relnamespace); + if (!nspname) + elog(ERROR, "cache lookup failed for namespace %u", + reltup->relnamespace); + + result = quote_qualified_identifier(nspname, relname); + + ReleaseSysCache(tp); + + return result; +} + +/* + * generate_function_name + * Compute the name to display for a function specified by OID, + * given that it is being called with the specified actual arg names and + * types. (Those matter because of ambiguous-function resolution rules.) + * + * If we're dealing with a potentially variadic function (in practice, this + * means a FuncExpr or Aggref, not some other way of calling a function), then + * has_variadic must specify whether variadic arguments have been merged, + * and *use_variadic_p will be set to indicate whether to print VARIADIC in + * the output. For non-FuncExpr cases, has_variadic should be FALSE and + * use_variadic_p can be NULL. + * + * The result includes all necessary quoting and schema-prefixing. + */ +static char * +generate_function_name(Oid funcid, int nargs, List *argnames, Oid *argtypes, + bool has_variadic, bool *use_variadic_p, + ParseExprKind special_exprkind) +{// #lizard forgives + char *result; + HeapTuple proctup; + Form_pg_proc procform; + char *proname; + bool use_variadic; + char *nspname; + FuncDetailCode p_result; + Oid p_funcid; + Oid p_rettype; + bool p_retset; + int p_nvargs; + Oid p_vatype; + Oid *p_true_typeids; + bool force_qualify = false; + + proctup = SearchSysCache1(PROCOID, ObjectIdGetDatum(funcid)); + if (!HeapTupleIsValid(proctup)) + elog(ERROR, "cache lookup failed for function %u", funcid); + procform = (Form_pg_proc) GETSTRUCT(proctup); + proname = NameStr(procform->proname); + + /* + * Due to parser hacks to avoid needing to reserve CUBE, we need to force + * qualification in some special cases. + */ + if (special_exprkind == EXPR_KIND_GROUP_BY) + { + if (strcmp(proname, "cube") == 0 || strcmp(proname, "rollup") == 0) + force_qualify = true; + } + + /* + * Determine whether VARIADIC should be printed. We must do this first + * since it affects the lookup rules in func_get_detail(). + * + * Currently, we always print VARIADIC if the function has a merged + * variadic-array argument. Note that this is always the case for + * functions taking a VARIADIC argument type other than VARIADIC ANY. + * + * In principle, if VARIADIC wasn't originally specified and the array + * actual argument is deconstructable, we could print the array elements + * separately and not print VARIADIC, thus more nearly reproducing the + * original input. For the moment that seems like too much complication + * for the benefit, and anyway we do not know whether VARIADIC was + * originally specified if it's a non-ANY type. 
+ */ + if (use_variadic_p) + { + /* Parser should not have set funcvariadic unless fn is variadic */ + Assert(!has_variadic || OidIsValid(procform->provariadic)); + use_variadic = has_variadic; + *use_variadic_p = use_variadic; + } + else + { + Assert(!has_variadic); + use_variadic = false; + } + + /* + * The idea here is to schema-qualify only if the parser would fail to + * resolve the correct function given the unqualified func name with the + * specified argtypes and VARIADIC flag. But if we already decided to + * force qualification, then we can skip the lookup and pretend we didn't + * find it. + */ + if (!force_qualify) + p_result = func_get_detail(list_make1(makeString(proname)), + NIL, argnames, nargs, argtypes, + !use_variadic, true, + &p_funcid, &p_rettype, + &p_retset, &p_nvargs, &p_vatype, + &p_true_typeids, NULL); + else + { + p_result = FUNCDETAIL_NOTFOUND; + p_funcid = InvalidOid; + } + + if ((p_result == FUNCDETAIL_NORMAL || + p_result == FUNCDETAIL_AGGREGATE || + p_result == FUNCDETAIL_WINDOWFUNC) && + p_funcid == funcid) + nspname = NULL; + else + nspname = get_namespace_name(procform->pronamespace); + + result = quote_qualified_identifier(nspname, proname); + + ReleaseSysCache(proctup); + + return result; +} + +/* + * generate_operator_name + * Compute the name to display for an operator specified by OID, + * given that it is being called with the specified actual arg types. + * (Arg types matter because of ambiguous-operator resolution rules. + * Pass InvalidOid for unused arg of a unary operator.) + * + * The result includes all necessary quoting and schema-prefixing, + * plus the OPERATOR() decoration needed to use a qualified operator name + * in an expression. + */ +static char * +generate_operator_name(Oid operid, Oid arg1, Oid arg2) +{// #lizard forgives + StringInfoData buf; + HeapTuple opertup; + Form_pg_operator operform; + char *oprname; + char *nspname; + Operator p_result; + + initStringInfo(&buf); + + opertup = SearchSysCache1(OPEROID, ObjectIdGetDatum(operid)); + if (!HeapTupleIsValid(opertup)) + elog(ERROR, "cache lookup failed for operator %u", operid); + operform = (Form_pg_operator) GETSTRUCT(opertup); + oprname = NameStr(operform->oprname); + + /* + * The idea here is to schema-qualify only if the parser would fail to + * resolve the correct operator given the unqualified op name with the + * specified argtypes. + */ + switch (operform->oprkind) + { + case 'b': + p_result = oper(NULL, list_make1(makeString(oprname)), arg1, arg2, + true, -1); + break; + case 'l': + p_result = left_oper(NULL, list_make1(makeString(oprname)), arg2, + true, -1); + break; + case 'r': + p_result = right_oper(NULL, list_make1(makeString(oprname)), arg1, + true, -1); + break; + default: + elog(ERROR, "unrecognized oprkind: %d", operform->oprkind); + p_result = NULL; /* keep compiler quiet */ + break; + } + + if (p_result != NULL && oprid(p_result) == operid) + nspname = NULL; + else + { + nspname = get_namespace_name(operform->oprnamespace); + appendStringInfo(&buf, "OPERATOR(%s.", quote_identifier(nspname)); + } + + appendStringInfoString(&buf, oprname); + + if (nspname) + appendStringInfoChar(&buf, ')'); + + if (p_result != NULL) + ReleaseSysCache(p_result); + + ReleaseSysCache(opertup); + + return buf.data; +} + +/* + * generate_collation_name + * Compute the name to display for a collation specified by OID + * + * The result includes all necessary quoting and schema-prefixing. 
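+ * (The schema prefix is added only when the collation is not visible in
+ * the current search path.)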
+ */ +char * +generate_collation_name(Oid collid) +{ + HeapTuple tp; + Form_pg_collation colltup; + char *collname; + char *nspname; + char *result; + + tp = SearchSysCache1(COLLOID, ObjectIdGetDatum(collid)); + if (!HeapTupleIsValid(tp)) + elog(ERROR, "cache lookup failed for collation %u", collid); + colltup = (Form_pg_collation) GETSTRUCT(tp); + collname = NameStr(colltup->collname); + + if (!CollationIsVisible(collid)) + nspname = get_namespace_name(colltup->collnamespace); + else + nspname = NULL; + + result = quote_qualified_identifier(nspname, collname); + + ReleaseSysCache(tp); + + return result; +} + +/* + * Given a C string, produce a TEXT datum. + * + * We assume that the input was palloc'd and may be freed. + */ +static text * +string_to_text(char *str) +{ + text *result; + + result = cstring_to_text(str); + pfree(str); + return result; +} + +/* + * Generate a C string representing a relation's reloptions, or NULL if none. + */ +static char * +flatten_reloptions(Oid relid) +{ + char *result = NULL; + HeapTuple tuple; + Datum reloptions; + bool isnull; + + tuple = SearchSysCache1(RELOID, ObjectIdGetDatum(relid)); + if (!HeapTupleIsValid(tuple)) + elog(ERROR, "cache lookup failed for relation %u", relid); + + reloptions = SysCacheGetAttr(RELOID, tuple, + Anum_pg_class_reloptions, &isnull); + if (!isnull) + { + StringInfoData buf; + Datum *options; + int noptions; + int i; + + initStringInfo(&buf); + + deconstruct_array(DatumGetArrayTypeP(reloptions), + TEXTOID, -1, false, 'i', + &options, NULL, &noptions); + + for (i = 0; i < noptions; i++) + { + char *option = TextDatumGetCString(options[i]); + char *name; + char *separator; + char *value; + + /* + * Each array element should have the form name=value. If the "=" + * is missing for some reason, treat it like an empty value. + */ + name = option; + separator = strchr(option, '='); + if (separator) + { + *separator = '\0'; + value = separator + 1; + } + else + value = ""; + + if (i > 0) + appendStringInfoString(&buf, ", "); + appendStringInfo(&buf, "%s=", quote_identifier(name)); + + /* + * In general we need to quote the value; but to avoid unnecessary + * clutter, do not quote if it is an identifier that would not + * need quoting. (We could also allow numbers, but that is a bit + * trickier than it looks --- for example, are leading zeroes + * significant? We don't want to assume very much here about what + * custom reloptions might mean.) 
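+			 * Note that quote_identifier() returns its argument unchanged
+			 * when no quoting is needed, which is what the pointer-equality
+			 * test below relies on.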
+ */ + if (quote_identifier(value) == value) + appendStringInfoString(&buf, value); + else + simple_quote_literal(&buf, value); + + pfree(option); + } + + result = buf.data; + } + + ReleaseSysCache(tuple); + + return result; +} + +/* + * get_one_range_partition_bound_string + * A C string representation of one range partition bound + */ +char * +get_range_partbound_string(List *bound_datums) +{ + deparse_context context; + StringInfo buf = makeStringInfo(); + ListCell *cell; + char *sep; + + memset(&context, 0, sizeof(deparse_context)); + context.buf = buf; + + appendStringInfoString(buf, "("); + sep = ""; + foreach(cell, bound_datums) + { + PartitionRangeDatum *datum = + castNode(PartitionRangeDatum, lfirst(cell)); + + appendStringInfoString(buf, sep); + if (datum->kind == PARTITION_RANGE_DATUM_MINVALUE) + appendStringInfoString(buf, "MINVALUE"); + else if (datum->kind == PARTITION_RANGE_DATUM_MAXVALUE) + appendStringInfoString(buf, "MAXVALUE"); + else + { + Const *val = castNode(Const, datum->value); + + get_const_expr(val, &context, -1); + } + sep = ", "; + } + appendStringInfoString(buf, ")"); + + return buf->data; +} + +#ifdef __TBASE__ +/* form interval partition child table/index name */ +char * +GetPartitionName(Oid parentrelid, int partidx, bool isindex) +{ + char *partname; + char relname[NAMEDATALEN]; + char *parentname = get_rel_name(parentrelid); + + StrNCpy(relname, parentname, NAMEDATALEN - 12); + + partname = (char *)palloc0(NAMEDATALEN); + + snprintf(partname, NAMEDATALEN, + "%s_part_%d", relname, partidx); + +#if 0 + if(!isindex) + snprintf(partname, NAMEDATALEN, + "part_%d_%d", parentrelid, partidx); + else + snprintf(partname, NAMEDATALEN, + "idx_%d_%d", parentrelid, partidx); +#endif + + return partname; +} + +static int +find_partidx_by_int(int64 start, int step, int partitions, + int64 value, QulificationType qualtype) +{// #lizard forgives + int partidx = -1; + int gap = -1; + int align = -1; + + if(value < start || value >= start + step*partitions) + { + return PARTITION_ROUTER_RESULT_NULL; + } + + gap = (int32)((value - start)/step); + + align = (int32)((value - start)%step); + + switch(qualtype) + { + case QULIFICATION_TYPE_LS: + if(align == 0) gap--; + case QULIFICATION_TYPE_LE: + { + if(gap >= partitions) + partidx = PARTITION_ROUTER_RESULT_FULL; + else if(gap < 0) + partidx = PARTITION_ROUTER_RESULT_NULL; + else + partidx = gap; + } + break; + + case QULIFICATION_TYPE_EQUAL: + { + if(gap >= partitions || gap < 0 ) + partidx = PARTITION_ROUTER_RESULT_NULL; + else + partidx = gap; + } + break; + + case QULIFICATION_TYPE_GE: + case QULIFICATION_TYPE_GT: + { + if(gap >= partitions) + partidx = PARTITION_ROUTER_RESULT_NULL; + else if(gap < 0) + partidx = PARTITION_ROUTER_RESULT_FULL; + else + partidx = gap; + } + break; + default: + elog(ERROR, "not supported Qulification Type[%d]", qualtype); + } + + return partidx; +} + +static int get_daysofyear(int startyear, int startmonth, int startday, + int endyear, int endmonth, int endday) +{// #lizard forgives + int result; + + result = 0; + + if(startyear > endyear + || (startyear == endyear && startmonth > endmonth) + || (startyear == endyear && startmonth == endmonth && startday > endday)) + return -1; + + if(startyear == endyear) + { + result = get_daysofmonth(startmonth, startday, endmonth, endday); + } + else + { + result += get_daysofmonth(startmonth,startday, 12, 31); + result += (endyear - startyear - 1)*366; + result += get_daysofmonth(1, 1, endmonth, endday); + } + + return result; +} + +static int 
get_daysofmonth(int startmonth, int startday, + int endmonth, int endday) +{// #lizard forgives + int result; + + if(startmonth <=0 || startmonth > 12 + || startday <= 0 || startday > 31 + || endmonth <=0 || endmonth > 12 + || endday <= 0 || endday > 31) + { + elog(ERROR, "internal error: getdaysofmonth: parameters is invalid"); + } + + result = 0; + + if(startmonth > endmonth || (startmonth == endmonth && startday > endday)) + return -1; + + if(startmonth == endmonth) + { + result = endday - startday; + } + else + { + int monidx = 0; + + result += daysofmonth[startmonth] - startday; + + monidx = startmonth + 1; + while(monidx < endmonth) + result += daysofmonth[monidx++]; + + result += endday; + } + + return result; +} + +static int get_monthesofyear(int startyear, int startmonth, + int endyear, int endmonth) +{ + int32 gap; + if(endyear < startyear || (endyear == startyear && endmonth < startmonth)) + { + gap = -1; + } + else + { + gap = (endyear - startyear) * 12 + (endmonth - startmonth); + } + return gap; +} + + +static int +find_partidx_by_timestamp(TimestampTz start, int step, int steptype, int partitions, + TimestampTz value, QulificationType qualtype) +{// #lizard forgives + int partidx = -1; + int gap; + struct pg_tm start_time; + fsec_t start_sec; + struct pg_tm current_time; + fsec_t current_sec; + bool isalign = false; + + + /* timestamp convert to posix struct */ + if(timestamp2tm(start, NULL, &start_time, &start_sec, NULL, NULL) != 0) + ereport(ERROR, + (errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE), + errmsg("timestamp out of range"))); + + if(timestamp2tm(value, NULL, ¤t_time, ¤t_sec, NULL, NULL) != 0) + ereport(ERROR, + (errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE), + errmsg("timestamp out of range"))); + + if(current_time.tm_hour == 0 && current_time.tm_min == 0 && current_time.tm_sec == 0 && current_sec == 0) + { + isalign = true; + } + + if(isalign && steptype == IntervalType_Month) + { + isalign = (current_time.tm_mday == 1); + } + + /* computer gap*/ + if(steptype == IntervalType_Month) + { + if(current_time.tm_year < start_time.tm_year + || (current_time.tm_year == start_time.tm_year && current_time.tm_mon < start_time.tm_mon)) + { + gap = -1; + } + else + { + gap = (current_time.tm_year - start_time.tm_year)*12 + (current_time.tm_mon - start_time.tm_mon); + } + } + else if(steptype == IntervalType_Day) + { + gap = get_daysofyear(start_time.tm_year, start_time.tm_mon, start_time.tm_mday, + current_time.tm_year, current_time.tm_mon, current_time.tm_mday); + } + else + { + elog(ERROR,"step type[%d] is invalid", steptype); + } + + if(gap >= 0) + { + if(isalign) + isalign = (gap % step == 0); + gap = gap/step; + } + else + { + gap = -1; + isalign = false; + } + + switch(qualtype) + { + case QULIFICATION_TYPE_LS: + if(isalign) + { + if (!(is_first_day_from_start(step, steptype, &start_time, ¤t_time))) + { + gap--; + } + } + case QULIFICATION_TYPE_LE: + { + if(gap >= partitions) + partidx = PARTITION_ROUTER_RESULT_FULL; /* all partitions*/ + else if(gap < 0) + partidx = PARTITION_ROUTER_RESULT_NULL; + else + partidx = gap; + } + break; + + case QULIFICATION_TYPE_EQUAL: + { + if(gap >= partitions || gap < 0) + partidx = PARTITION_ROUTER_RESULT_NULL; + else + partidx = gap; + } + break; + + case QULIFICATION_TYPE_GE: + case QULIFICATION_TYPE_GT: + { + if(gap >= partitions) + partidx = PARTITION_ROUTER_RESULT_NULL; + else if(gap < 0) + partidx = PARTITION_ROUTER_RESULT_FULL; + else + partidx = gap; + } + break; + default: + elog(ERROR, "not supported Qulification Type[%d]", 
qualtype); + } + + return partidx; +} + +int +RelationGetPartitionIdxByValue(Relation rel, Datum value) +{ + int partidx = -1; + Form_pg_partition_interval routerinfo = NULL; + + routerinfo = rel->rd_partitions_info; + + if(!routerinfo) + { + elog(ERROR, "relation[%s] is not a partitioned table.", RelationGetRelationName(rel)); + } + + switch(routerinfo->partdatatype) + { + case INT2OID: /* int2 */ + { + int value_int16; + value_int16 = DatumGetInt16(value); + partidx = find_partidx_by_int(routerinfo->partstartvalue_int, routerinfo->partinterval_int, + routerinfo->partnparts, (int64)value_int16, QULIFICATION_TYPE_EQUAL); + } + break; + case INT4OID: /* int4 */ + { + int value_int32; + value_int32 = DatumGetInt32(value); + partidx = find_partidx_by_int(routerinfo->partstartvalue_int, routerinfo->partinterval_int, + routerinfo->partnparts, (int64)value_int32, QULIFICATION_TYPE_EQUAL); + } + break; + case INT8OID: /* int8 */ + { + partidx = find_partidx_by_int(routerinfo->partstartvalue_int, routerinfo->partinterval_int, + routerinfo->partnparts, DatumGetInt64(value), QULIFICATION_TYPE_EQUAL); + } + break; + case TIMESTAMPOID: /* timestamp */ + partidx = find_partidx_by_timestamp(routerinfo->partstartvalue_ts, routerinfo->partinterval_int, + routerinfo->partinterval_type, + routerinfo->partnparts, DatumGetTimestamp(value), QULIFICATION_TYPE_EQUAL); + break; + default: + elog(ERROR, "unsupported interval type:[%d]", routerinfo->partinterval_type); + } + + return partidx; + +} + +Bitmapset * +RelationGetPartitionByValue(Relation rel, Const *value) +{ + //TODO: + int partidx = -1; + AttrNumber partkey = InvalidAttrNumber; + Form_pg_attribute attr = NULL; + Bitmapset * bms = NULL; + char *partname = NULL; + Oid partoid = InvalidOid; + + partkey = RelationGetPartitionColumnIndex(rel); + attr = rel->rd_att->attrs[partkey-1]; + + if(attr->atttypid != value->consttype) + { + elog(ERROR, "internal error: RelationGetPartitionByValue: data type of parameter is not same as relation definition"); + } + + partidx = RelationGetPartitionIdxByValue(rel,value->constvalue); + + partname = GetPartitionName(RelationGetRelid(rel), partidx, false); + partoid = get_relname_relid(partname, RelationGetNamespace(rel)); + + if(partidx >= 0 && partoid) + bms = bms_make_singleton(partidx); + else + bms = NULL; + + return bms; +} + +List * +RelationGetAllPartitions(Relation rel) +{ + int nparts = 0; + char *partname = NULL; + Oid partoid = InvalidOid; + int partidx = 0; + List * result = NULL; + + nparts = RelationGetNParts(rel); + + for(partidx = 0; partidx < nparts; partidx++) + { + partname = GetPartitionName(RelationGetRelid(rel), partidx, false); + partoid = get_relname_relid(partname, RelationGetNamespace(rel)); + + if(partname) + pfree(partname); + partname = NULL; + + if (InvalidOid == partoid) + { + continue; + } + + result = lappend_oid(result, partoid); + } + + return result; +} + +int +RelationGetChildIndex(Relation rel, Oid childoid) +{ + int nparts = 0; + char *partname = NULL; + Oid partoid = InvalidOid; + int partidx = 0; + int result = -1; + + if (childoid) + { + nparts = RelationGetNParts(rel); + + for(partidx = 0; partidx < nparts; partidx++) + { + partname = GetPartitionName(RelationGetRelid(rel), partidx, false); + partoid = get_relname_relid(partname, RelationGetNamespace(rel)); + + if (partoid == childoid) + { + result = partidx; + + if(partname) + pfree(partname); + partname = NULL; + + break; + } + + if(partname) + pfree(partname); + partname = NULL; + } + } + + return result; +} + +Oid 
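+/*
+ * Look up the partidx'th child of interval-partition index indexOid by its
+ * generated name; returns InvalidOid if no such child index exists.
+ */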
+RelationGetPartitionIndex(Relation rel, Oid indexOid, int partidx) +{ + char *partidxname = NULL; + Oid partidxoid = InvalidOid; + partidxname = GetPartitionName(indexOid,partidx,true); + partidxoid = get_relname_relid(partidxname,RelationGetNamespace(rel)); + + pfree(partidxname); + partidxname = NULL; + return partidxoid; +} + +Oid +RelationGetPartition(Relation rel, int partidx, bool isindex) +{ + char *partname = NULL; + Oid partoid = InvalidOid; + + partname = GetPartitionName(RelationGetRelid(rel), partidx, isindex); + + partoid = get_relname_relid(partname, RelationGetNamespace(rel)); + + if(partname) + pfree(partname); + partname = NULL; + return partoid; +} + +Bitmapset * +RelationGetPartitionsByQuals(Relation rel, List *strictinfos) +{ + Bitmapset * result; + Bitmapset * temp_bms; + Bitmapset * temp_result; + + ListCell *cell; + RestrictInfo *ele; + result = NULL; + temp_bms = NULL; + temp_result = NULL; + + if(list_length(strictinfos) == 0) + return get_full_pruning_result(rel); + + foreach(cell, strictinfos) + { + ele = (RestrictInfo*)lfirst(cell); + temp_bms = pruning_walker(rel,(Node*)ele); + if(result) + temp_result = bms_intersect(result, temp_bms); + else + temp_result = bms_copy(temp_bms); + bms_free(result); + bms_free(temp_bms); + temp_bms = NULL; + result = temp_result; + } + + return result; +} + +static Bitmapset * +pruning_walker(Relation rel, Node *expr) +{ + Bitmapset * result; + result = NULL; + + switch(nodeTag(expr)) + { + case T_OpExpr: + { + result = pruning_opexpr(rel,(OpExpr*)expr); + } + break; + case T_RestrictInfo: + { + RestrictInfo *restricted = (RestrictInfo *)expr; + result = pruning_walker(rel, (Node *)restricted->clause); + } + break; + case T_BoolExpr: + { + BoolExpr *boolexpr = (BoolExpr*)expr; + switch(boolexpr->boolop) + { + ListCell * cell; + Bitmapset * temp_bms; + Bitmapset * temp_result; + Node *ele; + + temp_bms = NULL; + temp_result = NULL; + case AND_EXPR: + { + foreach(cell,boolexpr->args) + { + ele = (Node*)lfirst(cell); + temp_bms = pruning_walker(rel,ele); + if(result) + temp_result = bms_intersect(result, temp_bms); + else + temp_result = bms_copy(temp_bms); + bms_free(result); + bms_free(temp_bms); + temp_bms = NULL; + result = temp_result; + } + } + break; + case OR_EXPR: + { + foreach(cell,boolexpr->args) + { + ele = (Node*)lfirst(cell); + temp_bms = pruning_walker(rel,ele); + temp_result = bms_union(result, temp_bms); + bms_free(result); + bms_free(temp_bms); + temp_bms = NULL; + result = temp_result; + } + } + break; + case NOT_EXPR: + default: + result = get_full_pruning_result(rel); + break; + } + } + break; + default: + result = get_full_pruning_result(rel); + break; + } + + return result; +} + +static Bitmapset * +pruning_opexpr(Relation rel, OpExpr *expr) +{// #lizard forgives + Bitmapset *result = NULL; + char *opname = NULL; + Node *leftarg = NULL; + Node *rightarg = NULL; + Var *arg_var = NULL; + Const *arg_const = NULL; + bool isswap = false; + int npart; + int partidx; + AttrNumber partkey; + //Oid parttype; + QulificationType qualtype = QULIFICATION_TYPE_EQUAL; + Form_pg_partition_interval routerinfo; + + partkey = RelationGetPartitionColumnIndex(rel); + + //parttype = rel->rd_att->attrs[partkey - 1]->atttypid; + + if(list_length(expr->args) != 2) + return get_full_pruning_result(rel); + + leftarg = (Node *)list_nth(expr->args,0); + rightarg = (Node *)list_nth(expr->args,1); + + if(IsA(leftarg,Var) && IsA(rightarg,Const)) + { + arg_var = (Var *)leftarg; + arg_const = (Const *)rightarg; + } + else 
if(IsA(leftarg,Const) && IsA(rightarg,Var)) + { + arg_var = (Var *)rightarg; + arg_const = (Const *)leftarg; + isswap = true; + } + else + { + return get_full_pruning_result(rel); + } + + if(arg_var->varattno != partkey) + { + return get_full_pruning_result(rel); + } + + opname = get_opname(expr->opno); + + if(strcmp("<",opname) == 0) + { + if(!isswap) + qualtype = QULIFICATION_TYPE_LS; + else + qualtype = QULIFICATION_TYPE_GT; + } + else if(strcmp("<=",opname) == 0) + { + if(!isswap) + qualtype = QULIFICATION_TYPE_LE; + else + qualtype = QULIFICATION_TYPE_GE; + } + else if(strcmp("=",opname) == 0) + { + qualtype = QULIFICATION_TYPE_EQUAL; + } + else if(strcmp(">=",opname) == 0) + { + if(!isswap) + qualtype = QULIFICATION_TYPE_GE; + else + qualtype = QULIFICATION_TYPE_LE; + } + else if(strcmp(">",opname) == 0) + { + if(!isswap) + qualtype = QULIFICATION_TYPE_GT; + else + qualtype = QULIFICATION_TYPE_LS; + } + else + { + /* any other case, get full partitions */ + return get_full_pruning_result(rel); + } + + routerinfo = rel->rd_partitions_info; + + if(!routerinfo) + { + elog(ERROR, "relation[%s] is not a partitioned table", RelationGetRelationName(rel)); + } + + switch(arg_const->consttype) + { + case INT2OID: /* int2 */ + { + int value_int16; + value_int16 = DatumGetInt16(arg_const->constvalue); + partidx = find_partidx_by_int(routerinfo->partstartvalue_int, routerinfo->partinterval_int, + routerinfo->partnparts, (int64)value_int16, qualtype); + } + break; + case INT4OID: /* int4 */ + { + int value_int32; + value_int32 = DatumGetInt32(arg_const->constvalue); + partidx = find_partidx_by_int(routerinfo->partstartvalue_int, routerinfo->partinterval_int, + routerinfo->partnparts, (int64)value_int32, qualtype); + } + break; + case INT8OID: /* int8 */ + { + partidx = find_partidx_by_int(routerinfo->partstartvalue_int, routerinfo->partinterval_int, + routerinfo->partnparts, DatumGetInt64(arg_const->constvalue), qualtype); + } + break; + case TIMESTAMPOID: /* timestamp */ + partidx = find_partidx_by_timestamp(routerinfo->partstartvalue_ts, routerinfo->partinterval_int, + routerinfo->partinterval_type, + routerinfo->partnparts, DatumGetTimestamp(arg_const->constvalue), qualtype); + break; + default: + elog(ERROR, "unsupported const type:[%u]", arg_const->consttype); + } + + npart = RelationGetNParts(rel); + if(npart <= 0) + { + elog(ERROR, "internal error: pruning_opexpr:partitioned table has no partitions"); + } + + if(partidx == PARTITION_ROUTER_RESULT_FULL) + return get_full_pruning_result(rel); + else if(partidx == PARTITION_ROUTER_RESULT_NULL) + return NULL; + else if(partidx >= 0) + { + char *partname = NULL; + Oid partoid = InvalidOid; + + switch(qualtype) + { + case QULIFICATION_TYPE_LS: + case QULIFICATION_TYPE_LE: + { + int i; + for(i = 0; i <= partidx; i++) + { + partname = GetPartitionName(RelationGetRelid(rel), i, false); + partoid = get_relname_relid(partname, RelationGetNamespace(rel)); + if(partoid) + { + result = bms_add_member(result, i); + } + } + } + break; + case QULIFICATION_TYPE_EQUAL: + { + partname = GetPartitionName(RelationGetRelid(rel), partidx, false); + partoid = get_relname_relid(partname, RelationGetNamespace(rel)); + if(partoid) + { + result = bms_make_singleton(partidx); + } + } + break; + case QULIFICATION_TYPE_GE: + case QULIFICATION_TYPE_GT: + { + int i; + for(i = partidx; i < npart; i++) + { + partname = GetPartitionName(RelationGetRelid(rel), i, false); + partoid = get_relname_relid(partname, RelationGetNamespace(rel)); + if(partoid) + { + result = 
bms_add_member(result, i); + } + } + } + break; + default: + //nerver occur + elog(ERROR, "internal error: pruning_opexpr: invalid QulificationType[%d]", qualtype); + } + } + + return result; +} + +static Bitmapset * +get_full_pruning_result(Relation rel) +{ + Bitmapset *result = NULL; + int i = 0; + int nparts = RelationGetNParts(rel); + char *partname = NULL; + Oid partoid = InvalidOid; + + Assert(nparts > 0); + + for(i=0; ibitmapplans; + replace_target_relation((Node *)planlist,targetrel,partitionparent,partidx); + } + break; + case T_BitmapOr: + { + List *planlist; + planlist = ((BitmapOr*)node)->bitmapplans; + replace_target_relation((Node *)planlist,targetrel,partitionparent,partidx); + } + break; + + /* + * scan nodes + */ + case T_TidScan: + case T_SeqScan: + { + SeqScan *seqscan; + seqscan = (SeqScan*)node; + + if(seqscan->ispartchild) + break; + if(seqscan->scanrelid != targetrel) + break; + seqscan->ispartchild = true; + seqscan->childidx = partidx; + } + break; + + case T_IndexScan: + { + IndexScan *indexscan; + indexscan = (IndexScan*)node; + + if(indexscan->scan.ispartchild) + break; + if(indexscan->scan.scanrelid != targetrel) + break; + indexscan->scan.ispartchild = true; + indexscan->scan.childidx = partidx; + indexscan->indexid = RelationGetPartitionIndex(partitionparent,indexscan->indexid,partidx); + } + break; + + case T_IndexOnlyScan: + { + IndexOnlyScan *indexscan; + indexscan = (IndexOnlyScan*)node; + + if(indexscan->scan.ispartchild) + return; + if(indexscan->scan.scanrelid != targetrel) + return; + indexscan->scan.ispartchild = true; + indexscan->scan.childidx = partidx; + indexscan->indexid = RelationGetPartitionIndex(partitionparent,indexscan->indexid,partidx); + } + break; + + case T_BitmapIndexScan: + { + BitmapIndexScan *indexscan; + indexscan = (BitmapIndexScan*)node; + + if(indexscan->scan.ispartchild) + break; + if(indexscan->scan.scanrelid != targetrel) + break; + indexscan->scan.ispartchild = true; + indexscan->scan.childidx = partidx; + indexscan->indexid = RelationGetPartitionIndex(partitionparent,indexscan->indexid,partidx); + } + break; + + case T_BitmapHeapScan: + { + Scan *scan; + scan = (Scan*)node; + + if(scan->ispartchild) + break; + if(scan->scanrelid != targetrel) + break; + + scan->ispartchild = true; + scan->childidx = partidx; + replace_partidx_bitmapheapscan(partitionparent,(Node*)scan->plan.lefttree,partidx); + //replace_target_relation((Node*)scan->scan.plan.lefttree,targetrel,partitionparent,partidx); + } + break; + + case T_SubqueryScan: + break; + + case T_FunctionScan: + case T_ValuesScan: + case T_CteScan: + case T_WorkTableScan: + case T_ForeignScan: + break; + + /* + * join nodes + */ + case T_NestLoop: + case T_MergeJoin: + case T_HashJoin: + { + Plan *join; + join = (Plan*)node; + replace_target_relation((Node*)join->lefttree,targetrel,partitionparent,partidx); + replace_target_relation((Node*)join->righttree,targetrel,partitionparent,partidx); + } + break; + + /* + * materialization nodes + */ + case T_Material: + case T_Sort: + case T_Hash: + { + Plan *mat = (Plan*)node; + replace_target_relation((Node*)mat->lefttree,targetrel,partitionparent,partidx); + } + break; + case T_Group: + case T_Agg: + case T_WindowAgg: + case T_Unique: + case T_SetOp: + case T_LockRows: + case T_Limit: + break; + case T_List: + { + List * list; + ListCell *cell; + Node *element; + + list = (List *)node; + foreach(cell,list) + { + element = (Node*)lfirst(cell); + replace_target_relation(element,targetrel,partitionparent,partidx); + } + } + break; + 
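+		/*
+		 * RemoteSubplan: assign a new internal cursor for the re-targeted
+		 * plan, then recurse into its subtree as for other single-child
+		 * nodes.
+		 */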
case T_RemoteSubplan: + { + RemoteSubplan *plan = (RemoteSubplan *)node; + + plan->cursor = get_internal_cursor(); + + replace_target_relation((Node*)((Plan *)plan)->lefttree,targetrel,partitionparent,partidx); + } + break; + case T_RemoteQuery: + elog(ERROR,"internal error: update partitioned parent table is forbidden in coordinator"); + break; + default: + elog(ERROR, "unrecognized node type: %d", (int) nodeTag(node)); + break; + } +} + +void +replace_partidx_bitmapheapscan(Relation relation, Node *plan, int partidx) +{ + switch(nodeTag(plan)) + { + case T_BitmapAnd: + { + List *planlist; + planlist = ((BitmapAnd*)plan)->bitmapplans; + replace_partidx_bitmapheapscan(relation,(Node*)planlist, partidx); + } + break; + case T_BitmapOr: + { + List *planlist; + planlist = ((BitmapOr*)plan)->bitmapplans; + replace_partidx_bitmapheapscan(relation,(Node*)planlist, partidx); + } + break; + case T_BitmapIndexScan: + { + Scan *sscan; + BitmapIndexScan *idxscan_child; + + sscan = (Scan *)plan; + sscan->ispartchild = true; + sscan->childidx = partidx; + + idxscan_child = (BitmapIndexScan *)plan; + idxscan_child->indexid = RelationGetPartitionIndex(relation,idxscan_child->indexid,partidx); + } + break; + case T_List: + { + List * list; + ListCell *cell; + Node *scan; + + list = (List *)plan; + foreach(cell,list) + { + scan = (Node*)lfirst(cell); + replace_partidx_bitmapheapscan(relation, scan, partidx); + } + } + break; + default: + elog(ERROR, "internal error: BitmapHeapScan cannot have this subplan[%d]", nodeTag(plan)); + break; + } +} + +int32 +get_timestamptz_gap(TimestampTz value, int32 interval) +{ + int32 gap; + fsec_t fsec; + struct pg_tm user_time; + + if(timestamp2tm(value, NULL, &user_time, &fsec, NULL, NULL) != 0) + ereport(ERROR, + (errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE), + errmsg("timestamp out of range"))); + + + switch (interval) + { + case IntervalType_Year: + { + gap = get_monthesofyear(g_partition_base_time.tm_year, g_partition_base_time.tm_mon, + user_time.tm_year, 1); + break; + } + + case IntervalType_Month: + { + gap = get_monthesofyear(g_partition_base_time.tm_year, g_partition_base_time.tm_mon, + user_time.tm_year, user_time.tm_mon); + break; + } + + case IntervalType_Day: + { + gap = get_daysofyear(g_partition_base_time.tm_year, g_partition_base_time.tm_mon, g_partition_base_time.tm_mday, + user_time.tm_year, user_time.tm_mon, user_time.tm_mday); + break; + } + + default: + { + ereport(ERROR, + (errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE), + errmsg("partition interval %d not support hot and cold seperation", interval))); + } + } + return gap; +} + +int32 +get_timestamptz_diff(TimestampTz value, int32 interval) +{ + int32 gap1; + int32 gap2; + TimestampTz current_tmstamp; + fsec_t fsec; + struct pg_tm current_time; + struct pg_tm user_time; + + if(timestamp2tm(value, NULL, &user_time, &fsec, NULL, NULL) != 0) + { + ereport(ERROR, + (errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE), + errmsg("timestamp out of range"))); + } + + current_tmstamp = GetCurrentTimestamp(); + if(timestamp2tm(current_tmstamp, NULL, ¤t_time, &fsec, NULL, NULL) != 0) + { + ereport(ERROR, + (errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE), + errmsg("timestamp out of range"))); + } + + switch (interval) + { + case IntervalType_Month: + { + gap1 = get_monthesofyear(g_partition_base_time.tm_year, g_partition_base_time.tm_mon, + current_time.tm_year, current_time.tm_mon); + + gap2 = get_monthesofyear(g_partition_base_time.tm_year, g_partition_base_time.tm_mon, + user_time.tm_year, user_time.tm_mon); + break; 
+ } + + case IntervalType_Day: + { + gap1 = get_daysofyear(g_partition_base_time.tm_year, g_partition_base_time.tm_mon, g_partition_base_time.tm_mday, + current_time.tm_year, current_time.tm_mon, current_time.tm_mday); + gap2 = get_daysofyear(g_partition_base_time.tm_year, g_partition_base_time.tm_mon, g_partition_base_time.tm_mday, + user_time.tm_year, user_time.tm_mon, user_time.tm_mday); + break; + } + + default: + { + ereport(ERROR, + (errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE), + errmsg("partition interval %d not support hot and cold seperation", interval))); + } + } + + + return gap1 - gap2; +} + +int32 +date_diff(struct pg_tm *user_time) +{ + int32 gap1; + int32 gap2; + TimestampTz current_tmstamp; + fsec_t fsec; + struct pg_tm current_time; + + current_tmstamp = GetCurrentTimestamp(); + if(timestamp2tm(current_tmstamp, NULL, ¤t_time, &fsec, NULL, NULL) != 0) + { + ereport(ERROR, + (errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE), + errmsg("timestamp out of range"))); + } + + gap1 = get_monthesofyear(g_partition_base_time.tm_year, g_partition_base_time.tm_mon, + current_time.tm_year, current_time.tm_mon); + + gap2 = get_monthesofyear(g_partition_base_time.tm_year, g_partition_base_time.tm_mon, + user_time->tm_year, user_time->tm_mon); + + + + return gap1 - gap2; +} + +int32 +date_diff_indays(struct pg_tm *user_time) +{ + int32 gap1; + int32 gap2; + TimestampTz current_tmstamp; + fsec_t fsec; + struct pg_tm current_time; + + current_tmstamp = GetCurrentTimestamp(); + if(timestamp2tm(current_tmstamp, NULL, ¤t_time, &fsec, NULL, NULL) != 0) + { + ereport(ERROR, + (errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE), + errmsg("timestamp out of range"))); + } + + gap1 = get_daysofyear(g_partition_base_time.tm_year, g_partition_base_time.tm_mon, g_partition_base_time.tm_mday, + current_time.tm_year, current_time.tm_mon, current_time.tm_mday); + + + gap2 = get_daysofyear(g_partition_base_time.tm_year, g_partition_base_time.tm_mon, g_partition_base_time.tm_mday, + user_time->tm_year, user_time->tm_mon, user_time->tm_mday); + + return gap1 - gap2; +} + +int get_months_away_from_base(struct pg_tm * user_tm) +{ + return get_monthesofyear(g_partition_base_time.tm_year, g_partition_base_time.tm_mon, + user_tm->tm_year, user_tm->tm_mon); +} + +int get_days_away_from_base(struct pg_tm * user_tm) +{ + return get_daysofyear(g_partition_base_time.tm_year, g_partition_base_time.tm_mon, g_partition_base_time.tm_mday, + user_tm->tm_year, user_tm->tm_mon, user_tm->tm_mday); +} + +bool is_sec_meet_temp_cold_date(TimestampTz secvalue, int32 interval, int step, TimestampTz startValue) +{// #lizard forgives + bool ret; + fsec_t fsec; + struct pg_tm sec_time; + + if(timestamp2tm(secvalue, NULL, &sec_time, &fsec, NULL, NULL) != 0) + { + ereport(ERROR, + (errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE), + errmsg("timestamp out of range"))); + } + + switch (interval) + { + case IntervalType_Year: + { + ret = (g_TempColdDataTime.tm_year == sec_time.tm_year); + break; + } + case IntervalType_Month: + { + ret = (g_TempColdDataTime.tm_year == sec_time.tm_year) + && (g_TempColdDataTime.tm_mon == sec_time.tm_mon); + break; + } + + case IntervalType_Day: + { + ret = (g_TempColdDataTime.tm_year == sec_time.tm_year) + && (g_TempColdDataTime.tm_mon == sec_time.tm_mon) + && (g_TempColdDataTime.tm_mday == sec_time.tm_mday); + if (!ret) + { + struct pg_tm start_time; + + if(timestamp2tm(startValue, NULL, &start_time, &fsec, NULL, NULL) != 0) + { + ereport(ERROR, + (errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE), + errmsg("timestamp out 
of range"))); + } + + ret = is_first_day_from_start(step, interval, &start_time, &sec_time); + if (ret) + { + if (g_TempColdDataTime.tm_year + 1 == sec_time.tm_year && + g_TempColdDataTime.tm_mon == 12 && + g_TempColdDataTime.tm_mday == 31) + { + ret = true; + } + else + { + ret = false; + } + } + } + + break; + } + + default: + { + ereport(ERROR, + (errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE), + errmsg("partition interval %d not support hot and cold seperation", interval))); + } + } + + return ret; +} + +int32 GetPartitionIndex(TimestampTz start, int step, int steptype, int partitions, TimestampTz value) +{ + return find_partidx_by_timestamp(start, step, steptype, partitions, value, QULIFICATION_TYPE_EQUAL); +} + +/* is the first day of next year from start year */ +bool +is_first_day_from_start(int step, int steptype, struct pg_tm *start_time, struct pg_tm *current_time) +{ + bool result = false; + + /* partition by one day */ + if (step == 1 && steptype == IntervalType_Day) + { + if (current_time->tm_year == start_time->tm_year + 1 && current_time->tm_mon == 1 && + current_time->tm_mday == 1) + { + result = true; + } + } + + return result; +} +#endif diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h index 71e853ba..527cb80d 100644 --- a/src/include/nodes/parsenodes.h +++ b/src/include/nodes/parsenodes.h @@ -1456,22 +1456,30 @@ typedef struct OnConflictClause * * We don't currently support the SEARCH or CYCLE clause. */ +typedef enum CTEMaterialize +{ + CTEMaterializeDefault, /* no option specified */ + CTEMaterializeAlways, /* MATERIALIZED */ + CTEMaterializeNever /* NOT MATERIALIZED */ +} CTEMaterialize; + typedef struct CommonTableExpr { - NodeTag type; - char *ctename; /* query name (never qualified) */ - List *aliascolnames; /* optional list of column names */ - /* SelectStmt/InsertStmt/etc before parse analysis, Query afterwards: */ - Node *ctequery; /* the CTE's subquery */ - int location; /* token location, or -1 if unknown */ - /* These fields are set during parse analysis: */ - bool cterecursive; /* is this CTE actually recursive? */ - int cterefcount; /* number of RTEs referencing this CTE - * (excluding internal self-references) */ - List *ctecolnames; /* list of output column names */ - List *ctecoltypes; /* OID list of output column type OIDs */ - List *ctecoltypmods; /* integer list of output column typmods */ - List *ctecolcollations; /* OID list of column collation OIDs */ + NodeTag type; + char *ctename; /* query name (never qualified) */ + List *aliascolnames; /* optional list of column names */ + CTEMaterialize ctematerialized; /* is this an optimization fence? */ + /* SelectStmt/InsertStmt/etc before parse analysis, Query afterwards: */ + Node *ctequery; /* the CTE's subquery */ + int location; /* token location, or -1 if unknown */ + /* These fields are set during parse analysis: */ + bool cterecursive; /* is this CTE actually recursive? 
*/ + int cterefcount; /* number of RTEs referencing this CTE + * (excluding internal self-references) */ + List *ctecolnames; /* list of output column names */ + List *ctecoltypes; /* OID list of output column type OIDs */ + List *ctecoltypmods; /* integer list of output column typmods */ + List *ctecolcollations; /* OID list of column collation OIDs */ } CommonTableExpr; /* Convenience macro to get the output tlist of a CTE's query */ diff --git a/src/test/regress/expected/foreign_key_2.out b/src/test/regress/expected/foreign_key_2.out index 8b8ac8ac..ec92a35b 100644 --- a/src/test/regress/expected/foreign_key_2.out +++ b/src/test/regress/expected/foreign_key_2.out @@ -1373,24 +1373,23 @@ create temp table t1 (a integer primary key, b text); create temp table t2 (a integer, b integer references t1) distribute by hash (b); create rule r1 as on delete to t1 do delete from t2 where t2.b = old.a; explain (costs off) delete from t1 where a = 1; - QUERY PLAN ------------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------------ Remote Subquery Scan on all (datanode_1) -> Delete on t2 -> Nested Loop + -> Remote Subquery Scan on all (datanode_1) + -> Index Scan using t1_pkey on t1 + Index Cond: (a = 1) -> Seq Scan on t2 Filter: (b = 1) - -> Materialize - -> Remote Subquery Scan on all (datanode_1) - -> Index Scan using t1_pkey on t1 - Index Cond: (a = 1) Remote Fast Query Execution Node/s: datanode_1 -> Delete on t1 -> Index Scan using t1_pkey on t1 Index Cond: (a = 1) -(15 rows) +(14 rows) delete from t1 where a = 1; drop rule r1 on t1; diff --git a/src/test/regress/expected/rowsecurity.out b/src/test/regress/expected/rowsecurity.out index 18675344..bfeeedbe 100644 --- a/src/test/regress/expected/rowsecurity.out +++ b/src/test/regress/expected/rowsecurity.out @@ -1934,7 +1934,7 @@ EXPLAIN (COSTS OFF) EXECUTE plancache_test; Filter: (((a % 2) = 0) AND f_leak(b)) (3 rows) -PREPARE plancache_test2 AS WITH q AS (SELECT * FROM z1 WHERE f_leak(b)) SELECT * FROM q,z2; +PREPARE plancache_test2 AS WITH q AS MATERIALIZED (SELECT * FROM z1 WHERE f_leak(b)) SELECT * FROM q,z2; EXPLAIN (COSTS OFF) EXECUTE plancache_test2; QUERY PLAN ----------------------------------------------------------------- @@ -1949,7 +1949,7 @@ EXPLAIN (COSTS OFF) EXECUTE plancache_test2; -> Seq Scan on z2 (9 rows) -PREPARE plancache_test3 AS WITH q AS (SELECT * FROM z2) SELECT * FROM q,z1 WHERE f_leak(z1.b); +PREPARE plancache_test3 AS WITH q AS MATERIALIZED (SELECT * FROM z2) SELECT * FROM q,z1 WHERE f_leak(z1.b); EXPLAIN (COSTS OFF) EXECUTE plancache_test3; QUERY PLAN ----------------------------------------------------------------- diff --git a/src/test/regress/expected/rowsecurity_1.out b/src/test/regress/expected/rowsecurity_1.out index 670e9a06..60160a5a 100644 --- a/src/test/regress/expected/rowsecurity_1.out +++ b/src/test/regress/expected/rowsecurity_1.out @@ -2044,7 +2044,7 @@ EXPLAIN (COSTS OFF) EXECUTE plancache_test; Filter: (((a % 2) = 0) AND f_leak(b)) (3 rows) -PREPARE plancache_test2 AS WITH q AS (SELECT * FROM z1 WHERE f_leak(b)) SELECT * FROM q,z2; +PREPARE plancache_test2 AS WITH q AS MATERIALIZED (SELECT * FROM z1 WHERE f_leak(b)) SELECT * FROM q,z2; EXPLAIN (COSTS OFF) EXECUTE plancache_test2; QUERY PLAN ----------------------------------------------------------------- @@ -2059,7 +2059,7 @@ EXPLAIN (COSTS OFF) EXECUTE plancache_test2; -> Seq Scan on z2 (9 rows) -PREPARE plancache_test3 AS WITH q AS (SELECT * FROM z2) SELECT * FROM q,z1 
WHERE f_leak(z1.b); +PREPARE plancache_test3 AS WITH q AS MATERIALIZED (SELECT * FROM z2) SELECT * FROM q,z1 WHERE f_leak(z1.b); EXPLAIN (COSTS OFF) EXECUTE plancache_test3; QUERY PLAN ----------------------------------------------------------------- @@ -2643,7 +2643,7 @@ ALTER TABLE t1 ENABLE ROW LEVEL SECURITY; GRANT ALL ON t1 TO regress_rls_bob; INSERT INTO t1 (SELECT x, md5(x::text) FROM generate_series(0,20) x); SET SESSION AUTHORIZATION regress_rls_bob; -WITH cte1 AS (SELECT * FROM t1 WHERE f_leak(b) order by 1) SELECT * FROM cte1; +WITH cte1 AS MATERIALIZED (SELECT * FROM t1 WHERE f_leak(b) order by 1) SELECT * FROM cte1; a | b ----+---------------------------------- 0 | cfcd208495d565ef66e7dff9f98764da @@ -2659,7 +2659,8 @@ WITH cte1 AS (SELECT * FROM t1 WHERE f_leak(b) order by 1) SELECT * FROM cte1; 20 | 98f13708210194c475687be6106a3b84 (11 rows) -EXPLAIN (COSTS OFF) WITH cte1 AS (SELECT * FROM t1 WHERE f_leak(b)) SELECT * FROM cte1; +EXPLAIN (COSTS OFF) +WITH cte1 AS MATERIALIZED (SELECT * FROM t1 WHERE f_leak(b)) SELECT * FROM cte1; QUERY PLAN ------------------------------------------------------------- CTE Scan on cte1 diff --git a/src/test/regress/expected/rowtypes.out b/src/test/regress/expected/rowtypes.out index 86df2bcc..c1e107f2 100644 --- a/src/test/regress/expected/rowtypes.out +++ b/src/test/regress/expected/rowtypes.out @@ -696,7 +696,7 @@ from (values (1,row(1,2)), (1,row(null,null)), (1,null), (6 rows) explain (verbose, costs off) -with r(a,b) as +with r(a,b) as materialized (values (1,row(1,2)), (1,row(null,null)), (1,null), (null,row(1,2)), (null,row(null,null)), (null,null) ) select r, r is null as isnull, r is not null as isnotnull from r; @@ -709,7 +709,7 @@ select r, r is null as isnull, r is not null as isnotnull from r; Output: "*VALUES*".column1, "*VALUES*".column2 (5 rows) -with r(a,b) as +with r(a,b) as materialized (values (1,row(1,2)), (1,row(null,null)), (1,null), (null,row(1,2)), (null,row(null,null)), (null,null) ) select r, r is null as isnull, r is not null as isnotnull from r; diff --git a/src/test/regress/expected/rowtypes_1.out b/src/test/regress/expected/rowtypes_1.out index 57671100..b22e63f1 100644 --- a/src/test/regress/expected/rowtypes_1.out +++ b/src/test/regress/expected/rowtypes_1.out @@ -700,7 +700,7 @@ from (values (1,row(1,2)), (1,row(null,null)), (1,null), (6 rows) explain (verbose, costs off) -with r(a,b) as +with r(a,b) as materialized (values (1,row(1,2)), (1,row(null,null)), (1,null), (null,row(1,2)), (null,row(null,null)), (null,null) ) select r, r is null as isnull, r is not null as isnotnull from r; @@ -713,7 +713,7 @@ select r, r is null as isnull, r is not null as isnotnull from r; Output: "*VALUES*".column1, "*VALUES*".column2 (5 rows) -with r(a,b) as +with r(a,b) as materialized (values (1,row(1,2)), (1,row(null,null)), (1,null), (null,row(1,2)), (null,row(null,null)), (null,null) ) select r, r is null as isnull, r is not null as isnotnull from r; diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out index a169bf4a..0d96dff4 100644 --- a/src/test/regress/expected/rules.out +++ b/src/test/regress/expected/rules.out @@ -3168,7 +3168,7 @@ explain (costs off) INSERT INTO hats VALUES ('h8', 'forbidden') RETURNING *; (6 rows) -- ensure upserting into a rule, with a CTE (different offsets!) 
works -WITH data(hat_name, hat_color) AS ( +WITH data(hat_name, hat_color) AS MATERIALIZED ( VALUES ('h8', 'green'), ('h9', 'blue'), ('h7', 'forbidden') @@ -3182,7 +3182,8 @@ RETURNING *; h9 | blue (2 rows) -EXPLAIN (nodes off, costs off) WITH data(hat_name, hat_color) AS ( +EXPLAIN (nodes off, costs off) +WITH data(hat_name, hat_color) AS MATERIALIZED ( VALUES ('h8', 'green'), ('h9', 'blue'), ('h7', 'forbidden') diff --git a/src/test/regress/expected/subselect.out b/src/test/regress/expected/subselect.out index 802e760d..29096c74 100644 --- a/src/test/regress/expected/subselect.out +++ b/src/test/regress/expected/subselect.out @@ -530,12 +530,10 @@ update shipped_view set value = 11 from int4_tbl a join int4_tbl b on (a.f1 = (select f1 from int4_tbl c where c.f1=b.f1)) where ordnum = a.f1; -ERROR: could not plan this distributed update -DETAIL: correlated UPDATE or updating distribution column currently not supported in Postgres-XL. select * from shipped_view; - ttype | ordnum | partnum | value --------+--------+---------+--------- - wt | 0 | 1 | 1234.56 + ttype | ordnum | partnum | value +-------+--------+---------+------- + wt | 0 | 1 | 11 (1 row) select f1, ss1 as relabel from @@ -1158,3 +1156,265 @@ NOTICE: x = 9, y = 13 (3 rows) drop function tattle(x int, y int); + +-- +-- Tests for CTE inlining behavior +-- +-- Basic subquery that can be inlined +explain (verbose, costs off) +with x as (select * from (select f1 from subselect_tbl) ss) +select * from x where f1 = 1; + QUERY PLAN +------------------------------------------ + Remote Subquery Scan on all (datanode_1) + Output: subselect_tbl.f1 + -> Seq Scan on public.subselect_tbl + Output: subselect_tbl.f1 + Filter: (subselect_tbl.f1 = 1) +(5 rows) + +-- Explicitly request materialization +explain (verbose, costs off) +with x as materialized (select * from (select f1 from subselect_tbl) ss) +select * from x where f1 = 1; + QUERY PLAN +------------------------------------------------------------- + CTE Scan on x + Output: x.f1 + Filter: (x.f1 = 1) + CTE x + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: subselect_tbl.f1 + -> Seq Scan on public.subselect_tbl + Output: subselect_tbl.f1 +(8 rows) + +-- Stable functions are safe to inline +explain (verbose, costs off) +with x as (select * from (select f1, now() from subselect_tbl) ss) +select * from x where f1 = 1; + QUERY PLAN +------------------------------------------ + Remote Subquery Scan on all (datanode_1) + Output: subselect_tbl.f1, now() + -> Seq Scan on public.subselect_tbl + Output: subselect_tbl.f1, now() + Filter: (subselect_tbl.f1 = 1) +(5 rows) + +-- Volatile functions prevent inlining +explain (verbose, costs off) +with x as (select * from (select f1, random() from subselect_tbl) ss) +select * from x where f1 = 1; + QUERY PLAN +------------------------------------------------------------- + CTE Scan on x + Output: x.f1, x.random + Filter: (x.f1 = 1) + CTE x + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: f1, random + -> Seq Scan on public.subselect_tbl + Output: subselect_tbl.f1, random() +(8 rows) + +-- SELECT FOR UPDATE cannot be inlined +explain (verbose, costs off) +with x as (select * from (select f1 from subselect_tbl for update) ss) +select * from x where f1 = 1; + QUERY PLAN +-------------------------------------------------------------------------- + CTE Scan on x + Output: x.f1 + Filter: (x.f1 = 1) + CTE x + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: ss.f1 + -> Subquery Scan on ss + Output: ss.f1 + -> 
LockRows + Output: subselect_tbl.f1, subselect_tbl.ctid + -> Seq Scan on public.subselect_tbl + Output: subselect_tbl.f1, subselect_tbl.ctid +(12 rows) + +-- Multiply-referenced CTEs are inlined only when requested +explain (verbose, costs off) +with x as (select * from (select f1, now() as n from subselect_tbl) ss) +select * from x, x x2 where x.n = x2.n; + QUERY PLAN +------------------------------------------------------------- + Merge Join + Output: x.f1, x.n, x2.f1, x2.n + Merge Cond: (x.n = x2.n) + CTE x + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: subselect_tbl.f1, now() + -> Seq Scan on public.subselect_tbl + Output: subselect_tbl.f1, now() + -> Sort + Output: x.f1, x.n + Sort Key: x.n + -> CTE Scan on x + Output: x.f1, x.n + -> Sort + Output: x2.f1, x2.n + Sort Key: x2.n + -> CTE Scan on x x2 + Output: x2.f1, x2.n +(18 rows) + +explain (verbose, costs off) +with x as not materialized (select * from (select f1, now() as n from subselect_tbl) ss) +select * from x, x x2 where x.n = x2.n; + QUERY PLAN +-------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + Output: subselect_tbl.f1, now(), subselect_tbl_1.f1, now() + -> Result + Output: subselect_tbl.f1, (now()), subselect_tbl_1.f1, (now()) + One-Time Filter: (now() = now()) + -> Nested Loop + Output: subselect_tbl.f1, (now()), subselect_tbl_1.f1, (now()) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: subselect_tbl.f1, now() + Distribute results by H: now() + -> Seq Scan on public.subselect_tbl + Output: subselect_tbl.f1, now() + -> Materialize + Output: subselect_tbl_1.f1, (now()) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: subselect_tbl_1.f1, now() + Distribute results by H: now() + -> Seq Scan on public.subselect_tbl subselect_tbl_1 + Output: subselect_tbl_1.f1, now() +(19 rows) + +-- Multiply-referenced CTEs can't be inlined if they contain outer self-refs +explain (verbose, costs off) +with recursive x(a) as + ((values ('a'), ('b')) + union all + (with z as not materialized (select * from x) + select z.a || z1.a as a from z cross join z as z1 + where length(z.a || z1.a) < 5)) +select * from x; + QUERY PLAN +------------------------------------------------------------- + CTE Scan on x + Output: x.a + CTE x + -> Recursive Union + -> Values Scan on "*VALUES*" + Output: "*VALUES*".column1 + -> Nested Loop + Output: (x_1.a || x_2.a) + Join Filter: (length((x_1.a || x_2.a)) < 5) + -> WorkTable Scan on x x_1 + Output: x_1.a + -> WorkTable Scan on x x_2 + Output: x_2.a +(13 rows) + +with recursive x(a) as + ((values ('a'), ('b')) + union all + (with z as not materialized (select * from x) + select z.a || z1.a as a from z cross join z as z1 + where length(z.a || z1.a) < 5)) +select * from x; + a +------ + a + b + ab + abab +(4 rows) + +explain (verbose, costs off) +with recursive x(a) as + ((values ('a'), ('b')) + union all + (with z as not materialized (select * from x) + select z.a || z.a as a from z + where length(z.a || z.a) < 5)) +select * from x; + QUERY PLAN +-------------------------------------------------------- + CTE Scan on x + Output: x.a + CTE x + -> Recursive Union + -> Values Scan on "*VALUES*" + Output: "*VALUES*".column1 + -> WorkTable Scan on x x_1 + Output: (x_1.a || x_1.a) + Filter: (length((x_1.a || x_1.a)) < 5) +(9 rows) + +with recursive x(a) as + ((values ('a'), ('b')) + union all + (with z as not materialized (select * from x) + select z.a || z.a as a from z + where 
length(z.a || z.a) < 5)) +select * from x; + a +------ + a + b + aa + bb + aaaa + bbbb +(6 rows) + +-- Check handling of outer references +explain (verbose, costs off) +with x as (select * from int4_tbl) +select * from (with y as (select * from x) select * from y) ss; + QUERY PLAN +------------------------------------------ + Remote Subquery Scan on all (datanode_1) + Output: f1 + -> Seq Scan on public.int4_tbl + Output: int4_tbl.f1 +(4 rows) + +explain (verbose, costs off) +with x as materialized (select * from int4_tbl) +select * from (with y as (select * from x) select * from y) ss; + QUERY PLAN +-------------------------------------------------- + CTE Scan on x + Output: x.f1 + CTE x + -> Remote Subquery Scan on all (datanode_1) + Output: int4_tbl.f1 + -> Seq Scan on public.int4_tbl + Output: int4_tbl.f1 +(7 rows) + +-- Ensure that we inline the currect CTE when there are +-- multiple CTEs with the same name +explain (verbose, costs off) +with x as (select 1 as y) +select * from (with x as (select 2 as y) select * from x) ss; + QUERY PLAN +------------- + Result + Output: 2 +(2 rows) + +-- Row marks are not pushed into CTEs +explain (verbose, costs off) +with x as (select * from subselect_tbl) +select * from x for update; + QUERY PLAN +---------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + Output: subselect_tbl.f1, subselect_tbl.f2, subselect_tbl.f3 + -> Seq Scan on public.subselect_tbl + Output: subselect_tbl.f1, subselect_tbl.f2, subselect_tbl.f3 +(4 rows) + diff --git a/src/test/regress/expected/xc_for_update_1.out b/src/test/regress/expected/xc_for_update_1.out index 69bd0130..b9de2234 100644 --- a/src/test/regress/expected/xc_for_update_1.out +++ b/src/test/regress/expected/xc_for_update_1.out @@ -390,29 +390,23 @@ ERROR: FOR SHARE is not allowed with joins explain (costs off, num_nodes off, nodes off, verbose on) WITH q1 AS (SELECT * from t1 FOR UPDATE) SELECT * FROM q1,t2 FOR UPDATE; ERROR: FOR UPDATE is not allowed with joins explain (costs off, num_nodes off, nodes off, verbose on) WITH q1 AS (SELECT * from t1) SELECT * FROM q1; - QUERY PLAN ------------------------------------------ - CTE Scan on q1 - Output: q1.val, q1.val2 - CTE q1 - -> Remote Subquery Scan on all - Output: t1.val, t1.val2 - -> Seq Scan on public.t1 - Output: t1.val, t1.val2 -(7 rows) + QUERY PLAN +--------------------------------- + Remote Subquery Scan on all + Output: t1.val, t1.val2 + -> Seq Scan on public.t1 + Output: t1.val, t1.val2 +(4 rows) -- make sure row marks are no ops for queries on WITH tables explain (costs off, num_nodes off, nodes off, verbose on) WITH q1 AS (SELECT * from t1) SELECT * FROM q1 FOR UPDATE; - QUERY PLAN ------------------------------------------ - CTE Scan on q1 - Output: q1.val, q1.val2 - CTE q1 - -> Remote Subquery Scan on all - Output: t1.val, t1.val2 - -> Seq Scan on public.t1 - Output: t1.val, t1.val2 -(7 rows) + QUERY PLAN +--------------------------------- + Remote Subquery Scan on all + Output: t1.val, t1.val2 + -> Seq Scan on public.t1 + Output: t1.val, t1.val2 +(4 rows) explain (costs off, num_nodes off, nodes off, verbose on) WITH q1 AS (SELECT * from t1 FOR UPDATE) SELECT * FROM q1 FOR UPDATE; QUERY PLAN diff --git a/src/test/regress/sql/rowsecurity.sql b/src/test/regress/sql/rowsecurity.sql index a010dc72..bd588af8 100644 --- a/src/test/regress/sql/rowsecurity.sql +++ b/src/test/regress/sql/rowsecurity.sql @@ -847,10 +847,10 @@ EXPLAIN (COSTS OFF) SELECT * FROM z1 WHERE f_leak(b); PREPARE 
plancache_test AS SELECT * FROM z1 WHERE f_leak(b); EXPLAIN (COSTS OFF) EXECUTE plancache_test; -PREPARE plancache_test2 AS WITH q AS (SELECT * FROM z1 WHERE f_leak(b)) SELECT * FROM q,z2; +PREPARE plancache_test2 AS WITH q AS MATERIALIZED (SELECT * FROM z1 WHERE f_leak(b)) SELECT * FROM q,z2; EXPLAIN (COSTS OFF) EXECUTE plancache_test2; -PREPARE plancache_test3 AS WITH q AS (SELECT * FROM z2) SELECT * FROM q,z1 WHERE f_leak(z1.b); +PREPARE plancache_test3 AS WITH q AS MATERIALIZED (SELECT * FROM z2) SELECT * FROM q,z1 WHERE f_leak(z1.b); EXPLAIN (COSTS OFF) EXECUTE plancache_test3; SET ROLE regress_rls_group1; @@ -1078,8 +1078,9 @@ INSERT INTO t1 (SELECT x, md5(x::text) FROM generate_series(0,20) x); SET SESSION AUTHORIZATION regress_rls_bob; -WITH cte1 AS (SELECT * FROM t1 WHERE f_leak(b) order by 1) SELECT * FROM cte1; -EXPLAIN (COSTS OFF) WITH cte1 AS (SELECT * FROM t1 WHERE f_leak(b)) SELECT * FROM cte1; +WITH cte1 AS MATERIALIZED (SELECT * FROM t1 WHERE f_leak(b) order by 1) SELECT * FROM cte1; +EXPLAIN (COSTS OFF) +WITH cte1 AS MATERIALIZED (SELECT * FROM t1 WHERE f_leak(b)) SELECT * FROM cte1; WITH cte1 AS (UPDATE t1 SET a = a + 1 RETURNING *) SELECT * FROM cte1; --fail WITH cte1 AS (UPDATE t1 SET a = a RETURNING *) SELECT * FROM cte1; --ok diff --git a/src/test/regress/sql/rowtypes.sql b/src/test/regress/sql/rowtypes.sql index 4a046c2b..ab7e1488 100644 --- a/src/test/regress/sql/rowtypes.sql +++ b/src/test/regress/sql/rowtypes.sql @@ -306,12 +306,12 @@ from (values (1,row(1,2)), (1,row(null,null)), (1,null), (null,row(1,2)), (null,row(null,null)), (null,null) ) r(a,b); explain (verbose, costs off) -with r(a,b) as +with r(a,b) as materialized (values (1,row(1,2)), (1,row(null,null)), (1,null), (null,row(1,2)), (null,row(null,null)), (null,null) ) select r, r is null as isnull, r is not null as isnotnull from r; -with r(a,b) as +with r(a,b) as materialized (values (1,row(1,2)), (1,row(null,null)), (1,null), (null,row(1,2)), (null,row(null,null)), (null,null) ) select r, r is null as isnull, r is not null as isnotnull from r; diff --git a/src/test/regress/sql/rules.sql b/src/test/regress/sql/rules.sql index 96115bbe..6ebb4cec 100644 --- a/src/test/regress/sql/rules.sql +++ b/src/test/regress/sql/rules.sql @@ -1131,7 +1131,7 @@ SELECT tablename, rulename, definition FROM pg_rules explain (costs off) INSERT INTO hats VALUES ('h8', 'forbidden') RETURNING *; -- ensure upserting into a rule, with a CTE (different offsets!) 
works -WITH data(hat_name, hat_color) AS ( +WITH data(hat_name, hat_color) AS MATERIALIZED ( VALUES ('h8', 'green'), ('h9', 'blue'), ('h7', 'forbidden') @@ -1139,7 +1139,8 @@ WITH data(hat_name, hat_color) AS ( INSERT INTO hats SELECT * FROM data RETURNING *; -EXPLAIN (nodes off, costs off) WITH data(hat_name, hat_color) AS ( +EXPLAIN (nodes off, costs off) +WITH data(hat_name, hat_color) AS MATERIALIZED ( VALUES ('h8', 'green'), ('h9', 'blue'), ('h7', 'forbidden') diff --git a/src/test/regress/sql/subselect.sql b/src/test/regress/sql/subselect.sql index 11b365fe..9b3f974f 100644 --- a/src/test/regress/sql/subselect.sql +++ b/src/test/regress/sql/subselect.sql @@ -598,3 +598,96 @@ select * from where tattle(x, u); drop function tattle(x int, y int); + +-- +-- Tests for CTE inlining behavior +-- + +-- Basic subquery that can be inlined +explain (verbose, costs off) +with x as (select * from (select f1 from subselect_tbl) ss) +select * from x where f1 = 1; + +-- Explicitly request materialization +explain (verbose, costs off) +with x as materialized (select * from (select f1 from subselect_tbl) ss) +select * from x where f1 = 1; + +-- Stable functions are safe to inline +explain (verbose, costs off) +with x as (select * from (select f1, now() from subselect_tbl) ss) +select * from x where f1 = 1; + +-- Volatile functions prevent inlining +explain (verbose, costs off) +with x as (select * from (select f1, random() from subselect_tbl) ss) +select * from x where f1 = 1; + +-- SELECT FOR UPDATE cannot be inlined +explain (verbose, costs off) +with x as (select * from (select f1 from subselect_tbl for update) ss) +select * from x where f1 = 1; + +-- Multiply-referenced CTEs are inlined only when requested +explain (verbose, costs off) +with x as (select * from (select f1, now() as n from subselect_tbl) ss) +select * from x, x x2 where x.n = x2.n; + +explain (verbose, costs off) +with x as not materialized (select * from (select f1, now() as n from subselect_tbl) ss) +select * from x, x x2 where x.n = x2.n; + +-- Multiply-referenced CTEs can't be inlined if they contain outer self-refs +explain (verbose, costs off) +with recursive x(a) as + ((values ('a'), ('b')) + union all + (with z as not materialized (select * from x) + select z.a || z1.a as a from z cross join z as z1 + where length(z.a || z1.a) < 5)) +select * from x; + +with recursive x(a) as + ((values ('a'), ('b')) + union all + (with z as not materialized (select * from x) + select z.a || z1.a as a from z cross join z as z1 + where length(z.a || z1.a) < 5)) +select * from x; + +explain (verbose, costs off) +with recursive x(a) as + ((values ('a'), ('b')) + union all + (with z as not materialized (select * from x) + select z.a || z.a as a from z + where length(z.a || z.a) < 5)) +select * from x; + +with recursive x(a) as + ((values ('a'), ('b')) + union all + (with z as not materialized (select * from x) + select z.a || z.a as a from z + where length(z.a || z.a) < 5)) +select * from x; + +-- Check handling of outer references +explain (verbose, costs off) +with x as (select * from int4_tbl) +select * from (with y as (select * from x) select * from y) ss; + +explain (verbose, costs off) +with x as materialized (select * from int4_tbl) +select * from (with y as (select * from x) select * from y) ss; + +-- Ensure that we inline the currect CTE when there are +-- multiple CTEs with the same name +explain (verbose, costs off) +with x as (select 1 as y) +select * from (with x as (select 2 as y) select * from x) ss; + +-- Row marks are not 
pushed into CTEs +explain (verbose, costs off) +with x as (select * from subselect_tbl) +select * from x for update; From 037c2e524d68a3aa6fdf77ec54e9bee8de6a6928 Mon Sep 17 00:00:00 2001 From: ericxwu Date: Tue, 11 Aug 2020 21:20:27 +0800 Subject: [PATCH 023/578] Prevent inlining of multiply-referenced CTEs with outer recursive refs. This has to be prevented because inlining would result in multiple self-references, which we don't support (and in fact that's disallowed by the SQL spec, see statements about linearly vs. nonlinearly recursive queries). Bug fix for commit 608b167f9. Per report from Yaroslav Schekin (via Andrew Gierth) Discussion: https://postgr.es/m/87wolmg60q.fsf@news-spur.riddles.org.uk --- src/backend/optimizer/plan/subselect.c | 66 +++++++++++++++++++++++++ src/test/regress/expected/subselect.out | 43 ++++++++++++---- 2 files changed, 99 insertions(+), 10 deletions(-) diff --git a/src/backend/optimizer/plan/subselect.c b/src/backend/optimizer/plan/subselect.c index 3a7f8ccf..1ebdfefc 100644 --- a/src/backend/optimizer/plan/subselect.c +++ b/src/backend/optimizer/plan/subselect.c @@ -154,6 +154,8 @@ static bool testexpr_is_hashable(Node *testexpr); static bool hash_ok_operator(OpExpr *expr); static bool contain_dml(Node *node); static bool contain_dml_walker(Node *node, void *context); +static bool contain_outer_selfref(Node *node); +static bool contain_outer_selfref_walker(Node *node, Index *depth); static void inline_cte(PlannerInfo *root, CommonTableExpr *cte); static bool inline_cte_walker(Node *node, inline_cte_walker_context *context); static bool simplify_EXISTS_query(PlannerInfo *root, Query *query); @@ -1298,6 +1300,10 @@ SS_process_ctes(PlannerInfo *root) * SELECT, or containing volatile functions. Inlining might change * the side-effects, which would be bad. * + * 4. The CTE is multiply-referenced and contains a self-reference to + * a recursive CTE outside itself. Inlining would result in multiple + * recursive self-references, which we don't support. + * * Otherwise, we have an option whether to inline or not. That should * always be a win if there's just a single reference, but if the CTE * is multiply-referenced then it's unclear: inlining adds duplicate @@ -1307,6 +1313,9 @@ SS_process_ctes(PlannerInfo *root) * the user express a preference. Our default behavior is to inline * only singly-referenced CTEs, but a CTE marked CTEMaterializeNever * will be inlined even if multiply referenced. + * + * Note: we check for volatile functions last, because that's more + * expensive than the other tests needed. */ if ((cte->ctematerialized == CTEMaterializeNever || (cte->ctematerialized == CTEMaterializeDefault && @@ -1314,6 +1323,8 @@ SS_process_ctes(PlannerInfo *root) !cte->cterecursive && cmdType == CMD_SELECT && !contain_dml(cte->ctequery) && + (cte->cterefcount <= 1 || + !contain_outer_selfref(cte->ctequery)) && !contain_volatile_functions(cte->ctequery)) { inline_cte(root, cte); @@ -1733,6 +1744,61 @@ contain_dml_walker(Node *node, void *context) return expression_tree_walker(node, contain_dml_walker, context); } +/* + * contain_outer_selfref: is there an external recursive self-reference? + */ +static bool +contain_outer_selfref(Node *node) +{ + Index depth = 0; + + /* + * We should be starting with a Query, so that depth will be 1 while + * examining its immediate contents. 
+ */ + Assert(IsA(node, Query)); + + return contain_outer_selfref_walker(node, &depth); +} + +static bool +contain_outer_selfref_walker(Node *node, Index *depth) +{ + if (node == NULL) + return false; + if (IsA(node, RangeTblEntry)) + { + RangeTblEntry *rte = (RangeTblEntry *) node; + + /* + * Check for a self-reference to a CTE that's above the Query that our + * search started at. + */ + if (rte->rtekind == RTE_CTE && + rte->self_reference && + rte->ctelevelsup >= *depth) + return true; + return false; /* allow range_table_walker to continue */ + } + if (IsA(node, Query)) + { + /* Recurse into subquery, tracking nesting depth properly */ + Query *query = (Query *) node; + bool result; + + (*depth)++; + + result = query_tree_walker(query, contain_outer_selfref_walker, + (void *) depth, QTW_EXAMINE_RTES_BEFORE); + + (*depth)--; + + return result; + } + return expression_tree_walker(node, contain_outer_selfref_walker, + (void *) depth); +} + /* * inline_cte: convert RTE_CTE references to given CTE into RTE_SUBQUERYs */ diff --git a/src/test/regress/expected/subselect.out b/src/test/regress/expected/subselect.out index 29096c74..c480d768 100644 --- a/src/test/regress/expected/subselect.out +++ b/src/test/regress/expected/subselect.out @@ -1299,8 +1299,8 @@ with recursive x(a) as select z.a || z1.a as a from z cross join z as z1 where length(z.a || z1.a) < 5)) select * from x; - QUERY PLAN -------------------------------------------------------------- + QUERY PLAN +---------------------------------------------------------- CTE Scan on x Output: x.a CTE x @@ -1308,13 +1308,18 @@ select * from x; -> Values Scan on "*VALUES*" Output: "*VALUES*".column1 -> Nested Loop - Output: (x_1.a || x_2.a) - Join Filter: (length((x_1.a || x_2.a)) < 5) - -> WorkTable Scan on x x_1 - Output: x_1.a - -> WorkTable Scan on x x_2 - Output: x_2.a -(13 rows) + Output: (z.a || z1.a) + Join Filter: (length((z.a || z1.a)) < 5) + CTE z + -> WorkTable Scan on x x_1 + Output: x_1.a + -> CTE Scan on z + Output: z.a + -> Materialize + Output: z1.a + -> CTE Scan on z z1 + Output: z1.a +(18 rows) with recursive x(a) as ((values ('a'), ('b')) @@ -1327,9 +1332,27 @@ select * from x; ------ a b + aa ab + ba + bb + aaaa + aaab + aaba + aabb + abaa abab -(4 rows) + abba + abbb + baaa + baab + baba + babb + bbaa + bbab + bbba + bbbb +(22 rows) explain (verbose, costs off) with recursive x(a) as From dac9ce9ea7da01d083cf86dfe78954d05303abff Mon Sep 17 00:00:00 2001 From: ericxwu Date: Thu, 13 Aug 2020 12:55:02 +0800 Subject: [PATCH 024/578] Refine UPDATE/DELETE join distribution rules Remove the replication_level restriction since we have to do the replicate for UPDATE/DELETE anyway --- src/backend/optimizer/util/pathnode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index 9bbf6040..ad966eb2 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -2679,7 +2679,7 @@ set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) if (resultRelLoc == RESULT_REL_INNER && pathnode->jointype != JOIN_LEFT && pathnode->jointype != JOIN_FULL && pathnode->jointype != JOIN_SEMI && pathnode->jointype != JOIN_ANTI && - nRemotePlans_outer < replication_level && !pathnode->inner_unique) + !pathnode->inner_unique) { /* Replicate outer */ pathnode->outerjoinpath = redistribute_path( @@ -2697,7 +2697,7 @@ set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) } else if (resultRelLoc == 
RESULT_REL_OUTER &&
             pathnode->jointype != JOIN_RIGHT &&
             pathnode->jointype != JOIN_FULL &&
-            nRemotePlans_outer < replication_level && !pathnode->inner_unique)
+            !pathnode->inner_unique)
     {
         /* Replicate inner */
         pathnode->innerjoinpath = redistribute_path(

From cb79f573feb76dc0dc4e045cbe1f74f40a851e96 Mon Sep 17 00:00:00 2001
From: ericxwu
Date: Tue, 18 Aug 2020 19:05:02 +0800
Subject: [PATCH 025/578] Add GUC setting enable_sampling_analyze to choose the analyze method

---
 src/backend/utils/misc/guc.c  | 11 +++++++++++
 src/include/commands/vacuum.h |  6 +++---
 2 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 5ee53910..ad023691 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -2097,6 +2097,17 @@ static struct config_bool ConfigureNamesBool[] =
         NULL, NULL, NULL
     },

+    {
+        {
+            "enable_sampling_analyze", PGC_SIGHUP, STATS_COLLECTOR,
+            gettext_noop("use sampling method to do analyze on coordinator."),
+            NULL
+        },
+        &enable_sampling_analyze,
+        false,
+        NULL, NULL, NULL
+    },
+
     {
         {
             "enable_pgbouncer", PGC_SIGHUP, STATS_COLLECTOR,
diff --git a/src/include/commands/vacuum.h b/src/include/commands/vacuum.h
index 9ceed0ce..356efa52 100644
--- a/src/include/commands/vacuum.h
+++ b/src/include/commands/vacuum.h
@@ -218,9 +218,9 @@ extern int vacuum_freeze_table_age;
 extern int vacuum_multixact_freeze_min_age;
 extern int vacuum_multixact_freeze_table_age;
 #ifdef __TBASE__
-extern bool enable_sampling_analyze;
-extern bool distributed_query_analyze;
-extern bool explain_query_analyze;
+extern bool enable_sampling_analyze;
+extern bool distributed_query_analyze;
+extern bool explain_query_analyze;
 /* max number of queries collected */
 #define MAX_DISTRIBUTED_QUERIES 512

From 228f4c4d77698d82e285b3792f5cac9e4cfd3b69 Mon Sep 17 00:00:00 2001
From: ericxwu
Date: Mon, 20 Jul 2020 20:56:18 +0800
Subject: [PATCH 026/578] Add subset extended statistics

Some of our customers have data in which two columns are strongly
correlated. Postgres extended statistics do support functional
dependencies, but that machinery only applies to '=' clauses and is
derived from per-column stats, so it does not cover these cases well.
It is quite common for users to build their primary key or distribution
key by a simple transformation of a business data column; as a result,
forcing query push-down by the distribution key can make the combined
predicate look far more selective than it really is. Thus we introduce
the 'subset' multi-column statistics kind as a hint to the optimizer.
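
For illustration, a minimal sketch of the intended usage. The table and
column names are hypothetical; only the 'subset' statistics kind, the
exactly-two-columns restriction, and the fact that the declared column
order is preserved and meaningful come from this patch, and the
particular order shown below is just an example:

    -- dist_key is derived from business_id by a simple transformation,
    -- so the two equality clauses below are redundant rather than
    -- independent; without a hint the planner multiplies their
    -- selectivities and under-estimates the resulting row count.
    CREATE TABLE orders (dist_key bigint, business_id bigint, payload text)
        DISTRIBUTE BY HASH (dist_key);
    CREATE STATISTICS orders_subset (subset) ON business_id, dist_key FROM orders;
    SELECT * FROM orders WHERE business_id = 42 AND dist_key = 4200;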
--- src/backend/commands/statscmds.c | 638 ++++++++++-------- src/backend/optimizer/path/clausesel.c | 462 +++++++------ src/backend/optimizer/util/plancat.c | 120 ++-- src/backend/statistics/Makefile | 2 +- src/backend/statistics/dependencies.c | 317 ++++----- src/backend/statistics/extended_stats.c | 438 +++++++----- src/backend/statistics/subset.c | 360 ++++++++++ src/include/catalog/pg_statistic_ext.h | 43 +- .../statistics/extended_stats_internal.h | 5 +- src/include/statistics/statistics.h | 24 +- src/test/regress/expected/stats_ext_2.out | 95 +++ src/test/regress/expected/stats_ext_3.out | 95 +++ src/test/regress/sql/stats_ext.sql | 47 ++ 13 files changed, 1710 insertions(+), 936 deletions(-) create mode 100644 src/backend/statistics/subset.c diff --git a/src/backend/commands/statscmds.c b/src/backend/commands/statscmds.c index 6ea6a323..8fefe73b 100644 --- a/src/backend/commands/statscmds.c +++ b/src/backend/commands/statscmds.c @@ -47,301 +47,349 @@ compare_int16(const void *a, const void *b) */ ObjectAddress CreateStatistics(CreateStatsStmt *stmt) -{// #lizard forgives - int16 attnums[STATS_MAX_DIMENSIONS]; - int numcols = 0; - char *namestr; - NameData stxname; - Oid statoid; - Oid namespaceId; - Oid stxowner = GetUserId(); - HeapTuple htup; - Datum values[Natts_pg_statistic_ext]; - bool nulls[Natts_pg_statistic_ext]; - int2vector *stxkeys; - Relation statrel; - Relation rel = NULL; - Oid relid; - ObjectAddress parentobject, - myself; - Datum types[2]; /* one for each possible type of statistic */ - int ntypes; - ArrayType *stxkind; - bool build_ndistinct; - bool build_dependencies; - bool requested_type = false; - int i; - ListCell *cell; - - Assert(IsA(stmt, CreateStatsStmt)); - - /* resolve the pieces of the name (namespace etc.) */ - namespaceId = QualifiedNameGetCreationNamespace(stmt->defnames, &namestr); - namestrcpy(&stxname, namestr); - - /* - * Deal with the possibility that the statistics object already exists. - */ - if (SearchSysCacheExists2(STATEXTNAMENSP, - NameGetDatum(&stxname), - ObjectIdGetDatum(namespaceId))) - { - if (stmt->if_not_exists) - { - ereport(NOTICE, - (errcode(ERRCODE_DUPLICATE_OBJECT), - errmsg("statistics object \"%s\" already exists, skipping", - namestr))); - return InvalidObjectAddress; - } - - ereport(ERROR, - (errcode(ERRCODE_DUPLICATE_OBJECT), - errmsg("statistics object \"%s\" already exists", namestr))); - } - - /* - * Examine the FROM clause. Currently, we only allow it to be a single - * simple table, but later we'll probably allow multiple tables and JOIN - * syntax. The grammar is already prepared for that, so we have to check - * here that what we got is what we can support. - */ - if (list_length(stmt->relations) != 1) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("only a single relation is allowed in CREATE STATISTICS"))); - - foreach(cell, stmt->relations) - { - Node *rln = (Node *) lfirst(cell); - - if (!IsA(rln, RangeVar)) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("only a single relation is allowed in CREATE STATISTICS"))); - - /* - * CREATE STATISTICS will influence future execution plans but does - * not interfere with currently executing plans. So it should be - * enough to take only ShareUpdateExclusiveLock on relation, - * conflicting with ANALYZE and other DDL that sets statistical - * information, but not with normal queries. 
- */ - rel = relation_openrv((RangeVar *) rln, ShareUpdateExclusiveLock); - - /* Restrict to allowed relation types */ - if (rel->rd_rel->relkind != RELKIND_RELATION && - rel->rd_rel->relkind != RELKIND_MATVIEW && - rel->rd_rel->relkind != RELKIND_FOREIGN_TABLE && - rel->rd_rel->relkind != RELKIND_PARTITIONED_TABLE) - ereport(ERROR, - (errcode(ERRCODE_WRONG_OBJECT_TYPE), - errmsg("relation \"%s\" is not a table, foreign table, or materialized view", - RelationGetRelationName(rel)))); - - /* You must own the relation to create stats on it */ - if (!pg_class_ownercheck(RelationGetRelid(rel), stxowner)) - aclcheck_error(ACLCHECK_NOT_OWNER, ACL_KIND_CLASS, - RelationGetRelationName(rel)); - } - - Assert(rel); - relid = RelationGetRelid(rel); - - /* - * Currently, we only allow simple column references in the expression - * list. That will change someday, and again the grammar already supports - * it so we have to enforce restrictions here. For now, we can convert - * the expression list to a simple array of attnums. While at it, enforce - * some constraints. - */ - foreach(cell, stmt->exprs) - { - Node *expr = (Node *) lfirst(cell); - ColumnRef *cref; - char *attname; - HeapTuple atttuple; - Form_pg_attribute attForm; - TypeCacheEntry *type; - - if (!IsA(expr, ColumnRef)) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("only simple column references are allowed in CREATE STATISTICS"))); - cref = (ColumnRef *) expr; - - if (list_length(cref->fields) != 1) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("only simple column references are allowed in CREATE STATISTICS"))); - attname = strVal((Value *) linitial(cref->fields)); - - atttuple = SearchSysCacheAttName(relid, attname); - if (!HeapTupleIsValid(atttuple)) - ereport(ERROR, - (errcode(ERRCODE_UNDEFINED_COLUMN), - errmsg("column \"%s\" referenced in statistics does not exist", - attname))); - attForm = (Form_pg_attribute) GETSTRUCT(atttuple); - - /* Disallow use of system attributes in extended stats */ - if (attForm->attnum <= 0) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("statistics creation on system columns is not supported"))); - - /* Disallow data types without a less-than operator */ - type = lookup_type_cache(attForm->atttypid, TYPECACHE_LT_OPR); - if (type->lt_opr == InvalidOid) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("column \"%s\" cannot be used in statistics because its type has no default btree operator class", - attname))); - - /* Make sure no more than STATS_MAX_DIMENSIONS columns are used */ - if (numcols >= STATS_MAX_DIMENSIONS) - ereport(ERROR, - (errcode(ERRCODE_TOO_MANY_COLUMNS), - errmsg("cannot have more than %d columns in statistics", - STATS_MAX_DIMENSIONS))); - - attnums[numcols] = attForm->attnum; - numcols++; - ReleaseSysCache(atttuple); - } - - /* - * Check that at least two columns were specified in the statement. The - * upper bound was already checked in the loop above. - */ - if (numcols < 2) - ereport(ERROR, - (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), - errmsg("extended statistics require at least 2 columns"))); - - /* - * Sort the attnums, which makes detecting duplicates somewhat easier, and - * it does not hurt (it does not affect the efficiency, unlike for - * indexes, for example). - */ - qsort(attnums, numcols, sizeof(int16), compare_int16); - - /* - * Check for duplicates in the list of columns. The attnums are sorted so - * just check consecutive elements. 
- */ - for (i = 1; i < numcols; i++) - { - if (attnums[i] == attnums[i - 1]) - ereport(ERROR, - (errcode(ERRCODE_DUPLICATE_COLUMN), - errmsg("duplicate column name in statistics definition"))); - } - - /* Form an int2vector representation of the sorted column list */ - stxkeys = buildint2vector(attnums, numcols); - - /* - * Parse the statistics types. - */ - build_ndistinct = false; - build_dependencies = false; - foreach(cell, stmt->stat_types) - { - char *type = strVal((Value *) lfirst(cell)); - - if (strcmp(type, "ndistinct") == 0) - { - build_ndistinct = true; - requested_type = true; - } - else if (strcmp(type, "dependencies") == 0) - { - build_dependencies = true; - requested_type = true; - } - else - ereport(ERROR, - (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("unrecognized statistic type \"%s\"", - type))); - } - /* If no statistic type was specified, build them all. */ - if (!requested_type) - { - build_ndistinct = true; - build_dependencies = true; - } - - /* construct the char array of enabled statistic types */ - ntypes = 0; - if (build_ndistinct) - types[ntypes++] = CharGetDatum(STATS_EXT_NDISTINCT); - if (build_dependencies) - types[ntypes++] = CharGetDatum(STATS_EXT_DEPENDENCIES); - Assert(ntypes > 0 && ntypes <= lengthof(types)); - stxkind = construct_array(types, ntypes, CHAROID, 1, true, 'c'); - - /* - * Everything seems fine, so let's build the pg_statistic_ext tuple. - */ - memset(values, 0, sizeof(values)); - memset(nulls, false, sizeof(nulls)); - values[Anum_pg_statistic_ext_stxrelid - 1] = ObjectIdGetDatum(relid); - values[Anum_pg_statistic_ext_stxname - 1] = NameGetDatum(&stxname); - values[Anum_pg_statistic_ext_stxnamespace - 1] = ObjectIdGetDatum(namespaceId); - values[Anum_pg_statistic_ext_stxowner - 1] = ObjectIdGetDatum(stxowner); - values[Anum_pg_statistic_ext_stxkeys - 1] = PointerGetDatum(stxkeys); - values[Anum_pg_statistic_ext_stxkind - 1] = PointerGetDatum(stxkind); - - /* no statistics built yet */ - nulls[Anum_pg_statistic_ext_stxndistinct - 1] = true; - nulls[Anum_pg_statistic_ext_stxdependencies - 1] = true; - - /* insert it into pg_statistic_ext */ - statrel = heap_open(StatisticExtRelationId, RowExclusiveLock); - htup = heap_form_tuple(statrel->rd_att, values, nulls); - statoid = CatalogTupleInsert(statrel, htup); - heap_freetuple(htup); - relation_close(statrel, RowExclusiveLock); - - /* - * Invalidate relcache so that others see the new statistics object. - */ - CacheInvalidateRelcache(rel); - - relation_close(rel, NoLock); - - /* - * Add an AUTO dependency on each column used in the stats, so that the - * stats object goes away if any or all of them get dropped. - */ - ObjectAddressSet(myself, StatisticExtRelationId, statoid); - - for (i = 0; i < numcols; i++) - { - ObjectAddressSubSet(parentobject, RelationRelationId, relid, attnums[i]); - recordDependencyOn(&myself, &parentobject, DEPENDENCY_AUTO); - } - - /* - * Also add dependencies on namespace and owner. These are required - * because the stats object might have a different namespace and/or owner - * than the underlying table(s). - */ - ObjectAddressSet(parentobject, NamespaceRelationId, namespaceId); - recordDependencyOn(&myself, &parentobject, DEPENDENCY_NORMAL); - - recordDependencyOnOwner(StatisticExtRelationId, statoid, stxowner); - - /* - * XXX probably there should be a recordDependencyOnCurrentExtension call - * here too, but we'd have to add support for ALTER EXTENSION ADD/DROP - * STATISTICS, which is more work than it seems worth. 
- */ - - /* Return stats object's address */ - return myself; +{ + int16 attnums[STATS_MAX_DIMENSIONS]; +#ifdef __TBASE__ + int16 attnums_ori[STATS_MAX_DIMENSIONS]; +#endif + int numcols = 0; + char *namestr; + NameData stxname; + Oid statoid; + Oid namespaceId; + Oid stxowner = GetUserId(); + HeapTuple htup; + Datum values[Natts_pg_statistic_ext]; + bool nulls[Natts_pg_statistic_ext]; + int2vector *stxkeys; + Relation statrel; + Relation rel = NULL; + Oid relid; + ObjectAddress parentobject, + myself; + Datum types[2]; /* one for each possible type of statistic */ + int ntypes; + ArrayType *stxkind; + bool build_ndistinct; + bool build_dependencies; +#ifdef __TBASE__ + bool build_subset; +#endif + bool requested_type = false; + int i; + ListCell *cell; + + Assert(IsA(stmt, CreateStatsStmt)); + + /* resolve the pieces of the name (namespace etc.) */ + namespaceId = QualifiedNameGetCreationNamespace(stmt->defnames, &namestr); + namestrcpy(&stxname, namestr); + + /* + * Deal with the possibility that the statistics object already exists. + */ + if (SearchSysCacheExists2(STATEXTNAMENSP, + NameGetDatum(&stxname), + ObjectIdGetDatum(namespaceId))) + { + if (stmt->if_not_exists) + { + ereport(NOTICE, + (errcode(ERRCODE_DUPLICATE_OBJECT), + errmsg("statistics object \"%s\" already exists, skipping", + namestr))); + return InvalidObjectAddress; + } + + ereport(ERROR, + (errcode(ERRCODE_DUPLICATE_OBJECT), + errmsg("statistics object \"%s\" already exists", namestr))); + } + + /* + * Examine the FROM clause. Currently, we only allow it to be a single + * simple table, but later we'll probably allow multiple tables and JOIN + * syntax. The grammar is already prepared for that, so we have to check + * here that what we got is what we can support. + */ + if (list_length(stmt->relations) != 1) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("only a single relation is allowed in CREATE STATISTICS"))); + + foreach(cell, stmt->relations) + { + Node *rln = (Node *) lfirst(cell); + + if (!IsA(rln, RangeVar)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("only a single relation is allowed in CREATE STATISTICS"))); + + /* + * CREATE STATISTICS will influence future execution plans but does + * not interfere with currently executing plans. So it should be + * enough to take only ShareUpdateExclusiveLock on relation, + * conflicting with ANALYZE and other DDL that sets statistical + * information, but not with normal queries. + */ + rel = relation_openrv((RangeVar *) rln, ShareUpdateExclusiveLock); + + /* Restrict to allowed relation types */ + if (rel->rd_rel->relkind != RELKIND_RELATION && + rel->rd_rel->relkind != RELKIND_MATVIEW && + rel->rd_rel->relkind != RELKIND_FOREIGN_TABLE && + rel->rd_rel->relkind != RELKIND_PARTITIONED_TABLE) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("relation \"%s\" is not a table, foreign table, or materialized view", + RelationGetRelationName(rel)))); + + /* You must own the relation to create stats on it */ + if (!pg_class_ownercheck(RelationGetRelid(rel), stxowner)) + aclcheck_error(ACLCHECK_NOT_OWNER, ACL_KIND_CLASS, + RelationGetRelationName(rel)); + } + + Assert(rel); + relid = RelationGetRelid(rel); + + /* + * Currently, we only allow simple column references in the expression + * list. That will change someday, and again the grammar already supports + * it so we have to enforce restrictions here. For now, we can convert + * the expression list to a simple array of attnums. 
While at it, enforce + * some constraints. + */ + foreach(cell, stmt->exprs) + { + Node *expr = (Node *) lfirst(cell); + ColumnRef *cref; + char *attname; + HeapTuple atttuple; + Form_pg_attribute attForm; + TypeCacheEntry *type; + + if (!IsA(expr, ColumnRef)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("only simple column references are allowed in CREATE STATISTICS"))); + cref = (ColumnRef *) expr; + + if (list_length(cref->fields) != 1) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("only simple column references are allowed in CREATE STATISTICS"))); + attname = strVal((Value *) linitial(cref->fields)); + + atttuple = SearchSysCacheAttName(relid, attname); + if (!HeapTupleIsValid(atttuple)) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_COLUMN), + errmsg("column \"%s\" referenced in statistics does not exist", + attname))); + attForm = (Form_pg_attribute) GETSTRUCT(atttuple); + + /* Disallow use of system attributes in extended stats */ + if (attForm->attnum <= 0) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("statistics creation on system columns is not supported"))); + + /* Disallow data types without a less-than operator */ + type = lookup_type_cache(attForm->atttypid, TYPECACHE_LT_OPR); + if (type->lt_opr == InvalidOid) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("column \"%s\" cannot be used in statistics because its type has no default btree operator class", + attname))); + + /* Make sure no more than STATS_MAX_DIMENSIONS columns are used */ + if (numcols >= STATS_MAX_DIMENSIONS) + ereport(ERROR, + (errcode(ERRCODE_TOO_MANY_COLUMNS), + errmsg("cannot have more than %d columns in statistics", + STATS_MAX_DIMENSIONS))); + + attnums[numcols] = attForm->attnum; +#ifdef __TBASE__ + attnums_ori[numcols] = attForm->attnum; +#endif + numcols++; + ReleaseSysCache(atttuple); + } + + /* + * Check that at least two columns were specified in the statement. The + * upper bound was already checked in the loop above. + */ + if (numcols < 2) + ereport(ERROR, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("extended statistics require at least 2 columns"))); + + /* + * Sort the attnums, which makes detecting duplicates somewhat easier, and + * it does not hurt (it does not affect the efficiency, unlike for + * indexes, for example). + */ + qsort(attnums, numcols, sizeof(int16), compare_int16); + + /* + * Check for duplicates in the list of columns. The attnums are sorted so + * just check consecutive elements. + */ + for (i = 1; i < numcols; i++) + { + if (attnums[i] == attnums[i - 1]) + ereport(ERROR, + (errcode(ERRCODE_DUPLICATE_COLUMN), + errmsg("duplicate column name in statistics definition"))); + } + + /* Form an int2vector representation of the sorted column list */ + stxkeys = buildint2vector(attnums, numcols); + + /* + * Parse the statistics types. 
+ */ + build_ndistinct = false; + build_dependencies = false; +#ifdef __TBASE__ + build_subset = false; +#endif + foreach(cell, stmt->stat_types) + { + char *type = strVal((Value *) lfirst(cell)); + + if (strcmp(type, "ndistinct") == 0) + { + build_ndistinct = true; + requested_type = true; + } + else if (strcmp(type, "dependencies") == 0) + { + build_dependencies = true; + requested_type = true; + } +#ifdef __TBASE__ + else if (strcmp(type, "subset") == 0) + { + if (list_length(stmt->exprs) != 2) + { + ereport(ERROR, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("subset statistics require exactly 2 columns"))); + } + + build_subset = true; + requested_type = true; + + /* + * The original stmt expr order implies the relation between them, + * thus we need to keep the original order stored. + */ + stxkeys = buildint2vector(attnums_ori, numcols); + } +#endif + else + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("unrecognized statistic type \"%s\"", + type))); + } + /* If no statistic type was specified, build them all. */ + if (!requested_type) + { + build_ndistinct = true; + build_dependencies = true; +#ifdef __TBASE__ + /* No need to build user defined knowledge */ + build_subset = false; +#endif + } + + /* construct the char array of enabled statistic types */ + ntypes = 0; + if (build_ndistinct) + types[ntypes++] = CharGetDatum(STATS_EXT_NDISTINCT); + if (build_dependencies) + types[ntypes++] = CharGetDatum(STATS_EXT_DEPENDENCIES); +#ifdef __TBASE__ + /* + * User defined subset hint should not coexists with other + * types. Thus we don't need to extend the size of 'types' + * array. + */ + if (build_subset) + types[ntypes++] = CharGetDatum(STATS_EXT_SUBSET); +#endif + Assert(ntypes > 0 && ntypes <= lengthof(types)); + stxkind = construct_array(types, ntypes, CHAROID, 1, true, 'c'); + + /* + * Everything seems fine, so let's build the pg_statistic_ext tuple. + */ + memset(values, 0, sizeof(values)); + memset(nulls, false, sizeof(nulls)); + values[Anum_pg_statistic_ext_stxrelid - 1] = ObjectIdGetDatum(relid); + values[Anum_pg_statistic_ext_stxname - 1] = NameGetDatum(&stxname); + values[Anum_pg_statistic_ext_stxnamespace - 1] = ObjectIdGetDatum(namespaceId); + values[Anum_pg_statistic_ext_stxowner - 1] = ObjectIdGetDatum(stxowner); + values[Anum_pg_statistic_ext_stxkeys - 1] = PointerGetDatum(stxkeys); + values[Anum_pg_statistic_ext_stxkind - 1] = PointerGetDatum(stxkind); + + /* no statistics built yet */ + nulls[Anum_pg_statistic_ext_stxndistinct - 1] = true; + nulls[Anum_pg_statistic_ext_stxdependencies - 1] = true; +#ifdef __TBASE__ + nulls[Anum_pg_statistic_ext_stxsubset - 1] = true; +#endif + + /* insert it into pg_statistic_ext */ + statrel = heap_open(StatisticExtRelationId, RowExclusiveLock); + htup = heap_form_tuple(statrel->rd_att, values, nulls); + statoid = CatalogTupleInsert(statrel, htup); + heap_freetuple(htup); + relation_close(statrel, RowExclusiveLock); + + /* + * Invalidate relcache so that others see the new statistics object. + */ + CacheInvalidateRelcache(rel); + + relation_close(rel, NoLock); + + /* + * Add an AUTO dependency on each column used in the stats, so that the + * stats object goes away if any or all of them get dropped. + */ + ObjectAddressSet(myself, StatisticExtRelationId, statoid); + + for (i = 0; i < numcols; i++) + { + ObjectAddressSubSet(parentobject, RelationRelationId, relid, attnums[i]); + recordDependencyOn(&myself, &parentobject, DEPENDENCY_AUTO); + } + + /* + * Also add dependencies on namespace and owner. 
These are required + * because the stats object might have a different namespace and/or owner + * than the underlying table(s). + */ + ObjectAddressSet(parentobject, NamespaceRelationId, namespaceId); + recordDependencyOn(&myself, &parentobject, DEPENDENCY_NORMAL); + + recordDependencyOnOwner(StatisticExtRelationId, statoid, stxowner); + + /* + * XXX probably there should be a recordDependencyOnCurrentExtension call + * here too, but we'd have to add support for ALTER EXTENSION ADD/DROP + * STATISTICS, which is more work than it seems worth. + */ + + /* Return stats object's address */ + return myself; } /* diff --git a/src/backend/optimizer/path/clausesel.c b/src/backend/optimizer/path/clausesel.c index e0b06c13..8e6e1670 100644 --- a/src/backend/optimizer/path/clausesel.c +++ b/src/backend/optimizer/path/clausesel.c @@ -97,230 +97,244 @@ static RelOptInfo *find_single_rel_for_clauses(PlannerInfo *root, */ Selectivity clauselist_selectivity(PlannerInfo *root, - List *clauses, - int varRelid, - JoinType jointype, - SpecialJoinInfo *sjinfo) -{// #lizard forgives - Selectivity s1 = 1.0; - RelOptInfo *rel; - Bitmapset *estimatedclauses = NULL; - RangeQueryClause *rqlist = NULL; - ListCell *l; - int listidx; - - /* - * If there's exactly one clause, just go directly to - * clause_selectivity(). None of what we might do below is relevant. - */ - if (list_length(clauses) == 1) - return clause_selectivity(root, (Node *) linitial(clauses), - varRelid, jointype, sjinfo); - - /* - * Determine if these clauses reference a single relation. If so, and if - * it has extended statistics, try to apply those. - */ - rel = find_single_rel_for_clauses(root, clauses); - if (rel && rel->rtekind == RTE_RELATION && rel->statlist != NIL) - { - /* - * Perform selectivity estimations on any clauses found applicable by - * dependencies_clauselist_selectivity. 'estimatedclauses' will be - * filled with the 0-based list positions of clauses used that way, so - * that we can ignore them below. - */ - s1 *= dependencies_clauselist_selectivity(root, clauses, varRelid, - jointype, sjinfo, rel, - &estimatedclauses); - - /* - * This would be the place to apply any other types of extended - * statistics selectivity estimations for remaining clauses. - */ - } - - /* - * Apply normal selectivity estimates for remaining clauses. We'll be - * careful to skip any clauses which were already estimated above. - * - * Anything that doesn't look like a potential rangequery clause gets - * multiplied into s1 and forgotten. Anything that does gets inserted into - * an rqlist entry. - */ - listidx = -1; - foreach(l, clauses) - { - Node *clause = (Node *) lfirst(l); - RestrictInfo *rinfo; - Selectivity s2; - - listidx++; - - /* - * Skip this clause if it's already been estimated by some other - * statistics above. - */ - if (bms_is_member(listidx, estimatedclauses)) - continue; - - /* Always compute the selectivity using clause_selectivity */ - s2 = clause_selectivity(root, clause, varRelid, jointype, sjinfo); - - /* - * Check for being passed a RestrictInfo. - * - * If it's a pseudoconstant RestrictInfo, then s2 is either 1.0 or - * 0.0; just use that rather than looking for range pairs. - */ - if (IsA(clause, RestrictInfo)) - { - rinfo = (RestrictInfo *) clause; - if (rinfo->pseudoconstant) - { - s1 = s1 * s2; - continue; - } - clause = (Node *) rinfo->clause; - } - else - rinfo = NULL; - - /* - * See if it looks like a restriction clause with a pseudoconstant on - * one side. 
(Anything more complicated than that might not behave in - * the simple way we are expecting.) Most of the tests here can be - * done more efficiently with rinfo than without. - */ - if (is_opclause(clause) && list_length(((OpExpr *) clause)->args) == 2) - { - OpExpr *expr = (OpExpr *) clause; - bool varonleft = true; - bool ok; - - if (rinfo) - { - ok = (bms_membership(rinfo->clause_relids) == BMS_SINGLETON) && - (is_pseudo_constant_clause_relids(lsecond(expr->args), - rinfo->right_relids) || - (varonleft = false, - is_pseudo_constant_clause_relids(linitial(expr->args), - rinfo->left_relids))); - } - else - { - ok = (NumRelids(clause) == 1) && - (is_pseudo_constant_clause(lsecond(expr->args)) || - (varonleft = false, - is_pseudo_constant_clause(linitial(expr->args)))); - } - - if (ok) - { - /* - * If it's not a "<" or ">" operator, just merge the - * selectivity in generically. But if it's the right oprrest, - * add the clause to rqlist for later processing. - */ - switch (get_oprrest(expr->opno)) - { - case F_SCALARLTSEL: - addRangeClause(&rqlist, clause, - varonleft, true, s2); - break; - case F_SCALARGTSEL: - addRangeClause(&rqlist, clause, - varonleft, false, s2); - break; - default: - /* Just merge the selectivity in generically */ - s1 = s1 * s2; - break; - } - continue; /* drop to loop bottom */ - } - } - - /* Not the right form, so treat it generically. */ - s1 = s1 * s2; - } - - /* - * Now scan the rangequery pair list. - */ - while (rqlist != NULL) - { - RangeQueryClause *rqnext; - - if (rqlist->have_lobound && rqlist->have_hibound) - { - /* Successfully matched a pair of range clauses */ - Selectivity s2; - - /* - * Exact equality to the default value probably means the - * selectivity function punted. This is not airtight but should - * be good enough. - */ - if (rqlist->hibound == DEFAULT_INEQ_SEL || - rqlist->lobound == DEFAULT_INEQ_SEL) - { - s2 = DEFAULT_RANGE_INEQ_SEL; - } - else - { - s2 = rqlist->hibound + rqlist->lobound - 1.0; - - /* Adjust for double-exclusion of NULLs */ - s2 += nulltestsel(root, IS_NULL, rqlist->var, - varRelid, jointype, sjinfo); - - /* - * A zero or slightly negative s2 should be converted into a - * small positive value; we probably are dealing with a very - * tight range and got a bogus result due to roundoff errors. - * However, if s2 is very negative, then we probably have - * default selectivity estimates on one or both sides of the - * range that we failed to recognize above for some reason. - */ - if (s2 <= 0.0) - { - if (s2 < -0.01) - { - /* - * No data available --- use a default estimate that - * is small, but not real small. - */ - s2 = DEFAULT_RANGE_INEQ_SEL; - } - else - { - /* - * It's just roundoff error; use a small positive - * value - */ - s2 = 1.0e-10; - } - } - } - /* Merge in the selectivity of the pair of clauses */ - s1 *= s2; - } - else - { - /* Only found one of a pair, merge it in generically */ - if (rqlist->have_lobound) - s1 *= rqlist->lobound; - else - s1 *= rqlist->hibound; - } - /* release storage and advance */ - rqnext = rqlist->next; - pfree(rqlist); - rqlist = rqnext; - } - - return s1; + List *clauses, + int varRelid, + JoinType jointype, + SpecialJoinInfo *sjinfo) +{ + Selectivity s1 = 1.0; + RelOptInfo *rel; + Bitmapset *estimatedclauses = NULL; + RangeQueryClause *rqlist = NULL; + ListCell *l; + int listidx; + + /* + * If there's exactly one clause, just go directly to + * clause_selectivity(). None of what we might do below is relevant. 
+ */ + if (list_length(clauses) == 1) + return clause_selectivity(root, (Node *) linitial(clauses), + varRelid, jointype, sjinfo); + + /* + * Determine if these clauses reference a single relation. If so, and if + * it has extended statistics, try to apply those. + */ + rel = find_single_rel_for_clauses(root, clauses); + if (rel && rel->rtekind == RTE_RELATION && rel->statlist != NIL) + { +#ifdef __TBASE__ + /* + * Perform subset eliminations on any clauses found applicable by + * subset_clauselist_selectivity. Subset dependencies got higher + * priority over statistic-based dependencies. 'estimatedclauses' + * will be filled with the 0-based list positions of clauses used + * that way, so that we can ignore them below in both dependencies + * selectivity calculation and independent basic selectivity + * calculation. + */ + s1 *= subset_clauselist_selectivity(root, clauses, varRelid, jointype, + sjinfo, rel, &estimatedclauses); +#endif + + /* + * Perform selectivity estimations on any clauses found applicable by + * dependencies_clauselist_selectivity. 'estimatedclauses' will be + * filled with the 0-based list positions of clauses used that way, so + * that we can ignore them below. + */ + s1 *= dependencies_clauselist_selectivity(root, clauses, varRelid, + jointype, sjinfo, rel, + &estimatedclauses); + + /* + * This would be the place to apply any other types of extended + * statistics selectivity estimations for remaining clauses. + */ + } + + /* + * Apply normal selectivity estimates for remaining clauses. We'll be + * careful to skip any clauses which were already estimated above. + * + * Anything that doesn't look like a potential rangequery clause gets + * multiplied into s1 and forgotten. Anything that does gets inserted into + * an rqlist entry. + */ + listidx = -1; + foreach(l, clauses) + { + Node *clause = (Node *) lfirst(l); + RestrictInfo *rinfo; + Selectivity s2; + + listidx++; + + /* + * Skip this clause if it's already been estimated by some other + * statistics above. + */ + if (bms_is_member(listidx, estimatedclauses)) + continue; + + /* Always compute the selectivity using clause_selectivity */ + s2 = clause_selectivity(root, clause, varRelid, jointype, sjinfo); + + /* + * Check for being passed a RestrictInfo. + * + * If it's a pseudoconstant RestrictInfo, then s2 is either 1.0 or + * 0.0; just use that rather than looking for range pairs. + */ + if (IsA(clause, RestrictInfo)) + { + rinfo = (RestrictInfo *) clause; + if (rinfo->pseudoconstant) + { + s1 = s1 * s2; + continue; + } + clause = (Node *) rinfo->clause; + } + else + rinfo = NULL; + + /* + * See if it looks like a restriction clause with a pseudoconstant on + * one side. (Anything more complicated than that might not behave in + * the simple way we are expecting.) Most of the tests here can be + * done more efficiently with rinfo than without. 
+ */ + if (is_opclause(clause) && list_length(((OpExpr *) clause)->args) == 2) + { + OpExpr *expr = (OpExpr *) clause; + bool varonleft = true; + bool ok; + + if (rinfo) + { + ok = (bms_membership(rinfo->clause_relids) == BMS_SINGLETON) && + (is_pseudo_constant_clause_relids(lsecond(expr->args), + rinfo->right_relids) || + (varonleft = false, + is_pseudo_constant_clause_relids(linitial(expr->args), + rinfo->left_relids))); + } + else + { + ok = (NumRelids(clause) == 1) && + (is_pseudo_constant_clause(lsecond(expr->args)) || + (varonleft = false, + is_pseudo_constant_clause(linitial(expr->args)))); + } + + if (ok) + { + /* + * If it's not a "<" or ">" operator, just merge the + * selectivity in generically. But if it's the right oprrest, + * add the clause to rqlist for later processing. + */ + switch (get_oprrest(expr->opno)) + { + case F_SCALARLTSEL: + addRangeClause(&rqlist, clause, + varonleft, true, s2); + break; + case F_SCALARGTSEL: + addRangeClause(&rqlist, clause, + varonleft, false, s2); + break; + default: + /* Just merge the selectivity in generically */ + s1 = s1 * s2; + break; + } + continue; /* drop to loop bottom */ + } + } + + /* Not the right form, so treat it generically. */ + s1 = s1 * s2; + } + + /* + * Now scan the rangequery pair list. + */ + while (rqlist != NULL) + { + RangeQueryClause *rqnext; + + if (rqlist->have_lobound && rqlist->have_hibound) + { + /* Successfully matched a pair of range clauses */ + Selectivity s2; + + /* + * Exact equality to the default value probably means the + * selectivity function punted. This is not airtight but should + * be good enough. + */ + if (rqlist->hibound == DEFAULT_INEQ_SEL || + rqlist->lobound == DEFAULT_INEQ_SEL) + { + s2 = DEFAULT_RANGE_INEQ_SEL; + } + else + { + s2 = rqlist->hibound + rqlist->lobound - 1.0; + + /* Adjust for double-exclusion of NULLs */ + s2 += nulltestsel(root, IS_NULL, rqlist->var, + varRelid, jointype, sjinfo); + + /* + * A zero or slightly negative s2 should be converted into a + * small positive value; we probably are dealing with a very + * tight range and got a bogus result due to roundoff errors. + * However, if s2 is very negative, then we probably have + * default selectivity estimates on one or both sides of the + * range that we failed to recognize above for some reason. + */ + if (s2 <= 0.0) + { + if (s2 < -0.01) + { + /* + * No data available --- use a default estimate that + * is small, but not real small. 
+ */ + s2 = DEFAULT_RANGE_INEQ_SEL; + } + else + { + /* + * It's just roundoff error; use a small positive + * value + */ + s2 = 1.0e-10; + } + } + } + /* Merge in the selectivity of the pair of clauses */ + s1 *= s2; + } + else + { + /* Only found one of a pair, merge it in generically */ + if (rqlist->have_lobound) + s1 *= rqlist->lobound; + else + s1 *= rqlist->hibound; + } + /* release storage and advance */ + rqnext = rqlist->next; + pfree(rqlist); + rqlist = rqnext; + } + + return s1; } /* diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c index d831e03d..a1248e65 100644 --- a/src/backend/optimizer/util/plancat.c +++ b/src/backend/optimizer/util/plancat.c @@ -1405,65 +1405,79 @@ get_relation_constraints(PlannerInfo *root, static List * get_relation_statistics(RelOptInfo *rel, Relation relation) { - List *statoidlist; - List *stainfos = NIL; - ListCell *l; - - statoidlist = RelationGetStatExtList(relation); - - foreach(l, statoidlist) - { - Oid statOid = lfirst_oid(l); - Form_pg_statistic_ext staForm; - HeapTuple htup; - Bitmapset *keys = NULL; - int i; - - htup = SearchSysCache1(STATEXTOID, ObjectIdGetDatum(statOid)); - if (!htup) - elog(ERROR, "cache lookup failed for statistics object %u", statOid); - staForm = (Form_pg_statistic_ext) GETSTRUCT(htup); - - /* - * First, build the array of columns covered. This is ultimately - * wasted if no stats within the object have actually been built, but - * it doesn't seem worth troubling over that case. - */ - for (i = 0; i < staForm->stxkeys.dim1; i++) - keys = bms_add_member(keys, staForm->stxkeys.values[i]); - - /* add one StatisticExtInfo for each kind built */ - if (statext_is_kind_built(htup, STATS_EXT_NDISTINCT)) - { - StatisticExtInfo *info = makeNode(StatisticExtInfo); + List *statoidlist; + List *stainfos = NIL; + ListCell *l; + + statoidlist = RelationGetStatExtList(relation); + + foreach(l, statoidlist) + { + Oid statOid = lfirst_oid(l); + Form_pg_statistic_ext staForm; + HeapTuple htup; + Bitmapset *keys = NULL; + int i; + + htup = SearchSysCache1(STATEXTOID, ObjectIdGetDatum(statOid)); + if (!htup) + elog(ERROR, "cache lookup failed for statistics object %u", statOid); + staForm = (Form_pg_statistic_ext) GETSTRUCT(htup); + + /* + * First, build the array of columns covered. This is ultimately + * wasted if no stats within the object have actually been built, but + * it doesn't seem worth troubling over that case. 
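
The range-pair merge in clauselist_selectivity above can be checked with a small standalone computation. The input selectivities and the NULL fraction below are assumptions, and 0.005 stands in for DEFAULT_RANGE_INEQ_SEL.

    #include <stdio.h>

    /* Minimal sketch of the range-pair merge (constants are assumptions, not
     * taken from the patch): lobound is the selectivity of "x > 10", hibound
     * of "x < 100", nullfrac is the column's NULL fraction. */
    static double
    range_pair_selectivity(double lobound, double hibound, double nullfrac)
    {
        double s2 = hibound + lobound - 1.0;

        s2 += nullfrac;          /* undo the double exclusion of NULLs */
        if (s2 <= 0.0)
            s2 = (s2 < -0.01) ? 0.005 : 1.0e-10;   /* 0.005 ~ DEFAULT_RANGE_INEQ_SEL */
        return s2;
    }

    int main(void)
    {
        printf("%g\n", range_pair_selectivity(0.7, 0.45, 0.10));   /* 0.25 */
        return 0;
    }
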
+ */ + for (i = 0; i < staForm->stxkeys.dim1; i++) + keys = bms_add_member(keys, staForm->stxkeys.values[i]); + + /* add one StatisticExtInfo for each kind built */ + if (statext_is_kind_built(htup, STATS_EXT_NDISTINCT)) + { + StatisticExtInfo *info = makeNode(StatisticExtInfo); + + info->statOid = statOid; + info->rel = rel; + info->kind = STATS_EXT_NDISTINCT; + info->keys = bms_copy(keys); + + stainfos = lcons(info, stainfos); + } + + if (statext_is_kind_built(htup, STATS_EXT_DEPENDENCIES)) + { + StatisticExtInfo *info = makeNode(StatisticExtInfo); + + info->statOid = statOid; + info->rel = rel; + info->kind = STATS_EXT_DEPENDENCIES; + info->keys = bms_copy(keys); + + stainfos = lcons(info, stainfos); + } - info->statOid = statOid; - info->rel = rel; - info->kind = STATS_EXT_NDISTINCT; - info->keys = bms_copy(keys); - - stainfos = lcons(info, stainfos); - } - - if (statext_is_kind_built(htup, STATS_EXT_DEPENDENCIES)) - { - StatisticExtInfo *info = makeNode(StatisticExtInfo); +#ifdef __TBASE__ + if (statext_is_kind_built(htup, STATS_EXT_SUBSET)) + { + StatisticExtInfo *info = makeNode(StatisticExtInfo); - info->statOid = statOid; - info->rel = rel; - info->kind = STATS_EXT_DEPENDENCIES; - info->keys = bms_copy(keys); + info->statOid = statOid; + info->rel = rel; + info->kind = STATS_EXT_SUBSET; + info->keys = bms_copy(keys); - stainfos = lcons(info, stainfos); - } + stainfos = lcons(info, stainfos); + } +#endif - ReleaseSysCache(htup); - bms_free(keys); - } + ReleaseSysCache(htup); + bms_free(keys); + } - list_free(statoidlist); + list_free(statoidlist); - return stainfos; + return stainfos; } /* diff --git a/src/backend/statistics/Makefile b/src/backend/statistics/Makefile index 3404e455..b9cc0290 100644 --- a/src/backend/statistics/Makefile +++ b/src/backend/statistics/Makefile @@ -12,6 +12,6 @@ subdir = src/backend/statistics top_builddir = ../../.. include $(top_builddir)/src/Makefile.global -OBJS = extended_stats.o dependencies.o mvdistinct.o +OBJS = extended_stats.o dependencies.o mvdistinct.o subset.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/statistics/dependencies.c b/src/backend/statistics/dependencies.c index c828e935..2863517d 100644 --- a/src/backend/statistics/dependencies.c +++ b/src/backend/statistics/dependencies.c @@ -909,159 +909,166 @@ find_strongest_dependency(StatisticExtInfo *stats, MVDependencies *dependencies, */ Selectivity dependencies_clauselist_selectivity(PlannerInfo *root, - List *clauses, - int varRelid, - JoinType jointype, - SpecialJoinInfo *sjinfo, - RelOptInfo *rel, - Bitmapset **estimatedclauses) -{// #lizard forgives - Selectivity s1 = 1.0; - ListCell *l; - Bitmapset *clauses_attnums = NULL; - StatisticExtInfo *stat; - MVDependencies *dependencies; - AttrNumber *list_attnums; - int listidx; - - /* check if there's any stats that might be useful for us. */ - if (!has_stats_of_kind(rel->statlist, STATS_EXT_DEPENDENCIES)) - return 1.0; - - list_attnums = (AttrNumber *) palloc(sizeof(AttrNumber) * - list_length(clauses)); - - /* - * Pre-process the clauses list to extract the attnums seen in each item. - * We need to determine if there's any clauses which will be useful for - * dependency selectivity estimations. Along the way we'll record all of - * the attnums for each clause in a list which we'll reference later so we - * don't need to repeat the same work again. We'll also keep track of all - * attnums seen. 
- */ - listidx = 0; - foreach(l, clauses) - { - Node *clause = (Node *) lfirst(l); - AttrNumber attnum; - - if (dependency_is_compatible_clause(clause, rel->relid, &attnum)) - { - list_attnums[listidx] = attnum; - clauses_attnums = bms_add_member(clauses_attnums, attnum); - } - else - list_attnums[listidx] = InvalidAttrNumber; - - listidx++; - } - - /* - * If there's not at least two distinct attnums then reject the whole list - * of clauses. We must return 1.0 so the calling function's selectivity is - * unaffected. - */ - if (bms_num_members(clauses_attnums) < 2) - { - pfree(list_attnums); - return 1.0; - } - - /* find the best suited statistics object for these attnums */ - stat = choose_best_statistics(rel->statlist, clauses_attnums, - STATS_EXT_DEPENDENCIES); - - /* if no matching stats could be found then we've nothing to do */ - if (!stat) - { - pfree(list_attnums); - return 1.0; - } - - /* load the dependency items stored in the statistics object */ - dependencies = statext_dependencies_load(stat->statOid); - - /* - * Apply the dependencies recursively, starting with the widest/strongest - * ones, and proceeding to the smaller/weaker ones. At the end of each - * round we factor in the selectivity of clauses on the implied attribute, - * and remove the clauses from the list. - */ - while (true) - { - Selectivity s2 = 1.0; - MVDependency *dependency; - - /* the widest/strongest dependency, fully matched by clauses */ - dependency = find_strongest_dependency(stat, dependencies, - clauses_attnums); - - /* if no suitable dependency was found, we're done */ - if (!dependency) - break; - - /* - * We found an applicable dependency, so find all the clauses on the - * implied attribute - with dependency (a,b => c) we look for clauses - * on 'c'. - */ - listidx = -1; - foreach(l, clauses) - { - Node *clause; - - listidx++; - - /* - * Skip incompatible clauses, and ones we've already estimated on. - */ - if (list_attnums[listidx] == InvalidAttrNumber || - bms_is_member(listidx, *estimatedclauses)) - continue; - - /* - * Technically we could find more than one clause for a given - * attnum. Since these clauses must be equality clauses, we choose - * to only take the selectivity estimate from the final clause in - * the list for this attnum. If the attnum happens to be compared - * to a different Const in another clause then no rows will match - * anyway. If it happens to be compared to the same Const, then - * ignoring the additional clause is just the thing to do. - */ - if (dependency_implies_attribute(dependency, - list_attnums[listidx])) - { - clause = (Node *) lfirst(l); - - s2 = clause_selectivity(root, clause, varRelid, jointype, - sjinfo); - - /* mark this one as done, so we don't touch it again. */ - *estimatedclauses = bms_add_member(*estimatedclauses, listidx); - - /* - * Mark that we've got and used the dependency on this clause. - * We'll want to ignore this when looking for the next - * strongest dependency above. - */ - clauses_attnums = bms_del_member(clauses_attnums, - list_attnums[listidx]); - } - } - - /* - * Now factor in the selectivity for all the "implied" clauses into - * the final one, using this formula: - * - * P(a,b) = P(a) * (f + (1-f) * P(b)) - * - * where 'f' is the degree of validity of the dependency. 
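
Plugging in some assumed numbers makes the behaviour of this formula at the extremes visible: with per-clause selectivities of 0.02 each, a degree of 0 reproduces the independence assumption, while a degree of 1 collapses the estimate to the implying clause alone. None of the figures below come from the patch.

    #include <stdio.h>

    int main(void)
    {
        /* Assumed per-clause selectivities (1 in 50 distinct values each) and
         * assumed dependency degrees. */
        double p_a = 0.02, p_b = 0.02;
        double f;

        for (f = 0.0; f <= 1.0; f += 0.5)
            printf("degree %.1f -> P(a,b) = %g\n", f, p_a * (f + (1.0 - f) * p_b));
        /* degree 0.0 -> 0.0004 (independent), 0.5 -> 0.0102, 1.0 -> 0.02 (fully implied) */
        return 0;
    }
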
- */ - s1 *= (dependency->degree + (1 - dependency->degree) * s2); - } - - pfree(dependencies); - pfree(list_attnums); - - return s1; + List *clauses, + int varRelid, + JoinType jointype, + SpecialJoinInfo *sjinfo, + RelOptInfo *rel, + Bitmapset **estimatedclauses) +{ + Selectivity s1 = 1.0; + ListCell *l; + Bitmapset *clauses_attnums = NULL; + StatisticExtInfo *stat; + MVDependencies *dependencies; + AttrNumber *list_attnums; + int listidx; + + /* check if there's any stats that might be useful for us. */ + if (!has_stats_of_kind(rel->statlist, STATS_EXT_DEPENDENCIES)) + return 1.0; + + list_attnums = (AttrNumber *) palloc(sizeof(AttrNumber) * + list_length(clauses)); + + /* + * Pre-process the clauses list to extract the attnums seen in each item. + * We need to determine if there's any clauses which will be useful for + * dependency selectivity estimations. Along the way we'll record all of + * the attnums for each clause in a list which we'll reference later so we + * don't need to repeat the same work again. We'll also keep track of all + * attnums seen. + */ + listidx = 0; + foreach(l, clauses) + { + Node *clause = (Node *) lfirst(l); + AttrNumber attnum; + +#ifdef __TBASE__ + /* Could eliminated by the prior subset dependency */ + if (bms_is_member(listidx, *estimatedclauses)) + { + list_attnums[listidx] = InvalidAttrNumber; + } +#endif + else if (dependency_is_compatible_clause(clause, rel->relid, &attnum)) + { + list_attnums[listidx] = attnum; + clauses_attnums = bms_add_member(clauses_attnums, attnum); + } + else + list_attnums[listidx] = InvalidAttrNumber; + + listidx++; + } + + /* + * If there's not at least two distinct attnums then reject the whole list + * of clauses. We must return 1.0 so the calling function's selectivity is + * unaffected. + */ + if (bms_num_members(clauses_attnums) < 2) + { + pfree(list_attnums); + return 1.0; + } + + /* find the best suited statistics object for these attnums */ + stat = choose_best_statistics(rel->statlist, clauses_attnums, + STATS_EXT_DEPENDENCIES); + + /* if no matching stats could be found then we've nothing to do */ + if (!stat) + { + pfree(list_attnums); + return 1.0; + } + + /* load the dependency items stored in the statistics object */ + dependencies = statext_dependencies_load(stat->statOid); + + /* + * Apply the dependencies recursively, starting with the widest/strongest + * ones, and proceeding to the smaller/weaker ones. At the end of each + * round we factor in the selectivity of clauses on the implied attribute, + * and remove the clauses from the list. + */ + while (true) + { + Selectivity s2 = 1.0; + MVDependency *dependency; + + /* the widest/strongest dependency, fully matched by clauses */ + dependency = find_strongest_dependency(stat, dependencies, + clauses_attnums); + + /* if no suitable dependency was found, we're done */ + if (!dependency) + break; + + /* + * We found an applicable dependency, so find all the clauses on the + * implied attribute - with dependency (a,b => c) we look for clauses + * on 'c'. + */ + listidx = -1; + foreach(l, clauses) + { + Node *clause; + + listidx++; + + /* + * Skip incompatible clauses, and ones we've already estimated on. + */ + if (list_attnums[listidx] == InvalidAttrNumber || + bms_is_member(listidx, *estimatedclauses)) + continue; + + /* + * Technically we could find more than one clause for a given + * attnum. Since these clauses must be equality clauses, we choose + * to only take the selectivity estimate from the final clause in + * the list for this attnum. 
If the attnum happens to be compared + * to a different Const in another clause then no rows will match + * anyway. If it happens to be compared to the same Const, then + * ignoring the additional clause is just the thing to do. + */ + if (dependency_implies_attribute(dependency, + list_attnums[listidx])) + { + clause = (Node *) lfirst(l); + + s2 = clause_selectivity(root, clause, varRelid, jointype, + sjinfo); + + /* mark this one as done, so we don't touch it again. */ + *estimatedclauses = bms_add_member(*estimatedclauses, listidx); + + /* + * Mark that we've got and used the dependency on this clause. + * We'll want to ignore this when looking for the next + * strongest dependency above. + */ + clauses_attnums = bms_del_member(clauses_attnums, + list_attnums[listidx]); + } + } + + /* + * Now factor in the selectivity for all the "implied" clauses into + * the final one, using this formula: + * + * P(a,b) = P(a) * (f + (1-f) * P(b)) + * + * where 'f' is the degree of validity of the dependency. + */ + s1 *= (dependency->degree + (1 - dependency->degree) * s2); + } + + pfree(dependencies); + pfree(list_attnums); + + return s1; } diff --git a/src/backend/statistics/extended_stats.c b/src/backend/statistics/extended_stats.c index f0b11dd1..f1346f64 100644 --- a/src/backend/statistics/extended_stats.c +++ b/src/backend/statistics/extended_stats.c @@ -40,11 +40,14 @@ */ typedef struct StatExtEntry { - Oid statOid; /* OID of pg_statistic_ext entry */ - char *schema; /* statistics object's schema */ - char *name; /* statistics object's name */ - Bitmapset *columns; /* attribute numbers covered by the object */ - List *types; /* 'char' list of enabled statistic kinds */ + Oid statOid; /* OID of pg_statistic_ext entry */ + char *schema; /* statistics object's schema */ + char *name; /* statistics object's name */ + Bitmapset *columns; /* attribute numbers covered by the object */ + List *types; /* 'char' list of enabled statistic kinds */ +#ifdef __TBASE__ + List *orderedColumns; /* attribute numbers in order of dependency */ +#endif } StatExtEntry; @@ -52,8 +55,11 @@ static List *fetch_statentries_for_relation(Relation pg_statext, Oid relid); static VacAttrStats **lookup_var_attr_stats(Relation rel, Bitmapset *attrs, int nvacatts, VacAttrStats **vacatts); static void statext_store(Relation pg_stext, Oid relid, - MVNDistinct *ndistinct, MVDependencies *dependencies, - VacAttrStats **stats); + MVNDistinct *ndistinct, MVDependencies *dependencies, +#ifdef __TBASE__ + MVDependencies *subset, +#endif + VacAttrStats **stats); /* @@ -68,70 +74,83 @@ BuildRelationExtStatistics(Relation onerel, double totalrows, int numrows, HeapTuple *rows, int natts, VacAttrStats **vacattrstats) { - Relation pg_stext; - ListCell *lc; - List *stats; - MemoryContext cxt; - MemoryContext oldcxt; - - cxt = AllocSetContextCreate(CurrentMemoryContext, "stats ext", - ALLOCSET_DEFAULT_SIZES); - oldcxt = MemoryContextSwitchTo(cxt); - - pg_stext = heap_open(StatisticExtRelationId, RowExclusiveLock); - stats = fetch_statentries_for_relation(pg_stext, RelationGetRelid(onerel)); - - foreach(lc, stats) - { - StatExtEntry *stat = (StatExtEntry *) lfirst(lc); - MVNDistinct *ndistinct = NULL; - MVDependencies *dependencies = NULL; - VacAttrStats **stats; - ListCell *lc2; - - /* - * Check if we can build these stats based on the column analyzed. If - * not, report this fact (except in autovacuum) and move on. 
- */ - stats = lookup_var_attr_stats(onerel, stat->columns, - natts, vacattrstats); - if (!stats && !IsAutoVacuumWorkerProcess()) - { - ereport(WARNING, - (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), - errmsg("statistics object \"%s.%s\" could not be computed for relation \"%s.%s\"", - stat->schema, stat->name, - get_namespace_name(onerel->rd_rel->relnamespace), - RelationGetRelationName(onerel)), - errtable(onerel))); - continue; - } - - /* check allowed number of dimensions */ - Assert(bms_num_members(stat->columns) >= 2 && - bms_num_members(stat->columns) <= STATS_MAX_DIMENSIONS); - - /* compute statistic of each requested type */ - foreach(lc2, stat->types) - { - char t = (char) lfirst_int(lc2); - - if (t == STATS_EXT_NDISTINCT) - ndistinct = statext_ndistinct_build(totalrows, numrows, rows, - stat->columns, stats); - else if (t == STATS_EXT_DEPENDENCIES) - dependencies = statext_dependencies_build(numrows, rows, - stat->columns, stats); - } - - /* store the statistics in the catalog */ - statext_store(pg_stext, stat->statOid, ndistinct, dependencies, stats); - } - - heap_close(pg_stext, RowExclusiveLock); - - MemoryContextSwitchTo(oldcxt); - MemoryContextDelete(cxt); + Relation pg_stext; + ListCell *lc; + List *stats; + MemoryContext cxt; + MemoryContext oldcxt; + + cxt = AllocSetContextCreate(CurrentMemoryContext, "stats ext", + ALLOCSET_DEFAULT_SIZES); + oldcxt = MemoryContextSwitchTo(cxt); + + pg_stext = heap_open(StatisticExtRelationId, RowExclusiveLock); + stats = fetch_statentries_for_relation(pg_stext, RelationGetRelid(onerel)); + + foreach(lc, stats) + { + StatExtEntry *stat = (StatExtEntry *) lfirst(lc); + MVNDistinct *ndistinct = NULL; + MVDependencies *dependencies = NULL; +#ifdef __TBASE__ + MVDependencies *subset = NULL; +#endif + VacAttrStats **stats; + ListCell *lc2; + + /* + * Check if we can build these stats based on the column analyzed. If + * not, report this fact (except in autovacuum) and move on. 
+ */ + stats = lookup_var_attr_stats(onerel, stat->columns, + natts, vacattrstats); + if (!stats && !IsAutoVacuumWorkerProcess()) + { + ereport(WARNING, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("statistics object \"%s.%s\" could not be computed for relation \"%s.%s\"", + stat->schema, stat->name, + get_namespace_name(onerel->rd_rel->relnamespace), + RelationGetRelationName(onerel)), + errtable(onerel))); + continue; + } + + /* check allowed number of dimensions */ + Assert(bms_num_members(stat->columns) >= 2 && + bms_num_members(stat->columns) <= STATS_MAX_DIMENSIONS); + + /* compute statistic of each requested type */ + foreach(lc2, stat->types) + { + char t = (char) lfirst_int(lc2); + + if (t == STATS_EXT_NDISTINCT) + ndistinct = statext_ndistinct_build(totalrows, numrows, rows, + stat->columns, stats); + else if (t == STATS_EXT_DEPENDENCIES) + dependencies = statext_dependencies_build(numrows, rows, + stat->columns, stats); +#ifdef __TBASE__ + else if (t == STATS_EXT_SUBSET) + subset = statext_subset_build(numrows, stat->orderedColumns); +#endif + } + + /* store the statistics in the catalog */ +#ifdef __TBASE__ + statext_store(pg_stext, stat->statOid, + ndistinct, dependencies, + subset, stats); +#else + statext_store(pg_stext, stat->statOid, ndistinct, dependencies, stats); +#endif + } + + heap_close(pg_stext, RowExclusiveLock); + + MemoryContextSwitchTo(oldcxt); + MemoryContextDelete(cxt); } /* @@ -153,9 +172,15 @@ statext_is_kind_built(HeapTuple htup, char type) attnum = Anum_pg_statistic_ext_stxdependencies; break; - default: - elog(ERROR, "unexpected statistics type requested: %d", type); - } +#ifdef __TBASE__ + case STATS_EXT_SUBSET: + attnum = Anum_pg_statistic_ext_stxsubset; + break; +#endif + + default: + elog(ERROR, "unexpected statistics type requested: %d", type); + } return !heap_attisnull(htup, attnum, NULL); } @@ -165,68 +190,93 @@ statext_is_kind_built(HeapTuple htup, char type) */ static List * fetch_statentries_for_relation(Relation pg_statext, Oid relid) -{// #lizard forgives - SysScanDesc scan; - ScanKeyData skey; - HeapTuple htup; - List *result = NIL; - - /* - * Prepare to scan pg_statistic_ext for entries having stxrelid = this - * rel. 
- */ - ScanKeyInit(&skey, - Anum_pg_statistic_ext_stxrelid, - BTEqualStrategyNumber, F_OIDEQ, - ObjectIdGetDatum(relid)); - - scan = systable_beginscan(pg_statext, StatisticExtRelidIndexId, true, - NULL, 1, &skey); - - while (HeapTupleIsValid(htup = systable_getnext(scan))) - { - StatExtEntry *entry; - Datum datum; - bool isnull; - int i; - ArrayType *arr; - char *enabled; - Form_pg_statistic_ext staForm; - - entry = palloc0(sizeof(StatExtEntry)); - entry->statOid = HeapTupleGetOid(htup); - staForm = (Form_pg_statistic_ext) GETSTRUCT(htup); - entry->schema = get_namespace_name(staForm->stxnamespace); - entry->name = pstrdup(NameStr(staForm->stxname)); - for (i = 0; i < staForm->stxkeys.dim1; i++) - { - entry->columns = bms_add_member(entry->columns, - staForm->stxkeys.values[i]); - } - - /* decode the stxkind char array into a list of chars */ - datum = SysCacheGetAttr(STATEXTOID, htup, - Anum_pg_statistic_ext_stxkind, &isnull); - Assert(!isnull); - arr = DatumGetArrayTypeP(datum); - if (ARR_NDIM(arr) != 1 || - ARR_HASNULL(arr) || - ARR_ELEMTYPE(arr) != CHAROID) - elog(ERROR, "stxkind is not a 1-D char array"); - enabled = (char *) ARR_DATA_PTR(arr); - for (i = 0; i < ARR_DIMS(arr)[0]; i++) - { - Assert((enabled[i] == STATS_EXT_NDISTINCT) || - (enabled[i] == STATS_EXT_DEPENDENCIES)); - entry->types = lappend_int(entry->types, (int) enabled[i]); - } - - result = lappend(result, entry); - } - - systable_endscan(scan); - - return result; +{ + SysScanDesc scan; + ScanKeyData skey; + HeapTuple htup; + List *result = NIL; + + /* + * Prepare to scan pg_statistic_ext for entries having stxrelid = this + * rel. + */ + ScanKeyInit(&skey, + Anum_pg_statistic_ext_stxrelid, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(relid)); + + scan = systable_beginscan(pg_statext, StatisticExtRelidIndexId, true, + NULL, 1, &skey); + + while (HeapTupleIsValid(htup = systable_getnext(scan))) + { + StatExtEntry *entry; + Datum datum; + bool isnull; + int i; + ArrayType *arr; + char *enabled; + Form_pg_statistic_ext staForm; +#ifdef __TBASE__ + bool need_column_order = false; +#endif + + entry = palloc0(sizeof(StatExtEntry)); + entry->statOid = HeapTupleGetOid(htup); + staForm = (Form_pg_statistic_ext) GETSTRUCT(htup); + entry->schema = get_namespace_name(staForm->stxnamespace); + entry->name = pstrdup(NameStr(staForm->stxname)); + for (i = 0; i < staForm->stxkeys.dim1; i++) + { + entry->columns = bms_add_member(entry->columns, + staForm->stxkeys.values[i]); + } + + /* decode the stxkind char array into a list of chars */ + datum = SysCacheGetAttr(STATEXTOID, htup, + Anum_pg_statistic_ext_stxkind, &isnull); + Assert(!isnull); + arr = DatumGetArrayTypeP(datum); + if (ARR_NDIM(arr) != 1 || + ARR_HASNULL(arr) || + ARR_ELEMTYPE(arr) != CHAROID) + elog(ERROR, "stxkind is not a 1-D char array"); + enabled = (char *) ARR_DATA_PTR(arr); + for (i = 0; i < ARR_DIMS(arr)[0]; i++) + { + Assert((enabled[i] == STATS_EXT_NDISTINCT) || + (enabled[i] == STATS_EXT_DEPENDENCIES) || + (enabled[i] == STATS_EXT_SUBSET)); + entry->types = lappend_int(entry->types, (int) enabled[i]); +#ifdef __TBASE__ + + if (enabled[i] == STATS_EXT_SUBSET) + { + /* Currently we only support subset of two columns */ + Assert(staForm->stxkeys.dim1 == 2); + + /* Order of column defined indicates the subset relation */ + need_column_order = true; + } + } + + /* Build the list of columns with the original order */ + if (need_column_order) + { + for (i = 0; i < staForm->stxkeys.dim1; i++) + { + entry->orderedColumns = lappend_int(entry->orderedColumns, + 
staForm->stxkeys.values[i]); + } +#endif + } + + result = lappend(result, entry); + } + + systable_endscan(scan); + + return result; } /* @@ -291,57 +341,73 @@ lookup_var_attr_stats(Relation rel, Bitmapset *attrs, */ static void statext_store(Relation pg_stext, Oid statOid, - MVNDistinct *ndistinct, MVDependencies *dependencies, - VacAttrStats **stats) + MVNDistinct *ndistinct, MVDependencies *dependencies, +#ifdef __TBASE__ + MVDependencies *subset, +#endif + VacAttrStats **stats) { - HeapTuple stup, - oldtup; - Datum values[Natts_pg_statistic_ext]; - bool nulls[Natts_pg_statistic_ext]; - bool replaces[Natts_pg_statistic_ext]; - - memset(nulls, 1, Natts_pg_statistic_ext * sizeof(bool)); - memset(replaces, 0, Natts_pg_statistic_ext * sizeof(bool)); - memset(values, 0, Natts_pg_statistic_ext * sizeof(Datum)); - - /* - * Construct a new pg_statistic_ext tuple, replacing the calculated stats. - */ - if (ndistinct != NULL) - { - bytea *data = statext_ndistinct_serialize(ndistinct); - - nulls[Anum_pg_statistic_ext_stxndistinct - 1] = (data == NULL); - values[Anum_pg_statistic_ext_stxndistinct - 1] = PointerGetDatum(data); - } - - if (dependencies != NULL) - { - bytea *data = statext_dependencies_serialize(dependencies); - - nulls[Anum_pg_statistic_ext_stxdependencies - 1] = (data == NULL); - values[Anum_pg_statistic_ext_stxdependencies - 1] = PointerGetDatum(data); - } - - /* always replace the value (either by bytea or NULL) */ - replaces[Anum_pg_statistic_ext_stxndistinct - 1] = true; - replaces[Anum_pg_statistic_ext_stxdependencies - 1] = true; - - /* there should already be a pg_statistic_ext tuple */ - oldtup = SearchSysCache1(STATEXTOID, ObjectIdGetDatum(statOid)); - if (!HeapTupleIsValid(oldtup)) - elog(ERROR, "cache lookup failed for statistics object %u", statOid); - - /* replace it */ - stup = heap_modify_tuple(oldtup, - RelationGetDescr(pg_stext), - values, - nulls, - replaces); - ReleaseSysCache(oldtup); - CatalogTupleUpdate(pg_stext, &stup->t_self, stup); - - heap_freetuple(stup); + HeapTuple stup, + oldtup; + Datum values[Natts_pg_statistic_ext]; + bool nulls[Natts_pg_statistic_ext]; + bool replaces[Natts_pg_statistic_ext]; + + memset(nulls, 1, Natts_pg_statistic_ext * sizeof(bool)); + memset(replaces, 0, Natts_pg_statistic_ext * sizeof(bool)); + memset(values, 0, Natts_pg_statistic_ext * sizeof(Datum)); + + /* + * Construct a new pg_statistic_ext tuple, replacing the calculated stats. 
+ */ + if (ndistinct != NULL) + { + bytea *data = statext_ndistinct_serialize(ndistinct); + + nulls[Anum_pg_statistic_ext_stxndistinct - 1] = (data == NULL); + values[Anum_pg_statistic_ext_stxndistinct - 1] = PointerGetDatum(data); + } + + if (dependencies != NULL) + { + bytea *data = statext_dependencies_serialize(dependencies); + + nulls[Anum_pg_statistic_ext_stxdependencies - 1] = (data == NULL); + values[Anum_pg_statistic_ext_stxdependencies - 1] = PointerGetDatum(data); + } + +#ifdef __TBASE__ + if (subset != NULL) + { + bytea *data = statext_dependencies_serialize(subset); + + nulls[Anum_pg_statistic_ext_stxsubset - 1] = (data == NULL); + values[Anum_pg_statistic_ext_stxsubset - 1] = PointerGetDatum(data); + } +#endif + + /* always replace the value (either by bytea or NULL) */ + replaces[Anum_pg_statistic_ext_stxndistinct - 1] = true; + replaces[Anum_pg_statistic_ext_stxdependencies - 1] = true; +#ifdef __TBASE__ + replaces[Anum_pg_statistic_ext_stxsubset - 1] = true; +#endif + + /* there should already be a pg_statistic_ext tuple */ + oldtup = SearchSysCache1(STATEXTOID, ObjectIdGetDatum(statOid)); + if (!HeapTupleIsValid(oldtup)) + elog(ERROR, "cache lookup failed for statistics object %u", statOid); + + /* replace it */ + stup = heap_modify_tuple(oldtup, + RelationGetDescr(pg_stext), + values, + nulls, + replaces); + ReleaseSysCache(oldtup); + CatalogTupleUpdate(pg_stext, &stup->t_self, stup); + + heap_freetuple(stup); } /* initialize multi-dimensional sort */ diff --git a/src/backend/statistics/subset.c b/src/backend/statistics/subset.c new file mode 100644 index 00000000..1bac5b9a --- /dev/null +++ b/src/backend/statistics/subset.c @@ -0,0 +1,360 @@ +/*------------------------------------------------------------------------- + * + * subset.c + * POSTGRES user defined column correlationship + * + * Portions Copyright (c) 2020-Present, TBase Development Team, Tencent + * + * IDENTIFICATION + * src/backend/statistics/knowledge.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "catalog/pg_statistic_ext.h" +#include "nodes/relation.h" +#include "optimizer/clauses.h" +#include "optimizer/cost.h" +#include "statistics/extended_stats_internal.h" +#include "statistics/statistics.h" +#include "utils/fmgroids.h" +#include "utils/lsyscache.h" +#include "utils/syscache.h" + +static bool subset_is_compatible_clause(Node *clause, Index relid, + AttrNumber *attnum); +static bool subset_implies_attribute(MVDependency *dependency, + AttrNumber attnum); + +/* + * Build subset dependencies between groups of columns + */ +MVDependencies * +statext_subset_build(int numrows, List *columns) +{ + int i; + int k; + + /* result */ + MVDependencies *dependencies = NULL; + MVDependency *d; + + /* Currently, we only support subset defined with 2 columns */ + Assert(list_length(columns) == 2); + k = list_length(columns); + + /* initialize the list of dependencies */ + dependencies = (MVDependencies *) palloc0(sizeof(MVDependencies)); + + dependencies->magic = STATS_DEPS_MAGIC; + dependencies->type = STATS_DEPS_TYPE_BASIC; + dependencies->ndeps = 1; + + dependencies = (MVDependencies *) repalloc(dependencies, + offsetof(MVDependencies, deps) + + dependencies->ndeps * sizeof(MVDependency)); + + d = (MVDependency *) palloc0(offsetof(MVDependency, attributes) + + k * sizeof(AttrNumber)); + d->degree = 1.0; + d->nattributes = k; + for (i = 0; i < k; i++) + { + d->attributes[i] = list_nth_int(columns, i); + } + + dependencies->deps[0] = d; + 
+ return dependencies; +} + +/* + * statext_subset_load + * Load the subset dependency for the indicated pg_statistic_ext tuple + */ +MVDependencies * +statext_subset_load(Oid mvoid) +{ + bool isnull; + Datum deps; + HeapTuple htup = SearchSysCache1(STATEXTOID, ObjectIdGetDatum(mvoid)); + + if (!HeapTupleIsValid(htup)) + elog(ERROR, "cache lookup failed for statistics object %u", mvoid); + + deps = SysCacheGetAttr(STATEXTOID, htup, + Anum_pg_statistic_ext_stxsubset, &isnull); + Assert(!isnull); + + ReleaseSysCache(htup); + + /* Reuse the functional dependencies deserialize function */ + return statext_dependencies_deserialize(DatumGetByteaP(deps)); +} + +/* + * subset_is_compatible_clause + * Determines if the clause is compatible with subset dependencies + * + * When returning True attnum is set to the attribute number of the Var within + * the supported clause. Comparing to dependencies compatibility check, subset + * is less restrictive. + */ +static bool +subset_is_compatible_clause(Node *clause, Index relid, AttrNumber *attnum) +{ + RestrictInfo *rinfo = (RestrictInfo *) clause; + + if (!IsA(rinfo, RestrictInfo)) + return false; + + /* Pseudoconstants are not really interesting here. */ + if (rinfo->pseudoconstant) + return false; + + /* clauses referencing multiple varnos are incompatible */ + if (bms_membership(rinfo->clause_relids) != BMS_SINGLETON) + return false; + + if (is_opclause(rinfo->clause)) + { + OpExpr *expr = (OpExpr *) rinfo->clause; + Var *var; + bool varonleft = true; + bool ok; + + /* Only expressions with two arguments are considered compatible. */ + if (list_length(expr->args) != 2) + return false; + + /* see if it actually has the right */ + ok = (NumRelids((Node *) expr) == 1) && + (is_pseudo_constant_clause(lsecond(expr->args)) || + (varonleft = false, + is_pseudo_constant_clause(linitial(expr->args)))); + + /* unsupported structure (two variables or so) */ + if (!ok) + return false; + + var = (varonleft) ? linitial(expr->args) : lsecond(expr->args); + + /* in case it's a T_RelableType */ + if (IsA(var, RelabelType)) + var = (Var *) ((RelabelType *) var)->arg; + + /* We only support plain Vars for now */ + if (!IsA(var, Var)) + return false; + + /* Ensure var is from the correct relation */ + if (var->varno != relid) + return false; + + /* we also better ensure the Var is from the current level */ + if (var->varlevelsup > 0) + return false; + + /* Also skip system attributes (we don't allow stats on those). */ + if (!AttrNumberIsForUserDefinedAttr(var->varattno)) + return false; + + *attnum = var->varattno; + return true; + } + + return false; +} + +/* + * subset_eliminate_attribute + * check that the attnum matches is implied by the subset dependency + */ +static bool +subset_implies_attribute(MVDependency *dependency, AttrNumber attnum) +{ + if (attnum == dependency->attributes[dependency->nattributes - 1]) + return true; + + return false; +} + +/* + * subset_clauselist_selectivity + * Return the estimated selectivity of the given clauses using + * functional dependency statistics, or 1.0 if no useful functional + * dependency statistic exists. + * + * 'estimatedclauses' is an output argument that gets a bit set corresponding + * to the (zero-based) list index of clauses that are included in the + * estimated selectivity. + * + * Given equality clauses on attributes (a,b) we find the strongest dependency + * between them, i.e. either (a=>b) or (b=>a). 
Assuming (a=>b) is the selected + * dependency, we then combine the per-clause selectivities using the formula + */ +Selectivity +subset_clauselist_selectivity(PlannerInfo *root, + List *clauses, + int varRelid, + JoinType jointype, + SpecialJoinInfo *sjinfo, + RelOptInfo *rel, + Bitmapset **estimatedclauses) +{ + Selectivity s1 = 1.0; + ListCell *l; + Bitmapset *clauses_attnums = NULL; + StatisticExtInfo *stat; + MVDependencies *dependencies; + AttrNumber *list_attnums; + int listidx; + + /* check if there's any stats that might be useful for us. */ + if (!has_stats_of_kind(rel->statlist, STATS_EXT_SUBSET)) + return 1.0; + + list_attnums = (AttrNumber *) palloc(sizeof(AttrNumber) * + list_length(clauses)); + + /* + * Pre-process the clauses list to extract the attnums seen in each item. + * We need to determine if there's any clauses which will be useful for + * subset selectivity elimination. Along the way we'll record all of + * the attnums for each clause in a list which we'll reference later so we + * don't need to repeat the same work again. We'll also keep track of all + * attnums seen. + */ + listidx = 0; + foreach(l, clauses) + { + Node *clause = (Node *) lfirst(l); + AttrNumber attnum; + + if (subset_is_compatible_clause(clause, rel->relid, &attnum)) + { + list_attnums[listidx] = attnum; + clauses_attnums = bms_add_member(clauses_attnums, attnum); + } + else + list_attnums[listidx] = InvalidAttrNumber; + + listidx++; + } + + /* + * If there's not at least two distinct attnums then reject the whole list + * of clauses. We must return 1.0 so the calling function's selectivity is + * unaffected. + */ + if (bms_num_members(clauses_attnums) < 2) + { + pfree(list_attnums); + return 1.0; + } + + /* find the best suited statistics object for these attnums */ + stat = choose_best_statistics(rel->statlist, clauses_attnums, + STATS_EXT_SUBSET); + + /* if no matching stats could be found then we've nothing to do */ + if (!stat) + { + pfree(list_attnums); + return 1.0; + } + + /* + * Load the dependency items stored in the statistics object. + */ + dependencies = statext_subset_load(stat->statOid); + + /* + * Apply the dependencies recursively, starting with the widest/strongest + * ones, and proceeding to the smaller/weaker ones. At the end of each + * round we factor in the selectivity of clauses on the implied attribute, + * and remove the clauses from the list. + * + * Actually, for subset dependency, there should be only one dependency + * entry. But we still keep the while loop style align with normal + * dependency selectivity calculation does, to get better support for + * possible future enhancements. + */ + do + { + Selectivity s2 = 1.0; + MVDependency *dependency; + + /* There is only one dependency to indicate the subset relation */ + Assert(dependencies->ndeps == 1); + dependency = dependencies->deps[0]; + + /* + * We found an applicable dependency, so find all the clauses on the + * implied attribute - with dependency (a,b => c) we look for clauses + * on 'c'. + */ + listidx = -1; + foreach(l, clauses) + { + Node *clause; + + listidx++; + + /* + * Skip incompatible clauses, and ones we've already estimated on. + */ + if (list_attnums[listidx] == InvalidAttrNumber || + bms_is_member(listidx, *estimatedclauses)) + continue; + + /* + * Technically we could find more than one clause for a given + * attnum. Since these clauses must be equality clauses, we choose + * to only take the selectivity estimate from the final clause in + * the list for this attnum. 
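
To make the elimination concrete: the subset hint is stored as a single MVDependency whose attributes keep the declared column order and whose degree is pinned to 1.0, so the clause on the implied (last) column contributes a factor of exactly 1.0. A cut-down sketch with numbers matching the regression test added below; the LIKE selectivity is an assumption.

    #include <stdio.h>

    typedef struct
    {
        double degree;
        int    nattributes;
        int    attributes[2];
    } MiniDependency;            /* cut-down stand-in for MVDependency */

    int main(void)
    {
        /* CREATE STATISTICS ... (subset) ON c, b keeps the declared order, so
         * the last attribute (b) is the implied one and the degree is 1.0.
         * Attnums 6 and 4 correspond to c and b in the test table. */
        MiniDependency dep = { 1.0, 2, { 6 /* c */, 4 /* b */ } };
        double s1 = 1.0;
        double sel_c = 1.0 / 50.0;   /* c = 1 */
        double sel_b = 1.0 / 10.0;   /* b LIKE '%_1', assumed estimate */
        int    clause_attnum = 4;    /* the clause on b */

        if (clause_attnum == dep.attributes[dep.nattributes - 1])
        {
            /* implied clause: degree + (1 - degree) * sel_b == 1.0, i.e. dropped */
            s1 *= dep.degree + (1.0 - dep.degree) * sel_b;
        }
        s1 *= sel_c;                 /* the implying clause is estimated normally */

        printf("selectivity = %g\n", s1);   /* 0.02 -> ~100 of 5000 rows */
        return 0;
    }

With the clause on b absorbed, the estimate is driven by c = 1 alone, which is why the plans in the new stats_ext expected output report 100 rows instead of 10.
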
If the attnum happens to be compared + * to a different Const in another clause then no rows will match + * anyway. If it happens to be compared to the same Const, then + * ignoring the additional clause is just the thing to do. + */ + if (subset_implies_attribute(dependency, list_attnums[listidx])) + { + clause = (Node *) lfirst(l); + + s2 = clause_selectivity(root, clause, varRelid, jointype, + sjinfo); + + /* mark this one as done, so we don't touch it again. */ + *estimatedclauses = bms_add_member(*estimatedclauses, listidx); + + /* + * Mark that we've got and used the dependency on this clause. + * We'll want to ignore this when looking for the next + * strongest dependency above. + */ + clauses_attnums = bms_del_member(clauses_attnums, + list_attnums[listidx]); + } + } + + /* + * Now factor in the selectivity for all the "implied" clauses into + * the final one, using this formula: + * + * P(a,b) = P(a) * (f + (1-f) * P(b)) + * + * where 'f' is the degree of validity of the dependency. + * + * Currently, the subset statistic can only eliminate the implied + * clause by forcing dependency degree to 1.0. + */ + Assert(dependency->degree == 1.0); + s1 *= (dependency->degree + (1 - dependency->degree) * s2); + } while(0); + + pfree(dependencies); + pfree(list_attnums); + + return s1; +} diff --git a/src/include/catalog/pg_statistic_ext.h b/src/include/catalog/pg_statistic_ext.h index 108944f7..7dc60359 100644 --- a/src/include/catalog/pg_statistic_ext.h +++ b/src/include/catalog/pg_statistic_ext.h @@ -45,10 +45,13 @@ CATALOG(pg_statistic_ext,3381) int2vector stxkeys; /* array of column keys */ #ifdef CATALOG_VARLEN - char stxkind[1] BKI_FORCE_NOT_NULL; /* statistic types requested - * to build */ - pg_ndistinct stxndistinct; /* ndistinct coefficients (serialized) */ - pg_dependencies stxdependencies; /* dependencies (serialized) */ + char stxkind[1] BKI_FORCE_NOT_NULL; /* statistic types requested + * to build */ + pg_ndistinct stxndistinct; /* ndistinct coefficients (serialized) */ + pg_dependencies stxdependencies; /* dependencies (serialized) */ +#ifdef __TBASE__ + pg_dependencies stxsubset; /* subset (serialized) */ +#endif #endif } FormData_pg_statistic_ext; @@ -64,17 +67,27 @@ typedef FormData_pg_statistic_ext *Form_pg_statistic_ext; * compiler constants for pg_statistic_ext * ---------------- */ -#define Natts_pg_statistic_ext 8 -#define Anum_pg_statistic_ext_stxrelid 1 -#define Anum_pg_statistic_ext_stxname 2 -#define Anum_pg_statistic_ext_stxnamespace 3 -#define Anum_pg_statistic_ext_stxowner 4 -#define Anum_pg_statistic_ext_stxkeys 5 -#define Anum_pg_statistic_ext_stxkind 6 -#define Anum_pg_statistic_ext_stxndistinct 7 -#define Anum_pg_statistic_ext_stxdependencies 8 +#ifdef __TBASE__ +#define Natts_pg_statistic_ext 9 +#else +#define Natts_pg_statistic_ext 8 +#endif +#define Anum_pg_statistic_ext_stxrelid 1 +#define Anum_pg_statistic_ext_stxname 2 +#define Anum_pg_statistic_ext_stxnamespace 3 +#define Anum_pg_statistic_ext_stxowner 4 +#define Anum_pg_statistic_ext_stxkeys 5 +#define Anum_pg_statistic_ext_stxkind 6 +#define Anum_pg_statistic_ext_stxndistinct 7 +#define Anum_pg_statistic_ext_stxdependencies 8 +#ifdef __TBASE__ +#define Anum_pg_statistic_ext_stxsubset 9 +#endif -#define STATS_EXT_NDISTINCT 'd' -#define STATS_EXT_DEPENDENCIES 'f' +#define STATS_EXT_NDISTINCT 'd' +#define STATS_EXT_DEPENDENCIES 'f' +#ifdef __TBASE__ +#define STATS_EXT_SUBSET 's' +#endif #endif /* PG_STATISTIC_EXT_H */ diff --git a/src/include/statistics/extended_stats_internal.h 
b/src/include/statistics/extended_stats_internal.h index ad0d6872..3dbf9e9e 100644 --- a/src/include/statistics/extended_stats_internal.h +++ b/src/include/statistics/extended_stats_internal.h @@ -53,7 +53,10 @@ extern bytea *statext_ndistinct_serialize(MVNDistinct *ndistinct); extern MVNDistinct *statext_ndistinct_deserialize(bytea *data); extern MVDependencies *statext_dependencies_build(int numrows, HeapTuple *rows, - Bitmapset *attrs, VacAttrStats **stats); + Bitmapset *attrs, VacAttrStats **stats); +#ifdef __TBASE__ +extern MVDependencies *statext_subset_build(int numrows, List *columns); +#endif extern bytea *statext_dependencies_serialize(MVDependencies *dependencies); extern MVDependencies *statext_dependencies_deserialize(bytea *data); diff --git a/src/include/statistics/statistics.h b/src/include/statistics/statistics.h index 47b59887..e6923113 100644 --- a/src/include/statistics/statistics.h +++ b/src/include/statistics/statistics.h @@ -80,18 +80,30 @@ typedef struct MVDependencies extern MVNDistinct *statext_ndistinct_load(Oid mvoid); extern MVDependencies *statext_dependencies_load(Oid mvoid); +#ifdef __TBASE__ +extern MVDependencies *statext_subset_load(Oid mvoid); +#endif extern void BuildRelationExtStatistics(Relation onerel, double totalrows, int numrows, HeapTuple *rows, int natts, VacAttrStats **vacattrstats); extern bool statext_is_kind_built(HeapTuple htup, char kind); extern Selectivity dependencies_clauselist_selectivity(PlannerInfo *root, - List *clauses, - int varRelid, - JoinType jointype, - SpecialJoinInfo *sjinfo, - RelOptInfo *rel, - Bitmapset **estimatedclauses); + List *clauses, + int varRelid, + JoinType jointype, + SpecialJoinInfo *sjinfo, + RelOptInfo *rel, + Bitmapset **estimatedclauses); +#ifdef __TBASE__ +extern Selectivity subset_clauselist_selectivity(PlannerInfo *root, + List *clauses, + int varRelid, + JoinType jointype, + SpecialJoinInfo *sjinfo, + RelOptInfo *rel, + Bitmapset **estimatedclauses); +#endif extern bool has_stats_of_kind(List *stats, char requiredkind); extern StatisticExtInfo *choose_best_statistics(List *stats, Bitmapset *attnums, char requiredkind); diff --git a/src/test/regress/expected/stats_ext_2.out b/src/test/regress/expected/stats_ext_2.out index 3581037d..ca7aba0a 100644 --- a/src/test/regress/expected/stats_ext_2.out +++ b/src/test/regress/expected/stats_ext_2.out @@ -656,4 +656,99 @@ EXPLAIN (COSTS OFF) Index Cond: ((a = 1) AND (b = '1'::text)) (7 rows) +-- subset relational tests +CREATE TABLE subset ( + filler1 TEXT, + filler2 NUMERIC, + a INT, + b TEXT, + filler3 DATE, + c INT, + d TEXT +); +-- a => b, b==c +INSERT INTO subset (a, b, c, filler1) + SELECT mod(i,100), 'prefix_'||mod(i,50), mod(i,50), i FROM generate_series(1,5000) s(i); +ANALYZE subset; +-- under-estimates when using only per-column statistics +EXPLAIN + SELECT count(*) FROM subset WHERE b = 'prefix_1' and c = 1; + QUERY PLAN +------------------------------------------------------------------------------------------------- + Finalize Aggregate (cost=255.01..255.02 rows=1 width=8) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=255.00..255.01 rows=1 width=0) + -> Partial Aggregate (cost=155.00..155.01 rows=1 width=8) + -> Seq Scan on subset (cost=0.00..155.00 rows=2 width=0) + Filter: ((b = 'prefix_1'::text) AND (c = 1)) +(5 rows) + +SELECT count(*) FROM subset WHERE b = 'prefix_1' and c = 1; + count +------- + 100 +(1 row) + +-- create dependencies +CREATE STATISTICS deps_stat (dependencies) ON a, b, c FROM subset; +ANALYZE subset; +-- the 
selectivity is corrected by dependencies stats +EXPLAIN + SELECT count(*) FROM subset WHERE b = 'prefix_1' and c = 1; + QUERY PLAN +------------------------------------------------------------------------------------------------- + Finalize Aggregate (cost=255.01..255.02 rows=1 width=8) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=255.00..255.01 rows=1 width=0) + -> Partial Aggregate (cost=155.00..155.01 rows=1 width=8) + -> Seq Scan on subset (cost=0.00..155.00 rows=100 width=0) + Filter: ((b = 'prefix_1'::text) AND (c = 1)) +(5 rows) + +SELECT count(*) FROM subset WHERE b = 'prefix_1' and c = 1; + count +------- + 100 +(1 row) + +-- dependencies stats does not support operator other than '=' +EXPLAIN + SELECT count(*) FROM subset WHERE b like '%_1' and c = 1; + QUERY PLAN +------------------------------------------------------------------------------------------------- + Finalize Aggregate (cost=255.01..255.02 rows=1 width=8) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=255.00..255.01 rows=1 width=0) + -> Partial Aggregate (cost=155.00..155.01 rows=1 width=8) + -> Seq Scan on subset (cost=0.00..155.00 rows=10 width=0) + Filter: ((b ~~ '%_1'::text) AND (c = 1)) +(5 rows) + +SELECT count(*) FROM subset WHERE b like '%_1' and c = 1; + count +------- + 100 +(1 row) + +-- wrong definition, subset stat only support two column +CREATE STATISTICS subset_stat (subset) ON a, b, c FROM subset; +ERROR: subset statistics require exactly 2 columns +-- create subset stats as user defined hint +CREATE STATISTICS subset_stat (subset) ON c, b FROM subset; +ANALYZE subset; +-- the selectivity is corrected by subset stats +EXPLAIN + SELECT count(*) FROM subset WHERE b like '%_1' and c = 1; + QUERY PLAN +------------------------------------------------------------------------------------------------- + Finalize Aggregate (cost=255.01..255.02 rows=1 width=8) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=255.00..255.01 rows=1 width=0) + -> Partial Aggregate (cost=155.00..155.01 rows=1 width=8) + -> Seq Scan on subset (cost=0.00..155.00 rows=100 width=0) + Filter: ((b ~~ '%_1'::text) AND (c = 1)) +(5 rows) + +SELECT count(*) FROM subset WHERE b like '%_1' and c = 1; + count +------- + 100 +(1 row) + RESET random_page_cost; diff --git a/src/test/regress/expected/stats_ext_3.out b/src/test/regress/expected/stats_ext_3.out index e69852b6..b0f2e1c4 100644 --- a/src/test/regress/expected/stats_ext_3.out +++ b/src/test/regress/expected/stats_ext_3.out @@ -668,4 +668,99 @@ EXPLAIN (COSTS OFF) Index Cond: ((a = 1) AND (b = '1'::text)) (7 rows) +-- subset relational tests +CREATE TABLE subset ( + filler1 TEXT, + filler2 NUMERIC, + a INT, + b TEXT, + filler3 DATE, + c INT, + d TEXT +); +-- a => b, b==c +INSERT INTO subset (a, b, c, filler1) + SELECT mod(i,100), 'prefix_'||mod(i,50), mod(i,50), i FROM generate_series(1,5000) s(i); +ANALYZE subset; +-- under-estimates when using only per-column statistics +EXPLAIN + SELECT count(*) FROM subset WHERE b = 'prefix_1' and c = 1; + QUERY PLAN +------------------------------------------------------------------------------------------------- + Finalize Aggregate (cost=255.01..255.02 rows=1 width=8) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=255.00..255.01 rows=1 width=0) + -> Partial Aggregate (cost=155.00..155.01 rows=1 width=8) + -> Seq Scan on subset (cost=0.00..155.00 rows=2 width=0) + Filter: ((b = 'prefix_1'::text) AND (c = 1)) +(5 rows) + +SELECT count(*) FROM subset WHERE b = 'prefix_1' and c = 1; + 
count +------- + 100 +(1 row) + +-- create dependencies +CREATE STATISTICS deps_stat (dependencies) ON a, b, c FROM subset; +ANALYZE subset; +-- the selectivity is corrected by dependencies stats +EXPLAIN + SELECT count(*) FROM subset WHERE b = 'prefix_1' and c = 1; + QUERY PLAN +------------------------------------------------------------------------------------------------- + Finalize Aggregate (cost=255.01..255.02 rows=1 width=8) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=255.00..255.01 rows=1 width=0) + -> Partial Aggregate (cost=155.00..155.01 rows=1 width=8) + -> Seq Scan on subset (cost=0.00..155.00 rows=100 width=0) + Filter: ((b = 'prefix_1'::text) AND (c = 1)) +(5 rows) + +SELECT count(*) FROM subset WHERE b = 'prefix_1' and c = 1; + count +------- + 100 +(1 row) + +-- dependencies stats does not support operator other than '=' +EXPLAIN + SELECT count(*) FROM subset WHERE b like '%_1' and c = 1; + QUERY PLAN +------------------------------------------------------------------------------------------------- + Finalize Aggregate (cost=255.01..255.02 rows=1 width=8) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=255.00..255.01 rows=1 width=0) + -> Partial Aggregate (cost=155.00..155.01 rows=1 width=8) + -> Seq Scan on subset (cost=0.00..155.00 rows=10 width=0) + Filter: ((b ~~ '%_1'::text) AND (c = 1)) +(5 rows) + +SELECT count(*) FROM subset WHERE b like '%_1' and c = 1; + count +------- + 100 +(1 row) + +-- wrong definition, subset stat only support two column +CREATE STATISTICS subset_stat (subset) ON a, b, c FROM subset; +ERROR: subset statistics require exactly 2 columns +-- create subset stats as user defined hint +CREATE STATISTICS subset_stat (subset) ON c, b FROM subset; +ANALYZE subset; +-- the selectivity is corrected by subset stats +EXPLAIN + SELECT count(*) FROM subset WHERE b like '%_1' and c = 1; + QUERY PLAN +------------------------------------------------------------------------------------------------- + Finalize Aggregate (cost=255.01..255.02 rows=1 width=8) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=255.00..255.01 rows=1 width=0) + -> Partial Aggregate (cost=155.00..155.01 rows=1 width=8) + -> Seq Scan on subset (cost=0.00..155.00 rows=100 width=0) + Filter: ((b ~~ '%_1'::text) AND (c = 1)) +(5 rows) + +SELECT count(*) FROM subset WHERE b like '%_1' and c = 1; + count +------- + 100 +(1 row) + RESET random_page_cost; diff --git a/src/test/regress/sql/stats_ext.sql b/src/test/regress/sql/stats_ext.sql index 221db426..a4b91e0c 100644 --- a/src/test/regress/sql/stats_ext.sql +++ b/src/test/regress/sql/stats_ext.sql @@ -297,4 +297,51 @@ ANALYZE functional_dependencies; EXPLAIN (COSTS OFF) SELECT * FROM functional_dependencies WHERE a = 1 AND b = '1' AND c = 1; +-- subset relational tests +CREATE TABLE subset ( + filler1 TEXT, + filler2 NUMERIC, + a INT, + b TEXT, + filler3 DATE, + c INT, + d TEXT +); + +-- a => b, b==c +INSERT INTO subset (a, b, c, filler1) + SELECT mod(i,100), 'prefix_'||mod(i,50), mod(i,50), i FROM generate_series(1,5000) s(i); + +ANALYZE subset; + +-- under-estimates when using only per-column statistics +EXPLAIN + SELECT count(*) FROM subset WHERE b = 'prefix_1' and c = 1; +SELECT count(*) FROM subset WHERE b = 'prefix_1' and c = 1; + +-- create dependencies +CREATE STATISTICS deps_stat (dependencies) ON a, b, c FROM subset; +ANALYZE subset; + +-- the selectivity is corrected by dependencies stats +EXPLAIN + SELECT count(*) FROM subset WHERE b = 'prefix_1' and c = 1; +SELECT count(*) FROM 
subset WHERE b = 'prefix_1' and c = 1; + +-- dependencies stats does not support operator other than '=' +EXPLAIN + SELECT count(*) FROM subset WHERE b like '%_1' and c = 1; +SELECT count(*) FROM subset WHERE b like '%_1' and c = 1; + +-- wrong definition, subset stat only support two column +CREATE STATISTICS subset_stat (subset) ON a, b, c FROM subset; +-- create subset stats as user defined hint +CREATE STATISTICS subset_stat (subset) ON c, b FROM subset; +ANALYZE subset; + +-- the selectivity is corrected by subset stats +EXPLAIN + SELECT count(*) FROM subset WHERE b like '%_1' and c = 1; +SELECT count(*) FROM subset WHERE b like '%_1' and c = 1; + RESET random_page_cost; From 92aba29af4c8dc7ce7d6a45a25f00bbd5151b071 Mon Sep 17 00:00:00 2001 From: qiannzhang Date: Wed, 26 Aug 2020 14:56:05 +0800 Subject: [PATCH 027/578] Set keepalive, user_timeout, and connect_timeout in pooler --- src/backend/libpq/pqcomm.c | 58 ++++++++++ src/backend/pgxc/pool/pgxcnode.c | 21 ++-- src/backend/pgxc/pool/poolmgr.c | 192 ++++++++++++++++--------------- src/include/libpq/libpq-be.h | 4 +- 4 files changed, 171 insertions(+), 104 deletions(-) diff --git a/src/backend/libpq/pqcomm.c b/src/backend/libpq/pqcomm.c index 6834a8c0..ca926c8c 100644 --- a/src/backend/libpq/pqcomm.c +++ b/src/backend/libpq/pqcomm.c @@ -2009,3 +2009,61 @@ pq_setkeepalivescount(int count, Port *port) return STATUS_OK; } + +/* + * Set socket keepalive and user_timeout. + * We can use this to detect the broken connection quickly. + */ +void +SetSockKeepAlive(int sock) +{ + int keepalive = 1; + /* user_timeout in ms */ + uint32 user_timeout = UINT32_MAX / 1000 < tcp_keepalives_idle ? + 0 : tcp_keepalives_idle * (uint32)1000; + struct tcp_info info; + int len = sizeof(info); + /* check sock */ + getsockopt(sock, IPPROTO_TCP, TCP_INFO, &info, (socklen_t *)&len); + if (info.tcpi_state != TCP_ESTABLISHED) + { + return; + } + + /* set keepalive */ + if (setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, + (char *)&keepalive, sizeof(keepalive)) < 0) + { + elog(LOG, "SetSockKeepAlive setsockopt(SO_KEEPALIVE) failed: %m"); + } + if (tcp_keepalives_idle > 0 && + setsockopt(sock, IPPROTO_TCP, TCP_KEEPIDLE, + (char *)&tcp_keepalives_idle, + sizeof(tcp_keepalives_idle)) < 0) + { + elog(LOG, "SetSockKeepAlive setsockopt(TCP_KEEPIDLE) failed: %m"); + } + if (tcp_keepalives_interval > 0 && + setsockopt(sock, IPPROTO_TCP, TCP_KEEPINTVL, + (char *)&tcp_keepalives_interval, + sizeof(tcp_keepalives_interval)) < 0) + { + elog(LOG, "SetSockKeepAlive setsockopt(TCP_KEEPINTVL) failed: %m"); + } + if (tcp_keepalives_count > 0 && + setsockopt(sock, IPPROTO_TCP, TCP_KEEPCNT, + (char *)&tcp_keepalives_count, + sizeof(tcp_keepalives_count)) < 0) + { + elog(LOG, "SetSockKeepAlive setsockopt(TCP_KEEPCNT) failed: %m"); + } + + /* set user_timeout */ + if (user_timeout > 0 && + setsockopt(sock, IPPROTO_TCP, TCP_USER_TIMEOUT, + (char *)&user_timeout, + sizeof(user_timeout)) < 0) + { + elog(LOG, "SetSockKeepAlive setsockopt(TCP_USER_TIMEOUT) failed: %m"); + } +} \ No newline at end of file diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c index 4279a325..7bea908f 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -489,14 +489,19 @@ PGXCNodeConnStr(char *host, int port, char *dbname, #ifdef _MLS_ } #endif - /* Check for overflow */ - if (num > 0 && num < sizeof(connstr)) - { - /* Output result */ - out = (char *) palloc(num + 1); - strcpy(out, connstr); - return out; - } + if (tcp_keepalives_idle > 0) + { + num 
+= snprintf(connstr + num, sizeof(connstr) - num, + " connect_timeout=%d", tcp_keepalives_idle); + } + /* Check for overflow */ + if (num > 0 && num < sizeof(connstr)) + { + /* Output result */ + out = (char *) palloc(num + 1); + strcpy(out, connstr); + return out; + } /* return NULL if we have problem */ return NULL; diff --git a/src/backend/pgxc/pool/poolmgr.c b/src/backend/pgxc/pool/poolmgr.c index 1f4bb7b9..dd575f76 100644 --- a/src/backend/pgxc/pool/poolmgr.c +++ b/src/backend/pgxc/pool/poolmgr.c @@ -6986,24 +6986,24 @@ preconnect_and_warm(DatabasePool *dbPool) return false; } - slot->xc_cancelConn = (NODE_CANCEL *) PQgetCancel((PGconn *)slot->conn); - - - /* Increase count of pool size */ - nodePool->slot[nodePool->freeSize] = slot; - - /* Insert at the end of the pool */ - IncreasePoolerSize(nodePool, __FILE__, __LINE__); - IncreasePoolerFreesize(nodePool,__FILE__,__LINE__); - slot->released = time(NULL); - slot->checked = slot->released; - slot->created = slot->released; - slot->node_name = nodePool->node_name; - slot->backend_pid = ((PGconn *) slot->conn)->be_pid; - if (dbPool->oldest_idle == (time_t) 0) - { - dbPool->oldest_idle = slot->released; - } + slot->xc_cancelConn = (NODE_CANCEL *) PQgetCancel((PGconn *)slot->conn); + SetSockKeepAlive(((PGconn *)slot->conn)->sock); + + /* Increase count of pool size */ + nodePool->slot[nodePool->freeSize] = slot; + + /* Insert at the end of the pool */ + IncreasePoolerSize(nodePool, __FILE__, __LINE__); + IncreasePoolerFreesize(nodePool,__FILE__,__LINE__); + slot->released = time(NULL); + slot->checked = slot->released; + slot->created = slot->released; + slot->node_name = nodePool->node_name; + slot->backend_pid = ((PGconn *) slot->conn)->be_pid; + if (dbPool->oldest_idle == (time_t) 0) + { + dbPool->oldest_idle = slot->released; + } if (PoolConnectDebugPrint) { @@ -7062,55 +7062,56 @@ void *pooler_async_connection_management_thread(void *arg) PGXCPoolConnectReq *request = NULL; PGXCNodePoolSlot *slot = NULL; - threadIndex = ((PGXCPoolConnThreadParam*)arg)->threadIndex; - while (1) - { - /* wait for signal */ - ThreadSemaDown(&g_PoolConnControl.sem[threadIndex]); - - /* create connect as needed */ - request = (PGXCPoolConnectReq*)PipeGet(g_PoolConnControl.request[threadIndex]); - if (request) - { - /* record status of the task */ - pooler_async_task_start(&g_PoolConnControl, threadIndex, request->nodeindex, NULL, InvalidOid, request->cmd); - - switch (request->cmd) - { - case COMMAND_CONNECTION_BUILD: - { - for (i = 0; i < request->size; i++, request->validSize++) - { - slot = &request->slot[i]; - /* If connection fails, be sure that slot is destroyed cleanly */ - slot->xc_cancelConn = NULL; - - /* Establish connection */ - slot->conn = PGXCNodeConnectBarely(request->connstr); - if (!PGXCNodeConnected(slot->conn)) - { - request->failed = true; - break; - } - slot->xc_cancelConn = (NODE_CANCEL *) PQgetCancel((PGconn *)slot->conn); - slot->bwarmed = false; - } - break; - } - - case COMMAND_CONNECTION_CLOSE: - { - PQfreeCancel((PGcancel *)request->slot[0].xc_cancelConn); - PGXCNodeClose(request->slot[0].conn); - break; - } - - default: - { - /* should never happen */ - abort(); - } - } + threadIndex = ((PGXCPoolConnThreadParam*)arg)->threadIndex; + while (1) + { + /* wait for signal */ + ThreadSemaDown(&g_PoolConnControl.sem[threadIndex]); + + /* create connect as needed */ + request = (PGXCPoolConnectReq*)PipeGet(g_PoolConnControl.request[threadIndex]); + if (request) + { + /* record status of the task */ + 
pooler_async_task_start(&g_PoolConnControl, threadIndex, request->nodeindex, NULL, InvalidOid, request->cmd); + + switch (request->cmd) + { + case COMMAND_CONNECTION_BUILD: + { + for (i = 0; i < request->size; i++, request->validSize++) + { + slot = &request->slot[i]; + /* If connection fails, be sure that slot is destroyed cleanly */ + slot->xc_cancelConn = NULL; + + /* Establish connection */ + slot->conn = PGXCNodeConnectBarely(request->connstr); + if (!PGXCNodeConnected(slot->conn)) + { + request->failed = true; + break; + } + slot->xc_cancelConn = (NODE_CANCEL *) PQgetCancel((PGconn *)slot->conn); + slot->bwarmed = false; + SetSockKeepAlive(((PGconn *)slot->conn)->sock); + } + break; + } + + case COMMAND_CONNECTION_CLOSE: + { + PQfreeCancel((PGcancel *)request->slot[0].xc_cancelConn); + PGXCNodeClose(request->slot[0].conn); + break; + } + + default: + { + /* should never happen */ + abort(); + } + } /* clear the work status */ pooler_async_task_done(&g_PoolConnControl, threadIndex); @@ -7357,34 +7358,35 @@ void *pooler_sync_remote_operator_thread(void *arg) request->nodepool->connstr); SpinLockRelease(&request->agent->port.lock); #endif - set_task_status(request->taskControl, PoolAyncCtlStaus_error); - finish_task_request(request->taskControl); - break; - } - - slot->xc_cancelConn = (NODE_CANCEL *) PQgetCancel((PGconn *)slot->conn); - slot->bwarmed = false; - - /* set the time flags */ - slot->released = time(NULL); - slot->checked = slot->released; - slot->created = slot->released; - - /* increase usecount */ - slot->usecount++; - slot->node_name = request->nodepool->node_name; - slot->backend_pid = ((PGconn *) slot->conn)->be_pid; - if (request->bCoord) - { - request->agent->coord_connections[request->nodeindex] = slot; - } - else - { - request->agent->dn_connections[request->nodeindex] = slot; - } - request->current_status = PoolConnectStaus_connected; -#ifdef _POOLER_CHECK_ - snprintf(request->errmsg, POOLER_ERROR_MSG_LEN, "parallel connect thread build connection to node:%s backend_pid:%d nodeidx:%d succeed", slot->node_name, slot->backend_pid, request->nodeindex); + set_task_status(request->taskControl, PoolAyncCtlStaus_error); + finish_task_request(request->taskControl); + break; + } + + slot->xc_cancelConn = (NODE_CANCEL *) PQgetCancel((PGconn *)slot->conn); + slot->bwarmed = false; + SetSockKeepAlive(((PGconn *)slot->conn)->sock); + + /* set the time flags */ + slot->released = time(NULL); + slot->checked = slot->released; + slot->created = slot->released; + + /* increase usecount */ + slot->usecount++; + slot->node_name = request->nodepool->node_name; + slot->backend_pid = ((PGconn *) slot->conn)->be_pid; + if (request->bCoord) + { + request->agent->coord_connections[request->nodeindex] = slot; + } + else + { + request->agent->dn_connections[request->nodeindex] = slot; + } + request->current_status = PoolConnectStaus_connected; +#ifdef _POOLER_CHECK_ + snprintf(request->errmsg, POOLER_ERROR_MSG_LEN, "parallel connect thread build connection to node:%s backend_pid:%d nodeidx:%d succeed", slot->node_name, slot->backend_pid, request->nodeindex); #endif continue; } diff --git a/src/include/libpq/libpq-be.h b/src/include/libpq/libpq-be.h index 42b96486..474d9690 100644 --- a/src/include/libpq/libpq-be.h +++ b/src/include/libpq/libpq-be.h @@ -287,4 +287,6 @@ extern int pq_setkeepalivesidle(int idle, Port *port); extern int pq_setkeepalivesinterval(int interval, Port *port); extern int pq_setkeepalivescount(int count, Port *port); -#endif /* LIBPQ_BE_H */ +extern void 
SetSockKeepAlive(int sock);
+
+#endif							/* LIBPQ_BE_H */

From 5ab4c78bd9f89ce11b7d740ea44f83dc1d375a8b Mon Sep 17 00:00:00 2001
From: qiannzhang
Date: Fri, 28 Aug 2020 15:46:11 +0800
Subject: [PATCH 028/578] ID81500043: also check xmin if tmin is invalid

---
 src/backend/access/transam/twophase.c | 46 +++++++++++++++------------
 1 file changed, 25 insertions(+), 21 deletions(-)

diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c
index 204e9edd..28197d77 100644
--- a/src/backend/access/transam/twophase.c
+++ b/src/backend/access/transam/twophase.c
@@ -436,27 +436,31 @@ EndGlobalPrepare(GlobalTransaction gxact, bool isImplicit)
 }
 
 #ifdef __TBASE_DEBUG__
-    if(enable_distri_print)
-    {
-        InsertPreparedXid(pgxact->xid, GetGlobalPrepareTimestamp());
-    }
-#endif
-
-    // SetGlobalPrepareTimestamp(InvalidGlobalTimestamp);
-    /*
-     * Transfer the tmin to the prepared proc without locking.
-     * As the prepare xact procs lie behind the normal procs in proc array,
-     * Get Snapshot would not miss the tmin even when it is being transferred.
-     */
-    pg_atomic_write_u64(&pgxact->tmin, pg_atomic_read_u64(&MyPgXact->tmin));
-    if(!GlobalTimestampIsValid(pg_atomic_read_u64(&MyPgXact->tmin)))
-    {
-        elog(LOG,
-             "prepare transaction %d does not have valid tmin. autovacuum %d",
-             MyPgXact->xid, IsAutoVacuumWorkerProcess());
-    }
-
-
+    if(enable_distri_print)
+    {
+        InsertPreparedXid(pgxact->xid, GetGlobalPrepareTimestamp());
+    }
+#endif
+
+    // SetGlobalPrepareTimestamp(InvalidGlobalTimestamp);
+    /*
+     * Transfer the tmin to the prepared proc without locking.
+     * As the prepare xact procs lie behind the normal procs in proc array,
+     * Get Snapshot would not miss the tmin even when it is being transferred.
+     *
+     * According to PortalRunUtility, we do not set a snapshot if the
+     * transaction only contains utilities that do not need one. In that
+     * case xmin and tmin are both invalid, since both are set from the
+     * snapshot. So if xmin is valid, tmin should also be valid.
+     */
+    pg_atomic_write_u64(&pgxact->tmin, pg_atomic_read_u64(&MyPgXact->tmin));
+    if(!GlobalTimestampIsValid(pg_atomic_read_u64(&MyPgXact->tmin)) &&
+       TransactionIdIsValid(MyPgXact->xmin))
+    {
+        elog(LOG,
+             "prepare transaction %d does not have valid tmin. autovacuum %d",
+             MyPgXact->xid, IsAutoVacuumWorkerProcess());
+    }
 }

From be971c2fa1438411e14910ed1346014a4d1d18f0 Mon Sep 17 00:00:00 2001
From: youngxie
Date: Fri, 7 Aug 2020 15:46:31 +0800
Subject: [PATCH 029/578] Support sublink pull-up in the targetlist. A
 subquery in the targetlist has scalar semantics, so a normal join would
 simply generate duplicated result tuples. We therefore add a new join type,
 JOIN_LEFT_SCALAR, which acts like a left join but reports an error when the
 scalar semantics are violated.
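
For illustration only (this note is not part of the original commit), a minimal
sketch of the behavior this join type enforces, assuming hypothetical tables
t1(a) and t2(a, b):

    -- The scalar sublink in the targetlist can now be pulled up into a
    -- JOIN_LEFT_SCALAR join instead of being executed as a correlated subplan:
    SELECT t1.a,
           (SELECT t2.b FROM t2 WHERE t2.a = t1.a) AS b
    FROM t1;
    -- If more than one t2 row matches a given t1 row, the executor raises:
    --   ERROR:  more than one row returned by a subquery used as an expression

Unmatched outer rows still produce a NULL on the inner side, as with a plain
left join.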
--- src/backend/commands/explain.c | 501 +++--- src/backend/executor/nodeHashjoin.c | 501 +++--- src/backend/executor/nodeMergejoin.c | 1803 +++++++++---------- src/backend/executor/nodeNestloop.c | 373 ++-- src/backend/optimizer/path/allpaths.c | 60 +- src/backend/optimizer/path/costsize.c | 1968 +++++++++++---------- src/backend/optimizer/path/indxpath.c | 46 +- src/backend/optimizer/path/joinpath.c | 733 ++++---- src/backend/optimizer/path/joinrels.c | 933 +++++----- src/backend/optimizer/plan/initsplan.c | 1451 +++++++-------- src/backend/optimizer/plan/setrefs.c | 225 +-- src/backend/optimizer/plan/subselect.c | 127 +- src/backend/optimizer/prep/prepjointree.c | 1118 ++++++------ src/backend/optimizer/util/pathnode.c | 176 +- src/backend/utils/adt/network_selfuncs.c | 91 +- src/backend/utils/adt/selfuncs.c | 101 +- src/include/nodes/nodes.h | 83 +- src/include/optimizer/subselect.h | 3 +- src/test/regress/expected/subselect.out | 334 +++- src/test/regress/sql/subselect.sql | 80 + 20 files changed, 5729 insertions(+), 4978 deletions(-) diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index 0c22d92b..d49eebc8 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -1312,255 +1312,260 @@ ExplainNode(PlanState *planstate, List *ancestors, } break; #endif /* XCP */ - case T_IndexScan: - { - IndexScan *indexscan = (IndexScan *) plan; - - ExplainIndexScanDetails(indexscan->indexid, - indexscan->indexorderdir, - es); - ExplainScanTarget((Scan *) indexscan, es); - } - break; - case T_IndexOnlyScan: - { - IndexOnlyScan *indexonlyscan = (IndexOnlyScan *) plan; - - ExplainIndexScanDetails(indexonlyscan->indexid, - indexonlyscan->indexorderdir, - es); - ExplainScanTarget((Scan *) indexonlyscan, es); - } - break; - case T_BitmapIndexScan: - { - BitmapIndexScan *bitmapindexscan = (BitmapIndexScan *) plan; - const char *indexname = - explain_get_index_name(bitmapindexscan->indexid); - - if (es->format == EXPLAIN_FORMAT_TEXT) - appendStringInfo(es->str, " on %s", indexname); - else - ExplainPropertyText("Index Name", indexname, es); - } - break; - case T_ModifyTable: - ExplainModifyTarget((ModifyTable *) plan, es); - break; - case T_NestLoop: - case T_MergeJoin: - case T_HashJoin: - { - const char *jointype; - - switch (((Join *) plan)->jointype) - { - case JOIN_INNER: - jointype = "Inner"; - break; - case JOIN_LEFT: - jointype = "Left"; - break; - case JOIN_FULL: - jointype = "Full"; - break; - case JOIN_RIGHT: - jointype = "Right"; - break; - case JOIN_SEMI: - jointype = "Semi"; - break; - case JOIN_ANTI: - jointype = "Anti"; - break; - default: - jointype = "???"; - break; - } - if (es->format == EXPLAIN_FORMAT_TEXT) - { - /* - * For historical reasons, the join type is interpolated - * into the node type name... 
- */ - if (((Join *) plan)->jointype != JOIN_INNER) - appendStringInfo(es->str, " %s Join", jointype); - else if (!IsA(plan, NestLoop)) - appendStringInfoString(es->str, " Join"); - } - else - ExplainPropertyText("Join Type", jointype, es); - } - break; - case T_SetOp: - { - const char *setopcmd; - - switch (((SetOp *) plan)->cmd) - { - case SETOPCMD_INTERSECT: - setopcmd = "Intersect"; - break; - case SETOPCMD_INTERSECT_ALL: - setopcmd = "Intersect All"; - break; - case SETOPCMD_EXCEPT: - setopcmd = "Except"; - break; - case SETOPCMD_EXCEPT_ALL: - setopcmd = "Except All"; - break; - default: - setopcmd = "???"; + case T_IndexScan: + { + IndexScan *indexscan = (IndexScan *) plan; + + ExplainIndexScanDetails(indexscan->indexid, + indexscan->indexorderdir, + es); + ExplainScanTarget((Scan *) indexscan, es); + } + break; + case T_IndexOnlyScan: + { + IndexOnlyScan *indexonlyscan = (IndexOnlyScan *) plan; + + ExplainIndexScanDetails(indexonlyscan->indexid, + indexonlyscan->indexorderdir, + es); + ExplainScanTarget((Scan *) indexonlyscan, es); + } + break; + case T_BitmapIndexScan: + { + BitmapIndexScan *bitmapindexscan = (BitmapIndexScan *) plan; + const char *indexname = + explain_get_index_name(bitmapindexscan->indexid); + + if (es->format == EXPLAIN_FORMAT_TEXT) + appendStringInfo(es->str, " on %s", indexname); + else + ExplainPropertyText("Index Name", indexname, es); + } + break; + case T_ModifyTable: + ExplainModifyTarget((ModifyTable *) plan, es); + break; + case T_NestLoop: + case T_MergeJoin: + case T_HashJoin: + { + const char *jointype; + + switch (((Join *) plan)->jointype) + { + case JOIN_INNER: + jointype = "Inner"; + break; + case JOIN_LEFT: + jointype = "Left"; + break; + case JOIN_FULL: + jointype = "Full"; + break; + case JOIN_RIGHT: + jointype = "Right"; + break; + case JOIN_SEMI: + jointype = "Semi"; + break; + case JOIN_ANTI: + jointype = "Anti"; + break; +#ifdef __TBASE__ + case JOIN_LEFT_SCALAR: + jointype = "Left Scalar"; break; - } - if (es->format == EXPLAIN_FORMAT_TEXT) - appendStringInfo(es->str, " %s", setopcmd); - else - ExplainPropertyText("Command", setopcmd, es); - } - break; - default: - break; - } - - if (es->costs) - { - if (es->format == EXPLAIN_FORMAT_TEXT) - { - appendStringInfo(es->str, " (cost=%.2f..%.2f rows=%.0f width=%d)", - plan->startup_cost, plan->total_cost, - plan->plan_rows, plan->plan_width); - } - else - { - ExplainPropertyFloat("Startup Cost", plan->startup_cost, 2, es); - ExplainPropertyFloat("Total Cost", plan->total_cost, 2, es); - ExplainPropertyFloat("Plan Rows", plan->plan_rows, 0, es); - ExplainPropertyInteger("Plan Width", plan->plan_width, es); - } - } - - /* - * We have to forcibly clean up the instrumentation state because we - * haven't done ExecutorEnd yet. This is pretty grotty ... - * - * Note: contrib/auto_explain could cause instrumentation to be set up - * even though we didn't ask for it here. Be careful not to print any - * instrumentation results the user didn't ask for. But we do the - * InstrEndLoop call anyway, if possible, to reduce the number of cases - * auto_explain has to contend with. 
- */ - if (planstate->instrument) - InstrEndLoop(planstate->instrument); - - if (es->analyze && - planstate->instrument && planstate->instrument->nloops > 0) - { - double nloops = planstate->instrument->nloops; - double startup_sec = 1000.0 * planstate->instrument->startup / nloops; - double total_sec = 1000.0 * planstate->instrument->total / nloops; - double rows = planstate->instrument->ntuples / nloops; - - if (es->format == EXPLAIN_FORMAT_TEXT) - { - if (es->timing) - appendStringInfo(es->str, - " (actual time=%.3f..%.3f rows=%.0f loops=%.0f)", - startup_sec, total_sec, rows, nloops); - else - appendStringInfo(es->str, - " (actual rows=%.0f loops=%.0f)", - rows, nloops); - } - else - { - if (es->timing) - { - ExplainPropertyFloat("Actual Startup Time", startup_sec, 3, es); - ExplainPropertyFloat("Actual Total Time", total_sec, 3, es); - } - ExplainPropertyFloat("Actual Rows", rows, 0, es); - ExplainPropertyFloat("Actual Loops", nloops, 0, es); - } - } - else if (es->analyze) - { - if (es->format == EXPLAIN_FORMAT_TEXT) - appendStringInfoString(es->str, " (never executed)"); - else - { - if (es->timing) - { - ExplainPropertyFloat("Actual Startup Time", 0.0, 3, es); - ExplainPropertyFloat("Actual Total Time", 0.0, 3, es); - } - ExplainPropertyFloat("Actual Rows", 0.0, 0, es); - ExplainPropertyFloat("Actual Loops", 0.0, 0, es); - } - } - - /* in text format, first line ends here */ - if (es->format == EXPLAIN_FORMAT_TEXT) - appendStringInfoChar(es->str, '\n'); - - /* target list */ - if (es->verbose) - show_plan_tlist(planstate, ancestors, es); - - /* unique join */ - switch (nodeTag(plan)) - { - case T_NestLoop: - case T_MergeJoin: - case T_HashJoin: - /* try not to be too chatty about this in text mode */ - if (es->format != EXPLAIN_FORMAT_TEXT || - (es->verbose && ((Join *) plan)->inner_unique)) - ExplainPropertyBool("Inner Unique", - ((Join *) plan)->inner_unique, - es); - break; - default: - break; - } - - /* quals, sort keys, etc */ - switch (nodeTag(plan)) - { - case T_IndexScan: - show_scan_qual(((IndexScan *) plan)->indexqualorig, - "Index Cond", planstate, ancestors, es); - if (((IndexScan *) plan)->indexqualorig) - show_instrumentation_count("Rows Removed by Index Recheck", 2, - planstate, es); - show_scan_qual(((IndexScan *) plan)->indexorderbyorig, - "Order By", planstate, ancestors, es); - show_scan_qual(plan->qual, "Filter", planstate, ancestors, es); - if (plan->qual) - show_instrumentation_count("Rows Removed by Filter", 1, - planstate, es); - break; - case T_IndexOnlyScan: - show_scan_qual(((IndexOnlyScan *) plan)->indexqual, - "Index Cond", planstate, ancestors, es); - if (((IndexOnlyScan *) plan)->indexqual) - show_instrumentation_count("Rows Removed by Index Recheck", 2, - planstate, es); - show_scan_qual(((IndexOnlyScan *) plan)->indexorderby, - "Order By", planstate, ancestors, es); - show_scan_qual(plan->qual, "Filter", planstate, ancestors, es); - if (plan->qual) - show_instrumentation_count("Rows Removed by Filter", 1, - planstate, es); - if (es->analyze) - ExplainPropertyLong("Heap Fetches", - ((IndexOnlyScanState *) planstate)->ioss_HeapFetches, es); - break; - case T_BitmapIndexScan: - show_scan_qual(((BitmapIndexScan *) plan)->indexqualorig, - "Index Cond", planstate, ancestors, es); - break; +#endif + default: + jointype = "???"; + break; + } + if (es->format == EXPLAIN_FORMAT_TEXT) + { + /* + * For historical reasons, the join type is interpolated + * into the node type name... 
+ */ + if (((Join *) plan)->jointype != JOIN_INNER) + appendStringInfo(es->str, " %s Join", jointype); + else if (!IsA(plan, NestLoop)) + appendStringInfoString(es->str, " Join"); + } + else + ExplainPropertyText("Join Type", jointype, es); + } + break; + case T_SetOp: + { + const char *setopcmd; + + switch (((SetOp *) plan)->cmd) + { + case SETOPCMD_INTERSECT: + setopcmd = "Intersect"; + break; + case SETOPCMD_INTERSECT_ALL: + setopcmd = "Intersect All"; + break; + case SETOPCMD_EXCEPT: + setopcmd = "Except"; + break; + case SETOPCMD_EXCEPT_ALL: + setopcmd = "Except All"; + break; + default: + setopcmd = "???"; + break; + } + if (es->format == EXPLAIN_FORMAT_TEXT) + appendStringInfo(es->str, " %s", setopcmd); + else + ExplainPropertyText("Command", setopcmd, es); + } + break; + default: + break; + } + + if (es->costs) + { + if (es->format == EXPLAIN_FORMAT_TEXT) + { + appendStringInfo(es->str, " (cost=%.2f..%.2f rows=%.0f width=%d)", + plan->startup_cost, plan->total_cost, + plan->plan_rows, plan->plan_width); + } + else + { + ExplainPropertyFloat("Startup Cost", plan->startup_cost, 2, es); + ExplainPropertyFloat("Total Cost", plan->total_cost, 2, es); + ExplainPropertyFloat("Plan Rows", plan->plan_rows, 0, es); + ExplainPropertyInteger("Plan Width", plan->plan_width, es); + } + } + + /* + * We have to forcibly clean up the instrumentation state because we + * haven't done ExecutorEnd yet. This is pretty grotty ... + * + * Note: contrib/auto_explain could cause instrumentation to be set up + * even though we didn't ask for it here. Be careful not to print any + * instrumentation results the user didn't ask for. But we do the + * InstrEndLoop call anyway, if possible, to reduce the number of cases + * auto_explain has to contend with. + */ + if (planstate->instrument) + InstrEndLoop(planstate->instrument); + + if (es->analyze && + planstate->instrument && planstate->instrument->nloops > 0) + { + double nloops = planstate->instrument->nloops; + double startup_sec = 1000.0 * planstate->instrument->startup / nloops; + double total_sec = 1000.0 * planstate->instrument->total / nloops; + double rows = planstate->instrument->ntuples / nloops; + + if (es->format == EXPLAIN_FORMAT_TEXT) + { + if (es->timing) + appendStringInfo(es->str, + " (actual time=%.3f..%.3f rows=%.0f loops=%.0f)", + startup_sec, total_sec, rows, nloops); + else + appendStringInfo(es->str, + " (actual rows=%.0f loops=%.0f)", + rows, nloops); + } + else + { + if (es->timing) + { + ExplainPropertyFloat("Actual Startup Time", startup_sec, 3, es); + ExplainPropertyFloat("Actual Total Time", total_sec, 3, es); + } + ExplainPropertyFloat("Actual Rows", rows, 0, es); + ExplainPropertyFloat("Actual Loops", nloops, 0, es); + } + } + else if (es->analyze) + { + if (es->format == EXPLAIN_FORMAT_TEXT) + appendStringInfoString(es->str, " (never executed)"); + else + { + if (es->timing) + { + ExplainPropertyFloat("Actual Startup Time", 0.0, 3, es); + ExplainPropertyFloat("Actual Total Time", 0.0, 3, es); + } + ExplainPropertyFloat("Actual Rows", 0.0, 0, es); + ExplainPropertyFloat("Actual Loops", 0.0, 0, es); + } + } + + /* in text format, first line ends here */ + if (es->format == EXPLAIN_FORMAT_TEXT) + appendStringInfoChar(es->str, '\n'); + + /* target list */ + if (es->verbose) + show_plan_tlist(planstate, ancestors, es); + + /* unique join */ + switch (nodeTag(plan)) + { + case T_NestLoop: + case T_MergeJoin: + case T_HashJoin: + /* try not to be too chatty about this in text mode */ + if (es->format != EXPLAIN_FORMAT_TEXT || + 
(es->verbose && ((Join *) plan)->inner_unique)) + ExplainPropertyBool("Inner Unique", + ((Join *) plan)->inner_unique, + es); + break; + default: + break; + } + + /* quals, sort keys, etc */ + switch (nodeTag(plan)) + { + case T_IndexScan: + show_scan_qual(((IndexScan *) plan)->indexqualorig, + "Index Cond", planstate, ancestors, es); + if (((IndexScan *) plan)->indexqualorig) + show_instrumentation_count("Rows Removed by Index Recheck", 2, + planstate, es); + show_scan_qual(((IndexScan *) plan)->indexorderbyorig, + "Order By", planstate, ancestors, es); + show_scan_qual(plan->qual, "Filter", planstate, ancestors, es); + if (plan->qual) + show_instrumentation_count("Rows Removed by Filter", 1, + planstate, es); + break; + case T_IndexOnlyScan: + show_scan_qual(((IndexOnlyScan *) plan)->indexqual, + "Index Cond", planstate, ancestors, es); + if (((IndexOnlyScan *) plan)->indexqual) + show_instrumentation_count("Rows Removed by Index Recheck", 2, + planstate, es); + show_scan_qual(((IndexOnlyScan *) plan)->indexorderby, + "Order By", planstate, ancestors, es); + show_scan_qual(plan->qual, "Filter", planstate, ancestors, es); + if (plan->qual) + show_instrumentation_count("Rows Removed by Filter", 1, + planstate, es); + if (es->analyze) + ExplainPropertyLong("Heap Fetches", + ((IndexOnlyScanState *) planstate)->ioss_HeapFetches, es); + break; + case T_BitmapIndexScan: + show_scan_qual(((BitmapIndexScan *) plan)->indexqualorig, + "Index Cond", planstate, ancestors, es); + break; #ifdef PGXC case T_RemoteQuery: /* Remote query */ diff --git a/src/backend/executor/nodeHashjoin.c b/src/backend/executor/nodeHashjoin.c index 87fde8bd..9f1b7b90 100644 --- a/src/backend/executor/nodeHashjoin.c +++ b/src/backend/executor/nodeHashjoin.c @@ -410,98 +410,104 @@ ExecHashJoin(PlanState *pstate) } } #endif - /* set up to scan for unmatched inner tuples */ - ExecPrepHashTableForUnmatched(node); - node->hj_JoinState = HJ_FILL_INNER_TUPLES; - } - else - node->hj_JoinState = HJ_NEED_NEW_BATCH; - continue; - } - - econtext->ecxt_outertuple = outerTupleSlot; - node->hj_MatchedOuter = false; - - /* - * Find the corresponding bucket for this tuple in the main - * hash table or skew hash table. - */ - node->hj_CurHashValue = hashvalue; - ExecHashGetBucketAndBatch(hashtable, hashvalue, - &node->hj_CurBucketNo, &batchno); - node->hj_CurSkewBucketNo = ExecHashGetSkewBucket(hashtable, - hashvalue); - node->hj_CurTuple = NULL; - - /* - * The tuple might not belong to the current batch (where - * "current batch" includes the skew buckets if any). - */ - if (batchno != hashtable->curbatch && - node->hj_CurSkewBucketNo == INVALID_SKEW_BUCKET_NO) - { - /* - * Need to postpone this outer tuple to a later batch. - * Save it in the corresponding outer-batch file. - */ - Assert(batchno > hashtable->curbatch); - ExecHashJoinSaveTuple(ExecFetchSlotMinimalTuple(outerTupleSlot), - hashvalue, - &hashtable->outerBatchFile[batchno]); - /* Loop around, staying in HJ_NEED_NEW_OUTER state */ - continue; - } - - /* OK, let's scan the bucket for matches */ - node->hj_JoinState = HJ_SCAN_BUCKET; - - /* FALL THRU */ - - case HJ_SCAN_BUCKET: - - /* - * Scan the selected hash bucket for matches to current outer - */ - if (!ExecScanHashBucket(node, econtext)) - { - /* out of matches; check for possible outer-join fill */ - node->hj_JoinState = HJ_FILL_OUTER_TUPLE; - continue; - } - - /* - * We've got a match, but still need to test non-hashed quals. - * ExecScanHashBucket already set up all the state needed to - * call ExecQual. 
- * - * If we pass the qual, then save state for next call and have - * ExecProject form the projection, store it in the tuple - * table, and return the slot. - * - * Only the joinquals determine tuple match status, but all - * quals must pass to actually return the tuple. - */ - if (joinqual == NULL || ExecQual(joinqual, econtext)) - { - node->hj_MatchedOuter = true; - HeapTupleHeaderSetMatch(HJTUPLE_MINTUPLE(node->hj_CurTuple)); - - /* In an antijoin, we never return a matched tuple */ - if (node->js.jointype == JOIN_ANTI) - { - node->hj_JoinState = HJ_NEED_NEW_OUTER; - continue; - } - - /* - * If we only need to join to the first matching inner - * tuple, then consider returning this one, but after that - * continue with next outer tuple. - */ - if (node->js.single_match) - node->hj_JoinState = HJ_NEED_NEW_OUTER; - - if (otherqual == NULL || ExecQual(otherqual, econtext)) + /* set up to scan for unmatched inner tuples */ + ExecPrepHashTableForUnmatched(node); + node->hj_JoinState = HJ_FILL_INNER_TUPLES; + } + else + node->hj_JoinState = HJ_NEED_NEW_BATCH; + continue; + } + + econtext->ecxt_outertuple = outerTupleSlot; + node->hj_MatchedOuter = false; + + /* + * Find the corresponding bucket for this tuple in the main + * hash table or skew hash table. + */ + node->hj_CurHashValue = hashvalue; + ExecHashGetBucketAndBatch(hashtable, hashvalue, + &node->hj_CurBucketNo, &batchno); + node->hj_CurSkewBucketNo = ExecHashGetSkewBucket(hashtable, + hashvalue); + node->hj_CurTuple = NULL; + + /* + * The tuple might not belong to the current batch (where + * "current batch" includes the skew buckets if any). + */ + if (batchno != hashtable->curbatch && + node->hj_CurSkewBucketNo == INVALID_SKEW_BUCKET_NO) + { + /* + * Need to postpone this outer tuple to a later batch. + * Save it in the corresponding outer-batch file. + */ + Assert(batchno > hashtable->curbatch); + ExecHashJoinSaveTuple(ExecFetchSlotMinimalTuple(outerTupleSlot), + hashvalue, + &hashtable->outerBatchFile[batchno]); + /* Loop around, staying in HJ_NEED_NEW_OUTER state */ + continue; + } + + /* OK, let's scan the bucket for matches */ + node->hj_JoinState = HJ_SCAN_BUCKET; + + /* FALL THRU */ + + case HJ_SCAN_BUCKET: + + /* + * Scan the selected hash bucket for matches to current outer + */ + if (!ExecScanHashBucket(node, econtext)) + { + /* out of matches; check for possible outer-join fill */ + node->hj_JoinState = HJ_FILL_OUTER_TUPLE; + continue; + } + + /* + * We've got a match, but still need to test non-hashed quals. + * ExecScanHashBucket already set up all the state needed to + * call ExecQual. + * + * If we pass the qual, then save state for next call and have + * ExecProject form the projection, store it in the tuple + * table, and return the slot. + * + * Only the joinquals determine tuple match status, but all + * quals must pass to actually return the tuple. 
+ */ + if (joinqual == NULL || ExecQual(joinqual, econtext)) + { +#ifdef __TBASE__ + if (node->js.jointype == JOIN_LEFT_SCALAR && node->hj_MatchedOuter) + ereport(ERROR, + (errcode(ERRCODE_CARDINALITY_VIOLATION), + errmsg("more than one row returned by a subquery used as an expression"))); +#endif + node->hj_MatchedOuter = true; + HeapTupleHeaderSetMatch(HJTUPLE_MINTUPLE(node->hj_CurTuple)); + + /* In an antijoin, we never return a matched tuple */ + if (node->js.jointype == JOIN_ANTI) + { + node->hj_JoinState = HJ_NEED_NEW_OUTER; + continue; + } + + /* + * If we only need to join to the first matching inner + * tuple, then consider returning this one, but after that + * continue with next outer tuple. + */ + if (node->js.single_match) + node->hj_JoinState = HJ_NEED_NEW_OUTER; + + if (otherqual == NULL || ExecQual(otherqual, econtext)) #ifdef __TBASE__ { node->matched_tuples++; @@ -593,161 +599,168 @@ ExecHashJoin(PlanState *pstate) */ HashJoinState * ExecInitHashJoin(HashJoin *node, EState *estate, int eflags) -{// #lizard forgives - HashJoinState *hjstate; - Plan *outerNode; - Hash *hashNode; - List *lclauses; - List *rclauses; - List *hoperators; - ListCell *l; - - /* check for unsupported flags */ - Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK))); - - /* - * create state structure - */ - hjstate = makeNode(HashJoinState); - hjstate->js.ps.plan = (Plan *) node; - hjstate->js.ps.state = estate; - hjstate->js.ps.ExecProcNode = ExecHashJoin; - - /* - * Miscellaneous initialization - * - * create expression context for node - */ - ExecAssignExprContext(estate, &hjstate->js.ps); - - /* - * initialize child expressions - */ - hjstate->js.ps.qual = - ExecInitQual(node->join.plan.qual, (PlanState *) hjstate); - hjstate->js.jointype = node->join.jointype; - hjstate->js.joinqual = - ExecInitQual(node->join.joinqual, (PlanState *) hjstate); - hjstate->hashclauses = - ExecInitQual(node->hashclauses, (PlanState *) hjstate); - - /* - * initialize child nodes - * - * Note: we could suppress the REWIND flag for the inner input, which - * would amount to betting that the hash will be a single batch. Not - * clear if this would be a win or not. 
- */ - outerNode = outerPlan(node); - hashNode = (Hash *) innerPlan(node); - - outerPlanState(hjstate) = ExecInitNode(outerNode, estate, eflags); - innerPlanState(hjstate) = ExecInitNode((Plan *) hashNode, estate, eflags); - - /* - * tuple table initialization - */ - ExecInitResultTupleSlot(estate, &hjstate->js.ps); - hjstate->hj_OuterTupleSlot = ExecInitExtraTupleSlot(estate); - - /* - * detect whether we need only consider the first matching inner tuple - */ - hjstate->js.single_match = (node->join.inner_unique || - node->join.jointype == JOIN_SEMI); - - /* set up null tuples for outer joins, if needed */ - switch (node->join.jointype) - { - case JOIN_INNER: +{ + HashJoinState *hjstate; + Plan *outerNode; + Hash *hashNode; + List *lclauses; + List *rclauses; + List *hoperators; + ListCell *l; + + /* check for unsupported flags */ + Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK))); + + /* + * create state structure + */ + hjstate = makeNode(HashJoinState); + hjstate->js.ps.plan = (Plan *) node; + hjstate->js.ps.state = estate; + hjstate->js.ps.ExecProcNode = ExecHashJoin; + + /* + * Miscellaneous initialization + * + * create expression context for node + */ + ExecAssignExprContext(estate, &hjstate->js.ps); + + /* + * initialize child expressions + */ + hjstate->js.ps.qual = + ExecInitQual(node->join.plan.qual, (PlanState *) hjstate); + hjstate->js.jointype = node->join.jointype; + hjstate->js.joinqual = + ExecInitQual(node->join.joinqual, (PlanState *) hjstate); + hjstate->hashclauses = + ExecInitQual(node->hashclauses, (PlanState *) hjstate); + + /* + * initialize child nodes + * + * Note: we could suppress the REWIND flag for the inner input, which + * would amount to betting that the hash will be a single batch. Not + * clear if this would be a win or not. + */ + outerNode = outerPlan(node); + hashNode = (Hash *) innerPlan(node); + + outerPlanState(hjstate) = ExecInitNode(outerNode, estate, eflags); + innerPlanState(hjstate) = ExecInitNode((Plan *) hashNode, estate, eflags); + + /* + * tuple table initialization + */ + ExecInitResultTupleSlot(estate, &hjstate->js.ps); + hjstate->hj_OuterTupleSlot = ExecInitExtraTupleSlot(estate); + + /* + * detect whether we need only consider the first matching inner tuple + */ + hjstate->js.single_match = (node->join.inner_unique || + node->join.jointype == JOIN_SEMI); + + /* set up null tuples for outer joins, if needed */ + switch (node->join.jointype) + { + case JOIN_INNER: case JOIN_SEMI: - break; - case JOIN_LEFT: - case JOIN_ANTI: - hjstate->hj_NullInnerTupleSlot = - ExecInitNullTupleSlot(estate, - ExecGetResultType(innerPlanState(hjstate))); - break; - case JOIN_RIGHT: - hjstate->hj_NullOuterTupleSlot = - ExecInitNullTupleSlot(estate, - ExecGetResultType(outerPlanState(hjstate))); - break; - case JOIN_FULL: - hjstate->hj_NullOuterTupleSlot = - ExecInitNullTupleSlot(estate, - ExecGetResultType(outerPlanState(hjstate))); + break; +#ifdef __TBASE__ + case JOIN_LEFT_SCALAR: hjstate->hj_NullInnerTupleSlot = - ExecInitNullTupleSlot(estate, - ExecGetResultType(innerPlanState(hjstate))); + ExecInitNullTupleSlot(estate, + ExecGetResultType(innerPlanState(hjstate))); break; - default: - elog(ERROR, "unrecognized join type: %d", - (int) node->join.jointype); - } - - /* - * now for some voodoo. our temporary tuple slot is actually the result - * tuple slot of the Hash node (which is our inner plan). 
we can do this - * because Hash nodes don't return tuples via ExecProcNode() -- instead - * the hash join node uses ExecScanHashBucket() to get at the contents of - * the hash table. -cim 6/9/91 - */ - { - HashState *hashstate = (HashState *) innerPlanState(hjstate); - TupleTableSlot *slot = hashstate->ps.ps_ResultTupleSlot; - - hjstate->hj_HashTupleSlot = slot; - } - - /* - * initialize tuple type and projection info - */ - ExecAssignResultTypeFromTL(&hjstate->js.ps); - ExecAssignProjectionInfo(&hjstate->js.ps, NULL); - - ExecSetSlotDescriptor(hjstate->hj_OuterTupleSlot, - ExecGetResultType(outerPlanState(hjstate))); - - /* - * initialize hash-specific info - */ - hjstate->hj_HashTable = NULL; - hjstate->hj_FirstOuterTupleSlot = NULL; - - hjstate->hj_CurHashValue = 0; - hjstate->hj_CurBucketNo = 0; - hjstate->hj_CurSkewBucketNo = INVALID_SKEW_BUCKET_NO; - hjstate->hj_CurTuple = NULL; - - /* - * Deconstruct the hash clauses into outer and inner argument values, so - * that we can evaluate those subexpressions separately. Also make a list - * of the hash operator OIDs, in preparation for looking up the hash - * functions to use. - */ - lclauses = NIL; - rclauses = NIL; - hoperators = NIL; - foreach(l, node->hashclauses) - { - OpExpr *hclause = lfirst_node(OpExpr, l); - - lclauses = lappend(lclauses, ExecInitExpr(linitial(hclause->args), - (PlanState *) hjstate)); - rclauses = lappend(rclauses, ExecInitExpr(lsecond(hclause->args), - (PlanState *) hjstate)); - hoperators = lappend_oid(hoperators, hclause->opno); - } - hjstate->hj_OuterHashKeys = lclauses; - hjstate->hj_InnerHashKeys = rclauses; - hjstate->hj_HashOperators = hoperators; - /* child Hash node needs to evaluate inner hash keys, too */ - ((HashState *) innerPlanState(hjstate))->hashkeys = rclauses; - - hjstate->hj_JoinState = HJ_BUILD_HASHTABLE; - hjstate->hj_MatchedOuter = false; - hjstate->hj_OuterNotEmpty = false; +#endif + case JOIN_LEFT: + case JOIN_ANTI: + hjstate->hj_NullInnerTupleSlot = + ExecInitNullTupleSlot(estate, + ExecGetResultType(innerPlanState(hjstate))); + break; + case JOIN_RIGHT: + hjstate->hj_NullOuterTupleSlot = + ExecInitNullTupleSlot(estate, + ExecGetResultType(outerPlanState(hjstate))); + break; + case JOIN_FULL: + hjstate->hj_NullOuterTupleSlot = + ExecInitNullTupleSlot(estate, + ExecGetResultType(outerPlanState(hjstate))); + hjstate->hj_NullInnerTupleSlot = + ExecInitNullTupleSlot(estate, + ExecGetResultType(innerPlanState(hjstate))); + break; + default: + elog(ERROR, "unrecognized join type: %d", + (int) node->join.jointype); + } + + /* + * now for some voodoo. our temporary tuple slot is actually the result + * tuple slot of the Hash node (which is our inner plan). we can do this + * because Hash nodes don't return tuples via ExecProcNode() -- instead + * the hash join node uses ExecScanHashBucket() to get at the contents of + * the hash table. 
-cim 6/9/91 + */ + { + HashState *hashstate = (HashState *) innerPlanState(hjstate); + TupleTableSlot *slot = hashstate->ps.ps_ResultTupleSlot; + + hjstate->hj_HashTupleSlot = slot; + } + + /* + * initialize tuple type and projection info + */ + ExecAssignResultTypeFromTL(&hjstate->js.ps); + ExecAssignProjectionInfo(&hjstate->js.ps, NULL); + + ExecSetSlotDescriptor(hjstate->hj_OuterTupleSlot, + ExecGetResultType(outerPlanState(hjstate))); + + /* + * initialize hash-specific info + */ + hjstate->hj_HashTable = NULL; + hjstate->hj_FirstOuterTupleSlot = NULL; + + hjstate->hj_CurHashValue = 0; + hjstate->hj_CurBucketNo = 0; + hjstate->hj_CurSkewBucketNo = INVALID_SKEW_BUCKET_NO; + hjstate->hj_CurTuple = NULL; + + /* + * Deconstruct the hash clauses into outer and inner argument values, so + * that we can evaluate those subexpressions separately. Also make a list + * of the hash operator OIDs, in preparation for looking up the hash + * functions to use. + */ + lclauses = NIL; + rclauses = NIL; + hoperators = NIL; + foreach(l, node->hashclauses) + { + OpExpr *hclause = lfirst_node(OpExpr, l); + + lclauses = lappend(lclauses, ExecInitExpr(linitial(hclause->args), + (PlanState *) hjstate)); + rclauses = lappend(rclauses, ExecInitExpr(lsecond(hclause->args), + (PlanState *) hjstate)); + hoperators = lappend_oid(hoperators, hclause->opno); + } + hjstate->hj_OuterHashKeys = lclauses; + hjstate->hj_InnerHashKeys = rclauses; + hjstate->hj_HashOperators = hoperators; + /* child Hash node needs to evaluate inner hash keys, too */ + ((HashState *) innerPlanState(hjstate))->hashkeys = rclauses; + + hjstate->hj_JoinState = HJ_BUILD_HASHTABLE; + hjstate->hj_MatchedOuter = false; + hjstate->hj_OuterNotEmpty = false; #ifdef __TBASE__ hjstate->hj_OuterInited = false; hjstate->hj_InnerInited = false; diff --git a/src/backend/executor/nodeMergejoin.c b/src/backend/executor/nodeMergejoin.c index 985362fe..d10b74b0 100644 --- a/src/backend/executor/nodeMergejoin.c +++ b/src/backend/executor/nodeMergejoin.c @@ -714,731 +714,737 @@ ExecMergeJoin(PlanState *pstate) #ifdef __TBASE__ node->mj_InnerInited = true; #endif - innerTupleSlot = ExecProcNode(innerPlan); - node->mj_InnerTupleSlot = innerTupleSlot; - - /* Compute join values and check for unmatchability */ - switch (MJEvalInnerValues(node, innerTupleSlot)) - { - case MJEVAL_MATCHABLE: - - /* - * OK, we have the initial tuples. Begin by skipping - * non-matching tuples. - */ - node->mj_JoinState = EXEC_MJ_SKIP_TEST; - break; - case MJEVAL_NONMATCHABLE: - /* Mark before advancing, if wanted */ - if (node->mj_ExtraMarks) - ExecMarkPos(innerPlan); - /* Stay in same state to fetch next inner tuple */ - if (doFillInner) - { - /* - * Generate a fake join tuple with nulls for the - * outer tuple, and return it if it passes the - * non-join quals. - */ - TupleTableSlot *result; - - result = MJFillInner(node); - if (result) - return result; - } - break; - case MJEVAL_ENDOFJOIN: - /* No more inner tuples */ - MJ_printf("ExecMergeJoin: nothing in inner subplan\n"); - if (doFillOuter) - { - /* - * Need to emit left-join tuples for all outer - * tuples, including the one we just fetched. We - * set MatchedOuter = false to force the ENDINNER - * state to emit first tuple before advancing - * outer. - */ - node->mj_JoinState = EXEC_MJ_ENDINNER; - node->mj_MatchedOuter = false; - break; - } - /* Otherwise we're done. 
*/ - return NULL; - } - break; - - /* - * EXEC_MJ_JOINTUPLES means we have two tuples which satisfied - * the merge clause so we join them and then proceed to get - * the next inner tuple (EXEC_MJ_NEXTINNER). - */ - case EXEC_MJ_JOINTUPLES: - MJ_printf("ExecMergeJoin: EXEC_MJ_JOINTUPLES\n"); - - /* - * Set the next state machine state. The right things will - * happen whether we return this join tuple or just fall - * through to continue the state machine execution. - */ - node->mj_JoinState = EXEC_MJ_NEXTINNER; - - /* - * Check the extra qual conditions to see if we actually want - * to return this join tuple. If not, can proceed with merge. - * We must distinguish the additional joinquals (which must - * pass to consider the tuples "matched" for outer-join logic) - * from the otherquals (which must pass before we actually - * return the tuple). - * - * We don't bother with a ResetExprContext here, on the - * assumption that we just did one while checking the merge - * qual. One per tuple should be sufficient. We do have to - * set up the econtext links to the tuples for ExecQual to - * use. - */ - outerTupleSlot = node->mj_OuterTupleSlot; - econtext->ecxt_outertuple = outerTupleSlot; - innerTupleSlot = node->mj_InnerTupleSlot; - econtext->ecxt_innertuple = innerTupleSlot; - - qualResult = (joinqual == NULL || - ExecQual(joinqual, econtext)); - MJ_DEBUG_QUAL(joinqual, qualResult); - - if (qualResult) - { - node->mj_MatchedOuter = true; - node->mj_MatchedInner = true; - - /* In an antijoin, we never return a matched tuple */ - if (node->js.jointype == JOIN_ANTI) - { - node->mj_JoinState = EXEC_MJ_NEXTOUTER; - break; - } - - /* - * If we only need to join to the first matching inner - * tuple, then consider returning this one, but after that - * continue with next outer tuple. - */ - if (node->js.single_match) - node->mj_JoinState = EXEC_MJ_NEXTOUTER; - - qualResult = (otherqual == NULL || - ExecQual(otherqual, econtext)); - MJ_DEBUG_QUAL(otherqual, qualResult); - - if (qualResult) - { - /* - * qualification succeeded. now form the desired - * projection tuple and return the slot containing it. - */ - MJ_printf("ExecMergeJoin: returning tuple\n"); - - return ExecProject(node->js.ps.ps_ProjInfo); - } - else - InstrCountFiltered2(node, 1); - } - else - InstrCountFiltered1(node, 1); - break; - - /* - * EXEC_MJ_NEXTINNER means advance the inner scan to the next - * tuple. If the tuple is not nil, we then proceed to test it - * against the join qualification. - * - * Before advancing, we check to see if we must emit an - * outer-join fill tuple for this inner tuple. - */ - case EXEC_MJ_NEXTINNER: - MJ_printf("ExecMergeJoin: EXEC_MJ_NEXTINNER\n"); - - if (doFillInner && !node->mj_MatchedInner) - { - /* - * Generate a fake join tuple with nulls for the outer - * tuple, and return it if it passes the non-join quals. - */ - TupleTableSlot *result; - - node->mj_MatchedInner = true; /* do it only once */ - - result = MJFillInner(node); - if (result) - return result; - } - - /* - * now we get the next inner tuple, if any. If there's none, - * advance to next outer tuple (which may be able to join to - * previously marked tuples). - * - * NB: must NOT do "extraMarks" here, since we may need to - * return to previously marked tuples. 
- */ - innerTupleSlot = ExecProcNode(innerPlan); - node->mj_InnerTupleSlot = innerTupleSlot; - MJ_DEBUG_PROC_NODE(innerTupleSlot); - node->mj_MatchedInner = false; - - /* Compute join values and check for unmatchability */ - switch (MJEvalInnerValues(node, innerTupleSlot)) - { - case MJEVAL_MATCHABLE: - - /* - * Test the new inner tuple to see if it matches - * outer. - * - * If they do match, then we join them and move on to - * the next inner tuple (EXEC_MJ_JOINTUPLES). - * - * If they do not match then advance to next outer - * tuple. - */ - compareResult = MJCompare(node); - MJ_DEBUG_COMPARE(compareResult); - - if (compareResult == 0) - node->mj_JoinState = EXEC_MJ_JOINTUPLES; - else - { - Assert(compareResult < 0); - node->mj_JoinState = EXEC_MJ_NEXTOUTER; - } - break; - case MJEVAL_NONMATCHABLE: - - /* - * It contains a NULL and hence can't match any outer - * tuple, so we can skip the comparison and assume the - * new tuple is greater than current outer. - */ - node->mj_JoinState = EXEC_MJ_NEXTOUTER; - break; - case MJEVAL_ENDOFJOIN: - - /* - * No more inner tuples. However, this might be only - * effective and not physical end of inner plan, so - * force mj_InnerTupleSlot to null to make sure we - * don't fetch more inner tuples. (We need this hack - * because we are not transiting to a state where the - * inner plan is assumed to be exhausted.) - */ - node->mj_InnerTupleSlot = NULL; - node->mj_JoinState = EXEC_MJ_NEXTOUTER; - break; - } - break; - - /*------------------------------------------- - * EXEC_MJ_NEXTOUTER means - * - * outer inner - * outer tuple - 5 5 - marked tuple - * 5 5 - * 6 6 - inner tuple - * 7 7 - * - * we know we just bumped into the - * first inner tuple > current outer tuple (or possibly - * the end of the inner stream) - * so get a new outer tuple and then - * proceed to test it against the marked tuple - * (EXEC_MJ_TESTOUTER) - * - * Before advancing, we check to see if we must emit an - * outer-join fill tuple for this outer tuple. - *------------------------------------------------ - */ - case EXEC_MJ_NEXTOUTER: - MJ_printf("ExecMergeJoin: EXEC_MJ_NEXTOUTER\n"); - - if (doFillOuter && !node->mj_MatchedOuter) - { - /* - * Generate a fake join tuple with nulls for the inner - * tuple, and return it if it passes the non-join quals. - */ - TupleTableSlot *result; - - node->mj_MatchedOuter = true; /* do it only once */ - - result = MJFillOuter(node); - if (result) - return result; - } - - /* - * now we get the next outer tuple, if any - */ - outerTupleSlot = ExecProcNode(outerPlan); - node->mj_OuterTupleSlot = outerTupleSlot; - MJ_DEBUG_PROC_NODE(outerTupleSlot); - node->mj_MatchedOuter = false; - - /* Compute join values and check for unmatchability */ - switch (MJEvalOuterValues(node)) - { - case MJEVAL_MATCHABLE: - /* Go test the new tuple against the marked tuple */ - node->mj_JoinState = EXEC_MJ_TESTOUTER; - break; - case MJEVAL_NONMATCHABLE: - /* Can't match, so fetch next outer tuple */ - node->mj_JoinState = EXEC_MJ_NEXTOUTER; - break; - case MJEVAL_ENDOFJOIN: - /* No more outer tuples */ - MJ_printf("ExecMergeJoin: end of outer subplan\n"); - innerTupleSlot = node->mj_InnerTupleSlot; - if (doFillInner && !TupIsNull(innerTupleSlot)) - { - /* - * Need to emit right-join tuples for remaining - * inner tuples. - */ - node->mj_JoinState = EXEC_MJ_ENDOUTER; - break; - } - /* Otherwise we're done. 
*/ - return NULL; - } - break; - - /*-------------------------------------------------------- - * EXEC_MJ_TESTOUTER If the new outer tuple and the marked - * tuple satisfy the merge clause then we know we have - * duplicates in the outer scan so we have to restore the - * inner scan to the marked tuple and proceed to join the - * new outer tuple with the inner tuples. - * - * This is the case when - * outer inner - * 4 5 - marked tuple - * outer tuple - 5 5 - * new outer tuple - 5 5 - * 6 8 - inner tuple - * 7 12 - * - * new outer tuple == marked tuple - * - * If the outer tuple fails the test, then we are done - * with the marked tuples, and we have to look for a - * match to the current inner tuple. So we will - * proceed to skip outer tuples until outer >= inner - * (EXEC_MJ_SKIP_TEST). - * - * This is the case when - * - * outer inner - * 5 5 - marked tuple - * outer tuple - 5 5 - * new outer tuple - 6 8 - inner tuple - * 7 12 - * - * new outer tuple > marked tuple - * - *--------------------------------------------------------- - */ - case EXEC_MJ_TESTOUTER: - MJ_printf("ExecMergeJoin: EXEC_MJ_TESTOUTER\n"); - - /* - * Here we must compare the outer tuple with the marked inner - * tuple. (We can ignore the result of MJEvalInnerValues, - * since the marked inner tuple is certainly matchable.) - */ - innerTupleSlot = node->mj_MarkedTupleSlot; - (void) MJEvalInnerValues(node, innerTupleSlot); - - compareResult = MJCompare(node); - MJ_DEBUG_COMPARE(compareResult); - - if (compareResult == 0) - { - /* - * the merge clause matched so now we restore the inner - * scan position to the first mark, and go join that tuple - * (and any following ones) to the new outer. - * - * If we were able to determine mark and restore are not - * needed, then we don't have to back up; the current - * inner is already the first possible match. - * - * NOTE: we do not need to worry about the MatchedInner - * state for the rescanned inner tuples. We know all of - * them will match this new outer tuple and therefore - * won't be emitted as fill tuples. This works *only* - * because we require the extra joinquals to be constant - * when doing a right or full join --- otherwise some of - * the rescanned tuples might fail the extra joinquals. - * This obviously won't happen for a constant-true extra - * joinqual, while the constant-false case is handled by - * forcing the merge clause to never match, so we never - * get here. - */ - if (!node->mj_SkipMarkRestore) - { - ExecRestrPos(innerPlan); - - /* - * ExecRestrPos probably should give us back a new - * Slot, but since it doesn't, use the marked slot. - * (The previously returned mj_InnerTupleSlot cannot - * be assumed to hold the required tuple.) - */ - node->mj_InnerTupleSlot = innerTupleSlot; - /* we need not do MJEvalInnerValues again */ - } - - node->mj_JoinState = EXEC_MJ_JOINTUPLES; - } - else - { - /* ---------------- - * if the new outer tuple didn't match the marked inner - * tuple then we have a case like: - * - * outer inner - * 4 4 - marked tuple - * new outer - 5 4 - * 6 5 - inner tuple - * 7 - * - * which means that all subsequent outer tuples will be - * larger than our marked inner tuples. So we need not - * revisit any of the marked tuples but can proceed to - * look for a match to the current inner. If there's - * no more inners, no more matches are possible. 
- * ---------------- - */ - Assert(compareResult > 0); - innerTupleSlot = node->mj_InnerTupleSlot; - - /* reload comparison data for current inner */ - switch (MJEvalInnerValues(node, innerTupleSlot)) - { - case MJEVAL_MATCHABLE: - /* proceed to compare it to the current outer */ - node->mj_JoinState = EXEC_MJ_SKIP_TEST; - break; - case MJEVAL_NONMATCHABLE: - - /* - * current inner can't possibly match any outer; - * better to advance the inner scan than the - * outer. - */ - node->mj_JoinState = EXEC_MJ_SKIPINNER_ADVANCE; - break; - case MJEVAL_ENDOFJOIN: - /* No more inner tuples */ - if (doFillOuter) - { - /* - * Need to emit left-join tuples for remaining - * outer tuples. - */ - node->mj_JoinState = EXEC_MJ_ENDINNER; - break; - } - /* Otherwise we're done. */ - return NULL; - } - } - break; - - /*---------------------------------------------------------- - * EXEC_MJ_SKIP means compare tuples and if they do not - * match, skip whichever is lesser. - * - * For example: - * - * outer inner - * 5 5 - * 5 5 - * outer tuple - 6 8 - inner tuple - * 7 12 - * 8 14 - * - * we have to advance the outer scan - * until we find the outer 8. - * - * On the other hand: - * - * outer inner - * 5 5 - * 5 5 - * outer tuple - 12 8 - inner tuple - * 14 10 - * 17 12 - * - * we have to advance the inner scan - * until we find the inner 12. - *---------------------------------------------------------- - */ - case EXEC_MJ_SKIP_TEST: - MJ_printf("ExecMergeJoin: EXEC_MJ_SKIP_TEST\n"); - - /* - * before we advance, make sure the current tuples do not - * satisfy the mergeclauses. If they do, then we update the - * marked tuple position and go join them. - */ - compareResult = MJCompare(node); - MJ_DEBUG_COMPARE(compareResult); - - if (compareResult == 0) - { - if (!node->mj_SkipMarkRestore) - ExecMarkPos(innerPlan); - - MarkInnerTuple(node->mj_InnerTupleSlot, node); - - node->mj_JoinState = EXEC_MJ_JOINTUPLES; - } - else if (compareResult < 0) - node->mj_JoinState = EXEC_MJ_SKIPOUTER_ADVANCE; - else - /* compareResult > 0 */ - node->mj_JoinState = EXEC_MJ_SKIPINNER_ADVANCE; - break; - - /* - * SKIPOUTER_ADVANCE: advance over an outer tuple that is - * known not to join to any inner tuple. - * - * Before advancing, we check to see if we must emit an - * outer-join fill tuple for this outer tuple. - */ - case EXEC_MJ_SKIPOUTER_ADVANCE: - MJ_printf("ExecMergeJoin: EXEC_MJ_SKIPOUTER_ADVANCE\n"); - - if (doFillOuter && !node->mj_MatchedOuter) - { - /* - * Generate a fake join tuple with nulls for the inner - * tuple, and return it if it passes the non-join quals. 
- */ - TupleTableSlot *result; - - node->mj_MatchedOuter = true; /* do it only once */ - - result = MJFillOuter(node); - if (result) - return result; - } - - /* - * now we get the next outer tuple, if any - */ - outerTupleSlot = ExecProcNode(outerPlan); - node->mj_OuterTupleSlot = outerTupleSlot; - MJ_DEBUG_PROC_NODE(outerTupleSlot); - node->mj_MatchedOuter = false; - - /* Compute join values and check for unmatchability */ - switch (MJEvalOuterValues(node)) - { - case MJEVAL_MATCHABLE: - /* Go test the new tuple against the current inner */ - node->mj_JoinState = EXEC_MJ_SKIP_TEST; - break; - case MJEVAL_NONMATCHABLE: - /* Can't match, so fetch next outer tuple */ - node->mj_JoinState = EXEC_MJ_SKIPOUTER_ADVANCE; - break; - case MJEVAL_ENDOFJOIN: - /* No more outer tuples */ - MJ_printf("ExecMergeJoin: end of outer subplan\n"); - innerTupleSlot = node->mj_InnerTupleSlot; - if (doFillInner && !TupIsNull(innerTupleSlot)) - { - /* - * Need to emit right-join tuples for remaining - * inner tuples. - */ - node->mj_JoinState = EXEC_MJ_ENDOUTER; - break; - } - /* Otherwise we're done. */ - return NULL; - } - break; - - /* - * SKIPINNER_ADVANCE: advance over an inner tuple that is - * known not to join to any outer tuple. - * - * Before advancing, we check to see if we must emit an - * outer-join fill tuple for this inner tuple. - */ - case EXEC_MJ_SKIPINNER_ADVANCE: - MJ_printf("ExecMergeJoin: EXEC_MJ_SKIPINNER_ADVANCE\n"); - - if (doFillInner && !node->mj_MatchedInner) - { - /* - * Generate a fake join tuple with nulls for the outer - * tuple, and return it if it passes the non-join quals. - */ - TupleTableSlot *result; - - node->mj_MatchedInner = true; /* do it only once */ - - result = MJFillInner(node); - if (result) - return result; - } - - /* Mark before advancing, if wanted */ - if (node->mj_ExtraMarks) - ExecMarkPos(innerPlan); - - /* - * now we get the next inner tuple, if any - */ - innerTupleSlot = ExecProcNode(innerPlan); - node->mj_InnerTupleSlot = innerTupleSlot; - MJ_DEBUG_PROC_NODE(innerTupleSlot); - node->mj_MatchedInner = false; - - /* Compute join values and check for unmatchability */ - switch (MJEvalInnerValues(node, innerTupleSlot)) - { - case MJEVAL_MATCHABLE: - /* proceed to compare it to the current outer */ - node->mj_JoinState = EXEC_MJ_SKIP_TEST; - break; - case MJEVAL_NONMATCHABLE: - - /* - * current inner can't possibly match any outer; - * better to advance the inner scan than the outer. - */ - node->mj_JoinState = EXEC_MJ_SKIPINNER_ADVANCE; - break; - case MJEVAL_ENDOFJOIN: - /* No more inner tuples */ - MJ_printf("ExecMergeJoin: end of inner subplan\n"); - outerTupleSlot = node->mj_OuterTupleSlot; - if (doFillOuter && !TupIsNull(outerTupleSlot)) - { - /* - * Need to emit left-join tuples for remaining - * outer tuples. - */ - node->mj_JoinState = EXEC_MJ_ENDINNER; - break; - } - /* Otherwise we're done. */ - return NULL; - } - break; - - /* - * EXEC_MJ_ENDOUTER means we have run out of outer tuples, but - * are doing a right/full join and therefore must null-fill - * any remaining unmatched inner tuples. - */ - case EXEC_MJ_ENDOUTER: - MJ_printf("ExecMergeJoin: EXEC_MJ_ENDOUTER\n"); - - Assert(doFillInner); - - if (!node->mj_MatchedInner) - { - /* - * Generate a fake join tuple with nulls for the outer - * tuple, and return it if it passes the non-join quals. 
- */ - TupleTableSlot *result; - - node->mj_MatchedInner = true; /* do it only once */ - - result = MJFillInner(node); - if (result) - return result; - } - - /* Mark before advancing, if wanted */ - if (node->mj_ExtraMarks) - ExecMarkPos(innerPlan); - - /* - * now we get the next inner tuple, if any - */ - innerTupleSlot = ExecProcNode(innerPlan); - node->mj_InnerTupleSlot = innerTupleSlot; - MJ_DEBUG_PROC_NODE(innerTupleSlot); - node->mj_MatchedInner = false; - - if (TupIsNull(innerTupleSlot)) - { - MJ_printf("ExecMergeJoin: end of inner subplan\n"); - return NULL; - } - - /* Else remain in ENDOUTER state and process next tuple. */ - break; - - /* - * EXEC_MJ_ENDINNER means we have run out of inner tuples, but - * are doing a left/full join and therefore must null- fill - * any remaining unmatched outer tuples. - */ - case EXEC_MJ_ENDINNER: - MJ_printf("ExecMergeJoin: EXEC_MJ_ENDINNER\n"); - - Assert(doFillOuter); - - if (!node->mj_MatchedOuter) - { - /* - * Generate a fake join tuple with nulls for the inner - * tuple, and return it if it passes the non-join quals. - */ - TupleTableSlot *result; - - node->mj_MatchedOuter = true; /* do it only once */ - - result = MJFillOuter(node); - if (result) - return result; - } - - /* - * now we get the next outer tuple, if any - */ - outerTupleSlot = ExecProcNode(outerPlan); - node->mj_OuterTupleSlot = outerTupleSlot; - MJ_DEBUG_PROC_NODE(outerTupleSlot); - node->mj_MatchedOuter = false; - - if (TupIsNull(outerTupleSlot)) - { - MJ_printf("ExecMergeJoin: end of outer subplan\n"); - return NULL; - } - - /* Else remain in ENDINNER state and process next tuple. */ - break; - - /* - * broken state value? - */ - default: - elog(ERROR, "unrecognized mergejoin state: %d", - (int) node->mj_JoinState); - } - } + innerTupleSlot = ExecProcNode(innerPlan); + node->mj_InnerTupleSlot = innerTupleSlot; + + /* Compute join values and check for unmatchability */ + switch (MJEvalInnerValues(node, innerTupleSlot)) + { + case MJEVAL_MATCHABLE: + + /* + * OK, we have the initial tuples. Begin by skipping + * non-matching tuples. + */ + node->mj_JoinState = EXEC_MJ_SKIP_TEST; + break; + case MJEVAL_NONMATCHABLE: + /* Mark before advancing, if wanted */ + if (node->mj_ExtraMarks) + ExecMarkPos(innerPlan); + /* Stay in same state to fetch next inner tuple */ + if (doFillInner) + { + /* + * Generate a fake join tuple with nulls for the + * outer tuple, and return it if it passes the + * non-join quals. + */ + TupleTableSlot *result; + + result = MJFillInner(node); + if (result) + return result; + } + break; + case MJEVAL_ENDOFJOIN: + /* No more inner tuples */ + MJ_printf("ExecMergeJoin: nothing in inner subplan\n"); + if (doFillOuter) + { + /* + * Need to emit left-join tuples for all outer + * tuples, including the one we just fetched. We + * set MatchedOuter = false to force the ENDINNER + * state to emit first tuple before advancing + * outer. + */ + node->mj_JoinState = EXEC_MJ_ENDINNER; + node->mj_MatchedOuter = false; + break; + } + /* Otherwise we're done. */ + return NULL; + } + break; + + /* + * EXEC_MJ_JOINTUPLES means we have two tuples which satisfied + * the merge clause so we join them and then proceed to get + * the next inner tuple (EXEC_MJ_NEXTINNER). + */ + case EXEC_MJ_JOINTUPLES: + MJ_printf("ExecMergeJoin: EXEC_MJ_JOINTUPLES\n"); + + /* + * Set the next state machine state. The right things will + * happen whether we return this join tuple or just fall + * through to continue the state machine execution. 
+ */ + node->mj_JoinState = EXEC_MJ_NEXTINNER; + + /* + * Check the extra qual conditions to see if we actually want + * to return this join tuple. If not, can proceed with merge. + * We must distinguish the additional joinquals (which must + * pass to consider the tuples "matched" for outer-join logic) + * from the otherquals (which must pass before we actually + * return the tuple). + * + * We don't bother with a ResetExprContext here, on the + * assumption that we just did one while checking the merge + * qual. One per tuple should be sufficient. We do have to + * set up the econtext links to the tuples for ExecQual to + * use. + */ + outerTupleSlot = node->mj_OuterTupleSlot; + econtext->ecxt_outertuple = outerTupleSlot; + innerTupleSlot = node->mj_InnerTupleSlot; + econtext->ecxt_innertuple = innerTupleSlot; + + qualResult = (joinqual == NULL || + ExecQual(joinqual, econtext)); + MJ_DEBUG_QUAL(joinqual, qualResult); + + if (qualResult) + { +#ifdef __TBASE__ + if (node->js.jointype == JOIN_LEFT_SCALAR && node->mj_MatchedOuter) + ereport(ERROR, + (errcode(ERRCODE_CARDINALITY_VIOLATION), + errmsg("more than one row returned by a subquery used as an expression"))); +#endif + node->mj_MatchedOuter = true; + node->mj_MatchedInner = true; + + /* In an antijoin, we never return a matched tuple */ + if (node->js.jointype == JOIN_ANTI) + { + node->mj_JoinState = EXEC_MJ_NEXTOUTER; + break; + } + + /* + * If we only need to join to the first matching inner + * tuple, then consider returning this one, but after that + * continue with next outer tuple. + */ + if (node->js.single_match) + node->mj_JoinState = EXEC_MJ_NEXTOUTER; + + qualResult = (otherqual == NULL || + ExecQual(otherqual, econtext)); + MJ_DEBUG_QUAL(otherqual, qualResult); + + if (qualResult) + { + /* + * qualification succeeded. now form the desired + * projection tuple and return the slot containing it. + */ + MJ_printf("ExecMergeJoin: returning tuple\n"); + + return ExecProject(node->js.ps.ps_ProjInfo); + } + else + InstrCountFiltered2(node, 1); + } + else + InstrCountFiltered1(node, 1); + break; + + /* + * EXEC_MJ_NEXTINNER means advance the inner scan to the next + * tuple. If the tuple is not nil, we then proceed to test it + * against the join qualification. + * + * Before advancing, we check to see if we must emit an + * outer-join fill tuple for this inner tuple. + */ + case EXEC_MJ_NEXTINNER: + MJ_printf("ExecMergeJoin: EXEC_MJ_NEXTINNER\n"); + + if (doFillInner && !node->mj_MatchedInner) + { + /* + * Generate a fake join tuple with nulls for the outer + * tuple, and return it if it passes the non-join quals. + */ + TupleTableSlot *result; + + node->mj_MatchedInner = true; /* do it only once */ + + result = MJFillInner(node); + if (result) + return result; + } + + /* + * now we get the next inner tuple, if any. If there's none, + * advance to next outer tuple (which may be able to join to + * previously marked tuples). + * + * NB: must NOT do "extraMarks" here, since we may need to + * return to previously marked tuples. + */ + innerTupleSlot = ExecProcNode(innerPlan); + node->mj_InnerTupleSlot = innerTupleSlot; + MJ_DEBUG_PROC_NODE(innerTupleSlot); + node->mj_MatchedInner = false; + + /* Compute join values and check for unmatchability */ + switch (MJEvalInnerValues(node, innerTupleSlot)) + { + case MJEVAL_MATCHABLE: + + /* + * Test the new inner tuple to see if it matches + * outer. + * + * If they do match, then we join them and move on to + * the next inner tuple (EXEC_MJ_JOINTUPLES). 
+ * + * If they do not match then advance to next outer + * tuple. + */ + compareResult = MJCompare(node); + MJ_DEBUG_COMPARE(compareResult); + + if (compareResult == 0) + node->mj_JoinState = EXEC_MJ_JOINTUPLES; + else + { + Assert(compareResult < 0); + node->mj_JoinState = EXEC_MJ_NEXTOUTER; + } + break; + case MJEVAL_NONMATCHABLE: + + /* + * It contains a NULL and hence can't match any outer + * tuple, so we can skip the comparison and assume the + * new tuple is greater than current outer. + */ + node->mj_JoinState = EXEC_MJ_NEXTOUTER; + break; + case MJEVAL_ENDOFJOIN: + + /* + * No more inner tuples. However, this might be only + * effective and not physical end of inner plan, so + * force mj_InnerTupleSlot to null to make sure we + * don't fetch more inner tuples. (We need this hack + * because we are not transiting to a state where the + * inner plan is assumed to be exhausted.) + */ + node->mj_InnerTupleSlot = NULL; + node->mj_JoinState = EXEC_MJ_NEXTOUTER; + break; + } + break; + + /*------------------------------------------- + * EXEC_MJ_NEXTOUTER means + * + * outer inner + * outer tuple - 5 5 - marked tuple + * 5 5 + * 6 6 - inner tuple + * 7 7 + * + * we know we just bumped into the + * first inner tuple > current outer tuple (or possibly + * the end of the inner stream) + * so get a new outer tuple and then + * proceed to test it against the marked tuple + * (EXEC_MJ_TESTOUTER) + * + * Before advancing, we check to see if we must emit an + * outer-join fill tuple for this outer tuple. + *------------------------------------------------ + */ + case EXEC_MJ_NEXTOUTER: + MJ_printf("ExecMergeJoin: EXEC_MJ_NEXTOUTER\n"); + + if (doFillOuter && !node->mj_MatchedOuter) + { + /* + * Generate a fake join tuple with nulls for the inner + * tuple, and return it if it passes the non-join quals. + */ + TupleTableSlot *result; + + node->mj_MatchedOuter = true; /* do it only once */ + + result = MJFillOuter(node); + if (result) + return result; + } + + /* + * now we get the next outer tuple, if any + */ + outerTupleSlot = ExecProcNode(outerPlan); + node->mj_OuterTupleSlot = outerTupleSlot; + MJ_DEBUG_PROC_NODE(outerTupleSlot); + node->mj_MatchedOuter = false; + + /* Compute join values and check for unmatchability */ + switch (MJEvalOuterValues(node)) + { + case MJEVAL_MATCHABLE: + /* Go test the new tuple against the marked tuple */ + node->mj_JoinState = EXEC_MJ_TESTOUTER; + break; + case MJEVAL_NONMATCHABLE: + /* Can't match, so fetch next outer tuple */ + node->mj_JoinState = EXEC_MJ_NEXTOUTER; + break; + case MJEVAL_ENDOFJOIN: + /* No more outer tuples */ + MJ_printf("ExecMergeJoin: end of outer subplan\n"); + innerTupleSlot = node->mj_InnerTupleSlot; + if (doFillInner && !TupIsNull(innerTupleSlot)) + { + /* + * Need to emit right-join tuples for remaining + * inner tuples. + */ + node->mj_JoinState = EXEC_MJ_ENDOUTER; + break; + } + /* Otherwise we're done. */ + return NULL; + } + break; + + /*-------------------------------------------------------- + * EXEC_MJ_TESTOUTER If the new outer tuple and the marked + * tuple satisfy the merge clause then we know we have + * duplicates in the outer scan so we have to restore the + * inner scan to the marked tuple and proceed to join the + * new outer tuple with the inner tuples. 
+ * + * This is the case when + * outer inner + * 4 5 - marked tuple + * outer tuple - 5 5 + * new outer tuple - 5 5 + * 6 8 - inner tuple + * 7 12 + * + * new outer tuple == marked tuple + * + * If the outer tuple fails the test, then we are done + * with the marked tuples, and we have to look for a + * match to the current inner tuple. So we will + * proceed to skip outer tuples until outer >= inner + * (EXEC_MJ_SKIP_TEST). + * + * This is the case when + * + * outer inner + * 5 5 - marked tuple + * outer tuple - 5 5 + * new outer tuple - 6 8 - inner tuple + * 7 12 + * + * new outer tuple > marked tuple + * + *--------------------------------------------------------- + */ + case EXEC_MJ_TESTOUTER: + MJ_printf("ExecMergeJoin: EXEC_MJ_TESTOUTER\n"); + + /* + * Here we must compare the outer tuple with the marked inner + * tuple. (We can ignore the result of MJEvalInnerValues, + * since the marked inner tuple is certainly matchable.) + */ + innerTupleSlot = node->mj_MarkedTupleSlot; + (void) MJEvalInnerValues(node, innerTupleSlot); + + compareResult = MJCompare(node); + MJ_DEBUG_COMPARE(compareResult); + + if (compareResult == 0) + { + /* + * the merge clause matched so now we restore the inner + * scan position to the first mark, and go join that tuple + * (and any following ones) to the new outer. + * + * If we were able to determine mark and restore are not + * needed, then we don't have to back up; the current + * inner is already the first possible match. + * + * NOTE: we do not need to worry about the MatchedInner + * state for the rescanned inner tuples. We know all of + * them will match this new outer tuple and therefore + * won't be emitted as fill tuples. This works *only* + * because we require the extra joinquals to be constant + * when doing a right or full join --- otherwise some of + * the rescanned tuples might fail the extra joinquals. + * This obviously won't happen for a constant-true extra + * joinqual, while the constant-false case is handled by + * forcing the merge clause to never match, so we never + * get here. + */ + if (!node->mj_SkipMarkRestore) + { + ExecRestrPos(innerPlan); + + /* + * ExecRestrPos probably should give us back a new + * Slot, but since it doesn't, use the marked slot. + * (The previously returned mj_InnerTupleSlot cannot + * be assumed to hold the required tuple.) + */ + node->mj_InnerTupleSlot = innerTupleSlot; + /* we need not do MJEvalInnerValues again */ + } + + node->mj_JoinState = EXEC_MJ_JOINTUPLES; + } + else + { + /* ---------------- + * if the new outer tuple didn't match the marked inner + * tuple then we have a case like: + * + * outer inner + * 4 4 - marked tuple + * new outer - 5 4 + * 6 5 - inner tuple + * 7 + * + * which means that all subsequent outer tuples will be + * larger than our marked inner tuples. So we need not + * revisit any of the marked tuples but can proceed to + * look for a match to the current inner. If there's + * no more inners, no more matches are possible. + * ---------------- + */ + Assert(compareResult > 0); + innerTupleSlot = node->mj_InnerTupleSlot; + + /* reload comparison data for current inner */ + switch (MJEvalInnerValues(node, innerTupleSlot)) + { + case MJEVAL_MATCHABLE: + /* proceed to compare it to the current outer */ + node->mj_JoinState = EXEC_MJ_SKIP_TEST; + break; + case MJEVAL_NONMATCHABLE: + + /* + * current inner can't possibly match any outer; + * better to advance the inner scan than the + * outer. 
+ */ + node->mj_JoinState = EXEC_MJ_SKIPINNER_ADVANCE; + break; + case MJEVAL_ENDOFJOIN: + /* No more inner tuples */ + if (doFillOuter) + { + /* + * Need to emit left-join tuples for remaining + * outer tuples. + */ + node->mj_JoinState = EXEC_MJ_ENDINNER; + break; + } + /* Otherwise we're done. */ + return NULL; + } + } + break; + + /*---------------------------------------------------------- + * EXEC_MJ_SKIP means compare tuples and if they do not + * match, skip whichever is lesser. + * + * For example: + * + * outer inner + * 5 5 + * 5 5 + * outer tuple - 6 8 - inner tuple + * 7 12 + * 8 14 + * + * we have to advance the outer scan + * until we find the outer 8. + * + * On the other hand: + * + * outer inner + * 5 5 + * 5 5 + * outer tuple - 12 8 - inner tuple + * 14 10 + * 17 12 + * + * we have to advance the inner scan + * until we find the inner 12. + *---------------------------------------------------------- + */ + case EXEC_MJ_SKIP_TEST: + MJ_printf("ExecMergeJoin: EXEC_MJ_SKIP_TEST\n"); + + /* + * before we advance, make sure the current tuples do not + * satisfy the mergeclauses. If they do, then we update the + * marked tuple position and go join them. + */ + compareResult = MJCompare(node); + MJ_DEBUG_COMPARE(compareResult); + + if (compareResult == 0) + { + if (!node->mj_SkipMarkRestore) + ExecMarkPos(innerPlan); + + MarkInnerTuple(node->mj_InnerTupleSlot, node); + + node->mj_JoinState = EXEC_MJ_JOINTUPLES; + } + else if (compareResult < 0) + node->mj_JoinState = EXEC_MJ_SKIPOUTER_ADVANCE; + else + /* compareResult > 0 */ + node->mj_JoinState = EXEC_MJ_SKIPINNER_ADVANCE; + break; + + /* + * SKIPOUTER_ADVANCE: advance over an outer tuple that is + * known not to join to any inner tuple. + * + * Before advancing, we check to see if we must emit an + * outer-join fill tuple for this outer tuple. + */ + case EXEC_MJ_SKIPOUTER_ADVANCE: + MJ_printf("ExecMergeJoin: EXEC_MJ_SKIPOUTER_ADVANCE\n"); + + if (doFillOuter && !node->mj_MatchedOuter) + { + /* + * Generate a fake join tuple with nulls for the inner + * tuple, and return it if it passes the non-join quals. + */ + TupleTableSlot *result; + + node->mj_MatchedOuter = true; /* do it only once */ + + result = MJFillOuter(node); + if (result) + return result; + } + + /* + * now we get the next outer tuple, if any + */ + outerTupleSlot = ExecProcNode(outerPlan); + node->mj_OuterTupleSlot = outerTupleSlot; + MJ_DEBUG_PROC_NODE(outerTupleSlot); + node->mj_MatchedOuter = false; + + /* Compute join values and check for unmatchability */ + switch (MJEvalOuterValues(node)) + { + case MJEVAL_MATCHABLE: + /* Go test the new tuple against the current inner */ + node->mj_JoinState = EXEC_MJ_SKIP_TEST; + break; + case MJEVAL_NONMATCHABLE: + /* Can't match, so fetch next outer tuple */ + node->mj_JoinState = EXEC_MJ_SKIPOUTER_ADVANCE; + break; + case MJEVAL_ENDOFJOIN: + /* No more outer tuples */ + MJ_printf("ExecMergeJoin: end of outer subplan\n"); + innerTupleSlot = node->mj_InnerTupleSlot; + if (doFillInner && !TupIsNull(innerTupleSlot)) + { + /* + * Need to emit right-join tuples for remaining + * inner tuples. + */ + node->mj_JoinState = EXEC_MJ_ENDOUTER; + break; + } + /* Otherwise we're done. */ + return NULL; + } + break; + + /* + * SKIPINNER_ADVANCE: advance over an inner tuple that is + * known not to join to any outer tuple. + * + * Before advancing, we check to see if we must emit an + * outer-join fill tuple for this inner tuple. 
+ */ + case EXEC_MJ_SKIPINNER_ADVANCE: + MJ_printf("ExecMergeJoin: EXEC_MJ_SKIPINNER_ADVANCE\n"); + + if (doFillInner && !node->mj_MatchedInner) + { + /* + * Generate a fake join tuple with nulls for the outer + * tuple, and return it if it passes the non-join quals. + */ + TupleTableSlot *result; + + node->mj_MatchedInner = true; /* do it only once */ + + result = MJFillInner(node); + if (result) + return result; + } + + /* Mark before advancing, if wanted */ + if (node->mj_ExtraMarks) + ExecMarkPos(innerPlan); + + /* + * now we get the next inner tuple, if any + */ + innerTupleSlot = ExecProcNode(innerPlan); + node->mj_InnerTupleSlot = innerTupleSlot; + MJ_DEBUG_PROC_NODE(innerTupleSlot); + node->mj_MatchedInner = false; + + /* Compute join values and check for unmatchability */ + switch (MJEvalInnerValues(node, innerTupleSlot)) + { + case MJEVAL_MATCHABLE: + /* proceed to compare it to the current outer */ + node->mj_JoinState = EXEC_MJ_SKIP_TEST; + break; + case MJEVAL_NONMATCHABLE: + + /* + * current inner can't possibly match any outer; + * better to advance the inner scan than the outer. + */ + node->mj_JoinState = EXEC_MJ_SKIPINNER_ADVANCE; + break; + case MJEVAL_ENDOFJOIN: + /* No more inner tuples */ + MJ_printf("ExecMergeJoin: end of inner subplan\n"); + outerTupleSlot = node->mj_OuterTupleSlot; + if (doFillOuter && !TupIsNull(outerTupleSlot)) + { + /* + * Need to emit left-join tuples for remaining + * outer tuples. + */ + node->mj_JoinState = EXEC_MJ_ENDINNER; + break; + } + /* Otherwise we're done. */ + return NULL; + } + break; + + /* + * EXEC_MJ_ENDOUTER means we have run out of outer tuples, but + * are doing a right/full join and therefore must null-fill + * any remaining unmatched inner tuples. + */ + case EXEC_MJ_ENDOUTER: + MJ_printf("ExecMergeJoin: EXEC_MJ_ENDOUTER\n"); + + Assert(doFillInner); + + if (!node->mj_MatchedInner) + { + /* + * Generate a fake join tuple with nulls for the outer + * tuple, and return it if it passes the non-join quals. + */ + TupleTableSlot *result; + + node->mj_MatchedInner = true; /* do it only once */ + + result = MJFillInner(node); + if (result) + return result; + } + + /* Mark before advancing, if wanted */ + if (node->mj_ExtraMarks) + ExecMarkPos(innerPlan); + + /* + * now we get the next inner tuple, if any + */ + innerTupleSlot = ExecProcNode(innerPlan); + node->mj_InnerTupleSlot = innerTupleSlot; + MJ_DEBUG_PROC_NODE(innerTupleSlot); + node->mj_MatchedInner = false; + + if (TupIsNull(innerTupleSlot)) + { + MJ_printf("ExecMergeJoin: end of inner subplan\n"); + return NULL; + } + + /* Else remain in ENDOUTER state and process next tuple. */ + break; + + /* + * EXEC_MJ_ENDINNER means we have run out of inner tuples, but + * are doing a left/full join and therefore must null- fill + * any remaining unmatched outer tuples. + */ + case EXEC_MJ_ENDINNER: + MJ_printf("ExecMergeJoin: EXEC_MJ_ENDINNER\n"); + + Assert(doFillOuter); + + if (!node->mj_MatchedOuter) + { + /* + * Generate a fake join tuple with nulls for the inner + * tuple, and return it if it passes the non-join quals. 
+ */ + TupleTableSlot *result; + + node->mj_MatchedOuter = true; /* do it only once */ + + result = MJFillOuter(node); + if (result) + return result; + } + + /* + * now we get the next outer tuple, if any + */ + outerTupleSlot = ExecProcNode(outerPlan); + node->mj_OuterTupleSlot = outerTupleSlot; + MJ_DEBUG_PROC_NODE(outerTupleSlot); + node->mj_MatchedOuter = false; + + if (TupIsNull(outerTupleSlot)) + { + MJ_printf("ExecMergeJoin: end of outer subplan\n"); + return NULL; + } + + /* Else remain in ENDINNER state and process next tuple. */ + break; + + /* + * broken state value? + */ + default: + elog(ERROR, "unrecognized mergejoin state: %d", + (int) node->mj_JoinState); + } + } } /* ---------------------------------------------------------------- @@ -1447,180 +1453,189 @@ ExecMergeJoin(PlanState *pstate) */ MergeJoinState * ExecInitMergeJoin(MergeJoin *node, EState *estate, int eflags) -{// #lizard forgives - MergeJoinState *mergestate; - - /* check for unsupported flags */ - Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK))); - - MJ1_printf("ExecInitMergeJoin: %s\n", - "initializing node"); - - /* - * create state structure - */ - mergestate = makeNode(MergeJoinState); - mergestate->js.ps.plan = (Plan *) node; - mergestate->js.ps.state = estate; - mergestate->js.ps.ExecProcNode = ExecMergeJoin; - - /* - * Miscellaneous initialization - * - * create expression context for node - */ - ExecAssignExprContext(estate, &mergestate->js.ps); - - /* - * we need two additional econtexts in which we can compute the join - * expressions from the left and right input tuples. The node's regular - * econtext won't do because it gets reset too often. - */ - mergestate->mj_OuterEContext = CreateExprContext(estate); - mergestate->mj_InnerEContext = CreateExprContext(estate); - - /* - * initialize child expressions - */ - mergestate->js.ps.qual = - ExecInitQual(node->join.plan.qual, (PlanState *) mergestate); - mergestate->js.jointype = node->join.jointype; - mergestate->js.joinqual = - ExecInitQual(node->join.joinqual, (PlanState *) mergestate); - mergestate->mj_ConstFalseJoin = false; - /* mergeclauses are handled below */ - - /* - * initialize child nodes - * - * inner child must support MARK/RESTORE, unless we have detected that we - * don't need that. Note that skip_mark_restore must never be set if - * there are non-mergeclause joinquals, since the logic wouldn't work. - */ - Assert(node->join.joinqual == NIL || !node->skip_mark_restore); - mergestate->mj_SkipMarkRestore = node->skip_mark_restore; - - outerPlanState(mergestate) = ExecInitNode(outerPlan(node), estate, eflags); - innerPlanState(mergestate) = ExecInitNode(innerPlan(node), estate, - mergestate->mj_SkipMarkRestore ? - eflags : - (eflags | EXEC_FLAG_MARK)); - - /* - * For certain types of inner child nodes, it is advantageous to issue - * MARK every time we advance past an inner tuple we will never return to. - * For other types, MARK on a tuple we cannot return to is a waste of - * cycles. Detect which case applies and set mj_ExtraMarks if we want to - * issue "unnecessary" MARK calls. - * - * Currently, only Material wants the extra MARKs, and it will be helpful - * only if eflags doesn't specify REWIND. 
- */ - if (IsA(innerPlan(node), Material) && - (eflags & EXEC_FLAG_REWIND) == 0 && - !mergestate->mj_SkipMarkRestore) - mergestate->mj_ExtraMarks = true; - else - mergestate->mj_ExtraMarks = false; - - /* - * tuple table initialization - */ - ExecInitResultTupleSlot(estate, &mergestate->js.ps); - - mergestate->mj_MarkedTupleSlot = ExecInitExtraTupleSlot(estate); - ExecSetSlotDescriptor(mergestate->mj_MarkedTupleSlot, - ExecGetResultType(innerPlanState(mergestate))); - - /* - * detect whether we need only consider the first matching inner tuple - */ - mergestate->js.single_match = (node->join.inner_unique || - node->join.jointype == JOIN_SEMI); - - /* set up null tuples for outer joins, if needed */ - switch (node->join.jointype) - { - case JOIN_INNER: +{ + MergeJoinState *mergestate; + + /* check for unsupported flags */ + Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK))); + + MJ1_printf("ExecInitMergeJoin: %s\n", + "initializing node"); + + /* + * create state structure + */ + mergestate = makeNode(MergeJoinState); + mergestate->js.ps.plan = (Plan *) node; + mergestate->js.ps.state = estate; + mergestate->js.ps.ExecProcNode = ExecMergeJoin; + + /* + * Miscellaneous initialization + * + * create expression context for node + */ + ExecAssignExprContext(estate, &mergestate->js.ps); + + /* + * we need two additional econtexts in which we can compute the join + * expressions from the left and right input tuples. The node's regular + * econtext won't do because it gets reset too often. + */ + mergestate->mj_OuterEContext = CreateExprContext(estate); + mergestate->mj_InnerEContext = CreateExprContext(estate); + + /* + * initialize child expressions + */ + mergestate->js.ps.qual = + ExecInitQual(node->join.plan.qual, (PlanState *) mergestate); + mergestate->js.jointype = node->join.jointype; + mergestate->js.joinqual = + ExecInitQual(node->join.joinqual, (PlanState *) mergestate); + mergestate->mj_ConstFalseJoin = false; + /* mergeclauses are handled below */ + + /* + * initialize child nodes + * + * inner child must support MARK/RESTORE, unless we have detected that we + * don't need that. Note that skip_mark_restore must never be set if + * there are non-mergeclause joinquals, since the logic wouldn't work. + */ + Assert(node->join.joinqual == NIL || !node->skip_mark_restore); + mergestate->mj_SkipMarkRestore = node->skip_mark_restore; + + outerPlanState(mergestate) = ExecInitNode(outerPlan(node), estate, eflags); + innerPlanState(mergestate) = ExecInitNode(innerPlan(node), estate, + mergestate->mj_SkipMarkRestore ? + eflags : + (eflags | EXEC_FLAG_MARK)); + + /* + * For certain types of inner child nodes, it is advantageous to issue + * MARK every time we advance past an inner tuple we will never return to. + * For other types, MARK on a tuple we cannot return to is a waste of + * cycles. Detect which case applies and set mj_ExtraMarks if we want to + * issue "unnecessary" MARK calls. + * + * Currently, only Material wants the extra MARKs, and it will be helpful + * only if eflags doesn't specify REWIND. 
+ */ + if (IsA(innerPlan(node), Material) && + (eflags & EXEC_FLAG_REWIND) == 0 && + !mergestate->mj_SkipMarkRestore) + mergestate->mj_ExtraMarks = true; + else + mergestate->mj_ExtraMarks = false; + + /* + * tuple table initialization + */ + ExecInitResultTupleSlot(estate, &mergestate->js.ps); + + mergestate->mj_MarkedTupleSlot = ExecInitExtraTupleSlot(estate); + ExecSetSlotDescriptor(mergestate->mj_MarkedTupleSlot, + ExecGetResultType(innerPlanState(mergestate))); + + /* + * detect whether we need only consider the first matching inner tuple + */ + mergestate->js.single_match = (node->join.inner_unique || + node->join.jointype == JOIN_SEMI); + + /* set up null tuples for outer joins, if needed */ + switch (node->join.jointype) + { + case JOIN_INNER: case JOIN_SEMI: - mergestate->mj_FillOuter = false; - mergestate->mj_FillInner = false; - break; - case JOIN_LEFT: - case JOIN_ANTI: + mergestate->mj_FillOuter = false; + mergestate->mj_FillInner = false; + break; +#ifdef __TBASE__ + case JOIN_LEFT_SCALAR: mergestate->mj_FillOuter = true; mergestate->mj_FillInner = false; mergestate->mj_NullInnerTupleSlot = - ExecInitNullTupleSlot(estate, - ExecGetResultType(innerPlanState(mergestate))); - break; - case JOIN_RIGHT: - mergestate->mj_FillOuter = false; - mergestate->mj_FillInner = true; - mergestate->mj_NullOuterTupleSlot = - ExecInitNullTupleSlot(estate, - ExecGetResultType(outerPlanState(mergestate))); - - /* - * Can't handle right or full join with non-constant extra - * joinclauses. This should have been caught by planner. - */ - if (!check_constant_qual(node->join.joinqual, - &mergestate->mj_ConstFalseJoin)) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("RIGHT JOIN is only supported with merge-joinable join conditions"))); - break; - case JOIN_FULL: - mergestate->mj_FillOuter = true; - mergestate->mj_FillInner = true; - mergestate->mj_NullOuterTupleSlot = - ExecInitNullTupleSlot(estate, - ExecGetResultType(outerPlanState(mergestate))); - mergestate->mj_NullInnerTupleSlot = - ExecInitNullTupleSlot(estate, - ExecGetResultType(innerPlanState(mergestate))); - - /* - * Can't handle right or full join with non-constant extra - * joinclauses. This should have been caught by planner. 
- */ - if (!check_constant_qual(node->join.joinqual, - &mergestate->mj_ConstFalseJoin)) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("FULL JOIN is only supported with merge-joinable join conditions"))); + ExecInitNullTupleSlot(estate, + ExecGetResultType(innerPlanState(mergestate))); break; - default: - elog(ERROR, "unrecognized join type: %d", - (int) node->join.jointype); - } - - /* - * initialize tuple type and projection info - */ - ExecAssignResultTypeFromTL(&mergestate->js.ps); - ExecAssignProjectionInfo(&mergestate->js.ps, NULL); - - /* - * preprocess the merge clauses - */ - mergestate->mj_NumClauses = list_length(node->mergeclauses); - mergestate->mj_Clauses = MJExamineQuals(node->mergeclauses, - node->mergeFamilies, - node->mergeCollations, - node->mergeStrategies, - node->mergeNullsFirst, - (PlanState *) mergestate); - - /* - * initialize join state - */ - mergestate->mj_JoinState = EXEC_MJ_INITIALIZE_OUTER; - mergestate->mj_MatchedOuter = false; - mergestate->mj_MatchedInner = false; - mergestate->mj_OuterTupleSlot = NULL; - mergestate->mj_InnerTupleSlot = NULL; +#endif + case JOIN_LEFT: + case JOIN_ANTI: + mergestate->mj_FillOuter = true; + mergestate->mj_FillInner = false; + mergestate->mj_NullInnerTupleSlot = + ExecInitNullTupleSlot(estate, + ExecGetResultType(innerPlanState(mergestate))); + break; + case JOIN_RIGHT: + mergestate->mj_FillOuter = false; + mergestate->mj_FillInner = true; + mergestate->mj_NullOuterTupleSlot = + ExecInitNullTupleSlot(estate, + ExecGetResultType(outerPlanState(mergestate))); + + /* + * Can't handle right or full join with non-constant extra + * joinclauses. This should have been caught by planner. + */ + if (!check_constant_qual(node->join.joinqual, + &mergestate->mj_ConstFalseJoin)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("RIGHT JOIN is only supported with merge-joinable join conditions"))); + break; + case JOIN_FULL: + mergestate->mj_FillOuter = true; + mergestate->mj_FillInner = true; + mergestate->mj_NullOuterTupleSlot = + ExecInitNullTupleSlot(estate, + ExecGetResultType(outerPlanState(mergestate))); + mergestate->mj_NullInnerTupleSlot = + ExecInitNullTupleSlot(estate, + ExecGetResultType(innerPlanState(mergestate))); + + /* + * Can't handle right or full join with non-constant extra + * joinclauses. This should have been caught by planner. 
+ */ + if (!check_constant_qual(node->join.joinqual, + &mergestate->mj_ConstFalseJoin)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("FULL JOIN is only supported with merge-joinable join conditions"))); + break; + default: + elog(ERROR, "unrecognized join type: %d", + (int) node->join.jointype); + } + + /* + * initialize tuple type and projection info + */ + ExecAssignResultTypeFromTL(&mergestate->js.ps); + ExecAssignProjectionInfo(&mergestate->js.ps, NULL); + + /* + * preprocess the merge clauses + */ + mergestate->mj_NumClauses = list_length(node->mergeclauses); + mergestate->mj_Clauses = MJExamineQuals(node->mergeclauses, + node->mergeFamilies, + node->mergeCollations, + node->mergeStrategies, + node->mergeNullsFirst, + (PlanState *) mergestate); + + /* + * initialize join state + */ + mergestate->mj_JoinState = EXEC_MJ_INITIALIZE_OUTER; + mergestate->mj_MatchedOuter = false; + mergestate->mj_MatchedInner = false; + mergestate->mj_OuterTupleSlot = NULL; + mergestate->mj_InnerTupleSlot = NULL; #ifdef __TBASE__ mergestate->mj_InnerInited = false; #endif diff --git a/src/backend/executor/nodeNestloop.c b/src/backend/executor/nodeNestloop.c index 45b0023b..9a9ec8d4 100644 --- a/src/backend/executor/nodeNestloop.c +++ b/src/backend/executor/nodeNestloop.c @@ -167,102 +167,114 @@ ExecNestLoop(PlanState *pstate) #ifdef __TBASE__ node->nl_InnerInited = true; #endif - innerTupleSlot = ExecProcNode(innerPlan); - econtext->ecxt_innertuple = innerTupleSlot; + innerTupleSlot = ExecProcNode(innerPlan); + econtext->ecxt_innertuple = innerTupleSlot; - if (TupIsNull(innerTupleSlot)) - { - ENL1_printf("no inner tuple, need new outer tuple"); - - node->nl_NeedNewOuter = true; + if (TupIsNull(innerTupleSlot)) + { + ENL1_printf("no inner tuple, need new outer tuple"); + node->nl_NeedNewOuter = true; +#ifdef __TBASE__ if (!node->nl_MatchedOuter && (node->js.jointype == JOIN_LEFT || - node->js.jointype == JOIN_ANTI)) - { - /* - * We are doing an outer join and there were no join matches - * for this outer tuple. Generate a fake join tuple with - * nulls for the inner tuple, and return it if it passes the - * non-join quals. - */ - econtext->ecxt_innertuple = node->nl_NullInnerTupleSlot; - - ENL1_printf("testing qualification for outer-join tuple"); - - if (otherqual == NULL || ExecQual(otherqual, econtext)) - { - /* - * qualification was satisfied so we project and return - * the slot containing the result tuple using - * ExecProject(). - */ - ENL1_printf("qualification succeeded, projecting tuple"); - - return ExecProject(node->js.ps.ps_ProjInfo); - } - else - InstrCountFiltered2(node, 1); - } - - /* - * Otherwise just return to top of loop for a new outer tuple. - */ - continue; - } - - /* - * at this point we have a new pair of inner and outer tuples so we - * test the inner and outer tuples to see if they satisfy the node's - * qualification. - * - * Only the joinquals determine MatchedOuter status, but all quals - * must pass to actually return the tuple. - */ - ENL1_printf("testing qualification"); - - if (ExecQual(joinqual, econtext)) - { - node->nl_MatchedOuter = true; - - /* In an antijoin, we never return a matched tuple */ - if (node->js.jointype == JOIN_ANTI) - { - node->nl_NeedNewOuter = true; - continue; /* return to top of loop */ - } - - /* - * If we only need to join to the first matching inner tuple, then - * consider returning this one, but after that continue with next - * outer tuple. 
- */ - if (node->js.single_match) - node->nl_NeedNewOuter = true; - - if (otherqual == NULL || ExecQual(otherqual, econtext)) - { - /* - * qualification was satisfied so we project and return the - * slot containing the result tuple using ExecProject(). - */ - ENL1_printf("qualification succeeded, projecting tuple"); - - return ExecProject(node->js.ps.ps_ProjInfo); - } - else - InstrCountFiltered2(node, 1); - } - else - InstrCountFiltered1(node, 1); - - /* - * Tuple fails qual, so free per-tuple memory and try again. - */ - ResetExprContext(econtext); - - ENL1_printf("qualification failed, looping"); - } + node->js.jointype == JOIN_ANTI || + node->js.jointype == JOIN_LEFT_SCALAR)) +#else + if (!node->nl_MatchedOuter && + (node->js.jointype == JOIN_LEFT || + node->js.jointype == JOIN_ANTI)) +#endif + { + /* + * We are doing an outer join and there were no join matches + * for this outer tuple. Generate a fake join tuple with + * nulls for the inner tuple, and return it if it passes the + * non-join quals. + */ + econtext->ecxt_innertuple = node->nl_NullInnerTupleSlot; + + ENL1_printf("testing qualification for outer-join tuple"); + + if (otherqual == NULL || ExecQual(otherqual, econtext)) + { + /* + * qualification was satisfied so we project and return + * the slot containing the result tuple using + * ExecProject(). + */ + ENL1_printf("qualification succeeded, projecting tuple"); + + return ExecProject(node->js.ps.ps_ProjInfo); + } + else + InstrCountFiltered2(node, 1); + } + + /* + * Otherwise just return to top of loop for a new outer tuple. + */ + continue; + } + + /* + * at this point we have a new pair of inner and outer tuples so we + * test the inner and outer tuples to see if they satisfy the node's + * qualification. + * + * Only the joinquals determine MatchedOuter status, but all quals + * must pass to actually return the tuple. + */ + ENL1_printf("testing qualification"); + + if (ExecQual(joinqual, econtext)) + { +#ifdef __TBASE__ + if (node->js.jointype == JOIN_LEFT_SCALAR && node->nl_MatchedOuter) + ereport(ERROR, + (errcode(ERRCODE_CARDINALITY_VIOLATION), + errmsg("more than one row returned by a subquery used as an expression"))); +#endif + node->nl_MatchedOuter = true; + + /* In an antijoin, we never return a matched tuple */ + if (node->js.jointype == JOIN_ANTI) + { + node->nl_NeedNewOuter = true; + continue; /* return to top of loop */ + } + + /* + * If we only need to join to the first matching inner tuple, then + * consider returning this one, but after that continue with next + * outer tuple. + */ + if (node->js.single_match) + node->nl_NeedNewOuter = true; + + if (otherqual == NULL || ExecQual(otherqual, econtext)) + { + /* + * qualification was satisfied so we project and return the + * slot containing the result tuple using ExecProject(). + */ + ENL1_printf("qualification succeeded, projecting tuple"); + + return ExecProject(node->js.ps.ps_ProjInfo); + } + else + InstrCountFiltered2(node, 1); + } + else + InstrCountFiltered1(node, 1); + + /* + * Tuple fails qual, so free per-tuple memory and try again. 
+ */ + ResetExprContext(econtext); + + ENL1_printf("qualification failed, looping"); + } } /* ---------------------------------------------------------------- @@ -271,94 +283,101 @@ ExecNestLoop(PlanState *pstate) */ NestLoopState * ExecInitNestLoop(NestLoop *node, EState *estate, int eflags) -{// #lizard forgives - NestLoopState *nlstate; - - /* check for unsupported flags */ - Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK))); - - NL1_printf("ExecInitNestLoop: %s\n", - "initializing node"); - - /* - * create state structure - */ - nlstate = makeNode(NestLoopState); - nlstate->js.ps.plan = (Plan *) node; - nlstate->js.ps.state = estate; - nlstate->js.ps.ExecProcNode = ExecNestLoop; - - /* - * Miscellaneous initialization - * - * create expression context for node - */ - ExecAssignExprContext(estate, &nlstate->js.ps); - - /* - * initialize child expressions - */ - nlstate->js.ps.qual = - ExecInitQual(node->join.plan.qual, (PlanState *) nlstate); - nlstate->js.jointype = node->join.jointype; - nlstate->js.joinqual = - ExecInitQual(node->join.joinqual, (PlanState *) nlstate); - - /* - * initialize child nodes - * - * If we have no parameters to pass into the inner rel from the outer, - * tell the inner child that cheap rescans would be good. If we do have - * such parameters, then there is no point in REWIND support at all in the - * inner child, because it will always be rescanned with fresh parameter - * values. - */ - outerPlanState(nlstate) = ExecInitNode(outerPlan(node), estate, eflags); - if (node->nestParams == NIL) - eflags |= EXEC_FLAG_REWIND; - else - eflags &= ~EXEC_FLAG_REWIND; - innerPlanState(nlstate) = ExecInitNode(innerPlan(node), estate, eflags); - - /* - * tuple table initialization - */ - ExecInitResultTupleSlot(estate, &nlstate->js.ps); - - /* - * detect whether we need only consider the first matching inner tuple - */ - nlstate->js.single_match = (node->join.inner_unique || - node->join.jointype == JOIN_SEMI); - - /* set up null tuples for outer joins, if needed */ - switch (node->join.jointype) - { - case JOIN_INNER: +{ + NestLoopState *nlstate; + + /* check for unsupported flags */ + Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK))); + + NL1_printf("ExecInitNestLoop: %s\n", + "initializing node"); + + /* + * create state structure + */ + nlstate = makeNode(NestLoopState); + nlstate->js.ps.plan = (Plan *) node; + nlstate->js.ps.state = estate; + nlstate->js.ps.ExecProcNode = ExecNestLoop; + + /* + * Miscellaneous initialization + * + * create expression context for node + */ + ExecAssignExprContext(estate, &nlstate->js.ps); + + /* + * initialize child expressions + */ + nlstate->js.ps.qual = + ExecInitQual(node->join.plan.qual, (PlanState *) nlstate); + nlstate->js.jointype = node->join.jointype; + nlstate->js.joinqual = + ExecInitQual(node->join.joinqual, (PlanState *) nlstate); + + /* + * initialize child nodes + * + * If we have no parameters to pass into the inner rel from the outer, + * tell the inner child that cheap rescans would be good. If we do have + * such parameters, then there is no point in REWIND support at all in the + * inner child, because it will always be rescanned with fresh parameter + * values. 
+ */ + outerPlanState(nlstate) = ExecInitNode(outerPlan(node), estate, eflags); + if (node->nestParams == NIL) + eflags |= EXEC_FLAG_REWIND; + else + eflags &= ~EXEC_FLAG_REWIND; + innerPlanState(nlstate) = ExecInitNode(innerPlan(node), estate, eflags); + + /* + * tuple table initialization + */ + ExecInitResultTupleSlot(estate, &nlstate->js.ps); + + /* + * detect whether we need only consider the first matching inner tuple + */ + nlstate->js.single_match = (node->join.inner_unique || + node->join.jointype == JOIN_SEMI); + + /* set up null tuples for outer joins, if needed */ + switch (node->join.jointype) + { + case JOIN_INNER: case JOIN_SEMI: break; - case JOIN_LEFT: - case JOIN_ANTI: +#ifdef __TBASE__ + case JOIN_LEFT_SCALAR: nlstate->nl_NullInnerTupleSlot = - ExecInitNullTupleSlot(estate, - ExecGetResultType(innerPlanState(nlstate))); - break; - default: - elog(ERROR, "unrecognized join type: %d", - (int) node->join.jointype); - } - - /* - * initialize tuple type and projection info - */ - ExecAssignResultTypeFromTL(&nlstate->js.ps); - ExecAssignProjectionInfo(&nlstate->js.ps, NULL); - - /* - * finally, wipe the current outer tuple clean. - */ - nlstate->nl_NeedNewOuter = true; - nlstate->nl_MatchedOuter = false; + ExecInitNullTupleSlot(estate, + ExecGetResultType(innerPlanState(nlstate))); + break; +#endif + case JOIN_LEFT: + case JOIN_ANTI: + nlstate->nl_NullInnerTupleSlot = + ExecInitNullTupleSlot(estate, + ExecGetResultType(innerPlanState(nlstate))); + break; + default: + elog(ERROR, "unrecognized join type: %d", + (int) node->join.jointype); + } + + /* + * initialize tuple type and projection info + */ + ExecAssignResultTypeFromTL(&nlstate->js.ps); + ExecAssignProjectionInfo(&nlstate->js.ps, NULL); + + /* + * finally, wipe the current outer tuple clean. + */ + nlstate->nl_NeedNewOuter = true; + nlstate->nl_MatchedOuter = false; #ifdef __TBASE__ nlstate->nl_InnerInited = false; #endif diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c index d629ec67..a4fec879 100644 --- a/src/backend/optimizer/path/allpaths.c +++ b/src/backend/optimizer/path/allpaths.c @@ -210,34 +210,40 @@ make_one_rel(PlannerInfo *root, List *joinlist) static void set_base_rel_consider_startup(PlannerInfo *root) { - /* - * Since parameterized paths can only be used on the inside of a nestloop - * join plan, there is usually little value in considering fast-start - * plans for them. However, for relations that are on the RHS of a SEMI - * or ANTI join, a fast-start plan can be useful because we're only going - * to care about fetching one tuple anyway. - * - * To minimize growth of planning time, we currently restrict this to - * cases where the RHS is a single base relation, not a join; there is no - * provision for consider_param_startup to get set at all on joinrels. - * Also we don't worry about appendrels. costsize.c's costing rules for - * nestloop semi/antijoins don't consider such cases either. - */ - ListCell *lc; - - foreach(lc, root->join_info_list) - { - SpecialJoinInfo *sjinfo = (SpecialJoinInfo *) lfirst(lc); - int varno; - - if ((sjinfo->jointype == JOIN_SEMI || sjinfo->jointype == JOIN_ANTI) && - bms_get_singleton_member(sjinfo->syn_righthand, &varno)) - { - RelOptInfo *rel = find_base_rel(root, varno); + /* + * Since parameterized paths can only be used on the inside of a nestloop + * join plan, there is usually little value in considering fast-start + * plans for them. 
However, for relations that are on the RHS of a SEMI + * or ANTI join, a fast-start plan can be useful because we're only going + * to care about fetching one tuple anyway. + * + * To minimize growth of planning time, we currently restrict this to + * cases where the RHS is a single base relation, not a join; there is no + * provision for consider_param_startup to get set at all on joinrels. + * Also we don't worry about appendrels. costsize.c's costing rules for + * nestloop semi/antijoins don't consider such cases either. + */ + ListCell *lc; + + foreach(lc, root->join_info_list) + { + SpecialJoinInfo *sjinfo = (SpecialJoinInfo *) lfirst(lc); + int varno; + +#ifdef __TBASE__ + if ((sjinfo->jointype == JOIN_SEMI || sjinfo->jointype == JOIN_ANTI || + sjinfo->jointype == JOIN_LEFT_SCALAR) && + bms_get_singleton_member(sjinfo->syn_righthand, &varno)) +#else + if ((sjinfo->jointype == JOIN_SEMI || sjinfo->jointype == JOIN_ANTI) && + bms_get_singleton_member(sjinfo->syn_righthand, &varno)) +#endif + { + RelOptInfo *rel = find_base_rel(root, varno); - rel->consider_param_startup = true; - } - } + rel->consider_param_startup = true; + } + } } /* diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index 7de5eaa4..a6bba0cf 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -2103,65 +2103,72 @@ initial_cost_nestloop(PlannerInfo *root, JoinCostWorkspace *workspace, Path *outer_path, Path *inner_path, JoinPathExtraData *extra) { - Cost startup_cost = 0; - Cost run_cost = 0; - double outer_path_rows = outer_path->rows; - Cost inner_rescan_start_cost; - Cost inner_rescan_total_cost; - Cost inner_run_cost; - Cost inner_rescan_run_cost; + Cost startup_cost = 0; + Cost run_cost = 0; + double outer_path_rows = outer_path->rows; + Cost inner_rescan_start_cost; + Cost inner_rescan_total_cost; + Cost inner_run_cost; + Cost inner_rescan_run_cost; - /* estimate costs to rescan the inner relation */ - cost_rescan(root, inner_path, - &inner_rescan_start_cost, - &inner_rescan_total_cost); + /* estimate costs to rescan the inner relation */ + cost_rescan(root, inner_path, + &inner_rescan_start_cost, + &inner_rescan_total_cost); - /* cost of source data */ + /* cost of source data */ - /* - * NOTE: clearly, we must pay both outer and inner paths' startup_cost - * before we can start returning tuples, so the join's startup cost is - * their sum. We'll also pay the inner path's rescan startup cost - * multiple times. - */ - startup_cost += outer_path->startup_cost + inner_path->startup_cost; - run_cost += outer_path->total_cost - outer_path->startup_cost; - if (outer_path_rows > 1) - run_cost += (outer_path_rows - 1) * inner_rescan_start_cost; + /* + * NOTE: clearly, we must pay both outer and inner paths' startup_cost + * before we can start returning tuples, so the join's startup cost is + * their sum. We'll also pay the inner path's rescan startup cost + * multiple times. 
+ */ + startup_cost += outer_path->startup_cost + inner_path->startup_cost; + run_cost += outer_path->total_cost - outer_path->startup_cost; + if (outer_path_rows > 1) + run_cost += (outer_path_rows - 1) * inner_rescan_start_cost; - inner_run_cost = inner_path->total_cost - inner_path->startup_cost; - inner_rescan_run_cost = inner_rescan_total_cost - inner_rescan_start_cost; + inner_run_cost = inner_path->total_cost - inner_path->startup_cost; + inner_rescan_run_cost = inner_rescan_total_cost - inner_rescan_start_cost; - if (jointype == JOIN_SEMI || jointype == JOIN_ANTI || - extra->inner_unique) - { - /* - * With a SEMI or ANTI join, or if the innerrel is known unique, the - * executor will stop after the first match. - * - * Getting decent estimates requires inspection of the join quals, - * which we choose to postpone to final_cost_nestloop. - */ - - /* Save private data for final_cost_nestloop */ - workspace->inner_run_cost = inner_run_cost; - workspace->inner_rescan_run_cost = inner_rescan_run_cost; - } - else - { - /* Normal case; we'll scan whole input rel for each outer row */ - run_cost += inner_run_cost; - if (outer_path_rows > 1) - run_cost += (outer_path_rows - 1) * inner_rescan_run_cost; - } +#ifdef __TBASE__ + if (jointype == JOIN_SEMI || + jointype == JOIN_ANTI || + jointype == JOIN_LEFT_SCALAR || + extra->inner_unique) +#else + if (jointype == JOIN_SEMI || jointype == JOIN_ANTI || + extra->inner_unique) +#endif + { + /* + * With a SEMI or ANTI join, or if the innerrel is known unique, the + * executor will stop after the first match. + * + * Getting decent estimates requires inspection of the join quals, + * which we choose to postpone to final_cost_nestloop. + */ + + /* Save private data for final_cost_nestloop */ + workspace->inner_run_cost = inner_run_cost; + workspace->inner_rescan_run_cost = inner_rescan_run_cost; + } + else + { + /* Normal case; we'll scan whole input rel for each outer row */ + run_cost += inner_run_cost; + if (outer_path_rows > 1) + run_cost += (outer_path_rows - 1) * inner_rescan_run_cost; + } - /* CPU costs left for later */ + /* CPU costs left for later */ - /* Public result fields */ - workspace->startup_cost = startup_cost; - workspace->total_cost = startup_cost + run_cost; - /* Save private data for final_cost_nestloop */ - workspace->run_cost = run_cost; + /* Public result fields */ + workspace->startup_cost = startup_cost; + workspace->total_cost = startup_cost + run_cost; + /* Save private data for final_cost_nestloop */ + workspace->run_cost = run_cost; } /* @@ -2174,176 +2181,183 @@ initial_cost_nestloop(PlannerInfo *root, JoinCostWorkspace *workspace, */ void final_cost_nestloop(PlannerInfo *root, NestPath *path, - JoinCostWorkspace *workspace, - JoinPathExtraData *extra) -{// #lizard forgives - Path *outer_path = path->outerjoinpath; - Path *inner_path = path->innerjoinpath; - double outer_path_rows = outer_path->rows; - double inner_path_rows = inner_path->rows; - Cost startup_cost = workspace->startup_cost; - Cost run_cost = workspace->run_cost; - Cost cpu_per_tuple; - QualCost restrict_qual_cost; - double ntuples; - - /* Protect some assumptions below that rowcounts aren't zero or NaN */ - if (outer_path_rows <= 0 || isnan(outer_path_rows)) - outer_path_rows = 1; - if (inner_path_rows <= 0 || isnan(inner_path_rows)) - inner_path_rows = 1; - - /* Mark the path with the correct row estimate */ - if (path->path.param_info) - path->path.rows = path->path.param_info->ppi_rows; - else - path->path.rows = path->path.parent->rows; - - /* For 
partial paths, scale row estimate. */ - if (path->path.parallel_workers > 0) - { - double parallel_divisor = get_parallel_divisor(&path->path); - - path->path.rows = - clamp_row_est(path->path.rows / parallel_divisor); - } - - /* - * We could include disable_cost in the preliminary estimate, but that - * would amount to optimizing for the case where the join method is - * disabled, which doesn't seem like the way to bet. - */ - if (!enable_nestloop) - startup_cost += disable_cost; - - /* cost of inner-relation source data (we already dealt with outer rel) */ - - if (path->jointype == JOIN_SEMI || path->jointype == JOIN_ANTI || - extra->inner_unique) - { - /* - * With a SEMI or ANTI join, or if the innerrel is known unique, the - * executor will stop after the first match. - */ - Cost inner_run_cost = workspace->inner_run_cost; - Cost inner_rescan_run_cost = workspace->inner_rescan_run_cost; - double outer_matched_rows; - double outer_unmatched_rows; - Selectivity inner_scan_frac; - - /* - * For an outer-rel row that has at least one match, we can expect the - * inner scan to stop after a fraction 1/(match_count+1) of the inner - * rows, if the matches are evenly distributed. Since they probably - * aren't quite evenly distributed, we apply a fuzz factor of 2.0 to - * that fraction. (If we used a larger fuzz factor, we'd have to - * clamp inner_scan_frac to at most 1.0; but since match_count is at - * least 1, no such clamp is needed now.) - */ - outer_matched_rows = rint(outer_path_rows * extra->semifactors.outer_match_frac); - outer_unmatched_rows = outer_path_rows - outer_matched_rows; - inner_scan_frac = 2.0 / (extra->semifactors.match_count + 1.0); - - /* - * Compute number of tuples processed (not number emitted!). First, - * account for successfully-matched outer rows. - */ - ntuples = outer_matched_rows * inner_path_rows * inner_scan_frac; - - /* - * Now we need to estimate the actual costs of scanning the inner - * relation, which may be quite a bit less than N times inner_run_cost - * due to early scan stops. We consider two cases. If the inner path - * is an indexscan using all the joinquals as indexquals, then an - * unmatched outer row results in an indexscan returning no rows, - * which is probably quite cheap. Otherwise, the executor will have - * to scan the whole inner rel for an unmatched row; not so cheap. - */ - if (has_indexed_join_quals(path)) - { - /* - * Successfully-matched outer rows will only require scanning - * inner_scan_frac of the inner relation. In this case, we don't - * need to charge the full inner_run_cost even when that's more - * than inner_rescan_run_cost, because we can assume that none of - * the inner scans ever scan the whole inner relation. So it's - * okay to assume that all the inner scan executions can be - * fractions of the full cost, even if materialization is reducing - * the rescan cost. At this writing, it's impossible to get here - * for a materialized inner scan, so inner_run_cost and - * inner_rescan_run_cost will be the same anyway; but just in - * case, use inner_run_cost for the first matched tuple and - * inner_rescan_run_cost for additional ones. - */ - run_cost += inner_run_cost * inner_scan_frac; - if (outer_matched_rows > 1) - run_cost += (outer_matched_rows - 1) * inner_rescan_run_cost * inner_scan_frac; - - /* - * Add the cost of inner-scan executions for unmatched outer rows. - * We estimate this as the same cost as returning the first tuple - * of a nonempty scan. 
We consider that these are all rescans, - * since we used inner_run_cost once already. - */ - run_cost += outer_unmatched_rows * - inner_rescan_run_cost / inner_path_rows; + JoinCostWorkspace *workspace, + JoinPathExtraData *extra) +{ + Path *outer_path = path->outerjoinpath; + Path *inner_path = path->innerjoinpath; + double outer_path_rows = outer_path->rows; + double inner_path_rows = inner_path->rows; + Cost startup_cost = workspace->startup_cost; + Cost run_cost = workspace->run_cost; + Cost cpu_per_tuple; + QualCost restrict_qual_cost; + double ntuples; + + /* Protect some assumptions below that rowcounts aren't zero or NaN */ + if (outer_path_rows <= 0 || isnan(outer_path_rows)) + outer_path_rows = 1; + if (inner_path_rows <= 0 || isnan(inner_path_rows)) + inner_path_rows = 1; + + /* Mark the path with the correct row estimate */ + if (path->path.param_info) + path->path.rows = path->path.param_info->ppi_rows; + else + path->path.rows = path->path.parent->rows; + + /* For partial paths, scale row estimate. */ + if (path->path.parallel_workers > 0) + { + double parallel_divisor = get_parallel_divisor(&path->path); - /* - * We won't be evaluating any quals at all for unmatched rows, so - * don't add them to ntuples. - */ - } - else - { - /* - * Here, a complicating factor is that rescans may be cheaper than - * first scans. If we never scan all the way to the end of the - * inner rel, it might be (depending on the plan type) that we'd - * never pay the whole inner first-scan run cost. However it is - * difficult to estimate whether that will happen (and it could - * not happen if there are any unmatched outer rows!), so be - * conservative and always charge the whole first-scan cost once. - * We consider this charge to correspond to the first unmatched - * outer row, unless there isn't one in our estimate, in which - * case blame it on the first matched row. - */ + path->path.rows = + clamp_row_est(path->path.rows / parallel_divisor); + } - /* First, count all unmatched join tuples as being processed */ - ntuples += outer_unmatched_rows * inner_path_rows; + /* + * We could include disable_cost in the preliminary estimate, but that + * would amount to optimizing for the case where the join method is + * disabled, which doesn't seem like the way to bet. + */ + if (!enable_nestloop) + startup_cost += disable_cost; - /* Now add the forced full scan, and decrement appropriate count */ - run_cost += inner_run_cost; - if (outer_unmatched_rows >= 1) - outer_unmatched_rows -= 1; - else - outer_matched_rows -= 1; + /* cost of inner-relation source data (we already dealt with outer rel) */ - /* Add inner run cost for additional outer tuples having matches */ - if (outer_matched_rows > 0) - run_cost += outer_matched_rows * inner_rescan_run_cost * inner_scan_frac; +#ifdef __TBASE__ + if (path->jointype == JOIN_SEMI || + path->jointype == JOIN_ANTI || + path->jointype == JOIN_LEFT_SCALAR || + extra->inner_unique) +#else + if (path->jointype == JOIN_SEMI || path->jointype == JOIN_ANTI || + extra->inner_unique) +#endif + { + /* + * With a SEMI or ANTI join, or if the innerrel is known unique, the + * executor will stop after the first match. 
+ */ + Cost inner_run_cost = workspace->inner_run_cost; + Cost inner_rescan_run_cost = workspace->inner_rescan_run_cost; + double outer_matched_rows; + double outer_unmatched_rows; + Selectivity inner_scan_frac; + + /* + * For an outer-rel row that has at least one match, we can expect the + * inner scan to stop after a fraction 1/(match_count+1) of the inner + * rows, if the matches are evenly distributed. Since they probably + * aren't quite evenly distributed, we apply a fuzz factor of 2.0 to + * that fraction. (If we used a larger fuzz factor, we'd have to + * clamp inner_scan_frac to at most 1.0; but since match_count is at + * least 1, no such clamp is needed now.) + */ + outer_matched_rows = rint(outer_path_rows * extra->semifactors.outer_match_frac); + outer_unmatched_rows = outer_path_rows - outer_matched_rows; + inner_scan_frac = 2.0 / (extra->semifactors.match_count + 1.0); + + /* + * Compute number of tuples processed (not number emitted!). First, + * account for successfully-matched outer rows. + */ + ntuples = outer_matched_rows * inner_path_rows * inner_scan_frac; + + /* + * Now we need to estimate the actual costs of scanning the inner + * relation, which may be quite a bit less than N times inner_run_cost + * due to early scan stops. We consider two cases. If the inner path + * is an indexscan using all the joinquals as indexquals, then an + * unmatched outer row results in an indexscan returning no rows, + * which is probably quite cheap. Otherwise, the executor will have + * to scan the whole inner rel for an unmatched row; not so cheap. + */ + if (has_indexed_join_quals(path)) + { + /* + * Successfully-matched outer rows will only require scanning + * inner_scan_frac of the inner relation. In this case, we don't + * need to charge the full inner_run_cost even when that's more + * than inner_rescan_run_cost, because we can assume that none of + * the inner scans ever scan the whole inner relation. So it's + * okay to assume that all the inner scan executions can be + * fractions of the full cost, even if materialization is reducing + * the rescan cost. At this writing, it's impossible to get here + * for a materialized inner scan, so inner_run_cost and + * inner_rescan_run_cost will be the same anyway; but just in + * case, use inner_run_cost for the first matched tuple and + * inner_rescan_run_cost for additional ones. + */ + run_cost += inner_run_cost * inner_scan_frac; + if (outer_matched_rows > 1) + run_cost += (outer_matched_rows - 1) * inner_rescan_run_cost * inner_scan_frac; + + /* + * Add the cost of inner-scan executions for unmatched outer rows. + * We estimate this as the same cost as returning the first tuple + * of a nonempty scan. We consider that these are all rescans, + * since we used inner_run_cost once already. + */ + run_cost += outer_unmatched_rows * + inner_rescan_run_cost / inner_path_rows; + + /* + * We won't be evaluating any quals at all for unmatched rows, so + * don't add them to ntuples. + */ + } + else + { + /* + * Here, a complicating factor is that rescans may be cheaper than + * first scans. If we never scan all the way to the end of the + * inner rel, it might be (depending on the plan type) that we'd + * never pay the whole inner first-scan run cost. However it is + * difficult to estimate whether that will happen (and it could + * not happen if there are any unmatched outer rows!), so be + * conservative and always charge the whole first-scan cost once. 
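For reference, a rough standalone sketch of the early-stop arithmetic described in the comment above (outer_matched_rows, inner_scan_frac, and the tuples-processed count). This is illustrative C only, not part of the patch; every numeric input is an invented assumption.

#include <math.h>
#include <stdio.h>

int
main(void)
{
    double outer_path_rows = 1000.0;   /* assumed outer row count */
    double inner_path_rows = 500.0;    /* assumed inner row count */
    double outer_match_frac = 0.3;     /* assumed semifactors.outer_match_frac */
    double match_count = 4.0;          /* assumed semifactors.match_count */
    double outer_matched_rows;
    double outer_unmatched_rows;
    double inner_scan_frac;
    double ntuples;

    outer_matched_rows = rint(outer_path_rows * outer_match_frac);
    outer_unmatched_rows = outer_path_rows - outer_matched_rows;

    /* fuzz factor 2.0 applied to the 1/(match_count+1) stopping fraction */
    inner_scan_frac = 2.0 / (match_count + 1.0);

    /* tuples processed (not emitted) for the matched outer rows */
    ntuples = outer_matched_rows * inner_path_rows * inner_scan_frac;

    printf("matched=%.0f unmatched=%.0f scan_frac=%.3f ntuples=%.0f\n",
           outer_matched_rows, outer_unmatched_rows,
           inner_scan_frac, ntuples);
    return 0;
}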
+ * We consider this charge to correspond to the first unmatched + * outer row, unless there isn't one in our estimate, in which + * case blame it on the first matched row. + */ + + /* First, count all unmatched join tuples as being processed */ + ntuples += outer_unmatched_rows * inner_path_rows; + + /* Now add the forced full scan, and decrement appropriate count */ + run_cost += inner_run_cost; + if (outer_unmatched_rows >= 1) + outer_unmatched_rows -= 1; + else + outer_matched_rows -= 1; + + /* Add inner run cost for additional outer tuples having matches */ + if (outer_matched_rows > 0) + run_cost += outer_matched_rows * inner_rescan_run_cost * inner_scan_frac; + + /* Add inner run cost for additional unmatched outer tuples */ + if (outer_unmatched_rows > 0) + run_cost += outer_unmatched_rows * inner_rescan_run_cost; + } + } + else + { + /* Normal-case source costs were included in preliminary estimate */ - /* Add inner run cost for additional unmatched outer tuples */ - if (outer_unmatched_rows > 0) - run_cost += outer_unmatched_rows * inner_rescan_run_cost; - } - } - else - { - /* Normal-case source costs were included in preliminary estimate */ + /* Compute number of tuples processed (not number emitted!) */ + ntuples = outer_path_rows * inner_path_rows; + } - /* Compute number of tuples processed (not number emitted!) */ - ntuples = outer_path_rows * inner_path_rows; - } + /* CPU costs */ + cost_qual_eval(&restrict_qual_cost, path->joinrestrictinfo, root); + startup_cost += restrict_qual_cost.startup; + cpu_per_tuple = cpu_tuple_cost + restrict_qual_cost.per_tuple; + run_cost += cpu_per_tuple * ntuples; - /* CPU costs */ - cost_qual_eval(&restrict_qual_cost, path->joinrestrictinfo, root); - startup_cost += restrict_qual_cost.startup; - cpu_per_tuple = cpu_tuple_cost + restrict_qual_cost.per_tuple; - run_cost += cpu_per_tuple * ntuples; - - /* tlist eval costs are paid per output row, not per tuple scanned */ - startup_cost += path->path.pathtarget->cost.startup; - run_cost += path->path.pathtarget->cost.per_tuple * path->path.rows; + /* tlist eval costs are paid per output row, not per tuple scanned */ + startup_cost += path->path.pathtarget->cost.startup; + run_cost += path->path.pathtarget->cost.per_tuple * path->path.rows; #ifdef __TBASE__ /* @@ -2630,252 +2644,261 @@ initial_cost_mergejoin(PlannerInfo *root, JoinCostWorkspace *workspace, */ void final_cost_mergejoin(PlannerInfo *root, MergePath *path, - JoinCostWorkspace *workspace, - JoinPathExtraData *extra) -{// #lizard forgives - Path *outer_path = path->jpath.outerjoinpath; - Path *inner_path = path->jpath.innerjoinpath; - double inner_path_rows = inner_path->rows; - List *mergeclauses = path->path_mergeclauses; - List *innersortkeys = path->innersortkeys; - Cost startup_cost = workspace->startup_cost; - Cost run_cost = workspace->run_cost; - Cost inner_run_cost = workspace->inner_run_cost; - double outer_rows = workspace->outer_rows; - double inner_rows = workspace->inner_rows; - double outer_skip_rows = workspace->outer_skip_rows; - double inner_skip_rows = workspace->inner_skip_rows; - Cost cpu_per_tuple, - bare_inner_cost, - mat_inner_cost; - QualCost merge_qual_cost; - QualCost qp_qual_cost; - double mergejointuples, - rescannedtuples; - double rescanratio; - - /* Protect some assumptions below that rowcounts aren't zero or NaN */ - if (inner_path_rows <= 0 || isnan(inner_path_rows)) - inner_path_rows = 1; - - /* Mark the path with the correct row estimate */ - if (path->jpath.path.param_info) - path->jpath.path.rows 
= path->jpath.path.param_info->ppi_rows; - else - path->jpath.path.rows = path->jpath.path.parent->rows; - - /* For partial paths, scale row estimate. */ - if (path->jpath.path.parallel_workers > 0) - { - double parallel_divisor = get_parallel_divisor(&path->jpath.path); + JoinCostWorkspace *workspace, + JoinPathExtraData *extra) +{ + Path *outer_path = path->jpath.outerjoinpath; + Path *inner_path = path->jpath.innerjoinpath; + double inner_path_rows = inner_path->rows; + List *mergeclauses = path->path_mergeclauses; + List *innersortkeys = path->innersortkeys; + Cost startup_cost = workspace->startup_cost; + Cost run_cost = workspace->run_cost; + Cost inner_run_cost = workspace->inner_run_cost; + double outer_rows = workspace->outer_rows; + double inner_rows = workspace->inner_rows; + double outer_skip_rows = workspace->outer_skip_rows; + double inner_skip_rows = workspace->inner_skip_rows; + Cost cpu_per_tuple, + bare_inner_cost, + mat_inner_cost; + QualCost merge_qual_cost; + QualCost qp_qual_cost; + double mergejointuples, + rescannedtuples; + double rescanratio; + + /* Protect some assumptions below that rowcounts aren't zero or NaN */ + if (inner_path_rows <= 0 || isnan(inner_path_rows)) + inner_path_rows = 1; + + /* Mark the path with the correct row estimate */ + if (path->jpath.path.param_info) + path->jpath.path.rows = path->jpath.path.param_info->ppi_rows; + else + path->jpath.path.rows = path->jpath.path.parent->rows; + + /* For partial paths, scale row estimate. */ + if (path->jpath.path.parallel_workers > 0) + { + double parallel_divisor = get_parallel_divisor(&path->jpath.path); - path->jpath.path.rows = - clamp_row_est(path->jpath.path.rows / parallel_divisor); - } + path->jpath.path.rows = + clamp_row_est(path->jpath.path.rows / parallel_divisor); + } - /* - * We could include disable_cost in the preliminary estimate, but that - * would amount to optimizing for the case where the join method is - * disabled, which doesn't seem like the way to bet. - */ - if (!enable_mergejoin) - startup_cost += disable_cost; + /* + * We could include disable_cost in the preliminary estimate, but that + * would amount to optimizing for the case where the join method is + * disabled, which doesn't seem like the way to bet. + */ + if (!enable_mergejoin) + startup_cost += disable_cost; - /* - * Compute cost of the mergequals and qpquals (other restriction clauses) - * separately. - */ - cost_qual_eval(&merge_qual_cost, mergeclauses, root); - cost_qual_eval(&qp_qual_cost, path->jpath.joinrestrictinfo, root); - qp_qual_cost.startup -= merge_qual_cost.startup; - qp_qual_cost.per_tuple -= merge_qual_cost.per_tuple; + /* + * Compute cost of the mergequals and qpquals (other restriction clauses) + * separately. + */ + cost_qual_eval(&merge_qual_cost, mergeclauses, root); + cost_qual_eval(&qp_qual_cost, path->jpath.joinrestrictinfo, root); + qp_qual_cost.startup -= merge_qual_cost.startup; + qp_qual_cost.per_tuple -= merge_qual_cost.per_tuple; - /* - * With a SEMI or ANTI join, or if the innerrel is known unique, the - * executor will stop scanning for matches after the first match. When - * all the joinclauses are merge clauses, this means we don't ever need to - * back up the merge, and so we can skip mark/restore overhead. 
- */ - if ((path->jpath.jointype == JOIN_SEMI || - path->jpath.jointype == JOIN_ANTI || - extra->inner_unique) && - (list_length(path->jpath.joinrestrictinfo) == - list_length(path->path_mergeclauses))) - path->skip_mark_restore = true; - else - path->skip_mark_restore = false; + /* + * With a SEMI or ANTI join, or if the innerrel is known unique, the + * executor will stop scanning for matches after the first match. When + * all the joinclauses are merge clauses, this means we don't ever need to + * back up the merge, and so we can skip mark/restore overhead. + */ +#ifdef __TBASE__ + if ((path->jpath.jointype == JOIN_SEMI || + path->jpath.jointype == JOIN_ANTI || + path->jpath.jointype == JOIN_LEFT_SCALAR || + extra->inner_unique) && + (list_length(path->jpath.joinrestrictinfo) == + list_length(path->path_mergeclauses))) +#else + if ((path->jpath.jointype == JOIN_SEMI || + path->jpath.jointype == JOIN_ANTI || + extra->inner_unique) && + (list_length(path->jpath.joinrestrictinfo) == + list_length(path->path_mergeclauses))) +#endif + path->skip_mark_restore = true; + else + path->skip_mark_restore = false; - /* - * Get approx # tuples passing the mergequals. We use approx_tuple_count - * here because we need an estimate done with JOIN_INNER semantics. - */ - mergejointuples = approx_tuple_count(root, &path->jpath, mergeclauses); + /* + * Get approx # tuples passing the mergequals. We use approx_tuple_count + * here because we need an estimate done with JOIN_INNER semantics. + */ + mergejointuples = approx_tuple_count(root, &path->jpath, mergeclauses); - /* - * When there are equal merge keys in the outer relation, the mergejoin - * must rescan any matching tuples in the inner relation. This means - * re-fetching inner tuples; we have to estimate how often that happens. - * - * For regular inner and outer joins, the number of re-fetches can be - * estimated approximately as size of merge join output minus size of - * inner relation. Assume that the distinct key values are 1, 2, ..., and - * denote the number of values of each key in the outer relation as m1, - * m2, ...; in the inner relation, n1, n2, ... Then we have - * - * size of join = m1 * n1 + m2 * n2 + ... - * - * number of rescanned tuples = (m1 - 1) * n1 + (m2 - 1) * n2 + ... = m1 * - * n1 + m2 * n2 + ... - (n1 + n2 + ...) = size of join - size of inner - * relation - * - * This equation works correctly for outer tuples having no inner match - * (nk = 0), but not for inner tuples having no outer match (mk = 0); we - * are effectively subtracting those from the number of rescanned tuples, - * when we should not. Can we do better without expensive selectivity - * computations? - * - * The whole issue is moot if we are working from a unique-ified outer - * input, or if we know we don't need to mark/restore at all. - */ - if (IsA(outer_path, UniquePath) ||path->skip_mark_restore) - rescannedtuples = 0; - else - { - rescannedtuples = mergejointuples - inner_path_rows; - /* Must clamp because of possible underestimate */ - if (rescannedtuples < 0) - rescannedtuples = 0; - } - /* We'll inflate various costs this much to account for rescanning */ - rescanratio = 1.0 + (rescannedtuples / inner_path_rows); + /* + * When there are equal merge keys in the outer relation, the mergejoin + * must rescan any matching tuples in the inner relation. This means + * re-fetching inner tuples; we have to estimate how often that happens. 
+ * + * For regular inner and outer joins, the number of re-fetches can be + * estimated approximately as size of merge join output minus size of + * inner relation. Assume that the distinct key values are 1, 2, ..., and + * denote the number of values of each key in the outer relation as m1, + * m2, ...; in the inner relation, n1, n2, ... Then we have + * + * size of join = m1 * n1 + m2 * n2 + ... + * + * number of rescanned tuples = (m1 - 1) * n1 + (m2 - 1) * n2 + ... = m1 * + * n1 + m2 * n2 + ... - (n1 + n2 + ...) = size of join - size of inner + * relation + * + * This equation works correctly for outer tuples having no inner match + * (nk = 0), but not for inner tuples having no outer match (mk = 0); we + * are effectively subtracting those from the number of rescanned tuples, + * when we should not. Can we do better without expensive selectivity + * computations? + * + * The whole issue is moot if we are working from a unique-ified outer + * input, or if we know we don't need to mark/restore at all. + */ + if (IsA(outer_path, UniquePath) ||path->skip_mark_restore) + rescannedtuples = 0; + else + { + rescannedtuples = mergejointuples - inner_path_rows; + /* Must clamp because of possible underestimate */ + if (rescannedtuples < 0) + rescannedtuples = 0; + } + /* We'll inflate various costs this much to account for rescanning */ + rescanratio = 1.0 + (rescannedtuples / inner_path_rows); - /* - * Decide whether we want to materialize the inner input to shield it from - * mark/restore and performing re-fetches. Our cost model for regular - * re-fetches is that a re-fetch costs the same as an original fetch, - * which is probably an overestimate; but on the other hand we ignore the - * bookkeeping costs of mark/restore. Not clear if it's worth developing - * a more refined model. So we just need to inflate the inner run cost by - * rescanratio. - */ - bare_inner_cost = inner_run_cost * rescanratio; + /* + * Decide whether we want to materialize the inner input to shield it from + * mark/restore and performing re-fetches. Our cost model for regular + * re-fetches is that a re-fetch costs the same as an original fetch, + * which is probably an overestimate; but on the other hand we ignore the + * bookkeeping costs of mark/restore. Not clear if it's worth developing + * a more refined model. So we just need to inflate the inner run cost by + * rescanratio. + */ + bare_inner_cost = inner_run_cost * rescanratio; - /* - * When we interpose a Material node the re-fetch cost is assumed to be - * just cpu_operator_cost per tuple, independently of the underlying - * plan's cost; and we charge an extra cpu_operator_cost per original - * fetch as well. Note that we're assuming the materialize node will - * never spill to disk, since it only has to remember tuples back to the - * last mark. (If there are a huge number of duplicates, our other cost - * factors will make the path so expensive that it probably won't get - * chosen anyway.) So we don't use cost_rescan here. - * - * Note: keep this estimate in sync with create_mergejoin_plan's labeling - * of the generated Material node. - */ - mat_inner_cost = inner_run_cost + - cpu_operator_cost * inner_path_rows * rescanratio; + /* + * When we interpose a Material node the re-fetch cost is assumed to be + * just cpu_operator_cost per tuple, independently of the underlying + * plan's cost; and we charge an extra cpu_operator_cost per original + * fetch as well. 
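For reference, a rough standalone sketch of the rescanned-tuples estimate described above (rescanned = size of join minus size of inner relation, clamped at zero, and the resulting rescanratio). Illustrative C only, with invented numbers; not part of the patch.

#include <stdio.h>

int
main(void)
{
    double mergejointuples = 12000.0;  /* assumed rows passing the mergequals */
    double inner_path_rows = 10000.0;  /* assumed inner relation size */
    double rescannedtuples;
    double rescanratio;

    /* size of join minus size of inner relation, clamped at zero */
    rescannedtuples = mergejointuples - inner_path_rows;
    if (rescannedtuples < 0)
        rescannedtuples = 0;

    /* inflate inner-side costs by this much to account for rescanning */
    rescanratio = 1.0 + (rescannedtuples / inner_path_rows);

    printf("rescanned=%.0f rescanratio=%.2f\n", rescannedtuples, rescanratio);
    return 0;
}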
Note that we're assuming the materialize node will + * never spill to disk, since it only has to remember tuples back to the + * last mark. (If there are a huge number of duplicates, our other cost + * factors will make the path so expensive that it probably won't get + * chosen anyway.) So we don't use cost_rescan here. + * + * Note: keep this estimate in sync with create_mergejoin_plan's labeling + * of the generated Material node. + */ + mat_inner_cost = inner_run_cost + + cpu_operator_cost * inner_path_rows * rescanratio; - /* - * If we don't need mark/restore at all, we don't need materialization. - */ - if (path->skip_mark_restore) - path->materialize_inner = false; + /* + * If we don't need mark/restore at all, we don't need materialization. + */ + if (path->skip_mark_restore) + path->materialize_inner = false; - /* - * Prefer materializing if it looks cheaper, unless the user has asked to - * suppress materialization. - */ - else if (enable_material && mat_inner_cost < bare_inner_cost) - path->materialize_inner = true; + /* + * Prefer materializing if it looks cheaper, unless the user has asked to + * suppress materialization. + */ + else if (enable_material && mat_inner_cost < bare_inner_cost) + path->materialize_inner = true; - /* - * Even if materializing doesn't look cheaper, we *must* do it if the - * inner path is to be used directly (without sorting) and it doesn't - * support mark/restore. - * - * Since the inner side must be ordered, and only Sorts and IndexScans can - * create order to begin with, and they both support mark/restore, you - * might think there's no problem --- but you'd be wrong. Nestloop and - * merge joins can *preserve* the order of their inputs, so they can be - * selected as the input of a mergejoin, and they don't support - * mark/restore at present. - * - * We don't test the value of enable_material here, because - * materialization is required for correctness in this case, and turning - * it off does not entitle us to deliver an invalid plan. - */ - else if (innersortkeys == NIL && - !ExecSupportsMarkRestore(inner_path)) - path->materialize_inner = true; + /* + * Even if materializing doesn't look cheaper, we *must* do it if the + * inner path is to be used directly (without sorting) and it doesn't + * support mark/restore. + * + * Since the inner side must be ordered, and only Sorts and IndexScans can + * create order to begin with, and they both support mark/restore, you + * might think there's no problem --- but you'd be wrong. Nestloop and + * merge joins can *preserve* the order of their inputs, so they can be + * selected as the input of a mergejoin, and they don't support + * mark/restore at present. + * + * We don't test the value of enable_material here, because + * materialization is required for correctness in this case, and turning + * it off does not entitle us to deliver an invalid plan. + */ + else if (innersortkeys == NIL && + !ExecSupportsMarkRestore(inner_path)) + path->materialize_inner = true; - /* - * Also, force materializing if the inner path is to be sorted and the - * sort is expected to spill to disk. This is because the final merge - * pass can be done on-the-fly if it doesn't have to support mark/restore. - * We don't try to adjust the cost estimates for this consideration, - * though. - * - * Since materialization is a performance optimization in this case, - * rather than necessary for correctness, we skip it if enable_material is - * off. 
- */ - else if (enable_material && innersortkeys != NIL && - relation_byte_size(inner_path_rows, - inner_path->pathtarget->width) > - (work_mem * 1024L)) - path->materialize_inner = true; + /* + * Also, force materializing if the inner path is to be sorted and the + * sort is expected to spill to disk. This is because the final merge + * pass can be done on-the-fly if it doesn't have to support mark/restore. + * We don't try to adjust the cost estimates for this consideration, + * though. + * + * Since materialization is a performance optimization in this case, + * rather than necessary for correctness, we skip it if enable_material is + * off. + */ + else if (enable_material && innersortkeys != NIL && + relation_byte_size(inner_path_rows, + inner_path->pathtarget->width) > + (work_mem * 1024L)) + path->materialize_inner = true; #ifdef XCP - /* - * Even if innersortkeys are specified, we never add the Sort node on top - * of RemoteSubplan, instead we set up internal sorter. - * Since RemoteSubplan does not support mark/restore we must materialize it - */ - else if (inner_path->pathtype == T_RemoteSubplan) - path->materialize_inner = true; + /* + * Even if innersortkeys are specified, we never add the Sort node on top + * of RemoteSubplan, instead we set up internal sorter. + * Since RemoteSubplan does not support mark/restore we must materialize it + */ + else if (inner_path->pathtype == T_RemoteSubplan) + path->materialize_inner = true; #endif - else - path->materialize_inner = false; + else + path->materialize_inner = false; - /* Charge the right incremental cost for the chosen case */ - if (path->materialize_inner) - run_cost += mat_inner_cost; - else - run_cost += bare_inner_cost; + /* Charge the right incremental cost for the chosen case */ + if (path->materialize_inner) + run_cost += mat_inner_cost; + else + run_cost += bare_inner_cost; - /* CPU costs */ + /* CPU costs */ - /* - * The number of tuple comparisons needed is approximately number of outer - * rows plus number of inner rows plus number of rescanned tuples (can we - * refine this?). At each one, we need to evaluate the mergejoin quals. - */ - startup_cost += merge_qual_cost.startup; - startup_cost += merge_qual_cost.per_tuple * - (outer_skip_rows + inner_skip_rows * rescanratio); - run_cost += merge_qual_cost.per_tuple * - ((outer_rows - outer_skip_rows) + - (inner_rows - inner_skip_rows) * rescanratio); + /* + * The number of tuple comparisons needed is approximately number of outer + * rows plus number of inner rows plus number of rescanned tuples (can we + * refine this?). At each one, we need to evaluate the mergejoin quals. + */ + startup_cost += merge_qual_cost.startup; + startup_cost += merge_qual_cost.per_tuple * + (outer_skip_rows + inner_skip_rows * rescanratio); + run_cost += merge_qual_cost.per_tuple * + ((outer_rows - outer_skip_rows) + + (inner_rows - inner_skip_rows) * rescanratio); - /* - * For each tuple that gets through the mergejoin proper, we charge - * cpu_tuple_cost plus the cost of evaluating additional restriction - * clauses that are to be applied at the join. (This is pessimistic since - * not all of the quals may get evaluated at each tuple.) - * - * Note: we could adjust for SEMI/ANTI joins skipping some qual - * evaluations here, but it's probably not worth the trouble. 
- */ - startup_cost += qp_qual_cost.startup; - cpu_per_tuple = cpu_tuple_cost + qp_qual_cost.per_tuple; - run_cost += cpu_per_tuple * mergejointuples; + /* + * For each tuple that gets through the mergejoin proper, we charge + * cpu_tuple_cost plus the cost of evaluating additional restriction + * clauses that are to be applied at the join. (This is pessimistic since + * not all of the quals may get evaluated at each tuple.) + * + * Note: we could adjust for SEMI/ANTI joins skipping some qual + * evaluations here, but it's probably not worth the trouble. + */ + startup_cost += qp_qual_cost.startup; + cpu_per_tuple = cpu_tuple_cost + qp_qual_cost.per_tuple; + run_cost += cpu_per_tuple * mergejointuples; - /* tlist eval costs are paid per output row, not per tuple scanned */ - startup_cost += path->jpath.path.pathtarget->cost.startup; - run_cost += path->jpath.path.pathtarget->cost.per_tuple * path->jpath.path.rows; + /* tlist eval costs are paid per output row, not per tuple scanned */ + startup_cost += path->jpath.path.pathtarget->cost.startup; + run_cost += path->jpath.path.pathtarget->cost.per_tuple * path->jpath.path.rows; - path->jpath.path.startup_cost = startup_cost; - path->jpath.path.total_cost = startup_cost + run_cost; + path->jpath.path.startup_cost = startup_cost; + path->jpath.path.total_cost = startup_cost + run_cost; } /* @@ -3071,220 +3094,231 @@ initial_cost_hashjoin(PlannerInfo *root, JoinCostWorkspace *workspace, */ void final_cost_hashjoin(PlannerInfo *root, HashPath *path, - JoinCostWorkspace *workspace, - JoinPathExtraData *extra) -{// #lizard forgives - Path *outer_path = path->jpath.outerjoinpath; - Path *inner_path = path->jpath.innerjoinpath; - double outer_path_rows = outer_path->rows; - double inner_path_rows = inner_path->rows; - List *hashclauses = path->path_hashclauses; - Cost startup_cost = workspace->startup_cost; - Cost run_cost = workspace->run_cost; - int numbuckets = workspace->numbuckets; - int numbatches = workspace->numbatches; - Cost cpu_per_tuple; - QualCost hash_qual_cost; - QualCost qp_qual_cost; - double hashjointuples; - double virtualbuckets; - Selectivity innerbucketsize; - ListCell *hcl; - - /* Mark the path with the correct row estimate */ - if (path->jpath.path.param_info) - path->jpath.path.rows = path->jpath.path.param_info->ppi_rows; - else - path->jpath.path.rows = path->jpath.path.parent->rows; - - /* For partial paths, scale row estimate. */ - if (path->jpath.path.parallel_workers > 0) - { - double parallel_divisor = get_parallel_divisor(&path->jpath.path); - - path->jpath.path.rows = - clamp_row_est(path->jpath.path.rows / parallel_divisor); - } - - /* - * We could include disable_cost in the preliminary estimate, but that - * would amount to optimizing for the case where the join method is - * disabled, which doesn't seem like the way to bet. 
- */ - if (!enable_hashjoin) - startup_cost += disable_cost; + JoinCostWorkspace *workspace, + JoinPathExtraData *extra) +{ + Path *outer_path = path->jpath.outerjoinpath; + Path *inner_path = path->jpath.innerjoinpath; + double outer_path_rows = outer_path->rows; + double inner_path_rows = inner_path->rows; + List *hashclauses = path->path_hashclauses; + Cost startup_cost = workspace->startup_cost; + Cost run_cost = workspace->run_cost; + int numbuckets = workspace->numbuckets; + int numbatches = workspace->numbatches; + Cost cpu_per_tuple; + QualCost hash_qual_cost; + QualCost qp_qual_cost; + double hashjointuples; + double virtualbuckets; + Selectivity innerbucketsize; + ListCell *hcl; + + /* Mark the path with the correct row estimate */ + if (path->jpath.path.param_info) + path->jpath.path.rows = path->jpath.path.param_info->ppi_rows; + else + path->jpath.path.rows = path->jpath.path.parent->rows; + + /* For partial paths, scale row estimate. */ + if (path->jpath.path.parallel_workers > 0) + { + double parallel_divisor = get_parallel_divisor(&path->jpath.path); - /* mark the path with estimated # of batches */ - path->num_batches = numbatches; + path->jpath.path.rows = + clamp_row_est(path->jpath.path.rows / parallel_divisor); + } - /* and compute the number of "virtual" buckets in the whole join */ - virtualbuckets = (double) numbuckets * (double) numbatches; + /* + * We could include disable_cost in the preliminary estimate, but that + * would amount to optimizing for the case where the join method is + * disabled, which doesn't seem like the way to bet. + */ + if (!enable_hashjoin) + startup_cost += disable_cost; - /* - * Determine bucketsize fraction for inner relation. We use the smallest - * bucketsize estimated for any individual hashclause; this is undoubtedly - * conservative. - * - * BUT: if inner relation has been unique-ified, we can assume it's good - * for hashing. This is important both because it's the right answer, and - * because we avoid contaminating the cache with a value that's wrong for - * non-unique-ified paths. - */ - if (IsA(inner_path, UniquePath)) - innerbucketsize = 1.0 / virtualbuckets; - else - { - innerbucketsize = 1.0; - foreach(hcl, hashclauses) - { - RestrictInfo *restrictinfo = lfirst_node(RestrictInfo, hcl); - Selectivity thisbucketsize; + /* mark the path with estimated # of batches */ + path->num_batches = numbatches; - /* - * First we have to figure out which side of the hashjoin clause - * is the inner side. - * - * Since we tend to visit the same clauses over and over when - * planning a large query, we cache the bucketsize estimate in the - * RestrictInfo node to avoid repeated lookups of statistics. 
- */ - if (bms_is_subset(restrictinfo->right_relids, - inner_path->parent->relids)) - { - /* righthand side is inner */ - thisbucketsize = restrictinfo->right_bucketsize; - if (thisbucketsize < 0) - { - /* not cached yet */ - thisbucketsize = - estimate_hash_bucketsize(root, - get_rightop(restrictinfo->clause), - virtualbuckets); - restrictinfo->right_bucketsize = thisbucketsize; - } - } - else - { - Assert(bms_is_subset(restrictinfo->left_relids, - inner_path->parent->relids)); - /* lefthand side is inner */ - thisbucketsize = restrictinfo->left_bucketsize; - if (thisbucketsize < 0) - { - /* not cached yet */ - thisbucketsize = - estimate_hash_bucketsize(root, - get_leftop(restrictinfo->clause), - virtualbuckets); - restrictinfo->left_bucketsize = thisbucketsize; - } - } + /* and compute the number of "virtual" buckets in the whole join */ + virtualbuckets = (double) numbuckets * (double) numbatches; - if (innerbucketsize > thisbucketsize) - innerbucketsize = thisbucketsize; - } - } + /* + * Determine bucketsize fraction for inner relation. We use the smallest + * bucketsize estimated for any individual hashclause; this is undoubtedly + * conservative. + * + * BUT: if inner relation has been unique-ified, we can assume it's good + * for hashing. This is important both because it's the right answer, and + * because we avoid contaminating the cache with a value that's wrong for + * non-unique-ified paths. + */ + if (IsA(inner_path, UniquePath)) + innerbucketsize = 1.0 / virtualbuckets; + else + { + innerbucketsize = 1.0; + foreach(hcl, hashclauses) + { + RestrictInfo *restrictinfo = lfirst_node(RestrictInfo, hcl); + Selectivity thisbucketsize; + + /* + * First we have to figure out which side of the hashjoin clause + * is the inner side. + * + * Since we tend to visit the same clauses over and over when + * planning a large query, we cache the bucketsize estimate in the + * RestrictInfo node to avoid repeated lookups of statistics. + */ + if (bms_is_subset(restrictinfo->right_relids, + inner_path->parent->relids)) + { + /* righthand side is inner */ + thisbucketsize = restrictinfo->right_bucketsize; + if (thisbucketsize < 0) + { + /* not cached yet */ + thisbucketsize = + estimate_hash_bucketsize(root, + get_rightop(restrictinfo->clause), + virtualbuckets); + restrictinfo->right_bucketsize = thisbucketsize; + } + } + else + { + Assert(bms_is_subset(restrictinfo->left_relids, + inner_path->parent->relids)); + /* lefthand side is inner */ + thisbucketsize = restrictinfo->left_bucketsize; + if (thisbucketsize < 0) + { + /* not cached yet */ + thisbucketsize = + estimate_hash_bucketsize(root, + get_leftop(restrictinfo->clause), + virtualbuckets); + restrictinfo->left_bucketsize = thisbucketsize; + } + } + + if (innerbucketsize > thisbucketsize) + innerbucketsize = thisbucketsize; + } + } - /* - * Compute cost of the hashquals and qpquals (other restriction clauses) - * separately. - */ - cost_qual_eval(&hash_qual_cost, hashclauses, root); - cost_qual_eval(&qp_qual_cost, path->jpath.joinrestrictinfo, root); - qp_qual_cost.startup -= hash_qual_cost.startup; - qp_qual_cost.per_tuple -= hash_qual_cost.per_tuple; + /* + * Compute cost of the hashquals and qpquals (other restriction clauses) + * separately. 
+ */ + cost_qual_eval(&hash_qual_cost, hashclauses, root); + cost_qual_eval(&qp_qual_cost, path->jpath.joinrestrictinfo, root); + qp_qual_cost.startup -= hash_qual_cost.startup; + qp_qual_cost.per_tuple -= hash_qual_cost.per_tuple; - /* CPU costs */ + /* CPU costs */ +#ifdef __TBASE__ if (path->jpath.jointype == JOIN_SEMI || path->jpath.jointype == JOIN_ANTI || + path->jpath.jointype == JOIN_LEFT_SCALAR || extra->inner_unique) - { - double outer_matched_rows; - Selectivity inner_scan_frac; - - /* - * With a SEMI or ANTI join, or if the innerrel is known unique, the - * executor will stop after the first match. - * - * For an outer-rel row that has at least one match, we can expect the - * bucket scan to stop after a fraction 1/(match_count+1) of the - * bucket's rows, if the matches are evenly distributed. Since they - * probably aren't quite evenly distributed, we apply a fuzz factor of - * 2.0 to that fraction. (If we used a larger fuzz factor, we'd have - * to clamp inner_scan_frac to at most 1.0; but since match_count is - * at least 1, no such clamp is needed now.) - */ - outer_matched_rows = rint(outer_path_rows * extra->semifactors.outer_match_frac); - inner_scan_frac = 2.0 / (extra->semifactors.match_count + 1.0); - - startup_cost += hash_qual_cost.startup; - run_cost += hash_qual_cost.per_tuple * outer_matched_rows * - clamp_row_est(inner_path_rows * innerbucketsize * inner_scan_frac) * 0.5; - - /* - * For unmatched outer-rel rows, the picture is quite a lot different. - * In the first place, there is no reason to assume that these rows - * preferentially hit heavily-populated buckets; instead assume they - * are uncorrelated with the inner distribution and so they see an - * average bucket size of inner_path_rows / virtualbuckets. In the - * second place, it seems likely that they will have few if any exact - * hash-code matches and so very few of the tuples in the bucket will - * actually require eval of the hash quals. We don't have any good - * way to estimate how many will, but for the moment assume that the - * effective cost per bucket entry is one-tenth what it is for - * matchable tuples. - */ - run_cost += hash_qual_cost.per_tuple * - (outer_path_rows - outer_matched_rows) * - clamp_row_est(inner_path_rows / virtualbuckets) * 0.05; - - /* Get # of tuples that will pass the basic join */ - if (path->jpath.jointype == JOIN_SEMI) - hashjointuples = outer_matched_rows; - else - hashjointuples = outer_path_rows - outer_matched_rows; - } - else - { - /* - * The number of tuple comparisons needed is the number of outer - * tuples times the typical number of tuples in a hash bucket, which - * is the inner relation size times its bucketsize fraction. At each - * one, we need to evaluate the hashjoin quals. But actually, - * charging the full qual eval cost at each tuple is pessimistic, - * since we don't evaluate the quals unless the hash values match - * exactly. For lack of a better idea, halve the cost estimate to - * allow for that. - */ - startup_cost += hash_qual_cost.startup; - run_cost += hash_qual_cost.per_tuple * outer_path_rows * - clamp_row_est(inner_path_rows * innerbucketsize) * 0.5; - - /* - * Get approx # tuples passing the hashquals. We use - * approx_tuple_count here because we need an estimate done with - * JOIN_INNER semantics. 
- */ - hashjointuples = approx_tuple_count(root, &path->jpath, hashclauses); - } +#else + if (path->jpath.jointype == JOIN_SEMI || + path->jpath.jointype == JOIN_ANTI || + extra->inner_unique) +#endif + { + double outer_matched_rows; + Selectivity inner_scan_frac; + + /* + * With a SEMI or ANTI join, or if the innerrel is known unique, the + * executor will stop after the first match. + * + * For an outer-rel row that has at least one match, we can expect the + * bucket scan to stop after a fraction 1/(match_count+1) of the + * bucket's rows, if the matches are evenly distributed. Since they + * probably aren't quite evenly distributed, we apply a fuzz factor of + * 2.0 to that fraction. (If we used a larger fuzz factor, we'd have + * to clamp inner_scan_frac to at most 1.0; but since match_count is + * at least 1, no such clamp is needed now.) + */ + outer_matched_rows = rint(outer_path_rows * extra->semifactors.outer_match_frac); + inner_scan_frac = 2.0 / (extra->semifactors.match_count + 1.0); + + startup_cost += hash_qual_cost.startup; + run_cost += hash_qual_cost.per_tuple * outer_matched_rows * + clamp_row_est(inner_path_rows * innerbucketsize * inner_scan_frac) * 0.5; + + /* + * For unmatched outer-rel rows, the picture is quite a lot different. + * In the first place, there is no reason to assume that these rows + * preferentially hit heavily-populated buckets; instead assume they + * are uncorrelated with the inner distribution and so they see an + * average bucket size of inner_path_rows / virtualbuckets. In the + * second place, it seems likely that they will have few if any exact + * hash-code matches and so very few of the tuples in the bucket will + * actually require eval of the hash quals. We don't have any good + * way to estimate how many will, but for the moment assume that the + * effective cost per bucket entry is one-tenth what it is for + * matchable tuples. + */ + run_cost += hash_qual_cost.per_tuple * + (outer_path_rows - outer_matched_rows) * + clamp_row_est(inner_path_rows / virtualbuckets) * 0.05; + + /* Get # of tuples that will pass the basic join */ +#ifdef __TBASE__ + if (path->jpath.jointype == JOIN_SEMI || path->jpath.jointype == JOIN_LEFT_SCALAR) +#else + if (path->jpath.jointype == JOIN_SEMI) +#endif + hashjointuples = outer_matched_rows; + else + hashjointuples = outer_path_rows - outer_matched_rows; + } + else + { + /* + * The number of tuple comparisons needed is the number of outer + * tuples times the typical number of tuples in a hash bucket, which + * is the inner relation size times its bucketsize fraction. At each + * one, we need to evaluate the hashjoin quals. But actually, + * charging the full qual eval cost at each tuple is pessimistic, + * since we don't evaluate the quals unless the hash values match + * exactly. For lack of a better idea, halve the cost estimate to + * allow for that. + */ + startup_cost += hash_qual_cost.startup; + run_cost += hash_qual_cost.per_tuple * outer_path_rows * + clamp_row_est(inner_path_rows * innerbucketsize) * 0.5; + + /* + * Get approx # tuples passing the hashquals. We use + * approx_tuple_count here because we need an estimate done with + * JOIN_INNER semantics. + */ + hashjointuples = approx_tuple_count(root, &path->jpath, hashclauses); + } - /* - * For each tuple that gets through the hashjoin proper, we charge - * cpu_tuple_cost plus the cost of evaluating additional restriction - * clauses that are to be applied at the join. 
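For reference, a rough standalone sketch of the comparison count used in the plain inner-join branch above: each outer tuple probes one bucket whose expected size is the inner row count times the bucket-size fraction, and the qual cost is halved because most probes fail on the hash value alone. Illustrative C only, with invented numbers; clamp_row_est is omitted for brevity; not part of the patch.

#include <stdio.h>

int
main(void)
{
    double outer_path_rows = 10000.0;       /* assumed */
    double inner_path_rows = 5000.0;        /* assumed */
    double numbuckets = 1024.0;             /* assumed, from initial costing */
    double numbatches = 2.0;                /* assumed, from initial costing */
    double innerbucketsize = 1.0 / 2048.0;  /* assumed smallest per-clause estimate */
    double hash_qual_per_tuple = 0.0025;    /* assumed per-tuple qual eval cost */
    double virtualbuckets;
    double bucket_rows;
    double comparisons;

    virtualbuckets = numbuckets * numbatches;

    /* expected rows in the probed bucket (clamp_row_est omitted) */
    bucket_rows = inner_path_rows * innerbucketsize;

    /* halve the qual cost: most probes fail on the hash value alone */
    comparisons = outer_path_rows * bucket_rows * 0.5;

    printf("virtualbuckets=%.0f bucket_rows=%.2f qual_run_cost=%.2f\n",
           virtualbuckets, bucket_rows, hash_qual_per_tuple * comparisons);
    return 0;
}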
(This is pessimistic since - * not all of the quals may get evaluated at each tuple.) - */ - startup_cost += qp_qual_cost.startup; - cpu_per_tuple = cpu_tuple_cost + qp_qual_cost.per_tuple; - run_cost += cpu_per_tuple * hashjointuples; + /* + * For each tuple that gets through the hashjoin proper, we charge + * cpu_tuple_cost plus the cost of evaluating additional restriction + * clauses that are to be applied at the join. (This is pessimistic since + * not all of the quals may get evaluated at each tuple.) + */ + startup_cost += qp_qual_cost.startup; + cpu_per_tuple = cpu_tuple_cost + qp_qual_cost.per_tuple; + run_cost += cpu_per_tuple * hashjointuples; - /* tlist eval costs are paid per output row, not per tuple scanned */ - startup_cost += path->jpath.path.pathtarget->cost.startup; - run_cost += path->jpath.path.pathtarget->cost.per_tuple * path->jpath.path.rows; + /* tlist eval costs are paid per output row, not per tuple scanned */ + startup_cost += path->jpath.path.pathtarget->cost.startup; + run_cost += path->jpath.path.pathtarget->cost.per_tuple * path->jpath.path.rows; - path->jpath.path.startup_cost = startup_cost; - path->jpath.path.total_cost = startup_cost + run_cost; + path->jpath.path.startup_cost = startup_cost; + path->jpath.path.total_cost = startup_cost + run_cost; } @@ -4277,138 +4311,141 @@ get_parameterized_joinrel_size(PlannerInfo *root, RelOptInfo *rel, */ static double calc_joinrel_size_estimate(PlannerInfo *root, - RelOptInfo *outer_rel, - RelOptInfo *inner_rel, - double outer_rows, - double inner_rows, - SpecialJoinInfo *sjinfo, - List *restrictlist_in) -{// #lizard forgives - /* This apparently-useless variable dodges a compiler bug in VS2013: */ - List *restrictlist = restrictlist_in; - JoinType jointype = sjinfo->jointype; - Selectivity fkselec; - Selectivity jselec; - Selectivity pselec; - double nrows; - - /* - * Compute joinclause selectivity. Note that we are only considering - * clauses that become restriction clauses at this join level; we are not - * double-counting them because they were not considered in estimating the - * sizes of the component rels. - * - * First, see whether any of the joinclauses can be matched to known FK - * constraints. If so, drop those clauses from the restrictlist, and - * instead estimate their selectivity using FK semantics. (We do this - * without regard to whether said clauses are local or "pushed down". - * Probably, an FK-matching clause could never be seen as pushed down at - * an outer join, since it would be strict and hence would be grounds for - * join strength reduction.) fkselec gets the net selectivity for - * FK-matching clauses, or 1.0 if there are none. - */ - fkselec = get_foreign_key_join_selectivity(root, - outer_rel->relids, - inner_rel->relids, - sjinfo, - &restrictlist); - - /* - * For an outer join, we have to distinguish the selectivity of the join's - * own clauses (JOIN/ON conditions) from any clauses that were "pushed - * down". For inner joins we just count them all as joinclauses. 
- */ - if (IS_OUTER_JOIN(jointype)) - { - List *joinquals = NIL; - List *pushedquals = NIL; - ListCell *l; - - /* Grovel through the clauses to separate into two lists */ - foreach(l, restrictlist) - { - RestrictInfo *rinfo = lfirst_node(RestrictInfo, l); + RelOptInfo *outer_rel, + RelOptInfo *inner_rel, + double outer_rows, + double inner_rows, + SpecialJoinInfo *sjinfo, + List *restrictlist_in) +{ + /* This apparently-useless variable dodges a compiler bug in VS2013: */ + List *restrictlist = restrictlist_in; + JoinType jointype = sjinfo->jointype; + Selectivity fkselec; + Selectivity jselec; + Selectivity pselec; + double nrows; - if (rinfo->is_pushed_down) - pushedquals = lappend(pushedquals, rinfo); - else - joinquals = lappend(joinquals, rinfo); - } + /* + * Compute joinclause selectivity. Note that we are only considering + * clauses that become restriction clauses at this join level; we are not + * double-counting them because they were not considered in estimating the + * sizes of the component rels. + * + * First, see whether any of the joinclauses can be matched to known FK + * constraints. If so, drop those clauses from the restrictlist, and + * instead estimate their selectivity using FK semantics. (We do this + * without regard to whether said clauses are local or "pushed down". + * Probably, an FK-matching clause could never be seen as pushed down at + * an outer join, since it would be strict and hence would be grounds for + * join strength reduction.) fkselec gets the net selectivity for + * FK-matching clauses, or 1.0 if there are none. + */ + fkselec = get_foreign_key_join_selectivity(root, + outer_rel->relids, + inner_rel->relids, + sjinfo, + &restrictlist); - /* Get the separate selectivities */ - jselec = clauselist_selectivity(root, - joinquals, - 0, - jointype, - sjinfo); - pselec = clauselist_selectivity(root, - pushedquals, - 0, - jointype, - sjinfo); - - /* Avoid leaking a lot of ListCells */ - list_free(joinquals); - list_free(pushedquals); - } - else - { - jselec = clauselist_selectivity(root, - restrictlist, - 0, - jointype, - sjinfo); - pselec = 0.0; /* not used, keep compiler quiet */ - } + /* + * For an outer join, we have to distinguish the selectivity of the join's + * own clauses (JOIN/ON conditions) from any clauses that were "pushed + * down". For inner joins we just count them all as joinclauses. + */ + if (IS_OUTER_JOIN(jointype)) + { + List *joinquals = NIL; + List *pushedquals = NIL; + ListCell *l; + + /* Grovel through the clauses to separate into two lists */ + foreach(l, restrictlist) + { + RestrictInfo *rinfo = lfirst_node(RestrictInfo, l); + + if (rinfo->is_pushed_down) + pushedquals = lappend(pushedquals, rinfo); + else + joinquals = lappend(joinquals, rinfo); + } + + /* Get the separate selectivities */ + jselec = clauselist_selectivity(root, + joinquals, + 0, + jointype, + sjinfo); + pselec = clauselist_selectivity(root, + pushedquals, + 0, + jointype, + sjinfo); + + /* Avoid leaking a lot of ListCells */ + list_free(joinquals); + list_free(pushedquals); + } + else + { + jselec = clauselist_selectivity(root, + restrictlist, + 0, + jointype, + sjinfo); + pselec = 0.0; /* not used, keep compiler quiet */ + } - /* - * Basically, we multiply size of Cartesian product by selectivity. - * - * If we are doing an outer join, take that into account: the joinqual - * selectivity has to be clamped using the knowledge that the output must - * be at least as large as the non-nullable input. 
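For reference, a rough standalone sketch of the size arithmetic described above for the JOIN_LEFT case: Cartesian product times selectivity, clamped so the estimate is never smaller than the outer (non-nullable) side, then scaled by any pushed-down-qual selectivity. Illustrative C only, with invented numbers; not part of the patch.

#include <stdio.h>

int
main(void)
{
    double outer_rows = 1000.0;  /* assumed */
    double inner_rows = 200.0;   /* assumed */
    double fkselec = 1.0;        /* assumed: no FK-matched clauses */
    double jselec = 0.002;       /* assumed joinclause selectivity */
    double pselec = 0.5;         /* assumed pushed-down qual selectivity */
    double nrows;

    /* JOIN_LEFT: clamp to outer_rows, then apply pushed-down quals */
    nrows = outer_rows * inner_rows * fkselec * jselec;
    if (nrows < outer_rows)
        nrows = outer_rows;
    nrows *= pselec;

    printf("estimated LEFT join rows = %.0f\n", nrows);
    return 0;
}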
However, any - * pushed-down quals are applied after the outer join, so their - * selectivity applies fully. - * - * For JOIN_SEMI and JOIN_ANTI, the selectivity is defined as the fraction - * of LHS rows that have matches, and we apply that straightforwardly. - */ - switch (jointype) - { - case JOIN_INNER: - nrows = outer_rows * inner_rows * fkselec * jselec; - /* pselec not used */ - break; - case JOIN_LEFT: - nrows = outer_rows * inner_rows * fkselec * jselec; - if (nrows < outer_rows) - nrows = outer_rows; - nrows *= pselec; - break; - case JOIN_FULL: - nrows = outer_rows * inner_rows * fkselec * jselec; - if (nrows < outer_rows) - nrows = outer_rows; - if (nrows < inner_rows) - nrows = inner_rows; - nrows *= pselec; - break; + /* + * Basically, we multiply size of Cartesian product by selectivity. + * + * If we are doing an outer join, take that into account: the joinqual + * selectivity has to be clamped using the knowledge that the output must + * be at least as large as the non-nullable input. However, any + * pushed-down quals are applied after the outer join, so their + * selectivity applies fully. + * + * For JOIN_SEMI and JOIN_ANTI, the selectivity is defined as the fraction + * of LHS rows that have matches, and we apply that straightforwardly. + */ + switch (jointype) + { + case JOIN_INNER: + nrows = outer_rows * inner_rows * fkselec * jselec; + /* pselec not used */ + break; + case JOIN_LEFT: + nrows = outer_rows * inner_rows * fkselec * jselec; + if (nrows < outer_rows) + nrows = outer_rows; + nrows *= pselec; + break; + case JOIN_FULL: + nrows = outer_rows * inner_rows * fkselec * jselec; + if (nrows < outer_rows) + nrows = outer_rows; + if (nrows < inner_rows) + nrows = inner_rows; + nrows *= pselec; + break; case JOIN_SEMI: - nrows = outer_rows * fkselec * jselec; - /* pselec not used */ - break; - case JOIN_ANTI: - nrows = outer_rows * (1.0 - fkselec * jselec); - nrows *= pselec; - break; - default: - /* other values not expected here */ - elog(ERROR, "unrecognized join type: %d", (int) jointype); - nrows = 0; /* keep compiler quiet */ - break; - } +#ifdef __TBASE__ + case JOIN_LEFT_SCALAR: +#endif + nrows = outer_rows * fkselec * jselec; + /* pselec not used */ + break; + case JOIN_ANTI: + nrows = outer_rows * (1.0 - fkselec * jselec); + nrows *= pselec; + break; + default: + /* other values not expected here */ + elog(ERROR, "unrecognized join type: %d", (int) jointype); + nrows = 0; /* keep compiler quiet */ + break; + } - return clamp_row_est(nrows); + return clamp_row_est(nrows); } /* @@ -4428,202 +4465,211 @@ calc_joinrel_size_estimate(PlannerInfo *root, */ static Selectivity get_foreign_key_join_selectivity(PlannerInfo *root, - Relids outer_relids, - Relids inner_relids, - SpecialJoinInfo *sjinfo, - List **restrictlist) -{// #lizard forgives - Selectivity fkselec = 1.0; - JoinType jointype = sjinfo->jointype; - List *worklist = *restrictlist; - ListCell *lc; - - /* Consider each FK constraint that is known to match the query */ - foreach(lc, root->fkey_list) - { - ForeignKeyOptInfo *fkinfo = (ForeignKeyOptInfo *) lfirst(lc); - bool ref_is_outer; - List *removedlist; - ListCell *cell; - ListCell *prev; - ListCell *next; - - /* - * This FK is not relevant unless it connects a baserel on one side of - * this join to a baserel on the other side. 
- */ - if (bms_is_member(fkinfo->con_relid, outer_relids) && - bms_is_member(fkinfo->ref_relid, inner_relids)) - ref_is_outer = false; - else if (bms_is_member(fkinfo->ref_relid, outer_relids) && - bms_is_member(fkinfo->con_relid, inner_relids)) - ref_is_outer = true; - else - continue; - - /* - * If we're dealing with a semi/anti join, and the FK's referenced - * relation is on the outside, then knowledge of the FK doesn't help - * us figure out what we need to know (which is the fraction of outer - * rows that have matches). On the other hand, if the referenced rel - * is on the inside, then all outer rows must have matches in the - * referenced table (ignoring nulls). But any restriction or join - * clauses that filter that table will reduce the fraction of matches. - * We can account for restriction clauses, but it's too hard to guess - * how many table rows would get through a join that's inside the RHS. - * Hence, if either case applies, punt and ignore the FK. - */ - if ((jointype == JOIN_SEMI || jointype == JOIN_ANTI) && - (ref_is_outer || bms_membership(inner_relids) != BMS_SINGLETON)) - continue; - - /* - * Modify the restrictlist by removing clauses that match the FK (and - * putting them into removedlist instead). It seems unsafe to modify - * the originally-passed List structure, so we make a shallow copy the - * first time through. - */ - if (worklist == *restrictlist) - worklist = list_copy(worklist); - - removedlist = NIL; - prev = NULL; - for (cell = list_head(worklist); cell; cell = next) - { - RestrictInfo *rinfo = (RestrictInfo *) lfirst(cell); - bool remove_it = false; - int i; - - next = lnext(cell); - /* Drop this clause if it matches any column of the FK */ - for (i = 0; i < fkinfo->nkeys; i++) - { - if (rinfo->parent_ec) - { - /* - * EC-derived clauses can only match by EC. It is okay to - * consider any clause derived from the same EC as - * matching the FK: even if equivclass.c chose to generate - * a clause equating some other pair of Vars, it could - * have generated one equating the FK's Vars. So for - * purposes of estimation, we can act as though it did so. - * - * Note: checking parent_ec is a bit of a cheat because - * there are EC-derived clauses that don't have parent_ec - * set; but such clauses must compare expressions that - * aren't just Vars, so they cannot match the FK anyway. - */ - if (fkinfo->eclass[i] == rinfo->parent_ec) - { - remove_it = true; - break; - } - } - else - { - /* - * Otherwise, see if rinfo was previously matched to FK as - * a "loose" clause. - */ - if (list_member_ptr(fkinfo->rinfos[i], rinfo)) - { - remove_it = true; - break; - } - } - } - if (remove_it) - { - worklist = list_delete_cell(worklist, cell, prev); - removedlist = lappend(removedlist, rinfo); - } - else - prev = cell; - } + Relids outer_relids, + Relids inner_relids, + SpecialJoinInfo *sjinfo, + List **restrictlist) +{ + Selectivity fkselec = 1.0; + JoinType jointype = sjinfo->jointype; + List *worklist = *restrictlist; + ListCell *lc; - /* - * If we failed to remove all the matching clauses we expected to - * find, chicken out and ignore this FK; applying its selectivity - * might result in double-counting. Put any clauses we did manage to - * remove back into the worklist. - * - * Since the matching clauses are known not outerjoin-delayed, they - * should certainly have appeared in the initial joinclause list. If - * we didn't find them, they must have been matched to, and removed - * by, some other FK in a previous iteration of this loop. 
(A likely - * case is that two FKs are matched to the same EC; there will be only - * one EC-derived clause in the initial list, so the first FK will - * consume it.) Applying both FKs' selectivity independently risks - * underestimating the join size; in particular, this would undo one - * of the main things that ECs were invented for, namely to avoid - * double-counting the selectivity of redundant equality conditions. - * Later we might think of a reasonable way to combine the estimates, - * but for now, just punt, since this is a fairly uncommon situation. - */ - if (list_length(removedlist) != - (fkinfo->nmatched_ec + fkinfo->nmatched_ri)) - { - worklist = list_concat(worklist, removedlist); + /* Consider each FK constraint that is known to match the query */ + foreach(lc, root->fkey_list) + { + ForeignKeyOptInfo *fkinfo = (ForeignKeyOptInfo *) lfirst(lc); + bool ref_is_outer; + List *removedlist; + ListCell *cell; + ListCell *prev; + ListCell *next; + + /* + * This FK is not relevant unless it connects a baserel on one side of + * this join to a baserel on the other side. + */ + if (bms_is_member(fkinfo->con_relid, outer_relids) && + bms_is_member(fkinfo->ref_relid, inner_relids)) + ref_is_outer = false; + else if (bms_is_member(fkinfo->ref_relid, outer_relids) && + bms_is_member(fkinfo->con_relid, inner_relids)) + ref_is_outer = true; + else + continue; + + /* + * If we're dealing with a semi/anti join, and the FK's referenced + * relation is on the outside, then knowledge of the FK doesn't help + * us figure out what we need to know (which is the fraction of outer + * rows that have matches). On the other hand, if the referenced rel + * is on the inside, then all outer rows must have matches in the + * referenced table (ignoring nulls). But any restriction or join + * clauses that filter that table will reduce the fraction of matches. + * We can account for restriction clauses, but it's too hard to guess + * how many table rows would get through a join that's inside the RHS. + * Hence, if either case applies, punt and ignore the FK. + */ +#ifdef __TBASE__ + if ((jointype == JOIN_SEMI || jointype == JOIN_ANTI || jointype == JOIN_LEFT_SCALAR) && + (ref_is_outer || bms_membership(inner_relids) != BMS_SINGLETON)) continue; - } - - /* - * Finally we get to the payoff: estimate selectivity using the - * knowledge that each referencing row will match exactly one row in - * the referenced table. - * - * XXX that's not true in the presence of nulls in the referencing - * column(s), so in principle we should derate the estimate for those. - * However (1) if there are any strict restriction clauses for the - * referencing column(s) elsewhere in the query, derating here would - * be double-counting the null fraction, and (2) it's not very clear - * how to combine null fractions for multiple referencing columns. So - * we do nothing for now about correcting for nulls. - * - * XXX another point here is that if either side of an FK constraint - * is an inheritance parent, we estimate as though the constraint - * covers all its children as well. This is not an unreasonable - * assumption for a referencing table, ie the user probably applied - * identical constraints to all child tables (though perhaps we ought - * to check that). But it's not possible to have done that for a - * referenced table. Fortunately, precisely because that doesn't - * work, it is uncommon in practice to have an FK referencing a parent - * table. So, at least for now, disregard inheritance here. 
- */ - if (jointype == JOIN_SEMI || jointype == JOIN_ANTI) - { - /* - * For JOIN_SEMI and JOIN_ANTI, we only get here when the FK's - * referenced table is exactly the inside of the join. The join - * selectivity is defined as the fraction of LHS rows that have - * matches. The FK implies that every LHS row has a match *in the - * referenced table*; but any restriction clauses on it will - * reduce the number of matches. Hence we take the join - * selectivity as equal to the selectivity of the table's - * restriction clauses, which is rows / tuples; but we must guard - * against tuples == 0. - */ - RelOptInfo *ref_rel = find_base_rel(root, fkinfo->ref_relid); - double ref_tuples = Max(ref_rel->tuples, 1.0); - - fkselec *= ref_rel->rows / ref_tuples; - } - else - { - /* - * Otherwise, selectivity is exactly 1/referenced-table-size; but - * guard against tuples == 0. Note we should use the raw table - * tuple count, not any estimate of its filtered or joined size. - */ - RelOptInfo *ref_rel = find_base_rel(root, fkinfo->ref_relid); - double ref_tuples = Max(ref_rel->tuples, 1.0); - - fkselec *= 1.0 / ref_tuples; - } - } +#else + if ((jointype == JOIN_SEMI || jointype == JOIN_ANTI) && + (ref_is_outer || bms_membership(inner_relids) != BMS_SINGLETON)) + continue; +#endif + /* + * Modify the restrictlist by removing clauses that match the FK (and + * putting them into removedlist instead). It seems unsafe to modify + * the originally-passed List structure, so we make a shallow copy the + * first time through. + */ + if (worklist == *restrictlist) + worklist = list_copy(worklist); + + removedlist = NIL; + prev = NULL; + for (cell = list_head(worklist); cell; cell = next) + { + RestrictInfo *rinfo = (RestrictInfo *) lfirst(cell); + bool remove_it = false; + int i; + + next = lnext(cell); + /* Drop this clause if it matches any column of the FK */ + for (i = 0; i < fkinfo->nkeys; i++) + { + if (rinfo->parent_ec) + { + /* + * EC-derived clauses can only match by EC. It is okay to + * consider any clause derived from the same EC as + * matching the FK: even if equivclass.c chose to generate + * a clause equating some other pair of Vars, it could + * have generated one equating the FK's Vars. So for + * purposes of estimation, we can act as though it did so. + * + * Note: checking parent_ec is a bit of a cheat because + * there are EC-derived clauses that don't have parent_ec + * set; but such clauses must compare expressions that + * aren't just Vars, so they cannot match the FK anyway. + */ + if (fkinfo->eclass[i] == rinfo->parent_ec) + { + remove_it = true; + break; + } + } + else + { + /* + * Otherwise, see if rinfo was previously matched to FK as + * a "loose" clause. + */ + if (list_member_ptr(fkinfo->rinfos[i], rinfo)) + { + remove_it = true; + break; + } + } + } + if (remove_it) + { + worklist = list_delete_cell(worklist, cell, prev); + removedlist = lappend(removedlist, rinfo); + } + else + prev = cell; + } + + /* + * If we failed to remove all the matching clauses we expected to + * find, chicken out and ignore this FK; applying its selectivity + * might result in double-counting. Put any clauses we did manage to + * remove back into the worklist. + * + * Since the matching clauses are known not outerjoin-delayed, they + * should certainly have appeared in the initial joinclause list. If + * we didn't find them, they must have been matched to, and removed + * by, some other FK in a previous iteration of this loop. 
(A likely + * case is that two FKs are matched to the same EC; there will be only + * one EC-derived clause in the initial list, so the first FK will + * consume it.) Applying both FKs' selectivity independently risks + * underestimating the join size; in particular, this would undo one + * of the main things that ECs were invented for, namely to avoid + * double-counting the selectivity of redundant equality conditions. + * Later we might think of a reasonable way to combine the estimates, + * but for now, just punt, since this is a fairly uncommon situation. + */ + if (list_length(removedlist) != + (fkinfo->nmatched_ec + fkinfo->nmatched_ri)) + { + worklist = list_concat(worklist, removedlist); + continue; + } + + /* + * Finally we get to the payoff: estimate selectivity using the + * knowledge that each referencing row will match exactly one row in + * the referenced table. + * + * XXX that's not true in the presence of nulls in the referencing + * column(s), so in principle we should derate the estimate for those. + * However (1) if there are any strict restriction clauses for the + * referencing column(s) elsewhere in the query, derating here would + * be double-counting the null fraction, and (2) it's not very clear + * how to combine null fractions for multiple referencing columns. So + * we do nothing for now about correcting for nulls. + * + * XXX another point here is that if either side of an FK constraint + * is an inheritance parent, we estimate as though the constraint + * covers all its children as well. This is not an unreasonable + * assumption for a referencing table, ie the user probably applied + * identical constraints to all child tables (though perhaps we ought + * to check that). But it's not possible to have done that for a + * referenced table. Fortunately, precisely because that doesn't + * work, it is uncommon in practice to have an FK referencing a parent + * table. So, at least for now, disregard inheritance here. + */ +#ifdef __TBASE__ + if (jointype == JOIN_SEMI || jointype == JOIN_ANTI || jointype == JOIN_LEFT_SCALAR) +#else + if (jointype == JOIN_SEMI || jointype == JOIN_ANTI) +#endif + { + /* + * For JOIN_SEMI and JOIN_ANTI, we only get here when the FK's + * referenced table is exactly the inside of the join. The join + * selectivity is defined as the fraction of LHS rows that have + * matches. The FK implies that every LHS row has a match *in the + * referenced table*; but any restriction clauses on it will + * reduce the number of matches. Hence we take the join + * selectivity as equal to the selectivity of the table's + * restriction clauses, which is rows / tuples; but we must guard + * against tuples == 0. + */ + RelOptInfo *ref_rel = find_base_rel(root, fkinfo->ref_relid); + double ref_tuples = Max(ref_rel->tuples, 1.0); + + fkselec *= ref_rel->rows / ref_tuples; + } + else + { + /* + * Otherwise, selectivity is exactly 1/referenced-table-size; but + * guard against tuples == 0. Note we should use the raw table + * tuple count, not any estimate of its filtered or joined size. 
+ */ + RelOptInfo *ref_rel = find_base_rel(root, fkinfo->ref_relid); + double ref_tuples = Max(ref_rel->tuples, 1.0); + + fkselec *= 1.0 / ref_tuples; + } + } - *restrictlist = worklist; - return fkselec; + *restrictlist = worklist; + return fkselec; } /* diff --git a/src/backend/optimizer/path/indxpath.c b/src/backend/optimizer/path/indxpath.c index a738fc02..b377f0d6 100644 --- a/src/backend/optimizer/path/indxpath.c +++ b/src/backend/optimizer/path/indxpath.c @@ -2018,30 +2018,36 @@ adjust_rowcount_for_semijoins(PlannerInfo *root, Index outer_relid, double rowcount) { - ListCell *lc; + ListCell *lc; - foreach(lc, root->join_info_list) - { - SpecialJoinInfo *sjinfo = (SpecialJoinInfo *) lfirst(lc); + foreach(lc, root->join_info_list) + { + SpecialJoinInfo *sjinfo = (SpecialJoinInfo *) lfirst(lc); - if (sjinfo->jointype == JOIN_SEMI && +#ifdef __TBASE__ + if ((sjinfo->jointype == JOIN_SEMI || sjinfo->jointype == JOIN_LEFT_SCALAR ) && bms_is_member(cur_relid, sjinfo->syn_lefthand) && bms_is_member(outer_relid, sjinfo->syn_righthand)) - { - /* Estimate number of unique-ified rows */ - double nraw; - double nunique; - - nraw = approximate_joinrel_size(root, sjinfo->syn_righthand); - nunique = estimate_num_groups(root, - sjinfo->semi_rhs_exprs, - nraw, - NULL); - if (rowcount > nunique) - rowcount = nunique; - } - } - return rowcount; +#else + if (sjinfo->jointype == JOIN_SEMI && + bms_is_member(cur_relid, sjinfo->syn_lefthand) && + bms_is_member(outer_relid, sjinfo->syn_righthand)) +#endif + { + /* Estimate number of unique-ified rows */ + double nraw; + double nunique; + + nraw = approximate_joinrel_size(root, sjinfo->syn_righthand); + nunique = estimate_num_groups(root, + sjinfo->semi_rhs_exprs, + nraw, + NULL); + if (rowcount > nunique) + rowcount = nunique; + } + } + return rowcount; } /* diff --git a/src/backend/optimizer/path/joinpath.c b/src/backend/optimizer/path/joinpath.c index 0b01ed12..c832b9d8 100644 --- a/src/backend/optimizer/path/joinpath.c +++ b/src/backend/optimizer/path/joinpath.c @@ -108,155 +108,163 @@ static void generate_mergejoin_paths(PlannerInfo *root, */ void add_paths_to_joinrel(PlannerInfo *root, - RelOptInfo *joinrel, - RelOptInfo *outerrel, - RelOptInfo *innerrel, - JoinType jointype, - SpecialJoinInfo *sjinfo, - List *restrictlist) -{// #lizard forgives - JoinPathExtraData extra; - bool mergejoin_allowed = true; - ListCell *lc; - - extra.restrictlist = restrictlist; - extra.mergeclause_list = NIL; - extra.sjinfo = sjinfo; - extra.param_source_rels = NULL; - - /* - * See if the inner relation is provably unique for this outer rel. - * - * We have some special cases: for JOIN_SEMI and JOIN_ANTI, it doesn't - * matter since the executor can make the equivalent optimization anyway; - * we need not expend planner cycles on proofs. For JOIN_UNIQUE_INNER, we - * must be considering a semijoin whose inner side is not provably unique - * (else reduce_unique_semijoins would've simplified it), so there's no - * point in calling innerrel_is_unique. However, if the LHS covers all of - * the semijoin's min_lefthand, then it's appropriate to set inner_unique - * because the path produced by create_unique_path will be unique relative - * to the LHS. (If we have an LHS that's only part of the min_lefthand, - * that is *not* true.) For JOIN_UNIQUE_OUTER, pass JOIN_INNER to avoid - * letting that value escape this module. 
- */ - switch (jointype) - { - case JOIN_SEMI: - case JOIN_ANTI: - extra.inner_unique = false; /* well, unproven */ - break; - case JOIN_UNIQUE_INNER: - extra.inner_unique = bms_is_subset(sjinfo->min_lefthand, - outerrel->relids); - break; - case JOIN_UNIQUE_OUTER: - extra.inner_unique = innerrel_is_unique(root, - outerrel->relids, - innerrel, - JOIN_INNER, - restrictlist, - false); - break; - default: - extra.inner_unique = innerrel_is_unique(root, - outerrel->relids, - innerrel, - jointype, - restrictlist, - false); - break; - } - - /* - * Find potential mergejoin clauses. We can skip this if we are not - * interested in doing a mergejoin. However, mergejoin may be our only - * way of implementing a full outer join, so override enable_mergejoin if - * it's a full join. - */ - if (enable_mergejoin || jointype == JOIN_FULL) - extra.mergeclause_list = select_mergejoin_clauses(root, - joinrel, - outerrel, - innerrel, - restrictlist, - jointype, - &mergejoin_allowed); + RelOptInfo *joinrel, + RelOptInfo *outerrel, + RelOptInfo *innerrel, + JoinType jointype, + SpecialJoinInfo *sjinfo, + List *restrictlist) +{ + JoinPathExtraData extra; + bool mergejoin_allowed = true; + ListCell *lc; + + extra.restrictlist = restrictlist; + extra.mergeclause_list = NIL; + extra.sjinfo = sjinfo; + extra.param_source_rels = NULL; + + /* + * See if the inner relation is provably unique for this outer rel. + * + * We have some special cases: for JOIN_SEMI and JOIN_ANTI, it doesn't + * matter since the executor can make the equivalent optimization anyway; + * we need not expend planner cycles on proofs. For JOIN_UNIQUE_INNER, we + * must be considering a semijoin whose inner side is not provably unique + * (else reduce_unique_semijoins would've simplified it), so there's no + * point in calling innerrel_is_unique. However, if the LHS covers all of + * the semijoin's min_lefthand, then it's appropriate to set inner_unique + * because the path produced by create_unique_path will be unique relative + * to the LHS. (If we have an LHS that's only part of the min_lefthand, + * that is *not* true.) For JOIN_UNIQUE_OUTER, pass JOIN_INNER to avoid + * letting that value escape this module. + */ + switch (jointype) + { + case JOIN_SEMI: + case JOIN_ANTI: +#ifdef __TBASE__ + case JOIN_LEFT_SCALAR: +#endif + extra.inner_unique = false; /* well, unproven */ + break; + case JOIN_UNIQUE_INNER: + extra.inner_unique = bms_is_subset(sjinfo->min_lefthand, + outerrel->relids); + break; + case JOIN_UNIQUE_OUTER: + extra.inner_unique = innerrel_is_unique(root, + outerrel->relids, + innerrel, + JOIN_INNER, + restrictlist, + false); + break; + default: + extra.inner_unique = innerrel_is_unique(root, + outerrel->relids, + innerrel, + jointype, + restrictlist, + false); + break; + } - /* - * If it's SEMI, ANTI, or inner_unique join, compute correction factors - * for cost estimation. These will be the same for all paths. - */ - if (jointype == JOIN_SEMI || jointype == JOIN_ANTI || extra.inner_unique) + /* + * Find potential mergejoin clauses. We can skip this if we are not + * interested in doing a mergejoin. However, mergejoin may be our only + * way of implementing a full outer join, so override enable_mergejoin if + * it's a full join. + */ + if (enable_mergejoin || jointype == JOIN_FULL) + extra.mergeclause_list = select_mergejoin_clauses(root, + joinrel, + outerrel, + innerrel, + restrictlist, + jointype, + &mergejoin_allowed); + + /* + * If it's SEMI, ANTI, or inner_unique join, compute correction factors + * for cost estimation. 
These will be the same for all paths. + */ +#ifdef __TBASE__ + if (jointype == JOIN_SEMI || jointype == JOIN_ANTI || + jointype == JOIN_LEFT_SCALAR || extra.inner_unique) +#else + if (jointype == JOIN_SEMI || jointype == JOIN_ANTI || extra.inner_unique) +#endif compute_semi_anti_join_factors(root, outerrel, innerrel, - jointype, sjinfo, restrictlist, - &extra.semifactors); - - /* - * Decide whether it's sensible to generate parameterized paths for this - * joinrel, and if so, which relations such paths should require. There - * is usually no need to create a parameterized result path unless there - * is a join order restriction that prevents joining one of our input rels - * directly to the parameter source rel instead of joining to the other - * input rel. (But see allow_star_schema_join().) This restriction - * reduces the number of parameterized paths we have to deal with at - * higher join levels, without compromising the quality of the resulting - * plan. We express the restriction as a Relids set that must overlap the - * parameterization of any proposed join path. - */ - foreach(lc, root->join_info_list) - { - SpecialJoinInfo *sjinfo2 = (SpecialJoinInfo *) lfirst(lc); - - /* - * SJ is relevant to this join if we have some part of its RHS - * (possibly not all of it), and haven't yet joined to its LHS. (This - * test is pretty simplistic, but should be sufficient considering the - * join has already been proven legal.) If the SJ is relevant, it - * presents constraints for joining to anything not in its RHS. - */ - if (bms_overlap(joinrel->relids, sjinfo2->min_righthand) && - !bms_overlap(joinrel->relids, sjinfo2->min_lefthand)) - extra.param_source_rels = bms_join(extra.param_source_rels, - bms_difference(root->all_baserels, - sjinfo2->min_righthand)); - - /* full joins constrain both sides symmetrically */ - if (sjinfo2->jointype == JOIN_FULL && - bms_overlap(joinrel->relids, sjinfo2->min_lefthand) && - !bms_overlap(joinrel->relids, sjinfo2->min_righthand)) - extra.param_source_rels = bms_join(extra.param_source_rels, - bms_difference(root->all_baserels, - sjinfo2->min_lefthand)); - } - - /* - * However, when a LATERAL subquery is involved, there will simply not be - * any paths for the joinrel that aren't parameterized by whatever the - * subquery is parameterized by, unless its parameterization is resolved - * within the joinrel. So we might as well allow additional dependencies - * on whatever residual lateral dependencies the joinrel will have. - */ - extra.param_source_rels = bms_add_members(extra.param_source_rels, - joinrel->lateral_relids); - - /* - * 1. Consider mergejoin paths where both relations must be explicitly - * sorted. Skip this if we can't mergejoin. - */ - if (mergejoin_allowed) - sort_inner_and_outer(root, joinrel, outerrel, innerrel, - jointype, &extra); + jointype, sjinfo, restrictlist, + &extra.semifactors); + + /* + * Decide whether it's sensible to generate parameterized paths for this + * joinrel, and if so, which relations such paths should require. There + * is usually no need to create a parameterized result path unless there + * is a join order restriction that prevents joining one of our input rels + * directly to the parameter source rel instead of joining to the other + * input rel. (But see allow_star_schema_join().) This restriction + * reduces the number of parameterized paths we have to deal with at + * higher join levels, without compromising the quality of the resulting + * plan. 
We express the restriction as a Relids set that must overlap the + * parameterization of any proposed join path. + */ + foreach(lc, root->join_info_list) + { + SpecialJoinInfo *sjinfo2 = (SpecialJoinInfo *) lfirst(lc); + + /* + * SJ is relevant to this join if we have some part of its RHS + * (possibly not all of it), and haven't yet joined to its LHS. (This + * test is pretty simplistic, but should be sufficient considering the + * join has already been proven legal.) If the SJ is relevant, it + * presents constraints for joining to anything not in its RHS. + */ + if (bms_overlap(joinrel->relids, sjinfo2->min_righthand) && + !bms_overlap(joinrel->relids, sjinfo2->min_lefthand)) + extra.param_source_rels = bms_join(extra.param_source_rels, + bms_difference(root->all_baserels, + sjinfo2->min_righthand)); + + /* full joins constrain both sides symmetrically */ + if (sjinfo2->jointype == JOIN_FULL && + bms_overlap(joinrel->relids, sjinfo2->min_lefthand) && + !bms_overlap(joinrel->relids, sjinfo2->min_righthand)) + extra.param_source_rels = bms_join(extra.param_source_rels, + bms_difference(root->all_baserels, + sjinfo2->min_lefthand)); + } - /* - * 2. Consider paths where the outer relation need not be explicitly - * sorted. This includes both nestloops and mergejoins where the outer - * path is already ordered. Again, skip this if we can't mergejoin. - * (That's okay because we know that nestloop can't handle right/full - * joins at all, so it wouldn't work in the prohibited cases either.) - */ - if (mergejoin_allowed) - match_unsorted_outer(root, joinrel, outerrel, innerrel, - jointype, &extra); + /* + * However, when a LATERAL subquery is involved, there will simply not be + * any paths for the joinrel that aren't parameterized by whatever the + * subquery is parameterized by, unless its parameterization is resolved + * within the joinrel. So we might as well allow additional dependencies + * on whatever residual lateral dependencies the joinrel will have. + */ + extra.param_source_rels = bms_add_members(extra.param_source_rels, + joinrel->lateral_relids); + + /* + * 1. Consider mergejoin paths where both relations must be explicitly + * sorted. Skip this if we can't mergejoin. + */ + if (mergejoin_allowed) + sort_inner_and_outer(root, joinrel, outerrel, innerrel, + jointype, &extra); + + /* + * 2. Consider paths where the outer relation need not be explicitly + * sorted. This includes both nestloops and mergejoins where the outer + * path is already ordered. Again, skip this if we can't mergejoin. + * (That's okay because we know that nestloop can't handle right/full + * joins at all, so it wouldn't work in the prohibited cases either.) + */ + if (mergejoin_allowed) + match_unsorted_outer(root, joinrel, outerrel, innerrel, + jointype, &extra); #ifdef NOT_USED @@ -1264,224 +1272,227 @@ generate_mergejoin_paths(PlannerInfo *root, */ static void match_unsorted_outer(PlannerInfo *root, - RelOptInfo *joinrel, - RelOptInfo *outerrel, - RelOptInfo *innerrel, - JoinType jointype, - JoinPathExtraData *extra) -{// #lizard forgives - JoinType save_jointype = jointype; - bool nestjoinOK; - bool useallclauses; - Path *inner_cheapest_total = innerrel->cheapest_total_path; - Path *matpath = NULL; - ListCell *lc1; - - /* - * Nestloop only supports inner, left, semi, and anti joins. Also, if we - * are doing a right or full mergejoin, we must use *all* the mergeclauses - * as join clauses, else we will not have a valid plan. 
(Although these - * two flags are currently inverses, keep them separate for clarity and - * possible future changes.) - */ - switch (jointype) - { - case JOIN_INNER: - case JOIN_LEFT: - case JOIN_SEMI: - case JOIN_ANTI: - nestjoinOK = true; - useallclauses = false; - break; - case JOIN_RIGHT: - case JOIN_FULL: - nestjoinOK = false; - useallclauses = true; - break; - case JOIN_UNIQUE_OUTER: - case JOIN_UNIQUE_INNER: - jointype = JOIN_INNER; - nestjoinOK = true; - useallclauses = false; - break; - default: - elog(ERROR, "unrecognized join type: %d", - (int) jointype); - nestjoinOK = false; /* keep compiler quiet */ - useallclauses = false; - break; - } - - /* - * If inner_cheapest_total is parameterized by the outer rel, ignore it; - * we will consider it below as a member of cheapest_parameterized_paths, - * but the other possibilities considered in this routine aren't usable. - */ - if (PATH_PARAM_BY_REL(inner_cheapest_total, outerrel)) - inner_cheapest_total = NULL; - - /* - * If we need to unique-ify the inner path, we will consider only the - * cheapest-total inner. - */ - if (save_jointype == JOIN_UNIQUE_INNER) - { - /* No way to do this with an inner path parameterized by outer rel */ - if (inner_cheapest_total == NULL) - return; - inner_cheapest_total = (Path *) - create_unique_path(root, innerrel, inner_cheapest_total, extra->sjinfo); - Assert(inner_cheapest_total); - } - else if (nestjoinOK) - { - /* - * Consider materializing the cheapest inner path, unless - * enable_material is off or the path in question materializes its - * output anyway. - */ - if (enable_material && inner_cheapest_total != NULL && - !ExecMaterializesOutput(inner_cheapest_total->pathtype)) - matpath = (Path *) - create_material_path(innerrel, inner_cheapest_total); - } - - foreach(lc1, outerrel->pathlist) - { - Path *outerpath = (Path *) lfirst(lc1); - List *merge_pathkeys; - - /* - * We cannot use an outer path that is parameterized by the inner rel. - */ - if (PATH_PARAM_BY_REL(outerpath, innerrel)) - continue; - - /* - * If we need to unique-ify the outer path, it's pointless to consider - * any but the cheapest outer. (XXX we don't consider parameterized - * outers, nor inners, for unique-ified cases. Should we?) - */ - if (save_jointype == JOIN_UNIQUE_OUTER) - { - if (outerpath != outerrel->cheapest_total_path) - continue; - outerpath = (Path *) create_unique_path(root, outerrel, - outerpath, extra->sjinfo); - Assert(outerpath); - } - - /* - * The result will have this sort order (even if it is implemented as - * a nestloop, and even if some of the mergeclauses are implemented by - * qpquals rather than as true mergeclauses): - */ - merge_pathkeys = build_join_pathkeys(root, joinrel, jointype, - outerpath->pathkeys); - - if (save_jointype == JOIN_UNIQUE_INNER) - { - /* - * Consider nestloop join, but only with the unique-ified cheapest - * inner path - */ - try_nestloop_path(root, - joinrel, - outerpath, - inner_cheapest_total, - merge_pathkeys, - jointype, - extra); - } - else if (nestjoinOK) - { - /* - * Consider nestloop joins using this outer path and various - * available paths for the inner relation. We consider the - * cheapest-total paths for each available parameterization of the - * inner relation, including the unparameterized case. 
- */ - ListCell *lc2; - - foreach(lc2, innerrel->cheapest_parameterized_paths) - { - Path *innerpath = (Path *) lfirst(lc2); - - try_nestloop_path(root, - joinrel, - outerpath, - innerpath, - merge_pathkeys, - jointype, - extra); - } - - /* Also consider materialized form of the cheapest inner path */ - if (matpath != NULL) - try_nestloop_path(root, - joinrel, - outerpath, - matpath, - merge_pathkeys, - jointype, - extra); - } - - /* Can't do anything else if outer path needs to be unique'd */ - if (save_jointype == JOIN_UNIQUE_OUTER) - continue; - - /* Can't do anything else if inner rel is parameterized by outer */ - if (inner_cheapest_total == NULL) - continue; - - /* Generate merge join paths */ - generate_mergejoin_paths(root, joinrel, innerrel, outerpath, - save_jointype, extra, useallclauses, - inner_cheapest_total, merge_pathkeys, - false); - } - - /* - * Consider partial nestloop and mergejoin plan if outerrel has any - * partial path and the joinrel is parallel-safe. However, we can't - * handle JOIN_UNIQUE_OUTER, because the outer path will be partial, and - * therefore we won't be able to properly guarantee uniqueness. Nor can - * we handle extra_lateral_rels, since partial paths must not be - * parameterized. Similarly, we can't handle JOIN_FULL and JOIN_RIGHT, - * because they can produce false null extended rows. - */ - if (joinrel->consider_parallel && - save_jointype != JOIN_UNIQUE_OUTER && - save_jointype != JOIN_FULL && - save_jointype != JOIN_RIGHT && - outerrel->partial_pathlist != NIL && - bms_is_empty(joinrel->lateral_relids)) - { - if (nestjoinOK) - consider_parallel_nestloop(root, joinrel, outerrel, innerrel, - save_jointype, extra); + RelOptInfo *joinrel, + RelOptInfo *outerrel, + RelOptInfo *innerrel, + JoinType jointype, + JoinPathExtraData *extra) +{ + JoinType save_jointype = jointype; + bool nestjoinOK; + bool useallclauses; + Path *inner_cheapest_total = innerrel->cheapest_total_path; + Path *matpath = NULL; + ListCell *lc1; + + /* + * Nestloop only supports inner, left, semi, and anti joins. Also, if we + * are doing a right or full mergejoin, we must use *all* the mergeclauses + * as join clauses, else we will not have a valid plan. (Although these + * two flags are currently inverses, keep them separate for clarity and + * possible future changes.) + */ + switch (jointype) + { + case JOIN_INNER: + case JOIN_LEFT: + case JOIN_SEMI: + case JOIN_ANTI: +#ifdef __TBASE__ + case JOIN_LEFT_SCALAR: +#endif + nestjoinOK = true; + useallclauses = false; + break; + case JOIN_RIGHT: + case JOIN_FULL: + nestjoinOK = false; + useallclauses = true; + break; + case JOIN_UNIQUE_OUTER: + case JOIN_UNIQUE_INNER: + jointype = JOIN_INNER; + nestjoinOK = true; + useallclauses = false; + break; + default: + elog(ERROR, "unrecognized join type: %d", + (int) jointype); + nestjoinOK = false; /* keep compiler quiet */ + useallclauses = false; + break; + } - /* - * If inner_cheapest_total is NULL or non parallel-safe then find the - * cheapest total parallel safe path. If doing JOIN_UNIQUE_INNER, we - * can't use any alternative inner path. - */ - if (inner_cheapest_total == NULL || - !inner_cheapest_total->parallel_safe) - { - if (save_jointype == JOIN_UNIQUE_INNER) - return; + /* + * If inner_cheapest_total is parameterized by the outer rel, ignore it; + * we will consider it below as a member of cheapest_parameterized_paths, + * but the other possibilities considered in this routine aren't usable. 
+ */ + if (PATH_PARAM_BY_REL(inner_cheapest_total, outerrel)) + inner_cheapest_total = NULL; + + /* + * If we need to unique-ify the inner path, we will consider only the + * cheapest-total inner. + */ + if (save_jointype == JOIN_UNIQUE_INNER) + { + /* No way to do this with an inner path parameterized by outer rel */ + if (inner_cheapest_total == NULL) + return; + inner_cheapest_total = (Path *) + create_unique_path(root, innerrel, inner_cheapest_total, extra->sjinfo); + Assert(inner_cheapest_total); + } + else if (nestjoinOK) + { + /* + * Consider materializing the cheapest inner path, unless + * enable_material is off or the path in question materializes its + * output anyway. + */ + if (enable_material && inner_cheapest_total != NULL && + !ExecMaterializesOutput(inner_cheapest_total->pathtype)) + matpath = (Path *) + create_material_path(innerrel, inner_cheapest_total); + } - inner_cheapest_total = get_cheapest_parallel_safe_total_inner( - innerrel->pathlist); - } + foreach(lc1, outerrel->pathlist) + { + Path *outerpath = (Path *) lfirst(lc1); + List *merge_pathkeys; + + /* + * We cannot use an outer path that is parameterized by the inner rel. + */ + if (PATH_PARAM_BY_REL(outerpath, innerrel)) + continue; + + /* + * If we need to unique-ify the outer path, it's pointless to consider + * any but the cheapest outer. (XXX we don't consider parameterized + * outers, nor inners, for unique-ified cases. Should we?) + */ + if (save_jointype == JOIN_UNIQUE_OUTER) + { + if (outerpath != outerrel->cheapest_total_path) + continue; + outerpath = (Path *) create_unique_path(root, outerrel, + outerpath, extra->sjinfo); + Assert(outerpath); + } + + /* + * The result will have this sort order (even if it is implemented as + * a nestloop, and even if some of the mergeclauses are implemented by + * qpquals rather than as true mergeclauses): + */ + merge_pathkeys = build_join_pathkeys(root, joinrel, jointype, + outerpath->pathkeys); + + if (save_jointype == JOIN_UNIQUE_INNER) + { + /* + * Consider nestloop join, but only with the unique-ified cheapest + * inner path + */ + try_nestloop_path(root, + joinrel, + outerpath, + inner_cheapest_total, + merge_pathkeys, + jointype, + extra); + } + else if (nestjoinOK) + { + /* + * Consider nestloop joins using this outer path and various + * available paths for the inner relation. We consider the + * cheapest-total paths for each available parameterization of the + * inner relation, including the unparameterized case. 
+ */ + ListCell *lc2; + + foreach(lc2, innerrel->cheapest_parameterized_paths) + { + Path *innerpath = (Path *) lfirst(lc2); + + try_nestloop_path(root, + joinrel, + outerpath, + innerpath, + merge_pathkeys, + jointype, + extra); + } + + /* Also consider materialized form of the cheapest inner path */ + if (matpath != NULL) + try_nestloop_path(root, + joinrel, + outerpath, + matpath, + merge_pathkeys, + jointype, + extra); + } + + /* Can't do anything else if outer path needs to be unique'd */ + if (save_jointype == JOIN_UNIQUE_OUTER) + continue; + + /* Can't do anything else if inner rel is parameterized by outer */ + if (inner_cheapest_total == NULL) + continue; + + /* Generate merge join paths */ + generate_mergejoin_paths(root, joinrel, innerrel, outerpath, + save_jointype, extra, useallclauses, + inner_cheapest_total, merge_pathkeys, + false); + } - if (inner_cheapest_total) - consider_parallel_mergejoin(root, joinrel, outerrel, innerrel, - save_jointype, extra, - inner_cheapest_total); - } + /* + * Consider partial nestloop and mergejoin plan if outerrel has any + * partial path and the joinrel is parallel-safe. However, we can't + * handle JOIN_UNIQUE_OUTER, because the outer path will be partial, and + * therefore we won't be able to properly guarantee uniqueness. Nor can + * we handle extra_lateral_rels, since partial paths must not be + * parameterized. Similarly, we can't handle JOIN_FULL and JOIN_RIGHT, + * because they can produce false null extended rows. + */ + if (joinrel->consider_parallel && + save_jointype != JOIN_UNIQUE_OUTER && + save_jointype != JOIN_FULL && + save_jointype != JOIN_RIGHT && + outerrel->partial_pathlist != NIL && + bms_is_empty(joinrel->lateral_relids)) + { + if (nestjoinOK) + consider_parallel_nestloop(root, joinrel, outerrel, innerrel, + save_jointype, extra); + + /* + * If inner_cheapest_total is NULL or non parallel-safe then find the + * cheapest total parallel safe path. If doing JOIN_UNIQUE_INNER, we + * can't use any alternative inner path. + */ + if (inner_cheapest_total == NULL || + !inner_cheapest_total->parallel_safe) + { + if (save_jointype == JOIN_UNIQUE_INNER) + return; + + inner_cheapest_total = get_cheapest_parallel_safe_total_inner( + innerrel->pathlist); + } + + if (inner_cheapest_total) + consider_parallel_mergejoin(root, joinrel, outerrel, innerrel, + save_jointype, extra, + inner_cheapest_total); + } } /* diff --git a/src/backend/optimizer/path/joinrels.c b/src/backend/optimizer/path/joinrels.c index d3022390..659a8494 100644 --- a/src/backend/optimizer/path/joinrels.c +++ b/src/backend/optimizer/path/joinrels.c @@ -328,316 +328,320 @@ make_rels_by_clauseless_joins(PlannerInfo *root, */ static bool join_is_legal(PlannerInfo *root, RelOptInfo *rel1, RelOptInfo *rel2, - Relids joinrelids, - SpecialJoinInfo **sjinfo_p, bool *reversed_p) -{// #lizard forgives - SpecialJoinInfo *match_sjinfo; - bool reversed; - bool unique_ified; - bool must_be_leftjoin; - ListCell *l; - - /* - * Ensure output params are set on failure return. This is just to - * suppress uninitialized-variable warnings from overly anal compilers. - */ - *sjinfo_p = NULL; - *reversed_p = false; - - /* - * If we have any special joins, the proposed join might be illegal; and - * in any case we have to determine its join type. Scan the join info - * list for matches and conflicts. 
- */ - match_sjinfo = NULL; - reversed = false; - unique_ified = false; - must_be_leftjoin = false; - - foreach(l, root->join_info_list) - { - SpecialJoinInfo *sjinfo = (SpecialJoinInfo *) lfirst(l); - - /* - * This special join is not relevant unless its RHS overlaps the - * proposed join. (Check this first as a fast path for dismissing - * most irrelevant SJs quickly.) - */ - if (!bms_overlap(sjinfo->min_righthand, joinrelids)) - continue; - - /* - * Also, not relevant if proposed join is fully contained within RHS - * (ie, we're still building up the RHS). - */ - if (bms_is_subset(joinrelids, sjinfo->min_righthand)) - continue; - - /* - * Also, not relevant if SJ is already done within either input. - */ - if (bms_is_subset(sjinfo->min_lefthand, rel1->relids) && - bms_is_subset(sjinfo->min_righthand, rel1->relids)) - continue; - if (bms_is_subset(sjinfo->min_lefthand, rel2->relids) && - bms_is_subset(sjinfo->min_righthand, rel2->relids)) - continue; - - /* - * If it's a semijoin and we already joined the RHS to any other rels - * within either input, then we must have unique-ified the RHS at that - * point (see below). Therefore the semijoin is no longer relevant in - * this join path. - */ - if (sjinfo->jointype == JOIN_SEMI) - { - if (bms_is_subset(sjinfo->syn_righthand, rel1->relids) && - !bms_equal(sjinfo->syn_righthand, rel1->relids)) - continue; - if (bms_is_subset(sjinfo->syn_righthand, rel2->relids) && - !bms_equal(sjinfo->syn_righthand, rel2->relids)) - continue; - } - - /* - * If one input contains min_lefthand and the other contains - * min_righthand, then we can perform the SJ at this join. - * - * Reject if we get matches to more than one SJ; that implies we're - * considering something that's not really valid. - */ - if (bms_is_subset(sjinfo->min_lefthand, rel1->relids) && - bms_is_subset(sjinfo->min_righthand, rel2->relids)) - { - if (match_sjinfo) - return false; /* invalid join path */ - match_sjinfo = sjinfo; - reversed = false; - } - else if (bms_is_subset(sjinfo->min_lefthand, rel2->relids) && - bms_is_subset(sjinfo->min_righthand, rel1->relids)) - { - if (match_sjinfo) - return false; /* invalid join path */ - match_sjinfo = sjinfo; - reversed = true; - } - else if (sjinfo->jointype == JOIN_SEMI && - bms_equal(sjinfo->syn_righthand, rel2->relids) && - create_unique_path(root, rel2, rel2->cheapest_total_path, - sjinfo) != NULL) - { - /*---------- - * For a semijoin, we can join the RHS to anything else by - * unique-ifying the RHS (if the RHS can be unique-ified). - * We will only get here if we have the full RHS but less - * than min_lefthand on the LHS. - * - * The reason to consider such a join path is exemplified by - * SELECT ... FROM a,b WHERE (a.x,b.y) IN (SELECT c1,c2 FROM c) - * If we insist on doing this as a semijoin we will first have - * to form the cartesian product of A*B. But if we unique-ify - * C then the semijoin becomes a plain innerjoin and we can join - * in any order, eg C to A and then to B. When C is much smaller - * than A and B this can be a huge win. So we allow C to be - * joined to just A or just B here, and then make_join_rel has - * to handle the case properly. - * - * Note that actually we'll allow unique-ified C to be joined to - * some other relation D here, too. That is legal, if usually not - * very sane, and this routine is only concerned with legality not - * with whether the join is good strategy. 
- *---------- - */ - if (match_sjinfo) - return false; /* invalid join path */ - match_sjinfo = sjinfo; - reversed = false; - unique_ified = true; - } - else if (sjinfo->jointype == JOIN_SEMI && - bms_equal(sjinfo->syn_righthand, rel1->relids) && - create_unique_path(root, rel1, rel1->cheapest_total_path, - sjinfo) != NULL) - { - /* Reversed semijoin case */ - if (match_sjinfo) - return false; /* invalid join path */ - match_sjinfo = sjinfo; - reversed = true; - unique_ified = true; - } - else - { - /* - * Otherwise, the proposed join overlaps the RHS but isn't a valid - * implementation of this SJ. But don't panic quite yet: the RHS - * violation might have occurred previously, in one or both input - * relations, in which case we must have previously decided that - * it was OK to commute some other SJ with this one. If we need - * to perform this join to finish building up the RHS, rejecting - * it could lead to not finding any plan at all. (This can occur - * because of the heuristics elsewhere in this file that postpone - * clauseless joins: we might not consider doing a clauseless join - * within the RHS until after we've performed other, validly - * commutable SJs with one or both sides of the clauseless join.) - * This consideration boils down to the rule that if both inputs - * overlap the RHS, we can allow the join --- they are either - * fully within the RHS, or represent previously-allowed joins to - * rels outside it. - */ - if (bms_overlap(rel1->relids, sjinfo->min_righthand) && - bms_overlap(rel2->relids, sjinfo->min_righthand)) - continue; /* assume valid previous violation of RHS */ - - /* - * The proposed join could still be legal, but only if we're - * allowed to associate it into the RHS of this SJ. That means - * this SJ must be a LEFT join (not SEMI or ANTI, and certainly - * not FULL) and the proposed join must not overlap the LHS. - */ - if (sjinfo->jointype != JOIN_LEFT || - bms_overlap(joinrelids, sjinfo->min_lefthand)) - return false; /* invalid join path */ - - /* - * To be valid, the proposed join must be a LEFT join; otherwise - * it can't associate into this SJ's RHS. But we may not yet have - * found the SpecialJoinInfo matching the proposed join, so we - * can't test that yet. Remember the requirement for later. - */ - must_be_leftjoin = true; - } - } - - /* - * Fail if violated any SJ's RHS and didn't match to a LEFT SJ: the - * proposed join can't associate into an SJ's RHS. - * - * Also, fail if the proposed join's predicate isn't strict; we're - * essentially checking to see if we can apply outer-join identity 3, and - * that's a requirement. (This check may be redundant with checks in - * make_outerjoininfo, but I'm not quite sure, and it's cheap to test.) - */ - if (must_be_leftjoin && - (match_sjinfo == NULL || - match_sjinfo->jointype != JOIN_LEFT || - !match_sjinfo->lhs_strict)) - return false; /* invalid join path */ - - /* - * We also have to check for constraints imposed by LATERAL references. - */ - if (root->hasLateralRTEs) - { - bool lateral_fwd; - bool lateral_rev; - Relids join_lateral_rels; - - /* - * The proposed rels could each contain lateral references to the - * other, in which case the join is impossible. If there are lateral - * references in just one direction, then the join has to be done with - * a nestloop with the lateral referencer on the inside. If the join - * matches an SJ that cannot be implemented by such a nestloop, the - * join is impossible. 
- * - * Also, if the lateral reference is only indirect, we should reject - * the join; whatever rel(s) the reference chain goes through must be - * joined to first. - * - * Another case that might keep us from building a valid plan is the - * implementation restriction described by have_dangerous_phv(). - */ - lateral_fwd = bms_overlap(rel1->relids, rel2->lateral_relids); - lateral_rev = bms_overlap(rel2->relids, rel1->lateral_relids); - if (lateral_fwd && lateral_rev) - return false; /* have lateral refs in both directions */ - if (lateral_fwd) - { - /* has to be implemented as nestloop with rel1 on left */ - if (match_sjinfo && - (reversed || - unique_ified || - match_sjinfo->jointype == JOIN_FULL)) - return false; /* not implementable as nestloop */ - /* check there is a direct reference from rel2 to rel1 */ - if (!bms_overlap(rel1->relids, rel2->direct_lateral_relids)) - return false; /* only indirect refs, so reject */ - /* check we won't have a dangerous PHV */ - if (have_dangerous_phv(root, rel1->relids, rel2->lateral_relids)) - return false; /* might be unable to handle required PHV */ - } - else if (lateral_rev) - { - /* has to be implemented as nestloop with rel2 on left */ - if (match_sjinfo && - (!reversed || - unique_ified || - match_sjinfo->jointype == JOIN_FULL)) - return false; /* not implementable as nestloop */ - /* check there is a direct reference from rel1 to rel2 */ - if (!bms_overlap(rel2->relids, rel1->direct_lateral_relids)) - return false; /* only indirect refs, so reject */ - /* check we won't have a dangerous PHV */ - if (have_dangerous_phv(root, rel2->relids, rel1->lateral_relids)) - return false; /* might be unable to handle required PHV */ - } - - /* - * LATERAL references could also cause problems later on if we accept - * this join: if the join's minimum parameterization includes any rels - * that would have to be on the inside of an outer join with this join - * rel, then it's never going to be possible to build the complete - * query using this join. We should reject this join not only because - * it'll save work, but because if we don't, the clauseless-join - * heuristics might think that legality of this join means that some - * other join rel need not be formed, and that could lead to failure - * to find any plan at all. We have to consider not only rels that - * are directly on the inner side of an OJ with the joinrel, but also - * ones that are indirectly so, so search to find all such rels. 
- */ - join_lateral_rels = min_join_parameterization(root, joinrelids, - rel1, rel2); - if (join_lateral_rels) - { - Relids join_plus_rhs = bms_copy(joinrelids); - bool more; - - do - { - more = false; - foreach(l, root->join_info_list) - { - SpecialJoinInfo *sjinfo = (SpecialJoinInfo *) lfirst(l); - - if (bms_overlap(sjinfo->min_lefthand, join_plus_rhs) && - !bms_is_subset(sjinfo->min_righthand, join_plus_rhs)) - { - join_plus_rhs = bms_add_members(join_plus_rhs, - sjinfo->min_righthand); - more = true; - } - /* full joins constrain both sides symmetrically */ - if (sjinfo->jointype == JOIN_FULL && - bms_overlap(sjinfo->min_righthand, join_plus_rhs) && - !bms_is_subset(sjinfo->min_lefthand, join_plus_rhs)) - { - join_plus_rhs = bms_add_members(join_plus_rhs, - sjinfo->min_lefthand); - more = true; - } - } - } while (more); - if (bms_overlap(join_plus_rhs, join_lateral_rels)) - return false; /* will not be able to join to some RHS rel */ - } - } - - /* Otherwise, it's a valid join */ - *sjinfo_p = match_sjinfo; - *reversed_p = reversed; - return true; + Relids joinrelids, + SpecialJoinInfo **sjinfo_p, bool *reversed_p) +{ + SpecialJoinInfo *match_sjinfo; + bool reversed; + bool unique_ified; + bool must_be_leftjoin; + ListCell *l; + + /* + * Ensure output params are set on failure return. This is just to + * suppress uninitialized-variable warnings from overly anal compilers. + */ + *sjinfo_p = NULL; + *reversed_p = false; + + /* + * If we have any special joins, the proposed join might be illegal; and + * in any case we have to determine its join type. Scan the join info + * list for matches and conflicts. + */ + match_sjinfo = NULL; + reversed = false; + unique_ified = false; + must_be_leftjoin = false; + + foreach(l, root->join_info_list) + { + SpecialJoinInfo *sjinfo = (SpecialJoinInfo *) lfirst(l); + + /* + * This special join is not relevant unless its RHS overlaps the + * proposed join. (Check this first as a fast path for dismissing + * most irrelevant SJs quickly.) + */ + if (!bms_overlap(sjinfo->min_righthand, joinrelids)) + continue; + + /* + * Also, not relevant if proposed join is fully contained within RHS + * (ie, we're still building up the RHS). + */ + if (bms_is_subset(joinrelids, sjinfo->min_righthand)) + continue; + + /* + * Also, not relevant if SJ is already done within either input. + */ + if (bms_is_subset(sjinfo->min_lefthand, rel1->relids) && + bms_is_subset(sjinfo->min_righthand, rel1->relids)) + continue; + if (bms_is_subset(sjinfo->min_lefthand, rel2->relids) && + bms_is_subset(sjinfo->min_righthand, rel2->relids)) + continue; + + /* + * If it's a semijoin and we already joined the RHS to any other rels + * within either input, then we must have unique-ified the RHS at that + * point (see below). Therefore the semijoin is no longer relevant in + * this join path. + */ +#ifdef __TBASE__ + if (sjinfo->jointype == JOIN_SEMI || sjinfo->jointype == JOIN_LEFT_SCALAR) +#else + if (sjinfo->jointype == JOIN_SEMI) +#endif + { + if (bms_is_subset(sjinfo->syn_righthand, rel1->relids) && + !bms_equal(sjinfo->syn_righthand, rel1->relids)) + continue; + if (bms_is_subset(sjinfo->syn_righthand, rel2->relids) && + !bms_equal(sjinfo->syn_righthand, rel2->relids)) + continue; + } + + /* + * If one input contains min_lefthand and the other contains + * min_righthand, then we can perform the SJ at this join. + * + * Reject if we get matches to more than one SJ; that implies we're + * considering something that's not really valid. 
+ */ + if (bms_is_subset(sjinfo->min_lefthand, rel1->relids) && + bms_is_subset(sjinfo->min_righthand, rel2->relids)) + { + if (match_sjinfo) + return false; /* invalid join path */ + match_sjinfo = sjinfo; + reversed = false; + } + else if (bms_is_subset(sjinfo->min_lefthand, rel2->relids) && + bms_is_subset(sjinfo->min_righthand, rel1->relids)) + { + if (match_sjinfo) + return false; /* invalid join path */ + match_sjinfo = sjinfo; + reversed = true; + } + else if (sjinfo->jointype == JOIN_SEMI && + bms_equal(sjinfo->syn_righthand, rel2->relids) && + create_unique_path(root, rel2, rel2->cheapest_total_path, + sjinfo) != NULL) + { + /*---------- + * For a semijoin, we can join the RHS to anything else by + * unique-ifying the RHS (if the RHS can be unique-ified). + * We will only get here if we have the full RHS but less + * than min_lefthand on the LHS. + * + * The reason to consider such a join path is exemplified by + * SELECT ... FROM a,b WHERE (a.x,b.y) IN (SELECT c1,c2 FROM c) + * If we insist on doing this as a semijoin we will first have + * to form the cartesian product of A*B. But if we unique-ify + * C then the semijoin becomes a plain innerjoin and we can join + * in any order, eg C to A and then to B. When C is much smaller + * than A and B this can be a huge win. So we allow C to be + * joined to just A or just B here, and then make_join_rel has + * to handle the case properly. + * + * Note that actually we'll allow unique-ified C to be joined to + * some other relation D here, too. That is legal, if usually not + * very sane, and this routine is only concerned with legality not + * with whether the join is good strategy. + *---------- + */ + if (match_sjinfo) + return false; /* invalid join path */ + match_sjinfo = sjinfo; + reversed = false; + unique_ified = true; + } + else if (sjinfo->jointype == JOIN_SEMI && + bms_equal(sjinfo->syn_righthand, rel1->relids) && + create_unique_path(root, rel1, rel1->cheapest_total_path, + sjinfo) != NULL) + { + /* Reversed semijoin case */ + if (match_sjinfo) + return false; /* invalid join path */ + match_sjinfo = sjinfo; + reversed = true; + unique_ified = true; + } + else + { + /* + * Otherwise, the proposed join overlaps the RHS but isn't a valid + * implementation of this SJ. But don't panic quite yet: the RHS + * violation might have occurred previously, in one or both input + * relations, in which case we must have previously decided that + * it was OK to commute some other SJ with this one. If we need + * to perform this join to finish building up the RHS, rejecting + * it could lead to not finding any plan at all. (This can occur + * because of the heuristics elsewhere in this file that postpone + * clauseless joins: we might not consider doing a clauseless join + * within the RHS until after we've performed other, validly + * commutable SJs with one or both sides of the clauseless join.) + * This consideration boils down to the rule that if both inputs + * overlap the RHS, we can allow the join --- they are either + * fully within the RHS, or represent previously-allowed joins to + * rels outside it. + */ + if (bms_overlap(rel1->relids, sjinfo->min_righthand) && + bms_overlap(rel2->relids, sjinfo->min_righthand)) + continue; /* assume valid previous violation of RHS */ + + /* + * The proposed join could still be legal, but only if we're + * allowed to associate it into the RHS of this SJ. That means + * this SJ must be a LEFT join (not SEMI or ANTI, and certainly + * not FULL) and the proposed join must not overlap the LHS. 
+ */ + if (sjinfo->jointype != JOIN_LEFT || + bms_overlap(joinrelids, sjinfo->min_lefthand)) + return false; /* invalid join path */ + + /* + * To be valid, the proposed join must be a LEFT join; otherwise + * it can't associate into this SJ's RHS. But we may not yet have + * found the SpecialJoinInfo matching the proposed join, so we + * can't test that yet. Remember the requirement for later. + */ + must_be_leftjoin = true; + } + } + + /* + * Fail if violated any SJ's RHS and didn't match to a LEFT SJ: the + * proposed join can't associate into an SJ's RHS. + * + * Also, fail if the proposed join's predicate isn't strict; we're + * essentially checking to see if we can apply outer-join identity 3, and + * that's a requirement. (This check may be redundant with checks in + * make_outerjoininfo, but I'm not quite sure, and it's cheap to test.) + */ + if (must_be_leftjoin && + (match_sjinfo == NULL || + match_sjinfo->jointype != JOIN_LEFT || + !match_sjinfo->lhs_strict)) + return false; /* invalid join path */ + + /* + * We also have to check for constraints imposed by LATERAL references. + */ + if (root->hasLateralRTEs) + { + bool lateral_fwd; + bool lateral_rev; + Relids join_lateral_rels; + + /* + * The proposed rels could each contain lateral references to the + * other, in which case the join is impossible. If there are lateral + * references in just one direction, then the join has to be done with + * a nestloop with the lateral referencer on the inside. If the join + * matches an SJ that cannot be implemented by such a nestloop, the + * join is impossible. + * + * Also, if the lateral reference is only indirect, we should reject + * the join; whatever rel(s) the reference chain goes through must be + * joined to first. + * + * Another case that might keep us from building a valid plan is the + * implementation restriction described by have_dangerous_phv(). 
+ */ + lateral_fwd = bms_overlap(rel1->relids, rel2->lateral_relids); + lateral_rev = bms_overlap(rel2->relids, rel1->lateral_relids); + if (lateral_fwd && lateral_rev) + return false; /* have lateral refs in both directions */ + if (lateral_fwd) + { + /* has to be implemented as nestloop with rel1 on left */ + if (match_sjinfo && + (reversed || + unique_ified || + match_sjinfo->jointype == JOIN_FULL)) + return false; /* not implementable as nestloop */ + /* check there is a direct reference from rel2 to rel1 */ + if (!bms_overlap(rel1->relids, rel2->direct_lateral_relids)) + return false; /* only indirect refs, so reject */ + /* check we won't have a dangerous PHV */ + if (have_dangerous_phv(root, rel1->relids, rel2->lateral_relids)) + return false; /* might be unable to handle required PHV */ + } + else if (lateral_rev) + { + /* has to be implemented as nestloop with rel2 on left */ + if (match_sjinfo && + (!reversed || + unique_ified || + match_sjinfo->jointype == JOIN_FULL)) + return false; /* not implementable as nestloop */ + /* check there is a direct reference from rel1 to rel2 */ + if (!bms_overlap(rel2->relids, rel1->direct_lateral_relids)) + return false; /* only indirect refs, so reject */ + /* check we won't have a dangerous PHV */ + if (have_dangerous_phv(root, rel2->relids, rel1->lateral_relids)) + return false; /* might be unable to handle required PHV */ + } + + /* + * LATERAL references could also cause problems later on if we accept + * this join: if the join's minimum parameterization includes any rels + * that would have to be on the inside of an outer join with this join + * rel, then it's never going to be possible to build the complete + * query using this join. We should reject this join not only because + * it'll save work, but because if we don't, the clauseless-join + * heuristics might think that legality of this join means that some + * other join rel need not be formed, and that could lead to failure + * to find any plan at all. We have to consider not only rels that + * are directly on the inner side of an OJ with the joinrel, but also + * ones that are indirectly so, so search to find all such rels. 
+ */ + join_lateral_rels = min_join_parameterization(root, joinrelids, + rel1, rel2); + if (join_lateral_rels) + { + Relids join_plus_rhs = bms_copy(joinrelids); + bool more; + + do + { + more = false; + foreach(l, root->join_info_list) + { + SpecialJoinInfo *sjinfo = (SpecialJoinInfo *) lfirst(l); + + if (bms_overlap(sjinfo->min_lefthand, join_plus_rhs) && + !bms_is_subset(sjinfo->min_righthand, join_plus_rhs)) + { + join_plus_rhs = bms_add_members(join_plus_rhs, + sjinfo->min_righthand); + more = true; + } + /* full joins constrain both sides symmetrically */ + if (sjinfo->jointype == JOIN_FULL && + bms_overlap(sjinfo->min_righthand, join_plus_rhs) && + !bms_is_subset(sjinfo->min_lefthand, join_plus_rhs)) + { + join_plus_rhs = bms_add_members(join_plus_rhs, + sjinfo->min_lefthand); + more = true; + } + } + } while (more); + if (bms_overlap(join_plus_rhs, join_lateral_rels)) + return false; /* will not be able to join to some RHS rel */ + } + } + + /* Otherwise, it's a valid join */ + *sjinfo_p = match_sjinfo; + *reversed_p = reversed; + return true; } @@ -745,153 +749,168 @@ make_join_rel(PlannerInfo *root, RelOptInfo *rel1, RelOptInfo *rel2) */ static void populate_joinrel_with_paths(PlannerInfo *root, RelOptInfo *rel1, - RelOptInfo *rel2, RelOptInfo *joinrel, - SpecialJoinInfo *sjinfo, List *restrictlist) -{// #lizard forgives - /* - * Consider paths using each rel as both outer and inner. Depending on - * the join type, a provably empty outer or inner rel might mean the join - * is provably empty too; in which case throw away any previously computed - * paths and mark the join as dummy. (We do it this way since it's - * conceivable that dummy-ness of a multi-element join might only be - * noticeable for certain construction paths.) - * - * Also, a provably constant-false join restriction typically means that - * we can skip evaluating one or both sides of the join. We do this by - * marking the appropriate rel as dummy. For outer joins, a - * constant-false restriction that is pushed down still means the whole - * join is dummy, while a non-pushed-down one means that no inner rows - * will join so we can treat the inner rel as dummy. - * - * We need only consider the jointypes that appear in join_info_list, plus - * JOIN_INNER. 
- */ - switch (sjinfo->jointype) - { - case JOIN_INNER: - if (is_dummy_rel(rel1) || is_dummy_rel(rel2) || - restriction_is_constant_false(restrictlist, false)) - { - mark_dummy_rel(joinrel); - break; - } - add_paths_to_joinrel(root, joinrel, rel1, rel2, - JOIN_INNER, sjinfo, - restrictlist); - add_paths_to_joinrel(root, joinrel, rel2, rel1, - JOIN_INNER, sjinfo, - restrictlist); - break; - case JOIN_LEFT: - if (is_dummy_rel(rel1) || - restriction_is_constant_false(restrictlist, true)) - { - mark_dummy_rel(joinrel); - break; - } - if (restriction_is_constant_false(restrictlist, false) && - bms_is_subset(rel2->relids, sjinfo->syn_righthand)) - mark_dummy_rel(rel2); - add_paths_to_joinrel(root, joinrel, rel1, rel2, - JOIN_LEFT, sjinfo, - restrictlist); - add_paths_to_joinrel(root, joinrel, rel2, rel1, - JOIN_RIGHT, sjinfo, - restrictlist); - break; - case JOIN_FULL: - if ((is_dummy_rel(rel1) && is_dummy_rel(rel2)) || - restriction_is_constant_false(restrictlist, true)) - { - mark_dummy_rel(joinrel); - break; - } - add_paths_to_joinrel(root, joinrel, rel1, rel2, - JOIN_FULL, sjinfo, - restrictlist); - add_paths_to_joinrel(root, joinrel, rel2, rel1, - JOIN_FULL, sjinfo, - restrictlist); - - /* - * If there are join quals that aren't mergeable or hashable, we - * may not be able to build any valid plan. Complain here so that - * we can give a somewhat-useful error message. (Since we have no - * flexibility of planning for a full join, there's no chance of - * succeeding later with another pair of input rels.) - */ - if (joinrel->pathlist == NIL) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("FULL JOIN is only supported with merge-joinable or hash-joinable join conditions"))); - break; - case JOIN_SEMI: - - /* - * We might have a normal semijoin, or a case where we don't have - * enough rels to do the semijoin but can unique-ify the RHS and - * then do an innerjoin (see comments in join_is_legal). In the - * latter case we can't apply JOIN_SEMI joining. - */ - if (bms_is_subset(sjinfo->min_lefthand, rel1->relids) && - bms_is_subset(sjinfo->min_righthand, rel2->relids)) - { - if (is_dummy_rel(rel1) || is_dummy_rel(rel2) || - restriction_is_constant_false(restrictlist, false)) - { - mark_dummy_rel(joinrel); - break; - } - add_paths_to_joinrel(root, joinrel, rel1, rel2, - JOIN_SEMI, sjinfo, - restrictlist); - } - - /* - * If we know how to unique-ify the RHS and one input rel is - * exactly the RHS (not a superset) we can consider unique-ifying - * it and then doing a regular join. (The create_unique_path - * check here is probably redundant with what join_is_legal did, - * but if so the check is cheap because it's cached. So test - * anyway to be sure.) 
- */ - if (bms_equal(sjinfo->syn_righthand, rel2->relids) && - create_unique_path(root, rel2, rel2->cheapest_total_path, - sjinfo) != NULL) - { - if (is_dummy_rel(rel1) || is_dummy_rel(rel2) || - restriction_is_constant_false(restrictlist, false)) - { - mark_dummy_rel(joinrel); - break; - } - add_paths_to_joinrel(root, joinrel, rel1, rel2, - JOIN_UNIQUE_INNER, sjinfo, - restrictlist); - add_paths_to_joinrel(root, joinrel, rel2, rel1, - JOIN_UNIQUE_OUTER, sjinfo, - restrictlist); - } - break; - case JOIN_ANTI: - if (is_dummy_rel(rel1) || - restriction_is_constant_false(restrictlist, true)) - { - mark_dummy_rel(joinrel); - break; - } - if (restriction_is_constant_false(restrictlist, false) && - bms_is_subset(rel2->relids, sjinfo->syn_righthand)) - mark_dummy_rel(rel2); - add_paths_to_joinrel(root, joinrel, rel1, rel2, - JOIN_ANTI, sjinfo, - restrictlist); - break; - default: - /* other values not expected here */ - elog(ERROR, "unrecognized join type: %d", (int) sjinfo->jointype); - break; - } + RelOptInfo *rel2, RelOptInfo *joinrel, + SpecialJoinInfo *sjinfo, List *restrictlist) +{ + /* + * Consider paths using each rel as both outer and inner. Depending on + * the join type, a provably empty outer or inner rel might mean the join + * is provably empty too; in which case throw away any previously computed + * paths and mark the join as dummy. (We do it this way since it's + * conceivable that dummy-ness of a multi-element join might only be + * noticeable for certain construction paths.) + * + * Also, a provably constant-false join restriction typically means that + * we can skip evaluating one or both sides of the join. We do this by + * marking the appropriate rel as dummy. For outer joins, a + * constant-false restriction that is pushed down still means the whole + * join is dummy, while a non-pushed-down one means that no inner rows + * will join so we can treat the inner rel as dummy. + * + * We need only consider the jointypes that appear in join_info_list, plus + * JOIN_INNER. + */ + switch (sjinfo->jointype) + { + case JOIN_INNER: + if (is_dummy_rel(rel1) || is_dummy_rel(rel2) || + restriction_is_constant_false(restrictlist, false)) + { + mark_dummy_rel(joinrel); + break; + } + add_paths_to_joinrel(root, joinrel, rel1, rel2, + JOIN_INNER, sjinfo, + restrictlist); + add_paths_to_joinrel(root, joinrel, rel2, rel1, + JOIN_INNER, sjinfo, + restrictlist); + break; + case JOIN_LEFT: + if (is_dummy_rel(rel1) || + restriction_is_constant_false(restrictlist, true)) + { + mark_dummy_rel(joinrel); + break; + } + if (restriction_is_constant_false(restrictlist, false) && + bms_is_subset(rel2->relids, sjinfo->syn_righthand)) + mark_dummy_rel(rel2); + add_paths_to_joinrel(root, joinrel, rel1, rel2, + JOIN_LEFT, sjinfo, + restrictlist); + add_paths_to_joinrel(root, joinrel, rel2, rel1, + JOIN_RIGHT, sjinfo, + restrictlist); + break; + case JOIN_FULL: + if ((is_dummy_rel(rel1) && is_dummy_rel(rel2)) || + restriction_is_constant_false(restrictlist, true)) + { + mark_dummy_rel(joinrel); + break; + } + add_paths_to_joinrel(root, joinrel, rel1, rel2, + JOIN_FULL, sjinfo, + restrictlist); + add_paths_to_joinrel(root, joinrel, rel2, rel1, + JOIN_FULL, sjinfo, + restrictlist); + + /* + * If there are join quals that aren't mergeable or hashable, we + * may not be able to build any valid plan. Complain here so that + * we can give a somewhat-useful error message. (Since we have no + * flexibility of planning for a full join, there's no chance of + * succeeding later with another pair of input rels.) 
+ */ + if (joinrel->pathlist == NIL) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("FULL JOIN is only supported with merge-joinable or hash-joinable join conditions"))); + break; + case JOIN_SEMI: +#ifdef __TBASE__ + case JOIN_LEFT_SCALAR: +#endif + + /* + * We might have a normal semijoin, or a case where we don't have + * enough rels to do the semijoin but can unique-ify the RHS and + * then do an innerjoin (see comments in join_is_legal). In the + * latter case we can't apply JOIN_SEMI joining. + */ + if (bms_is_subset(sjinfo->min_lefthand, rel1->relids) && + bms_is_subset(sjinfo->min_righthand, rel2->relids)) + { + if (is_dummy_rel(rel1) || is_dummy_rel(rel2) || + restriction_is_constant_false(restrictlist, false)) + { + mark_dummy_rel(joinrel); + break; + } +#ifdef __TBASE__ + add_paths_to_joinrel(root, joinrel, rel1, rel2, + sjinfo->jointype, sjinfo, + restrictlist); +#else + add_paths_to_joinrel(root, joinrel, rel1, rel2, + JOIN_SEMI, sjinfo, + restrictlist); +#endif + } + + /* + * If we know how to unique-ify the RHS and one input rel is + * exactly the RHS (not a superset) we can consider unique-ifying + * it and then doing a regular join. (The create_unique_path + * check here is probably redundant with what join_is_legal did, + * but if so the check is cheap because it's cached. So test + * anyway to be sure.) + */ +#ifdef __TBASE__ + if (sjinfo->jointype == JOIN_SEMI && bms_equal(sjinfo->syn_righthand, rel2->relids) && + create_unique_path(root, rel2, rel2->cheapest_total_path, + sjinfo) != NULL) +#else + if (bms_equal(sjinfo->syn_righthand, rel2->relids) && + create_unique_path(root, rel2, rel2->cheapest_total_path, + sjinfo) != NULL) +#endif + { + if (is_dummy_rel(rel1) || is_dummy_rel(rel2) || + restriction_is_constant_false(restrictlist, false)) + { + mark_dummy_rel(joinrel); + break; + } + add_paths_to_joinrel(root, joinrel, rel1, rel2, + JOIN_UNIQUE_INNER, sjinfo, + restrictlist); + add_paths_to_joinrel(root, joinrel, rel2, rel1, + JOIN_UNIQUE_OUTER, sjinfo, + restrictlist); + } + break; + case JOIN_ANTI: + if (is_dummy_rel(rel1) || + restriction_is_constant_false(restrictlist, true)) + { + mark_dummy_rel(joinrel); + break; + } + if (restriction_is_constant_false(restrictlist, false) && + bms_is_subset(rel2->relids, sjinfo->syn_righthand)) + mark_dummy_rel(rel2); + add_paths_to_joinrel(root, joinrel, rel1, rel2, + JOIN_ANTI, sjinfo, + restrictlist); + break; + default: + /* other values not expected here */ + elog(ERROR, "unrecognized join type: %d", (int) sjinfo->jointype); + break; + } } diff --git a/src/backend/optimizer/plan/initsplan.c b/src/backend/optimizer/plan/initsplan.c index 5cdbdec7..100d9db5 100644 --- a/src/backend/optimizer/plan/initsplan.c +++ b/src/backend/optimizer/plan/initsplan.c @@ -775,318 +775,321 @@ deconstruct_recurse(PlannerInfo *root, Node *jtnode, bool below_outer_join, } #endif - /* A single baserel does not create an inner join */ - *inner_join_rels = NULL; - joinlist = list_make1(jtnode); - } - else if (IsA(jtnode, FromExpr)) - { - FromExpr *f = (FromExpr *) jtnode; - List *child_postponed_quals = NIL; - int remaining; - ListCell *l; - - /* - * First, recurse to handle child joins. We collapse subproblems into - * a single joinlist whenever the resulting joinlist wouldn't exceed - * from_collapse_limit members. Also, always collapse one-element - * subproblems, since that won't lengthen the joinlist anyway. 
- */ - *qualscope = NULL; - *inner_join_rels = NULL; - joinlist = NIL; - remaining = list_length(f->fromlist); - foreach(l, f->fromlist) - { - Relids sub_qualscope; - List *sub_joinlist; - int sub_members; - - sub_joinlist = deconstruct_recurse(root, lfirst(l), - below_outer_join, - &sub_qualscope, - inner_join_rels, - &child_postponed_quals); - *qualscope = bms_add_members(*qualscope, sub_qualscope); - sub_members = list_length(sub_joinlist); - remaining--; - if (sub_members <= 1 || - list_length(joinlist) + sub_members + remaining <= from_collapse_limit) - joinlist = list_concat(joinlist, sub_joinlist); - else - joinlist = lappend(joinlist, sub_joinlist); - } - - /* - * A FROM with more than one list element is an inner join subsuming - * all below it, so we should report inner_join_rels = qualscope. If - * there was exactly one element, we should (and already did) report - * whatever its inner_join_rels were. If there were no elements (is - * that possible?) the initialization before the loop fixed it. - */ - if (list_length(f->fromlist) > 1) - *inner_join_rels = *qualscope; - - /* - * Try to process any quals postponed by children. If they need - * further postponement, add them to my output postponed_qual_list. - */ - foreach(l, child_postponed_quals) - { - PostponedQual *pq = (PostponedQual *) lfirst(l); - - if (bms_is_subset(pq->relids, *qualscope)) - distribute_qual_to_rels(root, pq->qual, - false, below_outer_join, JOIN_INNER, - root->qual_security_level, - *qualscope, NULL, NULL, NULL, - NULL); - else - *postponed_qual_list = lappend(*postponed_qual_list, pq); - } - - /* - * Now process the top-level quals. - */ - foreach(l, (List *) f->quals) - { - Node *qual = (Node *) lfirst(l); - - distribute_qual_to_rels(root, qual, - false, below_outer_join, JOIN_INNER, - root->qual_security_level, - *qualscope, NULL, NULL, NULL, - postponed_qual_list); - } - } - else if (IsA(jtnode, JoinExpr)) - { - JoinExpr *j = (JoinExpr *) jtnode; - List *child_postponed_quals = NIL; - Relids leftids, - rightids, - left_inners, - right_inners, - nonnullable_rels, - nullable_rels, - ojscope; - List *leftjoinlist, - *rightjoinlist; - List *my_quals; - SpecialJoinInfo *sjinfo; - ListCell *l; - - /* - * Order of operations here is subtle and critical. First we recurse - * to handle sub-JOINs. Their join quals will be placed without - * regard for whether this level is an outer join, which is correct. - * Then we place our own join quals, which are restricted by lower - * outer joins in any case, and are forced to this level if this is an - * outer join and they mention the outer side. Finally, if this is an - * outer join, we create a join_info_list entry for the join. This - * will prevent quals above us in the join tree that use those rels - * from being pushed down below this level. (It's okay for upper - * quals to be pushed down to the outer side, however.) 
- */ - switch (j->jointype) - { - case JOIN_INNER: - leftjoinlist = deconstruct_recurse(root, j->larg, - below_outer_join, - &leftids, &left_inners, - &child_postponed_quals); - rightjoinlist = deconstruct_recurse(root, j->rarg, - below_outer_join, - &rightids, &right_inners, - &child_postponed_quals); - *qualscope = bms_union(leftids, rightids); - *inner_join_rels = *qualscope; - /* Inner join adds no restrictions for quals */ - nonnullable_rels = NULL; - /* and it doesn't force anything to null, either */ - nullable_rels = NULL; - break; - case JOIN_LEFT: - case JOIN_ANTI: - leftjoinlist = deconstruct_recurse(root, j->larg, - below_outer_join, - &leftids, &left_inners, - &child_postponed_quals); - rightjoinlist = deconstruct_recurse(root, j->rarg, - true, - &rightids, &right_inners, - &child_postponed_quals); - *qualscope = bms_union(leftids, rightids); - *inner_join_rels = bms_union(left_inners, right_inners); - nonnullable_rels = leftids; - nullable_rels = rightids; - break; - case JOIN_SEMI: - leftjoinlist = deconstruct_recurse(root, j->larg, - below_outer_join, - &leftids, &left_inners, - &child_postponed_quals); - rightjoinlist = deconstruct_recurse(root, j->rarg, - below_outer_join, - &rightids, &right_inners, - &child_postponed_quals); - *qualscope = bms_union(leftids, rightids); - *inner_join_rels = bms_union(left_inners, right_inners); - /* Semi join adds no restrictions for quals */ - nonnullable_rels = NULL; - - /* - * Theoretically, a semijoin would null the RHS; but since the - * RHS can't be accessed above the join, this is immaterial - * and we needn't account for it. - */ - nullable_rels = NULL; - break; - case JOIN_FULL: - leftjoinlist = deconstruct_recurse(root, j->larg, - true, - &leftids, &left_inners, - &child_postponed_quals); - rightjoinlist = deconstruct_recurse(root, j->rarg, - true, - &rightids, &right_inners, - &child_postponed_quals); - *qualscope = bms_union(leftids, rightids); - *inner_join_rels = bms_union(left_inners, right_inners); - /* each side is both outer and inner */ - nonnullable_rels = *qualscope; - nullable_rels = *qualscope; - break; - default: - /* JOIN_RIGHT was eliminated during reduce_outer_joins() */ - elog(ERROR, "unrecognized join type: %d", - (int) j->jointype); - nonnullable_rels = NULL; /* keep compiler quiet */ - nullable_rels = NULL; - leftjoinlist = rightjoinlist = NIL; - break; - } - - /* Report all rels that will be nulled anywhere in the jointree */ - root->nullable_baserels = bms_add_members(root->nullable_baserels, - nullable_rels); - - /* - * Try to process any quals postponed by children. If they need - * further postponement, add them to my output postponed_qual_list. - * Quals that can be processed now must be included in my_quals, so - * that they'll be handled properly in make_outerjoininfo. - */ - my_quals = NIL; - foreach(l, child_postponed_quals) - { - PostponedQual *pq = (PostponedQual *) lfirst(l); - - if (bms_is_subset(pq->relids, *qualscope)) - my_quals = lappend(my_quals, pq->qual); - else - { - /* - * We should not be postponing any quals past an outer join. - * If this Assert fires, pull_up_subqueries() messed up. - */ - Assert(j->jointype == JOIN_INNER); - *postponed_qual_list = lappend(*postponed_qual_list, pq); - } - } - /* list_concat is nondestructive of its second argument */ - my_quals = list_concat(my_quals, (List *) j->quals); - - /* - * For an OJ, form the SpecialJoinInfo now, because we need the OJ's - * semantic scope (ojscope) to pass to distribute_qual_to_rels. 
But - * we mustn't add it to join_info_list just yet, because we don't want - * distribute_qual_to_rels to think it is an outer join below us. - * - * Semijoins are a bit of a hybrid: we build a SpecialJoinInfo, but we - * want ojscope = NULL for distribute_qual_to_rels. - */ - if (j->jointype != JOIN_INNER) - { - sjinfo = make_outerjoininfo(root, - leftids, rightids, - *inner_join_rels, - j->jointype, - my_quals); - if (j->jointype == JOIN_SEMI) - ojscope = NULL; - else - ojscope = bms_union(sjinfo->min_lefthand, - sjinfo->min_righthand); - } - else - { - sjinfo = NULL; - ojscope = NULL; - } - - /* Process the JOIN's qual clauses */ - foreach(l, my_quals) - { - Node *qual = (Node *) lfirst(l); - - distribute_qual_to_rels(root, qual, - false, below_outer_join, j->jointype, - root->qual_security_level, - *qualscope, - ojscope, nonnullable_rels, NULL, - postponed_qual_list); - } - - /* Now we can add the SpecialJoinInfo to join_info_list */ - if (sjinfo) - { - root->join_info_list = lappend(root->join_info_list, sjinfo); - /* Each time we do that, recheck placeholder eval levels */ - update_placeholder_eval_levels(root, sjinfo); - } - - /* - * Finally, compute the output joinlist. We fold subproblems together - * except at a FULL JOIN or where join_collapse_limit would be - * exceeded. - */ - if (j->jointype == JOIN_FULL) - { - /* force the join order exactly at this node */ - joinlist = list_make1(list_make2(leftjoinlist, rightjoinlist)); - } - else if (list_length(leftjoinlist) + list_length(rightjoinlist) <= - join_collapse_limit) - { - /* OK to combine subproblems */ - joinlist = list_concat(leftjoinlist, rightjoinlist); - } - else - { - /* can't combine, but needn't force join order above here */ - Node *leftpart, - *rightpart; - - /* avoid creating useless 1-element sublists */ - if (list_length(leftjoinlist) == 1) - leftpart = (Node *) linitial(leftjoinlist); - else - leftpart = (Node *) leftjoinlist; - if (list_length(rightjoinlist) == 1) - rightpart = (Node *) linitial(rightjoinlist); - else - rightpart = (Node *) rightjoinlist; - joinlist = list_make2(leftpart, rightpart); - } - } - else - { - elog(ERROR, "unrecognized node type: %d", - (int) nodeTag(jtnode)); - joinlist = NIL; /* keep compiler quiet */ - } - return joinlist; + /* A single baserel does not create an inner join */ + *inner_join_rels = NULL; + joinlist = list_make1(jtnode); + } + else if (IsA(jtnode, FromExpr)) + { + FromExpr *f = (FromExpr *) jtnode; + List *child_postponed_quals = NIL; + int remaining; + ListCell *l; + + /* + * First, recurse to handle child joins. We collapse subproblems into + * a single joinlist whenever the resulting joinlist wouldn't exceed + * from_collapse_limit members. Also, always collapse one-element + * subproblems, since that won't lengthen the joinlist anyway. 
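+		 *
+		 * For example, if a simple sub-select has been flattened into the
+		 * FROM list (tables a, b, c are hypothetical):
+		 *
+		 *	SELECT *
+		 *	  FROM a,
+		 *	       (SELECT * FROM b, c WHERE b.id = c.id) AS bc
+		 *	 WHERE a.id = bc.id;
+		 *
+		 * its {b, c} sub-joinlist is merged into the parent joinlist when
+		 * that stays within from_collapse_limit, letting the join order of
+		 * a, b and c be searched as one problem; otherwise b and c remain
+		 * a separate subproblem that is joined internally first.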
+ */ + *qualscope = NULL; + *inner_join_rels = NULL; + joinlist = NIL; + remaining = list_length(f->fromlist); + foreach(l, f->fromlist) + { + Relids sub_qualscope; + List *sub_joinlist; + int sub_members; + + sub_joinlist = deconstruct_recurse(root, lfirst(l), + below_outer_join, + &sub_qualscope, + inner_join_rels, + &child_postponed_quals); + *qualscope = bms_add_members(*qualscope, sub_qualscope); + sub_members = list_length(sub_joinlist); + remaining--; + if (sub_members <= 1 || + list_length(joinlist) + sub_members + remaining <= from_collapse_limit) + joinlist = list_concat(joinlist, sub_joinlist); + else + joinlist = lappend(joinlist, sub_joinlist); + } + + /* + * A FROM with more than one list element is an inner join subsuming + * all below it, so we should report inner_join_rels = qualscope. If + * there was exactly one element, we should (and already did) report + * whatever its inner_join_rels were. If there were no elements (is + * that possible?) the initialization before the loop fixed it. + */ + if (list_length(f->fromlist) > 1) + *inner_join_rels = *qualscope; + + /* + * Try to process any quals postponed by children. If they need + * further postponement, add them to my output postponed_qual_list. + */ + foreach(l, child_postponed_quals) + { + PostponedQual *pq = (PostponedQual *) lfirst(l); + + if (bms_is_subset(pq->relids, *qualscope)) + distribute_qual_to_rels(root, pq->qual, + false, below_outer_join, JOIN_INNER, + root->qual_security_level, + *qualscope, NULL, NULL, NULL, + NULL); + else + *postponed_qual_list = lappend(*postponed_qual_list, pq); + } + + /* + * Now process the top-level quals. + */ + foreach(l, (List *) f->quals) + { + Node *qual = (Node *) lfirst(l); + + distribute_qual_to_rels(root, qual, + false, below_outer_join, JOIN_INNER, + root->qual_security_level, + *qualscope, NULL, NULL, NULL, + postponed_qual_list); + } + } + else if (IsA(jtnode, JoinExpr)) + { + JoinExpr *j = (JoinExpr *) jtnode; + List *child_postponed_quals = NIL; + Relids leftids, + rightids, + left_inners, + right_inners, + nonnullable_rels, + nullable_rels, + ojscope; + List *leftjoinlist, + *rightjoinlist; + List *my_quals; + SpecialJoinInfo *sjinfo; + ListCell *l; + + /* + * Order of operations here is subtle and critical. First we recurse + * to handle sub-JOINs. Their join quals will be placed without + * regard for whether this level is an outer join, which is correct. + * Then we place our own join quals, which are restricted by lower + * outer joins in any case, and are forced to this level if this is an + * outer join and they mention the outer side. Finally, if this is an + * outer join, we create a join_info_list entry for the join. This + * will prevent quals above us in the join tree that use those rels + * from being pushed down below this level. (It's okay for upper + * quals to be pushed down to the outer side, however.) 
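+		 *
+		 * For instance, given the hypothetical
+		 *
+		 *	SELECT *
+		 *	  FROM a LEFT JOIN b ON a.id = b.id
+		 *	 WHERE a.flag AND COALESCE(b.val, 0) = 0;
+		 *
+		 * the upper qual on a alone may drop to a's scan below the join,
+		 * while the COALESCE qual references the nullable side and has to
+		 * stay above the outer join.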
+ */ + switch (j->jointype) + { + case JOIN_INNER: + leftjoinlist = deconstruct_recurse(root, j->larg, + below_outer_join, + &leftids, &left_inners, + &child_postponed_quals); + rightjoinlist = deconstruct_recurse(root, j->rarg, + below_outer_join, + &rightids, &right_inners, + &child_postponed_quals); + *qualscope = bms_union(leftids, rightids); + *inner_join_rels = *qualscope; + /* Inner join adds no restrictions for quals */ + nonnullable_rels = NULL; + /* and it doesn't force anything to null, either */ + nullable_rels = NULL; + break; + case JOIN_LEFT: + case JOIN_ANTI: +#ifdef __TBASE__ + case JOIN_LEFT_SCALAR: +#endif + leftjoinlist = deconstruct_recurse(root, j->larg, + below_outer_join, + &leftids, &left_inners, + &child_postponed_quals); + rightjoinlist = deconstruct_recurse(root, j->rarg, + true, + &rightids, &right_inners, + &child_postponed_quals); + *qualscope = bms_union(leftids, rightids); + *inner_join_rels = bms_union(left_inners, right_inners); + nonnullable_rels = leftids; + nullable_rels = rightids; + break; + case JOIN_SEMI: + leftjoinlist = deconstruct_recurse(root, j->larg, + below_outer_join, + &leftids, &left_inners, + &child_postponed_quals); + rightjoinlist = deconstruct_recurse(root, j->rarg, + below_outer_join, + &rightids, &right_inners, + &child_postponed_quals); + *qualscope = bms_union(leftids, rightids); + *inner_join_rels = bms_union(left_inners, right_inners); + /* Semi join adds no restrictions for quals */ + nonnullable_rels = NULL; + + /* + * Theoretically, a semijoin would null the RHS; but since the + * RHS can't be accessed above the join, this is immaterial + * and we needn't account for it. + */ + nullable_rels = NULL; + break; + case JOIN_FULL: + leftjoinlist = deconstruct_recurse(root, j->larg, + true, + &leftids, &left_inners, + &child_postponed_quals); + rightjoinlist = deconstruct_recurse(root, j->rarg, + true, + &rightids, &right_inners, + &child_postponed_quals); + *qualscope = bms_union(leftids, rightids); + *inner_join_rels = bms_union(left_inners, right_inners); + /* each side is both outer and inner */ + nonnullable_rels = *qualscope; + nullable_rels = *qualscope; + break; + default: + /* JOIN_RIGHT was eliminated during reduce_outer_joins() */ + elog(ERROR, "unrecognized join type: %d", + (int) j->jointype); + nonnullable_rels = NULL; /* keep compiler quiet */ + nullable_rels = NULL; + leftjoinlist = rightjoinlist = NIL; + break; + } + + /* Report all rels that will be nulled anywhere in the jointree */ + root->nullable_baserels = bms_add_members(root->nullable_baserels, + nullable_rels); + + /* + * Try to process any quals postponed by children. If they need + * further postponement, add them to my output postponed_qual_list. + * Quals that can be processed now must be included in my_quals, so + * that they'll be handled properly in make_outerjoininfo. + */ + my_quals = NIL; + foreach(l, child_postponed_quals) + { + PostponedQual *pq = (PostponedQual *) lfirst(l); + + if (bms_is_subset(pq->relids, *qualscope)) + my_quals = lappend(my_quals, pq->qual); + else + { + /* + * We should not be postponing any quals past an outer join. + * If this Assert fires, pull_up_subqueries() messed up. 
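+				 *
+				 * (Postponed quals come from lateral references, e.g. in
+				 * the hypothetical
+				 *
+				 *	SELECT *
+				 *	  FROM a,
+				 *	       LATERAL (SELECT * FROM b WHERE b.id = a.id) s;
+				 *
+				 * the flattened b.id = a.id clause mentions a rel outside
+				 * the sub-select's own scope and has to wait until both a
+				 * and b are in scope, which should only ever postpone it
+				 * across inner joins.)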
+ */ + Assert(j->jointype == JOIN_INNER); + *postponed_qual_list = lappend(*postponed_qual_list, pq); + } + } + /* list_concat is nondestructive of its second argument */ + my_quals = list_concat(my_quals, (List *) j->quals); + + /* + * For an OJ, form the SpecialJoinInfo now, because we need the OJ's + * semantic scope (ojscope) to pass to distribute_qual_to_rels. But + * we mustn't add it to join_info_list just yet, because we don't want + * distribute_qual_to_rels to think it is an outer join below us. + * + * Semijoins are a bit of a hybrid: we build a SpecialJoinInfo, but we + * want ojscope = NULL for distribute_qual_to_rels. + */ + if (j->jointype != JOIN_INNER) + { + sjinfo = make_outerjoininfo(root, + leftids, rightids, + *inner_join_rels, + j->jointype, + my_quals); + if (j->jointype == JOIN_SEMI) + ojscope = NULL; + else + ojscope = bms_union(sjinfo->min_lefthand, + sjinfo->min_righthand); + } + else + { + sjinfo = NULL; + ojscope = NULL; + } + + /* Process the JOIN's qual clauses */ + foreach(l, my_quals) + { + Node *qual = (Node *) lfirst(l); + + distribute_qual_to_rels(root, qual, + false, below_outer_join, j->jointype, + root->qual_security_level, + *qualscope, + ojscope, nonnullable_rels, NULL, + postponed_qual_list); + } + + /* Now we can add the SpecialJoinInfo to join_info_list */ + if (sjinfo) + { + root->join_info_list = lappend(root->join_info_list, sjinfo); + /* Each time we do that, recheck placeholder eval levels */ + update_placeholder_eval_levels(root, sjinfo); + } + + /* + * Finally, compute the output joinlist. We fold subproblems together + * except at a FULL JOIN or where join_collapse_limit would be + * exceeded. + */ + if (j->jointype == JOIN_FULL) + { + /* force the join order exactly at this node */ + joinlist = list_make1(list_make2(leftjoinlist, rightjoinlist)); + } + else if (list_length(leftjoinlist) + list_length(rightjoinlist) <= + join_collapse_limit) + { + /* OK to combine subproblems */ + joinlist = list_concat(leftjoinlist, rightjoinlist); + } + else + { + /* can't combine, but needn't force join order above here */ + Node *leftpart, + *rightpart; + + /* avoid creating useless 1-element sublists */ + if (list_length(leftjoinlist) == 1) + leftpart = (Node *) linitial(leftjoinlist); + else + leftpart = (Node *) leftjoinlist; + if (list_length(rightjoinlist) == 1) + rightpart = (Node *) linitial(rightjoinlist); + else + rightpart = (Node *) rightjoinlist; + joinlist = list_make2(leftpart, rightpart); + } + } + else + { + elog(ERROR, "unrecognized node type: %d", + (int) nodeTag(jtnode)); + joinlist = NIL; /* keep compiler quiet */ + } + return joinlist; } /* @@ -1205,243 +1208,263 @@ static void mls_process_cls_quals(PlannerInfo *root, */ static SpecialJoinInfo * make_outerjoininfo(PlannerInfo *root, - Relids left_rels, Relids right_rels, - Relids inner_join_rels, - JoinType jointype, List *clause) -{// #lizard forgives - SpecialJoinInfo *sjinfo = makeNode(SpecialJoinInfo); - Relids clause_relids; - Relids strict_relids; - Relids min_lefthand; - Relids min_righthand; - ListCell *l; - - /* - * We should not see RIGHT JOIN here because left/right were switched - * earlier - */ - Assert(jointype != JOIN_INNER); - Assert(jointype != JOIN_RIGHT); - - /* - * Presently the executor cannot support FOR [KEY] UPDATE/SHARE marking of - * rels appearing on the nullable side of an outer join. (It's somewhat - * unclear what that would mean, anyway: what should we mark when a result - * row is generated from no element of the nullable relation?) 
So, - * complain if any nullable rel is FOR [KEY] UPDATE/SHARE. - * - * You might be wondering why this test isn't made far upstream in the - * parser. It's because the parser hasn't got enough info --- consider - * FOR UPDATE applied to a view. Only after rewriting and flattening do - * we know whether the view contains an outer join. - * - * We use the original RowMarkClause list here; the PlanRowMark list would - * list everything. - */ - foreach(l, root->parse->rowMarks) - { - RowMarkClause *rc = (RowMarkClause *) lfirst(l); - - if (bms_is_member(rc->rti, right_rels) || - (jointype == JOIN_FULL && bms_is_member(rc->rti, left_rels))) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - /*------ - translator: %s is a SQL row locking clause such as FOR UPDATE */ - errmsg("%s cannot be applied to the nullable side of an outer join", - LCS_asString(rc->strength)))); - } - - sjinfo->syn_lefthand = left_rels; - sjinfo->syn_righthand = right_rels; - sjinfo->jointype = jointype; - /* this always starts out false */ - sjinfo->delay_upper_joins = false; - - compute_semijoin_info(sjinfo, clause); - - /* If it's a full join, no need to be very smart */ - if (jointype == JOIN_FULL) - { - sjinfo->min_lefthand = bms_copy(left_rels); - sjinfo->min_righthand = bms_copy(right_rels); - sjinfo->lhs_strict = false; /* don't care about this */ - return sjinfo; - } - - /* - * Retrieve all relids mentioned within the join clause. - */ - clause_relids = pull_varnos((Node *) clause); - - /* - * For which relids is the clause strict, ie, it cannot succeed if the - * rel's columns are all NULL? - */ - strict_relids = find_nonnullable_rels((Node *) clause); - - /* Remember whether the clause is strict for any LHS relations */ - sjinfo->lhs_strict = bms_overlap(strict_relids, left_rels); - - /* - * Required LHS always includes the LHS rels mentioned in the clause. We - * may have to add more rels based on lower outer joins; see below. - */ - min_lefthand = bms_intersect(clause_relids, left_rels); - - /* - * Similarly for required RHS. But here, we must also include any lower - * inner joins, to ensure we don't try to commute with any of them. - */ - min_righthand = bms_int_members(bms_union(clause_relids, inner_join_rels), - right_rels); - - /* - * Now check previous outer joins for ordering restrictions. - */ - foreach(l, root->join_info_list) - { - SpecialJoinInfo *otherinfo = (SpecialJoinInfo *) lfirst(l); - - /* - * A full join is an optimization barrier: we can't associate into or - * out of it. Hence, if it overlaps either LHS or RHS of the current - * rel, expand that side's min relset to cover the whole full join. - */ - if (otherinfo->jointype == JOIN_FULL) - { - if (bms_overlap(left_rels, otherinfo->syn_lefthand) || - bms_overlap(left_rels, otherinfo->syn_righthand)) - { - min_lefthand = bms_add_members(min_lefthand, - otherinfo->syn_lefthand); - min_lefthand = bms_add_members(min_lefthand, - otherinfo->syn_righthand); - } - if (bms_overlap(right_rels, otherinfo->syn_lefthand) || - bms_overlap(right_rels, otherinfo->syn_righthand)) - { - min_righthand = bms_add_members(min_righthand, - otherinfo->syn_lefthand); - min_righthand = bms_add_members(min_righthand, - otherinfo->syn_righthand); - } - /* Needn't do anything else with the full join */ - continue; - } - - /* - * For a lower OJ in our LHS, if our join condition uses the lower - * join's RHS and is not strict for that rel, we must preserve the - * ordering of the two OJs, so add lower OJ's full syntactic relset to - * min_lefthand. 
(We must use its full syntactic relset, not just its - * min_lefthand + min_righthand. This is because there might be other - * OJs below this one that this one can commute with, but we cannot - * commute with them if we don't with this one.) Also, if the current - * join is a semijoin or antijoin, we must preserve ordering - * regardless of strictness. - * - * Note: I believe we have to insist on being strict for at least one - * rel in the lower OJ's min_righthand, not its whole syn_righthand. - */ - if (bms_overlap(left_rels, otherinfo->syn_righthand)) - { + Relids left_rels, Relids right_rels, + Relids inner_join_rels, + JoinType jointype, List *clause) +{ + SpecialJoinInfo *sjinfo = makeNode(SpecialJoinInfo); + Relids clause_relids; + Relids strict_relids; + Relids min_lefthand; + Relids min_righthand; + ListCell *l; + + /* + * We should not see RIGHT JOIN here because left/right were switched + * earlier + */ + Assert(jointype != JOIN_INNER); + Assert(jointype != JOIN_RIGHT); + + /* + * Presently the executor cannot support FOR [KEY] UPDATE/SHARE marking of + * rels appearing on the nullable side of an outer join. (It's somewhat + * unclear what that would mean, anyway: what should we mark when a result + * row is generated from no element of the nullable relation?) So, + * complain if any nullable rel is FOR [KEY] UPDATE/SHARE. + * + * You might be wondering why this test isn't made far upstream in the + * parser. It's because the parser hasn't got enough info --- consider + * FOR UPDATE applied to a view. Only after rewriting and flattening do + * we know whether the view contains an outer join. + * + * We use the original RowMarkClause list here; the PlanRowMark list would + * list everything. + */ + foreach(l, root->parse->rowMarks) + { + RowMarkClause *rc = (RowMarkClause *) lfirst(l); + + if (bms_is_member(rc->rti, right_rels) || + (jointype == JOIN_FULL && bms_is_member(rc->rti, left_rels))) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + /*------ + translator: %s is a SQL row locking clause such as FOR UPDATE */ + errmsg("%s cannot be applied to the nullable side of an outer join", + LCS_asString(rc->strength)))); + } + + sjinfo->syn_lefthand = left_rels; + sjinfo->syn_righthand = right_rels; + sjinfo->jointype = jointype; + /* this always starts out false */ + sjinfo->delay_upper_joins = false; + + compute_semijoin_info(sjinfo, clause); + + /* If it's a full join, no need to be very smart */ + if (jointype == JOIN_FULL) + { + sjinfo->min_lefthand = bms_copy(left_rels); + sjinfo->min_righthand = bms_copy(right_rels); + sjinfo->lhs_strict = false; /* don't care about this */ + return sjinfo; + } + + /* + * Retrieve all relids mentioned within the join clause. + */ + clause_relids = pull_varnos((Node *) clause); + + /* + * For which relids is the clause strict, ie, it cannot succeed if the + * rel's columns are all NULL? + */ + strict_relids = find_nonnullable_rels((Node *) clause); + + /* Remember whether the clause is strict for any LHS relations */ + sjinfo->lhs_strict = bms_overlap(strict_relids, left_rels); + + /* + * Required LHS always includes the LHS rels mentioned in the clause. We + * may have to add more rels based on lower outer joins; see below. + */ + min_lefthand = bms_intersect(clause_relids, left_rels); + + /* + * Similarly for required RHS. But here, we must also include any lower + * inner joins, to ensure we don't try to commute with any of them. 
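+	 *
+	 * For example, in the hypothetical
+	 *
+	 *	SELECT *
+	 *	  FROM a
+	 *	  LEFT JOIN (b JOIN c ON b.id = c.id) ON a.id = b.id;
+	 *
+	 * min_righthand becomes {b, c}: doing the outer join against b alone
+	 * and joining c afterwards could drop a-rows whose b side was nulled,
+	 * so the lower inner join has to stay inside the outer join's RHS.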
+ */ + min_righthand = bms_int_members(bms_union(clause_relids, inner_join_rels), + right_rels); + + /* + * Now check previous outer joins for ordering restrictions. + */ + foreach(l, root->join_info_list) + { + SpecialJoinInfo *otherinfo = (SpecialJoinInfo *) lfirst(l); + + /* + * A full join is an optimization barrier: we can't associate into or + * out of it. Hence, if it overlaps either LHS or RHS of the current + * rel, expand that side's min relset to cover the whole full join. + */ + if (otherinfo->jointype == JOIN_FULL) + { + if (bms_overlap(left_rels, otherinfo->syn_lefthand) || + bms_overlap(left_rels, otherinfo->syn_righthand)) + { + min_lefthand = bms_add_members(min_lefthand, + otherinfo->syn_lefthand); + min_lefthand = bms_add_members(min_lefthand, + otherinfo->syn_righthand); + } + if (bms_overlap(right_rels, otherinfo->syn_lefthand) || + bms_overlap(right_rels, otherinfo->syn_righthand)) + { + min_righthand = bms_add_members(min_righthand, + otherinfo->syn_lefthand); + min_righthand = bms_add_members(min_righthand, + otherinfo->syn_righthand); + } + /* Needn't do anything else with the full join */ + continue; + } + + /* + * For a lower OJ in our LHS, if our join condition uses the lower + * join's RHS and is not strict for that rel, we must preserve the + * ordering of the two OJs, so add lower OJ's full syntactic relset to + * min_lefthand. (We must use its full syntactic relset, not just its + * min_lefthand + min_righthand. This is because there might be other + * OJs below this one that this one can commute with, but we cannot + * commute with them if we don't with this one.) Also, if the current + * join is a semijoin or antijoin, we must preserve ordering + * regardless of strictness. + * + * Note: I believe we have to insist on being strict for at least one + * rel in the lower OJ's min_righthand, not its whole syn_righthand. + */ + if (bms_overlap(left_rels, otherinfo->syn_righthand)) + { +#ifdef __TBASE__ if (bms_overlap(clause_relids, otherinfo->syn_righthand) && - (jointype == JOIN_SEMI || jointype == JOIN_ANTI || + (jointype == JOIN_SEMI || + jointype == JOIN_ANTI || + jointype == JOIN_LEFT_SCALAR || !bms_overlap(strict_relids, otherinfo->min_righthand))) - { - min_lefthand = bms_add_members(min_lefthand, - otherinfo->syn_lefthand); - min_lefthand = bms_add_members(min_lefthand, - otherinfo->syn_righthand); - } - } - - /* - * For a lower OJ in our RHS, if our join condition does not use the - * lower join's RHS and the lower OJ's join condition is strict, we - * can interchange the ordering of the two OJs; otherwise we must add - * the lower OJ's full syntactic relset to min_righthand. - * - * Also, if our join condition does not use the lower join's LHS - * either, force the ordering to be preserved. Otherwise we can end - * up with SpecialJoinInfos with identical min_righthands, which can - * confuse join_is_legal (see discussion in backend/optimizer/README). - * - * Also, we must preserve ordering anyway if either the current join - * or the lower OJ is either a semijoin or an antijoin. - * - * Here, we have to consider that "our join condition" includes any - * clauses that syntactically appeared above the lower OJ and below - * ours; those are equivalent to degenerate clauses in our OJ and must - * be treated as such. Such clauses obviously can't reference our - * LHS, and they must be non-strict for the lower OJ's RHS (else - * reduce_outer_joins would have reduced the lower OJ to a plain - * join). 
Hence the other ways in which we handle clauses within our - * join condition are not affected by them. The net effect is - * therefore sufficiently represented by the delay_upper_joins flag - * saved for us by check_outerjoin_delay. - */ - if (bms_overlap(right_rels, otherinfo->syn_righthand)) - { - if (bms_overlap(clause_relids, otherinfo->syn_righthand) || - !bms_overlap(clause_relids, otherinfo->min_lefthand) || - jointype == JOIN_SEMI || - jointype == JOIN_ANTI || - otherinfo->jointype == JOIN_SEMI || - otherinfo->jointype == JOIN_ANTI || - !otherinfo->lhs_strict || otherinfo->delay_upper_joins) - { - min_righthand = bms_add_members(min_righthand, - otherinfo->syn_lefthand); - min_righthand = bms_add_members(min_righthand, - otherinfo->syn_righthand); - } - } - } - - /* - * Examine PlaceHolderVars. If a PHV is supposed to be evaluated within - * this join's nullable side, then ensure that min_righthand contains the - * full eval_at set of the PHV. This ensures that the PHV actually can be - * evaluated within the RHS. Note that this works only because we should - * already have determined the final eval_at level for any PHV - * syntactically within this join. - */ - foreach(l, root->placeholder_list) - { - PlaceHolderInfo *phinfo = (PlaceHolderInfo *) lfirst(l); - Relids ph_syn_level = phinfo->ph_var->phrels; - - /* Ignore placeholder if it didn't syntactically come from RHS */ - if (!bms_is_subset(ph_syn_level, right_rels)) - continue; - - /* Else, prevent join from being formed before we eval the PHV */ - min_righthand = bms_add_members(min_righthand, phinfo->ph_eval_at); - } - - /* - * If we found nothing to put in min_lefthand, punt and make it the full - * LHS, to avoid having an empty min_lefthand which will confuse later - * processing. (We don't try to be smart about such cases, just correct.) - * Likewise for min_righthand. - */ - if (bms_is_empty(min_lefthand)) - min_lefthand = bms_copy(left_rels); - if (bms_is_empty(min_righthand)) - min_righthand = bms_copy(right_rels); - - /* Now they'd better be nonempty */ - Assert(!bms_is_empty(min_lefthand)); - Assert(!bms_is_empty(min_righthand)); - /* Shouldn't overlap either */ - Assert(!bms_overlap(min_lefthand, min_righthand)); - - sjinfo->min_lefthand = min_lefthand; - sjinfo->min_righthand = min_righthand; - - return sjinfo; +#else + if (bms_overlap(clause_relids, otherinfo->syn_righthand) && + (jointype == JOIN_SEMI || jointype == JOIN_ANTI || + !bms_overlap(strict_relids, otherinfo->min_righthand))) +#endif + { + min_lefthand = bms_add_members(min_lefthand, + otherinfo->syn_lefthand); + min_lefthand = bms_add_members(min_lefthand, + otherinfo->syn_righthand); + } + } + + /* + * For a lower OJ in our RHS, if our join condition does not use the + * lower join's RHS and the lower OJ's join condition is strict, we + * can interchange the ordering of the two OJs; otherwise we must add + * the lower OJ's full syntactic relset to min_righthand. + * + * Also, if our join condition does not use the lower join's LHS + * either, force the ordering to be preserved. Otherwise we can end + * up with SpecialJoinInfos with identical min_righthands, which can + * confuse join_is_legal (see discussion in backend/optimizer/README). + * + * Also, we must preserve ordering anyway if either the current join + * or the lower OJ is either a semijoin or an antijoin. 
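+		 *
+		 * For instance, in the hypothetical
+		 *
+		 *	SELECT *
+		 *	  FROM a
+		 *	  LEFT JOIN (b LEFT JOIN c ON b.y = c.y) ON a.x = b.x;
+		 *
+		 * the upper condition never mentions c and b.y = c.y is strict,
+		 * so (a LEFT JOIN b) LEFT JOIN c is an equally valid order; with
+		 * a semijoin or antijoin on top that freedom goes away.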
+ * + * Here, we have to consider that "our join condition" includes any + * clauses that syntactically appeared above the lower OJ and below + * ours; those are equivalent to degenerate clauses in our OJ and must + * be treated as such. Such clauses obviously can't reference our + * LHS, and they must be non-strict for the lower OJ's RHS (else + * reduce_outer_joins would have reduced the lower OJ to a plain + * join). Hence the other ways in which we handle clauses within our + * join condition are not affected by them. The net effect is + * therefore sufficiently represented by the delay_upper_joins flag + * saved for us by check_outerjoin_delay. + */ + if (bms_overlap(right_rels, otherinfo->syn_righthand)) + { +#ifdef __TBASE__ + if (bms_overlap(clause_relids, otherinfo->syn_righthand) || + !bms_overlap(clause_relids, otherinfo->min_lefthand) || + jointype == JOIN_SEMI || + jointype == JOIN_ANTI || + jointype == JOIN_LEFT_SCALAR || + otherinfo->jointype == JOIN_SEMI || + otherinfo->jointype == JOIN_ANTI || + otherinfo->jointype == JOIN_LEFT_SCALAR || + !otherinfo->lhs_strict || otherinfo->delay_upper_joins) +#else + if (bms_overlap(clause_relids, otherinfo->syn_righthand) || + !bms_overlap(clause_relids, otherinfo->min_lefthand) || + jointype == JOIN_SEMI || + jointype == JOIN_ANTI || + otherinfo->jointype == JOIN_SEMI || + otherinfo->jointype == JOIN_ANTI || + !otherinfo->lhs_strict || otherinfo->delay_upper_joins) +#endif + { + min_righthand = bms_add_members(min_righthand, + otherinfo->syn_lefthand); + min_righthand = bms_add_members(min_righthand, + otherinfo->syn_righthand); + } + } + } + + /* + * Examine PlaceHolderVars. If a PHV is supposed to be evaluated within + * this join's nullable side, then ensure that min_righthand contains the + * full eval_at set of the PHV. This ensures that the PHV actually can be + * evaluated within the RHS. Note that this works only because we should + * already have determined the final eval_at level for any PHV + * syntactically within this join. + */ + foreach(l, root->placeholder_list) + { + PlaceHolderInfo *phinfo = (PlaceHolderInfo *) lfirst(l); + Relids ph_syn_level = phinfo->ph_var->phrels; + + /* Ignore placeholder if it didn't syntactically come from RHS */ + if (!bms_is_subset(ph_syn_level, right_rels)) + continue; + + /* Else, prevent join from being formed before we eval the PHV */ + min_righthand = bms_add_members(min_righthand, phinfo->ph_eval_at); + } + + /* + * If we found nothing to put in min_lefthand, punt and make it the full + * LHS, to avoid having an empty min_lefthand which will confuse later + * processing. (We don't try to be smart about such cases, just correct.) + * Likewise for min_righthand. 
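+	 *
+	 * (A degenerate join clause can make that happen, e.g. the
+	 * hypothetical
+	 *
+	 *	SELECT * FROM a LEFT JOIN b ON b.flag;
+	 *
+	 * mentions no LHS rel at all, so min_lefthand would otherwise come
+	 * out empty.)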
+ */ + if (bms_is_empty(min_lefthand)) + min_lefthand = bms_copy(left_rels); + if (bms_is_empty(min_righthand)) + min_righthand = bms_copy(right_rels); + + /* Now they'd better be nonempty */ + Assert(!bms_is_empty(min_lefthand)); + Assert(!bms_is_empty(min_righthand)); + /* Shouldn't overlap either */ + Assert(!bms_overlap(min_lefthand, min_righthand)); + + sjinfo->min_lefthand = min_lefthand; + sjinfo->min_righthand = min_righthand; + + return sjinfo; } /* @@ -1453,171 +1476,175 @@ make_outerjoininfo(PlannerInfo *root, */ static void compute_semijoin_info(SpecialJoinInfo *sjinfo, List *clause) -{// #lizard forgives - List *semi_operators; - List *semi_rhs_exprs; - bool all_btree; - bool all_hash; - ListCell *lc; - - /* Initialize semijoin-related fields in case we can't unique-ify */ - sjinfo->semi_can_btree = false; - sjinfo->semi_can_hash = false; - sjinfo->semi_operators = NIL; - sjinfo->semi_rhs_exprs = NIL; - - /* Nothing more to do if it's not a semijoin */ - if (sjinfo->jointype != JOIN_SEMI) - return; - - /* - * Look to see whether the semijoin's join quals consist of AND'ed - * equality operators, with (only) RHS variables on only one side of each - * one. If so, we can figure out how to enforce uniqueness for the RHS. - * - * Note that the input clause list is the list of quals that are - * *syntactically* associated with the semijoin, which in practice means - * the synthesized comparison list for an IN or the WHERE of an EXISTS. - * Particularly in the latter case, it might contain clauses that aren't - * *semantically* associated with the join, but refer to just one side or - * the other. We can ignore such clauses here, as they will just drop - * down to be processed within one side or the other. (It is okay to - * consider only the syntactically-associated clauses here because for a - * semijoin, no higher-level quals could refer to the RHS, and so there - * can be no other quals that are semantically associated with this join. - * We do things this way because it is useful to have the set of potential - * unique-ification expressions before we can extract the list of quals - * that are actually semantically associated with the particular join.) - * - * Note that the semi_operators list consists of the joinqual operators - * themselves (but commuted if needed to put the RHS value on the right). - * These could be cross-type operators, in which case the operator - * actually needed for uniqueness is a related single-type operator. We - * assume here that that operator will be available from the btree or hash - * opclass when the time comes ... if not, create_unique_plan() will fail. - */ - semi_operators = NIL; - semi_rhs_exprs = NIL; - all_btree = true; - all_hash = enable_hashagg; /* don't consider hash if not enabled */ - foreach(lc, clause) - { - OpExpr *op = (OpExpr *) lfirst(lc); - Oid opno; - Node *left_expr; - Node *right_expr; - Relids left_varnos; - Relids right_varnos; - Relids all_varnos; - Oid opinputtype; - - /* Is it a binary opclause? */ - if (!IsA(op, OpExpr) || - list_length(op->args) != 2) - { - /* No, but does it reference both sides? */ - all_varnos = pull_varnos((Node *) op); - if (!bms_overlap(all_varnos, sjinfo->syn_righthand) || - bms_is_subset(all_varnos, sjinfo->syn_righthand)) - { - /* - * Clause refers to only one rel, so ignore it --- unless it - * contains volatile functions, in which case we'd better - * punt. 
- */ - if (contain_volatile_functions((Node *) op)) - return; - continue; - } - /* Non-operator clause referencing both sides, must punt */ - return; - } - - /* Extract data from binary opclause */ - opno = op->opno; - left_expr = linitial(op->args); - right_expr = lsecond(op->args); - left_varnos = pull_varnos(left_expr); - right_varnos = pull_varnos(right_expr); - all_varnos = bms_union(left_varnos, right_varnos); - opinputtype = exprType(left_expr); - - /* Does it reference both sides? */ - if (!bms_overlap(all_varnos, sjinfo->syn_righthand) || - bms_is_subset(all_varnos, sjinfo->syn_righthand)) - { - /* - * Clause refers to only one rel, so ignore it --- unless it - * contains volatile functions, in which case we'd better punt. - */ - if (contain_volatile_functions((Node *) op)) - return; - continue; - } - - /* check rel membership of arguments */ - if (!bms_is_empty(right_varnos) && - bms_is_subset(right_varnos, sjinfo->syn_righthand) && - !bms_overlap(left_varnos, sjinfo->syn_righthand)) - { - /* typical case, right_expr is RHS variable */ - } - else if (!bms_is_empty(left_varnos) && - bms_is_subset(left_varnos, sjinfo->syn_righthand) && - !bms_overlap(right_varnos, sjinfo->syn_righthand)) - { - /* flipped case, left_expr is RHS variable */ - opno = get_commutator(opno); - if (!OidIsValid(opno)) - return; - right_expr = left_expr; - } - else - { - /* mixed membership of args, punt */ - return; - } - - /* all operators must be btree equality or hash equality */ - if (all_btree) - { - /* oprcanmerge is considered a hint... */ - if (!op_mergejoinable(opno, opinputtype) || - get_mergejoin_opfamilies(opno) == NIL) - all_btree = false; - } - if (all_hash) - { - /* ... but oprcanhash had better be correct */ - if (!op_hashjoinable(opno, opinputtype)) - all_hash = false; - } - if (!(all_btree || all_hash)) - return; - - /* so far so good, keep building lists */ - semi_operators = lappend_oid(semi_operators, opno); - semi_rhs_exprs = lappend(semi_rhs_exprs, copyObject(right_expr)); - } - - /* Punt if we didn't find at least one column to unique-ify */ - if (semi_rhs_exprs == NIL) - return; - - /* - * The expressions we'd need to unique-ify mustn't be volatile. - */ - if (contain_volatile_functions((Node *) semi_rhs_exprs)) - return; - - /* - * If we get here, we can unique-ify the semijoin's RHS using at least one - * of sorting and hashing. Save the information about how to do that. - */ - sjinfo->semi_can_btree = all_btree; - sjinfo->semi_can_hash = all_hash; - sjinfo->semi_operators = semi_operators; - sjinfo->semi_rhs_exprs = semi_rhs_exprs; +{ + List *semi_operators; + List *semi_rhs_exprs; + bool all_btree; + bool all_hash; + ListCell *lc; + + /* Initialize semijoin-related fields in case we can't unique-ify */ + sjinfo->semi_can_btree = false; + sjinfo->semi_can_hash = false; + sjinfo->semi_operators = NIL; + sjinfo->semi_rhs_exprs = NIL; + + /* Nothing more to do if it's not a semijoin */ +#ifdef __TBASE__ + if (sjinfo->jointype != JOIN_SEMI && sjinfo->jointype != JOIN_LEFT_SCALAR) +#else + if (sjinfo->jointype != JOIN_SEMI) +#endif + return; + + /* + * Look to see whether the semijoin's join quals consist of AND'ed + * equality operators, with (only) RHS variables on only one side of each + * one. If so, we can figure out how to enforce uniqueness for the RHS. + * + * Note that the input clause list is the list of quals that are + * *syntactically* associated with the semijoin, which in practice means + * the synthesized comparison list for an IN or the WHERE of an EXISTS. 
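+	 *
+	 * (For instance, a hypothetical "o.cid IN (SELECT c.id FROM c)"
+	 * arrives here as the synthesized clause o.cid = c.id, while
+	 * "EXISTS (SELECT 1 FROM c WHERE c.id = o.cid)" arrives as the quals
+	 * of that WHERE clause.)
+	 *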
+ * Particularly in the latter case, it might contain clauses that aren't + * *semantically* associated with the join, but refer to just one side or + * the other. We can ignore such clauses here, as they will just drop + * down to be processed within one side or the other. (It is okay to + * consider only the syntactically-associated clauses here because for a + * semijoin, no higher-level quals could refer to the RHS, and so there + * can be no other quals that are semantically associated with this join. + * We do things this way because it is useful to have the set of potential + * unique-ification expressions before we can extract the list of quals + * that are actually semantically associated with the particular join.) + * + * Note that the semi_operators list consists of the joinqual operators + * themselves (but commuted if needed to put the RHS value on the right). + * These could be cross-type operators, in which case the operator + * actually needed for uniqueness is a related single-type operator. We + * assume here that that operator will be available from the btree or hash + * opclass when the time comes ... if not, create_unique_plan() will fail. + */ + semi_operators = NIL; + semi_rhs_exprs = NIL; + all_btree = true; + all_hash = enable_hashagg; /* don't consider hash if not enabled */ + foreach(lc, clause) + { + OpExpr *op = (OpExpr *) lfirst(lc); + Oid opno; + Node *left_expr; + Node *right_expr; + Relids left_varnos; + Relids right_varnos; + Relids all_varnos; + Oid opinputtype; + + /* Is it a binary opclause? */ + if (!IsA(op, OpExpr) || + list_length(op->args) != 2) + { + /* No, but does it reference both sides? */ + all_varnos = pull_varnos((Node *) op); + if (!bms_overlap(all_varnos, sjinfo->syn_righthand) || + bms_is_subset(all_varnos, sjinfo->syn_righthand)) + { + /* + * Clause refers to only one rel, so ignore it --- unless it + * contains volatile functions, in which case we'd better + * punt. + */ + if (contain_volatile_functions((Node *) op)) + return; + continue; + } + /* Non-operator clause referencing both sides, must punt */ + return; + } + + /* Extract data from binary opclause */ + opno = op->opno; + left_expr = linitial(op->args); + right_expr = lsecond(op->args); + left_varnos = pull_varnos(left_expr); + right_varnos = pull_varnos(right_expr); + all_varnos = bms_union(left_varnos, right_varnos); + opinputtype = exprType(left_expr); + + /* Does it reference both sides? */ + if (!bms_overlap(all_varnos, sjinfo->syn_righthand) || + bms_is_subset(all_varnos, sjinfo->syn_righthand)) + { + /* + * Clause refers to only one rel, so ignore it --- unless it + * contains volatile functions, in which case we'd better punt. + */ + if (contain_volatile_functions((Node *) op)) + return; + continue; + } + + /* check rel membership of arguments */ + if (!bms_is_empty(right_varnos) && + bms_is_subset(right_varnos, sjinfo->syn_righthand) && + !bms_overlap(left_varnos, sjinfo->syn_righthand)) + { + /* typical case, right_expr is RHS variable */ + } + else if (!bms_is_empty(left_varnos) && + bms_is_subset(left_varnos, sjinfo->syn_righthand) && + !bms_overlap(right_varnos, sjinfo->syn_righthand)) + { + /* flipped case, left_expr is RHS variable */ + opno = get_commutator(opno); + if (!OidIsValid(opno)) + return; + right_expr = left_expr; + } + else + { + /* mixed membership of args, punt */ + return; + } + + /* all operators must be btree equality or hash equality */ + if (all_btree) + { + /* oprcanmerge is considered a hint... 
*/ + if (!op_mergejoinable(opno, opinputtype) || + get_mergejoin_opfamilies(opno) == NIL) + all_btree = false; + } + if (all_hash) + { + /* ... but oprcanhash had better be correct */ + if (!op_hashjoinable(opno, opinputtype)) + all_hash = false; + } + if (!(all_btree || all_hash)) + return; + + /* so far so good, keep building lists */ + semi_operators = lappend_oid(semi_operators, opno); + semi_rhs_exprs = lappend(semi_rhs_exprs, copyObject(right_expr)); + } + + /* Punt if we didn't find at least one column to unique-ify */ + if (semi_rhs_exprs == NIL) + return; + + /* + * The expressions we'd need to unique-ify mustn't be volatile. + */ + if (contain_volatile_functions((Node *) semi_rhs_exprs)) + return; + + /* + * If we get here, we can unique-ify the semijoin's RHS using at least one + * of sorting and hashing. Save the information about how to do that. + */ + sjinfo->semi_can_btree = all_btree; + sjinfo->semi_can_hash = all_hash; + sjinfo->semi_operators = semi_operators; + sjinfo->semi_rhs_exprs = semi_rhs_exprs; } diff --git a/src/backend/optimizer/plan/setrefs.c b/src/backend/optimizer/plan/setrefs.c index 175058f9..9f1be6e4 100644 --- a/src/backend/optimizer/plan/setrefs.c +++ b/src/backend/optimizer/plan/setrefs.c @@ -1653,119 +1653,122 @@ fix_scan_expr_walker(Node *node, fix_scan_expr_context *context) */ static void set_join_references(PlannerInfo *root, Join *join, int rtoffset) -{// #lizard forgives - Plan *outer_plan = join->plan.lefttree; - Plan *inner_plan = join->plan.righttree; - indexed_tlist *outer_itlist; - indexed_tlist *inner_itlist; - - outer_itlist = build_tlist_index(outer_plan->targetlist); - inner_itlist = build_tlist_index(inner_plan->targetlist); - - /* - * First process the joinquals (including merge or hash clauses). These - * are logically below the join so they can always use all values - * available from the input tlists. It's okay to also handle - * NestLoopParams now, because those couldn't refer to nullable - * subexpressions. - */ - join->joinqual = fix_join_expr(root, - join->joinqual, - outer_itlist, - inner_itlist, - (Index) 0, - rtoffset); - - /* Now do join-type-specific stuff */ - if (IsA(join, NestLoop)) - { - NestLoop *nl = (NestLoop *) join; - ListCell *lc; - - foreach(lc, nl->nestParams) - { - NestLoopParam *nlp = (NestLoopParam *) lfirst(lc); - - nlp->paramval = (Var *) fix_upper_expr(root, - (Node *) nlp->paramval, - outer_itlist, - OUTER_VAR, - rtoffset); - - /* Check we replaced any PlaceHolderVar with simple Var */ - if (!(IsA(nlp->paramval, Var) && - nlp->paramval->varno == OUTER_VAR)) - elog(ERROR, "NestLoopParam was not reduced to a simple Var"); - } - } - else if (IsA(join, MergeJoin)) - { - MergeJoin *mj = (MergeJoin *) join; - - mj->mergeclauses = fix_join_expr(root, - mj->mergeclauses, - outer_itlist, - inner_itlist, - (Index) 0, - rtoffset); - } - else if (IsA(join, HashJoin)) - { - HashJoin *hj = (HashJoin *) join; - - hj->hashclauses = fix_join_expr(root, - hj->hashclauses, - outer_itlist, - inner_itlist, - (Index) 0, - rtoffset); - } +{ + Plan *outer_plan = join->plan.lefttree; + Plan *inner_plan = join->plan.righttree; + indexed_tlist *outer_itlist; + indexed_tlist *inner_itlist; + + outer_itlist = build_tlist_index(outer_plan->targetlist); + inner_itlist = build_tlist_index(inner_plan->targetlist); + + /* + * First process the joinquals (including merge or hash clauses). These + * are logically below the join so they can always use all values + * available from the input tlists. 
It's okay to also handle + * NestLoopParams now, because those couldn't refer to nullable + * subexpressions. + */ + join->joinqual = fix_join_expr(root, + join->joinqual, + outer_itlist, + inner_itlist, + (Index) 0, + rtoffset); + + /* Now do join-type-specific stuff */ + if (IsA(join, NestLoop)) + { + NestLoop *nl = (NestLoop *) join; + ListCell *lc; + + foreach(lc, nl->nestParams) + { + NestLoopParam *nlp = (NestLoopParam *) lfirst(lc); + + nlp->paramval = (Var *) fix_upper_expr(root, + (Node *) nlp->paramval, + outer_itlist, + OUTER_VAR, + rtoffset); + + /* Check we replaced any PlaceHolderVar with simple Var */ + if (!(IsA(nlp->paramval, Var) && + nlp->paramval->varno == OUTER_VAR)) + elog(ERROR, "NestLoopParam was not reduced to a simple Var"); + } + } + else if (IsA(join, MergeJoin)) + { + MergeJoin *mj = (MergeJoin *) join; + + mj->mergeclauses = fix_join_expr(root, + mj->mergeclauses, + outer_itlist, + inner_itlist, + (Index) 0, + rtoffset); + } + else if (IsA(join, HashJoin)) + { + HashJoin *hj = (HashJoin *) join; + + hj->hashclauses = fix_join_expr(root, + hj->hashclauses, + outer_itlist, + inner_itlist, + (Index) 0, + rtoffset); + } - /* - * Now we need to fix up the targetlist and qpqual, which are logically - * above the join. This means they should not re-use any input expression - * that was computed in the nullable side of an outer join. Vars and - * PlaceHolderVars are fine, so we can implement this restriction just by - * clearing has_non_vars in the indexed_tlist structs. - * - * XXX This is a grotty workaround for the fact that we don't clearly - * distinguish between a Var appearing below an outer join and the "same" - * Var appearing above it. If we did, we'd not need to hack the matching - * rules this way. - */ - switch (join->jointype) - { - case JOIN_LEFT: - case JOIN_SEMI: - case JOIN_ANTI: - inner_itlist->has_non_vars = false; - break; - case JOIN_RIGHT: - outer_itlist->has_non_vars = false; - break; - case JOIN_FULL: - outer_itlist->has_non_vars = false; - inner_itlist->has_non_vars = false; - break; - default: - break; - } + /* + * Now we need to fix up the targetlist and qpqual, which are logically + * above the join. This means they should not re-use any input expression + * that was computed in the nullable side of an outer join. Vars and + * PlaceHolderVars are fine, so we can implement this restriction just by + * clearing has_non_vars in the indexed_tlist structs. + * + * XXX This is a grotty workaround for the fact that we don't clearly + * distinguish between a Var appearing below an outer join and the "same" + * Var appearing above it. If we did, we'd not need to hack the matching + * rules this way. 
+ */ + switch (join->jointype) + { + case JOIN_LEFT: + case JOIN_SEMI: + case JOIN_ANTI: +#ifdef __TBASE__ + case JOIN_LEFT_SCALAR: +#endif + inner_itlist->has_non_vars = false; + break; + case JOIN_RIGHT: + outer_itlist->has_non_vars = false; + break; + case JOIN_FULL: + outer_itlist->has_non_vars = false; + inner_itlist->has_non_vars = false; + break; + default: + break; + } - join->plan.targetlist = fix_join_expr(root, - join->plan.targetlist, - outer_itlist, - inner_itlist, - (Index) 0, - rtoffset); - join->plan.qual = fix_join_expr(root, - join->plan.qual, - outer_itlist, - inner_itlist, - (Index) 0, - rtoffset); - - pfree(outer_itlist); - pfree(inner_itlist); + join->plan.targetlist = fix_join_expr(root, + join->plan.targetlist, + outer_itlist, + inner_itlist, + (Index) 0, + rtoffset); + join->plan.qual = fix_join_expr(root, + join->plan.qual, + outer_itlist, + inner_itlist, + (Index) 0, + rtoffset); + + pfree(outer_itlist); + pfree(inner_itlist); } /* diff --git a/src/backend/optimizer/plan/subselect.c b/src/backend/optimizer/plan/subselect.c index 1ebdfefc..bff6e3fd 100644 --- a/src/backend/optimizer/plan/subselect.c +++ b/src/backend/optimizer/plan/subselect.c @@ -1641,7 +1641,9 @@ append_var_to_subquery_targetlist(Var *var, List *targetList, TargetEntry **targ ent->resno = varno; var->varattno = var->varoattno = varno; - *target = ent; + + if(target != NULL) + *target = ent; return targetList; } @@ -2605,8 +2607,6 @@ convert_EXPR_sublink_to_join(PlannerInfo *root, OpExpr *expr, ent->resno = varno; - //var->varattno = var->varoattno = varno; - /* determine the eqop and optional sortop */ get_sort_group_operators(restype, false, true, false, @@ -2771,6 +2771,123 @@ get_or_exist_subquery_targetlist(PlannerInfo *root, Node *node, List **targetLis return node; } +#ifdef __TBASE__ +/* + * convert_TargetList_sublink_to_join : + * try to convert an EXISTS SubLink in targetlist to a join + * On success, it returns not NULL. + */ +TargetEntry * +convert_TargetList_sublink_to_join(PlannerInfo *root, TargetEntry *entry) +{ + Query *parse = root->parse; + Node *whereClause = NULL; + Query *subselect = NULL; + JoinExpr *joinExpr = NULL; + ParseState *pstate = NULL; + SubLink *sublink = NULL; + RangeTblRef *rtr = NULL; + RangeTblEntry *rte = NULL; + Var *var = NULL; + + /* Sanity check */ + if (!IsA(entry->expr, SubLink)) + return NULL; + + sublink = (SubLink *) entry->expr; + if (sublink->subLinkType != EXPR_SUBLINK) + return NULL; + + /* + * Copy object so that we can modify it. + */ + subselect = copyObject((Query *) sublink->subselect); + whereClause = subselect->jointree->quals; + + /* + * Only one targetEntry can be handled. + */ + if (list_length(subselect->targetList) > 1) + return NULL; + + /* + * The subquery must have a nonempty jointree, else we won't have a join. + */ + if (subselect->jointree->fromlist == NIL) + return NULL; + + /* + * What we can not optimize. + */ + if (subselect->commandType != CMD_SELECT || + subselect->hasAggs || subselect->hasDistinctOn || + subselect->setOperations || subselect->groupingSets || + subselect->groupClause || subselect->hasWindowFuncs || + subselect->hasTargetSRFs || subselect->hasModifyingCTE || + subselect->havingQual || subselect->limitOffset || + subselect->limitCount || subselect->rowMarks || + subselect->cteList || subselect->sortClause) + { + return NULL; + } + + /* + * On one hand, the WHERE clause must contain some Vars of the + * parent query, else it's not gonna be a join. 
+ */ + if (!contain_vars_of_level(whereClause, 1)) + return NULL; + + /* + * We don't risk optimizing if the WHERE clause is volatile, either. + */ + if (contain_volatile_functions(whereClause)) + return NULL; + + /* + * The rest of the sub-select must not refer to any Vars of the parent + * query. (Vars of higher levels should be okay, though.) + */ + if (contain_vars_of_level((Node *) subselect, 1)) + return NULL; + + /* + * Move sub-select to the parent query. + */ + pstate = make_parsestate(NULL); + rte = addRangeTableEntryForSubquery(pstate, + subselect, + makeAlias("TARGETLIST_subquery", NIL), + true, + false); + parse->rtable = lappend(parse->rtable, rte); + + rtr = makeNode(RangeTblRef); + rtr->rtindex = list_length(parse->rtable); + + /* + * Form join node. + */ + joinExpr = makeNode(JoinExpr); + joinExpr->jointype = JOIN_LEFT_SCALAR; + joinExpr->isNatural = false; + joinExpr->larg = (Node *) root->parse->jointree; + joinExpr->rarg = (Node *) rtr; + joinExpr->usingClause = NIL; + joinExpr->alias = NULL; + joinExpr->rtindex = 0; /* we don't need an RTE for it */ + joinExpr->quals = NULL; + + /* Wrap join node in FromExpr as required. */ + parse->jointree = makeFromExpr(list_make1(joinExpr), NULL); + + /* Replace sublink node with Var. */ + var = makeVarFromTargetEntry(rtr->rtindex, linitial(subselect->targetList)); + entry->expr = (Expr *) var; + return entry; +} +#endif + static Expr * convert_OR_EXIST_sublink_to_join(PlannerInfo *root, SubLink *sublink, Node **jtlink) { @@ -2825,7 +2942,9 @@ convert_OR_EXIST_sublink_to_join(PlannerInfo *root, SubLink *sublink, Node **jtl Oid restype; SortGroupClause *grpcl; TargetEntry *entry; - subselect->targetList = append_var_to_subquery_targetlist((Var *)lfirst(cell), subselect->targetList, &entry); + + subselect->targetList = append_var_to_subquery_targetlist((Var *)lfirst(cell), + subselect->targetList, &entry); restype = exprType((Node *)entry->expr); get_sort_group_operators(restype, false, true, false, diff --git a/src/backend/optimizer/prep/prepjointree.c b/src/backend/optimizer/prep/prepjointree.c index 824d6e0a..b7cbd5c0 100644 --- a/src/backend/optimizer/prep/prepjointree.c +++ b/src/backend/optimizer/prep/prepjointree.c @@ -216,22 +216,48 @@ static bool check_pull_up_sublinks_qual_or_recurse(PlannerInfo *root, Node *node void pull_up_sublinks(PlannerInfo *root) { - Node *jtnode; - Relids relids; - - /* Begin recursion through the jointree */ - jtnode = pull_up_sublinks_jointree_recurse(root, - (Node *) root->parse->jointree, - &relids); + Node *jtnode; + Relids relids; +#ifdef __TBASE__ /* - * root->parse->jointree must always be a FromExpr, so insert a dummy one - * if we got a bare RangeTblRef or JoinExpr out of the recursion. + * Look for SubLinks in targetlist, and try to transform them into joins. 
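+	 * As an illustrative note (the sample query is the one exercised by the
+	 * subselect regression test later in this diff), a scalar sublink in the
+	 * targetlist such as
+	 *
+	 *     select a.a, (select b.a from tbl_b b where b.a = a.a) from tbl_a a;
+	 *
+	 * is converted into a JOIN_LEFT_SCALAR join between tbl_a and the
+	 * sub-select: unmatched outer rows are kept (NULL-extended), and an
+	 * error is still raised if more than one inner row matches a given
+	 * outer row.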
*/ - if (IsA(jtnode, FromExpr)) - root->parse->jointree = (FromExpr *) jtnode; - else - root->parse->jointree = makeFromExpr(list_make1(jtnode), NULL); + if(enable_pullup_subquery) + { + List *new_targetList = NIL; + ListCell *lc = NULL; + TargetEntry *entry = NULL; + TargetEntry *new_entry = NULL; + + foreach(lc, root->parse->targetList) + { + entry = (TargetEntry *) lfirst(lc); + + new_entry = convert_TargetList_sublink_to_join(root, entry); + if (new_entry) + new_targetList = lappend(new_targetList, new_entry); + else + new_targetList = lappend(new_targetList, entry); + } + + root->parse->targetList = new_targetList; + } +#endif + + /* Begin recursion through the jointree */ + jtnode = pull_up_sublinks_jointree_recurse(root, + (Node *) root->parse->jointree, + &relids); + + /* + * root->parse->jointree must always be a FromExpr, so insert a dummy one + * if we got a bare RangeTblRef or JoinExpr out of the recursion. + */ + if (IsA(jtnode, FromExpr)) + root->parse->jointree = (FromExpr *) jtnode; + else + root->parse->jointree = makeFromExpr(list_make1(jtnode), NULL); } /* @@ -296,98 +322,101 @@ pull_up_sublinks_jointree_recurse(PlannerInfo *root, Node *jtnode, } #endif - /* - * Note that the result will be either newf, or a stack of JoinExprs - * with newf at the base. We rely on subsequent optimization steps to - * flatten this and rearrange the joins as needed. - * - * Although we could include the pulled-up subqueries in the returned - * relids, there's no need since upper quals couldn't refer to their - * outputs anyway. - */ - *relids = frelids; - jtnode = jtlink; - } - else if (IsA(jtnode, JoinExpr)) - { - JoinExpr *j; - Relids leftrelids; - Relids rightrelids; - Node *jtlink; - - /* - * Make a modifiable copy of join node, but don't bother copying its - * subnodes (yet). - */ - j = (JoinExpr *) palloc(sizeof(JoinExpr)); - memcpy(j, jtnode, sizeof(JoinExpr)); - jtlink = (Node *) j; - - /* Recurse to process children and collect their relids */ - j->larg = pull_up_sublinks_jointree_recurse(root, j->larg, - &leftrelids); - j->rarg = pull_up_sublinks_jointree_recurse(root, j->rarg, - &rightrelids); - - /* - * Now process qual, showing appropriate child relids as available, - * and attach any pulled-up jointree items at the right place. In the - * inner-join case we put new JoinExprs above the existing one (much - * as for a FromExpr-style join). In outer-join cases the new - * JoinExprs must go into the nullable side of the outer join. The - * point of the available_rels machinations is to ensure that we only - * pull up quals for which that's okay. - * - * We don't expect to see any pre-existing JOIN_SEMI or JOIN_ANTI - * nodes here. - */ - switch (j->jointype) - { - case JOIN_INNER: - j->quals = pull_up_sublinks_qual_recurse(root, j->quals, - &jtlink, - bms_union(leftrelids, - rightrelids), - NULL, NULL); - break; - case JOIN_LEFT: - j->quals = pull_up_sublinks_qual_recurse(root, j->quals, - &j->rarg, - rightrelids, - NULL, NULL); - break; - case JOIN_FULL: - /* can't do anything with full-join quals */ - break; - case JOIN_RIGHT: - j->quals = pull_up_sublinks_qual_recurse(root, j->quals, - &j->larg, - leftrelids, - NULL, NULL); - break; - default: - elog(ERROR, "unrecognized join type: %d", - (int) j->jointype); - break; - } + /* + * Note that the result will be either newf, or a stack of JoinExprs + * with newf at the base. We rely on subsequent optimization steps to + * flatten this and rearrange the joins as needed. 
+ * + * Although we could include the pulled-up subqueries in the returned + * relids, there's no need since upper quals couldn't refer to their + * outputs anyway. + */ + *relids = frelids; + jtnode = jtlink; + } + else if (IsA(jtnode, JoinExpr)) + { + JoinExpr *j; + Relids leftrelids; + Relids rightrelids; + Node *jtlink; + + /* + * Make a modifiable copy of join node, but don't bother copying its + * subnodes (yet). + */ + j = (JoinExpr *) palloc(sizeof(JoinExpr)); + memcpy(j, jtnode, sizeof(JoinExpr)); + jtlink = (Node *) j; + + /* Recurse to process children and collect their relids */ + j->larg = pull_up_sublinks_jointree_recurse(root, j->larg, + &leftrelids); + j->rarg = pull_up_sublinks_jointree_recurse(root, j->rarg, + &rightrelids); + + /* + * Now process qual, showing appropriate child relids as available, + * and attach any pulled-up jointree items at the right place. In the + * inner-join case we put new JoinExprs above the existing one (much + * as for a FromExpr-style join). In outer-join cases the new + * JoinExprs must go into the nullable side of the outer join. The + * point of the available_rels machinations is to ensure that we only + * pull up quals for which that's okay. + * + * We don't expect to see any pre-existing JOIN_SEMI or JOIN_ANTI + * nodes here. + */ + switch (j->jointype) + { + case JOIN_INNER: + j->quals = pull_up_sublinks_qual_recurse(root, j->quals, + &jtlink, + bms_union(leftrelids, + rightrelids), + NULL, NULL); + break; + case JOIN_LEFT: +#ifdef __TBASE__ + case JOIN_LEFT_SCALAR: +#endif + j->quals = pull_up_sublinks_qual_recurse(root, j->quals, + &j->rarg, + rightrelids, + NULL, NULL); + break; + case JOIN_FULL: + /* can't do anything with full-join quals */ + break; + case JOIN_RIGHT: + j->quals = pull_up_sublinks_qual_recurse(root, j->quals, + &j->larg, + leftrelids, + NULL, NULL); + break; + default: + elog(ERROR, "unrecognized join type: %d", + (int) j->jointype); + break; + } - /* - * Although we could include the pulled-up subqueries in the returned - * relids, there's no need since upper quals couldn't refer to their - * outputs anyway. But we *do* need to include the join's own rtindex - * because we haven't yet collapsed join alias variables, so upper - * levels would mistakenly think they couldn't use references to this - * join. - */ - *relids = bms_join(leftrelids, rightrelids); - if (j->rtindex) - *relids = bms_add_member(*relids, j->rtindex); - jtnode = jtlink; - } - else - elog(ERROR, "unrecognized node type: %d", - (int) nodeTag(jtnode)); - return jtnode; + /* + * Although we could include the pulled-up subqueries in the returned + * relids, there's no need since upper quals couldn't refer to their + * outputs anyway. But we *do* need to include the join's own rtindex + * because we haven't yet collapsed join alias variables, so upper + * levels would mistakenly think they couldn't use references to this + * join. 
+ */ + *relids = bms_join(leftrelids, rightrelids); + if (j->rtindex) + *relids = bms_add_member(*relids, j->rtindex); + jtnode = jtlink; + } + else + elog(ERROR, "unrecognized node type: %d", + (int) nodeTag(jtnode)); + return jtnode; } #ifdef __TBASE__ @@ -1073,185 +1102,188 @@ pull_up_subqueries(PlannerInfo *root) */ static Node * pull_up_subqueries_recurse(PlannerInfo *root, Node *jtnode, - JoinExpr *lowest_outer_join, - JoinExpr *lowest_nulling_outer_join, - AppendRelInfo *containing_appendrel, - bool deletion_ok) -{// #lizard forgives - Assert(jtnode != NULL); - if (IsA(jtnode, RangeTblRef)) - { - int varno = ((RangeTblRef *) jtnode)->rtindex; - RangeTblEntry *rte = rt_fetch(varno, root->parse->rtable); - - /* - * Is this a subquery RTE, and if so, is the subquery simple enough to - * pull up? - * - * If we are looking at an append-relation member, we can't pull it up - * unless is_safe_append_member says so. - */ - if (rte->rtekind == RTE_SUBQUERY && - is_simple_subquery(rte->subquery, rte, - lowest_outer_join, deletion_ok) && - (containing_appendrel == NULL || - is_safe_append_member(rte->subquery))) - return pull_up_simple_subquery(root, jtnode, rte, - lowest_outer_join, - lowest_nulling_outer_join, - containing_appendrel, - deletion_ok); - - /* - * Alternatively, is it a simple UNION ALL subquery? If so, flatten - * into an "append relation". - * - * It's safe to do this regardless of whether this query is itself an - * appendrel member. (If you're thinking we should try to flatten the - * two levels of appendrel together, you're right; but we handle that - * in set_append_rel_pathlist, not here.) - */ - if (rte->rtekind == RTE_SUBQUERY && - is_simple_union_all(rte->subquery)) - return pull_up_simple_union_all(root, jtnode, rte); - - /* - * Or perhaps it's a simple VALUES RTE? - * - * We don't allow VALUES pullup below an outer join nor into an - * appendrel (such cases are impossible anyway at the moment). - */ - if (rte->rtekind == RTE_VALUES && - lowest_outer_join == NULL && - containing_appendrel == NULL && - is_simple_values(root, rte, deletion_ok)) - return pull_up_simple_values(root, jtnode, rte); - - /* Otherwise, do nothing at this node. */ - } - else if (IsA(jtnode, FromExpr)) - { - FromExpr *f = (FromExpr *) jtnode; - bool have_undeleted_child = false; - ListCell *l; - - Assert(containing_appendrel == NULL); + JoinExpr *lowest_outer_join, + JoinExpr *lowest_nulling_outer_join, + AppendRelInfo *containing_appendrel, + bool deletion_ok) +{ + Assert(jtnode != NULL); + if (IsA(jtnode, RangeTblRef)) + { + int varno = ((RangeTblRef *) jtnode)->rtindex; + RangeTblEntry *rte = rt_fetch(varno, root->parse->rtable); + + /* + * Is this a subquery RTE, and if so, is the subquery simple enough to + * pull up? + * + * If we are looking at an append-relation member, we can't pull it up + * unless is_safe_append_member says so. + */ + if (rte->rtekind == RTE_SUBQUERY && + is_simple_subquery(rte->subquery, rte, + lowest_outer_join, deletion_ok) && + (containing_appendrel == NULL || + is_safe_append_member(rte->subquery))) + return pull_up_simple_subquery(root, jtnode, rte, + lowest_outer_join, + lowest_nulling_outer_join, + containing_appendrel, + deletion_ok); + + /* + * Alternatively, is it a simple UNION ALL subquery? If so, flatten + * into an "append relation". + * + * It's safe to do this regardless of whether this query is itself an + * appendrel member. 
(If you're thinking we should try to flatten the + * two levels of appendrel together, you're right; but we handle that + * in set_append_rel_pathlist, not here.) + */ + if (rte->rtekind == RTE_SUBQUERY && + is_simple_union_all(rte->subquery)) + return pull_up_simple_union_all(root, jtnode, rte); + + /* + * Or perhaps it's a simple VALUES RTE? + * + * We don't allow VALUES pullup below an outer join nor into an + * appendrel (such cases are impossible anyway at the moment). + */ + if (rte->rtekind == RTE_VALUES && + lowest_outer_join == NULL && + containing_appendrel == NULL && + is_simple_values(root, rte, deletion_ok)) + return pull_up_simple_values(root, jtnode, rte); + + /* Otherwise, do nothing at this node. */ + } + else if (IsA(jtnode, FromExpr)) + { + FromExpr *f = (FromExpr *) jtnode; + bool have_undeleted_child = false; + ListCell *l; - /* - * If the FromExpr has quals, it's not deletable even if its parent - * would allow deletion. - */ - if (f->quals) - deletion_ok = false; + Assert(containing_appendrel == NULL); - foreach(l, f->fromlist) - { - /* - * In a non-deletable FromExpr, we can allow deletion of child - * nodes so long as at least one child remains; so it's okay - * either if any previous child survives, or if there's more to - * come. If all children are deletable in themselves, we'll force - * the last one to remain unflattened. - * - * As a separate matter, we can allow deletion of all children of - * the top-level FromExpr in a query, since that's a special case - * anyway. - */ - bool sub_deletion_ok = (deletion_ok || - have_undeleted_child || - lnext(l) != NULL || - f == root->parse->jointree); - - lfirst(l) = pull_up_subqueries_recurse(root, lfirst(l), - lowest_outer_join, - lowest_nulling_outer_join, - NULL, - sub_deletion_ok); - if (lfirst(l) != NULL) - have_undeleted_child = true; - } + /* + * If the FromExpr has quals, it's not deletable even if its parent + * would allow deletion. + */ + if (f->quals) + deletion_ok = false; - if (deletion_ok && !have_undeleted_child) - { - /* OK to delete this FromExpr entirely */ - root->hasDeletedRTEs = true; /* probably is set already */ - return NULL; - } - } - else if (IsA(jtnode, JoinExpr)) - { - JoinExpr *j = (JoinExpr *) jtnode; + foreach(l, f->fromlist) + { + /* + * In a non-deletable FromExpr, we can allow deletion of child + * nodes so long as at least one child remains; so it's okay + * either if any previous child survives, or if there's more to + * come. If all children are deletable in themselves, we'll force + * the last one to remain unflattened. + * + * As a separate matter, we can allow deletion of all children of + * the top-level FromExpr in a query, since that's a special case + * anyway. + */ + bool sub_deletion_ok = (deletion_ok || + have_undeleted_child || + lnext(l) != NULL || + f == root->parse->jointree); + + lfirst(l) = pull_up_subqueries_recurse(root, lfirst(l), + lowest_outer_join, + lowest_nulling_outer_join, + NULL, + sub_deletion_ok); + if (lfirst(l) != NULL) + have_undeleted_child = true; + } - Assert(containing_appendrel == NULL); - /* Recurse, being careful to tell myself when inside outer join */ - switch (j->jointype) - { - case JOIN_INNER: + if (deletion_ok && !have_undeleted_child) + { + /* OK to delete this FromExpr entirely */ + root->hasDeletedRTEs = true; /* probably is set already */ + return NULL; + } + } + else if (IsA(jtnode, JoinExpr)) + { + JoinExpr *j = (JoinExpr *) jtnode; - /* - * INNER JOIN can allow deletion of either child node, but not - * both. 
So right child gets permission to delete only if - * left child didn't get removed. - */ - j->larg = pull_up_subqueries_recurse(root, j->larg, - lowest_outer_join, - lowest_nulling_outer_join, - NULL, - true); - j->rarg = pull_up_subqueries_recurse(root, j->rarg, - lowest_outer_join, - lowest_nulling_outer_join, - NULL, - j->larg != NULL); - break; - case JOIN_LEFT: - case JOIN_SEMI: - case JOIN_ANTI: - j->larg = pull_up_subqueries_recurse(root, j->larg, - j, - lowest_nulling_outer_join, - NULL, - false); - j->rarg = pull_up_subqueries_recurse(root, j->rarg, - j, - j, - NULL, - false); - break; - case JOIN_FULL: - j->larg = pull_up_subqueries_recurse(root, j->larg, - j, - j, - NULL, - false); - j->rarg = pull_up_subqueries_recurse(root, j->rarg, - j, - j, - NULL, - false); - break; - case JOIN_RIGHT: - j->larg = pull_up_subqueries_recurse(root, j->larg, - j, - j, - NULL, - false); - j->rarg = pull_up_subqueries_recurse(root, j->rarg, - j, - lowest_nulling_outer_join, - NULL, - false); - break; - default: - elog(ERROR, "unrecognized join type: %d", - (int) j->jointype); - break; - } - } - else - elog(ERROR, "unrecognized node type: %d", - (int) nodeTag(jtnode)); - return jtnode; + Assert(containing_appendrel == NULL); + /* Recurse, being careful to tell myself when inside outer join */ + switch (j->jointype) + { + case JOIN_INNER: + + /* + * INNER JOIN can allow deletion of either child node, but not + * both. So right child gets permission to delete only if + * left child didn't get removed. + */ + j->larg = pull_up_subqueries_recurse(root, j->larg, + lowest_outer_join, + lowest_nulling_outer_join, + NULL, + true); + j->rarg = pull_up_subqueries_recurse(root, j->rarg, + lowest_outer_join, + lowest_nulling_outer_join, + NULL, + j->larg != NULL); + break; + case JOIN_LEFT: + case JOIN_SEMI: +#ifdef __TBASE__ + case JOIN_LEFT_SCALAR: +#endif + case JOIN_ANTI: + j->larg = pull_up_subqueries_recurse(root, j->larg, + j, + lowest_nulling_outer_join, + NULL, + false); + j->rarg = pull_up_subqueries_recurse(root, j->rarg, + j, + j, + NULL, + false); + break; + case JOIN_FULL: + j->larg = pull_up_subqueries_recurse(root, j->larg, + j, + j, + NULL, + false); + j->rarg = pull_up_subqueries_recurse(root, j->rarg, + j, + j, + NULL, + false); + break; + case JOIN_RIGHT: + j->larg = pull_up_subqueries_recurse(root, j->larg, + j, + j, + NULL, + false); + j->rarg = pull_up_subqueries_recurse(root, j->rarg, + j, + lowest_nulling_outer_join, + NULL, + false); + break; + default: + elog(ERROR, "unrecognized join type: %d", + (int) j->jointype); + break; + } + } + else + elog(ERROR, "unrecognized node type: %d", + (int) nodeTag(jtnode)); + return jtnode; } /* @@ -2957,277 +2989,281 @@ reduce_outer_joins_pass1(Node *jtnode) */ static void reduce_outer_joins_pass2(Node *jtnode, - reduce_outer_joins_state *state, - PlannerInfo *root, - Relids nonnullable_rels, - List *nonnullable_vars, - List *forced_null_vars) -{// #lizard forgives - /* - * pass 2 should never descend as far as an empty subnode or base rel, - * because it's only called on subtrees marked as contains_outer. 
- */ - if (jtnode == NULL) - elog(ERROR, "reached empty jointree"); - if (IsA(jtnode, RangeTblRef)) - elog(ERROR, "reached base rel"); - else if (IsA(jtnode, FromExpr)) - { - FromExpr *f = (FromExpr *) jtnode; - ListCell *l; - ListCell *s; - Relids pass_nonnullable_rels; - List *pass_nonnullable_vars; - List *pass_forced_null_vars; - - /* Scan quals to see if we can add any constraints */ - pass_nonnullable_rels = find_nonnullable_rels(f->quals); - pass_nonnullable_rels = bms_add_members(pass_nonnullable_rels, - nonnullable_rels); - /* NB: we rely on list_concat to not damage its second argument */ - pass_nonnullable_vars = find_nonnullable_vars(f->quals); - pass_nonnullable_vars = list_concat(pass_nonnullable_vars, - nonnullable_vars); - pass_forced_null_vars = find_forced_null_vars(f->quals); - pass_forced_null_vars = list_concat(pass_forced_null_vars, - forced_null_vars); - /* And recurse --- but only into interesting subtrees */ - Assert(list_length(f->fromlist) == list_length(state->sub_states)); - forboth(l, f->fromlist, s, state->sub_states) - { - reduce_outer_joins_state *sub_state = lfirst(s); - - if (sub_state->contains_outer) - reduce_outer_joins_pass2(lfirst(l), sub_state, root, - pass_nonnullable_rels, - pass_nonnullable_vars, - pass_forced_null_vars); - } - bms_free(pass_nonnullable_rels); - /* can't so easily clean up var lists, unfortunately */ - } - else if (IsA(jtnode, JoinExpr)) - { - JoinExpr *j = (JoinExpr *) jtnode; - int rtindex = j->rtindex; - JoinType jointype = j->jointype; - reduce_outer_joins_state *left_state = linitial(state->sub_states); - reduce_outer_joins_state *right_state = lsecond(state->sub_states); - List *local_nonnullable_vars = NIL; - bool computed_local_nonnullable_vars = false; - - /* Can we simplify this join? */ - switch (jointype) - { - case JOIN_INNER: - break; - case JOIN_LEFT: - if (bms_overlap(nonnullable_rels, right_state->relids)) - jointype = JOIN_INNER; - break; - case JOIN_RIGHT: - if (bms_overlap(nonnullable_rels, left_state->relids)) - jointype = JOIN_INNER; - break; - case JOIN_FULL: - if (bms_overlap(nonnullable_rels, left_state->relids)) - { - if (bms_overlap(nonnullable_rels, right_state->relids)) - jointype = JOIN_INNER; - else - jointype = JOIN_LEFT; - } - else - { - if (bms_overlap(nonnullable_rels, right_state->relids)) - jointype = JOIN_RIGHT; - } - break; - case JOIN_SEMI: - case JOIN_ANTI: - - /* - * These could only have been introduced by pull_up_sublinks, - * so there's no way that upper quals could refer to their - * righthand sides, and no point in checking. - */ - break; - default: - elog(ERROR, "unrecognized join type: %d", - (int) jointype); - break; - } - - /* - * Convert JOIN_RIGHT to JOIN_LEFT. Note that in the case where we - * reduced JOIN_FULL to JOIN_RIGHT, this will mean the JoinExpr no - * longer matches the internal ordering of any CoalesceExpr's built to - * represent merged join variables. We don't care about that at - * present, but be wary of it ... - */ - if (jointype == JOIN_RIGHT) - { - Node *tmparg; - - tmparg = j->larg; - j->larg = j->rarg; - j->rarg = tmparg; - jointype = JOIN_LEFT; - right_state = linitial(state->sub_states); - left_state = lsecond(state->sub_states); - } - - /* - * See if we can reduce JOIN_LEFT to JOIN_ANTI. This is the case if - * the join's own quals are strict for any var that was forced null by - * higher qual levels. 
NOTE: there are other ways that we could - * detect an anti-join, in particular if we were to check whether Vars - * coming from the RHS must be non-null because of table constraints. - * That seems complicated and expensive though (in particular, one - * would have to be wary of lower outer joins). For the moment this - * seems sufficient. - */ - if (jointype == JOIN_LEFT) - { - List *overlap; + reduce_outer_joins_state *state, + PlannerInfo *root, + Relids nonnullable_rels, + List *nonnullable_vars, + List *forced_null_vars) +{ + /* + * pass 2 should never descend as far as an empty subnode or base rel, + * because it's only called on subtrees marked as contains_outer. + */ + if (jtnode == NULL) + elog(ERROR, "reached empty jointree"); + if (IsA(jtnode, RangeTblRef)) + elog(ERROR, "reached base rel"); + else if (IsA(jtnode, FromExpr)) + { + FromExpr *f = (FromExpr *) jtnode; + ListCell *l; + ListCell *s; + Relids pass_nonnullable_rels; + List *pass_nonnullable_vars; + List *pass_forced_null_vars; + + /* Scan quals to see if we can add any constraints */ + pass_nonnullable_rels = find_nonnullable_rels(f->quals); + pass_nonnullable_rels = bms_add_members(pass_nonnullable_rels, + nonnullable_rels); + /* NB: we rely on list_concat to not damage its second argument */ + pass_nonnullable_vars = find_nonnullable_vars(f->quals); + pass_nonnullable_vars = list_concat(pass_nonnullable_vars, + nonnullable_vars); + pass_forced_null_vars = find_forced_null_vars(f->quals); + pass_forced_null_vars = list_concat(pass_forced_null_vars, + forced_null_vars); + /* And recurse --- but only into interesting subtrees */ + Assert(list_length(f->fromlist) == list_length(state->sub_states)); + forboth(l, f->fromlist, s, state->sub_states) + { + reduce_outer_joins_state *sub_state = lfirst(s); - local_nonnullable_vars = find_nonnullable_vars(j->quals); - computed_local_nonnullable_vars = true; + if (sub_state->contains_outer) + reduce_outer_joins_pass2(lfirst(l), sub_state, root, + pass_nonnullable_rels, + pass_nonnullable_vars, + pass_forced_null_vars); + } + bms_free(pass_nonnullable_rels); + /* can't so easily clean up var lists, unfortunately */ + } + else if (IsA(jtnode, JoinExpr)) + { + JoinExpr *j = (JoinExpr *) jtnode; + int rtindex = j->rtindex; + JoinType jointype = j->jointype; + reduce_outer_joins_state *left_state = linitial(state->sub_states); + reduce_outer_joins_state *right_state = lsecond(state->sub_states); + List *local_nonnullable_vars = NIL; + bool computed_local_nonnullable_vars = false; + + /* Can we simplify this join? */ + switch (jointype) + { + case JOIN_INNER: + break; + case JOIN_LEFT: + if (bms_overlap(nonnullable_rels, right_state->relids)) + jointype = JOIN_INNER; + break; + case JOIN_RIGHT: + if (bms_overlap(nonnullable_rels, left_state->relids)) + jointype = JOIN_INNER; + break; + case JOIN_FULL: + if (bms_overlap(nonnullable_rels, left_state->relids)) + { + if (bms_overlap(nonnullable_rels, right_state->relids)) + jointype = JOIN_INNER; + else + jointype = JOIN_LEFT; + } + else + { + if (bms_overlap(nonnullable_rels, right_state->relids)) + jointype = JOIN_RIGHT; + } + break; + case JOIN_SEMI: + case JOIN_ANTI: + + /* + * These could only have been introduced by pull_up_sublinks, + * so there's no way that upper quals could refer to their + * righthand sides, and no point in checking. 
+ */ + break; + default: + elog(ERROR, "unrecognized join type: %d", + (int) jointype); + break; + } - /* - * It's not sufficient to check whether local_nonnullable_vars and - * forced_null_vars overlap: we need to know if the overlap - * includes any RHS variables. - */ - overlap = list_intersection(local_nonnullable_vars, - forced_null_vars); - if (overlap != NIL && - bms_overlap(pull_varnos((Node *) overlap), - right_state->relids)) - jointype = JOIN_ANTI; - } + /* + * Convert JOIN_RIGHT to JOIN_LEFT. Note that in the case where we + * reduced JOIN_FULL to JOIN_RIGHT, this will mean the JoinExpr no + * longer matches the internal ordering of any CoalesceExpr's built to + * represent merged join variables. We don't care about that at + * present, but be wary of it ... + */ + if (jointype == JOIN_RIGHT) + { + Node *tmparg; + + tmparg = j->larg; + j->larg = j->rarg; + j->rarg = tmparg; + jointype = JOIN_LEFT; + right_state = linitial(state->sub_states); + left_state = lsecond(state->sub_states); + } - /* Apply the jointype change, if any, to both jointree node and RTE */ - if (rtindex && jointype != j->jointype) - { - RangeTblEntry *rte = rt_fetch(rtindex, root->parse->rtable); + /* + * See if we can reduce JOIN_LEFT to JOIN_ANTI. This is the case if + * the join's own quals are strict for any var that was forced null by + * higher qual levels. NOTE: there are other ways that we could + * detect an anti-join, in particular if we were to check whether Vars + * coming from the RHS must be non-null because of table constraints. + * That seems complicated and expensive though (in particular, one + * would have to be wary of lower outer joins). For the moment this + * seems sufficient. + */ + if (jointype == JOIN_LEFT) + { + List *overlap; + + local_nonnullable_vars = find_nonnullable_vars(j->quals); + computed_local_nonnullable_vars = true; + + /* + * It's not sufficient to check whether local_nonnullable_vars and + * forced_null_vars overlap: we need to know if the overlap + * includes any RHS variables. + */ + overlap = list_intersection(local_nonnullable_vars, + forced_null_vars); + if (overlap != NIL && + bms_overlap(pull_varnos((Node *) overlap), + right_state->relids)) + jointype = JOIN_ANTI; + } - Assert(rte->rtekind == RTE_JOIN); - Assert(rte->jointype == j->jointype); - rte->jointype = jointype; - } - j->jointype = jointype; + /* Apply the jointype change, if any, to both jointree node and RTE */ + if (rtindex && jointype != j->jointype) + { + RangeTblEntry *rte = rt_fetch(rtindex, root->parse->rtable); - /* Only recurse if there's more to do below here */ - if (left_state->contains_outer || right_state->contains_outer) - { - Relids local_nonnullable_rels; - List *local_forced_null_vars; - Relids pass_nonnullable_rels; - List *pass_nonnullable_vars; - List *pass_forced_null_vars; + Assert(rte->rtekind == RTE_JOIN); + Assert(rte->jointype == j->jointype); + rte->jointype = jointype; + } + j->jointype = jointype; - /* - * If this join is (now) inner, we can add any constraints its - * quals provide to those we got from above. But if it is outer, - * we can pass down the local constraints only into the nullable - * side, because an outer join never eliminates any rows from its - * non-nullable side. Also, there is no point in passing upper - * constraints into the nullable side, since if there were any - * we'd have been able to reduce the join. (In the case of upper - * forced-null constraints, we *must not* pass them into the - * nullable side --- they either applied here, or not.) 
The upshot - * is that we pass either the local or the upper constraints, - * never both, to the children of an outer join. - * - * Note that a SEMI join works like an inner join here: it's okay - * to pass down both local and upper constraints. (There can't be - * any upper constraints affecting its inner side, but it's not - * worth having a separate code path to avoid passing them.) - * - * At a FULL join we just punt and pass nothing down --- is it - * possible to be smarter? - */ - if (jointype != JOIN_FULL) - { - local_nonnullable_rels = find_nonnullable_rels(j->quals); - if (!computed_local_nonnullable_vars) - local_nonnullable_vars = find_nonnullable_vars(j->quals); - local_forced_null_vars = find_forced_null_vars(j->quals); - if (jointype == JOIN_INNER || jointype == JOIN_SEMI) - { - /* OK to merge upper and local constraints */ - local_nonnullable_rels = bms_add_members(local_nonnullable_rels, - nonnullable_rels); - local_nonnullable_vars = list_concat(local_nonnullable_vars, - nonnullable_vars); - local_forced_null_vars = list_concat(local_forced_null_vars, - forced_null_vars); - } - } - else - { - /* no use in calculating these */ - local_nonnullable_rels = NULL; - local_forced_null_vars = NIL; - } + /* Only recurse if there's more to do below here */ + if (left_state->contains_outer || right_state->contains_outer) + { + Relids local_nonnullable_rels; + List *local_forced_null_vars; + Relids pass_nonnullable_rels; + List *pass_nonnullable_vars; + List *pass_forced_null_vars; + + /* + * If this join is (now) inner, we can add any constraints its + * quals provide to those we got from above. But if it is outer, + * we can pass down the local constraints only into the nullable + * side, because an outer join never eliminates any rows from its + * non-nullable side. Also, there is no point in passing upper + * constraints into the nullable side, since if there were any + * we'd have been able to reduce the join. (In the case of upper + * forced-null constraints, we *must not* pass them into the + * nullable side --- they either applied here, or not.) The upshot + * is that we pass either the local or the upper constraints, + * never both, to the children of an outer join. + * + * Note that a SEMI join works like an inner join here: it's okay + * to pass down both local and upper constraints. (There can't be + * any upper constraints affecting its inner side, but it's not + * worth having a separate code path to avoid passing them.) + * + * At a FULL join we just punt and pass nothing down --- is it + * possible to be smarter? 
+ */ + if (jointype != JOIN_FULL) + { + local_nonnullable_rels = find_nonnullable_rels(j->quals); + if (!computed_local_nonnullable_vars) + local_nonnullable_vars = find_nonnullable_vars(j->quals); + local_forced_null_vars = find_forced_null_vars(j->quals); +#ifdef __TBASE__ + if (jointype == JOIN_INNER || jointype == JOIN_SEMI || jointype == JOIN_LEFT_SCALAR) +#else + if (jointype == JOIN_INNER || jointype == JOIN_SEMI) +#endif + { + /* OK to merge upper and local constraints */ + local_nonnullable_rels = bms_add_members(local_nonnullable_rels, + nonnullable_rels); + local_nonnullable_vars = list_concat(local_nonnullable_vars, + nonnullable_vars); + local_forced_null_vars = list_concat(local_forced_null_vars, + forced_null_vars); + } + } + else + { + /* no use in calculating these */ + local_nonnullable_rels = NULL; + local_forced_null_vars = NIL; + } - if (left_state->contains_outer) - { - if (jointype == JOIN_INNER || jointype == JOIN_SEMI) - { - /* pass union of local and upper constraints */ - pass_nonnullable_rels = local_nonnullable_rels; - pass_nonnullable_vars = local_nonnullable_vars; - pass_forced_null_vars = local_forced_null_vars; - } - else if (jointype != JOIN_FULL) /* ie, LEFT or ANTI */ - { - /* can't pass local constraints to non-nullable side */ - pass_nonnullable_rels = nonnullable_rels; - pass_nonnullable_vars = nonnullable_vars; - pass_forced_null_vars = forced_null_vars; - } - else - { - /* no constraints pass through JOIN_FULL */ - pass_nonnullable_rels = NULL; - pass_nonnullable_vars = NIL; - pass_forced_null_vars = NIL; - } - reduce_outer_joins_pass2(j->larg, left_state, root, - pass_nonnullable_rels, - pass_nonnullable_vars, - pass_forced_null_vars); - } + if (left_state->contains_outer) + { + if (jointype == JOIN_INNER || jointype == JOIN_SEMI) + { + /* pass union of local and upper constraints */ + pass_nonnullable_rels = local_nonnullable_rels; + pass_nonnullable_vars = local_nonnullable_vars; + pass_forced_null_vars = local_forced_null_vars; + } + else if (jointype != JOIN_FULL) /* ie, LEFT or ANTI */ + { + /* can't pass local constraints to non-nullable side */ + pass_nonnullable_rels = nonnullable_rels; + pass_nonnullable_vars = nonnullable_vars; + pass_forced_null_vars = forced_null_vars; + } + else + { + /* no constraints pass through JOIN_FULL */ + pass_nonnullable_rels = NULL; + pass_nonnullable_vars = NIL; + pass_forced_null_vars = NIL; + } + reduce_outer_joins_pass2(j->larg, left_state, root, + pass_nonnullable_rels, + pass_nonnullable_vars, + pass_forced_null_vars); + } - if (right_state->contains_outer) - { - if (jointype != JOIN_FULL) /* ie, INNER/LEFT/SEMI/ANTI */ - { - /* pass appropriate constraints, per comment above */ - pass_nonnullable_rels = local_nonnullable_rels; - pass_nonnullable_vars = local_nonnullable_vars; - pass_forced_null_vars = local_forced_null_vars; - } - else - { - /* no constraints pass through JOIN_FULL */ - pass_nonnullable_rels = NULL; - pass_nonnullable_vars = NIL; - pass_forced_null_vars = NIL; - } - reduce_outer_joins_pass2(j->rarg, right_state, root, - pass_nonnullable_rels, - pass_nonnullable_vars, - pass_forced_null_vars); - } - bms_free(local_nonnullable_rels); - } - } - else - elog(ERROR, "unrecognized node type: %d", - (int) nodeTag(jtnode)); + if (right_state->contains_outer) + { + if (jointype != JOIN_FULL) /* ie, INNER/LEFT/SEMI/ANTI */ + { + /* pass appropriate constraints, per comment above */ + pass_nonnullable_rels = local_nonnullable_rels; + pass_nonnullable_vars = local_nonnullable_vars; + 
pass_forced_null_vars = local_forced_null_vars; + } + else + { + /* no constraints pass through JOIN_FULL */ + pass_nonnullable_rels = NULL; + pass_nonnullable_vars = NIL; + pass_forced_null_vars = NIL; + } + reduce_outer_joins_pass2(j->rarg, right_state, root, + pass_nonnullable_rels, + pass_nonnullable_vars, + pass_forced_null_vars); + } + bms_free(local_nonnullable_rels); + } + } + else + elog(ERROR, "unrecognized node type: %d", + (int) nodeTag(jtnode)); } /* diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index ad966eb2..9b89cb4a 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -1721,32 +1721,41 @@ set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) return alternate; } - /* - * Check if we have inner replicated - * The "both replicated" case is already checked, so if innerd - * is replicated, then outerd is not replicated and it is not NULL. - * This case is not acceptable for some join types. If outer relation is - * nullable data nodes will produce joined rows with NULLs for cases when - * matching row exists, but on other data node. - */ - if ((innerd && IsLocatorReplicated(innerd->distributionType)) && - (pathnode->jointype == JOIN_INNER || - pathnode->jointype == JOIN_LEFT || - pathnode->jointype == JOIN_SEMI || - pathnode->jointype == JOIN_ANTI)) - { - /* We need inner relation is defined on all nodes where outer is */ - if (!outerd || !bms_is_subset(outerd->nodes, innerd->nodes)) - goto not_allowed_join; - - targetd = makeNode(Distribution); - targetd->distributionType = outerd->distributionType; - targetd->nodes = bms_copy(outerd->nodes); - targetd->restrictNodes = bms_copy(outerd->restrictNodes); - targetd->distributionExpr = outerd->distributionExpr; - pathnode->path.distribution = targetd; - return alternate; - } + /* + * Check if we have inner replicated + * The "both replicated" case is already checked, so if innerd + * is replicated, then outerd is not replicated and it is not NULL. + * This case is not acceptable for some join types. If outer relation is + * nullable data nodes will produce joined rows with NULLs for cases when + * matching row exists, but on other data node. + */ +#ifdef __TBASE__ + if ((innerd && IsLocatorReplicated(innerd->distributionType)) && + (pathnode->jointype == JOIN_INNER || + pathnode->jointype == JOIN_LEFT || + pathnode->jointype == JOIN_SEMI || + pathnode->jointype == JOIN_LEFT_SCALAR || + pathnode->jointype == JOIN_ANTI)) +#else + if ((innerd && IsLocatorReplicated(innerd->distributionType)) && + (pathnode->jointype == JOIN_INNER || + pathnode->jointype == JOIN_LEFT || + pathnode->jointype == JOIN_SEMI || + pathnode->jointype == JOIN_ANTI)) +#endif + { + /* We need inner relation is defined on all nodes where outer is */ + if (!outerd || !bms_is_subset(outerd->nodes, innerd->nodes)) + goto not_allowed_join; + + targetd = makeNode(Distribution); + targetd->distributionType = outerd->distributionType; + targetd->nodes = bms_copy(outerd->nodes); + targetd->restrictNodes = bms_copy(outerd->restrictNodes); + targetd->distributionExpr = outerd->distributionExpr; + pathnode->path.distribution = targetd; + return alternate; + } /* @@ -2046,43 +2055,52 @@ set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) * If we could not determine the distribution redistribute the subpathes. */ not_allowed_join: - /* - * If redistribution is required, sometimes the cheapest path would be if - * one of the subplan is replicated. 
If replication of any or all subplans - * is possible, return resulting plans as alternates. Try to distribute all - * by has as main variant. - */ + /* + * If redistribution is required, sometimes the cheapest path would be if + * one of the subplan is replicated. If replication of any or all subplans + * is possible, return resulting plans as alternates. Try to distribute all + * by has as main variant. + */ -#ifdef NOT_USED - /* These join types allow replicated inner */ - if (outerd && - (pathnode->jointype == JOIN_INNER || - pathnode->jointype == JOIN_LEFT || - pathnode->jointype == JOIN_SEMI || - pathnode->jointype == JOIN_ANTI)) - { - /* - * Since we discard all alternate pathes except one it is OK if all they - * reference the same objects - */ - JoinPath *altpath = flatCopyJoinPath(pathnode); - /* Redistribute inner subquery */ - altpath->innerjoinpath = redistribute_path( - root, - altpath->innerjoinpath, - innerpathkeys, - LOCATOR_TYPE_REPLICATED, - NULL, - bms_copy(outerd->nodes), - bms_copy(outerd->restrictNodes)); - targetd = makeNode(Distribution); - targetd->distributionType = outerd->distributionType; - targetd->nodes = bms_copy(outerd->nodes); - targetd->restrictNodes = bms_copy(outerd->restrictNodes); - targetd->distributionExpr = outerd->distributionExpr; - altpath->path.distribution = targetd; - alternate = lappend(alternate, altpath); - } +#ifdef NOT_USED + /* These join types allow replicated inner */ +#ifdef __TBASE__ + if (outerd && + (pathnode->jointype == JOIN_INNER || + pathnode->jointype == JOIN_LEFT || + pathnode->jointype == JOIN_SEMI || + pathnode->jointype == JOIN_LEFT_SCALAR || + pathnode->jointype == JOIN_ANTI)) +#else + if (outerd && + (pathnode->jointype == JOIN_INNER || + pathnode->jointype == JOIN_LEFT || + pathnode->jointype == JOIN_SEMI || + pathnode->jointype == JOIN_ANTI)) +#endif + { + /* + * Since we discard all alternate pathes except one it is OK if all they + * reference the same objects + */ + JoinPath *altpath = flatCopyJoinPath(pathnode); + /* Redistribute inner subquery */ + altpath->innerjoinpath = redistribute_path( + root, + altpath->innerjoinpath, + innerpathkeys, + LOCATOR_TYPE_REPLICATED, + NULL, + bms_copy(outerd->nodes), + bms_copy(outerd->restrictNodes)); + targetd = makeNode(Distribution); + targetd->distributionType = outerd->distributionType; + targetd->nodes = bms_copy(outerd->nodes); + targetd->restrictNodes = bms_copy(outerd->restrictNodes); + targetd->distributionExpr = outerd->distributionExpr; + altpath->path.distribution = targetd; + alternate = lappend(alternate, altpath); + } /* These join types allow replicated outer */ if (innerd && @@ -2161,7 +2179,9 @@ set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) Expr *right_expr = right; #endif Oid leftType PG_USED_FOR_ASSERTS_ONLY = exprType((Node *) left); - Oid rightType PG_USED_FOR_ASSERTS_ONLY = exprType((Node *) right); +#ifndef __TBASE__ + Oid rightType PG_USED_FOR_ASSERTS_ONLY = exprType((Node *) right); +#endif Relids inner_rels = pathnode->innerjoinpath->parent->relids; Relids outer_rels = pathnode->outerjoinpath->parent->relids; QualCost cost; @@ -2456,17 +2476,21 @@ set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) nodes = bms_copy(outerd->nodes); } - if(outer_size * inner_nodes < inner_size + outer_size && - (pathnode->jointype != JOIN_LEFT && pathnode->jointype != JOIN_FULL && - pathnode->jointype != JOIN_SEMI && pathnode->jointype != JOIN_ANTI) && - innerd->distributionType != LOCATOR_TYPE_REPLICATED && !redistribute_outer && - 
get_num_connections(inner_nodes, nRemotePlans_outer + 1) < MaxConnections * REPLICATION_FACTOR && + if(outer_size * inner_nodes < inner_size + outer_size && + (pathnode->jointype != JOIN_LEFT && + pathnode->jointype != JOIN_FULL && + pathnode->jointype != JOIN_SEMI && + pathnode->jointype != JOIN_LEFT_SCALAR && + pathnode->jointype != JOIN_LEFT_SEMI && + pathnode->jointype != JOIN_ANTI) && + innerd->distributionType != LOCATOR_TYPE_REPLICATED && !redistribute_outer && + get_num_connections(inner_nodes, nRemotePlans_outer + 1) < MaxConnections * REPLICATION_FACTOR && !replicate_inner && !dml && nRemotePlans_outer < replication_level && !pathnode->inner_unique) - { - replicate_outer = true; + { + replicate_outer = true; - nodes = bms_copy(innerd->nodes); - } + nodes = bms_copy(innerd->nodes); + } #endif } /* @@ -2483,8 +2507,12 @@ set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) * replicate outer as an optimization to save network costs. */ if(inner_size > outer_size * inner_nodes && - (pathnode->jointype != JOIN_LEFT && pathnode->jointype != JOIN_FULL && - pathnode->jointype != JOIN_SEMI && pathnode->jointype != JOIN_ANTI) && + (pathnode->jointype != JOIN_LEFT && + pathnode->jointype != JOIN_FULL && + pathnode->jointype != JOIN_SEMI && + pathnode->jointype != JOIN_LEFT_SCALAR && + pathnode->jointype != JOIN_LEFT_SEMI && + pathnode->jointype != JOIN_ANTI) && innerd->distributionType != LOCATOR_TYPE_REPLICATED && !redistribute_outer && get_num_connections(inner_nodes, nRemotePlans_outer + 1) < MaxConnections * REPLICATION_FACTOR && !dml && nRemotePlans_outer < replication_level && !pathnode->inner_unique) diff --git a/src/backend/utils/adt/network_selfuncs.c b/src/backend/utils/adt/network_selfuncs.c index 76db9c01..beb0e76a 100644 --- a/src/backend/utils/adt/network_selfuncs.c +++ b/src/backend/utils/adt/network_selfuncs.c @@ -200,50 +200,53 @@ networkjoinsel(PG_FUNCTION_ARGS) #ifdef NOT_USED JoinType jointype = (JoinType) PG_GETARG_INT16(3); #endif - SpecialJoinInfo *sjinfo = (SpecialJoinInfo *) PG_GETARG_POINTER(4); - double selec; - VariableStatData vardata1; - VariableStatData vardata2; - bool join_is_reversed; - - get_join_variables(root, args, sjinfo, - &vardata1, &vardata2, &join_is_reversed); - - switch (sjinfo->jointype) - { - case JOIN_INNER: - case JOIN_LEFT: - case JOIN_FULL: - - /* - * Selectivity for left/full join is not exactly the same as inner - * join, but we neglect the difference, as eqjoinsel does. - */ - selec = networkjoinsel_inner(operator, &vardata1, &vardata2); - break; - case JOIN_SEMI: - case JOIN_ANTI: - /* Here, it's important that we pass the outer var on the left. 
*/ - if (!join_is_reversed) - selec = networkjoinsel_semi(operator, &vardata1, &vardata2); - else - selec = networkjoinsel_semi(get_commutator(operator), - &vardata2, &vardata1); - break; - default: - /* other values not expected here */ - elog(ERROR, "unrecognized join type: %d", - (int) sjinfo->jointype); - selec = 0; /* keep compiler quiet */ - break; - } - - ReleaseVariableStats(vardata1); - ReleaseVariableStats(vardata2); - - CLAMP_PROBABILITY(selec); - - PG_RETURN_FLOAT8((float8) selec); + SpecialJoinInfo *sjinfo = (SpecialJoinInfo *) PG_GETARG_POINTER(4); + double selec; + VariableStatData vardata1; + VariableStatData vardata2; + bool join_is_reversed; + + get_join_variables(root, args, sjinfo, + &vardata1, &vardata2, &join_is_reversed); + + switch (sjinfo->jointype) + { + case JOIN_INNER: + case JOIN_LEFT: + case JOIN_FULL: + + /* + * Selectivity for left/full join is not exactly the same as inner + * join, but we neglect the difference, as eqjoinsel does. + */ + selec = networkjoinsel_inner(operator, &vardata1, &vardata2); + break; + case JOIN_SEMI: + case JOIN_ANTI: +#ifdef __TBASE__ + case JOIN_LEFT_SCALAR: +#endif + /* Here, it's important that we pass the outer var on the left. */ + if (!join_is_reversed) + selec = networkjoinsel_semi(operator, &vardata1, &vardata2); + else + selec = networkjoinsel_semi(get_commutator(operator), + &vardata2, &vardata1); + break; + default: + /* other values not expected here */ + elog(ERROR, "unrecognized join type: %d", + (int) sjinfo->jointype); + selec = 0; /* keep compiler quiet */ + break; + } + + ReleaseVariableStats(vardata1); + ReleaseVariableStats(vardata2); + + CLAMP_PROBABILITY(selec); + + PG_RETURN_FLOAT8((float8) selec); } /* diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index 3b1ff05f..134664f8 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -2265,56 +2265,59 @@ eqjoinsel(PG_FUNCTION_ARGS) #ifdef NOT_USED JoinType jointype = (JoinType) PG_GETARG_INT16(3); #endif - SpecialJoinInfo *sjinfo = (SpecialJoinInfo *) PG_GETARG_POINTER(4); - double selec; - VariableStatData vardata1; - VariableStatData vardata2; - bool join_is_reversed; - RelOptInfo *inner_rel; - - get_join_variables(root, args, sjinfo, - &vardata1, &vardata2, &join_is_reversed); - - switch (sjinfo->jointype) - { - case JOIN_INNER: - case JOIN_LEFT: - case JOIN_FULL: - selec = eqjoinsel_inner(operator, &vardata1, &vardata2); - break; - case JOIN_SEMI: - case JOIN_ANTI: - - /* - * Look up the join's inner relation. min_righthand is sufficient - * information because neither SEMI nor ANTI joins permit any - * reassociation into or out of their RHS, so the righthand will - * always be exactly that set of rels. 
- */ - inner_rel = find_join_input_rel(root, sjinfo->min_righthand); - - if (!join_is_reversed) - selec = eqjoinsel_semi(operator, &vardata1, &vardata2, - inner_rel); - else - selec = eqjoinsel_semi(get_commutator(operator), - &vardata2, &vardata1, - inner_rel); - break; - default: - /* other values not expected here */ - elog(ERROR, "unrecognized join type: %d", - (int) sjinfo->jointype); - selec = 0; /* keep compiler quiet */ - break; - } - - ReleaseVariableStats(vardata1); - ReleaseVariableStats(vardata2); - - CLAMP_PROBABILITY(selec); + SpecialJoinInfo *sjinfo = (SpecialJoinInfo *) PG_GETARG_POINTER(4); + double selec; + VariableStatData vardata1; + VariableStatData vardata2; + bool join_is_reversed; + RelOptInfo *inner_rel; + + get_join_variables(root, args, sjinfo, + &vardata1, &vardata2, &join_is_reversed); + + switch (sjinfo->jointype) + { + case JOIN_INNER: + case JOIN_LEFT: + case JOIN_FULL: + selec = eqjoinsel_inner(operator, &vardata1, &vardata2); + break; + case JOIN_SEMI: + case JOIN_ANTI: +#ifdef __TBASE__ + case JOIN_LEFT_SCALAR: +#endif - PG_RETURN_FLOAT8((float8) selec); + /* + * Look up the join's inner relation. min_righthand is sufficient + * information because neither SEMI nor ANTI joins permit any + * reassociation into or out of their RHS, so the righthand will + * always be exactly that set of rels. + */ + inner_rel = find_join_input_rel(root, sjinfo->min_righthand); + + if (!join_is_reversed) + selec = eqjoinsel_semi(operator, &vardata1, &vardata2, + inner_rel); + else + selec = eqjoinsel_semi(get_commutator(operator), + &vardata2, &vardata1, + inner_rel); + break; + default: + /* other values not expected here */ + elog(ERROR, "unrecognized join type: %d", + (int) sjinfo->jointype); + selec = 0; /* keep compiler quiet */ + break; + } + + ReleaseVariableStats(vardata1); + ReleaseVariableStats(vardata2); + + CLAMP_PROBABILITY(selec); + + PG_RETURN_FLOAT8((float8) selec); } /* diff --git a/src/include/nodes/nodes.h b/src/include/nodes/nodes.h index 9df9532f..9f974c28 100644 --- a/src/include/nodes/nodes.h +++ b/src/include/nodes/nodes.h @@ -765,37 +765,42 @@ typedef enum CmdType */ typedef enum JoinType { - /* - * The canonical kinds of joins according to the SQL JOIN syntax. Only - * these codes can appear in parser output (e.g., JoinExpr nodes). - */ - JOIN_INNER, /* matching tuple pairs only */ - JOIN_LEFT, /* pairs + unmatched LHS tuples */ - JOIN_FULL, /* pairs + unmatched LHS + unmatched RHS */ - JOIN_RIGHT, /* pairs + unmatched RHS tuples */ - - /* - * Semijoins and anti-semijoins (as defined in relational theory) do not - * appear in the SQL JOIN syntax, but there are standard idioms for - * representing them (e.g., using EXISTS). The planner recognizes these - * cases and converts them to joins. So the planner and executor must - * support these codes. NOTE: in JOIN_SEMI output, it is unspecified - * which matching RHS row is joined to. In JOIN_ANTI output, the row is - * guaranteed to be null-extended. - */ - JOIN_SEMI, /* 1 copy of each LHS row that has match(es) */ - JOIN_ANTI, /* 1 copy of each LHS row that has no match */ + /* + * The canonical kinds of joins according to the SQL JOIN syntax. Only + * these codes can appear in parser output (e.g., JoinExpr nodes). 
+ */ + JOIN_INNER, /* matching tuple pairs only */ + JOIN_LEFT, /* pairs + unmatched LHS tuples */ + JOIN_FULL, /* pairs + unmatched LHS + unmatched RHS */ + JOIN_RIGHT, /* pairs + unmatched RHS tuples */ + + /* + * Semijoins and anti-semijoins (as defined in relational theory) do not + * appear in the SQL JOIN syntax, but there are standard idioms for + * representing them (e.g., using EXISTS). The planner recognizes these + * cases and converts them to joins. So the planner and executor must + * support these codes. NOTE: in JOIN_SEMI output, it is unspecified + * which matching RHS row is joined to. In JOIN_ANTI output, the row is + * guaranteed to be null-extended. + */ + JOIN_SEMI, /* 1 copy of each LHS row that has match(es) */ + JOIN_ANTI, /* 1 copy of each LHS row that has no match */ + + /* + * These codes are used internally in the planner, but are not supported + * by the executor (nor, indeed, by most of the planner). + */ + JOIN_UNIQUE_OUTER, /* LHS path must be made unique */ + JOIN_UNIQUE_INNER, /* RHS path must be made unique */ - /* - * These codes are used internally in the planner, but are not supported - * by the executor (nor, indeed, by most of the planner). - */ - JOIN_UNIQUE_OUTER, /* LHS path must be made unique */ - JOIN_UNIQUE_INNER /* RHS path must be made unique */ +#ifdef __TBASE__ + JOIN_LEFT_SCALAR /* pairs + unmatched LHS tuples */ + /* only 1 copy of echo LHS row else report error. */ +#endif - /* - * We might need additional join types someday. - */ + /* + * We might need additional join types someday. + */ } JoinType; /* @@ -812,12 +817,22 @@ typedef enum JoinType * pushed-down quals. This is convenient because for almost all purposes, * quals attached to a semijoin can be treated the same as innerjoin quals. */ +#ifdef __TBASE__ #define IS_OUTER_JOIN(jointype) \ - (((1 << (jointype)) & \ - ((1 << JOIN_LEFT) | \ - (1 << JOIN_FULL) | \ - (1 << JOIN_RIGHT) | \ - (1 << JOIN_ANTI))) != 0) + (((1 << (jointype)) & \ + ((1 << JOIN_LEFT) | \ + (1 << JOIN_LEFT_SCALAR) | \ + (1 << JOIN_FULL) | \ + (1 << JOIN_RIGHT) | \ + (1 << JOIN_ANTI))) != 0) +#else +#define IS_OUTER_JOIN(jointype) \ + (((1 << (jointype)) & \ + ((1 << JOIN_LEFT) | \ + (1 << JOIN_FULL) | \ + (1 << JOIN_RIGHT) | \ + (1 << JOIN_ANTI))) != 0) +#endif /* * AggStrategy - diff --git a/src/include/optimizer/subselect.h b/src/include/optimizer/subselect.h index d303ff05..ec687d2f 100644 --- a/src/include/optimizer/subselect.h +++ b/src/include/optimizer/subselect.h @@ -99,8 +99,9 @@ extern JoinExpr *convert_ALL_sublink_to_join(PlannerInfo *root, SubLink *sublink Relids available_rels); extern bool check_or_exist_sublink_pullupable(PlannerInfo *root,Node *node); extern bool check_or_exist_qual_pullupable(PlannerInfo *root, Node *node); -extern List * convert_OR_EXIST_sublink_to_join_recurse(PlannerInfo *root, Node *node, +extern List *convert_OR_EXIST_sublink_to_join_recurse(PlannerInfo *root, Node *node, Node **jtlink); +extern TargetEntry *convert_TargetList_sublink_to_join(PlannerInfo *root, TargetEntry *entry); #endif extern Node *SS_replace_correlation_vars(PlannerInfo *root, Node *expr); extern Node *SS_process_sublinks(PlannerInfo *root, Node *expr, bool isQual); diff --git a/src/test/regress/expected/subselect.out b/src/test/regress/expected/subselect.out index c480d768..5d770f1d 100644 --- a/src/test/regress/expected/subselect.out +++ b/src/test/regress/expected/subselect.out @@ -851,31 +851,35 @@ explain (verbose, costs off) select * from int4_tbl where (case when f1 in (select unique1 from 
tenk1 a) then f1 else null end) in (select ten from tenk1 b); - QUERY PLAN ---------------------------------------------------------------------------------------------------------------- + QUERY PLAN +--------------------------------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) Output: int4_tbl.f1 - -> Nested Loop Semi Join + -> Hash Join Output: int4_tbl.f1 - Join Filter: ((CASE WHEN (hashed SubPlan 1) THEN int4_tbl.f1 ELSE NULL::integer END) = b.ten) - -> Remote Subquery Scan on all (datanode_1) - Output: int4_tbl.f1, CASE WHEN (hashed SubPlan 1) THEN int4_tbl.f1 ELSE NULL::integer END - Distribute results by H: CASE WHEN (hashed SubPlan 1) THEN f1 ELSE NULL::integer END - -> Seq Scan on public.int4_tbl - Output: int4_tbl.f1, CASE WHEN (hashed SubPlan 1) THEN int4_tbl.f1 ELSE NULL::integer END - SubPlan 1 - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Output: a.unique1 - -> Seq Scan on public.tenk1 a - Output: a.unique1 - -> Materialize + Inner Unique: true + Hash Cond: (CASE WHEN (hashed SubPlan 1) THEN int4_tbl.f1 ELSE NULL::integer END = b.ten) + -> Seq Scan on public.int4_tbl + Output: int4_tbl.f1 + -> Hash Output: b.ten - -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> HashAggregate Output: b.ten - Distribute results by H: ten - -> Seq Scan on public.tenk1 b + Group Key: b.ten + -> Remote Subquery Scan on all (datanode_1,datanode_2) Output: b.ten -(22 rows) + Distribute results by H: ten + -> HashAggregate + Output: b.ten + Group Key: b.ten + -> Seq Scan on public.tenk1 b + Output: b.ten + SubPlan 1 + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: a.unique1 + -> Seq Scan on public.tenk1 a + Output: a.unique1 +(26 rows) select * from int4_tbl where (case when f1 in (select unique1 from tenk1 a) then f1 else null end) in @@ -1441,3 +1445,295 @@ select * from x for update; Output: subselect_tbl.f1, subselect_tbl.f2, subselect_tbl.f3 (4 rows) +-- +-- Tests for pulling up more sublinks +-- + +set enable_pullup_subquery to true; +create table tbl_a(a int,b int); +create table tbl_b(a int,b int); +insert into tbl_a select generate_series(1,10),1 ; +insert into tbl_b select generate_series(2,11),1 ; +-- check targetlist subquery scenario. 
+set enable_nestloop to true; +set enable_hashjoin to false; +set enable_mergejoin to false; +explain select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; + QUERY PLAN +------------------------------------------------------------------------------------------------------------ + Remote Subquery Scan on all (datanode_1,datanode_2) (cost=36374.94..36378.32 rows=1350 width=8) + -> Sort (cost=36374.94..36378.32 rows=1350 width=8) + Sort Key: a.a, ((SubPlan 1)) + -> Seq Scan on tbl_a a (cost=0.00..36304.75 rows=1350 width=8) + SubPlan 1 + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=0.00..26.88 rows=7 width=4) + -> Seq Scan on tbl_b b (cost=0.00..26.88 rows=7 width=4) + Filter: (a = a.a) +(8 rows) + +select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; + a | q +----+---- + 1 | + 2 | 2 + 3 | 3 + 4 | 4 + 5 | 5 + 6 | 6 + 7 | 7 + 8 | 8 + 9 | 9 + 10 | 10 +(10 rows) + +set enable_nestloop to false; +set enable_hashjoin to true; +set enable_mergejoin to false; +explain (costs off) select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; + QUERY PLAN +------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: a.a, ((SubPlan 1)) + -> Seq Scan on tbl_a a + SubPlan 1 + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on tbl_b b + Filter: (a = a.a) +(8 rows) + +select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; + a | q +----+---- + 1 | + 2 | 2 + 3 | 3 + 4 | 4 + 5 | 5 + 6 | 6 + 7 | 7 + 8 | 8 + 9 | 9 + 10 | 10 +(10 rows) + +set enable_nestloop to false; +set enable_hashjoin to false; +set enable_mergejoin to true; +explain (costs off) select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; + QUERY PLAN +------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: a.a, ((SubPlan 1)) + -> Seq Scan on tbl_a a + SubPlan 1 + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on tbl_b b + Filter: (a = a.a) +(8 rows) + +select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; + a | q +----+---- + 1 | + 2 | 2 + 3 | 3 + 4 | 4 + 5 | 5 + 6 | 6 + 7 | 7 + 8 | 8 + 9 | 9 + 10 | 10 +(10 rows) + +-- check non-scalar scenario. 
+insert into tbl_b values(2,2); +set enable_nestloop to true; +set enable_hashjoin to false; +set enable_mergejoin to false; +explain (costs off) select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; + QUERY PLAN +------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: a.a, ((SubPlan 1)) + -> Seq Scan on tbl_a a + SubPlan 1 + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on tbl_b b + Filter: (a = a.a) +(8 rows) + +select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; +ERROR: more than one row returned by a subquery used as an expression +set enable_nestloop to false; +set enable_hashjoin to true; +set enable_mergejoin to false; +explain (costs off) select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; + QUERY PLAN +------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: a.a, ((SubPlan 1)) + -> Seq Scan on tbl_a a + SubPlan 1 + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on tbl_b b + Filter: (a = a.a) +(8 rows) + +select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; +ERROR: more than one row returned by a subquery used as an expression +set enable_nestloop to false; +set enable_hashjoin to false; +set enable_mergejoin to true; +explain (costs off) select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; + QUERY PLAN +------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: a.a, ((SubPlan 1)) + -> Seq Scan on tbl_a a + SubPlan 1 + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on tbl_b b + Filter: (a = a.a) +(8 rows) + +select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; +ERROR: more than one row returned by a subquery used as an expression +explain (costs off) select a.a,(select b.a from tbl_b b where b.a = a.a and b.a = 5) q from tbl_a a order by 1,2; + QUERY PLAN +-------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: a.a, ((SubPlan 1)) + -> Seq Scan on tbl_a a + SubPlan 1 + -> Remote Subquery Scan on all (datanode_1) + -> Result + One-Time Filter: (a.a = 5) + -> Seq Scan on tbl_b b + Filter: (a = 5) +(10 rows) + +select a.a,(select b.a from tbl_b b where b.a = a.a and b.a = 5) q from tbl_a a order by 1,2; + a | q +----+--- + 1 | + 2 | + 3 | + 4 | + 5 | 5 + 6 | + 7 | + 8 | + 9 | + 10 | +(10 rows) + +-- check distinct scenario. 
+set enable_nestloop to true; +set enable_hashjoin to false; +set enable_mergejoin to false; +explain (costs off) select a.a,(select distinct b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; + QUERY PLAN +------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: a.a, ((SubPlan 1)) + -> Seq Scan on tbl_a a + SubPlan 1 + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Unique + -> Seq Scan on tbl_b b + Filter: (a = a.a) +(9 rows) + +select a.a,(select distinct b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; + a | q +----+---- + 1 | + 2 | 2 + 3 | 3 + 4 | 4 + 5 | 5 + 6 | 6 + 7 | 7 + 8 | 8 + 9 | 9 + 10 | 10 +(10 rows) + +set enable_nestloop to false; +set enable_hashjoin to true; +set enable_mergejoin to false; +explain (costs off) select a.a,(select distinct b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; + QUERY PLAN +------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: a.a, ((SubPlan 1)) + -> Seq Scan on tbl_a a + SubPlan 1 + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Unique + -> Seq Scan on tbl_b b + Filter: (a = a.a) +(9 rows) + +select a.a,(select distinct b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; + a | q +----+---- + 1 | + 2 | 2 + 3 | 3 + 4 | 4 + 5 | 5 + 6 | 6 + 7 | 7 + 8 | 8 + 9 | 9 + 10 | 10 +(10 rows) + +set enable_nestloop to false; +set enable_hashjoin to false; +set enable_mergejoin to true; +explain (costs off) select a.a,(select distinct b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; + QUERY PLAN +------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: a.a, ((SubPlan 1)) + -> Seq Scan on tbl_a a + SubPlan 1 + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Unique + -> Seq Scan on tbl_b b + Filter: (a = a.a) +(9 rows) + +select a.a,(select distinct b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; + a | q +----+---- + 1 | + 2 | 2 + 3 | 3 + 4 | 4 + 5 | 5 + 6 | 6 + 7 | 7 + 8 | 8 + 9 | 9 + 10 | 10 +(10 rows) + +set enable_nestloop to true; +set enable_hashjoin to true; +set enable_mergejoin to true; +drop table tbl_a; +drop table tbl_b; +set enable_pullup_subquery to false; diff --git a/src/test/regress/sql/subselect.sql b/src/test/regress/sql/subselect.sql index 9b3f974f..2ebadb13 100644 --- a/src/test/regress/sql/subselect.sql +++ b/src/test/regress/sql/subselect.sql @@ -691,3 +691,83 @@ select * from (with x as (select 2 as y) select * from x) ss; explain (verbose, costs off) with x as (select * from subselect_tbl) select * from x for update; + +-- +-- Tests for pulling up more sublinks +-- + +set enable_pullup_subquery to true; +create table tbl_a(a int,b int); +create table tbl_b(a int,b int); +insert into tbl_a select generate_series(1,10),1 ; +insert into tbl_b select generate_series(2,11),1 ; + +-- check targetlist subquery scenario. 
+set enable_nestloop to true; +set enable_hashjoin to false; +set enable_mergejoin to false; +explain select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; +select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; + +set enable_nestloop to false; +set enable_hashjoin to true; +set enable_mergejoin to false; +explain (costs off) select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; +select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; + +set enable_nestloop to false; +set enable_hashjoin to false; +set enable_mergejoin to true; +explain (costs off) select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; +select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; + +-- check non-scalar scenario. +insert into tbl_b values(2,2); + +set enable_nestloop to true; +set enable_hashjoin to false; +set enable_mergejoin to false; +explain (costs off) select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; +select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; + +set enable_nestloop to false; +set enable_hashjoin to true; +set enable_mergejoin to false; +explain (costs off) select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; +select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; + +set enable_nestloop to false; +set enable_hashjoin to false; +set enable_mergejoin to true; +explain (costs off) select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; +select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; + +explain (costs off) select a.a,(select b.a from tbl_b b where b.a = a.a and b.a = 5) q from tbl_a a order by 1,2; +select a.a,(select b.a from tbl_b b where b.a = a.a and b.a = 5) q from tbl_a a order by 1,2; + +-- check distinct scenario. +set enable_nestloop to true; +set enable_hashjoin to false; +set enable_mergejoin to false; +explain (costs off) select a.a,(select distinct b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; +select a.a,(select distinct b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; + +set enable_nestloop to false; +set enable_hashjoin to true; +set enable_mergejoin to false; +explain (costs off) select a.a,(select distinct b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; +select a.a,(select distinct b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; + +set enable_nestloop to false; +set enable_hashjoin to false; +set enable_mergejoin to true; +explain (costs off) select a.a,(select distinct b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; +select a.a,(select distinct b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; + +set enable_nestloop to true; +set enable_hashjoin to true; +set enable_mergejoin to true; + +drop table tbl_a; +drop table tbl_b; +set enable_pullup_subquery to false; From 3ae0235f304583029688a1846157455b4e740e47 Mon Sep 17 00:00:00 2001 From: ericxwu Date: Wed, 12 Aug 2020 15:29:39 +0800 Subject: [PATCH 030/578] Refine UPDATE/DELETE join distribution rules 1. Do not replicate outer path if join type is JOIN_LEFT_SCALAR 2. 
Remove the replication_level restriction since we have to do the replicate for UPDATE/DELETE anyway --- src/backend/optimizer/util/pathnode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index 9b89cb4a..6fc4ae2c 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -2707,7 +2707,7 @@ set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) if (resultRelLoc == RESULT_REL_INNER && pathnode->jointype != JOIN_LEFT && pathnode->jointype != JOIN_FULL && pathnode->jointype != JOIN_SEMI && pathnode->jointype != JOIN_ANTI && - !pathnode->inner_unique) + pathnode->jointype != JOIN_LEFT_SCALAR && !pathnode->inner_unique) { /* Replicate outer */ pathnode->outerjoinpath = redistribute_path( @@ -2755,7 +2755,7 @@ set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) if (innerd &&resultRelLoc == RESULT_REL_INNER && pathnode->jointype != JOIN_LEFT && pathnode->jointype != JOIN_FULL && pathnode->jointype != JOIN_SEMI && pathnode->jointype != JOIN_ANTI && - !pathnode->inner_unique) + pathnode->jointype != JOIN_LEFT_SCALAR && !pathnode->inner_unique) { pathnode->path.distribution = innerd; return alternate; From 9653f0d0d6fa8bbd91e4cc60290562347e73c8c0 Mon Sep 17 00:00:00 2001 From: youngxie Date: Mon, 17 Aug 2020 14:58:51 +0800 Subject: [PATCH 031/578] Support agg optimize for targetlist subquery --- src/backend/optimizer/plan/subselect.c | 100 ++++++++++++++++++++++++- src/test/regress/sql/subselect.sql | 8 ++ 2 files changed, 106 insertions(+), 2 deletions(-) diff --git a/src/backend/optimizer/plan/subselect.c b/src/backend/optimizer/plan/subselect.c index bff6e3fd..a0f01e9b 100644 --- a/src/backend/optimizer/plan/subselect.c +++ b/src/backend/optimizer/plan/subselect.c @@ -2819,8 +2819,7 @@ convert_TargetList_sublink_to_join(PlannerInfo *root, TargetEntry *entry) /* * What we can not optimize. */ - if (subselect->commandType != CMD_SELECT || - subselect->hasAggs || subselect->hasDistinctOn || + if (subselect->commandType != CMD_SELECT || subselect->hasDistinctOn || subselect->setOperations || subselect->groupingSets || subselect->groupClause || subselect->hasWindowFuncs || subselect->hasTargetSRFs || subselect->hasModifyingCTE || @@ -2848,8 +2847,105 @@ convert_TargetList_sublink_to_join(PlannerInfo *root, TargetEntry *entry) * The rest of the sub-select must not refer to any Vars of the parent * query. (Vars of higher levels should be okay, though.) 
*/ + subselect->jointree->quals = NULL; if (contain_vars_of_level((Node *) subselect, 1)) return NULL; + subselect->jointree->quals = whereClause; + + if (subselect->hasAggs) + { + List *joinquals = NULL; + List *vars = NULL; + TargetEntry *ent = NULL; + ListCell *cell = NULL; + int ressortgroupref = 0; + int varno = 0; + + /* process 'op' and 'bool' expr only */ + if (contain_notexpr_or_neopexpr(whereClause, true, &joinquals)) + return NULL; + + vars = pull_vars_of_level((Node *) joinquals, 0); + + /* construct groupby clause */ + foreach (cell, vars) + { + Oid sortop; + Oid eqop; + bool hashable; + Oid restype; + SortGroupClause *grpcl; + Var *var = (Var *) lfirst(cell); + RangeTblEntry *tbl = (RangeTblEntry *) list_nth(subselect->rtable, var->varno - 1); + + if (tbl->rtekind != RTE_RELATION && tbl->rtekind != RTE_CTE) + return NULL; + + restype = exprType((Node *) var); + + grpcl = makeNode(SortGroupClause); + + ressortgroupref++; + + if (tbl->rtekind == RTE_RELATION) + { + ent = makeTargetEntry((Expr *) copyObject(var), var->varoattno, + get_relid_attribute_name(tbl->relid, var->varoattno), false); + } + else + { + int plan_id; + int ndx; + ListCell *lc; + Plan *cte_plan; + TargetEntry *cte_ent = NULL; + + /* + * Note: cte_plan_ids can be shorter than cteList, if we are still working + * on planning the CTEs (ie, this is a side-reference from another CTE). + * So we mustn't use forboth here. + */ + ndx = 0; + foreach (lc, root->parse->cteList) + { + CommonTableExpr *cte = (CommonTableExpr *) lfirst(lc); + + if (strcmp(cte->ctename, tbl->ctename) == 0) + break; + ndx++; + } + if (lc == NULL) /* shouldn't happen */ + elog(ERROR, "could not find CTE \"%s\"", tbl->ctename); + if (ndx >= list_length(root->cte_plan_ids)) + elog(ERROR, "could not find plan for CTE \"%s\"", tbl->ctename); + plan_id = list_nth_int(root->cte_plan_ids, ndx); + cte_plan = (Plan *) lfirst(list_nth_cell(root->glob->subplans, plan_id - 1)); + cte_ent = (TargetEntry *) lfirst(list_nth_cell(cte_plan->targetlist, var->varattno - 1)); + ent = makeTargetEntry((Expr *) copyObject(var), var->varoattno, cte_ent->resname, false); + } + + ent->ressortgroupref = ressortgroupref; + + subselect->targetList = lappend(subselect->targetList, ent); + + varno = list_length(subselect->targetList); + ent->resno = varno; + + /* determine the eqop and optional sortop */ + get_sort_group_operators(restype, + false, true, false, + &sortop, &eqop, NULL, + &hashable); + + grpcl->tleSortGroupRef = ressortgroupref; + grpcl->eqop = eqop; + grpcl->sortop = sortop; + grpcl->nulls_first = false; /* OK with or without sortop */ + grpcl->hashable = hashable; + + subselect->groupClause = lappend(subselect->groupClause, grpcl); + } + } /* * Move sub-select to the parent query. 
diff --git a/src/test/regress/sql/subselect.sql b/src/test/regress/sql/subselect.sql index 2ebadb13..8d1cdebd 100644 --- a/src/test/regress/sql/subselect.sql +++ b/src/test/regress/sql/subselect.sql @@ -768,6 +768,14 @@ set enable_nestloop to true; set enable_hashjoin to true; set enable_mergejoin to true; +-- agg +explain (costs off) select (select sum(b.a) from tbl_b b where b.a = a.a and b.b = a.b) from tbl_a a order by 1; +select (select sum(b.a) from tbl_b b where b.a = a.a and b.b = a.b) from tbl_a a order by 1; +explain (costs off) select (select count(b.a) from tbl_b b where b.a = a.a) from tbl_a a order by 1; +select (select count(b.a) from tbl_b b where b.a = a.a ) from tbl_a a order by 1; +explain (costs off) select (select sum(b.a) from tbl_b b where b.a = a.a and b.b = a.b or b.a = 1) from tbl_a a order by 1; +select (select sum(b.a) from tbl_b b where b.a = a.a and b.b = a.b or b.a = 1) from tbl_a a order by 1; + drop table tbl_a; drop table tbl_b; set enable_pullup_subquery to false; From ebf254f372c7e7a7eac1256bf4a46755f31215de Mon Sep 17 00:00:00 2001 From: ericxwu Date: Tue, 18 Aug 2020 10:30:31 +0800 Subject: [PATCH 032/578] Refine some code of JOIN_LEFT_SCALAR 1. Add missing switch case in reduce_outer_joins_pass2() 2. Refine some pre-compile definitions in set_joinpath_distribution() --- src/backend/optimizer/prep/prepjointree.c | 1 + src/backend/optimizer/util/pathnode.c | 11 +++-------- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/src/backend/optimizer/prep/prepjointree.c b/src/backend/optimizer/prep/prepjointree.c index b7cbd5c0..d54e097a 100644 --- a/src/backend/optimizer/prep/prepjointree.c +++ b/src/backend/optimizer/prep/prepjointree.c @@ -3077,6 +3077,7 @@ reduce_outer_joins_pass2(Node *jtnode, break; case JOIN_SEMI: case JOIN_ANTI: + case JOIN_LEFT_SCALAR: /* * These could only have been introduced by pull_up_sublinks, diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index 6fc4ae2c..1eefd477 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -1729,20 +1729,15 @@ set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) * nullable data nodes will produce joined rows with NULLs for cases when * matching row exists, but on other data node. */ -#ifdef __TBASE__ + if ((innerd && IsLocatorReplicated(innerd->distributionType)) && (pathnode->jointype == JOIN_INNER || pathnode->jointype == JOIN_LEFT || pathnode->jointype == JOIN_SEMI || +#ifdef __TBASE__ pathnode->jointype == JOIN_LEFT_SCALAR || - pathnode->jointype == JOIN_ANTI)) -#else - if ((innerd && IsLocatorReplicated(innerd->distributionType)) && - (pathnode->jointype == JOIN_INNER || - pathnode->jointype == JOIN_LEFT || - pathnode->jointype == JOIN_SEMI || - pathnode->jointype == JOIN_ANTI)) #endif + pathnode->jointype == JOIN_ANTI)) { /* We need inner relation is defined on all nodes where outer is */ if (!outerd || !bms_is_subset(outerd->nodes, innerd->nodes)) From 7a33f7f382aa4d7a1ccba1186125f7607e4c8b9a Mon Sep 17 00:00:00 2001 From: ericxwu Date: Thu, 27 Aug 2020 20:15:39 +0800 Subject: [PATCH 033/578] Pullup targetlist sublink that wrapped in expression Also improved the targetlist join type selection. We don't need JOIN_LEFT_SCALAR if sublink with agg, since we've create groupby for them. 
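For illustration, one of the regression tests added by this patch wraps a
correlated scalar sublink inside a CASE expression in the targetlist:

    select (case when a.b = 1
                 then (select sum(b.a) from tbl_b b where b.a = a.a and b.b = a.b)
                 else 0 end)
    from tbl_a a order by 1;

Because the sublink contains an aggregate, the pullup builds a GROUP BY on the
correlated columns (b.a, b.b) and joins with plain JOIN_LEFT; JOIN_LEFT_SCALAR
is only needed when there is no aggregate to guarantee at most one matching row
per outer tuple.
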
--- src/backend/optimizer/plan/subselect.c | 55 ++++++++++++++--------- src/backend/optimizer/prep/prepjointree.c | 18 +++++--- src/test/regress/sql/subselect.sql | 8 +++- 3 files changed, 53 insertions(+), 28 deletions(-) diff --git a/src/backend/optimizer/plan/subselect.c b/src/backend/optimizer/plan/subselect.c index a0f01e9b..05a21a79 100644 --- a/src/backend/optimizer/plan/subselect.c +++ b/src/backend/optimizer/plan/subselect.c @@ -2773,28 +2773,38 @@ get_or_exist_subquery_targetlist(PlannerInfo *root, Node *node, List **targetLis #ifdef __TBASE__ /* - * convert_TargetList_sublink_to_join : - * try to convert an EXISTS SubLink in targetlist to a join - * On success, it returns not NULL. + * Try to convert an SubLink in targetlist to a join + * + * The sublink in targetlist has the semantic of SCALAR. Normal joins will join + * simply generate repeated tuples. So we add a new join type JOIN_LEFT_SCALAR + * which acts like left join and reports error when scalar semantics is broken. + * + * On success, it converts sublink to subquery to parent jointree and returns + * the converted new targetlist entry. Otherwise, it returnes NULL. */ TargetEntry * convert_TargetList_sublink_to_join(PlannerInfo *root, TargetEntry *entry) { - Query *parse = root->parse; - Node *whereClause = NULL; - Query *subselect = NULL; - JoinExpr *joinExpr = NULL; - ParseState *pstate = NULL; - SubLink *sublink = NULL; - RangeTblRef *rtr = NULL; + Query *parse = root->parse; + Node *whereClause = NULL; + Query *subselect = NULL; + JoinExpr *joinExpr = NULL; + ParseState *pstate = NULL; + SubLink *sublink = NULL; + RangeTblRef *rtr = NULL; RangeTblEntry *rte = NULL; - Var *var = NULL; + Var *var = NULL; + List *sublinks = NIL; - /* Sanity check */ - if (!IsA(entry->expr, SubLink)) - return NULL; + /* Find sublinks in the targetlist entry */ + find_sublink_walker((Node *)entry->expr, &sublinks); + + /* Only one sublink can be handled */ + if (list_length(sublinks) != 1) + return NULL; + + sublink = linitial(sublinks); - sublink = (SubLink *) entry->expr; if (sublink->subLinkType != EXPR_SUBLINK) return NULL; @@ -2811,7 +2821,7 @@ convert_TargetList_sublink_to_join(PlannerInfo *root, TargetEntry *entry) return NULL; /* - * The subquery must have a nonempty jointree, else we won't have a join. + * The SubQuery must have a non-empty JoinTree, else we won't have a join. */ if (subselect->jointree->fromlist == NIL) return NULL; @@ -2845,7 +2855,7 @@ convert_TargetList_sublink_to_join(PlannerInfo *root, TargetEntry *entry) /* * The rest of the sub-select must not refer to any Vars of the parent - * query. (Vars of higher levels should be okay, though.) + * query. (Vars of higher levels should be okay, though.) */ subselect->jointree->quals = NULL; if (contain_vars_of_level((Node *) subselect, 1)) @@ -2884,7 +2894,6 @@ convert_TargetList_sublink_to_join(PlannerInfo *root, TargetEntry *entry) restype = exprType((Node *) var); grpcl = makeNode(SortGroupClause); - ressortgroupref++; if (tbl->rtekind == RTE_RELATION) @@ -2965,7 +2974,7 @@ convert_TargetList_sublink_to_join(PlannerInfo *root, TargetEntry *entry) * Form join node. */ joinExpr = makeNode(JoinExpr); - joinExpr->jointype = JOIN_LEFT_SCALAR; + joinExpr->jointype = subselect->hasAggs? JOIN_LEFT : JOIN_LEFT_SCALAR; joinExpr->isNatural = false; joinExpr->larg = (Node *) root->parse->jointree; joinExpr->rarg = (Node *) rtr; @@ -2977,9 +2986,13 @@ convert_TargetList_sublink_to_join(PlannerInfo *root, TargetEntry *entry) /* Wrap join node in FromExpr as required. 
*/ parse->jointree = makeFromExpr(list_make1(joinExpr), NULL); - /* Replace sublink node with Var. */ + /* Build a Var pointing to the subquery */ var = makeVarFromTargetEntry(rtr->rtindex, linitial(subselect->targetList)); - entry->expr = (Expr *) var; + + /* Replace sublink node with Var. */ + entry->expr = (Expr *)substitute_sublink_with_node((Node *)entry->expr, + sublink, + (Node *)var); return entry; } #endif diff --git a/src/backend/optimizer/prep/prepjointree.c b/src/backend/optimizer/prep/prepjointree.c index d54e097a..a94388d4 100644 --- a/src/backend/optimizer/prep/prepjointree.c +++ b/src/backend/optimizer/prep/prepjointree.c @@ -181,8 +181,11 @@ static void fix_append_rel_relids(List *append_rel_list, int varno, static Node *find_jointree_node_for_rel(Node *jtnode, int relid); #ifdef __TBASE__ -static Node * pull_up_or_sublinks_qual_recurse(PlannerInfo *root, Node *node, Node **jtlink,Node **orclauses); -static bool check_pull_up_sublinks_qual_or_recurse(PlannerInfo *root, Node *node); +static Node *pull_up_or_sublinks_qual_recurse(PlannerInfo *root, Node *node, + Node **jtlink,Node **orclauses); +static bool check_pull_up_sublinks_qual_or_recurse(PlannerInfo *root, + Node *node); + #endif /* @@ -225,16 +228,19 @@ pull_up_sublinks(PlannerInfo *root) */ if(enable_pullup_subquery) { - List *new_targetList = NIL; - ListCell *lc = NULL; - TargetEntry *entry = NULL; - TargetEntry *new_entry = NULL; + List *new_targetList = NIL; + ListCell *lc = NULL; + TargetEntry *entry = NULL; + TargetEntry *new_entry = NULL; + /* Iterate through out the target list */ foreach(lc, root->parse->targetList) { entry = (TargetEntry *) lfirst(lc); + /* Try to convert sublink in targetlist entry to join */ new_entry = convert_TargetList_sublink_to_join(root, entry); + if (new_entry) new_targetList = lappend(new_targetList, new_entry); else diff --git a/src/test/regress/sql/subselect.sql b/src/test/regress/sql/subselect.sql index 8d1cdebd..a7ba5190 100644 --- a/src/test/regress/sql/subselect.sql +++ b/src/test/regress/sql/subselect.sql @@ -768,7 +768,7 @@ set enable_nestloop to true; set enable_hashjoin to true; set enable_mergejoin to true; --- agg +-- targetlist sublink with agg explain (costs off) select (select sum(b.a) from tbl_b b where b.a = a.a and b.b = a.b) from tbl_a a order by 1; select (select sum(b.a) from tbl_b b where b.a = a.a and b.b = a.b) from tbl_a a order by 1; explain (costs off) select (select count(b.a) from tbl_b b where b.a = a.a) from tbl_a a order by 1; @@ -776,6 +776,12 @@ select (select count(b.a) from tbl_b b where b.a = a.a ) from tbl_a a order by 1 explain (costs off) select (select sum(b.a) from tbl_b b where b.a = a.a and b.b = a.b or b.a = 1) from tbl_a a order by 1; select (select sum(b.a) from tbl_b b where b.a = a.a and b.b = a.b or b.a = 1) from tbl_a a order by 1; +-- targetlist sublink wrapped in expr +explain (costs off) select (case when a.b =1 then (select sum(b.a) from tbl_b b where b.a = a.a and b.b = a.b) else 0 end) from tbl_a a order by 1; +select (case when a.b =1 then (select sum(b.a) from tbl_b b where b.a = a.a and b.b = a.b) else 0 end) from tbl_a a order by 1; +explain (costs off) select (case when a.b =1 then (select b.a from tbl_b b where b.a = a.a and b.b = a.b) else 0 end) from tbl_a a order by 1; +select (case when a.b =1 then (select b.a from tbl_b b where b.a = a.a and b.b = a.b) else 0 end) from tbl_a a order by 1; + drop table tbl_a; drop table tbl_b; set enable_pullup_subquery to false; From 1e8fd4c631aaa2b98d291c546df8db98a34a6032 Mon 
Sep 17 00:00:00 2001 From: ericxwu Date: Fri, 28 Aug 2020 17:05:02 +0800 Subject: [PATCH 034/578] Minor fix after merged all sublink pullup enhancements --- src/include/optimizer/clauses.h | 8 +- src/test/regress/expected/subselect.out | 391 +++++++++++++++++------- 2 files changed, 280 insertions(+), 119 deletions(-) diff --git a/src/include/optimizer/clauses.h b/src/include/optimizer/clauses.h index ac066a6a..3e7b9e4c 100644 --- a/src/include/optimizer/clauses.h +++ b/src/include/optimizer/clauses.h @@ -14,11 +14,11 @@ #ifndef CLAUSES_H #define CLAUSES_H +#include "access/htup.h" #include "nodes/relation.h" - -#define is_opclause(clause) ((clause) != NULL && IsA(clause, OpExpr)) -#define is_funcclause(clause) ((clause) != NULL && IsA(clause, FuncExpr)) +#define is_opclause(clause) ((clause) != NULL && IsA(clause, OpExpr)) +#define is_funcclause(clause) ((clause) != NULL && IsA(clause, FuncExpr)) typedef struct { @@ -85,7 +85,7 @@ extern Node *eval_const_expressions(PlannerInfo *root, Node *node); extern Node *estimate_expression_value(PlannerInfo *root, Node *node); extern Query *inline_set_returning_function(PlannerInfo *root, - RangeTblEntry *rte); + RangeTblEntry *rte); extern Node *substitute_sublink_with_node(Node *expr, SubLink *sublink, Node *node); diff --git a/src/test/regress/expected/subselect.out b/src/test/regress/expected/subselect.out index 5d770f1d..79708c41 100644 --- a/src/test/regress/expected/subselect.out +++ b/src/test/regress/expected/subselect.out @@ -1160,7 +1160,6 @@ NOTICE: x = 9, y = 13 (3 rows) drop function tattle(x int, y int); - -- -- Tests for CTE inlining behavior -- @@ -1448,7 +1447,6 @@ select * from x for update; -- -- Tests for pulling up more sublinks -- - set enable_pullup_subquery to true; create table tbl_a(a int,b int); create table tbl_b(a int,b int); @@ -1459,16 +1457,16 @@ set enable_nestloop to true; set enable_hashjoin to false; set enable_mergejoin to false; explain select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; - QUERY PLAN ------------------------------------------------------------------------------------------------------------- - Remote Subquery Scan on all (datanode_1,datanode_2) (cost=36374.94..36378.32 rows=1350 width=8) - -> Sort (cost=36374.94..36378.32 rows=1350 width=8) - Sort Key: a.a, ((SubPlan 1)) - -> Seq Scan on tbl_a a (cost=0.00..36304.75 rows=1350 width=8) - SubPlan 1 - -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=0.00..26.88 rows=7 width=4) - -> Seq Scan on tbl_b b (cost=0.00..26.88 rows=7 width=4) - Filter: (a = a.a) + QUERY PLAN +------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) (cost=15636.19..15637.88 rows=675 width=8) + -> Sort (cost=15636.19..15637.88 rows=675 width=8) + Sort Key: a.a, b.a + -> Nested Loop Left Scalar Join (cost=0.00..15604.47 rows=675 width=8) + Join Filter: (b.a = a.a) + -> Seq Scan on tbl_a a (cost=0.00..23.50 rows=1350 width=4) + -> Materialize (cost=0.00..30.25 rows=1350 width=4) + -> Seq Scan on tbl_b b (cost=0.00..23.50 rows=1350 width=4) (8 rows) select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; @@ -1490,16 +1488,16 @@ set enable_nestloop to false; set enable_hashjoin to true; set enable_mergejoin to false; explain (costs off) select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; - QUERY PLAN -------------------------------------------------------------------------- + QUERY PLAN 
+----------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) -> Sort - Sort Key: a.a, ((SubPlan 1)) - -> Seq Scan on tbl_a a - SubPlan 1 - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Seq Scan on tbl_b b - Filter: (a = a.a) + Sort Key: a.a, b.a + -> Hash Left Scalar Join + Hash Cond: (a.a = b.a) + -> Seq Scan on tbl_a a + -> Hash + -> Seq Scan on tbl_b b (8 rows) select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; @@ -1521,17 +1519,20 @@ set enable_nestloop to false; set enable_hashjoin to false; set enable_mergejoin to true; explain (costs off) select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; - QUERY PLAN -------------------------------------------------------------------------- + QUERY PLAN +----------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) -> Sort - Sort Key: a.a, ((SubPlan 1)) - -> Seq Scan on tbl_a a - SubPlan 1 - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Seq Scan on tbl_b b - Filter: (a = a.a) -(8 rows) + Sort Key: a.a, b.a + -> Merge Left Scalar Join + Merge Cond: (a.a = b.a) + -> Sort + Sort Key: a.a + -> Seq Scan on tbl_a a + -> Sort + Sort Key: b.a + -> Seq Scan on tbl_b b +(11 rows) select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; a | q @@ -1554,16 +1555,16 @@ set enable_nestloop to true; set enable_hashjoin to false; set enable_mergejoin to false; explain (costs off) select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; - QUERY PLAN -------------------------------------------------------------------------- + QUERY PLAN +----------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) -> Sort - Sort Key: a.a, ((SubPlan 1)) - -> Seq Scan on tbl_a a - SubPlan 1 - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Seq Scan on tbl_b b - Filter: (a = a.a) + Sort Key: a.a, b.a + -> Nested Loop Left Scalar Join + Join Filter: (b.a = a.a) + -> Seq Scan on tbl_a a + -> Materialize + -> Seq Scan on tbl_b b (8 rows) select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; @@ -1572,16 +1573,16 @@ set enable_nestloop to false; set enable_hashjoin to true; set enable_mergejoin to false; explain (costs off) select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; - QUERY PLAN -------------------------------------------------------------------------- + QUERY PLAN +----------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) -> Sort - Sort Key: a.a, ((SubPlan 1)) - -> Seq Scan on tbl_a a - SubPlan 1 - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Seq Scan on tbl_b b - Filter: (a = a.a) + Sort Key: a.a, b.a + -> Hash Left Scalar Join + Hash Cond: (a.a = b.a) + -> Seq Scan on tbl_a a + -> Hash + -> Seq Scan on tbl_b b (8 rows) select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; @@ -1590,67 +1591,69 @@ set enable_nestloop to false; set enable_hashjoin to false; set enable_mergejoin to true; explain (costs off) select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; - QUERY PLAN -------------------------------------------------------------------------- + QUERY PLAN +----------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) -> Sort - Sort Key: a.a, ((SubPlan 1)) - -> Seq Scan on tbl_a a - 
SubPlan 1 - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Seq Scan on tbl_b b - Filter: (a = a.a) -(8 rows) + Sort Key: a.a, b.a + -> Merge Left Scalar Join + Merge Cond: (a.a = b.a) + -> Sort + Sort Key: a.a + -> Seq Scan on tbl_a a + -> Sort + Sort Key: b.a + -> Seq Scan on tbl_b b +(11 rows) select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; ERROR: more than one row returned by a subquery used as an expression explain (costs off) select a.a,(select b.a from tbl_b b where b.a = a.a and b.a = 5) q from tbl_a a order by 1,2; - QUERY PLAN --------------------------------------------------------------- - Remote Subquery Scan on all (datanode_1,datanode_2) + QUERY PLAN +--------------------------------------------- + Remote Subquery Scan on all (datanode_1) -> Sort - Sort Key: a.a, ((SubPlan 1)) - -> Seq Scan on tbl_a a - SubPlan 1 - -> Remote Subquery Scan on all (datanode_1) - -> Result - One-Time Filter: (a.a = 5) - -> Seq Scan on tbl_b b - Filter: (a = 5) -(10 rows) + Sort Key: a.a, b.a + -> Merge Left Scalar Join + Merge Cond: (a.a = b.a) + -> Sort + Sort Key: a.a + -> Seq Scan on tbl_a a + -> Sort + Sort Key: b.a + -> Seq Scan on tbl_b b + Filter: (a = 5) +(12 rows) select a.a,(select b.a from tbl_b b where b.a = a.a and b.a = 5) q from tbl_a a order by 1,2; - a | q -----+--- - 1 | - 2 | - 3 | - 4 | - 5 | 5 - 6 | - 7 | - 8 | - 9 | - 10 | -(10 rows) + a | q +---+--- + 1 | + 2 | + 5 | 5 + 6 | + 8 | + 9 | +(6 rows) -- check distinct scenario. set enable_nestloop to true; set enable_hashjoin to false; set enable_mergejoin to false; explain (costs off) select a.a,(select distinct b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; - QUERY PLAN -------------------------------------------------------------------------- - Remote Subquery Scan on all (datanode_1,datanode_2) - -> Sort - Sort Key: a.a, ((SubPlan 1)) - -> Seq Scan on tbl_a a - SubPlan 1 - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Unique - -> Seq Scan on tbl_b b - Filter: (a = a.a) -(9 rows) + QUERY PLAN +----------------------------------------------------------------------- + Sort + Sort Key: a.a, a + -> Nested Loop Left Scalar Join + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on tbl_a a + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Unique + -> Seq Scan on tbl_b b + Filter: (a = a.a) +(10 rows) select a.a,(select distinct b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; a | q @@ -1671,18 +1674,19 @@ set enable_nestloop to false; set enable_hashjoin to true; set enable_mergejoin to false; explain (costs off) select a.a,(select distinct b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; - QUERY PLAN -------------------------------------------------------------------------- - Remote Subquery Scan on all (datanode_1,datanode_2) - -> Sort - Sort Key: a.a, ((SubPlan 1)) - -> Seq Scan on tbl_a a - SubPlan 1 - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Unique - -> Seq Scan on tbl_b b - Filter: (a = a.a) -(9 rows) + QUERY PLAN +----------------------------------------------------------------------- + Sort + Sort Key: a.a, a + -> Nested Loop Left Scalar Join + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on tbl_a a + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Unique + -> Seq Scan on tbl_b b + Filter: (a = a.a) +(10 rows) select a.a,(select distinct b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; 
a | q @@ -1703,18 +1707,19 @@ set enable_nestloop to false; set enable_hashjoin to false; set enable_mergejoin to true; explain (costs off) select a.a,(select distinct b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; - QUERY PLAN -------------------------------------------------------------------------- - Remote Subquery Scan on all (datanode_1,datanode_2) - -> Sort - Sort Key: a.a, ((SubPlan 1)) - -> Seq Scan on tbl_a a - SubPlan 1 - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Unique - -> Seq Scan on tbl_b b - Filter: (a = a.a) -(9 rows) + QUERY PLAN +----------------------------------------------------------------------- + Sort + Sort Key: a.a, a + -> Nested Loop Left Scalar Join + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on tbl_a a + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Unique + -> Seq Scan on tbl_b b + Filter: (a = a.a) +(10 rows) select a.a,(select distinct b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; a | q @@ -1734,6 +1739,162 @@ select a.a,(select distinct b.a from tbl_b b where b.a = a.a) q from tbl_a a ord set enable_nestloop to true; set enable_hashjoin to true; set enable_mergejoin to true; +-- targetlist sublink with agg +explain (costs off) select (select sum(b.a) from tbl_b b where b.a = a.a and b.b = a.b) from tbl_a a order by 1; + QUERY PLAN +------------------------------------------------------------------------- + Sort + Sort Key: "TARGETLIST_subquery".sum + -> Nested Loop Left Join + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on tbl_a a + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Subquery Scan on "TARGETLIST_subquery" + -> GroupAggregate + Group Key: b.a, b.b + -> Seq Scan on tbl_b b + Filter: ((a = a.a) AND (b = a.b)) +(12 rows) + +select (select sum(b.a) from tbl_b b where b.a = a.a and b.b = a.b) from tbl_a a order by 1; + sum +----- + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 + 10 + +(10 rows) + +explain (costs off) select (select count(b.a) from tbl_b b where b.a = a.a) from tbl_a a order by 1; + QUERY PLAN +----------------------------------------------------------------------- + Sort + Sort Key: "TARGETLIST_subquery".count + -> Nested Loop Left Join + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on tbl_a a + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Subquery Scan on "TARGETLIST_subquery" + -> GroupAggregate + Group Key: b.a + -> Seq Scan on tbl_b b + Filter: (a = a.a) +(12 rows) + +select (select count(b.a) from tbl_b b where b.a = a.a ) from tbl_a a order by 1; + count +------- + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 2 + +(10 rows) + +explain (costs off) select (select sum(b.a) from tbl_b b where b.a = a.a and b.b = a.b or b.a = 1) from tbl_a a order by 1; + QUERY PLAN +---------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: ((SubPlan 1)) + -> Seq Scan on tbl_a a + SubPlan 1 + -> Finalize Aggregate + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Partial Aggregate + -> Seq Scan on tbl_b b + Filter: (((a = a.a) AND (b = a.b)) OR (a = 1)) +(10 rows) + +select (select sum(b.a) from tbl_b b where b.a = a.a and b.b = a.b or b.a = 1) from tbl_a a order by 1; + sum +----- + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 + 10 + +(10 rows) + +-- targetlist sublink wrapped in expr +explain (costs off) select (case when a.b =1 then (select sum(b.a) from 
tbl_b b where b.a = a.a and b.b = a.b) else 0 end) from tbl_a a order by 1; + QUERY PLAN +--------------------------------------------------------------------------------------- + Sort + Sort Key: (CASE WHEN (a.b = 1) THEN "TARGETLIST_subquery".sum ELSE '0'::bigint END) + -> Nested Loop Left Join + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on tbl_a a + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Subquery Scan on "TARGETLIST_subquery" + -> GroupAggregate + Group Key: b.a, b.b + -> Seq Scan on tbl_b b + Filter: ((a = a.a) AND (b = a.b)) +(12 rows) + +select (case when a.b =1 then (select sum(b.a) from tbl_b b where b.a = a.a and b.b = a.b) else 0 end) from tbl_a a order by 1; + case +------ + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 + 10 + +(10 rows) + +explain (costs off) select (case when a.b =1 then (select b.a from tbl_b b where b.a = a.a and b.b = a.b) else 0 end) from tbl_a a order by 1; + QUERY PLAN +------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: (CASE WHEN (a.b = 1) THEN b.a ELSE 0 END) + -> Hash Left Scalar Join + Hash Cond: ((a.a = b.a) AND (a.b = b.b)) + -> Seq Scan on tbl_a a + -> Hash + -> Seq Scan on tbl_b b +(8 rows) + +select (case when a.b =1 then (select b.a from tbl_b b where b.a = a.a and b.b = a.b) else 0 end) from tbl_a a order by 1; + case +------ + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 + 10 + +(10 rows) + drop table tbl_a; drop table tbl_b; set enable_pullup_subquery to false; From 21ab7f55f152ebb5a82d74dcfc1da53aec712ae1 Mon Sep 17 00:00:00 2001 From: youngxie Date: Fri, 28 Aug 2020 18:03:43 +0800 Subject: [PATCH 035/578] Fix DDL deadlock and meta inconsistency. 1. Forward DDL to ascii minimized coordinator to serialize. 2. DDL execution will be executed on all the coordinators then datanodes. 3. DDL commit will be executed on all the datanodes then coordinators. --- src/backend/commands/tablecmds.c | 221 +++++++++++----------- src/backend/pgxc/pool/execRemote.c | 275 ++++++++++++++++++++++------ src/backend/pgxc/pool/pgxcnode.c | 155 ++++++++++++++-- src/backend/postmaster/postmaster.c | 4 + src/backend/tcop/postgres.c | 119 ++++++------ src/backend/tcop/pquery.c | 15 +- src/backend/tcop/utility.c | 34 ++++ src/backend/utils/misc/guc.c | 38 +++- src/include/pgxc/pgxc.h | 13 +- src/include/pgxc/pgxcnode.h | 12 +- src/include/tcop/utility.h | 3 + 11 files changed, 637 insertions(+), 252 deletions(-) diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index 66273eb5..9e3b3f14 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -632,118 +632,123 @@ DefineRelation(CreateStmt *stmt, char relkind, Oid ownerId, * namespace is selected. */ #ifdef __TBASE__ - if (stmt->interval_child) + if (stmt->interval_child) + { + /* interval partition child's namespace is same as parent. */ + namespaceId = get_rel_namespace(stmt->interval_parentId); + } + else { - /* interval partition child's namespace is same as parent. */ - namespaceId = get_rel_namespace(stmt->interval_parentId); + namespaceId = + RangeVarGetAndCheckCreationNamespace(stmt->relation, ExclusiveLock, NULL); } - else +#else + namespaceId = + RangeVarGetAndCheckCreationNamespace(stmt->relation, NoLock, NULL); #endif - namespaceId = - RangeVarGetAndCheckCreationNamespace(stmt->relation, NoLock, NULL); - - /* - * Security check: disallow creating temp tables from security-restricted - * code. 
This is needed because calling code might not expect untrusted - * tables to appear in pg_temp at the front of its search path. - */ - if (stmt->relation->relpersistence == RELPERSISTENCE_TEMP - && InSecurityRestrictedOperation()) - ereport(ERROR, - (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), - errmsg("cannot create temporary table within security-restricted operation"))); - /* - * Select tablespace to use. If not specified, use default tablespace - * (which may in turn default to database's default). - */ - if (stmt->tablespacename) - { - tablespaceId = get_tablespace_oid(stmt->tablespacename, false); - } - else - { - tablespaceId = GetDefaultTablespace(stmt->relation->relpersistence); - /* note InvalidOid is OK in this case */ - } - - /* Check permissions except when using database's default */ - if (OidIsValid(tablespaceId) && tablespaceId != MyDatabaseTableSpace) - { - AclResult aclresult; - - aclresult = pg_tablespace_aclcheck(tablespaceId, GetUserId(), - ACL_CREATE); - if (aclresult != ACLCHECK_OK) - aclcheck_error(aclresult, ACL_KIND_TABLESPACE, - get_tablespace_name(tablespaceId)); - } - - /* In all cases disallow placing user relations in pg_global */ - if (tablespaceId == GLOBALTABLESPACE_OID) - ereport(ERROR, - (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("only shared relations can be placed in pg_global tablespace"))); - - /* Identify user ID that will own the table */ - if (!OidIsValid(ownerId)) - ownerId = GetUserId(); - - /* - * Parse and validate reloptions, if any. - */ - reloptions = transformRelOptions((Datum) 0, stmt->options, NULL, validnsps, - true, false); - - if (relkind == RELKIND_VIEW) - (void) view_reloptions(reloptions, true); - else - (void) heap_reloptions(relkind, reloptions, true); - - if (stmt->ofTypename) - { - AclResult aclresult; - - ofTypeId = typenameTypeId(NULL, stmt->ofTypename); - - aclresult = pg_type_aclcheck(ofTypeId, GetUserId(), ACL_USAGE); - if (aclresult != ACLCHECK_OK) - aclcheck_error_type(aclresult, ofTypeId); - } - else - ofTypeId = InvalidOid; - - /* - * Look up inheritance ancestors and generate relation schema, including - * inherited attributes. (Note that stmt->tableElts is destructively - * modified by MergeAttributes.) - */ - stmt->tableElts = - MergeAttributes(stmt->tableElts, stmt->inhRelations, - stmt->relation->relpersistence, - stmt->partbound != NULL, - &inheritOids, &old_constraints, &parentOidCount); - - /* - * Create a tuple descriptor from the relation schema. Note that this - * deals with column names, types, and NOT NULL constraints, but not - * default values or CHECK constraints; we handle those below. - */ - descriptor = BuildDescForRelation(stmt->tableElts); - - /* - * Notice that we allow OIDs here only for plain tables and partitioned - * tables, even though some other relkinds can support them. This is - * necessary because the default_with_oids GUC must apply only to plain - * tables and not any other relkind; doing otherwise would break existing - * pg_dump files. We could allow explicit "WITH OIDS" while not allowing - * default_with_oids to affect other relkinds, but it would complicate - * interpretOidsOption(). - */ - localHasOids = interpretOidsOption(stmt->options, - (relkind == RELKIND_RELATION || - relkind == RELKIND_PARTITIONED_TABLE)); - descriptor->tdhasoid = (localHasOids || parentOidCount > 0); + /* + * Security check: disallow creating temp tables from security-restricted + * code. 
This is needed because calling code might not expect untrusted + * tables to appear in pg_temp at the front of its search path. + */ + if (stmt->relation->relpersistence == RELPERSISTENCE_TEMP + && InSecurityRestrictedOperation()) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("cannot create temporary table within security-restricted operation"))); + + /* + * Select tablespace to use. If not specified, use default tablespace + * (which may in turn default to database's default). + */ + if (stmt->tablespacename) + { + tablespaceId = get_tablespace_oid(stmt->tablespacename, false); + } + else + { + tablespaceId = GetDefaultTablespace(stmt->relation->relpersistence); + /* note InvalidOid is OK in this case */ + } + + /* Check permissions except when using database's default */ + if (OidIsValid(tablespaceId) && tablespaceId != MyDatabaseTableSpace) + { + AclResult aclresult; + + aclresult = pg_tablespace_aclcheck(tablespaceId, GetUserId(), + ACL_CREATE); + if (aclresult != ACLCHECK_OK) + aclcheck_error(aclresult, ACL_KIND_TABLESPACE, + get_tablespace_name(tablespaceId)); + } + + /* In all cases disallow placing user relations in pg_global */ + if (tablespaceId == GLOBALTABLESPACE_OID) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("only shared relations can be placed in pg_global tablespace"))); + + /* Identify user ID that will own the table */ + if (!OidIsValid(ownerId)) + ownerId = GetUserId(); + + /* + * Parse and validate reloptions, if any. + */ + reloptions = transformRelOptions((Datum) 0, stmt->options, NULL, validnsps, + true, false); + + if (relkind == RELKIND_VIEW) + (void) view_reloptions(reloptions, true); + else + (void) heap_reloptions(relkind, reloptions, true); + + if (stmt->ofTypename) + { + AclResult aclresult; + + ofTypeId = typenameTypeId(NULL, stmt->ofTypename); + + aclresult = pg_type_aclcheck(ofTypeId, GetUserId(), ACL_USAGE); + if (aclresult != ACLCHECK_OK) + aclcheck_error_type(aclresult, ofTypeId); + } + else + ofTypeId = InvalidOid; + + /* + * Look up inheritance ancestors and generate relation schema, including + * inherited attributes. (Note that stmt->tableElts is destructively + * modified by MergeAttributes.) + */ + stmt->tableElts = + MergeAttributes(stmt->tableElts, stmt->inhRelations, + stmt->relation->relpersistence, + stmt->partbound != NULL, + &inheritOids, &old_constraints, &parentOidCount); + + /* + * Create a tuple descriptor from the relation schema. Note that this + * deals with column names, types, and NOT NULL constraints, but not + * default values or CHECK constraints; we handle those below. + */ + descriptor = BuildDescForRelation(stmt->tableElts); + + /* + * Notice that we allow OIDs here only for plain tables and partitioned + * tables, even though some other relkinds can support them. This is + * necessary because the default_with_oids GUC must apply only to plain + * tables and not any other relkind; doing otherwise would break existing + * pg_dump files. We could allow explicit "WITH OIDS" while not allowing + * default_with_oids to affect other relkinds, but it would complicate + * interpretOidsOption(). 
+ */ + localHasOids = interpretOidsOption(stmt->options, + (relkind == RELKIND_RELATION || + relkind == RELKIND_PARTITIONED_TABLE)); + descriptor->tdhasoid = (localHasOids || parentOidCount > 0); #ifdef _SHARDING_ if(IS_PGXC_DATANODE) has_extent = interpretExtentOption(stmt->options, diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 72aa55f1..452577db 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -51,6 +51,7 @@ #include "utils/tuplesort.h" #include "utils/snapmgr.h" #include "utils/builtins.h" +#include "tcop/utility.h" #include "pgxc/locator.h" #include "pgxc/pgxc.h" #include "parser/parse_type.h" @@ -148,6 +149,7 @@ static void pgxc_abort_connections(PGXCNodeAllHandles *all_handles); static void pgxc_node_remote_commit(TranscationType txn_type, bool need_release_handle); static void pgxc_node_remote_abort(TranscationType txn_type, bool need_release_handle); static bool SetSnapshot(EState *state); +static int pgxc_node_remote_commit_internal(PGXCNodeAllHandles *handles, TranscationType txn_type); #endif static void pgxc_connections_cleanup(ResponseCombiner *combiner); @@ -4653,22 +4655,75 @@ pgxc_node_remote_prepare(char *prepareGID, bool localNode, bool implicit) return NULL; } +#ifdef __TBASE__ +/* + * Commit transactions on remote nodes. + * If barrier lock is set wait while it is released. + * Release remote connection after completion. + * + * For DDL, DN will commit before CN does. + * Because DDLs normally have exclusive locks, then when CN gets committed, + * blocked user transactions will see DNs in a consistent state. + */ +static void +pgxc_node_remote_commit(TranscationType txn_type, bool need_release_handle) +{ + int conn_count = 0; + + if (!enable_parallel_ddl || !has_ddl) + { + /* normal cases */ + conn_count = pgxc_node_remote_commit_internal(get_current_handles(), txn_type); + } + else + { + /* make sure first DN then CN */ + conn_count = pgxc_node_remote_commit_internal(get_current_dn_handles(), txn_type); + conn_count += pgxc_node_remote_commit_internal(get_current_cn_handles(), txn_type); + } + + stat_transaction(conn_count); + + if (need_release_handle) + { + if (!temp_object_included && !PersistentConnections) + { + /* Clean up remote sessions */ + pgxc_node_remote_cleanup_all(); + release_handles(false); + } + } + else + { + /* in subtxn, we just cleanup the connections. not release the handles. */ + if (!temp_object_included && !PersistentConnections) + { + /* Clean up remote sessions without release handles. */ + pgxc_node_remote_cleanup_all(); + } + } + + clear_handles(); +} /* * Commit transactions on remote nodes. * If barrier lock is set wait while it is released. * Release remote connection after completion. 
*/ +static int +pgxc_node_remote_commit_internal(PGXCNodeAllHandles *handles, TranscationType txn_type) +#else static void pgxc_node_remote_commit(TranscationType txn_type, bool need_release_handle) -{// #lizard forgives - int result = 0; - char *commitCmd = NULL; - int i; - ResponseCombiner combiner; - PGXCNodeHandle **connections = NULL; - int conn_count = 0; - PGXCNodeAllHandles *handles = get_current_handles(); +#endif +{ + int result = 0; + char *commitCmd = NULL; + int i; + ResponseCombiner combiner; + PGXCNodeHandle **connections = NULL; + int conn_count = 0; #ifdef __TBASE__ switch (txn_type) @@ -4843,53 +4898,59 @@ pgxc_node_remote_commit(TranscationType txn_type, bool need_release_handle) result = EOF; } - if (result) - { - if (combiner.errorMessage) - { - pgxc_node_report_error(&combiner); - } - else - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Failed to COMMIT the transaction on one or more nodes"))); - } - } - CloseCombiner(&combiner); - } - - stat_transaction(conn_count); + if (result) + { + if (combiner.errorMessage) + { + pgxc_node_report_error(&combiner); + } + else + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to COMMIT the transaction on one or more nodes"))); + } + } + CloseCombiner(&combiner); + } - - if (need_release_handle) - { - if (!temp_object_included && !PersistentConnections) - { - /* Clean up remote sessions */ - pgxc_node_remote_cleanup_all(); - release_handles(false); - } - } - else - { - /* in subtxn, we just cleanup the connections. not release the handles. */ - if (!temp_object_included && !PersistentConnections) - { - /* Clean up remote sessions without release handles. */ - pgxc_node_remote_cleanup_all(); - } - } - - clear_handles(); +#ifndef __TBASE__ + stat_transaction(conn_count); + + + if (need_release_handle) + { + if (!temp_object_included && !PersistentConnections) + { + /* Clean up remote sessions */ + pgxc_node_remote_cleanup_all(); + release_handles(false); + } + } + else + { + /* in subtxn, we just cleanup the connections. not release the handles. */ + if (!temp_object_included && !PersistentConnections) + { + /* Clean up remote sessions without release handles. */ + pgxc_node_remote_cleanup_all(); + } + } + + clear_handles(); +#endif pfree_pgxc_all_handles(handles); - if (connections) - { - pfree(connections); - connections = NULL; - } + if (connections) + { + pfree(connections); + connections = NULL; + } + +#ifdef __TBASE__ + return conn_count; +#endif } /* @@ -6765,6 +6826,118 @@ ExecRemoteUtility(RemoteQuery *node) } } + /* + * DDL will firstly be executed on coordinators then datanodes + * which will avoid deadlocks in cluster. + * Let us assume that user sql and ddl hold conflict locks, + * then there will be two situations: + * 1. The coordinator is not locked, user sql will see datanodes with no lock. + * 2. The coordinator is locked, user sql will wait for ddl to complete. 
+ * + * Send BEGIN control command to all coordinator nodes + */ + if (pgxc_node_begin(co_conn_count, + pgxc_connections->coord_handles, + gxid, + need_tran_block, + false, + PGXC_NODE_COORDINATOR)) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Could not begin transaction on coordinators"))); + } + + /* Send other txn related messages to coordinator nodes */ + for (i = 0; i < co_conn_count; i++) + { + PGXCNodeHandle *conn = pgxc_connections->coord_handles[i]; + + if (snapshot && pgxc_node_send_snapshot(conn, snapshot)) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to send command to coordinators"))); + } + if (pgxc_node_send_cmd_id(conn, cid) < 0) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to send command ID to Datanodes"))); + } + + if (pgxc_node_send_query(conn, node->sql_statement) != 0) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to send command to coordinators"))); + } + } + + /* Make the same for Coordinators */ + while (co_conn_count > 0) + { + int i = 0; + + /* Wait until one of the connections has data available */ + if (pgxc_node_receive(co_conn_count, + pgxc_connections->coord_handles, + NULL)) + { + /* + * Got error + * TODO(Tbase): How do we check the error here? + */ + break; + } + + while (i < co_conn_count) + { + PGXCNodeHandle *conn = pgxc_connections->coord_handles[i]; + int res = handle_response(conn, combiner); + + if (res == RESPONSE_EOF) + { + i++; + } + else if (res == RESPONSE_COMPLETE) + { + /* Ignore, wait for ReadyForQuery */ + if (conn->state == DN_CONNECTION_STATE_ERROR_FATAL) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Unexpected FATAL ERROR on Connection to " + "Coordinator %s pid %d", + pgxc_connections->coord_handles[i]->nodename, + pgxc_connections->coord_handles[i]->backend_pid))); + } + } + else if (res == RESPONSE_ERROR) + { + /* Ignore, wait for ReadyForQuery */ + } + else if (res == RESPONSE_READY) + { + if (i < --co_conn_count) + pgxc_connections->coord_handles[i] = + pgxc_connections->coord_handles[co_conn_count]; + } + else if (res == RESPONSE_TUPDESC) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Unexpected response from coordinator"))); + } + else if (res == RESPONSE_DATAROW) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Unexpected response from coordinator"))); + } + } + } + /* * Send BEGIN control command to all data nodes */ diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c index 7bea908f..8aa01b1a 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -161,6 +161,9 @@ static int get_char(PGXCNodeHandle * conn, char *out); static ParamEntry * paramlist_get_paramentry(List *param_list, const char *name); static ParamEntry * paramentry_copy(ParamEntry * src_entry); static void PGXCNodeHandleError(PGXCNodeHandle *handle, char *msg_body, int len); +static PGXCNodeAllHandles * make_PGXCNodeAllHandles(); +static void get_current_dn_handles_internal(PGXCNodeAllHandles *result); +static void get_current_cn_handles_internal(PGXCNodeAllHandles *result); #endif /* @@ -3908,34 +3911,62 @@ get_handles(List *datanodelist, List *coordlist, bool is_coord_only_query, bool return result; } -PGXCNodeAllHandles * -get_current_handles(void) -{// #lizard forgives +#ifdef __TBASE__ +static PGXCNodeAllHandles * +make_PGXCNodeAllHandles() +{ PGXCNodeAllHandles *result; - PGXCNodeHandle *node_handle; - int i; - result = 
(PGXCNodeAllHandles *) palloc(sizeof(PGXCNodeAllHandles)); if (!result) { ereport(ERROR, (errcode(ERRCODE_OUT_OF_MEMORY), - errmsg("out of memory"))); + errmsg("out of memory"))); } result->primary_handle = NULL; result->co_conn_count = 0; result->dn_conn_count = 0; + result->coord_handles = NULL; + result->datanode_handles = NULL; - result->datanode_handles = (PGXCNodeHandle **) - palloc(NumDataNodes * sizeof(PGXCNodeHandle *)); - if (!result->datanode_handles) + return result; +} +#endif + +PGXCNodeAllHandles * +get_current_handles(void) +{ +#ifndef __TBASE__ + PGXCNodeAllHandles *result = make_PGXCNodeAllHandles(); +#else + PGXCNodeAllHandles *result; + PGXCNodeHandle *node_handle; + int i; + + result = (PGXCNodeAllHandles *) palloc(sizeof(PGXCNodeAllHandles)); + if (!result) { ereport(ERROR, (errcode(ERRCODE_OUT_OF_MEMORY), errmsg("out of memory"))); } + result->primary_handle = NULL; + result->co_conn_count = 0; + result->dn_conn_count = 0; +#endif + +#ifdef __TBASE__ + result->datanode_handles = (PGXCNodeHandle **) + palloc(NumDataNodes * sizeof(PGXCNodeHandle *)); + if (!result->datanode_handles) + { + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + } + for (i = 0; i < NumDataNodes; i++) { node_handle = &dn_handles[i]; @@ -3952,17 +3983,87 @@ get_current_handles(void) errmsg("out of memory"))); } + for (i = 0; i < NumCoords; i++) + { + node_handle = &co_handles[i]; + if (node_handle->sock != NO_SOCKET) + result->coord_handles[result->co_conn_count++] = node_handle; + } +#else + get_current_cn_handles_internal(result); + get_current_dn_handles_internal(result); +#endif + + return result; +} + +#ifdef __TBASE__ + + +PGXCNodeAllHandles * +get_current_cn_handles(void) +{ + PGXCNodeAllHandles *result = make_PGXCNodeAllHandles(); + + get_current_cn_handles_internal(result); + return result; +} + +PGXCNodeAllHandles * +get_current_dn_handles(void) +{ + PGXCNodeAllHandles *result = make_PGXCNodeAllHandles(); + + get_current_dn_handles_internal(result); + return result; +} + +static void +get_current_dn_handles_internal(PGXCNodeAllHandles *result) +{ + PGXCNodeHandle *node_handle; + int i; + + result->datanode_handles = (PGXCNodeHandle **) + palloc(NumDataNodes * sizeof(PGXCNodeHandle *)); + if (!result->datanode_handles) + { + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + } + + for (i = 0; i < NumDataNodes; i++) + { + node_handle = &dn_handles[i]; + if (node_handle->sock != NO_SOCKET) + result->datanode_handles[result->dn_conn_count++] = node_handle; + } +} + +static void +get_current_cn_handles_internal(PGXCNodeAllHandles *result) +{ + PGXCNodeHandle *node_handle; + int i; + + result->coord_handles = (PGXCNodeHandle **) + palloc(NumCoords * sizeof(PGXCNodeHandle *)); + if (!result->coord_handles) + { + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + } + for (i = 0; i < NumCoords; i++) { node_handle = &co_handles[i]; if (node_handle->sock != NO_SOCKET) result->coord_handles[result->co_conn_count++] = node_handle; } - - return result; } -#ifdef __TBASE__ PGXCNodeAllHandles * get_sock_fatal_handles(void) { @@ -5455,4 +5556,32 @@ void PGXCGetAllDnOid(Oid *nodelist) } +#ifdef __TBASE__ +/* + * Return the name of ascii-minimized coordinator + */ +char* find_first_exec_cn(void) +{ + int i = 0; + char* result = co_handles[0].nodename; + + for (i = 1; i < NumCoords; i++) + { + result = (strcmp(co_handles[i].nodename, result) < 0) ? 
+ co_handles[i].nodename : + result; + } + + return result; +} + +/* + * Return whether I am the ascii-minimized coordinator + */ +bool is_first_exec_cn(char *first_cn) +{ + return strcmp(first_cn, PGXCNodeName) == 0; +} +#endif + #endif diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index 7af36bb3..8d5dc71a 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -671,6 +671,10 @@ bool isRestoreMode = false; int remoteConnType = REMOTE_CONN_APP; +#ifdef __TBASE__ +bool is_forward = false; +#endif + /* key pair to be used as object id while using advisory lock for backup */ Datum xc_lockForBackupKey1; Datum xc_lockForBackupKey2; diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index ce95d95d..6103517e 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -3468,9 +3468,11 @@ finish_xact_command(void) MemoryContextStats(TopMemoryContext); #endif - xact_started = false; - - } + xact_started = false; +#ifdef __TBASE__ + has_ddl = false; +#endif + } } @@ -5103,61 +5105,66 @@ PostgresMain(int argc, char *argv[], AuditProcessResultInfo(false); } #endif - /* - * Abort the current transaction in order to recover. - */ - AbortCurrentTransaction(); - - if (am_walsender) - WalSndErrorCleanup(); + /* + * Abort the current transaction in order to recover. + */ + AbortCurrentTransaction(); + + if (am_walsender) + WalSndErrorCleanup(); + + /* + * We can't release replication slots inside AbortTransaction() as we + * need to be able to start and abort transactions while having a slot + * acquired. But we never need to hold them across top level errors, + * so releasing here is fine. There's another cleanup in ProcKill() + * ensuring we'll correctly cleanup on FATAL errors as well. + */ + if (MyReplicationSlot != NULL) + ReplicationSlotRelease(); + + /* We also want to cleanup temporary slots on error. */ + ReplicationSlotCleanup(); + + /* + * Now return to normal top-level context and clear ErrorContext for + * next time. + */ + MemoryContextSwitchTo(TopMemoryContext); + FlushErrorState(); + + /* + * If we were handling an extended-query-protocol message, initiate + * skip till next Sync. This also causes us not to issue + * ReadyForQuery (until we get Sync). + */ + if (doing_extended_query_message) + ignore_till_sync = true; + + /* We don't have a transaction command open anymore */ + xact_started = false; - /* - * We can't release replication slots inside AbortTransaction() as we - * need to be able to start and abort transactions while having a slot - * acquired. But we never need to hold them across top level errors, - * so releasing here is fine. There's another cleanup in ProcKill() - * ensuring we'll correctly cleanup on FATAL errors as well. - */ - if (MyReplicationSlot != NULL) - ReplicationSlotRelease(); - - /* We also want to cleanup temporary slots on error. */ - ReplicationSlotCleanup(); - - /* - * Now return to normal top-level context and clear ErrorContext for - * next time. - */ - MemoryContextSwitchTo(TopMemoryContext); - FlushErrorState(); - - /* - * If we were handling an extended-query-protocol message, initiate - * skip till next Sync. This also causes us not to issue - * ReadyForQuery (until we get Sync). 
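A detail that is easy to miss in the reindented error path: the new per-backend DDL flag has to be cleared both when a transaction finishes normally and in the top-level error handler, otherwise an aborted DDL would leak the flag into the next transaction. A minimal model of that lifecycle, assuming hypothetical run_statement, finish_xact and recover_from_error helpers rather than the real PostgresMain loop:

#include <stdbool.h>
#include <stdio.h>

static bool has_ddl = false;   /* per-backend flag, as in the patch */

/* Hypothetical statement executor: DDL raises the flag and may fail. */
static int run_statement(const char *sql, bool is_ddl, bool fail)
{
    (void) sql;
    if (is_ddl)
        has_ddl = true;
    return fail ? -1 : 0;
}

static void finish_xact(void)         { has_ddl = false; }  /* normal commit path   */
static void recover_from_error(void)  { has_ddl = false; }  /* top-level error path */

int main(void)
{
    if (run_statement("CREATE TABLE t(a int)", true, true) != 0)
        recover_from_error();      /* error path must also clear the flag */
    else
        finish_xact();

    printf("has_ddl after recovery: %d\n", has_ddl);  /* 0: next txn starts clean */
    return 0;
}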
- */ - if (doing_extended_query_message) - ignore_till_sync = true; - - /* We don't have a transaction command open anymore */ - xact_started = false; - - /* - * If an error occurred while we were reading a message from the - * client, we have potentially lost track of where the previous - * message ends and the next one begins. Even though we have - * otherwise recovered from the error, we cannot safely read any more - * messages from the client, so there isn't much we can do with the - * connection anymore. - */ - if (pq_is_reading_msg()) - ereport(FATAL, - (errcode(ERRCODE_PROTOCOL_VIOLATION), - errmsg("terminating connection because protocol synchronization was lost"))); +#ifdef __TBASE__ + /* Clear DDL flag */ + has_ddl = false; +#endif - /* Now we can allow interrupts again */ - RESUME_INTERRUPTS(); - } + /* + * If an error occurred while we were reading a message from the + * client, we have potentially lost track of where the previous + * message ends and the next one begins. Even though we have + * otherwise recovered from the error, we cannot safely read any more + * messages from the client, so there isn't much we can do with the + * connection anymore. + */ + if (pq_is_reading_msg()) + ereport(FATAL, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg("terminating connection because protocol synchronization was lost"))); + + /* Now we can allow interrupts again */ + RESUME_INTERRUPTS(); + } #ifdef __TBASE__ /* for error code contrib */ diff --git a/src/backend/tcop/pquery.c b/src/backend/tcop/pquery.c index 983d04cc..b2b2b678 100644 --- a/src/backend/tcop/pquery.c +++ b/src/backend/tcop/pquery.c @@ -1894,12 +1894,15 @@ PortalRunUtility(Portal portal, PlannedStmt *pstmt, #endif { #ifdef __SUPPORT_DISTRIBUTED_TRANSACTION__ - /* Avoid the start timestamp to be too old to execute on DNs */ - if(IsA(utilityStmt, VacuumStmt) || IsA(utilityStmt, AlterNodeStmt)) - { - snapshot = GetLocalTransactionSnapshot(); - }else - snapshot = GetTransactionSnapshot(); + /* Avoid the start timestamp to be too old to execute on DNs */ + if(IsA(utilityStmt, VacuumStmt) || IsA(utilityStmt, AlterNodeStmt)) + { + snapshot = GetLocalTransactionSnapshot(); + } + else + snapshot = GetTransactionSnapshot(); + + has_ddl = true; #else snapshot = GetTransactionSnapshot(); #endif diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c index a7746d53..bad502d5 100644 --- a/src/backend/tcop/utility.c +++ b/src/backend/tcop/utility.c @@ -146,8 +146,12 @@ static bool IsStmtAllowedInLockedMode(Node *parsetree, const char *queryString); static void ExecCreateKeyValuesStmt(Node *parsetree); static void RemoveSequeceBarely(DropStmt *stmt); extern void RegisterSeqDrop(char *name, int32 type); +static bool forward_ddl(Node *node, const char *queryString); extern bool g_GTM_skip_catalog; + +bool has_ddl; +bool enable_parallel_ddl; #endif #endif @@ -1730,6 +1734,36 @@ ProcessUtilityPost(PlannedStmt *pstmt, ExecUtilityStmtOnNodes(parsetree, queryString, NULL, sentToRemote, auto_commit, exec_type, is_temp, add_context); } + +#ifdef __TBASE__ +static bool forward_ddl(Node *node, const char *queryString) +{ + Oid *oid_list = NULL; + char *first_cn = NULL; + + if (!enable_parallel_ddl || !IS_PGXC_LOCAL_COORDINATOR) + return false; + + if (IsA(node,IndexStmt) && + castNode(IndexStmt,node)->concurrent) + return false; + + first_cn = find_first_exec_cn(); + if(is_first_exec_cn(first_cn)) + return false; + + oid_list = (Oid *) palloc0(sizeof(Oid)); + oid_list[0] = get_pgxc_nodeoid(first_cn); + + PGXCNodeSetParam(false, 
"is_forward", "true", 0); + pgxc_execute_on_nodes(1, oid_list, strdup(queryString)); + PGXCNodeSetParam(false, "is_forward", "false", 0); + + pfree(oid_list); + return true; +} +#endif + /* * standard_ProcessUtility itself deals only with utility commands for * which we do not provide event trigger support. Commands that do have diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index ad023691..b78542f6 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -198,6 +198,7 @@ extern BackendId CoordSessionBackendId; extern bool PlpgsqlDebugPrint; /* used for get total size of session */ static int32 g_TotalMemorySize = 0; +extern bool enable_parallel_ddl; #endif static int GUC_check_errcode_value; @@ -2669,21 +2670,39 @@ static struct config_bool ConfigureNamesBool[] = #endif #ifdef __TBASE__ + { + {"enable_lock_account", PGC_SUSET, CUSTOM_OPTIONS, + gettext_noop("Enable lock account when login fail serval times."), + NULL + }, + &enable_lock_account, + false, + NULL, NULL, NULL + }, + { + {"lock_account_print", PGC_SUSET, CUSTOM_OPTIONS, + gettext_noop("Enable print log in lock account procedure."), + NULL + }, + &lock_account_print, + false, + NULL, NULL, NULL + }, { - {"enable_lock_account", PGC_SUSET, CUSTOM_OPTIONS, - gettext_noop("Enable lock account when login fail serval times."), - NULL + {"enable_parallel_ddl", PGC_USERSET, CUSTOM_OPTIONS, + gettext_noop("Enable parallel DDL with no dead lock."), + NULL }, - &enable_lock_account, - false, + &enable_parallel_ddl, + true, NULL, NULL, NULL }, { - {"lock_account_print", PGC_SUSET, CUSTOM_OPTIONS, - gettext_noop("Enable print log in lock account procedure."), + {"is_forward", PGC_INTERNAL, CUSTOM_OPTIONS, + gettext_noop("Whether DDL is forwarded from another coordinator."), NULL }, - &lock_account_print, + &is_forward, false, NULL, NULL, NULL }, @@ -8180,7 +8199,8 @@ set_config_option(const char *name, const char *value, */ if ((source == PGC_S_SESSION || source == PGC_S_CLIENT) && (IS_PGXC_DATANODE || !IsConnFromCoord()) - && (strcmp(name,"remotetype") != 0 && strcmp(name,"parentnode") != 0)) + && (strcmp(name,"remotetype") != 0 && strcmp(name,"parentnode") != 0 && + strcmp(name,"is_forward") != 0)) send_to_nodes = true; #endif diff --git a/src/include/pgxc/pgxc.h b/src/include/pgxc/pgxc.h index 770b213c..475f117d 100644 --- a/src/include/pgxc/pgxc.h +++ b/src/include/pgxc/pgxc.h @@ -95,7 +95,10 @@ typedef enum } RemoteConnTypes; /* Determine remote connection type for a PGXC backend */ -extern int remoteConnType; +extern int remoteConnType; +#ifdef __TBASE__ +extern bool is_forward; +#endif /* Local node name and numer */ extern char *PGXCNodeName; @@ -123,10 +126,10 @@ extern Datum xc_lockForBackupKey2; #define PGXC_PARENT_NODE_TYPE parentPGXCNodeType #define REMOTE_CONN_TYPE remoteConnType -#define IsConnFromApp() (remoteConnType == REMOTE_CONN_APP) -#define IsConnFromCoord() (remoteConnType == REMOTE_CONN_COORD) -#define IsConnFromDatanode() (remoteConnType == REMOTE_CONN_DATANODE) -#define IsConnFromGtm() (remoteConnType == REMOTE_CONN_GTM) +#define IsConnFromApp() (remoteConnType == REMOTE_CONN_APP || is_forward == true) +#define IsConnFromCoord() (remoteConnType == REMOTE_CONN_COORD && is_forward == false) +#define IsConnFromDatanode() (remoteConnType == REMOTE_CONN_DATANODE && is_forward == false) +#define IsConnFromGtm() (remoteConnType == REMOTE_CONN_GTM && is_forward == false) #define IsConnFromGtmProxy() (remoteConnType == REMOTE_CONN_GTM_PROXY) /* key pair to be used as object 
id while using advisory lock for backup */ diff --git a/src/include/pgxc/pgxcnode.h b/src/include/pgxc/pgxcnode.h index 7dcb5bf7..fcd765c0 100644 --- a/src/include/pgxc/pgxcnode.h +++ b/src/include/pgxc/pgxcnode.h @@ -175,6 +175,8 @@ extern PGXCNodeAllHandles *get_handles(List *datanodelist, List *coordlist, bool extern PGXCNodeAllHandles *get_current_handles(void); #ifdef __TBASE__ +extern PGXCNodeAllHandles *get_current_cn_handles(void); +extern PGXCNodeAllHandles *get_current_dn_handles(void); extern PGXCNodeAllHandles * get_sock_fatal_handles(void); #endif extern void pfree_pgxc_all_handles(PGXCNodeAllHandles *handles); @@ -282,10 +284,12 @@ extern void pgxc_print_pending_data(PGXCNodeHandle *handle, bool reset); #ifdef __TBASE__ void add_error_message_from_combiner(PGXCNodeHandle *handle, void *combiner_input); -extern inline void pgxc_set_coordinator_proc_pid(int proc_pid); -extern inline int pgxc_get_coordinator_proc_pid(void); -extern inline void pgxc_set_coordinator_proc_vxid(TransactionId proc_vxid); -extern inline TransactionId pgxc_get_coordinator_proc_vxid(void); +inline void pgxc_set_coordinator_proc_pid(int proc_pid); +inline int pgxc_get_coordinator_proc_pid(void); +inline void pgxc_set_coordinator_proc_vxid(TransactionId proc_vxid); +inline TransactionId pgxc_get_coordinator_proc_vxid(void); +char* find_first_exec_cn(void); +bool is_first_exec_cn(char *first_cn); #endif #ifdef __AUDIT__ diff --git a/src/include/tcop/utility.h b/src/include/tcop/utility.h index b1c3c0f4..1df0be74 100644 --- a/src/include/tcop/utility.h +++ b/src/include/tcop/utility.h @@ -66,5 +66,8 @@ extern bool pgxc_lock_for_utility_stmt(Node *parsetree); #ifdef __TBASE__ typedef void (*ErrcodeHookType) (ErrorData *edata, StringInfo buff); extern PGDLLIMPORT ErrcodeHookType g_pfErrcodeHook; + +extern bool has_ddl; +extern bool enable_parallel_ddl; #endif #endif /* UTILITY_H */ From 15465a3c04ed909089c8c58a010821c68ad7c11b Mon Sep 17 00:00:00 2001 From: youngxie Date: Mon, 31 Aug 2020 20:11:19 +0800 Subject: [PATCH 036/578] Perfects comments ,names and format. --- src/backend/pgxc/pool/execRemote.c | 11 +++-- src/backend/pgxc/pool/pgxcnode.c | 69 ++++++++++++++--------------- src/backend/postmaster/postmaster.c | 2 +- src/backend/tcop/postgres.c | 35 ++++++++------- src/backend/tcop/pquery.c | 6 +-- src/backend/tcop/utility.c | 48 +++++++++++++------- src/backend/utils/misc/guc.c | 11 ++--- src/include/pgxc/pgxc.h | 9 ++-- src/include/pgxc/pgxcnode.h | 4 +- src/include/tcop/utility.h | 4 +- 10 files changed, 113 insertions(+), 86 deletions(-) diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 452577db..36bdbed3 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -4662,15 +4662,15 @@ pgxc_node_remote_prepare(char *prepareGID, bool localNode, bool implicit) * Release remote connection after completion. * * For DDL, DN will commit before CN does. - * Because DDLs normally have exclusive locks, then when CN gets committed, - * blocked user transactions will see DNs in a consistent state. + * Because DDLs normally have conflict locks, when CN gets committed, + * DNs will be in a consistent state for blocked user transactions. 
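The revised connection-classification macros hinge on the forwarding flag: a DDL forwarded from a peer coordinator arrives on a coordinator-to-coordinator connection, but must be planned and re-distributed as if it came from a client, so IsConnFromApp() becomes true and IsConnFromCoord() false while the flag is set. The stand-alone restatement below uses illustrative constants and function names (CONN_APP, conn_from_app, ...), not the real enum or macros:

#include <stdbool.h>
#include <stdio.h>

/* Illustrative connection-origin values; the real ones live in pgxc.h. */
enum { CONN_APP, CONN_COORD, CONN_DATANODE };

static int  remote_conn_type   = CONN_COORD;  /* forwarded DDL arrives CN->CN          */
static bool is_forward_request = true;        /* set via the new GUC before forwarding */

static bool conn_from_app(void)   { return remote_conn_type == CONN_APP || is_forward_request; }
static bool conn_from_coord(void) { return remote_conn_type == CONN_COORD && !is_forward_request; }

int main(void)
{
    /* Treated as a client statement: the leader CN plans and distributes it. */
    printf("from app:   %d\n", conn_from_app());    /* 1 */
    printf("from coord: %d\n", conn_from_coord());  /* 0 */
    return 0;
}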
*/ static void pgxc_node_remote_commit(TranscationType txn_type, bool need_release_handle) { int conn_count = 0; - if (!enable_parallel_ddl || !has_ddl) + if (!enable_parallel_ddl || !is_txn_has_parallel_ddl) { /* normal cases */ conn_count = pgxc_node_remote_commit_internal(get_current_handles(), txn_type); @@ -6874,7 +6874,10 @@ ExecRemoteUtility(RemoteQuery *node) } } - /* Make the same for Coordinators */ + /* + * Stop if all commands are completed or we got a data row and + * initialized state node for subsequent invocations + */ while (co_conn_count > 0) { int i = 0; diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c index 8aa01b1a..812502d6 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -161,7 +161,7 @@ static int get_char(PGXCNodeHandle * conn, char *out); static ParamEntry * paramlist_get_paramentry(List *param_list, const char *name); static ParamEntry * paramentry_copy(ParamEntry * src_entry); static void PGXCNodeHandleError(PGXCNodeHandle *handle, char *msg_body, int len); -static PGXCNodeAllHandles * make_PGXCNodeAllHandles(); +static PGXCNodeAllHandles * get_empty_handles(void); static void get_current_dn_handles_internal(PGXCNodeAllHandles *result); static void get_current_cn_handles_internal(PGXCNodeAllHandles *result); #endif @@ -3913,10 +3913,10 @@ get_handles(List *datanodelist, List *coordlist, bool is_coord_only_query, bool #ifdef __TBASE__ static PGXCNodeAllHandles * -make_PGXCNodeAllHandles() +get_empty_handles(void) { PGXCNodeAllHandles *result; - result = (PGXCNodeAllHandles *) palloc(sizeof(PGXCNodeAllHandles)); + result = (PGXCNodeAllHandles *) palloc0(sizeof(PGXCNodeAllHandles)); if (!result) { ereport(ERROR, @@ -3924,12 +3924,6 @@ make_PGXCNodeAllHandles() errmsg("out of memory"))); } - result->primary_handle = NULL; - result->co_conn_count = 0; - result->dn_conn_count = 0; - result->coord_handles = NULL; - result->datanode_handles = NULL; - return result; } #endif @@ -3937,27 +3931,23 @@ make_PGXCNodeAllHandles() PGXCNodeAllHandles * get_current_handles(void) { -#ifndef __TBASE__ - PGXCNodeAllHandles *result = make_PGXCNodeAllHandles(); +#ifdef __TBASE__ + PGXCNodeAllHandles *result = get_empty_handles(); #else PGXCNodeAllHandles *result; PGXCNodeHandle *node_handle; int i; - result = (PGXCNodeAllHandles *) palloc(sizeof(PGXCNodeAllHandles)); - if (!result) - { - ereport(ERROR, - (errcode(ERRCODE_OUT_OF_MEMORY), - errmsg("out of memory"))); - } - - result->primary_handle = NULL; - result->co_conn_count = 0; - result->dn_conn_count = 0; + result = (PGXCNodeAllHandles *) palloc0(sizeof(PGXCNodeAllHandles)); + if (!result) + { + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + } #endif -#ifdef __TBASE__ +#ifndef __TBASE__ result->datanode_handles = (PGXCNodeHandle **) palloc(NumDataNodes * sizeof(PGXCNodeHandle *)); if (!result->datanode_handles) @@ -4003,7 +3993,7 @@ get_current_handles(void) PGXCNodeAllHandles * get_current_cn_handles(void) { - PGXCNodeAllHandles *result = make_PGXCNodeAllHandles(); + PGXCNodeAllHandles *result = get_empty_handles(); get_current_cn_handles_internal(result); return result; @@ -4012,7 +4002,7 @@ get_current_cn_handles(void) PGXCNodeAllHandles * get_current_dn_handles(void) { - PGXCNodeAllHandles *result = make_PGXCNodeAllHandles(); + PGXCNodeAllHandles *result = get_empty_handles(); get_current_dn_handles_internal(result); return result; @@ -4033,11 +4023,14 @@ get_current_dn_handles_internal(PGXCNodeAllHandles *result) 
errmsg("out of memory"))); } + result->dn_conn_count = 0; for (i = 0; i < NumDataNodes; i++) { node_handle = &dn_handles[i]; if (node_handle->sock != NO_SOCKET) + { result->datanode_handles[result->dn_conn_count++] = node_handle; + } } } @@ -4056,11 +4049,14 @@ get_current_cn_handles_internal(PGXCNodeAllHandles *result) errmsg("out of memory"))); } + result->co_conn_count = 0; for (i = 0; i < NumCoords; i++) { node_handle = &co_handles[i]; if (node_handle->sock != NO_SOCKET) + { result->coord_handles[result->co_conn_count++] = node_handle; + } } } @@ -5558,27 +5554,30 @@ void PGXCGetAllDnOid(Oid *nodelist) #ifdef __TBASE__ /* - * Return the name of ascii-minimized coordinator + * Return the name of ascii-minimized coordinator as ddl leader cn */ -char* find_first_exec_cn(void) +inline char* +find_ddl_leader_cn(void) { int i = 0; - char* result = co_handles[0].nodename; + char* result = NULL; - for (i = 1; i < NumCoords; i++) + for (i = 0; i < NumCoords; i++) { - result = (strcmp(co_handles[i].nodename, result) < 0) ? - co_handles[i].nodename : - result; + if(result == NULL || strcmp(co_handles[i].nodename, result) < 0) + { + result = co_handles[i].nodename; + } } - return result; + return pstrdup(result); } /* - * Return whether I am the ascii-minimized coordinator + * Return whether I am the leader cn */ -bool is_first_exec_cn(char *first_cn) +inline bool +is_ddl_leader_cn(char *first_cn) { return strcmp(first_cn, PGXCNodeName) == 0; } diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index 8d5dc71a..1f8c33a5 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -672,7 +672,7 @@ bool isRestoreMode = false; int remoteConnType = REMOTE_CONN_APP; #ifdef __TBASE__ -bool is_forward = false; +bool is_forward_request = false; #endif /* key pair to be used as object id while using advisory lock for backup */ diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index 6103517e..80b2fc4f 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -975,8 +975,9 @@ pg_analyze_and_rewrite_params(RawStmt *parsetree, */ static List * pg_rewrite_query(Query *query) -{// #lizard forgives - List *querytree_list; +{ + List *querytree_list; + char *leader_cn = NULL; if (Debug_print_parse) elog_node_display(LOG, "parse tree", query, @@ -986,17 +987,21 @@ pg_rewrite_query(Query *query) ResetUsage(); #ifdef PGXC + /* directly forward the request */ + leader_cn = find_ddl_leader_cn(); + if (query->commandType == CMD_UTILITY && - IsA(query->utilityStmt, CreateTableAsStmt)) - { - /* - * CREATE TABLE AS SELECT and SELECT INTO are rewritten so that the - * target table is created first. The SELECT query is then transformed - * into an INSERT INTO statement - */ - querytree_list = QueryRewriteCTAS(query); - } - else + IsA(query->utilityStmt, CreateTableAsStmt) && + (enable_parallel_ddl && is_ddl_leader_cn(leader_cn))) + { + /* + * CREATE TABLE AS SELECT and SELECT INTO are rewritten so that the + * target table is created first. 
The SELECT query is then transformed + * into an INSERT INTO statement + */ + querytree_list = QueryRewriteCTAS(query); + } + else #endif if (query->commandType == CMD_UTILITY) { @@ -3470,7 +3475,7 @@ finish_xact_command(void) xact_started = false; #ifdef __TBASE__ - has_ddl = false; + is_txn_has_parallel_ddl = false; #endif } } @@ -5145,8 +5150,8 @@ PostgresMain(int argc, char *argv[], xact_started = false; #ifdef __TBASE__ - /* Clear DDL flag */ - has_ddl = false; + /* Clear parallel DDL flag */ + is_txn_has_parallel_ddl = false; #endif /* diff --git a/src/backend/tcop/pquery.c b/src/backend/tcop/pquery.c index b2b2b678..67cf48ae 100644 --- a/src/backend/tcop/pquery.c +++ b/src/backend/tcop/pquery.c @@ -1900,9 +1900,9 @@ PortalRunUtility(Portal portal, PlannedStmt *pstmt, snapshot = GetLocalTransactionSnapshot(); } else - snapshot = GetTransactionSnapshot(); - - has_ddl = true; + { + snapshot = GetTransactionSnapshot(); + } #else snapshot = GetTransactionSnapshot(); #endif diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c index bad502d5..cd0252f9 100644 --- a/src/backend/tcop/utility.c +++ b/src/backend/tcop/utility.c @@ -146,11 +146,11 @@ static bool IsStmtAllowedInLockedMode(Node *parsetree, const char *queryString); static void ExecCreateKeyValuesStmt(Node *parsetree); static void RemoveSequeceBarely(DropStmt *stmt); extern void RegisterSeqDrop(char *name, int32 type); -static bool forward_ddl(Node *node, const char *queryString); +static bool forward_ddl_to_leader_cn(Node *node, const char *queryString); extern bool g_GTM_skip_catalog; -bool has_ddl; +bool is_txn_has_parallel_ddl; bool enable_parallel_ddl; #endif @@ -1736,30 +1736,46 @@ ProcessUtilityPost(PlannedStmt *pstmt, } #ifdef __TBASE__ -static bool forward_ddl(Node *node, const char *queryString) +/* + * Forward specific DDLs request to leader cn + * on success return true else false + */ +static bool forward_ddl_to_leader_cn(Node *node, const char *queryString) { - Oid *oid_list = NULL; - char *first_cn = NULL; + Oid leader_cn = InvalidOid; + char *leader_name = NULL; - if (!enable_parallel_ddl || !IS_PGXC_LOCAL_COORDINATOR) + /* avoid forward recurse */ + if (!enable_parallel_ddl || !IS_PGXC_LOCAL_COORDINATOR || is_forward_request) + { return false; + } - if (IsA(node,IndexStmt) && - castNode(IndexStmt,node)->concurrent) + /* CONCURRENT INDEX is not supported */ + if (IsA(node,IndexStmt) && castNode(IndexStmt,node)->concurrent) + { return false; + } - first_cn = find_first_exec_cn(); - if(is_first_exec_cn(first_cn)) + /* Set parallel ddl flag */ + is_txn_has_parallel_ddl = true; + + leader_name = find_ddl_leader_cn(); + if(is_ddl_leader_cn(leader_name)) + { return false; + } + + leader_cn = get_pgxc_nodeoid(leader_name); + + /* Set flag to indicate forwarded request */ + PGXCNodeSetParam(false, "is_forward_request", "true", 0); - oid_list = (Oid *) palloc0(sizeof(Oid)); - oid_list[0] = get_pgxc_nodeoid(first_cn); + pgxc_execute_on_nodes(1, &leader_cn, pstrdup(queryString)); - PGXCNodeSetParam(false, "is_forward", "true", 0); - pgxc_execute_on_nodes(1, oid_list, strdup(queryString)); - PGXCNodeSetParam(false, "is_forward", "false", 0); + /* Cancel forwarded flag for subsequent requests */ + PGXCNodeSetParam(false, "is_forward_request", "false", 0); - pfree(oid_list); return true; } #endif diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index b78542f6..93dc7020 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -2690,7 +2690,7 @@ static struct 
config_bool ConfigureNamesBool[] = }, { {"enable_parallel_ddl", PGC_USERSET, CUSTOM_OPTIONS, - gettext_noop("Enable parallel DDL with no dead lock."), + gettext_noop("Enable parallel DDL with no deadlock."), NULL }, &enable_parallel_ddl, @@ -2698,11 +2698,12 @@ static struct config_bool ConfigureNamesBool[] = NULL, NULL, NULL }, { - {"is_forward", PGC_INTERNAL, CUSTOM_OPTIONS, + {"is_forward_request", PGC_USERSET, CUSTOM_OPTIONS, gettext_noop("Whether DDL is forwarded from another coordinator."), - NULL + NULL, + GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_AUTO_FILE | GUC_DISALLOW_IN_FILE | GUC_NO_SHOW_ALL }, - &is_forward, + &is_forward_request, false, NULL, NULL, NULL }, @@ -8200,7 +8201,7 @@ set_config_option(const char *name, const char *value, if ((source == PGC_S_SESSION || source == PGC_S_CLIENT) && (IS_PGXC_DATANODE || !IsConnFromCoord()) && (strcmp(name,"remotetype") != 0 && strcmp(name,"parentnode") != 0 && - strcmp(name,"is_forward") != 0)) + strcmp(name,"is_forward_request") != 0)) send_to_nodes = true; #endif diff --git a/src/include/pgxc/pgxc.h b/src/include/pgxc/pgxc.h index 475f117d..9f3ed6f5 100644 --- a/src/include/pgxc/pgxc.h +++ b/src/include/pgxc/pgxc.h @@ -97,7 +97,8 @@ typedef enum /* Determine remote connection type for a PGXC backend */ extern int remoteConnType; #ifdef __TBASE__ -extern bool is_forward; +/* Is request forwarded another coordinator */ +extern bool is_forward_request; #endif /* Local node name and numer */ @@ -126,9 +127,9 @@ extern Datum xc_lockForBackupKey2; #define PGXC_PARENT_NODE_TYPE parentPGXCNodeType #define REMOTE_CONN_TYPE remoteConnType -#define IsConnFromApp() (remoteConnType == REMOTE_CONN_APP || is_forward == true) -#define IsConnFromCoord() (remoteConnType == REMOTE_CONN_COORD && is_forward == false) -#define IsConnFromDatanode() (remoteConnType == REMOTE_CONN_DATANODE && is_forward == false) +#define IsConnFromApp() (remoteConnType == REMOTE_CONN_APP || is_forward_request == true) +#define IsConnFromCoord() (remoteConnType == REMOTE_CONN_COORD && is_forward_request == false) +#define IsConnFromDatanode() (remoteConnType == REMOTE_CONN_DATANODE && is_forward_request == false) #define IsConnFromGtm() (remoteConnType == REMOTE_CONN_GTM && is_forward == false) #define IsConnFromGtmProxy() (remoteConnType == REMOTE_CONN_GTM_PROXY) diff --git a/src/include/pgxc/pgxcnode.h b/src/include/pgxc/pgxcnode.h index fcd765c0..872536fc 100644 --- a/src/include/pgxc/pgxcnode.h +++ b/src/include/pgxc/pgxcnode.h @@ -288,8 +288,8 @@ inline void pgxc_set_coordinator_proc_pid(int proc_pid); inline int pgxc_get_coordinator_proc_pid(void); inline void pgxc_set_coordinator_proc_vxid(TransactionId proc_vxid); inline TransactionId pgxc_get_coordinator_proc_vxid(void); -char* find_first_exec_cn(void); -bool is_first_exec_cn(char *first_cn); +inline char* find_ddl_leader_cn(void); +inline bool is_ddl_leader_cn(char *leader_cn); #endif #ifdef __AUDIT__ diff --git a/src/include/tcop/utility.h b/src/include/tcop/utility.h index 1df0be74..92605dff 100644 --- a/src/include/tcop/utility.h +++ b/src/include/tcop/utility.h @@ -67,7 +67,9 @@ extern bool pgxc_lock_for_utility_stmt(Node *parsetree); typedef void (*ErrcodeHookType) (ErrorData *edata, StringInfo buff); extern PGDLLIMPORT ErrcodeHookType g_pfErrcodeHook; -extern bool has_ddl; +/* Does txn include parallel DDLs */ +extern bool is_txn_has_parallel_ddl; +/* Parallel DDL switch */ extern bool enable_parallel_ddl; #endif #endif /* UTILITY_H */ From 3837cd66de9e0c7b41a0dd41448920ef54a378a8 Mon Sep 17 00:00:00 2001 From: 
youngxie Date: Mon, 31 Aug 2020 20:27:13 +0800 Subject: [PATCH 037/578] Fix under single node mode. --- src/backend/pgxc/pool/pgxcnode.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c index 812502d6..f2886833 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -5570,7 +5570,10 @@ find_ddl_leader_cn(void) } } - return pstrdup(result); + if(result) + result = pstrdup(result); + + return result; } /* @@ -5579,6 +5582,9 @@ find_ddl_leader_cn(void) inline bool is_ddl_leader_cn(char *first_cn) { + if(first_cn == NULL) + return false; + return strcmp(first_cn, PGXCNodeName) == 0; } #endif From 3a33e59fa7f8bba084f6b4617ea4472006e36ea6 Mon Sep 17 00:00:00 2001 From: youngxie Date: Tue, 1 Sep 2020 10:48:45 +0800 Subject: [PATCH 038/578] regress fix --- src/test/regress/expected/sysviews.out | 3 ++- src/test/regress/expected/xc_misc.out | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out index ded66084..e2765dd9 100644 --- a/src/test/regress/expected/sysviews.out +++ b/src/test/regress/expected/sysviews.out @@ -111,6 +111,7 @@ select name, setting from pg_settings where name like 'enable%'; enable_nestloop | on enable_null_string | off enable_oracle_compatible | off + enable_parallel_ddl | on enable_pgbouncer | off enable_plpgsql_debug_print | off enable_pooler_debug_print | on @@ -124,7 +125,7 @@ select name, setting from pg_settings where name like 'enable%'; enable_tidscan | on enable_transparent_crypt | on enable_user_authority_force_check | off -(52 rows) +(53 rows) -- Test that the pg_timezone_names and pg_timezone_abbrevs views are -- more-or-less working. We can't test their contents in any great detail diff --git a/src/test/regress/expected/xc_misc.out b/src/test/regress/expected/xc_misc.out index 75d207cc..0c894fdd 100644 --- a/src/test/regress/expected/xc_misc.out +++ b/src/test/regress/expected/xc_misc.out @@ -55,7 +55,7 @@ SET check_function_bodies = false; create function f1 () returns setof my_tab1 as $$ create table my_tab2 (a int); select * from my_tab1; $$ language sql; ERROR: function "f1" already exists with same argument types select f1(); -ERROR: Unexpected response from Datanode +ERROR: Unexpected response from coordinator CONTEXT: SQL function "f1" statement 1 SET check_function_bodies = true; drop function f1(); From fe33aaa6cf030b59021cabaf473159e6ea876d9a Mon Sep 17 00:00:00 2001 From: ericxwu Date: Tue, 1 Sep 2020 14:53:43 +0800 Subject: [PATCH 039/578] Support pullup agg sublink with ScalarArrayOpExpr qual --- src/backend/optimizer/plan/subselect.c | 456 +++++++------ src/test/regress/expected/subselect.out | 34 + src/test/regress/expected/subselect_1.out | 772 ++++++++++++++++++++++ src/test/regress/sql/subselect.sql | 188 +++--- 4 files changed, 1158 insertions(+), 292 deletions(-) diff --git a/src/backend/optimizer/plan/subselect.c b/src/backend/optimizer/plan/subselect.c index 05a21a79..8bb67513 100644 --- a/src/backend/optimizer/plan/subselect.c +++ b/src/backend/optimizer/plan/subselect.c @@ -1516,9 +1516,8 @@ simplify_ALL_query(PlannerInfo *root, Query *query) } /* - * if whereclause contains 'not' boolexpr or not equal opexpr, - * return true. - */ + * If where clause contains 'not' BoolExpr or not-equal OpExpr, return true. 
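Leader election for parallel DDL is deliberately trivial: every coordinator independently derives the same leader by taking the lexicographically smallest coordinator name, and a node knows it is the leader when that name matches its own PGXCNodeName; the single-node fix above simply makes the lookup NULL-safe. A self-contained sketch of the same rule over a plain string array (choose_leader and local_name are hypothetical names, and strdup stands in for pstrdup):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Pick the lexicographically smallest name; NULL when the list is empty. */
static char *choose_leader(const char **names, int n)
{
    const char *best = NULL;
    int i;

    for (i = 0; i < n; i++)
        if (best == NULL || strcmp(names[i], best) < 0)
            best = names[i];

    return best ? strdup(best) : NULL;   /* the patch uses pstrdup() */
}

int main(void)
{
    const char *coords[] = { "coord2", "coord1", "coord3" };
    const char *local_name = "coord1";   /* stands in for PGXCNodeName */
    char *leader = choose_leader(coords, 3);

    /* NULL-safe leader test, mirroring is_ddl_leader_cn() after the fix. */
    int am_leader = (leader != NULL && strcmp(leader, local_name) == 0);

    printf("leader=%s, am_leader=%d\n", leader ? leader : "(none)", am_leader);
    free(leader);
    return 0;
}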
+ */ static bool contain_notexpr_or_neopexpr(Node *whereclause, bool check_or, List **joinquals) {// #lizard forgives @@ -1536,7 +1535,7 @@ contain_notexpr_or_neopexpr(Node *whereclause, bool check_or, List **joinquals) if(!check_or) return true; - /* look for common expr */ + /* Look for common EXPR */ foreach(cell, expr->args) { List *cur = NIL; @@ -1561,11 +1560,11 @@ contain_notexpr_or_neopexpr(Node *whereclause, bool check_or, List **joinquals) return false; } - /* and expr */ - foreach(cell, expr->args) - { - bool result; - Node *arg = lfirst(cell); + /* AND EXPR */ + foreach(cell, expr->args) + { + bool result; + Node *arg = lfirst(cell); result = contain_notexpr_or_neopexpr(arg, check_or, joinquals); @@ -1588,16 +1587,17 @@ contain_notexpr_or_neopexpr(Node *whereclause, bool check_or, List **joinquals) *joinquals = lappend(*joinquals, expr); - + /* Make sure the operator is hashjoinable */ if (!op_hashjoinable(expr->opno, exprType((Node *)lexpr))) { return true; - } - - foreach(cell, expr->args) - { - bool result; - Node *arg = lfirst(cell); + } + + /* Check the operands of the OpExpr */ + foreach(cell, expr->args) + { + bool result; + Node *arg = lfirst(cell); result = contain_notexpr_or_neopexpr(arg, check_or, joinquals); @@ -1616,11 +1616,69 @@ contain_notexpr_or_neopexpr(Node *whereclause, bool check_or, List **joinquals) bool result; RelabelType *label = (RelabelType *)whereclause; - result = contain_notexpr_or_neopexpr((Node *)label->arg, check_or, joinquals); - if (result) - return true; - return false; - } + result = contain_notexpr_or_neopexpr((Node *)label->arg, + check_or, + joinquals); + if (result) + return true; + return false; + } + /* In case the where clause is "tbl.col_a IN ('0','1')" */ + else if (IsA(whereclause, ScalarArrayOpExpr)) + { + ListCell *lc = NULL; + ScalarArrayOpExpr *scalarArray = (ScalarArrayOpExpr*)whereclause; + Expr *lexpr = linitial(scalarArray->args); + + if (!op_hashjoinable(scalarArray->opno, exprType((Node *)lexpr))) + { + return true; + } + + foreach(lc, scalarArray->args) + { + if (contain_notexpr_or_neopexpr((Node *)lfirst(lc), + check_or, + joinquals)) + { + return true; + } + } + + return false; + } + /* + * The right operand of ScalarArrayOpExpr, we only support array of + * constant values + */ + else if (IsA(whereclause, ArrayExpr)) + { + ListCell *lc = NULL; + ArrayExpr *arrayExpr = (ArrayExpr*)whereclause; + + foreach(lc, arrayExpr->elements) + { + if (!IsA((Node *)lfirst(lc), Const)) + { + return true; + } + } + + return false; + } + /* In case the where clause is "tbl.col_a is(is not) NULL" */ + else if (IsA(whereclause, NullTest)) + { + NullTest *nullTestExpr = (NullTest *)whereclause; + + if (contain_notexpr_or_neopexpr((Node *)nullTestExpr->arg, + check_or, + joinquals)) + { + return true; + } + return false; + } return true; } @@ -2785,15 +2843,15 @@ get_or_exist_subquery_targetlist(PlannerInfo *root, Node *node, List **targetLis TargetEntry * convert_TargetList_sublink_to_join(PlannerInfo *root, TargetEntry *entry) { - Query *parse = root->parse; - Node *whereClause = NULL; - Query *subselect = NULL; - JoinExpr *joinExpr = NULL; - ParseState *pstate = NULL; - SubLink *sublink = NULL; - RangeTblRef *rtr = NULL; - RangeTblEntry *rte = NULL; - Var *var = NULL; + Query *parse = root->parse; + Node *whereClause = NULL; + Query *subselect = NULL; + JoinExpr *joinExpr = NULL; + ParseState *pstate = NULL; + SubLink *sublink = NULL; + RangeTblRef *rtr = NULL; + RangeTblEntry *rte = NULL; + Var *var = NULL; List *sublinks = NIL; /* 
Find sublinks in the targetlist entry */ @@ -2805,195 +2863,195 @@ convert_TargetList_sublink_to_join(PlannerInfo *root, TargetEntry *entry) sublink = linitial(sublinks); - if (sublink->subLinkType != EXPR_SUBLINK) - return NULL; - - /* - * Copy object so that we can modify it. - */ - subselect = copyObject((Query *) sublink->subselect); - whereClause = subselect->jointree->quals; + if (sublink->subLinkType != EXPR_SUBLINK) + return NULL; /* - * Only one targetEntry can be handled. - */ - if (list_length(subselect->targetList) > 1) - return NULL; + * Copy object so that we can modify it. + */ + subselect = copyObject((Query *) sublink->subselect); + whereClause = subselect->jointree->quals; - /* - * The SubQuery must have a non-empty JoinTree, else we won't have a join. - */ - if (subselect->jointree->fromlist == NIL) - return NULL; + /* + * Only one targetEntry can be handled. + */ + if (list_length(subselect->targetList) > 1) + return NULL; - /* - * What we can not optimize. - */ - if (subselect->commandType != CMD_SELECT || subselect->hasDistinctOn || - subselect->setOperations || subselect->groupingSets || - subselect->groupClause || subselect->hasWindowFuncs || - subselect->hasTargetSRFs || subselect->hasModifyingCTE || - subselect->havingQual || subselect->limitOffset || - subselect->limitCount || subselect->rowMarks || - subselect->cteList || subselect->sortClause) - { - return NULL; - } + /* + * The SubQuery must have a non-empty JoinTree, else we won't have a join. + */ + if (subselect->jointree->fromlist == NIL) + return NULL; - /* - * On one hand, the WHERE clause must contain some Vars of the - * parent query, else it's not gonna be a join. - */ - if (!contain_vars_of_level(whereClause, 1)) - return NULL; + /* + * What we can not optimize. + */ + if (subselect->commandType != CMD_SELECT || subselect->hasDistinctOn || + subselect->setOperations || subselect->groupingSets || + subselect->groupClause || subselect->hasWindowFuncs || + subselect->hasTargetSRFs || subselect->hasModifyingCTE || + subselect->havingQual || subselect->limitOffset || + subselect->limitCount || subselect->rowMarks || + subselect->cteList || subselect->sortClause) + { + return NULL; + } - /* - * We don't risk optimizing if the WHERE clause is volatile, either. - */ - if (contain_volatile_functions(whereClause)) - return NULL; + /* + * On one hand, the WHERE clause must contain some Vars of the + * parent query, else it's not gonna be a join. + */ + if (!contain_vars_of_level(whereClause, 1)) + return NULL; - /* - * The rest of the sub-select must not refer to any Vars of the parent - * query. (Vars of higher levels should be okay, though.) - */ - subselect->jointree->quals = NULL; - if (contain_vars_of_level((Node *) subselect, 1)) - return NULL; - subselect->jointree->quals = whereClause; + /* + * We don't risk optimizing if the WHERE clause is volatile, either. + */ + if (contain_volatile_functions(whereClause)) + return NULL; - if (subselect->hasAggs) - { - List *joinquals = NULL; - List *vars = NULL; - TargetEntry *ent = NULL; - ListCell *cell = NULL; - int ressortgroupref = 0; - int varno = 0; + /* + * The rest of the sub-select must not refer to any Vars of the parent + * query. (Vars of higher levels should be okay, though.) 
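The eligibility test extended in this patch amounts to a recursive walk that accepts only AND/OR trees of hash-joinable equality quals, IN lists whose elements are all constants (a ScalarArrayOpExpr over an ArrayExpr of Const nodes), and IS [NOT] NULL tests; anything else keeps the sublink from being pulled up. The sketch below models that walk over a drastically simplified node type rather than the real PostgreSQL Expr nodes, and inverts the sense of the real helper (which returns true when it finds something disqualifying):

#include <stdbool.h>
#include <stdio.h>

/* Drastically simplified expression nodes; the real code walks Expr trees. */
typedef enum
{
    N_AND, N_OR,            /* BoolExpr                                             */
    N_EQ_OP,                /* hash-joinable equality OpExpr, e.g. b.a = a.a        */
    N_NE_OP,                /* not-equal OpExpr, disqualifying                      */
    N_IN_CONST_LIST,        /* ScalarArrayOpExpr over constants, e.g. a.b IN (1,2)  */
    N_NULLTEST              /* a.b IS [NOT] NULL                                    */
} NodeKind;

typedef struct Node
{
    NodeKind     kind;
    struct Node *left;      /* children, used by N_AND / N_OR */
    struct Node *right;
} Node;

/* True when the WHERE clause is safe to turn into join/group-by quals. */
static bool clause_is_pullupable(const Node *n)
{
    if (n == NULL)
        return true;

    switch (n->kind)
    {
        case N_AND:
        case N_OR:
            return clause_is_pullupable(n->left) &&
                   clause_is_pullupable(n->right);
        case N_EQ_OP:
        case N_IN_CONST_LIST:
        case N_NULLTEST:
            return true;
        default:            /* <>, NOT, volatile funcs, non-constant IN lists, ... */
            return false;
    }
}

int main(void)
{
    Node eq   = { N_EQ_OP, NULL, NULL };
    Node in   = { N_IN_CONST_LIST, NULL, NULL };
    Node ne   = { N_NE_OP, NULL, NULL };
    Node good = { N_AND, &eq, &in };
    Node bad  = { N_AND, &eq, &ne };

    printf("b.a = a.a AND a.b IN (1,2): %d\n", clause_is_pullupable(&good));  /* 1 */
    printf("b.a = a.a AND b.b <> 1:     %d\n", clause_is_pullupable(&bad));   /* 0 */
    return 0;
}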
+ */ + subselect->jointree->quals = NULL; + if (contain_vars_of_level((Node *) subselect, 1)) + return NULL; + subselect->jointree->quals = whereClause; - /* process 'op' and 'bool' expr only */ - if (contain_notexpr_or_neopexpr(whereClause, true, &joinquals)) - return NULL; + if (subselect->hasAggs) + { + List *joinquals = NULL; + List *vars = NULL; + TargetEntry *ent = NULL; + ListCell *cell = NULL; + int ressortgroupref = 0; + int varno = 0; + + /* process 'op' and 'bool' expr only */ + if (contain_notexpr_or_neopexpr(whereClause, true, &joinquals)) + return NULL; - vars = pull_vars_of_level((Node *) joinquals, 0); + vars = pull_vars_of_level((Node *) joinquals, 0); - /* construct groupby clause */ + /* construct groupby clause */ foreach (cell, vars) { - Oid sortop; - Oid eqop; - bool hashable; - Oid restype; - SortGroupClause *grpcl; - Var *var = (Var *) lfirst(cell); - RangeTblEntry *tbl = (RangeTblEntry *) list_nth(subselect->rtable, var->varno - 1); - - if (tbl->rtekind != RTE_RELATION && tbl->rtekind != RTE_CTE) - return NULL; - - restype = exprType((Node *) var); - - grpcl = makeNode(SortGroupClause); - ressortgroupref++; - - if (tbl->rtekind == RTE_RELATION) - { - ent = makeTargetEntry((Expr *) copyObject(var), var->varoattno, - get_relid_attribute_name(tbl->relid, var->varoattno), false); - } - else - { - int plan_id; - int ndx; - ListCell *lc; - Plan *cte_plan; - TargetEntry *cte_ent = NULL; - - /* - * Note: cte_plan_ids can be shorter than cteList, if we are still working - * on planning the CTEs (ie, this is a side-reference from another CTE). - * So we mustn't use forboth here. - */ - ndx = 0; - foreach (lc, root->parse->cteList) - { - CommonTableExpr *cte = (CommonTableExpr *) lfirst(lc); - - if (strcmp(cte->ctename, tbl->ctename) == 0) - break; - ndx++; - } - if (lc == NULL) /* shouldn't happen */ - elog(ERROR, "could not find CTE \"%s\"", tbl->ctename); - if (ndx >= list_length(root->cte_plan_ids)) - elog(ERROR, "could not find plan for CTE \"%s\"", tbl->ctename); - plan_id = list_nth_int(root->cte_plan_ids, ndx); - cte_plan = (Plan *) lfirst(list_nth_cell(root->glob->subplans, plan_id - 1)); - cte_ent = (TargetEntry *) lfirst(list_nth_cell(cte_plan->targetlist, var->varattno - 1)); - ent = makeTargetEntry((Expr *) copyObject(var), var->varoattno, cte_ent->resname, false); - } - - ent->ressortgroupref = ressortgroupref; - - subselect->targetList = lappend(subselect->targetList, ent); - - varno = list_length(subselect->targetList); - ent->resno = varno; + Oid sortop; + Oid eqop; + bool hashable; + Oid restype; + SortGroupClause *grpcl; + Var *var = (Var *) lfirst(cell); + RangeTblEntry *tbl = (RangeTblEntry *) list_nth(subselect->rtable, var->varno - 1); + + if (tbl->rtekind != RTE_RELATION && tbl->rtekind != RTE_CTE) + return NULL; + + restype = exprType((Node *) var); + + grpcl = makeNode(SortGroupClause); + ressortgroupref++; + + if (tbl->rtekind == RTE_RELATION) + { + ent = makeTargetEntry((Expr *) copyObject(var), var->varoattno, + get_relid_attribute_name(tbl->relid, var->varoattno), false); + } + else + { + int plan_id; + int ndx; + ListCell *lc; + Plan *cte_plan; + TargetEntry *cte_ent = NULL; + + /* + * Note: cte_plan_ids can be shorter than cteList, if we are still working + * on planning the CTEs (ie, this is a side-reference from another CTE). + * So we mustn't use forboth here. 
+ */ + ndx = 0; + foreach (lc, root->parse->cteList) + { + CommonTableExpr *cte = (CommonTableExpr *) lfirst(lc); + + if (strcmp(cte->ctename, tbl->ctename) == 0) + break; + ndx++; + } + if (lc == NULL) /* shouldn't happen */ + elog(ERROR, "could not find CTE \"%s\"", tbl->ctename); + if (ndx >= list_length(root->cte_plan_ids)) + elog(ERROR, "could not find plan for CTE \"%s\"", tbl->ctename); + plan_id = list_nth_int(root->cte_plan_ids, ndx); + cte_plan = (Plan *) lfirst(list_nth_cell(root->glob->subplans, plan_id - 1)); + cte_ent = (TargetEntry *) lfirst(list_nth_cell(cte_plan->targetlist, var->varattno - 1)); + ent = makeTargetEntry((Expr *) copyObject(var), var->varoattno, cte_ent->resname, false); + } + + ent->ressortgroupref = ressortgroupref; + + subselect->targetList = lappend(subselect->targetList, ent); + + varno = list_length(subselect->targetList); + ent->resno = varno; + + /* determine the eqop and optional sortop */ + get_sort_group_operators(restype, + false, true, false, + &sortop, &eqop, NULL, + &hashable); + + grpcl->tleSortGroupRef = ressortgroupref; + grpcl->eqop = eqop; + grpcl->sortop = sortop; + grpcl->nulls_first = false; /* OK with or without sortop */ + grpcl->hashable = hashable; + + subselect->groupClause = lappend(subselect->groupClause, grpcl); + } + } - /* determine the eqop and optional sortop */ - get_sort_group_operators(restype, - false, true, false, - &sortop, &eqop, NULL, - &hashable); + /* + * Move sub-select to the parent query. + */ + pstate = make_parsestate(NULL); + rte = addRangeTableEntryForSubquery(pstate, + subselect, + makeAlias("TARGETLIST_subquery", NIL), + true, + false); + parse->rtable = lappend(parse->rtable, rte); - grpcl->tleSortGroupRef = ressortgroupref; - grpcl->eqop = eqop; - grpcl->sortop = sortop; - grpcl->nulls_first = false; /* OK with or without sortop */ - grpcl->hashable = hashable; + rtr = makeNode(RangeTblRef); + rtr->rtindex = list_length(parse->rtable); - subselect->groupClause = lappend(subselect->groupClause, grpcl); - } - } + /* + * Form join node. + */ + joinExpr = makeNode(JoinExpr); + joinExpr->jointype = subselect->hasAggs? JOIN_LEFT : JOIN_LEFT_SCALAR; + joinExpr->isNatural = false; + joinExpr->larg = (Node *) root->parse->jointree; + joinExpr->rarg = (Node *) rtr; + joinExpr->usingClause = NIL; + joinExpr->alias = NULL; + joinExpr->rtindex = 0; /* we don't need an RTE for it */ + joinExpr->quals = NULL; - /* - * Move sub-select to the parent query. - */ - pstate = make_parsestate(NULL); - rte = addRangeTableEntryForSubquery(pstate, - subselect, - makeAlias("TARGETLIST_subquery", NIL), - true, - false); - parse->rtable = lappend(parse->rtable, rte); + /* Wrap join node in FromExpr as required. */ + parse->jointree = makeFromExpr(list_make1(joinExpr), NULL); - rtr = makeNode(RangeTblRef); - rtr->rtindex = list_length(parse->rtable); + /* Build a Var pointing to the subquery */ + var = makeVarFromTargetEntry(rtr->rtindex, linitial(subselect->targetList)); - /* - * Form join node. - */ - joinExpr = makeNode(JoinExpr); - joinExpr->jointype = subselect->hasAggs? JOIN_LEFT : JOIN_LEFT_SCALAR; - joinExpr->isNatural = false; - joinExpr->larg = (Node *) root->parse->jointree; - joinExpr->rarg = (Node *) rtr; - joinExpr->usingClause = NIL; - joinExpr->alias = NULL; - joinExpr->rtindex = 0; /* we don't need an RTE for it */ - joinExpr->quals = NULL; - - /* Wrap join node in FromExpr as required. 
*/ - parse->jointree = makeFromExpr(list_make1(joinExpr), NULL); - - /* Build a Var pointing to the subquery */ - var = makeVarFromTargetEntry(rtr->rtindex, linitial(subselect->targetList)); - - /* Replace sublink node with Var. */ - entry->expr = (Expr *)substitute_sublink_with_node((Node *)entry->expr, - sublink, + /* Replace sublink node with Var. */ + entry->expr = (Expr *)substitute_sublink_with_node((Node *)entry->expr, + sublink, (Node *)var); - return entry; + return entry; } #endif diff --git a/src/test/regress/expected/subselect.out b/src/test/regress/expected/subselect.out index 79708c41..432740af 100644 --- a/src/test/regress/expected/subselect.out +++ b/src/test/regress/expected/subselect.out @@ -1895,6 +1895,40 @@ select (case when a.b =1 then (select b.a from tbl_b b where b.a = a.a and b.b = (10 rows) +explain (costs off) select (case when a.b =1 then (select count(*) from tbl_b b where b.a = a.a and b.b = a.b and a.b in (1,2)) else 0 end) from tbl_a a order by 1; + QUERY PLAN +----------------------------------------------------------------------------------------- + Sort + Sort Key: (CASE WHEN (a.b = 1) THEN "TARGETLIST_subquery".count ELSE '0'::bigint END) + -> Nested Loop Left Join + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on tbl_a a + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Subquery Scan on "TARGETLIST_subquery" + -> GroupAggregate + Group Key: b.a, b.b + -> Result + One-Time Filter: (a.b = ANY ('{1,2}'::integer[])) + -> Seq Scan on tbl_b b + Filter: ((a = a.a) AND (b = a.b)) +(14 rows) + +select (case when a.b =1 then (select count(*) from tbl_b b where b.a = a.a and b.b = a.b and a.b in (1,2)) else 0 end) from tbl_a a order by 1; + case +------ + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + +(10 rows) + drop table tbl_a; drop table tbl_b; set enable_pullup_subquery to false; diff --git a/src/test/regress/expected/subselect_1.out b/src/test/regress/expected/subselect_1.out index e8cd553a..79ebb522 100644 --- a/src/test/regress/expected/subselect_1.out +++ b/src/test/regress/expected/subselect_1.out @@ -1167,3 +1167,775 @@ NOTICE: x = 9, y = 13 (3 rows) drop function tattle(x int, y int); +-- +-- Tests for pulling up more sublinks +-- +set enable_pullup_subquery to true; +create table tbl_a(a int,b int); +create table tbl_b(a int,b int); +insert into tbl_a select generate_series(1,10),1 ; +insert into tbl_b select generate_series(2,11),1 ; +-- check targetlist subquery scenario. 
+set enable_nestloop to true; +set enable_hashjoin to false; +set enable_mergejoin to false; +explain select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; + QUERY PLAN +------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) (cost=15636.19..15637.88 rows=675 width=8) + -> Sort (cost=15636.19..15637.88 rows=675 width=8) + Sort Key: a.a, b.a + -> Nested Loop Left Scalar Join (cost=0.00..15604.47 rows=675 width=8) + Join Filter: (b.a = a.a) + -> Seq Scan on tbl_a a (cost=0.00..23.50 rows=1350 width=4) + -> Materialize (cost=0.00..30.25 rows=1350 width=4) + -> Seq Scan on tbl_b b (cost=0.00..23.50 rows=1350 width=4) +(8 rows) + +select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; + a | q +----+---- + 1 | + 2 | 2 + 3 | 3 + 4 | 4 + 5 | 5 + 6 | 6 + 7 | 7 + 8 | 8 + 9 | 9 + 10 | 10 +(10 rows) + +set enable_nestloop to false; +set enable_hashjoin to true; +set enable_mergejoin to false; +explain (costs off) select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: a.a, b.a + -> Hash Left Scalar Join + Hash Cond: (a.a = b.a) + -> Seq Scan on tbl_a a + -> Hash + -> Seq Scan on tbl_b b +(8 rows) + +select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; + a | q +----+---- + 1 | + 2 | 2 + 3 | 3 + 4 | 4 + 5 | 5 + 6 | 6 + 7 | 7 + 8 | 8 + 9 | 9 + 10 | 10 +(10 rows) + +set enable_nestloop to false; +set enable_hashjoin to false; +set enable_mergejoin to true; +explain (costs off) select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: a.a, b.a + -> Merge Left Scalar Join + Merge Cond: (a.a = b.a) + -> Sort + Sort Key: a.a + -> Seq Scan on tbl_a a + -> Sort + Sort Key: b.a + -> Seq Scan on tbl_b b +(11 rows) + +select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; + a | q +----+---- + 1 | + 2 | 2 + 3 | 3 + 4 | 4 + 5 | 5 + 6 | 6 + 7 | 7 + 8 | 8 + 9 | 9 + 10 | 10 +(10 rows) + +-- check non-scalar scenario. 
+insert into tbl_b values(2,2); +set enable_nestloop to true; +set enable_hashjoin to false; +set enable_mergejoin to false; +explain (costs off) select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: a.a, b.a + -> Nested Loop Left Scalar Join + Join Filter: (b.a = a.a) + -> Seq Scan on tbl_a a + -> Materialize + -> Seq Scan on tbl_b b +(8 rows) + +select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; +ERROR: more than one row returned by a subquery used as an expression +set enable_nestloop to false; +set enable_hashjoin to true; +set enable_mergejoin to false; +explain (costs off) select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: a.a, b.a + -> Hash Left Scalar Join + Hash Cond: (a.a = b.a) + -> Seq Scan on tbl_a a + -> Hash + -> Seq Scan on tbl_b b +(8 rows) + +select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; +ERROR: more than one row returned by a subquery used as an expression +set enable_nestloop to false; +set enable_hashjoin to false; +set enable_mergejoin to true; +explain (costs off) select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: a.a, b.a + -> Merge Left Scalar Join + Merge Cond: (a.a = b.a) + -> Sort + Sort Key: a.a + -> Seq Scan on tbl_a a + -> Sort + Sort Key: b.a + -> Seq Scan on tbl_b b +(11 rows) + +select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; +ERROR: more than one row returned by a subquery used as an expression +explain (costs off) select a.a,(select b.a from tbl_b b where b.a = a.a and b.a = 5) q from tbl_a a order by 1,2; + QUERY PLAN +--------------------------------------------- + Remote Subquery Scan on all (datanode_1) + -> Sort + Sort Key: a.a, b.a + -> Merge Left Scalar Join + Merge Cond: (a.a = b.a) + -> Sort + Sort Key: a.a + -> Seq Scan on tbl_a a + -> Sort + Sort Key: b.a + -> Seq Scan on tbl_b b + Filter: (a = 5) +(12 rows) + +select a.a,(select b.a from tbl_b b where b.a = a.a and b.a = 5) q from tbl_a a order by 1,2; + a | q +---+--- + 1 | + 2 | + 5 | 5 + 6 | + 8 | + 9 | +(6 rows) + +-- check distinct scenario. 
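The "Left Scalar Join" plans above preserve scalar-subquery semantics after pull-up: when the inner side yields a second match for the same outer row, execution must raise the usual "more than one row returned by a subquery used as an expression" error instead of emitting extra rows, while unmatched outer rows still produce NULL. A toy nested-loop version of that check, with plain arrays standing in for plan nodes:

#include <stdio.h>

/* Toy tables: outer a(a) and inner b(a); the inner side duplicates key 2. */
static const int outer_rows[] = { 1, 2, 3 };
static const int inner_rows[] = { 2, 2, 3 };

int main(void)
{
    size_t i, j;

    for (i = 0; i < sizeof(outer_rows) / sizeof(outer_rows[0]); i++)
    {
        int matches = 0;
        int value = 0;

        for (j = 0; j < sizeof(inner_rows) / sizeof(inner_rows[0]); j++)
        {
            if (inner_rows[j] == outer_rows[i])
            {
                if (++matches > 1)
                {
                    /* Same error the regression test expects in the non-scalar case. */
                    fprintf(stderr, "more than one row returned by a subquery used as an expression\n");
                    return 1;
                }
                value = inner_rows[j];
            }
        }

        /* Left-join semantics: unmatched outer rows still come out, with NULL. */
        if (matches == 0)
            printf("%d | (null)\n", outer_rows[i]);
        else
            printf("%d | %d\n", outer_rows[i], value);
    }
    return 0;
}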
+set enable_nestloop to true; +set enable_hashjoin to false; +set enable_mergejoin to false; +explain (costs off) select a.a,(select distinct b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; + QUERY PLAN +----------------------------------------------------------------------- + Sort + Sort Key: a.a, a + -> Nested Loop Left Scalar Join + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on tbl_a a + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Unique + -> Seq Scan on tbl_b b + Filter: (a = a.a) +(10 rows) + +select a.a,(select distinct b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; + a | q +----+---- + 1 | + 2 | 2 + 3 | 3 + 4 | 4 + 5 | 5 + 6 | 6 + 7 | 7 + 8 | 8 + 9 | 9 + 10 | 10 +(10 rows) + +set enable_nestloop to false; +set enable_hashjoin to true; +set enable_mergejoin to false; +explain (costs off) select a.a,(select distinct b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; + QUERY PLAN +----------------------------------------------------------------------- + Sort + Sort Key: a.a, a + -> Nested Loop Left Scalar Join + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on tbl_a a + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Unique + -> Seq Scan on tbl_b b + Filter: (a = a.a) +(10 rows) + +select a.a,(select distinct b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; + a | q +----+---- + 1 | + 2 | 2 + 3 | 3 + 4 | 4 + 5 | 5 + 6 | 6 + 7 | 7 + 8 | 8 + 9 | 9 + 10 | 10 +(10 rows) + +set enable_nestloop to false; +set enable_hashjoin to false; +set enable_mergejoin to true; +explain (costs off) select a.a,(select distinct b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; + QUERY PLAN +----------------------------------------------------------------------- + Sort + Sort Key: a.a, a + -> Nested Loop Left Scalar Join + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on tbl_a a + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Unique + -> Seq Scan on tbl_b b + Filter: (a = a.a) +(10 rows) + +select a.a,(select distinct b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; + a | q +----+---- + 1 | + 2 | 2 + 3 | 3 + 4 | 4 + 5 | 5 + 6 | 6 + 7 | 7 + 8 | 8 + 9 | 9 + 10 | 10 +(10 rows) + +set enable_nestloop to true; +set enable_hashjoin to true; +set enable_mergejoin to true; +-- targetlist sublink with agg +explain (costs off) select (select sum(b.a) from tbl_b b where b.a = a.a and b.b = a.b) from tbl_a a order by 1; + QUERY PLAN +------------------------------------------------------------------------- + Sort + Sort Key: "TARGETLIST_subquery".sum + -> Nested Loop Left Join + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on tbl_a a + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Subquery Scan on "TARGETLIST_subquery" + -> GroupAggregate + Group Key: b.a, b.b + -> Seq Scan on tbl_b b + Filter: ((a = a.a) AND (b = a.b)) +(12 rows) + +select (select sum(b.a) from tbl_b b where b.a = a.a and b.b = a.b) from tbl_a a order by 1; + sum +----- + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 + 10 + +(10 rows) + +explain (costs off) select (select count(b.a) from tbl_b b where b.a = a.a) from tbl_a a order by 1; + QUERY PLAN +----------------------------------------------------------------------- + Sort + Sort Key: "TARGETLIST_subquery".count + -> Nested Loop Left Join + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on 
tbl_a a + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Subquery Scan on "TARGETLIST_subquery" + -> GroupAggregate + Group Key: b.a + -> Seq Scan on tbl_b b + Filter: (a = a.a) +(12 rows) + +select (select count(b.a) from tbl_b b where b.a = a.a ) from tbl_a a order by 1; + count +------- + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 2 + +(10 rows) + +explain (costs off) select (select sum(b.a) from tbl_b b where b.a = a.a and b.b = a.b or b.a = 1) from tbl_a a order by 1; + QUERY PLAN +---------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: ((SubPlan 1)) + -> Seq Scan on tbl_a a + SubPlan 1 + -> Finalize Aggregate + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Partial Aggregate + -> Seq Scan on tbl_b b + Filter: (((a = a.a) AND (b = a.b)) OR (a = 1)) +(10 rows) + +select (select sum(b.a) from tbl_b b where b.a = a.a and b.b = a.b or b.a = 1) from tbl_a a order by 1; + sum +----- + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 + 10 + +(10 rows) + +-- targetlist sublink wrapped in expr +explain (costs off) select (case when a.b =1 then (select sum(b.a) from tbl_b b where b.a = a.a and b.b = a.b) else 0 end) from tbl_a a order by 1; + QUERY PLAN +--------------------------------------------------------------------------------------- + Sort + Sort Key: (CASE WHEN (a.b = 1) THEN "TARGETLIST_subquery".sum ELSE '0'::bigint END) + -> Nested Loop Left Join + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on tbl_a a + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Subquery Scan on "TARGETLIST_subquery" + -> GroupAggregate + Group Key: b.a, b.b + -> Seq Scan on tbl_b b + Filter: ((a = a.a) AND (b = a.b)) +(12 rows) + +select (case when a.b =1 then (select sum(b.a) from tbl_b b where b.a = a.a and b.b = a.b) else 0 end) from tbl_a a order by 1; + case +------ + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 + 10 + +(10 rows) + +explain (costs off) select (case when a.b =1 then (select b.a from tbl_b b where b.a = a.a and b.b = a.b) else 0 end) from tbl_a a order by 1; + QUERY PLAN +------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: (CASE WHEN (a.b = 1) THEN b.a ELSE 0 END) + -> Hash Left Scalar Join + Hash Cond: ((a.a = b.a) AND (a.b = b.b)) + -> Seq Scan on tbl_a a + -> Hash + -> Seq Scan on tbl_b b +(8 rows) + +select (case when a.b =1 then (select b.a from tbl_b b where b.a = a.a and b.b = a.b) else 0 end) from tbl_a a order by 1; + case +------ + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 + 10 + +(10 rows) + +explain (costs off) select (case when a.b =1 then (select count(*) from tbl_b b where b.a = a.a and b.b = a.b and a.b in (1,2)) else 0 end) from tbl_a a order by 1; + QUERY PLAN +----------------------------------------------------------------------------------------- + Sort + Sort Key: (CASE WHEN (a.b = 1) THEN "TARGETLIST_subquery".count ELSE '0'::bigint END) + -> Nested Loop Left Join + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on tbl_a a + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Subquery Scan on "TARGETLIST_subquery" + -> GroupAggregate + Group Key: b.a, b.b + -> Result + One-Time Filter: (a.b = ANY ('{1,2}'::integer[])) + -> Seq Scan on tbl_b b + Filter: ((a = a.a) AND (b = a.b)) +(14 rows) + +select (case when a.b =1 then (select count(*) from tbl_b b where b.a = a.a and b.b = a.b and a.b in 
(1,2)) else 0 end) from tbl_a a order by 1; + case +------ + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + +(10 rows) + +drop table tbl_a; +drop table tbl_b; +set enable_pullup_subquery to false; +-- +-- Tests for CTE inlining behavior +-- +-- Basic subquery that can be inlined +explain (verbose, costs off) +with x as (select * from (select f1 from subselect_tbl) ss) +select * from x where f1 = 1; + QUERY PLAN +------------------------------------------ + Remote Subquery Scan on all (datanode_1) + Output: subselect_tbl.f1 + -> Seq Scan on public.subselect_tbl + Output: subselect_tbl.f1 + Filter: (subselect_tbl.f1 = 1) +(5 rows) + +-- Explicitly request materialization +explain (verbose, costs off) +with x as materialized (select * from (select f1 from subselect_tbl) ss) +select * from x where f1 = 1; + QUERY PLAN +------------------------------------------------------------- + CTE Scan on x + Output: x.f1 + Filter: (x.f1 = 1) + CTE x + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: subselect_tbl.f1 + -> Seq Scan on public.subselect_tbl + Output: subselect_tbl.f1 +(8 rows) + +-- Stable functions are safe to inline +explain (verbose, costs off) +with x as (select * from (select f1, now() from subselect_tbl) ss) +select * from x where f1 = 1; + QUERY PLAN +------------------------------------------ + Remote Subquery Scan on all (datanode_1) + Output: subselect_tbl.f1, now() + -> Seq Scan on public.subselect_tbl + Output: subselect_tbl.f1, now() + Filter: (subselect_tbl.f1 = 1) +(5 rows) + +-- Volatile functions prevent inlining +explain (verbose, costs off) +with x as (select * from (select f1, random() from subselect_tbl) ss) +select * from x where f1 = 1; + QUERY PLAN +------------------------------------------------------------- + CTE Scan on x + Output: x.f1, x.random + Filter: (x.f1 = 1) + CTE x + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: f1, random + -> Seq Scan on public.subselect_tbl + Output: subselect_tbl.f1, random() +(8 rows) + +-- SELECT FOR UPDATE cannot be inlined +explain (verbose, costs off) +with x as (select * from (select f1 from subselect_tbl for update) ss) +select * from x where f1 = 1; + QUERY PLAN +-------------------------------------------------------------------------- + CTE Scan on x + Output: x.f1 + Filter: (x.f1 = 1) + CTE x + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: ss.f1 + -> Subquery Scan on ss + Output: ss.f1 + -> LockRows + Output: subselect_tbl.f1, subselect_tbl.ctid + -> Seq Scan on public.subselect_tbl + Output: subselect_tbl.f1, subselect_tbl.ctid +(12 rows) + +-- Multiply-referenced CTEs are inlined only when requested +explain (verbose, costs off) +with x as (select * from (select f1, now() as n from subselect_tbl) ss) +select * from x, x x2 where x.n = x2.n; + QUERY PLAN +------------------------------------------------------------- + Merge Join + Output: x.f1, x.n, x2.f1, x2.n + Merge Cond: (x.n = x2.n) + CTE x + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: subselect_tbl.f1, now() + -> Seq Scan on public.subselect_tbl + Output: subselect_tbl.f1, now() + -> Sort + Output: x.f1, x.n + Sort Key: x.n + -> CTE Scan on x + Output: x.f1, x.n + -> Sort + Output: x2.f1, x2.n + Sort Key: x2.n + -> CTE Scan on x x2 + Output: x2.f1, x2.n +(18 rows) + +explain (verbose, costs off) +with x as not materialized (select * from (select f1, now() as n from subselect_tbl) ss) +select * from x, x x2 where x.n = x2.n; + QUERY PLAN 
+-------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + Output: subselect_tbl.f1, now(), subselect_tbl_1.f1, now() + -> Result + Output: subselect_tbl.f1, (now()), subselect_tbl_1.f1, (now()) + One-Time Filter: (now() = now()) + -> Nested Loop + Output: subselect_tbl.f1, (now()), subselect_tbl_1.f1, (now()) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: subselect_tbl.f1, now() + Distribute results by H: now() + -> Seq Scan on public.subselect_tbl + Output: subselect_tbl.f1, now() + -> Materialize + Output: subselect_tbl_1.f1, (now()) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: subselect_tbl_1.f1, now() + Distribute results by H: now() + -> Seq Scan on public.subselect_tbl subselect_tbl_1 + Output: subselect_tbl_1.f1, now() +(19 rows) + +-- Multiply-referenced CTEs can't be inlined if they contain outer self-refs +explain (verbose, costs off) +with recursive x(a) as + ((values ('a'), ('b')) + union all + (with z as not materialized (select * from x) + select z.a || z1.a as a from z cross join z as z1 + where length(z.a || z1.a) < 5)) +select * from x; + QUERY PLAN +---------------------------------------------------------- + CTE Scan on x + Output: x.a + CTE x + -> Recursive Union + -> Values Scan on "*VALUES*" + Output: "*VALUES*".column1 + -> Nested Loop + Output: (z.a || z1.a) + Join Filter: (length((z.a || z1.a)) < 5) + CTE z + -> WorkTable Scan on x x_1 + Output: x_1.a + -> CTE Scan on z + Output: z.a + -> Materialize + Output: z1.a + -> CTE Scan on z z1 + Output: z1.a +(18 rows) + +with recursive x(a) as + ((values ('a'), ('b')) + union all + (with z as not materialized (select * from x) + select z.a || z1.a as a from z cross join z as z1 + where length(z.a || z1.a) < 5)) +select * from x; + a +------ + a + b + aa + ab + ba + bb + aaaa + aaab + aaba + aabb + abaa + abab + abba + abbb + baaa + baab + baba + babb + bbaa + bbab + bbba + bbbb +(22 rows) + +explain (verbose, costs off) +with recursive x(a) as + ((values ('a'), ('b')) + union all + (with z as not materialized (select * from x) + select z.a || z.a as a from z + where length(z.a || z.a) < 5)) +select * from x; + QUERY PLAN +-------------------------------------------------------- + CTE Scan on x + Output: x.a + CTE x + -> Recursive Union + -> Values Scan on "*VALUES*" + Output: "*VALUES*".column1 + -> WorkTable Scan on x x_1 + Output: (x_1.a || x_1.a) + Filter: (length((x_1.a || x_1.a)) < 5) +(9 rows) + +with recursive x(a) as + ((values ('a'), ('b')) + union all + (with z as not materialized (select * from x) + select z.a || z.a as a from z + where length(z.a || z.a) < 5)) +select * from x; + a +------ + a + b + aa + bb + aaaa + bbbb +(6 rows) + +-- Check handling of outer references +explain (verbose, costs off) +with x as (select * from int4_tbl) +select * from (with y as (select * from x) select * from y) ss; + QUERY PLAN +------------------------------------------ + Remote Subquery Scan on all (datanode_1) + Output: f1 + -> Seq Scan on public.int4_tbl + Output: int4_tbl.f1 +(4 rows) + +explain (verbose, costs off) +with x as materialized (select * from int4_tbl) +select * from (with y as (select * from x) select * from y) ss; + QUERY PLAN +-------------------------------------------------- + CTE Scan on x + Output: x.f1 + CTE x + -> Remote Subquery Scan on all (datanode_1) + Output: int4_tbl.f1 + -> Seq Scan on public.int4_tbl + Output: int4_tbl.f1 +(7 rows) + +-- Ensure that we inline the currect 
CTE when there are +-- multiple CTEs with the same name +explain (verbose, costs off) +with x as (select 1 as y) +select * from (with x as (select 2 as y) select * from x) ss; + QUERY PLAN +------------- + Result + Output: 2 +(2 rows) + +-- Row marks are not pushed into CTEs +explain (verbose, costs off) +with x as (select * from subselect_tbl) +select * from x for update; + QUERY PLAN +---------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + Output: subselect_tbl.f1, subselect_tbl.f2, subselect_tbl.f3 + -> Seq Scan on public.subselect_tbl + Output: subselect_tbl.f1, subselect_tbl.f2, subselect_tbl.f3 +(4 rows) + diff --git a/src/test/regress/sql/subselect.sql b/src/test/regress/sql/subselect.sql index a7ba5190..82efdc9c 100644 --- a/src/test/regress/sql/subselect.sql +++ b/src/test/regress/sql/subselect.sql @@ -599,99 +599,6 @@ select * from drop function tattle(x int, y int); --- --- Tests for CTE inlining behavior --- - --- Basic subquery that can be inlined -explain (verbose, costs off) -with x as (select * from (select f1 from subselect_tbl) ss) -select * from x where f1 = 1; - --- Explicitly request materialization -explain (verbose, costs off) -with x as materialized (select * from (select f1 from subselect_tbl) ss) -select * from x where f1 = 1; - --- Stable functions are safe to inline -explain (verbose, costs off) -with x as (select * from (select f1, now() from subselect_tbl) ss) -select * from x where f1 = 1; - --- Volatile functions prevent inlining -explain (verbose, costs off) -with x as (select * from (select f1, random() from subselect_tbl) ss) -select * from x where f1 = 1; - --- SELECT FOR UPDATE cannot be inlined -explain (verbose, costs off) -with x as (select * from (select f1 from subselect_tbl for update) ss) -select * from x where f1 = 1; - --- Multiply-referenced CTEs are inlined only when requested -explain (verbose, costs off) -with x as (select * from (select f1, now() as n from subselect_tbl) ss) -select * from x, x x2 where x.n = x2.n; - -explain (verbose, costs off) -with x as not materialized (select * from (select f1, now() as n from subselect_tbl) ss) -select * from x, x x2 where x.n = x2.n; - --- Multiply-referenced CTEs can't be inlined if they contain outer self-refs -explain (verbose, costs off) -with recursive x(a) as - ((values ('a'), ('b')) - union all - (with z as not materialized (select * from x) - select z.a || z1.a as a from z cross join z as z1 - where length(z.a || z1.a) < 5)) -select * from x; - -with recursive x(a) as - ((values ('a'), ('b')) - union all - (with z as not materialized (select * from x) - select z.a || z1.a as a from z cross join z as z1 - where length(z.a || z1.a) < 5)) -select * from x; - -explain (verbose, costs off) -with recursive x(a) as - ((values ('a'), ('b')) - union all - (with z as not materialized (select * from x) - select z.a || z.a as a from z - where length(z.a || z.a) < 5)) -select * from x; - -with recursive x(a) as - ((values ('a'), ('b')) - union all - (with z as not materialized (select * from x) - select z.a || z.a as a from z - where length(z.a || z.a) < 5)) -select * from x; - --- Check handling of outer references -explain (verbose, costs off) -with x as (select * from int4_tbl) -select * from (with y as (select * from x) select * from y) ss; - -explain (verbose, costs off) -with x as materialized (select * from int4_tbl) -select * from (with y as (select * from x) select * from y) ss; - --- Ensure that we inline the currect CTE 
when there are --- multiple CTEs with the same name -explain (verbose, costs off) -with x as (select 1 as y) -select * from (with x as (select 2 as y) select * from x) ss; - --- Row marks are not pushed into CTEs -explain (verbose, costs off) -with x as (select * from subselect_tbl) -select * from x for update; - -- -- Tests for pulling up more sublinks -- @@ -781,7 +688,102 @@ explain (costs off) select (case when a.b =1 then (select sum(b.a) from tbl_b b select (case when a.b =1 then (select sum(b.a) from tbl_b b where b.a = a.a and b.b = a.b) else 0 end) from tbl_a a order by 1; explain (costs off) select (case when a.b =1 then (select b.a from tbl_b b where b.a = a.a and b.b = a.b) else 0 end) from tbl_a a order by 1; select (case when a.b =1 then (select b.a from tbl_b b where b.a = a.a and b.b = a.b) else 0 end) from tbl_a a order by 1; +explain (costs off) select (case when a.b =1 then (select count(*) from tbl_b b where b.a = a.a and b.b = a.b and a.b in (1,2)) else 0 end) from tbl_a a order by 1; +select (case when a.b =1 then (select count(*) from tbl_b b where b.a = a.a and b.b = a.b and a.b in (1,2)) else 0 end) from tbl_a a order by 1; drop table tbl_a; drop table tbl_b; set enable_pullup_subquery to false; + +-- +-- Tests for CTE inlining behavior +-- + +-- Basic subquery that can be inlined +explain (verbose, costs off) +with x as (select * from (select f1 from subselect_tbl) ss) +select * from x where f1 = 1; + +-- Explicitly request materialization +explain (verbose, costs off) +with x as materialized (select * from (select f1 from subselect_tbl) ss) +select * from x where f1 = 1; + +-- Stable functions are safe to inline +explain (verbose, costs off) +with x as (select * from (select f1, now() from subselect_tbl) ss) +select * from x where f1 = 1; + +-- Volatile functions prevent inlining +explain (verbose, costs off) +with x as (select * from (select f1, random() from subselect_tbl) ss) +select * from x where f1 = 1; + +-- SELECT FOR UPDATE cannot be inlined +explain (verbose, costs off) +with x as (select * from (select f1 from subselect_tbl for update) ss) +select * from x where f1 = 1; + +-- Multiply-referenced CTEs are inlined only when requested +explain (verbose, costs off) +with x as (select * from (select f1, now() as n from subselect_tbl) ss) +select * from x, x x2 where x.n = x2.n; + +explain (verbose, costs off) +with x as not materialized (select * from (select f1, now() as n from subselect_tbl) ss) +select * from x, x x2 where x.n = x2.n; + +-- Multiply-referenced CTEs can't be inlined if they contain outer self-refs +explain (verbose, costs off) +with recursive x(a) as + ((values ('a'), ('b')) + union all + (with z as not materialized (select * from x) + select z.a || z1.a as a from z cross join z as z1 + where length(z.a || z1.a) < 5)) +select * from x; + +with recursive x(a) as + ((values ('a'), ('b')) + union all + (with z as not materialized (select * from x) + select z.a || z1.a as a from z cross join z as z1 + where length(z.a || z1.a) < 5)) +select * from x; + +explain (verbose, costs off) +with recursive x(a) as + ((values ('a'), ('b')) + union all + (with z as not materialized (select * from x) + select z.a || z.a as a from z + where length(z.a || z.a) < 5)) +select * from x; + +with recursive x(a) as + ((values ('a'), ('b')) + union all + (with z as not materialized (select * from x) + select z.a || z.a as a from z + where length(z.a || z.a) < 5)) +select * from x; + +-- Check handling of outer references +explain (verbose, costs off) +with x as 
(select * from int4_tbl) +select * from (with y as (select * from x) select * from y) ss; + +explain (verbose, costs off) +with x as materialized (select * from int4_tbl) +select * from (with y as (select * from x) select * from y) ss; + +-- Ensure that we inline the currect CTE when there are +-- multiple CTEs with the same name +explain (verbose, costs off) +with x as (select 1 as y) +select * from (with x as (select 2 as y) select * from x) ss; + +-- Row marks are not pushed into CTEs +explain (verbose, costs off) +with x as (select * from subselect_tbl) +select * from x for update; From 7902557b5849c93352454a486fad80c42bcaadc2 Mon Sep 17 00:00:00 2001 From: ericxwu Date: Tue, 1 Sep 2020 17:00:20 +0800 Subject: [PATCH 040/578] Support pullup agg sublink with NullTest qual --- src/backend/optimizer/plan/subselect.c | 4 +-- src/test/regress/expected/subselect.out | 34 +++++++++++++++++++++++++ src/test/regress/sql/subselect.sql | 3 ++- 3 files changed, 38 insertions(+), 3 deletions(-) diff --git a/src/backend/optimizer/plan/subselect.c b/src/backend/optimizer/plan/subselect.c index 8bb67513..7d799073 100644 --- a/src/backend/optimizer/plan/subselect.c +++ b/src/backend/optimizer/plan/subselect.c @@ -1654,7 +1654,7 @@ contain_notexpr_or_neopexpr(Node *whereclause, bool check_or, List **joinquals) else if (IsA(whereclause, ArrayExpr)) { ListCell *lc = NULL; - ArrayExpr *arrayExpr = (ArrayExpr*)whereclause; + ArrayExpr *arrayExpr = (ArrayExpr *)whereclause; foreach(lc, arrayExpr->elements) { @@ -2866,7 +2866,7 @@ convert_TargetList_sublink_to_join(PlannerInfo *root, TargetEntry *entry) if (sublink->subLinkType != EXPR_SUBLINK) return NULL; - /* + /* * Copy object so that we can modify it. */ subselect = copyObject((Query *) sublink->subselect); diff --git a/src/test/regress/expected/subselect.out b/src/test/regress/expected/subselect.out index 432740af..e31f5f94 100644 --- a/src/test/regress/expected/subselect.out +++ b/src/test/regress/expected/subselect.out @@ -1929,6 +1929,40 @@ select (case when a.b =1 then (select count(*) from tbl_b b where b.a = a.a and (10 rows) +explain (costs off) select (case when a.b =1 then (select count(*) from tbl_b b where b.a = a.a and b.b = a.b and a.b is not null) else 0 end) from tbl_a a order by 1; + QUERY PLAN +----------------------------------------------------------------------------------------- + Sort + Sort Key: (CASE WHEN (a.b = 1) THEN "TARGETLIST_subquery".count ELSE '0'::bigint END) + -> Nested Loop Left Join + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on tbl_a a + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Subquery Scan on "TARGETLIST_subquery" + -> GroupAggregate + Group Key: b.a, b.b + -> Result + One-Time Filter: (a.b IS NOT NULL) + -> Seq Scan on tbl_b b + Filter: ((a = a.a) AND (b = a.b)) +(14 rows) + +select (case when a.b =1 then (select count(*) from tbl_b b where b.a = a.a and b.b = a.b and a.b is not null) else 0 end) from tbl_a a order by 1; + case +------ + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + +(10 rows) + drop table tbl_a; drop table tbl_b; set enable_pullup_subquery to false; diff --git a/src/test/regress/sql/subselect.sql b/src/test/regress/sql/subselect.sql index 82efdc9c..68fa867e 100644 --- a/src/test/regress/sql/subselect.sql +++ b/src/test/regress/sql/subselect.sql @@ -690,7 +690,8 @@ explain (costs off) select (case when a.b =1 then (select b.a from tbl_b b wher select (case when a.b =1 then (select b.a from tbl_b b where b.a = a.a and b.b = a.b) else 0 end) from 
tbl_a a order by 1; explain (costs off) select (case when a.b =1 then (select count(*) from tbl_b b where b.a = a.a and b.b = a.b and a.b in (1,2)) else 0 end) from tbl_a a order by 1; select (case when a.b =1 then (select count(*) from tbl_b b where b.a = a.a and b.b = a.b and a.b in (1,2)) else 0 end) from tbl_a a order by 1; - +explain (costs off) select (case when a.b =1 then (select count(*) from tbl_b b where b.a = a.a and b.b = a.b and a.b is not null) else 0 end) from tbl_a a order by 1; +select (case when a.b =1 then (select count(*) from tbl_b b where b.a = a.a and b.b = a.b and a.b is not null) else 0 end) from tbl_a a order by 1; drop table tbl_a; drop table tbl_b; set enable_pullup_subquery to false; From 41f2a048e1c58bb5b1038c5ca984318c3715e22e Mon Sep 17 00:00:00 2001 From: youngxie Date: Wed, 2 Sep 2020 14:50:50 +0800 Subject: [PATCH 041/578] Fix pullup count agg subquery in targetlist --- src/backend/optimizer/plan/subselect.c | 59 +++++++++++++++++++++---- src/test/regress/expected/subselect.out | 12 ++--- 2 files changed, 56 insertions(+), 15 deletions(-) diff --git a/src/backend/optimizer/plan/subselect.c b/src/backend/optimizer/plan/subselect.c index 7d799073..483fdedb 100644 --- a/src/backend/optimizer/plan/subselect.c +++ b/src/backend/optimizer/plan/subselect.c @@ -2851,8 +2851,9 @@ convert_TargetList_sublink_to_join(PlannerInfo *root, TargetEntry *entry) SubLink *sublink = NULL; RangeTblRef *rtr = NULL; RangeTblEntry *rte = NULL; - Var *var = NULL; - List *sublinks = NIL; + Node *target = NULL; + List *sublinks = NIL; + bool count_agg = false; /* Find sublinks in the targetlist entry */ find_sublink_walker((Node *)entry->expr, &sublinks); @@ -2922,17 +2923,49 @@ convert_TargetList_sublink_to_join(PlannerInfo *root, TargetEntry *entry) if (subselect->hasAggs) { + int ressortgroupref = 0; + int varno = 0; List *joinquals = NULL; List *vars = NULL; TargetEntry *ent = NULL; ListCell *cell = NULL; - int ressortgroupref = 0; - int varno = 0; + char *name = NULL; + Aggref *agg = NULL; + Node *expr = linitial(subselect->targetList); /* process 'op' and 'bool' expr only */ if (contain_notexpr_or_neopexpr(whereClause, true, &joinquals)) return NULL; + expr = (Node *)((TargetEntry *)expr)->expr; + /* + * First node must be Agg. + * we optimize subquery only like "SELECT agg()", + * others will not be optimized for now. 
+ */ + if (!IsA(expr, Aggref)) + return NULL; + + agg = (Aggref *)expr; + name = get_func_name(agg->aggfnoid); + if(!name) + { + return NULL; + } + + /* count agg */ + if (pg_strcasecmp(name, "count") == 0) + { + count_agg = true; + } + /* strict aggs are allowed */ + else if (pg_strcasecmp(name, "max") != 0 && pg_strcasecmp(name, "min") != 0 && + pg_strcasecmp(name, "stddev") != 0 && pg_strcasecmp(name, "sum") != 0 && + pg_strcasecmp(name, "avg") != 0 && pg_strcasecmp(name, "variance") != 0) + { + return NULL; + } + vars = pull_vars_of_level((Node *) joinquals, 0); /* construct groupby clause */ @@ -3045,12 +3078,20 @@ convert_TargetList_sublink_to_join(PlannerInfo *root, TargetEntry *entry) parse->jointree = makeFromExpr(list_make1(joinExpr), NULL); /* Build a Var pointing to the subquery */ - var = makeVarFromTargetEntry(rtr->rtindex, linitial(subselect->targetList)); + target = (Node *)makeVarFromTargetEntry(rtr->rtindex, linitial(subselect->targetList)); + + /* Add Coalesce(count,0) */ + if (count_agg) + { + CoalesceExpr *coalesce = makeNode(CoalesceExpr); + coalesce->args = list_make2(target, + makeConst(INT8OID, -1, InvalidOid, sizeof(int64), Int64GetDatum(0), false, true)); + coalesce->coalescetype = INT8OID; + target = (Node *) coalesce; + } - /* Replace sublink node with Var. */ - entry->expr = (Expr *)substitute_sublink_with_node((Node *)entry->expr, - sublink, - (Node *)var); + /* Replace sublink node with Result. */ + entry->expr = (Expr *)substitute_sublink_with_node((Node *)entry->expr, sublink, target); return entry; } #endif diff --git a/src/test/regress/expected/subselect.out b/src/test/regress/expected/subselect.out index e31f5f94..4b7dda27 100644 --- a/src/test/regress/expected/subselect.out +++ b/src/test/regress/expected/subselect.out @@ -1776,7 +1776,7 @@ explain (costs off) select (select count(b.a) from tbl_b b where b.a = a.a) fro QUERY PLAN ----------------------------------------------------------------------- Sort - Sort Key: "TARGETLIST_subquery".count + Sort Key: (COALESCE("TARGETLIST_subquery".count, '0'::bigint)) -> Nested Loop Left Join -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Seq Scan on tbl_a a @@ -1792,6 +1792,7 @@ explain (costs off) select (select count(b.a) from tbl_b b where b.a = a.a) fro select (select count(b.a) from tbl_b b where b.a = a.a ) from tbl_a a order by 1; count ------- + 0 1 1 1 @@ -1801,7 +1802,6 @@ select (select count(b.a) from tbl_b b where b.a = a.a ) from tbl_a a order by 1 1 1 2 - (10 rows) explain (costs off) select (select sum(b.a) from tbl_b b where b.a = a.a and b.b = a.b or b.a = 1) from tbl_a a order by 1; @@ -1896,10 +1896,10 @@ select (case when a.b =1 then (select b.a from tbl_b b where b.a = a.a and b.b = (10 rows) explain (costs off) select (case when a.b =1 then (select count(*) from tbl_b b where b.a = a.a and b.b = a.b and a.b in (1,2)) else 0 end) from tbl_a a order by 1; - QUERY PLAN ------------------------------------------------------------------------------------------ + QUERY PLAN +---------------------------------------------------------------------------------------------------------------- Sort - Sort Key: (CASE WHEN (a.b = 1) THEN "TARGETLIST_subquery".count ELSE '0'::bigint END) + Sort Key: (CASE WHEN (a.b = 1) THEN COALESCE("TARGETLIST_subquery".count, '0'::bigint) ELSE '0'::bigint END) -> Nested Loop Left Join -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Seq Scan on tbl_a a @@ -1917,6 +1917,7 @@ explain (costs off) select (case when a.b =1 then (select count(*) from 
tbl_b b select (case when a.b =1 then (select count(*) from tbl_b b where b.a = a.a and b.b = a.b and a.b in (1,2)) else 0 end) from tbl_a a order by 1; case ------ + 0 1 1 1 @@ -1926,7 +1927,6 @@ select (case when a.b =1 then (select count(*) from tbl_b b where b.a = a.a and 1 1 1 - (10 rows) explain (costs off) select (case when a.b =1 then (select count(*) from tbl_b b where b.a = a.a and b.b = a.b and a.b is not null) else 0 end) from tbl_a a order by 1; From bf4fd50c7846ef4683e796f78a8bbdbc769f8e4a Mon Sep 17 00:00:00 2001 From: ericxwu Date: Wed, 2 Sep 2020 15:31:01 +0800 Subject: [PATCH 042/578] update subselect test expect file --- src/test/regress/expected/subselect_1.out | 46 ++++++++++++++++++++--- 1 file changed, 40 insertions(+), 6 deletions(-) diff --git a/src/test/regress/expected/subselect_1.out b/src/test/regress/expected/subselect_1.out index 79ebb522..cf0f6db9 100644 --- a/src/test/regress/expected/subselect_1.out +++ b/src/test/regress/expected/subselect_1.out @@ -1499,7 +1499,7 @@ explain (costs off) select (select count(b.a) from tbl_b b where b.a = a.a) fro QUERY PLAN ----------------------------------------------------------------------- Sort - Sort Key: "TARGETLIST_subquery".count + Sort Key: (COALESCE("TARGETLIST_subquery".count, '0'::bigint)) -> Nested Loop Left Join -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Seq Scan on tbl_a a @@ -1515,6 +1515,7 @@ explain (costs off) select (select count(b.a) from tbl_b b where b.a = a.a) fro select (select count(b.a) from tbl_b b where b.a = a.a ) from tbl_a a order by 1; count ------- + 0 1 1 1 @@ -1524,7 +1525,6 @@ select (select count(b.a) from tbl_b b where b.a = a.a ) from tbl_a a order by 1 1 1 2 - (10 rows) explain (costs off) select (select sum(b.a) from tbl_b b where b.a = a.a and b.b = a.b or b.a = 1) from tbl_a a order by 1; @@ -1619,10 +1619,10 @@ select (case when a.b =1 then (select b.a from tbl_b b where b.a = a.a and b.b = (10 rows) explain (costs off) select (case when a.b =1 then (select count(*) from tbl_b b where b.a = a.a and b.b = a.b and a.b in (1,2)) else 0 end) from tbl_a a order by 1; - QUERY PLAN ------------------------------------------------------------------------------------------ + QUERY PLAN +---------------------------------------------------------------------------------------------------------------- Sort - Sort Key: (CASE WHEN (a.b = 1) THEN "TARGETLIST_subquery".count ELSE '0'::bigint END) + Sort Key: (CASE WHEN (a.b = 1) THEN COALESCE("TARGETLIST_subquery".count, '0'::bigint) ELSE '0'::bigint END) -> Nested Loop Left Join -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Seq Scan on tbl_a a @@ -1640,6 +1640,41 @@ explain (costs off) select (case when a.b =1 then (select count(*) from tbl_b b select (case when a.b =1 then (select count(*) from tbl_b b where b.a = a.a and b.b = a.b and a.b in (1,2)) else 0 end) from tbl_a a order by 1; case ------ + 0 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 +(10 rows) + +explain (costs off) select (case when a.b =1 then (select count(*) from tbl_b b where b.a = a.a and b.b = a.b and a.b is not null) else 0 end) from tbl_a a order by 1; + QUERY PLAN +---------------------------------------------------------------------------------------------------------------- + Sort + Sort Key: (CASE WHEN (a.b = 1) THEN COALESCE("TARGETLIST_subquery".count, '0'::bigint) ELSE '0'::bigint END) + -> Nested Loop Left Join + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on tbl_a a + -> Materialize + -> Remote Subquery Scan on all 
(datanode_1,datanode_2) + -> Subquery Scan on "TARGETLIST_subquery" + -> GroupAggregate + Group Key: b.a, b.b + -> Result + One-Time Filter: (a.b IS NOT NULL) + -> Seq Scan on tbl_b b + Filter: ((a = a.a) AND (b = a.b)) +(14 rows) + +select (case when a.b =1 then (select count(*) from tbl_b b where b.a = a.a and b.b = a.b and a.b is not null) else 0 end) from tbl_a a order by 1; + case +------ + 0 1 1 1 @@ -1649,7 +1684,6 @@ select (case when a.b =1 then (select count(*) from tbl_b b where b.a = a.a and 1 1 1 - (10 rows) drop table tbl_a; From 7935999fdb5efc755acc532b97017750a3e774b3 Mon Sep 17 00:00:00 2001 From: qiannzhang Date: Wed, 2 Sep 2020 15:35:55 +0800 Subject: [PATCH 043/578] Fix coredump when alter gtm node --- src/backend/access/transam/gtm.c | 201 +++++++++++++++++-------------- src/backend/pgxc/pool/pgxcnode.c | 94 +++++++-------- 2 files changed, 155 insertions(+), 140 deletions(-) diff --git a/src/backend/access/transam/gtm.c b/src/backend/access/transam/gtm.c index 4545592b..267c5b88 100644 --- a/src/backend/access/transam/gtm.c +++ b/src/backend/access/transam/gtm.c @@ -1081,100 +1081,105 @@ IsGTMConnected() } #ifdef __TBASE__ +/* + * Set gtm info with GtmHost and GtmPort. + * + * There are three cases: + * 1.New gtm info from create/alter gtm node command + * 2.Gtm info from pgxc_node + * 3.Gtm info from recovery gtm host + */ static void GetMasterGtmInfo(void) -{// #lizard forgives - /* Check gtm host and port info */ - Relation rel; - HeapScanDesc scan; - HeapTuple gtmtup; - Form_pgxc_node nodeForm; - bool found = false; - - /* reset gtm info */ - ResetGtmInfo(); - - /* we have no recovery gtm host info, just read from heap. */ - if (!g_recovery_gtm_host->need_read) - { - rel = heap_open(PgxcNodeRelationId, AccessShareLock); - scan = heap_beginscan_catalog(rel, 0, NULL); - - /* Only one record will match */ - while (HeapTupleIsValid(gtmtup = heap_getnext(scan, ForwardScanDirection))) - { - nodeForm = (Form_pgxc_node) GETSTRUCT(gtmtup); - if (PGXC_NODE_GTM == nodeForm->node_type && nodeForm->nodeis_primary) - { - GtmHost = strdup(NameStr(nodeForm->node_host)); - GtmPort = nodeForm->node_port; - found = true; - break; - } - } - - heap_endscan(scan); - heap_close(rel, AccessShareLock); - } - else - { - /* get the gtm host info */ - GtmHost = strdup(NameStr(g_recovery_gtm_host->hostdata)); - GtmPort = g_recovery_gtm_host->port; - found = true; - } - - if (!found) - { - if (NewGtmHost && NewGtmPort != 0) - { - elog(LOG, "GetMasterGtmInfo: can not get master gtm info from pgxc_node, try use NewGtmHost:%s NewGtmPort:%d", - NewGtmHost, NewGtmPort); - } - else - { - elog(LOG, "GetMasterGtmInfo: can not get master gtm info from pgxc_node"); - } - } +{ + /* Check gtm host and port info */ + Relation rel; + HeapScanDesc scan; + HeapTuple gtmtup; + Form_pgxc_node nodeForm; + bool found = false; + + /* reset gtm info */ + ResetGtmInfo(); + + /* If NewGtmHost and NewGtmPort, just use it. */ + if (NewGtmHost && NewGtmPort != 0) + { + GtmHost = strdup(NewGtmHost); + GtmPort = NewGtmPort; + + free(NewGtmHost); + NewGtmHost = NULL; + NewGtmPort = 0; + + elog(LOG, + "GetMasterGtmInfo: set master gtm info with NewGtmHost:%s NewGtmPort:%d", + NewGtmHost, NewGtmPort); + return; + } + + /* we have no recovery gtm host info, just read from heap. */ + if (!g_recovery_gtm_host->need_read) + { + /* + * We must be sure there is no error report, because we may be + * in AbortTransaction now. + * 1.If we are not in a transaction, we should not open relation. 
+ * 2.If we do not get lock, it is ok to try it next time. + */ + if (IsTransactionState() && + ConditionalLockRelationOid(PgxcNodeRelationId, AccessShareLock)) + { + rel = relation_open(PgxcNodeRelationId, NoLock); + scan = heap_beginscan_catalog(rel, 0, NULL); + /* Only one record will match */ + while (HeapTupleIsValid(gtmtup = heap_getnext(scan, ForwardScanDirection))) + { + nodeForm = (Form_pgxc_node) GETSTRUCT(gtmtup); + if (PGXC_NODE_GTM == nodeForm->node_type && nodeForm->nodeis_primary) + { + GtmHost = strdup(NameStr(nodeForm->node_host)); + GtmPort = nodeForm->node_port; + found = true; + break; + } + } + heap_endscan(scan); + relation_close(rel, AccessShareLock); + } + } + else + { + /* get the gtm host info */ + GtmHost = strdup(NameStr(g_recovery_gtm_host->hostdata)); + GtmPort = g_recovery_gtm_host->port; + found = true; + } + + if (!found) + { + elog(LOG, + "GetMasterGtmInfo: can not get master gtm info from pgxc_node"); + } } #endif static void CheckConnection(void) -{// #lizard forgives -#ifdef __TBASE__ - /* First time try connect to gtm, get gtm info from syscache first */ - if (NULL == GtmHost && 0 == GtmPort) - { - GetMasterGtmInfo(); - } - - /* If NewGtmHost and NewGtmPort were set, we are in create/alter gtm node command */ - if (NewGtmHost && NewGtmPort != 0) - { - ResetGtmInfo(); - - GtmHost = strdup(NewGtmHost); - GtmPort = NewGtmPort; - - free(NewGtmHost); - NewGtmHost = NULL; - NewGtmPort = 0; - - /* Close old gtm connection */ - CloseGTM(); - } -#endif - - /* Be sure that a backend does not use a postmaster connection */ - if (IsUnderPostmaster && GTMPQispostmaster(conn) == 1) - { - InitGTM(); - return; - } - - if (GTMPQstatus(conn) != CONNECTION_OK) - InitGTM(); +{ + /* Be sure that a backend does not use a postmaster connection */ + if (IsUnderPostmaster && GTMPQispostmaster(conn) == 1) + { + CloseGTM(); + InitGTM(); + return; + } + + if (GTMPQstatus(conn) != CONNECTION_OK) + { + CloseGTM(); + InitGTM(); + } } void @@ -1183,8 +1188,26 @@ InitGTM(void) #define CONNECT_STR_LEN 256 /* 256 bytes should be enough */ char conn_str[CONNECT_STR_LEN]; #ifdef __TBASE__ - int try_cnt = 0; - const int max_try_cnt = 1; + int try_cnt = 0; + const int max_try_cnt = 1; + + /* + * Only re-set gtm info in two cases: + * 1.No gtm info + * 2.New gtm info by create/alter gtm node command + */ + if ((GtmHost == NULL && GtmPort == 0) || + (NewGtmHost != NULL && NewGtmPort != 0)) + { + GetMasterGtmInfo(); + } + if (GtmHost == NULL && GtmPort == 0) + { + ereport(LOG, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("GtmHost and GtmPort are not set"))); + return; + } #endif try_connect_gtm: diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c index f2886833..775b703d 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -1365,23 +1365,17 @@ get_message(PGXCNodeHandle *conn, int *len, char **msg) */ void release_handles(bool force) -{// #lizard forgives - bool destroy = false; - int i; - int nbytes = 0; - if (!force) - { - if (HandlesInvalidatePending) - { - DoInvalidateRemoteHandles(); - return; - } - - /* don't free connection if holding a cluster lock */ - if (cluster_ex_lock_held) - { - return; - } +{ + bool destroy = false; + int i; + int nbytes = 0; + if (!force) + { + /* don't free connection if holding a cluster lock */ + if (cluster_ex_lock_held) + { + return; + } if (datanode_count == 0 && coord_count == 0 && slavedatanode_count == 0) { @@ -1425,34 +1419,33 @@ release_handles(bool force) #ifndef __USE_GLOBAL_SNAPSHOT__ 
handle->sendGxidVersion = 0; #endif - nbytes = pgxc_node_is_data_enqueued(handle); - if (nbytes) - { - elog(PANIC, "Connection to Datanode %s has data %d pending", - handle->nodename, nbytes); - } - } - - - for (i = 0; i < NumSlaveDataNodes; i++) - { - PGXCNodeHandle *handle = &sdn_handles[i]; - - if (handle->sock != NO_SOCKET) - { - /* - * Connections at this point should be completely inactive, - * otherwise abaandon them. We can not allow not cleaned up - * connection is returned to pool. - */ - if (handle->state != DN_CONNECTION_STATE_IDLE || - handle->transaction_status != 'I') - { - destroy = true; - elog(DEBUG1, "Connection to Datanode %d has unexpected state %d and will be dropped", - handle->nodeoid, handle->state); - } - + nbytes = pgxc_node_is_data_enqueued(handle); + if (nbytes) + { + elog(PANIC, "Connection to Datanode %s has data %d pending", + handle->nodename, nbytes); + } + } + + for (i = 0; i < NumSlaveDataNodes; i++) + { + PGXCNodeHandle *handle = &sdn_handles[i]; + + if (handle->sock != NO_SOCKET) + { + /* + * Connections at this point should be completely inactive, + * otherwise abaandon them. We can not allow not cleaned up + * connection is returned to pool. + */ + if (handle->state != DN_CONNECTION_STATE_IDLE || + handle->transaction_status != 'I') + { + destroy = true; + elog(DEBUG1, "Connection to Datanode %d has unexpected state %d and will be dropped", + handle->nodeoid, handle->state); + } + #ifdef _PG_REGRESS_ elog(LOG, "release_handles release a connection with datanode %s" "remote backend PID %d", @@ -1513,8 +1506,7 @@ release_handles(bool force) } } - //destroy = true; - /* And finally release all the connections on pooler */ + /* And finally release all the connections on pooler */ PoolManagerReleaseConnections(destroy); datanode_count = 0; @@ -4795,12 +4787,12 @@ DoInvalidateRemoteHandles(void) { bool result = false; - HandlesInvalidatePending = false; - HandlesRefreshPending = false; + InitMultinodeExecutor(true); - InitMultinodeExecutor(true); + HandlesInvalidatePending = false; + HandlesRefreshPending = false; - return result; + return result; } /* From f0a41293dedaa1988c21aeed0f7903ea734c91c8 Mon Sep 17 00:00:00 2001 From: ericxwu Date: Thu, 3 Sep 2020 16:39:33 +0800 Subject: [PATCH 044/578] Support converting correlated ANY sublink to lateral subquery We support both 1-level upper or above 1-level cases. Lateral flag of subquery should be set when pulling up those sublinks with exactly one upper level correlations. If the correlation is above one upper level, the pullup will not have any side effect, thus we can threat them as normal pullup. 
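As an illustrative sketch only (not part of the patch diff below), the kind of query this transformation targets is an ANY/IN sublink whose subquery references a Var from exactly one query level up; using the tbl_a/tbl_b regression tables created earlier in these tests:

    -- b.b = a.b is a level-1 correlation, so the pulled-up subquery
    -- must be flagged LATERAL when it becomes a range table entry
    select a.a
    from tbl_a a
    where a.a in (select b.a from tbl_b b where b.b = a.b);

With enable_pullup_subquery enabled, such a sublink can be planned as a semi-join against the lateral subquery rather than re-executing a subplan for every outer row; deeper (above one level) correlations are pulled up without setting the lateral flag, as described above.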
--- src/backend/optimizer/plan/subselect.c | 427 +++++++------ src/backend/optimizer/util/var.c | 9 + src/include/optimizer/subselect.h | 6 +- src/include/optimizer/var.h | 1 - src/test/regress/expected/subselect.out | 816 ++++++++++++------------ src/test/regress/sql/subselect.sql | 9 +- 6 files changed, 663 insertions(+), 605 deletions(-) diff --git a/src/backend/optimizer/plan/subselect.c b/src/backend/optimizer/plan/subselect.c index 483fdedb..f7832d4d 100644 --- a/src/backend/optimizer/plan/subselect.c +++ b/src/backend/optimizer/plan/subselect.c @@ -172,8 +172,12 @@ static bool finalize_primnode(Node *node, finalize_primnode_context *context); static bool finalize_agg_primnode(Node *node, finalize_primnode_context *context); #ifdef __TBASE__ -static Expr * convert_OR_EXIST_sublink_to_join(PlannerInfo *root, SubLink *sublink, Node **jtlink); -static Node * get_or_exist_subquery_targetlist(PlannerInfo *root, Node *node,List **targetList, List **joinClause, int *next_attno); +static Expr * convert_OR_EXIST_sublink_to_join(PlannerInfo *root, + SubLink *sublink, Node **jtlink); +static Node * get_or_exist_subquery_targetlist(PlannerInfo *root, Node *node, + List **targetList, List **joinClause, int *next_attno); +static bool is_simple_subquery(Query *subquery, JoinExpr *lowest_outer_join, + bool deletion_ok); #endif /* * Select a PARAM_EXEC number to identify the given Var as a parameter for @@ -538,6 +542,140 @@ get_first_col_type(Plan *plan, Oid *coltype, int32 *coltypmod, *colcollation = InvalidOid; } +#ifdef __TBASE__ +/* + * Check if there is a range table entry of type func expr whose arguments + * are correlated + */ +bool +has_correlation_in_funcexpr_rte(List *rtable) +{ + /* + * check if correlation occurs in a func expr in the from clause of the + * subselect + */ + ListCell *lc_rte; + + foreach(lc_rte, rtable) + { + RangeTblEntry *rte = (RangeTblEntry *) lfirst(lc_rte); + + if (rte->functions && contain_vars_upper_level((Node *) rte->functions, 1)) + { + return true; + } + } + return false; +} + +/* + * is_simple_subquery + * Check a subquery in the range table to see if it's simple enough + * to pull up into the parent query. + * + * rte is the RTE_SUBQUERY RangeTblEntry that contained the subquery. + * (Note subquery is not necessarily equal to rte->subquery; it could be a + * processed copy of that.) + * lowest_outer_join is the lowest outer join above the subquery, or NULL. + * deletion_ok is TRUE if it'd be okay to delete the subquery entirely. + */ +static bool +is_simple_subquery(Query *subquery, + JoinExpr *lowest_outer_join, + bool deletion_ok) +{ + /* + * Let's just make sure it's a valid subselect ... + */ + if (!IsA(subquery, Query) || + subquery->commandType != CMD_SELECT) + elog(ERROR, "subquery is bogus"); + + /* + * Can't currently pull up a query with setops (unless it's simple UNION + * ALL, which is handled by a different code path). Maybe after querytree + * redesign... + */ + if (subquery->setOperations) + return false; + + /* + * Can't pull up a subquery involving grouping, aggregation, SRFs, + * sorting, limiting, or WITH. (XXX WITH could possibly be allowed later) + * + * We also don't pull up a subquery that has explicit FOR UPDATE/SHARE + * clauses, because pullup would cause the locking to occur semantically + * higher than it should. Implicit FOR UPDATE/SHARE is okay because in + * that case the locking was originally declared in the upper query + * anyway. 
+ */ + if (subquery->hasAggs || + subquery->hasWindowFuncs || + subquery->hasTargetSRFs || + subquery->groupClause || + subquery->groupingSets || + subquery->havingQual || + subquery->sortClause || + subquery->distinctClause || + subquery->limitOffset || + subquery->limitCount || + subquery->hasForUpdate || + subquery->cteList) + return false; + + /* + * Don't pull up a subquery with an empty jointree, unless it has no quals + * and deletion_ok is TRUE and we're not underneath an outer join. + * + * query_planner() will correctly generate a Result plan for a jointree + * that's totally empty, but we can't cope with an empty FromExpr + * appearing lower down in a jointree: we identify join rels via baserelid + * sets, so we couldn't distinguish a join containing such a FromExpr from + * one without it. We can only handle such cases if the place where the + * subquery is linked is a FromExpr or inner JOIN that would still be + * nonempty after removal of the subquery, so that it's still identifiable + * via its contained baserelids. Safe contexts are signaled by + * deletion_ok. + * + * But even in a safe context, we must keep the subquery if it has any + * quals, because it's unclear where to put them in the upper query. + * + * Also, we must forbid pullup if such a subquery is underneath an outer + * join, because then we might need to wrap its output columns with + * PlaceHolderVars, and the PHVs would then have empty relid sets meaning + * we couldn't tell where to evaluate them. (This test is separate from + * the deletion_ok flag for possible future expansion: deletion_ok tells + * whether the immediate parent site in the jointree could cope, not + * whether we'd have PHV issues. It's possible this restriction could be + * fixed by letting the PHVs use the relids of the parent jointree item, + * but that complication is for another day.) + * + * Note that deletion of a subquery is also dependent on the check below + * that its targetlist contains no set-returning functions. Deletion from + * a FROM list or inner JOIN is okay only if the subquery must return + * exactly one row. + */ + if (subquery->jointree->fromlist == NIL && + (subquery->jointree->quals != NULL || + !deletion_ok || + lowest_outer_join != NULL)) + return false; + + /* + * Don't pull up a subquery that has any volatile functions in its + * targetlist. Otherwise we might introduce multiple evaluations of these + * functions, if they get copied to multiple places in the upper query, + * leading to surprising results. (Note: the PlaceHolderVar mechanism + * doesn't quite guarantee single evaluation; else we could pull up anyway + * and just wrap such items in PlaceHolderVars ...) + */ + if (contain_volatile_functions((Node *) subquery->targetList)) + return false; + + return true; +} +#endif + /* * Convert a SubLink (as created by the parser) into a SubPlan. * @@ -1453,12 +1591,6 @@ SS_process_ctes(PlannerInfo *root) } #ifdef __TBASE__ -static bool -simplify_ANY_query(PlannerInfo *root, Query *query) -{ - return false; -} - static bool simplify_EXPR_query(PlannerInfo *root, Query *query) {// #lizard forgives @@ -2005,82 +2137,73 @@ convert_ANY_sublink_to_join(PlannerInfo *root, SubLink *sublink, Node *quals; ParseState *pstate; #ifdef __TBASE__ - int offset = 0; - Node *whereClause = NULL; + bool correlated = false; #endif Assert(sublink->subLinkType == ANY_SUBLINK); #ifdef __TBASE__ - /* - * handle correlated subquery here. - * simple case: select * from a where a.X in (select b.X from b where a.Xx ? 
b.Xx.......); - */ - if (simplify_ANY_query(root, subselect)) - { - subselect = copyObject(subselect); - whereClause = subselect->jointree->quals; - subselect->jointree->quals = NULL; - - /* - * The rest of the sub-select must not refer to any Vars of the parent - * query. (Vars of higher levels should be okay, though.) - */ - if (contain_vars_of_level((Node *) subselect, 1)) - return NULL; + if (enable_pullup_subquery) + { + /* + * If there are CTEs, then the transformation does not work. Don't attempt + * to pullup. + */ + if (parse->cteList) + return NULL; - if (whereClause) - { + /* + * If uncorrelated, and no Var nodes on lhs, the subquery will be executed + * only once. It should become an InitPlan, but make_subplan() doesn't + * handle that case, so just flatten it for now. + * TODO: Let it become an InitPlan, so its QEs can be recycled. + * + * We only handle level 1 correlated cases. The sub-select must not refer + * to any Vars of the parent query. (Vars of higher levels should be okay, + * though.) + */ + correlated = contain_vars_of_level((Node *) subselect, 1); - if (contain_vars_of_level((Node *) subselect, 1)) - return NULL; - /* - * the WHERE clause may contain some Vars of the - * parent query. - */ - upper_varnos = pull_varnos_of_level(whereClause, 1); + if (correlated) + { + /* + * If deeply(>1) correlated, then don't pull it up + */ + if (contain_vars_upper_level(sublink->subselect, 1)) + return NULL; - if (upper_varnos) - { - /* whereclause contains vars from different parent query */ - if (bms_num_members(upper_varnos) > 1) - { - return NULL; - } - - if (!bms_is_subset(upper_varnos, available_rels)) - { - return NULL; - } - } + /* + * Under certain conditions, we cannot pull up the subquery as a join. + */ + if (!is_simple_subquery(subselect, NULL, false)) + return NULL; - /* - * We don't risk optimizing if the WHERE clause is volatile, either. - */ - if (contain_volatile_functions(whereClause)) - return NULL; - } - } - else - { - whereClause = NULL; + /* + * Do not pull subqueries with correlation in a func expr in the from + * clause of the subselect + */ + if (has_correlation_in_funcexpr_rte(subselect->rtable)) + return NULL; - if (under_not) - { - return NULL; - } + if (contain_subplans(subselect->jointree->quals)) + return NULL; + } + } + else + { +#endif + /* + * The sub-select must not refer to any Vars of the parent query. (Vars of + * higher levels should be okay, though.) + */ + if (contain_vars_of_level((Node *) subselect, 1)) + return NULL; +#ifdef __TBASE__ + } - if (contain_vars_of_level((Node *) subselect, 1)) - return NULL; - } - -#else - /* - * The sub-select must not refer to any Vars of the parent query. (Vars of - * higher levels should be okay, though.) - */ - if (contain_vars_of_level((Node *) subselect, 1)) - return NULL; + /* TODO: Currently we do not pullup under_not */ + if (under_not) + return NULL; #endif /* @@ -2107,50 +2230,28 @@ convert_ANY_sublink_to_join(PlannerInfo *root, SubLink *sublink, /* Create a dummy ParseState for addRangeTableEntryForSubquery */ pstate = make_parsestate(NULL); + /* + * Okay, pull up the sub-select into upper range table. + * + * We rely here on the assumption that the outer query has no references + * to the inner (necessarily true, other than the Vars that we build + * below). Therefore this is a lot easier than what pull_up_subqueries has + * to go through. + * + * If the subquery is correlated, i.e. it refers to any Vars of the + * parent query, mark it as lateral. 
+ */ + rte = addRangeTableEntryForSubquery(pstate, + subselect, + makeAlias("ANY_subquery", NIL), #ifdef __TBASE__ - if (whereClause) - { - rtindex = list_length(parse->rtable); - - offset = rtindex; - - OffsetVarNodes(whereClause, rtindex, 0); - - IncrementVarSublevelsUp(whereClause, -1, 1); - } -#endif - - /* - * Okay, pull up the sub-select into upper range table. - * - * We rely here on the assumption that the outer query has no references - * to the inner (necessarily true, other than the Vars that we build - * below). Therefore this is a lot easier than what pull_up_subqueries has - * to go through. - */ -#ifdef __TBASE__ - if (whereClause) - { -#endif - rte = addRangeTableEntryForSubquery(pstate, - subselect, - makeAlias("ANY_subquery", NIL), - false, - false); -#ifdef __TBASE__ - } - else - { - rte = addRangeTableEntryForSubquery(pstate, - (Query *) sublink->subselect, - makeAlias("ANY_subquery", NIL), - false, - false); - } + correlated, /* lateral */ +#else + false, #endif - - parse->rtable = lappend(parse->rtable, rte); - rtindex = list_length(parse->rtable); + false); + parse->rtable = lappend(parse->rtable, rte); + rtindex = list_length(parse->rtable); /* * Form a RangeTblRef for the pulled-up sub-select. @@ -2165,95 +2266,15 @@ convert_ANY_sublink_to_join(PlannerInfo *root, SubLink *sublink, subselect->targetList, rtindex); -#ifdef __TBASE__ - /* add vars from subquery in whereclause into targetlist */ - if (whereClause) - { - ListCell *cell; - List *vars = pull_vars_of_level((Node *)whereClause, 0); - - foreach(cell, vars) - { - Var *var = lfirst(cell); - - if (var->varno == rtindex) - { - bool match = false; - ListCell *lc; - Var *temp_var = NULL; - TargetEntry *ent = NULL; - int varno = 0; - int varlevelsup = 0; - - if (var->varlevelsup >= 1) - { - varlevelsup = var->varlevelsup; - var->varlevelsup = 0; - } - - temp_var = copyObject(var); - temp_var->varno -= offset; - temp_var->varnoold -= offset; - - match = false; - foreach(lc, subselect->targetList) - { - TargetEntry *tent = (TargetEntry *) lfirst(lc); - - if (IsA(tent->expr, Var)) - { - if (equal(temp_var, tent->expr)) - { - match = true; - - var->varattno = var->varoattno = tent->resno; - - break; - } - } - } - - if (!match) - { - ent = makeTargetEntry((Expr *)temp_var, temp_var->varoattno, NULL, false); - - subselect->targetList = lappend(subselect->targetList, ent); - - varno = list_length(subselect->targetList); - - ent->resno = varno; - - var->varattno = var->varoattno = varno; - } - - if (varlevelsup) - { - var->varlevelsup = varlevelsup; - } - } - } - } -#endif - - /* - * Build the new join's qual expression, replacing Params with these Vars. - */ - quals = convert_testexpr(root, sublink->testexpr, subquery_vars); - -#ifdef __TBASE__ - /* make join quals with whereclause */ - if (whereClause) - { - Expr *expr = makeBoolExpr(AND_EXPR, list_make2(quals, whereClause), 0); - - quals = (Node *)expr; - } -#endif + /* + * Build the new join's qual expression, replacing Params with these Vars. + */ + quals = convert_testexpr(root, sublink->testexpr, subquery_vars); - /* - * And finally, build the JoinExpr node. - */ - result = makeNode(JoinExpr); + /* + * And finally, build the JoinExpr node. + */ + result = makeNode(JoinExpr); #ifdef __TBASE__ result->jointype = under_not ? 
JOIN_ANTI : JOIN_SEMI; #else diff --git a/src/backend/optimizer/util/var.c b/src/backend/optimizer/util/var.c index a6bc46f1..228adc19 100644 --- a/src/backend/optimizer/util/var.c +++ b/src/backend/optimizer/util/var.c @@ -844,6 +844,15 @@ alias_relid_set(PlannerInfo *root, Relids relids) } #ifdef __TBASE__ +/* + * contain_vars_upper_level + * Recursively scan a clause to discover whether it contains any Var nodes + * of/above the specified query level. + * + * Returns true if any such Var found. + * + * Will recurse into sublinks. Also, may be invoked directly on a Query. + */ bool contain_vars_upper_level(Node *node, int levelsup) { diff --git a/src/include/optimizer/subselect.h b/src/include/optimizer/subselect.h index ec687d2f..47ba77f5 100644 --- a/src/include/optimizer/subselect.h +++ b/src/include/optimizer/subselect.h @@ -120,4 +120,8 @@ extern Param *assign_nestloop_param_placeholdervar(PlannerInfo *root, PlaceHolderVar *phv); extern int SS_assign_special_param(PlannerInfo *root); -#endif /* SUBSELECT_H */ +#ifdef __TBASE__ +extern bool has_correlation_in_funcexpr_rte(List *rtable); +#endif + +#endif /* SUBSELECT_H */ diff --git a/src/include/optimizer/var.h b/src/include/optimizer/var.h index 968da4b2..5cbb90d2 100644 --- a/src/include/optimizer/var.h +++ b/src/include/optimizer/var.h @@ -98,7 +98,6 @@ extern List *pull_var_clause(Node *node, int flags); extern Node *flatten_join_alias_vars(PlannerInfo *root, Node *node); #ifdef __TBASE__ extern bool contain_vars_upper_level(Node *node, int levelsup); - #endif #endif /* VAR_H */ diff --git a/src/test/regress/expected/subselect.out b/src/test/regress/expected/subselect.out index 4b7dda27..096bb24f 100644 --- a/src/test/regress/expected/subselect.out +++ b/src/test/regress/expected/subselect.out @@ -1161,411 +1161,127 @@ NOTICE: x = 9, y = 13 drop function tattle(x int, y int); -- --- Tests for CTE inlining behavior +-- Tests for pulling up more sublinks -- --- Basic subquery that can be inlined -explain (verbose, costs off) -with x as (select * from (select f1 from subselect_tbl) ss) -select * from x where f1 = 1; - QUERY PLAN ------------------------------------------- - Remote Subquery Scan on all (datanode_1) - Output: subselect_tbl.f1 - -> Seq Scan on public.subselect_tbl - Output: subselect_tbl.f1 - Filter: (subselect_tbl.f1 = 1) -(5 rows) - --- Explicitly request materialization -explain (verbose, costs off) -with x as materialized (select * from (select f1 from subselect_tbl) ss) -select * from x where f1 = 1; - QUERY PLAN -------------------------------------------------------------- - CTE Scan on x - Output: x.f1 - Filter: (x.f1 = 1) - CTE x - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Output: subselect_tbl.f1 - -> Seq Scan on public.subselect_tbl - Output: subselect_tbl.f1 +set enable_pullup_subquery to true; +create table tbl_a(a int,b int); +create table tbl_b(a int,b int); +insert into tbl_a select generate_series(1,10),1; +insert into tbl_b select generate_series(2,11),1; +-- check targetlist subquery scenario. 
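For reference, a minimal sketch (not part of the patch, using the tbl_a/tbl_b tables created just above) of what the targetlist pullup exercised below amounts to: with enable_pullup_subquery on, the correlated scalar sublink in the target list is planned roughly as an outer join on the correlation qual, with the "Left Scalar Join" variant still enforcing the at-most-one-matching-row rule at run time.

-- scalar sublink form exercised by the tests that follow
select a.a, (select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1, 2;
-- roughly equivalent explicit-join form (ignoring the more-than-one-row check)
select a.a, b.a as q from tbl_a a left join tbl_b b on b.a = a.a order by 1, 2;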
+set enable_nestloop to true; +set enable_hashjoin to false; +set enable_mergejoin to false; +explain select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; + QUERY PLAN +------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) (cost=15636.19..15637.88 rows=675 width=8) + -> Sort (cost=15636.19..15637.88 rows=675 width=8) + Sort Key: a.a, b.a + -> Nested Loop Left Scalar Join (cost=0.00..15604.47 rows=675 width=8) + Join Filter: (b.a = a.a) + -> Seq Scan on tbl_a a (cost=0.00..23.50 rows=1350 width=4) + -> Materialize (cost=0.00..30.25 rows=1350 width=4) + -> Seq Scan on tbl_b b (cost=0.00..23.50 rows=1350 width=4) (8 rows) --- Stable functions are safe to inline -explain (verbose, costs off) -with x as (select * from (select f1, now() from subselect_tbl) ss) -select * from x where f1 = 1; - QUERY PLAN ------------------------------------------- - Remote Subquery Scan on all (datanode_1) - Output: subselect_tbl.f1, now() - -> Seq Scan on public.subselect_tbl - Output: subselect_tbl.f1, now() - Filter: (subselect_tbl.f1 = 1) -(5 rows) +select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; + a | q +----+---- + 1 | + 2 | 2 + 3 | 3 + 4 | 4 + 5 | 5 + 6 | 6 + 7 | 7 + 8 | 8 + 9 | 9 + 10 | 10 +(10 rows) --- Volatile functions prevent inlining -explain (verbose, costs off) -with x as (select * from (select f1, random() from subselect_tbl) ss) -select * from x where f1 = 1; - QUERY PLAN -------------------------------------------------------------- - CTE Scan on x - Output: x.f1, x.random - Filter: (x.f1 = 1) - CTE x - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Output: f1, random - -> Seq Scan on public.subselect_tbl - Output: subselect_tbl.f1, random() +set enable_nestloop to false; +set enable_hashjoin to true; +set enable_mergejoin to false; +explain (costs off) select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: a.a, b.a + -> Hash Left Scalar Join + Hash Cond: (a.a = b.a) + -> Seq Scan on tbl_a a + -> Hash + -> Seq Scan on tbl_b b (8 rows) --- SELECT FOR UPDATE cannot be inlined -explain (verbose, costs off) -with x as (select * from (select f1 from subselect_tbl for update) ss) -select * from x where f1 = 1; - QUERY PLAN --------------------------------------------------------------------------- - CTE Scan on x - Output: x.f1 - Filter: (x.f1 = 1) - CTE x - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Output: ss.f1 - -> Subquery Scan on ss - Output: ss.f1 - -> LockRows - Output: subselect_tbl.f1, subselect_tbl.ctid - -> Seq Scan on public.subselect_tbl - Output: subselect_tbl.f1, subselect_tbl.ctid -(12 rows) +select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; + a | q +----+---- + 1 | + 2 | 2 + 3 | 3 + 4 | 4 + 5 | 5 + 6 | 6 + 7 | 7 + 8 | 8 + 9 | 9 + 10 | 10 +(10 rows) --- Multiply-referenced CTEs are inlined only when requested -explain (verbose, costs off) -with x as (select * from (select f1, now() as n from subselect_tbl) ss) -select * from x, x x2 where x.n = x2.n; - QUERY PLAN -------------------------------------------------------------- - Merge Join - Output: x.f1, x.n, x2.f1, x2.n - Merge Cond: (x.n = x2.n) - CTE x - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Output: subselect_tbl.f1, now() - -> Seq 
Scan on public.subselect_tbl - Output: subselect_tbl.f1, now() - -> Sort - Output: x.f1, x.n - Sort Key: x.n - -> CTE Scan on x - Output: x.f1, x.n +set enable_nestloop to false; +set enable_hashjoin to false; +set enable_mergejoin to true; +explain (costs off) select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) -> Sort - Output: x2.f1, x2.n - Sort Key: x2.n - -> CTE Scan on x x2 - Output: x2.f1, x2.n -(18 rows) + Sort Key: a.a, b.a + -> Merge Left Scalar Join + Merge Cond: (a.a = b.a) + -> Sort + Sort Key: a.a + -> Seq Scan on tbl_a a + -> Sort + Sort Key: b.a + -> Seq Scan on tbl_b b +(11 rows) -explain (verbose, costs off) -with x as not materialized (select * from (select f1, now() as n from subselect_tbl) ss) -select * from x, x x2 where x.n = x2.n; - QUERY PLAN --------------------------------------------------------------------------------- +select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; + a | q +----+---- + 1 | + 2 | 2 + 3 | 3 + 4 | 4 + 5 | 5 + 6 | 6 + 7 | 7 + 8 | 8 + 9 | 9 + 10 | 10 +(10 rows) + +-- check non-scalar scenario. +insert into tbl_b values(2,2); +set enable_nestloop to true; +set enable_hashjoin to false; +set enable_mergejoin to false; +explain (costs off) select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; + QUERY PLAN +----------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) - Output: subselect_tbl.f1, now(), subselect_tbl_1.f1, now() - -> Result - Output: subselect_tbl.f1, (now()), subselect_tbl_1.f1, (now()) - One-Time Filter: (now() = now()) - -> Nested Loop - Output: subselect_tbl.f1, (now()), subselect_tbl_1.f1, (now()) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Output: subselect_tbl.f1, now() - Distribute results by H: now() - -> Seq Scan on public.subselect_tbl - Output: subselect_tbl.f1, now() + -> Sort + Sort Key: a.a, b.a + -> Nested Loop Left Scalar Join + Join Filter: (b.a = a.a) + -> Seq Scan on tbl_a a -> Materialize - Output: subselect_tbl_1.f1, (now()) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Output: subselect_tbl_1.f1, now() - Distribute results by H: now() - -> Seq Scan on public.subselect_tbl subselect_tbl_1 - Output: subselect_tbl_1.f1, now() -(19 rows) - --- Multiply-referenced CTEs can't be inlined if they contain outer self-refs -explain (verbose, costs off) -with recursive x(a) as - ((values ('a'), ('b')) - union all - (with z as not materialized (select * from x) - select z.a || z1.a as a from z cross join z as z1 - where length(z.a || z1.a) < 5)) -select * from x; - QUERY PLAN ----------------------------------------------------------- - CTE Scan on x - Output: x.a - CTE x - -> Recursive Union - -> Values Scan on "*VALUES*" - Output: "*VALUES*".column1 - -> Nested Loop - Output: (z.a || z1.a) - Join Filter: (length((z.a || z1.a)) < 5) - CTE z - -> WorkTable Scan on x x_1 - Output: x_1.a - -> CTE Scan on z - Output: z.a - -> Materialize - Output: z1.a - -> CTE Scan on z z1 - Output: z1.a -(18 rows) - -with recursive x(a) as - ((values ('a'), ('b')) - union all - (with z as not materialized (select * from x) - select z.a || z1.a as a from z cross join z as z1 - where length(z.a || z1.a) < 5)) -select * from x; - a ------- - a - b - aa - ab - ba - bb - aaaa - aaab - aaba - aabb - abaa - abab - abba - abbb - baaa - baab - baba - babb - bbaa - bbab - 
bbba - bbbb -(22 rows) - -explain (verbose, costs off) -with recursive x(a) as - ((values ('a'), ('b')) - union all - (with z as not materialized (select * from x) - select z.a || z.a as a from z - where length(z.a || z.a) < 5)) -select * from x; - QUERY PLAN --------------------------------------------------------- - CTE Scan on x - Output: x.a - CTE x - -> Recursive Union - -> Values Scan on "*VALUES*" - Output: "*VALUES*".column1 - -> WorkTable Scan on x x_1 - Output: (x_1.a || x_1.a) - Filter: (length((x_1.a || x_1.a)) < 5) -(9 rows) - -with recursive x(a) as - ((values ('a'), ('b')) - union all - (with z as not materialized (select * from x) - select z.a || z.a as a from z - where length(z.a || z.a) < 5)) -select * from x; - a ------- - a - b - aa - bb - aaaa - bbbb -(6 rows) - --- Check handling of outer references -explain (verbose, costs off) -with x as (select * from int4_tbl) -select * from (with y as (select * from x) select * from y) ss; - QUERY PLAN ------------------------------------------- - Remote Subquery Scan on all (datanode_1) - Output: f1 - -> Seq Scan on public.int4_tbl - Output: int4_tbl.f1 -(4 rows) - -explain (verbose, costs off) -with x as materialized (select * from int4_tbl) -select * from (with y as (select * from x) select * from y) ss; - QUERY PLAN --------------------------------------------------- - CTE Scan on x - Output: x.f1 - CTE x - -> Remote Subquery Scan on all (datanode_1) - Output: int4_tbl.f1 - -> Seq Scan on public.int4_tbl - Output: int4_tbl.f1 -(7 rows) - --- Ensure that we inline the currect CTE when there are --- multiple CTEs with the same name -explain (verbose, costs off) -with x as (select 1 as y) -select * from (with x as (select 2 as y) select * from x) ss; - QUERY PLAN -------------- - Result - Output: 2 -(2 rows) - --- Row marks are not pushed into CTEs -explain (verbose, costs off) -with x as (select * from subselect_tbl) -select * from x for update; - QUERY PLAN ----------------------------------------------------------------------- - Remote Subquery Scan on all (datanode_1,datanode_2) - Output: subselect_tbl.f1, subselect_tbl.f2, subselect_tbl.f3 - -> Seq Scan on public.subselect_tbl - Output: subselect_tbl.f1, subselect_tbl.f2, subselect_tbl.f3 -(4 rows) - --- --- Tests for pulling up more sublinks --- -set enable_pullup_subquery to true; -create table tbl_a(a int,b int); -create table tbl_b(a int,b int); -insert into tbl_a select generate_series(1,10),1 ; -insert into tbl_b select generate_series(2,11),1 ; --- check targetlist subquery scenario. 
-set enable_nestloop to true; -set enable_hashjoin to false; -set enable_mergejoin to false; -explain select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; - QUERY PLAN -------------------------------------------------------------------------------------------------- - Remote Subquery Scan on all (datanode_1,datanode_2) (cost=15636.19..15637.88 rows=675 width=8) - -> Sort (cost=15636.19..15637.88 rows=675 width=8) - Sort Key: a.a, b.a - -> Nested Loop Left Scalar Join (cost=0.00..15604.47 rows=675 width=8) - Join Filter: (b.a = a.a) - -> Seq Scan on tbl_a a (cost=0.00..23.50 rows=1350 width=4) - -> Materialize (cost=0.00..30.25 rows=1350 width=4) - -> Seq Scan on tbl_b b (cost=0.00..23.50 rows=1350 width=4) -(8 rows) - -select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; - a | q -----+---- - 1 | - 2 | 2 - 3 | 3 - 4 | 4 - 5 | 5 - 6 | 6 - 7 | 7 - 8 | 8 - 9 | 9 - 10 | 10 -(10 rows) - -set enable_nestloop to false; -set enable_hashjoin to true; -set enable_mergejoin to false; -explain (costs off) select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; - QUERY PLAN ------------------------------------------------------ - Remote Subquery Scan on all (datanode_1,datanode_2) - -> Sort - Sort Key: a.a, b.a - -> Hash Left Scalar Join - Hash Cond: (a.a = b.a) - -> Seq Scan on tbl_a a - -> Hash - -> Seq Scan on tbl_b b -(8 rows) - -select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; - a | q -----+---- - 1 | - 2 | 2 - 3 | 3 - 4 | 4 - 5 | 5 - 6 | 6 - 7 | 7 - 8 | 8 - 9 | 9 - 10 | 10 -(10 rows) - -set enable_nestloop to false; -set enable_hashjoin to false; -set enable_mergejoin to true; -explain (costs off) select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; - QUERY PLAN ------------------------------------------------------ - Remote Subquery Scan on all (datanode_1,datanode_2) - -> Sort - Sort Key: a.a, b.a - -> Merge Left Scalar Join - Merge Cond: (a.a = b.a) - -> Sort - Sort Key: a.a - -> Seq Scan on tbl_a a - -> Sort - Sort Key: b.a - -> Seq Scan on tbl_b b -(11 rows) - -select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; - a | q -----+---- - 1 | - 2 | 2 - 3 | 3 - 4 | 4 - 5 | 5 - 6 | 6 - 7 | 7 - 8 | 8 - 9 | 9 - 10 | 10 -(10 rows) - --- check non-scalar scenario. 
-insert into tbl_b values(2,2); -set enable_nestloop to true; -set enable_hashjoin to false; -set enable_mergejoin to false; -explain (costs off) select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; - QUERY PLAN ------------------------------------------------------ - Remote Subquery Scan on all (datanode_1,datanode_2) - -> Sort - Sort Key: a.a, b.a - -> Nested Loop Left Scalar Join - Join Filter: (b.a = a.a) - -> Seq Scan on tbl_a a - -> Materialize - -> Seq Scan on tbl_b b -(8 rows) + -> Seq Scan on tbl_b b +(8 rows) select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; ERROR: more than one row returned by a subquery used as an expression @@ -1930,10 +1646,10 @@ select (case when a.b =1 then (select count(*) from tbl_b b where b.a = a.a and (10 rows) explain (costs off) select (case when a.b =1 then (select count(*) from tbl_b b where b.a = a.a and b.b = a.b and a.b is not null) else 0 end) from tbl_a a order by 1; - QUERY PLAN ------------------------------------------------------------------------------------------ + QUERY PLAN +---------------------------------------------------------------------------------------------------------------- Sort - Sort Key: (CASE WHEN (a.b = 1) THEN "TARGETLIST_subquery".count ELSE '0'::bigint END) + Sort Key: (CASE WHEN (a.b = 1) THEN COALESCE("TARGETLIST_subquery".count, '0'::bigint) ELSE '0'::bigint END) -> Nested Loop Left Join -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Seq Scan on tbl_a a @@ -1951,6 +1667,7 @@ explain (costs off) select (case when a.b =1 then (select count(*) from tbl_b b select (case when a.b =1 then (select count(*) from tbl_b b where b.a = a.a and b.b = a.b and a.b is not null) else 0 end) from tbl_a a order by 1; case ------ + 0 1 1 1 @@ -1960,9 +1677,312 @@ select (case when a.b =1 then (select count(*) from tbl_b b where b.a = a.a and 1 1 1 - (10 rows) +-- support pullup lateral ANY_SUBLINK +explain select * from tbl_a a where a.b IN (select b.a from tbl_b b where b.b > a.b); + QUERY PLAN +---------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) (cost=140.38..193.10 rows=225 width=8) + -> Hash Semi Join (cost=140.38..193.10 rows=225 width=8) + Hash Cond: (a.b = b.a) + Join Filter: (b.b > a.b) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=100.00..141.05 rows=1350 width=8) + Distribute results by H: b + -> Seq Scan on tbl_a a (cost=0.00..23.50 rows=1350 width=8) + -> Hash (cost=23.50..23.50 rows=1350 width=8) + -> Seq Scan on tbl_b b (cost=0.00..23.50 rows=1350 width=8) +(9 rows) + +select * from tbl_a a where a.b IN (select b.a from tbl_b b where b.b > a.b); + a | b +---+--- +(0 rows) + drop table tbl_a; drop table tbl_b; set enable_pullup_subquery to false; +-- +-- Tests for CTE inlining behavior +-- +-- Basic subquery that can be inlined +explain (verbose, costs off) +with x as (select * from (select f1 from subselect_tbl) ss) +select * from x where f1 = 1; + QUERY PLAN +------------------------------------------ + Remote Subquery Scan on all (datanode_1) + Output: subselect_tbl.f1 + -> Seq Scan on public.subselect_tbl + Output: subselect_tbl.f1 + Filter: (subselect_tbl.f1 = 1) +(5 rows) + +-- Explicitly request materialization +explain (verbose, costs off) +with x as materialized (select * from (select f1 from subselect_tbl) ss) +select * from x where f1 = 1; + QUERY PLAN +------------------------------------------------------------- 
+ CTE Scan on x + Output: x.f1 + Filter: (x.f1 = 1) + CTE x + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: subselect_tbl.f1 + -> Seq Scan on public.subselect_tbl + Output: subselect_tbl.f1 +(8 rows) + +-- Stable functions are safe to inline +explain (verbose, costs off) +with x as (select * from (select f1, now() from subselect_tbl) ss) +select * from x where f1 = 1; + QUERY PLAN +------------------------------------------ + Remote Subquery Scan on all (datanode_1) + Output: subselect_tbl.f1, now() + -> Seq Scan on public.subselect_tbl + Output: subselect_tbl.f1, now() + Filter: (subselect_tbl.f1 = 1) +(5 rows) + +-- Volatile functions prevent inlining +explain (verbose, costs off) +with x as (select * from (select f1, random() from subselect_tbl) ss) +select * from x where f1 = 1; + QUERY PLAN +------------------------------------------------------------- + CTE Scan on x + Output: x.f1, x.random + Filter: (x.f1 = 1) + CTE x + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: f1, random + -> Seq Scan on public.subselect_tbl + Output: subselect_tbl.f1, random() +(8 rows) + +-- SELECT FOR UPDATE cannot be inlined +explain (verbose, costs off) +with x as (select * from (select f1 from subselect_tbl for update) ss) +select * from x where f1 = 1; + QUERY PLAN +-------------------------------------------------------------------------- + CTE Scan on x + Output: x.f1 + Filter: (x.f1 = 1) + CTE x + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: ss.f1 + -> Subquery Scan on ss + Output: ss.f1 + -> LockRows + Output: subselect_tbl.f1, subselect_tbl.ctid + -> Seq Scan on public.subselect_tbl + Output: subselect_tbl.f1, subselect_tbl.ctid +(12 rows) + +-- Multiply-referenced CTEs are inlined only when requested +explain (verbose, costs off) +with x as (select * from (select f1, now() as n from subselect_tbl) ss) +select * from x, x x2 where x.n = x2.n; + QUERY PLAN +------------------------------------------------------------- + Merge Join + Output: x.f1, x.n, x2.f1, x2.n + Merge Cond: (x.n = x2.n) + CTE x + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: subselect_tbl.f1, now() + -> Seq Scan on public.subselect_tbl + Output: subselect_tbl.f1, now() + -> Sort + Output: x.f1, x.n + Sort Key: x.n + -> CTE Scan on x + Output: x.f1, x.n + -> Sort + Output: x2.f1, x2.n + Sort Key: x2.n + -> CTE Scan on x x2 + Output: x2.f1, x2.n +(18 rows) + +explain (verbose, costs off) +with x as not materialized (select * from (select f1, now() as n from subselect_tbl) ss) +select * from x, x x2 where x.n = x2.n; + QUERY PLAN +-------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + Output: subselect_tbl.f1, now(), subselect_tbl_1.f1, now() + -> Result + Output: subselect_tbl.f1, (now()), subselect_tbl_1.f1, (now()) + One-Time Filter: (now() = now()) + -> Nested Loop + Output: subselect_tbl.f1, (now()), subselect_tbl_1.f1, (now()) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: subselect_tbl.f1, now() + Distribute results by H: now() + -> Seq Scan on public.subselect_tbl + Output: subselect_tbl.f1, now() + -> Materialize + Output: subselect_tbl_1.f1, (now()) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: subselect_tbl_1.f1, now() + Distribute results by H: now() + -> Seq Scan on public.subselect_tbl subselect_tbl_1 + Output: subselect_tbl_1.f1, now() +(19 rows) + +-- Multiply-referenced CTEs can't be inlined if they contain outer 
self-refs +explain (verbose, costs off) +with recursive x(a) as + ((values ('a'), ('b')) + union all + (with z as not materialized (select * from x) + select z.a || z1.a as a from z cross join z as z1 + where length(z.a || z1.a) < 5)) +select * from x; + QUERY PLAN +---------------------------------------------------------- + CTE Scan on x + Output: x.a + CTE x + -> Recursive Union + -> Values Scan on "*VALUES*" + Output: "*VALUES*".column1 + -> Nested Loop + Output: (z.a || z1.a) + Join Filter: (length((z.a || z1.a)) < 5) + CTE z + -> WorkTable Scan on x x_1 + Output: x_1.a + -> CTE Scan on z + Output: z.a + -> Materialize + Output: z1.a + -> CTE Scan on z z1 + Output: z1.a +(18 rows) + +with recursive x(a) as + ((values ('a'), ('b')) + union all + (with z as not materialized (select * from x) + select z.a || z1.a as a from z cross join z as z1 + where length(z.a || z1.a) < 5)) +select * from x; + a +------ + a + b + aa + ab + ba + bb + aaaa + aaab + aaba + aabb + abaa + abab + abba + abbb + baaa + baab + baba + babb + bbaa + bbab + bbba + bbbb +(22 rows) + +explain (verbose, costs off) +with recursive x(a) as + ((values ('a'), ('b')) + union all + (with z as not materialized (select * from x) + select z.a || z.a as a from z + where length(z.a || z.a) < 5)) +select * from x; + QUERY PLAN +-------------------------------------------------------- + CTE Scan on x + Output: x.a + CTE x + -> Recursive Union + -> Values Scan on "*VALUES*" + Output: "*VALUES*".column1 + -> WorkTable Scan on x x_1 + Output: (x_1.a || x_1.a) + Filter: (length((x_1.a || x_1.a)) < 5) +(9 rows) + +with recursive x(a) as + ((values ('a'), ('b')) + union all + (with z as not materialized (select * from x) + select z.a || z.a as a from z + where length(z.a || z.a) < 5)) +select * from x; + a +------ + a + b + aa + bb + aaaa + bbbb +(6 rows) + +-- Check handling of outer references +explain (verbose, costs off) +with x as (select * from int4_tbl) +select * from (with y as (select * from x) select * from y) ss; + QUERY PLAN +------------------------------------------ + Remote Subquery Scan on all (datanode_1) + Output: f1 + -> Seq Scan on public.int4_tbl + Output: int4_tbl.f1 +(4 rows) + +explain (verbose, costs off) +with x as materialized (select * from int4_tbl) +select * from (with y as (select * from x) select * from y) ss; + QUERY PLAN +-------------------------------------------------- + CTE Scan on x + Output: x.f1 + CTE x + -> Remote Subquery Scan on all (datanode_1) + Output: int4_tbl.f1 + -> Seq Scan on public.int4_tbl + Output: int4_tbl.f1 +(7 rows) + +-- Ensure that we inline the currect CTE when there are +-- multiple CTEs with the same name +explain (verbose, costs off) +with x as (select 1 as y) +select * from (with x as (select 2 as y) select * from x) ss; + QUERY PLAN +------------- + Result + Output: 2 +(2 rows) + +-- Row marks are not pushed into CTEs +explain (verbose, costs off) +with x as (select * from subselect_tbl) +select * from x for update; + QUERY PLAN +---------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + Output: subselect_tbl.f1, subselect_tbl.f2, subselect_tbl.f3 + -> Seq Scan on public.subselect_tbl + Output: subselect_tbl.f1, subselect_tbl.f2, subselect_tbl.f3 +(4 rows) + diff --git a/src/test/regress/sql/subselect.sql b/src/test/regress/sql/subselect.sql index 68fa867e..01926d80 100644 --- a/src/test/regress/sql/subselect.sql +++ b/src/test/regress/sql/subselect.sql @@ -606,8 +606,8 @@ drop function tattle(x int, y 
int); set enable_pullup_subquery to true; create table tbl_a(a int,b int); create table tbl_b(a int,b int); -insert into tbl_a select generate_series(1,10),1 ; -insert into tbl_b select generate_series(2,11),1 ; +insert into tbl_a select generate_series(1,10),1; +insert into tbl_b select generate_series(2,11),1; -- check targetlist subquery scenario. set enable_nestloop to true; @@ -692,6 +692,11 @@ explain (costs off) select (case when a.b =1 then (select count(*) from tbl_b b select (case when a.b =1 then (select count(*) from tbl_b b where b.a = a.a and b.b = a.b and a.b in (1,2)) else 0 end) from tbl_a a order by 1; explain (costs off) select (case when a.b =1 then (select count(*) from tbl_b b where b.a = a.a and b.b = a.b and a.b is not null) else 0 end) from tbl_a a order by 1; select (case when a.b =1 then (select count(*) from tbl_b b where b.a = a.a and b.b = a.b and a.b is not null) else 0 end) from tbl_a a order by 1; + +-- support pullup lateral ANY_SUBLINK +explain select * from tbl_a a where a.b IN (select b.a from tbl_b b where b.b > a.b); +select * from tbl_a a where a.b IN (select b.a from tbl_b b where b.b > a.b); + drop table tbl_a; drop table tbl_b; set enable_pullup_subquery to false; From 60423cc3247048893d688f8d77eb25fee58fa180 Mon Sep 17 00:00:00 2001 From: ericxwu Date: Thu, 3 Sep 2020 20:54:28 +0800 Subject: [PATCH 045/578] Fix PortalDrop core when commandTag is NULL --- src/backend/utils/mmgr/portalmem.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/src/backend/utils/mmgr/portalmem.c b/src/backend/utils/mmgr/portalmem.c index d8d6079e..deb2b8d6 100644 --- a/src/backend/utils/mmgr/portalmem.c +++ b/src/backend/utils/mmgr/portalmem.c @@ -660,15 +660,18 @@ PortalDrop(Portal portal, bool isTopCommit) { #ifdef __TBASE__ - /* - * when dn recv rollback_subtxn, the resource already release by AbortSubTransaction, - * and the memory delete by CleanupSubTransaction (delete parent memory context op will delete child) - */ - if (strcmp(portal->commandTag, "ROLLBACK SUBTXN") == 0) - { - elog(LOG, "skip delete portal resowner"); - } - else + /* + * When CN/DN received rollback_subtxn, the resource already been + * released by AbortSubTransaction, and the memory delete by + * CleanupSubTransaction (delete parent memory context operation + * will delete child) + */ + if (portal->commandTag && + strcmp(portal->commandTag, "ROLLBACK SUBTXN") == 0) + { + elog(LOG, "skip delete portal resowner"); + } + else #endif { bool isCommit = (portal->status != PORTAL_FAILED); From 57c842e57fa83040d6d8f5581368cc6ac8ccca0f Mon Sep 17 00:00:00 2001 From: youngxie Date: Fri, 4 Sep 2020 10:25:36 +0800 Subject: [PATCH 046/578] Fix format --- src/backend/tcop/utility.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c index cd0252f9..f331a920 100644 --- a/src/backend/tcop/utility.c +++ b/src/backend/tcop/utility.c @@ -1737,10 +1737,12 @@ ProcessUtilityPost(PlannedStmt *pstmt, #ifdef __TBASE__ /* - * Forward specific DDLs request to leader cn - * on success return true else false + * Forward specific DDLs request to leader cn. + * + * On success return true else false. 
*/ -static bool forward_ddl_to_leader_cn(Node *node, const char *queryString) +static bool +forward_ddl_to_leader_cn(Node *node, const char *queryString) { Oid leader_cn = InvalidOid; char *leader_name = NULL; From 1dfdfcca27a32c4dcdf7bbc2fa4dae56cbd60cab Mon Sep 17 00:00:00 2001 From: whalesong Date: Fri, 4 Sep 2020 11:42:20 +0800 Subject: [PATCH 047/578] bugfix: select not committed when persistent_datanode_connections = on, https://git.code.oa.com/jasonysli/PG-XL-v10/merge_requests/5 --- src/backend/pgxc/pool/execRemote.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 36bdbed3..22c3fade 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -8874,16 +8874,17 @@ ExecRemoteQuery(PlanState *pstate) */ combiner->node_count = regular_conn_count; - /* - * Start transaction on data nodes if we are in explicit transaction - * or going to use extended query protocol or write to multiple nodes - */ - if (step->force_autocommit) - need_tran_block = false; - else - need_tran_block = step->cursor || - (!step->read_only && total_conn_count > 1) || - (TransactionBlockStatusCode() == 'T'); + /* + * Start transaction on data nodes if we are in explicit transaction + * or going to use extended query protocol or write to multiple nodes + */ + if (step->force_autocommit) + need_tran_block = false; + else + need_tran_block = step->cursor || + step->statement || node->rqs_num_params || + (!step->read_only && total_conn_count > 1) || + (TransactionBlockStatusCode() == 'T'); #ifdef __TBASE__ /* Set plpgsql transaction begin for all connections */ From f09fd4a587be84e40f8ca4233b5ad652c0268f94 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Sat, 8 Aug 2020 11:37:18 +0800 Subject: [PATCH 048/578] fix bug: when exec with+subquery sql ,report prepared statement already exists : http://tapd.oa.com/20421696/bugtrace/bugs/view?bug_id=1020421696081159381 --- src/backend/commands/prepare.c | 86 ++++++++++++++----------- src/test/regress/expected/prepare.out | 2 +- src/test/regress/expected/prepare_1.out | 2 +- 3 files changed, 51 insertions(+), 39 deletions(-) diff --git a/src/backend/commands/prepare.c b/src/backend/commands/prepare.c index e77df1ad..5a46fa7f 100644 --- a/src/backend/commands/prepare.c +++ b/src/backend/commands/prepare.c @@ -586,43 +586,55 @@ StorePreparedStatement(const char *stmt_name, bool from_sql, bool use_resowner) { - PreparedStatement *entry; - TimestampTz cur_ts = GetCurrentStatementStartTimestamp(); - bool found; - - /* Initialize the hash table, if necessary */ - if (!prepared_queries) - InitQueryHashTable(); - - /* Add entry to hash table */ - entry = (PreparedStatement *) hash_search(prepared_queries, - stmt_name, - HASH_ENTER, - &found); - - /* Shouldn't get a duplicate entry */ - if (found) - ereport(ERROR, - (errcode(ERRCODE_DUPLICATE_PSTATEMENT), - errmsg("prepared statement \"%s\" already exists", - stmt_name))); - - /* Fill in the hash table entry */ - entry->plansource = plansource; - entry->from_sql = from_sql; - entry->prepare_time = cur_ts; - entry->use_resowner = use_resowner; - - /* Now it's safe to move the CachedPlanSource to permanent memory */ - SaveCachedPlan(plansource); -#ifdef XCP - if (use_resowner) - { - ResourceOwnerEnlargePreparedStmts(CurTransactionResourceOwner); - ResourceOwnerRememberPreparedStmt(CurTransactionResourceOwner, - entry->stmt_name); - } -#endif + PreparedStatement *entry; + TimestampTz 
cur_ts = GetCurrentStatementStartTimestamp(); + bool found; + + /* Initialize the hash table, if necessary */ + if (!prepared_queries) + InitQueryHashTable(); + + /* Add entry to hash table */ + entry = (PreparedStatement *) hash_search(prepared_queries, + stmt_name, + HASH_ENTER, + &found); + + /* Shouldn't get a duplicate entry */ + if (found) + { + if (!(plansource->commandTag == entry->plansource->commandTag && + strcmp(plansource->query_string, entry->plansource->query_string) == 0)) + { + ereport(ERROR, + (errcode(ERRCODE_DUPLICATE_PSTATEMENT), + errmsg("prepared statement \"%s\" already exists, and plansource is not the same.", + stmt_name))); + } + else + { + elog(LOG, " \"%s\" already exists in prepared_queries, skip it.", stmt_name); + return ; + } + } + + /* Fill in the hash table entry */ + entry->plansource = plansource; + entry->from_sql = from_sql; + entry->prepare_time = cur_ts; + entry->use_resowner = use_resowner; + + /* Now it's safe to move the CachedPlanSource to permanent memory */ + SaveCachedPlan(plansource); + +#ifdef XCP + if (use_resowner) + { + ResourceOwnerEnlargePreparedStmts(CurTransactionResourceOwner); + ResourceOwnerRememberPreparedStmt(CurTransactionResourceOwner, + entry->stmt_name); + } +#endif } /* diff --git a/src/test/regress/expected/prepare.out b/src/test/regress/expected/prepare.out index 0b810146..787b242c 100644 --- a/src/test/regress/expected/prepare.out +++ b/src/test/regress/expected/prepare.out @@ -21,7 +21,7 @@ SELECT name, statement, parameter_types FROM pg_prepared_statements; -- should fail PREPARE q1 AS SELECT 2; -ERROR: prepared statement "q1" already exists +ERROR: prepared statement "q1" already exists, and plansource is not the same. -- should succeed DEALLOCATE q1; PREPARE q1 AS SELECT 2; diff --git a/src/test/regress/expected/prepare_1.out b/src/test/regress/expected/prepare_1.out index c1c15864..db1e190b 100644 --- a/src/test/regress/expected/prepare_1.out +++ b/src/test/regress/expected/prepare_1.out @@ -21,7 +21,7 @@ SELECT name, statement, parameter_types FROM pg_prepared_statements; -- should fail PREPARE q1 AS SELECT 2; -ERROR: prepared statement "q1" already exists +ERROR: prepared statement "q1" already exists, and plansource is not the same. -- should succeed DEALLOCATE q1; PREPARE q1 AS SELECT 2; From 47f7719806bd4404056feacdc112d7545c06e733 Mon Sep 17 00:00:00 2001 From: ericxwu Date: Wed, 9 Sep 2020 15:17:46 +0800 Subject: [PATCH 049/578] Support Fast Shipping Query if the subquery only contains constant value 1. Enable such optimization to avoid remote distribution the other hand relation rte. This will pushdown more select cases to datanode. 2. Add GUC enable_subquery_shipping for more potential subquery optimizations. 3. 
Fix deparse_query() for dual RTE --- src/backend/optimizer/util/pathnode.c | 34 +- src/backend/optimizer/util/pgxcship.c | 616 ++++++++++++++++--------- src/backend/utils/adt/ruleutils.c | 33 +- src/backend/utils/misc/guc.c | 10 + src/include/optimizer/pathnode.h | 1 + src/include/pgxc/locator.h | 19 +- src/test/regress/expected/sysviews.out | 3 +- src/test/regress/expected/xc_FQS_2.out | 30 ++ src/test/regress/sql/xc_FQS.sql | 9 + 9 files changed, 486 insertions(+), 269 deletions(-) diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index 1eefd477..eee0c16b 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -56,10 +56,12 @@ #ifdef __TBASE__ /*GUC parameter */ bool prefer_olap; - +/* Max replication level on join to make Query more efficient */ int replication_level; - +/* Restrict query to involved node as possible */ bool restrict_query = false; +/* Support fast query shipping for subquery */ +bool enable_subquery_shipping = false; #define REPLICATION_FACTOR 0.8 #endif @@ -2173,24 +2175,24 @@ set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) Expr *left_expr = left; Expr *right_expr = right; #endif - Oid leftType PG_USED_FOR_ASSERTS_ONLY = exprType((Node *) left); + Oid leftType PG_USED_FOR_ASSERTS_ONLY = exprType((Node *) left); #ifndef __TBASE__ Oid rightType PG_USED_FOR_ASSERTS_ONLY = exprType((Node *) right); #endif - Relids inner_rels = pathnode->innerjoinpath->parent->relids; - Relids outer_rels = pathnode->outerjoinpath->parent->relids; - QualCost cost; + Relids inner_rels = pathnode->innerjoinpath->parent->relids; + Relids outer_rels = pathnode->outerjoinpath->parent->relids; + QualCost cost; -#ifndef __TBASE__ - /* - * Check if both parts are of the same data type and choose - * distribution type to redistribute. - * XXX We may want more sophisticated algorithm to choose - * the best condition to redistribute parts along. - * For now use simple but reliable approach. - */ - if (leftType != rightType) - continue; +#ifndef __TBASE__ + /* + * Check if both parts are of the same data type and choose + * distribution type to redistribute. + * XXX We may want more sophisticated algorithm to choose + * the best condition to redistribute parts along. + * For now use simple but reliable approach. + */ + if (leftType != rightType) + continue; #endif #ifndef _PG_REGRESS_ { diff --git a/src/backend/optimizer/util/pgxcship.c b/src/backend/optimizer/util/pgxcship.c index f79eb3bd..ae1ca9d6 100644 --- a/src/backend/optimizer/util/pgxcship.c +++ b/src/backend/optimizer/util/pgxcship.c @@ -153,6 +153,7 @@ static ExecNodes *pgxc_FQS_datanodes_for_rtr(Index varno, Query *query); #ifdef __TBASE__ static ExecNodes* pgxc_is_group_subquery_shippable(Query *query, Shippability_context *sc_context); static void pgxc_is_rte_subquery_shippable(Node *node, Shippability_context *sc_context); +static bool pgxc_FQS_check_subquery_const(Query *query); #endif /* * Set the given reason in Shippability_context indicating why the query can not be @@ -205,6 +206,124 @@ pgxc_set_exprtype_shippability(Oid exprtype, Shippability_context *sc_context) pgxc_set_shippability_reason(sc_context, SS_UNSHIPPABLE_TYPE); } +#ifdef __TBASE__ +/* + * pgxc_FQS_check_const_recurse + * Recursively check the query node to see if it only contains constant values. + * We only support all constant values in same leaf nodes, correlated cases are + * not supported. 
+ */ +static bool +pgxc_FQS_check_const_recurse(Node *node, Query *query) +{ + if (!node) + return false; + + switch(nodeTag(node)) + { + case T_FromExpr: + { + FromExpr *from_expr = (FromExpr *)node; + ListCell *lcell; + bool result = true; + + /* + * Only support SELECT for now + */ + if (query->commandType != CMD_SELECT) + return false; + + /* + * Check the SetOperation to cover the case of + * '(const subquery) UNION (const subquery)...' + */ + if (!from_expr->fromlist) + { + if (query->setOperations && + IsA(query->setOperations, SetOperationStmt)) + { + return pgxc_FQS_check_const_recurse(query->setOperations, query); + } + return false; + } + + /* Check if all RTEs contains only constant values */ + foreach (lcell, from_expr->fromlist) + { + Node *fromlist_entry = lfirst(lcell); + + if (!pgxc_FQS_check_const_recurse(fromlist_entry, query)) + { + result = false; + } + } + return result; + } + case T_RangeTblRef: + { + RangeTblRef *rtr = (RangeTblRef *)node; + RangeTblEntry *rte = rt_fetch(rtr->rtindex, query->rtable); + + if (rte->rtekind == RTE_SUBQUERY) + { + return pgxc_FQS_check_subquery_const(rte->subquery); + } + return false; + } + case T_JoinExpr: + { + /* TODO: Not supported yet */ + return false; + } + case T_SetOperationStmt: + { + SetOperationStmt *setOp = (SetOperationStmt *)node; + + /* Only handle UNION cases */ + if (setOp->op == SETOP_UNION && + pgxc_FQS_check_const_recurse(setOp->larg, query) && + pgxc_FQS_check_const_recurse(setOp->rarg, query)) + { + return true; + } + return false; + } + default: + return false; + } + /* Keep compiler happy */ + return false; +} + +/* + * pgxc_FQS_check_subquery_const + * Check the query node to see if it only contains constant values, we could + * provide more shipping optimizations based on this hint. + */ +static bool +pgxc_FQS_check_subquery_const(Query *query) +{ + ListCell *lc; + bool result = true; + + /* If all target list entries are T_Const, then we are done. */ + foreach(lc, query->targetList) + { + TargetEntry *tle = lfirst(lc); + if (!IsA(tle->expr, Const)) + { + result = false; + } + } + + if (result == true) + return true; + + /* Otherwise, check if all RTEs are const */ + return pgxc_FQS_check_const_recurse((Node *)query->jointree, query); +} +#endif + /* * pgxc_FQS_datanodes_for_rtr * For a given RangeTblRef find the datanodes where corresponding data is @@ -254,19 +373,40 @@ pgxc_FQS_datanodes_for_rtr(Index varno, Query *query) return NULL; #endif - return pgxc_FQS_get_relation_nodes(rte, varno, query); - } - break; + return pgxc_FQS_get_relation_nodes(rte, varno, query); + } + break; + case RTE_SUBQUERY: +#ifdef __TBASE__ + { + Query *subquery = rte->subquery; - /* For any other type of RTE, we return NULL for now */ - case RTE_JOIN: - case RTE_CTE: - case RTE_SUBQUERY: - case RTE_FUNCTION: - case RTE_VALUES: - default: - return NULL; - } + /* + * Current we only consider the case if subquery only contains + * constant values. If so, we can treat them as replicated RTE. 
+ */ + if (enable_subquery_shipping && + pgxc_FQS_check_subquery_const(subquery)) + { + ExecNodes *exec_nodes = makeNode(ExecNodes); + exec_nodes->baselocatortype = LOCATOR_TYPE_REPLICATED; + /* No locate info stored for such subquery RTEs, we use this + * flag to force using the other hand locate info */ + exec_nodes->const_subquery = true; + + return exec_nodes; + } + return NULL; + } +#endif + /* For any other type of RTE, we return NULL for now */ + case RTE_JOIN: + case RTE_CTE: + case RTE_FUNCTION: + case RTE_VALUES: + default: + return NULL; + } } /* @@ -276,85 +416,85 @@ pgxc_FQS_datanodes_for_rtr(Index varno, Query *query) */ static ExecNodes * pgxc_FQS_find_datanodes_recurse(Node *node, Query *query, Bitmapset **relids) -{// #lizard forgives - List *query_rtable = query->rtable; - - if (!node) - return NULL; - - switch(nodeTag(node)) - { - case T_FromExpr: - { - FromExpr *from_expr = (FromExpr *)node; - ListCell *lcell; - bool first; - Bitmapset *from_relids; - ExecNodes *result_en; - - /* - * For INSERT commands, we won't have any entries in the from list. - * Get the datanodes using the resultRelation index. - */ - if (query->commandType != CMD_SELECT && !from_expr->fromlist) - { - *relids = bms_make_singleton(query->resultRelation); - return pgxc_FQS_datanodes_for_rtr(query->resultRelation, - query); - } - - /* - * All the entries in the From list are considered to be INNER - * joined with the quals as the JOIN condition. Get the datanodes - * for the first entry in the From list. For every subsequent entry - * determine whether the join between the relation in that entry and - * the cumulative JOIN of previous entries can be pushed down to the - * datanodes and the corresponding set of datanodes where the join - * can be pushed down. - */ - first = true; - result_en = NULL; - from_relids = NULL; - foreach (lcell, from_expr->fromlist) - { - Node *fromlist_entry = lfirst(lcell); - Bitmapset *fle_relids = NULL; - ExecNodes *tmp_en; - ExecNodes *en = pgxc_FQS_find_datanodes_recurse(fromlist_entry, - query, &fle_relids); - /* - * If any entry in fromlist is not shippable, jointree is not - * shippable - */ - if (!en) - { - FreeExecNodes(&result_en); - return NULL; - } - - /* FQS does't ship a DML with more than one relation involved */ - if (!first && query->commandType != CMD_SELECT) - { - FreeExecNodes(&result_en); - return NULL; - } - - if (first) - { - first = false; - result_en = en; - from_relids = fle_relids; - continue; - } +{ + List *query_rtable = query->rtable; + + if (!node) + return NULL; + + switch(nodeTag(node)) + { + case T_FromExpr: + { + FromExpr *from_expr = (FromExpr *)node; + ListCell *lcell; + bool first; + Bitmapset *from_relids; + ExecNodes *result_en; + + /* + * For INSERT commands, we won't have any entries in the from list. + * Get the datanodes using the resultRelation index. + */ + if (query->commandType != CMD_SELECT && !from_expr->fromlist) + { + *relids = bms_make_singleton(query->resultRelation); + return pgxc_FQS_datanodes_for_rtr(query->resultRelation, + query); + } - tmp_en = result_en; - /* - * Check whether the JOIN is pushable to the datanodes and - * find the datanodes where the JOIN can be pushed to - */ - result_en = pgxc_is_join_shippable(result_en, en, from_relids, - fle_relids, JOIN_INNER, - make_ands_implicit((Expr *)from_expr->quals), + /* + * All the entries in the From list are considered to be INNER + * joined with the quals as the JOIN condition. Get the datanodes + * for the first entry in the From list. 
For every subsequent entry + * determine whether the join between the relation in that entry and + * the cumulative JOIN of previous entries can be pushed down to the + * datanodes and the corresponding set of datanodes where the join + * can be pushed down. + */ + first = true; + result_en = NULL; + from_relids = NULL; + foreach (lcell, from_expr->fromlist) + { + Node *fromlist_entry = lfirst(lcell); + Bitmapset *fle_relids = NULL; + ExecNodes *tmp_en; + ExecNodes *en = pgxc_FQS_find_datanodes_recurse(fromlist_entry, + query, &fle_relids); + /* + * If any entry in fromlist is not shippable, jointree is not + * shippable + */ + if (!en) + { + FreeExecNodes(&result_en); + return NULL; + } + + /* FQS does't ship a DML with more than one relation involved */ + if (!first && query->commandType != CMD_SELECT) + { + FreeExecNodes(&result_en); + return NULL; + } + + if (first) + { + first = false; + result_en = en; + from_relids = fle_relids; + continue; + } + + tmp_en = result_en; + /* + * Check whether the JOIN is pushable to the datanodes and + * find the datanodes where the JOIN can be pushed to + */ + result_en = pgxc_is_join_shippable(result_en, en, from_relids, + fle_relids, JOIN_INNER, + make_ands_implicit((Expr *)from_expr->quals), #ifdef __TBASE__ query, #endif @@ -2343,48 +2483,52 @@ pgxc_is_join_shippable(ExecNodes *inner_en, ExecNodes *outer_en, Relids in_relid #ifdef __TBASE__ Query *query, #endif - List *rtables) -{// #lizard forgives - bool merge_nodes = false; - - /* - * If either of inner_en or outer_en is NULL, return NULL. We can't ship the - * join when either of the sides do not have datanodes to ship to. - */ - if (!outer_en || !inner_en) - return NULL; - /* - * We only support reduction of INNER, LEFT [OUTER] and FULL [OUTER] joins. - * RIGHT [OUTER] join is converted to LEFT [OUTER] join during join tree - * deconstruction. - */ - if (jointype != JOIN_INNER && jointype != JOIN_LEFT && jointype != JOIN_FULL) - return NULL; - - /* If both sides are replicated or have single node each, we ship any kind of JOIN */ - if ((IsExecNodesReplicated(inner_en) && IsExecNodesReplicated(outer_en)) || - (list_length(inner_en->nodeList) == 1 && - list_length(outer_en->nodeList) == 1)) - merge_nodes = true; - - /* If both sides are distributed, ... */ - else if (IsExecNodesColumnDistributed(inner_en) && - IsExecNodesColumnDistributed(outer_en)) - { - /* - * If two sides are distributed in the same manner by a value, with an - * equi-join on the distribution column and that condition - * is shippable, ship the join if node lists from both sides can be - * merged. - */ - if (inner_en->baselocatortype == outer_en->baselocatortype && - IsExecNodesDistributedByValue(inner_en)) - { - Expr *equi_join_expr = pgxc_find_dist_equijoin_qual(in_relids, - out_relids, InvalidOid, - (Node *)join_quals, rtables); - if (equi_join_expr && pgxc_is_expr_shippable(equi_join_expr, NULL)) - merge_nodes = true; + List *rtables) +{ + bool merge_nodes = false; + + /* + * If either of inner_en or outer_en is NULL, return NULL. We can't ship the + * join when either of the sides do not have datanodes to ship to. + */ + if (!outer_en || !inner_en) + return NULL; + /* + * We only support reduction of INNER, LEFT [OUTER] and FULL [OUTER] joins. + * RIGHT [OUTER] join is converted to LEFT [OUTER] join during join tree + * deconstruction. 
+ */ + if (jointype != JOIN_INNER && jointype != JOIN_LEFT && jointype != JOIN_FULL) + return NULL; + + /* + * If both sides are replicated or have single node each, we ship any kind + * of JOIN + */ + if ((IsExecNodesReplicated(inner_en) && IsExecNodesReplicated(outer_en) && + !inner_en->const_subquery && !outer_en->const_subquery) || + (list_length(inner_en->nodeList) == 1 && + list_length(outer_en->nodeList) == 1)) + merge_nodes = true; + + /* If both sides are distributed, ... */ + else if (IsExecNodesColumnDistributed(inner_en) && + IsExecNodesColumnDistributed(outer_en)) + { + /* + * If two sides are distributed in the same manner by a value, with an + * equi-join on the distribution column and that condition + * is shippable, ship the join if node lists from both sides can be + * merged. + */ + if (inner_en->baselocatortype == outer_en->baselocatortype && + IsExecNodesDistributedByValue(inner_en)) + { + Expr *equi_join_expr = pgxc_find_dist_equijoin_qual(in_relids, + out_relids, InvalidOid, + (Node *)join_quals, rtables); + if (equi_join_expr && pgxc_is_expr_shippable(equi_join_expr, NULL)) + merge_nodes = true; #ifdef __TBASE__ if (merge_nodes && restrict_query && query->commandType == CMD_SELECT) { @@ -2479,108 +2623,122 @@ pgxc_is_join_shippable(ExecNodes *inner_en, ExecNodes *outer_en, Relids in_relid } } #endif - } - } - /* - * If outer side is distributed and inner side is replicated, we can ship - * LEFT OUTER and INNER join. - */ - else if (IsExecNodesColumnDistributed(outer_en) && - IsExecNodesReplicated(inner_en) && - (jointype == JOIN_INNER || jointype == JOIN_LEFT)) - { - merge_nodes = true; + } + } + /* + * If outer side is distributed and inner side is replicated, we can ship + * LEFT OUTER and INNER join. + */ + else if (IsExecNodesColumnDistributed(outer_en) && + IsExecNodesReplicated(inner_en) && + (jointype == JOIN_INNER || jointype == JOIN_LEFT)) + { + merge_nodes = true; #ifdef __TBASE__ - if (restrict_query) - { - if (query->commandType == CMD_SELECT) - { - if (!outer_en->restrict_shippable) - { - List *nodelist = NULL; - - if (jointype == JOIN_INNER) - { - nodelist = pgxc_find_dist_equi_nodes(in_relids, - out_relids, InvalidOid, - (Node *)join_quals, rtables); - if (nodelist && !list_difference_int(nodelist, inner_en->nodeList)) - { - ExecNodes *merged_en = makeNode(ExecNodes); - merged_en->nodeList = nodelist; - merged_en->baselocatortype = outer_en->baselocatortype; - merged_en->restrict_shippable = true; - return merged_en; - } - } - - if (jointype == JOIN_INNER || jointype == JOIN_LEFT) - { - nodelist = pgxc_find_dist_equi_nodes(in_relids, - out_relids, InvalidOid, - (Node *)make_ands_implicit((Expr *)query->jointree->quals), rtables); - if (nodelist && !list_difference_int(nodelist, inner_en->nodeList)) - { - ExecNodes *merged_en = makeNode(ExecNodes); - merged_en->nodeList = nodelist; - merged_en->baselocatortype = outer_en->baselocatortype; - merged_en->restrict_shippable = true; - return merged_en; - } - } - } + /* + * Push down to restrict datanodes based if join is on distributed + * column or related qual + */ + if (restrict_query && + query->commandType == CMD_SELECT && + !outer_en->restrict_shippable) + { + List *nodelist = NULL; + + if (jointype == JOIN_INNER) + { + nodelist = pgxc_find_dist_equi_nodes(in_relids, + out_relids, InvalidOid, + (Node *)join_quals, rtables); + if (nodelist && !list_difference_int(nodelist, inner_en->nodeList)) + { + ExecNodes *merged_en = makeNode(ExecNodes); + merged_en->nodeList = nodelist; + 
merged_en->baselocatortype = outer_en->baselocatortype; + merged_en->restrict_shippable = true; + return merged_en; + } + } - return pgxc_merge_exec_nodes(inner_en, outer_en); - } - } + if (jointype == JOIN_INNER || jointype == JOIN_LEFT) + { + nodelist = pgxc_find_dist_equi_nodes(in_relids, + out_relids, InvalidOid, + (Node *)make_ands_implicit((Expr *)query->jointree->quals), rtables); + if (nodelist && !list_difference_int(nodelist, inner_en->nodeList)) + { + ExecNodes *merged_en = makeNode(ExecNodes); + merged_en->nodeList = nodelist; + merged_en->baselocatortype = outer_en->baselocatortype; + merged_en->restrict_shippable = true; + return merged_en; + } + } + } + + /* Inner side is constant subquery */ + if (enable_subquery_shipping && inner_en->const_subquery) + { + ExecNodes *merged_en = makeNode(ExecNodes); + merged_en->nodeList = list_copy(outer_en->nodeList); + merged_en->baselocatortype = outer_en->baselocatortype; + return merged_en; + } #endif - } - /* - * If outer side is replicated and inner side is distributed, we can ship - * only for INNER join. - */ - else if (IsExecNodesReplicated(outer_en) && - IsExecNodesColumnDistributed(inner_en) && - jointype == JOIN_INNER) - { - merge_nodes = true; + } + /* + * If outer side is replicated and inner side is distributed, we can ship + * only for INNER join. + */ + else if (IsExecNodesReplicated(outer_en) && + IsExecNodesColumnDistributed(inner_en) && + jointype == JOIN_INNER) + { + merge_nodes = true; #ifdef __TBASE__ - if (restrict_query) - { - if (query->commandType == CMD_SELECT) - { - if (!inner_en->restrict_shippable) - { - List *nodelist = NULL; - - nodelist = pgxc_find_dist_equi_nodes(in_relids, - out_relids, InvalidOid, - (Node *)join_quals, rtables); - if (nodelist && !list_difference_int(nodelist, outer_en->nodeList)) - { - ExecNodes *merged_en = makeNode(ExecNodes); - merged_en->nodeList = nodelist; - merged_en->baselocatortype = inner_en->baselocatortype; - merged_en->restrict_shippable = true; - return merged_en; - } - - nodelist = pgxc_find_dist_equi_nodes(in_relids, - out_relids, InvalidOid, - (Node *)make_ands_implicit((Expr *)query->jointree->quals), rtables); - if (nodelist && !list_difference_int(nodelist, outer_en->nodeList)) - { - ExecNodes *merged_en = makeNode(ExecNodes); - merged_en->nodeList = nodelist; - merged_en->baselocatortype = inner_en->baselocatortype; - merged_en->restrict_shippable = true; - return merged_en; - } - } + /* + * Push down to restrict datanodes based if join is on distributed + * column or related qual + */ + if (restrict_query && + query->commandType == CMD_SELECT && + !inner_en->restrict_shippable) + { + List *nodelist = NULL; + + nodelist = pgxc_find_dist_equi_nodes(in_relids, + out_relids, InvalidOid, + (Node *)join_quals, rtables); + if (nodelist && !list_difference_int(nodelist, outer_en->nodeList)) + { + ExecNodes *merged_en = makeNode(ExecNodes); + merged_en->nodeList = nodelist; + merged_en->baselocatortype = inner_en->baselocatortype; + merged_en->restrict_shippable = true; + return merged_en; + } - return pgxc_merge_exec_nodes(inner_en, outer_en); - } - } + nodelist = pgxc_find_dist_equi_nodes(in_relids, + out_relids, InvalidOid, + (Node *)make_ands_implicit((Expr *)query->jointree->quals), rtables); + if (nodelist && !list_difference_int(nodelist, outer_en->nodeList)) + { + ExecNodes *merged_en = makeNode(ExecNodes); + merged_en->nodeList = nodelist; + merged_en->baselocatortype = inner_en->baselocatortype; + merged_en->restrict_shippable = true; + return merged_en; + } + } 
+ + /* Outer side is constant subquery */ + if (enable_subquery_shipping && outer_en->const_subquery) + { + ExecNodes *merged_en = makeNode(ExecNodes); + merged_en->nodeList = list_copy(inner_en->nodeList); + merged_en->baselocatortype = inner_en->baselocatortype; + return merged_en; + } #endif } /* diff --git a/src/backend/utils/adt/ruleutils.c b/src/backend/utils/adt/ruleutils.c index feb22b86..dac8f826 100644 --- a/src/backend/utils/adt/ruleutils.c +++ b/src/backend/utils/adt/ruleutils.c @@ -10606,20 +10606,25 @@ get_from_clause_item(Node *jtnode, Query *query, deparse_context *context) printalias = true; } #ifdef PGXC - else if (rte->rtekind == RTE_SUBQUERY && rte->eref->aliasname) - { - /* - * - * This condition arises when the from clause is a view. The - * corresponding subquery RTE has its eref set to view name. - * The remote query generated has this subquery of which the - * columns can be referred to as view_name.col1, so it should - * be possible to refer to this subquery object. - */ - appendStringInfo(buf, " %s", - quote_identifier(rte->eref->aliasname)); - printalias = true; - } + else if (rte->rtekind == RTE_SUBQUERY && rte->eref->aliasname) + { + /* + * This condition arises when the from clause is a view. The + * corresponding subquery RTE has its eref set to view name. + * The remote query generated has this subquery of which the + * columns can be referred to as view_name.col1, so it should + * be possible to refer to this subquery object. + */ + appendStringInfo(buf, " %s", + quote_identifier(rte->eref->aliasname)); + + /* + * For 'dual' rte, the aliasname is also 'dual', print alias will + * lead to syntax error. + */ + if (strcmp(rte->eref->aliasname, "dual") != 0) + printalias = true; + } #endif else if (rte->rtekind == RTE_FUNCTION) { diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 93dc7020..01405c31 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -2312,6 +2312,16 @@ static struct config_bool ConfigureNamesBool[] = false, NULL, NULL, NULL }, + + { + {"enable_subquery_shipping", PGC_USERSET, CUSTOM_OPTIONS, + gettext_noop("support fast query shipping for subquery"), + NULL + }, + &enable_subquery_shipping, + true, + NULL, NULL, NULL + }, #endif #ifdef _MIGRATE_ diff --git a/src/include/optimizer/pathnode.h b/src/include/optimizer/pathnode.h index e27c80ff..f1ff4710 100644 --- a/src/include/optimizer/pathnode.h +++ b/src/include/optimizer/pathnode.h @@ -358,6 +358,7 @@ extern void contains_remotesubplan(Path *path, int *number, bool *redistribute); extern int replication_level; extern bool restrict_query; +extern bool enable_subquery_shipping; #endif #endif /* PATHNODE_H */ diff --git a/src/include/pgxc/locator.h b/src/include/pgxc/locator.h index 26209a93..c6218522 100644 --- a/src/include/pgxc/locator.h +++ b/src/include/pgxc/locator.h @@ -96,21 +96,22 @@ typedef struct */ typedef struct { - NodeTag type; - List *primarynodelist; - List *nodeList; - char baselocatortype; - Expr *en_expr; /* expression to evaluate at execution time if planner - * can not determine execution nodes */ + NodeTag type; + List *primarynodelist; + List *nodeList; + char baselocatortype; + Expr *en_expr; /* expression to evaluate at execution time if planner + * can not determine execution nodes */ #ifdef __COLD_HOT__ Expr *sec_en_expr; /* Sec Expression to evaluate at execution time * if planner can not determine execution * nodes */ #endif - Oid en_relid; /* Relation to determine execution nodes */ - RelationAccessType 
accesstype; /* Access type to determine execution nodes */ + Oid en_relid; /* Relation to determine execution nodes */ + RelationAccessType accesstype; /* Access type to determine execution nodes */ #ifdef __TBASE__ - bool restrict_shippable; + bool restrict_shippable; /* The ExecNode is choose by join qual on distribute column */ + bool const_subquery; /* The subquery rte only got constant values */ #endif } ExecNodes; diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out index e2765dd9..06796d76 100644 --- a/src/test/regress/expected/sysviews.out +++ b/src/test/regress/expected/sysviews.out @@ -122,10 +122,11 @@ select name, setting from pg_settings where name like 'enable%'; enable_shard_statistic | on enable_sort | on enable_statistic | on + enable_subquery_shipping | on enable_tidscan | on enable_transparent_crypt | on enable_user_authority_force_check | off -(53 rows) +(55 rows) -- Test that the pg_timezone_names and pg_timezone_abbrevs views are -- more-or-less working. We can't test their contents in any great detail diff --git a/src/test/regress/expected/xc_FQS_2.out b/src/test/regress/expected/xc_FQS_2.out index 9f1e51d1..e3e73168 100644 --- a/src/test/regress/expected/xc_FQS_2.out +++ b/src/test/regress/expected/xc_FQS_2.out @@ -1607,6 +1607,36 @@ select * from tab1_replicated where val = 7; -----+------ (0 rows) +-- Constant subquery +create table subquery_fqs(id int, a varchar, c int); +insert into subquery_fqs values(1,'gd', 2); +insert into subquery_fqs values(1,'zj', 2); +insert into subquery_fqs values(1,'sz', 2); +explain select * from subquery_fqs t join (select 1 id, 'gd' a, 2 c from dual union select 1 id, 'sz' a, 2 c union select 1 id, 'zj' a, 2 c from dual) t2 ON (t.id = t2.id and t.a = t2.a); + QUERY PLAN +------------------------------------------------------------------------------ + Remote Fast Query Execution (cost=0.00..0.00 rows=0 width=0) + Node/s: datanode_1, datanode_2 + -> Hash Join (cost=0.19..25.60 rows=1 width=80) + Hash Cond: ((t.id = (1)) AND ((t.a)::text = ('gd'::text))) + -> Seq Scan on subquery_fqs t (cost=0.00..18.80 rows=880 width=40) + -> Hash (cost=0.14..0.14 rows=3 width=40) + -> HashAggregate (cost=0.08..0.11 rows=3 width=40) + Group Key: (1), ('gd'::text), (2) + -> Append (cost=0.00..0.06 rows=3 width=40) + -> Result (cost=0.00..0.01 rows=1 width=40) + -> Result (cost=0.00..0.01 rows=1 width=40) + -> Result (cost=0.00..0.01 rows=1 width=40) +(12 rows) + +select * from subquery_fqs t join (select 1 id, 'gd' a, 2 c from dual union select 1 id, 'sz' a, 2 c union select 1 id, 'zj' a, 2 c from dual) t2 ON (t.id = t2.id and t.a = t2.a); + id | a | c | id | a | c +----+----+---+----+----+--- + 1 | gd | 2 | 1 | gd | 2 + 1 | zj | 2 | 1 | zj | 2 + 1 | sz | 2 | 1 | sz | 2 +(3 rows) + drop table tab1_rr; drop table tab1_hash; drop table tab1_modulo; diff --git a/src/test/regress/sql/xc_FQS.sql b/src/test/regress/sql/xc_FQS.sql index a6d6f15f..bdb9c02a 100644 --- a/src/test/regress/sql/xc_FQS.sql +++ b/src/test/regress/sql/xc_FQS.sql @@ -275,6 +275,15 @@ delete from tab1_replicated where val = 7; explain (verbose on, costs off) delete from tab1_replicated where val = 7; select * from tab1_replicated where val = 7; +-- Constant subquery +create table subquery_fqs(id int, a varchar, c int); +insert into subquery_fqs values(1,'gd', 2); +insert into subquery_fqs values(1,'zj', 2); +insert into subquery_fqs values(1,'sz', 2); +explain select * from subquery_fqs t join (select 1 id, 'gd' a, 2 c from dual union select 1 
id, 'sz' a, 2 c union select 1 id, 'zj' a, 2 c from dual) t2 ON (t.id = t2.id and t.a = t2.a); +select * from subquery_fqs t join (select 1 id, 'gd' a, 2 c from dual union select 1 id, 'sz' a, 2 c union select 1 id, 'zj' a, 2 c from dual) t2 ON (t.id = t2.id and t.a = t2.a); + + drop table tab1_rr; drop table tab1_hash; drop table tab1_modulo; From fa58aa7a75035b8dbf8fba1735744847bd39eb32 Mon Sep 17 00:00:00 2001 From: qiannzhang Date: Wed, 9 Sep 2020 19:10:36 +0800 Subject: [PATCH 050/578] Fix subquery's pathkey ID81711417. The fix is from the second issue of pg commit 24c19e9f668. convert_subquery_pathkeys would create pathkeys for subquery output values if they match any EquivalenceClass known in the outer query and are available in the subquery's syntactic targetlist. However, the second part of that condition is wrong, because such values might not appear in the subquery relation's reltarget list, which would mean that they couldn't be accessed above the level of the subquery scan. We must check that they appear in the reltarget list, instead. This can lead to dropping knowledge about the subquery's sort ordering, but I believe it's okay, because any sort key that the outer query actually has any interest in would appear in the reltarget list. --- src/backend/access/transam/gtm.c | 1 + src/backend/optimizer/path/pathkeys.c | 424 +++++++++++++----------- src/backend/pgxc/pool/poolmgr.c | 1 + src/test/regress/expected/subselect.out | 31 ++ src/test/regress/sql/subselect.sql | 28 ++ 5 files changed, 289 insertions(+), 196 deletions(-) diff --git a/src/backend/access/transam/gtm.c b/src/backend/access/transam/gtm.c index 267c5b88..d2557802 100644 --- a/src/backend/access/transam/gtm.c +++ b/src/backend/access/transam/gtm.c @@ -40,6 +40,7 @@ #include "utils/tqual.h" #include "pgxc/nodemgr.h" #include "access/xlog.h" +#include "storage/lmgr.h" #endif /* To access sequences */ diff --git a/src/backend/optimizer/path/pathkeys.c b/src/backend/optimizer/path/pathkeys.c index 804c58a8..8587420d 100644 --- a/src/backend/optimizer/path/pathkeys.c +++ b/src/backend/optimizer/path/pathkeys.c @@ -29,6 +29,7 @@ static bool pathkey_is_redundant(PathKey *new_pathkey, List *pathkeys); +static Var *find_var_for_subquery_tle(RelOptInfo *rel, TargetEntry *tle); static bool right_merge_direction(PlannerInfo *root, PathKey *pathkey); @@ -599,206 +600,237 @@ build_expression_pathkey(PlannerInfo *root, * 'subquery_pathkeys': the subquery's output pathkeys, in its terms. * 'subquery_tlist': the subquery's output targetlist, in its terms. * - * It is not necessary for caller to do truncate_useless_pathkeys(), - * because we select keys in a way that takes usefulness of the keys into - * account. + * We intentionally don't do truncate_useless_pathkeys() here, because there + * are situations where seeing the raw ordering of the subquery is helpful. + * For example, if it returns ORDER BY x DESC, that may prompt us to + * construct a mergejoin using DESC order rather than ASC order; but the + * right_merge_direction heuristic would have us throw the knowledge away. 
*/ List * convert_subquery_pathkeys(PlannerInfo *root, RelOptInfo *rel, - List *subquery_pathkeys, - List *subquery_tlist) -{// #lizard forgives - List *retval = NIL; - int retvallen = 0; - int outer_query_keys = list_length(root->query_pathkeys); - ListCell *i; - - foreach(i, subquery_pathkeys) - { - PathKey *sub_pathkey = (PathKey *) lfirst(i); - EquivalenceClass *sub_eclass = sub_pathkey->pk_eclass; - PathKey *best_pathkey = NULL; - - if (sub_eclass->ec_has_volatile) - { - /* - * If the sub_pathkey's EquivalenceClass is volatile, then it must - * have come from an ORDER BY clause, and we have to match it to - * that same targetlist entry. - */ - TargetEntry *tle; - - if (sub_eclass->ec_sortref == 0) /* can't happen */ - elog(ERROR, "volatile EquivalenceClass has no sortref"); - tle = get_sortgroupref_tle(sub_eclass->ec_sortref, subquery_tlist); - Assert(tle); - /* resjunk items aren't visible to outer query */ - if (!tle->resjunk) - { - /* We can represent this sub_pathkey */ - EquivalenceMember *sub_member; - Expr *outer_expr; - EquivalenceClass *outer_ec; - - Assert(list_length(sub_eclass->ec_members) == 1); - sub_member = (EquivalenceMember *) linitial(sub_eclass->ec_members); - outer_expr = (Expr *) makeVarFromTargetEntry(rel->relid, tle); - - /* - * Note: it might look funny to be setting sortref = 0 for a - * reference to a volatile sub_eclass. However, the - * expression is *not* volatile in the outer query: it's just - * a Var referencing whatever the subquery emitted. (IOW, the - * outer query isn't going to re-execute the volatile - * expression itself.) So this is okay. Likewise, it's - * correct to pass nullable_relids = NULL, because we're - * underneath any outer joins appearing in the outer query. - */ - outer_ec = - get_eclass_for_sort_expr(root, - outer_expr, - NULL, - sub_eclass->ec_opfamilies, - sub_member->em_datatype, - sub_eclass->ec_collation, - 0, - rel->relids, - false); - - /* - * If we don't find a matching EC, sub-pathkey isn't - * interesting to the outer query - */ - if (outer_ec) - best_pathkey = - make_canonical_pathkey(root, - outer_ec, - sub_pathkey->pk_opfamily, - sub_pathkey->pk_strategy, - sub_pathkey->pk_nulls_first); - } - } - else - { - /* - * Otherwise, the sub_pathkey's EquivalenceClass could contain - * multiple elements (representing knowledge that multiple items - * are effectively equal). Each element might match none, one, or - * more of the output columns that are visible to the outer query. - * This means we may have multiple possible representations of the - * sub_pathkey in the context of the outer query. Ideally we - * would generate them all and put them all into an EC of the - * outer query, thereby propagating equality knowledge up to the - * outer query. Right now we cannot do so, because the outer - * query's EquivalenceClasses are already frozen when this is - * called. Instead we prefer the one that has the highest "score" - * (number of EC peers, plus one if it matches the outer - * query_pathkeys). This is the most likely to be useful in the - * outer query. 
- */ - int best_score = -1; - ListCell *j; - - foreach(j, sub_eclass->ec_members) - { - EquivalenceMember *sub_member = (EquivalenceMember *) lfirst(j); - Expr *sub_expr = sub_member->em_expr; - Oid sub_expr_type = sub_member->em_datatype; - Oid sub_expr_coll = sub_eclass->ec_collation; - ListCell *k; - - if (sub_member->em_is_child) - continue; /* ignore children here */ - - foreach(k, subquery_tlist) - { - TargetEntry *tle = (TargetEntry *) lfirst(k); - Expr *tle_expr; - Expr *outer_expr; - EquivalenceClass *outer_ec; - PathKey *outer_pk; - int score; - - /* resjunk items aren't visible to outer query */ - if (tle->resjunk) - continue; - - /* - * The targetlist entry is considered to match if it - * matches after sort-key canonicalization. That is - * needed since the sub_expr has been through the same - * process. - */ - tle_expr = canonicalize_ec_expression(tle->expr, - sub_expr_type, - sub_expr_coll); - if (!equal(tle_expr, sub_expr)) - continue; - - /* - * Build a representation of this targetlist entry as an - * outer Var. - */ - outer_expr = (Expr *) makeVarFromTargetEntry(rel->relid, - tle); - - /* See if we have a matching EC for that */ - outer_ec = get_eclass_for_sort_expr(root, - outer_expr, - NULL, - sub_eclass->ec_opfamilies, - sub_expr_type, - sub_expr_coll, - 0, - rel->relids, - false); - - /* - * If we don't find a matching EC, this sub-pathkey isn't - * interesting to the outer query - */ - if (!outer_ec) - continue; - - outer_pk = make_canonical_pathkey(root, - outer_ec, - sub_pathkey->pk_opfamily, - sub_pathkey->pk_strategy, - sub_pathkey->pk_nulls_first); - /* score = # of equivalence peers */ - score = list_length(outer_ec->ec_members) - 1; - /* +1 if it matches the proper query_pathkeys item */ - if (retvallen < outer_query_keys && - list_nth(root->query_pathkeys, retvallen) == outer_pk) - score++; - if (score > best_score) - { - best_pathkey = outer_pk; - best_score = score; - } - } - } - } - - /* - * If we couldn't find a representation of this sub_pathkey, we're - * done (we can't use the ones to its right, either). - */ - if (!best_pathkey) - break; - - /* - * Eliminate redundant ordering info; could happen if outer query - * equivalences subquery keys... - */ - if (!pathkey_is_redundant(best_pathkey, retval)) - { - retval = lappend(retval, best_pathkey); - retvallen++; - } - } + List *subquery_pathkeys, + List *subquery_tlist) +{ + List *retval = NIL; + int retvallen = 0; + int outer_query_keys = list_length(root->query_pathkeys); + ListCell *i; + + foreach(i, subquery_pathkeys) + { + PathKey *sub_pathkey = (PathKey *) lfirst(i); + EquivalenceClass *sub_eclass = sub_pathkey->pk_eclass; + PathKey *best_pathkey = NULL; + + if (sub_eclass->ec_has_volatile) + { + /* + * If the sub_pathkey's EquivalenceClass is volatile, then it must + * have come from an ORDER BY clause, and we have to match it to + * that same targetlist entry. + */ + TargetEntry *tle; + Var *outer_var; + + if (sub_eclass->ec_sortref == 0) /* can't happen */ + elog(ERROR, "volatile EquivalenceClass has no sortref"); + tle = get_sortgroupref_tle(sub_eclass->ec_sortref, subquery_tlist); + Assert(tle); + /* Is TLE actually available to the outer query? 
*/ + outer_var = find_var_for_subquery_tle(rel, tle); + if (outer_var) + { + /* We can represent this sub_pathkey */ + EquivalenceMember *sub_member; + EquivalenceClass *outer_ec; + + Assert(list_length(sub_eclass->ec_members) == 1); + sub_member = (EquivalenceMember *) linitial(sub_eclass->ec_members); + + /* + * Note: it might look funny to be setting sortref = 0 for a + * reference to a volatile sub_eclass. However, the + * expression is *not* volatile in the outer query: it's just + * a Var referencing whatever the subquery emitted. (IOW, the + * outer query isn't going to re-execute the volatile + * expression itself.) So this is okay. Likewise, it's + * correct to pass nullable_relids = NULL, because we're + * underneath any outer joins appearing in the outer query. + */ + outer_ec = + get_eclass_for_sort_expr(root, + (Expr *) outer_var, + NULL, + sub_eclass->ec_opfamilies, + sub_member->em_datatype, + sub_eclass->ec_collation, + 0, + rel->relids, + false); + + /* + * If we don't find a matching EC, sub-pathkey isn't + * interesting to the outer query + */ + if (outer_ec) + best_pathkey = + make_canonical_pathkey(root, + outer_ec, + sub_pathkey->pk_opfamily, + sub_pathkey->pk_strategy, + sub_pathkey->pk_nulls_first); + } + } + else + { + /* + * Otherwise, the sub_pathkey's EquivalenceClass could contain + * multiple elements (representing knowledge that multiple items + * are effectively equal). Each element might match none, one, or + * more of the output columns that are visible to the outer query. + * This means we may have multiple possible representations of the + * sub_pathkey in the context of the outer query. Ideally we + * would generate them all and put them all into an EC of the + * outer query, thereby propagating equality knowledge up to the + * outer query. Right now we cannot do so, because the outer + * query's EquivalenceClasses are already frozen when this is + * called. Instead we prefer the one that has the highest "score" + * (number of EC peers, plus one if it matches the outer + * query_pathkeys). This is the most likely to be useful in the + * outer query. + */ + int best_score = -1; + ListCell *j; + + foreach(j, sub_eclass->ec_members) + { + EquivalenceMember *sub_member = (EquivalenceMember *) lfirst(j); + Expr *sub_expr = sub_member->em_expr; + Oid sub_expr_type = sub_member->em_datatype; + Oid sub_expr_coll = sub_eclass->ec_collation; + ListCell *k; + + if (sub_member->em_is_child) + continue; /* ignore children here */ + + foreach(k, subquery_tlist) + { + TargetEntry *tle = (TargetEntry *) lfirst(k); + Var *outer_var; + Expr *tle_expr; + EquivalenceClass *outer_ec; + PathKey *outer_pk; + int score; + + /* Is TLE actually available to the outer query? */ + outer_var = find_var_for_subquery_tle(rel, tle); + if (!outer_var) + continue; + + /* + * The targetlist entry is considered to match if it + * matches after sort-key canonicalization. That is + * needed since the sub_expr has been through the same + * process. 
+ */ + tle_expr = canonicalize_ec_expression(tle->expr, + sub_expr_type, + sub_expr_coll); + if (!equal(tle_expr, sub_expr)) + continue; + + /* See if we have a matching EC for the TLE */ + outer_ec = get_eclass_for_sort_expr(root, + (Expr *) outer_var, + NULL, + sub_eclass->ec_opfamilies, + sub_expr_type, + sub_expr_coll, + 0, + rel->relids, + false); + + /* + * If we don't find a matching EC, this sub-pathkey isn't + * interesting to the outer query + */ + if (!outer_ec) + continue; + + outer_pk = make_canonical_pathkey(root, + outer_ec, + sub_pathkey->pk_opfamily, + sub_pathkey->pk_strategy, + sub_pathkey->pk_nulls_first); + /* score = # of equivalence peers */ + score = list_length(outer_ec->ec_members) - 1; + /* +1 if it matches the proper query_pathkeys item */ + if (retvallen < outer_query_keys && + list_nth(root->query_pathkeys, retvallen) == outer_pk) + score++; + if (score > best_score) + { + best_pathkey = outer_pk; + best_score = score; + } + } + } + } + + /* + * If we couldn't find a representation of this sub_pathkey, we're + * done (we can't use the ones to its right, either). + */ + if (!best_pathkey) + break; + + /* + * Eliminate redundant ordering info; could happen if outer query + * equivalences subquery keys... + */ + if (!pathkey_is_redundant(best_pathkey, retval)) + { + retval = lappend(retval, best_pathkey); + retvallen++; + } + } + + return retval; +} - return retval; +/* + * find_var_for_subquery_tle + * + * If the given subquery tlist entry is due to be emitted by the subquery's + * scan node, return a Var for it, else return NULL. + * + * We need this to ensure that we don't return pathkeys describing values + * that are unavailable above the level of the subquery scan. + */ +static Var * +find_var_for_subquery_tle(RelOptInfo *rel, TargetEntry *tle) +{ + ListCell *lc; + + /* If the TLE is resjunk, it's certainly not visible to the outer query */ + if (tle->resjunk) + return NULL; + + /* Search the rel's targetlist to see what it will return */ + foreach(lc, rel->reltarget->exprs) + { + Var *var = (Var *) lfirst(lc); + + /* Ignore placeholders */ + if (!IsA(var, Var)) + continue; + Assert(var->varno == rel->relid); + + /* If we find a Var referencing this TLE, we're good */ + if (var->varattno == tle->resno) + return copyObject(var); /* Make a copy for safety */ + } + return NULL; } /* diff --git a/src/backend/pgxc/pool/poolmgr.c b/src/backend/pgxc/pool/poolmgr.c index dd575f76..0392d08f 100644 --- a/src/backend/pgxc/pool/poolmgr.c +++ b/src/backend/pgxc/pool/poolmgr.c @@ -49,6 +49,7 @@ #include "utils/lsyscache.h" #include "utils/resowner.h" #include "lib/stringinfo.h" +#include "libpq/libpq-be.h" #include "libpq/pqformat.h" #include "common/username.h" #include "pgxc/locator.h" diff --git a/src/test/regress/expected/subselect.out b/src/test/regress/expected/subselect.out index 096bb24f..52c196aa 100644 --- a/src/test/regress/expected/subselect.out +++ b/src/test/regress/expected/subselect.out @@ -1986,3 +1986,34 @@ select * from x for update; Output: subselect_tbl.f1, subselect_tbl.f2, subselect_tbl.f3 (4 rows) +-- test subquery pathkey +CREATE TABLE catalog_sales ( + cs_sold_date_sk integer, + cs_item_sk integer NOT NULL, + cs_order_number integer NOT NULL +); +CREATE TABLE catalog_returns ( + cr_returned_date_sk integer, + cr_item_sk integer NOT NULL, + cr_order_number integer NOT NULL +); +CREATE TABLE date_dim ( + d_date_sk integer NOT NULL, + d_year integer +); +with cs as +( + select d_year AS cs_sold_year, cs_item_sk + from catalog_sales + left join 
catalog_returns on cr_order_number=cs_order_number and cs_item_sk=cr_item_sk + join date_dim on cs_sold_date_sk = d_date_sk + order by d_year, cs_item_sk +) +select 1 +from date_dim + join cs on (cs_sold_year=d_year and cs_item_sk=cs_item_sk); + ?column? +---------- +(0 rows) + +drop table catalog_sales, catalog_returns, date_dim; diff --git a/src/test/regress/sql/subselect.sql b/src/test/regress/sql/subselect.sql index 01926d80..818c6b4f 100644 --- a/src/test/regress/sql/subselect.sql +++ b/src/test/regress/sql/subselect.sql @@ -793,3 +793,31 @@ select * from (with x as (select 2 as y) select * from x) ss; explain (verbose, costs off) with x as (select * from subselect_tbl) select * from x for update; + +-- test subquery pathkey +CREATE TABLE catalog_sales ( + cs_sold_date_sk integer, + cs_item_sk integer NOT NULL, + cs_order_number integer NOT NULL +); +CREATE TABLE catalog_returns ( + cr_returned_date_sk integer, + cr_item_sk integer NOT NULL, + cr_order_number integer NOT NULL +); +CREATE TABLE date_dim ( + d_date_sk integer NOT NULL, + d_year integer +); +with cs as +( + select d_year AS cs_sold_year, cs_item_sk + from catalog_sales + left join catalog_returns on cr_order_number=cs_order_number and cs_item_sk=cr_item_sk + join date_dim on cs_sold_date_sk = d_date_sk + order by d_year, cs_item_sk +) +select 1 +from date_dim + join cs on (cs_sold_year=d_year and cs_item_sk=cs_item_sk); +drop table catalog_sales, catalog_returns, date_dim; \ No newline at end of file From 3dc65e912e19f98a386a4a3975647bcc2fef54d6 Mon Sep 17 00:00:00 2001 From: ericxwu Date: Thu, 10 Sep 2020 10:53:01 +0800 Subject: [PATCH 051/578] Fix coredump during ExecEndRemoteSubplan when conn is NULL http://tapd.oa.com/10092131/bugtrace/bugs/view?bug_id=1010092131081943789 --- src/backend/pgxc/pool/execRemote.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 22c3fade..ea47904b 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -11505,6 +11505,16 @@ ExecEndRemoteSubplan(RemoteSubplanState *node) conn = combiner->connections[i]; + /* connection can be null in sort, forget it */ + if (!conn) + { + combiner->conn_count--; + combiner->connections[i] = + combiner->connections[combiner->conn_count]; + i--; + continue; + } + CHECK_OWNERSHIP(conn, combiner); if (pgxc_node_send_close(conn, true, cursor) != 0) From 12c41f3e73b7e09fa803999fe686f136ce268586 Mon Sep 17 00:00:00 2001 From: ericxwu Date: Sat, 12 Sep 2020 11:12:17 +0800 Subject: [PATCH 052/578] Fix warnings --- src/backend/executor/nodeModifyTable.c | 2 -- src/backend/optimizer/util/relnode.c | 3 --- 2 files changed, 5 deletions(-) diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c index 659f15b1..4a03adb3 100644 --- a/src/backend/executor/nodeModifyTable.c +++ b/src/backend/executor/nodeModifyTable.c @@ -3206,7 +3206,6 @@ ExecEndModifyTable(ModifyTableState *node) #ifdef __TBASE__ if (IS_PGXC_COORDINATOR) { - ResponseCombiner *combiner; ModifyTable *plan = (ModifyTable *)node->ps.plan; if (plan->remote_plans) @@ -3217,7 +3216,6 @@ ExecEndModifyTable(ModifyTableState *node) { RemoteQuery *rq = (RemoteQuery *)list_nth(plan->remote_plans, i); - combiner = (ResponseCombiner *) node->mt_remoterels[i]; ExecEndNode(node->mt_remoterels[i]); diff --git a/src/backend/optimizer/util/relnode.c b/src/backend/optimizer/util/relnode.c index 153d3d36..b4359f52 100644 --- a/src/backend/optimizer/util/relnode.c 
+++ b/src/backend/optimizer/util/relnode.c @@ -507,9 +507,6 @@ build_join_rel(PlannerInfo *root, { RelOptInfo *joinrel; List *restrictlist; -#ifdef __TBASE__ - PlannerInfo *top_root = root; -#endif /* * See if we already have a joinrel for this set of base rels. From 2fbad221b6749adb0ab93063a739a064cb6c5208 Mon Sep 17 00:00:00 2001 From: youngxie Date: Tue, 15 Sep 2020 11:30:20 +0800 Subject: [PATCH 053/578] Fix postgres log csv format. --- src/backend/utils/error/elog.c | 160 ++++++++++++++++----------------- 1 file changed, 80 insertions(+), 80 deletions(-) diff --git a/src/backend/utils/error/elog.c b/src/backend/utils/error/elog.c index 9851cf57..58ff0658 100644 --- a/src/backend/utils/error/elog.c +++ b/src/backend/utils/error/elog.c @@ -2562,23 +2562,23 @@ log_line_prefix(StringInfo buf, ErrorData *edata) { char strfbuf[128]; - snprintf(strfbuf, sizeof(strfbuf) - 1, "%lx.%x,coord(%d,%u)", + snprintf(strfbuf, sizeof(strfbuf) - 1, "%lx.%x,coord(%d.%u)", (long) (MyStartTime), MyProcPid, pgxc_get_coordinator_proc_pid(), pgxc_get_coordinator_proc_vxid()); - appendStringInfo(buf, "%*s", padding, strfbuf); - } - else - appendStringInfo(buf, "%lx.%x,coord(%d,%u)", + appendStringInfo(buf, "%*s", padding, strfbuf); + } + else + appendStringInfo(buf, "%lx.%x,coord(%d.%u)", (long) (MyStartTime), MyProcPid, pgxc_get_coordinator_proc_pid(), pgxc_get_coordinator_proc_vxid()); - break; - case 'p': - if (padding != 0) - appendStringInfo(buf, "%*d", padding, MyProcPid); - else - appendStringInfo(buf, "%d,coord(%d,%u)", + break; + case 'p': + if (padding != 0) + appendStringInfo(buf, "%*d", padding, MyProcPid); + else + appendStringInfo(buf, "%d,coord(%d.%u)", MyProcPid, pgxc_get_coordinator_proc_pid(), pgxc_get_coordinator_proc_vxid()); @@ -2829,78 +2829,78 @@ appendCSVLiteral(StringInfo buf, const char *data) */ static void write_csvlog(ErrorData *edata) -{// #lizard forgives - StringInfoData buf; - bool print_stmt = false; - - /* static counter for line numbers */ - static long log_line_number = 0; - - /* has counter been reset in current process? */ - static int log_my_pid = 0; - - /* - * This is one of the few places where we'd rather not inherit a static - * variable's value from the postmaster. But since we will, reset it when - * MyProcPid changes. - */ - if (log_my_pid != MyProcPid) - { - log_line_number = 0; - log_my_pid = MyProcPid; - formatted_start_time[0] = '\0'; - } - log_line_number++; - - initStringInfo(&buf); - - /* - * timestamp with milliseconds - * - * Check if the timestamp is already calculated for the syslog message, - * and use it if so. Otherwise, get the current timestamp. This is done - * to put same timestamp in both syslog and csvlog messages. - */ - if (formatted_log_time[0] == '\0') - setup_formatted_log_time(); - - appendStringInfoString(&buf, formatted_log_time); - appendStringInfoChar(&buf, ','); - - /* username */ - if (MyProcPort) - appendCSVLiteral(&buf, MyProcPort->user_name); - appendStringInfoChar(&buf, ','); - - /* database name */ - if (MyProcPort) - appendCSVLiteral(&buf, MyProcPort->database_name); - appendStringInfoChar(&buf, ','); - - /* Process id */ - if (MyProcPid != 0) - appendStringInfo(&buf, "%d,coord(%d,%u)", +{ + StringInfoData buf; + bool print_stmt = false; + + /* static counter for line numbers */ + static long log_line_number = 0; + + /* has counter been reset in current process? */ + static int log_my_pid = 0; + + /* + * This is one of the few places where we'd rather not inherit a static + * variable's value from the postmaster. 
But since we will, reset it when + * MyProcPid changes. + */ + if (log_my_pid != MyProcPid) + { + log_line_number = 0; + log_my_pid = MyProcPid; + formatted_start_time[0] = '\0'; + } + log_line_number++; + + initStringInfo(&buf); + + /* + * timestamp with milliseconds + * + * Check if the timestamp is already calculated for the syslog message, + * and use it if so. Otherwise, get the current timestamp. This is done + * to put same timestamp in both syslog and csvlog messages. + */ + if (formatted_log_time[0] == '\0') + setup_formatted_log_time(); + + appendStringInfoString(&buf, formatted_log_time); + appendStringInfoChar(&buf, ','); + + /* username */ + if (MyProcPort) + appendCSVLiteral(&buf, MyProcPort->user_name); + appendStringInfoChar(&buf, ','); + + /* database name */ + if (MyProcPort) + appendCSVLiteral(&buf, MyProcPort->database_name); + appendStringInfoChar(&buf, ','); + + /* Process id */ + if (MyProcPid != 0) + appendStringInfo(&buf, "%d,coord(%d.%u)", MyProcPid, pgxc_get_coordinator_proc_pid(), pgxc_get_coordinator_proc_vxid()); - appendStringInfoChar(&buf, ','); - - /* Remote host and port */ - if (MyProcPort && MyProcPort->remote_host) - { - appendStringInfoChar(&buf, '"'); - appendStringInfoString(&buf, MyProcPort->remote_host); - if (MyProcPort->remote_port && MyProcPort->remote_port[0] != '\0') - { - appendStringInfoChar(&buf, ':'); - appendStringInfoString(&buf, MyProcPort->remote_port); - } - appendStringInfoChar(&buf, '"'); - } - appendStringInfoChar(&buf, ','); - - /* session id */ - appendStringInfo(&buf, "%lx.%x,coord(%d,%u)", + appendStringInfoChar(&buf, ','); + + /* Remote host and port */ + if (MyProcPort && MyProcPort->remote_host) + { + appendStringInfoChar(&buf, '"'); + appendStringInfoString(&buf, MyProcPort->remote_host); + if (MyProcPort->remote_port && MyProcPort->remote_port[0] != '\0') + { + appendStringInfoChar(&buf, ':'); + appendStringInfoString(&buf, MyProcPort->remote_port); + } + appendStringInfoChar(&buf, '"'); + } + appendStringInfoChar(&buf, ','); + + /* session id */ + appendStringInfo(&buf, "%lx.%x,coord(%d.%u)", (long) MyStartTime, MyProcPid, pgxc_get_coordinator_proc_pid(), pgxc_get_coordinator_proc_vxid()); From 168b8413f3be7172d74cf750266b1a7385b3d620 Mon Sep 17 00:00:00 2001 From: youngxie Date: Tue, 15 Sep 2020 19:56:56 +0800 Subject: [PATCH 054/578] adjust commit order in pgxc_node_remote_finish for parallel ddl. --- src/backend/pgxc/pool/execRemote.c | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index ea47904b..e3fa18d7 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -8585,9 +8585,34 @@ pgxc_node_remote_finish(char *prepareGID, bool commit, } } - for (i = 0; i < pgxc_handles->co_conn_count; i++) + /* Make sure datanode commit first */ + if (conn_count && is_txn_has_parallel_ddl) { - PGXCNodeHandle *conn = pgxc_handles->coord_handles[i]; + InitResponseCombiner(&combiner, conn_count, COMBINE_TYPE_NONE); +#ifdef __TWO_PHASE_TRANS__ + g_twophase_state.response_operation = + (commit == true) ? 
REMOTE_FINISH_COMMIT : REMOTE_FINISH_ABORT; +#endif + /* Receive responses */ + if (pgxc_node_receive_responses(conn_count, connections, NULL, &combiner) || + !validate_combiner(&combiner)) + { + if (combiner.errorMessage) + pgxc_node_report_error(&combiner); + else + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to COMMIT the transaction on one or more nodes"))); + } + else + CloseCombiner(&combiner); + + conn_count = 0; + } + + for (i = 0; i < pgxc_handles->co_conn_count; i++) + { + PGXCNodeHandle *conn = pgxc_handles->coord_handles[i]; #ifdef __TWO_PHASE_TRANS__ twophase_index = g_twophase_state.coord_index; g_twophase_state.coord_state[twophase_index].is_participant = true; From fd9e9f44c87464ebc02d590d5d9a091575ca5bc2 Mon Sep 17 00:00:00 2001 From: ericxwu Date: Thu, 17 Sep 2020 19:52:11 +0800 Subject: [PATCH 055/578] Fix core during ExecEndCteScan --- src/backend/executor/nodeCtescan.c | 38 +++++++++++++++--------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/src/backend/executor/nodeCtescan.c b/src/backend/executor/nodeCtescan.c index 1f309184..e9e4e0a3 100644 --- a/src/backend/executor/nodeCtescan.c +++ b/src/backend/executor/nodeCtescan.c @@ -279,25 +279,25 @@ ExecInitCteScan(CteScan *node, EState *estate, int eflags) void ExecEndCteScan(CteScanState *node) { - /* - * Free exprcontext - */ - ExecFreeExprContext(&node->ss.ps); - - /* - * clean out the tuple table - */ - ExecClearTuple(node->ss.ps.ps_ResultTupleSlot); - ExecClearTuple(node->ss.ss_ScanTupleSlot); - - /* - * If I am the leader, free the tuplestore. - */ - if (node->leader == node) - { - tuplestore_end(node->cte_table); - node->cte_table = NULL; - } + /* + * Free exprcontext + */ + ExecFreeExprContext(&node->ss.ps); + + /* + * clean out the tuple table + */ + ExecClearTuple(node->ss.ps.ps_ResultTupleSlot); + ExecClearTuple(node->ss.ss_ScanTupleSlot); + + /* + * If I am the leader, free the tuplestore. 
+ */ + if (node->leader == node && node->cte_table) + { + tuplestore_end(node->cte_table); + node->cte_table = NULL; + } } /* ---------------------------------------------------------------- From ee7936e87b7b49f604cdfcdd96a4bc9248dade25 Mon Sep 17 00:00:00 2001 From: qiannzhang Date: Fri, 18 Sep 2020 17:10:51 +0800 Subject: [PATCH 056/578] Set keepalive, user_timeout, and connect_timeout for gtm connection, TAPD: http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131082021889 http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131082021563 --- src/backend/access/transam/gtm.c | 168 ++++++++++++++++--------------- src/gtm/client/fe-connect.c | 69 ++++++++++++- src/include/gtm/libpq-fe.h | 5 +- 3 files changed, 156 insertions(+), 86 deletions(-) diff --git a/src/backend/access/transam/gtm.c b/src/backend/access/transam/gtm.c index d2557802..9ec7287d 100644 --- a/src/backend/access/transam/gtm.c +++ b/src/backend/access/transam/gtm.c @@ -1212,90 +1212,98 @@ InitGTM(void) #endif try_connect_gtm: - /* If this thread is postmaster itself, it contacts gtm identifying itself */ - if (!IsUnderPostmaster) - { - GTM_PGXCNodeType remote_type = GTM_NODE_DEFAULT; - - if (IS_PGXC_COORDINATOR) - remote_type = GTM_NODE_COORDINATOR; - else if (IS_PGXC_DATANODE) - remote_type = GTM_NODE_DATANODE; - - /* Use 60s as connection timeout */ - snprintf(conn_str, CONNECT_STR_LEN, "host=%s port=%d node_name=%s remote_type=%d postmaster=1 connect_timeout=%d", - GtmHost, GtmPort, PGXCNodeName, remote_type, - GtmConnectTimeout); - - /* Log activity of GTM connections */ - if(GTMDebugPrint) - elog(LOG, "Postmaster: connection established to GTM with string %s", conn_str); - } - else - { - /* Use 60s as connection timeout */ - snprintf(conn_str, CONNECT_STR_LEN, "host=%s port=%d node_name=%s connect_timeout=%d", - GtmHost, GtmPort, PGXCNodeName, GtmConnectTimeout); - - /* Log activity of GTM connections */ - if (IsAutoVacuumWorkerProcess() && GTMDebugPrint) - elog(LOG, "Autovacuum worker: connection established to GTM with string %s", conn_str); - else if (IsAutoVacuumLauncherProcess() && GTMDebugPrint) - elog(LOG, "Autovacuum launcher: connection established to GTM with string %s", conn_str); - else if (IsClusterMonitorProcess() && GTMDebugPrint) - elog(LOG, "Cluster monitor: connection established to GTM with string %s", conn_str); - else if(GTMDebugPrint) - elog(LOG, "Postmaster child: connection established to GTM with string %s", conn_str); - } + /* If this thread is postmaster itself, it contacts gtm identifying itself */ + if (!IsUnderPostmaster) + { + GTM_PGXCNodeType remote_type = GTM_NODE_DEFAULT; + + if (IS_PGXC_COORDINATOR) + remote_type = GTM_NODE_COORDINATOR; + else if (IS_PGXC_DATANODE) + remote_type = GTM_NODE_DATANODE; + + /* Use 60s as connection timeout */ + snprintf(conn_str, CONNECT_STR_LEN, "host=%s port=%d node_name=%s remote_type=%d postmaster=1 connect_timeout=%d", + GtmHost, GtmPort, PGXCNodeName, remote_type, + tcp_keepalives_idle > 0 ? + tcp_keepalives_idle : GtmConnectTimeout); + + /* Log activity of GTM connections */ + if(GTMDebugPrint) + elog(LOG, "Postmaster: connection established to GTM with string %s", conn_str); + } + else + { + /* Use 60s as connection timeout */ + snprintf(conn_str, CONNECT_STR_LEN, "host=%s port=%d node_name=%s connect_timeout=%d", + GtmHost, GtmPort, PGXCNodeName, + tcp_keepalives_idle > 0 ? 
+ tcp_keepalives_idle : GtmConnectTimeout); + + /* Log activity of GTM connections */ + if (IsAutoVacuumWorkerProcess() && GTMDebugPrint) + elog(LOG, "Autovacuum worker: connection established to GTM with string %s", conn_str); + else if (IsAutoVacuumLauncherProcess() && GTMDebugPrint) + elog(LOG, "Autovacuum launcher: connection established to GTM with string %s", conn_str); + else if (IsClusterMonitorProcess() && GTMDebugPrint) + elog(LOG, "Cluster monitor: connection established to GTM with string %s", conn_str); + else if(GTMDebugPrint) + elog(LOG, "Postmaster child: connection established to GTM with string %s", conn_str); + } - conn = PQconnectGTM(conn_str); - if (GTMPQstatus(conn) != CONNECTION_OK) - { - int save_errno = errno; - -#ifdef __TBASE__ - if (try_cnt < max_try_cnt) - { - /* If connect gtm failed, get gtm info from syscache, and try again */ - GetMasterGtmInfo(); - if (GtmHost != NULL && GtmPort) - { - elog(DEBUG1, "[InitGTM] Get GtmHost:%s GtmPort:%d try_cnt:%d max_try_cnt:%d", - GtmHost, GtmPort, try_cnt, max_try_cnt); - } - CloseGTM(); - try_cnt++; - goto try_connect_gtm; - } - else -#endif - { - ResetGtmInfo(); + conn = PQconnectGTM(conn_str); + if (GTMPQstatus(conn) != CONNECTION_OK) + { + int save_errno = errno; + +#ifdef __TBASE__ + if (try_cnt < max_try_cnt) + { + /* If connect gtm failed, get gtm info from syscache, and try again */ + GetMasterGtmInfo(); + if (GtmHost != NULL && GtmPort) + { + elog(DEBUG1, "[InitGTM] Get GtmHost:%s GtmPort:%d try_cnt:%d max_try_cnt:%d", + GtmHost, GtmPort, try_cnt, max_try_cnt); + } + CloseGTM(); + try_cnt++; + goto try_connect_gtm; + } + else +#endif + { + ResetGtmInfo(); - /* Use LOG instead of ERROR to avoid error stack overflow. */ - if(conn) - { - ereport(LOG, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("can not connect to GTM: %s %m", GTMPQerrorMessage(conn)))); - } - else - { - ereport(LOG, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("connection is null: %m"))); - } + /* Use LOG instead of ERROR to avoid error stack overflow. */ + if(conn) + { + ereport(LOG, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("can not connect to GTM: %s %m", GTMPQerrorMessage(conn)))); + } + else + { + ereport(LOG, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("connection is null: %m"))); + } - errno = save_errno; + errno = save_errno; - CloseGTM(); - } - - } - else if (IS_PGXC_COORDINATOR) - { - register_session(conn, PGXCNodeName, MyProcPid, MyBackendId); - } + CloseGTM(); + } + + } + else + { + GTMSetSockKeepAlive(conn, tcp_keepalives_idle, + tcp_keepalives_interval, tcp_keepalives_count); + if (IS_PGXC_COORDINATOR) + { + register_session(conn, PGXCNodeName, MyProcPid, MyBackendId); + } + } } void diff --git a/src/gtm/client/fe-connect.c b/src/gtm/client/fe-connect.c index ed8ef6cc..1e3b712f 100644 --- a/src/gtm/client/fe-connect.c +++ b/src/gtm/client/fe-connect.c @@ -82,11 +82,10 @@ PQconnectGTM(const char *conninfo) { GTM_Conn *conn = PQconnectGTMStart(conninfo); - if (conn && conn->status != CONNECTION_BAD) - { - (void)connectGTMComplete(conn); - - } + if (conn && conn->status != CONNECTION_BAD) + { + (void)connectGTMComplete(conn); + } #if 0 else if (conn != NULL) { @@ -1423,3 +1422,63 @@ GTMPQuntrace(GTM_Conn *conn) conn->Pfdebug = NULL; } } + +/* + * Set socket keepalive and user_timeout. + * We can use this to detect the broken connection quickly. 
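+ *
+ * Usage sketch (the parameter values below are illustrative, not mandated
+ * by this patch): given an already established connection,
+ *
+ *     GTMSetSockKeepAlive(conn, 60, 10, 3);
+ *
+ * starts probing after 60s of idleness, probes every 10s, drops the
+ * connection after 3 unanswered probes, and caps unacknowledged sends at
+ * roughly 60s via TCP_USER_TIMEOUT.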
+ */ +void +GTMSetSockKeepAlive(GTM_Conn *conn, int tcp_keepalives_idle, + int tcp_keepalives_interval, int tcp_keepalives_count) +{ + int sock = conn->sock; + int keepalive = 1; + /* user_timeout in ms */ + uint32 user_timeout = UINT32_MAX / 1000 < tcp_keepalives_idle ? + 0 : tcp_keepalives_idle * (uint32)1000; + struct tcp_info info; + int len = sizeof(info); + /* check sock */ + getsockopt(sock, IPPROTO_TCP, TCP_INFO, &info, (socklen_t *)&len); + if (info.tcpi_state != TCP_ESTABLISHED) + { + return; + } + + /* set keepalive */ + if (setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, + (char *)&keepalive, sizeof(keepalive)) < 0) + { + elog(LOG, "GTMSetSockKeepAlive setsockopt(SO_KEEPALIVE) failed: %m"); + } + if (tcp_keepalives_idle > 0 && + setsockopt(sock, IPPROTO_TCP, TCP_KEEPIDLE, + (char *)&tcp_keepalives_idle, + sizeof(tcp_keepalives_idle)) < 0) + { + elog(LOG, "GTMSetSockKeepAlive setsockopt(TCP_KEEPIDLE) failed: %m"); + } + if (tcp_keepalives_interval > 0 && + setsockopt(sock, IPPROTO_TCP, TCP_KEEPINTVL, + (char *)&tcp_keepalives_interval, + sizeof(tcp_keepalives_interval)) < 0) + { + elog(LOG, "GTMSetSockKeepAlive setsockopt(TCP_KEEPINTVL) failed: %m"); + } + if (tcp_keepalives_count > 0 && + setsockopt(sock, IPPROTO_TCP, TCP_KEEPCNT, + (char *)&tcp_keepalives_count, + sizeof(tcp_keepalives_count)) < 0) + { + elog(LOG, "GTMSetSockKeepAlive setsockopt(TCP_KEEPCNT) failed: %m"); + } + + /* set user_timeout */ + if (user_timeout > 0 && + setsockopt(sock, IPPROTO_TCP, TCP_USER_TIMEOUT, + (char *)&user_timeout, + sizeof(user_timeout)) < 0) + { + elog(LOG, "GTMSetSockKeepAlive setsockopt(TCP_USER_TIMEOUT) failed: %m"); + } +} \ No newline at end of file diff --git a/src/include/gtm/libpq-fe.h b/src/include/gtm/libpq-fe.h index 23a24e81..54058e5f 100644 --- a/src/include/gtm/libpq-fe.h +++ b/src/include/gtm/libpq-fe.h @@ -130,7 +130,10 @@ extern void GTMPQuntrace(GTM_Conn *conn); /* Force the write buffer to be written (or at least try) */ extern int PQflush(GTM_Conn *conn); -#define libpq_gettext(x) x +extern void GTMSetSockKeepAlive(GTM_Conn *conn, int tcp_keepalives_idle, + int tcp_keepalives_interval, int tcp_keepalives_count); + +#define libpq_gettext(x) x #ifdef __cplusplus } From 13030218e3194dbc0bf327262796373cfc3d4a42 Mon Sep 17 00:00:00 2001 From: ericxwu Date: Tue, 22 Sep 2020 19:30:23 +0800 Subject: [PATCH 057/578] Support nestloop join suppresion when outerpath selectivity could be under estimated http://tapd.oa.com/20421696/bugtrace/bugs/view?bug_id=1020421696082218207 --- src/backend/optimizer/path/clausesel.c | 105 +++++++++++++ src/backend/optimizer/path/costsize.c | 44 ++++-- src/backend/utils/misc/guc.c | 209 +++++++++++++------------ src/include/optimizer/cost.h | 14 +- src/test/regress/expected/join_3.out | 57 +++++++ src/test/regress/expected/sysviews.out | 3 +- src/test/regress/sql/join.sql | 29 ++++ 7 files changed, 342 insertions(+), 119 deletions(-) diff --git a/src/backend/optimizer/path/clausesel.c b/src/backend/optimizer/path/clausesel.c index 8e6e1670..86fe951b 100644 --- a/src/backend/optimizer/path/clausesel.c +++ b/src/backend/optimizer/path/clausesel.c @@ -23,6 +23,11 @@ #include "utils/lsyscache.h" #include "utils/selfuncs.h" #include "statistics/statistics.h" +#ifdef __TBASE__ +#include "access/htup_details.h" +#include "catalog/pg_operator.h" +#include "utils/syscache.h" +#endif /* @@ -866,3 +871,103 @@ clause_selectivity(PlannerInfo *root, return s1; } + +#ifdef __TBASE__ +/* + * clause_selectivity_could_under_estimated + * Check whether BaseRelOpt of the 
path might got under estimated rows. + * + * In real user scenarios, multiple columns could have correlation. It needs + * more statistic hints for the optimizer to know the data model + * characteristics. Since the extended mutli-column statistic calculation only + * supports '=' operation, we introduced this function to check if the + * selectivity of input path is under estimated. + */ +bool +clause_selectivity_could_under_estimated(PlannerInfo *root, Path *path) +{ + RelOptInfo *rel = NULL; + + /* We only support 1-depth nestloop outer path for now. */ + if (path->pathtype == T_SeqScan || + path->pathtype == T_IndexScan || + path->pathtype == T_IndexOnlyScan || + path->pathtype == T_BitmapIndexScan || + path->pathtype == T_BitmapHeapScan) + { + rel = path->parent; + } + else + { + return false; + } + + Assert(rel); + + /* + * The correlation problem only happens when there are multiple + * restrictions. + */ + if (list_length(rel->baserestrictinfo) > 1) + { + ListCell *lc; + Node *clause; + int count = 0; + + /* Walk through all restrictions */ + foreach (lc, rel->baserestrictinfo) + { + RestrictInfo *ri = (RestrictInfo *) lfirst(lc); + + /* + * Proceed with examination of contained clause. If the clause is + * an OR-clause. + */ + if (ri->orclause) + clause = (Node *) ri->orclause; + else + clause = (Node *) ri->clause; + + /* + * The multi-column statistic only supports '=' operator based on + * single column histograms. Thus we count all unsupported cases + * here. is_opclause() covers the NULL check for 'clause' + * + * TODO(Tbase): Be more precise on other type of clauses. + */ + if (is_opclause(clause)) + { + OpExpr *opclause = (OpExpr *) clause; + char *oprname; + Oid opno = opclause->opno; + HeapTuple opTuple; + Form_pg_operator operform; + + opTuple = SearchSysCache1(OPEROID, ObjectIdGetDatum(opno)); + if (HeapTupleIsValid(opTuple)) + { + operform = (Form_pg_operator)GETSTRUCT(opTuple); + oprname = NameStr(operform->oprname); + } + ReleaseSysCache(opTuple); + + /* Supported case, skip the count. */ + if (oprname && strcmp(oprname, "=") == 0) + continue; + } + + /* Unsupported case */ + count++; + } + + /* + * The path got some restrictions which could lead to selectivity + * under estimation. 
+ */ + if (count > 0) + return true; + } + + return false; +} +#endif diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index a6bba0cf..0de40222 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -128,19 +128,20 @@ Cost disable_cost = 1.0e10; int max_parallel_workers_per_gather = 2; -bool enable_seqscan = true; -bool enable_indexscan = true; -bool enable_indexonlyscan = true; -bool enable_bitmapscan = true; -bool enable_tidscan = true; -bool enable_sort = true; -bool enable_hashagg = true; -bool enable_nestloop = true; -bool enable_material = true; -bool enable_mergejoin = true; -bool enable_hashjoin = true; -bool enable_fast_query_shipping = true; -bool enable_gathermerge = true; +bool enable_seqscan = true; +bool enable_indexscan = true; +bool enable_indexonlyscan = true; +bool enable_bitmapscan = true; +bool enable_tidscan = true; +bool enable_sort = true; +bool enable_hashagg = true; +bool enable_nestloop = true; +bool enable_material = true; +bool enable_mergejoin = true; +bool enable_hashjoin = true; +bool enable_fast_query_shipping = true; +bool enable_gathermerge = true; +bool enable_nestloop_suppression = false; typedef struct { @@ -2345,6 +2346,22 @@ final_cost_nestloop(PlannerInfo *root, NestPath *path, { /* Normal-case source costs were included in preliminary estimate */ +#ifdef __TBASE__ + /* + * When outerpath only got one row, we need to check if the number of + * rows is under estimated. It might lead to huge cost estimation error + * if innerpath is SeqScan. + * If it is the case, we count additional disable_cost to suppress this + * nestloop path. Thus Hashjoin or the rotated Nestloop join paths + * could win. + */ + if (enable_nestloop_suppression && + outer_path_rows == 1 && inner_path->pathtype == T_SeqScan && + clause_selectivity_could_under_estimated(root, outer_path)) + { + startup_cost += disable_cost; + } +#endif /* Compute number of tuples processed (not number emitted!) 
*/ ntuples = outer_path_rows * inner_path_rows; } @@ -4022,7 +4039,6 @@ has_indexed_join_quals(NestPath *joinpath) return found_one; } - /* * approx_tuple_count * Quick-and-dirty estimation of the number of join rows passing diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 01405c31..af23b681 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -1021,105 +1021,116 @@ static const unit_conversion time_unit_conversion_table[] = static struct config_bool ConfigureNamesBool[] = { - { - {"enable_seqscan", PGC_USERSET, QUERY_TUNING_METHOD, - gettext_noop("Enables the planner's use of sequential-scan plans."), - NULL - }, - &enable_seqscan, - true, - NULL, NULL, NULL - }, - { - {"enable_indexscan", PGC_USERSET, QUERY_TUNING_METHOD, - gettext_noop("Enables the planner's use of index-scan plans."), - NULL - }, - &enable_indexscan, - true, - NULL, NULL, NULL - }, - { - {"enable_indexonlyscan", PGC_USERSET, QUERY_TUNING_METHOD, - gettext_noop("Enables the planner's use of index-only-scan plans."), - NULL - }, - &enable_indexonlyscan, - true, - NULL, NULL, NULL - }, - { - {"enable_bitmapscan", PGC_USERSET, QUERY_TUNING_METHOD, - gettext_noop("Enables the planner's use of bitmap-scan plans."), - NULL - }, - &enable_bitmapscan, - true, - NULL, NULL, NULL - }, - { - {"enable_tidscan", PGC_USERSET, QUERY_TUNING_METHOD, - gettext_noop("Enables the planner's use of TID scan plans."), - NULL - }, - &enable_tidscan, - true, - NULL, NULL, NULL - }, - { - {"enable_sort", PGC_USERSET, QUERY_TUNING_METHOD, - gettext_noop("Enables the planner's use of explicit sort steps."), - NULL - }, - &enable_sort, - true, - NULL, NULL, NULL - }, - { - {"enable_hashagg", PGC_USERSET, QUERY_TUNING_METHOD, - gettext_noop("Enables the planner's use of hashed aggregation plans."), - NULL - }, - &enable_hashagg, - true, - NULL, NULL, NULL - }, - { - {"enable_material", PGC_USERSET, QUERY_TUNING_METHOD, - gettext_noop("Enables the planner's use of materialization."), - NULL - }, - &enable_material, - true, - NULL, NULL, NULL - }, - { - {"enable_nestloop", PGC_USERSET, QUERY_TUNING_METHOD, - gettext_noop("Enables the planner's use of nested-loop join plans."), - NULL - }, - &enable_nestloop, - true, - NULL, NULL, NULL - }, - { - {"enable_mergejoin", PGC_USERSET, QUERY_TUNING_METHOD, - gettext_noop("Enables the planner's use of merge join plans."), - NULL - }, - &enable_mergejoin, - true, - NULL, NULL, NULL - }, - { - {"enable_hashjoin", PGC_USERSET, QUERY_TUNING_METHOD, - gettext_noop("Enables the planner's use of hash join plans."), - NULL - }, - &enable_hashjoin, - true, - NULL, NULL, NULL - }, + { + {"enable_seqscan", PGC_USERSET, QUERY_TUNING_METHOD, + gettext_noop("Enables the planner's use of sequential-scan plans."), + NULL + }, + &enable_seqscan, + true, + NULL, NULL, NULL + }, + { + {"enable_indexscan", PGC_USERSET, QUERY_TUNING_METHOD, + gettext_noop("Enables the planner's use of index-scan plans."), + NULL + }, + &enable_indexscan, + true, + NULL, NULL, NULL + }, + { + {"enable_indexonlyscan", PGC_USERSET, QUERY_TUNING_METHOD, + gettext_noop("Enables the planner's use of index-only-scan plans."), + NULL + }, + &enable_indexonlyscan, + true, + NULL, NULL, NULL + }, + { + {"enable_bitmapscan", PGC_USERSET, QUERY_TUNING_METHOD, + gettext_noop("Enables the planner's use of bitmap-scan plans."), + NULL + }, + &enable_bitmapscan, + true, + NULL, NULL, NULL + }, + { + {"enable_tidscan", PGC_USERSET, QUERY_TUNING_METHOD, + gettext_noop("Enables the planner's use of TID scan 
plans."), + NULL + }, + &enable_tidscan, + true, + NULL, NULL, NULL + }, + { + {"enable_sort", PGC_USERSET, QUERY_TUNING_METHOD, + gettext_noop("Enables the planner's use of explicit sort steps."), + NULL + }, + &enable_sort, + true, + NULL, NULL, NULL + }, + { + {"enable_hashagg", PGC_USERSET, QUERY_TUNING_METHOD, + gettext_noop("Enables the planner's use of hashed aggregation plans."), + NULL + }, + &enable_hashagg, + true, + NULL, NULL, NULL + }, + { + {"enable_material", PGC_USERSET, QUERY_TUNING_METHOD, + gettext_noop("Enables the planner's use of materialization."), + NULL + }, + &enable_material, + true, + NULL, NULL, NULL + }, + { + {"enable_nestloop", PGC_USERSET, QUERY_TUNING_METHOD, + gettext_noop("Enables the planner's use of nested-loop join plans."), + NULL + }, + &enable_nestloop, + true, + NULL, NULL, NULL + }, +#ifdef __TBASE__ + { + {"enable_nestloop_suppression", PGC_USERSET, QUERY_TUNING_METHOD, + gettext_noop("Enables the selectivity hints when planning nested-loop joins."), + NULL + }, + &enable_nestloop_suppression, + false, + NULL, NULL, NULL + }, +#endif + { + {"enable_mergejoin", PGC_USERSET, QUERY_TUNING_METHOD, + gettext_noop("Enables the planner's use of merge join plans."), + NULL + }, + &enable_mergejoin, + true, + NULL, NULL, NULL + }, + { + {"enable_hashjoin", PGC_USERSET, QUERY_TUNING_METHOD, + gettext_noop("Enables the planner's use of hash join plans."), + NULL + }, + &enable_hashjoin, + true, + NULL, NULL, NULL + }, #ifdef PGXC { {"enable_fast_query_shipping", PGC_USERSET, QUERY_TUNING_METHOD, diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h index e438ae2e..102795bb 100644 --- a/src/include/optimizer/cost.h +++ b/src/include/optimizer/cost.h @@ -77,7 +77,8 @@ extern bool enable_mergejoin; extern bool enable_hashjoin; extern bool enable_fast_query_shipping; extern bool enable_gathermerge; -extern int constraint_exclusion; +extern bool enable_nestloop_suppression; +extern int constraint_exclusion; extern double clamp_row_est(double nrows); extern double index_pages_fetched(double tuples_fetched, BlockNumber pages, @@ -219,10 +220,13 @@ extern Selectivity clauselist_selectivity(PlannerInfo *root, JoinType jointype, SpecialJoinInfo *sjinfo); extern Selectivity clause_selectivity(PlannerInfo *root, - Node *clause, - int varRelid, - JoinType jointype, - SpecialJoinInfo *sjinfo); + Node *clause, + int varRelid, + JoinType jointype, + SpecialJoinInfo *sjinfo); +#ifdef __TBASE__ +extern bool clause_selectivity_could_under_estimated(PlannerInfo *root, Path *path); +#endif extern void cost_gather_merge(GatherMergePath *path, PlannerInfo *root, RelOptInfo *rel, ParamPathInfo *param_info, Cost input_startup_cost, Cost input_total_cost, diff --git a/src/test/regress/expected/join_3.out b/src/test/regress/expected/join_3.out index f151b912..a133332a 100644 --- a/src/test/regress/expected/join_3.out +++ b/src/test/regress/expected/join_3.out @@ -6154,3 +6154,60 @@ where exists (select 1 from j3 (19 rows) drop table j3; +-- +-- Test nestloop path suppression if the selectivity could be under estimated +-- +create table nestloop_suppression1(a int, b int, c int, d varchar(20)); +create table nestloop_suppression2(a int, b int, c int, d varchar(20)); +create table nestloop_suppression3(a int, b int); +insert into nestloop_suppression1 select i, i+1, i+2, 'char'||i from generate_series(1,10000) i; +insert into nestloop_suppression2 select i, i+1, i+2, 'char'||i from generate_series(1,10000) i; +insert into nestloop_suppression3 select i, i+1 from 
generate_series(1,100) i; +create index idx_nestloop_suppression1_b on nestloop_suppression1(b); +analyze nestloop_suppression1; +analyze nestloop_suppression2; +analyze nestloop_suppression3; +set enable_hashjoin = false; +explain select t3.b from nestloop_suppression1 t1, nestloop_suppression2 t2, nestloop_suppression3 t3 + where t1.b=2 and t1.c=3 and t1.d like 'char%' and t1.a=t2.a and t3.b>t2.a; + QUERY PLAN +------------------------------------------------------------------------------------------------------------------------------ + Nested Loop (cost=200.16..596.19 rows=33 width=4) + Join Filter: (t3.b > t2.a) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=100.16..453.19 rows=1 width=4) + -> Nested Loop (cost=0.16..353.18 rows=1 width=4) + Join Filter: (t1.a = t2.a) + -> Index Scan using idx_nestloop_suppression1_b on nestloop_suppression1 t1 (cost=0.16..8.18 rows=1 width=4) + Index Cond: (b = 2) + Filter: (((d)::text ~~ 'char%'::text) AND (c = 3)) + -> Seq Scan on nestloop_suppression2 t2 (cost=0.00..220.00 rows=10000 width=4) + -> Materialize (cost=100.00..141.75 rows=100 width=4) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=100.00..141.50 rows=100 width=4) + -> Seq Scan on nestloop_suppression3 t3 (cost=0.00..41.00 rows=100 width=4) +(12 rows) + +set enable_nestloop_suppression = true; +explain select t3.b from nestloop_suppression1 t1, nestloop_suppression2 t2, nestloop_suppression3 t3 + where t1.b=2 and t1.c=3 and t1.d like 'char%' and t1.a=t2.a and t3.b>t2.a; + QUERY PLAN +------------------------------------------------------------------------------------------------------------------------------------ + Nested Loop (cost=200.16..621.19 rows=33 width=4) + Join Filter: (t3.b > t2.a) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=100.16..478.19 rows=1 width=4) + -> Nested Loop (cost=0.16..378.19 rows=1 width=4) + Join Filter: (t1.a = t2.a) + -> Seq Scan on nestloop_suppression2 t2 (cost=0.00..220.00 rows=10000 width=4) + -> Materialize (cost=0.16..8.19 rows=1 width=4) + -> Index Scan using idx_nestloop_suppression1_b on nestloop_suppression1 t1 (cost=0.16..8.18 rows=1 width=4) + Index Cond: (b = 2) + Filter: (((d)::text ~~ 'char%'::text) AND (c = 3)) + -> Materialize (cost=100.00..141.75 rows=100 width=4) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=100.00..141.50 rows=100 width=4) + -> Seq Scan on nestloop_suppression3 t3 (cost=0.00..41.00 rows=100 width=4) +(13 rows) + +drop table nestloop_suppression1; +drop table nestloop_suppression2; +drop table nestloop_suppression3; +reset enable_nestloop_suppression; +reset enable_hashjoin; diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out index 06796d76..0422edd6 100644 --- a/src/test/regress/expected/sysviews.out +++ b/src/test/regress/expected/sysviews.out @@ -109,6 +109,7 @@ select name, setting from pg_settings where name like 'enable%'; enable_multi_cluster | on enable_multi_cluster_print | off enable_nestloop | on + enable_nestloop_suppression | off enable_null_string | off enable_oracle_compatible | off enable_parallel_ddl | on @@ -126,7 +127,7 @@ select name, setting from pg_settings where name like 'enable%'; enable_tidscan | on enable_transparent_crypt | on enable_user_authority_force_check | off -(55 rows) +(56 rows) -- Test that the pg_timezone_names and pg_timezone_abbrevs views are -- more-or-less working. 
We can't test their contents in any great detail diff --git a/src/test/regress/sql/join.sql b/src/test/regress/sql/join.sql index dceca27f..ee870752 100644 --- a/src/test/regress/sql/join.sql +++ b/src/test/regress/sql/join.sql @@ -1987,3 +1987,32 @@ where exists (select 1 from j3 and t1.unique1 < 1; drop table j3; + +-- +-- Test nestloop path suppression if the selectivity could be under estimated +-- +create table nestloop_suppression1(a int, b int, c int, d varchar(20)); +create table nestloop_suppression2(a int, b int, c int, d varchar(20)); +create table nestloop_suppression3(a int, b int); + +insert into nestloop_suppression1 select i, i+1, i+2, 'char'||i from generate_series(1,10000) i; +insert into nestloop_suppression2 select i, i+1, i+2, 'char'||i from generate_series(1,10000) i; +insert into nestloop_suppression3 select i, i+1 from generate_series(1,100) i; +create index idx_nestloop_suppression1_b on nestloop_suppression1(b); +analyze nestloop_suppression1; +analyze nestloop_suppression2; +analyze nestloop_suppression3; + +set enable_hashjoin = false; +explain select t3.b from nestloop_suppression1 t1, nestloop_suppression2 t2, nestloop_suppression3 t3 + where t1.b=2 and t1.c=3 and t1.d like 'char%' and t1.a=t2.a and t3.b>t2.a; +set enable_nestloop_suppression = true; +explain select t3.b from nestloop_suppression1 t1, nestloop_suppression2 t2, nestloop_suppression3 t3 + where t1.b=2 and t1.c=3 and t1.d like 'char%' and t1.a=t2.a and t3.b>t2.a; + +drop table nestloop_suppression1; +drop table nestloop_suppression2; +drop table nestloop_suppression3; + +reset enable_nestloop_suppression; +reset enable_hashjoin; \ No newline at end of file From db62e69e0f182cd1818d8c3368ca15da74459654 Mon Sep 17 00:00:00 2001 From: ericxwu Date: Tue, 22 Sep 2020 11:37:39 +0800 Subject: [PATCH 058/578] Support inline CTE with multiple references http://tapd.oa.com/20421696/bugtrace/bugs/view?bug_id=1020421696082218007 --- src/backend/optimizer/plan/subselect.c | 104 ++++++++++++++++++++++++ src/test/regress/expected/subselect.out | 43 +++++----- 2 files changed, 126 insertions(+), 21 deletions(-) diff --git a/src/backend/optimizer/plan/subselect.c b/src/backend/optimizer/plan/subselect.c index f7832d4d..e8495fbc 100644 --- a/src/backend/optimizer/plan/subselect.c +++ b/src/backend/optimizer/plan/subselect.c @@ -98,6 +98,7 @@ #include "pgxc/pgxc.h" #endif #ifdef __TBASE__ +#include #include "nodes/pg_list.h" #include "parser/parse_oper.h" #include "parser/parse_func.h" @@ -1383,6 +1384,80 @@ hash_ok_operator(OpExpr *expr) } } +#ifdef __TBASE__ +/* + * Check if total cost of inlining to multiple subquery is cheaper. + * + * There are three alternatives to optimize CTE with multiple references. + * XXX Keep the CTE as an optimization fence, using materialized CTE scan could + * be cost saving. But in TBase distributed system, this will lead to more + * executor nodes perfored in CN, which could be much slower. + * XXX Inline the CTE to multiple subqueries. This could leverage more join + * reordering and predicate pushdown opetimization automatically. + * XXX Inline the CTE to some of the reference place(s). This need an overall + * cost based optimizer including CTE inline and sublink pullup phase, + * postgres optimizer does not support this yet. 
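+ *
+ * As a hypothetical example (not taken from this patch), a query such as
+ *
+ *     WITH x AS (SELECT a, b FROM t)
+ *     SELECT * FROM x x1 JOIN x x2 ON x1.a = x2.a;
+ *
+ * references the CTE twice: inlining charges the subquery cost once per
+ * reference (discounted for the extra pushdown it enables), while keeping
+ * the CTE runs the plan once but pays the materialization overhead. The
+ * comparison below picks whichever estimate is cheaper.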
+ */ +static bool +is_cte_worth_inline(CommonTableExpr *cte, Plan *plan, Path *path) +{ + Cost inline_total_cost = 0; + Cost cte_total_cost = 0; + Cost material_cost = 0; + double material_bytes = 0; + long work_mem_bytes = work_mem * 1024L; + + /* Force pullup multi-reference CTE when enable_pullup_subquery enabled */ + if (enable_pullup_subquery) + return true; + + /* Num bytes to be materialized by CTE */ + material_bytes = plan->plan_rows * plan->plan_width; + + /* + * Whether spilling or not, charge 2x cpu_operator_cost per tuple to + * reflect bookkeeping overhead. (This rate must be more than what + * cost_rescan charges for materialize, ie, cpu_operator_cost per tuple; + * if it is exactly the same then there will be a cost tie between + * nestloop with A outer, materialized B inner and nestloop with B outer, + * materialized A inner. The extra cost ensures we'll prefer + * materializing the smaller rel.) Note that this is normally a good deal + * less than cpu_tuple_cost; which is OK because a Material plan node + * doesn't do qual-checking or projection, so it's got less overhead than + * most plan nodes. + */ + material_cost += 2 * cpu_operator_cost * plan->plan_rows; + + /* + * If we will spill to disk, charge at the rate of seq_page_cost per page. + * This cost is assumed to be evenly spread through the plan run phase, + * which isn't exactly accurate but our cost model doesn't allow for + * nonuniform costs within the run phase. + */ + if (material_bytes > work_mem_bytes) + { + double npages = ceil(material_bytes / BLCKSZ); + + material_bytes += seq_page_cost * npages; + } + + /* Calculate total costs for different options */ + cte_total_cost = plan->total_cost + material_cost; + inline_total_cost = plan->total_cost * cte->cterefcount; + + /* + * In a distributed system like TBase, the inline one could leverage more + * optimizations like subquery pullup, predicate pushdown, etc. We add a + * optimization factor 0.5 here to show case these cost saves. + */ + inline_total_cost = inline_total_cost * 0.5; + + if (inline_total_cost <= cte_total_cost) + return true; + else + return false; +} +#endif /* * SS_process_ctes: process a query's WITH list @@ -1508,6 +1583,35 @@ SS_process_ctes(PlannerInfo *root) plan = create_plan(subroot, best_path); +#ifdef __TBASE__ + /* + * Handle the CTE with multiple references in the main query. Since we + * need to compare the cost between CTE Scan and inline subquery Scan, + * perform the inline check after we got the best path of CTE subquery. + */ + if ((cte->ctematerialized == CTEMaterializeNever || + (cte->ctematerialized == CTEMaterializeDefault && + cte->cterefcount > 1)) && + !cte->cterecursive && + cmdType == CMD_SELECT && + !contain_dml(cte->ctequery) && + (cte->cterefcount <= 1 || + !contain_outer_selfref(cte->ctequery)) && + !contain_volatile_functions(cte->ctequery)) + { + /* + * Check if total cost of inlining to multiple subquery is cheaper. + */ + if (is_cte_worth_inline(cte, plan, best_path)) + { + inline_cte(root, cte); + /* Make a dummy entry in cte_plan_ids */ + root->cte_plan_ids = lappend_int(root->cte_plan_ids, -1); + continue; + } + } +#endif + #ifdef XCP /* Add a remote subplan, if redistribution is needed. 
*/ if (subroot->distribution) diff --git a/src/test/regress/expected/subselect.out b/src/test/regress/expected/subselect.out index 52c196aa..85af9fb1 100644 --- a/src/test/regress/expected/subselect.out +++ b/src/test/regress/expected/subselect.out @@ -1787,27 +1787,28 @@ select * from x where f1 = 1; explain (verbose, costs off) with x as (select * from (select f1, now() as n from subselect_tbl) ss) select * from x, x x2 where x.n = x2.n; - QUERY PLAN -------------------------------------------------------------- - Merge Join - Output: x.f1, x.n, x2.f1, x2.n - Merge Cond: (x.n = x2.n) - CTE x - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Output: subselect_tbl.f1, now() - -> Seq Scan on public.subselect_tbl - Output: subselect_tbl.f1, now() - -> Sort - Output: x.f1, x.n - Sort Key: x.n - -> CTE Scan on x - Output: x.f1, x.n - -> Sort - Output: x2.f1, x2.n - Sort Key: x2.n - -> CTE Scan on x x2 - Output: x2.f1, x2.n -(18 rows) + QUERY PLAN +-------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + Output: subselect_tbl.f1, now(), subselect_tbl_1.f1, now() + -> Result + Output: subselect_tbl.f1, (now()), subselect_tbl_1.f1, (now()) + One-Time Filter: (now() = now()) + -> Nested Loop + Output: subselect_tbl.f1, (now()), subselect_tbl_1.f1, (now()) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: subselect_tbl.f1, now() + Distribute results by H: now() + -> Seq Scan on public.subselect_tbl + Output: subselect_tbl.f1, now() + -> Materialize + Output: subselect_tbl_1.f1, (now()) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: subselect_tbl_1.f1, now() + Distribute results by H: now() + -> Seq Scan on public.subselect_tbl subselect_tbl_1 + Output: subselect_tbl_1.f1, now() +(19 rows) explain (verbose, costs off) with x as not materialized (select * from (select f1, now() as n from subselect_tbl) ss) From 6d816ab54dbcf37787b462c66ea664a7ca9b8d53 Mon Sep 17 00:00:00 2001 From: qiannzhang Date: Wed, 23 Sep 2020 09:31:40 +0800 Subject: [PATCH 059/578] Fix bug of send concurrently by DataPumpRawSendData and pq_flush in DN. http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131080938551 --- src/backend/utils/error/elog.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/src/backend/utils/error/elog.c b/src/backend/utils/error/elog.c index 58ff0658..ca77995a 100644 --- a/src/backend/utils/error/elog.c +++ b/src/backend/utils/error/elog.c @@ -1591,12 +1591,31 @@ EmitErrorReport(void) /* Send to client, if enabled */ if (edata->output_to_client) { - if (true == g_enable_copy_silence) + if (true == g_enable_copy_silence || + (IS_PGXC_DATANODE && edata->elevel < ERROR)) { + /* + * Do not send nonfatal msg to client for Datanode. + * + * It is possible that DataPumpRawSendData is sending data now, + * and this msg can be mixed with data message + * if the socket is written concurrently. + * + * In addition, the msg is not that important. + */ ; } else { + /* + * For the same reason as above, shut down producer for Datanode + * before send ERROR/FATAL msg. + * It is ok to shut down it again in AbortTransaction. 
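+			 * (For example, if the squeue producer is still streaming rows
+			 * through DataPumpRawSendData when the ERROR is raised, sending
+			 * the error message directly could interleave it with a data
+			 * message on the same client socket.)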
+ */ + if (IS_PGXC_DATANODE) + { + SqueueProducerExit(); + } send_message_to_frontend(edata); } } From 524ea3d76c6a18a96c01a1b854f82cb491806c7b Mon Sep 17 00:00:00 2001 From: ericxwu Date: Wed, 23 Sep 2020 17:57:57 +0800 Subject: [PATCH 060/578] Improve regress stability --- src/test/regress/expected/join_3.out | 46 +++++++++---------- src/test/regress/expected/rowsecurity_1.out | 2 +- .../regress/expected/select_parallel_4.out | 18 ++++---- src/test/regress/sql/join.sql | 6 ++- src/test/regress/sql/rowsecurity.sql | 2 +- 5 files changed, 37 insertions(+), 37 deletions(-) diff --git a/src/test/regress/expected/join_3.out b/src/test/regress/expected/join_3.out index a133332a..ce5f9512 100644 --- a/src/test/regress/expected/join_3.out +++ b/src/test/regress/expected/join_3.out @@ -2979,6 +2979,7 @@ select * from int4_tbl a full join int4_tbl b on false order by 1,2; -- -- test for ability to use a cartesian join when necessary -- +set enable_hashjoin = false; explain (num_nodes off, nodes off, costs off) select * from tenk1 join int4_tbl on f1 = twothousand, @@ -2987,8 +2988,8 @@ select * from where q1 = thousand or q2 = thousand; QUERY PLAN ------------------------------------------------------------------------------------ - Hash Join - Hash Cond: (tenk1.twothousand = int4_tbl.f1) + Nested Loop + Join Filter: (tenk1.twothousand = int4_tbl.f1) -> Nested Loop -> Nested Loop -> Function Scan on q1 @@ -3002,7 +3003,7 @@ where q1 = thousand or q2 = thousand; Index Cond: (q1.q1 = thousand) -> Bitmap Index Scan on tenk1_thous_tenthous Index Cond: (q2.q2 = thousand) - -> Hash + -> Materialize -> Remote Subquery Scan on all -> Seq Scan on int4_tbl (18 rows) @@ -3015,8 +3016,8 @@ select * from where thousand = (q1 + q2); QUERY PLAN -------------------------------------------------------------------------- - Hash Join - Hash Cond: (tenk1.twothousand = int4_tbl.f1) + Nested Loop + Join Filter: (tenk1.twothousand = int4_tbl.f1) -> Nested Loop -> Nested Loop -> Function Scan on q1 @@ -3027,41 +3028,38 @@ where thousand = (q1 + q2); Recheck Cond: (thousand = (q1.q1 + q2.q2)) -> Bitmap Index Scan on tenk1_thous_tenthous Index Cond: (thousand = (q1.q1 + q2.q2)) - -> Hash + -> Materialize -> Remote Subquery Scan on all -> Seq Scan on int4_tbl (15 rows) +set enable_hashjoin = true; -- -- test ability to generate a suitable plan for a star-schema query -- +set enable_mergejoin = false; explain (costs off) select * from tenk1, int8_tbl a, int8_tbl b where thousand = a.q1 and tenthous = b.q1 and a.q2 = 1 and b.q2 = 2; - QUERY PLAN ---------------------------------------------------------- + QUERY PLAN +-------------------------------------------------- Remote Fast Query Execution Node/s: datanode_1, datanode_2 - -> Merge Join - Merge Cond: (tenk1.thousand = a.q1) - -> Sort - Sort Key: tenk1.thousand - -> Merge Join - Merge Cond: (tenk1.tenthous = b.q1) - -> Sort - Sort Key: tenk1.tenthous - -> Seq Scan on tenk1 - -> Sort - Sort Key: b.q1 - -> Seq Scan on int8_tbl b - Filter: (q2 = 2) - -> Sort - Sort Key: a.q1 + -> Hash Join + Hash Cond: (tenk1.thousand = a.q1) + -> Hash Join + Hash Cond: (tenk1.tenthous = b.q1) + -> Seq Scan on tenk1 + -> Hash + -> Seq Scan on int8_tbl b + Filter: (q2 = 2) + -> Hash -> Seq Scan on int8_tbl a Filter: (q2 = 1) -(19 rows) +(13 rows) +set enable_mergejoin = true; -- -- test a corner case in which we shouldn't apply the star-schema optimization -- diff --git a/src/test/regress/expected/rowsecurity_1.out b/src/test/regress/expected/rowsecurity_1.out index 60160a5a..e0336e73 100644 
--- a/src/test/regress/expected/rowsecurity_1.out +++ b/src/test/regress/expected/rowsecurity_1.out @@ -2260,7 +2260,7 @@ EXPLAIN (COSTS OFF) SELECT * FROM rls_view; -- Query as view/table owner. Should return all records. SET SESSION AUTHORIZATION regress_rls_alice; -SELECT * FROM rls_view; +SELECT * FROM rls_view order by 1; a | b ---+----- 1 | aba diff --git a/src/test/regress/expected/select_parallel_4.out b/src/test/regress/expected/select_parallel_4.out index 3ae6bc47..4d264b26 100644 --- a/src/test/regress/expected/select_parallel_4.out +++ b/src/test/regress/expected/select_parallel_4.out @@ -139,19 +139,19 @@ alter table tenk2 set (parallel_workers = 0); explain (costs off) select count(*) from tenk1 where (two, four) not in (select hundred, thousand from tenk2 where thousand > 100); - QUERY PLAN ------------------------------------------------------------------------------------------------- - Aggregate + QUERY PLAN +------------------------------------------------------------------------------------- + Finalize Aggregate -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Gather Workers Planned: 4 - -> Hash Anti Join - Hash Cond: ((tenk1.two = tenk2.hundred) AND (tenk1.four = tenk2.thousand)) + -> Partial Aggregate -> Parallel Seq Scan on tenk1 - -> Hash - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Seq Scan on tenk2 - Filter: (thousand > 100) + Filter: (NOT (hashed SubPlan 1)) + SubPlan 1 + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on tenk2 + Filter: (thousand > 100) (11 rows) select count(*) from tenk1 where (two, four) not in diff --git a/src/test/regress/sql/join.sql b/src/test/regress/sql/join.sql index ee870752..16e8dd0b 100644 --- a/src/test/regress/sql/join.sql +++ b/src/test/regress/sql/join.sql @@ -884,7 +884,7 @@ select * from int4_tbl a full join int4_tbl b on false order by 1,2; -- -- test for ability to use a cartesian join when necessary -- - +set enable_hashjoin = false; explain (num_nodes off, nodes off, costs off) select * from tenk1 join int4_tbl on f1 = twothousand, @@ -898,15 +898,17 @@ select * from int4(sin(1)) q1, int4(sin(0)) q2 where thousand = (q1 + q2); +set enable_hashjoin = true; -- -- test ability to generate a suitable plan for a star-schema query -- - +set enable_mergejoin = false; explain (costs off) select * from tenk1, int8_tbl a, int8_tbl b where thousand = a.q1 and tenthous = b.q1 and a.q2 = 1 and b.q2 = 2; +set enable_mergejoin = true; -- -- test a corner case in which we shouldn't apply the star-schema optimization diff --git a/src/test/regress/sql/rowsecurity.sql b/src/test/regress/sql/rowsecurity.sql index bd588af8..3fa55ccc 100644 --- a/src/test/regress/sql/rowsecurity.sql +++ b/src/test/regress/sql/rowsecurity.sql @@ -892,7 +892,7 @@ EXPLAIN (COSTS OFF) SELECT * FROM rls_view; -- Query as view/table owner. Should return all records. 
SET SESSION AUTHORIZATION regress_rls_alice; -SELECT * FROM rls_view; +SELECT * FROM rls_view order by 1; EXPLAIN (COSTS OFF) SELECT * FROM rls_view; DROP VIEW rls_view; From 3d357481bd11159ffbcd3d3a7fed2cc7ab03445a Mon Sep 17 00:00:00 2001 From: qiannzhang Date: Thu, 24 Sep 2020 16:14:12 +0800 Subject: [PATCH 061/578] remove elog in GTMSetSockKeepAlive --- src/backend/access/transam/gtm.c | 9 +++++++-- src/gtm/client/fe-connect.c | 17 ++++++++++------- src/include/gtm/libpq-fe.h | 2 +- 3 files changed, 18 insertions(+), 10 deletions(-) diff --git a/src/backend/access/transam/gtm.c b/src/backend/access/transam/gtm.c index 9ec7287d..81bb209f 100644 --- a/src/backend/access/transam/gtm.c +++ b/src/backend/access/transam/gtm.c @@ -1297,8 +1297,13 @@ InitGTM(void) } else { - GTMSetSockKeepAlive(conn, tcp_keepalives_idle, - tcp_keepalives_interval, tcp_keepalives_count); + if (!GTMSetSockKeepAlive(conn, tcp_keepalives_idle, + tcp_keepalives_interval, tcp_keepalives_count)) + { + ereport(LOG, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("GTMSetSockKeepAlive failed: %m"))); + } if (IS_PGXC_COORDINATOR) { register_session(conn, PGXCNodeName, MyProcPid, MyBackendId); diff --git a/src/gtm/client/fe-connect.c b/src/gtm/client/fe-connect.c index 1e3b712f..6e3b0306 100644 --- a/src/gtm/client/fe-connect.c +++ b/src/gtm/client/fe-connect.c @@ -1427,7 +1427,7 @@ GTMPQuntrace(GTM_Conn *conn) * Set socket keepalive and user_timeout. * We can use this to detect the broken connection quickly. */ -void +bool GTMSetSockKeepAlive(GTM_Conn *conn, int tcp_keepalives_idle, int tcp_keepalives_interval, int tcp_keepalives_count) { @@ -1442,35 +1442,36 @@ GTMSetSockKeepAlive(GTM_Conn *conn, int tcp_keepalives_idle, getsockopt(sock, IPPROTO_TCP, TCP_INFO, &info, (socklen_t *)&len); if (info.tcpi_state != TCP_ESTABLISHED) { - return; + /* No need to set */ + return true; } /* set keepalive */ if (setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, (char *)&keepalive, sizeof(keepalive)) < 0) { - elog(LOG, "GTMSetSockKeepAlive setsockopt(SO_KEEPALIVE) failed: %m"); + return false; } if (tcp_keepalives_idle > 0 && setsockopt(sock, IPPROTO_TCP, TCP_KEEPIDLE, (char *)&tcp_keepalives_idle, sizeof(tcp_keepalives_idle)) < 0) { - elog(LOG, "GTMSetSockKeepAlive setsockopt(TCP_KEEPIDLE) failed: %m"); + return false; } if (tcp_keepalives_interval > 0 && setsockopt(sock, IPPROTO_TCP, TCP_KEEPINTVL, (char *)&tcp_keepalives_interval, sizeof(tcp_keepalives_interval)) < 0) { - elog(LOG, "GTMSetSockKeepAlive setsockopt(TCP_KEEPINTVL) failed: %m"); + return false; } if (tcp_keepalives_count > 0 && setsockopt(sock, IPPROTO_TCP, TCP_KEEPCNT, (char *)&tcp_keepalives_count, sizeof(tcp_keepalives_count)) < 0) { - elog(LOG, "GTMSetSockKeepAlive setsockopt(TCP_KEEPCNT) failed: %m"); + return false; } /* set user_timeout */ @@ -1479,6 +1480,8 @@ GTMSetSockKeepAlive(GTM_Conn *conn, int tcp_keepalives_idle, (char *)&user_timeout, sizeof(user_timeout)) < 0) { - elog(LOG, "GTMSetSockKeepAlive setsockopt(TCP_USER_TIMEOUT) failed: %m"); + return false; } + + return true; } \ No newline at end of file diff --git a/src/include/gtm/libpq-fe.h b/src/include/gtm/libpq-fe.h index 54058e5f..d5e3d2fb 100644 --- a/src/include/gtm/libpq-fe.h +++ b/src/include/gtm/libpq-fe.h @@ -130,7 +130,7 @@ extern void GTMPQuntrace(GTM_Conn *conn); /* Force the write buffer to be written (or at least try) */ extern int PQflush(GTM_Conn *conn); -extern void GTMSetSockKeepAlive(GTM_Conn *conn, int tcp_keepalives_idle, +extern bool GTMSetSockKeepAlive(GTM_Conn *conn, int 
tcp_keepalives_idle, int tcp_keepalives_interval, int tcp_keepalives_count); #define libpq_gettext(x) x From 1efbc57281b85eefa01f388161bd48b42a061162 Mon Sep 17 00:00:00 2001 From: yeyukui Date: Fri, 25 Sep 2020 09:38:02 +0800 Subject: [PATCH 062/578] Fix the problem that the deleted table still leaves security metadata in the security feature --- src/backend/commands/user.c | 17 ++------ src/backend/utils/misc/cls.c | 56 +++++++++++++++++++++++++ src/backend/utils/misc/mls.c | 46 ++++++++++---------- src/include/utils/cls.h | 2 + src/test/regress/expected/mls_check.out | 49 +++++++++++++++++++++- src/test/regress/sql/mls_check.sql | 17 +++++++- 6 files changed, 150 insertions(+), 37 deletions(-) diff --git a/src/backend/commands/user.c b/src/backend/commands/user.c index 8d35b8c6..038119db 100644 --- a/src/backend/commands/user.c +++ b/src/backend/commands/user.c @@ -1178,23 +1178,12 @@ DropRole(DropRoleStmt *stmt) errdetail_log("%s", detail_log))); #ifdef _MLS_ - if (true == mls_check_role_permission(roleid)) + if (true == mls_check_role_permission(roleid) || + true == cls_check_user_has_policy(roleid)) { - elog(ERROR, "could not drop role:%s, cause this role has mls policy bound", + elog(ERROR, "could not drop role:%s, cause this role has mls poilcy bound", role); } - - if (!is_mls_user() && userid_is_mls_user(roleid)) - { - elog(ERROR, "non-mls user could not drop mls role:%s, permission denied", - role); - } - - if(is_mls_user() && !userid_is_mls_user(roleid)) - { - elog(ERROR, "mls user could not drop role:%s, permission denied", - role); - } #endif /* * Remove the role from the pg_authid table diff --git a/src/backend/utils/misc/cls.c b/src/backend/utils/misc/cls.c index 48366cac..916968ad 100644 --- a/src/backend/utils/misc/cls.c +++ b/src/backend/utils/misc/cls.c @@ -1109,5 +1109,61 @@ bool cls_check_table_col_has_policy(Oid relid, int attnum) return false; } +/* + * check table has policy + */ +bool cls_check_table_has_policy(Oid relid) +{ + int16 attnum = InvalidAttrNumber; + + attnum = cls_check_table_has_cls_policy(relid); + if (attnum != InvalidAttrNumber) + { + return true; + } + return false; +} + +/* + * check user whether has policy + */ +bool cls_check_user_has_policy(Oid roleid) +{ + SysScanDesc scan; + ScanKeyData skey[1]; + HeapTuple htup; + Relation rel; + bool found = false; + + ScanKeyInit(&skey[0], + Anum_pg_cls_user_userid, + BTEqualStrategyNumber, + F_OIDEQ, + ObjectIdGetDatum(roleid)); + + rel = heap_open(ClsUserRelationId, AccessShareLock); + scan = systable_beginscan(rel, + PgClsUserPolidUseridIndexId, + true, + NULL, + 1, + skey); + + while (HeapTupleIsValid(htup = systable_getnext(scan))) + { + Form_pg_cls_user form_cls_user = (Form_pg_cls_user) GETSTRUCT(htup); + + if (form_cls_user) + { + found = true; + break; + } + } + + systable_endscan(scan); + heap_close(rel, AccessShareLock); + + return found; +} #endif diff --git a/src/backend/utils/misc/mls.c b/src/backend/utils/misc/mls.c index 2c4b3795..c29ed21c 100644 --- a/src/backend/utils/misc/mls.c +++ b/src/backend/utils/misc/mls.c @@ -363,11 +363,8 @@ Datum pg_trsprt_crypt_support_datatype(PG_FUNCTION_ARGS) */ bool mls_check_relation_permission(Oid relid, bool * schema_bound) { - bool found; Oid parent_oid; - found = false; - if (!IS_SYSTEM_REL(relid)) { if (schema_bound) @@ -377,20 +374,27 @@ bool mls_check_relation_permission(Oid relid, bool * schema_bound) parent_oid = mls_get_parent_oid_by_relid(relid); - found = datamask_check_table_has_datamask(parent_oid); - if (true == found) + if 
(datamask_check_table_has_datamask(parent_oid) || + datamask_check_table_has_datamask(relid)) + { + return true; + } + + if (transparent_crypt_check_table_has_crypto(parent_oid, true, schema_bound) || + transparent_crypt_check_table_has_crypto(relid, true, schema_bound)) { - return found; + return true; } - found = trsprt_crypt_check_table_has_crypt(parent_oid, true, schema_bound); - if (true == found) + if (cls_check_table_has_policy(parent_oid) || + cls_check_table_has_policy(relid)) { - return found; + return true; } + } - return found; + return false; } bool mls_check_schema_permission(Oid schemaoid) @@ -429,31 +433,31 @@ bool mls_check_schema_permission(Oid schemaoid) bool mls_check_column_permission(Oid relid, int attnum) { Oid parent_oid; - bool found = false; if (!IS_SYSTEM_REL(relid)) { parent_oid = mls_get_parent_oid_by_relid(relid); - found = dmask_check_table_col_has_dmask(parent_oid, attnum); - if (true == found) + + if (datamask_check_table_col_has_datamask(parent_oid, attnum) || + datamask_check_table_col_has_datamask(relid, attnum)) { - return found; + return true; } - found = trsprt_crypt_chk_tbl_col_has_crypt(parent_oid, attnum); - if (true == found) + if (transparent_crypt_check_table_col_has_crypto(parent_oid, attnum) || + transparent_crypt_check_table_col_has_crypto(relid, attnum)) { - return found; + return true; } - found = cls_check_table_col_has_policy(parent_oid, attnum); - if (true == found) + if (cls_check_table_col_has_policy(parent_oid, attnum) || + cls_check_table_col_has_policy(relid, attnum)) { - return found; + return true; } } - return found; + return false; } diff --git a/src/include/utils/cls.h b/src/include/utils/cls.h index 3a91435c..2d753eec 100644 --- a/src/include/utils/cls.h +++ b/src/include/utils/cls.h @@ -84,5 +84,7 @@ extern void mls_update_cls_with_current_user(TupleTableSlot *slot); extern bool mls_cls_column_add_check(char * colname, Oid typoid); extern bool mls_cls_column_drop_check(char * name); extern bool cls_check_table_col_has_policy(Oid relid, int attnum); +extern bool cls_check_table_has_policy(Oid relid); +extern bool cls_check_user_has_policy(Oid relid); #endif diff --git a/src/test/regress/expected/mls_check.out b/src/test/regress/expected/mls_check.out index 0e5b955d..371bfe10 100644 --- a/src/test/regress/expected/mls_check.out +++ b/src/test/regress/expected/mls_check.out @@ -3776,6 +3776,34 @@ select MLS_TRANSPARENT_CRYPT_ALGORITHM_UNBIND_SCHEMA('crypted_schema_alt_2'); t (1 row) +-- child table has bind +select algorithm_id, nspname, tblname from pg_transparent_crypt_policy_map where nspname ilike '%alt%' order by 1,2,3; + algorithm_id | nspname | tblname +--------------+-------------------------+------------------------ + 4 | no_crypted_schema_alt_2 | tbl_crypt_alt_2_part_0 + 4 | no_crypted_schema_alt_2 | tbl_crypt_alt_2_part_1 + 4 | no_crypted_schema_alt_2 | tbl_crypt_alt_2_part_2 +(3 rows) + +--clean child +select MLS_TRANSPARENT_CRYPT_ALGORITHM_UNBIND_TABLE('no_crypted_schema_alt_2', 'tbl_crypt_alt_2_part_0'); + mls_transparent_crypt_algorithm_unbind_table +---------------------------------------------- + t +(1 row) + +select MLS_TRANSPARENT_CRYPT_ALGORITHM_UNBIND_TABLE('no_crypted_schema_alt_2', 'tbl_crypt_alt_2_part_1'); + mls_transparent_crypt_algorithm_unbind_table +---------------------------------------------- + t +(1 row) + +select MLS_TRANSPARENT_CRYPT_ALGORITHM_UNBIND_TABLE('no_crypted_schema_alt_2', 'tbl_crypt_alt_2_part_2'); + mls_transparent_crypt_algorithm_unbind_table 
+---------------------------------------------- + t +(1 row) + \c - godlike drop table no_crypted_schema_alt.tbl_crypted_alt; drop table no_crypted_schema_alt.tbl_nocrypt_alt; @@ -5001,7 +5029,7 @@ select * from xixi where i = 3; (1 row) \c - badboy ---fails to update +--fails to update insert into xixi as x(i,j) values(6,6) on conflict(i) do update set j = 3096 where x.j = 2048 and x.i = 6; select * from xixi where i = 6; i | j | _cls @@ -5534,7 +5562,26 @@ truncate table lala3; drop table lala; drop table lala2; drop table lala3; +\c - mls_admin +select * from pg_cls_table; + polid | attnum | relid | enable | nspname | tblname | reloptions +-------+--------+-------+--------+---------+---------+------------ + 99 | 3 | 17061 | t | public | xixi | +(1 row) + +select MLS_CLS_DROP_TABLE_LABEL('cls_compare', 'public', 'xixi'); + mls_cls_drop_table_label +-------------------------- + t +(1 row) + +select * from pg_cls_table; + polid | attnum | relid | enable | nspname | tblname | reloptions +-------+--------+-------+--------+---------+---------+------------ +(0 rows) + --everything is done +\c - godlike drop table xixi; drop table momo; -----------------CLS END-------------------- diff --git a/src/test/regress/sql/mls_check.sql b/src/test/regress/sql/mls_check.sql index 208fd38b..4369a706 100644 --- a/src/test/regress/sql/mls_check.sql +++ b/src/test/regress/sql/mls_check.sql @@ -1416,6 +1416,15 @@ select MLS_TRANSPARENT_CRYPT_ALGORITHM_UNBIND_TABLE('no_crypted_schema_alt', 'tb select MLS_TRANSPARENT_CRYPT_ALGORITHM_UNBIND_TABLE('no_crypted_schema_alt_2', 'tbl_crypt_alt_2'); select MLS_TRANSPARENT_CRYPT_ALGORITHM_UNBIND_SCHEMA('crypted_schema_alt'); select MLS_TRANSPARENT_CRYPT_ALGORITHM_UNBIND_SCHEMA('crypted_schema_alt_2'); + +-- child table has bind +select algorithm_id, nspname, tblname from pg_transparent_crypt_policy_map where nspname ilike '%alt%' order by 1,2,3; + +--clean child +select MLS_TRANSPARENT_CRYPT_ALGORITHM_UNBIND_TABLE('no_crypted_schema_alt_2', 'tbl_crypt_alt_2_part_0'); +select MLS_TRANSPARENT_CRYPT_ALGORITHM_UNBIND_TABLE('no_crypted_schema_alt_2', 'tbl_crypt_alt_2_part_1'); +select MLS_TRANSPARENT_CRYPT_ALGORITHM_UNBIND_TABLE('no_crypted_schema_alt_2', 'tbl_crypt_alt_2_part_2'); + \c - godlike drop table no_crypted_schema_alt.tbl_crypted_alt; drop table no_crypted_schema_alt.tbl_nocrypt_alt; @@ -1923,7 +1932,7 @@ select * from xixi where i = 3; insert into xixi as x(i,j) values(3,3) on conflict(i) do update set j = 2048 where x.j = 1024 and x.i = 3; select * from xixi where i = 3; \c - badboy ---fails to update +--fails to update insert into xixi as x(i,j) values(6,6) on conflict(i) do update set j = 3096 where x.j = 2048 and x.i = 6; select * from xixi where i = 6; \c - godlike @@ -2153,7 +2162,13 @@ drop table lala; drop table lala2; drop table lala3; +\c - mls_admin +select * from pg_cls_table; +select MLS_CLS_DROP_TABLE_LABEL('cls_compare', 'public', 'xixi'); +select * from pg_cls_table; + --everything is done +\c - godlike drop table xixi; drop table momo; From 0d82571a14784f41ec51376dbd07a5dff4d61210 Mon Sep 17 00:00:00 2001 From: jennyerchen Date: Fri, 25 Sep 2020 06:24:31 +0000 Subject: [PATCH 063/578] Merge branch 'sequence_curval' into 'Tbase_v5.04' (merge request !95) fix bug Incorrect acquisition of current session currval : http://tapd.oa.com/TBase_Oracle_Migration/bugtrace/bugs/view/1020421696082313799 fix bug Incorrect acquisition of current session currval : http://tapd.oa.com/TBase_Oracle_Migration/bugtrace/bugs/view/1020421696082313799 Signed-off-by: 
JennyJennyChen (cherry picked from commit cd43ab51) 0489b282 fix bug Incorrect acquisition of current session currval : http://tapd.oa.com/TBase_Oracle_Migration/bugtrace/bugs/view/1020421696082313799 Signed-off-by: JennyJennyChen --- src/backend/commands/sequence.c | 43 ++++++++++---------- src/test/regress/output/constraints.source | 2 +- src/test/regress/output/constraints_2.source | 2 +- src/test/regress/output/constraints_3.source | 2 +- 4 files changed, 25 insertions(+), 24 deletions(-) diff --git a/src/backend/commands/sequence.c b/src/backend/commands/sequence.c index bc7cb490..5b6fd741 100644 --- a/src/backend/commands/sequence.c +++ b/src/backend/commands/sequence.c @@ -996,28 +996,29 @@ nextval_internal(Oid relid, bool check_permissions) Datum currval_oid(PG_FUNCTION_ARGS) { - Oid relid = PG_GETARG_OID(0); - int64 result; - SeqTable elm; - Relation seqrel; - char *seqname = NULL; + Oid relid = PG_GETARG_OID(0); + int64 result; + SeqTable elm; + Relation seqrel; + char *seqname = NULL; + + /* open and lock sequence */ + init_sequence(relid, &elm, &seqrel); + + if (pg_class_aclcheck(elm->relid, GetUserId(), + ACL_SELECT | ACL_USAGE) != ACLCHECK_OK) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("permission denied for sequence %s", + RelationGetRelationName(seqrel)))); + + if (elm->last_valid) + { + result = elm->last; + relation_close(seqrel, NoLock); + PG_RETURN_INT64(result); + } - /* open and lock sequence */ - init_sequence(relid, &elm, &seqrel); - - if (pg_class_aclcheck(elm->relid, GetUserId(), - ACL_SELECT | ACL_USAGE) != ACLCHECK_OK) - ereport(ERROR, - (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), - errmsg("permission denied for sequence %s", - RelationGetRelationName(seqrel)))); -#if 0 - if (!elm->last_valid) - ereport(ERROR, - (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg("currval of sequence \"%s\" is not yet defined in this session", - RelationGetRelationName(seqrel)))); -#endif #ifdef XCP { /* diff --git a/src/test/regress/output/constraints.source b/src/test/regress/output/constraints.source index 568efec7..e19ef775 100644 --- a/src/test/regress/output/constraints.source +++ b/src/test/regress/output/constraints.source @@ -187,7 +187,7 @@ DETAIL: Failing row contains (8, Y, -8). SELECT 'eight' AS one, currval('insert_seq'); one | currval -------+--------- - eight | 8 + eight | 7 (1 row) -- According to SQL, it is OK to insert a record that gives rise to NULL diff --git a/src/test/regress/output/constraints_2.source b/src/test/regress/output/constraints_2.source index 46a83703..241adcc1 100644 --- a/src/test/regress/output/constraints_2.source +++ b/src/test/regress/output/constraints_2.source @@ -188,7 +188,7 @@ DETAIL: Failing row contains (9, Y, -9). SELECT 'eight' AS one, currval('insert_seq'); one | currval -------+--------- - eight | 9 + eight | 7 (1 row) -- According to SQL, it is OK to insert a record that gives rise to NULL diff --git a/src/test/regress/output/constraints_3.source b/src/test/regress/output/constraints_3.source index bbdb9c1e..e19ef775 100644 --- a/src/test/regress/output/constraints_3.source +++ b/src/test/regress/output/constraints_3.source @@ -187,7 +187,7 @@ DETAIL: Failing row contains (8, Y, -8). 
SELECT 'eight' AS one, currval('insert_seq'); one | currval -------+--------- - eight | 9 + eight | 7 (1 row) -- According to SQL, it is OK to insert a record that gives rise to NULL From 5db1bcbbd928fa9eab9673ac2d2a9044ddb9f447 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Thu, 24 Sep 2020 15:27:38 +0800 Subject: [PATCH 064/578] for gtm monitor ID859649763 --- src/gtm/client/fe-connect.c | 143 ++++---- src/gtm/client/fe-protocol.c | 541 ++++++++++++++++++------------- src/gtm/client/gtm_client.c | 100 ++++++ src/gtm/common/Makefile | 2 +- src/gtm/common/bloom.c | 282 ++++++++++++++++ src/gtm/common/datapump.c | 337 +++++++++++++++++++ src/gtm/common/elog.c | 136 ++++---- src/gtm/gtm_ctl/gtm_ctl.c | 449 ++++++++++++++++++------- src/gtm/libpq/pqformat.c | 2 +- src/gtm/main/Makefile | 2 +- src/gtm/main/gtm_seq.c | 83 +++-- src/gtm/main/gtm_stat.c | 253 ++++++++++++++- src/gtm/main/gtm_stat_error.c | 385 ++++++++++++++++++++++ src/gtm/main/gtm_store.c | 4 +- src/gtm/main/gtm_thread.c | 110 ++++--- src/gtm/main/main.c | 269 ++++++++++----- src/include/gtm/bloom.h | 39 +++ src/include/gtm/datapump.h | 52 +++ src/include/gtm/elog.h | 6 + src/include/gtm/gtm.h | 4 + src/include/gtm/gtm_c.h | 2 +- src/include/gtm/gtm_client.h | 272 ++++++++-------- src/include/gtm/gtm_msg.h | 7 +- src/include/gtm/gtm_stat.h | 86 +++++ src/include/gtm/gtm_stat_error.h | 56 ++++ 25 files changed, 2837 insertions(+), 785 deletions(-) create mode 100644 src/gtm/common/bloom.c create mode 100644 src/gtm/common/datapump.c create mode 100644 src/gtm/main/gtm_stat_error.c create mode 100644 src/include/gtm/bloom.h create mode 100644 src/include/gtm/datapump.h create mode 100644 src/include/gtm/gtm_stat.h create mode 100644 src/include/gtm/gtm_stat_error.h diff --git a/src/gtm/client/fe-connect.c b/src/gtm/client/fe-connect.c index 6e3b0306..97e6e0c5 100644 --- a/src/gtm/client/fe-connect.c +++ b/src/gtm/client/fe-connect.c @@ -929,76 +929,83 @@ freeGTM_Conn(GTM_Conn *conn) termGTMPQExpBuffer(&conn->errorMessage); termGTMPQExpBuffer(&conn->workBuffer); #ifdef XCP - if (conn->result) - { - /* Free last snapshot if defined */ - if (conn->result->gr_snapshot.sn_xip) - free(conn->result->gr_snapshot.sn_xip); - - /* Depending on result type there could be allocated data */ - switch (conn->result->gr_type) - { - case SEQUENCE_INIT_RESULT: - case SEQUENCE_RESET_RESULT: - case SEQUENCE_CLOSE_RESULT: - case SEQUENCE_RENAME_RESULT: - case SEQUENCE_ALTER_RESULT: - case SEQUENCE_SET_VAL_RESULT: - case MSG_DB_SEQUENCE_RENAME_RESULT: - if (conn->result->gr_resdata.grd_seqkey.gsk_key) - free(conn->result->gr_resdata.grd_seqkey.gsk_key); - break; - - case SEQUENCE_GET_NEXT_RESULT: - case SEQUENCE_GET_LAST_RESULT: - if (conn->result->gr_resdata.grd_seq.seqkey.gsk_key) - free(conn->result->gr_resdata.grd_seq.seqkey.gsk_key); - break; - - default: - break; - } - - -#ifdef __TBASE__ - if (conn->result->grd_storage_data.len && conn->result->grd_storage_data.data) - { - free(conn->result->grd_storage_data.data); - conn->result->grd_storage_data.data = NULL; - conn->result->grd_storage_data.len = 0; - } - - if (conn->result->grd_store_seq.count && conn->result->grd_store_seq.seqs) - { - free(conn->result->grd_store_seq.seqs); - conn->result->grd_store_seq.seqs = NULL; - conn->result->grd_store_seq.count = 0; - } - - if (conn->result->grd_store_txn.count && conn->result->grd_store_txn.txns) - { - free(conn->result->grd_store_txn.txns); - conn->result->grd_store_txn.txns = NULL; - conn->result->grd_store_txn.count = 0; - } - - if 
(conn->result->grd_store_check_seq.count && conn->result->grd_store_check_seq.seqs) - { - free(conn->result->grd_store_check_seq.seqs); - conn->result->grd_store_check_seq.seqs = NULL; - conn->result->grd_store_check_seq.count = 0; - } - - if (conn->result->grd_store_check_txn.count && conn->result->grd_store_check_txn.txns) + if (conn->result) + { + /* Free last snapshot if defined */ + if (conn->result->gr_snapshot.sn_xip) + free(conn->result->gr_snapshot.sn_xip); + + /* Depending on result type there could be allocated data */ + switch (conn->result->gr_type) + { + case SEQUENCE_INIT_RESULT: + case SEQUENCE_RESET_RESULT: + case SEQUENCE_CLOSE_RESULT: + case SEQUENCE_RENAME_RESULT: + case SEQUENCE_ALTER_RESULT: + case SEQUENCE_SET_VAL_RESULT: + case MSG_DB_SEQUENCE_RENAME_RESULT: + if (conn->result->gr_resdata.grd_seqkey.gsk_key) + free(conn->result->gr_resdata.grd_seqkey.gsk_key); + break; + + case SEQUENCE_GET_NEXT_RESULT: + case SEQUENCE_GET_LAST_RESULT: + if (conn->result->gr_resdata.grd_seq.seqkey.gsk_key) + free(conn->result->gr_resdata.grd_seq.seqkey.gsk_key); + break; + + default: + break; + } + + +#ifdef __TBASE__ + if (conn->result->grd_storage_data.len && conn->result->grd_storage_data.data) + { + free(conn->result->grd_storage_data.data); + conn->result->grd_storage_data.data = NULL; + conn->result->grd_storage_data.len = 0; + } + + if (conn->result->grd_store_seq.count && conn->result->grd_store_seq.seqs) + { + free(conn->result->grd_store_seq.seqs); + conn->result->grd_store_seq.seqs = NULL; + conn->result->grd_store_seq.count = 0; + } + + if (conn->result->grd_store_txn.count && conn->result->grd_store_txn.txns) + { + free(conn->result->grd_store_txn.txns); + conn->result->grd_store_txn.txns = NULL; + conn->result->grd_store_txn.count = 0; + } + + if (conn->result->grd_store_check_seq.count && conn->result->grd_store_check_seq.seqs) + { + free(conn->result->grd_store_check_seq.seqs); + conn->result->grd_store_check_seq.seqs = NULL; + conn->result->grd_store_check_seq.count = 0; + } + + if (conn->result->grd_store_check_txn.count && conn->result->grd_store_check_txn.txns) + { + free(conn->result->grd_store_check_txn.txns); + conn->result->grd_store_check_txn.txns = NULL; + conn->result->grd_store_check_txn.count = 0; + } + + if (conn->result->grd_errlog.len && conn->result->grd_errlog.errlog) { - free(conn->result->grd_store_check_txn.txns); - conn->result->grd_store_check_txn.txns = NULL; - conn->result->grd_store_check_txn.count = 0; + free(conn->result->grd_errlog.errlog); + conn->result->grd_errlog.errlog = NULL; + conn->result->grd_errlog.len = 0; } - -#endif - free(conn->result); - } + +#endif + free(conn->result); + } #endif free(conn); diff --git a/src/gtm/client/fe-protocol.c b/src/gtm/client/fe-protocol.c index 9bdfc9be..fb43649c 100644 --- a/src/gtm/client/fe-protocol.c +++ b/src/gtm/client/fe-protocol.c @@ -737,287 +737,368 @@ result->gr_status = GTM_RESULT_ERROR; } #endif - /* communication protocol: total data len, pkg number, {pkg_len,pkg_data}, {pkg_len,pkg_data},*/ - if (gtmpqGetInt(&result->grd_storage_data.len, - sizeof(uint32), conn)) - { - result->gr_status = GTM_RESULT_ERROR; - break; - } - - /* get loop count */ - if (gtmpqGetInt(&loop_count, - sizeof(uint32), conn)) - { - result->gr_status = GTM_RESULT_ERROR; - break; - } - - result->grd_storage_data.data = (char *) malloc(result->grd_storage_data.len); - data_buf = result->grd_storage_data.data; - for (i = 0; i < loop_count; i++) - { - /* a length of the next send pkg */ - if (gtmpqGetInt(&data_len, 
sizeof(int32), conn)) - { - result->gr_status = GTM_RESULT_ERROR; - break; - } - - /* pkg body */ - if (gtmpqGetnchar(data_buf + offset, data_len, conn)) - { - result->gr_status = GTM_RESULT_ERROR; - break; - } - offset += data_len; - } - - if (result->gr_status != GTM_RESULT_OK) - { - if (offset != result->grd_storage_data.len) - { - abort(); - } - } - } - break; - - case TXN_FINISH_GID_RESULT: + /* communication protocol: total data len, pkg number, {pkg_len,pkg_data}, {pkg_len,pkg_data},*/ + if (gtmpqGetInt(&result->grd_storage_data.len, + sizeof(uint32), conn)) + { + result->gr_status = GTM_RESULT_ERROR; + break; + } + + /* get loop count */ + if (gtmpqGetInt(&loop_count, + sizeof(uint32), conn)) + { + result->gr_status = GTM_RESULT_ERROR; + break; + } + + result->grd_storage_data.data = (char *) malloc(result->grd_storage_data.len); + data_buf = result->grd_storage_data.data; + for (i = 0; i < loop_count; i++) + { + /* a length of the next send pkg */ + if (gtmpqGetInt(&data_len, sizeof(int32), conn)) + { + result->gr_status = GTM_RESULT_ERROR; + break; + } + + /* pkg body */ + if (gtmpqGetnchar(data_buf + offset, data_len, conn)) + { + result->gr_status = GTM_RESULT_ERROR; + break; + } + offset += data_len; + } + + if (result->gr_status != GTM_RESULT_OK) + { + if (offset != result->grd_storage_data.len) + { + abort(); + } + } + } + break; + + case TXN_FINISH_GID_RESULT: + { + if (gtmpqGetInt(&result->gr_finish_status, + sizeof(uint32), conn)) + { + result->gr_status = GTM_RESULT_ERROR; + break; + } + break; + } + + case MSG_LIST_GTM_STORE_RESULT: + { + if (gtmpqGetInt64(&result->gtm_status.header.m_identifier, conn)) + { + result->gr_status = GTM_RESULT_ERROR; + break; + } + + if (gtmpqGetInt(&result->gtm_status.header.m_major_version, sizeof(int32), conn)) + { + result->gr_status = GTM_RESULT_ERROR; + break; + } + + if (gtmpqGetInt(&result->gtm_status.header.m_minor_version, sizeof(int32), conn)) + { + result->gr_status = GTM_RESULT_ERROR; + break; + } + + if (gtmpqGetInt(&result->gtm_status.header.m_gtm_status, sizeof(int32), conn)) + { + result->gr_status = GTM_RESULT_ERROR; + break; + } + + if (gtmpqGetInt64(&result->gtm_status.header.m_next_gts, conn)) + { + result->gr_status = GTM_RESULT_ERROR; + break; + } + + if (gtmpqGetInt((int32 *) &result->gtm_status.header.m_global_xmin, sizeof(int32), conn)) + { + result->gr_status = GTM_RESULT_ERROR; + break; + } + + if (gtmpqGetInt((int32 *) &result->gtm_status.header.m_next_gxid, sizeof(int32), conn)) + { + result->gr_status = GTM_RESULT_ERROR; + break; + } + + if (gtmpqGetInt((int32 *) &result->gtm_status.header.m_seq_freelist, sizeof(int32), conn)) + { + result->gr_status = GTM_RESULT_ERROR; + break; + } + + if (gtmpqGetInt((int32 *) &result->gtm_status.header.m_txn_freelist, sizeof(int32), conn)) + { + result->gr_status = GTM_RESULT_ERROR; + break; + } + + if (gtmpqGetInt64(&result->gtm_status.header.m_lsn, conn)) + { + result->gr_status = GTM_RESULT_ERROR; + break; + } + + + if (gtmpqGetInt64(&result->gtm_status.header.m_last_update_time, conn)) + { + result->gr_status = GTM_RESULT_ERROR; + break; + } + + if (gtmpqGetInt((int32 *) &result->gtm_status.header.m_crc, sizeof(int32), conn)) + { + result->gr_status = GTM_RESULT_ERROR; + break; + } + + if (gtmpqGetInt((int32 *) &result->gtm_status.seq_total, sizeof(int32), conn)) + { + result->gr_status = GTM_RESULT_ERROR; + break; + } + + if (gtmpqGetInt((int32 *) &result->gtm_status.seq_used, sizeof(int32), conn)) + { + result->gr_status = GTM_RESULT_ERROR; + break; + } + + if 
(gtmpqGetInt((int32 *) &result->gtm_status.txn_total, sizeof(int32), conn)) + { + result->gr_status = GTM_RESULT_ERROR; + break; + } + + if (gtmpqGetInt((int32 *) &result->gtm_status.txn_used, sizeof(int32), conn)) + { + result->gr_status = GTM_RESULT_ERROR; + break; + } + break; + } + + case MSG_LIST_GTM_STORE_SEQ_RESULT: /* List gtm running sequence info */ + { + if (conn->result->grd_store_seq.count && conn->result->grd_store_seq.seqs) + { + free(conn->result->grd_store_seq.seqs); + conn->result->grd_store_seq.seqs = NULL; + conn->result->grd_store_seq.count = 0; + } + + if (gtmpqGetInt(&conn->result->grd_store_seq.count, + sizeof(int32), conn)) + { + result->gr_status = GTM_RESULT_ERROR; + break; + } + + conn->result->grd_store_seq.seqs = + (GTM_StoredSeqInfo *) malloc(sizeof(GTM_StoredSeqInfo) * + conn->result->grd_store_seq.count); + for (i = 0; i < conn->result->grd_store_seq.count; i++) + { + if (gtmpqGetnchar((char *) &conn->result->grd_store_seq.seqs[i], sizeof(GTM_StoredSeqInfo), conn)) + { + result->gr_status = GTM_RESULT_ERROR; + break; + } + } + break; + } + + case MSG_LIST_GTM_TXN_STORE_RESULT: /* List gtm running sequence info */ + { + if (conn->result->grd_store_txn.count && conn->result->grd_store_txn.txns) + { + free(conn->result->grd_store_txn.txns); + conn->result->grd_store_txn.txns = NULL; + conn->result->grd_store_txn.count = 0; + } + + if (gtmpqGetInt(&conn->result->grd_store_txn.count, + sizeof(int32), conn)) + { + result->gr_status = GTM_RESULT_ERROR; + break; + } + + conn->result->grd_store_txn.txns = + (GTM_StoredTransactionInfo *) malloc(sizeof(GTM_StoredTransactionInfo) * + conn->result->grd_store_txn.count); + for (i = 0; i < conn->result->grd_store_txn.count; i++) + { + if (gtmpqGetnchar((char *) &conn->result->grd_store_txn.txns[i], sizeof(GTM_StoredTransactionInfo), + conn)) + { + result->gr_status = GTM_RESULT_ERROR; + break; + } + } + break; + } + + + case MSG_CHECK_GTM_SEQ_STORE_RESULT: /* Check gtm sequence valid info */ + { + if (conn->result->grd_store_check_seq.count && conn->result->grd_store_check_seq.seqs) + { + free(conn->result->grd_store_check_seq.seqs); + conn->result->grd_store_check_seq.seqs = NULL; + conn->result->grd_store_check_seq.count = 0; + } + + if (gtmpqGetInt(&conn->result->grd_store_check_seq.count, + sizeof(int32), conn)) + { + result->gr_status = GTM_RESULT_ERROR; + break; + } + + conn->result->grd_store_check_seq.seqs = + (GTMStorageSequneceStatus *) malloc(sizeof(GTMStorageSequneceStatus) * + conn->result->grd_store_check_seq.count); + for (i = 0; i < conn->result->grd_store_check_seq.count; i++) + { + if (gtmpqGetnchar((char *) &conn->result->grd_store_check_seq.seqs[i], sizeof(GTMStorageSequneceStatus), + conn)) + { + result->gr_status = GTM_RESULT_ERROR; + break; + } + } + break; + } + + case MSG_CHECK_GTM_TXN_STORE_RESULT: /* Check gtm transaction usage info */ + { + if (conn->result->grd_store_check_txn.count && conn->result->grd_store_check_txn.txns) + { + free(conn->result->grd_store_check_txn.txns); + conn->result->grd_store_check_txn.txns = NULL; + conn->result->grd_store_check_txn.count = 0; + } + + if (gtmpqGetInt(&conn->result->grd_store_check_txn.count, + sizeof(int32), conn)) + { + result->gr_status = GTM_RESULT_ERROR; + break; + } + + conn->result->grd_store_check_txn.txns = + (GTMStorageTransactionStatus *) malloc(sizeof(GTMStorageTransactionStatus) * + conn->result->grd_store_check_txn.count); + for (i = 0; i < conn->result->grd_store_check_txn.count; i++) + { + if (gtmpqGetnchar((char *) 
&conn->result->grd_store_check_txn.txns[i], + sizeof(GTMStorageTransactionStatus), conn)) + { + result->gr_status = GTM_RESULT_ERROR; + break; + } + } + break; + } + + case MSG_GET_GTM_STATISTICS_RESULT: { - if (gtmpqGetInt(&result->gr_finish_status, - sizeof(uint32), conn)) - { - result->gr_status = GTM_RESULT_ERROR; - break; - } - break; - } - - case MSG_LIST_GTM_STORE_RESULT: - { - if (gtmpqGetInt64(&result->gtm_status.header.m_identifier, conn)) - { - result->gr_status = GTM_RESULT_ERROR; - break; - } - - if (gtmpqGetInt(&result->gtm_status.header.m_major_version, sizeof(int32), conn)) - { - result->gr_status = GTM_RESULT_ERROR; - break; - } - - if (gtmpqGetInt(&result->gtm_status.header.m_minor_version, sizeof(int32), conn)) - { - result->gr_status = GTM_RESULT_ERROR; - break; - } - - if (gtmpqGetInt(&result->gtm_status.header.m_gtm_status, sizeof(int32), conn)) - { - result->gr_status = GTM_RESULT_ERROR; - break; - } - - if (gtmpqGetInt64(&result->gtm_status.header.m_next_gts, conn)) - { - result->gr_status = GTM_RESULT_ERROR; - break; - } - - if (gtmpqGetInt((int32 *) &result->gtm_status.header.m_global_xmin, sizeof(int32), conn)) - { - result->gr_status = GTM_RESULT_ERROR; - break; - } - - if (gtmpqGetInt((int32 *) &result->gtm_status.header.m_next_gxid, sizeof(int32), conn)) - { - result->gr_status = GTM_RESULT_ERROR; - break; - } - - if (gtmpqGetInt((int32 *) &result->gtm_status.header.m_seq_freelist, sizeof(int32), conn)) - { - result->gr_status = GTM_RESULT_ERROR; - break; - } - - if (gtmpqGetInt((int32 *) &result->gtm_status.header.m_txn_freelist, sizeof(int32), conn)) - { - result->gr_status = GTM_RESULT_ERROR; - break; - } - - if (gtmpqGetInt64(&result->gtm_status.header.m_lsn, conn)) - { - result->gr_status = GTM_RESULT_ERROR; - break; - } - - - if (gtmpqGetInt64(&result->gtm_status.header.m_last_update_time, conn)) - { - result->gr_status = GTM_RESULT_ERROR; - break; - } - - if (gtmpqGetInt((int32 *) &result->gtm_status.header.m_crc, sizeof(int32), conn)) + if (gtmpqGetInt64(&result->gr_resdata.statistic_result.start_time, conn)) { result->gr_status = GTM_RESULT_ERROR; break; } - if (gtmpqGetInt((int32 *) &result->gtm_status.seq_total, sizeof(int32), conn)) + if (gtmpqGetInt64(&result->gr_resdata.statistic_result.end_time, conn)) { result->gr_status = GTM_RESULT_ERROR; break; } - if (gtmpqGetInt((int32 *) &result->gtm_status.seq_used, sizeof(int32), conn)) + if (gtmpqGetInt(&result->gr_resdata.statistic_result.sequences_remained, + sizeof(int32), conn)) { result->gr_status = GTM_RESULT_ERROR; break; } - if (gtmpqGetInt((int32 *) &result->gtm_status.txn_total, sizeof(int32), conn)) - { - result->gr_status = GTM_RESULT_ERROR; - break; - } - - if (gtmpqGetInt((int32 *) &result->gtm_status.txn_used, sizeof(int32), conn)) - { - result->gr_status = GTM_RESULT_ERROR; - break; - } - break; - } - - case MSG_LIST_GTM_STORE_SEQ_RESULT: /* List gtm running sequence info */ - { - if (conn->result->grd_store_seq.count && conn->result->grd_store_seq.seqs) - { - free(conn->result->grd_store_seq.seqs); - conn->result->grd_store_seq.seqs = NULL; - conn->result->grd_store_seq.count = 0; - } - - if (gtmpqGetInt(&conn->result->grd_store_seq.count, + if (gtmpqGetInt(&result->gr_resdata.statistic_result.txn_remained, sizeof(int32), conn)) { result->gr_status = GTM_RESULT_ERROR; break; } - conn->result->grd_store_seq.seqs = - (GTM_StoredSeqInfo *) malloc(sizeof(GTM_StoredSeqInfo) * - conn->result->grd_store_seq.count); - for (i = 0; i < conn->result->grd_store_seq.count; i++) + for (i = 0; i < 
CMD_STATISTICS_TYPE_COUNT; i++) { - if (gtmpqGetnchar((char *) &conn->result->grd_store_seq.seqs[i], sizeof(GTM_StoredSeqInfo), conn)) + if (gtmpqGetInt((int32*) &result->gr_resdata.statistic_result.stat_info[i].total_request_times, + sizeof(int32), conn)) { result->gr_status = GTM_RESULT_ERROR; break; } - } - break; - } - case MSG_LIST_GTM_TXN_STORE_RESULT: /* List gtm running sequence info */ - { - if (conn->result->grd_store_txn.count && conn->result->grd_store_txn.txns) - { - free(conn->result->grd_store_txn.txns); - conn->result->grd_store_txn.txns = NULL; - conn->result->grd_store_txn.count = 0; - } - - if (gtmpqGetInt(&conn->result->grd_store_txn.count, - sizeof(int32), conn)) - { - result->gr_status = GTM_RESULT_ERROR; - break; - } - - conn->result->grd_store_txn.txns = - (GTM_StoredTransactionInfo *) malloc(sizeof(GTM_StoredTransactionInfo) * - conn->result->grd_store_txn.count); - for (i = 0; i < conn->result->grd_store_txn.count; i++) - { - if (gtmpqGetnchar((char *) &conn->result->grd_store_txn.txns[i], sizeof(GTM_StoredTransactionInfo), - conn)) + if (gtmpqGetInt((int32*) &result->gr_resdata.statistic_result.stat_info[i].avg_costtime, + sizeof(int32), conn)) { result->gr_status = GTM_RESULT_ERROR; break; } - } - break; - } - - case MSG_CHECK_GTM_SEQ_STORE_RESULT: /* Check gtm sequence valid info */ - { - if (conn->result->grd_store_check_seq.count && conn->result->grd_store_check_seq.seqs) - { - free(conn->result->grd_store_check_seq.seqs); - conn->result->grd_store_check_seq.seqs = NULL; - conn->result->grd_store_check_seq.count = 0; - } - - if (gtmpqGetInt(&conn->result->grd_store_check_seq.count, - sizeof(int32), conn)) - { - result->gr_status = GTM_RESULT_ERROR; - break; - } + if (gtmpqGetInt((int32*) &result->gr_resdata.statistic_result.stat_info[i].max_costtime, + sizeof(int32), conn)) + { + result->gr_status = GTM_RESULT_ERROR; + break; + } - conn->result->grd_store_check_seq.seqs = - (GTMStorageSequneceStatus *) malloc(sizeof(GTMStorageSequneceStatus) * - conn->result->grd_store_check_seq.count); - for (i = 0; i < conn->result->grd_store_check_seq.count; i++) - { - if (gtmpqGetnchar((char *) &conn->result->grd_store_check_seq.seqs[i], sizeof(GTMStorageSequneceStatus), - conn)) + if (gtmpqGetInt((int32*) &result->gr_resdata.statistic_result.stat_info[i].min_costtime, + sizeof(int32), conn)) { result->gr_status = GTM_RESULT_ERROR; break; } } + break; } - - case MSG_CHECK_GTM_TXN_STORE_RESULT: /* Check gtm transaction usage info */ + case MSG_GET_GTM_ERRORLOG_RESULT: { - if (conn->result->grd_store_check_txn.count && conn->result->grd_store_check_txn.txns) + result->grd_errlog.len = result->gr_msglen; + if (result->gr_msglen == 0) { - free(conn->result->grd_store_check_txn.txns); - conn->result->grd_store_check_txn.txns = NULL; - conn->result->grd_store_check_txn.count = 0; + break; } - if (gtmpqGetInt(&conn->result->grd_store_check_txn.count, - sizeof(int32), conn)) + result->grd_errlog.errlog = + (char *) malloc(result->gr_msglen); + if (gtmpqGetnchar((char *) result->grd_errlog.errlog, + result->gr_msglen, conn)) { result->gr_status = GTM_RESULT_ERROR; break; } - - conn->result->grd_store_check_txn.txns = - (GTMStorageTransactionStatus *) malloc(sizeof(GTMStorageTransactionStatus) * - conn->result->grd_store_check_txn.count); - for (i = 0; i < conn->result->grd_store_check_txn.count; i++) - { - if (gtmpqGetnchar((char *) &conn->result->grd_store_check_txn.txns[i], - sizeof(GTMStorageTransactionStatus), conn)) - { - result->gr_status = GTM_RESULT_ERROR; - break; - } - } 
break; } + #endif case SEQUENCE_LIST_RESULT: if (gtmpqGetInt(&result->gr_resdata.grd_seq_list.seq_count, diff --git a/src/gtm/client/gtm_client.c b/src/gtm/client/gtm_client.c index d5b5b56d..c0ef6ab1 100644 --- a/src/gtm/client/gtm_client.c +++ b/src/gtm/client/gtm_client.c @@ -574,6 +574,106 @@ check_gtm_status(GTM_Conn *conn, int *status, GTM_Timestamp *master,XLogRecPtr * return GTM_RESULT_ERROR; } +/* + * to get GTM statistics info + */ +int +get_gtm_statistics(GTM_Conn *conn, int clear_flag, int timeout_seconds, GTM_StatisticsResult** result) +{ + GTM_Result *res = NULL; + time_t finish_time; + + /* Start the message. */ + if (gtmpqPutMsgStart('C', true, conn) || + gtmpqPutInt(MSG_GET_STATISTICS, sizeof (GTM_MessageType), conn)) + goto send_failed; + + if (gtmpqPutInt(clear_flag,sizeof(int),conn)) + goto send_failed; + + /* Finish the message. */ + if (gtmpqPutMsgEnd(conn)) + goto send_failed; + + /* Flush to ensure backend gets it. */ + if (gtmpqFlush(conn)) + goto send_failed; + + /* add two seconds to allow extra wait */ + finish_time = time(NULL) + timeout_seconds + 2; + if (gtmpqWaitTimed(true, false, conn, finish_time) || + gtmpqReadData(conn) < 0) + goto receive_failed; + + if ((res = GTMPQgetResult(conn)) == NULL) + goto receive_failed; + + if (GTM_RESULT_OK == res->gr_status) + { + *result = &(res->gr_resdata.statistic_result); + return GTM_RESULT_OK; + } + else + { + return GTM_RESULT_ERROR; + } + +receive_failed: +send_failed: + conn->result = makeEmptyResultIfIsNull(conn->result); + conn->result->gr_status = GTM_RESULT_COMM_ERROR; + return GTM_RESULT_ERROR; +} + +/* + * to get gtm error log + */ +int +get_gtm_errlog(GTM_Conn *conn, int timeout_seconds, char** errlog, int* len) +{ + GTM_Result *res = NULL; + time_t finish_time; + + /* Start the message. */ + if (gtmpqPutMsgStart('C', true, conn) || + gtmpqPutInt(MSG_GET_ERRORLOG, sizeof (GTM_MessageType), conn)) + goto send_failed; + + /* Finish the message. */ + if (gtmpqPutMsgEnd(conn)) + goto send_failed; + + /* Flush to ensure backend gets it. 
*/ + if (gtmpqFlush(conn)) + goto send_failed; + + /* add two seconds to allow extra wait */ + finish_time = time(NULL) + timeout_seconds + 2; + if (gtmpqWaitTimed(true, false, conn, finish_time) || + gtmpqReadData(conn) < 0) + goto receive_failed; + + if ((res = GTMPQgetResult(conn)) == NULL) + goto receive_failed; + + if (GTM_RESULT_OK == res->gr_status) + { + *errlog = res->grd_errlog.errlog; + *len = res->grd_errlog.len; + return GTM_RESULT_OK; + } + else + { + return GTM_RESULT_ERROR; + } + +receive_failed: +send_failed: + conn->result = makeEmptyResultIfIsNull(conn->result); + conn->result->gr_status = GTM_RESULT_COMM_ERROR; + return GTM_RESULT_ERROR; +} + #endif /* * Transaction Management API diff --git a/src/gtm/common/Makefile b/src/gtm/common/Makefile index 8f91e968..43d80dad 100644 --- a/src/gtm/common/Makefile +++ b/src/gtm/common/Makefile @@ -23,7 +23,7 @@ LDFLAGS=-L$(top_builddir)/common -L$(top_builddir)/libpq LIBS=-lpthread -lrt OBJS = gtm_opt_handler.o aset.o mcxt.o gtm_utils.o elog.o assert.o stringinfo.o gtm_lock.o \ - gtm_list.o gtm_serialize.o gtm_serialize_debug.o gtm_time.o gtm_gxid.o heap.o + gtm_list.o gtm_serialize.o gtm_serialize_debug.o gtm_time.o gtm_gxid.o heap.o datapump.o bloom.o all:all-lib diff --git a/src/gtm/common/bloom.c b/src/gtm/common/bloom.c new file mode 100644 index 00000000..14348110 --- /dev/null +++ b/src/gtm/common/bloom.c @@ -0,0 +1,282 @@ +/*------------------------------------------------------------------------- + * + * bloom.c + * + * a bloom filter, using murmurhash + * + * Copyright (c) 2020-Present TBase development team, Tencent + * + * + * IDENTIFICATION + * src/gtm/common/bloom.c + * + *------------------------------------------------------------------------- + */ + +#include +#include +#include "gtm/gtm_c.h" +#include "gtm/gtm.h" +#include "gtm/bloom.h" +#include "gtm/palloc.h" + +#define SETBIT(bitmap, bit) ((bitmap)[(bit)/CHAR_BIT] |= (1<<((bit)%CHAR_BIT))) +#define GETBIT(bitmap, bit) ((bitmap)[(bit)/CHAR_BIT] & (1<<((bit)%CHAR_BIT))) +#define MIX(h,k,m) { k *= m; k ^= k >> r; k *= m; h *= m; h ^= k; } + +/* + * Create a bloom filter, variable parameter is hash seed + * hash function num depend on seeds + */ +BLOOM * +BloomCreate(int bitmap_size, int nfuncs, ...) 
+{ + BLOOM *bloom; + va_list l; + int i; + + bloom = palloc(sizeof(BLOOM)); + if (NULL == bloom) + { + return NULL; + } + + bloom->bitmap = palloc0( ((bitmap_size + CHAR_BIT - 1) / CHAR_BIT) * sizeof(char)); + if (NULL == bloom->bitmap) + { + pfree(bloom); + return NULL; + } + + bloom->seeds = (uint32*)palloc(nfuncs * sizeof(uint32)); + if (NULL == bloom->seeds) + { + pfree(bloom->bitmap); + pfree(bloom); + return NULL; + } + + va_start(l, nfuncs); + for(i = 0; i < nfuncs; ++i) + { + bloom->seeds[i] = va_arg(l, uint32); + } + va_end(l); + + bloom->bitmap_size = bitmap_size; + bloom->nfuncs = nfuncs; + + return bloom; +} + +/* + * Destroy a bloom filter + */ +int +BloomDestroy(BLOOM *bloom) +{ + pfree(bloom->bitmap); + pfree(bloom->seeds); + pfree(bloom); + + return 0; +} + +/* + * Reset bloom filter's bitmap + */ +void +BloomReset(BLOOM *bloom) +{ + MemSet(bloom->bitmap, 0, ((bloom->bitmap_size + CHAR_BIT - 1) / CHAR_BIT) * sizeof(char)); +} + +/* + * Add an item into bloom filter + */ +void +BloomAdd(BLOOM *bloom, const char *s, int len) +{ + int i; + for(i = 0; i < bloom->nfuncs; ++i) + { + SETBIT(bloom->bitmap, MurmurHash2(s, len, bloom->seeds[i]) % bloom->bitmap_size); + } +} + +/* + * Check if the item exist + */ +bool +BloomCheck(BLOOM *bloom, const char *s, int len) +{ + int i; + + for(i = 0; i < bloom->nfuncs; ++i) + { + if(!(GETBIT(bloom->bitmap, MurmurHash2(s, len, bloom->seeds[i]) % bloom->bitmap_size))) + { + return false; + } + } + + return true; +} + +/* + * Check if the item exist, if not exist, add the item into bloom + */ +bool +BloomCheckAndAdd(BLOOM *bloom, const char *s, int len) +{ + int i, j; + uint32 hash; + bool exist = true; + for(i = 0; i < bloom->nfuncs; ++i) + { + hash = MurmurHash2(s, len, bloom->seeds[i]) % bloom->bitmap_size; + if(!(GETBIT(bloom->bitmap, hash))) + { + exist = false; + SETBIT(bloom->bitmap, hash); + for (j = i + 1; j < bloom->nfuncs; ++j) + { + hash = MurmurHash2(s, len, bloom->seeds[j]) % bloom->bitmap_size; + SETBIT(bloom->bitmap, hash); + } + break; + } + } + return exist; +} + +/* + * Murmurhash function + */ +uint32_t +MurmurHash2(const void * key, int len, uint32_t seed) +{ + const uint32_t m = 0x5bd1e995; + const int32_t r = 24; + const uint8_t * data = (const uint8_t *)key; + uint32_t h = seed ^ len; + uint8_t align = (uintptr_t)data & 3; + + if(align && (len >= 4)) + { + /* Pre-load the temp registers */ + uint32_t t = 0, d = 0; + int32_t sl; + int32_t sr; + + switch(align) + { + case 1: t |= data[2] << 16; + case 2: t |= data[1] << 8; + case 3: t |= data[0]; + } + + t <<= (8 * align); + + data += 4-align; + len -= 4-align; + + sl = 8 * (4-align); + sr = 8 * align; + + /* Mix */ + + while(len >= 4) + { + uint32_t k; + + d = *(uint32_t *)data; + t = (t >> sr) | (d << sl); + + k = t; + + MIX(h,k,m); + + t = d; + + data += 4; + len -= 4; + } + + /* Handle leftover data in temp registers */ + + d = 0; + + if(len >= align) + { + uint32_t k; + + switch(align) + { + case 3: d |= data[2] << 16; + case 2: d |= data[1] << 8; + case 1: d |= data[0]; + } + + k = (t >> sr) | (d << sl); + MIX(h,k,m); + + data += align; + len -= align; + + /* ---------- + * Handle tail bytes */ + + switch(len) + { + case 3: h ^= data[2] << 16; + case 2: h ^= data[1] << 8; + case 1: h ^= data[0]; h *= m; + }; + } + else + { + switch(len) + { + case 3: d |= data[2] << 16; + case 2: d |= data[1] << 8; + case 1: d |= data[0]; + case 0: h ^= (t >> sr) | (d << sl); h *= m; + } + } + + h ^= h >> 13; + h *= m; + h ^= h >> 15; + + return h; + } + else + { + while(len >= 4) + 
{ + uint32_t k = *(uint32_t *)data; + + MIX(h,k,m); + + data += 4; + len -= 4; + } + + /* ---------- + * Handle tail bytes */ + + switch(len) + { + case 3: h ^= data[2] << 16; + case 2: h ^= data[1] << 8; + case 1: h ^= data[0]; h *= m; + }; + + h ^= h >> 13; + h *= m; + h ^= h >> 15; + + return h; + } +} diff --git a/src/gtm/common/datapump.c b/src/gtm/common/datapump.c new file mode 100644 index 00000000..912dc4bd --- /dev/null +++ b/src/gtm/common/datapump.c @@ -0,0 +1,337 @@ +/*------------------------------------------------------------------------- + * + * datapump.c + * + * + * lockless message queue + * + * Copyright (c) 2020-Present TBase development team, Tencent + * + * + * IDENTIFICATION + * src/gtm/common/datapump.c + * + *------------------------------------------------------------------------- + */ + +#include "gtm/datapump.h" + + +/* + * The following funciton is used to handle lockless message queue. + */ + +/* + * Get data pointer, use with the following functions. + */ +char * +GetData(DataPumpBuf *buf, uint32 *uiLen) +{ + uint32 border = 0; + uint32 tail = 0; + char *data; + if (buf) + { + if (0 == DataSize(buf)) + { + return NULL; + } + + SpinLockAcquire(&(buf->pointer_lock)); + border = buf->border; + tail = buf->tail; + SpinLockRelease(&(buf->pointer_lock)); + if (INVALID_BORDER == border) + { + *uiLen = 0; + return NULL; + } + + /* read from tail to border*/ + if (border >= tail) + { + /* Only sender increases tail, no need to lock. */ + *uiLen = border - tail; + data = buf->buf + tail; + } + else + { + /* read from tail to end */ + *uiLen = buf->length - tail; + data = buf->buf + tail; + buf->wrap_around = true; + } + return data; + } + else + { + *uiLen = 0; + return NULL; + } +} + +/* + * Increate data offset, used after finishing read data from queue. + */ +void +IncDataOff(DataPumpBuf *buf, uint32 uiLen) +{ + if (buf) + { + SpinLockAcquire(&(buf->pointer_lock)); + buf->tail = (buf->tail + uiLen) % buf->length; + if (buf->tail == buf->border) + { + buf->border = INVALID_BORDER; + } + SpinLockRelease(&(buf->pointer_lock)); + } +} + +/* + * Return total data size in buffer + */ +uint32 +DataSize(DataPumpBuf *buf) +{ + uint32 border = 0; + uint32 head = 0; + uint32 tail = 0; + uint32 size = 0; + if (buf) + { + SpinLockAcquire(&(buf->pointer_lock)); + head = buf->head; + tail = buf->tail; + border = buf->border; + SpinLockRelease(&(buf->pointer_lock)); + + if (INVALID_BORDER == border) + { + return 0; + } + + if (tail <= head) + { + size = head - tail; + } + else + { + size = buf->length - tail + head; + } + + return size; + } + return 0; +} + +/* + * Get the pointer to write and return the length to write. + */ +char * +GetWriteOff(DataPumpBuf *buf, uint32 *uiLen) +{ + uint32 head = 0; + uint32 tail = 0; + char *ptr = NULL; + if (0 == FreeSpace(buf)) + { + return NULL; + } + + if (buf) + { + SpinLockAcquire(&(buf->pointer_lock)); + head = buf->head; + tail = buf->tail; + SpinLockRelease(&(buf->pointer_lock)); + + if (head >= tail) + { + /* tail is the beginning of the queue. */ + if (tail != 0) + { + + *uiLen = buf->length - head; + } + else + { + /* Reserved one byte as flag. */ + *uiLen = buf->length - head - 1; + } + } + else + { + /* Reserved one byte as flag. */ + *uiLen = tail - head - 1; + } + ptr = buf->buf + head; + return ptr; + } + else + { + return NULL; + } +} + +/* + * Used to increase the write pointer after write some data. 
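+ *
+ * Together with GetWriteOff() this is the producer side of the ring: the
+ * writer advances head, the reader advances tail through GetData() and
+ * IncDataOff(), and one byte is always left unused so that a full buffer
+ * can be told apart from an empty one.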
+ */ +void +IncWriteOff(DataPumpBuf *buf, uint32 uiLen) +{ + if (buf) + { + SpinLockAcquire(&(buf->pointer_lock)); + buf->head += uiLen; + buf->head = buf->head % buf->length; + SpinLockRelease(&(buf->pointer_lock)); + } +} + +/* + * Reserve space in print buffer + */ +int +ReserveSpace(DataPumpBuf *buf, uint32 len, uint32 *offset) +{ + /* not enough space avaliable, wait */ + if (FreeSpace(buf) < len) + { + return -1; + } + + if (buf) + { + *offset = buf->head; + buf->head = (buf->head + len) % buf->length; + } + return 0; +} + +uint32 +BufferOffsetAdd(DataPumpBuf *buf, uint32 pointer, uint32 offset) +{ + + if (buf) + { + return (pointer + offset) % buf->length; + } + return 0; +} + +/* + * No need to lock, reader never read the data before we set border. + */ +int +ReturnSpace(DataPumpBuf *buf, uint32 offset) +{ + if (buf) + { + buf->head = offset; + } + return 0; +} + +/* + * Fill data into reserved by ReserveSpace + */ +void +FillReserveSpace(DataPumpBuf *buf, uint32 offset, char *p, uint32 len) +{ + uint32 bytes2end = 0; + uint32 bytesfrombegin = 0; + + if (buf) + { + bytes2end = buf->length - offset; + if (len <= bytes2end) + { + memcpy(buf->buf + offset, p, len); + } + else + { + bytesfrombegin = len - bytes2end; + memcpy(buf->buf + offset, p, bytes2end); + memcpy(buf->buf, (char*)p + bytes2end, bytesfrombegin); + } + } +} + +/* + * Return free space of the buffer. + */ +uint32 +FreeSpace(DataPumpBuf *buf) +{ + uint32 head = 0; + uint32 tail = 0; + uint32 len = 0; + if (buf) + { + SpinLockAcquire(&(buf->pointer_lock)); + head = buf->head; + tail = buf->tail; + SpinLockRelease(&(buf->pointer_lock)); + + if (tail <= head) + { + len = tail + buf->length - head - 1; + } + else + { + len = tail - head - 1; + } + return len; + } + else + { + return 0; + } +} + +/* + * Set tuple end border of the buffer. 
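+ *
+ * Readers never see past the border: GetData() stops at it and DataSize()
+ * reports 0 while the border is INVALID_BORDER, so data written with
+ * PutData() only becomes readable once SetBorder() has been called.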
+ */ +void +SetBorder(DataPumpBuf *buf) +{ + SpinLockAcquire(&(buf->pointer_lock)); + buf->border = buf->head; + SpinLockRelease(&(buf->pointer_lock)); +} + +/* + * Send data into buffer + */ +void +PutData(DataPumpBuf *buf, char *data, uint32 len) +{ + char *ptr; + uint32 bufferLen; + uint32 needLen; + uint32 offset = 0; + needLen = len; + while (1) + { + ptr = GetWriteOff(buf, &bufferLen); + if (ptr) + { + if (bufferLen >= needLen) + { + memcpy(ptr, data + offset, needLen); + IncWriteOff(buf, needLen); + return; + } + else + { + memcpy(ptr, data + offset, bufferLen); + IncWriteOff(buf, bufferLen); + needLen -= bufferLen; + offset += bufferLen; + } + } + } +} + + diff --git a/src/gtm/common/elog.c b/src/gtm/common/elog.c index 1d30011b..833b9e25 100644 --- a/src/gtm/common/elog.c +++ b/src/gtm/common/elog.c @@ -52,6 +52,7 @@ int Log_destination = LOG_DESTINATION_STDERR; } \ } while (0) +errlog_collection_hook_type errlog_collection_func = NULL; static void send_message_to_server_log(ErrorData *edata); static void send_message_to_frontend(Port *myport, ErrorData *edata); @@ -61,8 +62,8 @@ static const char *error_severity(int elevel); static void append_with_tabs(StringInfo buf, const char *str); static bool is_log_level_output(int elevel, int log_min_level); -int log_min_messages = WARNING; -char *Log_line_prefix = "%l:%p:%m -"; /* format for extra log line info */ +int log_min_messages = WARNING; +char *Log_line_prefix = "%p:%m -"; /* format for extra log line info */ #define FORMATTED_TS_LEN 128 static char formatted_start_time[FORMATTED_TS_LEN]; @@ -797,70 +798,73 @@ DebugFileOpen(void) */ static void send_message_to_server_log(ErrorData *edata) -{// #lizard forgives - StringInfoData buf; - - initStringInfo(&buf); - - formatted_log_time[0] = '\0'; - - log_line_prefix(&buf); - appendStringInfo(&buf, "%s: ", error_severity(edata->elevel)); - - if (edata->message) - append_with_tabs(&buf, edata->message); - else - append_with_tabs(&buf, _("missing error text")); - - appendStringInfoChar(&buf, '\n'); - - if (edata->detail_log) - { - log_line_prefix(&buf); - appendStringInfoString(&buf, _("DETAIL: ")); - append_with_tabs(&buf, edata->detail_log); - appendStringInfoChar(&buf, '\n'); - } - else if (edata->detail) - { - log_line_prefix(&buf); - appendStringInfoString(&buf, _("DETAIL: ")); - append_with_tabs(&buf, edata->detail); - appendStringInfoChar(&buf, '\n'); - } - if (edata->hint) - { - log_line_prefix(&buf); - appendStringInfoString(&buf, _("HINT: ")); - append_with_tabs(&buf, edata->hint); - appendStringInfoChar(&buf, '\n'); - } - if (edata->context) - { - log_line_prefix(&buf); - appendStringInfoString(&buf, _("CONTEXT: ")); - append_with_tabs(&buf, edata->context); - appendStringInfoChar(&buf, '\n'); - } - - /* assume no newlines in funcname or filename... 
*/ - if (edata->funcname && edata->filename) - { - appendStringInfo(&buf, _("LOCATION: %s, %s:%d\n"), - edata->funcname, edata->filename, - edata->lineno); - } - else if (edata->filename) - { - appendStringInfo(&buf, _("LOCATION: %s:%d\n"), - edata->filename, edata->lineno); - } - - /* Write to stderr, if enabled */ - if (Log_destination & LOG_DESTINATION_STDERR) - write(fileno(stderr), buf.data, buf.len); - - pfree(buf.data); +{ + StringInfoData buf; + + initStringInfo(&buf); + + formatted_log_time[0] = '\0'; + + log_line_prefix(&buf); + appendStringInfo(&buf, "%s: ", error_severity(edata->elevel)); + + if (edata->message) + append_with_tabs(&buf, edata->message); + else + append_with_tabs(&buf, _("missing error text")); + + appendStringInfoChar(&buf, '\n'); + + if (edata->detail_log) + { + log_line_prefix(&buf); + appendStringInfoString(&buf, _("DETAIL: ")); + append_with_tabs(&buf, edata->detail_log); + appendStringInfoChar(&buf, '\n'); + } + else if (edata->detail) + { + log_line_prefix(&buf); + appendStringInfoString(&buf, _("DETAIL: ")); + append_with_tabs(&buf, edata->detail); + appendStringInfoChar(&buf, '\n'); + } + if (edata->hint) + { + log_line_prefix(&buf); + appendStringInfoString(&buf, _("HINT: ")); + append_with_tabs(&buf, edata->hint); + appendStringInfoChar(&buf, '\n'); + } + if (edata->context) + { + log_line_prefix(&buf); + appendStringInfoString(&buf, _("CONTEXT: ")); + append_with_tabs(&buf, edata->context); + appendStringInfoChar(&buf, '\n'); + } + + /* assume no newlines in funcname or filename... */ + if (edata->funcname && edata->filename) + { + appendStringInfo(&buf, _("LOCATION: %s, %s:%d\n"), + edata->funcname, edata->filename, + edata->lineno); + } + else if (edata->filename) + { + appendStringInfo(&buf, _("LOCATION: %s:%d\n"), + edata->filename, edata->lineno); + } + + /* Write to stderr, if enabled */ + if (Log_destination & LOG_DESTINATION_STDERR) + write(fileno(stderr), buf.data, buf.len); + + if (errlog_collection_func && (buf.len > 0) && ('\0' != buf.data[0])) + (*errlog_collection_func) (edata, &buf); + + pfree(buf.data); } /* diff --git a/src/gtm/gtm_ctl/gtm_ctl.c b/src/gtm/gtm_ctl/gtm_ctl.c index 59406ab2..3d1cd2f4 100644 --- a/src/gtm/gtm_ctl/gtm_ctl.c +++ b/src/gtm/gtm_ctl/gtm_ctl.c @@ -19,6 +19,9 @@ #include #include #include +#include +#include "gtm/gtm_stat.h" +#include "gtm/gtm_stat_error.h" #ifdef HAVE_SYS_RESOURCE_H #include @@ -32,6 +35,7 @@ /* PID can be negative for standalone backend */ typedef long pgpid_t; + typedef enum { SMART_MODE, @@ -49,10 +53,13 @@ typedef enum RESTART_COMMAND, STATUS_COMMAND, RECONNECT_COMMAND, - RELOAD_COMMAND + RELOAD_COMMAND, + STAT_COMMAND, + ERRLOG_COMMAND } CtlCommand; -#define DEFAULT_WAIT 60 +#define DEFAULT_WAIT 60 +#define DEFAULT_FLAG 0 static bool do_wait = false; static bool wait_set = false; @@ -78,6 +85,7 @@ GTM_ThreadID TopMostThreadID; int tcp_keepalives_idle = 0; int tcp_keepalives_interval = 0; int tcp_keepalives_count = 0; +static int clear_flag = DEFAULT_FLAG; #endif static void write_stderr(const char *fmt,...) 
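The elog.c hunk above turns server-log output into a hook point: once send_message_to_server_log() has formatted the line, it hands the ErrorData and the formatted buffer to errlog_collection_func if one is registered (the GTM main loop later installs GTM_ErrorLogCollector there). Below is a minimal sketch of such a collector, assuming only the gtm/elog.h declarations added in this series; the my_collector and register_my_collector names are illustrative and not part of the patch:

    #include "gtm/elog.h"

    static long error_count = 0;   /* hypothetical counter */

    /* Runs inside send_message_to_server_log(), so it must not ereport()
     * itself; it only inspects the already-formatted line. */
    static void
    my_collector(ErrorData *edata, StringInfo buff)
    {
        if (edata->elevel >= ERROR && buff->len > 0)
            error_count++;
    }

    void
    register_my_collector(void)
    {
        errlog_collection_func = my_collector;
    }

The gtm_ctl.c hunk below adds the client side: a "stat" command that prints the request statistics collected per worker thread and an "errlog" command that drains the deduplicated error-log buffer; a hypothetical invocation would look like gtm_ctl stat -Z gtm -H <host> -P <port> -c 0.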
@@ -1099,6 +1107,206 @@ do_status(void) } +static void +do_stat(void) +{ + int ret = 0; + int i = 0; + char gtm_connect_str[MAXPGPATH]; + GTM_Conn *gtm_conn = NULL; + GTM_StatisticsResult* result = NULL; + struct tm timeinfo; + char time_buff[128]; + int interval_time = 0; + float interval_minute = 0.0; + uint32 calcu_result[3]; + static const float EPSINON = 0.00001; + static char* statistics_name_tab[CMD_STATISTICS_TYPE_COUNT] = { + "GET_GTS", + "SEQUENCE_GET_NEXT", + "TXN_START_PREPARED" + }; + + /* Connect gtm and get the lates timestamp. */ + if (gtm_port == NULL || gtm_host == NULL) + { + return; + } + + snprintf(gtm_connect_str, MAXPGPATH, "host=%s port=%s node_name=gtm_ctl remote_type=%d postmaster=0 connect_timeout=%d", + gtm_host, gtm_port, GTM_NODE_GTM_CTL,wait_seconds); + gtm_conn = connect_gtm(gtm_connect_str); + if (gtm_conn == NULL) { + return; + } + + ret = get_gtm_statistics(gtm_conn, clear_flag, wait_seconds, &result); + if (!ret) + { + printf(_("GTM statistics:\n")); + strftime(time_buff, sizeof(time_buff), + "%Y-%m-%d %H:%M:%S", + localtime_r(&result->start_time, &timeinfo)); + printf(_("statistics start time: %s\n"), time_buff); + + strftime(time_buff, sizeof(time_buff), + "%Y-%m-%d %H:%M:%S", + localtime_r(&result->end_time, &timeinfo)); + printf(_("statistics end time: %s\n"), time_buff); + + printf(_("sequences remained: %d\n"), result->sequences_remained); + printf(_("txn remained: %d\n"), result->txn_remained); + + interval_time = result->end_time - result->start_time; + calcu_result[0] = (interval_time == 0) ? 0 : + result->stat_info[0].total_request_times / interval_time; + + interval_minute = (float)interval_time / (float)60.0; + if ((interval_minute >= - EPSINON) && (interval_minute <= EPSINON)) // 0 + { + calcu_result[1] = 0; + calcu_result[2] = 0; + } + else + { + calcu_result[1] = (int)((float)result->stat_info[1].total_request_times / interval_minute); + calcu_result[2] = (int)((float)result->stat_info[2].total_request_times / interval_minute); + } + + for (i = 0; i < CMD_STATISTICS_TYPE_COUNT; i++) + { + printf(_("%s info:\n"), statistics_name_tab[i]); + printf(_("total request times: %u\n"), result->stat_info[i].total_request_times); + printf(_("avg costtime: %u(ms)\n"), result->stat_info[i].avg_costtime); + printf(_("max costtime: %u(ms)\n"), result->stat_info[i].max_costtime); + printf(_("min costtime: %u(ms)\n"), result->stat_info[i].min_costtime); + if (i == 0) + { + printf(_("requests per second: %u\n"), calcu_result[i]); + } + else + { + printf(_("requests per minute: %u\n"), calcu_result[i]); + } + } + } + else + { + printf(_("%s: Can not get statistics, please check gtm status!\n"), + progname); + } + + disconnect_gtm(gtm_conn); + return; +} + +/* +* error_severity --- get localized string representing elevel +*/ +static const char * +error_severity(int elevel) +{ + const char *prefix; + + switch (elevel) + { + case 10: + case 11: + case 12: + case 13: + case 14: + prefix = _("DEBUG"); + break; + case 15: + case 16: + prefix = _("LOG"); + break; + case 17: + prefix = _("INFO"); + break; + case 18: + prefix = _("NOTICE"); + break; + case 19: + prefix = _("WARNING"); + break; + case 20: + prefix = _("ERROR"); + break; + case 22: + prefix = _("FATAL"); + break; + case 23: + prefix = _("PANIC"); + break; + default: + prefix = "???"; + break; + } + + return prefix; +} + +static void +do_errlog(void) +{ + int ret = 0; + char gtm_connect_str[MAXPGPATH]; + GTM_Conn *gtm_conn = NULL; + char *errlog = NULL; + int len = 0; + GTM_ErrLog* err_info = NULL; + 
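+    /*
+     * The reply decoded below is a byte stream of records: a GTM_ErrLog header
+     * in network byte order followed by errmsg_len bytes of message text; the
+     * loop advances by sizeof(GTM_ErrLog) + errmsg_len for each record.
+     */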
struct tm timeinfo; + char time_buff[128]; + + /* Connect gtm and get the lates timestamp. */ + if (gtm_port == NULL || gtm_host == NULL) + { + return; + } + + snprintf(gtm_connect_str, MAXPGPATH, "host=%s port=%s node_name=gtm_ctl remote_type=%d postmaster=0 connect_timeout=%d", + gtm_host, gtm_port, GTM_NODE_GTM_CTL,wait_seconds); + gtm_conn = connect_gtm(gtm_connect_str); + if (gtm_conn == NULL) { + return; + } + + ret = get_gtm_errlog(gtm_conn, wait_seconds, &errlog, &len); + if (!ret) + { + printf(_("%s: errlog len: %d \n"), progname, len); + while (len) + { + err_info = (GTM_ErrLog*)errlog; + err_info->proc_id = ntohl(err_info->proc_id); + err_info->error_no = ntohl(err_info->error_no); + err_info->log_time = be64toh(err_info->log_time); + err_info->err_level = ntohl(err_info->err_level); + err_info->errmsg_len = ntohl(err_info->errmsg_len); + + strftime(time_buff, sizeof(time_buff), + "%Y-%m-%d %H:%M:%S", + localtime_r(&err_info->log_time, &timeinfo)); + + printf(_("%d|%d|%s|%s|%d|%s\n"), err_info->proc_id, + err_info->error_no, time_buff, error_severity(err_info->err_level), err_info->errmsg_len, + err_info->errmsg); + + errlog += (sizeof(GTM_ErrLog) + err_info->errmsg_len); + len -= (sizeof(GTM_ErrLog) + err_info->errmsg_len); + } + } + else + { + printf(_("%s: Can not get errlog, please check gtm status!\n"), + progname); + } + + disconnect_gtm(gtm_conn); + return; +} + /* * utility routines */ @@ -1259,84 +1467,93 @@ main(int argc, char **argv) */ optind = 1; - /* process command-line options */ - while (optind < argc) - { - while ((c = getopt(argc, argv, "D:i:l:m:o:p:t:wWZ:H:P:g:")) != -1) - { - switch (c) - { - case 'D': - { - char *gtmdata_D; - char *env_var = pg_malloc(strlen(optarg) + 9); - - gtmdata_D = xstrdup(optarg); - canonicalize_path(gtmdata_D); - snprintf(env_var, strlen(optarg) + 9, "GTMDATA=%s", - gtmdata_D); - putenv(env_var); - - /* - * We could pass GTMDATA just in an environment - * variable but we do -D too for clearer gtm - * 'ps' display - */ - gtmdata_opt = (char *) pg_malloc(strlen(gtmdata_D) + 8); - snprintf(gtmdata_opt, strlen(gtmdata_D) + 8, - "-D \"%s\" ", - gtmdata_D); - break; - } - case 'i': - nodename = strdup(optarg); - break; - case 'l': - log_file = xstrdup(optarg); - break; - case 'm': - set_mode(optarg); - break; - case 'o': - gtm_opts = xstrdup(optarg); - break; - case 'p': - gtm_path = xstrdup(optarg); - canonicalize_path(gtm_path); - break; - case 't': - wait_seconds = atoi(optarg); - break; - case 'w': - do_wait = true; - wait_set = true; - break; - case 'W': - do_wait = false; - wait_set = true; - break; - case 'Z': - gtm_app = xstrdup(optarg); - if (strcmp(gtm_app,"gtm_proxy") != 0 - && strcmp(gtm_app,"gtm_standby") != 0 - && strcmp(gtm_app,"gtm") != 0) - { - write_stderr(_("%s: %s launch name set not correct\n"), progname, gtm_app); - do_advice(); - exit(1); - } - break; + /* process command-line options */ + while (optind < argc) + { + while ((c = getopt(argc, argv, "D:i:l:m:o:p:t:wWZ:H:P:g:c:")) != -1) + { + switch (c) + { + case 'D': + { + char *gtmdata_D; + char *env_var = pg_malloc(strlen(optarg) + 9); + + gtmdata_D = xstrdup(optarg); + canonicalize_path(gtmdata_D); + snprintf(env_var, strlen(optarg) + 9, "GTMDATA=%s", + gtmdata_D); + putenv(env_var); + + /* + * We could pass GTMDATA just in an environment + * variable but we do -D too for clearer gtm + * 'ps' display + */ + gtmdata_opt = (char *) pg_malloc(strlen(gtmdata_D) + 8); + snprintf(gtmdata_opt, strlen(gtmdata_D) + 8, + "-D \"%s\" ", + gtmdata_D); + break; + } + case 'i': 
+ nodename = strdup(optarg); + break; + case 'l': + log_file = xstrdup(optarg); + break; + case 'm': + set_mode(optarg); + break; + case 'o': + gtm_opts = xstrdup(optarg); + break; + case 'p': + gtm_path = xstrdup(optarg); + canonicalize_path(gtm_path); + break; + case 't': + wait_seconds = atoi(optarg); + break; + case 'w': + do_wait = true; + wait_set = true; + break; + case 'W': + do_wait = false; + wait_set = true; + break; + case 'Z': + gtm_app = xstrdup(optarg); + if (strcmp(gtm_app,"gtm_proxy") != 0 + && strcmp(gtm_app,"gtm_standby") != 0 + && strcmp(gtm_app,"gtm") != 0) + { + write_stderr(_("%s: %s launch name set not correct\n"), progname, gtm_app); + do_advice(); + exit(1); + } + break; #ifdef __TBASE__ - case 'H': - gtm_host = xstrdup(optarg); - break; - - case 'P': - gtm_port = xstrdup(optarg); + case 'H': + gtm_host = xstrdup(optarg); + break; + + case 'P': + gtm_port = xstrdup(optarg); break; case 'g': startup_gts = xstrdup(optarg); break; + case 'c': + clear_flag = atoi(optarg); + if (clear_flag != 0 && clear_flag != 1) + { + write_stderr(_("%s: %d clear_flag set not correct\n"), progname, clear_flag); + do_advice(); + exit(1); + } + break; #endif default: /* getopt_long already issued a suitable error message */ @@ -1369,16 +1586,20 @@ main(int argc, char **argv) ctl_command = RECONNECT_COMMAND; else if (strcmp(argv[optind], "reload") == 0) ctl_command = RELOAD_COMMAND; - else - { - write_stderr(_("%s: unrecognized operation mode \"%s\"\n"), - progname, argv[optind]); - do_advice(); - exit(1); - } - optind++; - } - } + else if (strcmp(argv[optind], "stat") == 0) + ctl_command = STAT_COMMAND; + else if (strcmp(argv[optind], "errlog") == 0) + ctl_command = ERRLOG_COMMAND; + else + { + write_stderr(_("%s: unrecognized operation mode \"%s\"\n"), + progname, argv[optind]); + do_advice(); + exit(1); + } + optind++; + } + } if (ctl_command == NO_COMMAND) { @@ -1395,13 +1616,14 @@ main(int argc, char **argv) canonicalize_path(gtm_data); } - if (!gtm_data && ctl_command != STATUS_COMMAND) - { - write_stderr("%s: no GTM/GTM Proxy directory specified \n", - progname); - do_advice(); - exit(1); - } + if (!gtm_data && ctl_command != STATUS_COMMAND && + ctl_command != STAT_COMMAND && ctl_command != ERRLOG_COMMAND) + { + write_stderr("%s: no GTM/GTM Proxy directory specified \n", + progname); + do_advice(); + exit(1); + } /* * pid files of gtm and gtm proxy are named differently @@ -1442,12 +1664,13 @@ main(int argc, char **argv) } #ifdef __TBASE__ - if(ctl_command == STATUS_COMMAND) - { - if(gtm_port == NULL) - { - write_stderr(_("%s: option -P GTM_port is not specified\n"), - progname); + if(ctl_command == STATUS_COMMAND || ctl_command == STAT_COMMAND + || ctl_command == ERRLOG_COMMAND) + { + if(gtm_port == NULL) + { + write_stderr(_("%s: option -P GTM_port is not specified\n"), + progname); do_advice(); exit(1); } @@ -1463,15 +1686,17 @@ main(int argc, char **argv) case PROMOTE_COMMAND: case STATUS_COMMAND: case RELOAD_COMMAND: - do_wait = false; - break; - case STOP_COMMAND: - do_wait = true; - break; - default: - break; - } - } + case STAT_COMMAND: + case ERRLOG_COMMAND: + do_wait = false; + break; + case STOP_COMMAND: + do_wait = true; + break; + default: + break; + } + } /* Build strings for pid file and option file */ if(gtm_data) @@ -1523,9 +1748,15 @@ main(int argc, char **argv) case RELOAD_COMMAND: do_reload(); break; - default: + case STAT_COMMAND: + do_stat(); break; - } + case ERRLOG_COMMAND: + do_errlog(); + break; + default: + break; + } exit(0); } diff --git 
a/src/gtm/libpq/pqformat.c b/src/gtm/libpq/pqformat.c index 31d11d45..0a37c574 100644 --- a/src/gtm/libpq/pqformat.c +++ b/src/gtm/libpq/pqformat.c @@ -634,5 +634,5 @@ pq_getmsgend(StringInfo msg) int pq_getmsgunreadlen(StringInfo msg) { - return msg->len - msg->cursor; + return msg->len - msg->cursor; } diff --git a/src/gtm/main/Makefile b/src/gtm/main/Makefile index 7351019f..ca5910a6 100644 --- a/src/gtm/main/Makefile +++ b/src/gtm/main/Makefile @@ -15,7 +15,7 @@ ifneq ($(PORTNAME), win32) override CFLAGS += $(PTHREAD_CFLAGS) endif -OBJS=main.o gtm_thread.o gtm_txn.o gtm_seq.o gtm_snap.o gtm_standby.o gtm_opt.o gtm_backup.o gtm_store.o gtm_xlog.o +OBJS=main.o gtm_thread.o gtm_txn.o gtm_seq.o gtm_snap.o gtm_standby.o gtm_opt.o gtm_backup.o gtm_store.o gtm_xlog.o gtm_stat.o gtm_stat_error.o OTHERS= ../libpq/libpqcomm.a ../path/libgtmpath.a ../recovery/libgtmrecovery.a ../client/libgtmclient.a ../common/libgtm.a ../../port/libpgport.a diff --git a/src/gtm/main/gtm_seq.c b/src/gtm/main/gtm_seq.c index abff352a..9941cb30 100644 --- a/src/gtm/main/gtm_seq.c +++ b/src/gtm/main/gtm_seq.c @@ -2115,48 +2115,47 @@ ProcessSequenceGetCurrentCommand(Port *myport, StringInfo message) */ void ProcessSequenceGetNextCommand(Port *myport, StringInfo message, bool is_backup) -{// #lizard forgives - GTM_SequenceKeyData seqkey; - StringInfoData buf; - GTM_Sequence seqval; - GTM_Sequence range; - GTM_Sequence rangemax; - uint32 coord_namelen; - char *coord_name; - uint32 coord_procid; - - if (Recovery_IsStandby()) - { - if (myport->remote_type != GTM_NODE_GTM) - { - elog(ERROR, "gtm standby can't provide sequence to datanodes or coordinators."); - } - } - - - seqkey.gsk_keylen = pq_getmsgint(message, sizeof (seqkey.gsk_keylen)); - seqkey.gsk_key = (char *)pq_getmsgbytes(message, seqkey.gsk_keylen); - - coord_namelen = pq_getmsgint(message, sizeof(coord_namelen)); - if (coord_namelen > 0) - coord_name = (char *)pq_getmsgbytes(message, coord_namelen); - else - coord_name = NULL; - coord_procid = pq_getmsgint(message, sizeof(coord_procid)); - memcpy(&range, pq_getmsgbytes(message, sizeof (GTM_Sequence)), - sizeof (GTM_Sequence)); - - if (GTM_SeqGetNext(&seqkey, coord_name, coord_procid, range, - &seqval, &rangemax)) - ereport(ERROR, - (ERANGE, - errmsg("Can not get current value of the sequence"))); - - - elog(DEBUG1, "Getting next value %ld for sequence %s", seqval, seqkey.gsk_key); - - if (!is_backup) - { +{ + GTM_SequenceKeyData seqkey; + StringInfoData buf; + GTM_Sequence seqval; + GTM_Sequence range; + GTM_Sequence rangemax; + uint32 coord_namelen; + char *coord_name; + uint32 coord_procid; + + if (Recovery_IsStandby()) + { + if (myport->remote_type != GTM_NODE_GTM) + { + elog(ERROR, "gtm standby can't provide sequence to datanodes or coordinators."); + } + } + + + seqkey.gsk_keylen = pq_getmsgint(message, sizeof (seqkey.gsk_keylen)); + seqkey.gsk_key = (char *)pq_getmsgbytes(message, seqkey.gsk_keylen); + + coord_namelen = pq_getmsgint(message, sizeof(coord_namelen)); + if (coord_namelen > 0) + coord_name = (char *)pq_getmsgbytes(message, coord_namelen); + else + coord_name = NULL; + coord_procid = pq_getmsgint(message, sizeof(coord_procid)); + memcpy(&range, pq_getmsgbytes(message, sizeof (GTM_Sequence)), + sizeof (GTM_Sequence)); + + if (GTM_SeqGetNext(&seqkey, coord_name, coord_procid, range, + &seqval, &rangemax)) + ereport(ERROR, + (ERANGE, + errmsg("Can not get current value of the sequence"))); + + elog(DEBUG1, "Getting next value %ld for sequence %s", seqval, seqkey.gsk_key); + + if (!is_backup) + 
{ #ifndef __XLOG__ /* Backup first */ if (GetMyConnection(myport)->standby) diff --git a/src/gtm/main/gtm_stat.c b/src/gtm/main/gtm_stat.c index 7b8d7f1d..89a51d5f 100644 --- a/src/gtm/main/gtm_stat.c +++ b/src/gtm/main/gtm_stat.c @@ -14,24 +14,263 @@ */ #include "gtm/gtm_c.h" #include "gtm/gtm.h" +#include "gtm/gtm_stat.h" +#include "gtm/gtm_msg.h" +#include "gtm/libpq.h" +#include "gtm/pqformat.h" +#include -uint32 GTM_Message_Stats[MSG_MAX_MESSAGE_TYPE]; -uint32 GTM_Result_Stats[GTM_MAX_RESULT_TYPE]; +extern int32 GTM_StoreGetUsedSeq(void); +extern int32 GTM_StoreGetUsedTxn(void); + +GTM_Statistics GTMStatistics; + +/* + * Init global gtm statistic handle + */ void -gtm_msgstat_increment(int type) +GTM_InitGtmStatistics(void) +{ + GTMStatistics.stat_start_time = time(NULL);; + SpinLockInit(>MStatistics.lock); +} + +/* + * Init the worker statistics's handle + */ +static void +GTM_InitStatisticsInfo(GTM_WorkerStatistics *stat_handle) +{ + int i = 0; + for (i = 0; i < CMD_STATISTICS_TYPE_COUNT; i++) + { + pg_atomic_init_u32(&stat_handle->cmd_statistics[i].total_request_times, 0); + pg_atomic_init_u32(&stat_handle->cmd_statistics[i].total_costtime, 0); + pg_atomic_init_u32(&stat_handle->cmd_statistics[i].max_costtime, 0); + pg_atomic_init_u32(&stat_handle->cmd_statistics[i].min_costtime, PG_UINT32_MAX); + } +} + +/* + * Reset the worker statistics's handle + */ +static void +GTM_ResetStatisticsInfo(GTM_WorkerStatistics *stat_handle) +{ + int i = 0; + for (i = 0; i < CMD_STATISTICS_TYPE_COUNT; i++) + { + pg_atomic_write_u32(&stat_handle->cmd_statistics[i].total_request_times, 0); + pg_atomic_write_u32(&stat_handle->cmd_statistics[i].total_costtime, 0); + pg_atomic_write_u32(&stat_handle->cmd_statistics[i].max_costtime, 0); + pg_atomic_write_u32(&stat_handle->cmd_statistics[i].min_costtime, PG_UINT32_MAX); + } +} + +/* + * Init the statistics item + */ +static void +GTM_InitStatisticsItemArray(GTM_StatisticsItem *cmd_item) { - GTM_Message_Stats[type]++; + int i = 0; + for (i = 0; i < CMD_STATISTICS_TYPE_COUNT; i++) + { + cmd_item[i].total_request_times = 0; + cmd_item[i].total_costtime = 0; + cmd_item[i].max_costtime = 0; + cmd_item[i].min_costtime = PG_UINT32_MAX; + } } +/* + * Init worker thread's statistics handle + * only worker thread need to call + */ void -gtm_resultstat_increment(int type) +GTM_InitStatisticsHandle(void) { - GTM_Result_Stats[type]++; + GTM_ThreadInfo *thrinfo = GetMyThreadInfo; + MemoryContext oldContext; + + AssertState(thrinfo->stat_handle == NULL); + + oldContext = MemoryContextSwitchTo(TopMostMemoryContext); + + thrinfo->stat_handle = palloc(sizeof(GTM_WorkerStatistics)); + if (thrinfo->stat_handle == NULL) + ereport(ERROR, (ENOMEM, errmsg("Out of memory"))); + + GTM_InitStatisticsInfo(thrinfo->stat_handle); + + MemoryContextSwitchTo(oldContext); } +/* + * Update statistics, when completing a command + */ void -gtm_print_stats(void) +GTM_UpdateStatistics(GTM_WorkerStatistics* stat_handle, GTM_MessageType mtype, uint32 costtime) { + GTM_StatisticsCmd mCmd; + GTM_StatisticsInfo* stat_info = NULL; + + if (mtype == MSG_GETGTS) + { + mCmd = CMD_GETGTS; + } + else if (mtype == MSG_SEQUENCE_GET_NEXT) + { + mCmd = CMD_SEQUENCE_GET_NEXT; + } + else if (mtype == MSG_TXN_START_PREPARED) + { + mCmd = CMD_TXN_START_PREPARED; + } + else + { + return; + } + + stat_info = &stat_handle->cmd_statistics[mCmd]; + pg_atomic_fetch_add_u32(&stat_info->total_request_times, 1); + pg_atomic_fetch_add_u32(&stat_info->total_costtime, costtime); + + if (costtime > 
pg_atomic_read_u32(&stat_info->max_costtime)) + { + pg_atomic_write_u32(&stat_info->max_costtime, costtime); + } + + if (costtime < pg_atomic_read_u32(&stat_info->min_costtime)) + { + pg_atomic_write_u32(&stat_info->min_costtime, costtime); + } +} + +/* + * Combine the statistics of each thread and calculate the result + */ +static void +GTM_GetMergeResult(int clear_flag, pg_time_t *stat_start_time, pg_time_t *stat_end_time, GTM_StatisticsItem *result) +{ + GTM_ThreadInfo *thrinfo = NULL; + GTM_WorkerStatistics *stat_handle = NULL; + uint32 max_costtime = 0; + uint32 min_costtime = 0; + uint32 i = 0; + uint32 j = 0; + + GTM_InitStatisticsItemArray(result); + + SpinLockAcquire(>MStatistics.lock); + GTM_RWLockAcquire(>MThreads->gt_lock, GTM_LOCKMODE_READ); + + /* Combine data from each thread */ + for (i = 0; i < GTMThreads->gt_array_size; i++) + { + thrinfo = GTMThreads->gt_threads[i]; + if(NULL == thrinfo) + { + elog(DEBUG1, "thread %d exits.", i); + continue; + } + + if(false == thrinfo->thr_epoll_ok || NULL == thrinfo->stat_handle) + { + continue; + } + + stat_handle = thrinfo->stat_handle; + for (j = 0; j < CMD_STATISTICS_TYPE_COUNT; j++) + { + result[j].total_request_times += pg_atomic_read_u32(&stat_handle->cmd_statistics[j].total_request_times); + result[j].total_costtime += pg_atomic_read_u32(&stat_handle->cmd_statistics[j].total_costtime); + max_costtime = pg_atomic_read_u32(&stat_handle->cmd_statistics[j].max_costtime); + min_costtime = pg_atomic_read_u32(&stat_handle->cmd_statistics[j].min_costtime); + if (result[j].max_costtime < max_costtime) + { + result[j].max_costtime = max_costtime; + } + + if (result[j].min_costtime > min_costtime) + { + result[j].min_costtime = min_costtime; + } + } + + if (clear_flag) + { + GTM_ResetStatisticsInfo(stat_handle); + } + } + + *stat_start_time = GTMStatistics.stat_start_time; + *stat_end_time = time(NULL); + for (i = 0; i < CMD_STATISTICS_TYPE_COUNT; i++) + { + result[i].avg_costtime = (result[i].total_request_times == 0) ? 
0 : + result[i].total_costtime / result[i].total_request_times; + } + + if (clear_flag) + { + GTMStatistics.stat_start_time = *stat_end_time; + } + + GTM_RWLockRelease(>MThreads->gt_lock); + SpinLockRelease(>MStatistics.lock); +} + +/* + * Process MSG_GET_STATISTICS message + */ +void +ProcessGetStatisticsCommand(Port *myport, StringInfo message) +{ + int32 used_seq = 0; + int32 used_txn = 0; + int clear_flag = 0; + int i = 0; + StringInfoData buf; + pg_time_t stat_start_time = 0; + pg_time_t stat_end_time = 0; + GTM_StatisticsItem result_info[CMD_STATISTICS_TYPE_COUNT]; + + clear_flag = pq_getmsgint(message, sizeof (int)); + pq_getmsgend(message); + + GTM_GetMergeResult(clear_flag, &stat_start_time, &stat_end_time, result_info); + used_seq = GTM_StoreGetUsedSeq(); + used_txn = GTM_StoreGetUsedTxn(); + + pq_beginmessage(&buf, 'S'); + pq_sendint(&buf, MSG_GET_GTM_STATISTICS_RESULT, 4); + + if (myport->remote_type == GTM_NODE_GTM_PROXY) + { + GTM_ProxyMsgHeader proxyhdr; + proxyhdr.ph_conid = myport->conn_id; + pq_sendbytes(&buf, (char *)&proxyhdr, sizeof (GTM_ProxyMsgHeader)); + } + + pq_sendint64(&buf, stat_start_time); + pq_sendint64(&buf, stat_end_time); + pq_sendint(&buf, GTM_MAX_SEQ_NUMBER - used_seq, sizeof(int32)); + pq_sendint(&buf, MAX_PREPARED_TXN - used_txn, sizeof(int32)); + for (i = 0; i < CMD_STATISTICS_TYPE_COUNT; i++) + { + pq_sendint(&buf, result_info[i].total_request_times, sizeof(int32)); + pq_sendint(&buf, result_info[i].avg_costtime, sizeof(int32)); + pq_sendint(&buf, result_info[i].max_costtime, sizeof(int32)); + pq_sendint(&buf, result_info[i].min_costtime, sizeof(int32)); + } + + pq_endmessage(myport, &buf); + if (myport->remote_type != GTM_NODE_GTM_PROXY) + { + /* Don't flush to the backup because this does not change the internal status */ + pq_flush(myport); + } } diff --git a/src/gtm/main/gtm_stat_error.c b/src/gtm/main/gtm_stat_error.c new file mode 100644 index 00000000..3e5cdd46 --- /dev/null +++ b/src/gtm/main/gtm_stat_error.c @@ -0,0 +1,385 @@ +/*------------------------------------------------------------------------- + * + * gtm_stat_error.c + + * collect error logs of gtm + * + * Copyright (c) 2020-Present TBase development team, Tencent + * + * IDENTIFICATION + * src/gtm/main/gtm_stat_error.c + * + *------------------------------------------------------------------------- + */ + +#include +#include "gtm/gtm.h" + +#include "gtm/elog.h" +#include "gtm/palloc.h" +#include "gtm/gtm_lock.h" +#include "gtm/gtm_stat_error.h" +#include "gtm/gtm_msg.h" +#include "gtm/libpq.h" +#include "gtm/pqformat.h" + +static int gtm_err_log_min = ERROR; +static int gtm_errmsg_size = GTM_MAX_ERRMSG_SIZE; +static int gtm_max_errlog_tuple_len = sizeof(GTM_ErrLog) + GTM_MAX_ERRMSG_SIZE; + +GTM_LogCollector GlobalLogCollector; +void GTM_ErrorLogCollector(ErrorData *edata, StringInfo buff); + +/* + * Build data pump buffer. + */ +DataPumpBuf* +GTM_BuildDataPumpBuf(uint32 size) +{ + DataPumpBuf *buff = NULL; + buff = (DataPumpBuf*)palloc0(sizeof(DataPumpBuf)); + if (NULL == buff) + { + return NULL; + } + + buff->length = size * 1024; + buff->buf = (char*)palloc0(buff->length); + if (NULL == buff->buf) + { + pfree(buff); + return NULL; + } + + SpinLockInit(&(buff->pointer_lock)); + + buff->head = 0; + buff->tail = 0; + buff->wrap_around = 0; + buff->border = INVALID_BORDER; + + return buff; +} + +/* + * Destroy data pump buffer. 
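+ * Releases both the data area and the DataPumpBuf control structure
+ * allocated by GTM_BuildDataPumpBuf().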
+ */ +void +GTM_DestroyDataPumpBuf(DataPumpBuf *buff) +{ + pfree(buff->buf); + pfree(buff); + return; +} + +/* + * Thread-level log collector + * call by each thread's send_message_to_server_log, can't log any error log + */ +void +GTM_ErrorLogCollector(ErrorData *edata, StringInfo buff) +{ + GTM_ThreadInfo *thrinfo = GetMyThreadInfo; + uint32 errmsg_len = 0; + uint32 free_space = 0; + GTM_ErrLog err_info; + DataPumpBuf* datapump_buff = thrinfo->datapump_buff; + + if (edata->elevel < gtm_err_log_min || 0 == buff->len) + { + return; + } + + errmsg_len = Min(buff->len, gtm_errmsg_size - 1); + + err_info.proc_id = getpid(); + err_info.error_no = edata->saved_errno; + err_info.log_time = time(NULL); + err_info.err_level = edata->elevel; + err_info.errmsg_len = errmsg_len; + + free_space = FreeSpace(datapump_buff); + if (free_space < sizeof(GTM_ErrLog) + errmsg_len) + { + return; + } + + PutData(datapump_buff, (char*) &err_info, sizeof(GTM_ErrLog)); + PutData(datapump_buff, buff->data, errmsg_len); + SetBorder(datapump_buff); +} + +/* + * Init the global log collector + */ +int +GTM_InitLogCollector(void) +{ + MemoryContext oldContext; + oldContext = MemoryContextSwitchTo(TopMostMemoryContext); + + GlobalLogCollector.tmp_buff = palloc(gtm_max_errlog_tuple_len); + if (NULL == GlobalLogCollector.tmp_buff) + { + elog(ERROR, "Failed to create tmpBuf, out of memory."); + MemoryContextSwitchTo(oldContext); + return -1; + } + + GlobalLogCollector.bloom_filter = BloomCreate(GTM_BLOOM_FILTER_SIZE, 2, 0, 97); + if (NULL == GlobalLogCollector.bloom_filter) + { + elog(ERROR, "Failed to create bloom filter, out of memory."); + pfree(GlobalLogCollector.tmp_buff); + MemoryContextSwitchTo(oldContext); + return -1; + } + + GlobalLogCollector.datapump_buff = GTM_BuildDataPumpBuf(GTM_GLOBAL_ERRLOG_DATAPUMP_SIZE); + if (NULL == GlobalLogCollector.datapump_buff) + { + elog(ERROR, "Failed to datapump buf, out of memory."); + BloomDestroy(GlobalLogCollector.bloom_filter); + pfree(GlobalLogCollector.tmp_buff); + MemoryContextSwitchTo(oldContext); + return -1; + } + + SpinLockInit(&GlobalLogCollector.lock); + pg_atomic_init_u32(&GlobalLogCollector.full, 0); + + MemoryContextSwitchTo(oldContext); + return 0; +} + +/* + * Deinit the global log collector + */ +void +GTM_DeInitLogCollector(void) +{ + if (GlobalLogCollector.tmp_buff != NULL) + { + pfree(GlobalLogCollector.tmp_buff); + GlobalLogCollector.tmp_buff = NULL; + } + + if (GlobalLogCollector.bloom_filter != NULL) + { + BloomDestroy(GlobalLogCollector.bloom_filter); + GlobalLogCollector.bloom_filter = NULL; + } + + if (GlobalLogCollector.datapump_buff != NULL) + { + GTM_DestroyDataPumpBuf(GlobalLogCollector.datapump_buff); + GlobalLogCollector.datapump_buff = NULL; + } +} + +/* + * Get a log tuple from datapump buff + */ +static int +GTM_GetLogTupleFromDataPump(DataPumpBuf* dataPumpBuf, char* buf) +{ + char* data = NULL; + uint32 data_len = 0; + uint32 offset = 0; + GTM_ErrLog* err_info = NULL; + uint32 tuple_len = 0; + + data = GetData(dataPumpBuf, &data_len); + if (NULL == data) + { + /* no data */ + return -1; + } + + if (data_len < sizeof(GTM_ErrLog)) + { + /* copy the last part of datapumpbuff to temp buff */ + memcpy(buf, data, data_len); + offset = data_len; + + IncDataOff(dataPumpBuf, data_len); + data = GetData(dataPumpBuf, &data_len); + AssertState(data != NULL); + /* copy the rest */ + memcpy((char*)buf + offset, data, sizeof(GTM_ErrLog) - offset); + data += (sizeof(GTM_ErrLog) - offset); + + err_info = (GTM_ErrLog*)buf; + tuple_len = sizeof(GTM_ErrLog) + 
err_info->errmsg_len; + + memcpy((char*)buf + sizeof(GTM_ErrLog), data, err_info->errmsg_len); + IncDataOff(dataPumpBuf, tuple_len - offset); + } + else + { + err_info = (GTM_ErrLog*)data; + tuple_len = sizeof(GTM_ErrLog) + err_info->errmsg_len; + if (data_len < tuple_len) + { + memcpy(buf, data, data_len); + offset = data_len; + + IncDataOff(dataPumpBuf, data_len); + data = GetData(dataPumpBuf, &data_len); + AssertState(data != NULL); + + memcpy((char*)buf + offset, data, tuple_len - offset); + IncDataOff(dataPumpBuf, tuple_len - offset); + } + else + { + memcpy((char*)buf, data, tuple_len); + IncDataOff(dataPumpBuf, tuple_len); + } + } + + return 0; +} + +/* + * Collect errlog data from various threads and eliminate duplication + */ +void +GTM_ProcessLogCollection(void) +{ + GTM_ThreadInfo *thrinfo = NULL; + DataPumpBuf* datapump_buff = NULL; + DataPumpBuf* global_datapump_buff = GlobalLogCollector.datapump_buff; + BLOOM *bloom_filter = GlobalLogCollector.bloom_filter; + char *tmp_buff = GlobalLogCollector.tmp_buff; + GTM_ErrLog* err_info = NULL; + int errmsg_len = 0; + uint32 i = 0; + char *msg = NULL; + + GTM_RWLockAcquire(>MThreads->gt_lock, GTM_LOCKMODE_READ); + + for (i = 0; i < GTMThreads->gt_array_size; i++) + { + thrinfo = GTMThreads->gt_threads[i]; + if(NULL == thrinfo) + { + elog(DEBUG1, "thread %d exits.", i); + continue; + } + + datapump_buff = thrinfo->datapump_buff; + if (NULL == datapump_buff) + { + continue; + } + + if (pg_atomic_read_u32(&GlobalLogCollector.full)) + { + break; + } + + while (FreeSpace(global_datapump_buff) >= gtm_max_errlog_tuple_len) + { + if (GTM_GetLogTupleFromDataPump(datapump_buff, tmp_buff)) + { + break; + } + + err_info = (GTM_ErrLog*)tmp_buff; + if (!BloomCheckAndAdd(bloom_filter, err_info->errmsg, err_info->errmsg_len)) + { + /* replace \n with space */ + msg = err_info->errmsg; + for (i = 0; i < err_info->errmsg_len; i++) + { + if (msg[i] == '\n' || msg[i] == '\t' || msg[i] == '\r') + { + msg[i] = ' '; + } + } + + /* serialize */ + errmsg_len = err_info->errmsg_len; + err_info->proc_id = htonl(err_info->proc_id); + err_info->error_no = htonl(err_info->error_no); + err_info->log_time = htobe64(err_info->log_time); + err_info->err_level = htonl(err_info->err_level); + err_info->errmsg_len = htonl(err_info->errmsg_len); + + /* put err log into global datapumpbuff */ + PutData(global_datapump_buff, (char*) err_info, sizeof(GTM_ErrLog) + errmsg_len); + SetBorder(global_datapump_buff); + } + } + + if (FreeSpace(global_datapump_buff) < gtm_max_errlog_tuple_len) + { + pg_atomic_write_u32(&GlobalLogCollector.full, 1); + elog(DEBUG1, "global datapump buff is full."); + } + } + + GTM_RWLockRelease(>MThreads->gt_lock); +} + +/* + * Process MSG_GET_ERRORLOG message + */ +void +ProcessGetErrorlogCommand(Port *myport, StringInfo message) +{ + char* data = NULL; + uint32 data_len = 0; + uint32 total_len = 0; + StringInfoData buf; + DataPumpBuf* global_datapump_buff = GlobalLogCollector.datapump_buff; + BLOOM *bloom_filter = GlobalLogCollector.bloom_filter; + + pq_getmsgend(message); + + SpinLockAcquire(&GlobalLogCollector.lock); + + pq_beginmessage(&buf, 'S'); + pq_sendint(&buf, MSG_GET_GTM_ERRORLOG_RESULT, 4); + + if (myport->remote_type == GTM_NODE_GTM_PROXY) + { + GTM_ProxyMsgHeader proxyhdr; + proxyhdr.ph_conid = myport->conn_id; + pq_sendbytes(&buf, (char *)&proxyhdr, sizeof (GTM_ProxyMsgHeader)); + } + + data = GetData(global_datapump_buff, &data_len); + while (NULL != data) + { + total_len += data_len; + /* check max len,if the producer is faster than 
the consumer, it may block here */ + if (total_len >= GTM_GLOBAL_ERRLOG_DATAPUMP_SIZE) + { + pg_atomic_write_u32(&GlobalLogCollector.full, 1); + } + + pq_sendbytes(&buf, data, data_len); + + IncDataOff(global_datapump_buff, data_len); + data = GetData(global_datapump_buff, &data_len); + } + + /* clear bitmap */ + BloomReset(bloom_filter); + if (pg_atomic_read_u32(&GlobalLogCollector.full)) + { + pg_atomic_write_u32(&GlobalLogCollector.full, 0); + } + + SpinLockRelease(&GlobalLogCollector.lock); + + pq_endmessage(myport, &buf); + + if (myport->remote_type != GTM_NODE_GTM_PROXY) + { + /* Don't flush to the backup because this does not change the internal status */ + pq_flush(myport); + } +} \ No newline at end of file diff --git a/src/gtm/main/gtm_store.c b/src/gtm/main/gtm_store.c index 5b8acac5..0123a662 100644 --- a/src/gtm/main/gtm_store.c +++ b/src/gtm/main/gtm_store.c @@ -236,8 +236,8 @@ static int32 GTM_StoreSync(char *data, size_t size); static int32 GTM_StoreInitSync(char *data, size_t size); static bool GTM_StoreCheckHeaderCRC(void); static int32 GTM_StoreGetHeader(GTMControlHeader *header); -static int32 GTM_StoreGetUsedSeq(void); -static int32 GTM_StoreGetUsedTxn(void); +int32 GTM_StoreGetUsedSeq(void); +int32 GTM_StoreGetUsedTxn(void); static bool GTM_StoreCheckSeqCRC(GTM_StoredSeqInfo *seq); static bool GTM_StoreCheckTxnCRC(GTM_StoredTransactionInfo *txn); static bool GTM_StoreSeqInFreelist(GTM_StoredSeqInfo *seq); diff --git a/src/gtm/main/gtm_thread.c b/src/gtm/main/gtm_thread.c index 8dd2344c..9d71553f 100644 --- a/src/gtm/main/gtm_thread.c +++ b/src/gtm/main/gtm_thread.c @@ -19,6 +19,8 @@ #include "gtm/gtm_xlog.h" #include "gtm/gtm_txn.h" #include "gtm/libpq.h" +#include "gtm/gtm_stat_error.h" + #ifdef __TBASE__ #include "gtm/gtm_store.h" #endif @@ -275,60 +277,64 @@ GTM_ThreadCreate(void *(* startroutine)(void *), int32 max_lock) thrinfo->insert_lock_id = -1; thrinfo->insert_try_lock_id = pthread_self() % NUM_XLOGINSERT_LOCKS; thrinfo->register_buff = NULL; - thrinfo->last_sync_gts = 0; + thrinfo->last_sync_gts = 0; + thrinfo->stat_handle = NULL; + thrinfo->datapump_buff = GTM_BuildDataPumpBuf(GTM_THREAD_ERRLOG_DATAPUMP_SIZE); #endif - /* - * Each thread gets its own ErrorContext and its a child of ErrorContext of - * the main process - * - * This is a thread-specific context and is not shared between other - * threads - */ - thrinfo->thr_error_context = AllocSetContextCreate(ErrorContext, - "ErrorContext", - 8 * 1024, - 8 * 1024, - 8 * 1024, - false); - - thrinfo->thr_startroutine = startroutine; - - /* - * Now start the thread. The thread will start executing the given - * "startroutine". The thrinfo structure is also passed to the thread. Any - * additional parameters should be passed via the thrinfo strcuture. 
- * - * Return the thrinfo structure to the caller - */ - if ((err = pthread_create(&thrinfo->thr_id, NULL, GTM_ThreadMainWrapper, - thrinfo))) - { - ereport(LOG, - (err, - errmsg("Failed to create a new thread: error %s", strerror(err)))); - - GTM_ThreadRemove(thrinfo); - - MemoryContextDelete(thrinfo->thr_error_context); - MemoryContextDelete(thrinfo->thr_thread_context); + /* + * Each thread gets its own ErrorContext and its a child of ErrorContext of + * the main process + * + * This is a thread-specific context and is not shared between other + * threads + */ + thrinfo->thr_error_context = AllocSetContextCreate(ErrorContext, + "ErrorContext", + 8 * 1024, + 8 * 1024, + 8 * 1024, + false); + + thrinfo->thr_startroutine = startroutine; + + /* + * Now start the thread. The thread will start executing the given + * "startroutine". The thrinfo structure is also passed to the thread. Any + * additional parameters should be passed via the thrinfo strcuture. + * + * Return the thrinfo structure to the caller + */ + if ((err = pthread_create(&thrinfo->thr_id, NULL, GTM_ThreadMainWrapper, + thrinfo))) + { + ereport(LOG, + (err, + errmsg("Failed to create a new thread: error %s", strerror(err)))); + + GTM_ThreadRemove(thrinfo); + + MemoryContextDelete(thrinfo->thr_error_context); + MemoryContextDelete(thrinfo->thr_thread_context); + + GTM_RWLockDestroy(&thrinfo->thr_lock); +#ifdef __TBASE__ + GTM_DestroyDataPumpBuf(thrinfo->datapump_buff); +#endif + pfree(thrinfo); - GTM_RWLockDestroy(&thrinfo->thr_lock); + return NULL; + } - pfree(thrinfo); + /* + * Ensure that the resources are released when the thread exits. (We used + * to do this inside GTM_ThreadMainWrapper, but thrinfo->thr_id may not set + * by the time GTM_ThreadMainWrapper starts executing, this possibly + * calling the function on an invalid thr_id + */ + pthread_detach(thrinfo->thr_id); - return NULL; - } - - /* - * Ensure that the resources are released when the thread exits. (We used - * to do this inside GTM_ThreadMainWrapper, but thrinfo->thr_id may not set - * by the time GTM_ThreadMainWrapper starts executing, this possibly - * calling the function on an invalid thr_id - */ - pthread_detach(thrinfo->thr_id); - - return thrinfo; + return thrinfo; } /* @@ -398,8 +404,10 @@ GTM_ThreadCleanup(void *argp) RWLockCleanUp(); if(thrinfo->locks_hold != NULL) pfree(thrinfo->locks_hold); - if(thrinfo->write_locks_hold != NULL) - pfree(thrinfo->write_locks_hold); + if(thrinfo->write_locks_hold != NULL) + pfree(thrinfo->write_locks_hold); + if(thrinfo->datapump_buff != NULL) + GTM_DestroyDataPumpBuf(thrinfo->datapump_buff); #endif /* * Switch to the memory context of the main process so that we can free up diff --git a/src/gtm/main/main.c b/src/gtm/main/main.c index 3cbfd061..03618ffe 100644 --- a/src/gtm/main/main.c +++ b/src/gtm/main/main.c @@ -57,7 +57,8 @@ #include "gtm/gtm_utils.h" #include "gtm/gtm_backup.h" #include "gtm/gtm_time.h" - +#include "gtm/gtm_stat.h" +#include "gtm/gtm_stat_error.h" #ifdef __TBASE__ #include "gtm/gtm_store.h" @@ -135,6 +136,9 @@ int g_max_thread_number = 512; /* max thread number of gtm. 
*/ GTM_ThreadInfo *g_timekeeper_thread = NULL; GTM_ThreadInfo *g_timebackup_thread = NULL; GTM_ThreadInfo *g_timer_thread = NULL; +GTM_ThreadInfo *g_logcollector_thread = NULL; +void *GTM_ThreadLogCollector(void *argp); +extern void GTM_ErrorLogCollector(ErrorData *edata, StringInfo buff); #ifdef __XLOG__ GTM_ThreadInfo *g_basebackup_thread = NULL; @@ -1476,25 +1480,41 @@ main(int argc, char *argv[]) } for(i = 0; i < max_wal_sender; i++) + { + { + GTM_ThreadInfo *thr = GTM_ThreadCreate(GTM_ThreadWalSender, g_max_lock_number); + if (NULL == thr) + { + elog(ERROR, "Failed to create wal sender thread."); + exit(1); + } + } + } + + g_logcollector_thread = GTM_ThreadCreate(GTM_ThreadLogCollector, g_max_lock_number); + if (NULL == g_logcollector_thread) { - { - GTM_ThreadInfo *thr = GTM_ThreadCreate(GTM_ThreadWalSender, g_max_lock_number); - if (NULL == thr) - { - elog(ERROR, "Failed to create wal sender thread."); - exit(1); - } - } + elog(ERROR, "Failed to create gtm log collector thread."); + exit(1); } - fprintf(stdout, "TBase create %d worker thread.\n", process_thread_num); - - /* Processing threads + Timer + Timekeeper + Timebackup threads + Walwrite + CheckPointer*/ - GTMThreads->gt_start_thread_count = process_thread_num + max_wal_sender + util_thread_cnt; - fprintf(stdout, "Start sever loop start thread count %d running thread count %d.\n", - GTMThreads->gt_start_thread_count, GTMThreads->gt_thread_count); - - elog(LOG, "Start sever loop start thread count %d running thread count %d.\n", - GTMThreads->gt_start_thread_count, GTMThreads->gt_thread_count); + util_thread_cnt++; + + fprintf(stdout, "TBase create %d worker thread.\n", process_thread_num); + + /* Processing threads + Timer + Timekeeper + Timebackup threads + Walwrite + CheckPointer*/ + GTMThreads->gt_start_thread_count = process_thread_num + max_wal_sender + util_thread_cnt; + fprintf(stdout, "Start sever loop start thread count %d running thread count %d.\n", + GTMThreads->gt_start_thread_count, GTMThreads->gt_thread_count); + + elog(LOG, "Start sever loop start thread count %d running thread count %d.\n", + GTMThreads->gt_start_thread_count, GTMThreads->gt_thread_count); + + /* init statistic time */ + GTM_InitGtmStatistics(); + + /* init log hook */ + errlog_collection_func = GTM_ErrorLogCollector; + #endif fprintf(stdout, "TBase GTM is ready to go!!\n"); /* @@ -2352,6 +2372,92 @@ GTM_ThreadWalSender(void *argp) return my_threadinfo; } +/* + * Log collection thread, responsible for summarizing + * the log data of each thread to global datapump + */ +void * +GTM_ThreadLogCollector(void *argp) +{ + GTM_ThreadInfo *my_threadinfo = (GTM_ThreadInfo *)argp; + sigjmp_buf local_sigjmp_buf; + struct sigaction action; + int ret = 0; + action.sa_flags = 0; + action.sa_handler = GTM_ThreadSigHandler; + + ret = sigaction(SIGQUIT, &action, NULL); + if (ret) + { + elog(LOG, "register thread quit handler failed"); + } + + elog(DEBUG8, "Starting the log collector thread"); + MessageContext = AllocSetContextCreate(TopMemoryContext, + "MessageContext", + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, + ALLOCSET_DEFAULT_MAXSIZE, + false); + + /* + * POSTGRES main processing loop begins here + * + * If an exception is encountered, processing resumes here so we abort the + * current transaction and start a new one. + * + * You might wonder why this isn't coded as an infinite loop around a + * PG_TRY construct. 
The reason is that this is the bottom of the + * exception stack, and so with PG_TRY there would be no exception handler + * in force at all during the CATCH part. By leaving the outermost setjmp + * always active, we have at least some chance of recovering from an error + * during error recovery. (If we get into an infinite loop thereby, it + * will soon be stopped by overflow of elog.c's internal state stack.) + */ + + if (sigsetjmp(local_sigjmp_buf, 1) != 0) + { +#ifdef __TBASE__ + RWLockCleanUp(); +#endif + EmitErrorReport(NULL); + + /* + * Now return to normal top-level context and clear ErrorContext for + * next time. + */ + MemoryContextSwitchTo(TopMemoryContext); + FlushErrorState(); + } + + /* We can now handle ereport(ERROR) */ + PG_exception_stack = &local_sigjmp_buf; + + if (GTM_InitLogCollector() != 0) + { + elog(ERROR, "Failed to Init LogCollector."); + exit(1); + } + + for(;;) + { + /* no need to lock here. */ + if(GTM_SHUTTING_DOWN == GTMTransactions.gt_gtm_state) + { + break; + } + + /* sleep GTM_LOG_COLLECT_CYCLE */ + usleep(GTM_LOG_COLLECT_CYCLE); + + GTM_ProcessLogCollection(); + } + + GTM_DeInitLogCollector(); + elog(LOG, "GTM is shutting down, log collector exits!"); + return my_threadinfo; +} + void SendXLogSyncStatus(GTM_Conn *conn) {// #lizard forgives @@ -2955,13 +3061,15 @@ GTM_ThreadMain(void *argp) action.sa_handler = GTM_ThreadSigHandler; ret = sigaction(SIGQUIT, &action, NULL); - if (ret) - { - elog(LOG, "register thread quit handler failed"); - } + if (ret) + { + elog(LOG, "register thread quit handler failed"); + } - elog(DEBUG8, "Starting the connection helper thread"); - bind_service_threads(); + elog(DEBUG8, "Starting the connection helper thread"); + bind_service_threads(); + + GTM_InitStatisticsHandle(); /* * Create the memory context we will use in the main loop. @@ -2971,52 +3079,52 @@ GTM_ThreadMain(void *argp) * * This context is thread-specific */ - MessageContext = AllocSetContextCreate(TopMemoryContext, - "MessageContext", - ALLOCSET_DEFAULT_MINSIZE, - ALLOCSET_DEFAULT_INITSIZE, - ALLOCSET_DEFAULT_MAXSIZE, - false); - - efd = epoll_create1(0); - if(efd == -1) - { - elog(ERROR, "failed to create epoll"); - } - thrinfo->thr_efd = efd; - thrinfo->thr_epoll_ok = true; - - /* - * Acquire the thread lock to prevent connection from GTM-Standby to update - * GTM-Standby registration. - */ - - /* - * Get the input_message in the TopMemoryContext so that we don't need to - * free/palloc it for every incoming message. Unlike Postgres, we don't - * expect the incoming messages to be of arbitrary sizes - */ - - initStringInfo(&input_message); - - /* - * POSTGRES main processing loop begins here - * - * If an exception is encountered, processing resumes here so we abort the - * current transaction and start a new one. - * - * You might wonder why this isn't coded as an infinite loop around a - * PG_TRY construct. The reason is that this is the bottom of the - * exception stack, and so with PG_TRY there would be no exception handler - * in force at all during the CATCH part. By leaving the outermost setjmp - * always active, we have at least some chance of recovering from an error - * during error recovery. (If we get into an infinite loop thereby, it - * will soon be stopped by overflow of elog.c's internal state stack.) 
- */ - - if (sigsetjmp(local_sigjmp_buf, 1) != 0) - { - bool report = false; + MessageContext = AllocSetContextCreate(TopMemoryContext, + "MessageContext", + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, + ALLOCSET_DEFAULT_MAXSIZE, + false); + + efd = epoll_create1(0); + if(efd == -1) + { + elog(ERROR, "failed to create epoll"); + } + thrinfo->thr_efd = efd; + thrinfo->thr_epoll_ok = true; + + /* + * Acquire the thread lock to prevent connection from GTM-Standby to update + * GTM-Standby registration. + */ + + /* + * Get the input_message in the TopMemoryContext so that we don't need to + * free/palloc it for every incoming message. Unlike Postgres, we don't + * expect the incoming messages to be of arbitrary sizes + */ + + initStringInfo(&input_message); + + /* + * POSTGRES main processing loop begins here + * + * If an exception is encountered, processing resumes here so we abort the + * current transaction and start a new one. + * + * You might wonder why this isn't coded as an infinite loop around a + * PG_TRY construct. The reason is that this is the bottom of the + * exception stack, and so with PG_TRY there would be no exception handler + * in force at all during the CATCH part. By leaving the outermost setjmp + * always active, we have at least some chance of recovering from an error + * during error recovery. (If we get into an infinite loop thereby, it + * will soon be stopped by overflow of elog.c's internal state stack.) + */ + + if (sigsetjmp(local_sigjmp_buf, 1) != 0) + { + bool report = false; #ifdef __TBASE__ RWLockCleanUp(); #endif @@ -3422,7 +3530,7 @@ ProcessCommand(Port *myport, StringInfo input_message) #ifdef __TBASE__ GTM_ThreadInfo *my_threadinfo = NULL; long long start_time; - long long end_time; + long long cost_time; my_threadinfo = GetMyThreadInfo; #ifndef __XLOG__ GTM_ConnectionInfo *conn; @@ -3641,6 +3749,16 @@ ProcessCommand(Port *myport, StringInfo input_message) break; } #endif + case MSG_GET_STATISTICS: + { + ProcessGetStatisticsCommand(myport,input_message); + break; + } + case MSG_GET_ERRORLOG: + { + ProcessGetErrorlogCommand(myport,input_message); + break; + } #endif default: ereport(FATAL, @@ -3651,12 +3769,13 @@ ProcessCommand(Port *myport, StringInfo input_message) BeforeReplyToClientXLogTrigger(); - end_time = getSystemTime(); +#ifdef __TBASE__ + cost_time = getSystemTime() - start_time; + if(enable_gtm_debug || cost_time > warnning_time_cost) + elog(LOG, "cost mtype = %s (%d) %lld ms.", gtm_util_message_name(mtype), (int)mtype,cost_time); - if(enable_gtm_debug || end_time - start_time > warnning_time_cost) - elog(LOG, "cost mtype = %s (%d) %lld ms.", gtm_util_message_name(mtype), (int)mtype,end_time - start_time); + GTM_UpdateStatistics(my_threadinfo->stat_handle, mtype, cost_time); -#ifdef __TBASE__ if (my_threadinfo->handle_standby) { GTM_RWLockRelease(&my_threadinfo->thr_lock); diff --git a/src/include/gtm/bloom.h b/src/include/gtm/bloom.h new file mode 100644 index 00000000..5bdb4936 --- /dev/null +++ b/src/include/gtm/bloom.h @@ -0,0 +1,39 @@ +/*------------------------------------------------------------------------- + * + * bloom.h + * + * + * a bloom filter, using murmurhash + * + * Copyright (c) 2020-Present TBase development team, Tencent + * + * + * IDENTIFICATION + * src/include/gtm/bloom.h + * + *------------------------------------------------------------------------- + */ +#ifndef _BLOOM_H +#define _BLOOM_H + +#include "gtm/gtm_lock.h" + +typedef unsigned int (*hashfunc_t)(const void *, int); + +typedef struct +{ + int 
bitmap_size; /* bitmap size of bloom filter */ + unsigned char* bitmap; /* bloom filter bitmap */ + int nfuncs; /* hash functions num */ + uint32* seeds; /* hash functions seeds */ +} BLOOM; + +BLOOM *BloomCreate(int bitmap_size, int nfuncs, ...); +int BloomDestroy(BLOOM *bloom); +void BloomReset(BLOOM *bloom); +void BloomAdd(BLOOM *bloom, const char *s, int len); +bool BloomCheck(BLOOM *bloom, const char *s, int len); +bool BloomCheckAndAdd(BLOOM *bloom, const char *s, int len); +uint32_t MurmurHash2(const void * key, int len, uint32_t seed); + +#endif diff --git a/src/include/gtm/datapump.h b/src/include/gtm/datapump.h new file mode 100644 index 00000000..360b56a7 --- /dev/null +++ b/src/include/gtm/datapump.h @@ -0,0 +1,52 @@ +/*------------------------------------------------------------------------- + * + * datapump.h + * + * + * lockless message queue + * + * Copyright (c) 2020-Present TBase development team, Tencent + * + * + * IDENTIFICATION + * src/include/gtm/datapump.h + * + *------------------------------------------------------------------------- + */ +#ifndef _DATAPUMP_H +#define _DATAPUMP_H + +#include "gtm/gtm_c.h" +#include "gtm/gtm_lock.h" + +#define INVALID_BORDER (~((uint32)0)) +typedef struct +{ + char *buf; /* Data buffer */ + unsigned length; /* Data buffer length */ + s_lock_t pointer_lock; /* lock to protect offset and status */ + volatile uint32 head; /* Head of the loop */ + volatile uint32 tail; /* Tail of the buffer */ + volatile uint32 border; /* end of last tuple, so that we can send a complete tuple */ + volatile uint32 wrap_around; /* wrap around of the queue , for read only */ +} DataPumpBuf; + +uint32 DataSize(DataPumpBuf *buf); +uint32 FreeSpace(DataPumpBuf *buf); +char *GetData(DataPumpBuf *buf, uint32 *uiLen); +void IncDataOff(DataPumpBuf *buf, uint32 uiLen); +char *GetWriteOff(DataPumpBuf *buf, uint32 *uiLen); +void IncWriteOff(DataPumpBuf *buf, uint32 uiLen); +char *GetWriteOff(DataPumpBuf *buf, uint32 *uiLen); +uint32 BufferOffsetAdd(DataPumpBuf *buf, uint32 pointer, uint32 offset); +int ReserveSpace(DataPumpBuf *buf, uint32 len, uint32 *offset); +int ReturnSpace(DataPumpBuf *buf, uint32 offset); +void FillReserveSpace(DataPumpBuf *buf, uint32 offset, char *p, uint32 len); +void SetBorder(DataPumpBuf *buf); +void *DataPumpSenderThread(void *arg); +void PutData(DataPumpBuf *buf, char *data, uint32 len); + + + + +#endif diff --git a/src/include/gtm/elog.h b/src/include/gtm/elog.h index c79eaba5..70387e54 100644 --- a/src/include/gtm/elog.h +++ b/src/include/gtm/elog.h @@ -76,6 +76,7 @@ #define ELOG_H #include "c.h" +#include "stringinfo.h" /* Error level codes */ #define DEBUG8 9 @@ -315,4 +316,9 @@ write_stderr(const char *fmt,...) the supplied arguments. 
*/ __attribute__((format(printf, 1, 2))); + +/* log collection function hook */ +typedef void (*errlog_collection_hook_type) (ErrorData *edata, StringInfo buff); +extern errlog_collection_hook_type errlog_collection_func; + #endif /* GTM_ELOG_H */ diff --git a/src/include/gtm/gtm.h b/src/include/gtm/gtm.h index f98af0ff..715d91cb 100644 --- a/src/include/gtm/gtm.h +++ b/src/include/gtm/gtm.h @@ -25,6 +25,8 @@ #include "gtm/elog.h" #include "gtm/gtm_list.h" #include "gtm/gtm_xlog_internal.h" +#include "gtm/gtm_stat.h" +#include "gtm/datapump.h" extern char *GTMLogFile; typedef enum GTM_ThreadStatus @@ -98,6 +100,8 @@ typedef struct GTM_ThreadInfo XLogWaiter xlog_waiter; bool handle_standby; #endif + GTM_WorkerStatistics *stat_handle; /* statistics hanndle */ + DataPumpBuf *datapump_buff; /* log collection buff */ } GTM_ThreadInfo; typedef struct GTM_Threads diff --git a/src/include/gtm/gtm_c.h b/src/include/gtm/gtm_c.h index b2c6382b..7af3e735 100644 --- a/src/include/gtm/gtm_c.h +++ b/src/include/gtm/gtm_c.h @@ -392,7 +392,7 @@ typedef enum #define GTM_GTS_ONE_SECOND (1000 * 1000L) #define GTM_SYNC_CYCLE (5 * GTM_GTS_ONE_SECOND) #define GTM_SYNC_TIME_LIMIT (60 * GTM_GTS_ONE_SECOND) - +#define GTM_LOG_COLLECT_CYCLE (5 * GTM_GTS_ONE_SECOND) #pragma pack() diff --git a/src/include/gtm/gtm_client.h b/src/include/gtm/gtm_client.h index f7b17ac9..2381286a 100644 --- a/src/include/gtm/gtm_client.h +++ b/src/include/gtm/gtm_client.h @@ -22,6 +22,8 @@ #include "gtm/register.h" #include "gtm/libpq-fe.h" #include "access/xlogdefs.h" +#include "gtm/gtm_stat.h" + #define MAX_HOSTADDR_LEN 32 #define MAX_PORT_LEN 8 @@ -77,110 +79,111 @@ typedef union GTM_ResultData #endif - GlobalTransactionId grd_gxid; /* TXN_PREPARE - * TXN_START_PREPARED - * TXN_ROLLBACK - */ - struct { - GlobalTransactionId gxid; - /* TXN_COMMIT - * TXN_COMMIT_PREPARED - */ - int status; - } grd_eof_txn; - - GlobalTransactionId grd_next_gxid; - - struct - { - GTM_TransactionHandle txnhandle; - GlobalTransactionId gxid; - } grd_txn; /* TXN_GET_GXID */ - - GTM_SequenceKeyData grd_seqkey; /* SEQUENCE_INIT - * SEQUENCE_RESET - * SEQUENCE_CLOSE */ - struct - { - GTM_SequenceKeyData seqkey; - GTM_Sequence seqval; - GTM_Sequence rangemax; - } grd_seq; /* SEQUENCE_GET_CURRENT - * SEQUENCE_GET_NEXT */ - struct - { - int32 seq_count; - GTM_SeqInfo *seq; - } grd_seq_list; /* SEQUENCE_GET_LIST */ - - struct - { - int32 txn_count; /* TXN_BEGIN_GETGXID_MULTI */ - GlobalTransactionId txn_gxid[GTM_MAX_GLOBAL_TRANSACTIONS]; - GTM_Timestamp timestamp; - } grd_txn_get_multi; - - struct - { - int ts_count; /* GETGTS_MULTI */ - GTM_Timestamp gts[GTM_MAX_GLOBAL_TRANSACTIONS]; - } grd_gts_get_multi; - - struct - { - int txn_count; /* TXN_COMMIT_MULTI */ - int status[GTM_MAX_GLOBAL_TRANSACTIONS]; - } grd_txn_rc_multi; - - struct - { - GTM_TransactionHandle txnhandle; /* SNAPSHOT_GXID_GET */ - GlobalTransactionId gxid; /* SNAPSHOT_GET */ - int txn_count; /* SNAPSHOT_GET_MULTI */ - int status[GTM_MAX_GLOBAL_TRANSACTIONS]; - } grd_txn_snap_multi; - - struct - { - GlobalTransactionId gxid; - GlobalTransactionId prepared_gxid; - int nodelen; - char *nodestring; - } grd_txn_get_gid_data; /* TXN_GET_GID_DATA_RESULT */ - - struct - { - char *ptr; - int len; - } grd_txn_gid_list; /* TXN_GXID_LIST_RESULT */ - - struct - { - GTM_PGXCNodeType type; /* NODE_REGISTER */ - int len; - char *node_name; /* NODE_UNREGISTER */ - GlobalTransactionId xmin; - } grd_node; - - struct - { - int num_node; - GTM_PGXCNodeInfo *nodeinfo[MAX_NODES]; - } grd_node_list; - - struct - { - 
GlobalTransactionId latest_completed_xid; - GlobalTransactionId global_xmin; - int errcode; - } grd_report_xmin; /* REPORT_XMIN */ - - - /* - * TODO - * TXN_GET_STATUS - * TXN_GET_ALL_PREPARED - */ + GlobalTransactionId grd_gxid; /* TXN_PREPARE + * TXN_START_PREPARED + * TXN_ROLLBACK + */ + struct { + GlobalTransactionId gxid; + /* TXN_COMMIT + * TXN_COMMIT_PREPARED + */ + int status; + } grd_eof_txn; + + GlobalTransactionId grd_next_gxid; + + struct + { + GTM_TransactionHandle txnhandle; + GlobalTransactionId gxid; + } grd_txn; /* TXN_GET_GXID */ + + GTM_SequenceKeyData grd_seqkey; /* SEQUENCE_INIT + * SEQUENCE_RESET + * SEQUENCE_CLOSE */ + struct + { + GTM_SequenceKeyData seqkey; + GTM_Sequence seqval; + GTM_Sequence rangemax; + } grd_seq; /* SEQUENCE_GET_CURRENT + * SEQUENCE_GET_NEXT */ + struct + { + int32 seq_count; + GTM_SeqInfo *seq; + } grd_seq_list; /* SEQUENCE_GET_LIST */ + + struct + { + int32 txn_count; /* TXN_BEGIN_GETGXID_MULTI */ + GlobalTransactionId txn_gxid[GTM_MAX_GLOBAL_TRANSACTIONS]; + GTM_Timestamp timestamp; + } grd_txn_get_multi; + + struct + { + int ts_count; /* GETGTS_MULTI */ + GTM_Timestamp gts[GTM_MAX_GLOBAL_TRANSACTIONS]; + } grd_gts_get_multi; + + struct + { + int txn_count; /* TXN_COMMIT_MULTI */ + int status[GTM_MAX_GLOBAL_TRANSACTIONS]; + } grd_txn_rc_multi; + + struct + { + GTM_TransactionHandle txnhandle; /* SNAPSHOT_GXID_GET */ + GlobalTransactionId gxid; /* SNAPSHOT_GET */ + int txn_count; /* SNAPSHOT_GET_MULTI */ + int status[GTM_MAX_GLOBAL_TRANSACTIONS]; + } grd_txn_snap_multi; + + struct + { + GlobalTransactionId gxid; + GlobalTransactionId prepared_gxid; + int nodelen; + char *nodestring; + } grd_txn_get_gid_data; /* TXN_GET_GID_DATA_RESULT */ + + struct + { + char *ptr; + int len; + } grd_txn_gid_list; /* TXN_GXID_LIST_RESULT */ + + struct + { + GTM_PGXCNodeType type; /* NODE_REGISTER */ + int len; + char *node_name; /* NODE_UNREGISTER */ + GlobalTransactionId xmin; + } grd_node; + + struct + { + int num_node; + GTM_PGXCNodeInfo *nodeinfo[MAX_NODES]; + } grd_node_list; + + struct + { + GlobalTransactionId latest_completed_xid; + GlobalTransactionId global_xmin; + int errcode; + } grd_report_xmin; /* REPORT_XMIN */ + + GTM_StatisticsResult statistic_result; + + /* + * TODO + * TXN_GET_STATUS + * TXN_GET_ALL_PREPARED + */ } GTM_ResultData; #define GTM_RESULT_COMM_ERROR (-2) /* Communication error */ @@ -210,35 +213,41 @@ typedef struct GTM_Result XLogRecPtr start_pos; TimeLineID time_line; #endif - } grd_storage_data; /* STORAGE_TRANSFER_RESULT */ - int gr_finish_status; /* TXN_FINISH_GID_RESULT result */ - GTMStorageStatus gtm_status; - - struct - { - int32 count; - GTM_StoredSeqInfo *seqs; - }grd_store_seq; + } grd_storage_data; /* STORAGE_TRANSFER_RESULT */ + int gr_finish_status; /* TXN_FINISH_GID_RESULT result */ + GTMStorageStatus gtm_status; + + struct + { + int32 count; + GTM_StoredSeqInfo *seqs; + }grd_store_seq; + + struct + { + int32 count; + GTM_StoredTransactionInfo *txns; + }grd_store_txn; + + + struct + { + int32 count; + GTMStorageSequneceStatus *seqs; + }grd_store_check_seq; + + struct + { + int32 count; + GTMStorageTransactionStatus *txns; + }grd_store_check_txn; - struct - { - int32 count; - GTM_StoredTransactionInfo *txns; - }grd_store_txn; - - - struct + struct { - int32 count; - GTMStorageSequneceStatus *seqs; - }grd_store_check_seq; + int len; + char* errlog; + } grd_errlog; - struct - { - int32 count; - GTMStorageTransactionStatus *txns; - }grd_store_check_txn; - #endif /* * We keep these two items outside the union to avoid 
repeated malloc/free @@ -296,6 +305,9 @@ int check_gtm_status(GTM_Conn *conn, int *status, GTM_Timestamp *master,XLogRecP int check_gtm_status(GTM_Conn *conn, int *status, GTM_Timestamp *master, GTM_Timestamp *standby, char *standbyhost, char *standbyport, int32 buflen); #endif int bkup_global_timestamp(GTM_Conn *conn, GlobalTimestamp timestamp); +int get_gtm_statistics(GTM_Conn *conn, int clear_flag, int timeout_seconds, GTM_StatisticsResult** result); +int get_gtm_errlog(GTM_Conn *conn, int timeout_seconds, char** errlog, int* len); + #endif int bkup_begin_transaction_gxid(GTM_Conn *conn, GlobalTransactionId gxid, diff --git a/src/include/gtm/gtm_msg.h b/src/include/gtm/gtm_msg.h index 69fcd9ed..bb66c194 100644 --- a/src/include/gtm/gtm_msg.h +++ b/src/include/gtm/gtm_msg.h @@ -121,6 +121,8 @@ typedef enum GTM_MessageType MSG_GET_REPLICATION_STATUS, MSG_GET_REPLICATION_TRANSFER, #endif + MSG_GET_STATISTICS, + MSG_GET_ERRORLOG, /* * Must be at the end @@ -204,7 +206,10 @@ typedef enum GTM_ResultType MSG_REPLICATION_CONTENT, #endif - RESULT_TYPE_COUNT + MSG_GET_GTM_STATISTICS_RESULT, + MSG_GET_GTM_ERRORLOG_RESULT, + + RESULT_TYPE_COUNT } GTM_ResultType; /* diff --git a/src/include/gtm/gtm_stat.h b/src/include/gtm/gtm_stat.h new file mode 100644 index 00000000..b58dd330 --- /dev/null +++ b/src/include/gtm/gtm_stat.h @@ -0,0 +1,86 @@ +/*------------------------------------------------------------------------- + * + * gtm_stat.h + * + * + * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2010-2012 Postgres-XC Development Group + * Portions Copyright (c) 2012-2018 TBase Development Group + * + * $PostgreSQL$ + * + *------------------------------------------------------------------------- + */ +#ifndef _GTM_STAT_H +#define _GTM_STAT_H + +#include "gtm/gtm_c.h" +#include "gtm/gtm_lock.h" +#include "gtm/gtm_msg.h" +#include "gtm/libpq-be.h" +#include "gtm/stringinfo.h" +#include "port/atomics.h" + +typedef int64 pg_time_t; +#define CACHE_LINE_SIZE 64 +#define CACHE_LINE_ALIGN __attribute__((aligned(CACHE_LINE_SIZE))) + +typedef enum GTM_Statistic_Cmd +{ + CMD_GETGTS, + CMD_SEQUENCE_GET_NEXT, + CMD_TXN_START_PREPARED, + CMD_STATISTICS_TYPE_COUNT +} GTM_StatisticsCmd; + +typedef struct +{ + pg_atomic_uint32 total_request_times; + pg_atomic_uint32 total_costtime; + pg_atomic_uint32 max_costtime; + pg_atomic_uint32 min_costtime; +} CACHE_LINE_ALIGN GTM_StatisticsInfo; + +typedef struct +{ + GTM_StatisticsInfo cmd_statistics[CMD_STATISTICS_TYPE_COUNT]; +} GTM_WorkerStatistics; + +typedef struct +{ + uint32 total_request_times; + union + { + uint32 total_costtime; + uint32 avg_costtime; + }; + uint32 max_costtime; + uint32 min_costtime; +} GTM_StatisticsItem; + +typedef struct +{ + pg_time_t start_time; /* statistics info start time */ + pg_time_t end_time; /* statistics info end time */ + int32 sequences_remained; /* sequence remained num */ + int32 txn_remained; /* txn remained num */ + GTM_StatisticsItem stat_info[CMD_STATISTICS_TYPE_COUNT]; /* specific cmd statistics info */ +} GTM_StatisticsResult; + +typedef struct +{ + pg_time_t stat_start_time; /* statistics info start time */ + s_lock_t lock; /* lock to avoid multi client */ +} GTM_Statistics; + +extern GTM_Statistics GTMStatistics; + +void GTM_InitGtmStatistics(void); + +void GTM_InitStatisticsHandle(void); + +void GTM_UpdateStatistics(GTM_WorkerStatistics* stat_handle, GTM_MessageType mtype, uint32 costtime); + +void 
ProcessGetStatisticsCommand(Port *myport, StringInfo message); +#endif diff --git a/src/include/gtm/gtm_stat_error.h b/src/include/gtm/gtm_stat_error.h new file mode 100644 index 00000000..608eb854 --- /dev/null +++ b/src/include/gtm/gtm_stat_error.h @@ -0,0 +1,56 @@ +/*------------------------------------------------------------------------- + * + * gtm_stat_error.h + + * collect error logs of gtm + * + * Copyright (c) 2020-Present TBase development team, Tencent + * + * IDENTIFICATION + * src/gtm/main/gtm_stat_error.h + * + *------------------------------------------------------------------------- + */ +#ifndef _GTM_STAT_ERROR_H +#define _GTM_STAT_ERROR_H + +#include "gtm/gtm_c.h" +#include "gtm/gtm_lock.h" +#include "gtm/datapump.h" +#include "gtm/bloom.h" + +#define GTM_MAX_ERRMSG_SIZE (1024) /* max size of each error msg to track */ +#define GTM_BLOOM_FILTER_SIZE (1 * 1024 * 1024) +#define GTM_GLOBAL_ERRLOG_DATAPUMP_SIZE (10 * 1024) /* k */ +#define GTM_THREAD_ERRLOG_DATAPUMP_SIZE (16) /* k */ + +typedef int64 pg_time_t; + +typedef struct +{ + int proc_id; /* process id */ + int error_no; /* errno */ + pg_time_t log_time; /* log time */ + int err_level; /* error level */ + int errmsg_len; /* length of valid bytes in error message */ + char errmsg[0]; /* variable length array - must be last */ +} GTM_ErrLog; + +typedef struct +{ + s_lock_t lock; /* lock to avoid multi client */ + pg_atomic_uint32 full; /* datapump is full */ + char *tmp_buff; /* a buff use to read tuple data */ + BLOOM *bloom_filter; /* bloom filter use to exclude duplicates */ + DataPumpBuf *datapump_buff; /* circular queue buffer */ +} GTM_LogCollector; + +extern GTM_LogCollector GlobalLogCollector; + +DataPumpBuf *GTM_BuildDataPumpBuf(uint32 size); +void GTM_DestroyDataPumpBuf(DataPumpBuf *buff); +int GTM_InitLogCollector(void); +void GTM_DeInitLogCollector(void); +void GTM_ProcessLogCollection(void); +void ProcessGetErrorlogCommand(Port *myport, StringInfo message); +#endif From d6aec823f4d5562041ff003154f79a1a6c241339 Mon Sep 17 00:00:00 2001 From: qiannzhang Date: Fri, 25 Sep 2020 19:56:11 +0800 Subject: [PATCH 065/578] Run Node Lock/UnLock locally. http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131082021563 --- src/backend/tcop/pquery.c | 46 +++++++++++++++++++++----------------- src/backend/tcop/utility.c | 6 ++++- 2 files changed, 30 insertions(+), 22 deletions(-) diff --git a/src/backend/tcop/pquery.c b/src/backend/tcop/pquery.c index 67cf48ae..e7068eab 100644 --- a/src/backend/tcop/pquery.c +++ b/src/backend/tcop/pquery.c @@ -1864,27 +1864,31 @@ PortalRunUtility(Portal portal, PlannedStmt *pstmt, GetGtmInfoFromUserCmd(utilityStmt); #endif - /* - * Set snapshot if utility stmt needs one. Most reliable way to do this - * seems to be to enumerate those that do not need one; this is a short - * list. Transaction control, LOCK, and SET must *not* set a snapshot - * since they need to be executable at the start of a transaction-snapshot - * mode transaction without freezing a snapshot. By extension we allow - * SHOW not to set a snapshot. The other stmts listed are just efficiency - * hacks. Beware of listing anything that can modify the database --- if, - * say, it has to update an index with expressions that invoke - * user-defined functions, then it had better have a snapshot. 
- */ - if (!(IsA(utilityStmt, TransactionStmt) || - IsA(utilityStmt, LockStmt) || - IsA(utilityStmt, VariableSetStmt) || - IsA(utilityStmt, VariableShowStmt) || - IsA(utilityStmt, ConstraintsSetStmt) || - /* efficiency hacks from here down */ - IsA(utilityStmt, FetchStmt) || - IsA(utilityStmt, ListenStmt) || - IsA(utilityStmt, NotifyStmt) || - IsA(utilityStmt, UnlistenStmt) || + /* + * Set snapshot if utility stmt needs one. Most reliable way to do this + * seems to be to enumerate those that do not need one; this is a short + * list. Transaction control, LOCK, and SET must *not* set a snapshot + * since they need to be executable at the start of a transaction-snapshot + * mode transaction without freezing a snapshot. By extension we allow + * SHOW not to set a snapshot. The other stmts listed are just efficiency + * hacks. Beware of listing anything that can modify the database --- if, + * say, it has to update an index with expressions that invoke + * user-defined functions, then it had better have a snapshot. + */ + if (!(IsA(utilityStmt, TransactionStmt) || + IsA(utilityStmt, LockStmt) || + IsA(utilityStmt, VariableSetStmt) || + IsA(utilityStmt, VariableShowStmt) || + IsA(utilityStmt, ConstraintsSetStmt) || + /* efficiency hacks from here down */ + IsA(utilityStmt, FetchStmt) || + IsA(utilityStmt, ListenStmt) || + IsA(utilityStmt, NotifyStmt) || + IsA(utilityStmt, UnlistenStmt) || +#ifdef __TBASE__ + /* Node Lock/Unlock do not modify any data */ + IsA(utilityStmt, LockNodeStmt) || +#endif #ifdef PGXC IsA(utilityStmt, PauseClusterStmt) || IsA(utilityStmt, BarrierStmt) || diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c index f331a920..f5d10269 100644 --- a/src/backend/tcop/utility.c +++ b/src/backend/tcop/utility.c @@ -6581,7 +6581,11 @@ IsStmtAllowedInLockedMode(Node *parsetree, const char *queryString) #ifdef XCP case T_PauseClusterStmt: #endif - return ALLOW; +#ifdef __TBASE__ + /* Node Lock/Unlock do not modify any data */ + case T_LockNodeStmt: +#endif + return ALLOW; default: return DISALLOW; From f50e456f0e54850f3acf5094d989403565dfc25e Mon Sep 17 00:00:00 2001 From: mark Date: Fri, 25 Sep 2020 17:51:58 +0800 Subject: [PATCH 066/578] [TAPD 82017165] Correctly reset the overdue information stored in shard map. 
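
The intended reset rule can be sketched as a small standalone C program (illustration only, not the patched TBase code: the pthread mutex merely stands in for the spinlock used in shardmap.c, and the group oids are arbitrary):

    /*
     * Illustrative sketch only. Reload the per-datanode shard map manager
     * not only when it has never been used, but also when it still caches a
     * different (overdue) group oid, e.g. after a group-syncing backend
     * crashed before finishing the shmem sync.
     */
    #include <stdio.h>
    #include <stdbool.h>
    #include <pthread.h>

    typedef unsigned int Oid;

    typedef struct
    {
        bool            used;   /* has the manager ever been loaded */
        Oid             group;  /* group oid currently cached in shmem */
        pthread_mutex_t lock;   /* stands in for the spinlock in the patch */
    } ToyShardMgr;

    static void toy_sync_shard_map(ToyShardMgr *mgr, Oid curr_group)
    {
        if (!mgr->used || mgr->group != curr_group)
        {
            /* unused or overdue: reset the manager before reloading */
            pthread_mutex_lock(&mgr->lock);
            mgr->group = curr_group;
            mgr->used = true;
            pthread_mutex_unlock(&mgr->lock);
            printf("reloading shard map for group %u\n", curr_group);
        }
        else
            printf("shard map for group %u already loaded\n", curr_group);
    }

    int main(void)
    {
        ToyShardMgr mgr = { true, 16391, PTHREAD_MUTEX_INITIALIZER };

        toy_sync_shard_map(&mgr, 16391);   /* up to date, nothing to reset */
        toy_sync_shard_map(&mgr, 16402);   /* overdue group oid: reset and reload */
        return 0;
    }

The point of the extra group-oid comparison is that a manager left behind by a crashed group-syncing backend must be treated as stale and reloaded, rather than trusted just because it is marked as used.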
--- src/backend/pgxc/shard/shardmap.c | 161 ++++++++++++++++-------------- 1 file changed, 86 insertions(+), 75 deletions(-) diff --git a/src/backend/pgxc/shard/shardmap.c b/src/backend/pgxc/shard/shardmap.c index a2b1d8e9..9e1cec0d 100644 --- a/src/backend/pgxc/shard/shardmap.c +++ b/src/backend/pgxc/shard/shardmap.c @@ -767,62 +767,72 @@ static bool SyncShardMapList_Node_DN(void) return false; } - self_node_oid = get_pgxc_nodeoid_extend(PGXCNodeName, PGXCMainClusterName); - if (InvalidOid == self_node_oid) - { - elog(LOG, "SyncShardMapList_Node_DN failed to get nodeoid, node:%s", PGXCNodeName); - return false; - } - curr_groupoid = GetGroupOidByNode(self_node_oid); - if (InvalidOid == curr_groupoid) - { - elog(LOG, "SyncShardMapList_Node_DN failed to get groupoid, node:%s, nodeoid:%d", PGXCNodeName, self_node_oid); - return false; - } - - if (is_group_sharding_inited(curr_groupoid)) - { - bms_clear(g_DatanodeShardgroupBitmap); - - /* If the group sharding has not been inited */ - if (!g_GroupShardingMgr_DN->used) - { - g_GroupShardingMgr_DN->members->shardMapStatus = SHMEM_SHRADMAP_STATUS_LOADING; - g_GroupShardingMgr_DN->members->group = curr_groupoid; - g_GroupShardingMgr_DN->used = true; - } - - shardrel = heap_open(PgxcShardMapRelationId, AccessShareLock); - ScanKeyInit(&skey, - Anum_pgxc_shard_map_nodegroup, - BTEqualStrategyNumber, F_OIDEQ, - ObjectIdGetDatum(curr_groupoid)); - - sysscan = systable_beginscan(shardrel, - PgxcShardMapGroupIndexId, - true, - NULL, 1, &skey); - - while(HeapTupleIsValid(oldtup = systable_getnext(sysscan))) - { - pgxc_shard = (Form_pgxc_shard_map)GETSTRUCT(oldtup); - InsertShardMap_DN(pgxc_shard); - - /* - * If node is DN AND pgxc_shard_map tuple's primary copy is itself, - * Add this shardid to bitmap. - */ - BuildDatanodeVisibilityMap(pgxc_shard, self_node_oid); - } - systable_endscan(sysscan); - heap_close(shardrel, AccessShareLock); - ShardMapInitDone_DN(curr_groupoid, false); - } - else - { - elog(LOG, "SyncShardMapList_Node_DN group %d is not inited.", curr_groupoid); - return false; - } + self_node_oid = get_pgxc_nodeoid_extend(PGXCNodeName, PGXCMainClusterName); + if (InvalidOid == self_node_oid) + { + elog(LOG, "SyncShardMapList_Node_DN failed to get nodeoid, node:%s", PGXCNodeName); + return false; + } + curr_groupoid = GetGroupOidByNode(self_node_oid); + if (InvalidOid == curr_groupoid) + { + elog(LOG, "SyncShardMapList_Node_DN failed to get groupoid, node:%s, nodeoid:%d", PGXCNodeName, self_node_oid); + return false; + } + + if (is_group_sharding_inited(curr_groupoid)) + { + bms_clear(g_DatanodeShardgroupBitmap); + + /* + * If sharding of the group has not been inited, or this sharding map is in use but + * store overdue information, possibly caused by group syncing backend crashing right + * before the shmem sync. + */ + if (!g_GroupShardingMgr_DN->used || curr_groupoid != g_GroupShardingMgr_DN->members->group) + { + /* + * Datanodes can only be in one node group, so we save the effort of + * removing entry and skip right into resetting the mgr. 
+         */
+            g_GroupShardingMgr_DN->members->shardMapStatus = SHMEM_SHRADMAP_STATUS_LOADING;
+            SpinLockAcquire(&g_GroupShardingMgr_DN->lock);
+            g_GroupShardingMgr_DN->members->group = curr_groupoid;
+            g_GroupShardingMgr_DN->used = true;
+            SpinLockRelease(&g_GroupShardingMgr_DN->lock);
+        }
+
+        shardrel = heap_open(PgxcShardMapRelationId, AccessShareLock);
+        ScanKeyInit(&skey,
+                    Anum_pgxc_shard_map_nodegroup,
+                    BTEqualStrategyNumber, F_OIDEQ,
+                    ObjectIdGetDatum(curr_groupoid));
+
+        sysscan = systable_beginscan(shardrel,
+                                     PgxcShardMapGroupIndexId,
+                                     true,
+                                     NULL, 1, &skey);
+
+        while(HeapTupleIsValid(oldtup = systable_getnext(sysscan)))
+        {
+            pgxc_shard = (Form_pgxc_shard_map)GETSTRUCT(oldtup);
+            InsertShardMap_DN(pgxc_shard);
+
+            /*
+             * If node is DN AND pgxc_shard_map tuple's primary copy is itself,
+             * Add this shardid to bitmap.
+             */
+            BuildDatanodeVisibilityMap(pgxc_shard, self_node_oid);
+        }
+        systable_endscan(sysscan);
+        heap_close(shardrel, AccessShareLock);
+        ShardMapInitDone_DN(curr_groupoid, false);
+    }
+    else
+    {
+        elog(LOG, "SyncShardMapList_Node_DN group %d is not inited.", curr_groupoid);
+        return false;
+    }
 
     return true;
 }
@@ -1028,26 +1038,27 @@ static void ShardMapInitDone_CN(int32 map, Oid group, bool need_lock)
 
 static void
 ShardMapInitDone_DN(Oid group, bool need_lock)
-{// #lizard forgives
-    bool dup = false;
-    int32 maxNodeIndex = 0;
-    int32 i;
-    int32 j;
-    int32 nodeindex = 0;
-    int32 nodeCnt = 0;
-    ShardMapItemDef item;
+{
+    bool dup = false;
+    int32 maxNodeIndex = 0;
+    int32 i;
+    int32 j;
+    int32 nodeindex = 0;
+    int32 nodeCnt = 0;
+    ShardMapItemDef item;
 
-    if(!IS_PGXC_DATANODE)
-    {
-        elog(ERROR, "ShardMapInitDone_DN should only be called in datanode");
-        return;
-    }
-
-    if(group != g_GroupShardingMgr_DN->members->group)
-    {
-        elog(PANIC, "groupoid %d in mgr is not group %d", g_GroupShardingMgr_DN->members->group, group);
-        return;
-    }
+    if(!IS_PGXC_DATANODE)
+    {
+        elog(ERROR, "ShardMapInitDone_DN should only be called in datanode");
+        return;
+    }
+
+    if(group != g_GroupShardingMgr_DN->members->group)
+    {
+        /* PANIC here is to reset shmem, although a more elegant way should be provided by ShardMapShmem AM */
+        elog(PANIC, "groupoid %d in mgr is not group %d", g_GroupShardingMgr_DN->members->group, group);
+        return;
+    }
 
     if (need_lock)
     {

From e7095fb698a6abd7713cb8c77649521a26fa54f6 Mon Sep 17 00:00:00 2001
From: ericxwu
Date: Sun, 27 Sep 2020 10:48:09 +0800
Subject: [PATCH 067/578] Support simplifying subqueries when pulling up
 TargetList sublinks and optimize the targetlist conversion logic

A subquery in the TargetList could have a 'limit 1' clause or a 'rownum=1'
(Oracle compatibility) qualification to make sure only one row is returned.
In this case, we can eliminate the limit clause by folding the first-match
logic into the join operation, so we introduce the new join type
JOIN_LEFT_SEMI. The new join type returns the first copy of each LHS row
that has a match, and also returns unmatched LHS tuples. (The existing
JOIN_LEFT or JOIN_SEMI does not satisfy this semantic.)

As an additional benefit, a subquery in the targetlist with aggregation can
also be sped up by this new JOIN_LEFT_SEMI join type, by skipping the search
for the next matched inner row (saving another hash probe, or more nestloop
cost).
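
The row-level behaviour can be shown with a small standalone C sketch (illustration only, not TBase executor code; the toy arrays stand in for the outer and inner relations):

    /*
     * Illustrative sketch of JOIN_LEFT_SEMI semantics: every LHS row comes
     * out exactly once, paired with the first matching RHS row if one
     * exists, otherwise null-extended.
     */
    #include <stdio.h>
    #include <stdbool.h>

    int main(void)
    {
        int lhs[] = {1, 2, 2, 3};       /* outer rows, duplicates allowed */
        int rhs[] = {2, 2, 3, 3, 4};    /* inner rows, duplicates allowed */
        int nlhs = sizeof(lhs) / sizeof(lhs[0]);
        int nrhs = sizeof(rhs) / sizeof(rhs[0]);

        for (int i = 0; i < nlhs; i++)
        {
            bool matched = false;

            for (int j = 0; j < nrhs; j++)
            {
                if (lhs[i] == rhs[j])
                {
                    /* single_match: stop after the first matching inner row */
                    printf("%d | %d\n", lhs[i], rhs[j]);
                    matched = true;
                    break;
                }
            }

            if (!matched)
                printf("%d | NULL\n", lhs[i]);  /* unmatched LHS rows are still returned */
        }
        return 0;
    }

For comparison, JOIN_LEFT_SCALAR keeps the null-extension but reports an error when a second matching inner row exists, and plain JOIN_SEMI drops the unmatched LHS rows entirely.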
http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696860181163 --- src/backend/commands/explain.c | 9 +- src/backend/executor/nodeHashjoin.c | 10 +- src/backend/executor/nodeMergejoin.c | 26 ++--- src/backend/executor/nodeNestloop.c | 7 +- src/backend/optimizer/path/allpaths.c | 3 +- src/backend/optimizer/path/costsize.c | 15 ++- src/backend/optimizer/path/indxpath.c | 4 +- src/backend/optimizer/path/joinpath.c | 5 +- src/backend/optimizer/path/joinrels.c | 6 +- src/backend/optimizer/plan/initsplan.c | 8 +- src/backend/optimizer/plan/setrefs.c | 1 + src/backend/optimizer/plan/subselect.c | 132 ++++++++++++++++++++-- src/backend/optimizer/prep/prepjointree.c | 4 + src/backend/optimizer/util/pathnode.c | 7 +- src/backend/utils/adt/network_selfuncs.c | 1 + src/backend/utils/adt/selfuncs.c | 1 + src/include/nodes/nodes.h | 7 +- src/test/regress/expected/subselect.out | 39 ++++++- src/test/regress/sql/subselect.sql | 4 + 19 files changed, 237 insertions(+), 52 deletions(-) diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index d49eebc8..c58bd433 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -1374,9 +1374,12 @@ ExplainNode(PlanState *planstate, List *ancestors, jointype = "Anti"; break; #ifdef __TBASE__ - case JOIN_LEFT_SCALAR: - jointype = "Left Scalar"; - break; + case JOIN_LEFT_SCALAR: + jointype = "Left Scalar"; + break; + case JOIN_LEFT_SEMI: + jointype = "Left Semi"; + break; #endif default: jointype = "???"; diff --git a/src/backend/executor/nodeHashjoin.c b/src/backend/executor/nodeHashjoin.c index 9f1b7b90..6d57ae37 100644 --- a/src/backend/executor/nodeHashjoin.c +++ b/src/backend/executor/nodeHashjoin.c @@ -660,7 +660,8 @@ ExecInitHashJoin(HashJoin *node, EState *estate, int eflags) * detect whether we need only consider the first matching inner tuple */ hjstate->js.single_match = (node->join.inner_unique || - node->join.jointype == JOIN_SEMI); + node->join.jointype == JOIN_SEMI || + node->join.jointype == JOIN_LEFT_SEMI); /* set up null tuples for outer joins, if needed */ switch (node->join.jointype) @@ -669,11 +670,8 @@ ExecInitHashJoin(HashJoin *node, EState *estate, int eflags) case JOIN_SEMI: break; #ifdef __TBASE__ - case JOIN_LEFT_SCALAR: - hjstate->hj_NullInnerTupleSlot = - ExecInitNullTupleSlot(estate, - ExecGetResultType(innerPlanState(hjstate))); - break; + case JOIN_LEFT_SCALAR: + case JOIN_LEFT_SEMI: #endif case JOIN_LEFT: case JOIN_ANTI: diff --git a/src/backend/executor/nodeMergejoin.c b/src/backend/executor/nodeMergejoin.c index d10b74b0..6989c3ed 100644 --- a/src/backend/executor/nodeMergejoin.c +++ b/src/backend/executor/nodeMergejoin.c @@ -694,14 +694,14 @@ ExecMergeJoin(PlanState *pstate) break; } #ifdef __TBASE__ - /* - * if we have finished the join, and the inner never be executed, - * we need to disconnect from remote node. - */ - if (!node->mj_InnerInited && IS_PGXC_DATANODE) - { - ExecDisconnectNode(innerPlan); - } + /* + * If we have finished the join, and the inner never + * be executed, we need to disconnect from remote node. + */ + if (!node->mj_InnerInited && IS_PGXC_DATANODE) + { + ExecDisconnectNode(innerPlan); + } #endif /* Otherwise we're done. 
*/ return NULL; @@ -1542,7 +1542,8 @@ ExecInitMergeJoin(MergeJoin *node, EState *estate, int eflags) * detect whether we need only consider the first matching inner tuple */ mergestate->js.single_match = (node->join.inner_unique || - node->join.jointype == JOIN_SEMI); + node->join.jointype == JOIN_SEMI || + node->join.jointype == JOIN_LEFT_SEMI); /* set up null tuples for outer joins, if needed */ switch (node->join.jointype) @@ -1554,12 +1555,7 @@ ExecInitMergeJoin(MergeJoin *node, EState *estate, int eflags) break; #ifdef __TBASE__ case JOIN_LEFT_SCALAR: - mergestate->mj_FillOuter = true; - mergestate->mj_FillInner = false; - mergestate->mj_NullInnerTupleSlot = - ExecInitNullTupleSlot(estate, - ExecGetResultType(innerPlanState(mergestate))); - break; + case JOIN_LEFT_SEMI: #endif case JOIN_LEFT: case JOIN_ANTI: diff --git a/src/backend/executor/nodeNestloop.c b/src/backend/executor/nodeNestloop.c index 9a9ec8d4..d277cb19 100644 --- a/src/backend/executor/nodeNestloop.c +++ b/src/backend/executor/nodeNestloop.c @@ -179,7 +179,8 @@ ExecNestLoop(PlanState *pstate) if (!node->nl_MatchedOuter && (node->js.jointype == JOIN_LEFT || node->js.jointype == JOIN_ANTI || - node->js.jointype == JOIN_LEFT_SCALAR)) + node->js.jointype == JOIN_LEFT_SCALAR || + node->js.jointype == JOIN_LEFT_SEMI)) #else if (!node->nl_MatchedOuter && (node->js.jointype == JOIN_LEFT || @@ -341,7 +342,8 @@ ExecInitNestLoop(NestLoop *node, EState *estate, int eflags) * detect whether we need only consider the first matching inner tuple */ nlstate->js.single_match = (node->join.inner_unique || - node->join.jointype == JOIN_SEMI); + node->join.jointype == JOIN_SEMI || + node->join.jointype == JOIN_LEFT_SEMI); /* set up null tuples for outer joins, if needed */ switch (node->join.jointype) @@ -351,6 +353,7 @@ ExecInitNestLoop(NestLoop *node, EState *estate, int eflags) break; #ifdef __TBASE__ case JOIN_LEFT_SCALAR: + case JOIN_LEFT_SEMI: nlstate->nl_NullInnerTupleSlot = ExecInitNullTupleSlot(estate, ExecGetResultType(innerPlanState(nlstate))); diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c index a4fec879..42c19c2f 100644 --- a/src/backend/optimizer/path/allpaths.c +++ b/src/backend/optimizer/path/allpaths.c @@ -232,7 +232,8 @@ set_base_rel_consider_startup(PlannerInfo *root) #ifdef __TBASE__ if ((sjinfo->jointype == JOIN_SEMI || sjinfo->jointype == JOIN_ANTI || - sjinfo->jointype == JOIN_LEFT_SCALAR) && + sjinfo->jointype == JOIN_LEFT_SCALAR || + sjinfo->jointype == JOIN_LEFT_SEMI) && bms_get_singleton_member(sjinfo->syn_righthand, &varno)) #else if ((sjinfo->jointype == JOIN_SEMI || sjinfo->jointype == JOIN_ANTI) && diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index 0de40222..4b984a23 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -2137,6 +2137,7 @@ initial_cost_nestloop(PlannerInfo *root, JoinCostWorkspace *workspace, if (jointype == JOIN_SEMI || jointype == JOIN_ANTI || jointype == JOIN_LEFT_SCALAR || + jointype == JOIN_LEFT_SEMI || extra->inner_unique) #else if (jointype == JOIN_SEMI || jointype == JOIN_ANTI || @@ -2230,6 +2231,7 @@ final_cost_nestloop(PlannerInfo *root, NestPath *path, if (path->jointype == JOIN_SEMI || path->jointype == JOIN_ANTI || path->jointype == JOIN_LEFT_SCALAR || + path->jointype == JOIN_LEFT_SEMI || extra->inner_unique) #else if (path->jointype == JOIN_SEMI || path->jointype == JOIN_ANTI || @@ -2731,6 +2733,7 @@ final_cost_mergejoin(PlannerInfo *root, 
MergePath *path, if ((path->jpath.jointype == JOIN_SEMI || path->jpath.jointype == JOIN_ANTI || path->jpath.jointype == JOIN_LEFT_SCALAR || + path->jpath.jointype == JOIN_LEFT_SEMI || extra->inner_unique) && (list_length(path->jpath.joinrestrictinfo) == list_length(path->path_mergeclauses))) @@ -3240,6 +3243,7 @@ final_cost_hashjoin(PlannerInfo *root, HashPath *path, if (path->jpath.jointype == JOIN_SEMI || path->jpath.jointype == JOIN_ANTI || path->jpath.jointype == JOIN_LEFT_SCALAR || + path->jpath.jointype == JOIN_LEFT_SEMI || extra->inner_unique) #else if (path->jpath.jointype == JOIN_SEMI || @@ -3288,7 +3292,9 @@ final_cost_hashjoin(PlannerInfo *root, HashPath *path, /* Get # of tuples that will pass the basic join */ #ifdef __TBASE__ - if (path->jpath.jointype == JOIN_SEMI || path->jpath.jointype == JOIN_LEFT_SCALAR) + if (path->jpath.jointype == JOIN_SEMI || + path->jpath.jointype == JOIN_LEFT_SCALAR || + path->jpath.jointype == JOIN_LEFT_SEMI) #else if (path->jpath.jointype == JOIN_SEMI) #endif @@ -4446,6 +4452,7 @@ calc_joinrel_size_estimate(PlannerInfo *root, case JOIN_SEMI: #ifdef __TBASE__ case JOIN_LEFT_SCALAR: + case JOIN_LEFT_SEMI: #endif nrows = outer_rows * fkselec * jselec; /* pselec not used */ @@ -4527,7 +4534,8 @@ get_foreign_key_join_selectivity(PlannerInfo *root, * Hence, if either case applies, punt and ignore the FK. */ #ifdef __TBASE__ - if ((jointype == JOIN_SEMI || jointype == JOIN_ANTI || jointype == JOIN_LEFT_SCALAR) && + if ((jointype == JOIN_SEMI || jointype == JOIN_ANTI || + jointype == JOIN_LEFT_SCALAR || jointype == JOIN_LEFT_SEMI) && (ref_is_outer || bms_membership(inner_relids) != BMS_SINGLETON)) continue; #else @@ -4649,7 +4657,8 @@ get_foreign_key_join_selectivity(PlannerInfo *root, * table. So, at least for now, disregard inheritance here. 
*/ #ifdef __TBASE__ - if (jointype == JOIN_SEMI || jointype == JOIN_ANTI || jointype == JOIN_LEFT_SCALAR) + if (jointype == JOIN_SEMI || jointype == JOIN_ANTI || + jointype == JOIN_LEFT_SCALAR || jointype == JOIN_LEFT_SEMI) #else if (jointype == JOIN_SEMI || jointype == JOIN_ANTI) #endif diff --git a/src/backend/optimizer/path/indxpath.c b/src/backend/optimizer/path/indxpath.c index b377f0d6..1e58fbdc 100644 --- a/src/backend/optimizer/path/indxpath.c +++ b/src/backend/optimizer/path/indxpath.c @@ -2025,7 +2025,9 @@ adjust_rowcount_for_semijoins(PlannerInfo *root, SpecialJoinInfo *sjinfo = (SpecialJoinInfo *) lfirst(lc); #ifdef __TBASE__ - if ((sjinfo->jointype == JOIN_SEMI || sjinfo->jointype == JOIN_LEFT_SCALAR ) && + if ((sjinfo->jointype == JOIN_SEMI || + sjinfo->jointype == JOIN_LEFT_SCALAR || + sjinfo->jointype == JOIN_LEFT_SEMI) && bms_is_member(cur_relid, sjinfo->syn_lefthand) && bms_is_member(outer_relid, sjinfo->syn_righthand)) #else diff --git a/src/backend/optimizer/path/joinpath.c b/src/backend/optimizer/path/joinpath.c index c832b9d8..49852d77 100644 --- a/src/backend/optimizer/path/joinpath.c +++ b/src/backend/optimizer/path/joinpath.c @@ -145,6 +145,7 @@ add_paths_to_joinrel(PlannerInfo *root, case JOIN_ANTI: #ifdef __TBASE__ case JOIN_LEFT_SCALAR: + case JOIN_LEFT_SEMI: #endif extra.inner_unique = false; /* well, unproven */ break; @@ -191,7 +192,8 @@ add_paths_to_joinrel(PlannerInfo *root, */ #ifdef __TBASE__ if (jointype == JOIN_SEMI || jointype == JOIN_ANTI || - jointype == JOIN_LEFT_SCALAR || extra.inner_unique) + jointype == JOIN_LEFT_SCALAR || jointype == JOIN_LEFT_SEMI || + extra.inner_unique) #else if (jointype == JOIN_SEMI || jointype == JOIN_ANTI || extra.inner_unique) #endif @@ -1300,6 +1302,7 @@ match_unsorted_outer(PlannerInfo *root, case JOIN_ANTI: #ifdef __TBASE__ case JOIN_LEFT_SCALAR: + case JOIN_LEFT_SEMI: #endif nestjoinOK = true; useallclauses = false; diff --git a/src/backend/optimizer/path/joinrels.c b/src/backend/optimizer/path/joinrels.c index 659a8494..eb920d05 100644 --- a/src/backend/optimizer/path/joinrels.c +++ b/src/backend/optimizer/path/joinrels.c @@ -390,7 +390,9 @@ join_is_legal(PlannerInfo *root, RelOptInfo *rel1, RelOptInfo *rel2, * this join path. 
*/ #ifdef __TBASE__ - if (sjinfo->jointype == JOIN_SEMI || sjinfo->jointype == JOIN_LEFT_SCALAR) + if (sjinfo->jointype == JOIN_SEMI || + sjinfo->jointype == JOIN_LEFT_SCALAR || + sjinfo->jointype == JOIN_LEFT_SEMI) #else if (sjinfo->jointype == JOIN_SEMI) #endif @@ -832,8 +834,8 @@ populate_joinrel_with_paths(PlannerInfo *root, RelOptInfo *rel1, case JOIN_SEMI: #ifdef __TBASE__ case JOIN_LEFT_SCALAR: + case JOIN_LEFT_SEMI: #endif - /* * We might have a normal semijoin, or a case where we don't have * enough rels to do the semijoin but can unique-ify the RHS and diff --git a/src/backend/optimizer/plan/initsplan.c b/src/backend/optimizer/plan/initsplan.c index 100d9db5..7c743fd2 100644 --- a/src/backend/optimizer/plan/initsplan.c +++ b/src/backend/optimizer/plan/initsplan.c @@ -910,6 +910,7 @@ deconstruct_recurse(PlannerInfo *root, Node *jtnode, bool below_outer_join, case JOIN_ANTI: #ifdef __TBASE__ case JOIN_LEFT_SCALAR: + case JOIN_LEFT_SEMI: #endif leftjoinlist = deconstruct_recurse(root, j->larg, below_outer_join, @@ -1354,6 +1355,7 @@ make_outerjoininfo(PlannerInfo *root, (jointype == JOIN_SEMI || jointype == JOIN_ANTI || jointype == JOIN_LEFT_SCALAR || + jointype == JOIN_LEFT_SEMI || !bms_overlap(strict_relids, otherinfo->min_righthand))) #else if (bms_overlap(clause_relids, otherinfo->syn_righthand) && @@ -1401,9 +1403,11 @@ make_outerjoininfo(PlannerInfo *root, jointype == JOIN_SEMI || jointype == JOIN_ANTI || jointype == JOIN_LEFT_SCALAR || + jointype == JOIN_LEFT_SEMI || otherinfo->jointype == JOIN_SEMI || otherinfo->jointype == JOIN_ANTI || otherinfo->jointype == JOIN_LEFT_SCALAR || + otherinfo->jointype == JOIN_LEFT_SEMI || !otherinfo->lhs_strict || otherinfo->delay_upper_joins) #else if (bms_overlap(clause_relids, otherinfo->syn_righthand) || @@ -1491,7 +1495,9 @@ compute_semijoin_info(SpecialJoinInfo *sjinfo, List *clause) /* Nothing more to do if it's not a semijoin */ #ifdef __TBASE__ - if (sjinfo->jointype != JOIN_SEMI && sjinfo->jointype != JOIN_LEFT_SCALAR) + if (sjinfo->jointype != JOIN_SEMI && + sjinfo->jointype != JOIN_LEFT_SCALAR && + sjinfo->jointype != JOIN_LEFT_SEMI) #else if (sjinfo->jointype != JOIN_SEMI) #endif diff --git a/src/backend/optimizer/plan/setrefs.c b/src/backend/optimizer/plan/setrefs.c index 9f1be6e4..e5470fa8 100644 --- a/src/backend/optimizer/plan/setrefs.c +++ b/src/backend/optimizer/plan/setrefs.c @@ -1740,6 +1740,7 @@ set_join_references(PlannerInfo *root, Join *join, int rtoffset) case JOIN_ANTI: #ifdef __TBASE__ case JOIN_LEFT_SCALAR: + case JOIN_LEFT_SEMI: #endif inner_itlist->has_non_vars = false; break; diff --git a/src/backend/optimizer/plan/subselect.c b/src/backend/optimizer/plan/subselect.c index e8495fbc..7c342fc3 100644 --- a/src/backend/optimizer/plan/subselect.c +++ b/src/backend/optimizer/plan/subselect.c @@ -2955,6 +2955,95 @@ get_or_exist_subquery_targetlist(PlannerInfo *root, Node *node, List **targetLis } #ifdef __TBASE__ +/* + * simplify_TargetList_query:remove any useless stuff in an TargetList's + * subquery + * + * For subquery in targetlist, normally we use JOIN_LEFT_SCALAR type to + * make sure there will be only one row found. If subquery contains + * aggregation clause, then we are OK with JOIN_LEFT_SEMI. Further more, if + * subquery got 'limit 1' or equivalent clauses such as Oracle 'rownum = 1'. + * Then we can remove the limit clause and use JOIN_SEMI to simplify the + * subquery. + * + * Returns TRUE if was able to discard the 'LIMIT 1' cluase or the subquery + * already simple enough, else FALSE. 
+ */ +static bool +simplify_TargetList_query(PlannerInfo *root, Query *query, bool *useLeftSemiJoin) +{ + /* + * We don't try to simplify at all if the query uses set operations, + * aggregates, grouping sets, SRFs, modifying CTEs, HAVING, OFFSET, or FOR + * UPDATE/SHARE; none of these seem likely in normal usage and their + * possible effects are complex. (Note: we could ignore an "OFFSET 0" + * clause, but that traditionally is used as an optimization fence, so we + * don't.) + */ + if (query->commandType != CMD_SELECT || + query->setOperations || + query->groupingSets || + query->hasWindowFuncs || + query->hasTargetSRFs || + query->hasModifyingCTE || + query->havingQual || + query->limitOffset || + query->rowMarks) + return false; + + /* By default, use JOIN_LEFT_SCALAR. */ + Assert(useLeftSemiJoin); + *useLeftSemiJoin = false; + + /* Handle 'limit 1' case as described above. */ + if (query->limitCount) + { + /* + * The LIMIT clause has not yet been through eval_const_expressions, + * so we have to apply that here. It might seem like this is a waste + * of cycles, since the only case plausibly worth worrying about is + * "LIMIT 1" ... but what we'll actually see is "LIMIT int8(1::int4)", + * so we have to fold constants or we're not going to recognize it. + */ + Node *node = eval_const_expressions(root, query->limitCount); + Const *limit; + int64 limitValue; + + /* Might as well update the query if we simplified the clause. */ + query->limitCount = node; + + if (!IsA(node, Const)) + return false; + + limit = (Const *) node; + + Assert(limit->consttype == INT8OID); + limitValue = DatumGetInt64(limit->constvalue); + + /* Invalid value, we have to get at least one row. */ + if (!limit->constisnull && limitValue <= 0) + return false; + + /* + * If the SubQuery got limit 1(actually must be limit 1), then the + * join Semantic equals JOIN_SEMI. We don't need to continue when got + * one LHS match. + */ + if (limitValue == 1) + { + /* + * Remove the limit clause for more possible subquery pullup + * optimizations. + */ + query->limitCount = NULL; + /* Inform caller to use JOIN_LEFT_SEMI */ + *useLeftSemiJoin = true; + } + } + + return true; +} + /* * Try to convert an SubLink in targetlist to a join * @@ -2976,9 +3065,12 @@ convert_TargetList_sublink_to_join(PlannerInfo *root, TargetEntry *entry) SubLink *sublink = NULL; RangeTblRef *rtr = NULL; RangeTblEntry *rte = NULL; - Node *target = NULL; - List *sublinks = NIL; - bool count_agg = false; + Node *target = NULL; + List *sublinks = NIL; + bool count_agg = false; + bool useLeftSemiJoin = false; + /* By default, JOIN_LEFT_SCALAR is the worst choice */ + JoinType finalJoinType = JOIN_LEFT_SCALAR; /* Find sublinks in the targetlist entry */ find_sublink_walker((Node *)entry->expr, &sublinks); @@ -3010,6 +3102,18 @@ convert_TargetList_sublink_to_join(PlannerInfo *root, TargetEntry *entry) if (subselect->jointree->fromlist == NIL) return NULL; + /* + * See if the subquery can be simplified. For now, we just try to remove + * 'limit 1' clause. If it's been removed, we can use JOIN_LEFT_SEMI to + * save more costs. + */ + if (!simplify_TargetList_query(root, subselect, &useLeftSemiJoin)) + return NULL; + + /* 'limit 1' optimized */ + if (useLeftSemiJoin) + finalJoinType = JOIN_LEFT_SEMI; + /* * What we can not optimize. 
*/ @@ -3170,6 +3274,13 @@ convert_TargetList_sublink_to_join(PlannerInfo *root, TargetEntry *entry) subselect->groupClause = lappend(subselect->groupClause, grpcl); } + + /* + * If we got Aggregation clause, since there is only one TargetList, + * then we can use JOIN_LEFT_SEMI over JOIN_LEFT/JOIN_LEFT_SCALAR to + * save more costs. + */ + finalJoinType = JOIN_LEFT_SEMI; } /* @@ -3190,7 +3301,7 @@ convert_TargetList_sublink_to_join(PlannerInfo *root, TargetEntry *entry) * Form join node. */ joinExpr = makeNode(JoinExpr); - joinExpr->jointype = subselect->hasAggs? JOIN_LEFT : JOIN_LEFT_SCALAR; + joinExpr->jointype = finalJoinType; joinExpr->isNatural = false; joinExpr->larg = (Node *) root->parse->jointree; joinExpr->rarg = (Node *) rtr; @@ -3203,20 +3314,25 @@ convert_TargetList_sublink_to_join(PlannerInfo *root, TargetEntry *entry) parse->jointree = makeFromExpr(list_make1(joinExpr), NULL); /* Build a Var pointing to the subquery */ - target = (Node *)makeVarFromTargetEntry(rtr->rtindex, linitial(subselect->targetList)); + target = (Node *)makeVarFromTargetEntry(rtr->rtindex, + linitial(subselect->targetList)); /* Add Coalesce(count,0) */ if (count_agg) { CoalesceExpr *coalesce = makeNode(CoalesceExpr); - coalesce->args = list_make2(target, - makeConst(INT8OID, -1, InvalidOid, sizeof(int64), Int64GetDatum(0), false, true)); + Const *constExpr = makeConst(INT8OID, -1, InvalidOid, sizeof(int64), + Int64GetDatum(0), false, true); + + coalesce->args = list_make2(target, constExpr); coalesce->coalescetype = INT8OID; target = (Node *) coalesce; } /* Replace sublink node with Result. */ - entry->expr = (Expr *)substitute_sublink_with_node((Node *)entry->expr, sublink, target); + entry->expr = (Expr *)substitute_sublink_with_node((Node *)entry->expr, + sublink, + target); return entry; } #endif diff --git a/src/backend/optimizer/prep/prepjointree.c b/src/backend/optimizer/prep/prepjointree.c index a94388d4..93ceb77e 100644 --- a/src/backend/optimizer/prep/prepjointree.c +++ b/src/backend/optimizer/prep/prepjointree.c @@ -384,7 +384,9 @@ pull_up_sublinks_jointree_recurse(PlannerInfo *root, Node *jtnode, break; case JOIN_LEFT: #ifdef __TBASE__ + case JOIN_SEMI: case JOIN_LEFT_SCALAR: + case JOIN_LEFT_SEMI: #endif j->quals = pull_up_sublinks_qual_recurse(root, j->quals, &j->rarg, @@ -1243,6 +1245,7 @@ pull_up_subqueries_recurse(PlannerInfo *root, Node *jtnode, case JOIN_SEMI: #ifdef __TBASE__ case JOIN_LEFT_SCALAR: + case JOIN_LEFT_SEMI: #endif case JOIN_ANTI: j->larg = pull_up_subqueries_recurse(root, j->larg, @@ -3084,6 +3087,7 @@ reduce_outer_joins_pass2(Node *jtnode, case JOIN_SEMI: case JOIN_ANTI: case JOIN_LEFT_SCALAR: + case JOIN_LEFT_SEMI: /* * These could only have been introduced by pull_up_sublinks, diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index eee0c16b..464eccfb 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -1738,6 +1738,7 @@ set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) pathnode->jointype == JOIN_SEMI || #ifdef __TBASE__ pathnode->jointype == JOIN_LEFT_SCALAR || + pathnode->jointype == JOIN_LEFT_SEMI || #endif pathnode->jointype == JOIN_ANTI)) { @@ -2704,7 +2705,8 @@ set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) if (resultRelLoc == RESULT_REL_INNER && pathnode->jointype != JOIN_LEFT && pathnode->jointype != JOIN_FULL && pathnode->jointype != JOIN_SEMI && pathnode->jointype != JOIN_ANTI && - pathnode->jointype != JOIN_LEFT_SCALAR && 
!pathnode->inner_unique) + pathnode->jointype != JOIN_LEFT_SCALAR && + pathnode->jointype != JOIN_LEFT_SEMI && !pathnode->inner_unique) { /* Replicate outer */ pathnode->outerjoinpath = redistribute_path( @@ -2752,7 +2754,8 @@ set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) if (innerd &&resultRelLoc == RESULT_REL_INNER && pathnode->jointype != JOIN_LEFT && pathnode->jointype != JOIN_FULL && pathnode->jointype != JOIN_SEMI && pathnode->jointype != JOIN_ANTI && - pathnode->jointype != JOIN_LEFT_SCALAR && !pathnode->inner_unique) + pathnode->jointype != JOIN_LEFT_SCALAR && + pathnode->jointype != JOIN_LEFT_SEMI && !pathnode->inner_unique) { pathnode->path.distribution = innerd; return alternate; diff --git a/src/backend/utils/adt/network_selfuncs.c b/src/backend/utils/adt/network_selfuncs.c index beb0e76a..d05ffd86 100644 --- a/src/backend/utils/adt/network_selfuncs.c +++ b/src/backend/utils/adt/network_selfuncs.c @@ -225,6 +225,7 @@ networkjoinsel(PG_FUNCTION_ARGS) case JOIN_ANTI: #ifdef __TBASE__ case JOIN_LEFT_SCALAR: + case JOIN_LEFT_SEMI: #endif /* Here, it's important that we pass the outer var on the left. */ if (!join_is_reversed) diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index 134664f8..06b1d9fa 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -2286,6 +2286,7 @@ eqjoinsel(PG_FUNCTION_ARGS) case JOIN_ANTI: #ifdef __TBASE__ case JOIN_LEFT_SCALAR: + case JOIN_LEFT_SEMI: #endif /* diff --git a/src/include/nodes/nodes.h b/src/include/nodes/nodes.h index 9f974c28..43f90ba9 100644 --- a/src/include/nodes/nodes.h +++ b/src/include/nodes/nodes.h @@ -794,8 +794,10 @@ typedef enum JoinType JOIN_UNIQUE_INNER, /* RHS path must be made unique */ #ifdef __TBASE__ - JOIN_LEFT_SCALAR /* pairs + unmatched LHS tuples */ - /* only 1 copy of echo LHS row else report error. */ + JOIN_LEFT_SCALAR, /* pairs + unmatched LHS tuples, only 1 copy of + * each LHS row else report error. */ + JOIN_LEFT_SEMI /* 1 copy of each LHS row that has match(es) + + * unmatched LHS tuples */ #endif /* @@ -821,6 +823,7 @@ typedef enum JoinType #define IS_OUTER_JOIN(jointype) \ (((1 << (jointype)) & \ ((1 << JOIN_LEFT) | \ + (1 << JOIN_LEFT_SEMI) | \ (1 << JOIN_LEFT_SCALAR) | \ (1 << JOIN_FULL) | \ (1 << JOIN_RIGHT) | \ diff --git a/src/test/regress/expected/subselect.out b/src/test/regress/expected/subselect.out index 85af9fb1..17184f61 100644 --- a/src/test/regress/expected/subselect.out +++ b/src/test/regress/expected/subselect.out @@ -1461,7 +1461,7 @@ explain (costs off) select (select sum(b.a) from tbl_b b where b.a = a.a and b. 
------------------------------------------------------------------------- Sort Sort Key: "TARGETLIST_subquery".sum - -> Nested Loop Left Join + -> Nested Loop Left Semi Join -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Seq Scan on tbl_a a -> Materialize @@ -1493,7 +1493,7 @@ explain (costs off) select (select count(b.a) from tbl_b b where b.a = a.a) fro ----------------------------------------------------------------------- Sort Sort Key: (COALESCE("TARGETLIST_subquery".count, '0'::bigint)) - -> Nested Loop Left Join + -> Nested Loop Left Semi Join -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Seq Scan on tbl_a a -> Materialize @@ -1556,7 +1556,7 @@ explain (costs off) select (case when a.b =1 then (select sum(b.a) from tbl_b b --------------------------------------------------------------------------------------- Sort Sort Key: (CASE WHEN (a.b = 1) THEN "TARGETLIST_subquery".sum ELSE '0'::bigint END) - -> Nested Loop Left Join + -> Nested Loop Left Semi Join -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Seq Scan on tbl_a a -> Materialize @@ -1616,7 +1616,7 @@ explain (costs off) select (case when a.b =1 then (select count(*) from tbl_b b ---------------------------------------------------------------------------------------------------------------- Sort Sort Key: (CASE WHEN (a.b = 1) THEN COALESCE("TARGETLIST_subquery".count, '0'::bigint) ELSE '0'::bigint END) - -> Nested Loop Left Join + -> Nested Loop Left Semi Join -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Seq Scan on tbl_a a -> Materialize @@ -1650,7 +1650,7 @@ explain (costs off) select (case when a.b =1 then (select count(*) from tbl_b b ---------------------------------------------------------------------------------------------------------------- Sort Sort Key: (CASE WHEN (a.b = 1) THEN COALESCE("TARGETLIST_subquery".count, '0'::bigint) ELSE '0'::bigint END) - -> Nested Loop Left Join + -> Nested Loop Left Semi Join -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Seq Scan on tbl_a a -> Materialize @@ -1679,6 +1679,35 @@ select (case when a.b =1 then (select count(*) from tbl_b b where b.a = a.a and 1 (10 rows) +-- targetlist sublink with limit 1 +explain (costs off) select a.a,(select b.a from tbl_b b where b.a = a.a limit 1) q from tbl_a a order by 1,2; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: a.a, b.a + -> Hash Left Semi Join + Hash Cond: (a.a = b.a) + -> Seq Scan on tbl_a a + -> Hash + -> Seq Scan on tbl_b b +(8 rows) + +select a.a,(select b.a from tbl_b b where b.a = a.a limit 1) q from tbl_a a order by 1,2; + a | q +----+---- + 1 | + 2 | 2 + 3 | 3 + 4 | 4 + 5 | 5 + 6 | 6 + 7 | 7 + 8 | 8 + 9 | 9 + 10 | 10 +(10 rows) + -- support pullup lateral ANY_SUBLINK explain select * from tbl_a a where a.b IN (select b.a from tbl_b b where b.b > a.b); QUERY PLAN diff --git a/src/test/regress/sql/subselect.sql b/src/test/regress/sql/subselect.sql index 818c6b4f..66c01e19 100644 --- a/src/test/regress/sql/subselect.sql +++ b/src/test/regress/sql/subselect.sql @@ -693,6 +693,10 @@ select (case when a.b =1 then (select count(*) from tbl_b b where b.a = a.a and explain (costs off) select (case when a.b =1 then (select count(*) from tbl_b b where b.a = a.a and b.b = a.b and a.b is not null) else 0 end) from tbl_a a order by 1; select (case when a.b =1 then (select count(*) from tbl_b b where b.a = a.a and b.b = a.b and a.b is not null) else 0 end) from tbl_a a order by 1; +-- 
targetlist sublink with limit 1 +explain (costs off) select a.a,(select b.a from tbl_b b where b.a = a.a limit 1) q from tbl_a a order by 1,2; +select a.a,(select b.a from tbl_b b where b.a = a.a limit 1) q from tbl_a a order by 1,2; + -- support pullup lateral ANY_SUBLINK explain select * from tbl_a a where a.b IN (select b.a from tbl_b b where b.b > a.b); select * from tbl_a a where a.b IN (select b.a from tbl_b b where b.b > a.b); From b4ed52d92f2ae67795211e5560f88c54c51efbe3 Mon Sep 17 00:00:00 2001 From: ericxwu Date: Wed, 30 Sep 2020 14:48:21 +0800 Subject: [PATCH 068/578] Fix pg_dump issue when group info been deleted http://tapd.oa.com/10092131/bugtrace/bugs/view?bug_id=1010092131082507633 --- src/bin/pg_dump/pg_dump.c | 45 +++++++++++++++++++++------------------ 1 file changed, 24 insertions(+), 21 deletions(-) diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c index 5b3c3317..10a8ce5f 100644 --- a/src/bin/pg_dump/pg_dump.c +++ b/src/bin/pg_dump/pg_dump.c @@ -15835,27 +15835,30 @@ dumpTableSchema(Archive *fout, TableInfo *tbinfo) fmtId(tbinfo->attnames[hashkey - 1])); } #ifdef __TBASE__ - else if(tbinfo->pgxclocatortype == 'S' && !tbinfo->ispartition) - { - int hashkey = tbinfo->pgxcattnum; - int sechashkey = tbinfo->pgxcsecattnum; - - if (sechashkey) - { - appendPQExpBuffer(q, "\nDISTRIBUTE BY SHARD (%s,", - fmtId(tbinfo->attnames[hashkey - 1])); - appendPQExpBuffer(q, "%s)", - fmtId(tbinfo->attnames[sechashkey - 1])); - } - else - appendPQExpBuffer(q, "\nDISTRIBUTE BY SHARD (%s)", - fmtId(tbinfo->attnames[hashkey - 1])); - - if (tbinfo->coldgroupname) - appendPQExpBuffer(q, " to GROUP %s %s", tbinfo->groupname, tbinfo->coldgroupname); - else - appendPQExpBuffer(q, " to GROUP %s", tbinfo->groupname); - } + else if(tbinfo->pgxclocatortype == 'S' && !tbinfo->ispartition) + { + int hashkey = tbinfo->pgxcattnum; + int sechashkey = tbinfo->pgxcsecattnum; + + if (sechashkey) + { + appendPQExpBuffer(q, "\nDISTRIBUTE BY SHARD (%s,", + fmtId(tbinfo->attnames[hashkey - 1])); + appendPQExpBuffer(q, "%s)", + fmtId(tbinfo->attnames[sechashkey - 1])); + } + else + appendPQExpBuffer(q, "\nDISTRIBUTE BY SHARD (%s)", + fmtId(tbinfo->attnames[hashkey - 1])); + + if (tbinfo->groupname) + { + if (tbinfo->coldgroupname) + appendPQExpBuffer(q, " to GROUP %s %s", tbinfo->groupname, tbinfo->coldgroupname); + else + appendPQExpBuffer(q, " to GROUP %s", tbinfo->groupname); + } + } #endif } if (include_nodes && From 2d67e124781e4016e22c1df214f5d6ee3627260f Mon Sep 17 00:00:00 2001 From: ericxwu Date: Tue, 29 Sep 2020 11:51:56 +0800 Subject: [PATCH 069/578] FQS support pushdown query with subquery to datanode Currently we only support Subquery push down to single DN. Multiple DN pushdown will have cross-phase issues between main query and subquery, which is way more complicated. So we just skip the case by now. 
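
The single-datanode rule can be summarized with a standalone C sketch (illustration only, not the planner code itself; node ids and helper names are made up):

    /*
     * Illustrative sketch: a query containing a subquery is fully shipped
     * (FQS) only when the main query and the subquery each resolve to
     * exactly one datanode and the two node lists reduce to the same node.
     */
    #include <stdio.h>
    #include <stdbool.h>

    static bool toy_can_fqs(const int *main_nodes, int n_main,
                            const int *sub_nodes, int n_sub)
    {
        /* multiple-DN pushdown is skipped: it would need cross-phase handling */
        if (n_main != 1 || n_sub != 1)
            return false;

        /* merging the two single-node lists must leave the same node */
        return main_nodes[0] == sub_nodes[0];
    }

    int main(void)
    {
        int main_nodes[] = {2};
        int same[]       = {2};
        int other[]      = {5};
        int both[]       = {2, 5};

        printf("same single node : %s\n", toy_can_fqs(main_nodes, 1, same, 1)  ? "ship" : "plan on coordinator");
        printf("different node   : %s\n", toy_can_fqs(main_nodes, 1, other, 1) ? "ship" : "plan on coordinator");
        printf("subquery on 2 DNs: %s\n", toy_can_fqs(main_nodes, 1, both, 2)  ? "ship" : "plan on coordinator");
        return 0;
    }

The actual check added to pgxc_is_query_shippable() is stricter than this sketch: it only applies to CMD_SELECT, requires matching column-distributed locator types on both sides, requires the subquery to pass the "simple subquery" test, and still has to merge the two ExecNodes successfully.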
http://tapd.oa.com/10092131/bugtrace/bugs/view?bug_id=1010092131081947401 --- src/backend/optimizer/util/pgxcship.c | 239 +++++++++++++++++++------ src/backend/utils/adt/ruleutils.c | 13 +- src/test/regress/expected/xc_FQS_2.out | 68 +++++++ src/test/regress/sql/xc_FQS.sql | 8 + 4 files changed, 261 insertions(+), 67 deletions(-) diff --git a/src/backend/optimizer/util/pgxcship.c b/src/backend/optimizer/util/pgxcship.c index ae1ca9d6..bfbc9e99 100644 --- a/src/backend/optimizer/util/pgxcship.c +++ b/src/backend/optimizer/util/pgxcship.c @@ -153,6 +153,7 @@ static ExecNodes *pgxc_FQS_datanodes_for_rtr(Index varno, Query *query); #ifdef __TBASE__ static ExecNodes* pgxc_is_group_subquery_shippable(Query *query, Shippability_context *sc_context); static void pgxc_is_rte_subquery_shippable(Node *node, Shippability_context *sc_context); +static bool pgxc_is_simple_subquery(Query *subquery); static bool pgxc_FQS_check_subquery_const(Query *query); #endif /* @@ -339,7 +340,8 @@ pgxc_FQS_datanodes_for_rtr(Index varno, Query *query) { /* For anything, other than a table, we can't find the datanodes */ #ifdef __TBASE__ - if (rte->relkind != RELKIND_RELATION && rte->relkind != RELKIND_PARTITIONED_TABLE) + if (rte->relkind != RELKIND_RELATION && + rte->relkind != RELKIND_PARTITIONED_TABLE) { return NULL; } @@ -364,7 +366,8 @@ pgxc_FQS_datanodes_for_rtr(Index varno, Query *query) * all partitioned tables should have the same distribution, try to * get execution datanodes */ - if (rte->inh && has_subclass(rte->relid) && rte->relkind != RELKIND_PARTITIONED_TABLE) + if (rte->inh && has_subclass(rte->relid) && + rte->relkind != RELKIND_PARTITIONED_TABLE) { return NULL; } @@ -379,23 +382,54 @@ pgxc_FQS_datanodes_for_rtr(Index varno, Query *query) case RTE_SUBQUERY: #ifdef __TBASE__ { - Query *subquery = rte->subquery; + Query *subquery = rte->subquery; + ExecNodes *exec_nodes = NULL; /* - * Current we only consider the case if subquery only contains - * constant values. If so, we can treat them as replicated RTE. + * Consider the case if subquery only contains constant values. + * If so, we can treat them as replicated RTE. */ if (enable_subquery_shipping && pgxc_FQS_check_subquery_const(subquery)) { - ExecNodes *exec_nodes = makeNode(ExecNodes); + exec_nodes = makeNode(ExecNodes); exec_nodes->baselocatortype = LOCATOR_TYPE_REPLICATED; - /* No locate info stored for such subquery RTEs, we use this - * flag to force using the other hand locate info */ + /* + * No locate info stored for such subquery RTEs, we use this + * flag to force using the other hand locate info. + */ exec_nodes->const_subquery = true; return exec_nodes; } + + /* Try to process exec_nodes for simple Subquery */ + if (enable_subquery_shipping && + pgxc_is_simple_subquery(subquery)) + { + Bitmapset *relids = NULL; + + /* Recurse into the subquery to find executable datanodes. */ + exec_nodes = pgxc_FQS_find_datanodes_recurse((Node *)subquery->jointree, + subquery, &relids); + + /* Clean up the relids used in recursion function */ + bms_free(relids); + relids = NULL; + + /* + * Currently we only support Subquery push down to single DN. + * Multiple DN pushdown will have cross-phase issues between + * main query and subquery, it needs more complicate + * calculation. So we just skip the case by now. 
+ */ + if (exec_nodes && exec_nodes->nodeList && + (list_length(exec_nodes->nodeList) == 1)) + return exec_nodes; + else + return NULL; + } + return NULL; } #endif @@ -1767,7 +1801,59 @@ pgxc_is_shard_in_same_group(Var *var1, Var *var2, List *rtable) return result; } + +/* + * Check is the subquery is simple enough to pushdown to DN + */ +static bool +pgxc_is_simple_subquery(Query *query) +{ + /* + * Let's just make sure it's a valid select ... + */ + if (!IsA(query, Query) || query->commandType != CMD_SELECT) + return false; + + /* + * Can't currently pushdown a query with setops (unless it's simple UNION + * ALL, which is handled by a different code path). + */ + if (query->setOperations) + return false; + + /* + * Can't pushdown a subquery involving grouping, aggregation, SRFs, + * sorting, limiting, or WITH. + */ + if (query->hasAggs || + query->hasWindowFuncs || + query->hasTargetSRFs || + query->groupClause || + query->groupingSets || + query->havingQual || + query->sortClause || + query->distinctClause || + query->limitOffset || + query->limitCount || + query->hasForUpdate || + query->cteList) + return false; + + /* + * Don't pushdown a subquery that has any volatile functions in its + * targetlist. Otherwise we might introduce multiple evaluations of these + * functions, if they get copied to multiple places in the upper query, + * leading to surprising results. (Note: the PlaceHolderVar mechanism + * doesn't quite guarantee single evaluation; else we could pull up anyway + * and just wrap such items in PlaceHolderVars ...) + */ + if (contain_volatile_functions((Node *) query->targetList)) + return false; + + return true; +} #endif + /* * Returns whether or not the rtable (and its subqueries) * only contain pg_catalog entries. @@ -1794,7 +1880,6 @@ pgxc_query_contains_only_pg_catalog(List *rtable) return true; } - /* * pgxc_is_query_shippable * This function calls the query walker to analyse the query to gather @@ -1807,60 +1892,98 @@ pgxc_query_contains_only_pg_catalog(List *rtable) */ ExecNodes * pgxc_is_query_shippable(Query *query, int query_level) -{// #lizard forgives - Shippability_context sc_context; - ExecNodes *exec_nodes; - bool canShip = true; - Bitmapset *shippability; +{ + Shippability_context sc_context; + ExecNodes *exec_nodes; + bool canShip = true; + Bitmapset *shippability; - memset(&sc_context, 0, sizeof(sc_context)); - /* let's assume that by default query is shippable */ - sc_context.sc_query = query; - sc_context.sc_query_level = query_level; - sc_context.sc_for_expr = false; + memset(&sc_context, 0, sizeof(sc_context)); + /* let's assume that by default query is shippable */ + sc_context.sc_query = query; + sc_context.sc_query_level = query_level; + sc_context.sc_for_expr = false; - /* - * We might have already decided not to ship the query to the Datanodes, but - * still walk it anyway to find out if there are any subqueries which can be - * shipped. - */ - pgxc_shippability_walker((Node *)query, &sc_context); + /* + * We might have already decided not to ship the query to the Datanodes, but + * still walk it anyway to find out if there are any subqueries which can be + * shipped. + */ + pgxc_shippability_walker((Node *)query, &sc_context); - exec_nodes = sc_context.sc_exec_nodes; - /* - * The shippability context contains two ExecNodes, one for the subLinks - * involved in the Query and other for the relation involved in FromClause. - * They are computed at different times while scanning the query. Merge both - * of them if they are both replicated. 
If query doesn't have SubLinks, we - * don't need to consider corresponding ExecNodes. - * PGXC_FQS_TODO: - * Merge the subquery ExecNodes if both of them are replicated. - * The logic to merge node lists with other distribution - * strategy is not clear yet. - */ - if (query->hasSubLinks) - { - if (exec_nodes && IsExecNodesReplicated(exec_nodes) && - sc_context.sc_subquery_en && - IsExecNodesReplicated(sc_context.sc_subquery_en)) - exec_nodes = pgxc_merge_exec_nodes(exec_nodes, - sc_context.sc_subquery_en); - else - exec_nodes = NULL; - } + exec_nodes = sc_context.sc_exec_nodes; + /* + * The shippability context contains two ExecNodes, one for the subLinks + * involved in the Query and other for the relation involved in FromClause. + * They are computed at different times while scanning the query. Merge both + * of them if they are both replicated. If query doesn't have SubLinks, we + * don't need to consider corresponding ExecNodes. + * PGXC_FQS_TODO: + * Merge the subquery ExecNodes if both of them are replicated. + * The logic to merge node lists with other distribution + * strategy is not clear yet. + */ + if (query->hasSubLinks) + { - /* - * Look at the information gathered by the walker in Shippability_context and that - * in the Query structure to decide whether we should ship this query - * directly to the Datanode or not - */ +#ifdef __TBASE__ + int num_fromclause_nodes = 0; + int num_sublink_nodes = 0; - /* - * If the planner was not able to find the Datanodes to the execute the - * query, the query is not completely shippable. So, return NULL - */ - if (!exec_nodes) - return NULL; + /* Get number of DN nodes for Main Query result */ + if (exec_nodes && exec_nodes->nodeList) + { + num_fromclause_nodes = list_length(exec_nodes->nodeList); + } + + /* Get number of DN nodes for Sublink result */ + if (sc_context.sc_subquery_en && sc_context.sc_subquery_en->nodeList) + { + num_sublink_nodes = list_length(sc_context.sc_subquery_en->nodeList); + } + + /* + * Try to merge sublink nodelist only if: + * XXX Only cover CMD_SELECT + * XXX Both main query and sublink results got single DN node + * XXX With same column distributed type + */ + if (enable_subquery_shipping && + exec_nodes && sc_context.sc_subquery_en && + query->commandType == CMD_SELECT && + IsExecNodesColumnDistributed(exec_nodes) && + IsExecNodesColumnDistributed(sc_context.sc_subquery_en) && + exec_nodes->baselocatortype == sc_context.sc_subquery_en->baselocatortype && + (num_fromclause_nodes == 1) && (num_sublink_nodes == 1)) + { + exec_nodes = pgxc_merge_exec_nodes(exec_nodes, sc_context.sc_subquery_en); + } + /* Fall back to PGXC logic that only try with replicated type */ +#endif + else if (exec_nodes && IsExecNodesReplicated(exec_nodes) && + sc_context.sc_subquery_en && + IsExecNodesReplicated(sc_context.sc_subquery_en)) + { + exec_nodes = pgxc_merge_exec_nodes(exec_nodes, sc_context.sc_subquery_en); + } + else + { + exec_nodes = NULL; + } + } + + /* + * Look at the information gathered by the walker in Shippability_context and that + * in the Query structure to decide whether we should ship this query + * directly to the Datanode or not + */ + + /* + * If the planner was not able to find the Datanodes to the execute the + * query, the query is not completely shippable. So, return NULL + */ + if (!exec_nodes) + return NULL; /* Copy the shippability reasons. We modify the copy for easier handling. 
* The original can be saved away */ diff --git a/src/backend/utils/adt/ruleutils.c b/src/backend/utils/adt/ruleutils.c index dac8f826..fe4df6e7 100644 --- a/src/backend/utils/adt/ruleutils.c +++ b/src/backend/utils/adt/ruleutils.c @@ -10613,17 +10613,12 @@ get_from_clause_item(Node *jtnode, Query *query, deparse_context *context) * corresponding subquery RTE has its eref set to view name. * The remote query generated has this subquery of which the * columns can be referred to as view_name.col1, so it should - * be possible to refer to this subquery object. - */ + * be possible to refer to this subquery object + * We've finished the alias print here, no need to set printalias + * again. + */ appendStringInfo(buf, " %s", quote_identifier(rte->eref->aliasname)); - - /* - * For 'dual' rte, the aliasname is also 'dual', print alias will - * lead to syntax error. - */ - if (strcmp(rte->eref->aliasname, "dual") != 0) - printalias = true; } #endif else if (rte->rtekind == RTE_FUNCTION) diff --git a/src/test/regress/expected/xc_FQS_2.out b/src/test/regress/expected/xc_FQS_2.out index e3e73168..ea10b9ad 100644 --- a/src/test/regress/expected/xc_FQS_2.out +++ b/src/test/regress/expected/xc_FQS_2.out @@ -1637,8 +1637,76 @@ select * from subquery_fqs t join (select 1 id, 'gd' a, 2 c from dual union sele 1 | sz | 2 | 1 | sz | 2 (3 rows) +-- Support subquery FQS only if subquery distributed on same DN with main query(only 1 DN node) +explain select * from subquery_fqs t1 where t1.id = 1 and t1.c IN (select c from subquery_fqs t2 where t2.id=1); + QUERY PLAN +-------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) (cost=100.00..142.30 rows=2 width=40) + -> Nested Loop Semi Join (cost=100.00..142.30 rows=2 width=40) + Join Filter: (t1.c = t2.c) + -> Seq Scan on subquery_fqs t1 (cost=0.00..21.00 rows=4 width=40) + Filter: (id = 1) + -> Materialize (cost=100.00..121.07 rows=4 width=4) + -> Remote Subquery Scan on all (datanode_1) (cost=100.00..121.05 rows=4 width=4) + -> Seq Scan on subquery_fqs t2 (cost=0.00..21.00 rows=4 width=4) + Filter: (id = 1) +(9 rows) + +select * from subquery_fqs t1 where t1.id = 1 and t1.c IN (select c from subquery_fqs t2 where t2.id=1); + id | a | c +----+----+--- + 1 | gd | 2 + 1 | zj | 2 + 1 | sz | 2 +(3 rows) + +explain select * from subquery_fqs t1 where t1.id = 1 and t1.c = (select c from subquery_fqs t2 where t2.id=1 order by c limit 1); + QUERY PLAN +------------------------------------------------------------------------------------------ + Remote Fast Query Execution (cost=0.00..0.00 rows=0 width=0) + Node/s: datanode_1 + -> Seq Scan on subquery_fqs t1 (cost=21.02..44.22 rows=1 width=40) + Filter: ((id = 1) AND (c = $0)) + InitPlan 1 (returns $0) + -> Limit (cost=21.02..21.02 rows=1 width=4) + -> Sort (cost=21.02..21.03 rows=4 width=4) + Sort Key: t2.c + -> Seq Scan on subquery_fqs t2 (cost=0.00..21.00 rows=4 width=4) + Filter: (id = 1) +(10 rows) + +select * from subquery_fqs t1 where t1.id = 1 and t1.c = (select c from subquery_fqs t2 where t2.id=1 order by c limit 1); + id | a | c +----+----+--- + 1 | gd | 2 + 1 | zj | 2 + 1 | sz | 2 +(3 rows) + +explain select * from subquery_fqs t1 where t1.id = 1 and t1.c = (select max(c) from subquery_fqs t2 where t2.id=1); + QUERY PLAN +------------------------------------------------------------------------------------ + Remote Fast Query Execution (cost=0.00..0.00 rows=0 width=0) + Node/s: datanode_1 + -> Seq Scan on subquery_fqs 
t1 (cost=21.02..44.22 rows=1 width=40) + Filter: ((id = 1) AND (c = $0)) + InitPlan 1 (returns $0) + -> Aggregate (cost=21.01..21.02 rows=1 width=4) + -> Seq Scan on subquery_fqs t2 (cost=0.00..21.00 rows=4 width=4) + Filter: (id = 1) +(8 rows) + +select * from subquery_fqs t1 where t1.id = 1 and t1.c = (select max(c) from subquery_fqs t2 where t2.id=1); + id | a | c +----+----+--- + 1 | gd | 2 + 1 | zj | 2 + 1 | sz | 2 +(3 rows) + drop table tab1_rr; drop table tab1_hash; drop table tab1_modulo; drop table tab1_replicated; +drop table subquery_fqs; drop function cr_table(varchar, int[], varchar); diff --git a/src/test/regress/sql/xc_FQS.sql b/src/test/regress/sql/xc_FQS.sql index bdb9c02a..14721a76 100644 --- a/src/test/regress/sql/xc_FQS.sql +++ b/src/test/regress/sql/xc_FQS.sql @@ -283,9 +283,17 @@ insert into subquery_fqs values(1,'sz', 2); explain select * from subquery_fqs t join (select 1 id, 'gd' a, 2 c from dual union select 1 id, 'sz' a, 2 c union select 1 id, 'zj' a, 2 c from dual) t2 ON (t.id = t2.id and t.a = t2.a); select * from subquery_fqs t join (select 1 id, 'gd' a, 2 c from dual union select 1 id, 'sz' a, 2 c union select 1 id, 'zj' a, 2 c from dual) t2 ON (t.id = t2.id and t.a = t2.a); +-- Support subquery FQS only if subquery distributed on same DN with main query(only 1 DN node) +explain select * from subquery_fqs t1 where t1.id = 1 and t1.c IN (select c from subquery_fqs t2 where t2.id=1); +select * from subquery_fqs t1 where t1.id = 1 and t1.c IN (select c from subquery_fqs t2 where t2.id=1); +explain select * from subquery_fqs t1 where t1.id = 1 and t1.c = (select c from subquery_fqs t2 where t2.id=1 order by c limit 1); +select * from subquery_fqs t1 where t1.id = 1 and t1.c = (select c from subquery_fqs t2 where t2.id=1 order by c limit 1); +explain select * from subquery_fqs t1 where t1.id = 1 and t1.c = (select max(c) from subquery_fqs t2 where t2.id=1); +select * from subquery_fqs t1 where t1.id = 1 and t1.c = (select max(c) from subquery_fqs t2 where t2.id=1); drop table tab1_rr; drop table tab1_hash; drop table tab1_modulo; drop table tab1_replicated; +drop table subquery_fqs; drop function cr_table(varchar, int[], varchar); From a4f370e4aa3afb72354502ccd1cbbacdddf66be7 Mon Sep 17 00:00:00 2001 From: youngxie Date: Sun, 4 Oct 2020 11:16:43 +0800 Subject: [PATCH 070/578] fix gtm coredump due to initilization failure. 
--- src/gtm/main/main.c | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/src/gtm/main/main.c b/src/gtm/main/main.c index 03618ffe..78fbcff5 100644 --- a/src/gtm/main/main.c +++ b/src/gtm/main/main.c @@ -1092,28 +1092,28 @@ main(int argc, char *argv[]) } } while(max_retry_times > 0); - if(ret) - { - elog(FATAL, "GTM_StoreMasterInit failed too many times exit, %s", strerror(errno)); - } - - if (!gtm_standby_restore_next_gxid()) - { - elog(FATAL, "Failed to restore next/last gxid from the active-GTM."); - } - elog(LOG, "Restoring next/last gxid from the active-GTM succeeded."); + if(ret) + { + elog(FATAL, "GTM_StoreMasterInit failed too many times \"%s\", exit", strerror(errno)); + } + + if (!gtm_standby_restore_next_gxid()) + { + elog(FATAL, "Failed to restore next/last gxid from the active-GTM."); + } + elog(LOG, "Restoring next/last gxid from the active-GTM succeeded."); - if (!gtm_standby_restore_gxid()) - { - elog(FATAL, "Failed to restore all of gxid(s) from the active-GTM."); - } - elog(LOG, "Restoring all of gxid(s) from the active-GTM succeeded."); + if (!gtm_standby_restore_gxid()) + { + elog(FATAL, "Failed to restore all of gxid(s) from the active-GTM."); + } + elog(LOG, "Restoring all of gxid(s) from the active-GTM succeeded."); - if (!gtm_standby_restore_sequence()) - { - elog(FATAL, "Failed to restore sequences from the active-GTM."); - } - elog(LOG, "Restoring sequences from the active-GTM succeeded."); + if (!gtm_standby_restore_sequence()) + { + elog(FATAL, "Failed to restore sequences from the active-GTM."); + } + elog(LOG, "Restoring sequences from the active-GTM succeeded."); #else From df7eab923eaf6d4299e23a4932ab8c5eb05260a6 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Sat, 10 Oct 2020 16:53:10 +0800 Subject: [PATCH 071/578] for pooler statistics extension (merge request !112) --- contrib/tbase_pooler_stat/Makefile | 18 + .../tbase_pooler_stat--1.0.sql | 41 ++ .../tbase_pooler_stat--unpackaged--1.0.sql | 8 + contrib/tbase_pooler_stat/tbase_pooler_stat.c | 307 ++++++++++ .../tbase_pooler_stat.control | 5 + src/backend/libpq/pqformat.c | 17 +- src/backend/pgxc/pool/poolmgr.c | 545 +++++++++++++++--- src/include/libpq/pqformat.h | 1 + src/include/pgxc/poolmgr.h | 79 ++- 9 files changed, 913 insertions(+), 108 deletions(-) create mode 100644 contrib/tbase_pooler_stat/Makefile create mode 100644 contrib/tbase_pooler_stat/tbase_pooler_stat--1.0.sql create mode 100644 contrib/tbase_pooler_stat/tbase_pooler_stat--unpackaged--1.0.sql create mode 100644 contrib/tbase_pooler_stat/tbase_pooler_stat.c create mode 100644 contrib/tbase_pooler_stat/tbase_pooler_stat.control diff --git a/contrib/tbase_pooler_stat/Makefile b/contrib/tbase_pooler_stat/Makefile new file mode 100644 index 00000000..ed49584d --- /dev/null +++ b/contrib/tbase_pooler_stat/Makefile @@ -0,0 +1,18 @@ +# contrib/tbase_pooler_stat/Makefile + +MODULE_big = tbase_pooler_stat +OBJS = tbase_pooler_stat.o + +EXTENSION = tbase_pooler_stat +DATA = tbase_pooler_stat--1.0.sql tbase_pooler_stat--unpackaged--1.0.sql + +ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +else +subdir = contrib/tbase_pooler_stat +top_builddir = ../.. 
+include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif diff --git a/contrib/tbase_pooler_stat/tbase_pooler_stat--1.0.sql b/contrib/tbase_pooler_stat/tbase_pooler_stat--1.0.sql new file mode 100644 index 00000000..5ee8e1e6 --- /dev/null +++ b/contrib/tbase_pooler_stat/tbase_pooler_stat--1.0.sql @@ -0,0 +1,41 @@ +/* contrib/tbase_pooler_stat/tbase_pooler_stat--1.0.sql */ + +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "CREATE EXTENSION tbase_pooler_stat" to load this file. \quit + +-- Register functions. +CREATE OR REPLACE FUNCTION tbase_get_pooler_cmd_statistics( + OUT command_type text, + OUT request_times int8, + OUT avg_costtime int8, + OUT max_costtime int8, + OUT min_costtime int8 +) +RETURNS SETOF record +AS 'MODULE_PATHNAME' +LANGUAGE C; + + +CREATE OR REPLACE FUNCTION tbase_reset_pooler_cmd_statistics() +RETURNS void +AS 'MODULE_PATHNAME' +LANGUAGE C; + +CREATE OR REPLACE FUNCTION tbase_get_pooler_conn_statistics( + OUT database name, + OUT user_name name, + OUT node_name name, + OUT oid Oid, + OUT is_coord bool, + OUT conn_cnt int4, + OUT free_cnt int4, + OUT warming_cnt int4, + OUT query_cnt int4, + OUT exceed_keepalive_cnt int4, + OUT exceed_deadtime_cnt int4, + OUT exceed_maxlifetime_cnt int4 +) +RETURNS SETOF record +AS 'MODULE_PATHNAME' +LANGUAGE C; + diff --git a/contrib/tbase_pooler_stat/tbase_pooler_stat--unpackaged--1.0.sql b/contrib/tbase_pooler_stat/tbase_pooler_stat--unpackaged--1.0.sql new file mode 100644 index 00000000..86f8cb72 --- /dev/null +++ b/contrib/tbase_pooler_stat/tbase_pooler_stat--unpackaged--1.0.sql @@ -0,0 +1,8 @@ +/* contrib/tbase_pooler_stat/tbase_pooler_stat--unpackaged--1.0.sql */ + +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "CREATE EXTENSION tbase_pooler_stat FROM unpackaged" to load this file. \quit + +ALTER EXTENSION tbase_pooler_stat ADD function tbase_get_pooler_cmd_statistics(); +ALTER EXTENSION tbase_pooler_stat ADD function tbase_reset_pooler_cmd_statistics(); +ALTER EXTENSION tbase_pooler_stat ADD function tbase_get_pooler_conn_statistics(); \ No newline at end of file diff --git a/contrib/tbase_pooler_stat/tbase_pooler_stat.c b/contrib/tbase_pooler_stat/tbase_pooler_stat.c new file mode 100644 index 00000000..d85e5405 --- /dev/null +++ b/contrib/tbase_pooler_stat/tbase_pooler_stat.c @@ -0,0 +1,307 @@ +/* + * contrib/tbase_pooler_stat/tbase_pooler_stat.c + * + * tbase_pooler_stat.c + * + * Copyright (c) 2020 Tbase Kernel Group + * + * Permission to use, copy, modify, and distribute this software and + * its documentation for any purpose, without fee, and without a + * written agreement is hereby granted, provided that the above + * copyright notice and this paragraph and the following two + * paragraphs appear in all copies. + * + * IN NO EVENT SHALL THE AUTHOR BE LIABLE TO ANY PARTY FOR DIRECT, + * INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING + * LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS + * DOCUMENTATION, EVEN IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * + * THE AUTHOR SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS + * IS" BASIS, AND THE AUTHOR HAS NO OBLIGATIONS TO PROVIDE MAINTENANCE, + * SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. 
+ */ + +#include "postgres.h" +#include "funcapi.h" +#include "access/htup_details.h" +#include "catalog/pg_type.h" +#include +#include "pgxc/poolmgr.h" +#include "libpq/pqformat.h" +#include "utils/builtins.h" + +PG_MODULE_MAGIC; + +PG_FUNCTION_INFO_V1(tbase_get_pooler_cmd_statistics); +PG_FUNCTION_INFO_V1(tbase_reset_pooler_cmd_statistics); +PG_FUNCTION_INFO_V1(tbase_get_pooler_conn_statistics); + +typedef struct +{ + uint32 currIdx; /* current handle item id */ + PoolerCmdStatistics *buf; /* a fixed length buf store the result */ +} Pooler_CmdState; + +typedef struct +{ + uint32 total_node_cursor; /* total connection nodes count */ + const char *database; /* node_cursor's database */ + const char *username; /* node_cursor's username */ + uint32 node_cursor; /* current handle node cursor */ + StringInfo buf; /* a stringInfo buf store the result */ +} Pooler_ConnState; + + +/* the g_pooler_cmd_name_tab and g_pooler_cmd must be in the same order */ +static char *g_pooler_cmd_name_tab[POOLER_CMD_COUNT] = +{ + "ABORT", /* ABORT */ + "FIRE_TRANSACTION_BLOCK", /* Fire transaction-block commands on given nodes */ + "CONNECT", /* CONNECT */ + "DISCONNECT", /* DISCONNECT */ + "CLEAN_CONN", /* CLEAN CONNECTION */ + "GET_CONN", /* GET CONNECTIONS */ + "CANCEL_SQL", /* Cancel SQL Command in progress on specified connections */ + "LOCK_UNLOCK_POOLER", /* Lock/unlock pooler */ + "RELOAD_CONN", /* Reload connection info */ + "PING_CONN", /* Ping connection info */ + "CHECK_CONN", /* Check connection info consistency */ + "RELEASE_CONN", /* RELEASE CONNECTIONS */ + "REFRESH_CONN", /* Refresh connection info */ + "SESSION_RELATED", /* Session-related COMMAND */ + "CLOSE_POOLER_CONN", /* Close pooler connections*/ + "GET_CMD_STATSTICS", /* Get command statistics */ + "RESET_CMD_STATISTICS", /* Reset command statistics */ + "GET_CONN_STATISTICS" /* Get connection statistics */ +}; + +/* + * get pooler command statistics + */ +Datum +tbase_get_pooler_cmd_statistics(PG_FUNCTION_ARGS) +{ +#define LIST_POOLER_CMD_STATISTICS_COLUMNS 5 + FuncCallContext *funcctx; + int32 ret = 0; + Pooler_CmdState *status = NULL; + Datum values[LIST_POOLER_CMD_STATISTICS_COLUMNS]; + bool nulls[LIST_POOLER_CMD_STATISTICS_COLUMNS]; + HeapTuple tuple; + Datum result; + PoolerCmdStatistics stat_info; + int size = sizeof(PoolerCmdStatistics) * POOLER_CMD_COUNT; + + MemSet(values, 0, sizeof(values)); + MemSet(nulls, 0, sizeof(nulls)); + + if (SRF_IS_FIRSTCALL()) + { + MemoryContext oldcontext; + TupleDesc tupdesc; + + funcctx = SRF_FIRSTCALL_INIT(); + + oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); + + tupdesc = CreateTemplateTupleDesc(LIST_POOLER_CMD_STATISTICS_COLUMNS, false); + + TupleDescInitEntry(tupdesc, (AttrNumber) 1, "command_type", + TEXTOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 2, "request_times", + INT8OID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 3, "avg_costtime", + INT8OID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 4, "max_costtime", + INT8OID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 5, "min_costtime", + INT8OID, -1, 0); + funcctx->tuple_desc = BlessTupleDesc(tupdesc); + + status = (Pooler_CmdState*) palloc(sizeof(Pooler_CmdState)); + status->currIdx = 0; + status->buf = (PoolerCmdStatistics*) palloc(size); + + funcctx->user_fctx = (void*) status; + + ret = PoolManagerGetCmdStatistics((char*)status->buf, size); + if (ret) + { + elog(ERROR, "get pooler cmd statictics info from pooler failed"); + } + + MemoryContextSwitchTo(oldcontext); + } + + funcctx = 
SRF_PERCALL_SETUP(); + status = (Pooler_CmdState *) funcctx->user_fctx; + + while (status->currIdx < POOLER_CMD_COUNT) + { + stat_info.total_request_times = be64toh(status->buf[status->currIdx].total_request_times); + stat_info.total_costtime = be64toh(status->buf[status->currIdx].total_costtime); + stat_info.max_costtime = be64toh(status->buf[status->currIdx].max_costtime); + stat_info.min_costtime = be64toh(status->buf[status->currIdx].min_costtime); + + /* avg_costtime */ + stat_info.avg_costtime = (stat_info.total_request_times == 0) ? 0 : (stat_info.total_costtime / stat_info.total_request_times); + + values[0] = CStringGetTextDatum(g_pooler_cmd_name_tab[status->currIdx]); + values[1] = Int64GetDatum(stat_info.total_request_times); + values[2] = Int64GetDatum(stat_info.avg_costtime); + values[3] = Int64GetDatum(stat_info.max_costtime); + values[4] = Int64GetDatum(stat_info.min_costtime); + + status->currIdx++; + + tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls); + result = HeapTupleGetDatum(tuple); + SRF_RETURN_NEXT(funcctx, result); + } + + SRF_RETURN_DONE(funcctx); +} + +/* + * reset pooler command statistics + */ +Datum +tbase_reset_pooler_cmd_statistics(PG_FUNCTION_ARGS) +{ + PoolManagerResetCmdStatistics(); + + PG_RETURN_VOID(); +} + +/* + * get pooler connections statistics + */ +Datum +tbase_get_pooler_conn_statistics(PG_FUNCTION_ARGS) +{ +#define LIST_POOLER_CONN_STATISTICS_COLUMNS 12 + FuncCallContext *funcctx = NULL; + int32 ret = 0; + Pooler_ConnState *status = NULL; + Datum values[LIST_POOLER_CONN_STATISTICS_COLUMNS]; + bool nulls[LIST_POOLER_CONN_STATISTICS_COLUMNS]; + HeapTuple tuple; + Datum result; + + if (SRF_IS_FIRSTCALL()) + { + MemoryContext oldcontext; + TupleDesc tupdesc; + + /* content will destroy in SRF_RETURN_DONE */ + funcctx = SRF_FIRSTCALL_INIT(); + + oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); + + tupdesc = CreateTemplateTupleDesc(LIST_POOLER_CONN_STATISTICS_COLUMNS, false); + TupleDescInitEntry(tupdesc, (AttrNumber) 1, "database", + NAMEOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 2, "user_name", + NAMEOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 3, "node_name", + NAMEOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 4, "oid", + OIDOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 5, "is_coord", + BOOLOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 6, "conn_cnt", + INT4OID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 7, "free_cnt", + INT4OID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 8, "warming_cnt", + INT4OID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 9, "query_cnt", + INT4OID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 10, "exceed_keepalive_cnt", + INT4OID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 11, "exceed_deadtime_cnt", + INT4OID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 12, "exceed_maxlifetime_cnt", + INT4OID, -1, 0); + + funcctx->tuple_desc = BlessTupleDesc(tupdesc); + + status = (Pooler_ConnState*) palloc(sizeof(Pooler_ConnState)); + status->database = NULL; + status->username = NULL; + status->node_cursor = 0; + status->buf = makeStringInfo(); + + funcctx->user_fctx = (void*) status; + + ret = PoolManagerGetConnStatistics(status->buf); + if (ret) + { + elog(ERROR, "get pooler conn statictics info from pooler failed"); + } + else + { + status->total_node_cursor = pq_getmsgint(status->buf, sizeof(uint32)); + } + + MemoryContextSwitchTo(oldcontext); + } + + funcctx = SRF_PERCALL_SETUP(); + status = (Pooler_ConnState 
*) funcctx->user_fctx; + + while (status->total_node_cursor) + { + MemSet(values, 0, sizeof(values)); + MemSet(nulls, 0, sizeof(nulls)); + + if (status->node_cursor == 0) + { + /* get next database and username */ + status->database = pq_getmsgstring(status->buf); + status->username = pq_getmsgstring(status->buf); + status->node_cursor = pq_getmsgint(status->buf, sizeof(uint32)); + } + + values[0] = CStringGetDatum(status->database); + values[1] = CStringGetDatum(status->username); + if (status->node_cursor == 0) + { + nulls[2] = true; + nulls[3] = true; + nulls[4] = true; + nulls[5] = true; + nulls[6] = true; + nulls[7] = true; + nulls[8] = true; + nulls[9] = true; + nulls[10] = true; + nulls[11] = true; + } + else + { + values[2] = CStringGetDatum(pq_getmsgstring(status->buf)); + values[3] = ObjectIdGetDatum(pq_getmsgint(status->buf, sizeof(Oid))); + values[4] = BoolGetDatum(pq_getmsgint(status->buf, sizeof(bool))); + values[5] = UInt32GetDatum(pq_getmsgint(status->buf, sizeof(uint32))); + values[6] = UInt32GetDatum(pq_getmsgint(status->buf, sizeof(uint32))); + values[7] = UInt32GetDatum(pq_getmsgint(status->buf, sizeof(uint32))); + values[8] = UInt32GetDatum(pq_getmsgint(status->buf, sizeof(uint32))); + values[9] = UInt32GetDatum(pq_getmsgint(status->buf, sizeof(uint32))); + values[10] = UInt32GetDatum(pq_getmsgint(status->buf, sizeof(uint32))); + values[11] = UInt32GetDatum(pq_getmsgint(status->buf, sizeof(uint32))); + status->node_cursor--; + } + + status->total_node_cursor--; + + tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls); + result = HeapTupleGetDatum(tuple); + SRF_RETURN_NEXT(funcctx, result); + } + + SRF_RETURN_DONE(funcctx); +} \ No newline at end of file diff --git a/contrib/tbase_pooler_stat/tbase_pooler_stat.control b/contrib/tbase_pooler_stat/tbase_pooler_stat.control new file mode 100644 index 00000000..ae727b91 --- /dev/null +++ b/contrib/tbase_pooler_stat/tbase_pooler_stat.control @@ -0,0 +1,5 @@ +# tbase_pooler_stat extension +comment = 'pooler statistics' +default_version = '1.0' +module_pathname = '$libdir/tbase_pooler_stat' +relocatable = true diff --git a/src/backend/libpq/pqformat.c b/src/backend/libpq/pqformat.c index d2ba7d1b..2005bbd3 100644 --- a/src/backend/libpq/pqformat.c +++ b/src/backend/libpq/pqformat.c @@ -578,7 +578,22 @@ pq_copymsgbytes(StringInfo msg, char *buf, int datalen) } /* -------------------------------- - * pq_getmsgtext - get a counted text string (with conversion) + * pq_updatemsgbytes - update the content of the specified location with buf + * + * -------------------------------- + */ +void +pq_updatemsgbytes(StringInfo msg, int offset, char *buf, int datalen) +{ + if (datalen < 0 || offset < 0 || offset + datalen > msg->len) + ereport(ERROR, + (EPROTO, + errmsg("invalid update data in message"))); + memcpy(&msg->data[offset], buf, datalen); +} + +/* -------------------------------- + * pq_getmsgtext - get a counted text string (with conversion) * * Always returns a pointer to a freshly palloc'd result. * The result has a trailing null, *and* we return its strlen in *nbytes. 
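For reference, a minimal usage sketch of the new extension (illustrative only; the functions are those defined in tbase_pooler_stat--1.0.sql above, and the reported numbers depend on pooler activity on the node where it is run):

    CREATE EXTENSION tbase_pooler_stat;
    SELECT * FROM tbase_get_pooler_cmd_statistics();
    SELECT * FROM tbase_get_pooler_conn_statistics();
    SELECT tbase_reset_pooler_cmd_statistics();
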
diff --git a/src/backend/pgxc/pool/poolmgr.c b/src/backend/pgxc/pool/poolmgr.c index 0392d08f..d24a32f4 100644 --- a/src/backend/pgxc/pool/poolmgr.c +++ b/src/backend/pgxc/pool/poolmgr.c @@ -66,6 +66,7 @@ #include "utils/varlena.h" #include "port.h" #include +#include /* the mini use conut of a connection */ #define MINI_USE_COUNT 10 @@ -138,6 +139,33 @@ typedef struct PoolerStatistics PoolerStatistics g_pooler_stat; +/* global command statistics handle */ +PoolerCmdStatistics* g_pooler_cmd_stat = NULL; + +unsigned char g_pooler_cmd[POOLER_CMD_COUNT] = +{ + 'a', /* ABORT */ + 'b', /* Fire transaction-block commands on given nodes */ + 'c', /* CONNECT */ + 'd', /* DISCONNECT */ + 'f', /* CLEAN CONNECTION */ + 'g', /* GET CONNECTIONS */ + 'h', /* Cancel SQL Command in progress on specified connections */ + 'o', /* Lock/unlock pooler */ + 'p', /* Reload connection info */ + 'P', /* Ping connection info */ + 'q', /* Check connection info consistency */ + 'r', /* RELEASE CONNECTIONS */ + 'R', /* Refresh connection info */ + 's', /* Session-related COMMAND */ + 't', /* Close pooler connections*/ + 'x', /* Get command statistics */ + 'y', /* Reset command statistics */ + 'z' /* Get connection statistics */ +}; + +/* a map used to change msgtype to id */ +uint8 g_qtype2id[256]; /* Flag to tell if we are Postgres-XC pooler process */ static bool am_pgxc_pooler = false; @@ -346,25 +374,27 @@ char *poolErrorMsg[] = {"No Error", typedef struct { - int32 cmd; /* refer to handle_agent_input command tag */ - bool bCoord; /* coordinator or datanode*/ - PGXCASyncTaskCtl *taskControl; - PoolAgent *agent; - PGXCNodePool *nodepool; /* node pool for current node */ - PGXCNodePoolSlot *slot; /* connection slot , no need to free */ - int32 current_status; /* currrent connect status*/ - int32 final_status; /* final status we are going to get to*/ - int32 nodeindex; /* node index of the remote peer */ - bool needfree; /* whether need to free taskControl, last thread set the flag */ - - int32 req_seq; /* req sequence number */ - int32 pid; /* pid that acquires the connection */ - bool needConnect; /* check whether we need to build a new connection , we acquire new connections */ - bool error_flag; /* set when error */ - SendSetQueryStatus setquery_status; /* send set query status */ - struct timeval start_time; /* when acquire conn by sync thread, the time begin request */ - struct timeval end_time; /* when acquire conn by sync thread, the time finish request */ - char errmsg[POOLER_ERROR_MSG_LEN]; + int32 cmd; /* refer to handle_agent_input command tag */ + bool bCoord; /* coordinator or datanode*/ + PGXCASyncTaskCtl *taskControl; + PoolAgent *agent; + PGXCNodePool *nodepool; /* node pool for current node */ + PGXCNodePoolSlot *slot; /* connection slot , no need to free */ + int32 current_status; /* currrent connect status*/ + int32 final_status; /* final status we are going to get to*/ + int32 nodeindex; /* node index of the remote peer */ + bool needfree; /* whether need to free taskControl, last thread set the flag */ + + int32 req_seq; /* req sequence number */ + int32 pid; /* pid that acquires the connection */ + bool needConnect; /* check whether we need to build a new connection , we acquire new connections */ + bool error_flag; /* set when error */ + SendSetQueryStatus setquery_status; /* send set query status */ + struct timeval start_time; /* when acquire conn by sync thread, the time begin request */ + struct timeval end_time; /* when acquire conn by sync thread, the time finish request */ + char 
errmsg[POOLER_ERROR_MSG_LEN]; + pg_time_t cmd_start_time; /* command start time, including the processing time in the main process */ + pg_time_t cmd_end_time; /* command end time */ }PGXCPoolAsyncReq; static inline void RebuildAgentIndex(void); @@ -562,6 +592,11 @@ static int handle_close_pooled_connections(PoolAgent * agent, StringInfo s); static void ConnectPoolManager(void); #endif +static void init_pooler_cmd_statistics(void); +static void reset_pooler_cmd_statistics(void); +static void update_pooler_cmd_statistics(unsigned char qtype, uint64 costtime); +static void handle_get_cmd_statistics(PoolAgent *agent); +static void handle_get_conn_statistics(PoolAgent *agent); #define IncreaseSlotRefCount(slot,filename,linenumber)\ do\ @@ -693,8 +728,6 @@ do\ }\ }while(0) - - void PGXCPoolerProcessIam(void) { @@ -1460,6 +1493,95 @@ PoolManagerLock(bool is_lock) RESUME_POOLER_RELOAD(); } +/* + * get pooler command statistics + */ +int +PoolManagerGetCmdStatistics(char *s, int size) +{ + int qtype = 0; + char msgtype = 'x'; + HOLD_POOLER_RELOAD(); + + if (poolHandle == NULL) + { + ConnectPoolManager(); + } + + /* Message type */ + pool_putbytes(&poolHandle->port, &msgtype, 1); + pool_flush(&poolHandle->port); + + qtype = pool_getbyte(&poolHandle->port); + if (qtype == EOF || (unsigned char)qtype != msgtype) + { + elog(ERROR, POOL_MGR_PREFIX"get command statistics error, qtype:%d", qtype); + RESUME_POOLER_RELOAD(); + return -1; + } + + /* get all command statistics messages */ + pool_getbytes(&poolHandle->port, s, size); + + RESUME_POOLER_RELOAD(); + return 0; +} + +/* + * reset command statistics + */ +void +PoolManagerResetCmdStatistics(void) +{ + char msgtype = 'y'; + HOLD_POOLER_RELOAD(); + + if (poolHandle == NULL) + { + ConnectPoolManager(); + } + + /* Message type */ + pool_putbytes(&poolHandle->port, &msgtype, 1); + pool_flush(&poolHandle->port); + + RESUME_POOLER_RELOAD(); +} + +/* + * get pooler connections statistics + */ +int +PoolManagerGetConnStatistics(StringInfo s) +{ + int qtype = 0; + char msgtype = 'z'; + HOLD_POOLER_RELOAD(); + + if (poolHandle == NULL) + { + ConnectPoolManager(); + } + + /* Message type */ + pool_putbytes(&poolHandle->port, &msgtype, 1); + pool_flush(&poolHandle->port); + + qtype = pool_getbyte(&poolHandle->port); + if (qtype == EOF || (unsigned char)qtype != msgtype) + { + elog(ERROR, POOL_MGR_PREFIX"get conn statistics error, qtype:%d", qtype); + RESUME_POOLER_RELOAD(); + return -1; + } + + /* get all the messages left */ + pool_getmessage(&poolHandle->port, s, 0); + + RESUME_POOLER_RELOAD(); + return 0; +} + /* * Init PoolAgent */ @@ -1953,6 +2075,16 @@ PoolManagerReloadConnectionInfo(void) pool_flush(&poolHandle->port); } +/* + * get systime time, ms + */ +static pg_time_t +get_system_time() +{ + struct timeb t; + ftime(&t); + return 1000 * t.time + t.millitm; +} /* * Handle messages to agent @@ -5097,7 +5229,8 @@ PoolerLoop(void) pool_fd[i].events = POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND; } - reset_pooler_statistics(); + reset_pooler_statistics(); + init_pooler_cmd_statistics(); for (;;) { @@ -8043,9 +8176,12 @@ static inline bool dispatch_connection_request(PGXCASyncTaskCtl *taskControl, { taskControl->m_status = PoolAyncCtlStaus_dispatched; - /* also use this request to response to session*/ - req->final_status = PoolConnectStaus_destory; - } + /* also use this request to response to session*/ + req->final_status = PoolConnectStaus_destory; + + /* if last request, transfer cmd_start_time to req */ + req->cmd_start_time = agent->cmd_start_time; + } 
if (PoolConnectDebugPrint) { @@ -8081,8 +8217,16 @@ static inline bool dispatch_connection_request(PGXCASyncTaskCtl *taskControl, snprintf(agent->port.err_msg, POOL_ERR_MSG_LEN, "%s", poolErrorMsg[agent->port.error_code]); SpinLockRelease(&agent->port.lock); #endif + } + else + { + if (dispatched) + { + /* dispatch success, clear cmd start time in agent */ + agent->cmd_start_time = 0; + } } - return ret; + return ret; } @@ -8116,31 +8260,43 @@ static inline bool dispatch_local_set_request(PGXCASyncTaskCtl *taskControl, { taskControl->m_status = PoolAyncCtlStaus_dispatched; - /* also use this request to response to session*/ - req->final_status = PoolLocalSetStatus_destory; - req->current_status = PoolLocalSetStatus_destory; - } - + /* also use this request to response to session*/ + req->final_status = PoolLocalSetStatus_destory; + req->current_status = PoolLocalSetStatus_destory; + + /* if last request, transfer cmd_start_time to req */ + req->cmd_start_time = agent->cmd_start_time; + } + if (PoolConnectDebugPrint) { elog(LOG, POOL_MGR_PREFIX"pid:%d dispatch async local set nodeindex:%d connection, current status:%d final status:%d", agent->pid, nodeindex, req->current_status, req->final_status); } - if (dispatched) + if (dispatched) + { + if (PoolConnectDebugPrint) + { + elog(LOG, POOL_MGR_PREFIX"pid:%d dispatch last local set request!! nodeindex:%d connection, current status:%d final status:%d request_num:%d", agent->pid, nodeindex, req->current_status, req->final_status, taskControl->m_mumber_total); + } + } + ret = dispatch_async_network_operation(req); + if (!ret) + { + elog(LOG, POOL_MGR_PREFIX"pid:%d dispatch async local set request failed!! nodeindex:%d connection, current status:%d final status:%d request_num:%d", agent->pid, nodeindex, req->current_status, req->final_status, taskControl->m_mumber_total); + pfree(req); + } + else { - if (PoolConnectDebugPrint) + if (dispatched) { - elog(LOG, POOL_MGR_PREFIX"pid:%d dispatch last local set request!! nodeindex:%d connection, current status:%d final status:%d request_num:%d", agent->pid, nodeindex, req->current_status, req->final_status, taskControl->m_mumber_total); + /* dispatch success, clear cmd start time in agent */ + agent->cmd_start_time = 0; } } - ret = dispatch_async_network_operation(req); - if (!ret) - { - elog(LOG, POOL_MGR_PREFIX"pid:%d dispatch async local set request failed!! 
nodeindex:%d connection, current status:%d final status:%d request_num:%d", agent->pid, nodeindex, req->current_status, req->final_status, taskControl->m_mumber_total); - pfree(req); - } - return ret; + + return ret; } static inline bool dispatch_set_command_request(PGXCASyncTaskCtl *taskControl, @@ -8183,10 +8339,13 @@ static inline bool dispatch_set_command_request(PGXCASyncTaskCtl *taskControl, { taskControl->m_status = PoolAyncCtlStaus_dispatched; - /* also use this request to response to session*/ - req->final_status = PoolSetCommandStatus_destory; - req->current_status = PoolSetCommandStatus_destory; - } + /* also use this request to response to session*/ + req->final_status = PoolSetCommandStatus_destory; + req->current_status = PoolSetCommandStatus_destory; + + /* if last request, transfer cmd_start_time to req */ + req->cmd_start_time = agent->cmd_start_time; + } if (PoolConnectDebugPrint) { @@ -8207,14 +8366,22 @@ static inline bool dispatch_set_command_request(PGXCASyncTaskCtl *taskControl, } } - ret = dispatch_async_network_operation(req); - if (!ret) + ret = dispatch_async_network_operation(req); + if (!ret) + { + if (slot) + { + elog(LOG, POOL_MGR_PREFIX"pid:%d dispatch async set command request failed!! nodeindex:%d connection nodename:%s backend_pid:%d current status:%d final status:%d request_num:%d command:%s", agent->pid, nodeindex, slot->node_name, slot->backend_pid, req->current_status, req->final_status, taskControl->m_mumber_total, taskControl->m_command); + } + pfree(req); + } + else { - if (slot) + if (dispatched) { - elog(LOG, POOL_MGR_PREFIX"pid:%d dispatch async set command request failed!! nodeindex:%d connection nodename:%s backend_pid:%d current status:%d final status:%d request_num:%d command:%s", agent->pid, nodeindex, slot->node_name, slot->backend_pid, req->current_status, req->final_status, taskControl->m_mumber_total, taskControl->m_command); + /* dispatch success, clear cmd start time in agent */ + agent->cmd_start_time = 0; } - pfree(req); } if (PoolConnectDebugPrint) @@ -8268,34 +8435,37 @@ static inline bool dispatch_cancle_request(PGXCASyncTaskCtl *taskControl, { taskControl->m_status = PoolAyncCtlStaus_dispatched; - /* use this request to response to session*/ - req->current_status = PoolCancelStatus_destory; - req->final_status = PoolCancelStatus_destory; - } - - if (bCoord) - { - slot = agent->coord_connections[nodeindex]; - - } - else - { - slot = agent->dn_connections[nodeindex]; - } - - if (PoolConnectDebugPrint) - { - if (slot) - { - elog(LOG, POOL_MGR_PREFIX"pid:%d dispatch async CANCLE_QUERY nodeindex:%d connection, nodename:%s backend_pid:%d current status:%d final status:%d", - agent->pid, - nodeindex, - slot->node_name, - slot->backend_pid, - req->current_status, - req->final_status); - } - } + /* use this request to response to session*/ + req->current_status = PoolCancelStatus_destory; + req->final_status = PoolCancelStatus_destory; + + /* if last request, transfer cmd_start_time to req */ + req->cmd_start_time = agent->cmd_start_time; + } + + if (bCoord) + { + slot = agent->coord_connections[nodeindex]; + + } + else + { + slot = agent->dn_connections[nodeindex]; + } + + if (PoolConnectDebugPrint) + { + if (slot) + { + elog(LOG, POOL_MGR_PREFIX"pid:%d dispatch async CANCLE_QUERY nodeindex:%d connection, nodename:%s backend_pid:%d current status:%d final status:%d", + agent->pid, + nodeindex, + slot->node_name, + slot->backend_pid, + req->current_status, + req->final_status); + } + } if (dispatched) { @@ -8336,8 +8506,17 @@ static 
inline bool dispatch_cancle_request(PGXCASyncTaskCtl *taskControl, snprintf(agent->port.err_msg, POOL_ERR_MSG_LEN, "%s", poolErrorMsg[agent->port.error_code]); SpinLockRelease(&agent->port.lock); #endif + } + else + { + if (dispatched) + { + /* dispatch success, clear cmd start time in agent */ + agent->cmd_start_time = 0; + } } - return ret; + + return ret; } @@ -10761,3 +10940,211 @@ ConnectPoolManager(void) } #endif + +/* + * init pooler command statistics + */ +static void +init_pooler_cmd_statistics(void) +{ + int i = 0; + memset(g_qtype2id, -1, sizeof(g_qtype2id)); + + /* init type to id map */ + for (i = 0; i < POOLER_CMD_COUNT; i++) + { + g_qtype2id[g_pooler_cmd[i]] = i; + } + + /* init global statistics array */ + g_pooler_cmd_stat = (PoolerCmdStatistics*) palloc(POOLER_CMD_COUNT * sizeof(PoolerCmdStatistics)); + for (i = 0; i < POOLER_CMD_COUNT; i++) + { + g_pooler_cmd_stat[i].total_request_times = 0; + g_pooler_cmd_stat[i].total_costtime = 0; + g_pooler_cmd_stat[i].max_costtime = 0; + g_pooler_cmd_stat[i].min_costtime = MAX_UINT64; + } +} + +/* + * reset pooler command statistics + */ +static void +reset_pooler_cmd_statistics(void) +{ + int i = 0; + /* reset global statistics array */ + for (i = 0; i < POOLER_CMD_COUNT; i++) + { + g_pooler_cmd_stat[i].total_request_times = 0; + g_pooler_cmd_stat[i].total_costtime = 0; + g_pooler_cmd_stat[i].max_costtime = 0; + g_pooler_cmd_stat[i].min_costtime = MAX_UINT64; + } +} + +/* + * update pooler command statistics info + */ +static void +update_pooler_cmd_statistics(unsigned char qtype, uint64 costtime) +{ + uint8 id = g_qtype2id[qtype]; + if (id == MAX_UINT8) + { + return; + } + + g_pooler_cmd_stat[id].total_request_times += 1; + g_pooler_cmd_stat[id].total_costtime += costtime; + + if (costtime > g_pooler_cmd_stat[id].max_costtime) + { + g_pooler_cmd_stat[id].max_costtime = costtime; + } + + if (costtime < g_pooler_cmd_stat[id].min_costtime) + { + g_pooler_cmd_stat[id].min_costtime = costtime; + } +} + +/* + * handle get command statistics + */ +static void +handle_get_cmd_statistics(PoolAgent *agent) +{ + int i = 0; + uint64 n64 = 0; + char msgtype = 'x'; + + /* response message type */ + pool_putbytes(&agent->port, &msgtype, 1); + + /* fixed length command statistics info */ + for (i = 0; i < POOLER_CMD_COUNT; i++) + { + n64 = htobe64(g_pooler_cmd_stat[i].total_request_times); + pool_putbytes(&agent->port, (char *) &n64, sizeof(n64)); + + n64 = htobe64(g_pooler_cmd_stat[i].total_costtime); + pool_putbytes(&agent->port, (char *) &n64, sizeof(n64)); + + n64 = htobe64(g_pooler_cmd_stat[i].max_costtime); + pool_putbytes(&agent->port, (char *) &n64, sizeof(n64)); + + n64 = htobe64(g_pooler_cmd_stat[i].min_costtime); + pool_putbytes(&agent->port, (char *) &n64, sizeof(n64)); + } + + pool_flush(&agent->port); +} + +/* + * handle get connections statistics + */ +static void +handle_get_conn_statistics(PoolAgent *agent) +{ + DatabasePool *database_pool = databasePools; + HASH_SEQ_STATUS hseq_status; + PGXCNodePool *node_pool = NULL; + + uint32 node_cnt = 0; /* the nodes count use the same database and username */ + uint32 total_node_cnt = 0; /* total nodes count */ + + /* var offset in buf */ + uint32 node_cnt_offset = 0; + uint32 total_node_cnt_offset = 0; + + uint32 exceed_keepalive_cnt = 0; + uint32 exceed_deadtime_cnt = 0; + uint32 exceed_maxlifetime_cnt = 0; + int i = 0; + PGXCNodePoolSlot *slot = NULL; + time_t now = time(NULL); + StringInfoData buf; + + initStringInfo(&buf); + /* reserve a place for total_node_cnt, record the offset 
of total_node_cnt */ + total_node_cnt_offset = buf.len; + pq_sendint(&buf, total_node_cnt, sizeof(uint32)); + + /* total node count | database | username | node count in the same database and username | node pool conn statistics | ... | database | username | ... */ + while (database_pool) + { + pq_sendstring(&buf, database_pool->database); + pq_sendstring(&buf, database_pool->user_name); + + /* reserve a place for node_cnt, record the offset of node_cnt */ + node_cnt = 0; + node_cnt_offset = buf.len; + pq_sendint(&buf, node_cnt, sizeof(uint32)); + + /* traverse all node_pool in hashtable */ + hash_seq_init(&hseq_status, database_pool->nodePools); + while ((node_pool = (PGXCNodePool *) hash_seq_search(&hseq_status))) + { + node_cnt++; + + pq_sendstring(&buf, node_pool->node_name); + pq_sendint(&buf, node_pool->nodeoid, sizeof(Oid)); + pq_sendint(&buf, node_pool->coord, sizeof(bool)); + pq_sendint(&buf, node_pool->size, sizeof(uint32)); + pq_sendint(&buf, node_pool->freeSize, sizeof(uint32)); + pq_sendint(&buf, node_pool->nwarming, sizeof(uint32)); + pq_sendint(&buf, node_pool->nquery, sizeof(uint32)); + + /* reset statistics count */ + exceed_keepalive_cnt = 0; + exceed_deadtime_cnt = 0; + exceed_maxlifetime_cnt = 0; + /* statistical connection life cycle */ + if (node_pool->slot) + { + for (i = 0; i < node_pool->freeSize; i++) + { + slot = node_pool->slot[i]; + if (difftime(now, slot->released) > PoolConnKeepAlive) + { + exceed_keepalive_cnt++; + } + + if (difftime(now, slot->created) > PoolConnDeadtime) + { + exceed_deadtime_cnt++; + } + + if (difftime(now, slot->created) >= PoolConnMaxLifetime) + { + exceed_maxlifetime_cnt++; + } + } + } + + pq_sendint(&buf, exceed_keepalive_cnt, sizeof(uint32)); + pq_sendint(&buf, exceed_deadtime_cnt, sizeof(uint32)); + pq_sendint(&buf, exceed_maxlifetime_cnt, sizeof(uint32)); + } + + + total_node_cnt += node_cnt; + + /* change the nodes count in message buff */ + node_cnt = htonl(node_cnt); + pq_updatemsgbytes(&buf, node_cnt_offset, (char*) &node_cnt, sizeof(uint32)); + database_pool = database_pool->next; + } + + /* change the total nodes count in message buff */ + total_node_cnt = htonl(total_node_cnt); + pq_updatemsgbytes(&buf, total_node_cnt_offset, (char*) &total_node_cnt, sizeof(uint32)); + + /* send messages */ + pool_putmessage(&agent->port, 'z', buf.data, buf.len); + pool_flush(&agent->port); + + pfree(buf.data); +} diff --git a/src/include/libpq/pqformat.h b/src/include/libpq/pqformat.h index bc1cb48f..0714f261 100644 --- a/src/include/libpq/pqformat.h +++ b/src/include/libpq/pqformat.h @@ -42,6 +42,7 @@ extern float4 pq_getmsgfloat4(StringInfo msg); extern float8 pq_getmsgfloat8(StringInfo msg); extern const char *pq_getmsgbytes(StringInfo msg, int datalen); extern void pq_copymsgbytes(StringInfo msg, char *buf, int datalen); +extern void pq_updatemsgbytes(StringInfo msg, int offset, char *buf, int datalen); extern char *pq_getmsgtext(StringInfo msg, int rawbytes, int *nbytes); extern const char *pq_getmsgstring(StringInfo msg); extern const char *pq_getmsgrawstring(StringInfo msg); diff --git a/src/include/pgxc/poolmgr.h b/src/include/pgxc/poolmgr.h index bf90897e..c0d996d8 100644 --- a/src/include/pgxc/poolmgr.h +++ b/src/include/pgxc/poolmgr.h @@ -186,34 +186,36 @@ typedef struct PGXCASyncTaskCtl */ typedef struct { - /* Process ID of postmaster child process associated to pool agent */ - int pid; - /* communication channel */ - PoolPort port; - DatabasePool *pool; - MemoryContext mcxt; - int num_dn_connections; - int 
num_coord_connections; - Oid *dn_conn_oids; /* one for each Datanode */ - Oid *coord_conn_oids; /* one for each Coordinator */ - PGXCNodePoolSlot **dn_connections; /* one for each Datanode */ - PGXCNodePoolSlot **coord_connections; /* one for each Coordinator */ - - char *session_params; - char *local_params; - List *session_params_list; /* session param list */ - List *local_params_list; /* local param list */ - - bool is_temp; /* Temporary objects used for this pool session? */ - - int query_count; /* query count, if exceed, need to reconnect database */ - bool breconnecting; /* whether we are reconnecting */ - int agentindex; - - - bool destory_pending; /* whether we have been ordered to destory */ - int32 ref_count; /* reference count */ - PGXCASyncTaskCtl *task_control; /* in error situation, we need to free the task control */ + /* Process ID of postmaster child process associated to pool agent */ + int pid; + /* communication channel */ + PoolPort port; + DatabasePool *pool; + MemoryContext mcxt; + int num_dn_connections; + int num_coord_connections; + Oid *dn_conn_oids; /* one for each Datanode */ + Oid *coord_conn_oids; /* one for each Coordinator */ + PGXCNodePoolSlot **dn_connections; /* one for each Datanode */ + PGXCNodePoolSlot **coord_connections; /* one for each Coordinator */ + + char *session_params; + char *local_params; + List *session_params_list; /* session param list */ + List *local_params_list; /* local param list */ + + bool is_temp; /* Temporary objects used for this pool session? */ + + int query_count; /* query count, if exceed, need to reconnect database */ + bool breconnecting; /* whether we are reconnecting */ + int agentindex; + + + bool destory_pending; /* whether we have been ordered to destory */ + int32 ref_count; /* reference count */ + PGXCASyncTaskCtl *task_control; /* in error situation, we need to free the task control */ + + pg_time_t cmd_start_time; /* command start time */ } PoolAgent; /* Handle to the pool manager (Session's side) */ @@ -223,6 +225,23 @@ typedef struct PoolPort port; } PoolHandle; +typedef struct PoolerCmdStatistics +{ + uint64 total_request_times; /* command total request times */ + union + { + uint64 total_costtime; /* total time spent processing commands */ + uint64 avg_costtime; /* avg time spent processing command */ + }; + uint64 max_costtime; /* max time spent processing command */ + uint64 min_costtime; /* min time spent processing command */ +} PoolerCmdStatistics; + + +#define POOLER_CMD_COUNT (18) + + + #define POOLER_ERROR_MSG_LEN 256 extern int MinPoolSize; @@ -349,4 +368,8 @@ extern bool check_persistent_connections(bool *newval, void **extra, extern int PoolManagerRefreshConnectionInfo(void); extern int PoolManagerClosePooledConnections(const char *dbname, const char *username); +extern int PoolManagerGetCmdStatistics(char *s, int size); +extern void PoolManagerResetCmdStatistics(void); +extern int PoolManagerGetConnStatistics(StringInfo s); + #endif From 1adcec8a4e09b7b37cb337f8f74921e86126b6c4 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Thu, 15 Oct 2020 17:02:18 +0800 Subject: [PATCH 072/578] add tbase_pooler_stat Makefile --- contrib/Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/contrib/Makefile b/contrib/Makefile index c1ff5226..9c5df8bf 100644 --- a/contrib/Makefile +++ b/contrib/Makefile @@ -52,7 +52,8 @@ SUBDIRS = \ tsm_system_time \ unaccent \ vacuumlo \ - stormstats + stormstats \ + tbase_pooler_stat ifeq ($(with_openssl),yes) SUBDIRS += sslinfo From 
84a203be06e9b30b40b7866957d9375b26c6fc9d Mon Sep 17 00:00:00 2001 From: yeyukui Date: Mon, 2 Nov 2020 14:13:26 +0800 Subject: [PATCH 073/578] fix bug about "create extension if not exists" --- src/backend/tcop/utility.c | 125 ++++++++++++++-------------- src/include/catalog/objectaddress.h | 8 ++ 2 files changed, 72 insertions(+), 61 deletions(-) diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c index f5d10269..4a8871b6 100644 --- a/src/backend/tcop/utility.c +++ b/src/backend/tcop/utility.c @@ -3853,67 +3853,70 @@ ProcessUtilitySlow(ParseState *pstate, } #endif - /* - * Add the CREATE INDEX node itself to stash right away; - * if there were any commands stashed in the ALTER TABLE - * code, we need them to appear after this one. - */ - EventTriggerCollectSimpleCommand(address, secondaryObject, - parsetree); - commandCollected = true; - EventTriggerAlterTableEnd(); - } - break; - - case T_CreateExtensionStmt: -#ifdef __TBASE__ - { - CreateExtensionStmt *stmt = (CreateExtensionStmt *) parsetree; - char *extension_query_string = NULL; - if (IS_PGXC_LOCAL_COORDINATOR && CREATEEXT_CREATE == stmt->action) - { - StringInfo qstring; - /* stage 1 */ - address = PrepareExtension(pstate, stmt); - - qstring = makeStringInfo(); - initStringInfo(qstring); - - appendStringInfo(qstring, - _("PREPARE %s"), - queryString); - /* Send prepare extension msg to all other cn and dn */ - extension_query_string = qstring->data; - ExecUtilityStmtOnNodes(parsetree, extension_query_string, NULL, sentToRemote, false, EXEC_ON_ALL_NODES, false, false); - - /* stage 2 */ - ExecuteExtension(pstate, (CreateExtensionStmt *) parsetree); - resetStringInfo(qstring); - appendStringInfo(qstring, - _("EXECUTE %s"), - queryString); - /* Send execute extension msg to all other cn and dn */ - extension_query_string = qstring->data; - ExecUtilityStmtOnNodes(parsetree, extension_query_string, NULL, sentToRemote, false, EXEC_ON_ALL_NODES, false, false); - - pfree(qstring->data); - pfree(qstring); - } - else if (CREATEEXT_PREPARE == stmt->action) - { - address = PrepareExtension(pstate, stmt); - } - else if (CREATEEXT_EXECUTE == stmt->action) - { - ExecuteExtension(pstate, stmt); - } - else - { - address = CreateExtension(pstate, (CreateExtensionStmt *) parsetree); - } - - break; - } + /* + * Add the CREATE INDEX node itself to stash right away; + * if there were any commands stashed in the ALTER TABLE + * code, we need them to appear after this one. 
+ */ + EventTriggerCollectSimpleCommand(address, secondaryObject, + parsetree); + commandCollected = true; + EventTriggerAlterTableEnd(); + } + break; + + case T_CreateExtensionStmt: +#ifdef __TBASE__ + { + CreateExtensionStmt *stmt = (CreateExtensionStmt *) parsetree; + char *extension_query_string = NULL; + if (IS_PGXC_LOCAL_COORDINATOR && CREATEEXT_CREATE == stmt->action) + { + StringInfo qstring; + /* stage 1 */ + address = PrepareExtension(pstate, stmt); + + if (ObjectAddressIsEqual(InvalidObjectAddress, address)) + break; + + qstring = makeStringInfo(); + initStringInfo(qstring); + + appendStringInfo(qstring, + _("PREPARE %s"), + queryString); + /* Send prepare extension msg to all other cn and dn */ + extension_query_string = qstring->data; + ExecUtilityStmtOnNodes(parsetree, extension_query_string, NULL, sentToRemote, false, EXEC_ON_ALL_NODES, false, false); + + /* stage 2 */ + ExecuteExtension(pstate, (CreateExtensionStmt *) parsetree); + resetStringInfo(qstring); + appendStringInfo(qstring, + _("EXECUTE %s"), + queryString); + /* Send execute extension msg to all other cn and dn */ + extension_query_string = qstring->data; + ExecUtilityStmtOnNodes(parsetree, extension_query_string, NULL, sentToRemote, false, EXEC_ON_ALL_NODES, false, false); + + pfree(qstring->data); + pfree(qstring); + } + else if (CREATEEXT_PREPARE == stmt->action) + { + address = PrepareExtension(pstate, stmt); + } + else if (CREATEEXT_EXECUTE == stmt->action) + { + ExecuteExtension(pstate, stmt); + } + else + { + address = CreateExtension(pstate, (CreateExtensionStmt *) parsetree); + } + + break; + } #endif case T_AlterExtensionStmt: address = ExecAlterExtensionStmt(pstate, (AlterExtensionStmt *) parsetree); diff --git a/src/include/catalog/objectaddress.h b/src/include/catalog/objectaddress.h index edfa0219..0d80f74c 100644 --- a/src/include/catalog/objectaddress.h +++ b/src/include/catalog/objectaddress.h @@ -90,6 +90,14 @@ typedef struct ObjectAddress extern const ObjectAddress InvalidObjectAddress; +/* + * Compare whether two ObjectAddress are the same + */ +#define ObjectAddressIsEqual(addr1, addr2) \ + ((addr1).classId == (addr2).classId && \ + (addr1).objectId == (addr2).objectId && \ + (addr1).objectSubId == (addr2).objectSubId) + #define ObjectAddressSubSet(addr, class_id, object_id, object_sub_id) \ do { \ (addr).classId = (class_id); \ From 2ab5df38308972addcc577c5c37ae8659d6a2298 Mon Sep 17 00:00:00 2001 From: yeyukui Date: Wed, 4 Nov 2020 11:46:44 +0800 Subject: [PATCH 074/578] * add a parameter to support pg_dumpall dump security data * fix bug about 'create extension if not exists' --- src/bin/pg_dump/pg_dumpall.c | 637 +++++++++++++++++++++-------------- 1 file changed, 377 insertions(+), 260 deletions(-) diff --git a/src/bin/pg_dump/pg_dumpall.c b/src/bin/pg_dump/pg_dumpall.c index 6354cdde..6d885bc7 100644 --- a/src/bin/pg_dump/pg_dumpall.c +++ b/src/bin/pg_dump/pg_dumpall.c @@ -27,6 +27,7 @@ /* version string we expect back from pg_dump */ #define PGDUMP_VERSIONSTR "pg_dump (PostgreSQL) " PG_VERSION "\n" +#define PGDUM_SERCURITY_VERSIONSTR "pg_dump_security (TBase) " PG_VERSION "\n" static void help(void); @@ -48,7 +49,9 @@ static void makeAlterConfigCommand(PGconn *conn, const char *arrayitem, static void dumpDatabases(PGconn *conn); static void dumpTimestamp(const char *msg); -static int runPgDump(const char *dbname); +static int runPgDump(const char *dbname); +static int runPgDumpSecurity(PGconn *conn, const char *pghost, const char *pgport, + const char *pguser, trivalue 
prompt_password); static void buildShSecLabels(PGconn *conn, const char *catalog_name, uint32 objectId, PQExpBuffer buffer, const char *target, const char *objname); @@ -64,8 +67,10 @@ static void dumpNodeGroups(PGconn *conn); #endif /* PGXC */ static char pg_dump_bin[MAXPGPATH]; +static char pg_dump_security_bin[MAXPGPATH]; static const char *progname; static PQExpBuffer pgdumpopts; +static PQExpBuffer pgdumpsecurityopts; static char *connstr = ""; static bool skip_acls = false; static bool verbose = false; @@ -99,6 +104,10 @@ static int include_nodes = 0; #endif /* PGXC */ #define exit_nicely(code) exit(code) +#ifdef __TBASE__ +static int dump_security_data = 0; +#endif + int main(int argc, char *argv[]) {// #lizard forgives @@ -152,270 +161,308 @@ main(int argc, char *argv[]) {"dump-nodes", no_argument, &dump_nodes, 1}, //{"include-nodes", no_argument, &include_nodes, 1}, #endif - {NULL, 0, NULL, 0} - }; - - char *pghost = NULL; - char *pgport = NULL; - char *pguser = NULL; - char *pgdb = NULL; - char *use_role = NULL; - trivalue prompt_password = TRI_DEFAULT; - bool data_only = false; - bool globals_only = false; - bool output_clean = false; - bool roles_only = false; - bool tablespaces_only = false; - PGconn *conn; - int encoding; - const char *std_strings; - int c, - ret; - int optindex; - - set_pglocale_pgservice( argv[0], PG_TEXTDOMAIN("pg_dump") ); - - progname = get_progname( argv[0] ); - - if (argc > 1) - { - if ( strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-?") == 0) - { - help(); - exit_nicely(0); - } - if ( strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-V") == 0) - { - puts("pg_dumpall(PostgreSQL) " PG_VERSION); - exit_nicely(0); - } - } - - if ((ret = find_other_exec(argv[0], "pg_dump", PGDUMP_VERSIONSTR, - pg_dump_bin)) < 0) - { - char full_path[MAXPGPATH]; - - if (find_my_exec(argv[0], full_path) < 0) - strlcpy(full_path, progname, sizeof(full_path)); - - if (ret == -1) - fprintf(stderr, - _("The program \"pg_dump\" is needed by %s " - "but was not found in the\n" - "same directory as \"%s\".\n" - "Check your installation.\n"), - progname, full_path); - else - fprintf(stderr, - _("The program \"pg_dump\" was found by \"%s\"\n" - "but was not the same version as %s.\n" - "Check your installation.\n"), - full_path, progname); - exit_nicely(1); - } - - pgdumpopts = createPQExpBuffer(); - - while ((c = getopt_long(argc, argv, "acd:f:gh:l:oOp:rsS:tuU:vwWx", long_options, &optindex)) != -1) - { - switch (c) - { - case 'a': - data_only = true; - appendPQExpBufferStr(pgdumpopts, " -a"); - break; - - case 'c': - output_clean = true; - break; - - case 'd': - connstr = pg_strdup(optarg); - break; - - case 'f': - filename = pg_strdup(optarg); - appendPQExpBufferStr(pgdumpopts, " -f "); - appendShellString(pgdumpopts, filename); - break; - case 'g': - globals_only = true; - break; - - case 'h': - pghost = pg_strdup(optarg); - break; - - case 'l': - pgdb = pg_strdup(optarg); - break; - - case 'o': - appendPQExpBufferStr(pgdumpopts, " -o"); - break; - - case 'O': - appendPQExpBufferStr(pgdumpopts, " -O"); - break; - - case 'p': - pgport = pg_strdup(optarg); - break; - - case 'r': - roles_only = true; - break; - - case 's': - appendPQExpBufferStr(pgdumpopts, " -s"); - break; - - case 'S': - appendPQExpBufferStr(pgdumpopts, " -S "); - appendShellString(pgdumpopts, optarg); - break; - - case 't': - tablespaces_only = true; - break; - +#ifdef __TBASE__ + {"dump-security-data", no_argument, &dump_security_data, 1}, +#endif + {NULL, 0, NULL, 0} + }; + + char *pghost = NULL; + 
char *pgport = NULL; + char *pguser = NULL; + char *pgdb = NULL; + char *use_role = NULL; + trivalue prompt_password = TRI_DEFAULT; + bool data_only = false; + bool globals_only = false; + bool output_clean = false; + bool roles_only = false; + bool tablespaces_only = false; + PGconn *conn; + int encoding; + const char *std_strings; + int c, + ret; + int optindex; + + set_pglocale_pgservice(argv[0], PG_TEXTDOMAIN("pg_dump")); + + progname = get_progname(argv[0]); + + if (argc > 1) + { + if (strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-?") == 0) + { + help(); + exit_nicely(0); + } + if (strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-V") == 0) + { + puts("pg_dumpall (PostgreSQL) " PG_VERSION); + exit_nicely(0); + } + } + + if ((ret = find_other_exec(argv[0], "pg_dump", PGDUMP_VERSIONSTR, + pg_dump_bin)) < 0) + { + char full_path[MAXPGPATH]; + + if (find_my_exec(argv[0], full_path) < 0) + strlcpy(full_path, progname, sizeof(full_path)); + + if (ret == -1) + fprintf(stderr, + _("The program \"pg_dump\" is needed by %s " + "but was not found in the\n" + "same directory as \"%s\".\n" + "Check your installation.\n"), + progname, full_path); + else + fprintf(stderr, + _("The program \"pg_dump\" was found by \"%s\"\n" + "but was not the same version as %s.\n" + "Check your installation.\n"), + full_path, progname); + exit_nicely(1); + } + + if ((ret = find_other_exec(argv[0], "pg_dump_security", PGDUM_SERCURITY_VERSIONSTR, + pg_dump_security_bin)) < 0) + { + char full_path[MAXPGPATH]; + + if (find_my_exec(argv[0], full_path) < 0) + strlcpy(full_path, progname, sizeof(full_path)); + + if (ret == -1) + fprintf(stderr, + _("The program \"pg_dump_security\" is needed by %s " + "but was not found in the\n" + "same directory as \"%s\".\n" + "Check your installation.\n"), + progname, full_path); + else + fprintf(stderr, + _("The program \"pg_dump_security\" was found by \"%s\"\n" + "but was not the same version as %s.\n" + "Check your installation.\n"), + full_path, progname); + exit_nicely(1); + } + + pgdumpopts = createPQExpBuffer(); + + pgdumpsecurityopts = createPQExpBuffer(); + + while ((c = getopt_long(argc, argv, "acd:f:gh:l:oOp:rsS:tuU:vwWx", long_options, &optindex)) != -1) + { + switch (c) + { + case 'a': + data_only = true; + appendPQExpBufferStr(pgdumpopts, " -a"); + break; + + case 'c': + output_clean = true; + break; + + case 'd': + connstr = pg_strdup(optarg); + break; + + case 'f': + filename = pg_strdup(optarg); + appendPQExpBufferStr(pgdumpopts, " -f "); + appendShellString(pgdumpopts, filename); + + appendPQExpBufferStr(pgdumpsecurityopts, " -f "); + appendShellString(pgdumpsecurityopts, filename); + break; + + case 'g': + globals_only = true; + break; + + case 'h': + pghost = pg_strdup(optarg); + appendPQExpBufferStr(pgdumpsecurityopts, " -h"); + appendShellString(pgdumpsecurityopts, pghost); + break; + + case 'l': + pgdb = pg_strdup(optarg); + break; + + case 'o': + appendPQExpBufferStr(pgdumpopts, " -o"); + break; + + case 'O': + appendPQExpBufferStr(pgdumpopts, " -O"); + break; + + case 'p': + pgport = pg_strdup(optarg); + appendPQExpBufferStr(pgdumpsecurityopts, " -p"); + appendShellString(pgdumpsecurityopts, pgport); + break; + + case 'r': + roles_only = true; + break; + + case 's': + appendPQExpBufferStr(pgdumpopts, " -s"); + break; + + case 'S': + appendPQExpBufferStr(pgdumpopts, " -S "); + appendShellString(pgdumpopts, optarg); + break; + + case 't': + tablespaces_only = true; + break; + #ifdef __TBASE__ case 'u': appendPQExpBufferStr(pgdumpopts, " -u"); break; 
#endif - case 'U': - pguser = pg_strdup(optarg); - break; - - case 'v': - verbose = true; - appendPQExpBufferStr(pgdumpopts, " -v"); - break; - - case 'w': - prompt_password = TRI_NO; - appendPQExpBufferStr(pgdumpopts, " -w"); - break; - - case 'W': - prompt_password = TRI_YES; - appendPQExpBufferStr(pgdumpopts, " -W"); - break; - - case 'x': - skip_acls = true; - appendPQExpBufferStr(pgdumpopts, " -x"); - break; - - case 0: - break; - - case 2: - appendPQExpBufferStr(pgdumpopts, " --lock-wait-timeout "); - appendShellString(pgdumpopts, optarg); - break; - - case 3: - use_role = pg_strdup(optarg); - appendPQExpBufferStr(pgdumpopts, " --role "); - appendShellString(pgdumpopts, use_role); - break; - - case 4: - dosync = false; - appendPQExpBufferStr(pgdumpopts, " --no-sync"); - break; - - default: - fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname); - exit_nicely(1); - } - } - - /* Complain if any arguments remain */ - if (optind < argc) - { - fprintf(stderr, _("%s: too many command-line arguments (first is \"%s\")\n"), - progname, argv[optind]); - fprintf(stderr, _("Try \"%s --help\" for more information.\n"), - progname); - exit_nicely(1); - } - - /* Make sure the user hasn't specified a mix of globals-only options */ - if (globals_only && roles_only) - { - fprintf(stderr, _("%s: options -g/--globals-only and -r/--roles-only cannot be used together\n"), - progname); - fprintf(stderr, _("Try \"%s --help\" for more information.\n"), - progname); - exit_nicely(1); - } - - if (globals_only && tablespaces_only) - { - fprintf(stderr, _("%s: options -g/--globals-only and -t/--tablespaces-only cannot be used together\n"), - progname); - fprintf(stderr, _("Try \"%s --help\" for more information.\n"), - progname); - exit_nicely(1); - } - - if (if_exists && !output_clean) - { - fprintf(stderr, _("%s: option --if-exists requires option -c/--clean\n"), - progname); - exit_nicely(1); - } - - if (roles_only && tablespaces_only) - { - fprintf(stderr, _("%s: options -r/--roles-only and -t/--tablespaces-only cannot be used together\n"), - progname); - fprintf(stderr, _("Try \"%s --help\" for more information.\n"), - progname); - exit_nicely(1); - } - - /* - * If password values are not required in the dump, switch to using - * pg_roles which is equally useful, just more likely to have unrestricted - * access than pg_authid. 
- */ - if (no_role_passwords) - sprintf(role_catalog, "%s", PG_ROLES); - else - sprintf(role_catalog, "%s", PG_AUTHID); - - /* Add long options to the pg_dump argument list */ - if (binary_upgrade) - appendPQExpBufferStr(pgdumpopts, " --binary-upgrade"); - if (column_inserts) - appendPQExpBufferStr(pgdumpopts, " --column-inserts"); - if (disable_dollar_quoting) - appendPQExpBufferStr(pgdumpopts, " --disable-dollar-quoting"); - if (disable_triggers) - appendPQExpBufferStr(pgdumpopts, " --disable-triggers"); - if (inserts) - appendPQExpBufferStr(pgdumpopts, " --inserts"); - if (no_tablespaces) - appendPQExpBufferStr(pgdumpopts, " --no-tablespaces"); - if (quote_all_identifiers) - appendPQExpBufferStr(pgdumpopts, " --quote-all-identifiers"); - if (use_setsessauth) - appendPQExpBufferStr(pgdumpopts, " --use-set-session-authorization"); - if (no_publications) - appendPQExpBufferStr(pgdumpopts, " --no-publications"); - if (no_security_labels) - appendPQExpBufferStr(pgdumpopts, " --no-security-labels"); - if (no_subscriptions) - appendPQExpBufferStr(pgdumpopts, " --no-subscriptions"); - if (no_unlogged_table_data) - appendPQExpBufferStr(pgdumpopts, " --no-unlogged-table-data"); + case 'U': + pguser = pg_strdup(optarg); + break; + + case 'v': + verbose = true; + appendPQExpBufferStr(pgdumpopts, " -v"); + appendPQExpBufferStr(pgdumpsecurityopts, " -v"); + break; + + case 'w': + prompt_password = TRI_NO; + appendPQExpBufferStr(pgdumpopts, " -w"); + break; + + case 'W': + prompt_password = TRI_YES; + appendPQExpBufferStr(pgdumpopts, " -W"); + break; + + case 'x': + skip_acls = true; + appendPQExpBufferStr(pgdumpopts, " -x"); + break; + + case 0: + break; + + case 2: + appendPQExpBufferStr(pgdumpopts, " --lock-wait-timeout "); + appendShellString(pgdumpopts, optarg); + break; + + case 3: + use_role = pg_strdup(optarg); + appendPQExpBufferStr(pgdumpopts, " --role "); + appendShellString(pgdumpopts, use_role); + break; + + case 4: + dosync = false; + appendPQExpBufferStr(pgdumpopts, " --no-sync"); + break; + + default: + fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname); + exit_nicely(1); + } + } + + /* Complain if any arguments remain */ + if (optind < argc) + { + fprintf(stderr, _("%s: too many command-line arguments (first is \"%s\")\n"), + progname, argv[optind]); + fprintf(stderr, _("Try \"%s --help\" for more information.\n"), + progname); + exit_nicely(1); + } + + /* Make sure the user hasn't specified a mix of globals-only options */ + if (globals_only && roles_only) + { + fprintf(stderr, _("%s: options -g/--globals-only and -r/--roles-only cannot be used together\n"), + progname); + fprintf(stderr, _("Try \"%s --help\" for more information.\n"), + progname); + exit_nicely(1); + } + + if (globals_only && tablespaces_only) + { + fprintf(stderr, _("%s: options -g/--globals-only and -t/--tablespaces-only cannot be used together\n"), + progname); + fprintf(stderr, _("Try \"%s --help\" for more information.\n"), + progname); + exit_nicely(1); + } + + if (if_exists && !output_clean) + { + fprintf(stderr, _("%s: option --if-exists requires option -c/--clean\n"), + progname); + exit_nicely(1); + } + + if (roles_only && tablespaces_only) + { + fprintf(stderr, _("%s: options -r/--roles-only and -t/--tablespaces-only cannot be used together\n"), + progname); + fprintf(stderr, _("Try \"%s --help\" for more information.\n"), + progname); + exit_nicely(1); + } + + /* + * If password values are not required in the dump, switch to using + * pg_roles which is equally useful, just more 
likely to have unrestricted + * access than pg_authid. + */ + if (no_role_passwords) + sprintf(role_catalog, "%s", PG_ROLES); + else + sprintf(role_catalog, "%s", PG_AUTHID); + + /* Add long options to the pg_dump argument list */ + if (binary_upgrade) + appendPQExpBufferStr(pgdumpopts, " --binary-upgrade"); + if (column_inserts) + appendPQExpBufferStr(pgdumpopts, " --column-inserts"); + if (disable_dollar_quoting) + appendPQExpBufferStr(pgdumpopts, " --disable-dollar-quoting"); + if (disable_triggers) + appendPQExpBufferStr(pgdumpopts, " --disable-triggers"); + if (inserts) + appendPQExpBufferStr(pgdumpopts, " --inserts"); + if (no_tablespaces) + appendPQExpBufferStr(pgdumpopts, " --no-tablespaces"); + if (quote_all_identifiers) + appendPQExpBufferStr(pgdumpopts, " --quote-all-identifiers"); + if (use_setsessauth) + appendPQExpBufferStr(pgdumpopts, " --use-set-session-authorization"); + if (no_publications) + appendPQExpBufferStr(pgdumpopts, " --no-publications"); + if (no_security_labels) + appendPQExpBufferStr(pgdumpopts, " --no-security-labels"); + if (no_subscriptions) + appendPQExpBufferStr(pgdumpopts, " --no-subscriptions"); + if (no_unlogged_table_data) + appendPQExpBufferStr(pgdumpopts, " --no-unlogged-table-data"); #ifdef PGXC if (include_nodes) @@ -586,7 +633,21 @@ main(int argc, char *argv[]) if (!globals_only && !roles_only && !tablespaces_only) dumpDatabases(conn); - PQfinish(conn); + /* + * support to dump security meta data + */ + if (dump_security_data) + { + ret = runPgDumpSecurity(conn, pghost, pgport, pguser, prompt_password); + + if (ret != 0) + { + fprintf(stderr, _("%s: pg_dump_security failed on database \"%s\", exiting\n"), progname, pgdb); + exit_nicely(1); + } + } + + PQfinish(conn); if (verbose) dumpTimestamp("Completed on"); @@ -629,7 +690,8 @@ help(void) printf(_(" -S, --superuser=NAME superuser user name to use in the dump\n")); printf(_(" -t, --tablespaces-only dump only tablespaces, no databases or roles\n")); #ifdef __TBASE__ - printf(_(" -u, --with-dropped-column dump the table schema with dropped columns\n")); + printf(_(" -u, --with-dropped-column dump the table schema with dropped columns\n")); + printf(_(" --dump-security-data dump security meta data\n")); #endif printf(_(" -x, --no-privileges do not dump privileges (grant/revoke)\n")); printf(_(" --binary-upgrade for use by upgrade utilities only\n")); @@ -1832,7 +1894,62 @@ dumpDatabases(PGconn *conn) PQclear(res); } +/* + * run pg_dump_security to dump security metadata + */ +static int +runPgDumpSecurity(PGconn *old_conn, const char *pghost, const char *pgport, + const char *pguser, trivalue prompt_password) +{ + PQExpBuffer cmd = createPQExpBuffer(); + PQExpBuffer buf = createPQExpBuffer(); + PGresult *extnames; + PGconn *new_conn; + int ret; + PGresult *res; + int i; + char *dbname; + + res = executeQuery(old_conn, "SELECT datname FROM pg_database WHERE datallowconn ORDER BY 1"); + + for (i = 0; i < PQntuples(res); i++) + { + dbname = PQgetvalue(res, i, 0); + + new_conn = connectDatabase(dbname, NULL, pghost, pgport, pguser, prompt_password, false); + + extnames = executeQuery(new_conn, "SELECT extname from pg_extension WHERE extname='tbase_mls' ORDERY BY 1"); + if (PQntuples(extnames) > 0) + { + break; + } + } + + fprintf(OPF, "\\c %s mls_admin\n\n", dbname); + + appendPQExpBuffer(cmd, "\"%s\" %s", pg_dump_security_bin, + pgdumpsecurityopts->data); + appendPQExpBufferStr(cmd, " -l"); + + appendShellString(cmd, dbname); + + if (verbose) + fprintf(stderr, _("%s: running \"%s\"\n"), progname, 
cmd->data); + + fflush(stdout); + fflush(stderr); + + ret = system(cmd->data); + + PQclear(res); + PQclear(extnames); + destroyPQExpBuffer(cmd); + destroyPQExpBuffer(buf); + PQfinish(new_conn); + + return ret; +} /* * Run pg_dump on dbname. From b8ad31a95d2ce1f9b6796396d427c582b57d6989 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Wed, 18 Nov 2020 14:29:10 +0800 Subject: [PATCH 075/578] fix pooler log and pgsl_store core --- src/backend/pgxc/pool/poolmgr.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/backend/pgxc/pool/poolmgr.c b/src/backend/pgxc/pool/poolmgr.c index d24a32f4..afa55efc 100644 --- a/src/backend/pgxc/pool/poolmgr.c +++ b/src/backend/pgxc/pool/poolmgr.c @@ -776,9 +776,9 @@ PoolManagerInit() * processes do this.) */ #ifdef HAVE_SETSID - if (setsid() < 0) - elog(LOG, POOL_MGR_PREFIX"setsid() failed: %m"); - //elog(FATAL, POOL_MGR_PREFIX"setsid() failed: %m"); + if (setsid() < 0) + elog(DEBUG1, POOL_MGR_PREFIX"setsid() failed: %m"); + //elog(FATAL, POOL_MGR_PREFIX"setsid() failed: %m"); #endif /* * Properly accept or ignore signals the postmaster might send us From 8f7357232567540514e670bde4f740dd6f7c2415 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Wed, 25 Nov 2020 14:46:50 +0800 Subject: [PATCH 076/578] delete GUC parameter use_data_pump and change pooler log level --- src/backend/pgxc/pool/poolcomm.c | 196 +++++++++++++++---------------- src/backend/pgxc/squeue/squeue.c | 2 +- src/backend/utils/misc/guc.c | 28 ++--- 3 files changed, 108 insertions(+), 118 deletions(-) diff --git a/src/backend/pgxc/pool/poolcomm.c b/src/backend/pgxc/pool/poolcomm.c index c462027a..b1b68e04 100644 --- a/src/backend/pgxc/pool/poolcomm.c +++ b/src/backend/pgxc/pool/poolcomm.c @@ -1302,104 +1302,104 @@ pool_recvres_with_commandID(PoolPort *port, CommandId *cmdID, const char *sql) */ int pool_recvres(PoolPort *port) -{// #lizard forgives - int r; - uint n32 = 0; - uint err = 0; - char buf[SEND_RES_BUFFER_SIZE - POOL_ERR_MSG_LEN]; - char err_msg[POOL_ERR_MSG_LEN]; - int recved_size = 0; - int size = SEND_RES_BUFFER_SIZE - POOL_ERR_MSG_LEN; - char *ptr = buf; - - /* receive message header first */ - for(;;) - { - r = recv(Socket(*port), ptr + recved_size, size - recved_size, 0); - if (r < 0) - { - /* - * Report broken connection - */ - elog(LOG, "recv size %d size %d n32 %d.", recved_size, size, n32); - ereport(LOG, - (errcode_for_socket_access(), - errmsg("could not receive data from client: %m"))); - goto failure; - } - else if (r == 0) - { - if(recved_size == size) - break; - else - goto failure; - } - - recved_size += r; - if(recved_size == size) - break; - - } - /* Verify response */ - if (buf[0] != 's') - { - ereport(LOG, - (errcode(ERRCODE_PROTOCOL_VIOLATION), - errmsg("unexpected message code:%c", buf[0]))); - goto failure; - } - - memcpy(&n32, buf + 1, 4); - n32 = ntohl(n32); - if (n32 != 0) - { - ereport(LOG, - (errcode(ERRCODE_PROTOCOL_VIOLATION), - errmsg("pool_recvres return code:%d", n32))); - } - - memcpy(&err, buf + 5, 4); - err = ntohl(err); - - /* if has err_msg, receive error message */ - if (PoolErrIsValid(err)) - { - ptr = err_msg; - size = POOL_ERR_MSG_LEN; - recved_size = 0; - for(;;) - { - r = recv(Socket(*port), ptr + recved_size, size - recved_size, 0); - if (r < 0) - { - /* - * Report broken connection - */ - elog(LOG, "recv size %d size %d n32 %d.", recved_size, size, n32); - ereport(LOG, - (errcode_for_socket_access(), - errmsg("could not receive data from client: %m"))); - goto failure; - } - else if (r == 0) - { - if(recved_size == size) 
- break; - else - goto failure; - } - - recved_size += r; - if(recved_size == size) - break; - - } - - elog(WARNING, "%s", err_msg); - } - - return n32; - +{ + int r; + uint n32 = 0; + uint err = 0; + char buf[SEND_RES_BUFFER_SIZE - POOL_ERR_MSG_LEN]; + char err_msg[POOL_ERR_MSG_LEN]; + int recved_size = 0; + int size = SEND_RES_BUFFER_SIZE - POOL_ERR_MSG_LEN; + char *ptr = buf; + + /* receive message header first */ + for(;;) + { + r = recv(Socket(*port), ptr + recved_size, size - recved_size, 0); + if (r < 0) + { + /* + * Report broken connection + */ + elog(LOG, "recv size %d size %d n32 %d.", recved_size, size, n32); + ereport(LOG, + (errcode_for_socket_access(), + errmsg("could not receive data from client: %m"))); + goto failure; + } + else if (r == 0) + { + if(recved_size == size) + break; + else + goto failure; + } + + recved_size += r; + if(recved_size == size) + break; + + } + /* Verify response */ + if (buf[0] != 's') + { + ereport(LOG, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg("unexpected message code:%c", buf[0]))); + goto failure; + } + + memcpy(&n32, buf + 1, 4); + n32 = ntohl(n32); + if (n32 != 0) + { + ereport(DEBUG1, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg("pool_recvres return code:%d", n32))); + } + + memcpy(&err, buf + 5, 4); + err = ntohl(err); + + /* if has err_msg, receive error message */ + if (PoolErrIsValid(err)) + { + ptr = err_msg; + size = POOL_ERR_MSG_LEN; + recved_size = 0; + for(;;) + { + r = recv(Socket(*port), ptr + recved_size, size - recved_size, 0); + if (r < 0) + { + /* + * Report broken connection + */ + elog(LOG, "recv size %d size %d n32 %d.", recved_size, size, n32); + ereport(LOG, + (errcode_for_socket_access(), + errmsg("could not receive data from client: %m"))); + goto failure; + } + else if (r == 0) + { + if(recved_size == size) + break; + else + goto failure; + } + + recved_size += r; + if(recved_size == size) + break; + + } + + elog(WARNING, "%s", err_msg); + } + + return n32; + failure: return EOF; } diff --git a/src/backend/pgxc/squeue/squeue.c b/src/backend/pgxc/squeue/squeue.c index ecc25323..515391a3 100644 --- a/src/backend/pgxc/squeue/squeue.c +++ b/src/backend/pgxc/squeue/squeue.c @@ -66,7 +66,7 @@ int SQueueSize = 64; #ifdef __TBASE__ extern ProtocolVersion FrontendProtocol; -bool g_UseDataPump = false;/* Use data pumb, true default. */ +bool g_UseDataPump = true;/* Use data pumb, true default. */ bool g_DataPumpDebug = false;/* enable debug info */ int32 g_SndThreadNum = 8; /* Two sender threads default. */ int32 g_SndThreadBufferSize = 16; /* in Kilo bytes. 
*/ diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index af23b681..825e4725 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -2054,25 +2054,15 @@ static struct config_bool ConfigureNamesBool[] = }, #ifdef __TBASE__ - { - {"enable_statistic", PGC_SIGHUP, STATS_COLLECTOR, - gettext_noop("collect statistic information for debug."), - NULL - }, - &enable_statistic, - false, - NULL, NULL, NULL - }, - - { - {"use_data_pump", PGC_SIGHUP, CUSTOM_OPTIONS, - gettext_noop("use datapump to make data transfer more efficient."), - NULL - }, - &g_UseDataPump, - true, - NULL, NULL, NULL - }, + { + {"enable_statistic", PGC_SIGHUP, STATS_COLLECTOR, + gettext_noop("collect statistic information for debug."), + NULL + }, + &enable_statistic, + false, + NULL, NULL, NULL + }, { {"debug_data_pump", PGC_SIGHUP, CUSTOM_OPTIONS, From c46bec597e4e7e74031fe0a608b9f93a522ef293 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Thu, 26 Nov 2020 10:49:18 +0800 Subject: [PATCH 077/578] add log switch in pool_recvres --- src/backend/pgxc/pool/poolcomm.c | 6 +++--- src/backend/pgxc/pool/poolmgr.c | 30 +++++++++++++++--------------- src/include/pgxc/poolcomm.h | 12 ++++++------ 3 files changed, 24 insertions(+), 24 deletions(-) diff --git a/src/backend/pgxc/pool/poolcomm.c b/src/backend/pgxc/pool/poolcomm.c index b1b68e04..8a70dffb 100644 --- a/src/backend/pgxc/pool/poolcomm.c +++ b/src/backend/pgxc/pool/poolcomm.c @@ -1301,7 +1301,7 @@ pool_recvres_with_commandID(PoolPort *port, CommandId *cmdID, const char *sql) * Return 0 at success or EOF at error. */ int -pool_recvres(PoolPort *port) +pool_recvres(PoolPort *port, bool need_log) { int r; uint n32 = 0; @@ -1351,9 +1351,9 @@ pool_recvres(PoolPort *port) memcpy(&n32, buf + 1, 4); n32 = ntohl(n32); - if (n32 != 0) + if (n32 != 0 && need_log) { - ereport(DEBUG1, + ereport(LOG, (errcode(ERRCODE_PROTOCOL_VIOLATION), errmsg("pool_recvres return code:%d", n32))); } diff --git a/src/backend/pgxc/pool/poolmgr.c b/src/backend/pgxc/pool/poolmgr.c index afa55efc..73cf905c 100644 --- a/src/backend/pgxc/pool/poolmgr.c +++ b/src/backend/pgxc/pool/poolmgr.c @@ -1455,8 +1455,8 @@ PoolManagerSendLocalCommand(int dn_count, int* dn_list, int co_count, int* co_li pool_putmessage(&poolHandle->port, 'b', (char *) buf, (2 + dn_count + co_count) * sizeof(uint32)); pool_flush(&poolHandle->port); - /* Get result */ - return pool_recvres(&poolHandle->port); + /* Get result */ + return pool_recvres(&poolHandle->port, true); } /* @@ -2023,13 +2023,13 @@ PoolManagerCleanConnection(List *datanodelist, List *coordlist, char *dbname, ch RESUME_POOLER_RELOAD(); - /* Receive result message */ - if (pool_recvres(&poolHandle->port) != CLEAN_CONNECTION_COMPLETED) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg(POOL_MGR_PREFIX"Clean connections not completed. HINT: cannot drop the currently open database"))); - } + /* Receive result message */ + if (pool_recvres(&poolHandle->port, true) != CLEAN_CONNECTION_COMPLETED) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg(POOL_MGR_PREFIX"Clean connections not completed. 
HINT: cannot drop the currently open database"))); + } } @@ -2054,7 +2054,7 @@ PoolManagerCheckConnectionInfo(void) pool_putmessage(&poolHandle->port, 'q', NULL, 0); pool_flush(&poolHandle->port); - res = pool_recvres(&poolHandle->port); + res = pool_recvres(&poolHandle->port, true); if (res == POOL_CHECK_SUCCESS) return true; @@ -3874,7 +3874,7 @@ PoolManagerCancelQuery(int dn_count, int* dn_list, int co_count, int* co_list, i pool_putmessage(&poolHandle->port, 'h', (char *) buf, (2 + dn_count + co_count + 1) * sizeof(uint32)); pool_flush(&poolHandle->port); - res = pool_recvres(&poolHandle->port); + res = pool_recvres(&poolHandle->port, false); if (res != (dn_count + co_count)) { @@ -10644,7 +10644,7 @@ PoolManagerRefreshConnectionInfo(void) pool_putmessage(&poolHandle->port, 'R', NULL, 0); pool_flush(&poolHandle->port); - res = pool_recvres(&poolHandle->port); + res = pool_recvres(&poolHandle->port, true); RESUME_POOLER_RELOAD(); @@ -10843,9 +10843,9 @@ PoolManagerClosePooledConnections(const char *dbname, const char *username) pool_flush(&poolHandle->port); - /* Then Get back Pids from Pooler */ - res = pool_recvres(&poolHandle->port); - elog(LOG, "PoolManagerClosePooledConnections res:%d", res); + /* Then Get back Pids from Pooler */ + res = pool_recvres(&poolHandle->port, true); + elog(LOG, "PoolManagerClosePooledConnections res:%d", res); RESUME_POOLER_RELOAD(); diff --git a/src/include/pgxc/poolcomm.h b/src/include/pgxc/poolcomm.h index ab34974c..04fd7e6e 100644 --- a/src/include/pgxc/poolcomm.h +++ b/src/include/pgxc/poolcomm.h @@ -55,11 +55,11 @@ extern int pool_putbytes(PoolPort *port, const char *s, size_t len); extern int pool_flush(PoolPort *port); /*extern int pool_sendfds(PoolPort *port, int *fds, int count);*/ extern int pool_sendfds(PoolPort *port, int *fds, int count, char *errbuf, int32 buf_len); -extern int pool_recvfds(PoolPort *port, int *fds, int count); -extern int pool_sendres(PoolPort *port, int res, char *errbuf, int32 buf_len, bool need_log); -extern int pool_recvres(PoolPort *port); -extern int pool_sendpids(PoolPort *port, int *pids, int count, char *errbuf, int32 buf_len); -extern int pool_recvpids(PoolPort *port, int **pids); -extern int pool_sendres_with_command_id(PoolPort *port, int res, CommandId cmdID, char *errbuf, int32 buf_len, char *errmsg, bool need_log); +extern int pool_recvfds(PoolPort *port, int *fds, int count); +extern int pool_sendres(PoolPort *port, int res, char *errbuf, int32 buf_len, bool need_log); +extern int pool_recvres(PoolPort *port, bool need_log); +extern int pool_sendpids(PoolPort *port, int *pids, int count, char *errbuf, int32 buf_len); +extern int pool_recvpids(PoolPort *port, int **pids); +extern int pool_sendres_with_command_id(PoolPort *port, int res, CommandId cmdID, char *errbuf, int32 buf_len, char *errmsg, bool need_log); extern int pool_recvres_with_commandID(PoolPort *port, CommandId *cmdID, const char *sql); #endif /* POOLCOMM_H */ From df2f355e166a401277fe530b4da46d1cc7a7daa1 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Tue, 24 Nov 2020 20:28:25 +0800 Subject: [PATCH 078/578] fix get_node_list bug (merge request !2) --- src/gtm/client/fe-protocol.c | 101 +++++++++++++++++++++-------------- 1 file changed, 60 insertions(+), 41 deletions(-) diff --git a/src/gtm/client/fe-protocol.c b/src/gtm/client/fe-protocol.c index fb43649c..d545384c 100644 --- a/src/gtm/client/fe-protocol.c +++ b/src/gtm/client/fe-protocol.c @@ -1262,9 +1262,11 @@ result->gr_status = GTM_RESULT_ERROR; break; - case NODE_LIST_RESULT: - { - int 
i; + case NODE_LIST_RESULT: + { + int i; + char *buf = NULL; + int buf_size = 8192; if (gtmpqGetInt(&result->gr_resdata.grd_node_list.num_node, sizeof(int32), conn)) { @@ -1272,48 +1274,65 @@ result->gr_status = GTM_RESULT_ERROR; break; } - for (i = 0; i < result->gr_resdata.grd_node_list.num_node; i++) + buf = (char *) malloc(buf_size); + if (buf == NULL) { - int size; - char buf[8092]; - GTM_PGXCNodeInfo *data = (GTM_PGXCNodeInfo *) malloc(sizeof(GTM_PGXCNodeInfo)); + result->gr_status = GTM_RESULT_ERROR; + printfGTMPQExpBuffer(&conn->errorMessage, "malloc buffer for node list data failed"); + break; + } - if (gtmpqGetInt(&size, sizeof(int32), conn)) - { - result->gr_status = GTM_RESULT_ERROR; - free(data); - break; - } - if (size > 8092) - { - result->gr_status = GTM_RESULT_ERROR; - printfGTMPQExpBuffer(&conn->errorMessage, "buffer size not large enough for node list data"); - free(data); - continue; - } + for (i = 0; i < result->gr_resdata.grd_node_list.num_node; i++) + { + int size; + GTM_PGXCNodeInfo *data = (GTM_PGXCNodeInfo *) malloc(sizeof(GTM_PGXCNodeInfo)); - if (gtmpqGetnchar((char *) &buf, size, conn)) - { - result->gr_status = GTM_RESULT_ERROR; - free(data); - break; - } - if (!gtm_deserialize_pgxcnodeinfo(data, buf, size, &conn->errorMessage)) - { - result->gr_status = GTM_RESULT_ERROR; - free(data); - break; - } - else - { - result->gr_resdata.grd_node_list.nodeinfo[i] = data; - } - } + if (gtmpqGetInt(&size, sizeof(int32), conn)) + { + result->gr_status = GTM_RESULT_ERROR; + free(data); + break; + } - break; - } - case BARRIER_RESULT: - break; + if (size > buf_size) + { + buf = (char *) realloc(buf, size); + if (buf == NULL) + { + result->gr_status = GTM_RESULT_ERROR; + printfGTMPQExpBuffer(&conn->errorMessage, "realloc buffer for node list data failed"); + free(data); + break; + } + buf_size = size; + } + + if (gtmpqGetnchar(buf, size, conn)) + { + result->gr_status = GTM_RESULT_ERROR; + free(data); + break; + } + if (!gtm_deserialize_pgxcnodeinfo(data, buf, size, &conn->errorMessage)) + { + result->gr_status = GTM_RESULT_ERROR; + free(data); + break; + } + else + { + result->gr_resdata.grd_node_list.nodeinfo[i] = data; + } + } + + if (buf != NULL) + { + free(buf); + } + break; + } + case BARRIER_RESULT: + break; case REPORT_XMIN_RESULT: if (gtmpqGetnchar((char *) &result->gr_resdata.grd_report_xmin.latest_completed_xid, From 6abed652955a1a7f7fdabdc6a356809f045a9aae Mon Sep 17 00:00:00 2001 From: sigmalin Date: Tue, 1 Dec 2020 14:19:30 +0800 Subject: [PATCH 079/578] fix ID83728819 gtm coredump --- src/gtm/main/gtm_store.c | 4 ++-- src/gtm/main/main.c | 3 +++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/gtm/main/gtm_store.c b/src/gtm/main/gtm_store.c index 0123a662..8208b72a 100644 --- a/src/gtm/main/gtm_store.c +++ b/src/gtm/main/gtm_store.c @@ -3117,9 +3117,9 @@ ProcessStorageTransferCommand(Port *myport, StringInfo message) g_GTM_Backup_Timer = GTM_AddTimer(LockStoreStandbyCrashHandler, GTM_TIMER_TYPE_ONCE, LOCK_STORE_CRASH_HANDL_TIMEOUT, GetMyThreadInfo); if(g_GTM_Backup_Timer == INVALID_TIMER_HANDLE) { + GTM_RWLockRelease(&g_GTM_Backup_Timer_Lock); elog(ERROR, "Failed to register lock store crash handler, will exit!"); - exit(1); - } + } GTM_RWLockRelease(&g_GTM_Backup_Timer_Lock); /* send xlog replication relative data */ diff --git a/src/gtm/main/main.c b/src/gtm/main/main.c index 78fbcff5..ded1a044 100644 --- a/src/gtm/main/main.c +++ b/src/gtm/main/main.c @@ -3371,6 +3371,9 @@ GTM_ThreadBasebackup(void *argp) if (sigsetjmp(local_sigjmp_buf, 1) 
!= 0) { bool report = false; +#ifdef __TBASE__ + RWLockCleanUp(); +#endif /* * NOTE: if you are tempted to add more code in this if-block, * consider the high probability that it should be in From ca2ea1820d0d4e281aa44b3419f9d02311df7c1d Mon Sep 17 00:00:00 2001 From: whalesong Date: Mon, 26 Oct 2020 14:33:29 +0800 Subject: [PATCH 080/578] bugfix: tpcc district not found fatal --- src/backend/pgxc/pool/execRemote.c | 89 +++++++++++++++++------------- 1 file changed, 50 insertions(+), 39 deletions(-) diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index e3fa18d7..1be26a46 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -7435,6 +7435,18 @@ PreCommit_Remote(char *prepareGID, char *nodestring, bool preparedLocalNode) } +/* + * Whether node need clean: last command is not finished + * 'Z' message: ready for query + * 'C' message: command complete + */ +static inline bool +node_need_clean(PGXCNodeHandle *handle) +{ + return handle->state != DN_CONNECTION_STATE_IDLE || + (('Z' != handle->last_command) && ('C' != handle->last_command)); +} + /* * Do abort processing for the transaction. We must abort the transaction on * all the involved nodes. If a node has already prepared a transaction, we run @@ -7495,21 +7507,21 @@ PreAbort_Remote(TranscationType txn_type, bool need_release_handle) { PGXCNodeHandle *handle = all_handles->coord_handles[i]; if (handle->sock != NO_SOCKET) - { - if ((handle->state != DN_CONNECTION_STATE_IDLE) || !node_ready_for_query(handle)) - { - /* - * Forget previous combiner if any since input will be handled by - * different one. - */ - handle->combiner = NULL; - clean_nodes[node_count++] = handle; - cancel_co_list[cancel_co_count++] = PGXCNodeGetNodeId(handle->nodeoid, NULL); - -#ifdef _PG_REGRESS_ - ereport(LOG, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("PreAbort_Remote node:%s pid:%d status:%d need clean.", handle->nodename, handle->backend_pid, handle->state))); + { + if (node_need_clean(handle)) + { + /* + * Forget previous combiner if any since input will be handled by + * different one. 
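+ * The handle is then queued in clean_nodes[] and its node id added to
+ * cancel_co_list[] so the in-flight request can be cancelled and the
+ * connection cleaned up below.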
+ */ + handle->combiner = NULL; + clean_nodes[node_count++] = handle; + cancel_co_list[cancel_co_count++] = PGXCNodeGetNodeId(handle->nodeoid, NULL); + +#ifdef _PG_REGRESS_ + ereport(LOG, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("PreAbort_Remote node:%s pid:%d status:%d need clean.", handle->nodename, handle->backend_pid, handle->state))); #endif if (handle->in_extended_query) { @@ -7561,15 +7573,14 @@ PreAbort_Remote(TranscationType txn_type, bool need_release_handle) { PGXCNodeHandle *handle = all_handles->datanode_handles[i]; if (handle->sock != NO_SOCKET) - { - if (handle->state == DN_CONNECTION_STATE_COPY_IN || - handle->state == DN_CONNECTION_STATE_COPY_OUT || - !node_ready_for_query(handle)) - { -#ifdef _PG_REGRESS_ - ereport(LOG, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("PreAbort_Remote node:%s pid:%d status:%d need clean.", handle->nodename, handle->backend_pid, handle->state))); + { + if (handle->state == DN_CONNECTION_STATE_COPY_IN || + handle->state == DN_CONNECTION_STATE_COPY_OUT) + { +#ifdef _PG_REGRESS_ + ereport(LOG, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("PreAbort_Remote node:%s pid:%d status:%d need clean.", handle->nodename, handle->backend_pid, handle->state))); #endif if (handle->in_extended_query) { @@ -7605,21 +7616,21 @@ PreAbort_Remote(TranscationType txn_type, bool need_release_handle) clean_nodes[node_count++] = handle; cancel_dn_list[cancel_dn_count++] = PGXCNodeGetNodeId(handle->nodeoid, NULL); } -#endif - } - else if (handle->state != DN_CONNECTION_STATE_IDLE) - { - /* - * Forget previous combiner if any since input will be handled by - * different one. - */ - handle->combiner = NULL; - clean_nodes[node_count++] = handle; - cancel_dn_list[cancel_dn_count++] = PGXCNodeGetNodeId(handle->nodeoid, NULL); -#ifdef _PG_REGRESS_ - ereport(LOG, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("PreAbort_Remote node:%s pid:%d status:%d need clean.", handle->nodename, handle->backend_pid, handle->state))); +#endif + } + else if (node_need_clean(handle)) + { + /* + * Forget previous combiner if any since input will be handled by + * different one. + */ + handle->combiner = NULL; + clean_nodes[node_count++] = handle; + cancel_dn_list[cancel_dn_count++] = PGXCNodeGetNodeId(handle->nodeoid, NULL); +#ifdef _PG_REGRESS_ + ereport(LOG, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("PreAbort_Remote node:%s pid:%d status:%d need clean.", handle->nodename, handle->backend_pid, handle->state))); #endif if (handle->in_extended_query) From 3c2753ef45b562900d6d427e8fdea377d29a1e83 Mon Sep 17 00:00:00 2001 From: youngxie Date: Thu, 3 Dec 2020 10:02:50 +0800 Subject: [PATCH 081/578] fix bug of vacuum_freeze in interval partition table, tapd :http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131083556663 (cherry picked from commit b8eb6d7b) 1204141f add comments. 21706b73 add comment 2a1ee817 Fix vacuum of toast table. 
09e34c29 fix bug of vacuum_freeze in interval partition table, tapd :http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131083556663 --- src/backend/catalog/toasting.c | 99 ++++++------- src/backend/commands/vacuum.c | 225 +++++++++++++++--------------- src/backend/commands/vacuumlazy.c | 35 ++--- 3 files changed, 181 insertions(+), 178 deletions(-) diff --git a/src/backend/catalog/toasting.c b/src/backend/catalog/toasting.c index b9ebd095..d908bfc3 100644 --- a/src/backend/catalog/toasting.c +++ b/src/backend/catalog/toasting.c @@ -457,56 +457,57 @@ create_toast_table(Relation rel, Oid toastOid, Oid toastIndexOid, * (1) there are any toastable attributes, and (2) the maximum length * of a tuple could exceed TOAST_TUPLE_THRESHOLD. (We don't want to * create a toast table for something like "f1 varchar(20)".) + * No need to create a TOAST table for partitioned tables. */ static bool needs_toast_table(Relation rel) -{// #lizard forgives - int32 data_length = 0; - bool maxlength_unknown = false; - bool has_toastable_attrs = false; - TupleDesc tupdesc; - Form_pg_attribute *att; - int32 tuple_length; - int i; - - /* No TOAST for partitioned tables */ - if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) - { - return false; - } - - tupdesc = rel->rd_att; - att = tupdesc->attrs; - - for (i = 0; i < tupdesc->natts; i++) - { - if (att[i]->attisdropped) - continue; - data_length = att_align_nominal(data_length, att[i]->attalign); - if (att[i]->attlen > 0) - { - /* Fixed-length types are never toastable */ - data_length += att[i]->attlen; - } - else - { - int32 maxlen = type_maximum_size(att[i]->atttypid, - att[i]->atttypmod); - - if (maxlen < 0) - maxlength_unknown = true; - else - data_length += maxlen; - if (att[i]->attstorage != 'p') - has_toastable_attrs = true; - } - } - if (!has_toastable_attrs) - return false; /* nothing to toast? */ - if (maxlength_unknown) - return true; /* any unlimited-length attrs? */ - tuple_length = MAXALIGN(SizeofHeapTupleHeader + - BITMAPLEN(tupdesc->natts)) + - MAXALIGN(data_length); - return (tuple_length > TOAST_TUPLE_THRESHOLD); +{ + int32 data_length = 0; + bool maxlength_unknown = false; + bool has_toastable_attrs = false; + TupleDesc tupdesc; + Form_pg_attribute *att; + int32 tuple_length; + int i; + + /* No TOAST for partitioned tables */ + if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) + { + return false; + } + + tupdesc = rel->rd_att; + att = tupdesc->attrs; + + for (i = 0; i < tupdesc->natts; i++) + { + if (att[i]->attisdropped) + continue; + data_length = att_align_nominal(data_length, att[i]->attalign); + if (att[i]->attlen > 0) + { + /* Fixed-length types are never toastable */ + data_length += att[i]->attlen; + } + else + { + int32 maxlen = type_maximum_size(att[i]->atttypid, + att[i]->atttypmod); + + if (maxlen < 0) + maxlength_unknown = true; + else + data_length += maxlen; + if (att[i]->attstorage != 'p') + has_toastable_attrs = true; + } + } + if (!has_toastable_attrs) + return false; /* nothing to toast? */ + if (maxlength_unknown) + return true; /* any unlimited-length attrs? 
*/ + tuple_length = MAXALIGN(SizeofHeapTupleHeader + + BITMAPLEN(tupdesc->natts)) + + MAXALIGN(data_length); + return (tuple_length > TOAST_TUPLE_THRESHOLD); } diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c index 15fc537e..5fc12531 100644 --- a/src/backend/commands/vacuum.c +++ b/src/backend/commands/vacuum.c @@ -1429,124 +1429,125 @@ vacuum_rel(Oid relid, RangeVar *relation, int options, VacuumParams *params) #else else if (onerel->rd_rel->relnamespace == PG_CATALOG_NAMESPACE) #endif - ereport(WARNING, - (errmsg("skipping \"%s\" --- only superuser or database owner can vacuum it", - RelationGetRelationName(onerel)))); - else - ereport(WARNING, - (errmsg("skipping \"%s\" --- only table or database owner can vacuum it", - RelationGetRelationName(onerel)))); - relation_close(onerel, lmode); - PopActiveSnapshot(); - CommitTransactionCommand(); - return false; - } - - /* - * Check that it's a vacuumable relation; we used to do this in - * get_rel_oids() but seems safer to check after we've locked the - * relation. - */ - if (onerel->rd_rel->relkind != RELKIND_RELATION && - onerel->rd_rel->relkind != RELKIND_MATVIEW && - onerel->rd_rel->relkind != RELKIND_TOASTVALUE && - onerel->rd_rel->relkind != RELKIND_PARTITIONED_TABLE) - { - ereport(WARNING, - (errmsg("skipping \"%s\" --- cannot vacuum non-tables or special system tables", - RelationGetRelationName(onerel)))); - relation_close(onerel, lmode); - PopActiveSnapshot(); - CommitTransactionCommand(); - return false; - } - - /* - * Silently ignore tables that are temp tables of other backends --- - * trying to vacuum these will lead to great unhappiness, since their - * contents are probably not up-to-date on disk. (We don't throw a - * warning here; it would just lead to chatter during a database-wide - * VACUUM.) - */ - if (RELATION_IS_OTHER_TEMP(onerel)) - { - relation_close(onerel, lmode); - PopActiveSnapshot(); - CommitTransactionCommand(); - return false; - } - - /* - * Get a session-level lock too. This will protect our access to the - * relation across multiple transactions, so that we can vacuum the - * relation's TOAST table (if any) secure in the knowledge that no one is - * deleting the parent relation. - * - * NOTE: this cannot block, even if someone else is waiting for access, - * because the lock manager knows that both lock requests are from the - * same process. - */ - onerelid = onerel->rd_lockInfo.lockRelId; - LockRelationIdForSession(&onerelid, lmode); - - /* - * Remember the relation's TOAST relation for later, if the caller asked - * us to process it. In VACUUM FULL, though, the toast table is - * automatically rebuilt by cluster_rel so we shouldn't recurse to it. - */ - if (!(options & VACOPT_SKIPTOAST) && !(options & VACOPT_FULL)) - toast_relid = onerel->rd_rel->reltoastrelid; - else - toast_relid = InvalidOid; - - /* - * Switch to the table owner's userid, so that any index functions are run - * as that user. Also lock down security-restricted operations and - * arrange to make GUC variable changes local to this command. (This is - * unnecessary, but harmless, for lazy VACUUM.) 
- */ - GetUserIdAndSecContext(&save_userid, &save_sec_context); - SetUserIdAndSecContext(onerel->rd_rel->relowner, - save_sec_context | SECURITY_RESTRICTED_OPERATION); - save_nestlevel = NewGUCNestLevel(); + ereport(WARNING, + (errmsg("skipping \"%s\" --- only superuser or database owner can vacuum it", + RelationGetRelationName(onerel)))); + else + ereport(WARNING, + (errmsg("skipping \"%s\" --- only table or database owner can vacuum it", + RelationGetRelationName(onerel)))); + relation_close(onerel, lmode); + PopActiveSnapshot(); + CommitTransactionCommand(); + return false; + } - /* - * Ignore partitioned tables as there is no work to be done. Since we - * release the lock here, it's possible that any partitions added from - * this point on will not get processed, but that seems harmless. - */ - if (onerel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) - { - /* Roll back any GUC changes executed by index functions */ - AtEOXact_GUC(false, save_nestlevel); + /* + * Check that it's a vacuumable relation; we used to do this in + * get_rel_oids() but seems safer to check after we've locked the + * relation. + */ + if (onerel->rd_rel->relkind != RELKIND_RELATION && + onerel->rd_rel->relkind != RELKIND_MATVIEW && + onerel->rd_rel->relkind != RELKIND_TOASTVALUE && + onerel->rd_rel->relkind != RELKIND_PARTITIONED_TABLE) + { + ereport(WARNING, + (errmsg("skipping \"%s\" --- cannot vacuum non-tables or special system tables", + RelationGetRelationName(onerel)))); + relation_close(onerel, lmode); + PopActiveSnapshot(); + CommitTransactionCommand(); + return false; + } - /* Restore userid and security context */ - SetUserIdAndSecContext(save_userid, save_sec_context); + /* + * Silently ignore tables that are temp tables of other backends --- + * trying to vacuum these will lead to great unhappiness, since their + * contents are probably not up-to-date on disk. (We don't throw a + * warning here; it would just lead to chatter during a database-wide + * VACUUM.) + */ + if (RELATION_IS_OTHER_TEMP(onerel)) + { + relation_close(onerel, lmode); + PopActiveSnapshot(); + CommitTransactionCommand(); + return false; + } - relation_close(onerel, NoLock); - PopActiveSnapshot(); - CommitTransactionCommand(); + /* + * Get a session-level lock too. This will protect our access to the + * relation across multiple transactions, so that we can vacuum the + * relation's TOAST table (if any) secure in the knowledge that no one is + * deleting the parent relation. + * + * NOTE: this cannot block, even if someone else is waiting for access, + * because the lock manager knows that both lock requests are from the + * same process. + */ + onerelid = onerel->rd_lockInfo.lockRelId; + LockRelationIdForSession(&onerelid, lmode); + + /* + * Remember the relation's TOAST relation for later, if the caller asked + * us to process it. In VACUUM FULL, though, the toast table is + * automatically rebuilt by cluster_rel so we shouldn't recurse to it. + */ + if (!(options & VACOPT_SKIPTOAST) && !(options & VACOPT_FULL)) + toast_relid = onerel->rd_rel->reltoastrelid; + else + toast_relid = InvalidOid; + + /* + * Switch to the table owner's userid, so that any index functions are run + * as that user. Also lock down security-restricted operations and + * arrange to make GUC variable changes local to this command. (This is + * unnecessary, but harmless, for lazy VACUUM.) 
+ */ + GetUserIdAndSecContext(&save_userid, &save_sec_context); + SetUserIdAndSecContext(onerel->rd_rel->relowner, + save_sec_context | SECURITY_RESTRICTED_OPERATION); + save_nestlevel = NewGUCNestLevel(); + + /* + * Ignore partitioned tables as there is no work to be done. Since we + * release the lock here, it's possible that any partitions added from + * this point on will not get processed, but that seems harmless. + */ + if (onerel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) + { + /* Roll back any GUC changes executed by index functions */ + AtEOXact_GUC(false, save_nestlevel); + + /* Restore userid and security context */ + SetUserIdAndSecContext(save_userid, save_sec_context); + + relation_close(onerel, NoLock); + PopActiveSnapshot(); + CommitTransactionCommand(); + + /* + * If the relation has a secondary toast rel, vacuum that too while we + * still hold the session lock on the master table. Note however that + * "analyze" will not get done on the toast table. This is good, because + * the toaster always uses hardcoded index access and statistics are + * totally unimportant for toast relations. + */ + if (toast_relid != InvalidOid) + { + vacuum_rel(toast_relid, relation, options, params); + } - /* - * If the relation has a secondary toast rel, vacuum that too while we - * still hold the session lock on the master table. Note however that - * "analyze" will not get done on the toast table. This is good, because - * the toaster always uses hardcoded index access and statistics are - * totally unimportant for toast relations. - */ - if (toast_relid != InvalidOid) - { - vacuum_rel(toast_relid, relation, options, params); - } + /* + * Now release the session-level lock on the master table. + */ + UnlockRelationIdForSession(&onerelid, lmode); - /* - * Now release the session-level lock on the master table. 
- */ - UnlockRelationIdForSession(&onerelid, lmode); + /* It's OK for other commands to look at this table */ + return true; + } - /* It's OK for other commands to look at this table */ - return true; - } #ifdef XCP /* diff --git a/src/backend/commands/vacuumlazy.c b/src/backend/commands/vacuumlazy.c index 0192cdc6..1e72ec49 100644 --- a/src/backend/commands/vacuumlazy.c +++ b/src/backend/commands/vacuumlazy.c @@ -187,20 +187,20 @@ lazy_vacuum_interval_rel(Relation onerel, VacuumParams *params) int nindexes; Relation *Irel; bool hasindex; - TransactionId oldestXmin = InvalidTransactionId; - TransactionId freezeLimit = InvalidTransactionId; - MultiXactId multiXactCutoff = InvalidMultiXactId; + TransactionId oldestXmin = InvalidTransactionId; + TransactionId freezeLimit = InvalidTransactionId; + MultiXactId multiXactCutoff = InvalidMultiXactId; - if (params && onerel->rd_rel->relkind != RELKIND_PARTITIONED_TABLE) - { - vacuum_set_xid_limits(onerel, + if (params && onerel->rd_rel->relkind != RELKIND_PARTITIONED_TABLE) + { + vacuum_set_xid_limits(onerel, params->freeze_min_age, params->freeze_table_age, params->multixact_freeze_min_age, params->multixact_freeze_table_age, &oldestXmin, &freezeLimit, NULL, &multiXactCutoff, NULL); - } + } childs = RelationGetAllPartitions(onerel); @@ -260,14 +260,15 @@ lazy_vacuum_interval_rel(Relation onerel, VacuumParams *params) pgstat_progress_update_param(PROGRESS_VACUUM_PHASE, PROGRESS_VACUUM_PHASE_FINAL_CLEANUP); - vac_update_relstats(onerel, - pages, - tuples, - visiblepages, - hasindex, - freezeLimit, - multiXactCutoff, - false); + /* save changes */ + vac_update_relstats(onerel, + pages, + tuples, + visiblepages, + hasindex, + freezeLimit, + multiXactCutoff, + false); pgstat_report_vacuum(RelationGetRelid(onerel), onerel->rd_rel->relisshared, @@ -313,10 +314,10 @@ lazy_vacuum_rel(Relation onerel, int options, VacuumParams *params, Assert(params != NULL); #ifdef __TBASE__ + /* update statistic info for interval partition parent table */ if (RELATION_IS_INTERVAL(onerel)) { - /* update statistic info for interval partition parent table */ - lazy_vacuum_interval_rel(onerel, params); + lazy_vacuum_interval_rel(onerel, params); return; } #endif From c5a7f0ec3cfcab9617d02007425f57a30b0a14d6 Mon Sep 17 00:00:00 2001 From: jackywpxie Date: Wed, 2 Dec 2020 22:14:50 +0800 Subject: [PATCH 082/578] jacky/feature/MaintainGTS_Tbase_v2.15.16 (merge request !12) Squash merge branch 'jacky/feature/MaintainGTS_Tbase_v2.15.16' into 'Tbase_v2.15.16' * Revert 'fixed bug for persistent datanode connections.' * MaintainGTS supports unlogged table. * fixed bug for persistent datanode connections. * add {} * Merge branch 'Tbase_v2.15.16' into jacky/feature/MaintainGTS_Tbase_v2.15.16 * rollback modification * Revert 'bugfix: tpcc district not found fatal' * bugfix: tpcc district not found fatal * delete extension: reset_gts * clear the modification to buffer.h and buffer.c * fixed bug: endless loop * modified according to xiecanyang's suggestion. * delete damaged_gts * add damaged_gts test option * add damage_gts * fixed bug: count not open tlog file when the tuple has been frozen. * delete a comment. * modified the comment of PostmasterIsPrimaryAndNormal. * rename PostmasterIsAlive to PostmasterIsPrimaryAndNormal * modified code format * delete enable_satisfies_any * delete pg_memory_barrier() * delete space. * add pg_memory_barrier() * fixed a error of going back. * go back to before fixing the bug of persistent_datanode_connection. 
* fixed bugs: insert abort when persistent_datanode_connections = on.
* correct a typo
* adjusted code format.
* fixed bugs:
* adjusted code format.
* roll back the modification of ReadBuffer_common.
* mkdir maintain for trace log.
* fixed bug: release clog lock.
* printData/printStack call audit_log_trace.
* check and reset GTS before and after vacuum pages.
* add trace log according to audit fga log.
* optimized code format.
* 1. adjusted code format: such as line breaks, etc.
* rollback: not fully tested
* reduce if logical judgement.
* modified according to jason's suggestion
* deal with special GTS
* modified according to code review comments.
* comment memory barrier.
* add GTS values: 3, 4.
* print the line number and file name of error stack.
* reset_gts = 1:
* fixed bug: Could not open file 'pg_commit_ts/XXX': No such file or
* reduce unnecessary logs.
* fixed bug: set persistent_datanode_connections to on, insert transaction
* support heap_page_reset_gts(get_buffer('table_name', page_number));
* 1. fix bug about errmsg, 'database tbase does not exist', in pg_log.
* solve the problem of GTS output big integer out of bounds.
* remove dependency on Kernel
* pg_archivecleanup support removing the .gts file.
* pg_waldump ... -r transaction command support GTS.
* rename tbase_gts to tbase_gts_tools
* fix bug: heap_page_items can not output t_data when page id is not normal.
* initialize values
* shuiwu20201029_2
* refactoring functions to simplify code.
* add tbase_gts extension in the Makefile of extensions.
* delete enable_satisfies_any from GUC
* modified txid_gts.
* add heap_page_reset_gts()
* add xmin_gts and xmax_gts in extension function
* add extension function heap_page_items_with_gts.
* add tbase_gts extension
* add enable_satisfies_any
* fix bug:
* print correct CTID.
* make changes according to code viewing suggestions.
* 1. Print log when GTS is inserted into heaptuple
* add ctid information while checking GTS
* use __sync_synchronize() to prevent CPU reordering and compiler (see the ring-buffer sketch after this list)
* 1. increase log information when gts is incorrect.
* When gts is not set, its correctness is not checked.
* check the correctness of GTS before writing pages.
* fix bug ID82284643: GTS is not used for index and system tables.
* fix bug ID82284643: reduce locks of checking GTS when reading pages.
* fix bug ID82284643: check and reset tuple's xmin_gts and xmax_gts according to the gts in tlog through write and read data page.
* fix bug ID82284643: check and reset tuple's xmin_gts and xmax_gts according to the gts in tlog through vacuum operation.
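Several of the items above (the pg_memory_barrier() / __sync_synchronize() changes) and the AlogQueue test code added below (contrib/audit_test/audit_test_AlogQueue.c) revolve around ring buffers whose head and tail indexes are shared between a producer and a consumer: the payload has to be copied into the array before the index that publishes it is advanced, and a memory barrier enforces that ordering. The following is a minimal, self-contained sketch of that pattern only; the names ring_t, ring_push and ring_pop are illustrative and not part of the patch, and __sync_synchronize() merely stands in for the in-tree pg_memory_barrier().

    #include <stdio.h>

    #define RING_SIZE 16            /* one slot always stays empty */

    typedef struct
    {
        volatile int head;          /* consumer advances head */
        volatile int tail;          /* producer advances tail */
        char         area[RING_SIZE];
    } ring_t;

    /* bytes currently stored */
    static int
    ring_used(const ring_t *r)
    {
        return (r->tail - r->head + RING_SIZE) % RING_SIZE;
    }

    /* bytes that can still be pushed (capacity is RING_SIZE - 1) */
    static int
    ring_remain(const ring_t *r)
    {
        return RING_SIZE - 1 - ring_used(r);
    }

    /* producer: copy the payload first, then publish the new tail */
    static int
    ring_push(ring_t *r, const char *buf, int len)
    {
        int tail = r->tail;
        int i;

        if (ring_remain(r) < len)
            return 0;               /* not enough room; caller may retry later */

        for (i = 0; i < len; i++)
            r->area[(tail + i) % RING_SIZE] = buf[i];

        __sync_synchronize();       /* payload must be visible before the tail moves */
        r->tail = (tail + len) % RING_SIZE;
        return 1;
    }

    /* consumer: read the published tail, copy the payload out, then free the space */
    static int
    ring_pop(ring_t *r, char *out, int maxlen)
    {
        int n = ring_used(r);
        int i;

        if (n > maxlen)
            n = maxlen;

        __sync_synchronize();       /* read the payload only after reading the tail */
        for (i = 0; i < n; i++)
            out[i] = r->area[(r->head + i) % RING_SIZE];

        __sync_synchronize();       /* finish reading before the space is reused */
        r->head = (r->head + n) % RING_SIZE;
        return n;
    }

    int
    main(void)
    {
        ring_t  r = { 0, 0, "" };
        char    out[RING_SIZE];
        int     n;

        if (!ring_push(&r, "audit", 5))
            return 1;
        n = ring_pop(&r, out, (int) sizeof(out));
        printf("popped %d bytes: %.5s\n", n, out);
        return 0;
    }

Keeping one slot unused (capacity RING_SIZE - 1) is what lets head == tail mean "empty" and (tail + 1) % RING_SIZE == head mean "full"; the AlogQueue helpers in the new file below use the same convention.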
--- contrib/Makefile | 1 + contrib/audit_test/audit_test_AlogQueue.c | 839 ++++++ contrib/pageinspect/heapfuncs.c | 126 +- contrib/pgxc_ctl/make_signature | 0 contrib/tbase_gts_tools/Makefile | 23 + .../tbase_gts_tools/tbase_gts_tools--1.0.sql | 104 + contrib/tbase_gts_tools/tbase_gts_tools.c | 357 +++ .../tbase_gts_tools/tbase_gts_tools.control | 5 + src/backend/access/rmgrdesc/xactdesc.c | 70 +- src/backend/access/transam/commit_ts.c | 178 +- src/backend/access/transam/gtm.c | 44 +- src/backend/commands/vacuum.c | 4 +- src/backend/commands/vacuumlazy.c | 1060 +++++--- src/backend/main/main.c | 59 +- src/backend/pgxc/pool/poolmgr.c | 26 +- src/backend/postmaster/auditlogger.c | 2313 ++++++++++------- src/backend/postmaster/postmaster.c | 20 + src/backend/replication/logical/decode.c | 10 +- src/backend/utils/cache/relcache.c | 91 + src/backend/utils/misc/guc.c | 48 +- src/backend/utils/misc/postgresql.conf.sample | 9 + src/backend/utils/time/tqual.c | 68 +- src/bin/pg_archivecleanup/pg_archivecleanup.c | 188 +- src/include/access/htup_details.h | 5 + src/include/bootstrap/bootstrap.h | 3 +- src/include/commands/vacuum.h | 24 +- src/include/postmaster/auditlogger.h | 25 +- src/include/postmaster/postmaster.h | 1 + src/include/utils/relcache.h | 2 + 29 files changed, 3987 insertions(+), 1716 deletions(-) create mode 100644 contrib/audit_test/audit_test_AlogQueue.c mode change 100644 => 100755 contrib/pgxc_ctl/make_signature create mode 100644 contrib/tbase_gts_tools/Makefile create mode 100644 contrib/tbase_gts_tools/tbase_gts_tools--1.0.sql create mode 100644 contrib/tbase_gts_tools/tbase_gts_tools.c create mode 100644 contrib/tbase_gts_tools/tbase_gts_tools.control diff --git a/contrib/Makefile b/contrib/Makefile index 9c5df8bf..1d0dcd37 100644 --- a/contrib/Makefile +++ b/contrib/Makefile @@ -46,6 +46,7 @@ SUBDIRS = \ seg \ spi \ tablefunc \ + tbase_gts_tools \ tcn \ test_decoding \ tsm_system_rows \ diff --git a/contrib/audit_test/audit_test_AlogQueue.c b/contrib/audit_test/audit_test_AlogQueue.c new file mode 100644 index 00000000..6b81ad85 --- /dev/null +++ b/contrib/audit_test/audit_test_AlogQueue.c @@ -0,0 +1,839 @@ +/* + * contrib/audit_test/audit_test.c + */ + +#include "postgres_fe.h" + +#include "libpq-fe.h" +#include "pg_getopt.h" +#include "port/atomics.h" +#include +#include +#include +#include + +#define TestAlogProducerCount 1000 +#define TestAlogQueueSize 1200 +#define TestAlogBuffSize 40960 +#define TestAlogFileSize 102400000 + +#ifdef Assert +#undef Assert +#endif + +#define Assert assert + +typedef struct TestAuditLogQueue +{ + pid_t q_pid; + int q_size; + char q_lock; + volatile int q_head; + volatile int q_tail; + char q_area[FLEXIBLE_ARRAY_MEMBER]; +} AlogQueue; + +static int shared_queue_idx[TestAlogProducerCount] = { 0 }; +static AlogQueue * shared_queue [TestAlogProducerCount] = { 0 }; +static AlogQueue * local_cache = NULL; +static char * alog_file_name = "test_alog.txt"; +static FILE * alog_file_fp = NULL; + +static char * alog_queue_offset_to(AlogQueue * queue, int offset); +static bool alog_queue_is_full(int q_size, int q_head, int q_tail); +static bool alog_queue_is_empty(int q_size, int q_head, int q_tail); +static bool alog_queue_is_enough(int q_size, int q_head, int q_tail, int N); +static int alog_queue_remain(int q_size, int q_head, int q_tail); +static int alog_queue_used(int q_size, int q_head, int q_tail); +static bool alog_queue_push(AlogQueue * queue, char * buff, int len); +static bool alog_queue_push2(AlogQueue * queue, char * buff1, int len1, 
char * buff2, int len2); +static bool alog_queue_pushn(AlogQueue * queue, char * buff[], int len[], int n); +static int alog_queue_get_str_len(AlogQueue * queue, int offset); +static void alog_queue_clear_str_len(AlogQueue * queue, int offset); +static bool alog_queue_pop_to_queue(AlogQueue * from, AlogQueue * to); +static bool alog_queue_pop_to_file(AlogQueue * from, FILE * logfile); +static int alog_write_log_file(const char *buffer, int count, FILE * logfile); +static int alog_random_string(char buff[TestAlogBuffSize]); + +int test_alog(); +int test_alog0(); + + +/* -------------------------------- + * AlogQueue routines + * -------------------------------- + */ + +/* + * Get a write pointer in queue + */ +static char * alog_queue_offset_to(AlogQueue * queue, int offset) +{ + char * start = (char *) queue; + + Assert(offset >= 0 && offset < queue->q_size); + + start += offsetof(AlogQueue, q_area); + start += offset; + + return start; +} + +static bool alog_queue_is_full(int q_size, int q_head, int q_tail) +{ + Assert(q_size > 0 && q_head >= 0 && q_tail >= 0); + Assert(q_head < q_size && q_tail < q_size); + + if ((q_tail + 1) % q_size == q_head) + { + return true; + } + else + { + return false; + } +} + +static bool alog_queue_is_empty(int q_size, int q_head, int q_tail) +{ + Assert(q_size > 0 && q_head >= 0 && q_tail >= 0); + Assert(q_head < q_size && q_tail < q_size); + + if (q_tail == q_head) + { + return true; + } + else + { + return false; + } +} + +/* + * how many bytes already in used + */ + +static int alog_queue_used(int q_size, int q_head, int q_tail) +{ + int used = (q_tail - q_head + q_size) % q_size; + + Assert(q_size > 0 && q_head >= 0 && q_tail >= 0); + Assert(q_head < q_size && q_tail < q_size); + + return used; +} + + +/* + * how many bytes remain in Queue + */ +static int alog_queue_remain(int q_size, int q_head, int q_tail) +{ + int remain = (q_head - q_tail + q_size - 1) % q_size; + + Assert(q_size > 0 && q_head >= 0 && q_tail >= 0); + Assert(q_head < q_size && q_tail < q_size); + Assert(remain == (q_size - 1) - ((q_tail - q_head + q_size) % q_size)); + Assert(remain == (q_size - 1) - alog_queue_used(q_size, q_head, q_tail)); + + return remain; +} + +/* + * whether queue has enough space for N bytes ? 
+ */ +static bool alog_queue_is_enough(int q_size, int q_head, int q_tail, int N) +{ + int remain = alog_queue_remain(q_size, q_head, q_tail); + + Assert(q_size > 0 && q_head >= 0 && q_tail >= 0 && N > 0); + Assert(q_head < q_size && q_tail < q_size); + + if (remain > N) + { + return true; + } + + return false; +} + +/* + * write buff to queue + * + * len = size(int) + strlen(str) + * + */ +static bool alog_queue_push(AlogQueue * queue, char * buff, int len) +{ + char * buff_array [] = { buff }; + int len_array [] = { len }; + + return alog_queue_pushn(queue, buff_array, len_array, sizeof(len_array)/sizeof(len_array[0])); +} + +/* + * write buff1 and buff2 to queue + */ +static bool alog_queue_push2(AlogQueue * queue, char * buff1, int len1, char * buff2, int len2) +{ + char * buff_array[] = {buff1, buff2}; + int len_array[] = {len1, len2}; + + return alog_queue_pushn(queue, buff_array, len_array, sizeof(len_array)/sizeof(len_array[0])); +} + +static bool alog_queue_pushn(AlogQueue * queue, char * buff[], int len[], int n) +{ + volatile int q_head = queue->q_head; + volatile int q_tail = queue->q_tail; + volatile int q_size = queue->q_size; + + int q_head_before = q_head; + int q_tail_before = q_tail; + int q_size_before = q_size; + + int q_used_before = 0; + int q_used_after = 0; + + int total_len = 0; + int i = 0; + + for (i = 0; i < n; i++) + { + total_len += len[i]; + } + + pg_memory_barrier(); + + Assert(q_size > 0 && q_head >= 0 && q_tail >= 0); + Assert(q_head < q_size && q_tail < q_size); + Assert(buff != NULL && len != 0 && n > 0 && total_len > 0); + + q_used_before = alog_queue_used(q_size_before, q_head_before, q_tail_before); + + if (alog_queue_is_full(q_size, q_head, q_tail)) + { + return false; + } + + if (!alog_queue_is_enough(q_size, q_head, q_tail, total_len)) + { + return false; + } + + for (i = 0; i < n; i++) + { + char * curr_buff = buff[i]; + int curr_len = len[i]; + + /* has enough space, write directly */ + if (q_size - q_tail >= curr_len) + { + char * p_start = alog_queue_offset_to(queue, q_tail); + memcpy(p_start, curr_buff, curr_len); + } + else + { + /* must write as two parts */ + int first_len = q_size - q_tail; + int second_len = curr_len - first_len; + + char * first_buf = curr_buff + 0; + char * second_buf = curr_buff + first_len; + + char * p_start = NULL; + + pg_memory_barrier(); + + Assert(first_len > 0 && first_len < q_size); + Assert(second_len > 0 && second_len < q_size); + + /* 01. write the first parts into the tail of queue->q_area */ + p_start = alog_queue_offset_to(queue, q_tail); + memcpy(p_start, first_buf, first_len); + + Assert((q_tail + first_len) % q_size == 0); + + /* 02. 
write the remain parts into the head of queue->q_area */ + p_start = alog_queue_offset_to(queue, 0); + memcpy(p_start, second_buf, second_len); + } + + q_tail = (q_tail + curr_len) % q_size; + } + + queue->q_tail = q_tail; + + q_used_after = alog_queue_used(q_size, q_head, q_tail); + Assert(q_used_before + total_len == q_used_after); + + return true; +} + +/* + * |<- strlen value ->|<- string message content ->| + * | | + * | | + * |<------------------ buff --------------------->| + * + * len = size(int) + strlen(str) + * + */ +static int alog_queue_get_str_len(AlogQueue * queue, int offset) +{ + volatile int q_size = queue->q_size; + char buff[sizeof(int)] = { '\0' }; + int len = 0; + + pg_memory_barrier(); + + Assert(offset >= 0 && offset < q_size); + + /* read len directly */ + if (q_size - offset >= sizeof(int)) + { + char * q_start = alog_queue_offset_to(queue, offset); + memcpy(buff, q_start, sizeof(int)); + } + else + { + /* must read as two parts */ + int first_len = q_size - offset; + int second_len = sizeof(int) - first_len; + + char * p_start = NULL; + + pg_memory_barrier(); + + Assert(first_len > 0 && first_len < q_size); + Assert(second_len > 0 && second_len < sizeof(int)); + + /* 01. copy the first parts */ + p_start = alog_queue_offset_to(queue, offset); + memcpy(buff, p_start, first_len); + + /* 02. copy the remain parts */ + p_start = alog_queue_offset_to(queue, 0); + memcpy(buff + first_len, p_start, second_len); + } + + memcpy((char *)(&len), buff, sizeof(int)); + + Assert(len > 0 && len < q_size); + + return len; +} + +static void alog_queue_clear_str_len(AlogQueue * queue, int offset) +{ + volatile int q_size = queue->q_size; + char buff[sizeof(int)] = { '\0' }; + + pg_memory_barrier(); + + Assert(offset >= 0 && offset < q_size); + + /* read len directly */ + if (q_size - offset >= sizeof(int)) + { + char * q_start = alog_queue_offset_to(queue, offset); + memcpy(q_start, buff, sizeof(int)); + } + else + { + /* must read as two parts */ + int first_len = q_size - offset; + int second_len = sizeof(int) - first_len; + + char * p_start = NULL; + + pg_memory_barrier(); + + Assert(first_len > 0 && first_len < q_size); + Assert(second_len > 0 && second_len < sizeof(int)); + + /* 01. copy the first parts */ + p_start = alog_queue_offset_to(queue, offset); + memcpy(p_start, buff, first_len); + + /* 02. 
copy the remain parts */ + p_start = alog_queue_offset_to(queue, 0); + memcpy(p_start, buff, second_len); + } +} + +/* + * copy message from queue to another as much as possible + * + * |<- strlen value ->|<- string message content ->| + * | | + * | | + * |<------------------ buff --------------------->| + * + * len = size(int) + strlen(str) + * + */ +static bool alog_queue_pop_to_queue(AlogQueue * from, AlogQueue * to) +{ + volatile int q_from_head = from->q_head; + volatile int q_from_tail = from->q_tail; + volatile int q_from_size = from->q_size; + + volatile int q_to_head = to->q_head; + volatile int q_to_tail = to->q_tail; + volatile int q_to_size = to->q_size; + + int from_head = q_from_head; + int from_tail = q_from_tail; + int from_size = q_from_size; + + int to_head = q_to_head; + int to_tail = q_to_tail; + int to_size = q_to_size; + + int from_total = 0; + + int from_used = 0; + int from_copyed = 0; + + int to_used = 0; + int to_copyed = 0; + + pg_memory_barrier(); + + from_total = from_used = alog_queue_used(from_size, from_head, from_tail); + to_used = alog_queue_used(to_size, to_head, to_tail); + + Assert(from_size > 0 && from_head >= 0 && from_tail >= 0); + Assert(from_head < from_size && from_tail < from_size && from_used <= from_size); + + Assert(to_size > 0 && to_head >= 0 && to_tail >= 0); + Assert(to_head < to_size && to_tail < to_size && to_used <= to_size); + + /* from is empty, ignore */ + if (alog_queue_is_empty(from_size, from_head, from_tail)) + { + return false; + } + + /* to is full, can not write */ + if (alog_queue_is_full(to_size, to_head, to_tail)) + { + return false; + } + + /* copy message into queue until to is full or from is empty */ + do + { + int string_len = alog_queue_get_str_len(from, from_head); + int copy_len = sizeof(int) + string_len; + + pg_memory_barrier(); + + Assert(string_len > 0 && string_len < from_size); + Assert(copy_len > 0 && copy_len < from_size); + + if (!alog_queue_is_enough(to_size, to_head, to_tail, copy_len)) + { + break; + } + + /* just copy dierctly */ + if (from_size - from_head >= copy_len) + { + char * p_start = alog_queue_offset_to(from, from_head); + if (!alog_queue_push(to, p_start, copy_len)) + { + break; + } + } + else + { + /* must copy as two parts */ + int first_len = from_size - from_head; + int second_len = copy_len - first_len; + char * p_first_start = NULL; + char * p_second_start = NULL; + + Assert(first_len > 0 && first_len < from_size); + Assert(second_len > 0 && second_len < from_size); + + p_first_start = alog_queue_offset_to(from, from_head); + p_second_start = alog_queue_offset_to(from, 0); + + /* 01. 
copy the content parts into the tail of to->q_area */ + if (!alog_queue_push2(to, p_first_start, first_len, p_second_start, second_len)) + { + break; + } + } + + from_head = (from_head + copy_len) % from_size; + to_tail = (to_tail + copy_len) % to_size; + + from_copyed += copy_len; + to_copyed += copy_len; + + Assert(from_copyed <= from_total); + Assert(from_used - copy_len >= 0); + Assert(to_used + copy_len <= to_size); + Assert(from_used - copy_len == alog_queue_used(from_size, from_head, from_tail)); + Assert(to_used + copy_len == alog_queue_used(to_size, to_head, to_tail)); + + from_used = alog_queue_used(from_size, from_head, from_tail); + to_used = alog_queue_used(to_size, to_head, to_tail); + } while (!alog_queue_is_empty(from_size, from_head, from_tail)); + + from->q_head = from_head; + + return true; +} + +/* + * copy message from queue to file as much as possible + */ +static bool alog_queue_pop_to_file(AlogQueue * from, FILE * logfile) +{ + volatile int q_from_head = from->q_head; + volatile int q_from_tail = from->q_tail; + volatile int q_from_size = from->q_size; + + int from_head = q_from_head; + int from_tail = q_from_tail; + int from_size = q_from_size; + + int from_total = 0; + + int from_used = 0; + int from_copyed = 0; + + pg_memory_barrier(); + + from_total = from_used = alog_queue_used(from_size, from_head, from_tail); + + Assert(from_size > 0 && from_head >= 0 && from_tail >= 0); + Assert(from_head < from_size && from_tail < from_size && from_used <= from_size); + + /* from is empty, ignore */ + if (alog_queue_is_empty(from_size, from_head, from_tail)) + { + return false; + } + + /* copy message into file until from is empty */ + do + { + int string_len = alog_queue_get_str_len(from, from_head); + int copy_len = sizeof(int) + string_len; + + pg_memory_barrier(); + + /* just copy dierctly */ + if (from_size - from_head >= copy_len) + { + char * p_start = alog_queue_offset_to(from, from_head + sizeof(int)); + + /* only copy message content, not write message len */ + alog_write_log_file(p_start, string_len, logfile); + } + else if (from_size - from_head > sizeof(int)) + { + /* must copy as two parts */ + int first_len = from_size - from_head - sizeof(int); + int second_len = string_len - first_len; + char * p_start = NULL; + + Assert(first_len > 0 && first_len < from_size); + Assert(second_len > 0 && second_len < from_size); + + p_start = alog_queue_offset_to(from, from_head + sizeof(int)); + alog_write_log_file(p_start, first_len, logfile); + + p_start = alog_queue_offset_to(from, 0); + alog_write_log_file(p_start, second_len, logfile); + } + else + { + /* just copy content only */ + int cpy_offset = (from_head + sizeof(int)) % from_size; + char * p_start = alog_queue_offset_to(from, cpy_offset); + + Assert(from_size - from_head <= sizeof(int)); + alog_write_log_file(p_start, string_len, logfile); + } + + from_head = (from_head + copy_len) % from_size; + from_copyed += copy_len; + + Assert(from_copyed <= from_total); + Assert(from_used - copy_len >= 0); + Assert(from_used - copy_len == alog_queue_used(from_size, from_head, from_tail)); + + from_used = alog_queue_used(from_size, from_head, from_tail); + } while (!alog_queue_is_empty(from_size, from_head, from_tail)); + + from->q_head = from_head; + + return true; +} + +static int +alog_write_log_file(const char *buffer, int count, FILE * logfile) +{ + int rc = 0; + rc = fwrite(buffer, 1, count, logfile); + + /* can't use ereport here because of possible recursion */ + if (rc != count) + { + printf("could not write to 
audit log file: %s\n", strerror(errno)); + return -1; + } + + return 0; +} + +static AlogQueue * +alog_make_queue(int q_size_kb) +{ + AlogQueue * queue = NULL; + Size alogSize = 0; + + alogSize = offsetof(AlogQueue, q_area); + alogSize = alogSize + q_size_kb * 1024; + + queue = (AlogQueue *)malloc(alogSize); + if (queue == NULL) + { + return NULL; + } + + memset(queue, 0, alogSize); + + queue->q_pid = 0; + queue->q_size = q_size_kb * 1024; + queue->q_lock = 0; + queue->q_head = 0; + queue->q_tail = 0; + + return queue; +} + +static FILE * +alog_open_log_file(const char *filename, const char *mode) +{ + FILE *fh = NULL; + mode_t oumask = 0; + + oumask = umask((mode_t) ((~(S_IWUSR | S_IRUSR | S_IWUSR)) & (S_IRWXU | S_IRWXG | S_IRWXO))); + fh = fopen(filename, mode); + umask(oumask); + + if (fh) + { + setvbuf(fh, NULL, PG_IOLBF, 0); + } + + return fh; +} + +static int alog_random_string(char buff[TestAlogBuffSize]) +{ + int i = 0; + + char letter[] = { 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', + 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z' }; + + int len = rand() % TestAlogBuffSize; + + buff[0] = '\0'; + + if (len == 0) + { + len += (TestAlogBuffSize/10); + } + else if (len < 0) + { + len *= -1; + } + + if (len >= TestAlogBuffSize - 10) + { + len = TestAlogBuffSize - 10; + } + + // len = 100; + + memcpy(buff, (char *)(&len), sizeof(int)); + for (i = 0; i < len - 1; i++) + { + int j = i % sizeof(letter); + buff[sizeof(int) + i] = letter[j]; + } + + buff[sizeof(int) + len - 1] = '\n'; + + return sizeof(int) + len; +} + +static void * alog_producer(void * para) +{ + int * idx = (int *) para; + char buff[TestAlogBuffSize] = { '0' }; + + srand(time(NULL)); + + while (1) + { + int len = alog_random_string(buff); + AlogQueue * queue = shared_queue [*idx]; + + while (!alog_queue_push(queue, buff, len)) + { + usleep(10000); + } + } + + return NULL; +} + +static void * alog_consumer(void * para) +{ + while (1) + { + int i = 0; + + for (i = 0; i < TestAlogProducerCount; i++) + { + alog_queue_pop_to_queue(shared_queue[i], local_cache); + + if (0) + { + if (ftell(alog_file_fp) >= TestAlogFileSize * 1024L) + { + FILE * fh = alog_open_log_file(alog_file_name, "w"); + fclose(alog_file_fp); + alog_file_fp = fh; + } + + alog_queue_pop_to_file(shared_queue[i], alog_file_fp); + } + } + } + return NULL; +} + +static void * alog_writer(void * para) +{ + FILE * file = alog_file_fp; + + while (1) + { + if (1) + { + if (ftell(file) >= TestAlogFileSize * 1024L) + { + FILE * fh = alog_open_log_file(alog_file_name, "w"); + fclose(file); + file = fh; + } + + alog_queue_pop_to_file(local_cache, file); + } + } + + return NULL; +} + +enum MT_thr_detach +{ + MT_THR_JOINABLE, + MT_THR_DETACHED +}; + +static int32 CreateThread(void *(*f) (void *), void *arg, int32 mode) +{ + + pthread_attr_t attr; + pthread_t threadid; + int ret = 0; + + pthread_attr_init(&attr); + switch (mode) + { + case MT_THR_JOINABLE: + { + pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); + break; + } + case MT_THR_DETACHED: + { + pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED); + break; + } + default: + break; + } + ret = pthread_create(&threadid, &attr, f, arg); + return ret; +} + +int test_alog() +{ + int queue_size_kb = TestAlogQueueSize; + + int i = 0; + + for (i = 0; i < TestAlogProducerCount; i++) + { + shared_queue[i] = alog_make_queue(queue_size_kb); + shared_queue_idx[i] = i; + } + + local_cache = alog_make_queue(queue_size_kb); + + alog_file_fp = 
alog_open_log_file(alog_file_name, "a"); + + CreateThread(alog_writer, NULL ,MT_THR_DETACHED); + + for (i = 0; i < TestAlogProducerCount; i++) + { + CreateThread(alog_producer, (void *) (&(shared_queue_idx[i])), MT_THR_DETACHED); + } + + alog_consumer(NULL); + + return 0; +} + +int test_alog0() +{ + + int queue_size_kb = TestAlogQueueSize; + AlogQueue * q0 = NULL; + AlogQueue * q1 = NULL; + + char buff[TestAlogBuffSize] = { '0' }; + int len = 0; + + srand(time(NULL)); + + q0 = alog_make_queue(queue_size_kb); + q1 = alog_make_queue(queue_size_kb); + + do + { + len = alog_random_string(buff); + } while (alog_queue_push(q0, buff, len)); + + alog_queue_pop_to_queue(q0, q1); + + do + { + FILE * file = alog_open_log_file(alog_file_name, "a"); + alog_queue_pop_to_file(q1, file); + } while(0); + + return 0; +} + diff --git a/contrib/pageinspect/heapfuncs.c b/contrib/pageinspect/heapfuncs.c index e2bf4172..54c10d4f 100644 --- a/contrib/pageinspect/heapfuncs.c +++ b/contrib/pageinspect/heapfuncs.c @@ -215,69 +215,69 @@ heap_page_items(PG_FUNCTION_ARGS) #else values[10] = UInt8GetDatum(tuphdr->t_hoff); #endif - /* Copy raw tuple data into bytea attribute */ - tuple_data_len = lp_len - tuphdr->t_hoff; - tuple_data_bytea = (bytea *) palloc(tuple_data_len + VARHDRSZ); - SET_VARSIZE(tuple_data_bytea, tuple_data_len + VARHDRSZ); - memcpy(VARDATA(tuple_data_bytea), (char *) tuphdr + tuphdr->t_hoff, - tuple_data_len); - values[14] = PointerGetDatum(tuple_data_bytea); - - /* - * We already checked that the item is completely within the raw - * page passed to us, with the length given in the line pointer. - * Let's check that t_hoff doesn't point over lp_len, before using - * it to access t_bits and oid. - */ - if (tuphdr->t_hoff >= SizeofHeapTupleHeader && - tuphdr->t_hoff <= lp_len && - tuphdr->t_hoff == MAXALIGN(tuphdr->t_hoff)) - { - if (tuphdr->t_infomask & HEAP_HASNULL) - { - int bits_len; - - bits_len = - ((tuphdr->t_infomask2 & HEAP_NATTS_MASK) / 8 + 1) * 8; - values[12] = CStringGetTextDatum( - bits_to_text(tuphdr->t_bits, bits_len)); - } - else - nulls[12] = true; - - if (tuphdr->t_infomask & HEAP_HASOID) - values[13] = HeapTupleHeaderGetOid(tuphdr); - else - nulls[13] = true; - } - else - { - nulls[12] = true; - nulls[13] = true; - } - } - else - { - /* - * The line pointer is not used, or it's invalid. Set the rest of - * the fields to NULL - */ - int i; - - for (i = 4; i <= 13; i++) - nulls[i] = true; - } - - /* Build and return the result tuple. */ - resultTuple = heap_form_tuple(inter_call_data->tupd, values, nulls); - result = HeapTupleGetDatum(resultTuple); - - inter_call_data->offset++; - - SRF_RETURN_NEXT(fctx, result); - } - else - SRF_RETURN_DONE(fctx); + /* Copy raw tuple data into bytea attribute */ + tuple_data_len = lp_len - tuphdr->t_hoff; + tuple_data_bytea = (bytea *) palloc(tuple_data_len + VARHDRSZ); + SET_VARSIZE(tuple_data_bytea, tuple_data_len + VARHDRSZ); + memcpy(VARDATA(tuple_data_bytea), (char *) tuphdr + tuphdr->t_hoff, + tuple_data_len); + values[14] = PointerGetDatum(tuple_data_bytea); + + /* + * We already checked that the item is completely within the raw + * page passed to us, with the length given in the line pointer. + * Let's check that t_hoff doesn't point over lp_len, before using + * it to access t_bits and oid. 
+ */ + if (tuphdr->t_hoff >= SizeofHeapTupleHeader && + tuphdr->t_hoff <= lp_len && + tuphdr->t_hoff == MAXALIGN(tuphdr->t_hoff)) + { + if (tuphdr->t_infomask & HEAP_HASNULL) + { + int bits_len; + + bits_len = + ((tuphdr->t_infomask2 & HEAP_NATTS_MASK) / 8 + 1) * 8; + values[12] = CStringGetTextDatum( + bits_to_text(tuphdr->t_bits, bits_len)); + } + else + nulls[12] = true; + + if (tuphdr->t_infomask & HEAP_HASOID) + values[13] = HeapTupleHeaderGetOid(tuphdr); + else + nulls[13] = true; + } + else + { + nulls[12] = true; + nulls[13] = true; + } + } + else + { + /* + * The line pointer is not used, or it's invalid. Set the rest of + * the fields to NULL + */ + int i; + + for (i = 4; i <= 14; i++) + nulls[i] = true; + } + + /* Build and return the result tuple. */ + resultTuple = heap_form_tuple(inter_call_data->tupd, values, nulls); + result = HeapTupleGetDatum(resultTuple); + + inter_call_data->offset++; + + SRF_RETURN_NEXT(fctx, result); + } + else + SRF_RETURN_DONE(fctx); } /* diff --git a/contrib/pgxc_ctl/make_signature b/contrib/pgxc_ctl/make_signature old mode 100644 new mode 100755 diff --git a/contrib/tbase_gts_tools/Makefile b/contrib/tbase_gts_tools/Makefile new file mode 100644 index 00000000..4b82be9d --- /dev/null +++ b/contrib/tbase_gts_tools/Makefile @@ -0,0 +1,23 @@ +# contrib/tbase_gts_tools/Makefile +MODULES = tbase_gts_tools + +## extension name +EXTENSION = tbase_gts_tools + +## SQL script installed by the extension +DATA = tbase_gts_tools--1.0.sql + +## extension description +PGFILEDESC = "tbase_gts_tools - GTS wrapper for Tbase" + +### the following are the standard PG extension build rules +ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) ## load environment parameters +else +subdir = contrib/tbase_gts_tools +top_builddir = ../.. +include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif diff --git a/contrib/tbase_gts_tools/tbase_gts_tools--1.0.sql b/contrib/tbase_gts_tools/tbase_gts_tools--1.0.sql new file mode 100644 index 00000000..92d0f1f4 --- /dev/null +++ b/contrib/tbase_gts_tools/tbase_gts_tools--1.0.sql @@ -0,0 +1,104 @@ +/* contrib/tbase_gts/tbase_gts_tools--1.0.sql */ + +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "create EXTENSION tbase_gts_tools" to load this file.
\quit + +CREATE FUNCTION txid_gts(int) +RETURNS bigint +AS 'MODULE_PATHNAME', 'txid_gts' +LANGUAGE C STRICT; + +-- +-- heap_page_items_with_gts() +-- according to heap_page_items_with_gts() from pageinspect--1.5.sql +-- +CREATE FUNCTION heap_page_items_with_gts(IN page bytea, + OUT lp smallint, + OUT lp_off smallint, + OUT lp_flags smallint, + OUT lp_len smallint, + OUT t_xmin xid, + OUT t_xmax xid, + OUT t_xmin_gts bigint, + OUT t_xmax_gts bigint, + OUT t_field3 int4, + OUT t_ctid tid, + OUT t_infomask2 integer, + OUT t_infomask integer, + OUT t_shard smallint, + OUT t_hoff smallint, + OUT t_bits text, + OUT t_oid oid, + OUT t_data bytea) +RETURNS SETOF record +AS 'MODULE_PATHNAME', 'heap_page_items_with_gts' +LANGUAGE C STRICT PARALLEL SAFE; + + +CREATE FUNCTION heap_page_ids(IN page bytea, + OUT lp smallint, + OUT lp_off smallint, + OUT lp_flags smallint, + OUT lp_len smallint, + OUT t_xmin xid, + OUT t_xmax xid, + OUT t_xmin_gts bigint, + OUT t_xmax_gts bigint, + OUT t_field3 int4, + OUT t_ctid tid, + OUT t_infomask2 integer, + OUT t_infomask integer, + OUT t_shard smallint, + OUT t_hoff smallint, + OUT t_bits text, + OUT t_oid oid, + OUT t_data bytea) +RETURNS SETOF record +AS 'MODULE_PATHNAME', 'heap_page_ids' +LANGUAGE C STRICT PARALLEL SAFE; + + +CREATE FUNCTION heap_page_items_without_data(IN page bytea, + OUT lp smallint, + OUT lp_off smallint, + OUT lp_flags smallint, + OUT lp_len smallint, + OUT t_xmin xid, + OUT t_xmax xid, + OUT t_xmin_gts bigint, + OUT t_xmax_gts bigint, + OUT t_field3 int4, + OUT t_ctid tid, + OUT t_infomask2 integer, + OUT t_infomask integer, + OUT t_shard smallint, + OUT t_hoff smallint, + OUT t_bits text, + OUT t_oid oid, + OUT t_data bytea) +RETURNS SETOF record +AS 'MODULE_PATHNAME', 'heap_page_items_without_data' +LANGUAGE C STRICT PARALLEL SAFE; + + +CREATE FUNCTION heap_page_items_with_gts_log(IN page bytea, + OUT lp smallint, + OUT lp_off smallint, + OUT lp_flags smallint, + OUT lp_len smallint, + OUT t_xmin xid, + OUT t_xmax xid, + OUT t_xmin_gts bigint, + OUT t_xmax_gts bigint, + OUT t_field3 int4, + OUT t_ctid tid, + OUT t_infomask2 integer, + OUT t_infomask integer, + OUT t_shard smallint, + OUT t_hoff smallint, + OUT t_bits text, + OUT t_oid oid, + OUT t_data bytea) +RETURNS SETOF record +AS 'MODULE_PATHNAME', 'heap_page_items_with_gts_log' +LANGUAGE C STRICT PARALLEL SAFE; diff --git a/contrib/tbase_gts_tools/tbase_gts_tools.c b/contrib/tbase_gts_tools/tbase_gts_tools.c new file mode 100644 index 00000000..5de20a39 --- /dev/null +++ b/contrib/tbase_gts_tools/tbase_gts_tools.c @@ -0,0 +1,357 @@ +#include "postgres.h" +#include "fmgr.h" +#include "funcapi.h" +#include "miscadmin.h" +#include "catalog/pg_type.h" +#include "catalog/namespace.h" +#include "utils/timestamp.h" +#include "utils/varlena.h" +#include "utils/builtins.h" +#include "utils/elog.h" +#include "access/commit_ts.h" +#include "access/htup_details.h" +#include "storage/bufmgr.h" + +#ifdef PG_MODULE_MAGIC +PG_MODULE_MAGIC; +#endif + +static Datum +items(PG_FUNCTION_ARGS, int log_level, bool with_data, bool only_id); + +/* + * bits_to_text + * + * Converts a bits8-array of 'len' bits to a human-readable + * c-string representation. + */ +static char * +bits_to_text(bits8 *bits, int len) +{ + int i; + char *str; + + str = palloc(len + 1); + + for (i = 0; i < len; i++) + str[i] = (bits[(i / 8)] & (1 << (i % 8))) ? 
'1' : '0'; + + str[i] = '\0'; + + return str; +} + +PG_FUNCTION_INFO_V1(txid_gts); + +Datum +txid_gts(PG_FUNCTION_ARGS) +{ + TransactionId xid = PG_GETARG_UINT32(0); + TimestampTz gts; + bool found = false; + + if (TransactionIdIsNormal(xid)) + { + found = TransactionIdGetCommitTsData(xid, >s, NULL); + } + + if (!found) + { + PG_RETURN_NULL(); + } + + PG_RETURN_INT64(gts); +} + +/* + * heap_page_items_with_gts + * + * Allows inspection of line pointers and tuple headers of a heap page. + */ +PG_FUNCTION_INFO_V1(heap_page_items_with_gts); + +typedef struct heap_page_items_state +{ + TupleDesc tupd; + Page page; + uint16 offset; +} heap_page_items_state; + +Datum +heap_page_items_with_gts(PG_FUNCTION_ARGS) +{ + return items(fcinfo, 0, true, false); +} + +PG_FUNCTION_INFO_V1(heap_page_items_with_gts_log); + +Datum +heap_page_items_with_gts_log(PG_FUNCTION_ARGS) +{ + return items(fcinfo, 1, true, false); +} + +PG_FUNCTION_INFO_V1(heap_page_ids); + +Datum +heap_page_ids(PG_FUNCTION_ARGS) +{ + return items(fcinfo, 1, false, true); +} + +PG_FUNCTION_INFO_V1(heap_page_items_without_data); + +Datum +heap_page_items_without_data(PG_FUNCTION_ARGS) +{ + return items(fcinfo, 1, false, false); +} + +static Datum +items(PG_FUNCTION_ARGS, int log_level, bool with_data, bool only_id) +{ + bytea *raw_page; + int raw_page_size; + heap_page_items_state *inter_call_data = NULL; + FuncCallContext *fctx; + + if (!superuser()) + { + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + (errmsg("must be superuser to use raw page functions")))); + } + + if (SRF_IS_FIRSTCALL()) + { + TupleDesc tupdesc; + MemoryContext mctx; + + fctx = SRF_FIRSTCALL_INIT(); + mctx = MemoryContextSwitchTo(fctx->multi_call_memory_ctx); + + raw_page = PG_GETARG_BYTEA_P(0); + raw_page_size = VARSIZE(raw_page) - VARHDRSZ; + if (raw_page_size < SizeOfPageHeaderData) + { + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("input page too small (%d bytes)", raw_page_size))); + } + + inter_call_data = palloc(sizeof(heap_page_items_state)); + + /* Build a tuple descriptor for our result type */ + if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) + { + elog(ERROR, "return type must be a row type"); + } + + inter_call_data->tupd = tupdesc; + + inter_call_data->offset = FirstOffsetNumber; + inter_call_data->page = VARDATA(raw_page); + + fctx->max_calls = PageGetMaxOffsetNumber(inter_call_data->page); + fctx->user_fctx = inter_call_data; + + MemoryContextSwitchTo(mctx); + } + + fctx = SRF_PERCALL_SETUP(); + inter_call_data = fctx->user_fctx; + + if (fctx->call_cntr < fctx->max_calls) + { + Page page = inter_call_data->page; + HeapTuple resultTuple; + Datum result; + ItemId id; + Datum values[17]; + bool nulls[17]; + uint16 lp_offset; + uint16 lp_flags; + uint16 lp_len; + + memset(values, 0, sizeof(values)); + memset(nulls, 0, sizeof(nulls)); + + /* Extract information from the line pointer */ + + id = PageGetItemId(page, inter_call_data->offset); + + lp_offset = ItemIdGetOffset(id); + lp_flags = ItemIdGetFlags(id); + lp_len = ItemIdGetLength(id); + + values[0] = UInt16GetDatum(inter_call_data->offset); + values[1] = UInt16GetDatum(lp_offset); + values[2] = UInt16GetDatum(lp_flags); + values[3] = UInt16GetDatum(lp_len); + + /* + * We do just enough validity checking to make sure we don't reference + * data outside the page passed to us. The page could be corrupt in + * many other ways, but at least we won't crash. 
+ */ + if (!only_id && + ItemIdHasStorage(id) && + lp_len >= MinHeapTupleSize && + lp_offset == MAXALIGN(lp_offset) && + lp_offset + lp_len <= BLCKSZ) + { + HeapTupleHeader tuphdr; + bytea *tuple_data_bytea; + int tuple_data_len; + + /* Extract information from the tuple header */ + + tuphdr = (HeapTupleHeader)PageGetItem(page, id); + + values[4] = UInt32GetDatum(HeapTupleHeaderGetRawXmin(tuphdr)); + values[5] = UInt32GetDatum(HeapTupleHeaderGetRawXmax(tuphdr)); + values[6] = Int64GetDatum(HeapTupleHeaderGetXminTimestamp(tuphdr)); + values[7] = Int64GetDatum(HeapTupleHeaderGetXmaxTimestamp(tuphdr)); + + /* shared with xvac */ + values[8] = UInt32GetDatum(HeapTupleHeaderGetRawCommandId(tuphdr)); + values[9] = PointerGetDatum(&tuphdr->t_ctid); + values[10] = UInt32GetDatum(tuphdr->t_infomask2); + values[11] = UInt32GetDatum(tuphdr->t_infomask); +#ifdef _MIGRATE_ + values[12] = Int32GetDatum(tuphdr->t_shardid); + values[13] = UInt8GetDatum(tuphdr->t_hoff); +#else + values[12] = UInt8GetDatum(tuphdr->t_hoff); +#endif + + if (with_data) + { + /* Copy raw tuple data into bytea attribute */ + tuple_data_len = lp_len - tuphdr->t_hoff; + tuple_data_bytea = (bytea *)palloc(tuple_data_len + VARHDRSZ); + SET_VARSIZE(tuple_data_bytea, tuple_data_len + VARHDRSZ); + memcpy(VARDATA(tuple_data_bytea), (char *)tuphdr + tuphdr->t_hoff, + tuple_data_len); + values[16] = PointerGetDatum(tuple_data_bytea); + } + else + { + nulls[16] = true; + } + + /* + * We already checked that the item is completely within the raw + * page passed to us, with the length given in the line pointer. + * Let's check that t_hoff doesn't point over lp_len, before using + * it to access t_bits and oid. + */ + if (tuphdr->t_hoff >= SizeofHeapTupleHeader && + tuphdr->t_hoff <= lp_len && + tuphdr->t_hoff == MAXALIGN(tuphdr->t_hoff)) + { + if (tuphdr->t_infomask & HEAP_HASNULL) + { + int bits_len; + + bits_len = + ((tuphdr->t_infomask2 & HEAP_NATTS_MASK) / 8 + 1) * 8; + values[14] = CStringGetTextDatum( + bits_to_text(tuphdr->t_bits, bits_len)); + } + else + { + nulls[14] = true; + } + + if (tuphdr->t_infomask & HEAP_HASOID) + { + values[15] = HeapTupleHeaderGetOid(tuphdr); + } + else + { + nulls[15] = true; + } + } + else + { + nulls[14] = true; + nulls[15] = true; + } + } + else + { + /* + * The line pointer is not used, or it's invalid. 
Set the rest of + * the fields to NULL + */ + int i; + + for (i = 4; i <= 16; i++) + nulls[i] = true; + } + + if (log_level > 0) + { + elog(LOG, "heap_page_items_with_gts_log: null[0~16] = " + "%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d \n", + nulls[0], nulls[1], nulls[2], nulls[3], + nulls[4], nulls[5], nulls[6], nulls[7], + nulls[8], nulls[9], nulls[10], nulls[11], + nulls[12], nulls[13], nulls[14], nulls[15], + nulls[16]); + + if (only_id) + { + elog(LOG, "heap_page_items_with_gts_log: " + "lp=%d, lp_off=%d, lp_flags=%d, lp_len=%d \n", + DatumGetUInt16(values[0]), + DatumGetUInt16(values[1]), + DatumGetUInt16(values[2]), + DatumGetUInt16(values[3])); + } + else + { + elog(LOG, "heap_page_items_with_gts_log: " + "lp=%d, lp_off=%d, lp_flags=%d, lp_len=%d " + "t_xmin=%u t_xmax=%u t_xmin_gts=%ld t_xmax_gts=%ld " + "t_field3=%u t_infomask2=%u t_infomask=%u " + "t_share=%d t_hoff=%d t_oid=%u " + "\n", + DatumGetUInt16(values[0]), + DatumGetUInt16(values[1]), + DatumGetUInt16(values[2]), + DatumGetUInt16(values[3]), + DatumGetUInt32(values[4]), + DatumGetUInt32(values[5]), + DatumGetInt64(values[6]), + DatumGetInt64(values[7]), + DatumGetUInt32(values[8]), + /* ignore tid */ + DatumGetUInt32(values[10]), + DatumGetUInt32(values[11]), + DatumGetInt32(values[12]), + DatumGetUInt8(values[13]), + /* ignore text */ + (Oid)values[15] + /* + * ignore oid + * ignore byte + */ + ); + } + } + + /* Build and return the result tuple. */ + resultTuple = heap_form_tuple(inter_call_data->tupd, values, nulls); + result = HeapTupleGetDatum(resultTuple); + + inter_call_data->offset++; + + SRF_RETURN_NEXT(fctx, result); + } + else + { + SRF_RETURN_DONE(fctx); + } +} diff --git a/contrib/tbase_gts_tools/tbase_gts_tools.control b/contrib/tbase_gts_tools/tbase_gts_tools.control new file mode 100644 index 00000000..a7b6e7f3 --- /dev/null +++ b/contrib/tbase_gts_tools/tbase_gts_tools.control @@ -0,0 +1,5 @@ +# tbase_gts_tools extension +comment = 'GTS wrapper for Tbase' +default_version = '1.0' +module_pathname = '$libdir/tbase_gts_tools' +relocatable = true diff --git a/src/backend/access/rmgrdesc/xactdesc.c b/src/backend/access/rmgrdesc/xactdesc.c index 303e7a88..450e2594 100644 --- a/src/backend/access/rmgrdesc/xactdesc.c +++ b/src/backend/access/rmgrdesc/xactdesc.c @@ -338,41 +338,41 @@ xact_desc_assignment(StringInfo buf, xl_xact_assignment *xlrec) void xact_desc(StringInfo buf, XLogReaderState *record) { - char *rec = XLogRecGetData(record); - uint8 info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK; - - if (info == XLOG_XACT_COMMIT || info == XLOG_XACT_COMMIT_PREPARED) - { - xl_xact_commit *xlrec = (xl_xact_commit *) rec; - - xact_desc_commit(buf, XLogRecGetInfo(record), xlrec, - XLogRecGetOrigin(record)); - } - else if (info == XLOG_XACT_ABORT || info == XLOG_XACT_ABORT_PREPARED) - { - xl_xact_abort *xlrec = (xl_xact_abort *) rec; - - xact_desc_abort(buf, XLogRecGetInfo(record), xlrec); - } - else if (info == XLOG_XACT_ASSIGNMENT) - { - xl_xact_assignment *xlrec = (xl_xact_assignment *) rec; - - /* - * Note that we ignore the WAL record's xid, since we're more - * interested in the top-level xid that issued the record and which - * xids are being reported here. 
- */ - appendStringInfo(buf, "xtop %u: ", xlrec->xtop); - xact_desc_assignment(buf, xlrec); - } - #ifdef __TBASE__ - else if (info == XLOG_XACT_ACQUIRE_GTS) - { - xl_xact_acquire_gts *xlrec = (xl_xact_acquire_gts *) rec; - appendStringInfo(buf, "acquire global timestamp "INT64_FORMAT" ", xlrec->global_timestamp); - } - #endif + char *rec = XLogRecGetData(record); + uint8 info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK; + + if (info == XLOG_XACT_COMMIT || info == XLOG_XACT_COMMIT_PREPARED) + { + xl_xact_commit *xlrec = (xl_xact_commit *) rec; + + xact_desc_commit(buf, XLogRecGetInfo(record), xlrec, + XLogRecGetOrigin(record)); + } + else if (info == XLOG_XACT_ABORT || info == XLOG_XACT_ABORT_PREPARED) + { + xl_xact_abort *xlrec = (xl_xact_abort *) rec; + + xact_desc_abort(buf, XLogRecGetInfo(record), xlrec); + } + else if (info == XLOG_XACT_ASSIGNMENT) + { + xl_xact_assignment *xlrec = (xl_xact_assignment *) rec; + + /* + * Note that we ignore the WAL record's xid, since we're more + * interested in the top-level xid that issued the record and which + * xids are being reported here. + */ + appendStringInfo(buf, "xtop %u: ", xlrec->xtop); + xact_desc_assignment(buf, xlrec); + } +#ifdef __TBASE__ + else if (info == XLOG_XACT_ACQUIRE_GTS) + { + xl_xact_acquire_gts *xlrec = (xl_xact_acquire_gts *) rec; + appendStringInfo(buf, "acquire global timestamp "INT64_FORMAT" ", xlrec->global_timestamp); + } +#endif } const char * diff --git a/src/backend/access/transam/commit_ts.c b/src/backend/access/transam/commit_ts.c index d7140d25..0a950156 100644 --- a/src/backend/access/transam/commit_ts.c +++ b/src/backend/access/transam/commit_ts.c @@ -208,63 +208,64 @@ TransactionTreeSetCommitTsData(TransactionId xid, int nsubxids, else newestXact = xid; #endif - /* - * We split the xids to set the timestamp to in groups belonging to the - * same SLRU page; the first element in each such set is its head. The - * first group has the main XID as the head; subsequent sets use the first - * subxid not on the previous page as head. This way, we only have to - * lock/modify each SLRU page once. - */ - for (i = 0, headxid = xid;;) - { - int pageno = TransactionIdToCTsPage(headxid); - int j; - - for (j = i; j < nsubxids; j++) - { - if(enable_committs_print) - { - elog(LOG, "TransactionTreeSetCommitTsData, subxid xid %d i %d j %d nsubxids %d", subxids[j], i, j, nsubxids); - } - - if (TransactionIdToCTsPage(subxids[j]) != pageno) - { - if(enable_committs_print) - { - elog(LOG, "break pageno %d subxid xid %d j %d", pageno, subxids[j], j); - } - break; - } - } - /* subxids[i..j] are on the same page as the head */ - if(j - i > 0) - { - SetXidCommitTsInPage(headxid, j - i, subxids + i, global_timestamp, timestamp, nodeid, - pageno, lsn); - } - else - { - SetXidCommitTsInPage(headxid, 0, NULL, global_timestamp, timestamp, nodeid, - pageno, lsn); - } - - if(enable_committs_print) - { - elog(LOG, "set committs data pageno %d xid %d head xid %d j-i %d i %d nsubxids %d committs "INT64_FORMAT, pageno, xid, headxid, j-i, - i, nsubxids, global_timestamp); - } - - /* if we wrote out all subxids, we're done. */ - if (j + 1 > nsubxids) - break; - - /* - * Set the new head and skip over it, as well as over the subxids we - * just wrote. - */ - headxid = subxids[j]; - i = j + 1; - } + /* + * We split the xids to set the timestamp to in groups belonging to the + * same SLRU page; the first element in each such set is its head. 
The + * first group has the main XID as the head; subsequent sets use the first + * subxid not on the previous page as head. This way, we only have to + * lock/modify each SLRU page once. + */ + for (i = 0, headxid = xid;;) + { + int pageno = TransactionIdToCTsPage(headxid); + int j; + + for (j = i; j < nsubxids; j++) + { + if(enable_committs_print) + { + elog(LOG, "TransactionTreeSetCommitTsData, subxid xid %d i %d j %d nsubxids %d", subxids[j], i, j, nsubxids); + } + + if (TransactionIdToCTsPage(subxids[j]) != pageno) + { + if(enable_committs_print) + { + elog(LOG, "break pageno %d subxid xid %d j %d", pageno, subxids[j], j); + } + break; + } + } + /* subxids[i..j] are on the same page as the head */ + if(j - i > 0) + { + SetXidCommitTsInPage(headxid, j - i, subxids + i, global_timestamp, timestamp, nodeid, + pageno, lsn); + } + else + { + SetXidCommitTsInPage(headxid, 0, NULL, global_timestamp, timestamp, nodeid, + pageno, lsn); + } + + if(enable_committs_print) + { + elog(LOG, + "TransactionTreeSetCommitTsData: set committs data pageno %d xid %d head xid %d j-i %d i %d nsubxids %d committs "INT64_FORMAT, + pageno, xid, headxid, j - i, i, nsubxids, global_timestamp); + } + + /* if we wrote out all subxids, we're done. */ + if (j + 1 > nsubxids) + break; + + /* + * Set the new head and skip over it, as well as over the subxids we + * just wrote. + */ + headxid = subxids[j]; + i = j + 1; + } #if 0 /* update the cached value in shared memory */ LWLockAcquire(CommitTsLock, LW_EXCLUSIVE); @@ -317,21 +318,21 @@ static void TransactionIdSetCommitTs(TransactionId xid, TimestampTz gts, TimestampTz ts, RepOriginId nodeid, int partitionno, int slotno, XLogRecPtr lsn) { - int entryno = TransactionIdToCTsEntry(xid); - CommitTimestampEntry entry; - -// Assert(TransactionIdIsNormal(xid)); - if(enable_committs_print) - { - elog(LOG, "TransactionIdSetCommitTs xid %d", xid); - } - entry.global_timestamp = gts; - entry.time = ts; - entry.nodeid = nodeid; - - memcpy(CommitTsCtl->shared[partitionno]->page_buffer[slotno] + - SizeOfCommitTimestampEntry * entryno, - &entry, SizeOfCommitTimestampEntry); + int entryno = TransactionIdToCTsEntry(xid); + CommitTimestampEntry entry; + +// Assert(TransactionIdIsNormal(xid)); + if (enable_committs_print) + { + elog(LOG, "TransactionIdSetCommitTs: xid %d gts "INT64_FORMAT, xid, gts); + } + entry.global_timestamp = gts; + entry.time = ts; + entry.nodeid = nodeid; + + memcpy(CommitTsCtl->shared[partitionno]->page_buffer[slotno] + + SizeOfCommitTimestampEntry * entryno, + &entry, SizeOfCommitTimestampEntry); #ifdef __TBASE__ /* @@ -1191,19 +1192,26 @@ WriteSetTimestampXlogRec(TransactionId mainxid, int nsubxids, TransactionId *subxids, TimestampTz global_timestamp, TimestampTz timestamp, RepOriginId nodeid) { - xl_commit_ts_set record; - - record.global_timestamp = global_timestamp; - record.timestamp = timestamp; - record.nodeid = nodeid; - record.mainxid = mainxid; - - XLogBeginInsert(); - XLogRegisterData((char *) &record, - offsetof(xl_commit_ts_set, mainxid) + - sizeof(TransactionId)); - XLogRegisterData((char *) subxids, nsubxids * sizeof(TransactionId)); - XLogInsert(RM_COMMIT_TS_ID, COMMIT_TS_SETTS); + xl_commit_ts_set record; + + record.global_timestamp = global_timestamp; + record.timestamp = timestamp; + record.nodeid = nodeid; + record.mainxid = mainxid; + + XLogBeginInsert(); + XLogRegisterData((char *) &record, + offsetof(xl_commit_ts_set, mainxid) + + sizeof(TransactionId)); + XLogRegisterData((char *) subxids, nsubxids * sizeof(TransactionId)); + 
XLogInsert(RM_COMMIT_TS_ID, COMMIT_TS_SETTS); + + if (enable_committs_print) + { + elog(LOG, + "WriteSetTimestampXlogRec: mainxid %d timestamp "INT64_FORMAT" global_timestamp "INT64_FORMAT, + mainxid, timestamp, global_timestamp); + } } /* diff --git a/src/backend/access/transam/gtm.c b/src/backend/access/transam/gtm.c index 81bb209f..981332b6 100644 --- a/src/backend/access/transam/gtm.c +++ b/src/backend/access/transam/gtm.c @@ -1382,36 +1382,28 @@ GetGlobalTimestampGTM(void) if (log_gtm_stats) ShowUsageCommon("BeginTranGTM", &start_r, &start_t); -retry: - - latest_gts = GetLatestCommitTS(); - if (gts_result.gts != InvalidGlobalTimestamp && latest_gts > (gts_result.gts + GTM_CHECK_DELTA)) - { - if(retries < 3) - { - retries++; - goto retry; - } - - elog(ERROR, "global gts:%lu is earlier than local gts:%lu, please check GTM status!", gts_result.gts + GTM_CHECK_DELTA, latest_gts); - } + latest_gts = GetLatestCommitTS(); + if (gts_result.gts != InvalidGlobalTimestamp && latest_gts > (gts_result.gts + GTM_CHECK_DELTA)) + { + elog(ERROR, "global gts:%lu is earlier than local gts:%lu, please check GTM status!", gts_result.gts + GTM_CHECK_DELTA, latest_gts); + } - /* if we are standby, use timestamp subtracting given interval */ - if (IsStandbyPostgres() && query_delay) - { - GTM_Timestamp interval = query_delay * USECS_PER_SEC; + /* if we are standby, use timestamp subtracting given interval */ + if (IsStandbyPostgres() && query_delay) + { + GTM_Timestamp interval = query_delay * USECS_PER_SEC; - gts_result.gts = gts_result.gts - interval; + gts_result.gts = gts_result.gts - interval; - if (gts_result.gts < FirstGlobalTimestamp) - { - gts_result.gts = FirstGlobalTimestamp; - } - } + if (gts_result.gts < FirstGlobalTimestamp) + { + gts_result.gts = FirstGlobalTimestamp; + } + } - GTM_ReadOnly = gts_result.gtm_readonly; - - return gts_result.gts; + GTM_ReadOnly = gts_result.gtm_readonly; + + return gts_result.gts; } #endif diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c index 5fc12531..efb5aade 100644 --- a/src/backend/commands/vacuum.c +++ b/src/backend/commands/vacuum.c @@ -86,8 +86,6 @@ int vacuum_multixact_freeze_min_age; int vacuum_multixact_freeze_table_age; int vacuum_defer_freeze_min_age; - - /* A few variables that don't seem worth passing around as parameters */ static MemoryContext vac_context = NULL; static BufferAccessStrategy vac_strategy; @@ -1938,7 +1936,7 @@ vacuum_rel_coordinator(Relation onerel, bool is_outer, VacuumParams *params) relname = RelationGetRelationName(onerel); nspname = get_namespace_name(RelationGetNamespace(onerel)); - elog(LOG, "Getting relation statistics for %s.%s", nspname, relname); + elog(DEBUG5, "Getting relation statistics for %s.%s", nspname, relname); #ifdef __TBASE__ if (params && onerel->rd_rel->relkind != RELKIND_PARTITIONED_TABLE) diff --git a/src/backend/commands/vacuumlazy.c b/src/backend/commands/vacuumlazy.c index 1e72ec49..f1508d69 100644 --- a/src/backend/commands/vacuumlazy.c +++ b/src/backend/commands/vacuumlazy.c @@ -34,8 +34,10 @@ */ #include "postgres.h" +#include #include +#include "access/commit_ts.h" #include "access/genam.h" #include "access/heapam.h" #include "access/heapam_xlog.h" @@ -45,6 +47,7 @@ #include "access/visibilitymap.h" #include "access/xlog.h" #include "access/xlogutils.h" +#include "bootstrap/bootstrap.h" #include "catalog/catalog.h" #include "catalog/storage.h" #include "commands/dbcommands.h" @@ -53,7 +56,9 @@ #include "miscadmin.h" #include "pgstat.h" #include "portability/instr_time.h" 
+#include "postmaster/auditlogger.h" #include "postmaster/autovacuum.h" +#include "postmaster/postmaster.h" #include "storage/bufmgr.h" #include "storage/freespace.h" #include "storage/lmgr.h" @@ -135,6 +140,13 @@ typedef struct LVRelStats bool lock_waiter_detected; } LVRelStats; +int gts_maintain_option; + +static void PrintStack(void); +static void PrintData(RelFileNode *rnode, + BlockNumber blkno, Page page, OffsetNumber lineoff, + GlobalTimestamp tlog_xmin_gts, GlobalTimestamp tlog_xmax_gts); +static void MaintainGTS(RelFileNode *rnode, BlockNumber blkno, Buffer buffer); /* A few variables that don't seem worth passing around as parameters */ static int elevel = -1; @@ -991,192 +1003,198 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats, relname, blkno))); PageInit(page, BufferGetPageSize(buf), 0); #endif - empty_pages++; - UnlockReleaseBuffer(buf); - RecordNewPageWithFullFreeSpace(onerel, blkno); - } - else - { - UnlockReleaseBuffer(buf); - freespace = PageGetHeapFreeSpace(page); - RecordPageWithFreeSpace(onerel, blkno, freespace); - MarkBufferDirty(buf); - } -#endif - continue; - } - - if (PageIsEmpty(page)) - { - empty_pages++; - freespace = PageGetHeapFreeSpace(page); - - /* empty pages are always all-visible and all-frozen */ - if (!PageIsAllVisible(page)) - { - START_CRIT_SECTION(); - - /* mark buffer dirty before writing a WAL record */ - MarkBufferDirty(buf); - - /* - * It's possible that another backend has extended the heap, - * initialized the page, and then failed to WAL-log the page - * due to an ERROR. Since heap extension is not WAL-logged, - * recovery might try to replay our record setting the page - * all-visible and find that the page isn't initialized, which - * will cause a PANIC. To prevent that, check whether the - * page has been previously WAL-logged, and if not, do that - * now. - */ - if (RelationNeedsWAL(onerel) && - PageGetLSN(page) == InvalidXLogRecPtr) - log_newpage_buffer(buf, true); - - PageSetAllVisible(page); - visibilitymap_set(onerel, blkno, buf, InvalidXLogRecPtr, - vmbuffer, InvalidTransactionId, - VISIBILITYMAP_ALL_VISIBLE | VISIBILITYMAP_ALL_FROZEN); - END_CRIT_SECTION(); - } - - UnlockReleaseBuffer(buf); - RecordPageWithFreeSpace(onerel, blkno, freespace); - continue; - } - - - /* - * Prune all HOT-update chains in this page. - * - * We count tuples removed by the pruning step as removed by VACUUM. - */ - tups_vacuumed += heap_page_prune(onerel, buf, OldestXmin, false, - &vacrelstats->latestRemovedXid); - - /* - * Now scan the page to collect vacuumable items and check for tuples - * requiring freezing. - */ - all_visible = true; - has_dead_tuples = false; - nfrozen = 0; - hastup = false; - prev_dead_count = vacrelstats->num_dead_tuples; - maxoff = PageGetMaxOffsetNumber(page); + empty_pages++; + UnlockReleaseBuffer(buf); + RecordNewPageWithFullFreeSpace(onerel, blkno); + } + else + { + UnlockReleaseBuffer(buf); + freespace = PageGetHeapFreeSpace(page); + RecordPageWithFreeSpace(onerel, blkno, freespace); + MarkBufferDirty(buf); + } +#endif + continue; + } - /* - * Note: If you change anything in the loop below, also look at - * heap_page_is_all_visible to see if that needs to be changed. 
- */ - for (offnum = FirstOffsetNumber; - offnum <= maxoff; - offnum = OffsetNumberNext(offnum)) - { - ItemId itemid; + if (PageIsEmpty(page)) + { + empty_pages++; + freespace = PageGetHeapFreeSpace(page); - itemid = PageGetItemId(page, offnum); + /* empty pages are always all-visible and all-frozen */ + if (!PageIsAllVisible(page)) + { + START_CRIT_SECTION(); + + /* mark buffer dirty before writing a WAL record */ + MarkBufferDirty(buf); + + /* + * It's possible that another backend has extended the heap, + * initialized the page, and then failed to WAL-log the page + * due to an ERROR. Since heap extension is not WAL-logged, + * recovery might try to replay our record setting the page + * all-visible and find that the page isn't initialized, which + * will cause a PANIC. To prevent that, check whether the + * page has been previously WAL-logged, and if not, do that + * now. + */ + if (RelationNeedsWAL(onerel) && + PageGetLSN(page) == InvalidXLogRecPtr) + log_newpage_buffer(buf, true); + + PageSetAllVisible(page); + visibilitymap_set(onerel, blkno, buf, InvalidXLogRecPtr, + vmbuffer, InvalidTransactionId, + VISIBILITYMAP_ALL_VISIBLE | VISIBILITYMAP_ALL_FROZEN); + END_CRIT_SECTION(); + } - /* Unused items require no processing, but we count 'em */ - if (!ItemIdIsUsed(itemid)) - { - nunused += 1; - continue; - } + UnlockReleaseBuffer(buf); + RecordPageWithFreeSpace(onerel, blkno, freespace); + continue; + } - /* Redirect items mustn't be touched */ - if (ItemIdIsRedirected(itemid)) - { - hastup = true; /* this page won't be truncatable */ - continue; - } +#ifdef __SUPPORT_DISTRIBUTED_TRANSACTION__ + if (gts_maintain_option != GTS_MAINTAIN_NOTHING) + { + MaintainGTS(&onerel->rd_node, blkno, buf); + } +#endif - ItemPointerSet(&(tuple.t_self), blkno, offnum); + /* + * Prune all HOT-update chains in this page. + * + * We count tuples removed by the pruning step as removed by VACUUM. + */ + tups_vacuumed += heap_page_prune(onerel, buf, OldestXmin, false, + &vacrelstats->latestRemovedXid); + + /* + * Now scan the page to collect vacuumable items and check for tuples + * requiring freezing. + */ + all_visible = true; + has_dead_tuples = false; + nfrozen = 0; + hastup = false; + prev_dead_count = vacrelstats->num_dead_tuples; + maxoff = PageGetMaxOffsetNumber(page); + + /* + * Note: If you change anything in the loop below, also look at + * heap_page_is_all_visible to see if that needs to be changed. + */ + for (offnum = FirstOffsetNumber; + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) + { + ItemId itemid; - /* - * DEAD item pointers are to be vacuumed normally; but we don't - * count them in tups_vacuumed, else we'd be double-counting (at - * least in the common case where heap_page_prune() just freed up - * a non-HOT tuple). 
- */ - if (ItemIdIsDead(itemid)) - { - lazy_record_dead_tuple(vacrelstats, &(tuple.t_self)); - all_visible = false; - continue; - } + itemid = PageGetItemId(page, offnum); - Assert(ItemIdIsNormal(itemid)); + /* Unused items require no processing, but we count 'em */ + if (!ItemIdIsUsed(itemid)) + { + nunused += 1; + continue; + } - tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid); - tuple.t_len = ItemIdGetLength(itemid); - tuple.t_tableOid = RelationGetRelid(onerel); + /* Redirect items mustn't be touched */ + if (ItemIdIsRedirected(itemid)) + { + hastup = true; /* this page won't be truncatable */ + continue; + } - tupgone = false; + ItemPointerSet(&(tuple.t_self), blkno, offnum); - switch (HeapTupleSatisfiesVacuum(&tuple, OldestXmin, buf)) - { - case HEAPTUPLE_DEAD: + /* + * DEAD item pointers are to be vacuumed normally; but we don't + * count them in tups_vacuumed, else we'd be double-counting (at + * least in the common case where heap_page_prune() just freed up + * a non-HOT tuple). + */ + if (ItemIdIsDead(itemid)) + { + lazy_record_dead_tuple(vacrelstats, &(tuple.t_self)); + all_visible = false; + continue; + } - /* - * Ordinarily, DEAD tuples would have been removed by - * heap_page_prune(), but it's possible that the tuple - * state changed since heap_page_prune() looked. In - * particular an INSERT_IN_PROGRESS tuple could have - * changed to DEAD if the inserter aborted. So this - * cannot be considered an error condition. - * - * If the tuple is HOT-updated then it must only be - * removed by a prune operation; so we keep it just as if - * it were RECENTLY_DEAD. Also, if it's a heap-only - * tuple, we choose to keep it, because it'll be a lot - * cheaper to get rid of it in the next pruning pass than - * to treat it like an indexed tuple. - */ - if (HeapTupleIsHotUpdated(&tuple) || - HeapTupleIsHeapOnly(&tuple)) - nkeep += 1; - else - tupgone = true; /* we can delete the tuple */ - all_visible = false; - break; - case HEAPTUPLE_LIVE: - /* Tuple is good --- but let's do some validity checks */ - if (onerel->rd_rel->relhasoids && - !OidIsValid(HeapTupleGetOid(&tuple))) - elog(WARNING, "relation \"%s\" TID %u/%u: OID is invalid", - relname, blkno, offnum); + Assert(ItemIdIsNormal(itemid)); - /* - * Is the tuple definitely visible to all transactions? - * - * NB: Like with per-tuple hint bits, we can't set the - * PD_ALL_VISIBLE flag if the inserter committed - * asynchronously. See SetHintBits for more info. Check - * that the tuple is hinted xmin-committed because of - * that. - */ - if (all_visible) - { - TransactionId xmin; + tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid); + tuple.t_len = ItemIdGetLength(itemid); + tuple.t_tableOid = RelationGetRelid(onerel); - if (!HeapTupleHeaderXminCommitted(tuple.t_data)) - { - all_visible = false; - break; - } + tupgone = false; - /* - * The inserter definitely committed. But is it old - * enough that everyone sees it as committed? - */ - xmin = HeapTupleHeaderGetXmin(tuple.t_data); - if (!TransactionIdPrecedes(xmin, OldestXmin)) - { - all_visible = false; - break; - } - + switch (HeapTupleSatisfiesVacuum(&tuple, OldestXmin, buf)) + { + case HEAPTUPLE_DEAD: + + /* + * Ordinarily, DEAD tuples would have been removed by + * heap_page_prune(), but it's possible that the tuple + * state changed since heap_page_prune() looked. In + * particular an INSERT_IN_PROGRESS tuple could have + * changed to DEAD if the inserter aborted. So this + * cannot be considered an error condition. 
+ * + * If the tuple is HOT-updated then it must only be + * removed by a prune operation; so we keep it just as if + * it were RECENTLY_DEAD. Also, if it's a heap-only + * tuple, we choose to keep it, because it'll be a lot + * cheaper to get rid of it in the next pruning pass than + * to treat it like an indexed tuple. + */ + if (HeapTupleIsHotUpdated(&tuple) || + HeapTupleIsHeapOnly(&tuple)) + nkeep += 1; + else + tupgone = true; /* we can delete the tuple */ + all_visible = false; + break; + case HEAPTUPLE_LIVE: + /* Tuple is good --- but let's do some validity checks */ + if (onerel->rd_rel->relhasoids && + !OidIsValid(HeapTupleGetOid(&tuple))) + elog(WARNING, "relation \"%s\" TID %u/%u: OID is invalid", + relname, blkno, offnum); + + /* + * Is the tuple definitely visible to all transactions? + * + * NB: Like with per-tuple hint bits, we can't set the + * PD_ALL_VISIBLE flag if the inserter committed + * asynchronously. See SetHintBits for more info. Check + * that the tuple is hinted xmin-committed because of + * that. + */ + if (all_visible) + { + TransactionId xmin; + + if (!HeapTupleHeaderXminCommitted(tuple.t_data)) + { + all_visible = false; + break; + } + + /* + * The inserter definitely committed. But is it old + * enough that everyone sees it as committed? + */ + xmin = HeapTupleHeaderGetXmin(tuple.t_data); + if (!TransactionIdPrecedes(xmin, OldestXmin)) + { + all_visible = false; + break; + } + #ifdef __SUPPORT_DISTRIBUTED_TRANSACTION__ { GlobalTimestamp committs = HeapTupleHderGetXminTimestapAtomic(tuple.t_data); @@ -1203,220 +1221,227 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats, } #endif - /* Track newest xmin on page. */ - if (TransactionIdFollows(xmin, visibility_cutoff_xid)) - visibility_cutoff_xid = xmin; - } - break; - case HEAPTUPLE_RECENTLY_DEAD: - - /* - * If tuple is recently deleted then we must not remove it - * from relation. - */ - nkeep += 1; - all_visible = false; - break; - case HEAPTUPLE_INSERT_IN_PROGRESS: - /* This is an expected case during concurrent vacuum */ - all_visible = false; - break; - case HEAPTUPLE_DELETE_IN_PROGRESS: - /* This is an expected case during concurrent vacuum */ - all_visible = false; - break; - default: - elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result"); - break; - } - - if (tupgone) - { - lazy_record_dead_tuple(vacrelstats, &(tuple.t_self)); - HeapTupleHeaderAdvanceLatestRemovedXid(tuple.t_data, - &vacrelstats->latestRemovedXid); - tups_vacuumed += 1; - has_dead_tuples = true; - } - else - { - bool tuple_totally_frozen; - - num_tuples += 1; - hastup = true; - - /* - * Each non-removable tuple must be checked to see if it needs - * freezing. Note we already have exclusive buffer lock. - */ - if (heap_prepare_freeze_tuple(tuple.t_data, FreezeLimit, - MultiXactCutoff, &frozen[nfrozen], - &tuple_totally_frozen)) - frozen[nfrozen++].offset = offnum; - - if (!tuple_totally_frozen) - all_frozen = false; - } - } /* scan along page */ - - /* - * If we froze any tuples, mark the buffer dirty, and write a WAL - * record recording the changes. We must log the changes to be - * crash-safe against future truncation of CLOG. - */ - if (nfrozen > 0) - { - START_CRIT_SECTION(); - - MarkBufferDirty(buf); - - /* execute collected freezes */ - for (i = 0; i < nfrozen; i++) - { - ItemId itemid; - HeapTupleHeader htup; - - itemid = PageGetItemId(page, frozen[i].offset); - htup = (HeapTupleHeader) PageGetItem(page, itemid); + /* Track newest xmin on page. 
*/ + if (TransactionIdFollows(xmin, visibility_cutoff_xid)) + visibility_cutoff_xid = xmin; + } + break; + case HEAPTUPLE_RECENTLY_DEAD: + + /* + * If tuple is recently deleted then we must not remove it + * from relation. + */ + nkeep += 1; + all_visible = false; + break; + case HEAPTUPLE_INSERT_IN_PROGRESS: + /* This is an expected case during concurrent vacuum */ + all_visible = false; + break; + case HEAPTUPLE_DELETE_IN_PROGRESS: + /* This is an expected case during concurrent vacuum */ + all_visible = false; + break; + default: + elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result"); + break; + } - heap_execute_freeze_tuple(htup, &frozen[i]); - } + if (tupgone) + { + lazy_record_dead_tuple(vacrelstats, &(tuple.t_self)); + HeapTupleHeaderAdvanceLatestRemovedXid(tuple.t_data, + &vacrelstats->latestRemovedXid); + tups_vacuumed += 1; + has_dead_tuples = true; + } + else + { + bool tuple_totally_frozen; + + num_tuples += 1; + hastup = true; + + /* + * Each non-removable tuple must be checked to see if it needs + * freezing. Note we already have exclusive buffer lock. + */ + if (heap_prepare_freeze_tuple(tuple.t_data, FreezeLimit, + MultiXactCutoff, &frozen[nfrozen], + &tuple_totally_frozen)) + frozen[nfrozen++].offset = offnum; + + if (!tuple_totally_frozen) + all_frozen = false; + } + } /* scan along page */ + + /* + * If we froze any tuples, mark the buffer dirty, and write a WAL + * record recording the changes. We must log the changes to be + * crash-safe against future truncation of CLOG. + */ + if (nfrozen > 0) + { + START_CRIT_SECTION(); - /* Now WAL-log freezing if necessary */ - if (RelationNeedsWAL(onerel)) - { - XLogRecPtr recptr; + MarkBufferDirty(buf); - recptr = log_heap_freeze(onerel, buf, FreezeLimit, - frozen, nfrozen); - PageSetLSN(page, recptr); - } + /* execute collected freezes */ + for (i = 0; i < nfrozen; i++) + { + ItemId itemid; + HeapTupleHeader htup; - END_CRIT_SECTION(); - } + itemid = PageGetItemId(page, frozen[i].offset); + htup = (HeapTupleHeader) PageGetItem(page, itemid); - /* - * If there are no indexes then we can vacuum the page right now - * instead of doing a second scan. - */ - if (nindexes == 0 && - vacrelstats->num_dead_tuples > 0) - { - /* Remove tuples from heap */ - lazy_vacuum_page(onerel, blkno, buf, 0, vacrelstats, &vmbuffer); - has_dead_tuples = false; + heap_execute_freeze_tuple(htup, &frozen[i]); + } - /* - * Forget the now-vacuumed tuples, and press on, but be careful - * not to reset latestRemovedXid since we want that value to be - * valid. - */ - vacrelstats->num_dead_tuples = 0; - vacuumed_pages++; - } + /* Now WAL-log freezing if necessary */ + if (RelationNeedsWAL(onerel)) + { + XLogRecPtr recptr; - freespace = PageGetHeapFreeSpace(page); + recptr = log_heap_freeze(onerel, buf, FreezeLimit, + frozen, nfrozen); + PageSetLSN(page, recptr); + } - /* mark page all-visible, if appropriate */ - if (all_visible && !all_visible_according_to_vm) - { - uint8 flags = VISIBILITYMAP_ALL_VISIBLE; + END_CRIT_SECTION(); + } - if (all_frozen) - flags |= VISIBILITYMAP_ALL_FROZEN; + /* + * If there are no indexes then we can vacuum the page right now + * instead of doing a second scan. + */ + if (nindexes == 0 && + vacrelstats->num_dead_tuples > 0) + { + /* Remove tuples from heap */ + lazy_vacuum_page(onerel, blkno, buf, 0, vacrelstats, &vmbuffer); + has_dead_tuples = false; + + /* + * Forget the now-vacuumed tuples, and press on, but be careful + * not to reset latestRemovedXid since we want that value to be + * valid. 
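+			 * (When the table does have indexes, these TIDs instead stay in
+			 * vacrelstats' dead-tuple array and the heap pages are only cleaned
+			 * up by lazy_vacuum_heap() after the index entries pointing at them
+			 * have been removed.)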
+ */ + vacrelstats->num_dead_tuples = 0; + vacuumed_pages++; + } - /* - * It should never be the case that the visibility map page is set - * while the page-level bit is clear, but the reverse is allowed - * (if checksums are not enabled). Regardless, set the both bits - * so that we get back in sync. - * - * NB: If the heap page is all-visible but the VM bit is not set, - * we don't need to dirty the heap page. However, if checksums - * are enabled, we do need to make sure that the heap page is - * dirtied before passing it to visibilitymap_set(), because it - * may be logged. Given that this situation should only happen in - * rare cases after a crash, it is not worth optimizing. - */ - PageSetAllVisible(page); - MarkBufferDirty(buf); - visibilitymap_set(onerel, blkno, buf, InvalidXLogRecPtr, - vmbuffer, visibility_cutoff_xid, flags); - } + freespace = PageGetHeapFreeSpace(page); - /* - * As of PostgreSQL 9.2, the visibility map bit should never be set if - * the page-level bit is clear. However, it's possible that the bit - * got cleared after we checked it and before we took the buffer - * content lock, so we must recheck before jumping to the conclusion - * that something bad has happened. - */ - else if (all_visible_according_to_vm && !PageIsAllVisible(page) - && VM_ALL_VISIBLE(onerel, blkno, &vmbuffer)) - { - elog(WARNING, "page is not marked all-visible but visibility map bit is set in relation \"%s\" page %u", - relname, blkno); - visibilitymap_clear(onerel, blkno, vmbuffer, - VISIBILITYMAP_VALID_BITS); - } + /* mark page all-visible, if appropriate */ + if (all_visible && !all_visible_according_to_vm) + { + uint8 flags = VISIBILITYMAP_ALL_VISIBLE; + + if (all_frozen) + flags |= VISIBILITYMAP_ALL_FROZEN; + + /* + * It should never be the case that the visibility map page is set + * while the page-level bit is clear, but the reverse is allowed + * (if checksums are not enabled). Regardless, set the both bits + * so that we get back in sync. + * + * NB: If the heap page is all-visible but the VM bit is not set, + * we don't need to dirty the heap page. However, if checksums + * are enabled, we do need to make sure that the heap page is + * dirtied before passing it to visibilitymap_set(), because it + * may be logged. Given that this situation should only happen in + * rare cases after a crash, it is not worth optimizing. + */ + PageSetAllVisible(page); + MarkBufferDirty(buf); + visibilitymap_set(onerel, blkno, buf, InvalidXLogRecPtr, + vmbuffer, visibility_cutoff_xid, flags); + } - /* - * It's possible for the value returned by GetOldestXmin() to move - * backwards, so it's not wrong for us to see tuples that appear to - * not be visible to everyone yet, while PD_ALL_VISIBLE is already - * set. The real safe xmin value never moves backwards, but - * GetOldestXmin() is conservative and sometimes returns a value - * that's unnecessarily small, so if we see that contradiction it just - * means that the tuples that we think are not visible to everyone yet - * actually are, and the PD_ALL_VISIBLE flag is correct. - * - * There should never be dead tuples on a page with PD_ALL_VISIBLE - * set, however. 
- */ - else if (PageIsAllVisible(page) && has_dead_tuples) - { - elog(WARNING, "page containing dead tuples is marked as all-visible in relation \"%s\" page %u", - relname, blkno); - PageClearAllVisible(page); - MarkBufferDirty(buf); - visibilitymap_clear(onerel, blkno, vmbuffer, - VISIBILITYMAP_VALID_BITS); - } + /* + * As of PostgreSQL 9.2, the visibility map bit should never be set if + * the page-level bit is clear. However, it's possible that the bit + * got cleared after we checked it and before we took the buffer + * content lock, so we must recheck before jumping to the conclusion + * that something bad has happened. + */ + else if (all_visible_according_to_vm && !PageIsAllVisible(page) + && VM_ALL_VISIBLE(onerel, blkno, &vmbuffer)) + { + elog(WARNING, "page is not marked all-visible but visibility map bit is set in relation \"%s\" page %u", + relname, blkno); + visibilitymap_clear(onerel, blkno, vmbuffer, + VISIBILITYMAP_VALID_BITS); + } - /* - * If the all-visible page is turned out to be all-frozen but not - * marked, we should so mark it. Note that all_frozen is only valid - * if all_visible is true, so we must check both. - */ - else if (all_visible_according_to_vm && all_visible && all_frozen && - !VM_ALL_FROZEN(onerel, blkno, &vmbuffer)) - { - /* - * We can pass InvalidTransactionId as the cutoff XID here, - * because setting the all-frozen bit doesn't cause recovery - * conflicts. - */ - visibilitymap_set(onerel, blkno, buf, InvalidXLogRecPtr, - vmbuffer, InvalidTransactionId, - VISIBILITYMAP_ALL_FROZEN); - } + /* + * It's possible for the value returned by GetOldestXmin() to move + * backwards, so it's not wrong for us to see tuples that appear to + * not be visible to everyone yet, while PD_ALL_VISIBLE is already + * set. The real safe xmin value never moves backwards, but + * GetOldestXmin() is conservative and sometimes returns a value + * that's unnecessarily small, so if we see that contradiction it just + * means that the tuples that we think are not visible to everyone yet + * actually are, and the PD_ALL_VISIBLE flag is correct. + * + * There should never be dead tuples on a page with PD_ALL_VISIBLE + * set, however. + */ + else if (PageIsAllVisible(page) && has_dead_tuples) + { + elog(WARNING, "page containing dead tuples is marked as all-visible in relation \"%s\" page %u", + relname, blkno); + PageClearAllVisible(page); + MarkBufferDirty(buf); + visibilitymap_clear(onerel, blkno, vmbuffer, + VISIBILITYMAP_VALID_BITS); + } - UnlockReleaseBuffer(buf); + /* + * If the all-visible page is turned out to be all-frozen but not + * marked, we should so mark it. Note that all_frozen is only valid + * if all_visible is true, so we must check both. + */ + else if (all_visible_according_to_vm && all_visible && all_frozen && + !VM_ALL_FROZEN(onerel, blkno, &vmbuffer)) + { + /* + * We can pass InvalidTransactionId as the cutoff XID here, + * because setting the all-frozen bit doesn't cause recovery + * conflicts. 
+ */ + visibilitymap_set(onerel, blkno, buf, InvalidXLogRecPtr, + vmbuffer, InvalidTransactionId, + VISIBILITYMAP_ALL_FROZEN); + } - /* Remember the location of the last page with nonremovable tuples */ - if (hastup) - vacrelstats->nonempty_pages = blkno + 1; +#ifdef __SUPPORT_DISTRIBUTED_TRANSACTION__ + if (gts_maintain_option != GTS_MAINTAIN_NOTHING) + { + MaintainGTS(&onerel->rd_node, blkno, buf); + } +#endif - /* - * If we remembered any tuples for deletion, then the page will be - * visited again by lazy_vacuum_heap, which will compute and record - * its post-compaction free space. If not, then we're done with this - * page, so remember its free space as-is. (This path will always be - * taken if there are no indexes.) - */ - if (vacrelstats->num_dead_tuples == prev_dead_count) - RecordPageWithFreeSpace(onerel, blkno, freespace); - } + UnlockReleaseBuffer(buf); + + /* Remember the location of the last page with nonremovable tuples */ + if (hastup) + vacrelstats->nonempty_pages = blkno + 1; + + /* + * If we remembered any tuples for deletion, then the page will be + * visited again by lazy_vacuum_heap, which will compute and record + * its post-compaction free space. If not, then we're done with this + * page, so remember its free space as-is. (This path will always be + * taken if there are no indexes.) + */ + if (vacrelstats->num_dead_tuples == prev_dead_count) + RecordPageWithFreeSpace(onerel, blkno, freespace); + } /* report that everything is scanned and vacuumed */ pgstat_progress_update_param(PROGRESS_VACUUM_HEAP_BLKS_SCANNED, blkno); @@ -1594,6 +1619,7 @@ lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats) errdetail_internal("%s", pg_rusage_show(&ru0)))); } + /* * lazy_vacuum_page() -- free dead tuples on a page * and repair its fragmentation. @@ -2655,3 +2681,297 @@ xlog_reinit_extent_pages(RelFileNode rnode, ExtentID eid) } #endif + +#define STACK_SIZE 64 + +/* + * print error stack to maintain_trace file. + */ +static void +PrintStack(void) +{ + void *trace[STACK_SIZE] = {0}; + size_t size = backtrace(trace, STACK_SIZE); + char **symbols = (char **) backtrace_symbols(trace, size); + size_t i = 0; + time_t t = 0; + struct tm *timeInfo = NULL; + + if (symbols == NULL) + { + return; + } + + time(&t); + timeInfo = localtime(&t); + trace_log("Dumping stack starts at %s", asctime(timeInfo)); + trace_log("backtrace() returned %zu addresses.", size); + for (i = 1; i < size; i++) + { + char syscom[MAXPGPATH] = {0}; + FILE *fcmd = NULL; + char temp[MAXPGPATH] = {0}; + + trace_log("#%-2zu %s", i, symbols[i]); + + snprintf(syscom, MAXPGPATH, "addr2line %p -e %s -f -C", trace[i], exename); + fcmd = popen(syscom, "r"); + if (fcmd == NULL) + { + continue; + } + while (fgets(temp, sizeof(temp), fcmd) != NULL) + { + /* ignore the ending "\n" */ + trace_log(" %.*s", (int) strlen(temp) - 1, temp); + } + pclose(fcmd); + } + trace_log("Dumping stack ends.\n"); + + free(symbols); +} + +/* + * print error data to maintain file. 
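+ *
+ * Together with PrintStack() above, this is invoked from MaintainGTS() when a
+ * tuple's on-page commit timestamp disagrees with the tlog, roughly:
+ *
+ *   if (tuple_xmin_gts != tlog_xmin_gts)
+ *   {
+ *       PrintStack();
+ *       PrintData(rnode, blkno, page, lineoff, tlog_xmin_gts, 0);
+ *   }
+ *
+ * (simplified from the xmin branch of MaintainGTS() below; the real check also
+ * requires the timestamp to be valid, non-local and non-frozen).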
+ */ +static void +PrintData(RelFileNode *rnode, BlockNumber blkno, Page page, OffsetNumber lineoff, + GlobalTimestamp tlog_xmin_gts, GlobalTimestamp tlog_xmax_gts) +{ + XLogRecPtr lsn = PageGetLSN(page); + PageHeader pagehdr = (PageHeader) page; + ItemId id = PageGetItemId(page, lineoff); + uint16 lp_offset = 0; + uint16 lp_flags = 0; + uint16 lp_len = 0; + HeapTupleHeader tuphdr = NULL; + TransactionId xmin = 0; + TransactionId xmax = 0; + GlobalTimestamp tuple_xmin_gts = 0; + GlobalTimestamp tuple_xmax_gts = 0; + uint32 cid = 0; + uint32 infomask2 = 0; + uint32 infomask = 0; + ShardID shardid = 0; + uint8 hoff = 0; + time_t t = 0; + struct tm *timeInfo = NULL; + + lp_offset = ItemIdGetOffset(id); + lp_flags = ItemIdGetFlags(id); + lp_len = ItemIdGetLength(id); + + tuphdr = (HeapTupleHeader)PageGetItem(page, id); + xmin = HeapTupleHeaderGetRawXmin(tuphdr); + xmax = HeapTupleHeaderGetRawXmax(tuphdr); + tuple_xmin_gts = HeapTupleHeaderGetXminTimestamp(tuphdr); + tuple_xmax_gts = HeapTupleHeaderGetXmaxTimestamp(tuphdr); + cid = HeapTupleHeaderGetRawCommandId(tuphdr); + infomask2 = tuphdr->t_infomask2; + infomask = tuphdr->t_infomask; + shardid = tuphdr->t_shardid; + hoff = tuphdr->t_hoff; + + time(&t); + timeInfo = localtime(&t); + trace_log("Printing error data starts at %s", asctime(timeInfo)); + trace_log("relfilenode %u pageno %u \n" + "page: lsn=" UINT64_FORMAT " " + "checksum=%d flags=%d shard=%d " + "lower=%d upper=%d special=%d " + "pagesize=%zu version=%d " + "prune_xid=%u \n" + "item: lp=%d, lp_off=%d, lp_flags=%d, lp_len=%d \n" + "heaptuple header: t_xmin=%u t_xmax=%u t_xmin_gts=%ld t_xmax_gts=%ld " + "t_cid=%u t_infomask2=%u t_infomask=%u " + "t_shareid=%d t_hoff=%d \n" + "tlog: tlog_xmin_gts=%ld tlog_xmax_gts=%ld", + rnode->relNode, blkno, + lsn, + pagehdr->pd_checksum, pagehdr->pd_flags, pagehdr->pd_shard, + pagehdr->pd_lower, pagehdr->pd_upper, pagehdr->pd_special, + PageGetPageSize(page), (uint16) PageGetPageLayoutVersion(page), + pagehdr->pd_prune_xid, + lineoff, lp_offset, lp_flags, lp_len, + xmin, xmax, tuple_xmin_gts, tuple_xmax_gts, + cid, infomask2, infomask, + shardid, hoff, + tlog_xmin_gts, tlog_xmax_gts); + if (tlog_xmax_gts == 0) + { + trace_log("xmin_gts in tuple is not equal to xmin_gts in tlog!"); + } + else + { + trace_log("xmax_gts in tuple is not equal to xmax_gts in tlog!"); + } + trace_log("Printing error data ends.\n"); +} + +/* + * MaintainGTS() -- check and reset gts in tuples according to gts in tlog. + * + * Buffer must be pinned and exclusive-locked. (If caller does not hold + * exclusive lock, then somebody could be in process of writing the buffer, + * leading to risk of bad data written to disk. + * + * Caller must hold pin and buffer cleanup lock on the buffer. + * gts_maintain_option = 0: GTS_MAINTAIN_NOTHING, do nothing. + * gts_maintain_option = 1: GTS_MAINTAIN_VACUUM_CHECK, check correctness of GTS + * while doing vacuum. + * gts_maintain_option = 2: GTS_MAINTAIN_VACUUM_RESET, check correctness of GTS + * and reset it according to tlog if it is wrong while + * doing vacuum. + */ +void +MaintainGTS(RelFileNode *rnode, BlockNumber blkno, Buffer buffer) +{ + Page page; + int lines; + OffsetNumber lineoff; + ItemId itemid; + bool changed = false; + bool reset = false; + + /* GTS is only used for normal user tables, not systems table and any index. 
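+ *
+ * Callers gate on the same setting: the vacuum-side call in lazy_scan_heap()
+ * is, roughly,
+ *
+ *   if (gts_maintain_option != GTS_MAINTAIN_NOTHING)
+ *       MaintainGTS(&onerel->rd_node, blkno, buf);
+ *
+ * so with GTS_MAINTAIN_NOTHING the work is skipped both at the call site and
+ * by the early return below.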
*/ + if (GTS_MAINTAIN_NOTHING == gts_maintain_option) + { + return; + } + + if (!PostmasterIsPrimaryAndNormal()) + { + return; + } + + if (!RelationHasGTS(rnode->spcNode, rnode->relNode)) + { + return; + } + + if (GTS_MAINTAIN_VACUUM_RESET == gts_maintain_option) + { + reset = true; + } + + page = BufferGetPage(buffer); + lines = PageGetMaxOffsetNumber(page); + for (lineoff = FirstOffsetNumber, itemid = PageGetItemId(page, lineoff); + lineoff <= lines; + lineoff++, itemid++) + { + HeapTupleHeader tuphdr; + TransactionId xmin = InvalidTransactionId; + TransactionId xmax = InvalidTransactionId; + GlobalTimestamp tlog_xmin_gts = InvalidGlobalTimestamp; + GlobalTimestamp tlog_xmax_gts = InvalidGlobalTimestamp; + + if (!ItemIdIsNormal(itemid)) + { + continue; + } + + tuphdr = (HeapTupleHeader) PageGetItem(page, itemid); + + xmin = HeapTupleHeaderGetRawXmin(tuphdr); + if (TransactionIdIsNormal(xmin) && + HeapTupleHeaderXminCommitted(tuphdr) && + !HeapTupleHeaderXminFrozen(tuphdr)) + { + GlobalTimestamp tuple_xmin_gts = HeapTupleHeaderGetXminTimestampAtomic(tuphdr); + + if (GlobalTimestampIsValid(tuple_xmin_gts) + && !CommitTimestampIsLocal(tuple_xmin_gts) + && !GlobalTimestampIsFrozen(tuple_xmin_gts) + && TransactionIdGetCommitTsData(xmin, &tlog_xmin_gts, NULL) + && tuple_xmin_gts != tlog_xmin_gts) + { + elog(WARNING, + "relfilenode %u " + "pageno %u lineoff %u " + "CTID %hu/%hu/%hu " + "infomask %d multixact %d, " + "xmin %u xmin_gts "INT64_FORMAT" " + "in tuple is not equal to xmin_gts "INT64_FORMAT" in tlog.", + rnode->relNode, + blkno, lineoff, + tuphdr->t_ctid.ip_blkid.bi_hi, + tuphdr->t_ctid.ip_blkid.bi_lo, + tuphdr->t_ctid.ip_posid, + tuphdr->t_infomask, tuphdr->t_infomask & HEAP_XMAX_IS_MULTI, + xmin, tuple_xmin_gts, tlog_xmin_gts); + + PrintStack(); + PrintData(rnode, blkno, page, lineoff, tlog_xmin_gts, 0); + + if (reset) + { + changed = true; + HeapTupleHeaderSetXminTimestampAtomic(tuphdr, tlog_xmin_gts); + elog(WARNING, + "relfilenode %u " + "pageno %u lineoff %u xmin %u xmin_gts "INT64_FORMAT" " + "in tuple has been reset to xmin_gts "INT64_FORMAT" in tlog.", + rnode->relNode, + blkno, lineoff, + xmin, tuple_xmin_gts, + HeapTupleHeaderGetXminTimestamp(tuphdr)); + } + } + } + + xmax = HeapTupleHeaderGetRawXmax(tuphdr); + if (TransactionIdIsNormal(xmax) && + HeapTupleHeaderXmaxCommitted(tuphdr)) + { + GlobalTimestamp tuple_xmax_gts = HeapTupleHeaderGetXmaxTimestampAtomic(tuphdr); + + if (GlobalTimestampIsValid(tuple_xmax_gts) + && !CommitTimestampIsLocal(tuple_xmax_gts) + && !GlobalTimestampIsFrozen(tuple_xmax_gts) + && TransactionIdGetCommitTsData(xmax, &tlog_xmax_gts, NULL) + && tuple_xmax_gts != tlog_xmax_gts) + { + elog(WARNING, + "relfilenode %u " + "pageno %u lineoff %u " + "CTID %hu/%hu/%hu " + "infomask %d multixact %d " + "xid %u xmax %u xmax_gts "INT64_FORMAT" " + "in tuple is not equal to xmax_gts "INT64_FORMAT" in tlog.", + rnode->relNode, + blkno, lineoff, + tuphdr->t_ctid.ip_blkid.bi_hi, + tuphdr->t_ctid.ip_blkid.bi_lo, + tuphdr->t_ctid.ip_posid, + tuphdr->t_infomask, tuphdr->t_infomask & HEAP_XMAX_IS_MULTI, + HeapTupleHeaderGetUpdateXid(tuphdr), xmax, tuple_xmax_gts, + tlog_xmax_gts); + + PrintStack(); + PrintData(rnode, blkno, page, lineoff, 0, tlog_xmax_gts); + + if (reset) + { + changed = true; + HeapTupleHeaderSetXmaxTimestampAtomic(tuphdr, tlog_xmax_gts); + elog(WARNING, + "relfilenode " + "%u pageno %u lineoff %u " + "xmax %u xmax_gts "INT64_FORMAT" " + "in tuple has been reset to xmax_gts "INT64_FORMAT" in tlog.", + rnode->relNode, + blkno, lineoff, + xmax, 
tuple_xmax_gts, + HeapTupleHeaderGetXminTimestamp(tuphdr)); + } + } + } + } + + if (changed) + { + MarkBufferDirtyHint(buffer, true); + } +} diff --git a/src/backend/main/main.c b/src/backend/main/main.c index 32041d32..679afb31 100644 --- a/src/backend/main/main.c +++ b/src/backend/main/main.c @@ -45,6 +45,7 @@ const char *progname; +const char *exename; static void startup_hacks(const char *progname); @@ -58,33 +59,37 @@ static void check_root(const char *progname); */ int main(int argc, char *argv[]) -{// #lizard forgives - bool do_check_root = true; - - progname = get_progname(argv[0]); - - /* - * Platform-specific startup hacks - */ - startup_hacks(progname); - - /* - * Remember the physical location of the initially given argv[] array for - * possible use by ps display. On some platforms, the argv[] storage must - * be overwritten in order to set the process title for ps. In such cases - * save_ps_display_args makes and returns a new copy of the argv[] array. - * - * save_ps_display_args may also move the environment strings to make - * extra room. Therefore this should be done as early as possible during - * startup, to avoid entanglements with code that might save a getenv() - * result pointer. - */ - argv = save_ps_display_args(argc, argv); - - /* - * If supported on the current platform, set up a handler to be called if - * the backend/postmaster crashes with a fatal signal or exception. - */ +{ + bool do_check_root = true; + + progname = get_progname(argv[0]); + /* + * Make a copy. Leaks memory, but called only once. + */ + exename = strdup(argv[0]); + + /* + * Platform-specific startup hacks + */ + startup_hacks(progname); + + /* + * Remember the physical location of the initially given argv[] array for + * possible use by ps display. On some platforms, the argv[] storage must + * be overwritten in order to set the process title for ps. In such cases + * save_ps_display_args makes and returns a new copy of the argv[] array. + * + * save_ps_display_args may also move the environment strings to make + * extra room. Therefore this should be done as early as possible during + * startup, to avoid entanglements with code that might save a getenv() + * result pointer. + */ + argv = save_ps_display_args(argc, argv); + + /* + * If supported on the current platform, set up a handler to be called if + * the backend/postmaster crashes with a fatal signal or exception. 
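+ *
+ * (The exename copy taken near the top of main() serves a related diagnostic
+ * purpose: the maintain-trace stack dumper in vacuumlazy.c resolves frame
+ * addresses against it, roughly
+ *
+ *   snprintf(syscom, MAXPGPATH, "addr2line %p -e %s -f -C", trace[i], exename);
+ *
+ * which is why the copy must be made before argv[] can be clobbered by
+ * save_ps_display_args().)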
+ */ #if defined(WIN32) && defined(HAVE_MINIDUMP_TYPE) pgwin32_install_crashdump_handler(); #endif diff --git a/src/backend/pgxc/pool/poolmgr.c b/src/backend/pgxc/pool/poolmgr.c index 73cf905c..ca079833 100644 --- a/src/backend/pgxc/pool/poolmgr.c +++ b/src/backend/pgxc/pool/poolmgr.c @@ -9805,19 +9805,19 @@ TryPingUnhealthyNode(Oid nodeoid) void PoolPingNodeRecheck(Oid nodeoid) { - int status; - NodeDefinition *nodeDef; - char connstr[MAXPGPATH * 2 + 256]; - bool healthy; - const char *username = NULL; - char *errstr = NULL; - - username = get_user_name(&errstr); - if (errstr != NULL) - { - elog(WARNING, "Could not get current username errmsg: %s", errstr); - return; - } + int status; + NodeDefinition *nodeDef; + char connstr[MAXPGPATH * 2 + 256]; + bool healthy; + const char *username = NULL; + char *errstr = NULL; + + username = get_user_name(&errstr); + if (errstr != NULL) + { + elog(WARNING, "Could not get current username errmsg: %s", errstr); + return; + } nodeDef = PgxcNodeGetDefinition(nodeoid); if (nodeDef == NULL) diff --git a/src/backend/postmaster/auditlogger.c b/src/backend/postmaster/auditlogger.c index 5cf9325c..5aea9c14 100644 --- a/src/backend/postmaster/auditlogger.c +++ b/src/backend/postmaster/auditlogger.c @@ -146,12 +146,12 @@ typedef struct AuditLogQueue { - pid_t q_pid; - int q_size; - slock_t q_lock; - volatile int q_head; - volatile int q_tail; - char q_area[FLEXIBLE_ARRAY_MEMBER]; + pid_t q_pid; + int q_size; + slock_t q_lock; + volatile int q_head; + volatile int q_tail; + char q_area[FLEXIBLE_ARRAY_MEMBER]; } AlogQueue; typedef struct AuditLogQueueArray @@ -163,16 +163,25 @@ typedef struct AuditLogQueueArray typedef struct AuditLogQueueCache { - ThreadSema q_sema; /* local ThreadSema for CommonLogWriter and FGALogWriter */ - int q_count; - AlogQueue * q_cache[FLEXIBLE_ARRAY_MEMBER]; + /* local ThreadSema for CommonLogWriter, FGALogWriter and TraceLogWriter. 
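+ * The cache sits between the per-backend shared-memory queues and the log
+ * files: a consumer thread drains AlogQueue entries from shared memory into
+ * one of these local caches and then signals the matching writer thread
+ * through q_sema.  A hypothetical hand-off (helper names assumed, not part of
+ * this hunk) might look like
+ *
+ *   alog_queue_push(cache->q_cache[idx], msg, len);   -- assumed helper
+ *   ThreadSemaUp(&cache->q_sema);                      -- assumed helper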
*/ + ThreadSema q_sema; + int q_count; + AlogQueue * q_cache[FLEXIBLE_ARRAY_MEMBER]; } AlogQueueCache; -/* shared memory queue array */ -static AlogQueueArray * AuditCommonLogQueueArray = NULL; /* store common audit logs, each elem for a backend */ -static AlogQueueArray * AuditFGALogQueueArray = NULL; /* store fga audit logs, each elem for a backend */ +/* + * shared memory queue array + * + * store common audit logs, each elem for a backend + */ +static AlogQueueArray * AuditCommonLogQueueArray = NULL; +/* store fga audit logs, each elem for a backend */ +static AlogQueueArray * AuditFGALogQueueArray = NULL; +/* store trace audit logs, each elem for a backend */ +static AlogQueueArray * AuditTraceLogQueueArray = NULL; -/* shared memory bitmap to notify consumers to read audit log from AlogQueueArray above +/* + * shared memory bitmap to notify consumers to read audit log from AlogQueueArray above * each element for one consumer */ static int * AuditConsumerNotifyBitmap = NULL; @@ -180,35 +189,59 @@ static int * AuditConsumerNotifyBitmap = NULL; /* * Postgres backend state, used in postgres backend only * - * Postgres backend write common audit log into AuditCommonLogQueueArray->a_queue[idx] + * Postgres backend write common audit log into AuditCommonLogQueueArray->a_queue[idx] * and write fga audit log info AuditFGALogQueueArray->a_queue[idx] + * and write trace audit log info AuditTraceLogQueueArray->a_queue[idx] * - * Postgres backend acqurie free index by AuditLoggerQueueAcquire + * Postgres backend acqurie free index by AuditLoggerQueueAcquire * */ static int AuditPostgresAlogQueueIndex = 0; -/* Consumer local queue cache for AuditLog_max_worker_number consumers, used in audit logger process only */ -static AlogQueueCache * AuditCommonLogLocalCache = NULL; /* store common audit logs, each elem for a thread Consumer */ -static AlogQueueCache * AuditFGALogLocalCache = NULL; /* store fga audit logs, each elem for a trhead Consumer */ +/* + * Consumer local queue cache for AuditLog_max_worker_number consumers, used in + * audit logger process only. + * + * store common audit logs, each elem for a thread Consumer + */ +static AlogQueueCache * AuditCommonLogLocalCache = NULL; +/* store fga audit logs, each elem for a thread Consumer */ +static AlogQueueCache * AuditFGALogLocalCache = NULL; +/* store trace audit logs, each elem for a thread Consumer */ +static AlogQueueCache * AuditTraceLogLocalCache = NULL; -/* local ThreadSema array for AuditLog_max_worker_number consumers, used in audit logger process only */ -static ThreadSema * AuditConsumerNotifySemas = NULL; /* each elem for a trhead Consumer */ +/* + * local ThreadSema array for AuditLog_max_worker_number consumers, used in audit + * logger process only. + * + * each elem for a thread Consumer. + */ +static ThreadSema * AuditConsumerNotifySemas = NULL; /* * GUC parameters. can change at SIGHUP. 
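+ *
+ * The *_queue_size_kb settings below are per backend: every backend gets its
+ * own common, fga and trace queue in shared memory.  As a rough figure
+ * (ignoring the per-queue header), the new trace queues alone cost about
+ *
+ *   MaxBackends * Maintain_trace_log_queue_size_kb
+ *   e.g. 100 backends * 64 kB  ~=  6.4 MB
+ *
+ * which is what audit_shared_trace_queue_array_size() accounts for below.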
*/ -int AuditLog_RotationAge = HOURS_PER_DAY * MINS_PER_HOUR; -int AuditLog_RotationSize = 10 * 1024; -char * AuditLog_filename = NULL; -bool AuditLog_truncate_on_rotation = false; -int AuditLog_file_mode = S_IRUSR | S_IWUSR; - -int AuditLog_max_worker_number = 16; /* max number of worker thead to read audit log */ -int AuditLog_common_log_queue_size_kb = 64; /* size of AlogQueue->q_area for each backend to store common audit log, KB */ -int AuditLog_fga_log_queue_size_kb = 64; /* size of AlogQueue->q_area for each backend to store audit log, KB */ -int AuditLog_common_log_cache_size_kb = 64; /* size of common audit log local buffer for each worker */ -int AuditLog_fga_log_cacae_size_kb = 64; /* size of fga audit log local buffer for eache worker */ +int AuditLog_RotationAge = HOURS_PER_DAY * MINS_PER_HOUR; +int AuditLog_RotationSize = 10 * 1024; +char * AuditLog_filename = NULL; +static char * TraceLog_filename = "maintain-%A-%H.log"; +bool AuditLog_truncate_on_rotation = false; +int AuditLog_file_mode = S_IRUSR | S_IWUSR; + +/* max number of worker thead to read audit log */ +int AuditLog_max_worker_number = 16; +/* size of AlogQueue->q_area for each backend to store common audit log, KB */ +int AuditLog_common_log_queue_size_kb = 64; +/* size of AlogQueue->q_area for each backend to store fga audit log, KB */ +int AuditLog_fga_log_queue_size_kb = 64; +/* size of AlogQueue->q_area for each backend to store trace audit log, KB */ +int Maintain_trace_log_queue_size_kb = 64; +/* size of common audit log local buffer for each worker */ +int AuditLog_common_log_cache_size_kb = 64; +/* size of fga audit log local buffer for each worker */ +int AuditLog_fga_log_cacae_size_kb = 64; +/* size of trace audit log local buffer for each worker */ +int Maintain_trace_log_cache_size_kb = 64; /* * Globally visible state @@ -219,19 +252,24 @@ bool enable_auditlogger_warning = false; /* * Logger Private state */ -static pg_time_t audit_next_rotation_time = 0; -static bool audit_rotation_disabled = false; -static FILE * audit_comm_log_file = NULL; -static FILE * audit_fga_log_file = NULL; -static slock_t audit_comm_log_file_lock; -static slock_t audit_fga_log_file_lock; -NON_EXEC_STATIC pg_time_t audit_first_log_file_time = 0; -static char * audit_last_comm_log_file_name = NULL; -static char * audit_last_fga_log_file_name = NULL; -static char * audit_log_directory = NULL; -static char * audit_curr_log_dir = NULL; -static char * audit_curr_log_file_name = NULL; -static int audit_curr_log_rotation_age = 0; +static pg_time_t audit_next_rotation_time = 0; +static bool audit_rotation_disabled = false; +static FILE * audit_comm_log_file = NULL; +static FILE * audit_fga_log_file = NULL; +static FILE * audit_trace_log_file = NULL; +static slock_t audit_comm_log_file_lock; +static slock_t audit_fga_log_file_lock; +static slock_t audit_trace_log_file_lock; +NON_EXEC_STATIC pg_time_t audit_first_log_file_time = 0; +static char * audit_last_comm_log_file_name = NULL; +static char * audit_last_fga_log_file_name = NULL; +static char * audit_last_trace_log_file_name = NULL; +static char * audit_log_directory = NULL; +static char * trace_log_directory = NULL; +static char * audit_curr_log_dir = NULL; +static char * trace_curr_log_dir = NULL; +static char * audit_curr_log_file_name = NULL; +static int audit_curr_log_rotation_age = 0; /* * Flags set by interrupt handlers for later service in the main loop. 
@@ -273,26 +311,30 @@ static void audit_assign_log_dir(void); #endif #ifdef AuditLog_002_For_ShareMemoryQueue -static Size audit_queue_elem_size(int queue_size_kb); - -static Size audit_shared_queue_array_bitmap_offset(void); -static Size audit_shared_queue_array_header_size(void); -static Size audit_shared_common_queue_elem_size(void); -static Size audit_shared_common_queue_array_size(void); -static Size audit_shared_fga_queue_elem_size(void); -static Size audit_shared_fga_queue_array_size(void); -static Size audit_shared_consumer_bitmap_size(void); -static int audit_shared_consumer_bitmap_get_value(int consumer_id); -static void audit_shared_consumer_bitmap_set_value(int consumer_id, int value); +static Size audit_queue_elem_size(int queue_size_kb); + +static Size audit_shared_queue_array_bitmap_offset(void); +static Size audit_shared_queue_array_header_size(void); +static Size audit_shared_common_queue_elem_size(void); +static Size audit_shared_common_queue_array_size(void); +static Size audit_shared_fga_queue_elem_size(void); +static Size audit_shared_fga_queue_array_size(void); +static Size audit_shared_trace_queue_elem_size(void); +static Size audit_shared_trace_queue_array_size(void); +static Size audit_shared_consumer_bitmap_size(void); +static int audit_shared_consumer_bitmap_get_value(int consumer_id); +static void audit_shared_consumer_bitmap_set_value(int consumer_id, int value); #endif #ifdef AuditLog_003_For_LogFile -static int audit_write_log_file(const char *buffer, int count, int destination); -static FILE * audit_open_log_file(const char *filename, const char *mode, bool allow_errors); -static void aduit_open_fga_log_file(void); -static void audit_rotate_log_file(bool time_based_rotation, int size_rotation_for); -static char * audit_log_file_getname(pg_time_t timestamp, const char *suffix); -static void audit_set_next_rotation_time(void); +static int audit_write_log_file(const char *buffer, int count, int destination); +static FILE * audit_open_log_file(const char *filename, const char *mode, bool allow_errors); +static void audit_open_fga_log_file(void); +static void audit_open_trace_log_file(void); +static void audit_rotate_log_file(bool time_based_rotation, int size_rotation_for); +static char * audit_log_file_getname(pg_time_t timestamp, const char *suffix); +static char * trace_log_file_getname(pg_time_t timestamp, const char *suffix); +static void audit_set_next_rotation_time(void); #endif #ifdef AuditLog_004_For_QueueReadWrite @@ -314,10 +356,12 @@ static bool alog_queue_pop_to_file(AlogQueue * from, int destination); #endif #ifdef AuditLog_005_For_ThreadWorker -static AlogQueue * alog_get_shared_common_queue(int idx); -static AlogQueue * alog_get_shared_fga_queue(int idx); -static AlogQueue * alog_get_local_common_cache(int consumer_id); -static AlogQueue * alog_get_local_fga_cache(int consumer_id); +static AlogQueue * alog_get_shared_common_queue(int idx); +static AlogQueue * alog_get_shared_fga_queue(int idx); +static AlogQueue * alog_get_shared_trace_queue(int idx); +static AlogQueue * alog_get_local_common_cache(int consumer_id); +static AlogQueue * alog_get_local_fga_cache(int consumer_id); +static AlogQueue * alog_get_local_trace_cache(int consumer_id); static AlogQueueCache * alog_make_local_cache(int cache_number, int queue_size_kb); static ThreadSema * alog_make_consumer_semas(int consumer_count); static void alog_consumer_wakeup(int consumer_id); @@ -484,71 +528,75 @@ AuditLoggerMain(int argc, char *argv[]) static void audit_logger_MainLoop(void) { - /* 
- * Create log directory if not present; ignore errors - */ - audit_assign_log_dir(); - mkdir(audit_log_directory, S_IRWXU); - - /* - * Remember active logfile's name. We recompute this from the reference - * time because passing down just the pg_time_t is a lot cheaper than - * passing a whole file path in the EXEC_BACKEND case. - */ - audit_first_log_file_time = time(NULL); - audit_last_comm_log_file_name = audit_log_file_getname(audit_first_log_file_time, NULL); - audit_comm_log_file = audit_open_log_file(audit_last_comm_log_file_name, "a", false); - aduit_open_fga_log_file(); - - /* remember active logfile parameters */ - audit_curr_log_dir = pstrdup(audit_log_directory); - audit_curr_log_file_name = pstrdup(AuditLog_filename); - audit_curr_log_rotation_age = AuditLog_RotationAge; - - SpinLockInit(&(audit_comm_log_file_lock)); - SpinLockInit(&(audit_fga_log_file_lock)); - - /* set next planned rotation time */ - audit_set_next_rotation_time(); - - /* start consumer and writer thread*/ - alog_start_all_worker(); - - /* main worker loop */ - while (PostmasterIsAlive()) - { - int rc = 0; - - /* Clear any already-pending wakeups */ - ResetLatch(MyLatch); - - audit_process_sighup(); - audit_process_sigusr1(); - audit_process_sigusr2(); - audit_process_sigterm(); - audit_process_sigint(); - audit_process_sigquit(); - audit_process_rotate(); - audit_process_wakeup(false); - - rc = WaitLatch(MyLatch, - WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH, - AUDIT_LATCH_MICROSEC, - WAIT_EVENT_AUDIT_LOGGER_MAIN); - - if (rc & WL_POSTMASTER_DEATH) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("audit logger exit after postmaster die"))); - exit(2); - } - else if (rc & WL_TIMEOUT) - { - audit_consume_requested = true; - audit_process_wakeup(true); - } - } + /* + * Create log directory if not present; ignore errors + */ + audit_assign_log_dir(); + mkdir(audit_log_directory, S_IRWXU); + mkdir(trace_log_directory, S_IRWXU); + + /* + * Remember active logfile's name. We recompute this from the reference + * time because passing down just the pg_time_t is a lot cheaper than + * passing a whole file path in the EXEC_BACKEND case. 
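+	 *
+	 * The trace file name comes from TraceLog_filename ("maintain-%A-%H.log");
+	 * assuming trace_log_file_getname() expands it with pg_strftime() the way
+	 * the audit file names are built, %A/%H become weekday and hour-of-day,
+	 * e.g. "maintain-Monday-15.log", so at most 7 * 24 distinct names are
+	 * produced before they repeat.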
+ */ + audit_first_log_file_time = time(NULL); + audit_last_comm_log_file_name = audit_log_file_getname(audit_first_log_file_time, NULL); + audit_comm_log_file = audit_open_log_file(audit_last_comm_log_file_name, "a", false); + audit_open_fga_log_file(); + audit_open_trace_log_file(); + + /* remember active logfile parameters */ + audit_curr_log_dir = pstrdup(audit_log_directory); + trace_curr_log_dir= pstrdup(trace_log_directory); + audit_curr_log_file_name = pstrdup(AuditLog_filename); + audit_curr_log_rotation_age = AuditLog_RotationAge; + + SpinLockInit(&(audit_comm_log_file_lock)); + SpinLockInit(&(audit_fga_log_file_lock)); + SpinLockInit(&(audit_trace_log_file_lock)); + + /* set next planned rotation time */ + audit_set_next_rotation_time(); + + /* start consumer and writer thread*/ + alog_start_all_worker(); + + /* main worker loop */ + while (PostmasterIsAlive()) + { + int rc = 0; + + /* Clear any already-pending wakeups */ + ResetLatch(MyLatch); + + audit_process_sighup(); + audit_process_sigusr1(); + audit_process_sigusr2(); + audit_process_sigterm(); + audit_process_sigint(); + audit_process_sigquit(); + audit_process_rotate(); + audit_process_wakeup(false); + + rc = WaitLatch(MyLatch, + WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH, + AUDIT_LATCH_MICROSEC, + WAIT_EVENT_AUDIT_LOGGER_MAIN); + + if (rc & WL_POSTMASTER_DEATH) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("audit logger exit after postmaster die"))); + exit(2); + } + else if (rc & WL_TIMEOUT) + { + audit_consume_requested = true; + audit_process_wakeup(true); + } + } } /* -------------------------------- @@ -765,101 +813,112 @@ static void audit_process_sigquit(void) } static void audit_process_rotate(void) -{// #lizard forgives - bool time_based_rotation = false; - int size_rotation_for = 0; - pg_time_t now = MyStartTime; - - if (AuditLog_RotationAge > 0 && !audit_rotation_disabled) - { - /* Do a logfile rotation if it's time */ - now = (pg_time_t) time(NULL); - if (now >= audit_next_rotation_time) - audit_rotation_requested = time_based_rotation = true; - } - - if (!audit_rotation_requested && AuditLog_RotationSize > 0 && !audit_rotation_disabled) - { - /* Do a rotation if file is too big */ - if (ftell(audit_comm_log_file) >= AuditLog_RotationSize * 1024L) - { - audit_rotation_requested = true; - size_rotation_for |= AUDIT_COMMON_LOG; - } - - if (audit_fga_log_file != NULL && - ftell(audit_fga_log_file) >= AuditLog_RotationSize * 1024L) - { - audit_rotation_requested = true; - size_rotation_for |= AUDIT_FGA_LOG; - } - } - - if (audit_rotation_requested) - { - /* - * Force rotation when both values are zero. It means the request - * was sent by pg_rotate_log_file. 
- */ - if (!time_based_rotation && size_rotation_for == 0) - size_rotation_for = AUDIT_COMMON_LOG | AUDIT_FGA_LOG; - audit_rotate_log_file(time_based_rotation, size_rotation_for); - } +{ + bool time_based_rotation = false; + int size_rotation_for = 0; + pg_time_t now = MyStartTime; + + if (AuditLog_RotationAge > 0 && !audit_rotation_disabled) + { + /* Do a logfile rotation if it's time */ + now = (pg_time_t) time(NULL); + if (now >= audit_next_rotation_time) + audit_rotation_requested = time_based_rotation = true; + } + + if (!audit_rotation_requested && AuditLog_RotationSize > 0 && !audit_rotation_disabled) + { + /* Do a rotation if file is too big */ + if (ftell(audit_comm_log_file) >= AuditLog_RotationSize * 1024L) + { + audit_rotation_requested = true; + size_rotation_for |= AUDIT_COMMON_LOG; + } + + if (audit_fga_log_file != NULL && + ftell(audit_fga_log_file) >= AuditLog_RotationSize * 1024L) + { + audit_rotation_requested = true; + size_rotation_for |= AUDIT_FGA_LOG; + } + + if (audit_trace_log_file != NULL && + ftell(audit_trace_log_file) >= AuditLog_RotationSize * 1024L) + { + audit_rotation_requested = true; + size_rotation_for |= MAINTAIN_TRACE_LOG; + } + } + + if (audit_rotation_requested) + { + /* + * Force rotation when both values are zero. It means the request + * was sent by pg_rotate_log_file. + */ + if (!time_based_rotation && size_rotation_for == 0) + { + size_rotation_for = AUDIT_COMMON_LOG | AUDIT_FGA_LOG | MAINTAIN_TRACE_LOG; + } + audit_rotate_log_file(time_based_rotation, size_rotation_for); + } } /* * if any audit log was coming, * wakeup a consumer to read */ -static void audit_process_wakeup(bool timeout) -{// #lizard forgives - if (audit_consume_requested) - { - int i = 0; - - if (timeout) - { - for (i = 0; i < MaxBackends; i++) - { - int sharedIdx = i; - int consumer_id = sharedIdx % AuditLog_max_worker_number; - - AlogQueue * shared_common_queue = alog_get_shared_common_queue(sharedIdx); - AlogQueue * shared_fga_queue = alog_get_shared_fga_queue(sharedIdx); - - bool b_common_is_empty = alog_queue_is_empty2(shared_common_queue); - bool b_fga_is_empty = alog_queue_is_empty2(shared_fga_queue); - - pg_memory_barrier(); - - if (!b_common_is_empty || !b_fga_is_empty) - { - if (!audit_shared_consumer_bitmap_get_value(consumer_id)) - { - audit_shared_consumer_bitmap_set_value(consumer_id, 1); - alog_consumer_wakeup(consumer_id); - } - } - } - } - else - { - for (i = 0; i < AuditLog_max_worker_number; i++) - { - int consumer_id = i; - int bitmap_value = audit_shared_consumer_bitmap_get_value(consumer_id); - - pg_memory_barrier(); - - if (bitmap_value) - { - alog_consumer_wakeup(consumer_id); - } - } - } - - audit_consume_requested = false; - } +static void audit_process_wakeup(bool timeout) +{ + if (audit_consume_requested) + { + int i = 0; + + if (timeout) + { + for (i = 0; i < MaxBackends; i++) + { + int sharedIdx = i; + int consumer_id = sharedIdx % AuditLog_max_worker_number; + + AlogQueue * shared_common_queue = alog_get_shared_common_queue(sharedIdx); + AlogQueue * shared_fga_queue = alog_get_shared_fga_queue(sharedIdx); + AlogQueue * shared_trace_queue = alog_get_shared_trace_queue(sharedIdx); + + bool b_common_is_empty = alog_queue_is_empty2(shared_common_queue); + bool b_fga_is_empty = alog_queue_is_empty2(shared_fga_queue); + bool b_trace_is_empty = alog_queue_is_empty2(shared_trace_queue); + + pg_memory_barrier(); + + if (!b_common_is_empty || !b_fga_is_empty || !b_trace_is_empty) + { + if (!audit_shared_consumer_bitmap_get_value(consumer_id)) + { + 
audit_shared_consumer_bitmap_set_value(consumer_id, 1); + alog_consumer_wakeup(consumer_id); + } + } + } + } + else + { + for (i = 0; i < AuditLog_max_worker_number; i++) + { + int consumer_id = i; + int bitmap_value = audit_shared_consumer_bitmap_get_value(consumer_id); + + pg_memory_barrier(); + + if (bitmap_value) + { + alog_consumer_wakeup(consumer_id); + } + } + } + + audit_consume_requested = false; + } } #endif @@ -887,16 +946,19 @@ static void audit_process_wakeup(bool timeout) * | | | | | | * | | | | |<-------- audit_shared_common_queue_elem_size() * MaxBackends ---------->| * |<- 4B -> |<------ 4B ------>|<- 4 * MaxBackends B ->|<- AUDIT_BITMAP_SIZE ->|<--------- audit_shared_fga_queue_elem_size() * MaxBackends ------------>| + * | | | | |<-------- audit_shared_trace_queue_elem_size() * MaxBackends ----------->| * | | | * | | | - * |<------------------------- Shared Log Queue Array Header ------------------>|<------------------- AlogQueue Array [MaxBackends] --------------------->| + * |<------------------------- Shared Log Queue Array Header ------------------>|<------------------- AlogQueue Array [MaxBackends] --------------------->| * * * 02. AlogQueue as follows * - * | q_area -> char[AuditLog_common_log_queue_size_kb * BYTES_PER_KB] | + * | q_area -> char[AuditLog_common_log_queue_size_kb * BYTES_PER_KB] | * | q_pid | q_size | q_lock | q_head | q_tail | OR | * | q_area -> char[AuditLog_fga_log_queue_size_kb * BYTES_PER_KB] | + * | OR | + * | q_area -> char[Maintain_trace_log_queue_size_kb * BYTES_PER_KB] | * * -------------------------------- */ @@ -905,13 +967,13 @@ static Size audit_shared_queue_array_bitmap_offset(void) { Size alogQueueArrayBitmapOffset = 0; - /* store AlogQueueArray->a_count, a_bitmap */ - alogQueueArrayBitmapOffset = add_size(alogQueueArrayBitmapOffset, - offsetof(AlogQueueArray, a_queue)); + /* store AlogQueueArray->a_count, a_bitmap */ + alogQueueArrayBitmapOffset = add_size(alogQueueArrayBitmapOffset, + offsetof(AlogQueueArray, a_queue)); - /* store AlogQueueArray->a_queue */ - alogQueueArrayBitmapOffset = add_size(alogQueueArrayBitmapOffset, - mul_size(sizeof(AlogQueue *), MaxBackends)); + /* store AlogQueueArray->a_queue */ + alogQueueArrayBitmapOffset = add_size(alogQueueArrayBitmapOffset, + mul_size(sizeof(AlogQueue *), MaxBackends)); return alogQueueArrayBitmapOffset; } @@ -920,13 +982,13 @@ static Size audit_shared_queue_array_header_size(void) { Size alogQueueArrayHeaderSize = 0; - /* store AlogQueueArray->a_count, a_bitmap, a_queue */ - alogQueueArrayHeaderSize = add_size(alogQueueArrayHeaderSize, - audit_shared_queue_array_bitmap_offset()); + /* store AlogQueueArray->a_count, a_bitmap, a_queue */ + alogQueueArrayHeaderSize = add_size(alogQueueArrayHeaderSize, + audit_shared_queue_array_bitmap_offset()); - /* store content of AlogQueueArray->a_bitmap */ - alogQueueArrayHeaderSize = add_size(alogQueueArrayHeaderSize, - AUDIT_BITMAP_SIZE); + /* store content of AlogQueueArray->a_bitmap */ + alogQueueArrayHeaderSize = add_size(alogQueueArrayHeaderSize, + AUDIT_BITMAP_SIZE); return alogQueueArrayHeaderSize; } @@ -989,6 +1051,30 @@ static Size audit_shared_fga_queue_array_size(void) return alogFgaQueueSize; } +static Size audit_shared_trace_queue_elem_size(void) +{ + Size alogTraceQueueItemSize = 0; + + /* store content of trace audit log */ + alogTraceQueueItemSize = audit_queue_elem_size(Maintain_trace_log_queue_size_kb); + + return alogTraceQueueItemSize; +} + +static Size audit_shared_trace_queue_array_size(void) +{ + Size alogTraceQueueSize = 
0; + + /* store content of trace audit log */ + alogTraceQueueSize = audit_shared_trace_queue_elem_size(); + alogTraceQueueSize = mul_size(alogTraceQueueSize, MaxBackends); + + alogTraceQueueSize = add_size(alogTraceQueueSize, + audit_shared_queue_array_header_size()); + + return alogTraceQueueSize; +} + static Size audit_shared_consumer_bitmap_size(void) { Size alogConsumerBitmapSize = 0; @@ -1012,10 +1098,11 @@ static void audit_shared_consumer_bitmap_set_value(int consumer_id, int value) Size AuditLoggerShmemSize(void) { - Size size = 0; - Size alogCommonQueueSize = 0; - Size alogFgaQueueSize = 0; - Size alogConsumerBmpSize = 0; + Size size = 0; + Size alogCommonQueueSize = 0; + Size alogFgaQueueSize = 0; + Size alogTraceQueueSize = 0; + Size alogConsumerBmpSize = 0; /* for common audit log */ alogCommonQueueSize = audit_shared_common_queue_array_size(); @@ -1023,173 +1110,245 @@ Size AuditLoggerShmemSize(void) /* for fga audit log*/ alogFgaQueueSize = audit_shared_fga_queue_array_size(); - /* for consumer notify bitmap */ - alogConsumerBmpSize = audit_shared_consumer_bitmap_size(); + /* for trace audit log */ + alogTraceQueueSize = audit_shared_trace_queue_array_size(); + + /* for consumer notify bitmap */ + alogConsumerBmpSize = audit_shared_consumer_bitmap_size(); - /* for total size */ - size = add_size(alogCommonQueueSize, alogFgaQueueSize); - size = add_size(size, alogConsumerBmpSize); + /* for total size */ + size = add_size(alogCommonQueueSize, alogFgaQueueSize); + size = add_size(size, alogTraceQueueSize); + size = add_size(size, alogConsumerBmpSize); return size; } void AuditLoggerShmemInit(void) -{// #lizard forgives - Size alogBmpOffset = 0; - Size alogHeaderSize = 0; - Size alogItemSize = 0; - Size alogArraySize = 0; - Size alogConsumerBmpSize = 0; - - bool found = false; - int i = 0; - - alogBmpOffset = audit_shared_queue_array_bitmap_offset(); - alogHeaderSize = audit_shared_queue_array_header_size(); - alogItemSize = audit_shared_common_queue_elem_size(); - alogArraySize = audit_shared_common_queue_array_size(); - - AuditCommonLogQueueArray = ShmemInitStruct("Audit Common Log Queue", - alogArraySize, - &found); - /* Mark it empty upon creation */ - if (!found) - { - AlogQueueArray * alogQueueArray = AuditCommonLogQueueArray; - int falogQueueArray = 0; - Size sharedMemSize = 0; - - if (enable_auditlogger_warning) - { - sharedMemSize += alogHeaderSize; - MemSet(alogQueueArray, 'a', alogHeaderSize); - - for (i = 0; i < MaxBackends; i++) - { - AlogQueue * alogQueueItem = NULL; - - alogQueueItem = (AlogQueue *)(((char *) alogQueueArray) + alogHeaderSize + i * alogItemSize); - sharedMemSize += audit_shared_common_queue_elem_size(); - MemSet(alogQueueItem, 'b', audit_shared_common_queue_elem_size()); - } - - falogQueueArray = BasicOpenFile("AuditCommonLogQueueArray.txt", O_RDWR | O_TRUNC | O_CREAT, S_IRUSR | S_IWUSR); - if (falogQueueArray != -1) - { - write(falogQueueArray, alogQueueArray, alogArraySize); - write(falogQueueArray, "\nNew Line\n", strlen("\nNew Line\n")); - } - - Assert(sharedMemSize == alogArraySize); - } - - MemSet(alogQueueArray, 0, alogArraySize); - - alogQueueArray->a_count = MaxBackends; - alogQueueArray->a_bitmap = bms_make(((char *) alogQueueArray) + alogBmpOffset, - MaxBackends); - for (i = 0; i < MaxBackends; i++) - { - AlogQueue * alogQueueItem = NULL; - - alogQueueItem = (AlogQueue *)(((char *) alogQueueArray) + alogHeaderSize + i * alogItemSize); - - alog_queue_init(alogQueueItem, AuditLog_common_log_queue_size_kb); - - alogQueueArray->a_queue[i] = 
alogQueueItem; - } - - if (enable_auditlogger_warning) - { - if (falogQueueArray != -1) - { - write(falogQueueArray, alogQueueArray, alogArraySize); - close(falogQueueArray); - } - } - } - - found = false; - i = 0; - - alogBmpOffset = audit_shared_queue_array_bitmap_offset(); - alogHeaderSize = audit_shared_queue_array_header_size(); - alogItemSize = audit_shared_fga_queue_elem_size(); - alogArraySize = audit_shared_fga_queue_array_size(); - - AuditFGALogQueueArray = ShmemInitStruct("Audit FGA Log Queue", - alogArraySize, - &found); - /* Mark it empty upon creation */ - if (!found) - { - AlogQueueArray * alogQueueArray = AuditFGALogQueueArray; - int falogQueueArray = 0; - Size sharedMemSize = 0; - - if (enable_auditlogger_warning) - { - sharedMemSize += alogHeaderSize; - MemSet(alogQueueArray, 'c', alogHeaderSize); - - for (i = 0; i < MaxBackends; i++) - { - AlogQueue * alogQueueItem = NULL; - - alogQueueItem = (AlogQueue *)(((char *) alogQueueArray) + alogHeaderSize + i * alogItemSize); - sharedMemSize += audit_shared_fga_queue_elem_size(); - MemSet(alogQueueItem, 'd', audit_shared_fga_queue_elem_size()); - } - - falogQueueArray = BasicOpenFile("AuditFGALogQueueArray.txt", O_RDWR | O_TRUNC | O_CREAT, S_IRUSR | S_IWUSR); - if (falogQueueArray != -1) - { - write(falogQueueArray, alogQueueArray, alogArraySize); - write(falogQueueArray, "\nNew Line\n", strlen("\nNew Line\n")); - } - - Assert(sharedMemSize == alogArraySize); - } - - MemSet(alogQueueArray, 0, alogArraySize); - - alogQueueArray->a_count = MaxBackends; - alogQueueArray->a_bitmap = bms_make(((char *) alogQueueArray) + alogBmpOffset, - MaxBackends); - for (i = 0; i < MaxBackends; i++) - { - AlogQueue * alogQueueItem = NULL; - - alogQueueItem = (AlogQueue *)(((char *) alogQueueArray) + alogHeaderSize + i * alogItemSize); - - alog_queue_init(alogQueueItem, AuditLog_fga_log_queue_size_kb); - - alogQueueArray->a_queue[i] = alogQueueItem; - } - - if (enable_auditlogger_warning) - { - if (falogQueueArray != -1) - { - write(falogQueueArray, alogQueueArray, alogArraySize); - close(falogQueueArray); - } - } - } - - found = false; - i = 0; - - alogConsumerBmpSize = audit_shared_consumer_bitmap_size(); - - AuditConsumerNotifyBitmap = ShmemInitStruct("Audit Consumer Bitmap", - alogConsumerBmpSize, - &found); - /* Mark it empty upon creation */ - if (!found) - { - MemSet(AuditConsumerNotifyBitmap, 0, alogConsumerBmpSize); - } +{ + Size alogBmpOffset = 0; + Size alogHeaderSize = 0; + Size alogItemSize = 0; + Size alogArraySize = 0; + Size alogConsumerBmpSize = 0; + + bool found = false; + int i = 0; + + alogBmpOffset = audit_shared_queue_array_bitmap_offset(); + alogHeaderSize = audit_shared_queue_array_header_size(); + alogItemSize = audit_shared_common_queue_elem_size(); + alogArraySize = audit_shared_common_queue_array_size(); + + AuditCommonLogQueueArray = ShmemInitStruct("Audit Common Log Queue", + alogArraySize, + &found); + /* Mark it empty upon creation */ + if (!found) + { + AlogQueueArray * alogQueueArray = AuditCommonLogQueueArray; + int falogQueueArray = 0; + Size sharedMemSize = 0; + + if (enable_auditlogger_warning) + { + sharedMemSize += alogHeaderSize; + MemSet(alogQueueArray, 'a', alogHeaderSize); + + for (i = 0; i < MaxBackends; i++) + { + AlogQueue * alogQueueItem = NULL; + + alogQueueItem = (AlogQueue *)(((char *) alogQueueArray) + alogHeaderSize + i * alogItemSize); + sharedMemSize += audit_shared_common_queue_elem_size(); + MemSet(alogQueueItem, 'b', audit_shared_common_queue_elem_size()); + } + + falogQueueArray = 
BasicOpenFile("AuditCommonLogQueueArray.txt", O_RDWR | O_TRUNC | O_CREAT, S_IRUSR | S_IWUSR); + if (falogQueueArray != -1) + { + write(falogQueueArray, alogQueueArray, alogArraySize); + write(falogQueueArray, "\nNew Line\n", strlen("\nNew Line\n")); + } + + Assert(sharedMemSize == alogArraySize); + } + + MemSet(alogQueueArray, 0, alogArraySize); + + alogQueueArray->a_count = MaxBackends; + alogQueueArray->a_bitmap = bms_make(((char *) alogQueueArray) + alogBmpOffset, + MaxBackends); + for (i = 0; i < MaxBackends; i++) + { + AlogQueue * alogQueueItem = NULL; + + alogQueueItem = (AlogQueue *)(((char *) alogQueueArray) + alogHeaderSize + i * alogItemSize); + + alog_queue_init(alogQueueItem, AuditLog_common_log_queue_size_kb); + + alogQueueArray->a_queue[i] = alogQueueItem; + } + + if (enable_auditlogger_warning) + { + if (falogQueueArray != -1) + { + write(falogQueueArray, alogQueueArray, alogArraySize); + close(falogQueueArray); + } + } + } + + found = false; + i = 0; + + alogBmpOffset = audit_shared_queue_array_bitmap_offset(); + alogHeaderSize = audit_shared_queue_array_header_size(); + alogItemSize = audit_shared_fga_queue_elem_size(); + alogArraySize = audit_shared_fga_queue_array_size(); + + AuditFGALogQueueArray = ShmemInitStruct("Audit FGA Log Queue", + alogArraySize, + &found); + /* Mark it empty upon creation */ + if (!found) + { + AlogQueueArray * alogQueueArray = AuditFGALogQueueArray; + int falogQueueArray = 0; + Size sharedMemSize = 0; + + if (enable_auditlogger_warning) + { + sharedMemSize += alogHeaderSize; + MemSet(alogQueueArray, 'c', alogHeaderSize); + + for (i = 0; i < MaxBackends; i++) + { + AlogQueue * alogQueueItem = NULL; + + alogQueueItem = (AlogQueue *)(((char *) alogQueueArray) + alogHeaderSize + i * alogItemSize); + sharedMemSize += audit_shared_fga_queue_elem_size(); + MemSet(alogQueueItem, 'd', audit_shared_fga_queue_elem_size()); + } + + falogQueueArray = BasicOpenFile("AuditFGALogQueueArray.txt", O_RDWR | O_TRUNC | O_CREAT, S_IRUSR | S_IWUSR); + if (falogQueueArray != -1) + { + write(falogQueueArray, alogQueueArray, alogArraySize); + write(falogQueueArray, "\nNew Line\n", strlen("\nNew Line\n")); + } + + Assert(sharedMemSize == alogArraySize); + } + + MemSet(alogQueueArray, 0, alogArraySize); + + alogQueueArray->a_count = MaxBackends; + alogQueueArray->a_bitmap = bms_make(((char *) alogQueueArray) + alogBmpOffset, + MaxBackends); + for (i = 0; i < MaxBackends; i++) + { + AlogQueue * alogQueueItem = NULL; + + alogQueueItem = (AlogQueue *)(((char *) alogQueueArray) + alogHeaderSize + i * alogItemSize); + + alog_queue_init(alogQueueItem, AuditLog_fga_log_queue_size_kb); + + alogQueueArray->a_queue[i] = alogQueueItem; + } + + if (enable_auditlogger_warning) + { + if (falogQueueArray != -1) + { + write(falogQueueArray, alogQueueArray, alogArraySize); + close(falogQueueArray); + } + } + } + + found = false; + i = 0; + + alogBmpOffset = audit_shared_queue_array_bitmap_offset(); + alogHeaderSize = audit_shared_queue_array_header_size(); + alogItemSize = audit_shared_trace_queue_elem_size(); + alogArraySize = audit_shared_trace_queue_array_size(); + + AuditTraceLogQueueArray = ShmemInitStruct("Audit Trace Log Queue", + alogArraySize, + &found); + /* Mark it empty upon creation */ + if (!found) + { + AlogQueueArray * alogQueueArray = AuditTraceLogQueueArray; + int falogQueueArray = 0; + Size sharedMemSize = 0; + + if (enable_auditlogger_warning) + { + sharedMemSize += alogHeaderSize; + MemSet(alogQueueArray, 'e', alogHeaderSize); + + for (i = 0; i < MaxBackends; i++) + { 
+ AlogQueue * alogQueueItem = NULL; + + alogQueueItem = (AlogQueue *)(((char *) alogQueueArray) + alogHeaderSize + i * alogItemSize); + sharedMemSize += audit_shared_trace_queue_elem_size(); + MemSet(alogQueueItem, 'f', audit_shared_trace_queue_elem_size()); + } + + falogQueueArray = BasicOpenFile("AuditTraceLogQueueArray.txt", O_RDWR | O_TRUNC | O_CREAT, S_IRUSR | S_IWUSR); + if (falogQueueArray != -1) + { + write(falogQueueArray, alogQueueArray, alogArraySize); + write(falogQueueArray, "\nNew Line\n", strlen("\nNew Line\n")); + } + + Assert(sharedMemSize == alogArraySize); + } + + MemSet(alogQueueArray, 0, alogArraySize); + + alogQueueArray->a_count = MaxBackends; + alogQueueArray->a_bitmap = bms_make(((char *) alogQueueArray) + alogBmpOffset, + MaxBackends); + for (i = 0; i < MaxBackends; i++) + { + AlogQueue * alogQueueItem = NULL; + + alogQueueItem = (AlogQueue *)(((char *) alogQueueArray) + alogHeaderSize + i * alogItemSize); + + alog_queue_init(alogQueueItem, Maintain_trace_log_queue_size_kb); + + alogQueueArray->a_queue[i] = alogQueueItem; + } + + if (enable_auditlogger_warning) + { + if (falogQueueArray != -1) + { + write(falogQueueArray, alogQueueArray, alogArraySize); + close(falogQueueArray); + } + } + } + + found = false; + i = 0; + + alogConsumerBmpSize = audit_shared_consumer_bitmap_size(); + + AuditConsumerNotifyBitmap = ShmemInitStruct("Audit Consumer Bitmap", + alogConsumerBmpSize, + &found); + /* Mark it empty upon creation */ + if (!found) + { + MemSet(AuditConsumerNotifyBitmap, 0, alogConsumerBmpSize); + } } #endif @@ -1204,7 +1363,7 @@ void AuditLoggerShmemInit(void) /* * Make audit_log_directory from Log_directory */ -static void +static void audit_assign_log_dir(void) { if (audit_log_directory != NULL) @@ -1221,12 +1380,19 @@ audit_assign_log_dir(void) { StringInfoData alog_dir; - memset(&alog_dir, 0, sizeof(alog_dir)); - initStringInfo(&alog_dir); - appendStringInfo(&alog_dir, "%s/audit", Log_directory); + memset(&alog_dir, 0, sizeof(alog_dir)); + initStringInfo(&alog_dir); + appendStringInfo(&alog_dir, "%s/audit", Log_directory); - audit_log_directory = alog_dir.data; - } + audit_log_directory = alog_dir.data; + } + + if (trace_log_directory != NULL) + { + pfree(trace_log_directory); + trace_log_directory = NULL; + } + trace_log_directory = pstrdup("pg_log/maintain"); } /* @@ -1241,15 +1407,19 @@ audit_write_log_file(const char *buffer, int count, int destination) { int rc = 0; - if (destination == AUDIT_FGA_LOG) - { - rc = fwrite(buffer, 1, count, audit_fga_log_file); - } - else - { - Assert(destination == AUDIT_COMMON_LOG); - rc = fwrite(buffer, 1, count, audit_comm_log_file); - } + if (destination == AUDIT_FGA_LOG) + { + rc = fwrite(buffer, 1, count, audit_fga_log_file); + } + else if (destination == MAINTAIN_TRACE_LOG) + { + rc = fwrite(buffer, 1, count, audit_trace_log_file); + } + else + { + Assert(destination == AUDIT_COMMON_LOG); + rc = fwrite(buffer, 1, count, audit_comm_log_file); + } /* can't use ereport here because of possible recursion */ if (rc != count) @@ -1262,7 +1432,7 @@ audit_write_log_file(const char *buffer, int count, int destination) } static void -aduit_open_fga_log_file(void) +audit_open_fga_log_file(void) { char *filename = NULL; @@ -1276,6 +1446,25 @@ aduit_open_fga_log_file(void) audit_last_fga_log_file_name = filename; } + +static void +audit_open_trace_log_file(void) +{ + char *filename = NULL; + + filename = trace_log_file_getname(time(NULL), ".trace"); + + audit_trace_log_file = audit_open_log_file(filename, "a", false); + + if 
(audit_last_trace_log_file_name != NULL) + { + /* probably shouldn't happen */ + pfree(audit_last_trace_log_file_name); + } + + audit_last_trace_log_file_name = filename; +} + /* * Open a new logfile with proper permissions and buffering options. * @@ -1325,128 +1514,192 @@ audit_open_log_file(const char *filename, const char *mode, bool allow_errors) */ static void audit_rotate_log_file(bool time_based_rotation, int size_rotation_for) -{// #lizard forgives - char *filename = NULL; - char *fgafilename = NULL; - pg_time_t fntime = 0; - FILE *fh = NULL; - - audit_rotation_requested = false; - - /* - * When doing a time-based rotation, invent the new logfile name based on - * the planned rotation time, not current time, to avoid "slippage" in the - * file name when we don't do the rotation immediately. - */ - if (time_based_rotation) - fntime = audit_next_rotation_time; - else - fntime = time(NULL); - filename = audit_log_file_getname(fntime, NULL); - if (audit_fga_log_file != NULL) - fgafilename = audit_log_file_getname(fntime, ".fga"); - - /* - * Decide whether to overwrite or append. We can overwrite if (a) - * AuditLog_truncate_on_rotation is set, (b) the rotation was triggered by - * elapsed time and not something else, and (c) the computed file name is - * different from what we were previously logging into. - * - * Note: audit_last_comm_log_file_name should never be NULL here, but if it is, append. - */ - if (time_based_rotation || (size_rotation_for & AUDIT_COMMON_LOG)) - { - if (AuditLog_truncate_on_rotation && time_based_rotation && - audit_last_comm_log_file_name != NULL && - strcmp(filename, audit_last_comm_log_file_name) != 0) - fh = audit_open_log_file(filename, "w", true); - else - fh = audit_open_log_file(filename, "a", true); - - if (!fh) - { - /* - * ENFILE/EMFILE are not too surprising on a busy system; just - * keep using the old file till we manage to get a new one. - * Otherwise, assume something's wrong with audit_log_directory and stop - * trying to create files. - */ - if (errno != ENFILE && errno != EMFILE) - { - ereport(LOG, - (errmsg("disabling automatic rotation audit log file (use SIGHUP to re-enable)"))); - audit_rotation_disabled = true; - } - - pfree(filename); - if (fgafilename) - pfree(fgafilename); - return; - } - - SpinLockAcquire(&(audit_comm_log_file_lock)); - fclose(audit_comm_log_file); - audit_comm_log_file = fh; - SpinLockRelease(&(audit_comm_log_file_lock)); - - /* instead of pfree'ing filename, remember it for next time */ - if (audit_last_comm_log_file_name != NULL) - pfree(audit_last_comm_log_file_name); - audit_last_comm_log_file_name = filename; - filename = NULL; - } - - /* Same as above, but for fga audit log file. */ - - if (audit_fga_log_file != NULL && - (time_based_rotation || (size_rotation_for & AUDIT_FGA_LOG))) - { - if (AuditLog_truncate_on_rotation && time_based_rotation && - audit_last_fga_log_file_name != NULL && - strcmp(fgafilename, audit_last_fga_log_file_name) != 0) - fh = audit_open_log_file(fgafilename, "w", true); - else - fh = audit_open_log_file(fgafilename, "a", true); - - if (!fh) - { - /* - * ENFILE/EMFILE are not too surprising on a busy system; just - * keep using the old file till we manage to get a new one. - * Otherwise, assume something's wrong with audit_log_directory and stop - * trying to create files. 
- */ - if (errno != ENFILE && errno != EMFILE) - { - ereport(LOG, - (errmsg("disabling automatic rotation audit log file (use SIGHUP to re-enable)"))); - audit_rotation_disabled = true; - } - - if (filename) - pfree(filename); - pfree(fgafilename); - return; - } - - SpinLockAcquire(&(audit_fga_log_file_lock)); - fclose(audit_fga_log_file); - audit_fga_log_file = fh; - SpinLockRelease(&(audit_fga_log_file_lock)); - - /* instead of pfree'ing filename, remember it for next time */ - if (audit_last_fga_log_file_name != NULL) - pfree(audit_last_fga_log_file_name); - audit_last_fga_log_file_name = fgafilename; - fgafilename = NULL; - } - - if (filename) - pfree(filename); - if (fgafilename) - pfree(fgafilename); - - audit_set_next_rotation_time(); +{ + char *filename = NULL; + char *fgafilename = NULL; + char *tracefilename = NULL; + pg_time_t fntime = 0; + FILE *fh = NULL; + + audit_rotation_requested = false; + + /* + * When doing a time-based rotation, invent the new logfile name based on + * the planned rotation time, not current time, to avoid "slippage" in the + * file name when we don't do the rotation immediately. + */ + if (time_based_rotation) + fntime = audit_next_rotation_time; + else + fntime = time(NULL); + filename = audit_log_file_getname(fntime, NULL); + if (audit_fga_log_file != NULL) + fgafilename = audit_log_file_getname(fntime, ".fga"); + if (audit_trace_log_file != NULL) + tracefilename = trace_log_file_getname(fntime, ".trace"); + + /* + * Decide whether to overwrite or append. We can overwrite if (a) + * AuditLog_truncate_on_rotation is set, (b) the rotation was triggered by + * elapsed time and not something else, and (c) the computed file name is + * different from what we were previously logging into. + * + * Note: audit_last_comm_log_file_name should never be NULL here, but if it is, append. + */ + if (time_based_rotation || (size_rotation_for & AUDIT_COMMON_LOG)) + { + if (AuditLog_truncate_on_rotation && time_based_rotation && + audit_last_comm_log_file_name != NULL && + strcmp(filename, audit_last_comm_log_file_name) != 0) + fh = audit_open_log_file(filename, "w", true); + else + fh = audit_open_log_file(filename, "a", true); + + if (!fh) + { + /* + * ENFILE/EMFILE are not too surprising on a busy system; just + * keep using the old file till we manage to get a new one. + * Otherwise, assume something's wrong with audit_log_directory and stop + * trying to create files. + */ + if (errno != ENFILE && errno != EMFILE) + { + ereport(LOG, + (errmsg("disabling automatic rotation audit log file (use SIGHUP to re-enable)"))); + audit_rotation_disabled = true; + } + + if (filename) + pfree(filename); + if (fgafilename) + pfree(fgafilename); + if (tracefilename) + pfree(tracefilename); + return; + } + + SpinLockAcquire(&(audit_comm_log_file_lock)); + fclose(audit_comm_log_file); + audit_comm_log_file = fh; + SpinLockRelease(&(audit_comm_log_file_lock)); + + /* instead of pfree'ing filename, remember it for next time */ + if (audit_last_comm_log_file_name != NULL) + pfree(audit_last_comm_log_file_name); + audit_last_comm_log_file_name = filename; + filename = NULL; + } + + /* Same as above, but for fga audit log file. 
*/ + if (audit_fga_log_file != NULL && + (time_based_rotation || (size_rotation_for & AUDIT_FGA_LOG))) + { + if (AuditLog_truncate_on_rotation && time_based_rotation && + audit_last_fga_log_file_name != NULL && + strcmp(fgafilename, audit_last_fga_log_file_name) != 0) + fh = audit_open_log_file(fgafilename, "w", true); + else + fh = audit_open_log_file(fgafilename, "a", true); + + if (!fh) + { + /* + * ENFILE/EMFILE are not too surprising on a busy system; just + * keep using the old file till we manage to get a new one. + * Otherwise, assume something's wrong with audit_log_directory and stop + * trying to create files. + */ + if (errno != ENFILE && errno != EMFILE) + { + ereport(LOG, + (errmsg("disabling automatic rotation audit log file (use SIGHUP to re-enable)"))); + audit_rotation_disabled = true; + } + + if (filename) + pfree(filename); + if (fgafilename) + pfree(fgafilename); + if (tracefilename) + pfree(tracefilename); + return; + } + + SpinLockAcquire(&(audit_fga_log_file_lock)); + fclose(audit_fga_log_file); + audit_fga_log_file = fh; + SpinLockRelease(&(audit_fga_log_file_lock)); + + /* instead of pfree'ing filename, remember it for next time */ + if (audit_last_fga_log_file_name != NULL) + pfree(audit_last_fga_log_file_name); + audit_last_fga_log_file_name = fgafilename; + fgafilename = NULL; + } + + /* Same as above, but for trace audit log file. */ + if (audit_trace_log_file != NULL && + (time_based_rotation || (size_rotation_for & MAINTAIN_TRACE_LOG))) + { + /* + * Only append write,do not consider overwrite for maintain trace log. + * That is different from audit common log and fga log. + * + if (AuditLog_truncate_on_rotation && time_based_rotation && + audit_last_trace_log_file_name != NULL && + strcmp(tracefilename, audit_last_trace_log_file_name) != 0) + fh = audit_open_log_file(tracefilename, "w", true); + else + */ + { + fh = audit_open_log_file(tracefilename, "a", true); + } + + if (!fh) + { + /* + * ENFILE/EMFILE are not too surprising on a busy system; just + * keep using the old file till we manage to get a new one. + * Otherwise, assume something's wrong with audit_log_directory and stop + * trying to create files. + */ + if (errno != ENFILE && errno != EMFILE) + { + ereport(LOG, + (errmsg("disabling automatic rotation audit log file (use SIGHUP to re-enable)"))); + audit_rotation_disabled = true; + } + + if (filename) + pfree(filename); + if (fgafilename) + pfree(fgafilename); + if (tracefilename) + pfree(tracefilename); + return; + } + + SpinLockAcquire(&(audit_trace_log_file_lock)); + fclose(audit_trace_log_file); + audit_trace_log_file = fh; + SpinLockRelease(&(audit_trace_log_file_lock)); + + /* instead of pfree'ing filename, remember it for next time */ + if (audit_last_trace_log_file_name != NULL) + pfree(audit_last_trace_log_file_name); + audit_last_trace_log_file_name = tracefilename; + tracefilename = NULL; + } + + if (filename) + pfree(filename); + if (fgafilename) + pfree(fgafilename); + if (tracefilename) + pfree(tracefilename); + + audit_set_next_rotation_time(); } /* @@ -1484,6 +1737,42 @@ audit_log_file_getname(pg_time_t timestamp, const char *suffix) return filename; } +/* + * construct logfile name using timestamp information. + * acoording to audit_log_file_getname(). + * + * If suffix isn't NULL, append it to the name, replacing any ".log" + * that may be in the pattern. + * + * Result is palloc'd. 
+ */ +static char * +trace_log_file_getname(pg_time_t timestamp, const char *suffix) +{ + char *filename = NULL; + int len = 0; + + filename = palloc(MAXPGPATH); + + snprintf(filename, MAXPGPATH, "%s/", trace_log_directory); + + len = strlen(filename); + + /* treat AuditLog_filename as a strftime pattern */ + pg_strftime(filename + len, MAXPGPATH - len, TraceLog_filename, + pg_localtime(&timestamp, log_timezone)); + + if (suffix != NULL) + { + len = strlen(filename); + if (len > 4 && (strcmp(filename + (len - 4), ".log") == 0)) + len -= 4; + strlcpy(filename + len, suffix, MAXPGPATH - len); + } + + return filename; +} + /* * Determine the next planned rotation time, and store in audit_next_rotation_time. */ @@ -1589,9 +1878,9 @@ static bool alog_queue_is_empty(int q_size, int q_head, int q_tail) static bool alog_queue_is_empty2(AlogQueue * queue) { - volatile int q_head = queue->q_head; - volatile int q_tail = queue->q_tail; - volatile int q_size = queue->q_size; + volatile int q_head = queue->q_head; + volatile int q_tail = queue->q_tail; + volatile int q_size = queue->q_size; pg_memory_barrier(); @@ -1603,12 +1892,12 @@ static bool alog_queue_is_empty2(AlogQueue * queue) */ static int alog_queue_used(int q_size, int q_head, int q_tail) { - int used = (q_tail - q_head + q_size) % q_size; + int used = (q_tail - q_head + q_size) % q_size; Assert(q_size > 0 && q_head >= 0 && q_tail >= 0); Assert(q_head < q_size && q_tail < q_size); - return used; + return used; } /* @@ -1662,9 +1951,9 @@ static bool alog_queue_push(AlogQueue * queue, char * buff, int len) * write buff1 and buff2 to queue */ static bool alog_queue_push2(AlogQueue * queue, char * buff1, int len1, char * buff2, int len2) -{ - char * buff_array[] = {buff1, buff2}; - int len_array[] = {len1, len2}; +{ + char * buff_array[] = {buff1, buff2}; + int len_array[] = {len1, len2}; return alog_queue_pushn(queue, buff_array, len_array, sizeof(len_array)/sizeof(len_array[0])); } @@ -1673,25 +1962,25 @@ static bool alog_queue_push2(AlogQueue * queue, char * buff1, int len1, char * b * write n buffs to queue */ static bool alog_queue_pushn(AlogQueue * queue, char * buff[], int len[], int n) -{// #lizard forgives - volatile int q_head = queue->q_head; - volatile int q_tail = queue->q_tail; - volatile int q_size = queue->q_size; +{ + volatile int q_head = queue->q_head; + volatile int q_tail = queue->q_tail; + volatile int q_size = queue->q_size; - int q_head_before = q_head; - int q_tail_before = q_tail; - int q_size_before = q_size; + int q_head_before = q_head; + int q_tail_before = q_tail; + int q_size_before = q_size; - int q_used_before = 0; - int q_used_after = 0; - - int total_len = 0; - int i = 0; - - for (i = 0; i < n; i++) - { - total_len += len[i]; - } + int q_used_before = 0; + int q_used_after = 0; + + int total_len = 0; + int i = 0; + + for (i = 0; i < n; i++) + { + total_len += len[i]; + } pg_memory_barrier(); alog_just_caller(&q_used_before); @@ -1764,9 +2053,9 @@ static bool alog_queue_pushn(AlogQueue * queue, char * buff[], int len[], int n) /* * |<- strlen value ->|<- string message content ->| - * | | - * | | - * |<------------------ buff --------------------->| + * | | + * | | + * |<------------------ buff --------------------->| * * len = size(int) + strlen(str) * @@ -1820,9 +2109,9 @@ static int alog_queue_get_str_len(AlogQueue * queue, int offset) * copy message from queue to another as much as possible * * |<- strlen value ->|<- string message content ->| - * | | - * | | - * |<------------------ buff 
--------------------->| + * | | + * | | + * |<------------------ buff --------------------->| * * len = size(int) + strlen(str) * @@ -1949,107 +2238,114 @@ static bool alog_queue_pop_to_queue(AlogQueue * from, AlogQueue * to) * copy message from queue to file as much as possible */ static bool alog_queue_pop_to_file(AlogQueue * from, int destination) -{// #lizard forgives - volatile int q_from_head = from->q_head; - volatile int q_from_tail = from->q_tail; - volatile int q_from_size = from->q_size; - - int from_head = q_from_head; - int from_tail = q_from_tail; - int from_size = q_from_size; - - int from_total = 0; - - int from_used = 0; - int from_copyed = 0; - - volatile slock_t * file_lock = NULL; - - pg_memory_barrier(); - alog_just_caller(&from_total); - - from_total = from_used = alog_queue_used(from_size, from_head, from_tail); - - Assert(from_size > 0 && from_head >= 0 && from_tail >= 0); - Assert(from_head < from_size && from_tail < from_size && from_used <= from_size); - Assert(destination == AUDIT_COMMON_LOG || destination == AUDIT_FGA_LOG); - - if (destination == AUDIT_COMMON_LOG) - { - file_lock = &audit_comm_log_file_lock; - } - else - { - file_lock = &audit_fga_log_file_lock; - } - - /* from is empty, ignore */ - if (alog_queue_is_empty(from_size, from_head, from_tail)) - { - return false; - } - - /* copy message into file until from is empty */ - do - { - int string_len = alog_queue_get_str_len(from, from_head); - int copy_len = sizeof(int) + string_len; - - pg_memory_barrier(); - - /* just copy dierctly */ - if (from_size - from_head >= copy_len) - { - char * p_start = alog_queue_offset_to(from, from_head + sizeof(int)); - - /* only copy message content, not write message len */ - SpinLockAcquire(file_lock); - audit_write_log_file(p_start, string_len, destination); - SpinLockRelease(file_lock); - } - else if (from_size - from_head > sizeof(int)) - { - /* must copy as two parts */ - int first_len = from_size - from_head - sizeof(int); - int second_len = string_len - first_len; - char * p_start = NULL; - - Assert(first_len > 0 && first_len < from_size); - Assert(second_len > 0 && second_len < from_size); - - SpinLockAcquire(file_lock); - p_start = alog_queue_offset_to(from, from_head + sizeof(int)); - audit_write_log_file(p_start, first_len, destination); - - p_start = alog_queue_offset_to(from, 0); - audit_write_log_file(p_start, second_len, destination); - SpinLockRelease(file_lock); - } - else - { - /* just copy content only */ - int cpy_offset = (from_head + sizeof(int)) % from_size; - char * p_start = alog_queue_offset_to(from, cpy_offset); - - Assert(from_size - from_head <= sizeof(int)); - SpinLockAcquire(file_lock); - audit_write_log_file(p_start, string_len, destination); - SpinLockRelease(file_lock); - } - - from_head = (from_head + copy_len) % from_size; - from_copyed += copy_len; - - Assert(from_copyed <= from_total); - Assert(from_used - copy_len >= 0); - Assert(from_used - copy_len == alog_queue_used(from_size, from_head, from_tail)); - - from_used = alog_queue_used(from_size, from_head, from_tail); - } while (!alog_queue_is_empty(from_size, from_head, from_tail)); - - from->q_head = from_head; - - return true; +{ + volatile int q_from_head = from->q_head; + volatile int q_from_tail = from->q_tail; + volatile int q_from_size = from->q_size; + + int from_head = q_from_head; + int from_tail = q_from_tail; + int from_size = q_from_size; + + int from_total = 0; + + int from_used = 0; + int from_copyed = 0; + + volatile slock_t * file_lock = NULL; + + 
pg_memory_barrier(); + alog_just_caller(&from_total); + + from_total = from_used = alog_queue_used(from_size, from_head, from_tail); + + Assert(from_size > 0 && from_head >= 0 && from_tail >= 0); + Assert(from_head < from_size && from_tail < from_size && from_used <= from_size); + Assert(destination == AUDIT_COMMON_LOG || + destination == AUDIT_FGA_LOG || + destination == MAINTAIN_TRACE_LOG); + + if (destination == AUDIT_COMMON_LOG) + { + file_lock = &audit_comm_log_file_lock; + } + else if (destination == AUDIT_FGA_LOG) + { + file_lock = &audit_fga_log_file_lock; + } + else + { + Assert(destination == MAINTAIN_TRACE_LOG); + file_lock = &audit_trace_log_file_lock; + } + + /* from is empty, ignore */ + if (alog_queue_is_empty(from_size, from_head, from_tail)) + { + return false; + } + + /* copy message into file until from is empty */ + do + { + int string_len = alog_queue_get_str_len(from, from_head); + int copy_len = sizeof(int) + string_len; + + pg_memory_barrier(); + + /* just copy dierctly */ + if (from_size - from_head >= copy_len) + { + char * p_start = alog_queue_offset_to(from, from_head + sizeof(int)); + + /* only copy message content, not write message len */ + SpinLockAcquire(file_lock); + audit_write_log_file(p_start, string_len, destination); + SpinLockRelease(file_lock); + } + else if (from_size - from_head > sizeof(int)) + { + /* must copy as two parts */ + int first_len = from_size - from_head - sizeof(int); + int second_len = string_len - first_len; + char * p_start = NULL; + + Assert(first_len > 0 && first_len < from_size); + Assert(second_len > 0 && second_len < from_size); + + SpinLockAcquire(file_lock); + p_start = alog_queue_offset_to(from, from_head + sizeof(int)); + audit_write_log_file(p_start, first_len, destination); + + p_start = alog_queue_offset_to(from, 0); + audit_write_log_file(p_start, second_len, destination); + SpinLockRelease(file_lock); + } + else + { + /* just copy content only */ + int cpy_offset = (from_head + sizeof(int)) % from_size; + char * p_start = alog_queue_offset_to(from, cpy_offset); + + Assert(from_size - from_head <= sizeof(int)); + SpinLockAcquire(file_lock); + audit_write_log_file(p_start, string_len, destination); + SpinLockRelease(file_lock); + } + + from_head = (from_head + copy_len) % from_size; + from_copyed += copy_len; + + Assert(from_copyed <= from_total); + Assert(from_used - copy_len >= 0); + Assert(from_used - copy_len == alog_queue_used(from_size, from_head, from_tail)); + + from_used = alog_queue_used(from_size, from_head, from_tail); + } while (!alog_queue_is_empty(from_size, from_head, from_tail)); + + from->q_head = from_head; + + return true; } #endif @@ -2057,16 +2353,18 @@ static bool alog_queue_pop_to_file(AlogQueue * from, int destination) #ifdef AuditLog_005_For_ThreadWorker /* - * find an unused shard entry id in AuditCommonLogQueueArray and AuditFGALogQueueArray - * + * find an unused shard entry id in AuditCommonLogQueueArray, AuditFGALogQueueArray + * and AuditTraceLogQueueArray. 
+ * * called by postgres backend to init AuditPostgresAlogQueueIndex */ int AuditLoggerQueueAcquire(void) { int alogIdx = -1; - AlogQueue * common_queue = NULL; - AlogQueue * fga_queue = NULL; + AlogQueue * common_queue = NULL; + AlogQueue * fga_queue = NULL; + AlogQueue * trace_queue = NULL; if (!IsBackendPostgres) { @@ -2079,14 +2377,17 @@ int AuditLoggerQueueAcquire(void) alogIdx = MyProc->pgprocno; Assert(alogIdx >= 0 && alogIdx < MaxBackends); - common_queue = alog_get_shared_common_queue(alogIdx); - fga_queue = alog_get_shared_fga_queue(alogIdx); + common_queue = alog_get_shared_common_queue(alogIdx); + fga_queue = alog_get_shared_fga_queue(alogIdx); + trace_queue = alog_get_shared_trace_queue(alogIdx); - Assert(common_queue->q_pid == fga_queue->q_pid); + Assert(common_queue->q_pid == fga_queue->q_pid); + Assert(common_queue->q_pid == trace_queue->q_pid); - AuditPostgresAlogQueueIndex = alogIdx; - common_queue->q_pid = MyProcPid; - fga_queue->q_pid = MyProcPid; + AuditPostgresAlogQueueIndex = alogIdx; + common_queue->q_pid = MyProcPid; + fga_queue->q_pid = MyProcPid; + trace_queue->q_pid = MyProcPid; if (enable_auditlogger_warning) { @@ -2117,6 +2418,16 @@ static AlogQueue * alog_get_shared_fga_queue(int idx) return queue; } +static AlogQueue * alog_get_shared_trace_queue(int idx) +{ + AlogQueue * queue = NULL; + + Assert(idx >= 0 && idx < MaxBackends); + queue = AuditTraceLogQueueArray->a_queue[idx]; + + return queue; +} + static AlogQueue * alog_get_local_common_cache(int consumer_id) { AlogQueue * queue = NULL; @@ -2137,12 +2448,27 @@ static AlogQueue * alog_get_local_fga_cache(int consumer_id) return queue; } +static AlogQueue * alog_get_local_trace_cache(int consumer_id) +{ + AlogQueue * queue = NULL; + + Assert(consumer_id >= 0 && consumer_id < AuditLog_max_worker_number); + queue = AuditTraceLogLocalCache->q_cache[consumer_id]; + + return queue; +} + /* * local cache for log Consumer * - * AuditCommonLogLocalCache = alog_make_local_cache(AuditLog_max_worker_number, AuditLog_common_log_cache_size_kb); + * AuditCommonLogLocalCache = alog_make_local_cache(AuditLog_max_worker_number, + * AuditLog_common_log_cache_size_kb); * - * AuditFGALogLocalCache = alog_make_local_cache(AuditLog_max_worker_number, AuditLog_fga_log_cacae_size_kb); + * AuditFGALogLocalCache = alog_make_local_cache(AuditLog_max_worker_number, + * AuditLog_fga_log_cacae_size_kb); + * + * AuditTraceLogLocalCache = alog_make_local_cache(AuditLog_max_worker_number, + * Maintain_trace_log_cache_size_kb); */ static AlogQueueCache * alog_make_local_cache(int cache_number, int queue_size_kb) { @@ -2218,7 +2544,7 @@ static void alog_consumer_wakeup(int consumer_id) } /* - * Sleep if there is no log to read in + * Sleep if there is no log to read in * shared audit log queue */ static void alog_consumer_sleep(int consumer_id) @@ -2234,98 +2560,122 @@ static void alog_consumer_sleep(int consumer_id) /* * AuditLog_max_worker_number consumers * - * read log from part of AuditCommonLogQueueArray and AuditFGALogQueueArray - * and write to one cache in AuditCommonLogLocalCache and AuditFgaLogQueueCache + * read log from part of AuditCommonLogQueueArray, AuditFGALogQueueArray and + * AuditTraceLogQueueArray; write to one cache in AuditCommonLogLocalCache, + * AuditFgaLogQueueCache and AuditTraceLogQueueCache. 
* - */ + */ static void * alog_consumer_main(void * arg) -{// #lizard forgives - int consumer_id = *((int *) arg); - int i = 0; - - AlogQueue * local_common_cache = NULL; - AlogQueue * local_fga_cache = NULL; - - Assert(consumer_id >= 0 && consumer_id < AuditLog_max_worker_number); - - /* get local common queue cache entry from AuditCommonLogLocalCache */ - local_common_cache = alog_get_local_common_cache(consumer_id); - - /* get local fga queue cache entry from AuditFgaLogQueueCache */ - local_fga_cache = alog_get_local_fga_cache(consumer_id); - - while (true) - { - bool shared_is_empty = true; - - for (i = 0; i < ((MaxBackends / AuditLog_max_worker_number) + 1); i++) - { - int sharedIdx = consumer_id + i * AuditLog_max_worker_number; - AlogQueue * shared_common_queue = NULL; - AlogQueue * shared_fga_queue = NULL; - - Assert(consumer_id == (sharedIdx % AuditLog_max_worker_number)); - - if (sharedIdx < MaxBackends) - { - bool local_is_empty = false; - - /* get shared common queue entry from AuditCommonLogQueueArray */ - shared_common_queue = alog_get_shared_common_queue(sharedIdx); - - /* get shared fga queue entry from AuditFGALogQueueArray */ - shared_fga_queue = alog_get_shared_fga_queue(sharedIdx); - - local_is_empty = false; - if (alog_queue_is_empty2(local_common_cache)) - { - local_is_empty = true; - } - - /* read from shared queue, and write to local cache queue */ - if (alog_queue_pop_to_queue(shared_common_queue, local_common_cache)) - { - if (local_is_empty) - { - alog_writer_wakeup(AUDIT_COMMON_LOG); - } - } - - local_is_empty = false; - if (alog_queue_is_empty2(local_fga_cache)) - { - local_is_empty = true; - } - - if (alog_queue_pop_to_queue(shared_fga_queue, local_fga_cache)) - { - if (local_is_empty) - { - alog_writer_wakeup(AUDIT_FGA_LOG); - } - } - - if (!alog_queue_is_empty2(shared_common_queue) || - !alog_queue_is_empty2(shared_fga_queue)) - { - shared_is_empty = false; - } - } - } - - if (shared_is_empty) - { - /* - * maybe shared input is empty, - * local output is full, - * so wait a moment and retry - */ - audit_shared_consumer_bitmap_set_value(consumer_id, 0); - alog_consumer_sleep(consumer_id); - } - } - - return NULL; +{ + int consumer_id = *((int *) arg); + int i = 0; + + AlogQueue * local_common_cache = NULL; + AlogQueue * local_fga_cache = NULL; + AlogQueue * local_trace_cache = NULL; + + Assert(consumer_id >= 0 && consumer_id < AuditLog_max_worker_number); + + /* get local common queue cache entry from AuditCommonLogLocalCache */ + local_common_cache = alog_get_local_common_cache(consumer_id); + + /* get local fga queue cache entry from AuditFgaLogQueueCache */ + local_fga_cache = alog_get_local_fga_cache(consumer_id); + + /* get local trace queue cache entry from AuditTraceLogQueueCache */ + local_trace_cache = alog_get_local_trace_cache(consumer_id); + + while (true) + { + bool shared_is_empty = true; + + for (i = 0; i < ((MaxBackends / AuditLog_max_worker_number) + 1); i++) + { + int sharedIdx = consumer_id + i * AuditLog_max_worker_number; + AlogQueue * shared_common_queue = NULL; + AlogQueue * shared_fga_queue = NULL; + AlogQueue * shared_trace_queue = NULL; + + Assert(consumer_id == (sharedIdx % AuditLog_max_worker_number)); + + if (sharedIdx < MaxBackends) + { + bool local_is_empty = false; + + /* get shared common queue entry from AuditCommonLogQueueArray */ + shared_common_queue = alog_get_shared_common_queue(sharedIdx); + + /* get shared fga queue entry from AuditFGALogQueueArray */ + shared_fga_queue = alog_get_shared_fga_queue(sharedIdx); + + /* 
get shared trace queue entry from AuditTraceLogQueueArray */ + shared_trace_queue = alog_get_shared_trace_queue(sharedIdx); + + local_is_empty = false; + if (alog_queue_is_empty2(local_common_cache)) + { + local_is_empty = true; + } + + /* read from shared queue, and write to local cache queue */ + if (alog_queue_pop_to_queue(shared_common_queue, local_common_cache)) + { + if (local_is_empty) + { + alog_writer_wakeup(AUDIT_COMMON_LOG); + } + } + + local_is_empty = false; + if (alog_queue_is_empty2(local_fga_cache)) + { + local_is_empty = true; + } + + if (alog_queue_pop_to_queue(shared_fga_queue, local_fga_cache)) + { + if (local_is_empty) + { + alog_writer_wakeup(AUDIT_FGA_LOG); + } + } + + local_is_empty = false; + if (alog_queue_is_empty2(local_trace_cache)) + { + local_is_empty = true; + } + + if (alog_queue_pop_to_queue(shared_trace_queue, local_trace_cache)) + { + if (local_is_empty) + { + alog_writer_wakeup(MAINTAIN_TRACE_LOG); + } + } + + if (!alog_queue_is_empty2(shared_common_queue) || + !alog_queue_is_empty2(shared_fga_queue) || + !alog_queue_is_empty2(shared_trace_queue)) + { + shared_is_empty = false; + } + } + } + + if (shared_is_empty) + { + /* + * maybe shared input is empty, + * local output is full, + * so wait a moment and retry + */ + audit_shared_consumer_bitmap_set_value(consumer_id, 0); + alog_consumer_sleep(consumer_id); + } + } + + return NULL; } /* @@ -2337,24 +2687,30 @@ static void alog_writer_wakeup(int writer_destination) ThreadSema * sema = NULL; AlogQueueCache * local_cache = NULL; - Assert(writer_destination == AUDIT_COMMON_LOG || - writer_destination == AUDIT_FGA_LOG); - - if (writer_destination == AUDIT_COMMON_LOG) - { - local_cache = AuditCommonLogLocalCache; - } - else - { - local_cache = AuditFGALogLocalCache; - } + Assert(writer_destination == AUDIT_COMMON_LOG || + writer_destination == AUDIT_FGA_LOG || + writer_destination == MAINTAIN_TRACE_LOG); + + if (writer_destination == AUDIT_COMMON_LOG) + { + local_cache = AuditCommonLogLocalCache; + } + else if (writer_destination == AUDIT_FGA_LOG) + { + local_cache = AuditFGALogLocalCache; + } + else + { + Assert(writer_destination == MAINTAIN_TRACE_LOG); + local_cache = AuditTraceLogLocalCache; + } sema = (&(local_cache->q_sema)); ThreadSemaUp(sema); } /* - * Sleep if there is no log to read in + * Sleep if there is no log to read in * local audit log cache */ static void alog_writer_sleep(int writer_destination) @@ -2362,79 +2718,93 @@ static void alog_writer_sleep(int writer_destination) ThreadSema * sema = NULL; AlogQueueCache * local_cache = NULL; - Assert(writer_destination == AUDIT_COMMON_LOG || - writer_destination == AUDIT_FGA_LOG); - - if (writer_destination == AUDIT_COMMON_LOG) - { - local_cache = AuditCommonLogLocalCache; - } - else - { - local_cache = AuditFGALogLocalCache; - } + Assert(writer_destination == AUDIT_COMMON_LOG || + writer_destination == AUDIT_FGA_LOG || + writer_destination == MAINTAIN_TRACE_LOG); + + if (writer_destination == AUDIT_COMMON_LOG) + { + local_cache = AuditCommonLogLocalCache; + } + else if (writer_destination == AUDIT_FGA_LOG) + { + local_cache = AuditFGALogLocalCache; + } + else + { + Assert(writer_destination == MAINTAIN_TRACE_LOG); + local_cache = AuditTraceLogLocalCache; + } sema = (&(local_cache->q_sema)); ThreadSemaDown(sema); } /* - * two writer, write log to logfile - * + * three writer, write log to logfile + * * one for AuditCommonLogLocalCache * one for AuditFgaLogQueueCache + * one for AuditTraceLogQueueCache */ static void * alog_writer_main(void * arg) 
-{// #lizard forgives - int writer_destination = *((int *) arg); - AlogQueueCache * local_cache = NULL; - - Assert(writer_destination == AUDIT_COMMON_LOG || - writer_destination == AUDIT_FGA_LOG); - - if (writer_destination == AUDIT_COMMON_LOG) - { - /* read from AuditCommonLogLocalCache, and write to fga log file */ - local_cache = AuditCommonLogLocalCache; - } - else - { - /* read from AuditFgaLogQueueCache, and write to fga log file */ - local_cache = AuditFGALogLocalCache; - } - - while (true) - { - int i = 0; - bool copy_nothing = true; - - for (i = 0; i < AuditLog_max_worker_number; i++) - { - int consumer_id = i; - AlogQueue * local_queue = local_cache->q_cache[i]; - - if (alog_queue_pop_to_file(local_queue, writer_destination)) - { - copy_nothing = false; - } - - if (alog_queue_is_empty2(local_queue)) - { - alog_consumer_wakeup(consumer_id); - } - } - - if (copy_nothing) - { - /* - * maybe local input is empty, - * so wait a moment and retry - */ - alog_writer_sleep(writer_destination); - } - } - - return NULL; +{ + int writer_destination = *((int *) arg); + AlogQueueCache * local_cache = NULL; + + Assert(writer_destination == AUDIT_COMMON_LOG || + writer_destination == AUDIT_FGA_LOG || + writer_destination == MAINTAIN_TRACE_LOG); + + if (writer_destination == AUDIT_COMMON_LOG) + { + /* read from AuditCommonLogLocalCache, and write to common log file */ + local_cache = AuditCommonLogLocalCache; + } + else if (writer_destination == AUDIT_FGA_LOG) + { + /* read from AuditFgaLogQueueCache, and write to fga log file */ + local_cache = AuditFGALogLocalCache; + } + else + { + /* read from AuditTraceLogQueueCache, and write to trace log file */ + Assert(writer_destination == MAINTAIN_TRACE_LOG); + local_cache = AuditTraceLogLocalCache; + } + + while (true) + { + int i = 0; + bool copy_nothing = true; + + for (i = 0; i < AuditLog_max_worker_number; i++) + { + int consumer_id = i; + AlogQueue * local_queue = local_cache->q_cache[i]; + + if (alog_queue_pop_to_file(local_queue, writer_destination)) + { + copy_nothing = false; + } + + if (alog_queue_is_empty2(local_queue)) + { + alog_consumer_wakeup(consumer_id); + } + } + + if (copy_nothing) + { + /* + * maybe local input is empty, + * so wait a moment and retry + */ + alog_writer_sleep(writer_destination); + } + } + + return NULL; } static void alog_start_writer(int writer_destination) @@ -2442,21 +2812,22 @@ static void alog_start_writer(int writer_destination) int * des = NULL; int ret = 0; - Assert(writer_destination == AUDIT_COMMON_LOG || - writer_destination == AUDIT_FGA_LOG); + Assert(writer_destination == AUDIT_COMMON_LOG || + writer_destination == AUDIT_FGA_LOG || + writer_destination == MAINTAIN_TRACE_LOG); des = palloc0(sizeof(int)); *des = writer_destination; - ret = CreateThread(alog_writer_main, (void *)des, MT_THR_DETACHED); - if (ret != 0) - { - /* failed to create thread, exit */ - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("could not start audit log write worker"))); - exit(6); - } + ret = CreateThread(alog_writer_main, (void *)des, MT_THR_DETACHED); + if (ret != 0) + { + /* failed to create thread, exit */ + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("could not start audit log write worker"))); + exit(6); + } } static void alog_start_consumer(int consumer_id) @@ -2484,15 +2855,18 @@ static void alog_start_all_worker(void) { int i = 0; - AuditCommonLogLocalCache = alog_make_local_cache(AuditLog_max_worker_number, - AuditLog_common_log_cache_size_kb); - AuditFGALogLocalCache = 
alog_make_local_cache(AuditLog_max_worker_number, - AuditLog_fga_log_cacae_size_kb); - AuditConsumerNotifySemas = alog_make_consumer_semas(AuditLog_max_worker_number); + AuditCommonLogLocalCache = alog_make_local_cache(AuditLog_max_worker_number, + AuditLog_common_log_cache_size_kb); + AuditFGALogLocalCache = alog_make_local_cache(AuditLog_max_worker_number, + AuditLog_fga_log_cacae_size_kb); + AuditTraceLogLocalCache = alog_make_local_cache(AuditLog_max_worker_number, + Maintain_trace_log_cache_size_kb); + AuditConsumerNotifySemas = alog_make_consumer_semas(AuditLog_max_worker_number); - /* 00, start writer worker, one for common log, another for fga log */ - alog_start_writer(AUDIT_COMMON_LOG); - alog_start_writer(AUDIT_FGA_LOG); + /* 00, start writer worker, one for common log, one for fga log, one for trace log. */ + alog_start_writer(AUDIT_COMMON_LOG); + alog_start_writer(AUDIT_FGA_LOG); + alog_start_writer(MAINTAIN_TRACE_LOG); /* 001, start AuditLog_max_worker_number consumer worker */ for (i = 0; i < AuditLog_max_worker_number; i++) @@ -2509,92 +2883,110 @@ static void alog_start_all_worker(void) #ifdef AuditLog_006_For_Elog void alog(int destination, const char *fmt,...) -{// #lizard forgives - StringInfoData buf; - AlogQueue * queue = NULL; - - int len = 0; - int idx = 0; - int consumer_id = 0; - - Assert(AuditPostgresAlogQueueIndex >= 0 && - AuditPostgresAlogQueueIndex < MaxBackends); - - idx = AuditPostgresAlogQueueIndex; - consumer_id = (idx % AuditLog_max_worker_number); - - if(destination != AUDIT_COMMON_LOG && - destination != AUDIT_FGA_LOG) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("only common audit log and fag audit log can be processed"))); - return; - } - - if (!IsBackendPostgres || - !IsUnderPostmaster) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("only postgres backend can write audit log"))); - return; - } - - if (destination == AUDIT_COMMON_LOG) - { - queue = alog_get_shared_common_queue(idx); - } - else - { - queue = alog_get_shared_fga_queue(idx); - } - - Assert(queue->q_pid == getpid()); - - initStringInfo(&buf); - appendBinaryStringInfo(&buf, (const char *)(&len), sizeof(len)); - - for (;;) - { - va_list args; - int needed; - va_start(args, fmt); - needed = appendStringInfoVA(&buf, fmt, args); - va_end(args); - if (needed == 0) - { - break; - } - enlargeStringInfo(&buf, needed); - } - - appendStringInfoChar(&buf, '\n'); - - /* push string len to header */ - len = buf.len - sizeof(len); - memcpy(buf.data, (char *)(&len), sizeof(len)); - - /* push total buff into queue */ - len = buf.len; - while (false == alog_queue_push(queue, buf.data, len)) - { - pg_usleep(AUDIT_SLEEP_MICROSEC); - } - - pfree(buf.data); - - if (!audit_shared_consumer_bitmap_get_value(consumer_id)) - { - /* - * set shared consumer bitmap value to 1 to - * notify consumer to read audit log - */ - audit_shared_consumer_bitmap_set_value(consumer_id, 1); - - /* Notify audit logger process that it's got something to do */ - SendPostmasterSignal(PMSIGNAL_WAKEN_AUDIT_LOGGER); - } +{ + StringInfoData buf; + AlogQueue * queue = NULL; + + int len = 0; + int idx = 0; + int consumer_id = 0; + + Assert(AuditPostgresAlogQueueIndex >= 0 && + AuditPostgresAlogQueueIndex < MaxBackends); + + idx = AuditPostgresAlogQueueIndex; + consumer_id = (idx % AuditLog_max_worker_number); + + if(destination != AUDIT_COMMON_LOG && + destination != AUDIT_FGA_LOG && + destination != MAINTAIN_TRACE_LOG) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("only 
common/fga/trace audit log can be processed"))); + return; + } + + if (!IsBackendPostgres || + !IsUnderPostmaster) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("only postgres backend can write audit log"))); + return; + } + + if (destination == AUDIT_COMMON_LOG) + { + queue = alog_get_shared_common_queue(idx); + } + else if (destination == AUDIT_FGA_LOG) + { + queue = alog_get_shared_fga_queue(idx); + } + else + { + Assert(destination == MAINTAIN_TRACE_LOG); + queue = alog_get_shared_trace_queue(idx); + } + + Assert(queue->q_pid == getpid()); + + initStringInfo(&buf); + appendBinaryStringInfo(&buf, (const char *)(&len), sizeof(len)); + + for (;;) + { + va_list args; + int needed; + va_start(args, fmt); + needed = appendStringInfoVA(&buf, fmt, args); + va_end(args); + if (needed == 0) + { + break; + } + enlargeStringInfo(&buf, needed); + } + + appendStringInfoChar(&buf, '\n'); + + /* push string len to header */ + len = buf.len - sizeof(len); + memcpy(buf.data, (char *)(&len), sizeof(len)); + + /* push total buff into queue */ + len = buf.len; + while (false == alog_queue_push(queue, buf.data, len)) + { + if (!audit_shared_consumer_bitmap_get_value(consumer_id)) + { + /* + * set shared consumer bitmap value to 1 to + * notify consumer to read log + */ + audit_shared_consumer_bitmap_set_value(consumer_id, 1); + + /* Notify logger process that it's got something to do */ + SendPostmasterSignal(PMSIGNAL_WAKEN_AUDIT_LOGGER); + } + + pg_usleep(AUDIT_SLEEP_MICROSEC); + } + + pfree(buf.data); + + if (!audit_shared_consumer_bitmap_get_value(consumer_id)) + { + /* + * set shared consumer bitmap value to 1 to + * notify consumer to read audit log + */ + audit_shared_consumer_bitmap_set_value(consumer_id, 1); + + /* Notify audit logger process that it's got something to do */ + SendPostmasterSignal(PMSIGNAL_WAKEN_AUDIT_LOGGER); + } } #endif @@ -2603,15 +2995,15 @@ void alog(int destination, const char *fmt,...) static void * alog_shard_stat_main(void * arg) { atexit(FlushShardStatistic); - - while (true) - { - long shard_stat_interval = g_ShardInfoFlushInterval * 1000000L; - - FlushShardStatistic(); - - pg_usleep(shard_stat_interval); - } + + while (true) + { + long shard_stat_interval = g_ShardInfoFlushInterval * 1000000L; + + FlushShardStatistic(); + + pg_usleep(shard_stat_interval); + } return NULL; } @@ -2632,4 +3024,3 @@ static void alog_start_shard_stat_worker(void) } #endif - diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index 1f8c33a5..e22a0cca 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -7247,6 +7247,26 @@ void PostmasterDisableTimeout(void) } } +/* + * Whether the database is primary instance and it is normal. + */ +bool PostmasterIsPrimaryAndNormal(void) +{ + /* + * Do not consider: pmState == PM_HOT_STANDBY. Because the original data may + * be retained in the slave instance, which is inconsistent with the reset + * data in the primary instance. 
+ */ + if (pmState == PM_RUN) + { + return true; + } + else + { + return false; + } +} + void InitPostmasterLatch(void) { /* Initialize process-local latch support */ diff --git a/src/backend/replication/logical/decode.c b/src/backend/replication/logical/decode.c index 3f35d73d..f3577766 100644 --- a/src/backend/replication/logical/decode.c +++ b/src/backend/replication/logical/decode.c @@ -444,11 +444,11 @@ DecodeXactOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) break; #ifdef __TBASE__ - case XLOG_XACT_ACQUIRE_GTS: - { - /* nothing to do. */ - } - break; + case XLOG_XACT_ACQUIRE_GTS: + { + /* nothing to do. */ + } + break; #endif default: elog(ERROR, "unexpected RM_XACT_ID record type: %u", info); diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c index d4ecc464..0614298b 100644 --- a/src/backend/utils/cache/relcache.c +++ b/src/backend/utils/cache/relcache.c @@ -2364,6 +2364,97 @@ RelationIdGetRelation(Oid relationId) return rd; } +/* + * Whether a relation has xmin_gts and max_gts. + */ +bool +RelationHasGTS(Oid reltablespace, Oid relfilenode) +{ + bool has = false; + SysScanDesc scandesc = NULL; + Relation relation = NULL; + HeapTuple ntp = NULL; + ScanKeyData skey[2]; + bool found = false; + Oid relid = InvalidOid; + Form_pg_class classform = NULL; + + /* zero means this is a "mapped" relation */ + if (0 == relfilenode || relfilenode < FirstNormalObjectId) + { + return false; + } + + if (GLOBALTABLESPACE_OID == reltablespace) + { + return false; + } + + /* pg_class will show 0 when the value is actually MyDatabaseTableSpace */ + if (reltablespace == MyDatabaseTableSpace) + { + reltablespace = 0; + } + + /* + * Not a shared table, could either be a plain relation or a + * non-shared, nailed one, like e.g. pg_class. 
+ * + * check for plain relations by looking in pg_class + */ + relation = heap_open(RelationRelationId, AccessShareLock); + + ScanKeyInit(&skey[0], + Anum_pg_class_reltablespace, + BTEqualStrategyNumber, + F_OIDEQ, + ObjectIdGetDatum(reltablespace)); + ScanKeyInit(&skey[1], + Anum_pg_class_relfilenode, + BTEqualStrategyNumber, + F_OIDEQ, + ObjectIdGetDatum(relfilenode)); + + scandesc = systable_beginscan(relation, + ClassTblspcRelfilenodeIndexId, + true, + NULL, + 2, + skey); + + while (HeapTupleIsValid(ntp = systable_getnext(scandesc))) + { + if (found) + { + elog(ERROR, + "unexpected duplicate for tablespace %u, relfilenode %u", + reltablespace, relfilenode); + } + + found = true; + relid = HeapTupleGetOid(ntp); + classform = (Form_pg_class) GETSTRUCT(ntp); + } + + if (!found) + { + elog(WARNING, + "unexpected none for tablespace %u, relfilenode %u", + reltablespace, relfilenode); + } + else if ((classform->relkind == RELKIND_RELATION || + classform->relkind == RELPERSISTENCE_UNLOGGED) && + !IsSystemClass(relid, classform)) + { + has = true; + } + + systable_endscan(scandesc); + heap_close(relation, AccessShareLock); + + return has; +} + /* ---------------------------------------------------------------- * cache invalidation support routines * ---------------------------------------------------------------- diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 825e4725..3f02526e 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -3109,15 +3109,25 @@ static struct config_int ConfigureNamesInt[] = NULL, NULL, NULL }, - { - {"max_files_per_process", PGC_POSTMASTER, RESOURCES_KERNEL, - gettext_noop("Sets the maximum number of simultaneously open files for each server process."), - NULL - }, - &max_files_per_process, - 1000, 25, INT_MAX, - NULL, NULL, NULL - }, + { + {"gts_maintain_option", PGC_SIGHUP, DEVELOPER_OPTIONS, + gettext_noop("Enables check correctness of GTS and reseting it if it is wrong"), + NULL + }, + &gts_maintain_option, + 0, 0, 2, + NULL, NULL, NULL + }, + + { + {"max_files_per_process", PGC_POSTMASTER, RESOURCES_KERNEL, + gettext_noop("Sets the maximum number of simultaneously open files for each server process."), + NULL + }, + &max_files_per_process, + 1000, 25, INT_MAX, + NULL, NULL, NULL + }, /* * See also CheckRequiredParameterValues() if this parameter changes @@ -3807,6 +3817,16 @@ static struct config_int ConfigureNamesInt[] = 64, 8, INT_MAX / 1024, NULL, NULL, NULL }, + { + {"alog_trace_queue_size", PGC_POSTMASTER, LOGGING_WHERE, + gettext_noop("Size of share memory queue for each backend to store trace audit log, kilobytes."), + NULL, + GUC_UNIT_KB + }, + &Maintain_trace_log_queue_size_kb, + 64, 8, INT_MAX / 1024, + NULL, NULL, NULL + }, { {"alog_common_cache_size", PGC_POSTMASTER, LOGGING_WHERE, gettext_noop("Size of common audit log local buffer for each audit worker, kilobytes."), @@ -3827,6 +3847,16 @@ static struct config_int ConfigureNamesInt[] = 64, 8, INT_MAX / 1024, NULL, NULL, NULL }, + { + {"alog_trace_cache_size", PGC_POSTMASTER, LOGGING_WHERE, + gettext_noop("Size of trace audit log local buffer for each audit worker, kilobytes."), + NULL, + GUC_UNIT_KB + }, + &Maintain_trace_log_cache_size_kb, + 64, 8, INT_MAX / 1024, + NULL, NULL, NULL + }, #endif { {"max_function_args", PGC_INTERNAL, PRESET_OPTIONS, diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index c5a8d1ba..e5cb868a 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ 
b/src/backend/utils/misc/postgresql.conf.sample @@ -485,6 +485,15 @@ #update_process_title = on +# - Maintain GTS - + +#gts_maintain_option = 0 # range: 0-2. the default is 0. + # 0: do nothing. + # 1: check the correctness of the GTS of tuples by referring to + # tlog while doing vacuum. + # 2: check the correctness of the GTS of tuples by referring to + # tlog, and reset it if it is wrong while doing vacuum. + #------------------------------------------------------------------------------ # RUNTIME STATISTICS #------------------------------------------------------------------------------ diff --git a/src/backend/utils/time/tqual.c b/src/backend/utils/time/tqual.c index 6b01aa3e..9fa4ad40 100644 --- a/src/backend/utils/time/tqual.c +++ b/src/backend/utils/time/tqual.c @@ -167,34 +167,38 @@ static bool XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot); GlobalTimestamp HeapTupleHderGetXminTimestapAtomic(HeapTupleHeader tuple) { - if(HEAP_XMIN_TIMESTAMP_IS_UPDATED(tuple->t_infomask2)) - return HeapTupleHeaderGetXminTimestamp(tuple); - else - return InvalidGlobalTimestamp; - + if (HEAP_XMIN_TIMESTAMP_IS_UPDATED(tuple->t_infomask2)) + { + return HeapTupleHeaderGetXminTimestamp(tuple); + } + else + { + return InvalidGlobalTimestamp; + } } GlobalTimestamp HeapTupleHderGetXmaxTimestapAtomic(HeapTupleHeader tuple) { - if(HEAP_XMAX_TIMESTAMP_IS_UPDATED(tuple->t_infomask2)) - return HeapTupleHeaderGetXmaxTimestamp(tuple); - else - return InvalidGlobalTimestamp; - + if (HEAP_XMAX_TIMESTAMP_IS_UPDATED(tuple->t_infomask2)) + { + return HeapTupleHeaderGetXmaxTimestamp(tuple); + } + else + { + return InvalidGlobalTimestamp; + } } void HeapTupleHderSetXminTimestapAtomic(HeapTupleHeader tuple, GlobalTimestamp committs) { - HeapTupleHeaderSetXminTimestamp(tuple, committs); - tuple->t_infomask2 |= HEAP_XMIN_TIMESTAMP_UPDATED; - + HeapTupleHeaderSetXminTimestamp(tuple, committs); + tuple->t_infomask2 |= HEAP_XMIN_TIMESTAMP_UPDATED; } void HeapTupleHderSetXmaxTimestapAtomic(HeapTupleHeader tuple, GlobalTimestamp committs) { - HeapTupleHeaderSetXmaxTimestamp(tuple, committs); - tuple->t_infomask2 |= HEAP_XMAX_TIMESTAMP_UPDATED; - + HeapTupleHeaderSetXmaxTimestamp(tuple, committs); + tuple->t_infomask2 |= HEAP_XMAX_TIMESTAMP_UPDATED; } @@ -263,6 +267,21 @@ SetHintBits(HeapTupleHeader tuple, Buffer buffer, if(TransactionIdIsNormal(xmin) && TransactionIdGetCommitTsData(xmin, &global_timestamp, NULL)) { HeapTupleHderSetXminTimestapAtomic(tuple, global_timestamp); + if (enable_committs_print) + { + BufferDesc *bufHdr = GetBufferDescriptor(buffer - 1); + RelFileNode *rnode = &bufHdr->tag.rnode; + + elog(LOG, + "SetHintBits: relfilenode %u pageno %u " + "CTID %hu/%hu/%hu " + "infomask %d xmin %u xmin_gts "INT64_FORMAT, + rnode->relNode, bufHdr->tag.blockNum, + tuple->t_ctid.ip_blkid.bi_hi, + tuple->t_ctid.ip_blkid.bi_lo, + tuple->t_ctid.ip_posid, + tuple->t_infomask, xmin, global_timestamp); + } } } @@ -275,6 +294,23 @@ SetHintBits(HeapTupleHeader tuple, Buffer buffer, if(TransactionIdIsNormal(xmax) && TransactionIdGetCommitTsData(xmax, &global_timestamp, NULL)) { HeapTupleHderSetXmaxTimestapAtomic(tuple, global_timestamp); + if (enable_committs_print) + { + BufferDesc *bufHdr = GetBufferDescriptor(buffer - 1); + RelFileNode *rnode = &bufHdr->tag.rnode; + + elog(LOG, + "SetHintBits: relfilenode %u pageno %u " + "CTID %hu/%hu/%hu " + "infomask %d multixact %d " + "xid %u xmax %u xmax_gts "INT64_FORMAT, + rnode->relNode, bufHdr->tag.blockNum, + tuple->t_ctid.ip_blkid.bi_hi, + tuple->t_ctid.ip_blkid.bi_lo, + 
tuple->t_ctid.ip_posid, + tuple->t_infomask, tuple->t_infomask & HEAP_XMAX_IS_MULTI, + HeapTupleHeaderGetUpdateXid(tuple), xmax, global_timestamp); + } } } } diff --git a/src/bin/pg_archivecleanup/pg_archivecleanup.c b/src/bin/pg_archivecleanup/pg_archivecleanup.c index 4dc3d6ba..2a1e645b 100644 --- a/src/bin/pg_archivecleanup/pg_archivecleanup.c +++ b/src/bin/pg_archivecleanup/pg_archivecleanup.c @@ -30,11 +30,12 @@ bool debug = false; /* are we debugging? */ bool dryrun = false; /* are we performing a dry-run operation? */ char *additional_ext = NULL; /* Extension to remove from filenames */ -char *archiveLocation; /* where to find the archive? */ -char *restartWALFileName; /* the file from which we can restart restore */ -char WALFilePath[MAXPGPATH * 2]; /* the file path including archive */ -char exclusiveCleanupFileName[MAXFNAMELEN]; /* the oldest file we want - * to remain in archive */ +char *archiveLocation; /* where to find the archive? */ +char *restartWALFileName; /* the file from which we can restart restore */ +char WALFilePath[MAXPGPATH * 2]; /* the file path including archive */ +char WALGTSFilePath[MAXPGPATH * 2]; +char exclusiveCleanupFileName[MAXFNAMELEN]; /* the oldest file we want + * to remain in archive */ /* ===================================================================== @@ -93,87 +94,102 @@ TrimExtension(char *filename, char *extension) static void CleanupPriorWALFiles(void) -{// #lizard forgives - int rc; - DIR *xldir; - struct dirent *xlde; - char walfile[MAXPGPATH]; - - if ((xldir = opendir(archiveLocation)) != NULL) - { - while (errno = 0, (xlde = readdir(xldir)) != NULL) - { - /* - * Truncation is essentially harmless, because we skip names of - * length other than XLOG_FNAME_LEN. (In principle, one could use - * a 1000-character additional_ext and get trouble.) - */ - strlcpy(walfile, xlde->d_name, MAXPGPATH); - TrimExtension(walfile, additional_ext); - - /* - * We ignore the timeline part of the XLOG segment identifiers in - * deciding whether a segment is still needed. This ensures that - * we won't prematurely remove a segment from a parent timeline. - * We could probably be a little more proactive about removing - * segments of non-parent timelines, but that would be a whole lot - * more complicated. - * - * We use the alphanumeric sorting property of the filenames to - * decide which ones are earlier than the exclusiveCleanupFileName - * file. Note that this means files are not removed in the order - * they were originally written, in case this worries you. - */ - if ((IsXLogFileName(walfile) || IsPartialXLogFileName(walfile)) && - strcmp(walfile + 8, exclusiveCleanupFileName + 8) < 0) - { - /* - * Use the original file name again now, including any - * extension that might have been chopped off before testing - * the sequence. - */ - snprintf(WALFilePath, sizeof(WALFilePath), "%s/%s", - archiveLocation, xlde->d_name); - - if (dryrun) - { - /* - * Prints the name of the file to be removed and skips the - * actual removal. The regular printout is so that the - * user can pipe the output into some other program. 
- */ - printf("%s\n", WALFilePath); - if (debug) - fprintf(stderr, - _("%s: file \"%s\" would be removed\n"), - progname, WALFilePath); - continue; - } - - if (debug) - fprintf(stderr, _("%s: removing file \"%s\"\n"), - progname, WALFilePath); - - rc = unlink(WALFilePath); - if (rc != 0) - { - fprintf(stderr, _("%s: ERROR: could not remove file \"%s\": %s\n"), - progname, WALFilePath, strerror(errno)); - break; - } - } - } - - if (errno) - fprintf(stderr, _("%s: could not read archive location \"%s\": %s\n"), - progname, archiveLocation, strerror(errno)); - if (closedir(xldir)) - fprintf(stderr, _("%s: could not close archive location \"%s\": %s\n"), - progname, archiveLocation, strerror(errno)); - } - else - fprintf(stderr, _("%s: could not open archive location \"%s\": %s\n"), - progname, archiveLocation, strerror(errno)); +{ + int rc; + DIR *xldir; + struct dirent *xlde; + char walfile[MAXPGPATH]; + + if ((xldir = opendir(archiveLocation)) != NULL) + { + while (errno = 0, (xlde = readdir(xldir)) != NULL) + { + /* + * Truncation is essentially harmless, because we skip names of + * length other than XLOG_FNAME_LEN. (In principle, one could use + * a 1000-character additional_ext and get trouble.) + */ + strlcpy(walfile, xlde->d_name, MAXPGPATH); + TrimExtension(walfile, additional_ext); + + /* + * We ignore the timeline part of the XLOG segment identifiers in + * deciding whether a segment is still needed. This ensures that + * we won't prematurely remove a segment from a parent timeline. + * We could probably be a little more proactive about removing + * segments of non-parent timelines, but that would be a whole lot + * more complicated. + * + * We use the alphanumeric sorting property of the filenames to + * decide which ones are earlier than the exclusiveCleanupFileName + * file. Note that this means files are not removed in the order + * they were originally written, in case this worries you. + */ + if ((IsXLogFileName(walfile) || IsPartialXLogFileName(walfile)) && + strcmp(walfile + 8, exclusiveCleanupFileName + 8) < 0) + { + /* + * Use the original file name again now, including any + * extension that might have been chopped off before testing + * the sequence. + */ + snprintf(WALFilePath, sizeof(WALFilePath), "%s/%s", + archiveLocation, xlde->d_name); + snprintf(WALGTSFilePath, sizeof(WALGTSFilePath), "%s/%s.gts", + archiveLocation, xlde->d_name); + + if (dryrun) + { + /* + * Prints the name of the file to be removed and skips the + * actual removal. The regular printout is so that the + * user can pipe the output into some other program. 
+ */ + printf("%s\n", WALFilePath); + if (debug) + fprintf(stderr, + _("%s: file \"%s\" would be removed\n"), + progname, WALFilePath); + continue; + } + + if (debug) + fprintf(stderr, _("%s: removing file \"%s\"\n"), + progname, WALFilePath); + + rc = unlink(WALFilePath); + if (rc != 0) + { + fprintf(stderr, _("%s: ERROR: could not remove file \"%s\": %s\n"), + progname, WALFilePath, strerror(errno)); + break; + } + + if (debug) + fprintf(stderr, _("%s: removing file \"%s\"\n"), + progname, WALGTSFilePath); + + /* remove the .gts file */ + rc = unlink(WALGTSFilePath); + if (rc != 0) + { + fprintf(stderr, _("%s: ERROR: could not remove file \"%s\": %s\n"), + progname, WALGTSFilePath, strerror(errno)); + break; + } + } + } + + if (errno) + fprintf(stderr, _("%s: could not read archive location \"%s\": %s\n"), + progname, archiveLocation, strerror(errno)); + if (closedir(xldir)) + fprintf(stderr, _("%s: could not close archive location \"%s\": %s\n"), + progname, archiveLocation, strerror(errno)); + } + else + fprintf(stderr, _("%s: could not open archive location \"%s\": %s\n"), + progname, archiveLocation, strerror(errno)); } /* diff --git a/src/include/access/htup_details.h b/src/include/access/htup_details.h index 12fce61c..f312bd76 100644 --- a/src/include/access/htup_details.h +++ b/src/include/access/htup_details.h @@ -409,6 +409,11 @@ do { \ ((tup)->t_infomask |= HEAP_XMIN_FROZEN) \ ) +#define HeapTupleHeaderXmaxCommitted(tup) \ +( \ + ((tup)->t_infomask & HEAP_XMAX_COMMITTED) != 0 \ +) + /* * HeapTupleHeaderGetRawXmax gets you the raw Xmax field. To find out the Xid * that updated a tuple, you might need to resolve the MultiXactId if certain diff --git a/src/include/bootstrap/bootstrap.h b/src/include/bootstrap/bootstrap.h index 98bf4db1..706759f4 100644 --- a/src/include/bootstrap/bootstrap.h +++ b/src/include/bootstrap/bootstrap.h @@ -30,7 +30,8 @@ extern Relation boot_reldesc; extern Form_pg_attribute attrtypes[MAXATTR]; -extern int numattr; +extern int numattr; +extern const char *exename; extern void AuxiliaryProcessMain(int argc, char *argv[]) pg_attribute_noreturn(); diff --git a/src/include/commands/vacuum.h b/src/include/commands/vacuum.h index 356efa52..cd79ba61 100644 --- a/src/include/commands/vacuum.h +++ b/src/include/commands/vacuum.h @@ -211,12 +211,13 @@ typedef struct VacuumParams } VacuumParams; /* GUC parameters */ -extern PGDLLIMPORT int default_statistics_target; /* PGDLLIMPORT for PostGIS */ -extern int vacuum_freeze_min_age; -extern int vacuum_defer_freeze_min_age; -extern int vacuum_freeze_table_age; -extern int vacuum_multixact_freeze_min_age; -extern int vacuum_multixact_freeze_table_age; +extern PGDLLIMPORT int default_statistics_target; /* PGDLLIMPORT for PostGIS */ +extern int vacuum_freeze_min_age; +extern int vacuum_defer_freeze_min_age; +extern int vacuum_freeze_table_age; +extern int vacuum_multixact_freeze_min_age; +extern int vacuum_multixact_freeze_table_age; + #ifdef __TBASE__ extern bool enable_sampling_analyze; extern bool distributed_query_analyze; @@ -365,6 +366,17 @@ extern void ClearQueryAnalyzeInfo(void); extern char *GetAnalyzeInfo(int nodeid, char *key); extern void ExecSample(SampleStmt *stmt, DestReceiver *dest); + +extern int gts_maintain_option; +typedef enum +{ + GTS_MAINTAIN_NOTHING = 0, /* do nothing */ + GTS_MAINTAIN_VACUUM_CHECK = 1, /* check correctness of GTS while + * doing vacuum. */ + GTS_MAINTAIN_VACUUM_RESET = 2, /* check correctness of GTS and reset + * it according to tlog if it is wrong + * while doing vacuum. 
*/ +} GTSMaintainOption; #endif #endif /* VACUUM_H */ diff --git a/src/include/postmaster/auditlogger.h b/src/include/postmaster/auditlogger.h index 4b14ccea..2ec37bc1 100644 --- a/src/include/postmaster/auditlogger.h +++ b/src/include/postmaster/auditlogger.h @@ -74,8 +74,10 @@ #include -#define AUDIT_COMMON_LOG 1 -#define AUDIT_FGA_LOG 2 +#define AUDIT_COMMON_LOG (1 << 0) +#define AUDIT_FGA_LOG (1 << 1) +/* size_rotation_for = AUDIT_COMMON_LOG | AUDIT_FGA_LOG | MAINTAIN_TRACE_LOG */ +#define MAINTAIN_TRACE_LOG (1 << 2) extern int AuditLog_RotationAge; extern int AuditLog_RotationSize; @@ -83,11 +85,13 @@ extern PGDLLIMPORT char * AuditLog_filename; extern bool AuditLog_truncate_on_rotation; extern int AuditLog_file_mode; -extern int AuditLog_max_worker_number; -extern int AuditLog_common_log_queue_size_kb; -extern int AuditLog_fga_log_queue_size_kb; -extern int AuditLog_common_log_cache_size_kb; -extern int AuditLog_fga_log_cacae_size_kb; +extern int AuditLog_max_worker_number; +extern int AuditLog_common_log_queue_size_kb; +extern int AuditLog_fga_log_queue_size_kb; +extern int Maintain_trace_log_queue_size_kb; +extern int AuditLog_common_log_cache_size_kb; +extern int AuditLog_fga_log_cacae_size_kb; +extern int Maintain_trace_log_cache_size_kb; extern bool am_auditlogger; extern bool enable_auditlogger_warning; @@ -102,8 +106,9 @@ extern Size AuditLoggerShmemSize(void); extern void AuditLoggerShmemInit(void); extern int AuditLoggerQueueAcquire(void); -extern void alog(int destination, const char *fmt,...) pg_attribute_printf(2, 3); -#define audit_log(args...) alog(AUDIT_COMMON_LOG, ##args) -#define audit_log_fga(args...) alog(AUDIT_FGA_LOG, ##args) +extern void alog(int destination, const char *fmt,...) pg_attribute_printf(2, 3); +#define audit_log(args...) alog(AUDIT_COMMON_LOG, ##args) +#define audit_log_fga(args...) alog(AUDIT_FGA_LOG, ##args) +#define trace_log(args...) 
alog(MAINTAIN_TRACE_LOG, ##args) #endif /* __AUDIT_LOGGER_H__ */ diff --git a/src/include/postmaster/postmaster.h b/src/include/postmaster/postmaster.h index 359cc258..f32e7db9 100644 --- a/src/include/postmaster/postmaster.h +++ b/src/include/postmaster/postmaster.h @@ -109,5 +109,6 @@ extern void ShmemBackendArrayAllocation(void); #ifdef __TBASE__ extern void PostmasterEnableLogTimeout(void); extern void PostmasterDisableTimeout(void); +extern bool PostmasterIsPrimaryAndNormal(void); #endif #endif /* _POSTMASTER_H */ diff --git a/src/include/utils/relcache.h b/src/include/utils/relcache.h index 1cdbc044..bd96e72d 100644 --- a/src/include/utils/relcache.h +++ b/src/include/utils/relcache.h @@ -129,6 +129,8 @@ extern void RelationCacheInitFilePreInvalidate(void); extern void RelationCacheInitFilePostInvalidate(void); extern void RelationCacheInitFileRemove(void); +extern bool RelationHasGTS(Oid reltablespace, Oid relfilenode); + /* should be used only by relcache.c and catcache.c */ extern bool criticalRelcachesBuilt; From 9c176085ed9ba9d5f6e02b330218481e96395104 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Mon, 16 Nov 2020 14:41:57 +0800 Subject: [PATCH 083/578] fix bug ID82369889 (merge request !139) --- src/backend/commands/prepare.c | 8 +++++++- src/backend/pgxc/squeue/squeue.c | 4 ++-- src/backend/tcop/pquery.c | 7 ++++++- 3 files changed, 15 insertions(+), 4 deletions(-) diff --git a/src/backend/commands/prepare.c b/src/backend/commands/prepare.c index 5a46fa7f..1aa469d6 100644 --- a/src/backend/commands/prepare.c +++ b/src/backend/commands/prepare.c @@ -739,6 +739,13 @@ DropPreparedStatement(const char *stmt_name, bool showError) if (entry) { +#ifdef XCP + /* if a process SharedQueueRelease in DropCachedPlan, this SharedQueue + * Can be created by another process, and SharedQueueDisconnectConsumer + * will change the SharedQueue of another process's status, + * so let SharedQueueDisconnectConsumer be in front of DropCachedPlan */ + SharedQueueDisconnectConsumer(entry->stmt_name); +#endif /* Release the plancache entry */ DropCachedPlan(entry->plansource); @@ -750,7 +757,6 @@ DropPreparedStatement(const char *stmt_name, bool showError) if (entry->use_resowner) ResourceOwnerForgetPreparedStmt(CurTransactionResourceOwner, entry->stmt_name); - SharedQueueDisconnectConsumer(entry->stmt_name); #endif #ifdef __TBASE__ if (distributed_query_analyze) diff --git a/src/backend/pgxc/squeue/squeue.c b/src/backend/pgxc/squeue/squeue.c index 515391a3..b1440b60 100644 --- a/src/backend/pgxc/squeue/squeue.c +++ b/src/backend/pgxc/squeue/squeue.c @@ -977,11 +977,11 @@ SharedQueueAcquire(const char *sqname, int ncons) if (old_squeue) { LWLockRelease(SQueuesLock); - pg_usleep(1000000L); + (trycount < 10) ? pg_usleep(10000L) : pg_usleep(1000000L); elog(DEBUG1, "SQueue race condition, give the old producer to " "finish the work and retry again"); trycount++; - if (trycount >= 10) + if (trycount >= 20) elog(ERROR, "Couldn't resolve SQueue race condition after" " %d tries", trycount); goto tryagain; diff --git a/src/backend/tcop/pquery.c b/src/backend/tcop/pquery.c index e7068eab..13f56acc 100644 --- a/src/backend/tcop/pquery.c +++ b/src/backend/tcop/pquery.c @@ -696,10 +696,15 @@ PortalStart(Portal portal, ParamListInfo params, * NB: Check queryDesc->plannedstmt->nParamExec > 0 is incorrect * here since queryDesc->plannedstmt->nParamExec may be used * just to allocate space for them and no actual values passed. 
+ * + * If distributionType is LOCATOR_TYPE_SHARD, even with parameters + * PARAM_EXEC, still follow the redistribution logic, otherwise, + * it may cause SharedQueue conflict in the lower layer redistribution */ #ifdef __TBASE__ if (!paramPassDown && queryDesc->plannedstmt->nParamRemote > 0 && - queryDesc->plannedstmt->remoteparams[queryDesc->plannedstmt->nParamRemote-1].paramkind == PARAM_EXEC) + queryDesc->plannedstmt->remoteparams[queryDesc->plannedstmt->nParamRemote-1].paramkind == PARAM_EXEC && + queryDesc->plannedstmt->distributionType != LOCATOR_TYPE_SHARD) #else if (queryDesc->plannedstmt->nParamRemote > 0 && queryDesc->plannedstmt->remoteparams[queryDesc->plannedstmt->nParamRemote-1].paramkind == PARAM_EXEC) From 87ec1ba9e0655c1961d350de8840e467cce3e17c Mon Sep 17 00:00:00 2001 From: youngxie Date: Fri, 4 Dec 2020 17:42:20 +0800 Subject: [PATCH 084/578] Fix parallel ddl pushing guc variable order. --- src/backend/pgxc/pool/pgxcnode.c | 6 + src/backend/tcop/utility.c | 4 +- src/include/pgxc/pgxcnode.h | 240 ++++++++++++++++--------------- 3 files changed, 129 insertions(+), 121 deletions(-) diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c index 775b703d..5a93bbb1 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -133,6 +133,9 @@ static List *local_param_list = NIL; static StringInfo session_params; static StringInfo local_params; +/* Is forward request to leader coordinator */ +bool forward_mode = false; + typedef struct { NameData name; @@ -4634,6 +4637,9 @@ PGXCNodeGetSessionParamStr(void) if (IS_PGXC_COORDINATOR) appendStringInfo(session_params, "SET global_session TO %s_%d;", PGXCNodeName, MyProcPid); + if (forward_mode) + appendStringInfo(session_params, "SET is_forward_request to true;"); + get_set_command(session_param_list, session_params, false); appendStringInfo(session_params, "SET parentPGXCPid TO %d;", MyProcPid); diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c index 4a8871b6..55b62ce3 100644 --- a/src/backend/tcop/utility.c +++ b/src/backend/tcop/utility.c @@ -1771,12 +1771,12 @@ forward_ddl_to_leader_cn(Node *node, const char *queryString) leader_cn = get_pgxc_nodeoid(leader_name); /* Set flag to indicate forwarded request */ - PGXCNodeSetParam(false, "is_forward_request", "true", 0); + forward_mode = true; pgxc_execute_on_nodes(1, &leader_cn, pstrdup(queryString)); /* Cancel forwarded flag for subsequent requests */ - PGXCNodeSetParam(false, "is_forward_request", "false", 0); + forward_mode = false; return true; } diff --git a/src/include/pgxc/pgxcnode.h b/src/include/pgxc/pgxcnode.h index 872536fc..ccb84026 100644 --- a/src/include/pgxc/pgxcnode.h +++ b/src/include/pgxc/pgxcnode.h @@ -2,7 +2,7 @@ * * pgxcnode.h * - * Utility functions to communicate to Datanodes and Coordinators + * Utility functions to communicate to Datanodes and Coordinators * * * Portions Copyright (c) 2012-2014, TransLattice, Inc. 
@@ -32,103 +32,103 @@ typedef struct PGcancel NODE_CANCEL; /* Helper structure to access Datanode from Session */ typedef enum { - DN_CONNECTION_STATE_IDLE, /* idle, ready for query */ - DN_CONNECTION_STATE_QUERY, /* query is sent, response expected */ - DN_CONNECTION_STATE_CLOSE, /* close is sent, confirmation expected */ - DN_CONNECTION_STATE_ERROR_FATAL, /* fatal error */ - DN_CONNECTION_STATE_COPY_IN, - DN_CONNECTION_STATE_COPY_OUT -} DNConnectionState; + DN_CONNECTION_STATE_IDLE, /* idle, ready for query */ + DN_CONNECTION_STATE_QUERY, /* query is sent, response expected */ + DN_CONNECTION_STATE_CLOSE, /* close is sent, confirmation expected */ + DN_CONNECTION_STATE_ERROR_FATAL, /* fatal error */ + DN_CONNECTION_STATE_COPY_IN, + DN_CONNECTION_STATE_COPY_OUT +} DNConnectionState; typedef enum { - HANDLE_IDLE, - HANDLE_ERROR, - HANDLE_DEFAULT -} PGXCNode_HandleRequested; + HANDLE_IDLE, + HANDLE_ERROR, + HANDLE_DEFAULT +} PGXCNode_HandleRequested; #ifdef __TBASE__ typedef enum { - DNStatus_OK = 0, - DNStatus_ERR = 1, - DNStatus_EXPIRED = 2, - DNStatus_BUTTY + DNStatus_OK = 0, + DNStatus_ERR = 1, + DNStatus_EXPIRED = 2, + DNStatus_BUTTY }DNStateEnum; typedef enum { - SendSetQuery_OK = 0, - SendSetQuery_EXPIRED = 1, - SendSetQuery_SendQuery_ERROR = 2, - SendSetQuery_Set_ERROR = 3, - SendSetQuery_BUTTY + SendSetQuery_OK = 0, + SendSetQuery_EXPIRED = 1, + SendSetQuery_SendQuery_ERROR = 2, + SendSetQuery_Set_ERROR = 3, + SendSetQuery_BUTTY }SendSetQueryStatus; -#define MAX_ERROR_MSG_LENGTH 1024 +#define MAX_ERROR_MSG_LENGTH 1024 #endif #define DN_CONNECTION_STATE_ERROR(dnconn) \ - ((dnconn)->state == DN_CONNECTION_STATE_ERROR_FATAL \ - || (dnconn)->transaction_status == 'E') + ((dnconn)->state == DN_CONNECTION_STATE_ERROR_FATAL \ + || (dnconn)->transaction_status == 'E') #define HAS_MESSAGE_BUFFERED(conn) \ - ((conn)->inCursor + 4 < (conn)->inEnd \ - && (conn)->inCursor + ntohl(*((uint32_t *) ((conn)->inBuffer + (conn)->inCursor + 1))) < (conn)->inEnd) + ((conn)->inCursor + 4 < (conn)->inEnd \ + && (conn)->inCursor + ntohl(*((uint32_t *) ((conn)->inBuffer + (conn)->inCursor + 1))) < (conn)->inEnd) struct pgxc_node_handle { - Oid nodeoid; - int nodeid; - char nodename[NAMEDATALEN]; - char nodehost[NAMEDATALEN]; - int nodeport; - - /* fd of the connection */ - int sock; - /* pid of the remote backend process */ - int backend_pid; - - /* Connection state */ - char transaction_status; - DNConnectionState state; - bool read_only; - struct ResponseCombiner *combiner; + Oid nodeoid; + int nodeid; + char nodename[NAMEDATALEN]; + char nodehost[NAMEDATALEN]; + int nodeport; + + /* fd of the connection */ + int sock; + /* pid of the remote backend process */ + int backend_pid; + + /* Connection state */ + char transaction_status; + DNConnectionState state; + bool read_only; + struct ResponseCombiner *combiner; #ifdef DN_CONNECTION_DEBUG - bool have_row_desc; + bool have_row_desc; #endif #ifndef __USE_GLOBAL_SNAPSHOT__ - uint64 sendGxidVersion; + uint64 sendGxidVersion; #endif - char error[MAX_ERROR_MSG_LENGTH]; - /* Output buffer */ - char *outBuffer; - size_t outSize; - size_t outEnd; - /* Input buffer */ - char *inBuffer; - size_t inSize; - size_t inStart; - size_t inEnd; - size_t inCursor; - /* - * Have a variable to enable/disable response checking and - * if enable then read the result of response checking - * - * For details see comments of RESP_ROLLBACK - */ - bool ck_resp_rollback; - - bool in_extended_query; - bool needSync; /* set when error and extend query. 
*/ + char error[MAX_ERROR_MSG_LENGTH]; + /* Output buffer */ + char *outBuffer; + size_t outSize; + size_t outEnd; + /* Input buffer */ + char *inBuffer; + size_t inSize; + size_t inStart; + size_t inEnd; + size_t inCursor; + /* + * Have a variable to enable/disable response checking and + * if enable then read the result of response checking + * + * For details see comments of RESP_ROLLBACK + */ + bool ck_resp_rollback; + + bool in_extended_query; + bool needSync; /* set when error and extend query. */ #ifdef __TBASE__ bool sock_fatal_occurred; /*Network failure occurred, and sock descriptor was closed */ - char last_command; /*last command we processed. */ - long recv_datarows; - bool plpgsql_need_begin_sub_txn; - bool plpgsql_need_begin_txn; + char last_command; /*last command we processed. */ + long recv_datarows; + bool plpgsql_need_begin_sub_txn; + bool plpgsql_need_begin_txn; #endif }; typedef struct pgxc_node_handle PGXCNodeHandle; @@ -136,21 +136,23 @@ typedef struct pgxc_node_handle PGXCNodeHandle; /* Structure used to get all the handles involved in a transaction */ typedef struct { - PGXCNodeHandle *primary_handle; /* Primary connection to PGXC node */ - int dn_conn_count; /* number of Datanode Handles including primary handle */ - PGXCNodeHandle **datanode_handles; /* an array of Datanode handles */ - int co_conn_count; /* number of Coordinator handles */ - PGXCNodeHandle **coord_handles; /* an array of Coordinator handles */ + PGXCNodeHandle *primary_handle; /* Primary connection to PGXC node */ + int dn_conn_count; /* number of Datanode Handles including primary handle */ + PGXCNodeHandle **datanode_handles; /* an array of Datanode handles */ + int co_conn_count; /* number of Coordinator handles */ + PGXCNodeHandle **coord_handles; /* an array of Coordinator handles */ } PGXCNodeAllHandles; +extern bool forward_mode; + extern void InitMultinodeExecutor(bool is_force); extern Oid get_nodeoid_from_nodeid(int nodeid, char node_type); /* Open/close connection routines (invoked from Pool Manager) */ extern char *PGXCNodeConnStr(char *host, int port, char *dbname, char *user, - char *pgoptions, - char *remote_type, char *parent_node); + char *pgoptions, + char *remote_type, char *parent_node); extern NODE_CONNECTION *PGXCNodeConnect(char *connstr); extern void PGXCNodeClose(NODE_CONNECTION * conn); extern int PGXCNodeConnected(NODE_CONNECTION * conn); @@ -186,42 +188,42 @@ extern void release_handles(bool force); extern void clear_handles(void); extern int get_transaction_nodes(PGXCNodeHandle ** connections, - char client_conn_type, - PGXCNode_HandleRequested type_requested); + char client_conn_type, + PGXCNode_HandleRequested type_requested); extern char* collect_pgxcnode_names(char *nodestring, int conn_count, PGXCNodeHandle ** connections, char client_conn_type); extern char* collect_localnode_name(char *nodestring); -extern int get_active_nodes(PGXCNodeHandle ** connections); - -extern int ensure_in_buffer_capacity(size_t bytes_needed, PGXCNodeHandle * handle); -extern int ensure_out_buffer_capacity(size_t bytes_needed, PGXCNodeHandle * handle); - -extern int pgxc_node_send_query(PGXCNodeHandle * handle, const char *query); -extern int pgxc_node_send_rollback(PGXCNodeHandle * handle, const char *query); -extern int pgxc_node_send_describe(PGXCNodeHandle * handle, bool is_statement, - const char *name); -extern int pgxc_node_send_execute(PGXCNodeHandle * handle, const char *portal, int fetch); -extern int pgxc_node_send_close(PGXCNodeHandle * handle, bool is_statement, - const 
char *name); -extern int pgxc_node_send_sync(PGXCNodeHandle * handle); +extern int get_active_nodes(PGXCNodeHandle ** connections); + +extern int ensure_in_buffer_capacity(size_t bytes_needed, PGXCNodeHandle * handle); +extern int ensure_out_buffer_capacity(size_t bytes_needed, PGXCNodeHandle * handle); + +extern int pgxc_node_send_query(PGXCNodeHandle * handle, const char *query); +extern int pgxc_node_send_rollback(PGXCNodeHandle * handle, const char *query); +extern int pgxc_node_send_describe(PGXCNodeHandle * handle, bool is_statement, + const char *name); +extern int pgxc_node_send_execute(PGXCNodeHandle * handle, const char *portal, int fetch); +extern int pgxc_node_send_close(PGXCNodeHandle * handle, bool is_statement, + const char *name); +extern int pgxc_node_send_sync(PGXCNodeHandle * handle); #ifdef __SUBSCRIPTION__ extern int pgxc_node_send_apply(PGXCNodeHandle * handle, char * buf, int len, bool ignore_pk_conflict); #endif #ifdef __TBASE__ extern int pgxc_node_send_disconnect(PGXCNodeHandle * handle, char *cursor, int cons); #endif -extern int pgxc_node_send_bind(PGXCNodeHandle * handle, const char *portal, - const char *statement, int paramlen, char *params); -extern int pgxc_node_send_parse(PGXCNodeHandle * handle, const char* statement, - const char *query, short num_params, Oid *param_types); -extern int pgxc_node_send_flush(PGXCNodeHandle * handle); -extern int pgxc_node_send_query_extended(PGXCNodeHandle *handle, const char *query, - const char *statement, const char *portal, - int num_params, Oid *param_types, - int paramlen, char *params, - bool send_describe, int fetch_size); +extern int pgxc_node_send_bind(PGXCNodeHandle * handle, const char *portal, + const char *statement, int paramlen, char *params); +extern int pgxc_node_send_parse(PGXCNodeHandle * handle, const char* statement, + const char *query, short num_params, Oid *param_types); +extern int pgxc_node_send_flush(PGXCNodeHandle * handle); +extern int pgxc_node_send_query_extended(PGXCNodeHandle *handle, const char *query, + const char *statement, const char *portal, + int num_params, Oid *param_types, + int paramlen, char *params, + bool send_describe, int fetch_size); extern int pgxc_node_send_plan(PGXCNodeHandle * handle, const char *statement, - const char *query, const char *planstr, - short num_params, Oid *param_types); + const char *query, const char *planstr, + short num_params, Oid *param_types); extern int pgxc_node_send_gid(PGXCNodeHandle *handle, char* gid); #ifdef __TWO_PHASE_TRANS__ extern int pgxc_node_send_starter(PGXCNodeHandle *handle, char* startnode); @@ -231,10 +233,10 @@ extern int pgxc_node_send_clean(PGXCNodeHandle *handle); extern int pgxc_node_send_readonly(PGXCNodeHandle *handle); extern int pgxc_node_send_after_prepare(PGXCNodeHandle *handle); #endif -extern int pgxc_node_send_gxid(PGXCNodeHandle * handle, GlobalTransactionId gxid); -extern int pgxc_node_send_cmd_id(PGXCNodeHandle *handle, CommandId cid); -extern int pgxc_node_send_snapshot(PGXCNodeHandle * handle, Snapshot snapshot); -extern int pgxc_node_send_timestamp(PGXCNodeHandle * handle, TimestampTz timestamp); +extern int pgxc_node_send_gxid(PGXCNodeHandle * handle, GlobalTransactionId gxid); +extern int pgxc_node_send_cmd_id(PGXCNodeHandle *handle, CommandId cid); +extern int pgxc_node_send_snapshot(PGXCNodeHandle * handle, Snapshot snapshot); +extern int pgxc_node_send_timestamp(PGXCNodeHandle * handle, TimestampTz timestamp); extern int pgxc_node_send_prepare_timestamp(PGXCNodeHandle *handle, GlobalTimestamp 
timestamp); extern int @@ -244,20 +246,20 @@ pgxc_node_send_global_timestamp(PGXCNodeHandle *handle, GlobalTimestamp timestam #ifdef __TBASE__ extern int pgxc_node_send_coord_info(PGXCNodeHandle * handle, int coord_pid, TransactionId coord_vxid); -extern int pgxc_node_receive(const int conn_count, - PGXCNodeHandle ** connections, struct timeval * timeout); +extern int pgxc_node_receive(const int conn_count, + PGXCNodeHandle ** connections, struct timeval * timeout); extern bool node_ready_for_query(PGXCNodeHandle *conn); extern bool validate_handles(void); #else -extern bool pgxc_node_receive(const int conn_count, - PGXCNodeHandle ** connections, struct timeval * timeout); +extern bool pgxc_node_receive(const int conn_count, + PGXCNodeHandle ** connections, struct timeval * timeout); #endif -extern int pgxc_node_read_data(PGXCNodeHandle * conn, bool close_if_error); -extern int pgxc_node_is_data_enqueued(PGXCNodeHandle *conn); +extern int pgxc_node_read_data(PGXCNodeHandle * conn, bool close_if_error); +extern int pgxc_node_is_data_enqueued(PGXCNodeHandle *conn); -extern int send_some(PGXCNodeHandle * handle, int len); -extern int pgxc_node_flush(PGXCNodeHandle *handle); -extern void pgxc_node_flush_read(PGXCNodeHandle *handle); +extern int send_some(PGXCNodeHandle * handle, int len); +extern int pgxc_node_flush(PGXCNodeHandle *handle); +extern void pgxc_node_flush_read(PGXCNodeHandle *handle); extern char get_message(PGXCNodeHandle *conn, int *len, char **msg); @@ -266,7 +268,7 @@ extern void add_error_message(PGXCNodeHandle * handle, const char *message); extern Datum pgxc_execute_on_nodes(int numnodes, Oid *nodelist, char *query); extern void PGXCNodeSetParam(bool local, const char *name, const char *value, - int flags); + int flags); extern void PGXCNodeResetParams(bool only_local); extern char *PGXCNodeGetSessionParamStr(void); extern char *PGXCNodeGetTransactionParamStr(void); @@ -275,9 +277,9 @@ extern void RequestInvalidateRemoteHandles(void); extern void RequestRefreshRemoteHandles(void); extern bool PoolerMessagesPending(void); extern void PGXCNodeSetConnectionState(PGXCNodeHandle *handle, - DNConnectionState new_state); + DNConnectionState new_state); extern bool PgxcNodeDiffBackendHandles(List **nodes_alter, - List **nodes_delete, List **nodes_add); + List **nodes_delete, List **nodes_add); extern void PgxcNodeRefreshBackendHandlesShmem(List *nodes_alter); extern void HandlePoolerMessages(void); extern void pgxc_print_pending_data(PGXCNodeHandle *handle, bool reset); From 34debce0d5ff23431ef8cd983b50aa9d77afa2f6 Mon Sep 17 00:00:00 2001 From: youngxie Date: Fri, 4 Dec 2020 17:43:25 +0800 Subject: [PATCH 085/578] set enable_parallel_ddl to false as default. --- src/backend/utils/misc/guc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 3f02526e..562c6ca6 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -2705,7 +2705,7 @@ static struct config_bool ConfigureNamesBool[] = NULL }, &enable_parallel_ddl, - true, + false, NULL, NULL, NULL }, { From b80179e1535e3c6334f2943b42eb79492089c24c Mon Sep 17 00:00:00 2001 From: youngxie Date: Fri, 4 Dec 2020 20:51:38 +0800 Subject: [PATCH 086/578] Set enable_parallel_ddl to default false. 
--- src/backend/pgxc/pool/pgxcnode.c | 14 ++++++++++++++ src/backend/tcop/postgres.c | 3 ++- src/test/regress/expected/sysviews.out | 2 +- 3 files changed, 17 insertions(+), 2 deletions(-) diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c index 5a93bbb1..a7aa41a6 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -4635,10 +4635,24 @@ PGXCNodeGetSessionParamStr(void) if (session_params->len == 0) { if (IS_PGXC_COORDINATOR) + { appendStringInfo(session_params, "SET global_session TO %s_%d;", PGXCNodeName, MyProcPid); + } + + /* + * If forward_mode is true, target node must regard it as normal client + * instead of internal connections ,so is_forward_request must be ahead of + * any guc variables else they will be considered internal variables. + */ if (forward_mode) + { appendStringInfo(session_params, "SET is_forward_request to true;"); + } + else + { + appendStringInfo(session_params, "SET is_forward_request to false;"); + } get_set_command(session_param_list, session_params, false); appendStringInfo(session_params, "SET parentPGXCPid TO %d;", diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index 80b2fc4f..697dcf5b 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -992,7 +992,8 @@ pg_rewrite_query(Query *query) if (query->commandType == CMD_UTILITY && IsA(query->utilityStmt, CreateTableAsStmt) && - (enable_parallel_ddl && is_ddl_leader_cn(leader_cn))) + ((enable_parallel_ddl && is_ddl_leader_cn(leader_cn) || + !enable_parallel_ddl))) { /* * CREATE TABLE AS SELECT and SELECT INTO are rewritten so that the diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out index 0422edd6..b0e92c9f 100644 --- a/src/test/regress/expected/sysviews.out +++ b/src/test/regress/expected/sysviews.out @@ -112,7 +112,7 @@ select name, setting from pg_settings where name like 'enable%'; enable_nestloop_suppression | off enable_null_string | off enable_oracle_compatible | off - enable_parallel_ddl | on + enable_parallel_ddl | off enable_pgbouncer | off enable_plpgsql_debug_print | off enable_pooler_debug_print | on From 76fb6d1e14fc4665fedb2d85db58e9ddd0d2da9b Mon Sep 17 00:00:00 2001 From: youngxie Date: Thu, 26 Nov 2020 17:44:54 +0800 Subject: [PATCH 087/578] Remove redundant transaction acquisition in ExecRemoteUtility. http://tapd.oa.com/10092131/bugtrace/bugs/view?bug_id=1010092131083143135 (cherry picked from commit 30fcea21) 2f0a54b2 Fix format. a833f783 Remove redundant transaction acquisition in ExecRemoteUtility. 
http://tapd.oa.com/10092131/bugtrace/bugs/view?bug_id=1010092131083143135&url_cache_key=d4e1402777dc733479aac463ad1a9d24 --- src/backend/nodes/copyfuncs.c | 1 + src/backend/pgxc/pool/execRemote.c | 9 +++++++-- src/backend/utils/misc/guc.c | 1 + src/include/pgxc/planner.h | 3 ++- 4 files changed, 11 insertions(+), 3 deletions(-) diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c index 8a447982..49ad080a 100644 --- a/src/backend/nodes/copyfuncs.c +++ b/src/backend/nodes/copyfuncs.c @@ -1339,6 +1339,7 @@ _copyRemoteQuery(const RemoteQuery *from) COPY_SCALAR_FIELD(jf_xc_node_id); COPY_SCALAR_FIELD(jf_xc_wholerow); COPY_BITMAPSET_FIELD(conflict_cols); + COPY_SCALAR_FIELD(is_set); #endif return newnode; } diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 1be26a46..931c20cc 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -6617,6 +6617,7 @@ ExecRemoteUtility(RemoteQuery *node) ExecDirectType exec_direct_type = node->exec_direct_type; int i; CommandId cid = GetCurrentCommandId(true); + bool utility_need_transcation = true; if (!force_autocommit) RegisterTransactionLocalNode(true); @@ -6664,7 +6665,11 @@ ExecRemoteUtility(RemoteQuery *node) } #ifdef __TBASE__ - if (!ExecDDLWithoutAcquireXid(node->parsetree)) + /* Some DDL such as ROLLBACK, SET does not need transaction */ + utility_need_transcation = + (!ExecDDLWithoutAcquireXid(node->parsetree) && !node->is_set); + + if (utility_need_transcation) #endif { elog(LOG, "[SAVEPOINT] node->sql_statement:%s", node->sql_statement); @@ -6675,7 +6680,7 @@ ExecRemoteUtility(RemoteQuery *node) snapshot = GetActiveSnapshot(); #ifdef __TBASE__ - if (!ExecDDLWithoutAcquireXid(node->parsetree)) + if (utility_need_transcation) #endif { if (!GlobalTransactionIdIsValid(gxid)) diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 562c6ca6..49550fb5 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -9106,6 +9106,7 @@ set_config_option(const char *name, const char *value, /* force_autocommit is actually does not start transaction on nodes */ step->force_autocommit = true; step->exec_type = EXEC_ON_CURRENT; + step->is_set = true; ExecRemoteUtility(step); pfree(step); pfree(poolcmd.data); diff --git a/src/include/pgxc/planner.h b/src/include/pgxc/planner.h index 558ba13a..f08c4fce 100644 --- a/src/include/pgxc/planner.h +++ b/src/include/pgxc/planner.h @@ -224,7 +224,8 @@ typedef struct AttrNumber jf_xc_wholerow; Bitmapset *conflict_cols; - Node *parsetree; /* to recognise subtxn cmds(savepoint,rollback to,release savepoint) */ + Node *parsetree; /* to recognize subtxn cmds (savepoint, rollback to, release savepoint) */ + bool is_set; /* is SET statement ? */ #endif } RemoteQuery; From 6babaf3db63789c32af23b6eafbfde98a0c4440b Mon Sep 17 00:00:00 2001 From: youngxie Date: Wed, 9 Dec 2020 19:12:16 +0800 Subject: [PATCH 088/578] Fix coredump due to null pointer. http://tapd.oa.com/10092131/bugtrace/bugs/view?bug_id=1010092131081943789 (cherry picked from commit 32ce6c54) 575405ca Fix coredump due to null pointer. 
--- src/backend/pgxc/pool/execRemote.c | 1 - 1 file changed, 1 deletion(-) diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 931c20cc..df03eb1b 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -11634,7 +11634,6 @@ ExecEndRemoteSubplan(RemoteSubplanState *node) ValidateAndCloseCombiner(combiner); combiner->conn_count = 0; - //pfree(node); if (log_remotesubplan_stats) ShowUsageCommon("ExecEndRemoteSubplan", &start_r, &start_t); From 4ef1971dedbdfcdcff8d439e4b80f3e9227ed663 Mon Sep 17 00:00:00 2001 From: jackywpxie Date: Wed, 9 Dec 2020 11:43:35 +0800 Subject: [PATCH 089/578] jacky/feature/MaintainGTS_Tbase_v2.15.16 (merge request !34) Squash merge branch 'jacky/feature/MaintainGTS_Tbase_v2.15.16' into 'Tbase_v2.15.16' * delete PrintStack() * 1. optimize RelationHasGTS * Merge branch 'Tbase_v2.15.16' into jacky/feature/MaintainGTS_Tbase_v2.15.16 * Revert 'fixed bug for persistent datanode connections.' * MaintainGTS supports unlogged table. * fixed bug for persistent datanode connections. * add {} * Merge branch 'Tbase_v2.15.16' into jacky/feature/MaintainGTS_Tbase_v2.15.16 * rollback modification * Revert 'bugfix: tpcc district not found fatal' * bugfix: tpcc district not found fatal * delete extension: reset_gts * clear the modification to buffer.h and buffer.c * fixed bug: endless loop * modified according to xiecanyang's suggestion. * delete damaged_gts * add damaged_gts test option * add damage_gts * fixed bug: count not open tlog file when the tuple has been frozen. * delete a comment. * modified the comment of PostmasterIsPrimaryAndNormal. * rename PostmasterIsAlive to PostmasterIsPrimaryAndNormal * modified code format * delete enable_satisfies_any * delete pg_memory_barrier() * delete space. * add pg_memory_barrier() * fixed a error of going back. * go back to before fixing the bug of persistent_datanode_connection. * fixed bugs: insert abort when persistent_datanode_connecionts = on. * correct a typo * ajustted code format. * fixed bugs: * ajustted code format. * roll back the modification of ReadBuffer_common. * mkdir maintain for trace log. * fixed bug: release clog lock. * printData/printStack call audit_log_trace. * check and reset GTS before and after vacuum pages. * add trace log accoording to audit fga log. * optimized code format. * 1. ajustted code format: suck as line break, etc. * rollback: not fully tested * reduce if logical judgement. * modified acoording to jason's suggestion * deal with special GTS * modified according to code review comments. * comment memory barrier. * add GTS values: 3, 4. * print the line number and file name of error stack. * reset_gts = 1: * fixed bug: Could not open file 'pg_commit_ts/XXX': No such file or * reduce unnecessary logs. * fixed bug: set persistent_datanode_connections to on, insert transaction * support heap_page_reset_gts(get_buffer('table_name', page_number)); * 1.fix bug about errmsg, 'database tbase does not exist', in pg_log. * solve the problem of GTS output big interger out of bounds. * remove dependency on Kernal * pg_archivecleanup support removing the .gts file. * pg_waldump ... -r transaction command support GTS. * rename tbase_gts to tbase_gts_tools * fix bug: heap_page_items can not output t_data when page id is not normal. * initialize values * shuiwu20201029_2 * refactoring functions to simplify code. * add tbase_gts extension in the Makefile of extensions. * delete enable_satisfies_any from GUC * modified txid_gts. 
* add heap_page_reset_gts() * add xmin_gts and xmax_gts in extension function * add extension function heap_page_items_with_gts. * add tbase_gts extendion * add enable_satisfies_any * fix bug: * print correct CTID. * make changes according to code viewing suggestions. * 1. Print log when GTS is inserted into heaptuple * add ctid information while checking GTS * use __sync_synchronize() to prevent CPU reordering and compiler * 1. increase log information when gts is incorrect. * When gts is not set, its correctness is not checked. * check the correctness of GTS before writing pages. * fix bug ID82284643: GTS is not used for index ans system tables. * fix bug ID82284643: reduce locks of checking GTS when reading pages. * fix bug ID82284643: check and reset tuple's xmin_gts and xmax_gts according to the gts in tlog through write and read data page. * fix bug ID82284643: check and reset tuple's xmin_gts and xmax_gts according to the gts in tlog through vacuum operation. --- src/backend/commands/vacuumlazy.c | 74 ++++++---------------------- src/backend/utils/cache/relcache.c | 78 ++---------------------------- src/include/utils/relcache.h | 66 ++++++++++++------------- 3 files changed, 52 insertions(+), 166 deletions(-) diff --git a/src/backend/commands/vacuumlazy.c b/src/backend/commands/vacuumlazy.c index f1508d69..92fe6c94 100644 --- a/src/backend/commands/vacuumlazy.c +++ b/src/backend/commands/vacuumlazy.c @@ -142,11 +142,10 @@ typedef struct LVRelStats int gts_maintain_option; -static void PrintStack(void); static void PrintData(RelFileNode *rnode, BlockNumber blkno, Page page, OffsetNumber lineoff, GlobalTimestamp tlog_xmin_gts, GlobalTimestamp tlog_xmax_gts); -static void MaintainGTS(RelFileNode *rnode, BlockNumber blkno, Buffer buffer); +static void MaintainGTS(Relation rel, BlockNumber blkno, Buffer buffer); /* A few variables that don't seem worth passing around as parameters */ static int elevel = -1; @@ -1060,7 +1059,7 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats, #ifdef __SUPPORT_DISTRIBUTED_TRANSACTION__ if (gts_maintain_option != GTS_MAINTAIN_NOTHING) { - MaintainGTS(&onerel->rd_node, blkno, buf); + MaintainGTS(onerel, blkno, buf); } #endif @@ -1422,7 +1421,7 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats, #ifdef __SUPPORT_DISTRIBUTED_TRANSACTION__ if (gts_maintain_option != GTS_MAINTAIN_NOTHING) { - MaintainGTS(&onerel->rd_node, blkno, buf); + MaintainGTS(onerel, blkno, buf); } #endif @@ -2684,54 +2683,6 @@ xlog_reinit_extent_pages(RelFileNode rnode, ExtentID eid) #define STACK_SIZE 64 -/* - * print error stack to maintain_trace file. 
- */ -static void -PrintStack(void) -{ - void *trace[STACK_SIZE] = {0}; - size_t size = backtrace(trace, STACK_SIZE); - char **symbols = (char **) backtrace_symbols(trace, size); - size_t i = 0; - time_t t = 0; - struct tm *timeInfo = NULL; - - if (symbols == NULL) - { - return; - } - - time(&t); - timeInfo = localtime(&t); - trace_log("Dumping stack starts at %s", asctime(timeInfo)); - trace_log("backtrace() returned %zu addresses.", size); - for (i = 1; i < size; i++) - { - char syscom[MAXPGPATH] = {0}; - FILE *fcmd = NULL; - char temp[MAXPGPATH] = {0}; - - trace_log("#%-2zu %s", i, symbols[i]); - - snprintf(syscom, MAXPGPATH, "addr2line %p -e %s -f -C", trace[i], exename); - fcmd = popen(syscom, "r"); - if (fcmd == NULL) - { - continue; - } - while (fgets(temp, sizeof(temp), fcmd) != NULL) - { - /* ignore the ending "\n" */ - trace_log(" %.*s", (int) strlen(temp) - 1, temp); - } - pclose(fcmd); - } - trace_log("Dumping stack ends.\n"); - - free(symbols); -} - /* * print error data to maintain file. */ @@ -2825,8 +2776,9 @@ PrintData(RelFileNode *rnode, BlockNumber blkno, Page page, OffsetNumber lineoff * doing vacuum. */ void -MaintainGTS(RelFileNode *rnode, BlockNumber blkno, Buffer buffer) +MaintainGTS(Relation rel, BlockNumber blkno, Buffer buffer) { + RelFileNode *rnode = &rel->rd_node; Page page; int lines; OffsetNumber lineoff; @@ -2845,7 +2797,7 @@ MaintainGTS(RelFileNode *rnode, BlockNumber blkno, Buffer buffer) return; } - if (!RelationHasGTS(rnode->spcNode, rnode->relNode)) + if (!RelationHasGTS(rel)) { return; } @@ -2902,9 +2854,6 @@ MaintainGTS(RelFileNode *rnode, BlockNumber blkno, Buffer buffer) tuphdr->t_infomask, tuphdr->t_infomask & HEAP_XMAX_IS_MULTI, xmin, tuple_xmin_gts, tlog_xmin_gts); - PrintStack(); - PrintData(rnode, blkno, page, lineoff, tlog_xmin_gts, 0); - if (reset) { changed = true; @@ -2918,6 +2867,10 @@ MaintainGTS(RelFileNode *rnode, BlockNumber blkno, Buffer buffer) xmin, tuple_xmin_gts, HeapTupleHeaderGetXminTimestamp(tuphdr)); } + else + { + PrintData(rnode, blkno, page, lineoff, tlog_xmin_gts, 0); + } } } @@ -2949,9 +2902,6 @@ MaintainGTS(RelFileNode *rnode, BlockNumber blkno, Buffer buffer) HeapTupleHeaderGetUpdateXid(tuphdr), xmax, tuple_xmax_gts, tlog_xmax_gts); - PrintStack(); - PrintData(rnode, blkno, page, lineoff, 0, tlog_xmax_gts); - if (reset) { changed = true; @@ -2966,6 +2916,10 @@ MaintainGTS(RelFileNode *rnode, BlockNumber blkno, Buffer buffer) xmax, tuple_xmax_gts, HeapTupleHeaderGetXminTimestamp(tuphdr)); } + else + { + PrintData(rnode, blkno, page, lineoff, 0, tlog_xmax_gts); + } } } } diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c index 0614298b..98acab3e 100644 --- a/src/backend/utils/cache/relcache.c +++ b/src/backend/utils/cache/relcache.c @@ -2368,90 +2368,22 @@ RelationIdGetRelation(Oid relationId) * Whether a relation has xmin_gts and max_gts. 
*/ bool -RelationHasGTS(Oid reltablespace, Oid relfilenode) +RelationHasGTS(Relation rel) { bool has = false; - SysScanDesc scandesc = NULL; - Relation relation = NULL; - HeapTuple ntp = NULL; - ScanKeyData skey[2]; - bool found = false; - Oid relid = InvalidOid; - Form_pg_class classform = NULL; - - /* zero means this is a "mapped" relation */ - if (0 == relfilenode || relfilenode < FirstNormalObjectId) - { - return false; - } - if (GLOBALTABLESPACE_OID == reltablespace) + if (!RelationIsValid(rel)) { return false; } - /* pg_class will show 0 when the value is actually MyDatabaseTableSpace */ - if (reltablespace == MyDatabaseTableSpace) - { - reltablespace = 0; - } - - /* - * Not a shared table, could either be a plain relation or a - * non-shared, nailed one, like e.g. pg_class. - * - * check for plain relations by looking in pg_class - */ - relation = heap_open(RelationRelationId, AccessShareLock); - - ScanKeyInit(&skey[0], - Anum_pg_class_reltablespace, - BTEqualStrategyNumber, - F_OIDEQ, - ObjectIdGetDatum(reltablespace)); - ScanKeyInit(&skey[1], - Anum_pg_class_relfilenode, - BTEqualStrategyNumber, - F_OIDEQ, - ObjectIdGetDatum(relfilenode)); - - scandesc = systable_beginscan(relation, - ClassTblspcRelfilenodeIndexId, - true, - NULL, - 2, - skey); - - while (HeapTupleIsValid(ntp = systable_getnext(scandesc))) - { - if (found) - { - elog(ERROR, - "unexpected duplicate for tablespace %u, relfilenode %u", - reltablespace, relfilenode); - } - - found = true; - relid = HeapTupleGetOid(ntp); - classform = (Form_pg_class) GETSTRUCT(ntp); - } - - if (!found) - { - elog(WARNING, - "unexpected none for tablespace %u, relfilenode %u", - reltablespace, relfilenode); - } - else if ((classform->relkind == RELKIND_RELATION || - classform->relkind == RELPERSISTENCE_UNLOGGED) && - !IsSystemClass(relid, classform)) + if ((rel->rd_rel->relkind == RELKIND_RELATION || + rel->rd_rel->relkind == RELPERSISTENCE_UNLOGGED) && + !IsSystemRelation(rel)) { has = true; } - systable_endscan(scandesc); - heap_close(relation, AccessShareLock); - return has; } diff --git a/src/include/utils/relcache.h b/src/include/utils/relcache.h index bd96e72d..98adccc1 100644 --- a/src/include/utils/relcache.h +++ b/src/include/utils/relcache.h @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * * relcache.h - * Relation descriptor cache definitions. + * Relation descriptor cache definitions. * * * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group @@ -21,9 +21,9 @@ typedef struct RelationData *Relation; /* ---------------- - * RelationPtr is used in the executor to support index scans - * where we have to keep track of several index relations in an - * array. -cim 9/10/89 + * RelationPtr is used in the executor to support index scans + * where we have to keep track of several index relations in an + * array. 
-cim 9/10/89 * ---------------- */ typedef Relation *RelationPtr; @@ -40,30 +40,30 @@ extern void RelationClose(Relation relation); extern List *RelationGetFKeyList(Relation relation); extern List *RelationGetIndexList(Relation relation); extern List *RelationGetStatExtList(Relation relation); -extern Oid RelationGetOidIndex(Relation relation); -extern Oid RelationGetPrimaryKeyIndex(Relation relation); -extern Oid RelationGetReplicaIndex(Relation relation); +extern Oid RelationGetOidIndex(Relation relation); +extern Oid RelationGetPrimaryKeyIndex(Relation relation); +extern Oid RelationGetReplicaIndex(Relation relation); extern List *RelationGetIndexExpressions(Relation relation); extern List *RelationGetIndexPredicate(Relation relation); typedef enum IndexAttrBitmapKind { - INDEX_ATTR_BITMAP_ALL, - INDEX_ATTR_BITMAP_KEY, - INDEX_ATTR_BITMAP_PRIMARY_KEY, - INDEX_ATTR_BITMAP_IDENTITY_KEY + INDEX_ATTR_BITMAP_ALL, + INDEX_ATTR_BITMAP_KEY, + INDEX_ATTR_BITMAP_PRIMARY_KEY, + INDEX_ATTR_BITMAP_IDENTITY_KEY } IndexAttrBitmapKind; extern Bitmapset *RelationGetIndexAttrBitmap(Relation relation, - IndexAttrBitmapKind keyAttrs); + IndexAttrBitmapKind keyAttrs); extern void RelationGetExclusionInfo(Relation indexRelation, - Oid **operators, - Oid **procs, - uint16 **strategies); + Oid **operators, + Oid **procs, + uint16 **strategies); extern void RelationSetIndexList(Relation relation, - List *indexIds, Oid oidIndex); + List *indexIds, Oid oidIndex); extern void RelationInitIndexAccessInfo(Relation relation); @@ -74,10 +74,10 @@ extern struct PublicationActions *GetRelationPublicationActions(Relation relatio /* * Routines to support ereport() reports of relation-related errors */ -extern int errtable(Relation rel); -extern int errtablecol(Relation rel, int attnum); -extern int errtablecolname(Relation rel, const char *colname); -extern int errtableconstraint(Relation rel, const char *conname); +extern int errtable(Relation rel); +extern int errtablecol(Relation rel, int attnum); +extern int errtablecolname(Relation rel, const char *colname); +extern int errtableconstraint(Relation rel, const char *conname); /* * Routines for backend startup @@ -90,21 +90,21 @@ extern void RelationCacheInitializePhase3(void); * Routine to create a relcache entry for an about-to-be-created relation */ extern Relation RelationBuildLocalRelation(const char *relname, - Oid relnamespace, - TupleDesc tupDesc, - Oid relid, - Oid relfilenode, - Oid reltablespace, - bool shared_relation, - bool mapped_relation, - char relpersistence, - char relkind); + Oid relnamespace, + TupleDesc tupDesc, + Oid relid, + Oid relfilenode, + Oid reltablespace, + bool shared_relation, + bool mapped_relation, + char relpersistence, + char relkind); /* * Routine to manage assignment of new relfilenode to a relation */ extern void RelationSetNewRelfilenode(Relation relation, char persistence, - TransactionId freezeXid, MultiXactId minmulti); + TransactionId freezeXid, MultiXactId minmulti); /* * Routines for flushing/rebuilding relcache entries in various scenarios @@ -119,7 +119,7 @@ extern void RelationCloseSmgrByOid(Oid relationId); extern void AtEOXact_RelationCache(bool isCommit); extern void AtEOSubXact_RelationCache(bool isCommit, SubTransactionId mySubid, - SubTransactionId parentSubid); + SubTransactionId parentSubid); /* * Routines to help manage rebuilding of relcache init files @@ -129,7 +129,7 @@ extern void RelationCacheInitFilePreInvalidate(void); extern void RelationCacheInitFilePostInvalidate(void); extern void 
RelationCacheInitFileRemove(void); -extern bool RelationHasGTS(Oid reltablespace, Oid relfilenode); +extern bool RelationHasGTS(Relation rel); /* should be used only by relcache.c and catcache.c */ extern bool criticalRelcachesBuilt; @@ -137,4 +137,4 @@ extern bool criticalRelcachesBuilt; /* should be used only by relcache.c and postinit.c */ extern bool criticalSharedRelcachesBuilt; -#endif /* RELCACHE_H */ +#endif /* RELCACHE_H */ From e2e7410c2163916e356847b8be5d65cfe4965531 Mon Sep 17 00:00:00 2001 From: youngxie Date: Wed, 9 Dec 2020 20:40:39 +0800 Subject: [PATCH 090/578] Add debug print for cold hot router. (cherry picked from commit c53c3b77) 5e7ed0e2 edit according to review 6675111f Add debug print for cold hot router. --- src/backend/pgxc/shard/shardmap.c | 61 +++++++++++++++++++++++++++++-- src/backend/utils/misc/guc.c | 11 ++++++ 2 files changed, 69 insertions(+), 3 deletions(-) diff --git a/src/backend/pgxc/shard/shardmap.c b/src/backend/pgxc/shard/shardmap.c index 9e1cec0d..d0a2e242 100644 --- a/src/backend/pgxc/shard/shardmap.c +++ b/src/backend/pgxc/shard/shardmap.c @@ -112,7 +112,11 @@ #include "utils/ruleutils.h" #endif +/* 12 month for a year */ +#define COLD_HOT_INTERVAL_YEAR 12 + bool g_IsExtension; +bool enable_cold_hot_router_print; extern bool trace_extent; typedef struct @@ -4569,12 +4573,24 @@ static bool IsTempColdData(Datum secValue, RelationAccessType access, int32 inte bool IsHotData(Datum secValue, RelationAccessType access, int32 interval, int step, Datum startValue) { - //int32 gap; Timestamp hotDataTime; + if (enable_cold_hot_router_print) + { + elog(LOG, "IsHotData Check value "INT64_FORMAT" access %d interval %d step %d "INT64_FORMAT, + DatumGetInt64(secValue), + access, interval, step, + DatumGetInt64(startValue)); + } + + /* trade temp cold data as cold data. checking is needed if data would satisfy temp_cold_date guc option */ if (true == IsTempColdData(secValue, access, interval, step, startValue)) { + if (enable_cold_hot_router_print) + { + elog(LOG, "Return from TempColdData Value: %s", g_TempColdDate ? 
g_TempColdDate : "(null)"); + } return false; } #if 0 @@ -4589,6 +4605,27 @@ bool IsHotData(Datum secValue, RelationAccessType access, int32 interval, errmsg("timestamp out of range"))); } + if (enable_cold_hot_router_print) + { + elog(LOG,"IsHotData Check hotDateTime "INT64_FORMAT + " Manual hot data time " + "{ tm_sec:%d tm_min:%d tm_hour:%d tm_mday:%d tm_mon:%d tm_year:%d tm_wday:%d tm_yday:%d" + " tm_isdst:%d tm_gmtoff:%ld tm_zone:%s } ret: %d", + (int64)hotDataTime, + g_ManualHotDataTime.tm_sec, + g_ManualHotDataTime.tm_min, + g_ManualHotDataTime.tm_hour, + g_ManualHotDataTime.tm_mday, + g_ManualHotDataTime.tm_mon, + g_ManualHotDataTime.tm_year, + g_ManualHotDataTime.tm_wday, + g_ManualHotDataTime.tm_yday, + g_ManualHotDataTime.tm_isdst, + g_ManualHotDataTime.tm_gmtoff, + g_ManualHotDataTime.tm_zone, + ((Timestamp)secValue >= hotDataTime)); + } + return ((Timestamp)secValue >= hotDataTime); } @@ -4618,6 +4655,7 @@ List* ShardMapRouter(Oid group, Oid coldgroup, Oid relation, Oid type, Datum dva TimestampTz start_timestamp = 0; Relation rel = NULL; Form_pg_partition_interval routerinfo = NULL; + bool router_log_print = false; rel = relation_open(relation, NoLock); @@ -4628,6 +4666,9 @@ List* ShardMapRouter(Oid group, Oid coldgroup, Oid relation, Oid type, Datum dva relation_close(rel, NoLock); + router_log_print = (enable_cold_hot_router_print && accessType == RELATION_ACCESS_INSERT && + (RELATION_IS_INTERVAL(rel) || RELATION_IS_CHILD(rel))); + if (g_EnableKeyValue) { /* check whether the value is key value */ @@ -4655,7 +4696,11 @@ List* ShardMapRouter(Oid group, Oid coldgroup, Oid relation, Oid type, Datum dva bdualwrite = false; } - + if (router_log_print) + { + elog(LOG, "Group %d coldgroup %d relation %d secAttr %d isSecNull %d dualwrite %d", + group, coldgroup, relation, secAttr, isSecNull, bdualwrite); + } /* get partition stragegy first */ if (!isSecNull && secAttr != InvalidAttrNumber) @@ -4669,7 +4714,7 @@ List* ShardMapRouter(Oid group, Oid coldgroup, Oid relation, Oid type, Datum dva partitionStrategy = routerinfo->partinterval_type; if (partitionStrategy == IntervalType_Month && - routerinfo->partinterval_int == 12) + routerinfo->partinterval_int == COLD_HOT_INTERVAL_YEAR) { partitionStrategy = IntervalType_Year; } @@ -4677,7 +4722,17 @@ List* ShardMapRouter(Oid group, Oid coldgroup, Oid relation, Oid type, Datum dva interval_step = routerinfo->partinterval_int; start_timestamp = routerinfo->partstartvalue_ts; + + if (router_log_print) + { + elog(LOG, "has routerinfo %d", partitionStrategy); + } + } + else if (router_log_print) + { + elog(LOG, "no routerinfo %d", partitionStrategy); } + relation_close(rel, NoLock); } diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 49550fb5..f75f6331 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -178,6 +178,7 @@ extern char *default_tablespace; extern char *temp_tablespaces; extern bool ignore_checksum_failure; extern bool synchronize_seqscans; +extern bool enable_cold_hot_router_print; #ifdef _PUB_SUB_RELIABLE_ static char * g_wal_stream_type_str; #endif @@ -2718,6 +2719,16 @@ static struct config_bool ConfigureNamesBool[] = false, NULL, NULL, NULL }, + { + {"enable_cold_hot_router_print", PGC_USERSET, CUSTOM_OPTIONS, + gettext_noop("Whether print cold hot router."), + NULL + }, + &enable_cold_hot_router_print, + false, + NULL, NULL, NULL + }, + #endif /* End-of-list marker */ From 562798245c85dcc01d7d31d8ccf3b9838d5b171c Mon Sep 17 00:00:00 2001 From: youngxie Date: Fri, 11 
Dec 2020 17:48:38 +0800 Subject: [PATCH 091/578] Fix coldhot table router due to SQLValueFunction. --- src/backend/optimizer/prep/preptlist.c | 9 +++++++-- src/backend/optimizer/util/clauses.c | 28 ++++++++++++++++++++++++++ src/include/optimizer/clauses.h | 22 +++++++++++--------- 3 files changed, 47 insertions(+), 12 deletions(-) diff --git a/src/backend/optimizer/prep/preptlist.c b/src/backend/optimizer/prep/preptlist.c index 43d9b501..69a80885 100644 --- a/src/backend/optimizer/prep/preptlist.c +++ b/src/backend/optimizer/prep/preptlist.c @@ -229,13 +229,18 @@ preprocess_targetlist(PlannerInfo *root, List *tlist) secDataType = exprType((Node *)keyTle->expr); + /* evaluate sql value function on coordinator */ + keyTle->expr = (Expr *) replace_eval_sql_value_function( + (Node *)keyTle->expr); + secConstExpr = (Const *) eval_const_expressions(root, (Node *)keyTle->expr); + + /* cold hot insert router must be on coordinator */ if (!IsA(secConstExpr, Const) || secConstExpr->consttype != secDataType) { - list_free(nodeList); - goto END_restrict; + elog(ERROR, "expression on cold-hot separation column must be const."); } secisnull = secConstExpr->constisnull; diff --git a/src/backend/optimizer/util/clauses.c b/src/backend/optimizer/util/clauses.c index 69f02ae7..d840206d 100644 --- a/src/backend/optimizer/util/clauses.c +++ b/src/backend/optimizer/util/clauses.c @@ -5256,4 +5256,32 @@ bool find_sublink_walker(Node *node, List **list) return expression_tree_walker(node, find_sublink_walker, list); } + +/* + * replace_eval_sql_value_function: + * eval SQLValueFunction and replace as Const value. + */ +Node* +replace_eval_sql_value_function(Node *node) +{ + if (node == NULL) + return NULL; + + if (node->type == T_SQLValueFunction) + { + /* + * All variants of SQLValueFunction are stable, so if we are + * evaluating the expression's value, we should evaluate the + * current function value. Otherwise just copy. + */ + SQLValueFunction *svf = (SQLValueFunction *) node; + + return (Node *) evaluate_expr((Expr *) svf, + svf->type, + svf->typmod, + InvalidOid); + } + + return expression_tree_mutator(node, replace_eval_sql_value_function, NULL); +} #endif \ No newline at end of file diff --git a/src/include/optimizer/clauses.h b/src/include/optimizer/clauses.h index 3e7b9e4c..fddeb132 100644 --- a/src/include/optimizer/clauses.h +++ b/src/include/optimizer/clauses.h @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * * clauses.h - * prototypes for clauses.c. + * prototypes for clauses.c. * * * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group @@ -22,14 +22,14 @@ typedef struct { - int numWindowFuncs; /* total number of WindowFuncs found */ - Index maxWinRef; /* windowFuncs[] is indexed 0 .. maxWinRef */ - List **windowFuncs; /* lists of WindowFuncs for each winref */ + int numWindowFuncs; /* total number of WindowFuncs found */ + Index maxWinRef; /* windowFuncs[] is indexed 0 .. 
maxWinRef */ + List **windowFuncs; /* lists of WindowFuncs for each winref */ } WindowFuncLists; extern Expr *make_opclause(Oid opno, Oid opresulttype, bool opretset, - Expr *leftop, Expr *rightop, - Oid opcollid, Oid inputcollid); + Expr *leftop, Expr *rightop, + Oid opcollid, Oid inputcollid); extern Node *get_leftop(const Expr *clause); extern Node *get_rightop(const Expr *clause); @@ -50,7 +50,7 @@ extern List *make_ands_implicit(Expr *clause); extern bool contain_agg_clause(Node *clause); extern void get_agg_clause_costs(PlannerInfo *root, Node *clause, - AggSplit aggsplit, AggClauseCosts *costs); + AggSplit aggsplit, AggClauseCosts *costs); extern bool contain_window_function(Node *clause); extern WindowFuncLists *find_window_functions(Node *clause, Index maxWinRef); @@ -75,7 +75,7 @@ extern Var *find_forced_null_var(Node *clause); extern bool is_pseudo_constant_clause(Node *clause); extern bool is_pseudo_constant_clause_relids(Node *clause, Relids relids); -extern int NumRelids(Node *clause); +extern int NumRelids(Node *clause); extern void CommuteOpExpr(OpExpr *clause); extern void CommuteRowCompareExpr(RowCompareExpr *clause); @@ -88,7 +88,9 @@ extern Query *inline_set_returning_function(PlannerInfo *root, RangeTblEntry *rte); extern Node *substitute_sublink_with_node(Node *expr, SubLink *sublink, - Node *node); + Node *node); extern bool find_sublink_walker(Node *node, List **list); -#endif /* CLAUSES_H */ +extern Node *replace_eval_sql_value_function(Node *node); + +#endif /* CLAUSES_H */ From 53fd7a270de955b26d256367e5a92908aa48da12 Mon Sep 17 00:00:00 2001 From: youngxie Date: Sun, 13 Dec 2020 15:08:17 +0800 Subject: [PATCH 092/578] Fix cold hot router error for now(),sysdate,currenttimestamp. --- src/backend/optimizer/prep/preptlist.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/backend/optimizer/prep/preptlist.c b/src/backend/optimizer/prep/preptlist.c index 69a80885..9091f15f 100644 --- a/src/backend/optimizer/prep/preptlist.c +++ b/src/backend/optimizer/prep/preptlist.c @@ -233,14 +233,15 @@ preprocess_targetlist(PlannerInfo *root, List *tlist) keyTle->expr = (Expr *) replace_eval_sql_value_function( (Node *)keyTle->expr); - secConstExpr = (Const *) eval_const_expressions(root, + secConstExpr = (Const *) estimate_expression_value(root, (Node *)keyTle->expr); /* cold hot insert router must be on coordinator */ if (!IsA(secConstExpr, Const) || secConstExpr->consttype != secDataType) { - elog(ERROR, "expression on cold-hot separation column must be const."); + list_free(nodeList); + goto END_restrict; } secisnull = secConstExpr->constisnull; From da8f0f40e7d156659a1bd18f56fd0a66aefb1832 Mon Sep 17 00:00:00 2001 From: youngxie Date: Tue, 15 Dec 2020 14:35:37 +0800 Subject: [PATCH 093/578] Disable forward in parallel ddl. 
--- src/backend/pgxc/pool/pgxcnode.c | 17 ------ src/backend/postmaster/postmaster.c | 4 -- src/backend/tcop/postgres.c | 8 +-- src/backend/tcop/utility.c | 73 +++++++++++++++----------- src/backend/utils/misc/guc.c | 15 ++---- src/include/pgxc/pgxc.h | 12 ++--- src/include/pgxc/pgxcnode.h | 2 - src/test/regress/expected/sysviews.out | 3 +- 8 files changed, 51 insertions(+), 83 deletions(-) diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c index a7aa41a6..60496f76 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -133,9 +133,6 @@ static List *local_param_list = NIL; static StringInfo session_params; static StringInfo local_params; -/* Is forward request to leader coordinator */ -bool forward_mode = false; - typedef struct { NameData name; @@ -4640,20 +4637,6 @@ PGXCNodeGetSessionParamStr(void) PGXCNodeName, MyProcPid); } - /* - * If forward_mode is true, target node must regard it as normal client - * instead of internal connections ,so is_forward_request must be ahead of - * any guc variables else they will be considered internal variables. - */ - if (forward_mode) - { - appendStringInfo(session_params, "SET is_forward_request to true;"); - } - else - { - appendStringInfo(session_params, "SET is_forward_request to false;"); - } - get_set_command(session_param_list, session_params, false); appendStringInfo(session_params, "SET parentPGXCPid TO %d;", MyProcPid); diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index e22a0cca..f7ed9637 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -671,10 +671,6 @@ bool isRestoreMode = false; int remoteConnType = REMOTE_CONN_APP; -#ifdef __TBASE__ -bool is_forward_request = false; -#endif - /* key pair to be used as object id while using advisory lock for backup */ Datum xc_lockForBackupKey1; Datum xc_lockForBackupKey2; diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index 697dcf5b..4ed9b7d9 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -977,7 +977,6 @@ static List * pg_rewrite_query(Query *query) { List *querytree_list; - char *leader_cn = NULL; if (Debug_print_parse) elog_node_display(LOG, "parse tree", query, @@ -987,13 +986,8 @@ pg_rewrite_query(Query *query) ResetUsage(); #ifdef PGXC - /* directly forward the request */ - leader_cn = find_ddl_leader_cn(); - if (query->commandType == CMD_UTILITY && - IsA(query->utilityStmt, CreateTableAsStmt) && - ((enable_parallel_ddl && is_ddl_leader_cn(leader_cn) || - !enable_parallel_ddl))) + IsA(query->utilityStmt, CreateTableAsStmt)) { /* * CREATE TABLE AS SELECT and SELECT INTO are rewritten so that the diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c index 55b62ce3..5ed5bfcc 100644 --- a/src/backend/tcop/utility.c +++ b/src/backend/tcop/utility.c @@ -146,7 +146,6 @@ static bool IsStmtAllowedInLockedMode(Node *parsetree, const char *queryString); static void ExecCreateKeyValuesStmt(Node *parsetree); static void RemoveSequeceBarely(DropStmt *stmt); extern void RegisterSeqDrop(char *name, int32 type); -static bool forward_ddl_to_leader_cn(Node *node, const char *queryString); extern bool g_GTM_skip_catalog; @@ -1412,8 +1411,22 @@ ProcessUtilityPost(PlannedStmt *pstmt, break; } } + + /* + * Also truncate on coordinators which makes parallel ddl possible. + * temp table only exists on current coordinator + * which parallel ddl has no effect. 
+ */ + if (!is_temp) + { + exec_type = EXEC_ON_ALL_NODES; + } + else + { exec_type = EXEC_ON_DATANODES; } + + } break; case T_AlterDatabaseStmt: @@ -1737,48 +1750,40 @@ ProcessUtilityPost(PlannedStmt *pstmt, #ifdef __TBASE__ /* - * Forward specific DDLs request to leader cn. - * - * On success return true else false. + * Enable parallel ddl for specific query. */ -static bool -forward_ddl_to_leader_cn(Node *node, const char *queryString) +static void +parallel_ddl_process(Node *node) { - Oid leader_cn = InvalidOid; - char *leader_name = NULL; - - /* avoid forward recurse */ - if (!enable_parallel_ddl || !IS_PGXC_LOCAL_COORDINATOR || is_forward_request) + if (!enable_parallel_ddl || !IS_PGXC_LOCAL_COORDINATOR) { - return false; + return ; } + switch (nodeTag(node)) + { + case T_CreateStmt: + case T_CreateForeignTableStmt: + case T_CreateTableAsStmt: + case T_CreateSchemaStmt: + case T_AlterTableStmt: + case T_DefineStmt: + case T_DropStmt: + case T_RenameStmt: + case T_TruncateStmt: + case T_IndexStmt: /* CONCURRENT INDEX is not supported */ if (IsA(node,IndexStmt) && castNode(IndexStmt,node)->concurrent) { - return false; + return ; } - - /* Set parallel ddl flag */ - is_txn_has_parallel_ddl = true; - - leader_name = find_ddl_leader_cn(); - if(is_ddl_leader_cn(leader_name)) - { - return false; + break; + default: + return ; } - leader_cn = get_pgxc_nodeoid(leader_name); - - /* Set flag to indicate forwarded request */ - forward_mode = true; - - pgxc_execute_on_nodes(1, &leader_cn, pstrdup(queryString)); - - /* Cancel forwarded flag for subsequent requests */ - forward_mode = false; - - return true; + /* Parallel ddl is enabled, set parallel ddl flag */ + is_txn_has_parallel_ddl = true; } #endif @@ -1807,6 +1812,10 @@ standard_ProcessUtility(PlannedStmt *pstmt, bool isTopLevel = (context == PROCESS_UTILITY_TOPLEVEL); ParseState *pstate; +#ifdef __TBASE__ + /* parallel enable check */ + parallel_ddl_process(parsetree); +#endif /* * For more detail see comments in function pgxc_lock_for_backup. 
* diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index f75f6331..c1770cd1 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -2710,16 +2710,6 @@ static struct config_bool ConfigureNamesBool[] = NULL, NULL, NULL }, { - {"is_forward_request", PGC_USERSET, CUSTOM_OPTIONS, - gettext_noop("Whether DDL is forwarded from another coordinator."), - NULL, - GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_AUTO_FILE | GUC_DISALLOW_IN_FILE | GUC_NO_SHOW_ALL - }, - &is_forward_request, - false, - NULL, NULL, NULL - }, - { {"enable_cold_hot_router_print", PGC_USERSET, CUSTOM_OPTIONS, gettext_noop("Whether print cold hot router."), NULL @@ -8252,9 +8242,10 @@ set_config_option(const char *name, const char *value, */ if ((source == PGC_S_SESSION || source == PGC_S_CLIENT) && (IS_PGXC_DATANODE || !IsConnFromCoord()) - && (strcmp(name,"remotetype") != 0 && strcmp(name,"parentnode") != 0 && - strcmp(name,"is_forward_request") != 0)) + && (strcmp(name,"remotetype") != 0 && strcmp(name,"parentnode") != 0)) + { send_to_nodes = true; + } #endif #ifdef PGXC diff --git a/src/include/pgxc/pgxc.h b/src/include/pgxc/pgxc.h index 9f3ed6f5..b69d747b 100644 --- a/src/include/pgxc/pgxc.h +++ b/src/include/pgxc/pgxc.h @@ -96,10 +96,6 @@ typedef enum /* Determine remote connection type for a PGXC backend */ extern int remoteConnType; -#ifdef __TBASE__ -/* Is request forwarded another coordinator */ -extern bool is_forward_request; -#endif /* Local node name and numer */ extern char *PGXCNodeName; @@ -127,10 +123,10 @@ extern Datum xc_lockForBackupKey2; #define PGXC_PARENT_NODE_TYPE parentPGXCNodeType #define REMOTE_CONN_TYPE remoteConnType -#define IsConnFromApp() (remoteConnType == REMOTE_CONN_APP || is_forward_request == true) -#define IsConnFromCoord() (remoteConnType == REMOTE_CONN_COORD && is_forward_request == false) -#define IsConnFromDatanode() (remoteConnType == REMOTE_CONN_DATANODE && is_forward_request == false) -#define IsConnFromGtm() (remoteConnType == REMOTE_CONN_GTM && is_forward == false) +#define IsConnFromApp() (remoteConnType == REMOTE_CONN_APP) +#define IsConnFromCoord() (remoteConnType == REMOTE_CONN_COORD) +#define IsConnFromDatanode() (remoteConnType == REMOTE_CONN_DATANODE) +#define IsConnFromGtm() (remoteConnType == REMOTE_CONN_GTM) #define IsConnFromGtmProxy() (remoteConnType == REMOTE_CONN_GTM_PROXY) /* key pair to be used as object id while using advisory lock for backup */ diff --git a/src/include/pgxc/pgxcnode.h b/src/include/pgxc/pgxcnode.h index ccb84026..4a2ee55b 100644 --- a/src/include/pgxc/pgxcnode.h +++ b/src/include/pgxc/pgxcnode.h @@ -143,8 +143,6 @@ typedef struct PGXCNodeHandle **coord_handles; /* an array of Coordinator handles */ } PGXCNodeAllHandles; -extern bool forward_mode; - extern void InitMultinodeExecutor(bool is_force); extern Oid get_nodeoid_from_nodeid(int nodeid, char node_type); diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out index b0e92c9f..2ab99d9d 100644 --- a/src/test/regress/expected/sysviews.out +++ b/src/test/regress/expected/sysviews.out @@ -78,6 +78,7 @@ select name, setting from pg_settings where name like 'enable%'; enable_bitmapscan | on enable_check_password | off enable_cls | on + enable_cold_hot_router_print | off enable_cold_hot_visible | off enable_cold_seperation | off enable_committs_print | off @@ -127,7 +128,7 @@ select name, setting from pg_settings where name like 'enable%'; enable_tidscan | on enable_transparent_crypt | on enable_user_authority_force_check | 
off -(56 rows) +(57 rows) -- Test that the pg_timezone_names and pg_timezone_abbrevs views are -- more-or-less working. We can't test their contents in any great detail From 3d9f209b428237edcbb4cb0526612fae68246046 Mon Sep 17 00:00:00 2001 From: yeyukui Date: Fri, 18 Dec 2020 10:24:00 +0800 Subject: [PATCH 094/578] fix security table coredump when running SAMPLE and VACUUM ANALYZE --- src/backend/commands/analyze.c | 10 ++++++-- src/backend/utils/adt/rowtypes.c | 32 ++++++++++++++++++++++++- src/backend/utils/misc/relcrypt.c | 2 +- src/test/regress/expected/mls_check.out | 10 ++++++++ src/test/regress/sql/mls_check.sql | 5 ++++ 5 files changed, 55 insertions(+), 4 deletions(-) diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c index 1b1db5a2..54a26e55 100644 --- a/src/backend/commands/analyze.c +++ b/src/backend/commands/analyze.c @@ -697,7 +697,7 @@ do_analyze_rel(Relation onerel, int options, VacuumParams *params, stats->tupDesc = onerel->rd_att; #ifdef _MLS_ /* has column crypt */ - if (stats->tupDesc->attrs_ext) + if (stats->tupDesc->attrs_ext && IS_PGXC_DATANODE) { TRANSP_CRYPT_ATTRS_EXT_ENABLE(stats->tupDesc); } @@ -707,7 +707,7 @@ do_analyze_rel(Relation onerel, int options, VacuumParams *params, numrows, totalrows); #ifdef _MLS_ - if (stats->tupDesc->attrs_ext) + if (stats->tupDesc->attrs_ext && IS_PGXC_DATANODE) { TRANSP_CRYPT_ATTRS_EXT_DISABLE(stats->tupDesc); } @@ -2610,9 +2610,15 @@ compute_scalar_stats(VacAttrStatsP stats, { if (0 != stats->tupDesc->transp_crypt[curr_attnum - 1].algo_id) { + if (stats->tupDesc->attrs_ext && IS_PGXC_DATANODE) + { TRANSP_CRYPT_ATTRS_EXT_ENABLE(stats->tupDesc); + } heap_deform_tuple(stats->rows[i], stats->tupDesc, tuple_values, tuple_isnull); + if (stats->tupDesc->attrs_ext && IS_PGXC_DATANODE) + { TRANSP_CRYPT_ATTRS_EXT_DISABLE(stats->tupDesc); + } if (tuple_isnull[curr_attnum - 1]) { diff --git a/src/backend/utils/adt/rowtypes.c b/src/backend/utils/adt/rowtypes.c index 7964c77f..3ba7b15c 100644 --- a/src/backend/utils/adt/rowtypes.c +++ b/src/backend/utils/adt/rowtypes.c @@ -25,6 +25,10 @@ #include "utils/builtins.h" #include "utils/lsyscache.h" #include "utils/typcache.h" +#include "utils/relcrypt.h" +#include "commands/relcryptcommand.h" +#include "utils/mls.h" +#include "utils/datamask.h" /* @@ -311,6 +315,8 @@ record_out(PG_FUNCTION_ARGS) Datum *values; bool *nulls; StringInfoData buf; + Oid parentOid = InvalidOid; + Form_pg_attribute *att = NULL; check_stack_depth(); /* recurses for record-type columns */ @@ -319,6 +325,7 @@ tupTypmod = HeapTupleHeaderGetTypMod(rec); tupdesc = lookup_rowtype_tupdesc(tupType, tupTypmod); ncolumns = tupdesc->natts; + att = tupdesc->attrs; /* Build a temporary HeapTuple control structure */ tuple.t_len = HeapTupleHeaderGetDatumLength(rec); @@ -360,8 +367,31 @@ values = (Datum *) palloc(ncolumns * sizeof(Datum)); nulls = (bool *) palloc(ncolumns * sizeof(bool)); - /* Break down the tuple into fields */ + /* Break down the tuple into fields; if the table uses column encryption, the + * data must be decrypted after deform_tuple on the datanode. + */ + if (IS_PGXC_DATANODE && tupdesc->attrs_ext) + { + transparent_crypt_decrypt_all_cols_value_copy(&tuple, tupdesc, values, nulls); + } + else + { heap_deform_tuple(&tuple, tupdesc, values, nulls); + } + + /* + * Check whether the table or a parent table has a datamask policy; if so, + * replace the values with masked data to avoid leaking data.
+ */ + if (tupdesc->natts > 0) + { + parentOid = mls_get_parent_oid_by_relid(att[0]->attrelid); + } + + if (OidIsValid(parentOid) && datamask_check_table_has_datamask(parentOid)) + { + datamask_exchange_all_cols_value_copy(tupdesc, values, nulls, parentOid); + } /* And build the result string */ initStringInfo(&buf); diff --git a/src/backend/utils/misc/relcrypt.c b/src/backend/utils/misc/relcrypt.c index 65f3b65e..b3d25625 100644 --- a/src/backend/utils/misc/relcrypt.c +++ b/src/backend/utils/misc/relcrypt.c @@ -1578,7 +1578,7 @@ bool trsprt_crypt_chk_tbl_has_col_crypt(Oid relid) while (HeapTupleIsValid(htup = systable_getnext(scan))) { - Form_pg_transparent_crypt_policy_map form = (Form_pg_transparent_crypt_policy_map)htup; + Form_pg_transparent_crypt_policy_map form = (Form_pg_transparent_crypt_policy_map) GETSTRUCT(htup); if (form->attnum > InvalidAttrNumber) { diff --git a/src/test/regress/expected/mls_check.out b/src/test/regress/expected/mls_check.out index 371bfe10..496d0b8c 100644 --- a/src/test/regress/expected/mls_check.out +++ b/src/test/regress/expected/mls_check.out @@ -1284,6 +1284,14 @@ select * from alter_order_range_201702 order by f1 asc; 2 | 9999 | 9999 | 9999 | XXXXe | XXXXe | XXXXoworld | XXXXe | 9999 | Tue May 05 05:05:05 2015 | 9999 | 9999 (1 row) +sample alter_order_range(3000); + samplenum | totalnum | deadnum | totalpages | visiblepages | rows +-----------+----------+---------+------------+--------------+---------------------------------------------------------------------------------------------------------- + 2 | 2 | 0 | 40 | 0 | + | | | | | (1,9999,9999,9999,XXXXe,"XXXXe ","XXXXoworld ",XXXXe,9999,"Tue May 05 05:05:05 2015",9999,9999) + | | | | | (2,9999,9999,9999,XXXXe,"XXXXe ","XXXXoworld ",XXXXe,9999,"Tue May 05 05:05:05 2015",9999,9999) +(3 rows) + alter table alter_order_range detach partition alter_order_range_201701; ERROR: could not detach partition for table:alter_order_range, cause mls poilcy is bound \c - mls_admin @@ -2509,6 +2517,8 @@ select * from tbl_mls_test where f1 = 1024 and f2 >= '2018-05-01' and f2 < '2018 1024 | Tue May 01 00:00:00 2018 | 1024 (1 row) +-- test vacuum analyze +vacuum analyze tbl_mls_test; --case: orignal partition, interval partition with index \c - godlike create table tbl_mls_part_list( a int ,b int ) PARTITION BY LIST (b) ; diff --git a/src/test/regress/sql/mls_check.sql b/src/test/regress/sql/mls_check.sql index 4369a706..83e4027c 100644 --- a/src/test/regress/sql/mls_check.sql +++ b/src/test/regress/sql/mls_check.sql @@ -478,6 +478,8 @@ select * from alter_order_range order by f1 asc; select * from alter_order_range_201701 order by f1 asc; select * from alter_order_range_201702 order by f1 asc; +sample alter_order_range(3000); + alter table alter_order_range detach partition alter_order_range_201701; \c - mls_admin @@ -848,6 +850,9 @@ checkpoint; --explain select * from tbl_mls_test where f1 = 1024 and f2 >= '2018-05-01' and f2 < '2018-06-01' order by f1 limit 10 ; select * from tbl_mls_test where f1 = 1024 and f2 >= '2018-05-01' and f2 < '2018-06-01' order by f1 limit 10 ; +-- test vacuum analyze +vacuum analyze tbl_mls_test; + --case: orignal partition, interval partition with index \c - godlike create table tbl_mls_part_list( a int ,b int ) PARTITION BY LIST (b) ; From ce318d0355206873dfb62d73ba0f57cb2db6ceef Mon Sep 17 00:00:00 2001 From: sigmalin Date: Fri, 18 Dec 2020 10:49:57 +0800 Subject: [PATCH 095/578] fix nestloop bug --- src/backend/tcop/pquery.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff 
--git a/src/backend/tcop/pquery.c b/src/backend/tcop/pquery.c index 13f56acc..5cc3042c 100644 --- a/src/backend/tcop/pquery.c +++ b/src/backend/tcop/pquery.c @@ -703,8 +703,7 @@ PortalStart(Portal portal, ParamListInfo params, */ #ifdef __TBASE__ if (!paramPassDown && queryDesc->plannedstmt->nParamRemote > 0 && - queryDesc->plannedstmt->remoteparams[queryDesc->plannedstmt->nParamRemote-1].paramkind == PARAM_EXEC && - queryDesc->plannedstmt->distributionType != LOCATOR_TYPE_SHARD) + queryDesc->plannedstmt->remoteparams[queryDesc->plannedstmt->nParamRemote-1].paramkind == PARAM_EXEC) #else if (queryDesc->plannedstmt->nParamRemote > 0 && queryDesc->plannedstmt->remoteparams[queryDesc->plannedstmt->nParamRemote-1].paramkind == PARAM_EXEC) From 38aaa5469d66712cf00fc842a9cd71625792d30e Mon Sep 17 00:00:00 2001 From: whalesong Date: Fri, 29 Jan 2021 10:43:35 +0800 Subject: [PATCH 096/578] Bugfix: prepared statement does not exist when dn restart, ID84661745 (merge request !123) (cherry picked from commit e43101af) 7d7edd09 Bugfix: prepared statement does not exist when dn restart, ID84661745, code optimize 3 098b765e Bugfix: prepared statement does not exist when dn restart, ID84661745, code optimize 2 37164a3a Bugfix: prepared statement does not exist when dn restart, ID84661745, code optimize 709950c1 Bugfix: prepared statement does not exist when dn restart, ID84661745 --- src/backend/commands/prepare.c | 48 ++++++++++++++++++++++++++++++-- src/backend/pgxc/pool/pgxcnode.c | 24 ++++++++++++++++ src/include/commands/prepare.h | 3 +- 3 files changed, 71 insertions(+), 4 deletions(-) diff --git a/src/backend/commands/prepare.c b/src/backend/commands/prepare.c index 1aa469d6..d4729433 100644 --- a/src/backend/commands/prepare.c +++ b/src/backend/commands/prepare.c @@ -1089,7 +1089,7 @@ HaveActiveDatanodeStatements(void) * prepared on the node */ bool -ActivateDatanodeStatementOnNode(const char *stmt_name, int noid) +ActivateDatanodeStatementOnNode(const char *stmt_name, int nodeidx) { DatanodeStatement *entry; int i; @@ -1099,13 +1099,55 @@ ActivateDatanodeStatementOnNode(const char *stmt_name, int noid) /* see if statement already active on the node */ for (i = 0; i < entry->number_of_nodes; i++) - if (entry->dns_node_indices[i] == noid) + if (entry->dns_node_indices[i] == nodeidx) return true; /* statement is not active on the specified node append item to the list */ - entry->dns_node_indices[entry->number_of_nodes++] = noid; + entry->dns_node_indices[entry->number_of_nodes++] = nodeidx; return false; } + + +/* + * Mark datanode statement as inactive on specified node + */ +void +InactivateDatanodeStatementOnNode(int nodeidx) +{ + HASH_SEQ_STATUS seq; + DatanodeStatement *entry; + int i; + + /* nothing cached */ + if (!datanode_queries) + return; + + /* walk over cache */ + hash_seq_init(&seq, datanode_queries); + while ((entry = hash_seq_search(&seq)) != NULL) + { + /* see if statement already active on the node */ + for (i = 0; i < entry->number_of_nodes; i++) + { + if (entry->dns_node_indices[i] == nodeidx) + { + elog(DEBUG5, "InactivateDatanodeStatementOnNode: node index %d, " + "number_of_nodes %d, statement name %s", nodeidx, + entry->number_of_nodes, entry->stmt_name); + + /* remove nodeidx from list */ + entry->number_of_nodes--; + if (i < entry->number_of_nodes) + { + entry->dns_node_indices[i] = + entry->dns_node_indices[entry->number_of_nodes]; + } + break; + } + } + } +} + #endif #ifdef __TBASE__ /* prepare remoteDML statement on coordinator */ diff --git 
a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c index 60496f76..9b5f7a84 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -3855,6 +3855,30 @@ get_handles(List *datanodelist, List *coordlist, bool is_coord_only_query, bool is_global_session ? 'T' : 'F'); #endif + if (IS_PGXC_COORDINATOR) + { + char nodetype = PGXC_NODE_DATANODE; + int nodeidx = PGXCNodeGetNodeId(node_handle->nodeoid, &nodetype); + if (PGXC_NODE_DATANODE != nodetype) + { + elog(ERROR, "Unexpected node type %c, name %s, index %d, " + "oid %d, max nodes %d", nodetype, + node_handle->nodename, nodeidx, + node_handle->nodeoid, NumDataNodes); + } + if (nodeidx < 0 || nodeidx >= NumDataNodes) + { + elog(ERROR, "Invalid datanode index %d, name %s, oid %d, " + "type %c, max nodes %d", nodeidx, + node_handle->nodename, node_handle->nodeoid, + nodetype, NumDataNodes); + } + + InactivateDatanodeStatementOnNode(nodeidx); + elog(DEBUG5, "Inactivate statement on datanode %s, nodeidx %d, " + "oid %d, type %c, max nodes %d", node_handle->nodename, + nodeidx, node_handle->nodeoid, nodetype, NumDataNodes); + } } } /* Initialisation for Coordinators */ diff --git a/src/include/commands/prepare.h b/src/include/commands/prepare.h index e05003a0..57a72d94 100644 --- a/src/include/commands/prepare.h +++ b/src/include/commands/prepare.h @@ -132,9 +132,10 @@ extern void DropAllPreparedStatements(void); #ifdef PGXC extern DatanodeStatement *FetchDatanodeStatement(const char *stmt_name, bool throwError); -extern bool ActivateDatanodeStatementOnNode(const char *stmt_name, int noid); +extern bool ActivateDatanodeStatementOnNode(const char *stmt_name, int nodeidx); extern bool HaveActiveDatanodeStatements(void); extern void DropDatanodeStatement(const char *stmt_name); +extern void InactivateDatanodeStatementOnNode(int nodeidx); extern int SetRemoteStatementName(Plan *plan, const char *stmt_name, int num_params, Oid *param_types, int n); #endif From 98dd71078ce41256f43be60aca3f576cabc0bf38 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Fri, 29 Jan 2021 14:54:39 +0800 Subject: [PATCH 097/578] fix http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131084977249 (merge request !126) --- src/backend/pgxc/pool/pgxcnode.c | 1 - 1 file changed, 1 deletion(-) diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c index 9b5f7a84..3aac2510 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -822,7 +822,6 @@ pgxc_node_receive(const int conn_count, } retry: - CHECK_FOR_INTERRUPTS(); poll_val = poll(pool_fd, conn_count, timeout_ms); if (poll_val < 0) { From bec7ae72747858acdd3c90fa0f8b8216f9e2d6ca Mon Sep 17 00:00:00 2001 From: sigmalin Date: Wed, 27 Jan 2021 15:21:18 +0800 Subject: [PATCH 098/578] fix latch already owned caused by memory problem (merge request !118) --- src/backend/pgxc/squeue/squeue.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/backend/pgxc/squeue/squeue.c b/src/backend/pgxc/squeue/squeue.c index b1440b60..e006d2c8 100644 --- a/src/backend/pgxc/squeue/squeue.c +++ b/src/backend/pgxc/squeue/squeue.c @@ -839,7 +839,9 @@ SharedQueueAcquire(const char *sqname, int ncons) int qsize; /* Size of one queue */ int i; char *heapPtr; - +#ifdef __TBASE__ + SQueueSync *sqsync = NULL; +#endif elog(DEBUG1, "Create a new SQueue %s and format it for %d consumers", sqname, ncons); /* Initialize the shared queue */ @@ -899,6 +901,13 @@ SharedQueueAcquire(const char *sqname, int ncons) heapPtr = (char *) sq; /* 
Skip header */ heapPtr += SQUEUE_HDR_SIZE(sq->sq_nconsumers); + +#ifdef __TBASE__ + /* Init latch */ + sqsync = sq->sq_sync; + InitSharedLatch(&sqsync->sqs_producer_latch); +#endif + /* Set up consumer queues */ for (i = 0; i < sq->sq_nconsumers; i++) { @@ -915,6 +924,7 @@ SharedQueueAcquire(const char *sqname, int ncons) #ifdef __TBASE__ cstate->send_fd = false; cstate->cs_done = false; + InitSharedLatch(&sqsync->sqs_consumer_sync[i].cs_latch); #endif heapPtr += qsize; } From e111c5410f6e88286a1d4cf2dd18d371d9b410ed Mon Sep 17 00:00:00 2001 From: jackywpxie Date: Wed, 20 Jan 2021 19:20:08 +0800 Subject: [PATCH 099/578] fixed bug: coredump while executing lots of SELECT PGXC_POOL_RELOAD() (merge request !105) Squash merge branch 'jacky/bugfix/coredump_Tbase_v5.05.3' into 'Tbase_v5.05.3' * fixed bug: coredump while executing lots of SELECT PGXC_POOL_RELOAD() (cherry picked from commit 0b4d42f8) 9715c00f fixed bug: coredump while executing lots of SELECT PGXC_POOL_RELOAD() http://tapd.oa.com/20421696/bugtrace/bugs/view?bug_id=1020421696084789579&url_cache_key=e5779d19ee5ceffc54b891e3b94140f4 --- src/backend/pgxc/pool/pgxcnode.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c index 3aac2510..12873dac 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -4813,11 +4813,15 @@ DoInvalidateRemoteHandles(void) { bool result = false; + HOLD_INTERRUPTS(); + InitMultinodeExecutor(true); HandlesInvalidatePending = false; HandlesRefreshPending = false; + RESUME_INTERRUPTS(); + return result; } @@ -4832,6 +4836,8 @@ DoRefreshRemoteHandles(void) int numCoords, numDNodes, numSlaveDNodes, total_nodes; bool res = true; + HOLD_INTERRUPTS(); + HandlesRefreshPending = false; PgxcNodeGetOidsExtend(&coOids, &dnOids, &sdnOids,&numCoords, &numDNodes, &numSlaveDNodes, false); @@ -4982,6 +4988,8 @@ DoRefreshRemoteHandles(void) list_free(added); list_free(deleted); + RESUME_INTERRUPTS(); + return res; } From 27e1e023dca671bd0e74c928ba17880c60852f4f Mon Sep 17 00:00:00 2001 From: jackywpxie Date: Mon, 25 Jan 2021 17:21:53 +0800 Subject: [PATCH 100/578] =?UTF-8?q?:=E4=BF=AE=E8=A1=A5PROCSIG=5FPGXCPOOL?= =?UTF-8?q?=5FRELOAD=E4=BF=A1=E5=8F=B7=E5=B1=8F=E8=94=BD=E6=BC=8F=E6=B4=9E?= =?UTF-8?q?=20(merge=20request=20!116)=20(merge=20request=20!149)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Squash merge branch 'jacky/bugfix/PoolerReload_TBase_V2.15.16.9' into 'TBase_V2.15.16.9' * :修补PROCSIG_PGXCPOOL_RELOAD信号屏蔽漏洞 (merge request !116) --- src/backend/pgxc/pool/poolutils.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/backend/pgxc/pool/poolutils.c b/src/backend/pgxc/pool/poolutils.c index c61379ac..b03a4e2b 100644 --- a/src/backend/pgxc/pool/poolutils.c +++ b/src/backend/pgxc/pool/poolutils.c @@ -415,6 +415,9 @@ HandlePoolerReload(void) if (proc_exit_inprogress) return; + if (InterruptHoldoffCount != 0) + return; + #ifdef __TBASE__ if (PoolerReloadHoldoffCount) { @@ -430,6 +433,16 @@ HandlePoolerReload(void) PoolerReloadPending = false; #endif + HOLD_INTERRUPTS(); + + /* + * Reinitialize session, it updates the shared memory table. + * Initialize XL executor. This must be done inside a transaction block. 
+ */ + StartTransactionCommand(); + InitMultinodeExecutor(true); + CommitTransactionCommand(); + /* Request query cancel, when convenient */ InterruptPending = true; QueryCancelPending = true; @@ -439,6 +452,8 @@ HandlePoolerReload(void) /* Prevent using of cached connections to remote nodes */ RequestInvalidateRemoteHandles(); + + RESUME_INTERRUPTS(); } /* From e31babde1419022c884591477911dcee669f2269 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Wed, 27 Jan 2021 16:32:30 +0800 Subject: [PATCH 101/578] fix http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131084803305 (merge request !98) --- src/backend/executor/execProcnode.c | 67 +++++++++++++++++++++++++++++ src/backend/executor/nodeMaterial.c | 13 ++++++ src/backend/executor/nodeNestloop.c | 4 ++ src/backend/pgxc/pool/execRemote.c | 4 ++ src/include/executor/executor.h | 3 ++ 5 files changed, 91 insertions(+) diff --git a/src/backend/executor/execProcnode.c b/src/backend/executor/execProcnode.c index bdf04a2a..0119064b 100644 --- a/src/backend/executor/execProcnode.c +++ b/src/backend/executor/execProcnode.c @@ -927,6 +927,73 @@ ExecDisconnectNode(PlanState *node) ExecDisconnectNode(ps->righttree); } + +bool +HasDisconnectNode(PlanState *node) +{ + PlanState *ps = node; + RemoteSubplanState *remotesubplan = NULL; + + if (!node) + return false; + + if (IsA(node, SubqueryScanState)) + { + SubqueryScanState *substate = (SubqueryScanState *)node; + ps = substate->subplan; + } + + switch (nodeTag(ps)) + { + case T_RemoteSubplanState: + { + remotesubplan = (RemoteSubplanState *) ps; + if (remotesubplan->eflags & EXEC_FLAG_DISCONN) + { + return true; + } + return false; + } + + case T_AppendState: + { + AppendState *append = (AppendState *) ps; + int i; + + for (i = 0; i < append->as_nplans; i++) + { + if (HasDisconnectNode(append->appendplans[i])) + { + return true; + } + } + + return false; + } + + case T_MergeAppendState: + { + MergeAppendState *mstate = (MergeAppendState *) ps; + int i; + + for (i = 0; i < mstate->ms_nplans; i++) + { + if (HasDisconnectNode(mstate->mergeplans[i])) + { + return true; + } + } + + return false; + } + + default: + break; + } + + return HasDisconnectNode(ps->lefttree) || HasDisconnectNode(ps->righttree); +} + void ExecFinishNode(PlanState *node) {// #lizard forgives diff --git a/src/backend/executor/nodeMaterial.c b/src/backend/executor/nodeMaterial.c index d96a6d4a..fc02b41a 100644 --- a/src/backend/executor/nodeMaterial.c +++ b/src/backend/executor/nodeMaterial.c @@ -327,6 +327,19 @@ ExecReScanMaterial(MaterialState *node) if (node->eflags != 0) { +#ifdef __TBASE__ + /* + * If we haven't materialized yet, but some nodes have done disconnect, + * maybe this node needs to be executed when the material is executed, + * so re-scan here + */ + if ((NULL == node->tuplestorestate) && HasDisconnectNode(outerPlan)) + { + ExecReScan(outerPlan); + node->eof_underlying = false; + return; + } +#endif /* * If we haven't materialized yet, just return. 
If outerplan's * chgParam is not NULL then it will be re-scanned by ExecProcNode, diff --git a/src/backend/executor/nodeNestloop.c b/src/backend/executor/nodeNestloop.c index d277cb19..e9e95c2d 100644 --- a/src/backend/executor/nodeNestloop.c +++ b/src/backend/executor/nodeNestloop.c @@ -116,6 +116,10 @@ ExecNestLoop(PlanState *pstate) #ifdef __TBASE__ if (!node->nl_InnerInited && IS_PGXC_DATANODE) { + /* + * Perform disconnection to make the redistribution on other nodes end normally, + * otherwise need to wait for a timeout + */ ExecDisconnectNode(innerPlan); } #endif diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index df03eb1b..48778128 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -11030,6 +11030,9 @@ ExecReScanRemoteSubplan(RemoteSubplanState *node) * Force query is re-bound with new parameters */ node->bound = false; +#ifdef __TBASE__ + node->eflags &= ~(EXEC_FLAG_DISCONN); +#endif } #ifdef __TBASE__ @@ -11168,6 +11171,7 @@ ExecDisconnectRemoteSubplan(RemoteSubplanState *node) } node->bound = true; + node->eflags |= EXEC_FLAG_DISCONN; connections = (PGXCNodeHandle **)palloc(combiner->conn_count * sizeof(PGXCNodeHandle *)); diff --git a/src/include/executor/executor.h b/src/include/executor/executor.h index 3ec9ef06..5262c42e 100644 --- a/src/include/executor/executor.h +++ b/src/include/executor/executor.h @@ -67,6 +67,7 @@ #ifdef XCP /* distributed executor may never execute the plan on this node */ #define EXEC_FLAG_SUBPLAN 0x0100 +#define EXEC_FLAG_DISCONN 0x1000 #endif #ifdef __TBASE__ @@ -526,6 +527,8 @@ extern Relation ExecOpenScanRelation(EState *estate, Index scanrelid, int eflags #ifdef __TBASE__ extern Relation ExecOpenScanRelationPartition(EState *estate, Index scanrelid, int eflags, int partidx); + +extern bool HasDisconnectNode(PlanState *node); #endif extern void ExecCloseScanRelation(Relation scanrel); From b37d77c69f80aefb8cf223c18c6eaa75135c38a8 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Mon, 22 Feb 2021 19:50:20 +0800 Subject: [PATCH 102/578] cherry-pick from f21bd27 fix RemoteSubplanMakeUnique for latch already owned error --- src/backend/pgxc/pool/execRemote.c | 7 ++++++- src/backend/storage/ipc/latch.c | 2 +- src/backend/tcop/pquery.c | 3 +++ 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 48778128..333838a3 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -64,6 +64,7 @@ #include "postmaster/postmaster.h" #include "executor/nodeModifyTable.h" #include "utils/syscache.h" +#include "nodes/print.h" #endif /* * We do not want it too long, when query is terminating abnormally we just @@ -9503,7 +9504,8 @@ RemoteSubplanMakeUnique(Node *plan, int unique) */ if (IsA(plan, RemoteSubplan)) { - ((RemoteSubplan *)plan)->unique = unique; + int old = ((RemoteSubplan *)plan)->unique; + ((RemoteSubplan *)plan)->unique = old * MAX_NODES_NUMBER + unique; } /* Otherwise it is a Plan descendant */ RemoteSubplanMakeUnique((Node *) ((Plan *) plan)->lefttree, unique); @@ -10055,6 +10057,8 @@ ExecInitRemoteSubplan(RemoteSubplan *node, EState *estate, int eflags) * unique. 
*/ RemoteSubplanMakeUnique((Node *) outerPlan(node), PGXCNodeId); + elog(DEBUG3, "RemoteSubplanMakeUnique for LOCATOR_TYPE_NONE unique: %d, cursor: %s", + PGXCNodeId, node->cursor); } rstmt.planTree = outerPlan(node); /* @@ -10255,6 +10259,7 @@ ExecInitRemoteSubplan(RemoteSubplan *node, EState *estate, int eflags) #ifdef __AUDIT__ rstmt.queryString = NULL; rstmt.parseTree = NULL; + elog_node_display(DEBUG5, "SendPlanMessage", &rstmt, Debug_pretty_print); #endif } PG_CATCH(); diff --git a/src/backend/storage/ipc/latch.c b/src/backend/storage/ipc/latch.c index 55891078..f8fb52da 100644 --- a/src/backend/storage/ipc/latch.c +++ b/src/backend/storage/ipc/latch.c @@ -296,7 +296,7 @@ OwnLatch(volatile Latch *latch) #endif if (latch->owner_pid != 0) - elog(ERROR, "latch already owned"); + elog(ERROR, "latch already owned by %d", latch->owner_pid); latch->owner_pid = MyProcPid; } diff --git a/src/backend/tcop/pquery.c b/src/backend/tcop/pquery.c index 5cc3042c..3f2e7d62 100644 --- a/src/backend/tcop/pquery.c +++ b/src/backend/tcop/pquery.c @@ -770,6 +770,9 @@ PortalStart(Portal portal, ParamListInfo params, RemoteSubplanMakeUnique( (Node *) queryDesc->plannedstmt->planTree, PGXC_PARENT_NODE_ID); + + elog(DEBUG3, "RemoteSubplanMakeUnique for PARAM_EXEC unique: %d, portal: %s", + PGXC_PARENT_NODE_ID, portal->name); /* * Call ExecutorStart to prepare the plan for execution */ From f304b0ec9b3d5aa870e539d7d9f1b21bdd34f22e Mon Sep 17 00:00:00 2001 From: sigmalin Date: Wed, 13 Jan 2021 10:27:41 +0800 Subject: [PATCH 103/578] fix http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131084406117 (merge request !91) --- src/backend/pgxc/pool/execRemote.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 333838a3..cff54d43 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -11163,6 +11163,8 @@ ExecDisconnectRemoteSubplan(RemoteSubplanState *node) if (conn) { + CHECK_OWNERSHIP(conn, combiner); + if (pgxc_node_send_disconnect(conn, cursor, list_length(plan->distributionRestrict)) != 0) ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), From dd4e99e103f2c97fe74fdc3db870c2d7a90b11b6 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Thu, 4 Feb 2021 20:32:14 +0800 Subject: [PATCH 104/578] fix sequence bug (merge request !147) --- src/backend/access/transam/xact.c | 2 + src/backend/commands/sequence.c | 2 +- src/backend/commands/tablecmds.c | 174 +++++++++++++++- src/backend/tcop/utility.c | 33 +++ src/include/commands/tablecmds.h | 45 ++-- src/include/pg_config_manual.h | 3 + src/test/regress/expected/create_index.out | 230 +++++++++++---------- 7 files changed, 358 insertions(+), 131 deletions(-) diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index 5369958d..9632a415 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -6247,6 +6247,8 @@ AbortSubTransaction(void) { CheckGTMConnection(); } + + FinishSeqOp(false); } #endif diff --git a/src/backend/commands/sequence.c b/src/backend/commands/sequence.c index 5b6fd741..2557db35 100644 --- a/src/backend/commands/sequence.c +++ b/src/backend/commands/sequence.c @@ -375,7 +375,7 @@ DefineSequence(ParseState *pstate, CreateSeqStmt *seq) { ereport(ERROR, (errcode(ERRCODE_CONNECTION_FAILURE), - errmsg("GTM error, could not create sequence"))); + errmsg("GTM error, could not create sequence %s", seqname))); } #ifdef __TBASE__ diff --git a/src/backend/commands/tablecmds.c 
b/src/backend/commands/tablecmds.c index 9e3b3f14..6915458f 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -132,6 +132,11 @@ #include "pgxc/shardmap.h" #include "pgxc/groupmgr.h" #endif + +#ifdef __TBASE__ +#include "parser/scansup.h" +#endif + /* * ON COMMIT action list */ @@ -1220,19 +1225,170 @@ DropErrorMsgWrongType(const char *relname, char wrongkind, char rightkind) (wentry->kind != '\0') ? errhint("%s", _(wentry->drophint_msg)) : 0)); } +#ifdef __TBASE__ + +/* + * replace all invisible characters with ' ', + * leave no spaces next to ',' or '.' + */ +static void +OmitqueryStringSpace(char *queryString) +{ + char *front = queryString; + char *last = queryString; + bool skip = false; + + if (queryString == NULL) + { + return; + } + + /* omit space */ + while (scanner_isspace(*front)) + { + ++front; + } + + while ((*front) != '\0') + { + if(scanner_isspace(*front) && skip == false) + { + while(scanner_isspace(*front)) + { + ++front; + } + + if ((*front) == ',' || (*front) == '.') + { + /* no need space */ + } + else if (last != queryString && (*(last - 1) == ',' || *(last - 1) == '.')) + { + /* no need space */ + } + else + { + /* replace all invisible characters with ' ' */ + *last = ' '; + ++last; + continue; + } + } + + if ((*front) == '\"') + { + skip = (skip == true) ? false : true; + *last = *front; + ++front; + } + else + { + *last = *front; + ++front; + } + ++last; + } + *last = '\0'; +} + +/* + * remove relname in query string (replace with ' ') + */ +static void +RemoveRelnameInQueryString(char *queryString, RangeVar *rel) +{ + char *ptr = NULL; + char *tmp = NULL; + char *tmpStr = NULL; + char *start_ptr = queryString; + char *end_ptr = queryString + strlen(queryString) - 1; + int len = 0; + char full_name[MAXFULLNAMEDATALEN]; + + /* get remove obj full name */ + snprintf(full_name, MAXFULLNAMEDATALEN, "%s%s%s%s%s", (rel->catalogname) ? (rel->catalogname) : "", + (rel->catalogname) ? "." : "", + (rel->schemaname) ? (rel->schemaname) : "", + (rel->schemaname) ? "." 
: "", + rel->relname); + tmpStr = queryString; + len = strlen(full_name); + while ((ptr = strstr(tmpStr, full_name)) != NULL) + { + /* is not independent string, skip */ + if (((ptr - 1) >= start_ptr && *(ptr - 1) != ' ' && (*(ptr - 1) != ',')) || + ((ptr + len) <= end_ptr && *(ptr + len) != ' ' && *(ptr + len) != ',' && *(ptr + len) != ';')) + { + if (((ptr - 1) >= start_ptr && *(ptr - 1) == '\"' && (ptr + len) <= end_ptr && *(ptr + len) == '\"') && + ((ptr - 2) < start_ptr || *(ptr - 2) != '.')) + { + *(ptr - 1) = ' '; + *(ptr + len) = ' '; + } + else + { + tmpStr = ptr + len; + continue; + } + } + + /* replace obj name with ' ' */ + MemSet(ptr, ' ', len); + + /* find the previous ',' */ + tmp = ptr - 1; + while (tmp >= start_ptr && *tmp == ' ') + { + tmp--; + } + + if (tmp >= start_ptr && *tmp == ',') + { + *tmp = ' '; + } + else + { + /* find the following ',' */ + tmp = ptr + len; + while (tmp <= end_ptr && *tmp == ' ') + { + tmp++; + } + + if (tmp <= end_ptr && *tmp == ',') + { + *tmp = ' '; + } + } + + tmpStr = ptr + len; + } +} + +#endif + /* * RemoveRelations * Implements DROP TABLE, DROP INDEX, DROP SEQUENCE, DROP VIEW, * DROP MATERIALIZED VIEW, DROP FOREIGN TABLE */ +#ifdef __TBASE__ +int +RemoveRelations(DropStmt *drop, char* queryString) +#else void RemoveRelations(DropStmt *drop) -{// #lizard forgives +#endif +{ ObjectAddresses *objects; char relkind; ListCell *cell; int flags = 0; LOCKMODE lockmode = AccessExclusiveLock; +#ifdef __TBASE__ + bool querystring_omit = false; + int drop_cnt = 0; +#endif /* DROP CONCURRENTLY uses a weaker lock, and has some restrictions */ if (drop->concurrent) @@ -1328,6 +1484,15 @@ RemoveRelations(DropStmt *drop) if (!OidIsValid(relOid)) { DropErrorMsgNonExistent(rel, relkind, drop->missing_ok); +#ifdef __TBASE__ + if (!querystring_omit) + { + OmitqueryStringSpace(queryString); + querystring_omit = true; + } + + RemoveRelnameInQueryString(queryString, rel); +#endif continue; } @@ -1374,11 +1539,18 @@ RemoveRelations(DropStmt *drop) obj.objectSubId = 0; add_exact_object_address(&obj, objects); +#ifdef __TBASE__ + drop_cnt++; +#endif } performMultipleDeletions(objects, drop->behavior, flags); free_object_addresses(objects); + +#ifdef __TBASE__ + return drop_cnt; +#endif } /* diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c index 5ed5bfcc..c99d090e 100644 --- a/src/backend/tcop/utility.c +++ b/src/backend/tcop/utility.c @@ -3127,6 +3127,14 @@ ProcessUtilitySlow(ParseState *pstate, stmts = transformCreateStmt((CreateStmt *) parsetree, queryString, !is_local && !sentToRemote); +#ifdef __TBASE__ + if (NULL == stmts) + { + commandCollected = true; + break; + } +#endif + if (IS_PGXC_LOCAL_COORDINATOR) { /* @@ -4401,18 +4409,43 @@ ExecDropStmt(DropStmt *stmt, bool isTopLevel) #ifdef PGXC { bool is_temp = false; +#ifdef __TBASE__ + int drop_cnt = 0; + char *new_query_string = pstrdup(queryString); +#endif RemoteQueryExecType exec_type = EXEC_ON_ALL_NODES; /* Check restrictions on objects dropped */ DropStmtPreTreatment((DropStmt *) stmt, queryString, sentToRemote, &is_temp, &exec_type); #endif + +#ifdef __TBASE__ + drop_cnt = RemoveRelations(stmt, new_query_string); +#else RemoveRelations(stmt); +#endif + #ifdef PGXC +#ifdef __TBASE__ + /* if drop nothing, skip */ + if (drop_cnt == 0) + { + pfree(new_query_string); + break; + } + + /* DROP is done depending on the object type and its temporary type */ + if (IS_PGXC_LOCAL_COORDINATOR) + ExecUtilityStmtOnNodes(NULL, new_query_string, NULL, sentToRemote, false, + exec_type, is_temp, 
false); + pfree(new_query_string); +#else /* DROP is done depending on the object type and its temporary type */ if (IS_PGXC_LOCAL_COORDINATOR) ExecUtilityStmtOnNodes(NULL, queryString, NULL, sentToRemote, false, exec_type, is_temp, false); +#endif } #endif break; diff --git a/src/include/commands/tablecmds.h b/src/include/commands/tablecmds.h index 702b1b80..e1a3252c 100644 --- a/src/include/commands/tablecmds.h +++ b/src/include/commands/tablecmds.h @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * * tablecmds.h - * prototypes for tablecmds.c. + * prototypes for tablecmds.c. * * * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group @@ -23,11 +23,14 @@ extern ObjectAddress DefineRelation(CreateStmt *stmt, char relkind, Oid ownerId, - ObjectAddress *typaddress, const char *queryString); - + ObjectAddress *typaddress, const char *queryString); +#ifdef __TBASE__ +extern int RemoveRelations(DropStmt *drop, char* queryString); +#else extern void RemoveRelations(DropStmt *drop); +#endif -extern Oid AlterTableLookupRelation(AlterTableStmt *stmt, LOCKMODE lockmode); +extern Oid AlterTableLookupRelation(AlterTableStmt *stmt, LOCKMODE lockmode); extern void AlterTable(Oid relid, LOCKMODE lockmode, AlterTableStmt *stmt); @@ -37,22 +40,22 @@ extern void ATExecChangeOwner(Oid relationOid, Oid newOwnerId, bool recursing, L extern void AlterTableInternal(Oid relid, List *cmds, bool recurse); -extern Oid AlterTableMoveAll(AlterTableMoveAllStmt *stmt); +extern Oid AlterTableMoveAll(AlterTableMoveAllStmt *stmt); extern ObjectAddress AlterTableNamespace(AlterObjectSchemaStmt *stmt, - Oid *oldschema); + Oid *oldschema); extern void AlterTableNamespaceInternal(Relation rel, Oid oldNspOid, - Oid nspOid, ObjectAddresses *objsMoved + Oid nspOid, ObjectAddresses *objsMoved #ifdef _MLS_ , const char * newschemaname #endif - ); + ); extern void AlterRelationNamespaceInternal(Relation classRel, Oid relOid, - Oid oldNspOid, Oid newNspOid, - bool hasDependEntry, - ObjectAddresses *objsMoved); + Oid oldNspOid, Oid newNspOid, + bool hasDependEntry, + ObjectAddresses *objsMoved); extern void CheckTableNotInUse(Relation rel, const char *stmt); @@ -69,11 +72,11 @@ extern ObjectAddress RenameConstraint(RenameStmt *stmt); extern ObjectAddress RenameRelation(RenameStmt *stmt); extern void RenameRelationInternal(Oid myrelid, - const char *newrelname, bool is_internal); + const char *newrelname, bool is_internal); extern void find_composite_type_dependencies(Oid typeOid, - Relation origRelation, - const char *origTypeName); + Relation origRelation, + const char *origTypeName); extern void check_of_type(HeapTuple typetuple); @@ -83,23 +86,23 @@ extern void remove_on_commit_action(Oid relid); extern void PreCommit_on_commit_actions(void); extern void AtEOXact_on_commit_actions(bool isCommit); extern void AtEOSubXact_on_commit_actions(bool isCommit, - SubTransactionId mySubid, - SubTransactionId parentSubid); + SubTransactionId mySubid, + SubTransactionId parentSubid); #ifdef PGXC extern bool IsTempTable(Oid relid); extern bool IsLocalTempTable(Oid relid); extern bool IsIndexUsingTempTable(Oid relid); extern bool IsOnCommitActions(void); extern void DropTableThrowErrorExternal(RangeVar *relation, - ObjectType removeType, - bool missing_ok); + ObjectType removeType, + bool missing_ok); #endif extern void RangeVarCallbackOwnsTable(const RangeVar *relation, - Oid relId, Oid oldRelId, void *arg); + Oid relId, Oid oldRelId, void *arg); extern void 
RangeVarCallbackOwnsRelation(const RangeVar *relation, - Oid relId, Oid oldRelId, void *noCatalogs); + Oid relId, Oid oldRelId, void *noCatalogs); #ifdef _MIGRATE_ extern bool oidarray_contian_oid(Oid *old_oids, int old_num, Oid new_oid); @@ -114,4 +117,4 @@ extern void StoreIntervalPartitionInfo(Oid relationId, char partkind, Oid parent extern void ExecCheckOverLapStmt(CheckOverLapStmt *stmt); #endif -#endif /* TABLECMDS_H */ +#endif /* TABLECMDS_H */ diff --git a/src/include/pg_config_manual.h b/src/include/pg_config_manual.h index b53f9d0d..8d110b88 100644 --- a/src/include/pg_config_manual.h +++ b/src/include/pg_config_manual.h @@ -82,6 +82,9 @@ */ #define NAMEDATALEN 64 + +#define MAXFULLNAMEDATALEN (NAMEDATALEN * 3 + 2) + /* * Maximum number of arguments to a function. * diff --git a/src/test/regress/expected/create_index.out b/src/test/regress/expected/create_index.out index ef804438..924c7c95 100644 --- a/src/test/regress/expected/create_index.out +++ b/src/test/regress/expected/create_index.out @@ -505,13 +505,14 @@ SELECT * FROM circle_tbl WHERE f1 && circle(point(1,-2), 1) EXPLAIN (COSTS OFF, NODES OFF) SELECT count(*) FROM gpolygon_tbl WHERE f1 && '(1000,1000,0,0)'::polygon; - QUERY PLAN ------------------------------------------------------------------- - Aggregate + QUERY PLAN +------------------------------------------------------------------------ + Finalize Aggregate -> Remote Subquery Scan on all - -> Index Scan using ggpolygonind on gpolygon_tbl - Index Cond: (f1 && '((1000,1000),(0,0))'::polygon) -(4 rows) + -> Partial Aggregate + -> Index Scan using ggpolygonind on gpolygon_tbl + Index Cond: (f1 && '((1000,1000),(0,0))'::polygon) +(5 rows) SELECT count(*) FROM gpolygon_tbl WHERE f1 && '(1000,1000,0,0)'::polygon; count @@ -521,13 +522,14 @@ SELECT count(*) FROM gpolygon_tbl WHERE f1 && '(1000,1000,0,0)'::polygon; EXPLAIN (COSTS OFF, NODES OFF) SELECT count(*) FROM gcircle_tbl WHERE f1 && '<(500,500),500>'::circle; - QUERY PLAN -------------------------------------------------------------- - Aggregate + QUERY PLAN +------------------------------------------------------------------- + Finalize Aggregate -> Remote Subquery Scan on all - -> Index Scan using ggcircleind on gcircle_tbl - Index Cond: (f1 && '<(500,500),500>'::circle) -(4 rows) + -> Partial Aggregate + -> Index Scan using ggcircleind on gcircle_tbl + Index Cond: (f1 && '<(500,500),500>'::circle) +(5 rows) SELECT count(*) FROM gcircle_tbl WHERE f1 && '<(500,500),500>'::circle; count @@ -539,7 +541,7 @@ EXPLAIN (COSTS OFF, NODES OFF) SELECT count(*) FROM point_tbl WHERE f1 <@ box '(0,0,100,100)'; QUERY PLAN ---------------------------------------------------------- - Remote Fast Query Execution + Remote Subquery Scan on all -> Aggregate -> Index Only Scan using gpointind on point_tbl Index Cond: (f1 <@ '(100,100),(0,0)'::box) @@ -555,7 +557,7 @@ EXPLAIN (COSTS OFF, NODES OFF) SELECT count(*) FROM point_tbl WHERE box '(0,0,100,100)' @> f1; QUERY PLAN ---------------------------------------------------------- - Remote Fast Query Execution + Remote Subquery Scan on all -> Aggregate -> Index Only Scan using gpointind on point_tbl Index Cond: (f1 <@ '(100,100),(0,0)'::box) @@ -571,7 +573,7 @@ EXPLAIN (COSTS OFF, NODES OFF) SELECT count(*) FROM point_tbl WHERE f1 <@ polygon '(0,0),(0,100),(100,100),(50,50),(100,0),(0,0)'; QUERY PLAN ---------------------------------------------------------------------------------------------- - Remote Fast Query Execution + Remote Subquery Scan on all -> Aggregate -> Index Only Scan 
using gpointind on point_tbl Index Cond: (f1 <@ '((0,0),(0,100),(100,100),(50,50),(100,0),(0,0))'::polygon) @@ -587,7 +589,7 @@ EXPLAIN (COSTS OFF, NODES OFF) SELECT count(*) FROM point_tbl WHERE f1 <@ circle '<(50,50),50>'; QUERY PLAN ---------------------------------------------------------- - Remote Fast Query Execution + Remote Subquery Scan on all -> Aggregate -> Index Only Scan using gpointind on point_tbl Index Cond: (f1 <@ '<(50,50),50>'::circle) @@ -603,7 +605,7 @@ EXPLAIN (COSTS OFF, NODES OFF) SELECT count(*) FROM point_tbl p WHERE p.f1 << '(0.0, 0.0)'; QUERY PLAN ------------------------------------------------------------ - Remote Fast Query Execution + Remote Subquery Scan on all -> Aggregate -> Index Only Scan using gpointind on point_tbl p Index Cond: (f1 << '(0,0)'::point) @@ -619,7 +621,7 @@ EXPLAIN (COSTS OFF, NODES OFF) SELECT count(*) FROM point_tbl p WHERE p.f1 >> '(0.0, 0.0)'; QUERY PLAN ------------------------------------------------------------ - Remote Fast Query Execution + Remote Subquery Scan on all -> Aggregate -> Index Only Scan using gpointind on point_tbl p Index Cond: (f1 >> '(0,0)'::point) @@ -635,7 +637,7 @@ EXPLAIN (COSTS OFF, NODES OFF) SELECT count(*) FROM point_tbl p WHERE p.f1 <^ '(0.0, 0.0)'; QUERY PLAN ------------------------------------------------------------ - Remote Fast Query Execution + Remote Subquery Scan on all -> Aggregate -> Index Only Scan using gpointind on point_tbl p Index Cond: (f1 <^ '(0,0)'::point) @@ -651,7 +653,7 @@ EXPLAIN (COSTS OFF, NODES OFF) SELECT count(*) FROM point_tbl p WHERE p.f1 >^ '(0.0, 0.0)'; QUERY PLAN ------------------------------------------------------------ - Remote Fast Query Execution + Remote Subquery Scan on all -> Aggregate -> Index Only Scan using gpointind on point_tbl p Index Cond: (f1 >^ '(0,0)'::point) @@ -667,7 +669,7 @@ EXPLAIN (COSTS OFF, NODES OFF) SELECT count(*) FROM point_tbl p WHERE p.f1 ~= '(-5, -12)'; QUERY PLAN ------------------------------------------------------------ - Remote Fast Query Execution + Remote Subquery Scan on all -> Aggregate -> Index Only Scan using gpointind on point_tbl p Index Cond: (f1 ~= '(-5,-12)'::point) @@ -683,7 +685,7 @@ EXPLAIN (COSTS OFF, NODES OFF) SELECT * FROM point_tbl ORDER BY f1 <-> '0,1'; QUERY PLAN ---------------------------------------------------- - Remote Fast Query Execution + Remote Subquery Scan on all -> Index Only Scan using gpointind on point_tbl Order By: (f1 <-> '(0,1)'::point) (3 rows) @@ -719,7 +721,7 @@ EXPLAIN (COSTS OFF, NODES OFF) SELECT * FROM point_tbl WHERE f1 IS NOT NULL ORDER BY f1 <-> '0,1'; QUERY PLAN ---------------------------------------------------- - Remote Fast Query Execution + Remote Subquery Scan on all -> Index Only Scan using gpointind on point_tbl Index Cond: (f1 IS NOT NULL) Order By: (f1 <-> '(0,1)'::point) @@ -740,7 +742,7 @@ EXPLAIN (COSTS OFF, NODES OFF) SELECT * FROM point_tbl WHERE f1 <@ '(-10,-10),(10,10)':: box ORDER BY f1 <-> '0,1'; QUERY PLAN ------------------------------------------------------ - Remote Fast Query Execution + Remote Subquery Scan on all -> Index Only Scan using gpointind on point_tbl Index Cond: (f1 <@ '(10,10),(-10,-10)'::box) Order By: (f1 <-> '(0,1)'::point) @@ -807,13 +809,14 @@ SELECT count(*) FROM quad_point_tbl; EXPLAIN (NODES OFF, COSTS OFF) SELECT count(*) FROM quad_point_tbl WHERE p <@ box '(200,200,1000,1000)'; - QUERY PLAN ------------------------------------------------------------------ - Aggregate + QUERY PLAN 
+----------------------------------------------------------------------- + Finalize Aggregate -> Remote Subquery Scan on all - -> Index Only Scan using sp_quad_ind on quad_point_tbl - Index Cond: (p <@ '(1000,1000),(200,200)'::box) -(4 rows) + -> Partial Aggregate + -> Index Only Scan using sp_quad_ind on quad_point_tbl + Index Cond: (p <@ '(1000,1000),(200,200)'::box) +(5 rows) SELECT count(*) FROM quad_point_tbl WHERE p <@ box '(200,200,1000,1000)'; count @@ -823,13 +826,14 @@ SELECT count(*) FROM quad_point_tbl WHERE p <@ box '(200,200,1000,1000)'; EXPLAIN (NODES OFF, COSTS OFF) SELECT count(*) FROM quad_point_tbl WHERE box '(200,200,1000,1000)' @> p; - QUERY PLAN ------------------------------------------------------------------ - Aggregate + QUERY PLAN +----------------------------------------------------------------------- + Finalize Aggregate -> Remote Subquery Scan on all - -> Index Only Scan using sp_quad_ind on quad_point_tbl - Index Cond: (p <@ '(1000,1000),(200,200)'::box) -(4 rows) + -> Partial Aggregate + -> Index Only Scan using sp_quad_ind on quad_point_tbl + Index Cond: (p <@ '(1000,1000),(200,200)'::box) +(5 rows) SELECT count(*) FROM quad_point_tbl WHERE box '(200,200,1000,1000)' @> p; count @@ -924,13 +928,14 @@ SELECT count(*) FROM quad_point_tbl WHERE p ~= '(4585, 365)'; EXPLAIN (NODES OFF, COSTS OFF) SELECT count(*) FROM kd_point_tbl WHERE p <@ box '(200,200,1000,1000)'; - QUERY PLAN ---------------------------------------------------------------- - Aggregate + QUERY PLAN +--------------------------------------------------------------------- + Finalize Aggregate -> Remote Subquery Scan on all - -> Index Only Scan using sp_kd_ind on kd_point_tbl - Index Cond: (p <@ '(1000,1000),(200,200)'::box) -(4 rows) + -> Partial Aggregate + -> Index Only Scan using sp_kd_ind on kd_point_tbl + Index Cond: (p <@ '(1000,1000),(200,200)'::box) +(5 rows) SELECT count(*) FROM kd_point_tbl WHERE p <@ box '(200,200,1000,1000)'; count @@ -940,13 +945,14 @@ SELECT count(*) FROM kd_point_tbl WHERE p <@ box '(200,200,1000,1000)'; EXPLAIN (NODES OFF, COSTS OFF) SELECT count(*) FROM kd_point_tbl WHERE box '(200,200,1000,1000)' @> p; - QUERY PLAN ---------------------------------------------------------------- - Aggregate + QUERY PLAN +--------------------------------------------------------------------- + Finalize Aggregate -> Remote Subquery Scan on all - -> Index Only Scan using sp_kd_ind on kd_point_tbl - Index Cond: (p <@ '(1000,1000),(200,200)'::box) -(4 rows) + -> Partial Aggregate + -> Index Only Scan using sp_kd_ind on kd_point_tbl + Index Cond: (p <@ '(1000,1000),(200,200)'::box) +(5 rows) SELECT count(*) FROM kd_point_tbl WHERE box '(200,200,1000,1000)' @> p; count @@ -1315,7 +1321,7 @@ EXPLAIN (COSTS OFF, NODES OFF) SELECT * FROM point_tbl WHERE f1 <@ '(-10,-10),(10,10)':: box ORDER BY f1 <-> '0,1'; QUERY PLAN ------------------------------------------------------------------ - Remote Fast Query Execution + Remote Subquery Scan on all -> Sort Sort Key: ((f1 <-> '(0,1)'::point)) -> Bitmap Heap Scan on point_tbl @@ -1390,15 +1396,16 @@ SELECT count(*) FROM quad_point_tbl; EXPLAIN (NODES OFF, COSTS OFF) SELECT count(*) FROM quad_point_tbl WHERE p <@ box '(200,200,1000,1000)'; - QUERY PLAN ---------------------------------------------------------------------- - Aggregate + QUERY PLAN +--------------------------------------------------------------------------- + Finalize Aggregate -> Remote Subquery Scan on all - -> Bitmap Heap Scan on quad_point_tbl - Recheck Cond: (p <@ 
'(1000,1000),(200,200)'::box) - -> Bitmap Index Scan on sp_quad_ind - Index Cond: (p <@ '(1000,1000),(200,200)'::box) -(6 rows) + -> Partial Aggregate + -> Bitmap Heap Scan on quad_point_tbl + Recheck Cond: (p <@ '(1000,1000),(200,200)'::box) + -> Bitmap Index Scan on sp_quad_ind + Index Cond: (p <@ '(1000,1000),(200,200)'::box) +(7 rows) SELECT count(*) FROM quad_point_tbl WHERE p <@ box '(200,200,1000,1000)'; count @@ -1408,15 +1415,16 @@ SELECT count(*) FROM quad_point_tbl WHERE p <@ box '(200,200,1000,1000)'; EXPLAIN (NODES OFF, COSTS OFF) SELECT count(*) FROM quad_point_tbl WHERE box '(200,200,1000,1000)' @> p; - QUERY PLAN ---------------------------------------------------------------------- - Aggregate + QUERY PLAN +--------------------------------------------------------------------------- + Finalize Aggregate -> Remote Subquery Scan on all - -> Bitmap Heap Scan on quad_point_tbl - Recheck Cond: ('(1000,1000),(200,200)'::box @> p) - -> Bitmap Index Scan on sp_quad_ind - Index Cond: ('(1000,1000),(200,200)'::box @> p) -(6 rows) + -> Partial Aggregate + -> Bitmap Heap Scan on quad_point_tbl + Recheck Cond: ('(1000,1000),(200,200)'::box @> p) + -> Bitmap Index Scan on sp_quad_ind + Index Cond: ('(1000,1000),(200,200)'::box @> p) +(7 rows) SELECT count(*) FROM quad_point_tbl WHERE box '(200,200,1000,1000)' @> p; count @@ -1521,15 +1529,16 @@ SELECT count(*) FROM quad_point_tbl WHERE p ~= '(4585, 365)'; EXPLAIN (NODES OFF, COSTS OFF) SELECT count(*) FROM kd_point_tbl WHERE p <@ box '(200,200,1000,1000)'; - QUERY PLAN ---------------------------------------------------------------------- - Aggregate + QUERY PLAN +--------------------------------------------------------------------------- + Finalize Aggregate -> Remote Subquery Scan on all - -> Bitmap Heap Scan on kd_point_tbl - Recheck Cond: (p <@ '(1000,1000),(200,200)'::box) - -> Bitmap Index Scan on sp_kd_ind - Index Cond: (p <@ '(1000,1000),(200,200)'::box) -(6 rows) + -> Partial Aggregate + -> Bitmap Heap Scan on kd_point_tbl + Recheck Cond: (p <@ '(1000,1000),(200,200)'::box) + -> Bitmap Index Scan on sp_kd_ind + Index Cond: (p <@ '(1000,1000),(200,200)'::box) +(7 rows) SELECT count(*) FROM kd_point_tbl WHERE p <@ box '(200,200,1000,1000)'; count @@ -1539,15 +1548,16 @@ SELECT count(*) FROM kd_point_tbl WHERE p <@ box '(200,200,1000,1000)'; EXPLAIN (NODES OFF, COSTS OFF) SELECT count(*) FROM kd_point_tbl WHERE box '(200,200,1000,1000)' @> p; - QUERY PLAN ---------------------------------------------------------------------- - Aggregate + QUERY PLAN +--------------------------------------------------------------------------- + Finalize Aggregate -> Remote Subquery Scan on all - -> Bitmap Heap Scan on kd_point_tbl - Recheck Cond: ('(1000,1000),(200,200)'::box @> p) - -> Bitmap Index Scan on sp_kd_ind - Index Cond: ('(1000,1000),(200,200)'::box @> p) -(6 rows) + -> Partial Aggregate + -> Bitmap Heap Scan on kd_point_tbl + Recheck Cond: ('(1000,1000),(200,200)'::box @> p) + -> Bitmap Index Scan on sp_kd_ind + Index Cond: ('(1000,1000),(200,200)'::box @> p) +(7 rows) SELECT count(*) FROM kd_point_tbl WHERE box '(200,200,1000,1000)' @> p; count @@ -2612,7 +2622,6 @@ DROP INDEX CONCURRENTLY "concur_index2"; -- works ERROR: index "concur_index2" does not exist DROP INDEX CONCURRENTLY IF EXISTS "concur_index2"; -- notice NOTICE: index "concur_index2" does not exist, skipping -ERROR: DROP INDEX CONCURRENTLY cannot run inside a transaction block -- failures DROP INDEX CONCURRENTLY "concur_index2", "concur_index3"; ERROR: index 
"concur_index2" does not exist @@ -2623,7 +2632,6 @@ ROLLBACK; -- successes DROP INDEX CONCURRENTLY IF EXISTS "concur_index3"; NOTICE: index "concur_index3" does not exist, skipping -ERROR: DROP INDEX CONCURRENTLY cannot run inside a transaction block DROP INDEX CONCURRENTLY "concur_index4"; ERROR: index "concur_index4" does not exist DROP INDEX CONCURRENTLY "concur_index5"; @@ -2911,21 +2919,22 @@ SELECT * FROM tenk1 EXPLAIN (NODES OFF, COSTS OFF) SELECT count(*) FROM tenk1 WHERE hundred = 42 AND (thousand = 42 OR thousand = 99); - QUERY PLAN ---------------------------------------------------------------------------------------- - Aggregate + QUERY PLAN +--------------------------------------------------------------------------------------------- + Finalize Aggregate -> Remote Subquery Scan on all - -> Bitmap Heap Scan on tenk1 - Recheck Cond: ((hundred = 42) AND ((thousand = 42) OR (thousand = 99))) - -> BitmapAnd - -> Bitmap Index Scan on tenk1_hundred - Index Cond: (hundred = 42) - -> BitmapOr - -> Bitmap Index Scan on tenk1_thous_tenthous - Index Cond: (thousand = 42) - -> Bitmap Index Scan on tenk1_thous_tenthous - Index Cond: (thousand = 99) -(12 rows) + -> Partial Aggregate + -> Bitmap Heap Scan on tenk1 + Recheck Cond: ((hundred = 42) AND ((thousand = 42) OR (thousand = 99))) + -> BitmapAnd + -> Bitmap Index Scan on tenk1_hundred + Index Cond: (hundred = 42) + -> BitmapOr + -> Bitmap Index Scan on tenk1_thous_tenthous + Index Cond: (thousand = 42) + -> Bitmap Index Scan on tenk1_thous_tenthous + Index Cond: (thousand = 99) +(13 rows) SELECT count(*) FROM tenk1 WHERE hundred = 42 AND (thousand = 42 OR thousand = 99); @@ -2976,12 +2985,16 @@ explain (costs off) SELECT unique1 FROM tenk1 WHERE unique1 IN (1,42,7) ORDER BY unique1; - QUERY PLAN -------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) - -> Index Only Scan using tenk1_unique1 on tenk1 - Index Cond: (unique1 = ANY ('{1,42,7}'::integer[])) -(3 rows) + -> Sort + Sort Key: unique1 + -> Bitmap Heap Scan on tenk1 + Recheck Cond: (unique1 = ANY ('{1,42,7}'::integer[])) + -> Bitmap Index Scan on tenk1_unique1 + Index Cond: (unique1 = ANY ('{1,42,7}'::integer[])) +(7 rows) SELECT unique1 FROM tenk1 WHERE unique1 IN (1,42,7) @@ -2997,13 +3010,14 @@ explain (costs off) SELECT thousand, tenthous FROM tenk1 WHERE thousand < 2 AND tenthous IN (1001,3000) ORDER BY thousand; - QUERY PLAN -------------------------------------------------------------- + QUERY PLAN +-------------------------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) - -> Index Only Scan using tenk1_thous_tenthous on tenk1 - Index Cond: (thousand < 2) - Filter: (tenthous = ANY ('{1001,3000}'::integer[])) -(4 rows) + -> Sort + Sort Key: thousand + -> Index Only Scan using tenk1_thous_tenthous on tenk1 + Index Cond: ((thousand < 2) AND (tenthous = ANY ('{1001,3000}'::integer[]))) +(5 rows) SELECT thousand, tenthous FROM tenk1 WHERE thousand < 2 AND tenthous IN (1001,3000) From 12482d3b1681cf92cc095806c3300813095de565 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Fri, 5 Feb 2021 17:35:21 +0800 Subject: [PATCH 105/578] fix regress --- src/test/regress/expected/create_index.out | 26 +++++++++------------- 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/src/test/regress/expected/create_index.out b/src/test/regress/expected/create_index.out 
index 924c7c95..dd727bc7 100644 --- a/src/test/regress/expected/create_index.out +++ b/src/test/regress/expected/create_index.out @@ -2489,16 +2489,14 @@ SET maintenance_work_mem = '1MB'; CREATE INDEX hash_tuplesort_idx ON tenk1 USING hash (stringu1 name_ops) WITH (fillfactor = 10); EXPLAIN (COSTS OFF) SELECT count(*) FROM tenk1 WHERE stringu1 = 'TVAAAA'; - QUERY PLAN -------------------------------------------------------------------- + QUERY PLAN +---------------------------------------------------------------- Finalize Aggregate -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Partial Aggregate - -> Bitmap Heap Scan on tenk1 - Recheck Cond: (stringu1 = 'TVAAAA'::name) - -> Bitmap Index Scan on hash_tuplesort_idx - Index Cond: (stringu1 = 'TVAAAA'::name) -(7 rows) + -> Index Scan using hash_tuplesort_idx on tenk1 + Index Cond: (stringu1 = 'TVAAAA'::name) +(5 rows) SELECT count(*) FROM tenk1 WHERE stringu1 = 'TVAAAA'; count @@ -2985,16 +2983,12 @@ explain (costs off) SELECT unique1 FROM tenk1 WHERE unique1 IN (1,42,7) ORDER BY unique1; - QUERY PLAN -------------------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) - -> Sort - Sort Key: unique1 - -> Bitmap Heap Scan on tenk1 - Recheck Cond: (unique1 = ANY ('{1,42,7}'::integer[])) - -> Bitmap Index Scan on tenk1_unique1 - Index Cond: (unique1 = ANY ('{1,42,7}'::integer[])) -(7 rows) + -> Index Only Scan using tenk1_unique1 on tenk1 + Index Cond: (unique1 = ANY ('{1,42,7}'::integer[])) +(3 rows) SELECT unique1 FROM tenk1 WHERE unique1 IN (1,42,7) From 61c26682a51062d71331fdaaecfbbf9e1725470b Mon Sep 17 00:00:00 2001 From: sigmalin Date: Mon, 22 Feb 2021 20:31:30 +0800 Subject: [PATCH 106/578] report error if squeue null --- src/backend/tcop/pquery.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/backend/tcop/pquery.c b/src/backend/tcop/pquery.c index 3f2e7d62..7a529dac 100644 --- a/src/backend/tcop/pquery.c +++ b/src/backend/tcop/pquery.c @@ -1409,6 +1409,11 @@ PortalRun(Portal portal, long count, bool isTopLevel, bool run_once, int myindex = queryDesc->myindex; TupleTableSlot *slot; + if (squeue == NULL) + { + elog(ERROR, "squeue: %s is null, myindex: %d, atStart: %d, atEnd: %d", portal->name, myindex, portal->atStart, portal->atEnd); + } + /* * We are the consumer. 
* We have skipped plan initialization, hence we do not have From 73eaa3eccdfa575e9535035972872d1ea9e867da Mon Sep 17 00:00:00 2001 From: anthonyyan Date: Sun, 7 Feb 2021 10:43:48 +0800 Subject: [PATCH 107/578] fix cn002 coredump when cn001 switchover, query pgxc_node must wrap transaction, http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131084796329 (merge request !157) (cherry picked from commit c5021b12) 1096f0a6 fix cn002 coredump when cn001 switchover, query pgxc_node must wrap transaction, http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131084796329 --- src/backend/pgxc/nodemgr/nodemgr.c | 25 ++++++++++++++++++++++++- src/backend/pgxc/pool/pgxcnode.c | 2 +- src/backend/pgxc/pool/poolmgr.c | 6 +++--- src/include/pgxc/nodemgr.h | 2 +- 4 files changed, 29 insertions(+), 6 deletions(-) diff --git a/src/backend/pgxc/nodemgr/nodemgr.c b/src/backend/pgxc/nodemgr/nodemgr.c index 8767e723..2a114a9e 100644 --- a/src/backend/pgxc/nodemgr/nodemgr.c +++ b/src/backend/pgxc/nodemgr/nodemgr.c @@ -577,7 +577,7 @@ count_coords_datanodes(Relation rel, int *num_coord, int *num_dns) * * Update node definitions in the shared memory tables from the catalog */ -void +static void PgxcNodeListAndCount(void) {// #lizard forgives Relation rel; @@ -800,6 +800,29 @@ PgxcNodeListAndCount(void) LWLockRelease(NodeTableLock); } +/* + * PgxcNodeListAndCountWrapTransaction + * + * Update node definitions in the shared memory tables from the catalog wrap the transaction + */ +void +PgxcNodeListAndCountWrapTransaction(void) +{ + bool need_abort = false; + + if (!IsTransactionOrTransactionBlock()) + { + StartTransactionCommand(); + need_abort = true; + } + + PgxcNodeListAndCount(); + + if (need_abort) + { + AbortCurrentTransaction(); + } +} /* * PgxcNodeGetIds diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c index 12873dac..8f76dc2a 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -240,7 +240,7 @@ InitMultinodeExecutor(bool is_force) return; /* Update node table in the shared memory */ - PgxcNodeListAndCount(); + PgxcNodeListAndCountWrapTransaction(); /* Get classified list of node Oids */ PgxcNodeGetOidsExtend(&coOids, &dnOids, &sdnOids, &NumCoords, &NumDataNodes, &NumSlaveDataNodes, true); diff --git a/src/backend/pgxc/pool/poolmgr.c b/src/backend/pgxc/pool/poolmgr.c index ca079833..f64b45c3 100644 --- a/src/backend/pgxc/pool/poolmgr.c +++ b/src/backend/pgxc/pool/poolmgr.c @@ -2050,7 +2050,7 @@ PoolManagerCheckConnectionInfo(void) ConnectPoolManager(); } - PgxcNodeListAndCount(); + PgxcNodeListAndCountWrapTransaction(); pool_putmessage(&poolHandle->port, 'q', NULL, 0); pool_flush(&poolHandle->port); @@ -2070,7 +2070,7 @@ void PoolManagerReloadConnectionInfo(void) { Assert(poolHandle); - PgxcNodeListAndCount(); + PgxcNodeListAndCountWrapTransaction(); pool_putmessage(&poolHandle->port, 'p', NULL, 0); pool_flush(&poolHandle->port); } @@ -10640,7 +10640,7 @@ PoolManagerRefreshConnectionInfo(void) HOLD_POOLER_RELOAD(); Assert(poolHandle); - PgxcNodeListAndCount(); + PgxcNodeListAndCountWrapTransaction(); pool_putmessage(&poolHandle->port, 'R', NULL, 0); pool_flush(&poolHandle->port); diff --git a/src/include/pgxc/nodemgr.h b/src/include/pgxc/nodemgr.h index ee6c7417..0f31d8cc 100644 --- a/src/include/pgxc/nodemgr.h +++ b/src/include/pgxc/nodemgr.h @@ -47,7 +47,7 @@ extern Size NodeHashTableShmemSize(void); #endif extern Size NodeTablesShmemSize(void); -extern void PgxcNodeListAndCount(void); +extern void PgxcNodeListAndCountWrapTransaction(void); 
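/*
 * [Editor's sketch -- illustrative only, not part of the patch above.]
 * Why the wrapper exists: PgxcNodeListAndCount() scans the pgxc_node
 * catalog, and catalog access needs an active transaction (snapshot and
 * resource owner).  Callers such as InitMultinodeExecutor() or the pooler
 * check/reload/refresh paths may run when no transaction is open, so the
 * wrapper starts a throw-away transaction only in that case (via
 * IsTransactionOrTransactionBlock()/StartTransactionCommand()) and aborts
 * it afterwards.  The caller below is hypothetical and only shows the
 * intended usage pattern.
 */
static void
example_refresh_shared_node_table(void)
{
    /* Safe to call whether or not a transaction is already open. */
    PgxcNodeListAndCountWrapTransaction();
}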
extern void PgxcNodeGetOidsExtend(Oid **coOids, Oid **dnOids, Oid **sdnOids, int *num_coords, int *num_dns, int *num_sdns, bool update_preferred); From f581cb881b97cfcf03d1775007a64eccabd76a8c Mon Sep 17 00:00:00 2001 From: andrelin Date: Mon, 1 Feb 2021 12:43:41 +0000 Subject: [PATCH 108/578] Allow modifying pg_node_tree by guc: allow_force_ddl (merge request !106) (cherry picked from commit 2a5a6850) --- src/backend/utils/adt/pseudotypes.c | 345 ++++++++++++++-------------- 1 file changed, 176 insertions(+), 169 deletions(-) diff --git a/src/backend/utils/adt/pseudotypes.c b/src/backend/utils/adt/pseudotypes.c index f55ffff9..60a08586 100644 --- a/src/backend/utils/adt/pseudotypes.c +++ b/src/backend/utils/adt/pseudotypes.c @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * * pseudotypes.c - * Functions for the system pseudo-types. + * Functions for the system pseudo-types. * * A pseudo-type isn't really a type and never has any operations, but * we do need to supply input and output functions to satisfy the links @@ -17,7 +17,7 @@ * * * IDENTIFICATION - * src/backend/utils/adt/pseudotypes.c + * src/backend/utils/adt/pseudotypes.c * *------------------------------------------------------------------------- */ @@ -34,22 +34,25 @@ #include "utils/lsyscache.h" #include "utils/syscache.h" #endif +#ifdef __TBASE__ +#include "utils/guc.h" +#endif /* - * cstring_in - input routine for pseudo-type CSTRING. + * cstring_in - input routine for pseudo-type CSTRING. * * We might as well allow this to support constructs like "foo_in('blah')". */ Datum cstring_in(PG_FUNCTION_ARGS) { - char *str = PG_GETARG_CSTRING(0); + char *str = PG_GETARG_CSTRING(0); - PG_RETURN_CSTRING(pstrdup(str)); + PG_RETURN_CSTRING(pstrdup(str)); } /* - * cstring_out - output routine for pseudo-type CSTRING. + * cstring_out - output routine for pseudo-type CSTRING. * * We allow this mainly so that "SELECT some_output_function(...)" does * what the user will expect. @@ -57,61 +60,61 @@ cstring_in(PG_FUNCTION_ARGS) Datum cstring_out(PG_FUNCTION_ARGS) { - char *str = PG_GETARG_CSTRING(0); + char *str = PG_GETARG_CSTRING(0); - PG_RETURN_CSTRING(pstrdup(str)); + PG_RETURN_CSTRING(pstrdup(str)); } /* - * cstring_recv - binary input routine for pseudo-type CSTRING. + * cstring_recv - binary input routine for pseudo-type CSTRING. */ Datum cstring_recv(PG_FUNCTION_ARGS) { - StringInfo buf = (StringInfo) PG_GETARG_POINTER(0); - char *str; - int nbytes; + StringInfo buf = (StringInfo) PG_GETARG_POINTER(0); + char *str; + int nbytes; - str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes); - PG_RETURN_CSTRING(str); + str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes); + PG_RETURN_CSTRING(str); } /* - * cstring_send - binary output routine for pseudo-type CSTRING. + * cstring_send - binary output routine for pseudo-type CSTRING. */ Datum cstring_send(PG_FUNCTION_ARGS) { - char *str = PG_GETARG_CSTRING(0); - StringInfoData buf; + char *str = PG_GETARG_CSTRING(0); + StringInfoData buf; - pq_begintypsend(&buf); - pq_sendtext(&buf, str, strlen(str)); - PG_RETURN_BYTEA_P(pq_endtypsend(&buf)); + pq_begintypsend(&buf); + pq_sendtext(&buf, str, strlen(str)); + PG_RETURN_BYTEA_P(pq_endtypsend(&buf)); } /* - * anyarray_in - input routine for pseudo-type ANYARRAY. + * anyarray_in - input routine for pseudo-type ANYARRAY. 
*/ Datum anyarray_in(PG_FUNCTION_ARGS) { #ifdef XCP - /* - * XCP version of array_in() understands prefix describing element type - */ - return array_in(fcinfo); + /* + * XCP version of array_in() understands prefix describing element type + */ + return array_in(fcinfo); #else - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("cannot accept a value of type %s", "anyarray"))); + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot accept a value of type %s", "anyarray"))); - PG_RETURN_VOID(); /* keep compiler quiet */ + PG_RETURN_VOID(); /* keep compiler quiet */ #endif } /* - * anyarray_out - output routine for pseudo-type ANYARRAY. + * anyarray_out - output routine for pseudo-type ANYARRAY. * * We may as well allow this, since array_out will in fact work. * XCP needs to send from data nodes to coordinator values of that type. @@ -122,58 +125,58 @@ Datum anyarray_out(PG_FUNCTION_ARGS) { #ifdef XCP - /* - * Output prefix: (type_namespace_name.typename) to look up actual element - * type at the destination node then output in usual format for array - */ - ArrayType *v = PG_GETARG_ARRAYTYPE_P(0); - Oid element_type = ARR_ELEMTYPE(v); - Form_pg_type typeForm; - HeapTuple typeTuple; - char *typname, - *typnspname; - /* two identifiers, parenthesis, dot and trailing \0 */ - char prefix[2*NAMEDATALEN+4], - *retval, - *newval; - int prefixlen, retvallen; - Datum array_out_result; - MemoryContext save_context; - - save_context = MemoryContextSwitchTo(fcinfo->flinfo->fn_mcxt); - /* Figure out type name and type namespace */ - typeTuple = SearchSysCache(TYPEOID, - ObjectIdGetDatum(element_type), - 0, 0, 0); - if (!HeapTupleIsValid(typeTuple)) - elog(ERROR, "cache lookup failed for type %u", element_type); - typeForm = (Form_pg_type) GETSTRUCT(typeTuple); - typname = NameStr(typeForm->typname); - typnspname = get_namespace_name(typeForm->typnamespace); - - sprintf(prefix, "(%s.%s)", typnspname, typname); - ReleaseSysCache(typeTuple); - MemoryContextSwitchTo(save_context); - - /* Get standard output and make up prefixed result */ - array_out_result = array_out(fcinfo); - retval = DatumGetCString(array_out_result); - prefixlen = strlen(prefix); - retvallen = strlen(retval); - newval = (char *) palloc(prefixlen + retvallen + 1); - strcpy(newval, prefix); - strcpy(newval + prefixlen, retval); - - pfree(retval); - - PG_RETURN_CSTRING(newval); + /* + * Output prefix: (type_namespace_name.typename) to look up actual element + * type at the destination node then output in usual format for array + */ + ArrayType *v = PG_GETARG_ARRAYTYPE_P(0); + Oid element_type = ARR_ELEMTYPE(v); + Form_pg_type typeForm; + HeapTuple typeTuple; + char *typname, + *typnspname; + /* two identifiers, parenthesis, dot and trailing \0 */ + char prefix[2*NAMEDATALEN+4], + *retval, + *newval; + int prefixlen, retvallen; + Datum array_out_result; + MemoryContext save_context; + + save_context = MemoryContextSwitchTo(fcinfo->flinfo->fn_mcxt); + /* Figure out type name and type namespace */ + typeTuple = SearchSysCache(TYPEOID, + ObjectIdGetDatum(element_type), + 0, 0, 0); + if (!HeapTupleIsValid(typeTuple)) + elog(ERROR, "cache lookup failed for type %u", element_type); + typeForm = (Form_pg_type) GETSTRUCT(typeTuple); + typname = NameStr(typeForm->typname); + typnspname = get_namespace_name(typeForm->typnamespace); + + sprintf(prefix, "(%s.%s)", typnspname, typname); + ReleaseSysCache(typeTuple); + MemoryContextSwitchTo(save_context); + + /* Get standard output and make up prefixed result */ + 
array_out_result = array_out(fcinfo); + retval = DatumGetCString(array_out_result); + prefixlen = strlen(prefix); + retvallen = strlen(retval); + newval = (char *) palloc(prefixlen + retvallen + 1); + strcpy(newval, prefix); + strcpy(newval + prefixlen, retval); + + pfree(retval); + + PG_RETURN_CSTRING(newval); #else - return array_out(fcinfo); + return array_out(fcinfo); #endif } /* - * anyarray_recv - binary input routine for pseudo-type ANYARRAY. + * anyarray_recv - binary input routine for pseudo-type ANYARRAY. * * XXX this could actually be made to work, since the incoming array * data will contain the element type OID. Need to think through @@ -182,75 +185,75 @@ anyarray_out(PG_FUNCTION_ARGS) Datum anyarray_recv(PG_FUNCTION_ARGS) { - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("cannot accept a value of type %s", "anyarray"))); + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot accept a value of type %s", "anyarray"))); - PG_RETURN_VOID(); /* keep compiler quiet */ + PG_RETURN_VOID(); /* keep compiler quiet */ } /* - * anyarray_send - binary output routine for pseudo-type ANYARRAY. + * anyarray_send - binary output routine for pseudo-type ANYARRAY. * * We may as well allow this, since array_send will in fact work. */ Datum anyarray_send(PG_FUNCTION_ARGS) { - return array_send(fcinfo); + return array_send(fcinfo); } /* - * anyenum_in - input routine for pseudo-type ANYENUM. + * anyenum_in - input routine for pseudo-type ANYENUM. */ Datum anyenum_in(PG_FUNCTION_ARGS) { - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("cannot accept a value of type %s", "anyenum"))); + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot accept a value of type %s", "anyenum"))); - PG_RETURN_VOID(); /* keep compiler quiet */ + PG_RETURN_VOID(); /* keep compiler quiet */ } /* - * anyenum_out - output routine for pseudo-type ANYENUM. + * anyenum_out - output routine for pseudo-type ANYENUM. * * We may as well allow this, since enum_out will in fact work. */ Datum anyenum_out(PG_FUNCTION_ARGS) { - return enum_out(fcinfo); + return enum_out(fcinfo); } /* - * anyrange_in - input routine for pseudo-type ANYRANGE. + * anyrange_in - input routine for pseudo-type ANYRANGE. */ Datum anyrange_in(PG_FUNCTION_ARGS) { - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("cannot accept a value of type %s", "anyrange"))); + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot accept a value of type %s", "anyrange"))); - PG_RETURN_VOID(); /* keep compiler quiet */ + PG_RETURN_VOID(); /* keep compiler quiet */ } /* - * anyrange_out - output routine for pseudo-type ANYRANGE. + * anyrange_out - output routine for pseudo-type ANYRANGE. * * We may as well allow this, since range_out will in fact work. */ Datum anyrange_out(PG_FUNCTION_ARGS) { - return range_out(fcinfo); + return range_out(fcinfo); } /* - * void_in - input routine for pseudo-type VOID. + * void_in - input routine for pseudo-type VOID. * * We allow this so that PL functions can return VOID without any special * hack in the PL handler. Whatever value the PL thinks it's returning @@ -259,22 +262,22 @@ anyrange_out(PG_FUNCTION_ARGS) Datum void_in(PG_FUNCTION_ARGS) { - PG_RETURN_VOID(); /* you were expecting something different? */ + PG_RETURN_VOID(); /* you were expecting something different? */ } /* - * void_out - output routine for pseudo-type VOID. + * void_out - output routine for pseudo-type VOID. 
* * We allow this so that "SELECT function_returning_void(...)" works. */ Datum void_out(PG_FUNCTION_ARGS) { - PG_RETURN_CSTRING(pstrdup("")); + PG_RETURN_CSTRING(pstrdup("")); } /* - * void_recv - binary input routine for pseudo-type VOID. + * void_recv - binary input routine for pseudo-type VOID. * * Note that since we consume no bytes, an attempt to send anything but * an empty string will result in an "invalid message format" error. @@ -282,11 +285,11 @@ void_out(PG_FUNCTION_ARGS) Datum void_recv(PG_FUNCTION_ARGS) { - PG_RETURN_VOID(); + PG_RETURN_VOID(); } /* - * void_send - binary output routine for pseudo-type VOID. + * void_send - binary output routine for pseudo-type VOID. * * We allow this so that "SELECT function_returning_void(...)" works * even when binary output is requested. @@ -294,42 +297,42 @@ void_recv(PG_FUNCTION_ARGS) Datum void_send(PG_FUNCTION_ARGS) { - StringInfoData buf; + StringInfoData buf; - /* send an empty string */ - pq_begintypsend(&buf); - PG_RETURN_BYTEA_P(pq_endtypsend(&buf)); + /* send an empty string */ + pq_begintypsend(&buf); + PG_RETURN_BYTEA_P(pq_endtypsend(&buf)); } /* - * shell_in - input routine for "shell" types (those not yet filled in). + * shell_in - input routine for "shell" types (those not yet filled in). */ Datum shell_in(PG_FUNCTION_ARGS) { - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("cannot accept a value of a shell type"))); + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot accept a value of a shell type"))); - PG_RETURN_VOID(); /* keep compiler quiet */ + PG_RETURN_VOID(); /* keep compiler quiet */ } /* - * shell_out - output routine for "shell" types. + * shell_out - output routine for "shell" types. */ Datum shell_out(PG_FUNCTION_ARGS) { - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("cannot display a value of a shell type"))); + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot display a value of a shell type"))); - PG_RETURN_VOID(); /* keep compiler quiet */ + PG_RETURN_VOID(); /* keep compiler quiet */ } /* - * pg_node_tree_in - input routine for type PG_NODE_TREE. + * pg_node_tree_in - input routine for type PG_NODE_TREE. * * pg_node_tree isn't really a pseudotype --- it's real enough to be a table * column --- but it presently has no operations of its own, and disallows @@ -338,53 +341,57 @@ shell_out(PG_FUNCTION_ARGS) Datum pg_node_tree_in(PG_FUNCTION_ARGS) { - /* - * We disallow input of pg_node_tree values because the SQL functions that - * operate on the type are not secure against malformed input. - */ - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("cannot accept a value of type %s", "pg_node_tree"))); - - PG_RETURN_VOID(); /* keep compiler quiet */ +#ifdef __TBASE__ + if (g_allow_force_ddl) + return textin(fcinfo); +#endif + /* + * We disallow input of pg_node_tree values because the SQL functions that + * operate on the type are not secure against malformed input. + */ + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot accept a value of type %s", "pg_node_tree"))); + + PG_RETURN_VOID(); /* keep compiler quiet */ } /* - * pg_node_tree_out - output routine for type PG_NODE_TREE. + * pg_node_tree_out - output routine for type PG_NODE_TREE. * * The internal representation is the same as TEXT, so just pass it off. */ Datum pg_node_tree_out(PG_FUNCTION_ARGS) { - return textout(fcinfo); + return textout(fcinfo); } /* - * pg_node_tree_recv - binary input routine for type PG_NODE_TREE. 
+ * pg_node_tree_recv - binary input routine for type PG_NODE_TREE. */ Datum pg_node_tree_recv(PG_FUNCTION_ARGS) { - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("cannot accept a value of type %s", "pg_node_tree"))); + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot accept a value of type %s", "pg_node_tree"))); - PG_RETURN_VOID(); /* keep compiler quiet */ + PG_RETURN_VOID(); /* keep compiler quiet */ } /* - * pg_node_tree_send - binary output routine for type PG_NODE_TREE. + * pg_node_tree_send - binary output routine for type PG_NODE_TREE. */ Datum pg_node_tree_send(PG_FUNCTION_ARGS) { - return textsend(fcinfo); + return textsend(fcinfo); } /* - * pg_ddl_command_in - input routine for type PG_DDL_COMMAND. + * pg_ddl_command_in - input routine for type PG_DDL_COMMAND. * * Like pg_node_tree, pg_ddl_command isn't really a pseudotype; it's here for * the same reasons as that one. @@ -392,55 +399,55 @@ pg_node_tree_send(PG_FUNCTION_ARGS) Datum pg_ddl_command_in(PG_FUNCTION_ARGS) { - /* - * Disallow input of pg_ddl_command value. - */ - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("cannot accept a value of type %s", "pg_ddl_command"))); - - PG_RETURN_VOID(); /* keep compiler quiet */ + /* + * Disallow input of pg_ddl_command value. + */ + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot accept a value of type %s", "pg_ddl_command"))); + + PG_RETURN_VOID(); /* keep compiler quiet */ } /* - * pg_ddl_command_out - output routine for type PG_DDL_COMMAND. + * pg_ddl_command_out - output routine for type PG_DDL_COMMAND. * * We don't have any good way to output this type directly, so punt. */ Datum pg_ddl_command_out(PG_FUNCTION_ARGS) { - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("cannot output a value of type %s", "pg_ddl_command"))); + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot output a value of type %s", "pg_ddl_command"))); - PG_RETURN_VOID(); + PG_RETURN_VOID(); } /* - * pg_ddl_command_recv - binary input routine for type PG_DDL_COMMAND. + * pg_ddl_command_recv - binary input routine for type PG_DDL_COMMAND. */ Datum pg_ddl_command_recv(PG_FUNCTION_ARGS) { - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("cannot accept a value of type %s", "pg_ddl_command"))); + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot accept a value of type %s", "pg_ddl_command"))); - PG_RETURN_VOID(); + PG_RETURN_VOID(); } /* - * pg_ddl_command_send - binary output routine for type PG_DDL_COMMAND. + * pg_ddl_command_send - binary output routine for type PG_DDL_COMMAND. 
*/ Datum pg_ddl_command_send(PG_FUNCTION_ARGS) { - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("cannot output a value of type %s", "pg_ddl_command"))); + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot output a value of type %s", "pg_ddl_command"))); - PG_RETURN_VOID(); + PG_RETURN_VOID(); } @@ -453,21 +460,21 @@ pg_ddl_command_send(PG_FUNCTION_ARGS) Datum \ typname##_in(PG_FUNCTION_ARGS) \ { \ - ereport(ERROR, \ - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), \ - errmsg("cannot accept a value of type %s", #typname))); \ + ereport(ERROR, \ + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), \ + errmsg("cannot accept a value of type %s", #typname))); \ \ - PG_RETURN_VOID(); /* keep compiler quiet */ \ + PG_RETURN_VOID(); /* keep compiler quiet */ \ } \ \ Datum \ typname##_out(PG_FUNCTION_ARGS) \ { \ - ereport(ERROR, \ - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), \ - errmsg("cannot display a value of type %s", #typname))); \ + ereport(ERROR, \ + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), \ + errmsg("cannot display a value of type %s", #typname))); \ \ - PG_RETURN_VOID(); /* keep compiler quiet */ \ + PG_RETURN_VOID(); /* keep compiler quiet */ \ } \ \ extern int no_such_variable From cf69d15ed2c89fd16a4146defe2e512f3041f9b9 Mon Sep 17 00:00:00 2001 From: andrelin Date: Tue, 11 May 2021 10:53:17 +0800 Subject: [PATCH 109/578] Support concurrent update with remote subplans (merge request !322) Squash merge branch 'andrelin/try_update' into 'Tbase_v2.15.16.11' * cover regression expectation * Push epqContext including tid, range table idx, ntuples of epqTuple to remote * Upgrade version to TBase_V2.15.16.11 * fix warnings * Reset cursor name of remote subplans when EvalPlanQual tapd: http://tapd.oa.com/pgxz/prong/stories/view/1010092131864520567 --- src/backend/access/transam/gtm.c | 1 - src/backend/executor/execMain.c | 126 +++++++++++++++++- src/backend/nodes/nodeFuncs.c | 110 +++++++++++++++ src/backend/optimizer/prep/preptlist.c | 14 ++ src/backend/pgxc/nodemgr/nodemgr.c | 1 + src/backend/pgxc/pool/execRemote.c | 65 ++++++++- src/backend/pgxc/pool/pgxcnode.c | 23 +++- src/backend/tcop/postgres.c | 23 ++++ src/backend/tcop/pquery.c | 24 +++- src/include/executor/execdesc.h | 1 + src/include/nodes/execnodes.h | 14 ++ src/include/nodes/plannodes.h | 5 + src/include/pgxc/pgxcnode.h | 3 +- src/include/utils/portal.h | 3 + src/test/regress/expected/subselect.out | 8 +- src/test/regress/expected/xc_FQS_join_1.out | 48 +++---- src/test/regress/expected/xc_for_update_1.out | 80 +++++------ 17 files changed, 465 insertions(+), 84 deletions(-) diff --git a/src/backend/access/transam/gtm.c b/src/backend/access/transam/gtm.c index 981332b6..5f4e8218 100644 --- a/src/backend/access/transam/gtm.c +++ b/src/backend/access/transam/gtm.c @@ -1341,7 +1341,6 @@ GetGlobalTimestampGTM(void) GTM_Timestamp latest_gts = InvalidGlobalTimestamp; struct rusage start_r; struct timeval start_t; - int retries = 0; if (log_gtm_stats) ResetUsageCommon(&start_r, &start_t); diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c index 4e0e708b..472bec42 100644 --- a/src/backend/executor/execMain.c +++ b/src/backend/executor/execMain.c @@ -84,6 +84,7 @@ #include "pgxc/poolmgr.h" #endif #ifdef __TBASE__ +#include "optimizer/planmain.h" #include "pgxc/squeue.h" #include "utils/relfilenodemap.h" #endif @@ -141,6 +142,9 @@ static void ExecPartitionCheck(ResultRelInfo *resultRelInfo, static int ExecCheckRTERelkindextPerms(RangeTblEntry *rte); #endif +static bool 
ResetRemoteSubplanCursor(Plan *plan, List *subplans, void *context); +static void AttachRemoteEPQContext(EState *estate, RemoteEPQContext *epq); + /* * Note that GetUpdatedColumns() also exists in commands/trigger.c. There does * not appear to be any good header to put it into, given the structures that @@ -1161,6 +1165,8 @@ InitPlan(QueryDesc *queryDesc, int eflags) estate->es_epqTuple = NULL; estate->es_epqTupleSet = NULL; estate->es_epqScanDone = NULL; + if (queryDesc->epqContext != NULL) + AttachRemoteEPQContext(estate, queryDesc->epqContext); /* * Initialize private state information for each SubPlan. We must do this @@ -2677,6 +2683,15 @@ ExecBuildAuxRowMark(ExecRowMark *erm, List *targetlist) resname); if (!AttributeNumberIsValid(aerm->ctidAttNo)) elog(ERROR, "could not find junk %s column", resname); + +#ifdef __TBASE__ + /* we need xc_node_id combined with ctid to determine physical tuple */ + snprintf(resname, sizeof(resname), "xc_node_id%u", erm->rowmarkId); + aerm->nodeidAttNo = ExecFindJunkAttributeInTlist(targetlist, + resname); + if (!AttributeNumberIsValid(aerm->nodeidAttNo)) + elog(ERROR, "could not find junk %s column", resname); +#endif } else { @@ -3054,11 +3069,14 @@ EvalPlanQualInit(EPQState *epqstate, EState *estate, Plan *subplan, List *auxrowmarks, int epqParam) { /* Mark the EPQ state inactive */ + epqstate->parentestate = estate; epqstate->estate = NULL; epqstate->planstate = NULL; epqstate->origslot = NULL; /* ... and remember data that EvalPlanQualBegin will need */ - epqstate->plan = subplan; + epqstate->plan = copyObject(subplan); + /* Reset cursor name of remote subplans if any */ + ResetRemoteSubplanCursor(epqstate->plan, estate->es_plannedstmt->subplans, "epq"); epqstate->arowMarks = auxrowmarks; epqstate->epqParam = epqParam; } @@ -3074,7 +3092,11 @@ EvalPlanQualSetPlan(EPQState *epqstate, Plan *subplan, List *auxrowmarks) /* If we have a live EPQ query, shut it down */ EvalPlanQualEnd(epqstate); /* And set/change the plan pointer */ - epqstate->plan = subplan; + epqstate->plan = copyObject(subplan); + /* Reset cursor name of remote subplans if any */ + ResetRemoteSubplanCursor(epqstate->plan, + epqstate->parentestate->es_plannedstmt->subplans, + "epq"); /* The rowmarks depend on the plan, too */ epqstate->arowMarks = auxrowmarks; } @@ -3205,8 +3227,15 @@ EvalPlanQualFetchRowMarks(EPQState *epqstate) { /* ordinary table, fetch the tuple */ Buffer buffer; + uint32 xc_node_id; tuple.t_self = *((ItemPointer) DatumGetPointer(datum)); + + xc_node_id = DatumGetUInt32(ExecGetJunkAttribute(epqstate->origslot, + aerm->nodeidAttNo, + &isNull)); + if (xc_node_id == PGXCNodeIdentifier) + { if (!heap_fetch(erm->relation, SnapshotAny, &tuple, &buffer, false, NULL)) elog(ERROR, "failed to fetch tuple for EvalPlanQual recheck"); @@ -3227,6 +3256,14 @@ EvalPlanQualFetchRowMarks(EPQState *epqstate) #endif ReleaseBuffer(buffer); } + else + { + copyTuple = (HeapTuple) palloc(HEAPTUPLESIZE); + copyTuple->t_self = tuple.t_self; + } + + copyTuple->t_xc_node_id = xc_node_id; + } /* store tuple */ EvalPlanQualSetTuple(epqstate, erm->rti, copyTuple); @@ -3510,6 +3547,7 @@ EvalPlanQualEnd(EPQState *epqstate) /* Mark EPQState idle */ epqstate->estate = NULL; + epqstate->parentestate = NULL; epqstate->planstate = NULL; epqstate->origslot = NULL; } @@ -3957,4 +3995,88 @@ int ExecCheckPgclassAuthority(ScanState *node, TupleTableSlot *slot) } #endif +/* + * ResetRemoteSubplanCursor + * walker to find out RemoteSubplan and re-generate a cursor for it + * currently it is used in 
EvalPlanQual, otherwise EvalPlanQual will + * use old cursor name to create a duplicate portal, which is illegal. + */ +static bool +ResetRemoteSubplanCursor(Plan *plan, List *subplans, void *context) +{ + if (plan == NULL) + return false; + + if (IsA(plan, RemoteSubplan)) + { + RemoteSubplan *rsp = castNode(RemoteSubplan, plan); + char *origin_cursor = rsp->cursor; + rsp->cursor = (char *) palloc(NAMEDATALEN); + snprintf(rsp->cursor, NAMEDATALEN, "%s_%s", origin_cursor, (const char *) context); + } + + return plantree_walker(plan, subplans, ResetRemoteSubplanCursor, context); +} +static void +AttachRemoteEPQContext(EState *estate, RemoteEPQContext *epq) +{ + int i; + int rtsize = list_length(estate->es_range_table); + Relation relation; + + estate->es_epqTuple = (HeapTuple *) + palloc0(rtsize * sizeof(HeapTuple)); + estate->es_epqTupleSet = (bool *) + palloc0(rtsize * sizeof(bool)); + estate->es_epqScanDone = (bool *) + palloc0(rtsize * sizeof(bool)); + + for(i = 0; i < epq->ntuples; i++) + { + HeapTuple copyTuple; + HeapTupleData tuple; + Buffer buffer; + int idx = epq->rtidx[i]; + + if (epq->nodeid[i] != PGXCNodeIdentifier) + { + estate->es_epqTupleSet[idx - 1] = true; + estate->es_epqScanDone[idx - 1] = true; + continue; + } + + relation = relation_open(getrelid(idx, estate->es_range_table), NoLock); + if (relation->rd_rel->relkind == RELKIND_FOREIGN_TABLE) + elog(ERROR, "foreign table does not support remote epq process"); + + tuple.t_self = epq->tid[i]; + if (!heap_fetch(relation, SnapshotAny, &tuple, &buffer, + false, NULL)) + { + elog(DEBUG1, "failed to fetch tuple for remote EvalPlanQual recheck"); + relation_close(relation, NoLock); + continue; + } + +#ifdef _MLS_ + if (HeapTupleHeaderGetNatts(tuple.t_data) < + RelationGetDescr(relation)->natts) + { + copyTuple = heap_expand_tuple(&tuple, + RelationGetDescr(relation)); + } + else +#endif + { + /* successful, copy tuple */ + copyTuple = heap_copytuple(&tuple); + } + + estate->es_epqTuple[idx - 1] = copyTuple; + estate->es_epqTupleSet[idx - 1] = true; + + ReleaseBuffer(buffer); + relation_close(relation, NoLock); + } +} diff --git a/src/backend/nodes/nodeFuncs.c b/src/backend/nodes/nodeFuncs.c index 47ebb30d..27a1b7a3 100644 --- a/src/backend/nodes/nodeFuncs.c +++ b/src/backend/nodes/nodeFuncs.c @@ -3872,3 +3872,113 @@ planstate_walk_members(List *plans, PlanState **planstates, return false; } + +/* + * Walk a list of SubPlans (or initPlans, which also use SubPlan nodes). + */ +static bool +plantree_walk_initplans(List *plans, + List *subplans, + bool (*walker) (), + void *context) +{ + ListCell *lc; + + foreach(lc, plans) + { + Plan *splan = list_nth_node(Plan, subplans, + (lfirst_node(SubPlan, lc))->plan_id); + + if (walker(splan, context)) + return true; + } + + return false; +} + +/* + * plantree_walker --- walk plan trees + * + * The walker has already visited the current node, and so we need only + * recurse into any sub-nodes it has. 
+ */ +bool +plantree_walker(Plan *plan, + List *top_subplans, + bool (*walker) (), + void *context) +{ + ListCell *lc; + + if (plan == NULL) + return false; + + /* initPlan-s */ + if (plantree_walk_initplans(plan->initPlan, top_subplans, walker, context)) + return true; + + /* lefttree */ + if (walker(plan->lefttree, top_subplans, context)) + return true; + + /* righttree */ + if (walker(plan->righttree, top_subplans, context)) + return true; + + /* special child plans */ + switch (nodeTag(plan)) + { + case T_ModifyTable: + foreach(lc, ((ModifyTable *) plan)->plans) + { + if (walker((Plan *) lfirst(lc), top_subplans, context)) + return true; + } + break; + case T_Append: + foreach(lc, ((Append *) plan)->appendplans) + { + if (walker((Plan *) lfirst(lc), top_subplans, context)) + return true; + } + break; + case T_MergeAppend: + foreach(lc, ((MergeAppend *) plan)->mergeplans) + { + if (walker((Plan *) lfirst(lc), top_subplans, context)) + return true; + } + break; + case T_BitmapAnd: + foreach(lc, ((BitmapAnd *) plan)->bitmapplans) + { + if (walker((Plan *) lfirst(lc), top_subplans, context)) + return true; + } + break; + case T_BitmapOr: + foreach(lc, ((BitmapOr *) plan)->bitmapplans) + { + if (walker((Plan *) lfirst(lc), top_subplans, context)) + return true; + } + break; + case T_SubqueryScan: + { + if (walker(castNode(SubqueryScan, plan)->subplan, top_subplans, context)) + return true; + } + break; + case T_CustomScan: + foreach(lc, ((CustomScan *) plan)->custom_plans) + { + if (walker((Plan *) lfirst(lc), top_subplans, context)) + return true; + } + break; + default: + break; + } + + return false; +} diff --git a/src/backend/optimizer/prep/preptlist.c b/src/backend/optimizer/prep/preptlist.c index 9091f15f..2a69be86 100644 --- a/src/backend/optimizer/prep/preptlist.c +++ b/src/backend/optimizer/prep/preptlist.c @@ -368,6 +368,20 @@ preprocess_targetlist(PlannerInfo *root, List *tlist) pstrdup(resname), true); tlist = lappend(tlist, tle); + + /* Need to fetch another xc_node_id */ + var = makeVar(rc->rti, + XC_NodeIdAttributeNumber, + INT4OID, + -1, + InvalidOid, + 0); + snprintf(resname, sizeof(resname), "xc_node_id%u", rc->rowmarkId); + tle = makeTargetEntry((Expr *) var, + list_length(tlist) + 1, + pstrdup(resname), + true); + tlist = lappend(tlist, tle); } if (rc->allMarkTypes & (1 << ROW_MARK_COPY)) { diff --git a/src/backend/pgxc/nodemgr/nodemgr.c b/src/backend/pgxc/nodemgr/nodemgr.c index 2a114a9e..570aadee 100644 --- a/src/backend/pgxc/nodemgr/nodemgr.c +++ b/src/backend/pgxc/nodemgr/nodemgr.c @@ -39,6 +39,7 @@ #endif #ifdef __TBASE__ +#include "access/xact.h" #include "libpq/libpq.h" #endif bool enable_multi_cluster = true; diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index cff54d43..80829005 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -10555,7 +10555,8 @@ append_param_data(StringInfo buf, Oid ptype, int pused, Datum value, bool isnull } -static int encode_parameters(int nparams, RemoteParam *remoteparams, +static int +encode_parameters(int nparams, RemoteParam *remoteparams, PlanState *planstate, char** result) { EState *estate = planstate->state; @@ -10616,6 +10617,57 @@ static int encode_parameters(int nparams, RemoteParam *remoteparams, return buf.len; } +/* + * Encode executor context for EvalPlanQual process including: + * the number of epqTuples, the ctid and xc_node_id of each tuple. 
+ */ +static int +encode_epqcontext(PlanState *planstate, char **result) +{ + EState *estate = planstate->state; + StringInfoData buf; + uint16 n16; + uint32 n32; + int ntuples = list_length(estate->es_range_table); + int i; + ExprContext *econtext; + MemoryContext oldcontext; + + if (planstate->ps_ExprContext == NULL) + ExecAssignExprContext(estate, planstate); + + econtext = planstate->ps_ExprContext; + oldcontext = MemoryContextSwitchTo(econtext->ecxt_per_tuple_memory); + + initStringInfo(&buf); + + /* Number of epq tuples */ + n16 = htons(ntuples); + appendBinaryStringInfo(&buf, (char *) &n16, 2); + + for (i = 0; i < ntuples; i++) + { + ItemPointerData tid = estate->es_epqTuple[i]->t_self; + int rtidx = i + 1; + + n16 = htons(rtidx); + appendBinaryStringInfo(&buf, (char *) &n16, 2); + n16 = htons(tid.ip_blkid.bi_hi); + appendBinaryStringInfo(&buf, (char *) &n16, 2); + n16 = htons(tid.ip_blkid.bi_lo); + appendBinaryStringInfo(&buf, (char *) &n16, 2); + n16 = htons(tid.ip_posid); + appendBinaryStringInfo(&buf, (char *) &n16, 2); + n32 = htonl(estate->es_epqTuple[i]->t_xc_node_id); + appendBinaryStringInfo(&buf, (char *) &n32, 4); + } + + /* Take data from the buffer */ + *result = palloc(buf.len); + memcpy(*result, buf.data, buf.len); + MemoryContextSwitchTo(oldcontext); + return buf.len; +} TupleTableSlot * ExecRemoteSubplan(PlanState *pstate) @@ -10665,7 +10717,10 @@ ExecRemoteSubplan(PlanState *pstate) { int fetch = 0; int paramlen = 0; + int epqctxlen = 0; char *paramdata = NULL; + char *epqctxdata = NULL; + /* * Conditions when we want to execute query on the primary node first: * Coordinator running replicated ModifyTable on multiple nodes @@ -10732,6 +10787,9 @@ ExecRemoteSubplan(PlanState *pstate) &combiner->ss.ps, ¶mdata); + if (estate->es_epqTuple != NULL) + epqctxlen = encode_epqcontext(&combiner->ss.ps, &epqctxdata); + /* * The subplan being rescanned, need to restore connections and * re-bind the portal @@ -10771,7 +10829,7 @@ ExecRemoteSubplan(PlanState *pstate) /* rebind */ pgxc_node_send_bind(conn, combiner->cursor, combiner->cursor, - paramlen, paramdata); + paramlen, paramdata, epqctxlen, epqctxdata); if (enable_statistic) { elog(LOG, "Bind Message:pid:%d,remote_pid:%d,remote_ip:%s,remote_port:%d,fd:%d,cursor:%s", @@ -10859,7 +10917,8 @@ ExecRemoteSubplan(PlanState *pstate) } /* bind */ - pgxc_node_send_bind(conn, cursor, cursor, paramlen, paramdata); + pgxc_node_send_bind(conn, cursor, cursor, paramlen, paramdata, + epqctxlen, epqctxdata); if (enable_statistic) { diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c index 8f76dc2a..36558205 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -2145,13 +2145,15 @@ pgxc_node_send_plan(PGXCNodeHandle * handle, const char *statement, */ int pgxc_node_send_bind(PGXCNodeHandle * handle, const char *portal, - const char *statement, int paramlen, char *params) -{// #lizard forgives + const char *statement, int paramlen, const char *params, + int epqctxlen, const char *epqctx) +{ int pnameLen; int stmtLen; int paramCodeLen; int paramValueLen; int paramOutLen; + int epqCtxLen; int msgLen; /* Invalid connection state, return error */ @@ -2168,8 +2170,10 @@ pgxc_node_send_bind(PGXCNodeHandle * handle, const char *portal, paramValueLen = paramlen ? paramlen : 2; /* size of output parameter codes array (always empty for now) */ paramOutLen = 2; + /* size of epq context, 2 if not epq */ + epqCtxLen = epqctxlen ? 
epqctxlen : 2; /* size + pnameLen + stmtLen + parameters */ - msgLen = 4 + pnameLen + stmtLen + paramCodeLen + paramValueLen + paramOutLen; + msgLen = 4 + pnameLen + stmtLen + paramCodeLen + paramValueLen + paramOutLen + epqCtxLen; /* msgType + msgLen */ if (ensure_out_buffer_capacity(handle->outEnd + 1 + msgLen, handle) != 0) @@ -2216,6 +2220,17 @@ pgxc_node_send_bind(PGXCNodeHandle * handle, const char *portal, /* output parameter codes (none) */ handle->outBuffer[handle->outEnd++] = 0; handle->outBuffer[handle->outEnd++] = 0; + /* output epq context */ + if (epqctxlen) + { + memcpy(handle->outBuffer + handle->outEnd, epqctx, epqctxlen); + handle->outEnd += epqctxlen; + } + else + { + handle->outBuffer[handle->outEnd++] = 0; + handle->outBuffer[handle->outEnd++] = 0; + } handle->in_extended_query = true; return 0; @@ -2463,7 +2478,7 @@ pgxc_node_send_query_extended(PGXCNodeHandle *handle, const char *query, if (query) if (pgxc_node_send_parse(handle, statement, query, num_params, param_types)) return EOF; - if (pgxc_node_send_bind(handle, portal, statement, paramlen, params)) + if (pgxc_node_send_bind(handle, portal, statement, paramlen, params, 0, NULL)) return EOF; if (send_describe) if (pgxc_node_send_describe(handle, false, portal)) diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index 4ed9b7d9..def87c2d 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -2151,6 +2151,7 @@ exec_bind_message(StringInfo input_message) int16 *pformats = NULL; int numParams; int numRFormats; + int num_epq_tuple; int16 *rformats = NULL; CachedPlanSource *psrc; CachedPlan *cplan; @@ -2687,6 +2688,28 @@ exec_bind_message(StringInfo input_message) rformats[i] = pq_getmsgint(input_message, 2); } + /* Get epq context */ + num_epq_tuple = pq_getmsgint(input_message, 2); + if (num_epq_tuple > 0) + { + int i; + + portal->epqContext = palloc(sizeof(RemoteEPQContext)); + portal->epqContext->ntuples = num_epq_tuple; + portal->epqContext->tid = palloc(num_epq_tuple * sizeof(ItemPointerData)); + portal->epqContext->rtidx = palloc(num_epq_tuple * sizeof(int)); + portal->epqContext->nodeid = palloc(num_epq_tuple * sizeof(uint32)); + + for (i = 0; i < num_epq_tuple; i++) + { + portal->epqContext->rtidx[i] = pq_getmsgint(input_message, 2); + portal->epqContext->tid[i].ip_blkid.bi_hi = pq_getmsgint(input_message, 2); + portal->epqContext->tid[i].ip_blkid.bi_lo = pq_getmsgint(input_message, 2); + portal->epqContext->tid[i].ip_posid = pq_getmsgint(input_message, 2); + portal->epqContext->nodeid[i] = pq_getmsgint(input_message, 4); + } + } + pq_getmsgend(input_message); /* diff --git a/src/backend/tcop/pquery.c b/src/backend/tcop/pquery.c index 7a529dac..16179e73 100644 --- a/src/backend/tcop/pquery.c +++ b/src/backend/tcop/pquery.c @@ -129,6 +129,7 @@ CreateQueryDesc(PlannedStmt *plannedstmt, #ifdef __TBASE__ qd->sender = NULL; qd->es_param_exec_vals = NULL; + qd->epqContext = NULL; #endif /* not yet executed */ @@ -681,6 +682,13 @@ PortalStart(Portal portal, ParamListInfo params, params, NULL, 0); + + /* + * set information about EvalPlanQual if any, they will be fill in + * estate later after it been created. + */ + queryDesc->epqContext = portal->epqContext; + /* * If parent node have sent down parameters, and at least one * of them is PARAM_EXEC we should avoid "single execution" @@ -697,13 +705,13 @@ PortalStart(Portal portal, ParamListInfo params, * here since queryDesc->plannedstmt->nParamExec may be used * just to allocate space for them and no actual values passed. 
* - * If distributionType is LOCATOR_TYPE_SHARD, even with parameters - * PARAM_EXEC, still follow the redistribution logic, otherwise, - * it may cause SharedQueue conflict in the lower layer redistribution + * Also, if we are doing EvalPlanQual, we will be rescan soon, which + * is not supported in SharedQueue mode. Force to do it traditionally. */ #ifdef __TBASE__ - if (!paramPassDown && queryDesc->plannedstmt->nParamRemote > 0 && - queryDesc->plannedstmt->remoteparams[queryDesc->plannedstmt->nParamRemote-1].paramkind == PARAM_EXEC) + if ((!paramPassDown && queryDesc->plannedstmt->nParamRemote > 0 && + queryDesc->plannedstmt->remoteparams[queryDesc->plannedstmt->nParamRemote-1].paramkind == PARAM_EXEC) || + queryDesc->epqContext != NULL) #else if (queryDesc->plannedstmt->nParamRemote > 0 && queryDesc->plannedstmt->remoteparams[queryDesc->plannedstmt->nParamRemote-1].paramkind == PARAM_EXEC) @@ -1012,6 +1020,12 @@ PortalStart(Portal portal, ParamListInfo params, 0); /* + * set information about EvalPlanQual if any, they will be fill in + * estate later after it been created. + */ + queryDesc->epqContext = portal->epqContext; + + /* * If it's a scrollable cursor, executor needs to support * REWIND and backwards scan, as well as whatever the caller * might've asked for. diff --git a/src/include/executor/execdesc.h b/src/include/executor/execdesc.h index 94f6449d..00e26823 100644 --- a/src/include/executor/execdesc.h +++ b/src/include/executor/execdesc.h @@ -117,6 +117,7 @@ typedef struct QueryDesc #ifdef __TBASE__ DataPumpSender sender; /* used for locally data transfering */ ParamExecData *es_param_exec_vals; /* values of internal params */ + RemoteEPQContext *epqContext; /* information about EvalPlanQual from remote */ #endif int myindex; /* -1 if locally executed subplan is producing diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index 0b6c770c..1fdf29fe 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -596,6 +596,9 @@ typedef struct ExecAuxRowMark AttrNumber ctidAttNo; /* resno of ctid junk attribute, if any */ AttrNumber toidAttNo; /* resno of tableoid junk attribute, if any */ AttrNumber wholeAttNo; /* resno of whole-row junk attribute, if any */ +#ifdef __TBASE__ + AttrNumber nodeidAttNo; /* resno of xc_node_id junk attribute, if any */ +#endif } ExecAuxRowMark; @@ -995,6 +998,9 @@ typedef struct EPQState Plan *plan; /* plan tree to be executed */ List *arowMarks; /* ExecAuxRowMarks (non-locking only) */ int epqParam; /* ID of Param to force scan node re-eval */ +#ifdef __TBASE__ + EState *parentestate; /* parant EState, more information to modify plantree if needed */ +#endif } EPQState; @@ -2297,4 +2303,12 @@ typedef struct LimitState TupleTableSlot *subSlot; /* tuple last obtained from subplan */ } LimitState; +typedef struct RemoteEPQContext +{ + int ntuples; + int *rtidx; + ItemPointerData *tid; + uint32 *nodeid; +} RemoteEPQContext; + #endif /* EXECNODES_H */ diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h index 8a456994..ce1f6719 100644 --- a/src/include/nodes/plannodes.h +++ b/src/include/nodes/plannodes.h @@ -1100,4 +1100,9 @@ typedef struct PlanInvalItem uint32 hashValue; /* hash value of object's cache lookup key */ } PlanInvalItem; +extern bool plantree_walker(Plan *plan, + List *top_subplans, + bool (*walker) (), + void *context); + #endif /* PLANNODES_H */ diff --git a/src/include/pgxc/pgxcnode.h b/src/include/pgxc/pgxcnode.h index 4a2ee55b..3773cac2 100644 --- 
a/src/include/pgxc/pgxcnode.h +++ b/src/include/pgxc/pgxcnode.h @@ -210,7 +210,8 @@ extern int pgxc_node_send_apply(PGXCNodeHandle * handle, char * buf, int len, bo extern int pgxc_node_send_disconnect(PGXCNodeHandle * handle, char *cursor, int cons); #endif extern int pgxc_node_send_bind(PGXCNodeHandle * handle, const char *portal, - const char *statement, int paramlen, char *params); + const char *statement, int paramlen, const char *params, + int eqpctxlen, const char *epqctx); extern int pgxc_node_send_parse(PGXCNodeHandle * handle, const char* statement, const char *query, short num_params, Oid *param_types); extern int pgxc_node_send_flush(PGXCNodeHandle * handle); diff --git a/src/include/utils/portal.h b/src/include/utils/portal.h index 9c4515b8..2a4a6c42 100644 --- a/src/include/utils/portal.h +++ b/src/include/utils/portal.h @@ -263,6 +263,9 @@ typedef struct PortalData * portal marked failed in subtransaction * in AtSubAbort_Portals */ + + /* information about EvalPlanQual, pass it to queryDesc */ + RemoteEPQContext *epqContext; #endif } PortalData; diff --git a/src/test/regress/expected/subselect.out b/src/test/regress/expected/subselect.out index 17184f61..a1f9d561 100644 --- a/src/test/regress/expected/subselect.out +++ b/src/test/regress/expected/subselect.out @@ -1796,8 +1796,8 @@ select * from x where f1 = 1; explain (verbose, costs off) with x as (select * from (select f1 from subselect_tbl for update) ss) select * from x where f1 = 1; - QUERY PLAN --------------------------------------------------------------------------- + QUERY PLAN +---------------------------------------------------------------------------------------------------- CTE Scan on x Output: x.f1 Filter: (x.f1 = 1) @@ -1807,9 +1807,9 @@ select * from x where f1 = 1; -> Subquery Scan on ss Output: ss.f1 -> LockRows - Output: subselect_tbl.f1, subselect_tbl.ctid + Output: subselect_tbl.f1, subselect_tbl.ctid, subselect_tbl.xc_node_id -> Seq Scan on public.subselect_tbl - Output: subselect_tbl.f1, subselect_tbl.ctid + Output: subselect_tbl.f1, subselect_tbl.ctid, subselect_tbl.xc_node_id (12 rows) -- Multiply-referenced CTEs are inlined only when requested diff --git a/src/test/regress/expected/xc_FQS_join_1.out b/src/test/regress/expected/xc_FQS_join_1.out index c80fb0f2..18836c1e 100644 --- a/src/test/regress/expected/xc_FQS_join_1.out +++ b/src/test/regress/expected/xc_FQS_join_1.out @@ -691,12 +691,12 @@ explain (verbose on, nodes off, costs off) select * from tab1_mod, tab3_mod -- DMLs involving JOINs are not FQSed explain (verbose on, nodes off, costs off) update tab1_mod set val2 = 1000 from tab2_mod where tab1_mod.val = tab2_mod.val and tab1_mod. 
val2 = tab2_mod.val2; - QUERY PLAN ---------------------------------------------------------------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------------------------------------------------------------------------------------ Remote Subquery Scan on all -> Update on public.tab1_mod -> Merge Join - Output: tab1_mod.val, 1000, tab1_mod.xc_node_id, tab1_mod.ctid, tab1_mod.shardid, tab2_mod.ctid + Output: tab1_mod.val, 1000, tab1_mod.xc_node_id, tab1_mod.ctid, tab1_mod.shardid, tab2_mod.ctid, tab2_mod.xc_node_id Merge Cond: ((tab1_mod.val = tab2_mod.val) AND (tab1_mod.val2 = tab2_mod.val2)) -> Sort Output: tab1_mod.val, tab1_mod.xc_node_id, tab1_mod.ctid, tab1_mod.shardid, tab1_mod.val2 @@ -704,25 +704,25 @@ explain (verbose on, nodes off, costs off) update tab1_mod set val2 = 1000 from -> Seq Scan on public.tab1_mod Output: tab1_mod.val, tab1_mod.xc_node_id, tab1_mod.ctid, tab1_mod.shardid, tab1_mod.val2 -> Materialize - Output: tab2_mod.ctid, tab2_mod.val, tab2_mod.val2 + Output: tab2_mod.ctid, tab2_mod.xc_node_id, tab2_mod.val, tab2_mod.val2 -> Remote Subquery Scan on all - Output: tab2_mod.ctid, tab2_mod.val, tab2_mod.val2 + Output: tab2_mod.ctid, tab2_mod.xc_node_id, tab2_mod.val, tab2_mod.val2 Distribute results by M: val -> Sort - Output: tab2_mod.ctid, tab2_mod.val, tab2_mod.val2 + Output: tab2_mod.ctid, tab2_mod.xc_node_id, tab2_mod.val, tab2_mod.val2 Sort Key: tab2_mod.val, tab2_mod.val2 -> Seq Scan on public.tab2_mod - Output: tab2_mod.ctid, tab2_mod.val, tab2_mod.val2 + Output: tab2_mod.ctid, tab2_mod.xc_node_id, tab2_mod.val, tab2_mod.val2 (20 rows) explain (verbose on, nodes off, costs off) delete from tab1_mod using tab2_mod where tab1_mod.val = tab2_mod.val and tab1_mod.val2 = tab2_mod.val2; - QUERY PLAN ---------------------------------------------------------------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------------------------------------------------------------------------------ Remote Subquery Scan on all -> Delete on public.tab1_mod -> Merge Join - Output: tab1_mod.xc_node_id, tab1_mod.ctid, tab1_mod.shardid, tab1_mod.val, tab2_mod.ctid + Output: tab1_mod.xc_node_id, tab1_mod.ctid, tab1_mod.shardid, tab1_mod.val, tab2_mod.ctid, tab2_mod.xc_node_id Merge Cond: ((tab1_mod.val = tab2_mod.val) AND (tab1_mod.val2 = tab2_mod.val2)) -> Sort Output: tab1_mod.xc_node_id, tab1_mod.ctid, tab1_mod.shardid, tab1_mod.val, tab1_mod.val2 @@ -730,25 +730,25 @@ explain (verbose on, nodes off, costs off) delete from tab1_mod using tab2_mod -> Seq Scan on public.tab1_mod Output: tab1_mod.xc_node_id, tab1_mod.ctid, tab1_mod.shardid, tab1_mod.val, tab1_mod.val2 -> Materialize - Output: tab2_mod.ctid, tab2_mod.val, tab2_mod.val2 + Output: tab2_mod.ctid, tab2_mod.xc_node_id, tab2_mod.val, tab2_mod.val2 -> Remote Subquery Scan on all - Output: tab2_mod.ctid, tab2_mod.val, tab2_mod.val2 + Output: tab2_mod.ctid, tab2_mod.xc_node_id, tab2_mod.val, tab2_mod.val2 Distribute results by M: val -> Sort - Output: tab2_mod.ctid, tab2_mod.val, tab2_mod.val2 + Output: tab2_mod.ctid, tab2_mod.xc_node_id, tab2_mod.val, tab2_mod.val2 Sort Key: tab2_mod.val, tab2_mod.val2 -> Seq Scan on public.tab2_mod - Output: tab2_mod.ctid, tab2_mod.val, tab2_mod.val2 + Output: tab2_mod.ctid, tab2_mod.xc_node_id, tab2_mod.val, tab2_mod.val2 (20 rows) explain (verbose on, nodes off, costs off) update tab1_rep set val2 = 1000 from tab2_rep where tab1_rep.val = tab2_rep.val and 
tab1_rep.val2 = tab2_rep.val2; - QUERY PLAN ------------------------------------------------------------------------------------------------- + QUERY PLAN +--------------------------------------------------------------------------------------------------------------- Remote Subquery Scan on any -> Update on public.tab1_rep -> Merge Join - Output: tab1_rep.val, 1000, tab1_rep.ctid, tab1_rep.shardid, tab2_rep.ctid + Output: tab1_rep.val, 1000, tab1_rep.ctid, tab1_rep.shardid, tab2_rep.ctid, tab2_rep.xc_node_id Merge Cond: ((tab1_rep.val = tab2_rep.val) AND (tab1_rep.val2 = tab2_rep.val2)) -> Sort Output: tab1_rep.val, tab1_rep.ctid, tab1_rep.shardid, tab1_rep.val2 @@ -756,20 +756,20 @@ explain (verbose on, nodes off, costs off) update tab1_rep set val2 = 1000 from -> Seq Scan on public.tab1_rep Output: tab1_rep.val, tab1_rep.ctid, tab1_rep.shardid, tab1_rep.val2 -> Sort - Output: tab2_rep.ctid, tab2_rep.val, tab2_rep.val2 + Output: tab2_rep.ctid, tab2_rep.xc_node_id, tab2_rep.val, tab2_rep.val2 Sort Key: tab2_rep.val, tab2_rep.val2 -> Seq Scan on public.tab2_rep - Output: tab2_rep.ctid, tab2_rep.val, tab2_rep.val2 + Output: tab2_rep.ctid, tab2_rep.xc_node_id, tab2_rep.val, tab2_rep.val2 (15 rows) explain (verbose on, nodes off, costs off) delete from tab1_rep using tab2_rep where tab1_rep.val = tab2_rep.val and tab1_rep.val2 = tab2_rep.val2; - QUERY PLAN ------------------------------------------------------------------------------------------------- + QUERY PLAN +--------------------------------------------------------------------------------------------------- Remote Subquery Scan on any -> Delete on public.tab1_rep -> Merge Join - Output: tab1_rep.ctid, tab1_rep.shardid, tab2_rep.ctid + Output: tab1_rep.ctid, tab1_rep.shardid, tab2_rep.ctid, tab2_rep.xc_node_id Merge Cond: ((tab1_rep.val = tab2_rep.val) AND (tab1_rep.val2 = tab2_rep.val2)) -> Sort Output: tab1_rep.ctid, tab1_rep.shardid, tab1_rep.val, tab1_rep.val2 @@ -777,10 +777,10 @@ explain (verbose on, nodes off, costs off) delete from tab1_rep using tab2_rep -> Seq Scan on public.tab1_rep Output: tab1_rep.ctid, tab1_rep.shardid, tab1_rep.val, tab1_rep.val2 -> Sort - Output: tab2_rep.ctid, tab2_rep.val, tab2_rep.val2 + Output: tab2_rep.ctid, tab2_rep.xc_node_id, tab2_rep.val, tab2_rep.val2 Sort Key: tab2_rep.val, tab2_rep.val2 -> Seq Scan on public.tab2_rep - Output: tab2_rep.ctid, tab2_rep.val, tab2_rep.val2 + Output: tab2_rep.ctid, tab2_rep.xc_node_id, tab2_rep.val, tab2_rep.val2 (15 rows) drop table tab1_rep; diff --git a/src/test/regress/expected/xc_for_update_1.out b/src/test/regress/expected/xc_for_update_1.out index b9de2234..66a13a33 100644 --- a/src/test/regress/expected/xc_for_update_1.out +++ b/src/test/regress/expected/xc_for_update_1.out @@ -97,12 +97,12 @@ explain (costs off, num_nodes off, nodes off, verbose on) select * from t1 for QUERY PLAN ------------------------------------------------------------------ Remote Fast Query Execution - Output: t1.val, t1.val2, t1.ctid + Output: t1.val, t1.val2, t1.ctid, t1.xc_node_id Remote query: SELECT val, val2 FROM t1 FOR UPDATE OF t1 NOWAIT -> LockRows - Output: val, val2, ctid + Output: val, val2, ctid, xc_node_id -> Seq Scan on public.t1 - Output: val, val2, ctid + Output: val, val2, ctid, xc_node_id (7 rows) -- two table case @@ -279,23 +279,23 @@ select * from t1 join t2 on (t1.val2 = t2.val2) join t3 on (t1.val2 = t3.val2) f ERROR: FOR UPDATE is not allowed with joins -- check a few subquery cases explain (costs off, num_nodes off, nodes off, verbose on) select * from 
(select * from t1 for update of t1 nowait) as foo; - QUERY PLAN ------------------------------------------------------- + QUERY PLAN +--------------------------------------------------------------------- Remote Subquery Scan on all Output: foo.val, foo.val2 -> Subquery Scan on foo Output: foo.val, foo.val2 -> LockRows - Output: t1.val, t1.val2, t1.ctid + Output: t1.val, t1.val2, t1.ctid, t1.xc_node_id -> Seq Scan on public.t1 - Output: t1.val, t1.val2, t1.ctid + Output: t1.val, t1.val2, t1.ctid, t1.xc_node_id (8 rows) explain (costs off, num_nodes off, nodes off, verbose on) select * from t1 where val in (select val from t2 for update of t2 nowait) for update; ERROR: FOR UPDATE is not allowed with joins explain (costs off, num_nodes off, nodes off, verbose on) select * from t1 where val in (select val from t2 for update of t2 nowait); - QUERY PLAN ---------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------------------------------ Remote Subquery Scan on all Output: t1.val, t1.val2 -> Hash Join @@ -312,9 +312,9 @@ explain (costs off, num_nodes off, nodes off, verbose on) select * from t1 wher -> Subquery Scan on "ANY_subquery" Output: "ANY_subquery".val -> LockRows - Output: t2.val, t2.ctid + Output: t2.val, t2.ctid, t2.xc_node_id -> Seq Scan on public.t2 - Output: t2.val, t2.ctid + Output: t2.val, t2.ctid, t2.xc_node_id (19 rows) -- test multiple row marks @@ -325,48 +325,48 @@ explain (costs off, num_nodes off, nodes off, verbose on) select * from t1 for QUERY PLAN ----------------------------------------------------------- Remote Fast Query Execution - Output: t1.val, t1.val2, t1.ctid + Output: t1.val, t1.val2, t1.ctid, t1.xc_node_id Remote query: SELECT val, val2 FROM t1 FOR UPDATE OF t1 -> LockRows - Output: val, val2, ctid + Output: val, val2, ctid, xc_node_id -> Seq Scan on public.t1 - Output: val, val2, ctid + Output: val, val2, ctid, xc_node_id (7 rows) explain (costs off, num_nodes off, nodes off, verbose on) select * from t1 for update of t1 for share of t1; QUERY PLAN ----------------------------------------------------------- Remote Fast Query Execution - Output: t1.val, t1.val2, t1.ctid + Output: t1.val, t1.val2, t1.ctid, t1.xc_node_id Remote query: SELECT val, val2 FROM t1 FOR UPDATE OF t1 -> LockRows - Output: val, val2, ctid + Output: val, val2, ctid, xc_node_id -> Seq Scan on public.t1 - Output: val, val2, ctid + Output: val, val2, ctid, xc_node_id (7 rows) explain (costs off, num_nodes off, nodes off, verbose on) select * from t1 for share of t1 for share of t1 for update of t1; QUERY PLAN ----------------------------------------------------------- Remote Fast Query Execution - Output: t1.val, t1.val2, t1.ctid + Output: t1.val, t1.val2, t1.ctid, t1.xc_node_id Remote query: SELECT val, val2 FROM t1 FOR UPDATE OF t1 -> LockRows - Output: val, val2, ctid + Output: val, val2, ctid, xc_node_id -> Seq Scan on public.t1 - Output: val, val2, ctid + Output: val, val2, ctid, xc_node_id (7 rows) explain (costs off, num_nodes off, nodes off, verbose on) select * from t1 for share of t1 for share of t1 for share of t1; QUERY PLAN ---------------------------------------------------------- Remote Fast Query Execution - Output: t1.val, t1.val2, t1.ctid + Output: t1.val, t1.val2, t1.ctid, t1.xc_node_id Remote query: SELECT val, val2 FROM t1 FOR SHARE OF t1 -> LockRows - Output: val, val2, ctid + Output: val, val2, ctid, xc_node_id -> Seq Scan on public.t1 - Output: val, val2, ctid + Output: val, val2, ctid, 
xc_node_id (7 rows) -- make sure NOWAIT is used in remote query even if it is not mentioned with FOR UPDATE clause @@ -374,12 +374,12 @@ explain (costs off, num_nodes off, nodes off, verbose on) select * from t1 for QUERY PLAN ------------------------------------------------------------------ Remote Fast Query Execution - Output: t1.val, t1.val2, t1.ctid + Output: t1.val, t1.val2, t1.ctid, t1.xc_node_id Remote query: SELECT val, val2 FROM t1 FOR UPDATE OF t1 NOWAIT -> LockRows - Output: val, val2, ctid + Output: val, val2, ctid, xc_node_id -> Seq Scan on public.t1 - Output: val, val2, ctid + Output: val, val2, ctid, xc_node_id (7 rows) -- same table , different aliases and different row marks for different aliases @@ -409,17 +409,17 @@ explain (costs off, num_nodes off, nodes off, verbose on) WITH q1 AS (SELECT * (4 rows) explain (costs off, num_nodes off, nodes off, verbose on) WITH q1 AS (SELECT * from t1 FOR UPDATE) SELECT * FROM q1 FOR UPDATE; - QUERY PLAN --------------------------------------------------------- + QUERY PLAN +----------------------------------------------------------------------- CTE Scan on q1 Output: q1.val, q1.val2 CTE q1 -> Remote Subquery Scan on all - Output: t1.val, t1.val2, t1.ctid + Output: t1.val, t1.val2, t1.ctid, t1.xc_node_id -> LockRows - Output: t1.val, t1.val2, t1.ctid + Output: t1.val, t1.val2, t1.ctid, t1.xc_node_id -> Seq Scan on public.t1 - Output: t1.val, t1.val2, t1.ctid + Output: t1.val, t1.val2, t1.ctid, t1.xc_node_id (9 rows) -- test case of inheried tables @@ -433,17 +433,17 @@ select * from p1 order by 1 for update; (4 rows) explain (costs off, num_nodes off, nodes off, verbose on) select * from p1 for update; - QUERY PLAN --------------------------------------------------------------- + QUERY PLAN +----------------------------------------------------------------------------- Remote Subquery Scan on all - Output: a, b, ctid, tableoid + Output: a, b, ctid, xc_node_id, tableoid -> LockRows - Output: p1.a, p1.b, p1.ctid, p1.tableoid + Output: p1.a, p1.b, p1.ctid, p1.xc_node_id, p1.tableoid -> Append -> Seq Scan on public.p1 - Output: p1.a, p1.b, p1.ctid, p1.tableoid + Output: p1.a, p1.b, p1.ctid, p1.xc_node_id, p1.tableoid -> Seq Scan on public.c1 - Output: c1.a, c1.b, c1.ctid, c1.tableoid + Output: c1.a, c1.b, c1.ctid, c1.xc_node_id, c1.tableoid (9 rows) select * from c1 order by 1 for update; @@ -457,12 +457,12 @@ explain (costs off, num_nodes off, nodes off, verbose on) select * from c1 for QUERY PLAN ------------------------------------------------------------ Remote Fast Query Execution - Output: c1.a, c1.b, c1.d, c1.e, c1.ctid + Output: c1.a, c1.b, c1.d, c1.e, c1.ctid, c1.xc_node_id Remote query: SELECT a, b, d, e FROM c1 FOR UPDATE OF c1 -> LockRows - Output: a, b, d, e, ctid + Output: a, b, d, e, ctid, xc_node_id -> Seq Scan on public.c1 - Output: a, b, d, e, ctid + Output: a, b, d, e, ctid, xc_node_id (7 rows) -- confirm that in various join scenarios for update gets to the remote query From 3373fc7eece4e92796654fd55583b62e7acc9498 Mon Sep 17 00:00:00 2001 From: ceciliasu Date: Mon, 10 May 2021 17:52:35 +0800 Subject: [PATCH 110/578] fix bug of not refresh relcache after clean-sharding. 
http://tapd.oa.com/20418349/bugtrace/bugs/view?bug_id=1020418349087059509 --- src/backend/pgxc/shard/shardmap.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/backend/pgxc/shard/shardmap.c b/src/backend/pgxc/shard/shardmap.c index d0a2e242..38b5044a 100644 --- a/src/backend/pgxc/shard/shardmap.c +++ b/src/backend/pgxc/shard/shardmap.c @@ -71,6 +71,7 @@ #include "utils/lsyscache.h" #include "utils/fmgroids.h" #include "utils/rel.h" +#include "utils/inval.h" #include "pgxc/shardmap.h" #include "pgxc/pgxc.h" #include "pgxc/pgxcnode.h" @@ -1936,6 +1937,12 @@ void ForceRefreshShardMap(Oid groupoid) } } LWLockRelease(ShardMapLock); + + /* + * Invalidate the relcache after refresh shard map in shmem, + * because Relation->rd_locator_info changed. + */ + CacheInvalidateRelcacheAll(); } /* From b223e1595d92ce2d2afb359180e1f9cac2e1121e Mon Sep 17 00:00:00 2001 From: andrelin Date: Thu, 13 May 2021 13:09:14 +0800 Subject: [PATCH 111/578] Only datanodes need to parse epqTuples from remote --- src/backend/tcop/postgres.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index def87c2d..db2b5639 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -2688,7 +2688,9 @@ exec_bind_message(StringInfo input_message) rformats[i] = pq_getmsgint(input_message, 2); } - /* Get epq context */ + /* Get epq context, only datanodes need them */ + if (IS_PGXC_DATANODE && (IsConnFromCoord() || IsConnFromDatanode())) + { num_epq_tuple = pq_getmsgint(input_message, 2); if (num_epq_tuple > 0) { @@ -2709,6 +2711,7 @@ exec_bind_message(StringInfo input_message) portal->epqContext->nodeid[i] = pq_getmsgint(input_message, 4); } } + } pq_getmsgend(input_message); From da2bbd16e2f7246e3b4f87ffbd9a9f2b86484867 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Thu, 3 Jun 2021 15:10:41 +0800 Subject: [PATCH 112/578] fix compile errors --- src/backend/commands/vacuumlazy.c | 8 ++--- src/backend/pgxc/pool/poolmgr.c | 32 +++++++++++++++++++- src/backend/utils/adt/rowtypes.c | 4 +-- src/backend/utils/misc/guc.c | 13 +------- src/backend/utils/misc/mls.c | 12 ++++---- src/test/regress/expected/create_index_1.out | 2 -- src/test/regress/expected/sysviews.out | 1 + 7 files changed, 45 insertions(+), 27 deletions(-) diff --git a/src/backend/commands/vacuumlazy.c b/src/backend/commands/vacuumlazy.c index 92fe6c94..4796152a 100644 --- a/src/backend/commands/vacuumlazy.c +++ b/src/backend/commands/vacuumlazy.c @@ -2831,7 +2831,7 @@ MaintainGTS(Relation rel, BlockNumber blkno, Buffer buffer) HeapTupleHeaderXminCommitted(tuphdr) && !HeapTupleHeaderXminFrozen(tuphdr)) { - GlobalTimestamp tuple_xmin_gts = HeapTupleHeaderGetXminTimestampAtomic(tuphdr); + GlobalTimestamp tuple_xmin_gts = HeapTupleHderGetXminTimestapAtomic(tuphdr); if (GlobalTimestampIsValid(tuple_xmin_gts) && !CommitTimestampIsLocal(tuple_xmin_gts) @@ -2857,7 +2857,7 @@ MaintainGTS(Relation rel, BlockNumber blkno, Buffer buffer) if (reset) { changed = true; - HeapTupleHeaderSetXminTimestampAtomic(tuphdr, tlog_xmin_gts); + HeapTupleHderSetXminTimestapAtomic(tuphdr, tlog_xmin_gts); elog(WARNING, "relfilenode %u " "pageno %u lineoff %u xmin %u xmin_gts "INT64_FORMAT" " @@ -2878,7 +2878,7 @@ MaintainGTS(Relation rel, BlockNumber blkno, Buffer buffer) if (TransactionIdIsNormal(xmax) && HeapTupleHeaderXmaxCommitted(tuphdr)) { - GlobalTimestamp tuple_xmax_gts = HeapTupleHeaderGetXmaxTimestampAtomic(tuphdr); + GlobalTimestamp tuple_xmax_gts = 
HeapTupleHderGetXmaxTimestapAtomic(tuphdr); if (GlobalTimestampIsValid(tuple_xmax_gts) && !CommitTimestampIsLocal(tuple_xmax_gts) @@ -2905,7 +2905,7 @@ MaintainGTS(Relation rel, BlockNumber blkno, Buffer buffer) if (reset) { changed = true; - HeapTupleHeaderSetXmaxTimestampAtomic(tuphdr, tlog_xmax_gts); + HeapTupleHderSetXmaxTimestapAtomic(tuphdr, tlog_xmax_gts); elog(WARNING, "relfilenode " "%u pageno %u lineoff %u " diff --git a/src/backend/pgxc/pool/poolmgr.c b/src/backend/pgxc/pool/poolmgr.c index f64b45c3..4e1da81f 100644 --- a/src/backend/pgxc/pool/poolmgr.c +++ b/src/backend/pgxc/pool/poolmgr.c @@ -2102,6 +2102,8 @@ agent_handle_input(PoolAgent * agent, StringInfo s) { int res; + agent->cmd_start_time = get_system_time(); + /* * During a pool cleaning, Abort, Connect and Get Connections messages * are not allowed on pooler side. @@ -2232,7 +2234,19 @@ agent_handle_input(PoolAgent * agent, StringInfo s) /* Send result */ pool_sendres(&agent->port, res, NULL, 0, true); break; - + + case 'x': /* get command statistics */ + handle_get_cmd_statistics(agent); + break; + + case 'y': /* reset command statistics */ + reset_pooler_cmd_statistics(); + break; + + case 'z': /* get connections statistics */ + handle_get_conn_statistics(agent); + break; + case EOF: /* EOF */ agent_destroy(agent); return; @@ -2242,6 +2256,12 @@ agent_handle_input(PoolAgent * agent, StringInfo s) return; } + /* if cmd_start_time is not 0, means cmd handle in main loop sync, statistic here */ + if (agent->cmd_start_time != 0) + { + update_pooler_cmd_statistics(qtype, get_system_time() - agent->cmd_start_time); + } + /* avoid reading from connection */ if ((qtype = pool_pollbyte(&agent->port)) == EOF) break; @@ -6263,6 +6283,11 @@ static void pooler_handle_sync_response_queue(void) abort(); } } + + if (connRsp->cmd_start_time != 0) + { + update_pooler_cmd_statistics(connRsp->cmd, connRsp->cmd_end_time - connRsp->cmd_start_time); + } /* handle pending agent, if any */ agent_handle_pending_agent(agent); @@ -7981,6 +8006,11 @@ void *pooler_sync_remote_operator_thread(void *arg) { gettimeofday(&request->end_time, NULL); } + + if (request->cmd_start_time != 0) + { + request->cmd_end_time = get_system_time(); + } /* clear task status */ pooler_async_task_done(&g_PoolSyncNetworkControl, threadIndex); diff --git a/src/backend/utils/adt/rowtypes.c b/src/backend/utils/adt/rowtypes.c index 3ba7b15c..38e30728 100644 --- a/src/backend/utils/adt/rowtypes.c +++ b/src/backend/utils/adt/rowtypes.c @@ -372,7 +372,7 @@ record_out(PG_FUNCTION_ARGS) */ if (IS_PGXC_DATANODE && tupdesc->attrs_ext) { - transparent_crypt_decrypt_all_cols_value_copy(&tuple, tupdesc, values, nulls); + trsprt_crypt_dcrpt_all_col_vale_cp(&tuple, tupdesc, values, nulls); } else { @@ -390,7 +390,7 @@ record_out(PG_FUNCTION_ARGS) if (OidIsValid(parentOid) && datamask_check_table_has_datamask(parentOid)) { - datamask_exchange_all_cols_value_copy(tupdesc, values, nulls, parentOid); + dmask_exchg_all_cols_value_copy(tupdesc, values, nulls, parentOid); } /* And build the result string */ diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index c1770cd1..5a6afb51 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -2109,18 +2109,7 @@ static struct config_bool ConfigureNamesBool[] = &g_enable_bouncer, false, NULL, NULL, NULL - }, - - { - { - "enable_pgbouncer", PGC_SIGHUP, STATS_COLLECTOR, - gettext_noop("use pgbouncer as coordinator connection pool."), - NULL - }, - &g_enable_bouncer, - false, - NULL, NULL, NULL - }, + }, 
{ { diff --git a/src/backend/utils/misc/mls.c b/src/backend/utils/misc/mls.c index c29ed21c..8902bcaf 100644 --- a/src/backend/utils/misc/mls.c +++ b/src/backend/utils/misc/mls.c @@ -380,8 +380,8 @@ bool mls_check_relation_permission(Oid relid, bool * schema_bound) return true; } - if (transparent_crypt_check_table_has_crypto(parent_oid, true, schema_bound) || - transparent_crypt_check_table_has_crypto(relid, true, schema_bound)) + if (trsprt_crypt_check_table_has_crypt(parent_oid, true, schema_bound) || + trsprt_crypt_check_table_has_crypt(relid, true, schema_bound)) { return true; } @@ -438,14 +438,14 @@ bool mls_check_column_permission(Oid relid, int attnum) { parent_oid = mls_get_parent_oid_by_relid(relid); - if (datamask_check_table_col_has_datamask(parent_oid, attnum) || - datamask_check_table_col_has_datamask(relid, attnum)) + if (dmask_check_table_col_has_dmask(parent_oid, attnum) || + dmask_check_table_col_has_dmask(relid, attnum)) { return true; } - if (transparent_crypt_check_table_col_has_crypto(parent_oid, attnum) || - transparent_crypt_check_table_col_has_crypto(relid, attnum)) + if (trsprt_crypt_chk_tbl_col_has_crypt(parent_oid, attnum) || + trsprt_crypt_chk_tbl_col_has_crypt(relid, attnum)) { return true; } diff --git a/src/test/regress/expected/create_index_1.out b/src/test/regress/expected/create_index_1.out index 30f42019..924c7c95 100644 --- a/src/test/regress/expected/create_index_1.out +++ b/src/test/regress/expected/create_index_1.out @@ -2622,7 +2622,6 @@ DROP INDEX CONCURRENTLY "concur_index2"; -- works ERROR: index "concur_index2" does not exist DROP INDEX CONCURRENTLY IF EXISTS "concur_index2"; -- notice NOTICE: index "concur_index2" does not exist, skipping -ERROR: DROP INDEX CONCURRENTLY cannot run inside a transaction block -- failures DROP INDEX CONCURRENTLY "concur_index2", "concur_index3"; ERROR: index "concur_index2" does not exist @@ -2633,7 +2632,6 @@ ROLLBACK; -- successes DROP INDEX CONCURRENTLY IF EXISTS "concur_index3"; NOTICE: index "concur_index3" does not exist, skipping -ERROR: DROP INDEX CONCURRENTLY cannot run inside a transaction block DROP INDEX CONCURRENTLY "concur_index4"; ERROR: index "concur_index4" does not exist DROP INDEX CONCURRENTLY "concur_index5"; diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out index 2ab99d9d..48b73026 100644 --- a/src/test/regress/expected/sysviews.out +++ b/src/test/regress/expected/sysviews.out @@ -120,6 +120,7 @@ select name, setting from pg_settings where name like 'enable%'; enable_pooler_stuck_exit | off enable_pullup_subquery | on enable_replication_slot_debug | off + enable_sampling_analyze | on enable_seqscan | on enable_shard_statistic | on enable_sort | on From 7ff1831dede2d81ea827148a221a7d08418262d4 Mon Sep 17 00:00:00 2001 From: andrelin Date: Thu, 31 Dec 2020 11:41:45 +0800 Subject: [PATCH 113/578] [Bugfix] hash value calculation during redistributing data (merge request !65) Should break after hash a type of datum, result may be wrong or even cause data distribute to a single DN TAPD[ID84546415]: http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131084546415 --- src/backend/executor/nodeAgg.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/backend/executor/nodeAgg.c b/src/backend/executor/nodeAgg.c index d0610f2a..d771a28e 100644 --- a/src/backend/executor/nodeAgg.c +++ b/src/backend/executor/nodeAgg.c @@ -6122,16 +6122,19 @@ ReDistributeHash(Oid dataType, int numWorkers, Datum value, LocatorHashFunc hash int64 val = DatumGetInt64(value); 
result = (val % num) % numWorkers; } + break; case INT2OID: { int16 val = DatumGetInt16(value); result = (val % num) % numWorkers; } + break; case OIDOID: { uint32 val = (uint32)DatumGetObjectId(value); result = (val % num) % numWorkers; } + break; case INT4OID: case ABSTIMEOID: case RELTIMEOID: @@ -6140,12 +6143,14 @@ ReDistributeHash(Oid dataType, int numWorkers, Datum value, LocatorHashFunc hash int32 val = DatumGetInt32(value); result = (val % num) % numWorkers; } + break; case BOOLOID: case CHAROID: { int32 val = (int32)DatumGetChar(value); result = (val % num) % numWorkers; } + break; case TIMEOID: case TIMESTAMPOID: case TIMESTAMPTZOID: @@ -6153,6 +6158,7 @@ ReDistributeHash(Oid dataType, int numWorkers, Datum value, LocatorHashFunc hash int64 val = DatumGetInt64(value); result = (val % num) % numWorkers; } + break; default: { unsigned int hashvalue = 0; From e9af66964cca23897ca69691a9c2c92861493eee Mon Sep 17 00:00:00 2001 From: andrelin Date: Wed, 13 Jan 2021 13:09:52 +0800 Subject: [PATCH 114/578] Support pull up subquery which has more than 2 RTE (merge request !92) it's an unnecessary limitation and add limitation about LIMIT expression, adjust code style TAPD: http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131084728795 --- src/backend/optimizer/plan/subselect.c | 6 +-- src/test/regress/expected/subselect.out | 50 +++++++++++++++++++++++++ src/test/regress/sql/subselect.sql | 24 ++++++++++++ 3 files changed, 75 insertions(+), 5 deletions(-) diff --git a/src/backend/optimizer/plan/subselect.c b/src/backend/optimizer/plan/subselect.c index 7c342fc3..3e3339f8 100644 --- a/src/backend/optimizer/plan/subselect.c +++ b/src/backend/optimizer/plan/subselect.c @@ -1713,6 +1713,7 @@ simplify_EXPR_query(PlannerInfo *root, Query *query) query->hasModifyingCTE || query->havingQual || query->limitOffset || + query->limitCount || query->rowMarks || query->hasSubLinks || query->cteList || @@ -2604,11 +2605,6 @@ convert_EXPR_sublink_to_join(PlannerInfo *root, OpExpr *expr, return NULL; } - if (list_length(((Query *)sublink->subselect)->rtable) > 2) - { - return NULL; - } - subselect = (Query *)copyObject(sublink->subselect); /* we can just handle simple case now! 
*/ diff --git a/src/test/regress/expected/subselect.out b/src/test/regress/expected/subselect.out index a1f9d561..4691a4a9 100644 --- a/src/test/regress/expected/subselect.out +++ b/src/test/regress/expected/subselect.out @@ -1730,6 +1730,56 @@ select * from tbl_a a where a.b IN (select b.a from tbl_b b where b.b > a.b); drop table tbl_a; drop table tbl_b; +-- more RTEs in subquery +CREATE TABLE sub_t1 (a int4, b int4); +CREATE TABLE sub_t2 (a int4, b int4); +CREATE TABLE sub_interfere1 (a int4, b int4); +CREATE TABLE sub_interfere2 (a int4, b int4); +explain (costs off) +select 1 from + sub_t1 t1, + sub_t2 t2 +where t2.a = ( + select + min(t2.a) + from + sub_t2 t2, + sub_interfere1, + sub_interfere2 + where + t1.a = t2.a +); + QUERY PLAN +----------------------------------------------------------------------------------------------------- + Hash Join + Hash Cond: ("EXPR_subquery".min = t2.a) + -> Hash Left Join + Hash Cond: (t1.a = "EXPR_subquery".a) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on sub_t1 t1 + -> Hash + -> Subquery Scan on "EXPR_subquery" + -> HashAggregate + Group Key: t2_1.a + -> Nested Loop + -> Nested Loop + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on sub_t2 t2_1 + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on sub_interfere1 + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on sub_interfere2 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on sub_t2 t2 +(23 rows) + +DROP TABLE sub_t1; +DROP TABLE sub_t2; +DROP TABLE sub_interfere1; +DROP TABLE sub_interfere2; set enable_pullup_subquery to false; -- -- Tests for CTE inlining behavior diff --git a/src/test/regress/sql/subselect.sql b/src/test/regress/sql/subselect.sql index 66c01e19..256ddefa 100644 --- a/src/test/regress/sql/subselect.sql +++ b/src/test/regress/sql/subselect.sql @@ -703,6 +703,30 @@ select * from tbl_a a where a.b IN (select b.a from tbl_b b where b.b > a.b); drop table tbl_a; drop table tbl_b; + +-- more RTEs in subquery +CREATE TABLE sub_t1 (a int4, b int4); +CREATE TABLE sub_t2 (a int4, b int4); +CREATE TABLE sub_interfere1 (a int4, b int4); +CREATE TABLE sub_interfere2 (a int4, b int4); +explain (costs off) +select 1 from + sub_t1 t1, + sub_t2 t2 +where t2.a = ( + select + min(t2.a) + from + sub_t2 t2, + sub_interfere1, + sub_interfere2 + where + t1.a = t2.a +); +DROP TABLE sub_t1; +DROP TABLE sub_t2; +DROP TABLE sub_interfere1; +DROP TABLE sub_interfere2; set enable_pullup_subquery to false; -- From df822f2db1693ac030e16aa0473540780ea76991 Mon Sep 17 00:00:00 2001 From: andrelin Date: Sun, 17 Jan 2021 22:04:16 +0800 Subject: [PATCH 115/578] Support tables join between different group by pulling up to CN (merge request !66) The original check relied too much on global variables and guc values, and mistakenly prevented the unique path from pulling to CN for calculation. We deal with it by removing guc and a global variable, and guard it at a more proper position. Same guard was added in the "INSERT INTO SELECT FROM" case. 
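As an illustrative sketch only (not part of the patch itself): the kind of query this
change affects is a join between two shard tables placed in different node groups.
The table and group names below are made up, and the DISTRIBUTE BY SHARD ... TO GROUP
syntax is assumed from TBase DDL; before this change such a join was rejected at plan
time, and after it the join is planned by pulling both sides up to the coordinator (CN).

    -- hypothetical tables in two different node groups
    CREATE TABLE orders_g1 (id int, amount int) DISTRIBUTE BY SHARD (id) TO GROUP group1;
    CREATE TABLE orders_g2 (id int, amount int) DISTRIBUTE BY SHARD (id) TO GROUP group2;

    -- previously rejected with a "shard tables from different groups" error;
    -- now planned with both sides pulled up to the CN instead of redistributed
    EXPLAIN (costs off)
    SELECT * FROM orders_g1 o1 JOIN orders_g2 o2 ON o1.id = o2.id;
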
--- src/backend/optimizer/plan/createplan.c | 32 ---------------- src/backend/optimizer/plan/planner.c | 20 ---------- src/backend/optimizer/util/pathnode.c | 51 +++++++++++++++++++++---- src/backend/parser/analyze.c | 24 +++++++++--- src/backend/pgxc/plan/planner.c | 18 --------- src/backend/utils/misc/guc.c | 10 ----- src/include/optimizer/planmain.h | 2 - src/test/regress/expected/sysviews.out | 3 +- 8 files changed, 62 insertions(+), 98 deletions(-) diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index 23d04153..72a495e1 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -108,7 +108,6 @@ bool enable_group_across_query = false; bool enable_distributed_unique_plan = false; #endif #ifdef __COLD_HOT__ -bool has_distribute_remote_plan = false; bool has_cold_hot_table = false; #endif static Plan *create_plan_recurse(PlannerInfo *root, Path *best_path, @@ -701,26 +700,11 @@ create_scan_plan(PlannerInfo *root, Path *best_path, int flags) if (AttributeNumberIsValid(loc->secAttrNum) || OidIsValid(loc->coldGroupId)) { - if (has_distribute_remote_plan && list_length(groupOids) != 1) - { - error = true; - } - else - { has_cold_hot_table = true; } } - } heap_close(relation, NoLock); - - if (error) - { - has_distribute_remote_plan = false; - has_cold_hot_table = false; - - elog(ERROR, "Tables which located in more than one group could not involved in query with join or redistribution"); - } } #endif @@ -6408,22 +6392,6 @@ make_remotesubplan(PlannerInfo *root, Assert(!equal(resultDistribution, execDistribution)); Assert(!IsA(lefttree, RemoteSubplan)); -#ifdef __COLD_HOT__ - if (distributionType != LOCATOR_TYPE_NONE) - { - if (has_cold_hot_table && list_length(groupOids) != 1 && root->parse->commandType != CMD_INSERT) - { - has_cold_hot_table = false; - has_distribute_remote_plan = false; - elog(ERROR, "Tables which located in more than one group could not involved in query with join or redistribution"); - } - else - { - has_distribute_remote_plan = true; - } - } -#endif - #ifdef __TBASE__ if((IsA(lefttree, HashJoin) || IsA(lefttree, SeqScan) || IsA(lefttree, Agg) || IsA(lefttree, Group) || diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index 6ed3f131..4e9eda21 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -288,7 +288,6 @@ standard_planner(Query *parse, int cursorOptions, ParamListInfo boundParams) groupOids = NULL; #endif #ifdef __COLD_HOT__ - has_distribute_remote_plan = false; has_cold_hot_table = false; #endif /* @@ -526,25 +525,6 @@ standard_planner(Query *parse, int cursorOptions, ParamListInfo boundParams) result->partpruning = bms_copy(root->partpruning); #endif - -#ifdef __TBASE__ - /* - * sanity check - * tables from different groups can not be joined, and shard table join with other table type - * also permitted. 
- */ - { - if (list_length(groupOids) > 1 && !enable_group_across_query && !has_cold_hot_table) - { - groupOids = NULL; - elog(ERROR, "Shard tables from different groups should not be invloved in one Query,\n" - "Shard tables should not be invloved in one Query with other tables, such as hash table."); - } - - groupOids = NULL; - } -#endif - return result; } diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index 464eccfb..bd6d510c 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -1682,6 +1682,33 @@ set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) goto pull_up; } + /* + * If outer or inner subpaths are distributed by shard and they do not exist + * in same node set, which means we may need to redistribute tuples to data + * nodes which use different router map to producer. + * We don't support that, so pull it up to CN to accomplish the join. + * + * TODO: + * 1. if the join is "REPLICATION join SHARD", and node set of SHARD table + * is subset of REPLICATION table, no need to pull up. + * 2. find out which side of this join needs to dispatch, and only decide + * whether to pull up by the distributionType of another side subpath. + * 3. pass target router map to another group maybe ? thus nothing need to + * pull up to CN. + */ + if (innerd && outerd && + (outerd->distributionType == LOCATOR_TYPE_SHARD || + (innerd->distributionType == LOCATOR_TYPE_SHARD)) && + !bms_equal(outerd->nodes, innerd->nodes)) + { + goto pull_up; + } + + /* + * the join of cold-hot tables must be pulled up to CN until we find a way + * to determine whether this join occurs in a specific group. + */ +#ifdef __COLD_HOT__ if (has_cold_hot_table) { if (list_length(groupOids) > 1) @@ -1691,9 +1718,10 @@ set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) else if (list_length(groupOids) < 1) { has_cold_hot_table = false; - elog(ERROR, "hot cold table joins without groups"); + elog(ERROR, "cold-hot table joins without groups"); } } +#endif #endif /* * If both subpaths are distributed by replication, the resulting @@ -2435,8 +2463,21 @@ set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) nodes = bms_add_member(nodes, i); #ifdef __TBASE__ + /* + * We end up here that we don't have replication table and whether + * 1. we have no shard table at both sides OR + * 2. we have shard table but spread in same node set + * so check distribution type and decide what's next. 
+ */ + if (innerd->distributionType == LOCATOR_TYPE_SHARD || + outerd->distributionType == LOCATOR_TYPE_SHARD) + { + /* must be same node set, just copy */ + Assert(bms_equal(innerd->nodes, innerd->nodes)); + nodes = bms_copy(outerd->nodes); + } /* check if we can distribute by shard */ - if (OidIsValid(group)) + else if (OidIsValid(group)) { int node_index; int32 dn_num; @@ -3101,12 +3142,6 @@ create_redistribute_grouping_path(PlannerInfo *root, Query *parse, Path *path) te = (TargetEntry *)list_nth(parse->targetList, groupColIdx[colIdx]-1); - if (list_length(groupOids) > 1 && !enable_group_across_query) - { - groupOids = NULL; - elog(ERROR, "Tables from different groups should not be invloved in one Query."); - } - if (groupOids) { group = linitial_oid(groupOids); diff --git a/src/backend/parser/analyze.c b/src/backend/parser/analyze.c index a157b502..27c44c45 100644 --- a/src/backend/parser/analyze.c +++ b/src/backend/parser/analyze.c @@ -703,18 +703,30 @@ transformInsertStmt(ParseState *pstate, InsertStmt *stmt) ParseState *sub_pstate = make_parsestate(pstate); Query *selectQuery; -#ifdef __COLD_HOT__ +#ifdef __TBASE__ /* prevent insert into cold_hot table select ... */ if (pstate->p_target_relation) { - RelationLocInfo *rel_loc_info = pstate->p_target_relation->rd_locator_info; + RelationLocInfo *target_rel_loc_info = pstate->p_target_relation->rd_locator_info; + RelationLocInfo *from_rel_loc_info; - if (rel_loc_info) + if (target_rel_loc_info && target_rel_loc_info->locatorType == LOCATOR_TYPE_SHARD) { - if (AttributeNumberIsValid(rel_loc_info->secAttrNum) - || OidIsValid(rel_loc_info->coldGroupId)) + foreach(lc, selectStmt->fromClause) { - elog(ERROR, "table in cold-hot group or key-value group could not join with other tables."); + Relation rel = heap_openrv((RangeVar *) lfirst(lc), AccessShareLock); + + from_rel_loc_info = rel->rd_locator_info; + if (from_rel_loc_info == NULL || /* from system table */ +#ifdef __COLD_HOT__ + from_rel_loc_info->coldGroupId != target_rel_loc_info->coldGroupId || +#endif + from_rel_loc_info->groupId != target_rel_loc_info->groupId) + { + elog(ERROR, "shard table could not be inserted from any other tables in different group"); + } + + heap_close(rel, AccessShareLock); } } } diff --git a/src/backend/pgxc/plan/planner.c b/src/backend/pgxc/plan/planner.c index c761f4a9..9bc141ad 100644 --- a/src/backend/pgxc/plan/planner.c +++ b/src/backend/pgxc/plan/planner.c @@ -349,24 +349,6 @@ pgxc_FQS_planner(Query *query, int cursorOptions, ParamListInfo boundParams) result->invalItems = glob->invalItems; result->rowMarks = glob->finalrowmarks; -#ifdef __TBASE__ - /* - * sanity check - * tables from different groups can not be joined, and shard table join with other table type - * also permitted. 
- */ - { - if (list_length(groupOids) > 1 && !enable_group_across_query) - { - groupOids = NULL; - elog(ERROR, "Shard tables from different groups should not be invloved in one Query,\n" - "Shard tables should not be invloved in one Query with other tables, such as hash table."); - } - - groupOids = NULL; - } -#endif - return result; } diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 5a6afb51..0f728e50 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -2197,16 +2197,6 @@ static struct config_bool ConfigureNamesBool[] = }, { - {"enable_group_across_query", PGC_USERSET, CUSTOM_OPTIONS, - gettext_noop("enable group-across queries."), - NULL - }, - &enable_group_across_query, - false, - NULL, NULL, NULL - }, - - { {"enable_distributed_unique_plan", PGC_USERSET, CUSTOM_OPTIONS, gettext_noop("enable distributed unique plan."), NULL diff --git a/src/include/optimizer/planmain.h b/src/include/optimizer/planmain.h index 58ffbce4..8139e134 100644 --- a/src/include/optimizer/planmain.h +++ b/src/include/optimizer/planmain.h @@ -97,9 +97,7 @@ extern int force_parallel_mode; #ifdef __TBASE__ extern int remote_subplan_depth; extern List *groupOids; -extern bool enable_group_across_query; extern bool enable_distributed_unique_plan; -extern bool has_distribute_remote_plan; extern bool has_cold_hot_table; #define INSERT_TRIGGER "tt_dn_in_" diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out index 48b73026..7a478711 100644 --- a/src/test/regress/expected/sysviews.out +++ b/src/test/regress/expected/sysviews.out @@ -96,7 +96,6 @@ select name, setting from pg_settings where name like 'enable%'; enable_fast_query_shipping | on enable_fga | on enable_gathermerge | on - enable_group_across_query | off enable_gtm_debug_print | off enable_gtm_proxy | off enable_hashagg | on @@ -129,7 +128,7 @@ select name, setting from pg_settings where name like 'enable%'; enable_tidscan | on enable_transparent_crypt | on enable_user_authority_force_check | off -(57 rows) +(56 rows) -- Test that the pg_timezone_names and pg_timezone_abbrevs views are -- more-or-less working. 
We can't test their contents in any great detail From 1ee49bce6548b7415ef64c992f2e8f59fc26ff1c Mon Sep 17 00:00:00 2001 From: andrelin Date: Fri, 29 Jan 2021 20:34:42 +0800 Subject: [PATCH 116/578] Bug fix, consider RangeVar only when check from clause of INSERT --- src/backend/parser/analyze.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/backend/parser/analyze.c b/src/backend/parser/analyze.c index 27c44c45..fb5e27f1 100644 --- a/src/backend/parser/analyze.c +++ b/src/backend/parser/analyze.c @@ -714,7 +714,10 @@ transformInsertStmt(ParseState *pstate, InsertStmt *stmt) { foreach(lc, selectStmt->fromClause) { - Relation rel = heap_openrv((RangeVar *) lfirst(lc), AccessShareLock); + Node *node = lfirst(lc); + if (IsA(node, RangeVar)) + { + Relation rel = heap_openrv((RangeVar *) node, AccessShareLock); from_rel_loc_info = rel->rd_locator_info; if (from_rel_loc_info == NULL || /* from system table */ @@ -730,6 +733,7 @@ transformInsertStmt(ParseState *pstate, InsertStmt *stmt) } } } + } #endif /* From 8958a0499a31a8c654ce11e278b6aecf53985de9 Mon Sep 17 00:00:00 2001 From: andrelin Date: Mon, 25 Jan 2021 16:01:30 +0800 Subject: [PATCH 117/578] remote subquery width fix --- src/backend/optimizer/plan/planner.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index 4e9eda21..b909ad6a 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -4281,6 +4281,8 @@ create_grouping_paths(PlannerInfo *root, bool try_redistribute_grouping = false; PathTarget * local_grouping_target = make_partial_grouping_target(root, target); + grouped_rel->reltarget = local_grouping_target; + /* Estimate number of partial groups. */ double dNumLocalGroups = get_number_of_groups(root, cheapest_path->rows, @@ -5326,6 +5328,9 @@ create_grouping_paths(PlannerInfo *root, { partial_grouping_target = make_partial_grouping_target(root, target); +#ifdef __TBASE__ + grouped_rel->reltarget = partial_grouping_target; +#endif /* Estimate number of partial groups. */ dNumPartialGroups = get_number_of_groups(root, cheapest_path->rows, From cfb79d48fc6bcdeef0fa438b6022396801fbf2ee Mon Sep 17 00:00:00 2001 From: andrelin Date: Wed, 20 Jan 2021 19:59:50 +0800 Subject: [PATCH 118/578] Adjust costsize.c, consider number of nodes involved adjust create_bitmap_subplan --- src/backend/optimizer/path/costsize.c | 309 ++++++++++++++++++++++++ src/backend/optimizer/plan/createplan.c | 36 ++- 2 files changed, 342 insertions(+), 3 deletions(-) diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index 4b984a23..37683e78 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -179,6 +179,111 @@ static double relation_byte_size(double tuples, int width); static double page_size(double tuples, int width); static double get_parallel_divisor(Path *path); +#ifdef __TBASE__ +/* + * In PostgreSQL, the row count estimate of a base rel scan, like a Seq Scan + * or an Index Scan, can be directly copied from RelOptInfo->rows/tuples. In + * TBase, it's not that straightforward as a Scan runs in parallel in the + * DNs, and the number of rows scanned by each Scan is RelOptInfo->rows / + * number of DN. + * + * That's pretty straightforward, too, but it means that we'd have to modify + * all the cost_seqscan, cost_index, etc. functions to take that into + * account. 
That's prone to bugs, because it is easy to miss references to + * rel->rows/tuples/pages. Even if we fix them all now, more can be + * introduced in merges with PostgreSQL, and it's not easy to notice because + * the only consequence is a bad cost estimate. + * + * To make that more robust with PostgreSQL merges, we do a little switcheroo + * with the RelOptInfo. The RelOptInfoDataNode struct is a "proxy" of + * RelOptInfo, containing the same fields, except that the rows/pages/tuple + * have already been divided by the number of data nodes. The costing functions + * have been modified so that on entry, they construct a RelOptInfoDataNode and + * use it in place of the RelOptInfo. That way, the formulas in the costing + * functions can still refer to "rel->pages", "rel->tuples" and so forth in + * the source code, keeping them unchanged from upstream, but will actually + * use the adjusted values. + * + * The RelOptInfoDataNode struct doesn't contain all the fields from RelOptInfo, + * only the ones commonly used in the cost_*() functions. If a reference to a + * new field is added in uptream, and it's not handled either by adding it to + * the RelOptInfoDataNode, or by modifying the reference to explictly point to + * the original RelOptInfo, you'll get a compiler error. That's good: it forces + * you to think whether the value needs to be divided by nDNs or not. + */ +typedef struct +{ + /* Values copied from RelOptInfo as is, for convenience */ + Index relid; + RTEKind rtekind; /* RELATION, SUBQUERY, or FUNCTION */ + Oid reltablespace; /* containing tablespace */ + double allvisfrac; + + /* Values adjusted from RelOptInfo, by dividing by number of DNs */ + double rows; + BlockNumber pages; + double tuples; + + /* the original RelOptInfo */ + RelOptInfo *orig; +} RelOptInfoDataNode; + +/* ParamPathInfoDataNode is a similar proxy for ParamPathInfo. */ +typedef struct +{ + double ppi_rows; /* estimated number of result tuples */ + List *ppi_clauses; /* join clauses available from outer rels */ + + ParamPathInfo *orig; +} ParamPathInfoDataNode; + +static ParamPathInfoDataNode * +adjust_reloptinfo(Path *path, RelOptInfoDataNode *basescan, RelOptInfo *baserel_orig, + ParamPathInfoDataNode *param_info, ParamPathInfo *param_info_orig) +{ + double nodes; + + if (path->distribution && IsA(path->distribution, Distribution) && + path->distribution->distributionType != LOCATOR_TYPE_REPLICATED && + path->distribution->distributionType != LOCATOR_TYPE_NONE) + nodes = bms_num_members(path->distribution->nodes); + else + nodes = 1; + + basescan->relid = baserel_orig->relid; + basescan->rtekind = baserel_orig->rtekind; + basescan->reltablespace = baserel_orig->reltablespace; + basescan->allvisfrac = baserel_orig->allvisfrac; + + basescan->rows = clamp_row_est(baserel_orig->rows / nodes); + basescan->tuples = clamp_row_est(baserel_orig->tuples / nodes); + basescan->pages = ceil((double) baserel_orig->pages / nodes); + + basescan->orig = baserel_orig; + + if (param_info_orig) + { + param_info->ppi_rows = clamp_row_est(param_info_orig->ppi_rows / nodes); + param_info->ppi_clauses = param_info_orig->ppi_clauses; + param_info->orig = param_info_orig; + return param_info; + } + else + return NULL; +} + +/* + * ADJUST_BASESCAN initializes the proxy structs for RelOptInfo and ParamPathInfo, + * adjusting them by # of data nodes as needed. 
+ */ +#define ADJUST_BASESCAN(path, baserel_orig, baserel, param_info_orig, param_info) \ + RelOptInfoDataNode baserel_adjusted; \ + ParamPathInfoDataNode param_info_adjusted; \ + RelOptInfoDataNode *baserel = &baserel_adjusted; \ + ParamPathInfoDataNode *param_info = adjust_reloptinfo(path, &baserel_adjusted, baserel_orig, \ + ¶m_info_adjusted, param_info_orig) +#endif + /* * clamp_row_est @@ -210,8 +315,14 @@ clamp_row_est(double nrows) */ void cost_seqscan(Path *path, PlannerInfo *root, +#ifdef __TBASE__ + RelOptInfo *baserel_orig, ParamPathInfo *param_info_orig) +{ + ADJUST_BASESCAN(path, baserel_orig, baserel, param_info_orig, param_info); +#else RelOptInfo *baserel, ParamPathInfo *param_info) { +#endif Cost startup_cost = 0; Cost cpu_run_cost; Cost disk_run_cost; @@ -243,7 +354,11 @@ cost_seqscan(Path *path, PlannerInfo *root, disk_run_cost = spc_seq_page_cost * baserel->pages; /* CPU costs */ +#ifdef __TBASE__ + get_restriction_qual_cost(root, baserel_orig, param_info_orig, &qpqual_cost); +#else get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost); +#endif startup_cost += qpqual_cost.startup; cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple; @@ -287,8 +402,14 @@ cost_seqscan(Path *path, PlannerInfo *root, */ void cost_samplescan(Path *path, PlannerInfo *root, +#ifdef __TBASE__ + RelOptInfo *baserel_orig, ParamPathInfo *param_info_orig) +{ + ADJUST_BASESCAN(path, baserel_orig, baserel, param_info_orig, param_info); +#else RelOptInfo *baserel, ParamPathInfo *param_info) { +#endif Cost startup_cost = 0; Cost run_cost = 0; RangeTblEntry *rte; @@ -337,7 +458,11 @@ cost_samplescan(Path *path, PlannerInfo *root, * simple constants anyway. We also don't charge anything for the * calculations the sampling method might do internally. 
*/ +#ifdef __TBASE__ + get_restriction_qual_cost(root, baserel_orig, param_info_orig, &qpqual_cost); +#else get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost); +#endif startup_cost += qpqual_cost.startup; cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple; @@ -362,9 +487,14 @@ cost_samplescan(Path *path, PlannerInfo *root, */ void cost_gather(GatherPath *path, PlannerInfo *root, +#ifdef __TBASE__ + RelOptInfo *rel_orig, ParamPathInfo *param_info_orig, +#else RelOptInfo *rel, ParamPathInfo *param_info, +#endif double *rows) { + ADJUST_BASESCAN(&path->path, rel_orig, rel, param_info_orig, param_info); Cost startup_cost = 0; Cost run_cost = 0; @@ -478,7 +608,12 @@ cost_index(IndexPath *path, PlannerInfo *root, double loop_count, bool partial_path) {// #lizard forgives IndexOptInfo *index = path->indexinfo; +#ifdef __TBASE__ + RelOptInfo *baserel_orig = index->rel; + ADJUST_BASESCAN(&path->path, baserel_orig, baserel, path->path.param_info, param_info); +#else RelOptInfo *baserel = index->rel; +#endif bool indexonly = (path->path.pathtype == T_IndexOnlyScan); amcostestimate_function amcostestimate; List *qpquals; @@ -500,10 +635,23 @@ cost_index(IndexPath *path, PlannerInfo *root, double loop_count, double pages_fetched; double rand_heap_pages; double index_pages; + double nodes = 1; +#ifdef __TBASE__ + if (path->path.distribution && IsA(path->path.distribution, Distribution) && + path->path.distribution->distributionType != LOCATOR_TYPE_REPLICATED && + path->path.distribution->distributionType != LOCATOR_TYPE_NONE) + { + nodes = bms_num_members(path->path.distribution->nodes); + } + /* Should only be applied to base relations */ + Assert(IsA(baserel_orig, RelOptInfo) && + IsA(index, IndexOptInfo)); +#else /* Should only be applied to base relations */ Assert(IsA(baserel, RelOptInfo) && IsA(index, IndexOptInfo)); +#endif Assert(baserel->relid > 0); Assert(baserel->rtekind == RTE_RELATION); @@ -514,6 +662,18 @@ cost_index(IndexPath *path, PlannerInfo *root, double loop_count, * baserestrictinfo as the list of relevant restriction clauses for the * rel. */ +#ifdef __TBASE__ + if (param_info) + { + path->path.rows = param_info->ppi_rows; + /* qpquals come from the rel's restriction clauses and ppi_clauses */ + qpquals = list_concat( + extract_nonindex_conditions(path->indexinfo->indrestrictinfo, + path->indexquals), + extract_nonindex_conditions(param_info->ppi_clauses, + path->indexquals)); + } +#else if (path->path.param_info) { path->path.rows = path->path.param_info->ppi_rows; @@ -524,6 +684,7 @@ cost_index(IndexPath *path, PlannerInfo *root, double loop_count, extract_nonindex_conditions(path->path.param_info->ppi_clauses, path->indexquals)); } +#endif else { path->path.rows = baserel->rows; @@ -549,6 +710,9 @@ cost_index(IndexPath *path, PlannerInfo *root, double loop_count, &indexSelectivity, &indexCorrelation, &index_pages); + /* The index pages should be divided among all the data nodes like baserel dose. */ + index_pages = ceil(index_pages / nodes); + /* * Save amcostestimate's results for possible use in bitmap scan planning. 
* We don't bother to save indexStartupCost or indexCorrelation, because a @@ -608,7 +772,11 @@ cost_index(IndexPath *path, PlannerInfo *root, double loop_count, */ pages_fetched = index_pages_fetched(tuples_fetched * loop_count, baserel->pages, +#ifdef __TBASE__ + index_pages, +#else (double) index->pages, +#endif root); if (indexonly) @@ -632,7 +800,11 @@ cost_index(IndexPath *path, PlannerInfo *root, double loop_count, pages_fetched = index_pages_fetched(pages_fetched * loop_count, baserel->pages, +#ifdef __TBASE__ + index_pages, +#else (double) index->pages, +#endif root); if (indexonly) @@ -648,7 +820,11 @@ cost_index(IndexPath *path, PlannerInfo *root, double loop_count, */ pages_fetched = index_pages_fetched(tuples_fetched, baserel->pages, +#ifdef __TBASE__ + index_pages, +#else (double) index->pages, +#endif root); if (indexonly) @@ -1014,6 +1190,21 @@ cost_bitmap_heap_scan(Path *path, PlannerInfo *root, RelOptInfo *baserel, cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple; cpu_run_cost = cpu_per_tuple * tuples_fetched; +#ifdef __TBASE__ + /* Adjust costing for parallelism between data nodes, if used. */ + if (path->distribution && IsA(path->distribution, Distribution) && + path->distribution->distributionType != LOCATOR_TYPE_REPLICATED && + path->distribution->distributionType != LOCATOR_TYPE_NONE) + { + double nodes = bms_num_members(path->distribution->nodes); + + /* The CPU cost is divided among all the data nodes. */ + cpu_run_cost /= nodes; + + path->rows = clamp_row_est(path->rows / nodes); + } +#endif + /* Adjust costing for parallelism, if used. */ if (path->parallel_workers > 0) { @@ -1177,8 +1368,14 @@ cost_bitmap_or_node(BitmapOrPath *path, PlannerInfo *root) */ void cost_tidscan(Path *path, PlannerInfo *root, +#ifdef __TBASE__ + RelOptInfo *baserel_orig, List *tidquals, ParamPathInfo *param_info_orig) +{ + ADJUST_BASESCAN(path, baserel_orig, baserel, param_info_orig, param_info); +#else RelOptInfo *baserel, List *tidquals, ParamPathInfo *param_info) { +#endif Cost startup_cost = 0; Cost run_cost = 0; bool isCurrentOf = false; @@ -1234,7 +1431,11 @@ cost_tidscan(Path *path, PlannerInfo *root, */ if (isCurrentOf) { +#ifdef __TBASE__ + Assert(baserel->orig->baserestrictcost.startup >= disable_cost); +#else Assert(baserel->baserestrictcost.startup >= disable_cost); +#endif startup_cost -= disable_cost; } else if (!enable_tidscan) @@ -1255,7 +1456,11 @@ cost_tidscan(Path *path, PlannerInfo *root, run_cost += spc_random_page_cost * ntuples; /* Add scanning CPU costs */ +#ifdef __TBASE__ + get_restriction_qual_cost(root, baserel_orig, param_info_orig, &qpqual_cost); +#else get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost); +#endif /* XXX currently we assume TID quals are a subset of qpquals */ startup_cost += qpqual_cost.startup + tid_qual_cost.per_tuple; @@ -1280,8 +1485,14 @@ cost_tidscan(Path *path, PlannerInfo *root, */ void cost_subqueryscan(SubqueryScanPath *path, PlannerInfo *root, +#ifdef __TBASE__ + RelOptInfo *baserel_orig, ParamPathInfo *param_info_orig) +{ + ADJUST_BASESCAN(&path->path, baserel_orig, baserel, param_info_orig, param_info); +#else RelOptInfo *baserel, ParamPathInfo *param_info) { +#endif Cost startup_cost; Cost run_cost; QualCost qpqual_cost; @@ -1306,7 +1517,11 @@ cost_subqueryscan(SubqueryScanPath *path, PlannerInfo *root, path->path.startup_cost = path->subpath->startup_cost; path->path.total_cost = path->subpath->total_cost; +#ifdef __TBASE__ + get_restriction_qual_cost(root, baserel_orig, param_info_orig, &qpqual_cost); 
+#else get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost); +#endif startup_cost = qpqual_cost.startup; cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple; @@ -1329,8 +1544,14 @@ cost_subqueryscan(SubqueryScanPath *path, PlannerInfo *root, */ void cost_functionscan(Path *path, PlannerInfo *root, +#ifdef __TBASE__ + RelOptInfo *baserel_orig, ParamPathInfo *param_info_orig) +{ + ADJUST_BASESCAN(path, baserel_orig, baserel, param_info_orig, param_info); +#else RelOptInfo *baserel, ParamPathInfo *param_info) { +#endif Cost startup_cost = 0; Cost run_cost = 0; QualCost qpqual_cost; @@ -1367,7 +1588,11 @@ cost_functionscan(Path *path, PlannerInfo *root, startup_cost += exprcost.startup + exprcost.per_tuple; /* Add scanning CPU costs */ +#ifdef __TBASE__ + get_restriction_qual_cost(root, baserel_orig, param_info_orig, &qpqual_cost); +#else get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost); +#endif startup_cost += qpqual_cost.startup; cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple; @@ -1390,8 +1615,14 @@ cost_functionscan(Path *path, PlannerInfo *root, */ void cost_tablefuncscan(Path *path, PlannerInfo *root, +#ifdef __TBASE__ + RelOptInfo *baserel_orig, ParamPathInfo *param_info_orig) +{ + ADJUST_BASESCAN(path, baserel_orig, baserel, param_info_orig, param_info); +#else RelOptInfo *baserel, ParamPathInfo *param_info) { +#endif Cost startup_cost = 0; Cost run_cost = 0; QualCost qpqual_cost; @@ -1423,7 +1654,11 @@ cost_tablefuncscan(Path *path, PlannerInfo *root, startup_cost += exprcost.startup + exprcost.per_tuple; /* Add scanning CPU costs */ +#ifdef __TBASE__ + get_restriction_qual_cost(root, baserel_orig, param_info_orig, &qpqual_cost); +#else get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost); +#endif startup_cost += qpqual_cost.startup; cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple; @@ -1446,8 +1681,14 @@ cost_tablefuncscan(Path *path, PlannerInfo *root, */ void cost_valuesscan(Path *path, PlannerInfo *root, +#ifdef __TBASE__ + RelOptInfo *baserel_orig, ParamPathInfo *param_info_orig) +{ + ADJUST_BASESCAN(path, baserel_orig, baserel, param_info_orig, param_info); +#else RelOptInfo *baserel, ParamPathInfo *param_info) { +#endif Cost startup_cost = 0; Cost run_cost = 0; QualCost qpqual_cost; @@ -1470,7 +1711,11 @@ cost_valuesscan(Path *path, PlannerInfo *root, cpu_per_tuple = cpu_operator_cost; /* Add scanning CPU costs */ +#ifdef __TBASE__ + get_restriction_qual_cost(root, baserel_orig, param_info_orig, &qpqual_cost); +#else get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost); +#endif startup_cost += qpqual_cost.startup; cpu_per_tuple += cpu_tuple_cost + qpqual_cost.per_tuple; @@ -1496,8 +1741,14 @@ cost_valuesscan(Path *path, PlannerInfo *root, */ void cost_ctescan(Path *path, PlannerInfo *root, +#ifdef __TBASE__ + RelOptInfo *baserel_orig, ParamPathInfo *param_info_orig) +{ + ADJUST_BASESCAN(path, baserel_orig, baserel, param_info_orig, param_info); +#else RelOptInfo *baserel, ParamPathInfo *param_info) { +#endif Cost startup_cost = 0; Cost run_cost = 0; QualCost qpqual_cost; @@ -1517,7 +1768,11 @@ cost_ctescan(Path *path, PlannerInfo *root, cpu_per_tuple = cpu_tuple_cost; /* Add scanning CPU costs */ +#ifdef __TBASE__ + get_restriction_qual_cost(root, baserel_orig, param_info_orig, &qpqual_cost); +#else get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost); +#endif startup_cost += qpqual_cost.startup; cpu_per_tuple += cpu_tuple_cost + qpqual_cost.per_tuple; @@ -1537,8 +1792,14 @@ 
cost_ctescan(Path *path, PlannerInfo *root, */ void cost_namedtuplestorescan(Path *path, PlannerInfo *root, +#ifdef __TBASE__ + RelOptInfo *baserel_orig, ParamPathInfo *param_info_orig) +{ + ADJUST_BASESCAN(path, baserel_orig, baserel, param_info_orig, param_info); +#else RelOptInfo *baserel, ParamPathInfo *param_info) { +#endif Cost startup_cost = 0; Cost run_cost = 0; QualCost qpqual_cost; @@ -1558,7 +1819,11 @@ cost_namedtuplestorescan(Path *path, PlannerInfo *root, cpu_per_tuple = cpu_tuple_cost; /* Add scanning CPU costs */ +#ifdef __TBASE__ + get_restriction_qual_cost(root, baserel_orig, param_info_orig, &qpqual_cost); +#else get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost); +#endif startup_cost += qpqual_cost.startup; cpu_per_tuple += cpu_tuple_cost + qpqual_cost.per_tuple; @@ -2208,6 +2473,17 @@ final_cost_nestloop(PlannerInfo *root, NestPath *path, else path->path.rows = path->path.parent->rows; +#ifdef __TBASE__ + if (path->path.distribution && IsA(path->path.distribution, Distribution) && + path->path.distribution->distributionType != LOCATOR_TYPE_REPLICATED && + path->path.distribution->distributionType != LOCATOR_TYPE_NONE) + { + double nodes = bms_num_members(path->path.distribution->nodes); + + path->path.rows = clamp_row_est(path->path.rows / nodes); + } +#endif + /* For partial paths, scale row estimate. */ if (path->path.parallel_workers > 0) { @@ -2697,6 +2973,17 @@ final_cost_mergejoin(PlannerInfo *root, MergePath *path, else path->jpath.path.rows = path->jpath.path.parent->rows; +#ifdef __TBASE__ + if (path->jpath.path.distribution && IsA(path->jpath.path.distribution, Distribution) && + path->jpath.path.distribution->distributionType != LOCATOR_TYPE_REPLICATED && + path->jpath.path.distribution->distributionType != LOCATOR_TYPE_NONE) + { + double nodes = bms_num_members(path->jpath.path.distribution->nodes); + + path->jpath.path.rows = clamp_row_est(path->jpath.path.rows / nodes); + } +#endif + /* For partial paths, scale row estimate. */ if (path->jpath.path.parallel_workers > 0) { @@ -3140,6 +3427,17 @@ final_cost_hashjoin(PlannerInfo *root, HashPath *path, else path->jpath.path.rows = path->jpath.path.parent->rows; +#ifdef __TBASE__ + if (path->jpath.path.distribution && IsA(path->jpath.path.distribution, Distribution) && + path->jpath.path.distribution->distributionType != LOCATOR_TYPE_REPLICATED && + path->jpath.path.distribution->distributionType != LOCATOR_TYPE_NONE) + { + double nodes = bms_num_members(path->jpath.path.distribution->nodes); + + path->jpath.path.rows = clamp_row_est(path->jpath.path.rows / nodes); + } +#endif + /* For partial paths, scale row estimate. 
*/ if (path->jpath.path.parallel_workers > 0) { @@ -4728,6 +5026,17 @@ set_subquery_size_estimates(PlannerInfo *root, RelOptInfo *rel) */ sub_final_rel = fetch_upper_rel(subroot, UPPERREL_FINAL, NULL); rel->tuples = sub_final_rel->cheapest_total_path->rows; +#ifdef __TBASE__ + if (sub_final_rel->cheapest_total_path->distribution && IsA(sub_final_rel->cheapest_total_path->distribution, Distribution) && + sub_final_rel->cheapest_total_path->distribution->distributionType != LOCATOR_TYPE_REPLICATED && + sub_final_rel->cheapest_total_path->distribution->distributionType != LOCATOR_TYPE_NONE) + { + double nodes = bms_num_members(sub_final_rel->cheapest_total_path->distribution->nodes); + + /* count tuples in all data nodes */ + rel->tuples *= nodes; + } +#endif /* * Compute per-output-column width estimates by examining the subquery's diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index 72a495e1..a30ea56e 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -3811,6 +3811,16 @@ create_bitmap_subplan(PlannerInfo *root, Path *bitmapqual, List *subindexquals = NIL; List *subindexECs = NIL; ListCell *l; + double nodes = 1; + +#ifdef __TBASE__ + if (apath->path.distribution && IsA(apath->path.distribution, Distribution) && + apath->path.distribution->distributionType != LOCATOR_TYPE_REPLICATED && + apath->path.distribution->distributionType != LOCATOR_TYPE_NONE) + { + nodes = bms_num_members(apath->path.distribution->nodes); + } +#endif /* * There may well be redundant quals among the subplans, since a @@ -3839,7 +3849,7 @@ create_bitmap_subplan(PlannerInfo *root, Path *bitmapqual, plan->startup_cost = apath->path.startup_cost; plan->total_cost = apath->path.total_cost; plan->plan_rows = - clamp_row_est(apath->bitmapselectivity * apath->path.parent->tuples); + clamp_row_est(apath->bitmapselectivity * apath->path.parent->tuples / nodes); plan->plan_width = 0; /* meaningless */ plan->parallel_aware = false; plan->parallel_safe = apath->path.parallel_safe; @@ -3899,11 +3909,21 @@ create_bitmap_subplan(PlannerInfo *root, Path *bitmapqual, } else { + double nodes = 1; +#ifdef __TBASE__ + if (opath->path.distribution && IsA(opath->path.distribution, Distribution) && + opath->path.distribution->distributionType != LOCATOR_TYPE_REPLICATED && + opath->path.distribution->distributionType != LOCATOR_TYPE_NONE) + { + nodes = bms_num_members(opath->path.distribution->nodes); + } +#endif + plan = (Plan *) make_bitmap_or(subplans); plan->startup_cost = opath->path.startup_cost; plan->total_cost = opath->path.total_cost; plan->plan_rows = - clamp_row_est(opath->bitmapselectivity * opath->path.parent->tuples); + clamp_row_est(opath->bitmapselectivity * opath->path.parent->tuples / nodes); plan->plan_width = 0; /* meaningless */ plan->parallel_aware = false; plan->parallel_safe = opath->path.parallel_safe; @@ -3934,6 +3954,16 @@ create_bitmap_subplan(PlannerInfo *root, Path *bitmapqual, IndexScan *iscan; List *subindexECs; ListCell *l; + double nodes = 1; + +#ifdef __TBASE__ + if (ipath->path.distribution && IsA(ipath->path.distribution, Distribution) && + ipath->path.distribution->distributionType != LOCATOR_TYPE_REPLICATED && + ipath->path.distribution->distributionType != LOCATOR_TYPE_NONE) + { + nodes = bms_num_members(ipath->path.distribution->nodes); + } +#endif /* Use the regular indexscan plan build machinery... 
*/ iscan = castNode(IndexScan, @@ -3948,7 +3978,7 @@ create_bitmap_subplan(PlannerInfo *root, Path *bitmapqual, plan->startup_cost = 0.0; plan->total_cost = ipath->indextotalcost; plan->plan_rows = - clamp_row_est(ipath->indexselectivity * ipath->path.parent->tuples); + clamp_row_est(ipath->indexselectivity * ipath->path.parent->tuples / nodes); plan->plan_width = 0; /* meaningless */ plan->parallel_aware = false; plan->parallel_safe = ipath->path.parallel_safe; From 52e562a09bef3a58b221d66c376f7a4a9dbdcd72 Mon Sep 17 00:00:00 2001 From: andrelin Date: Tue, 26 Jan 2021 20:47:12 +0800 Subject: [PATCH 119/578] Adjust gather cost if its upper path is a remote subquery --- src/backend/optimizer/path/costsize.c | 12 ++ src/backend/optimizer/util/pathnode.c | 78 +++++++++-- src/include/optimizer/cost.h | 189 +++++++++++++------------- 3 files changed, 175 insertions(+), 104 deletions(-) diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index 37683e78..60949690 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -518,6 +518,18 @@ cost_gather(GatherPath *path, PlannerInfo *root, path->path.total_cost = (startup_cost + run_cost); } +#ifdef __TBASE__ +/* + * gather node has been optimized, it only needs to do some initiating work + * so set total_cost to startup_cost which means run_cost = 0. + */ +void +reset_cost_gather(GatherPath *path) +{ + path->path.total_cost = path->subpath->total_cost + path->path.startup_cost; +} +#endif + /* * cost_gather_merge * Determines and returns the cost of gather merge path. diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index bd6d510c..13cfb45d 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -1404,7 +1404,41 @@ set_scanpath_distribution(PlannerInfo *root, RelOptInfo *rel, Path *pathnode) } } +#ifdef __TBASE__ +static Path * +create_remotesubplan_path_internal(PlannerInfo *root, Path *subpath, + Distribution *distribution, RelOptInfo *rel, + ParamPathInfo *param_info, List *pathkeys, + PathTarget *pathtarget, int replication, + Cost additional_startup_cost, + Cost additional_total_cost) +{ + RemoteSubPath *pathnode; + + if (IsA(subpath, GatherPath)) + reset_cost_gather((GatherPath *) subpath); + + pathnode = makeNode(RemoteSubPath); + pathnode->path.pathtype = T_RemoteSubplan; + pathnode->path.parent = rel; + pathnode->path.param_info = param_info; + pathnode->path.pathkeys = pathkeys; + pathnode->subpath = subpath; + pathnode->path.distribution = (Distribution *) copyObject(distribution); + + /* We don't want to run subplains in parallel workers */ + pathnode->path.parallel_aware = false; + pathnode->path.parallel_safe = false; + + pathnode->path.pathtarget = pathtarget; + cost_remote_subplan((Path *) pathnode, subpath->startup_cost + additional_startup_cost, + subpath->total_cost + additional_total_cost, subpath->rows, + rel->reltarget->width, replication); + + return (Path *) pathnode; +} +#endif /* @@ -1422,6 +1456,13 @@ create_remotesubplan_path(PlannerInfo *root, Path *subpath, RemoteSubPath *pathnode; Distribution *subdistribution = subpath->distribution; +#ifdef __TBASE__ + return create_remotesubplan_path_internal(root, subpath, distribution, + rel, subpath->param_info, + subpath->pathkeys, subpath->pathtarget, + (subdistribution && IsLocatorReplicated(subdistribution->distributionType)) ? 
+ bms_num_members(subdistribution->nodes) : 1, 0, 0); +#else pathnode = makeNode(RemoteSubPath); pathnode->path.pathtype = T_RemoteSubplan; pathnode->path.parent = rel; @@ -1442,6 +1483,7 @@ create_remotesubplan_path(PlannerInfo *root, Path *subpath, bms_num_members(subdistribution->nodes) : 1); return (Path *) pathnode; +#endif } /* @@ -1484,6 +1526,20 @@ redistribute_path(PlannerInfo *root, Path *subpath, List *pathkeys, if (IsA(subpath, MaterialPath)) { MaterialPath *mpath = (MaterialPath *) subpath; +#ifdef __TBASE__ + if (IsA(mpath->subpath, RemoteSubPath)) + { + pathnode = (RemoteSubPath *) mpath->subpath; + pathnode->path.distribution = (Distribution *) copyObject(distribution); + } + else + { + pathnode = (RemoteSubPath *) create_remotesubplan_path_internal(root, mpath->subpath, + distribution, rel, subpath->param_info, + subpath->pathkeys, rel->reltarget, + num_replication, 0, 0); + } +#else /* If subpath is already a RemoteSubPath, just replace distribution */ if (IsA(mpath->subpath, RemoteSubPath)) { @@ -1508,16 +1564,13 @@ redistribute_path(PlannerInfo *root, Path *subpath, List *pathkeys, subpath = pathnode->subpath; pathnode->path.distribution = distribution; - mpath->path.distribution = (Distribution *) copyObject(distribution); /* (re)calculate costs */ cost_remote_subplan((Path *) pathnode, subpath->startup_cost, subpath->total_cost, subpath->rows, rel->reltarget->width, -#ifdef __TBASE__ - num_replication); -#else IsLocatorReplicated(distributionType) ? bms_num_members(nodes) : 1); #endif + mpath->path.distribution = (Distribution *) copyObject(distribution); mpath->subpath = (Path *) pathnode; cost_material(&mpath->path, pathnode->path.startup_cost, @@ -1530,7 +1583,7 @@ redistribute_path(PlannerInfo *root, Path *subpath, List *pathkeys, { Cost input_startup_cost = 0; Cost input_total_cost = 0; - +#ifndef __TBASE__ pathnode = makeNode(RemoteSubPath); pathnode->path.pathtype = T_RemoteSubplan; pathnode->path.parent = rel; @@ -1538,7 +1591,7 @@ redistribute_path(PlannerInfo *root, Path *subpath, List *pathkeys, pathnode->path.param_info = subpath->param_info; pathnode->path.pathkeys = pathkeys ? pathkeys : subpath->pathkeys; pathnode->path.distribution = distribution; - +#endif /* * If we need to insert a Sort node, add it here, so that it gets * pushed down to the remote node. @@ -1571,7 +1624,14 @@ redistribute_path(PlannerInfo *root, Path *subpath, List *pathkeys, input_startup_cost += sort_path.startup_cost; input_total_cost += sort_path.total_cost; } - +#ifdef __TBASE__ + pathnode = (RemoteSubPath *) create_remotesubplan_path_internal(root, subpath, + distribution, rel, subpath->param_info, + pathkeys ? pathkeys : subpath->pathkeys, + rel->reltarget, num_replication, + input_startup_cost - subpath->startup_cost, + input_total_cost - subpath->total_cost); +#else pathnode->subpath = subpath; /* We don't want to run subplains in parallel workers */ @@ -1581,11 +1641,7 @@ redistribute_path(PlannerInfo *root, Path *subpath, List *pathkeys, cost_remote_subplan((Path *) pathnode, input_startup_cost, input_total_cost, subpath->rows, rel->reltarget->width, -#ifdef __TBASE__ num_replication); -#else - IsLocatorReplicated(distributionType) ? 
- bms_num_members(nodes) : 1); #endif return (Path *) pathnode; } diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h index 102795bb..2198c9db 100644 --- a/src/include/optimizer/cost.h +++ b/src/include/optimizer/cost.h @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * * cost.h - * prototypes for costsize.c and clausesel.c. + * prototypes for costsize.c and clausesel.c. * * * Portions Copyright (c) 2012-2014, TransLattice, Inc. @@ -24,7 +24,7 @@ /* If you change these, update backend/utils/misc/postgresql.sample.conf */ #define DEFAULT_SEQ_PAGE_COST 1.0 #define DEFAULT_RANDOM_PAGE_COST 4.0 -#define DEFAULT_CPU_TUPLE_COST 0.01 +#define DEFAULT_CPU_TUPLE_COST 0.01 #define DEFAULT_CPU_INDEX_TUPLE_COST 0.005 #define DEFAULT_CPU_OPERATOR_COST 0.0025 #ifdef XCP @@ -34,19 +34,19 @@ #define DEFAULT_PARALLEL_TUPLE_COST 0.1 #define DEFAULT_PARALLEL_SETUP_COST 1000.0 -#define DEFAULT_EFFECTIVE_CACHE_SIZE 524288 /* measured in pages */ +#define DEFAULT_EFFECTIVE_CACHE_SIZE 524288 /* measured in pages */ typedef enum { - CONSTRAINT_EXCLUSION_OFF, /* do not use c_e */ - CONSTRAINT_EXCLUSION_ON, /* apply c_e to all rels */ - CONSTRAINT_EXCLUSION_PARTITION /* apply c_e to otherrels only */ -} ConstraintExclusionType; + CONSTRAINT_EXCLUSION_OFF, /* do not use c_e */ + CONSTRAINT_EXCLUSION_ON, /* apply c_e to all rels */ + CONSTRAINT_EXCLUSION_PARTITION /* apply c_e to otherrels only */ +} ConstraintExclusionType; /* * prototypes for costsize.c - * routines to compute costs and sizes + * routines to compute costs and sizes */ /* parameter variables and flags */ @@ -63,7 +63,7 @@ extern PGDLLIMPORT double parallel_tuple_cost; extern PGDLLIMPORT double parallel_setup_cost; extern PGDLLIMPORT int effective_cache_size; extern Cost disable_cost; -extern int max_parallel_workers_per_gather; +extern int max_parallel_workers_per_gather; extern bool enable_seqscan; extern bool enable_indexscan; extern bool enable_indexonlyscan; @@ -82,143 +82,146 @@ extern int constraint_exclusion; extern double clamp_row_est(double nrows); extern double index_pages_fetched(double tuples_fetched, BlockNumber pages, - double index_pages, PlannerInfo *root); + double index_pages, PlannerInfo *root); extern void cost_seqscan(Path *path, PlannerInfo *root, RelOptInfo *baserel, - ParamPathInfo *param_info); + ParamPathInfo *param_info); extern void cost_samplescan(Path *path, PlannerInfo *root, RelOptInfo *baserel, - ParamPathInfo *param_info); + ParamPathInfo *param_info); extern void cost_index(IndexPath *path, PlannerInfo *root, - double loop_count, bool partial_path); + double loop_count, bool partial_path); extern void cost_bitmap_heap_scan(Path *path, PlannerInfo *root, RelOptInfo *baserel, - ParamPathInfo *param_info, - Path *bitmapqual, double loop_count); + ParamPathInfo *param_info, + Path *bitmapqual, double loop_count); extern void cost_bitmap_and_node(BitmapAndPath *path, PlannerInfo *root); extern void cost_bitmap_or_node(BitmapOrPath *path, PlannerInfo *root); extern void cost_bitmap_tree_node(Path *path, Cost *cost, Selectivity *selec); extern void cost_tidscan(Path *path, PlannerInfo *root, - RelOptInfo *baserel, List *tidquals, ParamPathInfo *param_info); + RelOptInfo *baserel, List *tidquals, ParamPathInfo *param_info); extern void cost_subqueryscan(SubqueryScanPath *path, PlannerInfo *root, - RelOptInfo *baserel, ParamPathInfo *param_info); + RelOptInfo *baserel, ParamPathInfo *param_info); extern void cost_functionscan(Path *path, PlannerInfo *root, - 
RelOptInfo *baserel, ParamPathInfo *param_info); + RelOptInfo *baserel, ParamPathInfo *param_info); extern void cost_tableexprscan(Path *path, PlannerInfo *root, - RelOptInfo *baserel, ParamPathInfo *param_info); + RelOptInfo *baserel, ParamPathInfo *param_info); extern void cost_valuesscan(Path *path, PlannerInfo *root, - RelOptInfo *baserel, ParamPathInfo *param_info); + RelOptInfo *baserel, ParamPathInfo *param_info); #ifdef PGXC extern void cost_remotequery(Path *path, PlannerInfo *root, RelOptInfo *baserel); #endif extern void cost_tablefuncscan(Path *path, PlannerInfo *root, - RelOptInfo *baserel, ParamPathInfo *param_info); + RelOptInfo *baserel, ParamPathInfo *param_info); extern void cost_ctescan(Path *path, PlannerInfo *root, - RelOptInfo *baserel, ParamPathInfo *param_info); + RelOptInfo *baserel, ParamPathInfo *param_info); extern void cost_namedtuplestorescan(Path *path, PlannerInfo *root, - RelOptInfo *baserel, ParamPathInfo *param_info); + RelOptInfo *baserel, ParamPathInfo *param_info); extern void cost_recursive_union(Path *runion, Path *nrterm, Path *rterm); extern void cost_sort(Path *path, PlannerInfo *root, - List *pathkeys, Cost input_cost, double tuples, int width, - Cost comparison_cost, int sort_mem, - double limit_tuples); + List *pathkeys, Cost input_cost, double tuples, int width, + Cost comparison_cost, int sort_mem, + double limit_tuples); extern void cost_merge_append(Path *path, PlannerInfo *root, - List *pathkeys, int n_streams, - Cost input_startup_cost, Cost input_total_cost, - double tuples); + List *pathkeys, int n_streams, + Cost input_startup_cost, Cost input_total_cost, + double tuples); extern void cost_material(Path *path, - Cost input_startup_cost, Cost input_total_cost, - double tuples, int width); + Cost input_startup_cost, Cost input_total_cost, + double tuples, int width); extern void cost_agg(Path *path, PlannerInfo *root, - AggStrategy aggstrategy, const AggClauseCosts *aggcosts, - int numGroupCols, double numGroups, - Cost input_startup_cost, Cost input_total_cost, - double input_tuples); + AggStrategy aggstrategy, const AggClauseCosts *aggcosts, + int numGroupCols, double numGroups, + Cost input_startup_cost, Cost input_total_cost, + double input_tuples); extern void cost_windowagg(Path *path, PlannerInfo *root, - List *windowFuncs, int numPartCols, int numOrderCols, - Cost input_startup_cost, Cost input_total_cost, - double input_tuples); + List *windowFuncs, int numPartCols, int numOrderCols, + Cost input_startup_cost, Cost input_total_cost, + double input_tuples); extern void cost_group(Path *path, PlannerInfo *root, - int numGroupCols, double numGroups, - Cost input_startup_cost, Cost input_total_cost, - double input_tuples); + int numGroupCols, double numGroups, + Cost input_startup_cost, Cost input_total_cost, + double input_tuples); extern void initial_cost_nestloop(PlannerInfo *root, - JoinCostWorkspace *workspace, - JoinType jointype, - Path *outer_path, Path *inner_path, - JoinPathExtraData *extra); + JoinCostWorkspace *workspace, + JoinType jointype, + Path *outer_path, Path *inner_path, + JoinPathExtraData *extra); extern void final_cost_nestloop(PlannerInfo *root, NestPath *path, - JoinCostWorkspace *workspace, - JoinPathExtraData *extra); + JoinCostWorkspace *workspace, + JoinPathExtraData *extra); extern void initial_cost_mergejoin(PlannerInfo *root, - JoinCostWorkspace *workspace, - JoinType jointype, - List *mergeclauses, - Path *outer_path, Path *inner_path, - List *outersortkeys, List *innersortkeys, - JoinPathExtraData 
*extra); + JoinCostWorkspace *workspace, + JoinType jointype, + List *mergeclauses, + Path *outer_path, Path *inner_path, + List *outersortkeys, List *innersortkeys, + JoinPathExtraData *extra); extern void final_cost_mergejoin(PlannerInfo *root, MergePath *path, - JoinCostWorkspace *workspace, - JoinPathExtraData *extra); + JoinCostWorkspace *workspace, + JoinPathExtraData *extra); extern void initial_cost_hashjoin(PlannerInfo *root, - JoinCostWorkspace *workspace, - JoinType jointype, - List *hashclauses, - Path *outer_path, Path *inner_path, - JoinPathExtraData *extra); + JoinCostWorkspace *workspace, + JoinType jointype, + List *hashclauses, + Path *outer_path, Path *inner_path, + JoinPathExtraData *extra); extern void final_cost_hashjoin(PlannerInfo *root, HashPath *path, - JoinCostWorkspace *workspace, - JoinPathExtraData *extra); + JoinCostWorkspace *workspace, + JoinPathExtraData *extra); extern void cost_gather(GatherPath *path, PlannerInfo *root, - RelOptInfo *baserel, ParamPathInfo *param_info, double *rows); + RelOptInfo *baserel, ParamPathInfo *param_info, double *rows); +#ifdef __TBASE__ +extern void reset_cost_gather(GatherPath *path); +#endif extern void cost_subplan(PlannerInfo *root, SubPlan *subplan, Plan *plan); extern void cost_qual_eval(QualCost *cost, List *quals, PlannerInfo *root); extern void cost_qual_eval_node(QualCost *cost, Node *qual, PlannerInfo *root); #ifdef XCP extern void cost_remote_subplan(Path *path, - Cost input_startup_cost, Cost input_total_cost, - double tuples, int width, int replication); + Cost input_startup_cost, Cost input_total_cost, + double tuples, int width, int replication); #endif extern void compute_semi_anti_join_factors(PlannerInfo *root, - RelOptInfo *outerrel, - RelOptInfo *innerrel, - JoinType jointype, - SpecialJoinInfo *sjinfo, - List *restrictlist, - SemiAntiJoinFactors *semifactors); + RelOptInfo *outerrel, + RelOptInfo *innerrel, + JoinType jointype, + SpecialJoinInfo *sjinfo, + List *restrictlist, + SemiAntiJoinFactors *semifactors); extern void set_baserel_size_estimates(PlannerInfo *root, RelOptInfo *rel); extern double get_parameterized_baserel_size(PlannerInfo *root, - RelOptInfo *rel, - List *param_clauses); + RelOptInfo *rel, + List *param_clauses); extern double get_parameterized_joinrel_size(PlannerInfo *root, - RelOptInfo *rel, - Path *outer_path, - Path *inner_path, - SpecialJoinInfo *sjinfo, - List *restrict_clauses); + RelOptInfo *rel, + Path *outer_path, + Path *inner_path, + SpecialJoinInfo *sjinfo, + List *restrict_clauses); extern void set_joinrel_size_estimates(PlannerInfo *root, RelOptInfo *rel, - RelOptInfo *outer_rel, - RelOptInfo *inner_rel, - SpecialJoinInfo *sjinfo, - List *restrictlist); + RelOptInfo *outer_rel, + RelOptInfo *inner_rel, + SpecialJoinInfo *sjinfo, + List *restrictlist); extern void set_subquery_size_estimates(PlannerInfo *root, RelOptInfo *rel); extern void set_function_size_estimates(PlannerInfo *root, RelOptInfo *rel); extern void set_values_size_estimates(PlannerInfo *root, RelOptInfo *rel); extern void set_cte_size_estimates(PlannerInfo *root, RelOptInfo *rel, - double cte_rows); + double cte_rows); extern void set_tablefunc_size_estimates(PlannerInfo *root, RelOptInfo *rel); extern void set_namedtuplestore_size_estimates(PlannerInfo *root, RelOptInfo *rel); extern void set_foreign_size_estimates(PlannerInfo *root, RelOptInfo *rel); extern PathTarget *set_pathtarget_cost_width(PlannerInfo *root, PathTarget *target); extern double compute_bitmap_pages(PlannerInfo *root, RelOptInfo 
*baserel, - Path *bitmapqual, int loop_count, Cost *cost, double *tuple); + Path *bitmapqual, int loop_count, Cost *cost, double *tuple); /* * prototypes for clausesel.c - * routines to compute clause selectivities + * routines to compute clause selectivities */ extern Selectivity clauselist_selectivity(PlannerInfo *root, - List *clauses, - int varRelid, - JoinType jointype, - SpecialJoinInfo *sjinfo); + List *clauses, + int varRelid, + JoinType jointype, + SpecialJoinInfo *sjinfo); extern Selectivity clause_selectivity(PlannerInfo *root, Node *clause, int varRelid, @@ -228,8 +231,8 @@ extern Selectivity clause_selectivity(PlannerInfo *root, extern bool clause_selectivity_could_under_estimated(PlannerInfo *root, Path *path); #endif extern void cost_gather_merge(GatherMergePath *path, PlannerInfo *root, - RelOptInfo *rel, ParamPathInfo *param_info, - Cost input_startup_cost, Cost input_total_cost, - double *rows); + RelOptInfo *rel, ParamPathInfo *param_info, + Cost input_startup_cost, Cost input_total_cost, + double *rows); -#endif /* COST_H */ +#endif /* COST_H */ From faaf938a2b66199d1436425a469dca9633819b2d Mon Sep 17 00:00:00 2001 From: andrelin Date: Mon, 1 Feb 2021 20:27:13 +0800 Subject: [PATCH 120/578] Adjust gather and add a guc control parallel agg worker num --- src/backend/optimizer/path/allpaths.c | 1 + src/backend/optimizer/path/costsize.c | 14 ++ src/backend/optimizer/path/indxpath.c | 2 + src/backend/optimizer/plan/createplan.c | 26 ++- src/backend/optimizer/util/pathnode.c | 4 +- src/backend/utils/misc/guc.c | 12 +- src/include/optimizer/paths.h | 215 ++++++++++++------------ 7 files changed, 157 insertions(+), 117 deletions(-) diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c index 42c19c2f..310cec07 100644 --- a/src/backend/optimizer/path/allpaths.c +++ b/src/backend/optimizer/path/allpaths.c @@ -65,6 +65,7 @@ bool enable_geqo = false; /* just in case GUC doesn't set it */ int geqo_threshold; int min_parallel_table_scan_size; int min_parallel_index_scan_size; +int min_parallel_rows_size; /* Hook for plugins to get control in set_rel_pathlist() */ set_rel_pathlist_hook_type set_rel_pathlist_hook = NULL; diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index 60949690..5d40d93e 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -494,7 +494,9 @@ cost_gather(GatherPath *path, PlannerInfo *root, #endif double *rows) { +#ifdef __TBASE__ ADJUST_BASESCAN(&path->path, rel_orig, rel, param_info_orig, param_info); +#endif Cost startup_cost = 0; Cost run_cost = 0; @@ -542,10 +544,17 @@ reset_cost_gather(GatherPath *path) */ void cost_gather_merge(GatherMergePath *path, PlannerInfo *root, +#ifdef __TBASE__ + RelOptInfo *rel_orig, ParamPathInfo *param_info_orig, +#else RelOptInfo *rel, ParamPathInfo *param_info, +#endif Cost input_startup_cost, Cost input_total_cost, double *rows) { +#ifdef __TBASE__ + ADJUST_BASESCAN(&path->path, rel_orig, rel, param_info_orig, param_info); +#endif Cost startup_cost = 0; Cost run_cost = 0; Cost comparison_cost; @@ -879,8 +888,13 @@ cost_index(IndexPath *path, PlannerInfo *root, double loop_count, * sequential as for parallel scans the pages are accessed in random * order. 
*/ +#ifdef __TBASE__ + path->path.parallel_workers = compute_parallel_worker(baserel_orig, + rand_heap_pages, index_pages); +#else path->path.parallel_workers = compute_parallel_worker(baserel, rand_heap_pages, index_pages); +#endif /* * Fall out if workers can't be assigned for parallel scan, because in diff --git a/src/backend/optimizer/path/indxpath.c b/src/backend/optimizer/path/indxpath.c index 1e58fbdc..31f75070 100644 --- a/src/backend/optimizer/path/indxpath.c +++ b/src/backend/optimizer/path/indxpath.c @@ -1667,6 +1667,8 @@ bitmap_and_cost_est(PlannerInfo *root, RelOptInfo *rel, List *paths) required_outer); bpath.path.pathkeys = NIL; bpath.bitmapqual = (Path *) &apath; + /* TODO: get real distribution information */ + bpath.path.distribution = NULL; /* * Check the cost of temporary path without considering parallelism. diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index a30ea56e..a98797ff 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -6416,6 +6416,7 @@ make_remotesubplan(PlannerInfo *root, Plan *gather_left = lefttree; Plan *gather_parent = NULL; bool need_sort = true; + double nodes = 1; #endif /* Sanity checks */ @@ -6423,6 +6424,16 @@ make_remotesubplan(PlannerInfo *root, Assert(!IsA(lefttree, RemoteSubplan)); #ifdef __TBASE__ + if (execDistribution && + (execDistribution->distributionType == LOCATOR_TYPE_HASH || + execDistribution->distributionType == LOCATOR_TYPE_SHARD)) + { + nodes = bms_num_members(execDistribution->nodes); + if (nodes <= 0) + /* should not happen, but for safety */ + nodes = 1; + } + if((IsA(lefttree, HashJoin) || IsA(lefttree, SeqScan) || IsA(lefttree, Agg) || IsA(lefttree, Group) || IsA(lefttree, Sort) || IsA(lefttree, Limit) || IsA(lefttree, Gather)) && @@ -6432,18 +6443,17 @@ make_remotesubplan(PlannerInfo *root, distributionType == LOCATOR_TYPE_NONE || distributionType == LOCATOR_TYPE_SHARD)) { - int parallel_threshold_rows = 50000; - if (IsA(lefttree, Gather)) { Gather *gather = (Gather *)lefttree; int nWorkers = gather->num_workers; Plan *leftplan = lefttree->lefttree; - double rows = GetPlanRows(leftplan); + /* rows estimate is cut down to per data nodes, set it to all nodes for parallel estimate. */ + double rows = GetPlanRows(leftplan) * nodes; int heap_parallel_threshold = 0; int heap_parallel_workers = 1; - heap_parallel_threshold = Max(parallel_threshold_rows, 1); + heap_parallel_threshold = Max(min_parallel_rows_size, 1); while (rows >= (heap_parallel_threshold * 3)) { heap_parallel_workers++; @@ -6481,7 +6491,7 @@ make_remotesubplan(PlannerInfo *root, switch(nodeTag(lefttree)) { case T_SeqScan: - if (rows >= parallel_threshold_rows * 3) + if (rows >= min_parallel_rows_size * 3) { lefttree->parallel_aware = true; } @@ -6667,7 +6677,9 @@ make_remotesubplan(PlannerInfo *root, } } - if (rows < parallel_threshold_rows * 3) + /* rows estimate is cut down to per data nodes, set it to all nodes for parallel estimate. 
*/ + rows *= nodes; + if (rows < min_parallel_rows_size * 3) need_parallel = false; if (need_parallel) @@ -6677,7 +6689,7 @@ make_remotesubplan(PlannerInfo *root, Gather *gather_plan = NULL; Plan *subplan = NULL; - heap_parallel_threshold = Max(parallel_threshold_rows, 1); + heap_parallel_threshold = Max(min_parallel_rows_size, 1); while (rows >= (heap_parallel_threshold * 3)) { heap_parallel_workers++; diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index 13cfb45d..038871a6 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -1415,8 +1415,8 @@ create_remotesubplan_path_internal(PlannerInfo *root, Path *subpath, { RemoteSubPath *pathnode; - if (IsA(subpath, GatherPath)) - reset_cost_gather((GatherPath *) subpath); + //if (IsA(subpath, GatherPath)) + //reset_cost_gather((GatherPath *) subpath); pathnode = makeNode(RemoteSubPath); pathnode->path.pathtype = T_RemoteSubplan; diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 0f728e50..35c4981d 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -4133,7 +4133,17 @@ static struct config_int ConfigureNamesInt[] = (512 * 1024) / BLCKSZ, 0, INT_MAX / 3, NULL, NULL, NULL }, - +#ifdef __TBASE__ + { + {"min_parallel_rows_size", PGC_USERSET, QUERY_TUNING_COST, + gettext_noop("Sets the minimum amount of rows for a parallel aggregate or scan."), + gettext_noop("If the planner estimates that it will read rows too small to reach this limit, a parallel plan will not be considered.") + }, + &min_parallel_rows_size, + 50000, 0, INT_MAX / 3, + NULL, NULL, NULL + }, +#endif { /* Can't be set in postgresql.conf */ {"server_version_num", PGC_INTERNAL, PRESET_OPTIONS, diff --git a/src/include/optimizer/paths.h b/src/include/optimizer/paths.h index e32f7688..cf766c0f 100644 --- a/src/include/optimizer/paths.h +++ b/src/include/optimizer/paths.h @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * * paths.h - * prototypes for various files in optimizer/path + * prototypes for various files in optimizer/path * * * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group @@ -21,43 +21,44 @@ * allpaths.c */ extern bool enable_geqo; -extern int geqo_threshold; -extern int min_parallel_table_scan_size; -extern int min_parallel_index_scan_size; +extern int geqo_threshold; +extern int min_parallel_table_scan_size; +extern int min_parallel_index_scan_size; +extern int min_parallel_rows_size; /* Hook for plugins to get control in set_rel_pathlist() */ typedef void (*set_rel_pathlist_hook_type) (PlannerInfo *root, - RelOptInfo *rel, - Index rti, - RangeTblEntry *rte); + RelOptInfo *rel, + Index rti, + RangeTblEntry *rte); extern PGDLLIMPORT set_rel_pathlist_hook_type set_rel_pathlist_hook; /* Hook for plugins to get control in add_paths_to_joinrel() */ typedef void (*set_join_pathlist_hook_type) (PlannerInfo *root, - RelOptInfo *joinrel, - RelOptInfo *outerrel, - RelOptInfo *innerrel, - JoinType jointype, - JoinPathExtraData *extra); + RelOptInfo *joinrel, + RelOptInfo *outerrel, + RelOptInfo *innerrel, + JoinType jointype, + JoinPathExtraData *extra); extern PGDLLIMPORT set_join_pathlist_hook_type set_join_pathlist_hook; /* Hook for plugins to replace standard_join_search() */ typedef RelOptInfo *(*join_search_hook_type) (PlannerInfo *root, - int levels_needed, - List *initial_rels); + int levels_needed, + List *initial_rels); extern PGDLLIMPORT join_search_hook_type 
join_search_hook; extern RelOptInfo *make_one_rel(PlannerInfo *root, List *joinlist); extern void set_dummy_rel_pathlist(RelOptInfo *rel); extern RelOptInfo *standard_join_search(PlannerInfo *root, int levels_needed, - List *initial_rels); + List *initial_rels); extern void generate_gather_paths(PlannerInfo *root, RelOptInfo *rel); extern int compute_parallel_worker(RelOptInfo *rel, double heap_pages, - double index_pages); + double index_pages); extern void create_partial_bitmap_paths(PlannerInfo *root, RelOptInfo *rel, - Path *bitmapqual); + Path *bitmapqual); #ifdef OPTIMIZER_DEBUG extern void debug_print_rel(PlannerInfo *root, RelOptInfo *rel); @@ -65,167 +66,167 @@ extern void debug_print_rel(PlannerInfo *root, RelOptInfo *rel); /* * indxpath.c - * routines to generate index paths + * routines to generate index paths */ extern void create_index_paths(PlannerInfo *root, RelOptInfo *rel); extern bool relation_has_unique_index_for(PlannerInfo *root, RelOptInfo *rel, - List *restrictlist, - List *exprlist, List *oprlist); + List *restrictlist, + List *exprlist, List *oprlist); extern bool indexcol_is_bool_constant_for_query(IndexOptInfo *index, - int indexcol); + int indexcol); extern bool match_index_to_operand(Node *operand, int indexcol, - IndexOptInfo *index); + IndexOptInfo *index); extern void expand_indexqual_conditions(IndexOptInfo *index, - List *indexclauses, List *indexclausecols, - List **indexquals_p, List **indexqualcols_p); + List *indexclauses, List *indexclausecols, + List **indexquals_p, List **indexqualcols_p); extern void check_index_predicates(PlannerInfo *root, RelOptInfo *rel); extern Expr *adjust_rowcompare_for_index(RowCompareExpr *clause, - IndexOptInfo *index, - int indexcol, - List **indexcolnos, - bool *var_on_left_p); + IndexOptInfo *index, + int indexcol, + List **indexcolnos, + bool *var_on_left_p); /* * tidpath.h - * routines to generate tid paths + * routines to generate tid paths */ extern void create_tidscan_paths(PlannerInfo *root, RelOptInfo *rel); /* * joinpath.c - * routines to create join paths + * routines to create join paths */ extern void add_paths_to_joinrel(PlannerInfo *root, RelOptInfo *joinrel, - RelOptInfo *outerrel, RelOptInfo *innerrel, - JoinType jointype, SpecialJoinInfo *sjinfo, - List *restrictlist); + RelOptInfo *outerrel, RelOptInfo *innerrel, + JoinType jointype, SpecialJoinInfo *sjinfo, + List *restrictlist); /* * joinrels.c - * routines to determine which relations to join + * routines to determine which relations to join */ extern void join_search_one_level(PlannerInfo *root, int level); extern RelOptInfo *make_join_rel(PlannerInfo *root, - RelOptInfo *rel1, RelOptInfo *rel2); + RelOptInfo *rel1, RelOptInfo *rel2); extern bool have_join_order_restriction(PlannerInfo *root, - RelOptInfo *rel1, RelOptInfo *rel2); + RelOptInfo *rel1, RelOptInfo *rel2); extern bool have_dangerous_phv(PlannerInfo *root, - Relids outer_relids, Relids inner_params); + Relids outer_relids, Relids inner_params); /* * equivclass.c - * routines for managing EquivalenceClasses + * routines for managing EquivalenceClasses */ typedef bool (*ec_matches_callback_type) (PlannerInfo *root, - RelOptInfo *rel, - EquivalenceClass *ec, - EquivalenceMember *em, - void *arg); + RelOptInfo *rel, + EquivalenceClass *ec, + EquivalenceMember *em, + void *arg); extern bool process_equivalence(PlannerInfo *root, RestrictInfo *restrictinfo, - bool below_outer_join); + bool below_outer_join); extern Expr *canonicalize_ec_expression(Expr *expr, - Oid req_type, Oid 
req_collation); + Oid req_type, Oid req_collation); extern void reconsider_outer_join_clauses(PlannerInfo *root); extern EquivalenceClass *get_eclass_for_sort_expr(PlannerInfo *root, - Expr *expr, - Relids nullable_relids, - List *opfamilies, - Oid opcintype, - Oid collation, - Index sortref, - Relids rel, - bool create_it); + Expr *expr, + Relids nullable_relids, + List *opfamilies, + Oid opcintype, + Oid collation, + Index sortref, + Relids rel, + bool create_it); extern void generate_base_implied_equalities(PlannerInfo *root); extern List *generate_join_implied_equalities(PlannerInfo *root, - Relids join_relids, - Relids outer_relids, - RelOptInfo *inner_rel); + Relids join_relids, + Relids outer_relids, + RelOptInfo *inner_rel); extern List *generate_join_implied_equalities_for_ecs(PlannerInfo *root, - List *eclasses, - Relids join_relids, - Relids outer_relids, - RelOptInfo *inner_rel); + List *eclasses, + Relids join_relids, + Relids outer_relids, + RelOptInfo *inner_rel); extern bool exprs_known_equal(PlannerInfo *root, Node *item1, Node *item2); extern EquivalenceClass *match_eclasses_to_foreign_key_col(PlannerInfo *root, - ForeignKeyOptInfo *fkinfo, - int colno); + ForeignKeyOptInfo *fkinfo, + int colno); extern void add_child_rel_equivalences(PlannerInfo *root, - AppendRelInfo *appinfo, - RelOptInfo *parent_rel, - RelOptInfo *child_rel); + AppendRelInfo *appinfo, + RelOptInfo *parent_rel, + RelOptInfo *child_rel); extern List *generate_implied_equalities_for_column(PlannerInfo *root, - RelOptInfo *rel, - ec_matches_callback_type callback, - void *callback_arg, - Relids prohibited_rels); + RelOptInfo *rel, + ec_matches_callback_type callback, + void *callback_arg, + Relids prohibited_rels); extern bool have_relevant_eclass_joinclause(PlannerInfo *root, - RelOptInfo *rel1, RelOptInfo *rel2); + RelOptInfo *rel1, RelOptInfo *rel2); extern bool has_relevant_eclass_joinclause(PlannerInfo *root, - RelOptInfo *rel1); + RelOptInfo *rel1); extern bool eclass_useful_for_merging(PlannerInfo *root, - EquivalenceClass *eclass, - RelOptInfo *rel); + EquivalenceClass *eclass, + RelOptInfo *rel); extern bool is_redundant_derived_clause(RestrictInfo *rinfo, List *clauselist); /* * pathkeys.c - * utilities for matching and building path keys + * utilities for matching and building path keys */ typedef enum { - PATHKEYS_EQUAL, /* pathkeys are identical */ - PATHKEYS_BETTER1, /* pathkey 1 is a superset of pathkey 2 */ - PATHKEYS_BETTER2, /* vice versa */ - PATHKEYS_DIFFERENT /* neither pathkey includes the other */ + PATHKEYS_EQUAL, /* pathkeys are identical */ + PATHKEYS_BETTER1, /* pathkey 1 is a superset of pathkey 2 */ + PATHKEYS_BETTER2, /* vice versa */ + PATHKEYS_DIFFERENT /* neither pathkey includes the other */ } PathKeysComparison; extern PathKeysComparison compare_pathkeys(List *keys1, List *keys2); extern bool pathkeys_contained_in(List *keys1, List *keys2); extern Path *get_cheapest_path_for_pathkeys(List *paths, List *pathkeys, - Relids required_outer, - CostSelector cost_criterion, - bool require_parallel_safe); + Relids required_outer, + CostSelector cost_criterion, + bool require_parallel_safe); extern Path *get_cheapest_fractional_path_for_pathkeys(List *paths, - List *pathkeys, - Relids required_outer, - double fraction); + List *pathkeys, + Relids required_outer, + double fraction); extern Path *get_cheapest_parallel_safe_total_inner(List *paths); extern List *build_index_pathkeys(PlannerInfo *root, IndexOptInfo *index, - ScanDirection scandir); + ScanDirection scandir); extern 
List *build_expression_pathkey(PlannerInfo *root, Expr *expr, - Relids nullable_relids, Oid opno, - Relids rel, bool create_it); + Relids nullable_relids, Oid opno, + Relids rel, bool create_it); extern List *convert_subquery_pathkeys(PlannerInfo *root, RelOptInfo *rel, - List *subquery_pathkeys, - List *subquery_tlist); + List *subquery_pathkeys, + List *subquery_tlist); extern List *build_join_pathkeys(PlannerInfo *root, - RelOptInfo *joinrel, - JoinType jointype, - List *outer_pathkeys); + RelOptInfo *joinrel, + JoinType jointype, + List *outer_pathkeys); extern List *make_pathkeys_for_sortclauses(PlannerInfo *root, - List *sortclauses, - List *tlist); + List *sortclauses, + List *tlist); extern void initialize_mergeclause_eclasses(PlannerInfo *root, - RestrictInfo *restrictinfo); + RestrictInfo *restrictinfo); extern void update_mergeclause_eclasses(PlannerInfo *root, - RestrictInfo *restrictinfo); + RestrictInfo *restrictinfo); extern List *find_mergeclauses_for_pathkeys(PlannerInfo *root, - List *pathkeys, - bool outer_keys, - List *restrictinfos); + List *pathkeys, + bool outer_keys, + List *restrictinfos); extern List *select_outer_pathkeys_for_merge(PlannerInfo *root, - List *mergeclauses, - RelOptInfo *joinrel); + List *mergeclauses, + RelOptInfo *joinrel); extern List *make_inner_pathkeys_for_merge(PlannerInfo *root, - List *mergeclauses, - List *outer_pathkeys); + List *mergeclauses, + List *outer_pathkeys); extern List *truncate_useless_pathkeys(PlannerInfo *root, - RelOptInfo *rel, - List *pathkeys); + RelOptInfo *rel, + List *pathkeys); extern bool has_useful_pathkeys(PlannerInfo *root, RelOptInfo *rel); extern PathKey *make_canonical_pathkey(PlannerInfo *root, - EquivalenceClass *eclass, Oid opfamily, - int strategy, bool nulls_first); + EquivalenceClass *eclass, Oid opfamily, + int strategy, bool nulls_first); -#endif /* PATHS_H */ +#endif /* PATHS_H */ From c2d9b972409d601ab35898b6ee28676ced36eb28 Mon Sep 17 00:00:00 2001 From: andrelin Date: Tue, 2 Feb 2021 19:52:42 +0800 Subject: [PATCH 121/578] make group estimate compatible with multi datanodes --- src/backend/utils/adt/selfuncs.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index 06b1d9fa..e24d7193 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -3541,6 +3541,20 @@ estimate_num_groups(PlannerInfo *root, List *groupExprs, double input_rows, */ double clamp = rel->tuples; +#ifdef __TBASE__ + double nodes = 1; + if (list_length(rel->pathlist) > 0) + { + Path *path = linitial(rel->pathlist); + if (path->distribution && + (path->distribution->distributionType == LOCATOR_TYPE_HASH || + path->distribution->distributionType == LOCATOR_TYPE_SHARD)) + nodes = bms_num_members(path->distribution->nodes); + /* for sanity */ + if (nodes < 1) + nodes = 1; + } +#endif if (relvarcount > 1) { clamp *= 0.1; @@ -3600,7 +3614,11 @@ estimate_num_groups(PlannerInfo *root, List *groupExprs, double input_rows, (1 - pow((rel->tuples - rel->rows) / rel->tuples, rel->tuples / reldistinct)); } +#ifdef __TBASE__ + reldistinct = clamp_row_est(reldistinct / nodes); +#else reldistinct = clamp_row_est(reldistinct); +#endif /* * Update estimate of total distinct groups. 
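The patches above (118 through 121) all lean on the same convention: some planner estimates (rel->tuples, plan_rows, reldistinct) are kept per datanode while others are cluster-wide, so whenever the relevant path carries a hash- or shard-distributed Distribution the code converts between the two by multiplying or dividing by the number of datanodes in that distribution. The helper below is a minimal consolidated sketch of that recurring pattern, written here for illustration only: a function named path_distribution_nodes does not exist in the tree, and the sketch assumes the usual planner definitions (Path, Distribution, LOCATOR_TYPE_HASH/LOCATOR_TYPE_SHARD, bms_num_members) that the patches themselves use.

static double
path_distribution_nodes(Path *path)
{
	double		nodes = 1;

	/*
	 * Only hash- and shard-distributed paths spread their rows across
	 * several datanodes; replicated or purely local paths keep the
	 * scaling factor at 1.
	 */
	if (path->distribution &&
		(path->distribution->distributionType == LOCATOR_TYPE_HASH ||
		 path->distribution->distributionType == LOCATOR_TYPE_SHARD))
		nodes = bms_num_members(path->distribution->nodes);

	/* An empty or unknown node set must not zero out the estimate. */
	if (nodes < 1)
		nodes = 1;

	return nodes;
}

In estimate_num_groups() this corresponds to reldistinct = clamp_row_est(reldistinct / nodes): with four datanodes and a cluster-wide estimate of 40000 distinct groups, each node is costed for roughly 10000 groups, which matches the rows a single per-node plan fragment will actually process.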
From cfb34f5b9ebdca9b0155f3802cc8ab830ab7b50b Mon Sep 17 00:00:00 2001 From: andrelin Date: Wed, 3 Feb 2021 19:57:47 +0800 Subject: [PATCH 122/578] Adjust remote cost --- src/backend/optimizer/path/costsize.c | 7 ++++--- src/backend/optimizer/util/pathnode.c | 2 +- src/include/optimizer/cost.h | 3 ++- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index 5d40d93e..4c816ac7 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -5591,7 +5591,8 @@ page_size(double tuples, int width) void cost_remote_subplan(Path *path, Cost input_startup_cost, Cost input_total_cost, - double tuples, int width, int replication) + double tuples, int width, int replication, + int nworkers) { Cost startup_cost = input_startup_cost + remote_query_cost; Cost run_cost = input_total_cost - input_startup_cost; @@ -5601,12 +5602,12 @@ cost_remote_subplan(Path *path, /* * Charge 2x cpu_operator_cost per tuple to reflect bookkeeping overhead. */ - run_cost += 2 * cpu_operator_cost * tuples; + run_cost += 2 * cpu_operator_cost * tuples * nworkers; /* * Estimate cost of sending data over network */ - run_cost += network_byte_cost * tuples * width * replication; + run_cost += network_byte_cost * tuples * width * replication * nworkers; path->startup_cost = startup_cost; path->total_cost = startup_cost + run_cost; diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index 038871a6..e5551e0c 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -1434,7 +1434,7 @@ create_remotesubplan_path_internal(PlannerInfo *root, Path *subpath, cost_remote_subplan((Path *) pathnode, subpath->startup_cost + additional_startup_cost, subpath->total_cost + additional_total_cost, subpath->rows, - rel->reltarget->width, replication); + rel->reltarget->width, replication, subpath->parallel_workers); return (Path *) pathnode; } diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h index 2198c9db..358e83b9 100644 --- a/src/include/optimizer/cost.h +++ b/src/include/optimizer/cost.h @@ -177,7 +177,8 @@ extern void cost_qual_eval_node(QualCost *cost, Node *qual, PlannerInfo *root); #ifdef XCP extern void cost_remote_subplan(Path *path, Cost input_startup_cost, Cost input_total_cost, - double tuples, int width, int replication); + double tuples, int width, int replication, + int nworkers); #endif extern void compute_semi_anti_join_factors(PlannerInfo *root, RelOptInfo *outerrel, From 54ed9c61430726cd9a6b1bad22ee6f5d4db351fe Mon Sep 17 00:00:00 2001 From: andrelin Date: Fri, 5 Feb 2021 10:29:37 +0800 Subject: [PATCH 123/578] Support parallel nestloop under remote sub query --- src/backend/optimizer/plan/createplan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index a98797ff..31bbb2d6 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -6434,7 +6434,7 @@ make_remotesubplan(PlannerInfo *root, nodes = 1; } - if((IsA(lefttree, HashJoin) || IsA(lefttree, SeqScan) + if((IsA(lefttree, HashJoin) || IsA(lefttree, NestLoop) || IsA(lefttree, SeqScan) || IsA(lefttree, Agg) || IsA(lefttree, Group) || IsA(lefttree, Sort) || IsA(lefttree, Limit) || IsA(lefttree, Gather)) && max_parallel_workers_per_gather && root->glob->parallelModeOK && From 225c6a482de94c82dd0ed1d8588944e5ce936f49 
Mon Sep 17 00:00:00 2001 From: andrelin Date: Fri, 5 Feb 2021 11:27:22 +0800 Subject: [PATCH 124/578] Cover regress expect --- src/test/regress/expected/create_index_1.out | 18 +- src/test/regress/expected/fast_default.out | 12 +- src/test/regress/expected/groupingsets.out | 5 +- src/test/regress/expected/inherit_3.out | 83 ++- .../regress/expected/insert_conflict_1.out | 8 +- src/test/regress/expected/join_3.out | 512 +++++++++--------- src/test/regress/expected/privileges.out | 27 +- src/test/regress/expected/rowsecurity_1.out | 31 +- src/test/regress/expected/select_views.out | 19 +- src/test/regress/expected/stats_ext_2.out | 96 ++-- src/test/regress/expected/subselect.out | 38 +- src/test/regress/expected/xc_FQS_2.out | 12 +- src/test/regress/expected/xc_FQS_join_1.out | 104 ++-- src/test/regress/expected/xc_groupby_1.out | 216 +++----- src/test/regress/expected/xc_having_1.out | 19 +- src/test/regress/expected/xl_join.out | 33 +- 16 files changed, 571 insertions(+), 662 deletions(-) diff --git a/src/test/regress/expected/create_index_1.out b/src/test/regress/expected/create_index_1.out index 924c7c95..32acf6f1 100644 --- a/src/test/regress/expected/create_index_1.out +++ b/src/test/regress/expected/create_index_1.out @@ -2490,15 +2490,13 @@ CREATE INDEX hash_tuplesort_idx ON tenk1 USING hash (stringu1 name_ops) WITH (fi EXPLAIN (COSTS OFF) SELECT count(*) FROM tenk1 WHERE stringu1 = 'TVAAAA'; QUERY PLAN -------------------------------------------------------------------- +---------------------------------------------------------------- Finalize Aggregate -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Partial Aggregate - -> Bitmap Heap Scan on tenk1 - Recheck Cond: (stringu1 = 'TVAAAA'::name) - -> Bitmap Index Scan on hash_tuplesort_idx + -> Index Scan using hash_tuplesort_idx on tenk1 Index Cond: (stringu1 = 'TVAAAA'::name) -(7 rows) +(5 rows) SELECT count(*) FROM tenk1 WHERE stringu1 = 'TVAAAA'; count @@ -2986,15 +2984,11 @@ SELECT unique1 FROM tenk1 WHERE unique1 IN (1,42,7) ORDER BY unique1; QUERY PLAN -------------------------------------------------------------------------- +------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) - -> Sort - Sort Key: unique1 - -> Bitmap Heap Scan on tenk1 - Recheck Cond: (unique1 = ANY ('{1,42,7}'::integer[])) - -> Bitmap Index Scan on tenk1_unique1 + -> Index Only Scan using tenk1_unique1 on tenk1 Index Cond: (unique1 = ANY ('{1,42,7}'::integer[])) -(7 rows) +(3 rows) SELECT unique1 FROM tenk1 WHERE unique1 IN (1,42,7) diff --git a/src/test/regress/expected/fast_default.out b/src/test/regress/expected/fast_default.out index f2d63e30..16c60821 100644 --- a/src/test/regress/expected/fast_default.out +++ b/src/test/regress/expected/fast_default.out @@ -452,18 +452,16 @@ DELETE FROM T WHERE pk BETWEEN 10 AND 20 RETURNING *; EXPLAIN (VERBOSE TRUE, COSTS FALSE) DELETE FROM T WHERE pk BETWEEN 10 AND 20 RETURNING *; - QUERY PLAN ------------------------------------------------------------------ + QUERY PLAN +------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) Output: pk, c_bigint, c_text -> Delete on fast_default.t Output: pk, c_bigint, c_text - -> Bitmap Heap Scan on fast_default.t + -> Seq Scan on fast_default.t Output: xc_node_id, ctid, shardid, pk - Recheck Cond: ((t.pk >= 10) AND (t.pk <= 20)) - -> Bitmap Index Scan on t_pkey - Index Cond: ((t.pk >= 10) AND (t.pk <= 20)) -(9 rows) + Filter: ((t.pk >= 10) AND (t.pk <= 20)) 
+(7 rows) -- UPDATE UPDATE T SET c_text = '"' || c_text || '"' WHERE pk < 10; diff --git a/src/test/regress/expected/groupingsets.out b/src/test/regress/expected/groupingsets.out index 032ef9c2..e1524f49 100644 --- a/src/test/regress/expected/groupingsets.out +++ b/src/test/regress/expected/groupingsets.out @@ -1456,13 +1456,12 @@ explain (costs off) Hash Key: ten Hash Key: hundred Hash Key: thousand + Hash Key: twothousand Group Key: unique1 - Sort Key: twothousand - Group Key: twothousand -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Sort Sort Key: unique1 -> Seq Scan on tenk1 -(13 rows) +(12 rows) -- end diff --git a/src/test/regress/expected/inherit_3.out b/src/test/regress/expected/inherit_3.out index 6723ae3a..d0ff897f 100644 --- a/src/test/regress/expected/inherit_3.out +++ b/src/test/regress/expected/inherit_3.out @@ -1522,8 +1522,8 @@ vacuum analyze patest2; analyze int4_tbl; explain (costs off, num_nodes off, nodes off) select * from patest0 join (select f1 from int4_tbl where f1 = 0 limit 1) ss on id = f1; - QUERY PLAN ----------------------------------------------------------------- + QUERY PLAN +-------------------------------------------------------------- Nested Loop -> Limit -> Remote Subquery Scan on all @@ -1533,15 +1533,13 @@ select * from patest0 join (select f1 from int4_tbl where f1 = 0 limit 1) ss on -> Materialize -> Remote Subquery Scan on all -> Append - -> Bitmap Heap Scan on patest0 - Recheck Cond: (id = int4_tbl.f1) - -> Bitmap Index Scan on patest0i - Index Cond: (id = int4_tbl.f1) + -> Index Scan using patest0i on patest0 + Index Cond: (id = int4_tbl.f1) -> Index Scan using patest1i on patest1 Index Cond: (id = int4_tbl.f1) -> Index Scan using patest2i on patest2 Index Cond: (id = int4_tbl.f1) -(17 rows) +(15 rows) select * from patest0 join (select f1 from int4_tbl where f1 = 0 limit 1) ss on id = f1; id | x | f1 @@ -1554,8 +1552,8 @@ select * from patest0 join (select f1 from int4_tbl where f1 = 0 limit 1) ss on drop index patest2i; explain (costs off, num_nodes off, nodes off) select * from patest0 join (select f1 from int4_tbl where f1 = 0 limit 1) ss on id = f1; - QUERY PLAN ----------------------------------------------------------------- + QUERY PLAN +-------------------------------------------------------------- Nested Loop -> Limit -> Remote Subquery Scan on all @@ -1565,15 +1563,13 @@ select * from patest0 join (select f1 from int4_tbl where f1 = 0 limit 1) ss on -> Materialize -> Remote Subquery Scan on all -> Append - -> Bitmap Heap Scan on patest0 - Recheck Cond: (id = int4_tbl.f1) - -> Bitmap Index Scan on patest0i - Index Cond: (id = int4_tbl.f1) + -> Index Scan using patest0i on patest0 + Index Cond: (id = int4_tbl.f1) -> Index Scan using patest1i on patest1 Index Cond: (id = int4_tbl.f1) -> Seq Scan on patest2 Filter: (int4_tbl.f1 = id) -(17 rows) +(15 rows) select * from patest0 join (select f1 from int4_tbl where f1 = 0 limit 1) ss on id = f1; id | x | f1 @@ -1788,15 +1784,13 @@ SELECT thousand, thousand FROM tenk1 ORDER BY thousand, tenthous; QUERY PLAN ------------------------------------------------------------------------------- - Merge Append - Sort Key: tenk1.thousand, tenk1.tenthous - -> Remote Subquery Scan on all - -> Index Only Scan using tenk1_thous_tenthous on tenk1 - -> Remote Subquery Scan on all - -> Sort - Sort Key: tenk1_1.thousand, tenk1_1.thousand + Remote Subquery Scan on all + -> Sort + Sort Key: tenk1.thousand, tenk1.tenthous + -> Append + -> Index Only Scan using tenk1_thous_tenthous on tenk1 -> Index Only 
Scan using tenk1_thous_tenthous on tenk1 tenk1_1 -(8 rows) +(6 rows) explain (costs off, num_nodes off, nodes off) SELECT thousand, tenthous, thousand+tenthous AS x FROM tenk1 @@ -1805,15 +1799,13 @@ SELECT 42, 42, hundred FROM tenk1 ORDER BY thousand, tenthous; QUERY PLAN ------------------------------------------------------------------------ - Merge Append - Sort Key: tenk1.thousand, tenk1.tenthous - -> Remote Subquery Scan on all - -> Index Only Scan using tenk1_thous_tenthous on tenk1 - -> Remote Subquery Scan on all - -> Sort - Sort Key: 42, 42 + Remote Subquery Scan on all + -> Sort + Sort Key: tenk1.thousand, tenk1.tenthous + -> Append + -> Index Only Scan using tenk1_thous_tenthous on tenk1 -> Index Only Scan using tenk1_hundred on tenk1 tenk1_1 -(8 rows) +(6 rows) explain (costs off, num_nodes off, nodes off) SELECT thousand, tenthous FROM tenk1 @@ -1822,15 +1814,14 @@ SELECT thousand, random()::integer FROM tenk1 ORDER BY thousand, tenthous; QUERY PLAN ------------------------------------------------------------------------------- - Merge Append + Sort Sort Key: tenk1.thousand, tenk1.tenthous - -> Remote Subquery Scan on all - -> Index Only Scan using tenk1_thous_tenthous on tenk1 - -> Remote Subquery Scan on all - -> Sort - Sort Key: tenk1_1.thousand, ((random())::integer) + -> Append + -> Remote Subquery Scan on all + -> Index Only Scan using tenk1_thous_tenthous on tenk1 + -> Remote Subquery Scan on all -> Index Only Scan using tenk1_thous_tenthous on tenk1 tenk1_1 -(8 rows) +(7 rows) -- Check min/max aggregate optimization explain (costs off, num_nodes off, nodes off) @@ -1880,17 +1871,15 @@ SELECT x, y FROM UNION ALL SELECT unique2 AS x, unique2 AS y FROM tenk1 b) s ORDER BY x, y; - QUERY PLAN -------------------------------------------------------------------- - Merge Append - Sort Key: a.thousand, a.tenthous - -> Remote Subquery Scan on all - -> Index Only Scan using tenk1_thous_tenthous on tenk1 a - -> Remote Subquery Scan on all - -> Sort - Sort Key: b.unique2, b.unique2 + QUERY PLAN +------------------------------------------------------------------------- + Remote Subquery Scan on all + -> Sort + Sort Key: a.thousand, a.tenthous + -> Append + -> Index Only Scan using tenk1_thous_tenthous on tenk1 a -> Index Only Scan using tenk1_unique2 on tenk1 b -(8 rows) +(6 rows) -- exercise rescan code path via a repeatedly-evaluated subquery explain (costs off) diff --git a/src/test/regress/expected/insert_conflict_1.out b/src/test/regress/expected/insert_conflict_1.out index 042c1c00..1dce5ece 100644 --- a/src/test/regress/expected/insert_conflict_1.out +++ b/src/test/regress/expected/insert_conflict_1.out @@ -51,8 +51,8 @@ explain (costs off) insert into insertconflicttest values(0, 'Crowberry') on con explain (costs off) insert into insertconflicttest values(0, 'Crowberry') on conflict (key, fruit) do update set fruit = excluded.fruit where exists (select 1 from insertconflicttest ii where ii.key = excluded.key); - QUERY PLAN ----------------------------------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_2) -> Insert on insertconflicttest Conflict Resolution: UPDATE @@ -61,8 +61,8 @@ explain (costs off) insert into insertconflicttest values(0, 'Crowberry') on con -> Result SubPlan 1 -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Index Only Scan using both_index_expr_key on insertconflicttest ii - Index Cond: (key = 
excluded.key) + -> Seq Scan on insertconflicttest ii + Filter: (key = excluded.key) (10 rows) -- Neither collation nor operator class specifications are required -- diff --git a/src/test/regress/expected/join_3.out b/src/test/regress/expected/join_3.out index ce5f9512..a1c6c31b 100644 --- a/src/test/regress/expected/join_3.out +++ b/src/test/regress/expected/join_3.out @@ -1867,6 +1867,30 @@ SELECT '' AS "xxx", * | 1 | 4 | one | -1 (1 row) +-- +-- semijoin selectivity for <> +-- +explain (costs off) +select * from int4_tbl i4, tenk1 a +where exists(select * from tenk1 b + where a.twothousand = b.twothousand and a.fivethous <> b.fivethous) + and i4.f1 = a.tenthous; + QUERY PLAN +----------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Hash Join + Hash Cond: (a.tenthous = i4.f1) + -> Hash Semi Join + Hash Cond: (a.twothousand = b.twothousand) + Join Filter: (a.fivethous <> b.fivethous) + -> Seq Scan on tenk1 a + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on tenk1 b + -> Hash + -> Seq Scan on int4_tbl i4 +(12 rows) + -- -- More complicated constructs -- @@ -3014,8 +3038,8 @@ select * from int4(sin(1)) q1, int4(sin(0)) q2 where thousand = (q1 + q2); - QUERY PLAN --------------------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------------------------ Nested Loop Join Filter: (tenk1.twothousand = int4_tbl.f1) -> Nested Loop @@ -3024,14 +3048,12 @@ where thousand = (q1 + q2); -> Function Scan on q2 -> Materialize -> Remote Subquery Scan on all - -> Bitmap Heap Scan on tenk1 - Recheck Cond: (thousand = (q1.q1 + q2.q2)) - -> Bitmap Index Scan on tenk1_thous_tenthous - Index Cond: (thousand = (q1.q1 + q2.q2)) + -> Index Scan using tenk1_thous_tenthous on tenk1 + Index Cond: (thousand = (q1.q1 + q2.q2)) -> Materialize -> Remote Subquery Scan on all -> Seq Scan on int4_tbl -(15 rows) +(13 rows) set enable_hashjoin = true; -- @@ -3149,22 +3171,22 @@ select * from tenk1 a join tenk1 b on Nested Loop Join Filter: (((a.unique1 = 1) AND (b.unique1 = 2)) OR ((a.unique2 = 3) AND (b.hundred = 4))) -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Bitmap Heap Scan on tenk1 b - Recheck Cond: ((unique1 = 2) OR (hundred = 4)) + -> Bitmap Heap Scan on tenk1 a + Recheck Cond: ((unique1 = 1) OR (unique2 = 3)) -> BitmapOr -> Bitmap Index Scan on tenk1_unique1 - Index Cond: (unique1 = 2) - -> Bitmap Index Scan on tenk1_hundred - Index Cond: (hundred = 4) + Index Cond: (unique1 = 1) + -> Bitmap Index Scan on tenk1_unique2 + Index Cond: (unique2 = 3) -> Materialize -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Bitmap Heap Scan on tenk1 a - Recheck Cond: ((unique1 = 1) OR (unique2 = 3)) + -> Bitmap Heap Scan on tenk1 b + Recheck Cond: ((unique1 = 2) OR (hundred = 4)) -> BitmapOr -> Bitmap Index Scan on tenk1_unique1 - Index Cond: (unique1 = 1) - -> Bitmap Index Scan on tenk1_unique2 - Index Cond: (unique2 = 3) + Index Cond: (unique1 = 2) + -> Bitmap Index Scan on tenk1_hundred + Index Cond: (hundred = 4) (19 rows) explain (costs off) @@ -3175,17 +3197,17 @@ select * from tenk1 a join tenk1 b on Nested Loop Join Filter: (((a.unique1 = 1) AND (b.unique1 = 2)) OR ((a.unique2 = 3) AND (b.ten = 4))) -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Seq Scan on tenk1 b - Filter: ((unique1 = 2) OR (ten = 4)) + -> Bitmap Heap Scan on tenk1 a + Recheck Cond: ((unique1 = 1) OR (unique2 = 3)) + -> BitmapOr + -> 
Bitmap Index Scan on tenk1_unique1 + Index Cond: (unique1 = 1) + -> Bitmap Index Scan on tenk1_unique2 + Index Cond: (unique2 = 3) -> Materialize -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Bitmap Heap Scan on tenk1 a - Recheck Cond: ((unique1 = 1) OR (unique2 = 3)) - -> BitmapOr - -> Bitmap Index Scan on tenk1_unique1 - Index Cond: (unique1 = 1) - -> Bitmap Index Scan on tenk1_unique2 - Index Cond: (unique2 = 3) + -> Seq Scan on tenk1 b + Filter: ((unique1 = 2) OR (ten = 4)) (14 rows) explain (costs off) @@ -3245,11 +3267,9 @@ where t1.unique1 = 1; -> Hash -> Remote Subquery Scan on all Distribute results by H: thousand - -> Bitmap Heap Scan on tenk1 t2 - Recheck Cond: (t1.hundred = hundred) - -> Bitmap Index Scan on tenk1_hundred - Index Cond: (t1.hundred = hundred) -(22 rows) + -> Index Scan using tenk1_hundred on tenk1 t2 + Index Cond: (t1.hundred = hundred) +(20 rows) explain (num_nodes off, nodes off, costs off) select * from tenk1 t1 left join @@ -3276,19 +3296,17 @@ where t1.unique1 = 1; -> Hash -> Remote Subquery Scan on all Distribute results by H: thousand - -> Bitmap Heap Scan on tenk1 t2 - Recheck Cond: (t1.hundred = hundred) - -> Bitmap Index Scan on tenk1_hundred - Index Cond: (t1.hundred = hundred) -(22 rows) + -> Index Scan using tenk1_hundred on tenk1 t2 + Index Cond: (t1.hundred = hundred) +(20 rows) explain (num_nodes off, nodes off, costs off) select count(*) from tenk1 a join tenk1 b on a.unique1 = b.unique2 left join tenk1 c on a.unique2 = b.unique1 and c.thousand = a.thousand join int4_tbl on b.thousand = f1; - QUERY PLAN -------------------------------------------------------------------------------------------------------- + QUERY PLAN +-------------------------------------------------------------------------------------------------------- Finalize Aggregate -> Remote Subquery Scan on all -> Partial Aggregate @@ -3306,13 +3324,11 @@ select count(*) from Distribute results by H: unique2 -> Nested Loop -> Seq Scan on int4_tbl - -> Bitmap Heap Scan on tenk1 b - Recheck Cond: (thousand = int4_tbl.f1) - -> Bitmap Index Scan on tenk1_thous_tenthous - Index Cond: (thousand = int4_tbl.f1) + -> Index Scan using tenk1_thous_tenthous on tenk1 b + Index Cond: (thousand = int4_tbl.f1) -> Index Scan using tenk1_unique1 on tenk1 a Index Cond: (unique1 = b.unique2) -(23 rows) +(21 rows) select count(*) from tenk1 a join tenk1 b on a.unique1 = b.unique2 @@ -3330,8 +3346,8 @@ select b.unique1 from join int4_tbl i1 on b.thousand = f1 right join int4_tbl i2 on i2.f1 = b.tenthous order by 1; - QUERY PLAN -------------------------------------------------------------------------------------------------------------- + QUERY PLAN +-------------------------------------------------------------------------------------------------------------- Remote Subquery Scan on all -> Sort Sort Key: b.unique1 @@ -3353,17 +3369,15 @@ select b.unique1 from Distribute results by H: unique2 -> Nested Loop -> Seq Scan on int4_tbl i1 - -> Bitmap Heap Scan on tenk1 b - Recheck Cond: (thousand = i1.f1) - -> Bitmap Index Scan on tenk1_thous_tenthous - Index Cond: (thousand = i1.f1) + -> Index Scan using tenk1_thous_tenthous on tenk1 b + Index Cond: (thousand = i1.f1) -> Index Scan using tenk1_unique1 on tenk1 a Index Cond: (unique1 = b.unique2) -> Hash -> Remote Subquery Scan on all Distribute results by H: f1 -> Seq Scan on int4_tbl i2 -(31 rows) +(29 rows) select b.unique1 from tenk1 a join tenk1 b on a.unique1 = b.unique2 @@ -3391,16 +3405,16 @@ order by fault; QUERY PLAN 
-------------------------------------------------------------------------------- Remote Subquery Scan on all - -> Nested Loop Left Join + -> Hash Right Join + Hash Cond: (tenk1.unique2 = int8_tbl.q2) Filter: ((COALESCE(tenk1.unique1, '-1'::integer) + int8_tbl.q1) = 122) -> Remote Subquery Scan on all - Distribute results by H: q2 - -> Seq Scan on int8_tbl - -> Materialize + Distribute results by H: unique2 + -> Seq Scan on tenk1 + -> Hash -> Remote Subquery Scan on all - Distribute results by H: unique2 - -> Index Scan using tenk1_unique2 on tenk1 - Index Cond: (int8_tbl.q2 = unique2) + Distribute results by H: q2 + -> Seq Scan on int8_tbl (11 rows) select * from @@ -3425,16 +3439,16 @@ select q1, unique2, thousand, hundred QUERY PLAN -------------------------------------------------------------------------------------------- Remote Subquery Scan on all - -> Nested Loop Left Join + -> Hash Right Join + Hash Cond: (b.unique2 = a.q1) Filter: ((COALESCE(b.thousand, 123) = a.q1) AND (a.q1 = COALESCE(b.hundred, 123))) -> Remote Subquery Scan on all - Distribute results by H: q1 - -> Seq Scan on int8_tbl a - -> Materialize + Distribute results by H: COALESCE(thousand, 123) + -> Seq Scan on tenk1 b + -> Hash -> Remote Subquery Scan on all - Distribute results by H: COALESCE(thousand, 123) - -> Index Scan using tenk1_unique2 on tenk1 b - Index Cond: (a.q1 = unique2) + Distribute results by H: q1 + -> Seq Scan on int8_tbl a (11 rows) select q1, unique2, thousand, hundred @@ -3451,16 +3465,16 @@ select f1, unique2, case when unique2 is null then f1 else 0 end QUERY PLAN -------------------------------------------------------------------------- Remote Subquery Scan on all - -> Nested Loop Left Join + -> Hash Right Join + Hash Cond: (b.unique2 = a.f1) Filter: (CASE WHEN (b.unique2 IS NULL) THEN a.f1 ELSE 0 END = 0) -> Remote Subquery Scan on all - Distribute results by H: f1 - -> Seq Scan on int4_tbl a - -> Materialize + Distribute results by H: unique2 + -> Seq Scan on tenk1 b + -> Hash -> Remote Subquery Scan on all - Distribute results by H: unique2 - -> Index Only Scan using tenk1_unique2 on tenk1 b - Index Cond: (unique2 = a.f1) + Distribute results by H: f1 + -> Seq Scan on int4_tbl a (11 rows) select f1, unique2, case when unique2 is null then f1 else 0 end @@ -3520,34 +3534,33 @@ left join using (join_key) ) foo3 using (join_key); - QUERY PLAN --------------------------------------------------------------------------------------- + QUERY PLAN +----------------------------------------------------------------------- Hash Right Join Output: "*VALUES*".column1, i1.f1, (666) Hash Cond: (i1.f1 = "*VALUES*".column1) -> Remote Subquery Scan on all (datanode_1,datanode_2) Output: i1.f1, 666 - -> Nested Loop Left Join + -> Hash Right Join Output: i1.f1, 666 - -> Remote Subquery Scan on all (datanode_1) - Output: i1.f1 - Distribute results by H: f1 - -> Seq Scan on public.int4_tbl i1 - Output: i1.f1 - -> Materialize + Hash Cond: (i2.unique2 = i1.f1) + -> Remote Subquery Scan on all (datanode_1,datanode_2) Output: i2.unique2 - -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: unique2 + -> Seq Scan on public.tenk1 i2 Output: i2.unique2 - Distribute results by H: unique2 - Sort Key: i2.unique2 - -> Index Only Scan using tenk1_unique2 on public.tenk1 i2 - Output: i2.unique2 - Index Cond: (i2.unique2 = i1.f1) + -> Hash + Output: i1.f1 + -> Remote Subquery Scan on all (datanode_1) + Output: i1.f1 + Distribute results by H: f1 + -> Seq Scan on public.int4_tbl i1 + Output: i1.f1 
-> Hash Output: "*VALUES*".column1 -> Values Scan on "*VALUES*" Output: "*VALUES*".column1 -(25 rows) +(24 rows) select foo1.join_key as foo1_id, foo3.join_key AS foo3_id, bug_field from (values (0),(1)) foo1(join_key) @@ -3584,8 +3597,8 @@ select t1.* from on (t1.f1 = b1.d1) left join int4_tbl i4 on (i8.q2 = i4.f1); - QUERY PLAN ----------------------------------------------------------------------------------- + QUERY PLAN +---------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) Output: t1.f1 -> Nested Loop Left Join @@ -3596,29 +3609,27 @@ select t1.* from Join Filter: (t1.f1 = '***'::text) -> Seq Scan on public.text_tbl t1 Output: t1.f1 - -> Materialize + -> Nested Loop Left Join Output: i8.q2 - -> Nested Loop Left Join - Output: i8.q2 - Join Filter: ((NULL::integer) = i8b1.q2) - -> Seq Scan on public.int8_tbl i8b1 - Output: i8b1.q1, i8b1.q2 - -> Materialize + Join Filter: ((NULL::integer) = i8b1.q2) + -> Seq Scan on public.int8_tbl i8b1 + Output: i8b1.q1, i8b1.q2 + -> Materialize + Output: i8.q2, (NULL::integer) + -> Nested Loop Left Join Output: i8.q2, (NULL::integer) - -> Nested Loop Left Join - Output: i8.q2, (NULL::integer) - Join Filter: (i8.q1 = i8b2.q1) - -> Seq Scan on public.int8_tbl i8 - Output: i8.q1, i8.q2 - -> Materialize - Output: i8b2.q1, (NULL::integer) - -> Seq Scan on public.int8_tbl i8b2 - Output: i8b2.q1, NULL::integer + Join Filter: (i8.q1 = i8b2.q1) + -> Seq Scan on public.int8_tbl i8 + Output: i8.q1, i8.q2 + -> Materialize + Output: i8b2.q1, (NULL::integer) + -> Seq Scan on public.int8_tbl i8b2 + Output: i8b2.q1, NULL::integer -> Materialize Output: i4.f1 -> Seq Scan on public.int4_tbl i4 Output: i4.f1 -(32 rows) +(30 rows) select t1.* from text_tbl t1 @@ -3647,8 +3658,8 @@ select t1.* from on (t1.f1 = b1.d1) left join int4_tbl i4 on (i8.q2 = i4.f1); - QUERY PLAN ----------------------------------------------------------------------------------------------- + QUERY PLAN +---------------------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) Output: t1.f1 -> Nested Loop Left Join @@ -3659,33 +3670,31 @@ select t1.* from Join Filter: (t1.f1 = '***'::text) -> Seq Scan on public.text_tbl t1 Output: t1.f1 - -> Materialize + -> Nested Loop Left Join Output: i8.q2 - -> Nested Loop Left Join - Output: i8.q2 - Join Filter: ((NULL::integer) = i8b1.q2) - -> Seq Scan on public.int8_tbl i8b1 - Output: i8b1.q1, i8b1.q2 - -> Materialize + Join Filter: ((NULL::integer) = i8b1.q2) + -> Seq Scan on public.int8_tbl i8b1 + Output: i8b1.q1, i8b1.q2 + -> Materialize + Output: i8.q2, (NULL::integer) + -> Nested Loop Left Join Output: i8.q2, (NULL::integer) - -> Nested Loop Left Join - Output: i8.q2, (NULL::integer) - Join Filter: (i8.q1 = i8b2.q1) - -> Seq Scan on public.int8_tbl i8 - Output: i8.q1, i8.q2 - -> Materialize - Output: i8b2.q1, (NULL::integer) - -> Nested Loop - Output: i8b2.q1, NULL::integer - -> Seq Scan on public.int8_tbl i8b2 - Output: i8b2.q1, i8b2.q2 - -> Materialize - -> Seq Scan on public.int4_tbl i4b2 + Join Filter: (i8.q1 = i8b2.q1) + -> Seq Scan on public.int8_tbl i8 + Output: i8.q1, i8.q2 + -> Materialize + Output: i8b2.q1, (NULL::integer) + -> Nested Loop + Output: i8b2.q1, NULL::integer + -> Seq Scan on public.int8_tbl i8b2 + Output: i8b2.q1, i8b2.q2 + -> Materialize + -> Seq Scan on public.int4_tbl i4b2 -> Materialize Output: i4.f1 -> Seq Scan on public.int4_tbl i4 Output: i4.f1 -(36 rows) +(34 rows) select t1.* from 
text_tbl t1 @@ -3715,8 +3724,8 @@ select t1.* from on (t1.f1 = b1.d1) left join int4_tbl i4 on (i8.q2 = i4.f1); - QUERY PLAN ----------------------------------------------------------------------------------------------- + QUERY PLAN +---------------------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) Output: t1.f1 -> Nested Loop Left Join @@ -3727,36 +3736,34 @@ select t1.* from Join Filter: (t1.f1 = '***'::text) -> Seq Scan on public.text_tbl t1 Output: t1.f1 - -> Materialize + -> Nested Loop Left Join Output: i8.q2 - -> Nested Loop Left Join - Output: i8.q2 - Join Filter: ((NULL::integer) = i8b1.q2) - -> Seq Scan on public.int8_tbl i8b1 - Output: i8b1.q1, i8b1.q2 - -> Materialize + Join Filter: ((NULL::integer) = i8b1.q2) + -> Seq Scan on public.int8_tbl i8b1 + Output: i8b1.q1, i8b1.q2 + -> Materialize + Output: i8.q2, (NULL::integer) + -> Nested Loop Left Join Output: i8.q2, (NULL::integer) - -> Nested Loop Left Join - Output: i8.q2, (NULL::integer) - Join Filter: (i8.q1 = i8b2.q1) - -> Seq Scan on public.int8_tbl i8 - Output: i8.q1, i8.q2 - -> Materialize - Output: i8b2.q1, (NULL::integer) - -> Nested Loop - Output: i8b2.q1, NULL::integer - Join Filter: (i8b2.q1 = i4b2.f1) - -> Seq Scan on public.int8_tbl i8b2 - Output: i8b2.q1, i8b2.q2 - -> Materialize + Join Filter: (i8.q1 = i8b2.q1) + -> Seq Scan on public.int8_tbl i8 + Output: i8.q1, i8.q2 + -> Materialize + Output: i8b2.q1, (NULL::integer) + -> Nested Loop + Output: i8b2.q1, NULL::integer + Join Filter: (i8b2.q1 = i4b2.f1) + -> Seq Scan on public.int8_tbl i8b2 + Output: i8b2.q1, i8b2.q2 + -> Materialize + Output: i4b2.f1 + -> Seq Scan on public.int4_tbl i4b2 Output: i4b2.f1 - -> Seq Scan on public.int4_tbl i4b2 - Output: i4b2.f1 -> Materialize Output: i4.f1 -> Seq Scan on public.int4_tbl i4 Output: i4.f1 -(39 rows) +(37 rows) select t1.* from text_tbl t1 @@ -3849,11 +3856,9 @@ where t1.f1 = ss.f1; Output: t1.f1, i8.q1, i8.q2 -> Seq Scan on public.text_tbl t1 Output: t1.f1 - -> Materialize + -> Seq Scan on public.int8_tbl i8 Output: i8.q1, i8.q2 - -> Seq Scan on public.int8_tbl i8 - Output: i8.q1, i8.q2 - Filter: (i8.q2 = 123) + Filter: (i8.q2 = 123) -> Materialize Output: (i8.q1), t2.f1 -> Limit @@ -3864,7 +3869,7 @@ where t1.f1 = ss.f1; Output: (i8.q1), t2.f1 -> Seq Scan on public.text_tbl t2 Output: i8.q1, t2.f1 -(24 rows) +(22 rows) select * from text_tbl t1 @@ -3885,26 +3890,24 @@ select * from lateral (select i8.q1, t2.f1 from text_tbl t2 limit 1) as ss1, lateral (select ss1.* from text_tbl t3 limit 1) as ss2 where t1.f1 = ss2.f1; - QUERY PLAN ------------------------------------------------------------------------------ + QUERY PLAN +----------------------------------------------------------------------------------- Nested Loop Output: t1.f1, i8.q1, i8.q2, (i8.q1), t2.f1, ((i8.q1)), (t2.f1) Join Filter: (t1.f1 = (t2.f1)) - -> Nested Loop - Output: t1.f1, i8.q1, i8.q2, (i8.q1), t2.f1 - -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: t1.f1, i8.q1, i8.q2 + -> Nested Loop Left Join Output: t1.f1, i8.q1, i8.q2 - -> Nested Loop Left Join - Output: t1.f1, i8.q1, i8.q2 - -> Seq Scan on public.text_tbl t1 - Output: t1.f1 - -> Materialize - Output: i8.q1, i8.q2 - -> Seq Scan on public.int8_tbl i8 - Output: i8.q1, i8.q2 - Filter: (i8.q2 = 123) - -> Materialize - Output: (i8.q1), t2.f1 + -> Seq Scan on public.text_tbl t1 + Output: t1.f1 + -> Seq Scan on public.int8_tbl i8 + Output: i8.q1, 
i8.q2 + Filter: (i8.q2 = 123) + -> Materialize + Output: (i8.q1), t2.f1, ((i8.q1)), (t2.f1) + -> Nested Loop + Output: (i8.q1), t2.f1, ((i8.q1)), (t2.f1) -> Limit Output: (i8.q1), t2.f1 -> Remote Subquery Scan on all (datanode_1,datanode_2) @@ -3913,17 +3916,17 @@ where t1.f1 = ss2.f1; Output: (i8.q1), t2.f1 -> Seq Scan on public.text_tbl t2 Output: i8.q1, t2.f1 - -> Materialize - Output: ((i8.q1)), (t2.f1) - -> Limit - Output: ((i8.q1)), (t2.f1) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Output: (i8.q1), t2.f1 + -> Materialize + Output: ((i8.q1)), (t2.f1) -> Limit Output: ((i8.q1)), (t2.f1) - -> Seq Scan on public.text_tbl t3 + -> Remote Subquery Scan on all (datanode_1,datanode_2) Output: (i8.q1), t2.f1 -(36 rows) + -> Limit + Output: ((i8.q1)), (t2.f1) + -> Seq Scan on public.text_tbl t3 + Output: (i8.q1), t2.f1 +(34 rows) select * from text_tbl t1 @@ -4286,16 +4289,14 @@ select d.* from d left join (select distinct * from b) s QUERY PLAN ----------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) - -> Merge Right Join - Merge Cond: (b.id = d.a) - -> Unique - -> Sort - Sort Key: b.id, b.c_id - -> Seq Scan on b - -> Sort - Sort Key: d.a + -> Hash Right Join + Hash Cond: (b.id = d.a) + -> HashAggregate + Group Key: b.id, b.c_id + -> Seq Scan on b + -> Hash -> Seq Scan on d -(10 rows) +(8 rows) -- check join removal works when uniqueness of the join condition is enforced -- by a UNION @@ -4353,14 +4354,17 @@ explain (verbose false, costs false, nodes false) select p.*, linked from parent p left join (select c.*, true as linked from child c) as ss on (p.k = ss.k) order by p.k; - QUERY PLAN ----------------------------------------------------------- + QUERY PLAN +--------------------------------------------- Remote Subquery Scan on all - -> Merge Left Join - Merge Cond: (p.k = c.k) - -> Index Scan using parent_pkey on parent p - -> Index Only Scan using child_k_key on child c -(5 rows) + -> Sort + Sort Key: p.k + -> Hash Left Join + Hash Cond: (p.k = c.k) + -> Seq Scan on parent p + -> Hash + -> Seq Scan on child c +(8 rows) -- check for a 9.0rc1 bug: join removal breaks pseudoconstant qual handling select p.* from @@ -4468,20 +4472,20 @@ from left join uniquetbl u1 ON u1.f1 = t1.string4) ss on t0.f1 = ss.case1 where ss.stringu2 !~* ss.case1; - QUERY PLAN --------------------------------------------------------------------------------------------------------- + QUERY PLAN +-------------------------------------------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) -> Nested Loop Join Filter: ((CASE t1.ten WHEN 0 THEN 'doh!'::text ELSE NULL::text END) = t0.f1) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: CASE ten WHEN 0 THEN 'doh!'::text ELSE NULL::text END - -> Nested Loop - -> Seq Scan on int4_tbl i4 - -> Index Scan using tenk1_unique2 on tenk1 t1 - Index Cond: (unique2 = i4.f1) - Filter: (stringu2 !~* CASE ten WHEN 0 THEN 'doh!'::text ELSE NULL::text END) + -> Seq Scan on text_tbl t0 -> Materialize - -> Seq Scan on text_tbl t0 + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: CASE ten WHEN 0 THEN 'doh!'::text ELSE NULL::text END + -> Nested Loop + -> Seq Scan on int4_tbl i4 + -> Index Scan using tenk1_unique2 on tenk1 t1 + Index Cond: (unique2 = i4.f1) + Filter: (stringu2 !~* CASE ten WHEN 0 THEN 'doh!'::text ELSE NULL::text END) (12 rows) select t0.* @@ -4713,18 +4717,18 @@ select * from 
generate_series(100,200) g, explain (num_nodes off, nodes off, costs off) select count(*) from tenk1 a, tenk1 b join lateral (values(a.unique1)) ss(x) on b.unique2 = ss.x; - QUERY PLAN ------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------------------ Finalize Aggregate -> Remote Subquery Scan on all -> Partial Aggregate -> Hash Join - Hash Cond: (b.unique2 = a.unique1) - -> Remote Subquery Scan on all - Distribute results by H: unique2 - -> Seq Scan on tenk1 b + Hash Cond: (a.unique1 = b.unique2) + -> Seq Scan on tenk1 a -> Hash - -> Seq Scan on tenk1 a + -> Remote Subquery Scan on all + Distribute results by H: unique2 + -> Seq Scan on tenk1 b (10 rows) select count(*) from tenk1 a, @@ -5445,8 +5449,8 @@ select * from lateral (select f1 from int4_tbl where f1 = any (select unique1 from tenk1 where unique2 = v.x offset 0)) ss; - QUERY PLAN ----------------------------------------------------------------------------------- + QUERY PLAN +---------------------------------------------------------------------------------------- Nested Loop Output: "*VALUES*".column1, "*VALUES*".column2, int4_tbl.f1 -> Values Scan on "*VALUES*" @@ -5455,18 +5459,21 @@ select * from Output: int4_tbl.f1 -> Remote Subquery Scan on all Output: int4_tbl.f1 - -> Nested Loop + -> Hash Join Output: int4_tbl.f1 - Join Filter: (int4_tbl.f1 = tenk1.unique1) - -> HashAggregate - Output: tenk1.unique1 - Group Key: tenk1.unique1 - -> Index Scan using tenk1_unique2 on public.tenk1 - Output: tenk1.unique1 - Index Cond: (tenk1.unique2 = "*VALUES*".column2) + Inner Unique: true + Hash Cond: (int4_tbl.f1 = tenk1.unique1) -> Seq Scan on public.int4_tbl Output: int4_tbl.f1 -(19 rows) + -> Hash + Output: tenk1.unique1 + -> HashAggregate + Output: tenk1.unique1 + Group Key: tenk1.unique1 + -> Index Scan using tenk1_unique2 on public.tenk1 + Output: tenk1.unique1 + Index Cond: (tenk1.unique2 = "*VALUES*".column2) +(22 rows) select * from (values (0,9998), (1,1000)) v(id,x), @@ -6102,11 +6109,10 @@ where exists (select 1 from tenk1 t3 Output: t1.unique1 -> Remote Subquery Scan on all (datanode_1,datanode_2) Output: t1.unique1 - -> Bitmap Heap Scan on public.onek t1 + Sort Key: t1.unique1 + -> Index Only Scan using onek_unique1 on public.onek t1 Output: t1.unique1 - Recheck Cond: (t1.unique1 < 1) - -> Bitmap Index Scan on onek_unique1 - Index Cond: (t1.unique1 < 1) + Index Cond: (t1.unique1 < 1) -> Materialize Output: t2.hundred -> Remote Subquery Scan on all (datanode_1,datanode_2) @@ -6116,7 +6122,7 @@ where exists (select 1 from tenk1 t3 -> Index Only Scan using tenk1_hundred on public.tenk1 t2 Output: t2.hundred Index Cond: (t2.hundred = t3.tenthous) -(39 rows) +(38 rows) -- ... 
unless it actually is unique create table j3 as select unique1, tenthous from onek; @@ -6128,8 +6134,8 @@ from onek t1, tenk1 t2 where exists (select 1 from j3 where j3.unique1 = t1.unique1 and j3.tenthous = t2.hundred) and t1.unique1 < 1; - QUERY PLAN -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------------------------------------ Remote Subquery Scan on all (datanode_1,datanode_2) Output: t1.unique1, t2.hundred -> Nested Loop @@ -6138,18 +6144,16 @@ where exists (select 1 from j3 Output: t1.unique1, j3.tenthous -> Nested Loop Output: t1.unique1, j3.tenthous - -> Bitmap Heap Scan on public.onek t1 - Output: t1.unique1, t1.unique2, t1.two, t1.four, t1.ten, t1.twenty, t1.hundred, t1.thousand, t1.twothousand, t1.fivethous, t1.tenthous, t1.odd, t1.even, t1.stringu1, t1.stringu2, t1.string4 - Recheck Cond: (t1.unique1 < 1) - -> Bitmap Index Scan on onek_unique1 - Index Cond: (t1.unique1 < 1) + -> Index Only Scan using onek_unique1 on public.onek t1 + Output: t1.unique1 + Index Cond: (t1.unique1 < 1) -> Index Only Scan using j3_unique1_tenthous_idx on public.j3 Output: j3.unique1, j3.tenthous Index Cond: (j3.unique1 = t1.unique1) -> Index Only Scan using tenk1_hundred on public.tenk1 t2 Output: t2.hundred Index Cond: (t2.hundred = j3.tenthous) -(19 rows) +(17 rows) drop table j3; -- @@ -6170,18 +6174,18 @@ explain select t3.b from nestloop_suppression1 t1, nestloop_suppression2 t2, nes where t1.b=2 and t1.c=3 and t1.d like 'char%' and t1.a=t2.a and t3.b>t2.a; QUERY PLAN ------------------------------------------------------------------------------------------------------------------------------ - Nested Loop (cost=200.16..596.19 rows=33 width=4) + Nested Loop (cost=200.16..401.93 rows=33 width=4) Join Filter: (t3.b > t2.a) - -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=100.16..453.19 rows=1 width=4) - -> Nested Loop (cost=0.16..353.18 rows=1 width=4) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=100.16..280.68 rows=1 width=4) + -> Nested Loop (cost=0.16..180.68 rows=1 width=4) Join Filter: (t1.a = t2.a) -> Index Scan using idx_nestloop_suppression1_b on nestloop_suppression1 t1 (cost=0.16..8.18 rows=1 width=4) Index Cond: (b = 2) Filter: (((d)::text ~~ 'char%'::text) AND (c = 3)) - -> Seq Scan on nestloop_suppression2 t2 (cost=0.00..220.00 rows=10000 width=4) - -> Materialize (cost=100.00..141.75 rows=100 width=4) - -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=100.00..141.50 rows=100 width=4) - -> Seq Scan on nestloop_suppression3 t3 (cost=0.00..41.00 rows=100 width=4) + -> Seq Scan on nestloop_suppression2 t2 (cost=0.00..110.00 rows=5000 width=4) + -> Materialize (cost=100.00..120.62 rows=50 width=4) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=100.00..120.50 rows=50 width=4) + -> Seq Scan on nestloop_suppression3 t3 (cost=0.00..20.50 rows=50 width=4) (12 rows) set enable_nestloop_suppression = true; @@ -6189,19 +6193,19 @@ explain select t3.b from nestloop_suppression1 t1, nestloop_suppression2 t2, nes where t1.b=2 and t1.c=3 and t1.d like 'char%' and t1.a=t2.a and t3.b>t2.a; QUERY PLAN ------------------------------------------------------------------------------------------------------------------------------------ - Nested Loop (cost=200.16..621.19 rows=33 width=4) + Nested Loop 
(cost=200.16..414.44 rows=33 width=4) Join Filter: (t3.b > t2.a) - -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=100.16..478.19 rows=1 width=4) - -> Nested Loop (cost=0.16..378.19 rows=1 width=4) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=100.16..293.19 rows=1 width=4) + -> Nested Loop (cost=0.16..193.19 rows=1 width=4) Join Filter: (t1.a = t2.a) - -> Seq Scan on nestloop_suppression2 t2 (cost=0.00..220.00 rows=10000 width=4) + -> Seq Scan on nestloop_suppression2 t2 (cost=0.00..110.00 rows=5000 width=4) -> Materialize (cost=0.16..8.19 rows=1 width=4) -> Index Scan using idx_nestloop_suppression1_b on nestloop_suppression1 t1 (cost=0.16..8.18 rows=1 width=4) Index Cond: (b = 2) Filter: (((d)::text ~~ 'char%'::text) AND (c = 3)) - -> Materialize (cost=100.00..141.75 rows=100 width=4) - -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=100.00..141.50 rows=100 width=4) - -> Seq Scan on nestloop_suppression3 t3 (cost=0.00..41.00 rows=100 width=4) + -> Materialize (cost=100.00..120.62 rows=50 width=4) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=100.00..120.50 rows=50 width=4) + -> Seq Scan on nestloop_suppression3 t3 (cost=0.00..20.50 rows=50 width=4) (13 rows) drop table nestloop_suppression1; diff --git a/src/test/regress/expected/privileges.out b/src/test/regress/expected/privileges.out index 85aea9c7..ccf6aba3 100644 --- a/src/test/regress/expected/privileges.out +++ b/src/test/regress/expected/privileges.out @@ -211,10 +211,12 @@ EXPLAIN (COSTS OFF) SELECT * FROM atest12v x, atest12v y WHERE x.a = y.b; Distribute results by H: b -> Seq Scan on atest12 atest12_1 Filter: (b <<< 5) - -> Index Scan using atest12_a_idx on atest12 - Index Cond: (a = atest12_1.b) + -> Bitmap Heap Scan on atest12 + Recheck Cond: (a = atest12_1.b) Filter: (b <<< 5) -(9 rows) + -> Bitmap Index Scan on atest12_a_idx + Index Cond: (a = atest12_1.b) +(11 rows) -- And this one. EXPLAIN (COSTS OFF) SELECT * FROM atest12 x, atest12 y @@ -247,16 +249,15 @@ EXPLAIN (COSTS OFF) SELECT * FROM atest12v x, atest12v y WHERE x.a = y.b; QUERY PLAN ----------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) - -> Hash Join - Hash Cond: (atest12_1.b = atest12.a) + -> Nested Loop -> Remote Subquery Scan on all (datanode_1,datanode_2) Distribute results by H: b -> Seq Scan on atest12 atest12_1 Filter: (b <<< 5) - -> Hash - -> Seq Scan on atest12 - Filter: (b <<< 5) -(10 rows) + -> Index Scan using atest12_a_idx on atest12 + Index Cond: (a = atest12_1.b) + Filter: (b <<< 5) +(9 rows) RESET random_page_cost; -- Now regress_user1 grants sufficient access to regress_user2. @@ -273,10 +274,12 @@ EXPLAIN (COSTS OFF) SELECT * FROM atest12v x, atest12v y WHERE x.a = y.b; Distribute results by H: b -> Seq Scan on atest12 atest12_1 Filter: (b <<< 5) - -> Index Scan using atest12_a_idx on atest12 - Index Cond: (a = atest12_1.b) + -> Bitmap Heap Scan on atest12 + Recheck Cond: (a = atest12_1.b) Filter: (b <<< 5) -(9 rows) + -> Bitmap Index Scan on atest12_a_idx + Index Cond: (a = atest12_1.b) +(11 rows) -- But not for this, due to lack of table-wide permissions needed -- to make use of the expression index's statistics. 
diff --git a/src/test/regress/expected/rowsecurity_1.out b/src/test/regress/expected/rowsecurity_1.out index e0336e73..01debacc 100644 --- a/src/test/regress/expected/rowsecurity_1.out +++ b/src/test/regress/expected/rowsecurity_1.out @@ -1577,13 +1577,12 @@ WHERE t2.a = 3 and t3.a = 2 AND f_leak(t2.b) AND f_leak(t3.b); Remote Subquery Scan on all (datanode_2) -> Update on t2 -> Nested Loop - -> Seq Scan on t2 - Filter: ((a = 3) AND ((a % 2) = 1) AND f_leak(b)) - -> Materialize -> Remote Subquery Scan on all (datanode_1) -> Seq Scan on t3 Filter: ((a = 2) AND f_leak(b)) -(9 rows) + -> Seq Scan on t2 + Filter: ((a = 3) AND ((a % 2) = 1) AND f_leak(b)) +(8 rows) UPDATE t2 SET b=t2.b FROM t3 WHERE t2.a = 3 and t3.a = 2 AND f_leak(t2.b) AND f_leak(t3.b); @@ -2062,16 +2061,16 @@ EXPLAIN (COSTS OFF) EXECUTE plancache_test2; PREPARE plancache_test3 AS WITH q AS MATERIALIZED (SELECT * FROM z2) SELECT * FROM q,z1 WHERE f_leak(z1.b); EXPLAIN (COSTS OFF) EXECUTE plancache_test3; QUERY PLAN ------------------------------------------------------------------ +------------------------------------------------------------- Nested Loop CTE q -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Seq Scan on z2 - -> CTE Scan on q - -> Materialize -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Seq Scan on z1 Filter: (((a % 2) = 0) AND f_leak(b)) + -> Materialize + -> CTE Scan on q (9 rows) SET ROLE regress_rls_group1; @@ -2114,16 +2113,16 @@ EXPLAIN (COSTS OFF) EXECUTE plancache_test2; EXPLAIN (COSTS OFF) EXECUTE plancache_test3; QUERY PLAN ------------------------------------------------------------------ +------------------------------------------------------------- Nested Loop CTE q -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Seq Scan on z2 - -> CTE Scan on q - -> Materialize -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Seq Scan on z1 Filter: (((a % 2) = 0) AND f_leak(b)) + -> Materialize + -> CTE Scan on q (9 rows) SET SESSION AUTHORIZATION regress_rls_carol; @@ -2166,16 +2165,16 @@ EXPLAIN (COSTS OFF) EXECUTE plancache_test2; EXPLAIN (COSTS OFF) EXECUTE plancache_test3; QUERY PLAN ------------------------------------------------------------------ +------------------------------------------------------------- Nested Loop CTE q -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Seq Scan on z2 - -> CTE Scan on q - -> Materialize -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Seq Scan on z1 Filter: (((a % 2) = 1) AND f_leak(b)) + -> Materialize + -> CTE Scan on q (9 rows) SET ROLE regress_rls_group2; @@ -2218,16 +2217,16 @@ EXPLAIN (COSTS OFF) EXECUTE plancache_test2; EXPLAIN (COSTS OFF) EXECUTE plancache_test3; QUERY PLAN ------------------------------------------------------------------ +------------------------------------------------------------- Nested Loop CTE q -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Seq Scan on z2 - -> CTE Scan on q - -> Materialize -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Seq Scan on z1 Filter: (((a % 2) = 1) AND f_leak(b)) + -> Materialize + -> CTE Scan on q (9 rows) -- diff --git a/src/test/regress/expected/select_views.out b/src/test/regress/expected/select_views.out index ca729d3d..2406dabc 100644 --- a/src/test/regress/expected/select_views.out +++ b/src/test/regress/expected/select_views.out @@ -1449,18 +1449,17 @@ EXPLAIN (COSTS OFF) SELECT * FROM my_credit_card_usage_normal Remote Subquery Scan on all (datanode_1,datanode_2) -> Nested Loop Join Filter: (l.cid = r.cid) + -> Subquery 
Scan on l + Filter: f_leak(l.cnum) + -> Hash Join + Hash Cond: (r_1.cid = l_1.cid) + -> Seq Scan on credit_card r_1 + -> Hash + -> Seq Scan on customer l_1 + Filter: (name = (CURRENT_USER)::text) -> Seq Scan on credit_usage r Filter: ((ymd >= '10-01-2011'::date) AND (ymd < '11-01-2011'::date)) - -> Materialize - -> Subquery Scan on l - Filter: f_leak(l.cnum) - -> Hash Join - Hash Cond: (r_1.cid = l_1.cid) - -> Seq Scan on credit_card r_1 - -> Hash - -> Seq Scan on customer l_1 - Filter: (name = (CURRENT_USER)::text) -(14 rows) +(13 rows) SELECT * FROM my_credit_card_usage_secure WHERE f_leak(cnum) AND ymd >= '2011-10-01' AND ymd < '2011-11-01'; diff --git a/src/test/regress/expected/stats_ext_2.out b/src/test/regress/expected/stats_ext_2.out index ca7aba0a..315bcbc7 100644 --- a/src/test/regress/expected/stats_ext_2.out +++ b/src/test/regress/expected/stats_ext_2.out @@ -181,12 +181,10 @@ EXPLAIN (COSTS off) Group Key: a, b -> Remote Subquery Scan on all (datanode_1,datanode_2) Distribute results by H: a - -> Partial GroupAggregate + -> Partial HashAggregate Group Key: a, b - -> Sort - Sort Key: a, b - -> Seq Scan on ndistinct -(10 rows) + -> Seq Scan on ndistinct +(8 rows) EXPLAIN (COSTS off) SELECT COUNT(*) FROM ndistinct GROUP BY b, c; @@ -197,12 +195,10 @@ EXPLAIN (COSTS off) Group Key: b, c -> Remote Subquery Scan on all (datanode_1,datanode_2) Distribute results by H: b - -> Partial GroupAggregate + -> Partial HashAggregate Group Key: b, c - -> Sort - Sort Key: b, c - -> Seq Scan on ndistinct -(10 rows) + -> Seq Scan on ndistinct +(8 rows) EXPLAIN (COSTS off) SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c; @@ -213,12 +209,10 @@ EXPLAIN (COSTS off) Group Key: a, b, c -> Remote Subquery Scan on all (datanode_1,datanode_2) Distribute results by H: a - -> Partial GroupAggregate + -> Partial HashAggregate Group Key: a, b, c - -> Sort - Sort Key: a, b, c - -> Seq Scan on ndistinct -(10 rows) + -> Seq Scan on ndistinct +(8 rows) EXPLAIN (COSTS off) SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c, d; @@ -229,12 +223,10 @@ EXPLAIN (COSTS off) Group Key: a, b, c, d -> Remote Subquery Scan on all (datanode_1,datanode_2) Distribute results by H: a - -> Partial GroupAggregate + -> Partial HashAggregate Group Key: a, b, c, d - -> Sort - Sort Key: a, b, c, d - -> Seq Scan on ndistinct -(10 rows) + -> Seq Scan on ndistinct +(8 rows) EXPLAIN (COSTS off) SELECT COUNT(*) FROM ndistinct GROUP BY b, c, d; @@ -245,12 +237,10 @@ EXPLAIN (COSTS off) Group Key: b, c, d -> Remote Subquery Scan on all (datanode_1,datanode_2) Distribute results by H: b - -> Partial GroupAggregate + -> Partial HashAggregate Group Key: b, c, d - -> Sort - Sort Key: b, c, d - -> Seq Scan on ndistinct -(10 rows) + -> Seq Scan on ndistinct +(8 rows) -- correct command CREATE STATISTICS s10 ON a, b, c FROM ndistinct; @@ -316,12 +306,10 @@ EXPLAIN (COSTS off) Group Key: a, b, c, d -> Remote Subquery Scan on all (datanode_1,datanode_2) Distribute results by H: a - -> Partial GroupAggregate + -> Partial HashAggregate Group Key: a, b, c, d - -> Sort - Sort Key: a, b, c, d - -> Seq Scan on ndistinct -(10 rows) + -> Seq Scan on ndistinct +(8 rows) EXPLAIN (COSTS off) SELECT COUNT(*) FROM ndistinct GROUP BY b, c, d; @@ -332,12 +320,10 @@ EXPLAIN (COSTS off) Group Key: b, c, d -> Remote Subquery Scan on all (datanode_1,datanode_2) Distribute results by H: b - -> Partial GroupAggregate + -> Partial HashAggregate Group Key: b, c, d - -> Sort - Sort Key: b, c, d - -> Seq Scan on ndistinct -(10 rows) + -> Seq Scan on ndistinct +(8 rows) 
TRUNCATE TABLE ndistinct; -- under-estimates when using only per-column statistics @@ -363,12 +349,10 @@ EXPLAIN (COSTS off) Group Key: a, b -> Remote Subquery Scan on all (datanode_1,datanode_2) Distribute results by H: b - -> Partial GroupAggregate + -> Partial HashAggregate Group Key: a, b - -> Sort - Sort Key: a, b - -> Seq Scan on ndistinct -(10 rows) + -> Seq Scan on ndistinct +(8 rows) EXPLAIN (COSTS off) SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c; @@ -675,10 +659,10 @@ EXPLAIN SELECT count(*) FROM subset WHERE b = 'prefix_1' and c = 1; QUERY PLAN ------------------------------------------------------------------------------------------------- - Finalize Aggregate (cost=255.01..255.02 rows=1 width=8) - -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=255.00..255.01 rows=1 width=0) - -> Partial Aggregate (cost=155.00..155.01 rows=1 width=8) - -> Seq Scan on subset (cost=0.00..155.00 rows=2 width=0) + Finalize Aggregate (cost=177.51..177.52 rows=1 width=8) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=177.50..177.51 rows=1 width=8) + -> Partial Aggregate (cost=77.50..77.51 rows=1 width=8) + -> Seq Scan on subset (cost=0.00..77.50 rows=1 width=0) Filter: ((b = 'prefix_1'::text) AND (c = 1)) (5 rows) @@ -696,10 +680,10 @@ EXPLAIN SELECT count(*) FROM subset WHERE b = 'prefix_1' and c = 1; QUERY PLAN ------------------------------------------------------------------------------------------------- - Finalize Aggregate (cost=255.01..255.02 rows=1 width=8) - -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=255.00..255.01 rows=1 width=0) - -> Partial Aggregate (cost=155.00..155.01 rows=1 width=8) - -> Seq Scan on subset (cost=0.00..155.00 rows=100 width=0) + Finalize Aggregate (cost=177.51..177.52 rows=1 width=8) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=177.50..177.51 rows=1 width=8) + -> Partial Aggregate (cost=77.50..77.51 rows=1 width=8) + -> Seq Scan on subset (cost=0.00..77.50 rows=50 width=0) Filter: ((b = 'prefix_1'::text) AND (c = 1)) (5 rows) @@ -714,10 +698,10 @@ EXPLAIN SELECT count(*) FROM subset WHERE b like '%_1' and c = 1; QUERY PLAN ------------------------------------------------------------------------------------------------- - Finalize Aggregate (cost=255.01..255.02 rows=1 width=8) - -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=255.00..255.01 rows=1 width=0) - -> Partial Aggregate (cost=155.00..155.01 rows=1 width=8) - -> Seq Scan on subset (cost=0.00..155.00 rows=10 width=0) + Finalize Aggregate (cost=177.51..177.52 rows=1 width=8) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=177.50..177.51 rows=1 width=8) + -> Partial Aggregate (cost=77.50..77.51 rows=1 width=8) + -> Seq Scan on subset (cost=0.00..77.50 rows=5 width=0) Filter: ((b ~~ '%_1'::text) AND (c = 1)) (5 rows) @@ -738,10 +722,10 @@ EXPLAIN SELECT count(*) FROM subset WHERE b like '%_1' and c = 1; QUERY PLAN ------------------------------------------------------------------------------------------------- - Finalize Aggregate (cost=255.01..255.02 rows=1 width=8) - -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=255.00..255.01 rows=1 width=0) - -> Partial Aggregate (cost=155.00..155.01 rows=1 width=8) - -> Seq Scan on subset (cost=0.00..155.00 rows=100 width=0) + Finalize Aggregate (cost=177.51..177.52 rows=1 width=8) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=177.50..177.51 rows=1 width=8) + -> Partial Aggregate (cost=77.50..77.51 rows=1 width=8) + -> Seq Scan on subset 
(cost=0.00..77.50 rows=50 width=0) Filter: ((b ~~ '%_1'::text) AND (c = 1)) (5 rows) diff --git a/src/test/regress/expected/subselect.out b/src/test/regress/expected/subselect.out index 4691a4a9..c9dc3101 100644 --- a/src/test/regress/expected/subselect.out +++ b/src/test/regress/expected/subselect.out @@ -1174,15 +1174,15 @@ set enable_hashjoin to false; set enable_mergejoin to false; explain select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; QUERY PLAN -------------------------------------------------------------------------------------------------- - Remote Subquery Scan on all (datanode_1,datanode_2) (cost=15636.19..15637.88 rows=675 width=8) - -> Sort (cost=15636.19..15637.88 rows=675 width=8) +----------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) (cost=3923.54..3924.39 rows=338 width=8) + -> Sort (cost=3923.54..3924.39 rows=338 width=8) Sort Key: a.a, b.a - -> Nested Loop Left Scalar Join (cost=0.00..15604.47 rows=675 width=8) + -> Nested Loop Left Scalar Join (cost=0.00..3909.35 rows=338 width=8) Join Filter: (b.a = a.a) - -> Seq Scan on tbl_a a (cost=0.00..23.50 rows=1350 width=4) - -> Materialize (cost=0.00..30.25 rows=1350 width=4) - -> Seq Scan on tbl_b b (cost=0.00..23.50 rows=1350 width=4) + -> Seq Scan on tbl_a a (cost=0.00..11.75 rows=675 width=4) + -> Materialize (cost=0.00..15.12 rows=675 width=4) + -> Seq Scan on tbl_b b (cost=0.00..11.75 rows=675 width=4) (8 rows) select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; @@ -1711,16 +1711,16 @@ select a.a,(select b.a from tbl_b b where b.a = a.a limit 1) q from tbl_a a orde -- support pullup lateral ANY_SUBLINK explain select * from tbl_a a where a.b IN (select b.a from tbl_b b where b.b > a.b); QUERY PLAN ----------------------------------------------------------------------------------------------------------- - Remote Subquery Scan on all (datanode_1,datanode_2) (cost=140.38..193.10 rows=225 width=8) - -> Hash Semi Join (cost=140.38..193.10 rows=225 width=8) +--------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) (cost=120.19..136.36 rows=112 width=8) + -> Hash Semi Join (cost=120.19..136.36 rows=112 width=8) Hash Cond: (a.b = b.a) Join Filter: (b.b > a.b) - -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=100.00..141.05 rows=1350 width=8) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=100.00..111.75 rows=675 width=8) Distribute results by H: b - -> Seq Scan on tbl_a a (cost=0.00..23.50 rows=1350 width=8) - -> Hash (cost=23.50..23.50 rows=1350 width=8) - -> Seq Scan on tbl_b b (cost=0.00..23.50 rows=1350 width=8) + -> Seq Scan on tbl_a a (cost=0.00..11.75 rows=675 width=8) + -> Hash (cost=11.75..11.75 rows=675 width=8) + -> Seq Scan on tbl_b b (cost=0.00..11.75 rows=675 width=8) (9 rows) select * from tbl_a a where a.b IN (select b.a from tbl_b b where b.b > a.b); @@ -1750,9 +1750,12 @@ where t2.a = ( t1.a = t2.a ); QUERY PLAN ------------------------------------------------------------------------------------------------------ +----------------------------------------------------------------------------------------------------------- Hash Join - Hash Cond: ("EXPR_subquery".min = t2.a) + Hash Cond: (t2.a = "EXPR_subquery".min) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on sub_t2 t2 + -> Hash -> Hash Left Join 
Hash Cond: (t1.a = "EXPR_subquery".a) -> Remote Subquery Scan on all (datanode_1,datanode_2) @@ -1771,9 +1774,6 @@ where t2.a = ( -> Materialize -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Seq Scan on sub_interfere2 - -> Hash - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Seq Scan on sub_t2 t2 (23 rows) DROP TABLE sub_t1; diff --git a/src/test/regress/expected/xc_FQS_2.out b/src/test/regress/expected/xc_FQS_2.out index ea10b9ad..c4e07fc5 100644 --- a/src/test/regress/expected/xc_FQS_2.out +++ b/src/test/regress/expected/xc_FQS_2.out @@ -1641,14 +1641,14 @@ select * from subquery_fqs t join (select 1 id, 'gd' a, 2 c from dual union sele explain select * from subquery_fqs t1 where t1.id = 1 and t1.c IN (select c from subquery_fqs t2 where t2.id=1); QUERY PLAN -------------------------------------------------------------------------------------------------- - Remote Subquery Scan on all (datanode_1,datanode_2) (cost=100.00..142.30 rows=2 width=40) - -> Nested Loop Semi Join (cost=100.00..142.30 rows=2 width=40) + Remote Subquery Scan on all (datanode_1,datanode_2) (cost=100.00..121.06 rows=1 width=40) + -> Nested Loop Semi Join (cost=100.00..121.06 rows=1 width=40) Join Filter: (t1.c = t2.c) - -> Seq Scan on subquery_fqs t1 (cost=0.00..21.00 rows=4 width=40) + -> Seq Scan on subquery_fqs t1 (cost=0.00..10.50 rows=2 width=40) Filter: (id = 1) - -> Materialize (cost=100.00..121.07 rows=4 width=4) - -> Remote Subquery Scan on all (datanode_1) (cost=100.00..121.05 rows=4 width=4) - -> Seq Scan on subquery_fqs t2 (cost=0.00..21.00 rows=4 width=4) + -> Materialize (cost=100.00..110.51 rows=2 width=4) + -> Remote Subquery Scan on all (datanode_1) (cost=100.00..110.50 rows=2 width=4) + -> Seq Scan on subquery_fqs t2 (cost=0.00..10.50 rows=2 width=4) Filter: (id = 1) (9 rows) diff --git a/src/test/regress/expected/xc_FQS_join_1.out b/src/test/regress/expected/xc_FQS_join_1.out index 18836c1e..6cfb1dda 100644 --- a/src/test/regress/expected/xc_FQS_join_1.out +++ b/src/test/regress/expected/xc_FQS_join_1.out @@ -390,19 +390,19 @@ explain (verbose on, nodes off, costs off) select * from tab1_mod natural join t ---------------------------------------------------------------------------------- Hash Join Output: tab1_mod.val, tab1_mod.val2 - Hash Cond: ((tab1_mod.val = tab4_rep.val) AND (tab1_mod.val2 = tab4_rep.val2)) - -> Remote Subquery Scan on all - Output: tab1_mod.val, tab1_mod.val2 - -> Seq Scan on public.tab1_mod - Output: tab1_mod.val, tab1_mod.val2 - Filter: (tab1_mod.val > 2) - -> Hash - Output: tab4_rep.val, tab4_rep.val2 + Hash Cond: ((tab4_rep.val = tab1_mod.val) AND (tab4_rep.val2 = tab1_mod.val2)) -> Remote Subquery Scan on all Output: tab4_rep.val, tab4_rep.val2 -> Seq Scan on public.tab4_rep Output: tab4_rep.val, tab4_rep.val2 Filter: (tab4_rep.val < 4) + -> Hash + Output: tab1_mod.val, tab1_mod.val2 + -> Remote Subquery Scan on all + Output: tab1_mod.val, tab1_mod.val2 + -> Seq Scan on public.tab1_mod + Output: tab1_mod.val, tab1_mod.val2 + Filter: (tab1_mod.val > 2) (15 rows) -- Join involving two distributed tables, never shipped @@ -425,18 +425,18 @@ explain (verbose on, nodes off, costs off) select * from tab1_mod natural join t Output: tab1_mod.val, tab1_mod.val2 -> Hash Join Output: tab1_mod.val, tab1_mod.val2 - Hash Cond: ((tab1_mod.val = tab2_mod.val) AND (tab1_mod.val2 = tab2_mod.val2)) - -> Seq Scan on public.tab1_mod - Output: tab1_mod.val, tab1_mod.val2 - Filter: (tab1_mod.val > 2) - -> Hash - Output: tab2_mod.val, tab2_mod.val2 - -> Remote 
Subquery Scan on all - Output: tab2_mod.val, tab2_mod.val2 - Distribute results by M: val + Hash Cond: ((tab2_mod.val = tab1_mod.val) AND (tab2_mod.val2 = tab1_mod.val2)) -> Seq Scan on public.tab2_mod Output: tab2_mod.val, tab2_mod.val2 Filter: (tab2_mod.val < 4) + -> Hash + Output: tab1_mod.val, tab1_mod.val2 + -> Remote Subquery Scan on all + Output: tab1_mod.val, tab1_mod.val2 + Distribute results by M: val + -> Seq Scan on public.tab1_mod + Output: tab1_mod.val, tab1_mod.val2 + Filter: (tab1_mod.val > 2) (16 rows) -- Join involving a distributed table and two replicated tables, such that the @@ -590,17 +590,17 @@ explain (verbose on, nodes off, costs off, num_nodes on) select * from tab1_mod Output: tab1_mod.val, tab1_mod.val2, tab1_mod.val2 Join Filter: (tab1_mod.val2 = tab4_rep.val2) -> Remote Subquery Scan on all - Output: tab1_mod.val, tab1_mod.val2 - -> Seq Scan on public.tab1_mod - Output: tab1_mod.val, tab1_mod.val2 - Filter: (tab1_mod.val = 1) - -> Materialize - Output: tab4_rep.val, tab4_rep.val2 - -> Remote Subquery Scan on all Output: tab4_rep.val, tab4_rep.val2 -> Seq Scan on public.tab4_rep Output: tab4_rep.val, tab4_rep.val2 Filter: (tab4_rep.val = 1) + -> Materialize + Output: tab1_mod.val, tab1_mod.val2 + -> Remote Subquery Scan on all + Output: tab1_mod.val, tab1_mod.val2 + -> Seq Scan on public.tab1_mod + Output: tab1_mod.val, tab1_mod.val2 + Filter: (tab1_mod.val = 1) (18 rows) -- following join between distributed tables should get FQSed because both of @@ -625,16 +625,16 @@ explain (verbose on, nodes off, costs off, num_nodes on) select * from tab1_mod -> Nested Loop Output: tab1_mod.val2, tab1_mod.val, tab2_mod.val, tab1_mod.val Join Filter: (tab1_mod.val2 = tab2_mod.val2) + -> Seq Scan on public.tab2_mod + Output: tab2_mod.val, tab2_mod.val2 + Filter: (tab2_mod.val = 2) + -> Materialize + Output: tab1_mod.val2, tab1_mod.val -> Remote Subquery Scan on all Output: tab1_mod.val2, tab1_mod.val -> Seq Scan on public.tab1_mod Output: tab1_mod.val2, tab1_mod.val Filter: (tab1_mod.val = 1) - -> Materialize - Output: tab2_mod.val, tab2_mod.val2 - -> Seq Scan on public.tab2_mod - Output: tab2_mod.val, tab2_mod.val2 - Filter: (tab2_mod.val = 2) (15 rows) -- JOIN involving the distributed table with equi-JOIN on the distributed column @@ -696,24 +696,21 @@ explain (verbose on, nodes off, costs off) update tab1_mod set val2 = 1000 from Remote Subquery Scan on all -> Update on public.tab1_mod -> Merge Join - Output: tab1_mod.val, 1000, tab1_mod.xc_node_id, tab1_mod.ctid, tab1_mod.shardid, tab2_mod.ctid, tab2_mod.xc_node_id - Merge Cond: ((tab1_mod.val = tab2_mod.val) AND (tab1_mod.val2 = tab2_mod.val2)) + Output: tab1_mod.val, 1000, tab1_mod.xc_node_id, tab1_mod.ctid, tab1_mod.shardid, tab2_mod.ctid + Merge Cond: ((tab2_mod.val = tab1_mod.val) AND (tab2_mod.val2 = tab1_mod.val2)) + -> Remote Subquery Scan on all + Output: tab2_mod.ctid, tab2_mod.val, tab2_mod.val2 + -> Sort + Output: tab2_mod.ctid, tab2_mod.xc_node_id, tab2_mod.val, tab2_mod.val2 + Sort Key: tab2_mod.val, tab2_mod.val2 + -> Seq Scan on public.tab2_mod + Output: tab2_mod.ctid, tab2_mod.val, tab2_mod.val2 -> Sort Output: tab1_mod.val, tab1_mod.xc_node_id, tab1_mod.ctid, tab1_mod.shardid, tab1_mod.val2 Sort Key: tab1_mod.val, tab1_mod.val2 -> Seq Scan on public.tab1_mod Output: tab1_mod.val, tab1_mod.xc_node_id, tab1_mod.ctid, tab1_mod.shardid, tab1_mod.val2 - -> Materialize - Output: tab2_mod.ctid, tab2_mod.xc_node_id, tab2_mod.val, tab2_mod.val2 - -> Remote Subquery Scan on all - Output: tab2_mod.ctid, 
tab2_mod.xc_node_id, tab2_mod.val, tab2_mod.val2 - Distribute results by M: val - -> Sort - Output: tab2_mod.ctid, tab2_mod.xc_node_id, tab2_mod.val, tab2_mod.val2 - Sort Key: tab2_mod.val, tab2_mod.val2 - -> Seq Scan on public.tab2_mod - Output: tab2_mod.ctid, tab2_mod.xc_node_id, tab2_mod.val, tab2_mod.val2 -(20 rows) +(17 rows) explain (verbose on, nodes off, costs off) delete from tab1_mod using tab2_mod where tab1_mod.val = tab2_mod.val and tab1_mod.val2 = tab2_mod.val2; @@ -722,24 +719,21 @@ explain (verbose on, nodes off, costs off) delete from tab1_mod using tab2_mod Remote Subquery Scan on all -> Delete on public.tab1_mod -> Merge Join - Output: tab1_mod.xc_node_id, tab1_mod.ctid, tab1_mod.shardid, tab1_mod.val, tab2_mod.ctid, tab2_mod.xc_node_id - Merge Cond: ((tab1_mod.val = tab2_mod.val) AND (tab1_mod.val2 = tab2_mod.val2)) + Output: tab1_mod.xc_node_id, tab1_mod.ctid, tab1_mod.shardid, tab1_mod.val, tab2_mod.ctid + Merge Cond: ((tab2_mod.val = tab1_mod.val) AND (tab2_mod.val2 = tab1_mod.val2)) + -> Remote Subquery Scan on all + Output: tab2_mod.ctid, tab2_mod.val, tab2_mod.val2 + -> Sort + Output: tab2_mod.ctid, tab2_mod.xc_node_id, tab2_mod.val, tab2_mod.val2 + Sort Key: tab2_mod.val, tab2_mod.val2 + -> Seq Scan on public.tab2_mod + Output: tab2_mod.ctid, tab2_mod.val, tab2_mod.val2 -> Sort Output: tab1_mod.xc_node_id, tab1_mod.ctid, tab1_mod.shardid, tab1_mod.val, tab1_mod.val2 Sort Key: tab1_mod.val, tab1_mod.val2 -> Seq Scan on public.tab1_mod Output: tab1_mod.xc_node_id, tab1_mod.ctid, tab1_mod.shardid, tab1_mod.val, tab1_mod.val2 - -> Materialize - Output: tab2_mod.ctid, tab2_mod.xc_node_id, tab2_mod.val, tab2_mod.val2 - -> Remote Subquery Scan on all - Output: tab2_mod.ctid, tab2_mod.xc_node_id, tab2_mod.val, tab2_mod.val2 - Distribute results by M: val - -> Sort - Output: tab2_mod.ctid, tab2_mod.xc_node_id, tab2_mod.val, tab2_mod.val2 - Sort Key: tab2_mod.val, tab2_mod.val2 - -> Seq Scan on public.tab2_mod - Output: tab2_mod.ctid, tab2_mod.xc_node_id, tab2_mod.val, tab2_mod.val2 -(20 rows) +(17 rows) explain (verbose on, nodes off, costs off) update tab1_rep set val2 = 1000 from tab2_rep where tab1_rep.val = tab2_rep.val and tab1_rep.val2 = tab2_rep.val2; diff --git a/src/test/regress/expected/xc_groupby_1.out b/src/test/regress/expected/xc_groupby_1.out index 8db42b7f..b33bfcf0 100644 --- a/src/test/regress/expected/xc_groupby_1.out +++ b/src/test/regress/expected/xc_groupby_1.out @@ -332,20 +332,16 @@ explain (verbose true, costs false, nodes false) select xc_groupby_tab1.val + xc -> HashAggregate Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2), xc_groupby_tab1.val, xc_groupby_tab2.val2 Group Key: xc_groupby_tab1.val, xc_groupby_tab2.val2 - -> Merge Join + -> Hash Join Output: xc_groupby_tab1.val, xc_groupby_tab2.val2 - Merge Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) - -> Sort - Output: xc_groupby_tab1.val - Sort Key: xc_groupby_tab1.val - -> Seq Scan on public.xc_groupby_tab1 - Output: xc_groupby_tab1.val - -> Sort + Hash Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 + -> Hash Output: xc_groupby_tab2.val2, xc_groupby_tab2.val - Sort Key: xc_groupby_tab2.val -> Seq Scan on public.xc_groupby_tab2 Output: xc_groupby_tab2.val2, xc_groupby_tab2.val -(22 rows) +(18 rows) explain (verbose true, costs false, nodes false) select xc_groupby_tab1.val + xc_groupby_tab2.val2, xc_groupby_tab1.val, xc_groupby_tab2.val2 from xc_groupby_tab1, xc_groupby_tab2 where 
xc_groupby_tab1.val = xc_groupby_tab2.val group by xc_groupby_tab1.val, xc_groupby_tab2.val2; QUERY PLAN @@ -355,20 +351,16 @@ explain (verbose true, costs false, nodes false) select xc_groupby_tab1.val + xc -> HashAggregate Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2), xc_groupby_tab1.val, xc_groupby_tab2.val2 Group Key: xc_groupby_tab1.val, xc_groupby_tab2.val2 - -> Merge Join + -> Hash Join Output: xc_groupby_tab1.val, xc_groupby_tab2.val2 - Merge Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) - -> Sort - Output: xc_groupby_tab1.val - Sort Key: xc_groupby_tab1.val - -> Seq Scan on public.xc_groupby_tab1 - Output: xc_groupby_tab1.val - -> Sort + Hash Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 + -> Hash Output: xc_groupby_tab2.val2, xc_groupby_tab2.val - Sort Key: xc_groupby_tab2.val -> Seq Scan on public.xc_groupby_tab2 Output: xc_groupby_tab2.val2, xc_groupby_tab2.val -(18 rows) +(14 rows) select xc_groupby_tab1.val + xc_groupby_tab2.val2 from xc_groupby_tab1, xc_groupby_tab2 where xc_groupby_tab1.val = xc_groupby_tab2.val group by xc_groupby_tab1.val + xc_groupby_tab2.val2 order by 1; ?column? @@ -397,20 +389,16 @@ explain (verbose true, costs false, nodes false) select xc_groupby_tab1.val + xc -> Partial HashAggregate Output: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) Group Key: (xc_groupby_tab1.val + xc_groupby_tab2.val2) - -> Merge Join + -> Hash Join Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2) - Merge Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) - -> Sort - Output: xc_groupby_tab1.val - Sort Key: xc_groupby_tab1.val - -> Seq Scan on public.xc_groupby_tab1 - Output: xc_groupby_tab1.val - -> Sort + Hash Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 + -> Hash Output: xc_groupby_tab2.val2, xc_groupby_tab2.val - Sort Key: xc_groupby_tab2.val -> Seq Scan on public.xc_groupby_tab2 Output: xc_groupby_tab2.val2, xc_groupby_tab2.val -(28 rows) +(24 rows) explain (verbose true, costs false, nodes false) select xc_groupby_tab1.val + xc_groupby_tab2.val2 from xc_groupby_tab1, xc_groupby_tab2 where xc_groupby_tab1.val = xc_groupby_tab2.val group by xc_groupby_tab1.val + xc_groupby_tab2.val2; QUERY PLAN @@ -426,20 +414,16 @@ explain (verbose true, costs false, nodes false) select xc_groupby_tab1.val + xc -> Partial HashAggregate Output: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) Group Key: (xc_groupby_tab1.val + xc_groupby_tab2.val2) - -> Merge Join + -> Hash Join Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2) - Merge Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) - -> Sort - Output: xc_groupby_tab1.val - Sort Key: xc_groupby_tab1.val - -> Seq Scan on public.xc_groupby_tab1 - Output: xc_groupby_tab1.val - -> Sort + Hash Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 + -> Hash Output: xc_groupby_tab2.val2, xc_groupby_tab2.val - Sort Key: xc_groupby_tab2.val -> Seq Scan on public.xc_groupby_tab2 Output: xc_groupby_tab2.val2, xc_groupby_tab2.val -(24 rows) +(20 rows) -- group by with aggregates in expression select count(*) + sum(val) + avg(val), val2 from xc_groupby_tab1 group by val2 order by 1; @@ -870,12 +854,12 @@ explain (verbose true, costs false, nodes false) select * from (select b,count(b Remote Subquery Scan on all Output: b, count Sort Key: b - -> Sort - Output: 
xc_groupby_def.b, (count(xc_groupby_def.b)) - Sort Key: xc_groupby_def.b - -> Finalize HashAggregate - Output: xc_groupby_def.b, count(xc_groupby_def.b) - Group Key: xc_groupby_def.b + -> Finalize GroupAggregate + Output: xc_groupby_def.b, count(xc_groupby_def.b) + Group Key: xc_groupby_def.b + -> Sort + Output: xc_groupby_def.b, (PARTIAL count(xc_groupby_def.b)) + Sort Key: xc_groupby_def.b -> Remote Subquery Scan on all Output: xc_groupby_def.b, PARTIAL count(xc_groupby_def.b) Distribute results by H: b @@ -2302,20 +2286,16 @@ explain (verbose true, costs false, nodes false) select xc_groupby_tab1.val + xc -> Sort Output: xc_groupby_tab1.val, xc_groupby_tab2.val2 Sort Key: xc_groupby_tab1.val, xc_groupby_tab2.val2 - -> Merge Join + -> Hash Join Output: xc_groupby_tab1.val, xc_groupby_tab2.val2 - Merge Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) - -> Sort - Output: xc_groupby_tab1.val - Sort Key: xc_groupby_tab1.val - -> Seq Scan on public.xc_groupby_tab1 - Output: xc_groupby_tab1.val - -> Sort + Hash Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 + -> Hash Output: xc_groupby_tab2.val2, xc_groupby_tab2.val - Sort Key: xc_groupby_tab2.val -> Seq Scan on public.xc_groupby_tab2 Output: xc_groupby_tab2.val2, xc_groupby_tab2.val -(21 rows) +(17 rows) select xc_groupby_tab1.val + xc_groupby_tab2.val2 from xc_groupby_tab1, xc_groupby_tab2 where xc_groupby_tab1.val = xc_groupby_tab2.val group by xc_groupby_tab1.val + xc_groupby_tab2.val2 order by 1; ?column? @@ -2346,20 +2326,16 @@ explain (verbose true, costs false, nodes false) select xc_groupby_tab1.val + xc -> Sort Output: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) Sort Key: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) - -> Merge Join + -> Hash Join Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2) - Merge Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) - -> Sort - Output: xc_groupby_tab1.val - Sort Key: xc_groupby_tab1.val - -> Seq Scan on public.xc_groupby_tab1 - Output: xc_groupby_tab1.val - -> Sort + Hash Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 + -> Hash Output: xc_groupby_tab2.val2, xc_groupby_tab2.val - Sort Key: xc_groupby_tab2.val -> Seq Scan on public.xc_groupby_tab2 Output: xc_groupby_tab2.val2, xc_groupby_tab2.val -(30 rows) +(26 rows) -- group by with aggregates in expression select count(*) + sum(val) + avg(val), val2 from xc_groupby_tab1 group by val2 order by 1; @@ -3968,20 +3944,16 @@ explain (verbose true, costs false, nodes false) select xc_groupby_tab1.val + xc -> Sort Output: xc_groupby_tab1.val, xc_groupby_tab2.val2 Sort Key: xc_groupby_tab1.val, xc_groupby_tab2.val2 - -> Merge Join + -> Hash Join Output: xc_groupby_tab1.val, xc_groupby_tab2.val2 - Merge Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) - -> Sort - Output: xc_groupby_tab1.val - Sort Key: xc_groupby_tab1.val - -> Seq Scan on public.xc_groupby_tab1 - Output: xc_groupby_tab1.val - -> Sort + Hash Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 + -> Hash Output: xc_groupby_tab2.val2, xc_groupby_tab2.val - Sort Key: xc_groupby_tab2.val -> Seq Scan on public.xc_groupby_tab2 Output: xc_groupby_tab2.val2, xc_groupby_tab2.val -(22 rows) +(18 rows) explain (verbose true, costs false, nodes false) select xc_groupby_tab1.val + xc_groupby_tab2.val2, 
xc_groupby_tab1.val, xc_groupby_tab2.val2 from xc_groupby_tab1, xc_groupby_tab2 where xc_groupby_tab1.val = xc_groupby_tab2.val group by xc_groupby_tab1.val, xc_groupby_tab2.val2; QUERY PLAN @@ -4034,20 +4006,16 @@ explain (verbose true, costs false, nodes false) select xc_groupby_tab1.val + xc -> Partial HashAggregate Output: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) Group Key: (xc_groupby_tab1.val + xc_groupby_tab2.val2) - -> Merge Join + -> Hash Join Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2) - Merge Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) - -> Sort - Output: xc_groupby_tab1.val - Sort Key: xc_groupby_tab1.val - -> Seq Scan on public.xc_groupby_tab1 - Output: xc_groupby_tab1.val - -> Sort + Hash Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 + -> Hash Output: xc_groupby_tab2.val2, xc_groupby_tab2.val - Sort Key: xc_groupby_tab2.val -> Seq Scan on public.xc_groupby_tab2 Output: xc_groupby_tab2.val2, xc_groupby_tab2.val -(28 rows) +(24 rows) explain (verbose true, costs false, nodes false) select xc_groupby_tab1.val + xc_groupby_tab2.val2 from xc_groupby_tab1, xc_groupby_tab2 where xc_groupby_tab1.val = xc_groupby_tab2.val group by xc_groupby_tab1.val + xc_groupby_tab2.val2; QUERY PLAN @@ -4063,20 +4031,16 @@ explain (verbose true, costs false, nodes false) select xc_groupby_tab1.val + xc -> Partial HashAggregate Output: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) Group Key: (xc_groupby_tab1.val + xc_groupby_tab2.val2) - -> Merge Join + -> Hash Join Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2) - Merge Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) - -> Sort - Output: xc_groupby_tab1.val - Sort Key: xc_groupby_tab1.val - -> Seq Scan on public.xc_groupby_tab1 - Output: xc_groupby_tab1.val - -> Sort + Hash Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 + -> Hash Output: xc_groupby_tab2.val2, xc_groupby_tab2.val - Sort Key: xc_groupby_tab2.val -> Seq Scan on public.xc_groupby_tab2 Output: xc_groupby_tab2.val2, xc_groupby_tab2.val -(24 rows) +(20 rows) -- group by with aggregates in expression select count(*) + sum(val) + avg(val), val2 from xc_groupby_tab1 group by val2 order by val2; @@ -4507,12 +4471,12 @@ explain (verbose true, costs false, nodes false) select * from (select b,count(b Remote Subquery Scan on all Output: b, count Sort Key: b - -> Sort - Output: xc_groupby_def.b, (count(xc_groupby_def.b)) - Sort Key: xc_groupby_def.b - -> Finalize HashAggregate - Output: xc_groupby_def.b, count(xc_groupby_def.b) - Group Key: xc_groupby_def.b + -> Finalize GroupAggregate + Output: xc_groupby_def.b, count(xc_groupby_def.b) + Group Key: xc_groupby_def.b + -> Sort + Output: xc_groupby_def.b, (PARTIAL count(xc_groupby_def.b)) + Sort Key: xc_groupby_def.b -> Remote Subquery Scan on all Output: xc_groupby_def.b, PARTIAL count(xc_groupby_def.b) Distribute results by H: b @@ -6058,20 +6022,16 @@ explain (verbose true, costs false, nodes false) select xc_groupby_tab1.val + xc -> Sort Output: xc_groupby_tab1.val, xc_groupby_tab2.val2 Sort Key: xc_groupby_tab1.val, xc_groupby_tab2.val2 - -> Merge Join + -> Hash Join Output: xc_groupby_tab1.val, xc_groupby_tab2.val2 - Merge Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) - -> Sort - Output: xc_groupby_tab1.val - Sort Key: xc_groupby_tab1.val - -> Seq Scan on public.xc_groupby_tab1 - Output: xc_groupby_tab1.val - -> Sort + Hash 
Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 + -> Hash Output: xc_groupby_tab2.val2, xc_groupby_tab2.val - Sort Key: xc_groupby_tab2.val -> Seq Scan on public.xc_groupby_tab2 Output: xc_groupby_tab2.val2, xc_groupby_tab2.val -(25 rows) +(21 rows) explain (verbose true, costs false, nodes false) select xc_groupby_tab1.val + xc_groupby_tab2.val2, xc_groupby_tab1.val, xc_groupby_tab2.val2 from xc_groupby_tab1, xc_groupby_tab2 where xc_groupby_tab1.val = xc_groupby_tab2.val group by xc_groupby_tab1.val, xc_groupby_tab2.val2; QUERY PLAN @@ -6130,20 +6090,16 @@ explain (verbose true, costs false, nodes false) select xc_groupby_tab1.val + xc -> Sort Output: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) Sort Key: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) - -> Merge Join + -> Hash Join Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2) - Merge Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) - -> Sort - Output: xc_groupby_tab1.val - Sort Key: xc_groupby_tab1.val - -> Seq Scan on public.xc_groupby_tab1 - Output: xc_groupby_tab1.val - -> Sort + Hash Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 + -> Hash Output: xc_groupby_tab2.val2, xc_groupby_tab2.val - Sort Key: xc_groupby_tab2.val -> Seq Scan on public.xc_groupby_tab2 Output: xc_groupby_tab2.val2, xc_groupby_tab2.val -(31 rows) +(27 rows) explain (verbose true, costs false, nodes false) select xc_groupby_tab1.val + xc_groupby_tab2.val2 from xc_groupby_tab1, xc_groupby_tab2 where xc_groupby_tab1.val = xc_groupby_tab2.val group by xc_groupby_tab1.val + xc_groupby_tab2.val2; QUERY PLAN @@ -6165,20 +6121,16 @@ explain (verbose true, costs false, nodes false) select xc_groupby_tab1.val + xc -> Sort Output: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) Sort Key: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) - -> Merge Join + -> Hash Join Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2) - Merge Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) - -> Sort - Output: xc_groupby_tab1.val - Sort Key: xc_groupby_tab1.val - -> Seq Scan on public.xc_groupby_tab1 - Output: xc_groupby_tab1.val - -> Sort + Hash Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 + -> Hash Output: xc_groupby_tab2.val2, xc_groupby_tab2.val - Sort Key: xc_groupby_tab2.val -> Seq Scan on public.xc_groupby_tab2 Output: xc_groupby_tab2.val2, xc_groupby_tab2.val -(30 rows) +(26 rows) -- group by with aggregates in expression select count(*) + sum(val) + avg(val), val2 from xc_groupby_tab1 group by val2 order by 1; diff --git a/src/test/regress/expected/xc_having_1.out b/src/test/regress/expected/xc_having_1.out index 93469960..9d914a2a 100644 --- a/src/test/regress/expected/xc_having_1.out +++ b/src/test/regress/expected/xc_having_1.out @@ -605,31 +605,26 @@ explain (verbose true, costs false, nodes false) select count(*), sum(xc_having_ -> GroupAggregate Output: count(*), sum((xc_having_tab1.val * xc_having_tab2.val)), avg((xc_having_tab1.val * xc_having_tab2.val)), ((sum((xc_having_tab1.val * xc_having_tab2.val)))::double precision / (count(*))::double precision), xc_having_tab1.val2, xc_having_tab2.val2 Group Key: xc_having_tab1.val2, xc_having_tab2.val2 - -> Merge Join + -> Sort Output: xc_having_tab1.val2, xc_having_tab2.val2, xc_having_tab1.val, xc_having_tab2.val - Merge Cond: 
(xc_having_tab1.val2 = xc_having_tab2.val2) + Sort Key: xc_having_tab1.val2 + -> Hash Join + Output: xc_having_tab1.val2, xc_having_tab2.val2, xc_having_tab1.val, xc_having_tab2.val + Hash Cond: (xc_having_tab1.val2 = xc_having_tab2.val2) Join Filter: ((xc_having_tab1.val2 + xc_having_tab2.val2) > 2) -> Remote Subquery Scan on all Output: xc_having_tab1.val, xc_having_tab1.val2 Distribute results by H: val2 - Sort Key: xc_having_tab1.val2 - -> Sort - Output: xc_having_tab1.val, xc_having_tab1.val2 - Sort Key: xc_having_tab1.val2 -> Seq Scan on public.xc_having_tab1 Output: xc_having_tab1.val, xc_having_tab1.val2 - -> Materialize + -> Hash Output: xc_having_tab2.val, xc_having_tab2.val2 -> Remote Subquery Scan on all Output: xc_having_tab2.val, xc_having_tab2.val2 Distribute results by H: val2 - Sort Key: xc_having_tab2.val2 - -> Sort - Output: xc_having_tab2.val, xc_having_tab2.val2 - Sort Key: xc_having_tab2.val2 -> Seq Scan on public.xc_having_tab2 Output: xc_having_tab2.val, xc_having_tab2.val2 -(29 rows) +(24 rows) -- group by and having, without aggregate in the target list select val2 from xc_having_tab1 group by val2 having sum(val) > 8; diff --git a/src/test/regress/expected/xl_join.out b/src/test/regress/expected/xl_join.out index 6369183d..6753f018 100644 --- a/src/test/regress/expected/xl_join.out +++ b/src/test/regress/expected/xl_join.out @@ -8,26 +8,25 @@ EXPLAIN (COSTS OFF) SELECT * FROM xl_join_t1 INNER JOIN xl_join_t2 ON xl_join_t1.val1 = xl_join_t2.val2 INNER JOIN xl_join_t3 ON xl_join_t1.val1 = xl_join_t3.val1; - QUERY PLAN ---------------------------------------------------------------------- + QUERY PLAN +----------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) -> Merge Join - Merge Cond: (xl_join_t2.val2 = xl_join_t1.val1) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: val2 - -> Sort - Sort Key: xl_join_t2.val2 - -> Seq Scan on xl_join_t2 - -> Materialize - -> Merge Join - Merge Cond: (xl_join_t1.val1 = xl_join_t3.val1) - -> Sort - Sort Key: xl_join_t1.val1 - -> Seq Scan on xl_join_t1 + Merge Cond: (xl_join_t1.val1 = xl_join_t3.val1) + -> Merge Join + Merge Cond: (xl_join_t2.val2 = xl_join_t1.val1) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: val2 -> Sort - Sort Key: xl_join_t3.val1 - -> Seq Scan on xl_join_t3 -(17 rows) + Sort Key: xl_join_t2.val2 + -> Seq Scan on xl_join_t2 + -> Sort + Sort Key: xl_join_t1.val1 + -> Seq Scan on xl_join_t1 + -> Sort + Sort Key: xl_join_t3.val1 + -> Seq Scan on xl_join_t3 +(16 rows) SELECT * FROM xl_join_t1 INNER JOIN xl_join_t2 ON xl_join_t1.val1 = xl_join_t2.val2 From cf30ce5bfb494fc97e2ab759eef4de08d4134985 Mon Sep 17 00:00:00 2001 From: andrelin Date: Fri, 5 Feb 2021 13:44:49 +0800 Subject: [PATCH 125/578] Abstract function path_count_datanodes for cost estimate --- src/backend/optimizer/path/costsize.c | 59 ++++--------------------- src/backend/optimizer/plan/createplan.c | 25 ++--------- src/backend/optimizer/util/pathnode.c | 27 +++++++++++ src/backend/utils/adt/selfuncs.c | 9 +--- src/include/optimizer/paths.h | 4 +- 5 files changed, 43 insertions(+), 81 deletions(-) diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index 4c816ac7..06e9a7c3 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -241,14 +241,7 @@ static ParamPathInfoDataNode * adjust_reloptinfo(Path *path, RelOptInfoDataNode *basescan, 
RelOptInfo *baserel_orig, ParamPathInfoDataNode *param_info, ParamPathInfo *param_info_orig) { - double nodes; - - if (path->distribution && IsA(path->distribution, Distribution) && - path->distribution->distributionType != LOCATOR_TYPE_REPLICATED && - path->distribution->distributionType != LOCATOR_TYPE_NONE) - nodes = bms_num_members(path->distribution->nodes); - else - nodes = 1; + double nodes = path_count_datanodes(path); basescan->relid = baserel_orig->relid; basescan->rtekind = baserel_orig->rtekind; @@ -659,12 +652,7 @@ cost_index(IndexPath *path, PlannerInfo *root, double loop_count, double nodes = 1; #ifdef __TBASE__ - if (path->path.distribution && IsA(path->path.distribution, Distribution) && - path->path.distribution->distributionType != LOCATOR_TYPE_REPLICATED && - path->path.distribution->distributionType != LOCATOR_TYPE_NONE) - { - nodes = bms_num_members(path->path.distribution->nodes); - } + nodes = path_count_datanodes(&path->path); /* Should only be applied to base relations */ Assert(IsA(baserel_orig, RelOptInfo) && IsA(index, IndexOptInfo)); @@ -1217,12 +1205,9 @@ cost_bitmap_heap_scan(Path *path, PlannerInfo *root, RelOptInfo *baserel, cpu_run_cost = cpu_per_tuple * tuples_fetched; #ifdef __TBASE__ - /* Adjust costing for parallelism between data nodes, if used. */ - if (path->distribution && IsA(path->distribution, Distribution) && - path->distribution->distributionType != LOCATOR_TYPE_REPLICATED && - path->distribution->distributionType != LOCATOR_TYPE_NONE) { - double nodes = bms_num_members(path->distribution->nodes); + /* Adjust costing for parallelism between data nodes, if used. */ + double nodes = path_count_datanodes(path); /* The CPU cost is divided among all the data nodes. */ cpu_run_cost /= nodes; @@ -2500,14 +2485,7 @@ final_cost_nestloop(PlannerInfo *root, NestPath *path, path->path.rows = path->path.parent->rows; #ifdef __TBASE__ - if (path->path.distribution && IsA(path->path.distribution, Distribution) && - path->path.distribution->distributionType != LOCATOR_TYPE_REPLICATED && - path->path.distribution->distributionType != LOCATOR_TYPE_NONE) - { - double nodes = bms_num_members(path->path.distribution->nodes); - - path->path.rows = clamp_row_est(path->path.rows / nodes); - } + path->path.rows = clamp_row_est(path->path.rows / path_count_datanodes(&path->path)); #endif /* For partial paths, scale row estimate. */ @@ -3000,14 +2978,7 @@ final_cost_mergejoin(PlannerInfo *root, MergePath *path, path->jpath.path.rows = path->jpath.path.parent->rows; #ifdef __TBASE__ - if (path->jpath.path.distribution && IsA(path->jpath.path.distribution, Distribution) && - path->jpath.path.distribution->distributionType != LOCATOR_TYPE_REPLICATED && - path->jpath.path.distribution->distributionType != LOCATOR_TYPE_NONE) - { - double nodes = bms_num_members(path->jpath.path.distribution->nodes); - - path->jpath.path.rows = clamp_row_est(path->jpath.path.rows / nodes); - } + path->jpath.path.rows = clamp_row_est(path->jpath.path.rows / path_count_datanodes(&path->jpath.path)); #endif /* For partial paths, scale row estimate. 
*/ @@ -3454,14 +3425,7 @@ final_cost_hashjoin(PlannerInfo *root, HashPath *path, path->jpath.path.rows = path->jpath.path.parent->rows; #ifdef __TBASE__ - if (path->jpath.path.distribution && IsA(path->jpath.path.distribution, Distribution) && - path->jpath.path.distribution->distributionType != LOCATOR_TYPE_REPLICATED && - path->jpath.path.distribution->distributionType != LOCATOR_TYPE_NONE) - { - double nodes = bms_num_members(path->jpath.path.distribution->nodes); - - path->jpath.path.rows = clamp_row_est(path->jpath.path.rows / nodes); - } + path->jpath.path.rows = clamp_row_est(path->jpath.path.rows / path_count_datanodes(&path->jpath.path)); #endif /* For partial paths, scale row estimate. */ @@ -5053,15 +5017,8 @@ set_subquery_size_estimates(PlannerInfo *root, RelOptInfo *rel) sub_final_rel = fetch_upper_rel(subroot, UPPERREL_FINAL, NULL); rel->tuples = sub_final_rel->cheapest_total_path->rows; #ifdef __TBASE__ - if (sub_final_rel->cheapest_total_path->distribution && IsA(sub_final_rel->cheapest_total_path->distribution, Distribution) && - sub_final_rel->cheapest_total_path->distribution->distributionType != LOCATOR_TYPE_REPLICATED && - sub_final_rel->cheapest_total_path->distribution->distributionType != LOCATOR_TYPE_NONE) - { - double nodes = bms_num_members(sub_final_rel->cheapest_total_path->distribution->nodes); - /* count tuples in all data nodes */ - rel->tuples *= nodes; - } + rel->tuples *= path_count_datanodes(sub_final_rel->cheapest_total_path); #endif /* diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index 31bbb2d6..e4ccad0c 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -3814,12 +3814,7 @@ create_bitmap_subplan(PlannerInfo *root, Path *bitmapqual, double nodes = 1; #ifdef __TBASE__ - if (apath->path.distribution && IsA(apath->path.distribution, Distribution) && - apath->path.distribution->distributionType != LOCATOR_TYPE_REPLICATED && - apath->path.distribution->distributionType != LOCATOR_TYPE_NONE) - { - nodes = bms_num_members(apath->path.distribution->nodes); - } + nodes = path_count_datanodes(&apath->path); #endif /* @@ -3911,14 +3906,8 @@ create_bitmap_subplan(PlannerInfo *root, Path *bitmapqual, { double nodes = 1; #ifdef __TBASE__ - if (opath->path.distribution && IsA(opath->path.distribution, Distribution) && - opath->path.distribution->distributionType != LOCATOR_TYPE_REPLICATED && - opath->path.distribution->distributionType != LOCATOR_TYPE_NONE) - { - nodes = bms_num_members(opath->path.distribution->nodes); - } + nodes = path_count_datanodes(&opath->path); #endif - plan = (Plan *) make_bitmap_or(subplans); plan->startup_cost = opath->path.startup_cost; plan->total_cost = opath->path.total_cost; @@ -3955,16 +3944,9 @@ create_bitmap_subplan(PlannerInfo *root, Path *bitmapqual, List *subindexECs; ListCell *l; double nodes = 1; - #ifdef __TBASE__ - if (ipath->path.distribution && IsA(ipath->path.distribution, Distribution) && - ipath->path.distribution->distributionType != LOCATOR_TYPE_REPLICATED && - ipath->path.distribution->distributionType != LOCATOR_TYPE_NONE) - { - nodes = bms_num_members(ipath->path.distribution->nodes); - } + nodes = path_count_datanodes(&ipath->path); #endif - /* Use the regular indexscan plan build machinery... 
*/ iscan = castNode(IndexScan, create_indexscan_plan(root, ipath, @@ -6424,6 +6406,7 @@ make_remotesubplan(PlannerInfo *root, Assert(!IsA(lefttree, RemoteSubplan)); #ifdef __TBASE__ + /* do things like path_count_datanodes, but we have only distribution here */ if (execDistribution && (execDistribution->distributionType == LOCATOR_TYPE_HASH || execDistribution->distributionType == LOCATOR_TYPE_SHARD)) diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index e5551e0c..690adbfd 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -1405,6 +1405,10 @@ set_scanpath_distribution(PlannerInfo *root, RelOptInfo *rel, Path *pathnode) } #ifdef __TBASE__ +/* + * implementation for create_remotesubplan_path, besides regular creation of remote subplan, + * we need it when redistributing join rel. + */ static Path * create_remotesubplan_path_internal(PlannerInfo *root, Path *subpath, Distribution *distribution, RelOptInfo *rel, @@ -6994,3 +6998,26 @@ reparameterize_path(PlannerInfo *root, Path *path, } return NULL; } + +#ifdef __TBASE__ +/* + * count datanode number for given path, consider replication table as 1 + * because we use this function to figure out how many parts that data + * had been separated into, when we estimating costs of a plan. Therefore + * to get more accurate estimating result as in a distributed system. + */ +double +path_count_datanodes(Path *path) +{ + if (path->distribution && IsA(path->distribution, Distribution) && + (path->distribution->distributionType == LOCATOR_TYPE_SHARD || + path->distribution->distributionType == LOCATOR_TYPE_HASH)) + { + double nodes = bms_num_members(path->distribution->nodes); + if (nodes > 0) + return nodes; + } + + return 1; +} +#endif diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index e24d7193..80a4ce72 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -3545,14 +3545,7 @@ estimate_num_groups(PlannerInfo *root, List *groupExprs, double input_rows, double nodes = 1; if (list_length(rel->pathlist) > 0) { - Path *path = linitial(rel->pathlist); - if (path->distribution && - (path->distribution->distributionType == LOCATOR_TYPE_HASH || - path->distribution->distributionType == LOCATOR_TYPE_SHARD)) - nodes = bms_num_members(path->distribution->nodes); - /* for sanity */ - if (nodes < 1) - nodes = 1; + nodes = path_count_datanodes(linitial(rel->pathlist)); } #endif if (relvarcount > 1) diff --git a/src/include/optimizer/paths.h b/src/include/optimizer/paths.h index cf766c0f..416d15d8 100644 --- a/src/include/optimizer/paths.h +++ b/src/include/optimizer/paths.h @@ -228,5 +228,7 @@ extern bool has_useful_pathkeys(PlannerInfo *root, RelOptInfo *rel); extern PathKey *make_canonical_pathkey(PlannerInfo *root, EquivalenceClass *eclass, Oid opfamily, int strategy, bool nulls_first); - +#ifdef __TBASE__ +extern double path_count_datanodes(Path *path); +#endif #endif /* PATHS_H */ From 90ad670bfcfe9ef3ed0eb789023b6d8dcdbe67c7 Mon Sep 17 00:00:00 2001 From: yeyukui Date: Thu, 31 Dec 2020 11:23:15 +0800 Subject: [PATCH 126/578] v2 support gbk and gb18030, tapd:http://tapd.oa.com/pgxz/prong/stories/view/1010092131861052005 --- src/backend/access/common/heaptuple.c | 33 +- src/backend/access/common/printtup.c | 17 + src/backend/executor/execTuples.c | 2 + src/backend/utils/adt/varlena.c | 26 + src/backend/utils/mb/encnames.c | 7 +- src/backend/utils/mb/mbutils.c | 6 + src/backend/utils/mb/wchar.c | 65 ++- 
src/include/c.h | 3 + src/include/mb/pg_wchar.h | 585 ++++++++++---------- src/test/regress/expected/rowsecurity_1.out | 10 +- src/test/regress/expected/zhcn_gb18030.out | 132 +++++ src/test/regress/expected/zhcn_utf8.out | 264 +++++++++ src/test/regress/parallel_schedule | 4 +- src/test/regress/pg_regress.c | 3 +- src/test/regress/serial_schedule | 2 + src/test/regress/sql/rowsecurity.sql | 10 +- src/test/regress/sql/zhcn_gb18030.sql | 65 +++ src/test/regress/sql/zhcn_utf8.sql | 140 +++++ 18 files changed, 1065 insertions(+), 309 deletions(-) create mode 100644 src/test/regress/expected/zhcn_gb18030.out create mode 100644 src/test/regress/expected/zhcn_utf8.out create mode 100644 src/test/regress/sql/zhcn_gb18030.sql create mode 100644 src/test/regress/sql/zhcn_utf8.sql diff --git a/src/backend/access/common/heaptuple.c b/src/backend/access/common/heaptuple.c index d134e17d..d858d75b 100644 --- a/src/backend/access/common/heaptuple.c +++ b/src/backend/access/common/heaptuple.c @@ -78,6 +78,8 @@ #ifdef __TBASE__ #include "utils/typcache.h" #include "pgxc/execRemote.h" +#include "catalog/pg_type.h" +#include "mb/pg_wchar.h" #endif /* Does att's datatype allow packing into the 1-byte-header varlena format? */ @@ -1328,6 +1330,30 @@ slot_deform_tuple(TupleTableSlot *slot, int natts) slot->tts_slow = slow; } +/** + * get maximum bytes number from column define size, if column is bounded string, return -1 + * then InputFunctionCall -> varchar2_input|varchar_input|varchar2_input|nvarchar2_input + * avoid to verification the length of string which encoded by client encode + */ +static int +get_typioparam_mod(Oid typioparam, int32 typmod) +{ + switch (typioparam) + { + case CHAROID: + case BPCHAROID: + case VARCHAROID: +#ifdef _PG_ORCL_ + case VARCHAR2OID: + case NVARCHAR2OID: +#endif + return -1; + + default: + return typmod; + } +} + /* * slot_deform_datarow * Extract data from the DataRow message into Datum/isnull arrays. 
@@ -1480,13 +1506,18 @@ slot_deform_datarow(TupleTableSlot *slot) #endif else { + int typmod = slot->tts_attinmeta->atttypmods[i]; appendBinaryStringInfo(buffer, cur, len); cur += len; + if (GetDatabaseEncoding() != pg_get_client_encoding() && + pg_get_client_encoding() != PG_SQL_ASCII && IS_PGXC_LOCAL_COORDINATOR) + typmod = get_typioparam_mod(slot->tts_attinmeta->attioparams[i], typmod); + slot->tts_values[i] = InputFunctionCall(slot->tts_attinmeta->attinfuncs + i, buffer->data, slot->tts_attinmeta->attioparams[i], - slot->tts_attinmeta->atttypmods[i]); + typmod); slot->tts_isnull[i] = false; resetStringInfo(buffer); diff --git a/src/backend/access/common/printtup.c b/src/backend/access/common/printtup.c index 3124b935..c7f180a5 100644 --- a/src/backend/access/common/printtup.c +++ b/src/backend/access/common/printtup.c @@ -335,6 +335,7 @@ printtup(TupleTableSlot *slot, DestReceiver *self) int natts = typeinfo->natts; int i; bool binary = false; + bool needEncodingConvert = false; #ifdef __TBASE__ if (end_query_requested) @@ -399,6 +400,12 @@ printtup(TupleTableSlot *slot, DestReceiver *self) pq_sendint(&buf, natts, 2); + /* encoding convert only on datanode when connect from coordinator node or connect from app */ + if (isPGXCDataNode && (IsConnFromCoord() || IsConnFromApp())) + { + needEncodingConvert = true; + } + /* * send the attributes of this tuple */ @@ -430,10 +437,20 @@ printtup(TupleTableSlot *slot, DestReceiver *self) char *outputstr; outputstr = OutputFunctionCall(&thisState->finfo, attr); + + if (needEncodingConvert) + { pq_sendcountedtext(&buf, outputstr, strlen(outputstr), false); } else { + int len = strlen(outputstr); + pq_sendint(&buf, len, 4); + appendBinaryStringInfo(&buf, outputstr, len); + } + } + else + { /* Binary output */ bytea *outputbytes; diff --git a/src/backend/executor/execTuples.c b/src/backend/executor/execTuples.c index d36cc392..4cd13fa4 100644 --- a/src/backend/executor/execTuples.c +++ b/src/backend/executor/execTuples.c @@ -501,7 +501,9 @@ ExecClearTuple(TupleTableSlot *slot) /* slot in which to store tuple */ heap_free_minimal_tuple(slot->tts_mintuple); #ifdef PGXC if (slot->tts_shouldFreeRow) + { pfree(slot->tts_datarow); + } slot->tts_shouldFreeRow = false; slot->tts_datarow = NULL; diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c index 8f170abe..51d53bab 100644 --- a/src/backend/utils/adt/varlena.c +++ b/src/backend/utils/adt/varlena.c @@ -2220,6 +2220,19 @@ varstrfastcmp_locale(Datum x, Datum y, SortSupport ssup) * memcmp() compares data from cachelines that are needed in L1 cache even * when the last comparison's result cannot be reused. */ +#ifdef __TBASE__ + /** + * on cn node, when client encoding is not equals server encoding, a1p is client encoding + * so must convert a1p to server encoding + */ + if (GetDatabaseEncoding() != pg_get_client_encoding() && + pg_get_client_encoding() != PG_SQL_ASCII && IS_PGXC_LOCAL_COORDINATOR) + { + a1p = pg_client_to_server(a1p, strnlen(a1p, len1)); + len1 = strlen(a1p); + } +#endif + arg1_match = true; if (len1 != sss->last_len1 || memcmp(sss->buf1, a1p, len1) != 0) { @@ -2235,6 +2248,19 @@ varstrfastcmp_locale(Datum x, Datum y, SortSupport ssup) * it seems (at least with moderate to low cardinality sets), because * quicksort compares the same pivot against many values. 
*/ +#ifdef __TBASE__ + /** + * on cn node, when client encoding is not equals server encoding, a2p is client encoding + * so must convert a2p to server encoding + */ + if (GetDatabaseEncoding() != pg_get_client_encoding() && + pg_get_client_encoding() != PG_SQL_ASCII && IS_PGXC_LOCAL_COORDINATOR) + { + a2p = pg_client_to_server(a2p, strnlen(a2p, len2)); + len2 = strlen(a2p); + } +#endif + if (len2 != sss->last_len2 || memcmp(sss->buf2, a2p, len2) != 0) { memcpy(sss->buf2, a2p, len2); diff --git a/src/backend/utils/mb/encnames.c b/src/backend/utils/mb/encnames.c index e79eb2fd..16422b65 100644 --- a/src/backend/utils/mb/encnames.c +++ b/src/backend/utils/mb/encnames.c @@ -449,6 +449,11 @@ static const char *const pg_enc2icu_tbl[] = "CP1255", /* PG_WIN1255 */ "CP1257", /* PG_WIN1257 */ "KOI8-U", /* PG_KOI8U */ + NULL, /* Shift JIS (Windows-932) */ + NULL, /* Big5 (Windows-950) */ + "GBK", /* GBK (Windows-936) */ + NULL, /* UHC (Windows-949) */ + "GB18030", /* GB18030 */ }; bool @@ -462,7 +467,7 @@ get_encoding_name_for_icu(int encoding) { const char *icu_encoding_name; - StaticAssertStmt(lengthof(pg_enc2icu_tbl) == PG_ENCODING_BE_LAST + 1, + StaticAssertStmt(lengthof(pg_enc2icu_tbl) == PG_SERVER_ENCODING_BE_LAST + 1, "pg_enc2icu_tbl incomplete"); icu_encoding_name = pg_enc2icu_tbl[encoding]; diff --git a/src/backend/utils/mb/mbutils.c b/src/backend/utils/mb/mbutils.c index f466a0da..8d34efa4 100644 --- a/src/backend/utils/mb/mbutils.c +++ b/src/backend/utils/mb/mbutils.c @@ -574,6 +574,12 @@ pg_any_to_server(const char *s, int len, int encoding) if (len <= 0) return (char *) s; /* empty string is always valid */ + /* + * no need to convert on datanode node + */ + if (IsConnFromCoord() || IsConnFromDatanode()) + return (char *) s; + if (encoding == DatabaseEncoding->encoding || encoding == PG_SQL_ASCII) { diff --git a/src/backend/utils/mb/wchar.c b/src/backend/utils/mb/wchar.c index 7344e44b..33e745d6 100644 --- a/src/backend/utils/mb/wchar.c +++ b/src/backend/utils/mb/wchar.c @@ -1020,6 +1020,31 @@ pg_big5_dsplen(const unsigned char *s) /* * GBK */ +static int +pg_gbk2wchar_with_len(const unsigned char *from, pg_wchar *to, int len) +{ + int cnt = 0; + + while (len > 0 && *from) + { + if (IS_HIGHBIT_SET(*from) && len >= 2) /* code set 1 */ + { + *to = *from++ << 8; + *to |= *from++; + len -= 2; + } + else /* should be ASCII */ + { + *to = *from++; + len--; + } + to++; + cnt++; + } + *to = 0; + return cnt; +} + static int pg_gbk_mblen(const unsigned char *s) { @@ -1075,6 +1100,42 @@ pg_uhc_dsplen(const unsigned char *s) * GB18030 * Added by Bill Huang , */ +static int +pg_gb18030_2_wchar_with_len(const unsigned char *from, pg_wchar *to, int len) +{ + int cnt = 0; + + while (len > 0 && *from) + { + if (IS_HIGHBIT_SET(*from) && len >= 2) /* 2 bytes */ + { + if (IS_GB18030_SET(*(from + 1)) && len >= 4) /* 4 bytes for CJK */ + { + *to = *from++ << 24; + *to |= *from++ << 16; + *to |= *from++ << 8; + *to |= *from++; + len -= 4; + } + else + { + *to = *from++ << 8; + *to |= *from++; + len -= 2; + } + } + else /* should be ASCII */ + { + *to = *from++; + len--; + } + to++; + cnt++; + } + *to = 0; + return cnt; +} + static int pg_gb18030_mblen(const unsigned char *s) { @@ -1766,9 +1827,9 @@ const pg_wchar_tbl pg_wchar_table[] = { {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_KOI8U */ {0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifier, 2}, /* PG_SJIS */ {0, 0, pg_big5_mblen, pg_big5_dsplen, pg_big5_verifier, 2}, /* PG_BIG5 */ - 
{0, 0, pg_gbk_mblen, pg_gbk_dsplen, pg_gbk_verifier, 2}, /* PG_GBK */ + {pg_gbk2wchar_with_len, pg_wchar2euc_with_len, pg_gbk_mblen, pg_gbk_dsplen, pg_gbk_verifier, 2}, /* PG_GBK */ {0, 0, pg_uhc_mblen, pg_uhc_dsplen, pg_uhc_verifier, 2}, /* PG_UHC */ - {0, 0, pg_gb18030_mblen, pg_gb18030_dsplen, pg_gb18030_verifier, 4}, /* PG_GB18030 */ + {pg_gb18030_2_wchar_with_len, pg_wchar2euc_with_len, pg_gb18030_mblen, pg_gb18030_dsplen, pg_gb18030_verifier, 4}, /* PG_GB18030 */ {0, 0, pg_johab_mblen, pg_johab_dsplen, pg_johab_verifier, 3}, /* PG_JOHAB */ {0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifier, 2} /* PG_SHIFT_JIS_2004 */ }; diff --git a/src/include/c.h b/src/include/c.h index 4c2b1d98..d4a4033d 100644 --- a/src/include/c.h +++ b/src/include/c.h @@ -1047,6 +1047,9 @@ typedef NameData *Name; /* msb for char */ #define HIGHBIT (0x80) #define IS_HIGHBIT_SET(ch) ((unsigned char)(ch) & HIGHBIT) +#define GB18030_2ND_MIX (0x30) +#define GB18030_2ND_MAX (0x39) +#define IS_GB18030_SET(ch) ((ch) <= GB18030_2ND_MAX && (ch) >= GB18030_2ND_MIX) #define STATUS_OK (0) #define STATUS_ERROR (-1) diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h index bed8069b..44bb9b4a 100644 --- a/src/include/mb/pg_wchar.h +++ b/src/include/mb/pg_wchar.h @@ -1,18 +1,18 @@ /*------------------------------------------------------------------------- * * pg_wchar.h - * multibyte-character support + * multibyte-character support * * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * src/include/mb/pg_wchar.h * - * NOTES - * This is used both by the backend and by libpq, but should not be - * included by libpq client programs. In particular, a libpq client - * should not assume that the encoding IDs used by the version of libpq - * it's linked to match up with the IDs declared here. + * NOTES + * This is used both by the backend and by libpq, but should not be + * included by libpq client programs. In particular, a libpq client + * should not assume that the encoding IDs used by the version of libpq + * it's linked to match up with the IDs declared here. * *------------------------------------------------------------------------- */ @@ -27,13 +27,13 @@ typedef unsigned int pg_wchar; /* * Maximum byte length of multibyte characters in any backend encoding */ -#define MAX_MULTIBYTE_CHAR_LEN 4 +#define MAX_MULTIBYTE_CHAR_LEN 4 /* * various definitions for EUC */ -#define SS2 0x8e /* single shift 2 (JIS0201) */ -#define SS3 0x8f /* single shift 3 (JIS0212) */ +#define SS2 0x8e /* single shift 2 (JIS0201) */ +#define SS3 0x8f /* single shift 3 (JIS0212) */ /* * SJIS validation macros @@ -54,28 +54,28 @@ typedef unsigned int pg_wchar; * 1) 1-byte ASCII characters. Each byte is below 0x80. * * 2) "Official" single byte charsets such as ISO-8859-1 (Latin1). - * Each MULE character consists of 2 bytes: LC1 + C1, where LC1 is - * an identifier for the charset (in the range 0x81 to 0x8d) and C1 - * is the character code (in the range 0xa0 to 0xff). + * Each MULE character consists of 2 bytes: LC1 + C1, where LC1 is + * an identifier for the charset (in the range 0x81 to 0x8d) and C1 + * is the character code (in the range 0xa0 to 0xff). * * 3) "Private" single byte charsets such as SISHENG. Each MULE - * character consists of 3 bytes: LCPRV1 + LC12 + C1, where LCPRV1 - * is a private-charset flag, LC12 is an identifier for the charset, - * and C1 is the character code (in the range 0xa0 to 0xff). 
- * LCPRV1 is either 0x9a (if LC12 is in the range 0xa0 to 0xdf) - * or 0x9b (if LC12 is in the range 0xe0 to 0xef). + * character consists of 3 bytes: LCPRV1 + LC12 + C1, where LCPRV1 + * is a private-charset flag, LC12 is an identifier for the charset, + * and C1 is the character code (in the range 0xa0 to 0xff). + * LCPRV1 is either 0x9a (if LC12 is in the range 0xa0 to 0xdf) + * or 0x9b (if LC12 is in the range 0xe0 to 0xef). * * 4) "Official" multibyte charsets such as JIS X0208. Each MULE - * character consists of 3 bytes: LC2 + C1 + C2, where LC2 is - * an identifier for the charset (in the range 0x90 to 0x99) and C1 - * and C2 form the character code (each in the range 0xa0 to 0xff). + * character consists of 3 bytes: LC2 + C1 + C2, where LC2 is + * an identifier for the charset (in the range 0x90 to 0x99) and C1 + * and C2 form the character code (each in the range 0xa0 to 0xff). * * 5) "Private" multibyte charsets such as CNS 11643-1992 Plane 3. - * Each MULE character consists of 4 bytes: LCPRV2 + LC22 + C1 + C2, - * where LCPRV2 is a private-charset flag, LC22 is an identifier for - * the charset, and C1 and C2 form the character code (each in the range - * 0xa0 to 0xff). LCPRV2 is either 0x9c (if LC22 is in the range 0xf0 - * to 0xf4) or 0x9d (if LC22 is in the range 0xf5 to 0xfe). + * Each MULE character consists of 4 bytes: LCPRV2 + LC22 + C1 + C2, + * where LCPRV2 is a private-charset flag, LC22 is an identifier for + * the charset, and C1 and C2 form the character code (each in the range + * 0xa0 to 0xff). LCPRV2 is either 0x9c (if LC22 is in the range 0xf0 + * to 0xf4) or 0x9d (if LC22 is in the range 0xf5 to 0xfe). * * "Official" encodings are those that have been assigned code numbers by * the XEmacs project; "private" encodings have Postgres-specific charset @@ -99,119 +99,119 @@ typedef unsigned int pg_wchar; /* * Charset IDs for official single byte encodings (0x81-0x8e) */ -#define LC_ISO8859_1 0x81 /* ISO8859 Latin 1 */ -#define LC_ISO8859_2 0x82 /* ISO8859 Latin 2 */ -#define LC_ISO8859_3 0x83 /* ISO8859 Latin 3 */ -#define LC_ISO8859_4 0x84 /* ISO8859 Latin 4 */ -#define LC_TIS620 0x85 /* Thai (not supported yet) */ -#define LC_ISO8859_7 0x86 /* Greek (not supported yet) */ -#define LC_ISO8859_6 0x87 /* Arabic (not supported yet) */ -#define LC_ISO8859_8 0x88 /* Hebrew (not supported yet) */ -#define LC_JISX0201K 0x89 /* Japanese 1 byte kana */ -#define LC_JISX0201R 0x8a /* Japanese 1 byte Roman */ +#define LC_ISO8859_1 0x81 /* ISO8859 Latin 1 */ +#define LC_ISO8859_2 0x82 /* ISO8859 Latin 2 */ +#define LC_ISO8859_3 0x83 /* ISO8859 Latin 3 */ +#define LC_ISO8859_4 0x84 /* ISO8859 Latin 4 */ +#define LC_TIS620 0x85 /* Thai (not supported yet) */ +#define LC_ISO8859_7 0x86 /* Greek (not supported yet) */ +#define LC_ISO8859_6 0x87 /* Arabic (not supported yet) */ +#define LC_ISO8859_8 0x88 /* Hebrew (not supported yet) */ +#define LC_JISX0201K 0x89 /* Japanese 1 byte kana */ +#define LC_JISX0201R 0x8a /* Japanese 1 byte Roman */ /* Note that 0x8b seems to be unused as of Emacs 20.7. * However, there might be a chance that 0x8b could be used * in later versions of Emacs. 
*/ -#define LC_KOI8_R 0x8b /* Cyrillic KOI8-R */ -#define LC_ISO8859_5 0x8c /* ISO8859 Cyrillic */ -#define LC_ISO8859_9 0x8d /* ISO8859 Latin 5 (not supported yet) */ -#define LC_ISO8859_15 0x8e /* ISO8859 Latin 15 (not supported yet) */ -/* #define CONTROL_1 0x8f control characters (unused) */ +#define LC_KOI8_R 0x8b /* Cyrillic KOI8-R */ +#define LC_ISO8859_5 0x8c /* ISO8859 Cyrillic */ +#define LC_ISO8859_9 0x8d /* ISO8859 Latin 5 (not supported yet) */ +#define LC_ISO8859_15 0x8e /* ISO8859 Latin 15 (not supported yet) */ +/* #define CONTROL_1 0x8f control characters (unused) */ /* Is a leading byte for "official" single byte encodings? */ -#define IS_LC1(c) ((unsigned char)(c) >= 0x81 && (unsigned char)(c) <= 0x8d) +#define IS_LC1(c) ((unsigned char)(c) >= 0x81 && (unsigned char)(c) <= 0x8d) /* * Charset IDs for official multibyte encodings (0x90-0x99) * 0x9a-0x9d are free. 0x9e and 0x9f are reserved. */ -#define LC_JISX0208_1978 0x90 /* Japanese Kanji, old JIS (not supported) */ -#define LC_GB2312_80 0x91 /* Chinese */ -#define LC_JISX0208 0x92 /* Japanese Kanji (JIS X 0208) */ -#define LC_KS5601 0x93 /* Korean */ -#define LC_JISX0212 0x94 /* Japanese Kanji (JIS X 0212) */ -#define LC_CNS11643_1 0x95 /* CNS 11643-1992 Plane 1 */ -#define LC_CNS11643_2 0x96 /* CNS 11643-1992 Plane 2 */ -#define LC_JISX0213_1 0x97 /* Japanese Kanji (JIS X 0213 Plane 1) - * (not supported) */ -#define LC_BIG5_1 0x98 /* Plane 1 Chinese traditional (not - * supported) */ -#define LC_BIG5_2 0x99 /* Plane 1 Chinese traditional (not - * supported) */ +#define LC_JISX0208_1978 0x90 /* Japanese Kanji, old JIS (not supported) */ +#define LC_GB2312_80 0x91 /* Chinese */ +#define LC_JISX0208 0x92 /* Japanese Kanji (JIS X 0208) */ +#define LC_KS5601 0x93 /* Korean */ +#define LC_JISX0212 0x94 /* Japanese Kanji (JIS X 0212) */ +#define LC_CNS11643_1 0x95 /* CNS 11643-1992 Plane 1 */ +#define LC_CNS11643_2 0x96 /* CNS 11643-1992 Plane 2 */ +#define LC_JISX0213_1 0x97 /* Japanese Kanji (JIS X 0213 Plane 1) + * (not supported) */ +#define LC_BIG5_1 0x98 /* Plane 1 Chinese traditional (not + * supported) */ +#define LC_BIG5_2 0x99 /* Plane 1 Chinese traditional (not + * supported) */ /* Is a leading byte for "official" multibyte encodings? 
*/ -#define IS_LC2(c) ((unsigned char)(c) >= 0x90 && (unsigned char)(c) <= 0x99) +#define IS_LC2(c) ((unsigned char)(c) >= 0x90 && (unsigned char)(c) <= 0x99) /* * Postgres-specific prefix bytes for "private" single byte encodings * (According to the MULE docs, we should be using 0x9e for this) */ -#define LCPRV1_A 0x9a -#define LCPRV1_B 0x9b -#define IS_LCPRV1(c) ((unsigned char)(c) == LCPRV1_A || (unsigned char)(c) == LCPRV1_B) -#define IS_LCPRV1_A_RANGE(c) \ - ((unsigned char)(c) >= 0xa0 && (unsigned char)(c) <= 0xdf) -#define IS_LCPRV1_B_RANGE(c) \ - ((unsigned char)(c) >= 0xe0 && (unsigned char)(c) <= 0xef) +#define LCPRV1_A 0x9a +#define LCPRV1_B 0x9b +#define IS_LCPRV1(c) ((unsigned char)(c) == LCPRV1_A || (unsigned char)(c) == LCPRV1_B) +#define IS_LCPRV1_A_RANGE(c) \ + ((unsigned char)(c) >= 0xa0 && (unsigned char)(c) <= 0xdf) +#define IS_LCPRV1_B_RANGE(c) \ + ((unsigned char)(c) >= 0xe0 && (unsigned char)(c) <= 0xef) /* * Postgres-specific prefix bytes for "private" multibyte encodings * (According to the MULE docs, we should be using 0x9f for this) */ -#define LCPRV2_A 0x9c -#define LCPRV2_B 0x9d -#define IS_LCPRV2(c) ((unsigned char)(c) == LCPRV2_A || (unsigned char)(c) == LCPRV2_B) -#define IS_LCPRV2_A_RANGE(c) \ - ((unsigned char)(c) >= 0xf0 && (unsigned char)(c) <= 0xf4) -#define IS_LCPRV2_B_RANGE(c) \ - ((unsigned char)(c) >= 0xf5 && (unsigned char)(c) <= 0xfe) +#define LCPRV2_A 0x9c +#define LCPRV2_B 0x9d +#define IS_LCPRV2(c) ((unsigned char)(c) == LCPRV2_A || (unsigned char)(c) == LCPRV2_B) +#define IS_LCPRV2_A_RANGE(c) \ + ((unsigned char)(c) >= 0xf0 && (unsigned char)(c) <= 0xf4) +#define IS_LCPRV2_B_RANGE(c) \ + ((unsigned char)(c) >= 0xf5 && (unsigned char)(c) <= 0xfe) /* * Charset IDs for private single byte encodings (0xa0-0xef) */ -#define LC_SISHENG 0xa0 /* Chinese SiSheng characters for - * PinYin/ZhuYin (not supported) */ -#define LC_IPA 0xa1 /* IPA (International Phonetic - * Association) (not supported) */ -#define LC_VISCII_LOWER 0xa2 /* Vietnamese VISCII1.1 lower-case (not - * supported) */ -#define LC_VISCII_UPPER 0xa3 /* Vietnamese VISCII1.1 upper-case (not - * supported) */ -#define LC_ARABIC_DIGIT 0xa4 /* Arabic digit (not supported) */ -#define LC_ARABIC_1_COLUMN 0xa5 /* Arabic 1-column (not supported) */ -#define LC_ASCII_RIGHT_TO_LEFT 0xa6 /* ASCII (left half of ISO8859-1) with - * right-to-left direction (not - * supported) */ -#define LC_LAO 0xa7 /* Lao characters (ISO10646 0E80..0EDF) - * (not supported) */ -#define LC_ARABIC_2_COLUMN 0xa8 /* Arabic 1-column (not supported) */ +#define LC_SISHENG 0xa0 /* Chinese SiSheng characters for + * PinYin/ZhuYin (not supported) */ +#define LC_IPA 0xa1 /* IPA (International Phonetic + * Association) (not supported) */ +#define LC_VISCII_LOWER 0xa2 /* Vietnamese VISCII1.1 lower-case (not + * supported) */ +#define LC_VISCII_UPPER 0xa3 /* Vietnamese VISCII1.1 upper-case (not + * supported) */ +#define LC_ARABIC_DIGIT 0xa4 /* Arabic digit (not supported) */ +#define LC_ARABIC_1_COLUMN 0xa5 /* Arabic 1-column (not supported) */ +#define LC_ASCII_RIGHT_TO_LEFT 0xa6 /* ASCII (left half of ISO8859-1) with + * right-to-left direction (not + * supported) */ +#define LC_LAO 0xa7 /* Lao characters (ISO10646 0E80..0EDF) + * (not supported) */ +#define LC_ARABIC_2_COLUMN 0xa8 /* Arabic 1-column (not supported) */ /* * Charset IDs for private multibyte encodings (0xf0-0xff) */ -#define LC_INDIAN_1_COLUMN 0xf0 /* Indian charset for 1-column width - * glyphs (not supported) */ -#define LC_TIBETAN_1_COLUMN 0xf1 /* Tibetan 
1-column width glyphs (not - * supported) */ -#define LC_UNICODE_SUBSET_2 0xf2 /* Unicode characters of the range - * U+2500..U+33FF. (not supported) */ -#define LC_UNICODE_SUBSET_3 0xf3 /* Unicode characters of the range - * U+E000..U+FFFF. (not supported) */ -#define LC_UNICODE_SUBSET 0xf4 /* Unicode characters of the range - * U+0100..U+24FF. (not supported) */ -#define LC_ETHIOPIC 0xf5 /* Ethiopic characters (not supported) */ -#define LC_CNS11643_3 0xf6 /* CNS 11643-1992 Plane 3 */ -#define LC_CNS11643_4 0xf7 /* CNS 11643-1992 Plane 4 */ -#define LC_CNS11643_5 0xf8 /* CNS 11643-1992 Plane 5 */ -#define LC_CNS11643_6 0xf9 /* CNS 11643-1992 Plane 6 */ -#define LC_CNS11643_7 0xfa /* CNS 11643-1992 Plane 7 */ -#define LC_INDIAN_2_COLUMN 0xfb /* Indian charset for 2-column width - * glyphs (not supported) */ -#define LC_TIBETAN 0xfc /* Tibetan (not supported) */ -/* #define FREE 0xfd free (unused) */ -/* #define FREE 0xfe free (unused) */ -/* #define FREE 0xff free (unused) */ +#define LC_INDIAN_1_COLUMN 0xf0 /* Indian charset for 1-column width + * glyphs (not supported) */ +#define LC_TIBETAN_1_COLUMN 0xf1 /* Tibetan 1-column width glyphs (not + * supported) */ +#define LC_UNICODE_SUBSET_2 0xf2 /* Unicode characters of the range + * U+2500..U+33FF. (not supported) */ +#define LC_UNICODE_SUBSET_3 0xf3 /* Unicode characters of the range + * U+E000..U+FFFF. (not supported) */ +#define LC_UNICODE_SUBSET 0xf4 /* Unicode characters of the range + * U+0100..U+24FF. (not supported) */ +#define LC_ETHIOPIC 0xf5 /* Ethiopic characters (not supported) */ +#define LC_CNS11643_3 0xf6 /* CNS 11643-1992 Plane 3 */ +#define LC_CNS11643_4 0xf7 /* CNS 11643-1992 Plane 4 */ +#define LC_CNS11643_5 0xf8 /* CNS 11643-1992 Plane 5 */ +#define LC_CNS11643_6 0xf9 /* CNS 11643-1992 Plane 6 */ +#define LC_CNS11643_7 0xfa /* CNS 11643-1992 Plane 7 */ +#define LC_INDIAN_2_COLUMN 0xfb /* Indian charset for 2-column width + * glyphs (not supported) */ +#define LC_TIBETAN 0xfc /* Tibetan (not supported) */ +/* #define FREE 0xfd free (unused) */ +/* #define FREE 0xfe free (unused) */ +/* #define FREE 0xff free (unused) */ /*---------------------------------------------------- * end of MULE stuff @@ -222,87 +222,88 @@ typedef unsigned int pg_wchar; * PostgreSQL encoding identifiers * * WARNING: the order of this enum must be same as order of entries - * in the pg_enc2name_tbl[] array (in mb/encnames.c), and - * in the pg_wchar_table[] array (in mb/wchar.c)! + * in the pg_enc2name_tbl[] array (in mb/encnames.c), and + * in the pg_wchar_table[] array (in mb/wchar.c)! * - * If you add some encoding don't forget to check - * PG_ENCODING_BE_LAST macro. + * If you add some encoding don't forget to check + * PG_ENCODING_BE_LAST macro. * * PG_SQL_ASCII is default encoding and must be = 0. * - * XXX We must avoid renumbering any backend encoding until libpq's major + * XXX We must avoid renumbering any backend encoding until libpq's major * version number is increased beyond 5; it turns out that the backend * encoding IDs are effectively part of libpq's ABI as far as 8.2 initdb and * psql are concerned. 
*/ typedef enum pg_enc { - PG_SQL_ASCII = 0, /* SQL/ASCII */ - PG_EUC_JP, /* EUC for Japanese */ - PG_EUC_CN, /* EUC for Chinese */ - PG_EUC_KR, /* EUC for Korean */ - PG_EUC_TW, /* EUC for Taiwan */ - PG_EUC_JIS_2004, /* EUC-JIS-2004 */ - PG_UTF8, /* Unicode UTF8 */ - PG_MULE_INTERNAL, /* Mule internal code */ - PG_LATIN1, /* ISO-8859-1 Latin 1 */ - PG_LATIN2, /* ISO-8859-2 Latin 2 */ - PG_LATIN3, /* ISO-8859-3 Latin 3 */ - PG_LATIN4, /* ISO-8859-4 Latin 4 */ - PG_LATIN5, /* ISO-8859-9 Latin 5 */ - PG_LATIN6, /* ISO-8859-10 Latin6 */ - PG_LATIN7, /* ISO-8859-13 Latin7 */ - PG_LATIN8, /* ISO-8859-14 Latin8 */ - PG_LATIN9, /* ISO-8859-15 Latin9 */ - PG_LATIN10, /* ISO-8859-16 Latin10 */ - PG_WIN1256, /* windows-1256 */ - PG_WIN1258, /* Windows-1258 */ - PG_WIN866, /* (MS-DOS CP866) */ - PG_WIN874, /* windows-874 */ - PG_KOI8R, /* KOI8-R */ - PG_WIN1251, /* windows-1251 */ - PG_WIN1252, /* windows-1252 */ - PG_ISO_8859_5, /* ISO-8859-5 */ - PG_ISO_8859_6, /* ISO-8859-6 */ - PG_ISO_8859_7, /* ISO-8859-7 */ - PG_ISO_8859_8, /* ISO-8859-8 */ - PG_WIN1250, /* windows-1250 */ - PG_WIN1253, /* windows-1253 */ - PG_WIN1254, /* windows-1254 */ - PG_WIN1255, /* windows-1255 */ - PG_WIN1257, /* windows-1257 */ - PG_KOI8U, /* KOI8-U */ - /* PG_ENCODING_BE_LAST points to the above entry */ - - /* followings are for client encoding only */ - PG_SJIS, /* Shift JIS (Windows-932) */ - PG_BIG5, /* Big5 (Windows-950) */ - PG_GBK, /* GBK (Windows-936) */ - PG_UHC, /* UHC (Windows-949) */ - PG_GB18030, /* GB18030 */ - PG_JOHAB, /* EUC for Korean JOHAB */ - PG_SHIFT_JIS_2004, /* Shift-JIS-2004 */ - _PG_LAST_ENCODING_ /* mark only */ + PG_SQL_ASCII = 0, /* SQL/ASCII */ + PG_EUC_JP, /* EUC for Japanese */ + PG_EUC_CN, /* EUC for Chinese */ + PG_EUC_KR, /* EUC for Korean */ + PG_EUC_TW, /* EUC for Taiwan */ + PG_EUC_JIS_2004, /* EUC-JIS-2004 */ + PG_UTF8, /* Unicode UTF8 */ + PG_MULE_INTERNAL, /* Mule internal code */ + PG_LATIN1, /* ISO-8859-1 Latin 1 */ + PG_LATIN2, /* ISO-8859-2 Latin 2 */ + PG_LATIN3, /* ISO-8859-3 Latin 3 */ + PG_LATIN4, /* ISO-8859-4 Latin 4 */ + PG_LATIN5, /* ISO-8859-9 Latin 5 */ + PG_LATIN6, /* ISO-8859-10 Latin6 */ + PG_LATIN7, /* ISO-8859-13 Latin7 */ + PG_LATIN8, /* ISO-8859-14 Latin8 */ + PG_LATIN9, /* ISO-8859-15 Latin9 */ + PG_LATIN10, /* ISO-8859-16 Latin10 */ + PG_WIN1256, /* windows-1256 */ + PG_WIN1258, /* Windows-1258 */ + PG_WIN866, /* (MS-DOS CP866) */ + PG_WIN874, /* windows-874 */ + PG_KOI8R, /* KOI8-R */ + PG_WIN1251, /* windows-1251 */ + PG_WIN1252, /* windows-1252 */ + PG_ISO_8859_5, /* ISO-8859-5 */ + PG_ISO_8859_6, /* ISO-8859-6 */ + PG_ISO_8859_7, /* ISO-8859-7 */ + PG_ISO_8859_8, /* ISO-8859-8 */ + PG_WIN1250, /* windows-1250 */ + PG_WIN1253, /* windows-1253 */ + PG_WIN1254, /* windows-1254 */ + PG_WIN1255, /* windows-1255 */ + PG_WIN1257, /* windows-1257 */ + PG_KOI8U, /* KOI8-U */ + /* PG_ENCODING_BE_LAST points to the above entry */ + + /* followings are for client encoding only */ + PG_SJIS, /* Shift JIS (Windows-932) */ + PG_BIG5, /* Big5 (Windows-950) */ + PG_GBK, /* GBK (Windows-936) */ + PG_UHC, /* UHC (Windows-949) */ + PG_GB18030, /* GB18030 */ + PG_JOHAB, /* EUC for Korean JOHAB */ + PG_SHIFT_JIS_2004, /* Shift-JIS-2004 */ + _PG_LAST_ENCODING_ /* mark only */ } pg_enc; #define PG_ENCODING_BE_LAST PG_KOI8U +#define PG_SERVER_ENCODING_BE_LAST PG_GB18030 /* * Please use these tests before access to pg_encconv_tbl[] * or to other places... 
*/ #define PG_VALID_BE_ENCODING(_enc) \ - ((_enc) >= 0 && (_enc) <= PG_ENCODING_BE_LAST) + (((_enc) >= 0 && (_enc) <= PG_ENCODING_BE_LAST) || (_enc) == PG_GBK || (_enc) == PG_GB18030) #define PG_ENCODING_IS_CLIENT_ONLY(_enc) \ - ((_enc) > PG_ENCODING_BE_LAST && (_enc) < _PG_LAST_ENCODING_) + ((_enc) > PG_ENCODING_BE_LAST && (_enc) < _PG_LAST_ENCODING_ && (_enc) != PG_GBK && (_enc) != PG_GB18030) #define PG_VALID_ENCODING(_enc) \ - ((_enc) >= 0 && (_enc) < _PG_LAST_ENCODING_) + ((_enc) >= 0 && (_enc) < _PG_LAST_ENCODING_) /* On FE are possible all encodings */ -#define PG_VALID_FE_ENCODING(_enc) PG_VALID_ENCODING(_enc) +#define PG_VALID_FE_ENCODING(_enc) PG_VALID_ENCODING(_enc) /* * Table for mapping an encoding number to official encoding name and @@ -310,14 +311,14 @@ typedef enum pg_enc * before accessing a table entry! * * if (PG_VALID_ENCODING(encoding)) - * pg_enc2name_tbl[ encoding ]; + * pg_enc2name_tbl[ encoding ]; */ typedef struct pg_enc2name { - const char *name; - pg_enc encoding; + const char *name; + pg_enc encoding; #ifdef WIN32 - unsigned codepage; /* codepage for WIN32 */ + unsigned codepage; /* codepage for WIN32 */ #endif } pg_enc2name; @@ -328,8 +329,8 @@ extern const pg_enc2name pg_enc2name_tbl[]; */ typedef struct pg_enc2gettext { - pg_enc encoding; - const char *name; + pg_enc encoding; + const char *name; } pg_enc2gettext; extern const pg_enc2gettext pg_enc2gettext_tbl[]; @@ -344,12 +345,12 @@ extern const char *get_encoding_name_for_icu(int encoding); * pg_wchar stuff */ typedef int (*mb2wchar_with_len_converter) (const unsigned char *from, - pg_wchar *to, - int len); + pg_wchar *to, + int len); typedef int (*wchar2mb_with_len_converter) (const pg_wchar *from, - unsigned char *to, - int len); + unsigned char *to, + int len); typedef int (*mblen_converter) (const unsigned char *mbstr); @@ -361,14 +362,14 @@ typedef int (*mbverifier) (const unsigned char *mbstr, int len); typedef struct { - mb2wchar_with_len_converter mb2wchar_with_len; /* convert a multibyte - * string to a wchar */ - wchar2mb_with_len_converter wchar2mb_with_len; /* convert a wchar string - * to a multibyte */ - mblen_converter mblen; /* get byte length of a char */ - mbdisplaylen_converter dsplen; /* get display width of a char */ - mbverifier mbverify; /* verify multibyte sequence */ - int maxmblen; /* max bytes for a char in this encoding */ + mb2wchar_with_len_converter mb2wchar_with_len; /* convert a multibyte + * string to a wchar */ + wchar2mb_with_len_converter wchar2mb_with_len; /* convert a wchar string + * to a multibyte */ + mblen_converter mblen; /* get byte length of a char */ + mbdisplaylen_converter dsplen; /* get display width of a char */ + mbverifier mbverify; /* verify multibyte sequence */ + int maxmblen; /* max bytes for a char in this encoding */ } pg_wchar_tbl; extern const pg_wchar_tbl pg_wchar_table[]; @@ -384,8 +385,8 @@ extern const pg_wchar_tbl pg_wchar_table[]; * * 1. Using a radix tree, from source to destination code. * 2. Using a sorted array of source -> destination code pairs. This - * method is used for "combining" characters. There are so few of - * them that building a radix tree would be wasteful. + * method is used for "combining" characters. There are so few of + * them that building a radix tree would be wasteful. * 3. Using a conversion function. */ @@ -415,44 +416,44 @@ extern const pg_wchar_tbl pg_wchar_table[]; */ typedef struct { - /* - * Array containing all the values. 
Only one of chars16 or chars32 is - * used, depending on how wide the values we need to represent are. - */ - const uint16 *chars16; - const uint32 *chars32; - - /* Radix tree for 1-byte inputs */ - uint32 b1root; /* offset of table in the chars[16|32] array */ - uint8 b1_lower; /* min allowed value for a single byte input */ - uint8 b1_upper; /* max allowed value for a single byte input */ - - /* Radix tree for 2-byte inputs */ - uint32 b2root; /* offset of 1st byte's table */ - uint8 b2_1_lower; /* min/max allowed value for 1st input byte */ - uint8 b2_1_upper; - uint8 b2_2_lower; /* min/max allowed value for 2nd input byte */ - uint8 b2_2_upper; - - /* Radix tree for 3-byte inputs */ - uint32 b3root; /* offset of 1st byte's table */ - uint8 b3_1_lower; /* min/max allowed value for 1st input byte */ - uint8 b3_1_upper; - uint8 b3_2_lower; /* min/max allowed value for 2nd input byte */ - uint8 b3_2_upper; - uint8 b3_3_lower; /* min/max allowed value for 3rd input byte */ - uint8 b3_3_upper; - - /* Radix tree for 4-byte inputs */ - uint32 b4root; /* offset of 1st byte's table */ - uint8 b4_1_lower; /* min/max allowed value for 1st input byte */ - uint8 b4_1_upper; - uint8 b4_2_lower; /* min/max allowed value for 2nd input byte */ - uint8 b4_2_upper; - uint8 b4_3_lower; /* min/max allowed value for 3rd input byte */ - uint8 b4_3_upper; - uint8 b4_4_lower; /* min/max allowed value for 4th input byte */ - uint8 b4_4_upper; + /* + * Array containing all the values. Only one of chars16 or chars32 is + * used, depending on how wide the values we need to represent are. + */ + const uint16 *chars16; + const uint32 *chars32; + + /* Radix tree for 1-byte inputs */ + uint32 b1root; /* offset of table in the chars[16|32] array */ + uint8 b1_lower; /* min allowed value for a single byte input */ + uint8 b1_upper; /* max allowed value for a single byte input */ + + /* Radix tree for 2-byte inputs */ + uint32 b2root; /* offset of 1st byte's table */ + uint8 b2_1_lower; /* min/max allowed value for 1st input byte */ + uint8 b2_1_upper; + uint8 b2_2_lower; /* min/max allowed value for 2nd input byte */ + uint8 b2_2_upper; + + /* Radix tree for 3-byte inputs */ + uint32 b3root; /* offset of 1st byte's table */ + uint8 b3_1_lower; /* min/max allowed value for 1st input byte */ + uint8 b3_1_upper; + uint8 b3_2_lower; /* min/max allowed value for 2nd input byte */ + uint8 b3_2_upper; + uint8 b3_3_lower; /* min/max allowed value for 3rd input byte */ + uint8 b3_3_upper; + + /* Radix tree for 4-byte inputs */ + uint32 b4root; /* offset of 1st byte's table */ + uint8 b4_1_lower; /* min/max allowed value for 1st input byte */ + uint8 b4_1_upper; + uint8 b4_2_lower; /* min/max allowed value for 2nd input byte */ + uint8 b4_2_upper; + uint8 b4_3_lower; /* min/max allowed value for 3rd input byte */ + uint8 b4_3_upper; + uint8 b4_4_lower; /* min/max allowed value for 4th input byte */ + uint8 b4_4_upper; } pg_mb_radix_tree; @@ -461,9 +462,9 @@ typedef struct */ typedef struct { - uint32 utf1; /* UTF-8 code 1 */ - uint32 utf2; /* UTF-8 code 2 */ - uint32 code; /* local code */ + uint32 utf1; /* UTF-8 code 1 */ + uint32 utf2; /* UTF-8 code 2 */ + uint32 code; /* local code */ } pg_utf_to_local_combined; /* @@ -471,9 +472,9 @@ typedef struct */ typedef struct { - uint32 code; /* local code */ - uint32 utf1; /* UTF-8 code 1 */ - uint32 utf2; /* UTF-8 code 2 */ + uint32 code; /* local code */ + uint32 utf1; /* UTF-8 code 1 */ + uint32 utf2; /* UTF-8 code 2 */ } pg_local_to_utf_combined; /* @@ -490,79 +491,79 @@ typedef 
uint32 (*utf_local_conversion_func) (uint32 code); * used by frontends.) */ #define CHECK_ENCODING_CONVERSION_ARGS(srcencoding,destencoding) \ - check_encoding_conversion_args(PG_GETARG_INT32(0), \ - PG_GETARG_INT32(1), \ - PG_GETARG_INT32(4), \ - (srcencoding), \ - (destencoding)) + check_encoding_conversion_args(PG_GETARG_INT32(0), \ + PG_GETARG_INT32(1), \ + PG_GETARG_INT32(4), \ + (srcencoding), \ + (destencoding)) /* * These functions are considered part of libpq's exported API and * are also declared in libpq-fe.h. */ -extern int pg_char_to_encoding(const char *name); +extern int pg_char_to_encoding(const char *name); extern const char *pg_encoding_to_char(int encoding); -extern int pg_valid_server_encoding_id(int encoding); +extern int pg_valid_server_encoding_id(int encoding); /* * Remaining functions are not considered part of libpq's API, though many * of them do exist inside libpq. */ -extern int pg_mb2wchar(const char *from, pg_wchar *to); -extern int pg_mb2wchar_with_len(const char *from, pg_wchar *to, int len); +extern int pg_mb2wchar(const char *from, pg_wchar *to); +extern int pg_mb2wchar_with_len(const char *from, pg_wchar *to, int len); extern int pg_encoding_mb2wchar_with_len(int encoding, - const char *from, pg_wchar *to, int len); -extern int pg_wchar2mb(const pg_wchar *from, char *to); -extern int pg_wchar2mb_with_len(const pg_wchar *from, char *to, int len); + const char *from, pg_wchar *to, int len); +extern int pg_wchar2mb(const pg_wchar *from, char *to); +extern int pg_wchar2mb_with_len(const pg_wchar *from, char *to, int len); extern int pg_encoding_wchar2mb_with_len(int encoding, - const pg_wchar *from, char *to, int len); -extern int pg_char_and_wchar_strcmp(const char *s1, const pg_wchar *s2); -extern int pg_wchar_strncmp(const pg_wchar *s1, const pg_wchar *s2, size_t n); -extern int pg_char_and_wchar_strncmp(const char *s1, const pg_wchar *s2, size_t n); + const pg_wchar *from, char *to, int len); +extern int pg_char_and_wchar_strcmp(const char *s1, const pg_wchar *s2); +extern int pg_wchar_strncmp(const pg_wchar *s1, const pg_wchar *s2, size_t n); +extern int pg_char_and_wchar_strncmp(const char *s1, const pg_wchar *s2, size_t n); extern size_t pg_wchar_strlen(const pg_wchar *wstr); -extern int pg_mblen(const char *mbstr); -extern int pg_dsplen(const char *mbstr); -extern int pg_encoding_mblen(int encoding, const char *mbstr); -extern int pg_encoding_dsplen(int encoding, const char *mbstr); -extern int pg_encoding_verifymb(int encoding, const char *mbstr, int len); -extern int pg_mule_mblen(const unsigned char *mbstr); -extern int pg_mic_mblen(const unsigned char *mbstr); -extern int pg_mbstrlen(const char *mbstr); -extern int pg_mbstrlen_with_len(const char *mbstr, int len); -extern int pg_mbcliplen(const char *mbstr, int len, int limit); +extern int pg_mblen(const char *mbstr); +extern int pg_dsplen(const char *mbstr); +extern int pg_encoding_mblen(int encoding, const char *mbstr); +extern int pg_encoding_dsplen(int encoding, const char *mbstr); +extern int pg_encoding_verifymb(int encoding, const char *mbstr, int len); +extern int pg_mule_mblen(const unsigned char *mbstr); +extern int pg_mic_mblen(const unsigned char *mbstr); +extern int pg_mbstrlen(const char *mbstr); +extern int pg_mbstrlen_with_len(const char *mbstr, int len); +extern int pg_mbcliplen(const char *mbstr, int len, int limit); extern int pg_encoding_mbcliplen(int encoding, const char *mbstr, - int len, int limit); -extern int pg_mbcharcliplen(const char *mbstr, int len, int imit); -extern 
int pg_encoding_max_length(int encoding); -extern int pg_database_encoding_max_length(void); + int len, int limit); +extern int pg_mbcharcliplen(const char *mbstr, int len, int imit); +extern int pg_encoding_max_length(int encoding); +extern int pg_database_encoding_max_length(void); extern mbcharacter_incrementer pg_database_encoding_character_incrementer(void); -extern int PrepareClientEncoding(int encoding); -extern int SetClientEncoding(int encoding); +extern int PrepareClientEncoding(int encoding); +extern int SetClientEncoding(int encoding); extern void InitializeClientEncoding(void); -extern int pg_get_client_encoding(void); +extern int pg_get_client_encoding(void); extern const char *pg_get_client_encoding_name(void); extern void SetDatabaseEncoding(int encoding); -extern int GetDatabaseEncoding(void); +extern int GetDatabaseEncoding(void); extern const char *GetDatabaseEncodingName(void); extern void SetMessageEncoding(int encoding); -extern int GetMessageEncoding(void); +extern int GetMessageEncoding(void); #ifdef ENABLE_NLS -extern int pg_bind_textdomain_codeset(const char *domainname); +extern int pg_bind_textdomain_codeset(const char *domainname); #endif -extern int pg_valid_client_encoding(const char *name); -extern int pg_valid_server_encoding(const char *name); +extern int pg_valid_client_encoding(const char *name); +extern int pg_valid_server_encoding(const char *name); extern unsigned char *unicode_to_utf8(pg_wchar c, unsigned char *utf8string); extern pg_wchar utf8_to_unicode(const unsigned char *c); -extern int pg_utf_mblen(const unsigned char *); +extern int pg_utf_mblen(const unsigned char *); extern unsigned char *pg_do_encoding_conversion(unsigned char *src, int len, - int src_encoding, - int dest_encoding); + int src_encoding, + int dest_encoding); extern char *pg_client_to_server(const char *s, int len); extern char *pg_server_to_client(const char *s, int len); @@ -573,48 +574,48 @@ extern unsigned short BIG5toCNS(unsigned short big5, unsigned char *lc); extern unsigned short CNStoBIG5(unsigned short cns, unsigned char lc); extern void UtfToLocal(const unsigned char *utf, int len, - unsigned char *iso, - const pg_mb_radix_tree *map, - const pg_utf_to_local_combined *cmap, int cmapsize, - utf_local_conversion_func conv_func, - int encoding); + unsigned char *iso, + const pg_mb_radix_tree *map, + const pg_utf_to_local_combined *cmap, int cmapsize, + utf_local_conversion_func conv_func, + int encoding); extern void LocalToUtf(const unsigned char *iso, int len, - unsigned char *utf, - const pg_mb_radix_tree *map, - const pg_local_to_utf_combined *cmap, int cmapsize, - utf_local_conversion_func conv_func, - int encoding); + unsigned char *utf, + const pg_mb_radix_tree *map, + const pg_local_to_utf_combined *cmap, int cmapsize, + utf_local_conversion_func conv_func, + int encoding); extern bool pg_verifymbstr(const char *mbstr, int len, bool noError); extern bool pg_verify_mbstr(int encoding, const char *mbstr, int len, - bool noError); + bool noError); extern int pg_verify_mbstr_len(int encoding, const char *mbstr, int len, - bool noError); + bool noError); extern void check_encoding_conversion_args(int src_encoding, - int dest_encoding, - int len, - int expected_src_encoding, - int expected_dest_encoding); + int dest_encoding, + int len, + int expected_src_encoding, + int expected_dest_encoding); extern void report_invalid_encoding(int encoding, const char *mbstr, int len) pg_attribute_noreturn(); extern void report_untranslatable_char(int src_encoding, int 
dest_encoding, - const char *mbstr, int len) pg_attribute_noreturn(); + const char *mbstr, int len) pg_attribute_noreturn(); extern void local2local(const unsigned char *l, unsigned char *p, int len, - int src_encoding, int dest_encoding, const unsigned char *tab); + int src_encoding, int dest_encoding, const unsigned char *tab); extern void pg_ascii2mic(const unsigned char *l, unsigned char *p, int len); extern void pg_mic2ascii(const unsigned char *mic, unsigned char *p, int len); extern void latin2mic(const unsigned char *l, unsigned char *p, int len, - int lc, int encoding); + int lc, int encoding); extern void mic2latin(const unsigned char *mic, unsigned char *p, int len, - int lc, int encoding); + int lc, int encoding); extern void latin2mic_with_table(const unsigned char *l, unsigned char *p, - int len, int lc, int encoding, - const unsigned char *tab); + int len, int lc, int encoding, + const unsigned char *tab); extern void mic2latin_with_table(const unsigned char *mic, unsigned char *p, - int len, int lc, int encoding, - const unsigned char *tab); + int len, int lc, int encoding, + const unsigned char *tab); extern bool pg_utf8_islegal(const unsigned char *source, int length); @@ -622,4 +623,4 @@ extern bool pg_utf8_islegal(const unsigned char *source, int length); extern WCHAR *pgwin32_message_to_UTF16(const char *str, int len, int *utf16len); #endif -#endif /* PG_WCHAR_H */ +#endif /* PG_WCHAR_H */ diff --git a/src/test/regress/expected/rowsecurity_1.out b/src/test/regress/expected/rowsecurity_1.out index 01debacc..237482a1 100644 --- a/src/test/regress/expected/rowsecurity_1.out +++ b/src/test/regress/expected/rowsecurity_1.out @@ -2238,7 +2238,7 @@ CREATE VIEW rls_view AS SELECT * FROM z1 WHERE f_leak(b) order by 1; GRANT SELECT ON rls_view TO regress_rls_bob; -- Query as role that is not owner of view or table. Should return all records. SET SESSION AUTHORIZATION regress_rls_bob; -SELECT * FROM rls_view; +SELECT * FROM rls_view ORDER BY a,b; a | b ---+----- 1 | aba @@ -2259,7 +2259,7 @@ EXPLAIN (COSTS OFF) SELECT * FROM rls_view; -- Query as view/table owner. Should return all records. SET SESSION AUTHORIZATION regress_rls_alice; -SELECT * FROM rls_view order by 1; +SELECT * FROM rls_view ORDER BY a,b; a | b ---+----- 1 | aba @@ -2286,7 +2286,7 @@ GRANT SELECT ON rls_view TO regress_rls_alice; -- Query as role that is not owner of view but is owner of table. -- Should return records based on view owner policies. SET SESSION AUTHORIZATION regress_rls_alice; -SELECT * FROM rls_view; +SELECT * FROM rls_view ORDER BY a,b; a | b ---+----- 2 | bbb @@ -2306,7 +2306,7 @@ EXPLAIN (COSTS OFF) SELECT * FROM rls_view; -- Query as role that is not owner of table but is owner of view. -- Should return records based on view owner policies. SET SESSION AUTHORIZATION regress_rls_bob; -SELECT * FROM rls_view; +SELECT * FROM rls_view ORDER BY a,b; a | b ---+----- 2 | bbb @@ -2332,7 +2332,7 @@ ERROR: permission denied for relation rls_view -- Query as role that is not the owner of the table or view with permissions. 
SET SESSION AUTHORIZATION regress_rls_bob; GRANT SELECT ON rls_view TO regress_rls_carol; -SELECT * FROM rls_view; +SELECT * FROM rls_view ORDER BY a,b; a | b ---+----- 2 | bbb diff --git a/src/test/regress/expected/zhcn_gb18030.out b/src/test/regress/expected/zhcn_gb18030.out new file mode 100644 index 00000000..e330ad5c --- /dev/null +++ b/src/test/regress/expected/zhcn_gb18030.out @@ -0,0 +1,132 @@ +-- +-- gbk +-- +\c db_gbk; +SET client_encoding = gbk; +-- regular expression query +SELECT * FROM tbl_gbk WHERE f1 ~ '^��' ORDER BY f1; + f1 +-------- + ���Ұ� + ��һλ +(2 rows) + +DROP TABLE tbl_gbk; +CREATE TABLE tbl_gbk(f1 varchar(3)); +INSERT INTO tbl_gbk (f1) VALUES ('�˶���'); +INSERT INTO tbl_gbk (f1) VALUES ('�����'); +-- �F is not support by euc_cn, but support on gbk +INSERT INTO tbl_gbk (f1) VALUES ('���F��'); +INSERT INTO tbl_gbk (f1) VALUES ('���Ұ�'); +INSERT INTO tbl_gbk (f1) VALUES ('��һλ'); +INSERT INTO tbl_gbk (f1) VALUES ('����'); +-- error +INSERT INTO tbl_gbk (f1) VALUES ('���Ұ�2'); +ERROR: value too long for type character varying(3) +-- order by +SELECT * FROM tbl_gbk ORDER BY f1; + f1 +-------- + �˶��� + ����� + ���Ұ� + ��һλ + ���� + ���F�� +(6 rows) + +-- regular expression query +SELECT * FROM tbl_gbk WHERE f1 ~ '^��' ORDER BY f1; + f1 +-------- + ���Ұ� + ��һλ +(2 rows) + +-- query encoding length +SELECT OCTET_LENGTH(f1) FROM tbl_gbk ORDER BY f1; + octet_length +-------------- + 6 + 6 + 6 + 6 + 4 + 6 +(6 rows) + +-- +-- gb18030 +-- +\c db_gb18030; +SET client_encoding = gb18030; +-- regular expression query +SELECT * FROM tbl_gb18030 WHERE f1 ~ '^��' ORDER BY f1; + f1 +-------- + ���Ұ� + ��һλ +(2 rows) + +SELECT * FROM tbl_gb18030 WHERE f1 ~ '^�0�0' ORDER BY f1; + f1 +-------- + �0�0�3�3�5�3 +(1 row) + +DROP TABLE tbl_gb18030; +CREATE TABLE tbl_gb18030(f1 varchar(3)); +INSERT INTO tbl_gb18030 (f1) VALUES ('�˶���'); +INSERT INTO tbl_gb18030 (f1) VALUES ('�����'); +-- �F is not support by euc_cn, but support on gb18030 +INSERT INTO tbl_gb18030 (f1) VALUES ('���F��'); +INSERT INTO tbl_gb18030 (f1) VALUES ('���Ұ�'); +INSERT INTO tbl_gb18030 (f1) VALUES ('��һλ'); +INSERT INTO tbl_gb18030 (f1) VALUES ('����'); +-- which not support by gbk, but support on gb18030 +INSERT INTO tbl_gb18030 (f1) VALUES ('�0�0�3�3�5�3'); +-- out of bound error +INSERT INTO tbl_gb18030 (f1) VALUES ('���Ұ�2'); +ERROR: value too long for type character varying(3) +INSERT INTO tbl_gb18030 (f1) VALUES ('�0�0�3�3�5�32'); +ERROR: value too long for type character varying(3) +-- order by +SELECT * FROM tbl_gb18030 ORDER BY f1; + f1 +-------- + �0�0�3�3�5�3 + �˶��� + ����� + ���Ұ� + ��һλ + ���� + ���F�� +(7 rows) + +-- regular expression query +SELECT * FROM tbl_gb18030 WHERE f1 ~ '^��' ORDER BY f1; + f1 +-------- + ���Ұ� + ��һλ +(2 rows) + +SELECT * FROM tbl_gb18030 WHERE f1 ~ '^�0�0' ORDER BY f1; + f1 +-------- + �0�0�3�3�5�3 +(1 row) + +-- query encoding length +SELECT OCTET_LENGTH(f1) FROM tbl_gb18030 ORDER BY f1; + octet_length +-------------- + 12 + 6 + 6 + 6 + 6 + 4 + 6 +(7 rows) + diff --git a/src/test/regress/expected/zhcn_utf8.out b/src/test/regress/expected/zhcn_utf8.out new file mode 100644 index 00000000..a3ecc8e2 --- /dev/null +++ b/src/test/regress/expected/zhcn_utf8.out @@ -0,0 +1,264 @@ +-- +-- gbk +-- +CREATE DATABASE db_gbk template template0 encoding = gbk LC_COLLATE = 'zh_CN.gbk' LC_CTYPE = 'zh_CN.gbk'; +\c db_gbk; +CREATE TABLE tbl_gbk(f1 varchar(3)); +INSERT INTO tbl_gbk (f1) VALUES ('邓东宝'); +INSERT INTO tbl_gbk (f1) VALUES ('李尔王'); +-- 镕 is not support by euc_cn, but support on 
gbk +INSERT INTO tbl_gbk (f1) VALUES ('朱镕非'); +INSERT INTO tbl_gbk (f1) VALUES ('王家坝'); +INSERT INTO tbl_gbk (f1) VALUES ('王一位'); +INSERT INTO tbl_gbk (f1) VALUES ('怡宝'); +-- error +INSERT INTO tbl_gbk (f1) VALUES ('王家坝2'); +ERROR: value too long for type character varying(3) +-- order by +SELECT * FROM tbl_gbk ORDER BY f1; + f1 +-------- + 邓东宝 + 李尔王 + 王家坝 + 王一位 + 怡宝 + 朱镕非 +(6 rows) + +-- regular expression query +SELECT * FROM tbl_gbk WHERE f1 ~ '^王' ORDER BY f1; + f1 +-------- + 王家坝 + 王一位 +(2 rows) + +-- query encoding length +SELECT OCTET_LENGTH(f1) FROM tbl_gbk ORDER BY f1; + octet_length +-------------- + 6 + 6 + 6 + 6 + 4 + 6 +(6 rows) + +-- MATERIALIZED VIEW join +CREATE TABLE T_PERSON(i int, n varchar(32)); +INSERT INTO T_PERSON VALUES (1, '韩梅梅'); +INSERT INTO T_PERSON VALUES (2, '张雷'); +CREATE TABLE T_NICK(id int, name varchar(32)); +INSERT INTO T_NICK VALUES (1, '叶子'); +INSERT INTO T_NICK VALUES (2, '蓝天'); +CREATE MATERIALIZED VIEW T_MATER AS SELECT * FROM T_PERSON WITH NO DATA; +REFRESH MATERIALIZED VIEW T_MATER; +SELECT * FROM T_MATER p JOIN T_NICK n on p.i = n.id order by i; + i | n | id | name +---+--------+----+------ + 1 | 韩梅梅 | 1 | 叶子 + 2 | 张雷 | 2 | 蓝天 +(2 rows) + +SELECT * FROM T_MATER p JOIN T_NICK n on p.i = n.id order by name; + i | n | id | name +---+--------+----+------ + 2 | 张雷 | 2 | 蓝天 + 1 | 韩梅梅 | 1 | 叶子 +(2 rows) + +SELECT * FROM T_MATER p JOIN T_NICK n on p.i = n.id order by n; + i | n | id | name +---+--------+----+------ + 1 | 韩梅梅 | 1 | 叶子 + 2 | 张雷 | 2 | 蓝天 +(2 rows) + +DROP MATERIALIZED VIEW T_MATER; +DROP TABLE T_PERSON; +DROP TABLE T_NICK; +-- +-- gb18030 +-- +CREATE DATABASE db_gb18030 template template0 encoding = gb18030 LC_COLLATE = 'zh_CN.gb18030' LC_CTYPE = 'zh_CN.gb18030'; +\c db_gb18030; +CREATE TABLE tbl_gb18030(f1 varchar(3)); +INSERT INTO tbl_gb18030 (f1) VALUES ('邓东宝'); +INSERT INTO tbl_gb18030 (f1) VALUES ('李尔王'); +-- 镕 is not support by euc_cn, but support on gb18030 +INSERT INTO tbl_gb18030 (f1) VALUES ('朱镕非'); +INSERT INTO tbl_gb18030 (f1) VALUES ('王家坝'); +INSERT INTO tbl_gb18030 (f1) VALUES ('王一位'); +INSERT INTO tbl_gb18030 (f1) VALUES ('怡宝'); +-- which not support by gbk, but support on gb18030 +INSERT INTO tbl_gb18030 (f1) VALUES ('€𣘗𧄧'); +-- out of bound error +INSERT INTO tbl_gb18030 (f1) VALUES ('王家坝2'); +ERROR: value too long for type character varying(3) +INSERT INTO tbl_gb18030 (f1) VALUES ('€𣘗𧄧2'); +ERROR: value too long for type character varying(3) +-- text +CREATE TABLE tbl_text(i int, f1 text); +INSERT INTO tbl_text (f1) VALUES ('邓东宝'); +INSERT INTO tbl_text (f1) VALUES ('李尔王'); +-- 镕 is not support by euc_cn, but support on gb18030 +INSERT INTO tbl_text (f1) VALUES ('朱镕非'); +INSERT INTO tbl_text (f1) VALUES ('王家坝'); +INSERT INTO tbl_text (f1) VALUES ('王一位'); +INSERT INTO tbl_text (f1) VALUES ('怡宝'); +-- which not support by gbk, but support on gb18030 +INSERT INTO tbl_text (f1) VALUES ('€𣘗𧄧'); +SELECT * FROM tbl_text ORDER BY f1; + i | f1 +---+------------ + | \u0080𣘗𧄧 + | 邓东宝 + | 李尔王 + | 王家坝 + | 王一位 + | 怡宝 + | 朱镕非 +(7 rows) + +-- nvarchar2 +CREATE TABLE tbl_nvarchar2(i int, f1 nvarchar2(3) ); +INSERT INTO tbl_nvarchar2 (f1) VALUES ('邓东宝'); +INSERT INTO tbl_nvarchar2 (f1) VALUES ('李尔王'); +-- 镕 is not support by euc_cn, but support on gb18030 +INSERT INTO tbl_nvarchar2 (f1) VALUES ('朱镕非'); +INSERT INTO tbl_nvarchar2 (f1) VALUES ('王家坝'); +INSERT INTO tbl_nvarchar2 (f1) VALUES ('王一位'); +INSERT INTO tbl_nvarchar2 (f1) VALUES ('怡宝'); +-- which not support by gbk, but support on gb18030 +INSERT INTO tbl_nvarchar2 (f1) VALUES ('€𣘗𧄧'); 
+SELECT * FROM tbl_nvarchar2 ORDER BY f1; + i | f1 +---+------------ + | \u0080𣘗𧄧 + | 邓东宝 + | 李尔王 + | 王家坝 + | 王一位 + | 怡宝 + | 朱镕非 +(7 rows) + +-- bpchar +CREATE TABLE tbl_bpchar(i int, f1 bpchar(3) ); +INSERT INTO tbl_bpchar (f1) VALUES ('邓东宝'); +INSERT INTO tbl_bpchar (f1) VALUES ('李尔王'); +-- 镕 is not support by euc_cn, but support on gb18030 +INSERT INTO tbl_bpchar (f1) VALUES ('朱镕非'); +INSERT INTO tbl_bpchar (f1) VALUES ('王家坝'); +INSERT INTO tbl_bpchar (f1) VALUES ('王一位'); +INSERT INTO tbl_bpchar (f1) VALUES ('怡宝'); +-- which not support by gbk, but support on gb18030 +INSERT INTO tbl_bpchar (f1) VALUES ('€𣘗𧄧'); +SELECT * FROM tbl_bpchar ORDER BY f1; + i | f1 +---+------------ + | \u0080𣘗𧄧 + | 邓东宝 + | 李尔王 + | 王家坝 + | 王一位 + | 怡宝 + | 朱镕非 +(7 rows) + +-- char +CREATE TABLE tbl_char(i int, f1 char(3) ); +INSERT INTO tbl_char (f1) VALUES ('邓东宝'); +INSERT INTO tbl_char (f1) VALUES ('李尔王'); +-- 镕 is not support by euc_cn, but support on gb18030 +INSERT INTO tbl_char (f1) VALUES ('朱镕非'); +INSERT INTO tbl_char (f1) VALUES ('王家坝'); +INSERT INTO tbl_char (f1) VALUES ('王家1'); +INSERT INTO tbl_char (f1) VALUES ('王家2'); +INSERT INTO tbl_char (f1) VALUES ('王一位'); +INSERT INTO tbl_char (f1) VALUES ('怡宝'); +-- which not support by gbk, but support on gb18030 +INSERT INTO tbl_char (f1) VALUES ('€𣘗𧄧'); +SELECT * FROM tbl_char ORDER BY f1; + i | f1 +---+------------ + | \u0080𣘗𧄧 + | 邓东宝 + | 李尔王 + | 王家1 + | 王家2 + | 王家坝 + | 王一位 + | 怡宝 + | 朱镕非 +(9 rows) + +-- order by +SELECT * FROM tbl_gb18030 ORDER BY f1; + f1 +------------ + \u0080𣘗𧄧 + 邓东宝 + 李尔王 + 王家坝 + 王一位 + 怡宝 + 朱镕非 +(7 rows) + +-- regular expression query +SELECT * FROM tbl_gb18030 WHERE f1 ~ '^王' ORDER BY f1; + f1 +-------- + 王家坝 + 王一位 +(2 rows) + +-- query encoding length +SELECT OCTET_LENGTH(f1) FROM tbl_gb18030 ORDER BY f1; + octet_length +-------------- + 12 + 6 + 6 + 6 + 6 + 4 + 6 +(7 rows) + +-- MATERIALIZED VIEW join +CREATE TABLE T_PERSON(i int, n varchar(32)); +INSERT INTO T_PERSON VALUES (1, '韩梅梅'); +INSERT INTO T_PERSON VALUES (2, '李雷'); +CREATE TABLE T_NICK(id int, name varchar(32)); +INSERT INTO T_NICK VALUES (1, '叶子'); +INSERT INTO T_NICK VALUES (2, '蓝天'); +CREATE MATERIALIZED VIEW T_MATER AS SELECT * FROM T_PERSON WITH NO DATA; +REFRESH MATERIALIZED VIEW T_MATER; +SELECT * FROM T_NICK n JOIN T_MATER p on n.id=p.i order by i; + id | name | i | n +----+------+---+-------- + 1 | 叶子 | 1 | 韩梅梅 + 2 | 蓝天 | 2 | 李雷 +(2 rows) + +SELECT * FROM T_NICK n JOIN T_MATER p on n.id=p.i order by name; + id | name | i | n +----+------+---+-------- + 2 | 蓝天 | 2 | 李雷 + 1 | 叶子 | 1 | 韩梅梅 +(2 rows) + +SELECT * FROM T_NICK n JOIN T_MATER p on n.id=p.i order by n; + id | name | i | n +----+------+---+-------- + 1 | 叶子 | 1 | 韩梅梅 + 2 | 蓝天 | 2 | 李雷 +(2 rows) + +DROP MATERIALIZED VIEW T_MATER; +DROP TABLE T_PERSON; +DROP TABLE T_NICK; diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule index 4f52d0f8..ebd01715 100644 --- a/src/test/regress/parallel_schedule +++ b/src/test/regress/parallel_schedule @@ -23,10 +23,10 @@ test: tablespace # ---------- # The first group of parallel tests # ---------- -test: boolean char name varchar text int2 int4 int8 oid float4 float8 bit numeric txid uuid enum money rangetypes pg_lsn regproc +test: boolean char name varchar text int2 int4 int8 oid float4 float8 bit numeric txid uuid enum money rangetypes pg_lsn regproc zhcn_utf8 # Depends on things setup during char, varchar and text -test: strings +test: strings zhcn_gb18030 # Depends on int2, int4, int8, float4, float8 test: numerology diff --git 
a/src/test/regress/pg_regress.c b/src/test/regress/pg_regress.c index ec698123..f903bc43 100644 --- a/src/test/regress/pg_regress.c +++ b/src/test/regress/pg_regress.c @@ -871,7 +871,8 @@ set_node_config_file(PGXCNodeTypeNum node) fputs("log_min_messages = log\n", pg_conf); fputs("log_min_error_statement = log\n", pg_conf); - fputs("max_connections = 300\n", pg_conf); + fputs("max_connections = 500\n", pg_conf); + fputs("max_pool_size = 500\n", pg_conf); fputs("max_worker_processes = 256\n", pg_conf); fputs("max_parallel_workers = 256\n", pg_conf); fputs("enable_statistic = on\n", pg_conf); diff --git a/src/test/regress/serial_schedule b/src/test/regress/serial_schedule index 0371de42..04781232 100644 --- a/src/test/regress/serial_schedule +++ b/src/test/regress/serial_schedule @@ -34,6 +34,8 @@ test: rangetypes test: pg_lsn test: regproc test: strings +test: zhcn_utf8 +test: zhcn_gb18030 test: numerology test: point test: lseg diff --git a/src/test/regress/sql/rowsecurity.sql b/src/test/regress/sql/rowsecurity.sql index 3fa55ccc..4ed98e68 100644 --- a/src/test/regress/sql/rowsecurity.sql +++ b/src/test/regress/sql/rowsecurity.sql @@ -887,12 +887,12 @@ GRANT SELECT ON rls_view TO regress_rls_bob; -- Query as role that is not owner of view or table. Should return all records. SET SESSION AUTHORIZATION regress_rls_bob; -SELECT * FROM rls_view; +SELECT * FROM rls_view ORDER BY a,b; EXPLAIN (COSTS OFF) SELECT * FROM rls_view; -- Query as view/table owner. Should return all records. SET SESSION AUTHORIZATION regress_rls_alice; -SELECT * FROM rls_view order by 1; +SELECT * FROM rls_view ORDER BY a,b; EXPLAIN (COSTS OFF) SELECT * FROM rls_view; DROP VIEW rls_view; @@ -904,13 +904,13 @@ GRANT SELECT ON rls_view TO regress_rls_alice; -- Query as role that is not owner of view but is owner of table. -- Should return records based on view owner policies. SET SESSION AUTHORIZATION regress_rls_alice; -SELECT * FROM rls_view; +SELECT * FROM rls_view ORDER BY a,b; EXPLAIN (COSTS OFF) SELECT * FROM rls_view; -- Query as role that is not owner of table but is owner of view. -- Should return records based on view owner policies. SET SESSION AUTHORIZATION regress_rls_bob; -SELECT * FROM rls_view; +SELECT * FROM rls_view ORDER BY a,b; EXPLAIN (COSTS OFF) SELECT * FROM rls_view; -- Query as role that is not the owner of the table or view without permissions. @@ -921,7 +921,7 @@ EXPLAIN (COSTS OFF) SELECT * FROM rls_view; --fail - permission denied. -- Query as role that is not the owner of the table or view with permissions. 
SET SESSION AUTHORIZATION regress_rls_bob; GRANT SELECT ON rls_view TO regress_rls_carol; -SELECT * FROM rls_view; +SELECT * FROM rls_view ORDER BY a,b; EXPLAIN (COSTS OFF) SELECT * FROM rls_view; SET SESSION AUTHORIZATION regress_rls_bob; diff --git a/src/test/regress/sql/zhcn_gb18030.sql b/src/test/regress/sql/zhcn_gb18030.sql new file mode 100644 index 00000000..3846d9a6 --- /dev/null +++ b/src/test/regress/sql/zhcn_gb18030.sql @@ -0,0 +1,65 @@ +-- +-- gbk +-- +\c db_gbk; +SET client_encoding = gbk; + +-- regular expression query +SELECT * FROM tbl_gbk WHERE f1 ~ '^��' ORDER BY f1; + +DROP TABLE tbl_gbk; +CREATE TABLE tbl_gbk(f1 varchar(3)); +INSERT INTO tbl_gbk (f1) VALUES ('�˶���'); +INSERT INTO tbl_gbk (f1) VALUES ('�����'); +-- �F is not support by euc_cn, but support on gbk +INSERT INTO tbl_gbk (f1) VALUES ('���F��'); +INSERT INTO tbl_gbk (f1) VALUES ('���Ұ�'); +INSERT INTO tbl_gbk (f1) VALUES ('��һλ'); +INSERT INTO tbl_gbk (f1) VALUES ('����'); +-- error +INSERT INTO tbl_gbk (f1) VALUES ('���Ұ�2'); + +-- order by +SELECT * FROM tbl_gbk ORDER BY f1; + +-- regular expression query +SELECT * FROM tbl_gbk WHERE f1 ~ '^��' ORDER BY f1; + +-- query encoding length +SELECT OCTET_LENGTH(f1) FROM tbl_gbk ORDER BY f1; + + +-- +-- gb18030 +-- +\c db_gb18030; + +SET client_encoding = gb18030; +-- regular expression query +SELECT * FROM tbl_gb18030 WHERE f1 ~ '^��' ORDER BY f1; +SELECT * FROM tbl_gb18030 WHERE f1 ~ '^�0�0' ORDER BY f1; + +DROP TABLE tbl_gb18030; +CREATE TABLE tbl_gb18030(f1 varchar(3)); +INSERT INTO tbl_gb18030 (f1) VALUES ('�˶���'); +INSERT INTO tbl_gb18030 (f1) VALUES ('�����'); +-- �F is not support by euc_cn, but support on gb18030 +INSERT INTO tbl_gb18030 (f1) VALUES ('���F��'); +INSERT INTO tbl_gb18030 (f1) VALUES ('���Ұ�'); +INSERT INTO tbl_gb18030 (f1) VALUES ('��һλ'); +INSERT INTO tbl_gb18030 (f1) VALUES ('����'); +-- which not support by gbk, but support on gb18030 +INSERT INTO tbl_gb18030 (f1) VALUES ('�0�0�3�3�5�3'); +-- out of bound error +INSERT INTO tbl_gb18030 (f1) VALUES ('���Ұ�2'); +INSERT INTO tbl_gb18030 (f1) VALUES ('�0�0�3�3�5�32'); + +-- order by +SELECT * FROM tbl_gb18030 ORDER BY f1; +-- regular expression query +SELECT * FROM tbl_gb18030 WHERE f1 ~ '^��' ORDER BY f1; +SELECT * FROM tbl_gb18030 WHERE f1 ~ '^�0�0' ORDER BY f1; + +-- query encoding length +SELECT OCTET_LENGTH(f1) FROM tbl_gb18030 ORDER BY f1; + diff --git a/src/test/regress/sql/zhcn_utf8.sql b/src/test/regress/sql/zhcn_utf8.sql new file mode 100644 index 00000000..764647f1 --- /dev/null +++ b/src/test/regress/sql/zhcn_utf8.sql @@ -0,0 +1,140 @@ +-- +-- gbk +-- +CREATE DATABASE db_gbk template template0 encoding = gbk LC_COLLATE = 'zh_CN.gbk' LC_CTYPE = 'zh_CN.gbk'; +\c db_gbk; + +CREATE TABLE tbl_gbk(f1 varchar(3)); +INSERT INTO tbl_gbk (f1) VALUES ('邓东宝'); +INSERT INTO tbl_gbk (f1) VALUES ('李尔王'); +-- 镕 is not support by euc_cn, but support on gbk +INSERT INTO tbl_gbk (f1) VALUES ('朱镕非'); +INSERT INTO tbl_gbk (f1) VALUES ('王家坝'); +INSERT INTO tbl_gbk (f1) VALUES ('王一位'); +INSERT INTO tbl_gbk (f1) VALUES ('怡宝'); +-- error +INSERT INTO tbl_gbk (f1) VALUES ('王家坝2'); + +-- order by +SELECT * FROM tbl_gbk ORDER BY f1; + +-- regular expression query +SELECT * FROM tbl_gbk WHERE f1 ~ '^王' ORDER BY f1; + +-- query encoding length +SELECT OCTET_LENGTH(f1) FROM tbl_gbk ORDER BY f1; + +-- MATERIALIZED VIEW join +CREATE TABLE T_PERSON(i int, n varchar(32)); +INSERT INTO T_PERSON VALUES (1, '韩梅梅'); +INSERT INTO T_PERSON VALUES (2, '张雷'); +CREATE TABLE T_NICK(id int, name varchar(32)); +INSERT INTO 
T_NICK VALUES (1, '叶子'); +INSERT INTO T_NICK VALUES (2, '蓝天'); +CREATE MATERIALIZED VIEW T_MATER AS SELECT * FROM T_PERSON WITH NO DATA; +REFRESH MATERIALIZED VIEW T_MATER; +SELECT * FROM T_MATER p JOIN T_NICK n on p.i = n.id order by i; +SELECT * FROM T_MATER p JOIN T_NICK n on p.i = n.id order by name; +SELECT * FROM T_MATER p JOIN T_NICK n on p.i = n.id order by n; +DROP MATERIALIZED VIEW T_MATER; +DROP TABLE T_PERSON; +DROP TABLE T_NICK; + +-- +-- gb18030 +-- +CREATE DATABASE db_gb18030 template template0 encoding = gb18030 LC_COLLATE = 'zh_CN.gb18030' LC_CTYPE = 'zh_CN.gb18030'; +\c db_gb18030; + +CREATE TABLE tbl_gb18030(f1 varchar(3)); +INSERT INTO tbl_gb18030 (f1) VALUES ('邓东宝'); +INSERT INTO tbl_gb18030 (f1) VALUES ('李尔王'); +-- 镕 is not support by euc_cn, but support on gb18030 +INSERT INTO tbl_gb18030 (f1) VALUES ('朱镕非'); +INSERT INTO tbl_gb18030 (f1) VALUES ('王家坝'); +INSERT INTO tbl_gb18030 (f1) VALUES ('王一位'); +INSERT INTO tbl_gb18030 (f1) VALUES ('怡宝'); +-- which not support by gbk, but support on gb18030 +INSERT INTO tbl_gb18030 (f1) VALUES ('€𣘗𧄧'); +-- out of bound error +INSERT INTO tbl_gb18030 (f1) VALUES ('王家坝2'); +INSERT INTO tbl_gb18030 (f1) VALUES ('€𣘗𧄧2'); + +-- text +CREATE TABLE tbl_text(i int, f1 text); +INSERT INTO tbl_text (f1) VALUES ('邓东宝'); +INSERT INTO tbl_text (f1) VALUES ('李尔王'); +-- 镕 is not support by euc_cn, but support on gb18030 +INSERT INTO tbl_text (f1) VALUES ('朱镕非'); +INSERT INTO tbl_text (f1) VALUES ('王家坝'); +INSERT INTO tbl_text (f1) VALUES ('王一位'); +INSERT INTO tbl_text (f1) VALUES ('怡宝'); +-- which not support by gbk, but support on gb18030 +INSERT INTO tbl_text (f1) VALUES ('€𣘗𧄧'); +SELECT * FROM tbl_text ORDER BY f1; + +-- nvarchar2 +CREATE TABLE tbl_nvarchar2(i int, f1 nvarchar2(3) ); +INSERT INTO tbl_nvarchar2 (f1) VALUES ('邓东宝'); +INSERT INTO tbl_nvarchar2 (f1) VALUES ('李尔王'); +-- 镕 is not support by euc_cn, but support on gb18030 +INSERT INTO tbl_nvarchar2 (f1) VALUES ('朱镕非'); +INSERT INTO tbl_nvarchar2 (f1) VALUES ('王家坝'); +INSERT INTO tbl_nvarchar2 (f1) VALUES ('王一位'); +INSERT INTO tbl_nvarchar2 (f1) VALUES ('怡宝'); +-- which not support by gbk, but support on gb18030 +INSERT INTO tbl_nvarchar2 (f1) VALUES ('€𣘗𧄧'); +SELECT * FROM tbl_nvarchar2 ORDER BY f1; + +-- bpchar +CREATE TABLE tbl_bpchar(i int, f1 bpchar(3) ); +INSERT INTO tbl_bpchar (f1) VALUES ('邓东宝'); +INSERT INTO tbl_bpchar (f1) VALUES ('李尔王'); +-- 镕 is not support by euc_cn, but support on gb18030 +INSERT INTO tbl_bpchar (f1) VALUES ('朱镕非'); +INSERT INTO tbl_bpchar (f1) VALUES ('王家坝'); +INSERT INTO tbl_bpchar (f1) VALUES ('王一位'); +INSERT INTO tbl_bpchar (f1) VALUES ('怡宝'); +-- which not support by gbk, but support on gb18030 +INSERT INTO tbl_bpchar (f1) VALUES ('€𣘗𧄧'); +SELECT * FROM tbl_bpchar ORDER BY f1; + +-- char +CREATE TABLE tbl_char(i int, f1 char(3) ); +INSERT INTO tbl_char (f1) VALUES ('邓东宝'); +INSERT INTO tbl_char (f1) VALUES ('李尔王'); +-- 镕 is not support by euc_cn, but support on gb18030 +INSERT INTO tbl_char (f1) VALUES ('朱镕非'); +INSERT INTO tbl_char (f1) VALUES ('王家坝'); +INSERT INTO tbl_char (f1) VALUES ('王家1'); +INSERT INTO tbl_char (f1) VALUES ('王家2'); +INSERT INTO tbl_char (f1) VALUES ('王一位'); +INSERT INTO tbl_char (f1) VALUES ('怡宝'); +-- which not support by gbk, but support on gb18030 +INSERT INTO tbl_char (f1) VALUES ('€𣘗𧄧'); +SELECT * FROM tbl_char ORDER BY f1; + +-- order by +SELECT * FROM tbl_gb18030 ORDER BY f1; + +-- regular expression query +SELECT * FROM tbl_gb18030 WHERE f1 ~ '^王' ORDER BY f1; + +-- query encoding length +SELECT OCTET_LENGTH(f1) 
FROM tbl_gb18030 ORDER BY f1; + +-- MATERIALIZED VIEW join +CREATE TABLE T_PERSON(i int, n varchar(32)); +INSERT INTO T_PERSON VALUES (1, '韩梅梅'); +INSERT INTO T_PERSON VALUES (2, '李雷'); +CREATE TABLE T_NICK(id int, name varchar(32)); +INSERT INTO T_NICK VALUES (1, '叶子'); +INSERT INTO T_NICK VALUES (2, '蓝天'); +CREATE MATERIALIZED VIEW T_MATER AS SELECT * FROM T_PERSON WITH NO DATA; +REFRESH MATERIALIZED VIEW T_MATER; +SELECT * FROM T_NICK n JOIN T_MATER p on n.id=p.i order by i; +SELECT * FROM T_NICK n JOIN T_MATER p on n.id=p.i order by name; +SELECT * FROM T_NICK n JOIN T_MATER p on n.id=p.i order by n; +DROP MATERIALIZED VIEW T_MATER; +DROP TABLE T_PERSON; +DROP TABLE T_NICK; From e6f7cc721ebf602112ba3bab9fc05df09dfd4904 Mon Sep 17 00:00:00 2001 From: andrelin Date: Sat, 6 Feb 2021 12:54:03 +0800 Subject: [PATCH 127/578] Replicated distribution support composite type --- src/backend/access/common/printtup.c | 38 ++++++++++++++++++++++++++++ src/test/regress/expected/rules.out | 13 ++++------ 2 files changed, 43 insertions(+), 8 deletions(-) diff --git a/src/backend/access/common/printtup.c b/src/backend/access/common/printtup.c index c7f180a5..a9b0b09b 100644 --- a/src/backend/access/common/printtup.c +++ b/src/backend/access/common/printtup.c @@ -29,9 +29,12 @@ #include "miscadmin.h" #ifdef __TBASE__ +#include "access/htup_details.h" +#include "catalog/pg_type.h" #include "postmaster/postmaster.h" #include "pgxc/squeue.h" #include "executor/executor.h" +#include "utils/typcache.h" extern bool IsAbortedTransactionBlockState(void); #endif static void printtup_startup(DestReceiver *self, int operation, @@ -444,6 +447,41 @@ printtup(TupleTableSlot *slot, DestReceiver *self) } else { +#ifdef __TBASE__ + if (slot->tts_tupleDescriptor->attrs[i]->atttypid == RECORDOID && self->mydest == DestRemoteExecute) + { + Oid tupType; + int32 tupTypmod; + TupleDesc tupdesc; + uint32 n32; + StringInfoData tupdesc_data; + HeapTupleHeader rec; + /* RECORD must be varlena */ + Datum attr_detoast = PointerGetDatum(PG_DETOAST_DATUM(slot->tts_values[i])); + + rec = DatumGetHeapTupleHeader(attr_detoast); + + initStringInfo(&tupdesc_data); + + /* Extract type info from the tuple itself */ + tupType = HeapTupleHeaderGetTypeId(rec); + tupTypmod = HeapTupleHeaderGetTypMod(rec); + tupdesc = lookup_rowtype_tupdesc(tupType, tupTypmod); + + /* -2 to indicate this is composite type */ + n32 = htonl(-2); + appendBinaryStringInfo(&buf, (char *) &n32, 4); + + FormRowDescriptionMessage(tupdesc, NULL, NULL, &tupdesc_data); + ReleaseTupleDesc(tupdesc); + n32 = htonl(tupdesc_data.len); + /* write rowDesctiption */ + appendBinaryStringInfo(&buf, (char *) &n32, 4); + appendBinaryStringInfo(&buf, tupdesc_data.data, tupdesc_data.len); + + pfree(tupdesc_data.data); + } +#endif int len = strlen(outputstr); pq_sendint(&buf, len, 4); appendBinaryStringInfo(&buf, outputstr, len); diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out index 0d96dff4..89552269 100644 --- a/src/test/regress/expected/rules.out +++ b/src/test/regress/expected/rules.out @@ -2667,19 +2667,16 @@ select * from id_ordered order by id; (6 rows) update id_ordered set name = 'update 2' where id = 2; -ERROR: input of anonymous composite types is not implemented update id_ordered set name = 'update 4' where id = 4; -ERROR: input of anonymous composite types is not implemented update id_ordered set name = 'update 5' where id = 5; -ERROR: input of anonymous composite types is not implemented select * from id_ordered order by id; - id | 
name -----+-------- + id | name +----+---------- 1 | Test 1 - 2 | Test 2 + 2 | update 2 3 | Test 3 - 4 | Test 4 - 5 | Test 5 + 4 | update 4 + 5 | update 5 6 | Test 6 (6 rows) From 328456a3581c9c38a5f087ac79c0290bd91bdc53 Mon Sep 17 00:00:00 2001 From: andrelin Date: Sat, 6 Feb 2021 13:33:35 +0800 Subject: [PATCH 128/578] cover regress expectation about cost changes --- src/test/regress/expected/create_view.out | 49 +++++++++++++++++ src/test/regress/expected/xc_FQS_join_1.out | 58 ++++++++++----------- 2 files changed, 78 insertions(+), 29 deletions(-) diff --git a/src/test/regress/expected/create_view.out b/src/test/regress/expected/create_view.out index 56e73b4e..57376793 100644 --- a/src/test/regress/expected/create_view.out +++ b/src/test/regress/expected/create_view.out @@ -38,6 +38,55 @@ SELECT * FROM viewtest ORDER BY a; CREATE OR REPLACE VIEW viewtest AS SELECT a, b FROM viewtest_tbl WHERE a > 5 ORDER BY b DESC; +EXPLAIN SELECT * FROM viewtest; + QUERY PLAN +------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) (cost=22.23..22.79 rows=225 width=8) + -> Sort (cost=22.23..22.79 rows=225 width=8) + Sort Key: viewtest_tbl.b DESC + -> Seq Scan on viewtest_tbl (cost=0.00..13.44 rows=225 width=8) + Filter: (a > 5) +(5 rows) + +SELECT * FROM viewtest; + a | b +----+---- + 20 | 25 + 15 | 20 + 10 | 15 +(3 rows) + +EXPLAIN SELECT a FROM viewtest; + QUERY PLAN +------------------------------------------------------------------------------------------------- + Subquery Scan on viewtest (cost=22.23..25.04 rows=225 width=4) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=22.23..22.79 rows=225 width=8) + -> Sort (cost=22.23..22.79 rows=225 width=8) + Sort Key: viewtest_tbl.b DESC + -> Seq Scan on viewtest_tbl (cost=0.00..13.44 rows=225 width=8) + Filter: (a > 5) +(6 rows) + +SELECT a FROM viewtest; + a +---- + 20 + 15 + 10 +(3 rows) + +EXPLAIN SELECT * FROM viewtest ORDER BY a; + QUERY PLAN +------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) (cost=33.83..34.39 rows=225 width=8) + -> Sort (cost=33.83..34.39 rows=225 width=8) + Sort Key: viewtest_tbl.a + -> Sort (cost=22.23..22.79 rows=225 width=8) + Sort Key: viewtest_tbl.b DESC + -> Seq Scan on viewtest_tbl (cost=0.00..13.44 rows=225 width=8) + Filter: (a > 5) +(7 rows) + SELECT * FROM viewtest ORDER BY a; a | b ----+---- diff --git a/src/test/regress/expected/xc_FQS_join_1.out b/src/test/regress/expected/xc_FQS_join_1.out index 6cfb1dda..57ff7524 100644 --- a/src/test/regress/expected/xc_FQS_join_1.out +++ b/src/test/regress/expected/xc_FQS_join_1.out @@ -390,19 +390,19 @@ explain (verbose on, nodes off, costs off) select * from tab1_mod natural join t ---------------------------------------------------------------------------------- Hash Join Output: tab1_mod.val, tab1_mod.val2 - Hash Cond: ((tab4_rep.val = tab1_mod.val) AND (tab4_rep.val2 = tab1_mod.val2)) - -> Remote Subquery Scan on all - Output: tab4_rep.val, tab4_rep.val2 - -> Seq Scan on public.tab4_rep - Output: tab4_rep.val, tab4_rep.val2 - Filter: (tab4_rep.val < 4) - -> Hash - Output: tab1_mod.val, tab1_mod.val2 + Hash Cond: ((tab1_mod.val = tab4_rep.val) AND (tab1_mod.val2 = tab4_rep.val2)) -> Remote Subquery Scan on all Output: tab1_mod.val, tab1_mod.val2 -> Seq Scan on public.tab1_mod Output: tab1_mod.val, tab1_mod.val2 Filter: (tab1_mod.val > 2) + -> Hash + Output: tab4_rep.val, 
tab4_rep.val2 + -> Remote Subquery Scan on all + Output: tab4_rep.val, tab4_rep.val2 + -> Seq Scan on public.tab4_rep + Output: tab4_rep.val, tab4_rep.val2 + Filter: (tab4_rep.val < 4) (15 rows) -- Join involving two distributed tables, never shipped @@ -425,18 +425,18 @@ explain (verbose on, nodes off, costs off) select * from tab1_mod natural join t Output: tab1_mod.val, tab1_mod.val2 -> Hash Join Output: tab1_mod.val, tab1_mod.val2 - Hash Cond: ((tab2_mod.val = tab1_mod.val) AND (tab2_mod.val2 = tab1_mod.val2)) - -> Seq Scan on public.tab2_mod - Output: tab2_mod.val, tab2_mod.val2 - Filter: (tab2_mod.val < 4) - -> Hash - Output: tab1_mod.val, tab1_mod.val2 - -> Remote Subquery Scan on all - Output: tab1_mod.val, tab1_mod.val2 - Distribute results by M: val + Hash Cond: ((tab1_mod.val = tab2_mod.val) AND (tab1_mod.val2 = tab2_mod.val2)) -> Seq Scan on public.tab1_mod Output: tab1_mod.val, tab1_mod.val2 Filter: (tab1_mod.val > 2) + -> Hash + Output: tab2_mod.val, tab2_mod.val2 + -> Remote Subquery Scan on all + Output: tab2_mod.val, tab2_mod.val2 + Distribute results by M: val + -> Seq Scan on public.tab2_mod + Output: tab2_mod.val, tab2_mod.val2 + Filter: (tab2_mod.val < 4) (16 rows) -- Join involving a distributed table and two replicated tables, such that the @@ -590,17 +590,17 @@ explain (verbose on, nodes off, costs off, num_nodes on) select * from tab1_mod Output: tab1_mod.val, tab1_mod.val2, tab1_mod.val2 Join Filter: (tab1_mod.val2 = tab4_rep.val2) -> Remote Subquery Scan on all - Output: tab4_rep.val, tab4_rep.val2 - -> Seq Scan on public.tab4_rep - Output: tab4_rep.val, tab4_rep.val2 - Filter: (tab4_rep.val = 1) - -> Materialize - Output: tab1_mod.val, tab1_mod.val2 - -> Remote Subquery Scan on all Output: tab1_mod.val, tab1_mod.val2 -> Seq Scan on public.tab1_mod Output: tab1_mod.val, tab1_mod.val2 Filter: (tab1_mod.val = 1) + -> Materialize + Output: tab4_rep.val, tab4_rep.val2 + -> Remote Subquery Scan on all + Output: tab4_rep.val, tab4_rep.val2 + -> Seq Scan on public.tab4_rep + Output: tab4_rep.val, tab4_rep.val2 + Filter: (tab4_rep.val = 1) (18 rows) -- following join between distributed tables should get FQSed because both of @@ -625,16 +625,16 @@ explain (verbose on, nodes off, costs off, num_nodes on) select * from tab1_mod -> Nested Loop Output: tab1_mod.val2, tab1_mod.val, tab2_mod.val, tab1_mod.val Join Filter: (tab1_mod.val2 = tab2_mod.val2) - -> Seq Scan on public.tab2_mod - Output: tab2_mod.val, tab2_mod.val2 - Filter: (tab2_mod.val = 2) - -> Materialize - Output: tab1_mod.val2, tab1_mod.val -> Remote Subquery Scan on all Output: tab1_mod.val2, tab1_mod.val -> Seq Scan on public.tab1_mod Output: tab1_mod.val2, tab1_mod.val Filter: (tab1_mod.val = 1) + -> Materialize + Output: tab2_mod.val, tab2_mod.val2 + -> Seq Scan on public.tab2_mod + Output: tab2_mod.val, tab2_mod.val2 + Filter: (tab2_mod.val = 2) (15 rows) -- JOIN involving the distributed table with equi-JOIN on the distributed column From 7408928cc774deb4bc743de3cbcddf2287a22087 Mon Sep 17 00:00:00 2001 From: andrelin Date: Thu, 15 Apr 2021 15:03:09 +0800 Subject: [PATCH 129/578] Prevent reenter ExecutorEnd during abort This is following PG rule: skip executor shut down during error abort, PGXC code violated it for treating shared queue, this commit fix this tapd: http://tapd.oa.com/20421696/bugtrace/bugs/view?bug_id=1020421696084977249&url_cache_key=25afc0aab46ec661eb190971ad54594d --- src/backend/commands/portalcmds.c | 32 ++++++++++++++++++------------ src/backend/utils/mmgr/portalmem.c | 9 --------- 2 
files changed, 19 insertions(+), 22 deletions(-) diff --git a/src/backend/commands/portalcmds.c b/src/backend/commands/portalcmds.c index e5a87499..4bea0943 100644 --- a/src/backend/commands/portalcmds.c +++ b/src/backend/commands/portalcmds.c @@ -359,8 +359,13 @@ PortalCleanup(Portal portal) #ifdef XCP if (portal->strategy == PORTAL_DISTRIBUTED) { - /* If portal is producing it has an executor which should be - * shut down */ + /* If cleanup fails below prevent double cleanup */ + portal->queryDesc = NULL; + + /* + * If portal is producing it has an executor which should be + * shut down + */ if (queryDesc->myindex == -1) { if (portal->status == PORTAL_FAILED) @@ -370,8 +375,6 @@ PortalCleanup(Portal portal) * producers list. */ removeProducingPortal(portal); - /* If cleanup fails below prevent double cleanup */ - portal->queryDesc = NULL; /* * Inform consumers about failed producer if they are * still waiting @@ -384,28 +387,33 @@ PortalCleanup(Portal portal) { ResourceOwner saveResourceOwner; - /* We must make the portal's resource owner current to - * release resources properly */ + /* + * We must make the portal's resource owner current to + * release resources properly + */ saveResourceOwner = CurrentResourceOwner; PG_TRY(); { + if (portal->resowner) CurrentResourceOwner = portal->resowner; + /* do nothing about executor if portal is failed */ + if (portal->status != PORTAL_FAILED) + { /* Finish executor if it is not yet finished */ if (!queryDesc->estate->es_finished) ExecutorFinish(queryDesc); - /* Destroy executor if not yet destroyed */ - if (queryDesc->estate) ExecutorEnd(queryDesc); - if (portal->status == PORTAL_FAILED) + FreeQueryDesc(queryDesc); + } + else { /* - * If portal if failed we can allow to be blocked + * If portal is failed we can allow to be blocked * here while UnBind is waiting for finishing * consumers. */ if (queryDesc->squeue) SharedQueueUnBind(queryDesc->squeue, true); - FreeQueryDesc(queryDesc); } } PG_CATCH(); @@ -428,8 +436,6 @@ PortalCleanup(Portal portal) PG_TRY(); { CurrentResourceOwner = portal->resowner; - /* Prevent double cleanup in case of error below */ - portal->queryDesc = NULL; /* Reset the squeue if exists */ if (queryDesc->squeue) SharedQueueReset(queryDesc->squeue, queryDesc->myindex); diff --git a/src/backend/utils/mmgr/portalmem.c b/src/backend/utils/mmgr/portalmem.c index deb2b8d6..567737b6 100644 --- a/src/backend/utils/mmgr/portalmem.c +++ b/src/backend/utils/mmgr/portalmem.c @@ -606,15 +606,6 @@ PortalDrop(Portal portal, bool isTopCommit) */ if (portalIsProducing(portal)) return; - - if (portal->queryDesc) - { - ResourceOwner saveResourceOwner = CurrentResourceOwner; - CurrentResourceOwner = portal->resowner; - FreeQueryDesc(portal->queryDesc); - CurrentResourceOwner = saveResourceOwner; - portal->queryDesc = NULL; - } #endif /* From 3ed136704d98edea9e29188d143105d68e9a2bd6 Mon Sep 17 00:00:00 2001 From: andrelin Date: Thu, 15 Apr 2021 15:05:01 +0800 Subject: [PATCH 130/578] Revert "fix http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131084977249 (merge request !126) " This reverts commit 5c0c40bc3b9658ac4282883088749416594f793d. 
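The revert restores the interrupt check at the top of the receive retry loop in
pgxc_node_receive(), so a pending cancel or die request is honored before the
backend blocks in poll() waiting for datanode data. A minimal sketch of the
resulting loop shape (the real code also recomputes the timeout and handles
further errno values):

    retry:
        /* honor a pending cancel/die request before blocking */
        CHECK_FOR_INTERRUPTS();

        poll_val = poll(pool_fd, conn_count, timeout_ms);
        if (poll_val < 0 && (errno == EINTR || errno == EAGAIN))
            goto retry;        /* interrupted by a signal, poll again */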
--- src/backend/pgxc/pool/pgxcnode.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c index 36558205..aa1070be 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -822,6 +822,7 @@ pgxc_node_receive(const int conn_count, } retry: + CHECK_FOR_INTERRUPTS(); poll_val = poll(pool_fd, conn_count, timeout_ms); if (poll_val < 0) { From 88e73c5e43585eda9d2da2197f932a5e482a38cc Mon Sep 17 00:00:00 2001 From: andrelin Date: Fri, 19 Feb 2021 19:49:57 +0800 Subject: [PATCH 131/578] Explain analyze enhancement http://tapd.oa.com/pgxz/prong/stories/view/1010092131862892295 --- src/backend/commands/Makefile | 2 +- src/backend/commands/explain_dist.c | 630 ++++++++++++++++++++++++++++ src/backend/executor/execParallel.c | 13 + src/backend/executor/execProcnode.c | 3 + src/backend/pgxc/pool/execRemote.c | 57 ++- src/backend/pgxc/pool/pgxcnode.c | 10 +- src/backend/tcop/postgres.c | 67 ++- src/backend/tcop/pquery.c | 12 +- src/include/commands/explain_dist.h | 36 ++ src/include/pgxc/execRemote.h | 6 + src/include/pgxc/pgxcnode.h | 2 +- src/include/utils/plancache.h | 1 + src/include/utils/portal.h | 1 + 13 files changed, 808 insertions(+), 32 deletions(-) create mode 100644 src/backend/commands/explain_dist.c create mode 100644 src/include/commands/explain_dist.h diff --git a/src/backend/commands/Makefile b/src/backend/commands/Makefile index 4a6c99e0..663eb71e 100644 --- a/src/backend/commands/Makefile +++ b/src/backend/commands/Makefile @@ -15,7 +15,7 @@ include $(top_builddir)/src/Makefile.global OBJS = amcmds.o aggregatecmds.o alter.o analyze.o async.o cluster.o comment.o \ collationcmds.o constraint.o conversioncmds.o copy.o createas.o \ dbcommands.o define.o discard.o dropcmds.o \ - event_trigger.o explain.o extension.o foreigncmds.o functioncmds.o \ + event_trigger.o explain.o explain_dist.o extension.o foreigncmds.o functioncmds.o \ indexcmds.o lockcmds.o matview.o operatorcmds.o opclasscmds.o \ policy.o portalcmds.o prepare.o proclang.o publicationcmds.o \ schemacmds.o seclabel.o sequence.o statscmds.o subscriptioncmds.o \ diff --git a/src/backend/commands/explain_dist.c b/src/backend/commands/explain_dist.c new file mode 100644 index 00000000..41b7c5a1 --- /dev/null +++ b/src/backend/commands/explain_dist.c @@ -0,0 +1,630 @@ +/*------------------------------------------------------------------------- + * + * explain_dist.c + * This code provides support for distributed explain analyze. 
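+ *
+ *	  Overview: when a query runs with EXPLAIN ANALYZE, the datanodes
+ *	  serialize the Instrumentation of their local planstate trees after
+ *	  execution (SendLocalInstr) and ship it to the coordinator in an 'i'
+ *	  protocol message.  The coordinator stores the received records per
+ *	  plan_node_id in the ResponseCombiner's hash table (HandleRemoteInstr),
+ *	  combining duplicates by taking the maximum across datanodes, and
+ *	  finally copies the combined counters back onto its own planstate tree
+ *	  (AttachRemoteInstr) so EXPLAIN can display them.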
+ * + * Portions Copyright (c) 2020, Tencent TBase-C Group + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/commands/explain_dist.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "commands/explain_dist.h" +#include "executor/hashjoin.h" +#include "libpq/libpq.h" +#include "libpq/pqformat.h" +#include "nodes/nodeFuncs.h" + +/* Read instrument field */ +#define INSTR_READ_FIELD(fldname) \ +do { \ + instr->fldname = strtod(tmp_head, &tmp_pos); \ + tmp_head = tmp_pos + 1; \ +} while(0) + +/* Set max instrument */ +#define INSTR_MAX_FIELD(fldname) \ +do { \ + target->fldname = Max(src->fldname, target->fldname); \ +} while(0) + +/* Serialize state */ +typedef struct +{ + /* ids of plan nodes we've handled */ + Bitmapset *printed_nodes; + /* send str buf */ + StringInfoData buf; +} SerializeState; + +/* + * InstrOut + * + * Serialize Instrumentation structure with the format + * "nodetype-plan_node_id{val,val,...,val}". + * + * NOTE: The function should be modified if the structure of Instrumentation + * or its relevant members has been changed. + */ +static void +InstrOut(StringInfo buf, Plan *plan, Instrumentation *instr) +{ + /* nodeTag for varify */ + appendStringInfo(buf, "%hd-%d{", nodeTag(plan), plan->plan_node_id); + + /* bool */ + /* running should be false after InstrEndLoop */ + appendStringInfo(buf, "%hd,", instr->need_timer); + appendStringInfo(buf, "%hd,", instr->need_bufusage); + appendStringInfo(buf, "%hd,", instr->running); + /* instr_time */ + /* starttime and counter should be 0 after InstrEndLoop */ + appendStringInfo(buf, "%ld,", instr->starttime.tv_sec); + appendStringInfo(buf, "%ld,", instr->starttime.tv_nsec); + appendStringInfo(buf, "%ld,", instr->counter.tv_sec); + appendStringInfo(buf, "%ld,", instr->counter.tv_nsec); + /* double */ + /* firsttuple and tuplecount should be 0 after InstrEndLoop */ + appendStringInfo(buf, "%.0f,", instr->firsttuple); + appendStringInfo(buf, "%.0f,", instr->tuplecount); + /* BufferUsage */ + appendStringInfo(buf, "%ld,", instr->bufusage_start.shared_blks_hit); + appendStringInfo(buf, "%ld,", instr->bufusage_start.shared_blks_read); + appendStringInfo(buf, "%ld,", instr->bufusage_start.shared_blks_dirtied); + appendStringInfo(buf, "%ld,", instr->bufusage_start.shared_blks_written); + appendStringInfo(buf, "%ld,", instr->bufusage_start.local_blks_hit); + appendStringInfo(buf, "%ld,", instr->bufusage_start.local_blks_read); + appendStringInfo(buf, "%ld,", instr->bufusage_start.local_blks_dirtied); + appendStringInfo(buf, "%ld,", instr->bufusage_start.local_blks_written); + appendStringInfo(buf, "%ld,", instr->bufusage_start.temp_blks_read); + appendStringInfo(buf, "%ld,", instr->bufusage_start.temp_blks_written); + appendStringInfo(buf, "%ld,", instr->bufusage_start.blk_read_time.tv_sec); + appendStringInfo(buf, "%ld,", instr->bufusage_start.blk_read_time.tv_nsec); + appendStringInfo(buf, "%ld,", instr->bufusage_start.blk_write_time.tv_sec); + appendStringInfo(buf, "%ld,", instr->bufusage_start.blk_write_time.tv_nsec); + /* double */ + appendStringInfo(buf, "%.10f,", instr->startup); + appendStringInfo(buf, "%.10f,", instr->total); + appendStringInfo(buf, "%.0f,", instr->ntuples); + appendStringInfo(buf, "%.0f,", instr->nloops); + appendStringInfo(buf, "%.0f,", instr->nfiltered1); + appendStringInfo(buf, "%.0f,", 
instr->nfiltered2); + /* BufferUsage */ + appendStringInfo(buf, "%ld,", instr->bufusage.shared_blks_hit); + appendStringInfo(buf, "%ld,", instr->bufusage.shared_blks_read); + appendStringInfo(buf, "%ld,", instr->bufusage.shared_blks_dirtied); + appendStringInfo(buf, "%ld,", instr->bufusage.shared_blks_written); + appendStringInfo(buf, "%ld,", instr->bufusage.local_blks_hit); + appendStringInfo(buf, "%ld,", instr->bufusage.local_blks_read); + appendStringInfo(buf, "%ld,", instr->bufusage.local_blks_dirtied); + appendStringInfo(buf, "%ld,", instr->bufusage.local_blks_written); + appendStringInfo(buf, "%ld,", instr->bufusage.temp_blks_read); + appendStringInfo(buf, "%ld,", instr->bufusage.temp_blks_written); + appendStringInfo(buf, "%ld,", instr->bufusage.blk_read_time.tv_sec); + appendStringInfo(buf, "%ld,", instr->bufusage.blk_read_time.tv_nsec); + appendStringInfo(buf, "%ld,", instr->bufusage.blk_write_time.tv_sec); + appendStringInfo(buf, "%ld}", instr->bufusage.blk_write_time.tv_nsec); + + elog(DEBUG1, "InstrOut: plan_node_id %d, nloops %.0f", plan->plan_node_id, instr->nloops); +} + +/* + * WorkerInstrOut + * + * Serialize worker instrumentation with the format + * "n|val,val,..,val|...|val,val,..,val|". n indicates the worker num, + * and | separates each worker instrumentation. + */ +static void +WorkerInstrOut(StringInfo buf, WorkerInstrumentation *worker_instr) +{ + int n; + + if (worker_instr == NULL) + { + appendStringInfo(buf, "0|"); + return; + } + + appendStringInfo(buf, "%d|", worker_instr->num_workers); + for (n = 0; n < worker_instr->num_workers; n++) + { + Instrumentation *instr = &worker_instr->instrument[n]; + + if (instr->nloops <= 0) + appendStringInfo(buf, "0,0,0,0|"); + else + /* send startup, total, ntuples, loops for now */ + appendStringInfo(buf, "%.10f,%.10f,%.0f,%.0f|", + instr->startup, instr->total, instr->ntuples, instr->nloops); + } +} + +/* + * SpecInstrOut + * + * Serialize specific information in planstate with the format + * "1/0", and 1/0 indicates if values are valid or not. + * + * NOTE: The function should be modified if the corresponding data structure + * has been changed. + * The function is VERY related to show_sort_info, show_hash_info. 
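+ *
+ * Together with InstrOut, each plan node thus contributes a record of the
+ * form "<nodetag>-<plan_node_id>{fld,fld,...,fld}", optionally followed by
+ * the node-specific suffix appended here; for example, a Gather node that
+ * launched 2 workers appends "2>".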
+ */ +static void +SpecInstrOut(StringInfo buf, NodeTag plantag, PlanState *planstate) +{ + switch(plantag) + { + case T_Gather: + { + appendStringInfo(buf, "%d>", + ((GatherState *) planstate)->nworkers_launched); + } + break; + + case T_GatherMerge: + { + appendStringInfo(buf, "%d>", + ((GatherMergeState *) planstate)->nworkers_launched); + } + break; +#if 0 + case T_Sort: + { + /* according to RemoteSortState and show_sort_info */ + SortState *sortstate = castNode(SortState, planstate); + + if (sortstate->sort_Done && sortstate->tuplesortstate) + { + Tuplesortstate *state = (Tuplesortstate *) sortstate->tuplesortstate; + char *sortMethod; + char *spaceType; + long spaceUsed; + + tuplesort_get_stats(state, (const char **) &sortMethod, (const char **) &spaceType, &spaceUsed); + appendStringInfo(buf, "1<%s,%s,%ld>", + sortMethod, spaceType, spaceUsed); + } + else + appendStringInfo(buf, "0>"); + } + break; + + case T_Hash: + { + /* according to RemoteHashState and show_hash_info */ + HashState *hashstate = castNode(HashState, planstate); + HashJoinTable hashtable = hashstate->hashtable; + + if (hashtable) + { + hashtable->nbuckets = 0; + appendStringInfo(buf, "1<%d,%d,%d,%d,%ld>", + hashtable->nbuckets, hashtable->nbuckets_original, + hashtable->nbatch, hashtable->nbatch_original, + (hashtable->spacePeak + 1023) / 1024); + } + else + appendStringInfo(buf, "0>"); + } + break; +#endif + default: + break; + } +} + +/* + * InstrIn + * + * DeSerialize of one Instrumentation. + */ +static void +InstrIn(StringInfo str, RemoteInstr *rinstr) +{ + char *tmp_pos; + char *tmp_head = &str->data[str->cursor]; + Instrumentation *instr = &rinstr->instr; + + if (str->len <= 0) + return; + + /* verify nodetype and plan_node_id */ + rinstr->nodeTag = strtol(tmp_head, &tmp_pos, 0); + tmp_head = tmp_pos + 1; + rinstr->id = (int) strtol(tmp_head, &tmp_pos, 0); + tmp_head = tmp_pos + 1; + + /* read values */ + INSTR_READ_FIELD(need_timer); + INSTR_READ_FIELD(need_bufusage); + INSTR_READ_FIELD(running); + + INSTR_READ_FIELD(starttime.tv_sec); + INSTR_READ_FIELD(starttime.tv_nsec); + INSTR_READ_FIELD(counter.tv_sec); + INSTR_READ_FIELD(counter.tv_nsec); + + INSTR_READ_FIELD(firsttuple); + INSTR_READ_FIELD(tuplecount); + + INSTR_READ_FIELD(bufusage_start.shared_blks_hit); + INSTR_READ_FIELD(bufusage_start.shared_blks_read); + INSTR_READ_FIELD(bufusage_start.shared_blks_dirtied); + INSTR_READ_FIELD(bufusage_start.shared_blks_written); + INSTR_READ_FIELD(bufusage_start.local_blks_hit); + INSTR_READ_FIELD(bufusage_start.local_blks_read); + INSTR_READ_FIELD(bufusage_start.local_blks_dirtied); + INSTR_READ_FIELD(bufusage_start.local_blks_written); + INSTR_READ_FIELD(bufusage_start.temp_blks_read); + INSTR_READ_FIELD(bufusage_start.temp_blks_written); + INSTR_READ_FIELD(bufusage_start.blk_read_time.tv_sec); + INSTR_READ_FIELD(bufusage_start.blk_read_time.tv_nsec); + INSTR_READ_FIELD(bufusage_start.blk_write_time.tv_sec); + INSTR_READ_FIELD(bufusage_start.blk_write_time.tv_nsec); + + INSTR_READ_FIELD(startup); + INSTR_READ_FIELD(total); + INSTR_READ_FIELD(ntuples); + INSTR_READ_FIELD(nloops); + INSTR_READ_FIELD(nfiltered1); + INSTR_READ_FIELD(nfiltered2); + + INSTR_READ_FIELD(bufusage.shared_blks_hit); + INSTR_READ_FIELD(bufusage.shared_blks_read); + INSTR_READ_FIELD(bufusage.shared_blks_dirtied); + INSTR_READ_FIELD(bufusage.shared_blks_written); + INSTR_READ_FIELD(bufusage.local_blks_hit); + INSTR_READ_FIELD(bufusage.local_blks_read); + INSTR_READ_FIELD(bufusage.local_blks_dirtied); + 
INSTR_READ_FIELD(bufusage.local_blks_written); + INSTR_READ_FIELD(bufusage.temp_blks_read); + INSTR_READ_FIELD(bufusage.temp_blks_written); + INSTR_READ_FIELD(bufusage.blk_read_time.tv_sec); + INSTR_READ_FIELD(bufusage.blk_read_time.tv_nsec); + INSTR_READ_FIELD(bufusage.blk_write_time.tv_sec); + INSTR_READ_FIELD(bufusage.blk_write_time.tv_nsec); + + elog(DEBUG1, "InstrIn: plan_node_id %d, nloops %.0f", rinstr->id, instr->nloops); + + /* tmp_head points to next instrument's nodetype or '\0' already */ + str->cursor = tmp_head - &str->data[0]; +} + +/* + * SpecInstrIn + * + * DeSerialize of specific instrument info of current node. + */ +static void +SpecInstrIn(StringInfo str, RemoteInstr *rinstr) +{ + char *tmp_pos; + char *tmp_head = &str->data[str->cursor]; + + switch(rinstr->nodeTag) + { + case T_Gather: + case T_GatherMerge: + { + rinstr->nworkers_launched = (int) strtod(tmp_head, &tmp_pos); + tmp_head = tmp_pos + 1; + } + break; +#if 0 + case T_Sort: + { + RemoteSortState *instr = (RemoteSortState *)palloc0( + sizeof(RemoteSortState)); + /* either stat or w_stat is valid */ + INSTR_READ_FIELD(rs.isvalid); + if (instr->rs.isvalid) + { + INSTR_READ_FIELD(stat.sortMethod); + INSTR_READ_FIELD(stat.spaceType); + INSTR_READ_FIELD(stat.spaceUsed); + } + + INSTR_READ_FIELD(rs.num_workers); + if (instr->rs.num_workers > 0) + { + int n; + Size size; + + size = mul_size(sizeof(TuplesortInstrumentation), + instr->rs.num_workers); + instr->w_stats = (TuplesortInstrumentation *)palloc0(size); + + for (n = 0; n < instr->rs.num_workers; n++) + { + INSTR_READ_FIELD(w_stats[n].sortMethod); + if (instr->w_stats[n].sortMethod != SORT_TYPE_STILL_IN_PROGRESS) + { + INSTR_READ_FIELD(w_stats[n].spaceType); + INSTR_READ_FIELD(w_stats[n].spaceUsed); + } + } + } + remote_instr->state = (RemoteState *) instr; + } + break; + + case T_Hash: + { + RemoteHashState *instr = (RemoteHashState *)palloc0( + sizeof(RemoteHashState)); + INSTR_READ_FIELD(rs.isvalid); + if (instr->rs.isvalid) + { + INSTR_READ_FIELD(nbuckets); + INSTR_READ_FIELD(nbuckets_original); + INSTR_READ_FIELD(nbatch); + INSTR_READ_FIELD(nbatch_original); + INSTR_READ_FIELD(spacePeakKb); + } + remote_instr->state = (RemoteState *) instr; + } + break; +#endif + default: + break; + } + + str->cursor = tmp_head - &str->data[0]; +} + +/* + * SerializeLocalInstr + * + * Serialize local instruments in the planstate tree for sending. + */ +static bool +SerializeLocalInstr(PlanState *planstate, SerializeState *ss) +{ + /* + * We should handle InitPlan/SubPlan the same as in ExplainSubPlans. + * But we do not want another planstate_tree_walker, + * it is ok to use plan_node_id in place of plan_id. 
+ */ + int plan_node_id = planstate->plan->plan_node_id; + if (bms_is_member(plan_node_id, ss->printed_nodes)) + return false; + else + ss->printed_nodes = bms_add_member(ss->printed_nodes, plan_node_id); + + /* For CteScan producer, deal with its child directly */ + if (IsA(planstate, CteScanState)) + planstate = ((CteScanState *)planstate)->cteplanstate; + + if (planstate->instrument) + { + /* clean up the instrumentation state as in ExplainNode */ + InstrEndLoop(planstate->instrument); + InstrOut(&ss->buf, planstate->plan, planstate->instrument); + //WorkerInstrOut(&ss->buf, planstate->worker_instrument); + SpecInstrOut(&ss->buf, nodeTag(planstate->plan), planstate); + } + else + { + /* should not be NULL */ + elog(ERROR, "SerializeLocalInstr: instrument is NULL, %d", + nodeTag(planstate)); + } + + return planstate_tree_walker(planstate, SerializeLocalInstr, ss); +} + +/* + * SendLocalInstr + * + * Serialize local instrument of the given planstate and send it to upper node. + */ +void +SendLocalInstr(PlanState *planstate) +{ + SerializeState ss; + + /* Construct str with the same logic in ExplainNode */ + ss.printed_nodes = NULL; + pq_beginmessage(&ss.buf, 'i'); + SerializeLocalInstr(planstate, &ss); + pq_endmessage(&ss.buf); + bms_free(ss.printed_nodes); + pq_flush(); +} + +/* + * combineRemoteInstr + * + * tool function to combine received instrumentation of all nodes, + * currently it choose max value. + */ +static void +combineRemoteInstr(RemoteInstr *rtarget, RemoteInstr *rsrc) +{ + Instrumentation *target = &rtarget->instr; + Instrumentation *src = &rsrc->instr; + + Assert(rtarget->id == rsrc->id); + Assert(rtarget->nodeTag == rsrc->nodeTag); + + INSTR_MAX_FIELD(need_timer); + INSTR_MAX_FIELD(need_bufusage); + INSTR_MAX_FIELD(running); + + INSTR_MAX_FIELD(starttime.tv_sec); + INSTR_MAX_FIELD(starttime.tv_nsec); + INSTR_MAX_FIELD(counter.tv_sec); + INSTR_MAX_FIELD(counter.tv_nsec); + + INSTR_MAX_FIELD(firsttuple); + INSTR_MAX_FIELD(tuplecount); + + INSTR_MAX_FIELD(bufusage_start.shared_blks_hit); + INSTR_MAX_FIELD(bufusage_start.shared_blks_read); + INSTR_MAX_FIELD(bufusage_start.shared_blks_dirtied); + INSTR_MAX_FIELD(bufusage_start.shared_blks_written); + INSTR_MAX_FIELD(bufusage_start.local_blks_hit); + INSTR_MAX_FIELD(bufusage_start.local_blks_read); + INSTR_MAX_FIELD(bufusage_start.local_blks_dirtied); + INSTR_MAX_FIELD(bufusage_start.local_blks_written); + INSTR_MAX_FIELD(bufusage_start.temp_blks_read); + INSTR_MAX_FIELD(bufusage_start.temp_blks_written); + INSTR_MAX_FIELD(bufusage_start.blk_read_time.tv_sec); + INSTR_MAX_FIELD(bufusage_start.blk_read_time.tv_nsec); + INSTR_MAX_FIELD(bufusage_start.blk_write_time.tv_sec); + INSTR_MAX_FIELD(bufusage_start.blk_write_time.tv_nsec); + + INSTR_MAX_FIELD(startup); + INSTR_MAX_FIELD(total); + INSTR_MAX_FIELD(ntuples); + INSTR_MAX_FIELD(nloops); + INSTR_MAX_FIELD(nfiltered1); + INSTR_MAX_FIELD(nfiltered2); + + INSTR_MAX_FIELD(bufusage.shared_blks_hit); + INSTR_MAX_FIELD(bufusage.shared_blks_read); + INSTR_MAX_FIELD(bufusage.shared_blks_dirtied); + INSTR_MAX_FIELD(bufusage.shared_blks_written); + INSTR_MAX_FIELD(bufusage.local_blks_hit); + INSTR_MAX_FIELD(bufusage.local_blks_read); + INSTR_MAX_FIELD(bufusage.local_blks_dirtied); + INSTR_MAX_FIELD(bufusage.local_blks_written); + INSTR_MAX_FIELD(bufusage.temp_blks_read); + INSTR_MAX_FIELD(bufusage.temp_blks_written); + INSTR_MAX_FIELD(bufusage.blk_read_time.tv_sec); + INSTR_MAX_FIELD(bufusage.blk_read_time.tv_nsec); + INSTR_MAX_FIELD(bufusage.blk_write_time.tv_sec); + 
INSTR_MAX_FIELD(bufusage.blk_write_time.tv_nsec); + + rtarget->nworkers_launched = Max(rtarget->nworkers_launched, rsrc->nworkers_launched); +} + +/* + * HandleRemoteInstr + * + * Handle remote instrument message and save it by plan_node_id. + */ +void +HandleRemoteInstr(char *msg_body, size_t len, int nodeoid, ResponseCombiner *combiner) +{ + RemoteInstr recv_instr; + StringInfo recv_str; + bool found; + RemoteInstr *cur_instr; + + if (combiner->recv_instr_htbl == NULL) + { + elog(ERROR, "combiner is not prepared for instrumentation"); + } + elog(DEBUG1, "Handle remote instrument: nodeoid %d", nodeoid); + + recv_str = makeStringInfo(); + appendBinaryStringInfo(recv_str, msg_body, len); + + while(recv_str->cursor < recv_str->len) + { + InstrIn(recv_str, &recv_instr); + SpecInstrIn(recv_str, &recv_instr); + cur_instr = (RemoteInstr *) hash_search(combiner->recv_instr_htbl, + (void *) &recv_instr.id, + HASH_ENTER, &found); + if (found) + { + combineRemoteInstr(cur_instr, &recv_instr); + } + else + { + memcpy(cur_instr, &recv_instr, sizeof(RemoteInstr)); + } + } +} + +/* + * attachRemoteSpecialInstr + * + * Attach specific information in planstate. + */ +static void +attachRemoteSpecialInstr(PlanState *planstate, RemoteInstr *rinstr) +{ + int nodeTag = nodeTag(planstate->plan); + + switch(nodeTag) + { + case T_Gather: + { + GatherState *gs = (GatherState *) planstate; + gs->nworkers_launched = rinstr->nworkers_launched; + } + break; + case T_GatherMerge: + { + GatherMergeState *gms = (GatherMergeState *) planstate; + gms->nworkers_launched = rinstr->nworkers_launched; + } + break; + default: + break; + } +} + +/* + * AttachRemoteInstr + * + * Attach instrument information in planstate from saved info in combiner. + */ +bool +AttachRemoteInstr(PlanState *planstate, ResponseCombiner *combiner) +{ + int plan_node_id = planstate->plan->plan_node_id; + if (bms_is_member(plan_node_id, combiner->printed_nodes)) + return false; + else + combiner->printed_nodes = bms_add_member(combiner->printed_nodes, plan_node_id); + + if (IsA(planstate, RemoteSubplanState) && NULL == planstate->lefttree) + { + Plan *plan = planstate->plan; + PlanState *remote_ps; + EState *estate = planstate->state; + + remote_ps = ExecInitNode(plan->lefttree, estate, EXEC_FLAG_EXPLAIN_ONLY); + planstate->lefttree = remote_ps; + } + + if (planstate->instrument) + { + bool found; + RemoteInstr *rinstr= (RemoteInstr *) hash_search(combiner->recv_instr_htbl, + (void *) &plan_node_id, + HASH_FIND, &found); + if (!found) + { + elog(DEBUG1, "AttachRemoteInstr: remote instrumentation not found, tag %d id %d", + nodeTag(planstate->plan), plan_node_id); + } + else + { + Assert(rinstr->nodeTag == nodeTag(planstate->plan)); + Assert(rinstr->id == plan_node_id); + + memcpy(planstate->instrument, &rinstr->instr, sizeof(Instrumentation)); + attachRemoteSpecialInstr(planstate, rinstr); + } + } + else + { + /* should not be NULL */ + elog(ERROR, "AttachRemoteInstr: instrument is NULL, tag %d id %d", + nodeTag(planstate), plan_node_id); + } + + return planstate_tree_walker(planstate, AttachRemoteInstr, combiner); +} diff --git a/src/backend/executor/execParallel.c b/src/backend/executor/execParallel.c index d051eb2b..8f0d9718 100644 --- a/src/backend/executor/execParallel.c +++ b/src/backend/executor/execParallel.c @@ -244,6 +244,19 @@ ExecParallelEstimate(PlanState *planstate, ExecParallelEstimateContext *e) /* Count this node. 
*/ e->nnodes++; + /* + * if we are running with instrument option, must init + * full plantree here, to ensure e->nnodes correct. + */ + if (planstate->instrument && + IsA(planstate, RemoteSubplanState) && + NULL == planstate->lefttree) + { + planstate->lefttree = ExecInitNode(planstate->plan->lefttree, + planstate->state, + EXEC_FLAG_EXPLAIN_ONLY); + } + /* Call estimators for parallel-aware nodes. */ if (planstate->plan->parallel_aware) { diff --git a/src/backend/executor/execProcnode.c b/src/backend/executor/execProcnode.c index 0119064b..f8f15db2 100644 --- a/src/backend/executor/execProcnode.c +++ b/src/backend/executor/execProcnode.c @@ -866,6 +866,9 @@ ExecShutdownNode(PlanState *node) case T_GatherMergeState: ExecShutdownGatherMerge((GatherMergeState *) node); break; + case T_RemoteSubplanState: + ExecShutdownRemoteSubplan((RemoteSubplanState *) node); + break; default: break; } diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 80829005..39796a82 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -59,6 +59,7 @@ #include "pgxc/xc_maintenance_mode.h" #include "catalog/pgxc_class.h" #ifdef __TBASE__ +#include "commands/explain_dist.h" #include "pgxc/squeue.h" #include "executor/execParallel.h" #include "postmaster/postmaster.h" @@ -297,6 +298,15 @@ InitResponseCombiner(ResponseCombiner *combiner, int node_count, combiner->recv_datarows = 0; combiner->prerowBuffers = NULL; combiner->is_abort = false; + combiner->printed_nodes = NULL; + { + HASHCTL ctl; + + ctl.keysize = sizeof(int); + ctl.entrysize = sizeof(RemoteInstr); + + combiner->recv_instr_htbl = hash_create("Remote Instrument", 16, &ctl, HASH_ELEM); + } #endif } @@ -1098,6 +1108,18 @@ CloseCombiner(ResponseCombiner *combiner) pfree(combiner->tapemarks); combiner->tapemarks = NULL; } +#ifdef __TBASE__ + if (combiner->recv_instr_htbl) + { + hash_destroy(combiner->recv_instr_htbl); + combiner->recv_instr_htbl = NULL; + } + if (combiner->printed_nodes) + { + bms_free(combiner->printed_nodes); + combiner->printed_nodes = NULL; + } +#endif } /* @@ -2671,6 +2693,10 @@ FetchTuple(ResponseCombiner *combiner) { /* Do nothing. It must have been handled in handle_response() */ } + else if (res == RESPONSE_INSTR) + { + /* Do nothing. It must have been handled in handle_response() */ + } else { // Can not get here? @@ -3306,6 +3332,12 @@ handle_response(PGXCNodeHandle *conn, ResponseCombiner *combiner) #endif return RESPONSE_ASSIGN_GXID; +#ifdef __TBASE__ + case 'i': /* Remote Instrument */ + if (msg_len > 0) + HandleRemoteInstr(msg, msg_len, conn->nodeoid, combiner); + return RESPONSE_INSTR; +#endif default: /* sync lost? */ elog(WARNING, "Received unsupported message type: %c", msg_type); @@ -10487,7 +10519,7 @@ ExecFinishInitRemoteSubplan(RemoteSubplanState *node) errmsg("Failed to send command ID to data nodes"))); } pgxc_node_send_plan(connection, cursor, "Remote Subplan", - node->subplanstr, node->nParamRemote, paramtypes); + node->subplanstr, node->nParamRemote, paramtypes, estate->es_instrument); if (enable_statistic) { @@ -11100,6 +11132,29 @@ ExecReScanRemoteSubplan(RemoteSubplanState *node) } #ifdef __TBASE__ +/* + * ExecShutdownRemoteSubplan + * + * for instrumentation only, init full planstate tree, + * then attach recieved remote instrumenation. 
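+ *
+ * On the coordinator the subtree below a RemoteSubplan normally has no
+ * local planstate (it runs on the datanodes), so the subtree is created
+ * here with EXEC_FLAG_EXPLAIN_ONLY purely to give AttachRemoteInstr nodes
+ * to hang the received instrumentation on; ExecShutdownNode dispatches to
+ * this function, and ExecParallelEstimate performs the same initialization
+ * for parallel queries.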
+ */ +void +ExecShutdownRemoteSubplan(RemoteSubplanState *node) +{ + ResponseCombiner *combiner = &node->combiner; + PlanState *ps = &combiner->ss.ps; + Plan *plan = ps->plan; + EState *estate = ps->state; + + if (estate->es_instrument) + { + if (!ps->lefttree) + ps->lefttree = ExecInitNode(plan->lefttree, estate, EXEC_FLAG_EXPLAIN_ONLY); + + AttachRemoteInstr(ps->lefttree, combiner); + } +} + void ExecFinishRemoteSubplan(RemoteSubplanState *node) {// #lizard forgives diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c index aa1070be..3baad358 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -2064,7 +2064,7 @@ pgxc_node_send_parse(PGXCNodeHandle * handle, const char* statement, int pgxc_node_send_plan(PGXCNodeHandle * handle, const char *statement, const char *query, const char *planstr, - short num_params, Oid *param_types) + short num_params, Oid *param_types, int instrument_options) { int stmtLen; int queryLen; @@ -2093,8 +2093,8 @@ pgxc_node_send_plan(PGXCNodeHandle * handle, const char *statement, paramTypes[i] = format_type_be(param_types[i]); paramTypeLen += strlen(paramTypes[i]) + 1; } - /* size + pnameLen + queryLen + parameters */ - msgLen = 4 + queryLen + stmtLen + planLen + paramTypeLen; + /* size + pnameLen + queryLen + parameters + instrument_options */ + msgLen = 4 + queryLen + stmtLen + planLen + paramTypeLen + 4; /* msgType + msgLen */ if (ensure_out_buffer_capacity(handle->outEnd + 1 + msgLen, handle) != 0) @@ -2134,6 +2134,10 @@ pgxc_node_send_plan(PGXCNodeHandle * handle, const char *statement, pfree(paramTypes[i]); } pfree(paramTypes); + /* instrument_options */ + instrument_options = htonl(instrument_options); + memcpy(handle->outBuffer + handle->outEnd, &instrument_options, 4); + handle->outEnd += 4; handle->last_command = 'a'; diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index db2b5639..c8bdc980 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -106,6 +106,7 @@ #include "executor/execParallel.h" #include "pgxc/poolutils.h" #include "commands/vacuum.h" +#include "commands/explain_dist.h" #endif #endif @@ -1995,8 +1996,9 @@ exec_plan_message(const char *query_string, /* source of the query */ const char *stmt_name, /* name for prepared stmt */ const char *plan_string, /* encoded plan to execute */ char **paramTypeNames, /* parameter type names */ - int numParams) /* number of parameters */ -{// #lizard forgives + int numParams, /* number of parameters */ + int instrument_options) /* explain analyze option */ +{ MemoryContext oldcontext; bool save_log_statement_stats = log_statement_stats; char msec_str[32]; @@ -2094,6 +2096,8 @@ exec_plan_message(const char *query_string, /* source of the query */ StorePreparedStatement(stmt_name, psrc, false, true); SetRemoteSubplan(psrc, plan_string); + /* set instrument_options, default 0 */ + psrc->instrument_options = instrument_options; MemoryContextSwitchTo(oldcontext); @@ -2691,26 +2695,26 @@ exec_bind_message(StringInfo input_message) /* Get epq context, only datanodes need them */ if (IS_PGXC_DATANODE && (IsConnFromCoord() || IsConnFromDatanode())) { - num_epq_tuple = pq_getmsgint(input_message, 2); - if (num_epq_tuple > 0) - { - int i; - - portal->epqContext = palloc(sizeof(RemoteEPQContext)); - portal->epqContext->ntuples = num_epq_tuple; - portal->epqContext->tid = palloc(num_epq_tuple * sizeof(ItemPointerData)); - portal->epqContext->rtidx = palloc(num_epq_tuple * sizeof(int)); - 
portal->epqContext->nodeid = palloc(num_epq_tuple * sizeof(uint32)); - - for (i = 0; i < num_epq_tuple; i++) - { - portal->epqContext->rtidx[i] = pq_getmsgint(input_message, 2); - portal->epqContext->tid[i].ip_blkid.bi_hi = pq_getmsgint(input_message, 2); - portal->epqContext->tid[i].ip_blkid.bi_lo = pq_getmsgint(input_message, 2); - portal->epqContext->tid[i].ip_posid = pq_getmsgint(input_message, 2); - portal->epqContext->nodeid[i] = pq_getmsgint(input_message, 4); - } - } + num_epq_tuple = pq_getmsgint(input_message, 2); + if (num_epq_tuple > 0) + { + int i; + + portal->epqContext = palloc(sizeof(RemoteEPQContext)); + portal->epqContext->ntuples = num_epq_tuple; + portal->epqContext->tid = palloc(num_epq_tuple * sizeof(ItemPointerData)); + portal->epqContext->rtidx = palloc(num_epq_tuple * sizeof(int)); + portal->epqContext->nodeid = palloc(num_epq_tuple * sizeof(uint32)); + + for (i = 0; i < num_epq_tuple; i++) + { + portal->epqContext->rtidx[i] = pq_getmsgint(input_message, 2); + portal->epqContext->tid[i].ip_blkid.bi_hi = pq_getmsgint(input_message, 2); + portal->epqContext->tid[i].ip_blkid.bi_lo = pq_getmsgint(input_message, 2); + portal->epqContext->tid[i].ip_posid = pq_getmsgint(input_message, 2); + portal->epqContext->nodeid[i] = pq_getmsgint(input_message, 4); + } + } } pq_getmsgend(input_message); @@ -2760,6 +2764,9 @@ exec_bind_message(StringInfo input_message) cplan->stmt_list, cplan); + /* set instrument before PortalStart, default 0 */ + portal->up_instrument = psrc->instrument_options; + /* Done with the snapshot used for parameter I/O and parsing/planning */ if (snapshot_set) PopActiveSnapshot(); @@ -3025,6 +3032,15 @@ exec_execute_message(const char *portal_name, long max_rows) CommandCounterIncrement(); } + +#ifdef __TBASE__ + if (portal->up_instrument && + portal->queryDesc && + portal->queryDesc->myindex == -1) + { + SendLocalInstr(portal->queryDesc->planstate); + } +#endif /* Send appropriate CommandComplete to client */ EndCommand(completionTag, dest); @@ -5486,6 +5502,7 @@ PostgresMain(int argc, char *argv[], const char *plan_string; int numParams; char **paramTypes = NULL; + int instrument_options = 0; /* Set statement_timestamp() */ SetCurrentStatementStartTimestamp(); @@ -5502,10 +5519,14 @@ PostgresMain(int argc, char *argv[], paramTypes[i] = (char *) pq_getmsgstring(&input_message); } + + instrument_options = pq_getmsgint(&input_message, 4); + pq_getmsgend(&input_message); exec_plan_message(query_string, stmt_name, plan_string, - paramTypes, numParams); + paramTypes, numParams, + instrument_options); } break; #endif diff --git a/src/backend/tcop/pquery.c b/src/backend/tcop/pquery.c index 16179e73..c1eadf2c 100644 --- a/src/backend/tcop/pquery.c +++ b/src/backend/tcop/pquery.c @@ -681,8 +681,11 @@ PortalStart(Portal portal, ParamListInfo params, None_Receiver, params, NULL, +#ifdef __TBASE__ + portal->up_instrument); +#else 0); - +#endif /* * set information about EvalPlanQual if any, they will be fill in * estate later after it been created. @@ -1006,7 +1009,7 @@ PortalStart(Portal portal, ParamListInfo params, None_Receiver, params, portal->queryEnv, - 0); + portal->up_instrument); } else #endif @@ -1017,8 +1020,11 @@ PortalStart(Portal portal, ParamListInfo params, None_Receiver, params, portal->queryEnv, +#ifdef __TBASE__ + portal->up_instrument); +#else 0); - +#endif /* * set information about EvalPlanQual if any, they will be fill in * estate later after it been created. 
diff --git a/src/include/commands/explain_dist.h b/src/include/commands/explain_dist.h new file mode 100644 index 00000000..fe682bda --- /dev/null +++ b/src/include/commands/explain_dist.h @@ -0,0 +1,36 @@ +/*------------------------------------------------------------------------- + * + * explain_dist.h + * + * Portions Copyright (c) 2018, Tencent TBase-C Group. + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/commands/explain_dist.h + * + *------------------------------------------------------------------------- + */ +#ifndef EXPLAINDIST_H +#define EXPLAINDIST_H + +#include "commands/explain.h" +#include "pgxc/execRemote.h" + +/* Hash table entry */ +typedef struct +{ + int id; /* unique id of current plan node */ + int nodeTag; /* type of current plan node */ + Instrumentation instr; /* instrument of current plan node */ + + /* for Gather */ + int nworkers_launched; /* worker num of gather */ + + /* for Hash: */ +} RemoteInstr; + +extern void SendLocalInstr(PlanState *planstate); +extern void HandleRemoteInstr(char *msg_body, size_t len, int nodeoid, ResponseCombiner *combiner); +extern bool AttachRemoteInstr(PlanState *planstate, ResponseCombiner *combiner); + +#endif /* EXPLAINDIST_H */ \ No newline at end of file diff --git a/src/include/pgxc/execRemote.h b/src/include/pgxc/execRemote.h index c6e2e0e8..20e73f2e 100644 --- a/src/include/pgxc/execRemote.h +++ b/src/include/pgxc/execRemote.h @@ -52,6 +52,7 @@ #endif #ifdef __TBASE__ +#define RESPONSE_INSTR 13 #define UINT32_BITS_NUM 32 #define WORD_NUMBER_FOR_NODES (MAX_NODES_NUMBER / UINT32_BITS_NUM) @@ -174,6 +175,10 @@ typedef struct ResponseCombiner PGXCNodeHandle **conns; int ccount; uint64 recv_datarows; + + /* for remote instrument */ + Bitmapset *printed_nodes; /* ids of plan nodes we've handled */ + HTAB *recv_instr_htbl; /* received str hash table for each plan_node_id */ #endif } ResponseCombiner; @@ -422,6 +427,7 @@ extern void SetCurrentHandlesReadonly(void); extern TupleDesc create_tuple_desc(char *msg_body, size_t len); extern void ExecFinishRemoteSubplan(RemoteSubplanState *node); +extern void ExecShutdownRemoteSubplan(RemoteSubplanState *node); #endif #ifdef __SUBSCRIPTION__ diff --git a/src/include/pgxc/pgxcnode.h b/src/include/pgxc/pgxcnode.h index 3773cac2..15c4ef46 100644 --- a/src/include/pgxc/pgxcnode.h +++ b/src/include/pgxc/pgxcnode.h @@ -222,7 +222,7 @@ extern int pgxc_node_send_query_extended(PGXCNodeHandle *handle, const char *que bool send_describe, int fetch_size); extern int pgxc_node_send_plan(PGXCNodeHandle * handle, const char *statement, const char *query, const char *planstr, - short num_params, Oid *param_types); + short num_params, Oid *param_types, int instrument_options); extern int pgxc_node_send_gid(PGXCNodeHandle *handle, char* gid); #ifdef __TWO_PHASE_TRANS__ extern int pgxc_node_send_starter(PGXCNodeHandle *handle, char* startnode); diff --git a/src/include/utils/plancache.h b/src/include/utils/plancache.h index 0cdeadc3..55e70db2 100644 --- a/src/include/utils/plancache.h +++ b/src/include/utils/plancache.h @@ -181,6 +181,7 @@ typedef struct CachedPlanSource #endif #ifdef __TBASE__ bool insert_into; + int instrument_options #endif } CachedPlanSource; diff --git a/src/include/utils/portal.h b/src/include/utils/portal.h index 2a4a6c42..5d039875 100644 --- a/src/include/utils/portal.h +++ b/src/include/utils/portal.h @@ -266,6 +266,7 @@ typedef struct PortalData /* information 
about EvalPlanQual, pass it to queryDesc */ RemoteEPQContext *epqContext; + int up_instrument; /* explain analyze option from cn */ #endif } PortalData; From 4e1a137415a3e685f4879ce2284ae9bac3c7b40d Mon Sep 17 00:00:00 2001 From: andrelin Date: Fri, 26 Feb 2021 10:15:40 +0800 Subject: [PATCH 132/578] initialize hashtable only when instrument flaged --- src/backend/pgxc/pool/execRemote.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 39796a82..ab99d828 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -299,14 +299,7 @@ InitResponseCombiner(ResponseCombiner *combiner, int node_count, combiner->prerowBuffers = NULL; combiner->is_abort = false; combiner->printed_nodes = NULL; - { - HASHCTL ctl; - - ctl.keysize = sizeof(int); - ctl.entrysize = sizeof(RemoteInstr); - - combiner->recv_instr_htbl = hash_create("Remote Instrument", 16, &ctl, HASH_ELEM); - } + combiner->recv_instr_htbl = NULL; #endif } @@ -9915,6 +9908,16 @@ ExecInitRemoteSubplan(RemoteSubplan *node, EState *estate, int eflags) combiner->ss.ps.state = estate; combiner->ss.ps.ExecProcNode = ExecRemoteSubplan; + if (estate->es_instrument) + { + HASHCTL ctl; + + ctl.keysize = sizeof(int); + ctl.entrysize = sizeof(RemoteInstr); + + combiner->recv_instr_htbl = hash_create("Remote Instrument", 16, &ctl, HASH_ELEM); + } + combiner->ss.ps.qual = NULL; combiner->request_type = REQUEST_TYPE_QUERY; From 7e4b0d7d1be4f40c882e6afbca750e1970615982 Mon Sep 17 00:00:00 2001 From: andrelin Date: Mon, 1 Mar 2021 15:01:04 +0800 Subject: [PATCH 133/578] fix compile error --- src/include/utils/plancache.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/include/utils/plancache.h b/src/include/utils/plancache.h index 55e70db2..e04b03d8 100644 --- a/src/include/utils/plancache.h +++ b/src/include/utils/plancache.h @@ -181,7 +181,7 @@ typedef struct CachedPlanSource #endif #ifdef __TBASE__ bool insert_into; - int instrument_options + int instrument_options; #endif } CachedPlanSource; From f6cd1fe9950c6ce911745dd9904b429e24791a0e Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Tue, 29 Aug 2017 13:22:49 -0400 Subject: [PATCH 134/578] Propagate sort instrumentation from workers back to leader. Up until now, when parallel query was used, no details about the sort method or space used by the workers were available; details were shown only for any sorting done by the leader. Fix that. Commit 1177ab1dabf72bafee8f19d904cee3a299f25892 forced the test case added by commit 1f6d515a67ec98194c23a5db25660856c9aab944 to run without parallelism; now that we have this infrastructure, allow that again, with a little tweaking to make it pass with and without force_parallel_mode. 
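The hand-off follows the usual four-phase DSM pattern for per-node parallel
state; a condensed sketch is below, with placeholder Foo* names standing in
for the Sort-specific code this patch adds to nodeSort.c (the real versions
appear further down). The leader sizes and zeroes one TuplesortInstrumentation
slot per worker, each worker fills only its own slot once its sort completes,
and the leader copies the chunk into backend-local memory at shutdown so
EXPLAIN can still print a "Worker N: Sort Method: ..." line after the segment
is detached.

    /* Illustrative only: shared chunk keyed by plan_node_id; zeroed slots
     * read back as SORT_TYPE_STILL_IN_PROGRESS and are skipped by EXPLAIN. */
    typedef struct SharedFooInfo
    {
        int         num_workers;
        TuplesortInstrumentation sinstrument[FLEXIBLE_ARRAY_MEMBER];
    } SharedFooInfo;

    /* Leader, before launching workers: reserve space in the DSM. */
    static void
    FooEstimate(FooState *node, ParallelContext *pcxt)
    {
        Size size = add_size(offsetof(SharedFooInfo, sinstrument),
                             mul_size(pcxt->nworkers,
                                      sizeof(TuplesortInstrumentation)));

        shm_toc_estimate_chunk(&pcxt->estimator, size);
        shm_toc_estimate_keys(&pcxt->estimator, 1);
    }

    /* Leader: allocate, zero, and publish the chunk under the node's id. */
    static void
    FooInitializeDSM(FooState *node, ParallelContext *pcxt)
    {
        Size size = offsetof(SharedFooInfo, sinstrument) +
            pcxt->nworkers * sizeof(TuplesortInstrumentation);

        node->shared_info = shm_toc_allocate(pcxt->toc, size);
        memset(node->shared_info, 0, size);
        node->shared_info->num_workers = pcxt->nworkers;
        shm_toc_insert(pcxt->toc, node->ps.plan->plan_node_id,
                       node->shared_info);
    }

    /* Worker: attach; it later writes sinstrument[ParallelWorkerNumber]. */
    static void
    FooInitializeWorker(FooState *node, shm_toc *toc)
    {
        node->shared_info = shm_toc_lookup(toc, node->ps.plan->plan_node_id,
                                           true);
        node->am_worker = true;
    }

    /* Leader, at shutdown: keep a private copy for EXPLAIN to read later. */
    static void
    FooRetrieveInstrumentation(FooState *node)
    {
        Size           size;
        SharedFooInfo *copy;

        if (node->shared_info == NULL)
            return;
        size = offsetof(SharedFooInfo, sinstrument) +
            node->shared_info->num_workers * sizeof(TuplesortInstrumentation);
        copy = palloc(size);
        memcpy(copy, node->shared_info, size);
        node->shared_info = copy;
    }
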
Robert Haas and Tom Lane Discussion: http://postgr.es/m/CA+Tgmoa2VBZW6S8AAXfhpHczb=Rf6RqQ2br+zJvEgwJ0uoD_tQ@mail.gmail.com --- src/backend/commands/explain.c | 57 ++++++++++++- src/backend/executor/execParallel.c | 104 +++++++++++++++--------- src/backend/executor/nodeSort.c | 97 ++++++++++++++++++++++ src/backend/utils/sort/tuplesort.c | 56 ++++++++++--- src/include/executor/nodeSort.h | 9 +- src/include/nodes/execnodes.h | 12 +++ src/include/utils/tuplesort.h | 96 ++++++++++++++-------- src/test/regress/expected/subselect.out | 48 +++++++++++ src/test/regress/sql/subselect.sql | 43 +++++++++- 9 files changed, 433 insertions(+), 89 deletions(-) diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index c58bd433..4ba4dc81 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -2534,15 +2534,21 @@ show_tablesample(TableSampleClause *tsc, PlanState *planstate, static void show_sort_info(SortState *sortstate, ExplainState *es) { - if (es->analyze && sortstate->sort_Done && - sortstate->tuplesortstate != NULL) + if (!es->analyze) + return; + + if (sortstate->sort_Done && sortstate->tuplesortstate != NULL) { Tuplesortstate *state = (Tuplesortstate *) sortstate->tuplesortstate; + TuplesortInstrumentation stats; const char *sortMethod; const char *spaceType; long spaceUsed; - tuplesort_get_stats(state, &sortMethod, &spaceType, &spaceUsed); + tuplesort_get_stats(state, &stats); + sortMethod = tuplesort_method_name(stats.sortMethod); + spaceType = tuplesort_space_type_name(stats.spaceType); + spaceUsed = stats.spaceUsed; if (es->format == EXPLAIN_FORMAT_TEXT) { @@ -2557,6 +2563,51 @@ show_sort_info(SortState *sortstate, ExplainState *es) ExplainPropertyText("Sort Space Type", spaceType, es); } } + + if (sortstate->shared_info != NULL) + { + int n; + bool opened_group = false; + + for (n = 0; n < sortstate->shared_info->num_workers; n++) + { + TuplesortInstrumentation *sinstrument; + const char *sortMethod; + const char *spaceType; + long spaceUsed; + + sinstrument = &sortstate->shared_info->sinstrument[n]; + if (sinstrument->sortMethod == SORT_TYPE_STILL_IN_PROGRESS) + continue; /* ignore any unfilled slots */ + sortMethod = tuplesort_method_name(sinstrument->sortMethod); + spaceType = tuplesort_space_type_name(sinstrument->spaceType); + spaceUsed = sinstrument->spaceUsed; + + if (es->format == EXPLAIN_FORMAT_TEXT) + { + appendStringInfoSpaces(es->str, es->indent * 2); + appendStringInfo(es->str, + "Worker %d: Sort Method: %s %s: %ldkB\n", + n, sortMethod, spaceType, spaceUsed); + } + else + { + if (!opened_group) + { + ExplainOpenGroup("Workers", "Workers", false, es); + opened_group = true; + } + ExplainOpenGroup("Worker", NULL, true, es); + ExplainPropertyInteger("Worker Number", n, es); + ExplainPropertyText("Sort Method", sortMethod, es); + ExplainPropertyLong("Sort Space Used", spaceUsed, es); + ExplainPropertyText("Sort Space Type", spaceType, es); + ExplainCloseGroup("Worker", NULL, true, es); + } + } + if (opened_group) + ExplainCloseGroup("Workers", "Workers", false, es); + } } /* diff --git a/src/backend/executor/execParallel.c b/src/backend/executor/execParallel.c index 8f0d9718..45dc56fb 100644 --- a/src/backend/executor/execParallel.c +++ b/src/backend/executor/execParallel.c @@ -28,9 +28,10 @@ #include "executor/nodeBitmapHeapscan.h" #include "executor/nodeCustom.h" #include "executor/nodeForeignscan.h" -#include "executor/nodeSeqscan.h" #include "executor/nodeIndexscan.h" #include "executor/nodeIndexonlyscan.h" +#include 
"executor/nodeSeqscan.h" +#include "executor/nodeSort.h" #include "executor/tqueue.h" #include "nodes/nodeFuncs.h" #include "optimizer/planmain.h" @@ -227,10 +228,10 @@ ExecSerializePlan(Plan *plan, EState *estate) } /* - * Ordinary plan nodes won't do anything here, but parallel-aware plan nodes - * may need some state which is shared across all parallel workers. Before - * we size the DSM, give them a chance to call shm_toc_estimate_chunk or - * shm_toc_estimate_keys on &pcxt->estimator. + * Parallel-aware plan nodes (and occasionally others) may need some state + * which is shared across all parallel workers. Before we size the DSM, give + * them a chance to call shm_toc_estimate_chunk or shm_toc_estimate_keys on + * &pcxt->estimator. * * While we're at it, count the number of PlanState nodes in the tree, so * we know how many SharedPlanStateInstrumentation structures we need. @@ -257,50 +258,56 @@ ExecParallelEstimate(PlanState *planstate, ExecParallelEstimateContext *e) EXEC_FLAG_EXPLAIN_ONLY); } - /* Call estimators for parallel-aware nodes. */ - if (planstate->plan->parallel_aware) - { switch (nodeTag(planstate)) { case T_SeqScanState: + if (planstate->plan->parallel_aware) ExecSeqScanEstimate((SeqScanState *) planstate, e->pcxt); break; case T_IndexScanState: + if (planstate->plan->parallel_aware) ExecIndexScanEstimate((IndexScanState *) planstate, e->pcxt); break; case T_IndexOnlyScanState: + if (planstate->plan->parallel_aware) ExecIndexOnlyScanEstimate((IndexOnlyScanState *) planstate, e->pcxt); break; case T_ForeignScanState: + if (planstate->plan->parallel_aware) ExecForeignScanEstimate((ForeignScanState *) planstate, e->pcxt); break; case T_CustomScanState: + if (planstate->plan->parallel_aware) ExecCustomScanEstimate((CustomScanState *) planstate, e->pcxt); break; case T_BitmapHeapScanState: + if (planstate->plan->parallel_aware) ExecBitmapHeapEstimate((BitmapHeapScanState *) planstate, e->pcxt); break; + case T_SortState: + /* even when not parallel-aware */ + ExecSortEstimate((SortState *) planstate, e->pcxt); #ifdef __TBASE__ - + if (planstate->plan->parallel_aware) + ReDistributeEstimate(planstate, e->pcxt); + break; /* For remote query and remote subplan, there is no need for shared storage. */ case T_RemoteQueryState: case T_RemoteSubplanState: break; - case T_HashJoinState: + if (planstate->plan->parallel_aware) ExecParallelHashJoinEstimate((HashJoinState*) planstate, e->pcxt); break; - case T_SortState: - ReDistributeEstimate(planstate, e->pcxt); - break; case T_AggState: + if (planstate->plan->parallel_aware) { AggState *aggstate = (AggState *)planstate; @@ -312,7 +319,6 @@ ExecParallelEstimate(PlanState *planstate, ExecParallelEstimateContext *e) default: break; } - } return planstate_tree_walker(planstate, ExecParallelEstimate, e); } @@ -337,60 +343,70 @@ ExecParallelInitializeDSM(PlanState *planstate, d->nnodes++; /* - * Call initializers for parallel-aware plan nodes. + * Call initializers for DSM-using plan nodes. * - * Ordinary plan nodes won't do anything here, but parallel-aware plan - * nodes may need to initialize shared state in the DSM before parallel - * workers are available. They can allocate the space they previously + * Most plan nodes won't do anything here, but plan nodes that allocated + * DSM may need to initialize shared state in the DSM before parallel + * workers are launched. 
They can allocate the space they previously * estimated using shm_toc_allocate, and add the keys they previously * estimated using shm_toc_insert, in each case targeting pcxt->toc. */ - if (planstate->plan->parallel_aware) - { switch (nodeTag(planstate)) { case T_SeqScanState: + if (planstate->plan->parallel_aware) ExecSeqScanInitializeDSM((SeqScanState *) planstate, d->pcxt); break; case T_IndexScanState: + if (planstate->plan->parallel_aware) ExecIndexScanInitializeDSM((IndexScanState *) planstate, d->pcxt); break; case T_IndexOnlyScanState: + if (planstate->plan->parallel_aware) ExecIndexOnlyScanInitializeDSM((IndexOnlyScanState *) planstate, d->pcxt); break; case T_ForeignScanState: + if (planstate->plan->parallel_aware) ExecForeignScanInitializeDSM((ForeignScanState *) planstate, d->pcxt); break; case T_CustomScanState: + if (planstate->plan->parallel_aware) ExecCustomScanInitializeDSM((CustomScanState *) planstate, d->pcxt); break; case T_BitmapHeapScanState: + if (planstate->plan->parallel_aware) ExecBitmapHeapInitializeDSM((BitmapHeapScanState *) planstate, d->pcxt); break; - + case T_SortState: + /* even when not parallel-aware */ + ExecSortInitializeDSM((SortState *) planstate, d->pcxt); #ifdef __TBASE__ + if (planstate->plan->parallel_aware) + ReDistributeInitializeDSM(planstate, d->pcxt); + break; case T_RemoteQueryState: + if (planstate->plan->parallel_aware) ExecRemoteQueryInitializeDSM((RemoteQueryState *)planstate, d->pcxt); break; case T_RemoteSubplanState: + if (planstate->plan->parallel_aware) ExecRemoteSubPlanInitializeDSM((RemoteSubplanState *)planstate, d->pcxt); break; case T_HashJoinState: + if (planstate->plan->parallel_aware) ExecParallelHashJoinInitializeDSM((HashJoinState *) planstate, d->pcxt); break; - case T_SortState: - ReDistributeInitializeDSM(planstate, d->pcxt); - break; case T_AggState: + if (planstate->plan->parallel_aware) { AggState *aggstate = (AggState *)planstate; @@ -403,7 +419,6 @@ ExecParallelInitializeDSM(PlanState *planstate, default: break; } - } return planstate_tree_walker(planstate, ExecParallelInitializeDSM, d); } @@ -914,6 +929,13 @@ ExecParallelRetrieveInstrumentation(PlanState *planstate, planstate->worker_instrument->num_workers = instrumentation->num_workers; memcpy(&planstate->worker_instrument->instrument, instrument, ibytes); + /* + * Perform any node-type-specific work that needs to be done. Currently, + * only Sort nodes need to do anything here. + */ + if (IsA(planstate, SortState)) + ExecSortRetrieveInstrumentation((SortState *) planstate); + return planstate_tree_walker(planstate, ExecParallelRetrieveInstrumentation, instrumentation); } @@ -1076,47 +1098,56 @@ ExecParallelInitializeWorker(PlanState *planstate, shm_toc *toc) if (planstate == NULL) return false; - /* Call initializers for parallel-aware plan nodes. 
*/ - if (planstate->plan->parallel_aware) - { switch (nodeTag(planstate)) { case T_SeqScanState: + if (planstate->plan->parallel_aware) ExecSeqScanInitializeWorker((SeqScanState *) planstate, toc); break; case T_IndexScanState: + if (planstate->plan->parallel_aware) ExecIndexScanInitializeWorker((IndexScanState *) planstate, toc); break; case T_IndexOnlyScanState: + if (planstate->plan->parallel_aware) ExecIndexOnlyScanInitializeWorker((IndexOnlyScanState *) planstate, toc); break; case T_ForeignScanState: + if (planstate->plan->parallel_aware) ExecForeignScanInitializeWorker((ForeignScanState *) planstate, toc); break; case T_CustomScanState: + if (planstate->plan->parallel_aware) ExecCustomScanInitializeWorker((CustomScanState *) planstate, toc); break; case T_BitmapHeapScanState: - ExecBitmapHeapInitializeWorker((BitmapHeapScanState *) planstate, toc); + if (planstate->plan->parallel_aware) + ExecBitmapHeapInitializeWorker( + (BitmapHeapScanState *) planstate, toc); break; + case T_SortState: + /* even when not parallel-aware */ + ExecSortInitializeWorker((SortState *) planstate, toc); #ifdef __TBASE__ + if (planstate->plan->parallel_aware) + ReDistributeInitializeWorker(planstate, toc); + break; case T_RemoteQueryState: + if (planstate->plan->parallel_aware) ExecRemoteQueryInitializeDSMWorker((RemoteQueryState *)planstate, toc); break; - - case T_RemoteSubplanState: + case T_RemoteSubplanState: + if (planstate->plan->parallel_aware) ExecRemoteSubPlanInitDSMWorker((RemoteSubplanState *)planstate, toc); break; - case T_HashJoinState: - ExecParallelHashJoinInitWorker((HashJoinState *) planstate, toc); - break; - case T_SortState: - ReDistributeInitializeWorker(planstate, toc); + if (planstate->plan->parallel_aware) + ExecParallelHashJoinInitWorker((HashJoinState *) planstate, toc); break; case T_AggState: + if (planstate->plan->parallel_aware) { AggState *aggstate = (AggState *)planstate; @@ -1128,7 +1159,6 @@ ExecParallelInitializeWorker(PlanState *planstate, shm_toc *toc) default: break; } - } return planstate_tree_walker(planstate, ExecParallelInitializeWorker, toc); } diff --git a/src/backend/executor/nodeSort.c b/src/backend/executor/nodeSort.c index 5a42eef5..3c35d902 100644 --- a/src/backend/executor/nodeSort.c +++ b/src/backend/executor/nodeSort.c @@ -15,6 +15,7 @@ #include "postgres.h" +#include "access/parallel.h" #include "executor/execdebug.h" #include "executor/nodeSort.h" #include "miscadmin.h" @@ -232,6 +233,15 @@ ExecSort(PlanState *pstate) node->sort_Done = true; node->bounded_Done = node->bounded; node->bound_Done = node->bound; + if (node->shared_info && node->am_worker) + { + TuplesortInstrumentation *si; + + Assert(IsParallelWorker()); + Assert(ParallelWorkerNumber <= node->shared_info->num_workers); + si = &node->shared_info->sinstrument[ParallelWorkerNumber]; + tuplesort_get_stats(tuplesortstate, si); + } SO1_printf("ExecSort: %s\n", "sorting done"); } @@ -444,3 +454,90 @@ ExecReScanSort(SortState *node) else tuplesort_rescan((Tuplesortstate *) node->tuplesortstate); } + +/* ---------------------------------------------------------------- + * Parallel Query Support + * ---------------------------------------------------------------- + */ + +/* ---------------------------------------------------------------- + * ExecSortEstimate + * + * Estimate space required to propagate sort statistics. 
+ * ---------------------------------------------------------------- + */ +void +ExecSortEstimate(SortState *node, ParallelContext *pcxt) +{ + Size size; + + /* don't need this if not instrumenting or no workers */ + if (!node->ss.ps.instrument || pcxt->nworkers == 0) + return; + + size = mul_size(pcxt->nworkers, sizeof(TuplesortInstrumentation)); + size = add_size(size, offsetof(SharedSortInfo, sinstrument)); + shm_toc_estimate_chunk(&pcxt->estimator, size); + shm_toc_estimate_keys(&pcxt->estimator, 1); +} + +/* ---------------------------------------------------------------- + * ExecSortInitializeDSM + * + * Initialize DSM space for sort statistics. + * ---------------------------------------------------------------- + */ +void +ExecSortInitializeDSM(SortState *node, ParallelContext *pcxt) +{ + Size size; + + /* don't need this if not instrumenting or no workers */ + if (!node->ss.ps.instrument || pcxt->nworkers == 0) + return; + + size = offsetof(SharedSortInfo, sinstrument) + + pcxt->nworkers * sizeof(TuplesortInstrumentation); + node->shared_info = shm_toc_allocate(pcxt->toc, size); + /* ensure any unfilled slots will contain zeroes */ + memset(node->shared_info, 0, size); + node->shared_info->num_workers = pcxt->nworkers; + shm_toc_insert(pcxt->toc, node->ss.ps.plan->plan_node_id, + node->shared_info); +} + +/* ---------------------------------------------------------------- + * ExecSortInitializeWorker + * + * Attach worker to DSM space for sort statistics. + * ---------------------------------------------------------------- + */ +void +ExecSortInitializeWorker(SortState *node, shm_toc *toc) +{ + node->shared_info = + shm_toc_lookup(toc, node->ss.ps.plan->plan_node_id, true); + node->am_worker = true; +} + +/* ---------------------------------------------------------------- + * ExecSortRetrieveInstrumentation + * + * Transfer sort statistics from DSM to private memory. + * ---------------------------------------------------------------- + */ +void +ExecSortRetrieveInstrumentation(SortState *node) +{ + Size size; + SharedSortInfo *si; + + if (node->shared_info == NULL) + return; + + size = offsetof(SharedSortInfo, sinstrument) + + node->shared_info->num_workers * sizeof(TuplesortInstrumentation); + si = palloc(size); + memcpy(si, node->shared_info, size); + node->shared_info = si; +} diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c index 676cb9bd..ad5d9988 100644 --- a/src/backend/utils/sort/tuplesort.c +++ b/src/backend/utils/sort/tuplesort.c @@ -3379,13 +3379,10 @@ tuplesort_restorepos(Tuplesortstate *state) * * This can be called after tuplesort_performsort() finishes to obtain * printable summary information about how the sort was performed. - * spaceUsed is measured in kilobytes. 
*/ void tuplesort_get_stats(Tuplesortstate *state, - const char **sortMethod, - const char **spaceType, - long *spaceUsed) + TuplesortInstrumentation *stats) { /* * Note: it might seem we should provide both memory and disk usage for a @@ -3398,35 +3395,68 @@ tuplesort_get_stats(Tuplesortstate *state, */ if (state->tapeset) { - *spaceType = "Disk"; - *spaceUsed = LogicalTapeSetBlocks(state->tapeset) * (BLCKSZ / 1024); + stats->spaceType = SORT_SPACE_TYPE_DISK; + stats->spaceUsed = LogicalTapeSetBlocks(state->tapeset) * (BLCKSZ / 1024); } else { - *spaceType = "Memory"; - *spaceUsed = (state->allowedMem - state->availMem + 1023) / 1024; + stats->spaceType = SORT_SPACE_TYPE_MEMORY; + stats->spaceUsed = (state->allowedMem - state->availMem + 1023) / 1024; } switch (state->status) { case TSS_SORTEDINMEM: if (state->boundUsed) - *sortMethod = "top-N heapsort"; + stats->sortMethod = SORT_TYPE_TOP_N_HEAPSORT; else - *sortMethod = "quicksort"; + stats->sortMethod = SORT_TYPE_QUICKSORT; break; case TSS_SORTEDONTAPE: - *sortMethod = "external sort"; + stats->sortMethod = SORT_TYPE_EXTERNAL_SORT; break; case TSS_FINALMERGE: - *sortMethod = "external merge"; + stats->sortMethod = SORT_TYPE_EXTERNAL_MERGE; break; default: - *sortMethod = "still in progress"; + stats->sortMethod = SORT_TYPE_STILL_IN_PROGRESS; break; } } +/* + * Convert TuplesortMethod to a string. + */ +const char * +tuplesort_method_name(TuplesortMethod m) +{ + switch (m) + { + case SORT_TYPE_STILL_IN_PROGRESS: + return "still in progress"; + case SORT_TYPE_TOP_N_HEAPSORT: + return "top-N heapsort"; + case SORT_TYPE_QUICKSORT: + return "quicksort"; + case SORT_TYPE_EXTERNAL_SORT: + return "external sort"; + case SORT_TYPE_EXTERNAL_MERGE: + return "external merge"; + } + + return "unknown"; +} + +/* + * Convert TuplesortSpaceType to a string. + */ +const char * +tuplesort_space_type_name(TuplesortSpaceType t) +{ + Assert(t == SORT_SPACE_TYPE_DISK || t == SORT_SPACE_TYPE_MEMORY); + return t == SORT_SPACE_TYPE_DISK ? "Disk" : "Memory"; +} + /* * Heap manipulation routines, per Knuth's Algorithm 5.2.3H. 
diff --git a/src/include/executor/nodeSort.h b/src/include/executor/nodeSort.h index fcf6b765..77ac0659 100644 --- a/src/include/executor/nodeSort.h +++ b/src/include/executor/nodeSort.h @@ -14,6 +14,7 @@ #ifndef NODESORT_H #define NODESORT_H +#include "access/parallel.h" #include "nodes/execnodes.h" extern SortState *ExecInitSort(Sort *node, EState *estate, int eflags); @@ -22,4 +23,10 @@ extern void ExecSortMarkPos(SortState *node); extern void ExecSortRestrPos(SortState *node); extern void ExecReScanSort(SortState *node); -#endif /* NODESORT_H */ +/* parallel instrumentation support */ +extern void ExecSortEstimate(SortState *node, ParallelContext *pcxt); +extern void ExecSortInitializeDSM(SortState *node, ParallelContext *pcxt); +extern void ExecSortInitializeWorker(SortState *node, shm_toc *toc); +extern void ExecSortRetrieveInstrumentation(SortState *node); + +#endif /* NODESORT_H */ diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index 1fdf29fe..cedcf547 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -1965,6 +1965,16 @@ typedef struct ReDistributeState + ReDistributeBufferTotalSize * numWorkers * numWorkers) #endif +/* ---------------- + * Shared memory container for per-worker sort information + * ---------------- + */ +typedef struct SharedSortInfo +{ + int num_workers; + TuplesortInstrumentation sinstrument[FLEXIBLE_ARRAY_MEMBER]; +} SharedSortInfo; + /* ---------------- * SortState information * ---------------- @@ -1979,6 +1989,8 @@ typedef struct SortState bool bounded_Done; /* value of bounded we did the sort with */ int64 bound_Done; /* value of bound we did the sort with */ void *tuplesortstate; /* private state of tuplesort.c */ + bool am_worker; /* are we a worker? */ + SharedSortInfo *shared_info; /* one entry per worker */ #ifdef __TBASE__ Size stateLen; ReDistributeState *state; diff --git a/src/include/utils/tuplesort.h b/src/include/utils/tuplesort.h index 1b5ada2c..f3e81c70 100644 --- a/src/include/utils/tuplesort.h +++ b/src/include/utils/tuplesort.h @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * * tuplesort.h - * Generalized tuple sorting routines. + * Generalized tuple sorting routines. * * This module handles sorting of heap tuples, index tuples, or single * Datums (and could easily support other kinds of sortable objects, @@ -35,6 +35,34 @@ struct ResponseCombiner; */ typedef struct Tuplesortstate Tuplesortstate; +/* + * Data structures for reporting sort statistics. Note that + * TuplesortInstrumentation can't contain any pointers because we + * sometimes put it in shared memory. 
+ */ +typedef enum +{ + SORT_TYPE_STILL_IN_PROGRESS = 0, + SORT_TYPE_TOP_N_HEAPSORT, + SORT_TYPE_QUICKSORT, + SORT_TYPE_EXTERNAL_SORT, + SORT_TYPE_EXTERNAL_MERGE +} TuplesortMethod; + +typedef enum +{ + SORT_SPACE_TYPE_DISK, + SORT_SPACE_TYPE_MEMORY +} TuplesortSpaceType; + +typedef struct TuplesortInstrumentation +{ + TuplesortMethod sortMethod; /* sort algorithm used */ + TuplesortSpaceType spaceType; /* type of space spaceUsed represents */ + long spaceUsed; /* space consumption, in kB */ +} TuplesortInstrumentation; + + /* * We provide multiple interfaces to what is essentially the same code, * since different callers have different data to be sorted and want to @@ -63,66 +91,66 @@ typedef struct Tuplesortstate Tuplesortstate; */ extern Tuplesortstate *tuplesort_begin_heap(TupleDesc tupDesc, - int nkeys, AttrNumber *attNums, - Oid *sortOperators, Oid *sortCollations, - bool *nullsFirstFlags, - int workMem, bool randomAccess); + int nkeys, AttrNumber *attNums, + Oid *sortOperators, Oid *sortCollations, + bool *nullsFirstFlags, + int workMem, bool randomAccess); extern Tuplesortstate *tuplesort_begin_cluster(TupleDesc tupDesc, - Relation indexRel, - int workMem, bool randomAccess); + Relation indexRel, + int workMem, bool randomAccess); extern Tuplesortstate *tuplesort_begin_index_btree(Relation heapRel, - Relation indexRel, - bool enforceUnique, - int workMem, bool randomAccess); + Relation indexRel, + bool enforceUnique, + int workMem, bool randomAccess); extern Tuplesortstate *tuplesort_begin_index_hash(Relation heapRel, - Relation indexRel, - uint32 high_mask, - uint32 low_mask, - uint32 max_buckets, - int workMem, bool randomAccess); + Relation indexRel, + uint32 high_mask, + uint32 low_mask, + uint32 max_buckets, + int workMem, bool randomAccess); extern Tuplesortstate *tuplesort_begin_datum(Oid datumType, - Oid sortOperator, Oid sortCollation, - bool nullsFirstFlag, - int workMem, bool randomAccess); + Oid sortOperator, Oid sortCollation, + bool nullsFirstFlag, + int workMem, bool randomAccess); #ifdef PGXC extern Tuplesortstate *tuplesort_begin_merge(TupleDesc tupDesc, - int nkeys, AttrNumber *attNums, - Oid *sortOperators, Oid *sortCollations, bool *nullsFirstFlags, - struct ResponseCombiner *combiner, - int workMem); + int nkeys, AttrNumber *attNums, + Oid *sortOperators, Oid *sortCollations, bool *nullsFirstFlags, + struct ResponseCombiner *combiner, + int workMem); #endif extern void tuplesort_set_bound(Tuplesortstate *state, int64 bound); extern void tuplesort_puttupleslot(Tuplesortstate *state, - TupleTableSlot *slot); + TupleTableSlot *slot); extern void tuplesort_putheaptuple(Tuplesortstate *state, HeapTuple tup); extern void tuplesort_putindextuplevalues(Tuplesortstate *state, - Relation rel, ItemPointer self, - Datum *values, bool *isnull); + Relation rel, ItemPointer self, + Datum *values, bool *isnull); extern void tuplesort_putdatum(Tuplesortstate *state, Datum val, - bool isNull); + bool isNull); extern void tuplesort_performsort(Tuplesortstate *state); extern bool tuplesort_gettupleslot(Tuplesortstate *state, bool forward, - bool copy, TupleTableSlot *slot, Datum *abbrev); + bool copy, TupleTableSlot *slot, Datum *abbrev); extern HeapTuple tuplesort_getheaptuple(Tuplesortstate *state, bool forward); extern IndexTuple tuplesort_getindextuple(Tuplesortstate *state, bool forward); extern bool tuplesort_getdatum(Tuplesortstate *state, bool forward, - Datum *val, bool *isNull, Datum *abbrev); + Datum *val, bool *isNull, Datum *abbrev); extern bool 
tuplesort_skiptuples(Tuplesortstate *state, int64 ntuples, - bool forward); + bool forward); extern void tuplesort_end(Tuplesortstate *state); extern void tuplesort_get_stats(Tuplesortstate *state, - const char **sortMethod, - const char **spaceType, - long *spaceUsed); + TuplesortInstrumentation *stats); +extern const char *tuplesort_method_name(TuplesortMethod m); +extern const char *tuplesort_space_type_name(TuplesortSpaceType t); -extern int tuplesort_merge_order(int64 allowedMem); +extern int tuplesort_merge_order(int64 allowedMem); /* * These routines may only be called if randomAccess was specified 'true'. @@ -134,4 +162,4 @@ extern void tuplesort_rescan(Tuplesortstate *state); extern void tuplesort_markpos(Tuplesortstate *state); extern void tuplesort_restorepos(Tuplesortstate *state); -#endif /* TUPLESORT_H */ +#endif /* TUPLESORT_H */ diff --git a/src/test/regress/expected/subselect.out b/src/test/regress/expected/subselect.out index c9dc3101..c573fbda 100644 --- a/src/test/regress/expected/subselect.out +++ b/src/test/regress/expected/subselect.out @@ -1781,6 +1781,36 @@ DROP TABLE sub_t2; DROP TABLE sub_interfere1; DROP TABLE sub_interfere2; set enable_pullup_subquery to false; +-- Test that LIMIT can be pushed to SORT through a subquery that just projects +-- columns. We check for that having happened by looking to see if EXPLAIN +-- ANALYZE shows that a top-N sort was used. We must suppress or filter away +-- all the non-invariant parts of the EXPLAIN ANALYZE output. +-- +create table sq_limit (pk int primary key, c1 int, c2 int); +insert into sq_limit values + (1, 1, 1), + (2, 2, 2), + (3, 3, 3), + (4, 4, 4), + (5, 1, 1), + (6, 2, 2), + (7, 3, 3), + (8, 4, 4); +create function explain_sq_limit() returns setof text language plpgsql as +$$ +declare ln text; +begin + for ln in + explain (analyze, summary off, timing off, costs off) + select * from (select pk,c2 from sq_limit order by c1,pk) as x limit 3 + loop + ln := regexp_replace(ln, 'Memory: \S*', 'Memory: xxx'); + -- this case might occur if force_parallel_mode is on: + ln := regexp_replace(ln, 'Worker 0: Sort Method', 'Sort Method'); + return next ln; + end loop; +end; +$$; -- -- Tests for CTE inlining behavior -- @@ -2097,3 +2127,21 @@ from date_dim (0 rows) drop table catalog_sales, catalog_returns, date_dim; +-- not in optimization +create table notin_t1 (id1 int, num1 int not null); +create table notin_t2 (id2 int, num2 int not null); +explain(costs off) select num1 from notin_t1 where num1 not in (select num2 from notin_t2); + QUERY PLAN +------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on notin_t1 + Filter: (NOT (hashed SubPlan 1)) + SubPlan 1 + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on notin_t2 +(6 rows) + +drop table notin_t1; +drop table notin_t2; +drop function explain_sq_limit(); +drop table sq_limit; diff --git a/src/test/regress/sql/subselect.sql b/src/test/regress/sql/subselect.sql index 256ddefa..8b5db9a2 100644 --- a/src/test/regress/sql/subselect.sql +++ b/src/test/regress/sql/subselect.sql @@ -728,6 +728,37 @@ DROP TABLE sub_t2; DROP TABLE sub_interfere1; DROP TABLE sub_interfere2; set enable_pullup_subquery to false; +-- Test that LIMIT can be pushed to SORT through a subquery that just projects +-- columns. We check for that having happened by looking to see if EXPLAIN +-- ANALYZE shows that a top-N sort was used. 
We must suppress or filter away +-- all the non-invariant parts of the EXPLAIN ANALYZE output. +-- +create table sq_limit (pk int primary key, c1 int, c2 int); +insert into sq_limit values + (1, 1, 1), + (2, 2, 2), + (3, 3, 3), + (4, 4, 4), + (5, 1, 1), + (6, 2, 2), + (7, 3, 3), + (8, 4, 4); + +create function explain_sq_limit() returns setof text language plpgsql as +$$ +declare ln text; +begin + for ln in + explain (analyze, summary off, timing off, costs off) + select * from (select pk,c2 from sq_limit order by c1,pk) as x limit 3 + loop + ln := regexp_replace(ln, 'Memory: \S*', 'Memory: xxx'); + -- this case might occur if force_parallel_mode is on: + ln := regexp_replace(ln, 'Worker 0: Sort Method', 'Sort Method'); + return next ln; + end loop; +end; +$$; -- -- Tests for CTE inlining behavior @@ -848,4 +879,14 @@ with cs as select 1 from date_dim join cs on (cs_sold_year=d_year and cs_item_sk=cs_item_sk); -drop table catalog_sales, catalog_returns, date_dim; \ No newline at end of file +drop table catalog_sales, catalog_returns, date_dim; + +-- not in optimization +create table notin_t1 (id1 int, num1 int not null); +create table notin_t2 (id2 int, num2 int not null); +explain(costs off) select num1 from notin_t1 where num1 not in (select num2 from notin_t2); +drop table notin_t1; +drop table notin_t2; +drop function explain_sq_limit(); + +drop table sq_limit; From 283e8ce8559ce6ae0834e17b8acfefd898159b0b Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 16 Aug 2017 16:18:41 +0300 Subject: [PATCH 135/578] Use atomic ops to hand out pages to scan in parallel scan. With a lot of CPUs, the spinlock that protects the current scan location in a parallel scan can become a bottleneck. Use an atomic fetch-and-add instruction instead. David Rowley Discussion: https://www.postgresql.org/message-id/CAKJS1f9tgsPhqBcoPjv9_KUPZvTLCZ4jy%3DB%3DbhqgaKn7cYzm-w@mail.gmail.com --- src/backend/access/heap/heapam.c | 104 ++++++++++++++++++------------- src/include/access/relscan.h | 5 +- 2 files changed, 63 insertions(+), 46 deletions(-) diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index ab7920d8..5f6eb658 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -59,6 +59,7 @@ #include "catalog/namespace.h" #include "miscadmin.h" #include "pgstat.h" +#include "port/atomics.h" #include "storage/bufmgr.h" #include "storage/freespace.h" #include "storage/lmgr.h" @@ -106,6 +107,7 @@ static HeapScanDesc heap_beginscan_internal(Relation relation, bool is_bitmapscan, bool is_samplescan, bool temp_snap); +static void heap_parallelscan_startblock_init(HeapScanDesc scan); static BlockNumber heap_parallelscan_nextpage(HeapScanDesc scan); static HeapTuple heap_prepare_insert(Relation relation, HeapTuple tup, TransactionId xid, CommandId cid, int options); @@ -549,6 +551,8 @@ heapgettup(HeapScanDesc scan, } if (scan->rs_parallel != NULL) { + heap_parallelscan_startblock_init(scan); + page = heap_parallelscan_nextpage(scan); /* Other processes might have already finished the scan. */ @@ -929,6 +933,8 @@ heapgettup_pagemode(HeapScanDesc scan, } if (scan->rs_parallel != NULL) { + heap_parallelscan_startblock_init(scan); + page = heap_parallelscan_nextpage(scan); /* Other processes might have already finished the scan. 
*/ @@ -1744,14 +1750,10 @@ heap_rescan(HeapScanDesc scan, /* * Caller is responsible for making sure that all workers have - * finished the scan before calling this, so it really shouldn't be - * necessary to acquire the mutex at all. We acquire it anyway, just - * to be tidy. + * finished the scan before calling this. */ parallel_scan = scan->rs_parallel; - SpinLockAcquire(¶llel_scan->phs_mutex); - parallel_scan->phs_cblock = parallel_scan->phs_startblock; - SpinLockRelease(¶llel_scan->phs_mutex); + pg_atomic_write_u64(¶llel_scan->phs_nallocated, 0); } } @@ -1909,8 +1911,8 @@ heap_parallelscan_initialize(ParallelHeapScanDesc target, Relation relation, !RelationUsesLocalBuffers(relation) && target->phs_nblocks > NBuffers / 4; SpinLockInit(&target->phs_mutex); - target->phs_cblock = InvalidBlockNumber; target->phs_startblock = InvalidBlockNumber; + pg_atomic_write_u64(&target->phs_nallocated, 0); SerializeSnapshot(snapshot, target->phs_snapshot_data); } @@ -1934,20 +1936,17 @@ heap_beginscan_parallel(Relation relation, ParallelHeapScanDesc parallel_scan) } /* ---------------- - * heap_parallelscan_nextpage - get the next page to scan + * heap_parallelscan_startblock_init - find and set the scan's startblock * - * Get the next page to scan. Even if there are no pages left to scan, - * another backend could have grabbed a page to scan and not yet finished - * looking at it, so it doesn't follow that the scan is done when the - * first backend gets an InvalidBlockNumber return. + * Determine where the parallel seq scan should start. This function may + * be called many times, once by each parallel worker. We must be careful + * only to set the startblock once. * ---------------- */ -static BlockNumber -heap_parallelscan_nextpage(HeapScanDesc scan) -{// #lizard forgives - BlockNumber page = InvalidBlockNumber; +static void +heap_parallelscan_startblock_init(HeapScanDesc scan) +{ BlockNumber sync_startpage = InvalidBlockNumber; - BlockNumber report_page = InvalidBlockNumber; ParallelHeapScanDesc parallel_scan; Assert(scan->rs_parallel); @@ -1979,46 +1978,63 @@ heap_parallelscan_nextpage(HeapScanDesc scan) sync_startpage = ss_get_location(scan->rs_rd, scan->rs_nblocks); goto retry; } - parallel_scan->phs_cblock = parallel_scan->phs_startblock; + } + SpinLockRelease(¶llel_scan->phs_mutex); } - /* - * The current block number is the next one that needs to be scanned, - * unless it's InvalidBlockNumber already, in which case there are no more - * blocks to scan. After remembering the current value, we must advance - * it so that the next call to this function returns the next block to be - * scanned. +/* ---------------- + * heap_parallelscan_nextpage - get the next page to scan + * + * Get the next page to scan. Even if there are no pages left to scan, + * another backend could have grabbed a page to scan and not yet finished + * looking at it, so it doesn't follow that the scan is done when the + * first backend gets an InvalidBlockNumber return. + * ---------------- */ - page = parallel_scan->phs_cblock; - if (page != InvalidBlockNumber) - { - parallel_scan->phs_cblock++; - if (parallel_scan->phs_cblock >= scan->rs_nblocks) - parallel_scan->phs_cblock = 0; - if (parallel_scan->phs_cblock == parallel_scan->phs_startblock) +static BlockNumber +heap_parallelscan_nextpage(HeapScanDesc scan) { - parallel_scan->phs_cblock = InvalidBlockNumber; - report_page = parallel_scan->phs_startblock; - } - } - - /* Release the lock. 
*/ - SpinLockRelease(¶llel_scan->phs_mutex); + BlockNumber page; + ParallelHeapScanDesc parallel_scan; + uint64 nallocated; + + Assert(scan->rs_parallel); + parallel_scan = scan->rs_parallel; + + /* + * phs_nallocated tracks how many pages have been allocated to workers + * already. When phs_nallocated >= rs_nblocks, all blocks have been + * allocated. + * + * Because we use an atomic fetch-and-add to fetch the current value, the + * phs_nallocated counter will exceed rs_nblocks, because workers will + * still increment the value, when they try to allocate the next block but + * all blocks have been allocated already. The counter must be 64 bits + * wide because of that, to avoid wrapping around when rs_nblocks is close + * to 2^32. + * + * The actual page to return is calculated by adding the counter to the + * starting block number, modulo nblocks. + */ + nallocated = pg_atomic_fetch_add_u64(¶llel_scan->phs_nallocated, 1); + if (nallocated >= scan->rs_nblocks) + page = InvalidBlockNumber; /* all blocks have been allocated */ + else + page = (nallocated + parallel_scan->phs_startblock) % scan->rs_nblocks; /* * Report scan location. Normally, we report the current page number. * When we reach the end of the scan, though, we report the starting page, * not the ending page, just so the starting positions for later scans * doesn't slew backwards. We only report the position at the end of the - * scan once, though: subsequent callers will have report nothing, since - * they will have page == InvalidBlockNumber. + * scan once, though: subsequent callers will report nothing. */ if (scan->rs_syncscan) { - if (report_page == InvalidBlockNumber) - report_page = page; - if (report_page != InvalidBlockNumber) - ss_report_location(scan->rs_rd, report_page); + if (page != InvalidBlockNumber) + ss_report_location(scan->rs_rd, page); + else if (nallocated == scan->rs_nblocks) + ss_report_location(scan->rs_rd, parallel_scan->phs_startblock); } return page; diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h index 35cf3f10..79e8cab7 100644 --- a/src/include/access/relscan.h +++ b/src/include/access/relscan.h @@ -97,9 +97,10 @@ typedef struct ParallelHeapScanDescData Oid phs_relid; /* OID of relation to scan */ bool phs_syncscan; /* report location to syncscan logic? */ BlockNumber phs_nblocks; /* # blocks in relation at start of scan */ - slock_t phs_mutex; /* mutual exclusion for block number fields */ + slock_t phs_mutex; /* mutual exclusion for setting startblock */ BlockNumber phs_startblock; /* starting block number */ - BlockNumber phs_cblock; /* current block number */ + pg_atomic_uint64 phs_nallocated; /* number of blocks allocated to + * workers so far. */ char phs_snapshot_data[FLEXIBLE_ARRAY_MEMBER]; } ParallelHeapScanDescData; From 80ceed7e71a5349323ad54d4325f104058e6e2cf Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Thu, 16 Nov 2017 17:28:11 -0800 Subject: [PATCH 136/578] Provide DSM segment to ExecXXXInitializeWorker functions. Previously, executor nodes running in parallel worker processes didn't have access to the dsm_segment object used for parallel execution. In order to support resource management based on DSM segment lifetime, they need that. So create a ParallelWorkerContext object to hold it and pass it to all InitializeWorker functions. 
Author: Thomas Munro Reviewed-By: Andres Freund Discussion: https://postgr.es/m/CAEepm=2W=cOkiZxcg6qiFQP-dHUe09aqTrEMM7yJDrHMhDv_RA@mail.gmail.com --- src/backend/executor/execParallel.c | 44 ++- src/backend/executor/nodeAgg.c | 3 +- src/backend/executor/nodeBitmapHeapscan.c | 5 +- src/backend/executor/nodeCustom.c | 284 +++++++------- src/backend/executor/nodeForeignscan.c | 7 +- src/backend/executor/nodeHashjoin.c | 6 +- src/backend/executor/nodeIndexonlyscan.c | 5 +- src/backend/executor/nodeIndexscan.c | 5 +- src/backend/executor/nodeSeqscan.c | 445 +++++++++++----------- src/backend/executor/nodeSort.c | 4 +- src/backend/pgxc/pool/execRemote.c | 8 +- src/include/access/parallel.h | 6 + src/include/executor/nodeAgg.h | 2 +- src/include/executor/nodeBitmapHeapscan.h | 8 +- src/include/executor/nodeCustom.h | 10 +- src/include/executor/nodeForeignscan.h | 8 +- src/include/executor/nodeHashjoin.h | 2 +- src/include/executor/nodeIndexonlyscan.h | 8 +- src/include/executor/nodeIndexscan.h | 18 +- src/include/executor/nodeSeqscan.h | 6 +- src/include/executor/nodeSort.h | 3 +- src/include/pgxc/execRemote.h | 4 +- src/tools/pgindent/typedefs.list | 1 + 23 files changed, 466 insertions(+), 426 deletions(-) diff --git a/src/backend/executor/execParallel.c b/src/backend/executor/execParallel.c index 45dc56fb..75a82009 100644 --- a/src/backend/executor/execParallel.c +++ b/src/backend/executor/execParallel.c @@ -1093,8 +1093,8 @@ ExecParallelReportInstrumentation(PlanState *planstate, * is allocated and initialized by executor; that is, after ExecutorStart(). */ static bool -ExecParallelInitializeWorker(PlanState *planstate, shm_toc *toc) -{// #lizard forgives +ExecParallelInitializeWorker(PlanState *planstate, ParallelWorkerContext *pwcxt) +{ if (planstate == NULL) return false; @@ -1102,49 +1102,50 @@ ExecParallelInitializeWorker(PlanState *planstate, shm_toc *toc) { case T_SeqScanState: if (planstate->plan->parallel_aware) - ExecSeqScanInitializeWorker((SeqScanState *) planstate, toc); + ExecSeqScanInitializeWorker((SeqScanState *) planstate, pwcxt); break; case T_IndexScanState: if (planstate->plan->parallel_aware) - ExecIndexScanInitializeWorker((IndexScanState *) planstate, toc); + ExecIndexScanInitializeWorker((IndexScanState *) planstate, + pwcxt); break; case T_IndexOnlyScanState: if (planstate->plan->parallel_aware) - ExecIndexOnlyScanInitializeWorker((IndexOnlyScanState *) planstate, toc); + ExecIndexOnlyScanInitializeWorker((IndexOnlyScanState *) planstate, + pwcxt); break; case T_ForeignScanState: if (planstate->plan->parallel_aware) ExecForeignScanInitializeWorker((ForeignScanState *) planstate, - toc); + pwcxt); break; case T_CustomScanState: if (planstate->plan->parallel_aware) ExecCustomScanInitializeWorker((CustomScanState *) planstate, - toc); + pwcxt); break; case T_BitmapHeapScanState: if (planstate->plan->parallel_aware) - ExecBitmapHeapInitializeWorker( - (BitmapHeapScanState *) planstate, toc); + ExecBitmapHeapInitializeWorker((BitmapHeapScanState *) planstate, pwcxt); break; case T_SortState: /* even when not parallel-aware */ - ExecSortInitializeWorker((SortState *) planstate, toc); + ExecSortInitializeWorker((SortState *) planstate, pwcxt); #ifdef __TBASE__ if (planstate->plan->parallel_aware) - ReDistributeInitializeWorker(planstate, toc); + ReDistributeInitializeWorker(planstate, pwcxt); break; case T_RemoteQueryState: if (planstate->plan->parallel_aware) - ExecRemoteQueryInitializeDSMWorker((RemoteQueryState *)planstate, toc); + 
ExecRemoteQueryInitializeDSMWorker((RemoteQueryState *)planstate, pwcxt); break; case T_RemoteSubplanState: if (planstate->plan->parallel_aware) - ExecRemoteSubPlanInitDSMWorker((RemoteSubplanState *)planstate, toc); - break; - case T_HashJoinState: - if (planstate->plan->parallel_aware) - ExecParallelHashJoinInitWorker((HashJoinState *) planstate, toc); + ExecRemoteSubPlanInitializeDSMWorker((RemoteSubplanState *)planstate, pwcxt); + break; + case T_HashJoinState: + if (planstate->plan->parallel_aware) + ExecParallelHashJoinInitializeWorker((HashJoinState *) planstate, pwcxt); break; case T_AggState: if (planstate->plan->parallel_aware) @@ -1152,7 +1153,7 @@ ExecParallelInitializeWorker(PlanState *planstate, shm_toc *toc) AggState *aggstate = (AggState *)planstate; if (aggstate->aggstrategy == AGG_HASHED) - ReDistributeInitializeWorker(planstate, toc); + ReDistributeInitializeWorker(planstate, pwcxt); } break; #endif @@ -1160,7 +1161,8 @@ ExecParallelInitializeWorker(PlanState *planstate, shm_toc *toc) break; } - return planstate_tree_walker(planstate, ExecParallelInitializeWorker, toc); + return planstate_tree_walker(planstate, ExecParallelInitializeWorker, + pwcxt); } /* @@ -1189,6 +1191,7 @@ ParallelQueryMain(dsm_segment *seg, shm_toc *toc) int instrument_options = 0; void *area_space; dsa_area *area; + ParallelWorkerContext pwcxt; #ifdef __TBASE__ int i = 0; int nWorkers = 0; @@ -1320,6 +1323,9 @@ ParallelQueryMain(dsm_segment *seg, shm_toc *toc) } } #endif + pwcxt.toc = toc; + pwcxt.seg = seg; + ExecParallelInitializeWorker(queryDesc->planstate, &pwcxt); /* Start up the executor */ ExecutorStart(queryDesc, 0); diff --git a/src/backend/executor/nodeAgg.c b/src/backend/executor/nodeAgg.c index d771a28e..9f5678f5 100644 --- a/src/backend/executor/nodeAgg.c +++ b/src/backend/executor/nodeAgg.c @@ -5435,10 +5435,11 @@ ReDistributeInitializeDSM(PlanState *node, ParallelContext *pcxt) } void -ReDistributeInitializeWorker(PlanState *node, shm_toc *toc) +ReDistributeInitializeWorker(PlanState *node, ParallelWorkerContext *pwcxt) { int offset = 0; int i = 0; + shm_toc *toc = pwcxt->toc; ReDistributeState *state = NULL; ReDistributeState *rd_state = NULL; volatile ParallelWorkerStatus *numParallelWorkers = NULL; diff --git a/src/backend/executor/nodeBitmapHeapscan.c b/src/backend/executor/nodeBitmapHeapscan.c index 2230a9c4..511dab7f 100644 --- a/src/backend/executor/nodeBitmapHeapscan.c +++ b/src/backend/executor/nodeBitmapHeapscan.c @@ -1061,12 +1061,13 @@ ExecBitmapHeapInitializeDSM(BitmapHeapScanState *node, * ---------------------------------------------------------------- */ void -ExecBitmapHeapInitializeWorker(BitmapHeapScanState *node, shm_toc *toc) +ExecBitmapHeapInitializeWorker(BitmapHeapScanState *node, + ParallelWorkerContext *pwcxt) { ParallelBitmapHeapState *pstate; Snapshot snapshot; - pstate = shm_toc_lookup(toc, node->ss.ps.plan->plan_node_id, false); + pstate = shm_toc_lookup(pwcxt->toc, node->ss.ps.plan->plan_node_id, false); node->pstate = pstate; snapshot = RestoreSnapshot(pstate->phs_snapshot_data); diff --git a/src/backend/executor/nodeCustom.c b/src/backend/executor/nodeCustom.c index 7f7d78a2..4640093c 100644 --- a/src/backend/executor/nodeCustom.c +++ b/src/backend/executor/nodeCustom.c @@ -1,7 +1,7 @@ /* ------------------------------------------------------------------------ * * nodeCustom.c - * Routines to handle execution of custom scan node + * Routines to handle execution of custom scan node * * Portions Copyright (c) 1996-2017, PostgreSQL Global Development 
Group * Portions Copyright (c) 1994, Regents of the University of California @@ -31,51 +31,51 @@ static TupleTableSlot *ExecCustomScan(PlanState *pstate); CustomScanState * ExecInitCustomScan(CustomScan *cscan, EState *estate, int eflags) { - CustomScanState *css; - Relation scan_rel = NULL; - Index scanrelid = cscan->scan.scanrelid; - Index tlistvarno; - - /* - * Allocate the CustomScanState object. We let the custom scan provider - * do the palloc, in case it wants to make a larger object that embeds - * CustomScanState as the first field. It must set the node tag and the - * methods field correctly at this time. Other standard fields should be - * set to zero. - */ - css = castNode(CustomScanState, - cscan->methods->CreateCustomScanState(cscan)); - - /* ensure flags is filled correctly */ - css->flags = cscan->flags; - - /* fill up fields of ScanState */ - css->ss.ps.plan = &cscan->scan.plan; - css->ss.ps.state = estate; - css->ss.ps.ExecProcNode = ExecCustomScan; - - /* create expression context for node */ - ExecAssignExprContext(estate, &css->ss.ps); - - /* initialize child expressions */ - css->ss.ps.qual = - ExecInitQual(cscan->scan.plan.qual, (PlanState *) css); - - /* tuple table initialization */ - ExecInitScanTupleSlot(estate, &css->ss); - ExecInitResultTupleSlot(estate, &css->ss.ps); - - /* - * open the base relation, if any, and acquire an appropriate lock on it - */ - if (scanrelid > 0) - { - scan_rel = ExecOpenScanRelation(estate, scanrelid, eflags); - css->ss.ss_currentRelation = scan_rel; + CustomScanState *css; + Relation scan_rel = NULL; + Index scanrelid = cscan->scan.scanrelid; + Index tlistvarno; + + /* + * Allocate the CustomScanState object. We let the custom scan provider + * do the palloc, in case it wants to make a larger object that embeds + * CustomScanState as the first field. It must set the node tag and the + * methods field correctly at this time. Other standard fields should be + * set to zero. + */ + css = castNode(CustomScanState, + cscan->methods->CreateCustomScanState(cscan)); + + /* ensure flags is filled correctly */ + css->flags = cscan->flags; + + /* fill up fields of ScanState */ + css->ss.ps.plan = &cscan->scan.plan; + css->ss.ps.state = estate; + css->ss.ps.ExecProcNode = ExecCustomScan; + + /* create expression context for node */ + ExecAssignExprContext(estate, &css->ss.ps); + + /* initialize child expressions */ + css->ss.ps.qual = + ExecInitQual(cscan->scan.plan.qual, (PlanState *) css); + + /* tuple table initialization */ + ExecInitScanTupleSlot(estate, &css->ss); + ExecInitResultTupleSlot(estate, &css->ss.ps); + + /* + * open the base relation, if any, and acquire an appropriate lock on it + */ + if (scanrelid > 0) + { + scan_rel = ExecOpenScanRelation(estate, scanrelid, eflags); + css->ss.ss_currentRelation = scan_rel; #ifdef _MLS_ mls_check_datamask_need_passby((ScanState*)css, scan_rel->rd_id); #endif - } + } else { #ifdef _MLS_ @@ -83,149 +83,165 @@ ExecInitCustomScan(CustomScan *cscan, EState *estate, int eflags) #endif } - /* - * Determine the scan tuple type. If the custom scan provider provided a - * targetlist describing the scan tuples, use that; else use base - * relation's rowtype. 
- */ - if (cscan->custom_scan_tlist != NIL || scan_rel == NULL) - { - TupleDesc scan_tupdesc; - - scan_tupdesc = ExecTypeFromTL(cscan->custom_scan_tlist, false); - ExecAssignScanType(&css->ss, scan_tupdesc); - /* Node's targetlist will contain Vars with varno = INDEX_VAR */ - tlistvarno = INDEX_VAR; - } - else - { - ExecAssignScanType(&css->ss, RelationGetDescr(scan_rel)); - /* Node's targetlist will contain Vars with varno = scanrelid */ - tlistvarno = scanrelid; - } - - /* - * Initialize result tuple type and projection info. - */ - ExecAssignResultTypeFromTL(&css->ss.ps); - ExecAssignScanProjectionInfoWithVarno(&css->ss, tlistvarno); - - /* - * The callback of custom-scan provider applies the final initialization - * of the custom-scan-state node according to its logic. - */ - css->methods->BeginCustomScan(css, estate, eflags); - - return css; + /* + * Determine the scan tuple type. If the custom scan provider provided a + * targetlist describing the scan tuples, use that; else use base + * relation's rowtype. + */ + if (cscan->custom_scan_tlist != NIL || scan_rel == NULL) + { + TupleDesc scan_tupdesc; + + scan_tupdesc = ExecTypeFromTL(cscan->custom_scan_tlist, false); + ExecAssignScanType(&css->ss, scan_tupdesc); + /* Node's targetlist will contain Vars with varno = INDEX_VAR */ + tlistvarno = INDEX_VAR; + } + else + { + ExecAssignScanType(&css->ss, RelationGetDescr(scan_rel)); + /* Node's targetlist will contain Vars with varno = scanrelid */ + tlistvarno = scanrelid; + } + + /* + * Initialize result tuple type and projection info. + */ + ExecAssignResultTypeFromTL(&css->ss.ps); + ExecAssignScanProjectionInfoWithVarno(&css->ss, tlistvarno); + + /* + * The callback of custom-scan provider applies the final initialization + * of the custom-scan-state node according to its logic. 
+ */ + css->methods->BeginCustomScan(css, estate, eflags); + + return css; } static TupleTableSlot * ExecCustomScan(PlanState *pstate) { - CustomScanState *node = castNode(CustomScanState, pstate); + CustomScanState *node = castNode(CustomScanState, pstate); - CHECK_FOR_INTERRUPTS(); + CHECK_FOR_INTERRUPTS(); - Assert(node->methods->ExecCustomScan != NULL); - return node->methods->ExecCustomScan(node); + Assert(node->methods->ExecCustomScan != NULL); + return node->methods->ExecCustomScan(node); } void ExecEndCustomScan(CustomScanState *node) { - Assert(node->methods->EndCustomScan != NULL); - node->methods->EndCustomScan(node); + Assert(node->methods->EndCustomScan != NULL); + node->methods->EndCustomScan(node); - /* Free the exprcontext */ - ExecFreeExprContext(&node->ss.ps); + /* Free the exprcontext */ + ExecFreeExprContext(&node->ss.ps); - /* Clean out the tuple table */ - ExecClearTuple(node->ss.ps.ps_ResultTupleSlot); - ExecClearTuple(node->ss.ss_ScanTupleSlot); + /* Clean out the tuple table */ + ExecClearTuple(node->ss.ps.ps_ResultTupleSlot); + ExecClearTuple(node->ss.ss_ScanTupleSlot); - /* Close the heap relation */ - if (node->ss.ss_currentRelation) - ExecCloseScanRelation(node->ss.ss_currentRelation); + /* Close the heap relation */ + if (node->ss.ss_currentRelation) + ExecCloseScanRelation(node->ss.ss_currentRelation); } void ExecReScanCustomScan(CustomScanState *node) { - Assert(node->methods->ReScanCustomScan != NULL); - node->methods->ReScanCustomScan(node); + Assert(node->methods->ReScanCustomScan != NULL); + node->methods->ReScanCustomScan(node); } void ExecCustomMarkPos(CustomScanState *node) { - if (!node->methods->MarkPosCustomScan) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("custom scan \"%s\" does not support MarkPos", - node->methods->CustomName))); - node->methods->MarkPosCustomScan(node); + if (!node->methods->MarkPosCustomScan) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("custom scan \"%s\" does not support MarkPos", + node->methods->CustomName))); + node->methods->MarkPosCustomScan(node); } void ExecCustomRestrPos(CustomScanState *node) { - if (!node->methods->RestrPosCustomScan) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("custom scan \"%s\" does not support MarkPos", - node->methods->CustomName))); - node->methods->RestrPosCustomScan(node); + if (!node->methods->RestrPosCustomScan) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("custom scan \"%s\" does not support MarkPos", + node->methods->CustomName))); + node->methods->RestrPosCustomScan(node); } void ExecCustomScanEstimate(CustomScanState *node, ParallelContext *pcxt) { - const CustomExecMethods *methods = node->methods; - - if (methods->EstimateDSMCustomScan) - { - node->pscan_len = methods->EstimateDSMCustomScan(node, pcxt); - shm_toc_estimate_chunk(&pcxt->estimator, node->pscan_len); - shm_toc_estimate_keys(&pcxt->estimator, 1); - } + const CustomExecMethods *methods = node->methods; + + if (methods->EstimateDSMCustomScan) + { + node->pscan_len = methods->EstimateDSMCustomScan(node, pcxt); + shm_toc_estimate_chunk(&pcxt->estimator, node->pscan_len); + shm_toc_estimate_keys(&pcxt->estimator, 1); + } } void ExecCustomScanInitializeDSM(CustomScanState *node, ParallelContext *pcxt) { - const CustomExecMethods *methods = node->methods; + const CustomExecMethods *methods = node->methods; - if (methods->InitializeDSMCustomScan) - { - int plan_node_id = node->ss.ps.plan->plan_node_id; - void *coordinate; + if 
(methods->InitializeDSMCustomScan) + { + int plan_node_id = node->ss.ps.plan->plan_node_id; + void *coordinate; - coordinate = shm_toc_allocate(pcxt->toc, node->pscan_len); - methods->InitializeDSMCustomScan(node, pcxt, coordinate); - shm_toc_insert(pcxt->toc, plan_node_id, coordinate); - } + coordinate = shm_toc_allocate(pcxt->toc, node->pscan_len); + methods->InitializeDSMCustomScan(node, pcxt, coordinate); + shm_toc_insert(pcxt->toc, plan_node_id, coordinate); + } } void -ExecCustomScanInitializeWorker(CustomScanState *node, shm_toc *toc) +ExecCustomScanReInitializeDSM(CustomScanState *node, ParallelContext *pcxt) { - const CustomExecMethods *methods = node->methods; + const CustomExecMethods *methods = node->methods; - if (methods->InitializeWorkerCustomScan) - { - int plan_node_id = node->ss.ps.plan->plan_node_id; - void *coordinate; + if (methods->ReInitializeDSMCustomScan) + { + int plan_node_id = node->ss.ps.plan->plan_node_id; + void *coordinate; - coordinate = shm_toc_lookup(toc, plan_node_id, false); - methods->InitializeWorkerCustomScan(node, toc, coordinate); - } + coordinate = shm_toc_lookup(pcxt->toc, plan_node_id, false); + methods->ReInitializeDSMCustomScan(node, pcxt, coordinate); + } +} + +void +ExecCustomScanInitializeWorker(CustomScanState *node, + ParallelWorkerContext *pwcxt) +{ + const CustomExecMethods *methods = node->methods; + + if (methods->InitializeWorkerCustomScan) + { + int plan_node_id = node->ss.ps.plan->plan_node_id; + void *coordinate; + + coordinate = shm_toc_lookup(pwcxt->toc, plan_node_id, false); + methods->InitializeWorkerCustomScan(node, pwcxt->toc, coordinate); + } } void ExecShutdownCustomScan(CustomScanState *node) { - const CustomExecMethods *methods = node->methods; + const CustomExecMethods *methods = node->methods; - if (methods->ShutdownCustomScan) - methods->ShutdownCustomScan(node); + if (methods->ShutdownCustomScan) + methods->ShutdownCustomScan(node); } diff --git a/src/backend/executor/nodeForeignscan.c b/src/backend/executor/nodeForeignscan.c index 07da53e3..69eeda5c 100644 --- a/src/backend/executor/nodeForeignscan.c +++ b/src/backend/executor/nodeForeignscan.c @@ -356,7 +356,8 @@ ExecForeignScanInitializeDSM(ForeignScanState *node, ParallelContext *pcxt) * ---------------------------------------------------------------- */ void -ExecForeignScanInitializeWorker(ForeignScanState *node, shm_toc *toc) +ExecForeignScanInitializeWorker(ForeignScanState *node, + ParallelWorkerContext *pwcxt) { FdwRoutine *fdwroutine = node->fdwroutine; @@ -365,8 +366,8 @@ ExecForeignScanInitializeWorker(ForeignScanState *node, shm_toc *toc) int plan_node_id = node->ss.ps.plan->plan_node_id; void *coordinate; - coordinate = shm_toc_lookup(toc, plan_node_id, false); - fdwroutine->InitializeWorkerForeignScan(node, toc, coordinate); + coordinate = shm_toc_lookup(pwcxt->toc, plan_node_id, false); + fdwroutine->InitializeWorkerForeignScan(node, pwcxt->toc, coordinate); } } diff --git a/src/backend/executor/nodeHashjoin.c b/src/backend/executor/nodeHashjoin.c index 6d57ae37..c6446cad 100644 --- a/src/backend/executor/nodeHashjoin.c +++ b/src/backend/executor/nodeHashjoin.c @@ -1422,14 +1422,14 @@ ExecParallelHashJoinInitializeDSM(HashJoinState *node, * ---------------------------------------------------------------- */ void -ExecParallelHashJoinInitWorker(HashJoinState *node, shm_toc *toc) +ExecParallelHashJoinInitWorker(HashJoinState *node, ParallelWorkerContext *pwcxt) { int offset = 0; ParallelHashJoinState *parallelState = NULL; volatile ParallelWorkerStatus 
*numParallelWorkers = NULL; - parallelState = shm_toc_lookup(toc, node->js.ps.plan->plan_node_id, false); - numParallelWorkers = GetParallelWorkerStatusInfo(toc); + parallelState = shm_toc_lookup(pwcxt->toc, node->js.ps.plan->plan_node_id, false); + numParallelWorkers = GetParallelWorkerStatusInfo(pwcxt->toc); node->hj_parallelState = (ParallelHashJoinState *)palloc0(sizeof(ParallelHashJoinState)); diff --git a/src/backend/executor/nodeIndexonlyscan.c b/src/backend/executor/nodeIndexonlyscan.c index 1ae02cf5..73df1306 100644 --- a/src/backend/executor/nodeIndexonlyscan.c +++ b/src/backend/executor/nodeIndexonlyscan.c @@ -734,11 +734,12 @@ ExecIndexOnlyScanInitializeDSM(IndexOnlyScanState *node, * ---------------------------------------------------------------- */ void -ExecIndexOnlyScanInitializeWorker(IndexOnlyScanState *node, shm_toc *toc) +ExecIndexOnlyScanInitializeWorker(IndexOnlyScanState *node, + ParallelWorkerContext *pwcxt) { ParallelIndexScanDesc piscan; - piscan = shm_toc_lookup(toc, node->ss.ps.plan->plan_node_id, false); + piscan = shm_toc_lookup(pwcxt->toc, node->ss.ps.plan->plan_node_id, false); node->ioss_ScanDesc = index_beginscan_parallel(node->ss.ss_currentRelation, node->ioss_RelationDesc, diff --git a/src/backend/executor/nodeIndexscan.c b/src/backend/executor/nodeIndexscan.c index 9cda2201..23dfff75 100644 --- a/src/backend/executor/nodeIndexscan.c +++ b/src/backend/executor/nodeIndexscan.c @@ -1777,11 +1777,12 @@ ExecIndexScanInitializeDSM(IndexScanState *node, * ---------------------------------------------------------------- */ void -ExecIndexScanInitializeWorker(IndexScanState *node, shm_toc *toc) +ExecIndexScanInitializeWorker(IndexScanState *node, + ParallelWorkerContext *pwcxt) { ParallelIndexScanDesc piscan; - piscan = shm_toc_lookup(toc, node->ss.ps.plan->plan_node_id, false); + piscan = shm_toc_lookup(pwcxt->toc, node->ss.ps.plan->plan_node_id, false); node->iss_ScanDesc = index_beginscan_parallel(node->ss.ss_currentRelation, node->iss_RelationDesc, diff --git a/src/backend/executor/nodeSeqscan.c b/src/backend/executor/nodeSeqscan.c index 42e2313f..e04a2be9 100644 --- a/src/backend/executor/nodeSeqscan.c +++ b/src/backend/executor/nodeSeqscan.c @@ -1,28 +1,28 @@ /*------------------------------------------------------------------------- * * nodeSeqscan.c - * Support routines for sequential scans of relations. + * Support routines for sequential scans of relations. * * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * * IDENTIFICATION - * src/backend/executor/nodeSeqscan.c + * src/backend/executor/nodeSeqscan.c * *------------------------------------------------------------------------- */ /* * INTERFACE ROUTINES - * ExecSeqScan sequentially scans a relation. - * ExecSeqNext retrieve next tuple in sequential order. - * ExecInitSeqScan creates and initializes a seqscan node. - * ExecEndSeqScan releases any storage allocated. - * ExecReScanSeqScan rescans the relation + * ExecSeqScan sequentially scans a relation. + * ExecSeqNext retrieve next tuple in sequential order. + * ExecInitSeqScan creates and initializes a seqscan node. + * ExecEndSeqScan releases any storage allocated. 
+ * ExecReScanSeqScan rescans the relation * - * ExecSeqScanEstimate estimates DSM space needed for parallel scan - * ExecSeqScanInitializeDSM initialize DSM for parallel scan - * ExecSeqScanInitializeWorker attach to DSM info in parallel worker + * ExecSeqScanEstimate estimates DSM space needed for parallel scan + * ExecSeqScanInitializeDSM initialize DSM for parallel scan + * ExecSeqScanInitializeWorker attach to DSM info in parallel worker */ #include "postgres.h" @@ -43,82 +43,82 @@ static bool InitScanRelation(SeqScanState *node, EState *estate, int eflags); static TupleTableSlot *SeqNext(SeqScanState *node); /* ---------------------------------------------------------------- - * Scan Support + * Scan Support * ---------------------------------------------------------------- */ /* ---------------------------------------------------------------- - * SeqNext + * SeqNext * - * This is a workhorse for ExecSeqScan + * This is a workhorse for ExecSeqScan * ---------------------------------------------------------------- */ static TupleTableSlot * SeqNext(SeqScanState *node) { - HeapTuple tuple; - HeapScanDesc scandesc; - EState *estate; - ScanDirection direction; - TupleTableSlot *slot; - - /* - * get information from the estate and scan state - */ - scandesc = node->ss.ss_currentScanDesc; - estate = node->ss.ps.state; - direction = estate->es_direction; - slot = node->ss.ss_ScanTupleSlot; - - if (scandesc == NULL) - { - /* - * We reach here if the scan is not parallel, or if we're executing a - * scan that was intended to be parallel serially. - */ - scandesc = heap_beginscan(node->ss.ss_currentRelation, - estate->es_snapshot, - 0, NULL); - if(enable_distri_print) - { - elog(LOG, "seq scan snapshot local %d start ts "INT64_FORMAT " rel %s", estate->es_snapshot->local, - estate->es_snapshot->start_ts, RelationGetRelationName(node->ss.ss_currentRelation)); - } - node->ss.ss_currentScanDesc = scandesc; - } + HeapTuple tuple; + HeapScanDesc scandesc; + EState *estate; + ScanDirection direction; + TupleTableSlot *slot; + + /* + * get information from the estate and scan state + */ + scandesc = node->ss.ss_currentScanDesc; + estate = node->ss.ps.state; + direction = estate->es_direction; + slot = node->ss.ss_ScanTupleSlot; + + if (scandesc == NULL) + { + /* + * We reach here if the scan is not parallel, or if we're executing a + * scan that was intended to be parallel serially. + */ + scandesc = heap_beginscan(node->ss.ss_currentRelation, + estate->es_snapshot, + 0, NULL); + if(enable_distri_print) + { + elog(LOG, "seq scan snapshot local %d start ts "INT64_FORMAT " rel %s", estate->es_snapshot->local, + estate->es_snapshot->start_ts, RelationGetRelationName(node->ss.ss_currentRelation)); + } + node->ss.ss_currentScanDesc = scandesc; + } - /* - * get the next tuple from the table - */ - tuple = heap_getnext(scandesc, direction); + /* + * get the next tuple from the table + */ + tuple = heap_getnext(scandesc, direction); - if(enable_distri_debug) - { - if(tuple) - { - scandesc->rs_scan_number++; - } - } + if(enable_distri_debug) + { + if(tuple) + { + scandesc->rs_scan_number++; + } + } - /* - * save the tuple and the buffer returned to us by the access methods in - * our scan tuple slot and return the slot. Note: we pass 'false' because - * tuples returned by heap_getnext() are pointers onto disk pages and were - * not created with palloc() and so should not be pfree()'d. 
Note also - * that ExecStoreTuple will increment the refcount of the buffer; the - * refcount will not be dropped until the tuple table slot is cleared. - */ - if (tuple) - ExecStoreTuple(tuple, /* tuple to store */ - slot, /* slot to store in */ - scandesc->rs_cbuf, /* buffer associated with this - * tuple */ - false); /* don't pfree this pointer */ - else - ExecClearTuple(slot); - - return slot; + /* + * save the tuple and the buffer returned to us by the access methods in + * our scan tuple slot and return the slot. Note: we pass 'false' because + * tuples returned by heap_getnext() are pointers onto disk pages and were + * not created with palloc() and so should not be pfree()'d. Note also + * that ExecStoreTuple will increment the refcount of the buffer; the + * refcount will not be dropped until the tuple table slot is cleared. + */ + if (tuple) + ExecStoreTuple(tuple, /* tuple to store */ + slot, /* slot to store in */ + scandesc->rs_cbuf, /* buffer associated with this + * tuple */ + false); /* don't pfree this pointer */ + else + ExecClearTuple(slot); + + return slot; } /* @@ -127,64 +127,64 @@ SeqNext(SeqScanState *node) static bool SeqRecheck(SeqScanState *node, TupleTableSlot *slot) { - /* - * Note that unlike IndexScan, SeqScan never use keys in heap_beginscan - * (and this is very bad) - so, here we do not check are keys ok or not. - */ - return true; + /* + * Note that unlike IndexScan, SeqScan never use keys in heap_beginscan + * (and this is very bad) - so, here we do not check are keys ok or not. + */ + return true; } /* ---------------------------------------------------------------- - * ExecSeqScan(node) + * ExecSeqScan(node) * - * Scans the relation sequentially and returns the next qualifying - * tuple. - * We call the ExecScan() routine and pass it the appropriate - * access method functions. + * Scans the relation sequentially and returns the next qualifying + * tuple. + * We call the ExecScan() routine and pass it the appropriate + * access method functions. * ---------------------------------------------------------------- */ static TupleTableSlot * ExecSeqScan(PlanState *pstate) { - SeqScanState *node = castNode(SeqScanState, pstate); + SeqScanState *node = castNode(SeqScanState, pstate); - return ExecScan(&node->ss, - (ExecScanAccessMtd) SeqNext, - (ExecScanRecheckMtd) SeqRecheck); + return ExecScan(&node->ss, + (ExecScanAccessMtd) SeqNext, + (ExecScanRecheckMtd) SeqRecheck); } /* ---------------------------------------------------------------- - * InitScanRelation + * InitScanRelation * - * Set up to access the scan relation. + * Set up to access the scan relation. * ---------------------------------------------------------------- */ static bool InitScanRelation(SeqScanState *node, EState *estate, int eflags) { - Relation currentRelation; + Relation currentRelation; - /* - * get the relation object id from the relid'th entry in the range table, - * open that relation and acquire appropriate lock on it. - */ + /* + * get the relation object id from the relid'th entry in the range table, + * open that relation and acquire appropriate lock on it. 
+ */ #ifdef __TBASE__ - /* if interval partition, scan child table instead */ - if(((SeqScan *) node->ss.ps.plan)->ispartchild) - { - currentRelation = ExecOpenScanRelationPartition(estate, - ((SeqScan *) node->ss.ps.plan)->scanrelid, - eflags, - ((SeqScan *) node->ss.ps.plan)->childidx); - } - else - { + /* if interval partition, scan child table instead */ + if(((SeqScan *) node->ss.ps.plan)->ispartchild) + { + currentRelation = ExecOpenScanRelationPartition(estate, + ((SeqScan *) node->ss.ps.plan)->scanrelid, + eflags, + ((SeqScan *) node->ss.ps.plan)->childidx); + } + else + { #endif - currentRelation = ExecOpenScanRelation(estate, - ((SeqScan *) node->ss.ps.plan)->scanrelid, - eflags); + currentRelation = ExecOpenScanRelation(estate, + ((SeqScan *) node->ss.ps.plan)->scanrelid, + eflags); #ifdef __TBASE__ - } + } #endif if (!currentRelation) @@ -196,56 +196,56 @@ InitScanRelation(SeqScanState *node, EState *estate, int eflags) mls_check_datamask_need_passby((ScanState*)node, currentRelation->rd_id); #endif - node->ss.ss_currentRelation = currentRelation; + node->ss.ss_currentRelation = currentRelation; - /* and report the scan tuple slot's rowtype */ - ExecAssignScanType(&node->ss, RelationGetDescr(currentRelation)); + /* and report the scan tuple slot's rowtype */ + ExecAssignScanType(&node->ss, RelationGetDescr(currentRelation)); return true; } /* ---------------------------------------------------------------- - * ExecInitSeqScan + * ExecInitSeqScan * --------------------------------------------------------------- */ SeqScanState * ExecInitSeqScan(SeqScan *node, EState *estate, int eflags) { - SeqScanState *scanstate; + SeqScanState *scanstate; bool init_ret = true; #ifdef __AUDIT_FGA__ ListCell *item; #endif - /* - * Once upon a time it was possible to have an outerPlan of a SeqScan, but - * not any more. - */ - Assert(outerPlan(node) == NULL); - Assert(innerPlan(node) == NULL); - - /* - * create state structure - */ - scanstate = makeNode(SeqScanState); - scanstate->ss.ps.plan = (Plan *) node; - scanstate->ss.ps.state = estate; - scanstate->ss.ps.ExecProcNode = ExecSeqScan; - - /* - * Miscellaneous initialization - * - * create expression context for node - */ - ExecAssignExprContext(estate, &scanstate->ss.ps); - - /* - * initialize child expressions - */ - scanstate->ss.ps.qual = - ExecInitQual(node->plan.qual, (PlanState *) scanstate); + /* + * Once upon a time it was possible to have an outerPlan of a SeqScan, but + * not any more. 
+ */ + Assert(outerPlan(node) == NULL); + Assert(innerPlan(node) == NULL); + + /* + * create state structure + */ + scanstate = makeNode(SeqScanState); + scanstate->ss.ps.plan = (Plan *) node; + scanstate->ss.ps.state = estate; + scanstate->ss.ps.ExecProcNode = ExecSeqScan; + + /* + * Miscellaneous initialization + * + * create expression context for node + */ + ExecAssignExprContext(estate, &scanstate->ss.ps); + + /* + * initialize child expressions + */ + scanstate->ss.ps.qual = + ExecInitQual(node->plan.qual, (PlanState *) scanstate); #ifdef __AUDIT_FGA__ if (enable_fga) @@ -268,15 +268,15 @@ ExecInitSeqScan(SeqScan *node, EState *estate, int eflags) } #endif - /* - * tuple table initialization - */ - ExecInitResultTupleSlot(estate, &scanstate->ss.ps); - ExecInitScanTupleSlot(estate, &scanstate->ss); + /* + * tuple table initialization + */ + ExecInitResultTupleSlot(estate, &scanstate->ss.ps); + ExecInitScanTupleSlot(estate, &scanstate->ss); - /* - * initialize scan relation - */ + /* + * initialize scan relation + */ init_ret = InitScanRelation(scanstate, estate, eflags); if (!init_ret) { @@ -284,137 +284,138 @@ ExecInitSeqScan(SeqScan *node, EState *estate, int eflags) return NULL; } - /* - * Initialize result tuple type and projection info. - */ - ExecAssignResultTypeFromTL(&scanstate->ss.ps); - ExecAssignScanProjectionInfo(&scanstate->ss); + /* + * Initialize result tuple type and projection info. + */ + ExecAssignResultTypeFromTL(&scanstate->ss.ps); + ExecAssignScanProjectionInfo(&scanstate->ss); - return scanstate; + return scanstate; } /* ---------------------------------------------------------------- - * ExecEndSeqScan + * ExecEndSeqScan * - * frees any storage allocated through C routines. + * frees any storage allocated through C routines. * ---------------------------------------------------------------- */ void ExecEndSeqScan(SeqScanState *node) { - Relation relation; - HeapScanDesc scanDesc; - - /* - * get information from node - */ - relation = node->ss.ss_currentRelation; - scanDesc = node->ss.ss_currentScanDesc; - - /* - * Free the exprcontext - */ - ExecFreeExprContext(&node->ss.ps); - - /* - * clean out the tuple table - */ - ExecClearTuple(node->ss.ps.ps_ResultTupleSlot); - ExecClearTuple(node->ss.ss_ScanTupleSlot); - - /* - * close heap scan - */ - if (scanDesc != NULL) - heap_endscan(scanDesc); - - /* - * close the heap relation. - */ - ExecCloseScanRelation(relation); + Relation relation; + HeapScanDesc scanDesc; + + /* + * get information from node + */ + relation = node->ss.ss_currentRelation; + scanDesc = node->ss.ss_currentScanDesc; + + /* + * Free the exprcontext + */ + ExecFreeExprContext(&node->ss.ps); + + /* + * clean out the tuple table + */ + ExecClearTuple(node->ss.ps.ps_ResultTupleSlot); + ExecClearTuple(node->ss.ss_ScanTupleSlot); + + /* + * close heap scan + */ + if (scanDesc != NULL) + heap_endscan(scanDesc); + + /* + * close the heap relation. + */ + ExecCloseScanRelation(relation); } /* ---------------------------------------------------------------- - * Join Support + * Join Support * ---------------------------------------------------------------- */ /* ---------------------------------------------------------------- - * ExecReScanSeqScan + * ExecReScanSeqScan * - * Rescans the relation. + * Rescans the relation. 
* ---------------------------------------------------------------- */ void ExecReScanSeqScan(SeqScanState *node) { - HeapScanDesc scan; + HeapScanDesc scan; - scan = node->ss.ss_currentScanDesc; + scan = node->ss.ss_currentScanDesc; - if (scan != NULL) - heap_rescan(scan, /* scan desc */ - NULL); /* new scan keys */ + if (scan != NULL) + heap_rescan(scan, /* scan desc */ + NULL); /* new scan keys */ - ExecScanReScan((ScanState *) node); + ExecScanReScan((ScanState *) node); } /* ---------------------------------------------------------------- - * Parallel Scan Support + * Parallel Scan Support * ---------------------------------------------------------------- */ /* ---------------------------------------------------------------- - * ExecSeqScanEstimate + * ExecSeqScanEstimate * - * estimates the space required to serialize seqscan node. + * estimates the space required to serialize seqscan node. * ---------------------------------------------------------------- */ void ExecSeqScanEstimate(SeqScanState *node, - ParallelContext *pcxt) + ParallelContext *pcxt) { - EState *estate = node->ss.ps.state; + EState *estate = node->ss.ps.state; - node->pscan_len = heap_parallelscan_estimate(estate->es_snapshot); - shm_toc_estimate_chunk(&pcxt->estimator, node->pscan_len); - shm_toc_estimate_keys(&pcxt->estimator, 1); + node->pscan_len = heap_parallelscan_estimate(estate->es_snapshot); + shm_toc_estimate_chunk(&pcxt->estimator, node->pscan_len); + shm_toc_estimate_keys(&pcxt->estimator, 1); } /* ---------------------------------------------------------------- - * ExecSeqScanInitializeDSM + * ExecSeqScanInitializeDSM * - * Set up a parallel heap scan descriptor. + * Set up a parallel heap scan descriptor. * ---------------------------------------------------------------- */ void ExecSeqScanInitializeDSM(SeqScanState *node, - ParallelContext *pcxt) + ParallelContext *pcxt) { - EState *estate = node->ss.ps.state; - ParallelHeapScanDesc pscan; - - pscan = shm_toc_allocate(pcxt->toc, node->pscan_len); - heap_parallelscan_initialize(pscan, - node->ss.ss_currentRelation, - estate->es_snapshot); - shm_toc_insert(pcxt->toc, node->ss.ps.plan->plan_node_id, pscan); - node->ss.ss_currentScanDesc = - heap_beginscan_parallel(node->ss.ss_currentRelation, pscan); + EState *estate = node->ss.ps.state; + ParallelHeapScanDesc pscan; + + pscan = shm_toc_allocate(pcxt->toc, node->pscan_len); + heap_parallelscan_initialize(pscan, + node->ss.ss_currentRelation, + estate->es_snapshot); + shm_toc_insert(pcxt->toc, node->ss.ps.plan->plan_node_id, pscan); + node->ss.ss_currentScanDesc = + heap_beginscan_parallel(node->ss.ss_currentRelation, pscan); } /* ---------------------------------------------------------------- - * ExecSeqScanInitializeWorker + * ExecSeqScanInitializeWorker * - * Copy relevant information from TOC into planstate. + * Copy relevant information from TOC into planstate. 
* ---------------------------------------------------------------- */ void -ExecSeqScanInitializeWorker(SeqScanState *node, shm_toc *toc) +ExecSeqScanInitializeWorker(SeqScanState *node, + ParallelWorkerContext *pwcxt) { - ParallelHeapScanDesc pscan; + ParallelHeapScanDesc pscan; - pscan = shm_toc_lookup(toc, node->ss.ps.plan->plan_node_id, false); - node->ss.ss_currentScanDesc = - heap_beginscan_parallel(node->ss.ss_currentRelation, pscan); + pscan = shm_toc_lookup(pwcxt->toc, node->ss.ps.plan->plan_node_id, false); + node->ss.ss_currentScanDesc = + heap_beginscan_parallel(node->ss.ss_currentRelation, pscan); } diff --git a/src/backend/executor/nodeSort.c b/src/backend/executor/nodeSort.c index 3c35d902..d891a645 100644 --- a/src/backend/executor/nodeSort.c +++ b/src/backend/executor/nodeSort.c @@ -513,10 +513,10 @@ ExecSortInitializeDSM(SortState *node, ParallelContext *pcxt) * ---------------------------------------------------------------- */ void -ExecSortInitializeWorker(SortState *node, shm_toc *toc) +ExecSortInitializeWorker(SortState *node, ParallelWorkerContext *pwcxt) { node->shared_info = - shm_toc_lookup(toc, node->ss.ps.plan->plan_node_id, true); + shm_toc_lookup(pwcxt->toc, node->ss.ps.plan->plan_node_id, true); node->am_worker = true; } diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index ab99d828..e3be03b8 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -12021,7 +12021,7 @@ ExecRemoteQueryInitializeDSM(RemoteQueryState *node, void ExecRemoteSubPlanInitDSMWorker(RemoteSubplanState *node, - shm_toc *toc) + ParallelWorkerContext *pwcxt) { int32 i = 0; int32 length = 0; @@ -12033,7 +12033,7 @@ ExecRemoteSubPlanInitDSMWorker(RemoteSubplanState *node, List *locla_exec_nodes = NULL; ListCell *node_list_item = NULL; - worker_status = GetParallelWorkerStatusInfo(toc); + worker_status = GetParallelWorkerStatusInfo(pwcxt->toc); worker_num = ExecGetForWorkerNumber(worker_status); node->parallel_status = worker_status; if (node->execOnAll) @@ -12083,7 +12083,7 @@ ExecRemoteSubPlanInitDSMWorker(RemoteSubplanState *node, } void ExecRemoteQueryInitializeDSMWorker(RemoteQueryState *node, - shm_toc *toc) + ParallelWorkerContext *pwcxt) { int32 worker_num = 0; ParallelWorkerStatus *worker_status = NULL; @@ -12092,7 +12092,7 @@ ExecRemoteQueryInitializeDSMWorker(RemoteQueryState *node, combiner = (ResponseCombiner *) node; step = (RemoteQuery *) combiner->ss.ps.plan; - worker_status = GetParallelWorkerStatusInfo(toc); + worker_status = GetParallelWorkerStatusInfo(pwcxt->toc); worker_num = ExecGetForWorkerNumber(worker_status); node->parallel_status = worker_status; worker_num = worker_num; /* keep compiler quiet. 
*/ diff --git a/src/include/access/parallel.h b/src/include/access/parallel.h index 7f8c75be..2e258ca4 100644 --- a/src/include/access/parallel.h +++ b/src/include/access/parallel.h @@ -118,6 +118,12 @@ typedef struct ParallelWorkerStatus } ParallelWorkerStatus; #endif +typedef struct ParallelWorkerContext +{ + dsm_segment *seg; + shm_toc *toc; +} ParallelWorkerContext; + extern volatile bool ParallelMessagePending; extern int ParallelWorkerNumber; extern bool InitializingParallelWorker; diff --git a/src/include/executor/nodeAgg.h b/src/include/executor/nodeAgg.h index 1f5022aa..7336aaf9 100644 --- a/src/include/executor/nodeAgg.h +++ b/src/include/executor/nodeAgg.h @@ -38,7 +38,7 @@ extern void ReDistributeEstimate(PlanState *node, ParallelContext *pcxt); extern void ReDistributeInitializeDSM(PlanState *node, ParallelContext *pcxt); -extern void ReDistributeInitializeWorker(PlanState *node, shm_toc *toc); +extern void ReDistributeInitializeWorker(PlanState *node, ParallelWorkerContext *pwcxt); extern void InitializeReDistribute(ReDistributeState *state, BufFile ***file); diff --git a/src/include/executor/nodeBitmapHeapscan.h b/src/include/executor/nodeBitmapHeapscan.h index 81d2d40b..ab98a23b 100644 --- a/src/include/executor/nodeBitmapHeapscan.h +++ b/src/include/executor/nodeBitmapHeapscan.h @@ -21,10 +21,10 @@ extern BitmapHeapScanState *ExecInitBitmapHeapScan(BitmapHeapScan *node, EState extern void ExecEndBitmapHeapScan(BitmapHeapScanState *node); extern void ExecReScanBitmapHeapScan(BitmapHeapScanState *node); extern void ExecBitmapHeapEstimate(BitmapHeapScanState *node, - ParallelContext *pcxt); + ParallelContext *pcxt); extern void ExecBitmapHeapInitializeDSM(BitmapHeapScanState *node, - ParallelContext *pcxt); + ParallelContext *pcxt); extern void ExecBitmapHeapInitializeWorker(BitmapHeapScanState *node, - shm_toc *toc); + ParallelWorkerContext *pwcxt); -#endif /* NODEBITMAPHEAPSCAN_H */ +#endif /* NODEBITMAPHEAPSCAN_H */ diff --git a/src/include/executor/nodeCustom.h b/src/include/executor/nodeCustom.h index 743b0bb4..ef99c01b 100644 --- a/src/include/executor/nodeCustom.h +++ b/src/include/executor/nodeCustom.h @@ -20,7 +20,7 @@ * General executor code */ extern CustomScanState *ExecInitCustomScan(CustomScan *custom_scan, - EState *estate, int eflags); + EState *estate, int eflags); extern void ExecEndCustomScan(CustomScanState *node); extern void ExecReScanCustomScan(CustomScanState *node); @@ -31,11 +31,11 @@ extern void ExecCustomRestrPos(CustomScanState *node); * Parallel execution support */ extern void ExecCustomScanEstimate(CustomScanState *node, - ParallelContext *pcxt); + ParallelContext *pcxt); extern void ExecCustomScanInitializeDSM(CustomScanState *node, - ParallelContext *pcxt); + ParallelContext *pcxt); extern void ExecCustomScanInitializeWorker(CustomScanState *node, - shm_toc *toc); + ParallelWorkerContext *pwcxt); extern void ExecShutdownCustomScan(CustomScanState *node); -#endif /* NODECUSTOM_H */ +#endif /* NODECUSTOM_H */ diff --git a/src/include/executor/nodeForeignscan.h b/src/include/executor/nodeForeignscan.h index 6498b632..663bdf77 100644 --- a/src/include/executor/nodeForeignscan.h +++ b/src/include/executor/nodeForeignscan.h @@ -22,11 +22,11 @@ extern void ExecEndForeignScan(ForeignScanState *node); extern void ExecReScanForeignScan(ForeignScanState *node); extern void ExecForeignScanEstimate(ForeignScanState *node, - ParallelContext *pcxt); + ParallelContext *pcxt); extern void ExecForeignScanInitializeDSM(ForeignScanState *node, - ParallelContext 
*pcxt); + ParallelContext *pcxt); extern void ExecForeignScanInitializeWorker(ForeignScanState *node, - shm_toc *toc); + ParallelWorkerContext *pwcxt); extern void ExecShutdownForeignScan(ForeignScanState *node); -#endif /* NODEFOREIGNSCAN_H */ +#endif /* NODEFOREIGNSCAN_H */ diff --git a/src/include/executor/nodeHashjoin.h b/src/include/executor/nodeHashjoin.h index 19a9a5d4..49b04cd3 100644 --- a/src/include/executor/nodeHashjoin.h +++ b/src/include/executor/nodeHashjoin.h @@ -31,7 +31,7 @@ extern void ExecParallelHashJoinEstimate(HashJoinState *node, ParallelContext *p extern void ExecParallelHashJoinInitializeDSM(HashJoinState *node, ParallelContext *pcxt); -extern void ExecParallelHashJoinInitWorker(HashJoinState *node, shm_toc *toc); +extern void ExecParallelHashJoinInitWorker(HashJoinState *node, ParallelWorkerContext *pwcxt); extern void ParallelHashJoinEreport(void); #endif diff --git a/src/include/executor/nodeIndexonlyscan.h b/src/include/executor/nodeIndexonlyscan.h index 7c904f25..8bb3a65c 100644 --- a/src/include/executor/nodeIndexonlyscan.h +++ b/src/include/executor/nodeIndexonlyscan.h @@ -25,10 +25,10 @@ extern void ExecReScanIndexOnlyScan(IndexOnlyScanState *node); /* Support functions for parallel index-only scans */ extern void ExecIndexOnlyScanEstimate(IndexOnlyScanState *node, - ParallelContext *pcxt); + ParallelContext *pcxt); extern void ExecIndexOnlyScanInitializeDSM(IndexOnlyScanState *node, - ParallelContext *pcxt); + ParallelContext *pcxt); extern void ExecIndexOnlyScanInitializeWorker(IndexOnlyScanState *node, - shm_toc *toc); + ParallelWorkerContext *pwcxt); -#endif /* NODEINDEXONLYSCAN_H */ +#endif /* NODEINDEXONLYSCAN_H */ diff --git a/src/include/executor/nodeIndexscan.h b/src/include/executor/nodeIndexscan.h index 17390c2e..ae0f4480 100644 --- a/src/include/executor/nodeIndexscan.h +++ b/src/include/executor/nodeIndexscan.h @@ -24,21 +24,23 @@ extern void ExecIndexRestrPos(IndexScanState *node); extern void ExecReScanIndexScan(IndexScanState *node); extern void ExecIndexScanEstimate(IndexScanState *node, ParallelContext *pcxt); extern void ExecIndexScanInitializeDSM(IndexScanState *node, ParallelContext *pcxt); -extern void ExecIndexScanInitializeWorker(IndexScanState *node, shm_toc *toc); +extern void ExecIndexScanReInitializeDSM(IndexScanState *node, ParallelContext *pcxt); +extern void ExecIndexScanInitializeWorker(IndexScanState *node, + ParallelWorkerContext *pwcxt); /* * These routines are exported to share code with nodeIndexonlyscan.c and * nodeBitmapIndexscan.c */ extern void ExecIndexBuildScanKeys(PlanState *planstate, Relation index, - List *quals, bool isorderby, - ScanKey *scanKeys, int *numScanKeys, - IndexRuntimeKeyInfo **runtimeKeys, int *numRuntimeKeys, - IndexArrayKeyInfo **arrayKeys, int *numArrayKeys); + List *quals, bool isorderby, + ScanKey *scanKeys, int *numScanKeys, + IndexRuntimeKeyInfo **runtimeKeys, int *numRuntimeKeys, + IndexArrayKeyInfo **arrayKeys, int *numArrayKeys); extern void ExecIndexEvalRuntimeKeys(ExprContext *econtext, - IndexRuntimeKeyInfo *runtimeKeys, int numRuntimeKeys); + IndexRuntimeKeyInfo *runtimeKeys, int numRuntimeKeys); extern bool ExecIndexEvalArrayKeys(ExprContext *econtext, - IndexArrayKeyInfo *arrayKeys, int numArrayKeys); + IndexArrayKeyInfo *arrayKeys, int numArrayKeys); extern bool ExecIndexAdvanceArrayKeys(IndexArrayKeyInfo *arrayKeys, int numArrayKeys); -#endif /* NODEINDEXSCAN_H */ +#endif /* NODEINDEXSCAN_H */ diff --git a/src/include/executor/nodeSeqscan.h b/src/include/executor/nodeSeqscan.h 
index aa2653c1..ee3b1a0b 100644 --- a/src/include/executor/nodeSeqscan.h +++ b/src/include/executor/nodeSeqscan.h @@ -24,6 +24,8 @@ extern void ExecReScanSeqScan(SeqScanState *node); /* parallel scan support */ extern void ExecSeqScanEstimate(SeqScanState *node, ParallelContext *pcxt); extern void ExecSeqScanInitializeDSM(SeqScanState *node, ParallelContext *pcxt); -extern void ExecSeqScanInitializeWorker(SeqScanState *node, shm_toc *toc); +extern void ExecSeqScanReInitializeDSM(SeqScanState *node, ParallelContext *pcxt); +extern void ExecSeqScanInitializeWorker(SeqScanState *node, + ParallelWorkerContext *pwcxt); -#endif /* NODESEQSCAN_H */ +#endif /* NODESEQSCAN_H */ diff --git a/src/include/executor/nodeSort.h b/src/include/executor/nodeSort.h index 77ac0659..cc61a9db 100644 --- a/src/include/executor/nodeSort.h +++ b/src/include/executor/nodeSort.h @@ -26,7 +26,8 @@ extern void ExecReScanSort(SortState *node); /* parallel instrumentation support */ extern void ExecSortEstimate(SortState *node, ParallelContext *pcxt); extern void ExecSortInitializeDSM(SortState *node, ParallelContext *pcxt); -extern void ExecSortInitializeWorker(SortState *node, shm_toc *toc); +extern void ExecSortReInitializeDSM(SortState *node, ParallelContext *pcxt); +extern void ExecSortInitializeWorker(SortState *node, ParallelWorkerContext *pwcxt); extern void ExecSortRetrieveInstrumentation(SortState *node); #endif /* NODESORT_H */ diff --git a/src/include/pgxc/execRemote.h b/src/include/pgxc/execRemote.h index 20e73f2e..03b16a62 100644 --- a/src/include/pgxc/execRemote.h +++ b/src/include/pgxc/execRemote.h @@ -413,9 +413,9 @@ extern void set_dbcleanup_callback(xact_callback function, void *paraminfo, int extern void ExecRemoteSubPlanInitializeDSM(RemoteSubplanState *node, ParallelContext *pcxt); extern void ExecRemoteQueryInitializeDSM(RemoteQueryState *node, ParallelContext *pcxt); extern void ExecRemoteSubPlanInitDSMWorker(RemoteSubplanState *node, - shm_toc *toc); + ParallelWorkerContext *pwcxt); extern void ExecRemoteQueryInitializeDSMWorker(RemoteQueryState *node, - shm_toc *toc); + ParallelWorkerContext *pwcxt); extern bool ExecRemoteDML(ModifyTableState *mtstate, ItemPointer tupleid, HeapTuple oldtuple, TupleTableSlot *slot, TupleTableSlot *planSlot, EState *estate, EPQState *epqstate, diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 8166d86c..dedefbdf 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -1530,6 +1530,7 @@ ParallelHeapScanDesc ParallelIndexScanDesc ParallelSlot ParallelState +ParallelWorkerContext ParallelWorkerInfo Param ParamExecData From 6b440a79956653a382210e675233b2df4145131c Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Wed, 30 Aug 2017 13:18:16 -0400 Subject: [PATCH 137/578] Separate reinitialization of shared parallel-scan state from ExecReScan. Previously, the parallel executor logic did reinitialization of shared state within the ExecReScan code for parallel-aware scan nodes. This is problematic, because it means that the ExecReScan call has to occur synchronously (ie, during the parent Gather node's ReScan call). That is swimming very much against the tide so far as the ExecReScan machinery is concerned; the fact that it works at all today depends on a lot of fragile assumptions, such as that no plan node between Gather and a parallel-aware scan node is parameterized. 
Another objection is that because ExecReScan might be called in workers as well as the leader, hacky extra tests are needed in some places to prevent unwanted shared-state resets. Hence, let's separate this code into two functions, a ReInitializeDSM call and the ReScan call proper. ReInitializeDSM is called only in the leader and is guaranteed to run before we start new workers. ReScan is returned to its traditional function of resetting only local state, which means that ExecReScan's usual habits of delaying or eliminating child rescan calls are safe again. As with the preceding commit 7df2c1f8d, it doesn't seem to be necessary to make these changes in 9.6, which is a good thing because the FDW and CustomScan APIs are impacted. Discussion: https://postgr.es/m/CAA4eK1JkByysFJNh9M349u_nNjqETuEnY_y1VUc_kJiU0bxtaQ@mail.gmail.com --- doc/src/sgml/custom-scan.sgml | 29 ++- doc/src/sgml/fdwhandler.sgml | 43 +++- src/backend/access/heap/heapam.c | 28 ++- src/backend/executor/execParallel.c | 97 ++++++-- src/backend/executor/nodeBitmapHeapscan.c | 42 ++-- src/backend/executor/nodeForeignscan.c | 23 +- src/backend/executor/nodeGather.c | 42 +++- src/backend/executor/nodeGatherMerge.c | 41 +++- src/backend/executor/nodeIndexonlyscan.c | 29 ++- src/backend/executor/nodeIndexscan.c | 40 ++-- src/backend/executor/nodeSeqscan.c | 16 ++ src/backend/executor/nodeSort.c | 17 ++ src/include/access/heapam.h | 113 +++++----- src/include/executor/execParallel.h | 25 ++- src/include/executor/nodeBitmapHeapscan.h | 2 + src/include/executor/nodeCustom.h | 2 + src/include/executor/nodeForeignscan.h | 2 + src/include/executor/nodeIndexonlyscan.h | 2 + src/include/foreign/fdwapi.h | 260 +++++++++++----------- src/include/nodes/extensible.h | 121 +++++----- 20 files changed, 606 insertions(+), 368 deletions(-) diff --git a/doc/src/sgml/custom-scan.sgml b/doc/src/sgml/custom-scan.sgml index 6159c3a2..9d1ca7bf 100644 --- a/doc/src/sgml/custom-scan.sgml +++ b/doc/src/sgml/custom-scan.sgml @@ -320,22 +320,39 @@ void (*InitializeDSMCustomScan) (CustomScanState *node, void *coordinate); Initialize the dynamic shared memory that will be required for parallel - operation; coordinate points to an amount of allocated space - equal to the return value of EstimateDSMCustomScan. + operation. coordinate points to a shared memory area of + size equal to the return value of EstimateDSMCustomScan. This callback is optional, and need only be supplied if this custom scan provider supports parallel execution. +void (*ReInitializeDSMCustomScan) (CustomScanState *node, + ParallelContext *pcxt, + void *coordinate); + + Re-initialize the dynamic shared memory required for parallel operation + when the custom-scan plan node is about to be re-scanned. + This callback is optional, and need only be supplied if this custom + scan provider supports parallel execution. + Recommended practice is that this callback reset only shared state, + while the ReScanCustomScan callback resets only local + state. Currently, this callback will be called + before ReScanCustomScan, but it's best not to rely on + that ordering. + + + + void (*InitializeWorkerCustomScan) (CustomScanState *node, shm_toc *toc, void *coordinate); - Initialize a parallel worker's custom state based on the shared state - set up in the leader by InitializeDSMCustomScan. - This callback is optional, and needs only be supplied if this - custom path supports parallel execution. 
+ Initialize a parallel worker's local state based on the shared state + set up by the leader during InitializeDSMCustomScan. + This callback is optional, and need only be supplied if this custom + scan provider supports parallel execution. diff --git a/doc/src/sgml/fdwhandler.sgml b/doc/src/sgml/fdwhandler.sgml index dbeaab55..cfa68084 100644 --- a/doc/src/sgml/fdwhandler.sgml +++ b/doc/src/sgml/fdwhandler.sgml @@ -1191,12 +1191,12 @@ ImportForeignSchema (ImportForeignSchemaStmt *stmt, Oid serverOid); A ForeignScan node can, optionally, support parallel execution. A parallel ForeignScan will be executed - in multiple processes and should return each row only once across + in multiple processes and must return each row exactly once across all cooperating processes. To do this, processes can coordinate through - fixed size chunks of dynamic shared memory. This shared memory is not - guaranteed to be mapped at the same address in every process, so pointers - may not be used. The following callbacks are all optional in general, - but required if parallel execution is to be supported. + fixed-size chunks of dynamic shared memory. This shared memory is not + guaranteed to be mapped at the same address in every process, so it + must not contain pointers. The following functions are all optional, + but most are required if parallel execution is to be supported. @@ -1215,7 +1215,7 @@ IsForeignScanParallelSafe(PlannerInfo *root, RelOptInfo *rel, - If this callback is not defined, it is assumed that the scan must take + If this function is not defined, it is assumed that the scan must take place within the parallel leader. Note that returning true does not mean that the scan itself can be done in parallel, only that the scan can be performed within a parallel worker. Therefore, it can be useful to define @@ -1230,6 +1230,9 @@ EstimateDSMForeignScan(ForeignScanState *node, ParallelContext *pcxt); Estimate the amount of dynamic shared memory that will be required for parallel operation. This may be higher than the amount that will actually be used, but it must not be lower. The return value is in bytes. + This function is optional, and can be omitted if not needed; but if it + is omitted, the next three functions must be omitted as well, because + no shared memory will be allocated for the FDW's use. @@ -1239,8 +1242,25 @@ InitializeDSMForeignScan(ForeignScanState *node, ParallelContext *pcxt, void *coordinate); Initialize the dynamic shared memory that will be required for parallel - operation; coordinate points to an amount of allocated space - equal to the return value of EstimateDSMForeignScan. + operation. coordinate points to a shared memory area of + size equal to the return value of EstimateDSMForeignScan. + This function is optional, and can be omitted if not needed. + + + + +void +ReInitializeDSMForeignScan(ForeignScanState *node, ParallelContext *pcxt, + void *coordinate); + + Re-initialize the dynamic shared memory required for parallel operation + when the foreign-scan plan node is about to be re-scanned. + This function is optional, and can be omitted if not needed. + Recommended practice is that this function reset only shared state, + while the ReScanForeignScan function resets only local + state. Currently, this function will be called + before ReScanForeignScan, but it's best not to rely on + that ordering. 
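For concreteness, the four FDW callbacks documented in this hunk might be wired up as in the minimal sketch below, where the workers hand out chunks of a remote result through one shared atomic counter. Only the callback signatures come from fdwapi.h; the DemoFdwParallelState struct, the next_chunk field, and the demo* function names are hypothetical, and a real FDW would of course carry more state.

#include "postgres.h"

#include "access/parallel.h"
#include "foreign/fdwapi.h"
#include "port/atomics.h"
#include "storage/shm_toc.h"

/* Hypothetical shared state placed in the DSM area sized below. */
typedef struct DemoFdwParallelState
{
    pg_atomic_uint64 next_chunk;    /* next chunk of the remote result to hand out */
} DemoFdwParallelState;

static Size
demoEstimateDSMForeignScan(ForeignScanState *node, ParallelContext *pcxt)
{
    return sizeof(DemoFdwParallelState);
}

static void
demoInitializeDSMForeignScan(ForeignScanState *node, ParallelContext *pcxt,
                             void *coordinate)
{
    DemoFdwParallelState *pstate = (DemoFdwParallelState *) coordinate;

    pg_atomic_init_u64(&pstate->next_chunk, 0);
    node->fdw_state = pstate;           /* leader also scans through the shared state */
}

static void
demoReInitializeDSMForeignScan(ForeignScanState *node, ParallelContext *pcxt,
                               void *coordinate)
{
    /* Reset only shared state; local state is reset by ReScanForeignScan. */
    DemoFdwParallelState *pstate = (DemoFdwParallelState *) coordinate;

    pg_atomic_write_u64(&pstate->next_chunk, 0);
}

static void
demoInitializeWorkerForeignScan(ForeignScanState *node, shm_toc *toc,
                                void *coordinate)
{
    /* Attach the worker's local executor state to the leader-created area. */
    node->fdw_state = coordinate;
}

The worker-side callback (documented in the next hunk) deliberately does no resetting of its own: by the time a worker attaches, the leader has already run ReInitializeDSMForeignScan if this is a rescan.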
@@ -1249,10 +1269,9 @@ void InitializeWorkerForeignScan(ForeignScanState *node, shm_toc *toc, void *coordinate); - Initialize a parallel worker's custom state based on the shared state - set up in the leader by InitializeDSMForeignScan. - This callback is optional, and needs only be supplied if this - custom path supports parallel execution. + Initialize a parallel worker's local state based on the shared state + set up by the leader during InitializeDSMForeignScan. + This function is optional, and can be omitted if not needed. diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index 5f6eb658..97064050 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -1740,21 +1740,6 @@ heap_rescan(HeapScanDesc scan, * reinitialize scan descriptor */ initscan(scan, key, true); - - /* - * reset parallel scan, if present - */ - if (scan->rs_parallel != NULL) - { - ParallelHeapScanDesc parallel_scan; - - /* - * Caller is responsible for making sure that all workers have - * finished the scan before calling this. - */ - parallel_scan = scan->rs_parallel; - pg_atomic_write_u64(¶llel_scan->phs_nallocated, 0); - } } /* ---------------- @@ -1916,6 +1901,19 @@ heap_parallelscan_initialize(ParallelHeapScanDesc target, Relation relation, SerializeSnapshot(snapshot, target->phs_snapshot_data); } +/* ---------------- + * heap_parallelscan_reinitialize - reset a parallel scan + * + * Call this in the leader process. Caller is responsible for + * making sure that all workers have finished the scan beforehand. + * ---------------- + */ +void +heap_parallelscan_reinitialize(ParallelHeapScanDesc parallel_scan) +{ + pg_atomic_write_u64(¶llel_scan->phs_nallocated, 0); +} + /* ---------------- * heap_beginscan_parallel - join a parallel scan * diff --git a/src/backend/executor/execParallel.c b/src/backend/executor/execParallel.c index 75a82009..5ec13c1a 100644 --- a/src/backend/executor/execParallel.c +++ b/src/backend/executor/execParallel.c @@ -144,6 +144,8 @@ static bool ExecParallelInitializeDSM(PlanState *node, ExecParallelInitializeDSMContext *d); static shm_mq_handle **ExecParallelSetupTupleQueues(ParallelContext *pcxt, bool reinitialize); +static bool ExecParallelReInitializeDSM(PlanState *planstate, + ParallelContext *pcxt); static bool ExecParallelRetrieveInstrumentation(PlanState *planstate, SharedExecutorInstrumentation *instrumentation); @@ -415,7 +417,6 @@ ExecParallelInitializeDSM(PlanState *planstate, } break; #endif - default: break; } @@ -475,18 +476,6 @@ ExecParallelSetupTupleQueues(ParallelContext *pcxt, bool reinitialize) return responseq; } -/* - * Re-initialize the parallel executor info such that it can be reused by - * workers. - */ -void -ExecParallelReinitialize(ParallelExecutorInfo *pei) -{ - ReinitializeParallelDSM(pei->pcxt); - pei->tqueue = ExecParallelSetupTupleQueues(pei->pcxt, true); - pei->finished = false; -} - /* * Sets up the required infrastructure for backend workers to perform * execution and return results to the main backend. @@ -875,7 +864,7 @@ ExecInitParallelPlan(PlanState *planstate, EState *estate, int nworkers) ExecParallelInitializeDSM(planstate, &d); /* - * Make sure that the world hasn't shifted under our feat. This could + * Make sure that the world hasn't shifted under our feet. This could * probably just be an Assert(), but let's be conservative for now. 
*/ if (e.nnodes != d.nnodes) @@ -885,6 +874,82 @@ ExecInitParallelPlan(PlanState *planstate, EState *estate, int nworkers) return pei; } +/* + * Re-initialize the parallel executor shared memory state before launching + * a fresh batch of workers. + */ +void +ExecParallelReinitialize(PlanState *planstate, + ParallelExecutorInfo *pei) +{ + /* Old workers must already be shut down */ + Assert(pei->finished); + + ReinitializeParallelDSM(pei->pcxt); + pei->tqueue = ExecParallelSetupTupleQueues(pei->pcxt, true); + pei->finished = false; + + /* Traverse plan tree and let each child node reset associated state. */ + ExecParallelReInitializeDSM(planstate, pei->pcxt); +} + +/* + * Traverse plan tree to reinitialize per-node dynamic shared memory state + */ +static bool +ExecParallelReInitializeDSM(PlanState *planstate, + ParallelContext *pcxt) +{ + if (planstate == NULL) + return false; + + /* + * Call reinitializers for DSM-using plan nodes. + */ + switch (nodeTag(planstate)) + { + case T_SeqScanState: + if (planstate->plan->parallel_aware) + ExecSeqScanReInitializeDSM((SeqScanState *) planstate, + pcxt); + break; + case T_IndexScanState: + if (planstate->plan->parallel_aware) + ExecIndexScanReInitializeDSM((IndexScanState *) planstate, + pcxt); + break; + case T_IndexOnlyScanState: + if (planstate->plan->parallel_aware) + ExecIndexOnlyScanReInitializeDSM((IndexOnlyScanState *) planstate, + pcxt); + break; + case T_ForeignScanState: + if (planstate->plan->parallel_aware) + ExecForeignScanReInitializeDSM((ForeignScanState *) planstate, + pcxt); + break; + case T_CustomScanState: + if (planstate->plan->parallel_aware) + ExecCustomScanReInitializeDSM((CustomScanState *) planstate, + pcxt); + break; + case T_BitmapHeapScanState: + if (planstate->plan->parallel_aware) + ExecBitmapHeapReInitializeDSM((BitmapHeapScanState *) planstate, + pcxt); + break; + case T_SortState: + /* even when not parallel-aware */ + ExecSortReInitializeDSM((SortState *) planstate, pcxt); + break; + + default: + break; + } + + return planstate_tree_walker(planstate, ExecParallelReInitializeDSM, pcxt); +} + /* * Copy instrumentation information about this node and its descendants from * dynamic shared memory. 
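To make the new control flow concrete, the leader-side pattern that the Gather and GatherMerge hunks below adopt boils down to the following sketch. It is condensed and illustrative only: TBase-specific arguments, instrumentation, and error handling are omitted, and the variable names (gatherstate, gather, estate) stand for whatever the calling node has in scope.

    /* First ExecProcNode call on the Gather node after a (re)scan: */
    if (gatherstate->pei == NULL)
        gatherstate->pei = ExecInitParallelPlan(outerPlanState(gatherstate),
                                                estate,
                                                gather->num_workers);
    else
        ExecParallelReinitialize(outerPlanState(gatherstate),
                                 gatherstate->pei);

    LaunchParallelWorkers(gatherstate->pei->pcxt);

    /*
     * ExecParallelReinitialize() resets the parallel context and tuple
     * queues, then walks the plan tree so every parallel-aware node can
     * reset its *shared* DSM state through its ReInitializeDSM callback.
     * Each node's ordinary ExecReScan continues to reset only
     * backend-local state, so the two halves can no longer interfere.
     */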
@@ -1325,14 +1390,12 @@ ParallelQueryMain(dsm_segment *seg, shm_toc *toc) #endif pwcxt.toc = toc; pwcxt.seg = seg; - ExecParallelInitializeWorker(queryDesc->planstate, &pwcxt); - /* Start up the executor */ ExecutorStart(queryDesc, 0); /* Special executor initialization steps for parallel workers */ queryDesc->planstate->state->es_query_dsa = area; - ExecParallelInitializeWorker(queryDesc->planstate, toc); + ExecParallelInitializeWorker(queryDesc->planstate, &pwcxt); /* Run the plan */ ExecutorRun(queryDesc, ForwardScanDirection, 0L, true); diff --git a/src/backend/executor/nodeBitmapHeapscan.c b/src/backend/executor/nodeBitmapHeapscan.c index 511dab7f..7973d3b2 100644 --- a/src/backend/executor/nodeBitmapHeapscan.c +++ b/src/backend/executor/nodeBitmapHeapscan.c @@ -716,23 +716,6 @@ ExecReScanBitmapHeapScan(BitmapHeapScanState *node) node->shared_tbmiterator = NULL; node->shared_prefetch_iterator = NULL; - /* Reset parallel bitmap state, if present */ - if (node->pstate) - { - dsa_area *dsa = node->ss.ps.state->es_query_dsa; - - node->pstate->state = BM_INITIAL; - - if (DsaPointerIsValid(node->pstate->tbmiterator)) - tbm_free_shared_area(dsa, node->pstate->tbmiterator); - - if (DsaPointerIsValid(node->pstate->prefetch_iterator)) - tbm_free_shared_area(dsa, node->pstate->prefetch_iterator); - - node->pstate->tbmiterator = InvalidDsaPointer; - node->pstate->prefetch_iterator = InvalidDsaPointer; - } - ExecScanReScan(&node->ss); /* @@ -1054,6 +1037,31 @@ ExecBitmapHeapInitializeDSM(BitmapHeapScanState *node, node->pstate = pstate; } +/* ---------------------------------------------------------------- + * ExecBitmapHeapReInitializeDSM + * + * Reset shared state before beginning a fresh scan. + * ---------------------------------------------------------------- + */ +void +ExecBitmapHeapReInitializeDSM(BitmapHeapScanState *node, + ParallelContext *pcxt) +{ + ParallelBitmapHeapState *pstate = node->pstate; + dsa_area *dsa = node->ss.ps.state->es_query_dsa; + + pstate->state = BM_INITIAL; + + if (DsaPointerIsValid(pstate->tbmiterator)) + tbm_free_shared_area(dsa, pstate->tbmiterator); + + if (DsaPointerIsValid(pstate->prefetch_iterator)) + tbm_free_shared_area(dsa, pstate->prefetch_iterator); + + pstate->tbmiterator = InvalidDsaPointer; + pstate->prefetch_iterator = InvalidDsaPointer; +} + /* ---------------------------------------------------------------- * ExecBitmapHeapInitializeWorker * diff --git a/src/backend/executor/nodeForeignscan.c b/src/backend/executor/nodeForeignscan.c index 69eeda5c..314ab881 100644 --- a/src/backend/executor/nodeForeignscan.c +++ b/src/backend/executor/nodeForeignscan.c @@ -350,7 +350,28 @@ ExecForeignScanInitializeDSM(ForeignScanState *node, ParallelContext *pcxt) } /* ---------------------------------------------------------------- - * ExecForeignScanInitializeDSM + * ExecForeignScanReInitializeDSM + * + * Reset shared state before beginning a fresh scan. 
+ * ---------------------------------------------------------------- + */ +void +ExecForeignScanReInitializeDSM(ForeignScanState *node, ParallelContext *pcxt) +{ + FdwRoutine *fdwroutine = node->fdwroutine; + + if (fdwroutine->ReInitializeDSMForeignScan) + { + int plan_node_id = node->ss.ps.plan->plan_node_id; + void *coordinate; + + coordinate = shm_toc_lookup(pcxt->toc, plan_node_id, false); + fdwroutine->ReInitializeDSMForeignScan(node, pcxt, coordinate); + } +} + +/* ---------------------------------------------------------------- + * ExecForeignScanInitializeWorker * * Initialization according to the parallel coordination information * ---------------------------------------------------------------- diff --git a/src/backend/executor/nodeGather.c b/src/backend/executor/nodeGather.c index 4154b5d3..9c63e4eb 100644 --- a/src/backend/executor/nodeGather.c +++ b/src/backend/executor/nodeGather.c @@ -177,7 +177,7 @@ ExecGather(PlanState *pstate) ParallelContext *pcxt; ParallelWorkerStatus *num_parallel_workers = NULL; - /* Initialize the workers required to execute Gather node. */ + /* Initialize, or re-initialize, shared state needed by workers. */ if (!node->pei) #ifdef __TBASE__ node->pei = ExecInitParallelPlan(node->ps.lefttree, @@ -189,6 +189,10 @@ ExecGather(PlanState *pstate) estate, gather->num_workers); #endif + else + ExecParallelReinitialize(node->ps.lefttree, + node->pei); + /* * Register backend workers. We might not get as many as we * requested, or indeed any at all. @@ -527,7 +531,7 @@ ExecShutdownGather(GatherState *node) /* ---------------------------------------------------------------- * ExecReScanGather * - * Re-initialize the workers and rescans a relation via them. + * Prepare to re-scan the result of a Gather. * ---------------------------------------------------------------- */ void @@ -539,14 +543,46 @@ ExecReScanGather(GatherState *node) * to propagate any error or other information to master backend before * dying. Parallel context will be reused for rescan. */ + Gather *gather = (Gather *) node->ps.plan; + PlanState *outerPlan = outerPlanState(node); + + /* Make sure any existing workers are gracefully shut down */ ExecShutdownGatherWorkers(node); + /* Mark node so that shared state will be rebuilt at next call */ node->initialized = false; if (node->pei) - ExecParallelReinitialize(node->pei); + ExecParallelReinitialize(&node->ps, node->pei); ExecReScan(node->ps.lefttree); +#if 0 + ======= + /* + * Set child node's chgParam to tell it that the next scan might deliver a + * different set of rows within the leader process. (The overall rowset + * shouldn't change, but the leader process's subset might; hence nodes + * between here and the parallel table scan node mustn't optimize on the + * assumption of an unchanging rowset.) + */ + if (gather->rescan_param >= 0) + outerPlan->chgParam = bms_add_member(outerPlan->chgParam, + gather->rescan_param); + + /* + * If chgParam of subnode is not null then plan will be re-scanned by + * first ExecProcNode. Note: because this does nothing if we have a + * rescan_param, it's currently guaranteed that parallel-aware child nodes + * will not see a ReScan call until after they get a ReInitializeDSM call. + * That ordering might not be something to rely on, though. A good rule + * of thumb is that ReInitializeDSM should reset only shared state, ReScan + * should reset only local state, and anything that depends on both of + * those steps being finished must wait until the first ExecProcNode call. 
+ */ + if (outerPlan->chgParam == NULL) + ExecReScan(outerPlan); +>>>>>>> 41b0dd987d... Separate reinitialization of shared parallel-scan state from ExecReScan. +#endif } #ifdef __TBASE__ void diff --git a/src/backend/executor/nodeGatherMerge.c b/src/backend/executor/nodeGatherMerge.c index 120cbc91..291cf644 100644 --- a/src/backend/executor/nodeGatherMerge.c +++ b/src/backend/executor/nodeGatherMerge.c @@ -197,7 +197,7 @@ ExecGatherMerge(PlanState *pstate) ParallelWorkerStatus *num_parallel_workers = NULL; #endif - /* Initialize data structures for workers. */ + /* Initialize, or re-initialize, shared state needed by workers. */ if (!node->pei) #ifdef __TBASE__ node->pei = ExecInitParallelPlan(node->ps.lefttree, @@ -209,6 +209,9 @@ ExecGatherMerge(PlanState *pstate) estate, gm->num_workers); #endif + else + ExecParallelReinitialize(node->ps.lefttree, + node->pei); /* Try to launch workers. */ pcxt = node->pei->pcxt; @@ -390,7 +393,7 @@ ExecShutdownGatherMergeWorkers(GatherMergeState *node) /* ---------------------------------------------------------------- * ExecReScanGatherMerge * - * Re-initialize the workers and rescans a relation via them. + * Prepare to re-scan the result of a GatherMerge. * ---------------------------------------------------------------- */ void @@ -402,14 +405,46 @@ ExecReScanGatherMerge(GatherMergeState *node) * to propagate any error or other information to master backend before * dying. Parallel context will be reused for rescan. */ + GatherMerge *gm = (GatherMerge *) node->ps.plan; + PlanState *outerPlan = outerPlanState(node); + + /* Make sure any existing workers are gracefully shut down */ ExecShutdownGatherMergeWorkers(node); + /* Mark node so that shared state will be rebuilt at next call */ node->initialized = false; if (node->pei) - ExecParallelReinitialize(node->pei); + ExecParallelReinitialize(&node->ps, node->pei); ExecReScan(node->ps.lefttree); +#if 0 +======= + /* + * Set child node's chgParam to tell it that the next scan might deliver a + * different set of rows within the leader process. (The overall rowset + * shouldn't change, but the leader process's subset might; hence nodes + * between here and the parallel table scan node mustn't optimize on the + * assumption of an unchanging rowset.) + */ + if (gm->rescan_param >= 0) + outerPlan->chgParam = bms_add_member(outerPlan->chgParam, + gm->rescan_param); + + /* + * If chgParam of subnode is not null then plan will be re-scanned by + * first ExecProcNode. Note: because this does nothing if we have a + * rescan_param, it's currently guaranteed that parallel-aware child nodes + * will not see a ReScan call until after they get a ReInitializeDSM call. + * That ordering might not be something to rely on, though. A good rule + * of thumb is that ReInitializeDSM should reset only shared state, ReScan + * should reset only local state, and anything that depends on both of + * those steps being finished must wait until the first ExecProcNode call. + */ + if (outerPlan->chgParam == NULL) + ExecReScan(outerPlan); +>>>>>>> 41b0dd987d... Separate reinitialization of shared parallel-scan state from ExecReScan. 
+#endif } /* diff --git a/src/backend/executor/nodeIndexonlyscan.c b/src/backend/executor/nodeIndexonlyscan.c index 73df1306..2b6d4d61 100644 --- a/src/backend/executor/nodeIndexonlyscan.c +++ b/src/backend/executor/nodeIndexonlyscan.c @@ -25,6 +25,7 @@ * parallel index-only scan * ExecIndexOnlyScanInitializeDSM initialize DSM for parallel * index-only scan + * ExecIndexOnlyScanReInitializeDSM reinitialize DSM for fresh scan * ExecIndexOnlyScanInitializeWorker attach to DSM info in parallel worker */ #include "postgres.h" @@ -347,16 +348,6 @@ ExecIndexOnlyScan(PlanState *pstate) void ExecReScanIndexOnlyScan(IndexOnlyScanState *node) { - bool reset_parallel_scan = true; - - /* - * If we are here to just update the scan keys, then don't reset parallel - * scan. For detailed reason behind this look in the comments for - * ExecReScanIndexScan. - */ - if (node->ioss_NumRuntimeKeys != 0 && !node->ioss_RuntimeKeysReady) - reset_parallel_scan = false; - /* * If we are doing runtime key calculations (ie, any of the index key * values weren't simple Consts), compute the new key values. But first, @@ -377,15 +368,10 @@ ExecReScanIndexOnlyScan(IndexOnlyScanState *node) /* reset index scan */ if (node->ioss_ScanDesc) - { - index_rescan(node->ioss_ScanDesc, node->ioss_ScanKeys, node->ioss_NumScanKeys, node->ioss_OrderByKeys, node->ioss_NumOrderByKeys); - if (reset_parallel_scan && node->ioss_ScanDesc->parallel_scan) - index_parallelrescan(node->ioss_ScanDesc); - } ExecScanReScan(&node->ss); } @@ -727,6 +713,19 @@ ExecIndexOnlyScanInitializeDSM(IndexOnlyScanState *node, node->ioss_OrderByKeys, node->ioss_NumOrderByKeys); } +/* ---------------------------------------------------------------- + * ExecIndexOnlyScanReInitializeDSM + * + * Reset shared state before beginning a fresh scan. + * ---------------------------------------------------------------- + */ +void +ExecIndexOnlyScanReInitializeDSM(IndexOnlyScanState *node, + ParallelContext *pcxt) +{ + index_parallelrescan(node->ioss_ScanDesc); +} + /* ---------------------------------------------------------------- * ExecIndexOnlyScanInitializeWorker * diff --git a/src/backend/executor/nodeIndexscan.c b/src/backend/executor/nodeIndexscan.c index 23dfff75..9e0307a1 100644 --- a/src/backend/executor/nodeIndexscan.c +++ b/src/backend/executor/nodeIndexscan.c @@ -24,6 +24,7 @@ * ExecIndexRestrPos restores scan position. * ExecIndexScanEstimate estimates DSM space needed for parallel index scan * ExecIndexScanInitializeDSM initialize DSM for parallel indexscan + * ExecIndexScanReInitializeDSM reinitialize DSM for fresh scan * ExecIndexScanInitializeWorker attach to DSM info in parallel worker */ #include "postgres.h" @@ -588,19 +589,7 @@ ExecIndexScan(PlanState *pstate) */ void ExecReScanIndexScan(IndexScanState *node) -{// #lizard forgives - bool reset_parallel_scan = true; - - /* - * If we are here to just update the scan keys, then don't reset parallel - * scan. We don't want each of the participating process in the parallel - * scan to update the shared parallel scan state at the start of the scan. - * It is quite possible that one of the participants has already begun - * scanning the index when another has yet to start it. - */ - if (node->iss_NumRuntimeKeys != 0 && !node->iss_RuntimeKeysReady) - reset_parallel_scan = false; - +{ /* * If we are doing runtime key calculations (ie, any of the index key * values weren't simple Consts), compute the new key values. 
But first, @@ -626,21 +615,11 @@ ExecReScanIndexScan(IndexScanState *node) reorderqueue_pop(node); } - /* - * Reset (parallel) index scan. For parallel-aware nodes, the scan - * descriptor is initialized during actual execution of node and we can - * reach here before that (ex. during execution of nest loop join). So, - * avoid updating the scan descriptor at that time. - */ + /* reset index scan */ if (node->iss_ScanDesc) - { index_rescan(node->iss_ScanDesc, node->iss_ScanKeys, node->iss_NumScanKeys, node->iss_OrderByKeys, node->iss_NumOrderByKeys); - - if (reset_parallel_scan && node->iss_ScanDesc->parallel_scan) - index_parallelrescan(node->iss_ScanDesc); - } node->iss_ReachedEnd = false; ExecScanReScan(&node->ss); @@ -1770,6 +1749,19 @@ ExecIndexScanInitializeDSM(IndexScanState *node, node->iss_OrderByKeys, node->iss_NumOrderByKeys); } +/* ---------------------------------------------------------------- + * ExecIndexScanReInitializeDSM + * + * Reset shared state before beginning a fresh scan. + * ---------------------------------------------------------------- + */ +void +ExecIndexScanReInitializeDSM(IndexScanState *node, + ParallelContext *pcxt) +{ + index_parallelrescan(node->iss_ScanDesc); +} + /* ---------------------------------------------------------------- * ExecIndexScanInitializeWorker * diff --git a/src/backend/executor/nodeSeqscan.c b/src/backend/executor/nodeSeqscan.c index e04a2be9..a55b9cbd 100644 --- a/src/backend/executor/nodeSeqscan.c +++ b/src/backend/executor/nodeSeqscan.c @@ -22,6 +22,7 @@ * * ExecSeqScanEstimate estimates DSM space needed for parallel scan * ExecSeqScanInitializeDSM initialize DSM for parallel scan + * ExecSeqScanReInitializeDSM reinitialize DSM for fresh parallel scan * ExecSeqScanInitializeWorker attach to DSM info in parallel worker */ #include "postgres.h" @@ -403,6 +404,21 @@ ExecSeqScanInitializeDSM(SeqScanState *node, heap_beginscan_parallel(node->ss.ss_currentRelation, pscan); } +/* ---------------------------------------------------------------- + * ExecSeqScanReInitializeDSM + * + * Reset shared state before beginning a fresh scan. + * ---------------------------------------------------------------- + */ +void +ExecSeqScanReInitializeDSM(SeqScanState *node, + ParallelContext *pcxt) +{ + HeapScanDesc scan = node->ss.ss_currentScanDesc; + + heap_parallelscan_reinitialize(scan->rs_parallel); +} + /* ---------------------------------------------------------------- * ExecSeqScanInitializeWorker * diff --git a/src/backend/executor/nodeSort.c b/src/backend/executor/nodeSort.c index d891a645..2dd4bf89 100644 --- a/src/backend/executor/nodeSort.c +++ b/src/backend/executor/nodeSort.c @@ -506,6 +506,23 @@ ExecSortInitializeDSM(SortState *node, ParallelContext *pcxt) node->shared_info); } +/* ---------------------------------------------------------------- + * ExecSortReInitializeDSM + * + * Reset shared state before beginning a fresh scan. 
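The seq scan hook above relies on the new heap_parallelscan_reinitialize(), declared in heapam.h later in this patch but with its body outside these hunks. Against the stock PostgreSQL 10 heap code it presumably just rewinds the shared block allocator, roughly as sketched here (phs_cblock and phs_startblock are the field names in the unmodified ParallelHeapScanDescData; the actual TBase implementation may differ):

void
heap_parallelscan_reinitialize(ParallelHeapScanDesc parallel_scan)
{
    /*
     * Rewind the shared position so the next batch of workers starts the
     * scan over.  No spinlock is taken: the leader calls this only after
     * the previous workers have exited and before new ones are launched.
     */
    parallel_scan->phs_cblock = parallel_scan->phs_startblock;
}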
+ * ---------------------------------------------------------------- + */ +void +ExecSortReInitializeDSM(SortState *node, ParallelContext *pcxt) +{ + /* If there's any instrumentation space, clear it for next time */ + if (node->shared_info != NULL) + { + memset(node->shared_info->sinstrument, 0, + node->shared_info->num_workers * sizeof(TuplesortInstrumentation)); + } +} + /* ---------------------------------------------------------------- * ExecSortInitializeWorker * diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h index 33914a93..6cec82fb 100644 --- a/src/include/access/heapam.h +++ b/src/include/access/heapam.h @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * * heapam.h - * POSTGRES heap access method definitions. + * POSTGRES heap access method definitions. * * * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group @@ -25,9 +25,9 @@ /* "options" flag bits for heap_insert */ -#define HEAP_INSERT_SKIP_WAL 0x0001 -#define HEAP_INSERT_SKIP_FSM 0x0002 -#define HEAP_INSERT_FROZEN 0x0004 +#define HEAP_INSERT_SKIP_WAL 0x0001 +#define HEAP_INSERT_SKIP_FSM 0x0002 +#define HEAP_INSERT_FROZEN 0x0004 #define HEAP_INSERT_SPECULATIVE 0x0008 typedef struct BulkInsertStateData *BulkInsertState; @@ -37,17 +37,17 @@ typedef struct BulkInsertStateData *BulkInsertState; */ typedef enum LockTupleMode { - /* SELECT FOR KEY SHARE */ - LockTupleKeyShare, - /* SELECT FOR SHARE */ - LockTupleShare, - /* SELECT FOR NO KEY UPDATE, and UPDATEs that don't modify key columns */ - LockTupleNoKeyExclusive, - /* SELECT FOR UPDATE, UPDATEs that modify key columns, and DELETE */ - LockTupleExclusive + /* SELECT FOR KEY SHARE */ + LockTupleKeyShare, + /* SELECT FOR SHARE */ + LockTupleShare, + /* SELECT FOR NO KEY UPDATE, and UPDATEs that don't modify key columns */ + LockTupleNoKeyExclusive, + /* SELECT FOR UPDATE, UPDATEs that modify key columns, and DELETE */ + LockTupleExclusive } LockTupleMode; -#define MaxLockTupleMode LockTupleExclusive +#define MaxLockTupleMode LockTupleExclusive /* * When heap_update, heap_delete, or heap_lock_tuple fail because the target @@ -67,14 +67,14 @@ typedef enum LockTupleMode */ typedef struct HeapUpdateFailureData { - ItemPointerData ctid; - TransactionId xmax; - CommandId cmax; + ItemPointerData ctid; + TransactionId xmax; + CommandId cmax; } HeapUpdateFailureData; /* ---------------- - * function prototypes for heap access method + * function prototypes for heap access method * * heap_create, heap_create_with_catalog, and heap_drop_with_catalog * are declared in catalog/heap.h @@ -86,13 +86,13 @@ extern Relation relation_open(Oid relationId, LOCKMODE lockmode); extern Relation try_relation_open(Oid relationId, LOCKMODE lockmode); extern Relation relation_openrv(const RangeVar *relation, LOCKMODE lockmode); extern Relation relation_openrv_extended(const RangeVar *relation, - LOCKMODE lockmode, bool missing_ok); + LOCKMODE lockmode, bool missing_ok); extern void relation_close(Relation relation, LOCKMODE lockmode); extern Relation heap_open(Oid relationId, LOCKMODE lockmode); extern Relation heap_openrv(const RangeVar *relation, LOCKMODE lockmode); extern Relation heap_openrv_extended(const RangeVar *relation, - LOCKMODE lockmode, bool missing_ok); + LOCKMODE lockmode, bool missing_ok); #define heap_close(r,l) relation_close(r,l) @@ -102,47 +102,48 @@ typedef struct ParallelHeapScanDescData *ParallelHeapScanDesc; /* * HeapScanIsValid - * True iff the heap scan is valid. 
+ * True iff the heap scan is valid. */ #define HeapScanIsValid(scan) PointerIsValid(scan) extern HeapScanDesc heap_beginscan(Relation relation, Snapshot snapshot, - int nkeys, ScanKey key); + int nkeys, ScanKey key); extern HeapScanDesc heap_beginscan_catalog(Relation relation, int nkeys, - ScanKey key); + ScanKey key); extern HeapScanDesc heap_beginscan_strat(Relation relation, Snapshot snapshot, - int nkeys, ScanKey key, - bool allow_strat, bool allow_sync); + int nkeys, ScanKey key, + bool allow_strat, bool allow_sync); extern HeapScanDesc heap_beginscan_bm(Relation relation, Snapshot snapshot, - int nkeys, ScanKey key); + int nkeys, ScanKey key); extern HeapScanDesc heap_beginscan_sampling(Relation relation, - Snapshot snapshot, int nkeys, ScanKey key, - bool allow_strat, bool allow_sync, bool allow_pagemode); + Snapshot snapshot, int nkeys, ScanKey key, + bool allow_strat, bool allow_sync, bool allow_pagemode); extern void heap_setscanlimits(HeapScanDesc scan, BlockNumber startBlk, - BlockNumber endBlk); + BlockNumber endBlk); extern void heapgetpage(HeapScanDesc scan, BlockNumber page); extern void heap_rescan(HeapScanDesc scan, ScanKey key); extern void heap_rescan_set_params(HeapScanDesc scan, ScanKey key, - bool allow_strat, bool allow_sync, bool allow_pagemode); + bool allow_strat, bool allow_sync, bool allow_pagemode); extern void heap_endscan(HeapScanDesc scan); extern HeapTuple heap_getnext(HeapScanDesc scan, ScanDirection direction); extern Size heap_parallelscan_estimate(Snapshot snapshot); extern void heap_parallelscan_initialize(ParallelHeapScanDesc target, - Relation relation, Snapshot snapshot); + Relation relation, Snapshot snapshot); +extern void heap_parallelscan_reinitialize(ParallelHeapScanDesc parallel_scan); extern HeapScanDesc heap_beginscan_parallel(Relation, ParallelHeapScanDesc); extern bool heap_fetch(Relation relation, Snapshot snapshot, - HeapTuple tuple, Buffer *userbuf, bool keep_buf, - Relation stats_relation); + HeapTuple tuple, Buffer *userbuf, bool keep_buf, + Relation stats_relation); extern bool heap_hot_search_buffer(ItemPointer tid, Relation relation, - Buffer buffer, Snapshot snapshot, HeapTuple heapTuple, - bool *all_dead, bool first_call); + Buffer buffer, Snapshot snapshot, HeapTuple heapTuple, + bool *all_dead, bool first_call); extern bool heap_hot_search(ItemPointer tid, Relation relation, - Snapshot snapshot, bool *all_dead); + Snapshot snapshot, bool *all_dead); extern void heap_get_latest_tid(Relation relation, Snapshot snapshot, - ItemPointer tid); + ItemPointer tid); extern void setLastTid(const ItemPointer tid); extern BulkInsertState GetBulkInsertState(void); @@ -153,33 +154,33 @@ extern void FreeBulkInsertState(BulkInsertState); extern void ReleaseBulkInsertStatePin(BulkInsertState bistate); extern Oid heap_insert(Relation relation, HeapTuple tup, CommandId cid, - int options, BulkInsertState bistate); + int options, BulkInsertState bistate); extern void heap_multi_insert(Relation relation, HeapTuple *tuples, int ntuples, - CommandId cid, int options, BulkInsertState bistate); + CommandId cid, int options, BulkInsertState bistate); extern HTSU_Result heap_delete(Relation relation, ItemPointer tid, - CommandId cid, Snapshot crosscheck, bool wait, - HeapUpdateFailureData *hufd); + CommandId cid, Snapshot crosscheck, bool wait, + HeapUpdateFailureData *hufd); extern void heap_finish_speculative(Relation relation, HeapTuple tuple); extern void heap_abort_speculative(Relation relation, HeapTuple tuple); extern HTSU_Result 
heap_update(Relation relation, ItemPointer otid, - HeapTuple newtup, - CommandId cid, Snapshot crosscheck, bool wait, - HeapUpdateFailureData *hufd, LockTupleMode *lockmode); + HeapTuple newtup, + CommandId cid, Snapshot crosscheck, bool wait, + HeapUpdateFailureData *hufd, LockTupleMode *lockmode); extern HTSU_Result heap_lock_tuple(Relation relation, HeapTuple tuple, - CommandId cid, LockTupleMode mode, LockWaitPolicy wait_policy, - bool follow_update, - Buffer *buffer, HeapUpdateFailureData *hufd); + CommandId cid, LockTupleMode mode, LockWaitPolicy wait_policy, + bool follow_update, + Buffer *buffer, HeapUpdateFailureData *hufd); extern void heap_inplace_update(Relation relation, HeapTuple tuple); extern bool heap_freeze_tuple(HeapTupleHeader tuple, TransactionId cutoff_xid, - TransactionId cutoff_multi); + TransactionId cutoff_multi); extern bool heap_tuple_needs_freeze(HeapTupleHeader tuple, TransactionId cutoff_xid, - MultiXactId cutoff_multi, Buffer buf); + MultiXactId cutoff_multi, Buffer buf); extern bool heap_tuple_needs_eventual_freeze(HeapTupleHeader tuple); -extern Oid simple_heap_insert(Relation relation, HeapTuple tup); +extern Oid simple_heap_insert(Relation relation, HeapTuple tup); extern void simple_heap_delete(Relation relation, ItemPointer tid); extern void simple_heap_update(Relation relation, ItemPointer otid, - HeapTuple tup); + HeapTuple tup); extern void heap_sync(Relation relation); extern void heap_update_snapshot(HeapScanDesc scan, Snapshot snapshot); @@ -187,12 +188,12 @@ extern void heap_update_snapshot(HeapScanDesc scan, Snapshot snapshot); /* in heap/pruneheap.c */ extern void heap_page_prune_opt(Relation relation, Buffer buffer); extern int heap_page_prune(Relation relation, Buffer buffer, - TransactionId OldestXmin, - bool report_stats, TransactionId *latestRemovedXid); + TransactionId OldestXmin, + bool report_stats, TransactionId *latestRemovedXid); extern void heap_page_prune_execute(Buffer buffer, - OffsetNumber *redirected, int nredirected, - OffsetNumber *nowdead, int ndead, - OffsetNumber *nowunused, int nunused); + OffsetNumber *redirected, int nredirected, + OffsetNumber *nowdead, int ndead, + OffsetNumber *nowunused, int nunused); extern void heap_get_root_tuples(Page page, OffsetNumber *root_offsets); /* in heap/syncscan.c */ @@ -204,4 +205,4 @@ extern Size SyncScanShmemSize(void); extern void mls_enable_update_rolpassword(void); extern void mls_disable_update_rolpassword(void); #endif -#endif /* HEAPAM_H */ +#endif /* HEAPAM_H */ diff --git a/src/include/executor/execParallel.h b/src/include/executor/execParallel.h index ebd06bfb..3614fc6a 100644 --- a/src/include/executor/execParallel.h +++ b/src/include/executor/execParallel.h @@ -1,12 +1,12 @@ /*-------------------------------------------------------------------- * execParallel.h - * POSTGRES parallel execution interface + * POSTGRES parallel execution interface * * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * src/include/executor/execParallel.h + * src/include/executor/execParallel.h *-------------------------------------------------------------------- */ @@ -23,15 +23,15 @@ typedef struct SharedExecutorInstrumentation SharedExecutorInstrumentation; typedef struct ParallelExecutorInfo { - PlanState *planstate; - ParallelContext *pcxt; - BufferUsage *buffer_usage; - SharedExecutorInstrumentation *instrumentation; - shm_mq_handle **tqueue; - dsa_area *area; - bool finished; 
+ PlanState *planstate; + ParallelContext *pcxt; + BufferUsage *buffer_usage; + SharedExecutorInstrumentation *instrumentation; + shm_mq_handle **tqueue; + dsa_area *area; + bool finished; #ifdef __TBASE__ - bool *executor_done; + bool *executor_done; #endif } ParallelExecutorInfo; @@ -48,7 +48,8 @@ extern ParallelExecutorInfo *ExecInitParallelPlan(PlanState *planstate, extern void ExecParallelFinish(ParallelExecutorInfo *pei); extern void ExecParallelCleanup(ParallelExecutorInfo *pei); -extern void ExecParallelReinitialize(ParallelExecutorInfo *pei); +extern void ExecParallelReinitialize(PlanState *planstate, + ParallelExecutorInfo *pei); extern void ParallelQueryMain(dsm_segment *seg, shm_toc *toc); #ifdef __TBASE__ @@ -63,4 +64,4 @@ extern bool ParallelError(void); extern void HandleParallelExecutionError(void); #endif -#endif /* EXECPARALLEL_H */ +#endif /* EXECPARALLEL_H */ diff --git a/src/include/executor/nodeBitmapHeapscan.h b/src/include/executor/nodeBitmapHeapscan.h index ab98a23b..7907ecc3 100644 --- a/src/include/executor/nodeBitmapHeapscan.h +++ b/src/include/executor/nodeBitmapHeapscan.h @@ -24,6 +24,8 @@ extern void ExecBitmapHeapEstimate(BitmapHeapScanState *node, ParallelContext *pcxt); extern void ExecBitmapHeapInitializeDSM(BitmapHeapScanState *node, ParallelContext *pcxt); +extern void ExecBitmapHeapReInitializeDSM(BitmapHeapScanState *node, + ParallelContext *pcxt); extern void ExecBitmapHeapInitializeWorker(BitmapHeapScanState *node, ParallelWorkerContext *pwcxt); diff --git a/src/include/executor/nodeCustom.h b/src/include/executor/nodeCustom.h index ef99c01b..d7dcf3b8 100644 --- a/src/include/executor/nodeCustom.h +++ b/src/include/executor/nodeCustom.h @@ -34,6 +34,8 @@ extern void ExecCustomScanEstimate(CustomScanState *node, ParallelContext *pcxt); extern void ExecCustomScanInitializeDSM(CustomScanState *node, ParallelContext *pcxt); +extern void ExecCustomScanReInitializeDSM(CustomScanState *node, + ParallelContext *pcxt); extern void ExecCustomScanInitializeWorker(CustomScanState *node, ParallelWorkerContext *pwcxt); extern void ExecShutdownCustomScan(CustomScanState *node); diff --git a/src/include/executor/nodeForeignscan.h b/src/include/executor/nodeForeignscan.h index 663bdf77..152abf02 100644 --- a/src/include/executor/nodeForeignscan.h +++ b/src/include/executor/nodeForeignscan.h @@ -25,6 +25,8 @@ extern void ExecForeignScanEstimate(ForeignScanState *node, ParallelContext *pcxt); extern void ExecForeignScanInitializeDSM(ForeignScanState *node, ParallelContext *pcxt); +extern void ExecForeignScanReInitializeDSM(ForeignScanState *node, + ParallelContext *pcxt); extern void ExecForeignScanInitializeWorker(ForeignScanState *node, ParallelWorkerContext *pwcxt); extern void ExecShutdownForeignScan(ForeignScanState *node); diff --git a/src/include/executor/nodeIndexonlyscan.h b/src/include/executor/nodeIndexonlyscan.h index 8bb3a65c..c5344a8d 100644 --- a/src/include/executor/nodeIndexonlyscan.h +++ b/src/include/executor/nodeIndexonlyscan.h @@ -28,6 +28,8 @@ extern void ExecIndexOnlyScanEstimate(IndexOnlyScanState *node, ParallelContext *pcxt); extern void ExecIndexOnlyScanInitializeDSM(IndexOnlyScanState *node, ParallelContext *pcxt); +extern void ExecIndexOnlyScanReInitializeDSM(IndexOnlyScanState *node, + ParallelContext *pcxt); extern void ExecIndexOnlyScanInitializeWorker(IndexOnlyScanState *node, ParallelWorkerContext *pwcxt); diff --git a/src/include/foreign/fdwapi.h b/src/include/foreign/fdwapi.h index 8cdedb5d..ef0fbe6f 100644 --- 
a/src/include/foreign/fdwapi.h +++ b/src/include/foreign/fdwapi.h @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * * fdwapi.h - * API for foreign-data wrappers + * API for foreign-data wrappers * * Copyright (c) 2010-2017, PostgreSQL Global Development Group * @@ -25,136 +25,139 @@ struct ExplainState; */ typedef void (*GetForeignRelSize_function) (PlannerInfo *root, - RelOptInfo *baserel, - Oid foreigntableid); + RelOptInfo *baserel, + Oid foreigntableid); typedef void (*GetForeignPaths_function) (PlannerInfo *root, - RelOptInfo *baserel, - Oid foreigntableid); + RelOptInfo *baserel, + Oid foreigntableid); typedef ForeignScan *(*GetForeignPlan_function) (PlannerInfo *root, - RelOptInfo *baserel, - Oid foreigntableid, - ForeignPath *best_path, - List *tlist, - List *scan_clauses, - Plan *outer_plan); + RelOptInfo *baserel, + Oid foreigntableid, + ForeignPath *best_path, + List *tlist, + List *scan_clauses, + Plan *outer_plan); typedef void (*BeginForeignScan_function) (ForeignScanState *node, - int eflags); + int eflags); typedef TupleTableSlot *(*IterateForeignScan_function) (ForeignScanState *node); typedef bool (*RecheckForeignScan_function) (ForeignScanState *node, - TupleTableSlot *slot); + TupleTableSlot *slot); typedef void (*ReScanForeignScan_function) (ForeignScanState *node); typedef void (*EndForeignScan_function) (ForeignScanState *node); typedef void (*GetForeignJoinPaths_function) (PlannerInfo *root, - RelOptInfo *joinrel, - RelOptInfo *outerrel, - RelOptInfo *innerrel, - JoinType jointype, - JoinPathExtraData *extra); + RelOptInfo *joinrel, + RelOptInfo *outerrel, + RelOptInfo *innerrel, + JoinType jointype, + JoinPathExtraData *extra); typedef void (*GetForeignUpperPaths_function) (PlannerInfo *root, - UpperRelationKind stage, - RelOptInfo *input_rel, - RelOptInfo *output_rel); + UpperRelationKind stage, + RelOptInfo *input_rel, + RelOptInfo *output_rel); typedef void (*AddForeignUpdateTargets_function) (Query *parsetree, - RangeTblEntry *target_rte, - Relation target_relation); + RangeTblEntry *target_rte, + Relation target_relation); typedef List *(*PlanForeignModify_function) (PlannerInfo *root, - ModifyTable *plan, - Index resultRelation, - int subplan_index); + ModifyTable *plan, + Index resultRelation, + int subplan_index); typedef void (*BeginForeignModify_function) (ModifyTableState *mtstate, - ResultRelInfo *rinfo, - List *fdw_private, - int subplan_index, - int eflags); + ResultRelInfo *rinfo, + List *fdw_private, + int subplan_index, + int eflags); typedef TupleTableSlot *(*ExecForeignInsert_function) (EState *estate, - ResultRelInfo *rinfo, - TupleTableSlot *slot, - TupleTableSlot *planSlot); + ResultRelInfo *rinfo, + TupleTableSlot *slot, + TupleTableSlot *planSlot); typedef TupleTableSlot *(*ExecForeignUpdate_function) (EState *estate, - ResultRelInfo *rinfo, - TupleTableSlot *slot, - TupleTableSlot *planSlot); + ResultRelInfo *rinfo, + TupleTableSlot *slot, + TupleTableSlot *planSlot); typedef TupleTableSlot *(*ExecForeignDelete_function) (EState *estate, - ResultRelInfo *rinfo, - TupleTableSlot *slot, - TupleTableSlot *planSlot); + ResultRelInfo *rinfo, + TupleTableSlot *slot, + TupleTableSlot *planSlot); typedef void (*EndForeignModify_function) (EState *estate, - ResultRelInfo *rinfo); + ResultRelInfo *rinfo); typedef int (*IsForeignRelUpdatable_function) (Relation rel); typedef bool (*PlanDirectModify_function) (PlannerInfo *root, - ModifyTable *plan, - Index resultRelation, - int subplan_index); + ModifyTable 
*plan, + Index resultRelation, + int subplan_index); typedef void (*BeginDirectModify_function) (ForeignScanState *node, - int eflags); + int eflags); typedef TupleTableSlot *(*IterateDirectModify_function) (ForeignScanState *node); typedef void (*EndDirectModify_function) (ForeignScanState *node); typedef RowMarkType (*GetForeignRowMarkType_function) (RangeTblEntry *rte, - LockClauseStrength strength); + LockClauseStrength strength); typedef HeapTuple (*RefetchForeignRow_function) (EState *estate, - ExecRowMark *erm, - Datum rowid, - bool *updated); + ExecRowMark *erm, + Datum rowid, + bool *updated); typedef void (*ExplainForeignScan_function) (ForeignScanState *node, - struct ExplainState *es); + struct ExplainState *es); typedef void (*ExplainForeignModify_function) (ModifyTableState *mtstate, - ResultRelInfo *rinfo, - List *fdw_private, - int subplan_index, - struct ExplainState *es); + ResultRelInfo *rinfo, + List *fdw_private, + int subplan_index, + struct ExplainState *es); typedef void (*ExplainDirectModify_function) (ForeignScanState *node, - struct ExplainState *es); + struct ExplainState *es); typedef int (*AcquireSampleRowsFunc) (Relation relation, int elevel, - HeapTuple *rows, int targrows, - double *totalrows, - double *totaldeadrows); + HeapTuple *rows, int targrows, + double *totalrows, + double *totaldeadrows); typedef bool (*AnalyzeForeignTable_function) (Relation relation, - AcquireSampleRowsFunc *func, - BlockNumber *totalpages); + AcquireSampleRowsFunc *func, + BlockNumber *totalpages); typedef List *(*ImportForeignSchema_function) (ImportForeignSchemaStmt *stmt, - Oid serverOid); + Oid serverOid); typedef Size (*EstimateDSMForeignScan_function) (ForeignScanState *node, - ParallelContext *pcxt); + ParallelContext *pcxt); typedef void (*InitializeDSMForeignScan_function) (ForeignScanState *node, - ParallelContext *pcxt, - void *coordinate); + ParallelContext *pcxt, + void *coordinate); +typedef void (*ReInitializeDSMForeignScan_function) (ForeignScanState *node, + ParallelContext *pcxt, + void *coordinate); typedef void (*InitializeWorkerForeignScan_function) (ForeignScanState *node, - shm_toc *toc, - void *coordinate); + shm_toc *toc, + void *coordinate); typedef void (*ShutdownForeignScan_function) (ForeignScanState *node); typedef bool (*IsForeignScanParallelSafe_function) (PlannerInfo *root, - RelOptInfo *rel, - RangeTblEntry *rte); + RelOptInfo *rel, + RangeTblEntry *rte); /* * FdwRoutine is the struct returned by a foreign-data wrapper's handler @@ -168,75 +171,76 @@ typedef bool (*IsForeignScanParallelSafe_function) (PlannerInfo *root, */ typedef struct FdwRoutine { - NodeTag type; - - /* Functions for scanning foreign tables */ - GetForeignRelSize_function GetForeignRelSize; - GetForeignPaths_function GetForeignPaths; - GetForeignPlan_function GetForeignPlan; - BeginForeignScan_function BeginForeignScan; - IterateForeignScan_function IterateForeignScan; - ReScanForeignScan_function ReScanForeignScan; - EndForeignScan_function EndForeignScan; - - /* - * Remaining functions are optional. Set the pointer to NULL for any that - * are not provided. 
- */ - - /* Functions for remote-join planning */ - GetForeignJoinPaths_function GetForeignJoinPaths; - - /* Functions for remote upper-relation (post scan/join) planning */ - GetForeignUpperPaths_function GetForeignUpperPaths; - - /* Functions for updating foreign tables */ - AddForeignUpdateTargets_function AddForeignUpdateTargets; - PlanForeignModify_function PlanForeignModify; - BeginForeignModify_function BeginForeignModify; - ExecForeignInsert_function ExecForeignInsert; - ExecForeignUpdate_function ExecForeignUpdate; - ExecForeignDelete_function ExecForeignDelete; - EndForeignModify_function EndForeignModify; - IsForeignRelUpdatable_function IsForeignRelUpdatable; - PlanDirectModify_function PlanDirectModify; - BeginDirectModify_function BeginDirectModify; - IterateDirectModify_function IterateDirectModify; - EndDirectModify_function EndDirectModify; - - /* Functions for SELECT FOR UPDATE/SHARE row locking */ - GetForeignRowMarkType_function GetForeignRowMarkType; - RefetchForeignRow_function RefetchForeignRow; - RecheckForeignScan_function RecheckForeignScan; - - /* Support functions for EXPLAIN */ - ExplainForeignScan_function ExplainForeignScan; - ExplainForeignModify_function ExplainForeignModify; - ExplainDirectModify_function ExplainDirectModify; - - /* Support functions for ANALYZE */ - AnalyzeForeignTable_function AnalyzeForeignTable; - - /* Support functions for IMPORT FOREIGN SCHEMA */ - ImportForeignSchema_function ImportForeignSchema; - - /* Support functions for parallelism under Gather node */ - IsForeignScanParallelSafe_function IsForeignScanParallelSafe; - EstimateDSMForeignScan_function EstimateDSMForeignScan; - InitializeDSMForeignScan_function InitializeDSMForeignScan; - InitializeWorkerForeignScan_function InitializeWorkerForeignScan; - ShutdownForeignScan_function ShutdownForeignScan; + NodeTag type; + + /* Functions for scanning foreign tables */ + GetForeignRelSize_function GetForeignRelSize; + GetForeignPaths_function GetForeignPaths; + GetForeignPlan_function GetForeignPlan; + BeginForeignScan_function BeginForeignScan; + IterateForeignScan_function IterateForeignScan; + ReScanForeignScan_function ReScanForeignScan; + EndForeignScan_function EndForeignScan; + + /* + * Remaining functions are optional. Set the pointer to NULL for any that + * are not provided. 
+ */ + + /* Functions for remote-join planning */ + GetForeignJoinPaths_function GetForeignJoinPaths; + + /* Functions for remote upper-relation (post scan/join) planning */ + GetForeignUpperPaths_function GetForeignUpperPaths; + + /* Functions for updating foreign tables */ + AddForeignUpdateTargets_function AddForeignUpdateTargets; + PlanForeignModify_function PlanForeignModify; + BeginForeignModify_function BeginForeignModify; + ExecForeignInsert_function ExecForeignInsert; + ExecForeignUpdate_function ExecForeignUpdate; + ExecForeignDelete_function ExecForeignDelete; + EndForeignModify_function EndForeignModify; + IsForeignRelUpdatable_function IsForeignRelUpdatable; + PlanDirectModify_function PlanDirectModify; + BeginDirectModify_function BeginDirectModify; + IterateDirectModify_function IterateDirectModify; + EndDirectModify_function EndDirectModify; + + /* Functions for SELECT FOR UPDATE/SHARE row locking */ + GetForeignRowMarkType_function GetForeignRowMarkType; + RefetchForeignRow_function RefetchForeignRow; + RecheckForeignScan_function RecheckForeignScan; + + /* Support functions for EXPLAIN */ + ExplainForeignScan_function ExplainForeignScan; + ExplainForeignModify_function ExplainForeignModify; + ExplainDirectModify_function ExplainDirectModify; + + /* Support functions for ANALYZE */ + AnalyzeForeignTable_function AnalyzeForeignTable; + + /* Support functions for IMPORT FOREIGN SCHEMA */ + ImportForeignSchema_function ImportForeignSchema; + + /* Support functions for parallelism under Gather node */ + IsForeignScanParallelSafe_function IsForeignScanParallelSafe; + EstimateDSMForeignScan_function EstimateDSMForeignScan; + InitializeDSMForeignScan_function InitializeDSMForeignScan; + ReInitializeDSMForeignScan_function ReInitializeDSMForeignScan; + InitializeWorkerForeignScan_function InitializeWorkerForeignScan; + ShutdownForeignScan_function ShutdownForeignScan; } FdwRoutine; /* Functions in foreign/foreign.c */ extern FdwRoutine *GetFdwRoutine(Oid fdwhandler); -extern Oid GetForeignServerIdByRelId(Oid relid); +extern Oid GetForeignServerIdByRelId(Oid relid); extern FdwRoutine *GetFdwRoutineByServerId(Oid serverid); extern FdwRoutine *GetFdwRoutineByRelId(Oid relid); extern FdwRoutine *GetFdwRoutineForRelation(Relation relation, bool makecopy); extern bool IsImportableForeignTable(const char *tablename, - ImportForeignSchemaStmt *stmt); + ImportForeignSchemaStmt *stmt); extern Path *GetExistingLocalJoinPath(RelOptInfo *joinrel); -#endif /* FDWAPI_H */ +#endif /* FDWAPI_H */ diff --git a/src/include/nodes/extensible.h b/src/include/nodes/extensible.h index bc4e07d8..0654e79c 100644 --- a/src/include/nodes/extensible.h +++ b/src/include/nodes/extensible.h @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * * extensible.h - * Definitions for extensible nodes and custom scans + * Definitions for extensible nodes and custom scans * * * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group @@ -21,7 +21,7 @@ #include "nodes/relation.h" /* maximum length of an extensible node identifier */ -#define EXTNODENAME_MAX_LEN 64 +#define EXTNODENAME_MAX_LEN 64 /* * An extensible node is a new type of node defined by an extension. 
The @@ -31,8 +31,8 @@ */ typedef struct ExtensibleNode { - NodeTag type; - const char *extnodename; /* identifier of ExtensibleNodeMethods */ + NodeTag type; + const char *extnodename; /* identifier of ExtensibleNodeMethods */ } ExtensibleNode; /* @@ -59,27 +59,27 @@ typedef struct ExtensibleNode */ typedef struct ExtensibleNodeMethods { - const char *extnodename; - Size node_size; - void (*nodeCopy) (struct ExtensibleNode *newnode, - const struct ExtensibleNode *oldnode); - bool (*nodeEqual) (const struct ExtensibleNode *a, - const struct ExtensibleNode *b); - void (*nodeOut) (struct StringInfoData *str, - const struct ExtensibleNode *node); - void (*nodeRead) (struct ExtensibleNode *node); + const char *extnodename; + Size node_size; + void (*nodeCopy) (struct ExtensibleNode *newnode, + const struct ExtensibleNode *oldnode); + bool (*nodeEqual) (const struct ExtensibleNode *a, + const struct ExtensibleNode *b); + void (*nodeOut) (struct StringInfoData *str, + const struct ExtensibleNode *node); + void (*nodeRead) (struct ExtensibleNode *node); } ExtensibleNodeMethods; extern void RegisterExtensibleNodeMethods(const ExtensibleNodeMethods *method); extern const ExtensibleNodeMethods *GetExtensibleNodeMethods(const char *name, - bool missing_ok); + bool missing_ok); /* * Flags for custom paths, indicating what capabilities the resulting scan * will have. */ -#define CUSTOMPATH_SUPPORT_BACKWARD_SCAN 0x0001 -#define CUSTOMPATH_SUPPORT_MARK_RESTORE 0x0002 +#define CUSTOMPATH_SUPPORT_BACKWARD_SCAN 0x0001 +#define CUSTOMPATH_SUPPORT_MARK_RESTORE 0x0002 /* * Custom path methods. Mostly, we just need to know how to convert a @@ -87,16 +87,16 @@ extern const ExtensibleNodeMethods *GetExtensibleNodeMethods(const char *name, */ typedef struct CustomPathMethods { - const char *CustomName; + const char *CustomName; - /* Convert Path to a Plan */ - struct Plan *(*PlanCustomPath) (PlannerInfo *root, - RelOptInfo *rel, - struct CustomPath *best_path, - List *tlist, - List *clauses, - List *custom_plans); -} CustomPathMethods; + /* Convert Path to a Plan */ + struct Plan *(*PlanCustomPath) (PlannerInfo *root, + RelOptInfo *rel, + struct CustomPath *best_path, + List *tlist, + List *clauses, + List *custom_plans); +} CustomPathMethods; /* * Custom scan. 
Here again, there's not much to do: we need to be able to @@ -104,10 +104,10 @@ typedef struct CustomPathMethods */ typedef struct CustomScanMethods { - const char *CustomName; + const char *CustomName; - /* Create execution state (CustomScanState) from a CustomScan plan node */ - Node *(*CreateCustomScanState) (CustomScan *cscan); + /* Create execution state (CustomScanState) from a CustomScan plan node */ + Node *(*CreateCustomScanState) (CustomScan *cscan); } CustomScanMethods; /* @@ -116,39 +116,42 @@ typedef struct CustomScanMethods */ typedef struct CustomExecMethods { - const char *CustomName; - - /* Required executor methods */ - void (*BeginCustomScan) (CustomScanState *node, - EState *estate, - int eflags); - TupleTableSlot *(*ExecCustomScan) (CustomScanState *node); - void (*EndCustomScan) (CustomScanState *node); - void (*ReScanCustomScan) (CustomScanState *node); - - /* Optional methods: needed if mark/restore is supported */ - void (*MarkPosCustomScan) (CustomScanState *node); - void (*RestrPosCustomScan) (CustomScanState *node); - - /* Optional methods: needed if parallel execution is supported */ - Size (*EstimateDSMCustomScan) (CustomScanState *node, - ParallelContext *pcxt); - void (*InitializeDSMCustomScan) (CustomScanState *node, - ParallelContext *pcxt, - void *coordinate); - void (*InitializeWorkerCustomScan) (CustomScanState *node, - shm_toc *toc, - void *coordinate); - void (*ShutdownCustomScan) (CustomScanState *node); - - /* Optional: print additional information in EXPLAIN */ - void (*ExplainCustomScan) (CustomScanState *node, - List *ancestors, - ExplainState *es); + const char *CustomName; + + /* Required executor methods */ + void (*BeginCustomScan) (CustomScanState *node, + EState *estate, + int eflags); + TupleTableSlot *(*ExecCustomScan) (CustomScanState *node); + void (*EndCustomScan) (CustomScanState *node); + void (*ReScanCustomScan) (CustomScanState *node); + + /* Optional methods: needed if mark/restore is supported */ + void (*MarkPosCustomScan) (CustomScanState *node); + void (*RestrPosCustomScan) (CustomScanState *node); + + /* Optional methods: needed if parallel execution is supported */ + Size (*EstimateDSMCustomScan) (CustomScanState *node, + ParallelContext *pcxt); + void (*InitializeDSMCustomScan) (CustomScanState *node, + ParallelContext *pcxt, + void *coordinate); + void (*ReInitializeDSMCustomScan) (CustomScanState *node, + ParallelContext *pcxt, + void *coordinate); + void (*InitializeWorkerCustomScan) (CustomScanState *node, + shm_toc *toc, + void *coordinate); + void (*ShutdownCustomScan) (CustomScanState *node); + + /* Optional: print additional information in EXPLAIN */ + void (*ExplainCustomScan) (CustomScanState *node, + List *ancestors, + ExplainState *es); } CustomExecMethods; extern void RegisterCustomScanMethods(const CustomScanMethods *methods); extern const CustomScanMethods *GetCustomScanMethods(const char *CustomName, - bool missing_ok); + bool missing_ok); -#endif /* EXTENSIBLE_H */ +#endif /* EXTENSIBLE_H */ From 46dd3624a6e495b72aa4175e8816bd7f9720fe90 Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Tue, 5 Dec 2017 10:55:56 -0800 Subject: [PATCH 138/578] Fix EXPLAIN ANALYZE of hash join when the leader doesn't participate. If a hash join appears in a parallel query, there may be no hash table available for explain.c to inspect even though a hash table may have been built in other processes. 
This could happen either because parallel_leader_participation was set to off or because the leader happened to hit the end of the outer relation immediately (even though the complete relation is not empty) and decided not to build the hash table. Commit bf11e7ee introduced a way for workers to exchange instrumentation via the DSM segment for Sort nodes even though they are not parallel-aware. This commit does the same for Hash nodes, so that explain.c has a way to find instrumentation data from an arbitrary participant that actually built the hash table. Author: Thomas Munro Reviewed-By: Andres Freund Discussion: https://postgr.es/m/CAEepm%3D3DUQC2-z252N55eOcZBer6DPdM%3DFzrxH9dZc5vYLsjaA%40mail.gmail.com --- src/backend/commands/explain.c | 60 +++++-- src/backend/executor/execParallel.c | 41 +++-- src/backend/executor/execProcnode.c | 3 + src/backend/executor/nodeHash.c | 104 ++++++++++++ src/include/executor/nodeHash.h | 47 +++--- src/include/nodes/execnodes.h | 26 +++ src/test/regress/sql/join.sql | 241 ++++++++++++++++++++++++++++ 7 files changed, 478 insertions(+), 44 deletions(-) diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index 4ba4dc81..4ec68401 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -20,7 +20,7 @@ #include "commands/createas.h" #include "commands/defrem.h" #include "commands/prepare.h" -#include "executor/hashjoin.h" +#include "executor/nodeHash.h" #include "foreign/fdwapi.h" #include "nodes/extensible.h" #include "nodes/nodeFuncs.h" @@ -2616,34 +2616,62 @@ show_sort_info(SortState *sortstate, ExplainState *es) static void show_hash_info(HashState *hashstate, ExplainState *es) { - HashJoinTable hashtable; + HashInstrumentation *hinstrument = NULL; - hashtable = hashstate->hashtable; + /* + * In a parallel query, the leader process may or may not have run the + * hash join, and even if it did it may not have built a hash table due to + * timing (if it started late it might have seen no tuples in the outer + * relation and skipped building the hash table). Therefore we have to be + * prepared to get instrumentation data from a worker if there is no hash + * table. + */ + if (hashstate->hashtable) + { + hinstrument = (HashInstrumentation *) + palloc(sizeof(HashInstrumentation)); + ExecHashGetInstrumentation(hinstrument, hashstate->hashtable); + } + else if (hashstate->shared_info) + { + SharedHashInfo *shared_info = hashstate->shared_info; + int i; + + /* Find the first worker that built a hash table. 
*/ + for (i = 0; i < shared_info->num_workers; ++i) + { + if (shared_info->hinstrument[i].nbatch > 0) + { + hinstrument = &shared_info->hinstrument[i]; + break; + } + } + } - if (hashtable) + if (hinstrument) { - long spacePeakKb = (hashtable->spacePeak + 1023) / 1024; + long spacePeakKb = (hinstrument->space_peak + 1023) / 1024; if (es->format != EXPLAIN_FORMAT_TEXT) { - ExplainPropertyLong("Hash Buckets", hashtable->nbuckets, es); + ExplainPropertyLong("Hash Buckets", hinstrument->nbuckets, es); ExplainPropertyLong("Original Hash Buckets", - hashtable->nbuckets_original, es); - ExplainPropertyLong("Hash Batches", hashtable->nbatch, es); + hinstrument->nbuckets_original, es); + ExplainPropertyLong("Hash Batches", hinstrument->nbatch, es); ExplainPropertyLong("Original Hash Batches", - hashtable->nbatch_original, es); + hinstrument->nbatch_original, es); ExplainPropertyLong("Peak Memory Usage", spacePeakKb, es); } - else if (hashtable->nbatch_original != hashtable->nbatch || - hashtable->nbuckets_original != hashtable->nbuckets) + else if (hinstrument->nbatch_original != hinstrument->nbatch || + hinstrument->nbuckets_original != hinstrument->nbuckets) { appendStringInfoSpaces(es->str, es->indent * 2); appendStringInfo(es->str, "Buckets: %d (originally %d) Batches: %d (originally %d) Memory Usage: %ldkB\n", - hashtable->nbuckets, - hashtable->nbuckets_original, - hashtable->nbatch, - hashtable->nbatch_original, + hinstrument->nbuckets, + hinstrument->nbuckets_original, + hinstrument->nbatch, + hinstrument->nbatch_original, spacePeakKb); } else @@ -2651,7 +2679,7 @@ show_hash_info(HashState *hashstate, ExplainState *es) appendStringInfoSpaces(es->str, es->indent * 2); appendStringInfo(es->str, "Buckets: %d Batches: %d Memory Usage: %ldkB\n", - hashtable->nbuckets, hashtable->nbatch, + hinstrument->nbuckets, hinstrument->nbatch, spacePeakKb); } } diff --git a/src/backend/executor/execParallel.c b/src/backend/executor/execParallel.c index 5ec13c1a..7aa46865 100644 --- a/src/backend/executor/execParallel.c +++ b/src/backend/executor/execParallel.c @@ -28,6 +28,7 @@ #include "executor/nodeBitmapHeapscan.h" #include "executor/nodeCustom.h" #include "executor/nodeForeignscan.h" +#include "executor/nodeHash.h" #include "executor/nodeIndexscan.h" #include "executor/nodeIndexonlyscan.h" #include "executor/nodeSeqscan.h" @@ -292,8 +293,12 @@ ExecParallelEstimate(PlanState *planstate, ExecParallelEstimateContext *e) ExecBitmapHeapEstimate((BitmapHeapScanState *) planstate, e->pcxt); break; + case T_HashState: + /* even when not parallel-aware, for EXPLAIN ANALYZE */ + ExecHashEstimate((HashState *) planstate, e->pcxt); + break; case T_SortState: - /* even when not parallel-aware */ + /* even when not parallel-aware, for EXPLAIN ANALYZE */ ExecSortEstimate((SortState *) planstate, e->pcxt); #ifdef __TBASE__ if (planstate->plan->parallel_aware) @@ -385,8 +390,12 @@ ExecParallelInitializeDSM(PlanState *planstate, ExecBitmapHeapInitializeDSM((BitmapHeapScanState *) planstate, d->pcxt); break; + case T_HashState: + /* even when not parallel-aware, for EXPLAIN ANALYZE */ + ExecHashInitializeDSM((HashState *) planstate, d->pcxt); + break; case T_SortState: - /* even when not parallel-aware */ + /* even when not parallel-aware, for EXPLAIN ANALYZE */ ExecSortInitializeDSM((SortState *) planstate, d->pcxt); #ifdef __TBASE__ if (planstate->plan->parallel_aware) @@ -938,8 +947,12 @@ ExecParallelReInitializeDSM(PlanState *planstate, ExecBitmapHeapReInitializeDSM((BitmapHeapScanState *) planstate, pcxt); break; + 
case T_HashState: + /* even when not parallel-aware, for EXPLAIN ANALYZE */ + ExecHashReInitializeDSM((HashState *) planstate, pcxt); + break; case T_SortState: - /* even when not parallel-aware */ + /* even when not parallel-aware, for EXPLAIN ANALYZE */ ExecSortReInitializeDSM((SortState *) planstate, pcxt); break; @@ -994,12 +1007,18 @@ ExecParallelRetrieveInstrumentation(PlanState *planstate, planstate->worker_instrument->num_workers = instrumentation->num_workers; memcpy(&planstate->worker_instrument->instrument, instrument, ibytes); - /* - * Perform any node-type-specific work that needs to be done. Currently, - * only Sort nodes need to do anything here. - */ - if (IsA(planstate, SortState)) + /* Perform any node-type-specific work that needs to be done. */ + switch (nodeTag(planstate)) + { + case T_SortState: ExecSortRetrieveInstrumentation((SortState *) planstate); + break; + case T_HashState: + ExecHashRetrieveInstrumentation((HashState *) planstate); + break; + default: + break; + } return planstate_tree_walker(planstate, ExecParallelRetrieveInstrumentation, instrumentation); @@ -1193,8 +1212,12 @@ ExecParallelInitializeWorker(PlanState *planstate, ParallelWorkerContext *pwcxt) if (planstate->plan->parallel_aware) ExecBitmapHeapInitializeWorker((BitmapHeapScanState *) planstate, pwcxt); break; + case T_HashState: + /* even when not parallel-aware, for EXPLAIN ANALYZE */ + ExecHashInitializeWorker((HashState *) planstate, pwcxt); + break; case T_SortState: - /* even when not parallel-aware */ + /* even when not parallel-aware, for EXPLAIN ANALYZE */ ExecSortInitializeWorker((SortState *) planstate, pwcxt); #ifdef __TBASE__ if (planstate->plan->parallel_aware) diff --git a/src/backend/executor/execProcnode.c b/src/backend/executor/execProcnode.c index f8f15db2..eb5df5b9 100644 --- a/src/backend/executor/execProcnode.c +++ b/src/backend/executor/execProcnode.c @@ -869,6 +869,9 @@ ExecShutdownNode(PlanState *node) case T_RemoteSubplanState: ExecShutdownRemoteSubplan((RemoteSubplanState *) node); break; + case T_HashState: + ExecShutdownHash((HashState *) node); + break; default: break; } diff --git a/src/backend/executor/nodeHash.c b/src/backend/executor/nodeHash.c index 3b9684f7..c81eb2fa 100644 --- a/src/backend/executor/nodeHash.c +++ b/src/backend/executor/nodeHash.c @@ -1822,6 +1822,110 @@ ExecHashRemoveNextSkewBucket(HashJoinTable hashtable) } } +/* + * Reserve space in the DSM segment for instrumentation data. + */ +void +ExecHashEstimate(HashState *node, ParallelContext *pcxt) +{ + size_t size; + + size = mul_size(pcxt->nworkers, sizeof(HashInstrumentation)); + size = add_size(size, offsetof(SharedHashInfo, hinstrument)); + shm_toc_estimate_chunk(&pcxt->estimator, size); + shm_toc_estimate_keys(&pcxt->estimator, 1); +} + +/* + * Set up a space in the DSM for all workers to record instrumentation data + * about their hash table. + */ +void +ExecHashInitializeDSM(HashState *node, ParallelContext *pcxt) +{ + size_t size; + + size = offsetof(SharedHashInfo, hinstrument) + + pcxt->nworkers * sizeof(HashInstrumentation); + node->shared_info = (SharedHashInfo *) shm_toc_allocate(pcxt->toc, size); + memset(node->shared_info, 0, size); + node->shared_info->num_workers = pcxt->nworkers; + shm_toc_insert(pcxt->toc, node->ps.plan->plan_node_id, + node->shared_info); +} + +/* + * Reset shared state before beginning a fresh scan. 
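The two allocation sites above size the per-worker array with PostgreSQL's usual flexible-array idiom, using the overflow-checked add_size()/mul_size() helpers on the estimate side. Reduced to its core, the pattern looks like this hypothetical example (PerWorkerStats and the helper name are invented; SharedHashInfo in this patch has the same shape):

typedef struct PerWorkerStats
{
    int     num_workers;
    int     counters[FLEXIBLE_ARRAY_MEMBER];    /* one slot per worker */
} PerWorkerStats;

static Size
per_worker_stats_size(int nworkers)
{
    /* header size plus one fixed-size slot per worker, overflow-checked */
    return add_size(offsetof(PerWorkerStats, counters),
                    mul_size(nworkers, sizeof(int)));
}

Each worker then writes only to its own slot, indexed by ParallelWorkerNumber, so no locking is needed around the instrumentation data.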
+ */ +void +ExecHashReInitializeDSM(HashState *node, ParallelContext *pcxt) +{ + if (node->shared_info != NULL) + { + memset(node->shared_info->hinstrument, 0, + node->shared_info->num_workers * sizeof(HashInstrumentation)); + } +} + +/* + * Locate the DSM space for hash table instrumentation data that we'll write + * to at shutdown time. + */ +void +ExecHashInitializeWorker(HashState *node, ParallelWorkerContext *pwcxt) +{ + SharedHashInfo *shared_info; + + shared_info = (SharedHashInfo *) + shm_toc_lookup(pwcxt->toc, node->ps.plan->plan_node_id, true); + node->hinstrument = &shared_info->hinstrument[ParallelWorkerNumber]; +} + +/* + * Copy instrumentation data from this worker's hash table (if it built one) + * to DSM memory so the leader can retrieve it. This must be done in an + * ExecShutdownHash() rather than ExecEndHash() because the latter runs after + * we've detached from the DSM segment. + */ +void +ExecShutdownHash(HashState *node) +{ + if (node->hinstrument && node->hashtable) + ExecHashGetInstrumentation(node->hinstrument, node->hashtable); +} + +/* + * Retrieve instrumentation data from workers before the DSM segment is + * detached, so that EXPLAIN can access it. + */ +void +ExecHashRetrieveInstrumentation(HashState *node) +{ + SharedHashInfo *shared_info = node->shared_info; + size_t size; + + /* Replace node->shared_info with a copy in backend-local memory. */ + size = offsetof(SharedHashInfo, hinstrument) + + shared_info->num_workers * sizeof(HashInstrumentation); + node->shared_info = palloc(size); + memcpy(node->shared_info, shared_info, size); +} + +/* + * Copy the instrumentation data from 'hashtable' into a HashInstrumentation + * struct. + */ +void +ExecHashGetInstrumentation(HashInstrumentation *instrument, + HashJoinTable hashtable) +{ + instrument->nbuckets = hashtable->nbuckets; + instrument->nbuckets_original = hashtable->nbuckets_original; + instrument->nbatch = hashtable->nbatch; + instrument->nbatch_original = hashtable->nbatch_original; + instrument->space_peak = hashtable->spacePeak; +} + /* * Allocate 'size' bytes from the currently active HashMemoryChunk */ diff --git a/src/include/executor/nodeHash.h b/src/include/executor/nodeHash.h index f52d3dc8..025f2a33 100644 --- a/src/include/executor/nodeHash.h +++ b/src/include/executor/nodeHash.h @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * * nodeHash.h - * prototypes for nodeHash.c + * prototypes for nodeHash.c * * * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group @@ -14,6 +14,7 @@ #ifndef NODEHASH_H #define NODEHASH_H +#include "access/parallel.h" #include "nodes/execnodes.h" extern HashState *ExecInitHash(Hash *node, EState *estate, int eflags); @@ -22,37 +23,45 @@ extern void ExecEndHash(HashState *node); extern void ExecReScanHash(HashState *node); extern HashJoinTable ExecHashTableCreate(Hash *node, List *hashOperators, - bool keepNulls); + bool keepNulls); #ifdef __TBASE__ extern HashJoinTable ExecShmHashTableCreate(Hash *node, List *hashOperators, - bool keepNulls); + bool keepNulls); extern Node *MultiExecShmHash(HashState *node); #endif extern void ExecHashTableDestroy(HashJoinTable hashtable); extern void ExecHashTableInsert(HashJoinTable hashtable, - TupleTableSlot *slot, - uint32 hashvalue); + TupleTableSlot *slot, + uint32 hashvalue); extern bool ExecHashGetHashValue(HashJoinTable hashtable, - ExprContext *econtext, - List *hashkeys, - bool outer_tuple, - bool keep_nulls, - uint32 *hashvalue); + ExprContext *econtext, + List 
*hashkeys, + bool outer_tuple, + bool keep_nulls, + uint32 *hashvalue); extern void ExecHashGetBucketAndBatch(HashJoinTable hashtable, - uint32 hashvalue, - int *bucketno, - int *batchno); + uint32 hashvalue, + int *bucketno, + int *batchno); extern bool ExecScanHashBucket(HashJoinState *hjstate, ExprContext *econtext); extern void ExecPrepHashTableForUnmatched(HashJoinState *hjstate); extern bool ExecScanHashTableForUnmatched(HashJoinState *hjstate, - ExprContext *econtext); + ExprContext *econtext); extern void ExecHashTableReset(HashJoinTable hashtable); extern void ExecHashTableResetMatchFlags(HashJoinTable hashtable); extern void ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew, - int *numbuckets, - int *numbatches, - int *num_skew_mcvs); -extern int ExecHashGetSkewBucket(HashJoinTable hashtable, uint32 hashvalue); + int *numbuckets, + int *numbatches, + int *num_skew_mcvs); +extern int ExecHashGetSkewBucket(HashJoinTable hashtable, uint32 hashvalue); +extern void ExecHashEstimate(HashState *node, ParallelContext *pcxt); +extern void ExecHashInitializeDSM(HashState *node, ParallelContext *pcxt); +extern void ExecHashInitializeWorker(HashState *node, ParallelWorkerContext *pwcxt); +extern void ExecHashReInitializeDSM(HashState *node, ParallelContext *pcxt); +extern void ExecHashRetrieveInstrumentation(HashState *node); +extern void ExecShutdownHash(HashState *node); +extern void ExecHashGetInstrumentation(HashInstrumentation *instrument, + HashJoinTable hashtable); -#endif /* NODEHASH_H */ +#endif /* NODEHASH_H */ diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index cedcf547..20c14341 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -2222,6 +2222,29 @@ typedef struct GatherMergeState struct GMReaderTupleBuffer *gm_tuple_buffers; /* tuple buffer per reader */ } GatherMergeState; +/* ---------------- + * Values displayed by EXPLAIN ANALYZE + * ---------------- + */ +typedef struct HashInstrumentation +{ + int nbuckets; /* number of buckets at end of execution */ + int nbuckets_original; /* planned number of buckets */ + int nbatch; /* number of batches at end of execution */ + int nbatch_original; /* planned number of batches */ + size_t space_peak; /* speak memory usage in bytes */ +} HashInstrumentation; + +/* ---------------- + * Shared memory container for per-worker hash information + * ---------------- + */ +typedef struct SharedHashInfo +{ + int num_workers; + HashInstrumentation hinstrument[FLEXIBLE_ARRAY_MEMBER]; +} SharedHashInfo; + /* ---------------- * HashState information * ---------------- @@ -2232,6 +2255,9 @@ typedef struct HashState HashJoinTable hashtable; /* hash table for the hashjoin */ List *hashkeys; /* list of ExprState nodes */ /* hashkeys is same as parent's hj_InnerHashKeys */ + + SharedHashInfo *shared_info; /* one entry per worker */ + HashInstrumentation *hinstrument; /* this worker's entry */ } HashState; /* ---------------- diff --git a/src/test/regress/sql/join.sql b/src/test/regress/sql/join.sql index 16e8dd0b..31475056 100644 --- a/src/test/regress/sql/join.sql +++ b/src/test/regress/sql/join.sql @@ -2004,6 +2004,247 @@ create index idx_nestloop_suppression1_b on nestloop_suppression1(b); analyze nestloop_suppression1; analyze nestloop_suppression2; analyze nestloop_suppression3; +begin; + +set local min_parallel_table_scan_size = 0; +set local parallel_setup_cost = 0; + +-- Extract bucket and batch counts from an explain analyze plan. 
In +-- general we can't make assertions about how many batches (or +-- buckets) will be required because it can vary, but we can in some +-- special cases and we can check for growth. +create or replace function find_hash(node json) +returns json language plpgsql +as +$$ +declare + x json; + child json; +begin + if node->>'Node Type' = 'Hash' then + return node; + else + for child in select json_array_elements(node->'Plans') + loop + x := find_hash(child); + if x is not null then + return x; + end if; + end loop; + return null; + end if; +end; +$$; +create or replace function hash_join_batches(query text) +returns table (original int, final int) language plpgsql +as +$$ +declare + whole_plan json; + hash_node json; +begin + for whole_plan in + execute 'explain (analyze, format ''json'') ' || query + loop + hash_node := find_hash(json_extract_path(whole_plan, '0', 'Plan')); + original := hash_node->>'Original Hash Batches'; + final := hash_node->>'Hash Batches'; + return next; + end loop; +end; +$$; + +-- Make a simple relation with well distributed keys and correctly +-- estimated size. +create table simple as + select generate_series(1, 20000) AS id, 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'; +alter table simple set (parallel_workers = 2); +analyze simple; + +-- Make a relation whose size we will under-estimate. We want stats +-- to say 1000 rows, but actually there are 20,000 rows. +create table bigger_than_it_looks as + select generate_series(1, 20000) as id, 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'; +alter table bigger_than_it_looks set (autovacuum_enabled = 'false'); +alter table bigger_than_it_looks set (parallel_workers = 2); +analyze bigger_than_it_looks; +update pg_class set reltuples = 1000 where relname = 'bigger_than_it_looks'; + +-- Make a relation whose size we underestimate and that also has a +-- kind of skew that breaks our batching scheme. We want stats to say +-- 2 rows, but actually there are 20,000 rows with the same key. 
+create table extremely_skewed (id int, t text); +alter table extremely_skewed set (autovacuum_enabled = 'false'); +alter table extremely_skewed set (parallel_workers = 2); +analyze extremely_skewed; +insert into extremely_skewed + select 42 as id, 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa' + from generate_series(1, 20000); +update pg_class + set reltuples = 2, relpages = pg_relation_size('extremely_skewed') / 8192 + where relname = 'extremely_skewed'; + +-- The "optimal" case: the hash table fits in memory; we plan for 1 +-- batch, we stick to that number, and peak memory usage stays within +-- our work_mem budget + +-- non-parallel +savepoint settings; +set local max_parallel_workers_per_gather = 0; +set local work_mem = '4MB'; +explain (costs off) + select count(*) from simple r join simple s using (id); +select count(*) from simple r join simple s using (id); +select original > 1 as initially_multibatch, final > original as increased_batches + from hash_join_batches( +$$ + select count(*) from simple r join simple s using (id); +$$); +rollback to settings; + +-- parallel with parallel-oblivious hash join +savepoint settings; +set local max_parallel_workers_per_gather = 2; +set local work_mem = '4MB'; +explain (costs off) + select count(*) from simple r join simple s using (id); +select count(*) from simple r join simple s using (id); +select original > 1 as initially_multibatch, final > original as increased_batches + from hash_join_batches( +$$ + select count(*) from simple r join simple s using (id); +$$); +rollback to settings; + +-- The "good" case: batches required, but we plan the right number; we +-- plan for some number of batches, and we stick to that number, and +-- peak memory usage says within our work_mem budget + +-- non-parallel +savepoint settings; +set local max_parallel_workers_per_gather = 0; +set local work_mem = '128kB'; +explain (costs off) + select count(*) from simple r join simple s using (id); +select count(*) from simple r join simple s using (id); +select original > 1 as initially_multibatch, final > original as increased_batches + from hash_join_batches( +$$ + select count(*) from simple r join simple s using (id); +$$); +rollback to settings; + +-- parallel with parallel-oblivious hash join +savepoint settings; +set local max_parallel_workers_per_gather = 2; +set local work_mem = '128kB'; +explain (costs off) + select count(*) from simple r join simple s using (id); +select count(*) from simple r join simple s using (id); +select original > 1 as initially_multibatch, final > original as increased_batches + from hash_join_batches( +$$ + select count(*) from simple r join simple s using (id); +$$); +rollback to settings; + +-- The "bad" case: during execution we need to increase number of +-- batches; in this case we plan for 1 batch, and increase at least a +-- couple of times, and peak memory usage stays within our work_mem +-- budget + +-- non-parallel +savepoint settings; +set local max_parallel_workers_per_gather = 0; +set local work_mem = '128kB'; +explain (costs off) + select count(*) FROM simple r JOIN bigger_than_it_looks s USING (id); +select count(*) FROM simple r JOIN bigger_than_it_looks s USING (id); +select original > 1 as initially_multibatch, final > original as increased_batches + from hash_join_batches( +$$ + select count(*) FROM simple r JOIN bigger_than_it_looks s USING (id); +$$); +rollback to settings; + +-- parallel with parallel-oblivious hash join +savepoint settings; +set local max_parallel_workers_per_gather = 2; +set local work_mem = 
'128kB'; +explain (costs off) + select count(*) from simple r join bigger_than_it_looks s using (id); +select count(*) from simple r join bigger_than_it_looks s using (id); +select original > 1 as initially_multibatch, final > original as increased_batches + from hash_join_batches( +$$ + select count(*) from simple r join bigger_than_it_looks s using (id); +$$); +rollback to settings; + +-- The "ugly" case: increasing the number of batches during execution +-- doesn't help, so stop trying to fit in work_mem and hope for the +-- best; in this case we plan for 1 batch, increases just once and +-- then stop increasing because that didn't help at all, so we blow +-- right through the work_mem budget and hope for the best... + +-- non-parallel +savepoint settings; +set local max_parallel_workers_per_gather = 0; +set local work_mem = '128kB'; +explain (costs off) + select count(*) from simple r join extremely_skewed s using (id); +select count(*) from simple r join extremely_skewed s using (id); +select * from hash_join_batches( +$$ + select count(*) from simple r join extremely_skewed s using (id); +$$); +rollback to settings; + +-- parallel with parallel-oblivious hash join +savepoint settings; +set local max_parallel_workers_per_gather = 2; +set local work_mem = '128kB'; +explain (costs off) + select count(*) from simple r join extremely_skewed s using (id); +select count(*) from simple r join extremely_skewed s using (id); +select * from hash_join_batches( +$$ + select count(*) from simple r join extremely_skewed s using (id); +$$); +rollback to settings; + +-- A couple of other hash join tests unrelated to work_mem management. + +-- Check that EXPLAIN ANALYZE has data even if the leader doesn't participate +savepoint settings; +set local max_parallel_workers_per_gather = 2; +set local work_mem = '4MB'; +set local parallel_leader_participation = off; +select * from hash_join_batches( +$$ + select count(*) from simple r join simple s using (id); +$$); +rollback to settings; + +-- A full outer join where every record is matched. + +-- non-parallel +savepoint settings; +set local max_parallel_workers_per_gather = 0; +explain (costs off) + select count(*) from simple r full outer join simple s using (id); +select count(*) from simple r full outer join simple s using (id); +rollback to settings; + +-- parallelism not possible with parallel-oblivious outer hash join +savepoint settings; +set local max_parallel_workers_per_gather = 2; +explain (costs off) + select count(*) from simple r full outer join simple s using (id); +select count(*) from simple r full outer join simple s using (id); +rollback to settings; + +-- An full outer join where every record is not matched. set enable_hashjoin = false; explain select t3.b from nestloop_suppression1 t1, nestloop_suppression2 t2, nestloop_suppression3 t3 From a8d69a8f0e3dcab441059ba8d3dc94f9e360baf4 Mon Sep 17 00:00:00 2001 From: andrelin Date: Mon, 1 Mar 2021 14:14:23 +0800 Subject: [PATCH 139/578] Fix bug after cherry-pick PG commit 8d4af49f32 --- src/backend/executor/nodeAgg.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/src/backend/executor/nodeAgg.c b/src/backend/executor/nodeAgg.c index 9f5678f5..d36c3df4 100644 --- a/src/backend/executor/nodeAgg.c +++ b/src/backend/executor/nodeAgg.c @@ -550,6 +550,20 @@ typedef struct AggStatePerHashData Agg *aggnode; /* original Agg node, for numGroups etc. 
*/
 } AggStatePerHashData;
 
+#ifdef __TBASE__
+/*
+ * Used in ReDistributeInitializeDSM and ReDistributeInitializeWorker
+ * to distinguish keys between shared memory for parallel and
+ * shared memory for redistributed optimization: for parallel it is
+ * plan_node_id, the same as PG; for redistributed optimization we
+ * use plan_node_id + this macro.
+ *
+ * Note: refer to execParallel.c, only the first 4 bits have been occupied
+ * for specific use, so we have to use an extra bit, but that is fine
+ * since plan_node_id is only a 32-bit integer.
+ */
+#define PARALLEL_REDISTRIBUTE_OFFSET UINT64CONST(0xE800000000000000)
+#endif
 
 static void select_current_set(AggState *aggstate, int setno, bool is_hash);
 static void initialize_phase(AggState *aggstate, int newphase);
@@ -5430,7 +5444,7 @@ ReDistributeInitializeDSM(PlanState *node, ParallelContext *pcxt)
         state->buf[i]->dataType = DT_None;
     }
 
-    shm_toc_insert(pcxt->toc, node->plan->plan_node_id, state);
+    shm_toc_insert(pcxt->toc, node->plan->plan_node_id + PARALLEL_REDISTRIBUTE_OFFSET, state);
 
     *state_ptr = state;
 }
@@ -5444,7 +5458,7 @@ ReDistributeInitializeWorker(PlanState *node, ParallelWorkerContext *pwcxt)
     ReDistributeState *rd_state = NULL;
     volatile ParallelWorkerStatus *numParallelWorkers = NULL;
 
-    state = shm_toc_lookup(toc, node->plan->plan_node_id, false);
+    state = shm_toc_lookup(toc, node->plan->plan_node_id + PARALLEL_REDISTRIBUTE_OFFSET, false);
 
     numParallelWorkers = GetParallelWorkerStatusInfo(toc);
     rd_state = (ReDistributeState *)palloc0(sizeof(ReDistributeState));

From d2d1f1f3ca4c75092c4ef6044029dec1092fdd9a Mon Sep 17 00:00:00 2001
From: andrelin
Date: Fri, 5 Mar 2021 15:04:58 +0800
Subject: [PATCH 140/578] Support sort and hash state, and show info of all
 datanodes if "verbose"

cherry-pick explain analyze tests from V3
---
 src/backend/commands/explain.c              |  31 +
 src/backend/commands/explain_dist.c         | 651 +++++++++++++++++---
 src/backend/executor/execParallel.c         | 227 ++++++-
 src/backend/executor/execProcnode.c         |   3 +
 src/backend/executor/nodeGather.c           |   3 +-
 src/backend/executor/nodeGatherMerge.c      |   4 +-
 src/backend/executor/nodeHash.c             |  23 +
 src/backend/executor/nodeHashjoin.c         |   9 +
 src/backend/executor/nodeSort.c             |   3 +
 src/backend/pgxc/pool/execRemote.c          |  56 +-
 src/backend/utils/cache/lsyscache.c         |  25 +
 src/include/commands/explain_dist.h         |  35 +-
 src/include/executor/instrument.h           |  90 +--
 src/include/nodes/execnodes.h               |   4 +
 src/include/pgxc/execRemote.h               |   2 +-
 src/include/utils/lsyscache.h               | 195 +++---
 src/test/regress/expected/join_3.out        | 440 +++++++++++++
 src/test/regress/expected/tbase_explain.out | 362 +++++++++++
 src/test/regress/parallel_schedule          |   6 +
 src/test/regress/sql/tbase_explain.sql      |  37 ++
 20 files changed, 1932 insertions(+), 274 deletions(-)
 create mode 100644 src/test/regress/expected/tbase_explain.out
 create mode 100644 src/test/regress/sql/tbase_explain.sql

diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c
index 4ec68401..7b56c3d3 100644
--- a/src/backend/commands/explain.c
+++ b/src/backend/commands/explain.c
@@ -47,6 +47,7 @@
 #include "pgxc/execRemote.h"
 #endif
 #ifdef __TBASE__
+#include "commands/explain_dist.h"
 #include "commands/vacuum.h"
 #endif
 
@@ -1492,6 +1493,10 @@ ExplainNode(PlanState *planstate, List *ancestors,
 			ExplainPropertyFloat("Actual Loops", nloops, 0, es);
 		}
 	}
+	else if (es->analyze && planstate->dn_instrument)
+	{
+		ExplainCommonRemoteInstr(planstate, es);
+	}
 	else if (es->analyze)
 	{
 		if (es->format == EXPLAIN_FORMAT_TEXT)
@@ -2608,6 +2613,32 @@ show_sort_info(SortState
*sortstate, ExplainState *es) if (opened_group) ExplainCloseGroup("Workers", "Workers", false, es); } +#ifdef __TBASE__ + else if (sortstate->instrument.spaceType != -1) + { + /* try our cached distributed instrument */ + /* same logic above */ + const char *sortMethod = tuplesort_method_name(sortstate->instrument.sortMethod); + const char *spaceType = tuplesort_space_type_name(sortstate->instrument.spaceType); + long spaceUsed = sortstate->instrument.spaceUsed; + + /* -1 means invalid value, indicate that this node executed by ourself */ + Assert(sortstate->instrument.sortMethod != -1); + + if (es->format == EXPLAIN_FORMAT_TEXT) + { + appendStringInfoSpaces(es->str, es->indent * 2); + appendStringInfo(es->str, "Sort Method: %s %s: %ldkB\n", + sortMethod, spaceType, spaceUsed); + } + else + { + ExplainPropertyText("Sort Method", sortMethod, es); + ExplainPropertyLong("Sort Space Used", spaceUsed, es); + ExplainPropertyText("Sort Space Type", spaceType, es); + } + } +#endif } /* diff --git a/src/backend/commands/explain_dist.c b/src/backend/commands/explain_dist.c index 41b7c5a1..65d9fed8 100644 --- a/src/backend/commands/explain_dist.c +++ b/src/backend/commands/explain_dist.c @@ -20,6 +20,8 @@ #include "libpq/libpq.h" #include "libpq/pqformat.h" #include "nodes/nodeFuncs.h" +#include "utils/lsyscache.h" +#include "utils/tuplesort.h" /* Read instrument field */ #define INSTR_READ_FIELD(fldname) \ @@ -34,6 +36,15 @@ do { \ target->fldname = Max(src->fldname, target->fldname); \ } while(0) +/* Tools for max/min */ +#define SET_MIN_MAX(min, max, tmp) \ +do { \ + if (min > tmp) \ + min = tmp; \ + if (max < tmp) \ + max = tmp; \ +} while(0) + /* Serialize state */ typedef struct { @@ -47,16 +58,16 @@ typedef struct * InstrOut * * Serialize Instrumentation structure with the format - * "nodetype-plan_node_id{val,val,...,val}". + * "nodetype-plan_node_id-node_oid{val,val,...,val}". * * NOTE: The function should be modified if the structure of Instrumentation * or its relevant members has been changed. 
*/ static void -InstrOut(StringInfo buf, Plan *plan, Instrumentation *instr) +InstrOut(StringInfo buf, Plan *plan, Instrumentation *instr, int current_node_id) { /* nodeTag for varify */ - appendStringInfo(buf, "%hd-%d{", nodeTag(plan), plan->plan_node_id); + appendStringInfo(buf, "%hd-%d-%d{", nodeTag(plan), plan->plan_node_id, current_node_id); /* bool */ /* running should be false after InstrEndLoop */ @@ -111,7 +122,7 @@ InstrOut(StringInfo buf, Plan *plan, Instrumentation *instr) appendStringInfo(buf, "%ld,", instr->bufusage.blk_write_time.tv_sec); appendStringInfo(buf, "%ld}", instr->bufusage.blk_write_time.tv_nsec); - elog(DEBUG1, "InstrOut: plan_node_id %d, nloops %.0f", plan->plan_node_id, instr->nloops); + elog(DEBUG1, "InstrOut: plan_node_id %d, node %d, nloops %.0f", plan->plan_node_id, current_node_id, instr->nloops); } /* @@ -174,7 +185,6 @@ SpecInstrOut(StringInfo buf, NodeTag plantag, PlanState *planstate) ((GatherMergeState *) planstate)->nworkers_launched); } break; -#if 0 case T_Sort: { /* according to RemoteSortState and show_sort_info */ @@ -182,39 +192,113 @@ SpecInstrOut(StringInfo buf, NodeTag plantag, PlanState *planstate) if (sortstate->sort_Done && sortstate->tuplesortstate) { - Tuplesortstate *state = (Tuplesortstate *) sortstate->tuplesortstate; - char *sortMethod; - char *spaceType; - long spaceUsed; - - tuplesort_get_stats(state, (const char **) &sortMethod, (const char **) &spaceType, &spaceUsed); - appendStringInfo(buf, "1<%s,%s,%ld>", - sortMethod, spaceType, spaceUsed); + Tuplesortstate *state = (Tuplesortstate *) sortstate->tuplesortstate; + TuplesortInstrumentation stats; + tuplesort_get_stats(state, &stats); + Assert(stats.sortMethod != SORT_TYPE_STILL_IN_PROGRESS); + appendStringInfo(buf, "1<%hd,%hd,%ld>", + stats.sortMethod, stats.spaceType, stats.spaceUsed); + } + else if (sortstate->instrument.sortMethod != -1) + { + Assert(sortstate->instrument.sortMethod != SORT_TYPE_STILL_IN_PROGRESS); + Assert(sortstate->instrument.spaceType != -1); + appendStringInfo(buf, "1<%hd,%hd,%ld>", + sortstate->instrument.sortMethod, + sortstate->instrument.spaceType, + sortstate->instrument.spaceUsed); + } + else + { + appendStringInfo(buf, "0>"); + } + + if (sortstate->shared_info) + { + int n; + appendStringInfo(buf, "%d>", sortstate->shared_info->num_workers); + for (n = 0; n < sortstate->shared_info->num_workers; n++) + { + TuplesortInstrumentation *w_stats; + w_stats = &sortstate->shared_info->sinstrument[n]; + if (w_stats->sortMethod == SORT_TYPE_STILL_IN_PROGRESS) + { + appendStringInfo(buf, "0>"); + } + else + appendStringInfo(buf, "%hd,%hd,%ld>", + w_stats->sortMethod, + w_stats->spaceType, w_stats->spaceUsed); + elog(DEBUG1, "send out parallel sort %d info: %d %d %ld", + planstate->plan->plan_node_id, + w_stats->sortMethod, + w_stats->spaceType, + w_stats->spaceUsed); + } } else appendStringInfo(buf, "0>"); } break; - case T_Hash: { - /* according to RemoteHashState and show_hash_info */ + /* according to show_hash_info */ HashState *hashstate = castNode(HashState, planstate); HashJoinTable hashtable = hashstate->hashtable; + int nbuckets = 0; + int nbuckets_original = 0; + int nbatch = 0; + int nbatch_original = 0; + Size spacePeak = 0; + bool valid = true; + if (hashtable) { - hashtable->nbuckets = 0; + nbuckets = hashtable->nbuckets; + nbuckets_original = hashtable->nbuckets_original; + nbatch = hashtable->nbatch; + nbatch_original = hashtable->nbatch_original; + spacePeak = hashtable->spacePeak; + } + else if (hashstate->shared_info) + { + int n; + for 
(n = 0; n < hashstate->shared_info->num_workers; n++) + { + HashInstrumentation *w_stats = &hashstate->shared_info->hinstrument[n]; + /* Find the first worker that built a hash table. same logic in show_hash_info */ + if (w_stats->nbatch > 0) + { + nbuckets = w_stats->nbuckets; + nbuckets_original = w_stats->nbuckets_original; + nbatch = w_stats->nbatch; + nbatch_original = w_stats->nbatch_original; + spacePeak = w_stats->space_peak; + break; + } + } + } + else + { + Assert(hashstate->hinstrument == NULL); + valid = false; + } + + if (valid) + { + elog(DEBUG1, "send out hash %d peak %zu", planstate->plan->plan_node_id, + spacePeak); appendStringInfo(buf, "1<%d,%d,%d,%d,%ld>", - hashtable->nbuckets, hashtable->nbuckets_original, - hashtable->nbatch, hashtable->nbatch_original, - (hashtable->spacePeak + 1023) / 1024); + nbuckets, nbuckets_original, + nbatch, nbatch_original, + spacePeak); } else appendStringInfo(buf, "0>"); } break; -#endif + default: break; } @@ -238,7 +322,9 @@ InstrIn(StringInfo str, RemoteInstr *rinstr) /* verify nodetype and plan_node_id */ rinstr->nodeTag = strtol(tmp_head, &tmp_pos, 0); tmp_head = tmp_pos + 1; - rinstr->id = (int) strtol(tmp_head, &tmp_pos, 0); + rinstr->key.plan_node_id = (int) strtol(tmp_head, &tmp_pos, 0); + tmp_head = tmp_pos + 1; + rinstr->key.node_id = strtol(tmp_head, &tmp_pos, 0); tmp_head = tmp_pos + 1; /* read values */ @@ -291,7 +377,7 @@ InstrIn(StringInfo str, RemoteInstr *rinstr) INSTR_READ_FIELD(bufusage.blk_write_time.tv_sec); INSTR_READ_FIELD(bufusage.blk_write_time.tv_nsec); - elog(DEBUG1, "InstrIn: plan_node_id %d, nloops %.0f", rinstr->id, instr->nloops); + elog(DEBUG1, "InstrIn: plan_node_id %d, node %d, nloops %.0f", rinstr->key.plan_node_id, rinstr->key.node_id, instr->nloops); /* tmp_head points to next instrument's nodetype or '\0' already */ str->cursor = tmp_head - &str->data[0]; @@ -303,75 +389,67 @@ InstrIn(StringInfo str, RemoteInstr *rinstr) * DeSerialize of specific instrument info of current node. 
*/ static void -SpecInstrIn(StringInfo str, RemoteInstr *rinstr) +SpecInstrIn(StringInfo str, RemoteInstr *instr) { char *tmp_pos; char *tmp_head = &str->data[str->cursor]; - switch(rinstr->nodeTag) + switch(instr->nodeTag) { case T_Gather: case T_GatherMerge: { - rinstr->nworkers_launched = (int) strtod(tmp_head, &tmp_pos); - tmp_head = tmp_pos + 1; + INSTR_READ_FIELD(nworkers_launched); } break; -#if 0 case T_Sort: { - RemoteSortState *instr = (RemoteSortState *)palloc0( - sizeof(RemoteSortState)); /* either stat or w_stat is valid */ - INSTR_READ_FIELD(rs.isvalid); - if (instr->rs.isvalid) + bool isvalid = (bool) strtod(tmp_head, &tmp_pos); + tmp_head = tmp_pos + 1; + + if (isvalid) { - INSTR_READ_FIELD(stat.sortMethod); - INSTR_READ_FIELD(stat.spaceType); - INSTR_READ_FIELD(stat.spaceUsed); + INSTR_READ_FIELD(sort_stat.sortMethod); + INSTR_READ_FIELD(sort_stat.spaceType); + INSTR_READ_FIELD(sort_stat.spaceUsed); + Assert(instr->sort_stat.sortMethod != SORT_TYPE_STILL_IN_PROGRESS); } - INSTR_READ_FIELD(rs.num_workers); - if (instr->rs.num_workers > 0) + INSTR_READ_FIELD(nworkers_launched); + if (instr->nworkers_launched > 0) { int n; - Size size; - - size = mul_size(sizeof(TuplesortInstrumentation), - instr->rs.num_workers); - instr->w_stats = (TuplesortInstrumentation *)palloc0(size); + instr->w_sort_stats = (TuplesortInstrumentation *) palloc0(instr->nworkers_launched * sizeof(TuplesortInstrumentation)); - for (n = 0; n < instr->rs.num_workers; n++) + for (n = 0; n < instr->nworkers_launched; n++) { - INSTR_READ_FIELD(w_stats[n].sortMethod); - if (instr->w_stats[n].sortMethod != SORT_TYPE_STILL_IN_PROGRESS) + INSTR_READ_FIELD(w_sort_stats[n].sortMethod); + if (instr->w_sort_stats[n].sortMethod != SORT_TYPE_STILL_IN_PROGRESS) { - INSTR_READ_FIELD(w_stats[n].spaceType); - INSTR_READ_FIELD(w_stats[n].spaceUsed); + INSTR_READ_FIELD(w_sort_stats[n].spaceType); + INSTR_READ_FIELD(w_sort_stats[n].spaceUsed); } } } - remote_instr->state = (RemoteState *) instr; } break; - case T_Hash: { - RemoteHashState *instr = (RemoteHashState *)palloc0( - sizeof(RemoteHashState)); - INSTR_READ_FIELD(rs.isvalid); - if (instr->rs.isvalid) + bool isvalid = (bool) strtod(tmp_head, &tmp_pos); + tmp_head = tmp_pos + 1; + + if (isvalid) { - INSTR_READ_FIELD(nbuckets); - INSTR_READ_FIELD(nbuckets_original); - INSTR_READ_FIELD(nbatch); - INSTR_READ_FIELD(nbatch_original); - INSTR_READ_FIELD(spacePeakKb); + INSTR_READ_FIELD(hash_stat.nbuckets); + INSTR_READ_FIELD(hash_stat.nbuckets_original); + INSTR_READ_FIELD(hash_stat.nbatch); + INSTR_READ_FIELD(hash_stat.nbatch_original); + INSTR_READ_FIELD(hash_stat.space_peak); } - remote_instr->state = (RemoteState *) instr; } break; -#endif + default: break; } @@ -406,9 +484,33 @@ SerializeLocalInstr(PlanState *planstate, SerializeState *ss) { /* clean up the instrumentation state as in ExplainNode */ InstrEndLoop(planstate->instrument); - InstrOut(&ss->buf, planstate->plan, planstate->instrument); - //WorkerInstrOut(&ss->buf, planstate->worker_instrument); - SpecInstrOut(&ss->buf, nodeTag(planstate->plan), planstate); + if (planstate->dn_instrument) + { + /* re-send our received remote instr to upstream. 
*/ + int n; + for (n = 0; n < planstate->dn_instrument->nnode; n++) + { + Instrumentation *instrument = &(planstate->dn_instrument->instrument[n].instr); + int node_id = planstate->dn_instrument->instrument[n].nodeid; + + /* instrument valid only if node_oid set */ + if (node_id != 0) + { + InstrOut(&ss->buf, planstate->plan, instrument, node_id); + SpecInstrOut(&ss->buf, nodeTag(planstate->plan), planstate); + } + else + { + elog(DEBUG1, "can't send instr out plan_node_id %d not attached", plan_node_id); + } + } + } + else + { + /* send our own instr */ + InstrOut(&ss->buf, planstate->plan, planstate->instrument, 0); + SpecInstrOut(&ss->buf, nodeTag(planstate->plan), planstate); + } } else { @@ -439,6 +541,89 @@ SendLocalInstr(PlanState *planstate) pq_flush(); } +static void +combineSpecRemoteInstr(RemoteInstr *rtarget, RemoteInstr *rsrc) +{ + int i; + /* specific instrument */ + switch (rsrc->nodeTag) + { + case T_Gather: + case T_GatherMerge: + { + rtarget->nworkers_launched = Max(rtarget->nworkers_launched, rsrc->nworkers_launched); + } + break; + case T_Sort: + { + if (rsrc->sort_stat.sortMethod != SORT_TYPE_STILL_IN_PROGRESS && + rsrc->sort_stat.sortMethod != -1) + { + /* TODO: figure out which sortMethod is worse */ + rtarget->sort_stat.sortMethod = rsrc->sort_stat.sortMethod; + if (rtarget->sort_stat.spaceType == rsrc->sort_stat.spaceType) + { + /* same space type, just compare space used */ + rtarget->sort_stat.spaceUsed = Max(rtarget->sort_stat.spaceUsed, rsrc->sort_stat.spaceUsed); + } + else if (rtarget->sort_stat.spaceType > rsrc->sort_stat.spaceType) + { + /* invalid > memory > disk */ + rtarget->sort_stat.spaceType = rsrc->sort_stat.spaceType; + rtarget->sort_stat.spaceUsed = rsrc->sort_stat.spaceUsed; + } + } + + rtarget->nworkers_launched = Max(rtarget->nworkers_launched, rsrc->nworkers_launched); + if (rtarget->w_sort_stats == NULL) + { + rtarget->w_sort_stats = palloc0(rtarget->nworkers_launched * sizeof(TuplesortInstrumentation)); + for (i = 0; i < rtarget->nworkers_launched; i++) + rtarget->w_sort_stats[i].spaceType = -1; + } + for (i = 0; i < rtarget->nworkers_launched; i++) + { + if (rsrc->w_sort_stats[i].sortMethod == SORT_TYPE_STILL_IN_PROGRESS || + rsrc->w_sort_stats[i].sortMethod == -1) + continue; + + /* same logic above */ + /* TODO: figure out which sortMethod is worse */ + rtarget->w_sort_stats[i].sortMethod = rsrc->w_sort_stats[i].sortMethod; + if (rtarget->w_sort_stats[i].spaceType == rsrc->w_sort_stats[i].spaceType) + { + /* same space type, just compare space used */ + rtarget->w_sort_stats[i].spaceUsed = Max(rtarget->w_sort_stats[i].spaceUsed, rsrc->w_sort_stats[i].spaceUsed); + } + else if (rtarget->w_sort_stats[i].spaceType > rsrc->w_sort_stats[i].spaceType) + { + /* invalid > memory > disk */ + rtarget->w_sort_stats[i].spaceType = rsrc->w_sort_stats[i].spaceType; + rtarget->w_sort_stats[i].spaceUsed = rsrc->w_sort_stats[i].spaceUsed; + } + + elog(DEBUG1, "combine parallel plan %d sort state %d %d %ld", + rtarget->key.plan_node_id, + rtarget->w_sort_stats[i].sortMethod, + rtarget->w_sort_stats[i].spaceType, + rtarget->w_sort_stats[i].spaceUsed); + } + } + break; + case T_Hash: + { + rtarget->hash_stat.nbuckets = Max(rtarget->hash_stat.nbuckets, rsrc->hash_stat.nbuckets); + rtarget->hash_stat.nbuckets_original = Max(rtarget->hash_stat.nbuckets_original, rsrc->hash_stat.nbuckets_original); + rtarget->hash_stat.nbatch = Max(rtarget->hash_stat.nbatch, rsrc->hash_stat.nbatch); + rtarget->hash_stat.nbatch_original = Max(rtarget->hash_stat.nbatch_original, 
rsrc->hash_stat.nbatch_original); + rtarget->hash_stat.space_peak = Max(rtarget->hash_stat.space_peak, rsrc->hash_stat.space_peak); + } + break; + default: + break; + } +} + /* * combineRemoteInstr * @@ -451,9 +636,11 @@ combineRemoteInstr(RemoteInstr *rtarget, RemoteInstr *rsrc) Instrumentation *target = &rtarget->instr; Instrumentation *src = &rsrc->instr; - Assert(rtarget->id == rsrc->id); + Assert(rtarget->key.node_id == rsrc->key.node_id); + Assert(rtarget->key.plan_node_id == rsrc->key.plan_node_id); Assert(rtarget->nodeTag == rsrc->nodeTag); + /* regular instrument */ INSTR_MAX_FIELD(need_timer); INSTR_MAX_FIELD(need_bufusage); INSTR_MAX_FIELD(running); @@ -503,7 +690,7 @@ combineRemoteInstr(RemoteInstr *rtarget, RemoteInstr *rsrc) INSTR_MAX_FIELD(bufusage.blk_write_time.tv_sec); INSTR_MAX_FIELD(bufusage.blk_write_time.tv_nsec); - rtarget->nworkers_launched = Max(rtarget->nworkers_launched, rsrc->nworkers_launched); + combineSpecRemoteInstr(rtarget, rsrc); } /* @@ -512,28 +699,38 @@ combineRemoteInstr(RemoteInstr *rtarget, RemoteInstr *rsrc) * Handle remote instrument message and save it by plan_node_id. */ void -HandleRemoteInstr(char *msg_body, size_t len, int nodeoid, ResponseCombiner *combiner) +HandleRemoteInstr(char *msg_body, size_t len, int nodeid, ResponseCombiner *combiner) { RemoteInstr recv_instr; StringInfo recv_str; bool found; RemoteInstr *cur_instr; + /* must doing this under per query context */ + MemoryContext oldcontext = MemoryContextSwitchTo(combiner->ss.ps.state->es_query_cxt); + if (combiner->recv_instr_htbl == NULL) { elog(ERROR, "combiner is not prepared for instrumentation"); } - elog(DEBUG1, "Handle remote instrument: nodeoid %d", nodeoid); + elog(DEBUG1, "Handle remote instrument: nodeid %d", nodeid); recv_str = makeStringInfo(); appendBinaryStringInfo(recv_str, msg_body, len); while(recv_str->cursor < recv_str->len) { + memset(&recv_instr, 0, sizeof(RemoteInstr)); + recv_instr.sort_stat.sortMethod = -1; + recv_instr.sort_stat.spaceType = -1; InstrIn(recv_str, &recv_instr); SpecInstrIn(recv_str, &recv_instr); + + if (recv_instr.key.node_id == 0) + recv_instr.key.node_id = nodeid; + cur_instr = (RemoteInstr *) hash_search(combiner->recv_instr_htbl, - (void *) &recv_instr.id, + (void *) &recv_instr.key, HASH_ENTER, &found); if (found) { @@ -541,9 +738,21 @@ HandleRemoteInstr(char *msg_body, size_t len, int nodeoid, ResponseCombiner *com } else { + elog(DEBUG1, "remote instr hashtable enter plan_node_id %d node %d", + recv_instr.key.plan_node_id, recv_instr.key.node_id); + memcpy(cur_instr, &recv_instr, sizeof(RemoteInstr)); + if (recv_instr.nodeTag == T_Sort && recv_instr.nworkers_launched > 0) + { + Size size = sizeof(TuplesortInstrumentation) * recv_instr.nworkers_launched; + + cur_instr->w_sort_stats = palloc(size); + memcpy(cur_instr->w_sort_stats, recv_instr.w_sort_stats, size); + } } } + + MemoryContextSwitchTo(oldcontext); } /* @@ -552,23 +761,83 @@ HandleRemoteInstr(char *msg_body, size_t len, int nodeoid, ResponseCombiner *com * Attach specific information in planstate. 
*/ static void -attachRemoteSpecialInstr(PlanState *planstate, RemoteInstr *rinstr) +attachRemoteSpecificInstr(PlanState *planstate, RemoteInstr *rinstr) { int nodeTag = nodeTag(planstate->plan); + int nworkers = rinstr->nworkers_launched; switch(nodeTag) { case T_Gather: + { + GatherState *gs = (GatherState *) planstate; + gs->nworkers_launched = nworkers; + } + break; + case T_GatherMerge: + { + GatherMergeState *gms = (GatherMergeState *) planstate; + gms->nworkers_launched = nworkers; + } + break; + case T_Sort: + { + SortState *ss = (SortState *) planstate; + ss->instrument.sortMethod = rinstr->sort_stat.sortMethod; + ss->instrument.spaceType = rinstr->sort_stat.spaceType; + ss->instrument.spaceUsed = rinstr->sort_stat.spaceUsed; + elog(DEBUG1, "attach sort nworkers %d", nworkers); + + if (nworkers > 0) { - GatherState *gs = (GatherState *) planstate; - gs->nworkers_launched = rinstr->nworkers_launched; + int i; + if (ss->shared_info == NULL) + { + Size size = offsetof(SharedSortInfo, sinstrument) + + nworkers * sizeof(TuplesortInstrumentation); + ss->shared_info = palloc0(size); + } + + ss->shared_info->num_workers = nworkers; + for (i = 0; i < nworkers; i++) + { + ss->shared_info->sinstrument[i].sortMethod = rinstr->w_sort_stats[i].sortMethod; + ss->shared_info->sinstrument[i].spaceType = rinstr->w_sort_stats[i].spaceType; + ss->shared_info->sinstrument[i].spaceUsed = rinstr->w_sort_stats[i].spaceUsed; + elog(DEBUG1, "attach parallel sort %d, info: %d %d %ld", + planstate->plan->plan_node_id, + ss->shared_info->sinstrument[i].sortMethod, + ss->shared_info->sinstrument[i].spaceType, + ss->shared_info->sinstrument[i].spaceUsed); + } } + } break; - case T_GatherMerge: + case T_Hash: + { + HashState *hs = (HashState *) planstate; + if (IsParallelWorker()) + { + Assert(hs->hinstrument != NULL); + Assert(hs->shared_info != NULL); + Assert(hs->hashtable == NULL); + /* copy into first instrument */ + memcpy(&hs->shared_info->hinstrument[0], &rinstr->hash_stat, sizeof(HashInstrumentation)); + elog(DEBUG1, "parallel worker attach hash state plan %d peak %zu", + planstate->plan->plan_node_id, hs->hinstrument->space_peak); + } + else { - GatherMergeState *gms = (GatherMergeState *) planstate; - gms->nworkers_launched = rinstr->nworkers_launched; + if (hs->hashtable == NULL) + hs->hashtable = palloc(sizeof(HashJoinTableData)); + + hs->hashtable->nbuckets = rinstr->hash_stat.nbuckets; + hs->hashtable->nbuckets_original = rinstr->hash_stat.nbuckets_original; + hs->hashtable->nbatch = rinstr->hash_stat.nbatch; + hs->hashtable->nbatch_original = rinstr->hash_stat.nbatch_original; + hs->hashtable->spacePeak = rinstr->hash_stat.space_peak; } + } break; default: break; @@ -581,43 +850,81 @@ attachRemoteSpecialInstr(PlanState *planstate, RemoteInstr *rinstr) * Attach instrument information in planstate from saved info in combiner. 
*/ bool -AttachRemoteInstr(PlanState *planstate, ResponseCombiner *combiner) +AttachRemoteInstr(PlanState *planstate, AttachRemoteInstrContext *ctx) { int plan_node_id = planstate->plan->plan_node_id; - if (bms_is_member(plan_node_id, combiner->printed_nodes)) + + if (bms_is_member(plan_node_id, ctx->printed_nodes)) return false; else - combiner->printed_nodes = bms_add_member(combiner->printed_nodes, plan_node_id); + ctx->printed_nodes = bms_add_member(ctx->printed_nodes, plan_node_id); - if (IsA(planstate, RemoteSubplanState) && NULL == planstate->lefttree) + if (IsA(planstate, RemoteSubplanState) && planstate->lefttree == NULL) { - Plan *plan = planstate->plan; - PlanState *remote_ps; - EState *estate = planstate->state; - - remote_ps = ExecInitNode(plan->lefttree, estate, EXEC_FLAG_EXPLAIN_ONLY); - planstate->lefttree = remote_ps; + /* subplan could be here, init it's child too */ + planstate->lefttree = ExecInitNode(planstate->plan->lefttree, + planstate->state, + EXEC_FLAG_EXPLAIN_ONLY); } if (planstate->instrument) { - bool found; - RemoteInstr *rinstr= (RemoteInstr *) hash_search(combiner->recv_instr_htbl, - (void *) &plan_node_id, - HASH_FIND, &found); - if (!found) + RemoteInstrKey key; + bool found; + RemoteInstr *rinstr; + RemoteInstr rinstr_final; /* for specific instrument */ + bool spec_need_attach = false; + ListCell *lc; + + int n = 0; + int nnode = list_length(ctx->node_idx_List); + + key.plan_node_id = plan_node_id; + memset(&rinstr_final, 0, sizeof(RemoteInstr)); + rinstr_final.sort_stat.sortMethod = -1; + rinstr_final.sort_stat.spaceType = -1; + + /* This is for non-parallel case. If parallel, we init dn_instrument in dsm. */ + if (planstate->dn_instrument == NULL) { - elog(DEBUG1, "AttachRemoteInstr: remote instrumentation not found, tag %d id %d", - nodeTag(planstate->plan), plan_node_id); + Size size = offsetof(DatanodeInstrumentation, instrument) + + mul_size(nnode, sizeof(RemoteInstrumentation)); + Assert(!IsParallelWorker()); + planstate->dn_instrument = palloc0(size); + planstate->dn_instrument->nnode = nnode; } - else + + foreach(lc, ctx->node_idx_List) { - Assert(rinstr->nodeTag == nodeTag(planstate->plan)); - Assert(rinstr->id == plan_node_id); + key.node_id = get_pgxc_node_id(get_nodeoid_from_nodeid(lfirst_int(lc), PGXC_NODE_DATANODE)); + elog(DEBUG1, "attach node %d, plan_node_id %d", key.node_id, key.plan_node_id); + rinstr = (RemoteInstr *) hash_search(ctx->htab, + (void *) &key, + HASH_FIND, &found); - memcpy(planstate->instrument, &rinstr->instr, sizeof(Instrumentation)); - attachRemoteSpecialInstr(planstate, rinstr); + if (found) + { + Assert(rinstr->nodeTag == nodeTag(planstate->plan)); + Assert(rinstr->key.plan_node_id == plan_node_id); + + elog(DEBUG1, "instr attach plan_node_id %d node %d index %d", plan_node_id, key.node_id, n); + planstate->dn_instrument->instrument[n].nodeid = key.node_id; + memcpy(&planstate->dn_instrument->instrument[n].instr, &rinstr->instr, sizeof(Instrumentation)); + /* TODO attach all nodes' remote specific instr */ + rinstr_final.nodeTag = rinstr->nodeTag; + rinstr_final.key = rinstr->key; + combineSpecRemoteInstr(&rinstr_final, rinstr); + spec_need_attach = true; + } + else + { + elog(DEBUG1, "failed to find remote instr of plan_node_id %d node %d", plan_node_id, key.node_id); + } + n++; } + /* TODO attach all nodes' remote specific instr */ + if (spec_need_attach) + attachRemoteSpecificInstr(planstate, &rinstr_final); } else { @@ -626,5 +933,157 @@ AttachRemoteInstr(PlanState *planstate, ResponseCombiner *combiner) 
nodeTag(planstate), plan_node_id); } - return planstate_tree_walker(planstate, AttachRemoteInstr, combiner); + return planstate_tree_walker(planstate, AttachRemoteInstr, ctx); +} + +/* + * ExplainCommonRemoteInstr + * + * Explain remote instruments for common info of current node. + */ +void +ExplainCommonRemoteInstr(PlanState *planstate, ExplainState *es) +{ + int i; + int nnode = planstate->dn_instrument->nnode; + + RemoteInstrumentation *rinstr = planstate->dn_instrument->instrument; + /* for min/max display */ + double nloops_min, nloops_max, nloops; + double startup_sec_min, startup_sec_max, startup_sec; + double total_sec_min, total_sec_max, total_sec; + double rows_min, rows_max, rows; + /* for verbose */ + StringInfoData buf; + + if (es->format == EXPLAIN_FORMAT_TEXT) + { + appendStringInfoChar(es->str, '\n'); + appendStringInfoSpaces(es->str, es->indent * 2); + } + + /* give min max a startup value */ + for (i = 0; i < nnode; i++) + { + Instrumentation *instr = &rinstr[i].instr; + if (instr->nloops != 0) + { + nloops_min = nloops_max = instr->nloops; + startup_sec_min = startup_sec_max = 1000.0 * instr->startup / nloops_min; + total_sec_min = total_sec_max = 1000.0 * instr->total / nloops_min; + rows_min = rows_max = instr->ntuples / nloops_min; + break; + } + } + if (i == nnode) + { + appendStringInfo(es->str, "DN (never executed)"); + return; + } + + if (es->verbose) + initStringInfo(&buf); + + for (i = 0; i < nnode; i++) + { + Instrumentation *instr = &rinstr[i].instr; + int node_id = rinstr[i].nodeid; + char *dnname; + + if (node_id == 0) + continue; + + dnname = get_pgxc_nodename_from_identifier(node_id); + nloops = instr->nloops; + startup_sec = 1000.0 * instr->startup / nloops; + total_sec = 1000.0 * instr->total / nloops; + rows = instr->ntuples / nloops; + + SET_MIN_MAX(nloops_min, nloops_max, nloops); + SET_MIN_MAX(startup_sec_min, startup_sec_max, startup_sec); + SET_MIN_MAX(total_sec_min, total_sec_max, total_sec); + SET_MIN_MAX(rows_min, rows_max, rows); + + /* one line for each dn if verbose */ + if (es->verbose) + { + if (es->format == EXPLAIN_FORMAT_TEXT) + { + appendStringInfoChar(&buf, '\n'); + appendStringInfoSpaces(&buf, es->indent * 2); + if (nloops <= 0) + { + appendStringInfo(&buf, "- %s (never executed)", dnname); + } + else + { + if (es->timing) + appendStringInfo(&buf, + "- %s (actual time=%.3f..%.3f rows=%.0f loops=%.0f)", + dnname, startup_sec, total_sec, rows, nloops); + else + appendStringInfo(&buf, + "- %s (actual rows=%.0f loops=%.0f)", + dnname, rows, nloops); + } + } + else + { + ExplainPropertyText("Data Node", dnname, es); + if (es->timing) + { + ExplainPropertyFloat("Actual Startup Time", startup_sec, 3, es); + ExplainPropertyFloat("Actual Total Time", total_sec, 3, es); + } + ExplainPropertyFloat("Actual Rows", rows, 0, es); + ExplainPropertyFloat("Actual Loops", nloops, 0, es); + } + } + } + + if (es->format == EXPLAIN_FORMAT_TEXT) + { + bool show_verbose = true; + + if (nloops_max <= 0) + { + show_verbose = false; + appendStringInfo(es->str, "DN (never executed)"); + } + else + { + if (es->timing) + appendStringInfo(es->str, + "DN (actual startup time=%.3f..%.3f total time=%.3f..%.3f rows=%.0f..%.0f loops=%.0f..%.0f)", + startup_sec_min, startup_sec_max, + total_sec_min, total_sec_max, rows_min, rows_max, + nloops_min, nloops_max); + else + appendStringInfo(es->str, + "DN (actual rows=%.0f..%.0f loops=%.0f..%.0f)", + rows_min, rows_max, nloops_min, nloops_max); + } + + if (es->verbose) + { + if (show_verbose) + appendStringInfo(es->str, 
"%s", buf.data); + pfree(buf.data); + } + } + else + { + ExplainPropertyText("Data Node", "ALL", es); + if (es->timing) + { + ExplainPropertyFloat("Actual Min Startup Time", startup_sec_min, 3, es); + ExplainPropertyFloat("Actual Max Startup Time", startup_sec_max, 3, es); + ExplainPropertyFloat("Actual Min Total Time", total_sec_min, 3, es); + ExplainPropertyFloat("Actual Max Total Time", total_sec_max, 3, es); + } + ExplainPropertyFloat("Actual Min Rows", rows_min, 0, es); + ExplainPropertyFloat("Actual Max Rows", rows_max, 0, es); + ExplainPropertyFloat("Actual Min Loops", nloops_min, 0, es); + ExplainPropertyFloat("Actual Max Loops", nloops_max, 0, es); + } } diff --git a/src/backend/executor/execParallel.c b/src/backend/executor/execParallel.c index 7aa46865..db2cdf60 100644 --- a/src/backend/executor/execParallel.c +++ b/src/backend/executor/execParallel.c @@ -82,6 +82,7 @@ #define PARALLEL_KEY_EXEC_ERROR UINT64CONST(0xE0000000000000B1) #define PARALLEL_KEY_EXEC_DONE UINT64CONST(0xE0000000000000B2) +#define PARALLEL_REMOTEINSTR_OFFSET UINT64CONST(0xEC00000000000000) #endif #define PARALLEL_TUPLE_QUEUE_SIZE 65536 @@ -137,6 +138,15 @@ typedef struct ExecParallelInitializeDSMContext int nnodes; } ExecParallelInitializeDSMContext; +#ifdef __TBASE__ +/* Context object for ExecParallelInitializeRemoteInstr. */ +typedef struct ExecParallelRemoteInstrContext +{ + ParallelContext *pcxt; + int ndatanode; +} ExecParallelRemoteInstrContext; +#endif + /* Helper functions that run in the parallel leader. */ static char *ExecSerializePlan(Plan *plan, EState *estate); static bool ExecParallelEstimate(PlanState *node, @@ -149,7 +159,13 @@ static bool ExecParallelReInitializeDSM(PlanState *planstate, ParallelContext *pcxt); static bool ExecParallelRetrieveInstrumentation(PlanState *planstate, SharedExecutorInstrumentation *instrumentation); - +#ifdef __TBASE__ +static bool ExecParallelEstimateRemoteInstr(PlanState *planstate, + ExecParallelRemoteInstrContext *ri); +static bool ExecParallelInitRemoteInstrDSM(PlanState *planstate, + ExecParallelRemoteInstrContext *ri); +static bool ExecInitializeWorkerRemoteInstr(PlanState *planstate, ParallelWorkerContext *pcxt); +#endif /* Helper function that runs in the parallel worker. */ static DestReceiver *ExecParallelGetReceiver(dsm_segment *seg, shm_toc *toc); @@ -241,26 +257,16 @@ ExecSerializePlan(Plan *plan, EState *estate) */ static bool ExecParallelEstimate(PlanState *planstate, ExecParallelEstimateContext *e) -{// #lizard forgives +{ +#ifdef __TBASE__ + int previous_nworkers; +#endif if (planstate == NULL) return false; /* Count this node. */ e->nnodes++; - /* - * if we are running with instrument option, must init - * full plantree here, to ensure e->nnodes correct. - */ - if (planstate->instrument && - IsA(planstate, RemoteSubplanState) && - NULL == planstate->lefttree) - { - planstate->lefttree = ExecInitNode(planstate->plan->lefttree, - planstate->state, - EXEC_FLAG_EXPLAIN_ONLY); - } - switch (nodeTag(planstate)) { case T_SeqScanState: @@ -306,7 +312,27 @@ ExecParallelEstimate(PlanState *planstate, ExecParallelEstimateContext *e) break; /* For remote query and remote subplan, there is no need for shared storage. */ case T_RemoteQueryState: + break; case T_RemoteSubplanState: + /* + * If we are running with instrument option, must init full plantree here, + * to ensure e->nnodes correct. Further, we estimate per node instrument + * for remote instrumentation. 
+ */ + if (planstate->instrument && NULL == planstate->lefttree) + { + ExecParallelRemoteInstrContext ri; + RemoteSubplanState *node = (RemoteSubplanState *) planstate; + + ri.ndatanode = list_length(((RemoteSubplan *)planstate->plan)->nodeList); + ri.pcxt = e->pcxt; + + planstate->lefttree = ExecInitNode(planstate->plan->lefttree, + planstate->state, + EXEC_FLAG_EXPLAIN_ONLY); + planstate_tree_walker(planstate, ExecParallelEstimateRemoteInstr, &ri); + node->combiner.remote_parallel_estimated = true; + } break; case T_HashJoinState: if (planstate->plan->parallel_aware) @@ -322,12 +348,24 @@ ExecParallelEstimate(PlanState *planstate, ExecParallelEstimateContext *e) ReDistributeEstimate(planstate, e->pcxt); } break; + case T_GatherState: + previous_nworkers = e->pcxt->nworkers; + e->pcxt->nworkers = ((Gather *) planstate->plan)->num_workers; #endif default: break; } +#ifdef __TBASE__ + planstate_tree_walker(planstate, ExecParallelEstimate, e); + + if (IsA(planstate, GatherState)) + e->pcxt->nworkers = previous_nworkers; + + return false; +#else return planstate_tree_walker(planstate, ExecParallelEstimate, e); +#endif } /* @@ -337,7 +375,10 @@ ExecParallelEstimate(PlanState *planstate, ExecParallelEstimateContext *e) static bool ExecParallelInitializeDSM(PlanState *planstate, ExecParallelInitializeDSMContext *d) -{// #lizard forgives +{ +#ifdef __TBASE__ + int previous_nworkers; +#endif if (planstate == NULL) return false; @@ -407,9 +448,21 @@ ExecParallelInitializeDSM(PlanState *planstate, d->pcxt); break; case T_RemoteSubplanState: + { + RemoteSubplanState *node = (RemoteSubplanState *) planstate; + if (node->combiner.remote_parallel_estimated) + { + ExecParallelRemoteInstrContext ri; + + ri.ndatanode = list_length(((RemoteSubplan *)planstate->plan)->nodeList); + ri.pcxt = d->pcxt; + + planstate_tree_walker(planstate, ExecParallelInitRemoteInstrDSM, &ri); + } if (planstate->plan->parallel_aware) ExecRemoteSubPlanInitializeDSM((RemoteSubplanState *)planstate, d->pcxt); + } break; case T_HashJoinState: if (planstate->plan->parallel_aware) @@ -425,12 +478,24 @@ ExecParallelInitializeDSM(PlanState *planstate, ReDistributeInitializeDSM(planstate, d->pcxt); } break; + case T_GatherState: + previous_nworkers = d->pcxt->nworkers; + d->pcxt->nworkers = ((Gather *) planstate->plan)->num_workers; #endif default: break; } +#ifdef __TBASE__ + planstate_tree_walker(planstate, ExecParallelInitializeDSM, d); + + if (IsA(planstate, GatherState)) + d->pcxt->nworkers = previous_nworkers; + + return false; +#else return planstate_tree_walker(planstate, ExecParallelInitializeDSM, d); +#endif } /* @@ -1002,7 +1067,9 @@ ExecParallelRetrieveInstrumentation(PlanState *planstate, ibytes = mul_size(instrumentation->num_workers, sizeof(Instrumentation)); planstate->worker_instrument = palloc(ibytes + offsetof(WorkerInstrumentation, instrument)); +#ifndef __TBASE__ MemoryContextSwitchTo(oldcontext); +#endif planstate->worker_instrument->num_workers = instrumentation->num_workers; memcpy(&planstate->worker_instrument->instrument, instrument, ibytes); @@ -1019,6 +1086,26 @@ ExecParallelRetrieveInstrumentation(PlanState *planstate, default: break; } +#ifdef __TBASE__ + /* also retrieve instrumentation from remote */ + if (planstate->dn_instrument != NULL) + { + DatanodeInstrumentation *tmp_instrument = planstate->dn_instrument; + int nnode = planstate->dn_instrument->nnode; + Size size = offsetof(DatanodeInstrumentation, instrument) + + mul_size(nnode, sizeof(RemoteInstrumentation)); + + elog(DEBUG1, "retrieve downstream 
instrumentation, plan_node_id %d nnode %d", plan_node_id, nnode);
+
+        planstate->dn_instrument = palloc0(size);
+        memcpy(planstate->dn_instrument, tmp_instrument, size);
+    }
+    /*
+     * TBase switches the memory context later to keep the retrieved
+     * instrumentation alive until it is sent back to the upstream node.
+     */
+    MemoryContextSwitchTo(oldcontext);
+#endif
 
 	return planstate_tree_walker(planstate, ExecParallelRetrieveInstrumentation,
 								 instrumentation);
@@ -1228,6 +1315,15 @@ ExecParallelInitializeWorker(PlanState *planstate, ParallelWorkerContext *pwcxt)
 			ExecRemoteQueryInitializeDSMWorker((RemoteQueryState *)planstate, pwcxt);
 			break;
 		case T_RemoteSubplanState:
+			if (planstate->instrument && NULL == planstate->lefttree)
+			{
+				/* if instrumentation is needed, init the full plan tree in the worker */
+				planstate->lefttree = ExecInitNode(planstate->plan->lefttree,
+				                                   planstate->state,
+				                                   EXEC_FLAG_EXPLAIN_ONLY);
+				/* attach shared memory for its child */
+				planstate_tree_walker(planstate, ExecInitializeWorkerRemoteInstr, pwcxt);
+			}
 			if (planstate->plan->parallel_aware)
 				ExecRemoteSubPlanInitializeDSMWorker((RemoteSubplanState *)planstate, pwcxt);
 			break;
@@ -1253,6 +1349,105 @@ ExecParallelInitializeWorker(PlanState *planstate, ParallelWorkerContext *pwcxt)
 								 pwcxt);
 }
 
+#ifdef __TBASE__
+/*
+ * Estimate shared memory space for plan nodes executed remotely; they contain
+ * instruments from all datanodes involved, and only the leader worker receives
+ * these instruments.
+ */
+static bool
+ExecParallelEstimateRemoteInstr(PlanState *node, ExecParallelRemoteInstrContext *ri)
+{
+    ParallelContext *pcxt = ri->pcxt;
+    Size size = mul_size(ri->ndatanode, sizeof(RemoteInstrumentation));
+    size = add_size(size, offsetof(DatanodeInstrumentation, instrument));
+
+    if (node == NULL)
+        return false;
+
+    /*
+     * Only remote plan nodes can reach here; we need to disable parallelism
+     * for these nodes to prevent them from initializing other shared memory
+     * for execution. They don't need that, only the shared memory for
+     * instrument collection.
+     */
+    node->plan->parallel_aware = false;
+
+    shm_toc_estimate_chunk(&pcxt->estimator, size);
+    shm_toc_estimate_keys(&pcxt->estimator, 1);
+
+    /* for sub-plan */
+    if (IsA(node, RemoteSubplanState) && node->lefttree == NULL)
+    {
+        node->lefttree = ExecInitNode(node->plan->lefttree,
+                                      node->state,
+                                      EXEC_FLAG_EXPLAIN_ONLY);
+    }
+
+    elog(DEBUG1, "parallel estimate shm remote instrument for plan node %d", node->plan->plan_node_id);
+
+    return planstate_tree_walker(node, ExecParallelEstimateRemoteInstr,
+                                 ri);
+}
+
+/*
+ * Allocate shared memory space for plan nodes executed remotely; they contain
+ * instruments from all datanodes involved, and only the leader worker receives
+ * these instruments.  Use plan_node_id + offset as a unique key.
+ */
+static bool
+ExecParallelInitRemoteInstrDSM(PlanState *node, ExecParallelRemoteInstrContext *ri)
+{
+    ParallelContext *pcxt = ri->pcxt;
+    Size size = mul_size(ri->ndatanode, sizeof(RemoteInstrumentation));
+    size = add_size(size, offsetof(DatanodeInstrumentation, instrument));
+
+    if (node == NULL)
+        return false;
+
+    node->dn_instrument = shm_toc_allocate(pcxt->toc, size);
+    memset(node->dn_instrument, 0, size);
+    node->dn_instrument->nnode = ri->ndatanode;
+    shm_toc_insert(pcxt->toc, node->plan->plan_node_id + PARALLEL_REMOTEINSTR_OFFSET,
+                   node->dn_instrument);
+
+    elog(DEBUG1, "parallel allocate shm remote instrument for plan node %d", node->plan->plan_node_id);
+
+    return planstate_tree_walker(node, ExecParallelInitRemoteInstrDSM,
+                                 ri);
+}
+
+/*
+ * Fetch the shared memory for plan nodes executed remotely; it will be filled
+ * with instruments during the RemoteSubplan node's execution.  Use
+ * plan_node_id + offset as the unique key.
+ */
+static bool
+ExecInitializeWorkerRemoteInstr(PlanState *planstate, ParallelWorkerContext *pwcxt)
+{
+    /*
+     * Only remote plan nodes can reach here; we need to disable parallelism
+     * for these nodes to prevent them from initializing other shared memory
+     * for execution. They don't need that, only the shared memory for
+     * instrument collection.
+     */
+    planstate->plan->parallel_aware = false;
+    planstate->dn_instrument = shm_toc_lookup(pwcxt->toc,
+                                              planstate->plan->plan_node_id + PARALLEL_REMOTEINSTR_OFFSET,
+                                              false);
+
+    /* for sub-plan */
+    if (IsA(planstate, RemoteSubplanState) && planstate->lefttree == NULL)
+    {
+        planstate->lefttree = ExecInitNode(planstate->plan->lefttree,
+                                           planstate->state,
+                                           EXEC_FLAG_EXPLAIN_ONLY);
+    }
+
+    elog(DEBUG1, "parallel init worker remote instrument for plan node %d", planstate->plan->plan_node_id);
+
+    return planstate_tree_walker(planstate, ExecInitializeWorkerRemoteInstr,
+                                 pwcxt);
+}
+#endif
+
 /*
  * Main entrypoint for parallel query worker processes.
  *
diff --git a/src/backend/executor/execProcnode.c b/src/backend/executor/execProcnode.c
index eb5df5b9..cec4400d 100644
--- a/src/backend/executor/execProcnode.c
+++ b/src/backend/executor/execProcnode.c
@@ -429,6 +429,9 @@ ExecInitNode(Plan *node, EState *estate, int eflags)
 	/* Set up instrumentation for this node if requested */
 	if (estate->es_instrument)
 		result->instrument = InstrAlloc(1, estate->es_instrument);
+#ifdef __TBASE__
+	result->dn_instrument = NULL;
+#endif
 
 	return result;
 }
diff --git a/src/backend/executor/nodeGather.c b/src/backend/executor/nodeGather.c
index 9c63e4eb..55686429 100644
--- a/src/backend/executor/nodeGather.c
+++ b/src/backend/executor/nodeGather.c
@@ -543,9 +543,10 @@ ExecReScanGather(GatherState *node)
 	 * to propagate any error or other information to master backend before
 	 * dying.  Parallel context will be reused for rescan.
 	 */
+#if 0 /* pg latest code, disabled for now */
 	Gather	   *gather = (Gather *) node->ps.plan;
 	PlanState  *outerPlan = outerPlanState(node);
-
+#endif
 	/* Make sure any existing workers are gracefully shut down */
 	ExecShutdownGatherWorkers(node);
diff --git a/src/backend/executor/nodeGatherMerge.c b/src/backend/executor/nodeGatherMerge.c
index 291cf644..6f94db2b 100644
--- a/src/backend/executor/nodeGatherMerge.c
+++ b/src/backend/executor/nodeGatherMerge.c
@@ -405,8 +405,10 @@ ExecReScanGatherMerge(GatherMergeState *node)
 	 * to propagate any error or other information to master backend before
 	 * dying.  Parallel context will be reused for rescan.
*/ +#if 0 /* postgresql latest code */ GatherMerge *gm = (GatherMerge *) node->ps.plan; PlanState *outerPlan = outerPlanState(node); +#endif /* Make sure any existing workers are gracefully shut down */ ExecShutdownGatherMergeWorkers(node); @@ -418,7 +420,7 @@ ExecReScanGatherMerge(GatherMergeState *node) ExecParallelReinitialize(&node->ps, node->pei); ExecReScan(node->ps.lefttree); -#if 0 +#if 0 /* postgresql latest code */ ======= /* * Set child node's chgParam to tell it that the next scan might deliver a diff --git a/src/backend/executor/nodeHash.c b/src/backend/executor/nodeHash.c index c81eb2fa..1c4d148d 100644 --- a/src/backend/executor/nodeHash.c +++ b/src/backend/executor/nodeHash.c @@ -184,6 +184,9 @@ ExecInitHash(Hash *node, EState *estate, int eflags) hashstate->ps.ExecProcNode = ExecHash; hashstate->hashtable = NULL; hashstate->hashkeys = NIL; /* will be set by parent HashJoin */ +#ifdef __TBASE__ + hashstate->shared_info = NULL; +#endif /* * Miscellaneous initialization @@ -1830,6 +1833,10 @@ ExecHashEstimate(HashState *node, ParallelContext *pcxt) { size_t size; + /* don't need this if not instrumenting or no workers */ + if (!node->ps.instrument || pcxt->nworkers == 0) + return; + size = mul_size(pcxt->nworkers, sizeof(HashInstrumentation)); size = add_size(size, offsetof(SharedHashInfo, hinstrument)); shm_toc_estimate_chunk(&pcxt->estimator, size); @@ -1845,6 +1852,10 @@ ExecHashInitializeDSM(HashState *node, ParallelContext *pcxt) { size_t size; + /* don't need this if not instrumenting or no workers */ + if (!node->ps.instrument || pcxt->nworkers == 0) + return; + size = offsetof(SharedHashInfo, hinstrument) + pcxt->nworkers * sizeof(HashInstrumentation); node->shared_info = (SharedHashInfo *) shm_toc_allocate(pcxt->toc, size); @@ -1876,9 +1887,17 @@ ExecHashInitializeWorker(HashState *node, ParallelWorkerContext *pwcxt) { SharedHashInfo *shared_info; + /* don't need this if not instrumenting */ + if (!node->ps.instrument) + return; + shared_info = (SharedHashInfo *) shm_toc_lookup(pwcxt->toc, node->ps.plan->plan_node_id, true); node->hinstrument = &shared_info->hinstrument[ParallelWorkerNumber]; +#ifdef __TBASE__ + /* set node->shared_info for distributed instrument */ + node->shared_info = shared_info; +#endif } /* @@ -1890,6 +1909,7 @@ ExecHashInitializeWorker(HashState *node, ParallelWorkerContext *pwcxt) void ExecShutdownHash(HashState *node) { + /* Now accumulate data for the current (final) hash table */ if (node->hinstrument && node->hashtable) ExecHashGetInstrumentation(node->hinstrument, node->hashtable); } @@ -1904,6 +1924,9 @@ ExecHashRetrieveInstrumentation(HashState *node) SharedHashInfo *shared_info = node->shared_info; size_t size; + if (shared_info == NULL) + return; + /* Replace node->shared_info with a copy in backend-local memory. 
*/ size = offsetof(SharedHashInfo, hinstrument) + shared_info->num_workers * sizeof(HashInstrumentation); diff --git a/src/backend/executor/nodeHashjoin.c b/src/backend/executor/nodeHashjoin.c index c6446cad..5aab09e0 100644 --- a/src/backend/executor/nodeHashjoin.c +++ b/src/backend/executor/nodeHashjoin.c @@ -287,6 +287,8 @@ ExecHashJoin(PlanState *pstate) node->hj_HashOperators, HJ_FILL_INNER(node)); node->hj_HashTable = hashtable; + /* copy into hashNode too, for instrumentation */ + hashNode->hashtable = hashtable; parallelState->statusParallelWorker[ParallelWorkerNumber] = ParallelHashJoin_MergeShmHashTableDone; } else @@ -1651,6 +1653,7 @@ ExecMergeShmHashTable(HashJoinState * hjstate, volatile ParallelHashJoinState *p } ht->totalTuples = ht->totalTuples + mergeHashtable->totalTuples; + ht->spacePeak = Max(ht->spacePeak, mergeHashtable->spacePeak); /* merge hashtable */ for(indexbucket = 0; indexbucket < ht->nbuckets; indexbucket++) @@ -1803,6 +1806,12 @@ ExecMergeShmHashTable(HashJoinState * hjstate, volatile ParallelHashJoinState *p hashtable->totalTuples = ht->totalTuples; hashtable->skewEnabled = false; hashtable->growEnabled = false; + /* copy instrumentation too */ + hashtable->nbuckets = ht->nbuckets; + hashtable->nbuckets_original = ht->nbuckets_original; + hashtable->nbatch = ht->nbatch; + hashtable->nbatch_original = ht->nbatch_original; + hashtable->spacePeak = ht->spacePeak; } #if 0 diff --git a/src/backend/executor/nodeSort.c b/src/backend/executor/nodeSort.c index 2dd4bf89..40ae3ac0 100644 --- a/src/backend/executor/nodeSort.c +++ b/src/backend/executor/nodeSort.c @@ -299,6 +299,9 @@ ExecInitSort(Sort *node, EState *estate, int eflags) sortstate->state = NULL; sortstate->file = NULL; sortstate->dataslot = NULL; + sortstate->instrument.sortMethod = -1; + sortstate->instrument.spaceType = -1; + sortstate->instrument.spaceUsed = 0; #endif /* diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index e3be03b8..40f3f655 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -298,7 +298,6 @@ InitResponseCombiner(ResponseCombiner *combiner, int node_count, combiner->recv_datarows = 0; combiner->prerowBuffers = NULL; combiner->is_abort = false; - combiner->printed_nodes = NULL; combiner->recv_instr_htbl = NULL; #endif } @@ -1107,11 +1106,6 @@ CloseCombiner(ResponseCombiner *combiner) hash_destroy(combiner->recv_instr_htbl); combiner->recv_instr_htbl = NULL; } - if (combiner->printed_nodes) - { - bms_free(combiner->printed_nodes); - combiner->printed_nodes = NULL; - } #endif } @@ -1900,7 +1894,15 @@ FetchTuple(ResponseCombiner *combiner) * Case if we run local subplan. * We do not have remote connections, so just get local tuple and return it */ - if (outerPlanState(combiner)) + if (outerPlanState(combiner) +#ifdef __TBASE__ + /* + * if dn_instrument is not null, means this node is initialized for recv + * instrument from remote, not execute it locally too. + */ + && ((outerPlanState(combiner))->dn_instrument == NULL) +#endif + ) { RemoteSubplanState *planstate = (RemoteSubplanState *) combiner; RemoteSubplan *plan = (RemoteSubplan *) combiner->ss.ps.plan; @@ -2686,10 +2688,6 @@ FetchTuple(ResponseCombiner *combiner) { /* Do nothing. It must have been handled in handle_response() */ } - else if (res == RESPONSE_INSTR) - { - /* Do nothing. It must have been handled in handle_response() */ - } else { // Can not get here? 
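For readers of this patch, a minimal illustrative sketch (not part of the change itself) of how an Instrumentation received from a datanode could be filed under the (plan_node_id, node_id) key used by the recv_instr_htbl hash table that ExecInitRemoteSubplan creates below with HASH_ELEM | HASH_BLOBS. The helper name record_remote_instr is hypothetical; the actual receive-and-merge logic lives in HandleRemoteInstr in explain_dist.c.

/*
 * Hypothetical helper, shown only to illustrate the (plan_node_id, node_id)
 * keying scheme of recv_instr_htbl.  Assumes utils/hsearch.h,
 * executor/instrument.h and commands/explain_dist.h are included.
 */
static void
record_remote_instr(HTAB *htab, int plan_node_id, int node_id,
					Instrumentation *recv)
{
	RemoteInstrKey	key;
	RemoteInstr	   *entry;
	bool			found;

	/* HASH_BLOBS hashes the raw key bytes, so clear any padding first */
	memset(&key, 0, sizeof(key));
	key.plan_node_id = plan_node_id;
	key.node_id = node_id;

	/* find or create the per-(plan node, datanode) entry */
	entry = (RemoteInstr *) hash_search(htab, &key, HASH_ENTER, &found);
	if (!found)
	{
		entry->nodeTag = 0;
		memset(&entry->instr, 0, sizeof(entry->instr));
	}

	/* merge the received counters into the per-node entry */
	InstrAggNode(&entry->instr, recv);
}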
@@ -3328,8 +3326,9 @@ handle_response(PGXCNodeHandle *conn, ResponseCombiner *combiner) #ifdef __TBASE__ case 'i': /* Remote Instrument */ if (msg_len > 0) - HandleRemoteInstr(msg, msg_len, conn->nodeoid, combiner); - return RESPONSE_INSTR; + HandleRemoteInstr(msg, msg_len, conn->nodeid, combiner); + /* just break to return EOF. */ + break; #endif default: /* sync lost? */ @@ -9907,17 +9906,19 @@ ExecInitRemoteSubplan(RemoteSubplan *node, EState *estate, int eflags) combiner->ss.ps.plan = (Plan *) node; combiner->ss.ps.state = estate; combiner->ss.ps.ExecProcNode = ExecRemoteSubplan; - +#ifdef __TBASE__ if (estate->es_instrument) { HASHCTL ctl; - ctl.keysize = sizeof(int); + ctl.keysize = sizeof(RemoteInstrKey); ctl.entrysize = sizeof(RemoteInstr); - combiner->recv_instr_htbl = hash_create("Remote Instrument", 16, &ctl, HASH_ELEM); + combiner->recv_instr_htbl = hash_create("Remote Instrument", 8 * NumDataNodes, + &ctl, HASH_ELEM | HASH_BLOBS); } - + combiner->remote_parallel_estimated = false; +#endif combiner->ss.ps.qual = NULL; combiner->request_type = REQUEST_TYPE_QUERY; @@ -10718,6 +10719,9 @@ ExecRemoteSubplan(PlanState *pstate) int count = 0; #endif #ifdef __TBASE__ + if ((node->eflags & EXEC_FLAG_EXPLAIN_ONLY) != 0) + return NULL; + if (!node->local_exec && (!node->finish_init) && (!(node->eflags & EXEC_FLAG_SUBPLAN))) { if(node->execNodes) @@ -11149,12 +11153,25 @@ ExecShutdownRemoteSubplan(RemoteSubplanState *node) Plan *plan = ps->plan; EState *estate = ps->state; + if ((node->eflags & EXEC_FLAG_EXPLAIN_ONLY) != 0) + return; + + elog(DEBUG1, "shutdown remote subplan worker %d, plan_node_id %d", ParallelWorkerNumber, plan->plan_node_id); + if (estate->es_instrument) { + MemoryContext oldcontext = MemoryContextSwitchTo(estate->es_query_cxt); + AttachRemoteInstrContext ctx; + if (!ps->lefttree) ps->lefttree = ExecInitNode(plan->lefttree, estate, EXEC_FLAG_EXPLAIN_ONLY); - AttachRemoteInstr(ps->lefttree, combiner); + ctx.htab = combiner->recv_instr_htbl; + ctx.node_idx_List = ((RemoteSubplan *) plan)->nodeList; + ctx.printed_nodes = NULL; + AttachRemoteInstr(ps->lefttree, &ctx); + + MemoryContextSwitchTo(oldcontext); } } @@ -11168,6 +11185,9 @@ ExecFinishRemoteSubplan(RemoteSubplanState *node) int *dn_list = NULL; char cursor[NAMEDATALEN]; + if ((node->eflags & EXEC_FLAG_EXPLAIN_ONLY) != 0) + return; + if (!node->bound) { if (g_DataPumpDebug) diff --git a/src/backend/utils/cache/lsyscache.c b/src/backend/utils/cache/lsyscache.c index 46e81611..5f55c35d 100644 --- a/src/backend/utils/cache/lsyscache.c +++ b/src/backend/utils/cache/lsyscache.c @@ -2640,6 +2640,31 @@ is_pgxc_nodeprimary(Oid nodeid) return result; } +#ifdef __TBASE__ +/* + * get_pgxc_nodename + * Get node name for given identifier + */ +char * +get_pgxc_nodename_from_identifier(int id) +{ + HeapTuple tuple; + Form_pgxc_node nodeForm; + char *result; + + tuple = SearchSysCache1(PGXCNODEIDENTIFIER, Int32GetDatum(id)); + + if (!HeapTupleIsValid(tuple)) + elog(ERROR, "cache lookup failed for identifier %d", id); + + nodeForm = (Form_pgxc_node) GETSTRUCT(tuple); + result = pstrdup(NameStr(nodeForm->node_name)); + ReleaseSysCache(tuple); + + return result; +} +#endif + /* * get_pgxc_groupoid * Obtain PGXC Group Oid for given group name diff --git a/src/include/commands/explain_dist.h b/src/include/commands/explain_dist.h index fe682bda..0c49bc66 100644 --- a/src/include/commands/explain_dist.h +++ b/src/include/commands/explain_dist.h @@ -16,21 +16,42 @@ #include "commands/explain.h" #include "pgxc/execRemote.h" +/* Key of 
hash table entry */ +typedef struct RemoteInstrKey +{ + int plan_node_id; /* unique id of current plan node */ + int node_id; /* node id */ +} RemoteInstrKey; + /* Hash table entry */ -typedef struct +typedef struct RemoteInstr { - int id; /* unique id of current plan node */ + RemoteInstrKey key; + int nodeTag; /* type of current plan node */ Instrumentation instr; /* instrument of current plan node */ - /* for Gather */ - int nworkers_launched; /* worker num of gather */ + /* for Gather and Sort */ + int nworkers_launched; /* worker num of gather or sort */ - /* for Hash: */ + /* for Sort */ + TuplesortInstrumentation sort_stat; /* instrument if no parallel */ + TuplesortInstrumentation *w_sort_stats; /* instrument of parallel workers */ + + /* for Hash */ + HashInstrumentation hash_stat; } RemoteInstr; +typedef struct AttachRemoteInstrContext +{ + List *node_idx_List; /* list of node index in dn_handles */ + HTAB *htab; /* htab from combiner, stored remote instr */ + Bitmapset *printed_nodes; /* ids of plan nodes we've handled */ +} AttachRemoteInstrContext; + extern void SendLocalInstr(PlanState *planstate); -extern void HandleRemoteInstr(char *msg_body, size_t len, int nodeoid, ResponseCombiner *combiner); -extern bool AttachRemoteInstr(PlanState *planstate, ResponseCombiner *combiner); +extern void HandleRemoteInstr(char *msg_body, size_t len, int nodeid, ResponseCombiner *combiner); +extern bool AttachRemoteInstr(PlanState *planstate, AttachRemoteInstrContext *ctx); +extern void ExplainCommonRemoteInstr(PlanState *planstate, ExplainState *es); #endif /* EXPLAINDIST_H */ \ No newline at end of file diff --git a/src/include/executor/instrument.h b/src/include/executor/instrument.h index 072f7f5a..1f9af7fe 100644 --- a/src/include/executor/instrument.h +++ b/src/include/executor/instrument.h @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * * instrument.h - * definitions for run-time statistics collection + * definitions for run-time statistics collection * * * Copyright (c) 2001-2017, PostgreSQL Global Development Group @@ -18,57 +18,71 @@ typedef struct BufferUsage { - long shared_blks_hit; /* # of shared buffer hits */ - long shared_blks_read; /* # of shared disk blocks read */ - long shared_blks_dirtied; /* # of shared blocks dirtied */ - long shared_blks_written; /* # of shared disk blocks written */ - long local_blks_hit; /* # of local buffer hits */ - long local_blks_read; /* # of local disk blocks read */ - long local_blks_dirtied; /* # of shared blocks dirtied */ - long local_blks_written; /* # of local disk blocks written */ - long temp_blks_read; /* # of temp blocks read */ - long temp_blks_written; /* # of temp blocks written */ - instr_time blk_read_time; /* time spent reading */ - instr_time blk_write_time; /* time spent writing */ + long shared_blks_hit; /* # of shared buffer hits */ + long shared_blks_read; /* # of shared disk blocks read */ + long shared_blks_dirtied; /* # of shared blocks dirtied */ + long shared_blks_written; /* # of shared disk blocks written */ + long local_blks_hit; /* # of local buffer hits */ + long local_blks_read; /* # of local disk blocks read */ + long local_blks_dirtied; /* # of shared blocks dirtied */ + long local_blks_written; /* # of local disk blocks written */ + long temp_blks_read; /* # of temp blocks read */ + long temp_blks_written; /* # of temp blocks written */ + instr_time blk_read_time; /* time spent reading */ + instr_time blk_write_time; /* time spent writing */ } BufferUsage; /* 
Flag bits included in InstrAlloc's instrument_options bitmask */ typedef enum InstrumentOption { - INSTRUMENT_TIMER = 1 << 0, /* needs timer (and row counts) */ - INSTRUMENT_BUFFERS = 1 << 1, /* needs buffer usage */ - INSTRUMENT_ROWS = 1 << 2, /* needs row count */ - INSTRUMENT_ALL = PG_INT32_MAX + INSTRUMENT_TIMER = 1 << 0, /* needs timer (and row counts) */ + INSTRUMENT_BUFFERS = 1 << 1, /* needs buffer usage */ + INSTRUMENT_ROWS = 1 << 2, /* needs row count */ + INSTRUMENT_ALL = PG_INT32_MAX } InstrumentOption; typedef struct Instrumentation { - /* Parameters set at node creation: */ - bool need_timer; /* TRUE if we need timer data */ - bool need_bufusage; /* TRUE if we need buffer usage data */ - /* Info about current plan cycle: */ - bool running; /* TRUE if we've completed first tuple */ - instr_time starttime; /* Start time of current iteration of node */ - instr_time counter; /* Accumulated runtime for this node */ - double firsttuple; /* Time for first tuple of this cycle */ - double tuplecount; /* Tuples emitted so far this cycle */ - BufferUsage bufusage_start; /* Buffer usage at start */ - /* Accumulated statistics across all completed cycles: */ - double startup; /* Total startup time (in seconds) */ - double total; /* Total total time (in seconds) */ - double ntuples; /* Total tuples produced */ - double nloops; /* # of run cycles for this node */ - double nfiltered1; /* # tuples removed by scanqual or joinqual */ - double nfiltered2; /* # tuples removed by "other" quals */ - BufferUsage bufusage; /* Total buffer usage */ + /* Parameters set at node creation: */ + bool need_timer; /* TRUE if we need timer data */ + bool need_bufusage; /* TRUE if we need buffer usage data */ + /* Info about current plan cycle: */ + bool running; /* TRUE if we've completed first tuple */ + instr_time starttime; /* Start time of current iteration of node */ + instr_time counter; /* Accumulated runtime for this node */ + double firsttuple; /* Time for first tuple of this cycle */ + double tuplecount; /* Tuples emitted so far this cycle */ + BufferUsage bufusage_start; /* Buffer usage at start */ + /* Accumulated statistics across all completed cycles: */ + double startup; /* Total startup time (in seconds) */ + double total; /* Total total time (in seconds) */ + double ntuples; /* Total tuples produced */ + double nloops; /* # of run cycles for this node */ + double nfiltered1; /* # tuples removed by scanqual or joinqual */ + double nfiltered2; /* # tuples removed by "other" quals */ + BufferUsage bufusage; /* Total buffer usage */ } Instrumentation; typedef struct WorkerInstrumentation { - int num_workers; /* # of structures that follow */ - Instrumentation instrument[FLEXIBLE_ARRAY_MEMBER]; + int num_workers; /* # of structures that follow */ + Instrumentation instrument[FLEXIBLE_ARRAY_MEMBER]; } WorkerInstrumentation; +#ifdef __TBASE__ +typedef struct RemoteInstrumentation +{ + int nodeid; /* which datanode the instrument comes from */ + Instrumentation instr; /* the instrumentation */ +} RemoteInstrumentation; + +typedef struct DatanodeInstrumentation +{ + int nnode; /* how many datanodes this node has been executed */ + RemoteInstrumentation instrument[FLEXIBLE_ARRAY_MEMBER]; +} DatanodeInstrumentation; +#endif + extern PGDLLIMPORT BufferUsage pgBufferUsage; extern Instrumentation *InstrAlloc(int n, int instrument_options); @@ -81,4 +95,4 @@ extern void InstrStartParallelQuery(void); extern void InstrEndParallelQuery(BufferUsage *result); extern void InstrAccumParallelQuery(BufferUsage 
*result); -#endif /* INSTRUMENT_H */ +#endif /* INSTRUMENT_H */ diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index 20c14341..087b2223 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -928,6 +928,9 @@ typedef struct PlanState * wrapper */ Instrumentation *instrument; /* Optional runtime stats for this node */ +#ifdef __TBASE__ + DatanodeInstrumentation *dn_instrument; /* per-datanode instrumentation */ +#endif WorkerInstrumentation *worker_instrument; /* per-worker instrumentation */ /* @@ -1992,6 +1995,7 @@ typedef struct SortState bool am_worker; /* are we a worker? */ SharedSortInfo *shared_info; /* one entry per worker */ #ifdef __TBASE__ + TuplesortInstrumentation instrument; /* cached instrument from distributed nodes */ Size stateLen; ReDistributeState *state; BufFile **file; diff --git a/src/include/pgxc/execRemote.h b/src/include/pgxc/execRemote.h index 03b16a62..8332a217 100644 --- a/src/include/pgxc/execRemote.h +++ b/src/include/pgxc/execRemote.h @@ -177,8 +177,8 @@ typedef struct ResponseCombiner uint64 recv_datarows; /* for remote instrument */ - Bitmapset *printed_nodes; /* ids of plan nodes we've handled */ HTAB *recv_instr_htbl; /* received str hash table for each plan_node_id */ + bool remote_parallel_estimated; /* hint for remote instrument in parallel mode */ #endif } ResponseCombiner; diff --git a/src/include/utils/lsyscache.h b/src/include/utils/lsyscache.h index c00fcb61..e0d757b0 100644 --- a/src/include/utils/lsyscache.h +++ b/src/include/utils/lsyscache.h @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * * lsyscache.h - * Convenience routines for common queries in the system catalog cache. + * Convenience routines for common queries in the system catalog cache. * * Portions Copyright (c) 2012-2014, TransLattice, Inc. 
* Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group @@ -21,41 +21,41 @@ /* Result list element for get_op_btree_interpretation */ typedef struct OpBtreeInterpretation { - Oid opfamily_id; /* btree opfamily containing operator */ - int strategy; /* its strategy number */ - Oid oplefttype; /* declared left input datatype */ - Oid oprighttype; /* declared right input datatype */ + Oid opfamily_id; /* btree opfamily containing operator */ + int strategy; /* its strategy number */ + Oid oplefttype; /* declared left input datatype */ + Oid oprighttype; /* declared right input datatype */ } OpBtreeInterpretation; /* I/O function selector for get_type_io_data */ typedef enum IOFuncSelector { - IOFunc_input, - IOFunc_output, - IOFunc_receive, - IOFunc_send + IOFunc_input, + IOFunc_output, + IOFunc_receive, + IOFunc_send } IOFuncSelector; /* Flag bits for get_attstatsslot */ -#define ATTSTATSSLOT_VALUES 0x01 -#define ATTSTATSSLOT_NUMBERS 0x02 +#define ATTSTATSSLOT_VALUES 0x01 +#define ATTSTATSSLOT_NUMBERS 0x02 /* Result struct for get_attstatsslot */ typedef struct AttStatsSlot { - /* Always filled: */ - Oid staop; /* Actual staop for the found slot */ - /* Filled if ATTSTATSSLOT_VALUES is specified: */ - Oid valuetype; /* Actual datatype of the values */ - Datum *values; /* slot's "values" array, or NULL if none */ - int nvalues; /* length of values[], or 0 */ - /* Filled if ATTSTATSSLOT_NUMBERS is specified: */ - float4 *numbers; /* slot's "numbers" array, or NULL if none */ - int nnumbers; /* length of numbers[], or 0 */ + /* Always filled: */ + Oid staop; /* Actual staop for the found slot */ + /* Filled if ATTSTATSSLOT_VALUES is specified: */ + Oid valuetype; /* Actual datatype of the values */ + Datum *values; /* slot's "values" array, or NULL if none */ + int nvalues; /* length of values[], or 0 */ + /* Filled if ATTSTATSSLOT_NUMBERS is specified: */ + float4 *numbers; /* slot's "numbers" array, or NULL if none */ + int nnumbers; /* length of numbers[], or 0 */ - /* Remaining fields are private to get_attstatsslot/free_attstatsslot */ - void *values_arr; /* palloc'd values array, if any */ - void *numbers_arr; /* palloc'd numbers array, if any */ + /* Remaining fields are private to get_attstatsslot/free_attstatsslot */ + void *values_arr; /* palloc'd values array, if any */ + void *numbers_arr; /* palloc'd numbers array, if any */ } AttStatsSlot; /* Hook for plugins to get control in get_attavgwidth() */ @@ -63,40 +63,40 @@ typedef int32 (*get_attavgwidth_hook_type) (Oid relid, AttrNumber attnum); extern PGDLLIMPORT get_attavgwidth_hook_type get_attavgwidth_hook; extern bool op_in_opfamily(Oid opno, Oid opfamily); -extern int get_op_opfamily_strategy(Oid opno, Oid opfamily); -extern Oid get_op_opfamily_sortfamily(Oid opno, Oid opfamily); +extern int get_op_opfamily_strategy(Oid opno, Oid opfamily); +extern Oid get_op_opfamily_sortfamily(Oid opno, Oid opfamily); extern void get_op_opfamily_properties(Oid opno, Oid opfamily, bool ordering_op, - int *strategy, - Oid *lefttype, - Oid *righttype); + int *strategy, + Oid *lefttype, + Oid *righttype); extern Oid get_opfamily_member(Oid opfamily, Oid lefttype, Oid righttype, - int16 strategy); + int16 strategy); extern bool get_ordering_op_properties(Oid opno, - Oid *opfamily, Oid *opcintype, int16 *strategy); -extern Oid get_equality_op_for_ordering_op(Oid opno, bool *reverse); -extern Oid get_ordering_op_for_equality_op(Oid opno, bool use_lhs_type); + Oid *opfamily, Oid *opcintype, int16 *strategy); +extern Oid 
get_equality_op_for_ordering_op(Oid opno, bool *reverse); +extern Oid get_ordering_op_for_equality_op(Oid opno, bool use_lhs_type); extern List *get_mergejoin_opfamilies(Oid opno); extern bool get_compatible_hash_operators(Oid opno, - Oid *lhs_opno, Oid *rhs_opno); + Oid *lhs_opno, Oid *rhs_opno); extern bool get_op_hash_functions(Oid opno, - RegProcedure *lhs_procno, RegProcedure *rhs_procno); + RegProcedure *lhs_procno, RegProcedure *rhs_procno); extern List *get_op_btree_interpretation(Oid opno); extern bool equality_ops_are_compatible(Oid opno1, Oid opno2); extern Oid get_opfamily_proc(Oid opfamily, Oid lefttype, Oid righttype, - int16 procnum); + int16 procnum); extern char *get_attname(Oid relid, AttrNumber attnum); extern char *get_relid_attribute_name(Oid relid, AttrNumber attnum); extern AttrNumber get_attnum(Oid relid, const char *attname); extern char get_attidentity(Oid relid, AttrNumber attnum); -extern Oid get_atttype(Oid relid, AttrNumber attnum); +extern Oid get_atttype(Oid relid, AttrNumber attnum); extern int32 get_atttypmod(Oid relid, AttrNumber attnum); extern void get_atttypetypmodcoll(Oid relid, AttrNumber attnum, - Oid *typid, int32 *typmod, Oid *collid); + Oid *typid, int32 *typmod, Oid *collid); extern char *get_collation_name(Oid colloid); #ifdef XCP -extern Oid get_collation_namespace(Oid colloid); +extern Oid get_collation_namespace(Oid colloid); extern int32 get_collation_encoding(Oid colloid); -extern Oid get_collid(const char *collname, int32 collencoding, Oid collnsp); +extern Oid get_collid(const char *collname, int32 collencoding, Oid collnsp); #endif extern char *get_constraint_name(Oid conoid); #ifdef __TBASE__ @@ -106,26 +106,26 @@ extern Oid get_rel_filenode(Oid relid); extern bool get_rel_stat(Oid relid, int *pages, float *tuples, int *all_visible_pages); #endif extern char *get_language_name(Oid langoid, bool missing_ok); -extern Oid get_opclass_family(Oid opclass); -extern Oid get_opclass_input_type(Oid opclass); +extern Oid get_opclass_family(Oid opclass); +extern Oid get_opclass_input_type(Oid opclass); extern RegProcedure get_opcode(Oid opno); extern char *get_opname(Oid opno); -extern Oid get_op_rettype(Oid opno); +extern Oid get_op_rettype(Oid opno); extern void op_input_types(Oid opno, Oid *lefttype, Oid *righttype); extern bool op_mergejoinable(Oid opno, Oid inputtype); extern bool op_hashjoinable(Oid opno, Oid inputtype); extern bool op_strict(Oid opno); extern char op_volatile(Oid opno); -extern Oid get_commutator(Oid opno); -extern Oid get_negator(Oid opno); +extern Oid get_commutator(Oid opno); +extern Oid get_negator(Oid opno); extern RegProcedure get_oprrest(Oid opno); extern RegProcedure get_oprjoin(Oid opno); extern char *get_func_name(Oid funcid); -extern Oid get_func_namespace(Oid funcid); -extern Oid get_func_rettype(Oid funcid); -extern int get_func_nargs(Oid funcid); -extern Oid get_func_signature(Oid funcid, Oid **argtypes, int *nargs); -extern Oid get_func_variadictype(Oid funcid); +extern Oid get_func_namespace(Oid funcid); +extern Oid get_func_rettype(Oid funcid); +extern int get_func_nargs(Oid funcid); +extern Oid get_func_signature(Oid funcid, Oid **argtypes, int *nargs); +extern Oid get_func_variadictype(Oid funcid); extern bool get_func_retset(Oid funcid); extern bool func_strict(Oid funcid); extern char func_volatile(Oid funcid); @@ -133,33 +133,33 @@ extern char func_parallel(Oid funcid); extern bool get_func_leakproof(Oid funcid); extern float4 get_func_cost(Oid funcid); extern float4 get_func_rows(Oid funcid); -extern 
Oid get_relname_relid(const char *relname, Oid relnamespace); +extern Oid get_relname_relid(const char *relname, Oid relnamespace); #ifdef PGXC -extern int get_relnatts(Oid relid); +extern int get_relnatts(Oid relid); #endif extern char *get_rel_name(Oid relid); -extern Oid get_rel_namespace(Oid relid); -extern Oid get_rel_type_id(Oid relid); +extern Oid get_rel_namespace(Oid relid); +extern Oid get_rel_type_id(Oid relid); extern char get_rel_relkind(Oid relid); -extern Oid get_rel_tablespace(Oid relid); +extern Oid get_rel_tablespace(Oid relid); extern char get_rel_persistence(Oid relid); -extern Oid get_transform_fromsql(Oid typid, Oid langid, List *trftypes); -extern Oid get_transform_tosql(Oid typid, Oid langid, List *trftypes); +extern Oid get_transform_fromsql(Oid typid, Oid langid, List *trftypes); +extern Oid get_transform_tosql(Oid typid, Oid langid, List *trftypes); extern bool get_typisdefined(Oid typid); extern int16 get_typlen(Oid typid); extern bool get_typbyval(Oid typid); extern void get_typlenbyval(Oid typid, int16 *typlen, bool *typbyval); extern void get_typlenbyvalalign(Oid typid, int16 *typlen, bool *typbyval, - char *typalign); -extern Oid getTypeIOParam(HeapTuple typeTuple); + char *typalign); +extern Oid getTypeIOParam(HeapTuple typeTuple); extern void get_type_io_data(Oid typid, - IOFuncSelector which_func, - int16 *typlen, - bool *typbyval, - char *typalign, - char *typdelim, - Oid *typioparam, - Oid *func); + IOFuncSelector which_func, + int16 *typlen, + bool *typbyval, + char *typalign, + char *typdelim, + Oid *typioparam, + Oid *func); extern char get_typstorage(Oid typid); extern Node *get_typdefault(Oid typid); extern char get_typtype(Oid typid); @@ -167,65 +167,68 @@ extern bool type_is_rowtype(Oid typid); extern bool type_is_enum(Oid typid); extern bool type_is_range(Oid typid); extern void get_type_category_preferred(Oid typid, - char *typcategory, - bool *typispreferred); -extern Oid get_typ_typrelid(Oid typid); -extern Oid get_element_type(Oid typid); -extern Oid get_array_type(Oid typid); -extern Oid get_promoted_array_type(Oid typid); -extern Oid get_base_element_type(Oid typid); + char *typcategory, + bool *typispreferred); +extern Oid get_typ_typrelid(Oid typid); +extern Oid get_element_type(Oid typid); +extern Oid get_array_type(Oid typid); +extern Oid get_promoted_array_type(Oid typid); +extern Oid get_base_element_type(Oid typid); extern void getTypeInputInfo(Oid type, Oid *typInput, Oid *typIOParam); extern void getTypeOutputInfo(Oid type, Oid *typOutput, bool *typIsVarlena); extern void getTypeBinaryInputInfo(Oid type, Oid *typReceive, Oid *typIOParam); extern void getTypeBinaryOutputInfo(Oid type, Oid *typSend, bool *typIsVarlena); -extern Oid get_typmodin(Oid typid); -extern Oid get_typcollation(Oid typid); +extern Oid get_typmodin(Oid typid); +extern Oid get_typcollation(Oid typid); extern bool type_is_collatable(Oid typid); -extern Oid getBaseType(Oid typid); -extern Oid getBaseTypeAndTypmod(Oid typid, int32 *typmod); +extern Oid getBaseType(Oid typid); +extern Oid getBaseTypeAndTypmod(Oid typid, int32 *typmod); #ifdef PGXC extern char *get_typename(Oid typid); extern char * get_typenamespace_typename(Oid typid); extern char *get_pgxc_nodename(Oid nodeoid); -extern Oid get_pgxc_nodeoid_extend(const char *nodename, const char *clustername); +extern Oid get_pgxc_nodeoid_extend(const char *nodename, const char *clustername); #define get_pgxc_nodeoid(nodename) get_pgxc_nodeoid_extend((nodename), (PGXCClusterName)) -extern uint32 
get_pgxc_node_id(Oid nodeid); -extern char get_pgxc_nodetype(Oid nodeid); -extern int get_pgxc_nodeport(Oid nodeid); +extern uint32 get_pgxc_node_id(Oid nodeid); +extern char get_pgxc_nodetype(Oid nodeid); +extern int get_pgxc_nodeport(Oid nodeid); extern char *get_pgxc_nodehost(Oid nodeid); -extern bool is_pgxc_nodepreferred(Oid nodeid); -extern bool is_pgxc_nodeprimary(Oid nodeid); -extern Oid get_pgxc_groupoid(const char *groupname); -extern int get_pgxc_groupmembers(Oid groupid, Oid **members); -extern int get_pgxc_classnodes(Oid tableid, Oid **nodes); +extern bool is_pgxc_nodepreferred(Oid nodeid); +extern bool is_pgxc_nodeprimary(Oid nodeid); +#ifdef __TBASE__ +extern char *get_pgxc_nodename_from_identifier(int id); +#endif +extern Oid get_pgxc_groupoid(const char *groupname); +extern int get_pgxc_groupmembers(Oid groupid, Oid **members); +extern int get_pgxc_classnodes(Oid tableid, Oid **nodes); extern char * get_pgxc_groupname(Oid groupid); #endif extern int32 get_typavgwidth(Oid typid, int32 typmod); extern int32 get_attavgwidth(Oid relid, AttrNumber attnum); extern bool get_attstatsslot(AttStatsSlot *sslot, HeapTuple statstuple, - int reqkind, Oid reqop, int flags); + int reqkind, Oid reqop, int flags); extern void free_attstatsslot(AttStatsSlot *sslot); extern char *get_namespace_name(Oid nspid); #ifdef XCP -extern Oid get_namespaceid(const char *nspname); +extern Oid get_namespaceid(const char *nspname); extern char *get_typ_name(Oid typid); -extern Oid get_typ_namespace(Oid typid); -extern Oid get_typname_typid(const char *typname, Oid typnamespace); -extern Oid get_funcid(const char *funcname, oidvector *argtypes, Oid funcnsp); -extern Oid get_opnamespace(Oid opno); -extern Oid get_operid(const char *oprname, Oid oprleft, Oid oprright, Oid oprnsp); +extern Oid get_typ_namespace(Oid typid); +extern Oid get_typname_typid(const char *typname, Oid typnamespace); +extern Oid get_funcid(const char *funcname, oidvector *argtypes, Oid funcnsp); +extern Oid get_opnamespace(Oid opno); +extern Oid get_operid(const char *oprname, Oid oprleft, Oid oprright, Oid oprnsp); #endif extern char *get_namespace_name_or_temp(Oid nspid); -extern Oid get_range_subtype(Oid rangeOid); +extern Oid get_range_subtype(Oid rangeOid); #ifdef XCP -extern Oid get_tablesample_method_id(const char *methodname); +extern Oid get_tablesample_method_id(const char *methodname); #endif #define type_is_array(typid) (get_element_type(typid) != InvalidOid) /* type_is_array_domain accepts both plain arrays and domains over arrays */ #define type_is_array_domain(typid) (get_base_element_type(typid) != InvalidOid) -#define TypeIsToastable(typid) (get_typstorage(typid) != 'p') +#define TypeIsToastable(typid) (get_typstorage(typid) != 'p') -#endif /* LSYSCACHE_H */ +#endif /* LSYSCACHE_H */ diff --git a/src/test/regress/expected/join_3.out b/src/test/regress/expected/join_3.out index a1c6c31b..761f5a90 100644 --- a/src/test/regress/expected/join_3.out +++ b/src/test/regress/expected/join_3.out @@ -6169,6 +6169,446 @@ create index idx_nestloop_suppression1_b on nestloop_suppression1(b); analyze nestloop_suppression1; analyze nestloop_suppression2; analyze nestloop_suppression3; +begin; +set local min_parallel_table_scan_size = 0; +set local parallel_setup_cost = 0; +-- Extract bucket and batch counts from an explain analyze plan. In +-- general we can't make assertions about how many batches (or +-- buckets) will be required because it can vary, but we can in some +-- special cases and we can check for growth. 
+create or replace function find_hash(node json) +returns json language plpgsql +as +$$ +declare + x json; + child json; +begin + if node->>'Node Type' = 'Hash' then + return node; + else + for child in select json_array_elements(node->'Plans') + loop + x := find_hash(child); + if x is not null then + return x; + end if; + end loop; + return null; + end if; +end; +$$; +create or replace function hash_join_batches(query text) +returns table (original int, final int) language plpgsql +as +$$ +declare + whole_plan json; + hash_node json; +begin + for whole_plan in + execute 'explain (analyze, format ''json'') ' || query + loop + hash_node := find_hash(json_extract_path(whole_plan, '0', 'Plan')); + original := hash_node->>'Original Hash Batches'; + final := hash_node->>'Hash Batches'; + return next; + end loop; +end; +$$; +-- Make a simple relation with well distributed keys and correctly +-- estimated size. +create table simple as + select generate_series(1, 20000) AS id, 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'; +alter table simple set (parallel_workers = 2); +analyze simple; +-- Make a relation whose size we will under-estimate. We want stats +-- to say 1000 rows, but actually there are 20,000 rows. +create table bigger_than_it_looks as + select generate_series(1, 20000) as id, 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'; +alter table bigger_than_it_looks set (autovacuum_enabled = 'false'); +alter table bigger_than_it_looks set (parallel_workers = 2); +analyze bigger_than_it_looks; +update pg_class set reltuples = 1000 where relname = 'bigger_than_it_looks'; +-- Make a relation whose size we underestimate and that also has a +-- kind of skew that breaks our batching scheme. We want stats to say +-- 2 rows, but actually there are 20,000 rows with the same key. +create table extremely_skewed (id int, t text); +alter table extremely_skewed set (autovacuum_enabled = 'false'); +alter table extremely_skewed set (parallel_workers = 2); +analyze extremely_skewed; +insert into extremely_skewed + select 42 as id, 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa' + from generate_series(1, 20000); +update pg_class + set reltuples = 2, relpages = pg_relation_size('extremely_skewed') / 8192 + where relname = 'extremely_skewed'; +-- The "optimal" case: the hash table fits in memory; we plan for 1 +-- batch, we stick to that number, and peak memory usage stays within +-- our work_mem budget +-- non-parallel +savepoint settings; +set local max_parallel_workers_per_gather = 0; +set local work_mem = '4MB'; +explain (costs off) + select count(*) from simple r join simple s using (id); + QUERY PLAN +----------------------------------------------------------- + Finalize Aggregate + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Partial Aggregate + -> Hash Join + Hash Cond: (r.id = s.id) + -> Seq Scan on simple r + -> Hash + -> Seq Scan on simple s +(8 rows) + +select count(*) from simple r join simple s using (id); + count +------- + 20000 +(1 row) + +select original > 1 as initially_multibatch, final > original as increased_batches + from hash_join_batches( +$$ + select count(*) from simple r join simple s using (id); +$$); + initially_multibatch | increased_batches +----------------------+------------------- + f | f +(1 row) + +rollback to settings; +-- parallel with parallel-oblivious hash join +savepoint settings; +set local max_parallel_workers_per_gather = 2; +set local work_mem = '4MB'; +explain (costs off) + select count(*) from simple r join simple s using (id); + QUERY PLAN 
+-------------------------------------------------------------------- + Parallel Finalize Aggregate + -> Parallel Remote Subquery Scan on all (datanode_1,datanode_2) + -> Gather + Workers Planned: 2 + -> Partial Aggregate + -> Parallel Hash Join + Hash Cond: (r.id = s.id) + -> Parallel Seq Scan on simple r + -> Parallel Hash + -> Parallel Seq Scan on simple s +(10 rows) + +select count(*) from simple r join simple s using (id); + count +------- + 20000 +(1 row) + +select original > 1 as initially_multibatch, final > original as increased_batches + from hash_join_batches( +$$ + select count(*) from simple r join simple s using (id); +$$); + initially_multibatch | increased_batches +----------------------+------------------- + f | f +(1 row) + +rollback to settings; +-- The "good" case: batches required, but we plan the right number; we +-- plan for some number of batches, and we stick to that number, and +-- peak memory usage says within our work_mem budget +-- non-parallel +savepoint settings; +set local max_parallel_workers_per_gather = 0; +set local work_mem = '128kB'; +explain (costs off) + select count(*) from simple r join simple s using (id); + QUERY PLAN +----------------------------------------------------------- + Finalize Aggregate + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Partial Aggregate + -> Hash Join + Hash Cond: (r.id = s.id) + -> Seq Scan on simple r + -> Hash + -> Seq Scan on simple s +(8 rows) + +select count(*) from simple r join simple s using (id); + count +------- + 20000 +(1 row) + +select original > 1 as initially_multibatch, final > original as increased_batches + from hash_join_batches( +$$ + select count(*) from simple r join simple s using (id); +$$); + initially_multibatch | increased_batches +----------------------+------------------- + t | f +(1 row) + +rollback to settings; +-- parallel with parallel-oblivious hash join +savepoint settings; +set local max_parallel_workers_per_gather = 2; +set local work_mem = '128kB'; +explain (costs off) + select count(*) from simple r join simple s using (id); + QUERY PLAN +-------------------------------------------------------------------- + Parallel Finalize Aggregate + -> Parallel Remote Subquery Scan on all (datanode_1,datanode_2) + -> Gather + Workers Planned: 2 + -> Partial Aggregate + -> Parallel Hash Join + Hash Cond: (r.id = s.id) + -> Parallel Seq Scan on simple r + -> Parallel Hash + -> Parallel Seq Scan on simple s +(10 rows) + +select count(*) from simple r join simple s using (id); + count +------- + 20000 +(1 row) + +select original > 1 as initially_multibatch, final > original as increased_batches + from hash_join_batches( +$$ + select count(*) from simple r join simple s using (id); +$$); + initially_multibatch | increased_batches +----------------------+------------------- + t | f +(1 row) + +rollback to settings; +-- The "bad" case: during execution we need to increase number of +-- batches; in this case we plan for 1 batch, and increase at least a +-- couple of times, and peak memory usage stays within our work_mem +-- budget +-- non-parallel +savepoint settings; +set local max_parallel_workers_per_gather = 0; +set local work_mem = '128kB'; +explain (costs off) + select count(*) FROM simple r JOIN bigger_than_it_looks s USING (id); + QUERY PLAN +------------------------------------------------------------------ + Finalize Aggregate + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Partial Aggregate + -> Hash Join + Hash Cond: (r.id = s.id) + -> Seq Scan on simple r + -> 
Hash + -> Seq Scan on bigger_than_it_looks s +(8 rows) + +select count(*) FROM simple r JOIN bigger_than_it_looks s USING (id); + count +------- + 20000 +(1 row) + +select original > 1 as initially_multibatch, final > original as increased_batches + from hash_join_batches( +$$ + select count(*) FROM simple r JOIN bigger_than_it_looks s USING (id); +$$); + initially_multibatch | increased_batches +----------------------+------------------- + f | t +(1 row) + +rollback to settings; +-- parallel with parallel-oblivious hash join +savepoint settings; +set local max_parallel_workers_per_gather = 2; +set local work_mem = '128kB'; +explain (costs off) + select count(*) from simple r join bigger_than_it_looks s using (id); + QUERY PLAN +--------------------------------------------------------------------------------- + Parallel Finalize Aggregate + -> Parallel Remote Subquery Scan on all (datanode_1,datanode_2) + -> Gather + Workers Planned: 2 + -> Partial Aggregate + -> Parallel Hash Join + Hash Cond: (r.id = s.id) + -> Parallel Seq Scan on simple r + -> Parallel Hash + -> Parallel Seq Scan on bigger_than_it_looks s +(10 rows) + +select count(*) from simple r join bigger_than_it_looks s using (id); + count +------- + 20000 +(1 row) + +select original > 1 as initially_multibatch, final > original as increased_batches + from hash_join_batches( +$$ + select count(*) from simple r join bigger_than_it_looks s using (id); +$$); + initially_multibatch | increased_batches +----------------------+------------------- + f | f +(1 row) + +rollback to settings; +-- The "ugly" case: increasing the number of batches during execution +-- doesn't help, so stop trying to fit in work_mem and hope for the +-- best; in this case we plan for 1 batch, increases just once and +-- then stop increasing because that didn't help at all, so we blow +-- right through the work_mem budget and hope for the best... 
+-- non-parallel +savepoint settings; +set local max_parallel_workers_per_gather = 0; +set local work_mem = '128kB'; +explain (costs off) + select count(*) from simple r join extremely_skewed s using (id); + QUERY PLAN +-------------------------------------------------------------- + Finalize Aggregate + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Partial Aggregate + -> Hash Join + Hash Cond: (r.id = s.id) + -> Seq Scan on simple r + -> Hash + -> Seq Scan on extremely_skewed s +(8 rows) + +select count(*) from simple r join extremely_skewed s using (id); + count +------- + 20000 +(1 row) + +select * from hash_join_batches( +$$ + select count(*) from simple r join extremely_skewed s using (id); +$$); + original | final +----------+------- + 1 | 2 +(1 row) + +rollback to settings; +-- parallel with parallel-oblivious hash join +savepoint settings; +set local max_parallel_workers_per_gather = 2; +set local work_mem = '128kB'; +explain (costs off) + select count(*) from simple r join extremely_skewed s using (id); + QUERY PLAN +----------------------------------------------------------------------------- + Parallel Finalize Aggregate + -> Parallel Remote Subquery Scan on all (datanode_1,datanode_2) + -> Gather + Workers Planned: 2 + -> Partial Aggregate + -> Parallel Hash Join + Hash Cond: (r.id = s.id) + -> Parallel Seq Scan on simple r + -> Parallel Hash + -> Parallel Seq Scan on extremely_skewed s +(10 rows) + +select count(*) from simple r join extremely_skewed s using (id); + count +------- + 20000 +(1 row) + +select * from hash_join_batches( +$$ + select count(*) from simple r join extremely_skewed s using (id); +$$); + original | final +----------+------- + 1 | 1 +(1 row) + +rollback to settings; +-- A couple of other hash join tests unrelated to work_mem management. +-- Check that EXPLAIN ANALYZE has data even if the leader doesn't participate +savepoint settings; +set local max_parallel_workers_per_gather = 2; +set local work_mem = '4MB'; +set local parallel_leader_participation = off; +ERROR: unrecognized configuration parameter "parallel_leader_participation" +select * from hash_join_batches( +$$ + select count(*) from simple r join simple s using (id); +$$); +ERROR: current transaction is aborted, commands ignored until end of transaction block +rollback to settings; +-- A full outer join where every record is matched. 
+-- non-parallel +savepoint settings; +set local max_parallel_workers_per_gather = 0; +explain (costs off) + select count(*) from simple r full outer join simple s using (id); + QUERY PLAN +----------------------------------------------------------- + Finalize Aggregate + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Partial Aggregate + -> Hash Full Join + Hash Cond: (r.id = s.id) + -> Seq Scan on simple r + -> Hash + -> Seq Scan on simple s +(8 rows) + +select count(*) from simple r full outer join simple s using (id); + count +------- + 20000 +(1 row) + +rollback to settings; +-- parallelism not possible with parallel-oblivious outer hash join +savepoint settings; +set local max_parallel_workers_per_gather = 2; +explain (costs off) + select count(*) from simple r full outer join simple s using (id); + QUERY PLAN +-------------------------------------------------------------------- + Parallel Finalize Aggregate + -> Parallel Remote Subquery Scan on all (datanode_1,datanode_2) + -> Gather + Workers Planned: 2 + -> Partial Aggregate + -> Parallel Hash Full Join + Hash Cond: (r.id = s.id) + -> Parallel Seq Scan on simple r + -> Parallel Hash + -> Parallel Seq Scan on simple s +(10 rows) + +select count(*) from simple r full outer join simple s using (id); + count +------- + 20000 +(1 row) + +rollback to settings; +-- An full outer join where every record is not matched. set enable_hashjoin = false; explain select t3.b from nestloop_suppression1 t1, nestloop_suppression2 t2, nestloop_suppression3 t3 where t1.b=2 and t1.c=3 and t1.d like 'char%' and t1.a=t2.a and t3.b>t2.a; diff --git a/src/test/regress/expected/tbase_explain.out b/src/test/regress/expected/tbase_explain.out new file mode 100644 index 00000000..691d1bb5 --- /dev/null +++ b/src/test/regress/expected/tbase_explain.out @@ -0,0 +1,362 @@ +--explain analyze +create table a1(id int, num int, name text); +create table a2(id int, num int, name text); +insert into a1 values(1,generate_series(1,100),'a'); +insert into a1 values(2,generate_series(1,100),'b'); +insert into a1 values(3,generate_series(1,100),'c'); +insert into a2 select * from a1; +--normal cases +explain (costs off,timing off,summary off,analyze,verbose) +select count(*) from a1; + QUERY PLAN +----------------------------------------------------------------------------------- + Finalize Aggregate (actual rows=1 loops=1) + Output: count(*) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (actual rows=2 loops=1) + Output: PARTIAL count(*) + -> Partial Aggregate + DN (actual rows=1..1 loops=1..1) + - datanode_1 (actual rows=1 loops=1) + - datanode_2 (actual rows=1 loops=1) + Output: PARTIAL count(*) + -> Seq Scan on public.a1 + DN (actual rows=100..200 loops=1..1) + - datanode_1 (actual rows=200 loops=1) + - datanode_2 (actual rows=100 loops=1) + Output: id, num, name +(14 rows) + +explain (costs off,timing off,summary off,analyze,verbose) +select num, count(*) cnt from a2 group by num order by cnt; + QUERY PLAN +------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) (actual rows=100 loops=1) + Output: num, count(*) + Sort Key: count(*) + -> Sort + DN (actual rows=42..58 loops=1..1) + - datanode_1 (actual rows=42 loops=1) + - datanode_2 (actual rows=58 loops=1) + Output: num, (count(*)) + Sort Key: (count(*)) + Sort Method: quicksort Memory: 28kB + -> Finalize HashAggregate + DN (actual rows=42..58 loops=1..1) + - datanode_1 (actual rows=42 loops=1) + - datanode_2 (actual 
rows=58 loops=1) + Output: num, count(*) + Group Key: a2.num + -> Remote Subquery Scan on all (datanode_1,datanode_2) + DN (actual rows=84..116 loops=1..1) + - datanode_1 (actual rows=84 loops=1) + - datanode_2 (actual rows=116 loops=1) + Output: num, PARTIAL count(*) + Distribute results by H: num + -> Partial HashAggregate + DN (actual rows=100..100 loops=1..1) + - datanode_1 (actual rows=100 loops=1) + - datanode_2 (actual rows=100 loops=1) + Output: num, PARTIAL count(*) + Group Key: a2.num + -> Seq Scan on public.a2 + DN (actual rows=100..200 loops=1..1) + - datanode_1 (actual rows=200 loops=1) + - datanode_2 (actual rows=100 loops=1) + Output: id, num, name +(33 rows) + +explain (costs off,timing off,summary off,analyze,verbose) +select * from a1, a2 where a1.num = a2.num; + QUERY PLAN +------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) (actual rows=900 loops=1) + Output: a1.id, a1.num, a1.name, a2.id, a2.num, a2.name + -> Hash Join + DN (actual rows=378..522 loops=1..1) + - datanode_1 (actual rows=378 loops=1) + - datanode_2 (actual rows=522 loops=1) + Output: a1.id, a1.num, a1.name, a2.id, a2.num, a2.name + Hash Cond: (a1.num = a2.num) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + DN (actual rows=126..174 loops=1..1) + - datanode_1 (actual rows=126 loops=1) + - datanode_2 (actual rows=174 loops=1) + Output: a1.id, a1.num, a1.name + Distribute results by H: num + -> Seq Scan on public.a1 + DN (actual rows=100..200 loops=1..1) + - datanode_1 (actual rows=200 loops=1) + - datanode_2 (actual rows=100 loops=1) + Output: a1.id, a1.num, a1.name + -> Hash + DN (actual rows=126..174 loops=1..1) + - datanode_1 (actual rows=126 loops=1) + - datanode_2 (actual rows=174 loops=1) + Output: a2.id, a2.num, a2.name + Buckets: 1024 Batches: 1 Memory Usage: 16kB + -> Remote Subquery Scan on all (datanode_1,datanode_2) + DN (actual rows=126..174 loops=1..1) + - datanode_1 (actual rows=126 loops=1) + - datanode_2 (actual rows=174 loops=1) + Output: a2.id, a2.num, a2.name + Distribute results by H: num + -> Seq Scan on public.a2 + DN (actual rows=100..200 loops=1..1) + - datanode_1 (actual rows=200 loops=1) + - datanode_2 (actual rows=100 loops=1) + Output: a2.id, a2.num, a2.name +(36 rows) + +--append +explain (costs off,timing off,summary off,analyze,verbose) +select max(num) from a1 union select min(num) from a1 order by 1; + QUERY PLAN +----------------------------------------------------------------------------------------------------- + Unique (actual rows=2 loops=1) + Output: (max(a1.num)) + -> Sort (actual rows=2 loops=1) + Output: (max(a1.num)) + Sort Key: (max(a1.num)) + Sort Method: quicksort Memory: 25kB + -> Append (actual rows=2 loops=1) + -> Finalize Aggregate (actual rows=1 loops=1) + Output: max(a1.num) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (actual rows=2 loops=1) + Output: PARTIAL max(a1.num) + -> Partial Aggregate + DN (actual rows=1..1 loops=1..1) + - datanode_1 (actual rows=1 loops=1) + - datanode_2 (actual rows=1 loops=1) + Output: PARTIAL max(a1.num) + -> Seq Scan on public.a1 + DN (actual rows=100..200 loops=1..1) + - datanode_1 (actual rows=200 loops=1) + - datanode_2 (actual rows=100 loops=1) + Output: a1.id, a1.num, a1.name + -> Finalize Aggregate (actual rows=1 loops=1) + Output: min(a1_1.num) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (actual rows=2 loops=1) + Output: PARTIAL min(a1_1.num) + -> Partial Aggregate + DN (actual rows=1..1 loops=1..1) + 
- datanode_1 (actual rows=1 loops=1) + - datanode_2 (actual rows=1 loops=1) + Output: PARTIAL min(a1_1.num) + -> Seq Scan on public.a1 a1_1 + DN (actual rows=100..200 loops=1..1) + - datanode_1 (actual rows=200 loops=1) + - datanode_2 (actual rows=100 loops=1) + Output: a1_1.id, a1_1.num, a1_1.name +(35 rows) + +--subplan +explain (costs off,timing off,summary off,analyze,verbose) +select * from a1 where id in (select count(*) from a2 where a1.num=a2.num); + QUERY PLAN +------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) (actual rows=100 loops=1) + Output: a1.id, a1.num, a1.name + -> Seq Scan on public.a1 + DN (actual rows=0..100 loops=1..1) + - datanode_1 (actual rows=0 loops=1) + - datanode_2 (actual rows=100 loops=1) + Output: a1.id, a1.num, a1.name + Filter: (SubPlan 1) + SubPlan 1 + -> Finalize Aggregate + DN (actual rows=1..1 loops=100..200) + - datanode_1 (actual rows=1 loops=200) + - datanode_2 (actual rows=1 loops=100) + Output: count(*) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + DN (actual rows=2..2 loops=100..200) + - datanode_1 (actual rows=2 loops=200) + - datanode_2 (actual rows=2 loops=100) + Output: PARTIAL count(*) + -> Partial Aggregate + DN (actual rows=1..1 loops=1..1) + - datanode_1 (actual rows=1 loops=1) + - datanode_2 (actual rows=1 loops=1) + Output: PARTIAL count(*) + -> Seq Scan on public.a2 + DN (actual rows=1..2 loops=1..1) + - datanode_1 (actual rows=2 loops=1) + - datanode_2 (actual rows=1 loops=1) + Output: a2.id, a2.num, a2.name + Filter: (a1.num = a2.num) +(30 rows) + +--initplan +explain (costs off,timing off,summary off,analyze,verbose) +select * from a1 where num >= (select count(*) from a2 where name='a'); + QUERY PLAN +----------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) (actual rows=3 loops=1) + Output: a1.id, a1.num, a1.name + -> Seq Scan on public.a1 + DN (actual rows=1..2 loops=1..1) + - datanode_1 (actual rows=2 loops=1) + - datanode_2 (actual rows=1 loops=1) + Output: a1.id, a1.num, a1.name + Filter: (a1.num >= $0) + InitPlan 1 (returns $0) + -> Finalize Aggregate + DN (actual rows=1..1 loops=1..1) + - datanode_1 (actual rows=1 loops=1) + - datanode_2 (actual rows=1 loops=1) + Output: count(*) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + DN (actual rows=2..2 loops=1..1) + - datanode_1 (actual rows=2 loops=1) + - datanode_2 (actual rows=2 loops=1) + Output: PARTIAL count(*) + -> Partial Aggregate + DN (actual rows=1..1 loops=1..1) + - datanode_1 (actual rows=1 loops=1) + - datanode_2 (actual rows=1 loops=1) + Output: PARTIAL count(*) + -> Seq Scan on public.a2 + DN (actual rows=0..100 loops=1..1) + - datanode_1 (actual rows=100 loops=1) + - datanode_2 (actual rows=0 loops=1) + Output: a2.id, a2.num, a2.name + Filter: (a2.name = 'a'::text) +(30 rows) + +explain (costs off,timing off,summary off,analyze,verbose) +select * from a1 where num >= (select count(*) from a2 where name='b') order by id; + QUERY PLAN +----------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) (actual rows=3 loops=1) + Output: a1.id, a1.num, a1.name + Sort Key: a1.id + -> Sort + DN (actual rows=1..2 loops=1..1) + - datanode_1 (actual rows=2 loops=1) + - datanode_2 (actual rows=1 loops=1) + Output: a1.id, a1.num, a1.name + Sort Key: a1.id + Sort Method: quicksort Memory: 25kB + InitPlan 1 (returns $0) + -> Finalize 
Aggregate + DN (actual rows=1..1 loops=1..1) + - datanode_1 (actual rows=1 loops=1) + - datanode_2 (actual rows=1 loops=1) + Output: count(*) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + DN (actual rows=2..2 loops=1..1) + - datanode_1 (actual rows=2 loops=1) + - datanode_2 (actual rows=2 loops=1) + Output: PARTIAL count(*) + -> Partial Aggregate + DN (actual rows=1..1 loops=1..1) + - datanode_1 (actual rows=1 loops=1) + - datanode_2 (actual rows=1 loops=1) + Output: PARTIAL count(*) + -> Seq Scan on public.a2 + DN (actual rows=0..100 loops=1..1) + - datanode_1 (actual rows=100 loops=1) + - datanode_2 (actual rows=0 loops=1) + Output: a2.id, a2.num, a2.name + Filter: (a2.name = 'b'::text) + -> Seq Scan on public.a1 + DN (actual rows=1..2 loops=1..1) + - datanode_1 (actual rows=2 loops=1) + - datanode_2 (actual rows=1 loops=1) + Output: a1.id, a1.num, a1.name + Filter: (a1.num >= $0) +(38 rows) + +explain (costs off,timing off,summary off,analyze,verbose) +select * from a1 where num >= (select count(*) from a2 where name='c') limit 1; + QUERY PLAN +------------------------------------------------------------------------------------------- + Limit (actual rows=1 loops=1) + Output: a1.id, a1.num, a1.name + InitPlan 1 (returns $0) + -> Finalize Aggregate (actual rows=1 loops=1) + Output: count(*) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (actual rows=2 loops=1) + Output: PARTIAL count(*) + -> Partial Aggregate + DN (actual rows=1..1 loops=1..1) + - datanode_1 (actual rows=1 loops=1) + - datanode_2 (actual rows=1 loops=1) + Output: PARTIAL count(*) + -> Seq Scan on public.a2 + DN (actual rows=0..100 loops=1..1) + - datanode_1 (actual rows=0 loops=1) + - datanode_2 (actual rows=100 loops=1) + Output: a2.id, a2.num, a2.name + Filter: (a2.name = 'c'::text) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (actual rows=1 loops=1) + Output: a1.id, a1.num, a1.name + -> Limit + DN (actual rows=1..1 loops=1..1) + - datanode_1 (actual rows=1 loops=1) + - datanode_2 (actual rows=1 loops=1) + Output: a1.id, a1.num, a1.name + -> Seq Scan on public.a1 + DN (actual rows=1..1 loops=1..1) + - datanode_1 (actual rows=1 loops=1) + - datanode_2 (actual rows=1 loops=1) + Output: a1.id, a1.num, a1.name + Filter: (a1.num >= $0) +(31 rows) + +explain (costs off,timing off,summary off,analyze,verbose) +select count(*) from a1 group by name having count(*) = (select count(*) from a2 where name='a'); + QUERY PLAN +----------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) (actual rows=3 loops=1) + Output: count(*), a1.name + -> Finalize HashAggregate + DN (actual rows=1..2 loops=1..1) + - datanode_1 (actual rows=2 loops=1) + - datanode_2 (actual rows=1 loops=1) + Output: count(*), a1.name + Group Key: a1.name + Filter: (count(*) = $0) + InitPlan 1 (returns $0) + -> Finalize Aggregate + DN (actual rows=1..1 loops=1..1) + - datanode_1 (actual rows=1 loops=1) + - datanode_2 (actual rows=1 loops=1) + Output: count(*) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + DN (actual rows=2..2 loops=1..1) + - datanode_1 (actual rows=2 loops=1) + - datanode_2 (actual rows=2 loops=1) + Output: PARTIAL count(*) + -> Partial Aggregate + DN (actual rows=1..1 loops=1..1) + - datanode_1 (actual rows=1 loops=1) + - datanode_2 (actual rows=1 loops=1) + Output: PARTIAL count(*) + -> Seq Scan on public.a2 + DN (actual rows=0..100 loops=1..1) + - datanode_1 (actual rows=100 loops=1) + - datanode_2 (actual rows=0 loops=1) + 
Output: a2.id, a2.num, a2.name + Filter: (a2.name = 'a'::text) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + DN (actual rows=1..2 loops=1..1) + - datanode_1 (actual rows=2 loops=1) + - datanode_2 (actual rows=1 loops=1) + Output: a1.name, PARTIAL count(*) + Distribute results by H: name + -> Partial HashAggregate + DN (actual rows=1..2 loops=1..1) + - datanode_1 (actual rows=2 loops=1) + - datanode_2 (actual rows=1 loops=1) + Output: a1.name, PARTIAL count(*) + Group Key: a1.name + -> Seq Scan on public.a1 + DN (actual rows=100..200 loops=1..1) + - datanode_1 (actual rows=200 loops=1) + - datanode_2 (actual rows=100 loops=1) + Output: a1.id, a1.num, a1.name +(48 rows) + +--cleanup +drop table a1, a2; diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule index ebd01715..c0ccc373 100644 --- a/src/test/regress/parallel_schedule +++ b/src/test/regress/parallel_schedule @@ -161,3 +161,9 @@ test: xc_notrans_block # This runs XL specific tests test: xl_primary_key xl_foreign_key xl_distribution_column_types xl_alter_table xl_distribution_column_types_modulo xl_plan_pushdown xl_functions xl_limitations xl_user_defined_functions xl_join xl_distributed_xact xl_create_table + +# This runs TBase specific tests +test: tbase_explain + +test: redistribute_custom_types +test: nestloop_by_shard diff --git a/src/test/regress/sql/tbase_explain.sql b/src/test/regress/sql/tbase_explain.sql new file mode 100644 index 00000000..7e212bc7 --- /dev/null +++ b/src/test/regress/sql/tbase_explain.sql @@ -0,0 +1,37 @@ +--explain analyze +create table a1(id int, num int, name text); +create table a2(id int, num int, name text); +insert into a1 values(1,generate_series(1,100),'a'); +insert into a1 values(2,generate_series(1,100),'b'); +insert into a1 values(3,generate_series(1,100),'c'); +insert into a2 select * from a1; + +--normal cases +explain (costs off,timing off,summary off,analyze,verbose) +select count(*) from a1; +explain (costs off,timing off,summary off,analyze,verbose) +select num, count(*) cnt from a2 group by num order by cnt; +explain (costs off,timing off,summary off,analyze,verbose) +select * from a1, a2 where a1.num = a2.num; + +--append +explain (costs off,timing off,summary off,analyze,verbose) +select max(num) from a1 union select min(num) from a1 order by 1; + +--subplan +explain (costs off,timing off,summary off,analyze,verbose) +select * from a1 where id in (select count(*) from a2 where a1.num=a2.num); + +--initplan +explain (costs off,timing off,summary off,analyze,verbose) +select * from a1 where num >= (select count(*) from a2 where name='a'); +explain (costs off,timing off,summary off,analyze,verbose) +select * from a1 where num >= (select count(*) from a2 where name='b') order by id; +explain (costs off,timing off,summary off,analyze,verbose) +select * from a1 where num >= (select count(*) from a2 where name='c') limit 1; +explain (costs off,timing off,summary off,analyze,verbose) +select count(*) from a1 group by name having count(*) = (select count(*) from a2 where name='a'); + +--cleanup +drop table a1, a2; + From 799266e603f90f920e9152a33deeb68abd3ea039 Mon Sep 17 00:00:00 2001 From: andrelin Date: Wed, 10 Mar 2021 11:22:13 +0800 Subject: [PATCH 141/578] Fix a compile warning --- src/backend/access/common/printtup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/access/common/printtup.c b/src/backend/access/common/printtup.c index a9b0b09b..dfd64707 100644 --- a/src/backend/access/common/printtup.c +++ 
b/src/backend/access/common/printtup.c @@ -447,6 +447,7 @@ printtup(TupleTableSlot *slot, DestReceiver *self) } else { + int len = strlen(outputstr); #ifdef __TBASE__ if (slot->tts_tupleDescriptor->attrs[i]->atttypid == RECORDOID && self->mydest == DestRemoteExecute) { @@ -482,7 +483,6 @@ printtup(TupleTableSlot *slot, DestReceiver *self) pfree(tupdesc_data.data); } #endif - int len = strlen(outputstr); pq_sendint(&buf, len, 4); appendBinaryStringInfo(&buf, outputstr, len); } From d0dc3c6c72d1918ef025484b076c5a18cb767314 Mon Sep 17 00:00:00 2001 From: andrelin Date: Wed, 10 Mar 2021 14:35:33 +0800 Subject: [PATCH 142/578] fix bug in parallel sort and parallel hash redistributing data http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131083333949 --- src/backend/executor/nodeAgg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/executor/nodeAgg.c b/src/backend/executor/nodeAgg.c index d36c3df4..8b1695a6 100644 --- a/src/backend/executor/nodeAgg.c +++ b/src/backend/executor/nodeAgg.c @@ -6060,7 +6060,7 @@ GetReDistributeData(ReDistributeState *state, BufFile *file, TupleTableSlot **sl } else { - data = (char *)palloc0(nread); + data = (char *)palloc0(dataLen); } READ_DATA: From ae830ca4141a8e4ba78387615b3e936acc59018d Mon Sep 17 00:00:00 2001 From: andrelin Date: Tue, 16 Mar 2021 12:03:31 +0800 Subject: [PATCH 143/578] Support explain analyze for INSERT INTO SELECT statement tapd: http://tapd.oa.com/pgxz/prong/stories/view/1010092131862892295 --- src/backend/commands/explain_dist.c | 6 +++--- src/backend/tcop/pquery.c | 26 +++++++++++++++++++------- 2 files changed, 22 insertions(+), 10 deletions(-) diff --git a/src/backend/commands/explain_dist.c b/src/backend/commands/explain_dist.c index 65d9fed8..1f2d50e4 100644 --- a/src/backend/commands/explain_dist.c +++ b/src/backend/commands/explain_dist.c @@ -706,15 +706,15 @@ HandleRemoteInstr(char *msg_body, size_t len, int nodeid, ResponseCombiner *comb bool found; RemoteInstr *cur_instr; - /* must doing this under per query context */ - MemoryContext oldcontext = MemoryContextSwitchTo(combiner->ss.ps.state->es_query_cxt); - if (combiner->recv_instr_htbl == NULL) { elog(ERROR, "combiner is not prepared for instrumentation"); } elog(DEBUG1, "Handle remote instrument: nodeid %d", nodeid); + /* must doing this under per query context */ + MemoryContext oldcontext = MemoryContextSwitchTo(combiner->ss.ps.state->es_query_cxt); + recv_str = makeStringInfo(); appendBinaryStringInfo(recv_str, msg_body, len); diff --git a/src/backend/tcop/pquery.c b/src/backend/tcop/pquery.c index c1eadf2c..5d358337 100644 --- a/src/backend/tcop/pquery.c +++ b/src/backend/tcop/pquery.c @@ -45,6 +45,7 @@ #include "optimizer/planner.h" #include "executor/execParallel.h" #include "commands/defrem.h" +#include "commands/explain_dist.h" #include "commands/vacuum.h" #include "postmaster/postmaster.h" #include "optimizer/planmain.h" @@ -65,7 +66,8 @@ static void ProcessQuery(PlannedStmt *plan, ParamListInfo params, QueryEnvironment *queryEnv, DestReceiver *dest, - char *completionTag); + char *completionTag, + int instrument); static void FillPortalStore(Portal portal, bool isTopLevel); static uint64 RunFromStore(Portal portal, ScanDirection direction, uint64 count, DestReceiver *dest); @@ -179,8 +181,9 @@ ProcessQuery(PlannedStmt *plan, ParamListInfo params, QueryEnvironment *queryEnv, DestReceiver *dest, - char *completionTag) -{// #lizard forgives + char *completionTag, + int instrument) +{ QueryDesc *queryDesc; /* @@ -191,13 +194,13 @@ 
ProcessQuery(PlannedStmt *plan, { queryDesc = CreateQueryDesc(plan, sourceText, InvalidSnapshot, InvalidSnapshot, - dest, params, queryEnv, 0); + dest, params, queryEnv, instrument); } else #endif queryDesc = CreateQueryDesc(plan, sourceText, GetActiveSnapshot(), InvalidSnapshot, - dest, params, queryEnv, 0); + dest, params, queryEnv, instrument); /* * Call ExecutorStart to prepare the plan for execution @@ -248,6 +251,13 @@ ProcessQuery(PlannedStmt *plan, } } +#ifdef __TBASE__ + if (instrument && queryDesc->planstate) + { + SendLocalInstr(queryDesc->planstate); + } +#endif + /* * Now, we close down all the scans and free allocated resources. */ @@ -2090,7 +2100,8 @@ PortalRunMulti(Portal portal, portal->sourceText, portal->portalParams, portal->queryEnv, - dest, completionTag); + dest, completionTag, + portal->up_instrument); #ifdef PGXC /* it's special for INSERT */ if (IS_PGXC_COORDINATOR && @@ -2106,7 +2117,8 @@ PortalRunMulti(Portal portal, portal->sourceText, portal->portalParams, portal->queryEnv, - altdest, NULL); + altdest, NULL, + portal->up_instrument); } if (log_executor_stats) From 1ceb1a4eba88297cdb128a3b796995e6acbcdfaf Mon Sep 17 00:00:00 2001 From: andrelin Date: Tue, 16 Mar 2021 15:52:10 +0800 Subject: [PATCH 144/578] Make cost_xxx functions more readable cherry-pick same refactor from V3 commit: ce5a1e72 http://tapd.oa.com/pgxz/prong/stories/view/1010092131862621757 --- src/backend/optimizer/path/costsize.c | 323 +++++----------------- src/backend/optimizer/util/pathnode.c | 2 +- src/test/regress/expected/create_view.out | 6 +- 3 files changed, 68 insertions(+), 263 deletions(-) diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index 06e9a7c3..491d9d40 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -179,7 +179,6 @@ static double relation_byte_size(double tuples, int width); static double page_size(double tuples, int width); static double get_parallel_divisor(Path *path); -#ifdef __TBASE__ /* * In PostgreSQL, the row count estimate of a base rel scan, like a Seq Scan * or an Index Scan, can be directly copied from RelOptInfo->rows/tuples. In @@ -211,72 +210,14 @@ static double get_parallel_divisor(Path *path); * the original RelOptInfo, you'll get a compiler error. That's good: it forces * you to think whether the value needs to be divided by nDNs or not. */ -typedef struct -{ - /* Values copied from RelOptInfo as is, for convenience */ - Index relid; - RTEKind rtekind; /* RELATION, SUBQUERY, or FUNCTION */ - Oid reltablespace; /* containing tablespace */ - double allvisfrac; - - /* Values adjusted from RelOptInfo, by dividing by number of DNs */ - double rows; - BlockNumber pages; - double tuples; - - /* the original RelOptInfo */ - RelOptInfo *orig; -} RelOptInfoDataNode; +#define PAGES_PER_DN(pages) \ + (ceil((double) (pages) / num_nodes)) -/* ParamPathInfoDataNode is a similar proxy for ParamPathInfo. 
*/ -typedef struct -{ - double ppi_rows; /* estimated number of result tuples */ - List *ppi_clauses; /* join clauses available from outer rels */ - - ParamPathInfo *orig; -} ParamPathInfoDataNode; - -static ParamPathInfoDataNode * -adjust_reloptinfo(Path *path, RelOptInfoDataNode *basescan, RelOptInfo *baserel_orig, - ParamPathInfoDataNode *param_info, ParamPathInfo *param_info_orig) -{ - double nodes = path_count_datanodes(path); - - basescan->relid = baserel_orig->relid; - basescan->rtekind = baserel_orig->rtekind; - basescan->reltablespace = baserel_orig->reltablespace; - basescan->allvisfrac = baserel_orig->allvisfrac; - - basescan->rows = clamp_row_est(baserel_orig->rows / nodes); - basescan->tuples = clamp_row_est(baserel_orig->tuples / nodes); - basescan->pages = ceil((double) baserel_orig->pages / nodes); - - basescan->orig = baserel_orig; - - if (param_info_orig) - { - param_info->ppi_rows = clamp_row_est(param_info_orig->ppi_rows / nodes); - param_info->ppi_clauses = param_info_orig->ppi_clauses; - param_info->orig = param_info_orig; - return param_info; - } - else - return NULL; -} - -/* - * ADJUST_BASESCAN initializes the proxy structs for RelOptInfo and ParamPathInfo, - * adjusting them by # of data nodes as needed. - */ -#define ADJUST_BASESCAN(path, baserel_orig, baserel, param_info_orig, param_info) \ - RelOptInfoDataNode baserel_adjusted; \ - ParamPathInfoDataNode param_info_adjusted; \ - RelOptInfoDataNode *baserel = &baserel_adjusted; \ - ParamPathInfoDataNode *param_info = adjust_reloptinfo(path, &baserel_adjusted, baserel_orig, \ - ¶m_info_adjusted, param_info_orig) -#endif +#define ROWS_PER_DN(rows) \ + (clamp_row_est((rows) / num_nodes)) +#define TUPLES_PER_DN(tuples) \ + (clamp_row_est((tuples) / num_nodes)) /* * clamp_row_est @@ -298,7 +239,6 @@ clamp_row_est(double nrows) return nrows; } - /* * cost_seqscan * Determines and returns the cost of scanning a relation sequentially. 
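The three *_PER_DN macros above are the whole of the per-datanode adjustment: row, tuple
and page estimates taken from a RelOptInfo are divided by the number of datanodes the path
touches before they feed the usual cost formulas. A minimal standalone sketch of that
arithmetic (the clamp_row_est here is a simplified stand-in for the planner's own function,
and the two-datanode figures are made up for illustration):

#include <math.h>
#include <stdio.h>

/* simplified stand-in for the planner's clamp_row_est(): round and keep >= 1 */
static double
clamp_row_est(double nrows)
{
    return (nrows <= 1.0) ? 1.0 : rint(nrows);
}

#define PAGES_PER_DN(pages)   (ceil((double) (pages) / num_nodes))
#define ROWS_PER_DN(rows)     (clamp_row_est((rows) / num_nodes))
#define TUPLES_PER_DN(tuples) (clamp_row_est((tuples) / num_nodes))

int
main(void)
{
    double num_nodes = 2.0;      /* what path_count_datanodes() would return */
    double rel_tuples = 10000.0; /* RelOptInfo->tuples for the whole table */
    double rel_pages = 55.0;     /* RelOptInfo->pages for the whole table */

    /* each datanode scans only its share, so the per-DN cost inputs shrink */
    printf("tuples per DN: %.0f\n", TUPLES_PER_DN(rel_tuples)); /* 5000 */
    printf("pages per DN:  %.0f\n", PAGES_PER_DN(rel_pages));   /* 28 */
    return 0;
}

cost_seqscan() and friends then charge spc_seq_page_cost and cpu_tuple_cost against these
per-DN figures instead of the whole-table numbers, which is what the rewritten hunks below do.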
@@ -308,20 +248,15 @@ clamp_row_est(double nrows) */ void cost_seqscan(Path *path, PlannerInfo *root, -#ifdef __TBASE__ - RelOptInfo *baserel_orig, ParamPathInfo *param_info_orig) -{ - ADJUST_BASESCAN(path, baserel_orig, baserel, param_info_orig, param_info); -#else RelOptInfo *baserel, ParamPathInfo *param_info) { -#endif Cost startup_cost = 0; Cost cpu_run_cost; Cost disk_run_cost; double spc_seq_page_cost; QualCost qpqual_cost; Cost cpu_per_tuple; + double num_nodes = path_count_datanodes(path); /* Should only be applied to base relations */ Assert(baserel->relid > 0); @@ -329,9 +264,9 @@ cost_seqscan(Path *path, PlannerInfo *root, /* Mark the path with the correct row estimate */ if (param_info) - path->rows = param_info->ppi_rows; + path->rows = ROWS_PER_DN(param_info->ppi_rows); else - path->rows = baserel->rows; + path->rows = ROWS_PER_DN(baserel->rows); if (!enable_seqscan) startup_cost += disable_cost; @@ -344,18 +279,14 @@ cost_seqscan(Path *path, PlannerInfo *root, /* * disk costs */ - disk_run_cost = spc_seq_page_cost * baserel->pages; + disk_run_cost = spc_seq_page_cost * PAGES_PER_DN(baserel->pages); /* CPU costs */ -#ifdef __TBASE__ - get_restriction_qual_cost(root, baserel_orig, param_info_orig, &qpqual_cost); -#else get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost); -#endif startup_cost += qpqual_cost.startup; cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple; - cpu_run_cost = cpu_per_tuple * baserel->tuples; + cpu_run_cost = cpu_per_tuple * TUPLES_PER_DN(baserel->tuples); /* tlist eval costs are paid per output row, not per tuple scanned */ startup_cost += path->pathtarget->cost.startup; cpu_run_cost += path->pathtarget->cost.per_tuple * path->rows; @@ -395,14 +326,8 @@ cost_seqscan(Path *path, PlannerInfo *root, */ void cost_samplescan(Path *path, PlannerInfo *root, -#ifdef __TBASE__ - RelOptInfo *baserel_orig, ParamPathInfo *param_info_orig) -{ - ADJUST_BASESCAN(path, baserel_orig, baserel, param_info_orig, param_info); -#else RelOptInfo *baserel, ParamPathInfo *param_info) { -#endif Cost startup_cost = 0; Cost run_cost = 0; RangeTblEntry *rte; @@ -413,6 +338,7 @@ cost_samplescan(Path *path, PlannerInfo *root, spc_page_cost; QualCost qpqual_cost; Cost cpu_per_tuple; + double num_nodes = path_count_datanodes(path); /* Should only be applied to base relations with tablesample clauses */ Assert(baserel->relid > 0); @@ -424,9 +350,9 @@ cost_samplescan(Path *path, PlannerInfo *root, /* Mark the path with the correct row estimate */ if (param_info) - path->rows = param_info->ppi_rows; + path->rows = ROWS_PER_DN(param_info->ppi_rows); else - path->rows = baserel->rows; + path->rows = ROWS_PER_DN(baserel->rows); /* fetch estimated page cost for tablespace containing table */ get_tablespace_page_costs(baserel->reltablespace, @@ -441,7 +367,7 @@ cost_samplescan(Path *path, PlannerInfo *root, * disk costs (recall that baserel->pages has already been set to the * number of pages the sampling method will visit) */ - run_cost += spc_page_cost * baserel->pages; + run_cost += spc_page_cost * PAGES_PER_DN(baserel->pages); /* * CPU costs (recall that baserel->tuples has already been set to the @@ -451,15 +377,11 @@ cost_samplescan(Path *path, PlannerInfo *root, * simple constants anyway. We also don't charge anything for the * calculations the sampling method might do internally. 
*/ -#ifdef __TBASE__ - get_restriction_qual_cost(root, baserel_orig, param_info_orig, &qpqual_cost); -#else get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost); -#endif startup_cost += qpqual_cost.startup; cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple; - run_cost += cpu_per_tuple * baserel->tuples; + run_cost += cpu_per_tuple * TUPLES_PER_DN(baserel->tuples); /* tlist eval costs are paid per output row, not per tuple scanned */ startup_cost += path->pathtarget->cost.startup; run_cost += path->pathtarget->cost.per_tuple * path->rows; @@ -480,26 +402,20 @@ cost_samplescan(Path *path, PlannerInfo *root, */ void cost_gather(GatherPath *path, PlannerInfo *root, -#ifdef __TBASE__ - RelOptInfo *rel_orig, ParamPathInfo *param_info_orig, -#else RelOptInfo *rel, ParamPathInfo *param_info, -#endif double *rows) { -#ifdef __TBASE__ - ADJUST_BASESCAN(&path->path, rel_orig, rel, param_info_orig, param_info); -#endif Cost startup_cost = 0; Cost run_cost = 0; + double num_nodes = path_count_datanodes((Path *) path); /* Mark the path with the correct row estimate */ if (rows) path->path.rows = *rows; else if (param_info) - path->path.rows = param_info->ppi_rows; + path->path.rows = ROWS_PER_DN(param_info->ppi_rows); else - path->path.rows = rel->rows; + path->path.rows = ROWS_PER_DN(rel->rows); startup_cost = path->subpath->startup_cost; @@ -537,30 +453,24 @@ reset_cost_gather(GatherPath *path) */ void cost_gather_merge(GatherMergePath *path, PlannerInfo *root, -#ifdef __TBASE__ - RelOptInfo *rel_orig, ParamPathInfo *param_info_orig, -#else RelOptInfo *rel, ParamPathInfo *param_info, -#endif Cost input_startup_cost, Cost input_total_cost, double *rows) { -#ifdef __TBASE__ - ADJUST_BASESCAN(&path->path, rel_orig, rel, param_info_orig, param_info); -#endif Cost startup_cost = 0; Cost run_cost = 0; Cost comparison_cost; double N; double logN; + double num_nodes = path_count_datanodes((Path *) path); /* Mark the path with the correct row estimate */ if (rows) path->path.rows = *rows; else if (param_info) - path->path.rows = param_info->ppi_rows; + path->path.rows = ROWS_PER_DN(param_info->ppi_rows); else - path->path.rows = rel->rows; + path->path.rows = ROWS_PER_DN(rel->rows); if (!enable_gathermerge) startup_cost += disable_cost; @@ -622,12 +532,7 @@ cost_index(IndexPath *path, PlannerInfo *root, double loop_count, bool partial_path) {// #lizard forgives IndexOptInfo *index = path->indexinfo; -#ifdef __TBASE__ - RelOptInfo *baserel_orig = index->rel; - ADJUST_BASESCAN(&path->path, baserel_orig, baserel, path->path.param_info, param_info); -#else RelOptInfo *baserel = index->rel; -#endif bool indexonly = (path->path.pathtype == T_IndexOnlyScan); amcostestimate_function amcostestimate; List *qpquals; @@ -650,17 +555,13 @@ cost_index(IndexPath *path, PlannerInfo *root, double loop_count, double rand_heap_pages; double index_pages; double nodes = 1; + double index_pages_per_dn; + double baserel_pages_per_dn; + double num_nodes = path_count_datanodes((Path *) path); -#ifdef __TBASE__ - nodes = path_count_datanodes(&path->path); - /* Should only be applied to base relations */ - Assert(IsA(baserel_orig, RelOptInfo) && - IsA(index, IndexOptInfo)); -#else /* Should only be applied to base relations */ Assert(IsA(baserel, RelOptInfo) && IsA(index, IndexOptInfo)); -#endif Assert(baserel->relid > 0); Assert(baserel->rtekind == RTE_RELATION); @@ -671,21 +572,9 @@ cost_index(IndexPath *path, PlannerInfo *root, double loop_count, * baserestrictinfo as the list of relevant restriction clauses for 
the * rel. */ -#ifdef __TBASE__ - if (param_info) - { - path->path.rows = param_info->ppi_rows; - /* qpquals come from the rel's restriction clauses and ppi_clauses */ - qpquals = list_concat( - extract_nonindex_conditions(path->indexinfo->indrestrictinfo, - path->indexquals), - extract_nonindex_conditions(param_info->ppi_clauses, - path->indexquals)); - } -#else if (path->path.param_info) { - path->path.rows = path->path.param_info->ppi_rows; + path->path.rows = ROWS_PER_DN(path->path.param_info->ppi_rows); /* qpquals come from the rel's restriction clauses and ppi_clauses */ qpquals = list_concat( extract_nonindex_conditions(path->indexinfo->indrestrictinfo, @@ -693,10 +582,9 @@ cost_index(IndexPath *path, PlannerInfo *root, double loop_count, extract_nonindex_conditions(path->path.param_info->ppi_clauses, path->indexquals)); } -#endif else { - path->path.rows = baserel->rows; + path->path.rows = ROWS_PER_DN(baserel->rows); /* qpquals come from just the rel's restriction clauses */ qpquals = extract_nonindex_conditions(path->indexinfo->indrestrictinfo, path->indexquals); @@ -720,7 +608,8 @@ cost_index(IndexPath *path, PlannerInfo *root, double loop_count, &index_pages); /* The index pages should be divided among all the data nodes like baserel dose. */ - index_pages = ceil(index_pages / nodes); + index_pages_per_dn = PAGES_PER_DN(index_pages); + baserel_pages_per_dn = PAGES_PER_DN(baserel->pages); /* * Save amcostestimate's results for possible use in bitmap scan planning. @@ -735,7 +624,7 @@ cost_index(IndexPath *path, PlannerInfo *root, double loop_count, run_cost += indexTotalCost - indexStartupCost; /* estimate number of main-table tuples fetched */ - tuples_fetched = clamp_row_est(indexSelectivity * baserel->tuples); + tuples_fetched = clamp_row_est(indexSelectivity * TUPLES_PER_DN(baserel->tuples)); /* fetch estimated page costs for tablespace containing table */ get_tablespace_page_costs(baserel->reltablespace, @@ -780,12 +669,8 @@ cost_index(IndexPath *path, PlannerInfo *root, double loop_count, * fetches are random accesses. */ pages_fetched = index_pages_fetched(tuples_fetched * loop_count, - baserel->pages, -#ifdef __TBASE__ - index_pages, -#else - (double) index->pages, -#endif + baserel_pages_per_dn, + (double) index_pages_per_dn, root); if (indexonly) @@ -805,15 +690,11 @@ cost_index(IndexPath *path, PlannerInfo *root, double loop_count, * where such a plan is actually interesting, only one page would get * fetched per scan anyway, so it shouldn't matter much.) */ - pages_fetched = ceil(indexSelectivity * (double) baserel->pages); + pages_fetched = ceil(indexSelectivity * (double) PAGES_PER_DN(baserel->pages)); pages_fetched = index_pages_fetched(pages_fetched * loop_count, - baserel->pages, -#ifdef __TBASE__ - index_pages, -#else - (double) index->pages, -#endif + baserel_pages_per_dn, + (double) index_pages_per_dn, root); if (indexonly) @@ -828,12 +709,8 @@ cost_index(IndexPath *path, PlannerInfo *root, double loop_count, * interpolate between that and the correlation-derived result. 
*/ pages_fetched = index_pages_fetched(tuples_fetched, - baserel->pages, -#ifdef __TBASE__ - index_pages, -#else - (double) index->pages, -#endif + baserel_pages_per_dn, + (double) index_pages_per_dn, root); if (indexonly) @@ -845,7 +722,7 @@ cost_index(IndexPath *path, PlannerInfo *root, double loop_count, max_IO_cost = pages_fetched * spc_random_page_cost; /* min_IO_cost is for the perfectly correlated case (csquared=1) */ - pages_fetched = ceil(indexSelectivity * (double) baserel->pages); + pages_fetched = ceil(indexSelectivity * (double) baserel_pages_per_dn); if (indexonly) pages_fetched = ceil(pages_fetched * (1.0 - baserel->allvisfrac)); @@ -876,13 +753,8 @@ cost_index(IndexPath *path, PlannerInfo *root, double loop_count, * sequential as for parallel scans the pages are accessed in random * order. */ -#ifdef __TBASE__ - path->path.parallel_workers = compute_parallel_worker(baserel_orig, - rand_heap_pages, index_pages); -#else path->path.parallel_workers = compute_parallel_worker(baserel, - rand_heap_pages, index_pages); -#endif + rand_heap_pages, index_pages_per_dn); /* * Fall out if workers can't be assigned for parallel scan, because in @@ -1379,14 +1251,8 @@ cost_bitmap_or_node(BitmapOrPath *path, PlannerInfo *root) */ void cost_tidscan(Path *path, PlannerInfo *root, -#ifdef __TBASE__ - RelOptInfo *baserel_orig, List *tidquals, ParamPathInfo *param_info_orig) -{ - ADJUST_BASESCAN(path, baserel_orig, baserel, param_info_orig, param_info); -#else RelOptInfo *baserel, List *tidquals, ParamPathInfo *param_info) { -#endif Cost startup_cost = 0; Cost run_cost = 0; bool isCurrentOf = false; @@ -1396,6 +1262,7 @@ cost_tidscan(Path *path, PlannerInfo *root, int ntuples; ListCell *l; double spc_random_page_cost; + double num_nodes = path_count_datanodes(path); /* Should only be applied to base relations */ Assert(baserel->relid > 0); @@ -1403,9 +1270,9 @@ cost_tidscan(Path *path, PlannerInfo *root, /* Mark the path with the correct row estimate */ if (param_info) - path->rows = param_info->ppi_rows; + path->rows = ROWS_PER_DN(param_info->ppi_rows); else - path->rows = baserel->rows; + path->rows = ROWS_PER_DN(baserel->rows); /* Count how many tuples we expect to retrieve */ ntuples = 0; @@ -1442,11 +1309,7 @@ cost_tidscan(Path *path, PlannerInfo *root, */ if (isCurrentOf) { -#ifdef __TBASE__ - Assert(baserel->orig->baserestrictcost.startup >= disable_cost); -#else Assert(baserel->baserestrictcost.startup >= disable_cost); -#endif startup_cost -= disable_cost; } else if (!enable_tidscan) @@ -1467,11 +1330,7 @@ cost_tidscan(Path *path, PlannerInfo *root, run_cost += spc_random_page_cost * ntuples; /* Add scanning CPU costs */ -#ifdef __TBASE__ - get_restriction_qual_cost(root, baserel_orig, param_info_orig, &qpqual_cost); -#else get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost); -#endif /* XXX currently we assume TID quals are a subset of qpquals */ startup_cost += qpqual_cost.startup + tid_qual_cost.per_tuple; @@ -1496,18 +1355,13 @@ cost_tidscan(Path *path, PlannerInfo *root, */ void cost_subqueryscan(SubqueryScanPath *path, PlannerInfo *root, -#ifdef __TBASE__ - RelOptInfo *baserel_orig, ParamPathInfo *param_info_orig) -{ - ADJUST_BASESCAN(&path->path, baserel_orig, baserel, param_info_orig, param_info); -#else RelOptInfo *baserel, ParamPathInfo *param_info) { -#endif Cost startup_cost; Cost run_cost; QualCost qpqual_cost; Cost cpu_per_tuple; + double num_nodes = path_count_datanodes((Path *)path); /* Should only be applied to base relations that are subqueries */ 
Assert(baserel->relid > 0); @@ -1515,9 +1369,9 @@ cost_subqueryscan(SubqueryScanPath *path, PlannerInfo *root, /* Mark the path with the correct row estimate */ if (param_info) - path->path.rows = param_info->ppi_rows; + path->path.rows = ROWS_PER_DN(param_info->ppi_rows); else - path->path.rows = baserel->rows; + path->path.rows = ROWS_PER_DN(baserel->rows); /* * Cost of path is cost of evaluating the subplan, plus cost of evaluating @@ -1528,11 +1382,7 @@ cost_subqueryscan(SubqueryScanPath *path, PlannerInfo *root, path->path.startup_cost = path->subpath->startup_cost; path->path.total_cost = path->subpath->total_cost; -#ifdef __TBASE__ - get_restriction_qual_cost(root, baserel_orig, param_info_orig, &qpqual_cost); -#else get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost); -#endif startup_cost = qpqual_cost.startup; cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple; @@ -1555,20 +1405,15 @@ cost_subqueryscan(SubqueryScanPath *path, PlannerInfo *root, */ void cost_functionscan(Path *path, PlannerInfo *root, -#ifdef __TBASE__ - RelOptInfo *baserel_orig, ParamPathInfo *param_info_orig) -{ - ADJUST_BASESCAN(path, baserel_orig, baserel, param_info_orig, param_info); -#else RelOptInfo *baserel, ParamPathInfo *param_info) { -#endif Cost startup_cost = 0; Cost run_cost = 0; QualCost qpqual_cost; Cost cpu_per_tuple; RangeTblEntry *rte; QualCost exprcost; + double num_nodes = path_count_datanodes(path); /* Should only be applied to base relations that are functions */ Assert(baserel->relid > 0); @@ -1577,9 +1422,9 @@ cost_functionscan(Path *path, PlannerInfo *root, /* Mark the path with the correct row estimate */ if (param_info) - path->rows = param_info->ppi_rows; + path->rows = ROWS_PER_DN(param_info->ppi_rows); else - path->rows = baserel->rows; + path->rows = ROWS_PER_DN(baserel->rows); /* * Estimate costs of executing the function expression(s). @@ -1599,11 +1444,7 @@ cost_functionscan(Path *path, PlannerInfo *root, startup_cost += exprcost.startup + exprcost.per_tuple; /* Add scanning CPU costs */ -#ifdef __TBASE__ - get_restriction_qual_cost(root, baserel_orig, param_info_orig, &qpqual_cost); -#else get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost); -#endif startup_cost += qpqual_cost.startup; cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple; @@ -1626,20 +1467,15 @@ cost_functionscan(Path *path, PlannerInfo *root, */ void cost_tablefuncscan(Path *path, PlannerInfo *root, -#ifdef __TBASE__ - RelOptInfo *baserel_orig, ParamPathInfo *param_info_orig) -{ - ADJUST_BASESCAN(path, baserel_orig, baserel, param_info_orig, param_info); -#else RelOptInfo *baserel, ParamPathInfo *param_info) { -#endif Cost startup_cost = 0; Cost run_cost = 0; QualCost qpqual_cost; Cost cpu_per_tuple; RangeTblEntry *rte; QualCost exprcost; + double num_nodes = path_count_datanodes(path); /* Should only be applied to base relations that are functions */ Assert(baserel->relid > 0); @@ -1648,9 +1484,9 @@ cost_tablefuncscan(Path *path, PlannerInfo *root, /* Mark the path with the correct row estimate */ if (param_info) - path->rows = param_info->ppi_rows; + path->rows = ROWS_PER_DN(param_info->ppi_rows); else - path->rows = baserel->rows; + path->rows = ROWS_PER_DN(baserel->rows); /* * Estimate costs of executing the table func expression(s). 
@@ -1665,15 +1501,11 @@ cost_tablefuncscan(Path *path, PlannerInfo *root, startup_cost += exprcost.startup + exprcost.per_tuple; /* Add scanning CPU costs */ -#ifdef __TBASE__ - get_restriction_qual_cost(root, baserel_orig, param_info_orig, &qpqual_cost); -#else get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost); -#endif startup_cost += qpqual_cost.startup; cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple; - run_cost += cpu_per_tuple * baserel->tuples; + run_cost += cpu_per_tuple * TUPLES_PER_DN(baserel->tuples); /* tlist eval costs are paid per output row, not per tuple scanned */ startup_cost += path->pathtarget->cost.startup; @@ -1692,18 +1524,13 @@ cost_tablefuncscan(Path *path, PlannerInfo *root, */ void cost_valuesscan(Path *path, PlannerInfo *root, -#ifdef __TBASE__ - RelOptInfo *baserel_orig, ParamPathInfo *param_info_orig) -{ - ADJUST_BASESCAN(path, baserel_orig, baserel, param_info_orig, param_info); -#else RelOptInfo *baserel, ParamPathInfo *param_info) { -#endif Cost startup_cost = 0; Cost run_cost = 0; QualCost qpqual_cost; Cost cpu_per_tuple; + double num_nodes = path_count_datanodes(path); /* Should only be applied to base relations that are values lists */ Assert(baserel->relid > 0); @@ -1711,9 +1538,9 @@ cost_valuesscan(Path *path, PlannerInfo *root, /* Mark the path with the correct row estimate */ if (param_info) - path->rows = param_info->ppi_rows; + path->rows = ROWS_PER_DN(param_info->ppi_rows); else - path->rows = baserel->rows; + path->rows = ROWS_PER_DN(baserel->rows); /* * For now, estimate list evaluation cost at one operator eval per list @@ -1722,15 +1549,11 @@ cost_valuesscan(Path *path, PlannerInfo *root, cpu_per_tuple = cpu_operator_cost; /* Add scanning CPU costs */ -#ifdef __TBASE__ - get_restriction_qual_cost(root, baserel_orig, param_info_orig, &qpqual_cost); -#else get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost); -#endif startup_cost += qpqual_cost.startup; cpu_per_tuple += cpu_tuple_cost + qpqual_cost.per_tuple; - run_cost += cpu_per_tuple * baserel->tuples; + run_cost += cpu_per_tuple * TUPLES_PER_DN(baserel->tuples); /* tlist eval costs are paid per output row, not per tuple scanned */ startup_cost += path->pathtarget->cost.startup; @@ -1752,18 +1575,13 @@ cost_valuesscan(Path *path, PlannerInfo *root, */ void cost_ctescan(Path *path, PlannerInfo *root, -#ifdef __TBASE__ - RelOptInfo *baserel_orig, ParamPathInfo *param_info_orig) -{ - ADJUST_BASESCAN(path, baserel_orig, baserel, param_info_orig, param_info); -#else RelOptInfo *baserel, ParamPathInfo *param_info) { -#endif Cost startup_cost = 0; Cost run_cost = 0; QualCost qpqual_cost; Cost cpu_per_tuple; + double num_nodes = path_count_datanodes(path); /* Should only be applied to base relations that are CTEs */ Assert(baserel->relid > 0); @@ -1771,23 +1589,19 @@ cost_ctescan(Path *path, PlannerInfo *root, /* Mark the path with the correct row estimate */ if (param_info) - path->rows = param_info->ppi_rows; + path->rows = ROWS_PER_DN(param_info->ppi_rows); else - path->rows = baserel->rows; + path->rows = ROWS_PER_DN(baserel->rows); /* Charge one CPU tuple cost per row for tuplestore manipulation */ cpu_per_tuple = cpu_tuple_cost; /* Add scanning CPU costs */ -#ifdef __TBASE__ - get_restriction_qual_cost(root, baserel_orig, param_info_orig, &qpqual_cost); -#else get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost); -#endif startup_cost += qpqual_cost.startup; cpu_per_tuple += cpu_tuple_cost + qpqual_cost.per_tuple; - run_cost += cpu_per_tuple * 
baserel->tuples; + run_cost += cpu_per_tuple * TUPLES_PER_DN(baserel->tuples); /* tlist eval costs are paid per output row, not per tuple scanned */ startup_cost += path->pathtarget->cost.startup; @@ -1803,18 +1617,13 @@ cost_ctescan(Path *path, PlannerInfo *root, */ void cost_namedtuplestorescan(Path *path, PlannerInfo *root, -#ifdef __TBASE__ - RelOptInfo *baserel_orig, ParamPathInfo *param_info_orig) -{ - ADJUST_BASESCAN(path, baserel_orig, baserel, param_info_orig, param_info); -#else RelOptInfo *baserel, ParamPathInfo *param_info) { -#endif Cost startup_cost = 0; Cost run_cost = 0; QualCost qpqual_cost; Cost cpu_per_tuple; + double num_nodes = path_count_datanodes(path); /* Should only be applied to base relations that are Tuplestores */ Assert(baserel->relid > 0); @@ -1822,23 +1631,19 @@ cost_namedtuplestorescan(Path *path, PlannerInfo *root, /* Mark the path with the correct row estimate */ if (param_info) - path->rows = param_info->ppi_rows; + path->rows = ROWS_PER_DN(param_info->ppi_rows); else - path->rows = baserel->rows; + path->rows = ROWS_PER_DN(baserel->rows); /* Charge one CPU tuple cost per row for tuplestore manipulation */ cpu_per_tuple = cpu_tuple_cost; /* Add scanning CPU costs */ -#ifdef __TBASE__ - get_restriction_qual_cost(root, baserel_orig, param_info_orig, &qpqual_cost); -#else get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost); -#endif startup_cost += qpqual_cost.startup; cpu_per_tuple += cpu_tuple_cost + qpqual_cost.per_tuple; - run_cost += cpu_per_tuple * baserel->tuples; + run_cost += cpu_per_tuple * TUPLES_PER_DN(baserel->tuples); path->startup_cost = startup_cost; path->total_cost = startup_cost + run_cost; diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index 690adbfd..c0847553 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -7001,7 +7001,7 @@ reparameterize_path(PlannerInfo *root, Path *path, #ifdef __TBASE__ /* - * count datanode number for given path, consider replication table as 1 + * Count datanode number for given path, consider replication table as 1 * because we use this function to figure out how many parts that data * had been separated into, when we estimating costs of a plan. Therefore * to get more accurate estimating result as in a distributed system. 
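Note that the divisor used by ROWS_PER_DN/TUPLES_PER_DN/PAGES_PER_DN is the number of parts
the data has been split into, not the number of nodes that run the scan, which is why
path_count_datanodes() reports 1 for a replicated table. A small illustration, with
hypothetical table names and row counts:

/*
 * t_hash is HASH-distributed across two datanodes, 10000 rows in total:
 *   path_count_datanodes() -> 2, so the cost model sees ~5000 tuples per DN.
 * t_repl is REPLICATED on the same two datanodes, 10000 rows on each:
 *   path_count_datanodes() -> 1, so the cost model still sees all 10000 tuples;
 *   every datanode holds a full copy, and dividing by two would make a scan of
 *   the replicated table look cheaper than it really is.
 */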
diff --git a/src/test/regress/expected/create_view.out b/src/test/regress/expected/create_view.out index 57376793..b8836c0d 100644 --- a/src/test/regress/expected/create_view.out +++ b/src/test/regress/expected/create_view.out @@ -59,7 +59,7 @@ SELECT * FROM viewtest; EXPLAIN SELECT a FROM viewtest; QUERY PLAN ------------------------------------------------------------------------------------------------- - Subquery Scan on viewtest (cost=22.23..25.04 rows=225 width=4) + Subquery Scan on viewtest (cost=22.23..27.29 rows=225 width=4) -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=22.23..22.79 rows=225 width=8) -> Sort (cost=22.23..22.79 rows=225 width=8) Sort Key: viewtest_tbl.b DESC @@ -78,8 +78,8 @@ SELECT a FROM viewtest; EXPLAIN SELECT * FROM viewtest ORDER BY a; QUERY PLAN ------------------------------------------------------------------------------------------- - Remote Subquery Scan on all (datanode_1,datanode_2) (cost=33.83..34.39 rows=225 width=8) - -> Sort (cost=33.83..34.39 rows=225 width=8) + Remote Subquery Scan on all (datanode_1,datanode_2) (cost=36.08..36.64 rows=225 width=8) + -> Sort (cost=36.08..36.64 rows=225 width=8) Sort Key: viewtest_tbl.a -> Sort (cost=22.23..22.79 rows=225 width=8) Sort Key: viewtest_tbl.b DESC From 0f2f19866f7776adc4a758bc300733f9c65d34b3 Mon Sep 17 00:00:00 2001 From: andrelin Date: Thu, 18 Mar 2021 15:00:14 +0800 Subject: [PATCH 145/578] Fix two compile warnings --- src/backend/commands/explain_dist.c | 3 ++- src/backend/optimizer/path/costsize.c | 1 - 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/backend/commands/explain_dist.c b/src/backend/commands/explain_dist.c index 1f2d50e4..81fedadc 100644 --- a/src/backend/commands/explain_dist.c +++ b/src/backend/commands/explain_dist.c @@ -705,6 +705,7 @@ HandleRemoteInstr(char *msg_body, size_t len, int nodeid, ResponseCombiner *comb StringInfo recv_str; bool found; RemoteInstr *cur_instr; + MemoryContext oldcontext; if (combiner->recv_instr_htbl == NULL) { @@ -713,7 +714,7 @@ HandleRemoteInstr(char *msg_body, size_t len, int nodeid, ResponseCombiner *comb elog(DEBUG1, "Handle remote instrument: nodeid %d", nodeid); /* must doing this under per query context */ - MemoryContext oldcontext = MemoryContextSwitchTo(combiner->ss.ps.state->es_query_cxt); + oldcontext = MemoryContextSwitchTo(combiner->ss.ps.state->es_query_cxt); recv_str = makeStringInfo(); appendBinaryStringInfo(recv_str, msg_body, len); diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index 491d9d40..c0fa9bdf 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -554,7 +554,6 @@ cost_index(IndexPath *path, PlannerInfo *root, double loop_count, double pages_fetched; double rand_heap_pages; double index_pages; - double nodes = 1; double index_pages_per_dn; double baserel_pages_per_dn; double num_nodes = path_count_datanodes((Path *) path); From e60f8bde529bbfa2d045303ca55c7166f2efaed0 Mon Sep 17 00:00:00 2001 From: andrelin Date: Fri, 12 Mar 2021 14:14:52 +0800 Subject: [PATCH 146/578] Support hash varbit and bit for V2 we do not add meta data, just add a function for locator tapd: http://tapd.oa.com/TBase_Oracle_Migration/bugtrace/bugs/view/1020421696082800841 --- src/backend/pgxc/locator/locator.c | 4 +++ src/backend/utils/adt/varbit.c | 9 +++++++ src/include/utils/varbit.h | 40 ++++++++++++++++-------------- 3 files changed, 34 insertions(+), 19 deletions(-) diff --git a/src/backend/pgxc/locator/locator.c 
b/src/backend/pgxc/locator/locator.c index fdc500fa..431e5bc3 100644 --- a/src/backend/pgxc/locator/locator.c +++ b/src/backend/pgxc/locator/locator.c @@ -38,6 +38,7 @@ #include "utils/relcache.h" #include "utils/tqual.h" #include "utils/syscache.h" +#include "utils/varbit.h" #include "nodes/nodes.h" #include "optimizer/clauses.h" #include "parser/parse_coerce.h" @@ -1020,6 +1021,9 @@ hash_func_ptr(Oid dataType) return hash_numeric; case UUIDOID: return uuid_hash; + case BITOID: + case VARBITOID: + return bithash; default: return NULL; } diff --git a/src/backend/utils/adt/varbit.c b/src/backend/utils/adt/varbit.c index bde80471..933ad9ff 100644 --- a/src/backend/utils/adt/varbit.c +++ b/src/backend/utils/adt/varbit.c @@ -16,6 +16,7 @@ #include "postgres.h" +#include "access/hash.h" #include "access/htup_details.h" #include "libpq/pqformat.h" #include "nodes/nodeFuncs.h" @@ -1871,3 +1872,11 @@ bitgetbit(PG_FUNCTION_ARGS) else PG_RETURN_INT32(0); } + +Datum +bithash(PG_FUNCTION_ARGS) +{ + VarBit *arg1 = PG_GETARG_VARBIT_P(0); + + return hash_any(VARBITS(arg1), VARBITBYTES(arg1)); +} diff --git a/src/include/utils/varbit.h b/src/include/utils/varbit.h index f82f3aec..53c4f080 100644 --- a/src/include/utils/varbit.h +++ b/src/include/utils/varbit.h @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * * varbit.h - * Functions for the SQL datatypes BIT() and BIT VARYING(). + * Functions for the SQL datatypes BIT() and BIT VARYING(). * * Code originally contributed by Adriaan Joubert. * @@ -24,10 +24,10 @@ */ typedef struct { - int32 vl_len_; /* varlena header (do not touch directly!) */ - int32 bit_len; /* number of valid bits */ - bits8 bit_dat[FLEXIBLE_ARRAY_MEMBER]; /* bit string, most sig. byte - * first */ + int32 vl_len_; /* varlena header (do not touch directly!) */ + int32 bit_len; /* number of valid bits */ + bits8 bit_dat[FLEXIBLE_ARRAY_MEMBER]; /* bit string, most sig. byte + * first */ } VarBit; /* @@ -36,34 +36,36 @@ typedef struct * BIT and BIT VARYING are toastable varlena types. They are the same * as far as representation goes, so we just have one set of macros. 
*/ -#define DatumGetVarBitP(X) ((VarBit *) PG_DETOAST_DATUM(X)) -#define DatumGetVarBitPCopy(X) ((VarBit *) PG_DETOAST_DATUM_COPY(X)) -#define VarBitPGetDatum(X) PointerGetDatum(X) -#define PG_GETARG_VARBIT_P(n) DatumGetVarBitP(PG_GETARG_DATUM(n)) +#define DatumGetVarBitP(X) ((VarBit *) PG_DETOAST_DATUM(X)) +#define DatumGetVarBitPCopy(X) ((VarBit *) PG_DETOAST_DATUM_COPY(X)) +#define VarBitPGetDatum(X) PointerGetDatum(X) +#define PG_GETARG_VARBIT_P(n) DatumGetVarBitP(PG_GETARG_DATUM(n)) #define PG_GETARG_VARBIT_P_COPY(n) DatumGetVarBitPCopy(PG_GETARG_DATUM(n)) -#define PG_RETURN_VARBIT_P(x) return VarBitPGetDatum(x) +#define PG_RETURN_VARBIT_P(x) return VarBitPGetDatum(x) /* Header overhead *in addition to* VARHDRSZ */ -#define VARBITHDRSZ sizeof(int32) +#define VARBITHDRSZ sizeof(int32) /* Number of bits in this bit string */ -#define VARBITLEN(PTR) (((VarBit *) (PTR))->bit_len) +#define VARBITLEN(PTR) (((VarBit *) (PTR))->bit_len) /* Pointer to the first byte containing bit string data */ -#define VARBITS(PTR) (((VarBit *) (PTR))->bit_dat) +#define VARBITS(PTR) (((VarBit *) (PTR))->bit_dat) /* Number of bytes in the data section of a bit string */ -#define VARBITBYTES(PTR) (VARSIZE(PTR) - VARHDRSZ - VARBITHDRSZ) +#define VARBITBYTES(PTR) (VARSIZE(PTR) - VARHDRSZ - VARBITHDRSZ) /* Padding of the bit string at the end (in bits) */ -#define VARBITPAD(PTR) (VARBITBYTES(PTR)*BITS_PER_BYTE - VARBITLEN(PTR)) +#define VARBITPAD(PTR) (VARBITBYTES(PTR)*BITS_PER_BYTE - VARBITLEN(PTR)) /* Number of bytes needed to store a bit string of a given length */ -#define VARBITTOTALLEN(BITLEN) (((BITLEN) + BITS_PER_BYTE-1)/BITS_PER_BYTE + \ - VARHDRSZ + VARBITHDRSZ) +#define VARBITTOTALLEN(BITLEN) (((BITLEN) + BITS_PER_BYTE-1)/BITS_PER_BYTE + \ + VARHDRSZ + VARBITHDRSZ) /* * Maximum number of bits. Several code sites assume no overflow from * computing bitlen + X; VARBITTOTALLEN() has the largest such X. */ -#define VARBITMAXLEN (INT_MAX - BITS_PER_BYTE + 1) +#define VARBITMAXLEN (INT_MAX - BITS_PER_BYTE + 1) /* pointer beyond the end of the bit string (like end() in STL containers) */ -#define VARBITEND(PTR) (((bits8 *) (PTR)) + VARSIZE(PTR)) +#define VARBITEND(PTR) (((bits8 *) (PTR)) + VARSIZE(PTR)) /* Mask that will cover exactly one byte, i.e. 
BITS_PER_BYTE bits */ #define BITMASK 0xFF +extern Datum bithash(PG_FUNCTION_ARGS); + #endif From 7ad70a744ff6974bcc69450d71bd2f3ada5ba9af Mon Sep 17 00:00:00 2001 From: bethding Date: Tue, 9 Mar 2021 15:20:53 +0800 Subject: [PATCH 147/578] fqs insert when distribute key's func returns a single result --- src/backend/commands/copy.c | 3 + src/backend/commands/explain.c | 2 +- src/backend/commands/prepare.c | 14 +- src/backend/executor/execMain.c | 95 +++++++ src/backend/executor/execUtils.c | 19 +- src/backend/executor/functions.c | 4 + src/backend/executor/spi.c | 5 + src/backend/nodes/copyfuncs.c | 2 + src/backend/optimizer/util/clauses.c | 30 +++ src/backend/optimizer/util/pgxcship.c | 38 ++- src/backend/pgxc/locator/locator.c | 17 ++ src/backend/pgxc/plan/planner.c | 8 + src/backend/pgxc/pool/execRemote.c | 17 +- src/backend/pgxc/pool/pgxcnode.c | 32 ++- src/backend/tcop/postgres.c | 17 +- src/backend/tcop/pquery.c | 9 +- src/backend/utils/adt/ruleutils.c | 30 ++- src/include/commands/prepare.h | 3 +- src/include/executor/executor.h | 1 + src/include/optimizer/clauses.h | 2 + src/include/pgxc/locator.h | 141 ++++++----- src/include/pgxc/planner.h | 1 + src/test/regress/expected/fast_default.out | 20 +- .../regress/expected/insert_conflict_1.out | 62 ++--- src/test/regress/expected/prepare.out | 235 ++++++++++++++++++ src/test/regress/expected/rules.out | 5 +- src/test/regress/output/constraints_3.source | 2 +- src/test/regress/sql/prepare.sql | 129 ++++++++++ 28 files changed, 798 insertions(+), 145 deletions(-) diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c index 0753302f..9e5aec9f 100644 --- a/src/backend/commands/copy.c +++ b/src/backend/commands/copy.c @@ -1873,6 +1873,9 @@ BeginCopy(ParseState *pstate, * * ExecutorStart computes a result tupdesc for us */ + if (query->returningList != NIL) + ExecutorStart(cstate->queryDesc, EXEC_FLAG_RETURNING); + else ExecutorStart(cstate->queryDesc, 0); tupDesc = cstate->queryDesc->tupDesc; diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index 7b56c3d3..2722d951 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -3988,7 +3988,7 @@ ExplainRemoteQuery(RemoteQuery *plan, PlanState *planstate, List *ancestors, Exp estate = planstate->state; oldcontext = MemoryContextSwitchTo(estate->es_query_cxt); - node = ExecInitRemoteQuery(step, estate, 0); + node = ExecInitRemoteQuery(step, estate, EXEC_FLAG_EXPLAIN_ONLY); MemoryContextSwitchTo(oldcontext); result = ExecRemoteQuery((PlanState *) node); while (result != NULL && !TupIsNull(result)) diff --git a/src/backend/commands/prepare.c b/src/backend/commands/prepare.c index d4729433..7cf29f5f 100644 --- a/src/backend/commands/prepare.c +++ b/src/backend/commands/prepare.c @@ -201,7 +201,8 @@ PrepareQuery(PrepareStmt *stmt, const char *queryString, StorePreparedStatement(stmt->name, plansource, true, - false); + false, + 'N'); } /* @@ -584,7 +585,8 @@ void StorePreparedStatement(const char *stmt_name, CachedPlanSource *plansource, bool from_sql, - bool use_resowner) + bool use_resowner, + const char need_rewrite) { PreparedStatement *entry; TimestampTz cur_ts = GetCurrentStatementStartTimestamp(); @@ -603,7 +605,13 @@ StorePreparedStatement(const char *stmt_name, /* Shouldn't get a duplicate entry */ if (found) { - if (!(plansource->commandTag == entry->plansource->commandTag && + if (need_rewrite == 'Y' && + plansource->commandTag == entry->plansource->commandTag && + strcmp(plansource->query_string, 
entry->plansource->query_string) != 0) + { + entry->plansource->query_string = plansource->query_string; + } + else if (!(plansource->commandTag == entry->plansource->commandTag && strcmp(plansource->query_string, entry->plansource->query_string) == 0)) { ereport(ERROR, diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c index 472bec42..d516a94c 100644 --- a/src/backend/executor/execMain.c +++ b/src/backend/executor/execMain.c @@ -62,6 +62,7 @@ #include "storage/lmgr.h" #include "tcop/utility.h" #include "utils/acl.h" +#include "utils/builtins.h" #ifdef _MLS_ #include "utils/mls.h" #endif @@ -1899,6 +1900,95 @@ ExecEndPlan(PlanState *planstate, EState *estate) } } +/* + * RewriteForSql + * We must caculate the result of distribute key's function to know + * which datanode will execute the sql command. After we get the result, + * we should use the result to replace distribute key's function to + * generate a new sql that will be shipped to datanode. + * Note: for replication table, we should caculate all the results of + * functions before ship the sql. Otherwise the value may not be same + * in different datanodes. + */ +static void +RewriteForSql(RemoteQuery *plan, Query *query, + char *distribcol, bool isreplic) +{ + ListCell *lc_deparse = NULL; + TargetEntry *entry_deparse = NULL; + bool find_target = false; + StringInfoData buf; + + foreach(lc_deparse, query->targetList) + { + entry_deparse = lfirst(lc_deparse); + if (isreplic) + { + entry_deparse->expr = (Expr *)replace_distribkey_func( + (Node *)entry_deparse->expr); + find_target = true; + } + else if (strcmp(entry_deparse->resname, distribcol) == 0) + { + entry_deparse->expr = (Expr *)replace_distribkey_func( + (Node *)entry_deparse->expr); + plan->exec_nodes->en_expr = entry_deparse->expr; + find_target = true; + break; + } + } + + if (find_target) + { + initStringInfo(&buf); + /* + * We always finalise aggregates on datanodes for FQS. + * Use the expressions for ORDER BY or GROUP BY clauses. + */ + deparse_query(query, &buf, NIL, true, false); + plan->sql_statement = pstrdup(buf.data); + pfree(buf.data); + } +} + +/* + * RewriteFuncNode + * We ship the insert sql whose distribute key's value contains function. + * So we must rewrite the func node by caculating result of the function. + */ +static void +RewriteFuncNode(PlanState *planstate) +{ + RemoteQuery *plan = (RemoteQuery *)planstate->plan; + ExecNodes *exec_nodes = plan->exec_nodes; + Query *query = copyObject(plan->forDeparse); + RelationLocInfo *rel_loc_info = NULL; + char *distribcol = NULL; + + if ((!exec_nodes) || (!exec_nodes->need_rewrite)) + return; + + /* + * For replicated table, we need to execute func + * and then ship to datanode + */ + if (IsExecNodesReplicated(exec_nodes)) + { + RewriteForSql(plan, query, NULL, true); + return; + } + + if (exec_nodes->en_relid == InvalidOid || (!exec_nodes->en_expr)) + return; + + rel_loc_info = GetRelationLocInfo(exec_nodes->en_relid); + if (!rel_loc_info) + return; + + distribcol = GetRelationDistribColumn(rel_loc_info); + RewriteForSql(plan, query, distribcol, false); +} + /* ---------------------------------------------------------------- * ExecutePlan * @@ -1947,6 +2037,11 @@ ExecutePlan(EState *estate, if (use_parallel_mode) EnterParallelMode(); + if (operation == CMD_INSERT && planstate->plan->type == T_RemoteQuery) + { + RewriteFuncNode(planstate); + } + /* * Loop until we've processed the proper number of tuples from the plan. 
*/ diff --git a/src/backend/executor/execUtils.c b/src/backend/executor/execUtils.c index c9c06f89..79c629be 100644 --- a/src/backend/executor/execUtils.c +++ b/src/backend/executor/execUtils.c @@ -119,6 +119,7 @@ #include "utils/ruleutils.h" #endif +#include "pgxc/execRemote.h" static void ShutdownExprContext(ExprContext *econtext, bool isCommit); @@ -509,6 +510,7 @@ ExecAssignResultTypeFromTL(PlanState *planstate) { bool hasoid; TupleDesc tupDesc; + List *targetList = NIL; if (ExecContextForcesOids(planstate, &hasoid)) { @@ -521,11 +523,26 @@ ExecAssignResultTypeFromTL(PlanState *planstate) } /* + * If the command with returning syntax, the tupDesc's info should + * be maked up of returningList + */ + if (IsA(planstate, RemoteQueryState) && + (((((RemoteQueryState *)planstate)->eflags) & EXEC_FLAG_RETURNING) != 0)) + { + if (planstate->state && planstate->state->es_plannedstmt && + planstate->state->es_plannedstmt->parseTree && + planstate->state->es_plannedstmt->parseTree->returningList) + targetList = planstate->state->es_plannedstmt->parseTree->returningList; + } + if (targetList == NIL) + targetList = planstate->plan->targetlist; + + /* * ExecTypeFromTL needs the parse-time representation of the tlist, not a * list of ExprStates. This is good because some plan nodes don't bother * to set up planstate->targetlist ... */ - tupDesc = ExecTypeFromTL(planstate->plan->targetlist, hasoid); + tupDesc = ExecTypeFromTL(targetList, hasoid); ExecAssignResultType(planstate, tupDesc); } diff --git a/src/backend/executor/functions.c b/src/backend/executor/functions.c index 08a35bd5..f35db8bf 100644 --- a/src/backend/executor/functions.c +++ b/src/backend/executor/functions.c @@ -853,6 +853,10 @@ postquel_start(execution_state *es, SQLFunctionCachePtr fcache) eflags = EXEC_FLAG_SKIP_TRIGGERS; else eflags = 0; /* default run-to-completion flags */ + + if (es->qd->plannedstmt->hasReturning) + eflags |= EXEC_FLAG_RETURNING; + ExecutorStart(es->qd, eflags); } diff --git a/src/backend/executor/spi.c b/src/backend/executor/spi.c index b64f03f6..808c75f8 100644 --- a/src/backend/executor/spi.c +++ b/src/backend/executor/spi.c @@ -2403,6 +2403,11 @@ _SPI_pquery(QueryDesc *queryDesc, bool fire_triggers, uint64 tcount) else eflags = EXEC_FLAG_SKIP_TRIGGERS; + if (queryDesc->plannedstmt->hasReturning) + { + eflags |= EXEC_FLAG_RETURNING; + } + ExecutorStart(queryDesc, eflags); ExecutorRun(queryDesc, ForwardScanDirection, tcount, true); diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c index 49ad080a..780ad8aa 100644 --- a/src/backend/nodes/copyfuncs.c +++ b/src/backend/nodes/copyfuncs.c @@ -1315,6 +1315,7 @@ _copyRemoteQuery(const RemoteQuery *from) COPY_NODE_FIELD(query_var_tlist); COPY_SCALAR_FIELD(is_temp); #ifdef __TBASE__ + COPY_NODE_FIELD(forDeparse); COPY_STRING_FIELD(sql_select); COPY_STRING_FIELD(sql_select_base); COPY_SCALAR_FIELD(forUpadte); @@ -1361,6 +1362,7 @@ _copyExecNodes(const ExecNodes *from) #endif COPY_SCALAR_FIELD(en_relid); COPY_SCALAR_FIELD(accesstype); + COPY_SCALAR_FIELD(need_rewrite); return newnode; } diff --git a/src/backend/optimizer/util/clauses.c b/src/backend/optimizer/util/clauses.c index d840206d..82a9f2ba 100644 --- a/src/backend/optimizer/util/clauses.c +++ b/src/backend/optimizer/util/clauses.c @@ -5257,6 +5257,36 @@ bool find_sublink_walker(Node *node, List **list) return expression_tree_walker(node, find_sublink_walker, list); } +/* + * replace_distribkey_func: + * evaluate the result of a function that returns only + * one value and replace as 
certain value. + */ +Node* +replace_distribkey_func(Node *node) +{ + if (node == NULL) + return NULL; + + if (node->type == T_FuncExpr) + { + FuncExpr *func = (FuncExpr *) node; + + if (!func->funcretset) + { + Node *evalNode = (Node *) evaluate_expr((Expr *) func, + func->funcresulttype, + exprTypmod(node), + func->funccollid); + return evalNode; + } + } + + return expression_tree_mutator(node, + replace_distribkey_func, + NULL); +} + /* * replace_eval_sql_value_function: * eval SQLValueFunction and replace as Const value. diff --git a/src/backend/optimizer/util/pgxcship.c b/src/backend/optimizer/util/pgxcship.c index bfbc9e99..d294de0b 100644 --- a/src/backend/optimizer/util/pgxcship.c +++ b/src/backend/optimizer/util/pgxcship.c @@ -103,7 +103,10 @@ typedef enum SS_HAS_AGG_EXPR, /* it has aggregate expressions */ SS_UNSHIPPABLE_TYPE, /* the type of expression is unshippable */ SS_UNSHIPPABLE_TRIGGER, /* the type of trigger is unshippable */ - SS_UPDATES_DISTRIBUTION_COLUMN /* query updates the distribution column */ + SS_UPDATES_DISTRIBUTION_COLUMN, /* query updates the distribution column */ + SS_NEED_FUNC_REWRITE /* exist func expression of distribution column, + * we should rewrite the expr for FQS + */ } ShippabilityStat; extern void PoolPingNodes(void); @@ -1249,13 +1252,18 @@ pgxc_shippability_walker(Node *node, Shippability_context *sc_context) * can be shipped to the Datanode and what can not be. */ if (!pgxc_is_func_shippable(funcexpr->funcid)) + { + /* Ship insert if function doesn't return set */ + if (sc_context->sc_query && + sc_context->sc_query->commandType == CMD_INSERT && + !(funcexpr->funcretset && sc_context->sc_for_expr)) + { + pgxc_set_shippability_reason(sc_context, SS_NEED_FUNC_REWRITE); + } + else pgxc_set_shippability_reason(sc_context, SS_UNSHIPPABLE_EXPR); + } - /* - * If this is a stand alone expression and the function returns a - * set of rows, we need to handle it along with the final result of - * other expressions. So, it can not be shippable. - */ if (funcexpr->funcretset && sc_context->sc_for_expr) pgxc_set_shippability_reason(sc_context, SS_UNSHIPPABLE_EXPR); @@ -1318,10 +1326,6 @@ pgxc_shippability_walker(Node *node, Shippability_context *sc_context) Query *query = (Query *)node; ExecNodes *exec_nodes = NULL; - /* PGXCTODO : If the query has a returning list, it is not shippable as of now */ - if (query->returningList) - pgxc_set_shippability_reason(sc_context, SS_UNSUPPORTED_EXPR); - /* A stand-alone expression containing Query is not shippable */ if (sc_context->sc_for_expr) { @@ -2018,6 +2022,20 @@ pgxc_is_query_shippable(Query *query, int query_level) */ shippability = bms_del_member(shippability, SS_HAS_AGG_EXPR); + /* + * If an insert sql command whose distribute key's value is a + * function, we allow it to be shipped to datanode. But we must + * must know the function's result before real execute. So set + * the flag to identify rewrite in ExecutePlan. 
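+ *
+ * A hypothetical example of the case this covers:
+ *     CREATE TABLE t1 (id int, v text) DISTRIBUTE BY HASH (id);
+ *     INSERT INTO t1 VALUES (nextval('t1_seq'), 'x');
+ * nextval() is volatile and therefore not shippable by itself, but it returns
+ * a single value, so the INSERT is still marked FQS-shippable here; ExecutePlan
+ * later evaluates the function, substitutes the constant into the deparsed
+ * statement, and only then picks the target datanode.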
+ */ + if (bms_is_member(SS_NEED_FUNC_REWRITE, shippability)) + { + exec_nodes->need_rewrite = true; + shippability = bms_del_member(shippability, SS_NEED_FUNC_REWRITE); + } + else + exec_nodes->need_rewrite = false; + /* Can not ship the query for some reason */ if (!bms_is_empty(shippability)) canShip = false; diff --git a/src/backend/pgxc/locator/locator.c b/src/backend/pgxc/locator/locator.c index 431e5bc3..a6933ce2 100644 --- a/src/backend/pgxc/locator/locator.c +++ b/src/backend/pgxc/locator/locator.c @@ -2281,6 +2281,23 @@ GetRelationNodes(RelationLocInfo *rel_loc_info, Datum valueForDistCol, return exec_nodes; } +/* + * GetRelationNodesForExplain + * This is just for explain statement, just pick one datanode. + * The returned List is a copy, so it should be freed when finished. + */ +ExecNodes * +GetRelationNodesForExplain(RelationLocInfo *rel_loc_info, + RelationAccessType accessType) +{ + ExecNodes *exec_nodes; + exec_nodes = makeNode(ExecNodes); + exec_nodes->baselocatortype = rel_loc_info->locatorType; + exec_nodes->accesstype = accessType; + exec_nodes->nodeList = lappend_int(exec_nodes->nodeList, 1); + return exec_nodes; +} + /* * GetRelationNodesByQuals * A wrapper around GetRelationNodes to reduce the node list by looking at the diff --git a/src/backend/pgxc/plan/planner.c b/src/backend/pgxc/plan/planner.c index 9bc141ad..2692e307 100644 --- a/src/backend/pgxc/plan/planner.c +++ b/src/backend/pgxc/plan/planner.c @@ -348,6 +348,7 @@ pgxc_FQS_planner(Query *query, int cursorOptions, ParamListInfo boundParams) result->relationOids = glob->relationOids; result->invalItems = glob->invalItems; result->rowMarks = glob->finalrowmarks; + result->hasReturning = (query->returningList != NULL); return result; } @@ -390,6 +391,13 @@ pgxc_FQS_create_remote_plan(Query *query, ExecNodes *exec_nodes, bool is_exec_di pfree(buf.data); } + if (query_step->exec_nodes && + query_step->exec_nodes->need_rewrite && + query->commandType == CMD_INSERT) + { + query_step->forDeparse = copyObject(query); + } + /* Optimize multi-node handling */ query_step->read_only = (query->commandType == CMD_SELECT && !query->hasForUpdate); query_step->has_row_marks = query->hasForUpdate; diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 40f3f655..d95a772a 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -6287,13 +6287,16 @@ get_exec_connections(RemoteQueryState *planstate, /* execution time determining of target Datanodes */ bool isnull; ExecNodes *nodes; + Datum partvalue; #ifdef __COLD_HOT__ bool secisnull; Datum secValue; #endif ExprState *estate = ExecInitExpr(exec_nodes->en_expr, (PlanState *) planstate); - Datum partvalue = ExecEvalExpr(estate, + /* For explain, no need to execute expr. */ + if (planstate->eflags != EXEC_FLAG_EXPLAIN_ONLY) + partvalue = ExecEvalExpr(estate, planstate->combiner.ss.ps.ps_ExprContext, &isnull); RelationLocInfo *rel_loc_info = GetRelationLocInfo(exec_nodes->en_relid); @@ -6303,6 +6306,8 @@ get_exec_connections(RemoteQueryState *planstate, { estate = ExecInitExpr(exec_nodes->sec_en_expr, (PlanState *) planstate); + /* For explain, no need to execute expr. 
*/ + if (planstate->eflags != EXEC_FLAG_EXPLAIN_ONLY) secValue = ExecEvalExpr(estate, planstate->combiner.ss.ps.ps_ExprContext, &secisnull); @@ -6314,6 +6319,10 @@ get_exec_connections(RemoteQueryState *planstate, } #endif + if (planstate->eflags == EXEC_FLAG_EXPLAIN_ONLY) + nodes = GetRelationNodesForExplain(rel_loc_info, + exec_nodes->accesstype); + else /* PGXCTODO what is the type of partvalue here */ nodes = GetRelationNodes(rel_loc_info, partvalue, @@ -6567,10 +6576,13 @@ pgxc_start_command_on_connection(PGXCNodeHandle *connection, int fetch = 0; bool prepared = false; char nodetype = PGXC_NODE_DATANODE; + ExecNodes *exec_nodes = step->exec_nodes; /* if prepared statement is referenced see if it is already * exist */ - if (step->statement) + if (exec_nodes && exec_nodes->need_rewrite == true) + prepared = false; + else if (step->statement) prepared = ActivateDatanodeStatementOnNode(step->statement, PGXCNodeGetNodeId(connection->nodeoid, @@ -8799,6 +8811,7 @@ ExecInitRemoteQuery(RemoteQuery *node, EState *estate, int eflags) ResponseCombiner *combiner; remotestate = makeNode(RemoteQueryState); + remotestate->eflags = eflags; combiner = (ResponseCombiner *) remotestate; InitResponseCombiner(combiner, 0, node->combine_type); combiner->ss.ps.plan = (Plan *) node; diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c index 3baad358..81027676 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -1991,6 +1991,10 @@ pgxc_node_send_parse(PGXCNodeHandle * handle, const char* statement, size_t old_outEnd = handle->outEnd; #endif + ResponseCombiner *combiner = handle->combiner; + bool need_rewrite = false; + int rewriteLen = 1; + /* if there are parameters, param_types should exist */ Assert(num_params <= 0 || param_types); /* 2 bytes for number of parameters, preceding the type names */ @@ -2010,8 +2014,8 @@ pgxc_node_send_parse(PGXCNodeHandle * handle, const char* statement, paramTypeLen += strlen(paramTypes[cnt_params]) + 1; } - /* size + stmtLen + strlen + paramTypeLen */ - msgLen = 4 + stmtLen + strLen + paramTypeLen; + /* size + rewriteLen + stmtLen + strlen + paramTypeLen */ + msgLen = 4 + rewriteLen + stmtLen + strLen + paramTypeLen; /* msgType + msgLen */ if (ensure_out_buffer_capacity(handle->outEnd + 1 + msgLen, handle) != 0) @@ -2025,6 +2029,7 @@ pgxc_node_send_parse(PGXCNodeHandle * handle, const char* statement, msgLen = htonl(msgLen); memcpy(handle->outBuffer + handle->outEnd, &msgLen, 4); handle->outEnd += 4; + /* statement name */ if (statement) { @@ -2053,6 +2058,29 @@ pgxc_node_send_parse(PGXCNodeHandle * handle, const char* statement, pfree(paramTypes[cnt_params]); } pfree(paramTypes); + + /* + * If the extended query contains an insert sql command whose + * distribute key's value is a function, we caculte the function + * and rewrite the insert sql with the const result. So after send + * the sql to datanode, it will be cached, However, the sql command + * changes as the result of the function, so datanode should use + * the new sql instead of cached sql. The we send a 'need_rewrite' + * flag to tell the datanode to use new sql. 
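+ *
+ * Illustration (statement text and literals are hypothetical): the first
+ * execution of "INSERT INTO t1 VALUES (nextval('t1_seq'), 'x')" might reach
+ * the datanode rewritten as "INSERT INTO t1 VALUES (101, 'x')"; the next
+ * execution produces a different literal, so the Parse message carries a
+ * trailing 'Y' byte and exec_parse_message on the datanode replaces the
+ * cached statement text instead of reusing it.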
+ */ + if (IsA((combiner->ss.ps.plan), RemoteQuery)) + { + RemoteQuery *plan = (RemoteQuery *)(combiner->ss.ps.plan); + ExecNodes *exec_nodes = plan->exec_nodes; + if (exec_nodes && exec_nodes->need_rewrite) + { + handle->outBuffer[handle->outEnd++] = 'Y'; + need_rewrite = true; + } + } + if (!need_rewrite) + handle->outBuffer[handle->outEnd++] = 'N'; + Assert(old_outEnd + ntohl(msgLen) + 1 == handle->outEnd); return 0; diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index c8bdc980..db91d32d 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -1648,8 +1648,9 @@ exec_parse_message(const char *query_string, /* string to execute */ const char *stmt_name, /* name for prepared stmt */ Oid *paramTypes, /* parameter types */ char **paramTypeNames, /* parameter type names */ - int numParams) /* number of parameters */ -{// #lizard forgives + int numParams, /* number of parameters */ + const char need_rewrite) /* plancache need to be rewritted */ +{ MemoryContext unnamed_stmt_context = NULL; MemoryContext oldcontext; List *parsetree_list; @@ -1929,11 +1930,11 @@ exec_parse_message(const char *query_string, /* string to execute */ #ifdef __TBASE__ if (use_resowner) { - StorePreparedStatement(stmt_name, psrc, false, true); + StorePreparedStatement(stmt_name, psrc, false, true, need_rewrite); } else #endif - StorePreparedStatement(stmt_name, psrc, false, false); + StorePreparedStatement(stmt_name, psrc, false, false, need_rewrite); } else { @@ -2093,7 +2094,7 @@ exec_plan_message(const char *query_string, /* source of the query */ /* * Store the query as a prepared statement. See above comments. */ - StorePreparedStatement(stmt_name, psrc, false, true); + StorePreparedStatement(stmt_name, psrc, false, true, 'N'); SetRemoteSubplan(psrc, plan_string); /* set instrument_options, default 0 */ @@ -5460,6 +5461,7 @@ PostgresMain(int argc, char *argv[], int numParams; Oid *paramTypes = NULL; char **paramTypeNames = NULL; + char need_rewrite = 'N'; forbidden_in_wal_sender(firstchar); @@ -5479,6 +5481,8 @@ PostgresMain(int argc, char *argv[], paramTypeNames = (char **)palloc(numParams * sizeof(char *)); for (i = 0; i < numParams; i++) paramTypeNames[i] = (char *)pq_getmsgstring(&input_message); + + need_rewrite = pq_getmsgbyte(&input_message); } else #endif /* PGXC */ @@ -5490,7 +5494,8 @@ PostgresMain(int argc, char *argv[], pq_getmsgend(&input_message); exec_parse_message(query_string, stmt_name, - paramTypes, paramTypeNames, numParams); + paramTypes, paramTypeNames, + numParams, need_rewrite); } break; diff --git a/src/backend/tcop/pquery.c b/src/backend/tcop/pquery.c index 5d358337..b73224fe 100644 --- a/src/backend/tcop/pquery.c +++ b/src/backend/tcop/pquery.c @@ -1092,11 +1092,16 @@ PortalStart(Portal portal, ParamListInfo params, */ { PlannedStmt *pstmt; + List *list = NIL; pstmt = PortalGetPrimaryStmt(portal); + if (portal->strategy == PORTAL_ONE_RETURNING && + pstmt->parseTree && pstmt->parseTree->returningList) + list = pstmt->parseTree->returningList; + else + list = pstmt->planTree->targetlist; portal->tupDesc = - ExecCleanTypeFromTL(pstmt->planTree->targetlist, - false); + ExecCleanTypeFromTL(list, false); } /* diff --git a/src/backend/utils/adt/ruleutils.c b/src/backend/utils/adt/ruleutils.c index fe4df6e7..2ce117ab 100644 --- a/src/backend/utils/adt/ruleutils.c +++ b/src/backend/utils/adt/ruleutils.c @@ -6637,6 +6637,8 @@ get_update_query_targetlist_def(Query *query, List *targetList, { TargetEntry *tle = (TargetEntry *) lfirst(l); Node *expr; + 
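(Editorial aside, not part of the patch.) The ruleutils.c hunk that begins here re-expands repeated array-subscript assignments to the same target column when deparsing an UPDATE. A rough illustration of the statement shape involved, with a hypothetical table name:

-- Both assignments target the same column, so the parser folds them into a
-- single targetlist entry whose expression is a chain of nested ArrayRef
-- nodes; the deparse loop in this hunk walks that chain and prints one
-- "arr[i] = value" item per level instead of losing the extra assignments.
UPDATE t SET arr[1] = 10, arr[2] = 20 WHERE id = 1;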
Node *aexpr = (Node*)tle->expr; + const char *attr_str; if (tle->resjunk) continue; /* ignore junk entries */ @@ -6707,16 +6709,17 @@ get_update_query_targetlist_def(Query *query, List *targetList, * Put out name of target column; look in the catalogs, not at * tle->resname, since resname will fail to track RENAME. */ - appendStringInfoString(buf, - quote_identifier(get_relid_attribute_name(rte->relid, - tle->resno))); + attr_str = quote_identifier( + get_relid_attribute_name(rte->relid, tle->resno)); + appendStringInfoString(buf, attr_str); + for (;;) + { /* * Print any indirection needed (subfields or subscripts), and strip * off the top-level nodes representing the indirection assignments. */ - expr = processIndirection((Node *) tle->expr, context); - + expr = processIndirection(aexpr, context); /* * If we're in a multiassignment, skip printing anything more, unless * this is the last column; in which case, what we print should be the @@ -6725,7 +6728,7 @@ get_update_query_targetlist_def(Query *query, List *targetList, if (cur_ma_sublink != NULL) { if (--remaining_ma_columns > 0) - continue; /* not the last column of multiassignment */ + break; /* not the last column of multiassignment */ appendStringInfoChar(buf, ')'); expr = (Node *) cur_ma_sublink; cur_ma_sublink = NULL; @@ -6734,6 +6737,21 @@ get_update_query_targetlist_def(Query *query, List *targetList, appendStringInfoString(buf, " = "); get_rule_expr(expr, context, false); + + /* + * expand multiple entries for the same target attribute if need. + * if this is the last one, we don't append sep and column msg. + */ + if (IsA(aexpr, ArrayRef) && + IsA(((ArrayRef*)aexpr)->refexpr, ArrayRef)) + { + appendStringInfoString(buf, sep); + appendStringInfoString(buf, attr_str); + aexpr = (Node*)((ArrayRef*)aexpr)->refexpr; + } + else + break; + } } } diff --git a/src/include/commands/prepare.h b/src/include/commands/prepare.h index 57a72d94..53fbdede 100644 --- a/src/include/commands/prepare.h +++ b/src/include/commands/prepare.h @@ -121,7 +121,8 @@ extern void ExplainExecuteQuery(ExecuteStmt *execstmt, IntoClause *into, extern void StorePreparedStatement(const char *stmt_name, CachedPlanSource *plansource, bool from_sql, - bool use_resowner); + bool use_resowner, + const char need_rewrite); extern PreparedStatement *FetchPreparedStatement(const char *stmt_name, bool throwError); extern void DropPreparedStatement(const char *stmt_name, bool showError); diff --git a/src/include/executor/executor.h b/src/include/executor/executor.h index 5262c42e..7fb94908 100644 --- a/src/include/executor/executor.h +++ b/src/include/executor/executor.h @@ -64,6 +64,7 @@ #define EXEC_FLAG_WITH_OIDS 0x0020 /* force OIDs in returned tuples */ #define EXEC_FLAG_WITHOUT_OIDS 0x0040 /* force no OIDs in returned tuples */ #define EXEC_FLAG_WITH_NO_DATA 0x0080 /* rel scannability doesn't matter */ +#define EXEC_FLAG_RETURNING 0x0800 /* returning tuples */ #ifdef XCP /* distributed executor may never execute the plan on this node */ #define EXEC_FLAG_SUBPLAN 0x0100 diff --git a/src/include/optimizer/clauses.h b/src/include/optimizer/clauses.h index fddeb132..e55c6033 100644 --- a/src/include/optimizer/clauses.h +++ b/src/include/optimizer/clauses.h @@ -91,6 +91,8 @@ extern Node *substitute_sublink_with_node(Node *expr, SubLink *sublink, Node *node); extern bool find_sublink_walker(Node *node, List **list); +extern Node *replace_distribkey_func(Node *node); + extern Node *replace_eval_sql_value_function(Node *node); #endif /* CLAUSES_H */ diff --git 
a/src/include/pgxc/locator.h b/src/include/pgxc/locator.h index c6218522..bef6e6a0 100644 --- a/src/include/pgxc/locator.h +++ b/src/include/pgxc/locator.h @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * * locator.h - * Externally declared locator functions + * Externally declared locator functions * * * Portions Copyright (c) 2010-2012 Postgres-XC Development Group @@ -22,9 +22,9 @@ #define LOCATOR_TYPE_CUSTOM 'C' #define LOCATOR_TYPE_MODULO 'M' #define LOCATOR_TYPE_NONE 'O' -#define LOCATOR_TYPE_DISTRIBUTED 'D' /* for distributed table without specific - * scheme, e.g. result of JOIN of - * replicated and distributed table */ +#define LOCATOR_TYPE_DISTRIBUTED 'D' /* for distributed table without specific + * scheme, e.g. result of JOIN of + * replicated and distributed table */ #ifdef _MIGRATE_ #define LOCATOR_TYPE_SHARD 'S' @@ -40,14 +40,14 @@ #define IsLocatorNone(x) (x == LOCATOR_TYPE_NONE) #define IsLocatorReplicated(x) (x == LOCATOR_TYPE_REPLICATED) #define IsLocatorColumnDistributed(x) (x == LOCATOR_TYPE_HASH || \ - x == LOCATOR_TYPE_RROBIN || \ - x == LOCATOR_TYPE_MODULO || \ - x == LOCATOR_TYPE_DISTRIBUTED || \ - x == LOCATOR_TYPE_SHARD) + x == LOCATOR_TYPE_RROBIN || \ + x == LOCATOR_TYPE_MODULO || \ + x == LOCATOR_TYPE_DISTRIBUTED || \ + x == LOCATOR_TYPE_SHARD) #define IsLocatorDistributedByValue(x) (x == LOCATOR_TYPE_HASH || \ - x == LOCATOR_TYPE_MODULO || \ - x == LOCATOR_TYPE_RANGE || \ - x == LOCATOR_TYPE_SHARD) + x == LOCATOR_TYPE_MODULO || \ + x == LOCATOR_TYPE_RANGE || \ + x == LOCATOR_TYPE_SHARD) #include "nodes/primnodes.h" #include "utils/relcache.h" @@ -59,35 +59,35 @@ typedef int PartAttrNumber; */ typedef enum { - RELATION_ACCESS_READ, /* SELECT */ - RELATION_ACCESS_READ_FQS, /* SELECT for FQS */ - RELATION_ACCESS_READ_FOR_UPDATE, /* SELECT FOR UPDATE */ - RELATION_ACCESS_UPDATE, /* UPDATE OR DELETE */ - RELATION_ACCESS_INSERT /* INSERT */ + RELATION_ACCESS_READ, /* SELECT */ + RELATION_ACCESS_READ_FQS, /* SELECT for FQS */ + RELATION_ACCESS_READ_FOR_UPDATE, /* SELECT FOR UPDATE */ + RELATION_ACCESS_UPDATE, /* UPDATE OR DELETE */ + RELATION_ACCESS_INSERT /* INSERT */ } RelationAccessType; typedef struct { - Oid relid; - char locatorType; - PartAttrNumber partAttrNum; /* if partitioned */ - char *partAttrName; /* if partitioned */ + Oid relid; + char locatorType; + PartAttrNumber partAttrNum; /* if partitioned */ + char *partAttrName; /* if partitioned */ #ifdef _MIGRATE_ - Oid groupId; /* distribute group */ + Oid groupId; /* distribute group */ #endif #ifdef __COLD_HOT__ - /* used for table in cold-hot group */ - Oid coldGroupId; /* cold group oid if exist */ - AttrNumber secAttrNum; /* second distributed column's attribute number */ - char *secAttrName; /* second distributed column's name */ + /* used for table in cold-hot group */ + Oid coldGroupId; /* cold group oid if exist */ + AttrNumber secAttrNum; /* second distributed column's attribute number */ + char *secAttrName; /* second distributed column's name */ #endif - List *rl_nodeList; /* Node Indices */ - ListCell *roundRobinNode; /* index of the next one to use */ + List *rl_nodeList; /* Node Indices */ + ListCell *roundRobinNode; /* index of the next one to use */ } RelationLocInfo; -#define IsRelationReplicated(rel_loc) IsLocatorReplicated((rel_loc)->locatorType) -#define IsRelationColumnDistributed(rel_loc) IsLocatorColumnDistributed((rel_loc)->locatorType) -#define IsRelationDistributedByValue(rel_loc) IsLocatorDistributedByValue((rel_loc)->locatorType) 
+#define IsRelationReplicated(rel_loc) IsLocatorReplicated((rel_loc)->locatorType) +#define IsRelationColumnDistributed(rel_loc) IsLocatorColumnDistributed((rel_loc)->locatorType) +#define IsRelationDistributedByValue(rel_loc) IsLocatorDistributedByValue((rel_loc)->locatorType) /* * Nodes to execute on * primarynodelist is for replicated table writes, where to execute first. @@ -103,9 +103,9 @@ typedef struct Expr *en_expr; /* expression to evaluate at execution time if planner * can not determine execution nodes */ #ifdef __COLD_HOT__ - Expr *sec_en_expr; /* Sec Expression to evaluate at execution time - * if planner can not determine execution - * nodes */ + Expr *sec_en_expr; /* Sec Expression to evaluate at execution time + * if planner can not determine execution + * nodes */ #endif Oid en_relid; /* Relation to determine execution nodes */ RelationAccessType accesstype; /* Access type to determine execution nodes */ @@ -113,6 +113,7 @@ typedef struct bool restrict_shippable; /* The ExecNode is choose by join qual on distribute column */ bool const_subquery; /* The subquery rte only got constant values */ #endif + bool need_rewrite; /* exists func, need to be rewritted when execute plan */ } ExecNodes; @@ -122,17 +123,17 @@ typedef struct typedef enum { - LOCATOR_LIST_NONE, /* locator returns integers in range 0..NodeCount-1, - * value of nodeList ignored and can be NULL */ - LOCATOR_LIST_INT, /* nodeList is an integer array (int *), value from - * the array is returned */ - LOCATOR_LIST_OID, /* node list is an array of Oids (Oid *), value from - * the array is returned */ - LOCATOR_LIST_POINTER, /* node list is an array of pointers (void **), - * value from the array is returned */ - LOCATOR_LIST_LIST, /* node list is a list, item type is determined by - * list type (integer, oid or pointer). NodeCount - * is ignored */ + LOCATOR_LIST_NONE, /* locator returns integers in range 0..NodeCount-1, + * value of nodeList ignored and can be NULL */ + LOCATOR_LIST_INT, /* nodeList is an integer array (int *), value from + * the array is returned */ + LOCATOR_LIST_OID, /* node list is an array of Oids (Oid *), value from + * the array is returned */ + LOCATOR_LIST_POINTER, /* node list is an array of pointers (void **), + * value from the array is returned */ + LOCATOR_LIST_LIST, /* node list is a list, item type is determined by + * list type (integer, oid or pointer). NodeCount + * is ignored */ } LocatorListType; typedef Datum (*LocatorHashFunc) (PG_FUNCTION_ARGS); @@ -152,35 +153,35 @@ typedef struct _Locator Locator; * accessType - see RelationAccessType enum * dataType - actual data type of values provided to determine nodes * listType - defines how nodeList parameter is interpreted, see - * LocatorListType enum for more details + * LocatorListType enum for more details * nodeCount - number of nodes to distribute - * nodeList - detailed info about relation nodes. Either List or array or NULL - * result - returned address of the array where locator will output node - * references. Type of array items (int, Oid or pointer (void *)) - * depends on listType. - * primary - set to true if caller ever wants to determine primary node. + * nodeList - detailed info about relation nodes. Either List or array or NULL + * result - returned address of the array where locator will output node + * references. Type of array items (int, Oid or pointer (void *)) + * depends on listType. + * primary - set to true if caller ever wants to determine primary node. 
* Primary node will be returned as the first element of the - * result array + * result array */ #ifdef _MIGRATE_ extern Locator *createLocator(char locatorType, RelationAccessType accessType, - Oid dataType, LocatorListType listType, int nodeCount, - void *nodeList, void **result, bool primary, Oid groupid, - Oid coldGroupId, Oid secDataType, AttrNumber secAttrNum, - Oid relid); + Oid dataType, LocatorListType listType, int nodeCount, + void *nodeList, void **result, bool primary, Oid groupid, + Oid coldGroupId, Oid secDataType, AttrNumber secAttrNum, + Oid relid); #else extern Locator *createLocator(char locatorType, RelationAccessType accessType, - Oid dataType, LocatorListType listType, int nodeCount, - void *nodeList, void **result, bool primary); + Oid dataType, LocatorListType listType, int nodeCount, + void *nodeList, void **result, bool primary); #endif extern void freeLocator(Locator *locator); extern int GET_NODES(Locator *self, Datum value, bool isnull, #ifdef __COLD_HOT__ - Datum secValue, bool secIsNull, + Datum secValue, bool secIsNull, #endif - bool *hasprimary); + bool *hasprimary); extern void *getLocatorResults(Locator *self); extern void *getLocatorNodeMap(Locator *self); extern int getLocatorNodeCount(Locator *self); @@ -200,20 +201,22 @@ extern RelationLocInfo *CopyRelationLocInfo(RelationLocInfo *src_info); extern char GetRelationLocType(Oid relid); extern bool IsTableDistOnPrimary(RelationLocInfo *rel_loc_info); extern bool IsLocatorInfoEqual(RelationLocInfo *rel_loc_info1, RelationLocInfo *rel_loc_info2); -extern int GetRoundRobinNode(Oid relid); +extern int GetRoundRobinNode(Oid relid); extern ExecNodes *GetRelationNodes(RelationLocInfo *rel_loc_info, Datum valueForDistCol, - bool isValueNull, + bool isValueNull, #ifdef __COLD_HOT__ - Datum valueForSecDistCol, bool isSecValueNull, + Datum valueForSecDistCol, bool isSecValueNull, #endif - RelationAccessType accessType); + RelationAccessType accessType); +extern ExecNodes *GetRelationNodesForExplain(RelationLocInfo *rel_loc_info, + RelationAccessType accessType); extern ExecNodes *GetRelationNodesByQuals(Oid reloid, - RelationLocInfo *rel_loc_info, - Index varno, - Node *quals, - RelationAccessType relaccess, - Node **dis_qual, - Node **sec_quals); + RelationLocInfo *rel_loc_info, + Index varno, + Node *quals, + RelationAccessType relaccess, + Node **dis_qual, + Node **sec_quals); extern bool IsTypeHashDistributable(Oid col_type); extern List *GetAllDataNodes(void); diff --git a/src/include/pgxc/planner.h b/src/include/pgxc/planner.h index f08c4fce..cb221759 100644 --- a/src/include/pgxc/planner.h +++ b/src/include/pgxc/planner.h @@ -207,6 +207,7 @@ typedef struct * triggers. In order to make triggers work, we separate UPSERT into INSERT and * UPDATE. 
*/ + Query *forDeparse; /* function statement */ char *sql_select; /* select statement */ char *sql_select_base; bool forUpadte; diff --git a/src/test/regress/expected/fast_default.out b/src/test/regress/expected/fast_default.out index 16c60821..d390a452 100644 --- a/src/test/regress/expected/fast_default.out +++ b/src/test/regress/expected/fast_default.out @@ -452,16 +452,20 @@ DELETE FROM T WHERE pk BETWEEN 10 AND 20 RETURNING *; EXPLAIN (VERBOSE TRUE, COSTS FALSE) DELETE FROM T WHERE pk BETWEEN 10 AND 20 RETURNING *; - QUERY PLAN -------------------------------------------------------- - Remote Subquery Scan on all (datanode_1,datanode_2) - Output: pk, c_bigint, c_text + QUERY PLAN +------------------------------------------------------------------------------------------------ + Remote Fast Query Execution + Output: t.xc_node_id, t.ctid, t.shardid, t.pk + Node/s: datanode_1, datanode_2 + Remote query: DELETE FROM t WHERE ((pk >= 10) AND (pk <= 20)) RETURNING pk, c_bigint, c_text -> Delete on fast_default.t Output: pk, c_bigint, c_text - -> Seq Scan on fast_default.t - Output: xc_node_id, ctid, shardid, pk - Filter: ((t.pk >= 10) AND (t.pk <= 20)) -(7 rows) + -> Bitmap Heap Scan on fast_default.t + Output: ctid, shardid + Recheck Cond: ((t.pk >= 10) AND (t.pk <= 20)) + -> Bitmap Index Scan on t_pkey + Index Cond: ((t.pk >= 10) AND (t.pk <= 20)) +(11 rows) -- UPDATE UPDATE T SET c_text = '"' || c_text || '"' WHERE pk < 10; diff --git a/src/test/regress/expected/insert_conflict_1.out b/src/test/regress/expected/insert_conflict_1.out index 1dce5ece..1a544406 100644 --- a/src/test/regress/expected/insert_conflict_1.out +++ b/src/test/regress/expected/insert_conflict_1.out @@ -219,37 +219,37 @@ explain (costs off) insert into insertconflicttest values(0, 'Crowberry') on con -- Does the same, but JSON format shows "Conflict Arbiter Index" as JSON array: explain (costs off, format json) insert into insertconflicttest values (0, 'Bilberry') on conflict (key) do update set fruit = excluded.fruit where insertconflicttest.fruit != 'Lime' returning *; - QUERY PLAN ----------------------------------------------------------------------------- - [ + - { + - "Plan": { + - "Node Type": "Remote Subquery Scan", + - "Parallel Aware": false, + - "Replicated": "no", + - "Node List": ["datanode_2"], + - "Plans": [ + - { + - "Node Type": "ModifyTable", + - "Operation": "Insert", + - "Parent Relationship": "Outer", + - "Parallel Aware": false, + - "Relation Name": "insertconflicttest", + - "Alias": "insertconflicttest", + - "Conflict Resolution": "UPDATE", + - "Conflict Arbiter Indexes": ["key_index"], + - "Conflict Filter": "(insertconflicttest.fruit <> 'Lime'::text)",+ - "Plans": [ + - { + - "Node Type": "Result", + - "Parent Relationship": "Member", + - "Parallel Aware": false + - } + - ] + - } + - ] + - } + - } + + QUERY PLAN +------------------------------------------------------------------------------ + [ + + { + + "Plan": { + + "Node Type": "Remote Fast Query Execution", + + "Parallel Aware": false, + + "Node expr": "0" + + "Remote plan": [ + + { + + "Plan": { + + "Node Type": "ModifyTable", + + "Operation": "Insert", + + "Parallel Aware": false, + + "Relation Name": "insertconflicttest", + + "Alias": "insertconflicttest", + + "Conflict Resolution": "UPDATE", + + "Conflict Arbiter Indexes": ["key_index"], + + "Conflict Filter": "(insertconflicttest.fruit <> 'Lime'::text)",+ + "Plans": [ + + { + + "Node Type": "Result", + + "Parent Relationship": "Member", + + "Parallel Aware": false + + } + + ] + + } + 
+ } + + ] + + } + + } + ] (1 row) diff --git a/src/test/regress/expected/prepare.out b/src/test/regress/expected/prepare.out index 787b242c..7dd52d9a 100644 --- a/src/test/regress/expected/prepare.out +++ b/src/test/regress/expected/prepare.out @@ -162,3 +162,238 @@ SELECT name, statement, parameter_types FROM pg_prepared_statements ------+-----------+----------------- (0 rows) +-- +-- search_path test +-- +CREATE DATABASE search_path_db; +\c search_path_db +CREATE TABLE tbl_test( + id int primary key, + name varchar(30) +); +INSERT INTO tbl_test VALUES (1, 'public 01'); +INSERT INTO tbl_test VALUES (2, 'public 02'); +INSERT INTO tbl_test VALUES (3, 'public 03'); +select * from tbl_test order by id; + id | name +----+----------- + 1 | public 01 + 2 | public 02 + 3 | public 03 +(3 rows) + +-- create schema +CREATE SCHEMA sch01; +CREATE SCHEMA sch02; +-- set schema to sch01 +SET search_path TO sch01; +CREATE TABLE IF NOT EXISTS tbl_test( + id int primary key, + name varchar(30) +); +BEGIN; +INSERT INTO tbl_test VALUES (11, 'sch01 11'); +INSERT INTO tbl_test VALUES (12, 'sch01 12'); +INSERT INTO tbl_test VALUES (13, 'sch01 13'); +COMMIT; +select * from tbl_test order by id; + id | name +----+---------- + 11 | sch01 11 + 12 | sch01 12 + 13 | sch01 13 +(3 rows) + +-- set schema to sch02 +SET search_path TO sch02; +CREATE TABLE IF NOT EXISTS tbl_test( + id int primary key, + name varchar(30) +); +BEGIN; +INSERT INTO tbl_test VALUES (21, 'sch02 21'); +INSERT INTO tbl_test VALUES (22, 'sch02 22'); +INSERT INTO tbl_test VALUES (23, 'sch02 23'); +ROLLBACK; +select * from tbl_test order by id; + id | name +----+------ +(0 rows) + +-- set schema to sch01 +SET search_path = sch01; +SHOW search_path; + search_path +------------- + sch01 +(1 row) + +PREPARE ps_test_insert (int, varchar) AS INSERT INTO tbl_test VALUES ($1, $2);; +PREPARE ps_test_select (int) AS select * from tbl_test where id < $1 order by id; +BEGIN; +EXECUTE ps_test_insert(14, 'sch01 14'); +EXECUTE ps_test_select(50); + id | name +----+---------- + 11 | sch01 11 + 12 | sch01 12 + 13 | sch01 13 + 14 | sch01 14 +(4 rows) + +ROLLBACK; +EXECUTE ps_test_select(50); + id | name +----+---------- + 11 | sch01 11 + 12 | sch01 12 + 13 | sch01 13 +(3 rows) + +SHOW search_path; + search_path +------------- + sch01 +(1 row) + +BEGIN; +EXECUTE ps_test_insert(15, 'sch01 15'); +EXECUTE ps_test_select(50); + id | name +----+---------- + 11 | sch01 11 + 12 | sch01 12 + 13 | sch01 13 + 15 | sch01 15 +(4 rows) + +COMMIT; +EXECUTE ps_test_select(50); + id | name +----+---------- + 11 | sch01 11 + 12 | sch01 12 + 13 | sch01 13 + 15 | sch01 15 +(4 rows) + +SHOW search_path; + search_path +------------- + sch01 +(1 row) + +EXECUTE ps_test_insert(16, 'sch01 16'); +EXECUTE ps_test_select(50); + id | name +----+---------- + 11 | sch01 11 + 12 | sch01 12 + 13 | sch01 13 + 15 | sch01 15 + 16 | sch01 16 +(5 rows) + +SHOW search_path; + search_path +------------- + sch01 +(1 row) + +DEALLOCATE PREPARE ps_test_insert; +DEALLOCATE PREPARE ps_test_select; +-- test insert fqs in prepare +CREATE TABLE insert_fsq_test(id serial primary key, name varchar(30)); +PREPARE ps_test_insert (varchar) AS INSERT INTO insert_fsq_test (name) VALUES ($1); +EXECUTE ps_test_insert('1'); +EXECUTE ps_test_insert('2'); +EXECUTE ps_test_insert('3'); +EXECUTE ps_test_insert('4'); +EXECUTE ps_test_insert('5'); +SELECT * from insert_fsq_test order by id; + id | name +----+------ + 1 | 1 + 2 | 2 + 3 | 3 + 4 | 4 + 5 | 5 +(5 rows) + +DEALLOCATE PREPARE ps_test_insert; +DROP TABLE insert_fsq_test 
cascade; +-- +-- gb18030 test +-- +CREATE DATABASE gb18030_db template template0 encoding = gb18030 LC_COLLATE = 'zh_CN.gb18030' LC_CTYPE = 'zh_CN.gb18030'; +\c gb18030_db; +-- set client_encoding +SET client_encoding = utf8; +CREATE TABLE tbl_test(id int primary key, name varchar(3)); +INSERT INTO tbl_test VALUES (3, '张三'); +BEGIN; +INSERT INTO tbl_test VALUES (4, '李四'); +INSERT INTO tbl_test VALUES (5, '王五'); +COMMIT; +BEGIN; +INSERT INTO tbl_test VALUES (6, '丁六'); +INSERT INTO tbl_test VALUES (7, '方七'); +ROLLBACK; +SELECT * FROM tbl_test ORDER BY id; + id | name +----+------ + 3 | 张三 + 4 | 李四 + 5 | 王五 +(3 rows) + +SHOW client_encoding; + client_encoding +----------------- + UTF8 +(1 row) + +PREPARE ps_test (int) AS select * from tbl_test where id < $1 order by id; +EXECUTE ps_test(20); + id | name +----+------ + 3 | 张三 + 4 | 李四 + 5 | 王五 +(3 rows) + +SHOW client_encoding; + client_encoding +----------------- + UTF8 +(1 row) + +EXECUTE ps_test(20); + id | name +----+------ + 3 | 张三 + 4 | 李四 + 5 | 王五 +(3 rows) + +SHOW client_encoding; + client_encoding +----------------- + UTF8 +(1 row) + +EXECUTE ps_test(20); + id | name +----+------ + 3 | 张三 + 4 | 李四 + 5 | 王五 +(3 rows) + +SHOW client_encoding; + client_encoding +----------------- + UTF8 +(1 row) + +DEALLOCATE PREPARE ps_test; diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out index 89552269..14660970 100644 --- a/src/test/regress/expected/rules.out +++ b/src/test/regress/expected/rules.out @@ -3156,13 +3156,14 @@ SELECT tablename, rulename, definition FROM pg_rules explain (costs off) INSERT INTO hats VALUES ('h8', 'forbidden') RETURNING *; QUERY PLAN ------------------------------------------------------------------------------------------------------- - Remote Subquery Scan on all (datanode_1) + Remote Fast Query Execution + Node expr: 'h8'::bpchar -> Insert on hat_data Conflict Resolution: UPDATE Conflict Arbiter Indexes: hat_data_unique_idx Conflict Filter: ((excluded.hat_color <> 'forbidden'::bpchar) AND (hat_data.* <> excluded.*)) -> Result -(6 rows) +(7 rows) -- ensure upserting into a rule, with a CTE (different offsets!) works WITH data(hat_name, hat_color) AS MATERIALIZED ( diff --git a/src/test/regress/output/constraints_3.source b/src/test/regress/output/constraints_3.source index e19ef775..568efec7 100644 --- a/src/test/regress/output/constraints_3.source +++ b/src/test/regress/output/constraints_3.source @@ -187,7 +187,7 @@ DETAIL: Failing row contains (8, Y, -8). 
SELECT 'eight' AS one, currval('insert_seq'); one | currval -------+--------- - eight | 7 + eight | 8 (1 row) -- According to SQL, it is OK to insert a record that gives rise to NULL diff --git a/src/test/regress/sql/prepare.sql b/src/test/regress/sql/prepare.sql index 507c0668..9a465ab3 100644 --- a/src/test/regress/sql/prepare.sql +++ b/src/test/regress/sql/prepare.sql @@ -75,3 +75,132 @@ SELECT name, statement, parameter_types FROM pg_prepared_statements DEALLOCATE ALL; SELECT name, statement, parameter_types FROM pg_prepared_statements ORDER BY name; + +-- +-- search_path test +-- +CREATE DATABASE search_path_db; +\c search_path_db + +CREATE TABLE tbl_test( + id int primary key, + name varchar(30) +); + +INSERT INTO tbl_test VALUES (1, 'public 01'); +INSERT INTO tbl_test VALUES (2, 'public 02'); +INSERT INTO tbl_test VALUES (3, 'public 03'); + +select * from tbl_test order by id; + +-- create schema +CREATE SCHEMA sch01; +CREATE SCHEMA sch02; + +-- set schema to sch01 +SET search_path TO sch01; + +CREATE TABLE IF NOT EXISTS tbl_test( + id int primary key, + name varchar(30) +); + +BEGIN; +INSERT INTO tbl_test VALUES (11, 'sch01 11'); +INSERT INTO tbl_test VALUES (12, 'sch01 12'); +INSERT INTO tbl_test VALUES (13, 'sch01 13'); +COMMIT; + +select * from tbl_test order by id; + +-- set schema to sch02 +SET search_path TO sch02; + +CREATE TABLE IF NOT EXISTS tbl_test( + id int primary key, + name varchar(30) +); + +BEGIN; +INSERT INTO tbl_test VALUES (21, 'sch02 21'); +INSERT INTO tbl_test VALUES (22, 'sch02 22'); +INSERT INTO tbl_test VALUES (23, 'sch02 23'); +ROLLBACK; + +select * from tbl_test order by id; + +-- set schema to sch01 +SET search_path = sch01; +SHOW search_path; + +PREPARE ps_test_insert (int, varchar) AS INSERT INTO tbl_test VALUES ($1, $2);; +PREPARE ps_test_select (int) AS select * from tbl_test where id < $1 order by id; + +BEGIN; +EXECUTE ps_test_insert(14, 'sch01 14'); +EXECUTE ps_test_select(50); +ROLLBACK; +EXECUTE ps_test_select(50); + +SHOW search_path; + +BEGIN; +EXECUTE ps_test_insert(15, 'sch01 15'); +EXECUTE ps_test_select(50); +COMMIT; +EXECUTE ps_test_select(50); + +SHOW search_path; + +EXECUTE ps_test_insert(16, 'sch01 16'); +EXECUTE ps_test_select(50); + +SHOW search_path; + +DEALLOCATE PREPARE ps_test_insert; +DEALLOCATE PREPARE ps_test_select; + +-- test insert fqs in prepare +CREATE TABLE insert_fsq_test(id serial primary key, name varchar(30)); +PREPARE ps_test_insert (varchar) AS INSERT INTO insert_fsq_test (name) VALUES ($1); +EXECUTE ps_test_insert('1'); +EXECUTE ps_test_insert('2'); +EXECUTE ps_test_insert('3'); +EXECUTE ps_test_insert('4'); +EXECUTE ps_test_insert('5'); +SELECT * from insert_fsq_test order by id; +DEALLOCATE PREPARE ps_test_insert; +DROP TABLE insert_fsq_test cascade; + +-- +-- gb18030 test +-- +CREATE DATABASE gb18030_db template template0 encoding = gb18030 LC_COLLATE = 'zh_CN.gb18030' LC_CTYPE = 'zh_CN.gb18030'; +\c gb18030_db; + +-- set client_encoding +SET client_encoding = utf8; + +CREATE TABLE tbl_test(id int primary key, name varchar(3)); + +INSERT INTO tbl_test VALUES (3, '张三'); +BEGIN; +INSERT INTO tbl_test VALUES (4, '李四'); +INSERT INTO tbl_test VALUES (5, '王五'); +COMMIT; +BEGIN; +INSERT INTO tbl_test VALUES (6, '丁六'); +INSERT INTO tbl_test VALUES (7, '方七'); +ROLLBACK; +SELECT * FROM tbl_test ORDER BY id; + +SHOW client_encoding; + +PREPARE ps_test (int) AS select * from tbl_test where id < $1 order by id; +EXECUTE ps_test(20); +SHOW client_encoding; +EXECUTE ps_test(20); +SHOW client_encoding; +EXECUTE ps_test(20); 
+SHOW client_encoding; +DEALLOCATE PREPARE ps_test; From 8496d5d717807b3e119994ca85fe1d2a335fbc6b Mon Sep 17 00:00:00 2001 From: bethding Date: Sat, 29 May 2021 14:17:34 +0800 Subject: [PATCH 148/578] get exec_nodes's func value in rewrite for sql http://tapd.oa.com/10092131/bugtrace/bugs/view?bug_id=1010092131088058267&url_cache_key=3ba5cc9f7d4408eb8cb3e14319eb688f --- src/backend/executor/execMain.c | 30 +++++++++++++++++++++++++----- src/backend/nodes/copyfuncs.c | 3 +++ src/backend/pgxc/pool/execRemote.c | 15 +++++++++++++-- src/include/pgxc/locator.h | 3 +++ 4 files changed, 44 insertions(+), 7 deletions(-) diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c index d516a94c..9ec3add2 100644 --- a/src/backend/executor/execMain.c +++ b/src/backend/executor/execMain.c @@ -1911,13 +1911,19 @@ ExecEndPlan(PlanState *planstate, EState *estate) * in different datanodes. */ static void -RewriteForSql(RemoteQuery *plan, Query *query, +RewriteForSql(RemoteQueryState *planstate, RemoteQuery *plan, char *distribcol, bool isreplic) { + Query *query = copyObject(plan->forDeparse); ListCell *lc_deparse = NULL; TargetEntry *entry_deparse = NULL; bool find_target = false; StringInfoData buf; + bool isnull; + Datum partvalue; + ExprState *estate = NULL; + + plan->exec_nodes->rewrite_done = false; foreach(lc_deparse, query->targetList) { @@ -1932,7 +1938,21 @@ RewriteForSql(RemoteQuery *plan, Query *query, { entry_deparse->expr = (Expr *)replace_distribkey_func( (Node *)entry_deparse->expr); - plan->exec_nodes->en_expr = entry_deparse->expr; + + /* + * Get expr value here to avoid executing function again + * in get_exec_connections. + */ + estate = ExecInitExpr(entry_deparse->expr, + (PlanState *) planstate); + if (planstate->eflags != EXEC_FLAG_EXPLAIN_ONLY) + partvalue = ExecEvalExpr(estate, + planstate->combiner.ss.ps.ps_ExprContext, + &isnull); + + plan->exec_nodes->rewrite_value = partvalue; + plan->exec_nodes->isnull = isnull; + plan->exec_nodes->rewrite_done = true; find_target = true; break; } @@ -1961,9 +1981,9 @@ RewriteFuncNode(PlanState *planstate) { RemoteQuery *plan = (RemoteQuery *)planstate->plan; ExecNodes *exec_nodes = plan->exec_nodes; - Query *query = copyObject(plan->forDeparse); RelationLocInfo *rel_loc_info = NULL; char *distribcol = NULL; + RemoteQueryState *node = castNode(RemoteQueryState, planstate); if ((!exec_nodes) || (!exec_nodes->need_rewrite)) return; @@ -1974,7 +1994,7 @@ RewriteFuncNode(PlanState *planstate) */ if (IsExecNodesReplicated(exec_nodes)) { - RewriteForSql(plan, query, NULL, true); + RewriteForSql(node, plan, NULL, true); return; } @@ -1986,7 +2006,7 @@ RewriteFuncNode(PlanState *planstate) return; distribcol = GetRelationDistribColumn(rel_loc_info); - RewriteForSql(plan, query, distribcol, false); + RewriteForSql(node, plan, distribcol, false); } /* ---------------------------------------------------------------- diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c index 780ad8aa..3d4e8a68 100644 --- a/src/backend/nodes/copyfuncs.c +++ b/src/backend/nodes/copyfuncs.c @@ -1363,6 +1363,9 @@ _copyExecNodes(const ExecNodes *from) COPY_SCALAR_FIELD(en_relid); COPY_SCALAR_FIELD(accesstype); COPY_SCALAR_FIELD(need_rewrite); + COPY_SCALAR_FIELD(rewrite_value); + COPY_SCALAR_FIELD(isnull); + COPY_SCALAR_FIELD(rewrite_done); return newnode; } diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index d95a772a..491e33d3 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ 
b/src/backend/pgxc/pool/execRemote.c @@ -6288,18 +6288,29 @@ get_exec_connections(RemoteQueryState *planstate, bool isnull; ExecNodes *nodes; Datum partvalue; + ExprState *estate; #ifdef __COLD_HOT__ bool secisnull; Datum secValue; #endif - ExprState *estate = ExecInitExpr(exec_nodes->en_expr, + RelationLocInfo *rel_loc_info; + if (exec_nodes->rewrite_done) + { + partvalue = exec_nodes->rewrite_value; + isnull = exec_nodes->isnull; + } + else + { + estate = ExecInitExpr(exec_nodes->en_expr, (PlanState *) planstate); /* For explain, no need to execute expr. */ if (planstate->eflags != EXEC_FLAG_EXPLAIN_ONLY) partvalue = ExecEvalExpr(estate, planstate->combiner.ss.ps.ps_ExprContext, &isnull); - RelationLocInfo *rel_loc_info = GetRelationLocInfo(exec_nodes->en_relid); + } + + rel_loc_info = GetRelationLocInfo(exec_nodes->en_relid); #ifdef __COLD_HOT__ if (exec_nodes->sec_en_expr) diff --git a/src/include/pgxc/locator.h b/src/include/pgxc/locator.h index bef6e6a0..4e692237 100644 --- a/src/include/pgxc/locator.h +++ b/src/include/pgxc/locator.h @@ -114,6 +114,9 @@ typedef struct bool const_subquery; /* The subquery rte only got constant values */ #endif bool need_rewrite; /* exists func, need to be rewritted when execute plan */ + Datum rewrite_value; /* function evaluate result */ + bool isnull; + bool rewrite_done; /* function rewritted */ } ExecNodes; From 7b40013a7d0a12b441eba460f92b3fdc1d470b4b Mon Sep 17 00:00:00 2001 From: andrelin Date: Fri, 23 Apr 2021 15:56:28 +0800 Subject: [PATCH 149/578] Introduce global session view (merge request !276) Squash merge branch 'andrelin/global_session' into 'Tbase_v2.15.19' * Minor adjustment after review by YoungXie * Should skip explain remote query when report planstate * Introduce pg_cancel_session and pg_terminate_session to send signals to all backends of one session * Add more comments * Transport session id to parallel workers * Copy backend id into local backend status to support cluster stat collect * fix format * Support multi query strategy * Support collect backend stat from remote nodes * Support EXEC_ON_ALL_NODES if sending NIL nodelist in ExecRemoteQuery * fix a warning * Add hook before PortalDrop and after PortalStart * Allocate session id from gtm with format nodename_pid_timestamp in CN * Initial commit of global session view * fix http://tapd.oa.com/pgxz/bugtrace/bugs/view?bug_id=1010092131086872963 --- contrib/Makefile | 3 +- contrib/pg_stat_cluster_activity/Makefile | 27 + .../pg_stat_cluster_activity--1.0.sql | 60 + ...stat_cluster_activity--unpackaged--1.0.sql | 8 + .../pg_stat_cluster_activity.c | 1069 ++++++++++++++ .../pg_stat_cluster_activity.conf | 1 + .../pg_stat_cluster_activity.control | 5 + src/backend/access/transam/parallel.c | 16 +- src/backend/commands/explain.c | 9 + src/backend/pgxc/pool/execRemote.c | 34 +- src/backend/pgxc/pool/pgxcnode.c | 123 +- src/backend/pgxc/squeue/squeue.c | 6 + src/backend/postmaster/pgstat.c | 1 + src/backend/postmaster/postmaster.c | 3 + src/backend/tcop/postgres.c | 22 +- src/backend/tcop/pquery.c | 7 + src/backend/utils/mmgr/portalmem.c | 6 + src/include/commands/explain.h | 3 + src/include/pgstat.h | 1273 +++++++++-------- src/include/pgxc/pgxc.h | 3 + src/include/pgxc/pgxcnode.h | 11 +- src/include/pgxc/squeue.h | 83 +- src/include/utils/portal.h | 6 + 23 files changed, 2069 insertions(+), 710 deletions(-) create mode 100644 contrib/pg_stat_cluster_activity/Makefile create mode 100644 contrib/pg_stat_cluster_activity/pg_stat_cluster_activity--1.0.sql create mode 100644 
contrib/pg_stat_cluster_activity/pg_stat_cluster_activity--unpackaged--1.0.sql create mode 100644 contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c create mode 100644 contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.conf create mode 100644 contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.control diff --git a/contrib/Makefile b/contrib/Makefile index 1d0dcd37..494da1e1 100644 --- a/contrib/Makefile +++ b/contrib/Makefile @@ -54,7 +54,8 @@ SUBDIRS = \ unaccent \ vacuumlo \ stormstats \ - tbase_pooler_stat + tbase_pooler_stat \ + pg_stat_cluster_activity ifeq ($(with_openssl),yes) SUBDIRS += sslinfo diff --git a/contrib/pg_stat_cluster_activity/Makefile b/contrib/pg_stat_cluster_activity/Makefile new file mode 100644 index 00000000..a12ef09e --- /dev/null +++ b/contrib/pg_stat_cluster_activity/Makefile @@ -0,0 +1,27 @@ +# contrib/pg_stat_cluster_activity/Makefile + +MODULE_big = pg_stat_cluster_activity +OBJS = pg_stat_cluster_activity.o $(WIN32RES) + +EXTENSION = pg_stat_cluster_activity +DATA = pg_stat_cluster_activity--1.0.sql +PGFILEDESC = "pg_stat_cluster_activity - execution of cluster statistics" + +LDFLAGS_SL += $(filter -lm, $(LIBS)) + +REGRESS_OPTS = --temp-config $(top_srcdir)/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.conf +REGRESS = pg_stat_cluster_activity +# Disabled because these tests require "shared_preload_libraries=pg_stat_cluster_activity", +# which typical installcheck users do not have (e.g. buildfarm clients). +NO_INSTALLCHECK = 1 + +ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +else +subdir = contrib/pg_stat_cluster_activity +top_builddir = ../.. +include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif diff --git a/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity--1.0.sql b/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity--1.0.sql new file mode 100644 index 00000000..9f524816 --- /dev/null +++ b/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity--1.0.sql @@ -0,0 +1,60 @@ +/* contrib/pg_stat_cluster_activity/pg_stat_cluster_activity--1.0.sql */ + +-- complain if script is sourced in psql, rather than via ALTER EXTENSION +\echo Use "CREATE EXTENSION pg_stat_cluster_activity" to load this file. 
\quit + +/* Now redefine */ +CREATE OR REPLACE FUNCTION pg_stat_get_cluster_activity( + sessionid text, + coordonly bool, + localonly bool, + OUT sessionid text, + OUT pid integer, + OUT client_addr inet, + OUT client_hostname text, + OUT client_port integer, + OUT nodename text, + OUT role text, + OUT datid oid, + OUT usesysid oid, + OUT wait_event_type text, + OUT wait_event text, + OUT state text, + OUT sqname text, + OUT sqdone bool, + OUT query text, + OUT planstate text, + OUT portal text, + OUT cursors text, + OUT backend_start timestamp, + OUT xact_start timestamp, + OUT query_start timestamp, + OUT state_change timestamp +) +RETURNS SETOF record +AS 'MODULE_PATHNAME' +LANGUAGE C; + +CREATE OR REPLACE FUNCTION pg_signal_session(text, integer, bool) +RETURNS bool +AS 'MODULE_PATHNAME' +LANGUAGE C; + +CREATE OR REPLACE FUNCTION pg_terminate_session(text) +RETURNS bool +AS 'MODULE_PATHNAME' +LANGUAGE C; + +CREATE OR REPLACE FUNCTION pg_cancel_session(text) +RETURNS bool +AS 'MODULE_PATHNAME' +LANGUAGE C; + +CREATE OR REPLACE VIEW pg_stat_cluster_activity AS + SELECT * FROM pg_stat_get_cluster_activity(NULL, false, false); + +CREATE OR REPLACE VIEW pg_stat_cluster_activity_cn AS + SELECT * FROM pg_stat_get_cluster_activity(NULL, true, false); + +GRANT SELECT ON pg_stat_cluster_activity TO PUBLIC; +GRANT SELECT ON pg_stat_cluster_activity_cn TO PUBLIC; diff --git a/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity--unpackaged--1.0.sql b/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity--unpackaged--1.0.sql new file mode 100644 index 00000000..33f68860 --- /dev/null +++ b/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity--unpackaged--1.0.sql @@ -0,0 +1,8 @@ +/* contrib/pg_stat_cluster_activity/pg_stat_cluster_activity--unpackaged--1.0.sql */ + +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "CREATE EXTENSION pg_stat_cluster_activity" to load this file. \quit + +ALTER EXTENSION pg_stat_cluster_activity ADD function pg_stat_cluster_get_activity(); +ALTER EXTENSION pg_stat_statements ADD view pg_stat_cluster_activity; +ALTER EXTENSION pg_stat_statements ADD view pg_stat_cluster_activity_cn; diff --git a/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c b/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c new file mode 100644 index 00000000..ff748ae0 --- /dev/null +++ b/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c @@ -0,0 +1,1069 @@ +#include "postgres.h" + +#include "catalog/pg_authid.h" +#include "catalog/pg_type.h" +#include "commands/explain.h" +#include "common/ip.h" +#include "fmgr.h" +#include "funcapi.h" +#include "miscadmin.h" +#include "nodes/makefuncs.h" +#include "nodes/nodeFuncs.h" +#include "pgstat.h" +#include "pgxc/execRemote.h" +#include "pgxc/pgxc.h" +#include "pgxc/squeue.h" +#include "port/atomics.h" +#include "storage/ipc.h" +#include "storage/procarray.h" +#include "storage/shmem.h" +#include "utils/builtins.h" +#include "utils/portal.h" +#include "utils/snapmgr.h" +#include "utils/timestamp.h" + +PG_MODULE_MAGIC; + +#define PG_STAT_GET_ClUSTER_ACTIVITY_COLS 22 + +/* ---------- + * Total number of backends including auxiliary + * + * We reserve a slot for each possible BackendId, plus one for each + * possible auxiliary process type. (This scheme assumes there is not + * more than one of any auxiliary process type at a time.) MaxBackends + * includes autovacuum workers and background workers as well. 
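(Editorial aside, not part of the patch.) A rough usage sketch for the views and functions installed by the SQL script above; the session id literal is made up, but the view, column, and function names come from that script, and the preload requirement is stated in the extension's conf file and error hints.

-- Requires shared_preload_libraries = 'pg_stat_cluster_activity'.
CREATE EXTENSION pg_stat_cluster_activity;

-- One row per backend participating in each cluster-wide session.
SELECT sessionid, pid, nodename, role, state, query
  FROM pg_stat_cluster_activity
 ORDER BY sessionid, nodename;

-- Cancel or terminate every backend belonging to one global session.
SELECT pg_cancel_session('cn001_12345_1621234567');
SELECT pg_terminate_session('cn001_12345_1621234567');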
+ * ---------- + */ +#define NumBackendStatSlots (MaxBackends + NUM_AUXPROCTYPES) + +#define UINT32_ACCESS_ONCE(var) ((uint32)(*((volatile uint32 *)&(var)))) + +/* + * PgClusterStatus is something like PgBackendStatus (see pgstat.c) but it + * contains information that a query executed in a cluster database system. + * Each PgClusterStatus stands for a backend process forked by postmaster, + * the same way PgBackendStatus does, like extended fields of PgBackendStatus. + * We show it in view pg_stat_cluster_activity, still, one tuple for an entry. + */ +typedef struct PgClusterStatus +{ + /* + * To avoid locking overhead, we use the following protocol: a backend + * increments changecount before modifying its entry, and again after + * finishing a modification. A would-be reader should note the value of + * changecount, copy the entry into private memory, then check + * changecount again. If the value hasn't changed, and if it's even, + * the copy is valid; otherwise start over. This makes updates cheap + * while reads are potentially expensive, but that's the tradeoff we want. + * + * The above protocol needs the memory barriers to ensure that the + * apparent order of execution is as it desires. Otherwise, for example, + * the CPU might rearrange the code so that changecount is incremented + * twice before the modification on a machine with weak memory ordering. + * This surprising result can lead to bugs. + */ + int changecount; + + bool valid; /* don't show this entry if false */ + + /* fields that will be shown in pg_stat_cluster_activity */ + char sessionid[NAMEDATALEN]; /* global session id in a cluster, one for a session */ + char nodename[NAMEDATALEN]; /* nodename, determined after process started */ + char role[NAMEDATALEN]; /* coord, datanode, producer or consumer */ + + /* portal_name or portal_name_unique */ + char sqname[NAMEDATALEN]; + /* true if sharequeue end, but currently change when query ends in this backend */ + bool sqdone; + /* part of plantree this backend is processing, OR last processed if backend is idle */ + char planstate[4096]; + + /* + * portal name: the name of current portal, given by upper node of processing query + * cursor name: contained in planstate this backend is querying, which would be + * portal name of next layer of nodes bellow this backend + * + * Note: with these two fields plus nodename, we can build a backend tree of executing query + * in whole distributed system. + */ + char portal[NAMEDATALEN]; + char cursors[NAMEDATALEN * 64]; +} PgClusterStatus; + +static PgClusterStatus *ClusterStatusArray = NULL; +static PgClusterStatus *MyCSEntry = NULL; + +static shmem_startup_hook_type prev_shmem_startup_hook = NULL; +static PortalStart_hook_type prev_PortalStart = NULL; +static PortalDrop_hook_type prev_PortalDrop = NULL; +static ExecutorStart_hook_type prev_ExecutorStart = NULL; + +/* + * Macros to load and store st_changecount with the memory barriers. + * + * increment_changecount_before() and + * increment_changecount_after() need to be called before and after + * entries are modified, respectively. This makes sure that st_changecount + * is incremented around the modification. + * + * Also save_changecount_before() and save_changecount_after() + * need to be called before and after entries are copied into private memory + * respectively. 
+ */ +#define increment_changecount_before(status) \ + do { \ + status->changecount++; \ + pg_write_barrier(); \ + } while (0) + +#define increment_changecount_after(status) \ + do { \ + pg_write_barrier(); \ + status->changecount++; \ + Assert((status->changecount & 1) == 0); \ + } while (0) + +#define save_changecount_before(status, save_changecount) \ + do { \ + save_changecount = status->changecount; \ + pg_read_barrier(); \ + } while (0) + +#define save_changecount_after(status, save_changecount) \ + do { \ + pg_read_barrier(); \ + save_changecount = status->changecount; \ + } while (0) + +Datum pg_stat_get_cluster_activity(PG_FUNCTION_ARGS); +Datum pg_signal_session(PG_FUNCTION_ARGS); +Datum pg_terminate_session(PG_FUNCTION_ARGS); +Datum pg_cancel_session(PG_FUNCTION_ARGS); + +void _PG_init(void); +void _PG_fini(void); + +PG_FUNCTION_INFO_V1(pg_stat_get_cluster_activity); +PG_FUNCTION_INFO_V1(pg_signal_session); +PG_FUNCTION_INFO_V1(pg_terminate_session); +PG_FUNCTION_INFO_V1(pg_cancel_session); + +/* + * walk through planstate tree and gets cursors it contains in + * RemoteSubplan node, formed as a single string delimited each + * cursor by a space (one cursor stands for a RemoteSubplan node). + */ +static bool +cursorCollectWalker(PlanState *planstate, StringInfo str) +{ + if (IsA(planstate, RemoteSubplanState)) + { + RemoteSubplan *plan = (RemoteSubplan *) planstate->plan; + if (plan->cursor != NULL) + { + appendStringInfoString(str, plan->cursor); + if (plan->unique) + appendStringInfo(str, "_%d", plan->unique); + /* add a space as delimiter */ + appendStringInfoString(str, " "); + } + } + + return planstate_tree_walker(planstate, cursorCollectWalker, str); +} + +/* + * Initialize the shared status array and several string buffers + * during postmaster startup. + */ +static void +CreateSharedClusterStatus(void) +{ + Size size; + bool found; + + /* Create or attach to the shared array */ + size = mul_size(sizeof(PgClusterStatus), NumBackendStatSlots); + ClusterStatusArray = (PgClusterStatus *) + ShmemInitStruct("Cluster Status Array", size, &found); + + if (!found) + { + /* + * We're the first - initialize. + */ + MemSet(ClusterStatusArray, 0, size); + } +} + +/* + * Shut down a single backend's statistics reporting at process exit. + * + * Flush any remaining statistics counts out to the collector. + * Without this, operations triggered during backend exit (such as + * temp table deletions) won't be counted. + * + * Lastly, clear out our entry in the PgBackendStatus array. + */ +static void +pgcs_shutdown_hook(int code, Datum arg) +{ + volatile PgClusterStatus *entry = MyCSEntry; + + /* + * Clear my status entry, following the protocol of bumping st_changecount + * before and after. We use a volatile pointer here to ensure the + * compiler doesn't try to get cute. + */ + increment_changecount_before(entry); + + entry->valid = false; /* mark invalid to hide this entry */ + + increment_changecount_after(entry); +} + +/* ---------- + * pgcs_entry_initialize() - + * + * Initialize my cluster status entry, and set up our on-proc-exit hook. + * as an extension but we don't have hook during process startup, so called + * each time the backend try to report something. 
+ * ---------- + */ +static void +pgcs_entry_initialize(void) +{ + /* already initialized */ + if (MyCSEntry != NULL) + return; + + if (ClusterStatusArray == NULL) + { + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("shared memory for pg_stat_cluster_activity is not prepared"), + errhint("maybe you need to set shared_preload_libraries in postgresql.conf file"))); + return; + } + + /* Initialize MyCSEntry */ + if (MyBackendId != InvalidBackendId) + { + Assert(MyBackendId >= 1 && MyBackendId <= MaxBackends); + MyCSEntry = &ClusterStatusArray[MyBackendId - 1]; + } + else + { + /* Must be an auxiliary process */ + Assert(MyAuxProcType != NotAnAuxProcess); + + /* + * Assign the MyCSEntry for an auxiliary process. Since it doesn't + * have a BackendId, the slot is statically allocated based on the + * auxiliary process type (MyAuxProcType). Backends use slots indexed + * in the range from 1 to MaxBackends (inclusive), so we use + * MaxBackends + AuxBackendType + 1 as the index of the slot for an + * auxiliary process. + */ + MyCSEntry = &ClusterStatusArray[MaxBackends + MyAuxProcType]; + } + + /* also set nodename here, it won't change anyway */ + memcpy(MyCSEntry->nodename, PGXCNodeName, strlen(PGXCNodeName) + 1); + + /* Set up a process-exit hook to clean up */ + on_shmem_exit(pgcs_shutdown_hook, 0); +} + +/* ---------- + * pgcs_report_common + * + * Report common fileds of cluster backend status activity, + * called by pgcs_report_query_activity and pgcs_report_activity. + * report role, sqname, also if this backend become consumer, remove + * previous planstate and cursor. + * ---------- + */ +static void +pgcs_report_common(PgClusterStatus *entry, QueryDesc *desc) +{ + strncpy((char *) entry->sessionid, PGXCSessionId, NAMEDATALEN); + + entry->sqdone = false; + entry->valid = true; + + /* fields need queryDesc */ + if (IS_PGXC_DATANODE) + { + if (desc != NULL && desc->squeue) + { + strncpy((char *) entry->sqname, SqueueName(desc->squeue), NAMEDATALEN); + if (IsSqueueProducer()) + { + strncpy((char *) entry->role, "producer", NAMEDATALEN); + } + else if (IsSqueueConsumer()) + { + strncpy((char *) entry->role, "consumer", NAMEDATALEN); + /* consumer does not know of planstate */ + entry->planstate[0] = '\0'; + entry->cursors[0] = '\0'; + } + else + { + /* do not support */ + entry->role[0] = '\0'; + } + } + else if (IsParallelWorker()) + { + strncpy((char *) entry->role, "parallel worker", NAMEDATALEN); + } + else + { + strncpy((char *) entry->role, "datanode", NAMEDATALEN); + } + } + else if (IS_PGXC_COORDINATOR) + { + strncpy((char *) entry->role, "coordinator", NAMEDATALEN); + } + else + { + /* do not support */ + entry->role[0] = '\0'; + } +} + +/* ---------- + * pgcs_report_query_activity + * + * Report fileds of per-query referred, hooked as ExecutorStart_hook + * report planstate, cursors and common fields. 
+ * ---------- + */ +static void +pgcs_report_query_activity(QueryDesc *desc, int eflags) +{ + volatile PgClusterStatus *entry; + StringInfo planstate_str = NULL; + StringInfo cursors = NULL; + + if (prev_ExecutorStart) + prev_ExecutorStart(desc, eflags); + else + standard_ExecutorStart(desc, eflags); + + pgcs_entry_initialize(); + entry = MyCSEntry; + + if (!desc) + return; + + /* if query already done, just report sqdone and return */ + if (desc->already_executed) + { + increment_changecount_before(entry); + entry->sqdone = true; + increment_changecount_after(entry); + return; + } + + if (desc->planstate != NULL) + { + ExplainState *es = NewExplainState(); + + /* make planstate text tree */ + es->costs = false; + /* we don't want plan->targetlist been changed */ + es->skip_remote_query = true; + + ExplainBeginOutput(es); + ExplainPrintPlan(es, desc); + ExplainEndOutput(es); + /* remove last '\n' */ + if (es->str->len > 1) + es->str->data[--es->str->len] = '\0'; + planstate_str = es->str; + + /* find name of RemoteSubplan to show as cursors */ + cursors = makeStringInfo(); + cursorCollectWalker(desc->planstate, cursors); + } + + increment_changecount_before(entry); + + if (planstate_str != NULL && planstate_str->len > 0) + memcpy((char *) entry->planstate, planstate_str->data, Min(planstate_str->len + 1, 4096)); + if (cursors != NULL && cursors->len > 0) + memcpy((char *) entry->cursors, cursors->data, Min(cursors->len + 1, NAMEDATALEN * 64)); + + pgcs_report_common((PgClusterStatus *) entry, desc); + + increment_changecount_after(entry); +} + +/* ---------- + * pgcs_report_activity + * + * Report fileds of per-portal referred, hooked as PortalStart_hook + * report portal name and common fields. + * ---------- + */ +static void +pgcs_report_activity(Portal portal) +{ + volatile PgClusterStatus *entry; + QueryDesc *desc = portal->queryDesc; + + pgcs_entry_initialize(); + entry = MyCSEntry; + + /* if query already done, just report sqdone and return */ + if (desc != NULL && desc->already_executed) + { + increment_changecount_before(entry); + entry->sqdone = true; + increment_changecount_after(entry); + return; + } + + increment_changecount_before(entry); + + strncpy((char *) entry->portal, portal->name, NAMEDATALEN); + pgcs_report_common((PgClusterStatus *) entry, desc); + + increment_changecount_after(entry); +} + +/* ---------- + * pgstat_fetch_stat_local_csentry + * + * Given a backend id, find particular cluster status entry, copy valid + * entry into local memory, loop around changecount to ensure concurrency. + * ---------- + */ +static PgClusterStatus * +pgstat_fetch_stat_local_csentry(int beid) +{ + PgClusterStatus *csentry; + PgClusterStatus *local = palloc(sizeof(PgClusterStatus)); + local->valid = false; + + if (ClusterStatusArray == NULL) + { + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("shared memory for pg_stat_cluster_activity is not prepared"), + errhint("maybe you need to set shared_preload_libraries in postgresql.conf"))); + return NULL; + } + + if (beid < 1) + return NULL; + + csentry = &ClusterStatusArray[beid - 1]; + + for (;;) + { + int before_changecount; + int after_changecount; + + save_changecount_before(csentry, before_changecount); + if (csentry->valid) + { + memcpy(local, csentry, sizeof(PgClusterStatus)); + } + save_changecount_after(csentry, after_changecount); + if (before_changecount == after_changecount && + (before_changecount & 1) == 0) + break; + + /* Make sure we can break out of loop if stuck... 
*/ + CHECK_FOR_INTERRUPTS(); + } + + return local; +} + +/* ---------- + * pg_stat_get_remote_activity + * + * Execute pg_stat_get_cluster_activity query remotely and save + * results in tuplestore. + * ---------- + */ +static void +pg_stat_get_remote_activity(const char *sessionid, bool coordonly, Tuplestorestate *tupstore) +{ +#define QUERY_LEN 1024 + char query[QUERY_LEN]; + int i; + EState *estate; + MemoryContext oldcontext; + RemoteQuery *plan; + RemoteQueryState *pstate; + Var *dummy; + TupleTableSlot *result = NULL; + + /* + * Here we call pg_stat_get_cluster_activity in remote with args: + * coordonly = false, localonly = true, to prevent recursive calls in remote nodes. + */ + if (sessionid == NULL) + snprintf(query, QUERY_LEN, "select * from pg_stat_get_cluster_activity(NULL, false, true)"); + else + snprintf(query, QUERY_LEN, "select * from pg_stat_get_cluster_activity('%s', false, true)", sessionid); + + plan = makeNode(RemoteQuery); + plan->combine_type = COMBINE_TYPE_NONE; + /* + * set exec_nodes to NULL makes ExecRemoteQuery send query to all nodes + * (local CN nodes won't recieved query again). + */ + plan->exec_nodes = NULL; + plan->exec_type = EXEC_ON_ALL_NODES; + plan->sql_statement = (char *) query; + plan->force_autocommit = false; + + if (coordonly) + { + plan->exec_nodes = makeNode(ExecNodes); + plan->exec_nodes->nodeList = GetAllCoordNodes(); + plan->exec_type = EXEC_ON_COORDS; + } + + /* + * We only need the target entry to determine result data type. + * So create dummy even if real expression is a function. + */ + for (i = 1; i <= PG_STAT_GET_ClUSTER_ACTIVITY_COLS; i++) + { + dummy = makeVar(1, i, TEXTOID, 0, InvalidOid, 0); + plan->scan.plan.targetlist = lappend(plan->scan.plan.targetlist, + makeTargetEntry((Expr *) dummy, i, NULL, false)); + } + + /* prepare to execute */ + estate = CreateExecutorState(); + oldcontext = MemoryContextSwitchTo(estate->es_query_cxt); + estate->es_snapshot = GetActiveSnapshot(); + pstate = ExecInitRemoteQuery(plan, estate, 0); + MemoryContextSwitchTo(oldcontext); + + result = ExecRemoteQuery((PlanState *) pstate); + + while (result != NULL && !TupIsNull(result)) + { + slot_getallattrs(result); + + tuplestore_puttupleslot(tupstore, result); + result = ExecRemoteQuery((PlanState *) pstate); + } + + ExecEndRemoteQuery(pstate); + return; +} + +/* ---------- + * pg_stat_get_cluster_activity + * + * Internal SRF function of this extension, access sharememory to find + * every live backend which executed or executing query. copy to local + * and show status. also we collect some fields from PGBackendStatus + * + * arguments: sessionid -- global unique id for a session, generated by CN + * coordonly -- only dispatch to other cn if true. + * localonly -- collect local entries status if true. + * + * Note: since we also collect PGBackendStatus, get them first and use + * backend id to access particular cluster status entry to narrow down + * loop search range from all backend slots to localNumBackends (see pgstat.c) + * ---------- + */ +Datum +pg_stat_get_cluster_activity(PG_FUNCTION_ARGS) +{ + int num_backends = pgstat_fetch_stat_numbackends(); + int curr_backend; + bool with_sessionid = !PG_ARGISNULL(0); + bool coordonly = PG_ARGISNULL(1) ? false : PG_GETARG_BOOL(1); + bool localonly = PG_ARGISNULL(2) ? false : PG_GETARG_BOOL(2); + const char *sessionid = with_sessionid ? 
text_to_cstring(PG_GETARG_TEXT_P(0)) : NULL; + ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; + TupleDesc tupdesc; + Tuplestorestate *tupstore; + MemoryContext per_query_ctx; + MemoryContext oldcontext; + + /* check to see if caller supports us returning a tuplestore */ + if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("set-valued function called in context that cannot accept a set"))); + if (!(rsinfo->allowedModes & SFRM_Materialize)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("materialize mode required, but it is not " \ + "allowed in this context"))); + + /* Build a tuple descriptor for our result type */ + if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) + elog(ERROR, "return type must be a row type"); + + /* switch to query's memory context to save results during execution */ + per_query_ctx = rsinfo->econtext->ecxt_per_query_memory; + oldcontext = MemoryContextSwitchTo(per_query_ctx); + + tupstore = tuplestore_begin_heap(true, false, work_mem); + rsinfo->returnMode = SFRM_Materialize; + rsinfo->setResult = tupstore; + rsinfo->setDesc = tupdesc; + + MemoryContextSwitchTo(oldcontext); + + /* dispatch query to remote if needed */ + if (!localonly && IS_PGXC_COORDINATOR) + pg_stat_get_remote_activity(sessionid, coordonly, tupstore); + + /* 1-based index */ + for (curr_backend = 1; curr_backend <= num_backends; curr_backend++) + { + /* for each row */ + Datum values[PG_STAT_GET_ClUSTER_ACTIVITY_COLS]; + bool nulls[PG_STAT_GET_ClUSTER_ACTIVITY_COLS]; + + /* same as pg_stat_get_activity */ + LocalPgBackendStatus *local_beentry; + PgBackendStatus *beentry; + PGPROC *proc; + const char *wait_event_type = NULL; + const char *wait_event = NULL; + + /* cluster information */ + PgClusterStatus *local_csentry; + + MemSet(values, 0, sizeof(values)); + MemSet(nulls, 0, sizeof(nulls)); + + /* Get the next one in the list */ + local_beentry = pgstat_fetch_stat_local_beentry(curr_backend); + local_csentry = pgstat_fetch_stat_local_csentry(local_beentry->backend_id); + if (!local_beentry || !local_csentry) + { + int i; + + /* Ignore missing entries if looking for specific sessionid */ + if (with_sessionid) + continue; + + for (i = 0; i < lengthof(nulls); i++) + nulls[i] = true; + + nulls[13] = false; + values[13] = CStringGetTextDatum(""); + + tuplestore_putvalues(tupstore, tupdesc, values, nulls); + continue; + } + + if (!local_csentry->valid) + continue; + + beentry = &local_beentry->backendStatus; + /* If looking for specific sessionid, ignore all the others */ + if (with_sessionid && strcmp(sessionid, local_csentry->sessionid) != 0) + continue; + + /* Values available to all callers */ + values[0] = CStringGetTextDatum(local_csentry->sessionid); + values[1] = Int32GetDatum(beentry->st_procpid); + + if (beentry->st_databaseid != InvalidOid) + values[7] = ObjectIdGetDatum(beentry->st_databaseid); + else + nulls[7] = true; + + if (beentry->st_userid != InvalidOid) + values[8] = ObjectIdGetDatum(beentry->st_userid); + else + nulls[8] = true; + + /* Values only available to owner or superuser or pg_read_all_stats */ + if (has_privs_of_role(GetUserId(), beentry->st_userid) || + is_member_of_role(GetUserId(), DEFAULT_ROLE_READ_ALL_STATS)) + { + SockAddr zero_clientaddr; + + /* A zeroed client addr means we don't know */ + memset(&zero_clientaddr, 0, sizeof(zero_clientaddr)); + if (memcmp(&(beentry->st_clientaddr), &zero_clientaddr, + sizeof(zero_clientaddr)) == 0) + { + 
nulls[2] = true; + nulls[3] = true; + nulls[4] = true; + } + else + { + if (beentry->st_clientaddr.addr.ss_family == AF_INET +#ifdef HAVE_IPV6 + || beentry->st_clientaddr.addr.ss_family == AF_INET6 +#endif + ) + { + char remote_host[NI_MAXHOST]; + char remote_port[NI_MAXSERV]; + int ret; + + remote_host[0] = '\0'; + remote_port[0] = '\0'; + ret = pg_getnameinfo_all(&beentry->st_clientaddr.addr, + beentry->st_clientaddr.salen, + remote_host, sizeof(remote_host), + remote_port, sizeof(remote_port), + NI_NUMERICHOST | NI_NUMERICSERV); + if (ret == 0) + { + clean_ipv6_addr(beentry->st_clientaddr.addr.ss_family, remote_host); + values[2] = DirectFunctionCall1(inet_in, + CStringGetDatum(remote_host)); + if (beentry->st_clienthostname && + beentry->st_clienthostname[0]) + values[3] = CStringGetTextDatum(beentry->st_clienthostname); + else + nulls[3] = true; + values[4] = Int32GetDatum(atoi(remote_port)); + } + else + { + nulls[2] = true; + nulls[3] = true; + nulls[4] = true; + } + } + else if (beentry->st_clientaddr.addr.ss_family == AF_UNIX) + { + /* + * Unix sockets always reports NULL for host and -1 for + * port, so it's possible to tell the difference to + * connections we have no permissions to view, or with + * errors. + */ + nulls[2] = true; + nulls[3] = true; + values[4] = DatumGetInt32(-1); + } + else + { + /* Unknown address type, should never happen */ + nulls[2] = true; + nulls[3] = true; + nulls[4] = true; + } + } + + values[5] = CStringGetTextDatum(local_csentry->nodename); + values[6] = CStringGetTextDatum(local_csentry->role); + + proc = BackendPidGetProc(beentry->st_procpid); + if (proc != NULL) + { + uint32 raw_wait_event; + + raw_wait_event = UINT32_ACCESS_ONCE(proc->wait_event_info); + wait_event_type = pgstat_get_wait_event_type(raw_wait_event); + wait_event = pgstat_get_wait_event(raw_wait_event); + } + else if (beentry->st_backendType != B_BACKEND) + { + /* + * For an auxiliary process, retrieve process info from + * AuxiliaryProcs stored in shared-memory. 
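+ * AuxiliaryPidGetProc() returns NULL when the PID does not belong
+ * to an auxiliary process; in that case no wait event information
+ * is reported for this entry.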
+ */ + proc = AuxiliaryPidGetProc(beentry->st_procpid); + + if (proc != NULL) + { + uint32 raw_wait_event; + + raw_wait_event = + UINT32_ACCESS_ONCE(proc->wait_event_info); + wait_event_type = + pgstat_get_wait_event_type(raw_wait_event); + wait_event = pgstat_get_wait_event(raw_wait_event); + } + } + + if (wait_event_type) + values[9] = CStringGetTextDatum(wait_event_type); + else + nulls[9] = true; + + if (wait_event) + values[10] = CStringGetTextDatum(wait_event); + else + nulls[10] = true; + + switch (beentry->st_state) + { + case STATE_IDLE: + values[11] = CStringGetTextDatum("idle"); + break; + case STATE_RUNNING: + values[11] = CStringGetTextDatum("active"); + break; + case STATE_IDLEINTRANSACTION: + values[11] = CStringGetTextDatum("idle in transaction"); + break; + case STATE_FASTPATH: + values[11] = CStringGetTextDatum("fastpath function call"); + break; + case STATE_IDLEINTRANSACTION_ABORTED: + values[11] = CStringGetTextDatum("idle in transaction (aborted)"); + break; + case STATE_DISABLED: + values[11] = CStringGetTextDatum("disabled"); + break; + case STATE_UNDEFINED: + nulls[11] = true; + break; + } + + values[12] = CStringGetTextDatum(local_csentry->sqname); + values[13] = BoolGetDatum(local_csentry->sqdone); + values[14] = CStringGetTextDatum(beentry->st_activity); + values[15] = CStringGetTextDatum(local_csentry->planstate); + values[16] = CStringGetTextDatum(local_csentry->portal); + values[17] = CStringGetTextDatum(local_csentry->cursors); + + if (beentry->st_proc_start_timestamp != 0) + values[18] = TimestampTzGetDatum(beentry->st_proc_start_timestamp); + else + nulls[18] = true; + + if (beentry->st_xact_start_timestamp != 0) + values[19] = TimestampTzGetDatum(beentry->st_xact_start_timestamp); + else + nulls[19] = true; + + if (beentry->st_activity_start_timestamp != 0) + values[20] = TimestampTzGetDatum(beentry->st_activity_start_timestamp); + else + nulls[20] = true; + + if (beentry->st_state_start_timestamp != 0) + values[21] = TimestampTzGetDatum(beentry->st_state_start_timestamp); + else + nulls[21] = true; + } + else + { + values[14] = CStringGetTextDatum(""); + nulls[2] = true; + nulls[3] = true; + nulls[4] = true; + nulls[5] = true; + nulls[6] = true; + nulls[9] = true; + nulls[10] = true; + nulls[11] = true; + nulls[12] = true; + nulls[13] = true; + nulls[15] = true; + nulls[16] = true; + nulls[17] = true; + nulls[18] = true; + nulls[19] = true; + nulls[20] = true; + nulls[21] = true; + } + + tuplestore_putvalues(tupstore, tupdesc, values, nulls); + } + + /* clean up and return the tuplestore */ + tuplestore_donestoring(tupstore); + + return (Datum) 0; +} + +static bool +pgcs_signal_session_remote(const char *sessionid, int signal) +{ +#define QUERY_LEN 1024 + char query[QUERY_LEN]; + EState *estate; + MemoryContext oldcontext; + RemoteQuery *plan; + RemoteQueryState *pstate; + Var *dummy; + TupleTableSlot *result = NULL; + + snprintf(query, QUERY_LEN, "select pg_signal_session('%s', %d, true)", sessionid, signal); + + plan = makeNode(RemoteQuery); + plan->combine_type = COMBINE_TYPE_NONE; + /* + * set exec_nodes to NULL makes ExecRemoteQuery send query to all nodes + * (local CN nodes won't recieved query again). + */ + plan->exec_nodes = NULL; + plan->exec_type = EXEC_ON_ALL_NODES; + plan->sql_statement = (char *) query; + plan->force_autocommit = false; + + /* + * We only need the target entry to determine result data type. + * So create dummy even if real expression is a function. 
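+ * The target list is used here only to derive the result tuple
+ * descriptor, so the dummy TEXTOID Var need not match the real
+ * function expression being executed remotely.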
+ */ + dummy = makeVar(1, 1, TEXTOID, 0, InvalidOid, 0); + plan->scan.plan.targetlist = lappend(plan->scan.plan.targetlist, + makeTargetEntry((Expr *) dummy, 1, NULL, false)); + + /* prepare to execute */ + estate = CreateExecutorState(); + oldcontext = MemoryContextSwitchTo(estate->es_query_cxt); + estate->es_snapshot = GetActiveSnapshot(); + pstate = ExecInitRemoteQuery(plan, estate, 0); + MemoryContextSwitchTo(oldcontext); + + result = ExecRemoteQuery((PlanState *) pstate); + ExecEndRemoteQuery(pstate); + if (TupIsNull(result)) + { + elog(ERROR, "result of pg_signal_session executed remotely is NULL"); + return false; + } + + return true; +} + +static bool +pgcs_signal_session(const char *sessionid, int signal) +{ + int num_backends = pgstat_fetch_stat_numbackends(); + int curr_backend; + const char *funcname; + LocalPgBackendStatus *local_beentry; + PgClusterStatus *local_csentry; + PgBackendStatus *beentry; + + if (signal == SIGTERM) + funcname = "pg_terminate_backend"; + else if (signal == SIGINT) + funcname = "pg_cancel_backend"; + else + elog(ERROR, "pgcs_signal_session only support SIGTERM and SIGINT, not %d", signal); + + /* 1-based index */ + for (curr_backend = 1; curr_backend <= num_backends; curr_backend++) + { + /* Get the next one in the list */ + local_beentry = pgstat_fetch_stat_local_beentry(curr_backend); + local_csentry = pgstat_fetch_stat_local_csentry(local_beentry->backend_id); + + if (local_csentry->valid && strcmp(local_csentry->sessionid, sessionid) == 0) + { + beentry = &local_beentry->backendStatus; + OidFunctionCall1(fmgr_internal_function(funcname), + Int32GetDatum(beentry->st_procpid)); + } + } + + return true; +} + +Datum +pg_signal_session(PG_FUNCTION_ARGS) +{ + const char *sessionid = text_to_cstring(PG_GETARG_TEXT_P(0)); + int signal = PG_GETARG_INT32(1); + bool localonly = PG_ARGISNULL(2) ? false : PG_GETARG_BOOL(2); + bool result; + + result = pgcs_signal_session(sessionid, signal); + if (result && !localonly) + result = pgcs_signal_session_remote(sessionid, signal); + + return BoolGetDatum(result); +} + +Datum +pg_terminate_session(PG_FUNCTION_ARGS) +{ + return DirectFunctionCall3(pg_signal_session, + PG_GETARG_DATUM(0), + Int32GetDatum(SIGTERM), + BoolGetDatum(false)); +} + +Datum +pg_cancel_session(PG_FUNCTION_ARGS) +{ + return DirectFunctionCall3(pg_signal_session, + PG_GETARG_DATUM(0), + Int32GetDatum(SIGINT), + BoolGetDatum(false)); +} + +/* + * Hooked as shmem_startup_hook + */ +static void +pgcs_shmem_startup(void) +{ + CreateSharedClusterStatus(); +} + +/* + * Estimate shared memory space needed. + */ +static Size +pgcs_memsize(void) +{ + return mul_size(sizeof(PgClusterStatus), NumBackendStatSlots); +} + +/* + * Module load callback + */ +void +_PG_init(void) +{ + if (!process_shared_preload_libraries_in_progress) + return; + + /* + * Request additional shared resources. (These are no-ops if we're not in + * the postmaster process.) We'll allocate or attach to the shared + * resources in pgcs_shmem_startup(). + */ + RequestAddinShmemSpace(pgcs_memsize()); + + /* + * Install hooks. + */ + prev_shmem_startup_hook = shmem_startup_hook; + shmem_startup_hook = pgcs_shmem_startup; + prev_PortalStart = PortalStart_hook; + PortalStart_hook = pgcs_report_activity; + prev_PortalDrop = PortalDrop_hook; + PortalDrop_hook = pgcs_report_activity; + prev_ExecutorStart = ExecutorStart_hook; + ExecutorStart_hook = pgcs_report_query_activity; +} + +/* + * Module unload callback + */ +void +_PG_fini(void) +{ + /* Uninstall hooks. 
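+ * Restoring the previously saved hook values is only safe when no
+ * other module has installed its own hooks after this one, which is
+ * the usual assumption made by contrib-style extensions.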
*/ + shmem_startup_hook = prev_shmem_startup_hook; + PortalStart_hook = prev_PortalStart; + PortalDrop_hook = prev_PortalDrop; + ExecutorStart_hook = prev_ExecutorStart; +} diff --git a/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.conf b/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.conf new file mode 100644 index 00000000..91c61803 --- /dev/null +++ b/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.conf @@ -0,0 +1 @@ +shared_preload_libraries = 'pg_stat_cluster_activity' diff --git a/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.control b/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.control new file mode 100644 index 00000000..dacd5262 --- /dev/null +++ b/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.control @@ -0,0 +1,5 @@ +# pg_stat_cluster_activity extension +comment = 'track execution statistics in whole cluster scope' +default_version = '1.0' +module_pathname = '$libdir/pg_stat_cluster_activity' +relocatable = true diff --git a/src/backend/access/transam/parallel.c b/src/backend/access/transam/parallel.c index 84256fe1..b67873e8 100644 --- a/src/backend/access/transam/parallel.c +++ b/src/backend/access/transam/parallel.c @@ -69,6 +69,7 @@ #define PARALLEL_KEY_GLOBALXID UINT64CONST(0xFFFFFFFFFFFF0010) #endif #define PARALLEL_KEY_ENTRYPOINT UINT64CONST(0xFFFFFFFFFFFF0009) +#define PARALLEL_KEY_SESSIONID UINT64CONST(0xFFFFFFFFFFFF0011) @@ -205,6 +206,7 @@ InitializeParallelDSM(ParallelContext *pcxt) #ifdef __SUPPORT_DISTRIBUTED_TRANSACTION__ Size gxidlen = 0; #endif + Size sidlen = 0; Size segsize = 0; int i; FixedParallelState *fps; @@ -241,8 +243,10 @@ InitializeParallelDSM(ParallelContext *pcxt) gxidlen = EstimateGlobalXidSpace(); shm_toc_estimate_chunk(&pcxt->estimator, gxidlen); #endif + sidlen = PGXCSessionId[0] == '\0' ? 0 : strlen(PGXCSessionId) + 1; + shm_toc_estimate_chunk(&pcxt->estimator, sidlen); /* If you add more chunks here, you probably need to add keys. */ - shm_toc_estimate_keys(&pcxt->estimator, 7); + shm_toc_estimate_keys(&pcxt->estimator, 8); /* Estimate space need for error queues. */ StaticAssertStmt(BUFFERALIGN(PARALLEL_ERROR_QUEUE_SIZE) == @@ -312,6 +316,7 @@ InitializeParallelDSM(ParallelContext *pcxt) #ifdef __SUPPORT_DISTRIBUTED_TRANSACTION__ char *gxidspace; #endif + char *sidspace; char *error_queue_space; char *entrypointstate; Size lnamelen; @@ -351,6 +356,10 @@ InitializeParallelDSM(ParallelContext *pcxt) SerializeGlobalXid(gxidlen, gxidspace); shm_toc_insert(pcxt->toc, PARALLEL_KEY_GLOBALXID, gxidspace); #endif + /* global session id */ + sidspace = shm_toc_allocate(pcxt->toc, sidlen); + SerializeSessionId(sidlen, sidspace); + shm_toc_insert(pcxt->toc, PARALLEL_KEY_SESSIONID, sidspace); /* Allocate space for worker information. */ pcxt->worker = palloc0(sizeof(ParallelWorkerInfo) * pcxt->nworkers); @@ -982,6 +991,7 @@ ParallelWorkerMain(Datum main_arg) #ifdef __SUPPORT_DISTRIBUTED_TRANSACTION__ char *gxidspace; #endif + char *sidspace; StringInfoData msgbuf; /* Set flag to indicate that we're initializing a parallel worker. */ @@ -1115,6 +1125,10 @@ ParallelWorkerMain(Datum main_arg) StartParallelWorkerGlobalXid(gxidspace); #endif + /* Restore session id */ + sidspace = shm_toc_lookup(toc, PARALLEL_KEY_SESSIONID, false); + StartParallelWorkerSessionId(sidspace); + /* Restore combo CID state. 
*/ combocidspace = shm_toc_lookup(toc, PARALLEL_KEY_COMBO_CID, false); RestoreComboCIDState(combocidspace); diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index 2722d951..09617d73 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -3870,6 +3870,7 @@ ExplainRemoteQuery(RemoteQuery *plan, PlanState *planstate, List *ancestors, Exp {// #lizard forgives ExecNodes *en = plan->exec_nodes; /* add names of the nodes if they exist */ + if (en && es->nodes) { StringInfo node_names = makeStringInfo(); @@ -3914,6 +3915,14 @@ ExplainRemoteQuery(RemoteQuery *plan, PlanState *planstate, List *ancestors, Exp } } + /* + * if required, skip executing remote query, this + * is happened when a backend report planstate it + * processing, shouldn't execute it again. + */ + if (es->skip_remote_query) + return; + if (en && en->en_expr) show_expression((Node *)en->en_expr, "Node expr", planstate, ancestors, es->verbose, es); diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 491e33d3..86cafb0f 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -3444,6 +3444,13 @@ pgxc_node_begin(int conn_count, PGXCNodeHandle **connections, if (connections[i]->state == DN_CONNECTION_STATE_QUERY) BufferConnection(connections[i]); + /* Send global session id */ + if (pgxc_node_send_sessionid(connections[i])) + { + elog(WARNING, "pgxc_node_begin sending session id failed"); + return EOF; + } + /* Send GXID and check for errors */ if (pgxc_node_send_gxid(connections[i], gxid)) { @@ -6229,6 +6236,8 @@ get_exec_connections_all_dn(bool is_global_session) /* * Get Node connections depending on the connection type: * Datanodes Only, Coordinators only or both types + * If exec_nodes is NIL and exec_type is EXEC_ON_ALL_NODES + * connect to all nodes except myself */ static PGXCNodeAllHandles * get_exec_connections(RemoteQueryState *planstate, @@ -6303,9 +6312,9 @@ get_exec_connections(RemoteQueryState *planstate, { estate = ExecInitExpr(exec_nodes->en_expr, (PlanState *) planstate); - /* For explain, no need to execute expr. */ - if (planstate->eflags != EXEC_FLAG_EXPLAIN_ONLY) - partvalue = ExecEvalExpr(estate, + /* For explain, no need to execute expr. 
*/ + if (planstate->eflags != EXEC_FLAG_EXPLAIN_ONLY) + partvalue = ExecEvalExpr(estate, planstate->combiner.ss.ps.ps_ExprContext, &isnull); } @@ -6447,6 +6456,12 @@ get_exec_connections(RemoteQueryState *planstate, co_conn_count = 0; } + if ((list_length(nodelist) == 0 && exec_type == EXEC_ON_ALL_NODES)) + { + nodelist = GetAllDataNodes(); + dn_conn_count = NumDataNodes; + } + #ifdef __TBASE__ if (IsParallelWorker()) { @@ -8933,6 +8948,19 @@ ExecRemoteQuery(PlanState *pstate) need_global_snapshot = true; #endif } + else if (step->exec_type == EXEC_ON_ALL_NODES) + { + total_conn_count = regular_conn_count = + pgxc_connections->dn_conn_count + pgxc_connections->co_conn_count; + + connections = palloc(mul_size(total_conn_count, sizeof(PGXCNodeHandle *))); + memcpy(connections, pgxc_connections->datanode_handles, + pgxc_connections->dn_conn_count * sizeof(PGXCNodeHandle *)); + memcpy(connections + pgxc_connections->dn_conn_count, pgxc_connections->coord_handles, + pgxc_connections->co_conn_count * sizeof(PGXCNodeHandle *)); + + need_global_snapshot = g_set_global_snapshot; + } #ifdef __TBASE__ /* set snapshot as needed */ diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c index 81027676..233fd0e2 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -378,6 +378,8 @@ InitMultinodeExecutor(bool is_force) MemoryContextSwitchTo(oldcontext); + PGXCSessionId[0] = '\0'; + if (IS_PGXC_COORDINATOR) { for (count = 0; count < NumCoords; count++) @@ -386,6 +388,8 @@ InitMultinodeExecutor(bool is_force) get_pgxc_nodename(co_handles[count].nodeoid)) == 0) PGXCNodeId = count + 1; } + + sprintf(PGXCSessionId, "%s_%d_%ld", PGXCNodeName, MyProcPid, GetCurrentTimestamp()); } else /* DataNode */ { @@ -410,7 +414,8 @@ InitMultinodeExecutor(bool is_force) } -Oid get_nodeoid_from_nodeid(int nodeid, char node_type) +Oid +get_nodeoid_from_nodeid(int nodeid, char node_type) { if (PGXC_NODE_COORDINATOR == node_type) { @@ -524,7 +529,8 @@ PGXCNodeConnect(char *connstr) return (NODE_CONNECTION *) conn; } -int PGXCNodePing(const char *connstr) +int +PGXCNodePing(const char *connstr) { if (connstr[0]) { @@ -943,8 +949,9 @@ pgxc_node_receive(const int conn_count, } -void pgxc_print_pending_data(PGXCNodeHandle *handle, bool reset) -{// #lizard forgives +void +pgxc_print_pending_data(PGXCNodeHandle *handle, bool reset) +{ char *msg; int32 ret; //DNConnectionState estate = 0; @@ -1517,8 +1524,9 @@ release_handles(bool force) /* * Check whether there bad connections to remote nodes when abort transactions. */ -bool validate_handles(void) -{// #lizard forgives +bool +validate_handles(void) +{ int i; int ret; @@ -2462,7 +2470,8 @@ pgxc_node_send_sync(PGXCNodeHandle * handle) /* * Send logical apply message down to the Datanode */ -int pgxc_node_send_apply(PGXCNodeHandle * handle, char * buf, int len, bool ignore_pk_conflict) +int +pgxc_node_send_apply(PGXCNodeHandle * handle, char * buf, int len, bool ignore_pk_conflict) { int msgLen = 0; @@ -3340,29 +3349,59 @@ pgxc_node_send_coord_info(PGXCNodeHandle * handle, int coord_pid, TransactionId return 0; } -inline void pgxc_set_coordinator_proc_pid(int proc_pid) +void +pgxc_set_coordinator_proc_pid(int proc_pid) { pgxc_coordinator_proc_pid = (IS_PGXC_COORDINATOR ? MyProcPid : proc_pid); } -inline void pgxc_set_coordinator_proc_vxid(TransactionId proc_vxid) +void +pgxc_set_coordinator_proc_vxid(TransactionId proc_vxid) { TransactionId lxid = (MyProc != NULL ? 
MyProc->lxid : InvalidTransactionId); pgxc_coordinator_proc_vxid = (IS_PGXC_COORDINATOR ? lxid : proc_vxid); } -inline int pgxc_get_coordinator_proc_pid(void) +int +pgxc_get_coordinator_proc_pid(void) { return (IS_PGXC_COORDINATOR ? MyProcPid : pgxc_coordinator_proc_pid); } -inline TransactionId pgxc_get_coordinator_proc_vxid(void) +TransactionId +pgxc_get_coordinator_proc_vxid(void) { TransactionId lxid = (MyProc != NULL ? MyProc->lxid : InvalidTransactionId); return (IS_PGXC_COORDINATOR ? lxid : pgxc_coordinator_proc_vxid); } + +int +pgxc_node_send_sessionid(PGXCNodeHandle * handle) +{ + int msgLen = 0; + + /* size + sessionid_str + '\0' */ + msgLen = 4 + strlen(PGXCSessionId) + 1; + + /* msgType + msgLen */ + if (ensure_out_buffer_capacity(handle->outEnd + 1 + msgLen, handle) != 0) + { + add_error_message(handle, "pgxc_node_send_sessionid out of memory"); + return EOF; + } + + handle->outBuffer[handle->outEnd++] = 'o'; /* session id */ + + msgLen = htonl(msgLen); + memcpy(handle->outBuffer + handle->outEnd, &msgLen, 4); + handle->outEnd += 4; + + memcpy(handle->outBuffer + handle->outEnd, PGXCSessionId, strlen(PGXCSessionId) + 1); + handle->outEnd += strlen(PGXCSessionId) + 1; + return 0; +} #endif /* @@ -3416,8 +3455,9 @@ add_error_message(PGXCNodeHandle *handle, const char *message) } } #ifdef __TBASE__ -void add_error_message_from_combiner(PGXCNodeHandle *handle, void *combiner_input) -{// #lizard forgives +void +add_error_message_from_combiner(PGXCNodeHandle *handle, void *combiner_input) +{ ResponseCombiner *combiner; combiner = (ResponseCombiner*)combiner_input; @@ -4205,8 +4245,8 @@ pfree_pgxc_all_handles(PGXCNodeAllHandles *pgxc_handles) } /* Do translation for non-main cluster */ - -Oid PGXCGetLocalNodeOid(Oid nodeoid) +Oid +PGXCGetLocalNodeOid(Oid nodeoid) { if(false == IsPGXCMainCluster) @@ -4224,7 +4264,8 @@ Oid PGXCGetLocalNodeOid(Oid nodeoid) return nodeoid; } -Oid PGXCGetMainNodeOid(Oid nodeoid) +Oid +PGXCGetMainNodeOid(Oid nodeoid) { if(false == IsPGXCMainCluster) @@ -4420,7 +4461,9 @@ paramlist_delete_param(List *param_list, const char *name) return param_list; } -static ParamEntry * paramlist_get_paramentry(List *param_list, const char *name) + +static ParamEntry * +paramlist_get_paramentry(List *param_list, const char *name) { ListCell *cur_item; @@ -4439,7 +4482,9 @@ static ParamEntry * paramlist_get_paramentry(List *param_list, const char *name) return NULL; } -static ParamEntry * paramentry_copy(ParamEntry * src_entry) + +static ParamEntry * +paramentry_copy(ParamEntry * src_entry) { ParamEntry *dst_entry = NULL; if (src_entry) @@ -5432,7 +5477,8 @@ PGXCNodeSendSetQuery(NODE_CONNECTION *conn, const char *sql_command, char *errms return error ? 
-1 : 0; } -bool node_ready_for_query(PGXCNodeHandle *conn) +bool +node_ready_for_query(PGXCNodeHandle *conn) { return ('Z' == (conn)->last_command); } @@ -5611,7 +5657,8 @@ void PGXCGetCoordOidOthers(Oid **nodelist) } -void PGXCGetAllDnOid(Oid *nodelist) +void +PGXCGetAllDnOid(Oid *nodelist) { Oid node_oid; int i; @@ -5663,4 +5710,40 @@ is_ddl_leader_cn(char *first_cn) } #endif +/* + * SerializeSessionId + * Dumps the serialized session id onto the memory location at + * start_address for parallel workers + */ +void +SerializeSessionId(Size maxsize, char *start_address) +{ + + if(PGXCSessionId[0] == '\0') + { + *(int *) start_address = 0; + } + else + { + int len = strlen(PGXCSessionId) + 1; + + *(int *) start_address = len; + memcpy(start_address + sizeof(int), PGXCSessionId, len); + } +} + +/* + * StartParallelWorkerSessionId + * Reads the serialized session id and set it on parallel workers + */ +void +StartParallelWorkerSessionId(char *address) +{ + char *sidspace = address + sizeof(int); + + if (*(int *) address == 0) /* len */ + PGXCSessionId[0] = '\0'; + else + strncpy((char *) PGXCSessionId, sidspace, NAMEDATALEN); +} #endif diff --git a/src/backend/pgxc/squeue/squeue.c b/src/backend/pgxc/squeue/squeue.c index e006d2c8..a4deed0d 100644 --- a/src/backend/pgxc/squeue/squeue.c +++ b/src/backend/pgxc/squeue/squeue.c @@ -9443,3 +9443,9 @@ int PipeLength(PGPipe *pPipe) } #endif + +const char * +SqueueName(SharedQueue sq) +{ + return sq->sq_key; +} diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c index cfe20c82..0d77754a 100644 --- a/src/backend/postmaster/pgstat.c +++ b/src/backend/postmaster/pgstat.c @@ -3379,6 +3379,7 @@ pgstat_read_current_status(void) BackendIdGetTransactionIds(i, &localentry->backend_xid, &localentry->backend_xmin); + localentry->backend_id = i; localentry++; localappname += NAMEDATALEN; diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index f7ed9637..c3fe228d 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -466,6 +466,9 @@ char *PGXCClusterName = NULL; char *PGXCMainClusterName = NULL; bool IsPGXCMainCluster = false; int PGXCNodeId = 0; +#ifdef __TBASE__ +char PGXCSessionId[NAMEDATALEN]; +#endif /* * When a particular node starts up, store the node identifier in this variable * so that we dont have to calculate it OR do a search in cache any where else diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index db91d32d..7020932d 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -655,6 +655,7 @@ SocketBackend(StringInfo inBuf) #ifdef __TBASE__ case 'N': case 'U': /* coord info: coord_pid and top_xid */ + case 'o': /* global session id */ #endif case 'M': /* Command ID */ case 'g': /* GXID */ @@ -2838,6 +2839,8 @@ exec_execute_message(const char *portal_name, long max_rows) bool execute_is_fetch; bool was_logged = false; char msec_str[32]; + int instrument; + QueryDesc *desc; /* Adjust destination to tell printtup.c what to do */ dest = whereToSendOutput; @@ -3005,6 +3008,9 @@ exec_execute_message(const char *portal_name, long max_rows) portal->cplan->stmt_list = portal->cplan->stmt_list_backup; portal->cplan->stmt_list_backup = NULL; } + + desc = PortalGetQueryDesc(portal); + instrument = portal->up_instrument; #endif #ifdef __AUDIT__ @@ -3033,13 +3039,12 @@ exec_execute_message(const char *portal_name, long max_rows) CommandCounterIncrement(); } - #ifdef __TBASE__ - if (portal->up_instrument && - 
portal->queryDesc && - portal->queryDesc->myindex == -1) + if (instrument && + desc != NULL && + desc->myindex == -1) { - SendLocalInstr(portal->queryDesc->planstate); + SendLocalInstr(desc->planstate); } #endif /* Send appropriate CommandComplete to client */ @@ -5728,6 +5733,13 @@ PostgresMain(int argc, char *argv[], elog(DEBUG5, "Received coord_pid: %d, coord_vxid: %u", coord_pid, coord_vxid); } break; + case 'o': /* session id */ + { + const char *sessionid = pq_getmsgstring(&input_message); + pq_getmsgend(&input_message); + strncpy((char *) PGXCSessionId, sessionid, NAMEDATALEN); + } + break; #endif /* * 'X' means that the frontend is closing down the socket. EOF diff --git a/src/backend/tcop/pquery.c b/src/backend/tcop/pquery.c index b73224fe..2064e594 100644 --- a/src/backend/tcop/pquery.c +++ b/src/backend/tcop/pquery.c @@ -54,6 +54,10 @@ #ifdef __TBASE__ bool paramPassDown = false; #endif + +/* Hooks for plugins to get control in PortalStart */ +PortalStart_hook_type PortalStart_hook = NULL; + /* * ActivePortal is the currently executing Portal (the most closely nested, * if there are several). @@ -1138,6 +1142,9 @@ PortalStart(Portal portal, ParamListInfo params, portal->tupDesc = NULL; break; } + + if (PortalStart_hook) + PortalStart_hook(portal); } PG_CATCH(); { diff --git a/src/backend/utils/mmgr/portalmem.c b/src/backend/utils/mmgr/portalmem.c index 567737b6..6fb83b8d 100644 --- a/src/backend/utils/mmgr/portalmem.c +++ b/src/backend/utils/mmgr/portalmem.c @@ -96,6 +96,9 @@ do { \ elog(WARNING, "trying to delete portal name that does not exist"); \ } while(0) +/* Hooks for plugins to get control in PortalDrop */ +PortalDrop_hook_type PortalDrop_hook = NULL; + static MemoryContext PortalMemory = NULL; @@ -564,6 +567,9 @@ PortalDrop(Portal portal, bool isTopCommit) (errcode(ERRCODE_INVALID_CURSOR_STATE), errmsg("cannot drop active portal \"%s\"", portal->name))); + if (PortalDrop_hook) + PortalDrop_hook(portal); + /* * Allow portalcmds.c to clean up the state it knows about, in particular * shutting down the executor if still active. This step potentially runs diff --git a/src/include/commands/explain.h b/src/include/commands/explain.h index 0f8c2765..19c744a4 100644 --- a/src/include/commands/explain.h +++ b/src/include/commands/explain.h @@ -37,6 +37,9 @@ typedef struct ExplainState bool nodes; /* print nodes in RemoteQuery node */ bool num_nodes; /* print number of nodes in RemoteQuery node */ #endif /* PGXC */ +#ifdef __TBASE__ + bool skip_remote_query; /* skip execute remote query */ +#endif bool timing; /* print detailed node timing */ bool summary; /* print total planning and execution timing */ ExplainFormat format; /* output format */ diff --git a/src/include/pgstat.h b/src/include/pgstat.h index 8d0fb02e..15dd8b59 100644 --- a/src/include/pgstat.h +++ b/src/include/pgstat.h @@ -1,12 +1,12 @@ /* ---------- - * pgstat.h + * pgstat.h * - * Definitions for the PostgreSQL statistics collector daemon. + * Definitions for the PostgreSQL statistics collector daemon. * * Portions Copyright (c) 2012-2014, TransLattice, Inc. - * Copyright (c) 2001-2017, PostgreSQL Global Development Group + * Copyright (c) 2001-2017, PostgreSQL Global Development Group * - * src/include/pgstat.h + * src/include/pgstat.h * ---------- */ #ifndef PGSTAT_H @@ -27,20 +27,20 @@ * Paths for the statistics files (relative to installation's $PGDATA). 
* ---------- */ -#define PGSTAT_STAT_PERMANENT_DIRECTORY "pg_stat" -#define PGSTAT_STAT_PERMANENT_FILENAME "pg_stat/global.stat" -#define PGSTAT_STAT_PERMANENT_TMPFILE "pg_stat/global.tmp" +#define PGSTAT_STAT_PERMANENT_DIRECTORY "pg_stat" +#define PGSTAT_STAT_PERMANENT_FILENAME "pg_stat/global.stat" +#define PGSTAT_STAT_PERMANENT_TMPFILE "pg_stat/global.tmp" /* Default directory to store temporary statistics data in */ -#define PG_STAT_TMP_DIR "pg_stat_tmp" +#define PG_STAT_TMP_DIR "pg_stat_tmp" /* Values for track_functions GUC variable --- order is significant! */ typedef enum TrackFunctionsLevel { - TRACK_FUNC_OFF, - TRACK_FUNC_PL, - TRACK_FUNC_ALL -} TrackFunctionsLevel; + TRACK_FUNC_OFF, + TRACK_FUNC_PL, + TRACK_FUNC_ALL +} TrackFunctionsLevel; /* ---------- * The types of backend -> collector messages @@ -48,24 +48,24 @@ typedef enum TrackFunctionsLevel */ typedef enum StatMsgType { - PGSTAT_MTYPE_DUMMY, - PGSTAT_MTYPE_INQUIRY, - PGSTAT_MTYPE_TABSTAT, - PGSTAT_MTYPE_TABPURGE, - PGSTAT_MTYPE_DROPDB, - PGSTAT_MTYPE_RESETCOUNTER, - PGSTAT_MTYPE_RESETSHAREDCOUNTER, - PGSTAT_MTYPE_RESETSINGLECOUNTER, - PGSTAT_MTYPE_AUTOVAC_START, - PGSTAT_MTYPE_VACUUM, - PGSTAT_MTYPE_ANALYZE, - PGSTAT_MTYPE_ARCHIVER, - PGSTAT_MTYPE_BGWRITER, - PGSTAT_MTYPE_FUNCSTAT, - PGSTAT_MTYPE_FUNCPURGE, - PGSTAT_MTYPE_RECOVERYCONFLICT, - PGSTAT_MTYPE_TEMPFILE, - PGSTAT_MTYPE_DEADLOCK + PGSTAT_MTYPE_DUMMY, + PGSTAT_MTYPE_INQUIRY, + PGSTAT_MTYPE_TABSTAT, + PGSTAT_MTYPE_TABPURGE, + PGSTAT_MTYPE_DROPDB, + PGSTAT_MTYPE_RESETCOUNTER, + PGSTAT_MTYPE_RESETSHAREDCOUNTER, + PGSTAT_MTYPE_RESETSINGLECOUNTER, + PGSTAT_MTYPE_AUTOVAC_START, + PGSTAT_MTYPE_VACUUM, + PGSTAT_MTYPE_ANALYZE, + PGSTAT_MTYPE_ARCHIVER, + PGSTAT_MTYPE_BGWRITER, + PGSTAT_MTYPE_FUNCSTAT, + PGSTAT_MTYPE_FUNCPURGE, + PGSTAT_MTYPE_RECOVERYCONFLICT, + PGSTAT_MTYPE_TEMPFILE, + PGSTAT_MTYPE_DEADLOCK } StatMsgType; /* ---------- @@ -75,7 +75,7 @@ typedef enum StatMsgType typedef int64 PgStat_Counter; /* ---------- - * PgStat_TableCounts The actual per-table counts kept by a backend + * PgStat_TableCounts The actual per-table counts kept by a backend * * This struct should contain only actual event counters, because we memcmp * it against zeroes to detect whether there are any counts to transmit. 
@@ -97,37 +97,37 @@ typedef int64 PgStat_Counter; */ typedef struct PgStat_TableCounts { - PgStat_Counter t_numscans; + PgStat_Counter t_numscans; - PgStat_Counter t_tuples_returned; - PgStat_Counter t_tuples_fetched; + PgStat_Counter t_tuples_returned; + PgStat_Counter t_tuples_fetched; - PgStat_Counter t_tuples_inserted; - PgStat_Counter t_tuples_updated; - PgStat_Counter t_tuples_deleted; - PgStat_Counter t_tuples_hot_updated; - bool t_truncated; + PgStat_Counter t_tuples_inserted; + PgStat_Counter t_tuples_updated; + PgStat_Counter t_tuples_deleted; + PgStat_Counter t_tuples_hot_updated; + bool t_truncated; - PgStat_Counter t_delta_live_tuples; - PgStat_Counter t_delta_dead_tuples; - PgStat_Counter t_changed_tuples; + PgStat_Counter t_delta_live_tuples; + PgStat_Counter t_delta_dead_tuples; + PgStat_Counter t_changed_tuples; - PgStat_Counter t_blocks_fetched; - PgStat_Counter t_blocks_hit; + PgStat_Counter t_blocks_fetched; + PgStat_Counter t_blocks_hit; } PgStat_TableCounts; /* Possible targets for resetting cluster-wide shared values */ typedef enum PgStat_Shared_Reset_Target { - RESET_ARCHIVER, - RESET_BGWRITER + RESET_ARCHIVER, + RESET_BGWRITER } PgStat_Shared_Reset_Target; /* Possible object types for resetting single counters */ typedef enum PgStat_Single_Reset_Type { - RESET_TABLE, - RESET_FUNCTION + RESET_TABLE, + RESET_FUNCTION } PgStat_Single_Reset_Type; /* ------------------------------------------------------------ @@ -137,7 +137,7 @@ typedef enum PgStat_Single_Reset_Type /* ---------- - * PgStat_TableStatus Per-table status within a backend + * PgStat_TableStatus Per-table status within a backend * * Many of the event counters are nontransactional, ie, we count events * in committed and aborted transactions alike. For these, we just count @@ -153,34 +153,34 @@ typedef enum PgStat_Single_Reset_Type */ typedef struct PgStat_TableStatus { - Oid t_id; /* table's OID */ + Oid t_id; /* table's OID */ #ifdef __TBASE__ Oid t_parent_id; /* parent's OID for interval child table, of InvalidOid */ #endif - bool t_shared; /* is it a shared catalog? */ - struct PgStat_TableXactStatus *trans; /* lowest subxact's counts */ - PgStat_TableCounts t_counts; /* event counts to be sent */ + bool t_shared; /* is it a shared catalog? 
*/ + struct PgStat_TableXactStatus *trans; /* lowest subxact's counts */ + PgStat_TableCounts t_counts; /* event counts to be sent */ } PgStat_TableStatus; /* ---------- - * PgStat_TableXactStatus Per-table, per-subtransaction status + * PgStat_TableXactStatus Per-table, per-subtransaction status * ---------- */ typedef struct PgStat_TableXactStatus { - PgStat_Counter tuples_inserted; /* tuples inserted in (sub)xact */ - PgStat_Counter tuples_updated; /* tuples updated in (sub)xact */ - PgStat_Counter tuples_deleted; /* tuples deleted in (sub)xact */ - bool truncated; /* relation truncated in this (sub)xact */ - PgStat_Counter inserted_pre_trunc; /* tuples inserted prior to truncate */ - PgStat_Counter updated_pre_trunc; /* tuples updated prior to truncate */ - PgStat_Counter deleted_pre_trunc; /* tuples deleted prior to truncate */ - int nest_level; /* subtransaction nest level */ - /* links to other structs for same relation: */ - struct PgStat_TableXactStatus *upper; /* next higher subxact if any */ - PgStat_TableStatus *parent; /* per-table status */ - /* structs of same subxact level are linked here: */ - struct PgStat_TableXactStatus *next; /* next of same subxact */ + PgStat_Counter tuples_inserted; /* tuples inserted in (sub)xact */ + PgStat_Counter tuples_updated; /* tuples updated in (sub)xact */ + PgStat_Counter tuples_deleted; /* tuples deleted in (sub)xact */ + bool truncated; /* relation truncated in this (sub)xact */ + PgStat_Counter inserted_pre_trunc; /* tuples inserted prior to truncate */ + PgStat_Counter updated_pre_trunc; /* tuples updated prior to truncate */ + PgStat_Counter deleted_pre_trunc; /* tuples deleted prior to truncate */ + int nest_level; /* subtransaction nest level */ + /* links to other structs for same relation: */ + struct PgStat_TableXactStatus *upper; /* next higher subxact if any */ + PgStat_TableStatus *parent; /* per-table status */ + /* structs of same subxact level are linked here: */ + struct PgStat_TableXactStatus *next; /* next of same subxact */ } PgStat_TableXactStatus; @@ -191,13 +191,13 @@ typedef struct PgStat_TableXactStatus /* ---------- - * PgStat_MsgHdr The common message header + * PgStat_MsgHdr The common message header * ---------- */ typedef struct PgStat_MsgHdr { - StatMsgType m_type; - int m_size; + StatMsgType m_type; + int m_size; } PgStat_MsgHdr; /* ---------- @@ -208,22 +208,22 @@ typedef struct PgStat_MsgHdr * ---------- */ #define PGSTAT_MAX_MSG_SIZE 1000 -#define PGSTAT_MSG_PAYLOAD (PGSTAT_MAX_MSG_SIZE - sizeof(PgStat_MsgHdr)) +#define PGSTAT_MSG_PAYLOAD (PGSTAT_MAX_MSG_SIZE - sizeof(PgStat_MsgHdr)) /* ---------- - * PgStat_MsgDummy A dummy message, ignored by the collector + * PgStat_MsgDummy A dummy message, ignored by the collector * ---------- */ typedef struct PgStat_MsgDummy { - PgStat_MsgHdr m_hdr; + PgStat_MsgHdr m_hdr; } PgStat_MsgDummy; /* ---------- - * PgStat_MsgInquiry Sent by a backend to ask the collector - * to write the stats file(s). + * PgStat_MsgInquiry Sent by a backend to ask the collector + * to write the stats file(s). 
* * Ordinarily, an inquiry message prompts writing of the global stats file, * the stats file for shared catalogs, and the stats file for the specified @@ -242,219 +242,219 @@ typedef struct PgStat_MsgDummy typedef struct PgStat_MsgInquiry { - PgStat_MsgHdr m_hdr; - TimestampTz clock_time; /* observed local clock time */ - TimestampTz cutoff_time; /* minimum acceptable file timestamp */ - Oid databaseid; /* requested DB (InvalidOid => shared only) */ + PgStat_MsgHdr m_hdr; + TimestampTz clock_time; /* observed local clock time */ + TimestampTz cutoff_time; /* minimum acceptable file timestamp */ + Oid databaseid; /* requested DB (InvalidOid => shared only) */ } PgStat_MsgInquiry; /* ---------- - * PgStat_TableEntry Per-table info in a MsgTabstat + * PgStat_TableEntry Per-table info in a MsgTabstat * ---------- */ typedef struct PgStat_TableEntry { - Oid t_id; + Oid t_id; #ifdef __TBASE__ Oid t_parent_id; #endif - PgStat_TableCounts t_counts; + PgStat_TableCounts t_counts; } PgStat_TableEntry; /* ---------- - * PgStat_MsgTabstat Sent by the backend to report table - * and buffer access statistics. + * PgStat_MsgTabstat Sent by the backend to report table + * and buffer access statistics. * ---------- */ #define PGSTAT_NUM_TABENTRIES \ - ((PGSTAT_MSG_PAYLOAD - sizeof(Oid) - 3 * sizeof(int) - 2 * sizeof(PgStat_Counter)) \ - / sizeof(PgStat_TableEntry)) + ((PGSTAT_MSG_PAYLOAD - sizeof(Oid) - 3 * sizeof(int) - 2 * sizeof(PgStat_Counter)) \ + / sizeof(PgStat_TableEntry)) typedef struct PgStat_MsgTabstat { - PgStat_MsgHdr m_hdr; - Oid m_databaseid; - int m_nentries; - int m_xact_commit; - int m_xact_rollback; - PgStat_Counter m_block_read_time; /* times in microseconds */ - PgStat_Counter m_block_write_time; - PgStat_TableEntry m_entry[PGSTAT_NUM_TABENTRIES]; + PgStat_MsgHdr m_hdr; + Oid m_databaseid; + int m_nentries; + int m_xact_commit; + int m_xact_rollback; + PgStat_Counter m_block_read_time; /* times in microseconds */ + PgStat_Counter m_block_write_time; + PgStat_TableEntry m_entry[PGSTAT_NUM_TABENTRIES]; } PgStat_MsgTabstat; /* ---------- - * PgStat_MsgTabpurge Sent by the backend to tell the collector - * about dead tables. + * PgStat_MsgTabpurge Sent by the backend to tell the collector + * about dead tables. 
* ---------- */ #define PGSTAT_NUM_TABPURGE \ - ((PGSTAT_MSG_PAYLOAD - sizeof(Oid) - sizeof(int)) \ - / sizeof(Oid)) + ((PGSTAT_MSG_PAYLOAD - sizeof(Oid) - sizeof(int)) \ + / sizeof(Oid)) typedef struct PgStat_MsgTabpurge { - PgStat_MsgHdr m_hdr; - Oid m_databaseid; - int m_nentries; - Oid m_tableid[PGSTAT_NUM_TABPURGE]; + PgStat_MsgHdr m_hdr; + Oid m_databaseid; + int m_nentries; + Oid m_tableid[PGSTAT_NUM_TABPURGE]; } PgStat_MsgTabpurge; /* ---------- - * PgStat_MsgDropdb Sent by the backend to tell the collector - * about a dropped database + * PgStat_MsgDropdb Sent by the backend to tell the collector + * about a dropped database * ---------- */ typedef struct PgStat_MsgDropdb { - PgStat_MsgHdr m_hdr; - Oid m_databaseid; + PgStat_MsgHdr m_hdr; + Oid m_databaseid; } PgStat_MsgDropdb; /* ---------- - * PgStat_MsgResetcounter Sent by the backend to tell the collector - * to reset counters + * PgStat_MsgResetcounter Sent by the backend to tell the collector + * to reset counters * ---------- */ typedef struct PgStat_MsgResetcounter { - PgStat_MsgHdr m_hdr; - Oid m_databaseid; + PgStat_MsgHdr m_hdr; + Oid m_databaseid; } PgStat_MsgResetcounter; /* ---------- * PgStat_MsgResetsharedcounter Sent by the backend to tell the collector - * to reset a shared counter + * to reset a shared counter * ---------- */ typedef struct PgStat_MsgResetsharedcounter { - PgStat_MsgHdr m_hdr; - PgStat_Shared_Reset_Target m_resettarget; + PgStat_MsgHdr m_hdr; + PgStat_Shared_Reset_Target m_resettarget; } PgStat_MsgResetsharedcounter; /* ---------- * PgStat_MsgResetsinglecounter Sent by the backend to tell the collector - * to reset a single counter + * to reset a single counter * ---------- */ typedef struct PgStat_MsgResetsinglecounter { - PgStat_MsgHdr m_hdr; - Oid m_databaseid; - PgStat_Single_Reset_Type m_resettype; - Oid m_objectid; + PgStat_MsgHdr m_hdr; + Oid m_databaseid; + PgStat_Single_Reset_Type m_resettype; + Oid m_objectid; } PgStat_MsgResetsinglecounter; /* ---------- - * PgStat_MsgAutovacStart Sent by the autovacuum daemon to signal - * that a database is going to be processed + * PgStat_MsgAutovacStart Sent by the autovacuum daemon to signal + * that a database is going to be processed * ---------- */ typedef struct PgStat_MsgAutovacStart { - PgStat_MsgHdr m_hdr; - Oid m_databaseid; - TimestampTz m_start_time; + PgStat_MsgHdr m_hdr; + Oid m_databaseid; + TimestampTz m_start_time; } PgStat_MsgAutovacStart; /* ---------- - * PgStat_MsgVacuum Sent by the backend or autovacuum daemon - * after VACUUM + * PgStat_MsgVacuum Sent by the backend or autovacuum daemon + * after VACUUM * ---------- */ typedef struct PgStat_MsgVacuum { - PgStat_MsgHdr m_hdr; - Oid m_databaseid; - Oid m_tableoid; - bool m_autovacuum; - TimestampTz m_vacuumtime; - PgStat_Counter m_live_tuples; - PgStat_Counter m_dead_tuples; + PgStat_MsgHdr m_hdr; + Oid m_databaseid; + Oid m_tableoid; + bool m_autovacuum; + TimestampTz m_vacuumtime; + PgStat_Counter m_live_tuples; + PgStat_Counter m_dead_tuples; } PgStat_MsgVacuum; /* ---------- - * PgStat_MsgAnalyze Sent by the backend or autovacuum daemon - * after ANALYZE + * PgStat_MsgAnalyze Sent by the backend or autovacuum daemon + * after ANALYZE * ---------- */ typedef struct PgStat_MsgAnalyze { - PgStat_MsgHdr m_hdr; - Oid m_databaseid; - Oid m_tableoid; - bool m_autovacuum; - bool m_resetcounter; - TimestampTz m_analyzetime; - PgStat_Counter m_live_tuples; - PgStat_Counter m_dead_tuples; + PgStat_MsgHdr m_hdr; + Oid m_databaseid; + Oid m_tableoid; + bool m_autovacuum; + bool 
m_resetcounter; + TimestampTz m_analyzetime; + PgStat_Counter m_live_tuples; + PgStat_Counter m_dead_tuples; } PgStat_MsgAnalyze; /* ---------- - * PgStat_MsgArchiver Sent by the archiver to update statistics. + * PgStat_MsgArchiver Sent by the archiver to update statistics. * ---------- */ typedef struct PgStat_MsgArchiver { - PgStat_MsgHdr m_hdr; - bool m_failed; /* Failed attempt */ - char m_xlog[MAX_XFN_CHARS + 1]; - TimestampTz m_timestamp; + PgStat_MsgHdr m_hdr; + bool m_failed; /* Failed attempt */ + char m_xlog[MAX_XFN_CHARS + 1]; + TimestampTz m_timestamp; } PgStat_MsgArchiver; /* ---------- - * PgStat_MsgBgWriter Sent by the bgwriter to update statistics. + * PgStat_MsgBgWriter Sent by the bgwriter to update statistics. * ---------- */ typedef struct PgStat_MsgBgWriter { - PgStat_MsgHdr m_hdr; - - PgStat_Counter m_timed_checkpoints; - PgStat_Counter m_requested_checkpoints; - PgStat_Counter m_buf_written_checkpoints; - PgStat_Counter m_buf_written_clean; - PgStat_Counter m_maxwritten_clean; - PgStat_Counter m_buf_written_backend; - PgStat_Counter m_buf_fsync_backend; - PgStat_Counter m_buf_alloc; - PgStat_Counter m_checkpoint_write_time; /* times in milliseconds */ - PgStat_Counter m_checkpoint_sync_time; + PgStat_MsgHdr m_hdr; + + PgStat_Counter m_timed_checkpoints; + PgStat_Counter m_requested_checkpoints; + PgStat_Counter m_buf_written_checkpoints; + PgStat_Counter m_buf_written_clean; + PgStat_Counter m_maxwritten_clean; + PgStat_Counter m_buf_written_backend; + PgStat_Counter m_buf_fsync_backend; + PgStat_Counter m_buf_alloc; + PgStat_Counter m_checkpoint_write_time; /* times in milliseconds */ + PgStat_Counter m_checkpoint_sync_time; } PgStat_MsgBgWriter; /* ---------- - * PgStat_MsgRecoveryConflict Sent by the backend upon recovery conflict + * PgStat_MsgRecoveryConflict Sent by the backend upon recovery conflict * ---------- */ typedef struct PgStat_MsgRecoveryConflict { - PgStat_MsgHdr m_hdr; + PgStat_MsgHdr m_hdr; - Oid m_databaseid; - int m_reason; + Oid m_databaseid; + int m_reason; } PgStat_MsgRecoveryConflict; /* ---------- - * PgStat_MsgTempFile Sent by the backend upon creating a temp file + * PgStat_MsgTempFile Sent by the backend upon creating a temp file * ---------- */ typedef struct PgStat_MsgTempFile { - PgStat_MsgHdr m_hdr; + PgStat_MsgHdr m_hdr; - Oid m_databaseid; - size_t m_filesize; + Oid m_databaseid; + size_t m_filesize; } PgStat_MsgTempFile; /* ---------- - * PgStat_FunctionCounts The actual per-function counts kept by a backend + * PgStat_FunctionCounts The actual per-function counts kept by a backend * * This struct should contain only actual event counters, because we memcmp * it against zeroes to detect whether there are any counts to transmit. 
@@ -465,103 +465,103 @@ typedef struct PgStat_MsgTempFile */ typedef struct PgStat_FunctionCounts { - PgStat_Counter f_numcalls; - instr_time f_total_time; - instr_time f_self_time; + PgStat_Counter f_numcalls; + instr_time f_total_time; + instr_time f_self_time; } PgStat_FunctionCounts; /* ---------- - * PgStat_BackendFunctionEntry Entry in backend's per-function hash table + * PgStat_BackendFunctionEntry Entry in backend's per-function hash table * ---------- */ typedef struct PgStat_BackendFunctionEntry { - Oid f_id; - PgStat_FunctionCounts f_counts; + Oid f_id; + PgStat_FunctionCounts f_counts; } PgStat_BackendFunctionEntry; /* ---------- - * PgStat_FunctionEntry Per-function info in a MsgFuncstat + * PgStat_FunctionEntry Per-function info in a MsgFuncstat * ---------- */ typedef struct PgStat_FunctionEntry { - Oid f_id; - PgStat_Counter f_numcalls; - PgStat_Counter f_total_time; /* times in microseconds */ - PgStat_Counter f_self_time; + Oid f_id; + PgStat_Counter f_numcalls; + PgStat_Counter f_total_time; /* times in microseconds */ + PgStat_Counter f_self_time; } PgStat_FunctionEntry; /* ---------- - * PgStat_MsgFuncstat Sent by the backend to report function - * usage statistics. + * PgStat_MsgFuncstat Sent by the backend to report function + * usage statistics. * ---------- */ -#define PGSTAT_NUM_FUNCENTRIES \ - ((PGSTAT_MSG_PAYLOAD - sizeof(Oid) - sizeof(int)) \ - / sizeof(PgStat_FunctionEntry)) +#define PGSTAT_NUM_FUNCENTRIES \ + ((PGSTAT_MSG_PAYLOAD - sizeof(Oid) - sizeof(int)) \ + / sizeof(PgStat_FunctionEntry)) typedef struct PgStat_MsgFuncstat { - PgStat_MsgHdr m_hdr; - Oid m_databaseid; - int m_nentries; - PgStat_FunctionEntry m_entry[PGSTAT_NUM_FUNCENTRIES]; + PgStat_MsgHdr m_hdr; + Oid m_databaseid; + int m_nentries; + PgStat_FunctionEntry m_entry[PGSTAT_NUM_FUNCENTRIES]; } PgStat_MsgFuncstat; /* ---------- - * PgStat_MsgFuncpurge Sent by the backend to tell the collector - * about dead functions. + * PgStat_MsgFuncpurge Sent by the backend to tell the collector + * about dead functions. * ---------- */ #define PGSTAT_NUM_FUNCPURGE \ - ((PGSTAT_MSG_PAYLOAD - sizeof(Oid) - sizeof(int)) \ - / sizeof(Oid)) + ((PGSTAT_MSG_PAYLOAD - sizeof(Oid) - sizeof(int)) \ + / sizeof(Oid)) typedef struct PgStat_MsgFuncpurge { - PgStat_MsgHdr m_hdr; - Oid m_databaseid; - int m_nentries; - Oid m_functionid[PGSTAT_NUM_FUNCPURGE]; + PgStat_MsgHdr m_hdr; + Oid m_databaseid; + int m_nentries; + Oid m_functionid[PGSTAT_NUM_FUNCPURGE]; } PgStat_MsgFuncpurge; /* ---------- - * PgStat_MsgDeadlock Sent by the backend to tell the collector - * about a deadlock that occurred. + * PgStat_MsgDeadlock Sent by the backend to tell the collector + * about a deadlock that occurred. * ---------- */ typedef struct PgStat_MsgDeadlock { - PgStat_MsgHdr m_hdr; - Oid m_databaseid; + PgStat_MsgHdr m_hdr; + Oid m_databaseid; } PgStat_MsgDeadlock; /* ---------- - * PgStat_Msg Union over all possible messages. + * PgStat_Msg Union over all possible messages. 
* ---------- */ typedef union PgStat_Msg { - PgStat_MsgHdr msg_hdr; - PgStat_MsgDummy msg_dummy; - PgStat_MsgInquiry msg_inquiry; - PgStat_MsgTabstat msg_tabstat; - PgStat_MsgTabpurge msg_tabpurge; - PgStat_MsgDropdb msg_dropdb; - PgStat_MsgResetcounter msg_resetcounter; - PgStat_MsgResetsharedcounter msg_resetsharedcounter; - PgStat_MsgResetsinglecounter msg_resetsinglecounter; - PgStat_MsgAutovacStart msg_autovacuum; - PgStat_MsgVacuum msg_vacuum; - PgStat_MsgAnalyze msg_analyze; - PgStat_MsgArchiver msg_archiver; - PgStat_MsgBgWriter msg_bgwriter; - PgStat_MsgFuncstat msg_funcstat; - PgStat_MsgFuncpurge msg_funcpurge; - PgStat_MsgRecoveryConflict msg_recoveryconflict; - PgStat_MsgDeadlock msg_deadlock; + PgStat_MsgHdr msg_hdr; + PgStat_MsgDummy msg_dummy; + PgStat_MsgInquiry msg_inquiry; + PgStat_MsgTabstat msg_tabstat; + PgStat_MsgTabpurge msg_tabpurge; + PgStat_MsgDropdb msg_dropdb; + PgStat_MsgResetcounter msg_resetcounter; + PgStat_MsgResetsharedcounter msg_resetsharedcounter; + PgStat_MsgResetsinglecounter msg_resetsinglecounter; + PgStat_MsgAutovacStart msg_autovacuum; + PgStat_MsgVacuum msg_vacuum; + PgStat_MsgAnalyze msg_analyze; + PgStat_MsgArchiver msg_archiver; + PgStat_MsgBgWriter msg_bgwriter; + PgStat_MsgFuncstat msg_funcstat; + PgStat_MsgFuncpurge msg_funcpurge; + PgStat_MsgRecoveryConflict msg_recoveryconflict; + PgStat_MsgDeadlock msg_deadlock; } PgStat_Msg; @@ -573,96 +573,96 @@ typedef union PgStat_Msg * ------------------------------------------------------------ */ -#define PGSTAT_FILE_FORMAT_ID 0x01A5BC9D +#define PGSTAT_FILE_FORMAT_ID 0x01A5BC9D /* ---------- - * PgStat_StatDBEntry The collector's data per database + * PgStat_StatDBEntry The collector's data per database * ---------- */ typedef struct PgStat_StatDBEntry { - Oid databaseid; - PgStat_Counter n_xact_commit; - PgStat_Counter n_xact_rollback; - PgStat_Counter n_blocks_fetched; - PgStat_Counter n_blocks_hit; - PgStat_Counter n_tuples_returned; - PgStat_Counter n_tuples_fetched; - PgStat_Counter n_tuples_inserted; - PgStat_Counter n_tuples_updated; - PgStat_Counter n_tuples_deleted; - TimestampTz last_autovac_time; - PgStat_Counter n_conflict_tablespace; - PgStat_Counter n_conflict_lock; - PgStat_Counter n_conflict_snapshot; - PgStat_Counter n_conflict_bufferpin; - PgStat_Counter n_conflict_startup_deadlock; - PgStat_Counter n_temp_files; - PgStat_Counter n_temp_bytes; - PgStat_Counter n_deadlocks; - PgStat_Counter n_block_read_time; /* times in microseconds */ - PgStat_Counter n_block_write_time; - - TimestampTz stat_reset_timestamp; - TimestampTz stats_timestamp; /* time of db stats file update */ - - /* - * tables and functions must be last in the struct, because we don't write - * the pointers out to the stats file. 
- */ - HTAB *tables; - HTAB *functions; + Oid databaseid; + PgStat_Counter n_xact_commit; + PgStat_Counter n_xact_rollback; + PgStat_Counter n_blocks_fetched; + PgStat_Counter n_blocks_hit; + PgStat_Counter n_tuples_returned; + PgStat_Counter n_tuples_fetched; + PgStat_Counter n_tuples_inserted; + PgStat_Counter n_tuples_updated; + PgStat_Counter n_tuples_deleted; + TimestampTz last_autovac_time; + PgStat_Counter n_conflict_tablespace; + PgStat_Counter n_conflict_lock; + PgStat_Counter n_conflict_snapshot; + PgStat_Counter n_conflict_bufferpin; + PgStat_Counter n_conflict_startup_deadlock; + PgStat_Counter n_temp_files; + PgStat_Counter n_temp_bytes; + PgStat_Counter n_deadlocks; + PgStat_Counter n_block_read_time; /* times in microseconds */ + PgStat_Counter n_block_write_time; + + TimestampTz stat_reset_timestamp; + TimestampTz stats_timestamp; /* time of db stats file update */ + + /* + * tables and functions must be last in the struct, because we don't write + * the pointers out to the stats file. + */ + HTAB *tables; + HTAB *functions; } PgStat_StatDBEntry; /* ---------- - * PgStat_StatTabEntry The collector's data per table (or index) + * PgStat_StatTabEntry The collector's data per table (or index) * ---------- */ typedef struct PgStat_StatTabEntry { - Oid tableid; + Oid tableid; - PgStat_Counter numscans; + PgStat_Counter numscans; - PgStat_Counter tuples_returned; - PgStat_Counter tuples_fetched; + PgStat_Counter tuples_returned; + PgStat_Counter tuples_fetched; - PgStat_Counter tuples_inserted; - PgStat_Counter tuples_updated; - PgStat_Counter tuples_deleted; - PgStat_Counter tuples_hot_updated; + PgStat_Counter tuples_inserted; + PgStat_Counter tuples_updated; + PgStat_Counter tuples_deleted; + PgStat_Counter tuples_hot_updated; - PgStat_Counter n_live_tuples; - PgStat_Counter n_dead_tuples; - PgStat_Counter changes_since_analyze; + PgStat_Counter n_live_tuples; + PgStat_Counter n_dead_tuples; + PgStat_Counter changes_since_analyze; - PgStat_Counter blocks_fetched; - PgStat_Counter blocks_hit; + PgStat_Counter blocks_fetched; + PgStat_Counter blocks_hit; - TimestampTz vacuum_timestamp; /* user initiated vacuum */ - PgStat_Counter vacuum_count; - TimestampTz autovac_vacuum_timestamp; /* autovacuum initiated */ - PgStat_Counter autovac_vacuum_count; - TimestampTz analyze_timestamp; /* user initiated */ - PgStat_Counter analyze_count; - TimestampTz autovac_analyze_timestamp; /* autovacuum initiated */ - PgStat_Counter autovac_analyze_count; + TimestampTz vacuum_timestamp; /* user initiated vacuum */ + PgStat_Counter vacuum_count; + TimestampTz autovac_vacuum_timestamp; /* autovacuum initiated */ + PgStat_Counter autovac_vacuum_count; + TimestampTz analyze_timestamp; /* user initiated */ + PgStat_Counter analyze_count; + TimestampTz autovac_analyze_timestamp; /* autovacuum initiated */ + PgStat_Counter autovac_analyze_count; } PgStat_StatTabEntry; /* ---------- - * PgStat_StatFuncEntry The collector's data per function + * PgStat_StatFuncEntry The collector's data per function * ---------- */ typedef struct PgStat_StatFuncEntry { - Oid functionid; + Oid functionid; - PgStat_Counter f_numcalls; + PgStat_Counter f_numcalls; - PgStat_Counter f_total_time; /* times in microseconds */ - PgStat_Counter f_self_time; + PgStat_Counter f_total_time; /* times in microseconds */ + PgStat_Counter f_self_time; } PgStat_StatFuncEntry; @@ -671,15 +671,15 @@ typedef struct PgStat_StatFuncEntry */ typedef struct PgStat_ArchiverStats { - PgStat_Counter archived_count; /* archival successes */ - char 
last_archived_wal[MAX_XFN_CHARS + 1]; /* last WAL file - * archived */ - TimestampTz last_archived_timestamp; /* last archival success time */ - PgStat_Counter failed_count; /* failed archival attempts */ - char last_failed_wal[MAX_XFN_CHARS + 1]; /* WAL file involved in - * last failure */ - TimestampTz last_failed_timestamp; /* last archival failure time */ - TimestampTz stat_reset_timestamp; + PgStat_Counter archived_count; /* archival successes */ + char last_archived_wal[MAX_XFN_CHARS + 1]; /* last WAL file + * archived */ + TimestampTz last_archived_timestamp; /* last archival success time */ + PgStat_Counter failed_count; /* failed archival attempts */ + char last_failed_wal[MAX_XFN_CHARS + 1]; /* WAL file involved in + * last failure */ + TimestampTz last_failed_timestamp; /* last archival failure time */ + TimestampTz stat_reset_timestamp; } PgStat_ArchiverStats; /* @@ -687,18 +687,18 @@ typedef struct PgStat_ArchiverStats */ typedef struct PgStat_GlobalStats { - TimestampTz stats_timestamp; /* time of stats file update */ - PgStat_Counter timed_checkpoints; - PgStat_Counter requested_checkpoints; - PgStat_Counter checkpoint_write_time; /* times in milliseconds */ - PgStat_Counter checkpoint_sync_time; - PgStat_Counter buf_written_checkpoints; - PgStat_Counter buf_written_clean; - PgStat_Counter maxwritten_clean; - PgStat_Counter buf_written_backend; - PgStat_Counter buf_fsync_backend; - PgStat_Counter buf_alloc; - TimestampTz stat_reset_timestamp; + TimestampTz stats_timestamp; /* time of stats file update */ + PgStat_Counter timed_checkpoints; + PgStat_Counter requested_checkpoints; + PgStat_Counter checkpoint_write_time; /* times in milliseconds */ + PgStat_Counter checkpoint_sync_time; + PgStat_Counter buf_written_checkpoints; + PgStat_Counter buf_written_clean; + PgStat_Counter maxwritten_clean; + PgStat_Counter buf_written_backend; + PgStat_Counter buf_fsync_backend; + PgStat_Counter buf_alloc; + TimestampTz stat_reset_timestamp; } PgStat_GlobalStats; @@ -708,18 +708,18 @@ typedef struct PgStat_GlobalStats */ typedef enum BackendType { - B_AUTOVAC_LAUNCHER, - B_AUTOVAC_WORKER, - B_BACKEND, - B_BG_WORKER, - B_BG_WRITER, - B_CHECKPOINTER, - B_STARTUP, - B_WAL_RECEIVER, - B_WAL_SENDER, - B_WAL_WRITER, - B_PGXL_CLUSTER_MONITOR, - B_PGXL_POOLER + B_AUTOVAC_LAUNCHER, + B_AUTOVAC_WORKER, + B_BACKEND, + B_BG_WORKER, + B_BG_WRITER, + B_CHECKPOINTER, + B_STARTUP, + B_WAL_RECEIVER, + B_WAL_SENDER, + B_WAL_WRITER, + B_PGXL_CLUSTER_MONITOR, + B_PGXL_POOLER } BackendType; @@ -729,13 +729,13 @@ typedef enum BackendType */ typedef enum BackendState { - STATE_UNDEFINED, - STATE_IDLE, - STATE_RUNNING, - STATE_IDLEINTRANSACTION, - STATE_FASTPATH, - STATE_IDLEINTRANSACTION_ABORTED, - STATE_DISABLED + STATE_UNDEFINED, + STATE_IDLE, + STATE_RUNNING, + STATE_IDLEINTRANSACTION, + STATE_FASTPATH, + STATE_IDLEINTRANSACTION_ABORTED, + STATE_DISABLED } BackendState; @@ -743,15 +743,15 @@ typedef enum BackendState * Wait Classes * ---------- */ -#define PG_WAIT_LWLOCK 0x01000000U -#define PG_WAIT_LOCK 0x03000000U -#define PG_WAIT_BUFFER_PIN 0x04000000U -#define PG_WAIT_ACTIVITY 0x05000000U -#define PG_WAIT_CLIENT 0x06000000U -#define PG_WAIT_EXTENSION 0x07000000U -#define PG_WAIT_IPC 0x08000000U -#define PG_WAIT_TIMEOUT 0x09000000U -#define PG_WAIT_IO 0x0A000000U +#define PG_WAIT_LWLOCK 0x01000000U +#define PG_WAIT_LOCK 0x03000000U +#define PG_WAIT_BUFFER_PIN 0x04000000U +#define PG_WAIT_ACTIVITY 0x05000000U +#define PG_WAIT_CLIENT 0x06000000U +#define PG_WAIT_EXTENSION 0x07000000U +#define PG_WAIT_IPC 
0x08000000U +#define PG_WAIT_TIMEOUT 0x09000000U +#define PG_WAIT_IO 0x0A000000U /* ---------- * Wait Events - Activity @@ -763,27 +763,27 @@ typedef enum BackendState */ typedef enum { - WAIT_EVENT_ARCHIVER_MAIN = PG_WAIT_ACTIVITY, - WAIT_EVENT_AUTOVACUUM_MAIN, - WAIT_EVENT_BGWRITER_HIBERNATE, - WAIT_EVENT_BGWRITER_MAIN, - WAIT_EVENT_CHECKPOINTER_MAIN, - WAIT_EVENT_LOGICAL_LAUNCHER_MAIN, - WAIT_EVENT_LOGICAL_APPLY_MAIN, - WAIT_EVENT_PGSTAT_MAIN, - WAIT_EVENT_RECOVERY_WAL_ALL, - WAIT_EVENT_RECOVERY_WAL_STREAM, - WAIT_EVENT_SYSLOGGER_MAIN, + WAIT_EVENT_ARCHIVER_MAIN = PG_WAIT_ACTIVITY, + WAIT_EVENT_AUTOVACUUM_MAIN, + WAIT_EVENT_BGWRITER_HIBERNATE, + WAIT_EVENT_BGWRITER_MAIN, + WAIT_EVENT_CHECKPOINTER_MAIN, + WAIT_EVENT_LOGICAL_LAUNCHER_MAIN, + WAIT_EVENT_LOGICAL_APPLY_MAIN, + WAIT_EVENT_PGSTAT_MAIN, + WAIT_EVENT_RECOVERY_WAL_ALL, + WAIT_EVENT_RECOVERY_WAL_STREAM, + WAIT_EVENT_SYSLOGGER_MAIN, #ifdef __AUDIT__ - WAIT_EVENT_AUDIT_LOGGER_MAIN, + WAIT_EVENT_AUDIT_LOGGER_MAIN, #endif - WAIT_EVENT_WAL_RECEIVER_MAIN, - WAIT_EVENT_WAL_SENDER_MAIN, - WAIT_EVENT_WAL_WRITER_MAIN, + WAIT_EVENT_WAL_RECEIVER_MAIN, + WAIT_EVENT_WAL_SENDER_MAIN, + WAIT_EVENT_WAL_WRITER_MAIN, #ifdef __AUDIT_FGA__ WAIT_EVENT_AUDIT_FGA_MAIN, #endif - WAIT_EVENT_CLUSTER_MONITOR_MAIN + WAIT_EVENT_CLUSTER_MONITOR_MAIN } WaitEventActivity; /* ---------- @@ -796,14 +796,14 @@ typedef enum */ typedef enum { - WAIT_EVENT_CLIENT_READ = PG_WAIT_CLIENT, - WAIT_EVENT_CLIENT_WRITE, - WAIT_EVENT_LIBPQWALRECEIVER_CONNECT, - WAIT_EVENT_LIBPQWALRECEIVER_RECEIVE, - WAIT_EVENT_SSL_OPEN_SERVER, - WAIT_EVENT_WAL_RECEIVER_WAIT_START, - WAIT_EVENT_WAL_SENDER_WAIT_WAL, - WAIT_EVENT_WAL_SENDER_WRITE_DATA + WAIT_EVENT_CLIENT_READ = PG_WAIT_CLIENT, + WAIT_EVENT_CLIENT_WRITE, + WAIT_EVENT_LIBPQWALRECEIVER_CONNECT, + WAIT_EVENT_LIBPQWALRECEIVER_RECEIVE, + WAIT_EVENT_SSL_OPEN_SERVER, + WAIT_EVENT_WAL_RECEIVER_WAIT_START, + WAIT_EVENT_WAL_SENDER_WAIT_WAL, + WAIT_EVENT_WAL_SENDER_WRITE_DATA } WaitEventClient; /* ---------- @@ -815,23 +815,23 @@ typedef enum */ typedef enum { - WAIT_EVENT_BGWORKER_SHUTDOWN = PG_WAIT_IPC, - WAIT_EVENT_BGWORKER_STARTUP, - WAIT_EVENT_BTREE_PAGE, - WAIT_EVENT_EXECUTE_GATHER, - WAIT_EVENT_LOGICAL_SYNC_DATA, - WAIT_EVENT_LOGICAL_SYNC_STATE_CHANGE, - WAIT_EVENT_MQ_INTERNAL, - WAIT_EVENT_MQ_PUT_MESSAGE, - WAIT_EVENT_MQ_RECEIVE, - WAIT_EVENT_MQ_SEND, - WAIT_EVENT_PARALLEL_FINISH, - WAIT_EVENT_PARALLEL_BITMAP_SCAN, - WAIT_EVENT_PROCARRAY_GROUP_UPDATE, - WAIT_EVENT_REPLICATION_ORIGIN_DROP, - WAIT_EVENT_REPLICATION_SLOT_DROP, - WAIT_EVENT_SAFE_SNAPSHOT, - WAIT_EVENT_SYNC_REP + WAIT_EVENT_BGWORKER_SHUTDOWN = PG_WAIT_IPC, + WAIT_EVENT_BGWORKER_STARTUP, + WAIT_EVENT_BTREE_PAGE, + WAIT_EVENT_EXECUTE_GATHER, + WAIT_EVENT_LOGICAL_SYNC_DATA, + WAIT_EVENT_LOGICAL_SYNC_STATE_CHANGE, + WAIT_EVENT_MQ_INTERNAL, + WAIT_EVENT_MQ_PUT_MESSAGE, + WAIT_EVENT_MQ_RECEIVE, + WAIT_EVENT_MQ_SEND, + WAIT_EVENT_PARALLEL_FINISH, + WAIT_EVENT_PARALLEL_BITMAP_SCAN, + WAIT_EVENT_PROCARRAY_GROUP_UPDATE, + WAIT_EVENT_REPLICATION_ORIGIN_DROP, + WAIT_EVENT_REPLICATION_SLOT_DROP, + WAIT_EVENT_SAFE_SNAPSHOT, + WAIT_EVENT_SYNC_REP } WaitEventIPC; /* ---------- @@ -842,9 +842,9 @@ typedef enum */ typedef enum { - WAIT_EVENT_BASE_BACKUP_THROTTLE = PG_WAIT_TIMEOUT, - WAIT_EVENT_PG_SLEEP, - WAIT_EVENT_RECOVERY_APPLY_DELAY + WAIT_EVENT_BASE_BACKUP_THROTTLE = PG_WAIT_TIMEOUT, + WAIT_EVENT_PG_SLEEP, + WAIT_EVENT_RECOVERY_APPLY_DELAY } WaitEventTimeout; /* ---------- @@ -855,86 +855,86 @@ typedef enum */ typedef enum { - WAIT_EVENT_BUFFILE_READ = PG_WAIT_IO, - 
WAIT_EVENT_BUFFILE_WRITE, - WAIT_EVENT_CONTROL_FILE_READ, - WAIT_EVENT_CONTROL_FILE_SYNC, - WAIT_EVENT_CONTROL_FILE_SYNC_UPDATE, - WAIT_EVENT_CONTROL_FILE_WRITE, - WAIT_EVENT_CONTROL_FILE_WRITE_UPDATE, - WAIT_EVENT_COPY_FILE_READ, - WAIT_EVENT_COPY_FILE_WRITE, + WAIT_EVENT_BUFFILE_READ = PG_WAIT_IO, + WAIT_EVENT_BUFFILE_WRITE, + WAIT_EVENT_CONTROL_FILE_READ, + WAIT_EVENT_CONTROL_FILE_SYNC, + WAIT_EVENT_CONTROL_FILE_SYNC_UPDATE, + WAIT_EVENT_CONTROL_FILE_WRITE, + WAIT_EVENT_CONTROL_FILE_WRITE_UPDATE, + WAIT_EVENT_COPY_FILE_READ, + WAIT_EVENT_COPY_FILE_WRITE, #ifdef _MLS_ WAIT_EVENT_CRYPT_KEY_MAP_READ, WAIT_EVENT_CRYPT_KEY_MAP_SYNC, WAIT_EVENT_CRYPT_KEY_MAP_WRITE, -#endif - WAIT_EVENT_DATA_FILE_EXTEND, - WAIT_EVENT_DATA_FILE_FLUSH, - WAIT_EVENT_DATA_FILE_IMMEDIATE_SYNC, - WAIT_EVENT_DATA_FILE_PREFETCH, - WAIT_EVENT_DATA_FILE_READ, - WAIT_EVENT_DATA_FILE_SYNC, - WAIT_EVENT_DATA_FILE_TRUNCATE, +#endif + WAIT_EVENT_DATA_FILE_EXTEND, + WAIT_EVENT_DATA_FILE_FLUSH, + WAIT_EVENT_DATA_FILE_IMMEDIATE_SYNC, + WAIT_EVENT_DATA_FILE_PREFETCH, + WAIT_EVENT_DATA_FILE_READ, + WAIT_EVENT_DATA_FILE_SYNC, + WAIT_EVENT_DATA_FILE_TRUNCATE, #ifdef _SHARDING_ - WAIT_EVENT_DATA_FILE_DEALLOC, + WAIT_EVENT_DATA_FILE_DEALLOC, #endif - WAIT_EVENT_DATA_FILE_WRITE, - WAIT_EVENT_DSM_FILL_ZERO_WRITE, - WAIT_EVENT_LOCK_FILE_ADDTODATADIR_READ, - WAIT_EVENT_LOCK_FILE_ADDTODATADIR_SYNC, - WAIT_EVENT_LOCK_FILE_ADDTODATADIR_WRITE, - WAIT_EVENT_LOCK_FILE_CREATE_READ, - WAIT_EVENT_LOCK_FILE_CREATE_SYNC, - WAIT_EVENT_LOCK_FILE_CREATE_WRITE, - WAIT_EVENT_LOCK_FILE_RECHECKDATADIR_READ, - WAIT_EVENT_LOGICAL_REWRITE_CHECKPOINT_SYNC, - WAIT_EVENT_LOGICAL_REWRITE_MAPPING_SYNC, - WAIT_EVENT_LOGICAL_REWRITE_MAPPING_WRITE, - WAIT_EVENT_LOGICAL_REWRITE_SYNC, - WAIT_EVENT_LOGICAL_REWRITE_TRUNCATE, - WAIT_EVENT_LOGICAL_REWRITE_WRITE, + WAIT_EVENT_DATA_FILE_WRITE, + WAIT_EVENT_DSM_FILL_ZERO_WRITE, + WAIT_EVENT_LOCK_FILE_ADDTODATADIR_READ, + WAIT_EVENT_LOCK_FILE_ADDTODATADIR_SYNC, + WAIT_EVENT_LOCK_FILE_ADDTODATADIR_WRITE, + WAIT_EVENT_LOCK_FILE_CREATE_READ, + WAIT_EVENT_LOCK_FILE_CREATE_SYNC, + WAIT_EVENT_LOCK_FILE_CREATE_WRITE, + WAIT_EVENT_LOCK_FILE_RECHECKDATADIR_READ, + WAIT_EVENT_LOGICAL_REWRITE_CHECKPOINT_SYNC, + WAIT_EVENT_LOGICAL_REWRITE_MAPPING_SYNC, + WAIT_EVENT_LOGICAL_REWRITE_MAPPING_WRITE, + WAIT_EVENT_LOGICAL_REWRITE_SYNC, + WAIT_EVENT_LOGICAL_REWRITE_TRUNCATE, + WAIT_EVENT_LOGICAL_REWRITE_WRITE, #ifdef _MLS_ WAIT_EVENT_REL_CRYPT_MAP_READ, WAIT_EVENT_REL_CRYPT_MAP_SYNC, WAIT_EVENT_REL_CRYPT_MAP_WRITE, #endif - WAIT_EVENT_RELATION_MAP_READ, - WAIT_EVENT_RELATION_MAP_SYNC, - WAIT_EVENT_RELATION_MAP_WRITE, - WAIT_EVENT_REORDER_BUFFER_READ, - WAIT_EVENT_REORDER_BUFFER_WRITE, - WAIT_EVENT_REORDER_LOGICAL_MAPPING_READ, - WAIT_EVENT_REPLICATION_SLOT_READ, - WAIT_EVENT_REPLICATION_SLOT_RESTORE_SYNC, - WAIT_EVENT_REPLICATION_SLOT_SYNC, - WAIT_EVENT_REPLICATION_SLOT_WRITE, - WAIT_EVENT_SLRU_FLUSH_SYNC, - WAIT_EVENT_SLRU_READ, - WAIT_EVENT_SLRU_SYNC, - WAIT_EVENT_SLRU_WRITE, - WAIT_EVENT_SNAPBUILD_READ, - WAIT_EVENT_SNAPBUILD_SYNC, - WAIT_EVENT_SNAPBUILD_WRITE, - WAIT_EVENT_TIMELINE_HISTORY_FILE_SYNC, - WAIT_EVENT_TIMELINE_HISTORY_FILE_WRITE, - WAIT_EVENT_TIMELINE_HISTORY_READ, - WAIT_EVENT_TIMELINE_HISTORY_SYNC, - WAIT_EVENT_TIMELINE_HISTORY_WRITE, - WAIT_EVENT_TWOPHASE_FILE_READ, - WAIT_EVENT_TWOPHASE_FILE_SYNC, - WAIT_EVENT_TWOPHASE_FILE_WRITE, - WAIT_EVENT_WALSENDER_TIMELINE_HISTORY_READ, - WAIT_EVENT_WAL_BOOTSTRAP_SYNC, - WAIT_EVENT_WAL_BOOTSTRAP_WRITE, - WAIT_EVENT_WAL_COPY_READ, - WAIT_EVENT_WAL_COPY_SYNC, - WAIT_EVENT_WAL_COPY_WRITE, 
- WAIT_EVENT_WAL_INIT_SYNC, - WAIT_EVENT_WAL_INIT_WRITE, - WAIT_EVENT_WAL_READ, - WAIT_EVENT_WAL_SYNC_METHOD_ASSIGN, - WAIT_EVENT_WAL_WRITE + WAIT_EVENT_RELATION_MAP_READ, + WAIT_EVENT_RELATION_MAP_SYNC, + WAIT_EVENT_RELATION_MAP_WRITE, + WAIT_EVENT_REORDER_BUFFER_READ, + WAIT_EVENT_REORDER_BUFFER_WRITE, + WAIT_EVENT_REORDER_LOGICAL_MAPPING_READ, + WAIT_EVENT_REPLICATION_SLOT_READ, + WAIT_EVENT_REPLICATION_SLOT_RESTORE_SYNC, + WAIT_EVENT_REPLICATION_SLOT_SYNC, + WAIT_EVENT_REPLICATION_SLOT_WRITE, + WAIT_EVENT_SLRU_FLUSH_SYNC, + WAIT_EVENT_SLRU_READ, + WAIT_EVENT_SLRU_SYNC, + WAIT_EVENT_SLRU_WRITE, + WAIT_EVENT_SNAPBUILD_READ, + WAIT_EVENT_SNAPBUILD_SYNC, + WAIT_EVENT_SNAPBUILD_WRITE, + WAIT_EVENT_TIMELINE_HISTORY_FILE_SYNC, + WAIT_EVENT_TIMELINE_HISTORY_FILE_WRITE, + WAIT_EVENT_TIMELINE_HISTORY_READ, + WAIT_EVENT_TIMELINE_HISTORY_SYNC, + WAIT_EVENT_TIMELINE_HISTORY_WRITE, + WAIT_EVENT_TWOPHASE_FILE_READ, + WAIT_EVENT_TWOPHASE_FILE_SYNC, + WAIT_EVENT_TWOPHASE_FILE_WRITE, + WAIT_EVENT_WALSENDER_TIMELINE_HISTORY_READ, + WAIT_EVENT_WAL_BOOTSTRAP_SYNC, + WAIT_EVENT_WAL_BOOTSTRAP_WRITE, + WAIT_EVENT_WAL_COPY_READ, + WAIT_EVENT_WAL_COPY_SYNC, + WAIT_EVENT_WAL_COPY_WRITE, + WAIT_EVENT_WAL_INIT_SYNC, + WAIT_EVENT_WAL_INIT_WRITE, + WAIT_EVENT_WAL_READ, + WAIT_EVENT_WAL_SYNC_METHOD_ASSIGN, + WAIT_EVENT_WAL_WRITE } WaitEventIO; /* ---------- @@ -943,11 +943,11 @@ typedef enum */ typedef enum ProgressCommandType { - PROGRESS_COMMAND_INVALID, - PROGRESS_COMMAND_VACUUM + PROGRESS_COMMAND_INVALID, + PROGRESS_COMMAND_VACUUM } ProgressCommandType; -#define PGSTAT_NUM_PROGRESS_PARAM 10 +#define PGSTAT_NUM_PROGRESS_PARAM 10 /* ---------- * Shared-memory data structures @@ -963,12 +963,12 @@ typedef enum ProgressCommandType */ typedef struct PgBackendSSLStatus { - /* Information about SSL connection */ - int ssl_bits; - bool ssl_compression; - char ssl_version[NAMEDATALEN]; /* MUST be null-terminated */ - char ssl_cipher[NAMEDATALEN]; /* MUST be null-terminated */ - char ssl_clientdn[NAMEDATALEN]; /* MUST be null-terminated */ + /* Information about SSL connection */ + int ssl_bits; + bool ssl_compression; + char ssl_version[NAMEDATALEN]; /* MUST be null-terminated */ + char ssl_cipher[NAMEDATALEN]; /* MUST be null-terminated */ + char ssl_clientdn[NAMEDATALEN]; /* MUST be null-terminated */ } PgBackendSSLStatus; @@ -986,66 +986,66 @@ typedef struct PgBackendSSLStatus */ typedef struct PgBackendStatus { - /* - * To avoid locking overhead, we use the following protocol: a backend - * increments st_changecount before modifying its entry, and again after - * finishing a modification. A would-be reader should note the value of - * st_changecount, copy the entry into private memory, then check - * st_changecount again. If the value hasn't changed, and if it's even, - * the copy is valid; otherwise start over. This makes updates cheap - * while reads are potentially expensive, but that's the tradeoff we want. - * - * The above protocol needs the memory barriers to ensure that the - * apparent order of execution is as it desires. Otherwise, for example, - * the CPU might rearrange the code so that st_changecount is incremented - * twice before the modification on a machine with weak memory ordering. - * This surprising result can lead to bugs. 
- */ - int st_changecount; - - /* The entry is valid iff st_procpid > 0, unused if st_procpid == 0 */ - int st_procpid; - - /* Type of backends */ - BackendType st_backendType; - - /* Times when current backend, transaction, and activity started */ - TimestampTz st_proc_start_timestamp; - TimestampTz st_xact_start_timestamp; - TimestampTz st_activity_start_timestamp; - TimestampTz st_state_start_timestamp; - - /* Database OID, owning user's OID, connection client address */ - Oid st_databaseid; - Oid st_userid; - SockAddr st_clientaddr; - char *st_clienthostname; /* MUST be null-terminated */ - - /* Information about SSL connection */ - bool st_ssl; - PgBackendSSLStatus *st_sslstatus; - - /* current state */ - BackendState st_state; - - /* application name; MUST be null-terminated */ - char *st_appname; - - /* current command string; MUST be null-terminated */ - char *st_activity; - - /* - * Command progress reporting. Any command which wishes can advertise - * that it is running by setting st_progress_command, - * st_progress_command_target, and st_progress_param[]. - * st_progress_command_target should be the OID of the relation which the - * command targets (we assume there's just one, as this is meant for - * utility commands), but the meaning of each element in the - * st_progress_param array is command-specific. - */ - ProgressCommandType st_progress_command; - Oid st_progress_command_target; - int64 st_progress_param[PGSTAT_NUM_PROGRESS_PARAM]; + /* + * To avoid locking overhead, we use the following protocol: a backend + * increments st_changecount before modifying its entry, and again after + * finishing a modification. A would-be reader should note the value of + * st_changecount, copy the entry into private memory, then check + * st_changecount again. If the value hasn't changed, and if it's even, + * the copy is valid; otherwise start over. This makes updates cheap + * while reads are potentially expensive, but that's the tradeoff we want. + * + * The above protocol needs the memory barriers to ensure that the + * apparent order of execution is as it desires. Otherwise, for example, + * the CPU might rearrange the code so that st_changecount is incremented + * twice before the modification on a machine with weak memory ordering. + * This surprising result can lead to bugs. + */ + int st_changecount; + + /* The entry is valid if st_procpid > 0, unused if st_procpid == 0 */ + int st_procpid; + + /* Type of backends */ + BackendType st_backendType; + + /* Times when current backend, transaction, and activity started */ + TimestampTz st_proc_start_timestamp; + TimestampTz st_xact_start_timestamp; + TimestampTz st_activity_start_timestamp; + TimestampTz st_state_start_timestamp; + + /* Database OID, owning user's OID, connection client address */ + Oid st_databaseid; + Oid st_userid; + SockAddr st_clientaddr; + char *st_clienthostname; /* MUST be null-terminated */ + + /* Information about SSL connection */ + bool st_ssl; + PgBackendSSLStatus *st_sslstatus; + + /* current state */ + BackendState st_state; + + /* application name; MUST be null-terminated */ + char *st_appname; + + /* current command string; MUST be null-terminated */ + char *st_activity; + + /* + * Command progress reporting. Any command which wishes can advertise + * that it is running by setting st_progress_command, + * st_progress_command_target, and st_progress_param[]. 
+ * st_progress_command_target should be the OID of the relation which the + * command targets (we assume there's just one, as this is meant for + * utility commands), but the meaning of each element in the + * st_progress_param array is command-specific. + */ + ProgressCommandType st_progress_command; + Oid st_progress_command_target; + int64 st_progress_param[PGSTAT_NUM_PROGRESS_PARAM]; } PgBackendStatus; /* @@ -1060,30 +1060,30 @@ typedef struct PgBackendStatus * need to be called before and after PgBackendStatus entries are copied into * private memory, respectively. */ -#define pgstat_increment_changecount_before(beentry) \ - do { \ - beentry->st_changecount++; \ - pg_write_barrier(); \ - } while (0) +#define pgstat_increment_changecount_before(beentry) \ + do { \ + beentry->st_changecount++; \ + pg_write_barrier(); \ + } while (0) #define pgstat_increment_changecount_after(beentry) \ - do { \ - pg_write_barrier(); \ - beentry->st_changecount++; \ - Assert((beentry->st_changecount & 1) == 0); \ - } while (0) - -#define pgstat_save_changecount_before(beentry, save_changecount) \ - do { \ - save_changecount = beentry->st_changecount; \ - pg_read_barrier(); \ - } while (0) - -#define pgstat_save_changecount_after(beentry, save_changecount) \ - do { \ - pg_read_barrier(); \ - save_changecount = beentry->st_changecount; \ - } while (0) + do { \ + pg_write_barrier(); \ + beentry->st_changecount++; \ + Assert((beentry->st_changecount & 1) == 0); \ + } while (0) + +#define pgstat_save_changecount_before(beentry, save_changecount) \ + do { \ + save_changecount = beentry->st_changecount; \ + pg_read_barrier(); \ + } while (0) + +#define pgstat_save_changecount_after(beentry, save_changecount) \ + do { \ + pg_read_barrier(); \ + save_changecount = beentry->st_changecount; \ + } while (0) /* ---------- * LocalPgBackendStatus @@ -1095,22 +1095,25 @@ typedef struct PgBackendStatus */ typedef struct LocalPgBackendStatus { - /* - * Local version of the backend status entry. - */ - PgBackendStatus backendStatus; - - /* - * The xid of the current transaction if available, InvalidTransactionId - * if not. - */ - TransactionId backend_xid; - - /* - * The xmin of the current session if available, InvalidTransactionId if - * not. - */ - TransactionId backend_xmin; + /* + * Local version of the backend status entry. + */ + PgBackendStatus backendStatus; + + /* + * The xid of the current transaction if available, InvalidTransactionId + * if not. + */ + TransactionId backend_xid; + + /* + * The xmin of the current session if available, InvalidTransactionId if + * not. + */ + TransactionId backend_xmin; + + /* copy of backend id */ + BackendId backend_id; } LocalPgBackendStatus; /* @@ -1118,15 +1121,15 @@ typedef struct LocalPgBackendStatus */ typedef struct PgStat_FunctionCallUsage { - /* Link to function's hashtable entry (must still be there at exit!) */ - /* NULL means we are not tracking the current function call */ - PgStat_FunctionCounts *fs; - /* Total time previously charged to function, as of function start */ - instr_time save_f_total_time; - /* Backend-wide total time as of function start */ - instr_time save_total; - /* system clock as of function start */ - instr_time f_start; + /* Link to function's hashtable entry (must still be there at exit!) 
*/ + /* NULL means we are not tracking the current function call */ + PgStat_FunctionCounts *fs; + /* Total time previously charged to function, as of function start */ + instr_time save_f_total_time; + /* Backend-wide total time as of function start */ + instr_time save_total; + /* system clock as of function start */ + instr_time f_start; } PgStat_FunctionCallUsage; @@ -1136,7 +1139,7 @@ typedef struct PgStat_FunctionCallUsage */ extern bool pgstat_track_activities; extern bool pgstat_track_counts; -extern int pgstat_track_functions; +extern int pgstat_track_functions; extern PGDLLIMPORT int pgstat_track_activity_query_size; extern char *pgstat_stat_directory; extern char *pgstat_stat_tmpname; @@ -1161,7 +1164,7 @@ extern Size BackendStatusShmemSize(void); extern void CreateSharedBackendStatus(void); extern void pgstat_init(void); -extern int pgstat_start(void); +extern int pgstat_start(void); extern void pgstat_reset_all(void); extern void allow_immediate_pgstat_restart(void); @@ -1187,10 +1190,10 @@ extern void pgstat_reset_single_counter(Oid objectid, PgStat_Single_Reset_Type t extern void pgstat_report_autovac(Oid dboid); extern void pgstat_report_vacuum(Oid tableoid, bool shared, - PgStat_Counter livetuples, PgStat_Counter deadtuples); + PgStat_Counter livetuples, PgStat_Counter deadtuples); extern void pgstat_report_analyze(Relation rel, - PgStat_Counter livetuples, PgStat_Counter deadtuples, - bool resetcounter); + PgStat_Counter livetuples, PgStat_Counter deadtuples, + bool resetcounter); extern void pgstat_report_recovery_conflict(int reason); extern void pgstat_report_deadlock(void); @@ -1206,14 +1209,14 @@ extern const char *pgstat_get_wait_event(uint32 wait_event_info); extern const char *pgstat_get_wait_event_type(uint32 wait_event_info); extern const char *pgstat_get_backend_current_activity(int pid, bool checkUser); extern const char *pgstat_get_crashed_backend_activity(int pid, char *buffer, - int buflen); + int buflen); extern const char *pgstat_get_backend_desc(BackendType backendType); extern void pgstat_progress_start_command(ProgressCommandType cmdtype, - Oid relid); + Oid relid); extern void pgstat_progress_update_param(int index, int64 val); extern void pgstat_progress_update_multi_param(int nparam, const int *index, - const int64 *val); + const int64 *val); extern void pgstat_progress_end_command(void); extern PgStat_TableStatus *find_tabstat_entry(Oid rel_id); @@ -1224,13 +1227,13 @@ extern void pgstat_initstats(Relation rel); /* ---------- * pgstat_report_wait_start() - * - * Called from places where server process needs to wait. This is called - * to report wait event information. The wait information is stored - * as 4-bytes where first byte represents the wait event class (type of - * wait, for different types of wait, refer WaitClass) and the next - * 3-bytes represent the actual wait event. Currently 2-bytes are used - * for wait event which is sufficient for current usage, 1-byte is - * reserved for future usage. + * Called from places where server process needs to wait. This is called + * to report wait event information. The wait information is stored + * as 4-bytes where first byte represents the wait event class (type of + * wait, for different types of wait, refer WaitClass) and the next + * 3-bytes represent the actual wait event. Currently 2-bytes are used + * for wait event which is sufficient for current usage, 1-byte is + * reserved for future usage. * * NB: this *must* be able to survive being called before MyProc has been * initialized. 
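/*
 * A minimal sketch (illustrative only, not part of this patch) of how the
 * wait_event_info encoding described above can be decomposed, assuming the
 * usual c.h integer typedefs.  The high byte carries the wait class (e.g.
 * 0x0A000000U for PG_WAIT_IO), the low two bytes identify the event within
 * its class, and one byte is reserved.  The helper names are hypothetical.
 */
static inline uint32
wait_event_class(uint32 wait_event_info)
{
	return wait_event_info & 0xFF000000U;	/* e.g. PG_WAIT_IO */
}

static inline uint16
wait_event_id(uint32 wait_event_info)
{
	return (uint16) (wait_event_info & 0x0000FFFFU);
}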
@@ -1239,22 +1242,22 @@ extern void pgstat_initstats(Relation rel); static inline void pgstat_report_wait_start(uint32 wait_event_info) { - volatile PGPROC *proc = MyProc; + volatile PGPROC *proc = MyProc; - if (!pgstat_track_activities || !proc) - return; + if (!pgstat_track_activities || !proc) + return; - /* - * Since this is a four-byte field which is always read and written as - * four-bytes, updates are atomic. - */ - proc->wait_event_info = wait_event_info; + /* + * Since this is a four-byte field which is always read and written as + * four-bytes, updates are atomic. + */ + proc->wait_event_info = wait_event_info; } /* ---------- * pgstat_report_wait_end() - * - * Called to report end of a wait. + * Called to report end of a wait. * * NB: this *must* be able to survive being called before MyProc has been * initialized. @@ -1263,59 +1266,59 @@ pgstat_report_wait_start(uint32 wait_event_info) static inline void pgstat_report_wait_end(void) { - volatile PGPROC *proc = MyProc; + volatile PGPROC *proc = MyProc; - if (!pgstat_track_activities || !proc) - return; + if (!pgstat_track_activities || !proc) + return; - /* - * Since this is a four-byte field which is always read and written as - * four-bytes, updates are atomic. - */ - proc->wait_event_info = 0; + /* + * Since this is a four-byte field which is always read and written as + * four-bytes, updates are atomic. + */ + proc->wait_event_info = 0; } /* nontransactional event counts are simple enough to inline */ -#define pgstat_count_heap_scan(rel) \ - do { \ - if ((rel)->pgstat_info != NULL) \ - (rel)->pgstat_info->t_counts.t_numscans++; \ - } while (0) -#define pgstat_count_heap_getnext(rel) \ - do { \ - if ((rel)->pgstat_info != NULL) \ - (rel)->pgstat_info->t_counts.t_tuples_returned++; \ - } while (0) -#define pgstat_count_heap_fetch(rel) \ - do { \ - if ((rel)->pgstat_info != NULL) \ - (rel)->pgstat_info->t_counts.t_tuples_fetched++; \ - } while (0) -#define pgstat_count_index_scan(rel) \ - do { \ - if ((rel)->pgstat_info != NULL) \ - (rel)->pgstat_info->t_counts.t_numscans++; \ - } while (0) -#define pgstat_count_index_tuples(rel, n) \ - do { \ - if ((rel)->pgstat_info != NULL) \ - (rel)->pgstat_info->t_counts.t_tuples_returned += (n); \ - } while (0) -#define pgstat_count_buffer_read(rel) \ - do { \ - if ((rel)->pgstat_info != NULL) \ - (rel)->pgstat_info->t_counts.t_blocks_fetched++; \ - } while (0) -#define pgstat_count_buffer_hit(rel) \ - do { \ - if ((rel)->pgstat_info != NULL) \ - (rel)->pgstat_info->t_counts.t_blocks_hit++; \ - } while (0) -#define pgstat_count_buffer_read_time(n) \ - (pgStatBlockReadTime += (n)) -#define pgstat_count_buffer_write_time(n) \ - (pgStatBlockWriteTime += (n)) +#define pgstat_count_heap_scan(rel) \ + do { \ + if ((rel)->pgstat_info != NULL) \ + (rel)->pgstat_info->t_counts.t_numscans++; \ + } while (0) +#define pgstat_count_heap_getnext(rel) \ + do { \ + if ((rel)->pgstat_info != NULL) \ + (rel)->pgstat_info->t_counts.t_tuples_returned++; \ + } while (0) +#define pgstat_count_heap_fetch(rel) \ + do { \ + if ((rel)->pgstat_info != NULL) \ + (rel)->pgstat_info->t_counts.t_tuples_fetched++; \ + } while (0) +#define pgstat_count_index_scan(rel) \ + do { \ + if ((rel)->pgstat_info != NULL) \ + (rel)->pgstat_info->t_counts.t_numscans++; \ + } while (0) +#define pgstat_count_index_tuples(rel, n) \ + do { \ + if ((rel)->pgstat_info != NULL) \ + (rel)->pgstat_info->t_counts.t_tuples_returned += (n); \ + } while (0) +#define pgstat_count_buffer_read(rel) \ + do { \ + if ((rel)->pgstat_info != NULL) \ + 
(rel)->pgstat_info->t_counts.t_blocks_fetched++; \ + } while (0) +#define pgstat_count_buffer_hit(rel) \ + do { \ + if ((rel)->pgstat_info != NULL) \ + (rel)->pgstat_info->t_counts.t_blocks_hit++; \ + } while (0) +#define pgstat_count_buffer_read_time(n) \ + (pgStatBlockReadTime += (n)) +#define pgstat_count_buffer_write_time(n) \ + (pgStatBlockWriteTime += (n)) extern void pgstat_count_heap_insert(Relation rel, PgStat_Counter n); extern void pgstat_count_heap_update(Relation rel, bool hot); @@ -1329,9 +1332,9 @@ extern void pgstat_count_remote_delete(Relation rel, int n); #endif extern void pgstat_init_function_usage(FunctionCallInfoData *fcinfo, - PgStat_FunctionCallUsage *fcu); + PgStat_FunctionCallUsage *fcu); extern void pgstat_end_function_usage(PgStat_FunctionCallUsage *fcu, - bool finalize); + bool finalize); extern void AtEOXact_PgStat(bool isCommit); extern void AtEOSubXact_PgStat(bool isCommit, int nestDepth); @@ -1340,9 +1343,9 @@ extern void AtPrepare_PgStat(void); extern void PostPrepare_PgStat(void); extern void pgstat_twophase_postcommit(TransactionId xid, uint16 info, - void *recdata, uint32 len); + void *recdata, uint32 len); extern void pgstat_twophase_postabort(TransactionId xid, uint16 info, - void *recdata, uint32 len); + void *recdata, uint32 len); extern void pgstat_send_archiver(const char *xlog, bool failed); extern void pgstat_send_bgwriter(void); @@ -1357,8 +1360,8 @@ extern PgStat_StatTabEntry *pgstat_fetch_stat_tabentry(Oid relid); extern PgBackendStatus *pgstat_fetch_stat_beentry(int beid); extern LocalPgBackendStatus *pgstat_fetch_stat_local_beentry(int beid); extern PgStat_StatFuncEntry *pgstat_fetch_stat_funcentry(Oid funcid); -extern int pgstat_fetch_stat_numbackends(void); +extern int pgstat_fetch_stat_numbackends(void); extern PgStat_ArchiverStats *pgstat_fetch_stat_archiver(void); extern PgStat_GlobalStats *pgstat_fetch_global(void); -#endif /* PGSTAT_H */ +#endif /* PGSTAT_H */ diff --git a/src/include/pgxc/pgxc.h b/src/include/pgxc/pgxc.h index b69d747b..6c5abcf4 100644 --- a/src/include/pgxc/pgxc.h +++ b/src/include/pgxc/pgxc.h @@ -105,6 +105,9 @@ extern uint32 PGXCNodeIdentifier; extern char *PGXCClusterName; extern char *PGXCMainClusterName; extern char *PGXCDefaultClusterName; +#ifdef __TBASE__ +extern char PGXCSessionId[NAMEDATALEN]; +#endif extern Datum xc_lockForBackupKey1; diff --git a/src/include/pgxc/pgxcnode.h b/src/include/pgxc/pgxcnode.h index 15c4ef46..f79bc2b8 100644 --- a/src/include/pgxc/pgxcnode.h +++ b/src/include/pgxc/pgxcnode.h @@ -285,12 +285,15 @@ extern void pgxc_print_pending_data(PGXCNodeHandle *handle, bool reset); #ifdef __TBASE__ void add_error_message_from_combiner(PGXCNodeHandle *handle, void *combiner_input); -inline void pgxc_set_coordinator_proc_pid(int proc_pid); -inline int pgxc_get_coordinator_proc_pid(void); -inline void pgxc_set_coordinator_proc_vxid(TransactionId proc_vxid); -inline TransactionId pgxc_get_coordinator_proc_vxid(void); +void pgxc_set_coordinator_proc_pid(int proc_pid); +int pgxc_get_coordinator_proc_pid(void); +void pgxc_set_coordinator_proc_vxid(TransactionId proc_vxid); +TransactionId pgxc_get_coordinator_proc_vxid(void); inline char* find_ddl_leader_cn(void); inline bool is_ddl_leader_cn(char *leader_cn); +extern int pgxc_node_send_sessionid(PGXCNodeHandle * handle); +extern void SerializeSessionId(Size maxsize, char *start_address); +extern void StartParallelWorkerSessionId(char *address); #endif #ifdef __AUDIT__ diff --git a/src/include/pgxc/squeue.h b/src/include/pgxc/squeue.h index 
a48ba09b..3f0a6408 100644 --- a/src/include/pgxc/squeue.h +++ b/src/include/pgxc/squeue.h @@ -2,13 +2,13 @@ * * barrier.h * - * Definitions for the shared queue handling + * Definitions for the shared queue handling * * * Copyright (c) 2012-2014, TransLattice, Inc. * * IDENTIFICATION - * $$ + * $$ * *------------------------------------------------------------------------- */ @@ -74,24 +74,24 @@ extern void SharedQueueAcquire(const char *sqname, int ncons, bool parallelSend, extern void SharedQueueAcquire(const char *sqname, int ncons); #endif extern SharedQueue SharedQueueBind(const char *sqname, List *consNodes, - List *distNodes, int *myindex, int *consMap + List *distNodes, int *myindex, int *consMap #ifdef __TBASE__ - , - DataPumpSender *sender + , + DataPumpSender *sender #endif - ); + ); extern void SharedQueueUnBind(SharedQueue squeue, bool failed); extern void SharedQueueRelease(const char *sqname); extern void SharedQueuesCleanup(int code, Datum arg); -extern int SharedQueueFinish(SharedQueue squeue, TupleDesc tupDesc, - Tuplestorestate **tuplestore); +extern int SharedQueueFinish(SharedQueue squeue, TupleDesc tupDesc, + Tuplestorestate **tuplestore); extern void SharedQueueWrite(SharedQueue squeue, int consumerIdx, - TupleTableSlot *slot, Tuplestorestate **tuplestore, - MemoryContext tmpcxt); + TupleTableSlot *slot, Tuplestorestate **tuplestore, + MemoryContext tmpcxt); extern bool SharedQueueRead(SharedQueue squeue, int consumerIdx, - TupleTableSlot *slot, bool canwait); + TupleTableSlot *slot, bool canwait); extern void SharedQueueDisconnectConsumer(const char *sqname); extern void SharedQueueReset(SharedQueue squeue, int consumerIdx); extern void SharedQueueResetNotConnected(SharedQueue squeue); @@ -100,35 +100,35 @@ extern bool SharedQueueWaitOnProducerLatch(SharedQueue squeue, long timeout); #ifdef __TBASE__ typedef enum { - DataPumpOK = 0, - DataPumpSndError_no_socket = -1, - DataPumpSndError_no_space = -2, - DataPumpSndError_io_error = -3, - DataPumpSndError_node_error = -4, - DataPumpSndError_bad_status = -5, - DataPumpSndError_unreachable_node = -6, - DataPumpConvert_error = -7 + DataPumpOK = 0, + DataPumpSndError_no_socket = -1, + DataPumpSndError_no_space = -2, + DataPumpSndError_io_error = -3, + DataPumpSndError_node_error = -4, + DataPumpSndError_bad_status = -5, + DataPumpSndError_unreachable_node = -6, + DataPumpConvert_error = -7 }DataPumpSndError; #define DATAPUMP_UNREACHABLE_NODE_FD (-2) typedef enum { - ConvertRunning, - ConvertListenError, - ConvertAcceptError, - ConvertRecvNodeidError, - ConvertRecvNodeindexError, - ConvertRecvSockfdError, - ConvertSetSockfdError, - ConvertExit + ConvertRunning, + ConvertListenError, + ConvertAcceptError, + ConvertRecvNodeidError, + ConvertRecvNodeindexError, + ConvertRecvSockfdError, + ConvertSetSockfdError, + ConvertExit }ConvertStatus; typedef enum { - Squeue_Consumer, - Squeue_Producer, - Squeue_None + Squeue_Consumer, + Squeue_Producer, + Squeue_None } SqueueRole; extern bool IsSqueueProducer(void); @@ -196,15 +196,15 @@ extern void RemoteSubplanSigusr2Handler(SIGNAL_ARGS); #ifdef __TBASE__ enum MT_thr_detach { - MT_THR_JOINABLE, - MT_THR_DETACHED + MT_THR_JOINABLE, + MT_THR_DETACHED }; typedef struct { - int m_cnt; - pthread_mutex_t m_mutex; - pthread_cond_t m_cond; + int m_cnt; + pthread_mutex_t m_mutex; + pthread_cond_t m_cond; }ThreadSema; extern void ThreadSemaInit(ThreadSema *sema, int32 init); @@ -214,11 +214,11 @@ extern void ThreadSemaUp(ThreadSema *sema); typedef struct { - void **m_List; /*循环队列数组*/ - uint32 
m_Length; /*队列队列长度*/ - slock_t m_lock; /*保护下面的两个变量*/ - volatile uint32 m_Head; /*队列头部,数据插入往头部插入,头部加一等于尾则队列满*/ - volatile uint32 m_Tail; /*队列尾部,尾部等于头部,则队列为空*/ + void **m_List; /*循环队列数组*/ + uint32 m_Length; /*队列队列长度*/ + slock_t m_lock; /*保护下面的两个变量*/ + volatile uint32 m_Head; /*队列头部,数据插入往头部插入,头部加一等于尾则队列满*/ + volatile uint32 m_Tail; /*队列尾部,尾部等于头部,则队列为空*/ }PGPipe; extern PGPipe* CreatePipe(uint32 size); extern void DestoryPipe(PGPipe *pPipe); @@ -226,10 +226,11 @@ extern void *PipeGet(PGPipe *pPipe); extern int PipePut(PGPipe *pPipe, void *p); extern bool PipeIsFull(PGPipe *pPipe); extern bool IsEmpty(PGPipe *pPipe); -extern int PipeLength(PGPipe *pPipe); +extern int PipeLength(PGPipe *pPipe); extern int32 CreateThread(void *(*f) (void *), void *arg, int32 mode); +extern const char *SqueueName(SharedQueue sq); #endif diff --git a/src/include/utils/portal.h b/src/include/utils/portal.h index 5d039875..d662e3f6 100644 --- a/src/include/utils/portal.h +++ b/src/include/utils/portal.h @@ -282,6 +282,12 @@ typedef struct PortalData #define PortalGetQueryDesc(portal) ((portal)->queryDesc) #define PortalGetHeapMemory(portal) ((portal)->heap) +/* Hook for plugins to get control after PortalStart() */ +typedef void (*PortalStart_hook_type) (Portal portal); +extern PGDLLIMPORT PortalStart_hook_type PortalStart_hook; +/* Hook for plugins to get control before PortalDrop() */ +typedef void (*PortalDrop_hook_type) (Portal portal); +extern PGDLLIMPORT PortalDrop_hook_type PortalDrop_hook; /* Prototypes for functions in utils/mmgr/portalmem.c */ extern void EnablePortalManager(void); From d4ba06ee4d12b3c901d4a857411a653c14a265c5 Mon Sep 17 00:00:00 2001 From: andrelin Date: Mon, 26 Apr 2021 11:12:01 +0800 Subject: [PATCH 150/578] Support showing xc_node_id in parallel scan by setting PGXCNodeIdentifier for parallel workers when they started same operation like PGXCNodeId. TAPD: http://tapd.oa.com/TBase_Oracle_Migration/bugtrace/bugs/view/1020421696085995445 --- src/backend/access/transam/parallel.c | 11 +++++++++++ .../regress/expected/select_parallel_4.out | 19 +++++++++++++++++++ src/test/regress/sql/select_parallel.sql | 6 ++++++ 3 files changed, 36 insertions(+) diff --git a/src/backend/access/transam/parallel.c b/src/backend/access/transam/parallel.c index b67873e8..d6108a3a 100644 --- a/src/backend/access/transam/parallel.c +++ b/src/backend/access/transam/parallel.c @@ -38,7 +38,10 @@ #include "utils/resowner.h" #include "utils/snapmgr.h" #ifdef __TBASE__ +#include "catalog/pg_collation.h" #include "pgxc/squeue.h" +#include "utils/formatting.h" +#include "utils/lsyscache.h" #endif /* @@ -1102,6 +1105,14 @@ ParallelWorkerMain(Datum main_arg) StartTransactionCommand(); /* Initialize XL executor. This must be done inside a transaction block. 
*/ InitMultinodeExecutor(false); + /* set PGXCNodeIdentifier for workers */ + if (PGXCNodeIdentifier == 0) + { + char *node_name; + node_name = str_tolower(PGXCNodeName, strlen(PGXCNodeName), DEFAULT_COLLATION_OID); + PGXCNodeIdentifier = get_pgxc_node_id(get_pgxc_nodeoid(node_name)); + pfree(node_name); + } CommitTransactionCommand(); /* diff --git a/src/test/regress/expected/select_parallel_4.out b/src/test/regress/expected/select_parallel_4.out index 4d264b26..6bc02325 100644 --- a/src/test/regress/expected/select_parallel_4.out +++ b/src/test/regress/expected/select_parallel_4.out @@ -360,6 +360,25 @@ EXPLAIN (analyze, timing off, summary off, costs off) SELECT * FROM tenk1; Node/s: datanode_1, datanode_2 (2 rows) +-- make sure identifier was set in workers +CREATE TABLE t_worker_identifier (a int); +INSERT INTO t_worker_identifier values(1); +EXPLAIN (costs off) SELECT xc_node_id != 0 FROM t_worker_identifier; + QUERY PLAN +------------------------------------------------------ + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Gather + Workers Planned: 3 + -> Parallel Seq Scan on t_worker_identifier +(5 rows) + +SELECT xc_node_id != 0 FROM t_worker_identifier; + ?column? +---------- + t +(1 row) + -- provoke error in worker select stringu1::int2 from tenk1 where unique1 = 1; ERROR: invalid input syntax for integer: "BAAAAA" diff --git a/src/test/regress/sql/select_parallel.sql b/src/test/regress/sql/select_parallel.sql index 70d0f0fb..d2cca20f 100644 --- a/src/test/regress/sql/select_parallel.sql +++ b/src/test/regress/sql/select_parallel.sql @@ -136,6 +136,12 @@ EXPLAIN (timing off, summary off, costs off) SELECT * FROM tenk1; EXPLAIN (analyze, timing off, summary off, costs off) SELECT * FROM tenk1; +-- make sure identifier was set in workers +CREATE TABLE t_worker_identifier (a int); +INSERT INTO t_worker_identifier values(1); +EXPLAIN (costs off) SELECT xc_node_id != 0 FROM t_worker_identifier; +SELECT xc_node_id != 0 FROM t_worker_identifier; + -- provoke error in worker select stringu1::int2 from tenk1 where unique1 = 1; From 0f174f369abdef3a11c6156df7c565379887c135 Mon Sep 17 00:00:00 2001 From: andrelin Date: Tue, 27 Apr 2021 17:04:18 +0800 Subject: [PATCH 151/578] Remote subplan should distinguish params from initplan ro subplan (merge request !296) Squash merge branch 'andrelin/params' into 'Tbase_v2.15.19' * Use enum to make code more readable * Remote subplan should distinguish params from initplan or subplan --- src/backend/nodes/copyfuncs.c | 1 + src/backend/nodes/outfuncs.c | 1 + src/backend/nodes/readfuncs.c | 1 + src/backend/optimizer/plan/planner.c | 1 + src/backend/optimizer/plan/subselect.c | 85 ++++++++++++++++++++++++++ src/backend/pgxc/pool/execRemote.c | 8 ++- src/backend/tcop/pquery.c | 21 +++++-- src/include/optimizer/subselect.h | 1 + src/include/pgxc/execRemote.h | 10 +++ src/include/pgxc/planner.h | 2 + 10 files changed, 124 insertions(+), 7 deletions(-) diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c index 3d4e8a68..c2885241 100644 --- a/src/backend/nodes/copyfuncs.c +++ b/src/backend/nodes/copyfuncs.c @@ -1420,6 +1420,7 @@ _copyRemoteSubplan(const RemoteSubplan *from) COPY_SCALAR_FIELD(unique); #ifdef __TBASE__ COPY_SCALAR_FIELD(parallelWorkerSendTuple); + COPY_BITMAPSET_FIELD(initPlanParams); #endif return newnode; } diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c index 7d7a9704..f550ed23 100644 --- a/src/backend/nodes/outfuncs.c +++ b/src/backend/nodes/outfuncs.c @@ -1697,6 +1697,7 
@@ _outRemoteSubplan(StringInfo str, const RemoteSubplan *node) WRITE_STRING_FIELD(cursor); WRITE_INT_FIELD(unique); WRITE_BOOL_FIELD(parallelWorkerSendTuple); + WRITE_BITMAPSET_FIELD(initPlanParams); #ifdef __TBASE__ if (IS_PGXC_COORDINATOR && !g_set_global_snapshot) diff --git a/src/backend/nodes/readfuncs.c b/src/backend/nodes/readfuncs.c index 7207a98c..d653fbf3 100644 --- a/src/backend/nodes/readfuncs.c +++ b/src/backend/nodes/readfuncs.c @@ -3776,6 +3776,7 @@ _readRemoteSubplan(void) READ_STRING_FIELD(cursor); READ_INT_FIELD(unique); READ_BOOL_FIELD(parallelWorkerSendTuple); + READ_BITMAPSET_FIELD(initPlanParams); READ_DONE(); } diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index b909ad6a..8c62e688 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -403,6 +403,7 @@ standard_planner(Query *parse, int cursorOptions, ParamListInfo boundParams) top_plan = (Plan *) make_remotesubplan(root, top_plan, NULL, root->distribution, root->sort_pathkeys); + SS_remote_attach_initplans(root, top_plan); remote_subplan_depth--; } #endif diff --git a/src/backend/optimizer/plan/subselect.c b/src/backend/optimizer/plan/subselect.c index 3e3339f8..0f24fa9d 100644 --- a/src/backend/optimizer/plan/subselect.c +++ b/src/backend/optimizer/plan/subselect.c @@ -5598,3 +5598,88 @@ SS_make_initplan_from_plan(PlannerInfo *root, /* Set costs of SubPlan using info from the plan tree */ cost_subplan(subroot, node, plan); } +/* + * SS_remote_attach_initplans + * + * recursively look into a plantree, find any RemoteSubplan and + * attach params id that generated from init-plan of this query. + */ +void +SS_remote_attach_initplans(PlannerInfo *root, Plan *plan) +{ + ListCell *lc; + + if (plan == NULL) + return; + + if (IsA(plan, RemoteSubplan)) + { + ListCell *plan_lc, *param_lc; + RemoteSubplan *rsplan = (RemoteSubplan *) plan; + Assert(rsplan->initPlanParams == NULL); + foreach(plan_lc, root->init_plans) + { + SubPlan *initplan = (SubPlan *) lfirst(plan_lc); + foreach(param_lc, initplan->setParam) + { + rsplan->initPlanParams = + bms_add_member(rsplan->initPlanParams, lfirst_int(param_lc)); + } + } + } + + switch (nodeTag(plan)) + { + case T_SubqueryScan: + { + SubqueryScan *sscan = (SubqueryScan *) plan; + RelOptInfo *rel; + + rel = find_base_rel(root, sscan->scan.scanrelid); + SS_remote_attach_initplans(rel->subroot, sscan->subplan); + } + break; + case T_CustomScan: + { + foreach(lc, ((CustomScan *) plan)->custom_plans) + SS_remote_attach_initplans(root, (Plan *) lfirst(lc)); + } + break; + case T_ModifyTable: + { + foreach(lc, ((ModifyTable *) plan)->plans) + SS_remote_attach_initplans(root, (Plan *) lfirst(lc)); + } + break; + case T_Append: + { + foreach(lc, ((Append *) plan)->appendplans) + SS_remote_attach_initplans(root, (Plan *) lfirst(lc)); + } + break; + case T_MergeAppend: + { + foreach(lc, ((MergeAppend *) plan)->mergeplans) + SS_remote_attach_initplans(root, (Plan *) lfirst(lc)); + } + break; + case T_BitmapAnd: + { + foreach(lc, ((BitmapAnd *) plan)->bitmapplans) + SS_remote_attach_initplans(root, (Plan *) lfirst(lc)); + } + break; + case T_BitmapOr: + { + foreach(lc, ((BitmapOr *) plan)->bitmapplans) + SS_remote_attach_initplans(root, (Plan *) lfirst(lc)); + } + break; + default: + break; + } + + /* Process left and right child plans, if any */ + SS_remote_attach_initplans(root, plan->lefttree); + SS_remote_attach_initplans(root, plan->righttree); +} \ No newline at end of file diff --git 
a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 86cafb0f..52850951 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -10208,12 +10208,13 @@ ExecInitRemoteSubplan(RemoteSubplan *node, EState *estate, int eflags) */ if (OidIsValid(param->ptype)) { - rstmt.remoteparams[paramno].paramused = 1; + rstmt.remoteparams[paramno].paramused = + bms_is_member(i, node->initPlanParams) ? REMOTE_PARAM_INITPLAN : REMOTE_PARAM_SUBPLAN; rstmt.remoteparams[paramno].paramtype = param->ptype; } else { - rstmt.remoteparams[paramno].paramused = 0; + rstmt.remoteparams[paramno].paramused = REMOTE_PARAM_UNUSED; rstmt.remoteparams[paramno].paramtype = INT4OID; } @@ -10237,7 +10238,8 @@ ExecInitRemoteSubplan(RemoteSubplan *node, EState *estate, int eflags) rstmt.remoteparams[paramno].paramkind = PARAM_EXEC; rstmt.remoteparams[paramno].paramid = i; rstmt.remoteparams[paramno].paramtype = prmdata->ptype; - rstmt.remoteparams[paramno].paramused = 1; + rstmt.remoteparams[paramno].paramused = + bms_is_member(i, node->initPlanParams) ? REMOTE_PARAM_INITPLAN : REMOTE_PARAM_SUBPLAN; /* Will scan plan tree to find out data type of the param */ if (prmdata->ptype == InvalidOid) defineParams = bms_add_member(defineParams, i); diff --git a/src/backend/tcop/pquery.c b/src/backend/tcop/pquery.c index 2064e594..29c53160 100644 --- a/src/backend/tcop/pquery.c +++ b/src/backend/tcop/pquery.c @@ -676,6 +676,10 @@ PortalStart(Portal portal, ParamListInfo params, { #ifdef XCP case PORTAL_DISTRIBUTED: + { + int i; + bool paramNeedPassDown = false; + /* No special ability is needed */ eflags = 0; /* Must set snapshot before starting executor. */ @@ -726,9 +730,18 @@ PortalStart(Portal portal, ParamListInfo params, * is not supported in SharedQueue mode. Force to do it traditionally. 
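 *
 * For illustration (hypothetical tables t1 and t2, not from the regression
 * suite): a param produced by an uncorrelated initplan is computed once and
 * can still be shipped under the SharedQueue model, e.g.
 *     SELECT * FROM t1 WHERE a > (SELECT max(a) FROM t2);
 * whereas a PARAM_EXEC param coming from a correlated subplan changes for
 * every outer row and forces the traditional pass-down path, e.g.
 *     SELECT * FROM t1 WHERE a > (SELECT max(a) FROM t2 WHERE t2.b = t1.b);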
*/ #ifdef __TBASE__ - if ((!paramPassDown && queryDesc->plannedstmt->nParamRemote > 0 && - queryDesc->plannedstmt->remoteparams[queryDesc->plannedstmt->nParamRemote-1].paramkind == PARAM_EXEC) || - queryDesc->epqContext != NULL) + for (i = 0; i < queryDesc->plannedstmt->nParamRemote; i++) + { + RemoteParam *rparam = &queryDesc->plannedstmt->remoteparams[i]; + if (rparam->paramkind == PARAM_EXEC && + rparam->paramused != REMOTE_PARAM_INITPLAN) /* if it's from initplan, still work with shared queue */ + { + paramNeedPassDown = true; + break; + } + } + + if ((!paramPassDown && paramNeedPassDown) || queryDesc->epqContext != NULL) #else if (queryDesc->plannedstmt->nParamRemote > 0 && queryDesc->plannedstmt->remoteparams[queryDesc->plannedstmt->nParamRemote-1].paramkind == PARAM_EXEC) @@ -737,7 +750,6 @@ PortalStart(Portal portal, ParamListInfo params, int *consMap; int len; ListCell *lc; - int i; Locator *locator; Oid keytype; DestReceiver *dest; @@ -983,6 +995,7 @@ PortalStart(Portal portal, ParamListInfo params, portal->portalPos = 0; PopActiveSnapshot(); + } break; #endif diff --git a/src/include/optimizer/subselect.h b/src/include/optimizer/subselect.h index 47ba77f5..bfba85b3 100644 --- a/src/include/optimizer/subselect.h +++ b/src/include/optimizer/subselect.h @@ -120,6 +120,7 @@ extern Param *assign_nestloop_param_placeholdervar(PlannerInfo *root, PlaceHolderVar *phv); extern int SS_assign_special_param(PlannerInfo *root); +extern void SS_remote_attach_initplans(PlannerInfo *root, Plan *plan); #ifdef __TBASE__ extern bool has_correlation_in_funcexpr_rte(List *rtable); #endif diff --git a/src/include/pgxc/execRemote.h b/src/include/pgxc/execRemote.h index 8332a217..60910919 100644 --- a/src/include/pgxc/execRemote.h +++ b/src/include/pgxc/execRemote.h @@ -88,6 +88,16 @@ typedef enum REMOTE_COPY_TUPLESTORE /* Store data in tuplestore */ } RemoteCopyType; +/* + * Type of remote param from init-plan or subplan + */ +typedef enum +{ + REMOTE_PARAM_UNUSED, + REMOTE_PARAM_INITPLAN, + REMOTE_PARAM_SUBPLAN +} RemoteParamType; + /* Combines results of INSERT statements using multiple values */ typedef struct CombineTag { diff --git a/src/include/pgxc/planner.h b/src/include/pgxc/planner.h index cb221759..c868136a 100644 --- a/src/include/pgxc/planner.h +++ b/src/include/pgxc/planner.h @@ -253,6 +253,8 @@ typedef struct * directly without gather node? */ bool parallelWorkerSendTuple; + /* params that generated by initplan */ + Bitmapset *initPlanParams; #endif } RemoteSubplan; From 89038e41a8b3f1b1a9d215b6c47ec463ce14a0d8 Mon Sep 17 00:00:00 2001 From: andrelin Date: Tue, 1 Jun 2021 20:59:18 +0800 Subject: [PATCH 152/578] Support tablesample for partition table tapd: http://tapd.oa.com/TBase_Oracle_Migration/bugtrace/bugs/view/1020421696086883685 --- src/backend/executor/nodeSamplescan.c | 15 +++++++++++++++ src/backend/optimizer/plan/createplan.c | 1 + 2 files changed, 16 insertions(+) diff --git a/src/backend/executor/nodeSamplescan.c b/src/backend/executor/nodeSamplescan.c index 4738beb1..9be86efe 100644 --- a/src/backend/executor/nodeSamplescan.c +++ b/src/backend/executor/nodeSamplescan.c @@ -128,9 +128,24 @@ InitScanRelation(SampleScanState *node, EState *estate, int eflags) * get the relation object id from the relid'th entry in the range table, * open that relation and acquire appropriate lock on it. 
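 *
 * For example (hypothetical interval-partitioned table, shown only to
 * illustrate the branch added below): for a statement such as
 *     SELECT count(*) FROM orders_part TABLESAMPLE SYSTEM (1);
 * the sample scan opens the selected child partition
 * (via ExecOpenScanRelationPartition) instead of the parent relation.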
 */
+#ifdef __TBASE__
+	/* if interval partition, scan child table instead */
+	if(((SeqScan *) node->ss.ps.plan)->ispartchild)
+	{
+		currentRelation = ExecOpenScanRelationPartition(estate,
+						  ((SeqScan *) node->ss.ps.plan)->scanrelid,
+						  eflags,
+						  ((SeqScan *) node->ss.ps.plan)->childidx);
+	}
+	else
+	{
+#endif
 	currentRelation = ExecOpenScanRelation(estate,
 						  ((SampleScan *) node->ss.ps.plan)->scan.scanrelid,
 						  eflags);
+#ifdef __TBASE__
+	}
+#endif
 #ifdef _MLS_
 	mls_check_datamask_need_passby((ScanState*)node, currentRelation->rd_id);
 #endif
diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c
index e4ccad0c..1924a389 100644
--- a/src/backend/optimizer/plan/createplan.c
+++ b/src/backend/optimizer/plan/createplan.c
@@ -920,6 +920,7 @@ create_scan_plan(PlannerInfo *root, Path *best_path, int flags)
 	switch(nodeTag(child))
 	{
 		case T_SeqScan:
+		case T_SampleScan:
 			break;
 		case T_IndexScan:
 		{

From 275a095b4a78945188c9c20147a3ebe08feb7263 Mon Sep 17 00:00:00 2001
From: ericxwu
Date: Wed, 5 May 2021 14:12:00 +0800
Subject: [PATCH 153/578] Fix some mistakes in remote subplan cost calculation

1. Remove the parallel workers factor when calculating the cost of a remote
subplan.

A bug was introduced when the parallel cost of a remote subplan was taken
into account: parallel_workers is 0 when parallelism is not enabled for the
node, which leads to a remote subplan run cost of 0 in non-parallel mode.
Since this is a TBase-specific execution model and we will refactor all
parallel optimization/execution logic later to catch up with Postgres, we
simply remove the nworkers factor here.

2. Consider the tuple replication factor for a better estimate of the
distribution cost.

Currently the path->rows popped up from the baserel is divided into
per-datanode scope. In this module, the rows of a replicated distribution
node should be multiplied by the number of nodes.

There is another big issue: we mixed up the distribution type usage between
LOCATOR_TYPE_REPLICATED (for replicated distribution) and LOCATOR_TYPE_NONE
(for CN gather distribution). Will refactor this later.
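As a rough standalone illustration of the revised arithmetic (all numbers,
including network_byte_cost, are made up for this example; cpu_operator_cost
uses the usual planner default):

    #include <stdio.h>

    int
    main(void)
    {
        double cpu_operator_cost = 0.0025;  /* default planner cost parameter */
        double network_byte_cost = 0.001;   /* assumed value, illustration only */
        double tuples = 1000.0;
        int    width = 40;
        int    replication = 2;             /* replicated across two datanodes */

        /* rows now reflect the replication factor */
        double rows = tuples * replication;

        /* run cost no longer multiplies by nworkers */
        double run_cost = 2 * cpu_operator_cost * tuples
                        + network_byte_cost * tuples * width * replication;

        printf("rows=%.0f run_cost=%.2f\n", rows, run_cost);
        return 0;
    }

With these numbers the remote subplan keeps a non-zero run cost (85.00) even
in non-parallel mode, and its row estimate becomes 2000.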
http://tapd.oa.com/pgxz/bugtrace/bugs/view?bug_id=1010092131087396641 --- src/backend/executor/execParallel.c | 4 +- src/backend/optimizer/path/costsize.c | 9 +- src/backend/optimizer/util/pathnode.c | 108 +++--------------- src/backend/pgxc/locator/locator.c | 19 +++ src/include/optimizer/cost.h | 3 +- src/include/pgxc/locator.h | 1 + src/test/regress/expected/foreign_key_2.out | 13 ++- src/test/regress/expected/join_3.out | 37 +++--- src/test/regress/expected/rowsecurity_1.out | 79 +++++++------ src/test/regress/expected/rules.out | 3 + .../regress/expected/select_parallel_4.out | 18 +-- src/test/regress/expected/stats_ext_2.out | 16 +-- src/test/regress/expected/subselect.out | 6 +- src/test/regress/expected/tbase_explain.out | 10 +- src/test/regress/expected/xc_FQS_2.out | 8 +- src/test/regress/expected/xc_FQS_join_1.out | 34 +++--- 16 files changed, 162 insertions(+), 206 deletions(-) diff --git a/src/backend/executor/execParallel.c b/src/backend/executor/execParallel.c index db2cdf60..920bc32e 100644 --- a/src/backend/executor/execParallel.c +++ b/src/backend/executor/execParallel.c @@ -1325,11 +1325,11 @@ ExecParallelInitializeWorker(PlanState *planstate, ParallelWorkerContext *pwcxt) planstate_tree_walker(planstate, ExecInitializeWorkerRemoteInstr, pwcxt); } if (planstate->plan->parallel_aware) - ExecRemoteSubPlanInitializeDSMWorker((RemoteSubplanState *)planstate, pwcxt); + ExecRemoteSubPlanInitDSMWorker((RemoteSubplanState *)planstate, pwcxt); break; case T_HashJoinState: if (planstate->plan->parallel_aware) - ExecParallelHashJoinInitializeWorker((HashJoinState *) planstate, pwcxt); + ExecParallelHashJoinInitWorker((HashJoinState *) planstate, pwcxt); break; case T_AggState: if (planstate->plan->parallel_aware) diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index c0fa9bdf..9d82cec9 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -5352,23 +5352,22 @@ page_size(double tuples, int width) void cost_remote_subplan(Path *path, Cost input_startup_cost, Cost input_total_cost, - double tuples, int width, int replication, - int nworkers) + double tuples, int width, int replication) { Cost startup_cost = input_startup_cost + remote_query_cost; Cost run_cost = input_total_cost - input_startup_cost; - path->rows = tuples; + path->rows = tuples * replication; /* * Charge 2x cpu_operator_cost per tuple to reflect bookkeeping overhead. */ - run_cost += 2 * cpu_operator_cost * tuples * nworkers; + run_cost += 2 * cpu_operator_cost * tuples; /* * Estimate cost of sending data over network */ - run_cost += network_byte_cost * tuples * width * replication * nworkers; + run_cost += network_byte_cost * tuples * width * replication; path->startup_cost = startup_cost; path->total_cost = startup_cost + run_cost; diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index c0847553..17546a77 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -1404,47 +1404,6 @@ set_scanpath_distribution(PlannerInfo *root, RelOptInfo *rel, Path *pathnode) } } -#ifdef __TBASE__ -/* - * implementation for create_remotesubplan_path, besides regular creation of remote subplan, - * we need it when redistributing join rel. 
- */ -static Path * -create_remotesubplan_path_internal(PlannerInfo *root, Path *subpath, - Distribution *distribution, RelOptInfo *rel, - ParamPathInfo *param_info, List *pathkeys, - PathTarget *pathtarget, int replication, - Cost additional_startup_cost, - Cost additional_total_cost) -{ - RemoteSubPath *pathnode; - - //if (IsA(subpath, GatherPath)) - //reset_cost_gather((GatherPath *) subpath); - - pathnode = makeNode(RemoteSubPath); - pathnode->path.pathtype = T_RemoteSubplan; - pathnode->path.parent = rel; - pathnode->path.param_info = param_info; - pathnode->path.pathkeys = pathkeys; - pathnode->subpath = subpath; - pathnode->path.distribution = (Distribution *) copyObject(distribution); - - /* We don't want to run subplains in parallel workers */ - pathnode->path.parallel_aware = false; - pathnode->path.parallel_safe = false; - - pathnode->path.pathtarget = pathtarget; - - cost_remote_subplan((Path *) pathnode, subpath->startup_cost + additional_startup_cost, - subpath->total_cost + additional_total_cost, subpath->rows, - rel->reltarget->width, replication, subpath->parallel_workers); - - return (Path *) pathnode; -} -#endif - - /* * create_remotesubplan_path * Redistribute the data to match the distribution. @@ -1458,15 +1417,8 @@ create_remotesubplan_path(PlannerInfo *root, Path *subpath, { RelOptInfo *rel = subpath->parent; RemoteSubPath *pathnode; - Distribution *subdistribution = subpath->distribution; + Distribution *subDist = subpath->distribution; -#ifdef __TBASE__ - return create_remotesubplan_path_internal(root, subpath, distribution, - rel, subpath->param_info, - subpath->pathkeys, subpath->pathtarget, - (subdistribution && IsLocatorReplicated(subdistribution->distributionType)) ? - bms_num_members(subdistribution->nodes) : 1, 0, 0); -#else pathnode = makeNode(RemoteSubPath); pathnode->path.pathtype = T_RemoteSubplan; pathnode->path.parent = rel; @@ -1483,11 +1435,9 @@ create_remotesubplan_path(PlannerInfo *root, Path *subpath, cost_remote_subplan((Path *) pathnode, subpath->startup_cost, subpath->total_cost, subpath->rows, rel->reltarget->width, - (subdistribution && IsLocatorReplicated(subdistribution->distributionType)) ? - bms_num_members(subdistribution->nodes) : 1); + subDist ? calcDistReplications(subDist->distributionType, subDist->nodes) : 1); return (Path *) pathnode; -#endif } /* @@ -1506,13 +1456,6 @@ redistribute_path(PlannerInfo *root, Path *subpath, List *pathkeys, Distribution *distribution = NULL; RelOptInfo *rel = subpath->parent; RemoteSubPath *pathnode; -#ifdef __TBASE__ - int num_replication; - - /* IsLocatorNone() also indicates we are replicating through input nodes */ - num_replication = (IsLocatorReplicated(distributionType) || - IsLocatorNone(distributionType)) ? 
bms_num_members(nodes) : 1; -#endif if (distributionType != LOCATOR_TYPE_NONE) { @@ -1530,20 +1473,6 @@ redistribute_path(PlannerInfo *root, Path *subpath, List *pathkeys, if (IsA(subpath, MaterialPath)) { MaterialPath *mpath = (MaterialPath *) subpath; -#ifdef __TBASE__ - if (IsA(mpath->subpath, RemoteSubPath)) - { - pathnode = (RemoteSubPath *) mpath->subpath; - pathnode->path.distribution = (Distribution *) copyObject(distribution); - } - else - { - pathnode = (RemoteSubPath *) create_remotesubplan_path_internal(root, mpath->subpath, - distribution, rel, subpath->param_info, - subpath->pathkeys, rel->reltarget, - num_replication, 0, 0); - } -#else /* If subpath is already a RemoteSubPath, just replace distribution */ if (IsA(mpath->subpath, RemoteSubPath)) { @@ -1569,11 +1498,13 @@ redistribute_path(PlannerInfo *root, Path *subpath, List *pathkeys, subpath = pathnode->subpath; pathnode->path.distribution = distribution; /* (re)calculate costs */ - cost_remote_subplan((Path *) pathnode, subpath->startup_cost, - subpath->total_cost, subpath->rows, rel->reltarget->width, - IsLocatorReplicated(distributionType) ? - bms_num_members(nodes) : 1); -#endif + cost_remote_subplan((Path *) pathnode, + subpath->startup_cost, + subpath->total_cost, + subpath->rows, + rel->reltarget->width, + calcDistReplications(distributionType, nodes)); + mpath->path.distribution = (Distribution *) copyObject(distribution); mpath->subpath = (Path *) pathnode; cost_material(&mpath->path, @@ -1587,7 +1518,7 @@ redistribute_path(PlannerInfo *root, Path *subpath, List *pathkeys, { Cost input_startup_cost = 0; Cost input_total_cost = 0; -#ifndef __TBASE__ + pathnode = makeNode(RemoteSubPath); pathnode->path.pathtype = T_RemoteSubplan; pathnode->path.parent = rel; @@ -1595,7 +1526,7 @@ redistribute_path(PlannerInfo *root, Path *subpath, List *pathkeys, pathnode->path.param_info = subpath->param_info; pathnode->path.pathkeys = pathkeys ? pathkeys : subpath->pathkeys; pathnode->path.distribution = distribution; -#endif + /* * If we need to insert a Sort node, add it here, so that it gets * pushed down to the remote node. @@ -1628,14 +1559,6 @@ redistribute_path(PlannerInfo *root, Path *subpath, List *pathkeys, input_startup_cost += sort_path.startup_cost; input_total_cost += sort_path.total_cost; } -#ifdef __TBASE__ - pathnode = (RemoteSubPath *) create_remotesubplan_path_internal(root, subpath, - distribution, rel, subpath->param_info, - pathkeys ? 
pathkeys : subpath->pathkeys, - rel->reltarget, num_replication, - input_startup_cost - subpath->startup_cost, - input_total_cost - subpath->total_cost); -#else pathnode->subpath = subpath; /* We don't want to run subplains in parallel workers */ @@ -1643,10 +1566,11 @@ redistribute_path(PlannerInfo *root, Path *subpath, List *pathkeys, pathnode->path.parallel_safe = false; cost_remote_subplan((Path *) pathnode, - input_startup_cost, input_total_cost, - subpath->rows, rel->reltarget->width, - num_replication); -#endif + input_startup_cost, + input_total_cost, + subpath->rows, + rel->reltarget->width, + calcDistReplications(distributionType, nodes)); return (Path *) pathnode; } } diff --git a/src/backend/pgxc/locator/locator.c b/src/backend/pgxc/locator/locator.c index a6933ce2..76c72d85 100644 --- a/src/backend/pgxc/locator/locator.c +++ b/src/backend/pgxc/locator/locator.c @@ -2160,6 +2160,25 @@ IsDistributedColumn(AttrNumber attr, RelationLocInfo *relation_loc_info) return result; } + +/* + * Calculate the tuple replication times based on replication type and number + * of target nodes. + */ +int +calcDistReplications(char distributionType, Bitmapset *nodes) +{ + if (!nodes) + return 1; + + if (IsLocatorReplicated(distributionType) || + IsLocatorNone(distributionType)) + { + return bms_num_members(nodes); + } + + return 1; +} #endif void * diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h index 358e83b9..2198c9db 100644 --- a/src/include/optimizer/cost.h +++ b/src/include/optimizer/cost.h @@ -177,8 +177,7 @@ extern void cost_qual_eval_node(QualCost *cost, Node *qual, PlannerInfo *root); #ifdef XCP extern void cost_remote_subplan(Path *path, Cost input_startup_cost, Cost input_total_cost, - double tuples, int width, int replication, - int nworkers); + double tuples, int width, int replication); #endif extern void compute_semi_anti_join_factors(PlannerInfo *root, RelOptInfo *outerrel, diff --git a/src/include/pgxc/locator.h b/src/include/pgxc/locator.h index 4e692237..3fd1f6b9 100644 --- a/src/include/pgxc/locator.h +++ b/src/include/pgxc/locator.h @@ -249,6 +249,7 @@ extern bool IsTypeDistributable(Oid col_type); extern char getLocatorDisType(Locator *self); extern bool prefer_olap; extern bool IsDistributedColumn(AttrNumber attr, RelationLocInfo *relation_loc_info); +extern int calcDistReplications(char distributionType, Bitmapset *nodes); #endif #ifdef _MLS_ diff --git a/src/test/regress/expected/foreign_key_2.out b/src/test/regress/expected/foreign_key_2.out index ec92a35b..8b8ac8ac 100644 --- a/src/test/regress/expected/foreign_key_2.out +++ b/src/test/regress/expected/foreign_key_2.out @@ -1373,23 +1373,24 @@ create temp table t1 (a integer primary key, b text); create temp table t2 (a integer, b integer references t1) distribute by hash (b); create rule r1 as on delete to t1 do delete from t2 where t2.b = old.a; explain (costs off) delete from t1 where a = 1; - QUERY PLAN ------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------------------ Remote Subquery Scan on all (datanode_1) -> Delete on t2 -> Nested Loop - -> Remote Subquery Scan on all (datanode_1) - -> Index Scan using t1_pkey on t1 - Index Cond: (a = 1) -> Seq Scan on t2 Filter: (b = 1) + -> Materialize + -> Remote Subquery Scan on all (datanode_1) + -> Index Scan using t1_pkey on t1 + Index Cond: (a = 1) Remote Fast Query Execution Node/s: datanode_1 -> Delete on t1 -> Index Scan using t1_pkey on t1 Index Cond: (a = 1) -(14 
rows) +(15 rows) delete from t1 where a = 1; drop rule r1 on t1; diff --git a/src/test/regress/expected/join_3.out b/src/test/regress/expected/join_3.out index 761f5a90..0da9548e 100644 --- a/src/test/regress/expected/join_3.out +++ b/src/test/regress/expected/join_3.out @@ -1876,20 +1876,23 @@ where exists(select * from tenk1 b where a.twothousand = b.twothousand and a.fivethous <> b.fivethous) and i4.f1 = a.tenthous; QUERY PLAN ------------------------------------------------------------------------------ +----------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) - -> Hash Join - Hash Cond: (a.tenthous = i4.f1) -> Hash Semi Join Hash Cond: (a.twothousand = b.twothousand) Join Filter: (a.fivethous <> b.fivethous) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: twothousand + -> Hash Join + Hash Cond: (a.tenthous = i4.f1) -> Seq Scan on tenk1 a -> Hash + -> Seq Scan on int4_tbl i4 + -> Hash -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: twothousand -> Seq Scan on tenk1 b - -> Hash - -> Seq Scan on int4_tbl i4 -(12 rows) +(15 rows) -- -- More complicated constructs @@ -4718,17 +4721,17 @@ explain (num_nodes off, nodes off, costs off) select count(*) from tenk1 a, tenk1 b join lateral (values(a.unique1)) ss(x) on b.unique2 = ss.x; QUERY PLAN ------------------------------------------------------------------- +------------------------------------------------------------ Finalize Aggregate -> Remote Subquery Scan on all -> Partial Aggregate -> Hash Join - Hash Cond: (a.unique1 = b.unique2) - -> Seq Scan on tenk1 a - -> Hash + Hash Cond: (b.unique2 = a.unique1) -> Remote Subquery Scan on all Distribute results by H: unique2 -> Seq Scan on tenk1 b + -> Hash + -> Seq Scan on tenk1 a (10 rows) select count(*) from tenk1 a, @@ -6614,17 +6617,17 @@ explain select t3.b from nestloop_suppression1 t1, nestloop_suppression2 t2, nes where t1.b=2 and t1.c=3 and t1.d like 'char%' and t1.a=t2.a and t3.b>t2.a; QUERY PLAN ------------------------------------------------------------------------------------------------------------------------------ - Nested Loop (cost=200.16..401.93 rows=33 width=4) + Nested Loop (cost=200.16..402.39 rows=33 width=4) Join Filter: (t3.b > t2.a) - -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=100.16..280.68 rows=1 width=4) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=100.16..280.69 rows=1 width=4) -> Nested Loop (cost=0.16..180.68 rows=1 width=4) Join Filter: (t1.a = t2.a) -> Index Scan using idx_nestloop_suppression1_b on nestloop_suppression1 t1 (cost=0.16..8.18 rows=1 width=4) Index Cond: (b = 2) Filter: (((d)::text ~~ 'char%'::text) AND (c = 3)) -> Seq Scan on nestloop_suppression2 t2 (cost=0.00..110.00 rows=5000 width=4) - -> Materialize (cost=100.00..120.62 rows=50 width=4) - -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=100.00..120.50 rows=50 width=4) + -> Materialize (cost=100.00..121.08 rows=50 width=4) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=100.00..120.95 rows=50 width=4) -> Seq Scan on nestloop_suppression3 t3 (cost=0.00..20.50 rows=50 width=4) (12 rows) @@ -6633,7 +6636,7 @@ explain select t3.b from nestloop_suppression1 t1, nestloop_suppression2 t2, nes where t1.b=2 and t1.c=3 and t1.d like 'char%' and t1.a=t2.a and t3.b>t2.a; QUERY PLAN 
------------------------------------------------------------------------------------------------------------------------------------ - Nested Loop (cost=200.16..414.44 rows=33 width=4) + Nested Loop (cost=200.16..414.89 rows=33 width=4) Join Filter: (t3.b > t2.a) -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=100.16..293.19 rows=1 width=4) -> Nested Loop (cost=0.16..193.19 rows=1 width=4) @@ -6643,8 +6646,8 @@ explain select t3.b from nestloop_suppression1 t1, nestloop_suppression2 t2, nes -> Index Scan using idx_nestloop_suppression1_b on nestloop_suppression1 t1 (cost=0.16..8.18 rows=1 width=4) Index Cond: (b = 2) Filter: (((d)::text ~~ 'char%'::text) AND (c = 3)) - -> Materialize (cost=100.00..120.62 rows=50 width=4) - -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=100.00..120.50 rows=50 width=4) + -> Materialize (cost=100.00..121.08 rows=50 width=4) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=100.00..120.95 rows=50 width=4) -> Seq Scan on nestloop_suppression3 t3 (cost=0.00..20.50 rows=50 width=4) (13 rows) diff --git a/src/test/regress/expected/rowsecurity_1.out b/src/test/regress/expected/rowsecurity_1.out index 237482a1..7ea346ae 100644 --- a/src/test/regress/expected/rowsecurity_1.out +++ b/src/test/regress/expected/rowsecurity_1.out @@ -1577,43 +1577,47 @@ WHERE t2.a = 3 and t3.a = 2 AND f_leak(t2.b) AND f_leak(t3.b); Remote Subquery Scan on all (datanode_2) -> Update on t2 -> Nested Loop + -> Seq Scan on t2 + Filter: ((a = 3) AND ((a % 2) = 1) AND f_leak(b)) + -> Materialize -> Remote Subquery Scan on all (datanode_1) -> Seq Scan on t3 Filter: ((a = 2) AND f_leak(b)) - -> Seq Scan on t2 - Filter: ((a = 3) AND ((a % 2) = 1) AND f_leak(b)) -(8 rows) +(9 rows) UPDATE t2 SET b=t2.b FROM t3 WHERE t2.a = 3 and t3.a = 2 AND f_leak(t2.b) AND f_leak(t3.b); EXPLAIN (COSTS OFF) UPDATE t1 SET b=t1.b FROM t2 WHERE t1.a = 3 and t2.a = 3 AND f_leak(t1.b) AND f_leak(t2.b); - QUERY PLAN ------------------------------------------------------------------------------ + QUERY PLAN +----------------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_2) -> Update on t1 Update on t1 Update on t2 t2_1 Update on t3 -> Nested Loop - -> Remote Subquery Scan on all (datanode_2) - -> Seq Scan on t2 - Filter: ((a = 3) AND ((a % 2) = 1) AND f_leak(b)) -> Seq Scan on t1 Filter: ((a = 3) AND ((a % 2) = 0) AND f_leak(b)) + -> Materialize + -> Remote Subquery Scan on all (datanode_2) + -> Seq Scan on t2 + Filter: ((a = 3) AND ((a % 2) = 1) AND f_leak(b)) -> Nested Loop - -> Remote Subquery Scan on all (datanode_2) - -> Seq Scan on t2 - Filter: ((a = 3) AND ((a % 2) = 1) AND f_leak(b)) -> Seq Scan on t2 t2_1 Filter: ((a = 3) AND ((a % 2) = 0) AND f_leak(b)) + -> Materialize + -> Remote Subquery Scan on all (datanode_2) + -> Seq Scan on t2 + Filter: ((a = 3) AND ((a % 2) = 1) AND f_leak(b)) -> Nested Loop - -> Remote Subquery Scan on all (datanode_2) - -> Seq Scan on t2 - Filter: ((a = 3) AND ((a % 2) = 1) AND f_leak(b)) -> Seq Scan on t3 Filter: ((a = 3) AND ((a % 2) = 0) AND f_leak(b)) -(23 rows) + -> Materialize + -> Remote Subquery Scan on all (datanode_2) + -> Seq Scan on t2 + Filter: ((a = 3) AND ((a % 2) = 1) AND f_leak(b)) +(26 rows) UPDATE t1 SET b=t1.b FROM t2 WHERE t1.a = 3 and t2.a = 3 AND f_leak(t1.b) AND f_leak(t2.b); @@ -1643,18 +1647,19 @@ WHERE t1.a = 3 and t2.a = 3 AND f_leak(t1.b) AND f_leak(t2.b); EXPLAIN (COSTS OFF) UPDATE t2 t2_1 SET b = t2_2.b FROM t2 t2_2 WHERE t2_1.a = 3 AND t2_2.a = t2_1.a 
AND t2_2.b = t2_1.b AND f_leak(t2_1.b) AND f_leak(t2_2.b) RETURNING *, t2_1, t2_2; - QUERY PLAN ------------------------------------------------------------------------------ + QUERY PLAN +----------------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_2) -> Update on t2 t2_1 -> Nested Loop Join Filter: (t2_1.b = t2_2.b) - -> Remote Subquery Scan on all (datanode_2) - -> Seq Scan on t2 t2_2 - Filter: ((a = 3) AND ((a % 2) = 1) AND f_leak(b)) -> Seq Scan on t2 t2_1 Filter: ((a = 3) AND ((a % 2) = 1) AND f_leak(b)) -(9 rows) + -> Materialize + -> Remote Subquery Scan on all (datanode_2) + -> Seq Scan on t2 t2_2 + Filter: ((a = 3) AND ((a % 2) = 1) AND f_leak(b)) +(10 rows) UPDATE t2 t2_1 SET b = t2_2.b FROM t2 t2_2 WHERE t2_1.a = 3 AND t2_2.a = t2_1.a AND t2_2.b = t2_1.b @@ -2060,15 +2065,15 @@ EXPLAIN (COSTS OFF) EXECUTE plancache_test2; PREPARE plancache_test3 AS WITH q AS MATERIALIZED (SELECT * FROM z2) SELECT * FROM q,z1 WHERE f_leak(z1.b); EXPLAIN (COSTS OFF) EXECUTE plancache_test3; - QUERY PLAN + QUERY PLAN ------------------------------------------------------------- Nested Loop CTE q -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Seq Scan on z2 - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Seq Scan on z1 - Filter: (((a % 2) = 0) AND f_leak(b)) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on z1 + Filter: (((a % 2) = 0) AND f_leak(b)) -> Materialize -> CTE Scan on q (9 rows) @@ -2112,15 +2117,15 @@ EXPLAIN (COSTS OFF) EXECUTE plancache_test2; (9 rows) EXPLAIN (COSTS OFF) EXECUTE plancache_test3; - QUERY PLAN + QUERY PLAN ------------------------------------------------------------- Nested Loop CTE q -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Seq Scan on z2 - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Seq Scan on z1 - Filter: (((a % 2) = 0) AND f_leak(b)) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on z1 + Filter: (((a % 2) = 0) AND f_leak(b)) -> Materialize -> CTE Scan on q (9 rows) @@ -2164,15 +2169,15 @@ EXPLAIN (COSTS OFF) EXECUTE plancache_test2; (9 rows) EXPLAIN (COSTS OFF) EXECUTE plancache_test3; - QUERY PLAN + QUERY PLAN ------------------------------------------------------------- Nested Loop CTE q -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Seq Scan on z2 - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Seq Scan on z1 - Filter: (((a % 2) = 1) AND f_leak(b)) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on z1 + Filter: (((a % 2) = 1) AND f_leak(b)) -> Materialize -> CTE Scan on q (9 rows) @@ -2216,15 +2221,15 @@ EXPLAIN (COSTS OFF) EXECUTE plancache_test2; (9 rows) EXPLAIN (COSTS OFF) EXECUTE plancache_test3; - QUERY PLAN + QUERY PLAN ------------------------------------------------------------- Nested Loop CTE q -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Seq Scan on z2 - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Seq Scan on z1 - Filter: (((a % 2) = 1) AND f_leak(b)) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on z1 + Filter: (((a % 2) = 1) AND f_leak(b)) -> Materialize -> CTE Scan on q (9 rows) diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out index 14660970..f8f574f9 100644 --- a/src/test/regress/expected/rules.out +++ b/src/test/regress/expected/rules.out @@ -2397,6 +2397,9 @@ toyemp| SELECT emp.name, emp.location, (12 * emp.salary) AS annualsal FROM emp; 
+zv1| SELECT zt1.f1, + 'dummy'::text AS junk + FROM pg_temp_17.zt1; SELECT tablename, rulename, definition FROM pg_rules ORDER BY tablename, rulename; pg_settings|pg_settings_n|CREATE RULE pg_settings_n AS diff --git a/src/test/regress/expected/select_parallel_4.out b/src/test/regress/expected/select_parallel_4.out index 6bc02325..85e23f84 100644 --- a/src/test/regress/expected/select_parallel_4.out +++ b/src/test/regress/expected/select_parallel_4.out @@ -82,16 +82,16 @@ select length(stringu1) from tenk1 group by length(stringu1); explain (costs off) select stringu1, count(*) from tenk1 group by stringu1 order by stringu1; QUERY PLAN ------------------------------------------------------------------ - Sort - Sort Key: stringu1 - -> Finalize HashAggregate +----------------------------------------------------------- + Finalize GroupAggregate Group Key: stringu1 -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Gather - Workers Planned: 4 + -> Sort + Sort Key: stringu1 -> Partial HashAggregate Group Key: stringu1 + -> Gather + Workers Planned: 4 -> Parallel Seq Scan on tenk1 (10 rows) @@ -99,16 +99,16 @@ explain (costs off) select count(stringu1) as num, (CASE WHEN length(stringu1) > 5 THEN 'LONG' ELSE 'SHORT' END) as islong from tenk1 group by islong order by num; QUERY PLAN ------------------------------------------------------------------------------------------------------------------------- +-------------------------------------------------------------------------------------------------------------------- Sort Sort Key: (count(stringu1)) -> Finalize HashAggregate Group Key: CASE WHEN (length((stringu1)::text) > 5) THEN 'LONG'::text ELSE 'SHORT'::text END -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Partial HashAggregate + Group Key: (CASE WHEN (length((stringu1)::text) > 5) THEN 'LONG'::text ELSE 'SHORT'::text END) -> Gather Workers Planned: 4 - -> Partial HashAggregate - Group Key: CASE WHEN (length((stringu1)::text) > 5) THEN 'LONG'::text ELSE 'SHORT'::text END -> Parallel Seq Scan on tenk1 (10 rows) diff --git a/src/test/regress/expected/stats_ext_2.out b/src/test/regress/expected/stats_ext_2.out index 315bcbc7..e058f176 100644 --- a/src/test/regress/expected/stats_ext_2.out +++ b/src/test/regress/expected/stats_ext_2.out @@ -659,8 +659,8 @@ EXPLAIN SELECT count(*) FROM subset WHERE b = 'prefix_1' and c = 1; QUERY PLAN ------------------------------------------------------------------------------------------------- - Finalize Aggregate (cost=177.51..177.52 rows=1 width=8) - -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=177.50..177.51 rows=1 width=8) + Finalize Aggregate (cost=177.52..177.53 rows=1 width=8) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=177.50..177.52 rows=1 width=8) -> Partial Aggregate (cost=77.50..77.51 rows=1 width=8) -> Seq Scan on subset (cost=0.00..77.50 rows=1 width=0) Filter: ((b = 'prefix_1'::text) AND (c = 1)) @@ -680,8 +680,8 @@ EXPLAIN SELECT count(*) FROM subset WHERE b = 'prefix_1' and c = 1; QUERY PLAN ------------------------------------------------------------------------------------------------- - Finalize Aggregate (cost=177.51..177.52 rows=1 width=8) - -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=177.50..177.51 rows=1 width=8) + Finalize Aggregate (cost=177.52..177.53 rows=1 width=8) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=177.50..177.52 rows=1 width=8) -> Partial Aggregate (cost=77.50..77.51 rows=1 width=8) -> Seq Scan on subset (cost=0.00..77.50 
rows=50 width=0) Filter: ((b = 'prefix_1'::text) AND (c = 1)) @@ -698,8 +698,8 @@ EXPLAIN SELECT count(*) FROM subset WHERE b like '%_1' and c = 1; QUERY PLAN ------------------------------------------------------------------------------------------------- - Finalize Aggregate (cost=177.51..177.52 rows=1 width=8) - -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=177.50..177.51 rows=1 width=8) + Finalize Aggregate (cost=177.52..177.53 rows=1 width=8) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=177.50..177.52 rows=1 width=8) -> Partial Aggregate (cost=77.50..77.51 rows=1 width=8) -> Seq Scan on subset (cost=0.00..77.50 rows=5 width=0) Filter: ((b ~~ '%_1'::text) AND (c = 1)) @@ -722,8 +722,8 @@ EXPLAIN SELECT count(*) FROM subset WHERE b like '%_1' and c = 1; QUERY PLAN ------------------------------------------------------------------------------------------------- - Finalize Aggregate (cost=177.51..177.52 rows=1 width=8) - -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=177.50..177.51 rows=1 width=8) + Finalize Aggregate (cost=177.52..177.53 rows=1 width=8) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=177.50..177.52 rows=1 width=8) -> Partial Aggregate (cost=77.50..77.51 rows=1 width=8) -> Seq Scan on subset (cost=0.00..77.50 rows=50 width=0) Filter: ((b ~~ '%_1'::text) AND (c = 1)) diff --git a/src/test/regress/expected/subselect.out b/src/test/regress/expected/subselect.out index c573fbda..78e554cc 100644 --- a/src/test/regress/expected/subselect.out +++ b/src/test/regress/expected/subselect.out @@ -1712,11 +1712,11 @@ select a.a,(select b.a from tbl_b b where b.a = a.a limit 1) q from tbl_a a orde explain select * from tbl_a a where a.b IN (select b.a from tbl_b b where b.b > a.b); QUERY PLAN --------------------------------------------------------------------------------------------------------- - Remote Subquery Scan on all (datanode_1,datanode_2) (cost=120.19..136.36 rows=112 width=8) - -> Hash Semi Join (cost=120.19..136.36 rows=112 width=8) + Remote Subquery Scan on all (datanode_1,datanode_2) (cost=120.19..145.13 rows=112 width=8) + -> Hash Semi Join (cost=120.19..145.13 rows=112 width=8) Hash Cond: (a.b = b.a) Join Filter: (b.b > a.b) - -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=100.00..111.75 rows=675 width=8) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=100.00..120.53 rows=675 width=8) Distribute results by H: b -> Seq Scan on tbl_a a (cost=0.00..11.75 rows=675 width=8) -> Hash (cost=11.75..11.75 rows=675 width=8) diff --git a/src/test/regress/expected/tbase_explain.out b/src/test/regress/expected/tbase_explain.out index 691d1bb5..e51bb895 100644 --- a/src/test/regress/expected/tbase_explain.out +++ b/src/test/regress/expected/tbase_explain.out @@ -292,17 +292,13 @@ select * from a1 where num >= (select count(*) from a2 where name='c') limit 1; -> Remote Subquery Scan on all (datanode_1,datanode_2) (actual rows=1 loops=1) Output: a1.id, a1.num, a1.name -> Limit - DN (actual rows=1..1 loops=1..1) - - datanode_1 (actual rows=1 loops=1) - - datanode_2 (actual rows=1 loops=1) + DN (never executed) Output: a1.id, a1.num, a1.name -> Seq Scan on public.a1 - DN (actual rows=1..1 loops=1..1) - - datanode_1 (actual rows=1 loops=1) - - datanode_2 (actual rows=1 loops=1) + DN (never executed) Output: a1.id, a1.num, a1.name Filter: (a1.num >= $0) -(31 rows) +(27 rows) explain (costs off,timing off,summary off,analyze,verbose) select count(*) from a1 group by name having count(*) = (select 
count(*) from a2 where name='a'); diff --git a/src/test/regress/expected/xc_FQS_2.out b/src/test/regress/expected/xc_FQS_2.out index c4e07fc5..9b35d802 100644 --- a/src/test/regress/expected/xc_FQS_2.out +++ b/src/test/regress/expected/xc_FQS_2.out @@ -1641,13 +1641,13 @@ select * from subquery_fqs t join (select 1 id, 'gd' a, 2 c from dual union sele explain select * from subquery_fqs t1 where t1.id = 1 and t1.c IN (select c from subquery_fqs t2 where t2.id=1); QUERY PLAN -------------------------------------------------------------------------------------------------- - Remote Subquery Scan on all (datanode_1,datanode_2) (cost=100.00..121.06 rows=1 width=40) - -> Nested Loop Semi Join (cost=100.00..121.06 rows=1 width=40) + Remote Subquery Scan on all (datanode_1,datanode_2) (cost=100.00..121.16 rows=1 width=40) + -> Nested Loop Semi Join (cost=100.00..121.16 rows=1 width=40) Join Filter: (t1.c = t2.c) -> Seq Scan on subquery_fqs t1 (cost=0.00..10.50 rows=2 width=40) Filter: (id = 1) - -> Materialize (cost=100.00..110.51 rows=2 width=4) - -> Remote Subquery Scan on all (datanode_1) (cost=100.00..110.50 rows=2 width=4) + -> Materialize (cost=100.00..110.55 rows=4 width=4) + -> Remote Subquery Scan on all (datanode_1) (cost=100.00..110.53 rows=4 width=4) -> Seq Scan on subquery_fqs t2 (cost=0.00..10.50 rows=2 width=4) Filter: (id = 1) (9 rows) diff --git a/src/test/regress/expected/xc_FQS_join_1.out b/src/test/regress/expected/xc_FQS_join_1.out index 57ff7524..dc995cb5 100644 --- a/src/test/regress/expected/xc_FQS_join_1.out +++ b/src/test/regress/expected/xc_FQS_join_1.out @@ -697,20 +697,23 @@ explain (verbose on, nodes off, costs off) update tab1_mod set val2 = 1000 from -> Update on public.tab1_mod -> Merge Join Output: tab1_mod.val, 1000, tab1_mod.xc_node_id, tab1_mod.ctid, tab1_mod.shardid, tab2_mod.ctid - Merge Cond: ((tab2_mod.val = tab1_mod.val) AND (tab2_mod.val2 = tab1_mod.val2)) + Merge Cond: ((tab1_mod.val = tab2_mod.val) AND (tab1_mod.val2 = tab2_mod.val2)) + -> Sort + Output: tab1_mod.val, tab1_mod.xc_node_id, tab1_mod.ctid, tab1_mod.shardid, tab1_mod.val2 + Sort Key: tab1_mod.val, tab1_mod.val2 + -> Seq Scan on public.tab1_mod + Output: tab1_mod.val, tab1_mod.xc_node_id, tab1_mod.ctid, tab1_mod.shardid, tab1_mod.val2 + -> Materialize + Output: tab2_mod.ctid, tab2_mod.val, tab2_mod.val2 -> Remote Subquery Scan on all Output: tab2_mod.ctid, tab2_mod.val, tab2_mod.val2 + Distribute results by M: val -> Sort Output: tab2_mod.ctid, tab2_mod.xc_node_id, tab2_mod.val, tab2_mod.val2 Sort Key: tab2_mod.val, tab2_mod.val2 -> Seq Scan on public.tab2_mod Output: tab2_mod.ctid, tab2_mod.val, tab2_mod.val2 - -> Sort - Output: tab1_mod.val, tab1_mod.xc_node_id, tab1_mod.ctid, tab1_mod.shardid, tab1_mod.val2 - Sort Key: tab1_mod.val, tab1_mod.val2 - -> Seq Scan on public.tab1_mod - Output: tab1_mod.val, tab1_mod.xc_node_id, tab1_mod.ctid, tab1_mod.shardid, tab1_mod.val2 -(17 rows) +(20 rows) explain (verbose on, nodes off, costs off) delete from tab1_mod using tab2_mod where tab1_mod.val = tab2_mod.val and tab1_mod.val2 = tab2_mod.val2; @@ -720,20 +723,23 @@ explain (verbose on, nodes off, costs off) delete from tab1_mod using tab2_mod -> Delete on public.tab1_mod -> Merge Join Output: tab1_mod.xc_node_id, tab1_mod.ctid, tab1_mod.shardid, tab1_mod.val, tab2_mod.ctid - Merge Cond: ((tab2_mod.val = tab1_mod.val) AND (tab2_mod.val2 = tab1_mod.val2)) + Merge Cond: ((tab1_mod.val = tab2_mod.val) AND (tab1_mod.val2 = tab2_mod.val2)) + -> Sort + Output: tab1_mod.xc_node_id, tab1_mod.ctid, 
tab1_mod.shardid, tab1_mod.val, tab1_mod.val2 + Sort Key: tab1_mod.val, tab1_mod.val2 + -> Seq Scan on public.tab1_mod + Output: tab1_mod.xc_node_id, tab1_mod.ctid, tab1_mod.shardid, tab1_mod.val, tab1_mod.val2 + -> Materialize + Output: tab2_mod.ctid, tab2_mod.val, tab2_mod.val2 -> Remote Subquery Scan on all Output: tab2_mod.ctid, tab2_mod.val, tab2_mod.val2 + Distribute results by M: val -> Sort Output: tab2_mod.ctid, tab2_mod.xc_node_id, tab2_mod.val, tab2_mod.val2 Sort Key: tab2_mod.val, tab2_mod.val2 -> Seq Scan on public.tab2_mod Output: tab2_mod.ctid, tab2_mod.val, tab2_mod.val2 - -> Sort - Output: tab1_mod.xc_node_id, tab1_mod.ctid, tab1_mod.shardid, tab1_mod.val, tab1_mod.val2 - Sort Key: tab1_mod.val, tab1_mod.val2 - -> Seq Scan on public.tab1_mod - Output: tab1_mod.xc_node_id, tab1_mod.ctid, tab1_mod.shardid, tab1_mod.val, tab1_mod.val2 -(17 rows) +(20 rows) explain (verbose on, nodes off, costs off) update tab1_rep set val2 = 1000 from tab2_rep where tab1_rep.val = tab2_rep.val and tab1_rep.val2 = tab2_rep.val2; From d8272a5422047cc9be21eb904cb33a6ca6002d54 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Thu, 3 Jun 2021 19:31:23 +0800 Subject: [PATCH 154/578] fix compile warnings --- src/backend/commands/explain_dist.c | 2 ++ src/backend/optimizer/plan/createplan.c | 1 - src/backend/optimizer/plan/planner.c | 3 ++- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/backend/commands/explain_dist.c b/src/backend/commands/explain_dist.c index 81fedadc..2b37ba0d 100644 --- a/src/backend/commands/explain_dist.c +++ b/src/backend/commands/explain_dist.c @@ -125,6 +125,7 @@ InstrOut(StringInfo buf, Plan *plan, Instrumentation *instr, int current_node_id elog(DEBUG1, "InstrOut: plan_node_id %d, node %d, nloops %.0f", plan->plan_node_id, current_node_id, instr->nloops); } +#if 0 /* * WorkerInstrOut * @@ -156,6 +157,7 @@ WorkerInstrOut(StringInfo buf, WorkerInstrumentation *worker_instr) instr->startup, instr->total, instr->ntuples, instr->nloops); } } +#endif /* * SpecInstrOut diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index 1924a389..cbd3bef4 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -689,7 +689,6 @@ create_scan_plan(PlannerInfo *root, Path *best_path, int flags) /* find is there any tables located in more than one group */ if ((rel->reloptkind == RELOPT_BASEREL || rel->reloptkind == RELOPT_OTHER_MEMBER_REL) && rel->rtekind == RTE_RELATION) { - bool error = false; rte = root->simple_rte_array[rel->relid]; relation = heap_open(rte->relid, NoLock); diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index 8c62e688..a3af8985 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -4280,12 +4280,13 @@ create_grouping_paths(PlannerInfo *root, { #ifdef __TBASE__ bool try_redistribute_grouping = false; + double dNumLocalGroups; PathTarget * local_grouping_target = make_partial_grouping_target(root, target); grouped_rel->reltarget = local_grouping_target; /* Estimate number of partial groups. 
*/ - double dNumLocalGroups = get_number_of_groups(root, + dNumLocalGroups = get_number_of_groups(root, cheapest_path->rows, gd); #endif From 594b715db8acaa3f613fededad8543c3172888a1 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Fri, 4 Jun 2021 10:22:30 +0800 Subject: [PATCH 155/578] fix regress expected errors --- src/test/regress/expected/create_view.out | 49 -------- src/test/regress/expected/join_3.out | 106 +++++++----------- src/test/regress/expected/rules.out | 3 - .../regress/expected/select_parallel_4.out | 16 +-- src/test/regress/expected/subselect.out | 38 +++---- src/test/regress/expected/tbase_explain.out | 10 +- src/test/regress/expected/xc_FQS_join_1.out | 40 +++---- src/test/regress/expected/xc_groupby_1.out | 24 ++-- src/test/regress/expected/xc_having_1.out | 14 +-- src/test/regress/parallel_schedule | 3 - 10 files changed, 115 insertions(+), 188 deletions(-) diff --git a/src/test/regress/expected/create_view.out b/src/test/regress/expected/create_view.out index b8836c0d..56e73b4e 100644 --- a/src/test/regress/expected/create_view.out +++ b/src/test/regress/expected/create_view.out @@ -38,55 +38,6 @@ SELECT * FROM viewtest ORDER BY a; CREATE OR REPLACE VIEW viewtest AS SELECT a, b FROM viewtest_tbl WHERE a > 5 ORDER BY b DESC; -EXPLAIN SELECT * FROM viewtest; - QUERY PLAN -------------------------------------------------------------------------------------------- - Remote Subquery Scan on all (datanode_1,datanode_2) (cost=22.23..22.79 rows=225 width=8) - -> Sort (cost=22.23..22.79 rows=225 width=8) - Sort Key: viewtest_tbl.b DESC - -> Seq Scan on viewtest_tbl (cost=0.00..13.44 rows=225 width=8) - Filter: (a > 5) -(5 rows) - -SELECT * FROM viewtest; - a | b -----+---- - 20 | 25 - 15 | 20 - 10 | 15 -(3 rows) - -EXPLAIN SELECT a FROM viewtest; - QUERY PLAN -------------------------------------------------------------------------------------------------- - Subquery Scan on viewtest (cost=22.23..27.29 rows=225 width=4) - -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=22.23..22.79 rows=225 width=8) - -> Sort (cost=22.23..22.79 rows=225 width=8) - Sort Key: viewtest_tbl.b DESC - -> Seq Scan on viewtest_tbl (cost=0.00..13.44 rows=225 width=8) - Filter: (a > 5) -(6 rows) - -SELECT a FROM viewtest; - a ----- - 20 - 15 - 10 -(3 rows) - -EXPLAIN SELECT * FROM viewtest ORDER BY a; - QUERY PLAN -------------------------------------------------------------------------------------------- - Remote Subquery Scan on all (datanode_1,datanode_2) (cost=36.08..36.64 rows=225 width=8) - -> Sort (cost=36.08..36.64 rows=225 width=8) - Sort Key: viewtest_tbl.a - -> Sort (cost=22.23..22.79 rows=225 width=8) - Sort Key: viewtest_tbl.b DESC - -> Seq Scan on viewtest_tbl (cost=0.00..13.44 rows=225 width=8) - Filter: (a > 5) -(7 rows) - SELECT * FROM viewtest ORDER BY a; a | b ----+---- diff --git a/src/test/regress/expected/join_3.out b/src/test/regress/expected/join_3.out index 0da9548e..53a75d2f 100644 --- a/src/test/regress/expected/join_3.out +++ b/src/test/regress/expected/join_3.out @@ -1867,33 +1867,6 @@ SELECT '' AS "xxx", * | 1 | 4 | one | -1 (1 row) --- --- semijoin selectivity for <> --- -explain (costs off) -select * from int4_tbl i4, tenk1 a -where exists(select * from tenk1 b - where a.twothousand = b.twothousand and a.fivethous <> b.fivethous) - and i4.f1 = a.tenthous; - QUERY PLAN ------------------------------------------------------------------------ - Remote Subquery Scan on all (datanode_1,datanode_2) - -> Hash Semi Join - Hash Cond: (a.twothousand = 
b.twothousand) - Join Filter: (a.fivethous <> b.fivethous) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: twothousand - -> Hash Join - Hash Cond: (a.tenthous = i4.f1) - -> Seq Scan on tenk1 a - -> Hash - -> Seq Scan on int4_tbl i4 - -> Hash - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: twothousand - -> Seq Scan on tenk1 b -(15 rows) - -- -- More complicated constructs -- @@ -2447,7 +2420,7 @@ select count(*) from tenk1 a, tenk1 b -> Partial Aggregate -> Hash Join Hash Cond: (a.hundred = b.thousand) - -> Seq Scan on tenk1 a + -> Index Only Scan using tenk1_hundred on tenk1 a -> Hash -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Seq Scan on tenk1 b @@ -3318,7 +3291,7 @@ select count(*) from Join Filter: (a.unique2 = b.unique1) -> Remote Subquery Scan on all Distribute results by H: thousand - -> Seq Scan on tenk1 c + -> Index Only Scan using tenk1_thous_tenthous on tenk1 c -> Hash -> Remote Subquery Scan on all Distribute results by H: thousand @@ -3363,7 +3336,7 @@ select b.unique1 from Join Filter: (b.unique1 = 42) -> Remote Subquery Scan on all Distribute results by H: 42 - -> Seq Scan on tenk1 c + -> Index Only Scan using tenk1_thous_tenthous on tenk1 c -> Hash -> Remote Subquery Scan on all Distribute results by H: unique1 @@ -3468,17 +3441,19 @@ select f1, unique2, case when unique2 is null then f1 else 0 end QUERY PLAN -------------------------------------------------------------------------- Remote Subquery Scan on all - -> Hash Right Join - Hash Cond: (b.unique2 = a.f1) + -> Merge Right Join + Merge Cond: (b.unique2 = a.f1) Filter: (CASE WHEN (b.unique2 IS NULL) THEN a.f1 ELSE 0 END = 0) -> Remote Subquery Scan on all Distribute results by H: unique2 - -> Seq Scan on tenk1 b - -> Hash + -> Index Only Scan using tenk1_unique2 on tenk1 b + -> Materialize -> Remote Subquery Scan on all Distribute results by H: f1 - -> Seq Scan on int4_tbl a -(11 rows) + -> Sort + Sort Key: a.f1 + -> Seq Scan on int4_tbl a +(13 rows) select f1, unique2, case when unique2 is null then f1 else 0 end from int4_tbl a left join tenk1 b on f1 = unique2 @@ -3537,33 +3512,37 @@ left join using (join_key) ) foo3 using (join_key); - QUERY PLAN ------------------------------------------------------------------------ + QUERY PLAN +-------------------------------------------------------------------------------- Hash Right Join Output: "*VALUES*".column1, i1.f1, (666) Hash Cond: (i1.f1 = "*VALUES*".column1) -> Remote Subquery Scan on all (datanode_1,datanode_2) Output: i1.f1, 666 - -> Hash Right Join + -> Merge Right Join Output: i1.f1, 666 - Hash Cond: (i2.unique2 = i1.f1) + Merge Cond: (i2.unique2 = i1.f1) -> Remote Subquery Scan on all (datanode_1,datanode_2) Output: i2.unique2 Distribute results by H: unique2 - -> Seq Scan on public.tenk1 i2 + Sort Key: i2.unique2 + -> Index Only Scan using tenk1_unique2 on public.tenk1 i2 Output: i2.unique2 - -> Hash + -> Materialize Output: i1.f1 -> Remote Subquery Scan on all (datanode_1) Output: i1.f1 Distribute results by H: f1 - -> Seq Scan on public.int4_tbl i1 + -> Sort Output: i1.f1 + Sort Key: i1.f1 + -> Seq Scan on public.int4_tbl i1 + Output: i1.f1 -> Hash Output: "*VALUES*".column1 -> Values Scan on "*VALUES*" Output: "*VALUES*".column1 -(24 rows) +(28 rows) select foo1.join_key as foo1_id, foo3.join_key AS foo3_id, bug_field from (values (0),(1)) foo1(join_key) @@ -4720,19 +4699,18 @@ select * from generate_series(100,200) g, explain (num_nodes off, nodes off, costs off) select 
count(*) from tenk1 a, tenk1 b join lateral (values(a.unique1)) ss(x) on b.unique2 = ss.x; - QUERY PLAN ------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------------------------------ Finalize Aggregate -> Remote Subquery Scan on all -> Partial Aggregate - -> Hash Join - Hash Cond: (b.unique2 = a.unique1) - -> Remote Subquery Scan on all - Distribute results by H: unique2 - -> Seq Scan on tenk1 b - -> Hash - -> Seq Scan on tenk1 a -(10 rows) + -> Merge Join + Merge Cond: (b.unique2 = a.unique1) + -> Remote Subquery Scan on all + Distribute results by H: unique2 + -> Index Only Scan using tenk1_unique2 on tenk1 b + -> Index Only Scan using tenk1_unique1 on tenk1 a +(9 rows) select count(*) from tenk1 a, tenk1 b join lateral (values(a.unique1)) ss(x) on b.unique2 = ss.x; @@ -4745,18 +4723,18 @@ select count(*) from tenk1 a, explain (num_nodes off, nodes off, costs off) select count(*) from tenk1 a, tenk1 b join lateral (values(a.unique1),(-1)) ss(x) on b.unique2 = ss.x; - QUERY PLAN ------------------------------------------------------ + QUERY PLAN +------------------------------------------------------------------------ Aggregate -> Hash Join Hash Cond: ("*VALUES*".column1 = b.unique2) -> Nested Loop -> Remote Subquery Scan on all - -> Seq Scan on tenk1 a + -> Index Only Scan using tenk1_unique1 on tenk1 a -> Values Scan on "*VALUES*" -> Hash -> Remote Subquery Scan on all - -> Seq Scan on tenk1 b + -> Index Only Scan using tenk1_unique2 on tenk1 b (10 rows) select count(*) from tenk1 a, @@ -6085,8 +6063,8 @@ from onek t1, tenk1 t2 where exists (select 1 from tenk1 t3 where t3.thousand = t1.unique1 and t3.tenthous = t2.hundred) and t1.unique1 < 1; - QUERY PLAN -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + QUERY PLAN +--------------------------------------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) Output: t1.unique1, t2.hundred -> Nested Loop @@ -6101,13 +6079,13 @@ where exists (select 1 from tenk1 t3 Output: t3.thousand, t3.tenthous Group Key: t3.thousand, t3.tenthous -> Remote Subquery Scan on all (datanode_1,datanode_2) - Output: t3.unique1, t3.unique2, t3.two, t3.four, t3.ten, t3.twenty, t3.hundred, t3.thousand, t3.twothousand, t3.fivethous, t3.tenthous, t3.odd, t3.even, t3.stringu1, t3.stringu2, t3.string4 + Output: t3.thousand, t3.tenthous Distribute results by H: thousand -> HashAggregate - Output: t3.unique1, t3.unique2, t3.two, t3.four, t3.ten, t3.twenty, t3.hundred, t3.thousand, t3.twothousand, t3.fivethous, t3.tenthous, t3.odd, t3.even, t3.stringu1, t3.stringu2, t3.string4 + Output: t3.thousand, t3.tenthous Group Key: t3.thousand, t3.tenthous - -> Seq Scan on public.tenk1 t3 - Output: t3.unique1, t3.unique2, t3.two, t3.four, t3.ten, t3.twenty, t3.hundred, t3.thousand, t3.twothousand, t3.fivethous, t3.tenthous, t3.odd, t3.even, t3.stringu1, t3.stringu2, t3.string4 + -> Index Only Scan using tenk1_thous_tenthous on public.tenk1 t3 + Output: t3.thousand, t3.tenthous -> Hash Output: t1.unique1 -> Remote Subquery Scan on all (datanode_1,datanode_2) diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out index f8f574f9..14660970 100644 --- a/src/test/regress/expected/rules.out +++ 
b/src/test/regress/expected/rules.out @@ -2397,9 +2397,6 @@ toyemp| SELECT emp.name, emp.location, (12 * emp.salary) AS annualsal FROM emp; -zv1| SELECT zt1.f1, - 'dummy'::text AS junk - FROM pg_temp_17.zt1; SELECT tablename, rulename, definition FROM pg_rules ORDER BY tablename, rulename; pg_settings|pg_settings_n|CREATE RULE pg_settings_n AS diff --git a/src/test/regress/expected/select_parallel_4.out b/src/test/regress/expected/select_parallel_4.out index 85e23f84..0b6353b7 100644 --- a/src/test/regress/expected/select_parallel_4.out +++ b/src/test/regress/expected/select_parallel_4.out @@ -81,15 +81,15 @@ select length(stringu1) from tenk1 group by length(stringu1); explain (costs off) select stringu1, count(*) from tenk1 group by stringu1 order by stringu1; - QUERY PLAN + QUERY PLAN ----------------------------------------------------------- Finalize GroupAggregate - Group Key: stringu1 - -> Remote Subquery Scan on all (datanode_1,datanode_2) + Group Key: stringu1 + -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Sort Sort Key: stringu1 - -> Partial HashAggregate - Group Key: stringu1 + -> Partial HashAggregate + Group Key: stringu1 -> Gather Workers Planned: 4 -> Parallel Seq Scan on tenk1 @@ -98,7 +98,7 @@ explain (costs off) explain (costs off) select count(stringu1) as num, (CASE WHEN length(stringu1) > 5 THEN 'LONG' ELSE 'SHORT' END) as islong from tenk1 group by islong order by num; - QUERY PLAN + QUERY PLAN -------------------------------------------------------------------------------------------------------------------- Sort Sort Key: (count(stringu1)) @@ -107,8 +107,8 @@ explain (costs off) -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Partial HashAggregate Group Key: (CASE WHEN (length((stringu1)::text) > 5) THEN 'LONG'::text ELSE 'SHORT'::text END) - -> Gather - Workers Planned: 4 + -> Gather + Workers Planned: 4 -> Parallel Seq Scan on tenk1 (10 rows) diff --git a/src/test/regress/expected/subselect.out b/src/test/regress/expected/subselect.out index 78e554cc..6e607200 100644 --- a/src/test/regress/expected/subselect.out +++ b/src/test/regress/expected/subselect.out @@ -877,7 +877,7 @@ select * from int4_tbl where SubPlan 1 -> Remote Subquery Scan on all (datanode_1,datanode_2) Output: a.unique1 - -> Seq Scan on public.tenk1 a + -> Index Only Scan using tenk1_unique1 on public.tenk1 a Output: a.unique1 (26 rows) @@ -1173,7 +1173,7 @@ set enable_nestloop to true; set enable_hashjoin to false; set enable_mergejoin to false; explain select a.a,(select b.a from tbl_b b where b.a = a.a) q from tbl_a a order by 1,2; - QUERY PLAN + QUERY PLAN ----------------------------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) (cost=3923.54..3924.39 rows=338 width=8) -> Sort (cost=3923.54..3924.39 rows=338 width=8) @@ -1710,7 +1710,7 @@ select a.a,(select b.a from tbl_b b where b.a = a.a limit 1) q from tbl_a a orde -- support pullup lateral ANY_SUBLINK explain select * from tbl_a a where a.b IN (select b.a from tbl_b b where b.b > a.b); - QUERY PLAN + QUERY PLAN --------------------------------------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) (cost=120.19..145.13 rows=112 width=8) -> Hash Semi Join (cost=120.19..145.13 rows=112 width=8) @@ -1749,31 +1749,31 @@ where t2.a = ( where t1.a = t2.a ); - QUERY PLAN + QUERY PLAN 
----------------------------------------------------------------------------------------------------------- Hash Join Hash Cond: (t2.a = "EXPR_subquery".min) -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Seq Scan on sub_t2 t2 -> Hash - -> Hash Left Join - Hash Cond: (t1.a = "EXPR_subquery".a) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Seq Scan on sub_t1 t1 - -> Hash - -> Subquery Scan on "EXPR_subquery" - -> HashAggregate - Group Key: t2_1.a - -> Nested Loop + -> Hash Left Join + Hash Cond: (t1.a = "EXPR_subquery".a) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on sub_t1 t1 + -> Hash + -> Subquery Scan on "EXPR_subquery" + -> HashAggregate + Group Key: t2_1.a -> Nested Loop - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Seq Scan on sub_t2 t2_1 + -> Nested Loop + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on sub_t2 t2_1 + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on sub_interfere1 -> Materialize -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Seq Scan on sub_interfere1 - -> Materialize - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Seq Scan on sub_interfere2 + -> Seq Scan on sub_interfere2 (23 rows) DROP TABLE sub_t1; diff --git a/src/test/regress/expected/tbase_explain.out b/src/test/regress/expected/tbase_explain.out index e51bb895..691d1bb5 100644 --- a/src/test/regress/expected/tbase_explain.out +++ b/src/test/regress/expected/tbase_explain.out @@ -292,13 +292,17 @@ select * from a1 where num >= (select count(*) from a2 where name='c') limit 1; -> Remote Subquery Scan on all (datanode_1,datanode_2) (actual rows=1 loops=1) Output: a1.id, a1.num, a1.name -> Limit - DN (never executed) + DN (actual rows=1..1 loops=1..1) + - datanode_1 (actual rows=1 loops=1) + - datanode_2 (actual rows=1 loops=1) Output: a1.id, a1.num, a1.name -> Seq Scan on public.a1 - DN (never executed) + DN (actual rows=1..1 loops=1..1) + - datanode_1 (actual rows=1 loops=1) + - datanode_2 (actual rows=1 loops=1) Output: a1.id, a1.num, a1.name Filter: (a1.num >= $0) -(27 rows) +(31 rows) explain (costs off,timing off,summary off,analyze,verbose) select count(*) from a1 group by name having count(*) = (select count(*) from a2 where name='a'); diff --git a/src/test/regress/expected/xc_FQS_join_1.out b/src/test/regress/expected/xc_FQS_join_1.out index dc995cb5..18836c1e 100644 --- a/src/test/regress/expected/xc_FQS_join_1.out +++ b/src/test/regress/expected/xc_FQS_join_1.out @@ -391,11 +391,11 @@ explain (verbose on, nodes off, costs off) select * from tab1_mod natural join t Hash Join Output: tab1_mod.val, tab1_mod.val2 Hash Cond: ((tab1_mod.val = tab4_rep.val) AND (tab1_mod.val2 = tab4_rep.val2)) - -> Remote Subquery Scan on all + -> Remote Subquery Scan on all + Output: tab1_mod.val, tab1_mod.val2 + -> Seq Scan on public.tab1_mod Output: tab1_mod.val, tab1_mod.val2 - -> Seq Scan on public.tab1_mod - Output: tab1_mod.val, tab1_mod.val2 - Filter: (tab1_mod.val > 2) + Filter: (tab1_mod.val > 2) -> Hash Output: tab4_rep.val, tab4_rep.val2 -> Remote Subquery Scan on all @@ -426,9 +426,9 @@ explain (verbose on, nodes off, costs off) select * from tab1_mod natural join t -> Hash Join Output: tab1_mod.val, tab1_mod.val2 Hash Cond: ((tab1_mod.val = tab2_mod.val) AND (tab1_mod.val2 = tab2_mod.val2)) - -> Seq Scan on public.tab1_mod - Output: tab1_mod.val, tab1_mod.val2 - Filter: (tab1_mod.val > 2) + -> Seq Scan on public.tab1_mod + Output: tab1_mod.val, 
tab1_mod.val2 + Filter: (tab1_mod.val > 2) -> Hash Output: tab2_mod.val, tab2_mod.val2 -> Remote Subquery Scan on all @@ -590,10 +590,10 @@ explain (verbose on, nodes off, costs off, num_nodes on) select * from tab1_mod Output: tab1_mod.val, tab1_mod.val2, tab1_mod.val2 Join Filter: (tab1_mod.val2 = tab4_rep.val2) -> Remote Subquery Scan on all + Output: tab1_mod.val, tab1_mod.val2 + -> Seq Scan on public.tab1_mod Output: tab1_mod.val, tab1_mod.val2 - -> Seq Scan on public.tab1_mod - Output: tab1_mod.val, tab1_mod.val2 - Filter: (tab1_mod.val = 1) + Filter: (tab1_mod.val = 1) -> Materialize Output: tab4_rep.val, tab4_rep.val2 -> Remote Subquery Scan on all @@ -696,7 +696,7 @@ explain (verbose on, nodes off, costs off) update tab1_mod set val2 = 1000 from Remote Subquery Scan on all -> Update on public.tab1_mod -> Merge Join - Output: tab1_mod.val, 1000, tab1_mod.xc_node_id, tab1_mod.ctid, tab1_mod.shardid, tab2_mod.ctid + Output: tab1_mod.val, 1000, tab1_mod.xc_node_id, tab1_mod.ctid, tab1_mod.shardid, tab2_mod.ctid, tab2_mod.xc_node_id Merge Cond: ((tab1_mod.val = tab2_mod.val) AND (tab1_mod.val2 = tab2_mod.val2)) -> Sort Output: tab1_mod.val, tab1_mod.xc_node_id, tab1_mod.ctid, tab1_mod.shardid, tab1_mod.val2 @@ -704,15 +704,15 @@ explain (verbose on, nodes off, costs off) update tab1_mod set val2 = 1000 from -> Seq Scan on public.tab1_mod Output: tab1_mod.val, tab1_mod.xc_node_id, tab1_mod.ctid, tab1_mod.shardid, tab1_mod.val2 -> Materialize - Output: tab2_mod.ctid, tab2_mod.val, tab2_mod.val2 - -> Remote Subquery Scan on all - Output: tab2_mod.ctid, tab2_mod.val, tab2_mod.val2 + Output: tab2_mod.ctid, tab2_mod.xc_node_id, tab2_mod.val, tab2_mod.val2 + -> Remote Subquery Scan on all + Output: tab2_mod.ctid, tab2_mod.xc_node_id, tab2_mod.val, tab2_mod.val2 Distribute results by M: val -> Sort Output: tab2_mod.ctid, tab2_mod.xc_node_id, tab2_mod.val, tab2_mod.val2 Sort Key: tab2_mod.val, tab2_mod.val2 -> Seq Scan on public.tab2_mod - Output: tab2_mod.ctid, tab2_mod.val, tab2_mod.val2 + Output: tab2_mod.ctid, tab2_mod.xc_node_id, tab2_mod.val, tab2_mod.val2 (20 rows) explain (verbose on, nodes off, costs off) delete from tab1_mod using tab2_mod @@ -722,7 +722,7 @@ explain (verbose on, nodes off, costs off) delete from tab1_mod using tab2_mod Remote Subquery Scan on all -> Delete on public.tab1_mod -> Merge Join - Output: tab1_mod.xc_node_id, tab1_mod.ctid, tab1_mod.shardid, tab1_mod.val, tab2_mod.ctid + Output: tab1_mod.xc_node_id, tab1_mod.ctid, tab1_mod.shardid, tab1_mod.val, tab2_mod.ctid, tab2_mod.xc_node_id Merge Cond: ((tab1_mod.val = tab2_mod.val) AND (tab1_mod.val2 = tab2_mod.val2)) -> Sort Output: tab1_mod.xc_node_id, tab1_mod.ctid, tab1_mod.shardid, tab1_mod.val, tab1_mod.val2 @@ -730,15 +730,15 @@ explain (verbose on, nodes off, costs off) delete from tab1_mod using tab2_mod -> Seq Scan on public.tab1_mod Output: tab1_mod.xc_node_id, tab1_mod.ctid, tab1_mod.shardid, tab1_mod.val, tab1_mod.val2 -> Materialize - Output: tab2_mod.ctid, tab2_mod.val, tab2_mod.val2 - -> Remote Subquery Scan on all - Output: tab2_mod.ctid, tab2_mod.val, tab2_mod.val2 + Output: tab2_mod.ctid, tab2_mod.xc_node_id, tab2_mod.val, tab2_mod.val2 + -> Remote Subquery Scan on all + Output: tab2_mod.ctid, tab2_mod.xc_node_id, tab2_mod.val, tab2_mod.val2 Distribute results by M: val -> Sort Output: tab2_mod.ctid, tab2_mod.xc_node_id, tab2_mod.val, tab2_mod.val2 Sort Key: tab2_mod.val, tab2_mod.val2 -> Seq Scan on public.tab2_mod - Output: tab2_mod.ctid, tab2_mod.val, tab2_mod.val2 + Output: tab2_mod.ctid, 
tab2_mod.xc_node_id, tab2_mod.val, tab2_mod.val2 (20 rows) explain (verbose on, nodes off, costs off) update tab1_rep set val2 = 1000 from tab2_rep diff --git a/src/test/regress/expected/xc_groupby_1.out b/src/test/regress/expected/xc_groupby_1.out index b33bfcf0..c411847d 100644 --- a/src/test/regress/expected/xc_groupby_1.out +++ b/src/test/regress/expected/xc_groupby_1.out @@ -4057,12 +4057,12 @@ explain (verbose true, costs false, nodes false) select count(*) + sum(val) + av Remote Subquery Scan on all Output: (((count(*) + sum(val)))::numeric + avg(val)), val2 Sort Key: xc_groupby_tab1.val2 - -> Sort - Output: ((((count(*) + sum(val)))::numeric + avg(val))), val2 - Sort Key: xc_groupby_tab1.val2 - -> Finalize HashAggregate - Output: (((count(*) + sum(val)))::numeric + avg(val)), val2 - Group Key: xc_groupby_tab1.val2 + -> Finalize GroupAggregate + Output: (((count(*) + sum(val)))::numeric + avg(val)), val2 + Group Key: xc_groupby_tab1.val2 + -> Sort + Output: val2, (PARTIAL count(*)), (PARTIAL sum(val)), (PARTIAL avg(val)) + Sort Key: xc_groupby_tab1.val2 -> Remote Subquery Scan on all Output: val2, PARTIAL count(*), PARTIAL sum(val), PARTIAL avg(val) Distribute results by H: val2 @@ -4106,12 +4106,12 @@ explain (verbose true, costs false, nodes false) select sum(val), avg(val), 2 * Remote Subquery Scan on all Output: sum(val), avg(val), (2 * val2) Sort Key: (2 * xc_groupby_tab1.val2) - -> Sort - Output: (sum(val)), (avg(val)), ((2 * val2)) - Sort Key: ((2 * xc_groupby_tab1.val2)) - -> Finalize HashAggregate - Output: sum(val), avg(val), ((2 * val2)) - Group Key: (2 * xc_groupby_tab1.val2) + -> Finalize GroupAggregate + Output: sum(val), avg(val), ((2 * val2)) + Group Key: ((2 * xc_groupby_tab1.val2)) + -> Sort + Output: ((2 * val2)), (PARTIAL sum(val)), (PARTIAL avg(val)) + Sort Key: ((2 * xc_groupby_tab1.val2)) -> Remote Subquery Scan on all Output: (2 * val2), PARTIAL sum(val), PARTIAL avg(val) Distribute results by H: (2 * val2) diff --git a/src/test/regress/expected/xc_having_1.out b/src/test/regress/expected/xc_having_1.out index 9d914a2a..dd87a084 100644 --- a/src/test/regress/expected/xc_having_1.out +++ b/src/test/regress/expected/xc_having_1.out @@ -611,17 +611,17 @@ explain (verbose true, costs false, nodes false) select count(*), sum(xc_having_ -> Hash Join Output: xc_having_tab1.val2, xc_having_tab2.val2, xc_having_tab1.val, xc_having_tab2.val Hash Cond: (xc_having_tab1.val2 = xc_having_tab2.val2) - Join Filter: ((xc_having_tab1.val2 + xc_having_tab2.val2) > 2) - -> Remote Subquery Scan on all - Output: xc_having_tab1.val, xc_having_tab1.val2 - Distribute results by H: val2 + Join Filter: ((xc_having_tab1.val2 + xc_having_tab2.val2) > 2) + -> Remote Subquery Scan on all + Output: xc_having_tab1.val, xc_having_tab1.val2 + Distribute results by H: val2 -> Seq Scan on public.xc_having_tab1 Output: xc_having_tab1.val, xc_having_tab1.val2 -> Hash - Output: xc_having_tab2.val, xc_having_tab2.val2 - -> Remote Subquery Scan on all Output: xc_having_tab2.val, xc_having_tab2.val2 - Distribute results by H: val2 + -> Remote Subquery Scan on all + Output: xc_having_tab2.val, xc_having_tab2.val2 + Distribute results by H: val2 -> Seq Scan on public.xc_having_tab2 Output: xc_having_tab2.val, xc_having_tab2.val2 (24 rows) diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule index c0ccc373..31e0b077 100644 --- a/src/test/regress/parallel_schedule +++ b/src/test/regress/parallel_schedule @@ -164,6 +164,3 @@ test: xl_primary_key xl_foreign_key 
xl_distribution_column_types xl_alter_table # This runs TBase specific tests test: tbase_explain - -test: redistribute_custom_types -test: nestloop_by_shard From b467bf42642e489bffb86f6c6299c1c77294deb1 Mon Sep 17 00:00:00 2001 From: whalesong Date: Fri, 8 Jan 2021 16:34:22 +0800 Subject: [PATCH 156/578] [Bugfix] gb18030 decode failed when use prepared statement, ID84482999 (merge request !82) Squash merge branch 'Tbase_v5.05.2_bugfix_gb18030' into 'Tbase_v5.05.2' * bugfix: gb18030 decode failed when use prepared statement, ID84482999, add regress cases * bugfix: gb18030 decode failed when use prepared statement, ID84482999 (cherry picked from commit 10aa5b2b) 005c001e bugfix: gb18030 decode failed when use prepared statement, ID84482999, add regress cases 6c195741 bugfix: gb18030 decode failed when use prepared statement, ID84482999 --- src/backend/pgxc/pool/execRemote.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 52850951..5a7d8b4d 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -3664,6 +3664,12 @@ pgxc_node_remote_cleanup_all(void) return; } + /* Do not cleanup connections if we have prepared statements on nodes */ + if (HaveActiveDatanodeStatements()) + { + return; + } + /* * Send down snapshot followed by DISCARD ALL command. */ From ebe449a5689077b3e718c69cab9c6d5b1557bc04 Mon Sep 17 00:00:00 2001 From: bethding Date: Tue, 30 Mar 2021 14:39:55 +0800 Subject: [PATCH 157/578] only rewrite for distribute key --- src/backend/executor/execMain.c | 28 ++++++--------------------- src/backend/optimizer/util/pgxcship.c | 4 +++- src/test/regress/expected/prepare.out | 21 ++++++++++++++++++++ src/test/regress/sql/prepare.sql | 12 ++++++++++++ 4 files changed, 42 insertions(+), 23 deletions(-) diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c index 9ec3add2..571f9473 100644 --- a/src/backend/executor/execMain.c +++ b/src/backend/executor/execMain.c @@ -1906,13 +1906,10 @@ ExecEndPlan(PlanState *planstate, EState *estate) * which datanode will execute the sql command. After we get the result, * we should use the result to replace distribute key's function to * generate a new sql that will be shipped to datanode. - * Note: for replication table, we should caculate all the results of - * functions before ship the sql. Otherwise the value may not be same - * in different datanodes. */ static void RewriteForSql(RemoteQueryState *planstate, RemoteQuery *plan, - char *distribcol, bool isreplic) + char *distribcol) { Query *query = copyObject(plan->forDeparse); ListCell *lc_deparse = NULL; @@ -1928,13 +1925,10 @@ RewriteForSql(RemoteQueryState *planstate, RemoteQuery *plan, foreach(lc_deparse, query->targetList) { entry_deparse = lfirst(lc_deparse); - if (isreplic) - { - entry_deparse->expr = (Expr *)replace_distribkey_func( - (Node *)entry_deparse->expr); - find_target = true; - } - else if (strcmp(entry_deparse->resname, distribcol) == 0) + + /* Only rewrite distribute key's function. 
*/ + if (strcmp(entry_deparse->resname, distribcol) == 0 && + !pgxc_is_expr_shippable(entry_deparse->expr, NULL)) { entry_deparse->expr = (Expr *)replace_distribkey_func( (Node *)entry_deparse->expr); @@ -1988,16 +1982,6 @@ RewriteFuncNode(PlanState *planstate) if ((!exec_nodes) || (!exec_nodes->need_rewrite)) return; - /* - * For replicated table, we need to execute func - * and then ship to datanode - */ - if (IsExecNodesReplicated(exec_nodes)) - { - RewriteForSql(node, plan, NULL, true); - return; - } - if (exec_nodes->en_relid == InvalidOid || (!exec_nodes->en_expr)) return; @@ -2006,7 +1990,7 @@ RewriteFuncNode(PlanState *planstate) return; distribcol = GetRelationDistribColumn(rel_loc_info); - RewriteForSql(node, plan, distribcol, false); + RewriteForSql(node, plan, distribcol); } /* ---------------------------------------------------------------- diff --git a/src/backend/optimizer/util/pgxcship.c b/src/backend/optimizer/util/pgxcship.c index d294de0b..7edfd4be 100644 --- a/src/backend/optimizer/util/pgxcship.c +++ b/src/backend/optimizer/util/pgxcship.c @@ -2028,7 +2028,9 @@ pgxc_is_query_shippable(Query *query, int query_level) * must know the function's result before real execute. So set * the flag to identify rewrite in ExecutePlan. */ - if (bms_is_member(SS_NEED_FUNC_REWRITE, shippability)) + if (bms_is_member(SS_NEED_FUNC_REWRITE, shippability) && + (IsLocatorColumnDistributed(exec_nodes->baselocatortype) || + IsLocatorDistributedByValue(exec_nodes->baselocatortype))) { exec_nodes->need_rewrite = true; shippability = bms_del_member(shippability, SS_NEED_FUNC_REWRITE); diff --git a/src/test/regress/expected/prepare.out b/src/test/regress/expected/prepare.out index 7dd52d9a..1e93e2ad 100644 --- a/src/test/regress/expected/prepare.out +++ b/src/test/regress/expected/prepare.out @@ -322,6 +322,27 @@ SELECT * from insert_fsq_test order by id; DEALLOCATE PREPARE ps_test_insert; DROP TABLE insert_fsq_test cascade; +-- test non-distribute key with function, no need rewrite +CREATE TABLE insert_fsq_test1(v int, w int); +CREATE SEQUENCE test_seq; +PREPARE ps_test_insert1(int) AS INSERT INTO insert_fsq_test1 values($1, nextval('test_seq')); +EXECUTE ps_test_insert1(1); +EXECUTE ps_test_insert1(2); +EXECUTE ps_test_insert1(3); +EXECUTE ps_test_insert1(4); +EXECUTE ps_test_insert1(5); +SELECT * from insert_fsq_test1 order by v; + v | w +---+--- + 1 | 1 + 2 | 2 + 3 | 3 + 4 | 4 + 5 | 5 +(5 rows) + +DEALLOCATE PREPARE ps_test_insert1; +DROP TABLE insert_fsq_test1 cascade; -- -- gb18030 test -- diff --git a/src/test/regress/sql/prepare.sql b/src/test/regress/sql/prepare.sql index 9a465ab3..f20b20e8 100644 --- a/src/test/regress/sql/prepare.sql +++ b/src/test/regress/sql/prepare.sql @@ -171,6 +171,18 @@ EXECUTE ps_test_insert('5'); SELECT * from insert_fsq_test order by id; DEALLOCATE PREPARE ps_test_insert; DROP TABLE insert_fsq_test cascade; +-- test non-distribute key with function, no need rewrite +CREATE TABLE insert_fsq_test1(v int, w int); +CREATE SEQUENCE test_seq; +PREPARE ps_test_insert1(int) AS INSERT INTO insert_fsq_test1 values($1, nextval('test_seq')); +EXECUTE ps_test_insert1(1); +EXECUTE ps_test_insert1(2); +EXECUTE ps_test_insert1(3); +EXECUTE ps_test_insert1(4); +EXECUTE ps_test_insert1(5); +SELECT * from insert_fsq_test1 order by v; +DEALLOCATE PREPARE ps_test_insert1; +DROP TABLE insert_fsq_test1 cascade; -- -- gb18030 test From 3756cf349fbec83e4adbc25e98bdd299e3ae50f8 Mon Sep 17 00:00:00 2001 From: bethding Date: Wed, 27 Jan 2021 20:53:23 +0800 Subject: [PATCH 158/578] 
select cmd ship to datanode when single datanode --- src/backend/commands/explain.c | 86 +++++++++++++++++++++++++++ src/backend/optimizer/util/pgxcship.c | 6 ++ 2 files changed, 92 insertions(+) diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index 09617d73..2f7ea8e7 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -147,6 +147,7 @@ static void ExplainDummyGroup(const char *objtype, const char *labelname, static void ExplainExecNodes(ExecNodes *en, ExplainState *es); static void ExplainRemoteQuery(RemoteQuery *plan, PlanState *planstate, List *ancestors, ExplainState *es); +static char **StrSplit(const char *str, const char *delimiter, int *n); #endif static void ExplainXMLTag(const char *tagname, int flags, ExplainState *es); static void ExplainJSONLineEnding(ExplainState *es); @@ -3493,6 +3494,74 @@ ExplainPropertyListNested(const char *qlabel, List *data, ExplainState *es) } } +/* split a string based on a delimiter */ +static char ** +StrSplit(const char *str, const char *delimiter, int *n) +{ + char *tmp = NULL; + char **rtn = NULL; + char *token = NULL; + + *n = 0; + if (!str) + return NULL; + + /* copy str to tmp as strtok will mangle the string */ + tmp = pstrdup(str); + + if (!strlen(tmp) || !delimiter || !strlen(delimiter)) + { + *n = 1; + rtn = (char **) palloc(*n * sizeof(char *)); + rtn[0] = pstrdup(tmp); + pfree(tmp); + return rtn; + } + + token = strtok(tmp, delimiter); + while (token != NULL) + { + if (*n < 1) + { + rtn = (char **) palloc(sizeof(char *)); + } + else + { + rtn = (char **) repalloc(rtn, (*n + 1) * sizeof(char *)); + } + + rtn[*n] = NULL; + rtn[*n] = pstrdup(token); + *n = *n + 1; + + token = strtok(NULL, delimiter); + } + + pfree(tmp); + return rtn; +} + +static void +DealRemoteJson(StringInfo explainResult, const char *value, int spaceLen) +{ + int i = 0; + int num = 0; + char **result = NULL; + result = StrSplit(value, "\n", &num); + for (i = 0; i < num; i++) + { + if (i > 0) + { + appendStringInfo(explainResult, "\n"); + appendStringInfoSpaces(explainResult, spaceLen); + } + appendStringInfo(explainResult, "%s", result[i]); + pfree(result[i]); + } + if (result) + pfree(result); +} + /* * Explain a simple property. 
* @@ -4007,9 +4076,18 @@ ExplainRemoteQuery(RemoteQuery *plan, PlanState *planstate, List *ancestors, Exp value = slot_getattr(result, 1, &isnull); if (!isnull) { + if (es->format == EXPLAIN_FORMAT_JSON) + { + if (!firstline) + appendStringInfo(&explainResult, "\n"); + DealRemoteJson(&explainResult, TextDatumGetCString(value), 2 * es->indent); + } + else + { if (!firstline) appendStringInfoSpaces(&explainResult, 2 * es->indent); appendStringInfo(&explainResult, "%s\n", TextDatumGetCString(value)); + } firstline = false; } @@ -4020,10 +4098,18 @@ ExplainRemoteQuery(RemoteQuery *plan, PlanState *planstate, List *ancestors, Exp if (es->format == EXPLAIN_FORMAT_TEXT) appendStringInfo(es->str, "%s", explainResult.data); + else if (es->format == EXPLAIN_FORMAT_JSON) + { + appendStringInfoChar(es->str, '\n'); + appendStringInfoSpaces(es->str, es->indent * 2); + appendStringInfo(es->str, "%s: %s", "\"Remote plan\"", explainResult.data); + } else + { ExplainPropertyText("Remote plan", explainResult.data, es); } } +} #endif /* diff --git a/src/backend/optimizer/util/pgxcship.c b/src/backend/optimizer/util/pgxcship.c index 7edfd4be..df8aa3d5 100644 --- a/src/backend/optimizer/util/pgxcship.c +++ b/src/backend/optimizer/util/pgxcship.c @@ -1916,6 +1916,12 @@ pgxc_is_query_shippable(Query *query, int query_level) pgxc_shippability_walker((Node *)query, &sc_context); exec_nodes = sc_context.sc_exec_nodes; + + /* For single datanode and select command, we ship it directly. */ + if (NumDataNodes == 1 && query->commandType == CMD_SELECT && + !bms_is_member(SS_NEEDS_COORD, sc_context.sc_shippability)) + return exec_nodes; + /* * The shippability context contains two ExecNodes, one for the subLinks * involved in the Query and other for the relation involved in FromClause. 
From 8a94c0b1220238afb4aae28070757b732ca5f967 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Fri, 4 Jun 2021 16:54:33 +0800 Subject: [PATCH 159/578] fix compile warnning:pgxc_is_expr_shippable implicit declaration --- src/backend/executor/execMain.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c index 571f9473..7424b45a 100644 --- a/src/backend/executor/execMain.c +++ b/src/backend/executor/execMain.c @@ -88,6 +88,7 @@ #include "optimizer/planmain.h" #include "pgxc/squeue.h" #include "utils/relfilenodemap.h" +#include "optimizer/pgxcship.h" #endif #ifdef __AUDIT__ From 40179d321f20435db553fc62955b40ae8754a8fe Mon Sep 17 00:00:00 2001 From: andrelin Date: Fri, 4 Jun 2021 16:40:56 +0800 Subject: [PATCH 160/578] Pooler support not raising error when fail to connect to nodes Only used in pg_stat_cluster_activity extension for now tapd: http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131088425671 --- contrib/pg_clean/pg_clean.c | 3198 +++++++++++++++++ .../pg_stat_cluster_activity.c | 3 +- src/backend/pgxc/barrier/barrier.c | 626 ++-- src/backend/pgxc/cluster/pause.c | 4 +- src/backend/pgxc/pool/execRemote.c | 59 +- src/backend/pgxc/pool/pgxcnode.c | 22 +- src/backend/pgxc/pool/poolmgr.c | 74 +- src/backend/replication/logical/worker.c | 2 +- src/include/pgxc/locator.h | 1 + src/include/pgxc/pgxcnode.h | 3 +- src/include/pgxc/poolmgr.h | 3 +- 11 files changed, 3648 insertions(+), 347 deletions(-) create mode 100644 contrib/pg_clean/pg_clean.c diff --git a/contrib/pg_clean/pg_clean.c b/contrib/pg_clean/pg_clean.c new file mode 100644 index 00000000..08a189f9 --- /dev/null +++ b/contrib/pg_clean/pg_clean.c @@ -0,0 +1,3198 @@ +#include "postgres.h" +#include "fmgr.h" +#include "funcapi.h" +#include "miscadmin.h" + +#include +#include +#include +#include +#include + +#include "storage/procarray.h" +#include "storage/lwlock.h" +#include "storage/proc.h" +#include "utils/varlena.h" +#include "utils/lsyscache.h" +#include "utils/palloc.h" +#include "utils/builtins.h" + +#include "executor/tuptable.h" +#include "pgxc/execRemote.h" +#include "pgxc/pgxcnode.h" +#include "access/tupdesc.h" +#include "access/htup_details.h" +#include "lib/stringinfo.h" + +#include "access/gtm.h" +#include "datatype/timestamp.h" +#include "access/xact.h" +#include "pgxc/pgxcnode.h" +#include "pgxc/poolmgr.h" +#include "utils/timestamp.h" +#include "catalog/pg_control.h" +#include "commands/dbcommands.h" + +#include "utils/memutils.h" +#include "nodes/memnodes.h" + +#ifdef XCP +#include "catalog/pg_type.h" +#include "catalog/pgxc_node.h" +#include "executor/executor.h" +#include "nodes/makefuncs.h" +#include "utils/snapmgr.h" +#endif +#ifdef PGXC +#include "pgxc/nodemgr.h" +#include "pgxc/pgxc.h" +#endif + +#include "storage/fd.h" +#include "pgstat.h" +#include "access/xact.h" +#include "access/twophase.h" +#include "access/hash.h" + +/*hash_create hash_search*/ +#include "utils/hsearch.h" + +#define TWOPHASE_RECORD_DIR "pg_2pc" +int transaction_threshold = 200000; +#define MAXIMUM_CLEAR_FILE 10000 +#define MAXIMUM_OUTPUT_FILE 1000 +#define XIDPREFIX "_$XC$" +#define DEFAULT_CLEAN_TIME_INTERVAL 120000000 +#ifdef __TWO_PHASE_TESTS__ +#define LEAST_CLEAN_TIME_INTERVAL 10000000 /* in pg_clean test_mode should not clean twophase trans prepared in ten seconds or commit in ten seconds */ +#else +#define LEAST_CLEAN_TIME_INTERVAL 60000000 /* should not clean twophase trans prepared in a minite or commit in a minite */ +#endif +GlobalTimestamp 
clean_time_interval = DEFAULT_CLEAN_TIME_INTERVAL; + + +PG_MODULE_MAGIC; + +#define MAX_GID 50 +#define MAX_DBNAME 64 +#define GET_START_XID "startxid:" +#define GET_COMMIT_TIMESTAMP "global_commit_timestamp:" +#define GET_START_NODE "startnode:" +#define GET_NODE "nodes:" +#define GET_XID "\nxid:" +#define GET_READONLY "readonly" +#define GIDSIZE (200 + 24) +#define MAX_TWOPC_TXN 1000 +#define STRING_BUFF_LEN 1024 + +#define MAX_CMD_LENGTH 120 + +#define XIDFOUND 1 +#define XIDNOTFOUND -1 +#define XIDEXECFAIL -2 + +#define FILEFOUND 1 +#define FILEUNKOWN -1 +#define FILENOTFOUND -2 + +#define INIT(x)\ +do{\ + x = NULL;\ + x##_count = 0;\ + x##_size = 0;\ +}while(0); + +#define RPALLOC(x)\ +do{\ + if (x##_size < x##_count+1)\ + {\ + int temp_size = (x##_size > 0) ? x##_size : 1;\ + if (NULL == x)\ + {\ + x = palloc0(2*temp_size*sizeof(*x));\ + }\ + else\ + {\ + x = repalloc(x, 2*temp_size*sizeof(*x));\ + }\ + x##_size = 2*temp_size;\ + }\ +}while(0); + +#define PALLOC(x, y)\ +do{\ + RPALLOC(x);\ + x[x##_count] = y;\ + x##_count++;\ +}while(0); + +#define RFREE(x)\ +do{\ + if (x##_size > 0)\ + {\ + pfree(x);\ + }\ + x = NULL;\ + x##_count = 0;\ + x##_size = 0;\ +}while(0); + +#define ENUM_TOCHAR_CASE(x) case x: return(#x); + +/*data structures*/ +typedef enum TXN_STATUS +{ + TXN_STATUS_INITIAL = 0, /* Initial */ + TXN_STATUS_PREPARED, + TXN_STATUS_COMMITTED, + TXN_STATUS_ABORTED, + TXN_STATUS_INPROGRESS, + TXN_STATUS_FAILED, /* Error detected while interacting with the node */ + TXN_STATUS_UNKNOWN /* Unknown: Frozen, running, or not started */ +} TXN_STATUS; + + +typedef enum +{ + UNDO = 0, + ABORT, + COMMIT +} OPERATION; + +typedef enum +{ + TWOPHASE_FILE_EXISTS = 0, + TWOPHASE_FILE_NOT_EXISTS, + TWOPHASE_FILE_OLD, + TWOPHASE_FILE_ERROR +}TWOPHASE_FILE_STATUS; + +typedef struct txn_info +{ + char gid[MAX_GID]; + uint32 *xid; /* xid used in prepare */ + TimestampTz *prepare_timestamp; + char *owner; + char *participants; + Oid origcoord; /* Original coordinator who initiated the txn */ + bool after_first_phase; + uint32 startxid; /* xid in Original coordinator */ + bool isorigcoord_part; /* Is original coordinator a + participant? */ + int num_dnparts; /* Number of participant datanodes */ + int num_coordparts; /* Number of participant coordinators */ + int *dnparts; /* Whether a node was participant in the txn */ + int *coordparts; + TXN_STATUS *txn_stat; /* Array for each nodes */ + char *msg; /* Notice message for this txn. 
*/ + GlobalTimestamp global_commit_timestamp; /* get global_commit_timestamp from node once it is committed*/ + + TXN_STATUS global_txn_stat; + OPERATION op; + bool op_issuccess; + bool is_readonly; + bool belong_abnormal_node; +}txn_info; + +typedef struct database_info +{ + struct database_info *next; + char *database_name; + + HTAB *all_txn_info; +#if 0 + txn_info *head_txn_info; + txn_info *last_txn_info; +#endif +} database_info; + +typedef struct +{ + int index; + txn_info **txn; + int txn_count; + int txn_size; + MemoryContext mycontext; +} print_txn_info; + +typedef struct +{ + int index; + int count; + char **gid; + int gid_count; + int gid_size; + char **database; + int database_count; + int database_size; + char **global_status; + int global_status_count; + int global_status_size; + char **status; + int status_count; + int status_size; + MemoryContext mycontext; +} print_status; + +typedef struct +{ + char ***slot; /*slot[i][j] stores value of row i, colum j*/ + int slot_count; /*number of rows*/ + int slot_size; + int attnum; +}TupleTableSlots; + +/*global variable*/ +static Oid *cn_node_list = NULL; +static Oid *dn_node_list = NULL; +static bool *cn_health_map = NULL; +static bool *dn_health_map = NULL; +static int cn_nodes_num = 0; +static int dn_nodes_num = 0; +static int pgxc_clean_node_count = 0; +static Oid my_nodeoid; +static +database_info *head_database_info = NULL; +static +database_info *last_database_info = NULL; +bool execute = false; +int total_twopc_txn = 0; + +TimestampTz current_time; +GlobalTimestamp abnormal_time = InvalidGlobalTimestamp; +char *abnormal_nodename = NULL; +Oid abnormal_nodeoid = InvalidOid; +bool clear_2pc_belong_node = false; + + +/*function list*/ + /*plugin entry function*/ + +static bool check_node_health(Oid node_oid); +static Datum + execute_query_on_single_node(Oid node, const char * query, int attnum, TupleTableSlots * tuples); +void DestroyTxnHash(void); +static void ResetGlobalVariables(void); + +static Oid + getMyNodeoid(void); +static void + getDatabaseList(void); +static char* TTSgetvalue(TupleTableSlots *result, int tup_num, int field_num); +static void DropTupleTableSlots(TupleTableSlots * +Slots); +static void + getTxnInfoOnNodesAll(void); +void getTxnInfoOnNode(Oid node); +void add_txn_info(char * dbname, Oid node_oid, uint32 xid, char * gid, char * owner, + TimestampTz prepared_time, TXN_STATUS status); +TWOPHASE_FILE_STATUS GetTransactionPartNodes(txn_info * txn, Oid node_oid); +static txn_info * + find_txn(char *gid); +txn_info* + make_txn_info(char * dbname, char * gid, char * owner); +database_info* + find_database_info(char *database_name); +database_info* + add_database_info(char *database_name); +int find_node_index(Oid node_oid); +Oid find_node_oid(int node_idx); +void getTxnInfoOnOtherNodesAll(void); +void getTxnInfoOnOtherNodesForDatabase(database_info *database); +void getTxnInfoOnOtherNodes(txn_info *txn); +int Get2PCXidByGid(Oid node_oid, char * gid, uint32 * transactionid); +int Get2PCFile(Oid node_oid, char * gid, uint32 * transactionid); + +char *get2PCInfo(const char *tid); + +void getTxnStatus(txn_info * txn, int node_idx); +void recover2PCForDatabaseAll(void); +void recover2PCForDatabase(database_info * db_info); +#if 0 +static bool + setMaintenanceMode(bool status); +#endif +bool send_query_clean_transaction(PGXCNodeHandle * conn, txn_info * txn, const char * finish_cmd); +bool check_2pc_belong_node(txn_info * txn); +bool check_node_participate(txn_info * txn, int node_idx); + +void recover2PC(txn_info * 
txn); +TXN_STATUS + check_txn_global_status(txn_info *txn); +bool clean_2PC_iscommit(txn_info *txn, bool is_commit, bool is_check); +bool clean_2PC_files(txn_info *txn); +void Init_print_txn_info(print_txn_info *print_txn); +void Init_print_stats_all(print_status *pstatus); +void Init_print_stats(txn_info * txn, char * database, print_status * pstatus); +static const char * + txn_status_to_string(TXN_STATUS status); +static const char * + txn_op_to_string(OPERATION op); +static void + CheckFirstPhase(txn_info *txn); +static void + get_transaction_handles(PGXCNodeAllHandles **pgxc_handles, txn_info *txn); +static void + get_node_handles(PGXCNodeAllHandles ** pgxc_handles, Oid nodeoid); + +Datum pg_clean_execute(PG_FUNCTION_ARGS); +PG_FUNCTION_INFO_V1(pg_clean_execute); +Datum pg_clean_execute(PG_FUNCTION_ARGS) +{ +#ifdef ACCESS_CONTROL_ATTR_NUM +#undef ACCESS_CONTROL_ATTR_NUM +#endif +#define ACCESS_CONTROL_ATTR_NUM 4 + FuncCallContext *funcctx; + HeapTuple tuple; + print_txn_info *print_txn = NULL; + txn_info *temp_txn; + char txn_gid[100]; + char txn_status[100]; + char txn_op[100]; + char txn_op_issuccess[100]; + + Datum values[ACCESS_CONTROL_ATTR_NUM]; + bool nulls[ACCESS_CONTROL_ATTR_NUM]; + + if(!IS_PGXC_COORDINATOR) + { + elog(ERROR, "can only called on coordinator"); + } + + if (SRF_IS_FIRSTCALL()) + { + MemoryContext oldcontext; + TupleDesc tupdesc; + MemoryContext mycontext; + funcctx = SRF_FIRSTCALL_INIT(); + + oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); + + tupdesc = CreateTemplateTupleDesc(ACCESS_CONTROL_ATTR_NUM, false); + TupleDescInitEntry(tupdesc, (AttrNumber) 1, "gid", + TEXTOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 2, "global_transaction_status", + TEXTOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 3, "operation", + TEXTOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 4, "operation_status", + TEXTOID, -1, 0); + funcctx->tuple_desc = BlessTupleDesc(tupdesc); + + funcctx->user_fctx = (print_txn_info *)palloc0(sizeof(print_txn_info)); + print_txn = (print_txn_info *) funcctx->user_fctx; + + + MemoryContextSwitchTo(oldcontext); + mycontext = AllocSetContextCreate(funcctx->multi_call_memory_ctx, + "clean_check", + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, + ALLOCSET_DEFAULT_MAXSIZE); + oldcontext = MemoryContextSwitchTo(mycontext); + + /*clear Global*/ + ResetGlobalVariables(); + execute = true; + clean_time_interval = PG_GETARG_INT32(0) * 1000000; + if (LEAST_CLEAN_TIME_INTERVAL > clean_time_interval) + { + clean_time_interval = LEAST_CLEAN_TIME_INTERVAL; + } + + /*get node list*/ + PgxcNodeGetOids(&cn_node_list, &dn_node_list, + &cn_nodes_num, &dn_nodes_num, true); + pgxc_clean_node_count = cn_nodes_num + dn_nodes_num; + my_nodeoid = getMyNodeoid(); + cn_health_map = palloc0(cn_nodes_num * sizeof(bool)); + dn_health_map = palloc0(dn_nodes_num * sizeof(bool)); + + /*add my database info*/ + add_database_info(get_database_name(MyDatabaseId)); + + /*get all info of 2PC transactions*/ + getTxnInfoOnNodesAll(); + + /*get txn info on other nodes all*/ + getTxnInfoOnOtherNodesAll(); + + /*recover all 2PC transactions*/ + recover2PCForDatabaseAll(); + + Init_print_txn_info(print_txn); + + print_txn->mycontext = mycontext; + + MemoryContextSwitchTo(oldcontext); + + } + + funcctx = SRF_PERCALL_SETUP(); + print_txn = (print_txn_info *) funcctx->user_fctx; + + if (print_txn->index < print_txn->txn_count) + { + temp_txn = print_txn->txn[print_txn->index]; + strncpy(txn_gid, temp_txn->gid, 100); + strncpy(txn_status, 
txn_status_to_string(temp_txn->global_txn_stat), 100); + strncpy(txn_op, txn_op_to_string(temp_txn->op), 100); + if (temp_txn->op_issuccess) + strncpy(txn_op_issuccess, "success", 100); + else + strncpy(txn_op_issuccess, "fail", 100); + + MemSet(values, 0, sizeof(values)); + MemSet(nulls, 0, sizeof(nulls)); + + values[0] = PointerGetDatum(cstring_to_text(txn_gid)); + values[1] = PointerGetDatum(cstring_to_text(txn_status)); + values[2] = PointerGetDatum(cstring_to_text(txn_op)); + values[3] = PointerGetDatum(cstring_to_text(txn_op_issuccess)); + tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls); + print_txn->index++; + SRF_RETURN_NEXT(funcctx, HeapTupleGetDatum(tuple)); + } + else + { + + //MemoryContextDelete(print_txn->mycontext); + DestroyTxnHash(); + ResetGlobalVariables(); + SRF_RETURN_DONE(funcctx); + } +} + +/* + * clear 2pc after oss detect abnormal node and restart it , + * only clear 2pc belong the abnormal node and before the abnormal time + */ +Datum pg_clean_execute_on_node(PG_FUNCTION_ARGS); +PG_FUNCTION_INFO_V1(pg_clean_execute_on_node); +Datum pg_clean_execute_on_node(PG_FUNCTION_ARGS) +{ +#ifdef ACCESS_CONTROL_ATTR_NUM +#undef ACCESS_CONTROL_ATTR_NUM +#endif +#define ACCESS_CONTROL_ATTR_NUM 4 + FuncCallContext *funcctx; + HeapTuple tuple; + print_txn_info *print_txn = NULL; + txn_info *temp_txn; + char txn_gid[100]; + char txn_status[100]; + char txn_op[100]; + char txn_op_issuccess[100]; + + Datum values[ACCESS_CONTROL_ATTR_NUM]; + bool nulls[ACCESS_CONTROL_ATTR_NUM]; + + if(!IS_PGXC_COORDINATOR) + { + elog(ERROR, "can only called on coordinator"); + } + + if (SRF_IS_FIRSTCALL()) + { + MemoryContext oldcontext; + TupleDesc tupdesc; + MemoryContext mycontext; + funcctx = SRF_FIRSTCALL_INIT(); + + oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); + + tupdesc = CreateTemplateTupleDesc(ACCESS_CONTROL_ATTR_NUM, false); + TupleDescInitEntry(tupdesc, (AttrNumber) 1, "gid", + TEXTOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 2, "global_transaction_status", + TEXTOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 3, "operation", + TEXTOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 4, "operation_status", + TEXTOID, -1, 0); + funcctx->tuple_desc = BlessTupleDesc(tupdesc); + + funcctx->user_fctx = (print_txn_info *)palloc0(sizeof(print_txn_info)); + print_txn = (print_txn_info *) funcctx->user_fctx; + + + MemoryContextSwitchTo(oldcontext); + mycontext = AllocSetContextCreate(funcctx->multi_call_memory_ctx, + "clean_check", + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, + ALLOCSET_DEFAULT_MAXSIZE); + oldcontext = MemoryContextSwitchTo(mycontext); + + /*clear Global*/ + ResetGlobalVariables(); + execute = true; + clear_2pc_belong_node = true; + + abnormal_nodename = text_to_cstring(PG_GETARG_TEXT_P(0)); + abnormal_nodeoid = get_pgxc_nodeoid(abnormal_nodename); + if (InvalidOid == abnormal_nodeoid) + { + elog(ERROR, "pg_clean_execute_on_node, cannot clear 2pc of invalid nodename '%s'", abnormal_nodename); + } + abnormal_time = PG_GETARG_INT64(1); + current_time = GetCurrentTimestamp(); + if (abnormal_time >= current_time) + { + elog(ERROR, "pg_clean_execute_on_node, abnormal time "INT64_FORMAT" must before current_time "INT64_FORMAT, abnormal_time, current_time); + } + + /*get node list*/ + PgxcNodeGetOids(&cn_node_list, &dn_node_list, + &cn_nodes_num, &dn_nodes_num, true); + pgxc_clean_node_count = cn_nodes_num + dn_nodes_num; + my_nodeoid = getMyNodeoid(); + cn_health_map = palloc0(cn_nodes_num * sizeof(bool)); + 
dn_health_map = palloc0(dn_nodes_num * sizeof(bool)); + + /*add my database info*/ + add_database_info(get_database_name(MyDatabaseId)); + + /*get all info of 2PC transactions*/ + getTxnInfoOnNodesAll(); + + /*get txn info on other nodes all*/ + getTxnInfoOnOtherNodesAll(); + + /*recover all 2PC transactions*/ + recover2PCForDatabaseAll(); + + Init_print_txn_info(print_txn); + + print_txn->mycontext = mycontext; + + MemoryContextSwitchTo(oldcontext); + + } + + funcctx = SRF_PERCALL_SETUP(); + print_txn = (print_txn_info *) funcctx->user_fctx; + + if (print_txn->index < print_txn->txn_count) + { + temp_txn = print_txn->txn[print_txn->index]; + strncpy(txn_gid, temp_txn->gid, 100); + strncpy(txn_status, txn_status_to_string(temp_txn->global_txn_stat), 100); + strncpy(txn_op, txn_op_to_string(temp_txn->op), 100); + if (temp_txn->op_issuccess) + strncpy(txn_op_issuccess, "success", 100); + else + strncpy(txn_op_issuccess, "fail", 100); + + MemSet(values, 0, sizeof(values)); + MemSet(nulls, 0, sizeof(nulls)); + + values[0] = PointerGetDatum(cstring_to_text(txn_gid)); + values[1] = PointerGetDatum(cstring_to_text(txn_status)); + values[2] = PointerGetDatum(cstring_to_text(txn_op)); + values[3] = PointerGetDatum(cstring_to_text(txn_op_issuccess)); + tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls); + print_txn->index++; + SRF_RETURN_NEXT(funcctx, HeapTupleGetDatum(tuple)); + } + else + { + DestroyTxnHash(); + pfree(abnormal_nodename); + ResetGlobalVariables(); + SRF_RETURN_DONE(funcctx); + } +} + + +Datum pg_clean_check_txn(PG_FUNCTION_ARGS); +PG_FUNCTION_INFO_V1(pg_clean_check_txn); +Datum pg_clean_check_txn(PG_FUNCTION_ARGS) +{ +#ifdef ACCESS_CONTROL_ATTR_NUM +#undef ACCESS_CONTROL_ATTR_NUM +#endif +#define ACCESS_CONTROL_ATTR_NUM 4 + FuncCallContext *funcctx; + HeapTuple tuple; + print_status *pstatus = NULL; + + Datum values[ACCESS_CONTROL_ATTR_NUM]; + bool nulls[ACCESS_CONTROL_ATTR_NUM]; + execute = false; + + if(!IS_PGXC_COORDINATOR) + { + elog(ERROR, "can only called on coordinator"); + } + + if (SRF_IS_FIRSTCALL()) + { + MemoryContext oldcontext; + MemoryContext mycontext; + TupleDesc tupdesc; + funcctx = SRF_FIRSTCALL_INIT(); + + oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); + + tupdesc = CreateTemplateTupleDesc(ACCESS_CONTROL_ATTR_NUM, false); + TupleDescInitEntry(tupdesc, (AttrNumber) 1, "gid", + TEXTOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 2, "database", + TEXTOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 3, "global_transaction_status", + TEXTOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 4, "transaction_status_on_allnodes", + TEXTOID, -1, 0); + funcctx->tuple_desc = BlessTupleDesc(tupdesc); + + funcctx->user_fctx = (print_status *)palloc0(sizeof(print_status)); + pstatus = (print_status *) funcctx->user_fctx; + pstatus->index = pstatus->count = 0; + pstatus->gid = NULL; + pstatus->global_status = pstatus->status = (char **)NULL; + pstatus->database = NULL; + pstatus->mycontext = NULL; + + + MemoryContextSwitchTo(oldcontext); + + mycontext = AllocSetContextCreate(funcctx->multi_call_memory_ctx, + "clean_check", + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, + ALLOCSET_DEFAULT_MAXSIZE); + oldcontext = MemoryContextSwitchTo(mycontext); + + /*clear Global*/ + ResetGlobalVariables(); + + clean_time_interval = PG_GETARG_INT32(0) * 1000000; + if (LEAST_CLEAN_TIME_INTERVAL > clean_time_interval) + { + clean_time_interval = LEAST_CLEAN_TIME_INTERVAL; + } + /*get node list*/ + PgxcNodeGetOids(&cn_node_list, 
&dn_node_list, + &cn_nodes_num, &dn_nodes_num, true); + if (cn_node_list == NULL || dn_node_list == NULL) + elog(ERROR, "pg_clean:fail to get cn_node_list and dn_node_list"); + pgxc_clean_node_count = cn_nodes_num + dn_nodes_num; + my_nodeoid = getMyNodeoid(); + cn_health_map = palloc0(cn_nodes_num * sizeof(bool)); + dn_health_map = palloc0(dn_nodes_num * sizeof(bool)); + + /*get all database info*/ + getDatabaseList(); + + /*get all info of 2PC transactions*/ + getTxnInfoOnNodesAll(); + + /*get txn info on other nodes all*/ + getTxnInfoOnOtherNodesAll(); + + /*recover all 2PC transactions*/ + Init_print_stats_all(pstatus); + + pstatus->mycontext = mycontext; + + MemoryContextSwitchTo(oldcontext); + + } + + funcctx = SRF_PERCALL_SETUP(); + pstatus = (print_status *) funcctx->user_fctx; + + if (pstatus->index < pstatus->count) + { + MemSet(values, 0, sizeof(values)); + MemSet(nulls, 0, sizeof(nulls)); + + values[0] = PointerGetDatum(cstring_to_text(pstatus->gid[pstatus->index])); + values[1] = PointerGetDatum(cstring_to_text(pstatus->database[pstatus->index])); + values[2] = PointerGetDatum(cstring_to_text(pstatus->global_status[pstatus->index])); + values[3] = PointerGetDatum(cstring_to_text(pstatus->status[pstatus->index])); + tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls); + pstatus->index++; + SRF_RETURN_NEXT(funcctx, HeapTupleGetDatum(tuple)); + } + else + { + /* + MemoryContextDelete(pstatus->mycontext); + DropDatabaseInfo(); + */ + DestroyTxnHash(); + ResetGlobalVariables(); + SRF_RETURN_DONE(funcctx); + } +} + +void DestroyTxnHash(void) +{ + database_info *dbinfo = head_database_info; + while (dbinfo) + { + hash_destroy(dbinfo->all_txn_info); + dbinfo = dbinfo->next; + } +} + +static void ResetGlobalVariables(void) +{ + cn_node_list = NULL; + dn_node_list = NULL; + cn_health_map = NULL; + dn_health_map = NULL; + cn_nodes_num = 0; + dn_nodes_num = 0; + pgxc_clean_node_count = 0; + execute = false; + total_twopc_txn = 0; + + head_database_info = last_database_info = NULL; + + current_time = 0; + abnormal_time = InvalidGlobalTimestamp; + abnormal_nodename = NULL; + abnormal_nodeoid = InvalidOid; + clear_2pc_belong_node = false; + +} + +static Oid getMyNodeoid(void) +{ + return get_pgxc_nodeoid(PGXCNodeName); +} + +/* + * execute_query_on_single_node -- execute query on certain node and get results + * input: node oid, execute query, number of attribute in results, results + * return: (Datum) 0 + */ +static Datum +execute_query_on_single_node(Oid node, const char *query, int attnum, TupleTableSlots *tuples) //delete numnodes, delete nodelist, insert node +{ + int ii; + bool issuccess = false; + + /*check health of node*/ + bool ishealthy = check_node_health(node); + +#ifdef XCP + EState *estate; + MemoryContext oldcontext; + RemoteQuery *plan; + RemoteQueryState *pstate; + TupleTableSlot *result = NULL; + Var *dummy; + char ntype = PGXC_NODE_NONE; + + /* + * Make up RemoteQuery plan node + */ + plan = makeNode(RemoteQuery); + plan->combine_type = COMBINE_TYPE_NONE; + plan->exec_nodes = makeNode(ExecNodes); + plan->exec_type = EXEC_ON_NONE; + + plan->exec_nodes->nodeList = lappend_int(plan->exec_nodes->nodeList, + PGXCNodeGetNodeId(node, &ntype)); + if (ntype == PGXC_NODE_NONE) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Unknown node Oid: %u", node))); + else if (ntype == PGXC_NODE_COORDINATOR) + { + plan->exec_type = EXEC_ON_COORDS; + } + else + { + plan->exec_type = EXEC_ON_DATANODES; + } + + plan->sql_statement = (char *)query; + plan->force_autocommit 
= false; + /* + * We only need the target entry to determine result data type. + * So create dummy even if real expression is a function. + */ + for (ii = 1; ii <= attnum; ii++) + { + dummy = makeVar(1, ii, TEXTOID, 0, InvalidOid, 0); + plan->scan.plan.targetlist = lappend(plan->scan.plan.targetlist, + makeTargetEntry((Expr *) dummy, ii, NULL, false)); + } + /* prepare to execute */ + estate = CreateExecutorState(); + oldcontext = MemoryContextSwitchTo(estate->es_query_cxt); + estate->es_snapshot = GetActiveSnapshot(); + pstate = ExecInitRemoteQuery(plan, estate, 0); + MemoryContextSwitchTo(oldcontext); + + /*execute query on node when node is healthy*/ + INIT(tuples->slot); + tuples->attnum = 0; + if (ishealthy) + { + int i_tuple = 0; + int i_attnum = 0; + issuccess = true; + result = ExecRemoteQuery((PlanState *) pstate); + tuples->attnum = attnum; + while (result != NULL && !TupIsNull(result)) + { + slot_getallattrs(result); + RPALLOC(tuples->slot); + tuples->slot[i_tuple] = (char **) palloc0(attnum * sizeof(char *)); + + for (i_attnum = 0; i_attnum < attnum; i_attnum++) + { + /*if (result->tts_values[i_attnum] != (Datum)0)*/ + if (result->tts_isnull[i_attnum] == false) + { + tuples->slot[i_tuple][i_attnum] = text_to_cstring(DatumGetTextP(result->tts_values[i_attnum])); + } + else + { + tuples->slot[i_tuple][i_attnum] = NULL; + } + } + tuples->slot_count++; + + result = ExecRemoteQuery((PlanState *) pstate); + i_tuple++; + } + } + ExecEndRemoteQuery(pstate); +#endif + return issuccess == true ? (Datum) 1 : (Datum) 0; +} + +static bool check_node_health(Oid node_oid) +{ + int i; + bool ishealthy = false; + + PoolPingNodeRecheck(node_oid); + PgxcNodeGetHealthMap(cn_node_list, dn_node_list, + &cn_nodes_num, &dn_nodes_num, + cn_health_map, dn_health_map); + if (get_pgxc_nodetype(node_oid) == 'C') + { + for (i = 0; i < cn_nodes_num; i++) + { + if (cn_node_list[i] == node_oid) + { + ishealthy = cn_health_map[i]; + } + } + } + else + { + for (i = 0; i < dn_nodes_num; i++) + { + if (dn_node_list[i] == node_oid) + { + ishealthy = dn_health_map[i]; + } + } + } + return ishealthy; +} + +static void getDatabaseList(void) +{ + int i; + TupleTableSlots result_db; + const char *query_db = "select datname::text from pg_database;"; + /*add datname into tail of head_database_info*/ + if (execute_query_on_single_node(my_nodeoid, query_db, 1, &result_db) == (Datum) 1) + { + for (i = 0; i < result_db.slot_count; i++) + { + if (TTSgetvalue(&result_db, i, 0)) + { + add_database_info(TTSgetvalue(&result_db, i, 0)); + } + } + } + else + { + elog(LOG, "pg_clean: failed get database list on node %s", get_pgxc_nodename(my_nodeoid)); + } + DropTupleTableSlots(&result_db); +} + +/* + * TTSgetvalue -- get attribute from TupleTableSlots + * input: result, index of tuple, index of field + * return: attribute result + */ +static char * TTSgetvalue(TupleTableSlots *result, int tup_num, int field_num) +{ + return result->slot[tup_num][field_num]; +} + +static void DropTupleTableSlots(TupleTableSlots * +Slots) +{ + int i; + int j; + for (i = 0; i < Slots->slot_count; i++) + { + if (Slots->slot[i]) + { + for (j = 0; j < Slots->attnum; j++) + { + if (Slots->slot[i][j]) + { + pfree(Slots->slot[i][j]); + } + } + pfree(Slots->slot[i]); + } + } + RFREE(Slots->slot); + Slots->attnum = 0; + return; +} + +static void getTxnInfoOnNodesAll(void) +{ + int i; + current_time = GetCurrentTimestamp(); + /*upload 2PC transaction from CN*/ + for (i = 0; i < cn_nodes_num; i++) + { + if (total_twopc_txn >= MAX_TWOPC_TXN) + return; + 
getTxnInfoOnNode(cn_node_list[i]); + } + + /*upload 2PC transaction from DN*/ + for (i = 0; i < dn_nodes_num; i++) + { + if (total_twopc_txn >= MAX_TWOPC_TXN) + return; + getTxnInfoOnNode(dn_node_list[i]); + } +} + +void getTxnInfoOnNode(Oid node) +{ + int i; + TupleTableSlots result_txn; + Datum execute_res; + char query_execute[1024]; + const char *query_txn_status = "select transaction::text, gid::text, owner::text, database::text, timestamptz_out(prepared)::text " + "from pg_prepared_xacts;"; + const char *query_txn_status_execute = "select transaction::text, gid::text, owner::text, database::text, timestamptz_out(prepared)::text " + "from pg_prepared_xacts where database = '%s';"; + snprintf(query_execute, 1024, query_txn_status_execute, get_database_name(MyDatabaseId)); + + if (execute) + execute_res = execute_query_on_single_node(node, query_execute, 5, &result_txn); + else + execute_res = execute_query_on_single_node(node, query_txn_status, 5, &result_txn); + + if (execute_res == (Datum) 1) + { + for (i = 0; i < result_txn.slot_count; i++) + { + uint32 xid; + char* gid; + char* owner; + char* datname; + TimestampTz prepared_time; + + /*read results from each tuple*/ + xid = strtoul(TTSgetvalue(&result_txn, i, 0), NULL, 10); + gid = TTSgetvalue(&result_txn, i, 1); + owner = TTSgetvalue(&result_txn, i, 2); + datname = TTSgetvalue(&result_txn, i, 3); + prepared_time = DatumGetTimestampTz(DirectFunctionCall3(timestamptz_in, + CStringGetDatum(TTSgetvalue(&result_txn, i, 4)), + ObjectIdGetDatum(InvalidOid), + Int32GetDatum(-1))); + + /*add txn to database*/ + add_txn_info(datname, node, xid, gid, owner, prepared_time, TXN_STATUS_PREPARED); + if (total_twopc_txn >= MAX_TWOPC_TXN) + { + break; + } + } + } + else + { + elog(LOG, "pg_clean: failed get database list on node %s", get_pgxc_nodename(node)); + } + DropTupleTableSlots(&result_txn); +} + +void add_txn_info(char* dbname, Oid node_oid, uint32 xid, char * gid, + char * owner, TimestampTz prepared_time, TXN_STATUS status) +{ + txn_info *txn = NULL; + int nodeidx; + + if ((txn = find_txn(gid)) == NULL) + { + txn = make_txn_info(dbname, gid, owner); + total_twopc_txn++; + if (txn == NULL) + { + /*no more memory*/ + elog(ERROR, "there is no more memory for palloc a 2PC transaction"); + } + } + nodeidx = find_node_index(node_oid); + txn->txn_stat[nodeidx] = status; + txn->xid[nodeidx] = xid; + txn->prepare_timestamp[nodeidx] = prepared_time; + if (nodeidx < cn_nodes_num) + { + txn->coordparts[nodeidx] = 1; + txn->num_coordparts++; + } + else + { + txn->dnparts[nodeidx-cn_nodes_num] = 1; + txn->num_dnparts++; + } + return; +} + +TWOPHASE_FILE_STATUS GetTransactionPartNodes(txn_info *txn, Oid node_oid) +{ + /*get all the participates and initiate to each transactions*/ + TWOPHASE_FILE_STATUS res = TWOPHASE_FILE_NOT_EXISTS; + TupleTableSlots result; + char *partnodes = NULL; + char *startnode = NULL; + char *file_content = NULL; + uint32 startxid = 0; + char *str_startxid = NULL; + char *str_timestamp = NULL; + char *temp = NULL; + Oid temp_nodeoid; + char temp_nodetype; + int temp_nodeidx; + char stmt[1024]; + static const char *STMT_FORM = "select pgxc_get_2pc_file('%s')::text"; + snprintf(stmt, 1024, STMT_FORM, txn->gid, txn->gid, txn->gid, txn->gid); + + if (execute_query_on_single_node(node_oid, stmt, 1, &result) == (Datum) 1) + { + if (result.slot_count && TTSgetvalue(&result, 0, 0)) +#if 0 + TTSgetvalue(&result, 0, 0) && + TTSgetvalue(&result, 0, 1) && + TTSgetvalue(&result, 0, 2)) +#endif + { + file_content = TTSgetvalue(&result, 0, 
0); + + if (!IsXidImplicit(txn->gid) && strstr(file_content, GET_READONLY)) + { + txn->is_readonly = true; + txn->global_txn_stat = TXN_STATUS_COMMITTED; + DropTupleTableSlots(&result); + return TWOPHASE_FILE_EXISTS; + } + startnode = strstr(file_content, GET_START_NODE); + str_startxid = strstr(file_content, GET_START_XID); + partnodes = strstr(file_content, GET_NODE); + temp = strstr(file_content, GET_COMMIT_TIMESTAMP); + + /* get the last global_commit_timestamp */ + while (temp) + { + str_timestamp = temp; + temp += strlen(GET_COMMIT_TIMESTAMP); + temp = strstr(temp, GET_COMMIT_TIMESTAMP); + } + + if (startnode) + { + startnode += strlen(GET_START_NODE); + startnode = strtok(startnode, "\n"); + txn->origcoord = get_pgxc_nodeoid(startnode); + } + + if (str_startxid) + { + str_startxid += strlen(GET_START_XID); + str_startxid = strtok(str_startxid, "\n"); + startxid = strtoul(str_startxid, NULL, 10); + txn->startxid = startxid; + } + + if (partnodes) + { + partnodes += strlen(GET_NODE); + partnodes = strtok(partnodes, "\n"); + txn->participants = (char *) palloc0(strlen(partnodes) + 1); + strncpy(txn->participants, partnodes, strlen(partnodes) + 1); + } + + if (NULL == startnode || NULL == str_startxid) + { + res = TWOPHASE_FILE_OLD; + DropTupleTableSlots(&result); + return res; + } + + if (NULL == partnodes) + { + res = TWOPHASE_FILE_ERROR; + DropTupleTableSlots(&result); + return res; + } + + if (str_timestamp) + { + str_timestamp += strlen(GET_COMMIT_TIMESTAMP); + str_timestamp = strtok(str_timestamp, "\n"); + txn->global_commit_timestamp = strtoull(str_timestamp, NULL, 10); + } + + elog(DEBUG1, "get 2pc txn:%s partnodes in nodename: %s (nodeoid:%u) result: partnodes:%s, startnode:%s, startnodeoid:%u, startxid:%u", + txn->gid, get_pgxc_nodename(node_oid), node_oid, partnodes, startnode, txn->origcoord, startxid); + /* in explicit transaction startnode participate the transaction */ + if (strstr(partnodes, startnode) || !IsXidImplicit(txn->gid)) + { + txn->isorigcoord_part = true; + } + else + { + txn->isorigcoord_part = false; + } + + res = TWOPHASE_FILE_EXISTS; + txn->num_coordparts = 0; + txn->num_dnparts = 0; + temp = strtok(partnodes,", "); + while(temp) + { + /*check node type*/ + temp_nodeoid = get_pgxc_nodeoid(temp); + if (temp_nodeoid == InvalidOid) + { + res = TWOPHASE_FILE_ERROR; + break; + } + temp_nodetype = get_pgxc_nodetype(temp_nodeoid); + temp_nodeidx = find_node_index(temp_nodeoid); + + switch (temp_nodetype) + { + case 'C': + txn->coordparts[temp_nodeidx] = 1; + txn->num_coordparts++; + break; + case 'D': + txn->dnparts[temp_nodeidx-cn_nodes_num] = 1; + txn->num_dnparts++; + break; + default: + elog(ERROR,"nodetype of %s is not 'C' or 'D'", temp); + break; + } + temp = strtok(NULL,", "); + } + } + } + else + { + elog(LOG, "pg_clean: failed get database list on node %s", get_pgxc_nodename(node_oid)); + res = TWOPHASE_FILE_ERROR; + } + DropTupleTableSlots(&result); + return res; +} + +static txn_info *find_txn(char *gid) +{ + bool found; + database_info *cur_db; + txn_info *txn; + + for (cur_db = head_database_info; cur_db; cur_db = cur_db->next) + { +#if 0 + for (cur_txn = cur_db->head_txn_info; cur_txn; cur_txn = cur_txn->next) + { + if (0 == strcmp(cur_txn->gid, gid)) + return cur_txn; + } +#endif + txn = (txn_info *)hash_search(cur_db->all_txn_info, (void *)gid, HASH_FIND, &found); + if (found) + return txn; + } + return NULL; +} + +txn_info* make_txn_info(char* dbname, char* gid, char* owner) +{ + bool found; + txn_info *txn_insert_pos = NULL; + database_info 
*dbinfo; + txn_info *txn; + + dbinfo = add_database_info(dbname); + txn = (txn_info *)palloc0(sizeof(txn_info)); + if (txn == NULL) + return NULL; + //txn->next = NULL; + + //txn->gid = (char *)palloc0(strlen(gid)+1); + strncpy(txn->gid, gid, strlen(gid)+1); + txn->owner = (char *)palloc0(strlen(owner)+1); + strncpy(txn->owner, owner, strlen(owner)+1); + + txn->txn_stat = (TXN_STATUS *)palloc0(sizeof(TXN_STATUS) * pgxc_clean_node_count); + txn->xid = (uint32 *)palloc0(sizeof(uint32) * pgxc_clean_node_count); + txn->prepare_timestamp = (TimestampTz *)palloc0(sizeof(TimestampTz) * pgxc_clean_node_count); + txn->coordparts = (int *)palloc0(cn_nodes_num * sizeof(int)); + + txn->dnparts = (int *)palloc0(dn_nodes_num * sizeof(int)); + if (txn->gid == NULL || txn->owner == NULL || txn->txn_stat == NULL + || txn->xid == NULL || txn->coordparts == NULL || txn->dnparts == NULL || txn->prepare_timestamp == NULL) + { + pfree(txn); + return(NULL); + } + + txn_insert_pos = (txn_info *)hash_search(dbinfo->all_txn_info, + (void *)txn->gid, HASH_ENTER, &found); + if (!found) + memcpy(txn_insert_pos, txn, sizeof(txn_info)); + +#if 0 + if (dbinfo->head_txn_info == NULL) + { + dbinfo->head_txn_info = dbinfo->last_txn_info = txn; + } + else + { + dbinfo->last_txn_info->next = txn; + dbinfo->last_txn_info = txn; + } +#endif + + return txn_insert_pos; +} + +database_info *find_database_info(char *database_name) +{ + database_info *cur_database_info = head_database_info; + + for (;cur_database_info; cur_database_info = cur_database_info->next) + { + if(cur_database_info->database_name && + database_name && + strcmp(cur_database_info->database_name, database_name) == 0) + return(cur_database_info); + } + return(NULL); +} + +database_info *add_database_info(char *database_name) +{ + database_info *rv; + HASHCTL txn_ctl; + char tabname[STRING_BUFF_LEN]; + + if ((rv = find_database_info(database_name)) != NULL) + return rv; /* Already in the list */ + rv = (database_info *)palloc0(sizeof(database_info)); + if (rv == NULL) + return NULL; + rv->next = NULL; + rv->database_name = (char *)palloc0(strlen(database_name) + 1); + strncpy(rv->database_name, database_name, strlen(database_name) + 1); + if (rv->database_name == NULL) + { + pfree(rv); + return NULL; + } +#if 0 + rv->head_txn_info = NULL; + rv->last_txn_info = NULL; +#endif + + snprintf(tabname, STRING_BUFF_LEN, "%s txn info", rv->database_name); + txn_ctl.keysize = MAX_GID; + txn_ctl.entrysize = sizeof(txn_info); + rv->all_txn_info = hash_create(tabname, 64, + &txn_ctl, HASH_ELEM); + if (head_database_info == NULL) + { + head_database_info = last_database_info = rv; + return rv; + } + else + { + last_database_info->next = rv; + last_database_info = rv; + return rv; + } +} + +int find_node_index(Oid node_oid) +{ + int res = -1; + int i; + if (get_pgxc_nodetype(node_oid) == 'C') + { + for (i = 0; i < cn_nodes_num; i++) + { + if (node_oid == cn_node_list[i]) + { + res = i; + break; + } + } + } + else + { + for (i = 0; i < dn_nodes_num; i++) + { + if (node_oid == dn_node_list[i]) + { + res = i+cn_nodes_num; + break; + } + } + } + return res; +} + +Oid find_node_oid(int node_idx) +{ + return (node_idx < cn_nodes_num) ? 
cn_node_list[node_idx] : + dn_node_list[node_idx-cn_nodes_num]; +} + +void getTxnInfoOnOtherNodesAll(void) +{ + database_info *cur_database; + + for (cur_database = head_database_info; cur_database; cur_database = cur_database->next) + { + getTxnInfoOnOtherNodesForDatabase(cur_database); + } +} + +void getTxnInfoOnOtherNodesForDatabase(database_info *database) +{ + txn_info *cur_txn; + HASH_SEQ_STATUS status; + HTAB *txn = database->all_txn_info; + hash_seq_init(&status, txn); + + while ((cur_txn = (txn_info *) hash_seq_search(&status)) != NULL) + { + getTxnInfoOnOtherNodes(cur_txn); + } +#if 0 + for (cur_txn = database->head_txn_info; cur_txn; cur_txn = cur_txn->next) + { + getTxnInfoOnOtherNodes(cur_txn); + } +#endif +} + +void getTxnInfoOnOtherNodes(txn_info *txn) +{ + int ii; + int ret; + char node_type; + TWOPHASE_FILE_STATUS status = TWOPHASE_FILE_NOT_EXISTS; + Oid node_oid; + uint32 transactionid = 0; + char gid[MAX_GID]; + char *ptr = NULL; + + if (IsXidImplicit(txn->gid)) + { + strncpy(gid, txn->gid, strlen(txn->gid)+1); + ptr = strtok(gid, ":"); + ptr = strtok(NULL, ":"); + node_oid = get_pgxc_nodeoid(ptr); + status = GetTransactionPartNodes(txn, node_oid); + } + else + { + for (ii = 0; ii < cn_nodes_num + dn_nodes_num; ii++) + { + if (ii < cn_nodes_num) + { + status = GetTransactionPartNodes(txn, cn_node_list[ii]); + if (TWOPHASE_FILE_EXISTS == status || + TWOPHASE_FILE_OLD == status || + TWOPHASE_FILE_ERROR == status) + { + node_oid = cn_node_list[ii]; + break; + } + } + else + { + status = GetTransactionPartNodes(txn, dn_node_list[ii - cn_nodes_num]); + if (TWOPHASE_FILE_EXISTS == status || + TWOPHASE_FILE_OLD == status || + TWOPHASE_FILE_ERROR == status) + { + node_oid = dn_node_list[ii - cn_nodes_num]; + break; + } + } + } + + /* since there may be explicit readonly twophase transactions */ + if (txn->is_readonly) + { + return; + } + if (TWOPHASE_FILE_EXISTS == status && + InvalidGlobalTimestamp == txn->global_commit_timestamp && + node_oid != txn->origcoord) + { + status = GetTransactionPartNodes(txn, txn->origcoord); + } + + } + + if (TWOPHASE_FILE_EXISTS != status) + { + /* + * if 2pc file not exists in all nodes, the trans did not pass the prepared phase, + * + */ + txn->global_txn_stat = (TWOPHASE_FILE_NOT_EXISTS == status) ? 
+ TXN_STATUS_ABORTED : TXN_STATUS_UNKNOWN; + return; + } + + + /* judge the range of global status */ + CheckFirstPhase(txn); + + for (ii = 0; ii < pgxc_clean_node_count; ii++) + { + if (txn->txn_stat[ii] == TXN_STATUS_INITIAL) + { + /*check node ii is 'C' or 'D'*/ + node_oid = find_node_oid(ii); + if (node_oid == txn->origcoord) + continue; + node_type = get_pgxc_nodetype(node_oid); + if (node_type == 'C' && txn->coordparts[ii] != 1) + continue; + if (node_type == 'D' && txn->dnparts[ii - cn_nodes_num] != 1) + continue; + /*check coordparts or dnparts*/ + if (txn->xid[ii] == 0) + { + ret = Get2PCXidByGid(node_oid, txn->gid, &transactionid); + if (ret == XIDFOUND) + { + txn->xid[ii] = transactionid; + if (txn->xid[ii] > 0) + getTxnStatus(txn, ii); + } + else if (ret == XIDNOTFOUND) + { + if (txn->after_first_phase) + txn->txn_stat[ii] = TXN_STATUS_COMMITTED; + } + else + txn->txn_stat[ii] = TXN_STATUS_UNKNOWN; + + } + } + } +} + +/*get xid by gid on node_oid*/ +int Get2PCXidByGid(Oid node_oid, char *gid, uint32 *transactionid) +{ + int ret = XIDFOUND; + TupleTableSlots result; + uint32 xid = 0; + static const char *STMT_FORM = "select pgxc_get_2pc_xid('%s')::text;"; + char stmt[100]; + snprintf(stmt, 100, STMT_FORM, gid); + /*if exist get xid by gid on node_oid*/ + if (execute_query_on_single_node(node_oid, stmt, 1, &result) != (Datum) 0) + { + if (result.slot_count) + { + if (TTSgetvalue(&result, 0, 0)) + { + xid = strtoul(TTSgetvalue(&result, 0, 0), NULL, 10); + *transactionid = xid; + if (xid == 0) + ret = XIDNOTFOUND; + } + else + ret = XIDNOTFOUND; + } + else + ret = XIDNOTFOUND; + } + else + ret = XIDEXECFAIL; + DropTupleTableSlots(&result); + return ret; +} + +int Get2PCFile(Oid node_oid, char * gid, uint32 * transactionid) +{ + int ret = FILEFOUND; + TupleTableSlots result; + static const char *STMT_FORM = "select pgxc_get_2pc_file('%s')::text;"; + char stmt[100]; + snprintf(stmt, 100, STMT_FORM, gid); + /*if exist get xid by gid on node_oid*/ + if (execute_query_on_single_node(node_oid, stmt, 1, &result) != (Datum) 0) + { + if (result.slot_count) + { + if (!TTSgetvalue(&result, 0, 0)) + { + ret = FILENOTFOUND; + } + else + { + ret = FILEFOUND; + } + } + else + ret = FILENOTFOUND; + } + else + ret = FILEUNKOWN; + DropTupleTableSlots(&result); + return ret; +} + + +void getTxnStatus(txn_info *txn, int node_idx) +{ + Oid node_oid; + char stmt[1024]; + char *att1; + TupleTableSlots result; + + static const char *STMT_FORM = "SELECT pgxc_is_committed('%d'::xid)::text"; + snprintf(stmt, 1024, STMT_FORM, txn->xid[node_idx], txn->xid[node_idx]); + + node_oid = find_node_oid(node_idx); + if (0 != execute_query_on_single_node(node_oid, stmt, 1, &result)) + { + att1 = TTSgetvalue(&result, 0, 0); + + if (att1) + { + if (strcmp(att1, "true") == 0) + { + txn->txn_stat[node_idx] = TXN_STATUS_COMMITTED; + } + else + txn->txn_stat[node_idx] = TXN_STATUS_ABORTED; + } + else + { + txn->txn_stat[node_idx] = TXN_STATUS_INITIAL; + } + } + else + txn->txn_stat[node_idx] = TXN_STATUS_UNKNOWN; + DropTupleTableSlots(&result); +} + +char *get2PCInfo(const char *tid) +{ + char *result = NULL; + char *info = NULL; + int size = 0; + File fd = -1; + int ret = -1; + struct stat filestate; + char path[MAXPGPATH]; + + info = get_2pc_info_from_cache(tid); + if (NULL != info) + { + size = strlen(info); + result = (char *)palloc0(size + 1); + memcpy(result, info, size); + return result; + } + + elog(LOG, "try to get 2pc info from disk, tid: %s", tid); + + snprintf(path, MAXPGPATH, TWOPHASE_RECORD_DIR "/%s", tid); + 
if(access(path, F_OK) == 0) + { + if(stat(path, &filestate) == -1) + { + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not get status of file \"%s\"", path))); + } + + size = filestate.st_size; + + if (0 == size) + { + return NULL; + } + + result = (char *)palloc0(size + 1); + + fd = PathNameOpenFile(path, O_RDONLY, S_IRUSR | S_IWUSR); + if (fd < 0) + { + pfree(result); + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open file \"%s\" for read", path))); + } + + ret = FileRead(fd, result, size, WAIT_EVENT_BUFFILE_READ); + if(ret != size) + { + pfree(result); + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read file \"%s\"", path))); + } + + FileClose(fd); + return result; + } + + return NULL; +} + +Datum pgxc_get_2pc_file(PG_FUNCTION_ARGS); +PG_FUNCTION_INFO_V1(pgxc_get_2pc_file); +Datum pgxc_get_2pc_file(PG_FUNCTION_ARGS) +{ + char *tid = NULL; + char *result = NULL; + text *t_result = NULL; + + tid = text_to_cstring(PG_GETARG_TEXT_P(0)); + result = get2PCInfo(tid); + if (NULL != result) + { + t_result = cstring_to_text(result); + pfree(result); + return PointerGetDatum(t_result); + } + PG_RETURN_NULL(); +} + + +Datum pgxc_get_2pc_nodes(PG_FUNCTION_ARGS); +PG_FUNCTION_INFO_V1(pgxc_get_2pc_nodes); +Datum pgxc_get_2pc_nodes(PG_FUNCTION_ARGS) +{ + char *tid = NULL; + char *result = NULL; + char *nodename = NULL; + text *t_result = NULL; + + tid = text_to_cstring(PG_GETARG_TEXT_P(0)); + result = get2PCInfo(tid); + if (NULL != result) + { + nodename = strstr(result, GET_NODE); + if (NULL != nodename) + { + nodename += strlen(GET_NODE); + nodename = strtok(nodename, "\n"); + t_result = cstring_to_text(nodename); + pfree(result); + return PointerGetDatum(t_result); + } + } + + PG_RETURN_NULL(); +} + +Datum pgxc_get_2pc_startnode(PG_FUNCTION_ARGS); +PG_FUNCTION_INFO_V1(pgxc_get_2pc_startnode); +Datum pgxc_get_2pc_startnode(PG_FUNCTION_ARGS) +{ + char *tid = NULL; + char *result = NULL; + char *nodename = NULL; + text *t_result = NULL; + + tid = text_to_cstring(PG_GETARG_TEXT_P(0)); + result = get2PCInfo(tid); + if (NULL != result) + { + nodename = strstr(result, GET_START_NODE); + if (NULL != nodename) + { + nodename += strlen(GET_START_NODE); + nodename = strtok(nodename, "\n"); + t_result = cstring_to_text(nodename); + pfree(result); + return PointerGetDatum(t_result); + + } + } + PG_RETURN_NULL(); +} + +Datum pgxc_get_2pc_startxid(PG_FUNCTION_ARGS); +PG_FUNCTION_INFO_V1(pgxc_get_2pc_startxid); +Datum pgxc_get_2pc_startxid(PG_FUNCTION_ARGS) +{ + char *tid = NULL; + char *result = NULL; + char *startxid = NULL; + text *t_result = NULL; + + tid = text_to_cstring(PG_GETARG_TEXT_P(0)); + result = get2PCInfo(tid); + if (NULL != result) + { + startxid = strstr(result, GET_START_XID); + if (NULL != startxid) + { + startxid += strlen(GET_START_XID); + startxid = strtok(startxid, "\n"); + t_result = cstring_to_text(startxid); + pfree(result); + return PointerGetDatum(t_result); + } + } + PG_RETURN_NULL(); +} + + +Datum pgxc_get_2pc_commit_timestamp(PG_FUNCTION_ARGS); +PG_FUNCTION_INFO_V1(pgxc_get_2pc_commit_timestamp); +Datum pgxc_get_2pc_commit_timestamp(PG_FUNCTION_ARGS) +{ + char *tid = NULL; + char *result = NULL; + char *commit_timestamp = NULL; + text *t_result = NULL; + + tid = text_to_cstring(PG_GETARG_TEXT_P(0)); + result = get2PCInfo(tid); + if (NULL != result) + { + commit_timestamp = strstr(result, GET_COMMIT_TIMESTAMP); + if (NULL != commit_timestamp) + { + commit_timestamp += strlen(GET_COMMIT_TIMESTAMP); + commit_timestamp = 
strtok(commit_timestamp, "\n"); + t_result = cstring_to_text(commit_timestamp); + pfree(result); + return PointerGetDatum(t_result); + } + } + PG_RETURN_NULL(); +} + + + +Datum pgxc_get_2pc_xid(PG_FUNCTION_ARGS); +PG_FUNCTION_INFO_V1(pgxc_get_2pc_xid); +Datum pgxc_get_2pc_xid(PG_FUNCTION_ARGS) +{ + char *tid = NULL; + char *result = NULL; + char *str_xid = NULL; + GlobalTransactionId xid; + + tid = text_to_cstring(PG_GETARG_TEXT_P(0)); + result = get2PCInfo(tid); + if (NULL != result) + { + str_xid = strstr(result, GET_XID); + if (NULL != str_xid) + { + str_xid += strlen(GET_XID); + str_xid = strtok(str_xid, "\n"); + xid = strtoul(str_xid, NULL, 10); + pfree(result); + PG_RETURN_UINT32(xid); + } + } + PG_RETURN_NULL(); +} + +Datum pgxc_remove_2pc_records(PG_FUNCTION_ARGS); +PG_FUNCTION_INFO_V1(pgxc_remove_2pc_records); +Datum pgxc_remove_2pc_records(PG_FUNCTION_ARGS) +{ + char *tid = text_to_cstring(PG_GETARG_TEXT_P(0)); + remove_2pc_records(tid, true); + pfree(tid); + PG_RETURN_BOOL(true); +} + +Datum pgxc_clear_2pc_records(PG_FUNCTION_ARGS); +PG_FUNCTION_INFO_V1(pgxc_clear_2pc_records); +Datum pgxc_clear_2pc_records(PG_FUNCTION_ARGS) +{ + MemoryContext oldcontext; + MemoryContext mycontext; + + int i = 0; + int count = 0; + TupleTableSlots *result; + TupleTableSlots clear_result; + const char *query = "select pgxc_get_record_list()::text"; + const char *CLEAR_STMT = "select pgxc_remove_2pc_records('%s')::text"; + char clear_query[100]; + char *twopcfiles = NULL; + char *ptr = NULL; + bool res = true; + + if(!IS_PGXC_COORDINATOR) + { + elog(ERROR, "can only called on coordinator"); + } + + mycontext = AllocSetContextCreate(CurrentMemoryContext, + "clean_check", + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, + ALLOCSET_DEFAULT_MAXSIZE); + oldcontext = MemoryContextSwitchTo(mycontext); + + ResetGlobalVariables(); +#if 0 + if((dir = opendir(TWOPHASE_RECORD_DIR))) + { + while((ptr = readdir(dir)) != NULL) + { + if (count > 999) + break; + if(strcmp(ptr->d_name,".") == 0 || strcmp(ptr->d_name,"..") == 0) + { + continue; + } + snprintf(path[count], MAX_GID, "/%s", ptr->d_name); + //snprintf(path[count], MAX_GID, "/%s", ptr->d_name); + count++; + } + + closedir(dir); + } +#endif + + /*get node list*/ + PgxcNodeGetOids(&cn_node_list, &dn_node_list, + &cn_nodes_num, &dn_nodes_num, true); + pgxc_clean_node_count = cn_nodes_num + dn_nodes_num; + my_nodeoid = getMyNodeoid(); + cn_health_map = palloc0(cn_nodes_num * sizeof(bool)); + dn_health_map = palloc0(dn_nodes_num * sizeof(bool)); + result = (TupleTableSlots *)palloc0(pgxc_clean_node_count * sizeof(TupleTableSlots)); + + /*collect the 2pc file in nodes*/ + for (i = 0; i < cn_nodes_num; i++) + { + (void) execute_query_on_single_node(cn_node_list[i], query, 1, result+i); + } + + for (i = 0; i < dn_nodes_num; i++) + { + (void) execute_query_on_single_node(dn_node_list[i], query, 1, result+cn_nodes_num+i); + } + /*get all database info*/ + getDatabaseList(); + + /*get all info of 2PC transactions*/ + getTxnInfoOnNodesAll(); +#if 0 + if((dir = opendir(TWOPHASE_RECORD_DIR))) + { + while (i < count) + { + if (!find_txn(path[i])) + { + unlink(path[i]); + WriteClean2pcXlogRec(path[i]); + } + i++; + } + + closedir(dir); + } +#endif + /*delete all rest 2pc file in each nodes*/ + for (i = 0; i < cn_nodes_num; i++) + { + if (0 == result[i].slot_count) + { + continue; + } + if (!(twopcfiles = TTSgetvalue(result+i, 0, 0))) + continue; + ptr = strtok(twopcfiles, ","); + while(ptr) + { + if (count >= MAXIMUM_CLEAR_FILE) + break; + if (!find_txn(ptr)) + 
{ + snprintf(clear_query, 100, CLEAR_STMT, ptr); + if (execute_query_on_single_node(cn_node_list[i], clear_query, 1, &clear_result) == (Datum)0) + res = false; + DropTupleTableSlots(&clear_result); + count++; + } + ptr = strtok(NULL, ","); + } + } + + for (i = 0; i < dn_nodes_num; i++) + { + if (0 == result[cn_nodes_num+i].slot_count) + { + continue; + } + if (!(twopcfiles = TTSgetvalue(result+cn_nodes_num+i, 0, 0))) + continue; + ptr = strtok(twopcfiles, ","); + while(ptr) + { + if (count >= MAXIMUM_CLEAR_FILE) + break; + if (!find_txn(ptr)) + { + snprintf(clear_query, 100, CLEAR_STMT, ptr); + if (execute_query_on_single_node(dn_node_list[i], clear_query, 1, &clear_result) == (Datum)0) + res = false; + DropTupleTableSlots(&clear_result); + count++; + } + ptr = strtok(NULL, ","); + } + } + + for (i = 0; i < pgxc_clean_node_count; i++) + DropTupleTableSlots(result+i); + + DestroyTxnHash(); + ResetGlobalVariables(); + + MemoryContextSwitchTo(oldcontext); + MemoryContextDelete(mycontext); + + + PG_RETURN_BOOL(res); +} + +Datum pgxc_get_record_list(PG_FUNCTION_ARGS); +PG_FUNCTION_INFO_V1(pgxc_get_record_list); +Datum pgxc_get_record_list(PG_FUNCTION_ARGS) +{ + int count = 0; + DIR *dir = NULL; + struct dirent *ptr = NULL; + char *recordList = NULL; + text *t_recordList = NULL; + + /* get from hash table */ + recordList = get_2pc_list_from_cache(&count); + if (count >= MAXIMUM_OUTPUT_FILE) + { + Assert(NULL != recordList); + t_recordList = cstring_to_text(recordList); + return PointerGetDatum(t_recordList); + } + + /* get from disk */ + if(!(dir = opendir(TWOPHASE_RECORD_DIR))) + { + if(NULL == recordList) + { + PG_RETURN_NULL(); + } + + t_recordList = cstring_to_text(recordList); + return PointerGetDatum(t_recordList); + } + + while((ptr = readdir(dir)) != NULL) + { + if(strcmp(ptr->d_name,".") == 0 || strcmp(ptr->d_name,"..") == 0) + { + continue; + } + if (count >= MAXIMUM_OUTPUT_FILE) + { + break; + } + + if(!recordList) + { + recordList = (char *)palloc0(strlen(ptr->d_name) + 1); + sprintf(recordList, "%s", ptr->d_name); + } + else + { + recordList = (char *) repalloc(recordList, + strlen(ptr->d_name) + strlen(recordList) + 2); + sprintf(recordList, "%s,%s", recordList, ptr->d_name); + } + count++; + } + + closedir(dir); + + if(!recordList) + { + PG_RETURN_NULL(); + } + else + { + t_recordList = cstring_to_text(recordList); + return PointerGetDatum(t_recordList); + } +} + +Datum pgxc_commit_on_node(PG_FUNCTION_ARGS); +PG_FUNCTION_INFO_V1(pgxc_commit_on_node); +Datum pgxc_commit_on_node(PG_FUNCTION_ARGS) +{ + /* nodename, gid */ + char *nodename; + Oid nodeoid; + char *gid; + txn_info *txn; + char command[MAX_CMD_LENGTH]; + PGXCNodeHandle **connections = NULL; + int conn_count = 0; + ResponseCombiner combiner; + PGXCNodeAllHandles *pgxc_handles = NULL; + PGXCNodeHandle *conn = NULL; + + /*clear Global*/ + ResetGlobalVariables(); + /*get node list*/ + PgxcNodeGetOids(&cn_node_list, &dn_node_list, + &cn_nodes_num, &dn_nodes_num, true); + if (cn_node_list == NULL || dn_node_list == NULL) + elog(ERROR, "pg_clean:fail to get cn_node_list and dn_node_list"); + pgxc_clean_node_count = cn_nodes_num + dn_nodes_num; + my_nodeoid = getMyNodeoid(); + cn_health_map = palloc0(cn_nodes_num * sizeof(bool)); + dn_health_map = palloc0(dn_nodes_num * sizeof(bool)); + + nodename = text_to_cstring(PG_GETARG_TEXT_P(0)); + gid = text_to_cstring(PG_GETARG_TEXT_P(1)); + nodeoid = get_pgxc_nodeoid(nodename); + if (InvalidOid == nodeoid) + { + elog(ERROR, "Invalid nodename '%s'", nodename); + } + + txn = (txn_info 
*)palloc0(sizeof(txn_info)); + if (txn == NULL) + { + PG_RETURN_BOOL(false); + } + txn->txn_stat = (TXN_STATUS *)palloc0(sizeof(TXN_STATUS) * pgxc_clean_node_count); + txn->xid = (uint32 *)palloc0(sizeof(uint32) * pgxc_clean_node_count); + txn->prepare_timestamp = (TimestampTz *)palloc0(sizeof(TimestampTz) * pgxc_clean_node_count); + txn->coordparts = (int *)palloc0(cn_nodes_num * sizeof(int)); + txn->dnparts = (int *)palloc0(dn_nodes_num * sizeof(int)); + + strncpy(txn->gid, gid, strlen(gid)+1); + getTxnInfoOnOtherNodes(txn); + snprintf(command, MAX_CMD_LENGTH, "commit prepared '%s'", txn->gid); + + + if (InvalidGlobalTimestamp == txn->global_commit_timestamp) + { + if (!txn->is_readonly) + { + elog(ERROR, "in pg_clean, fail to get global_commit_timestamp for transaction '%s' on", gid); + } + else + { + txn->global_commit_timestamp = GetGlobalTimestampGTM(); + } + } + + connections = (PGXCNodeHandle**)palloc(sizeof(PGXCNodeHandle*)); + get_node_handles(&pgxc_handles, nodeoid); + + conn = (PGXC_NODE_COORDINATOR == get_pgxc_nodetype(nodeoid)) ? + pgxc_handles->coord_handles[0] : pgxc_handles->datanode_handles[0]; + if (!send_query_clean_transaction(conn, txn, command)) + { + elog(ERROR, "pg_clean: send query '%s' from '%s' to '%s' failed ", + command, get_pgxc_nodename(my_nodeoid) , nodename); + } + else + { + connections[conn_count++] = conn; + } + /* receive response */ + if (conn_count) + { + InitResponseCombiner(&combiner, conn_count, COMBINE_TYPE_NONE); + if (pgxc_node_receive_responses(conn_count, connections, NULL, &combiner) || + !validate_combiner(&combiner)) + { + if (combiner.errorMessage) + pgxc_node_report_error(&combiner); + else + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to FINISH the transaction on one or more nodes"))); + } + else + CloseCombiner(&combiner); + } + /*clear Global*/ + ResetGlobalVariables(); + clear_handles(); + pfree_pgxc_all_handles(pgxc_handles); + pgxc_handles = NULL; + pfree(connections); + connections = NULL; + + PG_RETURN_BOOL(true); +} + +Datum pgxc_abort_on_node(PG_FUNCTION_ARGS); +PG_FUNCTION_INFO_V1(pgxc_abort_on_node); +Datum pgxc_abort_on_node(PG_FUNCTION_ARGS) +{ + /* nodename, gid */ + char *nodename; + Oid nodeoid; + char *gid; + txn_info *txn; + char command[MAX_CMD_LENGTH]; + PGXCNodeHandle **connections = NULL; + int conn_count = 0; + ResponseCombiner combiner; + PGXCNodeAllHandles *pgxc_handles = NULL; + PGXCNodeHandle *conn = NULL; + + /*clear Global*/ + ResetGlobalVariables(); + /*get node list*/ + PgxcNodeGetOids(&cn_node_list, &dn_node_list, + &cn_nodes_num, &dn_nodes_num, true); + if (cn_node_list == NULL || dn_node_list == NULL) + elog(ERROR, "pg_clean:fail to get cn_node_list and dn_node_list"); + pgxc_clean_node_count = cn_nodes_num + dn_nodes_num; + my_nodeoid = getMyNodeoid(); + cn_health_map = palloc0(cn_nodes_num * sizeof(bool)); + dn_health_map = palloc0(dn_nodes_num * sizeof(bool)); + + nodename = text_to_cstring(PG_GETARG_TEXT_P(0)); + gid = text_to_cstring(PG_GETARG_TEXT_P(1)); + nodeoid = get_pgxc_nodeoid(nodename); + if (InvalidOid == nodeoid) + { + elog(ERROR, "Invalid nodename '%s'", nodename); + } + + txn = (txn_info *)palloc0(sizeof(txn_info)); + if (txn == NULL) + { + PG_RETURN_BOOL(false); + } + txn->txn_stat = (TXN_STATUS *)palloc0(sizeof(TXN_STATUS) * pgxc_clean_node_count); + txn->xid = (uint32 *)palloc0(sizeof(uint32) * pgxc_clean_node_count); + txn->prepare_timestamp = (TimestampTz *)palloc0(sizeof(TimestampTz) * pgxc_clean_node_count); + txn->coordparts = (int *)palloc0(cn_nodes_num 
* sizeof(int)); + txn->dnparts = (int *)palloc0(dn_nodes_num * sizeof(int)); + + strncpy(txn->gid, gid, strlen(gid)+1); + connections = (PGXCNodeHandle**)palloc(sizeof(PGXCNodeHandle*)); + getTxnInfoOnOtherNodes(txn); + snprintf(command, MAX_CMD_LENGTH, "rollback prepared '%s'", txn->gid); +#if 0 + if (!setMaintenanceMode(true)) + { + elog(ERROR, "Error: fail to set maintenance mode on in pg_clean"); + } +#endif + + get_node_handles(&pgxc_handles, nodeoid); + + conn = (PGXC_NODE_COORDINATOR == get_pgxc_nodetype(nodeoid)) ? + pgxc_handles->coord_handles[0] : pgxc_handles->datanode_handles[0]; + if (!send_query_clean_transaction(conn, txn, command)) + { + elog(ERROR, "pg_clean: send query '%s' from '%s' to '%s' failed ", + command, get_pgxc_nodename(my_nodeoid) , nodename); + } + else + { + connections[conn_count++] = conn; + } + /* receive response */ + if (conn_count) + { + InitResponseCombiner(&combiner, conn_count, COMBINE_TYPE_NONE); + if (pgxc_node_receive_responses(conn_count, connections, NULL, &combiner) || + !validate_combiner(&combiner)) + { + if (combiner.errorMessage) + pgxc_node_report_error(&combiner); + else + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to FINISH the transaction on one or more nodes"))); + } + else + CloseCombiner(&combiner); + } + /*clear Global*/ + ResetGlobalVariables(); + clear_handles(); + pfree_pgxc_all_handles(pgxc_handles); + pgxc_handles = NULL; + pfree(connections); + connections = NULL; + + PG_RETURN_BOOL(true); +} + + + +void recover2PCForDatabaseAll(void) +{ + database_info *cur_db = head_database_info; + while (cur_db) + { + recover2PCForDatabase(cur_db); + cur_db = cur_db->next; + } + //clean_old_2PC_files(); +} + +void recover2PCForDatabase(database_info * db_info) +{ + txn_info *cur_txn; + HASH_SEQ_STATUS status; + HTAB *txn = db_info->all_txn_info; + + hash_seq_init(&status, txn); + while ((cur_txn = (txn_info *) hash_seq_search(&status)) != NULL) + { + recover2PC(cur_txn); + } +} + +bool send_query_clean_transaction(PGXCNodeHandle* conn, txn_info *txn, const char *finish_cmd) +{ +#ifdef __TWO_PHASE_TESTS__ + if (PG_CLEAN_SEND_CLEAN <= twophase_exception_case && + PG_CLEAN_SEND_QUERY >= twophase_exception_case) + { + twophase_in = IN_PG_CLEAN; + } +#endif + if (!GlobalTimestampIsValid(txn->global_commit_timestamp) && + TXN_STATUS_COMMITTED == txn->global_txn_stat && + !txn->is_readonly) + return false; + + if (pgxc_node_send_clean(conn)) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("in pg_clean failed to send pg_clean flag for %s PREPARED command", + TXN_STATUS_COMMITTED == txn->global_txn_stat ? "COMMIT" : "ROLLBACK"))); + return false; + } + if (txn->is_readonly && pgxc_node_send_readonly(conn)) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("in pg_clean failed to send readonly flag for %s PREPARED command", + TXN_STATUS_COMMITTED == txn->global_txn_stat ? "COMMIT" : "ROLLBACK"))); + return false; + } + + if (txn->after_first_phase && pgxc_node_send_after_prepare(conn)) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("in pg_clean failed to send after prepare flag for %s PREPARED command", + TXN_STATUS_COMMITTED == txn->global_txn_stat ? 
"COMMIT" : "ROLLBACK"))); + return false; + } + + /* + * only transaction finished in commit prepared/rollback prepared phase send timestamp + * partial prepared transaction has no need to send other information + */ + if (InvalidGlobalTimestamp != txn->global_commit_timestamp && + pgxc_node_send_global_timestamp(conn, txn->global_commit_timestamp)) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("in pg_clean failed to send global committs for %s PREPARED command", + TXN_STATUS_COMMITTED == txn->global_txn_stat ? "COMMIT" : "ROLLBACK"))); + } + if (!txn->is_readonly) + { + if (InvalidOid != txn->origcoord && pgxc_node_send_starter(conn, get_pgxc_nodename(txn->origcoord))) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("in pg_clean failed to send start node for %s PREPARED command", + TXN_STATUS_COMMITTED == txn->global_txn_stat ? "COMMIT" : "ROLLBACK"))); + } + + if (InvalidTransactionId != txn->startxid && pgxc_node_send_startxid(conn, txn->startxid)) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("in pg_clean failed to send start xid for %s PREPARED command", + TXN_STATUS_COMMITTED == txn->global_txn_stat ? "COMMIT" : "ROLLBACK"))); + } + + if (NULL != txn->participants && pgxc_node_send_partnodes(conn, txn->participants)) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("in pg_clean failed to send participants for %s PREPARED command", + TXN_STATUS_COMMITTED == txn->global_txn_stat ? "COMMIT" : "ROLLBACK"))); + } + } + + if (pgxc_node_send_query(conn, finish_cmd)) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("in pg_clean failed to send query for %s PREPARED command", + TXN_STATUS_COMMITTED == txn->global_txn_stat ? "COMMIT" : "ROLLBACK"))); + return false; + } + return true; +} + +bool check_2pc_belong_node(txn_info * txn) +{ + int node_index = 0; + char node_type; + node_index = find_node_index(abnormal_nodeoid); + if (abnormal_nodeoid == txn->origcoord) + { + txn->belong_abnormal_node = true; + return true; + } + node_type = get_pgxc_nodetype(abnormal_nodeoid); + if (node_type == 'C' && txn->coordparts[node_index] == 1) + { + txn->belong_abnormal_node = true; + return true; + } + if (node_type == 'D' && txn->dnparts[node_index - cn_nodes_num] == 1) + { + txn->belong_abnormal_node = true; + return true; + } + txn->belong_abnormal_node = false; + return false; +} + +bool check_node_participate(txn_info * txn, int node_idx) +{ + char node_type = get_pgxc_nodetype(abnormal_nodeoid); + if (PGXC_NODE_COORDINATOR == node_type) + { + return txn->coordparts[node_idx] == 1 ? true : false; + } else if (PGXC_NODE_DATANODE == node_type) + { + return txn->dnparts[node_idx] == 1 ? 
true : false; + } + return false; +} + +void recover2PC(txn_info * txn) +{ + TXN_STATUS txn_stat; + txn_stat = check_txn_global_status(txn); + txn->global_txn_stat = txn_stat; + +#ifdef DEBUG_EXECABORT + txn_stat = TXN_STATUS_ABORTED; +#endif + + switch (txn_stat) + { + case TXN_STATUS_FAILED: + elog(LOG, "cannot recover 2PC transaction %s for TXN_STATUS_FAILED", txn->gid); + txn->op = UNDO; + txn->op_issuccess = true; + break; + + case TXN_STATUS_UNKNOWN: + elog(LOG, "cannot recover 2PC transaction %s for TXN_STATUS_UNKNOWN", txn->gid); + txn->op = UNDO; + txn->op_issuccess = true; + break; + + case TXN_STATUS_PREPARED: + elog(DEBUG1, "2PC recovery of transaction %s not needed for TXN_STATUS_PREPARED", txn->gid); + txn->op = UNDO; + txn->op_issuccess = true; + break; + + case TXN_STATUS_COMMITTED: + if (InvalidOid == txn->origcoord || txn->is_readonly) + { + txn->op = UNDO; + txn->op_issuccess = true; + } + else + { + txn->op = COMMIT; + /* check whether all nodes can commit prepared */ + if (!clean_2PC_iscommit(txn, true, true)) + { + txn->op_issuccess = false; + elog(LOG, "check commit 2PC transaction %s failed", txn->gid); + return; + } + /* send commit prepared to all nodes */ + if (!clean_2PC_iscommit(txn, true, false)) + { + txn->op_issuccess = false; + elog(LOG, "commit 2PC transaction %s failed", txn->gid); + return; + } + txn->op_issuccess = true; + clean_2PC_files(txn); + } + break; + + case TXN_STATUS_ABORTED: + txn->op = ABORT; + /* check whether all nodes can rollback prepared */ + if (!clean_2PC_iscommit(txn, false, true)) + { + txn->op_issuccess = false; + elog(LOG, "check rollback 2PC transaction %s failed", txn->gid); + return; + } + /* send rollback prepared to all nodes */ + if (!clean_2PC_iscommit(txn, false, false)) + { + txn->op_issuccess = false; + elog(LOG, "rollback 2PC transaction %s failed", txn->gid); + return; + } + txn->op_issuccess = true; + clean_2PC_files(txn); + break; + + case TXN_STATUS_INPROGRESS: + elog(DEBUG1, "2PC recovery of transaction %s not needed for TXN_STATUS_INPROGRESS", txn->gid); + txn->op = UNDO; + txn->op_issuccess = true; + break; + + default: + elog(ERROR, "cannot recover 2PC transaction %s for unkown status", txn->gid); + break; + } + return; +} + +TXN_STATUS check_txn_global_status(txn_info *txn) +{ +#define TXN_PREPARED 0x0001 +#define TXN_COMMITTED 0x0002 +#define TXN_ABORTED 0x0004 +#define TXN_UNKNOWN 0x0008 +#define TXN_INITIAL 0x0010 +#define TXN_INPROGRESS 0X0020 + int ii; + int check_flag = 0; + int node_idx = 0; + TimestampTz prepared_time = 0; + TimestampTz time_gap = clean_time_interval; + + if (!IsXidImplicit(txn->gid) && txn->is_readonly) + { + return TXN_STATUS_COMMITTED; + } + if (txn->global_txn_stat == TXN_STATUS_UNKNOWN) + { + check_flag |= TXN_UNKNOWN; + } + if (txn->global_txn_stat == TXN_STATUS_ABORTED) + { + check_flag |= TXN_ABORTED; + } + + /*check dn participates*/ + for (ii = 0; ii < dn_nodes_num; ii++) + { + if (txn->dnparts[ii] == 1) + { + if (txn->txn_stat[ii + cn_nodes_num] == TXN_STATUS_INITIAL) + check_flag |= TXN_INITIAL; + else if (txn->txn_stat[ii + cn_nodes_num] == TXN_STATUS_UNKNOWN) + check_flag |= TXN_UNKNOWN; + else if (txn->txn_stat[ii + cn_nodes_num] == TXN_STATUS_PREPARED) + { + check_flag |= TXN_PREPARED; + prepared_time = txn->prepare_timestamp[ii + cn_nodes_num] > prepared_time ? 
+ txn->prepare_timestamp[ii + cn_nodes_num] : prepared_time; + } + else if (txn->txn_stat[ii + cn_nodes_num] == TXN_STATUS_INPROGRESS) + check_flag |= TXN_INPROGRESS; + else if (txn->txn_stat[ii + cn_nodes_num] == TXN_STATUS_COMMITTED) + check_flag |= TXN_COMMITTED; + else if (txn->txn_stat[ii + cn_nodes_num] == TXN_STATUS_ABORTED) + check_flag |= TXN_ABORTED; + else + return TXN_STATUS_FAILED; + } + } + /*check cn participates*/ + for (ii = 0; ii < cn_nodes_num; ii++) + { + if (txn->coordparts[ii] == 1) + { + if (txn->txn_stat[ii] == TXN_STATUS_INITIAL) + check_flag |= TXN_ABORTED; + else if (txn->txn_stat[ii] == TXN_STATUS_UNKNOWN) + check_flag |= TXN_UNKNOWN; + else if (txn->txn_stat[ii] == TXN_STATUS_PREPARED) + { + check_flag |= TXN_PREPARED; + prepared_time = txn->prepare_timestamp[ii] > prepared_time ? + txn->prepare_timestamp[ii] : prepared_time; + } + else if (txn->txn_stat[ii] == TXN_STATUS_INPROGRESS) + check_flag |= TXN_INPROGRESS; + else if (txn->txn_stat[ii] == TXN_STATUS_COMMITTED) + check_flag |= TXN_COMMITTED; + else if (txn->txn_stat[ii] == TXN_STATUS_ABORTED) + check_flag |= TXN_ABORTED; + else + return TXN_STATUS_FAILED; + } + } + + /* + * first check the prepare timestamp of both implicit and explicit trans within the time_gap or not + * if not, check the commit timestamp explicit trans within the time_gap or not + */ +#if 0 + if ((check_flag & TXN_INPROGRESS) || + (IsXidImplicit(txn->gid) && current_time - prepared_time <= time_gap) || + (!IsXidImplicit(txn->gid) && + ((!txn->after_first_phase && current_time - prepared_time <= time_gap) || + (txn->after_first_phase && + (InvalidGlobalTimestamp != commit_time && + current_time - commit_time <= time_gap))))) + { + /* transaction inprogress */ + return TXN_STATUS_INPROGRESS; + } +#endif + if (clear_2pc_belong_node) + { + node_idx = find_node_index(abnormal_nodeoid); + if (!check_2pc_belong_node(txn) || + !check_node_participate(txn, node_idx) || + abnormal_time < txn->prepare_timestamp[node_idx]) + { + return TXN_STATUS_INPROGRESS; + } + } + else + { + if (check_flag & TXN_INPROGRESS ||current_time - prepared_time <= time_gap) + { + /* transaction inprogress */ + return TXN_STATUS_INPROGRESS; + } + } + + + if (!IsXidImplicit(txn->gid) && txn->after_first_phase && (TXN_PREPARED == check_flag)) + { + return TXN_STATUS_PREPARED; + } + + if (check_flag & TXN_UNKNOWN) + return TXN_STATUS_UNKNOWN; + + if ((check_flag & TXN_COMMITTED) && (check_flag & TXN_ABORTED)) + /* Mix of committed and aborted. This should not happen. */ + return TXN_STATUS_UNKNOWN; + + if ((check_flag & TXN_PREPARED) == 0) + /* Should be at least one "prepared statement" in nodes */ + return TXN_STATUS_FAILED; + + if (check_flag & TXN_COMMITTED) + /* Some 2PC transactions are committed. Need to commit others. */ + return TXN_STATUS_COMMITTED; + /* All the transactions remain prepared. No need to recover. 
*/ + return TXN_STATUS_ABORTED; +} + +bool clean_2PC_iscommit(txn_info *txn, bool is_commit, bool is_check) +{ + int ii; + static const char *STMT_FORM = "%s prepared '%s';"; + static const char *STMT_FORM_CHECK = "%s prepared '%s' for check only;"; + char command[MAX_CMD_LENGTH]; + int node_idx; + Oid node_oid; + PGXCNodeHandle **connections = NULL; + int conn_count = 0; + ResponseCombiner combiner; + PGXCNodeAllHandles *pgxc_handles = NULL; + + if (is_commit) + { + if (is_check) + { + snprintf(command, MAX_CMD_LENGTH, STMT_FORM_CHECK, "commit", txn->gid); + } + else + { + snprintf(command, MAX_CMD_LENGTH, STMT_FORM, "commit", txn->gid); + } + } + else + { + if (is_check) + { + snprintf(command, MAX_CMD_LENGTH, STMT_FORM_CHECK, "rollback", txn->gid); + } + else + { + snprintf(command, MAX_CMD_LENGTH, STMT_FORM, "rollback", txn->gid); + } + } + if (is_commit && InvalidGlobalTimestamp == txn->global_commit_timestamp) + { + elog(ERROR, "twophase transaction '%s' has InvalidGlobalCommitTimestamp", txn->gid); + } + + connections = (PGXCNodeHandle**)palloc(sizeof(PGXCNodeHandle*) * (txn->num_dnparts + txn->num_coordparts)); + if (connections == NULL) + { + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory for connections"))); + } + get_transaction_handles(&pgxc_handles, txn); + //pgxc_handles = get_handles(nodelist, coordlist, false, true); +#ifdef __TWO_PHASE_TESTS__ + if (PG_CLEAN_SEND_CLEAN <= twophase_exception_case && + PG_CLEAN_ELOG_ERROR >= twophase_exception_case) + { + exception_count = 0; + } +#endif + for (ii = 0; ii < pgxc_handles->dn_conn_count; ii++) + { + node_oid = pgxc_handles->datanode_handles[ii]->nodeoid; + node_idx = find_node_index(node_oid); + if (TXN_STATUS_PREPARED != txn->txn_stat[ node_idx]) + { + continue; + } + /*send global timestamp to dn_node_list[ii]*/ + if (!send_query_clean_transaction(pgxc_handles->datanode_handles[ii], txn, command)) + { + elog(LOG, "pg_clean: send query '%s' from '%s' to '%s' failed ", + command, get_pgxc_nodename(my_nodeoid) , pgxc_handles->datanode_handles[ii]->nodename); + return false; + } + else + { + connections[conn_count++] = pgxc_handles->datanode_handles[ii]; +#ifdef __TWO_PHASE_TESTS__ + if (PG_CLEAN_SEND_CLEAN <= twophase_exception_case && + PG_CLEAN_ELOG_ERROR >= twophase_exception_case) + { + exception_count++; + if (1 == exception_count && + PG_CLEAN_ELOG_ERROR == twophase_exception_case) + { + elog(ERROR, "PG_CLEAN_ELOG_ERROR complish"); + } + } +#endif + } + } + + for (ii = 0; ii < pgxc_handles->co_conn_count; ii++) + { + node_oid = pgxc_handles->coord_handles[ii]->nodeoid; + node_idx = find_node_index(node_oid); + if (TXN_STATUS_PREPARED != txn->txn_stat[ node_idx]) + { + continue; + } + /*send global timestamp to dn_node_list[ii]*/ + if (!send_query_clean_transaction(pgxc_handles->coord_handles[ii], txn, command)) + { + elog(LOG, "pg_clean: send query '%s' from '%s' to '%s' failed ", + command, get_pgxc_nodename(my_nodeoid) , pgxc_handles->coord_handles[ii]->nodename); + return false; + } + else + { + connections[conn_count++] = pgxc_handles->coord_handles[ii]; +#ifdef __TWO_PHASE_TESTS__ + if (PG_CLEAN_SEND_CLEAN <= twophase_exception_case && + PG_CLEAN_ELOG_ERROR >= twophase_exception_case) + { + exception_count++; + if (1 == exception_count && + PG_CLEAN_ELOG_ERROR == twophase_exception_case) + { + elog(ERROR, "PG_CLEAN_ELOG_ERROR complish"); + } + } +#endif + } + + } + + /* receive response */ + if (conn_count) + { + InitResponseCombiner(&combiner, conn_count, COMBINE_TYPE_NONE); + if 
(pgxc_node_receive_responses(conn_count, connections, NULL, &combiner) || + !validate_combiner(&combiner)) + { + if (combiner.errorMessage) + pgxc_node_report_error(&combiner); + else + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to FINISH the transaction on one or more nodes"))); + } + else + CloseCombiner(&combiner); + } + if (enable_distri_print) + { + for (ii = 0; ii < conn_count; ii++) + { + if (DN_CONNECTION_STATE_IDLE != connections[ii]->state) + { + elog(WARNING, "IN pg_clean node:%s invalid stauts:%d", connections[ii]->nodename, connections[ii]->state); + } + } + } + conn_count = 0; + clear_handles(); + pfree_pgxc_all_handles(pgxc_handles); + pgxc_handles = NULL; + + /*last commit or rollback on origcoord if it participate this txn, since after commit the 2pc file is deleted on origcoord*/ + if (txn->origcoord != InvalidOid) + { + node_idx = find_node_index(txn->origcoord); + if (txn->coordparts[node_idx] == 1) + { + /*send global timestamp to dn_node_list[ii]*/ + + if (txn->txn_stat[node_idx] == TXN_STATUS_PREPARED) + { + get_node_handles(&pgxc_handles, txn->origcoord); + if (!send_query_clean_transaction(pgxc_handles->coord_handles[0], txn, command)) + { + elog(LOG, "pg_clean: send query '%s' from %s to %s failed ", + command, get_pgxc_nodename(my_nodeoid) , pgxc_handles->coord_handles[0]->nodename); + return false; + } + else + { + connections[conn_count++] = pgxc_handles->coord_handles[0]; + } + } + } + } + + /* receive response */ + if (conn_count) + { + InitResponseCombiner(&combiner, conn_count, COMBINE_TYPE_NONE); + if (pgxc_node_receive_responses(conn_count, connections, NULL, &combiner) || + !validate_combiner(&combiner)) + { + if (combiner.errorMessage) + pgxc_node_report_error(&combiner); + else + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to FINISH the transaction on one or more nodes"))); + } + else + CloseCombiner(&combiner); + } + /*free hash record from gtm*/ + FinishGIDGTM(txn->gid); + + clear_handles(); + pfree_pgxc_all_handles(pgxc_handles); + pgxc_handles = NULL; + pfree(connections); + connections = NULL; + return true; +} + +bool clean_2PC_files(txn_info * txn) +{ + int ii; + TupleTableSlots result; + bool issuccess = true; + static const char *STMT_FORM = "select pgxc_remove_2pc_records('%s')::text"; + char query[MAX_CMD_LENGTH]; + + snprintf(query, MAX_CMD_LENGTH, STMT_FORM, txn->gid); + + for (ii = 0; ii < dn_nodes_num; ii++) + { + if (execute_query_on_single_node(dn_node_list[ii], query, 1, &result) == (Datum) 1) + { + if (TTSgetvalue(&result, 0, 0) == false) + { + elog(LOG, "pg_clean: delete 2PC file failed of transaction %s on node %s", + txn->gid, get_pgxc_nodename(txn->dnparts[ii])); + issuccess = false; + } + } + else + { + elog(LOG, "pg_clean: failed clean 2pc file of transaction %s on node %s", txn->gid, get_pgxc_nodename(dn_node_list[ii])); + issuccess = false; + } + DropTupleTableSlots(&result); + if (!issuccess) + return false; + } + + for (ii = 0; ii < cn_nodes_num; ii++) + { + if (execute_query_on_single_node(cn_node_list[ii], query, 1, &result) == (Datum) 1) + { + if (TTSgetvalue(&result, 0, 0) == false) + { + elog(LOG, "Error:delete 2PC file failed of transaction %s on node %s", + txn->gid, get_pgxc_nodename(txn->coordparts[ii])); + issuccess = false; + } + } + else + { + elog(LOG, "pg_clean: failed clean 2pc file of transaction %s on node %s", txn->gid, get_pgxc_nodename(cn_node_list[ii])); + issuccess = false; + } + DropTupleTableSlots(&result); + if (!issuccess) + return false; + } + 
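/*
 * Reaching this point means "select pgxc_remove_2pc_records('<gid>')::text"
 * completed successfully on every datanode and coordinator; any failure above
 * has already been logged for the offending node and returned false early.
 */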
return true; +} + +void Init_print_txn_info(print_txn_info * print_txn) +{ + database_info *cur_database = head_database_info; + txn_info *cur_txn; + HASH_SEQ_STATUS status; + HTAB *txn; + + print_txn->index = 0; + INIT(print_txn->txn); + + for (; cur_database; cur_database = cur_database->next) + { + txn = cur_database->all_txn_info; + hash_seq_init(&status, txn); + while ((cur_txn = (txn_info *) hash_seq_search(&status)) != NULL) + { + if (clear_2pc_belong_node && !cur_txn->belong_abnormal_node) + { + continue; + } + if (cur_txn->global_txn_stat != TXN_STATUS_INPROGRESS) + PALLOC(print_txn->txn, cur_txn); + } + +#if 0 + cur_txn = cur_database->head_txn_info; + for (; cur_txn; cur_txn = cur_txn->next) + { + if (cur_txn->global_txn_stat != TXN_STATUS_INPROGRESS) + PALLOC(print_txn->txn, cur_txn); + } +#endif + } +} + +void Init_print_stats_all(print_status *pstatus) +{ + database_info *cur_database; + txn_info *cur_txn; + HASH_SEQ_STATUS status; + HTAB *txn; + + pstatus->index = 0; + pstatus->count = 0; + INIT(pstatus->gid); + INIT(pstatus->global_status); + INIT(pstatus->status); + INIT(pstatus->database); + + for (cur_database = head_database_info; cur_database; cur_database = cur_database->next) + { + txn = cur_database->all_txn_info; + hash_seq_init(&status, txn); + while ((cur_txn = (txn_info *) hash_seq_search(&status)) != NULL) + { + cur_txn->global_txn_stat = check_txn_global_status(cur_txn); + if (cur_txn->global_txn_stat != TXN_STATUS_INPROGRESS) + Init_print_stats(cur_txn, cur_database->database_name, pstatus); + } +#if 0 + for (cur_txn = cur_database->head_txn_info; cur_txn; cur_txn = cur_txn->next) + { + cur_txn->global_txn_stat = check_txn_global_status(cur_txn); + if (cur_txn->global_txn_stat != TXN_STATUS_INPROGRESS) + Init_print_stats(cur_txn, cur_database->database_name, pstatus); + } +#endif + } +} + +void Init_print_stats(txn_info *txn, char *database, print_status * pstatus) +{ + int ii; + StringInfoData query; + initStringInfo(&query); + + RPALLOC(pstatus->gid); + RPALLOC(pstatus->global_status); + RPALLOC(pstatus->status); + RPALLOC(pstatus->database); + + pstatus->gid[pstatus->count] = (char *)palloc0(100 * sizeof(char)); + pstatus->database[pstatus->count] = (char *)palloc0(100 * sizeof(char)); + pstatus->global_status[pstatus->count] = (char *)palloc0(100 * sizeof(char)); + + strncpy(pstatus->gid[pstatus->count], txn->gid, 100); + strncpy(pstatus->database[pstatus->count], database, 100); + strncpy(pstatus->global_status[pstatus->count], txn_status_to_string(check_txn_global_status(txn)), 100); + + for (ii = 0; ii < pgxc_clean_node_count; ii++) + { + appendStringInfo(&query, "%-12s:%-15s", get_pgxc_nodename(find_node_oid(ii)), + txn_status_to_string(txn->txn_stat[ii])); + if (ii < pgxc_clean_node_count - 1) + { + appendStringInfoChar(&query, '\n'); + } + } + + pstatus->status[pstatus->count] = (char *)palloc0((strlen(query.data)+1) * sizeof(char)); + strncpy(pstatus->status[pstatus->count], query.data, strlen(query.data)+1); + pstatus->gid_count++; + pstatus->database_count++; + pstatus->global_status_count++; + pstatus->status_count++; + pstatus->count++; +} + +static const char *txn_status_to_string(TXN_STATUS status) +{ + switch (status) + { + ENUM_TOCHAR_CASE(TXN_STATUS_INITIAL) + ENUM_TOCHAR_CASE(TXN_STATUS_UNKNOWN) + ENUM_TOCHAR_CASE(TXN_STATUS_PREPARED) + ENUM_TOCHAR_CASE(TXN_STATUS_COMMITTED) + ENUM_TOCHAR_CASE(TXN_STATUS_ABORTED) + ENUM_TOCHAR_CASE(TXN_STATUS_INPROGRESS) + ENUM_TOCHAR_CASE(TXN_STATUS_FAILED) + } + return NULL; +} + +static const char 
*txn_op_to_string(OPERATION op) +{ + switch (op) + { + ENUM_TOCHAR_CASE(UNDO) + ENUM_TOCHAR_CASE(ABORT) + ENUM_TOCHAR_CASE(COMMIT) + } + return NULL; +} + + +static void +CheckFirstPhase(txn_info *txn) +{ +// int ret; + Oid orignode = txn->origcoord; + uint32 startxid = txn->startxid; +// uint32 transactionid; + int nodeidx; + + /* + * if the twophase trans does not success in prepare phase, the orignode == InvalidOid. + */ + if (InvalidOid == orignode) + { + return; + } + nodeidx = find_node_index(orignode); + if (0 == txn->xid[nodeidx]) + { + txn->xid[nodeidx] = startxid; + } + /* start node participate */ + if (txn->isorigcoord_part) + { + if (0 == txn->coordparts[nodeidx]) + { + txn->coordparts[nodeidx] = 1; + txn->num_coordparts++; + } + if (txn->txn_stat[nodeidx] == TXN_STATUS_INITIAL) + { + /*select * from pgxc_is_committed...*/ + getTxnStatus(txn, nodeidx); + } + if (txn->txn_stat[nodeidx] == TXN_STATUS_PREPARED && txn->global_commit_timestamp != InvalidGlobalTimestamp) + { + txn->after_first_phase = true; + } + } + /* start node node participate */ + else + { +#if 0 + ret = Get2PCFile(orignode, txn->gid, &transactionid); + if (ret == FILENOTFOUND) + txn->after_first_phase = false; + else if (ret == FILEUNKOWN) + txn->global_txn_stat = TXN_STATUS_UNKNOWN; + else if (ret == FILEFOUND && txn->global_commit_timestamp != InvalidGlobalTimestamp) + txn->after_first_phase = true; +#endif + if (txn->global_commit_timestamp != InvalidGlobalTimestamp) + { + txn->after_first_phase = true; + } else { + txn->after_first_phase = false; + } + } +} + +void get_transaction_handles(PGXCNodeAllHandles **pgxc_handles, txn_info *txn) +{ + int dn_index = 0; + int cn_index = 0; + int nodeIndex; + char nodetype; + List *coordlist = NIL; + List *nodelist = NIL; + + while (dn_index < dn_nodes_num) + { + + /* Get node type and index */ + nodetype = PGXC_NODE_NONE; + if (TXN_STATUS_PREPARED != txn->txn_stat[dn_index + cn_nodes_num]) + { + dn_index++; + continue; + } + nodeIndex = PGXCNodeGetNodeIdFromName(get_pgxc_nodename(dn_node_list[dn_index]), &nodetype); + if (nodetype == PGXC_NODE_NONE) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("PGXC Node %s: object not defined", + get_pgxc_nodename(dn_node_list[dn_index])))); + + /* Check if node is requested is the self-node or not */ + if (nodetype == PGXC_NODE_DATANODE) + { + nodelist = lappend_int(nodelist, nodeIndex); + } + dn_index++; + + } + + while (cn_index < cn_nodes_num) + { + /* Get node type and index */ + nodetype = PGXC_NODE_NONE; + if (TXN_STATUS_PREPARED != txn->txn_stat[cn_index] || cn_node_list[cn_index] == txn->origcoord) + { + cn_index++; + continue; + } + nodeIndex = PGXCNodeGetNodeIdFromName(get_pgxc_nodename(cn_node_list[cn_index]), &nodetype); + if (nodetype == PGXC_NODE_NONE) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("PGXC Node %s: object not defined", + get_pgxc_nodename(cn_node_list[cn_index])))); + + /* Check if node is requested is the self-node or not */ + if (nodetype == PGXC_NODE_COORDINATOR) + { + coordlist = lappend_int(coordlist, nodeIndex); + } + cn_index++; + } + *pgxc_handles = get_handles(nodelist, coordlist, false, true, true); +} + +void get_node_handles(PGXCNodeAllHandles **pgxc_handles, Oid nodeoid) +{ + char nodetype = PGXC_NODE_NONE; + int nodeIndex; + List *coordlist = NIL; + List *nodelist = NIL; + + nodeIndex = PGXCNodeGetNodeIdFromName(get_pgxc_nodename(nodeoid), &nodetype); + if (nodetype == PGXC_NODE_COORDINATOR) + { + coordlist = lappend_int(coordlist, nodeIndex); + } + 
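/* any node that is not a coordinator goes on the datanode list */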
else + { + nodelist = lappend_int(nodelist, nodeIndex); + } + *pgxc_handles = get_handles(nodelist, coordlist, false, true, true); +} + diff --git a/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c b/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c index ff748ae0..5efecaf6 100644 --- a/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c +++ b/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c @@ -526,10 +526,11 @@ pg_stat_get_remote_activity(const char *sessionid, bool coordonly, Tuplestoresta plan->exec_type = EXEC_ON_ALL_NODES; plan->sql_statement = (char *) query; plan->force_autocommit = false; + plan->exec_nodes = makeNode(ExecNodes); + plan->exec_nodes->missing_ok = true; if (coordonly) { - plan->exec_nodes = makeNode(ExecNodes); plan->exec_nodes->nodeList = GetAllCoordNodes(); plan->exec_type = EXEC_ON_COORDS; } diff --git a/src/backend/pgxc/barrier/barrier.c b/src/backend/pgxc/barrier/barrier.c index c73a13dd..a4ea113c 100644 --- a/src/backend/pgxc/barrier/barrier.c +++ b/src/backend/pgxc/barrier/barrier.c @@ -2,14 +2,14 @@ * * barrier.c * - * Barrier handling for PITR + * Barrier handling for PITR * * * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group * Portions Copyright (c) 2010-2012 Postgres-XC Development Group * * IDENTIFICATION - * $$ + * $$ * *------------------------------------------------------------------------- */ @@ -53,25 +53,25 @@ static void EndBarrier(PGXCNodeAllHandles *handles, const char *id); void ProcessCreateBarrierPrepare(const char *id) { - StringInfoData buf; + StringInfoData buf; - if (!IS_PGXC_REMOTE_COORDINATOR) - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("The CREATE BARRIER PREPARE message is expected to " - "arrive at a Coordinator from another Coordinator"))); + if (!IS_PGXC_REMOTE_COORDINATOR) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("The CREATE BARRIER PREPARE message is expected to " + "arrive at a Coordinator from another Coordinator"))); - LWLockAcquire(BarrierLock, LW_EXCLUSIVE); + LWLockAcquire(BarrierLock, LW_EXCLUSIVE); - pq_beginmessage(&buf, 'b'); - pq_sendstring(&buf, id); - pq_endmessage(&buf); - pq_flush(); + pq_beginmessage(&buf, 'b'); + pq_sendstring(&buf, id); + pq_endmessage(&buf); + pq_flush(); - /* - * TODO Start a timer to terminate the pending barrier after a specified - * timeout - */ + /* + * TODO Start a timer to terminate the pending barrier after a specified + * timeout + */ } /* @@ -81,24 +81,24 @@ ProcessCreateBarrierPrepare(const char *id) void ProcessCreateBarrierEnd(const char *id) { - StringInfoData buf; + StringInfoData buf; - if (!IS_PGXC_REMOTE_COORDINATOR) - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("The CREATE BARRIER END message is expected to " - "arrive at a Coordinator from another Coordinator"))); + if (!IS_PGXC_REMOTE_COORDINATOR) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("The CREATE BARRIER END message is expected to " + "arrive at a Coordinator from another Coordinator"))); - LWLockRelease(BarrierLock); + LWLockRelease(BarrierLock); - pq_beginmessage(&buf, 'b'); - pq_sendstring(&buf, id); - pq_endmessage(&buf); - pq_flush(); + pq_beginmessage(&buf, 'b'); + pq_sendstring(&buf, id); + pq_endmessage(&buf); + pq_flush(); - /* - * TODO Stop the timer - */ + /* + * TODO Stop the timer + */ } /* @@ -109,186 +109,186 @@ ProcessCreateBarrierEnd(const char *id) void ProcessCreateBarrierExecute(const char *id) { - StringInfoData buf; - - if (!IsConnFromCoord()) - ereport(ERROR, - 
(errcode(ERRCODE_INTERNAL_ERROR), - errmsg("The CREATE BARRIER EXECUTE message is expected to " - "arrive from a Coordinator"))); - { - XLogRecPtr recptr; - - XLogBeginInsert(); - XLogRegisterData((char *) &id, strlen(id) + 1); - recptr = XLogInsert(RM_BARRIER_ID, XLOG_BARRIER_CREATE); - XLogFlush(recptr); - } - - pq_beginmessage(&buf, 'b'); - pq_sendstring(&buf, id); - pq_endmessage(&buf); - pq_flush(); + StringInfoData buf; + + if (!IsConnFromCoord()) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("The CREATE BARRIER EXECUTE message is expected to " + "arrive from a Coordinator"))); + { + XLogRecPtr recptr; + + XLogBeginInsert(); + XLogRegisterData((char *) &id, strlen(id) + 1); + recptr = XLogInsert(RM_BARRIER_ID, XLOG_BARRIER_CREATE); + XLogFlush(recptr); + } + + pq_beginmessage(&buf, 'b'); + pq_sendstring(&buf, id); + pq_endmessage(&buf); + pq_flush(); } static const char * generate_barrier_id(const char *id) { - char genid[1024]; - TimestampTz ts; + char genid[1024]; + TimestampTz ts; - /* - * If the caller can passed a NULL value, generate an id which is - * guaranteed to be unique across the cluster. We use a combination of - * the Coordinator node id and current timestamp. - */ + /* + * If the caller can passed a NULL value, generate an id which is + * guaranteed to be unique across the cluster. We use a combination of + * the Coordinator node id and current timestamp. + */ - if (id) - return id; + if (id) + return id; - ts = GetCurrentTimestamp(); + ts = GetCurrentTimestamp(); #ifdef HAVE_INT64_TIMESTAMP - sprintf(genid, "%s_"INT64_FORMAT, PGXCNodeName, ts); + sprintf(genid, "%s_"INT64_FORMAT, PGXCNodeName, ts); #else - sprintf(genid, "%s_%.0f", PGXCNodeName, ts); + sprintf(genid, "%s_%.0f", PGXCNodeName, ts); #endif - return pstrdup(genid); + return pstrdup(genid); } static PGXCNodeAllHandles * SendBarrierPrepareRequest(List *coords, const char *id) { - PGXCNodeAllHandles *coord_handles; - int conn; - int msglen; - int barrier_idlen; + PGXCNodeAllHandles *coord_handles; + int conn; + int msglen; + int barrier_idlen; - coord_handles = get_handles(NIL, coords, true, true); + coord_handles = get_handles(NIL, coords, true, true, true); - for (conn = 0; conn < coord_handles->co_conn_count; conn++) - { - PGXCNodeHandle *handle = coord_handles->coord_handles[conn]; + for (conn = 0; conn < coord_handles->co_conn_count; conn++) + { + PGXCNodeHandle *handle = coord_handles->coord_handles[conn]; - /* Invalid connection state, return error */ - if (handle->state != DN_CONNECTION_STATE_IDLE) - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Failed to send CREATE BARRIER PREPARE request " - "to the node"))); + /* Invalid connection state, return error */ + if (handle->state != DN_CONNECTION_STATE_IDLE) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to send CREATE BARRIER PREPARE request " + "to the node"))); - barrier_idlen = strlen(id) + 1; + barrier_idlen = strlen(id) + 1; - msglen = 4; /* for the length itself */ - msglen += barrier_idlen; - msglen += 1; /* for barrier command itself */ + msglen = 4; /* for the length itself */ + msglen += barrier_idlen; + msglen += 1; /* for barrier command itself */ - /* msgType + msgLen */ - if (ensure_out_buffer_capacity(handle->outEnd + 1 + msglen, handle) != 0) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Out of memory"))); - } + /* msgType + msgLen */ + if (ensure_out_buffer_capacity(handle->outEnd + 1 + msglen, handle) != 0) + { + ereport(ERROR, + 
(errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Out of memory"))); + } - handle->outBuffer[handle->outEnd++] = 'b'; - msglen = htonl(msglen); - memcpy(handle->outBuffer + handle->outEnd, &msglen, 4); - handle->outEnd += 4; + handle->outBuffer[handle->outEnd++] = 'b'; + msglen = htonl(msglen); + memcpy(handle->outBuffer + handle->outEnd, &msglen, 4); + handle->outEnd += 4; - handle->outBuffer[handle->outEnd++] = CREATE_BARRIER_PREPARE; + handle->outBuffer[handle->outEnd++] = CREATE_BARRIER_PREPARE; - memcpy(handle->outBuffer + handle->outEnd, id, barrier_idlen); - handle->outEnd += barrier_idlen; + memcpy(handle->outBuffer + handle->outEnd, id, barrier_idlen); + handle->outEnd += barrier_idlen; - PGXCNodeSetConnectionState(handle, DN_CONNECTION_STATE_QUERY); + PGXCNodeSetConnectionState(handle, DN_CONNECTION_STATE_QUERY); - pgxc_node_flush(handle); - } + pgxc_node_flush(handle); + } - return coord_handles; + return coord_handles; } static void CheckBarrierCommandStatus(PGXCNodeAllHandles *conn_handles, const char *id, - const char *command) + const char *command) { - int conn; - int count = conn_handles->co_conn_count + conn_handles->dn_conn_count; - - elog(DEBUG2, "Check CREATE BARRIER <%s> %s command status", id, command); - - for (conn = 0; conn < count; conn++) - { - PGXCNodeHandle *handle; - - if (conn < conn_handles->co_conn_count) - handle = conn_handles->coord_handles[conn]; - else - handle = conn_handles->datanode_handles[conn - conn_handles->co_conn_count]; - - if (pgxc_node_receive(1, &handle, NULL)) - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Failed to receive response from the remote side"))); - - if (handle_response(handle, NULL) != RESPONSE_BARRIER_OK) - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("CREATE BARRIER PREPARE command failed " - "with error %s", handle->error))); - } - - elog(DEBUG2, "Successfully completed CREATE BARRIER <%s> %s command on " - "all nodes", id, command); + int conn; + int count = conn_handles->co_conn_count + conn_handles->dn_conn_count; + + elog(DEBUG2, "Check CREATE BARRIER <%s> %s command status", id, command); + + for (conn = 0; conn < count; conn++) + { + PGXCNodeHandle *handle; + + if (conn < conn_handles->co_conn_count) + handle = conn_handles->coord_handles[conn]; + else + handle = conn_handles->datanode_handles[conn - conn_handles->co_conn_count]; + + if (pgxc_node_receive(1, &handle, NULL)) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to receive response from the remote side"))); + + if (handle_response(handle, NULL) != RESPONSE_BARRIER_OK) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("CREATE BARRIER PREPARE command failed " + "with error %s", handle->error))); + } + + elog(DEBUG2, "Successfully completed CREATE BARRIER <%s> %s command on " + "all nodes", id, command); } static void SendBarrierEndRequest(PGXCNodeAllHandles *coord_handles, const char *id) { - int conn; - int msglen; - int barrier_idlen; + int conn; + int msglen; + int barrier_idlen; - elog(DEBUG2, "Sending CREATE BARRIER <%s> END command to all Coordinators", id); + elog(DEBUG2, "Sending CREATE BARRIER <%s> END command to all Coordinators", id); - for (conn = 0; conn < coord_handles->co_conn_count; conn++) - { - PGXCNodeHandle *handle = coord_handles->coord_handles[conn]; + for (conn = 0; conn < coord_handles->co_conn_count; conn++) + { + PGXCNodeHandle *handle = coord_handles->coord_handles[conn]; - /* Invalid connection state, return error */ - if (handle->state != DN_CONNECTION_STATE_IDLE) - 
ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Failed to send CREATE BARRIER PREPARE request " - "to the node"))); + /* Invalid connection state, return error */ + if (handle->state != DN_CONNECTION_STATE_IDLE) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to send CREATE BARRIER PREPARE request " + "to the node"))); - barrier_idlen = strlen(id) + 1; + barrier_idlen = strlen(id) + 1; - msglen = 4; /* for the length itself */ - msglen += barrier_idlen; - msglen += 1; /* for barrier command itself */ + msglen = 4; /* for the length itself */ + msglen += barrier_idlen; + msglen += 1; /* for barrier command itself */ - /* msgType + msgLen */ - if (ensure_out_buffer_capacity(handle->outEnd + 1 + msglen, handle) != 0) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Out of memory"))); - } + /* msgType + msgLen */ + if (ensure_out_buffer_capacity(handle->outEnd + 1 + msglen, handle) != 0) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Out of memory"))); + } - handle->outBuffer[handle->outEnd++] = 'b'; - msglen = htonl(msglen); - memcpy(handle->outBuffer + handle->outEnd, &msglen, 4); - handle->outEnd += 4; + handle->outBuffer[handle->outEnd++] = 'b'; + msglen = htonl(msglen); + memcpy(handle->outBuffer + handle->outEnd, &msglen, 4); + handle->outEnd += 4; - handle->outBuffer[handle->outEnd++] = CREATE_BARRIER_END; + handle->outBuffer[handle->outEnd++] = CREATE_BARRIER_END; - memcpy(handle->outBuffer + handle->outEnd, id, barrier_idlen); - handle->outEnd += barrier_idlen; + memcpy(handle->outBuffer + handle->outEnd, id, barrier_idlen); + handle->outEnd += barrier_idlen; - PGXCNodeSetConnectionState(handle, DN_CONNECTION_STATE_QUERY); - pgxc_node_flush(handle); - } + PGXCNodeSetConnectionState(handle, DN_CONNECTION_STATE_QUERY); + pgxc_node_flush(handle); + } } @@ -306,35 +306,35 @@ SendBarrierEndRequest(PGXCNodeAllHandles *coord_handles, const char *id) static PGXCNodeAllHandles * PrepareBarrier(const char *id) { - PGXCNodeAllHandles *coord_handles; + PGXCNodeAllHandles *coord_handles; - elog(DEBUG2, "Preparing Coordinators for BARRIER"); + elog(DEBUG2, "Preparing Coordinators for BARRIER"); - /* - * Send a CREATE BARRIER PREPARE message to all the Coordinators. We should - * send an asynchronous request so that we can disable local commits and - * then wait for the remote Coordinators to finish the work - */ - coord_handles = SendBarrierPrepareRequest(GetAllCoordNodes(), id); + /* + * Send a CREATE BARRIER PREPARE message to all the Coordinators. We should + * send an asynchronous request so that we can disable local commits and + * then wait for the remote Coordinators to finish the work + */ + coord_handles = SendBarrierPrepareRequest(GetAllCoordNodes(), id); - /* - * Disable local commits - */ - LWLockAcquire(BarrierLock, LW_EXCLUSIVE); + /* + * Disable local commits + */ + LWLockAcquire(BarrierLock, LW_EXCLUSIVE); - elog(DEBUG2, "Disabled 2PC commits originating at the driving Coordinator"); + elog(DEBUG2, "Disabled 2PC commits originating at the driving Coordinator"); - /* - * TODO Start a timer to cancel the barrier request in case of a timeout - */ + /* + * TODO Start a timer to cancel the barrier request in case of a timeout + */ - /* - * Local in-flight commits are now over. Check status of the remote - * Coordinators - */ - CheckBarrierCommandStatus(coord_handles, id, "PREPARE"); + /* + * Local in-flight commits are now over. 
Check status of the remote + * Coordinators + */ + CheckBarrierCommandStatus(coord_handles, id, "PREPARE"); - return coord_handles; + return coord_handles; } /* @@ -344,80 +344,80 @@ PrepareBarrier(const char *id) static void ExecuteBarrier(const char *id) { - List *barrierDataNodeList = GetAllDataNodes(); - List *barrierCoordList = GetAllCoordNodes(); - PGXCNodeAllHandles *conn_handles; - int conn; - int msglen; - int barrier_idlen; - - conn_handles = get_handles(barrierDataNodeList, barrierCoordList, false, true); - - elog(DEBUG2, "Sending CREATE BARRIER <%s> EXECUTE message to " - "Datanodes and Coordinator", id); - /* - * Send a CREATE BARRIER request to all the Datanodes and the Coordinators - */ - for (conn = 0; conn < conn_handles->co_conn_count + conn_handles->dn_conn_count; conn++) - { - PGXCNodeHandle *handle; - - if (conn < conn_handles->co_conn_count) - handle = conn_handles->coord_handles[conn]; - else - handle = conn_handles->datanode_handles[conn - conn_handles->co_conn_count]; - - /* Invalid connection state, return error */ - if (handle->state != DN_CONNECTION_STATE_IDLE) - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Failed to send CREATE BARRIER EXECUTE request " - "to the node"))); - - barrier_idlen = strlen(id) + 1; - - msglen = 4; /* for the length itself */ - msglen += barrier_idlen; - msglen += 1; /* for barrier command itself */ - - /* msgType + msgLen */ - if (ensure_out_buffer_capacity(handle->outEnd + 1 + msglen, handle) != 0) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Out of memory"))); - } - - handle->outBuffer[handle->outEnd++] = 'b'; - msglen = htonl(msglen); - memcpy(handle->outBuffer + handle->outEnd, &msglen, 4); - handle->outEnd += 4; - - handle->outBuffer[handle->outEnd++] = CREATE_BARRIER_EXECUTE; - - memcpy(handle->outBuffer + handle->outEnd, id, barrier_idlen); - handle->outEnd += barrier_idlen; - - PGXCNodeSetConnectionState(handle, DN_CONNECTION_STATE_QUERY); - pgxc_node_flush(handle); - } - - CheckBarrierCommandStatus(conn_handles, id, "EXECUTE"); - - pfree_pgxc_all_handles(conn_handles); - - /* - * Also WAL log the BARRIER locally and flush the WAL buffers to disk - */ - { - XLogRecPtr recptr; - - XLogBeginInsert(); - XLogRegisterData((char *) &id, strlen(id) + 1); - - recptr = XLogInsert(RM_BARRIER_ID, XLOG_BARRIER_CREATE); - XLogFlush(recptr); - } + List *barrierDataNodeList = GetAllDataNodes(); + List *barrierCoordList = GetAllCoordNodes(); + PGXCNodeAllHandles *conn_handles; + int conn; + int msglen; + int barrier_idlen; + + conn_handles = get_handles(barrierDataNodeList, barrierCoordList, false, true, true); + + elog(DEBUG2, "Sending CREATE BARRIER <%s> EXECUTE message to " + "Datanodes and Coordinator", id); + /* + * Send a CREATE BARRIER request to all the Datanodes and the Coordinators + */ + for (conn = 0; conn < conn_handles->co_conn_count + conn_handles->dn_conn_count; conn++) + { + PGXCNodeHandle *handle; + + if (conn < conn_handles->co_conn_count) + handle = conn_handles->coord_handles[conn]; + else + handle = conn_handles->datanode_handles[conn - conn_handles->co_conn_count]; + + /* Invalid connection state, return error */ + if (handle->state != DN_CONNECTION_STATE_IDLE) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to send CREATE BARRIER EXECUTE request " + "to the node"))); + + barrier_idlen = strlen(id) + 1; + + msglen = 4; /* for the length itself */ + msglen += barrier_idlen; + msglen += 1; /* for barrier command itself */ + + /* msgType + msgLen */ + if 
(ensure_out_buffer_capacity(handle->outEnd + 1 + msglen, handle) != 0) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Out of memory"))); + } + + handle->outBuffer[handle->outEnd++] = 'b'; + msglen = htonl(msglen); + memcpy(handle->outBuffer + handle->outEnd, &msglen, 4); + handle->outEnd += 4; + + handle->outBuffer[handle->outEnd++] = CREATE_BARRIER_EXECUTE; + + memcpy(handle->outBuffer + handle->outEnd, id, barrier_idlen); + handle->outEnd += barrier_idlen; + + PGXCNodeSetConnectionState(handle, DN_CONNECTION_STATE_QUERY); + pgxc_node_flush(handle); + } + + CheckBarrierCommandStatus(conn_handles, id, "EXECUTE"); + + pfree_pgxc_all_handles(conn_handles); + + /* + * Also WAL log the BARRIER locally and flush the WAL buffers to disk + */ + { + XLogRecPtr recptr; + + XLogBeginInsert(); + XLogRegisterData((char *) &id, strlen(id) + 1); + + recptr = XLogInsert(RM_BARRIER_ID, XLOG_BARRIER_CREATE); + XLogFlush(recptr); + } } /* @@ -426,70 +426,70 @@ ExecuteBarrier(const char *id) static void EndBarrier(PGXCNodeAllHandles *prepared_handles, const char *id) { - /* Resume 2PC locally */ - LWLockRelease(BarrierLock); + /* Resume 2PC locally */ + LWLockRelease(BarrierLock); - SendBarrierEndRequest(prepared_handles, id); + SendBarrierEndRequest(prepared_handles, id); - CheckBarrierCommandStatus(prepared_handles, id, "END"); + CheckBarrierCommandStatus(prepared_handles, id, "END"); } void RequestBarrier(const char *id, char *completionTag) { - PGXCNodeAllHandles *prepared_handles; - const char *barrier_id; - - elog(DEBUG2, "CREATE BARRIER request received"); - /* - * Ensure that we are a Coordinator and the request is not from another - * coordinator - */ - if (!IS_PGXC_COORDINATOR) - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("CREATE BARRIER command must be sent to a Coordinator"))); - - if (IsConnFromCoord()) - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("CREATE BARRIER command is not expected from another Coordinator"))); - - /* - * Get a barrier id if the user has not supplied it - */ - barrier_id = generate_barrier_id(id); - - elog(DEBUG2, "CREATE BARRIER <%s>", barrier_id); - - /* - * Step One. Prepare all Coordinators for upcoming barrier request - */ - prepared_handles = PrepareBarrier(barrier_id); - - /* - * Step two. Issue BARRIER command to all involved components, including - * Coordinators and Datanodes - */ - ExecuteBarrier(barrier_id); - - /* - * Step three. Inform Coordinators about a successfully completed barrier - */ - EndBarrier(prepared_handles, barrier_id); - /* Finally report the barrier to GTM to backup its restart point */ - ReportBarrierGTM(barrier_id); - - /* Free the handles */ - pfree_pgxc_all_handles(prepared_handles); - - if (completionTag) - snprintf(completionTag, COMPLETION_TAG_BUFSIZE, "BARRIER %s", barrier_id); + PGXCNodeAllHandles *prepared_handles; + const char *barrier_id; + + elog(DEBUG2, "CREATE BARRIER request received"); + /* + * Ensure that we are a Coordinator and the request is not from another + * coordinator + */ + if (!IS_PGXC_COORDINATOR) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("CREATE BARRIER command must be sent to a Coordinator"))); + + if (IsConnFromCoord()) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("CREATE BARRIER command is not expected from another Coordinator"))); + + /* + * Get a barrier id if the user has not supplied it + */ + barrier_id = generate_barrier_id(id); + + elog(DEBUG2, "CREATE BARRIER <%s>", barrier_id); + + /* + * Step One. 
Prepare all Coordinators for upcoming barrier request + */ + prepared_handles = PrepareBarrier(barrier_id); + + /* + * Step two. Issue BARRIER command to all involved components, including + * Coordinators and Datanodes + */ + ExecuteBarrier(barrier_id); + + /* + * Step three. Inform Coordinators about a successfully completed barrier + */ + EndBarrier(prepared_handles, barrier_id); + /* Finally report the barrier to GTM to backup its restart point */ + ReportBarrierGTM(barrier_id); + + /* Free the handles */ + pfree_pgxc_all_handles(prepared_handles); + + if (completionTag) + snprintf(completionTag, COMPLETION_TAG_BUFSIZE, "BARRIER %s", barrier_id); } void barrier_redo(XLogReaderState *record) { - /* Nothing to do */ - return; + /* Nothing to do */ + return; } diff --git a/src/backend/pgxc/cluster/pause.c b/src/backend/pgxc/cluster/pause.c index cf2433cb..66c6d5e1 100644 --- a/src/backend/pgxc/cluster/pause.c +++ b/src/backend/pgxc/cluster/pause.c @@ -122,7 +122,7 @@ HandleClusterPause(bool pause, bool initiator) * coordinators to respond back */ - coord_handles = get_handles(NIL, GetAllCoordNodes(), true, true); + coord_handles = get_handles(NIL, GetAllCoordNodes(), true, true, true); for (conn = 0; conn < coord_handles->co_conn_count; conn++) { @@ -309,7 +309,7 @@ PGXCCleanClusterLock(int code, Datum arg) if (IsConnFromCoord()) return; - coord_handles = get_handles(NIL, GetAllCoordNodes(), true, true); + coord_handles = get_handles(NIL, GetAllCoordNodes(), true, true, true); /* Try best-effort to UNPAUSE other coordinators now */ for (conn = 0; conn < coord_handles->co_conn_count; conn++) { diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 5a7d8b4d..38f5bb3a 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -5865,7 +5865,7 @@ DataNodeCopyBegin(RemoteCopyData *rcstate) else { PGXCNodeAllHandles *pgxc_handles; - pgxc_handles = get_handles(nodelist, NULL, false, true); + pgxc_handles = get_handles(nodelist, NULL, false, true, true); connections = pgxc_handles->datanode_handles; Assert(pgxc_handles->dn_conn_count == conn_count); pfree(pgxc_handles); @@ -6261,6 +6261,7 @@ get_exec_connections(RemoteQueryState *planstate, int co_conn_count, dn_conn_count; bool is_query_coord_only = false; PGXCNodeAllHandles *pgxc_handles = NULL; + bool missing_ok = (exec_nodes ? 
exec_nodes->missing_ok : false); #ifdef __TBASE__ if (IsParallelWorker()) @@ -6527,7 +6528,7 @@ get_exec_connections(RemoteQueryState *planstate, #endif /* Get other connections (non-primary) */ - pgxc_handles = get_handles(nodelist, coordlist, is_query_coord_only, is_global_session); + pgxc_handles = get_handles(nodelist, coordlist, is_query_coord_only, is_global_session, !missing_ok); if (!pgxc_handles) ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), @@ -6538,7 +6539,7 @@ get_exec_connections(RemoteQueryState *planstate, { /* Let's assume primary connection is always a Datanode connection for the moment */ PGXCNodeAllHandles *pgxc_conn_res; - pgxc_conn_res = get_handles(primarynode, NULL, false, is_global_session); + pgxc_conn_res = get_handles(primarynode, NULL, false, is_global_session, true); /* primary connection is unique */ primaryconnection = pgxc_conn_res->datanode_handles[0]; @@ -6552,6 +6553,50 @@ get_exec_connections(RemoteQueryState *planstate, pgxc_handles->primary_handle = primaryconnection; } + if (missing_ok) + { + /* compact handle list exclude missing nodes */ + int i = 0; + while (dn_conn_count && i < dn_conn_count) + { + if (DN_CONNECTION_STATE_ERROR(pgxc_handles->datanode_handles[i])) + { + /* find last healthy handle */ + while (dn_conn_count - 1 > i && + DN_CONNECTION_STATE_ERROR(pgxc_handles->datanode_handles[dn_conn_count - 1])) + dn_conn_count--; + + /* replace bad handle with last healthy handle */ + pgxc_handles->datanode_handles[i] = + pgxc_handles->datanode_handles[dn_conn_count - 1]; + /* exclude bad handle */ + pgxc_handles->datanode_handles[dn_conn_count - 1] = NULL; + dn_conn_count--; + } + i++; + } + + i = 0; + while (co_conn_count && i < co_conn_count) + { + if (DN_CONNECTION_STATE_ERROR(pgxc_handles->coord_handles[i])) + { + /* find last healthy handle */ + while (co_conn_count - 1 > i && + DN_CONNECTION_STATE_ERROR(pgxc_handles->coord_handles[co_conn_count - 1])) + co_conn_count--; + + /* replace bad handle with last healthy handle */ + pgxc_handles->coord_handles[i] = + pgxc_handles->coord_handles[co_conn_count - 1]; + /* exclude bad handle */ + pgxc_handles->coord_handles[co_conn_count - 1] = NULL; + co_conn_count--; + } + i++; + } + } + /* Depending on the execution type, we still need to save the initial node counts */ pgxc_handles->dn_conn_count = dn_conn_count; pgxc_handles->co_conn_count = co_conn_count; @@ -7168,7 +7213,7 @@ ExecCloseRemoteStatement(const char *stmt_name, List *nodelist) return; /* get needed Datanode connections */ - all_handles = get_handles(nodelist, NIL, false, true); + all_handles = get_handles(nodelist, NIL, false, true, true); conn_count = all_handles->dn_conn_count; connections = all_handles->datanode_handles; @@ -8188,7 +8233,7 @@ pgxc_node_remote_prefinish(char *prepareGID, char *nodestring) if (nodelist == NIL && coordlist == NIL) return false; - pgxc_handles = get_handles(nodelist, coordlist, false, true); + pgxc_handles = get_handles(nodelist, coordlist, false, true, true); for (i = 0; i < pgxc_handles->dn_conn_count; i++) { @@ -8548,7 +8593,7 @@ pgxc_node_remote_finish(char *prepareGID, bool commit, return prepared_local; - pgxc_handles = get_handles(nodelist, coordlist, false, true); + pgxc_handles = get_handles(nodelist, coordlist, false, true, true); #ifdef __TWO_PHASE_TRANS__ SetLocalTwoPhaseStateHandles(pgxc_handles); #endif @@ -10466,7 +10511,7 @@ ExecFinishInitRemoteSubplan(RemoteSubplanState *node) if (node->execOnAll) { PGXCNodeAllHandles *pgxc_connections; - pgxc_connections = 
get_handles(node->execNodes, NIL, false, true); + pgxc_connections = get_handles(node->execNodes, NIL, false, true, true); combiner->conn_count = pgxc_connections->dn_conn_count; combiner->connections = pgxc_connections->datanode_handles; combiner->current_conn = 0; diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c index 233fd0e2..d37a0b48 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -3615,7 +3615,7 @@ get_any_handle(List *datanodelist) //char *init_str = NULL; List *allocate = list_make1_int(node); int *pids; - int *fds = PoolManagerGetConnections(allocate, NIL, + int *fds = PoolManagerGetConnections(allocate, NIL, true, &pids); PGXCNodeHandle *node_handle; @@ -3685,8 +3685,8 @@ get_any_handle(List *datanodelist) * Coordinator fds is returned only if transaction uses a DDL */ PGXCNodeAllHandles * -get_handles(List *datanodelist, List *coordlist, bool is_coord_only_query, bool is_global_session) -{// #lizard forgives +get_handles(List *datanodelist, List *coordlist, bool is_coord_only_query, bool is_global_session, bool raise_error) +{ PGXCNodeAllHandles *result; ListCell *node_list_item; List *dn_allocate = NIL; @@ -3864,7 +3864,7 @@ get_handles(List *datanodelist, List *coordlist, bool is_coord_only_query, bool { int j = 0; int *pids; - int *fds = PoolManagerGetConnections(dn_allocate, co_allocate, &pids); + int *fds = PoolManagerGetConnections(dn_allocate, co_allocate, raise_error, &pids); if (!fds) { @@ -3927,6 +3927,13 @@ get_handles(List *datanodelist, List *coordlist, bool is_coord_only_query, bool } node_handle = &dn_handles[node]; + + if (be_pid == 0 && !raise_error) + { + PGXCNodeSetConnectionState(node_handle, DN_CONNECTION_STATE_ERROR_FATAL); + continue; + } + pgxc_node_init(node_handle, fdsock, is_global_session, be_pid); dn_handles[node] = *node_handle; datanode_count++; @@ -3985,6 +3992,13 @@ get_handles(List *datanodelist, List *coordlist, bool is_coord_only_query, bool } node_handle = &co_handles[node]; + + if (be_pid == 0 && !raise_error) + { + PGXCNodeSetConnectionState(node_handle, DN_CONNECTION_STATE_ERROR_FATAL); + continue; + } + pgxc_node_init(node_handle, fdsock, is_global_session, be_pid); co_handles[node] = *node_handle; coord_count++; diff --git a/src/backend/pgxc/pool/poolmgr.c b/src/backend/pgxc/pool/poolmgr.c index 4e1da81f..7a688879 100644 --- a/src/backend/pgxc/pool/poolmgr.c +++ b/src/backend/pgxc/pool/poolmgr.c @@ -489,7 +489,8 @@ static void insert_database_pool(DatabasePool *pool); static void reload_database_pools(PoolAgent *agent); static DatabasePool *find_database_pool(const char *database, const char *user_name, const char *pgoptions); -static int agent_acquire_connections(PoolAgent *agent, List *datanodelist, List *coordlist, int32 *num, int **fd_result, int **pid_result); +static int agent_acquire_connections(PoolAgent *agent, List *datanodelist, List *coordlist, + bool raise_error, int32 *num, int **fd_result, int **pid_result); static int send_local_commands(PoolAgent *agent, List *datanodelist, List *coordlist); static int cancel_query_on_connections(PoolAgent *agent, List *datanodelist, List *coordlist, int signal); static PGXCNodePoolSlot *acquire_connection(DatabasePool *dbPool, PGXCNodePool **pool,int32 nodeidx, Oid node, bool bCoord); @@ -1797,13 +1798,15 @@ PoolManagerDisconnect(void) * Get pooled connections */ int * -PoolManagerGetConnections(List *datanodelist, List *coordlist, int **pids) -{// #lizard forgives +PoolManagerGetConnections(List *datanodelist, List 
*coordlist, bool raise_error, int **pids) +{ int i; ListCell *nodelist_item; int *fds; int totlen = list_length(datanodelist) + list_length(coordlist); + int totsize = sizeof(int) * (totlen + 2) + 1; /* sizeof nodes list + raise_error flag */ int nodes[totlen + 2]; + char *msg; int pool_recvpids_num; int pool_recvfds_ret; @@ -1850,7 +1853,11 @@ PoolManagerGetConnections(List *datanodelist, List *coordlist, int **pids) errmsg(POOL_MGR_PREFIX"out of memory"))); } - pool_putmessage(&poolHandle->port, 'g', (char *) nodes, sizeof(int) * (totlen + 2)); + msg = palloc(totsize); + memcpy(msg, (char *) nodes, totsize - 1); + msg[totsize - 1] = (char) raise_error; + pool_putmessage(&poolHandle->port, 'g', msg, totsize); + pfree(msg); if (PoolConnectDebugPrint) { @@ -2913,8 +2920,9 @@ agent_set_command(PoolAgent *agent, * return 0 : when fd_result and pid_result is not NULL, acquire connection is done(acquire from freeslot in pool). */ static int -agent_acquire_connections(PoolAgent *agent, List *datanodelist, List *coordlist, int32 *num, int **fd_result, int **pid_result) -{// #lizard forgives +agent_acquire_connections(PoolAgent *agent, List *datanodelist, List *coordlist, + bool raise_error, int32 *num, int **fd_result, int **pid_result) +{ int32 i = 0; int32 acquire_seq = 0; int node = 0; @@ -3101,6 +3109,7 @@ agent_acquire_connections(PoolAgent *agent, List *datanodelist, List *coordlist, elog(LOG, POOL_MGR_PREFIX"[agent_acquire_connections]going to acquire conn by sync thread for node:%s.", nodePool->node_name); } + asyncTaskCtl->m_missing_ok = !raise_error; /* dispatch build connection request */ succeed = dispatch_connection_request(asyncTaskCtl, false, @@ -3153,6 +3162,7 @@ agent_acquire_connections(PoolAgent *agent, List *datanodelist, List *coordlist, asyncTaskCtl = create_task_control(datanodelist, coordlist, *fd_result, *pid_result); } + asyncTaskCtl->m_missing_ok = !raise_error; /* dispatch set param request */ succeed = dispatch_connection_request(asyncTaskCtl, false, @@ -3233,6 +3243,7 @@ agent_acquire_connections(PoolAgent *agent, List *datanodelist, List *coordlist, asyncTaskCtl = create_task_control(datanodelist, coordlist, *fd_result, *pid_result); } + asyncTaskCtl->m_missing_ok = !raise_error; /* dispatch build connection request */ succeed = dispatch_connection_request(asyncTaskCtl, true, @@ -3287,6 +3298,8 @@ agent_acquire_connections(PoolAgent *agent, List *datanodelist, List *coordlist, { asyncTaskCtl = create_task_control(datanodelist, coordlist, *fd_result, *pid_result); } + + asyncTaskCtl->m_missing_ok = !raise_error; /* dispatch set param request */ succeed = dispatch_connection_request(asyncTaskCtl, true, @@ -7509,17 +7522,28 @@ void *pooler_sync_remote_operator_thread(void *arg) PGXCNodeClose(slot->conn); slot->conn = NULL; } - request->current_status = PoolConnectStaus_error; + + finish_task_request(request->taskControl); + + if (request->taskControl->m_missing_ok) + { + request->current_status = PoolConnectStaus_done; + break; + } + else + { + request->current_status = PoolConnectStaus_error; #ifdef __TBASE__ - SpinLockAcquire(&request->agent->port.lock); - request->agent->port.error_code = POOL_ERR_GET_CONNECTIONS_CONNECTION_BAD; - snprintf(request->agent->port.err_msg, POOL_ERR_MSG_LEN, "%s, connection info [%s]", poolErrorMsg[POOL_ERR_GET_CONNECTIONS_CONNECTION_BAD], - request->nodepool->connstr); - SpinLockRelease(&request->agent->port.lock); + SpinLockAcquire(&request->agent->port.lock); + request->agent->port.error_code = POOL_ERR_GET_CONNECTIONS_CONNECTION_BAD; 
+ snprintf(request->agent->port.err_msg, POOL_ERR_MSG_LEN, "%s, connection info [%s]", poolErrorMsg[POOL_ERR_GET_CONNECTIONS_CONNECTION_BAD], + request->nodepool->connstr); + SpinLockRelease(&request->agent->port.lock); #endif - set_task_status(request->taskControl, PoolAyncCtlStaus_error); - finish_task_request(request->taskControl); - break; + set_task_status(request->taskControl, PoolAyncCtlStaus_error); + pooler_thread_logger(LOG, "connection not connect for node:[%s] failed errno %d", request->nodepool->connstr, errno); + break; + } } slot->xc_cancelConn = (NODE_CANCEL *) PQgetCancel((PGconn *)slot->conn); @@ -7710,6 +7734,12 @@ void *pooler_sync_remote_operator_thread(void *arg) #endif node_number++; } + else if (request->taskControl->m_missing_ok) + { + request->taskControl->m_result[node_number] = 0; + request->taskControl->m_pidresult[node_number] = 0; + node_number++; + } } /* Save then in the array fds for Coordinators */ @@ -7726,6 +7756,12 @@ void *pooler_sync_remote_operator_thread(void *arg) #endif node_number++; } + else + { + request->taskControl->m_result[node_number] = 0; + request->taskControl->m_pidresult[node_number] = 0; + node_number++; + } } #ifdef _POOLER_CHECK_ @@ -8687,7 +8723,8 @@ static inline bool get_acquire_success_status(PGXCASyncTaskCtl *taskControl) { bool bsucceed; SpinLockAcquire(&taskControl->m_lock); - bsucceed = taskControl->m_number_done == taskControl->m_number_succeed; + bsucceed = taskControl->m_number_done == taskControl->m_number_succeed || + taskControl->m_missing_ok; SpinLockRelease(&taskControl->m_lock); return bsucceed; } @@ -10241,6 +10278,7 @@ handle_get_connections(PoolAgent * agent, StringInfo s) List *datanodelist = NIL; List *coordlist = NIL; int connect_num = 0; + bool raise_error = true; /* * Length of message is caused by: * - Message header = 4bytes @@ -10273,6 +10311,8 @@ handle_get_connections(PoolAgent * agent, StringInfo s) { elog(LOG, POOL_MGR_PREFIX"backend required %d coordinator connections, pid:%d", coordcount, agent->pid); } + + raise_error = pq_getmsgbyte(s); pq_getmsgend(s); if(!is_pool_locked) @@ -10282,7 +10322,7 @@ handle_get_connections(PoolAgent * agent, StringInfo s) * In case of error agent_acquire_connections will log * the error and return -1 */ - ret = agent_acquire_connections(agent, datanodelist, coordlist, &connect_num, &fds, &pids); + ret = agent_acquire_connections(agent, datanodelist, coordlist, raise_error, &connect_num, &fds, &pids); /* async acquire connection will be done in parallel threads */ if (0 == ret && fds && pids) { diff --git a/src/backend/replication/logical/worker.c b/src/backend/replication/logical/worker.c index 6b8994dc..ba16fbbb 100644 --- a/src/backend/replication/logical/worker.c +++ b/src/backend/replication/logical/worker.c @@ -661,7 +661,7 @@ apply_exec_on_nodes(StringInfo s, char *nspname, char *relname, ExecNodes * exec return; /* send apply message to DN and wait response */ - all_handles = get_handles(exec_nodes->nodeList, NIL, false, true); + all_handles = get_handles(exec_nodes->nodeList, NIL, false, true, true); /* send insert/update/delete to DN and wait exec finish */ apply_exec_on_dn_nodes(s, nspname, relname, all_handles); diff --git a/src/include/pgxc/locator.h b/src/include/pgxc/locator.h index 3fd1f6b9..30926928 100644 --- a/src/include/pgxc/locator.h +++ b/src/include/pgxc/locator.h @@ -117,6 +117,7 @@ typedef struct Datum rewrite_value; /* function evaluate result */ bool isnull; bool rewrite_done; /* function rewritted */ + bool missing_ok; } ExecNodes; diff 
--git a/src/include/pgxc/pgxcnode.h b/src/include/pgxc/pgxcnode.h index f79bc2b8..4e64e650 100644 --- a/src/include/pgxc/pgxcnode.h +++ b/src/include/pgxc/pgxcnode.h @@ -171,7 +171,8 @@ extern Oid PGXCGetMainNodeOid(Oid nodeoid); extern int PGXCNodeGetNodeIdFromName(char *node_name, char *node_type); extern Oid PGXCNodeGetNodeOid(int nodeid, char node_type); -extern PGXCNodeAllHandles *get_handles(List *datanodelist, List *coordlist, bool is_query_coord_only, bool is_global_session); +extern PGXCNodeAllHandles *get_handles(List *datanodelist, List *coordlist, + bool is_query_coord_only, bool is_global_session, bool raise_error); extern PGXCNodeAllHandles *get_current_handles(void); #ifdef __TBASE__ diff --git a/src/include/pgxc/poolmgr.h b/src/include/pgxc/poolmgr.h index c0d996d8..1f2f57b3 100644 --- a/src/include/pgxc/poolmgr.h +++ b/src/include/pgxc/poolmgr.h @@ -174,6 +174,7 @@ typedef struct PGXCASyncTaskCtl CommandId m_max_command_id; /* errmsg and error status. */ + bool m_missing_ok; int32 m_error_offset; char m_error_msg[PGXC_POOL_ERROR_MSG_LEN]; }PGXCASyncTaskCtl; @@ -328,7 +329,7 @@ extern int PoolManagerSetCommand(PGXCNodeHandle **connections, int32 count, Pool const char *set_command); /* Get pooled connections */ -extern int *PoolManagerGetConnections(List *datanodelist, List *coordlist, int **pids); +extern int *PoolManagerGetConnections(List *datanodelist, List *coordlist, bool raise_error, int **pids); /* Clean pool connections */ extern void PoolManagerCleanConnection(List *datanodelist, List *coordlist, char *dbname, char *username); From 458585a25f229338da2df3685b02c2cac0a571a1 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Thu, 8 Apr 2021 15:54:18 +0800 Subject: [PATCH 161/578] for pooler log http://tapd.oa.com/pgxz/prong/stories/view/1010092131863477681 (merge request !268) --- src/backend/pgxc/pool/poolmgr.c | 187 ++++++++++++++++++++++++++++++++ src/backend/utils/misc/guc.c | 9 ++ src/include/pgxc/poolmgr.h | 1 + 3 files changed, 197 insertions(+) diff --git a/src/backend/pgxc/pool/poolmgr.c b/src/backend/pgxc/pool/poolmgr.c index 7a688879..c4bcc94e 100644 --- a/src/backend/pgxc/pool/poolmgr.c +++ b/src/backend/pgxc/pool/poolmgr.c @@ -100,6 +100,7 @@ char *g_unpooled_user = "mls_admin"; bool PoolConnectDebugPrint = false; /* Pooler connect debug print */ bool PoolerStuckExit = true; /* Pooler exit when stucked */ +bool PoolSubThreadLogPrint = true; /* Pooler sub thread log print */ #define POOL_ASYN_WARM_PIPE_LEN 32 /* length of asyn warm pipe */ #define POOL_ASYN_WARN_NUM 1 /* how many connections to warm once maintaince per node pool */ @@ -397,6 +398,20 @@ typedef struct pg_time_t cmd_end_time; /* command end time */ }PGXCPoolAsyncReq; +static void pooler_subthread_write_log(int elevel, int lineno, const char *filename, const char *funcname, const char *fmt, ...)__attribute__((format(printf, 5, 6))); + +/* Use this macro when a sub thread needs to print logs */ +#define pooler_thread_logger(elevel, ...) 
\ + do { \ + pooler_subthread_write_log(elevel, __LINE__, __FILE__, PG_FUNCNAME_MACRO, __VA_ARGS__); \ + } while(0) + +#define FORMATTED_TS_LEN (128) /* format timestamp buf length */ +#define POOLER_WRITE_LOG_ONCE_LIMIT (5) /* number of logs written at a time */ +#define MAX_THREAD_LOG_PIPE_LEN (2 * 1024) /* length of thread log pipe */ +#define DEFAULT_LOG_BUF_LEN (1024) /* length of thread log length */ +PGPipe *g_ThreadLogQueue = NULL; + static inline void RebuildAgentIndex(void); static inline PGXCASyncTaskCtl* create_task_control(List *datanodelist, List *coordlist, int32 *fd_result, int32 *pid_result); @@ -5134,6 +5149,170 @@ destroy_node_pool_free_slots(PGXCNodePool *node_pool) } } +/* + * setup current log time + */ +static void +setup_formatted_current_log_time(char* formatted_current_log_time) +{ + pg_time_t stamp_time; + char msbuf[13]; + struct timeval timeval; + + gettimeofday(&timeval, NULL); + stamp_time = (pg_time_t) timeval.tv_sec; + + /* + * Note: we expect that guc.c will ensure that log_timezone is set up (at + * least with a minimal GMT value) before Log_line_prefix can become + * nonempty or CSV mode can be selected. + */ + pg_strftime(formatted_current_log_time, FORMATTED_TS_LEN, + /* leave room for milliseconds... */ + "%Y-%m-%d %H:%M:%S %Z", + pg_localtime(&stamp_time, log_timezone)); + + /* 'paste' milliseconds into place... */ + sprintf(msbuf, ".%03d", (int) (timeval.tv_usec / 1000)); + memcpy(formatted_current_log_time + 19, msbuf, 4); +} + +/* + * write pooler's subthread log into thread log queue + * only call by pooler's subthread in elog + */ +static void +pooler_subthread_write_log(int elevel, int lineno, const char *filename, const char *funcname, const char *fmt, ...) +{ + char *buf = NULL; + int buf_len = 0; + int offset = 0; + char formatted_current_log_time[FORMATTED_TS_LEN]; + + if (!PoolSubThreadLogPrint) + { + /* not enable sun thread log print, return */ + return; + } + + if (PipeIsFull(g_ThreadLogQueue)) + { + return; + } + + /* use malloc in sub thread */ + buf_len = strlen(filename) + strlen(funcname) + DEFAULT_LOG_BUF_LEN; + buf = (char*)malloc(buf_len); + if (buf == NULL) + { + /* no log */ + return; + } + + /* construction log, format: elevel | lineno | filename | funcname | log content */ + *(int*)(buf + offset) = elevel; + offset += sizeof(elevel); + *(int*)(buf + offset) = lineno; + offset += sizeof(lineno); + memcpy(buf + offset, filename, strlen(filename) + 1); + offset += (strlen(filename) + 1); + memcpy(buf + offset, funcname, strlen(funcname) + 1); + offset += (strlen(funcname) + 1); + + /* + * because the main thread writes the log of the sub thread asynchronously, + * record the actual log writing time here + */ + setup_formatted_current_log_time(formatted_current_log_time); + memcpy(buf + offset, formatted_current_log_time, strlen(formatted_current_log_time)); + offset += strlen(formatted_current_log_time); + *(char*)(buf + offset) = ' '; + offset += sizeof(char); + + /* Generate actual output --- have to use appendStringInfoVA */ + for (;;) + { + va_list args; + int avail; + int nprinted; + + avail = buf_len - offset - 1; + va_start(args, fmt); + nprinted = vsnprintf(buf + offset, avail, fmt, args); + va_end(args); + if (nprinted >= 0 && nprinted < avail - 1) + { + offset += nprinted; + *(char*)(buf + offset) = '\0'; + offset += sizeof(char); + break; + } + + buf_len = (buf_len * 2 > (int) MaxAllocSize) ? 
MaxAllocSize : buf_len * 2; + buf = (char *) realloc(buf, buf_len); + if (buf == NULL) + { + /* no log */ + return; + } + } + + /* put log into thread log queue, drop log if queue is full */ + if (-1 == PipePut(g_ThreadLogQueue, buf)) + { + free(buf); + } +} + +/* + * write subthread log in main thread + */ +static void +pooler_handle_subthread_log(bool is_pooler_exit) +{ + int write_log_cnt = 0; + int offset = 0; + int elevel = LOG; + int lineno = 0; + char *log_buf = NULL; + char *filename = NULL; + char *funcname = NULL; + char *log_content = NULL; + + while ((log_buf = (char*)PipeGet(g_ThreadLogQueue)) != NULL) + { + /* elevel | lineno | filename | funcname | log content */ + elevel = *(int*)log_buf; + offset = sizeof(elevel); + lineno = *(int*)(log_buf + offset); + offset += sizeof(lineno); + filename = log_buf + offset; + offset += (strlen(filename) + 1); + funcname = log_buf + offset; + offset += (strlen(funcname) + 1); + log_content = log_buf + offset; + + /* write log here */ + elog_start(filename, lineno, +#ifdef USE_MODULE_MSGIDS + PGXL_MSG_MODULE, PGXL_MSG_FILEID, __COUNTER__, +#endif + funcname); + elog_finish(elevel, "%s", log_content); + + free(log_buf); + + /* + * if the number of logs written at one time exceeds POOLER_WRITE_LOG_ONCE_LIMIT, + * in order not to block the main thread, return here + */ + if (write_log_cnt++ >= POOLER_WRITE_LOG_ONCE_LIMIT && !is_pooler_exit) + { + return; + } + } +} + /* * Main handling loop */ @@ -5200,6 +5379,9 @@ PoolerLoop(void) } #endif + /* create log queue */ + g_ThreadLogQueue = CreatePipe(MAX_THREAD_LOG_PIPE_LEN); + /* create utility thread */ g_AsynUtilityPipeSender = CreatePipe(POOL_ASYN_WARM_PIPE_LEN); ThreadSemaInit(&g_AsnyUtilitysem, 0); @@ -5282,6 +5464,7 @@ PoolerLoop(void) */ if (!PostmasterIsAlive()) { + pooler_handle_subthread_log(true); exit(1); } @@ -5309,6 +5492,7 @@ PoolerLoop(void) * Just close the socket and exit. Linux will help to release the resouces. 
*/ close(server_fd); + pooler_handle_subthread_log(true); exit(0); } @@ -5420,6 +5604,9 @@ PoolerLoop(void) check_duplicate_allocated_conn(); #endif print_pooler_statistics(); + + /* handle sub thread's log */ + pooler_handle_subthread_log(false); } } diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 35c4981d..2260027e 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -2423,6 +2423,15 @@ static struct config_bool ConfigureNamesBool[] = NULL, NULL, NULL }, { + {"enable_pooler_thread_log_print", PGC_USERSET, CUSTOM_OPTIONS, + gettext_noop("enable pooler manager sub thread log print"), + NULL + }, + &PoolSubThreadLogPrint, + true, + NULL, NULL, NULL + }, + { {"enable_plpgsql_debug_print", PGC_SUSET, CUSTOM_OPTIONS, gettext_noop("enable plpgsql debug infomation print"), NULL diff --git a/src/include/pgxc/poolmgr.h b/src/include/pgxc/poolmgr.h index 1f2f57b3..9fff0445 100644 --- a/src/include/pgxc/poolmgr.h +++ b/src/include/pgxc/poolmgr.h @@ -268,6 +268,7 @@ extern int PoolDNSetTimeout; extern int PoolCheckSlotTimeout; extern int PoolPrintStatTimeout; extern bool PoolConnectDebugPrint; +extern bool PoolSubThreadLogPrint; /* Status inquiry functions */ extern void PGXCPoolerProcessIam(void); extern bool IsPGXCPoolerProcess(void); From f955f8a51f2cc200bdcf236cd18ff7cd5468960b Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Fri, 4 Jun 2021 21:20:30 +0800 Subject: [PATCH 162/578] fix sysviews expected info --- src/test/regress/expected/sysviews.out | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out index 7a478711..7f805e93 100644 --- a/src/test/regress/expected/sysviews.out +++ b/src/test/regress/expected/sysviews.out @@ -117,6 +117,7 @@ select name, setting from pg_settings where name like 'enable%'; enable_plpgsql_debug_print | off enable_pooler_debug_print | on enable_pooler_stuck_exit | off + enable_pooler_thread_log_print | on enable_pullup_subquery | on enable_replication_slot_debug | off enable_sampling_analyze | on @@ -128,7 +129,7 @@ select name, setting from pg_settings where name like 'enable%'; enable_tidscan | on enable_transparent_crypt | on enable_user_authority_force_check | off -(56 rows) +(57 rows) -- Test that the pg_timezone_names and pg_timezone_abbrevs views are -- more-or-less working. 
We can't test their contents in any great detail From 614d2598ab608f4ab2602cb46f0da010ba6adcfd Mon Sep 17 00:00:00 2001 From: andrelin Date: Fri, 4 Jun 2021 21:25:27 +0800 Subject: [PATCH 163/578] Receive 1 more byte for raise error flag in pooler --- src/backend/pgxc/pool/poolmgr.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/backend/pgxc/pool/poolmgr.c b/src/backend/pgxc/pool/poolmgr.c index c4bcc94e..6efe045c 100644 --- a/src/backend/pgxc/pool/poolmgr.c +++ b/src/backend/pgxc/pool/poolmgr.c @@ -10473,10 +10473,11 @@ handle_get_connections(PoolAgent * agent, StringInfo s) * - List of Coordinators = NumPoolCoords * 4bytes (max) * - Number of Datanodes sent = 4bytes * - Number of Coordinators sent = 4bytes + * - Raise error flag = 1byte * It is better to send in a same message the list of Co and Dn at the same * time, this permits to reduce interactions between postmaster and pooler */ - pool_getmessage(&agent->port, s, 4 * agent->num_dn_connections + 4 * agent->num_coord_connections + 12); + pool_getmessage(&agent->port, s, 4 * agent->num_dn_connections + 4 * agent->num_coord_connections + 13); datanodecount = pq_getmsgint(s, 4); for (i = 0; i < datanodecount; i++) { From 06918bf23dab631a9c621b3f75090099538e8d8f Mon Sep 17 00:00:00 2001 From: jackywpxie Date: Mon, 28 Dec 2020 11:37:25 +0800 Subject: [PATCH 164/578] jacky/feature/PersistentDatanodeConnection_Tbase_v2.15 (merge request !54) Squash merge branch 'jacky/feature/PersistentDatanodeConnection_Tbase_v2.15' into 'Tbase_v2.15' * fixed merged bugs. * jacky/feature/PersistentDatanodeConnection_Tbase_v2.15.16 (merge request !15) --- src/backend/pgxc/pool/execRemote.c | 162 ++++++++---------- src/backend/pgxc/pool/pgxcnode.c | 128 ++++++++++++-- src/backend/pgxc/shard/shardmap.c | 19 +- src/include/pgxc/pgxcnode.h | 2 +- src/test/regress/expected/mls_check.out | 14 +- src/test/regress/expected/namespace.out | 42 +++++ .../regress/expected/xl_limitations_1.out | 16 +- src/test/regress/sql/mls_check.sql | 4 +- src/test/regress/sql/namespace.sql | 28 +++ src/test/regress/sql/xl_limitations.sql | 12 +- 10 files changed, 294 insertions(+), 133 deletions(-) diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 38f5bb3a..a06692a6 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -3454,11 +3454,11 @@ pgxc_node_begin(int conn_count, PGXCNodeHandle **connections, /* Send GXID and check for errors */ if (pgxc_node_send_gxid(connections[i], gxid)) { - elog(WARNING, "pgxc_node_begin gxid is invalid."); + elog(WARNING, "pgxc_node_begin gxid %u is invalid.", gxid); return EOF; } - /* Send timestamp and check for errors */ + /* Send timestamp and check for errors */ if (GlobalTimestampIsValid(timestamp) && pgxc_node_send_timestamp(connections[i], timestamp)) { @@ -3565,21 +3565,6 @@ pgxc_node_begin(int conn_count, PGXCNodeHandle **connections, connections[i]->nodename, connections[i]->backend_pid); new_connections[new_count++] = connections[i]; } - -#if 0 - /* Send BEGIN if not already in transaction */ - if (need_tran_block && connections[i]->transaction_status == 'I') - { - /* Send the BEGIN TRANSACTION command and check for errors */ - if (pgxc_node_send_query(connections[i], cmd)) - { - return EOF; - } - - elog(LOG, "pgxc_node_begin send BEGIN to node %s, pid:%d", connections[i]->nodename, connections[i]->backend_pid); - new_connections[new_count++] = connections[i]; - } -#endif } /* @@ -3627,6 +3612,8 @@ pgxc_node_begin(int conn_count, 
PGXCNodeHandle **connections, for (i = 0; i < new_count; i++) { pgxc_node_set_query(new_connections[i], init_str); + elog(DEBUG5, "pgxc_node_begin send %s to node %s, pid:%d", init_str, + new_connections[i]->nodename, new_connections[i]->backend_pid); } } @@ -3869,6 +3856,7 @@ pgxc_node_remote_prepare(char *prepareGID, bool localNode, bool implicit) elog(LOG, "prepare remote transaction xid %d gid %s", GetTopTransactionIdIfAny(), prepareGID); } global_prepare_ts = GetGlobalTimestampGTM(); + #ifdef __TWO_PHASE_TESTS__ if (PART_PREPARE_GET_TIMESTAMP == twophase_exception_case) { @@ -4397,15 +4385,6 @@ pgxc_node_remote_prepare(char *prepareGID, bool localNode, bool implicit) clear_handles(); pfree_pgxc_all_handles(handles); - -#if 0 - if (!temp_object_included && !PersistentConnections) - { - /* Clean up remote sessions */ - pgxc_node_remote_cleanup_all(); - release_handles(); - } -#endif } pfree(prepare_cmd); @@ -4505,6 +4484,7 @@ pgxc_node_remote_prepare(char *prepareGID, bool localNode, bool implicit) } g_twophase_state.datanode_state[conn_state_index].state = TWO_PHASE_ABORTTING; #endif + /* Send down abort prepared command */ #ifdef __USE_GLOBAL_SNAPSHOT__ if (pgxc_node_send_gxid(conn, auxXid)) @@ -4512,7 +4492,8 @@ pgxc_node_remote_prepare(char *prepareGID, bool localNode, bool implicit) #ifdef __TWO_PHASE_TRANS__ g_twophase_state.datanode_state[conn_state_index].conn_state = TWO_PHASE_SEND_GXID_ERROR; - g_twophase_state.datanode_state[conn_state_index].state = TWO_PHASE_ABORT_ERROR; + g_twophase_state.datanode_state[conn_state_index].state = + TWO_PHASE_ABORT_ERROR; #endif /* * Prepared transaction is left on the node, but we can not @@ -4520,10 +4501,11 @@ pgxc_node_remote_prepare(char *prepareGID, bool localNode, bool implicit) */ ereport(WARNING, (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("failed to send xid to " - "the node %u", conn->nodeoid))); + errmsg("failed to send xid %u to " + "the node %u", auxXid, conn->nodeoid))); } #endif + if (pgxc_node_send_query(conn, abort_cmd)) { #ifdef __TWO_PHASE_TRANS__ @@ -4598,7 +4580,8 @@ pgxc_node_remote_prepare(char *prepareGID, bool localNode, bool implicit) #ifdef __TWO_PHASE_TRANS__ g_twophase_state.coord_state[conn_state_index].conn_state = TWO_PHASE_SEND_GXID_ERROR; - g_twophase_state.coord_state[conn_state_index].state = TWO_PHASE_ABORT_ERROR; + g_twophase_state.coord_state[conn_state_index].state = + TWO_PHASE_ABORT_ERROR; #endif /* * Prepared transaction is left on the node, but we can not @@ -4606,10 +4589,11 @@ pgxc_node_remote_prepare(char *prepareGID, bool localNode, bool implicit) */ ereport(WARNING, (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("failed to send xid to " - "the node %u", conn->nodeoid))); + errmsg("failed to send xid %u to " + "the node %u", auxXid, conn->nodeoid))); } #endif + if (pgxc_node_send_query(conn, abort_cmd)) { #ifdef __TWO_PHASE_TRANS__ @@ -4662,12 +4646,20 @@ pgxc_node_remote_prepare(char *prepareGID, bool localNode, bool implicit) else elog(ERROR, "failed to PREPARE transaction on one or more nodes"); - if (!temp_object_included && !PersistentConnections) + if (!temp_object_included) { /* Clean up remote sessions */ pgxc_node_remote_cleanup_all(); + + if (PersistentConnections) + { + reset_handles(); + } + else + { release_handles(false); } + } clear_handles(); @@ -4700,7 +4692,7 @@ pgxc_node_remote_prepare(char *prepareGID, bool localNode, bool implicit) * Release remote connection after completion. * * For DDL, DN will commit before CN does. 
- * Because DDLs normally have conflict locks, when CN gets committed, + * Because DDL normally has conflict locks, when CN gets committed, * DNs will be in a consistent state for blocked user transactions. */ static void @@ -4722,22 +4714,21 @@ pgxc_node_remote_commit(TranscationType txn_type, bool need_release_handle) stat_transaction(conn_count); - if (need_release_handle) - { - if (!temp_object_included && !PersistentConnections) + if (!temp_object_included) { /* Clean up remote sessions */ pgxc_node_remote_cleanup_all(); - release_handles(false); - } + + if (need_release_handle) + { + if (PersistentConnections) + { + reset_handles(); } else { - /* in subtxn, we just cleanup the connections. not release the handles. */ - if (!temp_object_included && !PersistentConnections) - { - /* Clean up remote sessions without release handles. */ - pgxc_node_remote_cleanup_all(); + release_handles(false); + } } } @@ -4783,7 +4774,6 @@ pgxc_node_remote_commit(TranscationType txn_type, bool need_release_handle) if(IS_PGXC_COORDINATOR) { - global_committs = GetGlobalTimestampGTM(); if(!GlobalTimestampIsValid(global_committs)){ ereport(ERROR, @@ -4955,23 +4945,21 @@ pgxc_node_remote_commit(TranscationType txn_type, bool need_release_handle) #ifndef __TBASE__ stat_transaction(conn_count); + if (!temp_object_included) + { + /* Clean up remote sessions */ + pgxc_node_remote_cleanup_all(); if (need_release_handle) { - if (!temp_object_included && !PersistentConnections) + if (PersistentConnections) { - /* Clean up remote sessions */ - pgxc_node_remote_cleanup_all(); - release_handles(false); - } + reset_handles(); } else { - /* in subtxn, we just cleanup the connections. not release the handles. */ - if (!temp_object_included && !PersistentConnections) - { - /* Clean up remote sessions without release handles. */ - pgxc_node_remote_cleanup_all(); + release_handles(false); + } } } @@ -4992,7 +4980,7 @@ pgxc_node_remote_commit(TranscationType txn_type, bool need_release_handle) } /* - * Set the node begein transaction in plpgsql function + * Set the node begin transaction in plpgsql function */ static void SetPlpgsqlTransactionBegin(PGXCNodeHandle *conn) @@ -5802,22 +5790,21 @@ pgxc_node_remote_abort(TranscationType txn_type, bool need_release_handle) } #endif - if (need_release_handle) - { + /* + * Drop the connections to ensure aborts are handled properly. + * + * XXX We should really be consulting PersistentConnections parameter and + * keep the connections if its set. But as a short term measure, to address + * certain issues for aborted transactions, we drop the connections. + * Revisit and fix the issue + */ if (!temp_object_included) { /* Clean up remote sessions */ pgxc_node_remote_cleanup_all(); - release_handles(false); - } - } - else - { - /* in subtxn, we just cleanup the connections. not release the handles. */ - if (!temp_object_included) + if (need_release_handle) { - /* Clean up remote sessions without release handles. 
*/ - pgxc_node_remote_cleanup_all(); + release_handles(false); } } @@ -6647,7 +6634,9 @@ pgxc_start_command_on_connection(PGXCNodeHandle *connection, if (snapshot && pgxc_node_send_snapshot(connection, snapshot)) return false; - if (step->statement || step->cursor || remotestate->rqs_num_params) + if ((step->statement && step->statement[0] != '\0') || + step->cursor || + remotestate->rqs_num_params) { /* need to use Extended Query Protocol */ int fetch = 0; @@ -7878,29 +7867,6 @@ PreAbort_Remote(TranscationType txn_type, bool need_release_handle) pgxc_node_remote_abort(txn_type, need_release_handle); - /* - * Drop the connections to ensure aborts are handled properly. - * - * XXX We should really be consulting PersistentConnections parameter and - * keep the connections if its set. But as a short term measure, to address - * certain issues for aborted transactions, we drop the connections. - * Revisit and fix the issue - */ - elog(DEBUG5, "temp_object_included %d", temp_object_included); - /* cleanup and release handles is already done in pgxc_node_remote_abort */ -#if 0 - if (release_handle) - { - if (!temp_object_included) - { - /* Clean up remote sessions */ - pgxc_node_remote_cleanup_all(); - release_handles(); - } - } - - clear_handles(); -#endif pfree_pgxc_all_handles(all_handles); if (log_gtm_stats) @@ -8844,12 +8810,19 @@ pgxc_node_remote_finish(char *prepareGID, bool commit, } #endif - if (!temp_object_included && !PersistentConnections) + if (!temp_object_included) { /* Clean up remote sessions */ pgxc_node_remote_cleanup_all(); + if (PersistentConnections) + { + reset_handles(); + } + else + { release_handles(false); } + } clear_handles(); pfree_pgxc_all_handles(pgxc_handles); pfree(finish_cmd); @@ -9050,8 +9023,9 @@ ExecRemoteQuery(PlanState *pstate) if (step->force_autocommit) need_tran_block = false; else - need_tran_block = step->cursor || - step->statement || node->rqs_num_params || + need_tran_block = (step->statement && step->statement[0] != '\0') || + step->cursor || + node->rqs_num_params || (!step->read_only && total_conn_count > 1) || (TransactionBlockStatusCode() == 'T'); diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c index d37a0b48..eb400fb4 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -316,6 +316,10 @@ InitMultinodeExecutor(bool is_force) { node_handle_ent->nodeoid = dn_handles[count].nodeoid; node_handle_ent->nodeidx = count; + + elog(DEBUG5, + "node_handles_hash enter primary datanode nodeoid: %d", + node_handle_ent->nodeoid); } #endif @@ -342,6 +346,10 @@ InitMultinodeExecutor(bool is_force) { node_handle_ent->nodeoid = sdn_handles[count].nodeoid; node_handle_ent->nodeidx = count; + + elog(DEBUG5, + "node_handles_hash enter slave datanode nodeoid: %d", + node_handle_ent->nodeoid); } #endif @@ -367,6 +375,10 @@ InitMultinodeExecutor(bool is_force) { node_handle_ent->nodeoid = co_handles[count].nodeoid; node_handle_ent->nodeidx = count; + + elog(DEBUG5, + "node_handles_hash enter coordinator nodeoid: %d", + node_handle_ent->nodeoid); } #endif } @@ -655,7 +667,7 @@ pgxc_node_init(PGXCNodeHandle *handle, int sock, bool global_session, int pid) #ifdef DN_CONNECTION_DEBUG handle->have_row_desc = false; #endif - memset(handle->error, 0X00, MAX_ERROR_MSG_LENGTH); + handle->error[0] = '\0'; handle->outEnd = 0; handle->inStart = 0; handle->inEnd = 0; @@ -1405,7 +1417,7 @@ release_handles(bool force) { /* * Connections at this point should be completely inactive, - * otherwise abaandon them. 
We can not allow not cleaned up + * otherwise abandon them. We can not allow not cleaned up * connection is returned to pool. */ if (handle->state != DN_CONNECTION_STATE_IDLE || @@ -1442,7 +1454,7 @@ release_handles(bool force) { /* * Connections at this point should be completely inactive, - * otherwise abaandon them. We can not allow not cleaned up + * otherwise abandon them. We can not allow not cleaned up * connection is returned to pool. */ if (handle->state != DN_CONNECTION_STATE_IDLE || @@ -1521,6 +1533,67 @@ release_handles(bool force) slavedatanode_count = 0; } +/* + * Reset all Datanode and Coordinator connections occupied memory. + */ +void +reset_handles(void) +{ + int i; + + /* don't reset connection if holding a cluster lock */ + if (cluster_ex_lock_held) + { + return; + } + + if (datanode_count == 0 && coord_count == 0 && slavedatanode_count == 0) + { + return; + } + + /* Do not reset connections if we have prepared statements on nodes */ + if (HaveActiveDatanodeStatements()) + { + return; + } + + /* Reset Datanodes handles occupied memory */ + for (i = 0; i < NumDataNodes; i++) + { + PGXCNodeHandle *handle = &dn_handles[i]; + + if (handle->sock != NO_SOCKET) + { + pgxc_node_init(handle, handle->sock, true, handle->backend_pid); + } + } + + for (i = 0; i < NumSlaveDataNodes; i++) + { + PGXCNodeHandle *handle = &sdn_handles[i]; + + if (handle->sock != NO_SOCKET) + { + pgxc_node_init(handle, handle->sock, true, handle->backend_pid); + } + } + + if (IS_PGXC_COORDINATOR) + { + /* Collect Coordinator handles */ + for (i = 0; i < NumCoords; i++) + { + PGXCNodeHandle *handle = &co_handles[i]; + + if (handle->sock != NO_SOCKET) + { + pgxc_node_init(handle, handle->sock, true, handle->backend_pid); + } + } + } +} + /* * Check whether there bad connections to remote nodes when abort transactions. 
*/ @@ -3052,8 +3125,6 @@ int pgxc_node_send_snapshot(PGXCNodeHandle *handle, Snapshot snapshot) {// #lizard forgives int msglen PG_USED_FOR_ASSERTS_ONLY; - int nval PG_USED_FOR_ASSERTS_ONLY; - int i PG_USED_FOR_ASSERTS_ONLY; /* Invalid connection state, return error */ if (handle->state != DN_CONNECTION_STATE_IDLE) @@ -3272,7 +3343,10 @@ pgxc_node_send_timestamp(PGXCNodeHandle *handle, TimestampTz timestamp) /* Invalid connection state, return error */ if (handle->state != DN_CONNECTION_STATE_IDLE) { - elog(WARNING, "pgxc_node_send_timestamp datanode:%u invalid stauts:%d, no need to send data, return NOW", handle->nodeoid, handle->state); + elog(WARNING, + "pgxc_node_send_timestamp datanode:%u invalid stauts:%d, " + "no need to send data, return NOW", + handle->nodeoid, handle->state); return EOF; } @@ -3311,7 +3385,7 @@ pgxc_node_send_timestamp(PGXCNodeHandle *handle, TimestampTz timestamp) /* * Send the Coordinator info down to the PGXC node at the beginning of transaction, * In this way, Datanode can print this Coordinator info into logfile, - * and those infos can be found in Datanode logifile if needed during debugging + * and those infos can be found in Datanode logfile if needed during debugging */ int pgxc_node_send_coord_info(PGXCNodeHandle * handle, int coord_pid, TransactionId coord_vxid) @@ -4249,13 +4323,23 @@ pfree_pgxc_all_handles(PGXCNodeAllHandles *pgxc_handles) #endif if (pgxc_handles->primary_handle) + { pfree(pgxc_handles->primary_handle); + pgxc_handles->primary_handle = NULL; + } if (pgxc_handles->datanode_handles) + { pfree(pgxc_handles->datanode_handles); + pgxc_handles->datanode_handles = NULL; + } if (pgxc_handles->coord_handles) + { pfree(pgxc_handles->coord_handles); + pgxc_handles->coord_handles = NULL; + } pfree(pgxc_handles); + pgxc_handles = NULL; } /* Do translation for non-main cluster */ @@ -4314,13 +4398,15 @@ PGXCNodeGetNodeId(Oid nodeoid, char *node_type) if (NULL == node_handles_hash) { + elog(DEBUG5, "node_handles_hash is null."); goto NOT_FOUND; } - nodeoid = PGXCGetLocalNodeOid(nodeoid); + nodeoid = PGXCGetLocalNodeOid(nodeoid); entry = (PGXCNodeHandlesLookupEnt *) hash_search(node_handles_hash, &nodeoid, HASH_FIND, &found); if (false == found) { + elog(DEBUG5, "node_handles_hash does not has %d", nodeoid); goto NOT_FOUND; } @@ -4721,11 +4807,23 @@ get_set_command(List *param_list, StringInfo command, bool local) { search_path_value[index++] = '"'; } + + if ((char *) strstr(search_path_value, "public") || + (char *) strstr(search_path_value, "PUBLIC")) + { appendStringInfo(command, "SET %s %s TO %s;", local ? "LOCAL" : "", NameStr(entry->name), search_path_value); } else { + appendStringInfo(command, "SET %s %s TO %s, public;", local ? "LOCAL" : "", + NameStr(entry->name), search_path_value); + } + + elog(DEBUG5, "get_set_command: %s", command->data); + } + else + { appendStringInfo(command, "SET %s %s TO %s;", local ? "LOCAL" : "", NameStr(entry->name), value); } @@ -4735,7 +4833,7 @@ get_set_command(List *param_list, StringInfo command, bool local) /* * Returns SET commands needed to initialize remote session. - * The command may already be biult and valid, return it right away if the case. + * The command may already be built and valid, return it right away if the case. * Otherwise build it up. * To support Distributed Session machinery coordinator should generate and * send a distributed session identifier to remote nodes. Generate it here. 
@@ -4777,7 +4875,7 @@ PGXCNodeGetSessionParamStr(void) /* * Returns SET commands needed to initialize transaction on a remote session. - * The command may already be biult and valid, return it right away if the case. + * The command may already be built and valid, return it right away if the case. * Otherwise build it up. */ char * @@ -5214,8 +5312,16 @@ PgxcNodeDiffBackendHandles(List **nodes_alter, Oid nodeoid; char ntype = PGXC_NODE_NONE; - if(enable_multi_cluster && strcmp(NameStr(nodeForm->node_cluster_name), PGXCClusterName)) + if (enable_multi_cluster && + strcmp(NameStr(nodeForm->node_cluster_name), PGXCClusterName)) + { + continue; + } + + if (PGXC_NODE_GTM == nodeForm->node_type) + { continue; + } nodeoid = HeapTupleGetOid(tuple); catoids = lappend_oid(catoids, nodeoid); diff --git a/src/backend/pgxc/shard/shardmap.c b/src/backend/pgxc/shard/shardmap.c index 38b5044a..6583be1c 100644 --- a/src/backend/pgxc/shard/shardmap.c +++ b/src/backend/pgxc/shard/shardmap.c @@ -889,7 +889,9 @@ static void InsertShardMap_CN(int32 map, Form_pgxc_shard_map record) nodeindex = PGXCNodeGetNodeId(record->primarycopy, &node_type); if (nodeindex < 0) { - elog(ERROR, " get node:%u for index failed", record->primarycopy); + elog(ERROR, + "InsertShardMap_CN get node:%u for index failed", + record->primarycopy); } g_GroupShardingMgr->members[map]->shmemshardmap[record->shardgroupid].primarycopy = record->primarycopy; @@ -898,7 +900,11 @@ static void InsertShardMap_CN(int32 map, Form_pgxc_shard_map record) } else { - elog(ERROR, " invalid pgxc_shard_map record with shardgroupid:%d", record->shardgroupid); + elog(ERROR, + "invalid pgxc_shard_map record with shardgroupid: %d, map %d " + "and shmemNum: %d", + record->shardgroupid, map, + g_GroupShardingMgr->members[map]->shmemNumShardGroups); } } } @@ -918,7 +924,9 @@ static void InsertShardMap_DN(Form_pgxc_shard_map record) nodeindex = PGXCNodeGetNodeId(record->primarycopy, &node_type); if (nodeindex < 0) { - elog(ERROR, " get node:%u for index failed", record->primarycopy); + elog(ERROR, + "InsertShardMap_DN get node:%u for index failed", + record->primarycopy); } g_GroupShardingMgr_DN->members->shmemshardmap[record->shardgroupid].primarycopy = record->primarycopy; @@ -927,7 +935,10 @@ static void InsertShardMap_DN(Form_pgxc_shard_map record) } else { - elog(ERROR, "[InsertShardMap_DN]invalid pgxc_shard_map record with shardgroupid:%d", record->shardgroupid); + elog(ERROR, + "InsertShardMap_DN has invalid pgxc_shard_map record with shardgroupid: " + "%d and shmemNum: %d", + record->shardgroupid, g_GroupShardingMgr_DN->members->shmemNumShardGroups); } } diff --git a/src/include/pgxc/pgxcnode.h b/src/include/pgxc/pgxcnode.h index 4e64e650..f15515c3 100644 --- a/src/include/pgxc/pgxcnode.h +++ b/src/include/pgxc/pgxcnode.h @@ -183,7 +183,7 @@ extern PGXCNodeAllHandles * get_sock_fatal_handles(void); extern void pfree_pgxc_all_handles(PGXCNodeAllHandles *handles); extern void release_handles(bool force); - +extern void reset_handles(void); extern void clear_handles(void); extern int get_transaction_nodes(PGXCNodeHandle ** connections, diff --git a/src/test/regress/expected/mls_check.out b/src/test/regress/expected/mls_check.out index 496d0b8c..fd8a30b5 100644 --- a/src/test/regress/expected/mls_check.out +++ b/src/test/regress/expected/mls_check.out @@ -5573,10 +5573,10 @@ drop table lala; drop table lala2; drop table lala3; \c - mls_admin -select * from pg_cls_table; - polid | attnum | relid | enable | nspname | tblname | reloptions 
--------+--------+-------+--------+---------+---------+------------ - 99 | 3 | 17061 | t | public | xixi | +select polid, attnum, enable, nspname, tblname, reloptions from pg_cls_table; + polid | attnum | enable | nspname | tblname | reloptions +-------+--------+--------+---------+---------+------------ + 99 | 3 | t | public | xixi | (1 row) select MLS_CLS_DROP_TABLE_LABEL('cls_compare', 'public', 'xixi'); @@ -5585,9 +5585,9 @@ select MLS_CLS_DROP_TABLE_LABEL('cls_compare', 'public', 'xixi'); t (1 row) -select * from pg_cls_table; - polid | attnum | relid | enable | nspname | tblname | reloptions --------+--------+-------+--------+---------+---------+------------ +select polid, attnum, enable, nspname, tblname, reloptions from pg_cls_table; + polid | attnum | enable | nspname | tblname | reloptions +-------+--------+--------+---------+---------+------------ (0 rows) --everything is done diff --git a/src/test/regress/expected/namespace.out b/src/test/regress/expected/namespace.out index b081c977..1d2ecfee 100644 --- a/src/test/regress/expected/namespace.out +++ b/src/test/regress/expected/namespace.out @@ -69,3 +69,45 @@ SELECT COUNT(*) FROM pg_class WHERE relnamespace = 0 (1 row) +CREATE SCHEMA test_schema_2 + CREATE TABLE ab ( + a serial, + b int UNIQUE + ); +CREATE SCHEMA test_schema_3; +CREATE SCHEMA test_schema_4 + CREATE TABLE ab ( + a serial, + b int UNIQUE + ); +INSERT INTO test_schema_2.ab(b) VALUES(1); +INSERT INTO test_schema_2.ab(b) VALUES(2); +SELECT * FROM test_schema_2.ab ORDER BY a, b; + a | b +---+--- + 1 | 1 + 2 | 2 +(2 rows) + +INSERT INTO test_schema_3.ab(b) VALUES(3); +ERROR: relation "test_schema_3.ab" does not exist +LINE 1: INSERT INTO test_schema_3.ab(b) VALUES(3); + ^ +SELECT * FROM test_schema_3.ab ORDER BY a, b; +ERROR: relation "test_schema_3.ab" does not exist +LINE 1: SELECT * FROM test_schema_3.ab ORDER BY a, b; + ^ +INSERT INTO test_schema_4.ab(b) VALUES(4); +INSERT INTO test_schema_4.ab(b) VALUES(5); +SELECT * FROM test_schema_4.ab ORDER BY a, b; + a | b +---+--- + 1 | 4 + 2 | 5 +(2 rows) + +DROP SCHEMA test_schema_2 CASCADE; +NOTICE: drop cascades to table test_schema_2.ab +DROP SCHEMA test_schema_3 CASCADE; +DROP SCHEMA test_schema_4 CASCADE; +NOTICE: drop cascades to table test_schema_4.ab diff --git a/src/test/regress/expected/xl_limitations_1.out b/src/test/regress/expected/xl_limitations_1.out index c44f0d64..161cd7b4 100644 --- a/src/test/regress/expected/xl_limitations_1.out +++ b/src/test/regress/expected/xl_limitations_1.out @@ -514,7 +514,7 @@ INSERT INTO xl_names("name", "name1")VALUES ('W', 'W1'); INSERT INTO xl_names("name", "name1")VALUES ('X', 'X1'); INSERT INTO xl_names("name", "name1")VALUES ('Y', 'Y1'); INSERT INTO xl_names("name", "name1")VALUES ('Z', 'Z1'); -select xl_nodename_from_id(xc_node_id), * from xl_t order by 1; +select xl_nodename_from_id(xc_node_id), * from xl_t order by 1, 2, 3; xl_nodename_from_id | no | name ---------------------+----+------ datanode_1 | 1 | A @@ -523,7 +523,7 @@ select xl_nodename_from_id(xc_node_id), * from xl_t order by 1; datanode_2 | 4 | D (4 rows) -select xl_nodename_from_id(xc_node_id), * from xl_t1 order by 1; +select xl_nodename_from_id(xc_node_id), * from xl_t1 order by 1, 2, 3; xl_nodename_from_id | no1 | name1 ---------------------+-----+------- datanode_1 | 1 | Z @@ -552,7 +552,7 @@ where xl_t.no = T1.no1; update xl_t1 set name1 = T1.name1 from (select name,name1 from xl_names) T1 where xl_t1.name1 = T1.name; -select xl_nodename_from_id(xc_node_id), * from xl_t order by 1; +select 
xl_nodename_from_id(xc_node_id), * from xl_t order by 1, 2, 3; xl_nodename_from_id | no | name ---------------------+----+------ datanode_1 | 1 | Z @@ -561,13 +561,13 @@ select xl_nodename_from_id(xc_node_id), * from xl_t order by 1; datanode_2 | 4 | W (4 rows) -select xl_nodename_from_id(xc_node_id), * from xl_t1 order by 1; +select xl_nodename_from_id(xc_node_id), * from xl_t1 order by 1, 2, 3; xl_nodename_from_id | no1 | name1 ---------------------+-----+------- - datanode_1 | 2 | Y1 datanode_1 | 1 | Z1 - datanode_2 | 4 | W1 + datanode_1 | 2 | Y1 datanode_2 | 3 | X1 + datanode_2 | 4 | W1 (4 rows) --testing correlated delete: @@ -578,7 +578,7 @@ where xl_t.no in (select no1 from xl_t1 where name1 in ('Z', 'X')) delete from xl_t1 where xl_t1.name1 in (select name1 from xl_names where name in ('Z', 'X')) ; -select xl_nodename_from_id(xc_node_id), * from xl_t order by 1; +select xl_nodename_from_id(xc_node_id), * from xl_t order by 1, 2, 3; xl_nodename_from_id | no | name ---------------------+----+------ datanode_1 | 1 | Z @@ -587,7 +587,7 @@ select xl_nodename_from_id(xc_node_id), * from xl_t order by 1; datanode_2 | 4 | W (4 rows) -select xl_nodename_from_id(xc_node_id), * from xl_t1 order by 1; +select xl_nodename_from_id(xc_node_id), * from xl_t1 order by 1, 2, 3; xl_nodename_from_id | no1 | name1 ---------------------+-----+------- datanode_1 | 2 | Y1 diff --git a/src/test/regress/sql/mls_check.sql b/src/test/regress/sql/mls_check.sql index 83e4027c..0b96a0c6 100644 --- a/src/test/regress/sql/mls_check.sql +++ b/src/test/regress/sql/mls_check.sql @@ -2168,9 +2168,9 @@ drop table lala2; drop table lala3; \c - mls_admin -select * from pg_cls_table; +select polid, attnum, enable, nspname, tblname, reloptions from pg_cls_table; select MLS_CLS_DROP_TABLE_LABEL('cls_compare', 'public', 'xixi'); -select * from pg_cls_table; +select polid, attnum, enable, nspname, tblname, reloptions from pg_cls_table; --everything is done \c - godlike diff --git a/src/test/regress/sql/namespace.sql b/src/test/regress/sql/namespace.sql index ade2e5e3..77444aea 100644 --- a/src/test/regress/sql/namespace.sql +++ b/src/test/regress/sql/namespace.sql @@ -42,3 +42,31 @@ DROP SCHEMA test_schema_renamed CASCADE; -- verify that the objects were dropped SELECT COUNT(*) FROM pg_class WHERE relnamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'test_schema_renamed'); + + +CREATE SCHEMA test_schema_2 + CREATE TABLE ab ( + a serial, + b int UNIQUE + ); +CREATE SCHEMA test_schema_3; +CREATE SCHEMA test_schema_4 + CREATE TABLE ab ( + a serial, + b int UNIQUE + ); + +INSERT INTO test_schema_2.ab(b) VALUES(1); +INSERT INTO test_schema_2.ab(b) VALUES(2); +SELECT * FROM test_schema_2.ab ORDER BY a, b; + +INSERT INTO test_schema_3.ab(b) VALUES(3); +SELECT * FROM test_schema_3.ab ORDER BY a, b; + +INSERT INTO test_schema_4.ab(b) VALUES(4); +INSERT INTO test_schema_4.ab(b) VALUES(5); +SELECT * FROM test_schema_4.ab ORDER BY a, b; + +DROP SCHEMA test_schema_2 CASCADE; +DROP SCHEMA test_schema_3 CASCADE; +DROP SCHEMA test_schema_4 CASCADE; diff --git a/src/test/regress/sql/xl_limitations.sql b/src/test/regress/sql/xl_limitations.sql index a75bfcdb..3f9e7779 100644 --- a/src/test/regress/sql/xl_limitations.sql +++ b/src/test/regress/sql/xl_limitations.sql @@ -267,9 +267,9 @@ INSERT INTO xl_names("name", "name1")VALUES ('X', 'X1'); INSERT INTO xl_names("name", "name1")VALUES ('Y', 'Y1'); INSERT INTO xl_names("name", "name1")VALUES ('Z', 'Z1'); -select xl_nodename_from_id(xc_node_id), * from xl_t order by 1; +select 
xl_nodename_from_id(xc_node_id), * from xl_t order by 1, 2, 3; -select xl_nodename_from_id(xc_node_id), * from xl_t1 order by 1; +select xl_nodename_from_id(xc_node_id), * from xl_t1 order by 1, 2, 3; select xl_nodename_from_id(xc_node_id), * from xl_names order by name; @@ -282,8 +282,8 @@ update xl_t1 set name1 = T1.name1 from (select name,name1 from xl_names) T1 where xl_t1.name1 = T1.name; -select xl_nodename_from_id(xc_node_id), * from xl_t order by 1; -select xl_nodename_from_id(xc_node_id), * from xl_t1 order by 1; +select xl_nodename_from_id(xc_node_id), * from xl_t order by 1, 2, 3; +select xl_nodename_from_id(xc_node_id), * from xl_t1 order by 1, 2, 3; --testing correlated delete: delete from xl_t @@ -295,9 +295,9 @@ delete from xl_t1 where xl_t1.name1 in (select name1 from xl_names where name in ('Z', 'X')) ; -select xl_nodename_from_id(xc_node_id), * from xl_t order by 1; +select xl_nodename_from_id(xc_node_id), * from xl_t order by 1, 2, 3; -select xl_nodename_from_id(xc_node_id), * from xl_t1 order by 1; +select xl_nodename_from_id(xc_node_id), * from xl_t1 order by 1, 2, 3; drop table xl_t; drop table xl_t1; From 250e3deb58705a81ebd2d6e7cd715299cebbf81e Mon Sep 17 00:00:00 2001 From: jackywpxie Date: Mon, 4 Jan 2021 11:35:26 +0800 Subject: [PATCH 165/578] jacky/feature/MemoryProtect_Tbase_v2.15.16 (merge request !13) Squash merge branch 'jacky/feature/MemoryProtect_Tbase_v2.15.16' into 'Tbase_v2.15.16' * modified according to codeview suggestions. * rafactor code * modified according to codereview suggestions. * fixed an index_insert error * modified guc para values * refactor code * rollback * modified for enable_buffer_mprotect * enable memory protection while doing command: make check * refactor code * fixed MLS bugs for enable_buffer_memory_protect * refactor code * fixed merged bugs. * Merge branch 'Tbase_v2.15.16' into jacky/feature/MemoryProtect_Tbase_v2.15.16 * enable_buffer_memory_protect * Merge branch 'Tbase_v2.15.16' into jacky/feature/MemoryProtect_Tbase_v2.15.16 * Merge branch 'Tbase_v2.15.16' into jacky/feature/MemoryProtect_Tbase_v2.15.16 * Merge branch 'Tbase_v2.15.16' into jacky/feature/MemoryProtect_Tbase_v2.15.16 * delete nouse head file. * enable_xlog_memory_protect = on, make check successfully. * fixed coredump for xlog memory protect. * enable_clog_memory_protect = on, make check successfully. * enable_tlog_memory_protect = on, make check successfully. * add GUC para: * fixed a error of cherry-pick. * convert blank to tab. * adjust code according to mr. * Fix format. * Bug fix in using mprotect. * add mlock before mprotect to shard memory as required. * Add memory protect for tlog. 
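
The mechanism behind enable_buffer_mprotect and the related clog/tlog/xlog
protection GUCs is to keep shared pages mapped read-only via mprotect() and to
lift the protection only around intended writes, so a stray write coredumps at
the offending instruction instead of silently corrupting a page. The sketch
below is illustrative only, assuming a POSIX mprotect(); the helper and
variable names are hypothetical and are not the functions added by this patch.

    /* minimal sketch of the read-only page guard technique (hypothetical names) */
    #include <sys/mman.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <unistd.h>

    static void
    buf_set_readonly(void *page, size_t len)
    {
        /* page must be aligned to the OS page size for mprotect() */
        if (mprotect(page, len, PROT_READ) != 0)
        {
            perror("mprotect(PROT_READ)");
            exit(1);
        }
    }

    static void
    buf_set_readwrite(void *page, size_t len)
    {
        if (mprotect(page, len, PROT_READ | PROT_WRITE) != 0)
        {
            perror("mprotect(PROT_READ|PROT_WRITE)");
            exit(1);
        }
    }

    int
    main(void)
    {
        size_t  pagesz = (size_t) sysconf(_SC_PAGESIZE);
        void   *page;

        /* simulate one shared buffer page, aligned to the OS page size */
        if (posix_memalign(&page, pagesz, pagesz) != 0)
            return 1;

        buf_set_readonly(page, pagesz);
        /* ((char *) page)[0] = 'x';  -- would SIGSEGV while read-only */
        buf_set_readwrite(page, pagesz);
        ((char *) page)[0] = 'x';     /* fine after lifting protection */
        free(page);
        return 0;
    }

This is also why the patch switches the index kill-item paths (gist, hash,
nbtree) to exclusive/write buffer locks when enable_buffer_mprotect is on:
those paths write the page (e.g. ItemIdMarkDead()), so the protection has to
be lifted under an exclusive lock before the store.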
--- src/backend/access/gist/gistget.c | 10 ++ src/backend/access/hash/hash.c | 15 ++ src/backend/access/hash/hashsearch.c | 23 +++ src/backend/access/nbtree/nbtinsert.c | 9 + src/backend/access/nbtree/nbtutils.c | 15 ++ src/backend/access/transam/clog.c | 4 + src/backend/access/transam/commit_ts.c | 12 +- src/backend/access/transam/lru.c | 106 ++++++++++- src/backend/access/transam/multixact.c | 2 + src/backend/access/transam/slru.c | 45 +++++ src/backend/access/transam/subtrans.c | 2 + src/backend/access/transam/xlog.c | 44 +++++ src/backend/commands/async.c | 2 + src/backend/commands/sequence.c | 6 +- src/backend/storage/buffer/buf_init.c | 13 ++ src/backend/storage/buffer/bufmgr.c | 109 ++++++++++- src/backend/storage/freespace/freespace.c | 11 +- src/backend/storage/freespace/fsmpage.c | 6 + src/backend/utils/misc/guc.c | 57 ++++++ src/backend/utils/misc/mls.c | 13 ++ src/backend/utils/misc/postgresql.conf.sample | 8 + src/backend/utils/time/tqual.c | 6 +- src/include/access/lru.h | 5 +- src/include/access/slru.h | 169 +++++++++--------- src/include/access/xlog.h | 3 + src/include/c.h | 1 + src/include/storage/buf_internals.h | 3 + src/include/utils/guc.h | 4 + src/test/regress/expected/oracle.out | 1 + src/test/regress/expected/sysviews.out | 6 +- .../regress/output/xc_notrans_block.source | 1 - .../regress/output/xc_notrans_block_1.source | 1 - 32 files changed, 608 insertions(+), 104 deletions(-) diff --git a/src/backend/access/gist/gistget.c b/src/backend/access/gist/gistget.c index 17494242..8a18e693 100644 --- a/src/backend/access/gist/gistget.c +++ b/src/backend/access/gist/gistget.c @@ -21,6 +21,7 @@ #include "pgstat.h" #include "lib/pairingheap.h" #include "utils/builtins.h" +#include "utils/guc.h" #include "utils/memutils.h" #include "utils/rel.h" @@ -52,7 +53,16 @@ gistkillitems(IndexScanDesc scan) if (!BufferIsValid(buffer)) return; + if (enable_buffer_mprotect) + { + /* ItemIdMarkDead() will write pages */ + LockBuffer(buffer, GIST_EXCLUSIVE); + } + else + { LockBuffer(buffer, GIST_SHARE); + } + gistcheckpage(scan->indexRelation, buffer); page = BufferGetPage(buffer); diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c index 636c21cf..7d04dd80 100644 --- a/src/backend/access/hash/hash.c +++ b/src/backend/access/hash/hash.c @@ -26,6 +26,7 @@ #include "miscadmin.h" #include "optimizer/plancat.h" #include "utils/builtins.h" +#include "utils/guc.h" #include "utils/index_selfuncs.h" #include "utils/rel.h" #include "miscadmin.h" @@ -482,7 +483,14 @@ hashrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, */ if (so->numKilled > 0) { + if (enable_buffer_mprotect) + { + LockBuffer(so->hashso_curbuf, BUFFER_LOCK_EXCLUSIVE); + } + else + { LockBuffer(so->hashso_curbuf, BUFFER_LOCK_SHARE); + } _hash_kill_items(scan); LockBuffer(so->hashso_curbuf, BUFFER_LOCK_UNLOCK); } @@ -520,7 +528,14 @@ hashendscan(IndexScanDesc scan) */ if (so->numKilled > 0) { + if (enable_buffer_mprotect) + { + LockBuffer(so->hashso_curbuf, BUFFER_LOCK_EXCLUSIVE); + } + else + { LockBuffer(so->hashso_curbuf, BUFFER_LOCK_SHARE); + } _hash_kill_items(scan); LockBuffer(so->hashso_curbuf, BUFFER_LOCK_UNLOCK); } diff --git a/src/backend/access/hash/hashsearch.c b/src/backend/access/hash/hashsearch.c index cea3e835..e4ea31de 100644 --- a/src/backend/access/hash/hashsearch.c +++ b/src/backend/access/hash/hashsearch.c @@ -18,6 +18,7 @@ #include "access/relscan.h" #include "miscadmin.h" #include "pgstat.h" +#include "utils/guc.h" #include "utils/rel.h" @@ -467,7 +468,18 @@ 
_hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir) /* Before leaving current page, deal with any killed items */ if (so->numKilled > 0) + { + if (enable_buffer_mprotect) + { + LockBuffer(so->hashso_curbuf, BUFFER_LOCK_UNLOCK); + LockBuffer(so->hashso_curbuf, BUFFER_LOCK_EXCLUSIVE); + _hash_kill_items(scan); + } + else + { _hash_kill_items(scan); + } + } /* * ran off the end of this page, try the next @@ -524,7 +536,18 @@ _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir) /* Before leaving current page, deal with any killed items */ if (so->numKilled > 0) + { + if (enable_buffer_mprotect) + { + LockBuffer(so->hashso_curbuf, BUFFER_LOCK_UNLOCK); + LockBuffer(so->hashso_curbuf, BUFFER_LOCK_EXCLUSIVE); + _hash_kill_items(scan); + } + else + { _hash_kill_items(scan); + } + } /* * ran off the end of this page, try the next diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c index 65936677..4d102b2f 100644 --- a/src/backend/access/nbtree/nbtinsert.c +++ b/src/backend/access/nbtree/nbtinsert.c @@ -23,6 +23,7 @@ #include "miscadmin.h" #include "storage/lmgr.h" #include "storage/predicate.h" +#include "utils/guc.h" #include "utils/tqual.h" @@ -472,7 +473,15 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel, for (;;) { nblkno = opaque->btpo_next; + if (enable_buffer_mprotect) + { + nbuf = _bt_relandgetbuf(rel, nbuf, nblkno, BT_WRITE); + } + else + { nbuf = _bt_relandgetbuf(rel, nbuf, nblkno, BT_READ); + } + page = BufferGetPage(nbuf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); if (!P_IGNORE(opaque)) diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c index a2ee713b..a031db5b 100644 --- a/src/backend/access/nbtree/nbtutils.c +++ b/src/backend/access/nbtree/nbtutils.c @@ -22,6 +22,7 @@ #include "access/relscan.h" #include "miscadmin.h" #include "utils/array.h" +#include "utils/guc.h" #include "utils/lsyscache.h" #include "utils/memutils.h" #include "utils/rel.h" @@ -1756,7 +1757,14 @@ _bt_killitems(IndexScanDesc scan) * re-use of any TID on the page, so there is no need to check the * LSN. */ + if (enable_buffer_mprotect) + { + LockBuffer(so->currPos.buf, BT_WRITE); + } + else + { LockBuffer(so->currPos.buf, BT_READ); + } page = BufferGetPage(so->currPos.buf); } @@ -1765,7 +1773,14 @@ _bt_killitems(IndexScanDesc scan) Buffer buf; /* Attempt to re-read the buffer, getting pin and lock. */ + if (enable_buffer_mprotect) + { + buf = _bt_getbuf(scan->indexRelation, so->currPos.currPage, BT_WRITE); + } + else + { buf = _bt_getbuf(scan->indexRelation, so->currPos.currPage, BT_READ); + } /* It might not exist anymore; in which case we can't hint it. */ if (!BufferIsValid(buf)) diff --git a/src/backend/access/transam/clog.c b/src/backend/access/transam/clog.c index 7cfbec69..98f1f7d1 100644 --- a/src/backend/access/transam/clog.c +++ b/src/backend/access/transam/clog.c @@ -365,7 +365,9 @@ TransactionIdSetStatusBit(TransactionId xid, XidStatus status, XLogRecPtr lsn, i byteval = *byteptr; byteval &= ~(((1 << CLOG_BITS_PER_XACT) - 1) << bshift); byteval |= (status << bshift); + SlruClogDisableMemoryProtection(ClogCtl->shared->page_buffer[slotno]); *byteptr = byteval; + SlruClogEnableMemoryProtection(ClogCtl->shared->page_buffer[slotno]); /* * Update the group LSN if the transaction completion LSN is higher. 
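Note that with enable_clog_mprotect on, every status-bit update in TransactionIdSetStatusBit() above now brackets the single byte store with a SlruClogDisableMemoryProtection()/SlruClogEnableMemoryProtection() pair, i.e. two extra mprotect(2) system calls. A rough, hypothetical micro-benchmark (not from the patch; loop count and BLCKSZ are illustrative) gives a feel for that per-store overhead, which is presumably why the GUCs added later in this patch default to off outside _PG_REGRESS_ builds:

    #define _GNU_SOURCE
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <sys/mman.h>
    #include <time.h>

    #define BLCKSZ 8192
    #define LOOPS  100000L

    int
    main(void)
    {
        char           *page;
        struct timespec t0, t1;
        long            i;

        if (posix_memalign((void **) &page, BLCKSZ, BLCKSZ) != 0)
            return 1;
        memset(page, 0, BLCKSZ);
        mprotect(page, BLCKSZ, PROT_READ);                   /* start out guarded */

        clock_gettime(CLOCK_MONOTONIC, &t0);
        for (i = 0; i < LOOPS; i++)
        {
            mprotect(page, BLCKSZ, PROT_READ | PROT_WRITE);  /* drop guard    */
            page[i % BLCKSZ] = (char) i;                     /* the real work */
            mprotect(page, BLCKSZ, PROT_READ);               /* raise guard   */
        }
        clock_gettime(CLOCK_MONOTONIC, &t1);

        printf("%.0f ns per guarded store\n",
               ((t1.tv_sec - t0.tv_sec) * 1e9 +
                (t1.tv_nsec - t0.tv_nsec)) / LOOPS);
        return 0;
    }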
@@ -570,9 +572,11 @@ TrimCLOG(void) byteptr = ClogCtl->shared->page_buffer[slotno] + byteno; /* Zero so-far-unused positions in the current byte */ + SlruClogDisableMemoryProtection(ClogCtl->shared->page_buffer[slotno]); *byteptr &= (1 << bshift) - 1; /* Zero the rest of the page */ MemSet(byteptr + 1, 0, BLCKSZ - byteno - 1); + SlruClogEnableMemoryProtection(ClogCtl->shared->page_buffer[slotno]); ClogCtl->shared->page_dirty[slotno] = true; } diff --git a/src/backend/access/transam/commit_ts.c b/src/backend/access/transam/commit_ts.c index 0a950156..e971e3c1 100644 --- a/src/backend/access/transam/commit_ts.c +++ b/src/backend/access/transam/commit_ts.c @@ -24,6 +24,9 @@ */ #include "postgres.h" +#include +#include + #include "access/commit_ts.h" #include "access/htup_details.h" #include "access/lru.h" @@ -330,9 +333,11 @@ TransactionIdSetCommitTs(TransactionId xid, TimestampTz gts, TimestampTz ts, entry.time = ts; entry.nodeid = nodeid; + LruTlogDisableMemoryProtection(CommitTsCtl->shared[partitionno]->page_buffer[slotno]); memcpy(CommitTsCtl->shared[partitionno]->page_buffer[slotno] + SizeOfCommitTimestampEntry * entryno, &entry, SizeOfCommitTimestampEntry); + LruTlogEnableMemoryProtection(CommitTsCtl->shared[partitionno]->page_buffer[slotno]); #ifdef __TBASE__ /* @@ -868,7 +873,7 @@ TrimCommitTs(void) CommitTsCtl->global_shared->latest_page_number = pageno; LWLockRelease(CommitTsControlLock); - elog(LOG, "Trim committs next xid %d latest page number %d entryno %d", xid, pageno, entryno); + elog(DEBUG10, "Trim committs next xid %d latest page number %d entryno %d", xid, pageno, entryno); /* @@ -902,9 +907,12 @@ TrimCommitTs(void) byteptr = CommitTsCtl->shared[partitionno]->page_buffer[slotno] + byteno; + LruTlogDisableMemoryProtection(CommitTsCtl->shared[partitionno]->page_buffer[slotno]); /* Zero the rest of the page */ MemSet(byteptr, 0, BLCKSZ - byteno); - elog(LOG, "zero out the remaining page starting from byteno %d len BLCKSZ -byteno %d entryno %d sizeofentry %lu", + LruTlogEnableMemoryProtection(CommitTsCtl->shared[partitionno]->page_buffer[slotno]); + + elog(DEBUG10, "zero out the remaining page starting from byteno %d len BLCKSZ -byteno %d entryno %d sizeofentry %lu", byteno, BLCKSZ - byteno, entryno, SizeOfCommitTimestampEntry); CommitTsCtl->shared[partitionno]->page_dirty[slotno] = true; diff --git a/src/backend/access/transam/lru.c b/src/backend/access/transam/lru.c index 6276f8d7..0c772617 100644 --- a/src/backend/access/transam/lru.c +++ b/src/backend/access/transam/lru.c @@ -109,6 +109,7 @@ #include #include +#include #include #include "access/lru.h" @@ -117,6 +118,7 @@ #include "pgstat.h" #include "storage/fd.h" #include "storage/shmem.h" +#include "utils/guc.h" #include "miscadmin.h" @@ -184,6 +186,9 @@ static LruErrorCause lru_errcause; static int lru_errno; +bool enable_tlog_mprotect = false; + + static void LruZeroLSNs(LruCtl ctl, int partitionno, int slotno); static void LruWaitIO(LruCtl ctl, int partitionno, int slotno); static void LruInternalWritePage(LruCtl ctl, int partitionno, int slotno, LruFlushPt fdata); @@ -197,6 +202,71 @@ static bool LruScanDirCbDeleteCutoff(LruCtl ctl, char *filename, int segpage, void *data); static void LruInternalDeleteSegment(LruCtl ctl, char *filename); +/* + * Set a page [ptr, ptr + BLCKSZ - 1] with read only constraint. + * + * Coredump when being written without setting it writable. 
+ */ +void +SetPageReadOnly(char *address) +{ + /* prevent wild pointer */ + if (((uint64) address) % BLCKSZ != 0) + { + elog(PANIC, "address %p is not aligned with page", address); + } + + /* set page read only with syscall */ + if (mprotect(address, BLCKSZ, PROT_READ) != 0) + { + elog(PANIC, "mprotect failed %s at %p", strerror(errno), address); + } +} + +/* + * Set a page [ptr, ptr + BLCKSZ - 1] with writable attribute + * which cooperates with SetPageReadOnly. + */ +void +SetPageReadWrite(char *address) +{ + /* prevent wild pointer */ + if (((uint64) address) % BLCKSZ != 0) + { + elog(PANIC, "address %p is not aligned with page", address); + } + + /* set page read write with syscall */ + if (mprotect(address, BLCKSZ, PROT_WRITE | PROT_READ) != 0) + { + elog(PANIC, "mprotect failed %s at %p", strerror(errno), address); + } +} + +/* + * enable tlog memory protection + */ +inline void +LruTlogEnableMemoryProtection(char *address) +{ + if (enable_tlog_mprotect) + { + SetPageReadOnly(address); + } +} + +/* + * disable tlog memory protection + */ +inline void +LruTlogDisableMemoryProtection(char *address) +{ + if (enable_tlog_mprotect) + { + SetPageReadWrite(address); + } +} + /* * Initialization of shared memory */ @@ -218,6 +288,12 @@ LruShmemSize(int nslots, int nlsns) if (nlsns > 0) sz += MAXALIGN(nslots * nlsns * sizeof(XLogRecPtr)); /* group_lsn[] */ + if (enable_tlog_mprotect) + { + /* add BLCKSZ for memory protect */ + return BUFFERALIGN(sz) + BLCKSZ + BLCKSZ * nslots; + } + return BUFFERALIGN(sz) + BLCKSZ * nslots; } @@ -397,8 +473,12 @@ LruInit(LruCtl ctl, const char *name, int nslots, int nlsns, int nbufs, global_shared->ControlLock = ctllock; global_shared->latest_page_number = 0; - }else + } + else + { Assert(found); + } + ctl->global_shared = global_shared; for(partitionno = 0; partitionno < NUM_PARTITIONS; partitionno++){ snprintf(full_name, 64, "%s:%d", name, partitionno); @@ -451,13 +531,21 @@ LruInit(LruCtl ctl, const char *name, int nslots, int nlsns, int nbufs, strlcpy(shared->lwlock_tranche_name, name, LRU_MAX_NAME_LENGTH); shared->lwlock_tranche_id = tranche_id; - ptr += BUFFERALIGN(offset); + if (enable_tlog_mprotect) + { + ptr = (char *) BLOCKALIGN(ptr); + } for (slotno = 0; slotno < nslots; slotno++) { LWLockInitialize(&shared->buffer_locks[slotno].lock, shared->lwlock_tranche_id); + if (enable_tlog_mprotect) + { + /* protect page */ + SetPageReadOnly(ptr); + } shared->page_buffer[slotno] = ptr; shared->page_status[slotno] = LRU_PAGE_EMPTY; shared->page_dirty[slotno] = false; @@ -531,8 +619,10 @@ LruZeroPage(LruCtl ctl, int partitionno, int pageno) shared->page_dirty[slotno] = true; LruRecentlyUsed(shared, slotno); + LruTlogDisableMemoryProtection(shared->page_buffer[slotno]); /* Set the buffer to zeroes */ MemSet(shared->page_buffer[slotno], 0, BLCKSZ); + LruTlogEnableMemoryProtection(shared->page_buffer[slotno]); /* Set the LSNs for this new page to zero */ LruZeroLSNs(ctl, partitionno, slotno); @@ -1056,7 +1146,9 @@ LruPhysicalReadPage(LruCtl ctl, int partitionno, int pageno, int slotno) ereport(LOG, (errmsg("file \"%s\" doesn't exist, reading as zeroes", path))); + LruTlogDisableMemoryProtection(shared->page_buffer[slotno]); MemSet(shared->page_buffer[slotno], 0, BLCKSZ); + LruTlogEnableMemoryProtection(shared->page_buffer[slotno]); return true; } @@ -1070,16 +1162,20 @@ LruPhysicalReadPage(LruCtl ctl, int partitionno, int pageno, int slotno) errno = 0; pgstat_report_wait_start(WAIT_EVENT_SLRU_READ); + LruTlogDisableMemoryProtection(shared->page_buffer[slotno]); 
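    /*
     * The page image has to be writable while read(2) fills it; the
     * read-only guard is put back on both the error path inside the
     * if-block and on the success path right after it.
     */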
if (read(fd, shared->page_buffer[slotno], BLCKSZ) != BLCKSZ) { - elog(ERROR, "read fails path %s partitionno %d slotno %d pageno %d ", - path, partitionno, slotno, pageno); + LruTlogEnableMemoryProtection(shared->page_buffer[slotno]); pgstat_report_wait_end(); lru_errcause = LRU_READ_FAILED; lru_errno = errno; CloseTransientFile(fd); - return false; + elog(ERROR, "read fails path %s partitionno %d slotno %d pageno %d ", + path, partitionno, slotno, pageno); } + + LruTlogEnableMemoryProtection(shared->page_buffer[slotno]); + pgstat_report_wait_end(); if (CloseTransientFile(fd)) diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c index 1b2b7694..264384b4 100644 --- a/src/backend/access/transam/multixact.c +++ b/src/backend/access/transam/multixact.c @@ -2044,7 +2044,9 @@ TrimMultiXact(void) offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno]; offptr += entryno; + SlruClogDisableMemoryProtection(MultiXactOffsetCtl->shared->page_buffer[slotno]); MemSet(offptr, 0, BLCKSZ - (entryno * sizeof(MultiXactOffset))); + SlruClogEnableMemoryProtection(MultiXactOffsetCtl->shared->page_buffer[slotno]); MultiXactOffsetCtl->shared->page_dirty[slotno] = true; } diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c index 3d3c8c84..acf0562d 100644 --- a/src/backend/access/transam/slru.c +++ b/src/backend/access/transam/slru.c @@ -51,6 +51,7 @@ #include #include +#include "access/lru.h" #include "access/slru.h" #include "access/transam.h" #include "access/xlog.h" @@ -123,6 +124,7 @@ typedef enum static SlruErrorCause slru_errcause; static int slru_errno; +bool enable_clog_mprotect = false; static void SimpleLruZeroLSNs(SlruCtl ctl, int slotno); static void SimpleLruWaitIO(SlruCtl ctl, int slotno); @@ -158,6 +160,12 @@ SimpleLruShmemSize(int nslots, int nlsns) if (nlsns > 0) sz += MAXALIGN(nslots * nlsns * sizeof(XLogRecPtr)); /* group_lsn[] */ + if (enable_clog_mprotect) + { + /* add BLCKSZ for memory protect */ + return BUFFERALIGN(sz) + BLCKSZ + BLCKSZ * nslots; + } + return BUFFERALIGN(sz) + BLCKSZ * nslots; } @@ -220,11 +228,16 @@ SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns, shared->lwlock_tranche_id = tranche_id; ptr += BUFFERALIGN(offset); + if (enable_clog_mprotect) + { + ptr = (char *) BLOCKALIGN(ptr); + } for (slotno = 0; slotno < nslots; slotno++) { LWLockInitialize(&shared->buffer_locks[slotno].lock, shared->lwlock_tranche_id); + SlruClogEnableMemoryProtection(ptr); shared->page_buffer[slotno] = ptr; shared->page_status[slotno] = SLRU_PAGE_EMPTY; shared->page_dirty[slotno] = false; @@ -278,8 +291,10 @@ SimpleLruZeroPage(SlruCtl ctl, int pageno) shared->page_dirty[slotno] = true; SlruRecentlyUsed(shared, slotno); + SlruClogDisableMemoryProtection(shared->page_buffer[slotno]); /* Set the buffer to zeroes */ MemSet(shared->page_buffer[slotno], 0, BLCKSZ); + SlruClogEnableMemoryProtection(shared->page_buffer[slotno]); /* Set the LSNs for this new page to zero */ SimpleLruZeroLSNs(ctl, slotno); @@ -681,7 +696,9 @@ SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno) ereport(LOG, (errmsg("file \"%s\" doesn't exist, reading as zeroes", path))); + SlruClogDisableMemoryProtection(shared->page_buffer[slotno]); MemSet(shared->page_buffer[slotno], 0, BLCKSZ); + SlruClogEnableMemoryProtection(shared->page_buffer[slotno]); return true; } @@ -695,14 +712,18 @@ SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno) errno = 0; pgstat_report_wait_start(WAIT_EVENT_SLRU_READ); + 
SlruClogDisableMemoryProtection(shared->page_buffer[slotno]); if (read(fd, shared->page_buffer[slotno], BLCKSZ) != BLCKSZ) { + SlruClogEnableMemoryProtection(shared->page_buffer[slotno]); pgstat_report_wait_end(); slru_errcause = SLRU_READ_FAILED; slru_errno = errno; CloseTransientFile(fd); return false; } + + SlruClogEnableMemoryProtection(shared->page_buffer[slotno]); pgstat_report_wait_end(); if (CloseTransientFile(fd)) @@ -1420,3 +1441,27 @@ SlruScanDirectory(SlruCtl ctl, SlruScanCallback callback, void *data) return retval; } + +/* + * enable clog memory protection + */ +inline void +SlruClogEnableMemoryProtection(char *address) +{ + if (enable_clog_mprotect) + { + SetPageReadOnly(address); + } +} + +/* + * disable clog memory protection + */ +inline void +SlruClogDisableMemoryProtection(char *address) +{ + if (enable_clog_mprotect) + { + SetPageReadWrite(address); + } +} diff --git a/src/backend/access/transam/subtrans.c b/src/backend/access/transam/subtrans.c index 93051191..4180f71c 100644 --- a/src/backend/access/transam/subtrans.c +++ b/src/backend/access/transam/subtrans.c @@ -95,7 +95,9 @@ SubTransSetParent(TransactionId xid, TransactionId parent) if (*ptr != parent) { Assert(*ptr == InvalidTransactionId); + SlruClogDisableMemoryProtection(SubTransCtl->shared->page_buffer[slotno]); *ptr = parent; + SlruClogEnableMemoryProtection(SubTransCtl->shared->page_buffer[slotno]); SubTransCtl->shared->page_dirty[slotno] = true; } diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 14e26640..405e9bac 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -24,6 +24,7 @@ #include "access/clog.h" #include "access/commit_ts.h" +#include "access/lru.h" #include "access/multixact.h" #include "access/rewriteheap.h" #include "access/subtrans.h" @@ -245,6 +246,8 @@ bool InRecovery = false; /* Are we in Hot Standby mode? Only valid in startup process, see xlog.h */ HotStandbyState standbyState = STANDBY_DISABLED; +bool enable_xlog_mprotect = false; + static XLogRecPtr LastRec; /* Local copy of WalRcv->receivedUpto */ @@ -1575,7 +1578,9 @@ CopyXLogRecordToWAL(int write_len, bool isLogSwitch, XLogRecData *rdata, * Write what fits on this page, and continue on the next page. 
*/ Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || freespace == 0); + XlogDisableMemoryProtection(XLogCtl->pages + XLogRecPtrToBufIdx(CurrPos) * (Size) XLOG_BLCKSZ); memcpy(currpos, rdata_data, freespace); + XlogEnableMemoryProtection(XLogCtl->pages + XLogRecPtrToBufIdx(CurrPos) * (Size) XLOG_BLCKSZ); rdata_data += freespace; rdata_len -= freespace; written += freespace; @@ -1592,8 +1597,10 @@ CopyXLogRecordToWAL(int write_len, bool isLogSwitch, XLogRecData *rdata, */ currpos = GetXLogBuffer(CurrPos); pagehdr = (XLogPageHeader) currpos; + XlogDisableMemoryProtection(XLogCtl->pages + XLogRecPtrToBufIdx(CurrPos) * (Size) XLOG_BLCKSZ); pagehdr->xlp_rem_len = write_len - written; pagehdr->xlp_info |= XLP_FIRST_IS_CONTRECORD; + XlogEnableMemoryProtection(XLogCtl->pages + XLogRecPtrToBufIdx(CurrPos) * (Size) XLOG_BLCKSZ); /* skip over the page header */ if (CurrPos % XLogSegSize == 0) @@ -1623,7 +1630,9 @@ CopyXLogRecordToWAL(int write_len, bool isLogSwitch, XLogRecData *rdata, #endif Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || rdata_len == 0); + XlogDisableMemoryProtection(XLogCtl->pages + XLogRecPtrToBufIdx(CurrPos) * (Size) XLOG_BLCKSZ); memcpy(currpos, rdata_data, rdata_len); + XlogEnableMemoryProtection(XLogCtl->pages + XLogRecPtrToBufIdx(CurrPos) * (Size) XLOG_BLCKSZ); currpos += rdata_len; CurrPos += rdata_len; freespace -= rdata_len; @@ -2247,6 +2256,7 @@ AdvanceXLInsertBuffer(XLogRecPtr upto, bool opportunistic) * Be sure to re-zero the buffer so that bytes beyond what we've * written will look like zeroes and not valid XLOG records... */ + XlogDisableMemoryProtection((char *) NewPage); MemSet((char *) NewPage, 0, XLOG_BLCKSZ); /* @@ -2289,6 +2299,8 @@ AdvanceXLInsertBuffer(XLogRecPtr upto, bool opportunistic) NewPage->xlp_info |= XLP_LONG_HEADER; } + XlogEnableMemoryProtection((char *) NewPage); + /* * Make sure the initialization of the page becomes visible to others * before the xlblocks update. GetXLogBuffer() reads xlblocks without @@ -5067,6 +5079,13 @@ XLOGShmemInit(void) allocptr = (char *) TYPEALIGN(XLOG_BLCKSZ, allocptr); XLogCtl->pages = allocptr; memset(XLogCtl->pages, 0, (Size) XLOG_BLCKSZ * XLOGbuffers); + if (enable_xlog_mprotect) + { + for (i = 0; i < XLOGbuffers; i++) + { + XlogEnableMemoryProtection(XLogCtl->pages + XLOG_BLCKSZ * i); + } + } /* * Do basic initialization of XLogCtl shared data. 
(StartupXLOG will fill @@ -7968,8 +7987,10 @@ StartupXLOG(void) /* Copy the valid part of the last block, and zero the rest */ page = &XLogCtl->pages[firstIdx * XLOG_BLCKSZ]; len = EndOfLog % XLOG_BLCKSZ; + XlogDisableMemoryProtection(page); memcpy(page, xlogreader->readBuf, len); memset(page + len, 0, XLOG_BLCKSZ - len); + XlogEnableMemoryProtection(page); XLogCtl->xlblocks[firstIdx] = pageBeginPtr + XLOG_BLCKSZ; XLogCtl->InitializedUpTo = pageBeginPtr + XLOG_BLCKSZ; @@ -13267,3 +13288,26 @@ void wal_reset_stream(void) #endif +/* + * enable xlog memory protection + */ +inline void +XlogEnableMemoryProtection(char *address) +{ + if (enable_xlog_mprotect) + { + SetPageReadOnly(address); + } +} + +/* + * disable xlog memory protection + */ +inline void +XlogDisableMemoryProtection(char *address) +{ + if (enable_xlog_mprotect) + { + SetPageReadWrite(address); + } +} diff --git a/src/backend/commands/async.c b/src/backend/commands/async.c index c8fa6541..517f1262 100644 --- a/src/backend/commands/async.c +++ b/src/backend/commands/async.c @@ -1374,9 +1374,11 @@ asyncQueueAddEntries(ListCell *nextNotify) } /* Now copy qe into the shared buffer page */ + SlruClogDisableMemoryProtection(AsyncCtl->shared->page_buffer[slotno]); memcpy(AsyncCtl->shared->page_buffer[slotno] + offset, &qe, qe.length); + SlruClogEnableMemoryProtection(AsyncCtl->shared->page_buffer[slotno]); /* Advance queue_head appropriately, and detect if page is full */ if (asyncQueueAdvance(&(queue_head), qe.length)) diff --git a/src/backend/commands/sequence.c b/src/backend/commands/sequence.c index 2557db35..3d522795 100644 --- a/src/backend/commands/sequence.c +++ b/src/backend/commands/sequence.c @@ -490,16 +490,14 @@ fill_seq_with_data(Relation rel, HeapTuple tuple) buf = ReadBuffer(rel, P_NEW); Assert(BufferGetBlockNumber(buf) == 0); + /* Now insert sequence tuple */ + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); page = BufferGetPage(buf); PageInit(page, BufferGetPageSize(buf), sizeof(sequence_magic)); sm = (sequence_magic *) PageGetSpecialPointer(page); sm->magic = SEQ_MAGIC; - /* Now insert sequence tuple */ - - LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); - /* * Since VACUUM does not process sequences, we have to force the tuple to * have xmin = FrozenTransactionId now. 
Otherwise it would become diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c index 88b2dde4..66d18518 100644 --- a/src/backend/storage/buffer/buf_init.c +++ b/src/backend/storage/buffer/buf_init.c @@ -16,6 +16,7 @@ #include "storage/bufmgr.h" #include "storage/buf_internals.h" +#include "utils/guc.h" BufferDescPadded *BufferDescriptors; @@ -80,9 +81,19 @@ InitBufferPool(void) NBuffers * sizeof(BufferDescPadded), &foundDescs); + if (enable_buffer_mprotect) + { + BufferBlocks = (char *) + ShmemInitStruct("Buffer Blocks", + NBuffers * (Size) BLCKSZ + BLCKSZ, &foundBufs); + BufferBlocks = (char *) BLOCKALIGN(BufferBlocks); + } + else + { BufferBlocks = (char *) ShmemInitStruct("Buffer Blocks", NBuffers * (Size) BLCKSZ, &foundBufs); + } /* Align lwlocks to cacheline boundary */ BufferIOLWLockArray = (LWLockMinimallyPadded *) @@ -135,6 +146,8 @@ InitBufferPool(void) */ buf->freeNext = i + 1; + BufEnableMemoryProtection(BufferBlocks + i * BLCKSZ, false); + LWLockInitialize(BufferDescriptorGetContentLock(buf), LWTRANCHE_BUFFER_CONTENT); diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 5c2f3bd2..56276b4a 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -34,6 +34,7 @@ #include #include +#include "access/lru.h" #include "access/xlog.h" #include "catalog/catalog.h" #include "catalog/storage.h" @@ -125,6 +126,8 @@ int checkpoint_flush_after = 0; int bgwriter_flush_after = 0; int backend_flush_after = 0; +bool enable_buffer_mprotect = false; + /* * How many buffers PrefetchBuffer callers should try to stay ahead of their * ReadBuffer calls by. This is maintained by the assign hook for @@ -818,8 +821,12 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, if (!isLocalBuf) { if (mode == RBM_ZERO_AND_LOCK) + { + BufDisableMemoryProtection(BufferGetPage( + BufferDescriptorGetBuffer(bufHdr)), isLocalBuf); LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_EXCLUSIVE); + } else if (mode == RBM_ZERO_AND_CLEANUP_LOCK) LockBufferForCleanup(BufferDescriptorGetBuffer(bufHdr)); } @@ -899,7 +906,10 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, if (isExtend) { /* new buffers are zero-filled */ + BufDisableMemoryProtection(bufBlock, isLocalBuf); MemSet((char *) bufBlock, 0, BLCKSZ); + BufEnableMemoryProtection(bufBlock, isLocalBuf); + /* don't set checksum for all-zero page */ smgrextend(smgr, forkNum, blockNum, (char *) bufBlock, false); @@ -917,7 +927,11 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, * just wants us to allocate a buffer. 
*/ if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK) + { + BufDisableMemoryProtection(bufBlock, isLocalBuf); MemSet((char *) bufBlock, 0, BLCKSZ); + BufEnableMemoryProtection(bufBlock, isLocalBuf); + } else { instr_time io_start, @@ -926,7 +940,9 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, if (track_io_timing) INSTR_TIME_SET_CURRENT(io_start); + BufDisableMemoryProtection(bufBlock, isLocalBuf); smgrread(smgr, forkNum, blockNum, (char *) bufBlock); + BufEnableMemoryProtection(bufBlock, isLocalBuf); if (track_io_timing) { @@ -944,7 +960,9 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, { if (algo_id == smgr->smgr_relcrypt.algo_id) { + BufDisableMemoryProtection(bufBlock, isLocalBuf); rel_crypt_page_decrypt(&(smgr->smgr_relcrypt), (Page)bufBlock); + BufEnableMemoryProtection(bufBlock, isLocalBuf); } else { @@ -967,7 +985,10 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, errmsg("invalid page in block %u of relation %s; zeroing out page", blockNum, relpath(smgr->smgr_rnode, forkNum)))); + + BufDisableMemoryProtection(bufBlock, isLocalBuf); MemSet((char *) bufBlock, 0, BLCKSZ); + BufEnableMemoryProtection(bufBlock, isLocalBuf); } else { @@ -995,6 +1016,8 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, if ((mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK) && !isLocalBuf) { + BufDisableMemoryProtection(BufferGetPage( + BufferDescriptorGetBuffer(bufHdr)), isLocalBuf); LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_EXCLUSIVE); } @@ -1180,8 +1203,26 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, * happens to be trying to split the page the first one got from * StrategyGetBuffer.) 
*/ - if (LWLockConditionalAcquire(BufferDescriptorGetContentLock(buf), - LW_SHARED)) + bool ret = false; + + if (enable_buffer_mprotect) + { + /* Encrypting buffer needs LW_EXCLUSIVE */ + ret = LWLockConditionalAcquire(BufferDescriptorGetContentLock(buf), + LW_EXCLUSIVE); + if (ret) + { + BufDisableMemoryProtection( + BufferGetPage(BufferDescriptorGetBuffer(buf)), false); + } + } + else + { + ret = LWLockConditionalAcquire(BufferDescriptorGetContentLock(buf), + LW_SHARED); + } + + if (ret) { /* * If using a nondefault strategy, and writing the buffer @@ -1202,6 +1243,8 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, if (XLogNeedsFlush(lsn) && StrategyRejectBuffer(strategy, buf)) { + BufEnableMemoryProtection( + BufferGetPage(BufferDescriptorGetBuffer(buf)), false); /* Drop lock/pin and loop around for another buffer */ LWLockRelease(BufferDescriptorGetContentLock(buf)); UnpinBuffer(buf, true); @@ -1216,6 +1259,8 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, smgr->smgr_rnode.node.relNode); FlushBuffer(buf, NULL); + BufEnableMemoryProtection( + BufferGetPage(BufferDescriptorGetBuffer(buf)), false); LWLockRelease(BufferDescriptorGetContentLock(buf)); ScheduleBufferTagForWriteback(&BackendWritebackContext, @@ -3844,14 +3889,29 @@ LockBuffer(Buffer buffer, int mode) buf = GetBufferDescriptor(buffer - 1); if (mode == BUFFER_LOCK_UNLOCK) + { + if (enable_buffer_mprotect && + LWLockHeldByMeInMode(BufferDescriptorGetContentLock(buf), LW_EXCLUSIVE)) + { + BufEnableMemoryProtection(BufferGetPage(buffer), false); + } + LWLockRelease(BufferDescriptorGetContentLock(buf)); + } else if (mode == BUFFER_LOCK_SHARE) + { LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_SHARED); + } else if (mode == BUFFER_LOCK_EXCLUSIVE) + { + BufDisableMemoryProtection(BufferGetPage(buffer), false); LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_EXCLUSIVE); + } else + { elog(ERROR, "unrecognized buffer lock mode: %d", mode); } +} /* * Acquire the content_lock for the buffer, but only if we don't have to wait. @@ -3867,6 +3927,7 @@ ConditionalLockBuffer(Buffer buffer) if (BufferIsLocal(buffer)) return true; /* act as though we got it */ + BufDisableMemoryProtection(BufferGetPage(buffer), false); buf = GetBufferDescriptor(buffer - 1); return LWLockConditionalAcquire(BufferDescriptorGetContentLock(buf), @@ -4800,7 +4861,16 @@ static int SyncBufferPrePhase1(int buf_id) * try to lock the buffer, returning false means other process(start or backend) having lock the buffer in LW_EXCLUSIVE, * so, we skip this buffer, it would be treated in next checkpoint round. 
*/ - ret = LWLockConditionalAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED); + if (enable_buffer_mprotect && !BufferIsLocal(buf_id) && BufferIsValid(buf_id)) + { + ret = LWLockConditionalAcquire( + BufferDescriptorGetContentLock(bufHdr), LW_EXCLUSIVE); + } + else + { + ret = LWLockConditionalAcquire( + BufferDescriptorGetContentLock(bufHdr), LW_SHARED); + } if (false == ret) { return SYNC_BUF_LWLOCK_CONFLICT; @@ -5052,7 +5122,6 @@ static List* SyncBufferPostPhase1(List * buf_id_list, WritebackContext *wb_conte info = (SyncBufIdInfo *) lfirst(l); buf = GetBufferDescriptor(info->buf_id); - LWLockRelease(BufferDescriptorGetContentLock(buf)); tag = buf->tag; @@ -5234,4 +5303,36 @@ char * BufHdrGetBlockFunc(BufferDesc * buf) #endif +/* + * enable buffer memory protection + */ +inline void +BufEnableMemoryProtection(char *address, bool localbuffer) +{ + if (localbuffer) + { + return; + } + + if (enable_buffer_mprotect) + { + SetPageReadOnly(address); + } +} + +/* + * disable buffer memory protection + */ +inline void +BufDisableMemoryProtection(char *address, bool localbuffer) +{ + if (localbuffer) + { + return; + } + if (enable_buffer_mprotect) + { + SetPageReadWrite(address); + } +} diff --git a/src/backend/storage/freespace/freespace.c b/src/backend/storage/freespace/freespace.c index 21e16a02..a08b4a7d 100644 --- a/src/backend/storage/freespace/freespace.c +++ b/src/backend/storage/freespace/freespace.c @@ -31,6 +31,7 @@ #include "storage/extentmapping.h" #include "storage/lmgr.h" #include "storage/smgr.h" +#include "utils/guc.h" /* @@ -779,7 +780,6 @@ fsm_set_and_search(Relation rel, FSMAddress addr, uint16 slot, LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); page = BufferGetPage(buf); - if (fsm_set_avail_extent(page, slot, newValue, &root_modified, &new_root, &old_root)) MarkBufferDirtyHint(buf, false); @@ -978,7 +978,16 @@ fsm_vacuum_page(Relation rel, FSMAddress addr, bool *eof_p) * pages, increasing the chances that a later vacuum can truncate the * relation. */ + if (enable_buffer_mprotect) + { + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); ((FSMPage) PageGetContents(page))->fp_next_slot = 0; + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + } + else + { + ((FSMPage) PageGetContents(page))->fp_next_slot = 0; + } ReleaseBuffer(buf); diff --git a/src/backend/storage/freespace/fsmpage.c b/src/backend/storage/freespace/fsmpage.c index 4ec1dcaa..62868bf7 100644 --- a/src/backend/storage/freespace/fsmpage.c +++ b/src/backend/storage/freespace/fsmpage.c @@ -24,6 +24,7 @@ #include "storage/bufmgr.h" #include "storage/fsm_internals.h" +#include "utils/guc.h" /* Macros to navigate the tree within a page. Root has index zero. */ #define leftchild(x) (2 * (x) + 1) @@ -325,6 +326,11 @@ fsm_search_avail(Buffer buf, uint8 minvalue, bool advancenext, * * Wrap-around is handled at the beginning of this function. */ + if (enable_buffer_mprotect && !exclusive_lock_held) + { + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + } fsmpage->fp_next_slot = slot + (advancenext ? 
1 : 0); return slot; diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 2260027e..97a0b26f 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -39,6 +39,7 @@ #include "access/xact.h" #include "access/xlog_internal.h" #include "access/heapam_xlog.h" +#include "access/lru.h" #include "catalog/namespace.h" #include "catalog/pg_authid.h" #include "commands/async.h" @@ -2698,6 +2699,62 @@ static struct config_bool ConfigureNamesBool[] = NULL, NULL, NULL }, { + {"enable_buffer_mprotect", PGC_POSTMASTER, CUSTOM_OPTIONS, + gettext_noop("Protect memory corruption for share buffer"), + NULL, + GUC_NOT_IN_SAMPLE, + }, + &enable_buffer_mprotect, +#ifdef _PG_REGRESS_ + true, +#else + false, +#endif + NULL, NULL, NULL + }, + { + {"enable_clog_mprotect", PGC_POSTMASTER, CUSTOM_OPTIONS, + gettext_noop("Protect memory corruption for clog"), + NULL, + GUC_NOT_IN_SAMPLE, + }, + &enable_clog_mprotect, +#ifdef _PG_REGRESS_ + true, +#else + false, +#endif + NULL, NULL, NULL + }, + { + {"enable_tlog_mprotect", PGC_POSTMASTER, CUSTOM_OPTIONS, + gettext_noop("Protect memory corruption for tlog"), + NULL, + GUC_NOT_IN_SAMPLE, + }, + &enable_tlog_mprotect, +#ifdef _PG_REGRESS_ + true, +#else + false, +#endif + NULL, NULL, NULL + }, + { + {"enable_xlog_mprotect", PGC_POSTMASTER, CUSTOM_OPTIONS, + gettext_noop("Protect memory corruption for xlog"), + NULL, + GUC_NOT_IN_SAMPLE, + }, + &enable_xlog_mprotect, +#ifdef _PG_REGRESS_ + true, +#else + false, +#endif + NULL, NULL, NULL + }, + { {"enable_cold_hot_router_print", PGC_USERSET, CUSTOM_OPTIONS, gettext_noop("Whether print cold hot router."), NULL diff --git a/src/backend/utils/misc/mls.c b/src/backend/utils/misc/mls.c index 8902bcaf..014e81b2 100644 --- a/src/backend/utils/misc/mls.c +++ b/src/backend/utils/misc/mls.c @@ -717,6 +717,8 @@ static void* mls_crypt_worker(void * input) for (;;) { + bool need_mprotect = false; + if (false == g_crypt_parellel_main_running) { break; @@ -747,7 +749,18 @@ static void* mls_crypt_worker(void * input) buf_need_encrypt = page_new + BLCKSZ; /* 2.2 do the encrypt */ + need_mprotect = enable_buffer_mprotect && + !BufferIsLocal(encrypt_element.buf_id) && + BufferIsValid(encrypt_element.buf_id); + if (need_mprotect) + { + BufDisableMemoryProtection(buf, false); + } ret = rel_crypt_page_encrypting_parellel(encrypt_element.algo_id, buf, buf_need_encrypt, page_new, encrypt_element.cryptkey, workerid); + if (need_mprotect) + { + BufEnableMemoryProtection(buf, false); + } /* 3. put it to crypted queue */ while (QueueIsFull(crypted_queue)) diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index e5cb868a..12d19740 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -485,6 +485,14 @@ #update_process_title = on +# - Memory Protection - + +#enable_buffer_mprotect = off +#enable_clog_mprotect = off +#enable_tlog_mprotect = off +#enable_xlog_mprotect = off + + # - Maintain GTS - #gts_maintain_option = 0 # range: 0-2. the default is 0. 
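The tqual.c hunk that follows is a consequence of the buffer guard: hint bits are normally set while the caller holds only a shared content lock (or just a pin), but a share-locked page stays read-only under enable_buffer_mprotect, so SetHintBits() has to release the lock and re-take it in exclusive mode before it may write. The sketch below (illustrative only, plain pthreads rather than the buffer manager; all names are made up) models that drop-and-reacquire pattern and its main caveat: it is not an atomic upgrade, so the decision made under the shared lock should be re-checked once the exclusive lock is held.

    #include <pthread.h>
    #include <stdio.h>

    static pthread_rwlock_t content_lock = PTHREAD_RWLOCK_INITIALIZER;
    static int hint_bit = 0;              /* stand-in for a hint bit on a page */

    static void
    set_hint(int value)
    {
        pthread_rwlock_rdlock(&content_lock);
        if (hint_bit != value)            /* decided under the shared lock */
        {
            /* page is read-only while share-locked: upgrade before writing */
            pthread_rwlock_unlock(&content_lock);
            pthread_rwlock_wrlock(&content_lock);
            if (hint_bit != value)        /* re-check: others ran in the gap */
                hint_bit = value;         /* guarded write */
        }
        pthread_rwlock_unlock(&content_lock);
    }

    int
    main(void)
    {
        set_hint(1);
        printf("hint_bit = %d\n", hint_bit);
        return 0;
    }

(Compile with cc -pthread.) The exclusive-lock path of LockBuffer() in the bufmgr.c hunk above also calls BufDisableMemoryProtection(), so the page is writable again by the time the store happens.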
diff --git a/src/backend/utils/time/tqual.c b/src/backend/utils/time/tqual.c index 9fa4ad40..247cabfe 100644 --- a/src/backend/utils/time/tqual.c +++ b/src/backend/utils/time/tqual.c @@ -256,7 +256,11 @@ SetHintBits(HeapTupleHeader tuple, Buffer buffer, } } #ifdef __SUPPORT_DISTRIBUTED_TRANSACTION__ - + if (enable_buffer_mprotect) + { + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + } if(infomask & HEAP_XMIN_COMMITTED) { diff --git a/src/include/access/lru.h b/src/include/access/lru.h index 01a05dd6..7d9980a3 100644 --- a/src/include/access/lru.h +++ b/src/include/access/lru.h @@ -215,7 +215,6 @@ typedef struct LruCtlData typedef LruCtlData *LruCtl; - #define PARTITION_LOCK_IDX(shared) ((shared)->num_slots) extern Size LruShmemSize(int nslots, int nlsns); @@ -244,6 +243,10 @@ extern bool LruScanDirCbReportPresence(LruCtl ctl, char *filename, int segpage, void *data); extern bool LruScanDirCbDeleteAll(LruCtl ctl, char *filename, int segpage, void *data); +extern void LruTlogEnableMemoryProtection(char *address); +extern void LruTlogDisableMemoryProtection(char *address); +extern void SetPageReadOnly(char *address); +extern void SetPageReadWrite(char *address); #endif /* SLRU_H */ diff --git a/src/include/access/slru.h b/src/include/access/slru.h index cfc559d0..3a38460b 100644 --- a/src/include/access/slru.h +++ b/src/include/access/slru.h @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * * slru.h - * Simple LRU buffering for transaction status logfiles + * Simple LRU buffering for transaction status logfiles * * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California @@ -30,10 +30,10 @@ * take no explicit notice of that fact in slru.c, except when comparing * segment and page numbers in SimpleLruTruncate (see PagePrecedes()). */ -#define SLRU_PAGES_PER_SEGMENT 32 +#define SLRU_PAGES_PER_SEGMENT 32 /* Maximum length of an SLRU name */ -#define SLRU_MAX_NAME_LENGTH 32 +#define SLRU_MAX_NAME_LENGTH 32 /* * Page status codes. Note that these do not include the "dirty" bit. @@ -43,10 +43,10 @@ */ typedef enum { - SLRU_PAGE_EMPTY, /* buffer is not in use */ - SLRU_PAGE_READ_IN_PROGRESS, /* page is being read in */ - SLRU_PAGE_VALID, /* page is valid and not being written */ - SLRU_PAGE_WRITE_IN_PROGRESS /* page is being written out */ + SLRU_PAGE_EMPTY, /* buffer is not in use */ + SLRU_PAGE_READ_IN_PROGRESS, /* page is being read in */ + SLRU_PAGE_VALID, /* page is valid and not being written */ + SLRU_PAGE_WRITE_IN_PROGRESS /* page is being written out */ } SlruPageStatus; /* @@ -54,54 +54,54 @@ typedef enum */ typedef struct SlruSharedData { - LWLock *ControlLock; - - /* Number of buffers managed by this SLRU structure */ - int num_slots; - - /* - * Arrays holding info for each buffer slot. Page number is undefined - * when status is EMPTY, as is page_lru_count. - */ - char **page_buffer; - SlruPageStatus *page_status; - bool *page_dirty; - int *page_number; - int *page_lru_count; - - /* - * Optional array of WAL flush LSNs associated with entries in the SLRU - * pages. If not zero/NULL, we must flush WAL before writing pages (true - * for pg_xact, false for multixact, pg_subtrans, pg_notify). group_lsn[] - * has lsn_groups_per_page entries per buffer slot, each containing the - * highest LSN known for a contiguous group of SLRU entries on that slot's - * page. 
- */ - XLogRecPtr *group_lsn; - int lsn_groups_per_page; - - /*---------- - * We mark a page "most recently used" by setting - * page_lru_count[slotno] = ++cur_lru_count; - * The oldest page is therefore the one with the highest value of - * cur_lru_count - page_lru_count[slotno] - * The counts will eventually wrap around, but this calculation still - * works as long as no page's age exceeds INT_MAX counts. - *---------- - */ - int cur_lru_count; - - /* - * latest_page_number is the page number of the current end of the log; - * this is not critical data, since we use it only to avoid swapping out - * the latest page. - */ - int latest_page_number; - - /* LWLocks */ - int lwlock_tranche_id; - char lwlock_tranche_name[SLRU_MAX_NAME_LENGTH]; - LWLockPadded *buffer_locks; + LWLock *ControlLock; + + /* Number of buffers managed by this SLRU structure */ + int num_slots; + + /* + * Arrays holding info for each buffer slot. Page number is undefined + * when status is EMPTY, as is page_lru_count. + */ + char **page_buffer; + SlruPageStatus *page_status; + bool *page_dirty; + int *page_number; + int *page_lru_count; + + /* + * Optional array of WAL flush LSNs associated with entries in the SLRU + * pages. If not zero/NULL, we must flush WAL before writing pages (true + * for pg_xact, false for multixact, pg_subtrans, pg_notify). group_lsn[] + * has lsn_groups_per_page entries per buffer slot, each containing the + * highest LSN known for a contiguous group of SLRU entries on that slot's + * page. + */ + XLogRecPtr *group_lsn; + int lsn_groups_per_page; + + /*---------- + * We mark a page "most recently used" by setting + * page_lru_count[slotno] = ++cur_lru_count; + * The oldest page is therefore the one with the highest value of + * cur_lru_count - page_lru_count[slotno] + * The counts will eventually wrap around, but this calculation still + * works as long as no page's age exceeds INT_MAX counts. + *---------- + */ + int cur_lru_count; + + /* + * latest_page_number is the page number of the current end of the log; + * this is not critical data, since we use it only to avoid swapping out + * the latest page. + */ + int latest_page_number; + + /* LWLocks */ + int lwlock_tranche_id; + char lwlock_tranche_name[SLRU_MAX_NAME_LENGTH]; + LWLockPadded *buffer_locks; } SlruSharedData; typedef SlruSharedData *SlruShared; @@ -112,26 +112,26 @@ typedef SlruSharedData *SlruShared; */ typedef struct SlruCtlData { - SlruShared shared; - - /* - * This flag tells whether to fsync writes (true for pg_xact and multixact - * stuff, false for pg_subtrans and pg_notify). - */ - bool do_fsync; - - /* - * Decide which of two page numbers is "older" for truncation purposes. We - * need to use comparison of TransactionIds here in order to do the right - * thing with wraparound XID arithmetic. - */ - bool (*PagePrecedes) (int, int); - - /* - * Dir is set during SimpleLruInit and does not change thereafter. Since - * it's always the same, it doesn't need to be in shared memory. - */ - char Dir[64]; + SlruShared shared; + + /* + * This flag tells whether to fsync writes (true for pg_xact and multixact + * stuff, false for pg_subtrans and pg_notify). + */ + bool do_fsync; + + /* + * Decide which of two page numbers is "older" for truncation purposes. We + * need to use comparison of TransactionIds here in order to do the right + * thing with wraparound XID arithmetic. + */ + bool (*PagePrecedes) (int, int); + + /* + * Dir is set during SimpleLruInit and does not change thereafter. 
Since + * it's always the same, it doesn't need to be in shared memory. + */ + char Dir[64]; } SlruCtlData; typedef SlruCtlData *SlruCtl; @@ -139,26 +139,29 @@ typedef SlruCtlData *SlruCtl; extern Size SimpleLruShmemSize(int nslots, int nlsns); extern void SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns, - LWLock *ctllock, const char *subdir, int tranche_id); -extern int SimpleLruZeroPage(SlruCtl ctl, int pageno); + LWLock *ctllock, const char *subdir, int tranche_id); +extern int SimpleLruZeroPage(SlruCtl ctl, int pageno); extern int SimpleLruReadPage(SlruCtl ctl, int pageno, bool write_ok, - TransactionId xid); + TransactionId xid); extern int SimpleLruReadPage_ReadOnly(SlruCtl ctl, int pageno, - TransactionId xid); + TransactionId xid); extern void SimpleLruWritePage(SlruCtl ctl, int slotno); extern void SimpleLruFlush(SlruCtl ctl, bool allow_redirtied); extern void SimpleLruTruncate(SlruCtl ctl, int cutoffPage); extern bool SimpleLruDoesPhysicalPageExist(SlruCtl ctl, int pageno); typedef bool (*SlruScanCallback) (SlruCtl ctl, char *filename, int segpage, - void *data); + void *data); extern bool SlruScanDirectory(SlruCtl ctl, SlruScanCallback callback, void *data); extern void SlruDeleteSegment(SlruCtl ctl, int segno); /* SlruScanDirectory public callbacks */ extern bool SlruScanDirCbReportPresence(SlruCtl ctl, char *filename, - int segpage, void *data); + int segpage, void *data); extern bool SlruScanDirCbDeleteAll(SlruCtl ctl, char *filename, int segpage, - void *data); + void *data); -#endif /* SLRU_H */ +extern void SlruClogEnableMemoryProtection(char *address); +extern void SlruClogDisableMemoryProtection(char *address); + +#endif /* SLRU_H */ diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index 9dfcd122..a0db442b 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -289,6 +289,9 @@ extern XLogRecPtr GetLastImportantRecPtr(void); extern void GetNextXidAndEpoch(TransactionId *xid, uint32 *epoch); extern void RemovePromoteSignalFiles(void); +extern void XlogEnableMemoryProtection(char *address); +extern void XlogDisableMemoryProtection(char *address); + extern bool CheckPromoteSignal(void); extern void WakeupRecovery(void); extern void SetWalWriterSleeping(bool sleeping); diff --git a/src/include/c.h b/src/include/c.h index d4a4033d..9e18db2b 100644 --- a/src/include/c.h +++ b/src/include/c.h @@ -651,6 +651,7 @@ typedef NameData *Name; #define MAXALIGN(LEN) TYPEALIGN(MAXIMUM_ALIGNOF, (LEN)) /* MAXALIGN covers only built-in types, not buffers */ #define BUFFERALIGN(LEN) TYPEALIGN(ALIGNOF_BUFFER, (LEN)) +#define BLOCKALIGN(LEN) TYPEALIGN(BLCKSZ, (LEN)) #define CACHELINEALIGN(LEN) TYPEALIGN(PG_CACHE_LINE_SIZE, (LEN)) #define TYPEALIGN_DOWN(ALIGNVAL,LEN) \ diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h index d4afa5d7..6eae2368 100644 --- a/src/include/storage/buf_internals.h +++ b/src/include/storage/buf_internals.h @@ -412,4 +412,7 @@ extern void AtEOXact_LocalBuffers(bool isCommit); #ifdef _MLS_ extern char * BufHdrGetBlockFunc(BufferDesc *buf); #endif + +extern void BufEnableMemoryProtection(char *address, bool localbuffer); +extern void BufDisableMemoryProtection(char *address, bool localbuffer); #endif /* BUFMGR_INTERNALS_H */ diff --git a/src/include/utils/guc.h b/src/include/utils/guc.h index ac00ae2d..5df24178 100644 --- a/src/include/utils/guc.h +++ b/src/include/utils/guc.h @@ -269,6 +269,10 @@ extern int32 g_TransferSpeed; /* slicent copy from */ extern bool 
g_enable_copy_silence; extern bool g_enable_user_authority_force_check; +extern bool enable_buffer_mprotect; +extern bool enable_clog_mprotect; +extern bool enable_tlog_mprotect; +extern bool enable_xlog_mprotect; extern int query_delay; #endif extern int log_min_error_statement; diff --git a/src/test/regress/expected/oracle.out b/src/test/regress/expected/oracle.out index 9a08138a..8cc3ca0d 100644 --- a/src/test/regress/expected/oracle.out +++ b/src/test/regress/expected/oracle.out @@ -2280,6 +2280,7 @@ ERROR: failed to set the requested LC_COLLATE value [Nls_sortR = tt_RU.utf8@iq drop table test_sort; \c postgres DROP DATABASE IF EXISTS regression_sort; +ERROR: database "regression_sort" is being accessed by other users SET client_encoding to default; -- test !=- operator set enable_oracle_compatible = off; diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out index 7f805e93..b0bd7fb4 100644 --- a/src/test/regress/expected/sysviews.out +++ b/src/test/regress/expected/sysviews.out @@ -76,7 +76,9 @@ select name, setting from pg_settings where name like 'enable%'; enable_audit_warning | off enable_auditlogger_warning | off enable_bitmapscan | on + enable_buffer_mprotect | on enable_check_password | off + enable_clog_mprotect | on enable_cls | on enable_cold_hot_router_print | off enable_cold_hot_visible | off @@ -127,9 +129,11 @@ select name, setting from pg_settings where name like 'enable%'; enable_statistic | on enable_subquery_shipping | on enable_tidscan | on + enable_tlog_mprotect | on enable_transparent_crypt | on enable_user_authority_force_check | off -(57 rows) + enable_xlog_mprotect | on +(61 rows) -- Test that the pg_timezone_names and pg_timezone_abbrevs views are -- more-or-less working. We can't test their contents in any great detail diff --git a/src/test/regress/output/xc_notrans_block.source b/src/test/regress/output/xc_notrans_block.source index fc060427..388b5081 100644 --- a/src/test/regress/output/xc_notrans_block.source +++ b/src/test/regress/output/xc_notrans_block.source @@ -153,6 +153,5 @@ select * from xc_tab1 order by id; \c regression clean connection to all for database xc_db; drop database xc_db; -ERROR: database "xc_db" is being accessed by other users drop tablespace xc_testspace1; ERROR: tablespace "xc_testspace1" is not empty diff --git a/src/test/regress/output/xc_notrans_block_1.source b/src/test/regress/output/xc_notrans_block_1.source index 3e79eb70..574cdc77 100644 --- a/src/test/regress/output/xc_notrans_block_1.source +++ b/src/test/regress/output/xc_notrans_block_1.source @@ -156,5 +156,4 @@ select * from xc_tab1 order by id; \c regression clean connection to all for database xc_db; drop database xc_db; -ERROR: database "xc_db" is being accessed by other users drop tablespace xc_testspace1; From f3a496b6dfae6d7d6ccc280f8dd0a3ad44e98778 Mon Sep 17 00:00:00 2001 From: jackywpxie Date: Fri, 5 Feb 2021 21:07:15 +0800 Subject: [PATCH 166/578] jacky/feature/MemoryOptimization_Tbase_v2.15 (merge request !134) Squash merge branch 'jacky/feature/MemoryOptimization_Tbase_v2.15' into 'Tbase_v2.15' * modified addcording to suggestion. * Empty commit for add mr reviewer * fixed merged bug. * Merge branch 'Tbase_v2.15' into jacky/feature/MemoryOptimization_Tbase_v2.15 * refactor * delete debug code of catcache * refactor * delete debug code for relation LRU: RelCacheLogMemorySize is deleted. * delete debug code of locator * free Locator's memory * refator for relation LRU * delete debug code of memory details * add comment. 
* change the default value of pool_session_memory_limit t0 10 * support pool_session_memory_limit = -1 for forward compatibility. * fixed a warning * correct spelling errors * fixed bugs for number_replaced_relations * support show session_memory_size; * support number_replaced_relations * update max_relcache_relations * set enable_memory_optimization to on while doing regress test. * ignore system relation while do LRU replacing. * ignore unit * update conn->inCursor * ignore 5 bytes: * add condition of enable_memory_optimization * close enable_memory_optimization. * support enable_memory_optimization * fixed the bug for create table. * support replacing relations in RelationLRUInsert() * support create and drop table. * add RelationLRUInsert and RelationLRUDelete. * rename the name to number_replaced_relations * add cache memory optimization GUC para. * add pg_session_memory_detail() * change the unit of memory. * add tbase_memory_tools * add memory size log * add memory debug information. * delete PoolConnDeadtime * add the unit of session_memory_size * correct the unit of pool_session_memory_limit. * add debug info for memorycontext. --- contrib/Makefile | 1 + contrib/tbase_memory_tools/Makefile | 23 + .../tbase_memory_tools--1.0.sql | 29 + .../tbase_memory_tools/tbase_memory_tools.c | 277 +++++++ .../tbase_memory_tools.control | 5 + .../tbase_pooler_stat--1.0.sql | 1 - contrib/tbase_pooler_stat/tbase_pooler_stat.c | 8 +- src/backend/pgxc/locator/locator.c | 8 + src/backend/pgxc/pool/pgxcnode.c | 4 +- src/backend/pgxc/pool/poolmgr.c | 65 +- src/backend/utils/cache/catcache.c | 1 - src/backend/utils/cache/relcache.c | 105 ++- src/backend/utils/misc/guc.c | 46 +- src/backend/utils/misc/postgresql.conf.sample | 5 + src/include/utils/guc.h | 4 + src/include/utils/rel.h | 713 +++++++++--------- src/include/utils/relcache.h | 11 + src/test/regress/expected/sysviews.out | 3 +- 18 files changed, 916 insertions(+), 393 deletions(-) create mode 100644 contrib/tbase_memory_tools/Makefile create mode 100644 contrib/tbase_memory_tools/tbase_memory_tools--1.0.sql create mode 100644 contrib/tbase_memory_tools/tbase_memory_tools.c create mode 100644 contrib/tbase_memory_tools/tbase_memory_tools.control diff --git a/contrib/Makefile b/contrib/Makefile index 494da1e1..22110f25 100644 --- a/contrib/Makefile +++ b/contrib/Makefile @@ -47,6 +47,7 @@ SUBDIRS = \ spi \ tablefunc \ tbase_gts_tools \ + tbase_memory_tools \ tcn \ test_decoding \ tsm_system_rows \ diff --git a/contrib/tbase_memory_tools/Makefile b/contrib/tbase_memory_tools/Makefile new file mode 100644 index 00000000..6fa3fbca --- /dev/null +++ b/contrib/tbase_memory_tools/Makefile @@ -0,0 +1,23 @@ +# contrib/tbase_memory_tools/Makefile +MODULES = tbase_memory_tools + +## 扩展名称; +EXTENSION = tbase_memory_tools + +## 扩展安装的SQL文件; +DATA = tbase_memory_tools--1.0.sql + +## 扩展描述; +PGFILEDESC = "tbase_memory_tools - memory wrapper for Tbase" + +### 以下为Pg构建扩展相关命令; +ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) ## 环境变量参数加载; +else +subdir = contrib/tbase_memory_tools +top_builddir = ../.. 
+include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif diff --git a/contrib/tbase_memory_tools/tbase_memory_tools--1.0.sql b/contrib/tbase_memory_tools/tbase_memory_tools--1.0.sql new file mode 100644 index 00000000..80f9b8a3 --- /dev/null +++ b/contrib/tbase_memory_tools/tbase_memory_tools--1.0.sql @@ -0,0 +1,29 @@ +/* contrib/tbase_memory/tbase_memory_tools--1.0.sql */ + +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "create EXTENSION tbase_memory_tools" to load this file. \quit + +-- +-- pg_node_memory_detail() +-- +CREATE FUNCTION pg_node_memory_detail( + OUT nodename text, + OUT pid int, + OUT memorytype text, + OUT memorykbytes int) +RETURNS SETOF record +AS 'MODULE_PATHNAME', 'pg_node_memory_detail' +LANGUAGE C STRICT PARALLEL SAFE; + +-- +-- pg_session_memory_detail() +-- +CREATE FUNCTION pg_session_memory_detail( + OUT contextname text, + OUT contextlevel int, + OUT parent text, + OUT totalsize int, + OUT freesize int) +RETURNS SETOF record +AS 'MODULE_PATHNAME', 'pg_session_memory_detail' +LANGUAGE C STRICT PARALLEL SAFE; \ No newline at end of file diff --git a/contrib/tbase_memory_tools/tbase_memory_tools.c b/contrib/tbase_memory_tools/tbase_memory_tools.c new file mode 100644 index 00000000..d722e2a9 --- /dev/null +++ b/contrib/tbase_memory_tools/tbase_memory_tools.c @@ -0,0 +1,277 @@ +#include "postgres.h" +#include "fmgr.h" +#include "funcapi.h" +#include "miscadmin.h" +#include "access/htup_details.h" +#include "catalog/pg_type.h" +#include "catalog/namespace.h" +#include "utils/timestamp.h" +#include "utils/varlena.h" +#include "utils/builtins.h" +#include "utils/elog.h" +#include "utils/memutils.h" + +#ifdef PG_MODULE_MAGIC +PG_MODULE_MAGIC; +#endif + +#define LINUX_PAGE_SIZE 4096 +#define MAX_MEMORY_DETAIL 2048 + +typedef struct +{ + char *memory_context_name; + int level; + char *parent_name; + int parent_index; + long self_total_space; + long self_free_space; + long all_total_space; + long all_free_space; +} MemoryContextDetail; + +typedef struct +{ + int current; + int length; + MemoryContextDetail details[MAX_MEMORY_DETAIL]; +} SessionMemoryContexts; + +int get_memory_detail(MemoryContext mctx, + MemoryContext parent, + int level, + int ind_on_parent, + const int ind_on_stat, + SessionMemoryContexts *contexts); + + +/* + * pg_node_memory_detail + * + * node memory detail + */ +PG_FUNCTION_INFO_V1(pg_node_memory_detail); + +Datum +pg_node_memory_detail(PG_FUNCTION_ARGS) +{ + FuncCallContext *fctx; + + if (!superuser()) + { + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + (errmsg("must be superuser to use memory functions")))); + } + + if (SRF_IS_FIRSTCALL()) + { + MemoryContext mctx; + TupleDesc tupdesc; + + fctx = SRF_FIRSTCALL_INIT(); + mctx = MemoryContextSwitchTo(fctx->multi_call_memory_ctx); + + /* Build a tuple descriptor for our result type */ + if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) + { + elog(ERROR, "return type must be a row type"); + } + + fctx->max_calls = 1; + fctx->tuple_desc = tupdesc; + + MemoryContextSwitchTo(mctx); + } + + fctx = SRF_PERCALL_SETUP(); + + if (fctx->call_cntr < fctx->max_calls) + { + HeapTuple resultTuple; + Datum result; + Datum values[4]; + bool nulls[4]; + int64 size = 0; + Size totalPages = 0; + Size rssPages = 0; + Size sharePages = 0; + char file[MAXPGPATH] = {0}; + char buf[MAXPGPATH] = {0}; + FILE *handle = NULL; + + memset(values, 0, sizeof(values)); + memset(nulls, 0, sizeof(nulls)); + + 
values[0] = CStringGetTextDatum(PGXCNodeName); + values[1] = Int64GetDatum(MyProcPid); + values[2] = CStringGetTextDatum("process_used_memory"); + + snprintf(file, MAXPGPATH, "/proc/%d/statm", MyProcPid); + handle = fopen(file, "r"); + if (handle != NULL && fgets(buf, MAXPGPATH, handle) > 0) + { + if (3 == sscanf(buf, "%lu %lu %lu", &totalPages, &rssPages, &sharePages)) + { + size = ((rssPages - sharePages) * LINUX_PAGE_SIZE) / 1024; + } + } + values[3] = Int64GetDatum(size); + + /* Build and return the result tuple. */ + resultTuple = heap_form_tuple(fctx->tuple_desc, values, nulls); + result = HeapTupleGetDatum(resultTuple); + + SRF_RETURN_NEXT(fctx, result); + } + else + { + SRF_RETURN_DONE(fctx); + } +} + +/* + * pg_session_memory_detail + * + * session memory detail + */ +PG_FUNCTION_INFO_V1(pg_session_memory_detail); + +Datum +pg_session_memory_detail(PG_FUNCTION_ARGS) +{ + FuncCallContext *fctx; + + if (!superuser()) + { + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + (errmsg("must be superuser to use memory functions")))); + } + + if (SRF_IS_FIRSTCALL()) + { + MemoryContext mctx; + TupleDesc tupdesc; + SessionMemoryContexts *contexts; + + fctx = SRF_FIRSTCALL_INIT(); + mctx = MemoryContextSwitchTo(fctx->multi_call_memory_ctx); + + /* Build a tuple descriptor for our result type */ + if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) + { + elog(ERROR, "return type must be a row type"); + } + fctx->tuple_desc = tupdesc; + + contexts = (SessionMemoryContexts *) palloc0(sizeof(SessionMemoryContexts)); + contexts->current = 0; + contexts->length = 0; + contexts->details[0].memory_context_name = pstrdup("TopMemoryContext"); + contexts->details[0].level = 0; + contexts->details[0].parent_name = NULL; + contexts->details[0].parent_index = 0; + (void) get_memory_detail(TopMemoryContext, NULL, 0, 0, 0, contexts); + + fctx->user_fctx = contexts; + fctx->max_calls = contexts->length; + + MemoryContextSwitchTo(mctx); + } + + fctx = SRF_PERCALL_SETUP(); + + if (fctx->call_cntr < fctx->max_calls) + { + HeapTuple resultTuple; + Datum result; + Datum values[5]; + bool nulls[5]; + SessionMemoryContexts *contexts = fctx->user_fctx; + MemoryContextDetail *detail = &contexts->details[fctx->call_cntr]; + + memset(values, 0, sizeof(values)); + memset(nulls, 0, sizeof(nulls)); + + values[0] = CStringGetTextDatum(detail->memory_context_name); + values[1] = Int64GetDatum(detail->level); + if (detail->parent_name == NULL) + { + nulls[2] = true; + } + else + { + values[2] = CStringGetTextDatum(detail->parent_name); + } + values[3] = Int64GetDatum(detail->all_total_space); + values[3] = Int64GetDatum(detail->all_free_space); + + /* Build and return the result tuple. */ + resultTuple = heap_form_tuple(fctx->tuple_desc, values, nulls); + result = HeapTupleGetDatum(resultTuple); + + SRF_RETURN_NEXT(fctx, result); + } + else + { + SRF_RETURN_DONE(fctx); + } +} + +/* + * get memory details of self and children. + */ +int +get_memory_detail(MemoryContext mctx, + MemoryContext parent, + int level, + int ind_on_parent, + const int ind_on_stat, + SessionMemoryContexts *contexts) +{ + MemoryContext iter; + int child_index = 0; + int itr_indx_on_stat = 0; + int next_ind_on_stat = 0; + MemoryContextDetail *stat = NULL; + + if (ind_on_stat >= MAX_MEMORY_DETAIL) + { + elog(WARNING, "too many memory contexts!"); + return ind_on_stat; + } + + stat = &contexts->details[ind_on_stat]; + stat->memory_context_name = pstrdup(mctx->name); + stat->parent_name = parent ? 
pstrdup(parent->name) : NULL; + stat->parent_index = ind_on_parent; + stat->level = level; + stat->self_free_space = -1; + stat->self_total_space = -1; + if (IsA(mctx,AllocSetContext)) + { + AllocSetStats_Output(mctx, &stat->self_total_space, &stat->self_free_space); + stat->all_free_space = stat->self_free_space; + stat->all_total_space = stat->self_total_space; + } + + itr_indx_on_stat = ind_on_stat + 1; + contexts->length += 1; + child_index = 0; + iter = mctx->firstchild; + while (iter) + { + next_ind_on_stat = get_memory_detail(iter, mctx, level+1, child_index, itr_indx_on_stat, contexts); + iter = iter->nextchild; + + stat->all_free_space += contexts->details[itr_indx_on_stat].all_free_space; + stat->all_total_space += contexts->details[itr_indx_on_stat].all_total_space; + + itr_indx_on_stat = next_ind_on_stat; + + child_index++; + } + + return itr_indx_on_stat; +} diff --git a/contrib/tbase_memory_tools/tbase_memory_tools.control b/contrib/tbase_memory_tools/tbase_memory_tools.control new file mode 100644 index 00000000..0c1a3bbd --- /dev/null +++ b/contrib/tbase_memory_tools/tbase_memory_tools.control @@ -0,0 +1,5 @@ +# tbase_memory_tools extension +comment = 'memory wrapper for TBase' +default_version = '1.0' +module_pathname = '$libdir/tbase_memory_tools' +relocatable = true diff --git a/contrib/tbase_pooler_stat/tbase_pooler_stat--1.0.sql b/contrib/tbase_pooler_stat/tbase_pooler_stat--1.0.sql index 5ee8e1e6..93b0be11 100644 --- a/contrib/tbase_pooler_stat/tbase_pooler_stat--1.0.sql +++ b/contrib/tbase_pooler_stat/tbase_pooler_stat--1.0.sql @@ -32,7 +32,6 @@ CREATE OR REPLACE FUNCTION tbase_get_pooler_conn_statistics( OUT warming_cnt int4, OUT query_cnt int4, OUT exceed_keepalive_cnt int4, - OUT exceed_deadtime_cnt int4, OUT exceed_maxlifetime_cnt int4 ) RETURNS SETOF record diff --git a/contrib/tbase_pooler_stat/tbase_pooler_stat.c b/contrib/tbase_pooler_stat/tbase_pooler_stat.c index d85e5405..ac77982b 100644 --- a/contrib/tbase_pooler_stat/tbase_pooler_stat.c +++ b/contrib/tbase_pooler_stat/tbase_pooler_stat.c @@ -182,7 +182,7 @@ tbase_reset_pooler_cmd_statistics(PG_FUNCTION_ARGS) Datum tbase_get_pooler_conn_statistics(PG_FUNCTION_ARGS) { -#define LIST_POOLER_CONN_STATISTICS_COLUMNS 12 +#define LIST_POOLER_CONN_STATISTICS_COLUMNS 11 FuncCallContext *funcctx = NULL; int32 ret = 0; Pooler_ConnState *status = NULL; @@ -222,9 +222,7 @@ tbase_get_pooler_conn_statistics(PG_FUNCTION_ARGS) INT4OID, -1, 0); TupleDescInitEntry(tupdesc, (AttrNumber) 10, "exceed_keepalive_cnt", INT4OID, -1, 0); - TupleDescInitEntry(tupdesc, (AttrNumber) 11, "exceed_deadtime_cnt", - INT4OID, -1, 0); - TupleDescInitEntry(tupdesc, (AttrNumber) 12, "exceed_maxlifetime_cnt", + TupleDescInitEntry(tupdesc, (AttrNumber) 11, "exceed_maxlifetime_cnt", INT4OID, -1, 0); funcctx->tuple_desc = BlessTupleDesc(tupdesc); @@ -279,7 +277,6 @@ tbase_get_pooler_conn_statistics(PG_FUNCTION_ARGS) nulls[8] = true; nulls[9] = true; nulls[10] = true; - nulls[11] = true; } else { @@ -292,7 +289,6 @@ tbase_get_pooler_conn_statistics(PG_FUNCTION_ARGS) values[8] = UInt32GetDatum(pq_getmsgint(status->buf, sizeof(uint32))); values[9] = UInt32GetDatum(pq_getmsgint(status->buf, sizeof(uint32))); values[10] = UInt32GetDatum(pq_getmsgint(status->buf, sizeof(uint32))); - values[11] = UInt32GetDatum(pq_getmsgint(status->buf, sizeof(uint32))); status->node_cursor--; } diff --git a/src/backend/pgxc/locator/locator.c b/src/backend/pgxc/locator/locator.c index 76c72d85..9ce0b468 100644 --- a/src/backend/pgxc/locator/locator.c +++ 
b/src/backend/pgxc/locator/locator.c @@ -915,6 +915,14 @@ FreeRelationLocInfo(RelationLocInfo *relationLocInfo) { if (relationLocInfo->partAttrName) pfree(relationLocInfo->partAttrName); + +#ifdef __COLD_HOT__ + if (relationLocInfo->secAttrName) + pfree(relationLocInfo->secAttrName); +#endif + + list_free(relationLocInfo->rl_nodeList); + pfree(relationLocInfo); } } diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c index eb400fb4..5811f647 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -5532,9 +5532,11 @@ PGXCNodeSendShowQuery(NODE_CONNECTION *conn, const char *sql_command) resStatus = PQresultStatus(result); if (resStatus == PGRES_TUPLES_OK || resStatus == PGRES_COMMAND_OK) { - snprintf(number, 128, "%s", PQgetvalue(result, 0, 0)); + /* ignore unit */ + snprintf(number, result->tuples[0][0].len, "%s", PQgetvalue(result, 0, 0)); } PQclear(result); + return number; } diff --git a/src/backend/pgxc/pool/poolmgr.c b/src/backend/pgxc/pool/poolmgr.c index 6efe045c..b83e273f 100644 --- a/src/backend/pgxc/pool/poolmgr.c +++ b/src/backend/pgxc/pool/poolmgr.c @@ -83,7 +83,6 @@ int PoolMaintenanceTimeout = 30; int PoolSizeCheckGap = 120; /* max check memory size gap, in seconds */ int PoolConnMaxLifetime = 600; /* max lifetime of a pooled connection, in seconds */ int PoolWarmConnMaxLifetime = 7200; /* max lifetime of a warm-needed pooled connection, in seconds */ -int PoolConnDeadtime = 1800; /* a pooled connection must be closed when lifetime exceed this, in seconds */ int PoolMaxMemoryLimit = 10; int PoolConnectTimeOut = 10; int PoolScaleFactor = 2; @@ -4755,8 +4754,7 @@ release_connection(DatabasePool *dbPool, PGXCNodePoolSlot *slot, } } else if (((nodePool->freeSize > 0) && (nodePool->nwarming + nodePool->nquery) > MinFreeSize) || - (difftime(now, slot->created) >= PoolConnMaxLifetime) || - ((difftime(now, slot->created) >= PoolConnDeadtime) && (PoolConnDeadtime > PoolConnMaxLifetime))) + (difftime(now, slot->created) >= PoolConnMaxLifetime)) { force_destroy = true; if (PoolConnectDebugPrint) @@ -4777,7 +4775,6 @@ release_connection(DatabasePool *dbPool, PGXCNodePoolSlot *slot, pooler_async_warm_connection(dbPool, slot, nodePool, node); grow_pool(dbPool, nodeidx, node, bCoord); } - else { if ((difftime(now, slot->checked) >= PoolSizeCheckGap) && !IS_ASYNC_PIPE_FULL()) @@ -5318,8 +5315,8 @@ pooler_handle_subthread_log(bool is_pooler_exit) */ static void PoolerLoop(void) -{// #lizard forgives - bool warme_initd = false; +{ + bool warm_inited = false; StringInfoData input_message; int maxfd = MaxConnections + 1024; struct pollfd *pool_fd; @@ -5589,10 +5586,10 @@ PoolerLoop(void) } /* create preload database pooler */ - if (!warme_initd) + if (!warm_inited) { connect_pools(); - warme_initd = true; + warm_inited = true; } pooler_pools_warm(); @@ -5828,7 +5825,6 @@ shrink_pool(DatabasePool *pool) { /* no need to shrik warmed slot, only discard them when they use too much memroy */ if (!slot->bwarmed && ((difftime(now, slot->released) > PoolConnKeepAlive) || - (difftime(now, slot->created) > PoolConnDeadtime) || (difftime(now, slot->created) >= PoolConnMaxLifetime))) { if (PoolConnectDebugPrint) @@ -8266,13 +8262,18 @@ void *pooler_async_utility_thread(void *arg) case COMMAND_CONNECTION_WARM: { CommandId commandID = InvalidCommandId; - ret = PGXCNodeSendSetQuery((NODE_CONNECTION *)pWarmInfo->slot->conn, "set warm_shared_buffer to true;", NULL, 0, &pWarmInfo->set_query_status, &commandID); + + ret = PGXCNodeSendSetQuery( + 
(NODE_CONNECTION *) pWarmInfo->slot->conn, + "set warm_shared_buffer to true;", + NULL, + 0, + &pWarmInfo->set_query_status, &commandID); /* only set warm flag when warm succeed */ if (0 == ret) { pWarmInfo->slot->bwarmed = true; } - } break; @@ -8280,14 +8281,44 @@ void *pooler_async_utility_thread(void *arg) { int mbytes = 0; char *size = NULL; - size = PGXCNodeSendShowQuery((NODE_CONNECTION *)pWarmInfo->slot->conn, "show session_memory_size;"); + CommandId commandID = InvalidCommandId; + + (void) PGXCNodeSendSetQuery( + (NODE_CONNECTION *) pWarmInfo->slot->conn, + "set remotetype to application;", + NULL, + 0, + &pWarmInfo->set_query_status, &commandID); + + size = PGXCNodeSendShowQuery( + (NODE_CONNECTION *) pWarmInfo->slot->conn, + "show session_memory_size;"); pWarmInfo->cmd = COMMAND_JUDGE_CONNECTION_MEMSIZE; mbytes = atoi(size); - if (mbytes >= PoolMaxMemoryLimit) + if (PoolMaxMemoryLimit > 0 && mbytes >= PoolMaxMemoryLimit) { pWarmInfo->cmd = COMMAND_CONNECTION_NEED_CLOSE; } pWarmInfo->size = mbytes; + + if (IS_PGXC_COORDINATOR) + { + (void) PGXCNodeSendSetQuery( + (NODE_CONNECTION *) pWarmInfo->slot->conn, + "set remotetype to coordinator;", + NULL, + 0, + &pWarmInfo->set_query_status, &commandID); + } + else + { + (void) PGXCNodeSendSetQuery( + (NODE_CONNECTION *) pWarmInfo->slot->conn, + "set remotetype to datanode;", + NULL, + 0, + &pWarmInfo->set_query_status, &commandID); + } } break; @@ -11318,7 +11349,6 @@ handle_get_conn_statistics(PoolAgent *agent) uint32 total_node_cnt_offset = 0; uint32 exceed_keepalive_cnt = 0; - uint32 exceed_deadtime_cnt = 0; uint32 exceed_maxlifetime_cnt = 0; int i = 0; PGXCNodePoolSlot *slot = NULL; @@ -11357,7 +11387,6 @@ handle_get_conn_statistics(PoolAgent *agent) /* reset statistics count */ exceed_keepalive_cnt = 0; - exceed_deadtime_cnt = 0; exceed_maxlifetime_cnt = 0; /* statistical connection life cycle */ if (node_pool->slot) @@ -11370,11 +11399,6 @@ handle_get_conn_statistics(PoolAgent *agent) exceed_keepalive_cnt++; } - if (difftime(now, slot->created) > PoolConnDeadtime) - { - exceed_deadtime_cnt++; - } - if (difftime(now, slot->created) >= PoolConnMaxLifetime) { exceed_maxlifetime_cnt++; @@ -11383,7 +11407,6 @@ handle_get_conn_statistics(PoolAgent *agent) } pq_sendint(&buf, exceed_keepalive_cnt, sizeof(uint32)); - pq_sendint(&buf, exceed_deadtime_cnt, sizeof(uint32)); pq_sendint(&buf, exceed_maxlifetime_cnt, sizeof(uint32)); } diff --git a/src/backend/utils/cache/catcache.c b/src/backend/utils/cache/catcache.c index bbe4710e..541174b0 100644 --- a/src/backend/utils/cache/catcache.c +++ b/src/backend/utils/cache/catcache.c @@ -91,7 +91,6 @@ static CatCTup *CatalogCacheCreateEntry(CatCache *cache, HeapTuple ntp, bool negative); static HeapTuple build_dummy_tuple(CatCache *cache, int nkeys, ScanKey skeys); - /* * internal support functions */ diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c index 98acab3e..eb895929 100644 --- a/src/backend/utils/cache/relcache.c +++ b/src/backend/utils/cache/relcache.c @@ -114,6 +114,11 @@ #define RELCACHE_INIT_FILEMAGIC 0x573266 /* version ID value */ +bool enable_memory_optimization = false; +int max_relcache_relations = 2000; +int number_replaced_relations = 10; +RelCacheHeader *RelCacheHdr = NULL; + /* * hardcoded tuple descriptors, contents generated by genbki.pl */ @@ -212,13 +217,19 @@ do { \ Assert(replace_allowed); \ hentry->reldesc = (RELATION); \ if (RelationHasReferenceCountZero(_old_rel)) \ + { \ + RelationLRUDelete(_old_rel); \ 
RelationDestroyRelation(_old_rel, false); \ + } \ else if (!IsBootstrapProcessingMode()) \ elog(WARNING, "leaking still-referenced relcache entry for \"%s\"", \ RelationGetRelationName(_old_rel)); \ } \ else \ + { \ hentry->reldesc = (RELATION); \ + } \ + RelationLRUInsert(RELATION); \ } while(0) #define RelationIdCacheLookup(ID, RELATION) \ @@ -242,6 +253,8 @@ do { \ if (hentry == NULL) \ elog(WARNING, "failed to delete relcache entry for OID %u", \ (RELATION)->rd_id); \ + \ + RelationLRUDelete(RELATION); \ } while(0) @@ -608,10 +621,12 @@ RelationBuildTupleDesc(Relation relation) if (attp->atthasdef) { if (attrdef == NULL) + { attrdef = (AttrDefault *) MemoryContextAllocZero(CacheMemoryContext, relation->rd_rel->relnatts * sizeof(AttrDefault)); + } attrdef[ndef].adnum = attnum; attrdef[ndef].adbin = NULL; ndef++; @@ -637,10 +652,12 @@ RelationBuildTupleDesc(Relation relation) Datum missval; if (attrmiss == NULL) + { attrmiss = (AttrMissing *) MemoryContextAllocZero(CacheMemoryContext, relation->rd_rel->relnatts * sizeof(AttrMissing)); + } missval = array_get_element(missingval, 1, @@ -1523,6 +1540,7 @@ RelationBuildDesc(Oid targetRelId, bool insertIt) * initialize the tuple descriptor (relation->rd_att). */ RelationBuildTupleDesc(relation); + #ifdef __TBASE__ } #endif @@ -1571,6 +1589,7 @@ RelationBuildDesc(Oid targetRelId, bool insertIt) #endif #endif + if (relation->rd_rel->relrowsecurity) { /* See comments near RelationBuildRuleLocok for details */ @@ -2387,6 +2406,75 @@ RelationHasGTS(Relation rel) return has; } +/* + * Insert relation elem to RelCache LRU list. + * If the number of elems in RelCache exceeds MAX_RELCACHE_RELATIONS,it will + * replace relation elems from RelCache LRU list。 + */ +void +RelationLRUInsert(Relation rel) +{ + if (!enable_memory_optimization) + { + return; + } + + if (RelCacheHdr->rh_ntup >= RelCacheHdr->rh_maxtup) + { + dlist_head *head; + dlist_iter iter; + int replaced = 0; + + Assert(number_replaced_relations > 0); + head = &RelCacheHdr->rh_lrulist; + for (iter.end = &head->head, + iter.cur = iter.end->prev ? iter.end->prev : iter.end; + iter.cur != iter.end;) + { + Relation relation; + + relation = dlist_container(RelationData, rd_lru_list_elem, iter.cur); + iter.cur = iter.cur->prev; + + if (IsSystemRelation(relation)) + { + continue; + } + + if (relation->rd_refcnt == 0) + { + RelationCacheDelete(relation); + RelationDestroyRelation(relation, false); + + replaced += 1; + if (replaced == number_replaced_relations) + { + Assert(RelCacheHdr->rh_ntup < RelCacheHdr->rh_maxtup); + break; + } + } + } + } + + dlist_push_head(&RelCacheHdr->rh_lrulist, &rel->rd_lru_list_elem); + RelCacheHdr->rh_ntup += 1; +} + +/* + * Remove relation elem from Relcache LRU list. 
+ */ +void +RelationLRUDelete(Relation rel) +{ + if (!enable_memory_optimization) + { + return; + } + + dlist_delete(&rel->rd_lru_list_elem); + RelCacheHdr->rh_ntup -= 1; +} + /* ---------------------------------------------------------------- * cache invalidation support routines * ---------------------------------------------------------------- @@ -2650,6 +2738,10 @@ RelationDestroyRelation(Relation relation, bool remember_tupdesc) pfree(relation->rd_partcheck); if (relation->rd_fdwroutine) pfree(relation->rd_fdwroutine); +#ifdef PGXC + if (relation->rd_locator_info) + FreeRelationLocInfo(relation->rd_locator_info); +#endif pfree(relation); } @@ -2915,6 +3007,10 @@ RelationClearRelation(Relation relation, bool rebuild) SWAPFIELD(MemoryContext, rd_pdcxt); } + if (enable_memory_optimization) + { + SWAPFIELD(dlist_node, rd_lru_list_elem); + } #undef SWAPFIELD /* And now we can throw away the temporary entry */ @@ -3646,7 +3742,6 @@ RelationBuildLocalRelation(const char *relname, else rel->rd_rel->relfilenode = relfilenode; - RelationInitLockInfo(rel); /* see lmgr.c */ RelationInitPhysicalAddr(rel); @@ -3838,6 +3933,14 @@ RelationCacheInitialize(void) RelationIdCache = hash_create("Relcache by OID", INITRELCACHESIZE, &ctl, HASH_ELEM | HASH_BLOBS); + if (enable_memory_optimization && RelCacheHdr == NULL) + { + RelCacheHdr = (RelCacheHeader *) palloc0(sizeof(RelCacheHeader)); + RelCacheHdr->rh_ntup = 0; + RelCacheHdr->rh_maxtup = max_relcache_relations; + dlist_init(&RelCacheHdr->rh_lrulist); + } + /* * relation mapper needs to be initialized too */ diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 97a0b26f..8288cf36 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -2763,6 +2763,19 @@ static struct config_bool ConfigureNamesBool[] = false, NULL, NULL, NULL }, + { + {"enable_memory_optimization", PGC_POSTMASTER, RESOURCES, + gettext_noop("enable session cache memory control"), + NULL + }, + &enable_memory_optimization, +#ifdef _PG_REGRESS_ + true, +#else + false, +#endif + NULL, NULL, NULL + }, #endif @@ -3791,6 +3804,28 @@ static struct config_int ConfigureNamesInt[] = 0, 0, 31536000, NULL, NULL, NULL }, + { + {"max_relcache_relations", PGC_POSTMASTER, RESOURCES, + gettext_noop("max relcache relations per session."), + NULL + }, + &max_relcache_relations, +#ifdef _PG_REGRESS_ + 500, 500, INT_MAX, +#else + 2000, 500, INT_MAX, +#endif + NULL, NULL, NULL + }, + { + {"number_replaced_relations", PGC_POSTMASTER, RESOURCES, + gettext_noop("max relcache relations while replacing."), + NULL + }, + &number_replaced_relations, + 10, 1, 500, + NULL, NULL, NULL + }, #endif { {"log_rotation_age", PGC_SIGHUP, LOGGING_WHERE, @@ -4344,10 +4379,10 @@ static struct config_int ConfigureNamesInt[] = {"pool_session_memory_limit", PGC_SIGHUP, DATA_NODES, gettext_noop("Datanode session max memory context size."), gettext_noop("Exceed limit will be closed."), - GUC_UNIT_S + GUC_UNIT_MB }, &PoolMaxMemoryLimit, - 10, 1, 10000, + 10, -1, 10000, NULL, NULL, NULL }, { @@ -4402,7 +4437,8 @@ static struct config_int ConfigureNamesInt[] = { {"session_memory_size", PGC_USERSET, RESOURCES_MEM, gettext_noop("Used to get the total memory size of the session, in M Bytes."), - gettext_noop("Used to get the total memory size of the session, in M Bytes.") + gettext_noop("Used to get the total memory size of the session, in M Bytes."), + GUC_UNIT_MB }, &g_TotalMemorySize, 0, 0, INT_MAX, @@ -5968,7 +6004,7 @@ static struct config_enum ConfigureNamesEnum[] = #ifdef PGXC { - 
{"remotetype", PGC_BACKEND, CONN_AUTH, + {"remotetype", PGC_USERSET, CONN_AUTH, gettext_noop("Sets the type of Postgres-XL remote connection"), NULL }, @@ -13447,7 +13483,7 @@ show_total_memorysize(void) int32 size; static char buf[64]; size = get_total_memory_size(); - snprintf(buf, sizeof(buf), "%d", size); + snprintf(buf, sizeof(buf), "%dM", size); return buf; } #endif diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index 12d19740..c03c59df 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -502,6 +502,11 @@ # 2: check the correctness of the GTS of tuples by referring to # tlog, and reset it if it is wrong while doing vacuum. +# - Memory Optimization - +#enable_memory_optimization = false +#max_relcache_relations = 2000 +#number_replaced_relations = 10 + #------------------------------------------------------------------------------ # RUNTIME STATISTICS #------------------------------------------------------------------------------ diff --git a/src/include/utils/guc.h b/src/include/utils/guc.h index 5df24178..7ae45b95 100644 --- a/src/include/utils/guc.h +++ b/src/include/utils/guc.h @@ -274,6 +274,10 @@ extern bool enable_clog_mprotect; extern bool enable_tlog_mprotect; extern bool enable_xlog_mprotect; extern int query_delay; + +extern bool enable_memory_optimization; +extern int max_relcache_relations; +extern int number_replaced_relations; #endif extern int log_min_error_statement; extern int log_min_messages; diff --git a/src/include/utils/rel.h b/src/include/utils/rel.h index 01246e8c..03e6041b 100644 --- a/src/include/utils/rel.h +++ b/src/include/utils/rel.h @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * * rel.h - * POSTGRES relation descriptor (a/k/a relcache entry) definitions. + * POSTGRES relation descriptor (a/k/a relcache entry) definitions. * * * Portions Copyright (c) 2012-2014, TransLattice, Inc. 
@@ -41,13 +41,13 @@ typedef struct LockRelId { - Oid relId; /* a relation identifier */ - Oid dbId; /* a database identifier */ + Oid relId; /* a relation identifier */ + Oid dbId; /* a database identifier */ } LockRelId; typedef struct LockInfoData { - LockRelId lockRelId; + LockRelId lockRelId; } LockInfoData; typedef LockInfoData *LockInfo; @@ -57,28 +57,28 @@ typedef LockInfoData *LockInfo; */ typedef struct PartitionKeyData { - char strategy; /* partitioning strategy */ - int16 partnatts; /* number of columns in the partition key */ - AttrNumber *partattrs; /* attribute numbers of columns in the - * partition key */ - List *partexprs; /* list of expressions in the partitioning - * key, or NIL */ - - Oid *partopfamily; /* OIDs of operator families */ - Oid *partopcintype; /* OIDs of opclass declared input data types */ - FmgrInfo *partsupfunc; /* lookup info for support funcs */ - - /* Partitioning collation per attribute */ - Oid *partcollation; - - /* Type information per attribute */ - Oid *parttypid; - int32 *parttypmod; - int16 *parttyplen; - bool *parttypbyval; - char *parttypalign; - Oid *parttypcoll; -} PartitionKeyData; + char strategy; /* partitioning strategy */ + int16 partnatts; /* number of columns in the partition key */ + AttrNumber *partattrs; /* attribute numbers of columns in the + * partition key */ + List *partexprs; /* list of expressions in the partitioning + * key, or NIL */ + + Oid *partopfamily; /* OIDs of operator families */ + Oid *partopcintype; /* OIDs of opclass declared input data types */ + FmgrInfo *partsupfunc; /* lookup info for support funcs */ + + /* Partitioning collation per attribute */ + Oid *partcollation; + + /* Type information per attribute */ + Oid *parttypid; + int32 *parttypmod; + int16 *parttyplen; + bool *parttypbyval; + char *parttypalign; + Oid *parttypcoll; +} PartitionKeyData; typedef struct PartitionKeyData *PartitionKey; @@ -97,155 +97,156 @@ typedef struct tagClsExprStruct typedef struct RelationData { - RelFileNode rd_node; /* relation physical identifier */ - /* use "struct" here to avoid needing to include smgr.h: */ - struct SMgrRelationData *rd_smgr; /* cached file handle, or NULL */ - int rd_refcnt; /* reference count */ - BackendId rd_backend; /* owning backend id, if temporary relation */ - bool rd_islocaltemp; /* rel is a temp rel of this session */ - bool rd_isnailed; /* rel is nailed in cache */ - bool rd_isvalid; /* relcache entry is valid */ - char rd_indexvalid; /* state of rd_indexlist: 0 = not valid, 1 = - * valid, 2 = temporarily forced */ - bool rd_statvalid; /* is rd_statlist valid? */ - - /* - * rd_createSubid is the ID of the highest subtransaction the rel has - * survived into; or zero if the rel was not created in the current top - * transaction. This can be now be relied on, whereas previously it could - * be "forgotten" in earlier releases. Likewise, rd_newRelfilenodeSubid is - * the ID of the highest subtransaction the relfilenode change has - * survived into, or zero if not changed in the current transaction (or we - * have forgotten changing it). rd_newRelfilenodeSubid can be forgotten - * when a relation has multiple new relfilenodes within a single - * transaction, with one of them occurring in a subsequently aborted - * subtransaction, e.g. 
BEGIN; TRUNCATE t; SAVEPOINT save; TRUNCATE t; - * ROLLBACK TO save; -- rd_newRelfilenode is now forgotten - */ - SubTransactionId rd_createSubid; /* rel was created in current xact */ - SubTransactionId rd_newRelfilenodeSubid; /* new relfilenode assigned in - * current xact */ - - Form_pg_class rd_rel; /* RELATION tuple */ - TupleDesc rd_att; /* tuple descriptor */ - Oid rd_id; /* relation's object id */ - LockInfoData rd_lockInfo; /* lock mgr's info for locking relation */ - RuleLock *rd_rules; /* rewrite rules */ - MemoryContext rd_rulescxt; /* private memory cxt for rd_rules, if any */ - TriggerDesc *trigdesc; /* Trigger info, or NULL if rel has none */ - /* use "struct" here to avoid needing to include rowsecurity.h: */ - struct RowSecurityDesc *rd_rsdesc; /* row security policies, or NULL */ + RelFileNode rd_node; /* relation physical identifier */ + /* use "struct" here to avoid needing to include smgr.h: */ + struct SMgrRelationData *rd_smgr; /* cached file handle, or NULL */ + int rd_refcnt; /* reference count */ + BackendId rd_backend; /* owning backend id, if temporary relation */ + bool rd_islocaltemp; /* rel is a temp rel of this session */ + bool rd_isnailed; /* rel is nailed in cache */ + bool rd_isvalid; /* relcache entry is valid */ + char rd_indexvalid; /* state of rd_indexlist: 0 = not valid, 1 = + * valid, 2 = temporarily forced */ + bool rd_statvalid; /* is rd_statlist valid? */ + + /* + * rd_createSubid is the ID of the highest subtransaction the rel has + * survived into; or zero if the rel was not created in the current top + * transaction. This can be now be relied on, whereas previously it could + * be "forgotten" in earlier releases. Likewise, rd_newRelfilenodeSubid is + * the ID of the highest subtransaction the relfilenode change has + * survived into, or zero if not changed in the current transaction (or we + * have forgotten changing it). rd_newRelfilenodeSubid can be forgotten + * when a relation has multiple new relfilenodes within a single + * transaction, with one of them occurring in a subsequently aborted + * subtransaction, e.g. 
BEGIN; TRUNCATE t; SAVEPOINT save; TRUNCATE t; + * ROLLBACK TO save; -- rd_newRelfilenode is now forgotten + */ + SubTransactionId rd_createSubid; /* rel was created in current xact */ + SubTransactionId rd_newRelfilenodeSubid; /* new relfilenode assigned in + * current xact */ + + Form_pg_class rd_rel; /* RELATION tuple */ + TupleDesc rd_att; /* tuple descriptor */ + Oid rd_id; /* relation's object id */ + LockInfoData rd_lockInfo; /* lock mgr's info for locking relation */ + RuleLock *rd_rules; /* rewrite rules */ + MemoryContext rd_rulescxt; /* private memory cxt for rd_rules, if any */ + TriggerDesc *trigdesc; /* Trigger info, or NULL if rel has none */ + /* use "struct" here to avoid needing to include rowsecurity.h: */ + struct RowSecurityDesc *rd_rsdesc; /* row security policies, or NULL */ #ifdef _MLS_ ClsExprStruct * rd_cls_struct;/* pg_cls_check function call expr */ #endif - /* data managed by RelationGetFKeyList: */ - List *rd_fkeylist; /* list of ForeignKeyCacheInfo (see below) */ - bool rd_fkeyvalid; /* true if list has been computed */ - - MemoryContext rd_partkeycxt; /* private memory cxt for the below */ - struct PartitionKeyData *rd_partkey; /* partition key, or NULL */ - MemoryContext rd_pdcxt; /* private context for partdesc */ - struct PartitionDescData *rd_partdesc; /* partitions, or NULL */ - List *rd_partcheck; /* partition CHECK quals */ - - /* data managed by RelationGetIndexList: */ - List *rd_indexlist; /* list of OIDs of indexes on relation */ - Oid rd_oidindex; /* OID of unique index on OID, if any */ - Oid rd_pkindex; /* OID of primary key, if any */ - Oid rd_replidindex; /* OID of replica identity index, if any */ - - /* data managed by RelationGetStatExtList: */ - List *rd_statlist; /* list of OIDs of extended stats */ - - /* data managed by RelationGetIndexAttrBitmap: */ - Bitmapset *rd_indexattr; /* identifies columns used in indexes */ - Bitmapset *rd_keyattr; /* cols that can be ref'd by foreign keys */ - Bitmapset *rd_pkattr; /* cols included in primary key */ - Bitmapset *rd_idattr; /* included in replica identity index */ - - PublicationActions *rd_pubactions; /* publication actions */ - - /* - * rd_options is set whenever rd_rel is loaded into the relcache entry. - * Note that you can NOT look into rd_rel for this data. NULL means "use - * defaults". - */ - bytea *rd_options; /* parsed pg_class.reloptions */ - - /* These are non-NULL only for an index relation: */ - Form_pg_index rd_index; /* pg_index tuple describing this index */ - /* use "struct" here to avoid needing to include htup.h: */ - struct HeapTupleData *rd_indextuple; /* all of pg_index tuple */ - - /* - * index access support info (used only for an index relation) - * - * Note: only default support procs for each opclass are cached, namely - * those with lefttype and righttype equal to the opclass's opcintype. The - * arrays are indexed by support function number, which is a sufficient - * identifier given that restriction. - * - * Note: rd_amcache is available for index AMs to cache private data about - * an index. This must be just a cache since it may get reset at any time - * (in particular, it will get reset by a relcache inval message for the - * index). If used, it must point to a single memory chunk palloc'd in - * rd_indexcxt. A relcache reset will include freeing that chunk and - * setting rd_amcache = NULL. 
- */ - Oid rd_amhandler; /* OID of index AM's handler function */ - MemoryContext rd_indexcxt; /* private memory cxt for this stuff */ - /* use "struct" here to avoid needing to include amapi.h: */ - struct IndexAmRoutine *rd_amroutine; /* index AM's API struct */ - Oid *rd_opfamily; /* OIDs of op families for each index col */ - Oid *rd_opcintype; /* OIDs of opclass declared input data types */ - RegProcedure *rd_support; /* OIDs of support procedures */ - FmgrInfo *rd_supportinfo; /* lookup info for support procedures */ - int16 *rd_indoption; /* per-column AM-specific flags */ - List *rd_indexprs; /* index expression trees, if any */ - List *rd_indpred; /* index predicate tree, if any */ - Oid *rd_exclops; /* OIDs of exclusion operators, if any */ - Oid *rd_exclprocs; /* OIDs of exclusion ops' procs, if any */ - uint16 *rd_exclstrats; /* exclusion ops' strategy numbers, if any */ - void *rd_amcache; /* available for use by index AM */ - Oid *rd_indcollation; /* OIDs of index collations */ - - /* - * foreign-table support - * - * rd_fdwroutine must point to a single memory chunk palloc'd in - * CacheMemoryContext. It will be freed and reset to NULL on a relcache - * reset. - */ - - /* use "struct" here to avoid needing to include fdwapi.h: */ - struct FdwRoutine *rd_fdwroutine; /* cached function pointers, or NULL */ - - /* - * Hack for CLUSTER, rewriting ALTER TABLE, etc: when writing a new - * version of a table, we need to make any toast pointers inserted into it - * have the existing toast table's OID, not the OID of the transient toast - * table. If rd_toastoid isn't InvalidOid, it is the OID to place in - * toast pointers inserted into this rel. (Note it's set on the new - * version of the main heap, not the toast table itself.) This also - * causes toast_save_datum() to try to preserve toast value OIDs. - */ - Oid rd_toastoid; /* Real TOAST table's OID, or InvalidOid */ - - /* use "struct" here to avoid needing to include pgstat.h: */ - struct PgStat_TableStatus *pgstat_info; /* statistics collection area */ + /* data managed by RelationGetFKeyList: */ + List *rd_fkeylist; /* list of ForeignKeyCacheInfo (see below) */ + bool rd_fkeyvalid; /* true if list has been computed */ + + MemoryContext rd_partkeycxt; /* private memory cxt for the below */ + struct PartitionKeyData *rd_partkey; /* partition key, or NULL */ + MemoryContext rd_pdcxt; /* private context for partdesc */ + struct PartitionDescData *rd_partdesc; /* partitions, or NULL */ + List *rd_partcheck; /* partition CHECK quals */ + + /* data managed by RelationGetIndexList: */ + List *rd_indexlist; /* list of OIDs of indexes on relation */ + Oid rd_oidindex; /* OID of unique index on OID, if any */ + Oid rd_pkindex; /* OID of primary key, if any */ + Oid rd_replidindex; /* OID of replica identity index, if any */ + + /* data managed by RelationGetStatExtList: */ + List *rd_statlist; /* list of OIDs of extended stats */ + + /* data managed by RelationGetIndexAttrBitmap: */ + Bitmapset *rd_indexattr; /* identifies columns used in indexes */ + Bitmapset *rd_keyattr; /* cols that can be ref'd by foreign keys */ + Bitmapset *rd_pkattr; /* cols included in primary key */ + Bitmapset *rd_idattr; /* included in replica identity index */ + + PublicationActions *rd_pubactions; /* publication actions */ + + /* + * rd_options is set whenever rd_rel is loaded into the relcache entry. + * Note that you can NOT look into rd_rel for this data. NULL means "use + * defaults". 
+ */ + bytea *rd_options; /* parsed pg_class.reloptions */ + + /* These are non-NULL only for an index relation: */ + Form_pg_index rd_index; /* pg_index tuple describing this index */ + /* use "struct" here to avoid needing to include htup.h: */ + struct HeapTupleData *rd_indextuple; /* all of pg_index tuple */ + + /* + * index access support info (used only for an index relation) + * + * Note: only default support procs for each opclass are cached, namely + * those with lefttype and righttype equal to the opclass's opcintype. The + * arrays are indexed by support function number, which is a sufficient + * identifier given that restriction. + * + * Note: rd_amcache is available for index AMs to cache private data about + * an index. This must be just a cache since it may get reset at any time + * (in particular, it will get reset by a relcache inval message for the + * index). If used, it must point to a single memory chunk palloc'd in + * rd_indexcxt. A relcache reset will include freeing that chunk and + * setting rd_amcache = NULL. + */ + Oid rd_amhandler; /* OID of index AM's handler function */ + MemoryContext rd_indexcxt; /* private memory cxt for this stuff */ + /* use "struct" here to avoid needing to include amapi.h: */ + struct IndexAmRoutine *rd_amroutine; /* index AM's API struct */ + Oid *rd_opfamily; /* OIDs of op families for each index col */ + Oid *rd_opcintype; /* OIDs of opclass declared input data types */ + RegProcedure *rd_support; /* OIDs of support procedures */ + FmgrInfo *rd_supportinfo; /* lookup info for support procedures */ + int16 *rd_indoption; /* per-column AM-specific flags */ + List *rd_indexprs; /* index expression trees, if any */ + List *rd_indpred; /* index predicate tree, if any */ + Oid *rd_exclops; /* OIDs of exclusion operators, if any */ + Oid *rd_exclprocs; /* OIDs of exclusion ops' procs, if any */ + uint16 *rd_exclstrats; /* exclusion ops' strategy numbers, if any */ + void *rd_amcache; /* available for use by index AM */ + Oid *rd_indcollation; /* OIDs of index collations */ + + /* + * foreign-table support + * + * rd_fdwroutine must point to a single memory chunk palloc'd in + * CacheMemoryContext. It will be freed and reset to NULL on a relcache + * reset. + */ + + /* use "struct" here to avoid needing to include fdwapi.h: */ + struct FdwRoutine *rd_fdwroutine; /* cached function pointers, or NULL */ + + /* + * Hack for CLUSTER, rewriting ALTER TABLE, etc: when writing a new + * version of a table, we need to make any toast pointers inserted into it + * have the existing toast table's OID, not the OID of the transient toast + * table. If rd_toastoid isn't InvalidOid, it is the OID to place in + * toast pointers inserted into this rel. (Note it's set on the new + * version of the main heap, not the toast table itself.) This also + * causes toast_save_datum() to try to preserve toast value OIDs. 
+ */ + Oid rd_toastoid; /* Real TOAST table's OID, or InvalidOid */ + + /* use "struct" here to avoid needing to include pgstat.h: */ + struct PgStat_TableStatus *pgstat_info; /* statistics collection area */ #ifdef PGXC - RelationLocInfo *rd_locator_info; + RelationLocInfo *rd_locator_info; #endif #ifdef __TBASE__ - Form_pg_partition_interval rd_partitions_info; + Form_pg_partition_interval rd_partitions_info; + dlist_node rd_lru_list_elem; /* list member of LRU list */ #endif } RelationData; /* * ForeignKeyCacheInfo - * Information the relcache can cache about foreign key constraints + * Information the relcache can cache about foreign key constraints * * This is basically just an image of relevant columns from pg_constraint. * We make it a subclass of Node so that copyObject() can be used on a list @@ -259,20 +260,20 @@ typedef struct RelationData */ typedef struct ForeignKeyCacheInfo { - NodeTag type; - Oid conrelid; /* relation constrained by the foreign key */ - Oid confrelid; /* relation referenced by the foreign key */ - int nkeys; /* number of columns in the foreign key */ - /* these arrays each have nkeys valid entries: */ - AttrNumber conkey[INDEX_MAX_KEYS]; /* cols in referencing table */ - AttrNumber confkey[INDEX_MAX_KEYS]; /* cols in referenced table */ - Oid conpfeqop[INDEX_MAX_KEYS]; /* PK = FK operator OIDs */ + NodeTag type; + Oid conrelid; /* relation constrained by the foreign key */ + Oid confrelid; /* relation referenced by the foreign key */ + int nkeys; /* number of columns in the foreign key */ + /* these arrays each have nkeys valid entries: */ + AttrNumber conkey[INDEX_MAX_KEYS]; /* cols in referencing table */ + AttrNumber confkey[INDEX_MAX_KEYS]; /* cols in referenced table */ + Oid conpfeqop[INDEX_MAX_KEYS]; /* PK = FK operator OIDs */ } ForeignKeyCacheInfo; /* * StdRdOptions - * Standard contents of rd_options for heaps and generic indexes. + * Standard contents of rd_options for heaps and generic indexes. * * RelationGetFillFactor() and RelationGetTargetPageFreeSpace() can only * be applied to relations that use this format or a superset for @@ -281,134 +282,134 @@ typedef struct ForeignKeyCacheInfo /* autovacuum-related reloptions. */ typedef struct AutoVacOpts { - bool enabled; - int vacuum_threshold; - int analyze_threshold; - int vacuum_cost_delay; - int vacuum_cost_limit; - int freeze_min_age; - int freeze_max_age; - int freeze_table_age; - int multixact_freeze_min_age; - int multixact_freeze_max_age; - int multixact_freeze_table_age; - int log_min_duration; - float8 vacuum_scale_factor; - float8 analyze_scale_factor; + bool enabled; + int vacuum_threshold; + int analyze_threshold; + int vacuum_cost_delay; + int vacuum_cost_limit; + int freeze_min_age; + int freeze_max_age; + int freeze_table_age; + int multixact_freeze_min_age; + int multixact_freeze_max_age; + int multixact_freeze_table_age; + int log_min_duration; + float8 vacuum_scale_factor; + float8 analyze_scale_factor; } AutoVacOpts; typedef struct StdRdOptions { - int32 vl_len_; /* varlena header (do not touch directly!) */ - int fillfactor; /* page fill factor in percent (0..100) */ - AutoVacOpts autovacuum; /* autovacuum-related options */ - bool user_catalog_table; /* use as an additional catalog relation */ - int parallel_workers; /* max number of parallel workers */ + int32 vl_len_; /* varlena header (do not touch directly!) 
*/ + int fillfactor; /* page fill factor in percent (0..100) */ + AutoVacOpts autovacuum; /* autovacuum-related options */ + bool user_catalog_table; /* use as an additional catalog relation */ + int parallel_workers; /* max number of parallel workers */ } StdRdOptions; -#define HEAP_MIN_FILLFACTOR 10 -#define HEAP_DEFAULT_FILLFACTOR 100 +#define HEAP_MIN_FILLFACTOR 10 +#define HEAP_DEFAULT_FILLFACTOR 100 /* * RelationGetFillFactor - * Returns the relation's fillfactor. Note multiple eval of argument! + * Returns the relation's fillfactor. Note multiple eval of argument! */ #define RelationGetFillFactor(relation, defaultff) \ - ((relation)->rd_options ? \ - ((StdRdOptions *) (relation)->rd_options)->fillfactor : (defaultff)) + ((relation)->rd_options ? \ + ((StdRdOptions *) (relation)->rd_options)->fillfactor : (defaultff)) /* * RelationGetTargetPageUsage - * Returns the relation's desired space usage per page in bytes. + * Returns the relation's desired space usage per page in bytes. */ #define RelationGetTargetPageUsage(relation, defaultff) \ - (BLCKSZ * RelationGetFillFactor(relation, defaultff) / 100) + (BLCKSZ * RelationGetFillFactor(relation, defaultff) / 100) /* * RelationGetTargetPageFreeSpace - * Returns the relation's desired freespace per page in bytes. + * Returns the relation's desired freespace per page in bytes. */ #define RelationGetTargetPageFreeSpace(relation, defaultff) \ - (BLCKSZ * (100 - RelationGetFillFactor(relation, defaultff)) / 100) + (BLCKSZ * (100 - RelationGetFillFactor(relation, defaultff)) / 100) /* * RelationIsUsedAsCatalogTable - * Returns whether the relation should be treated as a catalog table - * from the pov of logical decoding. Note multiple eval of argument! + * Returns whether the relation should be treated as a catalog table + * from the pov of logical decoding. Note multiple eval of argument! */ -#define RelationIsUsedAsCatalogTable(relation) \ - ((relation)->rd_options && \ - ((relation)->rd_rel->relkind == RELKIND_RELATION || \ - (relation)->rd_rel->relkind == RELKIND_MATVIEW) ? \ - ((StdRdOptions *) (relation)->rd_options)->user_catalog_table : false) +#define RelationIsUsedAsCatalogTable(relation) \ + ((relation)->rd_options && \ + ((relation)->rd_rel->relkind == RELKIND_RELATION || \ + (relation)->rd_rel->relkind == RELKIND_MATVIEW) ? \ + ((StdRdOptions *) (relation)->rd_options)->user_catalog_table : false) /* * RelationGetParallelWorkers - * Returns the relation's parallel_workers reloption setting. - * Note multiple eval of argument! + * Returns the relation's parallel_workers reloption setting. + * Note multiple eval of argument! */ #define RelationGetParallelWorkers(relation, defaultpw) \ - ((relation)->rd_options ? \ - ((StdRdOptions *) (relation)->rd_options)->parallel_workers : (defaultpw)) + ((relation)->rd_options ? \ + ((StdRdOptions *) (relation)->rd_options)->parallel_workers : (defaultpw)) /* * ViewOptions - * Contents of rd_options for views + * Contents of rd_options for views */ typedef struct ViewOptions { - int32 vl_len_; /* varlena header (do not touch directly!) */ - bool security_barrier; - int check_option_offset; + int32 vl_len_; /* varlena header (do not touch directly!) */ + bool security_barrier; + int check_option_offset; } ViewOptions; /* * RelationIsSecurityView - * Returns whether the relation is security view, or not. Note multiple - * eval of argument! + * Returns whether the relation is security view, or not. Note multiple + * eval of argument! 
*/ -#define RelationIsSecurityView(relation) \ - ((relation)->rd_options ? \ - ((ViewOptions *) (relation)->rd_options)->security_barrier : false) +#define RelationIsSecurityView(relation) \ + ((relation)->rd_options ? \ + ((ViewOptions *) (relation)->rd_options)->security_barrier : false) /* * RelationHasCheckOption - * Returns true if the relation is a view defined with either the local - * or the cascaded check option. Note multiple eval of argument! + * Returns true if the relation is a view defined with either the local + * or the cascaded check option. Note multiple eval of argument! */ -#define RelationHasCheckOption(relation) \ - ((relation)->rd_options && \ - ((ViewOptions *) (relation)->rd_options)->check_option_offset != 0) +#define RelationHasCheckOption(relation) \ + ((relation)->rd_options && \ + ((ViewOptions *) (relation)->rd_options)->check_option_offset != 0) /* * RelationHasLocalCheckOption - * Returns true if the relation is a view defined with the local check - * option. Note multiple eval of argument! + * Returns true if the relation is a view defined with the local check + * option. Note multiple eval of argument! */ -#define RelationHasLocalCheckOption(relation) \ - ((relation)->rd_options && \ - ((ViewOptions *) (relation)->rd_options)->check_option_offset != 0 ? \ - strcmp((char *) (relation)->rd_options + \ - ((ViewOptions *) (relation)->rd_options)->check_option_offset, \ - "local") == 0 : false) +#define RelationHasLocalCheckOption(relation) \ + ((relation)->rd_options && \ + ((ViewOptions *) (relation)->rd_options)->check_option_offset != 0 ? \ + strcmp((char *) (relation)->rd_options + \ + ((ViewOptions *) (relation)->rd_options)->check_option_offset, \ + "local") == 0 : false) /* * RelationHasCascadedCheckOption - * Returns true if the relation is a view defined with the cascaded check - * option. Note multiple eval of argument! + * Returns true if the relation is a view defined with the cascaded check + * option. Note multiple eval of argument! */ -#define RelationHasCascadedCheckOption(relation) \ - ((relation)->rd_options && \ - ((ViewOptions *) (relation)->rd_options)->check_option_offset != 0 ? \ - strcmp((char *) (relation)->rd_options + \ - ((ViewOptions *) (relation)->rd_options)->check_option_offset, \ - "cascaded") == 0 : false) +#define RelationHasCascadedCheckOption(relation) \ + ((relation)->rd_options && \ + ((ViewOptions *) (relation)->rd_options)->check_option_offset != 0 ? \ + strcmp((char *) (relation)->rd_options + \ + ((ViewOptions *) (relation)->rd_options)->check_option_offset, \ + "cascaded") == 0 : false) /* * RelationIsValid - * True iff relation descriptor is valid. + * True iff relation descriptor is valid. */ #define RelationIsValid(relation) PointerIsValid(relation) @@ -416,263 +417,263 @@ typedef struct ViewOptions /* * RelationHasReferenceCountZero - * True iff relation reference count is zero. + * True iff relation reference count is zero. * * Note: - * Assumes relation descriptor is valid. + * Assumes relation descriptor is valid. */ #define RelationHasReferenceCountZero(relation) \ - ((bool)((relation)->rd_refcnt == 0)) + ((bool)((relation)->rd_refcnt == 0)) /* * RelationGetForm - * Returns pg_class tuple for a relation. + * Returns pg_class tuple for a relation. * * Note: - * Assumes relation descriptor is valid. + * Assumes relation descriptor is valid. 
*/ #define RelationGetForm(relation) ((relation)->rd_rel) /* * RelationGetRelid - * Returns the OID of the relation + * Returns the OID of the relation */ #define RelationGetRelid(relation) ((relation)->rd_id) /* * RelationGetNumberOfAttributes - * Returns the number of attributes in a relation. + * Returns the number of attributes in a relation. */ #define RelationGetNumberOfAttributes(relation) ((relation)->rd_rel->relnatts) /* * RelationGetDescr - * Returns tuple descriptor for a relation. + * Returns tuple descriptor for a relation. */ #define RelationGetDescr(relation) ((relation)->rd_att) /* * RelationGetRelationName - * Returns the rel's name. + * Returns the rel's name. * * Note that the name is only unique within the containing namespace. */ #define RelationGetRelationName(relation) \ - (NameStr((relation)->rd_rel->relname)) + (NameStr((relation)->rd_rel->relname)) /* * RelationGetNamespace - * Returns the rel's namespace OID. + * Returns the rel's namespace OID. */ #define RelationGetNamespace(relation) \ - ((relation)->rd_rel->relnamespace) + ((relation)->rd_rel->relnamespace) /* * RelationIsMapped - * True if the relation uses the relfilenode map. + * True if the relation uses the relfilenode map. * * NB: this is only meaningful for relkinds that have storage, else it * will misleadingly say "true". */ #define RelationIsMapped(relation) \ - ((relation)->rd_rel->relfilenode == InvalidOid) + ((relation)->rd_rel->relfilenode == InvalidOid) /* * RelationOpenSmgr - * Open the relation at the smgr level, if not already done. + * Open the relation at the smgr level, if not already done. */ #define RelationOpenSmgr(relation) \ - do { \ - if ((relation)->rd_smgr == NULL) \ - smgrsetowner(&((relation)->rd_smgr), smgropen((relation)->rd_node, (relation)->rd_backend)); \ - (relation)->rd_smgr->smgr_hasextent = RelationHasExtent(relation); \ - } while (0) + do { \ + if ((relation)->rd_smgr == NULL) \ + smgrsetowner(&((relation)->rd_smgr), smgropen((relation)->rd_node, (relation)->rd_backend)); \ + (relation)->rd_smgr->smgr_hasextent = RelationHasExtent(relation); \ + } while (0) /* * RelationCloseSmgr - * Close the relation at the smgr level, if not already done. + * Close the relation at the smgr level, if not already done. * * Note: smgrclose should unhook from owner pointer, hence the Assert. */ #define RelationCloseSmgr(relation) \ - do { \ - if ((relation)->rd_smgr != NULL) \ - { \ - smgrclose((relation)->rd_smgr); \ - Assert((relation)->rd_smgr == NULL); \ - } \ - } while (0) + do { \ + if ((relation)->rd_smgr != NULL) \ + { \ + smgrclose((relation)->rd_smgr); \ + Assert((relation)->rd_smgr == NULL); \ + } \ + } while (0) /* * RelationGetTargetBlock - * Fetch relation's current insertion target block. + * Fetch relation's current insertion target block. * * Returns InvalidBlockNumber if there is no current target block. Note * that the target block status is discarded on any smgr-level invalidation. */ #define RelationGetTargetBlock(relation) \ - ( (relation)->rd_smgr != NULL ? (relation)->rd_smgr->smgr_targblock : InvalidBlockNumber ) + ( (relation)->rd_smgr != NULL ? (relation)->rd_smgr->smgr_targblock : InvalidBlockNumber ) #ifdef _SHARDING_ #define RelationGetTargetBlock_Shard(relation, shardid) \ - ( (relation)->rd_smgr != NULL ? smgr_get_target_block((relation)->rd_smgr, shardid) : InvalidBlockNumber ) + ( (relation)->rd_smgr != NULL ? 
smgr_get_target_block((relation)->rd_smgr, shardid) : InvalidBlockNumber ) #endif /* * RelationSetTargetBlock - * Set relation's current insertion target block. + * Set relation's current insertion target block. */ #define RelationSetTargetBlock(relation, targblock) \ - do { \ - RelationOpenSmgr(relation); \ - (relation)->rd_smgr->smgr_targblock = (targblock); \ - } while (0) + do { \ + RelationOpenSmgr(relation); \ + (relation)->rd_smgr->smgr_targblock = (targblock); \ + } while (0) #ifdef _SHARDING_ #define RelationSetTargetBlock_Shard(relation, targblock, shardid) \ - do { \ - RelationOpenSmgr(relation); \ - smgr_set_target_block((relation)->rd_smgr, shardid, targblock); \ - } while (0) + do { \ + RelationOpenSmgr(relation); \ + smgr_set_target_block((relation)->rd_smgr, shardid, targblock); \ + } while (0) #endif /* * RelationNeedsWAL - * True if relation needs WAL. + * True if relation needs WAL. */ #define RelationNeedsWAL(relation) \ - ((relation)->rd_rel->relpersistence == RELPERSISTENCE_PERMANENT) - + ((relation)->rd_rel->relpersistence == RELPERSISTENCE_PERMANENT) + #ifdef _SHARDING_ #if 0 #define RelationHasExtent(relation) \ - (IS_PGXC_DATANODE \ - && (relation)->rd_rel->relpersistence == 'p' \ - && ((relation)->rd_rel->relkind == RELKIND_RELATION || (relation)->rd_rel->relkind == RELKIND_TOASTVALUE)\ - && (relation)->rd_locator_info ? ((relation)->rd_locator_info->locatorType == LOCATOR_TYPE_HASH ? true : false) : false\ - && RelationGetRelid(relation) >= FirstNormalObjectId) + (IS_PGXC_DATANODE \ + && (relation)->rd_rel->relpersistence == 'p' \ + && ((relation)->rd_rel->relkind == RELKIND_RELATION || (relation)->rd_rel->relkind == RELKIND_TOASTVALUE)\ + && (relation)->rd_locator_info ? ((relation)->rd_locator_info->locatorType == LOCATOR_TYPE_HASH ? true : false) : false\ + && RelationGetRelid(relation) >= FirstNormalObjectId) #endif #define RelationHasExtent(relation) \ - ((relation)->rd_rel->relhasextent) + ((relation)->rd_rel->relhasextent) #define RelationGetDisKey(relation) \ - ((relation)->rd_locator_info ? (relation)->rd_locator_info->partAttrNum : InvalidAttrNumber) + ((relation)->rd_locator_info ? (relation)->rd_locator_info->partAttrNum : InvalidAttrNumber) #define RelationGetSecDisKey(relation) \ - ((relation)->rd_locator_info ? (relation)->rd_locator_info->secAttrNum : InvalidAttrNumber) + ((relation)->rd_locator_info ? (relation)->rd_locator_info->secAttrNum : InvalidAttrNumber) #define RelationIsSharded(relation) \ - ((relation)->rd_locator_info ? (relation)->rd_locator_info->locatorType == LOCATOR_TYPE_SHARD : false) + ((relation)->rd_locator_info ? (relation)->rd_locator_info->locatorType == LOCATOR_TYPE_SHARD : false) #define RelationHasToast(relation) \ - OidIsValid((relation)->rd_toastoid) + OidIsValid((relation)->rd_toastoid) #endif /* * RelationUsesLocalBuffers - * True if relation's pages are stored in local buffers. + * True if relation's pages are stored in local buffers. 
*/ #ifdef XCP #define RelationUsesLocalBuffers(relation) \ - (!OidIsValid(MyCoordId) && \ - ((relation)->rd_rel->relpersistence == RELPERSISTENCE_TEMP)) + (!OidIsValid(MyCoordId) && \ + ((relation)->rd_rel->relpersistence == RELPERSISTENCE_TEMP)) #else #define RelationUsesLocalBuffers(relation) \ - ((relation)->rd_rel->relpersistence == RELPERSISTENCE_TEMP) + ((relation)->rd_rel->relpersistence == RELPERSISTENCE_TEMP) #endif #ifdef PGXC /* * RelationGetLocInfo - * Return the location info of relation + * Return the location info of relation */ #define RelationGetLocInfo(relation) ((relation)->rd_locator_info) #endif /* * RELATION_IS_LOCAL - * If a rel is either temp or newly created in the current transaction, - * it can be assumed to be accessible only to the current backend. - * This is typically used to decide that we can skip acquiring locks. + * If a rel is either temp or newly created in the current transaction, + * it can be assumed to be accessible only to the current backend. + * This is typically used to decide that we can skip acquiring locks. * * Beware of multiple eval of argument */ #ifdef XCP #define RELATION_IS_LOCAL(relation) \ - ((!OidIsValid(MyCoordId) && (relation)->rd_backend == MyBackendId) || \ - (OidIsValid(MyCoordId) && (relation)->rd_backend == MyFirstBackendId) || \ - ((relation)->rd_backend == MyBackendId || \ - (relation)->rd_createSubid != InvalidSubTransactionId)) + ((!OidIsValid(MyCoordId) && (relation)->rd_backend == MyBackendId) || \ + (OidIsValid(MyCoordId) && (relation)->rd_backend == MyFirstBackendId) || \ + ((relation)->rd_backend == MyBackendId || \ + (relation)->rd_createSubid != InvalidSubTransactionId)) #else #define RELATION_IS_LOCAL(relation) \ - ((relation)->rd_islocaltemp || \ - (relation)->rd_createSubid != InvalidSubTransactionId) + ((relation)->rd_islocaltemp || \ + (relation)->rd_createSubid != InvalidSubTransactionId) #endif #ifdef XCP /* * RelationGetLocatorType - * Returns the rel's locator type. + * Returns the rel's locator type. */ #define RelationGetLocatorType(relation) \ - ((relation)->rd_locator_info->locatorType) + ((relation)->rd_locator_info->locatorType) #endif /* * RELATION_IS_OTHER_TEMP - * Test for a temporary relation that belongs to some other session. + * Test for a temporary relation that belongs to some other session. 
* * Beware of multiple eval of argument */ #ifdef XCP #define RELATION_IS_OTHER_TEMP(relation) \ - (((relation)->rd_rel->relpersistence == RELPERSISTENCE_TEMP && \ - (relation)->rd_backend != MyBackendId) && \ - ((!OidIsValid(MyCoordId) && (relation)->rd_backend != MyBackendId) || \ - (OidIsValid(MyCoordId) && (relation)->rd_backend != MyFirstBackendId))) + (((relation)->rd_rel->relpersistence == RELPERSISTENCE_TEMP && \ + (relation)->rd_backend != MyBackendId) && \ + ((!OidIsValid(MyCoordId) && (relation)->rd_backend != MyBackendId) || \ + (OidIsValid(MyCoordId) && (relation)->rd_backend != MyFirstBackendId))) #else #define RELATION_IS_OTHER_TEMP(relation) \ - ((relation)->rd_rel->relpersistence == RELPERSISTENCE_TEMP && \ - !(relation)->rd_islocaltemp) + ((relation)->rd_rel->relpersistence == RELPERSISTENCE_TEMP && \ + !(relation)->rd_islocaltemp) #endif #ifdef XCP /* * RELATION_IS_COORDINATOR_LOCAL - * Test for a coordinator only relation such as LOCAL TEMP table or a MATVIEW + * Test for a coordinator only relation such as LOCAL TEMP table or a MATVIEW */ #define RELATION_IS_COORDINATOR_LOCAL(relation) \ - ((RELATION_IS_LOCAL(relation) && !RelationGetLocInfo(relation))) + ((RELATION_IS_LOCAL(relation) && !RelationGetLocInfo(relation))) #endif /* * RelationIsScannable - * Currently can only be false for a materialized view which has not been - * populated by its query. This is likely to get more complicated later, - * so use a macro which looks like a function. + * Currently can only be false for a materialized view which has not been + * populated by its query. This is likely to get more complicated later, + * so use a macro which looks like a function. */ #define RelationIsScannable(relation) ((relation)->rd_rel->relispopulated) /* * RelationIsPopulated - * Currently, we don't physically distinguish the "populated" and - * "scannable" properties of matviews, but that may change later. - * Hence, use the appropriate one of these macros in code tests. + * Currently, we don't physically distinguish the "populated" and + * "scannable" properties of matviews, but that may change later. + * Hence, use the appropriate one of these macros in code tests. */ #define RelationIsPopulated(relation) ((relation)->rd_rel->relispopulated) /* * RelationIsAccessibleInLogicalDecoding - * True if we need to log enough information to have access via - * decoding snapshot. + * True if we need to log enough information to have access via + * decoding snapshot. */ #define RelationIsAccessibleInLogicalDecoding(relation) \ - (XLogLogicalInfoActive() && \ - RelationNeedsWAL(relation) && \ - (IsCatalogRelation(relation) || RelationIsUsedAsCatalogTable(relation))) + (XLogLogicalInfoActive() && \ + RelationNeedsWAL(relation) && \ + (IsCatalogRelation(relation) || RelationIsUsedAsCatalogTable(relation))) /* * RelationIsLogicallyLogged - * True if we need to log enough information to extract the data from the - * WAL stream. + * True if we need to log enough information to extract the data from the + * WAL stream. * * We don't log information for unlogged tables (since they don't WAL log * anyway) and for system tables (their content is hard to make sense of, and @@ -681,46 +682,46 @@ typedef struct ViewOptions * interesting to the user... 
*/ #define RelationIsLogicallyLogged(relation) \ - (XLogLogicalInfoActive() && \ - RelationNeedsWAL(relation) && \ - !IsCatalogRelation(relation)) + (XLogLogicalInfoActive() && \ + RelationNeedsWAL(relation) && \ + !IsCatalogRelation(relation)) /* * RelationGetPartitionKey - * Returns the PartitionKey of a relation + * Returns the PartitionKey of a relation */ #define RelationGetPartitionKey(relation) ((relation)->rd_partkey) #ifdef __TBASE__ #define RelationGetNParts(relation) \ - ((relation)->rd_partitions_info ? (relation)->rd_partitions_info->partnparts : 0) + ((relation)->rd_partitions_info ? (relation)->rd_partitions_info->partnparts : 0) #define RelationGetPartitionColumnIndex(relation) \ - ((relation)->rd_partitions_info ? (relation)->rd_partitions_info->partpartkey : InvalidAttrNumber) + ((relation)->rd_partitions_info ? (relation)->rd_partitions_info->partpartkey : InvalidAttrNumber) #define RELATION_IS_INTERVAL(relation) \ - ((relation)->rd_rel->relpartkind == RELPARTKIND_PARENT) - //((relation)->rd_partkey && (relation)->rd_partkey->strategy == PARTITION_STRATEGY_INTERVAL) + ((relation)->rd_rel->relpartkind == RELPARTKIND_PARENT) + //((relation)->rd_partkey && (relation)->rd_partkey->strategy == PARTITION_STRATEGY_INTERVAL) #define RELATION_IS_CHILD(relation) \ - ((relation)->rd_rel->relpartkind == RELPARTKIND_CHILD) + ((relation)->rd_rel->relpartkind == RELPARTKIND_CHILD) #define RELATION_GET_PARENT(relation) \ - ((relation)->rd_rel->relparent) + ((relation)->rd_rel->relparent) #define RELATION_IS_REGULAR(relation) \ - ((relation)->rd_rel->relpartkind == RELPARTKIND_NONE) + ((relation)->rd_rel->relpartkind == RELPARTKIND_NONE) #define IndexGetRelationId(relation) \ - ( \ - (relation)->rd_rel->relkind == RELKIND_INDEX ? \ - (relation)->rd_index->indrelid : InvalidOid \ - ) + ( \ + (relation)->rd_rel->relkind == RELKIND_INDEX ? \ + (relation)->rd_index->indrelid : InvalidOid \ + ) #define PARTITION_KEY_IS_TIMESTAMP(partoid) \ - ((partoid) == 1114 || (partoid) == 1184) + ((partoid) == 1114 || (partoid) == 1184) extern int64 get_total_relation_size(Relation rel); #endif @@ -731,19 +732,19 @@ extern int64 get_total_relation_size(Relation rel); static inline int get_partition_strategy(PartitionKey key) { - return key->strategy; + return key->strategy; } static inline int get_partition_natts(PartitionKey key) { - return key->partnatts; + return key->partnatts; } static inline List * get_partition_exprs(PartitionKey key) { - return key->partexprs; + return key->partexprs; } /* @@ -752,24 +753,24 @@ get_partition_exprs(PartitionKey key) static inline int16 get_partition_col_attnum(PartitionKey key, int col) { - return key->partattrs[col]; + return key->partattrs[col]; } static inline Oid get_partition_col_typid(PartitionKey key, int col) { - return key->parttypid[col]; + return key->parttypid[col]; } static inline int32 get_partition_col_typmod(PartitionKey key, int col) { - return key->parttypmod[col]; + return key->parttypmod[col]; } /* * RelationGetPartitionDesc - * Returns partition descriptor for a relation. + * Returns partition descriptor for a relation. 
*/ #define RelationGetPartitionDesc(relation) ((relation)->rd_partdesc) @@ -779,4 +780,4 @@ extern void RelationDecrementReferenceCount(Relation rel); extern bool RelationHasUnloggedIndex(Relation rel); extern List *RelationGetRepsetList(Relation rel); -#endif /* REL_H */ +#endif /* REL_H */ diff --git a/src/include/utils/relcache.h b/src/include/utils/relcache.h index 98adccc1..243834fa 100644 --- a/src/include/utils/relcache.h +++ b/src/include/utils/relcache.h @@ -15,6 +15,7 @@ #define RELCACHE_H #include "access/tupdesc.h" +#include "lib/ilist.h" #include "nodes/bitmapset.h" @@ -54,6 +55,13 @@ typedef enum IndexAttrBitmapKind INDEX_ATTR_BITMAP_IDENTITY_KEY } IndexAttrBitmapKind; +typedef struct relcacheheader +{ + int rh_ntup; /* # of tuples in relation cache */ + int rh_maxtup; /* max number of LRU relations */ + dlist_head rh_lrulist; /* LRU list, most recent first */ +} RelCacheHeader; + extern Bitmapset *RelationGetIndexAttrBitmap(Relation relation, IndexAttrBitmapKind keyAttrs); @@ -131,6 +139,9 @@ extern void RelationCacheInitFileRemove(void); extern bool RelationHasGTS(Relation rel); +extern void RelationLRUInsert(Relation rel); +extern void RelationLRUDelete(Relation rel); + /* should be used only by relcache.c and catcache.c */ extern bool criticalRelcachesBuilt; diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out index b0bd7fb4..b624ad7c 100644 --- a/src/test/regress/expected/sysviews.out +++ b/src/test/regress/expected/sysviews.out @@ -107,6 +107,7 @@ select name, setting from pg_settings where name like 'enable%'; enable_key_value | off enable_lock_account | off enable_material | on + enable_memory_optimization | on enable_mergejoin | on enable_multi_cluster | on enable_multi_cluster_print | off @@ -133,7 +134,7 @@ select name, setting from pg_settings where name like 'enable%'; enable_transparent_crypt | on enable_user_authority_force_check | off enable_xlog_mprotect | on -(61 rows) +(62 rows) -- Test that the pg_timezone_names and pg_timezone_abbrevs views are -- more-or-less working. We can't test their contents in any great detail From 6d6fe64caf9699f9ecf0d8a66562e8fe342aa034 Mon Sep 17 00:00:00 2001 From: jackywpxie Date: Tue, 20 Apr 2021 21:24:43 +0800 Subject: [PATCH 167/578] =?UTF-8?q?Tbase=5Fv2.15.18=E5=90=88=E6=B5=81?= =?UTF-8?q?=E5=88=B0=E4=B8=BB=E7=BA=BFTbase=5Fv2.15=20(merge=20request=20!?= =?UTF-8?q?282)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Squash merge branch 'Tbase_v2.15.18' into 'Tbase_v2.15' * Merge branch 'Tbase_v2.15' into Tbase_v2.15.18 * merge_Tbase_v2.15.18 (merge request !280) * ExecRemoteQuery should ignore received tuple desc from DN if CN already has one * fix http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131086220995 (merge request !265) * Make value scan as replicated distribution. * Push value scan to datanode. 
* fix http://tapd.oa.com/pgxz/bugtrace/bugs/view/1110092131080582878
* fix Failed to get pooled connections http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131083143069 (merge request !224)
* Fix the issue that no write lock was taken when initializing shared buffer (merge request !240)
* jacky/bugfix/consistent_Tbase_v2.15.18 (merge request !243)
* jacky/bugfix/coredump_Tbase_v5.05.3 (merge request !206)
* http://tapd.oa.com/10092131/bugtrace/bugs/view?bug_id=1010092131084864233&url_cache_key=3ba5cc9f7d4408eb8cb3e14319eb688f
* use subroot for view sort in case of no sort operator in plan
* fix http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131085879935 (merge request !214)
* sync dynamic shared memory from pg
* jacky/bugfix/pooler_Tbase_v2.15.12 (merge request !163)
* Merge branch 'Tbase_v2.15.18' of http://git.code.oa.com/Tbase/PG-XL-v10 into Tbase_v2.15.18
* add error print in pg_check while pgxc_node is wrong
* fix bug in bitmap_scan_cost
* Only print local cached plan if execution got error and minor fix (merge request !182)
* create branch Tbase_v2.15.18
---
 .../tbase_pooler_stat--1.0.sql | 1 +
 contrib/tbase_pooler_stat/tbase_pooler_stat.c | 13 +-
 src/backend/access/hash/hash.c | 9 +
 src/backend/access/hash/hashsearch.c | 18 -
 src/backend/commands/analyze.c | 5 +
 src/backend/nodes/copyfuncs.c | 1 +
 src/backend/optimizer/path/allpaths.c | 56 ++-
 src/backend/optimizer/path/indxpath.c | 2 +
 src/backend/optimizer/plan/createplan.c | 16 +-
 src/backend/optimizer/util/pgxcship.c | 25 +-
 src/backend/pgxc/pool/execRemote.c | 9 +
 src/backend/pgxc/pool/poolmgr.c | 357 ++++++++++++------
 src/backend/storage/buffer/bufmgr.c | 4 +
 src/backend/storage/freespace/emapage.c | 4 +
 src/backend/storage/ipc/dsm.c | 35 +-
 src/backend/utils/misc/mls.c | 4 +-
 src/backend/utils/mmgr/dsa.c | 78 +++-
 src/backend/utils/mmgr/freepage.c | 25 +-
 src/backend/utils/time/tqual.c | 22 +-
 src/include/pgxc/planner.h | 1 +
 src/include/pgxc/poolmgr.h | 4 +-
 src/test/regress/expected/aggregates_1.out | 27 --
 src/test/regress/expected/create_view.out | 32 ++
 src/test/regress/expected/gist_1.out | 5 +-
 src/test/regress/expected/groupingsets.out | 135 +++----
 src/test/regress/expected/groupingsets_1.out | 28 +-
 src/test/regress/expected/join_3.out | 51 ++-
 src/test/regress/expected/rules.out | 3 +-
 src/test/regress/expected/subselect.out | 44 ++-
 src/test/regress/expected/tablesample_1.out | 31 +-
 src/test/regress/sql/create_view.sql | 8 +
 31 files changed, 700 insertions(+), 353 deletions(-)

diff --git a/contrib/tbase_pooler_stat/tbase_pooler_stat--1.0.sql b/contrib/tbase_pooler_stat/tbase_pooler_stat--1.0.sql
index 93b0be11..5ee8e1e6 100644
--- a/contrib/tbase_pooler_stat/tbase_pooler_stat--1.0.sql
+++ b/contrib/tbase_pooler_stat/tbase_pooler_stat--1.0.sql
@@ -32,6 +32,7 @@ CREATE OR REPLACE FUNCTION tbase_get_pooler_conn_statistics(
     OUT warming_cnt int4,
     OUT query_cnt int4,
     OUT exceed_keepalive_cnt int4,
+    OUT exceed_deadtime_cnt int4,
     OUT exceed_maxlifetime_cnt int4
 )
 RETURNS SETOF record
diff --git a/contrib/tbase_pooler_stat/tbase_pooler_stat.c b/contrib/tbase_pooler_stat/tbase_pooler_stat.c
index ac77982b..d38a4755 100644
--- a/contrib/tbase_pooler_stat/tbase_pooler_stat.c
+++ b/contrib/tbase_pooler_stat/tbase_pooler_stat.c
@@ -182,7 +182,7 @@ tbase_reset_pooler_cmd_statistics(PG_FUNCTION_ARGS)
 Datum
 tbase_get_pooler_conn_statistics(PG_FUNCTION_ARGS)
 {
-#define LIST_POOLER_CONN_STATISTICS_COLUMNS 11
+#define LIST_POOLER_CONN_STATISTICS_COLUMNS 12
     FuncCallContext *funcctx = NULL;
     int32 ret = 0;
     Pooler_ConnState *status = NULL;
@@ -222,7 +222,12 @@
tbase_get_pooler_conn_statistics(PG_FUNCTION_ARGS) INT4OID, -1, 0); TupleDescInitEntry(tupdesc, (AttrNumber) 10, "exceed_keepalive_cnt", INT4OID, -1, 0); - TupleDescInitEntry(tupdesc, (AttrNumber) 11, "exceed_maxlifetime_cnt", + /* + * This field is reserved for compatibility + */ + TupleDescInitEntry(tupdesc, (AttrNumber) 11, "exceed_deadtime_cnt", + INT4OID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 12, "exceed_maxlifetime_cnt", INT4OID, -1, 0); funcctx->tuple_desc = BlessTupleDesc(tupdesc); @@ -277,6 +282,7 @@ tbase_get_pooler_conn_statistics(PG_FUNCTION_ARGS) nulls[8] = true; nulls[9] = true; nulls[10] = true; + nulls[11] = true; } else { @@ -288,7 +294,8 @@ tbase_get_pooler_conn_statistics(PG_FUNCTION_ARGS) values[7] = UInt32GetDatum(pq_getmsgint(status->buf, sizeof(uint32))); values[8] = UInt32GetDatum(pq_getmsgint(status->buf, sizeof(uint32))); values[9] = UInt32GetDatum(pq_getmsgint(status->buf, sizeof(uint32))); - values[10] = UInt32GetDatum(pq_getmsgint(status->buf, sizeof(uint32))); + values[10] = UInt32GetDatum(0); + values[11] = UInt32GetDatum(pq_getmsgint(status->buf, sizeof(uint32))); status->node_cursor--; } diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c index 7d04dd80..b7e21348 100644 --- a/src/backend/access/hash/hash.c +++ b/src/backend/access/hash/hash.c @@ -284,7 +284,16 @@ hashgettuple(IndexScanDesc scan, ScanDirection dir) * Reacquire the read lock here. */ if (BufferIsValid(so->hashso_curbuf)) + { + if (enable_buffer_mprotect) + { + LockBuffer(so->hashso_curbuf, BUFFER_LOCK_EXCLUSIVE); + } + else + { LockBuffer(so->hashso_curbuf, BUFFER_LOCK_SHARE); + } + } /* * If we've already initialized this scan, we can just advance it in the diff --git a/src/backend/access/hash/hashsearch.c b/src/backend/access/hash/hashsearch.c index e4ea31de..32df3ca4 100644 --- a/src/backend/access/hash/hashsearch.c +++ b/src/backend/access/hash/hashsearch.c @@ -469,17 +469,8 @@ _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir) /* Before leaving current page, deal with any killed items */ if (so->numKilled > 0) { - if (enable_buffer_mprotect) - { - LockBuffer(so->hashso_curbuf, BUFFER_LOCK_UNLOCK); - LockBuffer(so->hashso_curbuf, BUFFER_LOCK_EXCLUSIVE); - _hash_kill_items(scan); - } - else - { _hash_kill_items(scan); } - } /* * ran off the end of this page, try the next @@ -537,17 +528,8 @@ _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir) /* Before leaving current page, deal with any killed items */ if (so->numKilled > 0) { - if (enable_buffer_mprotect) - { - LockBuffer(so->hashso_curbuf, BUFFER_LOCK_UNLOCK); - LockBuffer(so->hashso_curbuf, BUFFER_LOCK_EXCLUSIVE); - _hash_kill_items(scan); - } - else - { _hash_kill_items(scan); } - } /* * ran off the end of this page, try the next diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c index 54a26e55..48cf8d22 100644 --- a/src/backend/commands/analyze.c +++ b/src/backend/commands/analyze.c @@ -5133,6 +5133,11 @@ acquire_coordinator_sample_rows(Relation onerel, int elevel, dummy = makeVar(1, 5, onerel->rd_rel->reltype, 0, InvalidOid, 0); step->scan.plan.targetlist = lappend(step->scan.plan.targetlist, makeTargetEntry((Expr *) dummy, 5, "rows", false)); + /* + * ANALYZE has known it's result slot desc, should + * ignore received one to avoid duplicate name issue + */ + step->ignore_tuple_desc = true; /* Execute query on the data nodes */ estate = CreateExecutorState(); diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c index 
c2885241..6b103aa4 100644 --- a/src/backend/nodes/copyfuncs.c +++ b/src/backend/nodes/copyfuncs.c @@ -1341,6 +1341,7 @@ _copyRemoteQuery(const RemoteQuery *from) COPY_SCALAR_FIELD(jf_xc_wholerow); COPY_BITMAPSET_FIELD(conflict_cols); COPY_SCALAR_FIELD(is_set); + COPY_SCALAR_FIELD(ignore_tuple_desc); #endif return newnode; } diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c index 310cec07..821c2aef 100644 --- a/src/backend/optimizer/path/allpaths.c +++ b/src/backend/optimizer/path/allpaths.c @@ -44,6 +44,7 @@ #include "optimizer/var.h" #include "parser/parse_clause.h" #include "parser/parsetree.h" +#include "pgxc/nodemgr.h" #ifdef PGXC #include "nodes/makefuncs.h" #include "miscadmin.h" @@ -141,6 +142,7 @@ static void recurse_push_qual(Node *setOp, Query *topquery, static void remove_unused_subquery_outputs(Query *subquery, RelOptInfo *rel); static void add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel, List *live_childrels); +static bool check_list_contain_all_const(List *list); /* @@ -2078,6 +2080,35 @@ set_function_pathlist(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte) pathkeys, required_outer)); } +/* + * check_list_contain_all_const + * Check the list is contain all consts. + */ +static bool +check_list_contain_all_const(List *list) +{ + ListCell *lc = NULL; + Node *node = NULL; + + foreach(lc, list) + { + node = lfirst(lc); + if (IsA(node, List)) + { + if (!check_list_contain_all_const(node)) + { + return false; + } + } + else if (!IsA(node, Const)) + { + return false; + } + } + + return true; +} + /* * set_values_pathlist * Build the (single) access path for a VALUES RTE @@ -2086,6 +2117,7 @@ static void set_values_pathlist(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte) { Relids required_outer; + Path *new_path = NULL; /* * We don't support pushing join clauses into the quals of a values scan, @@ -2095,7 +2127,29 @@ set_values_pathlist(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte) required_outer = rel->lateral_relids; /* Generate appropriate path */ - add_path(rel, create_valuesscan_path(root, rel, required_outer)); + new_path = create_valuesscan_path(root, rel, required_outer); + + /* Mark scan as replicated if selected value list is all const */ + if (root->parse->commandType == CMD_SELECT && + check_list_contain_all_const((List *)rte->values_lists)) + { + Distribution *targetd = NULL; + int node_index = 0; + + targetd = makeNode(Distribution); + targetd->distributionType = LOCATOR_TYPE_REPLICATED; + targetd->nodes = NULL; + + for (node_index = 0; node_index < NumDataNodes; node_index++) + { + targetd->nodes = bms_add_member(targetd->nodes, node_index); + } + + targetd->restrictNodes = NULL; + new_path->distribution = targetd; + } + + add_path(rel, new_path); } /* diff --git a/src/backend/optimizer/path/indxpath.c b/src/backend/optimizer/path/indxpath.c index 31f75070..4d3b3cce 100644 --- a/src/backend/optimizer/path/indxpath.c +++ b/src/backend/optimizer/path/indxpath.c @@ -1620,6 +1620,8 @@ bitmap_scan_cost_est(PlannerInfo *root, RelOptInfo *rel, Path *ipath) required_outer); bpath.path.pathkeys = NIL; bpath.bitmapqual = ipath; + /* TODO: get real distribution information */ + bpath.path.distribution = NULL; /* * Check the cost of temporary path without considering parallelism. 
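
[Illustrative note, not part of the patch] The allpaths.c change above only marks a VALUES scan as replicated for SELECT queries in which check_list_contain_all_const() finds nothing but Const nodes in the VALUES lists; the path is then tagged LOCATOR_TYPE_REPLICATED over all datanodes so it can be shipped instead of being evaluated on the coordinator. A rough sketch of the kind of query affected follows; the example queries are assumptions for illustration, not taken from the regression suite, and the plan shape depends on configuration and costing.

-- every VALUES entry is a Const, so the scan may be pushed to a datanode
explain (costs off)
select * from (values (1, 'a'), (2, 'b')) as v(id, name);

-- a non-constant entry (a FuncExpr such as now()) keeps the coordinator-side scan
explain (costs off)
select * from (values (1, now())) as v(id, ts);
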
diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index cbd3bef4..48589d94 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -104,6 +104,7 @@ int remote_subplan_depth = 0; List *groupOids = NULL; bool mergejoin = false; +bool child_of_gather = false; bool enable_group_across_query = false; bool enable_distributed_unique_plan = false; #endif @@ -1955,6 +1956,14 @@ create_gather_plan(PlannerInfo *root, GatherPath *best_path) Gather *gather_plan; Plan *subplan; List *tlist; + bool reset = false; + + /* if child_of_gather is false, set child_of_gather true, and reset the value before return */ + if (!child_of_gather) + { + child_of_gather = true; + reset = true; + } /* * Although the Gather node can project, we prefer to push down such work @@ -1975,6 +1984,11 @@ create_gather_plan(PlannerInfo *root, GatherPath *best_path) /* use parallel mode for parallel plans. */ root->glob->parallelModeNeeded = true; + if (reset) + { + child_of_gather = false; + } + return gather_plan; } @@ -7086,7 +7100,7 @@ make_remotesubplan(PlannerInfo *root, gather_plan->parallelWorker_sendTuple = true; } - if ((IsA(lefttree, Gather) || lefttree->parallel_aware) && + if ((IsA(lefttree, Gather) || lefttree->parallel_aware || child_of_gather) && olap_optimizer) { plan->parallel_aware = true; diff --git a/src/backend/optimizer/util/pgxcship.c b/src/backend/optimizer/util/pgxcship.c index df8aa3d5..3d00fa58 100644 --- a/src/backend/optimizer/util/pgxcship.c +++ b/src/backend/optimizer/util/pgxcship.c @@ -1884,6 +1884,16 @@ pgxc_query_contains_only_pg_catalog(List *rtable) return true; } +ExecNodes * +make_FQS_single_node() +{ + ExecNodes *exec_nodes; + exec_nodes = makeNode(ExecNodes); + exec_nodes->accesstype = RELATION_ACCESS_READ_FQS; + exec_nodes->nodeList = lappend_int(exec_nodes->nodeList, 0); + return exec_nodes; +} + /* * pgxc_is_query_shippable * This function calls the query walker to analyse the query to gather @@ -1917,10 +1927,19 @@ pgxc_is_query_shippable(Query *query, int query_level) exec_nodes = sc_context.sc_exec_nodes; - /* For single datanode and select command, we ship it directly. */ - if (NumDataNodes == 1 && query->commandType == CMD_SELECT && - !bms_is_member(SS_NEEDS_COORD, sc_context.sc_shippability)) + /* For single datanode and select command, if we don't need coord + * and exec_nodes exists, return it directly. 
But if exec_nodes is + * NULL we make exec_nodes for FQS; + */ + if (!bms_is_member(SS_NEEDS_COORD, sc_context.sc_shippability)) + { + if (NumDataNodes == 1 && query->commandType == CMD_SELECT) + { + if (exec_nodes) return exec_nodes; + return make_FQS_single_node(); + } + } /* * The shippability context contains two ExecNodes, one for the subLinks diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index a06692a6..d105295a 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -545,6 +545,15 @@ HandleRowDescription(ResponseCombiner *combiner, char *msg_body, size_t len) (errcode(ERRCODE_DATA_CORRUPTED), errmsg("Unexpected response from the Datanodes for 'T' message, current request type %d", combiner->request_type))); } + + /* should ignore received tuple desc if already got one to avoid duplicate name issue */ + if (combiner->ss.ps.plan != NULL && + IsA(combiner->ss.ps.plan, RemoteQuery) && + ((RemoteQuery *) combiner->ss.ps.plan)->ignore_tuple_desc) + { + return false; + } + /* Increment counter and check if it was first */ if (combiner->description_count == 0) { diff --git a/src/backend/pgxc/pool/poolmgr.c b/src/backend/pgxc/pool/poolmgr.c index b83e273f..63d2c3e9 100644 --- a/src/backend/pgxc/pool/poolmgr.c +++ b/src/backend/pgxc/pool/poolmgr.c @@ -82,7 +82,6 @@ int PoolConnKeepAlive = 600; int PoolMaintenanceTimeout = 30; int PoolSizeCheckGap = 120; /* max check memory size gap, in seconds */ int PoolConnMaxLifetime = 600; /* max lifetime of a pooled connection, in seconds */ -int PoolWarmConnMaxLifetime = 7200; /* max lifetime of a warm-needed pooled connection, in seconds */ int PoolMaxMemoryLimit = 10; int PoolConnectTimeOut = 10; int PoolScaleFactor = 2; @@ -207,7 +206,7 @@ typedef struct Oid nodeoid; /* Node Oid related to this pool */ char *connstr; /* palloc memory, need free */ - int32 m_version; /* version of node pool */ + time_t m_version; /* version of node pool */ int32 size; /* total pool size */ int32 validSize; /* valid data element number */ bool failed; @@ -567,7 +566,7 @@ static void *pooler_async_utility_thread(void *arg); static void *pooler_async_connection_management_thread(void *arg); static void *pooler_sync_remote_operator_thread(void *arg); -static bool pooler_async_build_connection(DatabasePool *pool, int32 pool_version, int32 nodeidx, Oid node, +static bool pooler_async_build_connection(DatabasePool *pool, time_t pool_version, int32 nodeidx, Oid node, int32 size, char *connStr, bool bCoord); static BitmapMgr *BmpMgrCreate(uint32 objnum); static int BmpMgrAlloc(BitmapMgr *mgr); @@ -597,7 +596,6 @@ static void handle_clean_connection(PoolAgent * agent, StringInfo s); static void handle_get_connections(PoolAgent * agent, StringInfo s); static void handle_query_cancel(PoolAgent * agent, StringInfo s); static void handle_session_command(PoolAgent * agent, StringInfo s); -static bool remove_all_agent_references(Oid nodeoid); static int refresh_database_pools(PoolAgent *agent); static void pooler_async_ping_node(Oid node); @@ -3149,6 +3147,17 @@ agent_acquire_connections(PoolAgent *agent, List *datanodelist, List *coordlist, else { acquire_succeed_num++; + if (!slot->bdestoryed && difftime(time(NULL), slot->created) > PoolConnMaxLifetime) + { + elog(WARNING, + POOL_MGR_PREFIX"agent_acquire_connections acquired connection to " + "database:%s user:%s " + "node:%s backend_pid:%d nodeidx:%d " + "nodepool size:%d freeSize:%d that should be destoried.", + agent->pool->database, 
agent->pool->user_name, + nodePool->node_name, slot->backend_pid, node, + nodePool->size, nodePool->freeSize); + } if (PoolConnectDebugPrint) { /* double check, to ensure no double destory and multiple agents for one slot */ @@ -4470,7 +4479,12 @@ reload_database_pools(PoolAgent *agent) if (nodePool->size == nodePool->freeSize) #endif { - elog(LOG, POOL_MGR_PREFIX"nodePool:%s has been changed, size:%d, freeSize:%d, destory it now", nodePool->connstr, nodePool->size, nodePool->freeSize); + elog(LOG, POOL_MGR_PREFIX"nodePool:%s has been changed, " + "size:%d, freeSize:%d, reload_database_pools: nodePools " + "of node (%u, %s) is removed.", + nodePool->connstr, + nodePool->size, nodePool->freeSize, + nodePool->nodeoid, nodePool->node_name); destroy_node_pool(nodePool); hash_search(databasePool->nodePools, &nodePool->nodeoid, HASH_REMOVE, NULL); @@ -4480,7 +4494,13 @@ reload_database_pools(PoolAgent *agent) destroy_node_pool_free_slots(nodePool); /* increase the node pool version */ - nodePool->m_version++; + nodePool->m_version = time(NULL); + elog(LOG, POOL_MGR_PREFIX"nodePool:%s has been changed, " + "size:%d, freeSize:%d, reload_database_pools: nodePools " + "of node (%u, %s) has increased version %lu.", + nodePool->connstr, + nodePool->size, nodePool->freeSize, + nodePool->nodeoid, nodePool->node_name, nodePool->m_version); /* fresh the connect string so that new coming connection will connect to the new node */ if (connstr_chk) @@ -4687,7 +4707,11 @@ release_connection(DatabasePool *dbPool, PGXCNodePoolSlot *slot, nodePool = (PGXCNodePool *) hash_search(dbPool->nodePools, &node, HASH_FIND, NULL); - if (nodePool == NULL) + /* + * The node pool of connections may has been created just now and the size is + * initialized to 0. This situation needs to be excluded. + */ + if ((nodePool == NULL) || (nodePool->size == 0)) { /* * The node may be altered or dropped. 
@@ -4695,8 +4719,14 @@ release_connection(DatabasePool *dbPool, PGXCNodePoolSlot *slot, */ if (PoolConnectDebugPrint) { - elog(LOG, POOL_MGR_PREFIX"release_connection connection to node:%s backend_pid:%d nodeidx:%d size:%d freeSize:%d can not find nodepool, just destory it", nodePool->node_name, slot->backend_pid, nodeidx, nodePool->size, nodePool->freeSize); - abort(); + elog(LOG, + POOL_MGR_PREFIX"release_connection connection to " + "database:%s user:%s " + "node:%s backend_pid:%d nodeidx:%d " + "size:%d freeSize:%d can not find nodepool, just destory it", + dbPool->database, dbPool->user_name, + nodePool->node_name, slot->backend_pid, nodeidx, + nodePool->size, nodePool->freeSize); } destroy_slot(nodeidx, node, slot); return; @@ -4704,7 +4734,14 @@ release_connection(DatabasePool *dbPool, PGXCNodePoolSlot *slot, if (PoolConnectDebugPrint) { - elog(LOG, POOL_MGR_PREFIX"release_connection connection to nodename:%s backend_pid:%d nodeidx:%d size:%d freeSize:%d begin to release", nodePool->node_name, slot->backend_pid, nodeidx, nodePool->size, nodePool->freeSize); + elog(LOG, + POOL_MGR_PREFIX"release_connection connection to " + "database:%s user:%s " + "nodename:%s backend_pid:%d nodeidx:%d " + "size:%d freeSize:%d begin to release", + dbPool->database, dbPool->user_name, + nodePool->node_name, slot->backend_pid, nodeidx, + nodePool->size, nodePool->freeSize); } /* force destroy the connection when pool not enabled */ @@ -4716,11 +4753,19 @@ release_connection(DatabasePool *dbPool, PGXCNodePoolSlot *slot, /* destory the slot of former nodePool */ if (slot->m_version != nodePool->m_version) { - force_destroy = true; if (PoolConnectDebugPrint) { - elog(LOG, POOL_MGR_PREFIX"release_connection connection to node:%s backend_pid:%d nodeidx:%d agentCount:%d size:%d freeSize:%d node version:%d slot version:%d not match", nodePool->node_name, slot->backend_pid, nodeidx, agentCount, nodePool->size, nodePool->freeSize, nodePool->m_version, slot->m_version); + elog(LOG, + POOL_MGR_PREFIX"release_connection connection to " + "database:%s user:%s " + "node:%s backend_pid:%d nodeidx:%d agentCount:%d " + "size:%d freeSize:%d node version:%lu slot version:%lu not match", + dbPool->database, dbPool->user_name, + nodePool->node_name, slot->backend_pid, nodeidx, agentCount, + nodePool->size, nodePool->freeSize, nodePool->m_version, slot->m_version); } + destroy_slot(nodeidx, node, slot); + return; } if (!force_destroy) @@ -4731,24 +4776,38 @@ release_connection(DatabasePool *dbPool, PGXCNodePoolSlot *slot, /* warm a connection is a hard job, when release them, we need make sure it has worked long enough. 
*/ if (slot->bwarmed) { - if (nodePool->freeSize > MinFreeSize && difftime(now, slot->created) > PoolWarmConnMaxLifetime) + if (nodePool->freeSize > MinFreeSize || difftime(now, slot->created) > PoolConnMaxLifetime) { force_destroy = true; if (PoolConnectDebugPrint) { - elog(LOG, POOL_MGR_PREFIX"warmed connection to node:%s backend_pid:%d nodeidx:%d lifetime expired, closed it, size:%d freeSize:%d", nodePool->node_name, slot->backend_pid, nodeidx, nodePool->size, nodePool->freeSize); + elog(LOG, + POOL_MGR_PREFIX"warmed connection to " + "database:%s user:%s " + "node:%s backend_pid:%d nodeidx:%d lifetime expired, " + "closed it, size:%d freeSize:%d", + dbPool->database, dbPool->user_name, + nodePool->node_name, slot->backend_pid, nodeidx, + nodePool->size, nodePool->freeSize); } } } else { if (((nodePool->freeSize > 0) && (nodePool->nwarming + nodePool->nquery) > MinFreeSize) || - (difftime(now, slot->created) >= PoolWarmConnMaxLifetime)) + (difftime(now, slot->created) >= PoolConnMaxLifetime)) { force_destroy = true; if (PoolConnectDebugPrint) { - elog(LOG, POOL_MGR_PREFIX"unwarmed connection to node:%s backend_pid:%d nodeidx:%d lifetime expired, closed it, size:%d freeSize:%d", nodePool->node_name, slot->backend_pid, nodeidx, nodePool->size, nodePool->freeSize); + elog(LOG, + POOL_MGR_PREFIX"unwarmed connection to " + "database:%s user:%s " + "node:%s backend_pid:%d nodeidx:%d lifetime expired, " + "closed it, size:%d freeSize:%d", + dbPool->database, dbPool->user_name, + nodePool->node_name, slot->backend_pid, nodeidx, + nodePool->size, nodePool->freeSize); } } } @@ -4759,7 +4818,14 @@ release_connection(DatabasePool *dbPool, PGXCNodePoolSlot *slot, force_destroy = true; if (PoolConnectDebugPrint) { - elog(LOG, POOL_MGR_PREFIX"connection to node:%s backend_pid:%d nodeidx:%d lifetime expired, closed it, size:%d freeSize:%d", nodePool->node_name, slot->backend_pid, nodeidx, nodePool->size, nodePool->freeSize); + elog(LOG, + POOL_MGR_PREFIX"connection to " + "database:%s user:%s " + "node:%s backend_pid:%d nodeidx:%d lifetime expired, " + "closed it, size:%d freeSize:%d", + dbPool->database, dbPool->user_name, + nodePool->node_name, slot->backend_pid, nodeidx, + nodePool->size, nodePool->freeSize); } } } @@ -4795,7 +4861,14 @@ release_connection(DatabasePool *dbPool, PGXCNodePoolSlot *slot, slot->released = now; if (PoolConnectDebugPrint) { - elog(LOG, POOL_MGR_PREFIX"release_connection return connection to node:%s backend_pid:%d nodeidx:%d nodepool size:%d freeSize:%d", nodePool->node_name, slot->backend_pid, nodeidx, nodePool->size, nodePool->freeSize); + elog(LOG, + POOL_MGR_PREFIX"release_connection return connection to " + "database:%s user:%s " + "node:%s backend_pid:%d nodeidx:%d " + "nodepool size:%d freeSize:%d", + dbPool->database, dbPool->user_name, + nodePool->node_name, slot->backend_pid, nodeidx, + nodePool->size, nodePool->freeSize); } } } @@ -4805,7 +4878,14 @@ release_connection(DatabasePool *dbPool, PGXCNodePoolSlot *slot, elog(DEBUG1, POOL_MGR_PREFIX"Cleaning up connection from pool %s, closing", nodePool->connstr); if (PoolConnectDebugPrint) { - elog(LOG, POOL_MGR_PREFIX"release_connection destory connection to node:%s backend_pid:%d nodeidx:%d nodepool size:%d freeSize:%d", nodePool->node_name, slot->backend_pid, nodeidx, nodePool->size, nodePool->freeSize); + elog(LOG, + POOL_MGR_PREFIX"release_connection destory connection to " + "database:%s user:%s " + "node:%s backend_pid:%d nodeidx:%d " + "nodepool size:%d freeSize:%d", + dbPool->database, dbPool->user_name, + 
nodePool->node_name, slot->backend_pid, nodeidx, + nodePool->size, nodePool->freeSize); } destroy_slot(nodeidx, node, slot); @@ -4819,6 +4899,18 @@ release_connection(DatabasePool *dbPool, PGXCNodePoolSlot *slot, grow_pool(dbPool, nodeidx, node, bCoord); } } + + if (!slot->bdestoryed && difftime(time(NULL), slot->created) > PoolConnMaxLifetime) + { + elog(WARNING, + POOL_MGR_PREFIX"release_connection has not destoried connection to " + "database:%s user:%s " + "node:%s backend_pid:%d nodeidx:%d " + "nodepool size:%d freeSize:%d", + dbPool->database, dbPool->user_name, + nodePool->node_name, slot->backend_pid, nodeidx, + nodePool->size, nodePool->freeSize); + } } /* @@ -4872,6 +4964,11 @@ grow_pool(DatabasePool *dbPool, int32 nodeidx, Oid node, bool bCoord) } snprintf(nodePool->node_name, NAMEDATALEN, "%s", name_str); MemoryContextSwitchTo(oldcontext); + + nodePool->m_version = time(NULL); + elog(LOG, + "grow_pool: nodePools of node (%u, %s) is created.", + nodePool->nodeoid, nodePool->node_name); } /* here, we move the connection build work to async threads */ @@ -5110,6 +5207,8 @@ destroy_node_pool(PGXCNodePool *node_pool) destroy_slot(nodeidx, node_pool->nodeoid, node_pool->slot[i]); } pfree(node_pool->slot); + node_pool->size -= node_pool->freeSize; + node_pool->freeSize = 0; } } @@ -5128,8 +5227,11 @@ destroy_node_pool_free_slots(PGXCNodePool *node_pool) if (PoolConnectDebugPrint) { - elog(LOG, POOL_MGR_PREFIX"About to destroy slots of node pool %s, agentCount is %d, node_pool version:%d current size is %d, freeSize is %d, %d connections are in use", - node_pool->connstr, node_pool->m_version, agentCount, node_pool->size, node_pool->freeSize, node_pool->size - node_pool->freeSize); + elog(LOG, + POOL_MGR_PREFIX"About to destroy slots of node pool %s, node_pool version:%lu " + "agentCount is %d current size is %d, freeSize is %d, %d connections are in use", + node_pool->connstr, node_pool->m_version, + agentCount, node_pool->size, node_pool->freeSize, node_pool->size - node_pool->freeSize); } if (node_pool->slot) @@ -5141,8 +5243,8 @@ destroy_node_pool_free_slots(PGXCNodePool *node_pool) destroy_slot(nodeidx, node_pool->nodeoid, node_pool->slot[i]); node_pool->slot[i] = NULL; } - node_pool->freeSize = 0; node_pool->size -= node_pool->freeSize; + node_pool->freeSize = 0; } } @@ -5829,7 +5931,14 @@ shrink_pool(DatabasePool *pool) { if (PoolConnectDebugPrint) { - elog(LOG, POOL_MGR_PREFIX"shrink_pool destroy a connection to node:%s backend_pid:%d nodeidx:%d nodepool size:%d freeSize:%d", nodePool->node_name, slot->backend_pid, nodeidx, nodePool->size, nodePool->freeSize); + elog(LOG, + POOL_MGR_PREFIX"shrink_pool destroy a connection to " + "database:%s user:%s " + "node:%s backend_pid:%d nodeidx:%d " + "nodepool size:%d freeSize:%d", + pool->database, pool->user_name, + nodePool->node_name, slot->backend_pid, nodeidx, + nodePool->size, nodePool->freeSize); } /* connection is idle for long, close it */ destroy_slot(nodeidx, nodePool->nodeoid, slot); @@ -5884,9 +5993,13 @@ shrink_pool(DatabasePool *pool) { if (PoolConnectDebugPrint) { - elog(LOG, POOL_MGR_PREFIX"close %d long time free node:%u, poolsize:%d, freeSize:%d", freeCount, nodePool->nodeoid, - nodePool->size, - nodePool->freeSize); + elog(LOG, + POOL_MGR_PREFIX"close %d long time free connections of " + "database:%s user:%s " + "node:%u, poolsize:%d, freeSize:%d", + freeCount, + pool->database, pool->user_name, + nodePool->nodeoid, nodePool->size, nodePool->freeSize); } /* only grow pool when pool needed. 
*/ @@ -5896,12 +6009,40 @@ shrink_pool(DatabasePool *pool) } } + if (PoolConnectDebugPrint) + { + for (i = 0; i < nodePool->freeSize; i++) + { + PGXCNodePoolSlot *slot = nodePool->slot[i]; + + if (slot && !slot->bdestoryed && difftime(time(NULL), slot->created) > PoolConnMaxLifetime) + { + elog(WARNING, + POOL_MGR_PREFIX"shrink_pool found connection to " + "database:%s user:%s " + "nodename:%s nodeid:%d " + "nodepool size:%d freeSize:%d that should be destoried.", + pool->database, pool->user_name, + nodePool->node_name, nodePool->nodeoid, + nodePool->size, nodePool->freeSize); + + break; + } + } + } + if (nodePool->size > 0) { empty = false; } else { + if (PoolConnectDebugPrint) + { + elog(LOG, + "shrink_pool: nodePools of node (%u, %s) is removed.", + nodePool->nodeoid, nodePool->node_name); + } destroy_node_pool(nodePool); hash_search(pool->nodePools, &nodePool->nodeoid, HASH_REMOVE, NULL); } @@ -6561,9 +6702,10 @@ static void pooler_sync_connections_to_nodepool(void) nodePool = (PGXCNodePool *) hash_search(asyncInfo->dbPool->nodePools, &asyncInfo->node, HASH_ENTER, &found); - if (!found) { + elog(WARNING, POOL_MGR_PREFIX"The nodePool has not found when the slot is warmed up."); + oldcontext = MemoryContextSwitchTo(PoolerMemoryContext); nodePool->connstr = build_node_conn_str(asyncInfo->node, asyncInfo->dbPool); if (!nodePool->connstr) @@ -6587,6 +6729,7 @@ static void pooler_sync_connections_to_nodepool(void) nodePool->coord = false; /* in this case, only datanode */ nodePool->nwarming = 0; nodePool->nquery = 0; + nodePool->m_version = time(NULL); name_str = get_node_name_by_nodeoid(asyncInfo->node); if (NULL == name_str) @@ -6632,16 +6775,18 @@ static void pooler_sync_connections_to_nodepool(void) } else { - - nodeidx = get_node_index_by_nodeoid(asyncInfo->node); - destroy_slot(nodeidx, asyncInfo->node, asyncInfo->slot); - - /* Decrease pool size */ - DecreasePoolerSize(nodePool,__FILE__, __LINE__); if (PoolConnectDebugPrint) { - elog(LOG, POOL_MGR_PREFIX"destory connection to node:%u nodeidx:%d nodepool size:%d freeSize:%d for unmatch version, slot->m_version:%d, nodePool->m_version:%d", asyncInfo->node, nodeidx, nodePool->size, nodePool->freeSize, asyncInfo->slot->m_version, nodePool->m_version); + elog(LOG, POOL_MGR_PREFIX"destory connection to node:%u " + "nodeidx:%d nodepool size:%d freeSize:%d for unmatch " + "version, slot->m_version:%lu, nodePool->m_version:%lu", + asyncInfo->node, + nodeidx, nodePool->size, nodePool->freeSize, + asyncInfo->slot->m_version, nodePool->m_version); } + nodeidx = get_node_index_by_nodeoid(asyncInfo->node); + destroy_slot(nodeidx, asyncInfo->node, asyncInfo->slot); + break; } if (COMMAND_CONNECTION_WARM == asyncInfo->cmd) @@ -6753,6 +6898,12 @@ static void pooler_sync_connections_to_nodepool(void) errmsg(POOL_MGR_PREFIX"get node %u name failed", connRsp->nodeoid))); } snprintf(nodePool->node_name, NAMEDATALEN, "%s", name_str); + + nodePool->m_version = now; + elog(LOG, + "pooler_sync_connections_to_nodepool: nodePools of " + "node (%u, %s) is created.", + nodePool->nodeoid, nodePool->node_name); } /* add connection to hash table */ @@ -6801,14 +6952,12 @@ static void pooler_sync_connections_to_nodepool(void) destroy_slot(connRsp->nodeindex, connRsp->nodeoid, slot); if (PoolConnectDebugPrint) { - elog(LOG, POOL_MGR_PREFIX"destroy slot poolsize:%d, freeSize:%d, node:%u, MaxPoolSize:%d, connRsp->m_version:%d, nodePool->m_version:%d", + elog(LOG, POOL_MGR_PREFIX"destroy slot poolsize:%d, " + "freeSize:%d, node:%u, MaxPoolSize:%d, " + 
"connRsp->m_version:%lu, nodePool->m_version:%lu", nodePool->size, - nodePool->freeSize, - nodePool->nodeoid, - MaxPoolSize, - connRsp->m_version, - nodePool->m_version - ); + nodePool->freeSize, nodePool->nodeoid, MaxPoolSize, + connRsp->m_version, nodePool->m_version); } } @@ -6998,7 +7147,7 @@ static void pooler_async_ping_node(Oid node) /* async batch connection build */ -static bool pooler_async_build_connection(DatabasePool *pool, int32 pool_version, int32 nodeidx, Oid node, int32 size, char *connStr, bool bCoord) +static bool pooler_async_build_connection(DatabasePool *pool, time_t pool_version, int32 nodeidx, Oid node, int32 size, char *connStr, bool bCoord) { int32 threadid; uint64 pipeput_loops = 0; @@ -7308,6 +7457,11 @@ preconnect_and_warm(DatabasePool *dbPool) errmsg(POOL_MGR_PREFIX"get node %u name failed", dnOids[i]))); } snprintf(nodePool->node_name, NAMEDATALEN, "%s", name_str); + + nodePool->m_version = time(NULL); + elog(LOG, + "preconnect_and_warm: nodePools of node (%u, %s) is created.", + nodePool->nodeoid, nodePool->node_name); } while (nodePool->size < MinPoolSize || (nodePool->freeSize < MinFreeSize && nodePool->size < MaxPoolSize)) @@ -7761,8 +7915,9 @@ void *pooler_sync_remote_operator_thread(void *arg) { if (PoolConnectStaus_connected == request->final_status) { - finish_task_request(request->taskControl); + /* Increase success count first and then finish count */ acquire_command_increase_succeed(request->taskControl); + finish_task_request(request->taskControl); request->current_status = PoolConnectStaus_done; } else @@ -7847,8 +8002,9 @@ void *pooler_sync_remote_operator_thread(void *arg) { /* job succeed */ request->current_status = PoolConnectStaus_done; - finish_task_request(request->taskControl); + /* Increase success count first and then finish count */ acquire_command_increase_succeed(request->taskControl); + finish_task_request(request->taskControl); } } continue; @@ -7864,9 +8020,10 @@ void *pooler_sync_remote_operator_thread(void *arg) { int32 ret2 = 0; + /* Increase success count first and then finish count */ + acquire_command_increase_succeed(request->taskControl); /* set myself finish count */ finish_task_request(request->taskControl); - acquire_command_increase_succeed(request->taskControl); /* wait for others to finish */ while (!check_is_task_done(request->taskControl)) @@ -8452,6 +8609,9 @@ static inline bool dispatch_connection_request(PGXCASyncTaskCtl *taskControl, /* use version to tag every slot */ slot->m_version = nodepool->m_version; + slot->created = time(NULL); + slot->checked = slot->created; + slot->released = slot->created; } @@ -9916,7 +10076,7 @@ static void print_pooler_slot(PGXCNodePoolSlot *slot) } else { - elog(LOG, "slot=%p bwarmed=%d usecount=%d refcount=%d m_version=%d pid=%d seqnum=%d " + elog(LOG, "slot=%p bwarmed=%d usecount=%d refcount=%d m_version=%lu pid=%d seqnum=%d " "bdestoryed=%d file=%s lineno=%d node_name=%s backend_pid=%d", slot, slot->bwarmed, slot->usecount, slot->refcount,slot->m_version,slot->pid,slot->seqnum, @@ -10744,65 +10904,6 @@ handle_session_command(PoolAgent * agent, StringInfo s) } - -static bool -remove_all_agent_references(Oid nodeoid) -{// #lizard forgives - int i, j; - bool res = true; - - /* - * Identify if it's a coordinator or datanode first - * and get its index - */ - for (i = 1; i <= agentCount; i++) - { - bool found = false; - - PoolAgent *agent = poolAgents[i - 1]; - for (j = 0; j < agent->num_dn_connections; j++) - { - if (agent->dn_conn_oids[j] == nodeoid) - { - found = true; - 
break; - } - } - if (found) - { - PGXCNodePoolSlot *slot = agent->dn_connections[j]; - if (slot) - release_connection(agent->pool, slot, j, agent->dn_conn_oids[j], false, false); - agent->dn_connections[j] = NULL; - } - else - { - for (j = 0; j < agent->num_coord_connections; j++) - { - if (agent->coord_conn_oids[j] == nodeoid) - { - found = true; - break; - } - } - if (found) - { - PGXCNodePoolSlot *slot = agent->coord_connections[j]; - if (slot) - release_connection(agent->pool, slot, j, agent->coord_conn_oids[j], true, true); - agent->coord_connections[j] = NULL; - } - else - { - elog(LOG, "Node not found! (%u)", nodeoid); - res = false; - } - } - } - return res; -} - - /* * refresh_database_pools * refresh information for all database pools @@ -10875,22 +10976,44 @@ refresh_database_pools(PoolAgent *agent) if (strcmp(connstr_chk, nodePool->connstr)) { - elog(LOG, "Found an altered node (%u)", nodePool->nodeoid); - /* - * Node has been altered. First remove - * all references to this node from ALL the - * agents before destroying it.. - */ - if (!remove_all_agent_references(nodePool->nodeoid)) + if (nodePool->size == nodePool->freeSize) { - res = POOL_REFRESH_FAILED; - break; - } - + elog(LOG, + "refresh_database_pools: Found an altered node (%u %s) " + "size %d freesize %d is removed. " + "connstr_chk=%s, nodePool->connstr=%s", + nodePool->nodeoid, nodePool->node_name, + nodePool->size, nodePool->freeSize, + connstr_chk, nodePool->connstr); destroy_node_pool(nodePool); hash_search(databasePool->nodePools, &nodePool->nodeoid, HASH_REMOVE, NULL); } + else + { + destroy_node_pool_free_slots(nodePool); + + /* increase the node pool version */ + nodePool->m_version = time(NULL); + elog(LOG, + "refresh_database_pools: Found an altered node (%u %s) " + "size %d freesize %d increased m_version %lu" + "connstr_chk=%s, nodePool->connstr=%s", + nodePool->nodeoid, nodePool->node_name, + nodePool->size, nodePool->freeSize, nodePool->m_version, + connstr_chk, nodePool->connstr); + + /* fresh the connect string so that new coming connection will connect to the new node */ + if (connstr_chk) + { + if (nodePool->connstr) + { + pfree(nodePool->connstr); + } + nodePool->connstr = pstrdup(connstr_chk); + } + } + } if (connstr_chk) pfree(connstr_chk); @@ -11181,7 +11304,7 @@ handle_close_pooled_connections(PoolAgent * agent, StringInfo s) destroy_node_pool_free_slots(nodePool); /* increase the node pool version */ - nodePool->m_version++; + nodePool->m_version = time(NULL); } } diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 56276b4a..3f4a8089 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -2966,7 +2966,9 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln) if (REL_CRYPT_ENTRY_IS_VALID(&(reln->smgr_relcrypt)) && (MAIN_FORKNUM == buf->tag.forkNum || EXTENT_FORKNUM == buf->tag.forkNum)) { + BufDisableMemoryProtection(bufBlock, false); bufBlockEncrypt = rel_crypt_page_encrypt((RelCrypt)&(reln->smgr_relcrypt), bufToWrite); + BufEnableMemoryProtection(bufBlock, false); } else { @@ -3505,7 +3507,9 @@ FlushRelationBuffers(Relation rel) if (REL_CRYPT_ENTRY_IS_VALID(&(rel->rd_smgr->smgr_relcrypt)) && (MAIN_FORKNUM == bufHdr->tag.forkNum || EXTENT_FORKNUM == bufHdr->tag.forkNum)) { + BufDisableMemoryProtection(localpage, false); bufBlockEncrypt = rel_crypt_page_encrypt((RelCrypt)&(rel->rd_smgr->smgr_relcrypt), localpage); + BufDisableMemoryProtection(localpage, false); } else { diff --git 
a/src/backend/storage/freespace/emapage.c b/src/backend/storage/freespace/emapage.c index b288c04e..063d08f3 100644 --- a/src/backend/storage/freespace/emapage.c +++ b/src/backend/storage/freespace/emapage.c @@ -220,6 +220,7 @@ extent_readbuffer(Relation rel, BlockNumber blkno, bool extend) buf = ReadBufferExtended(rel, EXTENT_FORKNUM, blkno, RBM_ZERO_ON_ERROR, NULL); if (PageIsNew(BufferGetPage(buf))) { + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); PageInit_shard(BufferGetPage(buf), BLCKSZ, 0, InvalidShardID, true); switch(pagetype) { @@ -236,6 +237,7 @@ extent_readbuffer(Relation rel, BlockNumber blkno, bool extend) elog(PANIC, "page type %d is not supported.", pagetype); break; } + LockBuffer(buf, BUFFER_LOCK_UNLOCK); } return buf; } @@ -324,6 +326,7 @@ extent_readbuffer_for_redo(RelFileNode rnode, BlockNumber blkno, bool extend) buf = XLogReadBufferExtended(rnode, EXTENT_FORKNUM, blkno, RBM_ZERO_ON_ERROR); if (PageIsNew(BufferGetPage(buf))) { + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); PageInit_shard(BufferGetPage(buf), BLCKSZ, 0, InvalidShardID, true); switch(pagetype) { @@ -340,6 +343,7 @@ extent_readbuffer_for_redo(RelFileNode rnode, BlockNumber blkno, bool extend) elog(PANIC, "page type %d is not supported.", pagetype); break; } + LockBuffer(buf, BUFFER_LOCK_UNLOCK); } return buf; } diff --git a/src/backend/storage/ipc/dsm.c b/src/backend/storage/ipc/dsm.c index ac677b63..eaf5145f 100644 --- a/src/backend/storage/ipc/dsm.c +++ b/src/backend/storage/ipc/dsm.c @@ -512,8 +512,6 @@ dsm_create(Size size, int flags) /* Verify that we can support an additional mapping. */ if (nitems >= dsm_control->maxitems) { - if ((flags & DSM_CREATE_NULL_IF_MAXSEGMENTS) != 0) - { LWLockRelease(DynamicSharedMemoryControlLock); dsm_impl_op(DSM_OP_DESTROY, seg->handle, 0, &seg->impl_private, &seg->mapped_address, &seg->mapped_size, WARNING); @@ -521,8 +519,10 @@ dsm_create(Size size, int flags) ResourceOwnerForgetDSM(seg->resowner, seg); dlist_delete(&seg->node); pfree(seg); + + if ((flags & DSM_CREATE_NULL_IF_MAXSEGMENTS) != 0) return NULL; - } + ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_RESOURCES), errmsg("too many dynamic shared memory segments"))); @@ -597,22 +597,20 @@ dsm_attach(dsm_handle h) nitems = dsm_control->nitems; for (i = 0; i < nitems; ++i) { - /* If the reference count is 0, the slot is actually unused. */ - if (dsm_control->item[i].refcnt == 0) + /* + * If the reference count is 0, the slot is actually unused. If the + * reference count is 1, the slot is still in use, but the segment is + * in the process of going away; even if the handle matches, another + * slot may already have started using the same handle value by + * coincidence so we have to keep searching. + */ + if (dsm_control->item[i].refcnt <= 1) continue; /* If the handle doesn't match, it's not the slot we want. */ if (dsm_control->item[i].handle != seg->handle) continue; - /* - * If the reference count is 1, the slot is still in use, but the - * segment is in the process of going away. Treat that as if we - * didn't find a match. - */ - if (dsm_control->item[i].refcnt == 1) - break; - /* Otherwise we've found a match. */ dsm_control->item[i].refcnt++; seg->control_slot = i; @@ -728,8 +726,12 @@ dsm_detach(dsm_segment *seg) /* * Invoke registered callbacks. Just in case one of those callbacks * throws a further error that brings us back here, pop the callback - * before invoking it, to avoid infinite error recursion. + * before invoking it, to avoid infinite error recursion. 
Don't allow + * interrupts while running the individual callbacks in non-error code + * paths, to avoid leaving cleanup work unfinished if we're interrupted by + * a statement timeout or similar. */ + HOLD_INTERRUPTS(); while (!slist_is_empty(&seg->on_detach)) { slist_node *node; @@ -745,6 +747,7 @@ dsm_detach(dsm_segment *seg) function(seg, arg); } + RESUME_INTERRUPTS(); /* * Try to remove the mapping, if one exists. Normally, there will be, but @@ -906,8 +909,8 @@ dsm_unpin_segment(dsm_handle handle) LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE); for (i = 0; i < dsm_control->nitems; ++i) { - /* Skip unused slots. */ - if (dsm_control->item[i].refcnt == 0) + /* Skip unused slots and segments that are concurrently going away. */ + if (dsm_control->item[i].refcnt <= 1) continue; /* If we've found our handle, we can stop searching. */ diff --git a/src/backend/utils/misc/mls.c b/src/backend/utils/misc/mls.c index 014e81b2..e55f36dd 100644 --- a/src/backend/utils/misc/mls.c +++ b/src/backend/utils/misc/mls.c @@ -749,9 +749,7 @@ static void* mls_crypt_worker(void * input) buf_need_encrypt = page_new + BLCKSZ; /* 2.2 do the encrypt */ - need_mprotect = enable_buffer_mprotect && - !BufferIsLocal(encrypt_element.buf_id) && - BufferIsValid(encrypt_element.buf_id); + need_mprotect = enable_buffer_mprotect && !BufferIsLocal(encrypt_element.buf_id); if (need_mprotect) { BufDisableMemoryProtection(buf, false); diff --git a/src/backend/utils/mmgr/dsa.c b/src/backend/utils/mmgr/dsa.c index 1382c516..f7f11c06 100644 --- a/src/backend/utils/mmgr/dsa.c +++ b/src/backend/utils/mmgr/dsa.c @@ -256,7 +256,7 @@ static const uint16 dsa_size_classes[] = { * round the size of the object up to the next multiple of 8 bytes, and then * index into this array. */ -static char dsa_size_class_map[] = { +static const uint8 dsa_size_class_map[] = { 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 18, 18, 18, 18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19, @@ -405,6 +405,7 @@ static dsa_area *create_internal(void *place, size_t size, static dsa_area *attach_internal(void *place, dsm_segment *segment, dsa_handle handle); static void check_for_freed_segments(dsa_area *area); +static void check_for_freed_segments_locked(dsa_area *area); /* * Create a new shared area in a new DSM segment. Further DSM segments will @@ -649,7 +650,7 @@ dsa_pin_mapping(dsa_area *area) * will result in an ERROR. * * DSA_ALLOC_NO_OOM causes this function to return InvalidDsaPointer when - * no memory is available or a size limit establed by set_dsa_size_limit + * no memory is available or a size limit establed by dsa_set_size_limit * would be exceeded. Otherwise, such allocations will result in an ERROR. * * DSA_ALLOC_ZERO causes the allocated memory to be zeroed. Otherwise, the @@ -692,7 +693,16 @@ dsa_allocate_extended(dsa_area *area, Size size, int flags) /* Obtain a span object. */ span_pointer = alloc_object(area, DSA_SCLASS_BLOCK_OF_SPANS); if (!DsaPointerIsValid(span_pointer)) + { + /* Raise error unless asked not to. 
*/ + if ((flags & DSA_ALLOC_NO_OOM) == 0) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"), + errdetail("Failed on DSA request of size %zu.", + size))); return InvalidDsaPointer; + } LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE); @@ -1065,6 +1075,7 @@ dsa_dump(dsa_area *area) */ LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE); + check_for_freed_segments_locked(area); fprintf(stderr, "dsa_area handle %x:\n", area->control->handle); fprintf(stderr, " max_total_segment_size: %zu\n", area->control->max_total_segment_size); @@ -1667,13 +1678,15 @@ ensure_active_superblock(dsa_area *area, dsa_area_pool *pool, return false; } } + + /* + * This shouldn't happen: get_best_segment() or make_new_segment() + * promised that we can successfully allocate npages. + */ if (!FreePageManagerGet(segment_map->fpm, npages, &first_page)) - { - LWLockRelease(DSA_AREA_LOCK(area)); - if (size_class != DSA_SCLASS_BLOCK_OF_SPANS) - dsa_free(area, span_pointer); - return false; - } + elog(FATAL, + "dsa_allocate could not find %zu free pages for superblock", + npages); LWLockRelease(DSA_AREA_LOCK(area)); /* Compute the start of the superblock. */ @@ -1762,6 +1775,23 @@ get_segment_by_index(dsa_area *area, dsa_segment_index index) (DSA_SEGMENT_HEADER_MAGIC ^ area->control->handle ^ index)); } + /* + * Callers of dsa_get_address() and dsa_free() don't hold the area lock, + * but it's a bug in the calling code and undefined behavior if the + * address is not live (ie if the segment might possibly have been freed, + * they're trying to use a dangling pointer). + * + * For dsa.c code that holds the area lock to manipulate segment_bins + * lists, it would be a bug if we ever reach a freed segment here. After + * it's marked as freed, the only thing any backend should do with it is + * unmap it, and it should always have done that in + * check_for_freed_segments_locked() before arriving here to resolve an + * index to a segment_map. + * + * Either way we can assert that we aren't returning a freed segment. + */ + Assert(!area->segment_maps[index].header->freed); + return &area->segment_maps[index]; } @@ -1778,9 +1808,6 @@ destroy_superblock(dsa_area *area, dsa_pointer span_pointer) int size_class = span->size_class; dsa_segment_map *segment_map; - segment_map = - get_segment_by_index(area, DSA_EXTRACT_SEGMENT_NUMBER(span->start)); - /* Remove it from its fullness class list. */ unlink_span(area, span); @@ -1790,6 +1817,9 @@ destroy_superblock(dsa_area *area, dsa_pointer span_pointer) * could deadlock. */ LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE); + check_for_freed_segments_locked(area); + segment_map = + get_segment_by_index(area, DSA_EXTRACT_SEGMENT_NUMBER(span->start)); FreePageManagerPut(segment_map->fpm, DSA_EXTRACT_OFFSET(span->start) / FPM_PAGE_SIZE, span->npages); @@ -1944,6 +1974,7 @@ get_best_segment(dsa_area *area, Size npages) Size bin; Assert(LWLockHeldByMe(DSA_AREA_LOCK(area))); + check_for_freed_segments_locked(area); /* * Start searching from the first bin that *might* have enough contiguous @@ -2220,10 +2251,30 @@ check_for_freed_segments(dsa_area *area) freed_segment_counter = area->control->freed_segment_counter; if (unlikely(area->freed_segment_counter != freed_segment_counter)) { - int i; - /* Check all currently mapped segments to find what's been freed. 
*/ LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE); + check_for_freed_segments_locked(area); + LWLockRelease(DSA_AREA_LOCK(area)); + } +} + +/* + * Workhorse for check_for_freed_segments(), and also used directly in path + * where the area lock is already held. This should be called after acquiring + * the lock but before looking up any segment by index number, to make sure we + * unmap any stale segments that might have previously had the same index as a + * current segment. + */ +static void +check_for_freed_segments_locked(dsa_area *area) +{ + size_t freed_segment_counter; + int i; + + Assert(LWLockHeldByMe(DSA_AREA_LOCK(area))); + freed_segment_counter = area->control->freed_segment_counter; + if (unlikely(area->freed_segment_counter != freed_segment_counter)) + { for (i = 0; i <= area->high_segment_index; ++i) { if (area->segment_maps[i].header != NULL && @@ -2235,7 +2286,6 @@ check_for_freed_segments(dsa_area *area) area->segment_maps[i].mapped_address = NULL; } } - LWLockRelease(DSA_AREA_LOCK(area)); area->freed_segment_counter = freed_segment_counter; } } diff --git a/src/backend/utils/mmgr/freepage.c b/src/backend/utils/mmgr/freepage.c index f61c6547..aa8bc7eb 100644 --- a/src/backend/utils/mmgr/freepage.c +++ b/src/backend/utils/mmgr/freepage.c @@ -164,7 +164,7 @@ static void FreePagePushSpanLeader(FreePageManager *fpm, Size first_page, static Size FreePageManagerLargestContiguous(FreePageManager *fpm); static void FreePageManagerUpdateLargest(FreePageManager *fpm); -#if FPM_EXTRA_ASSERTS +#ifdef FPM_EXTRA_ASSERTS static Size sum_free_pages(FreePageManager *fpm); #endif @@ -231,7 +231,7 @@ FreePageManagerGet(FreePageManager *fpm, Size npages, Size *first_page) /* * FreePageManagerGetInternal may have set contiguous_pages_dirty. - * Recompute contigous_pages if so. + * Recompute contiguous_pages if so. */ FreePageManagerUpdateLargest(fpm); @@ -455,7 +455,7 @@ FreePageManagerDump(FreePageManager *fpm) recycle = relptr_access(base, fpm->btree_recycle); if (recycle != NULL) { - appendStringInfo(&buf, "btree recycle:"); + appendStringInfoString(&buf, "btree recycle:"); FreePageManagerDumpSpans(fpm, recycle, 1, &buf); } @@ -468,7 +468,7 @@ FreePageManagerDump(FreePageManager *fpm) continue; if (!dumped_any_freelist) { - appendStringInfo(&buf, "freelists:\n"); + appendStringInfoString(&buf, "freelists:\n"); dumped_any_freelist = true; } appendStringInfo(&buf, " %zu:", f + 1); @@ -742,8 +742,8 @@ FreePageBtreeConsolidate(FreePageManager *fpm, FreePageBtree *btp) /* * If we can fit our keys onto our left sibling's page, consolidate. In - * this case, we move our keys onto the other page rather than visca - * versa, to avoid having to adjust ancestor keys. + * this case, we move our keys onto the other page rather than vice versa, + * to avoid having to adjust ancestor keys. */ np = FreePageBtreeFindLeftSibling(base, btp); if (np != NULL && btp->hdr.nused + np->hdr.nused <= max) @@ -1275,7 +1275,7 @@ FreePageManagerDumpBtree(FreePageManager *fpm, FreePageBtree *btp, btp->u.leaf_key[index].first_page, btp->u.leaf_key[index].npages); } - appendStringInfo(buf, "\n"); + appendStringInfoChar(buf, '\n'); if (btp->hdr.magic == FREE_PAGE_INTERNAL_MAGIC) { @@ -1308,7 +1308,7 @@ FreePageManagerDumpSpans(FreePageManager *fpm, FreePageSpanLeader *span, span = relptr_access(base, span->next); } - appendStringInfo(buf, "\n"); + appendStringInfoChar(buf, '\n'); } /* @@ -1470,9 +1470,7 @@ FreePageManagerGetInternal(FreePageManager *fpm, Size npages, Size *first_page) * pages; if false, do it always. 
Returns 0 if the soft flag caused the * insertion to be skipped, or otherwise the size of the contiguous span * created by the insertion. This may be larger than npages if we're able - * to consolidate with an adjacent range. *internal_pages_used is set to - * true if the btree allocated pages for internal purposes, which might - * invalidate the current largest run requiring it to be recomputed. + * to consolidate with an adjacent range. */ static Size FreePageManagerPutInternal(FreePageManager *fpm, Size first_page, Size npages, @@ -1526,6 +1524,9 @@ FreePageManagerPutInternal(FreePageManager *fpm, Size first_page, Size npages, if (!relptr_is_null(fpm->btree_recycle)) root = FreePageBtreeGetRecycled(fpm); + /* Should not allocate if soft. */ + else if (soft) + return 0; else if (FreePageManagerGetInternal(fpm, 1, &root_page)) root = (FreePageBtree *) fpm_page_to_pointer(base, root_page); else @@ -1692,7 +1693,7 @@ FreePageManagerPutInternal(FreePageManager *fpm, Size first_page, Size npages, /* * The act of allocating pages to recycle may have invalidated the - * results of our previous btree reserch, so repeat it. (We could + * results of our previous btree research, so repeat it. (We could * recheck whether any of our split-avoidance strategies that were * not viable before now are, but it hardly seems worthwhile, so * we don't bother. Consolidation can't be possible now if it diff --git a/src/backend/utils/time/tqual.c b/src/backend/utils/time/tqual.c index 247cabfe..5efbd805 100644 --- a/src/backend/utils/time/tqual.c +++ b/src/backend/utils/time/tqual.c @@ -243,6 +243,9 @@ SetHintBits(HeapTupleHeader tuple, Buffer buffer, GlobalTimestamp global_timestamp; #endif + BufferDesc *buf = NULL; + bool mprotect = false; + if (TransactionIdIsValid(xid)) { /* NB: xid must be known committed here! */ @@ -256,10 +259,18 @@ SetHintBits(HeapTupleHeader tuple, Buffer buffer, } } #ifdef __SUPPORT_DISTRIBUTED_TRANSACTION__ - if (enable_buffer_mprotect) + /* + * BUFFER_LOCK_EXCLUSIVE has made the buffer writable, but BUFFER_LOCK_SHARED + * does not, so it has to be set to be writable. + * + * After setting GTS, it needs to set the memory protection again. + */ + buf = GetBufferDescriptor(buffer - 1); + mprotect = enable_buffer_mprotect && + LWLockHeldByMeInMode(BufferDescriptorGetContentLock(buf), LW_SHARED); + if (mprotect) { - LockBuffer(buffer, BUFFER_LOCK_UNLOCK); - LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + BufDisableMemoryProtection(BufferGetPage(buffer), false); } if(infomask & HEAP_XMIN_COMMITTED) { @@ -323,6 +334,11 @@ SetHintBits(HeapTupleHeader tuple, Buffer buffer, tuple->t_infomask |= infomask; MarkBufferDirtyHint(buffer, true); + + if (mprotect) + { + BufEnableMemoryProtection(BufferGetPage(buffer), false); + } } diff --git a/src/include/pgxc/planner.h b/src/include/pgxc/planner.h index c868136a..9e16886d 100644 --- a/src/include/pgxc/planner.h +++ b/src/include/pgxc/planner.h @@ -227,6 +227,7 @@ typedef struct Node *parsetree; /* to recognize subtxn cmds (savepoint, rollback to, release savepoint) */ bool is_set; /* is SET statement ? */ + bool ignore_tuple_desc; /* should ignore received tuple slot desc ? 
*/ #endif } RemoteQuery; diff --git a/src/include/pgxc/poolmgr.h b/src/include/pgxc/poolmgr.h index 9fff0445..e5cee0b6 100644 --- a/src/include/pgxc/poolmgr.h +++ b/src/include/pgxc/poolmgr.h @@ -105,7 +105,7 @@ typedef struct /* trace info */ int32 refcount; /* reference count */ - int32 m_version; /* version of node slot */ + time_t m_version; /* version of node slot */ int32 pid; /* agent pid that contains the slot */ int32 seqnum; /* slot seqnum for the slot, unique for one slot */ bool bdestoryed; /* used to show whether we are destoryed */ @@ -128,7 +128,7 @@ typedef struct int size; /* total pool size */ char node_name[NAMEDATALEN]; /* name of the node.*/ - int32 m_version; /* version of node pool */ + time_t m_version; /* version of node pool */ PGXCNodePoolSlot **slot; } PGXCNodePool; diff --git a/src/test/regress/expected/aggregates_1.out b/src/test/regress/expected/aggregates_1.out index d967655b..9602196b 100644 --- a/src/test/regress/expected/aggregates_1.out +++ b/src/test/regress/expected/aggregates_1.out @@ -1872,8 +1872,6 @@ create aggregate my_sum(int4) ); -- aggregate state should be shared as aggs are the same. select my_avg(one),my_avg(one) from (values(1),(3)) t(one); -NOTICE: avg_transfn called with 1 -NOTICE: avg_transfn called with 3 my_avg | my_avg --------+-------- 2 | 2 @@ -1881,8 +1879,6 @@ NOTICE: avg_transfn called with 3 -- aggregate state should be shared as transfn is the same for both aggs. select my_avg(one),my_sum(one) from (values(1),(3)) t(one); -NOTICE: avg_transfn called with 1 -NOTICE: avg_transfn called with 3 my_avg | my_sum --------+-------- 2 | 4 @@ -1890,8 +1886,6 @@ NOTICE: avg_transfn called with 3 -- same as previous one, but with DISTINCT, which requires sorting the input. select my_avg(distinct one),my_sum(distinct one) from (values(1),(3),(1)) t(one); -NOTICE: avg_transfn called with 1 -NOTICE: avg_transfn called with 3 my_avg | my_sum --------+-------- 2 | 4 @@ -1899,10 +1893,6 @@ NOTICE: avg_transfn called with 3 -- shouldn't share states due to the distinctness not matching. select my_avg(distinct one),my_sum(one) from (values(1),(3)) t(one); -NOTICE: avg_transfn called with 1 -NOTICE: avg_transfn called with 3 -NOTICE: avg_transfn called with 1 -NOTICE: avg_transfn called with 3 my_avg | my_sum --------+-------- 2 | 4 @@ -1910,9 +1900,6 @@ NOTICE: avg_transfn called with 3 -- shouldn't share states due to the filter clause not matching. select my_avg(one) filter (where one > 1),my_sum(one) from (values(1),(3)) t(one); -NOTICE: avg_transfn called with 1 -NOTICE: avg_transfn called with 3 -NOTICE: avg_transfn called with 3 my_avg | my_sum --------+-------- 3 | 4 @@ -1920,10 +1907,6 @@ NOTICE: avg_transfn called with 3 -- this should not share the state due to different input columns. select my_avg(one),my_sum(two) from (values(1,2),(3,4)) t(one,two); -NOTICE: avg_transfn called with 2 -NOTICE: avg_transfn called with 1 -NOTICE: avg_transfn called with 4 -NOTICE: avg_transfn called with 3 my_avg | my_sum --------+-------- 2 | 6 @@ -1953,8 +1936,6 @@ create aggregate my_avg_init2(int4) ); -- state should be shared if INITCONDs are matching select my_sum_init(one),my_avg_init(one) from (values(1),(3)) t(one); -NOTICE: avg_transfn called with 1 -NOTICE: avg_transfn called with 3 my_sum_init | my_avg_init -------------+------------- 14 | 7 @@ -1962,10 +1943,6 @@ NOTICE: avg_transfn called with 3 -- Varying INITCONDs should cause the states not to be shared. 
select my_sum_init(one),my_avg_init2(one) from (values(1),(3)) t(one); -NOTICE: avg_transfn called with 1 -NOTICE: avg_transfn called with 1 -NOTICE: avg_transfn called with 3 -NOTICE: avg_transfn called with 3 my_sum_init | my_avg_init2 -------------+-------------- 14 | 4 @@ -2017,10 +1994,6 @@ create aggregate my_half_sum(int4) ); -- Agg state should be shared even though my_sum has no finalfn select my_sum(one),my_half_sum(one) from (values(1),(2),(3),(4)) t(one); -NOTICE: sum_transfn called with 1 -NOTICE: sum_transfn called with 2 -NOTICE: sum_transfn called with 3 -NOTICE: sum_transfn called with 4 my_sum | my_half_sum --------+------------- 10 | 5 diff --git a/src/test/regress/expected/create_view.out b/src/test/regress/expected/create_view.out index 56e73b4e..d1285a50 100644 --- a/src/test/regress/expected/create_view.out +++ b/src/test/regress/expected/create_view.out @@ -1713,3 +1713,35 @@ DROP SCHEMA temp_view_test CASCADE; NOTICE: drop cascades to 27 other objects DROP SCHEMA testviewschm2 CASCADE; NOTICE: drop cascades to 62 other objects +-- check plan without sort operator, but need merge sort +set enable_seqscan = off; +create table test(v int primary key, w int) distribute by shard(v); +NOTICE: Replica identity is needed for shard table, please add to this table through "alter table" command. +insert into test values(generate_series(1,50), generate_series(1,50)); +create view test_sort as select * from test where v in (select v from test where w < 20) order by v asc; +select * from test_sort; + v | w +----+---- + 1 | 1 + 2 | 2 + 3 | 3 + 4 | 4 + 5 | 5 + 6 | 6 + 7 | 7 + 8 | 8 + 9 | 9 + 10 | 10 + 11 | 11 + 12 | 12 + 13 | 13 + 14 | 14 + 15 | 15 + 16 | 16 + 17 | 17 + 18 | 18 + 19 | 19 +(19 rows) + +drop table test cascade; +NOTICE: drop cascades to view test_sort diff --git a/src/test/regress/expected/gist_1.out b/src/test/regress/expected/gist_1.out index 99b03902..0653fb98 100644 --- a/src/test/regress/expected/gist_1.out +++ b/src/test/regress/expected/gist_1.out @@ -129,7 +129,8 @@ cross join lateral QUERY PLAN -------------------------------------------------------------------------------------- Nested Loop - -> Values Scan on "*VALUES*" + -> Remote Subquery Scan on all (datanode_1) + -> Values Scan on "*VALUES*" -> Materialize -> Limit -> Remote Subquery Scan on all (datanode_1) @@ -137,7 +138,7 @@ cross join lateral -> Index Only Scan using gist_tbl_point_index on gist_tbl Index Cond: (p <@ "*VALUES*".column1) Order By: (p <-> ("*VALUES*".column1)[0]) -(9 rows) +(10 rows) select p from (values (box(point(0,0), point(0.5,0.5))), diff --git a/src/test/regress/expected/groupingsets.out b/src/test/regress/expected/groupingsets.out index e1524f49..56a23289 100644 --- a/src/test/regress/expected/groupingsets.out +++ b/src/test/regress/expected/groupingsets.out @@ -658,20 +658,21 @@ select v.c, (select count(*) from gstest2 group by () having v.c) explain (costs off) select v.c, (select count(*) from gstest2 group by () having v.c) from (values (false),(true)) v(c) order by v.c; - QUERY PLAN -------------------------------------------------------------------------- - Sort - Sort Key: "*VALUES*".column1 - -> Values Scan on "*VALUES*" - SubPlan 1 - -> Aggregate - Group Key: () - Filter: "*VALUES*".column1 - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Result - One-Time Filter: "*VALUES*".column1 - -> Seq Scan on gstest2 -(11 rows) + QUERY PLAN +------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1) 
+ -> Sort + Sort Key: "*VALUES*".column1 + -> Values Scan on "*VALUES*" + SubPlan 1 + -> Aggregate + Group Key: () + Filter: "*VALUES*".column1 + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Result + One-Time Filter: "*VALUES*".column1 + -> Seq Scan on gstest2 +(12 rows) -- HAVING with GROUPING queries select ten, grouping(ten) from onek @@ -885,15 +886,16 @@ select a, b, grouping(a,b), sum(v), count(*), max(v) explain (costs off) select a, b, grouping(a,b), sum(v), count(*), max(v) from gstest1 group by grouping sets ((a),(b)) order by 3,1,2; - QUERY PLAN --------------------------------------------------------------------------------------------------------- - Sort - Sort Key: (GROUPING("*VALUES*".column1, "*VALUES*".column2)), "*VALUES*".column1, "*VALUES*".column2 - -> HashAggregate - Hash Key: "*VALUES*".column1 - Hash Key: "*VALUES*".column2 - -> Values Scan on "*VALUES*" -(6 rows) + QUERY PLAN +-------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1) + -> Sort + Sort Key: (GROUPING("*VALUES*".column1, "*VALUES*".column2)), "*VALUES*".column1, "*VALUES*".column2 + -> HashAggregate + Hash Key: "*VALUES*".column1 + Hash Key: "*VALUES*".column2 + -> Values Scan on "*VALUES*" +(7 rows) select a, b, grouping(a,b), sum(v), count(*), max(v) from gstest1 group by cube(a,b) order by 3,1,2; @@ -919,34 +921,36 @@ select a, b, grouping(a,b), sum(v), count(*), max(v) explain (costs off) select a, b, grouping(a,b), sum(v), count(*), max(v) from gstest1 group by cube(a,b) order by 3,1,2; - QUERY PLAN --------------------------------------------------------------------------------------------------------- - Sort - Sort Key: (GROUPING("*VALUES*".column1, "*VALUES*".column2)), "*VALUES*".column1, "*VALUES*".column2 - -> MixedAggregate - Hash Key: "*VALUES*".column1, "*VALUES*".column2 - Hash Key: "*VALUES*".column1 - Hash Key: "*VALUES*".column2 - Group Key: () - -> Values Scan on "*VALUES*" -(8 rows) + QUERY PLAN +-------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1) + -> Sort + Sort Key: (GROUPING("*VALUES*".column1, "*VALUES*".column2)), "*VALUES*".column1, "*VALUES*".column2 + -> MixedAggregate + Hash Key: "*VALUES*".column1, "*VALUES*".column2 + Hash Key: "*VALUES*".column1 + Hash Key: "*VALUES*".column2 + Group Key: () + -> Values Scan on "*VALUES*" +(9 rows) -- shouldn't try and hash explain (costs off) select a, b, grouping(a,b), array_agg(v order by v) from gstest1 group by cube(a,b); - QUERY PLAN ----------------------------------------------------------- - GroupAggregate - Group Key: "*VALUES*".column1, "*VALUES*".column2 - Group Key: "*VALUES*".column1 - Group Key: () - Sort Key: "*VALUES*".column2 - Group Key: "*VALUES*".column2 - -> Sort - Sort Key: "*VALUES*".column1, "*VALUES*".column2 - -> Values Scan on "*VALUES*" -(9 rows) + QUERY PLAN +---------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1) + -> GroupAggregate + Group Key: "*VALUES*".column1, "*VALUES*".column2 + Group Key: "*VALUES*".column1 + Group Key: () + Sort Key: "*VALUES*".column2 + Group Key: "*VALUES*".column2 + -> Sort + Sort Key: "*VALUES*".column1, "*VALUES*".column2 + -> Values Scan on "*VALUES*" +(10 rows) -- mixed hashable/sortable cases select unhashable_col, unsortable_col, @@ -1134,15 +1138,16 @@ explain (costs off) select a, b, sum(v.x) from (values 
(1),(2)) v(x), gstest_data(v.x) group by grouping sets (a,b); - QUERY PLAN ------------------------------------------- + QUERY PLAN +------------------------------------------------------ HashAggregate Hash Key: gstest_data.a Hash Key: gstest_data.b -> Nested Loop - -> Values Scan on "*VALUES*" + -> Remote Subquery Scan on all (datanode_1) + -> Values Scan on "*VALUES*" -> Function Scan on gstest_data -(6 rows) +(7 rows) select * from (values (1),(2)) v(x), @@ -1188,16 +1193,17 @@ select a, b, grouping(a,b), sum(v), count(*), max(v) explain (costs off) select a, b, grouping(a,b), sum(v), count(*), max(v) from gstest1 group by grouping sets ((a,b),(a+1,b+1),(a+2,b+2)) order by 3,6; - QUERY PLAN -------------------------------------------------------------------------------------------- - Sort - Sort Key: (GROUPING("*VALUES*".column1, "*VALUES*".column2)), (max("*VALUES*".column3)) - -> HashAggregate - Hash Key: "*VALUES*".column1, "*VALUES*".column2 - Hash Key: ("*VALUES*".column1 + 1), ("*VALUES*".column2 + 1) - Hash Key: ("*VALUES*".column1 + 2), ("*VALUES*".column2 + 2) - -> Values Scan on "*VALUES*" -(7 rows) + QUERY PLAN +------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1) + -> Sort + Sort Key: (GROUPING("*VALUES*".column1, "*VALUES*".column2)), (max("*VALUES*".column3)) + -> HashAggregate + Hash Key: "*VALUES*".column1, "*VALUES*".column2 + Hash Key: ("*VALUES*".column1 + 1), ("*VALUES*".column2 + 1) + Hash Key: ("*VALUES*".column1 + 2), ("*VALUES*".column2 + 2) + -> Values Scan on "*VALUES*" +(8 rows) select a, b, sum(c), sum(sum(c)) over (order by a,b) as rsum from gstest2 group by cube (a,b) order by rsum, a, b; @@ -1255,8 +1261,8 @@ explain (costs off) select a, b, sum(v.x) from (values (1),(2)) v(x), gstest_data(v.x) group by cube (a,b) order by a,b; - QUERY PLAN ------------------------------------------------- + QUERY PLAN +------------------------------------------------------------ Sort Sort Key: gstest_data.a, gstest_data.b -> MixedAggregate @@ -1265,9 +1271,10 @@ explain (costs off) Hash Key: gstest_data.b Group Key: () -> Nested Loop - -> Values Scan on "*VALUES*" + -> Remote Subquery Scan on all (datanode_1) + -> Values Scan on "*VALUES*" -> Function Scan on gstest_data -(10 rows) +(11 rows) -- More rescan tests select * from (values (1),(2)) v(a) left join lateral (select v.a, four, ten, count(*) from onek group by cube(four,ten)) s on true order by v.a,four,ten; diff --git a/src/test/regress/expected/groupingsets_1.out b/src/test/regress/expected/groupingsets_1.out index 93958dfc..e1524f49 100644 --- a/src/test/regress/expected/groupingsets_1.out +++ b/src/test/regress/expected/groupingsets_1.out @@ -1410,14 +1410,16 @@ explain (costs off) Hash Key: four Hash Key: ten Hash Key: hundred - Hash Key: thousand - Hash Key: twothousand Group Key: unique1 + Sort Key: twothousand + Group Key: twothousand + Sort Key: thousand + Group Key: thousand -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Sort Sort Key: unique1 -> Seq Scan on tenk1 -(12 rows) +(14 rows) explain (costs off) select unique1, @@ -1448,16 +1450,18 @@ explain (costs off) from tenk1 group by grouping sets (unique1,twothousand,thousand,hundred,ten,four,two); QUERY PLAN ----------------------------------------------------------- - HashAggregate - Hash Key: unique1 - Hash Key: twothousand - Hash Key: thousand - Hash Key: hundred - Hash Key: ten - Hash Key: four + MixedAggregate Hash Key: two + Hash Key: four + Hash 
Key: ten + Hash Key: hundred + Hash Key: thousand + Hash Key: twothousand + Group Key: unique1 -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Seq Scan on tenk1 -(10 rows) + -> Sort + Sort Key: unique1 + -> Seq Scan on tenk1 +(12 rows) -- end diff --git a/src/test/regress/expected/join_3.out b/src/test/regress/expected/join_3.out index 53a75d2f..3841e3eb 100644 --- a/src/test/regress/expected/join_3.out +++ b/src/test/regress/expected/join_3.out @@ -3513,13 +3513,13 @@ left join ) foo3 using (join_key); QUERY PLAN --------------------------------------------------------------------------------- - Hash Right Join - Output: "*VALUES*".column1, i1.f1, (666) - Hash Cond: (i1.f1 = "*VALUES*".column1) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Output: i1.f1, 666 - -> Merge Right Join +----------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + Output: "*VALUES*".column1, i1.f1, 666 + -> Hash Right Join + Output: "*VALUES*".column1, i1.f1, (666) + Hash Cond: (i1.f1 = "*VALUES*".column1) + -> Hash Right Join Output: i1.f1, 666 Merge Cond: (i2.unique2 = i1.f1) -> Remote Subquery Scan on all (datanode_1,datanode_2) @@ -3540,9 +3540,12 @@ using (join_key); Output: i1.f1 -> Hash Output: "*VALUES*".column1 + -> Remote Subquery Scan on all (datanode_1) + Output: "*VALUES*".column1 + Distribute results by H: column1 -> Values Scan on "*VALUES*" Output: "*VALUES*".column1 -(28 rows) +(27 rows) select foo1.join_key as foo1_id, foo3.join_key AS foo3_id, bug_field from (values (0),(1)) foo1(join_key) @@ -3558,8 +3561,8 @@ left join using (join_key); foo1_id | foo3_id | bug_field ---------+---------+----------- - 0 | 0 | 666 1 | | + 0 | 0 | 666 (2 rows) -- @@ -5431,15 +5434,13 @@ select * from where f1 = any (select unique1 from tenk1 where unique2 = v.x offset 0)) ss; QUERY PLAN ----------------------------------------------------------------------------------------- - Nested Loop +---------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: "*VALUES*".column1, "*VALUES*".column2, int4_tbl.f1 + -> Nested Loop Output: "*VALUES*".column1, "*VALUES*".column2, int4_tbl.f1 -> Values Scan on "*VALUES*" Output: "*VALUES*".column1, "*VALUES*".column2 - -> Materialize - Output: int4_tbl.f1 - -> Remote Subquery Scan on all - Output: int4_tbl.f1 -> Hash Join Output: int4_tbl.f1 Inner Unique: true @@ -5454,7 +5455,7 @@ select * from -> Index Scan using tenk1_unique2 on public.tenk1 Output: tenk1.unique1 Index Cond: (tenk1.unique2 = "*VALUES*".column2) -(22 rows) +(20 rows) select * from (values (0,9998), (1,1000)) v(id,x), @@ -5478,19 +5479,17 @@ lateral (select * from int8_tbl t1, and (select v.id=0)) offset 0) ss2) ss where t1.q1 = ss.q2) ss0; QUERY PLAN ------------------------------------------------------------------------------------ - Nested Loop +----------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1) Output: "*VALUES*".column1, t1.q1, t1.q2, ss2.q1, ss2.q2 - -> Values Scan on "*VALUES*" - Output: "*VALUES*".column1 - -> Materialize - Output: t1.q1, t1.q2, ss2.q1, ss2.q2 - -> Remote Subquery Scan on all (datanode_1) - Output: t1.q1, t1.q2, ss2.q1, ss2.q2 -> Nested Loop - Output: t1.q1, t1.q2, ss2.q1, ss2.q2 + Output: "*VALUES*".column1, t1.q1, t1.q2, ss2.q1, ss2.q2 -> Seq Scan on public.int8_tbl t1 Output: t1.q1, t1.q2 + -> Nested Loop + Output: "*VALUES*".column1, ss2.q1, ss2.q2 + -> Values 
Scan on "*VALUES*" + Output: "*VALUES*".column1 -> Subquery Scan on ss2 Output: ss2.q1, ss2.q2 Filter: (t1.q1 = ss2.q2) @@ -5512,7 +5511,7 @@ lateral (select * from int8_tbl t1, -> Seq Scan on public.int8_tbl t3 Output: t3.q1, t3.q2 Filter: (t3.q2 = $2) -(33 rows) +(31 rows) select * from (values (0), (1)) v(id), lateral (select * from int8_tbl t1, diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out index 14660970..25688014 100644 --- a/src/test/regress/expected/rules.out +++ b/src/test/regress/expected/rules.out @@ -3197,11 +3197,12 @@ RETURNING *; Conflict Arbiter Indexes: hat_data_unique_idx Conflict Filter: ((excluded.hat_color <> 'forbidden'::bpchar) AND (hat_data.* <> excluded.*)) CTE data + -> Remote Subquery Scan on all -> Values Scan on "*VALUES*" -> Remote Subquery Scan on all Distribute results by H: hat_name -> CTE Scan on data -(10 rows) +(11 rows) SELECT * FROM hat_data WHERE hat_name IN ('h8', 'h9', 'h7') ORDER BY hat_name; hat_name | hat_color diff --git a/src/test/regress/expected/subselect.out b/src/test/regress/expected/subselect.out index 6e607200..2743774f 100644 --- a/src/test/regress/expected/subselect.out +++ b/src/test/regress/expected/subselect.out @@ -777,8 +777,10 @@ explain (verbose, costs off) select x, x from (select (select now()) as x from (values(1),(2)) v(y)) ss; QUERY PLAN ---------------------------- - Values Scan on "*VALUES*" +------------------------------------------ + Remote Subquery Scan on all (datanode_1) + Output: $0, $1 + -> Values Scan on "*VALUES*" Output: $0, $1 InitPlan 1 (returns $0) -> Result @@ -786,28 +788,32 @@ explain (verbose, costs off) InitPlan 2 (returns $1) -> Result Output: now() -(8 rows) +(10 rows) explain (verbose, costs off) select x, x from (select (select random()) as x from (values(1),(2)) v(y)) ss; QUERY PLAN ----------------------------------- - Subquery Scan on ss +------------------------------------------ + Remote Subquery Scan on all (datanode_1) + Output: ss.x, ss.x + -> Subquery Scan on ss Output: ss.x, ss.x -> Values Scan on "*VALUES*" Output: $0 InitPlan 1 (returns $0) -> Result Output: random() -(7 rows) +(9 rows) explain (verbose, costs off) select x, x from (select (select now() where y=y) as x from (values(1),(2)) v(y)) ss; QUERY PLAN ----------------------------------------------------------------------- - Values Scan on "*VALUES*" +---------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1) + Output: (SubPlan 1), (SubPlan 2) + -> Values Scan on "*VALUES*" Output: (SubPlan 1), (SubPlan 2) SubPlan 1 -> Result @@ -817,14 +823,16 @@ explain (verbose, costs off) -> Result Output: now() One-Time Filter: ("*VALUES*".column1 = "*VALUES*".column1) -(10 rows) +(12 rows) explain (verbose, costs off) select x, x from (select (select random() where y=y) as x from (values(1),(2)) v(y)) ss; QUERY PLAN ----------------------------------------------------------------------------- - Subquery Scan on ss +---------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1) + Output: ss.x, ss.x + -> Subquery Scan on ss Output: ss.x, ss.x -> Values Scan on "*VALUES*" Output: (SubPlan 1) @@ -832,7 +840,7 @@ explain (verbose, costs off) -> Result Output: random() One-Time Filter: ("*VALUES*".column1 = "*VALUES*".column1) -(8 rows) +(10 rows) -- -- Check we behave sanely in corner case of empty SELECT list (bug #8648) @@ -1955,10 +1963,12 @@ with recursive x(a) as where length(z.a 
|| z1.a) < 5)) select * from x; QUERY PLAN ----------------------------------------------------------- +---------------------------------------------------------------- CTE Scan on x Output: x.a CTE x + -> Remote Subquery Scan on all (datanode_1) + Output: x_1.a -> Recursive Union -> Values Scan on "*VALUES*" Output: "*VALUES*".column1 @@ -1974,7 +1984,7 @@ select * from x; Output: z1.a -> CTE Scan on z z1 Output: z1.a -(18 rows) +(20 rows) with recursive x(a) as ((values ('a'), ('b')) @@ -2018,17 +2028,19 @@ with recursive x(a) as where length(z.a || z.a) < 5)) select * from x; QUERY PLAN --------------------------------------------------------- +-------------------------------------------------------------- CTE Scan on x Output: x.a CTE x + -> Remote Subquery Scan on all (datanode_1) + Output: x.a -> Recursive Union -> Values Scan on "*VALUES*" Output: "*VALUES*".column1 -> WorkTable Scan on x x_1 Output: (x_1.a || x_1.a) Filter: (length((x_1.a || x_1.a)) < 5) -(9 rows) +(11 rows) with recursive x(a) as ((values ('a'), ('b')) diff --git a/src/test/regress/expected/tablesample_1.out b/src/test/regress/expected/tablesample_1.out index 133927af..7f528c9b 100644 --- a/src/test/regress/expected/tablesample_1.out +++ b/src/test/regress/expected/tablesample_1.out @@ -242,17 +242,22 @@ select pct, count(unique1) from (values (0),(100)) v(pct), lateral (select * from tenk1 tablesample bernoulli (pct)) ss group by pct; - QUERY PLAN ------------------------------------------------------------------------ - HashAggregate - Group Key: "*VALUES*".column1 - -> Nested Loop - -> Values Scan on "*VALUES*" - -> Materialize + QUERY PLAN +-------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Finalize GroupAggregate + Group Key: "*VALUES*".column1 + -> Sort + Sort Key: "*VALUES*".column1 -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Sample Scan on tenk1 - Sampling: bernoulli ("*VALUES*".column1) -(8 rows) + Distribute results by H: column1 + -> Partial HashAggregate + Group Key: "*VALUES*".column1 + -> Nested Loop + -> Values Scan on "*VALUES*" + -> Sample Scan on tenk1 + Sampling: bernoulli ("*VALUES*".column1) +(13 rows) select pct, count(unique1) from (values (0),(100)) v(pct), @@ -260,7 +265,8 @@ select pct, count(unique1) from group by pct; pct | count -----+------- -(0 rows) + 100 | 10000 +(1 row) select pct, count(unique1) from (values (0),(100)) v(pct), @@ -268,7 +274,8 @@ select pct, count(unique1) from group by pct; pct | count -----+------- -(0 rows) + 100 | 10000 +(1 row) -- errors SELECT id FROM test_tablesample TABLESAMPLE FOOBAR (1); diff --git a/src/test/regress/sql/create_view.sql b/src/test/regress/sql/create_view.sql index 47ef2f99..cd7a3309 100644 --- a/src/test/regress/sql/create_view.sql +++ b/src/test/regress/sql/create_view.sql @@ -584,3 +584,11 @@ select pg_get_ruledef(oid, true) from pg_rewrite \set VERBOSITY terse \\ -- suppress cascade details DROP SCHEMA temp_view_test CASCADE; DROP SCHEMA testviewschm2 CASCADE; + +-- check plan without sort operator, but need merge sort +set enable_seqscan = off; +create table test(v int primary key, w int) distribute by shard(v); +insert into test values(generate_series(1,50), generate_series(1,50)); +create view test_sort as select * from test where v in (select v from test where w < 20) order by v asc; +select * from test_sort; +drop table test cascade; From f86b7d5f0a58be4154515647d55fe3b08e3ea9bf Mon Sep 17 00:00:00 2001 From: JennyJennyChen 
Date: Sat, 5 Jun 2021 20:29:59 +0800 Subject: [PATCH 168/578] fix compile warnings and regress expected info --- src/backend/optimizer/path/allpaths.c | 2 +- src/backend/optimizer/util/pgxcship.c | 2 +- src/test/regress/expected/create_view.out | 2 +- src/test/regress/expected/join_3.out | 92 +++++++++--------- src/test/regress/expected/rules.out | 2 +- src/test/regress/expected/subselect.out | 110 +++++++++++----------- src/test/regress/sql/create_view.sql | 2 +- 7 files changed, 106 insertions(+), 106 deletions(-) diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c index 821c2aef..7a17b7d7 100644 --- a/src/backend/optimizer/path/allpaths.c +++ b/src/backend/optimizer/path/allpaths.c @@ -2095,7 +2095,7 @@ check_list_contain_all_const(List *list) node = lfirst(lc); if (IsA(node, List)) { - if (!check_list_contain_all_const(node)) + if (!check_list_contain_all_const((List *)node)) { return false; } diff --git a/src/backend/optimizer/util/pgxcship.c b/src/backend/optimizer/util/pgxcship.c index 3d00fa58..7c577339 100644 --- a/src/backend/optimizer/util/pgxcship.c +++ b/src/backend/optimizer/util/pgxcship.c @@ -1884,7 +1884,7 @@ pgxc_query_contains_only_pg_catalog(List *rtable) return true; } -ExecNodes * +static ExecNodes * make_FQS_single_node() { ExecNodes *exec_nodes; diff --git a/src/test/regress/expected/create_view.out b/src/test/regress/expected/create_view.out index d1285a50..b0007ead 100644 --- a/src/test/regress/expected/create_view.out +++ b/src/test/regress/expected/create_view.out @@ -1719,7 +1719,7 @@ create table test(v int primary key, w int) distribute by shard(v); NOTICE: Replica identity is needed for shard table, please add to this table through "alter table" command. insert into test values(generate_series(1,50), generate_series(1,50)); create view test_sort as select * from test where v in (select v from test where w < 20) order by v asc; -select * from test_sort; +select * from test_sort order by 1; v | w ----+---- 1 | 1 diff --git a/src/test/regress/expected/join_3.out b/src/test/regress/expected/join_3.out index 3841e3eb..4b1d3032 100644 --- a/src/test/regress/expected/join_3.out +++ b/src/test/regress/expected/join_3.out @@ -3513,13 +3513,13 @@ left join ) foo3 using (join_key); QUERY PLAN ------------------------------------------------------------------------ +-------------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) Output: "*VALUES*".column1, i1.f1, 666 -> Hash Right Join Output: "*VALUES*".column1, i1.f1, (666) Hash Cond: (i1.f1 = "*VALUES*".column1) - -> Hash Right Join + -> Merge Right Join Output: i1.f1, 666 Merge Cond: (i2.unique2 = i1.f1) -> Remote Subquery Scan on all (datanode_1,datanode_2) @@ -3538,14 +3538,14 @@ using (join_key); Sort Key: i1.f1 -> Seq Scan on public.int4_tbl i1 Output: i1.f1 - -> Hash - Output: "*VALUES*".column1 + -> Hash + Output: "*VALUES*".column1 -> Remote Subquery Scan on all (datanode_1) Output: "*VALUES*".column1 Distribute results by H: column1 - -> Values Scan on "*VALUES*" - Output: "*VALUES*".column1 -(27 rows) + -> Values Scan on "*VALUES*" + Output: "*VALUES*".column1 +(31 rows) select foo1.join_key as foo1_id, foo3.join_key AS foo3_id, bug_field from (values (0),(1)) foo1(join_key) @@ -5433,28 +5433,28 @@ select * from lateral (select f1 from int4_tbl where f1 = any (select unique1 from tenk1 where unique2 = v.x offset 0)) ss; - QUERY PLAN + QUERY PLAN 
---------------------------------------------------------------------------------- Remote Subquery Scan on all Output: "*VALUES*".column1, "*VALUES*".column2, int4_tbl.f1 -> Nested Loop - Output: "*VALUES*".column1, "*VALUES*".column2, int4_tbl.f1 - -> Values Scan on "*VALUES*" - Output: "*VALUES*".column1, "*VALUES*".column2 - -> Hash Join + Output: "*VALUES*".column1, "*VALUES*".column2, int4_tbl.f1 + -> Values Scan on "*VALUES*" + Output: "*VALUES*".column1, "*VALUES*".column2 + -> Hash Join + Output: int4_tbl.f1 + Inner Unique: true + Hash Cond: (int4_tbl.f1 = tenk1.unique1) + -> Seq Scan on public.int4_tbl Output: int4_tbl.f1 - Inner Unique: true - Hash Cond: (int4_tbl.f1 = tenk1.unique1) - -> Seq Scan on public.int4_tbl - Output: int4_tbl.f1 - -> Hash + -> Hash + Output: tenk1.unique1 + -> HashAggregate Output: tenk1.unique1 - -> HashAggregate + Group Key: tenk1.unique1 + -> Index Scan using tenk1_unique2 on public.tenk1 Output: tenk1.unique1 - Group Key: tenk1.unique1 - -> Index Scan using tenk1_unique2 on public.tenk1 - Output: tenk1.unique1 - Index Cond: (tenk1.unique2 = "*VALUES*".column2) + Index Cond: (tenk1.unique2 = "*VALUES*".column2) (20 rows) select * from @@ -5478,39 +5478,39 @@ lateral (select * from int8_tbl t1, where q2 = (select greatest(t1.q1,t2.q2)) and (select v.id=0)) offset 0) ss2) ss where t1.q1 = ss.q2) ss0; - QUERY PLAN + QUERY PLAN ----------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1) Output: "*VALUES*".column1, t1.q1, t1.q2, ss2.q1, ss2.q2 - -> Nested Loop + -> Nested Loop Output: "*VALUES*".column1, t1.q1, t1.q2, ss2.q1, ss2.q2 - -> Seq Scan on public.int8_tbl t1 - Output: t1.q1, t1.q2 + -> Seq Scan on public.int8_tbl t1 + Output: t1.q1, t1.q2 -> Nested Loop Output: "*VALUES*".column1, ss2.q1, ss2.q2 -> Values Scan on "*VALUES*" Output: "*VALUES*".column1 - -> Subquery Scan on ss2 - Output: ss2.q1, ss2.q2 - Filter: (t1.q1 = ss2.q2) - -> Seq Scan on public.int8_tbl t2 - Output: t2.q1, t2.q2 - Filter: (SubPlan 3) - SubPlan 3 - -> Remote Subquery Scan on all (datanode_1) + -> Subquery Scan on ss2 + Output: ss2.q1, ss2.q2 + Filter: (t1.q1 = ss2.q2) + -> Seq Scan on public.int8_tbl t2 + Output: t2.q1, t2.q2 + Filter: (SubPlan 3) + SubPlan 3 + -> Remote Subquery Scan on all (datanode_1) + Output: t3.q2 + -> Result Output: t3.q2 - -> Result - Output: t3.q2 - One-Time Filter: $4 - InitPlan 1 (returns $2) - -> Result - Output: GREATEST($0, t2.q2) - InitPlan 2 (returns $4) - -> Result - Output: ($3 = 0) - -> Seq Scan on public.int8_tbl t3 - Output: t3.q1, t3.q2 - Filter: (t3.q2 = $2) + One-Time Filter: $4 + InitPlan 1 (returns $2) + -> Result + Output: GREATEST($0, t2.q2) + InitPlan 2 (returns $4) + -> Result + Output: ($3 = 0) + -> Seq Scan on public.int8_tbl t3 + Output: t3.q1, t3.q2 + Filter: (t3.q2 = $2) (31 rows) select * from (values (0), (1)) v(id), diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out index 25688014..f750332d 100644 --- a/src/test/regress/expected/rules.out +++ b/src/test/regress/expected/rules.out @@ -3198,7 +3198,7 @@ RETURNING *; Conflict Filter: ((excluded.hat_color <> 'forbidden'::bpchar) AND (hat_data.* <> excluded.*)) CTE data -> Remote Subquery Scan on all - -> Values Scan on "*VALUES*" + -> Values Scan on "*VALUES*" -> Remote Subquery Scan on all Distribute results by H: hat_name -> CTE Scan on data diff --git a/src/test/regress/expected/subselect.out b/src/test/regress/expected/subselect.out index 2743774f..36a5f074 100644 --- 
a/src/test/regress/expected/subselect.out +++ b/src/test/regress/expected/subselect.out @@ -776,70 +776,70 @@ where a.thousand = b.thousand explain (verbose, costs off) select x, x from (select (select now()) as x from (values(1),(2)) v(y)) ss; - QUERY PLAN + QUERY PLAN ------------------------------------------ Remote Subquery Scan on all (datanode_1) Output: $0, $1 -> Values Scan on "*VALUES*" - Output: $0, $1 - InitPlan 1 (returns $0) - -> Result - Output: now() - InitPlan 2 (returns $1) - -> Result - Output: now() + Output: $0, $1 + InitPlan 1 (returns $0) + -> Result + Output: now() + InitPlan 2 (returns $1) + -> Result + Output: now() (10 rows) explain (verbose, costs off) select x, x from (select (select random()) as x from (values(1),(2)) v(y)) ss; - QUERY PLAN + QUERY PLAN ------------------------------------------ Remote Subquery Scan on all (datanode_1) Output: ss.x, ss.x -> Subquery Scan on ss - Output: ss.x, ss.x - -> Values Scan on "*VALUES*" - Output: $0 - InitPlan 1 (returns $0) - -> Result - Output: random() + Output: ss.x, ss.x + -> Values Scan on "*VALUES*" + Output: $0 + InitPlan 1 (returns $0) + -> Result + Output: random() (9 rows) explain (verbose, costs off) select x, x from (select (select now() where y=y) as x from (values(1),(2)) v(y)) ss; - QUERY PLAN + QUERY PLAN ---------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1) Output: (SubPlan 1), (SubPlan 2) -> Values Scan on "*VALUES*" - Output: (SubPlan 1), (SubPlan 2) - SubPlan 1 - -> Result - Output: now() - One-Time Filter: ("*VALUES*".column1 = "*VALUES*".column1) - SubPlan 2 - -> Result - Output: now() - One-Time Filter: ("*VALUES*".column1 = "*VALUES*".column1) + Output: (SubPlan 1), (SubPlan 2) + SubPlan 1 + -> Result + Output: now() + One-Time Filter: ("*VALUES*".column1 = "*VALUES*".column1) + SubPlan 2 + -> Result + Output: now() + One-Time Filter: ("*VALUES*".column1 = "*VALUES*".column1) (12 rows) explain (verbose, costs off) select x, x from (select (select random() where y=y) as x from (values(1),(2)) v(y)) ss; - QUERY PLAN + QUERY PLAN ---------------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1) Output: ss.x, ss.x -> Subquery Scan on ss - Output: ss.x, ss.x - -> Values Scan on "*VALUES*" - Output: (SubPlan 1) - SubPlan 1 - -> Result - Output: random() - One-Time Filter: ("*VALUES*".column1 = "*VALUES*".column1) + Output: ss.x, ss.x + -> Values Scan on "*VALUES*" + Output: (SubPlan 1) + SubPlan 1 + -> Result + Output: random() + One-Time Filter: ("*VALUES*".column1 = "*VALUES*".column1) (10 rows) -- @@ -1962,28 +1962,28 @@ with recursive x(a) as select z.a || z1.a as a from z cross join z as z1 where length(z.a || z1.a) < 5)) select * from x; - QUERY PLAN + QUERY PLAN ---------------------------------------------------------------- CTE Scan on x Output: x.a CTE x -> Remote Subquery Scan on all (datanode_1) Output: x_1.a - -> Recursive Union - -> Values Scan on "*VALUES*" - Output: "*VALUES*".column1 - -> Nested Loop - Output: (z.a || z1.a) - Join Filter: (length((z.a || z1.a)) < 5) - CTE z - -> WorkTable Scan on x x_1 - Output: x_1.a - -> CTE Scan on z - Output: z.a - -> Materialize - Output: z1.a - -> CTE Scan on z z1 + -> Recursive Union + -> Values Scan on "*VALUES*" + Output: "*VALUES*".column1 + -> Nested Loop + Output: (z.a || z1.a) + Join Filter: (length((z.a || z1.a)) < 5) + CTE z + -> WorkTable Scan on x x_1 + Output: x_1.a + -> CTE Scan on z + Output: z.a + -> 
Materialize Output: z1.a + -> CTE Scan on z z1 + Output: z1.a (20 rows) with recursive x(a) as @@ -2027,19 +2027,19 @@ with recursive x(a) as select z.a || z.a as a from z where length(z.a || z.a) < 5)) select * from x; - QUERY PLAN + QUERY PLAN -------------------------------------------------------------- CTE Scan on x Output: x.a CTE x -> Remote Subquery Scan on all (datanode_1) Output: x.a - -> Recursive Union - -> Values Scan on "*VALUES*" - Output: "*VALUES*".column1 - -> WorkTable Scan on x x_1 - Output: (x_1.a || x_1.a) - Filter: (length((x_1.a || x_1.a)) < 5) + -> Recursive Union + -> Values Scan on "*VALUES*" + Output: "*VALUES*".column1 + -> WorkTable Scan on x x_1 + Output: (x_1.a || x_1.a) + Filter: (length((x_1.a || x_1.a)) < 5) (11 rows) with recursive x(a) as diff --git a/src/test/regress/sql/create_view.sql b/src/test/regress/sql/create_view.sql index cd7a3309..4ec3f6f5 100644 --- a/src/test/regress/sql/create_view.sql +++ b/src/test/regress/sql/create_view.sql @@ -590,5 +590,5 @@ set enable_seqscan = off; create table test(v int primary key, w int) distribute by shard(v); insert into test values(generate_series(1,50), generate_series(1,50)); create view test_sort as select * from test where v in (select v from test where w < 20) order by v asc; -select * from test_sort; +select * from test_sort order by 1; drop table test cascade; From 949164ce186fc66a544a8d53967dfb74db81021a Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Tue, 15 Jun 2021 17:25:17 +0800 Subject: [PATCH 169/578] add TBase Community Code of Conduct --- Code-of-Conduct.md | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 Code-of-Conduct.md diff --git a/Code-of-Conduct.md b/Code-of-Conduct.md new file mode 100644 index 00000000..a103d91a --- /dev/null +++ b/Code-of-Conduct.md @@ -0,0 +1,5 @@ +# TBase Community Code of Conduct +TBase follows the [CNCF Code of Conduct](https://github.com/cncf/foundation/blob/master/code-of-conduct.md). 
+ + +Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the TBase Code of Conduct Committee via email: TBase_Community@qq.com \ No newline at end of file From 87f8908699eb2f76a4016534b7595ccac948add6 Mon Sep 17 00:00:00 2001 From: andrelin Date: Tue, 15 Jun 2021 18:09:53 +0800 Subject: [PATCH 170/578] Should call prev hook when startup pg_stat_cluster_activity --- contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c b/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c index 5efecaf6..304ee872 100644 --- a/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c +++ b/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c @@ -1015,6 +1015,9 @@ pg_cancel_session(PG_FUNCTION_ARGS) static void pgcs_shmem_startup(void) { + if (prev_shmem_startup_hook) + prev_shmem_startup_hook(); + CreateSharedClusterStatus(); } From fc29b95c5110ee660bc0882d29314fc08c1a798f Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Wed, 16 Jun 2021 10:46:14 +0800 Subject: [PATCH 171/578] update CONTRIBUTING --- CONTRIBUTING.md | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index d06b4e57..63f601c2 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,26 +1,26 @@ # Contributing --- -如果你有好的意见或建议,欢迎给我们提 [Issues](https://github.com/Tencent/TBase/issues) 或 [Pull Requests](https://github.com/Tencent/TBase/pulls),为TBase开源社区贡献力量。TBase 持续招募贡献者,即使是在 issue 中回答问题,或者做一些简单的 bugfix ,也会给 TBase 带来很大的帮助。 +If you have good comments or suggestions, welcome to create [Issues](https://github.com/Tencent/TBase/issues) or [Pull Requests](https://github.com/Tencent/TBase/pulls),contribute to the TBase open source community.TBase continues to recruit contributors, even if it is answering questions in the issue, or doing some simple bugfixes, it will be of great help to TBase. -[腾讯开源激励计划](https://opensource.tencent.com/contribution) 鼓励开发者的参与和贡献,期待你的加入。 +[Tencent Open Source Incentive Program](https://opensource.tencent.com/contribution) Encourage developers to participate and contribute, and look forward to your joining. ## Issue -#### 对于贡献者 +#### For contributors -在提 issue 前请确保满足一下条件: +Please ensure that the following conditions are met before submitting an issue: -* 必须是一个 bug 或者功能新增 -* 已经在 issue 中搜索过,并且没有找到相似的 issue 或者解决方案 -* 新建 Issue 时请提供详细的描述、截屏或者短视频来辅助我们定位问题 +* Must be a bug or new feature +* Have searched in the issue, and did not find a similar issue or solution +* When creating a new issue, please provide a detailed description, screenshot or short video to help us locate the problem ## Pull Request -我们欢迎大家贡献代码来使我们的产品更加强大,代码团队会监控所有的 Pull request, 我们会做相应的代码检查和测试,测试通过之后我们就会接纳 PR ,但是不会立即合并到 master 分支。 +We welcome everyone to contribute code to make our product more powerful. The code team will monitor all pull requests, and we will do the corresponding code inspection and testing. After the test passes, we will accept the PR, but will not immediately merge into the master branch. -在完成一个 PR 之前请做一下确认: +Please confirm before completing a PR: -1. 从 master 分支中 fork 你自己的分支。 -2. 在修改了代码之后请修改对应的文档和注释。 -3. 在新建的文件中请加入 License 和 Copyright 申明。 -4. 确保一致的代码风格。 -5. 做充分的测试。 -6. 然后,你可以提交你的代码到 dev 分支。 \ No newline at end of file +1. Fork your own branch from the master branch. +2. Please modify the corresponding documents and comments after modifying the code. +3. 
Please add License and Copyright declarations in the newly created file. +4. Ensure a consistent code style. +5. Do adequate testing. +6. Then, you can submit your code to the dev branch. \ No newline at end of file From bf113c199b8f2da0d41f05dcd34feb8b5d467864 Mon Sep 17 00:00:00 2001 From: ericxwu Date: Thu, 17 Jun 2021 19:40:03 +0800 Subject: [PATCH 172/578] Support NOT IN/ANY sublink pullup after cherry-picked PG lateral impl (merge request !406) Previously we skipped NOT IN/ANY sublink pull up after merged new postgres lateral related code. Here we removed the restirction. Now we also support the case that var is nullable by adding NullTest expr. http://tapd.oa.com/pgxz/bugtrace/bugs/view?bug_id=1010092131088904293 --- src/backend/nodes/makefuncs.c | 47 +++++ src/backend/optimizer/plan/subselect.c | 171 +++++++++++++++++- src/include/nodes/makefuncs.h | 4 + .../regress/expected/select_parallel_4.out | 16 +- src/test/regress/expected/subselect.out | 75 +++++++- src/test/regress/sql/subselect.sql | 14 ++ 6 files changed, 305 insertions(+), 22 deletions(-) diff --git a/src/backend/nodes/makefuncs.c b/src/backend/nodes/makefuncs.c index 3748c170..30c49729 100644 --- a/src/backend/nodes/makefuncs.c +++ b/src/backend/nodes/makefuncs.c @@ -695,3 +695,50 @@ makeGroupingSet(GroupingSetKind kind, List *content, int location) n->location = location; return n; } + +#ifdef __TBASE__ +/* + * makeNullTest - + * creates a Null Test expr like "expr is (NOT) NULL" + */ +NullTest * +makeNullTest(NullTestType type, Expr *expr) +{ + NullTest *n = makeNode(NullTest); + + n->nulltesttype = type; + n->arg = expr; + + return n; +} + +/* + * makeBoolExpr - + * creates a BoolExpr tree node. + */ +Expr * +makeBoolExprTreeNode(BoolExprType boolop, List *args) +{ + Node *node = NULL; + ListCell *lc = NULL; + + foreach (lc, args) + { + BoolExpr* b = NULL; + + if (node == NULL) + { + node = (Node*)lfirst(lc); + continue; + } + + b = makeNode(BoolExpr); + b->boolop = boolop; + b->args = list_make2(node, lfirst(lc)); + b->location = 0; + node = (Node*)b; + } + + return (Expr*)node; +} +#endif diff --git a/src/backend/optimizer/plan/subselect.c b/src/backend/optimizer/plan/subselect.c index 0f24fa9d..98ed5c26 100644 --- a/src/backend/optimizer/plan/subselect.c +++ b/src/backend/optimizer/plan/subselect.c @@ -152,6 +152,11 @@ static Node *convert_testexpr_mutator(Node *node, convert_testexpr_context *context); static bool subplan_is_hashable(Plan *plan); static bool testexpr_is_hashable(Node *testexpr); +#ifdef __TBASE__ +static Node *convert_joinqual_to_antiqual(Node* node, Query* parse); +static Node *convert_opexpr_to_boolexpr_for_antijoin(Node* node, Query* parse); +static bool var_is_nullable(Node *node, Query *parse); +#endif static bool hash_ok_operator(OpExpr *expr); static bool contain_dml(Node *node); static bool contain_dml_walker(Node *node, void *context); @@ -1342,6 +1347,98 @@ testexpr_is_hashable(Node *testexpr) return false; } +#ifdef __TBASE__ +/* + * Rewrite qual to complete nullability check for NOT IN/ANY sublink pullup + */ +static Node* +convert_joinqual_to_antiqual(Node* node, Query* parse) +{ + Node* antiqual = NULL; + + if (node == NULL) + return NULL; + + switch (nodeTag(node)) + { + case T_OpExpr: + antiqual = convert_opexpr_to_boolexpr_for_antijoin(node, parse); + break; + case T_BoolExpr: + { + /* Not IN, should be and clause.*/ + if (and_clause(node)) + { + BoolExpr* boolexpr = (BoolExpr*)node; + List* andarglist = NIL; + ListCell* l = NULL; + + foreach (l, boolexpr->args) + { + Node* 
andarg = (Node*)lfirst(l); + Node* expr = NULL; + + /* The listcell type of args should be OpExpr. */ + expr = convert_opexpr_to_boolexpr_for_antijoin(andarg, parse); + if (expr == NULL) + return NULL; + + andarglist = lappend(andarglist, expr); + } + + antiqual = (Node*)makeBoolExpr(AND_EXPR, andarglist, boolexpr->location); + } + else + return NULL; + } + break; + case T_ScalarArrayOpExpr: + case T_RowCompareExpr: + default: + antiqual = NULL; + break; + } + + return antiqual; +} + +static Node * +convert_opexpr_to_boolexpr_for_antijoin(Node *node, Query *parse) +{ + Node *boolexpr = NULL; + List *antiqual = NIL; + OpExpr *opexpr = NULL; + Node *larg = NULL; + Node *rarg = NULL; + + if (!IsA(node, OpExpr)) + return NULL; + else + opexpr = (OpExpr*)node; + + antiqual = (List*)list_make1(opexpr); + + larg = (Node*)linitial(opexpr->args); + if (IsA(larg, RelabelType)) + larg = (Node*)((RelabelType*)larg)->arg; + if (var_is_nullable(larg, parse)) + antiqual = lappend(antiqual, makeNullTest(IS_NULL, (Expr*)copyObject(larg))); + + rarg = (Node*)lsecond(opexpr->args); + if (IsA(rarg, RelabelType)) + rarg = (Node*)((RelabelType*)rarg)->arg; + if (var_is_nullable(rarg, parse)) + antiqual = lappend(antiqual, makeNullTest(IS_NULL, (Expr*)copyObject(rarg))); + + if (list_length(antiqual) > 1) + boolexpr = (Node*)makeBoolExprTreeNode(OR_EXPR, antiqual); + else + boolexpr = (Node*)opexpr; + + return boolexpr; +} +#endif + /* * Check expression is hashable + strict * @@ -2305,10 +2402,6 @@ convert_ANY_sublink_to_join(PlannerInfo *root, SubLink *sublink, return NULL; #ifdef __TBASE__ } - - /* TODO: Currently we do not pullup under_not */ - if (under_not) - return NULL; #endif /* @@ -2380,16 +2473,33 @@ convert_ANY_sublink_to_join(PlannerInfo *root, SubLink *sublink, * And finally, build the JoinExpr node. */ result = makeNode(JoinExpr); + #ifdef __TBASE__ - result->jointype = under_not ? 
JOIN_ANTI : JOIN_SEMI; -#else - result->jointype = JOIN_SEMI; + /* Different logic for NOT IN/ANY sublink */ + if (under_not) + { + Node* antiquals = NULL; + + antiquals = convert_joinqual_to_antiqual(quals, parse); + + if (antiquals == NULL) + return NULL; + + result->jointype = JOIN_ANTI; + result->quals = antiquals; + } + else + { + /* Basic logic for IN/ANY sublink */ + result->jointype = JOIN_SEMI; + result->quals = quals; + } #endif + result->isNatural = false; result->larg = NULL; /* caller must fill this in */ result->rarg = (Node *) rtr; result->usingClause = NIL; - result->quals = quals; result->alias = NULL; result->rtindex = 0; /* we don't need an RTE for it */ @@ -5682,4 +5792,47 @@ SS_remote_attach_initplans(PlannerInfo *root, Plan *plan) /* Process left and right child plans, if any */ SS_remote_attach_initplans(root, plan->lefttree); SS_remote_attach_initplans(root, plan->righttree); -} \ No newline at end of file +} + +#ifdef __TBASE__ +static bool +var_is_nullable(Node *node, Query *parse) +{ + RangeTblEntry* rte; + bool result = true; + Var *var = NULL; + + if (IsA(node, Var)) + var = (Var*) node; + else + return true; + + if (IS_SPECIAL_VARNO(var->varno) || + var->varno <= 0 || var->varno > list_length(parse->rtable)) + return true; + + rte = (RangeTblEntry *)list_nth(parse->rtable, var->varno - 1); + if (rte->rtekind == RTE_RELATION) + { + HeapTuple tp; + + tp = SearchSysCache2(ATTNUM, ObjectIdGetDatum(rte->relid), Int16GetDatum(var->varattno)); + if (!HeapTupleIsValid(tp)) + return true; + result = !((Form_pg_attribute)GETSTRUCT(tp))->attnotnull; + ReleaseSysCache(tp); + } + else if (rte->rtekind == RTE_SUBQUERY) + { + if (rte->subquery->groupingSets == NIL) + { + TargetEntry *te = (TargetEntry *)list_nth(rte->subquery->targetList, var->varattno - 1); + if (IsA(te->expr, Var)) + result = var_is_nullable((Node *)te->expr, rte->subquery); + } + } + + return result; +} + +#endif diff --git a/src/include/nodes/makefuncs.h b/src/include/nodes/makefuncs.h index 16390a28..6b1997ea 100644 --- a/src/include/nodes/makefuncs.h +++ b/src/include/nodes/makefuncs.h @@ -150,4 +150,8 @@ extern DefElem *makeDefElemExtended(char *nameSpace, char *name, Node *arg, extern GroupingSet *makeGroupingSet(GroupingSetKind kind, List *content, int location); +#ifdef __TBASE__ +extern NullTest *makeNullTest(NullTestType type, Expr *expr); +extern Expr *makeBoolExprTreeNode(BoolExprType boolop, List *args); +#endif #endif /* MAKEFUNC_H */ diff --git a/src/test/regress/expected/select_parallel_4.out b/src/test/regress/expected/select_parallel_4.out index 0b6353b7..e5527088 100644 --- a/src/test/regress/expected/select_parallel_4.out +++ b/src/test/regress/expected/select_parallel_4.out @@ -140,19 +140,17 @@ explain (costs off) select count(*) from tenk1 where (two, four) not in (select hundred, thousand from tenk2 where thousand > 100); QUERY PLAN -------------------------------------------------------------------------------------- - Finalize Aggregate +------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ + Aggregate + -> Nested Loop Anti Join + Join Filter: (((tenk1.two = tenk2.hundred) OR (tenk1.two IS NULL) OR (tenk2.hundred IS NULL)) AND ((tenk1.four = tenk2.thousand) OR (tenk1.four IS NULL) OR (tenk2.thousand IS NULL))) -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Gather - Workers Planned: 4 - -> Partial Aggregate - -> Parallel Seq Scan on 
tenk1 - Filter: (NOT (hashed SubPlan 1)) - SubPlan 1 + -> Seq Scan on tenk1 + -> Materialize -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Seq Scan on tenk2 Filter: (thousand > 100) -(11 rows) +(9 rows) select count(*) from tenk1 where (two, four) not in (select hundred, thousand from tenk2 where thousand > 100); diff --git a/src/test/regress/expected/subselect.out b/src/test/regress/expected/subselect.out index 36a5f074..c3a94e88 100644 --- a/src/test/regress/expected/subselect.out +++ b/src/test/regress/expected/subselect.out @@ -1736,6 +1736,69 @@ select * from tbl_a a where a.b IN (select b.a from tbl_b b where b.b > a.b); ---+--- (0 rows) +explain select * from tbl_a a where a.b NOT IN (select b.a from tbl_b b where b.b > a.b); + QUERY PLAN +--------------------------------------------------------------------------------------------------------- + Nested Loop Anti Join (cost=200.00..6935.40 rows=1123 width=8) + Join Filter: ((b.b > a.b) AND ((a.b = b.a) OR (a.b IS NULL) OR (b.a IS NULL))) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=100.00..120.53 rows=675 width=8) + -> Seq Scan on tbl_a a (cost=0.00..11.75 rows=675 width=8) + -> Materialize (cost=100.00..123.90 rows=675 width=8) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=100.00..120.53 rows=675 width=8) + -> Seq Scan on tbl_b b (cost=0.00..11.75 rows=675 width=8) +(7 rows) + +select * from tbl_a a where a.b NOT IN (select b.a from tbl_b b where b.b > a.b); + a | b +----+--- + 1 | 1 + 2 | 1 + 5 | 1 + 6 | 1 + 8 | 1 + 9 | 1 + 3 | 1 + 4 | 1 + 7 | 1 + 10 | 1 +(10 rows) + +drop table tbl_a; +drop table tbl_b; +-- test NOT IN/ANY with NOT NULL restriction +create table tbl_a(a int NOT NULL, b int NOT NULL); +create table tbl_b(a int NOT NULL, b int NOT NULL); +insert into tbl_a select generate_series(1,10),1; +insert into tbl_b select generate_series(2,11),1; +explain select * from tbl_a a where a.b NOT IN (select b.a from tbl_b b where b.b > a.b); + QUERY PLAN +--------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) (cost=120.19..150.77 rows=562 width=8) + -> Hash Anti Join (cost=120.19..150.77 rows=562 width=8) + Hash Cond: (a.b = b.a) + Join Filter: (b.b > a.b) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=100.00..120.53 rows=675 width=8) + Distribute results by H: b + -> Seq Scan on tbl_a a (cost=0.00..11.75 rows=675 width=8) + -> Hash (cost=11.75..11.75 rows=675 width=8) + -> Seq Scan on tbl_b b (cost=0.00..11.75 rows=675 width=8) +(9 rows) + +select * from tbl_a a where a.b NOT IN (select b.a from tbl_b b where b.b > a.b); + a | b +----+--- + 1 | 1 + 2 | 1 + 5 | 1 + 6 | 1 + 8 | 1 + 9 | 1 + 3 | 1 + 4 | 1 + 7 | 1 + 10 | 1 +(10 rows) + drop table tbl_a; drop table tbl_b; -- more RTEs in subquery @@ -2144,14 +2207,18 @@ create table notin_t1 (id1 int, num1 int not null); create table notin_t2 (id2 int, num2 int not null); explain(costs off) select num1 from notin_t1 where num1 not in (select num2 from notin_t2); QUERY PLAN -------------------------------------------------------------------- +----------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) + -> Hash Anti Join + Hash Cond: (notin_t1.num1 = notin_t2.num2) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: num1 -> Seq Scan on notin_t1 - Filter: (NOT (hashed SubPlan 1)) - SubPlan 1 + -> Hash -> Remote Subquery Scan on 
all (datanode_1,datanode_2) + Distribute results by H: num2 -> Seq Scan on notin_t2 -(6 rows) +(10 rows) drop table notin_t1; drop table notin_t2; diff --git a/src/test/regress/sql/subselect.sql b/src/test/regress/sql/subselect.sql index 8b5db9a2..f17f38f3 100644 --- a/src/test/regress/sql/subselect.sql +++ b/src/test/regress/sql/subselect.sql @@ -700,6 +700,20 @@ select a.a,(select b.a from tbl_b b where b.a = a.a limit 1) q from tbl_a a orde -- support pullup lateral ANY_SUBLINK explain select * from tbl_a a where a.b IN (select b.a from tbl_b b where b.b > a.b); select * from tbl_a a where a.b IN (select b.a from tbl_b b where b.b > a.b); +explain select * from tbl_a a where a.b NOT IN (select b.a from tbl_b b where b.b > a.b); +select * from tbl_a a where a.b NOT IN (select b.a from tbl_b b where b.b > a.b); + +drop table tbl_a; +drop table tbl_b; + +-- test NOT IN/ANY with NOT NULL restriction +create table tbl_a(a int NOT NULL, b int NOT NULL); +create table tbl_b(a int NOT NULL, b int NOT NULL); +insert into tbl_a select generate_series(1,10),1; +insert into tbl_b select generate_series(2,11),1; + +explain select * from tbl_a a where a.b NOT IN (select b.a from tbl_b b where b.b > a.b); +select * from tbl_a a where a.b NOT IN (select b.a from tbl_b b where b.b > a.b); drop table tbl_a; drop table tbl_b; From bccd602235129be841604f85d77b8f990c893bd2 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Fri, 18 Jun 2021 12:46:28 +0800 Subject: [PATCH 173/578] fix regress subselect/select_parallel expected --- src/test/regress/expected/select_parallel_4.out | 10 +++++----- src/test/regress/expected/subselect.out | 8 ++++---- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/test/regress/expected/select_parallel_4.out b/src/test/regress/expected/select_parallel_4.out index e5527088..684d4989 100644 --- a/src/test/regress/expected/select_parallel_4.out +++ b/src/test/regress/expected/select_parallel_4.out @@ -139,17 +139,17 @@ alter table tenk2 set (parallel_workers = 0); explain (costs off) select count(*) from tenk1 where (two, four) not in (select hundred, thousand from tenk2 where thousand > 100); - QUERY PLAN + QUERY PLAN ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ Aggregate -> Nested Loop Anti Join Join Filter: (((tenk1.two = tenk2.hundred) OR (tenk1.two IS NULL) OR (tenk2.hundred IS NULL)) AND ((tenk1.four = tenk2.thousand) OR (tenk1.four IS NULL) OR (tenk2.thousand IS NULL))) - -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Seq Scan on tenk1 -> Materialize - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Seq Scan on tenk2 - Filter: (thousand > 100) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on tenk2 + Filter: (thousand > 100) (9 rows) select count(*) from tenk1 where (two, four) not in diff --git a/src/test/regress/expected/subselect.out b/src/test/regress/expected/subselect.out index c3a94e88..f38e79c4 100644 --- a/src/test/regress/expected/subselect.out +++ b/src/test/regress/expected/subselect.out @@ -2206,18 +2206,18 @@ drop table catalog_sales, catalog_returns, date_dim; create table notin_t1 (id1 int, num1 int not null); create table notin_t2 (id2 int, num2 int not null); explain(costs off) select num1 from notin_t1 where num1 not in (select num2 from notin_t2); - QUERY PLAN + QUERY PLAN 
----------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) -> Hash Anti Join Hash Cond: (notin_t1.num1 = notin_t2.num2) -> Remote Subquery Scan on all (datanode_1,datanode_2) Distribute results by H: num1 - -> Seq Scan on notin_t1 + -> Seq Scan on notin_t1 -> Hash - -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Remote Subquery Scan on all (datanode_1,datanode_2) Distribute results by H: num2 - -> Seq Scan on notin_t2 + -> Seq Scan on notin_t2 (10 rows) drop table notin_t1; From cb7363386c0800bd25aa3db3db359be18d449976 Mon Sep 17 00:00:00 2001 From: andrelin Date: Wed, 9 Jun 2021 17:46:03 +0800 Subject: [PATCH 174/578] Raise warning istead of error when remote instrument recieved but htbl is not initialized this happens in cases like ABORT, CN will create a new combiner to recieve data as more as it can. But, if there is any good DN success to finish it's "EXPLAIN ANALYZE" job, remote instrument will be sent to CN, and processed by a newly created combiner, then error out and make further abortion goes wrong. Raising warning and just return here is okay enough for debug, in normal cases remote instrument definitely received with es_instrument flag on. tapd: http://tapd.oa.com/pgxz/bugtrace/bugs/view?bug_id=1010092131088466047&jump_count=1 --- src/backend/commands/explain_dist.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/backend/commands/explain_dist.c b/src/backend/commands/explain_dist.c index 2b37ba0d..dbf689df 100644 --- a/src/backend/commands/explain_dist.c +++ b/src/backend/commands/explain_dist.c @@ -711,7 +711,8 @@ HandleRemoteInstr(char *msg_body, size_t len, int nodeid, ResponseCombiner *comb if (combiner->recv_instr_htbl == NULL) { - elog(ERROR, "combiner is not prepared for instrumentation"); + elog(WARNING, "combiner is not prepared for instrumentation"); + return; } elog(DEBUG1, "Handle remote instrument: nodeid %d", nodeid); From 900283515e254ddfd3fd215466b30e0b008ff1cf Mon Sep 17 00:00:00 2001 From: andrelin Date: Fri, 11 Jun 2021 11:16:40 +0800 Subject: [PATCH 175/578] Add time zone to timestamp type output in pg_stat_cluster_activity tapd: http://tapd.oa.com/pgxz/prong/stories/view/1010092131865528291 --- .../pg_stat_cluster_activity--1.0.sql | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity--1.0.sql b/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity--1.0.sql index 9f524816..c5514458 100644 --- a/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity--1.0.sql +++ b/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity--1.0.sql @@ -26,10 +26,10 @@ CREATE OR REPLACE FUNCTION pg_stat_get_cluster_activity( OUT planstate text, OUT portal text, OUT cursors text, - OUT backend_start timestamp, - OUT xact_start timestamp, - OUT query_start timestamp, - OUT state_change timestamp + OUT backend_start timestamp with time zone, + OUT xact_start timestamp with time zone, + OUT query_start timestamp with time zone, + OUT state_change timestamp with time zone ) RETURNS SETOF record AS 'MODULE_PATHNAME' From 3a4bb4fb791a5f4fb3dda1b1ddaa8c3e02abd663 Mon Sep 17 00:00:00 2001 From: andrelin Date: Wed, 9 Jun 2021 19:35:06 +0800 Subject: [PATCH 176/578] Add a guc for pg_stat_cluster_activity extension to disable showing planstate in result sets, prevent from potential corner failure when DN call it --- .../pg_stat_cluster_activity.c | 53 ++++++++++++++----- 1 file changed, 39 insertions(+), 
14 deletions(-) diff --git a/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c b/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c index 304ee872..0cc836d3 100644 --- a/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c +++ b/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c @@ -18,6 +18,7 @@ #include "storage/procarray.h" #include "storage/shmem.h" #include "utils/builtins.h" +#include "utils/guc.h" #include "utils/portal.h" #include "utils/snapmgr.h" #include "utils/timestamp.h" @@ -99,6 +100,8 @@ static PortalStart_hook_type prev_PortalStart = NULL; static PortalDrop_hook_type prev_PortalDrop = NULL; static ExecutorStart_hook_type prev_ExecutorStart = NULL; +static bool pgcs_enable_planstate; /* whether to show planstate in result sets */ + /* * Macros to load and store st_changecount with the memory barriers. * @@ -372,20 +375,28 @@ pgcs_report_query_activity(QueryDesc *desc, int eflags) if (desc->planstate != NULL) { - ExplainState *es = NewExplainState(); - - /* make planstate text tree */ - es->costs = false; - /* we don't want plan->targetlist been changed */ - es->skip_remote_query = true; - - ExplainBeginOutput(es); - ExplainPrintPlan(es, desc); - ExplainEndOutput(es); - /* remove last '\n' */ - if (es->str->len > 1) - es->str->data[--es->str->len] = '\0'; - planstate_str = es->str; + /* make planstate text tree if enabled */ + if (pgcs_enable_planstate) + { + ExplainState *es = NewExplainState(); + + es->costs = false; + /* we don't want plan->targetlist been changed */ + es->skip_remote_query = true; + + ExplainBeginOutput(es); + ExplainPrintPlan(es, desc); + ExplainEndOutput(es); + /* remove last '\n' */ + if (es->str->len > 1) + es->str->data[--es->str->len] = '\0'; + planstate_str = es->str; + } + else + { + planstate_str = makeStringInfo(); + appendStringInfoString(planstate_str, "disabled"); + } /* find name of RemoteSubplan to show as cursors */ cursors = makeStringInfo(); @@ -1039,6 +1050,20 @@ _PG_init(void) if (!process_shared_preload_libraries_in_progress) return; + /* + * Define (or redefine) custom GUC variables. + */ + DefineCustomBoolVariable("pg_stat_cluster_activity.enable_planstate", + "whether to show planstate in result sets.", + NULL, + &pgcs_enable_planstate, + true, + PGC_SUSET, + 0, + NULL, + NULL, + NULL); + /* * Request additional shared resources. (These are no-ops if we're not in * the postmaster process.) We'll allocate or attach to the shared From 97d59366b0fa5235655db603c52cdd79dbb80842 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Tue, 14 Nov 2017 17:49:49 -0500 Subject: [PATCH 177/578] Prevent int128 from requiring more than MAXALIGN alignment. Our initial work with int128 neglected alignment considerations, an oversight that came back to bite us in bug #14897 from Vincent Lachenal. It is unsurprising that int128 might have a 16-byte alignment requirement; what's slightly more surprising is that even notoriously lax Intel chips sometimes enforce that. Raising MAXALIGN seems out of the question: the costs in wasted disk and memory space would be significant, and there would also be an on-disk compatibility break. Nor does it seem very practical to try to allow some data structures to have more-than-MAXALIGN alignment requirement, as we'd have to push knowledge of that throughout various code that copies data structures around. The only way out of the box is to make type int128 conform to the system's alignment assumptions. 
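A minimal standalone sketch of that idea, assuming a GCC/clang toolchain, an x86-64 MAXIMUM_ALIGNOF of 8, and the raw __attribute__((aligned())) spelling rather than the pg_attribute_aligned macro used in c.h (values and names here are illustrative, not taken from the patch):

    /*
     * Cap a 128-bit integer type at 8-byte alignment, the same trick the
     * patch applies with pg_attribute_aligned(MAXIMUM_ALIGNOF) in c.h.
     */
    #include <stdio.h>

    #define MAXIMUM_ALIGNOF 8   /* assumed here; configure computes the real value */

    typedef __int128 int128 __attribute__((aligned(MAXIMUM_ALIGNOF)));

    int
    main(void)
    {
        /* On typical x86-64 this prints 16 for the raw type and 8 for the typedef. */
        printf("__int128 alignment: %lu\n", (unsigned long) __alignof__(__int128));
        printf("int128 alignment:   %lu\n", (unsigned long) __alignof__(int128));
        return 0;
    }
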
Fortunately, gcc supports that via its __attribute__(aligned()) pragma; and since we don't currently support int128 on non-gcc-workalike compilers, we shouldn't be losing any platform support this way. Although we could have just done pg_attribute_aligned(MAXIMUM_ALIGNOF) and called it a day, I did a little bit of extra work to make the code more portable than that: it will also support int128 on compilers without __attribute__(aligned()), if the native alignment of their 128-bit-int type is no more than that of int64. Add a regression test case that exercises the one known instance of the problem, in parallel aggregation over a bigint column. Back-patch of commit 751804998. The code known to be affected only exists in 9.6 and later, but we do have some stuff using int128 in 9.5, so patch back to 9.5. Discussion: https://postgr.es/m/20171110185747.31519.28038@wrigleys.postgresql.org --- config/c-compiler.m4 | 9 +++++--- configure | 42 +++++++++++++++++++++++++++++++++-- configure.in | 7 ++++-- src/include/c.h | 27 +++++++++++++++++----- src/include/pg_config.h.in | 3 +++ src/include/pg_config.h.win32 | 3 +++ 6 files changed, 79 insertions(+), 12 deletions(-) diff --git a/config/c-compiler.m4 b/config/c-compiler.m4 index 7275ea69..8d9844ab 100644 --- a/config/c-compiler.m4 +++ b/config/c-compiler.m4 @@ -96,9 +96,11 @@ undefine([Ac_cachevar])dnl # PGAC_TYPE_128BIT_INT # --------------------- # Check if __int128 is a working 128 bit integer type, and if so -# define PG_INT128_TYPE to that typename. This currently only detects -# a GCC/clang extension, but support for different environments may be -# added in the future. +# define PG_INT128_TYPE to that typename, and define ALIGNOF_PG_INT128_TYPE +# as its alignment requirement. +# +# This currently only detects a GCC/clang extension, but support for other +# environments may be added in the future. # # For the moment we only test for support for 128bit math; support for # 128bit literals and snprintf is not required. @@ -128,6 +130,7 @@ return 1; [pgac_cv__128bit_int=no])]) if test x"$pgac_cv__128bit_int" = xyes ; then AC_DEFINE(PG_INT128_TYPE, __int128, [Define to the name of a signed 128-bit integer type.]) + AC_CHECK_ALIGNOF(PG_INT128_TYPE) fi])# PGAC_TYPE_128BIT_INT diff --git a/configure b/configure index ae61c606..26843895 100755 --- a/configure +++ b/configure @@ -14985,7 +14985,10 @@ _ACEOF # Compute maximum alignment of any basic type. # We assume long's alignment is at least as strong as char, short, or int; -# but we must check long long (if it exists) and double. +# but we must check long long (if it is being used for int64) and double. +# Note that we intentionally do not consider any types wider than 64 bits, +# as allowing MAXIMUM_ALIGNOF to exceed 8 would be too much of a penalty +# for disk and memory space. MAX_ALIGNOF=$ac_cv_alignof_long if test $MAX_ALIGNOF -lt $ac_cv_alignof_double ; then @@ -15045,7 +15048,7 @@ _ACEOF fi -# Check for extensions offering the integer scalar type __int128. +# Some compilers offer a 128-bit integer scalar type. { $as_echo "$as_me:${as_lineno-$LINENO}: checking for __int128" >&5 $as_echo_n "checking for __int128... " >&6; } if ${pgac_cv__128bit_int+:} false; then : @@ -15095,6 +15098,41 @@ if test x"$pgac_cv__128bit_int" = xyes ; then $as_echo "#define PG_INT128_TYPE __int128" >>confdefs.h + # The cast to long int works around a bug in the HP C Compiler, +# see AC_CHECK_SIZEOF for more information. 
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking alignment of PG_INT128_TYPE" >&5 +$as_echo_n "checking alignment of PG_INT128_TYPE... " >&6; } +if ${ac_cv_alignof_PG_INT128_TYPE+:} false; then : + $as_echo_n "(cached) " >&6 +else + if ac_fn_c_compute_int "$LINENO" "(long int) offsetof (ac__type_alignof_, y)" "ac_cv_alignof_PG_INT128_TYPE" "$ac_includes_default +#ifndef offsetof +# define offsetof(type, member) ((char *) &((type *) 0)->member - (char *) 0) +#endif +typedef struct { char x; PG_INT128_TYPE y; } ac__type_alignof_;"; then : + +else + if test "$ac_cv_type_PG_INT128_TYPE" = yes; then + { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 +$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} +as_fn_error 77 "cannot compute alignment of PG_INT128_TYPE +See \`config.log' for more details" "$LINENO" 5; } + else + ac_cv_alignof_PG_INT128_TYPE=0 + fi +fi + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_alignof_PG_INT128_TYPE" >&5 +$as_echo "$ac_cv_alignof_PG_INT128_TYPE" >&6; } + + + +cat >>confdefs.h <<_ACEOF +#define ALIGNOF_PG_INT128_TYPE $ac_cv_alignof_PG_INT128_TYPE +_ACEOF + + fi # Check for various atomic operations now that we have checked how to declare diff --git a/configure.in b/configure.in index a623005a..830aa103 100644 --- a/configure.in +++ b/configure.in @@ -1845,7 +1845,10 @@ AC_CHECK_ALIGNOF(double) # Compute maximum alignment of any basic type. # We assume long's alignment is at least as strong as char, short, or int; -# but we must check long long (if it exists) and double. +# but we must check long long (if it is being used for int64) and double. +# Note that we intentionally do not consider any types wider than 64 bits, +# as allowing MAXIMUM_ALIGNOF to exceed 8 would be too much of a penalty +# for disk and memory space. MAX_ALIGNOF=$ac_cv_alignof_long if test $MAX_ALIGNOF -lt $ac_cv_alignof_double ; then @@ -1862,7 +1865,7 @@ AC_DEFINE_UNQUOTED(MAXIMUM_ALIGNOF, $MAX_ALIGNOF, [Define as the maximum alignme AC_CHECK_TYPES([int8, uint8, int64, uint64], [], [], [#include ]) -# Check for extensions offering the integer scalar type __int128. +# Some compilers offer a 128-bit integer scalar type. PGAC_TYPE_128BIT_INT # Check for various atomic operations now that we have checked how to declare diff --git a/src/include/c.h b/src/include/c.h index 9e18db2b..f2c1d8c2 100644 --- a/src/include/c.h +++ b/src/include/c.h @@ -377,13 +377,30 @@ typedef unsigned long long int uint64; /* * 128-bit signed and unsigned integers - * There currently is only a limited support for the type. E.g. 128bit - * literals and snprintf are not supported; but math is. + + * There currently is only limited support for such types. + * E.g. 128bit literals and snprintf are not supported; but math is. + * Also, because we exclude such types when choosing MAXIMUM_ALIGNOF, + * it must be possible to coerce the compiler to allocate them on no + * more than MAXALIGN boundaries. 
*/ #if defined(PG_INT128_TYPE) -#define HAVE_INT128 -typedef PG_INT128_TYPE int128; -typedef unsigned PG_INT128_TYPE uint128; +#if defined(pg_attribute_aligned) || ALIGNOF_PG_INT128_TYPE <= MAXIMUM_ALIGNOF +#define HAVE_INT128 1 + +typedef PG_INT128_TYPE int128 +#if defined(pg_attribute_aligned) +pg_attribute_aligned(MAXIMUM_ALIGNOF) +#endif +; + +typedef unsigned PG_INT128_TYPE uint128 +#if defined(pg_attribute_aligned) +pg_attribute_aligned(MAXIMUM_ALIGNOF) +#endif +; + +#endif #endif /* diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index aa4b2974..0e32abd7 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -27,6 +27,9 @@ /* The normal alignment of `long long int', in bytes. */ #undef ALIGNOF_LONG_LONG_INT +/* The normal alignment of `PG_INT128_TYPE', in bytes. */ +#undef ALIGNOF_PG_INT128_TYPE + /* The normal alignment of `short', in bytes. */ #undef ALIGNOF_SHORT diff --git a/src/include/pg_config.h.win32 b/src/include/pg_config.h.win32 index 20bff1c4..79bd66cd 100644 --- a/src/include/pg_config.h.win32 +++ b/src/include/pg_config.h.win32 @@ -34,6 +34,9 @@ /* The alignment requirement of a `long long int'. */ #define ALIGNOF_LONG_LONG_INT 8 +/* The normal alignment of `PG_INT128_TYPE', in bytes. */ +#undef ALIGNOF_PG_INT128_TYPE + /* The alignment requirement of a `short'. */ #define ALIGNOF_SHORT 2 From dbaba14440fb7abdd9c4dc2d9e8d71ae6aeb0aac Mon Sep 17 00:00:00 2001 From: andrelin Date: Thu, 1 Jul 2021 11:41:49 +0800 Subject: [PATCH 178/578] Remove invalid assertion, this is useless after we introduce shard --- src/backend/optimizer/util/pgxcship.c | 1 - 1 file changed, 1 deletion(-) diff --git a/src/backend/optimizer/util/pgxcship.c b/src/backend/optimizer/util/pgxcship.c index 7c577339..7bff63dd 100644 --- a/src/backend/optimizer/util/pgxcship.c +++ b/src/backend/optimizer/util/pgxcship.c @@ -953,7 +953,6 @@ pgxc_FQS_get_relation_nodes(RangeTblEntry *rte, Index varno, Query *query) return NULL; } - Assert(tle); /* We found the TargetEntry for the partition column */ list_free(rel_exec_nodes->primarynodelist); rel_exec_nodes->primarynodelist = NULL; From 949f970e5c6e714a7f1d2dd0e145759de3ccab9b Mon Sep 17 00:00:00 2001 From: ceciliasu Date: Tue, 6 Jul 2021 21:39:33 +0800 Subject: [PATCH 179/578] fix bug: alloacate shm of wrong size for PGXCSessionId in InitializeParallelDSM http://tapd.oa.com/TBase_Oracle_Migration/bugtrace/bugs/view\?bug_id\=1020421696089628859 --- src/backend/access/transam/parallel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/access/transam/parallel.c b/src/backend/access/transam/parallel.c index d6108a3a..7639bc2f 100644 --- a/src/backend/access/transam/parallel.c +++ b/src/backend/access/transam/parallel.c @@ -246,7 +246,7 @@ InitializeParallelDSM(ParallelContext *pcxt) gxidlen = EstimateGlobalXidSpace(); shm_toc_estimate_chunk(&pcxt->estimator, gxidlen); #endif - sidlen = PGXCSessionId[0] == '\0' ? 0 : strlen(PGXCSessionId) + 1; + sidlen = sizeof(int) + (PGXCSessionId[0] == '\0' ? 0 : strlen(PGXCSessionId) + 1); shm_toc_estimate_chunk(&pcxt->estimator, sidlen); /* If you add more chunks here, you probably need to add keys. 
*/ shm_toc_estimate_keys(&pcxt->estimator, 8); From c47bebc338b01177687a3a4060d61b235bf2c03f Mon Sep 17 00:00:00 2001 From: sigmalin Date: Fri, 1 Oct 2021 11:49:16 +0800 Subject: [PATCH 180/578] fix pgxc_ctl monitor bug http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131092865125 (merge request !773) Signed-off-by: JennyJennyChen --- contrib/pgxc_ctl/monitor.c | 4 ++-- src/include/gtm/gtm_c.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/contrib/pgxc_ctl/monitor.c b/contrib/pgxc_ctl/monitor.c index bb540b6f..91537d0e 100644 --- a/contrib/pgxc_ctl/monitor.c +++ b/contrib/pgxc_ctl/monitor.c @@ -484,9 +484,9 @@ do_gtm_ping(char *host, int port) elog(ERROR, "ERROR: Invalid port number, %d.\n", port); return -1; } - /* Use 60s as connection timeout */ + /* Use 60s as connection timeout, use GTM_NODE_GTM_CTL as remote type here */ sprintf(connect_str, "host=%s port=%d node_name=%s remote_type=%d postmaster=0 connect_timeout=60", - host, port, myName, GTM_NODE_COORDINATOR); + host, port, myName, GTM_NODE_GTM_CTL); if ((conn = PQconnectGTM(connect_str)) == NULL || GTMPQstatus(conn) == CONNECTION_BAD) { elog(DEBUG3, "DEBUG3: Could not connect to %s, %d\n", host, port); diff --git a/src/include/gtm/gtm_c.h b/src/include/gtm/gtm_c.h index 7af3e735..b5a18302 100644 --- a/src/include/gtm/gtm_c.h +++ b/src/include/gtm/gtm_c.h @@ -58,7 +58,7 @@ typedef enum GTM_PGXCNodeType GTM_NODE_DATANODE = 4, GTM_NODE_GTM = 5, #ifdef __TBASE__ - GTM_NODE_GTM_CTL = 6, /* gtm ctl will never register and unregister. */ + GTM_NODE_GTM_CTL = 6, /* gtm ctl will never register and unregister, maybe used by gtm_ctl or pgxc_ctl */ #endif GTM_NODE_DEFAULT/* In case nothing is associated to connection */ } GTM_PGXCNodeType; From 285b804afb6a69f97fb6fd10cef01bf033dd4ca7 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Fri, 1 Oct 2021 11:49:16 +0800 Subject: [PATCH 181/578] fix pgxc_ctl monitor bug http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131092865125 (merge request !773) Signed-off-by: JennyJennyChen --- contrib/pgxc_ctl/monitor.c | 4 ++-- src/include/gtm/gtm_c.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/contrib/pgxc_ctl/monitor.c b/contrib/pgxc_ctl/monitor.c index 65b3af88..b8d07c72 100644 --- a/contrib/pgxc_ctl/monitor.c +++ b/contrib/pgxc_ctl/monitor.c @@ -485,9 +485,9 @@ do_gtm_ping(char *host, int port) elog(ERROR, "ERROR: Invalid port number, %d.\n", port); return -1; } - /* Use 60s as connection timeout */ + /* Use 60s as connection timeout, use GTM_NODE_GTM_CTL as remote type here */ sprintf(connect_str, "host=%s port=%d node_name=%s remote_type=%d postmaster=0 connect_timeout=60", - host, port, myName, GTM_NODE_COORDINATOR); + host, port, myName, GTM_NODE_GTM_CTL); if ((conn = PQconnectGTM(connect_str)) == NULL || GTMPQstatus(conn) == CONNECTION_BAD) { elog(DEBUG3, "DEBUG3: Could not connect to %s, %d\n", host, port); diff --git a/src/include/gtm/gtm_c.h b/src/include/gtm/gtm_c.h index b2c6382b..5e52fcad 100644 --- a/src/include/gtm/gtm_c.h +++ b/src/include/gtm/gtm_c.h @@ -58,7 +58,7 @@ typedef enum GTM_PGXCNodeType GTM_NODE_DATANODE = 4, GTM_NODE_GTM = 5, #ifdef __TBASE__ - GTM_NODE_GTM_CTL = 6, /* gtm ctl will never register and unregister. 
*/ + GTM_NODE_GTM_CTL = 6, /* gtm ctl will never register and unregister, maybe used by gtm_ctl or pgxc_ctl */ #endif GTM_NODE_DEFAULT/* In case nothing is associated to connection */ } GTM_PGXCNodeType; From b9f4bd4adb997993a33e274cb9152043d8c06039 Mon Sep 17 00:00:00 2001 From: andrelin Date: Fri, 11 Jun 2021 12:05:44 +0800 Subject: [PATCH 182/578] Consider es_plannedstmt NULL when calling ResetRemoteSubplanCursor --- src/backend/executor/execMain.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c index 7424b45a..78810446 100644 --- a/src/backend/executor/execMain.c +++ b/src/backend/executor/execMain.c @@ -3176,7 +3176,10 @@ EvalPlanQualInit(EPQState *epqstate, EState *estate, /* ... and remember data that EvalPlanQualBegin will need */ epqstate->plan = copyObject(subplan); /* Reset cursor name of remote subplans if any */ - ResetRemoteSubplanCursor(epqstate->plan, estate->es_plannedstmt->subplans, "epq"); + ResetRemoteSubplanCursor(epqstate->plan, + (estate->es_plannedstmt ? + estate->es_plannedstmt->subplans : NULL), + "epq"); epqstate->arowMarks = auxrowmarks; epqstate->epqParam = epqParam; } @@ -3195,7 +3198,8 @@ EvalPlanQualSetPlan(EPQState *epqstate, Plan *subplan, List *auxrowmarks) epqstate->plan = copyObject(subplan); /* Reset cursor name of remote subplans if any */ ResetRemoteSubplanCursor(epqstate->plan, - epqstate->parentestate->es_plannedstmt->subplans, + (epqstate->parentestate->es_plannedstmt ? + epqstate->parentestate->es_plannedstmt->subplans : NULL), "epq"); /* The rowmarks depend on the plan, too */ epqstate->arowMarks = auxrowmarks; From 3b3fd5a46ff108e6c11b725f761b5f30606e6a92 Mon Sep 17 00:00:00 2001 From: andrelin Date: Mon, 28 Jun 2021 19:58:43 +0800 Subject: [PATCH 183/578] Fix wrong call of list_nth_node --- src/backend/nodes/nodeFuncs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/nodes/nodeFuncs.c b/src/backend/nodes/nodeFuncs.c index 27a1b7a3..a7ab020e 100644 --- a/src/backend/nodes/nodeFuncs.c +++ b/src/backend/nodes/nodeFuncs.c @@ -3886,7 +3886,7 @@ plantree_walk_initplans(List *plans, foreach(lc, plans) { - Plan *splan = list_nth_node(Plan, subplans, + Plan *splan = (Plan *) list_nth(subplans, (lfirst_node(SubPlan, lc))->plan_id); if (walker(splan, context)) From 86ff9315abdb858c83db01c8a579357031c81011 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Wed, 24 Jun 2020 16:43:03 +0800 Subject: [PATCH 184/578] Assorted preparatory refactoring for partition-wise join. --- src/backend/catalog/partition.c | 9 ++- src/backend/optimizer/path/joinpath.c | 27 +++++---- src/backend/optimizer/util/pathnode.c | 11 ++-- src/backend/optimizer/util/relnode.c | 39 +++++++----- src/backend/utils/cache/relcache.c | 4 +- src/include/catalog/partition.h | 85 ++++++++++++++------------- src/include/optimizer/pathnode.h | 7 ++- 7 files changed, 99 insertions(+), 83 deletions(-) diff --git a/src/backend/catalog/partition.c b/src/backend/catalog/partition.c index 30be04e6..9ecd77ea 100644 --- a/src/backend/catalog/partition.c +++ b/src/backend/catalog/partition.c @@ -595,7 +595,7 @@ RelationBuildPartitionDesc(Relation rel) * representation of partition bounds. 
*/ bool -partition_bounds_equal(PartitionKey key, +partition_bounds_equal(int partnatts, int16 *parttyplen, bool *parttypbyval, PartitionBoundInfo b1, PartitionBoundInfo b2) {// #lizard forgives int i; @@ -613,7 +613,7 @@ partition_bounds_equal(PartitionKey key, { int j; - for (j = 0; j < key->partnatts; j++) + for (j = 0; j < partnatts; j++) { /* For range partitions, the bounds might not be finite. */ if (b1->kind != NULL) @@ -639,8 +639,7 @@ partition_bounds_equal(PartitionKey key, * context. datumIsEqual() should be simple enough to be safe. */ if (!datumIsEqual(b1->datums[i][j], b2->datums[i][j], - key->parttypbyval[j], - key->parttyplen[j])) + parttypbyval[j], parttyplen[j])) return false; } @@ -649,7 +648,7 @@ partition_bounds_equal(PartitionKey key, } /* There are ndatums+1 indexes in case of range partitions */ - if (key->strategy == PARTITION_STRATEGY_RANGE && + if (b1->strategy == PARTITION_STRATEGY_RANGE && b1->indexes[i] != b2->indexes[i]) return false; diff --git a/src/backend/optimizer/path/joinpath.c b/src/backend/optimizer/path/joinpath.c index 49852d77..72a766af 100644 --- a/src/backend/optimizer/path/joinpath.c +++ b/src/backend/optimizer/path/joinpath.c @@ -331,18 +331,15 @@ add_paths_to_joinrel(PlannerInfo *root, */ static inline bool allow_star_schema_join(PlannerInfo *root, - Path *outer_path, - Path *inner_path) + Relids outerrelids, + Relids inner_paramrels) { - Relids innerparams = PATH_REQ_OUTER(inner_path); - Relids outerrelids = outer_path->parent->relids; - /* * It's a star-schema case if the outer rel provides some but not all of * the inner rel's parameterization. */ - return (bms_overlap(innerparams, outerrelids) && - bms_nonempty_difference(innerparams, outerrelids)); + return (bms_overlap(inner_paramrels, outerrelids) && + bms_nonempty_difference(inner_paramrels, outerrelids)); } /* @@ -361,6 +358,12 @@ try_nestloop_path(PlannerInfo *root, { Relids required_outer; JoinCostWorkspace workspace; + RelOptInfo *innerrel = inner_path->parent; + RelOptInfo *outerrel = outer_path->parent; + Relids innerrelids = innerrel->relids; + Relids outerrelids = outerrel->relids; + Relids inner_paramrels = PATH_REQ_OUTER(inner_path); + Relids outer_paramrels = PATH_REQ_OUTER(outer_path); /* * Check to see if proposed path is still parameterized, and reject if the @@ -369,14 +372,12 @@ try_nestloop_path(PlannerInfo *root, * doesn't like the look of it, which could only happen if the nestloop is * still parameterized. 
*/ - required_outer = calc_nestloop_required_outer(outer_path, - inner_path); + required_outer = calc_nestloop_required_outer(outerrelids, outer_paramrels, + innerrelids, inner_paramrels); if (required_outer && ((!bms_overlap(required_outer, extra->param_source_rels) && - !allow_star_schema_join(root, outer_path, inner_path)) || - have_dangerous_phv(root, - outer_path->parent->relids, - PATH_REQ_OUTER(inner_path)))) + !allow_star_schema_join(root, outerrelids, inner_paramrels)) || + have_dangerous_phv(root, outerrelids, inner_paramrels))) { /* Waste no memory when we reject a path here */ bms_free(required_outer); diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index 17546a77..49e6658f 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -5241,14 +5241,15 @@ create_foreignscan_path(PlannerInfo *root, RelOptInfo *rel, * Note: result must not share storage with either input */ Relids -calc_nestloop_required_outer(Path *outer_path, Path *inner_path) +calc_nestloop_required_outer(Relids outerrelids, + Relids outer_paramrels, + Relids innerrelids, + Relids inner_paramrels) { - Relids outer_paramrels = PATH_REQ_OUTER(outer_path); - Relids inner_paramrels = PATH_REQ_OUTER(inner_path); Relids required_outer; /* inner_path can require rels from outer path, but not vice versa */ - Assert(!bms_overlap(outer_paramrels, inner_path->parent->relids)); + Assert(!bms_overlap(outer_paramrels, innerrelids)); /* easy case if inner path is not parameterized */ if (!inner_paramrels) return bms_copy(outer_paramrels); @@ -5256,7 +5257,7 @@ calc_nestloop_required_outer(Path *outer_path, Path *inner_path) required_outer = bms_union(outer_paramrels, inner_paramrels); /* ... and remove any mention of now-satisfied outer rels */ required_outer = bms_del_members(required_outer, - outer_path->parent->relids); + outerrelids); /* maintain invariant that required_outer is exactly NULL if empty */ if (bms_is_empty(required_outer)) { diff --git a/src/backend/optimizer/util/relnode.c b/src/backend/optimizer/util/relnode.c index b4359f52..9fba700e 100644 --- a/src/backend/optimizer/util/relnode.c +++ b/src/backend/optimizer/util/relnode.c @@ -1106,12 +1106,8 @@ get_baserel_parampathinfo(PlannerInfo *root, RelOptInfo *baserel, Assert(!bms_overlap(baserel->relids, required_outer)); /* If we already have a PPI for this parameterization, just return it */ - foreach(lc, baserel->ppilist) - { - ppi = (ParamPathInfo *) lfirst(lc); - if (bms_equal(ppi->ppi_req_outer, required_outer)) + if ((ppi = find_param_path_info(baserel, required_outer))) return ppi; - } /* * Identify all joinclauses that are movable to this base rel given this @@ -1348,12 +1344,8 @@ get_joinrel_parampathinfo(PlannerInfo *root, RelOptInfo *joinrel, *restrict_clauses = list_concat(pclauses, *restrict_clauses); /* If we already have a PPI for this parameterization, just return it */ - foreach(lc, joinrel->ppilist) - { - ppi = (ParamPathInfo *) lfirst(lc); - if (bms_equal(ppi->ppi_req_outer, required_outer)) + if ((ppi = find_param_path_info(joinrel, required_outer))) return ppi; - } /* Estimate the number of rows returned by the parameterized join */ rows = get_parameterized_joinrel_size(root, joinrel, @@ -1392,7 +1384,6 @@ ParamPathInfo * get_appendrel_parampathinfo(RelOptInfo *appendrel, Relids required_outer) { ParamPathInfo *ppi; - ListCell *lc; /* Unparameterized paths have no ParamPathInfo */ if (bms_is_empty(required_outer)) @@ -1401,12 +1392,8 @@ 
get_appendrel_parampathinfo(RelOptInfo *appendrel, Relids required_outer) Assert(!bms_overlap(appendrel->relids, required_outer)); /* If we already have a PPI for this parameterization, just return it */ - foreach(lc, appendrel->ppilist) - { - ppi = (ParamPathInfo *) lfirst(lc); - if (bms_equal(ppi->ppi_req_outer, required_outer)) + if ((ppi = find_param_path_info(appendrel, required_outer))) return ppi; - } /* Else build the ParamPathInfo */ ppi = makeNode(ParamPathInfo); @@ -1417,3 +1404,23 @@ get_appendrel_parampathinfo(RelOptInfo *appendrel, Relids required_outer) return ppi; } + +/* + * Returns a ParamPathInfo for the parameterization given by required_outer, if + * already available in the given rel. Returns NULL otherwise. + */ +ParamPathInfo * +find_param_path_info(RelOptInfo *rel, Relids required_outer) +{ + ListCell *lc; + + foreach(lc, rel->ppilist) + { + ParamPathInfo *ppi = (ParamPathInfo *) lfirst(lc); + + if (bms_equal(ppi->ppi_req_outer, required_outer)) + return ppi; + } + + return NULL; +} diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c index eb895929..66aebfe9 100644 --- a/src/backend/utils/cache/relcache.c +++ b/src/backend/utils/cache/relcache.c @@ -1396,7 +1396,9 @@ equalPartitionDescs(PartitionKey key, PartitionDesc partdesc1, if (partdesc2->boundinfo == NULL) return false; - if (!partition_bounds_equal(key, partdesc1->boundinfo, + if (!partition_bounds_equal(key->partnatts, key->parttyplen, + key->parttypbyval, + partdesc1->boundinfo, partdesc2->boundinfo)) return false; } diff --git a/src/include/catalog/partition.h b/src/include/catalog/partition.h index 2efe3ea6..bef7a0f5 100644 --- a/src/include/catalog/partition.h +++ b/src/include/catalog/partition.h @@ -1,8 +1,8 @@ /*------------------------------------------------------------------------- * * partition.h - * Header file for structures and utility functions related to - * partitioning + * Header file for structures and utility functions related to + * partitioning * * Copyright (c) 2007-2017, PostgreSQL Global Development Group * @@ -32,9 +32,9 @@ typedef struct PartitionBoundInfoData *PartitionBoundInfo; */ typedef struct PartitionDescData { - int nparts; /* Number of partitions */ - Oid *oids; /* OIDs of partitions */ - PartitionBoundInfo boundinfo; /* collection of partition bounds */ + int nparts; /* Number of partitions */ + Oid *oids; /* OIDs of partitions */ + PartitionBoundInfo boundinfo; /* collection of partition bounds */ } PartitionDescData; typedef struct PartitionDescData *PartitionDesc; @@ -43,60 +43,61 @@ typedef struct PartitionDescData *PartitionDesc; * PartitionDispatch - information about one partitioned table in a partition * hierarchy required to route a tuple to one of its partitions * - * reldesc Relation descriptor of the table - * key Partition key information of the table - * keystate Execution state required for expressions in the partition key - * partdesc Partition descriptor of the table - * tupslot A standalone TupleTableSlot initialized with this table's tuple - * descriptor - * tupmap TupleConversionMap to convert from the parent's rowtype to - * this table's rowtype (when extracting the partition key of a - * tuple just before routing it through this table) - * indexes Array with partdesc->nparts members (for details on what - * individual members represent, see how they are set in - * RelationGetPartitionDispatchInfo()) + * reldesc Relation descriptor of the table + * key Partition key information of the table + * keystate Execution 
state required for expressions in the partition key + * partdesc Partition descriptor of the table + * tupslot A standalone TupleTableSlot initialized with this table's tuple + * descriptor + * tupmap TupleConversionMap to convert from the parent's rowtype to + * this table's rowtype (when extracting the partition key of a + * tuple just before routing it through this table) + * indexes Array with partdesc->nparts members (for details on what + * individual members represent, see how they are set in + * RelationGetPartitionDispatchInfo()) *----------------------- */ typedef struct PartitionDispatchData { - Relation reldesc; - PartitionKey key; - List *keystate; /* list of ExprState */ - PartitionDesc partdesc; - TupleTableSlot *tupslot; - TupleConversionMap *tupmap; - int *indexes; + Relation reldesc; + PartitionKey key; + List *keystate; /* list of ExprState */ + PartitionDesc partdesc; + TupleTableSlot *tupslot; + TupleConversionMap *tupmap; + int *indexes; } PartitionDispatchData; typedef struct PartitionDispatchData *PartitionDispatch; extern void RelationBuildPartitionDesc(Relation relation); -extern bool partition_bounds_equal(PartitionKey key, - PartitionBoundInfo p1, PartitionBoundInfo p2); +extern bool partition_bounds_equal(int partnatts, int16 *parttyplen, + bool *parttypbyval, PartitionBoundInfo b1, + PartitionBoundInfo b2); extern void check_new_partition_bound(char *relname, Relation parent, - PartitionBoundSpec *spec); -extern Oid get_partition_parent(Oid relid); + PartitionBoundSpec *spec); +extern Oid get_partition_parent(Oid relid); extern List *get_qual_from_partbound(Relation rel, Relation parent, - PartitionBoundSpec *spec); + PartitionBoundSpec *spec); extern List *map_partition_varattnos(List *expr, int target_varno, - Relation partrel, Relation parent, - bool *found_whole_row); + Relation partrel, Relation parent, + bool *found_whole_row); extern List *RelationGetPartitionQual(Relation rel); extern Expr *get_partition_qual_relid(Oid relid); /* For tuple routing */ extern PartitionDispatch *RelationGetPartitionDispatchInfo(Relation rel, - int lockmode, int *num_parted, - List **leaf_part_oids); + int lockmode, int *num_parted, + List **leaf_part_oids); extern void FormPartitionKeyDatum(PartitionDispatch pd, - TupleTableSlot *slot, - EState *estate, - Datum *values, - bool *isnull); + TupleTableSlot *slot, + EState *estate, + Datum *values, + bool *isnull); extern int get_partition_for_tuple(PartitionDispatch *pd, - TupleTableSlot *slot, - EState *estate, - PartitionDispatchData **failed_at, - TupleTableSlot **failed_slot); -#endif /* PARTITION_H */ + TupleTableSlot *slot, + EState *estate, + PartitionDispatchData **failed_at, + TupleTableSlot **failed_slot); +#endif /* PARTITION_H */ diff --git a/src/include/optimizer/pathnode.h b/src/include/optimizer/pathnode.h index f1ff4710..3df87235 100644 --- a/src/include/optimizer/pathnode.h +++ b/src/include/optimizer/pathnode.h @@ -173,7 +173,10 @@ extern ForeignPath *create_foreignscan_path(PlannerInfo *root, RelOptInfo *rel, Path *fdw_outerpath, List *fdw_private); -extern Relids calc_nestloop_required_outer(Path *outer_path, Path *inner_path); +extern Relids calc_nestloop_required_outer(Relids outerrelids, + Relids outer_paramrels, + Relids innerrelids, + Relids inner_paramrels); extern Relids calc_non_nestloop_required_outer(Path *outer_path, Path *inner_path); extern NestPath *create_nestloop_path(PlannerInfo *root, @@ -349,6 +352,8 @@ extern ParamPathInfo *get_joinrel_parampathinfo(PlannerInfo *root, List 
**restrict_clauses); extern ParamPathInfo *get_appendrel_parampathinfo(RelOptInfo *appendrel, Relids required_outer); +extern ParamPathInfo *find_param_path_info(RelOptInfo *rel, + Relids required_outer); #ifdef __TBASE__ extern Path *create_redistribute_grouping_path(PlannerInfo *root, From a649b7cee68c5c525aad8a81009df5a7c3f94869 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Wed, 24 Jun 2020 16:47:33 +0800 Subject: [PATCH 185/578] Refactor validation of new partitions a little bit --- src/backend/commands/tablecmds.c | 318 +++++++++++++++++-------------- 1 file changed, 172 insertions(+), 146 deletions(-) diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index 6915458f..db41c7fe 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -538,6 +538,11 @@ static void CreateInheritance(Relation child_rel, Relation parent_rel); static void RemoveInheritance(Relation child_rel, Relation parent_rel); static ObjectAddress ATExecAttachPartition(List **wqueue, Relation rel, PartitionCmd *cmd); +static bool PartConstraintImpliedByRelConstraint(Relation scanrel, + List *partConstraint); +static void ValidatePartitionConstraints(List **wqueue, Relation scanrel, + List *scanrel_children, + List *partConstraint); static ObjectAddress ATExecDetachPartition(Relation rel, RangeVar *name); #ifdef _SHARDING_ static void AtExecRebuildExtent(Relation rel); @@ -16342,6 +16347,169 @@ ComputePartitionAttrs(Relation rel, List *partParams, AttrNumber *partattrs, } } +/* + * PartConstraintImpliedByRelConstraint + * Does scanrel's existing constraints imply the partition constraint? + * + * Existing constraints includes its check constraints and column-level + * NOT NULL constraints and partConstraint describes the partition constraint. + */ +static bool +PartConstraintImpliedByRelConstraint(Relation scanrel, + List *partConstraint) +{ + List *existConstraint = NIL; + TupleConstr *constr = RelationGetDescr(scanrel)->constr; + int num_check, + i; + + if (constr && constr->has_not_null) + { + int natts = scanrel->rd_att->natts; + + for (i = 1; i <= natts; i++) + { + Form_pg_attribute att = scanrel->rd_att->attrs[i - 1]; + + if (att->attnotnull && !att->attisdropped) + { + NullTest *ntest = makeNode(NullTest); + + ntest->arg = (Expr *) makeVar(1, + i, + att->atttypid, + att->atttypmod, + att->attcollation, + 0); + ntest->nulltesttype = IS_NOT_NULL; + + /* + * argisrow=false is correct even for a composite column, + * because attnotnull does not represent a SQL-spec IS NOT + * NULL test in such a case, just IS DISTINCT FROM NULL. + */ + ntest->argisrow = false; + ntest->location = -1; + existConstraint = lappend(existConstraint, ntest); + } + } + } + + num_check = (constr != NULL) ? constr->num_check : 0; + for (i = 0; i < num_check; i++) + { + Node *cexpr; + + /* + * If this constraint hasn't been fully validated yet, we must ignore + * it here. + */ + if (!constr->check[i].ccvalid) + continue; + + cexpr = stringToNode(constr->check[i].ccbin); + + /* + * Run each expression through const-simplification and + * canonicalization. It is necessary, because we will be comparing it + * to similarly-processed partition constraint expressions, and may + * fail to detect valid matches without this. 
+ */ + cexpr = eval_const_expressions(NULL, cexpr); + cexpr = (Node *) canonicalize_qual((Expr *) cexpr); + + existConstraint = list_concat(existConstraint, + make_ands_implicit((Expr *) cexpr)); + } + + if (existConstraint != NIL) + existConstraint = list_make1(make_ands_explicit(existConstraint)); + + /* And away we go ... */ + return predicate_implied_by(partConstraint, existConstraint, true); +} + +/* + * ValidatePartitionConstraints + * + * Check whether all rows in the given table obey the given partition + * constraint; if so, it can be attached as a partition.  We do this by + * scanning the table (or all of its leaf partitions) row by row, except when + * the existing constraints are sufficient to prove that the new partitioning + * constraint must already hold. + */ +static void +ValidatePartitionConstraints(List **wqueue, Relation scanrel, + List *scanrel_children, + List *partConstraint) +{ + bool found_whole_row; + ListCell *lc; + + if (partConstraint == NIL) + return; + + /* + * Based on the table's existing constraints, determine if we can skip + * scanning the table to validate the partition constraint. + */ + if (PartConstraintImpliedByRelConstraint(scanrel, partConstraint)) + { + ereport(INFO, + (errmsg("partition constraint for table \"%s\" is implied by existing constraints", + RelationGetRelationName(scanrel)))); + return; + } + + /* Constraints proved insufficient, so we need to scan the table. */ + foreach(lc, scanrel_children) + { + AlteredTableInfo *tab; + Oid part_relid = lfirst_oid(lc); + Relation part_rel; + List *my_partconstr = partConstraint; + + /* Lock already taken */ + if (part_relid != RelationGetRelid(scanrel)) + part_rel = heap_open(part_relid, NoLock); + else + part_rel = scanrel; + + /* + * Skip if the partition is itself a partitioned table. We can only + * ever scan RELKIND_RELATION relations. + */ + if (part_rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) + { + if (part_rel != scanrel) + heap_close(part_rel, NoLock); + continue; + } + + if (part_rel != scanrel) + { + /* + * Adjust the constraint for scanrel so that it matches this + * partition's attribute numbers. + */ + my_partconstr = map_partition_varattnos(my_partconstr, 1, + part_rel, scanrel, + &found_whole_row); + /* There can never be a whole-row reference here */ + if (found_whole_row) + elog(ERROR, "unexpected whole-row reference found in partition key"); + } + + /* Grab a work queue entry. */ + tab = ATGetQueueEntry(wqueue, part_rel); + tab->partition_constraint = (Expr *) linitial(my_partconstr); + + /* keep our lock until commit */ + if (part_rel != scanrel) + heap_close(part_rel, NoLock); + } +} + /* * ALTER TABLE ATTACH PARTITION FOR VALUES * @@ -16353,15 +16521,12 @@ ATExecAttachPartition(List **wqueue, Relation rel, PartitionCmd *cmd) Relation attachrel, catalog; List *attachrel_children; - TupleConstr *attachrel_constr; - List *partConstraint, - *existConstraint; + List *partConstraint; SysScanDesc scan; ScanKeyData skey; AttrNumber attno; int natts; TupleDesc tupleDesc; - bool skip_validate = false; ObjectAddress address; const char *trigger_name; bool found_whole_row; @@ -16555,148 +16720,9 @@ ATExecAttachPartition(List **wqueue, Relation rel, PartitionCmd *cmd) if (found_whole_row) elog(ERROR, "unexpected whole-row reference found in partition key"); - /* - * Check if we can do away with having to scan the table being attached to - * validate the partition constraint, by *proving* that the existing - * constraints of the table *imply* the partition predicate. 
We include - * the table's check constraints and NOT NULL constraints in the list of - * clauses passed to predicate_implied_by(). - * - * There is a case in which we cannot rely on just the result of the - * proof. - */ - attachrel_constr = tupleDesc->constr; - existConstraint = NIL; - if (attachrel_constr != NULL) - { - int num_check = attachrel_constr->num_check; - int i; - - if (attachrel_constr->has_not_null) - { - int natts = attachrel->rd_att->natts; - - for (i = 1; i <= natts; i++) - { - Form_pg_attribute att = attachrel->rd_att->attrs[i - 1]; - - if (att->attnotnull && !att->attisdropped) - { - NullTest *ntest = makeNode(NullTest); - - ntest->arg = (Expr *) makeVar(1, - i, - att->atttypid, - att->atttypmod, - att->attcollation, - 0); - ntest->nulltesttype = IS_NOT_NULL; - - /* - * argisrow=false is correct even for a composite column, - * because attnotnull does not represent a SQL-spec IS NOT - * NULL test in such a case, just IS DISTINCT FROM NULL. - */ - ntest->argisrow = false; - ntest->location = -1; - existConstraint = lappend(existConstraint, ntest); - } - } - } - - for (i = 0; i < num_check; i++) - { - Node *cexpr; - - /* - * If this constraint hasn't been fully validated yet, we must - * ignore it here. - */ - if (!attachrel_constr->check[i].ccvalid) - continue; - - cexpr = stringToNode(attachrel_constr->check[i].ccbin); - - /* - * Run each expression through const-simplification and - * canonicalization. It is necessary, because we will be - * comparing it to similarly-processed qual clauses, and may fail - * to detect valid matches without this. - */ - cexpr = eval_const_expressions(NULL, cexpr); - cexpr = (Node *) canonicalize_qual((Expr *) cexpr); - - existConstraint = list_concat(existConstraint, - make_ands_implicit((Expr *) cexpr)); - } - - existConstraint = list_make1(make_ands_explicit(existConstraint)); - - /* And away we go ... */ - if (predicate_implied_by(partConstraint, existConstraint, true)) - skip_validate = true; - } - - if (skip_validate) - { - /* No need to scan the table after all. */ - ereport(INFO, - (errmsg("partition constraint for table \"%s\" is implied by existing constraints", - RelationGetRelationName(attachrel)))); - } - else - { - /* Constraints proved insufficient, so we need to scan the table. */ - ListCell *lc; - - foreach(lc, attachrel_children) - { - AlteredTableInfo *tab; - Oid part_relid = lfirst_oid(lc); - Relation part_rel; - List *my_partconstr = partConstraint; - - /* Lock already taken */ - if (part_relid != RelationGetRelid(attachrel)) - part_rel = heap_open(part_relid, NoLock); - else - part_rel = attachrel; - - /* - * Skip if the partition is itself a partitioned table. We can - * only ever scan RELKIND_RELATION relations. - */ - if (part_rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) - { - if (part_rel != attachrel) - heap_close(part_rel, NoLock); - continue; - } - - if (part_rel != attachrel) - { - /* - * Adjust the constraint that we constructed above for - * attachRel so that it matches this partition's attribute - * numbers. - */ - my_partconstr = map_partition_varattnos(my_partconstr, 1, - part_rel, attachrel, - &found_whole_row); - /* There can never be a whole-row reference here */ - if (found_whole_row) - elog(ERROR, "unexpected whole-row reference found in partition key"); - } - - /* Grab a work queue entry. 
*/ - tab = ATGetQueueEntry(wqueue, part_rel); - tab->partition_constraint = (Expr *) linitial(my_partconstr); - - /* keep our lock until commit */ - if (part_rel != attachrel) - heap_close(part_rel, NoLock); - } - } + /* Validate partition constraints against the table being attached. */ + ValidatePartitionConstraints(wqueue, attachrel, attachrel_children, + partConstraint); ObjectAddressSet(address, RelationRelationId, RelationGetRelid(attachrel)); From d050c3064be485fb61e5d0dcb5f1c3564f68971c Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Wed, 24 Jun 2020 16:52:51 +0800 Subject: [PATCH 186/578] Don't lock tables in RelationGetPartitionDispatchInfo --- src/backend/catalog/partition.c | 53 +++++++++++++++++---------------- src/backend/executor/execMain.c | 10 +++++-- src/include/catalog/partition.h | 3 +- 3 files changed, 36 insertions(+), 30 deletions(-) diff --git a/src/backend/catalog/partition.c b/src/backend/catalog/partition.c index 9ecd77ea..3ea32102 100644 --- a/src/backend/catalog/partition.c +++ b/src/backend/catalog/partition.c @@ -1011,12 +1011,16 @@ get_partition_qual_relid(Oid relid) * RelationGetPartitionDispatchInfo * Returns information necessary to route tuples down a partition tree * - * All the partitions will be locked with lockmode, unless it is NoLock. - * A list of the OIDs of all the leaf partitions of rel is returned in - * *leaf_part_oids. + * The number of elements in the returned array (that is, the number of + * PartitionDispatch objects for the partitioned tables in the partition tree) + * is returned in *num_parted and a list of the OIDs of all the leaf + * partitions of rel is returned in *leaf_part_oids. + * + * All the relations in the partition tree (including 'rel') must have been + * locked (using at least the AccessShareLock) by the caller. */ PartitionDispatch * -RelationGetPartitionDispatchInfo(Relation rel, int lockmode, +RelationGetPartitionDispatchInfo(Relation rel, int *num_parted, List **leaf_part_oids) { PartitionDispatchData **pd; @@ -1031,14 +1035,18 @@ RelationGetPartitionDispatchInfo(Relation rel, int lockmode, offset; /* - * Lock partitions and make a list of the partitioned ones to prepare - * their PartitionDispatch objects below. + * We rely on the relcache to traverse the partition tree to build both + * the leaf partition OIDs list and the array of PartitionDispatch objects + * for the partitioned tables in the tree. That means every partitioned + * table in the tree must be locked, which is fine since we require the + * caller to lock all the partitions anyway. * - * Cannot use find_all_inheritors() here, because then the order of OIDs - * in parted_rels list would be unknown, which does not help, because we - * assign indexes within individual PartitionDispatch in an order that is - * predetermined (determined by the order of OIDs in individual partition - * descriptors). + * For every partitioned table in the tree, starting with the root + * partitioned table, add its relcache entry to parted_rels, while also + * queuing its partitions (in the order in which they appear in the + * partition descriptor) to be looked at later in the same loop. This is + * a bit tricky but works because the foreach() macro doesn't fetch the + * next list element until the bottom of the loop. 
*/ *num_parted = 1; parted_rels = list_make1(rel); @@ -1047,29 +1055,24 @@ RelationGetPartitionDispatchInfo(Relation rel, int lockmode, APPEND_REL_PARTITION_OIDS(rel, all_parts, all_parents); forboth(lc1, all_parts, lc2, all_parents) { - Relation partrel = heap_open(lfirst_oid(lc1), lockmode); + Oid partrelid = lfirst_oid(lc1); Relation parent = lfirst(lc2); - PartitionDesc partdesc = RelationGetPartitionDesc(partrel); + if (get_rel_relkind(partrelid) == RELKIND_PARTITIONED_TABLE) + { /* - * If this partition is a partitioned table, add its children to the - * end of the list, so that they are processed as well. + * Already locked by the caller. Note that it is the + * responsibility of the caller to close the below relcache entry, + * once done using the information being collected here (for + * example, in ExecEndModifyTable). */ - if (partdesc) - { + Relation partrel = heap_open(partrelid, NoLock); + (*num_parted)++; parted_rels = lappend(parted_rels, partrel); parted_rel_parents = lappend(parted_rel_parents, parent); APPEND_REL_PARTITION_OIDS(partrel, all_parts, all_parents); } - else - heap_close(partrel, NoLock); - - /* - * We keep the partitioned ones open until we're done using the - * information being collected here (for example, see - * ExecEndModifyTable). - */ } /* diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c index 78810446..776c9d41 100644 --- a/src/backend/executor/execMain.c +++ b/src/backend/executor/execMain.c @@ -44,6 +44,7 @@ #include "access/xact.h" #include "catalog/namespace.h" #include "catalog/partition.h" +#include "catalog/pg_inherits_fn.h" #include "catalog/pg_publication.h" #ifdef _MLS_ #include "catalog/pg_class.h" @@ -3696,9 +3697,12 @@ ExecSetupPartitionTupleRouting(Relation rel, int i; ResultRelInfo *leaf_part_rri; - /* Get the tuple-routing information and lock partitions */ - *pd = RelationGetPartitionDispatchInfo(rel, RowExclusiveLock, num_parted, - &leaf_parts); + /* + * Get the information about the partition tree after locking all the + * partitions. + */ + (void) find_all_inheritors(RelationGetRelid(rel), RowExclusiveLock, NULL); + *pd = RelationGetPartitionDispatchInfo(rel, num_parted, &leaf_parts); *num_partitions = list_length(leaf_parts); *partitions = (ResultRelInfo *) palloc0(*num_partitions * sizeof(ResultRelInfo)); diff --git a/src/include/catalog/partition.h b/src/include/catalog/partition.h index bef7a0f5..2283c675 100644 --- a/src/include/catalog/partition.h +++ b/src/include/catalog/partition.h @@ -88,8 +88,7 @@ extern Expr *get_partition_qual_relid(Oid relid); /* For tuple routing */ extern PartitionDispatch *RelationGetPartitionDispatchInfo(Relation rel, - int lockmode, int *num_parted, - List **leaf_part_oids); + int *num_parted, List **leaf_part_oids); extern void FormPartitionKeyDatum(PartitionDispatch pd, TupleTableSlot *slot, EState *estate, From 85b8156ded49910ea31425ed73238cd3361731b9 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Wed, 24 Jun 2020 17:31:26 +0800 Subject: [PATCH 187/578] Expand partitioned tables in PartDesc order. 
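A minimal SQL sketch of what bound-ordered expansion looks like from the outside, assuming a plain range-partitioned table; the table and partition names are made up for illustration, and on a real TBase cluster the plan will additionally contain Remote Subquery Scan nodes and distribution details:

    -- Hypothetical example tables. With expansion driven by the PartitionDesc,
    -- the Append children should come out in partition-bound order
    -- (meas_2020 before meas_2021), not in the order the partitions were created.
    create table meas (ts date, v int) partition by range (ts);
    create table meas_2021 partition of meas
        for values from ('2021-01-01') to ('2022-01-01');
    create table meas_2020 partition of meas
        for values from ('2020-01-01') to ('2021-01-01');
    explain (costs off) select * from meas;
    drop table meas;
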
http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/backend/optimizer/prep/prepunion.c | 311 +++++++++++++++++-------- src/test/regress/expected/insert.out | 4 +- 2 files changed, 215 insertions(+), 100 deletions(-) diff --git a/src/backend/optimizer/prep/prepunion.c b/src/backend/optimizer/prep/prepunion.c index 68b7cd0f..ec3de76b 100644 --- a/src/backend/optimizer/prep/prepunion.c +++ b/src/backend/optimizer/prep/prepunion.c @@ -34,6 +34,7 @@ #include "access/heapam.h" #include "access/htup_details.h" #include "access/sysattr.h" +#include "catalog/partition.h" #include "catalog/pg_inherits_fn.h" #include "catalog/pg_type.h" #include "miscadmin.h" @@ -100,6 +101,19 @@ static List *generate_append_tlist(List *colTypes, List *colCollations, static List *generate_setop_grouplist(SetOperationStmt *op, List *targetlist); static void expand_inherited_rtentry(PlannerInfo *root, RangeTblEntry *rte, Index rti); +static void expand_partitioned_rtentry(PlannerInfo *root, + RangeTblEntry *parentrte, + Index parentRTindex, Relation parentrel, + PlanRowMark *parentrc, PartitionDesc partdesc, + LOCKMODE lockmode, + bool *has_child, List **appinfos, + List **partitioned_child_rels); +static void expand_single_inheritance_child(PlannerInfo *root, + RangeTblEntry *parentrte, + Index parentRTindex, Relation parentrel, + PlanRowMark *parentrc, Relation childrel, + bool *has_child, List **appinfos, + List **partitioned_child_rels); static void make_inh_translation_list(Relation oldrelation, Relation newrelation, Index newvarno, @@ -1441,8 +1455,12 @@ expand_inherited_tables(PlannerInfo *root) * table, but with inh = false, to represent the parent table in its role * as a simple member of the inheritance set. * - * A childless table is never considered to be an inheritance set; therefore - * a parent RTE must always have at least two associated AppendRelInfos. +* A childless table is never considered to be an inheritance set. For +* regular inheritance, a parent RTE must always have at least two associated +* AppendRelInfos: one corresponding to the parent table as a simple member of +* inheritance set and one or more corresponding to the actual children. +* Since a partitioned table is not scanned, it might have only one associated +* AppendRelInfo. */ static void expand_inherited_rtentry(PlannerInfo *root, RangeTblEntry *rte, Index rti) @@ -1455,7 +1473,7 @@ expand_inherited_rtentry(PlannerInfo *root, RangeTblEntry *rte, Index rti) List *inhOIDs; List *appinfos; ListCell *l; - bool need_append; + bool has_child; PartitionedChildRelInfo *pcinfo; List *partitioned_child_rels = NIL; @@ -1529,14 +1547,35 @@ expand_inherited_rtentry(PlannerInfo *root, RangeTblEntry *rte, Index rti) /* Scan the inheritance set and expand it */ appinfos = NIL; - need_append = false; + has_child = false; + if (RelationGetPartitionDesc(oldrelation) != NULL) + { + /* + * If this table has partitions, recursively expand them in the order + * in which they appear in the PartitionDesc. But first, expand the + * parent itself. + */ + expand_single_inheritance_child(root, rte, rti, oldrelation, oldrc, + oldrelation, + &has_child, &appinfos, + &partitioned_child_rels); + expand_partitioned_rtentry(root, rte, rti, oldrelation, oldrc, + RelationGetPartitionDesc(oldrelation), + lockmode, + &has_child, &appinfos, + &partitioned_child_rels); + } + else + { + /* + * This table has no partitions. Expand any plain inheritance + * children in the order the OIDs were returned by + * find_all_inheritors. 
+ */ foreach(l, inhOIDs) { Oid childOID = lfirst_oid(l); Relation newrelation; - RangeTblEntry *childrte; - Index childRTindex; - AppendRelInfo *appinfo; /* Open rel if needed; we already have required locks */ if (childOID != parentOID) @@ -1547,8 +1586,8 @@ expand_inherited_rtentry(PlannerInfo *root, RangeTblEntry *rte, Index rti) /* * It is possible that the parent table has children that are temp * tables of other backends. We cannot safely access such tables - * (because of buffering issues), and the best thing to do seems to be - * to silently ignore them. + * (because of buffering issues), and the best thing to do seems + * to be to silently ignore them. */ if (childOID != parentOID && RELATION_IS_OTHER_TEMP(newrelation)) { @@ -1556,21 +1595,139 @@ expand_inherited_rtentry(PlannerInfo *root, RangeTblEntry *rte, Index rti) continue; } + expand_single_inheritance_child(root, rte, rti, oldrelation, oldrc, + newrelation, + &has_child, &appinfos, + &partitioned_child_rels); + + /* Close child relations, but keep locks */ + if (childOID != parentOID) + heap_close(newrelation, NoLock); + } + } + + heap_close(oldrelation, NoLock); + /* - * Build an RTE for the child, and attach to query's rangetable list. - * We copy most fields of the parent's RTE, but replace relation OID - * and relkind, and set inh = false. Also, set requiredPerms to zero - * since all required permissions checks are done on the original RTE. - * Likewise, set the child's securityQuals to empty, because we only - * want to apply the parent's RLS conditions regardless of what RLS - * properties individual children may have. (This is an intentional - * choice to make inherited RLS work like regular permissions checks.) - * The parent securityQuals will be propagated to children along with - * other base restriction clauses, so we don't need to do it here. + * If all the children were temp tables or a partitioned parent did not + * have any leaf partitions, pretend it's a non-inheritance situation; we + * don't need Append node in that case. The duplicate RTE we added for + * the parent table is harmless, so we don't bother to get rid of it; + * ditto for the useless PlanRowMark node. */ - childrte = copyObject(rte); + if (!has_child) + { + /* Clear flag before returning */ + rte->inh = false; + return; + } + + /* + * We keep a list of objects in root, each of which maps a partitioned + * parent RT index to the list of RT indexes of its partitioned child + * tables. When creating an Append or a ModifyTable path for the parent, + * we copy the child RT index list verbatim to the path so that it could + * be carried over to the executor so that the latter could identify the + * partitioned child tables. 
+ */ + if (partitioned_child_rels != NIL) + { + pcinfo = makeNode(PartitionedChildRelInfo); + + Assert(rte->relkind == RELKIND_PARTITIONED_TABLE); + pcinfo->parent_relid = rti; + pcinfo->child_rels = partitioned_child_rels; + root->pcinfo_list = lappend(root->pcinfo_list, pcinfo); + } + + /* Otherwise, OK to add to root->append_rel_list */ + root->append_rel_list = list_concat(root->append_rel_list, appinfos); +} + +static void +expand_partitioned_rtentry(PlannerInfo *root, RangeTblEntry *parentrte, + Index parentRTindex, Relation parentrel, + PlanRowMark *parentrc, PartitionDesc partdesc, + LOCKMODE lockmode, + bool *has_child, List **appinfos, + List **partitioned_child_rels) +{ + int i; + + check_stack_depth(); + + for (i = 0; i < partdesc->nparts; i++) + { + Oid childOID = partdesc->oids[i]; + Relation childrel; + + /* Open rel; we already have required locks */ + childrel = heap_open(childOID, NoLock); + + /* As in expand_inherited_rtentry, skip non-local temp tables */ + if (RELATION_IS_OTHER_TEMP(childrel)) + { + heap_close(childrel, lockmode); + continue; + } + + expand_single_inheritance_child(root, parentrte, parentRTindex, + parentrel, parentrc, childrel, + has_child, appinfos, + partitioned_child_rels); + + /* If this child is itself partitioned, recurse */ + if (childrel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) + expand_partitioned_rtentry(root, parentrte, parentRTindex, + parentrel, parentrc, + RelationGetPartitionDesc(childrel), + lockmode, + has_child, appinfos, + partitioned_child_rels); + + /* Close child relation, but keep locks */ + heap_close(childrel, NoLock); + } +} + +/* + * expand_single_inheritance_child + * Expand a single inheritance child, if needed. + * + * If this is a temp table of another backend, we'll return without doing + * anything at all. Otherwise, we'll set "has_child" to true, build a + * RangeTblEntry and either a PartitionedChildRelInfo or AppendRelInfo as + * appropriate, plus maybe a PlanRowMark. + */ +static void +expand_single_inheritance_child(PlannerInfo *root, RangeTblEntry *parentrte, + Index parentRTindex, Relation parentrel, + PlanRowMark *parentrc, Relation childrel, + bool *has_child, List **appinfos, + List **partitioned_child_rels) +{ + Query *parse = root->parse; + Oid parentOID = RelationGetRelid(parentrel); + Oid childOID = RelationGetRelid(childrel); + RangeTblEntry *childrte; + Index childRTindex; + AppendRelInfo *appinfo; + + /* + * Build an RTE for the child, and attach to query's rangetable list. We + * copy most fields of the parent's RTE, but replace relation OID and + * relkind, and set inh = false. Also, set requiredPerms to zero since + * all required permissions checks are done on the original RTE. Likewise, + * set the child's securityQuals to empty, because we only want to apply + * the parent's RLS conditions regardless of what RLS properties + * individual children may have. (This is an intentional choice to make + * inherited RLS work like regular permissions checks.) The parent + * securityQuals will be propagated to children along with other base + * restriction clauses, so we don't need to do it here. 
+ */ + childrte = copyObject(parentrte); childrte->relid = childOID; - childrte->relkind = newrelation->rd_rel->relkind; + childrte->relkind = childrel->rd_rel->relkind; childrte->inh = false; childrte->requiredPerms = 0; childrte->securityQuals = NIL; @@ -1578,118 +1735,76 @@ expand_inherited_rtentry(PlannerInfo *root, RangeTblEntry *rte, Index rti) childRTindex = list_length(parse->rtable); /* - * Build an AppendRelInfo for this parent and child, unless the child - * is a partitioned table. + * Build an AppendRelInfo for this parent and child, unless the child is a + * partitioned table. */ if (childrte->relkind != RELKIND_PARTITIONED_TABLE) { - need_append = true; + /* Remember if we saw a real child. */ + if (childOID != parentOID) + *has_child = true; + appinfo = makeNode(AppendRelInfo); - appinfo->parent_relid = rti; + appinfo->parent_relid = parentRTindex; appinfo->child_relid = childRTindex; - appinfo->parent_reltype = oldrelation->rd_rel->reltype; - appinfo->child_reltype = newrelation->rd_rel->reltype; - make_inh_translation_list(oldrelation, newrelation, childRTindex, + appinfo->parent_reltype = parentrel->rd_rel->reltype; + appinfo->child_reltype = childrel->rd_rel->reltype; + make_inh_translation_list(parentrel, childrel, childRTindex, &appinfo->translated_vars); appinfo->parent_reloid = parentOID; - appinfos = lappend(appinfos, appinfo); + *appinfos = lappend(*appinfos, appinfo); /* - * Translate the column permissions bitmaps to the child's attnums - * (we have to build the translated_vars list before we can do - * this). But if this is the parent table, leave copyObject's - * result alone. + * Translate the column permissions bitmaps to the child's attnums (we + * have to build the translated_vars list before we can do this). But + * if this is the parent table, leave copyObject's result alone. * * Note: we need to do this even though the executor won't run any - * permissions checks on the child RTE. The - * insertedCols/updatedCols bitmaps may be examined for - * trigger-firing purposes. + * permissions checks on the child RTE. The insertedCols/updatedCols + * bitmaps may be examined for trigger-firing purposes. */ if (childOID != parentOID) { - childrte->selectedCols = translate_col_privs(rte->selectedCols, + childrte->selectedCols = translate_col_privs(parentrte->selectedCols, appinfo->translated_vars); - childrte->insertedCols = translate_col_privs(rte->insertedCols, + childrte->insertedCols = translate_col_privs(parentrte->insertedCols, appinfo->translated_vars); - childrte->updatedCols = translate_col_privs(rte->updatedCols, + childrte->updatedCols = translate_col_privs(parentrte->updatedCols, appinfo->translated_vars); } } else - partitioned_child_rels = lappend_int(partitioned_child_rels, + *partitioned_child_rels = lappend_int(*partitioned_child_rels, childRTindex); /* * Build a PlanRowMark if parent is marked FOR UPDATE/SHARE. 
*/ - if (oldrc) + if (parentrc) { - PlanRowMark *newrc = makeNode(PlanRowMark); + PlanRowMark *childrc = makeNode(PlanRowMark); - newrc->rti = childRTindex; - newrc->prti = rti; - newrc->rowmarkId = oldrc->rowmarkId; + childrc->rti = childRTindex; + childrc->prti = parentRTindex; + childrc->rowmarkId = parentrc->rowmarkId; /* Reselect rowmark type, because relkind might not match parent */ - newrc->markType = select_rowmark_type(childrte, oldrc->strength); - newrc->allMarkTypes = (1 << newrc->markType); - newrc->strength = oldrc->strength; - newrc->waitPolicy = oldrc->waitPolicy; + childrc->markType = select_rowmark_type(childrte, parentrc->strength); + childrc->allMarkTypes = (1 << childrc->markType); + childrc->strength = parentrc->strength; + childrc->waitPolicy = parentrc->waitPolicy; /* - * We mark RowMarks for partitioned child tables as parent - * RowMarks so that the executor ignores them (except their - * existence means that the child tables be locked using - * appropriate mode). + * We mark RowMarks for partitioned child tables as parent RowMarks so + * that the executor ignores them (except their existence means that + * the child tables be locked using appropriate mode). */ - newrc->isParent = (childrte->relkind == RELKIND_PARTITIONED_TABLE); + childrc->isParent = (childrte->relkind == RELKIND_PARTITIONED_TABLE); /* Include child's rowmark type in parent's allMarkTypes */ - oldrc->allMarkTypes |= newrc->allMarkTypes; - - root->rowMarks = lappend(root->rowMarks, newrc); - } + parentrc->allMarkTypes |= childrc->allMarkTypes; - /* Close child relations, but keep locks */ - if (childOID != parentOID) - heap_close(newrelation, NoLock); + root->rowMarks = lappend(root->rowMarks, childrc); } - - heap_close(oldrelation, NoLock); - - /* - * If all the children were temp tables or a partitioned parent did not - * have any leaf partitions, pretend it's a non-inheritance situation; we - * don't need Append node in that case. The duplicate RTE we added for - * the parent table is harmless, so we don't bother to get rid of it; - * ditto for the useless PlanRowMark node. - */ - if (!need_append) - { - /* Clear flag before returning */ - rte->inh = false; - return; - } - - /* - * We keep a list of objects in root, each of which maps a partitioned - * parent RT index to the list of RT indexes of its partitioned child - * tables. When creating an Append or a ModifyTable path for the parent, - * we copy the child RT index list verbatim to the path so that it could - * be carried over to the executor so that the latter could identify the - * partitioned child tables. 
- */ - if (partitioned_child_rels != NIL) - { - pcinfo = makeNode(PartitionedChildRelInfo); - - Assert(rte->relkind == RELKIND_PARTITIONED_TABLE); - pcinfo->parent_relid = rti; - pcinfo->child_rels = partitioned_child_rels; - root->pcinfo_list = lappend(root->pcinfo_list, pcinfo); - } - - /* Otherwise, OK to add to root->append_rel_list */ - root->append_rel_list = list_concat(root->append_rel_list, appinfos); } /* diff --git a/src/test/regress/expected/insert.out b/src/test/regress/expected/insert.out index 70a7ea2f..944336b7 100644 --- a/src/test/regress/expected/insert.out +++ b/src/test/regress/expected/insert.out @@ -280,12 +280,12 @@ select tableoid::regclass, * from list_parted; -------------+----+---- part_aa_bb | aA | part_cc_dd | cC | 1 - part_null | | 0 - part_null | | 1 part_ee_ff1 | ff | 1 part_ee_ff1 | EE | 1 part_ee_ff2 | ff | 11 part_ee_ff2 | EE | 10 + part_null | | 0 + part_null | | 1 (8 rows) -- some more tests to exercise tuple-routing with multi-level partitioning From 182433b35f5647030187095efef7293146e8784a Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Thu, 25 Jun 2020 10:46:39 +0800 Subject: [PATCH 188/578] Introduce 64-bit hash functions with a 64-bit seed. http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- doc/src/sgml/xindex.sgml | 13 +- src/backend/access/hash/hashfunc.c | 372 +++++++++++++++++++- src/backend/access/hash/hashpage.c | 2 +- src/backend/access/hash/hashutil.c | 6 +- src/backend/access/hash/hashvalidate.c | 44 ++- src/backend/commands/opclasscmds.c | 28 +- src/backend/utils/adt/acl.c | 15 + src/backend/utils/adt/arrayfuncs.c | 79 +++++ src/backend/utils/adt/date.c | 21 ++ src/backend/utils/adt/jsonb_op.c | 43 +++ src/backend/utils/adt/jsonb_util.c | 43 +++ src/backend/utils/adt/mac.c | 9 + src/backend/utils/adt/mac8.c | 9 + src/backend/utils/adt/network.c | 10 + src/backend/utils/adt/numeric.c | 60 ++++ src/backend/utils/adt/pg_lsn.c | 210 +++++------ src/backend/utils/adt/rangetypes.c | 63 ++++ src/backend/utils/adt/timestamp.c | 19 + src/backend/utils/adt/uuid.c | 8 + src/backend/utils/adt/varchar.c | 18 + src/backend/utils/cache/lsyscache.c | 8 +- src/backend/utils/cache/typcache.c | 58 ++- src/include/access/hash.h | 30 +- src/include/catalog/pg_amproc.h | 37 +- src/include/catalog/pg_proc.h | 54 +++ src/include/fmgr.h | 1 + src/include/utils/jsonb.h | 316 ++++++++--------- src/include/utils/typcache.h | 208 +++++------ src/test/regress/expected/alter_generic.out | 4 +- src/test/regress/expected/hash_func.out | 300 ++++++++++++++++ src/test/regress/parallel_schedule | 2 +- src/test/regress/sql/hash_func.sql | 222 ++++++++++++ 32 files changed, 1912 insertions(+), 400 deletions(-) create mode 100644 src/test/regress/expected/hash_func.out create mode 100644 src/test/regress/sql/hash_func.sql diff --git a/doc/src/sgml/xindex.sgml b/doc/src/sgml/xindex.sgml index 333a36c4..745b4d56 100644 --- a/doc/src/sgml/xindex.sgml +++ b/doc/src/sgml/xindex.sgml @@ -436,7 +436,8 @@ - Hash indexes require one support function, shown in . 
@@ -451,9 +452,17 @@ - Compute the hash value for a key + Compute the 32-bit hash value for a key 1 + + + Compute the 64-bit hash value for a key given a 64-bit salt; if + the salt is 0, the low 32 bits will match the value that would + have been computed by function 1 + + 2 + diff --git a/src/backend/access/hash/hashfunc.c b/src/backend/access/hash/hashfunc.c index ff46a854..f4959255 100644 --- a/src/backend/access/hash/hashfunc.c +++ b/src/backend/access/hash/hashfunc.c @@ -54,18 +54,36 @@ hashchar(PG_FUNCTION_ARGS) return hash_uint32((int32) PG_GETARG_CHAR(0)); } +Datum +hashcharextended(PG_FUNCTION_ARGS) +{ + return hash_uint32_extended((int32) PG_GETARG_CHAR(0), PG_GETARG_INT64(1)); +} + Datum hashint2(PG_FUNCTION_ARGS) { return hash_uint32((int32) PG_GETARG_INT16(0)); } +Datum +hashint2extended(PG_FUNCTION_ARGS) +{ + return hash_uint32_extended((int32) PG_GETARG_INT16(0), PG_GETARG_INT64(1)); +} + Datum hashint4(PG_FUNCTION_ARGS) { return hash_uint32(PG_GETARG_INT32(0)); } +Datum +hashint4extended(PG_FUNCTION_ARGS) +{ + return hash_uint32_extended(PG_GETARG_INT32(0), PG_GETARG_INT64(1)); +} + Datum hashint8(PG_FUNCTION_ARGS) { @@ -86,18 +104,43 @@ hashint8(PG_FUNCTION_ARGS) return hash_uint32(lohalf); } +Datum +hashint8extended(PG_FUNCTION_ARGS) +{ + /* Same approach as hashint8 */ + int64 val = PG_GETARG_INT64(0); + uint32 lohalf = (uint32) val; + uint32 hihalf = (uint32) (val >> 32); + + lohalf ^= (val >= 0) ? hihalf : ~hihalf; + + return hash_uint32_extended(lohalf, PG_GETARG_INT64(1)); +} + Datum hashoid(PG_FUNCTION_ARGS) { return hash_uint32((uint32) PG_GETARG_OID(0)); } +Datum +hashoidextended(PG_FUNCTION_ARGS) +{ + return hash_uint32_extended((uint32) PG_GETARG_OID(0), PG_GETARG_INT64(1)); +} + Datum hashenum(PG_FUNCTION_ARGS) { return hash_uint32((uint32) PG_GETARG_OID(0)); } +Datum +hashenumextended(PG_FUNCTION_ARGS) +{ + return hash_uint32_extended((uint32) PG_GETARG_OID(0), PG_GETARG_INT64(1)); +} + Datum hashfloat4(PG_FUNCTION_ARGS) { @@ -124,6 +167,21 @@ hashfloat4(PG_FUNCTION_ARGS) return hash_any((unsigned char *) &key8, sizeof(key8)); } +Datum +hashfloat4extended(PG_FUNCTION_ARGS) +{ + float4 key = PG_GETARG_FLOAT4(0); + uint64 seed = PG_GETARG_INT64(1); + float8 key8; + + /* Same approach as hashfloat4 */ + if (key == (float4) 0) + PG_RETURN_UINT64(seed); + key8 = key; + + return hash_any_extended((unsigned char *) &key8, sizeof(key8), seed); +} + Datum hashfloat8(PG_FUNCTION_ARGS) { @@ -140,6 +198,19 @@ hashfloat8(PG_FUNCTION_ARGS) return hash_any((unsigned char *) &key, sizeof(key)); } +Datum +hashfloat8extended(PG_FUNCTION_ARGS) +{ + float8 key = PG_GETARG_FLOAT8(0); + uint64 seed = PG_GETARG_INT64(1); + + /* Same approach as hashfloat8 */ + if (key == (float8) 0) + PG_RETURN_UINT64(seed); + + return hash_any_extended((unsigned char *) &key, sizeof(key), seed); +} + Datum hashoidvector(PG_FUNCTION_ARGS) { @@ -148,6 +219,16 @@ hashoidvector(PG_FUNCTION_ARGS) return hash_any((unsigned char *) key->values, key->dim1 * sizeof(Oid)); } +Datum +hashoidvectorextended(PG_FUNCTION_ARGS) +{ + oidvector *key = (oidvector *) PG_GETARG_POINTER(0); + + return hash_any_extended((unsigned char *) key->values, + key->dim1 * sizeof(Oid), + PG_GETARG_INT64(1)); +} + Datum hashname(PG_FUNCTION_ARGS) { @@ -156,6 +237,15 @@ hashname(PG_FUNCTION_ARGS) return hash_any((unsigned char *) key, strlen(key)); } +Datum +hashnameextended(PG_FUNCTION_ARGS) +{ + char *key = NameStr(*PG_GETARG_NAME(0)); + + return hash_any_extended((unsigned char *) key, strlen(key), + PG_GETARG_INT64(1)); +} + 
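/*
 * Illustrative sketch, not part of the patch above: the pattern that each
 * pair of functions in this file follows.  "mytype", "hashmytype" and
 * "hashmytypeextended" are hypothetical names; a fixed-size type would
 * register the one-argument 32-bit hash as hash support proc 1 and the
 * two-argument seeded variant as proc 2.  Per the convention introduced
 * here, a seed of 0 makes the low 32 bits of the extended result equal to
 * the 32-bit result.
 */
Datum
hashmytype(PG_FUNCTION_ARGS)
{
    mytype     *key = (mytype *) PG_GETARG_POINTER(0);

    return hash_any((unsigned char *) key, sizeof(mytype));
}

Datum
hashmytypeextended(PG_FUNCTION_ARGS)
{
    mytype     *key = (mytype *) PG_GETARG_POINTER(0);

    return hash_any_extended((unsigned char *) key, sizeof(mytype),
                             PG_GETARG_INT64(1));
}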
Datum hashtext(PG_FUNCTION_ARGS) { @@ -176,6 +266,22 @@ hashtext(PG_FUNCTION_ARGS) return result; } +Datum +hashtextextended(PG_FUNCTION_ARGS) +{ + text *key = PG_GETARG_TEXT_PP(0); + Datum result; + + /* Same approach as hashtext */ + result = hash_any_extended((unsigned char *) VARDATA_ANY(key), + VARSIZE_ANY_EXHDR(key), + PG_GETARG_INT64(1)); + + PG_FREE_IF_COPY(key, 0); + + return result; +} + /* * hashvarlena() can be used for any varlena datatype in which there are * no non-significant bits, ie, distinct bitpatterns never compare as equal. @@ -195,6 +301,21 @@ hashvarlena(PG_FUNCTION_ARGS) return result; } +Datum +hashvarlenaextended(PG_FUNCTION_ARGS) +{ + struct varlena *key = PG_GETARG_VARLENA_PP(0); + Datum result; + + result = hash_any_extended((unsigned char *) VARDATA_ANY(key), + VARSIZE_ANY_EXHDR(key), + PG_GETARG_INT64(1)); + + PG_FREE_IF_COPY(key, 0); + + return result; +} + /* * This hash function was written by Bob Jenkins * (bob_jenkins@burtleburtle.net), and superficially adapted @@ -510,7 +631,227 @@ hash_any(register const unsigned char *k, register int keylen) } /* - * hash_uint32() -- hash a 32-bit value + * hash_any_extended() -- hash into a 64-bit value, using an optional seed + * k : the key (the unaligned variable-length array of bytes) + * len : the length of the key, counting by bytes + * seed : a 64-bit seed (0 means no seed) + * + * Returns a uint64 value. Otherwise similar to hash_any. + */ +Datum +hash_any_extended(register const unsigned char *k, register int keylen, + uint64 seed) +{ + register uint32 a, + b, + c, + len; + + /* Set up the internal state */ + len = keylen; + a = b = c = 0x9e3779b9 + len + 3923095; + + /* If the seed is non-zero, use it to perturb the internal state. */ + if (seed != 0) + { + /* + * In essence, the seed is treated as part of the data being hashed, + * but for simplicity, we pretend that it's padded with four bytes of + * zeroes so that the seed constitutes a 12-byte chunk. 
+ */ + a += (uint32) (seed >> 32); + b += (uint32) seed; + mix(a, b, c); + } + + /* If the source pointer is word-aligned, we use word-wide fetches */ + if (((uintptr_t) k & UINT32_ALIGN_MASK) == 0) + { + /* Code path for aligned source data */ + register const uint32 *ka = (const uint32 *) k; + + /* handle most of the key */ + while (len >= 12) + { + a += ka[0]; + b += ka[1]; + c += ka[2]; + mix(a, b, c); + ka += 3; + len -= 12; + } + + /* handle the last 11 bytes */ + k = (const unsigned char *) ka; +#ifdef WORDS_BIGENDIAN + switch (len) + { + case 11: + c += ((uint32) k[10] << 8); + /* fall through */ + case 10: + c += ((uint32) k[9] << 16); + /* fall through */ + case 9: + c += ((uint32) k[8] << 24); + /* the lowest byte of c is reserved for the length */ + /* fall through */ + case 8: + b += ka[1]; + a += ka[0]; + break; + case 7: + b += ((uint32) k[6] << 8); + /* fall through */ + case 6: + b += ((uint32) k[5] << 16); + /* fall through */ + case 5: + b += ((uint32) k[4] << 24); + /* fall through */ + case 4: + a += ka[0]; + break; + case 3: + a += ((uint32) k[2] << 8); + /* fall through */ + case 2: + a += ((uint32) k[1] << 16); + /* fall through */ + case 1: + a += ((uint32) k[0] << 24); + /* case 0: nothing left to add */ + } +#else /* !WORDS_BIGENDIAN */ + switch (len) + { + case 11: + c += ((uint32) k[10] << 24); + /* fall through */ + case 10: + c += ((uint32) k[9] << 16); + /* fall through */ + case 9: + c += ((uint32) k[8] << 8); + /* the lowest byte of c is reserved for the length */ + /* fall through */ + case 8: + b += ka[1]; + a += ka[0]; + break; + case 7: + b += ((uint32) k[6] << 16); + /* fall through */ + case 6: + b += ((uint32) k[5] << 8); + /* fall through */ + case 5: + b += k[4]; + /* fall through */ + case 4: + a += ka[0]; + break; + case 3: + a += ((uint32) k[2] << 16); + /* fall through */ + case 2: + a += ((uint32) k[1] << 8); + /* fall through */ + case 1: + a += k[0]; + /* case 0: nothing left to add */ + } +#endif /* WORDS_BIGENDIAN */ + } + else + { + /* Code path for non-aligned source data */ + + /* handle most of the key */ + while (len >= 12) + { +#ifdef WORDS_BIGENDIAN + a += (k[3] + ((uint32) k[2] << 8) + ((uint32) k[1] << 16) + ((uint32) k[0] << 24)); + b += (k[7] + ((uint32) k[6] << 8) + ((uint32) k[5] << 16) + ((uint32) k[4] << 24)); + c += (k[11] + ((uint32) k[10] << 8) + ((uint32) k[9] << 16) + ((uint32) k[8] << 24)); +#else /* !WORDS_BIGENDIAN */ + a += (k[0] + ((uint32) k[1] << 8) + ((uint32) k[2] << 16) + ((uint32) k[3] << 24)); + b += (k[4] + ((uint32) k[5] << 8) + ((uint32) k[6] << 16) + ((uint32) k[7] << 24)); + c += (k[8] + ((uint32) k[9] << 8) + ((uint32) k[10] << 16) + ((uint32) k[11] << 24)); +#endif /* WORDS_BIGENDIAN */ + mix(a, b, c); + k += 12; + len -= 12; + } + + /* handle the last 11 bytes */ +#ifdef WORDS_BIGENDIAN + switch (len) /* all the case statements fall through */ + { + case 11: + c += ((uint32) k[10] << 8); + case 10: + c += ((uint32) k[9] << 16); + case 9: + c += ((uint32) k[8] << 24); + /* the lowest byte of c is reserved for the length */ + case 8: + b += k[7]; + case 7: + b += ((uint32) k[6] << 8); + case 6: + b += ((uint32) k[5] << 16); + case 5: + b += ((uint32) k[4] << 24); + case 4: + a += k[3]; + case 3: + a += ((uint32) k[2] << 8); + case 2: + a += ((uint32) k[1] << 16); + case 1: + a += ((uint32) k[0] << 24); + /* case 0: nothing left to add */ + } +#else /* !WORDS_BIGENDIAN */ + switch (len) /* all the case statements fall through */ + { + case 11: + c += ((uint32) k[10] << 24); + case 10: + c += ((uint32) 
k[9] << 16); + case 9: + c += ((uint32) k[8] << 8); + /* the lowest byte of c is reserved for the length */ + case 8: + b += ((uint32) k[7] << 24); + case 7: + b += ((uint32) k[6] << 16); + case 6: + b += ((uint32) k[5] << 8); + case 5: + b += k[4]; + case 4: + a += ((uint32) k[3] << 24); + case 3: + a += ((uint32) k[2] << 16); + case 2: + a += ((uint32) k[1] << 8); + case 1: + a += k[0]; + /* case 0: nothing left to add */ + } +#endif /* WORDS_BIGENDIAN */ + } + + final(a, b, c); + + /* report the result */ + PG_RETURN_UINT64(((uint64) b << 32) | c); +} + +/* + * hash_uint32() -- hash a 32-bit value to a 32-bit value * * This has the same result as * hash_any(&k, sizeof(uint32)) @@ -532,6 +873,35 @@ hash_uint32(uint32 k) return UInt32GetDatum(c); } +/* + * hash_uint32_extended() -- hash a 32-bit value to a 64-bit value, with a seed + * + * Like hash_uint32, this is a convenience function. + */ +Datum +hash_uint32_extended(uint32 k, uint64 seed) +{ + register uint32 a, + b, + c; + + a = b = c = 0x9e3779b9 + (uint32) sizeof(uint32) + 3923095; + + if (seed != 0) + { + a += (uint32) (seed >> 32); + b += (uint32) seed; + mix(a, b, c); + } + + a += k; + + final(a, b, c); + + /* report the result */ + PG_RETURN_UINT64(((uint64) b << 32) | c); +} + #ifdef PGXC /* * compute_hash() diff --git a/src/backend/access/hash/hashpage.c b/src/backend/access/hash/hashpage.c index e592499e..eb524a73 100644 --- a/src/backend/access/hash/hashpage.c +++ b/src/backend/access/hash/hashpage.c @@ -373,7 +373,7 @@ _hash_init(Relation rel, double num_tuples, ForkNumber forkNum) if (ffactor < 10) ffactor = 10; - procid = index_getprocid(rel, 1, HASHPROC); + procid = index_getprocid(rel, 1, HASHSTANDARD_PROC); /* * We initialize the metapage, the first N bucket pages, and the first diff --git a/src/backend/access/hash/hashutil.c b/src/backend/access/hash/hashutil.c index 8542ae3a..15468e78 100644 --- a/src/backend/access/hash/hashutil.c +++ b/src/backend/access/hash/hashutil.c @@ -85,7 +85,7 @@ _hash_datum2hashkey(Relation rel, Datum key) Oid collation; /* XXX assumes index has only one attribute */ - procinfo = index_getprocinfo(rel, 1, HASHPROC); + procinfo = index_getprocinfo(rel, 1, HASHSTANDARD_PROC); collation = rel->rd_indcollation[0]; return DatumGetUInt32(FunctionCall1Coll(procinfo, collation, key)); @@ -108,10 +108,10 @@ _hash_datum2hashkey_type(Relation rel, Datum key, Oid keytype) hash_proc = get_opfamily_proc(rel->rd_opfamily[0], keytype, keytype, - HASHPROC); + HASHSTANDARD_PROC); if (!RegProcedureIsValid(hash_proc)) elog(ERROR, "missing support function %d(%u,%u) for index \"%s\"", - HASHPROC, keytype, keytype, + HASHSTANDARD_PROC, keytype, keytype, RelationGetRelationName(rel)); collation = rel->rd_indcollation[0]; diff --git a/src/backend/access/hash/hashvalidate.c b/src/backend/access/hash/hashvalidate.c index 7e4364f8..a027d782 100644 --- a/src/backend/access/hash/hashvalidate.c +++ b/src/backend/access/hash/hashvalidate.c @@ -29,7 +29,7 @@ #include "utils/syscache.h" -static bool check_hash_func_signature(Oid funcid, Oid restype, Oid argtype); +static bool check_hash_func_signature(Oid funcid, int16 amprocnum, Oid argtype); /* @@ -105,8 +105,9 @@ hashvalidate(Oid opclassoid) /* Check procedure numbers and function signatures */ switch (procform->amprocnum) { - case HASHPROC: - if (!check_hash_func_signature(procform->amproc, INT4OID, + case HASHSTANDARD_PROC: + case HASHEXTENDED_PROC: + if (!check_hash_func_signature(procform->amproc, procform->amprocnum, procform->amproclefttype)) { ereport(INFO, 
@@ -264,19 +265,37 @@ hashvalidate(Oid opclassoid) * hacks in the core hash opclass definitions. */ static bool -check_hash_func_signature(Oid funcid, Oid restype, Oid argtype) -{// #lizard forgives +check_hash_func_signature(Oid funcid, int16 amprocnum, Oid argtype) +{ bool result = true; + Oid restype; + int16 nargs; HeapTuple tp; Form_pg_proc procform; + switch (amprocnum) + { + case HASHSTANDARD_PROC: + restype = INT4OID; + nargs = 1; + break; + + case HASHEXTENDED_PROC: + restype = INT8OID; + nargs = 2; + break; + + default: + elog(ERROR, "invalid amprocnum"); + } + tp = SearchSysCache1(PROCOID, ObjectIdGetDatum(funcid)); if (!HeapTupleIsValid(tp)) elog(ERROR, "cache lookup failed for function %u", funcid); procform = (Form_pg_proc) GETSTRUCT(tp); if (procform->prorettype != restype || procform->proretset || - procform->pronargs != 1) + procform->pronargs != nargs) result = false; if (!IsBinaryCoercible(argtype, procform->proargtypes.values[0])) @@ -290,24 +309,29 @@ check_hash_func_signature(Oid funcid, Oid restype, Oid argtype) * identity, not just its input type, because hashvarlena() takes * INTERNAL and allowing any such function seems too scary. */ - if (funcid == F_HASHINT4 && + if ((funcid == F_HASHINT4 || funcid == F_HASHINT4EXTENDED) && (argtype == DATEOID || argtype == ABSTIMEOID || argtype == RELTIMEOID || argtype == XIDOID || argtype == CIDOID)) /* okay, allowed use of hashint4() */ ; - else if (funcid == F_TIMESTAMP_HASH && + else if ((funcid == F_TIMESTAMP_HASH || + funcid == F_TIMESTAMP_HASH_EXTENDED) && argtype == TIMESTAMPTZOID) /* okay, allowed use of timestamp_hash() */ ; - else if (funcid == F_HASHCHAR && + else if ((funcid == F_HASHCHAR || funcid == F_HASHCHAREXTENDED) && argtype == BOOLOID) /* okay, allowed use of hashchar() */ ; - else if (funcid == F_HASHVARLENA && + else if ((funcid == F_HASHVARLENA || funcid == F_HASHVARLENAEXTENDED) && argtype == BYTEAOID) /* okay, allowed use of hashvarlena() */ ; else result = false; } + /* If function takes a second argument, it must be for a 64-bit salt. */ + if (nargs == 2 && procform->proargtypes.values[1] != INT8OID) + result = false; + ReleaseSysCache(tp); return result; } diff --git a/src/backend/commands/opclasscmds.c b/src/backend/commands/opclasscmds.c index 80cbadb2..6e0f12b7 100644 --- a/src/backend/commands/opclasscmds.c +++ b/src/backend/commands/opclasscmds.c @@ -78,6 +78,7 @@ #include #include "access/genam.h" +#include "access/hash.h" #include "access/heapam.h" #include "access/nbtree.h" #include "access/htup_details.h" @@ -1189,7 +1190,8 @@ assignProcTypes(OpFamilyMember *member, Oid amoid, Oid typeoid) /* * btree comparison procs must be 2-arg procs returning int4, while btree * sortsupport procs must take internal and return void. hash support - * procs must be 1-arg procs returning int4. Otherwise we don't know. + * proc 1 must be a 1-arg proc returning int4, while proc 2 must be a + * 2-arg proc returning int8. Otherwise we don't know. 
*/ if (amoid == BTREE_AM_OID) { @@ -1232,14 +1234,28 @@ assignProcTypes(OpFamilyMember *member, Oid amoid, Oid typeoid) } else if (amoid == HASH_AM_OID) { + if (member->number == HASHSTANDARD_PROC) + { if (procform->pronargs != 1) ereport(ERROR, (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), - errmsg("hash procedures must have one argument"))); - if (procform->prorettype != INT4OID) - ereport(ERROR, - (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), - errmsg("hash procedures must return integer"))); + errmsg("hash procedure 1 must have one argument"))); + if (procform->prorettype != INT4OID) + ereport(ERROR, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("hash procedure 1 must return integer"))); + } + else if (member->number == HASHEXTENDED_PROC) + { + if (procform->pronargs != 2) + ereport(ERROR, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("hash procedure 2 must have two arguments"))); + if (procform->prorettype != INT8OID) + ereport(ERROR, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("hash procedure 2 must return bigint"))); + } /* * If lefttype/righttype isn't specified, use the proc's input type diff --git a/src/backend/utils/adt/acl.c b/src/backend/utils/adt/acl.c index 7c290f3a..6c638e3c 100644 --- a/src/backend/utils/adt/acl.c +++ b/src/backend/utils/adt/acl.c @@ -16,6 +16,7 @@ #include +#include "access/hash.h" #include "access/htup_details.h" #include "catalog/catalog.h" #include "catalog/namespace.h" @@ -716,6 +717,20 @@ hash_aclitem(PG_FUNCTION_ARGS) PG_RETURN_UINT32((uint32) (a->ai_privs + a->ai_grantee + a->ai_grantor)); } +/* + * 64-bit hash function for aclitem. + * + * Similar to hash_aclitem, but accepts a seed and returns a uint64 value. + */ +Datum +hash_aclitem_extended(PG_FUNCTION_ARGS) +{ + AclItem *a = PG_GETARG_ACLITEM_P(0); + uint64 seed = PG_GETARG_INT64(1); + uint32 sum = (uint32) (a->ai_privs + a->ai_grantee + a->ai_grantor); + + return (seed == 0) ? UInt64GetDatum(sum) : hash_uint32_extended(sum, seed); +} /* * acldefault() --- create an ACL describing default access permissions diff --git a/src/backend/utils/adt/arrayfuncs.c b/src/backend/utils/adt/arrayfuncs.c index 15b7a03c..06f20055 100644 --- a/src/backend/utils/adt/arrayfuncs.c +++ b/src/backend/utils/adt/arrayfuncs.c @@ -21,6 +21,7 @@ #endif #include +#include "access/hash.h" #include "access/htup_details.h" #include "catalog/pg_type.h" #include "funcapi.h" @@ -4043,6 +4044,84 @@ hash_array(PG_FUNCTION_ARGS) PG_RETURN_UINT32(result); } +/* + * Returns 64-bit value by hashing a value to a 64-bit value, with a seed. + * Otherwise, similar to hash_array. 
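+ * Element hashes are combined by multiplying the running result by 31 and
+ * adding the element's hash; NULL elements contribute zero, matching the
+ * combining rule of the 32-bit hash_array.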
+ */ +Datum +hash_array_extended(PG_FUNCTION_ARGS) +{ + AnyArrayType *array = PG_GETARG_ANY_ARRAY(0); + uint64 seed = PG_GETARG_INT64(1); + int ndims = AARR_NDIM(array); + int *dims = AARR_DIMS(array); + Oid element_type = AARR_ELEMTYPE(array); + uint64 result = 1; + int nitems; + TypeCacheEntry *typentry; + int typlen; + bool typbyval; + char typalign; + int i; + array_iter iter; + FunctionCallInfoData locfcinfo; + + typentry = (TypeCacheEntry *) fcinfo->flinfo->fn_extra; + if (typentry == NULL || + typentry->type_id != element_type) + { + typentry = lookup_type_cache(element_type, + TYPECACHE_HASH_EXTENDED_PROC_FINFO); + if (!OidIsValid(typentry->hash_extended_proc_finfo.fn_oid)) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_FUNCTION), + errmsg("could not identify an extended hash function for type %s", + format_type_be(element_type)))); + fcinfo->flinfo->fn_extra = (void *) typentry; + } + typlen = typentry->typlen; + typbyval = typentry->typbyval; + typalign = typentry->typalign; + + InitFunctionCallInfoData(locfcinfo, &typentry->hash_extended_proc_finfo, 2, + InvalidOid, NULL, NULL); + + /* Loop over source data */ + nitems = ArrayGetNItems(ndims, dims); + array_iter_setup(&iter, array); + + for (i = 0; i < nitems; i++) + { + Datum elt; + bool isnull; + uint64 elthash; + + /* Get element, checking for NULL */ + elt = array_iter_next(&iter, &isnull, i, typlen, typbyval, typalign); + + if (isnull) + { + elthash = 0; + } + else + { + /* Apply the hash function */ + locfcinfo.arg[0] = elt; + locfcinfo.arg[1] = seed; + locfcinfo.argnull[0] = false; + locfcinfo.argnull[1] = false; + locfcinfo.isnull = false; + elthash = DatumGetUInt64(FunctionCallInvoke(&locfcinfo)); + } + + result = (result << 5) - result + elthash; + } + + AARR_FREE_IF_COPY(array, 0); + + PG_RETURN_UINT64(result); +} + /*----------------------------------------------------------------------------- * array overlap/containment comparisons diff --git a/src/backend/utils/adt/date.c b/src/backend/utils/adt/date.c index ca7454de..cb29169f 100644 --- a/src/backend/utils/adt/date.c +++ b/src/backend/utils/adt/date.c @@ -1520,6 +1520,12 @@ time_hash(PG_FUNCTION_ARGS) return hashint8(fcinfo); } +Datum +time_hash_extended(PG_FUNCTION_ARGS) +{ + return hashint8extended(fcinfo); +} + Datum time_larger(PG_FUNCTION_ARGS) { @@ -2225,6 +2231,21 @@ timetz_hash(PG_FUNCTION_ARGS) PG_RETURN_UINT32(thash); } +Datum +timetz_hash_extended(PG_FUNCTION_ARGS) +{ + TimeTzADT *key = PG_GETARG_TIMETZADT_P(0); + uint64 seed = PG_GETARG_DATUM(1); + uint64 thash; + + /* Same approach as timetz_hash */ + thash = DatumGetUInt64(DirectFunctionCall2(hashint8extended, + Int64GetDatumFast(key->time), + seed)); + thash ^= DatumGetUInt64(hash_uint32_extended(key->zone, seed)); + PG_RETURN_UINT64(thash); +} + Datum timetz_larger(PG_FUNCTION_ARGS) { diff --git a/src/backend/utils/adt/jsonb_op.c b/src/backend/utils/adt/jsonb_op.c index 2ceaeef7..83219269 100644 --- a/src/backend/utils/adt/jsonb_op.c +++ b/src/backend/utils/adt/jsonb_op.c @@ -291,3 +291,46 @@ jsonb_hash(PG_FUNCTION_ARGS) PG_FREE_IF_COPY(jb, 0); PG_RETURN_INT32(hash); } + +Datum +jsonb_hash_extended(PG_FUNCTION_ARGS) +{ + Jsonb *jb = PG_GETARG_JSONB(0); + uint64 seed = PG_GETARG_INT64(1); + JsonbIterator *it; + JsonbValue v; + JsonbIteratorToken r; + uint64 hash = 0; + + if (JB_ROOT_COUNT(jb) == 0) + PG_RETURN_UINT64(seed); + + it = JsonbIteratorInit(&jb->root); + + while ((r = JsonbIteratorNext(&it, &v, false)) != WJB_DONE) + { + switch (r) + { + /* Rotation is left to JsonbHashScalarValueExtended() 
*/ + case WJB_BEGIN_ARRAY: + hash ^= ((UINT64CONST(JB_FARRAY) << 32) | UINT64CONST(JB_FARRAY)); + break; + case WJB_BEGIN_OBJECT: + hash ^= ((UINT64CONST(JB_FOBJECT) << 32) | UINT64CONST(JB_FOBJECT)); + break; + case WJB_KEY: + case WJB_VALUE: + case WJB_ELEM: + JsonbHashScalarValueExtended(&v, &hash, seed); + break; + case WJB_END_ARRAY: + case WJB_END_OBJECT: + break; + default: + elog(ERROR, "invalid JsonbIteratorNext rc: %d", (int) r); + } + } + + PG_FREE_IF_COPY(jb, 0); + PG_RETURN_UINT64(hash); +} diff --git a/src/backend/utils/adt/jsonb_util.c b/src/backend/utils/adt/jsonb_util.c index 6bb335e0..91078189 100644 --- a/src/backend/utils/adt/jsonb_util.c +++ b/src/backend/utils/adt/jsonb_util.c @@ -1249,6 +1249,49 @@ JsonbHashScalarValue(const JsonbValue *scalarVal, uint32 *hash) *hash ^= tmp; } +/* + * Hash a value to a 64-bit value, with a seed. Otherwise, similar to + * JsonbHashScalarValue. + */ +void +JsonbHashScalarValueExtended(const JsonbValue *scalarVal, uint64 *hash, + uint64 seed) +{ + uint64 tmp; + + switch (scalarVal->type) + { + case jbvNull: + tmp = seed + 0x01; + break; + case jbvString: + tmp = DatumGetUInt64(hash_any_extended((const unsigned char *) scalarVal->val.string.val, + scalarVal->val.string.len, + seed)); + break; + case jbvNumeric: + tmp = DatumGetUInt64(DirectFunctionCall2(hash_numeric_extended, + NumericGetDatum(scalarVal->val.numeric), + UInt64GetDatum(seed))); + break; + case jbvBool: + if (seed) + tmp = DatumGetUInt64(DirectFunctionCall2(hashcharextended, + BoolGetDatum(scalarVal->val.boolean), + UInt64GetDatum(seed))); + else + tmp = scalarVal->val.boolean ? 0x02 : 0x04; + + break; + default: + elog(ERROR, "invalid jsonb scalar type"); + break; + } + + *hash = ROTATE_HIGH_AND_LOW_32BITS(*hash); + *hash ^= tmp; +} + /* * Are two scalar JsonbValues of the same type a and b equal? */ diff --git a/src/backend/utils/adt/mac.c b/src/backend/utils/adt/mac.c index 27819a01..7fc50865 100644 --- a/src/backend/utils/adt/mac.c +++ b/src/backend/utils/adt/mac.c @@ -271,6 +271,15 @@ hashmacaddr(PG_FUNCTION_ARGS) return hash_any((unsigned char *) key, sizeof(macaddr)); } +Datum +hashmacaddrextended(PG_FUNCTION_ARGS) +{ + macaddr *key = PG_GETARG_MACADDR_P(0); + + return hash_any_extended((unsigned char *) key, sizeof(macaddr), + PG_GETARG_INT64(1)); +} + /* * Arithmetic functions: bitwise NOT, AND, OR. */ diff --git a/src/backend/utils/adt/mac8.c b/src/backend/utils/adt/mac8.c index 0a239dc3..90be3efa 100644 --- a/src/backend/utils/adt/mac8.c +++ b/src/backend/utils/adt/mac8.c @@ -407,6 +407,15 @@ hashmacaddr8(PG_FUNCTION_ARGS) return hash_any((unsigned char *) key, sizeof(macaddr8)); } +Datum +hashmacaddr8extended(PG_FUNCTION_ARGS) +{ + macaddr8 *key = PG_GETARG_MACADDR8_P(0); + + return hash_any_extended((unsigned char *) key, sizeof(macaddr8), + PG_GETARG_INT64(1)); +} + /* * Arithmetic functions: bitwise NOT, AND, OR. */ diff --git a/src/backend/utils/adt/network.c b/src/backend/utils/adt/network.c index 1514f39e..f0f339bc 100644 --- a/src/backend/utils/adt/network.c +++ b/src/backend/utils/adt/network.c @@ -486,6 +486,16 @@ hashinet(PG_FUNCTION_ARGS) return hash_any((unsigned char *) VARDATA_ANY(addr), addrsize + 2); } +Datum +hashinetextended(PG_FUNCTION_ARGS) +{ + inet *addr = PG_GETARG_INET_PP(0); + int addrsize = ip_addrsize(addr); + + return hash_any_extended((unsigned char *) VARDATA_ANY(addr), addrsize + 2, + PG_GETARG_INT64(1)); +} + /* * Boolean network-inclusion tests. 
*/ diff --git a/src/backend/utils/adt/numeric.c b/src/backend/utils/adt/numeric.c index b6aad0ae..d159c430 100644 --- a/src/backend/utils/adt/numeric.c +++ b/src/backend/utils/adt/numeric.c @@ -2230,6 +2230,66 @@ hash_numeric(PG_FUNCTION_ARGS) PG_RETURN_DATUM(result); } +/* + * Returns 64-bit value by hashing a value to a 64-bit value, with a seed. + * Otherwise, similar to hash_numeric. + */ +Datum +hash_numeric_extended(PG_FUNCTION_ARGS) +{ + Numeric key = PG_GETARG_NUMERIC(0); + uint64 seed = PG_GETARG_INT64(1); + Datum digit_hash; + Datum result; + int weight; + int start_offset; + int end_offset; + int i; + int hash_len; + NumericDigit *digits; + + if (NUMERIC_IS_NAN(key)) + PG_RETURN_UINT64(seed); + + weight = NUMERIC_WEIGHT(key); + start_offset = 0; + end_offset = 0; + + digits = NUMERIC_DIGITS(key); + for (i = 0; i < NUMERIC_NDIGITS(key); i++) + { + if (digits[i] != (NumericDigit) 0) + break; + + start_offset++; + + weight--; + } + + if (NUMERIC_NDIGITS(key) == start_offset) + PG_RETURN_UINT64(seed - 1); + + for (i = NUMERIC_NDIGITS(key) - 1; i >= 0; i--) + { + if (digits[i] != (NumericDigit) 0) + break; + + end_offset++; + } + + Assert(start_offset + end_offset < NUMERIC_NDIGITS(key)); + + hash_len = NUMERIC_NDIGITS(key) - start_offset - end_offset; + digit_hash = hash_any_extended((unsigned char *) (NUMERIC_DIGITS(key) + + start_offset), + hash_len * sizeof(NumericDigit), + seed); + + result = digit_hash ^ weight; + + PG_RETURN_DATUM(result); +} + /* ---------------------------------------------------------------------- * diff --git a/src/backend/utils/adt/pg_lsn.c b/src/backend/utils/adt/pg_lsn.c index abdd3eed..7ad30a26 100644 --- a/src/backend/utils/adt/pg_lsn.c +++ b/src/backend/utils/adt/pg_lsn.c @@ -1,13 +1,13 @@ /*------------------------------------------------------------------------- * * pg_lsn.c - * Operations for the pg_lsn datatype. + * Operations for the pg_lsn datatype. * * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * src/backend/utils/adt/pg_lsn.c + * src/backend/utils/adt/pg_lsn.c * *------------------------------------------------------------------------- */ @@ -19,8 +19,8 @@ #include "utils/builtins.h" #include "utils/pg_lsn.h" -#define MAXPG_LSNLEN 17 -#define MAXPG_LSNCOMPONENT 8 +#define MAXPG_LSNLEN 17 +#define MAXPG_LSNCOMPONENT 8 /*---------------------------------------------------------- * Formatting and conversion routines. @@ -29,180 +29,186 @@ Datum pg_lsn_in(PG_FUNCTION_ARGS) { - char *str = PG_GETARG_CSTRING(0); - int len1, - len2; - uint32 id, - off; - XLogRecPtr result; - - /* Sanity check input format. */ - len1 = strspn(str, "0123456789abcdefABCDEF"); - if (len1 < 1 || len1 > MAXPG_LSNCOMPONENT || str[len1] != '/') - ereport(ERROR, - (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), - errmsg("invalid input syntax for type %s: \"%s\"", - "pg_lsn", str))); - len2 = strspn(str + len1 + 1, "0123456789abcdefABCDEF"); - if (len2 < 1 || len2 > MAXPG_LSNCOMPONENT || str[len1 + 1 + len2] != '\0') - ereport(ERROR, - (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), - errmsg("invalid input syntax for type %s: \"%s\"", - "pg_lsn", str))); - - /* Decode result. */ - id = (uint32) strtoul(str, NULL, 16); - off = (uint32) strtoul(str + len1 + 1, NULL, 16); - result = ((uint64) id << 32) | off; - - PG_RETURN_LSN(result); + char *str = PG_GETARG_CSTRING(0); + int len1, + len2; + uint32 id, + off; + XLogRecPtr result; + + /* Sanity check input format. 
*/ + len1 = strspn(str, "0123456789abcdefABCDEF"); + if (len1 < 1 || len1 > MAXPG_LSNCOMPONENT || str[len1] != '/') + ereport(ERROR, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("invalid input syntax for type %s: \"%s\"", + "pg_lsn", str))); + len2 = strspn(str + len1 + 1, "0123456789abcdefABCDEF"); + if (len2 < 1 || len2 > MAXPG_LSNCOMPONENT || str[len1 + 1 + len2] != '\0') + ereport(ERROR, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("invalid input syntax for type %s: \"%s\"", + "pg_lsn", str))); + + /* Decode result. */ + id = (uint32) strtoul(str, NULL, 16); + off = (uint32) strtoul(str + len1 + 1, NULL, 16); + result = ((uint64) id << 32) | off; + + PG_RETURN_LSN(result); } Datum pg_lsn_out(PG_FUNCTION_ARGS) { - XLogRecPtr lsn = PG_GETARG_LSN(0); - char buf[MAXPG_LSNLEN + 1]; - char *result; - uint32 id, - off; - - /* Decode ID and offset */ - id = (uint32) (lsn >> 32); - off = (uint32) lsn; - - snprintf(buf, sizeof buf, "%X/%X", id, off); - result = pstrdup(buf); - PG_RETURN_CSTRING(result); + XLogRecPtr lsn = PG_GETARG_LSN(0); + char buf[MAXPG_LSNLEN + 1]; + char *result; + uint32 id, + off; + + /* Decode ID and offset */ + id = (uint32) (lsn >> 32); + off = (uint32) lsn; + + snprintf(buf, sizeof buf, "%X/%X", id, off); + result = pstrdup(buf); + PG_RETURN_CSTRING(result); } Datum pg_lsn_recv(PG_FUNCTION_ARGS) { - StringInfo buf = (StringInfo) PG_GETARG_POINTER(0); - XLogRecPtr result; + StringInfo buf = (StringInfo) PG_GETARG_POINTER(0); + XLogRecPtr result; - result = pq_getmsgint64(buf); - PG_RETURN_LSN(result); + result = pq_getmsgint64(buf); + PG_RETURN_LSN(result); } Datum pg_lsn_send(PG_FUNCTION_ARGS) { - XLogRecPtr lsn = PG_GETARG_LSN(0); - StringInfoData buf; + XLogRecPtr lsn = PG_GETARG_LSN(0); + StringInfoData buf; - pq_begintypsend(&buf); - pq_sendint64(&buf, lsn); - PG_RETURN_BYTEA_P(pq_endtypsend(&buf)); + pq_begintypsend(&buf); + pq_sendint64(&buf, lsn); + PG_RETURN_BYTEA_P(pq_endtypsend(&buf)); } /*---------------------------------------------------------- - * Operators for PostgreSQL LSNs + * Operators for PostgreSQL LSNs *---------------------------------------------------------*/ Datum pg_lsn_eq(PG_FUNCTION_ARGS) { - XLogRecPtr lsn1 = PG_GETARG_LSN(0); - XLogRecPtr lsn2 = PG_GETARG_LSN(1); + XLogRecPtr lsn1 = PG_GETARG_LSN(0); + XLogRecPtr lsn2 = PG_GETARG_LSN(1); - PG_RETURN_BOOL(lsn1 == lsn2); + PG_RETURN_BOOL(lsn1 == lsn2); } Datum pg_lsn_ne(PG_FUNCTION_ARGS) { - XLogRecPtr lsn1 = PG_GETARG_LSN(0); - XLogRecPtr lsn2 = PG_GETARG_LSN(1); + XLogRecPtr lsn1 = PG_GETARG_LSN(0); + XLogRecPtr lsn2 = PG_GETARG_LSN(1); - PG_RETURN_BOOL(lsn1 != lsn2); + PG_RETURN_BOOL(lsn1 != lsn2); } Datum pg_lsn_lt(PG_FUNCTION_ARGS) { - XLogRecPtr lsn1 = PG_GETARG_LSN(0); - XLogRecPtr lsn2 = PG_GETARG_LSN(1); + XLogRecPtr lsn1 = PG_GETARG_LSN(0); + XLogRecPtr lsn2 = PG_GETARG_LSN(1); - PG_RETURN_BOOL(lsn1 < lsn2); + PG_RETURN_BOOL(lsn1 < lsn2); } Datum pg_lsn_gt(PG_FUNCTION_ARGS) { - XLogRecPtr lsn1 = PG_GETARG_LSN(0); - XLogRecPtr lsn2 = PG_GETARG_LSN(1); + XLogRecPtr lsn1 = PG_GETARG_LSN(0); + XLogRecPtr lsn2 = PG_GETARG_LSN(1); - PG_RETURN_BOOL(lsn1 > lsn2); + PG_RETURN_BOOL(lsn1 > lsn2); } Datum pg_lsn_le(PG_FUNCTION_ARGS) { - XLogRecPtr lsn1 = PG_GETARG_LSN(0); - XLogRecPtr lsn2 = PG_GETARG_LSN(1); + XLogRecPtr lsn1 = PG_GETARG_LSN(0); + XLogRecPtr lsn2 = PG_GETARG_LSN(1); - PG_RETURN_BOOL(lsn1 <= lsn2); + PG_RETURN_BOOL(lsn1 <= lsn2); } Datum pg_lsn_ge(PG_FUNCTION_ARGS) { - XLogRecPtr lsn1 = PG_GETARG_LSN(0); - XLogRecPtr lsn2 = PG_GETARG_LSN(1); + 
XLogRecPtr lsn1 = PG_GETARG_LSN(0); + XLogRecPtr lsn2 = PG_GETARG_LSN(1); - PG_RETURN_BOOL(lsn1 >= lsn2); + PG_RETURN_BOOL(lsn1 >= lsn2); } /* btree index opclass support */ Datum pg_lsn_cmp(PG_FUNCTION_ARGS) { - XLogRecPtr a = PG_GETARG_LSN(0); - XLogRecPtr b = PG_GETARG_LSN(1); - - if (a > b) - PG_RETURN_INT32(1); - else if (a == b) - PG_RETURN_INT32(0); - else - PG_RETURN_INT32(-1); + XLogRecPtr a = PG_GETARG_LSN(0); + XLogRecPtr b = PG_GETARG_LSN(1); + + if (a > b) + PG_RETURN_INT32(1); + else if (a == b) + PG_RETURN_INT32(0); + else + PG_RETURN_INT32(-1); } /* hash index opclass support */ Datum pg_lsn_hash(PG_FUNCTION_ARGS) { - /* We can use hashint8 directly */ - return hashint8(fcinfo); + /* We can use hashint8 directly */ + return hashint8(fcinfo); +} + +Datum +pg_lsn_hash_extended(PG_FUNCTION_ARGS) +{ + return hashint8extended(fcinfo); } /*---------------------------------------------------------- - * Arithmetic operators on PostgreSQL LSNs. + * Arithmetic operators on PostgreSQL LSNs. *---------------------------------------------------------*/ Datum pg_lsn_mi(PG_FUNCTION_ARGS) { - XLogRecPtr lsn1 = PG_GETARG_LSN(0); - XLogRecPtr lsn2 = PG_GETARG_LSN(1); - char buf[256]; - Datum result; - - /* Output could be as large as plus or minus 2^63 - 1. */ - if (lsn1 < lsn2) - snprintf(buf, sizeof buf, "-" UINT64_FORMAT, lsn2 - lsn1); - else - snprintf(buf, sizeof buf, UINT64_FORMAT, lsn1 - lsn2); - - /* Convert to numeric. */ - result = DirectFunctionCall3(numeric_in, - CStringGetDatum(buf), - ObjectIdGetDatum(0), - Int32GetDatum(-1)); - - return result; + XLogRecPtr lsn1 = PG_GETARG_LSN(0); + XLogRecPtr lsn2 = PG_GETARG_LSN(1); + char buf[256]; + Datum result; + + /* Output could be as large as plus or minus 2^63 - 1. */ + if (lsn1 < lsn2) + snprintf(buf, sizeof buf, "-" UINT64_FORMAT, lsn2 - lsn1); + else + snprintf(buf, sizeof buf, UINT64_FORMAT, lsn1 - lsn2); + + /* Convert to numeric. */ + result = DirectFunctionCall3(numeric_in, + CStringGetDatum(buf), + ObjectIdGetDatum(0), + Int32GetDatum(-1)); + + return result; } diff --git a/src/backend/utils/adt/rangetypes.c b/src/backend/utils/adt/rangetypes.c index c013179b..166ebf9f 100644 --- a/src/backend/utils/adt/rangetypes.c +++ b/src/backend/utils/adt/rangetypes.c @@ -1280,6 +1280,69 @@ hash_range(PG_FUNCTION_ARGS) PG_RETURN_INT32(result); } +/* + * Returns 64-bit value by hashing a value to a 64-bit value, with a seed. + * Otherwise, similar to hash_range. 
+ */ +Datum +hash_range_extended(PG_FUNCTION_ARGS) +{ + RangeType *r = PG_GETARG_RANGE(0); + uint64 seed = PG_GETARG_INT64(1); + uint64 result; + TypeCacheEntry *typcache; + TypeCacheEntry *scache; + RangeBound lower; + RangeBound upper; + bool empty; + char flags; + uint64 lower_hash; + uint64 upper_hash; + + check_stack_depth(); + + typcache = range_get_typcache(fcinfo, RangeTypeGetOid(r)); + + range_deserialize(typcache, r, &lower, &upper, &empty); + flags = range_get_flags(r); + + scache = typcache->rngelemtype; + if (!OidIsValid(scache->hash_extended_proc_finfo.fn_oid)) + { + scache = lookup_type_cache(scache->type_id, + TYPECACHE_HASH_EXTENDED_PROC_FINFO); + if (!OidIsValid(scache->hash_extended_proc_finfo.fn_oid)) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_FUNCTION), + errmsg("could not identify a hash function for type %s", + format_type_be(scache->type_id)))); + } + + if (RANGE_HAS_LBOUND(flags)) + lower_hash = DatumGetUInt64(FunctionCall2Coll(&scache->hash_extended_proc_finfo, + typcache->rng_collation, + lower.val, + seed)); + else + lower_hash = 0; + + if (RANGE_HAS_UBOUND(flags)) + upper_hash = DatumGetUInt64(FunctionCall2Coll(&scache->hash_extended_proc_finfo, + typcache->rng_collation, + upper.val, + seed)); + else + upper_hash = 0; + + /* Merge hashes of flags and bounds */ + result = hash_uint32_extended((uint32) flags, seed); + result ^= lower_hash; + result = ROTATE_HIGH_AND_LOW_32BITS(result); + result ^= upper_hash; + + PG_RETURN_UINT64(result); +} + /* *---------------------------------------------------------- * CANONICAL FUNCTIONS diff --git a/src/backend/utils/adt/timestamp.c b/src/backend/utils/adt/timestamp.c index 8fc9b6b3..70e1125e 100644 --- a/src/backend/utils/adt/timestamp.c +++ b/src/backend/utils/adt/timestamp.c @@ -2141,6 +2141,11 @@ timestamp_hash(PG_FUNCTION_ARGS) return hashint8(fcinfo); } +Datum +timestamp_hash_extended(PG_FUNCTION_ARGS) +{ + return hashint8extended(fcinfo); +} /* * Cross-type comparison functions for timestamp vs timestamptz @@ -2447,6 +2452,20 @@ interval_hash(PG_FUNCTION_ARGS) return DirectFunctionCall1(hashint8, Int64GetDatumFast(span64)); } +Datum +interval_hash_extended(PG_FUNCTION_ARGS) +{ + Interval *interval = PG_GETARG_INTERVAL_P(0); + INT128 span = interval_cmp_value(interval); + int64 span64; + + /* Same approach as interval_hash */ + span64 = int128_to_int64(span); + + return DirectFunctionCall2(hashint8extended, Int64GetDatumFast(span64), + PG_GETARG_DATUM(1)); +} + /* overlaps_timestamp() --- implements the SQL OVERLAPS operator. * * Algorithm is per SQL spec. 
This is much harder than you'd think diff --git a/src/backend/utils/adt/uuid.c b/src/backend/utils/adt/uuid.c index 3cf4fa70..035cd44d 100644 --- a/src/backend/utils/adt/uuid.c +++ b/src/backend/utils/adt/uuid.c @@ -408,3 +408,11 @@ uuid_hash(PG_FUNCTION_ARGS) return hash_any(key->data, UUID_LEN); } + +Datum +uuid_hash_extended(PG_FUNCTION_ARGS) +{ + pg_uuid_t *key = PG_GETARG_UUID_P(0); + + return hash_any_extended(key->data, UUID_LEN, PG_GETARG_INT64(1)); +} diff --git a/src/backend/utils/adt/varchar.c b/src/backend/utils/adt/varchar.c index 329c7d4b..c60b452b 100644 --- a/src/backend/utils/adt/varchar.c +++ b/src/backend/utils/adt/varchar.c @@ -1007,6 +1007,24 @@ hashbpchar(PG_FUNCTION_ARGS) return result; } +Datum +hashbpcharextended(PG_FUNCTION_ARGS) +{ + BpChar *key = PG_GETARG_BPCHAR_PP(0); + char *keydata; + int keylen; + Datum result; + + keydata = VARDATA_ANY(key); + keylen = bcTruelen(key); + + result = hash_any_extended((unsigned char *) keydata, keylen, + PG_GETARG_INT64(1)); + + PG_FREE_IF_COPY(key, 0); + + return result; +} /* * The following operators support character-by-character comparison diff --git a/src/backend/utils/cache/lsyscache.c b/src/backend/utils/cache/lsyscache.c index 5f55c35d..d8a59308 100644 --- a/src/backend/utils/cache/lsyscache.c +++ b/src/backend/utils/cache/lsyscache.c @@ -499,8 +499,8 @@ get_compatible_hash_operators(Oid opno, /* * get_op_hash_functions - * Get the OID(s) of hash support function(s) compatible with the given - * operator, operating on its LHS and/or RHS datatype as required. + * Get the OID(s) of the standard hash support function(s) compatible with + * the given operator, operating on its LHS and/or RHS datatype as required. * * A function for the LHS type is sought and returned into *lhs_procno if * lhs_procno isn't NULL. Similarly, a function for the RHS type is sought @@ -551,7 +551,7 @@ get_op_hash_functions(Oid opno, *lhs_procno = get_opfamily_proc(aform->amopfamily, aform->amoplefttype, aform->amoplefttype, - HASHPROC); + HASHSTANDARD_PROC); if (!OidIsValid(*lhs_procno)) continue; /* Matching LHS found, done if caller doesn't want RHS */ @@ -573,7 +573,7 @@ get_op_hash_functions(Oid opno, *rhs_procno = get_opfamily_proc(aform->amopfamily, aform->amoprighttype, aform->amoprighttype, - HASHPROC); + HASHSTANDARD_PROC); if (!OidIsValid(*rhs_procno)) { /* Forget any LHS function from this opfamily */ diff --git a/src/backend/utils/cache/typcache.c b/src/backend/utils/cache/typcache.c index 60787238..c9579be4 100644 --- a/src/backend/utils/cache/typcache.c +++ b/src/backend/utils/cache/typcache.c @@ -90,6 +90,7 @@ static TypeCacheEntry *firstDomainTypeEntry = NULL; #define TCFLAGS_HAVE_FIELD_EQUALITY 0x1000 #define TCFLAGS_HAVE_FIELD_COMPARE 0x2000 #define TCFLAGS_CHECKED_DOMAIN_CONSTRAINTS 0x4000 +#define TCFLAGS_CHECKED_HASH_EXTENDED_PROC 0x8000 /* * Data stored about a domain type's constraints. Note that we do not create @@ -314,6 +315,8 @@ lookup_type_cache(Oid type_id, int flags) flags |= TYPECACHE_HASH_OPFAMILY; if ((flags & (TYPECACHE_HASH_PROC | TYPECACHE_HASH_PROC_FINFO | + TYPECACHE_HASH_EXTENDED_PROC | + TYPECACHE_HASH_EXTENDED_PROC_FINFO | TYPECACHE_HASH_OPFAMILY)) && !(typentry->flags & TCFLAGS_CHECKED_HASH_OPCLASS)) { @@ -336,6 +339,7 @@ lookup_type_cache(Oid type_id, int flags) * decision is still good. 
*/ typentry->flags &= ~(TCFLAGS_CHECKED_HASH_PROC); + typentry->flags &= ~(TCFLAGS_CHECKED_HASH_EXTENDED_PROC); typentry->flags |= TCFLAGS_CHECKED_HASH_OPCLASS; } @@ -379,11 +383,12 @@ lookup_type_cache(Oid type_id, int flags) typentry->eq_opr = eq_opr; /* - * Reset info about hash function whenever we pick up new info about - * equality operator. This is so we can ensure that the hash function - * matches the operator. + * Reset info about hash functions whenever we pick up new info about + * equality operator. This is so we can ensure that the hash functions + * match the operator. */ typentry->flags &= ~(TCFLAGS_CHECKED_HASH_PROC); + typentry->flags &= ~(TCFLAGS_CHECKED_HASH_EXTENDED_PROC); typentry->flags |= TCFLAGS_CHECKED_EQ_OPR; } if ((flags & TYPECACHE_LT_OPR) && @@ -474,7 +479,7 @@ lookup_type_cache(Oid type_id, int flags) hash_proc = get_opfamily_proc(typentry->hash_opf, typentry->hash_opintype, typentry->hash_opintype, - HASHPROC); + HASHSTANDARD_PROC); /* * As above, make sure hash_array will succeed. We don't currently @@ -492,6 +497,43 @@ lookup_type_cache(Oid type_id, int flags) typentry->hash_proc = hash_proc; typentry->flags |= TCFLAGS_CHECKED_HASH_PROC; } + if ((flags & (TYPECACHE_HASH_EXTENDED_PROC | + TYPECACHE_HASH_EXTENDED_PROC_FINFO)) && + !(typentry->flags & TCFLAGS_CHECKED_HASH_EXTENDED_PROC)) + { + Oid hash_extended_proc = InvalidOid; + + /* + * We insist that the eq_opr, if one has been determined, match the + * hash opclass; else report there is no hash function. + */ + if (typentry->hash_opf != InvalidOid && + (!OidIsValid(typentry->eq_opr) || + typentry->eq_opr == get_opfamily_member(typentry->hash_opf, + typentry->hash_opintype, + typentry->hash_opintype, + HTEqualStrategyNumber))) + hash_extended_proc = get_opfamily_proc(typentry->hash_opf, + typentry->hash_opintype, + typentry->hash_opintype, + HASHEXTENDED_PROC); + + /* + * As above, make sure hash_array_extended will succeed. We don't + * currently support hashing for composite types, but when we do, + * we'll need more logic here to check that case too. + */ + if (hash_extended_proc == F_HASH_ARRAY_EXTENDED && + !array_element_has_hashing(typentry)) + hash_extended_proc = InvalidOid; + + /* Force update of hash_proc_finfo only if we're changing state */ + if (typentry->hash_extended_proc != hash_extended_proc) + typentry->hash_extended_proc_finfo.fn_oid = InvalidOid; + + typentry->hash_extended_proc = hash_extended_proc; + typentry->flags |= TCFLAGS_CHECKED_HASH_EXTENDED_PROC; + } /* * Set up fmgr lookup info as requested @@ -530,6 +572,14 @@ lookup_type_cache(Oid type_id, int flags) fmgr_info_cxt(typentry->hash_proc, &typentry->hash_proc_finfo, CacheMemoryContext); } + if ((flags & TYPECACHE_HASH_EXTENDED_PROC_FINFO) && + typentry->hash_extended_proc_finfo.fn_oid == InvalidOid && + typentry->hash_extended_proc != InvalidOid) + { + fmgr_info_cxt(typentry->hash_extended_proc, + &typentry->hash_extended_proc_finfo, + CacheMemoryContext); + } /* * If it's a composite type (row type), get tupdesc if requested diff --git a/src/include/access/hash.h b/src/include/access/hash.h index a461a8fe..96cfcc34 100644 --- a/src/include/access/hash.h +++ b/src/include/access/hash.h @@ -38,6 +38,17 @@ typedef uint32 Bucket; #define BUCKET_TO_BLKNO(metap,B) \ ((BlockNumber) ((B) + ((B) ? (metap)->hashm_spares[_hash_spareindex((B)+1)-1] : 0)) + 1) +/* + * Rotate the high 32 bits and the low 32 bits separately. The standard + * hash function sometimes rotates the low 32 bits by one bit when + * combining elements. 
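(Here each 32-bit half is rotated left by one within itself, so bit 31 wraps to bit 0 and bit 63 wraps to bit 32.)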
We want extended hash functions to be compatible with + * that algorithm when the seed is 0, so we can't just do a normal rotation. + * This works, though. + */ +#define ROTATE_HIGH_AND_LOW_32BITS(v) \ + ((((v) << 1) & UINT64CONST(0xfffffffefffffffe)) | \ + (((v) >> 31) & UINT64CONST(0x100000001))) + /* * Special space for hash index pages. * @@ -290,11 +301,19 @@ typedef HashMetaPageData *HashMetaPage; /* * When a new operator class is declared, we require that the user supply - * us with an amproc procudure for hashing a key of the new type. - * Since we only have one such proc in amproc, it's number 1. + * us with an amproc procudure for hashing a key of the new type, returning + * a 32-bit hash value. We call this the "standard" hash procedure. We + * also allow an optional "extended" hash procedure which accepts a salt and + * returns a 64-bit hash value. This is highly recommended but, for reasons + * of backward compatibility, optional. + * + * When the salt is 0, the low 32 bits of the value returned by the extended + * hash procedure should match the value that would have been returned by the + * standard hash procedure. */ -#define HASHPROC 1 -#define HASHNProcs 1 +#define HASHSTANDARD_PROC 1 +#define HASHEXTENDED_PROC 2 +#define HASHNProcs 2 /* public routines */ @@ -322,7 +341,10 @@ extern bytea *hashoptions(Datum reloptions, bool validate); extern bool hashvalidate(Oid opclassoid); extern Datum hash_any(register const unsigned char *k, register int keylen); +extern Datum hash_any_extended(register const unsigned char *k, + register int keylen, uint64 seed); extern Datum hash_uint32(uint32 k); +extern Datum hash_uint32_extended(uint32 k, uint64 seed); /* private routines */ diff --git a/src/include/catalog/pg_amproc.h b/src/include/catalog/pg_amproc.h index b6d88568..b239bbec 100644 --- a/src/include/catalog/pg_amproc.h +++ b/src/include/catalog/pg_amproc.h @@ -153,42 +153,77 @@ DATA(insert ( 4033 3802 3802 1 4044 )); /* hash */ DATA(insert ( 427 1042 1042 1 1080 )); +DATA(insert ( 427 1042 1042 2 972 )); DATA(insert ( 431 18 18 1 454 )); +DATA(insert ( 431 18 18 2 446 )); DATA(insert ( 435 1082 1082 1 450 )); +DATA(insert ( 435 1082 1082 2 425 )); DATA(insert ( 627 2277 2277 1 626 )); +DATA(insert ( 627 2277 2277 2 782 )); DATA(insert ( 1971 700 700 1 451 )); +DATA(insert ( 1971 700 700 2 443 )); DATA(insert ( 1971 701 701 1 452 )); +DATA(insert ( 1971 701 701 2 444 )); DATA(insert ( 1975 869 869 1 422 )); +DATA(insert ( 1975 869 869 2 779 )); DATA(insert ( 1977 21 21 1 449 )); +DATA(insert ( 1977 21 21 2 441 )); DATA(insert ( 1977 23 23 1 450 )); +DATA(insert ( 1977 23 23 2 425 )); DATA(insert ( 1977 20 20 1 949 )); +DATA(insert ( 1977 20 20 2 442 )); DATA(insert ( 1983 1186 1186 1 1697 )); +DATA(insert ( 1983 1186 1186 2 3418 )); DATA(insert ( 1985 829 829 1 399 )); +DATA(insert ( 1985 829 829 2 778 )); DATA(insert ( 1987 19 19 1 455 )); +DATA(insert ( 1987 19 19 2 447 )); DATA(insert ( 1990 26 26 1 453 )); +DATA(insert ( 1990 26 26 2 445 )); DATA(insert ( 1992 30 30 1 457 )); +DATA(insert ( 1992 30 30 2 776 )); DATA(insert ( 1995 25 25 1 400 )); +DATA(insert ( 1995 25 25 2 448)); DATA(insert ( 1997 1083 1083 1 1688 )); +DATA(insert ( 1997 1083 1083 2 3409 )); DATA(insert ( 1998 1700 1700 1 432 )); +DATA(insert ( 1998 1700 1700 2 780 )); DATA(insert ( 1999 1184 1184 1 2039 )); +DATA(insert ( 1999 1184 1184 2 3411 )); DATA(insert ( 2001 1266 1266 1 1696 )); +DATA(insert ( 2001 1266 1266 2 3410 )); DATA(insert ( 2040 1114 1114 1 2039 )); +DATA(insert ( 2040 1114 1114 2 3411 
)); DATA(insert ( 2222 16 16 1 454 )); +DATA(insert ( 2222 16 16 2 446 )); DATA(insert ( 2223 17 17 1 456 )); +DATA(insert ( 2223 17 17 2 772 )); DATA(insert ( 2225 28 28 1 450 )); +DATA(insert ( 2225 28 28 2 425)); DATA(insert ( 2226 29 29 1 450 )); +DATA(insert ( 2226 29 29 2 425 )); DATA(insert ( 2227 702 702 1 450 )); +DATA(insert ( 2227 702 702 2 425 )); DATA(insert ( 2228 703 703 1 450 )); +DATA(insert ( 2228 703 703 2 425 )); DATA(insert ( 2229 25 25 1 400 )); +DATA(insert ( 2229 25 25 2 448 )); DATA(insert ( 2231 1042 1042 1 1080 )); +DATA(insert ( 2231 1042 1042 2 972 )); DATA(insert ( 2235 1033 1033 1 329 )); +DATA(insert ( 2235 1033 1033 2 777 )); DATA(insert ( 2969 2950 2950 1 2963 )); +DATA(insert ( 2969 2950 2950 2 3412 )); DATA(insert ( 3254 3220 3220 1 3252 )); +DATA(insert ( 3254 3220 3220 2 3413 )); DATA(insert ( 3372 774 774 1 328 )); +DATA(insert ( 3372 774 774 2 781 )); DATA(insert ( 3523 3500 3500 1 3515 )); +DATA(insert ( 3523 3500 3500 2 3414 )); DATA(insert ( 3903 3831 3831 1 3902 )); +DATA(insert ( 3903 3831 3831 2 3417 )); DATA(insert ( 4034 3802 3802 1 4045 )); - +DATA(insert ( 4034 3802 3802 2 3416)); /* gist */ DATA(insert ( 1029 600 600 1 2179 )); diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h index 1f40ef5e..e5bcf8ae 100644 --- a/src/include/catalog/pg_proc.h +++ b/src/include/catalog/pg_proc.h @@ -770,6 +770,38 @@ DATA(insert OID = 432 ( hash_numeric PGNSP PGUID 12 1 0 0 0 f f f f t f i DESCR("hash"); DATA(insert OID = 328 ( hashmacaddr8 PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 23 "774" _null_ _null_ _null_ _null_ _null_ hashmacaddr8 _null_ _null_ _null_ )); DESCR("hash"); +DATA(insert OID = 4660 ( hashint2extended PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 20 "21 20" _null_ _null_ _null_ _null_ _null_ hashint2extended _null_ _null_ _null_ )); +DESCR("hash"); +DATA(insert OID = 4661 ( hashint4extended PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 20 "23 20" _null_ _null_ _null_ _null_ _null_ hashint4extended _null_ _null_ _null_ )); +DESCR("hash"); +DATA(insert OID = 4662 ( hashint8extended PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 20 "20 20" _null_ _null_ _null_ _null_ _null_ hashint8extended _null_ _null_ _null_ )); +DESCR("hash"); +DATA(insert OID = 4663 ( hashfloat4extended PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 20 "700 20" _null_ _null_ _null_ _null_ _null_ hashfloat4extended _null_ _null_ _null_ )); +DESCR("hash"); +DATA(insert OID = 4664 ( hashfloat8extended PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 20 "701 20" _null_ _null_ _null_ _null_ _null_ hashfloat8extended _null_ _null_ _null_ )); +DESCR("hash"); +DATA(insert OID = 4665 ( hashoidextended PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 20 "26 20" _null_ _null_ _null_ _null_ _null_ hashoidextended _null_ _null_ _null_ )); +DESCR("hash"); +DATA(insert OID = 4666 ( hashcharextended PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 20 "18 20" _null_ _null_ _null_ _null_ _null_ hashcharextended _null_ _null_ _null_ )); +DESCR("hash"); +DATA(insert OID = 4667 ( hashnameextended PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 20 "19 20" _null_ _null_ _null_ _null_ _null_ hashnameextended _null_ _null_ _null_ )); +DESCR("hash"); +DATA(insert OID = 4668 ( hashtextextended PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 20 "25 20" _null_ _null_ _null_ _null_ _null_ hashtextextended _null_ _null_ _null_ )); +DESCR("hash"); +DATA(insert OID = 4669 ( hashvarlenaextended PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 20 "2281 20" _null_ _null_ _null_ _null_ _null_ hashvarlenaextended _null_ _null_ 
_null_ )); +DESCR("hash"); +DATA(insert OID = 4670 ( hashoidvectorextended PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 20 "30 20" _null_ _null_ _null_ _null_ _null_ hashoidvectorextended _null_ _null_ _null_ )); +DESCR("hash"); +DATA(insert OID = 4671 ( hash_aclitem_extended PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 20 "1033 20" _null_ _null_ _null_ _null_ _null_ hash_aclitem_extended _null_ _null_ _null_ )); +DESCR("hash"); +DATA(insert OID = 4672 ( hashmacaddrextended PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 20 "829 20" _null_ _null_ _null_ _null_ _null_ hashmacaddrextended _null_ _null_ _null_ )); +DESCR("hash"); +DATA(insert OID = 4673 ( hashinetextended PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 20 "869 20" _null_ _null_ _null_ _null_ _null_ hashinetextended _null_ _null_ _null_ )); +DESCR("hash"); +DATA(insert OID = 4674 ( hash_numeric_extended PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 20 "1700 20" _null_ _null_ _null_ _null_ _null_ hash_numeric_extended _null_ _null_ _null_ )); +DESCR("hash"); +DATA(insert OID = 4675 ( hashmacaddr8extended PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 20 "774 20" _null_ _null_ _null_ _null_ _null_ hashmacaddr8extended _null_ _null_ _null_ )); +DESCR("hash"); DATA(insert OID = 438 ( num_nulls PGNSP PGUID 12 1 0 2276 0 f f f f f f i s 1 0 23 "2276" "{2276}" "{v}" _null_ _null_ _null_ pg_num_nulls _null_ _null_ _null_ )); DESCR("count the number of NULL arguments"); @@ -819,6 +851,8 @@ DESCR("convert float8 to int8"); DATA(insert OID = 626 ( hash_array PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 23 "2277" _null_ _null_ _null_ _null_ _null_ hash_array _null_ _null_ _null_ )); DESCR("hash"); +DATA(insert OID = 4686 ( hash_array_extended PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 20 "2277 20" _null_ _null_ _null_ _null_ _null_ hash_array_extended _null_ _null_ _null_ )); +DESCR("hash"); DATA(insert OID = 652 ( float4 PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 700 "20" _null_ _null_ _null_ _null_ _null_ i8tof _null_ _null_ _null_ )); DESCR("convert int8 to float4"); @@ -1269,6 +1303,8 @@ DATA(insert OID = 1080 ( hashbpchar PGNSP PGUID 12 1 0 0 0 f f f f t f i DESCR("hash"); DATA(insert OID = 1081 ( format_type PGNSP PGUID 12 1 0 0 0 f f f f f f s s 2 0 25 "26 23" _null_ _null_ _null_ _null_ _null_ format_type _null_ _null_ _null_ )); DESCR("format a type oid and atttypmod to canonical SQL"); +DATA(insert OID = 4676 ( hashbpcharextended PGNSP PGUID 12 1 0 0 0 f f f f f t f i s 2 0 20 "1042 20" _null_ _null_ _null_ _null_ _null_ hashbpcharextended _null_ _null_ _null_ )); +DESCR("hash"); DATA(insert OID = 1084 ( date_in PGNSP PGUID 12 1 0 0 0 f f f f t f s s 1 0 1082 "2275" _null_ _null_ _null_ _null_ _null_ date_in _null_ _null_ _null_ )); DESCR("I/O"); DATA(insert OID = 1085 ( date_out PGNSP PGUID 12 1 0 0 0 f f f f t f s s 1 0 2275 "1082" _null_ _null_ _null_ _null_ _null_ date_out _null_ _null_ _null_ )); @@ -2412,6 +2448,12 @@ DATA(insert OID = 1696 ( timetz_hash PGNSP PGUID 12 1 0 0 0 f f f f t f DESCR("hash"); DATA(insert OID = 1697 ( interval_hash PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 23 "1186" _null_ _null_ _null_ _null_ _null_ interval_hash _null_ _null_ _null_ )); DESCR("hash"); +DATA(insert OID = 4677 ( time_hash_extended PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 20 "1083 20" _null_ _null_ _null_ _null_ _null_ time_hash_extended _null_ _null_ _null_ )); +DESCR("hash"); +DATA(insert OID = 4678 ( timetz_hash_extended PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 20 "1266 20" _null_ _null_ _null_ _null_ _null_ timetz_hash_extended _null_ _null_ 
_null_ )); +DESCR("hash"); +DATA(insert OID = 4679 ( interval_hash_extended PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 20 "1186 20" _null_ _null_ _null_ _null_ _null_ interval_hash_extended _null_ _null_ _null_ )); +DESCR("hash"); /* OID's 1700 - 1799 NUMERIC data type */ @@ -3219,6 +3261,8 @@ DATA(insert OID = 2039 ( timestamp_hash PGNSP PGUID 12 1 0 0 0 f f f f t f i DESCR("hash"); DATA(insert OID = 2041 ( overlaps PGNSP PGUID 12 1 0 0 0 f f f f f f i s 4 0 16 "1114 1114 1114 1114" _null_ _null_ _null_ _null_ _null_ overlaps_timestamp _null_ _null_ _null_ )); DESCR("intervals overlap?"); +DATA(insert OID = 4680 ( timestamp_hash_extended PGNSP PGUID 12 1 0 0 0 f f f f f t f i s 2 0 20 "1114 20" _null_ _null_ _null_ _null_ _null_ timestamp_hash_extended _null_ _null_ _null_ )); +DESCR("hash"); DATA(insert OID = 2042 ( overlaps PGNSP PGUID 14 1 0 0 0 f f f f f f i s 4 0 16 "1114 1186 1114 1186" _null_ _null_ _null_ _null_ _null_ "select ($1, ($1 + $2)) overlaps ($3, ($3 + $4))" _null_ _null_ _null_ )); DESCR("intervals overlap?"); DATA(insert OID = 2043 ( overlaps PGNSP PGUID 14 1 0 0 0 f f f f f f i s 4 0 16 "1114 1114 1114 1186" _null_ _null_ _null_ _null_ _null_ "select ($1, $2) overlaps ($3, ($3 + $4))" _null_ _null_ _null_ )); @@ -4691,6 +4735,8 @@ DATA(insert OID = 2962 ( uuid_send PGNSP PGUID 12 1 0 0 0 f f f f t f DESCR("I/O"); DATA(insert OID = 2963 ( uuid_hash PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 23 "2950" _null_ _null_ _null_ _null_ _null_ uuid_hash _null_ _null_ _null_ )); DESCR("hash"); +DATA(insert OID = 4681 ( uuid_hash_extended PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 20 "2950 20" _null_ _null_ _null_ _null_ _null_ uuid_hash_extended _null_ _null_ _null_ )); +DESCR("hash"); /* pg_lsn */ DATA(insert OID = 3229 ( pg_lsn_in PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 3220 "2275" _null_ _null_ _null_ _null_ _null_ pg_lsn_in _null_ _null_ _null_ )); @@ -4712,6 +4758,8 @@ DATA(insert OID = 3251 ( pg_lsn_cmp PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 DESCR("less-equal-greater"); DATA(insert OID = 3252 ( pg_lsn_hash PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 23 "3220" _null_ _null_ _null_ _null_ _null_ pg_lsn_hash _null_ _null_ _null_ )); DESCR("hash"); +DATA(insert OID = 4682 ( pg_lsn_hash_extended PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 20 "3220 20" _null_ _null_ _null_ _null_ _null_ pg_lsn_hash_extended _null_ _null_ _null_ )); +DESCR("hash"); /* enum related procs */ DATA(insert OID = 3504 ( anyenum_in PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 3500 "2275" _null_ _null_ _null_ _null_ _null_ anyenum_in _null_ _null_ _null_ )); @@ -4734,6 +4782,8 @@ DATA(insert OID = 3515 ( hashenum PGNSP PGUID 12 1 0 0 0 f f f f t f i s DESCR("hash"); DATA(insert OID = 3524 ( enum_smaller PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 3500 "3500 3500" _null_ _null_ _null_ _null_ _null_ enum_smaller _null_ _null_ _null_ )); DESCR("smaller of two"); +DATA(insert OID = 4683 ( hashenumextended PGNSP PGUID 12 1 0 0 0 f f f f f t f i s 2 0 20 "3500 20" _null_ _null_ _null_ _null_ _null_ hashenumextended _null_ _null_ _null_ )); +DESCR("hash"); DATA(insert OID = 3525 ( enum_larger PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 3500 "3500 3500" _null_ _null_ _null_ _null_ _null_ enum_larger _null_ _null_ _null_ )); DESCR("larger of two"); DATA(insert OID = 3526 ( max PGNSP PGUID 12 1 0 0 0 t f f f f f i s 1 0 3500 "3500" _null_ _null_ _null_ _null_ _null_ aggregate_dummy _null_ _null_ _null_ )); @@ -5129,6 +5179,8 @@ DATA(insert OID = 4044 ( jsonb_cmp PGNSP PGUID 12 1 0 0 0 f f f f t f 
DESCR("less-equal-greater"); DATA(insert OID = 4045 ( jsonb_hash PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 23 "3802" _null_ _null_ _null_ _null_ _null_ jsonb_hash _null_ _null_ _null_ )); DESCR("hash"); +DATA(insert OID = 4684 ( jsonb_hash_extended PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 20 "3802 20" _null_ _null_ _null_ _null_ _null_ jsonb_hash_extended _null_ _null_ _null_ )); +DESCR("hash"); DATA(insert OID = 4046 ( jsonb_contains PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 16 "3802 3802" _null_ _null_ _null_ _null_ _null_ jsonb_contains _null_ _null_ _null_ )); DATA(insert OID = 4047 ( jsonb_exists PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 16 "3802 25" _null_ _null_ _null_ _null_ _null_ jsonb_exists _null_ _null_ _null_ )); DATA(insert OID = 4048 ( jsonb_exists_any PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 16 "3802 1009" _null_ _null_ _null_ _null_ _null_ jsonb_exists_any _null_ _null_ _null_ )); @@ -5319,6 +5371,8 @@ DATA(insert OID = 3881 ( range_gist_same PGNSP PGUID 12 1 0 0 0 f f f f DESCR("GiST support"); DATA(insert OID = 3902 ( hash_range PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 23 "3831" _null_ _null_ _null_ _null_ _null_ hash_range _null_ _null_ _null_ )); DESCR("hash a range"); +DATA(insert OID = 4685 ( hash_range_extended PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 20 "3831 20" _null_ _null_ _null_ _null_ _null_ hash_range_extended _null_ _null_ _null_ )); +DESCR("hash a range"); DATA(insert OID = 3916 ( range_typanalyze PGNSP PGUID 12 1 0 0 0 f f f f t f s s 1 0 16 "2281" _null_ _null_ _null_ _null_ _null_ range_typanalyze _null_ _null_ _null_ )); DESCR("range typanalyze"); DATA(insert OID = 3169 ( rangesel PGNSP PGUID 12 1 0 0 0 f f f f t f s s 4 0 701 "2281 26 2281 23" _null_ _null_ _null_ _null_ _null_ rangesel _null_ _null_ _null_ )); diff --git a/src/include/fmgr.h b/src/include/fmgr.h index 5c54e2be..75f9b34b 100644 --- a/src/include/fmgr.h +++ b/src/include/fmgr.h @@ -325,6 +325,7 @@ extern struct varlena *pg_detoast_datum_packed(struct varlena *datum); #define PG_RETURN_FLOAT4(x) return Float4GetDatum(x) #define PG_RETURN_FLOAT8(x) return Float8GetDatum(x) #define PG_RETURN_INT64(x) return Int64GetDatum(x) +#define PG_RETURN_UINT64(x) return UInt64GetDatum(x) /* RETURN macros for other pass-by-ref types will typically look like this: */ #define PG_RETURN_BYTEA_P(x) PG_RETURN_POINTER(x) #define PG_RETURN_TEXT_P(x) PG_RETURN_POINTER(x) diff --git a/src/include/utils/jsonb.h b/src/include/utils/jsonb.h index 9b07ee9b..24f49166 100644 --- a/src/include/utils/jsonb.h +++ b/src/include/utils/jsonb.h @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * * jsonb.h - * Declarations for jsonb data type support. + * Declarations for jsonb data type support. 
* * Copyright (c) 1996-2017, PostgreSQL Global Development Group * @@ -19,21 +19,21 @@ /* Tokens used when sequentially processing a jsonb value */ typedef enum { - WJB_DONE, - WJB_KEY, - WJB_VALUE, - WJB_ELEM, - WJB_BEGIN_ARRAY, - WJB_END_ARRAY, - WJB_BEGIN_OBJECT, - WJB_END_OBJECT + WJB_DONE, + WJB_KEY, + WJB_VALUE, + WJB_ELEM, + WJB_BEGIN_ARRAY, + WJB_END_ARRAY, + WJB_BEGIN_OBJECT, + WJB_END_OBJECT } JsonbIteratorToken; /* Strategy numbers for GIN index opclasses */ -#define JsonbContainsStrategyNumber 7 -#define JsonbExistsStrategyNumber 9 -#define JsonbExistsAnyStrategyNumber 10 -#define JsonbExistsAllStrategyNumber 11 +#define JsonbContainsStrategyNumber 7 +#define JsonbExistsStrategyNumber 9 +#define JsonbExistsAnyStrategyNumber 10 +#define JsonbExistsAllStrategyNumber 11 /* * In the standard jsonb_ops GIN opclass for jsonb, we choose to index both @@ -56,19 +56,19 @@ typedef enum * matches against the heap tuple; currently, this costs nothing because we * must always recheck for other reasons. */ -#define JGINFLAG_KEY 0x01 /* key (or string array element) */ -#define JGINFLAG_NULL 0x02 /* null value */ -#define JGINFLAG_BOOL 0x03 /* boolean value */ -#define JGINFLAG_NUM 0x04 /* numeric value */ -#define JGINFLAG_STR 0x05 /* string value (if not an array element) */ -#define JGINFLAG_HASHED 0x10 /* OR'd into flag if value was hashed */ -#define JGIN_MAXLENGTH 125 /* max length of text part before hashing */ +#define JGINFLAG_KEY 0x01 /* key (or string array element) */ +#define JGINFLAG_NULL 0x02 /* null value */ +#define JGINFLAG_BOOL 0x03 /* boolean value */ +#define JGINFLAG_NUM 0x04 /* numeric value */ +#define JGINFLAG_STR 0x05 /* string value (if not an array element) */ +#define JGINFLAG_HASHED 0x10 /* OR'd into flag if value was hashed */ +#define JGIN_MAXLENGTH 125 /* max length of text part before hashing */ /* Convenience macros */ -#define DatumGetJsonb(d) ((Jsonb *) PG_DETOAST_DATUM(d)) -#define JsonbGetDatum(p) PointerGetDatum(p) -#define PG_GETARG_JSONB(x) DatumGetJsonb(PG_GETARG_DATUM(x)) -#define PG_RETURN_JSONB(x) PG_RETURN_POINTER(x) +#define DatumGetJsonb(d) ((Jsonb *) PG_DETOAST_DATUM(d)) +#define JsonbGetDatum(p) PointerGetDatum(p) +#define PG_GETARG_JSONB(x) DatumGetJsonb(PG_GETARG_DATUM(x)) +#define PG_RETURN_JSONB(x) PG_RETURN_POINTER(x) typedef struct JsonbPair JsonbPair; typedef struct JsonbValue JsonbValue; @@ -138,38 +138,38 @@ typedef struct JsonbValue JsonbValue; */ typedef uint32 JEntry; -#define JENTRY_OFFLENMASK 0x0FFFFFFF -#define JENTRY_TYPEMASK 0x70000000 -#define JENTRY_HAS_OFF 0x80000000 +#define JENTRY_OFFLENMASK 0x0FFFFFFF +#define JENTRY_TYPEMASK 0x70000000 +#define JENTRY_HAS_OFF 0x80000000 /* values stored in the type bits */ -#define JENTRY_ISSTRING 0x00000000 -#define JENTRY_ISNUMERIC 0x10000000 -#define JENTRY_ISBOOL_FALSE 0x20000000 -#define JENTRY_ISBOOL_TRUE 0x30000000 -#define JENTRY_ISNULL 0x40000000 -#define JENTRY_ISCONTAINER 0x50000000 /* array or object */ +#define JENTRY_ISSTRING 0x00000000 +#define JENTRY_ISNUMERIC 0x10000000 +#define JENTRY_ISBOOL_FALSE 0x20000000 +#define JENTRY_ISBOOL_TRUE 0x30000000 +#define JENTRY_ISNULL 0x40000000 +#define JENTRY_ISCONTAINER 0x50000000 /* array or object */ /* Access macros. 
Note possible multiple evaluations */ -#define JBE_OFFLENFLD(je_) ((je_) & JENTRY_OFFLENMASK) -#define JBE_HAS_OFF(je_) (((je_) & JENTRY_HAS_OFF) != 0) -#define JBE_ISSTRING(je_) (((je_) & JENTRY_TYPEMASK) == JENTRY_ISSTRING) -#define JBE_ISNUMERIC(je_) (((je_) & JENTRY_TYPEMASK) == JENTRY_ISNUMERIC) -#define JBE_ISCONTAINER(je_) (((je_) & JENTRY_TYPEMASK) == JENTRY_ISCONTAINER) -#define JBE_ISNULL(je_) (((je_) & JENTRY_TYPEMASK) == JENTRY_ISNULL) -#define JBE_ISBOOL_TRUE(je_) (((je_) & JENTRY_TYPEMASK) == JENTRY_ISBOOL_TRUE) -#define JBE_ISBOOL_FALSE(je_) (((je_) & JENTRY_TYPEMASK) == JENTRY_ISBOOL_FALSE) -#define JBE_ISBOOL(je_) (JBE_ISBOOL_TRUE(je_) || JBE_ISBOOL_FALSE(je_)) +#define JBE_OFFLENFLD(je_) ((je_) & JENTRY_OFFLENMASK) +#define JBE_HAS_OFF(je_) (((je_) & JENTRY_HAS_OFF) != 0) +#define JBE_ISSTRING(je_) (((je_) & JENTRY_TYPEMASK) == JENTRY_ISSTRING) +#define JBE_ISNUMERIC(je_) (((je_) & JENTRY_TYPEMASK) == JENTRY_ISNUMERIC) +#define JBE_ISCONTAINER(je_) (((je_) & JENTRY_TYPEMASK) == JENTRY_ISCONTAINER) +#define JBE_ISNULL(je_) (((je_) & JENTRY_TYPEMASK) == JENTRY_ISNULL) +#define JBE_ISBOOL_TRUE(je_) (((je_) & JENTRY_TYPEMASK) == JENTRY_ISBOOL_TRUE) +#define JBE_ISBOOL_FALSE(je_) (((je_) & JENTRY_TYPEMASK) == JENTRY_ISBOOL_FALSE) +#define JBE_ISBOOL(je_) (JBE_ISBOOL_TRUE(je_) || JBE_ISBOOL_FALSE(je_)) /* Macro for advancing an offset variable to the next JEntry */ #define JBE_ADVANCE_OFFSET(offset, je) \ - do { \ - JEntry je_ = (je); \ - if (JBE_HAS_OFF(je_)) \ - (offset) = JBE_OFFLENFLD(je_); \ - else \ - (offset) += JBE_OFFLENFLD(je_); \ - } while(0) + do { \ + JEntry je_ = (je); \ + if (JBE_HAS_OFF(je_)) \ + (offset) = JBE_OFFLENFLD(je_); \ + else \ + (offset) += JBE_OFFLENFLD(je_); \ + } while(0) /* * We store an offset, not a length, every JB_OFFSET_STRIDE children. @@ -178,7 +178,7 @@ typedef uint32 JEntry; * bits instead. This allows changes in the offset-placement heuristic * without breaking on-disk compatibility. */ -#define JB_OFFSET_STRIDE 32 +#define JB_OFFSET_STRIDE 32 /* * A jsonb array or object node, within a Jsonb Datum. @@ -192,96 +192,96 @@ typedef uint32 JEntry; */ typedef struct JsonbContainer { - uint32 header; /* number of elements or key/value pairs, and - * flags */ - JEntry children[FLEXIBLE_ARRAY_MEMBER]; + uint32 header; /* number of elements or key/value pairs, and + * flags */ + JEntry children[FLEXIBLE_ARRAY_MEMBER]; - /* the data for each child node follows. */ + /* the data for each child node follows. */ } JsonbContainer; /* flags for the header-field in JsonbContainer */ -#define JB_CMASK 0x0FFFFFFF /* mask for count field */ -#define JB_FSCALAR 0x10000000 /* flag bits */ -#define JB_FOBJECT 0x20000000 -#define JB_FARRAY 0x40000000 +#define JB_CMASK 0x0FFFFFFF /* mask for count field */ +#define JB_FSCALAR 0x10000000 /* flag bits */ +#define JB_FOBJECT 0x20000000 +#define JB_FARRAY 0x40000000 /* convenience macros for accessing a JsonbContainer struct */ -#define JsonContainerSize(jc) ((jc)->header & JB_CMASK) -#define JsonContainerIsScalar(jc) (((jc)->header & JB_FSCALAR) != 0) -#define JsonContainerIsObject(jc) (((jc)->header & JB_FOBJECT) != 0) -#define JsonContainerIsArray(jc) (((jc)->header & JB_FARRAY) != 0) +#define JsonContainerSize(jc) ((jc)->header & JB_CMASK) +#define JsonContainerIsScalar(jc) (((jc)->header & JB_FSCALAR) != 0) +#define JsonContainerIsObject(jc) (((jc)->header & JB_FOBJECT) != 0) +#define JsonContainerIsArray(jc) (((jc)->header & JB_FARRAY) != 0) /* The top-level on-disk format for a jsonb datum. 
*/ typedef struct { - int32 vl_len_; /* varlena header (do not touch directly!) */ - JsonbContainer root; + int32 vl_len_; /* varlena header (do not touch directly!) */ + JsonbContainer root; } Jsonb; /* convenience macros for accessing the root container in a Jsonb datum */ -#define JB_ROOT_COUNT(jbp_) (*(uint32 *) VARDATA(jbp_) & JB_CMASK) +#define JB_ROOT_COUNT(jbp_) (*(uint32 *) VARDATA(jbp_) & JB_CMASK) #define JB_ROOT_IS_SCALAR(jbp_) ((*(uint32 *) VARDATA(jbp_) & JB_FSCALAR) != 0) #define JB_ROOT_IS_OBJECT(jbp_) ((*(uint32 *) VARDATA(jbp_) & JB_FOBJECT) != 0) -#define JB_ROOT_IS_ARRAY(jbp_) ((*(uint32 *) VARDATA(jbp_) & JB_FARRAY) != 0) +#define JB_ROOT_IS_ARRAY(jbp_) ((*(uint32 *) VARDATA(jbp_) & JB_FARRAY) != 0) enum jbvType { - /* Scalar types */ - jbvNull = 0x0, - jbvString, - jbvNumeric, - jbvBool, - /* Composite types */ - jbvArray = 0x10, - jbvObject, - /* Binary (i.e. struct Jsonb) jbvArray/jbvObject */ - jbvBinary + /* Scalar types */ + jbvNull = 0x0, + jbvString, + jbvNumeric, + jbvBool, + /* Composite types */ + jbvArray = 0x10, + jbvObject, + /* Binary (i.e. struct Jsonb) jbvArray/jbvObject */ + jbvBinary }; /* - * JsonbValue: In-memory representation of Jsonb. This is a convenient + * JsonbValue: In-memory representation of Jsonb. This is a convenient * deserialized representation, that can easily support using the "val" * union across underlying types during manipulation. The Jsonb on-disk * representation has various alignment considerations. */ struct JsonbValue { - enum jbvType type; /* Influences sort order */ - - union - { - Numeric numeric; - bool boolean; - struct - { - int len; - char *val; /* Not necessarily null-terminated */ - } string; /* String primitive type */ - - struct - { - int nElems; - JsonbValue *elems; - bool rawScalar; /* Top-level "raw scalar" array? */ - } array; /* Array container type */ - - struct - { - int nPairs; /* 1 pair, 2 elements */ - JsonbPair *pairs; - } object; /* Associative container type */ - - struct - { - int len; - JsonbContainer *data; - } binary; /* Array or object, in on-disk format */ - } val; + enum jbvType type; /* Influences sort order */ + + union + { + Numeric numeric; + bool boolean; + struct + { + int len; + char *val; /* Not necessarily null-terminated */ + } string; /* String primitive type */ + + struct + { + int nElems; + JsonbValue *elems; + bool rawScalar; /* Top-level "raw scalar" array? */ + } array; /* Array container type */ + + struct + { + int nPairs; /* 1 pair, 2 elements */ + JsonbPair *pairs; + } object; /* Associative container type */ + + struct + { + int len; + JsonbContainer *data; + } binary; /* Array or object, in on-disk format */ + } val; }; -#define IsAJsonbScalar(jsonbval) ((jsonbval)->type >= jbvNull && \ - (jsonbval)->type <= jbvBool) +#define IsAJsonbScalar(jsonbval) ((jsonbval)->type >= jbvNull && \ + (jsonbval)->type <= jbvBool) /* * Key/value pair within an Object. 
@@ -295,17 +295,17 @@ struct JsonbValue */ struct JsonbPair { - JsonbValue key; /* Must be a jbvString */ - JsonbValue value; /* May be of any type */ - uint32 order; /* Pair's index in original sequence */ + JsonbValue key; /* Must be a jbvString */ + JsonbValue value; /* May be of any type */ + uint32 order; /* Pair's index in original sequence */ }; /* Conversion state used when parsing Jsonb from text, or for type coercion */ typedef struct JsonbParseState { - JsonbValue contVal; - Size size; - struct JsonbParseState *next; + JsonbValue contVal; + Size size; + struct JsonbParseState *next; } JsonbParseState; /* @@ -314,68 +314,70 @@ typedef struct JsonbParseState */ typedef enum { - JBI_ARRAY_START, - JBI_ARRAY_ELEM, - JBI_OBJECT_START, - JBI_OBJECT_KEY, - JBI_OBJECT_VALUE + JBI_ARRAY_START, + JBI_ARRAY_ELEM, + JBI_OBJECT_START, + JBI_OBJECT_KEY, + JBI_OBJECT_VALUE } JsonbIterState; typedef struct JsonbIterator { - /* Container being iterated */ - JsonbContainer *container; - uint32 nElems; /* Number of elements in children array (will - * be nPairs for objects) */ - bool isScalar; /* Pseudo-array scalar value? */ - JEntry *children; /* JEntrys for child nodes */ - /* Data proper. This points to the beginning of the variable-length data */ - char *dataProper; - - /* Current item in buffer (up to nElems) */ - int curIndex; - - /* Data offset corresponding to current item */ - uint32 curDataOffset; - - /* - * If the container is an object, we want to return keys and values - * alternately; so curDataOffset points to the current key, and - * curValueOffset points to the current value. - */ - uint32 curValueOffset; - - /* Private state */ - JsonbIterState state; - - struct JsonbIterator *parent; + /* Container being iterated */ + JsonbContainer *container; + uint32 nElems; /* Number of elements in children array (will + * be nPairs for objects) */ + bool isScalar; /* Pseudo-array scalar value? */ + JEntry *children; /* JEntrys for child nodes */ + /* Data proper. This points to the beginning of the variable-length data */ + char *dataProper; + + /* Current item in buffer (up to nElems) */ + int curIndex; + + /* Data offset corresponding to current item */ + uint32 curDataOffset; + + /* + * If the container is an object, we want to return keys and values + * alternately; so curDataOffset points to the current key, and + * curValueOffset points to the current value. 
+ */ + uint32 curValueOffset; + + /* Private state */ + JsonbIterState state; + + struct JsonbIterator *parent; } JsonbIterator; /* Support functions */ extern uint32 getJsonbOffset(const JsonbContainer *jc, int index); extern uint32 getJsonbLength(const JsonbContainer *jc, int index); -extern int compareJsonbContainers(JsonbContainer *a, JsonbContainer *b); +extern int compareJsonbContainers(JsonbContainer *a, JsonbContainer *b); extern JsonbValue *findJsonbValueFromContainer(JsonbContainer *sheader, - uint32 flags, - JsonbValue *key); + uint32 flags, + JsonbValue *key); extern JsonbValue *getIthJsonbValueFromContainer(JsonbContainer *sheader, - uint32 i); + uint32 i); extern JsonbValue *pushJsonbValue(JsonbParseState **pstate, - JsonbIteratorToken seq, JsonbValue *jbVal); + JsonbIteratorToken seq, JsonbValue *jbVal); extern JsonbIterator *JsonbIteratorInit(JsonbContainer *container); extern JsonbIteratorToken JsonbIteratorNext(JsonbIterator **it, JsonbValue *val, - bool skipNested); + bool skipNested); extern Jsonb *JsonbValueToJsonb(JsonbValue *val); extern bool JsonbDeepContains(JsonbIterator **val, - JsonbIterator **mContained); + JsonbIterator **mContained); extern void JsonbHashScalarValue(const JsonbValue *scalarVal, uint32 *hash); +extern void JsonbHashScalarValueExtended(const JsonbValue *scalarVal, + uint64 *hash, uint64 seed); /* jsonb.c support functions */ extern char *JsonbToCString(StringInfo out, JsonbContainer *in, - int estimated_len); + int estimated_len); extern char *JsonbToCStringIndent(StringInfo out, JsonbContainer *in, - int estimated_len); + int estimated_len); -#endif /* __JSONB_H__ */ +#endif /* __JSONB_H__ */ diff --git a/src/include/utils/typcache.h b/src/include/utils/typcache.h index abe7737d..b4f75921 100644 --- a/src/include/utils/typcache.h +++ b/src/include/utils/typcache.h @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * * typcache.h - * Type cache definitions. + * Type cache definitions. * * The type cache exists to speed lookup of certain information about data * types that is not directly available from a type's pg_type row. @@ -28,98 +28,102 @@ struct TypeCacheEnumData; typedef struct TypeCacheEntry { - /* typeId is the hash lookup key and MUST BE FIRST */ - Oid type_id; /* OID of the data type */ - - /* some subsidiary information copied from the pg_type row */ - int16 typlen; - bool typbyval; - char typalign; - char typstorage; - char typtype; - Oid typrelid; - - /* - * Information obtained from opfamily entries - * - * These will be InvalidOid if no match could be found, or if the - * information hasn't yet been requested. Also note that for array and - * composite types, typcache.c checks that the contained types are - * comparable or hashable before allowing eq_opr etc to become set. - */ - Oid btree_opf; /* the default btree opclass' family */ - Oid btree_opintype; /* the default btree opclass' opcintype */ - Oid hash_opf; /* the default hash opclass' family */ - Oid hash_opintype; /* the default hash opclass' opcintype */ - Oid eq_opr; /* the equality operator */ - Oid lt_opr; /* the less-than operator */ - Oid gt_opr; /* the greater-than operator */ - Oid cmp_proc; /* the btree comparison function */ - Oid hash_proc; /* the hash calculation function */ - - /* - * Pre-set-up fmgr call info for the equality operator, the btree - * comparison function, and the hash calculation function. 
These are kept - * in the type cache to avoid problems with memory leaks in repeated calls - * to functions such as array_eq, array_cmp, hash_array. There is not - * currently a need to maintain call info for the lt_opr or gt_opr. - */ - FmgrInfo eq_opr_finfo; - FmgrInfo cmp_proc_finfo; - FmgrInfo hash_proc_finfo; - - /* - * Tuple descriptor if it's a composite type (row type). NULL if not - * composite or information hasn't yet been requested. (NOTE: this is a - * reference-counted tupledesc.) - */ - TupleDesc tupDesc; - - /* - * Fields computed when TYPECACHE_RANGE_INFO is requested. Zeroes if not - * a range type or information hasn't yet been requested. Note that - * rng_cmp_proc_finfo could be different from the element type's default - * btree comparison function. - */ - struct TypeCacheEntry *rngelemtype; /* range's element type */ - Oid rng_collation; /* collation for comparisons, if any */ - FmgrInfo rng_cmp_proc_finfo; /* comparison function */ - FmgrInfo rng_canonical_finfo; /* canonicalization function, if any */ - FmgrInfo rng_subdiff_finfo; /* difference function, if any */ - - /* - * Domain constraint data if it's a domain type. NULL if not domain, or - * if domain has no constraints, or if information hasn't been requested. - */ - DomainConstraintCache *domainData; - - /* Private data, for internal use of typcache.c only */ - int flags; /* flags about what we've computed */ - - /* - * Private information about an enum type. NULL if not enum or - * information hasn't been requested. - */ - struct TypeCacheEnumData *enumData; - - /* We also maintain a list of all known domain-type cache entries */ - struct TypeCacheEntry *nextDomain; + /* typeId is the hash lookup key and MUST BE FIRST */ + Oid type_id; /* OID of the data type */ + + /* some subsidiary information copied from the pg_type row */ + int16 typlen; + bool typbyval; + char typalign; + char typstorage; + char typtype; + Oid typrelid; + + /* + * Information obtained from opfamily entries + * + * These will be InvalidOid if no match could be found, or if the + * information hasn't yet been requested. Also note that for array and + * composite types, typcache.c checks that the contained types are + * comparable or hashable before allowing eq_opr etc to become set. + */ + Oid btree_opf; /* the default btree opclass' family */ + Oid btree_opintype; /* the default btree opclass' opcintype */ + Oid hash_opf; /* the default hash opclass' family */ + Oid hash_opintype; /* the default hash opclass' opcintype */ + Oid eq_opr; /* the equality operator */ + Oid lt_opr; /* the less-than operator */ + Oid gt_opr; /* the greater-than operator */ + Oid cmp_proc; /* the btree comparison function */ + Oid hash_proc; /* the hash calculation function */ + Oid hash_extended_proc; /* the extended hash calculation function */ + + /* + * Pre-set-up fmgr call info for the equality operator, the btree + * comparison function, and the hash calculation function. These are kept + * in the type cache to avoid problems with memory leaks in repeated calls + * to functions such as array_eq, array_cmp, hash_array. There is not + * currently a need to maintain call info for the lt_opr or gt_opr. + */ + FmgrInfo eq_opr_finfo; + FmgrInfo cmp_proc_finfo; + FmgrInfo hash_proc_finfo; + FmgrInfo hash_extended_proc_finfo; + + /* + * Tuple descriptor if it's a composite type (row type). NULL if not + * composite or information hasn't yet been requested. (NOTE: this is a + * reference-counted tupledesc.) 
+ */ + TupleDesc tupDesc; + + /* + * Fields computed when TYPECACHE_RANGE_INFO is requested. Zeroes if not + * a range type or information hasn't yet been requested. Note that + * rng_cmp_proc_finfo could be different from the element type's default + * btree comparison function. + */ + struct TypeCacheEntry *rngelemtype; /* range's element type */ + Oid rng_collation; /* collation for comparisons, if any */ + FmgrInfo rng_cmp_proc_finfo; /* comparison function */ + FmgrInfo rng_canonical_finfo; /* canonicalization function, if any */ + FmgrInfo rng_subdiff_finfo; /* difference function, if any */ + + /* + * Domain constraint data if it's a domain type. NULL if not domain, or + * if domain has no constraints, or if information hasn't been requested. + */ + DomainConstraintCache *domainData; + + /* Private data, for internal use of typcache.c only */ + int flags; /* flags about what we've computed */ + + /* + * Private information about an enum type. NULL if not enum or + * information hasn't been requested. + */ + struct TypeCacheEnumData *enumData; + + /* We also maintain a list of all known domain-type cache entries */ + struct TypeCacheEntry *nextDomain; } TypeCacheEntry; /* Bit flags to indicate which fields a given caller needs to have set */ -#define TYPECACHE_EQ_OPR 0x0001 -#define TYPECACHE_LT_OPR 0x0002 -#define TYPECACHE_GT_OPR 0x0004 -#define TYPECACHE_CMP_PROC 0x0008 -#define TYPECACHE_HASH_PROC 0x0010 -#define TYPECACHE_EQ_OPR_FINFO 0x0020 -#define TYPECACHE_CMP_PROC_FINFO 0x0040 -#define TYPECACHE_HASH_PROC_FINFO 0x0080 -#define TYPECACHE_TUPDESC 0x0100 -#define TYPECACHE_BTREE_OPFAMILY 0x0200 -#define TYPECACHE_HASH_OPFAMILY 0x0400 -#define TYPECACHE_RANGE_INFO 0x0800 -#define TYPECACHE_DOMAIN_INFO 0x1000 +#define TYPECACHE_EQ_OPR 0x0001 +#define TYPECACHE_LT_OPR 0x0002 +#define TYPECACHE_GT_OPR 0x0004 +#define TYPECACHE_CMP_PROC 0x0008 +#define TYPECACHE_HASH_PROC 0x0010 +#define TYPECACHE_EQ_OPR_FINFO 0x0020 +#define TYPECACHE_CMP_PROC_FINFO 0x0040 +#define TYPECACHE_HASH_PROC_FINFO 0x0080 +#define TYPECACHE_TUPDESC 0x0100 +#define TYPECACHE_BTREE_OPFAMILY 0x0200 +#define TYPECACHE_HASH_OPFAMILY 0x0400 +#define TYPECACHE_RANGE_INFO 0x0800 +#define TYPECACHE_DOMAIN_INFO 0x1000 +#define TYPECACHE_HASH_EXTENDED_PROC 0x2000 +#define TYPECACHE_HASH_EXTENDED_PROC_FINFO 0x4000 /* * Callers wishing to maintain a long-lived reference to a domain's constraint @@ -129,21 +133,21 @@ typedef struct TypeCacheEntry */ typedef struct DomainConstraintRef { - List *constraints; /* list of DomainConstraintState nodes */ - MemoryContext refctx; /* context holding DomainConstraintRef */ - TypeCacheEntry *tcache; /* typcache entry for domain type */ - bool need_exprstate; /* does caller need check_exprstate? */ - - /* Management data --- treat these fields as private to typcache.c */ - DomainConstraintCache *dcc; /* current constraints, or NULL if none */ - MemoryContextCallback callback; /* used to release refcount when done */ + List *constraints; /* list of DomainConstraintState nodes */ + MemoryContext refctx; /* context holding DomainConstraintRef */ + TypeCacheEntry *tcache; /* typcache entry for domain type */ + bool need_exprstate; /* does caller need check_exprstate? 
*/ + + /* Management data --- treat these fields as private to typcache.c */ + DomainConstraintCache *dcc; /* current constraints, or NULL if none */ + MemoryContextCallback callback; /* used to release refcount when done */ } DomainConstraintRef; extern TypeCacheEntry *lookup_type_cache(Oid type_id, int flags); extern void InitDomainConstraintRef(Oid type_id, DomainConstraintRef *ref, - MemoryContext refctx, bool need_exprstate); + MemoryContext refctx, bool need_exprstate); extern void UpdateDomainConstraintRef(DomainConstraintRef *ref); @@ -152,12 +156,12 @@ extern bool DomainHasConstraints(Oid type_id); extern TupleDesc lookup_rowtype_tupdesc(Oid type_id, int32 typmod); extern TupleDesc lookup_rowtype_tupdesc_noerror(Oid type_id, int32 typmod, - bool noError); + bool noError); extern TupleDesc lookup_rowtype_tupdesc_copy(Oid type_id, int32 typmod); extern void assign_record_type_typmod(TupleDesc tupDesc); -extern int compare_values_of_enum(TypeCacheEntry *tcache, Oid arg1, Oid arg2); +extern int compare_values_of_enum(TypeCacheEntry *tcache, Oid arg1, Oid arg2); -#endif /* TYPCACHE_H */ +#endif /* TYPCACHE_H */ diff --git a/src/test/regress/expected/alter_generic.out b/src/test/regress/expected/alter_generic.out index 61bd1cf1..2d7998ff 100644 --- a/src/test/regress/expected/alter_generic.out +++ b/src/test/regress/expected/alter_generic.out @@ -427,7 +427,7 @@ BEGIN TRANSACTION; CREATE OPERATOR FAMILY alt_opf13 USING hash; CREATE FUNCTION fn_opf13 (int4) RETURNS BIGINT AS 'SELECT NULL::BIGINT;' LANGUAGE SQL; ALTER OPERATOR FAMILY alt_opf13 USING hash ADD FUNCTION 1 fn_opf13(int4); -ERROR: hash procedures must return integer +ERROR: hash procedure 1 must return integer DROP OPERATOR FAMILY alt_opf13 USING hash; ERROR: current transaction is aborted, commands ignored until end of transaction block ROLLBACK; @@ -445,7 +445,7 @@ BEGIN TRANSACTION; CREATE OPERATOR FAMILY alt_opf15 USING hash; CREATE FUNCTION fn_opf15 (int4, int2) RETURNS BIGINT AS 'SELECT NULL::BIGINT;' LANGUAGE SQL; ALTER OPERATOR FAMILY alt_opf15 USING hash ADD FUNCTION 1 fn_opf15(int4, int2); -ERROR: hash procedures must have one argument +ERROR: hash procedure 1 must have one argument DROP OPERATOR FAMILY alt_opf15 USING hash; ERROR: current transaction is aborted, commands ignored until end of transaction block ROLLBACK; diff --git a/src/test/regress/expected/hash_func.out b/src/test/regress/expected/hash_func.out new file mode 100644 index 00000000..da0948e9 --- /dev/null +++ b/src/test/regress/expected/hash_func.out @@ -0,0 +1,300 @@ +-- +-- Test hash functions +-- +-- When the salt is 0, the extended hash function should produce a result +-- whose low 32 bits match the standard hash function. When the salt is +-- not 0, we should get a different result. 
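The test header above states the contract these queries verify from SQL: with seed 0 the extended function's low 32 bits must match the standard function, and a nonzero seed is expected to change the value. The same contract restated at the C level, as a sketch only (not part of the regression suite), using hash_uint32()/hash_uint32_extended() declared in access/hash.h by this patch:

static void
check_extended_hash_contract(uint32 k)
{
	uint32		h32 = DatumGetUInt32(hash_uint32(k));
	uint64		ext0 = DatumGetUInt64(hash_uint32_extended(k, UINT64CONST(0)));
	uint64		ext1 = DatumGetUInt64(hash_uint32_extended(k, UINT64CONST(1)));

	Assert((uint32) ext0 == h32);	/* seed 0: low 32 bits must agree */
	Assert(ext1 != ext0);			/* nonzero seed: value should differ */
}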
+-- +SELECT v as value, hashint2(v)::bit(32) as standard, + hashint2extended(v, 0)::bit(32) as extended0, + hashint2extended(v, 1)::bit(32) as extended1 +FROM (VALUES (0::int2), (1::int2), (17::int2), (42::int2)) x(v) +WHERE hashint2(v)::bit(32) != hashint2extended(v, 0)::bit(32) + OR hashint2(v)::bit(32) = hashint2extended(v, 1)::bit(32); + value | standard | extended0 | extended1 +-------+----------+-----------+----------- +(0 rows) + +SELECT v as value, hashint4(v)::bit(32) as standard, + hashint4extended(v, 0)::bit(32) as extended0, + hashint4extended(v, 1)::bit(32) as extended1 +FROM (VALUES (0), (1), (17), (42), (550273), (207112489)) x(v) +WHERE hashint4(v)::bit(32) != hashint4extended(v, 0)::bit(32) + OR hashint4(v)::bit(32) = hashint4extended(v, 1)::bit(32); + value | standard | extended0 | extended1 +-------+----------+-----------+----------- +(0 rows) + +SELECT v as value, hashint8(v)::bit(32) as standard, + hashint8extended(v, 0)::bit(32) as extended0, + hashint8extended(v, 1)::bit(32) as extended1 +FROM (VALUES (0), (1), (17), (42), (550273), (207112489)) x(v) +WHERE hashint8(v)::bit(32) != hashint8extended(v, 0)::bit(32) + OR hashint8(v)::bit(32) = hashint8extended(v, 1)::bit(32); + value | standard | extended0 | extended1 +-------+----------+-----------+----------- +(0 rows) + +SELECT v as value, hashfloat4(v)::bit(32) as standard, + hashfloat4extended(v, 0)::bit(32) as extended0, + hashfloat4extended(v, 1)::bit(32) as extended1 +FROM (VALUES (0), (1), (17), (42), (550273), (207112489)) x(v) +WHERE hashfloat4(v)::bit(32) != hashfloat4extended(v, 0)::bit(32) + OR hashfloat4(v)::bit(32) = hashfloat4extended(v, 1)::bit(32); + value | standard | extended0 | extended1 +-------+----------+-----------+----------- +(0 rows) + +SELECT v as value, hashfloat8(v)::bit(32) as standard, + hashfloat8extended(v, 0)::bit(32) as extended0, + hashfloat8extended(v, 1)::bit(32) as extended1 +FROM (VALUES (0), (1), (17), (42), (550273), (207112489)) x(v) +WHERE hashfloat8(v)::bit(32) != hashfloat8extended(v, 0)::bit(32) + OR hashfloat8(v)::bit(32) = hashfloat8extended(v, 1)::bit(32); + value | standard | extended0 | extended1 +-------+----------+-----------+----------- +(0 rows) + +SELECT v as value, hashoid(v)::bit(32) as standard, + hashoidextended(v, 0)::bit(32) as extended0, + hashoidextended(v, 1)::bit(32) as extended1 +FROM (VALUES (0), (1), (17), (42), (550273), (207112489)) x(v) +WHERE hashoid(v)::bit(32) != hashoidextended(v, 0)::bit(32) + OR hashoid(v)::bit(32) = hashoidextended(v, 1)::bit(32); + value | standard | extended0 | extended1 +-------+----------+-----------+----------- +(0 rows) + +SELECT v as value, hashchar(v)::bit(32) as standard, + hashcharextended(v, 0)::bit(32) as extended0, + hashcharextended(v, 1)::bit(32) as extended1 +FROM (VALUES (NULL::"char"), ('1'), ('x'), ('X'), ('p'), ('N')) x(v) +WHERE hashchar(v)::bit(32) != hashcharextended(v, 0)::bit(32) + OR hashchar(v)::bit(32) = hashcharextended(v, 1)::bit(32); + value | standard | extended0 | extended1 +-------+----------+-----------+----------- +(0 rows) + +SELECT v as value, hashname(v)::bit(32) as standard, + hashnameextended(v, 0)::bit(32) as extended0, + hashnameextended(v, 1)::bit(32) as extended1 +FROM (VALUES (NULL), ('PostgreSQL'), ('eIpUEtqmY89'), ('AXKEJBTK'), + ('muop28x03'), ('yi3nm0d73')) x(v) +WHERE hashname(v)::bit(32) != hashnameextended(v, 0)::bit(32) + OR hashname(v)::bit(32) = hashnameextended(v, 1)::bit(32); + value | standard | extended0 | extended1 +-------+----------+-----------+----------- +(0 
rows) + +SELECT v as value, hashtext(v)::bit(32) as standard, + hashtextextended(v, 0)::bit(32) as extended0, + hashtextextended(v, 1)::bit(32) as extended1 +FROM (VALUES (NULL), ('PostgreSQL'), ('eIpUEtqmY89'), ('AXKEJBTK'), + ('muop28x03'), ('yi3nm0d73')) x(v) +WHERE hashtext(v)::bit(32) != hashtextextended(v, 0)::bit(32) + OR hashtext(v)::bit(32) = hashtextextended(v, 1)::bit(32); + value | standard | extended0 | extended1 +-------+----------+-----------+----------- +(0 rows) + +SELECT v as value, hashoidvector(v)::bit(32) as standard, + hashoidvectorextended(v, 0)::bit(32) as extended0, + hashoidvectorextended(v, 1)::bit(32) as extended1 +FROM (VALUES (NULL::oidvector), ('0 1 2 3 4'), ('17 18 19 20'), + ('42 43 42 45'), ('550273 550273 570274'), + ('207112489 207112499 21512 2155 372325 1363252')) x(v) +WHERE hashoidvector(v)::bit(32) != hashoidvectorextended(v, 0)::bit(32) + OR hashoidvector(v)::bit(32) = hashoidvectorextended(v, 1)::bit(32); + value | standard | extended0 | extended1 +-------+----------+-----------+----------- +(0 rows) + +SELECT v as value, hash_aclitem(v)::bit(32) as standard, + hash_aclitem_extended(v, 0)::bit(32) as extended0, + hash_aclitem_extended(v, 1)::bit(32) as extended1 +FROM (SELECT DISTINCT(relacl[1]) FROM pg_class LIMIT 10) x(v) +WHERE hash_aclitem(v)::bit(32) != hash_aclitem_extended(v, 0)::bit(32) + OR hash_aclitem(v)::bit(32) = hash_aclitem_extended(v, 1)::bit(32); + value | standard | extended0 | extended1 +-------+----------+-----------+----------- +(0 rows) + +SELECT v as value, hashmacaddr(v)::bit(32) as standard, + hashmacaddrextended(v, 0)::bit(32) as extended0, + hashmacaddrextended(v, 1)::bit(32) as extended1 +FROM (VALUES (NULL::macaddr), ('08:00:2b:01:02:04'), ('08:00:2b:01:02:04'), + ('e2:7f:51:3e:70:49'), ('d6:a9:4a:78:1c:d5'), + ('ea:29:b1:5e:1f:a5')) x(v) +WHERE hashmacaddr(v)::bit(32) != hashmacaddrextended(v, 0)::bit(32) + OR hashmacaddr(v)::bit(32) = hashmacaddrextended(v, 1)::bit(32); + value | standard | extended0 | extended1 +-------+----------+-----------+----------- +(0 rows) + +SELECT v as value, hashinet(v)::bit(32) as standard, + hashinetextended(v, 0)::bit(32) as extended0, + hashinetextended(v, 1)::bit(32) as extended1 +FROM (VALUES (NULL::inet), ('192.168.100.128/25'), ('192.168.100.0/8'), + ('172.168.10.126/16'), ('172.18.103.126/24'), ('192.188.13.16/32')) x(v) +WHERE hashinet(v)::bit(32) != hashinetextended(v, 0)::bit(32) + OR hashinet(v)::bit(32) = hashinetextended(v, 1)::bit(32); + value | standard | extended0 | extended1 +-------+----------+-----------+----------- +(0 rows) + +SELECT v as value, hash_numeric(v)::bit(32) as standard, + hash_numeric_extended(v, 0)::bit(32) as extended0, + hash_numeric_extended(v, 1)::bit(32) as extended1 +FROM (VALUES (0), (1.149484958), (17.149484958), (42.149484958), + (149484958.550273), (2071124898672)) x(v) +WHERE hash_numeric(v)::bit(32) != hash_numeric_extended(v, 0)::bit(32) + OR hash_numeric(v)::bit(32) = hash_numeric_extended(v, 1)::bit(32); + value | standard | extended0 | extended1 +-------+----------+-----------+----------- +(0 rows) + +SELECT v as value, hashmacaddr8(v)::bit(32) as standard, + hashmacaddr8extended(v, 0)::bit(32) as extended0, + hashmacaddr8extended(v, 1)::bit(32) as extended1 +FROM (VALUES (NULL::macaddr8), ('08:00:2b:01:02:04:36:49'), + ('08:00:2b:01:02:04:f0:e8'), ('e2:7f:51:3e:70:49:16:29'), + ('d6:a9:4a:78:1c:d5:47:32'), ('ea:29:b1:5e:1f:a5')) x(v) +WHERE hashmacaddr8(v)::bit(32) != hashmacaddr8extended(v, 0)::bit(32) + OR hashmacaddr8(v)::bit(32) = 
hashmacaddr8extended(v, 1)::bit(32); + value | standard | extended0 | extended1 +-------+----------+-----------+----------- +(0 rows) + +SELECT v as value, hash_array(v)::bit(32) as standard, + hash_array_extended(v, 0)::bit(32) as extended0, + hash_array_extended(v, 1)::bit(32) as extended1 +FROM (VALUES ('{0}'::int4[]), ('{0,1,2,3,4}'), ('{17,18,19,20}'), + ('{42,34,65,98}'), ('{550273,590027, 870273}'), + ('{207112489, 807112489}')) x(v) +WHERE hash_array(v)::bit(32) != hash_array_extended(v, 0)::bit(32) + OR hash_array(v)::bit(32) = hash_array_extended(v, 1)::bit(32); + value | standard | extended0 | extended1 +-------+----------+-----------+----------- +(0 rows) + +SELECT v as value, hashbpchar(v)::bit(32) as standard, + hashbpcharextended(v, 0)::bit(32) as extended0, + hashbpcharextended(v, 1)::bit(32) as extended1 +FROM (VALUES (NULL), ('PostgreSQL'), ('eIpUEtqmY89'), ('AXKEJBTK'), + ('muop28x03'), ('yi3nm0d73')) x(v) +WHERE hashbpchar(v)::bit(32) != hashbpcharextended(v, 0)::bit(32) + OR hashbpchar(v)::bit(32) = hashbpcharextended(v, 1)::bit(32); + value | standard | extended0 | extended1 +-------+----------+-----------+----------- +(0 rows) + +SELECT v as value, time_hash(v)::bit(32) as standard, + time_hash_extended(v, 0)::bit(32) as extended0, + time_hash_extended(v, 1)::bit(32) as extended1 +FROM (VALUES (NULL::time), ('11:09:59'), ('1:09:59'), ('11:59:59'), + ('7:9:59'), ('5:15:59')) x(v) +WHERE time_hash(v)::bit(32) != time_hash_extended(v, 0)::bit(32) + OR time_hash(v)::bit(32) = time_hash_extended(v, 1)::bit(32); + value | standard | extended0 | extended1 +-------+----------+-----------+----------- +(0 rows) + +SELECT v as value, timetz_hash(v)::bit(32) as standard, + timetz_hash_extended(v, 0)::bit(32) as extended0, + timetz_hash_extended(v, 1)::bit(32) as extended1 +FROM (VALUES (NULL::timetz), ('00:11:52.518762-07'), ('00:11:52.51762-08'), + ('00:11:52.62-01'), ('00:11:52.62+01'), ('11:59:59+04')) x(v) +WHERE timetz_hash(v)::bit(32) != timetz_hash_extended(v, 0)::bit(32) + OR timetz_hash(v)::bit(32) = timetz_hash_extended(v, 1)::bit(32); + value | standard | extended0 | extended1 +-------+----------+-----------+----------- +(0 rows) + +SELECT v as value, interval_hash(v)::bit(32) as standard, + interval_hash_extended(v, 0)::bit(32) as extended0, + interval_hash_extended(v, 1)::bit(32) as extended1 +FROM (VALUES (NULL::interval), + ('5 month 7 day 46 minutes'), ('1 year 7 day 46 minutes'), + ('1 year 7 month 20 day 46 minutes'), ('5 month'), + ('17 year 11 month 7 day 9 hours 46 minutes 5 seconds')) x(v) +WHERE interval_hash(v)::bit(32) != interval_hash_extended(v, 0)::bit(32) + OR interval_hash(v)::bit(32) = interval_hash_extended(v, 1)::bit(32); + value | standard | extended0 | extended1 +-------+----------+-----------+----------- +(0 rows) + +SELECT v as value, timestamp_hash(v)::bit(32) as standard, + timestamp_hash_extended(v, 0)::bit(32) as extended0, + timestamp_hash_extended(v, 1)::bit(32) as extended1 +FROM (VALUES (NULL::timestamp), ('2017-08-22 00:09:59.518762'), + ('2015-08-20 00:11:52.51762-08'), + ('2017-05-22 00:11:52.62-01'), + ('2013-08-22 00:11:52.62+01'), ('2013-08-22 11:59:59+04')) x(v) +WHERE timestamp_hash(v)::bit(32) != timestamp_hash_extended(v, 0)::bit(32) + OR timestamp_hash(v)::bit(32) = timestamp_hash_extended(v, 1)::bit(32); + value | standard | extended0 | extended1 +-------+----------+-----------+----------- +(0 rows) + +SELECT v as value, uuid_hash(v)::bit(32) as standard, + uuid_hash_extended(v, 0)::bit(32) as extended0, + 
uuid_hash_extended(v, 1)::bit(32) as extended1 +FROM (VALUES (NULL::uuid), ('a0eebc99-9c0b-4ef8-bb6d-6bb9bd380a11'), + ('5a9ba4ac-8d6f-11e7-bb31-be2e44b06b34'), + ('99c6705c-d939-461c-a3c9-1690ad64ed7b'), + ('7deed3ca-8d6f-11e7-bb31-be2e44b06b34'), + ('9ad46d4f-6f2a-4edd-aadb-745993928e1e')) x(v) +WHERE uuid_hash(v)::bit(32) != uuid_hash_extended(v, 0)::bit(32) + OR uuid_hash(v)::bit(32) = uuid_hash_extended(v, 1)::bit(32); + value | standard | extended0 | extended1 +-------+----------+-----------+----------- +(0 rows) + +SELECT v as value, pg_lsn_hash(v)::bit(32) as standard, + pg_lsn_hash_extended(v, 0)::bit(32) as extended0, + pg_lsn_hash_extended(v, 1)::bit(32) as extended1 +FROM (VALUES (NULL::pg_lsn), ('16/B374D84'), ('30/B374D84'), + ('255/B374D84'), ('25/B379D90'), ('900/F37FD90')) x(v) +WHERE pg_lsn_hash(v)::bit(32) != pg_lsn_hash_extended(v, 0)::bit(32) + OR pg_lsn_hash(v)::bit(32) = pg_lsn_hash_extended(v, 1)::bit(32); + value | standard | extended0 | extended1 +-------+----------+-----------+----------- +(0 rows) + +CREATE TYPE mood AS ENUM ('sad', 'ok', 'happy'); +SELECT v as value, hashenum(v)::bit(32) as standard, + hashenumextended(v, 0)::bit(32) as extended0, + hashenumextended(v, 1)::bit(32) as extended1 +FROM (VALUES ('sad'::mood), ('ok'), ('happy')) x(v) +WHERE hashenum(v)::bit(32) != hashenumextended(v, 0)::bit(32) + OR hashenum(v)::bit(32) = hashenumextended(v, 1)::bit(32); + value | standard | extended0 | extended1 +-------+----------+-----------+----------- +(0 rows) + +DROP TYPE mood; +SELECT v as value, jsonb_hash(v)::bit(32) as standard, + jsonb_hash_extended(v, 0)::bit(32) as extended0, + jsonb_hash_extended(v, 1)::bit(32) as extended1 +FROM (VALUES (NULL::jsonb), + ('{"a": "aaa bbb ddd ccc", "b": ["eee fff ggg"], "c": {"d": "hhh iii"}}'), + ('{"foo": [true, "bar"], "tags": {"e": 1, "f": null}}'), + ('{"g": {"h": "value"}}')) x(v) +WHERE jsonb_hash(v)::bit(32) != jsonb_hash_extended(v, 0)::bit(32) + OR jsonb_hash(v)::bit(32) = jsonb_hash_extended(v, 1)::bit(32); + value | standard | extended0 | extended1 +-------+----------+-----------+----------- +(0 rows) + +SELECT v as value, hash_range(v)::bit(32) as standard, + hash_range_extended(v, 0)::bit(32) as extended0, + hash_range_extended(v, 1)::bit(32) as extended1 +FROM (VALUES (int4range(10, 20)), (int4range(23, 43)), + (int4range(5675, 550273)), + (int4range(550274, 1550274)), (int4range(1550275, 208112489))) x(v) +WHERE hash_range(v)::bit(32) != hash_range_extended(v, 0)::bit(32) + OR hash_range(v)::bit(32) = hash_range_extended(v, 1)::bit(32); + value | standard | extended0 | extended1 +-------+----------+-----------+----------- +(0 rows) + diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule index 31e0b077..f9eabd53 100644 --- a/src/test/regress/parallel_schedule +++ b/src/test/regress/parallel_schedule @@ -70,7 +70,7 @@ test: create_index create_view # ---------- # Another group of parallel tests # ---------- -test: create_aggregate create_function_3 create_cast constraints triggers inherit create_table_like typed_table vacuum drop_if_exists updatable_views rolenames roleattributes create_am +test: create_aggregate create_function_3 create_cast constraints triggers inherit create_table_like typed_table vacuum drop_if_exists updatable_views rolenames roleattributes create_am hash_func # ---------- # sanity_check does a vacuum, affecting the sort order of SELECT * diff --git a/src/test/regress/sql/hash_func.sql b/src/test/regress/sql/hash_func.sql new file mode 100644 index 
00000000..b7ce8b21 --- /dev/null +++ b/src/test/regress/sql/hash_func.sql @@ -0,0 +1,222 @@ +-- +-- Test hash functions +-- +-- When the salt is 0, the extended hash function should produce a result +-- whose low 32 bits match the standard hash function. When the salt is +-- not 0, we should get a different result. +-- + +SELECT v as value, hashint2(v)::bit(32) as standard, + hashint2extended(v, 0)::bit(32) as extended0, + hashint2extended(v, 1)::bit(32) as extended1 +FROM (VALUES (0::int2), (1::int2), (17::int2), (42::int2)) x(v) +WHERE hashint2(v)::bit(32) != hashint2extended(v, 0)::bit(32) + OR hashint2(v)::bit(32) = hashint2extended(v, 1)::bit(32); + +SELECT v as value, hashint4(v)::bit(32) as standard, + hashint4extended(v, 0)::bit(32) as extended0, + hashint4extended(v, 1)::bit(32) as extended1 +FROM (VALUES (0), (1), (17), (42), (550273), (207112489)) x(v) +WHERE hashint4(v)::bit(32) != hashint4extended(v, 0)::bit(32) + OR hashint4(v)::bit(32) = hashint4extended(v, 1)::bit(32); + +SELECT v as value, hashint8(v)::bit(32) as standard, + hashint8extended(v, 0)::bit(32) as extended0, + hashint8extended(v, 1)::bit(32) as extended1 +FROM (VALUES (0), (1), (17), (42), (550273), (207112489)) x(v) +WHERE hashint8(v)::bit(32) != hashint8extended(v, 0)::bit(32) + OR hashint8(v)::bit(32) = hashint8extended(v, 1)::bit(32); + +SELECT v as value, hashfloat4(v)::bit(32) as standard, + hashfloat4extended(v, 0)::bit(32) as extended0, + hashfloat4extended(v, 1)::bit(32) as extended1 +FROM (VALUES (0), (1), (17), (42), (550273), (207112489)) x(v) +WHERE hashfloat4(v)::bit(32) != hashfloat4extended(v, 0)::bit(32) + OR hashfloat4(v)::bit(32) = hashfloat4extended(v, 1)::bit(32); + +SELECT v as value, hashfloat8(v)::bit(32) as standard, + hashfloat8extended(v, 0)::bit(32) as extended0, + hashfloat8extended(v, 1)::bit(32) as extended1 +FROM (VALUES (0), (1), (17), (42), (550273), (207112489)) x(v) +WHERE hashfloat8(v)::bit(32) != hashfloat8extended(v, 0)::bit(32) + OR hashfloat8(v)::bit(32) = hashfloat8extended(v, 1)::bit(32); + +SELECT v as value, hashoid(v)::bit(32) as standard, + hashoidextended(v, 0)::bit(32) as extended0, + hashoidextended(v, 1)::bit(32) as extended1 +FROM (VALUES (0), (1), (17), (42), (550273), (207112489)) x(v) +WHERE hashoid(v)::bit(32) != hashoidextended(v, 0)::bit(32) + OR hashoid(v)::bit(32) = hashoidextended(v, 1)::bit(32); + +SELECT v as value, hashchar(v)::bit(32) as standard, + hashcharextended(v, 0)::bit(32) as extended0, + hashcharextended(v, 1)::bit(32) as extended1 +FROM (VALUES (NULL::"char"), ('1'), ('x'), ('X'), ('p'), ('N')) x(v) +WHERE hashchar(v)::bit(32) != hashcharextended(v, 0)::bit(32) + OR hashchar(v)::bit(32) = hashcharextended(v, 1)::bit(32); + +SELECT v as value, hashname(v)::bit(32) as standard, + hashnameextended(v, 0)::bit(32) as extended0, + hashnameextended(v, 1)::bit(32) as extended1 +FROM (VALUES (NULL), ('PostgreSQL'), ('eIpUEtqmY89'), ('AXKEJBTK'), + ('muop28x03'), ('yi3nm0d73')) x(v) +WHERE hashname(v)::bit(32) != hashnameextended(v, 0)::bit(32) + OR hashname(v)::bit(32) = hashnameextended(v, 1)::bit(32); + +SELECT v as value, hashtext(v)::bit(32) as standard, + hashtextextended(v, 0)::bit(32) as extended0, + hashtextextended(v, 1)::bit(32) as extended1 +FROM (VALUES (NULL), ('PostgreSQL'), ('eIpUEtqmY89'), ('AXKEJBTK'), + ('muop28x03'), ('yi3nm0d73')) x(v) +WHERE hashtext(v)::bit(32) != hashtextextended(v, 0)::bit(32) + OR hashtext(v)::bit(32) = hashtextextended(v, 1)::bit(32); + +SELECT v as value, hashoidvector(v)::bit(32) as standard, + 
hashoidvectorextended(v, 0)::bit(32) as extended0, + hashoidvectorextended(v, 1)::bit(32) as extended1 +FROM (VALUES (NULL::oidvector), ('0 1 2 3 4'), ('17 18 19 20'), + ('42 43 42 45'), ('550273 550273 570274'), + ('207112489 207112499 21512 2155 372325 1363252')) x(v) +WHERE hashoidvector(v)::bit(32) != hashoidvectorextended(v, 0)::bit(32) + OR hashoidvector(v)::bit(32) = hashoidvectorextended(v, 1)::bit(32); + +SELECT v as value, hash_aclitem(v)::bit(32) as standard, + hash_aclitem_extended(v, 0)::bit(32) as extended0, + hash_aclitem_extended(v, 1)::bit(32) as extended1 +FROM (SELECT DISTINCT(relacl[1]) FROM pg_class LIMIT 10) x(v) +WHERE hash_aclitem(v)::bit(32) != hash_aclitem_extended(v, 0)::bit(32) + OR hash_aclitem(v)::bit(32) = hash_aclitem_extended(v, 1)::bit(32); + +SELECT v as value, hashmacaddr(v)::bit(32) as standard, + hashmacaddrextended(v, 0)::bit(32) as extended0, + hashmacaddrextended(v, 1)::bit(32) as extended1 +FROM (VALUES (NULL::macaddr), ('08:00:2b:01:02:04'), ('08:00:2b:01:02:04'), + ('e2:7f:51:3e:70:49'), ('d6:a9:4a:78:1c:d5'), + ('ea:29:b1:5e:1f:a5')) x(v) +WHERE hashmacaddr(v)::bit(32) != hashmacaddrextended(v, 0)::bit(32) + OR hashmacaddr(v)::bit(32) = hashmacaddrextended(v, 1)::bit(32); + +SELECT v as value, hashinet(v)::bit(32) as standard, + hashinetextended(v, 0)::bit(32) as extended0, + hashinetextended(v, 1)::bit(32) as extended1 +FROM (VALUES (NULL::inet), ('192.168.100.128/25'), ('192.168.100.0/8'), + ('172.168.10.126/16'), ('172.18.103.126/24'), ('192.188.13.16/32')) x(v) +WHERE hashinet(v)::bit(32) != hashinetextended(v, 0)::bit(32) + OR hashinet(v)::bit(32) = hashinetextended(v, 1)::bit(32); + +SELECT v as value, hash_numeric(v)::bit(32) as standard, + hash_numeric_extended(v, 0)::bit(32) as extended0, + hash_numeric_extended(v, 1)::bit(32) as extended1 +FROM (VALUES (0), (1.149484958), (17.149484958), (42.149484958), + (149484958.550273), (2071124898672)) x(v) +WHERE hash_numeric(v)::bit(32) != hash_numeric_extended(v, 0)::bit(32) + OR hash_numeric(v)::bit(32) = hash_numeric_extended(v, 1)::bit(32); + +SELECT v as value, hashmacaddr8(v)::bit(32) as standard, + hashmacaddr8extended(v, 0)::bit(32) as extended0, + hashmacaddr8extended(v, 1)::bit(32) as extended1 +FROM (VALUES (NULL::macaddr8), ('08:00:2b:01:02:04:36:49'), + ('08:00:2b:01:02:04:f0:e8'), ('e2:7f:51:3e:70:49:16:29'), + ('d6:a9:4a:78:1c:d5:47:32'), ('ea:29:b1:5e:1f:a5')) x(v) +WHERE hashmacaddr8(v)::bit(32) != hashmacaddr8extended(v, 0)::bit(32) + OR hashmacaddr8(v)::bit(32) = hashmacaddr8extended(v, 1)::bit(32); + +SELECT v as value, hash_array(v)::bit(32) as standard, + hash_array_extended(v, 0)::bit(32) as extended0, + hash_array_extended(v, 1)::bit(32) as extended1 +FROM (VALUES ('{0}'::int4[]), ('{0,1,2,3,4}'), ('{17,18,19,20}'), + ('{42,34,65,98}'), ('{550273,590027, 870273}'), + ('{207112489, 807112489}')) x(v) +WHERE hash_array(v)::bit(32) != hash_array_extended(v, 0)::bit(32) + OR hash_array(v)::bit(32) = hash_array_extended(v, 1)::bit(32); + +SELECT v as value, hashbpchar(v)::bit(32) as standard, + hashbpcharextended(v, 0)::bit(32) as extended0, + hashbpcharextended(v, 1)::bit(32) as extended1 +FROM (VALUES (NULL), ('PostgreSQL'), ('eIpUEtqmY89'), ('AXKEJBTK'), + ('muop28x03'), ('yi3nm0d73')) x(v) +WHERE hashbpchar(v)::bit(32) != hashbpcharextended(v, 0)::bit(32) + OR hashbpchar(v)::bit(32) = hashbpcharextended(v, 1)::bit(32); + +SELECT v as value, time_hash(v)::bit(32) as standard, + time_hash_extended(v, 0)::bit(32) as extended0, + time_hash_extended(v, 1)::bit(32) as extended1 
+FROM (VALUES (NULL::time), ('11:09:59'), ('1:09:59'), ('11:59:59'), + ('7:9:59'), ('5:15:59')) x(v) +WHERE time_hash(v)::bit(32) != time_hash_extended(v, 0)::bit(32) + OR time_hash(v)::bit(32) = time_hash_extended(v, 1)::bit(32); + +SELECT v as value, timetz_hash(v)::bit(32) as standard, + timetz_hash_extended(v, 0)::bit(32) as extended0, + timetz_hash_extended(v, 1)::bit(32) as extended1 +FROM (VALUES (NULL::timetz), ('00:11:52.518762-07'), ('00:11:52.51762-08'), + ('00:11:52.62-01'), ('00:11:52.62+01'), ('11:59:59+04')) x(v) +WHERE timetz_hash(v)::bit(32) != timetz_hash_extended(v, 0)::bit(32) + OR timetz_hash(v)::bit(32) = timetz_hash_extended(v, 1)::bit(32); + +SELECT v as value, interval_hash(v)::bit(32) as standard, + interval_hash_extended(v, 0)::bit(32) as extended0, + interval_hash_extended(v, 1)::bit(32) as extended1 +FROM (VALUES (NULL::interval), + ('5 month 7 day 46 minutes'), ('1 year 7 day 46 minutes'), + ('1 year 7 month 20 day 46 minutes'), ('5 month'), + ('17 year 11 month 7 day 9 hours 46 minutes 5 seconds')) x(v) +WHERE interval_hash(v)::bit(32) != interval_hash_extended(v, 0)::bit(32) + OR interval_hash(v)::bit(32) = interval_hash_extended(v, 1)::bit(32); + +SELECT v as value, timestamp_hash(v)::bit(32) as standard, + timestamp_hash_extended(v, 0)::bit(32) as extended0, + timestamp_hash_extended(v, 1)::bit(32) as extended1 +FROM (VALUES (NULL::timestamp), ('2017-08-22 00:09:59.518762'), + ('2015-08-20 00:11:52.51762-08'), + ('2017-05-22 00:11:52.62-01'), + ('2013-08-22 00:11:52.62+01'), ('2013-08-22 11:59:59+04')) x(v) +WHERE timestamp_hash(v)::bit(32) != timestamp_hash_extended(v, 0)::bit(32) + OR timestamp_hash(v)::bit(32) = timestamp_hash_extended(v, 1)::bit(32); + +SELECT v as value, uuid_hash(v)::bit(32) as standard, + uuid_hash_extended(v, 0)::bit(32) as extended0, + uuid_hash_extended(v, 1)::bit(32) as extended1 +FROM (VALUES (NULL::uuid), ('a0eebc99-9c0b-4ef8-bb6d-6bb9bd380a11'), + ('5a9ba4ac-8d6f-11e7-bb31-be2e44b06b34'), + ('99c6705c-d939-461c-a3c9-1690ad64ed7b'), + ('7deed3ca-8d6f-11e7-bb31-be2e44b06b34'), + ('9ad46d4f-6f2a-4edd-aadb-745993928e1e')) x(v) +WHERE uuid_hash(v)::bit(32) != uuid_hash_extended(v, 0)::bit(32) + OR uuid_hash(v)::bit(32) = uuid_hash_extended(v, 1)::bit(32); + +SELECT v as value, pg_lsn_hash(v)::bit(32) as standard, + pg_lsn_hash_extended(v, 0)::bit(32) as extended0, + pg_lsn_hash_extended(v, 1)::bit(32) as extended1 +FROM (VALUES (NULL::pg_lsn), ('16/B374D84'), ('30/B374D84'), + ('255/B374D84'), ('25/B379D90'), ('900/F37FD90')) x(v) +WHERE pg_lsn_hash(v)::bit(32) != pg_lsn_hash_extended(v, 0)::bit(32) + OR pg_lsn_hash(v)::bit(32) = pg_lsn_hash_extended(v, 1)::bit(32); + +CREATE TYPE mood AS ENUM ('sad', 'ok', 'happy'); +SELECT v as value, hashenum(v)::bit(32) as standard, + hashenumextended(v, 0)::bit(32) as extended0, + hashenumextended(v, 1)::bit(32) as extended1 +FROM (VALUES ('sad'::mood), ('ok'), ('happy')) x(v) +WHERE hashenum(v)::bit(32) != hashenumextended(v, 0)::bit(32) + OR hashenum(v)::bit(32) = hashenumextended(v, 1)::bit(32); +DROP TYPE mood; + +SELECT v as value, jsonb_hash(v)::bit(32) as standard, + jsonb_hash_extended(v, 0)::bit(32) as extended0, + jsonb_hash_extended(v, 1)::bit(32) as extended1 +FROM (VALUES (NULL::jsonb), + ('{"a": "aaa bbb ddd ccc", "b": ["eee fff ggg"], "c": {"d": "hhh iii"}}'), + ('{"foo": [true, "bar"], "tags": {"e": 1, "f": null}}'), + ('{"g": {"h": "value"}}')) x(v) +WHERE jsonb_hash(v)::bit(32) != jsonb_hash_extended(v, 0)::bit(32) + OR jsonb_hash(v)::bit(32) = jsonb_hash_extended(v, 
1)::bit(32); + +SELECT v as value, hash_range(v)::bit(32) as standard, + hash_range_extended(v, 0)::bit(32) as extended0, + hash_range_extended(v, 1)::bit(32) as extended1 +FROM (VALUES (int4range(10, 20)), (int4range(23, 43)), + (int4range(5675, 550273)), + (int4range(550274, 1550274)), (int4range(1550275, 208112489))) x(v) +WHERE hash_range(v)::bit(32) != hash_range_extended(v, 0)::bit(32) + OR hash_range(v)::bit(32) = hash_range_extended(v, 1)::bit(32); From 7327d44e8d14385cafdbb926cfd7dd15a9a67958 Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Thu, 7 Sep 2017 21:07:47 -0400 Subject: [PATCH 189/578] Refactor get_partition_for_tuple a bit. Pending patches for both default partitioning and hash partitioning find the current coding pattern to be inconvenient. Change it so that we switch on the partitioning method first and then do whatever is needed. Amul Sul, reviewed by Jeevan Ladhe, with a few adjustments by me. Discussion: http://postgr.es/m/CAAJ_b97mTb=dG2pv6+1ougxEVZFVnZJajW+0QHj46mEE7WsoOQ@mail.gmail.com Discussion: http://postgr.es/m/CAOgcT0M37CAztEinpvjJc18EdHfm23fw0EG9-36Ya=+rEFUqaQ@mail.gmail.com --- src/backend/catalog/partition.c | 80 +++++++++++++++++---------------- 1 file changed, 42 insertions(+), 38 deletions(-) diff --git a/src/backend/catalog/partition.c b/src/backend/catalog/partition.c index 3ea32102..20450d8a 100644 --- a/src/backend/catalog/partition.c +++ b/src/backend/catalog/partition.c @@ -1952,10 +1952,7 @@ get_partition_for_tuple(PartitionDispatch *pd, PartitionDispatch parent; Datum values[PARTITION_MAX_KEYS]; bool isnull[PARTITION_MAX_KEYS]; - int cur_offset, - cur_index; - int i, - result; + int result; ExprContext *ecxt = GetPerTupleExprContext(estate); TupleTableSlot *ecxt_scantuple_old = ecxt->ecxt_scantuple; @@ -1967,6 +1964,7 @@ get_partition_for_tuple(PartitionDispatch *pd, PartitionDesc partdesc = parent->partdesc; TupleTableSlot *myslot = parent->tupslot; TupleConversionMap *map = parent->tupmap; + int cur_index = -1; if (myslot != NULL && map != NULL) { @@ -1998,12 +1996,38 @@ get_partition_for_tuple(PartitionDispatch *pd, ecxt->ecxt_scantuple = slot; FormPartitionKeyDatum(parent, slot, estate, values, isnull); - if (key->strategy == PARTITION_STRATEGY_RANGE) + /* Route as appropriate based on partitioning strategy. */ + switch (key->strategy) { - /* - * Since we cannot route tuples with NULL partition keys through a - * range-partitioned table, simply return that no partition exists - */ + case PARTITION_STRATEGY_LIST: + + if (isnull[0]) + { + if (partition_bound_accepts_nulls(partdesc->boundinfo)) + cur_index = partdesc->boundinfo->null_index; + } + else + { + bool equal = false; + int cur_offset; + + cur_offset = partition_bound_bsearch(key, + partdesc->boundinfo, + values, + false, + &equal); + if (cur_offset >= 0 && equal) + cur_index = partdesc->boundinfo->indexes[cur_offset]; + } + break; + + case PARTITION_STRATEGY_RANGE: + { + bool equal = false; + int cur_offset; + int i; + + /* No range includes NULL. */ for (i = 0; i < key->partnatts; i++) { if (isnull[i]) @@ -2014,46 +2038,26 @@ get_partition_for_tuple(PartitionDispatch *pd, goto error_exit; } } - } - /* - * A null partition key is only acceptable if null-accepting list - * partition exists. 
- */ - cur_index = -1; - if (isnull[0] && partition_bound_accepts_nulls(partdesc->boundinfo)) - cur_index = partdesc->boundinfo->null_index; - else if (!isnull[0]) - { - /* Else bsearch in partdesc->boundinfo */ - bool equal = false; - - cur_offset = partition_bound_bsearch(key, partdesc->boundinfo, - values, false, &equal); - switch (key->strategy) - { - case PARTITION_STRATEGY_LIST: - if (cur_offset >= 0 && equal) - cur_index = partdesc->boundinfo->indexes[cur_offset]; - else - cur_index = -1; - break; - - case PARTITION_STRATEGY_RANGE: + cur_offset = partition_bound_bsearch(key, + partdesc->boundinfo, + values, + false, + &equal); /* - * Offset returned is such that the bound at offset is - * found to be less or equal with the tuple. So, the bound - * at offset+1 would be the upper bound. + * The offset returned is such that the bound at cur_offset + * is less than or equal to the tuple value, so the bound + * at offset+1 is the upper bound. */ cur_index = partdesc->boundinfo->indexes[cur_offset + 1]; + } break; default: elog(ERROR, "unexpected partition strategy: %d", (int) key->strategy); } - } /* * cur_index < 0 means we failed to find a partition of this parent. From d396fffdcdce9966c9108593cc38babda3f31cc8 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Thu, 25 Jun 2020 12:25:07 +0800 Subject: [PATCH 190/578] Allow a partitioned table to have a default partition. http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- doc/src/sgml/catalogs.sgml | 11 + doc/src/sgml/ref/alter_table.sgml | 31 +- doc/src/sgml/ref/create_table.sgml | 35 +- src/backend/catalog/heap.c | 41 +- src/backend/catalog/partition.c | 635 +++++++++++++++++++-- src/backend/commands/tablecmds.c | 160 +++++- src/backend/nodes/copyfuncs.c | 1 + src/backend/nodes/equalfuncs.c | 1 + src/backend/nodes/outfuncs.c | 1 + src/backend/nodes/readfuncs.c | 1 + src/backend/parser/gram.y | 27 +- src/backend/parser/parse_utilcmd.c | 12 + src/backend/utils/adt/ruleutils.c | 8 +- src/bin/psql/describe.c | 9 +- src/bin/psql/tab-complete.c | 4 +- src/include/catalog/partition.h | 7 + src/include/catalog/pg_partitioned_table.h | 71 +-- src/include/commands/tablecmds.h | 3 + src/include/nodes/parsenodes.h | 1 + src/test/regress/expected/alter_table.out | 49 ++ src/test/regress/expected/create_table.out | 20 + src/test/regress/expected/insert.out | 147 ++++- src/test/regress/expected/insert_1.out | 147 ++++- src/test/regress/expected/plancache.out | 26 + src/test/regress/expected/sanity_check.out | 4 + src/test/regress/expected/update.out | 33 ++ src/test/regress/sql/alter_table.sql | 47 ++ src/test/regress/sql/create_table.sql | 20 + src/test/regress/sql/insert.sql | 68 ++- src/test/regress/sql/plancache.sql | 21 + src/test/regress/sql/update.sql | 24 + 31 files changed, 1510 insertions(+), 155 deletions(-) diff --git a/doc/src/sgml/catalogs.sgml b/doc/src/sgml/catalogs.sgml index b3d23a64..fdac2074 100644 --- a/doc/src/sgml/catalogs.sgml +++ b/doc/src/sgml/catalogs.sgml @@ -4771,6 +4771,17 @@ SCRAM-SHA-256$<iteration count>:<salt>< The number of columns in partition key + + partdefid + oid + pg_class.oid + + The OID of the pg_class entry for the default partition + of this partitioned table, or zero if this partitioned table does not + have a default partition. 
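The new partdefid column makes the default partition discoverable from SQL. A minimal sketch (the parent table name "measurement" is a placeholder, not from this patch); partdefid is zero when the table has no default partition:

    -- OID of the default partition, or 0 if none exists
    SELECT partdefid
    FROM pg_partitioned_table
    WHERE partrelid = 'measurement'::regclass;

    -- Resolve it to a relation name via pg_class
    SELECT c.relname AS default_partition
    FROM pg_partitioned_table p
    JOIN pg_class c ON c.oid = p.partdefid
    WHERE p.partrelid = 'measurement'::regclass;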
+ + + partattrs int2vector diff --git a/doc/src/sgml/ref/alter_table.sgml b/doc/src/sgml/ref/alter_table.sgml index 9e0d6b31..06c5655e 100644 --- a/doc/src/sgml/ref/alter_table.sgml +++ b/doc/src/sgml/ref/alter_table.sgml @@ -34,7 +34,7 @@ ALTER TABLE [ IF EXISTS ] name ALTER TABLE ALL IN TABLESPACE name [ OWNED BY role_name [, ... ] ] SET TABLESPACE new_tablespace [ NOWAIT ] ALTER TABLE [ IF EXISTS ] name - ATTACH PARTITION partition_name FOR VALUES partition_bound_spec + ATTACH PARTITION partition_name { FOR VALUES partition_bound_spec | DEFAULT } ALTER TABLE [ IF EXISTS ] name DETACH PARTITION partition_name @@ -830,11 +830,18 @@ ALTER TABLE [ IF EXISTS ] name - ATTACH PARTITION partition_name FOR VALUES partition_bound_spec + ATTACH PARTITION partition_name { FOR VALUES partition_bound_spec | DEFAULT } This form attaches an existing table (which might itself be partitioned) - as a partition of the target table using the same syntax for + as a partition of the target table. The table can be attached + as a partition for specific values using FOR VALUES + or as a default partition by using DEFAULT + . + + + + A partition using FOR VALUES uses same syntax for partition_bound_spec as . The partition bound specification must correspond to the partitioning strategy and partition key of the @@ -871,6 +878,17 @@ ALTER TABLE [ IF EXISTS ] name (See the discussion in about constraints on the foreign table.) + + + When a table has a default partition, defining a new partition changes + the partition constraint for the default partition. The default + partition can't contain any rows that would need to be moved to the new + partition, and will be scanned to verify that none are present. This + scan, like the scan of the new partition, can be avoided if an + appropriate CHECK constraint is present. Also like + the scan of the new partition, it is always skipped when the default + partition is a foreign table. + @@ -1604,6 +1622,13 @@ ALTER TABLE cities ATTACH PARTITION cities_ab FOR VALUES IN ('a', 'b'); + + Attach a default partition to a partitioned table: + +ALTER TABLE cities + ATTACH PARTITION cities_partdef DEFAULT; + + Detach a partition from partitioned table: diff --git a/doc/src/sgml/ref/create_table.sgml b/doc/src/sgml/ref/create_table.sgml index cacdad1f..e46601b7 100644 --- a/doc/src/sgml/ref/create_table.sgml +++ b/doc/src/sgml/ref/create_table.sgml @@ -55,7 +55,7 @@ CREATE [ [ GLOBAL | LOCAL ] { TEMPORARY | TEMP } | UNLOGGED ] TABLE [ IF NOT EXI { column_name [ WITH OPTIONS ] [ column_constraint [ ... ] ] | table_constraint } [, ... ] -) ] FOR VALUES partition_bound_spec +) ] { FOR VALUES partition_bound_spec | DEFAULT } [ PARTITION BY { RANGE | LIST } ( { column_name | ( expression ) } [ COLLATE collation ] [ opclass ] [, ... ] ) ] [ WITH ( storage_parameter [= value] [, ... ] ) | WITH OIDS | WITHOUT OIDS ] [ ON COMMIT { PRESERVE ROWS | DELETE ROWS | DROP } ] @@ -262,11 +262,13 @@ FROM ( { numeric_literal | - PARTITION OF parent_table FOR VALUES partition_bound_spec + PARTITION OF parent_table { FOR VALUES partition_bound_spec | DEFAULT } Creates the table as a partition of the specified - parent table. + parent table. The table can be created either as a partition for specific + values using FOR VALUES or as a default partition + using DEFAULT. @@ -354,6 +356,26 @@ FROM ( { numeric_literal | + + If DEFAULT is specified, the table will be + created as a default partition of the parent table. The parent can + either be a list or range partitioned table. 
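A short SQL sketch of the documented behavior, with placeholder table names: rows whose partition key matches no other partition are routed to the default partition, as the next paragraph describes.

    CREATE TABLE orders (region text, amount int) PARTITION BY LIST (region);
    CREATE TABLE orders_eu    PARTITION OF orders FOR VALUES IN ('de', 'fr');
    CREATE TABLE orders_other PARTITION OF orders DEFAULT;

    INSERT INTO orders VALUES ('de', 10);  -- stored in orders_eu
    INSERT INTO orders VALUES ('jp', 20);  -- no matching list value, stored in orders_other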
A partition key value + not fitting into any other partition of the given parent will be + routed to the default partition. There can be only one default + partition for a given parent table. + + + + When a table has an existing DEFAULT partition and + a new partition is added to it, the existing default partition must + be scanned to verify that it does not contain any rows which properly + belong in the new partition. If the default partition contains a + large number of rows, this may be slow. The scan will be skipped if + the default partition is a foreign table or if it has a constraint which + proves that it cannot contain rows which should be placed in the new + partition. + + A partition must have the same column names and types as the partitioned table to which it belongs. If the parent is specified WITH @@ -1827,6 +1849,13 @@ CREATE TABLE cities_ab CREATE TABLE cities_ab_10000_to_100000 PARTITION OF cities_ab FOR VALUES FROM (10000) TO (100000); + + + Create a default partition: + +CREATE TABLE cities_partdef + PARTITION OF cities DEFAULT; + diff --git a/src/backend/catalog/heap.c b/src/backend/catalog/heap.c index d2058c17..0c382fe7 100644 --- a/src/backend/catalog/heap.c +++ b/src/backend/catalog/heap.c @@ -2784,7 +2784,8 @@ heap_drop_with_catalog(Oid relid) {// #lizard forgives Relation rel; HeapTuple tuple; - Oid parentOid = InvalidOid; + Oid parentOid = InvalidOid, + defaultPartOid = InvalidOid; /* * To drop a partition safely, we must grab exclusive lock on its parent, @@ -2800,6 +2801,14 @@ heap_drop_with_catalog(Oid relid) { parentOid = get_partition_parent(relid); LockRelationOid(parentOid, AccessExclusiveLock); + + /* + * If this is not the default partition, dropping it will change the + * default partition's partition constraint, so we must lock it. + */ + defaultPartOid = get_default_partition_oid(parentOid); + if (OidIsValid(defaultPartOid) && relid != defaultPartOid) + LockRelationOid(defaultPartOid, AccessExclusiveLock); } ReleaseSysCache(tuple); @@ -2851,6 +2860,13 @@ heap_drop_with_catalog(Oid relid) RemovePartitionKeyByRelId(relid); /* + * If the relation being dropped is the default partition itself, + * invalidate its entry in pg_partitioned_table. + */ + if (relid == defaultPartOid) + update_default_partition_oid(parentOid, InvalidOid); + + /* * Schedule unlinking of the relation's physical files at commit. */ if (rel->rd_rel->relkind != RELKIND_VIEW && @@ -2914,6 +2930,14 @@ heap_drop_with_catalog(Oid relid) if (OidIsValid(parentOid)) { /* + * If this is not the default partition, the partition constraint of + * the default partition has changed to include the portion of the key + * space previously covered by the dropped partition. + */ + if (OidIsValid(defaultPartOid) && relid != defaultPartOid) + CacheInvalidateRelcacheByRelid(defaultPartOid); + + /* * Invalidate the parent's relcache so that the partition is no longer * included in its partition descriptor. 
*/ @@ -4250,6 +4274,7 @@ StorePartitionKey(Relation rel, values[Anum_pg_partitioned_table_partrelid - 1] = ObjectIdGetDatum(RelationGetRelid(rel)); values[Anum_pg_partitioned_table_partstrat - 1] = CharGetDatum(strategy); values[Anum_pg_partitioned_table_partnatts - 1] = Int16GetDatum(partnatts); + values[Anum_pg_partitioned_table_partdefid - 1] = ObjectIdGetDatum(InvalidOid); values[Anum_pg_partitioned_table_partattrs - 1] = PointerGetDatum(partattrs_vec); values[Anum_pg_partitioned_table_partclass - 1] = PointerGetDatum(partopclass_vec); values[Anum_pg_partitioned_table_partcollation - 1] = PointerGetDatum(partcollation_vec); @@ -4427,7 +4452,8 @@ RemovePartitionKeyByRelId(Oid relid) * relispartition to true * * Also, invalidate the parent's relcache, so that the next rebuild will load - * the new partition's info into its partition descriptor. + * the new partition's info into its partition descriptor.  If there is a + * default partition, we must invalidate its relcache entry as well. */ void StorePartitionBound(Relation rel, Relation parent, PartitionBoundSpec *bound) @@ -4438,6 +4464,7 @@ StorePartitionBound(Relation rel, Relation parent, PartitionBoundSpec *bound) Datum new_val[Natts_pg_class]; bool new_null[Natts_pg_class], new_repl[Natts_pg_class]; + Oid defaultPartOid; /* Update pg_class tuple */ classRel = heap_open(RelationRelationId, RowExclusiveLock); @@ -4475,6 +4502,16 @@ StorePartitionBound(Relation rel, Relation parent, PartitionBoundSpec *bound) heap_freetuple(newtuple); heap_close(classRel, RowExclusiveLock); + /* + * The partition constraint for the default partition depends on the + * partition bounds of every other partition, so we must invalidate the + * relcache entry for that partition every time a partition is added or + * removed. 
+ */ + defaultPartOid = get_default_oid_from_partdesc(RelationGetPartitionDesc(parent)); + if (OidIsValid(defaultPartOid)) + CacheInvalidateRelcacheByRelid(defaultPartOid); + CacheInvalidateRelcache(parent); } diff --git a/src/backend/catalog/partition.c b/src/backend/catalog/partition.c index 20450d8a..a4ef01e7 100644 --- a/src/backend/catalog/partition.c +++ b/src/backend/catalog/partition.c @@ -27,7 +27,9 @@ #include "catalog/pg_inherits.h" #include "catalog/pg_inherits_fn.h" #include "catalog/pg_opclass.h" +#include "catalog/pg_partitioned_table.h" #include "catalog/pg_type.h" +#include "commands/tablecmds.h" #include "executor/executor.h" #include "miscadmin.h" #include "nodes/makefuncs.h" @@ -35,6 +37,7 @@ #include "nodes/parsenodes.h" #include "optimizer/clauses.h" #include "optimizer/planmain.h" +#include "optimizer/prep.h" #include "optimizer/var.h" #include "rewrite/rewriteManip.h" #include "storage/lmgr.h" @@ -80,9 +83,12 @@ typedef struct PartitionBoundInfoData * partitioned table) */ int null_index; /* Index of the null-accepting partition; -1 * if there isn't one */ + int default_index; /* Index of the default partition; -1 if there + * isn't one */ } PartitionBoundInfoData; #define partition_bound_accepts_nulls(bi) ((bi)->null_index != -1) +#define partition_bound_has_default(bi) ((bi)->default_index != -1) /* * When qsort'ing partition bounds after reading from the catalog, each bound @@ -120,8 +126,10 @@ static void get_range_key_properties(PartitionKey key, int keynum, ListCell **partexprs_item, Expr **keyCol, Const **lower_val, Const **upper_val); -static List *get_qual_for_list(PartitionKey key, PartitionBoundSpec *spec); -static List *get_qual_for_range(PartitionKey key, PartitionBoundSpec *spec); +static List *get_qual_for_list(Relation parent, PartitionBoundSpec *spec); +static List *get_qual_for_range(Relation parent, PartitionBoundSpec *spec, + bool for_default); +static List *get_range_nulltest(PartitionKey key); static List *generate_partition_qual(Relation rel); static PartitionRangeBound *make_one_range_bound(PartitionKey key, int index, @@ -162,6 +170,7 @@ RelationBuildPartitionDesc(Relation rel) MemoryContext oldcxt; int ndatums = 0; + int default_index = -1; /* List partitioning specific */ PartitionListValue **all_values = NULL; @@ -222,6 +231,22 @@ RelationBuildPartitionDesc(Relation rel) old_portable_output = set_portable_input(false); #endif boundspec = (Node *) stringToNode(TextDatumGetCString(datum)); + + /* + * Sanity check: If the PartitionBoundSpec says this is the default + * partition, its OID should correspond to whatever's stored in + * pg_partitioned_table.partdefid; if not, the catalog is corrupt. + */ + if (castNode(PartitionBoundSpec, boundspec)->is_default) + { + Oid partdefid; + + partdefid = get_default_partition_oid(RelationGetRelid(rel)); + if (partdefid != inhrelid) + elog(ERROR, "expected partdefid %u, but got %u", + inhrelid, partdefid); + } + #ifdef __TBASE__ set_portable_input(old_portable_output); #endif @@ -258,6 +283,18 @@ RelationBuildPartitionDesc(Relation rel) if (spec->strategy != PARTITION_STRATEGY_LIST) elog(ERROR, "invalid strategy in partition bound spec"); + /* + * Note the index of the partition bound spec for the default + * partition. There's no datum to add to the list of non-null + * datums for this partition. 
+ */ + if (spec->is_default) + { + default_index = i; + i++; + continue; + } + foreach(c, spec->listdatums) { Const *val = castNode(Const, lfirst(c)); @@ -340,6 +377,17 @@ RelationBuildPartitionDesc(Relation rel) if (spec->strategy != PARTITION_STRATEGY_RANGE) elog(ERROR, "invalid strategy in partition bound spec"); + /* + * Note the index of the partition bound spec for the default + * partition. There's no datum to add to the allbounds array + * for this partition. + */ + if (spec->is_default) + { + default_index = i++; + continue; + } + lower = make_one_range_bound(key, i, spec->lowerdatums, true); upper = make_one_range_bound(key, i, spec->upperdatums, @@ -349,10 +397,12 @@ RelationBuildPartitionDesc(Relation rel) j += 2; i++; } - Assert(j == 2 * nparts); + + Assert(j == nparts * 2 || + (default_index != -1 && j == (nparts - 1) * 2)); /* Sort all the bounds in ascending order */ - qsort_arg(all_bounds, 2 * nparts, + qsort_arg(all_bounds, j, sizeof(PartitionRangeBound *), qsort_partition_rbound_cmp, (void *) key); @@ -453,6 +503,7 @@ RelationBuildPartitionDesc(Relation rel) boundinfo = (PartitionBoundInfoData *) palloc0(sizeof(PartitionBoundInfoData)); boundinfo->strategy = key->strategy; + boundinfo->default_index = -1; boundinfo->ndatums = ndatums; boundinfo->null_index = -1; boundinfo->datums = (Datum **) palloc0(ndatums * sizeof(Datum *)); @@ -505,6 +556,21 @@ RelationBuildPartitionDesc(Relation rel) boundinfo->null_index = mapping[null_index]; } + /* Assign mapped index for the default partition. */ + if (default_index != -1) + { + /* + * The default partition accepts any value not + * specified in the lists of other partitions, hence + * it should not get mapped index while assigning + * those for non-null datums. + */ + Assert(default_index >= 0 && + mapping[default_index] == -1); + mapping[default_index] = next_index++; + boundinfo->default_index = mapping[default_index]; + } + /* All partition must now have a valid mapping */ Assert(next_index == nparts); break; @@ -559,6 +625,14 @@ RelationBuildPartitionDesc(Relation rel) boundinfo->indexes[i] = mapping[orig_index]; } } + + /* Assign mapped index for the default partition. */ + if (default_index != -1) + { + Assert(default_index >= 0 && mapping[default_index] == -1); + mapping[default_index] = next_index++; + boundinfo->default_index = mapping[default_index]; + } boundinfo->indexes[i] = -1; break; } @@ -609,6 +683,9 @@ partition_bounds_equal(int partnatts, int16 *parttyplen, bool *parttypbyval, if (b1->null_index != b2->null_index) return false; + if (b1->default_index != b2->default_index) + return false; + for (i = 0; i < b1->ndatums; i++) { int j; @@ -667,10 +744,24 @@ check_new_partition_bound(char *relname, Relation parent, {// #lizard forgives PartitionKey key = RelationGetPartitionKey(parent); PartitionDesc partdesc = RelationGetPartitionDesc(parent); + PartitionBoundInfo boundinfo = partdesc->boundinfo; ParseState *pstate = make_parsestate(NULL); int with = -1; bool overlap = false; + if (spec->is_default) + { + if (boundinfo == NULL || !partition_bound_has_default(boundinfo)) + return; + + /* Default partition already exists, error out. 
*/ + ereport(ERROR, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("partition \"%s\" conflicts with existing default partition \"%s\"", + relname, get_rel_name(partdesc->oids[boundinfo->default_index])), + parser_errposition(pstate, spec->location))); + } + switch (key->strategy) { case PARTITION_STRATEGY_LIST: @@ -679,13 +770,13 @@ check_new_partition_bound(char *relname, Relation parent, if (partdesc->nparts > 0) { - PartitionBoundInfo boundinfo = partdesc->boundinfo; ListCell *cell; Assert(boundinfo && boundinfo->strategy == PARTITION_STRATEGY_LIST && (boundinfo->ndatums > 0 || - partition_bound_accepts_nulls(boundinfo))); + partition_bound_accepts_nulls(boundinfo) || + partition_bound_has_default(boundinfo))); foreach(cell, spec->listdatums) { @@ -750,8 +841,10 @@ check_new_partition_bound(char *relname, Relation parent, int offset; bool equal; - Assert(boundinfo && boundinfo->ndatums > 0 && - boundinfo->strategy == PARTITION_STRATEGY_RANGE); + Assert(boundinfo && + boundinfo->strategy == PARTITION_STRATEGY_RANGE && + (boundinfo->ndatums > 0 || + partition_bound_has_default(boundinfo))); /* * Test whether the new lower bound (which is treated @@ -828,6 +921,139 @@ check_new_partition_bound(char *relname, Relation parent, } } +/* + * check_default_allows_bound + * + * This function checks if there exists a row in the default partition that + * would properly belong to the new partition being added. If it finds one, + * it throws an error. + */ +void +check_default_allows_bound(Relation parent, Relation default_rel, + PartitionBoundSpec *new_spec) +{ + List *new_part_constraints; + List *def_part_constraints; + List *all_parts; + ListCell *lc; + + new_part_constraints = (new_spec->strategy == PARTITION_STRATEGY_LIST) + ? get_qual_for_list(parent, new_spec) + : get_qual_for_range(parent, new_spec, false); + def_part_constraints = + get_proposed_default_constraint(new_part_constraints); + + /* + * If the existing constraints on the default partition imply that it will + * not contain any row that would belong to the new partition, we can + * avoid scanning the default partition. + */ + if (PartConstraintImpliedByRelConstraint(default_rel, def_part_constraints)) + { + ereport(INFO, + (errmsg("partition constraint for table \"%s\" is implied by existing constraints", + RelationGetRelationName(default_rel)))); + return; + } + + /* + * Scan the default partition and its subpartitions, and check for rows + * that do not satisfy the revised partition constraints. + */ + if (default_rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) + all_parts = find_all_inheritors(RelationGetRelid(default_rel), + AccessExclusiveLock, NULL); + else + all_parts = list_make1_oid(RelationGetRelid(default_rel)); + + foreach(lc, all_parts) + { + Oid part_relid = lfirst_oid(lc); + Relation part_rel; + Expr *constr; + Expr *partition_constraint; + EState *estate; + HeapTuple tuple; + ExprState *partqualstate = NULL; + Snapshot snapshot; + TupleDesc tupdesc; + ExprContext *econtext; + HeapScanDesc scan; + MemoryContext oldCxt; + TupleTableSlot *tupslot; + + /* Lock already taken above. */ + if (part_relid != RelationGetRelid(default_rel)) + part_rel = heap_open(part_relid, NoLock); + else + part_rel = default_rel; + + /* + * Only RELKIND_RELATION relations (i.e. leaf partitions) need to be + * scanned. 
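A sketch of the behavior check_default_allows_bound enforces, with placeholder names: creating a new partition fails if the default partition already holds a row that belongs in it (the error text follows the errmsg added here).

    CREATE TABLE t (a int) PARTITION BY LIST (a);
    CREATE TABLE t_def PARTITION OF t DEFAULT;
    INSERT INTO t VALUES (5);                            -- lands in t_def
    CREATE TABLE t_5 PARTITION OF t FOR VALUES IN (5);
    -- ERROR:  updated partition constraint for default partition "t_def"
    --         would be violated by some row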
+ */ + if (part_rel->rd_rel->relkind != RELKIND_RELATION) + { + if (part_rel->rd_rel->relkind == RELKIND_FOREIGN_TABLE) + ereport(WARNING, + (errcode(ERRCODE_CHECK_VIOLATION), + errmsg("skipped scanning foreign table \"%s\" which is a partition of default partition \"%s\"", + RelationGetRelationName(part_rel), + RelationGetRelationName(default_rel)))); + + if (RelationGetRelid(default_rel) != RelationGetRelid(part_rel)) + heap_close(part_rel, NoLock); + + continue; + } + + tupdesc = CreateTupleDescCopy(RelationGetDescr(part_rel)); + constr = linitial(def_part_constraints); + partition_constraint = (Expr *) + map_partition_varattnos((List *) constr, + 1, part_rel, parent, NULL); + estate = CreateExecutorState(); + + /* Build expression execution states for partition check quals */ + partqualstate = ExecPrepareExpr(partition_constraint, estate); + + econtext = GetPerTupleExprContext(estate); + snapshot = RegisterSnapshot(GetLatestSnapshot()); + scan = heap_beginscan(part_rel, snapshot, 0, NULL); + tupslot = MakeSingleTupleTableSlot(tupdesc); + + /* + * Switch to per-tuple memory context and reset it for each tuple + * produced, so we don't leak memory. + */ + oldCxt = MemoryContextSwitchTo(GetPerTupleMemoryContext(estate)); + + while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL) + { + ExecStoreTuple(tuple, tupslot, InvalidBuffer, false); + econtext->ecxt_scantuple = tupslot; + + if (!ExecCheck(partqualstate, econtext)) + ereport(ERROR, + (errcode(ERRCODE_CHECK_VIOLATION), + errmsg("updated partition constraint for default partition \"%s\" would be violated by some row", + RelationGetRelationName(default_rel)))); + + ResetExprContext(econtext); + CHECK_FOR_INTERRUPTS(); + } + + MemoryContextSwitchTo(oldCxt); + heap_endscan(scan); + UnregisterSnapshot(snapshot); + ExecDropSingleTupleTableSlot(tupslot); + FreeExecutorState(estate); + + if (RelationGetRelid(default_rel) != RelationGetRelid(part_rel)) + heap_close(part_rel, NoLock); /* keep the lock until commit */ + } +} + /* * get_partition_parent * @@ -892,12 +1118,12 @@ get_qual_from_partbound(Relation rel, Relation parent, { case PARTITION_STRATEGY_LIST: Assert(spec->strategy == PARTITION_STRATEGY_LIST); - my_qual = get_qual_for_list(key, spec); + my_qual = get_qual_for_list(parent, spec); break; case PARTITION_STRATEGY_RANGE: Assert(spec->strategy == PARTITION_STRATEGY_RANGE); - my_qual = get_qual_for_range(key, spec); + my_qual = get_qual_for_range(parent, spec, false); break; default: @@ -967,7 +1193,8 @@ RelationGetPartitionQual(Relation rel) * get_partition_qual_relid * * Returns an expression tree describing the passed-in relation's partition - * constraint. + * constraint. If there is no partition constraint returns NULL; this can + * happen if the default partition is the only partition. */ Expr * get_partition_qual_relid(Oid relid) @@ -980,7 +1207,10 @@ get_partition_qual_relid(Oid relid) if (rel->rd_rel->relispartition) { and_args = generate_partition_qual(rel); - if (list_length(and_args) > 1) + + if (and_args == NIL) + result = NULL; + else if (list_length(and_args) > 1) result = makeBoolExpr(AND_EXPR, and_args, -1); else result = linitial(and_args); @@ -1295,10 +1525,14 @@ make_partition_op_expr(PartitionKey key, int keynum, * * Returns an implicit-AND list of expressions to use as a list partition's * constraint, given the partition key and bound structures. + * + * The function returns NIL for a default partition when it's the only + * partition since in that case there is no constraint. 
*/ static List * -get_qual_for_list(PartitionKey key, PartitionBoundSpec *spec) -{// #lizard forgives +get_qual_for_list(Relation parent, PartitionBoundSpec *spec) +{ + PartitionKey key = RelationGetPartitionKey(parent); List *result; Expr *keyCol; ArrayExpr *arr; @@ -1325,7 +1559,54 @@ get_qual_for_list(PartitionKey key, PartitionBoundSpec *spec) else keyCol = (Expr *) copyObject(linitial(key->partexprs)); - /* Create list of Consts for the allowed values, excluding any nulls */ + /* + * For default list partition, collect datums for all the partitions. The + * default partition constraint should check that the partition key is + * equal to none of those. + */ + if (spec->is_default) + { + int i; + int ndatums = 0; + PartitionDesc pdesc = RelationGetPartitionDesc(parent); + PartitionBoundInfo boundinfo = pdesc->boundinfo; + + if (boundinfo) + { + ndatums = boundinfo->ndatums; + + if (partition_bound_accepts_nulls(boundinfo)) + list_has_null = true; + } + + /* + * If default is the only partition, there need not be any partition + * constraint on it. + */ + if (ndatums == 0 && !list_has_null) + return NIL; + + for (i = 0; i < ndatums; i++) + { + Const *val; + + /* Construct const from datum */ + val = makeConst(key->parttypid[0], + key->parttypmod[0], + key->parttypcoll[0], + key->parttyplen[0], + *boundinfo->datums[i], + false, /* isnull */ + key->parttypbyval[0]); + + arrelems = lappend(arrelems, val); + } + } + else + { + /* + * Create list of Consts for the allowed values, excluding any nulls. + */ foreach(cell, spec->listdatums) { Const *val = castNode(Const, lfirst(cell)); @@ -1335,6 +1616,7 @@ get_qual_for_list(PartitionKey key, PartitionBoundSpec *spec) else arrelems = lappend(arrelems, copyObject(val)); } + } if (arrelems) { @@ -1397,6 +1679,18 @@ get_qual_for_list(PartitionKey key, PartitionBoundSpec *spec) result = list_make1(nulltest); } + /* + * Note that, in general, applying NOT to a constraint expression doesn't + * necessarily invert the set of rows it accepts, because NOT (NULL) is + * NULL. However, the partition constraints we construct here never + * evaluate to NULL, so applying NOT works as intended. + */ + if (spec->is_default) + { + result = list_make1(make_ands_explicit(result)); + result = list_make1(makeBoolExpr(NOT_EXPR, result, -1)); + } + return result; } @@ -1453,6 +1747,53 @@ get_range_key_properties(PartitionKey key, int keynum, *upper_val = NULL; } +/* + * get_range_nulltest + * + * A non-default range partition table does not currently allow partition + * keys to be null, so emit an IS NOT NULL expression for each key column. 
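The constraint built here can be observed through pg_get_partition_constraintdef. A sketch with placeholder names (the deparsed text may differ slightly): the default partition has no constraint while it is the only partition, and gains the negation of the other partitions' bounds once they exist.

    CREATE TABLE t (a int) PARTITION BY LIST (a);
    CREATE TABLE t_def PARTITION OF t DEFAULT;
    SELECT pg_get_partition_constraintdef('t_def'::regclass);  -- NULL: no constraint yet

    CREATE TABLE t_12 PARTITION OF t FOR VALUES IN (1, 2);
    SELECT pg_get_partition_constraintdef('t_def'::regclass);
    -- roughly: NOT ((a IS NOT NULL) AND (a = ANY (ARRAY[1, 2])))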
+ */ +static List * +get_range_nulltest(PartitionKey key) +{ + List *result = NIL; + NullTest *nulltest; + ListCell *partexprs_item; + int i; + + partexprs_item = list_head(key->partexprs); + for (i = 0; i < key->partnatts; i++) + { + Expr *keyCol; + + if (key->partattrs[i] != 0) + { + keyCol = (Expr *) makeVar(1, + key->partattrs[i], + key->parttypid[i], + key->parttypmod[i], + key->parttypcoll[i], + 0); + } + else + { + if (partexprs_item == NULL) + elog(ERROR, "wrong number of partition key expressions"); + keyCol = copyObject(lfirst(partexprs_item)); + partexprs_item = lnext(partexprs_item); + } + + nulltest = makeNode(NullTest); + nulltest->arg = keyCol; + nulltest->nulltesttype = IS_NOT_NULL; + nulltest->argisrow = false; + nulltest->location = -1; + result = lappend(result, nulltest); + } + + return result; +} + /* * get_qual_for_range * @@ -1491,12 +1832,16 @@ get_range_key_properties(PartitionKey key, int keynum, * In most common cases with only one partition column, say a, the following * expression tree will be generated: a IS NOT NULL AND a >= al AND a < au * - * If we end up with an empty result list, we return a single-member list - * containing a constant TRUE, because callers expect a non-empty list. + * For default partition, it returns the negation of the constraints of all + * the other partitions. + * + * External callers should pass for_default as false; we set it to true only + * when recursing. */ static List * -get_qual_for_range(PartitionKey key, PartitionBoundSpec *spec) -{// #lizard forgives +get_qual_for_range(Relation parent, PartitionBoundSpec *spec, + bool for_default) +{ List *result = NIL; ListCell *cell1, *cell2, @@ -1506,10 +1851,10 @@ get_qual_for_range(PartitionKey key, PartitionBoundSpec *spec) j; PartitionRangeDatum *ldatum, *udatum; + PartitionKey key = RelationGetPartitionKey(parent); Expr *keyCol; Const *lower_val, *upper_val; - NullTest *nulltest; List *lower_or_arms, *upper_or_arms; int num_or_arms, @@ -1519,44 +1864,77 @@ get_qual_for_range(PartitionKey key, PartitionBoundSpec *spec) bool need_next_lower_arm, need_next_upper_arm; - lower_or_start_datum = list_head(spec->lowerdatums); - upper_or_start_datum = list_head(spec->upperdatums); - num_or_arms = key->partnatts; + if (spec->is_default) + { + List *or_expr_args = NIL; + PartitionDesc pdesc = RelationGetPartitionDesc(parent); + Oid *inhoids = pdesc->oids; + int nparts = pdesc->nparts, + i; - /* - * A range-partitioned table does not currently allow partition keys to be - * null, so emit an IS NOT NULL expression for each key column. 
- */ - partexprs_item = list_head(key->partexprs); - for (i = 0; i < key->partnatts; i++) + for (i = 0; i < nparts; i++) { - Expr *keyCol; - - if (key->partattrs[i] != 0) + Oid inhrelid = inhoids[i]; + HeapTuple tuple; + Datum datum; + bool isnull; + PartitionBoundSpec *bspec; + + tuple = SearchSysCache1(RELOID, inhrelid); + if (!HeapTupleIsValid(tuple)) + elog(ERROR, "cache lookup failed for relation %u", inhrelid); + + datum = SysCacheGetAttr(RELOID, tuple, + Anum_pg_class_relpartbound, + &isnull); + + Assert(!isnull); + bspec = (PartitionBoundSpec *) + stringToNode(TextDatumGetCString(datum)); + if (!IsA(bspec, PartitionBoundSpec)) + elog(ERROR, "expected PartitionBoundSpec"); + + if (!bspec->is_default) { - keyCol = (Expr *) makeVar(1, - key->partattrs[i], - key->parttypid[i], - key->parttypmod[i], - key->parttypcoll[i], - 0); + List *part_qual; + + part_qual = get_qual_for_range(parent, bspec, true); + + /* + * AND the constraints of the partition and add to + * or_expr_args + */ + or_expr_args = lappend(or_expr_args, list_length(part_qual) > 1 + ? makeBoolExpr(AND_EXPR, part_qual, -1) + : linitial(part_qual)); } - else + ReleaseSysCache(tuple); + } + + if (or_expr_args != NIL) { - if (partexprs_item == NULL) - elog(ERROR, "wrong number of partition key expressions"); - keyCol = copyObject(lfirst(partexprs_item)); - partexprs_item = lnext(partexprs_item); + /* OR all the non-default partition constraints; then negate it */ + result = lappend(result, + list_length(or_expr_args) > 1 + ? makeBoolExpr(OR_EXPR, or_expr_args, -1) + : linitial(or_expr_args)); + result = list_make1(makeBoolExpr(NOT_EXPR, result, -1)); } - nulltest = makeNode(NullTest); - nulltest->arg = keyCol; - nulltest->nulltesttype = IS_NOT_NULL; - nulltest->argisrow = false; - nulltest->location = -1; - result = lappend(result, nulltest); + return result; } + lower_or_start_datum = list_head(spec->lowerdatums); + upper_or_start_datum = list_head(spec->upperdatums); + num_or_arms = key->partnatts; + + /* + * If it is the recursive call for default, we skip the get_range_nulltest + * to avoid accumulating the NullTest on the same keys for each partition. + */ + if (!for_default) + result = get_range_nulltest(key); + /* * Iterate over the key columns and check if the corresponding lower and * upper datums are equal using the btree equality operator for the @@ -1778,9 +2156,16 @@ get_qual_for_range(PartitionKey key, PartitionBoundSpec *spec) ? makeBoolExpr(OR_EXPR, upper_or_arms, -1) : linitial(upper_or_arms)); - /* As noted above, caller expects the list to be non-empty. */ + /* + * As noted above, for non-default, we return list with constant TRUE. If + * the result is NIL during the recursive call for default, it implies + * this is the only other partition which can hold every value of the key + * except NULL. Hence we return the NullTest result skipped earlier. + */ if (result == NIL) - result = list_make1(makeBoolConst(true, false)); + result = for_default + ? get_range_nulltest(key) + : list_make1(makeBoolConst(true, false)); return result; } @@ -1788,7 +2173,8 @@ get_qual_for_range(PartitionKey key, PartitionBoundSpec *spec) /* * generate_partition_qual * - * Generate partition predicate from rel's partition bound expression + * Generate partition predicate from rel's partition bound expression. The + * function returns a NIL list if there is no predicate. * * Result expression tree is stored CacheMemoryContext to ensure it survives * as long as the relcache entry. 
But we should be running in a less long-lived @@ -2023,14 +2409,25 @@ get_partition_for_tuple(PartitionDispatch *pd, case PARTITION_STRATEGY_RANGE: { - bool equal = false; + bool equal = false, + range_partkey_has_null = false; int cur_offset; int i; - /* No range includes NULL. */ + /* + * No range includes NULL, so this will be accepted by the + * default partition if there is one, and otherwise + * rejected. + */ for (i = 0; i < key->partnatts; i++) { - if (isnull[i]) + if (isnull[i] && + partition_bound_has_default(partdesc->boundinfo)) + { + range_partkey_has_null = true; + break; + } + else if (isnull[i]) { *failed_at = parent; *failed_slot = slot; @@ -2039,6 +2436,13 @@ get_partition_for_tuple(PartitionDispatch *pd, } } + /* + * No need to search for partition, as the null key will + * be routed to the default partition. + */ + if (range_partkey_has_null) + break; + cur_offset = partition_bound_bsearch(key, partdesc->boundinfo, values, @@ -2046,9 +2450,9 @@ get_partition_for_tuple(PartitionDispatch *pd, &equal); /* - * The offset returned is such that the bound at cur_offset - * is less than or equal to the tuple value, so the bound - * at offset+1 is the upper bound. + * The offset returned is such that the bound at + * cur_offset is less than or equal to the tuple value, so + * the bound at offset+1 is the upper bound. */ cur_index = partdesc->boundinfo->indexes[cur_offset + 1]; } @@ -2061,8 +2465,16 @@ get_partition_for_tuple(PartitionDispatch *pd, /* * cur_index < 0 means we failed to find a partition of this parent. - * cur_index >= 0 means we either found the leaf partition, or the - * next parent to find a partition of. + * Use the default partition, if there is one. + */ + if (cur_index < 0) + cur_index = partdesc->boundinfo->default_index; + + /* + * If cur_index is still less than 0 at this point, there's no + * partition for this tuple. Otherwise, we either found the leaf + * partition, or a child partitioned table through which we have to + * route the tuple. */ if (cur_index < 0) { @@ -2116,6 +2528,8 @@ make_one_range_bound(PartitionKey key, int index, List *datums, bool lower) ListCell *lc; int i; + Assert(datums != NIL); + bound = (PartitionRangeBound *) palloc0(sizeof(PartitionRangeBound)); bound->index = index; bound->datums = (Datum *) palloc0(key->partnatts * sizeof(Datum)); @@ -2352,3 +2766,104 @@ partition_bound_bsearch(PartitionKey key, PartitionBoundInfo boundinfo, return lo; } + +/* + * get_default_oid_from_partdesc + * + * Given a partition descriptor, return the OID of the default partition, if + * one exists; else, return InvalidOid. + */ +Oid +get_default_oid_from_partdesc(PartitionDesc partdesc) +{ + if (partdesc && partdesc->boundinfo && + partition_bound_has_default(partdesc->boundinfo)) + return partdesc->oids[partdesc->boundinfo->default_index]; + + return InvalidOid; +} + +/* + * get_default_partition_oid + * + * Given a relation OID, return the OID of the default partition, if one + * exists. Use get_default_oid_from_partdesc where possible, for + * efficiency. 
+ */ +Oid +get_default_partition_oid(Oid parentId) +{ + HeapTuple tuple; + Oid defaultPartId = InvalidOid; + + tuple = SearchSysCache1(PARTRELID, ObjectIdGetDatum(parentId)); + + if (HeapTupleIsValid(tuple)) + { + Form_pg_partitioned_table part_table_form; + + part_table_form = (Form_pg_partitioned_table) GETSTRUCT(tuple); + defaultPartId = part_table_form->partdefid; + } + + ReleaseSysCache(tuple); + return defaultPartId; +} + +/* + * update_default_partition_oid + * + * Update pg_partition_table.partdefid with a new default partition OID. + */ +void +update_default_partition_oid(Oid parentId, Oid defaultPartId) +{ + HeapTuple tuple; + Relation pg_partitioned_table; + Form_pg_partitioned_table part_table_form; + + pg_partitioned_table = heap_open(PartitionedRelationId, RowExclusiveLock); + + tuple = SearchSysCacheCopy1(PARTRELID, ObjectIdGetDatum(parentId)); + + if (!HeapTupleIsValid(tuple)) + elog(ERROR, "cache lookup failed for partition key of relation %u", + parentId); + + part_table_form = (Form_pg_partitioned_table) GETSTRUCT(tuple); + part_table_form->partdefid = defaultPartId; + CatalogTupleUpdate(pg_partitioned_table, &tuple->t_self, tuple); + + heap_freetuple(tuple); + heap_close(pg_partitioned_table, RowExclusiveLock); +} + +/* + * get_proposed_default_constraint + * + * This function returns the negation of new_part_constraints, which + * would be an integral part of the default partition constraints after + * addition of the partition to which the new_part_constraints belongs. + */ +List * +get_proposed_default_constraint(List *new_part_constraints) +{ + Expr *defPartConstraint; + + defPartConstraint = make_ands_explicit(new_part_constraints); + + /* + * Derive the partition constraints of default partition by negating the + * given partition constraints. The partition constraint never evaluates + * to NULL, so negating it like this is safe. 
+ */ + defPartConstraint = makeBoolExpr(NOT_EXPR, + list_make1(defPartConstraint), + -1); + defPartConstraint = + (Expr *) eval_const_expressions(NULL, + (Node *) defPartConstraint); + defPartConstraint = canonicalize_qual(defPartConstraint); + + return list_make1(defPartConstraint); +} diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index db41c7fe..73e65cd9 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -213,6 +213,8 @@ typedef struct AlteredTableInfo bool chgPersistence; /* T if SET LOGGED/UNLOGGED is used */ char newrelpersistence; /* if above is true */ Expr *partition_constraint; /* for attach partition validation */ + /* true, if validating default due to some other attach/detach */ + bool validate_default; /* Objects to rebuild after completing ALTER TYPE operations */ List *changedConstraintOids; /* OIDs of constraints to rebuild */ List *changedConstraintDefs; /* string definitions of same */ @@ -538,11 +540,10 @@ static void CreateInheritance(Relation child_rel, Relation parent_rel); static void RemoveInheritance(Relation child_rel, Relation parent_rel); static ObjectAddress ATExecAttachPartition(List **wqueue, Relation rel, PartitionCmd *cmd); -static bool PartConstraintImpliedByRelConstraint(Relation scanrel, - List *partConstraint); static void ValidatePartitionConstraints(List **wqueue, Relation scanrel, List *scanrel_children, - List *partConstraint); + List *partConstraint, + bool validate_default); static ObjectAddress ATExecDetachPartition(Relation rel, RangeVar *name); #ifdef _SHARDING_ static void AtExecRebuildExtent(Relation rel); @@ -1006,8 +1007,10 @@ DefineRelation(CreateStmt *stmt, char relkind, Oid ownerId, { PartitionBoundSpec *bound; ParseState *pstate; - Oid parentId = linitial_oid(inheritOids); - Relation parent; + Oid parentId = linitial_oid(inheritOids), + defaultPartOid; + Relation parent, + defaultRel = NULL; /* Already have strong enough lock on the parent */ parent = heap_open(parentId, NoLock); @@ -1022,6 +1025,30 @@ DefineRelation(CreateStmt *stmt, char relkind, Oid ownerId, errmsg("\"%s\" is not partitioned", RelationGetRelationName(parent)))); + /* + * The partition constraint of the default partition depends on the + * partition bounds of every other partition. It is possible that + * another backend might be about to execute a query on the default + * partition table, and that the query relies on previously cached + * default partition constraints. We must therefore take a table lock + * strong enough to prevent all queries on the default partition from + * proceeding until we commit and send out a shared-cache-inval notice + * that will make them update their index lists. + * + * Order of locking: The relation being added won't be visible to + * other backends until it is committed, hence here in + * DefineRelation() the order of locking the default partition and the + * relation being added does not matter. But at all other places we + * need to lock the default relation before we lock the relation being + * added or removed i.e. we should take the lock in same order at all + * the places such that lock parent, lock default partition and then + * lock the partition so as to avoid a deadlock. 
+ */ + defaultPartOid = + get_default_oid_from_partdesc(RelationGetPartitionDesc(parent)); + if (OidIsValid(defaultPartOid)) + defaultRel = heap_open(defaultPartOid, AccessExclusiveLock); + /* Tranform the bound values */ pstate = make_parsestate(NULL); pstate->p_sourcetext = queryString; @@ -1030,14 +1057,31 @@ DefineRelation(CreateStmt *stmt, char relkind, Oid ownerId, /* * Check first that the new partition's bound is valid and does not - * overlap with any of existing partitions of the parent - note that - * it does not return on error. + * overlap with any of existing partitions of the parent. */ check_new_partition_bound(relname, parent, bound); + /* + * If the default partition exists, its partition constraints will + * change after the addition of this new partition such that it won't + * allow any row that qualifies for this new partition. So, check that + * the existing data in the default partition satisfies the constraint + * as it will exist after adding this partition. + */ + if (OidIsValid(defaultPartOid)) + { + check_default_allows_bound(parent, defaultRel, bound); + /* Keep the lock until commit. */ + heap_close(defaultRel, NoLock); + } + /* Update the pg_class entry. */ StorePartitionBound(rel, parent, bound); + /* Update the default partition oid */ + if (bound->is_default) + update_default_partition_oid(RelationGetRelid(parent), relationId); + heap_close(parent, NoLock); /* @@ -6188,9 +6232,16 @@ ATRewriteTable(AlteredTableInfo *tab, Oid OIDNewHeap, LOCKMODE lockmode) } if (partqualstate && !ExecCheck(partqualstate, econtext)) + { + if (tab->validate_default) + ereport(ERROR, + (errcode(ERRCODE_CHECK_VIOLATION), + errmsg("updated partition constraint for default partition would be violated by some row"))); + else ereport(ERROR, (errcode(ERRCODE_CHECK_VIOLATION), errmsg("partition constraint is violated by some row"))); + } /* Write the tuple out to the new relation */ if (newrel) @@ -16354,7 +16405,7 @@ ComputePartitionAttrs(Relation rel, List *partParams, AttrNumber *partattrs, * Existing constraints includes its check constraints and column-level * NOT NULL constraints and partConstraint describes the partition constraint. */ -static bool +bool PartConstraintImpliedByRelConstraint(Relation scanrel, List *partConstraint) { @@ -16441,7 +16492,8 @@ PartConstraintImpliedByRelConstraint(Relation scanrel, static void ValidatePartitionConstraints(List **wqueue, Relation scanrel, List *scanrel_children, - List *partConstraint) + List *partConstraint, + bool validate_default) { bool found_whole_row; ListCell *lc; @@ -16503,6 +16555,7 @@ ValidatePartitionConstraints(List **wqueue, Relation scanrel, /* Grab a work queue entry. */ tab = ATGetQueueEntry(wqueue, part_rel); tab->partition_constraint = (Expr *) linitial(my_partconstr); + tab->validate_default = validate_default; /* keep our lock until commit */ if (part_rel != scanrel) @@ -16530,6 +16583,17 @@ ATExecAttachPartition(List **wqueue, Relation rel, PartitionCmd *cmd) ObjectAddress address; const char *trigger_name; bool found_whole_row; + Oid defaultPartOid; + List *partBoundConstraint; + + /* + * We must lock the default partition, because attaching a new partition + * will change its partition constraint. 
+ */ + defaultPartOid = + get_default_oid_from_partdesc(RelationGetPartitionDesc(rel)); + if (OidIsValid(defaultPartOid)) + LockRelationOid(defaultPartOid, AccessExclusiveLock); attachrel = heap_openrv(cmd->name, AccessExclusiveLock); @@ -16686,6 +16750,11 @@ ATExecAttachPartition(List **wqueue, Relation rel, PartitionCmd *cmd) /* OK to create inheritance. Rest of the checks performed there */ CreateInheritance(attachrel, rel); + /* Update the default partition oid */ + if (cmd->bound->is_default) + update_default_partition_oid(RelationGetRelid(rel), + RelationGetRelid(attachrel)); + /* * Check that the new partition's bound is valid and does not overlap any * of existing partitions of the parent - note that it does not return on @@ -16702,10 +16771,15 @@ ATExecAttachPartition(List **wqueue, Relation rel, PartitionCmd *cmd) * If the parent itself is a partition, make sure to include its * constraint as well. */ - partConstraint = list_concat(get_qual_from_partbound(attachrel, rel, - cmd->bound), + partBoundConstraint = get_qual_from_partbound(attachrel, rel, cmd->bound); + partConstraint = list_concat(partBoundConstraint, RelationGetPartitionQual(rel)); - partConstraint = (List *) eval_const_expressions(NULL, + + /* Skip validation if there are no constraints to validate. */ + if (partConstraint) + { + partConstraint = + (List *) eval_const_expressions(NULL, (Node *) partConstraint); partConstraint = (List *) canonicalize_qual((Expr *) partConstraint); partConstraint = list_make1(make_ands_explicit(partConstraint)); @@ -16718,11 +16792,40 @@ ATExecAttachPartition(List **wqueue, Relation rel, PartitionCmd *cmd) rel, &found_whole_row); /* There can never be a whole-row reference here */ if (found_whole_row) - elog(ERROR, "unexpected whole-row reference found in partition key"); + elog(ERROR, + "unexpected whole-row reference found in partition key"); /* Validate partition constraints against the table being attached. */ ValidatePartitionConstraints(wqueue, attachrel, attachrel_children, - partConstraint); + partConstraint, false); + } + + /* + * Check whether default partition has a row that would fit the partition + * being attached. + */ + defaultPartOid = + get_default_oid_from_partdesc(RelationGetPartitionDesc(rel)); + if (OidIsValid(defaultPartOid)) + { + Relation defaultrel; + List *defaultrel_children; + List *defPartConstraint; + + /* We already have taken a lock on default partition. */ + defaultrel = heap_open(defaultPartOid, NoLock); + defPartConstraint = + get_proposed_default_constraint(partBoundConstraint); + defaultrel_children = + find_all_inheritors(defaultPartOid, + AccessExclusiveLock, NULL); + ValidatePartitionConstraints(wqueue, defaultrel, + defaultrel_children, + defPartConstraint, true); + + /* keep our lock until commit. */ + heap_close(defaultrel, NoLock); + } ObjectAddressSet(address, RelationRelationId, RelationGetRelid(attachrel)); @@ -16749,6 +16852,7 @@ ATExecDetachPartition(Relation rel, RangeVar *name) new_null[Natts_pg_class], new_repl[Natts_pg_class]; ObjectAddress address; + Oid defaultPartOid; #ifdef _MLS_ bool schema_bound; Oid partoid; @@ -16773,6 +16877,16 @@ ATExecDetachPartition(Relation rel, RangeVar *name) elog(ERROR, "must be owner of relation %s", NameStr(rel->rd_rel->relname)); } #endif + + /* + * We must lock the default partition, because detaching this partition + * will changing its partition constrant. 
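The attach path mirrors the create path: after a new partition is attached, the default partition (and any of its children) is scanned unless existing constraints make that unnecessary. A sketch with placeholder names, using the error text added to ATRewriteTable above:

    CREATE TABLE t (a int) PARTITION BY RANGE (a);
    CREATE TABLE t_def PARTITION OF t DEFAULT;
    INSERT INTO t VALUES (5);                     -- held by t_def
    CREATE TABLE t_0_10 (a int);
    ALTER TABLE t ATTACH PARTITION t_0_10 FOR VALUES FROM (0) TO (10);
    -- ERROR:  updated partition constraint for default partition would be
    --         violated by some row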
+ */ + defaultPartOid = + get_default_oid_from_partdesc(RelationGetPartitionDesc(rel)); + if (OidIsValid(defaultPartOid)) + LockRelationOid(defaultPartOid, AccessExclusiveLock); + partRel = heap_openrv(name, ShareUpdateExclusiveLock); /* All inheritance related checks are performed within the function */ @@ -16806,6 +16920,24 @@ ATExecDetachPartition(Relation rel, RangeVar *name) heap_freetuple(newtuple); heap_close(classRel, RowExclusiveLock); + if (OidIsValid(defaultPartOid)) + { + /* + * If the detach relation is the default partition itself, invalidate + * its entry in pg_partitioned_table. + */ + if (RelationGetRelid(partRel) == defaultPartOid) + update_default_partition_oid(RelationGetRelid(rel), InvalidOid); + else + { + /* + * We must invalidate default partition's relcache, for the same + * reasons explained in StorePartitionBound(). + */ + CacheInvalidateRelcacheByRelid(defaultPartOid); + } + } + /* * Invalidate the parent's relcache so that the partition is no longer * included in its partition descriptor. diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c index 6b103aa4..a7e1d32a 100644 --- a/src/backend/nodes/copyfuncs.c +++ b/src/backend/nodes/copyfuncs.c @@ -4828,6 +4828,7 @@ _copyPartitionBoundSpec(const PartitionBoundSpec *from) PartitionBoundSpec *newnode = makeNode(PartitionBoundSpec); COPY_SCALAR_FIELD(strategy); + COPY_SCALAR_FIELD(is_default); COPY_NODE_FIELD(listdatums); COPY_NODE_FIELD(lowerdatums); COPY_NODE_FIELD(upperdatums); diff --git a/src/backend/nodes/equalfuncs.c b/src/backend/nodes/equalfuncs.c index 3dbcb393..6efee4a8 100644 --- a/src/backend/nodes/equalfuncs.c +++ b/src/backend/nodes/equalfuncs.c @@ -2952,6 +2952,7 @@ static bool _equalPartitionBoundSpec(const PartitionBoundSpec *a, const PartitionBoundSpec *b) { COMPARE_SCALAR_FIELD(strategy); + COMPARE_SCALAR_FIELD(is_default); COMPARE_NODE_FIELD(listdatums); COMPARE_NODE_FIELD(lowerdatums); COMPARE_NODE_FIELD(upperdatums); diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c index f550ed23..daf0445f 100644 --- a/src/backend/nodes/outfuncs.c +++ b/src/backend/nodes/outfuncs.c @@ -5002,6 +5002,7 @@ _outPartitionBoundSpec(StringInfo str, const PartitionBoundSpec *node) WRITE_NODE_TYPE("PARTITIONBOUNDSPEC"); WRITE_CHAR_FIELD(strategy); + WRITE_BOOL_FIELD(is_default); WRITE_NODE_FIELD(listdatums); WRITE_NODE_FIELD(lowerdatums); WRITE_NODE_FIELD(upperdatums); diff --git a/src/backend/nodes/readfuncs.c b/src/backend/nodes/readfuncs.c index d653fbf3..32c879f7 100644 --- a/src/backend/nodes/readfuncs.c +++ b/src/backend/nodes/readfuncs.c @@ -4076,6 +4076,7 @@ _readPartitionBoundSpec(void) READ_LOCALS(PartitionBoundSpec); READ_CHAR_FIELD(strategy); + READ_BOOL_FIELD(is_default); READ_NODE_FIELD(listdatums); READ_NODE_FIELD(lowerdatums); READ_NODE_FIELD(upperdatums); diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y index ad22456b..41b045c3 100644 --- a/src/backend/parser/gram.y +++ b/src/backend/parser/gram.y @@ -618,7 +618,7 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); %type part_strategy %type part_elem %type part_params -%type ForValues +%type PartitionBoundSpec %type partbound_datum PartitionRangeDatum %type partbound_datum_list range_datum_list @@ -2108,7 +2108,7 @@ alter_group_cmds: partition_cmd: /* ALTER TABLE ATTACH PARTITION FOR VALUES */ - ATTACH PARTITION qualified_name ForValues + ATTACH PARTITION qualified_name PartitionBoundSpec { AlterTableCmd *n = makeNode(AlterTableCmd); PartitionCmd *cmd = 
makeNode(PartitionCmd); @@ -2833,13 +2833,14 @@ alter_identity_column_option: } ; -ForValues: +PartitionBoundSpec: /* a LIST partition */ FOR VALUES IN_P '(' partbound_datum_list ')' { PartitionBoundSpec *n = makeNode(PartitionBoundSpec); n->strategy = PARTITION_STRATEGY_LIST; + n->is_default = false; n->listdatums = $5; n->location = @3; @@ -2852,10 +2853,22 @@ ForValues: PartitionBoundSpec *n = makeNode(PartitionBoundSpec); n->strategy = PARTITION_STRATEGY_RANGE; + n->is_default = false; n->lowerdatums = $5; n->upperdatums = $9; n->location = @3; + $$ = n; + } + + /* a DEFAULT partition */ + | DEFAULT + { + PartitionBoundSpec *n = makeNode(PartitionBoundSpec); + + n->is_default = true; + n->location = @1; + $$ = n; } ; @@ -3417,7 +3430,7 @@ CreateStmt: CREATE OptTemp TABLE qualified_name '(' OptTableElementList ')' $$ = (Node *)n; } | CREATE OptTemp TABLE qualified_name PARTITION OF qualified_name - OptTypedTableElementList ForValues OptPartitionSpec OptWith + OptTypedTableElementList PartitionBoundSpec OptPartitionSpec OptWith OnCommitOption OptTableSpace { CreateStmt *n = makeNode(CreateStmt); @@ -3436,7 +3449,7 @@ CreateStmt: CREATE OptTemp TABLE qualified_name '(' OptTableElementList ')' $$ = (Node *)n; } | CREATE OptTemp TABLE IF_P NOT EXISTS qualified_name PARTITION OF - qualified_name OptTypedTableElementList ForValues OptPartitionSpec + qualified_name OptTypedTableElementList PartitionBoundSpec OptPartitionSpec OptWith OnCommitOption OptTableSpace { CreateStmt *n = makeNode(CreateStmt); @@ -5369,7 +5382,7 @@ CreateForeignTableStmt: $$ = (Node *) n; } | CREATE FOREIGN TABLE qualified_name - PARTITION OF qualified_name OptTypedTableElementList ForValues + PARTITION OF qualified_name OptTypedTableElementList PartitionBoundSpec SERVER name create_generic_options { CreateForeignTableStmt *n = makeNode(CreateForeignTableStmt); @@ -5390,7 +5403,7 @@ CreateForeignTableStmt: $$ = (Node *) n; } | CREATE FOREIGN TABLE IF_P NOT EXISTS qualified_name - PARTITION OF qualified_name OptTypedTableElementList ForValues + PARTITION OF qualified_name OptTypedTableElementList PartitionBoundSpec SERVER name create_generic_options { CreateForeignTableStmt *n = makeNode(CreateForeignTableStmt); diff --git a/src/backend/parser/parse_utilcmd.c b/src/backend/parser/parse_utilcmd.c index ad70850b..3695d9dc 100644 --- a/src/backend/parser/parse_utilcmd.c +++ b/src/backend/parser/parse_utilcmd.c @@ -4963,6 +4963,18 @@ transformPartitionBound(ParseState *pstate, Relation parent, /* Avoid scribbling on input */ result_spec = copyObject(spec); + if (spec->is_default) + { + /* + * In case of the default partition, parser had no way to identify the + * partition strategy. Assign the parent's strategy to the default + * partition bound spec. 
+ */ + result_spec->strategy = strategy; + + return result_spec; + } + if (strategy == PARTITION_STRATEGY_LIST) { ListCell *cell; diff --git a/src/backend/utils/adt/ruleutils.c b/src/backend/utils/adt/ruleutils.c index 2ce117ab..2b83875e 100644 --- a/src/backend/utils/adt/ruleutils.c +++ b/src/backend/utils/adt/ruleutils.c @@ -1845,7 +1845,7 @@ pg_get_partition_constraintdef(PG_FUNCTION_ARGS) constr_expr = get_partition_qual_relid(relationId); - /* Quick exit if not a partition */ + /* Quick exit if no partition constraint */ if (constr_expr == NULL) PG_RETURN_NULL(); @@ -9371,6 +9371,12 @@ get_rule_expr(Node *node, deparse_context *context, ListCell *cell; char *sep; + if (spec->is_default) + { + appendStringInfoString(buf, "DEFAULT"); + break; + } + switch (spec->strategy) { case PARTITION_STRATEGY_LIST: diff --git a/src/bin/psql/describe.c b/src/bin/psql/describe.c index 6de147c8..266c3c31 100644 --- a/src/bin/psql/describe.c +++ b/src/bin/psql/describe.c @@ -1906,19 +1906,20 @@ describeOneTableDetails(const char *schemaname, parent_name = PQgetvalue(result, 0, 0); partdef = PQgetvalue(result, 0, 1); - if (PQnfields(result) == 3) + if (PQnfields(result) == 3 && !PQgetisnull(result, 0, 2)) partconstraintdef = PQgetvalue(result, 0, 2); printfPQExpBuffer(&tmpbuf, _("Partition of: %s %s"), parent_name, partdef); printTableAddFooter(&cont, tmpbuf.data); - if (partconstraintdef) - { + /* If there isn't any constraint, show that explicitly */ + if (partconstraintdef == NULL || partconstraintdef[0] == '\0') + printfPQExpBuffer(&tmpbuf, _("No partition constraint")); + else printfPQExpBuffer(&tmpbuf, _("Partition constraint: %s"), partconstraintdef); printTableAddFooter(&cont, tmpbuf.data); - } PQclear(result); } diff --git a/src/bin/psql/tab-complete.c b/src/bin/psql/tab-complete.c index 75511955..4ce5a90e 100644 --- a/src/bin/psql/tab-complete.c +++ b/src/bin/psql/tab-complete.c @@ -2072,7 +2072,7 @@ psql_completion(const char *text, int start, int end) COMPLETE_WITH_SCHEMA_QUERY(Query_for_list_of_tables, ""); /* Limited completion support for partition bound specification */ else if (TailMatches3("ATTACH", "PARTITION", MatchAny)) - COMPLETE_WITH_CONST("FOR VALUES"); + COMPLETE_WITH_LIST2("FOR VALUES", "DEFAULT"); else if (TailMatches2("FOR", "VALUES")) COMPLETE_WITH_LIST2("FROM (", "IN ("); @@ -2541,7 +2541,7 @@ psql_completion(const char *text, int start, int end) COMPLETE_WITH_SCHEMA_QUERY(Query_for_list_of_partitioned_tables, ""); /* Limited completion support for partition bound specification */ else if (TailMatches3("PARTITION", "OF", MatchAny)) - COMPLETE_WITH_CONST("FOR VALUES"); + COMPLETE_WITH_LIST2("FOR VALUES", "DEFAULT"); /* CREATE TABLESPACE */ else if (Matches3("CREATE", "TABLESPACE", MatchAny)) diff --git a/src/include/catalog/partition.h b/src/include/catalog/partition.h index 2283c675..454a940a 100644 --- a/src/include/catalog/partition.h +++ b/src/include/catalog/partition.h @@ -99,4 +99,11 @@ extern int get_partition_for_tuple(PartitionDispatch *pd, EState *estate, PartitionDispatchData **failed_at, TupleTableSlot **failed_slot); +extern Oid get_default_oid_from_partdesc(PartitionDesc partdesc); +extern Oid get_default_partition_oid(Oid parentId); +extern void update_default_partition_oid(Oid parentId, Oid defaultPartId); +extern void check_default_allows_bound(Relation parent, Relation defaultRel, + PartitionBoundSpec *new_spec); +extern List *get_proposed_default_constraint(List *new_part_constaints); + #endif /* PARTITION_H */ diff --git 
a/src/include/catalog/pg_partitioned_table.h b/src/include/catalog/pg_partitioned_table.h index bf6e7a52..525e541f 100644 --- a/src/include/catalog/pg_partitioned_table.h +++ b/src/include/catalog/pg_partitioned_table.h @@ -1,8 +1,8 @@ /*------------------------------------------------------------------------- * * pg_partitioned_table.h - * definition of the system "partitioned table" relation - * along with the relation's initial contents. + * definition of the system "partitioned table" relation + * along with the relation's initial contents. * * * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group @@ -10,8 +10,8 @@ * src/include/catalog/pg_partitioned_table.h * * NOTES - * the genbki.sh script reads this file and generates .bki - * information from the DATA() statements. + * the genbki.sh script reads this file and generates .bki + * information from the DATA() statements. * *------------------------------------------------------------------------- */ @@ -21,54 +21,57 @@ #include "catalog/genbki.h" /* ---------------- - * pg_partitioned_table definition. cpp turns this into - * typedef struct FormData_pg_partitioned_table + * pg_partitioned_table definition. cpp turns this into + * typedef struct FormData_pg_partitioned_table * ---------------- */ #define PartitionedRelationId 3350 CATALOG(pg_partitioned_table,3350) BKI_WITHOUT_OIDS { - Oid partrelid; /* partitioned table oid */ - char partstrat; /* partitioning strategy */ - int16 partnatts; /* number of partition key columns */ + Oid partrelid; /* partitioned table oid */ + char partstrat; /* partitioning strategy */ + int16 partnatts; /* number of partition key columns */ + Oid partdefid; /* default partition oid; InvalidOid if there + * isn't one */ - /* - * variable-length fields start here, but we allow direct access to - * partattrs via the C struct. That's because the first variable-length - * field of a heap tuple can be reliably accessed using its C struct - * offset, as previous fields are all non-nullable fixed-length fields. - */ - int2vector partattrs; /* each member of the array is the attribute - * number of a partition key column, or 0 if - * the column is actually an expression */ + /* + * variable-length fields start here, but we allow direct access to + * partattrs via the C struct. That's because the first variable-length + * field of a heap tuple can be reliably accessed using its C struct + * offset, as previous fields are all non-nullable fixed-length fields. + */ + int2vector partattrs; /* each member of the array is the attribute + * number of a partition key column, or 0 if + * the column is actually an expression */ #ifdef CATALOG_VARLEN - oidvector partclass; /* operator class to compare keys */ - oidvector partcollation; /* user-specified collation for keys */ - pg_node_tree partexprs; /* list of expressions in the partition key; - * one item for each zero entry in partattrs[] */ + oidvector partclass; /* operator class to compare keys */ + oidvector partcollation; /* user-specified collation for keys */ + pg_node_tree partexprs; /* list of expressions in the partition key; + * one item for each zero entry in partattrs[] */ #endif } FormData_pg_partitioned_table; /* ---------------- - * Form_pg_partitioned_table corresponds to a pointer to a tuple with - * the format of pg_partitioned_table relation. + * Form_pg_partitioned_table corresponds to a pointer to a tuple with + * the format of pg_partitioned_table relation. 
* ---------------- */ typedef FormData_pg_partitioned_table *Form_pg_partitioned_table; /* ---------------- - * compiler constants for pg_partitioned_table + * compiler constants for pg_partitioned_table * ---------------- */ -#define Natts_pg_partitioned_table 7 -#define Anum_pg_partitioned_table_partrelid 1 -#define Anum_pg_partitioned_table_partstrat 2 -#define Anum_pg_partitioned_table_partnatts 3 -#define Anum_pg_partitioned_table_partattrs 4 -#define Anum_pg_partitioned_table_partclass 5 -#define Anum_pg_partitioned_table_partcollation 6 -#define Anum_pg_partitioned_table_partexprs 7 +#define Natts_pg_partitioned_table 8 +#define Anum_pg_partitioned_table_partrelid 1 +#define Anum_pg_partitioned_table_partstrat 2 +#define Anum_pg_partitioned_table_partnatts 3 +#define Anum_pg_partitioned_table_partdefid 4 +#define Anum_pg_partitioned_table_partattrs 5 +#define Anum_pg_partitioned_table_partclass 6 +#define Anum_pg_partitioned_table_partcollation 7 +#define Anum_pg_partitioned_table_partexprs 8 -#endif /* PG_PARTITIONED_TABLE_H */ +#endif /* PG_PARTITIONED_TABLE_H */ diff --git a/src/include/commands/tablecmds.h b/src/include/commands/tablecmds.h index e1a3252c..ea788476 100644 --- a/src/include/commands/tablecmds.h +++ b/src/include/commands/tablecmds.h @@ -18,6 +18,7 @@ #include "catalog/dependency.h" #include "catalog/objectaddress.h" #include "nodes/parsenodes.h" +#include "catalog/partition.h" #include "storage/lock.h" #include "utils/relcache.h" @@ -103,6 +104,8 @@ extern void RangeVarCallbackOwnsTable(const RangeVar *relation, extern void RangeVarCallbackOwnsRelation(const RangeVar *relation, Oid relId, Oid oldRelId, void *noCatalogs); +extern bool PartConstraintImpliedByRelConstraint(Relation scanrel, + List *partConstraint); #ifdef _MIGRATE_ extern bool oidarray_contian_oid(Oid *old_oids, int old_num, Oid new_oid); diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h index 527cb80d..c508a87d 100644 --- a/src/include/nodes/parsenodes.h +++ b/src/include/nodes/parsenodes.h @@ -876,6 +876,7 @@ typedef struct PartitionBoundSpec NodeTag type; char strategy; /* see PARTITION_STRATEGY codes above */ + bool is_default; /* is it a default partition bound? 
*/ /* Partitioning info for LIST strategy: */ List *listdatums; /* List of Consts (or A_Consts in raw tree) */ diff --git a/src/test/regress/expected/alter_table.out b/src/test/regress/expected/alter_table.out index f8f1494d..09a22ad0 100644 --- a/src/test/regress/expected/alter_table.out +++ b/src/test/regress/expected/alter_table.out @@ -3304,6 +3304,14 @@ SELECT conislocal, coninhcount FROM pg_constraint WHERE conrelid = 'part_1'::reg CREATE TABLE fail_part (LIKE part_1 INCLUDING CONSTRAINTS); ALTER TABLE list_parted ATTACH PARTITION fail_part FOR VALUES IN (1); ERROR: partition "fail_part" would overlap partition "part_1" +-- check that an existing table can be attached as a default partition +CREATE TABLE def_part (LIKE list_parted INCLUDING CONSTRAINTS); +ALTER TABLE list_parted ATTACH PARTITION def_part DEFAULT; +-- check attaching default partition fails if a default partition already +-- exists +CREATE TABLE fail_def_part (LIKE part_1 INCLUDING CONSTRAINTS); +ALTER TABLE list_parted ATTACH PARTITION fail_def_part DEFAULT; +ERROR: partition "fail_def_part" conflicts with existing default partition "def_part" -- check validation when attaching list partitions CREATE TABLE list_parted2 ( a int, @@ -3317,6 +3325,15 @@ ERROR: partition constraint is violated by some row -- should be ok after deleting the bad row DELETE FROM part_2; ALTER TABLE list_parted2 ATTACH PARTITION part_2 FOR VALUES IN (2); +-- check partition cannot be attached if default has some row for its values +CREATE TABLE list_parted2_def PARTITION OF list_parted2 DEFAULT; +INSERT INTO list_parted2_def VALUES (11, 'z'); +CREATE TABLE part_3 (LIKE list_parted2); +ALTER TABLE list_parted2 ATTACH PARTITION part_3 FOR VALUES IN (11); +ERROR: updated partition constraint for default partition would be violated by some row +-- should be ok after deleting the bad row +DELETE FROM list_parted2_def WHERE a = 11; +ALTER TABLE list_parted2 ATTACH PARTITION part_3 FOR VALUES IN (11); -- adding constraints that describe the desired partition constraint -- (or more restrictive) will help skip the validation scan CREATE TABLE part_3_4 ( @@ -3332,6 +3349,10 @@ ALTER TABLE list_parted2 DETACH PARTITION part_3_4; ALTER TABLE part_3_4 ALTER a SET NOT NULL; ALTER TABLE list_parted2 ATTACH PARTITION part_3_4 FOR VALUES IN (3, 4); INFO: partition constraint for table "part_3_4" is implied by existing constraints +-- check if default partition scan skipped +ALTER TABLE list_parted2_def ADD CONSTRAINT check_a CHECK (a IN (5, 6)); +CREATE TABLE part_55_66 PARTITION OF list_parted2 FOR VALUES IN (55, 66); +INFO: partition constraint for table "list_parted2_def" is implied by existing constraints -- check validation when attaching range partitions CREATE TABLE range_parted ( a int, @@ -3357,6 +3378,19 @@ CREATE TABLE part2 ( ); ALTER TABLE range_parted ATTACH PARTITION part2 FOR VALUES FROM (1, 10) TO (1, 20); INFO: partition constraint for table "part2" is implied by existing constraints +-- Create default partition +CREATE TABLE partr_def1 PARTITION OF range_parted DEFAULT; +-- Only one default partition is allowed, hence, following should give error +CREATE TABLE partr_def2 (LIKE part1 INCLUDING CONSTRAINTS); +ALTER TABLE range_parted ATTACH PARTITION partr_def2 DEFAULT; +ERROR: partition "partr_def2" conflicts with existing default partition "partr_def1" +-- Overlapping partitions cannot be attached, hence, following should give error +INSERT INTO partr_def1 VALUES (2, 10); +CREATE TABLE part3 (LIKE range_parted); +ALTER TABLE range_parted 
ATTACH partition part3 FOR VALUES FROM (2, 10) TO (2, 20); +ERROR: updated partition constraint for default partition would be violated by some row +-- Attaching partitions should be successful when there are no overlapping rows +ALTER TABLE range_parted ATTACH partition part3 FOR VALUES FROM (3, 10) TO (3, 20); -- check that leaf partitions are scanned when attaching a partitioned -- table CREATE TABLE part_5 ( @@ -3411,6 +3445,7 @@ ALTER TABLE part_7 ATTACH PARTITION part_7_a_null FOR VALUES IN ('a', null); INFO: partition constraint for table "part_7_a_null" is implied by existing constraints ALTER TABLE list_parted2 ATTACH PARTITION part_7 FOR VALUES IN (7); INFO: partition constraint for table "part_7" is implied by existing constraints +INFO: partition constraint for table "list_parted2_def" is implied by existing constraints -- Same example, but check this time that the constraint correctly detects -- violating rows ALTER TABLE list_parted2 DETACH PARTITION part_7; @@ -3424,7 +3459,20 @@ SELECT tableoid::regclass, a, b FROM part_7 order by a; (2 rows) ALTER TABLE list_parted2 ATTACH PARTITION part_7 FOR VALUES IN (7); +INFO: partition constraint for table "list_parted2_def" is implied by existing constraints ERROR: partition constraint is violated by some row +-- check that leaf partitions of default partition are scanned when +-- attaching a partitioned table. +ALTER TABLE part_5 DROP CONSTRAINT check_a; +CREATE TABLE part5_def PARTITION OF part_5 DEFAULT PARTITION BY LIST(a); +CREATE TABLE part5_def_p1 PARTITION OF part5_def FOR VALUES IN (5); +INSERT INTO part5_def_p1 VALUES (5, 'y'); +CREATE TABLE part5_p1 (LIKE part_5); +ALTER TABLE part_5 ATTACH PARTITION part5_p1 FOR VALUES IN ('y'); +ERROR: updated partition constraint for default partition would be violated by some row +-- should be ok after deleting the bad row +DELETE FROM part5_def_p1 WHERE b = 'y'; +ALTER TABLE part_5 ATTACH PARTITION part5_p1 FOR VALUES IN ('y'); -- check that the table being attached is not already a partition ALTER TABLE list_parted2 ATTACH PARTITION part_2 FOR VALUES IN (2); ERROR: "part_2" is already a partition @@ -3547,6 +3595,7 @@ ALTER TABLE list_parted2 ALTER COLUMN b TYPE text; ERROR: cannot alter type of column named in partition key -- cleanup DROP TABLE list_parted, list_parted2, range_parted; +DROP TABLE fail_def_part; -- more tests for certain multi-level partitioning scenarios create table p (a int, b int) partition by range (a, b); create table p1 (b int, a int not null) partition by range (b); diff --git a/src/test/regress/expected/create_table.out b/src/test/regress/expected/create_table.out index d4b9bf0e..982e28f0 100644 --- a/src/test/regress/expected/create_table.out +++ b/src/test/regress/expected/create_table.out @@ -470,6 +470,10 @@ CREATE TABLE fail_part PARTITION OF list_parted FOR VALUES FROM (1) TO (2); ERROR: invalid bound specification for a list partition LINE 1: ...BLE fail_part PARTITION OF list_parted FOR VALUES FROM (1) T... 
^ +-- check default partition cannot be created more than once +CREATE TABLE part_default PARTITION OF list_parted DEFAULT; +CREATE TABLE fail_default_part PARTITION OF list_parted DEFAULT; +ERROR: partition "fail_default_part" conflicts with existing default partition "part_default" -- specified literal can't be cast to the partition column data type CREATE TABLE bools ( a bool @@ -563,10 +567,15 @@ CREATE TABLE list_parted2 ( ) PARTITION BY LIST (a); CREATE TABLE part_null_z PARTITION OF list_parted2 FOR VALUES IN (null, 'z'); CREATE TABLE part_ab PARTITION OF list_parted2 FOR VALUES IN ('a', 'b'); +CREATE TABLE list_parted2_def PARTITION OF list_parted2 DEFAULT; CREATE TABLE fail_part PARTITION OF list_parted2 FOR VALUES IN (null); ERROR: partition "fail_part" would overlap partition "part_null_z" CREATE TABLE fail_part PARTITION OF list_parted2 FOR VALUES IN ('b', 'c'); ERROR: partition "fail_part" would overlap partition "part_ab" +-- check default partition overlap +INSERT INTO list_parted2 VALUES('X'); +CREATE TABLE fail_part PARTITION OF list_parted2 FOR VALUES IN ('W', 'X', 'Y'); +ERROR: updated partition constraint for default partition "list_parted2_def" would be violated by some row CREATE TABLE range_parted2 ( a int ) PARTITION BY RANGE (a); @@ -590,6 +599,16 @@ CREATE TABLE fail_part PARTITION OF range_parted2 FOR VALUES FROM (10) TO (30); ERROR: partition "fail_part" would overlap partition "part2" CREATE TABLE fail_part PARTITION OF range_parted2 FOR VALUES FROM (10) TO (50); ERROR: partition "fail_part" would overlap partition "part2" +-- Create a default partition for range partitioned table +CREATE TABLE range2_default PARTITION OF range_parted2 DEFAULT; +-- More than one default partition is not allowed, so this should give error +CREATE TABLE fail_default_part PARTITION OF range_parted2 DEFAULT; +ERROR: partition "fail_default_part" conflicts with existing default partition "range2_default" +-- Check if the range for default partitions overlap +INSERT INTO range_parted2 VALUES (85); +CREATE TABLE fail_part PARTITION OF range_parted2 FOR VALUES FROM (80) TO (90); +ERROR: updated partition constraint for default partition "range2_default" would be violated by some row +CREATE TABLE part4 PARTITION OF range_parted2 FOR VALUES FROM (90) TO (100); -- now check for multi-column range partition key CREATE TABLE range_parted3 ( a int, @@ -603,6 +622,7 @@ CREATE TABLE part11 PARTITION OF range_parted3 FOR VALUES FROM (1, 1) TO (1, 10) CREATE TABLE part12 PARTITION OF range_parted3 FOR VALUES FROM (1, 10) TO (1, maxvalue); CREATE TABLE fail_part PARTITION OF range_parted3 FOR VALUES FROM (1, 10) TO (1, 20); ERROR: partition "fail_part" would overlap partition "part12" +CREATE TABLE range3_default PARTITION OF range_parted3 DEFAULT; -- cannot create a partition that says column b is allowed to range -- from -infinity to +infinity, while there exist partitions that have -- more specific ranges diff --git a/src/test/regress/expected/insert.out b/src/test/regress/expected/insert.out index 944336b7..9d5b125e 100644 --- a/src/test/regress/expected/insert.out +++ b/src/test/regress/expected/insert.out @@ -221,17 +221,63 @@ insert into part_null values (null, 0); create table part_ee_ff partition of list_parted for values in ('ee', 'ff') partition by range (b); create table part_ee_ff1 partition of part_ee_ff for values from (1) to (10); create table part_ee_ff2 partition of part_ee_ff for values from (10) to (20); +-- test default partition +create table part_default partition of 
list_parted default; +-- Negative test: a row, which would fit in other partition, does not fit +-- default partition, even when inserted directly +insert into part_default values ('aa', 2); +ERROR: new row for relation "part_default" violates partition constraint +DETAIL: Failing row contains (aa, 2). +insert into part_default values (null, 2); +ERROR: new row for relation "part_default" violates partition constraint +DETAIL: Failing row contains (null, 2). +-- ok +insert into part_default values ('Zz', 2); +-- test if default partition works as expected for multi-level partitioned +-- table as well as when default partition itself is further partitioned +drop table part_default; +create table part_xx_yy partition of list_parted for values in ('xx', 'yy') partition by list (a); +create table part_xx_yy_p1 partition of part_xx_yy for values in ('xx'); +create table part_xx_yy_defpart partition of part_xx_yy default; +create table part_default partition of list_parted default partition by range(b); +create table part_default_p1 partition of part_default for values from (20) to (30); +create table part_default_p2 partition of part_default for values from (30) to (40); -- fail insert into part_ee_ff1 values ('EE', 11); ERROR: new row for relation "part_ee_ff1" violates partition constraint DETAIL: Failing row contains (EE, 11). +insert into part_default_p2 values ('gg', 43); +ERROR: new row for relation "part_default_p2" violates partition constraint +DETAIL: Failing row contains (gg, 43). -- fail (even the parent's, ie, part_ee_ff's partition constraint applies) insert into part_ee_ff1 values ('cc', 1); ERROR: new row for relation "part_ee_ff1" violates partition constraint DETAIL: Failing row contains (cc, 1). +insert into part_default values ('gg', 43); +ERROR: no partition of relation "part_default" found for row +DETAIL: Partition key of the failing row contains (b) = (43). -- ok insert into part_ee_ff1 values ('ff', 1); insert into part_ee_ff2 values ('ff', 11); +insert into part_default_p1 values ('cd', 25); +insert into part_default_p2 values ('de', 35); +insert into list_parted values ('ab', 21); +insert into list_parted values ('xx', 1); +insert into list_parted values ('yy', 2); +select tableoid::regclass, * from list_parted; + tableoid | a | b +--------------------+----+---- + part_cc_dd | cC | 1 + part_ee_ff1 | ff | 1 + part_ee_ff2 | ff | 11 + part_xx_yy_p1 | xx | 1 + part_xx_yy_defpart | yy | 2 + part_null | | 0 + part_default_p1 | cd | 25 + part_default_p1 | ab | 21 + part_default_p2 | de | 35 +(9 rows) + -- Check tuple routing for partitioned tables -- fail insert into range_parted values ('a', 0); @@ -251,6 +297,18 @@ insert into range_parted values ('b', 10); insert into range_parted values ('a'); ERROR: no partition of relation "range_parted" found for row DETAIL: Partition key of the failing row contains (a, (b + 0)) = (a, null). +-- Check default partition +create table part_def partition of range_parted default; +-- fail +insert into part_def values ('b', 10); +ERROR: new row for relation "part_def" violates partition constraint +DETAIL: Failing row contains (b, 10). 
+-- ok +insert into part_def values ('c', 10); +insert into range_parted values (null, null); +insert into range_parted values ('a', null); +insert into range_parted values (null, 19); +insert into range_parted values ('b', 20); select tableoid::regclass, * from range_parted order by 1, 2, 3; tableoid | a | b ----------+---+---- @@ -260,7 +318,12 @@ select tableoid::regclass, * from range_parted order by 1, 2, 3; part3 | b | 1 part4 | b | 10 part4 | b | 10 -(6 rows) + part_def | c | 10 + part_def | | + part_def | a | + part_def | | 19 + part_def | b | 20 +(11 rows) -- ok insert into list_parted values (null, 1); @@ -276,17 +339,22 @@ DETAIL: Partition key of the failing row contains (b) = (0). insert into list_parted values ('EE', 1); insert into part_ee_ff values ('EE', 10); select tableoid::regclass, * from list_parted; - tableoid | a | b --------------+----+---- - part_aa_bb | aA | - part_cc_dd | cC | 1 - part_ee_ff1 | ff | 1 - part_ee_ff1 | EE | 1 - part_ee_ff2 | ff | 11 - part_ee_ff2 | EE | 10 - part_null | | 0 - part_null | | 1 -(8 rows) + tableoid | a | b +--------------------+----+---- + part_aa_bb | aA | + part_cc_dd | cC | 1 + part_ee_ff1 | ff | 1 + part_ee_ff1 | EE | 1 + part_ee_ff2 | ff | 11 + part_ee_ff2 | EE | 10 + part_xx_yy_p1 | xx | 1 + part_xx_yy_defpart | yy | 2 + part_null | | 0 + part_null | | 1 + part_default_p1 | cd | 25 + part_default_p1 | ab | 21 + part_default_p2 | de | 35 +(13 rows) -- some more tests to exercise tuple-routing with multi-level partitioning create table part_gg partition of list_parted for values in ('gg') partition by range (b); @@ -318,6 +386,31 @@ select tableoid::regclass::text, a, min(b) as min_b, max(b) as max_b from list_p -- cleanup drop table range_parted, list_parted; +-- test that a default partition added as the first partition accepts any value +-- including null +create table list_parted (a int) partition by list (a); +create table part_default partition of list_parted default; +\d+ part_default + Table "public.part_default" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +--------+---------+-----------+----------+---------+---------+--------------+------------- + a | integer | | | | plain | | +Partition of: list_parted DEFAULT +No partition constraint + +insert into part_default values (null); +insert into part_default values (1); +insert into part_default values (-1); +select tableoid::regclass, a from list_parted; + tableoid | a +--------------+---- + part_default | + part_default | 1 + part_default | -1 +(3 rows) + +-- cleanup +drop table list_parted; -- more tests for certain multi-level partitioning scenarios create table mlparted (a int, b int) partition by range (a, b); create table mlparted1 (a int not null, b int not null) partition by range ((b+0)); @@ -458,6 +551,36 @@ ERROR: Postgres-XL does not support TRIGGER yet DETAIL: The feature is not currently supported insert into mlparted5 (a, b, c) values (1, 40, 'a'); drop table mlparted5; +alter table mlparted drop constraint check_b; +-- Check multi-level default partition +create table mlparted_def partition of mlparted default partition by range(a); +create table mlparted_def1 partition of mlparted_def for values from (40) to (50); +create table mlparted_def2 partition of mlparted_def for values from (50) to (60); +insert into mlparted values (40, 100); +insert into mlparted_def1 values (42, 100); +insert into mlparted_def2 values (54, 50); +-- fail +insert into mlparted values (70, 100); +ERROR: no partition of relation 
"mlparted_def" found for row +DETAIL: Partition key of the failing row contains (a) = (70). +insert into mlparted_def1 values (52, 50); +ERROR: new row for relation "mlparted_def1" violates partition constraint +DETAIL: Failing row contains (52, 50, null). +insert into mlparted_def2 values (34, 50); +ERROR: new row for relation "mlparted_def2" violates partition constraint +DETAIL: Failing row contains (34, 50, null). +-- ok +create table mlparted_defd partition of mlparted_def default; +insert into mlparted values (70, 100); +select tableoid::regclass, * from mlparted_def; + tableoid | a | b | c +---------------+----+-----+--- + mlparted_def1 | 40 | 100 | + mlparted_def1 | 42 | 100 | + mlparted_def2 | 54 | 50 | + mlparted_defd | 70 | 100 | +(4 rows) + -- check that message shown after failure to find a partition shows the -- appropriate key description (or none) in various situations create table key_desc (a int, b int) partition by list ((a+0)); diff --git a/src/test/regress/expected/insert_1.out b/src/test/regress/expected/insert_1.out index 592137e9..66cffedd 100644 --- a/src/test/regress/expected/insert_1.out +++ b/src/test/regress/expected/insert_1.out @@ -221,17 +221,63 @@ insert into part_null values (null, 0); create table part_ee_ff partition of list_parted for values in ('ee', 'ff') partition by range (b); create table part_ee_ff1 partition of part_ee_ff for values from (1) to (10); create table part_ee_ff2 partition of part_ee_ff for values from (10) to (20); +-- test default partition +create table part_default partition of list_parted default; +-- Negative test: a row, which would fit in other partition, does not fit +-- default partition, even when inserted directly +insert into part_default values ('aa', 2); +ERROR: new row for relation "part_default" violates partition constraint +DETAIL: Failing row contains (aa, 2). +insert into part_default values (null, 2); +ERROR: new row for relation "part_default" violates partition constraint +DETAIL: Failing row contains (null, 2). +-- ok +insert into part_default values ('Zz', 2); +-- test if default partition works as expected for multi-level partitioned +-- table as well as when default partition itself is further partitioned +drop table part_default; +create table part_xx_yy partition of list_parted for values in ('xx', 'yy') partition by list (a); +create table part_xx_yy_p1 partition of part_xx_yy for values in ('xx'); +create table part_xx_yy_defpart partition of part_xx_yy default; +create table part_default partition of list_parted default partition by range(b); +create table part_default_p1 partition of part_default for values from (20) to (30); +create table part_default_p2 partition of part_default for values from (30) to (40); -- fail insert into part_ee_ff1 values ('EE', 11); ERROR: new row for relation "part_ee_ff1" violates partition constraint DETAIL: Failing row contains (EE, 11). +insert into part_default_p2 values ('gg', 43); +ERROR: new row for relation "part_default_p2" violates partition constraint +DETAIL: Failing row contains (gg, 43). -- fail (even the parent's, ie, part_ee_ff's partition constraint applies) insert into part_ee_ff1 values ('cc', 1); ERROR: new row for relation "part_ee_ff1" violates partition constraint DETAIL: Failing row contains (cc, 1). +insert into part_default values ('gg', 43); +ERROR: no partition of relation "part_default" found for row +DETAIL: Partition key of the failing row contains (b) = (43). 
-- ok insert into part_ee_ff1 values ('ff', 1); insert into part_ee_ff2 values ('ff', 11); +insert into part_default_p1 values ('cd', 25); +insert into part_default_p2 values ('de', 35); +insert into list_parted values ('ab', 21); +insert into list_parted values ('xx', 1); +insert into list_parted values ('yy', 2); +select tableoid::regclass, * from list_parted; + tableoid | a | b +--------------------+----+---- + part_cc_dd | cC | 1 + part_ee_ff1 | ff | 1 + part_ee_ff2 | ff | 11 + part_xx_yy_p1 | xx | 1 + part_xx_yy_defpart | yy | 2 + part_null | | 0 + part_default_p1 | cd | 25 + part_default_p1 | ab | 21 + part_default_p2 | de | 35 +(9 rows) + -- Check tuple routing for partitioned tables -- fail insert into range_parted values ('a', 0); @@ -251,6 +297,18 @@ insert into range_parted values ('b', 10); insert into range_parted values ('a'); ERROR: no partition of relation "range_parted" found for row DETAIL: Partition key of the failing row contains (a, (b + 0)) = (a, null). +-- Check default partition +create table part_def partition of range_parted default; +-- fail +insert into part_def values ('b', 10); +ERROR: new row for relation "part_def" violates partition constraint +DETAIL: Failing row contains (b, 10). +-- ok +insert into part_def values ('c', 10); +insert into range_parted values (null, null); +insert into range_parted values ('a', null); +insert into range_parted values (null, 19); +insert into range_parted values ('b', 20); select tableoid::regclass, * from range_parted order by 1, 2, 3; tableoid | a | b ----------+---+---- @@ -260,7 +318,12 @@ select tableoid::regclass, * from range_parted order by 1, 2, 3; part3 | b | 1 part4 | b | 10 part4 | b | 10 -(6 rows) + part_def | c | 10 + part_def | | + part_def | a | + part_def | | 19 + part_def | b | 20 +(11 rows) -- ok insert into list_parted values (null, 1); @@ -276,17 +339,22 @@ DETAIL: Partition key of the failing row contains (b) = (0). 
insert into list_parted values ('EE', 1); insert into part_ee_ff values ('EE', 10); select tableoid::regclass, * from list_parted order by 1,2,3; - tableoid | a | b --------------+----+---- - part_aa_bb | aA | - part_cc_dd | cC | 1 - part_null | | 0 - part_null | | 1 - part_ee_ff1 | EE | 1 - part_ee_ff1 | ff | 1 - part_ee_ff2 | EE | 10 - part_ee_ff2 | ff | 11 -(8 rows) + tableoid | a | b +--------------------+----+---- + part_aa_bb | aA | + part_cc_dd | cC | 1 + part_ee_ff1 | ff | 1 + part_ee_ff1 | EE | 1 + part_ee_ff2 | ff | 11 + part_ee_ff2 | EE | 10 + part_xx_yy_p1 | xx | 1 + part_xx_yy_defpart | yy | 2 + part_null | | 0 + part_null | | 1 + part_default_p1 | cd | 25 + part_default_p1 | ab | 21 + part_default_p2 | de | 35 +(13 rows) -- some more tests to exercise tuple-routing with multi-level partitioning create table part_gg partition of list_parted for values in ('gg') partition by range (b); @@ -318,6 +386,31 @@ select tableoid::regclass::text, a, min(b) as min_b, max(b) as max_b from list_p -- cleanup drop table range_parted, list_parted; +-- test that a default partition added as the first partition accepts any value +-- including null +create table list_parted (a int) partition by list (a); +create table part_default partition of list_parted default; +\d+ part_default + Table "public.part_default" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +--------+---------+-----------+----------+---------+---------+--------------+------------- + a | integer | | | | plain | | +Partition of: list_parted DEFAULT +No partition constraint + +insert into part_default values (null); +insert into part_default values (1); +insert into part_default values (-1); +select tableoid::regclass, a from list_parted; + tableoid | a +--------------+---- + part_default | + part_default | 1 + part_default | -1 +(3 rows) + +-- cleanup +drop table list_parted; -- more tests for certain multi-level partitioning scenarios create table mlparted (a int, b int) partition by range (a, b); create table mlparted1 (a int not null, b int not null) partition by range ((b+0)); @@ -458,6 +551,36 @@ ERROR: Postgres-XL does not support TRIGGER yet DETAIL: The feature is not currently supported insert into mlparted5 (a, b, c) values (1, 40, 'a'); drop table mlparted5; +alter table mlparted drop constraint check_b; +-- Check multi-level default partition +create table mlparted_def partition of mlparted default partition by range(a); +create table mlparted_def1 partition of mlparted_def for values from (40) to (50); +create table mlparted_def2 partition of mlparted_def for values from (50) to (60); +insert into mlparted values (40, 100); +insert into mlparted_def1 values (42, 100); +insert into mlparted_def2 values (54, 50); +-- fail +insert into mlparted values (70, 100); +ERROR: no partition of relation "mlparted_def" found for row +DETAIL: Partition key of the failing row contains (a) = (70). +insert into mlparted_def1 values (52, 50); +ERROR: new row for relation "mlparted_def1" violates partition constraint +DETAIL: Failing row contains (52, 50, null). +insert into mlparted_def2 values (34, 50); +ERROR: new row for relation "mlparted_def2" violates partition constraint +DETAIL: Failing row contains (34, 50, null). 
+-- ok +create table mlparted_defd partition of mlparted_def default; +insert into mlparted values (70, 100); +select tableoid::regclass, * from mlparted_def; + tableoid | a | b | c +---------------+----+-----+--- + mlparted_def1 | 40 | 100 | + mlparted_def1 | 42 | 100 | + mlparted_def2 | 54 | 50 | + mlparted_defd | 70 | 100 | +(4 rows) + -- check that message shown after failure to find a partition shows the -- appropriate key description (or none) in various situations create table key_desc (a int, b int) partition by list ((a+0)); diff --git a/src/test/regress/expected/plancache.out b/src/test/regress/expected/plancache.out index 6d14b3a2..086f7977 100644 --- a/src/test/regress/expected/plancache.out +++ b/src/test/regress/expected/plancache.out @@ -252,3 +252,29 @@ NOTICE: 3 (1 row) +-- Check that addition or removal of any partition is correctly dealt with by +-- default partition table when it is being used in prepared statement. +create table list_parted (a int) partition by list(a); +create table list_part_null partition of list_parted for values in (null); +create table list_part_1 partition of list_parted for values in (1); +create table list_part_def partition of list_parted default; +prepare pstmt_def_insert (int) as insert into list_part_def values($1); +-- should fail +execute pstmt_def_insert(null); +ERROR: new row for relation "list_part_def" violates partition constraint +DETAIL: Failing row contains (null). +execute pstmt_def_insert(1); +ERROR: new row for relation "list_part_def" violates partition constraint +DETAIL: Failing row contains (1). +create table list_part_2 partition of list_parted for values in (2); +execute pstmt_def_insert(2); +ERROR: new row for relation "list_part_def" violates partition constraint +DETAIL: Failing row contains (2). +alter table list_parted detach partition list_part_null; +-- should be ok +execute pstmt_def_insert(null); +drop table list_part_1; +-- should be ok +execute pstmt_def_insert(1); +drop table list_parted, list_part_null; +deallocate pstmt_def_insert; diff --git a/src/test/regress/expected/sanity_check.out b/src/test/regress/expected/sanity_check.out index d490c40c..20bce908 100644 --- a/src/test/regress/expected/sanity_check.out +++ b/src/test/regress/expected/sanity_check.out @@ -77,6 +77,10 @@ mlparted12|f mlparted2|f mlparted3|f mlparted4|f +mlparted_def|f +mlparted_def1|f +mlparted_def2|f +mlparted_defd|f money_data|f num_data|f num_exp_add|t diff --git a/src/test/regress/expected/update.out b/src/test/regress/expected/update.out index f761e47e..2989db8f 100644 --- a/src/test/regress/expected/update.out +++ b/src/test/regress/expected/update.out @@ -218,5 +218,38 @@ ERROR: new row for relation "part_b_10_b_20" violates partition constraint DETAIL: Failing row contains (b, 9). 
-- ok update range_parted set b = b + 1 where b = 10; +-- Creating default partition for range +create table part_def partition of range_parted default; +\d+ part_def + Table "public.part_def" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +--------+---------+-----------+----------+---------+----------+--------------+------------- + a | text | | | | extended | | + b | integer | | | | plain | | +Partition of: range_parted DEFAULT +Partition constraint: (NOT (((a = 'a'::text) AND (b >= 1) AND (b < 10)) OR ((a = 'a'::text) AND (b >= 10) AND (b < 20)) OR ((a = 'b'::text) AND (b >= 1) AND (b < 10)) OR ((a = 'b'::text) AND (b >= 10) AND (b < 20)))) + +insert into range_parted values ('c', 9); +-- ok +update part_def set a = 'd' where a = 'c'; +-- fail +update part_def set a = 'a' where a = 'd'; +ERROR: new row for relation "part_def" violates partition constraint +DETAIL: Failing row contains (a, 9). +create table list_parted ( + a text, + b int +) partition by list (a); +create table list_part1 partition of list_parted for values in ('a', 'b'); +create table list_default partition of list_parted default; +insert into list_part1 values ('a', 1); +insert into list_default values ('d', 10); +-- fail +update list_default set a = 'a' where a = 'd'; +ERROR: new row for relation "list_default" violates partition constraint +DETAIL: Failing row contains (a, 10). +-- ok +update list_default set a = 'x' where a = 'd'; -- cleanup drop table range_parted; +drop table list_parted; diff --git a/src/test/regress/sql/alter_table.sql b/src/test/regress/sql/alter_table.sql index 55261e2d..f996ca7a 100644 --- a/src/test/regress/sql/alter_table.sql +++ b/src/test/regress/sql/alter_table.sql @@ -2112,6 +2112,13 @@ SELECT conislocal, coninhcount FROM pg_constraint WHERE conrelid = 'part_1'::reg -- check that the new partition won't overlap with an existing partition CREATE TABLE fail_part (LIKE part_1 INCLUDING CONSTRAINTS); ALTER TABLE list_parted ATTACH PARTITION fail_part FOR VALUES IN (1); +-- check that an existing table can be attached as a default partition +CREATE TABLE def_part (LIKE list_parted INCLUDING CONSTRAINTS); +ALTER TABLE list_parted ATTACH PARTITION def_part DEFAULT; +-- check attaching default partition fails if a default partition already +-- exists +CREATE TABLE fail_def_part (LIKE part_1 INCLUDING CONSTRAINTS); +ALTER TABLE list_parted ATTACH PARTITION fail_def_part DEFAULT; -- check validation when attaching list partitions CREATE TABLE list_parted2 ( @@ -2128,6 +2135,15 @@ ALTER TABLE list_parted2 ATTACH PARTITION part_2 FOR VALUES IN (2); DELETE FROM part_2; ALTER TABLE list_parted2 ATTACH PARTITION part_2 FOR VALUES IN (2); +-- check partition cannot be attached if default has some row for its values +CREATE TABLE list_parted2_def PARTITION OF list_parted2 DEFAULT; +INSERT INTO list_parted2_def VALUES (11, 'z'); +CREATE TABLE part_3 (LIKE list_parted2); +ALTER TABLE list_parted2 ATTACH PARTITION part_3 FOR VALUES IN (11); +-- should be ok after deleting the bad row +DELETE FROM list_parted2_def WHERE a = 11; +ALTER TABLE list_parted2 ATTACH PARTITION part_3 FOR VALUES IN (11); + -- adding constraints that describe the desired partition constraint -- (or more restrictive) will help skip the validation scan CREATE TABLE part_3_4 ( @@ -2145,6 +2161,9 @@ ALTER TABLE list_parted2 DETACH PARTITION part_3_4; ALTER TABLE part_3_4 ALTER a SET NOT NULL; ALTER TABLE list_parted2 ATTACH PARTITION part_3_4 FOR VALUES IN (3, 4); +-- check if default partition 
scan skipped +ALTER TABLE list_parted2_def ADD CONSTRAINT check_a CHECK (a IN (5, 6)); +CREATE TABLE part_55_66 PARTITION OF list_parted2 FOR VALUES IN (55, 66); -- check validation when attaching range partitions CREATE TABLE range_parted ( @@ -2173,6 +2192,21 @@ CREATE TABLE part2 ( ); ALTER TABLE range_parted ATTACH PARTITION part2 FOR VALUES FROM (1, 10) TO (1, 20); +-- Create default partition +CREATE TABLE partr_def1 PARTITION OF range_parted DEFAULT; + +-- Only one default partition is allowed, hence, following should give error +CREATE TABLE partr_def2 (LIKE part1 INCLUDING CONSTRAINTS); +ALTER TABLE range_parted ATTACH PARTITION partr_def2 DEFAULT; + +-- Overlapping partitions cannot be attached, hence, following should give error +INSERT INTO partr_def1 VALUES (2, 10); +CREATE TABLE part3 (LIKE range_parted); +ALTER TABLE range_parted ATTACH partition part3 FOR VALUES FROM (2, 10) TO (2, 20); + +-- Attaching partitions should be successful when there are no overlapping rows +ALTER TABLE range_parted ATTACH partition part3 FOR VALUES FROM (3, 10) TO (3, 20); + -- check that leaf partitions are scanned when attaching a partitioned -- table CREATE TABLE part_5 ( @@ -2235,6 +2269,18 @@ INSERT INTO part_7 (a, b) VALUES (8, null), (9, 'a'); SELECT tableoid::regclass, a, b FROM part_7 order by a; ALTER TABLE list_parted2 ATTACH PARTITION part_7 FOR VALUES IN (7); +-- check that leaf partitions of default partition are scanned when +-- attaching a partitioned table. +ALTER TABLE part_5 DROP CONSTRAINT check_a; +CREATE TABLE part5_def PARTITION OF part_5 DEFAULT PARTITION BY LIST(a); +CREATE TABLE part5_def_p1 PARTITION OF part5_def FOR VALUES IN (5); +INSERT INTO part5_def_p1 VALUES (5, 'y'); +CREATE TABLE part5_p1 (LIKE part_5); +ALTER TABLE part_5 ATTACH PARTITION part5_p1 FOR VALUES IN ('y'); +-- should be ok after deleting the bad row +DELETE FROM part5_def_p1 WHERE b = 'y'; +ALTER TABLE part_5 ATTACH PARTITION part5_p1 FOR VALUES IN ('y'); + -- check that the table being attached is not already a partition ALTER TABLE list_parted2 ATTACH PARTITION part_2 FOR VALUES IN (2); @@ -2330,6 +2376,7 @@ ALTER TABLE list_parted2 ALTER COLUMN b TYPE text; -- cleanup DROP TABLE list_parted, list_parted2, range_parted; +DROP TABLE fail_def_part; -- more tests for certain multi-level partitioning scenarios create table p (a int, b int) partition by range (a, b); diff --git a/src/test/regress/sql/create_table.sql b/src/test/regress/sql/create_table.sql index 51ad8cda..1a74fdd1 100644 --- a/src/test/regress/sql/create_table.sql +++ b/src/test/regress/sql/create_table.sql @@ -449,6 +449,10 @@ CREATE TABLE fail_part PARTITION OF list_parted FOR VALUES IN (); -- trying to specify range for list partitioned table CREATE TABLE fail_part PARTITION OF list_parted FOR VALUES FROM (1) TO (2); +-- check default partition cannot be created more than once +CREATE TABLE part_default PARTITION OF list_parted DEFAULT; +CREATE TABLE fail_default_part PARTITION OF list_parted DEFAULT; + -- specified literal can't be cast to the partition column data type CREATE TABLE bools ( a bool @@ -526,9 +530,13 @@ CREATE TABLE list_parted2 ( ) PARTITION BY LIST (a); CREATE TABLE part_null_z PARTITION OF list_parted2 FOR VALUES IN (null, 'z'); CREATE TABLE part_ab PARTITION OF list_parted2 FOR VALUES IN ('a', 'b'); +CREATE TABLE list_parted2_def PARTITION OF list_parted2 DEFAULT; CREATE TABLE fail_part PARTITION OF list_parted2 FOR VALUES IN (null); CREATE TABLE fail_part PARTITION OF list_parted2 FOR VALUES IN ('b', 'c'); +-- 
check default partition overlap +INSERT INTO list_parted2 VALUES('X'); +CREATE TABLE fail_part PARTITION OF list_parted2 FOR VALUES IN ('W', 'X', 'Y'); CREATE TABLE range_parted2 ( a int @@ -548,6 +556,17 @@ CREATE TABLE part3 PARTITION OF range_parted2 FOR VALUES FROM (30) TO (40); CREATE TABLE fail_part PARTITION OF range_parted2 FOR VALUES FROM (10) TO (30); CREATE TABLE fail_part PARTITION OF range_parted2 FOR VALUES FROM (10) TO (50); +-- Create a default partition for range partitioned table +CREATE TABLE range2_default PARTITION OF range_parted2 DEFAULT; + +-- More than one default partition is not allowed, so this should give error +CREATE TABLE fail_default_part PARTITION OF range_parted2 DEFAULT; + +-- Check if the range for default partitions overlap +INSERT INTO range_parted2 VALUES (85); +CREATE TABLE fail_part PARTITION OF range_parted2 FOR VALUES FROM (80) TO (90); +CREATE TABLE part4 PARTITION OF range_parted2 FOR VALUES FROM (90) TO (100); + -- now check for multi-column range partition key CREATE TABLE range_parted3 ( a int, @@ -561,6 +580,7 @@ CREATE TABLE part10 PARTITION OF range_parted3 FOR VALUES FROM (1, minvalue) TO CREATE TABLE part11 PARTITION OF range_parted3 FOR VALUES FROM (1, 1) TO (1, 10); CREATE TABLE part12 PARTITION OF range_parted3 FOR VALUES FROM (1, 10) TO (1, maxvalue); CREATE TABLE fail_part PARTITION OF range_parted3 FOR VALUES FROM (1, 10) TO (1, 20); +CREATE TABLE range3_default PARTITION OF range_parted3 DEFAULT; -- cannot create a partition that says column b is allowed to range -- from -infinity to +infinity, while there exist partitions that have diff --git a/src/test/regress/sql/insert.sql b/src/test/regress/sql/insert.sql index 75d801b9..bbfc03c4 100644 --- a/src/test/regress/sql/insert.sql +++ b/src/test/regress/sql/insert.sql @@ -132,13 +132,39 @@ create table part_ee_ff partition of list_parted for values in ('ee', 'ff') part create table part_ee_ff1 partition of part_ee_ff for values from (1) to (10); create table part_ee_ff2 partition of part_ee_ff for values from (10) to (20); +-- test default partition +create table part_default partition of list_parted default; +-- Negative test: a row, which would fit in other partition, does not fit +-- default partition, even when inserted directly +insert into part_default values ('aa', 2); +insert into part_default values (null, 2); +-- ok +insert into part_default values ('Zz', 2); +-- test if default partition works as expected for multi-level partitioned +-- table as well as when default partition itself is further partitioned +drop table part_default; +create table part_xx_yy partition of list_parted for values in ('xx', 'yy') partition by list (a); +create table part_xx_yy_p1 partition of part_xx_yy for values in ('xx'); +create table part_xx_yy_defpart partition of part_xx_yy default; +create table part_default partition of list_parted default partition by range(b); +create table part_default_p1 partition of part_default for values from (20) to (30); +create table part_default_p2 partition of part_default for values from (30) to (40); + -- fail insert into part_ee_ff1 values ('EE', 11); +insert into part_default_p2 values ('gg', 43); -- fail (even the parent's, ie, part_ee_ff's partition constraint applies) insert into part_ee_ff1 values ('cc', 1); +insert into part_default values ('gg', 43); -- ok insert into part_ee_ff1 values ('ff', 1); insert into part_ee_ff2 values ('ff', 11); +insert into part_default_p1 values ('cd', 25); +insert into part_default_p2 values ('de', 35); +insert into 
list_parted values ('ab', 21); +insert into list_parted values ('xx', 1); +insert into list_parted values ('yy', 2); +select tableoid::regclass, * from list_parted; -- Check tuple routing for partitioned tables @@ -154,8 +180,18 @@ insert into range_parted values ('b', 1); insert into range_parted values ('b', 10); -- fail (partition key (b+0) is null) insert into range_parted values ('a'); -select tableoid::regclass, * from range_parted order by 1, 2, 3; +-- Check default partition +create table part_def partition of range_parted default; +-- fail +insert into part_def values ('b', 10); +-- ok +insert into part_def values ('c', 10); +insert into range_parted values (null, null); +insert into range_parted values ('a', null); +insert into range_parted values (null, 19); +insert into range_parted values ('b', 20); +select tableoid::regclass, * from range_parted order by 1, 2, 3; -- ok insert into list_parted values (null, 1); insert into list_parted (a) values ('aA'); @@ -188,6 +224,18 @@ select tableoid::regclass::text, a, min(b) as min_b, max(b) as max_b from list_p -- cleanup drop table range_parted, list_parted; +-- test that a default partition added as the first partition accepts any value +-- including null +create table list_parted (a int) partition by list (a); +create table part_default partition of list_parted default; +\d+ part_default +insert into part_default values (null); +insert into part_default values (1); +insert into part_default values (-1); +select tableoid::regclass, a from list_parted; +-- cleanup +drop table list_parted; + -- more tests for certain multi-level partitioning scenarios create table mlparted (a int, b int) partition by range (a, b); create table mlparted1 (a int not null, b int not null) partition by range ((b+0)); @@ -269,6 +317,24 @@ create function mlparted5abrtrig_func() returns trigger as $$ begin new.c = 'b'; create trigger mlparted5abrtrig before insert on mlparted5a for each row execute procedure mlparted5abrtrig_func(); insert into mlparted5 (a, b, c) values (1, 40, 'a'); drop table mlparted5; +alter table mlparted drop constraint check_b; + +-- Check multi-level default partition +create table mlparted_def partition of mlparted default partition by range(a); +create table mlparted_def1 partition of mlparted_def for values from (40) to (50); +create table mlparted_def2 partition of mlparted_def for values from (50) to (60); +insert into mlparted values (40, 100); +insert into mlparted_def1 values (42, 100); +insert into mlparted_def2 values (54, 50); +-- fail +insert into mlparted values (70, 100); +insert into mlparted_def1 values (52, 50); +insert into mlparted_def2 values (34, 50); +-- ok +create table mlparted_defd partition of mlparted_def default; +insert into mlparted values (70, 100); + +select tableoid::regclass, * from mlparted_def; -- check that message shown after failure to find a partition shows the -- appropriate key description (or none) in various situations diff --git a/src/test/regress/sql/plancache.sql b/src/test/regress/sql/plancache.sql index ca6acc41..c9d9269d 100644 --- a/src/test/regress/sql/plancache.sql +++ b/src/test/regress/sql/plancache.sql @@ -156,3 +156,24 @@ end$$ language plpgsql; select cachebug(); select cachebug(); + +-- Check that addition or removal of any partition is correctly dealt with by +-- default partition table when it is being used in prepared statement. 
+create table list_parted (a int) partition by list(a); +create table list_part_null partition of list_parted for values in (null); +create table list_part_1 partition of list_parted for values in (1); +create table list_part_def partition of list_parted default; +prepare pstmt_def_insert (int) as insert into list_part_def values($1); +-- should fail +execute pstmt_def_insert(null); +execute pstmt_def_insert(1); +create table list_part_2 partition of list_parted for values in (2); +execute pstmt_def_insert(2); +alter table list_parted detach partition list_part_null; +-- should be ok +execute pstmt_def_insert(null); +drop table list_part_1; +-- should be ok +execute pstmt_def_insert(1); +drop table list_parted, list_part_null; +deallocate pstmt_def_insert; diff --git a/src/test/regress/sql/update.sql b/src/test/regress/sql/update.sql index 0b5b3309..42c5e405 100644 --- a/src/test/regress/sql/update.sql +++ b/src/test/regress/sql/update.sql @@ -125,5 +125,29 @@ update range_parted set b = b - 1 where b = 10; -- ok update range_parted set b = b + 1 where b = 10; +-- Creating default partition for range +create table part_def partition of range_parted default; +\d+ part_def +insert into range_parted values ('c', 9); +-- ok +update part_def set a = 'd' where a = 'c'; +-- fail +update part_def set a = 'a' where a = 'd'; + +create table list_parted ( + a text, + b int +) partition by list (a); +create table list_part1 partition of list_parted for values in ('a', 'b'); +create table list_default partition of list_parted default; +insert into list_part1 values ('a', 1); +insert into list_default values ('d', 10); + +-- fail +update list_default set a = 'a' where a = 'd'; +-- ok +update list_default set a = 'x' where a = 'd'; + -- cleanup drop table range_parted; +drop table list_parted; From dd83fa24abfc48cc67de41df9eb5e48d877bdd58 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Fri, 8 Sep 2017 19:04:32 -0400 Subject: [PATCH 191/578] Fix uninitialized-variable bug. map_partition_varattnos() failed to set its found_whole_row output parameter if the given expression list was NIL. This seems to be a pre-existing bug that chanced to be exposed by commit 6f6b99d13. It might be unreachable in v10, but I have little faith in that proposition, so back-patch. Per buildfarm. --- src/backend/catalog/partition.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/backend/catalog/partition.c b/src/backend/catalog/partition.c index a4ef01e7..92054927 100644 --- a/src/backend/catalog/partition.c +++ b/src/backend/catalog/partition.c @@ -1153,11 +1153,11 @@ map_partition_varattnos(List *expr, int target_varno, Relation partrel, Relation parent, bool *found_whole_row) { - AttrNumber *part_attnos; - bool my_found_whole_row; + bool my_found_whole_row = false; - if (expr == NIL) - return NIL; + if (expr != NIL) + { + AttrNumber *part_attnos; part_attnos = convert_tuples_by_name_map(RelationGetDescr(partrel), RelationGetDescr(parent), @@ -1168,6 +1168,8 @@ map_partition_varattnos(List *expr, int target_varno, RelationGetDescr(parent)->natts, RelationGetForm(partrel)->reltype, &my_found_whole_row); + } + if (found_whole_row) *found_whole_row = my_found_whole_row; From 2fd6649a29039b4cd1c1eda72cd60f5c4ffcc13a Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Thu, 14 Sep 2017 10:43:44 -0400 Subject: [PATCH 192/578] Set partitioned_rels appropriately when UNION ALL is used. 
In most cases, this omission won't matter, because the appropriate locks will have been acquired during parse/plan or by AcquireExecutorLocks. But it's a bug all the same. Report by Ashutosh Bapat. Patch by me, reviewed by Amit Langote. Discussion: http://postgr.es/m/CAFjFpRdHb_ZnoDTuBXqrudWXh3H1ibLkr6nHsCFT96fSK4DXtA@mail.gmail.com --- src/backend/optimizer/path/allpaths.c | 38 +++++++++++++++++++++++++-- src/backend/optimizer/plan/planner.c | 6 ++--- 2 files changed, 38 insertions(+), 6 deletions(-) diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c index 7a17b7d7..a3f54c14 100644 --- a/src/backend/optimizer/path/allpaths.c +++ b/src/backend/optimizer/path/allpaths.c @@ -1307,14 +1307,35 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel, ListCell *l; List *partitioned_rels = NIL; RangeTblEntry *rte; + bool build_partitioned_rels = false; + /* + * A plain relation will already have a PartitionedChildRelInfo if it is + * partitioned. For a subquery RTE, no PartitionedChildRelInfo exists; we + * collect all partitioned_rels associated with any child. (This assumes + * that we don't need to look through multiple levels of subquery RTEs; if + * we ever do, we could create a PartitionedChildRelInfo with the + * accumulated list of partitioned_rels which would then be found when + * populated our parent rel with paths. For the present, that appears to + * be unnecessary.) + */ rte = planner_rt_fetch(rel->relid, root); + switch (rte->rtekind) + { + case RTE_RELATION: if (rte->relkind == RELKIND_PARTITIONED_TABLE) { - partitioned_rels = get_partitioned_child_rels(root, rel->relid); - /* The root partitioned table is included as a child rel */ + partitioned_rels = + get_partitioned_child_rels(root, rel->relid); Assert(list_length(partitioned_rels) >= 1); } + break; + case RTE_SUBQUERY: + build_partitioned_rels = true; + break; + default: + elog(ERROR, "unexpcted rtekind: %d", (int) rte->rtekind); + } /* * For every non-dummy child, remember the cheapest path. Also, identify @@ -1327,6 +1348,19 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel, ListCell *lcp; /* + * If we need to build partitioned_rels, accumulate the partitioned + * rels for this child. + */ + if (build_partitioned_rels) + { + List *cprels; + + cprels = get_partitioned_child_rels(root, childrel->relid); + partitioned_rels = list_concat(partitioned_rels, + list_copy(cprels)); + } + + /* * If child has an unparameterized cheapest-total path, add that to * the unparameterized Append path we are constructing for the parent. * If not, there's no workable unparameterized path. diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index a3af8985..137e77ca 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -7800,7 +7800,8 @@ grouping_distribution_match(PlannerInfo *root, Query *parse, Path *path, * Returns a list of the RT indexes of the partitioned child relations * with rti as the root parent RT index. * - * Note: Only call this function on RTEs known to be partitioned tables. + * Note: This function might get called even for range table entries that + * are not partitioned tables; in such a case, it will simply return NIL. 
*/ List * get_partitioned_child_rels(PlannerInfo *root, Index rti) @@ -7819,9 +7820,6 @@ get_partitioned_child_rels(PlannerInfo *root, Index rti) } } - /* The root partitioned table is included as a child rel */ - Assert(list_length(result) >= 1); - return result; } From 7b9b82dda6a304f253939b993c03c90ff0bc0806 Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Thu, 14 Sep 2017 12:28:50 -0400 Subject: [PATCH 193/578] Make RelationGetPartitionDispatchInfo expand depth-first. With this change, the order of leaf partitions as returned by RelationGetPartitionDispatchInfo should now be the same as the order used by expand_inherited_rtentry. This will make it simpler for future patches to match up the partition dispatch information with the planner data structures. The new code is also, in my opinion anyway, simpler and easier to understand. Amit Langote, reviewed by Amit Khandekar. I also reviewed and made a few cosmetic revisions. Discussion: http://postgr.es/m/d98d4761-5071-1762-501e-0e15047c714b@lab.ntt.co.jp --- src/backend/catalog/partition.c | 220 +++++++++++-------------- src/backend/optimizer/prep/prepunion.c | 7 + 2 files changed, 100 insertions(+), 127 deletions(-) diff --git a/src/backend/catalog/partition.c b/src/backend/catalog/partition.c index 92054927..16920224 100644 --- a/src/backend/catalog/partition.c +++ b/src/backend/catalog/partition.c @@ -147,6 +147,8 @@ static int32 partition_bound_cmp(PartitionKey key, static int partition_bound_bsearch(PartitionKey key, PartitionBoundInfo boundinfo, void *probe, bool probe_is_bound, bool *is_equal); +static void get_partition_dispatch_recurse(Relation rel, Relation parent, + List **pds, List **leaf_part_oids); /* * RelationBuildPartitionDesc @@ -1224,21 +1226,6 @@ get_partition_qual_relid(Oid relid) return result; } -/* - * Append OIDs of rel's partitions to the list 'partoids' and for each OID, - * append pointer rel to the list 'parents'. - */ -#define APPEND_REL_PARTITION_OIDS(rel, partoids, parents) \ - do\ - {\ - int i;\ - for (i = 0; i < (rel)->rd_partdesc->nparts; i++)\ - {\ - (partoids) = lappend_oid((partoids), (rel)->rd_partdesc->oids[i]);\ - (parents) = lappend((parents), (rel));\ - }\ - } while(0) - /* * RelationGetPartitionDispatchInfo * Returns information necessary to route tuples down a partition tree @@ -1255,151 +1242,130 @@ PartitionDispatch * RelationGetPartitionDispatchInfo(Relation rel, int *num_parted, List **leaf_part_oids) { + List *pdlist = NIL; PartitionDispatchData **pd; - List *all_parts = NIL, - *all_parents = NIL, - *parted_rels, - *parted_rel_parents; - ListCell *lc1, - *lc2; - int i, - k, - offset; + ListCell *lc; + int i; - /* - * We rely on the relcache to traverse the partition tree to build both - * the leaf partition OIDs list and the array of PartitionDispatch objects - * for the partitioned tables in the tree. That means every partitioned - * table in the tree must be locked, which is fine since we require the - * caller to lock all the partitions anyway. - * - * For every partitioned table in the tree, starting with the root - * partitioned table, add its relcache entry to parted_rels, while also - * queuing its partitions (in the order in which they appear in the - * partition descriptor) to be looked at later in the same loop. This is - * a bit tricky but works because the foreach() macro doesn't fetch the - * next list element until the bottom of the loop. 
- */ - *num_parted = 1; - parted_rels = list_make1(rel); - /* Root partitioned table has no parent, so NULL for parent */ - parted_rel_parents = list_make1(NULL); - APPEND_REL_PARTITION_OIDS(rel, all_parts, all_parents); - forboth(lc1, all_parts, lc2, all_parents) - { - Oid partrelid = lfirst_oid(lc1); - Relation parent = lfirst(lc2); + Assert(rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE); - if (get_rel_relkind(partrelid) == RELKIND_PARTITIONED_TABLE) - { - /* - * Already locked by the caller. Note that it is the - * responsibility of the caller to close the below relcache entry, - * once done using the information being collected here (for - * example, in ExecEndModifyTable). - */ - Relation partrel = heap_open(partrelid, NoLock); + *num_parted = 0; + *leaf_part_oids = NIL; - (*num_parted)++; - parted_rels = lappend(parted_rels, partrel); - parted_rel_parents = lappend(parted_rel_parents, parent); - APPEND_REL_PARTITION_OIDS(partrel, all_parts, all_parents); + get_partition_dispatch_recurse(rel, NULL, &pdlist, leaf_part_oids); + *num_parted = list_length(pdlist); + pd = (PartitionDispatchData **) palloc(*num_parted * + sizeof(PartitionDispatchData *)); + i = 0; + foreach(lc, pdlist) + { + pd[i++] = lfirst(lc); } + + return pd; } /* - * We want to create two arrays - one for leaf partitions and another for - * partitioned tables (including the root table and internal partitions). - * While we only create the latter here, leaf partition array of suitable - * objects (such as, ResultRelInfo) is created by the caller using the - * list of OIDs we return. Indexes into these arrays get assigned in a - * breadth-first manner, whereby partitions of any given level are placed - * consecutively in the respective arrays. + * get_partition_dispatch_recurse + * Recursively expand partition tree rooted at rel + * + * As the partition tree is expanded in a depth-first manner, we mantain two + * global lists: of PartitionDispatch objects corresponding to partitioned + * tables in *pds and of the leaf partition OIDs in *leaf_part_oids. + * + * Note that the order of OIDs of leaf partitions in leaf_part_oids matches + * the order in which the planner's expand_partitioned_rtentry() processes + * them. It's not necessarily the case that the offsets match up exactly, + * because constraint exclusion might prune away some partitions on the + * planner side, whereas we'll always have the complete list; but unpruned + * partitions will appear in the same order in the plan as they are returned + * here. 
*/ - pd = (PartitionDispatchData **) palloc(*num_parted * - sizeof(PartitionDispatchData *)); - *leaf_part_oids = NIL; - i = k = offset = 0; - forboth(lc1, parted_rels, lc2, parted_rel_parents) - { - Relation partrel = lfirst(lc1); - Relation parent = lfirst(lc2); - PartitionKey partkey = RelationGetPartitionKey(partrel); - TupleDesc tupdesc = RelationGetDescr(partrel); - PartitionDesc partdesc = RelationGetPartitionDesc(partrel); - int j, - m; - - pd[i] = (PartitionDispatch) palloc(sizeof(PartitionDispatchData)); - pd[i]->reldesc = partrel; - pd[i]->key = partkey; - pd[i]->keystate = NIL; - pd[i]->partdesc = partdesc; +static void +get_partition_dispatch_recurse(Relation rel, Relation parent, + List **pds, List **leaf_part_oids) +{ + TupleDesc tupdesc = RelationGetDescr(rel); + PartitionDesc partdesc = RelationGetPartitionDesc(rel); + PartitionKey partkey = RelationGetPartitionKey(rel); + PartitionDispatch pd; + int i; + + check_stack_depth(); + + /* Build a PartitionDispatch for this table and add it to *pds. */ + pd = (PartitionDispatch) palloc(sizeof(PartitionDispatchData)); + *pds = lappend(*pds, pd); + pd->reldesc = rel; + pd->key = partkey; + pd->keystate = NIL; + pd->partdesc = partdesc; if (parent != NULL) { /* - * For every partitioned table other than root, we must store a - * tuple table slot initialized with its tuple descriptor and a - * tuple conversion map to convert a tuple from its parent's - * rowtype to its own. That is to make sure that we are looking at - * the correct row using the correct tuple descriptor when - * computing its partition key for tuple routing. + * For every partitioned table other than the root, we must store a + * tuple table slot initialized with its tuple descriptor and a tuple + * conversion map to convert a tuple from its parent's rowtype to its + * own. That is to make sure that we are looking at the correct row + * using the correct tuple descriptor when computing its partition key + * for tuple routing. */ - pd[i]->tupslot = MakeSingleTupleTableSlot(tupdesc); - pd[i]->tupmap = convert_tuples_by_name(RelationGetDescr(parent), + pd->tupslot = MakeSingleTupleTableSlot(tupdesc); + pd->tupmap = convert_tuples_by_name(RelationGetDescr(parent), tupdesc, gettext_noop("could not convert row type")); } else { /* Not required for the root partitioned table */ - pd[i]->tupslot = NULL; - pd[i]->tupmap = NULL; + pd->tupslot = NULL; + pd->tupmap = NULL; } - pd[i]->indexes = (int *) palloc(partdesc->nparts * sizeof(int)); /* - * Indexes corresponding to the internal partitions are multiplied by - * -1 to distinguish them from those of leaf partitions. Encountering - * an index >= 0 means we found a leaf partition, which is immediately - * returned as the partition we are looking for. A negative index - * means we found a partitioned table, whose PartitionDispatch object - * is located at the above index multiplied back by -1. Using the - * PartitionDispatch object, search is continued further down the - * partition tree. - */ - m = 0; - for (j = 0; j < partdesc->nparts; j++) - { - Oid partrelid = partdesc->oids[j]; + * Go look at each partition of this table. If it's a leaf partition, + * simply add its OID to *leaf_part_oids. If it's a partitioned table, + * recursively call get_partition_dispatch_recurse(), so that its + * partitions are processed as well and a corresponding PartitionDispatch + * object gets added to *pds. 
+ * + * About the values in pd->indexes: for a leaf partition, it contains the + * leaf partition's position in the global list *leaf_part_oids minus 1, + * whereas for a partitioned table partition, it contains the partition's + * position in the global list *pds multiplied by -1. The latter is + * multiplied by -1 to distinguish partitioned tables from leaf partitions + * when going through the values in pd->indexes. So, for example, when + * using it during tuple-routing, encountering a value >= 0 means we found + * a leaf partition. It is immediately returned as the index in the array + * of ResultRelInfos of all the leaf partitions, using which we insert the + * tuple into that leaf partition. A negative value means we found a + * partitioned table. The value multiplied by -1 is returned as the index + * in the array of PartitionDispatch objects of all partitioned tables in + * the tree. This value is used to continue the search in the next level + * of the partition tree. + */ + pd->indexes = (int *) palloc(partdesc->nparts * sizeof(int)); + for (i = 0; i < partdesc->nparts; i++) + { + Oid partrelid = partdesc->oids[i]; if (get_rel_relkind(partrelid) != RELKIND_PARTITIONED_TABLE) { *leaf_part_oids = lappend_oid(*leaf_part_oids, partrelid); - pd[i]->indexes[j] = k++; + pd->indexes[i] = list_length(*leaf_part_oids) - 1; } else { /* - * offset denotes the number of partitioned tables of upper - * levels including those of the current level. Any partition - * of this table must belong to the next level and hence will - * be placed after the last partitioned table of this level. + * We assume all tables in the partition tree were already locked + * by the caller. */ - pd[i]->indexes[j] = -(1 + offset + m); - m++; - } - } - i++; + Relation partrel = heap_open(partrelid, NoLock); - /* - * This counts the number of partitioned tables at upper levels - * including those of the current level. - */ - offset += m; + pd->indexes[i] = -list_length(*pds); + get_partition_dispatch_recurse(partrel, rel, pds, leaf_part_oids); + } } - - return pd; } /* Module-local functions */ diff --git a/src/backend/optimizer/prep/prepunion.c b/src/backend/optimizer/prep/prepunion.c index ec3de76b..1d8eb868 100644 --- a/src/backend/optimizer/prep/prepunion.c +++ b/src/backend/optimizer/prep/prepunion.c @@ -1644,6 +1644,13 @@ expand_inherited_rtentry(PlannerInfo *root, RangeTblEntry *rte, Index rti) root->append_rel_list = list_concat(root->append_rel_list, appinfos); } +/* + * expand_partitioned_rtentry + * Recursively expand an RTE for a partitioned table. + * + * Note that RelationGetPartitionDispatchInfo will expand partitions in the + * same order as this code. + */ static void expand_partitioned_rtentry(PlannerInfo *root, RangeTblEntry *parentrte, Index parentRTindex, Relation parentrel, From 3878c9057d4c3b013a575a4b19e489f4d2e2d577 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Thu, 25 Jun 2020 14:58:27 +0800 Subject: [PATCH 194/578] Expand partitioned table RTEs level by level, without flattening. 
http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/backend/optimizer/path/allpaths.c | 28 ++-- src/backend/optimizer/plan/initsplan.c | 22 ++- src/backend/optimizer/plan/planner.c | 80 ++++++--- src/backend/optimizer/prep/prepunion.c | 224 ++++++++++++++----------- src/include/nodes/relation.h | 8 +- src/test/regress/expected/inherit.out | 22 +++ src/test/regress/expected/join_4.out | 52 ++++++ src/test/regress/sql/inherit.sql | 17 ++ src/test/regress/sql/join.sql | 23 +++ 9 files changed, 344 insertions(+), 132 deletions(-) diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c index a3f54c14..60f3dd20 100644 --- a/src/backend/optimizer/path/allpaths.c +++ b/src/backend/optimizer/path/allpaths.c @@ -26,6 +26,7 @@ #include "catalog/pg_operator.h" #include "catalog/pg_proc.h" #include "foreign/fdwapi.h" +#include "miscadmin.h" #include "nodes/makefuncs.h" #include "nodes/nodeFuncs.h" #ifdef OPTIMIZER_DEBUG @@ -368,8 +369,8 @@ set_rel_size(PlannerInfo *root, RelOptInfo *rel, else if (rte->relkind == RELKIND_PARTITIONED_TABLE) { /* - * A partitioned table without leaf partitions is marked - * as a dummy rel. + * A partitioned table without any partitions is marked as + * a dummy rel. */ set_dummy_rel_pathlist(rel); } @@ -887,6 +888,9 @@ set_append_rel_size(PlannerInfo *root, RelOptInfo *rel, int nattrs; ListCell *l; + /* Guard against stack overflow due to overly deep inheritance tree. */ + check_stack_depth(); + Assert(IS_SIMPLE_REL(rel)); /* @@ -1310,25 +1314,23 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel, bool build_partitioned_rels = false; /* - * A plain relation will already have a PartitionedChildRelInfo if it is - * partitioned. For a subquery RTE, no PartitionedChildRelInfo exists; we - * collect all partitioned_rels associated with any child. (This assumes - * that we don't need to look through multiple levels of subquery RTEs; if - * we ever do, we could create a PartitionedChildRelInfo with the - * accumulated list of partitioned_rels which would then be found when - * populated our parent rel with paths. For the present, that appears to - * be unnecessary.) + * A root partition will already have a PartitionedChildRelInfo, and a + * non-root partitioned table doesn't need one, because its Append paths + * will get flattened into the parent anyway. For a subquery RTE, no + * PartitionedChildRelInfo exists; we collect all partitioned_rels + * associated with any child. (This assumes that we don't need to look + * through multiple levels of subquery RTEs; if we ever do, we could + * create a PartitionedChildRelInfo with the accumulated list of + * partitioned_rels which would then be found when populated our parent + * rel with paths. For the present, that appears to be unnecessary.) 
*/ rte = planner_rt_fetch(rel->relid, root); switch (rte->rtekind) { case RTE_RELATION: if (rte->relkind == RELKIND_PARTITIONED_TABLE) - { partitioned_rels = get_partitioned_child_rels(root, rel->relid); - Assert(list_length(partitioned_rels) >= 1); - } break; case RTE_SUBQUERY: build_partitioned_rels = true; diff --git a/src/backend/optimizer/plan/initsplan.c b/src/backend/optimizer/plan/initsplan.c index 7c743fd2..ab0972d5 100644 --- a/src/backend/optimizer/plan/initsplan.c +++ b/src/backend/optimizer/plan/initsplan.c @@ -15,6 +15,7 @@ #include "postgres.h" #include "catalog/pg_type.h" +#include "catalog/pg_class.h" #include "nodes/nodeFuncs.h" #include "optimizer/clauses.h" #include "optimizer/cost.h" @@ -634,11 +635,28 @@ create_lateral_join_info(PlannerInfo *root) for (rti = 1; rti < root->simple_rel_array_size; rti++) { RelOptInfo *brel = root->simple_rel_array[rti]; + RangeTblEntry *brte = root->simple_rte_array[rti]; - if (brel == NULL || brel->reloptkind != RELOPT_BASEREL) + if (brel == NULL) + continue; + + /* + * In the case of table inheritance, the parent RTE is directly linked + * to every child table via an AppendRelInfo. In the case of table + * partitioning, the inheritance hierarchy is expanded one level at a + * time rather than flattened. Therefore, an other member rel that is + * a partitioned table may have children of its own, and must + * therefore be marked with the appropriate lateral info so that those + * children eventually get marked also. + */ + Assert(IS_SIMPLE_REL(brel)); + Assert(brte); + if (brel->reloptkind == RELOPT_OTHER_MEMBER_REL && + (brte->rtekind != RTE_RELATION || + brte->relkind != RELKIND_PARTITIONED_TABLE)) continue; - if (root->simple_rte_array[rti]->inh) + if (brte->inh) { foreach(lc, root->append_rel_list) { diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index 137e77ca..11454dfb 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -1213,7 +1213,7 @@ static void inheritance_planner(PlannerInfo *root) {// #lizard forgives Query *parse = root->parse; - int parentRTindex = parse->resultRelation; + int top_parentRTindex = parse->resultRelation; Bitmapset *subqueryRTindexes; Bitmapset *modifiableARIindexes; int nominalRelation = -1; @@ -1231,6 +1231,10 @@ inheritance_planner(PlannerInfo *root) Index rti; RangeTblEntry *parent_rte; List *partitioned_rels = NIL; + PlannerInfo *parent_root; + Query *parent_parse; + Bitmapset *parent_relids = bms_make_singleton(top_parentRTindex); + PlannerInfo **parent_roots = NULL; Assert(parse->commandType != CMD_INSERT); @@ -1294,11 +1298,31 @@ inheritance_planner(PlannerInfo *root) * (including the root parent) as child members of the inheritance set do * not appear anywhere else in the plan. The situation is exactly the * opposite in the case of non-partitioned inheritance parent as described - * below. + * below. For the same reason, collect the list of descendant partitioned + * tables to be saved in ModifyTable node, so that executor can lock those + * as well. 
*/ - parent_rte = rt_fetch(parentRTindex, root->parse->rtable); + parent_rte = rt_fetch(top_parentRTindex, root->parse->rtable); if (parent_rte->relkind == RELKIND_PARTITIONED_TABLE) - nominalRelation = parentRTindex; + { + nominalRelation = top_parentRTindex; + partitioned_rels = get_partitioned_child_rels(root, top_parentRTindex); + /* The root partitioned table is included as a child rel */ + Assert(list_length(partitioned_rels) >= 1); + } + + /* + * The PlannerInfo for each child is obtained by translating the relevant + * members of the PlannerInfo for its immediate parent, which we find + * using the parent_relid in its AppendRelInfo. We save the PlannerInfo + * for each parent in an array indexed by relid for fast retrieval. Since + * the maximum number of parents is limited by the number of RTEs in the + * query, we use that number to allocate the array. An extra entry is + * needed since relids start from 1. + */ + parent_roots = (PlannerInfo **) palloc0((list_length(parse->rtable) + 1) * + sizeof(PlannerInfo *)); + parent_roots[top_parentRTindex] = root; /* * And now we can get on with generating a plan for each child table. @@ -1312,15 +1336,24 @@ inheritance_planner(PlannerInfo *root) Path *subpath; /* append_rel_list contains all append rels; ignore others */ - if (appinfo->parent_relid != parentRTindex) + if (!bms_is_member(appinfo->parent_relid, parent_relids)) continue; /* + * expand_inherited_rtentry() always processes a parent before any of + * that parent's children, so the parent_root for this relation should + * already be available. + */ + parent_root = parent_roots[appinfo->parent_relid]; + Assert(parent_root != NULL); + parent_parse = parent_root->parse; + + /* * We need a working copy of the PlannerInfo so that we can control * propagation of information back to the main copy. */ subroot = makeNode(PlannerInfo); - memcpy(subroot, root, sizeof(PlannerInfo)); + memcpy(subroot, parent_root, sizeof(PlannerInfo)); /* * Generate modified query with this rel as target. We first apply @@ -1329,15 +1362,15 @@ inheritance_planner(PlannerInfo *root) * then fool around with subquery RTEs. */ subroot->parse = (Query *) - adjust_appendrel_attrs(root, - (Node *) parse, + adjust_appendrel_attrs(parent_root, + (Node *) parent_parse, appinfo); /* * If there are securityQuals attached to the parent, move them to the * child rel (they've already been transformed properly for that). */ - parent_rte = rt_fetch(parentRTindex, subroot->parse->rtable); + parent_rte = rt_fetch(appinfo->parent_relid, subroot->parse->rtable); child_rte = rt_fetch(appinfo->child_relid, subroot->parse->rtable); child_rte->securityQuals = parent_rte->securityQuals; parent_rte->securityQuals = NIL; @@ -1348,7 +1381,7 @@ inheritance_planner(PlannerInfo *root) * executor doesn't need to see the modified copies --- we can just * pass it the original rowMarks list.) 
*/ - subroot->rowMarks = copyObject(root->rowMarks); + subroot->rowMarks = copyObject(parent_root->rowMarks); /* * The append_rel_list likewise might contain references to subquery @@ -1365,7 +1398,7 @@ inheritance_planner(PlannerInfo *root) ListCell *lc2; subroot->append_rel_list = NIL; - foreach(lc2, root->append_rel_list) + foreach(lc2, parent_root->append_rel_list) { AppendRelInfo *appinfo2 = (AppendRelInfo *) lfirst(lc2); @@ -1400,7 +1433,7 @@ inheritance_planner(PlannerInfo *root) ListCell *lr; rti = 1; - foreach(lr, parse->rtable) + foreach(lr, parent_parse->rtable) { RangeTblEntry *rte = (RangeTblEntry *) lfirst(lr); @@ -1447,6 +1480,22 @@ inheritance_planner(PlannerInfo *root) /* hack to mark target relation as an inheritance partition */ subroot->hasInheritedTarget = true; + /* + * If the child is further partitioned, remember it as a parent. Since + * a partitioned table does not have any data, we don't need to create + * a plan for it. We do, however, need to remember the PlannerInfo for + * use when processing its children. + */ + if (child_rte->inh) + { + Assert(child_rte->relkind == RELKIND_PARTITIONED_TABLE); + parent_relids = + bms_add_member(parent_relids, appinfo->child_relid); + parent_roots[appinfo->child_relid] = subroot; + + continue; + } + /* Generate Path(s) for accessing this result relation */ grouping_planner(subroot, true, 0.0 /* retrieve all tuples */ ); @@ -1577,13 +1626,6 @@ inheritance_planner(PlannerInfo *root) Assert(!parse->onConflict); } - if (parent_rte->relkind == RELKIND_PARTITIONED_TABLE) - { - partitioned_rels = get_partitioned_child_rels(root, parentRTindex); - /* The root partitioned table is included as a child rel */ - Assert(list_length(partitioned_rels) >= 1); - } - /* Result path must go into outer query's FINAL upperrel */ final_rel = fetch_upper_rel(root, UPPERREL_FINAL, NULL); diff --git a/src/backend/optimizer/prep/prepunion.c b/src/backend/optimizer/prep/prepunion.c index 1d8eb868..6057868c 100644 --- a/src/backend/optimizer/prep/prepunion.c +++ b/src/backend/optimizer/prep/prepunion.c @@ -104,16 +104,14 @@ static void expand_inherited_rtentry(PlannerInfo *root, RangeTblEntry *rte, static void expand_partitioned_rtentry(PlannerInfo *root, RangeTblEntry *parentrte, Index parentRTindex, Relation parentrel, - PlanRowMark *parentrc, PartitionDesc partdesc, - LOCKMODE lockmode, - bool *has_child, List **appinfos, - List **partitioned_child_rels); + PlanRowMark *top_parentrc, LOCKMODE lockmode, + List **appinfos, List **partitioned_child_rels); static void expand_single_inheritance_child(PlannerInfo *root, RangeTblEntry *parentrte, Index parentRTindex, Relation parentrel, - PlanRowMark *parentrc, Relation childrel, - bool *has_child, List **appinfos, - List **partitioned_child_rels); + PlanRowMark *top_parentrc, Relation childrel, + List **appinfos, RangeTblEntry **childrte_p, + Index *childRTindex_p); static void make_inh_translation_list(Relation oldrelation, Relation newrelation, Index newvarno, @@ -1427,9 +1425,9 @@ expand_inherited_tables(PlannerInfo *root) ListCell *rl; /* - * expand_inherited_rtentry may add RTEs to parse->rtable; there is no - * need to scan them since they can't have inh=true. So just scan as far - * as the original end of the rtable list. + * expand_inherited_rtentry may add RTEs to parse->rtable. The function is + * expected to recursively handle any RTEs that it creates with inh=true. + * So just scan as far as the original end of the rtable list. 
*/ nrtes = list_length(root->parse->rtable); rl = list_head(root->parse->rtable); @@ -1471,11 +1469,7 @@ expand_inherited_rtentry(PlannerInfo *root, RangeTblEntry *rte, Index rti) Relation oldrelation; LOCKMODE lockmode; List *inhOIDs; - List *appinfos; ListCell *l; - bool has_child; - PartitionedChildRelInfo *pcinfo; - List *partitioned_child_rels = NIL; /* Does RT entry allow inheritance? */ if (!rte->inh) @@ -1546,27 +1540,44 @@ expand_inherited_rtentry(PlannerInfo *root, RangeTblEntry *rte, Index rti) oldrelation = heap_open(parentOID, NoLock); /* Scan the inheritance set and expand it */ - appinfos = NIL; - has_child = false; if (RelationGetPartitionDesc(oldrelation) != NULL) { + List *partitioned_child_rels = NIL; + + Assert(rte->relkind == RELKIND_PARTITIONED_TABLE); + /* * If this table has partitions, recursively expand them in the order - * in which they appear in the PartitionDesc. But first, expand the - * parent itself. + * in which they appear in the PartitionDesc. */ - expand_single_inheritance_child(root, rte, rti, oldrelation, oldrc, - oldrelation, - &has_child, &appinfos, - &partitioned_child_rels); expand_partitioned_rtentry(root, rte, rti, oldrelation, oldrc, - RelationGetPartitionDesc(oldrelation), - lockmode, - &has_child, &appinfos, + lockmode, &root->append_rel_list, &partitioned_child_rels); + + /* + * We keep a list of objects in root, each of which maps a root + * partitioned parent RT index to the list of RT indexes of descendant + * partitioned child tables. When creating an Append or a ModifyTable + * path for the parent, we copy the child RT index list verbatim to + * the path so that it could be carried over to the executor so that + * the latter could identify the partitioned child tables. + */ + if (rte->inh && partitioned_child_rels != NIL) + { + PartitionedChildRelInfo *pcinfo; + + pcinfo = makeNode(PartitionedChildRelInfo); + pcinfo->parent_relid = rti; + pcinfo->child_rels = partitioned_child_rels; + root->pcinfo_list = lappend(root->pcinfo_list, pcinfo); + } } else { + List *appinfos = NIL; + RangeTblEntry *childrte; + Index childRTindex; + /* * This table has no partitions. Expand any plain inheritance * children in the order the OIDs were returned by @@ -1597,51 +1608,30 @@ expand_inherited_rtentry(PlannerInfo *root, RangeTblEntry *rte, Index rti) expand_single_inheritance_child(root, rte, rti, oldrelation, oldrc, newrelation, - &has_child, &appinfos, - &partitioned_child_rels); + &appinfos, &childrte, + &childRTindex); /* Close child relations, but keep locks */ if (childOID != parentOID) heap_close(newrelation, NoLock); } - } - - heap_close(oldrelation, NoLock); /* - * If all the children were temp tables or a partitioned parent did not - * have any leaf partitions, pretend it's a non-inheritance situation; we - * don't need Append node in that case. The duplicate RTE we added for - * the parent table is harmless, so we don't bother to get rid of it; - * ditto for the useless PlanRowMark node. + * If all the children were temp tables, pretend it's a + * non-inheritance situation; we don't need Append node in that case. + * The duplicate RTE we added for the parent table is harmless, so we + * don't bother to get rid of it; ditto for the useless PlanRowMark + * node. 
*/ - if (!has_child) - { - /* Clear flag before returning */ + if (list_length(appinfos) < 2) rte->inh = false; - return; - } - - /* - * We keep a list of objects in root, each of which maps a partitioned - * parent RT index to the list of RT indexes of its partitioned child - * tables. When creating an Append or a ModifyTable path for the parent, - * we copy the child RT index list verbatim to the path so that it could - * be carried over to the executor so that the latter could identify the - * partitioned child tables. - */ - if (partitioned_child_rels != NIL) - { - pcinfo = makeNode(PartitionedChildRelInfo); + else + root->append_rel_list = list_concat(root->append_rel_list, + appinfos); - Assert(rte->relkind == RELKIND_PARTITIONED_TABLE); - pcinfo->parent_relid = rti; - pcinfo->child_rels = partitioned_child_rels; - root->pcinfo_list = lappend(root->pcinfo_list, pcinfo); } - /* Otherwise, OK to add to root->append_rel_list */ - root->append_rel_list = list_concat(root->append_rel_list, appinfos); + heap_close(oldrelation, NoLock); } /* @@ -1654,15 +1644,35 @@ expand_inherited_rtentry(PlannerInfo *root, RangeTblEntry *rte, Index rti) static void expand_partitioned_rtentry(PlannerInfo *root, RangeTblEntry *parentrte, Index parentRTindex, Relation parentrel, - PlanRowMark *parentrc, PartitionDesc partdesc, - LOCKMODE lockmode, - bool *has_child, List **appinfos, - List **partitioned_child_rels) + PlanRowMark *top_parentrc, LOCKMODE lockmode, + List **appinfos, List **partitioned_child_rels) { int i; + RangeTblEntry *childrte; + Index childRTindex; + bool has_child = false; + PartitionDesc partdesc = RelationGetPartitionDesc(parentrel); check_stack_depth(); + /* A partitioned table should always have a partition descriptor. */ + Assert(partdesc); + + Assert(parentrte->inh); + + /* First expand the partitioned table itself. */ + expand_single_inheritance_child(root, parentrte, parentRTindex, parentrel, + top_parentrc, parentrel, + appinfos, &childrte, &childRTindex); + + /* + * The partitioned table does not have data for itself but still need to + * be locked. Update given list of partitioned children with RTI of this + * partitioned relation. + */ + *partitioned_child_rels = lappend_int(*partitioned_child_rels, + childRTindex); + for (i = 0; i < partdesc->nparts; i++) { Oid childOID = partdesc->oids[i]; @@ -1678,23 +1688,30 @@ expand_partitioned_rtentry(PlannerInfo *root, RangeTblEntry *parentrte, continue; } + /* We have a real partition. */ + has_child = true; + expand_single_inheritance_child(root, parentrte, parentRTindex, - parentrel, parentrc, childrel, - has_child, appinfos, - partitioned_child_rels); + parentrel, top_parentrc, childrel, + appinfos, &childrte, &childRTindex); /* If this child is itself partitioned, recurse */ if (childrel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) - expand_partitioned_rtentry(root, parentrte, parentRTindex, - parentrel, parentrc, - RelationGetPartitionDesc(childrel), - lockmode, - has_child, appinfos, - partitioned_child_rels); + expand_partitioned_rtentry(root, childrte, childRTindex, + childrel, top_parentrc, lockmode, + appinfos, partitioned_child_rels); /* Close child relation, but keep locks */ heap_close(childrel, NoLock); } + + /* + * If the partitioned table has no partitions or all the partitions are + * temporary tables from other backends, treat this as non-inheritance + * case. 
+ */ + if (!has_child) + parentrte->inh = false; } /* @@ -1702,16 +1719,31 @@ expand_partitioned_rtentry(PlannerInfo *root, RangeTblEntry *parentrte, * Expand a single inheritance child, if needed. * * If this is a temp table of another backend, we'll return without doing - * anything at all. Otherwise, we'll set "has_child" to true, build a - * RangeTblEntry and either a PartitionedChildRelInfo or AppendRelInfo as + * anything at all. Otherwise, build a RangeTblEntry and an AppendRelInfo, if * appropriate, plus maybe a PlanRowMark. + * + * We now expand the partition hierarchy level by level, creating a + * corresponding hierarchy of AppendRelInfos and RelOptInfos, where each + * partitioned descendant acts as a parent of its immediate partitions. + * (This is a difference from what older versions of PostgreSQL did and what + * is still done in the case of table inheritance for unpartitioned tables, + * where the hierarchy is flattened during RTE expansion.) + * + * PlanRowMarks still carry the top-parent's RTI, and the top-parent's + * allMarkTypes field still accumulates values from all descendents. + * + * "parentrte" and "parentRTindex" are immediate parent's RTE and + * RTI. "top_parentrc" is top parent's PlanRowMark. + * + * The child RangeTblEntry and its RTI are returned in "childrte_p" and + * "childRTindex_p" resp. */ static void expand_single_inheritance_child(PlannerInfo *root, RangeTblEntry *parentrte, Index parentRTindex, Relation parentrel, - PlanRowMark *parentrc, Relation childrel, - bool *has_child, List **appinfos, - List **partitioned_child_rels) + PlanRowMark *top_parentrc, Relation childrel, + List **appinfos, RangeTblEntry **childrte_p, + Index *childRTindex_p) { Query *parse = root->parse; Oid parentOID = RelationGetRelid(parentrel); @@ -1733,24 +1765,30 @@ expand_single_inheritance_child(PlannerInfo *root, RangeTblEntry *parentrte, * restriction clauses, so we don't need to do it here. */ childrte = copyObject(parentrte); + *childrte_p = childrte; childrte->relid = childOID; childrte->relkind = childrel->rd_rel->relkind; + /* A partitioned child will need to be expanded further. */ + if (childOID != parentOID && + childrte->relkind == RELKIND_PARTITIONED_TABLE) + childrte->inh = true; + else childrte->inh = false; childrte->requiredPerms = 0; childrte->securityQuals = NIL; parse->rtable = lappend(parse->rtable, childrte); childRTindex = list_length(parse->rtable); + *childRTindex_p = childRTindex; /* - * Build an AppendRelInfo for this parent and child, unless the child is a - * partitioned table. + * We need an AppendRelInfo if paths will be built for the child RTE. If + * childrte->inh is true, then we'll always need to generate append paths + * for it. If childrte->inh is false, we must scan it if it's not a + * partitioned table; but if it is a partitioned table, then it never has + * any data of its own and need not be scanned. */ - if (childrte->relkind != RELKIND_PARTITIONED_TABLE) + if (childrte->relkind != RELKIND_PARTITIONED_TABLE || childrte->inh) { - /* Remember if we saw a real child. */ - if (childOID != parentOID) - *has_child = true; - appinfo = makeNode(AppendRelInfo); appinfo->parent_relid = parentRTindex; appinfo->child_relid = childRTindex; @@ -1780,25 +1818,23 @@ expand_single_inheritance_child(PlannerInfo *root, RangeTblEntry *parentrte, appinfo->translated_vars); } } - else - *partitioned_child_rels = lappend_int(*partitioned_child_rels, - childRTindex); /* * Build a PlanRowMark if parent is marked FOR UPDATE/SHARE. 
*/ - if (parentrc) + if (top_parentrc) { PlanRowMark *childrc = makeNode(PlanRowMark); childrc->rti = childRTindex; - childrc->prti = parentRTindex; - childrc->rowmarkId = parentrc->rowmarkId; + childrc->prti = top_parentrc->rti; + childrc->rowmarkId = top_parentrc->rowmarkId; /* Reselect rowmark type, because relkind might not match parent */ - childrc->markType = select_rowmark_type(childrte, parentrc->strength); + childrc->markType = select_rowmark_type(childrte, + top_parentrc->strength); childrc->allMarkTypes = (1 << childrc->markType); - childrc->strength = parentrc->strength; - childrc->waitPolicy = parentrc->waitPolicy; + childrc->strength = top_parentrc->strength; + childrc->waitPolicy = top_parentrc->waitPolicy; /* * We mark RowMarks for partitioned child tables as parent RowMarks so @@ -1807,8 +1843,8 @@ expand_single_inheritance_child(PlannerInfo *root, RangeTblEntry *parentrte, */ childrc->isParent = (childrte->relkind == RELKIND_PARTITIONED_TABLE); - /* Include child's rowmark type in parent's allMarkTypes */ - parentrc->allMarkTypes |= childrc->allMarkTypes; + /* Include child's rowmark type in top parent's allMarkTypes */ + top_parentrc->allMarkTypes |= childrc->allMarkTypes; root->rowMarks = lappend(root->rowMarks, childrc); } diff --git a/src/include/nodes/relation.h b/src/include/nodes/relation.h index 99a6325c..04dd12d6 100644 --- a/src/include/nodes/relation.h +++ b/src/include/nodes/relation.h @@ -2080,10 +2080,10 @@ typedef struct SpecialJoinInfo * * When we expand an inheritable table or a UNION-ALL subselect into an * "append relation" (essentially, a list of child RTEs), we build an - * AppendRelInfo for each non-partitioned child RTE. The list of - * AppendRelInfos indicates which child RTEs must be included when expanding - * the parent, and each node carries information needed to translate Vars - * referencing the parent into Vars referencing that child. + * AppendRelInfo for each child RTE. The list of AppendRelInfos indicates + * which child RTEs must be included when expanding the parent, and each node + * carries information needed to translate Vars referencing the parent into + * Vars referencing that child. * * These structs are kept in the PlannerInfo node's append_rel_list. 
* Note that we just throw all the structs into one list, and scan the diff --git a/src/test/regress/expected/inherit.out b/src/test/regress/expected/inherit.out index 03b14328..cabe7df5 100644 --- a/src/test/regress/expected/inherit.out +++ b/src/test/regress/expected/inherit.out @@ -862,6 +862,28 @@ select tableoid::regclass::text as relname, parted_tab.* from parted_tab order b (3 rows) drop table parted_tab; +-- Check UPDATE with multi-level partitioned inherited target +create table mlparted_tab (a int, b char, c text) partition by list (a); +create table mlparted_tab_part1 partition of mlparted_tab for values in (1); +create table mlparted_tab_part2 partition of mlparted_tab for values in (2) partition by list (b); +create table mlparted_tab_part3 partition of mlparted_tab for values in (3); +create table mlparted_tab_part2a partition of mlparted_tab_part2 for values in ('a'); +create table mlparted_tab_part2b partition of mlparted_tab_part2 for values in ('b'); +insert into mlparted_tab values (1, 'a'), (2, 'a'), (2, 'b'), (3, 'a'); +update mlparted_tab mlp set c = 'xxx' +from + (select a from some_tab union all select a+1 from some_tab) ss (a) +where (mlp.a = ss.a and mlp.b = 'b') or mlp.a = 3; +select tableoid::regclass::text as relname, mlparted_tab.* from mlparted_tab order by 1,2; + relname | a | b | c +---------------------+---+---+----- + mlparted_tab_part1 | 1 | a | + mlparted_tab_part2a | 2 | a | + mlparted_tab_part2b | 2 | b | xxx + mlparted_tab_part3 | 3 | a | xxx +(4 rows) + +drop table mlparted_tab; drop table some_tab cascade; NOTICE: drop cascades to table some_tab_child /* Test multiple inheritance of column defaults */ diff --git a/src/test/regress/expected/join_4.out b/src/test/regress/expected/join_4.out index 39d9419d..c0af4d24 100644 --- a/src/test/regress/expected/join_4.out +++ b/src/test/regress/expected/join_4.out @@ -5648,6 +5648,58 @@ SELECT count(*) FROM testr WHERE NOT EXISTS (SELECT * FROM testh WHERE testr.b = 3000 (1 row) +-- test LATERAL reference propagation down a multi-level inheritance hierarchy +-- produced for a multi-level partitioned table hierarchy. 
+-- +create table pt1 (a int, b int, c varchar) partition by range(a); +create table pt1p1 partition of pt1 for values from (0) to (100) partition by range(b); +create table pt1p2 partition of pt1 for values from (100) to (200); +create table pt1p1p1 partition of pt1p1 for values from (0) to (100); +insert into pt1 values (1, 1, 'x'), (101, 101, 'y'); +create table ut1 (a int, b int, c varchar); +insert into ut1 values (101, 101, 'y'), (2, 2, 'z'); +explain (verbose, costs off) +select t1.b, ss.phv from ut1 t1 left join lateral + (select t2.a as t2a, t3.a t3a, least(t1.a, t2.a, t3.a) phv + from pt1 t2 join ut1 t3 on t2.a = t3.b) ss + on t1.a = ss.t2a order by t1.a; + QUERY PLAN +------------------------------------------------------------- + Sort + Output: t1.b, (LEAST(t1.a, t2.a, t3.a)), t1.a + Sort Key: t1.a + -> Nested Loop Left Join + Output: t1.b, (LEAST(t1.a, t2.a, t3.a)), t1.a + -> Seq Scan on public.ut1 t1 + Output: t1.a, t1.b, t1.c + -> Hash Join + Output: t2.a, LEAST(t1.a, t2.a, t3.a) + Hash Cond: (t3.b = t2.a) + -> Seq Scan on public.ut1 t3 + Output: t3.a, t3.b, t3.c + -> Hash + Output: t2.a + -> Append + -> Seq Scan on public.pt1p1p1 t2 + Output: t2.a + Filter: (t1.a = t2.a) + -> Seq Scan on public.pt1p2 t2_1 + Output: t2_1.a + Filter: (t1.a = t2_1.a) +(21 rows) + +select t1.b, ss.phv from ut1 t1 left join lateral + (select t2.a as t2a, t3.a t3a, least(t1.a, t2.a, t3.a) phv + from pt1 t2 join ut1 t3 on t2.a = t3.b) ss + on t1.a = ss.t2a order by t1.a; + b | phv +-----+----- + 2 | + 101 | 101 +(2 rows) + +drop table pt1; +drop table ut1; -- -- test that foreign key join estimation performs sanely for outer joins -- diff --git a/src/test/regress/sql/inherit.sql b/src/test/regress/sql/inherit.sql index 18f3a7b2..ff11dbcb 100644 --- a/src/test/regress/sql/inherit.sql +++ b/src/test/regress/sql/inherit.sql @@ -223,6 +223,23 @@ where parted_tab.a = ss.a; select tableoid::regclass::text as relname, parted_tab.* from parted_tab order by 1,2; drop table parted_tab; + +-- Check UPDATE with multi-level partitioned inherited target +create table mlparted_tab (a int, b char, c text) partition by list (a); +create table mlparted_tab_part1 partition of mlparted_tab for values in (1); +create table mlparted_tab_part2 partition of mlparted_tab for values in (2) partition by list (b); +create table mlparted_tab_part3 partition of mlparted_tab for values in (3); +create table mlparted_tab_part2a partition of mlparted_tab_part2 for values in ('a'); +create table mlparted_tab_part2b partition of mlparted_tab_part2 for values in ('b'); +insert into mlparted_tab values (1, 'a'), (2, 'a'), (2, 'b'), (3, 'a'); + +update mlparted_tab mlp set c = 'xxx' +from + (select a from some_tab union all select a+1 from some_tab) ss (a) +where (mlp.a = ss.a and mlp.b = 'b') or mlp.a = 3; +select tableoid::regclass::text as relname, mlparted_tab.* from mlparted_tab order by 1,2; + +drop table mlparted_tab; drop table some_tab cascade; /* Test multiple inheritance of column defaults */ diff --git a/src/test/regress/sql/join.sql b/src/test/regress/sql/join.sql index 31475056..a81f4a1b 100644 --- a/src/test/regress/sql/join.sql +++ b/src/test/regress/sql/join.sql @@ -1816,6 +1816,29 @@ set enable_hashjoin TO false; EXPLAIN (VERBOSE, COSTS OFF) SELECT count(*) FROM testr WHERE NOT EXISTS (SELECT * FROM testh WHERE testr.b = testh.b); SELECT count(*) FROM testr WHERE NOT EXISTS (SELECT * FROM testh WHERE testr.b = testh.b); +-- +-- test LATERAL reference propagation down a multi-level inheritance hierarchy +-- produced 
for a multi-level partitioned table hierarchy. +-- +create table pt1 (a int, b int, c varchar) partition by range(a); +create table pt1p1 partition of pt1 for values from (0) to (100) partition by range(b); +create table pt1p2 partition of pt1 for values from (100) to (200); +create table pt1p1p1 partition of pt1p1 for values from (0) to (100); +insert into pt1 values (1, 1, 'x'), (101, 101, 'y'); +create table ut1 (a int, b int, c varchar); +insert into ut1 values (101, 101, 'y'), (2, 2, 'z'); +explain (verbose, costs off) +select t1.b, ss.phv from ut1 t1 left join lateral + (select t2.a as t2a, t3.a t3a, least(t1.a, t2.a, t3.a) phv + from pt1 t2 join ut1 t3 on t2.a = t3.b) ss + on t1.a = ss.t2a order by t1.a; +select t1.b, ss.phv from ut1 t1 left join lateral + (select t2.a as t2a, t3.a t3a, least(t1.a, t2.a, t3.a) phv + from pt1 t2 join ut1 t3 on t2.a = t3.b) ss + on t1.a = ss.t2a order by t1.a; + +drop table pt1; +drop table ut1; -- -- test that foreign key join estimation performs sanely for outer joins -- From 1158170a41577db0a37073c9893493fac587fffa Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Thu, 25 Jun 2020 15:27:44 +0800 Subject: [PATCH 195/578] Associate partitioning information with each RelOptInfo. http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/backend/optimizer/util/plancat.c | 159 +++++++++++++++++++++++++++ src/backend/optimizer/util/relnode.c | 35 +++++- src/include/nodes/relation.h | 56 +++++++++- 3 files changed, 248 insertions(+), 2 deletions(-) diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c index a1248e65..23938198 100644 --- a/src/backend/optimizer/util/plancat.c +++ b/src/backend/optimizer/util/plancat.c @@ -79,6 +79,10 @@ static List *get_relation_constraints(PlannerInfo *root, static List *build_index_tlist(PlannerInfo *root, IndexOptInfo *index, Relation heapRelation); static List *get_relation_statistics(RelOptInfo *rel, Relation relation); +static void set_relation_partition_info(PlannerInfo *root, RelOptInfo *rel, + Relation relation); +static PartitionScheme find_partition_scheme(PlannerInfo *root, Relation rel); +static List **build_baserel_partition_key_exprs(Relation relation, Index varno); #ifdef __TBASE__ static BlockNumber GetIntervalPartitionPages(Relation rel, bool isindex, bool statistic); @@ -470,6 +474,13 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent, /* Collect info about relation's foreign keys, if relevant */ get_relation_foreign_keys(root, rel, relation, inhparent); + /* + * Collect info about relation's partitioning scheme, if any. Only + * inheritance parents may be partitioned. + */ + if (inhparent && relation->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) + set_relation_partition_info(root, rel, relation); + heap_close(relation, NoLock); /* @@ -1956,6 +1967,154 @@ has_row_triggers(PlannerInfo *root, Index rti, CmdType event) return result; } +/* + * set_relation_partition_info + * + * Set partitioning scheme and related information for a partitioned table. 
+ */ +static void +set_relation_partition_info(PlannerInfo *root, RelOptInfo *rel, + Relation relation) +{ + PartitionDesc partdesc; + + Assert(relation->rd_rel->relkind == RELKIND_PARTITIONED_TABLE); + + partdesc = RelationGetPartitionDesc(relation); + rel->part_scheme = find_partition_scheme(root, relation); + Assert(partdesc != NULL && rel->part_scheme != NULL); + rel->boundinfo = partdesc->boundinfo; + rel->nparts = partdesc->nparts; + rel->partexprs = build_baserel_partition_key_exprs(relation, rel->relid); +} + +/* + * find_partition_scheme + * + * Find or create a PartitionScheme for this Relation. + */ +static PartitionScheme +find_partition_scheme(PlannerInfo *root, Relation relation) +{ + PartitionKey partkey = RelationGetPartitionKey(relation); + ListCell *lc; + int partnatts; + PartitionScheme part_scheme; + + /* A partitioned table should have a partition key. */ + Assert(partkey != NULL); + + partnatts = partkey->partnatts; + + /* Search for a matching partition scheme and return if found one. */ + foreach(lc, root->part_schemes) + { + part_scheme = lfirst(lc); + + /* Match partitioning strategy and number of keys. */ + if (partkey->strategy != part_scheme->strategy || + partnatts != part_scheme->partnatts) + continue; + + /* Match the partition key types. */ + if (memcmp(partkey->partopfamily, part_scheme->partopfamily, + sizeof(Oid) * partnatts) != 0 || + memcmp(partkey->partopcintype, part_scheme->partopcintype, + sizeof(Oid) * partnatts) != 0 || + memcmp(partkey->parttypcoll, part_scheme->parttypcoll, + sizeof(Oid) * partnatts) != 0) + continue; + + /* + * Length and byval information should match when partopcintype + * matches. + */ + Assert(memcmp(partkey->parttyplen, part_scheme->parttyplen, + sizeof(int16) * partnatts) == 0); + Assert(memcmp(partkey->parttypbyval, part_scheme->parttypbyval, + sizeof(bool) * partnatts) == 0); + + /* Found matching partition scheme. */ + return part_scheme; + } + + /* + * Did not find matching partition scheme. Create one copying relevant + * information from the relcache. Instead of copying whole arrays, copy + * the pointers in relcache. It's safe to do so since + * RelationClearRelation() wouldn't change it while planner is using it. + */ + part_scheme = (PartitionScheme) palloc0(sizeof(PartitionSchemeData)); + part_scheme->strategy = partkey->strategy; + part_scheme->partnatts = partkey->partnatts; + part_scheme->partopfamily = partkey->partopfamily; + part_scheme->partopcintype = partkey->partopcintype; + part_scheme->parttypcoll = partkey->parttypcoll; + part_scheme->parttyplen = partkey->parttyplen; + part_scheme->parttypbyval = partkey->parttypbyval; + + /* Add the partitioning scheme to PlannerInfo. */ + root->part_schemes = lappend(root->part_schemes, part_scheme); + + return part_scheme; +} + +/* + * build_baserel_partition_key_exprs + * + * Collects partition key expressions for a given base relation. Any single + * column partition keys are converted to Var nodes. All Var nodes are set + * to the given varno. The partition key expressions are returned as an array + * of single element lists to be stored in RelOptInfo of the base relation. + */ +static List ** +build_baserel_partition_key_exprs(Relation relation, Index varno) +{ + PartitionKey partkey = RelationGetPartitionKey(relation); + int partnatts; + int cnt; + List **partexprs; + ListCell *lc; + + /* A partitioned table should have a partition key. 
*/ + Assert(partkey != NULL); + + partnatts = partkey->partnatts; + partexprs = (List **) palloc(sizeof(List *) * partnatts); + lc = list_head(partkey->partexprs); + + for (cnt = 0; cnt < partnatts; cnt++) + { + Expr *partexpr; + AttrNumber attno = partkey->partattrs[cnt]; + + if (attno != InvalidAttrNumber) + { + /* Single column partition key is stored as a Var node. */ + Assert(attno > 0); + + partexpr = (Expr *) makeVar(varno, attno, + partkey->parttypid[cnt], + partkey->parttypmod[cnt], + partkey->parttypcoll[cnt], 0); + } + else + { + if (lc == NULL) + elog(ERROR, "wrong number of partition key expressions"); + + /* Re-stamp the expression with given varno. */ + partexpr = (Expr *) copyObject(lfirst(lc)); + ChangeVarNodes((Node *) partexpr, 1, varno, 0); + lc = lnext(lc); + } + + partexprs[cnt] = list_make1(partexpr); + } + + return partexprs; +} + #ifdef __TBASE__ /* Get statistic/physical page num of interval partition table or its index */ static BlockNumber diff --git a/src/backend/optimizer/util/relnode.c b/src/backend/optimizer/util/relnode.c index 9fba700e..0ada588b 100644 --- a/src/backend/optimizer/util/relnode.c +++ b/src/backend/optimizer/util/relnode.c @@ -155,6 +155,11 @@ build_simple_rel(PlannerInfo *root, int relid, RelOptInfo *parent) rel->baserestrict_min_security = UINT_MAX; rel->joininfo = NIL; rel->has_eclass_joins = false; + rel->part_scheme = NULL; + rel->nparts = 0; + rel->boundinfo = NULL; + rel->part_rels = NULL; + rel->partexprs = NULL; #ifdef __TBASE__ rel->intervalparent = false; rel->isdefault = rte->isdefault; @@ -273,18 +278,41 @@ build_simple_rel(PlannerInfo *root, int relid, RelOptInfo *parent) if (rte->inh) { ListCell *l; + int nparts = rel->nparts; + int cnt_parts = 0; + + if (nparts > 0) + rel->part_rels = (RelOptInfo **) + palloc(sizeof(RelOptInfo *) * nparts); foreach(l, root->append_rel_list) { AppendRelInfo *appinfo = (AppendRelInfo *) lfirst(l); + RelOptInfo *childrel; /* append_rel_list contains all append rels; ignore others */ if (appinfo->parent_relid != relid) continue; - (void) build_simple_rel(root, appinfo->child_relid, + childrel = build_simple_rel(root, appinfo->child_relid, rel); + + /* Nothing more to do for an unpartitioned table. */ + if (!rel->part_scheme) + continue; + + /* + * The order of partition OIDs in append_rel_list is the same as + * the order in the PartitionDesc, so the order of part_rels will + * also match the PartitionDesc. See expand_partitioned_rtentry. + */ + Assert(cnt_parts < nparts); + rel->part_rels[cnt_parts] = childrel; + cnt_parts++; } + + /* We should have seen all the child partitions. */ + Assert(cnt_parts == nparts); } return rel; @@ -582,6 +610,11 @@ build_join_rel(PlannerInfo *root, joinrel->joininfo = NIL; joinrel->has_eclass_joins = false; joinrel->top_parent_relids = NULL; + joinrel->part_scheme = NULL; + joinrel->nparts = 0; + joinrel->boundinfo = NULL; + joinrel->part_rels = NULL; + joinrel->partexprs = NULL; #ifdef __TBASE__ joinrel->resultRelLoc = RESULT_REL_NONE; #endif diff --git a/src/include/nodes/relation.h b/src/include/nodes/relation.h index 04dd12d6..31bdde3e 100644 --- a/src/include/nodes/relation.h +++ b/src/include/nodes/relation.h @@ -360,6 +360,9 @@ typedef struct PlannerInfo List *distinct_pathkeys; /* distinctClause pathkeys, if any */ List *sort_pathkeys; /* sortClause pathkeys, if any */ + List *part_schemes; /* Canonicalised partition schemes used in the + * query. 
*/ + List *initial_rels; /* RelOptInfos we are now trying to join */ /* Use fetch_upper_rel() to get any particular upper rel */ @@ -440,6 +443,34 @@ typedef struct PlannerInfo ((root)->simple_rte_array ? (root)->simple_rte_array[rti] : \ rt_fetch(rti, (root)->parse->rtable)) +/* + * If multiple relations are partitioned the same way, all such partitions + * will have a pointer to the same PartitionScheme. A list of PartitionScheme + * objects is attached to the PlannerInfo. By design, the partition scheme + * incorporates only the general properties of the partition method (LIST vs. + * RANGE, number of partitioning columns and the type information for each) + * and not the specific bounds. + * + * We store the opclass-declared input data types instead of the partition key + * datatypes since the former rather than the latter are used to compare + * partition bounds. Since partition key data types and the opclass declared + * input data types are expected to be binary compatible (per ResolveOpClass), + * both of those should have same byval and length properties. + */ +typedef struct PartitionSchemeData +{ + char strategy; /* partition strategy */ + int16 partnatts; /* number of partition attributes */ + Oid *partopfamily; /* OIDs of operator families */ + Oid *partopcintype; /* OIDs of opclass declared input data types */ + Oid *parttypcoll; /* OIDs of collations of partition keys. */ + + /* Cached information about partition key data types. */ + int16 *parttyplen; + bool *parttypbyval; +} PartitionSchemeData; + +typedef struct PartitionSchemeData *PartitionScheme; /*---------- * RelOptInfo @@ -570,7 +601,7 @@ typedef struct PlannerInfo * other rels for which we have tried and failed to prove * this one unique * - * The presence of the remaining fields depends on the restrictions + * The presence of the following fields depends on the restrictions * and joins that the relation participates in: * * baserestrictinfo - List of RestrictInfo nodes, containing info about @@ -601,6 +632,21 @@ typedef struct PlannerInfo * We store baserestrictcost in the RelOptInfo (for base relations) because * we know we will need it at least once (to price the sequential scan) * and may need it multiple times to price index scans. + * + * If the relation is partitioned, these fields will be set: + * + * part_scheme - Partitioning scheme of the relation + * boundinfo - Partition bounds + * nparts - Number of partitions + * part_rels - RelOptInfos for each partition + * partexprs - Partition key expressions + * + * Note: A base relation always has only one set of partition keys, but a join + * relation may have as many sets of partition keys as the number of relations + * being joined. partexprs is an array containing part_scheme->partnatts + * elements, each of which is a list of partition key expressions. For a base + * relation each list contains only one expression, but for a join relation + * there can be one per baserel. *---------- */ typedef enum RelOptKind @@ -706,6 +752,14 @@ typedef struct RelOptInfo /* used by "other" relations */ Relids top_parent_relids; /* Relids of topmost parents */ + + /* used for partitioned relations */ + PartitionScheme part_scheme; /* Partitioning scheme. */ + int nparts; /* number of partitions */ + struct PartitionBoundInfoData *boundinfo; /* Partition bounds */ + struct RelOptInfo **part_rels; /* Array of RelOptInfos of partitions, + * stored in the same order of bounds */ + List **partexprs; /* Partition key expressions. 
*/ #ifdef __TBASE__ /* used for interval partition */ bool intervalparent; /* is interval partition */ From a32140c9839857fdfb2bb4986b460e127d12d84f Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Thu, 5 Oct 2017 12:19:40 -0400 Subject: [PATCH 196/578] Improve error message when skipping scan of default partition. It seems like a good idea to clearly distinguish between skipping the scan of the new partition itself and skipping the scan of the default partition. Amit Langote Discussion: http://postgr.es/m/1f08b844-0078-aa8d-452e-7af3bf77d05f@lab.ntt.co.jp --- src/backend/commands/tablecmds.c | 5 +++++ src/test/regress/expected/alter_table.out | 4 ++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index 73e65cd9..e6455b86 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -16507,9 +16507,14 @@ ValidatePartitionConstraints(List **wqueue, Relation scanrel, */ if (PartConstraintImpliedByRelConstraint(scanrel, partConstraint)) { + if (!validate_default) ereport(INFO, (errmsg("partition constraint for table \"%s\" is implied by existing constraints", RelationGetRelationName(scanrel)))); + else + ereport(INFO, + (errmsg("updated partition constraint for default partition \"%s\" is implied by existing constraints", + RelationGetRelationName(scanrel)))); return; } diff --git a/src/test/regress/expected/alter_table.out b/src/test/regress/expected/alter_table.out index 09a22ad0..93f5db12 100644 --- a/src/test/regress/expected/alter_table.out +++ b/src/test/regress/expected/alter_table.out @@ -3445,7 +3445,7 @@ ALTER TABLE part_7 ATTACH PARTITION part_7_a_null FOR VALUES IN ('a', null); INFO: partition constraint for table "part_7_a_null" is implied by existing constraints ALTER TABLE list_parted2 ATTACH PARTITION part_7 FOR VALUES IN (7); INFO: partition constraint for table "part_7" is implied by existing constraints -INFO: partition constraint for table "list_parted2_def" is implied by existing constraints +INFO: updated partition constraint for default partition "list_parted2_def" is implied by existing constraints -- Same example, but check this time that the constraint correctly detects -- violating rows ALTER TABLE list_parted2 DETACH PARTITION part_7; @@ -3459,7 +3459,7 @@ SELECT tableoid::regclass, a, b FROM part_7 order by a; (2 rows) ALTER TABLE list_parted2 ATTACH PARTITION part_7 FOR VALUES IN (7); -INFO: partition constraint for table "list_parted2_def" is implied by existing constraints +INFO: updated partition constraint for default partition "list_parted2_def" is implied by existing constraints ERROR: partition constraint is violated by some row -- check that leaf partitions of default partition are scanned when -- attaching a partitioned table. From d68e653f593cdfc8d792b1cf62d409e0d4daf26f Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Thu, 5 Oct 2017 13:06:46 -0400 Subject: [PATCH 197/578] On attach, consider skipping validation of subpartitions individually. If the table attached as a partition is itself partitioned, individual partitions might have constraints strong enough to skip scanning the table even if the table actually attached does not. This is pretty cheap to check, and possibly a big win if it works out. Amit Langote, with test case changes by me. 
Discussion: http://postgr.es/m/1f08b844-0078-aa8d-452e-7af3bf77d05f@lab.ntt.co.jp --- src/backend/commands/tablecmds.c | 15 +++++++++++++++ src/test/regress/expected/alter_table.out | 14 ++++++++++++++ src/test/regress/sql/alter_table.sql | 14 ++++++++++++++ 3 files changed, 43 insertions(+) diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index e6455b86..360fd0ee 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -16555,6 +16555,21 @@ ValidatePartitionConstraints(List **wqueue, Relation scanrel, /* There can never be a whole-row reference here */ if (found_whole_row) elog(ERROR, "unexpected whole-row reference found in partition key"); + + /* Can we skip scanning this part_rel? */ + if (PartConstraintImpliedByRelConstraint(part_rel, my_partconstr)) + { + if (!validate_default) + ereport(INFO, + (errmsg("partition constraint for table \"%s\" is implied by existing constraints", + RelationGetRelationName(part_rel)))); + else + ereport(INFO, + (errmsg("updated partition constraint for default partition \"%s\" is implied by existing constraints", + RelationGetRelationName(part_rel)))); + heap_close(part_rel, NoLock); + continue; + } } /* Grab a work queue entry. */ diff --git a/src/test/regress/expected/alter_table.out b/src/test/regress/expected/alter_table.out index 93f5db12..2b11b685 100644 --- a/src/test/regress/expected/alter_table.out +++ b/src/test/regress/expected/alter_table.out @@ -3483,6 +3483,20 @@ DETAIL: "part_5" is already a child of "list_parted2". ALTER TABLE list_parted2 ATTACH PARTITION list_parted2 FOR VALUES IN (0); ERROR: circular inheritance not allowed DETAIL: "list_parted2" is already a child of "list_parted2". +-- If the partitioned table being attached does not have a constraint that +-- would allow validation scan to be skipped, but an individual partition +-- does, then the partition's validation scan is skipped. +CREATE TABLE quuux (a int, b text) PARTITION BY LIST (a); +CREATE TABLE quuux_default PARTITION OF quuux DEFAULT PARTITION BY LIST (b); +CREATE TABLE quuux_default1 PARTITION OF quuux_default ( + CONSTRAINT check_1 CHECK (a IS NOT NULL AND a = 1) +) FOR VALUES IN ('b'); +CREATE TABLE quuux1 (a int, b text); +ALTER TABLE quuux ATTACH PARTITION quuux1 FOR VALUES IN (1); -- validate! +CREATE TABLE quuux2 (a int, b text); +ALTER TABLE quuux ATTACH PARTITION quuux2 FOR VALUES IN (2); -- skip validation +INFO: updated partition constraint for default partition "quuux_default1" is implied by existing constraints +DROP TABLE quuux; -- -- DETACH PARTITION -- diff --git a/src/test/regress/sql/alter_table.sql b/src/test/regress/sql/alter_table.sql index f996ca7a..8b73bdf4 100644 --- a/src/test/regress/sql/alter_table.sql +++ b/src/test/regress/sql/alter_table.sql @@ -2288,6 +2288,20 @@ ALTER TABLE list_parted2 ATTACH PARTITION part_2 FOR VALUES IN (2); ALTER TABLE part_5 ATTACH PARTITION list_parted2 FOR VALUES IN ('b'); ALTER TABLE list_parted2 ATTACH PARTITION list_parted2 FOR VALUES IN (0); +-- If the partitioned table being attached does not have a constraint that +-- would allow validation scan to be skipped, but an individual partition +-- does, then the partition's validation scan is skipped. 
+CREATE TABLE quuux (a int, b text) PARTITION BY LIST (a); +CREATE TABLE quuux_default PARTITION OF quuux DEFAULT PARTITION BY LIST (b); +CREATE TABLE quuux_default1 PARTITION OF quuux_default ( + CONSTRAINT check_1 CHECK (a IS NOT NULL AND a = 1) +) FOR VALUES IN ('b'); +CREATE TABLE quuux1 (a int, b text); +ALTER TABLE quuux ATTACH PARTITION quuux1 FOR VALUES IN (1); -- validate! +CREATE TABLE quuux2 (a int, b text); +ALTER TABLE quuux ATTACH PARTITION quuux2 FOR VALUES IN (2); -- skip validation +DROP TABLE quuux; + -- -- DETACH PARTITION -- From 953d7eb5d8f9fa71e76ee0738e94b03680626f92 Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Thu, 5 Oct 2017 13:21:50 -0400 Subject: [PATCH 198/578] On CREATE TABLE, consider skipping validation of subpartitions. This is just like commit 14f67a8ee282ebc0de78e773fbd597f460ab4a54, but for CREATE PARTITION rather than ATTACH PARTITION. Jeevan Ladhe, with test case changes by me. Discussion: http://postgr.es/m/CAOgcT0MWwG8WBw8frFMtRYHAgDD=tpt6U7WcsO_L2k0KYpm4Jg@mail.gmail.com --- src/backend/catalog/partition.c | 18 ++++++++++++++++++ src/test/regress/expected/alter_table.out | 12 +++++++++--- src/test/regress/sql/alter_table.sql | 11 ++++++++--- 3 files changed, 35 insertions(+), 6 deletions(-) diff --git a/src/backend/catalog/partition.c b/src/backend/catalog/partition.c index 16920224..70a3d6d8 100644 --- a/src/backend/catalog/partition.c +++ b/src/backend/catalog/partition.c @@ -986,7 +986,25 @@ check_default_allows_bound(Relation parent, Relation default_rel, /* Lock already taken above. */ if (part_relid != RelationGetRelid(default_rel)) + { part_rel = heap_open(part_relid, NoLock); + + /* + * If the partition constraints on default partition child imply + * that it will not contain any row that would belong to the new + * partition, we can avoid scanning the child table. + */ + if (PartConstraintImpliedByRelConstraint(part_rel, + def_part_constraints)) + { + ereport(INFO, + (errmsg("partition constraint for table \"%s\" is implied by existing constraints", + RelationGetRelationName(part_rel)))); + + heap_close(part_rel, NoLock); + continue; + } + } else part_rel = default_rel; diff --git a/src/test/regress/expected/alter_table.out b/src/test/regress/expected/alter_table.out index 2b11b685..1748add2 100644 --- a/src/test/regress/expected/alter_table.out +++ b/src/test/regress/expected/alter_table.out @@ -3483,9 +3483,10 @@ DETAIL: "part_5" is already a child of "list_parted2". ALTER TABLE list_parted2 ATTACH PARTITION list_parted2 FOR VALUES IN (0); ERROR: circular inheritance not allowed DETAIL: "list_parted2" is already a child of "list_parted2". --- If the partitioned table being attached does not have a constraint that --- would allow validation scan to be skipped, but an individual partition --- does, then the partition's validation scan is skipped. +-- If a partitioned table being created or an existing table being attached +-- as a paritition does not have a constraint that would allow validation scan +-- to be skipped, but an individual partition does, then the partition's +-- validation scan is skipped. CREATE TABLE quuux (a int, b text) PARTITION BY LIST (a); CREATE TABLE quuux_default PARTITION OF quuux DEFAULT PARTITION BY LIST (b); CREATE TABLE quuux_default1 PARTITION OF quuux_default ( @@ -3496,6 +3497,11 @@ ALTER TABLE quuux ATTACH PARTITION quuux1 FOR VALUES IN (1); -- validate! 
CREATE TABLE quuux2 (a int, b text); ALTER TABLE quuux ATTACH PARTITION quuux2 FOR VALUES IN (2); -- skip validation INFO: updated partition constraint for default partition "quuux_default1" is implied by existing constraints +DROP TABLE quuux1, quuux2; +-- should validate for quuux1, but not for quuux2 +CREATE TABLE quuux1 PARTITION OF quuux FOR VALUES IN (1); +CREATE TABLE quuux2 PARTITION OF quuux FOR VALUES IN (2); +INFO: partition constraint for table "quuux_default1" is implied by existing constraints DROP TABLE quuux; -- -- DETACH PARTITION diff --git a/src/test/regress/sql/alter_table.sql b/src/test/regress/sql/alter_table.sql index 8b73bdf4..e2c0219e 100644 --- a/src/test/regress/sql/alter_table.sql +++ b/src/test/regress/sql/alter_table.sql @@ -2288,9 +2288,10 @@ ALTER TABLE list_parted2 ATTACH PARTITION part_2 FOR VALUES IN (2); ALTER TABLE part_5 ATTACH PARTITION list_parted2 FOR VALUES IN ('b'); ALTER TABLE list_parted2 ATTACH PARTITION list_parted2 FOR VALUES IN (0); --- If the partitioned table being attached does not have a constraint that --- would allow validation scan to be skipped, but an individual partition --- does, then the partition's validation scan is skipped. +-- If a partitioned table being created or an existing table being attached +-- as a paritition does not have a constraint that would allow validation scan +-- to be skipped, but an individual partition does, then the partition's +-- validation scan is skipped. CREATE TABLE quuux (a int, b text) PARTITION BY LIST (a); CREATE TABLE quuux_default PARTITION OF quuux DEFAULT PARTITION BY LIST (b); CREATE TABLE quuux_default1 PARTITION OF quuux_default ( @@ -2300,6 +2301,10 @@ CREATE TABLE quuux1 (a int, b text); ALTER TABLE quuux ATTACH PARTITION quuux1 FOR VALUES IN (1); -- validate! CREATE TABLE quuux2 (a int, b text); ALTER TABLE quuux ATTACH PARTITION quuux2 FOR VALUES IN (2); -- skip validation +DROP TABLE quuux1, quuux2; +-- should validate for quuux1, but not for quuux2 +CREATE TABLE quuux1 PARTITION OF quuux FOR VALUES IN (1); +CREATE TABLE quuux2 PARTITION OF quuux FOR VALUES IN (2); DROP TABLE quuux; -- From 15ca8404ae2b66667a4e767d50e3496d97882432 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Thu, 25 Jun 2020 16:29:14 +0800 Subject: [PATCH 199/578] Basic partition-wise join functionality. 
http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- .../postgres_fdw/expected/postgres_fdw.out | 120 ++ contrib/postgres_fdw/sql/postgres_fdw.sql | 53 + doc/src/sgml/config.sgml | 20 + doc/src/sgml/fdwhandler.sgml | 20 + src/backend/optimizer/README | 26 + src/backend/optimizer/geqo/geqo_eval.c | 3 + src/backend/optimizer/path/allpaths.c | 238 ++- src/backend/optimizer/path/costsize.c | 1 + src/backend/optimizer/path/joinpath.c | 102 +- src/backend/optimizer/path/joinrels.c | 316 ++- src/backend/optimizer/plan/createplan.c | 35 +- src/backend/optimizer/plan/planner.c | 23 + src/backend/optimizer/plan/setrefs.c | 59 +- src/backend/optimizer/prep/prepunion.c | 96 + src/backend/optimizer/util/pathnode.c | 364 ++++ src/backend/optimizer/util/placeholder.c | 58 + src/backend/optimizer/util/plancat.c | 32 +- src/backend/optimizer/util/relnode.c | 368 +++- src/backend/utils/misc/guc.c | 9 + src/backend/utils/misc/postgresql.conf.sample | 1 + src/include/foreign/fdwapi.h | 6 + src/include/nodes/extensible.h | 3 + src/include/nodes/relation.h | 51 +- src/include/optimizer/cost.h | 1 + src/include/optimizer/pathnode.h | 6 + src/include/optimizer/paths.h | 5 + src/include/optimizer/placeholder.h | 14 +- src/include/optimizer/planner.h | 2 + src/include/optimizer/prep.h | 16 +- src/test/regress/expected/partition_join.out | 1789 +++++++++++++++++ src/test/regress/expected/sysviews.out | 1 + src/test/regress/expected/sysviews_1.out | 1 + src/test/regress/parallel_schedule | 2 +- src/test/regress/serial_schedule | 1 + src/test/regress/sql/partition_join.sql | 354 ++++ 35 files changed, 4075 insertions(+), 121 deletions(-) create mode 100644 src/test/regress/expected/partition_join.out create mode 100644 src/test/regress/sql/partition_join.sql diff --git a/contrib/postgres_fdw/expected/postgres_fdw.out b/contrib/postgres_fdw/expected/postgres_fdw.out index 77a6e2ce..09aee7c5 100644 --- a/contrib/postgres_fdw/expected/postgres_fdw.out +++ b/contrib/postgres_fdw/expected/postgres_fdw.out @@ -7346,3 +7346,123 @@ AND ftoptions @> array['fetch_size=60000']; (1 row) ROLLBACK; +-- =================================================================== +-- test partition-wise-joins +-- =================================================================== +SET enable_partition_wise_join=on; +CREATE TABLE fprt1 (a int, b int, c varchar) PARTITION BY RANGE(a); +CREATE TABLE fprt1_p1 (LIKE fprt1); +CREATE TABLE fprt1_p2 (LIKE fprt1); +INSERT INTO fprt1_p1 SELECT i, i, to_char(i/50, 'FM0000') FROM generate_series(0, 249, 2) i; +INSERT INTO fprt1_p2 SELECT i, i, to_char(i/50, 'FM0000') FROM generate_series(250, 499, 2) i; +CREATE FOREIGN TABLE ftprt1_p1 PARTITION OF fprt1 FOR VALUES FROM (0) TO (250) + SERVER loopback OPTIONS (table_name 'fprt1_p1', use_remote_estimate 'true'); +CREATE FOREIGN TABLE ftprt1_p2 PARTITION OF fprt1 FOR VALUES FROM (250) TO (500) + SERVER loopback OPTIONS (TABLE_NAME 'fprt1_p2'); +ANALYZE fprt1; +ANALYZE fprt1_p1; +ANALYZE fprt1_p2; +CREATE TABLE fprt2 (a int, b int, c varchar) PARTITION BY RANGE(b); +CREATE TABLE fprt2_p1 (LIKE fprt2); +CREATE TABLE fprt2_p2 (LIKE fprt2); +INSERT INTO fprt2_p1 SELECT i, i, to_char(i/50, 'FM0000') FROM generate_series(0, 249, 3) i; +INSERT INTO fprt2_p2 SELECT i, i, to_char(i/50, 'FM0000') FROM generate_series(250, 499, 3) i; +CREATE FOREIGN TABLE ftprt2_p1 PARTITION OF fprt2 FOR VALUES FROM (0) TO (250) + SERVER loopback OPTIONS (table_name 'fprt2_p1', use_remote_estimate 'true'); +CREATE FOREIGN TABLE ftprt2_p2 PARTITION OF 
fprt2 FOR VALUES FROM (250) TO (500) + SERVER loopback OPTIONS (table_name 'fprt2_p2', use_remote_estimate 'true'); +ANALYZE fprt2; +ANALYZE fprt2_p1; +ANALYZE fprt2_p2; +-- inner join three tables +EXPLAIN (COSTS OFF) +SELECT t1.a,t2.b,t3.c FROM fprt1 t1 INNER JOIN fprt2 t2 ON (t1.a = t2.b) INNER JOIN fprt1 t3 ON (t2.b = t3.a) WHERE t1.a % 25 =0 ORDER BY 1,2,3; + QUERY PLAN +-------------------------------------------------------------------------------------------------------------------- + Sort + Sort Key: t1.a, t3.c + -> Append + -> Foreign Scan + Relations: ((public.ftprt1_p1 t1) INNER JOIN (public.ftprt2_p1 t2)) INNER JOIN (public.ftprt1_p1 t3) + -> Foreign Scan + Relations: ((public.ftprt1_p2 t1) INNER JOIN (public.ftprt2_p2 t2)) INNER JOIN (public.ftprt1_p2 t3) +(7 rows) + +SELECT t1.a,t2.b,t3.c FROM fprt1 t1 INNER JOIN fprt2 t2 ON (t1.a = t2.b) INNER JOIN fprt1 t3 ON (t2.b = t3.a) WHERE t1.a % 25 =0 ORDER BY 1,2,3; + a | b | c +-----+-----+------ + 0 | 0 | 0000 + 150 | 150 | 0003 + 250 | 250 | 0005 + 400 | 400 | 0008 +(4 rows) + +-- left outer join + nullable clasue +EXPLAIN (COSTS OFF) +SELECT t1.a,t2.b,t2.c FROM fprt1 t1 LEFT JOIN (SELECT * FROM fprt2 WHERE a < 10) t2 ON (t1.a = t2.b and t1.b = t2.a) WHERE t1.a < 10 ORDER BY 1,2,3; + QUERY PLAN +----------------------------------------------------------------------------------- + Sort + Sort Key: t1.a, ftprt2_p1.b, ftprt2_p1.c + -> Append + -> Foreign Scan + Relations: (public.ftprt1_p1 t1) LEFT JOIN (public.ftprt2_p1 fprt2) +(5 rows) + +SELECT t1.a,t2.b,t2.c FROM fprt1 t1 LEFT JOIN (SELECT * FROM fprt2 WHERE a < 10) t2 ON (t1.a = t2.b and t1.b = t2.a) WHERE t1.a < 10 ORDER BY 1,2,3; + a | b | c +---+---+------ + 0 | 0 | 0000 + 2 | | + 4 | | + 6 | 6 | 0000 + 8 | | +(5 rows) + +-- with whole-row reference +EXPLAIN (COSTS OFF) +SELECT t1,t2 FROM fprt1 t1 JOIN fprt2 t2 ON (t1.a = t2.b and t1.b = t2.a) WHERE t1.a % 25 =0 ORDER BY 1,2; + QUERY PLAN +--------------------------------------------------------------------------------- + Sort + Sort Key: ((t1.*)::fprt1), ((t2.*)::fprt2) + -> Append + -> Foreign Scan + Relations: (public.ftprt1_p1 t1) INNER JOIN (public.ftprt2_p1 t2) + -> Foreign Scan + Relations: (public.ftprt1_p2 t1) INNER JOIN (public.ftprt2_p2 t2) +(7 rows) + +SELECT t1,t2 FROM fprt1 t1 JOIN fprt2 t2 ON (t1.a = t2.b and t1.b = t2.a) WHERE t1.a % 25 =0 ORDER BY 1,2; + t1 | t2 +----------------+---------------- + (0,0,0000) | (0,0,0000) + (150,150,0003) | (150,150,0003) + (250,250,0005) | (250,250,0005) + (400,400,0008) | (400,400,0008) +(4 rows) + +-- join with lateral reference +EXPLAIN (COSTS OFF) +SELECT t1.a,t1.b FROM fprt1 t1, LATERAL (SELECT t2.a, t2.b FROM fprt2 t2 WHERE t1.a = t2.b AND t1.b = t2.a) q WHERE t1.a%25 = 0 ORDER BY 1,2; + QUERY PLAN +--------------------------------------------------------------------------------- + Sort + Sort Key: t1.a, t1.b + -> Append + -> Foreign Scan + Relations: (public.ftprt1_p1 t1) INNER JOIN (public.ftprt2_p1 t2) + -> Foreign Scan + Relations: (public.ftprt1_p2 t1) INNER JOIN (public.ftprt2_p2 t2) +(7 rows) + +SELECT t1.a,t1.b FROM fprt1 t1, LATERAL (SELECT t2.a, t2.b FROM fprt2 t2 WHERE t1.a = t2.b AND t1.b = t2.a) q WHERE t1.a%25 = 0 ORDER BY 1,2; + a | b +-----+----- + 0 | 0 + 150 | 150 + 250 | 250 + 400 | 400 +(4 rows) + +RESET enable_partition_wise_join; diff --git a/contrib/postgres_fdw/sql/postgres_fdw.sql b/contrib/postgres_fdw/sql/postgres_fdw.sql index 5048bff6..471bceae 100644 --- a/contrib/postgres_fdw/sql/postgres_fdw.sql +++ 
b/contrib/postgres_fdw/sql/postgres_fdw.sql @@ -1764,3 +1764,56 @@ WHERE ftrelid = 'table30000'::regclass AND ftoptions @> array['fetch_size=60000']; ROLLBACK; + +-- =================================================================== +-- test partition-wise-joins +-- =================================================================== +SET enable_partition_wise_join=on; + +CREATE TABLE fprt1 (a int, b int, c varchar) PARTITION BY RANGE(a); +CREATE TABLE fprt1_p1 (LIKE fprt1); +CREATE TABLE fprt1_p2 (LIKE fprt1); +INSERT INTO fprt1_p1 SELECT i, i, to_char(i/50, 'FM0000') FROM generate_series(0, 249, 2) i; +INSERT INTO fprt1_p2 SELECT i, i, to_char(i/50, 'FM0000') FROM generate_series(250, 499, 2) i; +CREATE FOREIGN TABLE ftprt1_p1 PARTITION OF fprt1 FOR VALUES FROM (0) TO (250) + SERVER loopback OPTIONS (table_name 'fprt1_p1', use_remote_estimate 'true'); +CREATE FOREIGN TABLE ftprt1_p2 PARTITION OF fprt1 FOR VALUES FROM (250) TO (500) + SERVER loopback OPTIONS (TABLE_NAME 'fprt1_p2'); +ANALYZE fprt1; +ANALYZE fprt1_p1; +ANALYZE fprt1_p2; + +CREATE TABLE fprt2 (a int, b int, c varchar) PARTITION BY RANGE(b); +CREATE TABLE fprt2_p1 (LIKE fprt2); +CREATE TABLE fprt2_p2 (LIKE fprt2); +INSERT INTO fprt2_p1 SELECT i, i, to_char(i/50, 'FM0000') FROM generate_series(0, 249, 3) i; +INSERT INTO fprt2_p2 SELECT i, i, to_char(i/50, 'FM0000') FROM generate_series(250, 499, 3) i; +CREATE FOREIGN TABLE ftprt2_p1 PARTITION OF fprt2 FOR VALUES FROM (0) TO (250) + SERVER loopback OPTIONS (table_name 'fprt2_p1', use_remote_estimate 'true'); +CREATE FOREIGN TABLE ftprt2_p2 PARTITION OF fprt2 FOR VALUES FROM (250) TO (500) + SERVER loopback OPTIONS (table_name 'fprt2_p2', use_remote_estimate 'true'); +ANALYZE fprt2; +ANALYZE fprt2_p1; +ANALYZE fprt2_p2; + +-- inner join three tables +EXPLAIN (COSTS OFF) +SELECT t1.a,t2.b,t3.c FROM fprt1 t1 INNER JOIN fprt2 t2 ON (t1.a = t2.b) INNER JOIN fprt1 t3 ON (t2.b = t3.a) WHERE t1.a % 25 =0 ORDER BY 1,2,3; +SELECT t1.a,t2.b,t3.c FROM fprt1 t1 INNER JOIN fprt2 t2 ON (t1.a = t2.b) INNER JOIN fprt1 t3 ON (t2.b = t3.a) WHERE t1.a % 25 =0 ORDER BY 1,2,3; + +-- left outer join + nullable clasue +EXPLAIN (COSTS OFF) +SELECT t1.a,t2.b,t2.c FROM fprt1 t1 LEFT JOIN (SELECT * FROM fprt2 WHERE a < 10) t2 ON (t1.a = t2.b and t1.b = t2.a) WHERE t1.a < 10 ORDER BY 1,2,3; +SELECT t1.a,t2.b,t2.c FROM fprt1 t1 LEFT JOIN (SELECT * FROM fprt2 WHERE a < 10) t2 ON (t1.a = t2.b and t1.b = t2.a) WHERE t1.a < 10 ORDER BY 1,2,3; + +-- with whole-row reference +EXPLAIN (COSTS OFF) +SELECT t1,t2 FROM fprt1 t1 JOIN fprt2 t2 ON (t1.a = t2.b and t1.b = t2.a) WHERE t1.a % 25 =0 ORDER BY 1,2; +SELECT t1,t2 FROM fprt1 t1 JOIN fprt2 t2 ON (t1.a = t2.b and t1.b = t2.a) WHERE t1.a % 25 =0 ORDER BY 1,2; + +-- join with lateral reference +EXPLAIN (COSTS OFF) +SELECT t1.a,t1.b FROM fprt1 t1, LATERAL (SELECT t2.a, t2.b FROM fprt2 t2 WHERE t1.a = t2.b AND t1.b = t2.a) q WHERE t1.a%25 = 0 ORDER BY 1,2; +SELECT t1.a,t1.b FROM fprt1 t1, LATERAL (SELECT t2.a, t2.b FROM fprt2 t2 WHERE t1.a = t2.b AND t1.b = t2.a) q WHERE t1.a%25 = 0 ORDER BY 1,2; + +RESET enable_partition_wise_join; diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index e3c5bab5..939ddd85 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -3705,6 +3705,26 @@ ANY num_sync ( + enable_partition_wise_join (boolean) + + enable_partition_wise_join configuration parameter + + + + + Enables or disables the query planner's use of partition-wise join, + which allows a join between partitioned tables to be performed by + 
joining the matching partitions. Partition-wise join currently applies
+ only when the join conditions include all the partition keys, which
+ must be of the same data type and have exactly matching sets of child
+ partitions. Because partition-wise join planning can use significantly
+ more CPU time and memory during planning, the default is
+ off.
+
+
+
+
 enable_seqscan (boolean)
diff --git a/doc/src/sgml/fdwhandler.sgml b/doc/src/sgml/fdwhandler.sgml
index cfa68084..cab6ade2 100644
--- a/doc/src/sgml/fdwhandler.sgml
+++ b/doc/src/sgml/fdwhandler.sgml
@@ -1289,6 +1289,26 @@ ShutdownForeignScan(ForeignScanState *node);
+
+ FDW Routines For reparameterization of paths
+
+
+List *
+ReparameterizeForeignPathByChild(PlannerInfo *root, List *fdw_private,
+ RelOptInfo *child_rel);
+
+ This function is called while converting a path parameterized by the
+ top-most parent of the given child relation child_rel to be
+ parameterized by the child relation. The function is used to reparameterize
+ any paths or translate any expression nodes saved in the given
+ fdw_private member of a ForeignPath. The
+ callback may use reparameterize_path_by_child,
+ adjust_appendrel_attrs or
+ adjust_appendrel_attrs_multilevel as required.
+
+
+
diff --git a/src/backend/optimizer/README b/src/backend/optimizer/README
index fc0fca41..273e5914 100644
--- a/src/backend/optimizer/README
+++ b/src/backend/optimizer/README
@@ -1076,3 +1076,29 @@ be desirable to postpone the Gather stage until as near to the top of the
plan as possible. Expanding the range of cases in which more work can be
pushed below the Gather (and costing them accurately) is likely to keep us
busy for a long time to come.
+
+Partition-wise joins
+--------------------
+A join between two similarly partitioned tables can be broken down into joins
+between their matching partitions if there exists an equi-join condition
+between the partition keys of the joining tables. The equi-join between
+partition keys implies that all join partners for a given row in one
+partitioned table must be in the corresponding partition of the other
+partitioned table. Because of this, the join between partitioned tables can be
+broken into joins between the matching partitions. The resultant join is
+partitioned in the same way as the joining relations, thus allowing an N-way
+join between similarly partitioned tables having an equi-join condition between
+their partition keys to be broken down into N-way joins between their matching
+partitions. This technique of breaking down a join between partitioned tables
+into joins between their partitions is called partition-wise join. We will use
+the term "partitioned relation" for either a partitioned table or a join between
+compatibly partitioned tables.
+
+The partitioning properties of a partitioned relation are stored in its
+RelOptInfo. The information about the data types of the partition keys is
+stored in the PartitionSchemeData structure. The planner maintains a list of
+canonical partition schemes (distinct PartitionSchemeData objects) so that the
+RelOptInfos of any two partitioned relations with the same partitioning scheme
+point to the same PartitionSchemeData object. This reduces the memory consumed
+by PartitionSchemeData objects and makes it easy to compare the partition
+schemes of joining relations.
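To make the behavior described above concrete, here is a minimal illustrative sketch. It is not part of the patch, and the table names are invented for the example; the regression tests added in src/test/regress/sql/partition_join.sql and the postgres_fdw tests above exercise this far more thoroughly.

    SET enable_partition_wise_join = on;

    CREATE TABLE prt1 (a int, b int) PARTITION BY RANGE (a);
    CREATE TABLE prt1_p1 PARTITION OF prt1 FOR VALUES FROM (0) TO (250);
    CREATE TABLE prt1_p2 PARTITION OF prt1 FOR VALUES FROM (250) TO (500);

    CREATE TABLE prt2 (a int, c text) PARTITION BY RANGE (a);
    CREATE TABLE prt2_p1 PARTITION OF prt2 FOR VALUES FROM (0) TO (250);
    CREATE TABLE prt2_p2 PARTITION OF prt2 FOR VALUES FROM (250) TO (500);

    -- The partition bounds match exactly and the join condition covers the
    -- partition key, so the join can be planned as an Append over the two
    -- child joins (prt1_p1 with prt2_p1, prt1_p2 with prt2_p2) rather than
    -- as a single join of the whole tables.
    EXPLAIN (COSTS OFF)
    SELECT * FROM prt1 t1 JOIN prt2 t2 ON t1.a = t2.a;

With enable_partition_wise_join left at its default of off, the same query is planned as one join between the two whole-table scans.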
diff --git a/src/backend/optimizer/geqo/geqo_eval.c b/src/backend/optimizer/geqo/geqo_eval.c index b5848268..108b866c 100644 --- a/src/backend/optimizer/geqo/geqo_eval.c +++ b/src/backend/optimizer/geqo/geqo_eval.c @@ -264,6 +264,9 @@ merge_clump(PlannerInfo *root, List *clumps, Clump *new_clump, bool force) /* Keep searching if join order is not valid */ if (joinrel) { + /* Create paths for partition-wise joins. */ + generate_partition_wise_join_paths(root, joinrel); + /* Create GatherPaths for any useful partial paths for rel */ generate_gather_paths(root, joinrel); diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c index 60f3dd20..0774ff46 100644 --- a/src/backend/optimizer/path/allpaths.c +++ b/src/backend/optimizer/path/allpaths.c @@ -940,12 +940,79 @@ set_append_rel_size(PlannerInfo *root, RelOptInfo *rel, childrel = find_base_rel(root, childRTindex); Assert(childrel->reloptkind == RELOPT_OTHER_MEMBER_REL); + if (rel->part_scheme) + { + AttrNumber attno; + + /* + * We need attr_needed data for building targetlist of a join + * relation representing join between matching partitions for + * partition-wise join. A given attribute of a child will be + * needed in the same highest joinrel where the corresponding + * attribute of parent is needed. Hence it suffices to use the + * same Relids set for parent and child. + */ + for (attno = rel->min_attr; attno <= rel->max_attr; attno++) + { + int index = attno - rel->min_attr; + Relids attr_needed = rel->attr_needed[index]; + + /* System attributes do not need translation. */ + if (attno <= 0) + { + Assert(rel->min_attr == childrel->min_attr); + childrel->attr_needed[index] = attr_needed; + } + else + { + Var *var = list_nth_node(Var, + appinfo->translated_vars, + attno - 1); + int child_index; + + child_index = var->varattno - childrel->min_attr; + childrel->attr_needed[child_index] = attr_needed; + } + } + } + + /* + * Copy/Modify targetlist. Even if this child is deemed empty, we need + * its targetlist in case it falls on nullable side in a child-join + * because of partition-wise join. + * + * NB: the resulting childrel->reltarget->exprs may contain arbitrary + * expressions, which otherwise would not occur in a rel's targetlist. + * Code that might be looking at an appendrel child must cope with + * such. (Normally, a rel's targetlist would only include Vars and + * PlaceHolderVars.) XXX we do not bother to update the cost or width + * fields of childrel->reltarget; not clear if that would be useful. + */ + childrel->reltarget->exprs = (List *) + adjust_appendrel_attrs(root, + (Node *) rel->reltarget->exprs, + 1, &appinfo); + /* - * We have to copy the parent's targetlist and quals to the child, - * with appropriate substitution of variables. However, only the - * baserestrictinfo quals are needed before we can check for - * constraint exclusion; so do that first and then check to see if we - * can disregard this child. + * We have to make child entries in the EquivalenceClass data + * structures as well. This is needed either if the parent + * participates in some eclass joins (because we will want to consider + * inner-indexscan joins on the individual children) or if the parent + * has useful pathkeys (because we should try to build MergeAppend + * paths that produce those sort orderings). Even if this child is + * deemed dummy, it may fall on nullable side in a child-join, which + * in turn may participate in a MergeAppend, where we will need the + * EquivalenceClass data structures. 
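+ * (add_child_rel_equivalences() below installs child copies of the parent's
+ * EquivalenceClass members, with the parent's Vars translated to the
+ * child's, so that pathkeys and mergejoin clauses can later be matched
+ * against this child rel.)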
+ */ + if (rel->has_eclass_joins || has_useful_pathkeys(root, rel)) + add_child_rel_equivalences(root, appinfo, rel, childrel); + childrel->has_eclass_joins = rel->has_eclass_joins; + + /* + * We have to copy the parent's quals to the child, with appropriate + * substitution of variables. However, only the baserestrictinfo + * quals are needed before we can check for constraint exclusion; so + * do that first and then check to see if we can disregard this child. * * The child rel's targetlist might contain non-Var expressions, which * means that substitution into the quals could produce opportunities @@ -1072,44 +1139,11 @@ set_append_rel_size(PlannerInfo *root, RelOptInfo *rel, continue; } - /* - * CE failed, so finish copying/modifying targetlist and join quals. - * - * NB: the resulting childrel->reltarget->exprs may contain arbitrary - * expressions, which otherwise would not occur in a rel's targetlist. - * Code that might be looking at an appendrel child must cope with - * such. (Normally, a rel's targetlist would only include Vars and - * PlaceHolderVars.) XXX we do not bother to update the cost or width - * fields of childrel->reltarget; not clear if that would be useful. - */ + /* CE failed, so finish copying/modifying join quals. */ childrel->joininfo = (List *) adjust_appendrel_attrs(root, (Node *) rel->joininfo, appinfo); - childrel->reltarget->exprs = (List *) - adjust_appendrel_attrs(root, - (Node *) rel->reltarget->exprs, - appinfo); - - /* - * We have to make child entries in the EquivalenceClass data - * structures as well. This is needed either if the parent - * participates in some eclass joins (because we will want to consider - * inner-indexscan joins on the individual children) or if the parent - * has useful pathkeys (because we should try to build MergeAppend - * paths that produce those sort orderings). - */ - if (rel->has_eclass_joins || has_useful_pathkeys(root, rel)) - add_child_rel_equivalences(root, appinfo, rel, childrel); - childrel->has_eclass_joins = rel->has_eclass_joins; - - /* - * Note: we could compute appropriate attr_needed data for the child's - * variables, by transforming the parent's attr_needed through the - * translated_vars mapping. However, currently there's no need - * because attr_needed is only examined for base relations not - * otherrels. So we just leave the child's attr_needed empty. - */ /* * If parallelism is allowable for this query in general, see whether @@ -1282,14 +1316,14 @@ set_append_rel_pathlist(PlannerInfo *root, RelOptInfo *rel, live_childrels = lappend(live_childrels, childrel); } - /* Add paths to the "append" relation. */ + /* Add paths to the append relation. */ add_paths_to_append_rel(root, rel, live_childrels); } /* * add_paths_to_append_rel - * Generate paths for given "append" relation given the set of non-dummy + * Generate paths for the given append relation given the set of non-dummy * child rels. * * The function collects all parameterizations and orderings supported by the @@ -1313,16 +1347,19 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte; bool build_partitioned_rels = false; + if (IS_SIMPLE_REL(rel)) + { /* * A root partition will already have a PartitionedChildRelInfo, and a - * non-root partitioned table doesn't need one, because its Append paths - * will get flattened into the parent anyway. For a subquery RTE, no - * PartitionedChildRelInfo exists; we collect all partitioned_rels - * associated with any child. 
(This assumes that we don't need to look - * through multiple levels of subquery RTEs; if we ever do, we could - * create a PartitionedChildRelInfo with the accumulated list of - * partitioned_rels which would then be found when populated our parent - * rel with paths. For the present, that appears to be unnecessary.) + * non-root partitioned table doesn't need one, because its Append + * paths will get flattened into the parent anyway. For a subquery + * RTE, no PartitionedChildRelInfo exists; we collect all + * partitioned_rels associated with any child. (This assumes that we + * don't need to look through multiple levels of subquery RTEs; if we + * ever do, we could create a PartitionedChildRelInfo with the + * accumulated list of partitioned_rels which would then be found when + * populated our parent rel with paths. For the present, that appears + * to be unnecessary.) */ rte = planner_rt_fetch(rel->relid, root); switch (rte->rtekind) @@ -1338,6 +1375,17 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel, default: elog(ERROR, "unexpcted rtekind: %d", (int) rte->rtekind); } + } + else if (rel->reloptkind == RELOPT_JOINREL && rel->part_scheme) + { + /* + * Associate PartitionedChildRelInfo of the root partitioned tables + * being joined with the root partitioned join (indicated by + * RELOPT_JOINREL). + */ + partitioned_rels = get_partitioned_child_rels_for_join(root, + rel->relids); + } /* * For every non-dummy child, remember the cheapest path. Also, identify @@ -2555,16 +2603,22 @@ standard_join_search(PlannerInfo *root, int levels_needed, List *initial_rels) join_search_one_level(root, lev); /* - * Run generate_gather_paths() for each just-processed joinrel. We - * could not do this earlier because both regular and partial paths - * can get added to a particular joinrel at multiple times within - * join_search_one_level. After that, we're done creating paths for - * the joinrel, so run set_cheapest(). + * Run generate_partition_wise_join_paths() and + * generate_gather_paths() for each just-processed joinrel. We could + * not do this earlier because both regular and partial paths can get + * added to a particular joinrel at multiple times within + * join_search_one_level. + * + * After that, we're done creating paths for the joinrel, so run + * set_cheapest(). */ foreach(lc, root->join_rel_level[lev]) { rel = (RelOptInfo *) lfirst(lc); + /* Create paths for partition-wise joins. */ + generate_partition_wise_join_paths(root, rel); + /* Create GatherPaths for any useful partial paths for rel */ generate_gather_paths(root, rel); @@ -3312,6 +3366,82 @@ compute_parallel_worker(RelOptInfo *rel, double heap_pages, double index_pages) return parallel_workers; } +/* + * generate_partition_wise_join_paths + * Create paths representing partition-wise join for given partitioned + * join relation. + * + * This must not be called until after we are done adding paths for all + * child-joins. Otherwise, add_path might delete a path to which some path + * generated here has a reference. + */ +void +generate_partition_wise_join_paths(PlannerInfo *root, RelOptInfo *rel) +{ + List *live_children = NIL; + int cnt_parts; + int num_parts; + RelOptInfo **part_rels; + + /* Handle only join relations here. */ + if (!IS_JOIN_REL(rel)) + return; + + /* + * If we've already proven this join is empty, we needn't consider any + * more paths for it. + */ + if (IS_DUMMY_REL(rel)) + return; + + /* + * Nothing to do if the relation is not partitioned. 
An outer join + * relation which had empty inner relation in every pair will have rest of + * the partitioning properties set except the child-join RelOptInfos. See + * try_partition_wise_join() for more explanation. + */ + if (rel->nparts <= 0 || rel->part_rels == NULL) + return; + + /* Guard against stack overflow due to overly deep partition hierarchy. */ + check_stack_depth(); + + num_parts = rel->nparts; + part_rels = rel->part_rels; + + /* Collect non-dummy child-joins. */ + for (cnt_parts = 0; cnt_parts < num_parts; cnt_parts++) + { + RelOptInfo *child_rel = part_rels[cnt_parts]; + + /* Add partition-wise join paths for partitioned child-joins. */ + generate_partition_wise_join_paths(root, child_rel); + + /* Dummy children will not be scanned, so ingore those. */ + if (IS_DUMMY_REL(child_rel)) + continue; + + set_cheapest(child_rel); + +#ifdef OPTIMIZER_DEBUG + debug_print_rel(root, rel); +#endif + + live_children = lappend(live_children, child_rel); + } + + /* If all child-joins are dummy, parent join is also dummy. */ + if (!live_children) + { + mark_dummy_rel(rel); + return; + } + + /* Build additional paths for this rel from child-join paths. */ + add_paths_to_append_rel(root, rel, live_children); + list_free(live_children); +} + /***************************************************************************** * DEBUG SUPPORT diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index 9d82cec9..18ca6a7d 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -141,6 +141,7 @@ bool enable_mergejoin = true; bool enable_hashjoin = true; bool enable_fast_query_shipping = true; bool enable_gathermerge = true; +bool enable_partition_wise_join = false; bool enable_nestloop_suppression = false; typedef struct diff --git a/src/backend/optimizer/path/joinpath.c b/src/backend/optimizer/path/joinpath.c index 72a766af..de4c1137 100644 --- a/src/backend/optimizer/path/joinpath.c +++ b/src/backend/optimizer/path/joinpath.c @@ -29,9 +29,19 @@ /* Hook for plugins to get control in add_paths_to_joinrel() */ set_join_pathlist_hook_type set_join_pathlist_hook = NULL; -#define PATH_PARAM_BY_REL(path, rel) \ +/* + * Paths parameterized by the parent can be considered to be parameterized by + * any of its child. + */ +#define PATH_PARAM_BY_PARENT(path, rel) \ + ((path)->param_info && bms_overlap(PATH_REQ_OUTER(path), \ + (rel)->top_parent_relids)) +#define PATH_PARAM_BY_REL_SELF(path, rel) \ ((path)->param_info && bms_overlap(PATH_REQ_OUTER(path), (rel)->relids)) +#define PATH_PARAM_BY_REL(path, rel) \ + (PATH_PARAM_BY_REL_SELF(path, rel) || PATH_PARAM_BY_PARENT(path, rel)) + static void try_partial_mergejoin_path(PlannerInfo *root, RelOptInfo *joinrel, Path *outer_path, @@ -118,6 +128,19 @@ add_paths_to_joinrel(PlannerInfo *root, JoinPathExtraData extra; bool mergejoin_allowed = true; ListCell *lc; + Relids joinrelids; + + /* + * PlannerInfo doesn't contain the SpecialJoinInfos created for joins + * between child relations, even if there is a SpecialJoinInfo node for + * the join between the topmost parents. So, while calculating Relids set + * representing the restriction, consider relids of topmost parent of + * partitions. + */ + if (joinrel->reloptkind == RELOPT_OTHER_JOINREL) + joinrelids = joinrel->top_parent_relids; + else + joinrelids = joinrel->relids; extra.restrictlist = restrictlist; extra.mergeclause_list = NIL; @@ -224,16 +247,16 @@ add_paths_to_joinrel(PlannerInfo *root, * join has already been proven legal.) 
If the SJ is relevant, it * presents constraints for joining to anything not in its RHS. */ - if (bms_overlap(joinrel->relids, sjinfo2->min_righthand) && - !bms_overlap(joinrel->relids, sjinfo2->min_lefthand)) + if (bms_overlap(joinrelids, sjinfo2->min_righthand) && + !bms_overlap(joinrelids, sjinfo2->min_lefthand)) extra.param_source_rels = bms_join(extra.param_source_rels, bms_difference(root->all_baserels, sjinfo2->min_righthand)); /* full joins constrain both sides symmetrically */ if (sjinfo2->jointype == JOIN_FULL && - bms_overlap(joinrel->relids, sjinfo2->min_lefthand) && - !bms_overlap(joinrel->relids, sjinfo2->min_righthand)) + bms_overlap(joinrelids, sjinfo2->min_lefthand) && + !bms_overlap(joinrelids, sjinfo2->min_righthand)) extra.param_source_rels = bms_join(extra.param_source_rels, bms_difference(root->all_baserels, sjinfo2->min_lefthand)); @@ -360,12 +383,26 @@ try_nestloop_path(PlannerInfo *root, JoinCostWorkspace workspace; RelOptInfo *innerrel = inner_path->parent; RelOptInfo *outerrel = outer_path->parent; - Relids innerrelids = innerrel->relids; - Relids outerrelids = outerrel->relids; + Relids innerrelids; + Relids outerrelids; Relids inner_paramrels = PATH_REQ_OUTER(inner_path); Relids outer_paramrels = PATH_REQ_OUTER(outer_path); /* + * Paths are parameterized by top-level parents, so run parameterization + * tests on the parent relids. + */ + if (innerrel->top_parent_relids) + innerrelids = innerrel->top_parent_relids; + else + innerrelids = innerrel->relids; + + if (outerrel->top_parent_relids) + outerrelids = outerrel->top_parent_relids; + else + outerrelids = outerrel->relids; + + /* * Check to see if proposed path is still parameterized, and reject if the * parameterization wouldn't be sensible --- unless allow_star_schema_join * says to allow it anyway. Also, we must reject if have_dangerous_phv @@ -400,6 +437,27 @@ try_nestloop_path(PlannerInfo *root, workspace.startup_cost, workspace.total_cost, pathkeys, required_outer)) { + /* + * If the inner path is parameterized, it is parameterized by the + * topmost parent of the outer rel, not the outer rel itself. Fix + * that. + */ + if (PATH_PARAM_BY_PARENT(inner_path, outer_path->parent)) + { + inner_path = reparameterize_path_by_child(root, inner_path, + outer_path->parent); + + /* + * If we could not translate the path, we can't create nest loop + * path. + */ + if (!inner_path) + { + bms_free(required_outer); + return; + } + } + add_path(joinrel, (Path *) create_nestloop_path(root, joinrel, @@ -445,8 +503,20 @@ try_partial_nestloop_path(PlannerInfo *root, if (inner_path->param_info != NULL) { Relids inner_paramrels = inner_path->param_info->ppi_req_outer; + RelOptInfo *outerrel = outer_path->parent; + Relids outerrelids; - if (!bms_is_subset(inner_paramrels, outer_path->parent->relids)) + /* + * The inner and outer paths are parameterized, if at all, by the top + * level parents, not the child relations, so we must use those relids + * for our paramaterization tests. + */ + if (outerrel->top_parent_relids) + outerrelids = outerrel->top_parent_relids; + else + outerrelids = outerrel->relids; + + if (!bms_is_subset(inner_paramrels, outerrelids)) return; } @@ -459,6 +529,22 @@ try_partial_nestloop_path(PlannerInfo *root, if (!add_partial_path_precheck(joinrel, workspace.total_cost, pathkeys)) return; + /* + * If the inner path is parameterized, it is parameterized by the topmost + * parent of the outer rel, not the outer rel itself. Fix that. 
+ */ + if (PATH_PARAM_BY_PARENT(inner_path, outer_path->parent)) + { + inner_path = reparameterize_path_by_child(root, inner_path, + outer_path->parent); + + /* + * If we could not translate the path, we can't create nest loop path. + */ + if (!inner_path) + return; + } + /* Might be good enough to be worth trying, so let's try it. */ add_partial_path(joinrel, (Path *) create_nestloop_path(root, diff --git a/src/backend/optimizer/path/joinrels.c b/src/backend/optimizer/path/joinrels.c index eb920d05..ad902dcf 100644 --- a/src/backend/optimizer/path/joinrels.c +++ b/src/backend/optimizer/path/joinrels.c @@ -14,10 +14,17 @@ */ #include "postgres.h" +#include "miscadmin.h" +#include "catalog/partition.h" +#include "nodes/relation.h" +#include "optimizer/clauses.h" #include "optimizer/joininfo.h" #include "optimizer/pathnode.h" #include "optimizer/paths.h" +#include "optimizer/prep.h" +#include "optimizer/cost.h" #include "utils/memutils.h" +#include "utils/lsyscache.h" static void make_rels_by_clause_joins(PlannerInfo *root, @@ -29,12 +36,17 @@ static void make_rels_by_clauseless_joins(PlannerInfo *root, static bool has_join_restriction(PlannerInfo *root, RelOptInfo *rel); static bool has_legal_joinclause(PlannerInfo *root, RelOptInfo *rel); static bool is_dummy_rel(RelOptInfo *rel); -static void mark_dummy_rel(RelOptInfo *rel); static bool restriction_is_constant_false(List *restrictlist, bool only_pushed_down); static void populate_joinrel_with_paths(PlannerInfo *root, RelOptInfo *rel1, RelOptInfo *rel2, RelOptInfo *joinrel, SpecialJoinInfo *sjinfo, List *restrictlist); +static void try_partition_wise_join(PlannerInfo *root, RelOptInfo *rel1, + RelOptInfo *rel2, RelOptInfo *joinrel, + SpecialJoinInfo *parent_sjinfo, + List *parent_restrictlist); +static int match_expr_to_partition_keys(Expr *expr, RelOptInfo *rel, + bool strict_op); /* @@ -913,6 +925,9 @@ populate_joinrel_with_paths(PlannerInfo *root, RelOptInfo *rel1, elog(ERROR, "unrecognized join type: %d", (int) sjinfo->jointype); break; } + + /* Apply partition-wise join technique, if possible. */ + try_partition_wise_join(root, rel1, rel2, joinrel, sjinfo, restrictlist); } @@ -1218,7 +1233,7 @@ is_dummy_rel(RelOptInfo *rel) * is that the best solution is to explicitly make the dummy path in the same * context the given RelOptInfo is in. */ -static void +void mark_dummy_rel(RelOptInfo *rel) { MemoryContext oldcontext; @@ -1289,3 +1304,300 @@ restriction_is_constant_false(List *restrictlist, bool only_pushed_down) } return false; } + +/* + * Assess whether join between given two partitioned relations can be broken + * down into joins between matching partitions; a technique called + * "partition-wise join" + * + * Partition-wise join is possible when a. Joining relations have same + * partitioning scheme b. There exists an equi-join between the partition keys + * of the two relations. + * + * Partition-wise join is planned as follows (details: optimizer/README.) + * + * 1. Create the RelOptInfos for joins between matching partitions i.e + * child-joins and add paths to them. + * + * 2. Construct Append or MergeAppend paths across the set of child joins. + * This second phase is implemented by generate_partition_wise_join_paths(). + * + * The RelOptInfo, SpecialJoinInfo and restrictlist for each child join are + * obtained by translating the respective parent join structures. 
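+ * For example (illustrative names only), if t1 and t2 are both
+ * range-partitioned into [0, 250) and [250, 500) on the columns being
+ * equi-joined, the loop below builds the child joins t1_p1/t2_p1 and
+ * t1_p2/t2_p2; generate_partition_wise_join_paths() later combines their
+ * paths into an Append for the parent join.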
+ */ +static void +try_partition_wise_join(PlannerInfo *root, RelOptInfo *rel1, RelOptInfo *rel2, + RelOptInfo *joinrel, SpecialJoinInfo *parent_sjinfo, + List *parent_restrictlist) +{ + int nparts; + int cnt_parts; + + /* Guard against stack overflow due to overly deep partition hierarchy. */ + check_stack_depth(); + + /* Nothing to do, if the join relation is not partitioned. */ + if (!IS_PARTITIONED_REL(joinrel)) + return; + + /* + * set_rel_pathlist() may not create paths in children of an empty + * partitioned table and so we can not add paths to child-joins. So, deem + * such a join as unpartitioned. When a partitioned relation is deemed + * empty because all its children are empty, dummy path will be set in + * each of the children. In such a case we could still consider the join + * as partitioned, but it might not help much. + */ + if (IS_DUMMY_REL(rel1) || IS_DUMMY_REL(rel2)) + return; + + /* + * Since this join relation is partitioned, all the base relations + * participating in this join must be partitioned and so are all the + * intermediate join relations. + */ + Assert(IS_PARTITIONED_REL(rel1) && IS_PARTITIONED_REL(rel2)); + Assert(REL_HAS_ALL_PART_PROPS(rel1) && REL_HAS_ALL_PART_PROPS(rel2)); + + /* + * The partition scheme of the join relation should match that of the + * joining relations. + */ + Assert(joinrel->part_scheme == rel1->part_scheme && + joinrel->part_scheme == rel2->part_scheme); + + /* + * Since we allow partition-wise join only when the partition bounds of + * the joining relations exactly match, the partition bounds of the join + * should match those of the joining relations. + */ + Assert(partition_bounds_equal(joinrel->part_scheme->partnatts, + joinrel->part_scheme->parttyplen, + joinrel->part_scheme->parttypbyval, + joinrel->boundinfo, rel1->boundinfo)); + Assert(partition_bounds_equal(joinrel->part_scheme->partnatts, + joinrel->part_scheme->parttyplen, + joinrel->part_scheme->parttypbyval, + joinrel->boundinfo, rel2->boundinfo)); + + nparts = joinrel->nparts; + + /* Allocate space to hold child-joins RelOptInfos, if not already done. */ + if (!joinrel->part_rels) + joinrel->part_rels = + (RelOptInfo **) palloc0(sizeof(RelOptInfo *) * nparts); + + /* + * Create child-join relations for this partitioned join, if those don't + * exist. Add paths to child-joins for a pair of child relations + * corresponding to the given pair of parent relations. + */ + for (cnt_parts = 0; cnt_parts < nparts; cnt_parts++) + { + RelOptInfo *child_rel1 = rel1->part_rels[cnt_parts]; + RelOptInfo *child_rel2 = rel2->part_rels[cnt_parts]; + SpecialJoinInfo *child_sjinfo; + List *child_restrictlist; + RelOptInfo *child_joinrel; + Relids child_joinrelids; + AppendRelInfo **appinfos; + int nappinfos; + + /* We should never try to join two overlapping sets of rels. */ + Assert(!bms_overlap(child_rel1->relids, child_rel2->relids)); + child_joinrelids = bms_union(child_rel1->relids, child_rel2->relids); + appinfos = find_appinfos_by_relids(root, child_joinrelids, &nappinfos); + + /* + * Construct SpecialJoinInfo from parent join relations's + * SpecialJoinInfo. + */ + child_sjinfo = build_child_join_sjinfo(root, parent_sjinfo, + child_rel1->relids, + child_rel2->relids); + + /* + * Construct restrictions applicable to the child join from those + * applicable to the parent join. 
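+ * (adjust_appendrel_attrs() substitutes the child relids and Vars for the
+ * parent ones in each clause, using the AppendRelInfos gathered above.)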
+ */ + child_restrictlist = + (List *) adjust_appendrel_attrs(root, + (Node *) parent_restrictlist, + nappinfos, appinfos); + pfree(appinfos); + + child_joinrel = joinrel->part_rels[cnt_parts]; + if (!child_joinrel) + { + child_joinrel = build_child_join_rel(root, child_rel1, child_rel2, + joinrel, child_restrictlist, + child_sjinfo, + child_sjinfo->jointype); + joinrel->part_rels[cnt_parts] = child_joinrel; + } + + Assert(bms_equal(child_joinrel->relids, child_joinrelids)); + + populate_joinrel_with_paths(root, child_rel1, child_rel2, + child_joinrel, child_sjinfo, + child_restrictlist); + } +} + +/* + * Returns true if there exists an equi-join condition for each pair of + * partition keys from given relations being joined. + */ +bool +have_partkey_equi_join(RelOptInfo *rel1, RelOptInfo *rel2, JoinType jointype, + List *restrictlist) +{ + PartitionScheme part_scheme = rel1->part_scheme; + ListCell *lc; + int cnt_pks; + bool pk_has_clause[PARTITION_MAX_KEYS]; + bool strict_op; + + /* + * This function should be called when the joining relations have same + * partitioning scheme. + */ + Assert(rel1->part_scheme == rel2->part_scheme); + Assert(part_scheme); + + memset(pk_has_clause, 0, sizeof(pk_has_clause)); + foreach(lc, restrictlist) + { + RestrictInfo *rinfo = lfirst_node(RestrictInfo, lc); + OpExpr *opexpr; + Expr *expr1; + Expr *expr2; + int ipk1; + int ipk2; + + /* If processing an outer join, only use its own join clauses. */ + if (IS_OUTER_JOIN(jointype) && rinfo->is_pushed_down) + continue; + + /* Skip clauses which can not be used for a join. */ + if (!rinfo->can_join) + continue; + + /* Skip clauses which are not equality conditions. */ + if (!rinfo->mergeopfamilies) + continue; + + opexpr = (OpExpr *) rinfo->clause; + Assert(is_opclause(opexpr)); + + /* + * The equi-join between partition keys is strict if equi-join between + * at least one partition key is using a strict operator. See + * explanation about outer join reordering identity 3 in + * optimizer/README + */ + strict_op = op_strict(opexpr->opno); + + /* Match the operands to the relation. */ + if (bms_is_subset(rinfo->left_relids, rel1->relids) && + bms_is_subset(rinfo->right_relids, rel2->relids)) + { + expr1 = linitial(opexpr->args); + expr2 = lsecond(opexpr->args); + } + else if (bms_is_subset(rinfo->left_relids, rel2->relids) && + bms_is_subset(rinfo->right_relids, rel1->relids)) + { + expr1 = lsecond(opexpr->args); + expr2 = linitial(opexpr->args); + } + else + continue; + + /* + * Only clauses referencing the partition keys are useful for + * partition-wise join. + */ + ipk1 = match_expr_to_partition_keys(expr1, rel1, strict_op); + if (ipk1 < 0) + continue; + ipk2 = match_expr_to_partition_keys(expr2, rel2, strict_op); + if (ipk2 < 0) + continue; + + /* + * If the clause refers to keys at different ordinal positions, it can + * not be used for partition-wise join. + */ + if (ipk1 != ipk2) + continue; + + /* + * The clause allows partition-wise join if only it uses the same + * operator family as that specified by the partition key. + */ + if (!list_member_oid(rinfo->mergeopfamilies, + part_scheme->partopfamily[ipk1])) + continue; + + /* Mark the partition key as having an equi-join clause. */ + pk_has_clause[ipk1] = true; + } + + /* Check whether every partition key has an equi-join condition. 
*/ + for (cnt_pks = 0; cnt_pks < part_scheme->partnatts; cnt_pks++) + { + if (!pk_has_clause[cnt_pks]) + return false; + } + + return true; +} + +/* + * Find the partition key from the given relation matching the given + * expression. If found, return the index of the partition key, else return -1. + */ +static int +match_expr_to_partition_keys(Expr *expr, RelOptInfo *rel, bool strict_op) +{ + int cnt; + + /* This function should be called only for partitioned relations. */ + Assert(rel->part_scheme); + + /* Remove any relabel decorations. */ + while (IsA(expr, RelabelType)) + expr = (Expr *) (castNode(RelabelType, expr))->arg; + + for (cnt = 0; cnt < rel->part_scheme->partnatts; cnt++) + { + ListCell *lc; + + Assert(rel->partexprs); + foreach(lc, rel->partexprs[cnt]) + { + if (equal(lfirst(lc), expr)) + return cnt; + } + + if (!strict_op) + continue; + + /* + * If it's a strict equi-join a NULL partition key on one side will + * not join a NULL partition key on the other side. So, rows with NULL + * partition key from a partition on one side can not join with those + * from a non-matching partition on the other side. So, search the + * nullable partition keys as well. + */ + Assert(rel->nullable_partexprs); + foreach(lc, rel->nullable_partexprs[cnt]) + { + if (equal(lfirst(lc), expr)) + return cnt; + } + } + + return -1; +} diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index 48589d94..45880a2f 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -301,7 +301,8 @@ static Plan *prepare_sort_from_pathkeys(Plan *lefttree, List *pathkeys, static EquivalenceMember *find_ec_member_for_tle(EquivalenceClass *ec, TargetEntry *tle, Relids relids); -static Sort *make_sort_from_pathkeys(Plan *lefttree, List *pathkeys); +static Sort *make_sort_from_pathkeys(Plan *lefttree, List *pathkeys, + Relids relids); static Sort *make_sort_from_groupcols(List *groupcls, AttrNumber *grpColIdx, Plan *lefttree); @@ -2193,7 +2194,7 @@ create_sort_plan(PlannerInfo *root, SortPath *best_path, int flags) subplan = create_plan_recurse(root, best_path->subpath, flags | CP_SMALL_TLIST); - plan = make_sort_from_pathkeys(subplan, best_path->path.pathkeys); + plan = make_sort_from_pathkeys(subplan, best_path->path.pathkeys, NULL); copy_generic_path_info(&plan->plan, (Path *) best_path); @@ -4849,6 +4850,8 @@ create_mergejoin_plan(PlannerInfo *root, ListCell *lc; ListCell *lop; ListCell *lip; + Path *outer_path = best_path->jpath.outerjoinpath; + Path *inner_path = best_path->jpath.innerjoinpath; #ifdef __TBASE__ bool reset = false; @@ -4920,8 +4923,10 @@ create_mergejoin_plan(PlannerInfo *root, */ if (best_path->outersortkeys) { + Relids outer_relids = outer_path->parent->relids; Sort *sort = make_sort_from_pathkeys(outer_plan, - best_path->outersortkeys); + best_path->outersortkeys, + outer_relids); label_sort_with_costsize(root, sort, -1.0); outer_plan = (Plan *) sort; @@ -4932,8 +4937,10 @@ create_mergejoin_plan(PlannerInfo *root, if (best_path->innersortkeys) { + Relids inner_relids = inner_path->parent->relids; Sort *sort = make_sort_from_pathkeys(inner_plan, - best_path->innersortkeys); + best_path->innersortkeys, + inner_relids); label_sort_with_costsize(root, sort, -1.0); inner_plan = (Plan *) sort; @@ -7478,8 +7485,9 @@ add_sort_column(AttrNumber colIdx, Oid sortOp, Oid coll, bool nulls_first, * the output parameters *p_numsortkeys etc. 
* * When looking for matches to an EquivalenceClass's members, we will only - * consider child EC members if they match 'relids'. This protects against - * possible incorrect matches to child expressions that contain no Vars. + * consider child EC members if they belong to given 'relids'. This protects + * against possible incorrect matches to child expressions that contain no + * Vars. * * If reqColIdx isn't NULL then it contains sort key column numbers that * we should match. This is used when making child plans for a MergeAppend; @@ -7634,11 +7642,11 @@ prepare_sort_from_pathkeys(Plan *lefttree, List *pathkeys, continue; /* - * Ignore child members unless they match the rel being + * Ignore child members unless they belong to the rel being * sorted. */ if (em->em_is_child && - !bms_equal(em->em_relids, relids)) + !bms_is_subset(em->em_relids, relids)) continue; sortexpr = em->em_expr; @@ -7730,7 +7738,7 @@ prepare_sort_from_pathkeys(Plan *lefttree, List *pathkeys, * find_ec_member_for_tle * Locate an EquivalenceClass member matching the given TLE, if any * - * Child EC members are ignored unless they match 'relids'. + * Child EC members are ignored unless they belong to given 'relids'. */ static EquivalenceMember * find_ec_member_for_tle(EquivalenceClass *ec, @@ -7758,10 +7766,10 @@ find_ec_member_for_tle(EquivalenceClass *ec, continue; /* - * Ignore child members unless they match the rel being sorted. + * Ignore child members unless they belong to the rel being sorted. */ if (em->em_is_child && - !bms_equal(em->em_relids, relids)) + !bms_is_subset(em->em_relids, relids)) continue; /* Match if same expression (after stripping relabel) */ @@ -7782,9 +7790,10 @@ find_ec_member_for_tle(EquivalenceClass *ec, * * 'lefttree' is the node which yields input tuples * 'pathkeys' is the list of pathkeys by which the result is to be sorted + * 'relids' is the set of relations required by prepare_sort_from_pathkeys() */ static Sort * -make_sort_from_pathkeys(Plan *lefttree, List *pathkeys) +make_sort_from_pathkeys(Plan *lefttree, List *pathkeys, Relids relids) { int numsortkeys; AttrNumber *sortColIdx; @@ -7794,7 +7803,7 @@ make_sort_from_pathkeys(Plan *lefttree, List *pathkeys) /* Compute sort column info, and adjust lefttree as needed */ lefttree = prepare_sort_from_pathkeys(lefttree, pathkeys, - NULL, + relids, NULL, false, &numsortkeys, diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index 11454dfb..498b1d5e 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -7866,6 +7866,29 @@ get_partitioned_child_rels(PlannerInfo *root, Index rti) } +/* + * get_partitioned_child_rels_for_join + * Build and return a list containing the RTI of every partitioned + * relation which is a child of some rel included in the join. 
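+ *
+ * For illustration: if a hypothetical join covers two partitioned
+ * parents with RT indexes 1 and 3, whose PartitionedChildRelInfo
+ * entries list partitioned-table RTIs (1, 5, 6) and (3, 8), the result
+ * is the concatenation (1, 5, 6, 3, 8).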
+ */ +List * +get_partitioned_child_rels_for_join(PlannerInfo *root, Relids join_relids) +{ + List *result = NIL; + ListCell *l; + + foreach(l, root->pcinfo_list) + { + PartitionedChildRelInfo *pc = lfirst(l); + + if (bms_is_member(pc->parent_relid, join_relids)) + result = list_concat(result, list_copy(pc->child_rels)); + } + + return result; +} + + static bool groupingsets_distribution_match(PlannerInfo *root, Query *parse, Path *path) { diff --git a/src/backend/optimizer/plan/setrefs.c b/src/backend/optimizer/plan/setrefs.c index e5470fa8..805585b7 100644 --- a/src/backend/optimizer/plan/setrefs.c +++ b/src/backend/optimizer/plan/setrefs.c @@ -45,6 +45,9 @@ typedef struct int num_vars; /* number of plain Var tlist entries */ bool has_ph_vars; /* are there PlaceHolderVar entries? */ bool has_non_vars; /* are there other entries? */ + bool has_conv_whole_rows; /* are there ConvertRowtypeExpr + * entries encapsulating a whole-row + * Var? */ tlist_vinfo vars[FLEXIBLE_ARRAY_MEMBER]; /* has num_vars entries */ } indexed_tlist; @@ -151,6 +154,7 @@ static List *set_returning_clause_references(PlannerInfo *root, int rtoffset); static bool extract_query_dependencies_walker(Node *node, PlannerInfo *context); +static bool is_converted_whole_row_reference(Node *node); #ifdef XCP static void set_remotesubplan_references(PlannerInfo *root, Plan *plan, int rtoffset); @@ -2010,6 +2014,7 @@ build_tlist_index(List *tlist) itlist->tlist = tlist; itlist->has_ph_vars = false; itlist->has_non_vars = false; + itlist->has_conv_whole_rows = false; /* Find the Vars and fill in the index array */ vinfo = itlist->vars; @@ -2028,6 +2033,8 @@ build_tlist_index(List *tlist) } else if (tle->expr && IsA(tle->expr, PlaceHolderVar)) itlist->has_ph_vars = true; + else if (is_converted_whole_row_reference((Node *) tle->expr)) + itlist->has_conv_whole_rows = true; else itlist->has_non_vars = true; } @@ -2043,7 +2050,10 @@ build_tlist_index(List *tlist) * This is like build_tlist_index, but we only index tlist entries that * are Vars belonging to some rel other than the one specified. We will set * has_ph_vars (allowing PlaceHolderVars to be matched), but not has_non_vars - * (so nothing other than Vars and PlaceHolderVars can be matched). + * (so nothing other than Vars and PlaceHolderVars can be matched). In case of + * DML, where this function will be used, returning lists from child relations + * will be appended similar to a simple append relation. That does not require + * fixing ConvertRowtypeExpr references. So, those are not considered here. 
*/ static indexed_tlist * build_tlist_index_other_vars(List *tlist, Index ignore_rel) @@ -2060,6 +2070,7 @@ build_tlist_index_other_vars(List *tlist, Index ignore_rel) itlist->tlist = tlist; itlist->has_ph_vars = false; itlist->has_non_vars = false; + itlist->has_conv_whole_rows = false; /* Find the desired Vars and fill in the index array */ vinfo = itlist->vars; @@ -2263,6 +2274,7 @@ static Node * fix_join_expr_mutator(Node *node, fix_join_expr_context *context) {// #lizard forgives Var *newvar; + bool converted_whole_row; if (node == NULL) return NULL; @@ -2332,8 +2344,12 @@ fix_join_expr_mutator(Node *node, fix_join_expr_context *context) } if (IsA(node, Param)) return fix_param_node(context->root, (Param *) node); + /* Try matching more complex expressions too, if tlists have any */ - if (context->outer_itlist && context->outer_itlist->has_non_vars) + converted_whole_row = is_converted_whole_row_reference(node); + if (context->outer_itlist && + (context->outer_itlist->has_non_vars || + (context->outer_itlist->has_conv_whole_rows && converted_whole_row))) { newvar = search_indexed_tlist_for_non_var((Expr *) node, context->outer_itlist, @@ -2341,7 +2357,9 @@ fix_join_expr_mutator(Node *node, fix_join_expr_context *context) if (newvar) return (Node *) newvar; } - if (context->inner_itlist && context->inner_itlist->has_non_vars) + if (context->inner_itlist && + (context->inner_itlist->has_non_vars || + (context->inner_itlist->has_conv_whole_rows && converted_whole_row))) { newvar = search_indexed_tlist_for_non_var((Expr *) node, context->inner_itlist, @@ -2461,7 +2479,9 @@ fix_upper_expr_mutator(Node *node, fix_upper_expr_context *context) /* If no match, just fall through to process it normally */ } /* Try matching more complex expressions too, if tlist has any */ - if (context->subplan_itlist->has_non_vars) + if (context->subplan_itlist->has_non_vars || + (context->subplan_itlist->has_conv_whole_rows && + is_converted_whole_row_reference(node))) { newvar = search_indexed_tlist_for_non_var((Expr *) node, context->subplan_itlist, @@ -2669,6 +2689,37 @@ extract_query_dependencies_walker(Node *node, PlannerInfo *context) (void *) context); } + +/* + * is_converted_whole_row_reference + * If the given node is a ConvertRowtypeExpr encapsulating a whole-row + * reference as implicit cast, return true. Otherwise return false. + */ +static bool +is_converted_whole_row_reference(Node *node) +{ + ConvertRowtypeExpr *convexpr; + + if (!node || !IsA(node, ConvertRowtypeExpr)) + return false; + + /* Traverse nested ConvertRowtypeExpr's. */ + convexpr = castNode(ConvertRowtypeExpr, node); + while (convexpr->convertformat == COERCE_IMPLICIT_CAST && + IsA(convexpr->arg, ConvertRowtypeExpr)) + convexpr = castNode(ConvertRowtypeExpr, convexpr->arg); + + if (IsA(convexpr->arg, Var)) + { + Var *var = castNode(Var, convexpr->arg); + + if (var->varattno == 0) + return true; + } + + return false; +} + #ifdef XCP /* * set_remotesubplan_references diff --git a/src/backend/optimizer/prep/prepunion.c b/src/backend/optimizer/prep/prepunion.c index 6057868c..1fe5a341 100644 --- a/src/backend/optimizer/prep/prepunion.c +++ b/src/backend/optimizer/prep/prepunion.c @@ -2281,6 +2281,59 @@ adjust_relid_set(Relids relids, Index oldrelid, Index newrelid) return relids; } +/* + * Replace any relid present in top_parent_relids with its child in + * child_relids. Members of child_relids can be multiple levels below top + * parent in the partition hierarchy. 
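+ *
+ * For illustration, take a hypothetical hierarchy in which relid 2 is
+ * the top parent, 5 its child and 9 a grandchild, with child_relids =
+ * {9}: the relids set {2, 7} is first translated to {5, 7} by the
+ * recursive call for the intermediate parent and then to {9, 7} below.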
+ */ +Relids +adjust_child_relids_multilevel(PlannerInfo *root, Relids relids, + Relids child_relids, Relids top_parent_relids) +{ + AppendRelInfo **appinfos; + int nappinfos; + Relids parent_relids = NULL; + Relids result; + Relids tmp_result = NULL; + int cnt; + + /* + * If the given relids set doesn't contain any of the top parent relids, + * it will remain unchanged. + */ + if (!bms_overlap(relids, top_parent_relids)) + return relids; + + appinfos = find_appinfos_by_relids(root, child_relids, &nappinfos); + + /* Construct relids set for the immediate parent of the given child. */ + for (cnt = 0; cnt < nappinfos; cnt++) + { + AppendRelInfo *appinfo = appinfos[cnt]; + + parent_relids = bms_add_member(parent_relids, appinfo->parent_relid); + } + + /* Recurse if immediate parent is not the top parent. */ + if (!bms_equal(parent_relids, top_parent_relids)) + { + tmp_result = adjust_child_relids_multilevel(root, relids, + parent_relids, + top_parent_relids); + relids = tmp_result; + } + + result = adjust_child_relids(relids, nappinfos, appinfos); + + /* Free memory consumed by any intermediate result. */ + if (tmp_result) + bms_free(tmp_result); + bms_free(parent_relids); + pfree(appinfos); + + return result; +} + /* * Adjust the targetlist entries of an inherited UPDATE operation * @@ -2400,3 +2453,46 @@ adjust_appendrel_attrs_multilevel(PlannerInfo *root, Node *node, /* Now translate for this child */ return adjust_appendrel_attrs(root, node, appinfo); } + +/* + * Construct the SpecialJoinInfo for a child-join by translating + * SpecialJoinInfo for the join between parents. left_relids and right_relids + * are the relids of left and right side of the join respectively. + */ +SpecialJoinInfo * +build_child_join_sjinfo(PlannerInfo *root, SpecialJoinInfo *parent_sjinfo, + Relids left_relids, Relids right_relids) +{ + SpecialJoinInfo *sjinfo = makeNode(SpecialJoinInfo); + AppendRelInfo **left_appinfos; + int left_nappinfos; + AppendRelInfo **right_appinfos; + int right_nappinfos; + + memcpy(sjinfo, parent_sjinfo, sizeof(SpecialJoinInfo)); + left_appinfos = find_appinfos_by_relids(root, left_relids, + &left_nappinfos); + right_appinfos = find_appinfos_by_relids(root, right_relids, + &right_nappinfos); + + sjinfo->min_lefthand = adjust_child_relids(sjinfo->min_lefthand, + left_nappinfos, left_appinfos); + sjinfo->min_righthand = adjust_child_relids(sjinfo->min_righthand, + right_nappinfos, + right_appinfos); + sjinfo->syn_lefthand = adjust_child_relids(sjinfo->syn_lefthand, + left_nappinfos, left_appinfos); + sjinfo->syn_righthand = adjust_child_relids(sjinfo->syn_righthand, + right_nappinfos, + right_appinfos); + sjinfo->semi_rhs_exprs = (List *) adjust_appendrel_attrs(root, + (Node *) sjinfo->semi_rhs_exprs, + right_nappinfos, + right_appinfos); + + pfree(left_appinfos); + pfree(right_appinfos); + + return sjinfo; +} + diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index 49e6658f..c2d27db7 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -19,15 +19,20 @@ #include "miscadmin.h" #include "nodes/nodeFuncs.h" +#include "nodes/extensible.h" #include "optimizer/clauses.h" #include "optimizer/cost.h" #include "optimizer/pathnode.h" #include "optimizer/paths.h" #include "optimizer/planmain.h" +#include "optimizer/prep.h" #include "optimizer/restrictinfo.h" +#include "optimizer/tlist.h" #include "optimizer/var.h" #include "parser/parsetree.h" +#include "foreign/fdwapi.h" #include "utils/lsyscache.h" +#include 
"utils/memutils.h" #include "utils/selfuncs.h" #ifdef XCP #include "access/heapam.h" @@ -82,6 +87,10 @@ typedef enum #define STD_FUZZ_FACTOR 1.01 static List *translate_sub_tlist(List *tlist, int relid); +static List *reparameterize_pathlist_by_child(PlannerInfo *root, + List *pathlist, + RelOptInfo *child_rel); + #ifdef XCP static void restrict_distribution(PlannerInfo *root, RestrictInfo *ri, Path *pathnode); @@ -6924,6 +6933,361 @@ reparameterize_path(PlannerInfo *root, Path *path, return NULL; } +/* + * reparameterize_path_by_child + * Given a path parameterized by the parent of the given child relation, + * translate the path to be parameterized by the given child relation. + * + * The function creates a new path of the same type as the given path, but + * parameterized by the given child relation. Most fields from the original + * path can simply be flat-copied, but any expressions must be adjusted to + * refer to the correct varnos, and any paths must be recursively + * reparameterized. Other fields that refer to specific relids also need + * adjustment. + * + * The cost, number of rows, width and parallel path properties depend upon + * path->parent, which does not change during the translation. Hence those + * members are copied as they are. + * + * If the given path can not be reparameterized, the function returns NULL. + */ +Path * +reparameterize_path_by_child(PlannerInfo *root, Path *path, + RelOptInfo *child_rel) +{ + +#define FLAT_COPY_PATH(newnode, node, nodetype) \ + ( (newnode) = makeNode(nodetype), \ + memcpy((newnode), (node), sizeof(nodetype)) ) + +#define ADJUST_CHILD_ATTRS(node) \ + ((node) = \ + (List *) adjust_appendrel_attrs_multilevel(root, (Node *) (node), \ + child_rel->relids, \ + child_rel->top_parent_relids)) + +#define REPARAMETERIZE_CHILD_PATH(path) \ +do { \ + (path) = reparameterize_path_by_child(root, (path), child_rel); \ + if ((path) == NULL) \ + return NULL; \ +} while(0); + +#define REPARAMETERIZE_CHILD_PATH_LIST(pathlist) \ +do { \ + if ((pathlist) != NIL) \ + { \ + (pathlist) = reparameterize_pathlist_by_child(root, (pathlist), \ + child_rel); \ + if ((pathlist) == NIL) \ + return NULL; \ + } \ +} while(0); + + Path *new_path; + ParamPathInfo *new_ppi; + ParamPathInfo *old_ppi; + Relids required_outer; + + /* + * If the path is not parameterized by parent of the given relation, it + * doesn't need reparameterization. + */ + if (!path->param_info || + !bms_overlap(PATH_REQ_OUTER(path), child_rel->top_parent_relids)) + return path; + + /* Reparameterize a copy of given path. 
*/ + switch (nodeTag(path)) + { + case T_Path: + FLAT_COPY_PATH(new_path, path, Path); + break; + + case T_IndexPath: + { + IndexPath *ipath; + + FLAT_COPY_PATH(ipath, path, IndexPath); + ADJUST_CHILD_ATTRS(ipath->indexclauses); + ADJUST_CHILD_ATTRS(ipath->indexquals); + new_path = (Path *) ipath; + } + break; + + case T_BitmapHeapPath: + { + BitmapHeapPath *bhpath; + + FLAT_COPY_PATH(bhpath, path, BitmapHeapPath); + REPARAMETERIZE_CHILD_PATH(bhpath->bitmapqual); + new_path = (Path *) bhpath; + } + break; + + case T_BitmapAndPath: + { + BitmapAndPath *bapath; + + FLAT_COPY_PATH(bapath, path, BitmapAndPath); + REPARAMETERIZE_CHILD_PATH_LIST(bapath->bitmapquals); + new_path = (Path *) bapath; + } + break; + + case T_BitmapOrPath: + { + BitmapOrPath *bopath; + + FLAT_COPY_PATH(bopath, path, BitmapOrPath); + REPARAMETERIZE_CHILD_PATH_LIST(bopath->bitmapquals); + new_path = (Path *) bopath; + } + break; + + case T_TidPath: + { + TidPath *tpath; + + /* + * TidPath contains tidquals, which do not contain any + * external parameters per create_tidscan_path(). So don't + * bother to translate those. + */ + FLAT_COPY_PATH(tpath, path, TidPath); + new_path = (Path *) tpath; + } + break; + + case T_ForeignPath: + { + ForeignPath *fpath; + ReparameterizeForeignPathByChild_function rfpc_func; + + FLAT_COPY_PATH(fpath, path, ForeignPath); + if (fpath->fdw_outerpath) + REPARAMETERIZE_CHILD_PATH(fpath->fdw_outerpath); + + /* Hand over to FDW if needed. */ + rfpc_func = + path->parent->fdwroutine->ReparameterizeForeignPathByChild; + if (rfpc_func) + fpath->fdw_private = rfpc_func(root, fpath->fdw_private, + child_rel); + new_path = (Path *) fpath; + } + break; + + case T_CustomPath: + { + CustomPath *cpath; + + FLAT_COPY_PATH(cpath, path, CustomPath); + REPARAMETERIZE_CHILD_PATH_LIST(cpath->custom_paths); + if (cpath->methods && + cpath->methods->ReparameterizeCustomPathByChild) + cpath->custom_private = + cpath->methods->ReparameterizeCustomPathByChild(root, + cpath->custom_private, + child_rel); + new_path = (Path *) cpath; + } + break; + + case T_NestPath: + { + JoinPath *jpath; + + FLAT_COPY_PATH(jpath, path, NestPath); + + REPARAMETERIZE_CHILD_PATH(jpath->outerjoinpath); + REPARAMETERIZE_CHILD_PATH(jpath->innerjoinpath); + ADJUST_CHILD_ATTRS(jpath->joinrestrictinfo); + new_path = (Path *) jpath; + } + break; + + case T_MergePath: + { + JoinPath *jpath; + MergePath *mpath; + + FLAT_COPY_PATH(mpath, path, MergePath); + + jpath = (JoinPath *) mpath; + REPARAMETERIZE_CHILD_PATH(jpath->outerjoinpath); + REPARAMETERIZE_CHILD_PATH(jpath->innerjoinpath); + ADJUST_CHILD_ATTRS(jpath->joinrestrictinfo); + ADJUST_CHILD_ATTRS(mpath->path_mergeclauses); + new_path = (Path *) mpath; + } + break; + + case T_HashPath: + { + JoinPath *jpath; + HashPath *hpath; + + FLAT_COPY_PATH(hpath, path, HashPath); + + jpath = (JoinPath *) hpath; + REPARAMETERIZE_CHILD_PATH(jpath->outerjoinpath); + REPARAMETERIZE_CHILD_PATH(jpath->innerjoinpath); + ADJUST_CHILD_ATTRS(jpath->joinrestrictinfo); + ADJUST_CHILD_ATTRS(hpath->path_hashclauses); + new_path = (Path *) hpath; + } + break; + + case T_AppendPath: + { + AppendPath *apath; + + FLAT_COPY_PATH(apath, path, AppendPath); + REPARAMETERIZE_CHILD_PATH_LIST(apath->subpaths); + new_path = (Path *) apath; + } + break; + + case T_MergeAppend: + { + MergeAppendPath *mapath; + + FLAT_COPY_PATH(mapath, path, MergeAppendPath); + REPARAMETERIZE_CHILD_PATH_LIST(mapath->subpaths); + new_path = (Path *) mapath; + } + break; + + case T_MaterialPath: + { + MaterialPath *mpath; + + 
FLAT_COPY_PATH(mpath, path, MaterialPath); + REPARAMETERIZE_CHILD_PATH(mpath->subpath); + new_path = (Path *) mpath; + } + break; + + case T_UniquePath: + { + UniquePath *upath; + + FLAT_COPY_PATH(upath, path, UniquePath); + REPARAMETERIZE_CHILD_PATH(upath->subpath); + ADJUST_CHILD_ATTRS(upath->uniq_exprs); + new_path = (Path *) upath; + } + break; + + case T_GatherPath: + { + GatherPath *gpath; + + FLAT_COPY_PATH(gpath, path, GatherPath); + REPARAMETERIZE_CHILD_PATH(gpath->subpath); + new_path = (Path *) gpath; + } + break; + + case T_GatherMergePath: + { + GatherMergePath *gmpath; + + FLAT_COPY_PATH(gmpath, path, GatherMergePath); + REPARAMETERIZE_CHILD_PATH(gmpath->subpath); + new_path = (Path *) gmpath; + } + break; + + default: + + /* We don't know how to reparameterize this path. */ + return NULL; + } + + /* + * Adjust the parameterization information, which refers to the topmost + * parent. The topmost parent can be multiple levels away from the given + * child, hence use multi-level expression adjustment routines. + */ + old_ppi = new_path->param_info; + required_outer = + adjust_child_relids_multilevel(root, old_ppi->ppi_req_outer, + child_rel->relids, + child_rel->top_parent_relids); + + /* If we already have a PPI for this parameterization, just return it */ + new_ppi = find_param_path_info(new_path->parent, required_outer); + + /* + * If not, build a new one and link it to the list of PPIs. For the same + * reason as explained in mark_dummy_rel(), allocate new PPI in the same + * context the given RelOptInfo is in. + */ + if (new_ppi == NULL) + { + MemoryContext oldcontext; + RelOptInfo *rel = path->parent; + + oldcontext = MemoryContextSwitchTo(GetMemoryChunkContext(rel)); + + new_ppi = makeNode(ParamPathInfo); + new_ppi->ppi_req_outer = bms_copy(required_outer); + new_ppi->ppi_rows = old_ppi->ppi_rows; + new_ppi->ppi_clauses = old_ppi->ppi_clauses; + ADJUST_CHILD_ATTRS(new_ppi->ppi_clauses); + rel->ppilist = lappend(rel->ppilist, new_ppi); + + MemoryContextSwitchTo(oldcontext); + } + bms_free(required_outer); + + new_path->param_info = new_ppi; + + /* + * Adjust the path target if the parent of the outer relation is + * referenced in the targetlist. This can happen when only the parent of + * outer relation is laterally referenced in this relation. + */ + if (bms_overlap(path->parent->lateral_relids, + child_rel->top_parent_relids)) + { + new_path->pathtarget = copy_pathtarget(new_path->pathtarget); + ADJUST_CHILD_ATTRS(new_path->pathtarget->exprs); + } + + return new_path; +} + +/* + * reparameterize_pathlist_by_child + * Helper function to reparameterize a list of paths by given child rel. 
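+ *
+ * Returns NIL, freeing any partially built list, if any path in the
+ * given list cannot be reparameterized.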
+ */ +static List * +reparameterize_pathlist_by_child(PlannerInfo *root, + List *pathlist, + RelOptInfo *child_rel) +{ + ListCell *lc; + List *result = NIL; + + foreach(lc, pathlist) + { + Path *path = reparameterize_path_by_child(root, lfirst(lc), + child_rel); + if (path == NULL) + { + list_free(result); + return NIL; + } + + result = lappend(result, path); + } + + return result; +} + #ifdef __TBASE__ /* * Count datanode number for given path, consider replication table as 1 diff --git a/src/backend/optimizer/util/placeholder.c b/src/backend/optimizer/util/placeholder.c index 9b29be4c..0d5351a6 100644 --- a/src/backend/optimizer/util/placeholder.c +++ b/src/backend/optimizer/util/placeholder.c @@ -20,6 +20,7 @@ #include "optimizer/pathnode.h" #include "optimizer/placeholder.h" #include "optimizer/planmain.h" +#include "optimizer/prep.h" #include "optimizer/var.h" #include "utils/lsyscache.h" @@ -414,6 +415,10 @@ add_placeholders_to_joinrel(PlannerInfo *root, RelOptInfo *joinrel, Relids relids = joinrel->relids; ListCell *lc; + /* This function is called only on the parent relations. */ + Assert(!IS_OTHER_REL(joinrel) && !IS_OTHER_REL(outer_rel) && + !IS_OTHER_REL(inner_rel)); + foreach(lc, root->placeholder_list) { PlaceHolderInfo *phinfo = (PlaceHolderInfo *) lfirst(lc); @@ -459,3 +464,56 @@ add_placeholders_to_joinrel(PlannerInfo *root, RelOptInfo *joinrel, } } } + +/* + * add_placeholders_to_child_joinrel + * Translate the PHVs in parent's targetlist and add them to the child's + * targetlist. Also adjust the cost + */ +void +add_placeholders_to_child_joinrel(PlannerInfo *root, RelOptInfo *childrel, + RelOptInfo *parentrel) +{ + ListCell *lc; + AppendRelInfo **appinfos; + int nappinfos; + + Assert(IS_JOIN_REL(childrel) && IS_JOIN_REL(parentrel)); + Assert(IS_OTHER_REL(childrel)); + + /* Nothing to do if no PHVs. */ + if (root->placeholder_list == NIL) + return; + + appinfos = find_appinfos_by_relids(root, childrel->relids, &nappinfos); + foreach(lc, parentrel->reltarget->exprs) + { + PlaceHolderVar *phv = lfirst(lc); + + if (IsA(phv, PlaceHolderVar)) + { + /* + * In case the placeholder Var refers to any of the parent + * relations, translate it to refer to the corresponding child. + */ + if (bms_overlap(phv->phrels, parentrel->relids) && + childrel->reloptkind == RELOPT_OTHER_JOINREL) + { + phv = (PlaceHolderVar *) adjust_appendrel_attrs(root, + (Node *) phv, + nappinfos, + appinfos); + } + + childrel->reltarget->exprs = lappend(childrel->reltarget->exprs, + phv); + } + } + + /* Adjust the cost and width of child targetlist. 
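+ *
+ * Translating the PHVs only substitutes child Vars for parent Vars, so
+ * the parent's startup cost, per-tuple cost and width are assumed to
+ * carry over to the child unchanged.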
*/ + childrel->reltarget->cost.startup = parentrel->reltarget->cost.startup; + childrel->reltarget->cost.per_tuple = parentrel->reltarget->cost.per_tuple; + childrel->reltarget->width = parentrel->reltarget->width; + + pfree(appinfos); +} diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c index 23938198..55ea9c8c 100644 --- a/src/backend/optimizer/util/plancat.c +++ b/src/backend/optimizer/util/plancat.c @@ -82,7 +82,8 @@ static List *get_relation_statistics(RelOptInfo *rel, Relation relation); static void set_relation_partition_info(PlannerInfo *root, RelOptInfo *rel, Relation relation); static PartitionScheme find_partition_scheme(PlannerInfo *root, Relation rel); -static List **build_baserel_partition_key_exprs(Relation relation, Index varno); +static void set_baserel_partition_key_exprs(Relation relation, + RelOptInfo *rel); #ifdef __TBASE__ static BlockNumber GetIntervalPartitionPages(Relation rel, bool isindex, bool statistic); @@ -1985,7 +1986,7 @@ set_relation_partition_info(PlannerInfo *root, RelOptInfo *rel, Assert(partdesc != NULL && rel->part_scheme != NULL); rel->boundinfo = partdesc->boundinfo; rel->nparts = partdesc->nparts; - rel->partexprs = build_baserel_partition_key_exprs(relation, rel->relid); + set_baserel_partition_key_exprs(relation, rel); } /* @@ -2060,21 +2061,24 @@ find_partition_scheme(PlannerInfo *root, Relation relation) } /* - * build_baserel_partition_key_exprs + * set_baserel_partition_key_exprs * - * Collects partition key expressions for a given base relation. Any single - * column partition keys are converted to Var nodes. All Var nodes are set - * to the given varno. The partition key expressions are returned as an array - * of single element lists to be stored in RelOptInfo of the base relation. + * Builds partition key expressions for the given base relation and sets them + * in given RelOptInfo. Any single column partition keys are converted to Var + * nodes. All Var nodes are restamped with the relid of given relation. */ -static List ** -build_baserel_partition_key_exprs(Relation relation, Index varno) +static void +set_baserel_partition_key_exprs(Relation relation, + RelOptInfo *rel) { PartitionKey partkey = RelationGetPartitionKey(relation); int partnatts; int cnt; List **partexprs; ListCell *lc; + Index varno = rel->relid; + + Assert(IS_SIMPLE_REL(rel) && rel->relid > 0); /* A partitioned table should have a partition key. */ Assert(partkey != NULL); @@ -2112,7 +2116,15 @@ build_baserel_partition_key_exprs(Relation relation, Index varno) partexprs[cnt] = list_make1(partexpr); } - return partexprs; + rel->partexprs = partexprs; + + /* + * A base relation can not have nullable partition key expressions. We + * still allocate array of empty expressions lists to keep partition key + * expression handling code simple. See build_joinrel_partition_info() and + * match_expr_to_partition_keys(). 
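+ *
+ * For illustration, for a hypothetical table partitioned by
+ * RANGE (a, (b + 0)), partexprs[0] holds a Var for column a and
+ * partexprs[1] holds the expression b + 0, each as a single-element
+ * list, while every nullable_partexprs[] entry stays NIL.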
+ */ + rel->nullable_partexprs = (List **) palloc0(sizeof(List *) * partnatts); } #ifdef __TBASE__ diff --git a/src/backend/optimizer/util/relnode.c b/src/backend/optimizer/util/relnode.c index 0ada588b..0896b4c2 100644 --- a/src/backend/optimizer/util/relnode.c +++ b/src/backend/optimizer/util/relnode.c @@ -17,12 +17,14 @@ #include #include "miscadmin.h" +#include "catalog/partition.h" #include "optimizer/clauses.h" #include "optimizer/cost.h" #include "optimizer/pathnode.h" #include "optimizer/paths.h" #include "optimizer/placeholder.h" #include "optimizer/plancat.h" +#include "optimizer/prep.h" #include "optimizer/restrictinfo.h" #include "optimizer/tlist.h" #include "utils/hsearch.h" @@ -61,6 +63,9 @@ static List *subbuild_joinrel_joinlist(RelOptInfo *joinrel, static void set_foreign_rel_properties(RelOptInfo *joinrel, RelOptInfo *outer_rel, RelOptInfo *inner_rel); static void add_join_rel(PlannerInfo *root, RelOptInfo *joinrel); +static void build_joinrel_partition_info(RelOptInfo *joinrel, + RelOptInfo *outer_rel, RelOptInfo *inner_rel, + List *restrictlist, JoinType jointype); /* @@ -160,6 +165,7 @@ build_simple_rel(PlannerInfo *root, int relid, RelOptInfo *parent) rel->boundinfo = NULL; rel->part_rels = NULL; rel->partexprs = NULL; + rel->nullable_partexprs = NULL; #ifdef __TBASE__ rel->intervalparent = false; rel->isdefault = rte->isdefault; @@ -536,6 +542,9 @@ build_join_rel(PlannerInfo *root, RelOptInfo *joinrel; List *restrictlist; + /* This function should be used only for join between parents. */ + Assert(!IS_OTHER_REL(outer_rel) && !IS_OTHER_REL(inner_rel)); + /* * See if we already have a joinrel for this set of base rels. */ @@ -615,6 +624,7 @@ build_join_rel(PlannerInfo *root, joinrel->boundinfo = NULL; joinrel->part_rels = NULL; joinrel->partexprs = NULL; + joinrel->nullable_partexprs = NULL; #ifdef __TBASE__ joinrel->resultRelLoc = RESULT_REL_NONE; #endif @@ -663,6 +673,10 @@ build_join_rel(PlannerInfo *root, */ joinrel->has_eclass_joins = has_relevant_eclass_joinclause(root, joinrel); + /* Store the partition information. */ + build_joinrel_partition_info(joinrel, outer_rel, inner_rel, restrictlist, + sjinfo->jointype); + /* * Set estimates of the joinrel's size. */ @@ -708,6 +722,138 @@ build_join_rel(PlannerInfo *root, return joinrel; } +/* + * build_child_join_rel + * Builds RelOptInfo representing join between given two child relations. + * + * 'outer_rel' and 'inner_rel' are the RelOptInfos of child relations being + * joined + * 'parent_joinrel' is the RelOptInfo representing the join between parent + * relations. Some of the members of new RelOptInfo are produced by + * translating corresponding members of this RelOptInfo + * 'sjinfo': child-join context info + * 'restrictlist': list of RestrictInfo nodes that apply to this particular + * pair of joinable relations + * 'join_appinfos': list of AppendRelInfo nodes for base child relations + * involved in this join + */ +RelOptInfo * +build_child_join_rel(PlannerInfo *root, RelOptInfo *outer_rel, + RelOptInfo *inner_rel, RelOptInfo *parent_joinrel, + List *restrictlist, SpecialJoinInfo *sjinfo, + JoinType jointype) +{ + RelOptInfo *joinrel = makeNode(RelOptInfo); + AppendRelInfo **appinfos; + int nappinfos; + + /* Only joins between "other" relations land here. 
*/ + Assert(IS_OTHER_REL(outer_rel) && IS_OTHER_REL(inner_rel)); + + joinrel->reloptkind = RELOPT_OTHER_JOINREL; + joinrel->relids = bms_union(outer_rel->relids, inner_rel->relids); + joinrel->rows = 0; + /* cheap startup cost is interesting iff not all tuples to be retrieved */ + joinrel->consider_startup = (root->tuple_fraction > 0); + joinrel->consider_param_startup = false; + joinrel->consider_parallel = false; + joinrel->reltarget = create_empty_pathtarget(); + joinrel->pathlist = NIL; + joinrel->ppilist = NIL; + joinrel->partial_pathlist = NIL; + joinrel->cheapest_startup_path = NULL; + joinrel->cheapest_total_path = NULL; + joinrel->cheapest_unique_path = NULL; + joinrel->cheapest_parameterized_paths = NIL; + joinrel->direct_lateral_relids = NULL; + joinrel->lateral_relids = NULL; + joinrel->relid = 0; /* indicates not a baserel */ + joinrel->rtekind = RTE_JOIN; + joinrel->min_attr = 0; + joinrel->max_attr = 0; + joinrel->attr_needed = NULL; + joinrel->attr_widths = NULL; + joinrel->lateral_vars = NIL; + joinrel->lateral_referencers = NULL; + joinrel->indexlist = NIL; + joinrel->pages = 0; + joinrel->tuples = 0; + joinrel->allvisfrac = 0; + joinrel->subroot = NULL; + joinrel->subplan_params = NIL; + joinrel->serverid = InvalidOid; + joinrel->userid = InvalidOid; + joinrel->useridiscurrent = false; + joinrel->fdwroutine = NULL; + joinrel->fdw_private = NULL; + joinrel->baserestrictinfo = NIL; + joinrel->baserestrictcost.startup = 0; + joinrel->baserestrictcost.per_tuple = 0; + joinrel->joininfo = NIL; + joinrel->has_eclass_joins = false; + joinrel->top_parent_relids = NULL; + joinrel->part_scheme = NULL; + joinrel->part_rels = NULL; + joinrel->partexprs = NULL; + joinrel->nullable_partexprs = NULL; + + joinrel->top_parent_relids = bms_union(outer_rel->top_parent_relids, + inner_rel->top_parent_relids); + + /* Compute information relevant to foreign relations. */ + set_foreign_rel_properties(joinrel, outer_rel, inner_rel); + + /* Build targetlist */ + build_joinrel_tlist(root, joinrel, outer_rel); + build_joinrel_tlist(root, joinrel, inner_rel); + /* Add placeholder variables. */ + add_placeholders_to_child_joinrel(root, joinrel, parent_joinrel); + + /* Construct joininfo list. */ + appinfos = find_appinfos_by_relids(root, joinrel->relids, &nappinfos); + joinrel->joininfo = (List *) adjust_appendrel_attrs(root, + (Node *) parent_joinrel->joininfo, + nappinfos, + appinfos); + pfree(appinfos); + + /* + * Lateral relids referred in child join will be same as that referred in + * the parent relation. Throw any partial result computed while building + * the targetlist. + */ + bms_free(joinrel->direct_lateral_relids); + bms_free(joinrel->lateral_relids); + joinrel->direct_lateral_relids = (Relids) bms_copy(parent_joinrel->direct_lateral_relids); + joinrel->lateral_relids = (Relids) bms_copy(parent_joinrel->lateral_relids); + + /* + * If the parent joinrel has pending equivalence classes, so does the + * child. + */ + joinrel->has_eclass_joins = parent_joinrel->has_eclass_joins; + + /* Is the join between partitions itself partitioned? */ + build_joinrel_partition_info(joinrel, outer_rel, inner_rel, restrictlist, + jointype); + + /* Child joinrel is parallel safe if parent is parallel safe. */ + joinrel->consider_parallel = parent_joinrel->consider_parallel; + + + /* Set estimates of the child-joinrel's size. */ + set_joinrel_size_estimates(root, joinrel, outer_rel, inner_rel, + sjinfo, restrictlist); + + /* We build the join only once. 
*/ + Assert(!find_join_rel(root, joinrel->relids)); + + /* Add the relation to the PlannerInfo. */ + add_join_rel(root, joinrel); + + return joinrel; +} + /* * min_join_parameterization * @@ -763,9 +909,15 @@ static void build_joinrel_tlist(PlannerInfo *root, RelOptInfo *joinrel, RelOptInfo *input_rel) { - Relids relids = joinrel->relids; + Relids relids; ListCell *vars; + /* attrs_needed refers to parent relids and not those of a child. */ + if (joinrel->top_parent_relids) + relids = joinrel->top_parent_relids; + else + relids = joinrel->relids; + foreach(vars, input_rel->reltarget->exprs) { Var *var = (Var *) lfirst(vars); @@ -780,24 +932,55 @@ build_joinrel_tlist(PlannerInfo *root, RelOptInfo *joinrel, continue; /* - * Otherwise, anything in a baserel or joinrel targetlist ought to be - * a Var. (More general cases can only appear in appendrel child - * rels, which will never be seen here.) + * Otherwise, anything in a baserel or joinrel targetlist ought to be a + * Var. Children of a partitioned table may have ConvertRowtypeExpr + * translating whole-row Var of a child to that of the parent. Children + * of an inherited table or subquery child rels can not directly + * participate in a join, so other kinds of nodes here. */ - if (!IsA(var, Var)) + if (IsA(var, Var)) + { + baserel = find_base_rel(root, var->varno); + ndx = var->varattno - baserel->min_attr; + } + else if (IsA(var, ConvertRowtypeExpr)) + { + ConvertRowtypeExpr *child_expr = (ConvertRowtypeExpr *) var; + Var *childvar = (Var *) child_expr->arg; + + /* + * Child's whole-row references are converted to look like those + * of parent using ConvertRowtypeExpr. There can be as many + * ConvertRowtypeExpr decorations as the depth of partition tree. + * The argument to the deepest ConvertRowtypeExpr is expected to + * be a whole-row reference of the child. + */ + while (IsA(childvar, ConvertRowtypeExpr)) + { + child_expr = (ConvertRowtypeExpr *) childvar; + childvar = (Var *) child_expr->arg; + } + Assert(IsA(childvar, Var) && childvar->varattno == 0); + + baserel = find_base_rel(root, childvar->varno); + ndx = 0 - baserel->min_attr; + } + else elog(ERROR, "unexpected node type in rel targetlist: %d", (int) nodeTag(var)); - /* Get the Var's original base rel */ - baserel = find_base_rel(root, var->varno); - /* Is it still needed above this joinrel? */ - ndx = var->varattno - baserel->min_attr; + /* Is the target expression still needed above this joinrel? */ if (bms_nonempty_difference(baserel->attr_needed[ndx], relids)) { /* Yup, add it to the output */ joinrel->reltarget->exprs = lappend(joinrel->reltarget->exprs, var); - /* Vars have cost zero, so no need to adjust reltarget->cost */ + + /* + * Vars have cost zero, so no need to adjust reltarget->cost. Even + * if it's a ConvertRowtypeExpr, it will be computed only for the + * base relation, costing nothing for a join. + */ joinrel->reltarget->width += baserel->attr_widths[ndx]; } } @@ -900,6 +1083,9 @@ subbuild_joinrel_restrictlist(RelOptInfo *joinrel, { ListCell *l; + /* Expected to be called only for join between parent relations. */ + Assert(joinrel->reloptkind == RELOPT_JOINREL); + foreach(l, joininfo_list) { RestrictInfo *rinfo = (RestrictInfo *) lfirst(l); @@ -1457,3 +1643,165 @@ find_param_path_info(RelOptInfo *rel, Relids required_outer) return NULL; } + +/* + * build_joinrel_partition_info + * If the two relations have same partitioning scheme, their join may be + * partitioned and will follow the same partitioning scheme as the joining + * relations. 
Set the partition scheme and partition key expressions in + * the join relation. + */ +static void +build_joinrel_partition_info(RelOptInfo *joinrel, RelOptInfo *outer_rel, + RelOptInfo *inner_rel, List *restrictlist, + JoinType jointype) +{ + int partnatts; + int cnt; + PartitionScheme part_scheme; + + /* Nothing to do if partition-wise join technique is disabled. */ + if (!enable_partition_wise_join) + { + Assert(!IS_PARTITIONED_REL(joinrel)); + return; + } + + /* + * We can only consider this join as an input to further partition-wise + * joins if (a) the input relations are partitioned, (b) the partition + * schemes match, and (c) we can identify an equi-join between the + * partition keys. Note that if it were possible for + * have_partkey_equi_join to return different answers for the same joinrel + * depending on which join ordering we try first, this logic would break. + * That shouldn't happen, though, because of the way the query planner + * deduces implied equalities and reorders the joins. Please see + * optimizer/README for details. + */ + if (!IS_PARTITIONED_REL(outer_rel) || !IS_PARTITIONED_REL(inner_rel) || + outer_rel->part_scheme != inner_rel->part_scheme || + !have_partkey_equi_join(outer_rel, inner_rel, jointype, restrictlist)) + { + Assert(!IS_PARTITIONED_REL(joinrel)); + return; + } + + part_scheme = outer_rel->part_scheme; + + Assert(REL_HAS_ALL_PART_PROPS(outer_rel) && + REL_HAS_ALL_PART_PROPS(inner_rel)); + + /* + * For now, our partition matching algorithm can match partitions only + * when the partition bounds of the joining relations are exactly same. + * So, bail out otherwise. + */ + if (outer_rel->nparts != inner_rel->nparts || + !partition_bounds_equal(part_scheme->partnatts, + part_scheme->parttyplen, + part_scheme->parttypbyval, + outer_rel->boundinfo, inner_rel->boundinfo)) + { + Assert(!IS_PARTITIONED_REL(joinrel)); + return; + } + + /* + * This function will be called only once for each joinrel, hence it + * should not have partition scheme, partition bounds, partition key + * expressions and array for storing child relations set. + */ + Assert(!joinrel->part_scheme && !joinrel->partexprs && + !joinrel->nullable_partexprs && !joinrel->part_rels && + !joinrel->boundinfo); + + /* + * Join relation is partitioned using the same partitioning scheme as the + * joining relations and has same bounds. + */ + joinrel->part_scheme = part_scheme; + joinrel->boundinfo = outer_rel->boundinfo; + joinrel->nparts = outer_rel->nparts; + partnatts = joinrel->part_scheme->partnatts; + joinrel->partexprs = (List **) palloc0(sizeof(List *) * partnatts); + joinrel->nullable_partexprs = + (List **) palloc0(sizeof(List *) *partnatts); + + /* + * Construct partition keys for the join. + * + * An INNER join between two partitioned relations can be regarded as + * partitioned by either key expression. For example, A INNER JOIN B ON A.a = + * B.b can be regarded as partitioned on A.a or on B.b; they are equivalent. + * + * For a SEMI or ANTI join, the result can only be regarded as being + * partitioned in the same manner as the outer side, since the inner columns + * are not retained. + * + * An OUTER join like (A LEFT JOIN B ON A.a = B.b) may produce rows with + * B.b NULL. These rows may not fit the partitioning conditions imposed on + * B.b. Hence, strictly speaking, the join is not partitioned by B.b and + * thus partition keys of an OUTER join should include partition key + * expressions from the OUTER side only. 
However, because all + * commonly-used comparison operators are strict, the presence of nulls on + * the outer side doesn't cause any problem; they can't match anything at + * future join levels anyway. Therefore, we track two sets of expressions: + * those that authentically partition the relation (partexprs) and those + * that partition the relation with the exception that extra nulls may be + * present (nullable_partexprs). When the comparison operator is strict, + * the latter is just as good as the former. + */ + for (cnt = 0; cnt < partnatts; cnt++) + { + List *outer_expr; + List *outer_null_expr; + List *inner_expr; + List *inner_null_expr; + List *partexpr = NIL; + List *nullable_partexpr = NIL; + + outer_expr = list_copy(outer_rel->partexprs[cnt]); + outer_null_expr = list_copy(outer_rel->nullable_partexprs[cnt]); + inner_expr = list_copy(inner_rel->partexprs[cnt]); + inner_null_expr = list_copy(inner_rel->nullable_partexprs[cnt]); + + switch (jointype) + { + case JOIN_INNER: + partexpr = list_concat(outer_expr, inner_expr); + nullable_partexpr = list_concat(outer_null_expr, + inner_null_expr); + break; + + case JOIN_SEMI: + case JOIN_ANTI: + partexpr = outer_expr; + nullable_partexpr = outer_null_expr; + break; + + case JOIN_LEFT: + partexpr = outer_expr; + nullable_partexpr = list_concat(inner_expr, + outer_null_expr); + nullable_partexpr = list_concat(nullable_partexpr, + inner_null_expr); + break; + + case JOIN_FULL: + nullable_partexpr = list_concat(outer_expr, + inner_expr); + nullable_partexpr = list_concat(nullable_partexpr, + outer_null_expr); + nullable_partexpr = list_concat(nullable_partexpr, + inner_null_expr); + break; + + default: + elog(ERROR, "unrecognized join type: %d", (int) jointype); + + } + + joinrel->partexprs[cnt] = partexpr; + joinrel->nullable_partexprs[cnt] = nullable_partexpr; + } +} diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 8288cf36..e7ba54b0 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -1195,6 +1195,15 @@ static struct config_bool ConfigureNamesBool[] = true, NULL, NULL, NULL }, + { + {"enable_partition_wise_join", PGC_USERSET, QUERY_TUNING_METHOD, + gettext_noop("Enables partition-wise join."), + NULL + }, + &enable_partition_wise_join, + false, + NULL, NULL, NULL + }, { {"geqo", PGC_USERSET, QUERY_TUNING_GEQO, diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index c03c59df..5ef4e565 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -304,6 +304,7 @@ #enable_seqscan = on #enable_sort = on #enable_tidscan = on +#enable_partition_wise_join = off # - Planner Cost Constants - diff --git a/src/include/foreign/fdwapi.h b/src/include/foreign/fdwapi.h index ef0fbe6f..e188cba1 100644 --- a/src/include/foreign/fdwapi.h +++ b/src/include/foreign/fdwapi.h @@ -158,6 +158,9 @@ typedef void (*ShutdownForeignScan_function) (ForeignScanState *node); typedef bool (*IsForeignScanParallelSafe_function) (PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte); +typedef List *(*ReparameterizeForeignPathByChild_function) (PlannerInfo *root, + List *fdw_private, + RelOptInfo *child_rel); /* * FdwRoutine is the struct returned by a foreign-data wrapper's handler @@ -230,6 +233,9 @@ typedef struct FdwRoutine ReInitializeDSMForeignScan_function ReInitializeDSMForeignScan; InitializeWorkerForeignScan_function InitializeWorkerForeignScan; ShutdownForeignScan_function ShutdownForeignScan; 
+ + /* Support functions for path reparameterization. */ + ReparameterizeForeignPathByChild_function ReparameterizeForeignPathByChild; } FdwRoutine; diff --git a/src/include/nodes/extensible.h b/src/include/nodes/extensible.h index 0654e79c..c3436c7a 100644 --- a/src/include/nodes/extensible.h +++ b/src/include/nodes/extensible.h @@ -96,6 +96,9 @@ typedef struct CustomPathMethods List *tlist, List *clauses, List *custom_plans); + struct List *(*ReparameterizeCustomPathByChild) (PlannerInfo *root, + List *custom_private, + RelOptInfo *child_rel); } CustomPathMethods; /* diff --git a/src/include/nodes/relation.h b/src/include/nodes/relation.h index 31bdde3e..96258106 100644 --- a/src/include/nodes/relation.h +++ b/src/include/nodes/relation.h @@ -505,6 +505,11 @@ typedef struct PartitionSchemeData *PartitionScheme; * handling join alias Vars. Currently this is not needed because all join * alias Vars are expanded to non-aliased form during preprocess_expression. * + * We also have relations representing joins between child relations of + * different partitioned tables. These relations are not added to + * join_rel_level lists as they are not joined directly by the dynamic + * programming algorithm. + * * There is also a RelOptKind for "upper" relations, which are RelOptInfos * that describe post-scan/join processing steps, such as aggregation. * Many of the fields in these RelOptInfos are meaningless, but their Path @@ -639,14 +644,18 @@ typedef struct PartitionSchemeData *PartitionScheme; * boundinfo - Partition bounds * nparts - Number of partitions * part_rels - RelOptInfos for each partition - * partexprs - Partition key expressions + * partexprs, nullable_partexprs - Partition key expressions * * Note: A base relation always has only one set of partition keys, but a join * relation may have as many sets of partition keys as the number of relations - * being joined. partexprs is an array containing part_scheme->partnatts - * elements, each of which is a list of partition key expressions. For a base - * relation each list contains only one expression, but for a join relation - * there can be one per baserel. + * being joined. partexprs and nullable_partexprs are arrays containing + * part_scheme->partnatts elements each. Each of these elements is a list of + * partition key expressions. For a base relation each list in partexprs + * contains only one expression and nullable_partexprs is not populated. For a + * join relation, partexprs and nullable_partexprs contain partition key + * expressions from non-nullable and nullable relations resp. Lists at any + * given position in those arrays together contain as many elements as the + * number of joining relations. *---------- */ typedef enum RelOptKind @@ -654,6 +663,7 @@ typedef enum RelOptKind RELOPT_BASEREL, RELOPT_JOINREL, RELOPT_OTHER_MEMBER_REL, + RELOPT_OTHER_JOINREL, RELOPT_UPPER_REL, RELOPT_DEADREL } RelOptKind; @@ -667,13 +677,17 @@ typedef enum RelOptKind (rel)->reloptkind == RELOPT_OTHER_MEMBER_REL) /* Is the given relation a join relation? */ -#define IS_JOIN_REL(rel) ((rel)->reloptkind == RELOPT_JOINREL) +#define IS_JOIN_REL(rel) \ + ((rel)->reloptkind == RELOPT_JOINREL || \ + (rel)->reloptkind == RELOPT_OTHER_JOINREL) /* Is the given relation an upper relation? */ #define IS_UPPER_REL(rel) ((rel)->reloptkind == RELOPT_UPPER_REL) /* Is the given relation an "other" relation? 
*/ -#define IS_OTHER_REL(rel) ((rel)->reloptkind == RELOPT_OTHER_MEMBER_REL) +#define IS_OTHER_REL(rel) \ + ((rel)->reloptkind == RELOPT_OTHER_MEMBER_REL || \ + (rel)->reloptkind == RELOPT_OTHER_JOINREL) typedef struct RelOptInfo { @@ -759,7 +773,8 @@ typedef struct RelOptInfo struct PartitionBoundInfoData *boundinfo; /* Partition bounds */ struct RelOptInfo **part_rels; /* Array of RelOptInfos of partitions, * stored in the same order of bounds */ - List **partexprs; /* Partition key expressions. */ + List **partexprs; /* Non-nullable partition key expressions. */ + List **nullable_partexprs; /* Nullable partition key expressions. */ #ifdef __TBASE__ /* used for interval partition */ bool intervalparent; /* is interval partition */ @@ -773,6 +788,26 @@ typedef struct RelOptInfo } RelOptInfo; +/* + * Is given relation partitioned? + * + * A join between two partitioned relations with same partitioning scheme + * without any matching partitions will not have any partition in it but will + * have partition scheme set. So a relation is deemed to be partitioned if it + * has a partitioning scheme, bounds and positive number of partitions. + */ +#define IS_PARTITIONED_REL(rel) \ + ((rel)->part_scheme && (rel)->boundinfo && (rel)->nparts > 0) + +/* + * Convenience macro to make sure that a partitioned relation has all the + * required members set. + */ +#define REL_HAS_ALL_PART_PROPS(rel) \ + ((rel)->part_scheme && (rel)->boundinfo && (rel)->nparts > 0 && \ + (rel)->part_rels && (rel)->partexprs && (rel)->nullable_partexprs) + + /* * IndexOptInfo * Per-index information for planning/optimization diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h index 2198c9db..7c527ec8 100644 --- a/src/include/optimizer/cost.h +++ b/src/include/optimizer/cost.h @@ -77,6 +77,7 @@ extern bool enable_mergejoin; extern bool enable_hashjoin; extern bool enable_fast_query_shipping; extern bool enable_gathermerge; +extern bool enable_partition_wise_join; extern bool enable_nestloop_suppression; extern int constraint_exclusion; diff --git a/src/include/optimizer/pathnode.h b/src/include/optimizer/pathnode.h index 3df87235..d6e8ffdb 100644 --- a/src/include/optimizer/pathnode.h +++ b/src/include/optimizer/pathnode.h @@ -312,6 +312,8 @@ extern LimitPath *create_limit_path(PlannerInfo *root, RelOptInfo *rel, extern Path *reparameterize_path(PlannerInfo *root, Path *path, Relids required_outer, double loop_count); +extern Path *reparameterize_path_by_child(PlannerInfo *root, Path *path, + RelOptInfo *child_rel); extern Path *create_remotesubplan_path(PlannerInfo *root, Path *subpath, Distribution *distribution); @@ -354,6 +356,10 @@ extern ParamPathInfo *get_appendrel_parampathinfo(RelOptInfo *appendrel, Relids required_outer); extern ParamPathInfo *find_param_path_info(RelOptInfo *rel, Relids required_outer); +extern RelOptInfo *build_child_join_rel(PlannerInfo *root, + RelOptInfo *outer_rel, RelOptInfo *inner_rel, + RelOptInfo *parent_joinrel, List *restrictlist, + SpecialJoinInfo *sjinfo, JoinType jointype); #ifdef __TBASE__ extern Path *create_redistribute_grouping_path(PlannerInfo *root, diff --git a/src/include/optimizer/paths.h b/src/include/optimizer/paths.h index 416d15d8..48d6f994 100644 --- a/src/include/optimizer/paths.h +++ b/src/include/optimizer/paths.h @@ -59,6 +59,8 @@ extern int compute_parallel_worker(RelOptInfo *rel, double heap_pages, double index_pages); extern void create_partial_bitmap_paths(PlannerInfo *root, RelOptInfo *rel, Path *bitmapqual); +extern void 
generate_partition_wise_join_paths(PlannerInfo *root, + RelOptInfo *rel); #ifdef OPTIMIZER_DEBUG extern void debug_print_rel(PlannerInfo *root, RelOptInfo *rel); @@ -112,6 +114,9 @@ extern bool have_join_order_restriction(PlannerInfo *root, RelOptInfo *rel1, RelOptInfo *rel2); extern bool have_dangerous_phv(PlannerInfo *root, Relids outer_relids, Relids inner_params); +extern void mark_dummy_rel(RelOptInfo *rel); +extern bool have_partkey_equi_join(RelOptInfo *rel1, RelOptInfo *rel2, + JoinType jointype, List *restrictlist); /* * equivclass.c diff --git a/src/include/optimizer/placeholder.h b/src/include/optimizer/placeholder.h index 772fef33..a4a7b79f 100644 --- a/src/include/optimizer/placeholder.h +++ b/src/include/optimizer/placeholder.h @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * * placeholder.h - * prototypes for optimizer/util/placeholder.c. + * prototypes for optimizer/util/placeholder.c. * * * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group @@ -18,15 +18,17 @@ extern PlaceHolderVar *make_placeholder_expr(PlannerInfo *root, Expr *expr, - Relids phrels); + Relids phrels); extern PlaceHolderInfo *find_placeholder_info(PlannerInfo *root, - PlaceHolderVar *phv, bool create_new_ph); + PlaceHolderVar *phv, bool create_new_ph); extern void find_placeholders_in_jointree(PlannerInfo *root); extern void update_placeholder_eval_levels(PlannerInfo *root, - SpecialJoinInfo *new_sjinfo); + SpecialJoinInfo *new_sjinfo); extern void fix_placeholder_input_needed_levels(PlannerInfo *root); extern void add_placeholders_to_base_rels(PlannerInfo *root); extern void add_placeholders_to_joinrel(PlannerInfo *root, RelOptInfo *joinrel, - RelOptInfo *outer_rel, RelOptInfo *inner_rel); + RelOptInfo *outer_rel, RelOptInfo *inner_rel); +extern void add_placeholders_to_child_joinrel(PlannerInfo *root, + RelOptInfo *childrel, RelOptInfo *parentrel); -#endif /* PLACEHOLDER_H */ +#endif /* PLACEHOLDER_H */ diff --git a/src/include/optimizer/planner.h b/src/include/optimizer/planner.h index b10500a3..464efbe4 100644 --- a/src/include/optimizer/planner.h +++ b/src/include/optimizer/planner.h @@ -122,6 +122,8 @@ extern Expr *preprocess_phv_expression(PlannerInfo *root, Expr *expr); extern bool plan_cluster_use_sort(Oid tableOid, Oid indexOid); extern List *get_partitioned_child_rels(PlannerInfo *root, Index rti); +extern List *get_partitioned_child_rels_for_join(PlannerInfo *root, + Relids join_relids); extern void preprocess_rowmarks(PlannerInfo *root); diff --git a/src/include/optimizer/prep.h b/src/include/optimizer/prep.h index 08b43b52..e51066ed 100644 --- a/src/include/optimizer/prep.h +++ b/src/include/optimizer/prep.h @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * * prep.h - * prototypes for files in optimizer/prep/ + * prototypes for files in optimizer/prep/ * * * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group @@ -41,7 +41,7 @@ extern Expr *canonicalize_qual(Expr *qual); extern List *preprocess_targetlist(PlannerInfo *root, List *tlist); extern List *preprocess_onconflict_targetlist(List *tlist, - int result_relation, List *range_table); + int result_relation, List *range_table); extern PlanRowMark *get_plan_rowmark(List *rowmarks, Index rtindex); @@ -53,9 +53,15 @@ extern RelOptInfo *plan_set_operations(PlannerInfo *root); extern void expand_inherited_tables(PlannerInfo *root); extern Node *adjust_appendrel_attrs(PlannerInfo *root, Node *node, - AppendRelInfo 
*appinfo); + AppendRelInfo *appinfo); extern Node *adjust_appendrel_attrs_multilevel(PlannerInfo *root, Node *node, - RelOptInfo *child_rel); + RelOptInfo *child_rel); -#endif /* PREP_H */ +extern SpecialJoinInfo *build_child_join_sjinfo(PlannerInfo *root, + SpecialJoinInfo *parent_sjinfo, + Relids left_relids, Relids right_relids); +extern Relids adjust_child_relids_multilevel(PlannerInfo *root, Relids relids, + Relids child_relids, Relids top_parent_relids); + +#endif /* PREP_H */ diff --git a/src/test/regress/expected/partition_join.out b/src/test/regress/expected/partition_join.out new file mode 100644 index 00000000..234b8b53 --- /dev/null +++ b/src/test/regress/expected/partition_join.out @@ -0,0 +1,1789 @@ +-- +-- PARTITION_JOIN +-- Test partition-wise join between partitioned tables +-- +-- Enable partition-wise join, which by default is disabled. +SET enable_partition_wise_join to true; +-- +-- partitioned by a single column +-- +CREATE TABLE prt1 (a int, b int, c varchar) PARTITION BY RANGE(a); +CREATE TABLE prt1_p1 PARTITION OF prt1 FOR VALUES FROM (0) TO (250); +CREATE TABLE prt1_p3 PARTITION OF prt1 FOR VALUES FROM (500) TO (600); +CREATE TABLE prt1_p2 PARTITION OF prt1 FOR VALUES FROM (250) TO (500); +INSERT INTO prt1 SELECT i, i % 25, to_char(i, 'FM0000') FROM generate_series(0, 599) i WHERE i % 2 = 0; +CREATE INDEX iprt1_p1_a on prt1_p1(a); +CREATE INDEX iprt1_p2_a on prt1_p2(a); +CREATE INDEX iprt1_p3_a on prt1_p3(a); +ANALYZE prt1; +CREATE TABLE prt2 (a int, b int, c varchar) PARTITION BY RANGE(b); +CREATE TABLE prt2_p1 PARTITION OF prt2 FOR VALUES FROM (0) TO (250); +CREATE TABLE prt2_p2 PARTITION OF prt2 FOR VALUES FROM (250) TO (500); +CREATE TABLE prt2_p3 PARTITION OF prt2 FOR VALUES FROM (500) TO (600); +INSERT INTO prt2 SELECT i % 25, i, to_char(i, 'FM0000') FROM generate_series(0, 599) i WHERE i % 3 = 0; +CREATE INDEX iprt2_p1_b on prt2_p1(b); +CREATE INDEX iprt2_p2_b on prt2_p2(b); +CREATE INDEX iprt2_p3_b on prt2_p3(b); +ANALYZE prt2; +-- inner join +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1, prt2 t2 WHERE t1.a = t2.b AND t1.b = 0 ORDER BY t1.a, t2.b; + QUERY PLAN +-------------------------------------------------- + Sort + Sort Key: t1.a + -> Append + -> Hash Join + Hash Cond: (t2.b = t1.a) + -> Seq Scan on prt2_p1 t2 + -> Hash + -> Seq Scan on prt1_p1 t1 + Filter: (b = 0) + -> Hash Join + Hash Cond: (t2_1.b = t1_1.a) + -> Seq Scan on prt2_p2 t2_1 + -> Hash + -> Seq Scan on prt1_p2 t1_1 + Filter: (b = 0) + -> Hash Join + Hash Cond: (t2_2.b = t1_2.a) + -> Seq Scan on prt2_p3 t2_2 + -> Hash + -> Seq Scan on prt1_p3 t1_2 + Filter: (b = 0) +(21 rows) + +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1, prt2 t2 WHERE t1.a = t2.b AND t1.b = 0 ORDER BY t1.a, t2.b; + a | c | b | c +-----+------+-----+------ + 0 | 0000 | 0 | 0000 + 150 | 0150 | 150 | 0150 + 300 | 0300 | 300 | 0300 + 450 | 0450 | 450 | 0450 +(4 rows) + +-- left outer join, with whole-row reference +EXPLAIN (COSTS OFF) +SELECT t1, t2 FROM prt1 t1 LEFT JOIN prt2 t2 ON t1.a = t2.b WHERE t1.b = 0 ORDER BY t1.a, t2.b; + QUERY PLAN +-------------------------------------------------------- + Sort + Sort Key: t1.a, t2.b + -> Result + -> Append + -> Hash Right Join + Hash Cond: (t2.b = t1.a) + -> Seq Scan on prt2_p1 t2 + -> Hash + -> Seq Scan on prt1_p1 t1 + Filter: (b = 0) + -> Hash Right Join + Hash Cond: (t2_1.b = t1_1.a) + -> Seq Scan on prt2_p2 t2_1 + -> Hash + -> Seq Scan on prt1_p2 t1_1 + Filter: (b = 0) + -> Hash Right Join + Hash Cond: (t2_2.b = t1_2.a) + -> Seq Scan on prt2_p3 t2_2 + -> 
Hash + -> Seq Scan on prt1_p3 t1_2 + Filter: (b = 0) +(22 rows) + +SELECT t1, t2 FROM prt1 t1 LEFT JOIN prt2 t2 ON t1.a = t2.b WHERE t1.b = 0 ORDER BY t1.a, t2.b; + t1 | t2 +--------------+-------------- + (0,0,0000) | (0,0,0000) + (50,0,0050) | + (100,0,0100) | + (150,0,0150) | (0,150,0150) + (200,0,0200) | + (250,0,0250) | + (300,0,0300) | (0,300,0300) + (350,0,0350) | + (400,0,0400) | + (450,0,0450) | (0,450,0450) + (500,0,0500) | + (550,0,0550) | +(12 rows) + +-- right outer join +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1 RIGHT JOIN prt2 t2 ON t1.a = t2.b WHERE t2.a = 0 ORDER BY t1.a, t2.b; + QUERY PLAN +--------------------------------------------------------------------- + Sort + Sort Key: t1.a, t2.b + -> Result + -> Append + -> Hash Right Join + Hash Cond: (t1.a = t2.b) + -> Seq Scan on prt1_p1 t1 + -> Hash + -> Seq Scan on prt2_p1 t2 + Filter: (a = 0) + -> Hash Right Join + Hash Cond: (t1_1.a = t2_1.b) + -> Seq Scan on prt1_p2 t1_1 + -> Hash + -> Seq Scan on prt2_p2 t2_1 + Filter: (a = 0) + -> Nested Loop Left Join + -> Seq Scan on prt2_p3 t2_2 + Filter: (a = 0) + -> Index Scan using iprt1_p3_a on prt1_p3 t1_2 + Index Cond: (a = t2_2.b) +(21 rows) + +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1 RIGHT JOIN prt2 t2 ON t1.a = t2.b WHERE t2.a = 0 ORDER BY t1.a, t2.b; + a | c | b | c +-----+------+-----+------ + 0 | 0000 | 0 | 0000 + 150 | 0150 | 150 | 0150 + 300 | 0300 | 300 | 0300 + 450 | 0450 | 450 | 0450 + | | 75 | 0075 + | | 225 | 0225 + | | 375 | 0375 + | | 525 | 0525 +(8 rows) + +-- full outer join, with placeholder vars +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT 50 phv, * FROM prt1 WHERE prt1.b = 0) t1 FULL JOIN (SELECT 75 phv, * FROM prt2 WHERE prt2.a = 0) t2 ON (t1.a = t2.b) WHERE t1.phv = t1.a OR t2.phv = t2.b ORDER BY t1.a, t2.b; + QUERY PLAN +------------------------------------------------------------------ + Sort + Sort Key: prt1_p1.a, prt2_p1.b + -> Append + -> Hash Full Join + Hash Cond: (prt1_p1.a = prt2_p1.b) + Filter: (((50) = prt1_p1.a) OR ((75) = prt2_p1.b)) + -> Seq Scan on prt1_p1 + Filter: (b = 0) + -> Hash + -> Seq Scan on prt2_p1 + Filter: (a = 0) + -> Hash Full Join + Hash Cond: (prt1_p2.a = prt2_p2.b) + Filter: (((50) = prt1_p2.a) OR ((75) = prt2_p2.b)) + -> Seq Scan on prt1_p2 + Filter: (b = 0) + -> Hash + -> Seq Scan on prt2_p2 + Filter: (a = 0) + -> Hash Full Join + Hash Cond: (prt1_p3.a = prt2_p3.b) + Filter: (((50) = prt1_p3.a) OR ((75) = prt2_p3.b)) + -> Seq Scan on prt1_p3 + Filter: (b = 0) + -> Hash + -> Seq Scan on prt2_p3 + Filter: (a = 0) +(27 rows) + +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT 50 phv, * FROM prt1 WHERE prt1.b = 0) t1 FULL JOIN (SELECT 75 phv, * FROM prt2 WHERE prt2.a = 0) t2 ON (t1.a = t2.b) WHERE t1.phv = t1.a OR t2.phv = t2.b ORDER BY t1.a, t2.b; + a | c | b | c +----+------+----+------ + 50 | 0050 | | + | | 75 | 0075 +(2 rows) + +-- Join with pruned partitions from joining relations +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1, prt2 t2 WHERE t1.a = t2.b AND t1.a < 450 AND t2.b > 250 AND t1.b = 0 ORDER BY t1.a, t2.b; + QUERY PLAN +----------------------------------------------------------- + Sort + Sort Key: t1.a + -> Append + -> Hash Join + Hash Cond: (t2.b = t1.a) + -> Seq Scan on prt2_p2 t2 + Filter: (b > 250) + -> Hash + -> Seq Scan on prt1_p2 t1 + Filter: ((a < 450) AND (b = 0)) +(10 rows) + +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1, prt2 t2 WHERE t1.a = t2.b AND t1.a < 450 AND t2.b > 250 AND t1.b = 0 ORDER BY t1.a, t2.b; + a | c | b | c 
+-----+------+-----+------ + 300 | 0300 | 300 | 0300 +(1 row) + +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a < 450) t1 LEFT JOIN (SELECT * FROM prt2 WHERE b > 250) t2 ON t1.a = t2.b WHERE t1.b = 0 ORDER BY t1.a, t2.b; + QUERY PLAN +----------------------------------------------------------- + Sort + Sort Key: prt1_p1.a, b + -> Append + -> Hash Left Join + Hash Cond: (prt1_p1.a = b) + -> Seq Scan on prt1_p1 + Filter: ((a < 450) AND (b = 0)) + -> Hash + -> Result + One-Time Filter: false + -> Hash Right Join + Hash Cond: (prt2_p2.b = prt1_p2.a) + -> Seq Scan on prt2_p2 + Filter: (b > 250) + -> Hash + -> Seq Scan on prt1_p2 + Filter: ((a < 450) AND (b = 0)) +(17 rows) + +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a < 450) t1 LEFT JOIN (SELECT * FROM prt2 WHERE b > 250) t2 ON t1.a = t2.b WHERE t1.b = 0 ORDER BY t1.a, t2.b; + a | c | b | c +-----+------+-----+------ + 0 | 0000 | | + 50 | 0050 | | + 100 | 0100 | | + 150 | 0150 | | + 200 | 0200 | | + 250 | 0250 | | + 300 | 0300 | 300 | 0300 + 350 | 0350 | | + 400 | 0400 | | +(9 rows) + +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a < 450) t1 FULL JOIN (SELECT * FROM prt2 WHERE b > 250) t2 ON t1.a = t2.b WHERE t1.b = 0 OR t2.a = 0 ORDER BY t1.a, t2.b; + QUERY PLAN +------------------------------------------------------------ + Sort + Sort Key: prt1_p1.a, b + -> Append + -> Hash Full Join + Hash Cond: (prt1_p1.a = b) + Filter: ((prt1_p1.b = 0) OR (a = 0)) + -> Seq Scan on prt1_p1 + Filter: (a < 450) + -> Hash + -> Result + One-Time Filter: false + -> Hash Full Join + Hash Cond: (prt1_p2.a = prt2_p2.b) + Filter: ((prt1_p2.b = 0) OR (prt2_p2.a = 0)) + -> Seq Scan on prt1_p2 + Filter: (a < 450) + -> Hash + -> Seq Scan on prt2_p2 + Filter: (b > 250) + -> Hash Full Join + Hash Cond: (prt2_p3.b = a) + Filter: ((b = 0) OR (prt2_p3.a = 0)) + -> Seq Scan on prt2_p3 + Filter: (b > 250) + -> Hash + -> Result + One-Time Filter: false +(27 rows) + +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a < 450) t1 FULL JOIN (SELECT * FROM prt2 WHERE b > 250) t2 ON t1.a = t2.b WHERE t1.b = 0 OR t2.a = 0 ORDER BY t1.a, t2.b; + a | c | b | c +-----+------+-----+------ + 0 | 0000 | | + 50 | 0050 | | + 100 | 0100 | | + 150 | 0150 | | + 200 | 0200 | | + 250 | 0250 | | + 300 | 0300 | 300 | 0300 + 350 | 0350 | | + 400 | 0400 | | + | | 375 | 0375 + | | 450 | 0450 + | | 525 | 0525 +(12 rows) + +-- Semi-join +EXPLAIN (COSTS OFF) +SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t2.b FROM prt2 t2 WHERE t2.a = 0) AND t1.b = 0 ORDER BY t1.a; + QUERY PLAN +-------------------------------------------------- + Sort + Sort Key: t1.a + -> Append + -> Hash Semi Join + Hash Cond: (t1.a = t2.b) + -> Seq Scan on prt1_p1 t1 + Filter: (b = 0) + -> Hash + -> Seq Scan on prt2_p1 t2 + Filter: (a = 0) + -> Hash Semi Join + Hash Cond: (t1_1.a = t2_1.b) + -> Seq Scan on prt1_p2 t1_1 + Filter: (b = 0) + -> Hash + -> Seq Scan on prt2_p2 t2_1 + Filter: (a = 0) + -> Nested Loop Semi Join + Join Filter: (t1_2.a = t2_2.b) + -> Seq Scan on prt1_p3 t1_2 + Filter: (b = 0) + -> Materialize + -> Seq Scan on prt2_p3 t2_2 + Filter: (a = 0) +(24 rows) + +SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t2.b FROM prt2 t2 WHERE t2.a = 0) AND t1.b = 0 ORDER BY t1.a; + a | b | c +-----+---+------ + 0 | 0 | 0000 + 150 | 0 | 0150 + 300 | 0 | 0300 + 450 | 0 | 0450 +(4 rows) + +-- Anti-join with aggregates +EXPLAIN (COSTS OFF) +SELECT sum(t1.a), avg(t1.a), sum(t1.b), avg(t1.b) FROM prt1 t1 WHERE NOT EXISTS (SELECT 1 
FROM prt2 t2 WHERE t1.a = t2.b); + QUERY PLAN +-------------------------------------------------- + Aggregate + -> Append + -> Hash Anti Join + Hash Cond: (t1.a = t2.b) + -> Seq Scan on prt1_p1 t1 + -> Hash + -> Seq Scan on prt2_p1 t2 + -> Hash Anti Join + Hash Cond: (t1_1.a = t2_1.b) + -> Seq Scan on prt1_p2 t1_1 + -> Hash + -> Seq Scan on prt2_p2 t2_1 + -> Hash Anti Join + Hash Cond: (t1_2.a = t2_2.b) + -> Seq Scan on prt1_p3 t1_2 + -> Hash + -> Seq Scan on prt2_p3 t2_2 +(17 rows) + +SELECT sum(t1.a), avg(t1.a), sum(t1.b), avg(t1.b) FROM prt1 t1 WHERE NOT EXISTS (SELECT 1 FROM prt2 t2 WHERE t1.a = t2.b); + sum | avg | sum | avg +-------+----------------------+------+--------------------- + 60000 | 300.0000000000000000 | 2400 | 12.0000000000000000 +(1 row) + +-- lateral reference +EXPLAIN (COSTS OFF) +SELECT * FROM prt1 t1 LEFT JOIN LATERAL + (SELECT t2.a AS t2a, t3.a AS t3a, least(t1.a,t2.a,t3.b) FROM prt1 t2 JOIN prt2 t3 ON (t2.a = t3.b)) ss + ON t1.a = ss.t2a WHERE t1.b = 0 ORDER BY t1.a; + QUERY PLAN +-------------------------------------------------------------------------------- + Sort + Sort Key: t1.a + -> Result + -> Append + -> Nested Loop Left Join + -> Seq Scan on prt1_p1 t1 + Filter: (b = 0) + -> Nested Loop + -> Index Only Scan using iprt1_p1_a on prt1_p1 t2 + Index Cond: (a = t1.a) + -> Index Scan using iprt2_p1_b on prt2_p1 t3 + Index Cond: (b = t2.a) + -> Nested Loop Left Join + -> Seq Scan on prt1_p2 t1_1 + Filter: (b = 0) + -> Nested Loop + -> Index Only Scan using iprt1_p2_a on prt1_p2 t2_1 + Index Cond: (a = t1_1.a) + -> Index Scan using iprt2_p2_b on prt2_p2 t3_1 + Index Cond: (b = t2_1.a) + -> Nested Loop Left Join + -> Seq Scan on prt1_p3 t1_2 + Filter: (b = 0) + -> Nested Loop + -> Index Only Scan using iprt1_p3_a on prt1_p3 t2_2 + Index Cond: (a = t1_2.a) + -> Index Scan using iprt2_p3_b on prt2_p3 t3_2 + Index Cond: (b = t2_2.a) +(28 rows) + +SELECT * FROM prt1 t1 LEFT JOIN LATERAL + (SELECT t2.a AS t2a, t3.a AS t3a, least(t1.a,t2.a,t3.b) FROM prt1 t2 JOIN prt2 t3 ON (t2.a = t3.b)) ss + ON t1.a = ss.t2a WHERE t1.b = 0 ORDER BY t1.a; + a | b | c | t2a | t3a | least +-----+---+------+-----+-----+------- + 0 | 0 | 0000 | 0 | 0 | 0 + 50 | 0 | 0050 | | | + 100 | 0 | 0100 | | | + 150 | 0 | 0150 | 150 | 0 | 150 + 200 | 0 | 0200 | | | + 250 | 0 | 0250 | | | + 300 | 0 | 0300 | 300 | 0 | 300 + 350 | 0 | 0350 | | | + 400 | 0 | 0400 | | | + 450 | 0 | 0450 | 450 | 0 | 450 + 500 | 0 | 0500 | | | + 550 | 0 | 0550 | | | +(12 rows) + +EXPLAIN (COSTS OFF) +SELECT t1.a, ss.t2a, ss.t2c FROM prt1 t1 LEFT JOIN LATERAL + (SELECT t2.a AS t2a, t3.a AS t3a, t2.b t2b, t2.c t2c, least(t1.a,t2.a,t3.b) FROM prt1 t2 JOIN prt2 t3 ON (t2.a = t3.b)) ss + ON t1.c = ss.t2c WHERE (t1.b + coalesce(ss.t2b, 0)) = 0 ORDER BY t1.a; + QUERY PLAN +-------------------------------------------------------------- + Sort + Sort Key: t1.a + -> Hash Left Join + Hash Cond: ((t1.c)::text = (t2.c)::text) + Filter: ((t1.b + COALESCE(t2.b, 0)) = 0) + -> Append + -> Seq Scan on prt1_p1 t1 + -> Seq Scan on prt1_p2 t1_1 + -> Seq Scan on prt1_p3 t1_2 + -> Hash + -> Append + -> Hash Join + Hash Cond: (t2.a = t3.b) + -> Seq Scan on prt1_p1 t2 + -> Hash + -> Seq Scan on prt2_p1 t3 + -> Hash Join + Hash Cond: (t2_1.a = t3_1.b) + -> Seq Scan on prt1_p2 t2_1 + -> Hash + -> Seq Scan on prt2_p2 t3_1 + -> Hash Join + Hash Cond: (t2_2.a = t3_2.b) + -> Seq Scan on prt1_p3 t2_2 + -> Hash + -> Seq Scan on prt2_p3 t3_2 +(26 rows) + +SELECT t1.a, ss.t2a, ss.t2c FROM prt1 t1 LEFT JOIN LATERAL + (SELECT t2.a AS t2a, t3.a AS t3a, t2.b t2b, 
t2.c t2c, least(t1.a,t2.a,t3.a) FROM prt1 t2 JOIN prt2 t3 ON (t2.a = t3.b)) ss + ON t1.c = ss.t2c WHERE (t1.b + coalesce(ss.t2b, 0)) = 0 ORDER BY t1.a; + a | t2a | t2c +-----+-----+------ + 0 | 0 | 0000 + 50 | | + 100 | | + 150 | 150 | 0150 + 200 | | + 250 | | + 300 | 300 | 0300 + 350 | | + 400 | | + 450 | 450 | 0450 + 500 | | + 550 | | +(12 rows) + +-- +-- partitioned by expression +-- +CREATE TABLE prt1_e (a int, b int, c int) PARTITION BY RANGE(((a + b)/2)); +CREATE TABLE prt1_e_p1 PARTITION OF prt1_e FOR VALUES FROM (0) TO (250); +CREATE TABLE prt1_e_p2 PARTITION OF prt1_e FOR VALUES FROM (250) TO (500); +CREATE TABLE prt1_e_p3 PARTITION OF prt1_e FOR VALUES FROM (500) TO (600); +INSERT INTO prt1_e SELECT i, i, i % 25 FROM generate_series(0, 599, 2) i; +CREATE INDEX iprt1_e_p1_ab2 on prt1_e_p1(((a+b)/2)); +CREATE INDEX iprt1_e_p2_ab2 on prt1_e_p2(((a+b)/2)); +CREATE INDEX iprt1_e_p3_ab2 on prt1_e_p3(((a+b)/2)); +ANALYZE prt1_e; +CREATE TABLE prt2_e (a int, b int, c int) PARTITION BY RANGE(((b + a)/2)); +CREATE TABLE prt2_e_p1 PARTITION OF prt2_e FOR VALUES FROM (0) TO (250); +CREATE TABLE prt2_e_p2 PARTITION OF prt2_e FOR VALUES FROM (250) TO (500); +CREATE TABLE prt2_e_p3 PARTITION OF prt2_e FOR VALUES FROM (500) TO (600); +INSERT INTO prt2_e SELECT i, i, i % 25 FROM generate_series(0, 599, 3) i; +ANALYZE prt2_e; +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_e t1, prt2_e t2 WHERE (t1.a + t1.b)/2 = (t2.b + t2.a)/2 AND t1.c = 0 ORDER BY t1.a, t2.b; + QUERY PLAN +------------------------------------------------------------------------------ + Sort + Sort Key: t1.a, t2.b + -> Append + -> Hash Join + Hash Cond: (((t2.b + t2.a) / 2) = ((t1.a + t1.b) / 2)) + -> Seq Scan on prt2_e_p1 t2 + -> Hash + -> Seq Scan on prt1_e_p1 t1 + Filter: (c = 0) + -> Hash Join + Hash Cond: (((t2_1.b + t2_1.a) / 2) = ((t1_1.a + t1_1.b) / 2)) + -> Seq Scan on prt2_e_p2 t2_1 + -> Hash + -> Seq Scan on prt1_e_p2 t1_1 + Filter: (c = 0) + -> Hash Join + Hash Cond: (((t2_2.b + t2_2.a) / 2) = ((t1_2.a + t1_2.b) / 2)) + -> Seq Scan on prt2_e_p3 t2_2 + -> Hash + -> Seq Scan on prt1_e_p3 t1_2 + Filter: (c = 0) +(21 rows) + +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_e t1, prt2_e t2 WHERE (t1.a + t1.b)/2 = (t2.b + t2.a)/2 AND t1.c = 0 ORDER BY t1.a, t2.b; + a | c | b | c +-----+---+-----+--- + 0 | 0 | 0 | 0 + 150 | 0 | 150 | 0 + 300 | 0 | 300 | 0 + 450 | 0 | 450 | 0 +(4 rows) + +-- +-- N-way join +-- +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c, t3.a + t3.b, t3.c FROM prt1 t1, prt2 t2, prt1_e t3 WHERE t1.a = t2.b AND t1.a = (t3.a + t3.b)/2 AND t1.b = 0 ORDER BY t1.a, t2.b; + QUERY PLAN +--------------------------------------------------------------------------- + Sort + Sort Key: t1.a + -> Result + -> Append + -> Nested Loop + Join Filter: (t1.a = ((t3.a + t3.b) / 2)) + -> Hash Join + Hash Cond: (t2.b = t1.a) + -> Seq Scan on prt2_p1 t2 + -> Hash + -> Seq Scan on prt1_p1 t1 + Filter: (b = 0) + -> Index Scan using iprt1_e_p1_ab2 on prt1_e_p1 t3 + Index Cond: (((a + b) / 2) = t2.b) + -> Nested Loop + Join Filter: (t1_1.a = ((t3_1.a + t3_1.b) / 2)) + -> Hash Join + Hash Cond: (t2_1.b = t1_1.a) + -> Seq Scan on prt2_p2 t2_1 + -> Hash + -> Seq Scan on prt1_p2 t1_1 + Filter: (b = 0) + -> Index Scan using iprt1_e_p2_ab2 on prt1_e_p2 t3_1 + Index Cond: (((a + b) / 2) = t2_1.b) + -> Nested Loop + Join Filter: (t1_2.a = ((t3_2.a + t3_2.b) / 2)) + -> Hash Join + Hash Cond: (t2_2.b = t1_2.a) + -> Seq Scan on prt2_p3 t2_2 + -> Hash + -> Seq Scan on prt1_p3 t1_2 + Filter: (b = 0) + -> Index Scan using iprt1_e_p3_ab2 
on prt1_e_p3 t3_2 + Index Cond: (((a + b) / 2) = t2_2.b) +(34 rows) + +SELECT t1.a, t1.c, t2.b, t2.c, t3.a + t3.b, t3.c FROM prt1 t1, prt2 t2, prt1_e t3 WHERE t1.a = t2.b AND t1.a = (t3.a + t3.b)/2 AND t1.b = 0 ORDER BY t1.a, t2.b; + a | c | b | c | ?column? | c +-----+------+-----+------+----------+--- + 0 | 0000 | 0 | 0000 | 0 | 0 + 150 | 0150 | 150 | 0150 | 300 | 0 + 300 | 0300 | 300 | 0300 | 600 | 0 + 450 | 0450 | 450 | 0450 | 900 | 0 +(4 rows) + +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c, t3.a + t3.b, t3.c FROM (prt1 t1 LEFT JOIN prt2 t2 ON t1.a = t2.b) LEFT JOIN prt1_e t3 ON (t1.a = (t3.a + t3.b)/2) WHERE t1.b = 0 ORDER BY t1.a, t2.b, t3.a + t3.b; + QUERY PLAN +-------------------------------------------------------------------- + Sort + Sort Key: t1.a, t2.b, ((t3.a + t3.b)) + -> Result + -> Append + -> Hash Right Join + Hash Cond: (((t3.a + t3.b) / 2) = t1.a) + -> Seq Scan on prt1_e_p1 t3 + -> Hash + -> Hash Right Join + Hash Cond: (t2.b = t1.a) + -> Seq Scan on prt2_p1 t2 + -> Hash + -> Seq Scan on prt1_p1 t1 + Filter: (b = 0) + -> Hash Right Join + Hash Cond: (((t3_1.a + t3_1.b) / 2) = t1_1.a) + -> Seq Scan on prt1_e_p2 t3_1 + -> Hash + -> Hash Right Join + Hash Cond: (t2_1.b = t1_1.a) + -> Seq Scan on prt2_p2 t2_1 + -> Hash + -> Seq Scan on prt1_p2 t1_1 + Filter: (b = 0) + -> Hash Right Join + Hash Cond: (((t3_2.a + t3_2.b) / 2) = t1_2.a) + -> Seq Scan on prt1_e_p3 t3_2 + -> Hash + -> Hash Right Join + Hash Cond: (t2_2.b = t1_2.a) + -> Seq Scan on prt2_p3 t2_2 + -> Hash + -> Seq Scan on prt1_p3 t1_2 + Filter: (b = 0) +(34 rows) + +SELECT t1.a, t1.c, t2.b, t2.c, t3.a + t3.b, t3.c FROM (prt1 t1 LEFT JOIN prt2 t2 ON t1.a = t2.b) LEFT JOIN prt1_e t3 ON (t1.a = (t3.a + t3.b)/2) WHERE t1.b = 0 ORDER BY t1.a, t2.b, t3.a + t3.b; + a | c | b | c | ?column? 
| c +-----+------+-----+------+----------+--- + 0 | 0000 | 0 | 0000 | 0 | 0 + 50 | 0050 | | | 100 | 0 + 100 | 0100 | | | 200 | 0 + 150 | 0150 | 150 | 0150 | 300 | 0 + 200 | 0200 | | | 400 | 0 + 250 | 0250 | | | 500 | 0 + 300 | 0300 | 300 | 0300 | 600 | 0 + 350 | 0350 | | | 700 | 0 + 400 | 0400 | | | 800 | 0 + 450 | 0450 | 450 | 0450 | 900 | 0 + 500 | 0500 | | | 1000 | 0 + 550 | 0550 | | | 1100 | 0 +(12 rows) + +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c, t3.a + t3.b, t3.c FROM (prt1 t1 LEFT JOIN prt2 t2 ON t1.a = t2.b) RIGHT JOIN prt1_e t3 ON (t1.a = (t3.a + t3.b)/2) WHERE t3.c = 0 ORDER BY t1.a, t2.b, t3.a + t3.b; + QUERY PLAN +------------------------------------------------------------------------- + Sort + Sort Key: t1.a, t2.b, ((t3.a + t3.b)) + -> Result + -> Append + -> Nested Loop Left Join + -> Hash Right Join + Hash Cond: (t1.a = ((t3.a + t3.b) / 2)) + -> Seq Scan on prt1_p1 t1 + -> Hash + -> Seq Scan on prt1_e_p1 t3 + Filter: (c = 0) + -> Index Scan using iprt2_p1_b on prt2_p1 t2 + Index Cond: (t1.a = b) + -> Nested Loop Left Join + -> Hash Right Join + Hash Cond: (t1_1.a = ((t3_1.a + t3_1.b) / 2)) + -> Seq Scan on prt1_p2 t1_1 + -> Hash + -> Seq Scan on prt1_e_p2 t3_1 + Filter: (c = 0) + -> Index Scan using iprt2_p2_b on prt2_p2 t2_1 + Index Cond: (t1_1.a = b) + -> Nested Loop Left Join + -> Hash Right Join + Hash Cond: (t1_2.a = ((t3_2.a + t3_2.b) / 2)) + -> Seq Scan on prt1_p3 t1_2 + -> Hash + -> Seq Scan on prt1_e_p3 t3_2 + Filter: (c = 0) + -> Index Scan using iprt2_p3_b on prt2_p3 t2_2 + Index Cond: (t1_2.a = b) +(31 rows) + +SELECT t1.a, t1.c, t2.b, t2.c, t3.a + t3.b, t3.c FROM (prt1 t1 LEFT JOIN prt2 t2 ON t1.a = t2.b) RIGHT JOIN prt1_e t3 ON (t1.a = (t3.a + t3.b)/2) WHERE t3.c = 0 ORDER BY t1.a, t2.b, t3.a + t3.b; + a | c | b | c | ?column? 
| c +-----+------+-----+------+----------+--- + 0 | 0000 | 0 | 0000 | 0 | 0 + 50 | 0050 | | | 100 | 0 + 100 | 0100 | | | 200 | 0 + 150 | 0150 | 150 | 0150 | 300 | 0 + 200 | 0200 | | | 400 | 0 + 250 | 0250 | | | 500 | 0 + 300 | 0300 | 300 | 0300 | 600 | 0 + 350 | 0350 | | | 700 | 0 + 400 | 0400 | | | 800 | 0 + 450 | 0450 | 450 | 0450 | 900 | 0 + 500 | 0500 | | | 1000 | 0 + 550 | 0550 | | | 1100 | 0 +(12 rows) + +-- Cases with non-nullable expressions in subquery results; +-- make sure these go to null as expected +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.phv, t2.b, t2.phv, t3.a + t3.b, t3.phv FROM ((SELECT 50 phv, * FROM prt1 WHERE prt1.b = 0) t1 FULL JOIN (SELECT 75 phv, * FROM prt2 WHERE prt2.a = 0) t2 ON (t1.a = t2.b)) FULL JOIN (SELECT 50 phv, * FROM prt1_e WHERE prt1_e.c = 0) t3 ON (t1.a = (t3.a + t3.b)/2) WHERE t1.a = t1.phv OR t2.b = t2.phv OR (t3.a + t3.b)/2 = t3.phv ORDER BY t1.a, t2.b, t3.a + t3.b; + QUERY PLAN +---------------------------------------------------------------------------------------------------------------------- + Sort + Sort Key: prt1_p1.a, prt2_p1.b, ((prt1_e_p1.a + prt1_e_p1.b)) + -> Result + -> Append + -> Hash Full Join + Hash Cond: (prt1_p1.a = ((prt1_e_p1.a + prt1_e_p1.b) / 2)) + Filter: ((prt1_p1.a = (50)) OR (prt2_p1.b = (75)) OR (((prt1_e_p1.a + prt1_e_p1.b) / 2) = (50))) + -> Hash Full Join + Hash Cond: (prt1_p1.a = prt2_p1.b) + -> Seq Scan on prt1_p1 + Filter: (b = 0) + -> Hash + -> Seq Scan on prt2_p1 + Filter: (a = 0) + -> Hash + -> Seq Scan on prt1_e_p1 + Filter: (c = 0) + -> Hash Full Join + Hash Cond: (prt1_p2.a = ((prt1_e_p2.a + prt1_e_p2.b) / 2)) + Filter: ((prt1_p2.a = (50)) OR (prt2_p2.b = (75)) OR (((prt1_e_p2.a + prt1_e_p2.b) / 2) = (50))) + -> Hash Full Join + Hash Cond: (prt1_p2.a = prt2_p2.b) + -> Seq Scan on prt1_p2 + Filter: (b = 0) + -> Hash + -> Seq Scan on prt2_p2 + Filter: (a = 0) + -> Hash + -> Seq Scan on prt1_e_p2 + Filter: (c = 0) + -> Hash Full Join + Hash Cond: (prt1_p3.a = ((prt1_e_p3.a + prt1_e_p3.b) / 2)) + Filter: ((prt1_p3.a = (50)) OR (prt2_p3.b = (75)) OR (((prt1_e_p3.a + prt1_e_p3.b) / 2) = (50))) + -> Hash Full Join + Hash Cond: (prt1_p3.a = prt2_p3.b) + -> Seq Scan on prt1_p3 + Filter: (b = 0) + -> Hash + -> Seq Scan on prt2_p3 + Filter: (a = 0) + -> Hash + -> Seq Scan on prt1_e_p3 + Filter: (c = 0) +(43 rows) + +SELECT t1.a, t1.phv, t2.b, t2.phv, t3.a + t3.b, t3.phv FROM ((SELECT 50 phv, * FROM prt1 WHERE prt1.b = 0) t1 FULL JOIN (SELECT 75 phv, * FROM prt2 WHERE prt2.a = 0) t2 ON (t1.a = t2.b)) FULL JOIN (SELECT 50 phv, * FROM prt1_e WHERE prt1_e.c = 0) t3 ON (t1.a = (t3.a + t3.b)/2) WHERE t1.a = t1.phv OR t2.b = t2.phv OR (t3.a + t3.b)/2 = t3.phv ORDER BY t1.a, t2.b, t3.a + t3.b; + a | phv | b | phv | ?column? 
| phv +----+-----+----+-----+----------+----- + 50 | 50 | | | 100 | 50 + | | 75 | 75 | | +(2 rows) + +-- Semi-join +EXPLAIN (COSTS OFF) +SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t1.b FROM prt2 t1, prt1_e t2 WHERE t1.a = 0 AND t1.b = (t2.a + t2.b)/2) AND t1.b = 0 ORDER BY t1.a; + QUERY PLAN +--------------------------------------------------------------------------------- + Sort + Sort Key: t1.a + -> Append + -> Nested Loop + Join Filter: (t1.a = t1_3.b) + -> HashAggregate + Group Key: t1_3.b + -> Hash Join + Hash Cond: (((t2.a + t2.b) / 2) = t1_3.b) + -> Seq Scan on prt1_e_p1 t2 + -> Hash + -> Seq Scan on prt2_p1 t1_3 + Filter: (a = 0) + -> Index Scan using iprt1_p1_a on prt1_p1 t1 + Index Cond: (a = ((t2.a + t2.b) / 2)) + Filter: (b = 0) + -> Nested Loop + Join Filter: (t1_1.a = t1_4.b) + -> HashAggregate + Group Key: t1_4.b + -> Hash Join + Hash Cond: (((t2_1.a + t2_1.b) / 2) = t1_4.b) + -> Seq Scan on prt1_e_p2 t2_1 + -> Hash + -> Seq Scan on prt2_p2 t1_4 + Filter: (a = 0) + -> Index Scan using iprt1_p2_a on prt1_p2 t1_1 + Index Cond: (a = ((t2_1.a + t2_1.b) / 2)) + Filter: (b = 0) + -> Nested Loop + Join Filter: (t1_2.a = t1_5.b) + -> HashAggregate + Group Key: t1_5.b + -> Nested Loop + -> Seq Scan on prt2_p3 t1_5 + Filter: (a = 0) + -> Index Scan using iprt1_e_p3_ab2 on prt1_e_p3 t2_2 + Index Cond: (((a + b) / 2) = t1_5.b) + -> Index Scan using iprt1_p3_a on prt1_p3 t1_2 + Index Cond: (a = ((t2_2.a + t2_2.b) / 2)) + Filter: (b = 0) +(41 rows) + +SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t1.b FROM prt2 t1, prt1_e t2 WHERE t1.a = 0 AND t1.b = (t2.a + t2.b)/2) AND t1.b = 0 ORDER BY t1.a; + a | b | c +-----+---+------ + 0 | 0 | 0000 + 150 | 0 | 0150 + 300 | 0 | 0300 + 450 | 0 | 0450 +(4 rows) + +EXPLAIN (COSTS OFF) +SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t1.b FROM prt2 t1 WHERE t1.b IN (SELECT (t1.a + t1.b)/2 FROM prt1_e t1 WHERE t1.c = 0)) AND t1.b = 0 ORDER BY t1.a; + QUERY PLAN +------------------------------------------------------------------------------- + Sort + Sort Key: t1.a + -> Append + -> Nested Loop + -> HashAggregate + Group Key: t1_3.b + -> Hash Semi Join + Hash Cond: (t1_3.b = ((t1_6.a + t1_6.b) / 2)) + -> Seq Scan on prt2_p1 t1_3 + -> Hash + -> Seq Scan on prt1_e_p1 t1_6 + Filter: (c = 0) + -> Index Scan using iprt1_p1_a on prt1_p1 t1 + Index Cond: (a = t1_3.b) + Filter: (b = 0) + -> Nested Loop + -> HashAggregate + Group Key: t1_4.b + -> Hash Semi Join + Hash Cond: (t1_4.b = ((t1_7.a + t1_7.b) / 2)) + -> Seq Scan on prt2_p2 t1_4 + -> Hash + -> Seq Scan on prt1_e_p2 t1_7 + Filter: (c = 0) + -> Index Scan using iprt1_p2_a on prt1_p2 t1_1 + Index Cond: (a = t1_4.b) + Filter: (b = 0) + -> Nested Loop + -> Unique + -> Sort + Sort Key: t1_5.b + -> Hash Semi Join + Hash Cond: (t1_5.b = ((t1_8.a + t1_8.b) / 2)) + -> Seq Scan on prt2_p3 t1_5 + -> Hash + -> Seq Scan on prt1_e_p3 t1_8 + Filter: (c = 0) + -> Index Scan using iprt1_p3_a on prt1_p3 t1_2 + Index Cond: (a = t1_5.b) + Filter: (b = 0) +(40 rows) + +SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t1.b FROM prt2 t1 WHERE t1.b IN (SELECT (t1.a + t1.b)/2 FROM prt1_e t1 WHERE t1.c = 0)) AND t1.b = 0 ORDER BY t1.a; + a | b | c +-----+---+------ + 0 | 0 | 0000 + 150 | 0 | 0150 + 300 | 0 | 0300 + 450 | 0 | 0450 +(4 rows) + +-- test merge joins +SET enable_hashjoin TO off; +SET enable_nestloop TO off; +EXPLAIN (COSTS OFF) +SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t1.b FROM prt2 t1 WHERE t1.b IN (SELECT (t1.a + t1.b)/2 FROM prt1_e t1 WHERE t1.c = 0)) AND t1.b = 0 ORDER BY t1.a; + QUERY PLAN 
+---------------------------------------------------------------- + Merge Append + Sort Key: t1.a + -> Merge Semi Join + Merge Cond: (t1.a = t1_3.b) + -> Sort + Sort Key: t1.a + -> Seq Scan on prt1_p1 t1 + Filter: (b = 0) + -> Merge Semi Join + Merge Cond: (t1_3.b = (((t1_6.a + t1_6.b) / 2))) + -> Sort + Sort Key: t1_3.b + -> Seq Scan on prt2_p1 t1_3 + -> Sort + Sort Key: (((t1_6.a + t1_6.b) / 2)) + -> Seq Scan on prt1_e_p1 t1_6 + Filter: (c = 0) + -> Merge Semi Join + Merge Cond: (t1_1.a = t1_4.b) + -> Sort + Sort Key: t1_1.a + -> Seq Scan on prt1_p2 t1_1 + Filter: (b = 0) + -> Merge Semi Join + Merge Cond: (t1_4.b = (((t1_7.a + t1_7.b) / 2))) + -> Sort + Sort Key: t1_4.b + -> Seq Scan on prt2_p2 t1_4 + -> Sort + Sort Key: (((t1_7.a + t1_7.b) / 2)) + -> Seq Scan on prt1_e_p2 t1_7 + Filter: (c = 0) + -> Merge Semi Join + Merge Cond: (t1_2.a = t1_5.b) + -> Sort + Sort Key: t1_2.a + -> Seq Scan on prt1_p3 t1_2 + Filter: (b = 0) + -> Merge Semi Join + Merge Cond: (t1_5.b = (((t1_8.a + t1_8.b) / 2))) + -> Sort + Sort Key: t1_5.b + -> Seq Scan on prt2_p3 t1_5 + -> Sort + Sort Key: (((t1_8.a + t1_8.b) / 2)) + -> Seq Scan on prt1_e_p3 t1_8 + Filter: (c = 0) +(47 rows) + +SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t1.b FROM prt2 t1 WHERE t1.b IN (SELECT (t1.a + t1.b)/2 FROM prt1_e t1 WHERE t1.c = 0)) AND t1.b = 0 ORDER BY t1.a; + a | b | c +-----+---+------ + 0 | 0 | 0000 + 150 | 0 | 0150 + 300 | 0 | 0300 + 450 | 0 | 0450 +(4 rows) + +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c, t3.a + t3.b, t3.c FROM (prt1 t1 LEFT JOIN prt2 t2 ON t1.a = t2.b) RIGHT JOIN prt1_e t3 ON (t1.a = (t3.a + t3.b)/2) WHERE t3.c = 0 ORDER BY t1.a, t2.b, t3.a + t3.b; + QUERY PLAN +---------------------------------------------------------------------------------- + Sort + Sort Key: t1.a, t2.b, ((t3.a + t3.b)) + -> Result + -> Append + -> Merge Left Join + Merge Cond: (t1.a = t2.b) + -> Sort + Sort Key: t1.a + -> Merge Left Join + Merge Cond: ((((t3.a + t3.b) / 2)) = t1.a) + -> Sort + Sort Key: (((t3.a + t3.b) / 2)) + -> Seq Scan on prt1_e_p1 t3 + Filter: (c = 0) + -> Sort + Sort Key: t1.a + -> Seq Scan on prt1_p1 t1 + -> Sort + Sort Key: t2.b + -> Seq Scan on prt2_p1 t2 + -> Merge Left Join + Merge Cond: (t1_1.a = t2_1.b) + -> Sort + Sort Key: t1_1.a + -> Merge Left Join + Merge Cond: ((((t3_1.a + t3_1.b) / 2)) = t1_1.a) + -> Sort + Sort Key: (((t3_1.a + t3_1.b) / 2)) + -> Seq Scan on prt1_e_p2 t3_1 + Filter: (c = 0) + -> Sort + Sort Key: t1_1.a + -> Seq Scan on prt1_p2 t1_1 + -> Sort + Sort Key: t2_1.b + -> Seq Scan on prt2_p2 t2_1 + -> Merge Left Join + Merge Cond: (t1_2.a = t2_2.b) + -> Sort + Sort Key: t1_2.a + -> Merge Left Join + Merge Cond: ((((t3_2.a + t3_2.b) / 2)) = t1_2.a) + -> Sort + Sort Key: (((t3_2.a + t3_2.b) / 2)) + -> Seq Scan on prt1_e_p3 t3_2 + Filter: (c = 0) + -> Sort + Sort Key: t1_2.a + -> Seq Scan on prt1_p3 t1_2 + -> Sort + Sort Key: t2_2.b + -> Seq Scan on prt2_p3 t2_2 +(52 rows) + +SELECT t1.a, t1.c, t2.b, t2.c, t3.a + t3.b, t3.c FROM (prt1 t1 LEFT JOIN prt2 t2 ON t1.a = t2.b) RIGHT JOIN prt1_e t3 ON (t1.a = (t3.a + t3.b)/2) WHERE t3.c = 0 ORDER BY t1.a, t2.b, t3.a + t3.b; + a | c | b | c | ?column? 
| c +-----+------+-----+------+----------+--- + 0 | 0000 | 0 | 0000 | 0 | 0 + 50 | 0050 | | | 100 | 0 + 100 | 0100 | | | 200 | 0 + 150 | 0150 | 150 | 0150 | 300 | 0 + 200 | 0200 | | | 400 | 0 + 250 | 0250 | | | 500 | 0 + 300 | 0300 | 300 | 0300 | 600 | 0 + 350 | 0350 | | | 700 | 0 + 400 | 0400 | | | 800 | 0 + 450 | 0450 | 450 | 0450 | 900 | 0 + 500 | 0500 | | | 1000 | 0 + 550 | 0550 | | | 1100 | 0 +(12 rows) + +-- MergeAppend on nullable column +EXPLAIN (COSTS OFF) +SELECT t1.a, t2.b FROM (SELECT * FROM prt1 WHERE a < 450) t1 LEFT JOIN (SELECT * FROM prt2 WHERE b > 250) t2 ON t1.a = t2.b WHERE t1.b = 0 ORDER BY t1.a, t2.b; + QUERY PLAN +----------------------------------------------------------- + Sort + Sort Key: prt1_p1.a, b + -> Append + -> Merge Left Join + Merge Cond: (prt1_p1.a = b) + -> Sort + Sort Key: prt1_p1.a + -> Seq Scan on prt1_p1 + Filter: ((a < 450) AND (b = 0)) + -> Sort + Sort Key: b + -> Result + One-Time Filter: false + -> Merge Left Join + Merge Cond: (prt1_p2.a = prt2_p2.b) + -> Sort + Sort Key: prt1_p2.a + -> Seq Scan on prt1_p2 + Filter: ((a < 450) AND (b = 0)) + -> Sort + Sort Key: prt2_p2.b + -> Seq Scan on prt2_p2 + Filter: (b > 250) +(23 rows) + +SELECT t1.a, t2.b FROM (SELECT * FROM prt1 WHERE a < 450) t1 LEFT JOIN (SELECT * FROM prt2 WHERE b > 250) t2 ON t1.a = t2.b WHERE t1.b = 0 ORDER BY t1.a, t2.b; + a | b +-----+----- + 0 | + 50 | + 100 | + 150 | + 200 | + 250 | + 300 | 300 + 350 | + 400 | +(9 rows) + +RESET enable_hashjoin; +RESET enable_nestloop; +-- +-- partitioned by multiple columns +-- +CREATE TABLE prt1_m (a int, b int, c int) PARTITION BY RANGE(a, ((a + b)/2)); +CREATE TABLE prt1_m_p1 PARTITION OF prt1_m FOR VALUES FROM (0, 0) TO (250, 250); +CREATE TABLE prt1_m_p2 PARTITION OF prt1_m FOR VALUES FROM (250, 250) TO (500, 500); +CREATE TABLE prt1_m_p3 PARTITION OF prt1_m FOR VALUES FROM (500, 500) TO (600, 600); +INSERT INTO prt1_m SELECT i, i, i % 25 FROM generate_series(0, 599, 2) i; +ANALYZE prt1_m; +CREATE TABLE prt2_m (a int, b int, c int) PARTITION BY RANGE(((b + a)/2), b); +CREATE TABLE prt2_m_p1 PARTITION OF prt2_m FOR VALUES FROM (0, 0) TO (250, 250); +CREATE TABLE prt2_m_p2 PARTITION OF prt2_m FOR VALUES FROM (250, 250) TO (500, 500); +CREATE TABLE prt2_m_p3 PARTITION OF prt2_m FOR VALUES FROM (500, 500) TO (600, 600); +INSERT INTO prt2_m SELECT i, i, i % 25 FROM generate_series(0, 599, 3) i; +ANALYZE prt2_m; +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1_m WHERE prt1_m.c = 0) t1 FULL JOIN (SELECT * FROM prt2_m WHERE prt2_m.c = 0) t2 ON (t1.a = (t2.b + t2.a)/2 AND t2.b = (t1.a + t1.b)/2) ORDER BY t1.a, t2.b; + QUERY PLAN +------------------------------------------------------------------------------------------------------------------------------------ + Sort + Sort Key: prt1_m_p1.a, prt2_m_p1.b + -> Append + -> Hash Full Join + Hash Cond: ((prt1_m_p1.a = ((prt2_m_p1.b + prt2_m_p1.a) / 2)) AND (((prt1_m_p1.a + prt1_m_p1.b) / 2) = prt2_m_p1.b)) + -> Seq Scan on prt1_m_p1 + Filter: (c = 0) + -> Hash + -> Seq Scan on prt2_m_p1 + Filter: (c = 0) + -> Hash Full Join + Hash Cond: ((prt1_m_p2.a = ((prt2_m_p2.b + prt2_m_p2.a) / 2)) AND (((prt1_m_p2.a + prt1_m_p2.b) / 2) = prt2_m_p2.b)) + -> Seq Scan on prt1_m_p2 + Filter: (c = 0) + -> Hash + -> Seq Scan on prt2_m_p2 + Filter: (c = 0) + -> Hash Full Join + Hash Cond: ((prt1_m_p3.a = ((prt2_m_p3.b + prt2_m_p3.a) / 2)) AND (((prt1_m_p3.a + prt1_m_p3.b) / 2) = prt2_m_p3.b)) + -> Seq Scan on prt1_m_p3 + Filter: (c = 0) + -> Hash + -> Seq Scan on prt2_m_p3 + Filter: (c = 0) 
+(24 rows) + +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1_m WHERE prt1_m.c = 0) t1 FULL JOIN (SELECT * FROM prt2_m WHERE prt2_m.c = 0) t2 ON (t1.a = (t2.b + t2.a)/2 AND t2.b = (t1.a + t1.b)/2) ORDER BY t1.a, t2.b; + a | c | b | c +-----+---+-----+--- + 0 | 0 | 0 | 0 + 50 | 0 | | + 100 | 0 | | + 150 | 0 | 150 | 0 + 200 | 0 | | + 250 | 0 | | + 300 | 0 | 300 | 0 + 350 | 0 | | + 400 | 0 | | + 450 | 0 | 450 | 0 + 500 | 0 | | + 550 | 0 | | + | | 75 | 0 + | | 225 | 0 + | | 375 | 0 + | | 525 | 0 +(16 rows) + +-- +-- tests for list partitioned tables. +-- +CREATE TABLE plt1 (a int, b int, c text) PARTITION BY LIST(c); +CREATE TABLE plt1_p1 PARTITION OF plt1 FOR VALUES IN ('0000', '0003', '0004', '0010'); +CREATE TABLE plt1_p2 PARTITION OF plt1 FOR VALUES IN ('0001', '0005', '0002', '0009'); +CREATE TABLE plt1_p3 PARTITION OF plt1 FOR VALUES IN ('0006', '0007', '0008', '0011'); +INSERT INTO plt1 SELECT i, i, to_char(i/50, 'FM0000') FROM generate_series(0, 599, 2) i; +ANALYZE plt1; +CREATE TABLE plt2 (a int, b int, c text) PARTITION BY LIST(c); +CREATE TABLE plt2_p1 PARTITION OF plt2 FOR VALUES IN ('0000', '0003', '0004', '0010'); +CREATE TABLE plt2_p2 PARTITION OF plt2 FOR VALUES IN ('0001', '0005', '0002', '0009'); +CREATE TABLE plt2_p3 PARTITION OF plt2 FOR VALUES IN ('0006', '0007', '0008', '0011'); +INSERT INTO plt2 SELECT i, i, to_char(i/50, 'FM0000') FROM generate_series(0, 599, 3) i; +ANALYZE plt2; +-- +-- list partitioned by expression +-- +CREATE TABLE plt1_e (a int, b int, c text) PARTITION BY LIST(ltrim(c, 'A')); +CREATE TABLE plt1_e_p1 PARTITION OF plt1_e FOR VALUES IN ('0000', '0003', '0004', '0010'); +CREATE TABLE plt1_e_p2 PARTITION OF plt1_e FOR VALUES IN ('0001', '0005', '0002', '0009'); +CREATE TABLE plt1_e_p3 PARTITION OF plt1_e FOR VALUES IN ('0006', '0007', '0008', '0011'); +INSERT INTO plt1_e SELECT i, i, 'A' || to_char(i/50, 'FM0000') FROM generate_series(0, 599, 2) i; +ANALYZE plt1_e; +-- test partition matching with N-way join +EXPLAIN (COSTS OFF) +SELECT avg(t1.a), avg(t2.b), avg(t3.a + t3.b), t1.c, t2.c, t3.c FROM plt1 t1, plt2 t2, plt1_e t3 WHERE t1.c = t2.c AND ltrim(t3.c, 'A') = t1.c GROUP BY t1.c, t2.c, t3.c ORDER BY t1.c, t2.c, t3.c; + QUERY PLAN +-------------------------------------------------------------------------------------- + Sort + Sort Key: t1.c, t3.c + -> HashAggregate + Group Key: t1.c, t2.c, t3.c + -> Result + -> Append + -> Hash Join + Hash Cond: (t1.c = t2.c) + -> Seq Scan on plt1_p1 t1 + -> Hash + -> Hash Join + Hash Cond: (t2.c = ltrim(t3.c, 'A'::text)) + -> Seq Scan on plt2_p1 t2 + -> Hash + -> Seq Scan on plt1_e_p1 t3 + -> Hash Join + Hash Cond: (t1_1.c = t2_1.c) + -> Seq Scan on plt1_p2 t1_1 + -> Hash + -> Hash Join + Hash Cond: (t2_1.c = ltrim(t3_1.c, 'A'::text)) + -> Seq Scan on plt2_p2 t2_1 + -> Hash + -> Seq Scan on plt1_e_p2 t3_1 + -> Hash Join + Hash Cond: (t1_2.c = t2_2.c) + -> Seq Scan on plt1_p3 t1_2 + -> Hash + -> Hash Join + Hash Cond: (t2_2.c = ltrim(t3_2.c, 'A'::text)) + -> Seq Scan on plt2_p3 t2_2 + -> Hash + -> Seq Scan on plt1_e_p3 t3_2 +(33 rows) + +SELECT avg(t1.a), avg(t2.b), avg(t3.a + t3.b), t1.c, t2.c, t3.c FROM plt1 t1, plt2 t2, plt1_e t3 WHERE t1.c = t2.c AND ltrim(t3.c, 'A') = t1.c GROUP BY t1.c, t2.c, t3.c ORDER BY t1.c, t2.c, t3.c; + avg | avg | avg | c | c | c +----------------------+----------------------+-----------------------+------+------+------- + 24.0000000000000000 | 24.0000000000000000 | 48.0000000000000000 | 0000 | 0000 | A0000 + 74.0000000000000000 | 75.0000000000000000 | 148.0000000000000000 | 
0001 | 0001 | A0001 + 124.0000000000000000 | 124.5000000000000000 | 248.0000000000000000 | 0002 | 0002 | A0002 + 174.0000000000000000 | 174.0000000000000000 | 348.0000000000000000 | 0003 | 0003 | A0003 + 224.0000000000000000 | 225.0000000000000000 | 448.0000000000000000 | 0004 | 0004 | A0004 + 274.0000000000000000 | 274.5000000000000000 | 548.0000000000000000 | 0005 | 0005 | A0005 + 324.0000000000000000 | 324.0000000000000000 | 648.0000000000000000 | 0006 | 0006 | A0006 + 374.0000000000000000 | 375.0000000000000000 | 748.0000000000000000 | 0007 | 0007 | A0007 + 424.0000000000000000 | 424.5000000000000000 | 848.0000000000000000 | 0008 | 0008 | A0008 + 474.0000000000000000 | 474.0000000000000000 | 948.0000000000000000 | 0009 | 0009 | A0009 + 524.0000000000000000 | 525.0000000000000000 | 1048.0000000000000000 | 0010 | 0010 | A0010 + 574.0000000000000000 | 574.5000000000000000 | 1148.0000000000000000 | 0011 | 0011 | A0011 +(12 rows) + +-- joins where one of the relations is proven empty +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1, prt2 t2 WHERE t1.a = t2.b AND t1.a = 1 AND t1.a = 2; + QUERY PLAN +-------------------------- + Result + One-Time Filter: false +(2 rows) + +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a = 1 AND a = 2) t1 LEFT JOIN prt2 t2 ON t1.a = t2.b; + QUERY PLAN +-------------------------- + Result + One-Time Filter: false +(2 rows) + +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a = 1 AND a = 2) t1 RIGHT JOIN prt2 t2 ON t1.a = t2.b WHERE t2.a = 0 ORDER BY t1.a, t2.b; + QUERY PLAN +-------------------------------------------- + Sort + Sort Key: a, t2.b + -> Hash Left Join + Hash Cond: (t2.b = a) + -> Append + -> Seq Scan on prt2_p1 t2 + Filter: (a = 0) + -> Seq Scan on prt2_p2 t2_1 + Filter: (a = 0) + -> Seq Scan on prt2_p3 t2_2 + Filter: (a = 0) + -> Hash + -> Result + One-Time Filter: false +(14 rows) + +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a = 1 AND a = 2) t1 FULL JOIN prt2 t2 ON t1.a = t2.b WHERE t2.a = 0 ORDER BY t1.a, t2.b; + QUERY PLAN +-------------------------------------------- + Sort + Sort Key: a, t2.b + -> Hash Left Join + Hash Cond: (t2.b = a) + -> Append + -> Seq Scan on prt2_p1 t2 + Filter: (a = 0) + -> Seq Scan on prt2_p2 t2_1 + Filter: (a = 0) + -> Seq Scan on prt2_p3 t2_2 + Filter: (a = 0) + -> Hash + -> Result + One-Time Filter: false +(14 rows) + +-- +-- multiple levels of partitioning +-- +CREATE TABLE prt1_l (a int, b int, c varchar) PARTITION BY RANGE(a); +CREATE TABLE prt1_l_p1 PARTITION OF prt1_l FOR VALUES FROM (0) TO (250); +CREATE TABLE prt1_l_p2 PARTITION OF prt1_l FOR VALUES FROM (250) TO (500) PARTITION BY LIST (c); +CREATE TABLE prt1_l_p2_p1 PARTITION OF prt1_l_p2 FOR VALUES IN ('0000', '0001'); +CREATE TABLE prt1_l_p2_p2 PARTITION OF prt1_l_p2 FOR VALUES IN ('0002', '0003'); +CREATE TABLE prt1_l_p3 PARTITION OF prt1_l FOR VALUES FROM (500) TO (600) PARTITION BY RANGE (b); +CREATE TABLE prt1_l_p3_p1 PARTITION OF prt1_l_p3 FOR VALUES FROM (0) TO (13); +CREATE TABLE prt1_l_p3_p2 PARTITION OF prt1_l_p3 FOR VALUES FROM (13) TO (25); +INSERT INTO prt1_l SELECT i, i % 25, to_char(i % 4, 'FM0000') FROM generate_series(0, 599, 2) i; +ANALYZE prt1_l; +CREATE TABLE prt2_l (a int, b int, c varchar) PARTITION BY RANGE(b); +CREATE TABLE prt2_l_p1 PARTITION OF prt2_l FOR VALUES FROM (0) TO (250); +CREATE TABLE prt2_l_p2 PARTITION OF prt2_l FOR VALUES FROM (250) TO (500) PARTITION BY LIST (c); +CREATE TABLE prt2_l_p2_p1 
PARTITION OF prt2_l_p2 FOR VALUES IN ('0000', '0001'); +CREATE TABLE prt2_l_p2_p2 PARTITION OF prt2_l_p2 FOR VALUES IN ('0002', '0003'); +CREATE TABLE prt2_l_p3 PARTITION OF prt2_l FOR VALUES FROM (500) TO (600) PARTITION BY RANGE (a); +CREATE TABLE prt2_l_p3_p1 PARTITION OF prt2_l_p3 FOR VALUES FROM (0) TO (13); +CREATE TABLE prt2_l_p3_p2 PARTITION OF prt2_l_p3 FOR VALUES FROM (13) TO (25); +INSERT INTO prt2_l SELECT i % 25, i, to_char(i % 4, 'FM0000') FROM generate_series(0, 599, 3) i; +ANALYZE prt2_l; +-- inner join, qual covering only top-level partitions +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_l t1, prt2_l t2 WHERE t1.a = t2.b AND t1.b = 0 ORDER BY t1.a, t2.b; + QUERY PLAN +------------------------------------------------------------- + Sort + Sort Key: t1.a + -> Append + -> Hash Join + Hash Cond: (t2.b = t1.a) + -> Seq Scan on prt2_l_p1 t2 + -> Hash + -> Seq Scan on prt1_l_p1 t1 + Filter: (b = 0) + -> Hash Join + Hash Cond: (t2_1.b = t1_1.a) + -> Append + -> Seq Scan on prt2_l_p2_p1 t2_1 + -> Seq Scan on prt2_l_p2_p2 t2_2 + -> Hash + -> Append + -> Seq Scan on prt1_l_p2_p1 t1_1 + Filter: (b = 0) + -> Seq Scan on prt1_l_p2_p2 t1_2 + Filter: (b = 0) + -> Hash Join + Hash Cond: (t2_3.b = t1_3.a) + -> Append + -> Seq Scan on prt2_l_p3_p1 t2_3 + -> Seq Scan on prt2_l_p3_p2 t2_4 + -> Hash + -> Append + -> Seq Scan on prt1_l_p3_p1 t1_3 + Filter: (b = 0) +(29 rows) + +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_l t1, prt2_l t2 WHERE t1.a = t2.b AND t1.b = 0 ORDER BY t1.a, t2.b; + a | c | b | c +-----+------+-----+------ + 0 | 0000 | 0 | 0000 + 150 | 0002 | 150 | 0002 + 300 | 0000 | 300 | 0000 + 450 | 0002 | 450 | 0002 +(4 rows) + +-- left join +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_l t1 LEFT JOIN prt2_l t2 ON t1.a = t2.b AND t1.c = t2.c WHERE t1.b = 0 ORDER BY t1.a, t2.b; + QUERY PLAN +------------------------------------------------------------------------------------ + Sort + Sort Key: t1.a, t2.b + -> Append + -> Hash Right Join + Hash Cond: ((t2.b = t1.a) AND ((t2.c)::text = (t1.c)::text)) + -> Seq Scan on prt2_l_p1 t2 + -> Hash + -> Seq Scan on prt1_l_p1 t1 + Filter: (b = 0) + -> Hash Right Join + Hash Cond: ((t2_1.b = t1_1.a) AND ((t2_1.c)::text = (t1_1.c)::text)) + -> Seq Scan on prt2_l_p2_p1 t2_1 + -> Hash + -> Seq Scan on prt1_l_p2_p1 t1_1 + Filter: (b = 0) + -> Hash Right Join + Hash Cond: ((t2_2.b = t1_2.a) AND ((t2_2.c)::text = (t1_2.c)::text)) + -> Seq Scan on prt2_l_p2_p2 t2_2 + -> Hash + -> Seq Scan on prt1_l_p2_p2 t1_2 + Filter: (b = 0) + -> Hash Right Join + Hash Cond: ((t2_3.b = t1_3.a) AND ((t2_3.c)::text = (t1_3.c)::text)) + -> Append + -> Seq Scan on prt2_l_p3_p1 t2_3 + -> Seq Scan on prt2_l_p3_p2 t2_4 + -> Hash + -> Append + -> Seq Scan on prt1_l_p3_p1 t1_3 + Filter: (b = 0) +(30 rows) + +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_l t1 LEFT JOIN prt2_l t2 ON t1.a = t2.b AND t1.c = t2.c WHERE t1.b = 0 ORDER BY t1.a, t2.b; + a | c | b | c +-----+------+-----+------ + 0 | 0000 | 0 | 0000 + 50 | 0002 | | + 100 | 0000 | | + 150 | 0002 | 150 | 0002 + 200 | 0000 | | + 250 | 0002 | | + 300 | 0000 | 300 | 0000 + 350 | 0002 | | + 400 | 0000 | | + 450 | 0002 | 450 | 0002 + 500 | 0000 | | + 550 | 0002 | | +(12 rows) + +-- right join +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_l t1 RIGHT JOIN prt2_l t2 ON t1.a = t2.b AND t1.c = t2.c WHERE t2.a = 0 ORDER BY t1.a, t2.b; + QUERY PLAN +------------------------------------------------------------------------------------------ + Sort + Sort Key: t1.a, t2.b + -> Result + -> Append + -> 
Hash Right Join + Hash Cond: ((t1.a = t2.b) AND ((t1.c)::text = (t2.c)::text)) + -> Seq Scan on prt1_l_p1 t1 + -> Hash + -> Seq Scan on prt2_l_p1 t2 + Filter: (a = 0) + -> Hash Right Join + Hash Cond: ((t1_1.a = t2_1.b) AND ((t1_1.c)::text = (t2_1.c)::text)) + -> Seq Scan on prt1_l_p2_p1 t1_1 + -> Hash + -> Seq Scan on prt2_l_p2_p1 t2_1 + Filter: (a = 0) + -> Hash Right Join + Hash Cond: ((t1_2.a = t2_2.b) AND ((t1_2.c)::text = (t2_2.c)::text)) + -> Seq Scan on prt1_l_p2_p2 t1_2 + -> Hash + -> Seq Scan on prt2_l_p2_p2 t2_2 + Filter: (a = 0) + -> Hash Right Join + Hash Cond: ((t1_3.a = t2_3.b) AND ((t1_3.c)::text = (t2_3.c)::text)) + -> Append + -> Seq Scan on prt1_l_p3_p1 t1_3 + -> Seq Scan on prt1_l_p3_p2 t1_4 + -> Hash + -> Append + -> Seq Scan on prt2_l_p3_p1 t2_3 + Filter: (a = 0) +(31 rows) + +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_l t1 RIGHT JOIN prt2_l t2 ON t1.a = t2.b AND t1.c = t2.c WHERE t2.a = 0 ORDER BY t1.a, t2.b; + a | c | b | c +-----+------+-----+------ + 0 | 0000 | 0 | 0000 + 150 | 0002 | 150 | 0002 + 300 | 0000 | 300 | 0000 + 450 | 0002 | 450 | 0002 + | | 75 | 0003 + | | 225 | 0001 + | | 375 | 0003 + | | 525 | 0001 +(8 rows) + +-- full join +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1_l WHERE prt1_l.b = 0) t1 FULL JOIN (SELECT * FROM prt2_l WHERE prt2_l.a = 0) t2 ON (t1.a = t2.b AND t1.c = t2.c) ORDER BY t1.a, t2.b; + QUERY PLAN +-------------------------------------------------------------------------------------------------------------------- + Sort + Sort Key: prt1_l_p1.a, prt2_l_p1.b + -> Append + -> Hash Full Join + Hash Cond: ((prt1_l_p1.a = prt2_l_p1.b) AND ((prt1_l_p1.c)::text = (prt2_l_p1.c)::text)) + -> Seq Scan on prt1_l_p1 + Filter: (b = 0) + -> Hash + -> Seq Scan on prt2_l_p1 + Filter: (a = 0) + -> Hash Full Join + Hash Cond: ((prt1_l_p2_p1.a = prt2_l_p2_p1.b) AND ((prt1_l_p2_p1.c)::text = (prt2_l_p2_p1.c)::text)) + -> Seq Scan on prt1_l_p2_p1 + Filter: (b = 0) + -> Hash + -> Seq Scan on prt2_l_p2_p1 + Filter: (a = 0) + -> Hash Full Join + Hash Cond: ((prt1_l_p2_p2.a = prt2_l_p2_p2.b) AND ((prt1_l_p2_p2.c)::text = (prt2_l_p2_p2.c)::text)) + -> Seq Scan on prt1_l_p2_p2 + Filter: (b = 0) + -> Hash + -> Seq Scan on prt2_l_p2_p2 + Filter: (a = 0) + -> Hash Full Join + Hash Cond: ((prt1_l_p3_p1.a = prt2_l_p3_p1.b) AND ((prt1_l_p3_p1.c)::text = (prt2_l_p3_p1.c)::text)) + -> Append + -> Seq Scan on prt1_l_p3_p1 + Filter: (b = 0) + -> Hash + -> Append + -> Seq Scan on prt2_l_p3_p1 + Filter: (a = 0) +(33 rows) + +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1_l WHERE prt1_l.b = 0) t1 FULL JOIN (SELECT * FROM prt2_l WHERE prt2_l.a = 0) t2 ON (t1.a = t2.b AND t1.c = t2.c) ORDER BY t1.a, t2.b; + a | c | b | c +-----+------+-----+------ + 0 | 0000 | 0 | 0000 + 50 | 0002 | | + 100 | 0000 | | + 150 | 0002 | 150 | 0002 + 200 | 0000 | | + 250 | 0002 | | + 300 | 0000 | 300 | 0000 + 350 | 0002 | | + 400 | 0000 | | + 450 | 0002 | 450 | 0002 + 500 | 0000 | | + 550 | 0002 | | + | | 75 | 0003 + | | 225 | 0001 + | | 375 | 0003 + | | 525 | 0001 +(16 rows) + +-- lateral partition-wise join +EXPLAIN (COSTS OFF) +SELECT * FROM prt1_l t1 LEFT JOIN LATERAL + (SELECT t2.a AS t2a, t2.c AS t2c, t2.b AS t2b, t3.b AS t3b, least(t1.a,t2.a,t3.b) FROM prt1_l t2 JOIN prt2_l t3 ON (t2.a = t3.b AND t2.c = t3.c)) ss + ON t1.a = ss.t2a AND t1.c = ss.t2c WHERE t1.b = 0 ORDER BY t1.a; + QUERY PLAN +----------------------------------------------------------------------------------------------------- + Sort + Sort Key: t1.a + -> Result + -> Append + -> Nested Loop 
Left Join + -> Seq Scan on prt1_l_p1 t1 + Filter: (b = 0) + -> Hash Join + Hash Cond: ((t3.b = t2.a) AND ((t3.c)::text = (t2.c)::text)) + -> Seq Scan on prt2_l_p1 t3 + -> Hash + -> Seq Scan on prt1_l_p1 t2 + Filter: ((t1.a = a) AND ((t1.c)::text = (c)::text)) + -> Nested Loop Left Join + -> Seq Scan on prt1_l_p2_p1 t1_1 + Filter: (b = 0) + -> Hash Join + Hash Cond: ((t3_1.b = t2_1.a) AND ((t3_1.c)::text = (t2_1.c)::text)) + -> Seq Scan on prt2_l_p2_p1 t3_1 + -> Hash + -> Seq Scan on prt1_l_p2_p1 t2_1 + Filter: ((t1_1.a = a) AND ((t1_1.c)::text = (c)::text)) + -> Nested Loop Left Join + -> Seq Scan on prt1_l_p2_p2 t1_2 + Filter: (b = 0) + -> Hash Join + Hash Cond: ((t3_2.b = t2_2.a) AND ((t3_2.c)::text = (t2_2.c)::text)) + -> Seq Scan on prt2_l_p2_p2 t3_2 + -> Hash + -> Seq Scan on prt1_l_p2_p2 t2_2 + Filter: ((t1_2.a = a) AND ((t1_2.c)::text = (c)::text)) + -> Nested Loop Left Join + -> Append + -> Seq Scan on prt1_l_p3_p1 t1_3 + Filter: (b = 0) + -> Hash Join + Hash Cond: ((t3_3.b = t2_3.a) AND ((t3_3.c)::text = (t2_3.c)::text)) + -> Append + -> Seq Scan on prt2_l_p3_p1 t3_3 + -> Seq Scan on prt2_l_p3_p2 t3_4 + -> Hash + -> Append + -> Seq Scan on prt1_l_p3_p1 t2_3 + Filter: ((t1_3.a = a) AND ((t1_3.c)::text = (c)::text)) + -> Seq Scan on prt1_l_p3_p2 t2_4 + Filter: ((t1_3.a = a) AND ((t1_3.c)::text = (c)::text)) +(46 rows) + +SELECT * FROM prt1_l t1 LEFT JOIN LATERAL + (SELECT t2.a AS t2a, t2.c AS t2c, t2.b AS t2b, t3.b AS t3b, least(t1.a,t2.a,t3.b) FROM prt1_l t2 JOIN prt2_l t3 ON (t2.a = t3.b AND t2.c = t3.c)) ss + ON t1.a = ss.t2a AND t1.c = ss.t2c WHERE t1.b = 0 ORDER BY t1.a; + a | b | c | t2a | t2c | t2b | t3b | least +-----+---+------+-----+------+-----+-----+------- + 0 | 0 | 0000 | 0 | 0000 | 0 | 0 | 0 + 50 | 0 | 0002 | | | | | + 100 | 0 | 0000 | | | | | + 150 | 0 | 0002 | 150 | 0002 | 0 | 150 | 150 + 200 | 0 | 0000 | | | | | + 250 | 0 | 0002 | | | | | + 300 | 0 | 0000 | 300 | 0000 | 0 | 300 | 300 + 350 | 0 | 0002 | | | | | + 400 | 0 | 0000 | | | | | + 450 | 0 | 0002 | 450 | 0002 | 0 | 450 | 450 + 500 | 0 | 0000 | | | | | + 550 | 0 | 0002 | | | | | +(12 rows) + +-- join with one side empty +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1_l WHERE a = 1 AND a = 2) t1 RIGHT JOIN prt2_l t2 ON t1.a = t2.b AND t1.b = t2.a AND t1.c = t2.c; + QUERY PLAN +------------------------------------------------------------------------- + Hash Left Join + Hash Cond: ((t2.b = a) AND (t2.a = b) AND ((t2.c)::text = (c)::text)) + -> Append + -> Seq Scan on prt2_l_p1 t2 + -> Seq Scan on prt2_l_p2_p1 t2_1 + -> Seq Scan on prt2_l_p2_p2 t2_2 + -> Seq Scan on prt2_l_p3_p1 t2_3 + -> Seq Scan on prt2_l_p3_p2 t2_4 + -> Hash + -> Result + One-Time Filter: false +(11 rows) + +-- +-- negative testcases +-- +CREATE TABLE prt1_n (a int, b int, c varchar) PARTITION BY RANGE(c); +CREATE TABLE prt1_n_p1 PARTITION OF prt1_n FOR VALUES FROM ('0000') TO ('0250'); +CREATE TABLE prt1_n_p2 PARTITION OF prt1_n FOR VALUES FROM ('0250') TO ('0500'); +INSERT INTO prt1_n SELECT i, i, to_char(i, 'FM0000') FROM generate_series(0, 499, 2) i; +ANALYZE prt1_n; +CREATE TABLE prt2_n (a int, b int, c text) PARTITION BY LIST(c); +CREATE TABLE prt2_n_p1 PARTITION OF prt2_n FOR VALUES IN ('0000', '0003', '0004', '0010', '0006', '0007'); +CREATE TABLE prt2_n_p2 PARTITION OF prt2_n FOR VALUES IN ('0001', '0005', '0002', '0009', '0008', '0011'); +INSERT INTO prt2_n SELECT i, i, to_char(i/50, 'FM0000') FROM generate_series(0, 599, 2) i; +ANALYZE prt2_n; +CREATE TABLE prt3_n (a int, b int, c text) PARTITION BY 
LIST(c); +CREATE TABLE prt3_n_p1 PARTITION OF prt3_n FOR VALUES IN ('0000', '0004', '0006', '0007'); +CREATE TABLE prt3_n_p2 PARTITION OF prt3_n FOR VALUES IN ('0001', '0002', '0008', '0010'); +CREATE TABLE prt3_n_p3 PARTITION OF prt3_n FOR VALUES IN ('0003', '0005', '0009', '0011'); +INSERT INTO prt2_n SELECT i, i, to_char(i/50, 'FM0000') FROM generate_series(0, 599, 2) i; +ANALYZE prt3_n; +CREATE TABLE prt4_n (a int, b int, c text) PARTITION BY RANGE(a); +CREATE TABLE prt4_n_p1 PARTITION OF prt4_n FOR VALUES FROM (0) TO (300); +CREATE TABLE prt4_n_p2 PARTITION OF prt4_n FOR VALUES FROM (300) TO (500); +CREATE TABLE prt4_n_p3 PARTITION OF prt4_n FOR VALUES FROM (500) TO (600); +INSERT INTO prt4_n SELECT i, i, to_char(i, 'FM0000') FROM generate_series(0, 599, 2) i; +ANALYZE prt4_n; +-- partition-wise join can not be applied if the partition ranges differ +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1, prt4_n t2 WHERE t1.a = t2.a; + QUERY PLAN +---------------------------------------------- + Hash Join + Hash Cond: (t1.a = t2.a) + -> Append + -> Seq Scan on prt1_p1 t1 + -> Seq Scan on prt1_p2 t1_1 + -> Seq Scan on prt1_p3 t1_2 + -> Hash + -> Append + -> Seq Scan on prt4_n_p1 t2 + -> Seq Scan on prt4_n_p2 t2_1 + -> Seq Scan on prt4_n_p3 t2_2 +(11 rows) + +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1, prt4_n t2, prt2 t3 WHERE t1.a = t2.a and t1.a = t3.b; + QUERY PLAN +-------------------------------------------------------- + Hash Join + Hash Cond: (t2.a = t1.a) + -> Append + -> Seq Scan on prt4_n_p1 t2 + -> Seq Scan on prt4_n_p2 t2_1 + -> Seq Scan on prt4_n_p3 t2_2 + -> Hash + -> Append + -> Hash Join + Hash Cond: (t1.a = t3.b) + -> Seq Scan on prt1_p1 t1 + -> Hash + -> Seq Scan on prt2_p1 t3 + -> Hash Join + Hash Cond: (t1_1.a = t3_1.b) + -> Seq Scan on prt1_p2 t1_1 + -> Hash + -> Seq Scan on prt2_p2 t3_1 + -> Hash Join + Hash Cond: (t1_2.a = t3_2.b) + -> Seq Scan on prt1_p3 t1_2 + -> Hash + -> Seq Scan on prt2_p3 t3_2 +(23 rows) + +-- partition-wise join can not be applied if there are no equi-join conditions +-- between partition keys +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1 LEFT JOIN prt2 t2 ON (t1.a < t2.b); + QUERY PLAN +--------------------------------------------------------- + Nested Loop Left Join + -> Append + -> Seq Scan on prt1_p1 t1 + -> Seq Scan on prt1_p2 t1_1 + -> Seq Scan on prt1_p3 t1_2 + -> Append + -> Index Scan using iprt2_p1_b on prt2_p1 t2 + Index Cond: (t1.a < b) + -> Index Scan using iprt2_p2_b on prt2_p2 t2_1 + Index Cond: (t1.a < b) + -> Index Scan using iprt2_p3_b on prt2_p3 t2_2 + Index Cond: (t1.a < b) +(12 rows) + +-- equi-join with join condition on partial keys does not qualify for +-- partition-wise join +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_m t1, prt2_m t2 WHERE t1.a = (t2.b + t2.a)/2; + QUERY PLAN +---------------------------------------------- + Hash Join + Hash Cond: (((t2.b + t2.a) / 2) = t1.a) + -> Append + -> Seq Scan on prt2_m_p1 t2 + -> Seq Scan on prt2_m_p2 t2_1 + -> Seq Scan on prt2_m_p3 t2_2 + -> Hash + -> Append + -> Seq Scan on prt1_m_p1 t1 + -> Seq Scan on prt1_m_p2 t1_1 + -> Seq Scan on prt1_m_p3 t1_2 +(11 rows) + +-- equi-join between out-of-order partition key columns does not qualify for +-- partition-wise join +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_m t1 LEFT JOIN prt2_m t2 ON t1.a = t2.b; + QUERY PLAN +---------------------------------------------- + Hash Left Join + Hash Cond: (t1.a = t2.b) + -> Append + -> Seq Scan on prt1_m_p1 
t1 + -> Seq Scan on prt1_m_p2 t1_1 + -> Seq Scan on prt1_m_p3 t1_2 + -> Hash + -> Append + -> Seq Scan on prt2_m_p1 t2 + -> Seq Scan on prt2_m_p2 t2_1 + -> Seq Scan on prt2_m_p3 t2_2 +(11 rows) + +-- equi-join between non-key columns does not qualify for partition-wise join +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_m t1 LEFT JOIN prt2_m t2 ON t1.c = t2.c; + QUERY PLAN +---------------------------------------------- + Hash Left Join + Hash Cond: (t1.c = t2.c) + -> Append + -> Seq Scan on prt1_m_p1 t1 + -> Seq Scan on prt1_m_p2 t1_1 + -> Seq Scan on prt1_m_p3 t1_2 + -> Hash + -> Append + -> Seq Scan on prt2_m_p1 t2 + -> Seq Scan on prt2_m_p2 t2_1 + -> Seq Scan on prt2_m_p3 t2_2 +(11 rows) + +-- partition-wise join can not be applied between tables with different +-- partition lists +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_n t1 LEFT JOIN prt2_n t2 ON (t1.c = t2.c); + QUERY PLAN +---------------------------------------------- + Hash Right Join + Hash Cond: (t2.c = (t1.c)::text) + -> Append + -> Seq Scan on prt2_n_p1 t2 + -> Seq Scan on prt2_n_p2 t2_1 + -> Hash + -> Append + -> Seq Scan on prt1_n_p1 t1 + -> Seq Scan on prt1_n_p2 t1_1 +(9 rows) + +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_n t1 JOIN prt2_n t2 ON (t1.c = t2.c) JOIN plt1 t3 ON (t1.c = t3.c); + QUERY PLAN +---------------------------------------------------------- + Hash Join + Hash Cond: (t2.c = (t1.c)::text) + -> Append + -> Seq Scan on prt2_n_p1 t2 + -> Seq Scan on prt2_n_p2 t2_1 + -> Hash + -> Hash Join + Hash Cond: (t3.c = (t1.c)::text) + -> Append + -> Seq Scan on plt1_p1 t3 + -> Seq Scan on plt1_p2 t3_1 + -> Seq Scan on plt1_p3 t3_2 + -> Hash + -> Append + -> Seq Scan on prt1_n_p1 t1 + -> Seq Scan on prt1_n_p2 t1_1 +(16 rows) + +-- partition-wise join can not be applied for a join between list and range +-- partitioned table +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_n t1 FULL JOIN prt1 t2 ON (t1.c = t2.c); + QUERY PLAN +---------------------------------------------- + Hash Full Join + Hash Cond: ((t2.c)::text = (t1.c)::text) + -> Append + -> Seq Scan on prt1_p1 t2 + -> Seq Scan on prt1_p2 t2_1 + -> Seq Scan on prt1_p3 t2_2 + -> Hash + -> Append + -> Seq Scan on prt1_n_p1 t1 + -> Seq Scan on prt1_n_p2 t1_1 +(10 rows) + diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out index b624ad7c..d098ccb4 100644 --- a/src/test/regress/expected/sysviews.out +++ b/src/test/regress/expected/sysviews.out @@ -113,6 +113,7 @@ select name, setting from pg_settings where name like 'enable%'; enable_multi_cluster_print | off enable_nestloop | on enable_nestloop_suppression | off + enable_partition_wise_join | off enable_null_string | off enable_oracle_compatible | off enable_parallel_ddl | off diff --git a/src/test/regress/expected/sysviews_1.out b/src/test/regress/expected/sysviews_1.out index 708e4676..76c8fa59 100644 --- a/src/test/regress/expected/sysviews_1.out +++ b/src/test/regress/expected/sysviews_1.out @@ -95,6 +95,7 @@ select name, setting from pg_settings where name like 'enable%'; enable_multi_cluster | on enable_multi_cluster_print | off enable_nestloop | on + enable_partition_wise_join | off enable_oracle_compatible | off enable_pgbouncer | off enable_plpgsql_debug_print | off diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule index f9eabd53..289f1ed4 100644 --- a/src/test/regress/parallel_schedule +++ b/src/test/regress/parallel_schedule @@ -116,7 +116,7 @@ test: publication 
subscription # ---------- # Another group of parallel tests # ---------- -test: select_views portals_p2 foreign_key cluster dependency guc bitmapops combocid tsearch tsdicts foreign_data window xmlmap functional_deps json jsonb json_encoding indirect_toast equivclass +test: select_views portals_p2 foreign_key cluster dependency guc bitmapops combocid tsearch tsdicts foreign_data window xmlmap functional_deps json jsonb json_encoding indirect_toast equivclass partition_join # ---------- # As XL uses advisory locks internally running this test separately. diff --git a/src/test/regress/serial_schedule b/src/test/regress/serial_schedule index 04781232..ad4f5d4f 100644 --- a/src/test/regress/serial_schedule +++ b/src/test/regress/serial_schedule @@ -193,6 +193,7 @@ test: xml test: event_trigger test: fast_default test: stats +test: partition_join test: xc_create_function test: xc_groupby test: xc_distkey diff --git a/src/test/regress/sql/partition_join.sql b/src/test/regress/sql/partition_join.sql new file mode 100644 index 00000000..ca525d99 --- /dev/null +++ b/src/test/regress/sql/partition_join.sql @@ -0,0 +1,354 @@ +-- +-- PARTITION_JOIN +-- Test partition-wise join between partitioned tables +-- + +-- Enable partition-wise join, which by default is disabled. +SET enable_partition_wise_join to true; + +-- +-- partitioned by a single column +-- +CREATE TABLE prt1 (a int, b int, c varchar) PARTITION BY RANGE(a); +CREATE TABLE prt1_p1 PARTITION OF prt1 FOR VALUES FROM (0) TO (250); +CREATE TABLE prt1_p3 PARTITION OF prt1 FOR VALUES FROM (500) TO (600); +CREATE TABLE prt1_p2 PARTITION OF prt1 FOR VALUES FROM (250) TO (500); +INSERT INTO prt1 SELECT i, i % 25, to_char(i, 'FM0000') FROM generate_series(0, 599) i WHERE i % 2 = 0; +CREATE INDEX iprt1_p1_a on prt1_p1(a); +CREATE INDEX iprt1_p2_a on prt1_p2(a); +CREATE INDEX iprt1_p3_a on prt1_p3(a); +ANALYZE prt1; + +CREATE TABLE prt2 (a int, b int, c varchar) PARTITION BY RANGE(b); +CREATE TABLE prt2_p1 PARTITION OF prt2 FOR VALUES FROM (0) TO (250); +CREATE TABLE prt2_p2 PARTITION OF prt2 FOR VALUES FROM (250) TO (500); +CREATE TABLE prt2_p3 PARTITION OF prt2 FOR VALUES FROM (500) TO (600); +INSERT INTO prt2 SELECT i % 25, i, to_char(i, 'FM0000') FROM generate_series(0, 599) i WHERE i % 3 = 0; +CREATE INDEX iprt2_p1_b on prt2_p1(b); +CREATE INDEX iprt2_p2_b on prt2_p2(b); +CREATE INDEX iprt2_p3_b on prt2_p3(b); +ANALYZE prt2; + +-- inner join +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1, prt2 t2 WHERE t1.a = t2.b AND t1.b = 0 ORDER BY t1.a, t2.b; +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1, prt2 t2 WHERE t1.a = t2.b AND t1.b = 0 ORDER BY t1.a, t2.b; + +-- left outer join, with whole-row reference +EXPLAIN (COSTS OFF) +SELECT t1, t2 FROM prt1 t1 LEFT JOIN prt2 t2 ON t1.a = t2.b WHERE t1.b = 0 ORDER BY t1.a, t2.b; +SELECT t1, t2 FROM prt1 t1 LEFT JOIN prt2 t2 ON t1.a = t2.b WHERE t1.b = 0 ORDER BY t1.a, t2.b; + +-- right outer join +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1 RIGHT JOIN prt2 t2 ON t1.a = t2.b WHERE t2.a = 0 ORDER BY t1.a, t2.b; +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1 RIGHT JOIN prt2 t2 ON t1.a = t2.b WHERE t2.a = 0 ORDER BY t1.a, t2.b; + +-- full outer join, with placeholder vars +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT 50 phv, * FROM prt1 WHERE prt1.b = 0) t1 FULL JOIN (SELECT 75 phv, * FROM prt2 WHERE prt2.a = 0) t2 ON (t1.a = t2.b) WHERE t1.phv = t1.a OR t2.phv = t2.b ORDER BY t1.a, t2.b; +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT 50 phv, * FROM prt1 WHERE 
prt1.b = 0) t1 FULL JOIN (SELECT 75 phv, * FROM prt2 WHERE prt2.a = 0) t2 ON (t1.a = t2.b) WHERE t1.phv = t1.a OR t2.phv = t2.b ORDER BY t1.a, t2.b; + +-- Join with pruned partitions from joining relations +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1, prt2 t2 WHERE t1.a = t2.b AND t1.a < 450 AND t2.b > 250 AND t1.b = 0 ORDER BY t1.a, t2.b; +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1, prt2 t2 WHERE t1.a = t2.b AND t1.a < 450 AND t2.b > 250 AND t1.b = 0 ORDER BY t1.a, t2.b; + +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a < 450) t1 LEFT JOIN (SELECT * FROM prt2 WHERE b > 250) t2 ON t1.a = t2.b WHERE t1.b = 0 ORDER BY t1.a, t2.b; +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a < 450) t1 LEFT JOIN (SELECT * FROM prt2 WHERE b > 250) t2 ON t1.a = t2.b WHERE t1.b = 0 ORDER BY t1.a, t2.b; + +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a < 450) t1 FULL JOIN (SELECT * FROM prt2 WHERE b > 250) t2 ON t1.a = t2.b WHERE t1.b = 0 OR t2.a = 0 ORDER BY t1.a, t2.b; +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a < 450) t1 FULL JOIN (SELECT * FROM prt2 WHERE b > 250) t2 ON t1.a = t2.b WHERE t1.b = 0 OR t2.a = 0 ORDER BY t1.a, t2.b; + +-- Semi-join +EXPLAIN (COSTS OFF) +SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t2.b FROM prt2 t2 WHERE t2.a = 0) AND t1.b = 0 ORDER BY t1.a; +SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t2.b FROM prt2 t2 WHERE t2.a = 0) AND t1.b = 0 ORDER BY t1.a; + +-- Anti-join with aggregates +EXPLAIN (COSTS OFF) +SELECT sum(t1.a), avg(t1.a), sum(t1.b), avg(t1.b) FROM prt1 t1 WHERE NOT EXISTS (SELECT 1 FROM prt2 t2 WHERE t1.a = t2.b); +SELECT sum(t1.a), avg(t1.a), sum(t1.b), avg(t1.b) FROM prt1 t1 WHERE NOT EXISTS (SELECT 1 FROM prt2 t2 WHERE t1.a = t2.b); + +-- lateral reference +EXPLAIN (COSTS OFF) +SELECT * FROM prt1 t1 LEFT JOIN LATERAL + (SELECT t2.a AS t2a, t3.a AS t3a, least(t1.a,t2.a,t3.b) FROM prt1 t2 JOIN prt2 t3 ON (t2.a = t3.b)) ss + ON t1.a = ss.t2a WHERE t1.b = 0 ORDER BY t1.a; +SELECT * FROM prt1 t1 LEFT JOIN LATERAL + (SELECT t2.a AS t2a, t3.a AS t3a, least(t1.a,t2.a,t3.b) FROM prt1 t2 JOIN prt2 t3 ON (t2.a = t3.b)) ss + ON t1.a = ss.t2a WHERE t1.b = 0 ORDER BY t1.a; + +EXPLAIN (COSTS OFF) +SELECT t1.a, ss.t2a, ss.t2c FROM prt1 t1 LEFT JOIN LATERAL + (SELECT t2.a AS t2a, t3.a AS t3a, t2.b t2b, t2.c t2c, least(t1.a,t2.a,t3.b) FROM prt1 t2 JOIN prt2 t3 ON (t2.a = t3.b)) ss + ON t1.c = ss.t2c WHERE (t1.b + coalesce(ss.t2b, 0)) = 0 ORDER BY t1.a; +SELECT t1.a, ss.t2a, ss.t2c FROM prt1 t1 LEFT JOIN LATERAL + (SELECT t2.a AS t2a, t3.a AS t3a, t2.b t2b, t2.c t2c, least(t1.a,t2.a,t3.a) FROM prt1 t2 JOIN prt2 t3 ON (t2.a = t3.b)) ss + ON t1.c = ss.t2c WHERE (t1.b + coalesce(ss.t2b, 0)) = 0 ORDER BY t1.a; + +-- +-- partitioned by expression +-- +CREATE TABLE prt1_e (a int, b int, c int) PARTITION BY RANGE(((a + b)/2)); +CREATE TABLE prt1_e_p1 PARTITION OF prt1_e FOR VALUES FROM (0) TO (250); +CREATE TABLE prt1_e_p2 PARTITION OF prt1_e FOR VALUES FROM (250) TO (500); +CREATE TABLE prt1_e_p3 PARTITION OF prt1_e FOR VALUES FROM (500) TO (600); +INSERT INTO prt1_e SELECT i, i, i % 25 FROM generate_series(0, 599, 2) i; +CREATE INDEX iprt1_e_p1_ab2 on prt1_e_p1(((a+b)/2)); +CREATE INDEX iprt1_e_p2_ab2 on prt1_e_p2(((a+b)/2)); +CREATE INDEX iprt1_e_p3_ab2 on prt1_e_p3(((a+b)/2)); +ANALYZE prt1_e; + +CREATE TABLE prt2_e (a int, b int, c int) PARTITION BY RANGE(((b + a)/2)); +CREATE TABLE prt2_e_p1 PARTITION OF prt2_e FOR VALUES FROM (0) TO (250); +CREATE TABLE 
prt2_e_p2 PARTITION OF prt2_e FOR VALUES FROM (250) TO (500); +CREATE TABLE prt2_e_p3 PARTITION OF prt2_e FOR VALUES FROM (500) TO (600); +INSERT INTO prt2_e SELECT i, i, i % 25 FROM generate_series(0, 599, 3) i; +ANALYZE prt2_e; + +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_e t1, prt2_e t2 WHERE (t1.a + t1.b)/2 = (t2.b + t2.a)/2 AND t1.c = 0 ORDER BY t1.a, t2.b; +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_e t1, prt2_e t2 WHERE (t1.a + t1.b)/2 = (t2.b + t2.a)/2 AND t1.c = 0 ORDER BY t1.a, t2.b; + +-- +-- N-way join +-- +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c, t3.a + t3.b, t3.c FROM prt1 t1, prt2 t2, prt1_e t3 WHERE t1.a = t2.b AND t1.a = (t3.a + t3.b)/2 AND t1.b = 0 ORDER BY t1.a, t2.b; +SELECT t1.a, t1.c, t2.b, t2.c, t3.a + t3.b, t3.c FROM prt1 t1, prt2 t2, prt1_e t3 WHERE t1.a = t2.b AND t1.a = (t3.a + t3.b)/2 AND t1.b = 0 ORDER BY t1.a, t2.b; + +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c, t3.a + t3.b, t3.c FROM (prt1 t1 LEFT JOIN prt2 t2 ON t1.a = t2.b) LEFT JOIN prt1_e t3 ON (t1.a = (t3.a + t3.b)/2) WHERE t1.b = 0 ORDER BY t1.a, t2.b, t3.a + t3.b; +SELECT t1.a, t1.c, t2.b, t2.c, t3.a + t3.b, t3.c FROM (prt1 t1 LEFT JOIN prt2 t2 ON t1.a = t2.b) LEFT JOIN prt1_e t3 ON (t1.a = (t3.a + t3.b)/2) WHERE t1.b = 0 ORDER BY t1.a, t2.b, t3.a + t3.b; + +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c, t3.a + t3.b, t3.c FROM (prt1 t1 LEFT JOIN prt2 t2 ON t1.a = t2.b) RIGHT JOIN prt1_e t3 ON (t1.a = (t3.a + t3.b)/2) WHERE t3.c = 0 ORDER BY t1.a, t2.b, t3.a + t3.b; +SELECT t1.a, t1.c, t2.b, t2.c, t3.a + t3.b, t3.c FROM (prt1 t1 LEFT JOIN prt2 t2 ON t1.a = t2.b) RIGHT JOIN prt1_e t3 ON (t1.a = (t3.a + t3.b)/2) WHERE t3.c = 0 ORDER BY t1.a, t2.b, t3.a + t3.b; + +-- Cases with non-nullable expressions in subquery results; +-- make sure these go to null as expected +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.phv, t2.b, t2.phv, t3.a + t3.b, t3.phv FROM ((SELECT 50 phv, * FROM prt1 WHERE prt1.b = 0) t1 FULL JOIN (SELECT 75 phv, * FROM prt2 WHERE prt2.a = 0) t2 ON (t1.a = t2.b)) FULL JOIN (SELECT 50 phv, * FROM prt1_e WHERE prt1_e.c = 0) t3 ON (t1.a = (t3.a + t3.b)/2) WHERE t1.a = t1.phv OR t2.b = t2.phv OR (t3.a + t3.b)/2 = t3.phv ORDER BY t1.a, t2.b, t3.a + t3.b; +SELECT t1.a, t1.phv, t2.b, t2.phv, t3.a + t3.b, t3.phv FROM ((SELECT 50 phv, * FROM prt1 WHERE prt1.b = 0) t1 FULL JOIN (SELECT 75 phv, * FROM prt2 WHERE prt2.a = 0) t2 ON (t1.a = t2.b)) FULL JOIN (SELECT 50 phv, * FROM prt1_e WHERE prt1_e.c = 0) t3 ON (t1.a = (t3.a + t3.b)/2) WHERE t1.a = t1.phv OR t2.b = t2.phv OR (t3.a + t3.b)/2 = t3.phv ORDER BY t1.a, t2.b, t3.a + t3.b; + +-- Semi-join +EXPLAIN (COSTS OFF) +SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t1.b FROM prt2 t1, prt1_e t2 WHERE t1.a = 0 AND t1.b = (t2.a + t2.b)/2) AND t1.b = 0 ORDER BY t1.a; +SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t1.b FROM prt2 t1, prt1_e t2 WHERE t1.a = 0 AND t1.b = (t2.a + t2.b)/2) AND t1.b = 0 ORDER BY t1.a; + +EXPLAIN (COSTS OFF) +SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t1.b FROM prt2 t1 WHERE t1.b IN (SELECT (t1.a + t1.b)/2 FROM prt1_e t1 WHERE t1.c = 0)) AND t1.b = 0 ORDER BY t1.a; +SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t1.b FROM prt2 t1 WHERE t1.b IN (SELECT (t1.a + t1.b)/2 FROM prt1_e t1 WHERE t1.c = 0)) AND t1.b = 0 ORDER BY t1.a; + +-- test merge joins +SET enable_hashjoin TO off; +SET enable_nestloop TO off; + +EXPLAIN (COSTS OFF) +SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t1.b FROM prt2 t1 WHERE t1.b IN (SELECT (t1.a + t1.b)/2 FROM prt1_e t1 WHERE t1.c = 0)) AND t1.b = 0 ORDER BY t1.a; 
+SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t1.b FROM prt2 t1 WHERE t1.b IN (SELECT (t1.a + t1.b)/2 FROM prt1_e t1 WHERE t1.c = 0)) AND t1.b = 0 ORDER BY t1.a; + +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c, t3.a + t3.b, t3.c FROM (prt1 t1 LEFT JOIN prt2 t2 ON t1.a = t2.b) RIGHT JOIN prt1_e t3 ON (t1.a = (t3.a + t3.b)/2) WHERE t3.c = 0 ORDER BY t1.a, t2.b, t3.a + t3.b; +SELECT t1.a, t1.c, t2.b, t2.c, t3.a + t3.b, t3.c FROM (prt1 t1 LEFT JOIN prt2 t2 ON t1.a = t2.b) RIGHT JOIN prt1_e t3 ON (t1.a = (t3.a + t3.b)/2) WHERE t3.c = 0 ORDER BY t1.a, t2.b, t3.a + t3.b; + +-- MergeAppend on nullable column +EXPLAIN (COSTS OFF) +SELECT t1.a, t2.b FROM (SELECT * FROM prt1 WHERE a < 450) t1 LEFT JOIN (SELECT * FROM prt2 WHERE b > 250) t2 ON t1.a = t2.b WHERE t1.b = 0 ORDER BY t1.a, t2.b; +SELECT t1.a, t2.b FROM (SELECT * FROM prt1 WHERE a < 450) t1 LEFT JOIN (SELECT * FROM prt2 WHERE b > 250) t2 ON t1.a = t2.b WHERE t1.b = 0 ORDER BY t1.a, t2.b; + +RESET enable_hashjoin; +RESET enable_nestloop; + +-- +-- partitioned by multiple columns +-- +CREATE TABLE prt1_m (a int, b int, c int) PARTITION BY RANGE(a, ((a + b)/2)); +CREATE TABLE prt1_m_p1 PARTITION OF prt1_m FOR VALUES FROM (0, 0) TO (250, 250); +CREATE TABLE prt1_m_p2 PARTITION OF prt1_m FOR VALUES FROM (250, 250) TO (500, 500); +CREATE TABLE prt1_m_p3 PARTITION OF prt1_m FOR VALUES FROM (500, 500) TO (600, 600); +INSERT INTO prt1_m SELECT i, i, i % 25 FROM generate_series(0, 599, 2) i; +ANALYZE prt1_m; + +CREATE TABLE prt2_m (a int, b int, c int) PARTITION BY RANGE(((b + a)/2), b); +CREATE TABLE prt2_m_p1 PARTITION OF prt2_m FOR VALUES FROM (0, 0) TO (250, 250); +CREATE TABLE prt2_m_p2 PARTITION OF prt2_m FOR VALUES FROM (250, 250) TO (500, 500); +CREATE TABLE prt2_m_p3 PARTITION OF prt2_m FOR VALUES FROM (500, 500) TO (600, 600); +INSERT INTO prt2_m SELECT i, i, i % 25 FROM generate_series(0, 599, 3) i; +ANALYZE prt2_m; + +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1_m WHERE prt1_m.c = 0) t1 FULL JOIN (SELECT * FROM prt2_m WHERE prt2_m.c = 0) t2 ON (t1.a = (t2.b + t2.a)/2 AND t2.b = (t1.a + t1.b)/2) ORDER BY t1.a, t2.b; +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1_m WHERE prt1_m.c = 0) t1 FULL JOIN (SELECT * FROM prt2_m WHERE prt2_m.c = 0) t2 ON (t1.a = (t2.b + t2.a)/2 AND t2.b = (t1.a + t1.b)/2) ORDER BY t1.a, t2.b; + +-- +-- tests for list partitioned tables. 
+-- +CREATE TABLE plt1 (a int, b int, c text) PARTITION BY LIST(c); +CREATE TABLE plt1_p1 PARTITION OF plt1 FOR VALUES IN ('0000', '0003', '0004', '0010'); +CREATE TABLE plt1_p2 PARTITION OF plt1 FOR VALUES IN ('0001', '0005', '0002', '0009'); +CREATE TABLE plt1_p3 PARTITION OF plt1 FOR VALUES IN ('0006', '0007', '0008', '0011'); +INSERT INTO plt1 SELECT i, i, to_char(i/50, 'FM0000') FROM generate_series(0, 599, 2) i; +ANALYZE plt1; + +CREATE TABLE plt2 (a int, b int, c text) PARTITION BY LIST(c); +CREATE TABLE plt2_p1 PARTITION OF plt2 FOR VALUES IN ('0000', '0003', '0004', '0010'); +CREATE TABLE plt2_p2 PARTITION OF plt2 FOR VALUES IN ('0001', '0005', '0002', '0009'); +CREATE TABLE plt2_p3 PARTITION OF plt2 FOR VALUES IN ('0006', '0007', '0008', '0011'); +INSERT INTO plt2 SELECT i, i, to_char(i/50, 'FM0000') FROM generate_series(0, 599, 3) i; +ANALYZE plt2; + +-- +-- list partitioned by expression +-- +CREATE TABLE plt1_e (a int, b int, c text) PARTITION BY LIST(ltrim(c, 'A')); +CREATE TABLE plt1_e_p1 PARTITION OF plt1_e FOR VALUES IN ('0000', '0003', '0004', '0010'); +CREATE TABLE plt1_e_p2 PARTITION OF plt1_e FOR VALUES IN ('0001', '0005', '0002', '0009'); +CREATE TABLE plt1_e_p3 PARTITION OF plt1_e FOR VALUES IN ('0006', '0007', '0008', '0011'); +INSERT INTO plt1_e SELECT i, i, 'A' || to_char(i/50, 'FM0000') FROM generate_series(0, 599, 2) i; +ANALYZE plt1_e; + +-- test partition matching with N-way join +EXPLAIN (COSTS OFF) +SELECT avg(t1.a), avg(t2.b), avg(t3.a + t3.b), t1.c, t2.c, t3.c FROM plt1 t1, plt2 t2, plt1_e t3 WHERE t1.c = t2.c AND ltrim(t3.c, 'A') = t1.c GROUP BY t1.c, t2.c, t3.c ORDER BY t1.c, t2.c, t3.c; +SELECT avg(t1.a), avg(t2.b), avg(t3.a + t3.b), t1.c, t2.c, t3.c FROM plt1 t1, plt2 t2, plt1_e t3 WHERE t1.c = t2.c AND ltrim(t3.c, 'A') = t1.c GROUP BY t1.c, t2.c, t3.c ORDER BY t1.c, t2.c, t3.c; + +-- joins where one of the relations is proven empty +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1, prt2 t2 WHERE t1.a = t2.b AND t1.a = 1 AND t1.a = 2; + +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a = 1 AND a = 2) t1 LEFT JOIN prt2 t2 ON t1.a = t2.b; + +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a = 1 AND a = 2) t1 RIGHT JOIN prt2 t2 ON t1.a = t2.b WHERE t2.a = 0 ORDER BY t1.a, t2.b; + +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a = 1 AND a = 2) t1 FULL JOIN prt2 t2 ON t1.a = t2.b WHERE t2.a = 0 ORDER BY t1.a, t2.b; + +-- +-- multiple levels of partitioning +-- +CREATE TABLE prt1_l (a int, b int, c varchar) PARTITION BY RANGE(a); +CREATE TABLE prt1_l_p1 PARTITION OF prt1_l FOR VALUES FROM (0) TO (250); +CREATE TABLE prt1_l_p2 PARTITION OF prt1_l FOR VALUES FROM (250) TO (500) PARTITION BY LIST (c); +CREATE TABLE prt1_l_p2_p1 PARTITION OF prt1_l_p2 FOR VALUES IN ('0000', '0001'); +CREATE TABLE prt1_l_p2_p2 PARTITION OF prt1_l_p2 FOR VALUES IN ('0002', '0003'); +CREATE TABLE prt1_l_p3 PARTITION OF prt1_l FOR VALUES FROM (500) TO (600) PARTITION BY RANGE (b); +CREATE TABLE prt1_l_p3_p1 PARTITION OF prt1_l_p3 FOR VALUES FROM (0) TO (13); +CREATE TABLE prt1_l_p3_p2 PARTITION OF prt1_l_p3 FOR VALUES FROM (13) TO (25); +INSERT INTO prt1_l SELECT i, i % 25, to_char(i % 4, 'FM0000') FROM generate_series(0, 599, 2) i; +ANALYZE prt1_l; + +CREATE TABLE prt2_l (a int, b int, c varchar) PARTITION BY RANGE(b); +CREATE TABLE prt2_l_p1 PARTITION OF prt2_l FOR VALUES FROM (0) TO (250); +CREATE TABLE prt2_l_p2 PARTITION OF prt2_l FOR VALUES FROM (250) TO 
(500) PARTITION BY LIST (c); +CREATE TABLE prt2_l_p2_p1 PARTITION OF prt2_l_p2 FOR VALUES IN ('0000', '0001'); +CREATE TABLE prt2_l_p2_p2 PARTITION OF prt2_l_p2 FOR VALUES IN ('0002', '0003'); +CREATE TABLE prt2_l_p3 PARTITION OF prt2_l FOR VALUES FROM (500) TO (600) PARTITION BY RANGE (a); +CREATE TABLE prt2_l_p3_p1 PARTITION OF prt2_l_p3 FOR VALUES FROM (0) TO (13); +CREATE TABLE prt2_l_p3_p2 PARTITION OF prt2_l_p3 FOR VALUES FROM (13) TO (25); +INSERT INTO prt2_l SELECT i % 25, i, to_char(i % 4, 'FM0000') FROM generate_series(0, 599, 3) i; +ANALYZE prt2_l; + +-- inner join, qual covering only top-level partitions +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_l t1, prt2_l t2 WHERE t1.a = t2.b AND t1.b = 0 ORDER BY t1.a, t2.b; +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_l t1, prt2_l t2 WHERE t1.a = t2.b AND t1.b = 0 ORDER BY t1.a, t2.b; + +-- left join +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_l t1 LEFT JOIN prt2_l t2 ON t1.a = t2.b AND t1.c = t2.c WHERE t1.b = 0 ORDER BY t1.a, t2.b; +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_l t1 LEFT JOIN prt2_l t2 ON t1.a = t2.b AND t1.c = t2.c WHERE t1.b = 0 ORDER BY t1.a, t2.b; + +-- right join +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_l t1 RIGHT JOIN prt2_l t2 ON t1.a = t2.b AND t1.c = t2.c WHERE t2.a = 0 ORDER BY t1.a, t2.b; +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_l t1 RIGHT JOIN prt2_l t2 ON t1.a = t2.b AND t1.c = t2.c WHERE t2.a = 0 ORDER BY t1.a, t2.b; + +-- full join +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1_l WHERE prt1_l.b = 0) t1 FULL JOIN (SELECT * FROM prt2_l WHERE prt2_l.a = 0) t2 ON (t1.a = t2.b AND t1.c = t2.c) ORDER BY t1.a, t2.b; +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1_l WHERE prt1_l.b = 0) t1 FULL JOIN (SELECT * FROM prt2_l WHERE prt2_l.a = 0) t2 ON (t1.a = t2.b AND t1.c = t2.c) ORDER BY t1.a, t2.b; + +-- lateral partition-wise join +EXPLAIN (COSTS OFF) +SELECT * FROM prt1_l t1 LEFT JOIN LATERAL + (SELECT t2.a AS t2a, t2.c AS t2c, t2.b AS t2b, t3.b AS t3b, least(t1.a,t2.a,t3.b) FROM prt1_l t2 JOIN prt2_l t3 ON (t2.a = t3.b AND t2.c = t3.c)) ss + ON t1.a = ss.t2a AND t1.c = ss.t2c WHERE t1.b = 0 ORDER BY t1.a; +SELECT * FROM prt1_l t1 LEFT JOIN LATERAL + (SELECT t2.a AS t2a, t2.c AS t2c, t2.b AS t2b, t3.b AS t3b, least(t1.a,t2.a,t3.b) FROM prt1_l t2 JOIN prt2_l t3 ON (t2.a = t3.b AND t2.c = t3.c)) ss + ON t1.a = ss.t2a AND t1.c = ss.t2c WHERE t1.b = 0 ORDER BY t1.a; + +-- join with one side empty +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1_l WHERE a = 1 AND a = 2) t1 RIGHT JOIN prt2_l t2 ON t1.a = t2.b AND t1.b = t2.a AND t1.c = t2.c; + +-- +-- negative testcases +-- +CREATE TABLE prt1_n (a int, b int, c varchar) PARTITION BY RANGE(c); +CREATE TABLE prt1_n_p1 PARTITION OF prt1_n FOR VALUES FROM ('0000') TO ('0250'); +CREATE TABLE prt1_n_p2 PARTITION OF prt1_n FOR VALUES FROM ('0250') TO ('0500'); +INSERT INTO prt1_n SELECT i, i, to_char(i, 'FM0000') FROM generate_series(0, 499, 2) i; +ANALYZE prt1_n; + +CREATE TABLE prt2_n (a int, b int, c text) PARTITION BY LIST(c); +CREATE TABLE prt2_n_p1 PARTITION OF prt2_n FOR VALUES IN ('0000', '0003', '0004', '0010', '0006', '0007'); +CREATE TABLE prt2_n_p2 PARTITION OF prt2_n FOR VALUES IN ('0001', '0005', '0002', '0009', '0008', '0011'); +INSERT INTO prt2_n SELECT i, i, to_char(i/50, 'FM0000') FROM generate_series(0, 599, 2) i; +ANALYZE prt2_n; + +CREATE TABLE prt3_n (a int, b int, c text) PARTITION BY LIST(c); +CREATE TABLE prt3_n_p1 PARTITION OF prt3_n FOR 
VALUES IN ('0000', '0004', '0006', '0007'); +CREATE TABLE prt3_n_p2 PARTITION OF prt3_n FOR VALUES IN ('0001', '0002', '0008', '0010'); +CREATE TABLE prt3_n_p3 PARTITION OF prt3_n FOR VALUES IN ('0003', '0005', '0009', '0011'); +INSERT INTO prt2_n SELECT i, i, to_char(i/50, 'FM0000') FROM generate_series(0, 599, 2) i; +ANALYZE prt3_n; + +CREATE TABLE prt4_n (a int, b int, c text) PARTITION BY RANGE(a); +CREATE TABLE prt4_n_p1 PARTITION OF prt4_n FOR VALUES FROM (0) TO (300); +CREATE TABLE prt4_n_p2 PARTITION OF prt4_n FOR VALUES FROM (300) TO (500); +CREATE TABLE prt4_n_p3 PARTITION OF prt4_n FOR VALUES FROM (500) TO (600); +INSERT INTO prt4_n SELECT i, i, to_char(i, 'FM0000') FROM generate_series(0, 599, 2) i; +ANALYZE prt4_n; + +-- partition-wise join can not be applied if the partition ranges differ +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1, prt4_n t2 WHERE t1.a = t2.a; +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1, prt4_n t2, prt2 t3 WHERE t1.a = t2.a and t1.a = t3.b; + +-- partition-wise join can not be applied if there are no equi-join conditions +-- between partition keys +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1 LEFT JOIN prt2 t2 ON (t1.a < t2.b); + +-- equi-join with join condition on partial keys does not qualify for +-- partition-wise join +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_m t1, prt2_m t2 WHERE t1.a = (t2.b + t2.a)/2; + +-- equi-join between out-of-order partition key columns does not qualify for +-- partition-wise join +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_m t1 LEFT JOIN prt2_m t2 ON t1.a = t2.b; + +-- equi-join between non-key columns does not qualify for partition-wise join +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_m t1 LEFT JOIN prt2_m t2 ON t1.c = t2.c; + +-- partition-wise join can not be applied between tables with different +-- partition lists +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_n t1 LEFT JOIN prt2_n t2 ON (t1.c = t2.c); +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_n t1 JOIN prt2_n t2 ON (t1.c = t2.c) JOIN plt1 t3 ON (t1.c = t3.c); + +-- partition-wise join can not be applied for a join between list and range +-- partitioned table +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_n t1 FULL JOIN prt1 t2 ON (t1.c = t2.c); From 756ea7fe056033cd24f546d2a03143421d08307f Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Thu, 25 Jun 2020 16:34:51 +0800 Subject: [PATCH 200/578] Clean up sloppy maintenance of regression test schedule files.http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/test/regress/parallel_schedule | 4 ++-- src/test/regress/serial_schedule | 5 +++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule index 289f1ed4..ab868e3a 100644 --- a/src/test/regress/parallel_schedule +++ b/src/test/regress/parallel_schedule @@ -116,7 +116,7 @@ test: publication subscription # ---------- # Another group of parallel tests # ---------- -test: select_views portals_p2 foreign_key cluster dependency guc bitmapops combocid tsearch tsdicts foreign_data window xmlmap functional_deps json jsonb json_encoding indirect_toast equivclass partition_join +test: select_views portals_p2 foreign_key cluster dependency guc bitmapops combocid tsearch tsdicts foreign_data window xmlmap functional_deps json jsonb json_encoding indirect_toast equivclass # ---------- # As XL uses 
advisory locks internally running this test separately. @@ -134,7 +134,7 @@ test: plancache limit plpgsql copy2 temp domain prepare without_oid conversion t # ---------- # Another group of parallel tests # ---------- -test: identity +test: identity partition_join # event triggers cannot run concurrently with any test that runs DDL test: event_trigger diff --git a/src/test/regress/serial_schedule b/src/test/regress/serial_schedule index ad4f5d4f..28d7802d 100644 --- a/src/test/regress/serial_schedule +++ b/src/test/regress/serial_schedule @@ -93,6 +93,7 @@ test: updatable_views test: rolenames test: roleattributes test: create_am +test: hash_func test: sanity_check test: errors test: select @@ -184,16 +185,16 @@ test: conversion test: truncate test: alter_table test: sequence -test: identity test: polymorphism test: rowtypes test: returning test: with test: xml +test: identity +test: partition_join test: event_trigger test: fast_default test: stats -test: partition_join test: xc_create_function test: xc_groupby test: xc_distkey From 61ef8966abdfc14710a263668dcb662e7f97556e Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Sat, 28 Oct 2017 11:10:21 +0200 Subject: [PATCH 201/578] Fix misplaced ReleaseSysCache call in get_default_partition_oid. Julien Rouhaud Discussion: http://postgr.es/m/CAOBaU_Y4omLA+VbsVdA-JwBLoJWiPxfdKCkMjrZM7NMZxa1fKw@mail.gmail.com --- src/backend/catalog/partition.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/catalog/partition.c b/src/backend/catalog/partition.c index 70a3d6d8..b498716e 100644 --- a/src/backend/catalog/partition.c +++ b/src/backend/catalog/partition.c @@ -2790,9 +2790,9 @@ get_default_partition_oid(Oid parentId) part_table_form = (Form_pg_partitioned_table) GETSTRUCT(tuple); defaultPartId = part_table_form->partdefid; + ReleaseSysCache(tuple); } - ReleaseSysCache(tuple); return defaultPartId; } From e6e22b6ce502c4bb04ca588d5ea32e40543453f0 Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Tue, 31 Oct 2017 14:41:21 +0530 Subject: [PATCH 202/578] Fix code related to partitioning schemes for dropped columns. The entry in appinfo->translated_vars can be NULL; if so, we must avoid dereferencing it. Ashutosh Bapat Discussion: http://postgr.es/m/CAFjFpReL7+1ien=-21rhjpO3bV7aAm1rQ8XgLVk2csFagSzpZQ@mail.gmail.com --- src/backend/optimizer/path/allpaths.c | 12 ++++++++++++ src/test/regress/expected/alter_table.out | 7 +++++++ src/test/regress/sql/alter_table.sql | 4 ++++ 3 files changed, 23 insertions(+) diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c index 0774ff46..ba5a4418 100644 --- a/src/backend/optimizer/path/allpaths.c +++ b/src/backend/optimizer/path/allpaths.c @@ -970,6 +970,18 @@ set_append_rel_size(PlannerInfo *root, RelOptInfo *rel, attno - 1); int child_index; + /* + * Ignore any column dropped from the parent. + * Corresponding Var won't have any translation. It won't + * have attr_needed information, since it can not be + * referenced in the query. 
+ */ + if (var == NULL) + { + Assert(attr_needed == NULL); + continue; + } + child_index = var->varattno - childrel->min_attr; childrel->attr_needed[child_index] = attr_needed; } diff --git a/src/test/regress/expected/alter_table.out b/src/test/regress/expected/alter_table.out index 1748add2..d112f403 100644 --- a/src/test/regress/expected/alter_table.out +++ b/src/test/regress/expected/alter_table.out @@ -3613,6 +3613,13 @@ ALTER TABLE list_parted2 DROP COLUMN b; ERROR: cannot drop column named in partition key ALTER TABLE list_parted2 ALTER COLUMN b TYPE text; ERROR: cannot alter type of column named in partition key +-- dropping non-partition key columns should be allowed on the parent table. +ALTER TABLE list_parted DROP COLUMN b; +SELECT * FROM list_parted; + a +--- +(0 rows) + -- cleanup DROP TABLE list_parted, list_parted2, range_parted; DROP TABLE fail_def_part; diff --git a/src/test/regress/sql/alter_table.sql b/src/test/regress/sql/alter_table.sql index e2c0219e..97d2d9bf 100644 --- a/src/test/regress/sql/alter_table.sql +++ b/src/test/regress/sql/alter_table.sql @@ -2393,6 +2393,10 @@ ALTER TABLE part_2 INHERIT inh_test; ALTER TABLE list_parted2 DROP COLUMN b; ALTER TABLE list_parted2 ALTER COLUMN b TYPE text; +-- dropping non-partition key columns should be allowed on the parent table. +ALTER TABLE list_parted DROP COLUMN b; +SELECT * FROM list_parted; + -- cleanup DROP TABLE list_parted, list_parted2, range_parted; DROP TABLE fail_def_part; From 510daa93e5a1eed5b6c0b09706aa615c6961da27 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Thu, 25 Jun 2020 18:24:56 +0800 Subject: [PATCH 203/578] 1.After a MINVALUE/MAXVALUE bound, allow only more of the same. 2.Copy information from the relcache instead of pointing to it. 3.Add hash partitioning. 
http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- doc/src/sgml/ddl.sgml | 28 +- doc/src/sgml/ref/alter_table.sgml | 7 + doc/src/sgml/ref/create_table.sgml | 4020 +++++++++--------- src/backend/catalog/partition.c | 684 ++- src/backend/commands/tablecmds.c | 40 +- src/backend/nodes/copyfuncs.c | 2 + src/backend/nodes/equalfuncs.c | 2 + src/backend/nodes/outfuncs.c | 2 + src/backend/nodes/readfuncs.c | 2 + src/backend/optimizer/path/joinrels.c | 10 +- src/backend/optimizer/util/plancat.c | 35 +- src/backend/parser/gram.y | 76 +- src/backend/parser/parse_utilcmd.c | 75 +- src/backend/utils/adt/ruleutils.c | 15 +- src/backend/utils/cache/relcache.c | 26 +- src/bin/psql/tab-complete.c | 2 +- src/include/catalog/partition.h | 5 + src/include/catalog/pg_proc.h | 3 + src/include/nodes/parsenodes.h | 8 +- src/test/regress/expected/alter_table.out | 62 + src/test/regress/expected/alter_table_1.out | 62 + src/test/regress/expected/alter_table_2.out | 62 + src/test/regress/expected/alter_table_3.out | 62 + src/test/regress/expected/create_table.out | 88 +- src/test/regress/expected/inherit.out | 4 +- src/test/regress/expected/inherit_1.out | 4 +- src/test/regress/expected/inherit_2.out | 4 +- src/test/regress/expected/inherit_3.out | 4 +- src/test/regress/expected/insert.out | 81 +- src/test/regress/expected/insert_1.out | 35 +- src/test/regress/expected/partition_join.out | 81 + src/test/regress/expected/update.out | 29 + src/test/regress/sql/alter_table.sql | 64 + src/test/regress/sql/create_table.sql | 57 +- src/test/regress/sql/inherit.sql | 4 +- src/test/regress/sql/insert.sql | 52 +- src/test/regress/sql/partition_join.sql | 32 + src/test/regress/sql/update.sql | 28 + src/tools/pgindent/typedefs.list | 1 + 39 files changed, 3669 insertions(+), 2189 deletions(-) diff --git a/doc/src/sgml/ddl.sgml b/doc/src/sgml/ddl.sgml index 7449e064..a65a130f 100644 --- a/doc/src/sgml/ddl.sgml +++ b/doc/src/sgml/ddl.sgml @@ -3145,6 +3145,19 @@ VALUES ('Albany', NULL, NULL, 'NY'); + + + Hash Partitioning + + + + The table is partitioned by specifying a modulus and a remainder for + each partition. Each partition will hold the rows for which the hash + value of the partition key divided by the specified modulus will + produce the specified remainder. + + + If your application needs to use other forms of partitioning not listed @@ -3171,9 +3184,8 @@ VALUES ('Albany', NULL, NULL, 'NY'); All rows inserted into a partitioned table will be routed to one of the partitions based on the value of the partition key. Each partition has a subset of the data defined by its - partition bounds. Currently supported - partitioning methods include range and list, where each partition is - assigned a range of keys and a list of keys, respectively. + partition bounds. The currently supported + partitioning methods are range, list, and hash. @@ -3598,11 +3610,11 @@ ALTER TABLE measurement ATTACH PARTITION measurement_y2008m02 - Declarative partitioning only supports list and range partitioning, - whereas table inheritance allows data to be divided in a manner of - the user's choosing. (Note, however, that if constraint exclusion is - unable to prune partitions effectively, query performance will be very - poor.) + Declarative partitioning only supports range, list and hash + partitioning, whereas table inheritance allows data to be divided in a + manner of the user's choosing. 
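A minimal sketch of the hash-partitioning DDL this patch documents (the hash_parted table, its column, and the modulus of 4 are illustrative assumptions, not taken from the patch; the FOR VALUES WITH (MODULUS, REMAINDER) clause is the same one shown for ATTACH PARTITION below):

CREATE TABLE hash_parted (a int, b int) PARTITION BY HASH (a);
CREATE TABLE hash_parted_p0 PARTITION OF hash_parted FOR VALUES WITH (MODULUS 4, REMAINDER 0);
CREATE TABLE hash_parted_p1 PARTITION OF hash_parted FOR VALUES WITH (MODULUS 4, REMAINDER 1);
CREATE TABLE hash_parted_p2 PARTITION OF hash_parted FOR VALUES WITH (MODULUS 4, REMAINDER 2);
CREATE TABLE hash_parted_p3 PARTITION OF hash_parted FOR VALUES WITH (MODULUS 4, REMAINDER 3);
-- Each inserted row is routed to the partition whose declared remainder
-- equals the hash of the partition key value modulo 4.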
(Note, however, that if constraint + exclusion is unable to prune partitions effectively, query performance + will be very poor.) diff --git a/doc/src/sgml/ref/alter_table.sgml b/doc/src/sgml/ref/alter_table.sgml index 06c5655e..d9ddbd01 100644 --- a/doc/src/sgml/ref/alter_table.sgml +++ b/doc/src/sgml/ref/alter_table.sgml @@ -1629,6 +1629,13 @@ ALTER TABLE cities ATTACH PARTITION cities_partdef DEFAULT; + + Attach a partition to hash partitioned table: + +ALTER TABLE orders + ATTACH PARTITION orders_p4 FOR VALUES WITH (MODULUS 4, REMAINDER 3); + + Detach a partition from partitioned table: diff --git a/doc/src/sgml/ref/create_table.sgml b/doc/src/sgml/ref/create_table.sgml index e46601b7..62792897 100644 --- a/doc/src/sgml/ref/create_table.sgml +++ b/doc/src/sgml/ref/create_table.sgml @@ -1,2111 +1,1995 @@ - - - - - CREATE TABLE - - - - CREATE TABLE - 7 - SQL - Language Statements - - - - CREATE TABLE - define a new table - - - - -CREATE [ [ GLOBAL | LOCAL ] { TEMPORARY | TEMP } | UNLOGGED ] TABLE [ IF NOT EXISTS ] table_name ( [ - { column_name data_type [ COLLATE collation ] [ column_constraint [ ... ] ] - | table_constraint - | LIKE source_table [ like_option ... ] } - [, ... ] -] ) -[ INHERITS ( parent_table [, ... ] ) ] -[ PARTITION BY { RANGE | LIST } ( { column_name | ( expression ) } [ COLLATE collation ] [ opclass ] [, ... ] ) ] -[ WITH ( storage_parameter [= value] [, ... ] ) | WITH OIDS | WITHOUT OIDS ] -[ ON COMMIT { PRESERVE ROWS | DELETE ROWS | DROP } ] -[ TABLESPACE tablespace_name ] -[ - DISTRIBUTE BY { REPLICATION | ROUNDROBIN | { [HASH | MODULO ] ( column_name ) } } | - DISTRIBUTED { { BY ( column_name ) } | { RANDOMLY } | - DISTSTYLE { EVEN | KEY | ALL } DISTKEY ( column_name ) -] -[ TO { GROUP groupname | NODE ( nodename [, ... ] ) } ] - -CREATE [ [ GLOBAL | LOCAL ] { TEMPORARY | TEMP } | UNLOGGED ] TABLE [ IF NOT EXISTS ] table_name - OF type_name [ ( - { column_name [ WITH OPTIONS ] [ column_constraint [ ... ] ] - | table_constraint } - [, ... ] -) ] -[ PARTITION BY { RANGE | LIST } ( { column_name | ( expression ) } [ COLLATE collation ] [ opclass ] [, ... ] ) ] -[ WITH ( storage_parameter [= value] [, ... ] ) | WITH OIDS | WITHOUT OIDS ] -[ ON COMMIT { PRESERVE ROWS | DELETE ROWS | DROP } ] -[ TABLESPACE tablespace_name ] - -CREATE [ [ GLOBAL | LOCAL ] { TEMPORARY | TEMP } | UNLOGGED ] TABLE [ IF NOT EXISTS ] table_name - PARTITION OF parent_table [ ( - { column_name [ WITH OPTIONS ] [ column_constraint [ ... ] ] - | table_constraint } - [, ... ] -) ] { FOR VALUES partition_bound_spec | DEFAULT } -[ PARTITION BY { RANGE | LIST } ( { column_name | ( expression ) } [ COLLATE collation ] [ opclass ] [, ... ] ) ] -[ WITH ( storage_parameter [= value] [, ... ] ) | WITH OIDS | WITHOUT OIDS ] -[ ON COMMIT { PRESERVE ROWS | DELETE ROWS | DROP } ] -[ TABLESPACE tablespace_name ] -[ - DISTRIBUTE BY { REPLICATION | ROUNDROBIN | { [HASH | MODULO ] ( column_name ) } } | - DISTRIBUTED { { BY ( column_name ) } | { RANDOMLY } | - DISTSTYLE { EVEN | KEY | ALL } DISTKEY ( column_name ) -] -[ TO { GROUP groupname | NODE ( nodename [, ... 
] ) } ] - -where column_constraint is: - -[ CONSTRAINT constraint_name ] -{ NOT NULL | - NULL | - CHECK ( expression ) [ NO INHERIT ] | - DEFAULT default_expr | - GENERATED { ALWAYS | BY DEFAULT } AS IDENTITY [ ( sequence_options ) ] | - UNIQUE index_parameters | - PRIMARY KEY index_parameters | - REFERENCES reftable [ ( refcolumn ) ] [ MATCH FULL | MATCH PARTIAL | MATCH SIMPLE ] - [ ON DELETE action ] [ ON UPDATE action ] } -[ DEFERRABLE | NOT DEFERRABLE ] [ INITIALLY DEFERRED | INITIALLY IMMEDIATE ] - -and table_constraint is: - -[ CONSTRAINT constraint_name ] -{ CHECK ( expression ) [ NO INHERIT ] | - UNIQUE ( column_name [, ... ] ) index_parameters | - PRIMARY KEY ( column_name [, ... ] ) index_parameters | - EXCLUDE [ USING index_method ] ( exclude_element WITH operator [, ... ] ) index_parameters [ WHERE ( predicate ) ] | - FOREIGN KEY ( column_name [, ... ] ) REFERENCES reftable [ ( refcolumn [, ... ] ) ] - [ MATCH FULL | MATCH PARTIAL | MATCH SIMPLE ] [ ON DELETE action ] [ ON UPDATE action ] } -[ DEFERRABLE | NOT DEFERRABLE ] [ INITIALLY DEFERRED | INITIALLY IMMEDIATE ] - -and like_option is: - -{ INCLUDING | EXCLUDING } { DEFAULTS | CONSTRAINTS | IDENTITY | INDEXES | STORAGE | COMMENTS | ALL } - -and partition_bound_spec is: - -IN ( { numeric_literal | string_literal | NULL } [, ...] ) | -FROM ( { numeric_literal | string_literal | MINVALUE | MAXVALUE } [, ...] ) - TO ( { numeric_literal | string_literal | MINVALUE | MAXVALUE } [, ...] ) - -index_parameters in UNIQUE, PRIMARY KEY, and EXCLUDE constraints are: - -[ WITH ( storage_parameter [= value] [, ... ] ) ] -[ USING INDEX TABLESPACE tablespace_name ] - -exclude_element in an EXCLUDE constraint is: - -{ column_name | ( expression ) } [ opclass ] [ ASC | DESC ] [ NULLS { FIRST | LAST } ] - - - - - - Description - - - CREATE TABLE will create a new, initially empty table - in the current database. The table will be owned by the user issuing the - command. - - - - If a schema name is given (for example, CREATE TABLE - myschema.mytable ...) then the table is created in the specified - schema. Otherwise it is created in the current schema. Temporary - tables exist in a special schema, so a schema name cannot be given - when creating a temporary table. The name of the table must be - distinct from the name of any other table, sequence, index, view, - or foreign table in the same schema. - - - - CREATE TABLE also automatically creates a data - type that represents the composite type corresponding - to one row of the table. Therefore, tables cannot have the same - name as any existing data type in the same schema. - - - - The optional constraint clauses specify constraints (tests) that - new or updated rows must satisfy for an insert or update operation - to succeed. A constraint is an SQL object that helps define the - set of valid values in the table in various ways. - - - - There are two ways to define constraints: table constraints and - column constraints. A column constraint is defined as part of a - column definition. A table constraint definition is not tied to a - particular column, and it can encompass more than one column. - Every column constraint can also be written as a table constraint; - a column constraint is only a notational convenience for use when the - constraint only affects one column. - - - - To be able to create a table, you must have USAGE - privilege on all column types or the type in the OF - clause, respectively. 
- - - - - Parameters - - - - - TEMPORARY or TEMP - - - If specified, the table is created as a temporary table. - Temporary tables are automatically dropped at the end of a - session, or optionally at the end of the current transaction - (see ON COMMIT below). Existing permanent - tables with the same name are not visible to the current session - while the temporary table exists, unless they are referenced - with schema-qualified names. Any indexes created on a temporary - table are automatically temporary as well. - - - - The autovacuum daemon cannot - access and therefore cannot vacuum or analyze temporary tables. - For this reason, appropriate vacuum and analyze operations should be - performed via session SQL commands. For example, if a temporary - table is going to be used in complex queries, it is wise to run - ANALYZE on the temporary table after it is populated. - - - - Optionally, GLOBAL or LOCAL - can be written before TEMPORARY or TEMP. - This presently makes no difference in PostgreSQL - and is deprecated; see - . - - - - - - UNLOGGED - - - If specified, the table is created as an unlogged table. Data written - to unlogged tables is not written to the write-ahead log (see ), which makes them considerably faster than ordinary - tables. However, they are not crash-safe: an unlogged table is - automatically truncated after a crash or unclean shutdown. The contents - of an unlogged table are also not replicated to standby servers. - Any indexes created on an unlogged table are automatically unlogged as - well. - - - - - - IF NOT EXISTS - - - Do not throw an error if a relation with the same name already exists. - A notice is issued in this case. Note that there is no guarantee that - the existing relation is anything like the one that would have been - created. - - - - - - table_name - - - The name (optionally schema-qualified) of the table to be created. - - - - - - OF type_name - - - Creates a typed table, which takes its - structure from the specified composite type (name optionally - schema-qualified). A typed table is tied to its type; for - example the table will be dropped if the type is dropped - (with DROP TYPE ... CASCADE). - - - - When a typed table is created, then the data types of the - columns are determined by the underlying composite type and are - not specified by the CREATE TABLE command. - But the CREATE TABLE command can add defaults - and constraints to the table and can specify storage parameters. - - - - - - PARTITION OF parent_table { FOR VALUES partition_bound_spec | DEFAULT } - - - Creates the table as a partition of the specified - parent table. The table can be created either as a partition for specific - values using FOR VALUES or as a default partition - using DEFAULT. - - - - The partition_bound_spec - must correspond to the partitioning method and partition key of the - parent table, and must not overlap with any existing partition of that - parent. The form with IN is used for list partitioning, - while the form with FROM and TO is used for - range partitioning. - - - - Each of the values specified in - the partition_bound_spec is - a literal, NULL, MINVALUE, or - MAXVALUE. Each literal value must be either a - numeric constant that is coercible to the corresponding partition key - column's type, or a string literal that is valid input for that type. - - - - When creating a list partition, NULL can be - specified to signify that the partition allows the partition key - column to be null. 
However, there cannot be more than one such - list partition for a given parent table. NULL - cannot be specified for range partitions. - - - - When creating a range partition, the lower bound specified with - FROM is an inclusive bound, whereas the upper - bound specified with TO is an exclusive bound. - That is, the values specified in the FROM list - are valid values of the corresponding partition key columns for this - partition, whereas those in the TO list are - not. Note that this statement must be understood according to the - rules of row-wise comparison (). - For example, given PARTITION BY RANGE (x,y), a partition - bound FROM (1, 2) TO (3, 4) - allows x=1 with any y>=2, - x=2 with any non-null y, - and x=3 with any y<4. - - - - The special values MINVALUE and MAXVALUE - may be used when creating a range partition to indicate that there - is no lower or upper bound on the column's value. For example, a - partition defined using FROM (MINVALUE) TO (10) allows - any values less than 10, and a partition defined using - FROM (10) TO (MAXVALUE) allows any values greater than - or equal to 10. - - - - When creating a range partition involving more than one column, it - can also make sense to use MAXVALUE as part of the lower - bound, and MINVALUE as part of the upper bound. For - example, a partition defined using - FROM (0, MAXVALUE) TO (10, MAXVALUE) allows any rows - where the first partition key column is greater than 0 and less than - or equal to 10. Similarly, a partition defined using - FROM ('a', MINVALUE) TO ('b', MINVALUE) allows any rows - where the first partition key column starts with "a". - - - - Note that any values after MINVALUE or - MAXVALUE in a partition bound are ignored; so the bound - (10, MINVALUE, 0) is equivalent to - (10, MINVALUE, 10) and (10, MINVALUE, MINVALUE) - and (10, MINVALUE, MAXVALUE). - - - - Also note that some element types, such as timestamp, - have a notion of "infinity", which is just another value that can - be stored. This is different from MINVALUE and - MAXVALUE, which are not real values that can be stored, - but rather they are ways of saying that the value is unbounded. - MAXVALUE can be thought of as being greater than any - other value, including "infinity" and MINVALUE as being - less than any other value, including "minus infinity". Thus the range - FROM ('infinity') TO (MAXVALUE) is not an empty range; it - allows precisely one value to be stored — "infinity". - - - - If DEFAULT is specified, the table will be - created as a default partition of the parent table. The parent can - either be a list or range partitioned table. A partition key value - not fitting into any other partition of the given parent will be - routed to the default partition. There can be only one default - partition for a given parent table. - - - - When a table has an existing DEFAULT partition and - a new partition is added to it, the existing default partition must - be scanned to verify that it does not contain any rows which properly - belong in the new partition. If the default partition contains a - large number of rows, this may be slow. The scan will be skipped if - the default partition is a foreign table or if it has a constraint which - proves that it cannot contain rows which should be placed in the new - partition. - - - - A partition must have the same column names and types as the partitioned - table to which it belongs. 
If the parent is specified WITH - OIDS then all partitions must have OIDs; the parent's OID - column will be inherited by all partitions just like any other column. - Modifications to the column names or types of a partitioned table, or - the addition or removal of an OID column, will automatically propagate - to all partitions. CHECK constraints will be inherited - automatically by every partition, but an individual partition may specify - additional CHECK constraints; additional constraints with - the same name and condition as in the parent will be merged with the - parent constraint. Defaults may be specified separately for each - partition. - - - - Rows inserted into a partitioned table will be automatically routed to - the correct partition. If no suitable partition exists, an error will - occur. Also, if updating a row in a given partition would require it - to move to another partition due to new partition key values, an error - will occur. - - - - Operations such as TRUNCATE which normally affect a table and all of its - inheritance children will cascade to all partitions, but may also be - performed on an individual partition. Note that dropping a partition - with DROP TABLE requires taking an ACCESS - EXCLUSIVE lock on the parent table. - - - - - - column_name - - - The name of a column to be created in the new table. - - - - - - data_type - - - The data type of the column. This can include array - specifiers. For more information on the data types supported by - PostgreSQL, refer to . - - - - - - COLLATE collation - - - The COLLATE clause assigns a collation to - the column (which must be of a collatable data type). - If not specified, the column data type's default collation is used. - - - - - - INHERITS ( parent_table [, ... ] ) - - - The optional INHERITS clause specifies a list of - tables from which the new table automatically inherits all - columns. Parent tables can be plain tables or foreign tables. - - - - Use of INHERITS creates a persistent relationship - between the new child table and its parent table(s). Schema - modifications to the parent(s) normally propagate to children - as well, and by default the data of the child table is included in - scans of the parent(s). - - - - If the same column name exists in more than one parent - table, an error is reported unless the data types of the columns - match in each of the parent tables. If there is no conflict, - then the duplicate columns are merged to form a single column in - the new table. If the column name list of the new table - contains a column name that is also inherited, the data type must - likewise match the inherited column(s), and the column - definitions are merged into one. If the - new table explicitly specifies a default value for the column, - this default overrides any defaults from inherited declarations - of the column. Otherwise, any parents that specify default - values for the column must all specify the same default, or an - error will be reported. - - - - CHECK constraints are merged in essentially the same way as - columns: if multiple parent tables and/or the new table definition - contain identically-named CHECK constraints, these - constraints must all have the same check expression, or an error will be - reported. Constraints having the same name and expression will - be merged into one copy. A constraint marked NO INHERIT in a - parent will not be considered. 
Notice that an unnamed CHECK - constraint in the new table will never be merged, since a unique name - will always be chosen for it. - - - - In Postgres-XL, it is currently not possible to distribute a table with more than one parent. - - - - Column STORAGE settings are also copied from parent tables. - - - - If a column in the parent table is an identity column, that property is - not inherited. A column in the child table can be declared identity - column if desired. - - - - - - PARTITION BY { RANGE | LIST } ( { column_name | ( expression ) } [ opclass ] [, ...] ) - - - The optional PARTITION BY clause specifies a strategy - of partitioning the table. The table thus created is called a - partitioned table. The parenthesized list of - columns or expressions forms the partition key - for the table. When using range partitioning, the partition key can - include multiple columns or expressions (up to 32, but this limit can - altered when building PostgreSQL.), but for - list partitioning, the partition key must consist of a single column or - expression. If no B-tree operator class is specified when creating a - partitioned table, the default B-tree operator class for the datatype will - be used. If there is none, an error will be reported. - - - - A partitioned table is divided into sub-tables (called partitions), - which are created using separate CREATE TABLE commands. - The partitioned table is itself empty. A data row inserted into the - table is routed to a partition based on the value of columns or - expressions in the partition key. If no existing partition matches - the values in the new row, an error will be reported. - - - - Partitioned tables do not support UNIQUE, - PRIMARY KEY, EXCLUDE, or - FOREIGN KEY constraints; however, you can define - these constraints on individual partitions. - - - - - - - LIKE source_table [ like_option ... ] - - - The LIKE clause specifies a table from which - the new table automatically copies all column names, their data types, - and their not-null constraints. - - - Unlike INHERITS, the new table and original table - are completely decoupled after creation is complete. Changes to the - original table will not be applied to the new table, and it is not - possible to include data of the new table in scans of the original - table. - - - Default expressions for the copied column definitions will be copied - only if INCLUDING DEFAULTS is specified. The - default behavior is to exclude default expressions, resulting in the - copied columns in the new table having null defaults. - Note that copying defaults that call database-modification functions, - such as nextval, may create a functional linkage between - the original and new tables. - - - Any identity specifications of copied column definitions will only be - copied if INCLUDING IDENTITY is specified. A new - sequence is created for each identity column of the new table, separate - from the sequences associated with the old table. - - - Not-null constraints are always copied to the new table. - CHECK constraints will be copied only if - INCLUDING CONSTRAINTS is specified. - No distinction is made between column constraints and table - constraints. - - - Indexes, PRIMARY KEY, UNIQUE, - and EXCLUDE constraints on the original table will be - created on the new table only if INCLUDING INDEXES - is specified. Names for the new indexes and constraints are - chosen according to the default rules, regardless of how the originals - were named. 
(This behavior avoids possible duplicate-name failures for - the new indexes.) - - - STORAGE settings for the copied column definitions will be - copied only if INCLUDING STORAGE is specified. The - default behavior is to exclude STORAGE settings, resulting - in the copied columns in the new table having type-specific default - settings. For more on STORAGE settings, see - . - - - Comments for the copied columns, constraints, and indexes - will be copied only if INCLUDING COMMENTS - is specified. The default behavior is to exclude comments, resulting in - the copied columns and constraints in the new table having no comments. - - - INCLUDING ALL is an abbreviated form of - INCLUDING DEFAULTS INCLUDING IDENTITY INCLUDING CONSTRAINTS INCLUDING INDEXES INCLUDING STORAGE INCLUDING COMMENTS. - - - Note that unlike INHERITS, columns and - constraints copied by LIKE are not merged with similarly - named columns and constraints. - If the same name is specified explicitly or in another - LIKE clause, an error is signaled. - - - The LIKE clause can also be used to copy column - definitions from views, foreign tables, or composite types. - Inapplicable options (e.g., INCLUDING INDEXES from - a view) are ignored. - - - - - - CONSTRAINT constraint_name - - - An optional name for a column or table constraint. If the - constraint is violated, the constraint name is present in error messages, - so constraint names like col must be positive can be used - to communicate helpful constraint information to client applications. - (Double-quotes are needed to specify constraint names that contain spaces.) - If a constraint name is not specified, the system generates a name. - - - - - - NOT NULL - - - The column is not allowed to contain null values. - - - - - - NULL - - - The column is allowed to contain null values. This is the default. - - - - This clause is only provided for compatibility with - non-standard SQL databases. Its use is discouraged in new - applications. - - - - - - CHECK ( expression ) [ NO INHERIT ] - - - The CHECK clause specifies an expression producing a - Boolean result which new or updated rows must satisfy for an - insert or update operation to succeed. Expressions evaluating - to TRUE or UNKNOWN succeed. Should any row of an insert or - update operation produce a FALSE result, an error exception is - raised and the insert or update does not alter the database. A - check constraint specified as a column constraint should - reference that column's value only, while an expression - appearing in a table constraint can reference multiple columns. - - - - Currently, CHECK expressions cannot contain - subqueries nor refer to variables other than columns of the - current row. The system column tableoid - may be referenced, but not any other system column. - - - - A constraint marked with NO INHERIT will not propagate to - child tables. - - - - When a table has multiple CHECK constraints, - they will be tested for each row in alphabetical order by name, - after checking NOT NULL constraints. - (PostgreSQL versions before 9.5 did not honor any - particular firing order for CHECK constraints.) - - - - - - DEFAULT - default_expr - - - The DEFAULT clause assigns a default data value for - the column whose column definition it appears within. The value - is any variable-free expression (subqueries and cross-references - to other columns in the current table are not allowed). The - data type of the default expression must match the data type of the - column. 
- - - - The default expression will be used in any insert operation that - does not specify a value for the column. If there is no default - for a column, then the default is null. - - - - - - GENERATED { ALWAYS | BY DEFAULT } AS IDENTITY [ ( sequence_options ) ] - - - This clause creates the column as an identity - column. It will have an implicit sequence attached to it - and the column in new rows will automatically have values from the - sequence assigned to it. - - - - The clauses ALWAYS and BY DEFAULT - determine how the sequence value is given precedence over a - user-specified value in an INSERT statement. - If ALWAYS is specified, a user-specified value is - only accepted if the INSERT statement - specifies OVERRIDING SYSTEM VALUE. If BY - DEFAULT is specified, then the user-specified value takes - precedence. See for details. (In - the COPY command, user-specified values are always - used regardless of this setting.) - - - - The optional sequence_options clause can be - used to override the options of the sequence. - See for details. - - - - - - UNIQUE (column constraint) - UNIQUE ( column_name [, ... ] ) (table constraint) - - - - The UNIQUE constraint specifies that a - group of one or more columns of a table can contain - only unique values. The behavior of the unique table constraint - is the same as that for column constraints, with the additional - capability to span multiple columns. - - - - For the purpose of a unique constraint, null values are not - considered equal. - - - - In Postgres-XL, if DISTRIBUTE BY - REPLICATION is not specified, only the distribution key is - allowed to have this constraint. - - - - Each unique table constraint must name a set of columns that is - different from the set of columns named by any other unique or - primary key constraint defined for the table. (Otherwise it - would just be the same constraint listed twice.) - - - - - - PRIMARY KEY (column constraint) - PRIMARY KEY ( column_name [, ... ] ) (table constraint) - - - The PRIMARY KEY constraint specifies that a column or - columns of a table can contain only unique (non-duplicate), nonnull - values. Only one primary key can be specified for a table, whether as a - column constraint or a table constraint. - - - - The primary key constraint should name a set of columns that is - different from the set of columns named by any unique - constraint defined for the same table. (Otherwise, the unique - constraint is redundant and will be discarded.) - - - - PRIMARY KEY enforces the same data constraints as - a combination of UNIQUE and NOT NULL, but - identifying a set of columns as the primary key also provides metadata - about the design of the schema, since a primary key implies that other - tables can rely on this set of columns as a unique identifier for rows. - - - - In Postgres-XL, if DISTRIBUTE BY REPLICATION is not specified, the - distribution key must be included in the set of primary key - columns. - - - - - - EXCLUDE [ USING index_method ] ( exclude_element WITH operator [, ... ] ) index_parameters [ WHERE ( predicate ) ] - - - The EXCLUDE clause defines an exclusion - constraint, which guarantees that if - any two rows are compared on the specified column(s) or - expression(s) using the specified operator(s), not all of these - comparisons will return TRUE. If all of the - specified operators test for equality, this is equivalent to a - UNIQUE constraint, although an ordinary unique constraint - will be faster. 
However, exclusion constraints can specify - constraints that are more general than simple equality. - For example, you can specify a constraint that - no two rows in the table contain overlapping circles - (see ) by using the - && operator. - - - - Exclusion constraints are implemented using - an index, so each specified operator must be associated with an - appropriate operator class - (see ) for the index access - method index_method. - The operators are required to be commutative. - Each exclude_element - can optionally specify an operator class and/or ordering options; - these are described fully under - . - - - - The access method must support amgettuple (see ); at present this means GIN - cannot be used. Although it's allowed, there is little point in using - B-tree or hash indexes with an exclusion constraint, because this - does nothing that an ordinary unique constraint doesn't do better. - So in practice the access method will always be GiST or - SP-GiST. - - - - The predicate allows you to specify an - exclusion constraint on a subset of the table; internally this creates a - partial index. Note that parentheses are required around the predicate. - - - - In Postgres-XL, exclusion constraints are currently not - supported.The constraint is enforced when both rows map to the same datanode. - But if they go into different datanodes, the constraint is not enforced. - - - - - - REFERENCES reftable [ ( refcolumn ) ] [ MATCH matchtype ] [ ON DELETE action ] [ ON UPDATE action ] (column constraint) - - FOREIGN KEY ( column_name [, ... ] ) - REFERENCES reftable [ ( refcolumn [, ... ] ) ] - [ MATCH matchtype ] - [ ON DELETE action ] - [ ON UPDATE action ] - (table constraint) - - - - These clauses specify a foreign key constraint, which requires - that a group of one or more columns of the new table must only - contain values that match values in the referenced - column(s) of some row of the referenced table. If the refcolumn list is omitted, the - primary key of the reftable - is used. The referenced columns must be the columns of a non-deferrable - unique or primary key constraint in the referenced table. The user - must have REFERENCES permission on the referenced table - (either the whole table, or the specific referenced columns). - Note that foreign key constraints cannot be defined between temporary - tables and permanent tables. - - - - A value inserted into the referencing column(s) is matched against the - values of the referenced table and referenced columns using the - given match type. There are three match types: MATCH - FULL, MATCH PARTIAL, and MATCH - SIMPLE (which is the default). MATCH - FULL will not allow one column of a multicolumn foreign key - to be null unless all foreign key columns are null; if they are all - null, the row is not required to have a match in the referenced table. - MATCH SIMPLE allows any of the foreign key columns - to be null; if any of them are null, the row is not required to have a - match in the referenced table. - MATCH PARTIAL is not yet implemented. - (Of course, NOT NULL constraints can be applied to the - referencing column(s) to prevent these cases from arising.) - - - - In addition, when the data in the referenced columns is changed, - certain actions are performed on the data in this table's - columns. The ON DELETE clause specifies the - action to perform when a referenced row in the referenced table is - being deleted. 
Likewise, the ON UPDATE - clause specifies the action to perform when a referenced column - in the referenced table is being updated to a new value. If the - row is updated, but the referenced column is not actually - changed, no action is done. Referential actions other than the - NO ACTION check cannot be deferred, even if - the constraint is declared deferrable. There are the following possible - actions for each clause: - - - - NO ACTION - - - Produce an error indicating that the deletion or update - would create a foreign key constraint violation. - If the constraint is deferred, this - error will be produced at constraint check time if there still - exist any referencing rows. This is the default action. - - - - - - RESTRICT - - - Produce an error indicating that the deletion or update - would create a foreign key constraint violation. - This is the same as NO ACTION except that - the check is not deferrable. - - - - - - CASCADE - - - Delete any rows referencing the deleted row, or update the - values of the referencing column(s) to the new values of the - referenced columns, respectively. - - - - - - SET NULL - - - Set the referencing column(s) to null. - - - - - - SET DEFAULT - - - Set the referencing column(s) to their default values. - (There must be a row in the referenced table matching the default - values, if they are not null, or the operation will fail.) - - - - - - - - If the referenced column(s) are changed frequently, it might be wise to - add an index to the referencing column(s) so that referential actions - associated with the foreign key constraint can be performed more - efficiently. - - - - - - DEFERRABLE - NOT DEFERRABLE - - - This controls whether the constraint can be deferred. A - constraint that is not deferrable will be checked immediately - after every command. Checking of constraints that are - deferrable can be postponed until the end of the transaction - (using the command). - NOT DEFERRABLE is the default. - Currently, only UNIQUE, PRIMARY KEY, - EXCLUDE, and - REFERENCES (foreign key) constraints accept this - clause. NOT NULL and CHECK constraints are not - deferrable. Note that deferrable constraints cannot be used as - conflict arbitrators in an INSERT statement that - includes an ON CONFLICT DO UPDATE clause. - - - - - - INITIALLY IMMEDIATE - INITIALLY DEFERRED - - - If a constraint is deferrable, this clause specifies the default - time to check the constraint. If the constraint is - INITIALLY IMMEDIATE, it is checked after each - statement. This is the default. If the constraint is - INITIALLY DEFERRED, it is checked only at the - end of the transaction. The constraint check time can be - altered with the command. - - - - - - WITH ( storage_parameter [= value] [, ... ] ) - - - This clause specifies optional storage parameters for a table or index; - see for more - information. The WITH clause for a - table can also include OIDS=TRUE (or just OIDS) - to specify that rows of the new table - should have OIDs (object identifiers) assigned to them, or - OIDS=FALSE to specify that the rows should not have OIDs. - If OIDS is not specified, the default setting depends upon - the configuration parameter. - (If the new table inherits from any tables that have OIDs, then - OIDS=TRUE is forced even if the command says - OIDS=FALSE.) - - - - If OIDS=FALSE is specified or implied, the new - table does not store OIDs and no OID will be assigned for a row inserted - into it. 
This is generally considered worthwhile, since it - will reduce OID consumption and thereby postpone the wraparound - of the 32-bit OID counter. Once the counter wraps around, OIDs - can no longer be assumed to be unique, which makes them - considerably less useful. In addition, excluding OIDs from a - table reduces the space required to store the table on disk by - 4 bytes per row (on most machines), slightly improving performance. - - - - To remove OIDs from a table after it has been created, use . - - - - - - WITH OIDS - WITHOUT OIDS - - - These are obsolescent syntaxes equivalent to WITH (OIDS) - and WITH (OIDS=FALSE), respectively. If you wish to give - both an OIDS setting and storage parameters, you must use - the WITH ( ... ) syntax; see above. - - - - In Postgres-XL, OID is managed locally in each - Datanode and Coordinator. The OID value may be inconsistent for - rows stored in different Datanodes. - - - - - - ON COMMIT - - - The behavior of temporary tables at the end of a transaction - block can be controlled using ON COMMIT. - The three options are: - - - - PRESERVE ROWS - - - No special action is taken at the ends of transactions. - This is the default behavior. - - - - - - DELETE ROWS - - - All rows in the temporary table will be deleted at the end - of each transaction block. Essentially, an automatic is done - at each commit. - - - - - - DROP - - - The temporary table will be dropped at the end of the current - transaction block. - - - - - - - - - TABLESPACE tablespace_name - - - The tablespace_name is the name - of the tablespace in which the new table is to be created. - If not specified, - is consulted, or - if the table is temporary. - - - - - - DISTRIBUTE BY - - - This clause specifies how the table is distributed or replicated among Datanodes. - - - - - - REPLICATION - - - Each row of the table will be replicated to all the - Datanode of the Postgres-XL database - cluster. - - - - - - ROUNDROBIN - - - Each row of the table will be placed in one of the Datanodes - in a round-robin manner. The value of the row will not be - needed to determine what Datanode to go. - - - - - - HASH ( column_name ) - - - Each row of the table will be placed based on the hash value - of the specified column. Following type is allowed as - distribution column: INT8, INT2, OID, INT4, BOOL, INT2VECTOR, - OIDVECTOR, CHAR, NAME, TEXT, BPCHAR, BYTEA, VARCHAR, NUMERIC, - MONEY, ABSTIME, RELTIME, DATE, TIME,TIMESTAMP, TIMESTAMPTZ, - INTERVAL, and TIMETZ. - - - Please note that floating point is not allowed as a basis of - the distribution column. - - - - - - MODULO ( column_name ) - - - Each row of the table will be placed based on the modulo - of the specified column. Following type is allowed as - distribution column: INT8, INT2, INT4, BOOL, ABSTIME, RELTIME, - DATE. - - - Please note that floating point is not allowed as a basis of - the distribution column. - - - - - - - If DISTRIBUTE BY is not specified, columns with - UNIQUE constraint will be chosen as the distribution key. If no - such column is specified, distribution column is the first - eligible column in the definition. If no such column is found, - then the table will be distributed by ROUNDROBIN. - - - - - - - TO GROUP - TO NODE - - - This defines on the list of nodes on which table data exists. - If this is not specified table data is present on all Datanodes. - - - - - - nodename - - - Associated with TO NODE, it defines a - Postgres-XL node of catalog pgxc_node. 
- - - - - - groupname - - - Associated with TO GROUP, it defines a - Postgres-XL node group in catalog pgxc_group. - - - - - - USING INDEX TABLESPACE tablespace_name - - - This clause allows selection of the tablespace in which the index - associated with a UNIQUE, PRIMARY - KEY, or EXCLUDE constraint will be created. - If not specified, - is consulted, or - if the table is temporary. - - - - - - - - Storage Parameters - - - storage parameters - - + + + + + CREATE TABLE + + + + CREATE TABLE + 7 + SQL - Language Statements + + + + CREATE TABLE + define a new table + + + + + CREATE [ [ GLOBAL | LOCAL ] { TEMPORARY | TEMP } | UNLOGGED ] TABLE [ IF NOT EXISTS ] table_name ( [ + { column_name data_type [ COLLATE collation ] [ column_constraint [ ... ] ] + | table_constraint + | LIKE source_table [ like_option ... ] } + [, ... ] + ] ) + [ INHERITS ( parent_table [, ... ] ) ] + [ PARTITION BY { RANGE | LIST | HASH } ( { column_name | ( expression ) } [ COLLATE collation ] [ opclass ] [, ... ] ) ] + [ WITH ( storage_parameter [= value] [, ... ] ) | WITH OIDS | WITHOUT OIDS ] + [ ON COMMIT { PRESERVE ROWS | DELETE ROWS | DROP } ] + [ TABLESPACE tablespace_name ] + + CREATE [ [ GLOBAL | LOCAL ] { TEMPORARY | TEMP } | UNLOGGED ] TABLE [ IF NOT EXISTS ] table_name + OF type_name [ ( + { column_name [ WITH OPTIONS ] [ column_constraint [ ... ] ] + | table_constraint } + [, ... ] + ) ] + [ PARTITION BY { RANGE | LIST | HASH } ( { column_name | ( expression ) } [ COLLATE collation ] [ opclass ] [, ... ] ) ] + [ WITH ( storage_parameter [= value] [, ... ] ) | WITH OIDS | WITHOUT OIDS ] + [ ON COMMIT { PRESERVE ROWS | DELETE ROWS | DROP } ] + [ TABLESPACE tablespace_name ] + + CREATE [ [ GLOBAL | LOCAL ] { TEMPORARY | TEMP } | UNLOGGED ] TABLE [ IF NOT EXISTS ] table_name + PARTITION OF parent_table [ ( + { column_name [ WITH OPTIONS ] [ column_constraint [ ... ] ] + | table_constraint } + [, ... ] + ) ] { FOR VALUES partition_bound_spec | DEFAULT } + [ PARTITION BY { RANGE | LIST | HASH } ( { column_name | ( expression ) } [ COLLATE collation ] [ opclass ] [, ... ] ) ] + [ WITH ( storage_parameter [= value] [, ... ] ) | WITH OIDS | WITHOUT OIDS ] + [ ON COMMIT { PRESERVE ROWS | DELETE ROWS | DROP } ] + [ TABLESPACE tablespace_name ] + + where column_constraint is: + + [ CONSTRAINT constraint_name ] + { NOT NULL | + NULL | + CHECK ( expression ) [ NO INHERIT ] | + DEFAULT default_expr | + GENERATED { ALWAYS | BY DEFAULT } AS IDENTITY [ ( sequence_options ) ] | + UNIQUE index_parameters | + PRIMARY KEY index_parameters | + REFERENCES reftable [ ( refcolumn ) ] [ MATCH FULL | MATCH PARTIAL | MATCH SIMPLE ] + [ ON DELETE action ] [ ON UPDATE action ] } + [ DEFERRABLE | NOT DEFERRABLE ] [ INITIALLY DEFERRED | INITIALLY IMMEDIATE ] + + and table_constraint is: + + [ CONSTRAINT constraint_name ] + { CHECK ( expression ) [ NO INHERIT ] | + UNIQUE ( column_name [, ... ] ) index_parameters | + PRIMARY KEY ( column_name [, ... ] ) index_parameters | + EXCLUDE [ USING index_method ] ( exclude_element WITH operator [, ... ] ) index_parameters [ WHERE ( predicate ) ] | + FOREIGN KEY ( column_name [, ... ] ) REFERENCES reftable [ ( refcolumn [, ... 
] ) ] + [ MATCH FULL | MATCH PARTIAL | MATCH SIMPLE ] [ ON DELETE action ] [ ON UPDATE action ] } + [ DEFERRABLE | NOT DEFERRABLE ] [ INITIALLY DEFERRED | INITIALLY IMMEDIATE ] + + and like_option is: + + { INCLUDING | EXCLUDING } { DEFAULTS | CONSTRAINTS | IDENTITY | INDEXES | STORAGE | COMMENTS | ALL } + + and partition_bound_spec is: + + IN ( { numeric_literal | string_literal | NULL } [, ...] ) | + FROM ( { numeric_literal | string_literal | MINVALUE | MAXVALUE } [, ...] ) + TO ( { numeric_literal | string_literal | MINVALUE | MAXVALUE } [, ...] ) | + WITH ( MODULUS numeric_literal, REMAINDER numeric_literal ) + + index_parameters in UNIQUE, PRIMARY KEY, and EXCLUDE constraints are: + + [ WITH ( storage_parameter [= value] [, ... ] ) ] + [ USING INDEX TABLESPACE tablespace_name ] + + exclude_element in an EXCLUDE constraint is: + + { column_name | ( expression ) } [ opclass ] [ ASC | DESC ] [ NULLS { FIRST | LAST } ] + + + + + + Description + - The WITH clause can specify storage parameters - for tables, and for indexes associated with a UNIQUE, - PRIMARY KEY, or EXCLUDE constraint. - Storage parameters for - indexes are documented in . - The storage parameters currently - available for tables are listed below. For many of these parameters, as - shown, there is an additional parameter with the same name prefixed with - toast., which controls the behavior of the - table's secondary TOAST table, if any - (see for more information about TOAST). - If a table parameter value is set and the - equivalent toast. parameter is not, the TOAST table - will use the table's parameter value. - Specifying these parameters for partitioned tables is not supported, - but you may specify them for individual leaf partitions. + CREATE TABLE will create a new, initially empty table + in the current database. The table will be owned by the user issuing the + command. - - - - - fillfactor (integer) - - - The fillfactor for a table is a percentage between 10 and 100. - 100 (complete packing) is the default. When a smaller fillfactor - is specified, INSERT operations pack table pages only - to the indicated percentage; the remaining space on each page is - reserved for updating rows on that page. This gives UPDATE - a chance to place the updated copy of a row on the same page as the - original, which is more efficient than placing it on a different page. - For a table whose entries are never updated, complete packing is the - best choice, but in heavily updated tables smaller fillfactors are - appropriate. This parameter cannot be set for TOAST tables. - - - - - - parallel_workers (integer) - - - This sets the number of workers that should be used to assist a parallel - scan of this table. If not set, the system will determine a value based - on the relation size. The actual number of workers chosen by the planner - may be less, for example due to - the setting of . - - - - - - autovacuum_enabled, toast.autovacuum_enabled (boolean) - - - Enables or disables the autovacuum daemon for a particular table. - If true, the autovacuum daemon will perform automatic VACUUM - and/or ANALYZE operations on this table following the rules - discussed in . - If false, this table will not be autovacuumed, except to prevent - transaction ID wraparound. See for - more about wraparound prevention. - Note that the autovacuum daemon does not run at all (except to prevent - transaction ID wraparound) if the - parameter is false; setting individual tables' storage parameters does - not override that. 
Therefore there is seldom much point in explicitly - setting this storage parameter to true, only - to false. - - - - - - autovacuum_vacuum_threshold, toast.autovacuum_vacuum_threshold (integer) - - - Per-table value for - parameter. - - - - - - autovacuum_vacuum_scale_factor, toast.autovacuum_vacuum_scale_factor (float4) - - - Per-table value for - parameter. - - - - - - autovacuum_analyze_threshold (integer) - - - Per-table value for - parameter. - - - - - - autovacuum_analyze_scale_factor (float4) - - - Per-table value for - parameter. - - - - - - autovacuum_vacuum_cost_delay, toast.autovacuum_vacuum_cost_delay (integer) - - - Per-table value for - parameter. - - - - - - autovacuum_vacuum_cost_limit, toast.autovacuum_vacuum_cost_limit (integer) - - - Per-table value for - parameter. - - - - - - autovacuum_freeze_min_age, toast.autovacuum_freeze_min_age (integer) - - - Per-table value for - parameter. Note that autovacuum will ignore - per-table autovacuum_freeze_min_age parameters that are - larger than half the - system-wide setting. - - - - - - autovacuum_freeze_max_age, toast.autovacuum_freeze_max_age (integer) - - - Per-table value for - parameter. Note that autovacuum will ignore - per-table autovacuum_freeze_max_age parameters that are - larger than the system-wide setting (it can only be set smaller). - - - - - - autovacuum_freeze_table_age, toast.autovacuum_freeze_table_age (integer) - - - Per-table value for - parameter. - - - - - - autovacuum_multixact_freeze_min_age, toast.autovacuum_multixact_freeze_min_age (integer) - - - Per-table value for - parameter. Note that autovacuum will ignore - per-table autovacuum_multixact_freeze_min_age parameters - that are larger than half the - system-wide - setting. - - - - - - autovacuum_multixact_freeze_max_age, toast.autovacuum_multixact_freeze_max_age (integer) - - - Per-table value - for parameter. - Note that autovacuum will ignore - per-table autovacuum_multixact_freeze_max_age parameters - that are larger than the system-wide setting (it can only be set - smaller). - - - - - - autovacuum_multixact_freeze_table_age, toast.autovacuum_multixact_freeze_table_age (integer) - - - Per-table value - for parameter. - - - - - - log_autovacuum_min_duration, toast.log_autovacuum_min_duration (integer) - - - Per-table value for - parameter. - - - - - - user_catalog_table (boolean) - - - Declare the table as an additional catalog table for purposes of - logical replication. See - for details. - This parameter cannot be set for TOAST tables. - - - - - - - - - - - Notes - - - Using OIDs in new applications is not recommended: where - possible, using an identity column or other sequence - generator as the table's primary key is preferred. However, if - your application does make use of OIDs to identify specific - rows of a table, it is recommended to create a unique constraint - on the oid column of that table, to ensure that - OIDs in the table will indeed uniquely identify rows even after - counter wraparound. Avoid assuming that OIDs are unique across - tables; if you need a database-wide unique identifier, use the - combination of tableoid and row OID for the - purpose. - - - - - The use of OIDS=FALSE is not recommended - for tables with no primary key, since without either an OID or a - unique data key, it is difficult to identify specific rows. - - - - - PostgreSQL automatically creates an - index for each unique constraint and primary key constraint to - enforce uniqueness. 
Thus, it is not necessary to create an - index explicitly for primary key columns. (See for more information.) - - - - Unique constraints and primary keys are not inherited in the - current implementation. This makes the combination of - inheritance and unique constraints rather dysfunctional. - - - - A table cannot have more than 1600 columns. (In practice, the - effective limit is usually lower because of tuple-length constraints.) - - - - - - - Examples - - - Create table films and table - distributors: - - -CREATE TABLE films ( - code char(5) CONSTRAINT firstkey PRIMARY KEY, - title varchar(40) NOT NULL, - did integer NOT NULL, - date_prod date, - kind varchar(10), - len interval hour to minute -); - -CREATE TABLE distributors ( - did integer PRIMARY KEY GENERATED BY DEFAULT AS IDENTITY, - name varchar(40) NOT NULL CHECK (name <> '') -); - - - - - Create a table with a 2-dimensional array: - - -CREATE TABLE array_int ( - vector int[][] -); - - - - - Define a unique table constraint for the table - films. Unique table constraints can be defined - on one or more columns of the table: - - -CREATE TABLE films ( - code char(5), - title varchar(40), - did integer, - date_prod date, - kind varchar(10), - len interval hour to minute, - CONSTRAINT production UNIQUE(date_prod) -); - - - - - Define a check column constraint: - - -CREATE TABLE distributors ( - did integer CHECK (did > 100), - name varchar(40) -); - - - - - Define a check table constraint: - - -CREATE TABLE distributors ( - did integer, - name varchar(40) - CONSTRAINT con1 CHECK (did > 100 AND name <> '') -); - - - - - Define a primary key table constraint for the table - films: - - -CREATE TABLE films ( - code char(5), - title varchar(40), - did integer, - date_prod date, - kind varchar(10), - len interval hour to minute, - CONSTRAINT code_title PRIMARY KEY(code,title) -); - - - - - Define a primary key constraint for table - distributors. 
The following two examples are - equivalent, the first using the table constraint syntax, the second - the column constraint syntax: - - -CREATE TABLE distributors ( - did integer, - name varchar(40), - PRIMARY KEY(did) -); - -CREATE TABLE distributors ( - did integer PRIMARY KEY, - name varchar(40) -); - - - - - Assign a literal constant default value for the column - name, arrange for the default value of column - did to be generated by selecting the next value - of a sequence object, and make the default value of - modtime be the time at which the row is - inserted: - - -CREATE TABLE distributors ( - name varchar(40) DEFAULT 'Luso Films', - did integer DEFAULT nextval('distributors_serial'), - modtime timestamp DEFAULT current_timestamp -); - - - - - Define two NOT NULL column constraints on the table - distributors, one of which is explicitly - given a name: - - -CREATE TABLE distributors ( - did integer CONSTRAINT no_null NOT NULL, - name varchar(40) NOT NULL -); - - - - - Define a unique constraint for the name column: - - -CREATE TABLE distributors ( - did integer, - name varchar(40) UNIQUE -); - - - The same, specified as a table constraint: - - -CREATE TABLE distributors ( - did integer, - name varchar(40), - UNIQUE(name) -); - - - - - Create the same table, specifying 70% fill factor for both the table - and its unique index: - - -CREATE TABLE distributors ( - did integer, - name varchar(40), - UNIQUE(name) WITH (fillfactor=70) -) -WITH (fillfactor=70); - - - - - Create table circles with an exclusion - constraint that prevents any two circles from overlapping: - - -CREATE TABLE circles ( - c circle, - EXCLUDE USING gist (c WITH &&) -); - - - - - Create table cinemas in tablespace diskvol1: - - -CREATE TABLE cinemas ( - id integer, - name text, - location text -) TABLESPACE diskvol1; - - - - - Create a composite type and a typed table: - -CREATE TYPE employee_type AS (name text, salary numeric); - -CREATE TABLE employees OF employee_type ( - PRIMARY KEY (name), - salary WITH OPTIONS DEFAULT 1000 -); - - - - Create a range partitioned table: - -CREATE TABLE measurement ( - logdate date not null, - peaktemp int, - unitsales int -) PARTITION BY RANGE (logdate); - - - - Create a range partitioned table with multiple columns in the partition key: - -CREATE TABLE measurement_year_month ( - logdate date not null, - peaktemp int, - unitsales int -) PARTITION BY RANGE (EXTRACT(YEAR FROM logdate), EXTRACT(MONTH FROM logdate)); - - - - Create a list partitioned table: - -CREATE TABLE cities ( - city_id bigserial not null, - name text not null, - population bigint -) PARTITION BY LIST (left(lower(name), 1)); - - - - Create partition of a range partitioned table: - -CREATE TABLE measurement_y2016m07 - PARTITION OF measurement ( - unitsales DEFAULT 0 -) FOR VALUES FROM ('2016-07-01') TO ('2016-08-01'); - - - - Create a few partitions of a range partitioned table with multiple - columns in the partition key: - -CREATE TABLE measurement_ym_older - PARTITION OF measurement_year_month - FOR VALUES FROM (MINVALUE, 0) TO (2016, 11); - -CREATE TABLE measurement_ym_y2016m11 - PARTITION OF measurement_year_month - FOR VALUES FROM (2016, 11) TO (2016, 12); - -CREATE TABLE measurement_ym_y2016m12 - PARTITION OF measurement_year_month - FOR VALUES FROM (2016, 12) TO (2017, 01); - -CREATE TABLE measurement_ym_y2017m01 - PARTITION OF measurement_year_month - FOR VALUES FROM (2017, 01) TO (2017, 02); - - - - Create partition of a list partitioned table: - -CREATE TABLE cities_ab - PARTITION OF cities ( - 
CONSTRAINT city_id_nonzero CHECK (city_id != 0) -) FOR VALUES IN ('a', 'b'); - - - - Create partition of a list partitioned table that is itself further - partitioned and then add a partition to it: - -CREATE TABLE cities_ab - PARTITION OF cities ( - CONSTRAINT city_id_nonzero CHECK (city_id != 0) -) FOR VALUES IN ('a', 'b') PARTITION BY RANGE (population); - -CREATE TABLE cities_ab_10000_to_100000 - PARTITION OF cities_ab FOR VALUES FROM (10000) TO (100000); - - - - Create a default partition: - -CREATE TABLE cities_partdef - PARTITION OF cities DEFAULT; - - - - - Compatibility - - - The CREATE TABLE command conforms to the - SQL standard, with exceptions listed below. - - - - Temporary Tables - + - Although the syntax of CREATE TEMPORARY TABLE - resembles that of the SQL standard, the effect is not the same. In the - standard, - temporary tables are defined just once and automatically exist (starting - with empty contents) in every session that needs them. - PostgreSQL instead - requires each session to issue its own CREATE TEMPORARY - TABLE command for each temporary table to be used. This allows - different sessions to use the same temporary table name for different - purposes, whereas the standard's approach constrains all instances of a - given temporary table name to have the same table structure. + If a schema name is given (for example, CREATE TABLE + myschema.mytable ...) then the table is created in the specified + schema. Otherwise it is created in the current schema. Temporary + tables exist in a special schema, so a schema name cannot be given + when creating a temporary table. The name of the table must be + distinct from the name of any other table, sequence, index, view, + or foreign table in the same schema. - + - The standard's definition of the behavior of temporary tables is - widely ignored. PostgreSQL's behavior - on this point is similar to that of several other SQL databases. + CREATE TABLE also automatically creates a data + type that represents the composite type corresponding + to one row of the table. Therefore, tables cannot have the same + name as any existing data type in the same schema. - + - The SQL standard also distinguishes between global and local temporary - tables, where a local temporary table has a separate set of contents for - each SQL module within each session, though its definition is still shared - across sessions. Since PostgreSQL does not - support SQL modules, this distinction is not relevant in - PostgreSQL. + The optional constraint clauses specify constraints (tests) that + new or updated rows must satisfy for an insert or update operation + to succeed. A constraint is an SQL object that helps define the + set of valid values in the table in various ways. - + - For compatibility's sake, PostgreSQL will - accept the GLOBAL and LOCAL keywords - in a temporary table declaration, but they currently have no effect. - Use of these keywords is discouraged, since future versions of - PostgreSQL might adopt a more - standard-compliant interpretation of their meaning. + There are two ways to define constraints: table constraints and + column constraints. A column constraint is defined as part of a + column definition. A table constraint definition is not tied to a + particular column, and it can encompass more than one column. + Every column constraint can also be written as a table constraint; + a column constraint is only a notational convenience for use when the + constraint only affects one column. 
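As a minimal illustration of the two constraint forms described above (the table and column names here are hypothetical, not taken from this patch):

CREATE TABLE parts (
    part_no    integer CHECK (part_no > 0),      -- column constraint: refers only to its own column
    name       text NOT NULL,                    -- column constraint
    weight_kg  numeric,
    CHECK (weight_kg IS NULL OR weight_kg >= 0)  -- table constraint: may reference any column(s)
);

Either form is enforced identically; the column form is simply a shorthand for the case where only one column is involved.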
- + - The ON COMMIT clause for temporary tables - also resembles the SQL standard, but has some differences. - If the ON COMMIT clause is omitted, SQL specifies that the - default behavior is ON COMMIT DELETE ROWS. However, the - default behavior in PostgreSQL is - ON COMMIT PRESERVE ROWS. The ON COMMIT - DROP option does not exist in SQL. + To be able to create a table, you must have USAGE + privilege on all column types or the type in the OF + clause, respectively. - - - - Non-deferred Uniqueness Constraints - + + + + Parameters + + + + + TEMPORARY or TEMP + + + If specified, the table is created as a temporary table. + Temporary tables are automatically dropped at the end of a + session, or optionally at the end of the current transaction + (see ON COMMIT below). Existing permanent + tables with the same name are not visible to the current session + while the temporary table exists, unless they are referenced + with schema-qualified names. Any indexes created on a temporary + table are automatically temporary as well. + + + + The autovacuum daemon cannot + access and therefore cannot vacuum or analyze temporary tables. + For this reason, appropriate vacuum and analyze operations should be + performed via session SQL commands. For example, if a temporary + table is going to be used in complex queries, it is wise to run + ANALYZE on the temporary table after it is populated. + + + + Optionally, GLOBAL or LOCAL + can be written before TEMPORARY or TEMP. + This presently makes no difference in PostgreSQL + and is deprecated; see + . + + + + + + UNLOGGED + + + If specified, the table is created as an unlogged table. Data written + to unlogged tables is not written to the write-ahead log (see ), which makes them considerably faster than ordinary + tables. However, they are not crash-safe: an unlogged table is + automatically truncated after a crash or unclean shutdown. The contents + of an unlogged table are also not replicated to standby servers. + Any indexes created on an unlogged table are automatically unlogged as + well. + + + + + + IF NOT EXISTS + + + Do not throw an error if a relation with the same name already exists. + A notice is issued in this case. Note that there is no guarantee that + the existing relation is anything like the one that would have been + created. + + + + + + table_name + + + The name (optionally schema-qualified) of the table to be created. + + + + + + OF type_name + + + Creates a typed table, which takes its + structure from the specified composite type (name optionally + schema-qualified). A typed table is tied to its type; for + example the table will be dropped if the type is dropped + (with DROP TYPE ... CASCADE). + + + + When a typed table is created, then the data types of the + columns are determined by the underlying composite type and are + not specified by the CREATE TABLE command. + But the CREATE TABLE command can add defaults + and constraints to the table and can specify storage parameters. + + + + + + PARTITION OF parent_table { FOR VALUES partition_bound_spec | DEFAULT } + + + Creates the table as a partition of the specified + parent table. The table can be created either as a partition for specific + values using FOR VALUES or as a default partition + using DEFAULT. This option is not available for + hash-partitioned tables. + + + + The partition_bound_spec + must correspond to the partitioning method and partition key of the + parent table, and must not overlap with any existing partition of that + parent. 
The form with IN is used for list partitioning, + the form with FROM and TO is used + for range partitioning, and the form with WITH is used + for hash partitioning. + + + + Each of the values specified in + the partition_bound_spec is + a literal, NULL, MINVALUE, or + MAXVALUE. Each literal value must be either a + numeric constant that is coercible to the corresponding partition key + column's type, or a string literal that is valid input for that type. + + + + When creating a list partition, NULL can be + specified to signify that the partition allows the partition key + column to be null. However, there cannot be more than one such + list partition for a given parent table. NULL + cannot be specified for range partitions. + + + + When creating a range partition, the lower bound specified with + FROM is an inclusive bound, whereas the upper + bound specified with TO is an exclusive bound. + That is, the values specified in the FROM list + are valid values of the corresponding partition key columns for this + partition, whereas those in the TO list are + not. Note that this statement must be understood according to the + rules of row-wise comparison (). + For example, given PARTITION BY RANGE (x,y), a partition + bound FROM (1, 2) TO (3, 4) + allows x=1 with any y>=2, + x=2 with any non-null y, + and x=3 with any y<4. + + + + The special values MINVALUE and MAXVALUE + may be used when creating a range partition to indicate that there + is no lower or upper bound on the column's value. For example, a + partition defined using FROM (MINVALUE) TO (10) allows + any values less than 10, and a partition defined using + FROM (10) TO (MAXVALUE) allows any values greater than + or equal to 10. + + + + When creating a range partition involving more than one column, it + can also make sense to use MAXVALUE as part of the lower + bound, and MINVALUE as part of the upper bound. For + example, a partition defined using + FROM (0, MAXVALUE) TO (10, MAXVALUE) allows any rows + where the first partition key column is greater than 0 and less than + or equal to 10. Similarly, a partition defined using + FROM ('a', MINVALUE) TO ('b', MINVALUE) allows any rows + where the first partition key column starts with "a". + + + + Note that if MINVALUE or MAXVALUE is used for + one column of a partitioning bound, the same value must be used for all + subsequent columns. For example, (10, MINVALUE, 0) is not + a valid bound; you should write (10, MINVALUE, MINVALUE). + + + + Also note that some element types, such as timestamp, + have a notion of "infinity", which is just another value that can + be stored. This is different from MINVALUE and + MAXVALUE, which are not real values that can be stored, + but rather they are ways of saying that the value is unbounded. + MAXVALUE can be thought of as being greater than any + other value, including "infinity" and MINVALUE as being + less than any other value, including "minus infinity". Thus the range + FROM ('infinity') TO (MAXVALUE) is not an empty range; it + allows precisely one value to be stored — "infinity". + + + + If DEFAULT is specified, the table will be + created as a default partition of the parent table. The parent can + either be a list or range partitioned table. A partition key value + not fitting into any other partition of the given parent will be + routed to the default partition. There can be only one default + partition for a given parent table. 
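A small sketch of the bound rules just described, assuming the declarative-partitioning behavior documented here (all object names are illustrative):

CREATE TABLE readings (
    reading_time  timestamp NOT NULL,
    sensor_id     integer,
    value         numeric
) PARTITION BY RANGE (reading_time);

-- FROM is an inclusive bound, TO is an exclusive bound
CREATE TABLE readings_2020
    PARTITION OF readings
    FOR VALUES FROM ('2020-01-01') TO ('2021-01-01');

-- MINVALUE leaves the lower end of the range open
CREATE TABLE readings_old
    PARTITION OF readings
    FOR VALUES FROM (MINVALUE) TO ('2020-01-01');

-- rows that match no other partition are routed to the default partition
CREATE TABLE readings_default
    PARTITION OF readings DEFAULT;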
+ + + + When a table has an existing DEFAULT partition and + a new partition is added to it, the existing default partition must + be scanned to verify that it does not contain any rows which properly + belong in the new partition. If the default partition contains a + large number of rows, this may be slow. The scan will be skipped if + the default partition is a foreign table or if it has a constraint which + proves that it cannot contain rows which should be placed in the new + partition. + + + + When creating a hash partition, a modulus and remainder must be specified. + The modulus must be a positive integer, and the remainder must be a + non-negative integer less than the modulus. Typically, when initially + setting up a hash-partitioned table, you should choose a modulus equal to + the number of partitions and assign every table the same modulus and a + different remainder (see examples, below). However, it is not required + that every partition have the same modulus, only that every modulus which + occurs among the partitions of a hash-partitioned table is a factor of the + next larger modulus. This allows the number of partitions to be increased + incrementally without needing to move all the data at once. For example, + suppose you have a hash-partitioned table with 8 partitions, each of which + has modulus 8, but find it necessary to increase the number of partitions + to 16. You can detach one of the modulus-8 partitions, create two new + modulus-16 partitions covering the same portion of the key space (one with + a remainder equal to the remainder of the detached partition, and the + other with a remainder equal to that value plus 8), and repopulate them + with data. You can then repeat this -- perhaps at a later time -- for + each modulus-8 partition until none remain. While this may still involve + a large amount of data movement at each step, it is still better than + having to create a whole new table and move all the data at once. + + + + A partition must have the same column names and types as the partitioned + table to which it belongs. If the parent is specified WITH + OIDS then all partitions must have OIDs; the parent's OID + column will be inherited by all partitions just like any other column. + Modifications to the column names or types of a partitioned table, or + the addition or removal of an OID column, will automatically propagate + to all partitions. CHECK constraints will be inherited + automatically by every partition, but an individual partition may specify + additional CHECK constraints; additional constraints with + the same name and condition as in the parent will be merged with the + parent constraint. Defaults may be specified separately for each + partition. + + + + Rows inserted into a partitioned table will be automatically routed to + the correct partition. If no suitable partition exists, an error will + occur. Also, if updating a row in a given partition would require it + to move to another partition due to new partition key values, an error + will occur. + + + + Operations such as TRUNCATE which normally affect a table and all of its + inheritance children will cascade to all partitions, but may also be + performed on an individual partition. Note that dropping a partition + with DROP TABLE requires taking an ACCESS + EXCLUSIVE lock on the parent table. + + + + + + column_name + + + The name of a column to be created in the new table. + + + + + + data_type + + + The data type of the column. This can include array + specifiers. 
For more information on the data types supported by + PostgreSQL, refer to . + + + + + + COLLATE collation + + + The COLLATE clause assigns a collation to + the column (which must be of a collatable data type). + If not specified, the column data type's default collation is used. + + + + + + INHERITS ( parent_table [, ... ] ) + + + The optional INHERITS clause specifies a list of + tables from which the new table automatically inherits all + columns. Parent tables can be plain tables or foreign tables. + + + + Use of INHERITS creates a persistent relationship + between the new child table and its parent table(s). Schema + modifications to the parent(s) normally propagate to children + as well, and by default the data of the child table is included in + scans of the parent(s). + + + + If the same column name exists in more than one parent + table, an error is reported unless the data types of the columns + match in each of the parent tables. If there is no conflict, + then the duplicate columns are merged to form a single column in + the new table. If the column name list of the new table + contains a column name that is also inherited, the data type must + likewise match the inherited column(s), and the column + definitions are merged into one. If the + new table explicitly specifies a default value for the column, + this default overrides any defaults from inherited declarations + of the column. Otherwise, any parents that specify default + values for the column must all specify the same default, or an + error will be reported. + + + + CHECK constraints are merged in essentially the same way as + columns: if multiple parent tables and/or the new table definition + contain identically-named CHECK constraints, these + constraints must all have the same check expression, or an error will be + reported. Constraints having the same name and expression will + be merged into one copy. A constraint marked NO INHERIT in a + parent will not be considered. Notice that an unnamed CHECK + constraint in the new table will never be merged, since a unique name + will always be chosen for it. + + + + Column STORAGE settings are also copied from parent tables. + + + + If a column in the parent table is an identity column, that property is + not inherited. A column in the child table can be declared identity + column if desired. + + + + + + PARTITION BY { RANGE | LIST | HASH } ( { column_name | ( expression ) } [ opclass ] [, ...] ) + + + The optional PARTITION BY clause specifies a strategy + of partitioning the table. The table thus created is called a + partitioned table. The parenthesized list of + columns or expressions forms the partition key + for the table. When using range or hash partitioning, the partition key + can include multiple columns or expressions (up to 32, but this limit can + be altered when building PostgreSQL), but for + list partitioning, the partition key must consist of a single column or + expression. + + + + Range and list partitioning require a btree operator class, while hash + partitioning requires a hash operator class. If no operator class is + specified explicitly, the default operator class of the appropriate + type will be used; if no default operator class exists, an error will + be raised. When hash partitioning is used, the operator class used + must implement support function 2 (see + for details). + + + + A partitioned table is divided into sub-tables (called partitions), + which are created using separate CREATE TABLE commands. 
+ The partitioned table is itself empty. A data row inserted into the + table is routed to a partition based on the value of columns or + expressions in the partition key. If no existing partition matches + the values in the new row, an error will be reported. + + + + Partitioned tables do not support UNIQUE, + PRIMARY KEY, EXCLUDE, or + FOREIGN KEY constraints; however, you can define + these constraints on individual partitions. + + + + + + + LIKE source_table [ like_option ... ] + + + The LIKE clause specifies a table from which + the new table automatically copies all column names, their data types, + and their not-null constraints. + + + Unlike INHERITS, the new table and original table + are completely decoupled after creation is complete. Changes to the + original table will not be applied to the new table, and it is not + possible to include data of the new table in scans of the original + table. + + + Default expressions for the copied column definitions will be copied + only if INCLUDING DEFAULTS is specified. The + default behavior is to exclude default expressions, resulting in the + copied columns in the new table having null defaults. + Note that copying defaults that call database-modification functions, + such as nextval, may create a functional linkage between + the original and new tables. + + + Any identity specifications of copied column definitions will only be + copied if INCLUDING IDENTITY is specified. A new + sequence is created for each identity column of the new table, separate + from the sequences associated with the old table. + + + Not-null constraints are always copied to the new table. + CHECK constraints will be copied only if + INCLUDING CONSTRAINTS is specified. + No distinction is made between column constraints and table + constraints. + + + Indexes, PRIMARY KEY, UNIQUE, + and EXCLUDE constraints on the original table will be + created on the new table only if INCLUDING INDEXES + is specified. Names for the new indexes and constraints are + chosen according to the default rules, regardless of how the originals + were named. (This behavior avoids possible duplicate-name failures for + the new indexes.) + + + STORAGE settings for the copied column definitions will be + copied only if INCLUDING STORAGE is specified. The + default behavior is to exclude STORAGE settings, resulting + in the copied columns in the new table having type-specific default + settings. For more on STORAGE settings, see + . + + + Comments for the copied columns, constraints, and indexes + will be copied only if INCLUDING COMMENTS + is specified. The default behavior is to exclude comments, resulting in + the copied columns and constraints in the new table having no comments. + + + INCLUDING ALL is an abbreviated form of + INCLUDING DEFAULTS INCLUDING IDENTITY INCLUDING CONSTRAINTS INCLUDING INDEXES INCLUDING STORAGE INCLUDING COMMENTS. + + + Note that unlike INHERITS, columns and + constraints copied by LIKE are not merged with similarly + named columns and constraints. + If the same name is specified explicitly or in another + LIKE clause, an error is signaled. + + + The LIKE clause can also be used to copy column + definitions from views, foreign tables, or composite types. + Inapplicable options (e.g., INCLUDING INDEXES from + a view) are ignored. + + + + + + CONSTRAINT constraint_name + + + An optional name for a column or table constraint. 
If the + constraint is violated, the constraint name is present in error messages, + so constraint names like col must be positive can be used + to communicate helpful constraint information to client applications. + (Double-quotes are needed to specify constraint names that contain spaces.) + If a constraint name is not specified, the system generates a name. + + + + + + NOT NULL + + + The column is not allowed to contain null values. + + + + + + NULL + + + The column is allowed to contain null values. This is the default. + + + + This clause is only provided for compatibility with + non-standard SQL databases. Its use is discouraged in new + applications. + + + + + + CHECK ( expression ) [ NO INHERIT ] + + + The CHECK clause specifies an expression producing a + Boolean result which new or updated rows must satisfy for an + insert or update operation to succeed. Expressions evaluating + to TRUE or UNKNOWN succeed. Should any row of an insert or + update operation produce a FALSE result, an error exception is + raised and the insert or update does not alter the database. A + check constraint specified as a column constraint should + reference that column's value only, while an expression + appearing in a table constraint can reference multiple columns. + + + + Currently, CHECK expressions cannot contain + subqueries nor refer to variables other than columns of the + current row. The system column tableoid + may be referenced, but not any other system column. + + + + A constraint marked with NO INHERIT will not propagate to + child tables. + + + + When a table has multiple CHECK constraints, + they will be tested for each row in alphabetical order by name, + after checking NOT NULL constraints. + (PostgreSQL versions before 9.5 did not honor any + particular firing order for CHECK constraints.) + + + + + + DEFAULT + default_expr + + + The DEFAULT clause assigns a default data value for + the column whose column definition it appears within. The value + is any variable-free expression (subqueries and cross-references + to other columns in the current table are not allowed). The + data type of the default expression must match the data type of the + column. + + + + The default expression will be used in any insert operation that + does not specify a value for the column. If there is no default + for a column, then the default is null. + + + + + + GENERATED { ALWAYS | BY DEFAULT } AS IDENTITY [ ( sequence_options ) ] + + + This clause creates the column as an identity + column. It will have an implicit sequence attached to it + and the column in new rows will automatically have values from the + sequence assigned to it. + + + + The clauses ALWAYS and BY DEFAULT + determine how the sequence value is given precedence over a + user-specified value in an INSERT statement. + If ALWAYS is specified, a user-specified value is + only accepted if the INSERT statement + specifies OVERRIDING SYSTEM VALUE. If BY + DEFAULT is specified, then the user-specified value takes + precedence. See for details. (In + the COPY command, user-specified values are always + used regardless of this setting.) + + + + The optional sequence_options clause can be + used to override the options of the sequence. + See for details. + + + + + + UNIQUE (column constraint) + UNIQUE ( column_name [, ... ] ) (table constraint) + + + + The UNIQUE constraint specifies that a + group of one or more columns of a table can contain + only unique values. 
The behavior of the unique table constraint + is the same as that for column constraints, with the additional + capability to span multiple columns. + + + + For the purpose of a unique constraint, null values are not + considered equal. + + + + Each unique table constraint must name a set of columns that is + different from the set of columns named by any other unique or + primary key constraint defined for the table. (Otherwise it + would just be the same constraint listed twice.) + + + + + + PRIMARY KEY (column constraint) + PRIMARY KEY ( column_name [, ... ] ) (table constraint) + + + The PRIMARY KEY constraint specifies that a column or + columns of a table can contain only unique (non-duplicate), nonnull + values. Only one primary key can be specified for a table, whether as a + column constraint or a table constraint. + + + + The primary key constraint should name a set of columns that is + different from the set of columns named by any unique + constraint defined for the same table. (Otherwise, the unique + constraint is redundant and will be discarded.) + + + + PRIMARY KEY enforces the same data constraints as + a combination of UNIQUE and NOT NULL, but + identifying a set of columns as the primary key also provides metadata + about the design of the schema, since a primary key implies that other + tables can rely on this set of columns as a unique identifier for rows. + + + + + + EXCLUDE [ USING index_method ] ( exclude_element WITH operator [, ... ] ) index_parameters [ WHERE ( predicate ) ] + + + The EXCLUDE clause defines an exclusion + constraint, which guarantees that if + any two rows are compared on the specified column(s) or + expression(s) using the specified operator(s), not all of these + comparisons will return TRUE. If all of the + specified operators test for equality, this is equivalent to a + UNIQUE constraint, although an ordinary unique constraint + will be faster. However, exclusion constraints can specify + constraints that are more general than simple equality. + For example, you can specify a constraint that + no two rows in the table contain overlapping circles + (see ) by using the + && operator. + + + + Exclusion constraints are implemented using + an index, so each specified operator must be associated with an + appropriate operator class + (see ) for the index access + method index_method. + The operators are required to be commutative. + Each exclude_element + can optionally specify an operator class and/or ordering options; + these are described fully under + . + + + + The access method must support amgettuple (see ); at present this means GIN + cannot be used. Although it's allowed, there is little point in using + B-tree or hash indexes with an exclusion constraint, because this + does nothing that an ordinary unique constraint doesn't do better. + So in practice the access method will always be GiST or + SP-GiST. + + + + The predicate allows you to specify an + exclusion constraint on a subset of the table; internally this creates a + partial index. Note that parentheses are required around the predicate. + + + + + + REFERENCES reftable [ ( refcolumn ) ] [ MATCH matchtype ] [ ON DELETE action ] [ ON UPDATE action ] (column constraint) + + FOREIGN KEY ( column_name [, ... ] ) + REFERENCES reftable [ ( refcolumn [, ... 
] ) ] + [ MATCH matchtype ] + [ ON DELETE action ] + [ ON UPDATE action ] + (table constraint) + + + + These clauses specify a foreign key constraint, which requires + that a group of one or more columns of the new table must only + contain values that match values in the referenced + column(s) of some row of the referenced table. If the refcolumn list is omitted, the + primary key of the reftable + is used. The referenced columns must be the columns of a non-deferrable + unique or primary key constraint in the referenced table. The user + must have REFERENCES permission on the referenced table + (either the whole table, or the specific referenced columns). + Note that foreign key constraints cannot be defined between temporary + tables and permanent tables. + + + + A value inserted into the referencing column(s) is matched against the + values of the referenced table and referenced columns using the + given match type. There are three match types: MATCH + FULL, MATCH PARTIAL, and MATCH + SIMPLE (which is the default). MATCH + FULL will not allow one column of a multicolumn foreign key + to be null unless all foreign key columns are null; if they are all + null, the row is not required to have a match in the referenced table. + MATCH SIMPLE allows any of the foreign key columns + to be null; if any of them are null, the row is not required to have a + match in the referenced table. + MATCH PARTIAL is not yet implemented. + (Of course, NOT NULL constraints can be applied to the + referencing column(s) to prevent these cases from arising.) + + + + In addition, when the data in the referenced columns is changed, + certain actions are performed on the data in this table's + columns. The ON DELETE clause specifies the + action to perform when a referenced row in the referenced table is + being deleted. Likewise, the ON UPDATE + clause specifies the action to perform when a referenced column + in the referenced table is being updated to a new value. If the + row is updated, but the referenced column is not actually + changed, no action is done. Referential actions other than the + NO ACTION check cannot be deferred, even if + the constraint is declared deferrable. There are the following possible + actions for each clause: + + + + NO ACTION + + + Produce an error indicating that the deletion or update + would create a foreign key constraint violation. + If the constraint is deferred, this + error will be produced at constraint check time if there still + exist any referencing rows. This is the default action. + + + + + + RESTRICT + + + Produce an error indicating that the deletion or update + would create a foreign key constraint violation. + This is the same as NO ACTION except that + the check is not deferrable. + + + + + + CASCADE + + + Delete any rows referencing the deleted row, or update the + values of the referencing column(s) to the new values of the + referenced columns, respectively. + + + + + + SET NULL + + + Set the referencing column(s) to null. + + + + + + SET DEFAULT + + + Set the referencing column(s) to their default values. + (There must be a row in the referenced table matching the default + values, if they are not null, or the operation will fail.) + + + + + + + + If the referenced column(s) are changed frequently, it might be wise to + add an index to the referencing column(s) so that referential actions + associated with the foreign key constraint can be performed more + efficiently. 
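A brief sketch of a foreign key with referential actions, together with the index on the referencing column suggested above (names are illustrative only):

CREATE TABLE customers (
    customer_id  integer PRIMARY KEY,
    name         text NOT NULL
);

CREATE TABLE orders (
    order_id     integer PRIMARY KEY,
    customer_id  integer REFERENCES customers (customer_id)
                         ON DELETE CASCADE      -- deleting a customer removes its orders
                         ON UPDATE NO ACTION,
    total        numeric CHECK (total >= 0)
);

-- an index on the referencing column makes the cascaded delete/update lookups cheap
CREATE INDEX orders_customer_id_idx ON orders (customer_id);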
+ + + + + + DEFERRABLE + NOT DEFERRABLE + + + This controls whether the constraint can be deferred. A + constraint that is not deferrable will be checked immediately + after every command. Checking of constraints that are + deferrable can be postponed until the end of the transaction + (using the command). + NOT DEFERRABLE is the default. + Currently, only UNIQUE, PRIMARY KEY, + EXCLUDE, and + REFERENCES (foreign key) constraints accept this + clause. NOT NULL and CHECK constraints are not + deferrable. Note that deferrable constraints cannot be used as + conflict arbitrators in an INSERT statement that + includes an ON CONFLICT DO UPDATE clause. + + + + + + INITIALLY IMMEDIATE + INITIALLY DEFERRED + + + If a constraint is deferrable, this clause specifies the default + time to check the constraint. If the constraint is + INITIALLY IMMEDIATE, it is checked after each + statement. This is the default. If the constraint is + INITIALLY DEFERRED, it is checked only at the + end of the transaction. The constraint check time can be + altered with the command. + + + + + + WITH ( storage_parameter [= value] [, ... ] ) + + + This clause specifies optional storage parameters for a table or index; + see for more + information. The WITH clause for a + table can also include OIDS=TRUE (or just OIDS) + to specify that rows of the new table + should have OIDs (object identifiers) assigned to them, or + OIDS=FALSE to specify that the rows should not have OIDs. + If OIDS is not specified, the default setting depends upon + the configuration parameter. + (If the new table inherits from any tables that have OIDs, then + OIDS=TRUE is forced even if the command says + OIDS=FALSE.) + + + + If OIDS=FALSE is specified or implied, the new + table does not store OIDs and no OID will be assigned for a row inserted + into it. This is generally considered worthwhile, since it + will reduce OID consumption and thereby postpone the wraparound + of the 32-bit OID counter. Once the counter wraps around, OIDs + can no longer be assumed to be unique, which makes them + considerably less useful. In addition, excluding OIDs from a + table reduces the space required to store the table on disk by + 4 bytes per row (on most machines), slightly improving performance. + + + + To remove OIDs from a table after it has been created, use . + + + + + + WITH OIDS + WITHOUT OIDS + + + These are obsolescent syntaxes equivalent to WITH (OIDS) + and WITH (OIDS=FALSE), respectively. If you wish to give + both an OIDS setting and storage parameters, you must use + the WITH ( ... ) syntax; see above. + + + + + + ON COMMIT + + + The behavior of temporary tables at the end of a transaction + block can be controlled using ON COMMIT. + The three options are: + + + + PRESERVE ROWS + + + No special action is taken at the ends of transactions. + This is the default behavior. + + + + + + DELETE ROWS + + + All rows in the temporary table will be deleted at the end + of each transaction block. Essentially, an automatic is done + at each commit. + + + + + + DROP + + + The temporary table will be dropped at the end of the current + transaction block. + + + + + + + + + TABLESPACE tablespace_name + + + The tablespace_name is the name + of the tablespace in which the new table is to be created. + If not specified, + is consulted, or + if the table is temporary. 
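For illustration of the ON COMMIT and TABLESPACE clauses covered above (the tablespace name is hypothetical and must already exist):

-- a temporary scratch table whose rows are cleared at every commit
CREATE TEMPORARY TABLE session_scratch (
    key    text,
    value  text
) ON COMMIT DELETE ROWS;

-- place a table in a specific tablespace instead of the default one
CREATE TABLE archive_data (
    id       bigint,
    payload  text
) TABLESPACE archive_space;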
+ + + + + + USING INDEX TABLESPACE tablespace_name + + + This clause allows selection of the tablespace in which the index + associated with a UNIQUE, PRIMARY + KEY, or EXCLUDE constraint will be created. + If not specified, + is consulted, or + if the table is temporary. + + + + + + + + Storage Parameters + + + storage parameters + + + + The WITH clause can specify storage parameters + for tables, and for indexes associated with a UNIQUE, + PRIMARY KEY, or EXCLUDE constraint. + Storage parameters for + indexes are documented in . + The storage parameters currently + available for tables are listed below. For many of these parameters, as + shown, there is an additional parameter with the same name prefixed with + toast., which controls the behavior of the + table's secondary TOAST table, if any + (see for more information about TOAST). + If a table parameter value is set and the + equivalent toast. parameter is not, the TOAST table + will use the table's parameter value. + Specifying these parameters for partitioned tables is not supported, + but you may specify them for individual leaf partitions. + + + + + + fillfactor (integer) + + + The fillfactor for a table is a percentage between 10 and 100. + 100 (complete packing) is the default. When a smaller fillfactor + is specified, INSERT operations pack table pages only + to the indicated percentage; the remaining space on each page is + reserved for updating rows on that page. This gives UPDATE + a chance to place the updated copy of a row on the same page as the + original, which is more efficient than placing it on a different page. + For a table whose entries are never updated, complete packing is the + best choice, but in heavily updated tables smaller fillfactors are + appropriate. This parameter cannot be set for TOAST tables. + + + + + + parallel_workers (integer) + + + This sets the number of workers that should be used to assist a parallel + scan of this table. If not set, the system will determine a value based + on the relation size. The actual number of workers chosen by the planner + may be less, for example due to + the setting of . + + + + + + autovacuum_enabled, toast.autovacuum_enabled (boolean) + + + Enables or disables the autovacuum daemon for a particular table. + If true, the autovacuum daemon will perform automatic VACUUM + and/or ANALYZE operations on this table following the rules + discussed in . + If false, this table will not be autovacuumed, except to prevent + transaction ID wraparound. See for + more about wraparound prevention. + Note that the autovacuum daemon does not run at all (except to prevent + transaction ID wraparound) if the + parameter is false; setting individual tables' storage parameters does + not override that. Therefore there is seldom much point in explicitly + setting this storage parameter to true, only + to false. + + + + + + autovacuum_vacuum_threshold, toast.autovacuum_vacuum_threshold (integer) + + + Per-table value for + parameter. + + + + + + autovacuum_vacuum_scale_factor, toast.autovacuum_vacuum_scale_factor (float4) + + + Per-table value for + parameter. + + + + + + autovacuum_analyze_threshold (integer) + + + Per-table value for + parameter. + + + + + + autovacuum_analyze_scale_factor (float4) + + + Per-table value for + parameter. + + + + + + autovacuum_vacuum_cost_delay, toast.autovacuum_vacuum_cost_delay (integer) + + + Per-table value for + parameter. + + + + + + autovacuum_vacuum_cost_limit, toast.autovacuum_vacuum_cost_limit (integer) + + + Per-table value for + parameter. 
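+
+   For example, a heavily updated table might reserve free space on each
+   page and tighten its autovacuum thresholds (the values shown are
+   illustrative only):
+
+CREATE TABLE counters_demo (
+    id   integer PRIMARY KEY,
+    hits bigint
+) WITH (fillfactor = 70,
+        autovacuum_vacuum_threshold = 500,
+        autovacuum_vacuum_scale_factor = 0.05);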
+ + + + + + autovacuum_freeze_min_age, toast.autovacuum_freeze_min_age (integer) + + + Per-table value for + parameter. Note that autovacuum will ignore + per-table autovacuum_freeze_min_age parameters that are + larger than half the + system-wide setting. + + + + + + autovacuum_freeze_max_age, toast.autovacuum_freeze_max_age (integer) + + + Per-table value for + parameter. Note that autovacuum will ignore + per-table autovacuum_freeze_max_age parameters that are + larger than the system-wide setting (it can only be set smaller). + + + + + + autovacuum_freeze_table_age, toast.autovacuum_freeze_table_age (integer) + + + Per-table value for + parameter. + + + + + + autovacuum_multixact_freeze_min_age, toast.autovacuum_multixact_freeze_min_age (integer) + + + Per-table value for + parameter. Note that autovacuum will ignore + per-table autovacuum_multixact_freeze_min_age parameters + that are larger than half the + system-wide + setting. + + + + + + autovacuum_multixact_freeze_max_age, toast.autovacuum_multixact_freeze_max_age (integer) + + + Per-table value + for parameter. + Note that autovacuum will ignore + per-table autovacuum_multixact_freeze_max_age parameters + that are larger than the system-wide setting (it can only be set + smaller). + + + + + + autovacuum_multixact_freeze_table_age, toast.autovacuum_multixact_freeze_table_age (integer) + + + Per-table value + for parameter. + + + + + + log_autovacuum_min_duration, toast.log_autovacuum_min_duration (integer) + + + Per-table value for + parameter. + + + + + + user_catalog_table (boolean) + + + Declare the table as an additional catalog table for purposes of + logical replication. See + for details. + This parameter cannot be set for TOAST tables. + + + + + + + + + + + Notes + + + Using OIDs in new applications is not recommended: where + possible, using an identity column or other sequence + generator as the table's primary key is preferred. However, if + your application does make use of OIDs to identify specific + rows of a table, it is recommended to create a unique constraint + on the oid column of that table, to ensure that + OIDs in the table will indeed uniquely identify rows even after + counter wraparound. Avoid assuming that OIDs are unique across + tables; if you need a database-wide unique identifier, use the + combination of tableoid and row OID for the + purpose. + + + + + The use of OIDS=FALSE is not recommended + for tables with no primary key, since without either an OID or a + unique data key, it is difficult to identify specific rows. + + + + + PostgreSQL automatically creates an + index for each unique constraint and primary key constraint to + enforce uniqueness. Thus, it is not necessary to create an + index explicitly for primary key columns. (See for more information.) + + + + Unique constraints and primary keys are not inherited in the + current implementation. This makes the combination of + inheritance and unique constraints rather dysfunctional. + + + + A table cannot have more than 1600 columns. (In practice, the + effective limit is usually lower because of tuple-length constraints.) + + + + + + + Examples + - When a UNIQUE or PRIMARY KEY constraint is - not deferrable, PostgreSQL checks for - uniqueness immediately whenever a row is inserted or modified. - The SQL standard says that uniqueness should be enforced only at - the end of the statement; this makes a difference when, for example, - a single command updates multiple key values. 
To obtain - standard-compliant behavior, declare the constraint as - DEFERRABLE but not deferred (i.e., INITIALLY - IMMEDIATE). Be aware that this can be significantly slower than - immediate uniqueness checking. + Create table films and table + distributors: + + + CREATE TABLE films ( + code char(5) CONSTRAINT firstkey PRIMARY KEY, + title varchar(40) NOT NULL, + did integer NOT NULL, + date_prod date, + kind varchar(10), + len interval hour to minute + ); + + CREATE TABLE distributors ( + did integer PRIMARY KEY GENERATED BY DEFAULT AS IDENTITY, + name varchar(40) NOT NULL CHECK (name <> '') + ); + - - - - Column Check Constraints - + - The SQL standard says that CHECK column constraints - can only refer to the column they apply to; only CHECK - table constraints can refer to multiple columns. - PostgreSQL does not enforce this - restriction; it treats column and table check constraints alike. + Create a table with a 2-dimensional array: + + + CREATE TABLE array_int ( + vector int[][] + ); + - - - - <literal>EXCLUDE</literal> Constraint - + - The EXCLUDE constraint type is a - PostgreSQL extension. + Define a unique table constraint for the table + films. Unique table constraints can be defined + on one or more columns of the table: + + + CREATE TABLE films ( + code char(5), + title varchar(40), + did integer, + date_prod date, + kind varchar(10), + len interval hour to minute, + CONSTRAINT production UNIQUE(date_prod) + ); + - - - - <literal>NULL</literal> <quote>Constraint</quote> - + - The NULL constraint (actually a - non-constraint) is a PostgreSQL - extension to the SQL standard that is included for compatibility with some - other database systems (and for symmetry with the NOT - NULL constraint). Since it is the default for any - column, its presence is simply noise. + Define a check column constraint: + + + CREATE TABLE distributors ( + did integer CHECK (did > 100), + name varchar(40) + ); + - - - - Inheritance - + - Multiple inheritance via the INHERITS clause is - a PostgreSQL language extension. - SQL:1999 and later define single inheritance using a - different syntax and different semantics. SQL:1999-style - inheritance is not yet supported by - PostgreSQL. + Define a check table constraint: + + + CREATE TABLE distributors ( + did integer, + name varchar(40) + CONSTRAINT con1 CHECK (did > 100 AND name <> '') + ); + - - - - Zero-column Tables - + - PostgreSQL allows a table of no columns - to be created (for example, CREATE TABLE foo();). This - is an extension from the SQL standard, which does not allow zero-column - tables. Zero-column tables are not in themselves very useful, but - disallowing them creates odd special cases for ALTER TABLE - DROP COLUMN, so it seems cleaner to ignore this spec restriction. + Define a primary key table constraint for the table + films: + + + CREATE TABLE films ( + code char(5), + title varchar(40), + did integer, + date_prod date, + kind varchar(10), + len interval hour to minute, + CONSTRAINT code_title PRIMARY KEY(code,title) + ); + - - - - Multiple Identity Columns - + - PostgreSQL allows a table to have more than one - identity column. The standard specifies that a table can have at most one - identity column. This is relaxed mainly to give more flexibility for - doing schema changes or migrations. Note that - the INSERT command supports only one override clause - that applies to the entire statement, so having multiple identity columns - with different behaviors is not well supported. 
+ Define a primary key constraint for table + distributors. The following two examples are + equivalent, the first using the table constraint syntax, the second + the column constraint syntax: + + + CREATE TABLE distributors ( + did integer, + name varchar(40), + PRIMARY KEY(did) + ); + + CREATE TABLE distributors ( + did integer PRIMARY KEY, + name varchar(40) + ); + - - - - <literal>LIKE</> Clause - + - While a LIKE clause exists in the SQL standard, many of the - options that PostgreSQL accepts for it are not - in the standard, and some of the standard's options are not implemented - by PostgreSQL. + Assign a literal constant default value for the column + name, arrange for the default value of column + did to be generated by selecting the next value + of a sequence object, and make the default value of + modtime be the time at which the row is + inserted: + + + CREATE TABLE distributors ( + name varchar(40) DEFAULT 'Luso Films', + did integer DEFAULT nextval('distributors_serial'), + modtime timestamp DEFAULT current_timestamp + ); + - - - - <literal>WITH</> Clause - + - The WITH clause is a PostgreSQL - extension; neither storage parameters nor OIDs are in the standard. + Define two NOT NULL column constraints on the table + distributors, one of which is explicitly + given a name: + + + CREATE TABLE distributors ( + did integer CONSTRAINT no_null NOT NULL, + name varchar(40) NOT NULL + ); + + + + + Define a unique constraint for the name column: + + + CREATE TABLE distributors ( + did integer, + name varchar(40) UNIQUE + ); + + + The same, specified as a table constraint: + + + CREATE TABLE distributors ( + did integer, + name varchar(40), + UNIQUE(name) + ); + - - - - Tablespaces - + - The PostgreSQL concept of tablespaces is not - part of the standard. Hence, the clauses TABLESPACE - and USING INDEX TABLESPACE are extensions. + Create the same table, specifying 70% fill factor for both the table + and its unique index: + + + CREATE TABLE distributors ( + did integer, + name varchar(40), + UNIQUE(name) WITH (fillfactor=70) + ) + WITH (fillfactor=70); + - - - - Typed Tables - + - Typed tables implement a subset of the SQL standard. According to - the standard, a typed table has columns corresponding to the - underlying composite type as well as one other column that is - the self-referencing column. PostgreSQL does not - support these self-referencing columns explicitly, but the same - effect can be had using the OID feature. + Create table circles with an exclusion + constraint that prevents any two circles from overlapping: + + + CREATE TABLE circles ( + c circle, + EXCLUDE USING gist (c WITH &&) + ); + - - - - <literal>PARTITION BY</> Clause - + - The PARTITION BY clause is a - PostgreSQL extension. + Create table cinemas in tablespace diskvol1: + + + CREATE TABLE cinemas ( + id serial, + name text, + location text + ) TABLESPACE diskvol1; + - - - - <literal>PARTITION OF</> Clause - + - The PARTITION OF clause is a - PostgreSQL extension. - - - - - <productname>Postgres-XL</> Specifics - + Create a composite type and a typed table: + + CREATE TYPE employee_type AS (name text, salary numeric); + + CREATE TABLE employees OF employee_type ( + PRIMARY KEY (name), + salary WITH OPTIONS DEFAULT 1000 + ); + + - Currently, immutable, stable, volatile functions and nextval are allowed in DEFAULT clause. - as DEFAULT values. 
- + Create a range partitioned table: + + CREATE TABLE measurement ( + logdate date not null, + peaktemp int, + unitsales int + ) PARTITION BY RANGE (logdate); + + - PRIMARY KEY and foreign key must include the - distribution column. - + Create a range partitioned table with multiple columns in the partition key: + + CREATE TABLE measurement_year_month ( + logdate date not null, + peaktemp int, + unitsales int + ) PARTITION BY RANGE (EXTRACT(YEAR FROM logdate), EXTRACT(MONTH FROM logdate)); + + - TEMP tables and exclusion constraint are not supported - yet. - + Create a list partitioned table: + + CREATE TABLE cities ( + city_id bigserial not null, + name text not null, + population bigint + ) PARTITION BY LIST (left(lower(name), 1)); + + - + Create a hash partitioned table: + + CREATE TABLE orders ( + order_id bigint not null, + cust_id bigint not null, + status text + ) PARTITION BY HASH (order_id); + + - In Postgres-XL, OID is maintained locally in each - Datanode and Coordinator. The OID value may be inconsistent for rows - stored in different Datanodes. + Create partition of a range partitioned table: + + CREATE TABLE measurement_y2016m07 + PARTITION OF measurement ( + unitsales DEFAULT 0 + ) FOR VALUES FROM ('2016-07-01') TO ('2016-08-01'); + + + + Create a few partitions of a range partitioned table with multiple + columns in the partition key: + + CREATE TABLE measurement_ym_older + PARTITION OF measurement_year_month + FOR VALUES FROM (MINVALUE, MINVALUE) TO (2016, 11); + + CREATE TABLE measurement_ym_y2016m11 + PARTITION OF measurement_year_month + FOR VALUES FROM (2016, 11) TO (2016, 12); + + CREATE TABLE measurement_ym_y2016m12 + PARTITION OF measurement_year_month + FOR VALUES FROM (2016, 12) TO (2017, 01); + + CREATE TABLE measurement_ym_y2017m01 + PARTITION OF measurement_year_month + FOR VALUES FROM (2017, 01) TO (2017, 02); + + + + Create partition of a list partitioned table: + + CREATE TABLE cities_ab + PARTITION OF cities ( + CONSTRAINT city_id_nonzero CHECK (city_id != 0) + ) FOR VALUES IN ('a', 'b'); + + + + Create partition of a list partitioned table that is itself further + partitioned and then add a partition to it: + + CREATE TABLE cities_ab + PARTITION OF cities ( + CONSTRAINT city_id_nonzero CHECK (city_id != 0) + ) FOR VALUES IN ('a', 'b') PARTITION BY RANGE (population); + + CREATE TABLE cities_ab_10000_to_100000 + PARTITION OF cities_ab FOR VALUES FROM (10000) TO (100000); + + + + Create partitions of a hash partitioned table: + + CREATE TABLE orders_p1 PARTITION OF orders + FOR VALUES WITH (MODULUS 4, REMAINDER 0); + CREATE TABLE orders_p2 PARTITION OF orders + FOR VALUES WITH (MODULUS 4, REMAINDER 1); + CREATE TABLE orders_p3 PARTITION OF orders + FOR VALUES WITH (MODULUS 4, REMAINDER 2); + CREATE TABLE orders_p4 PARTITION OF orders + FOR VALUES WITH (MODULUS 4, REMAINDER 3); + + + + Create a default partition: + + CREATE TABLE cities_partdef + PARTITION OF cities DEFAULT; + + + + + Compatibility + + + The CREATE TABLE command conforms to the + SQL standard, with exceptions listed below. - - - - - - - See Also - - - - - - - - - - + + + Temporary Tables + + + Although the syntax of CREATE TEMPORARY TABLE + resembles that of the SQL standard, the effect is not the same. In the + standard, + temporary tables are defined just once and automatically exist (starting + with empty contents) in every session that needs them. + PostgreSQL instead + requires each session to issue its own CREATE TEMPORARY + TABLE command for each temporary table to be used. 
This allows + different sessions to use the same temporary table name for different + purposes, whereas the standard's approach constrains all instances of a + given temporary table name to have the same table structure. + + + + The standard's definition of the behavior of temporary tables is + widely ignored. PostgreSQL's behavior + on this point is similar to that of several other SQL databases. + + + + The SQL standard also distinguishes between global and local temporary + tables, where a local temporary table has a separate set of contents for + each SQL module within each session, though its definition is still shared + across sessions. Since PostgreSQL does not + support SQL modules, this distinction is not relevant in + PostgreSQL. + + + + For compatibility's sake, PostgreSQL will + accept the GLOBAL and LOCAL keywords + in a temporary table declaration, but they currently have no effect. + Use of these keywords is discouraged, since future versions of + PostgreSQL might adopt a more + standard-compliant interpretation of their meaning. + + + + The ON COMMIT clause for temporary tables + also resembles the SQL standard, but has some differences. + If the ON COMMIT clause is omitted, SQL specifies that the + default behavior is ON COMMIT DELETE ROWS. However, the + default behavior in PostgreSQL is + ON COMMIT PRESERVE ROWS. The ON COMMIT + DROP option does not exist in SQL. + + + + + Non-deferred Uniqueness Constraints + + + When a UNIQUE or PRIMARY KEY constraint is + not deferrable, PostgreSQL checks for + uniqueness immediately whenever a row is inserted or modified. + The SQL standard says that uniqueness should be enforced only at + the end of the statement; this makes a difference when, for example, + a single command updates multiple key values. To obtain + standard-compliant behavior, declare the constraint as + DEFERRABLE but not deferred (i.e., INITIALLY + IMMEDIATE). Be aware that this can be significantly slower than + immediate uniqueness checking. + + + + + Column Check Constraints + + + The SQL standard says that CHECK column constraints + can only refer to the column they apply to; only CHECK + table constraints can refer to multiple columns. + PostgreSQL does not enforce this + restriction; it treats column and table check constraints alike. + + + + + <literal>EXCLUDE</literal> Constraint + + + The EXCLUDE constraint type is a + PostgreSQL extension. + + + + + <literal>NULL</literal> <quote>Constraint</quote> + + + The NULL constraint (actually a + non-constraint) is a PostgreSQL + extension to the SQL standard that is included for compatibility with some + other database systems (and for symmetry with the NOT + NULL constraint). Since it is the default for any + column, its presence is simply noise. + + + + + Inheritance + + + Multiple inheritance via the INHERITS clause is + a PostgreSQL language extension. + SQL:1999 and later define single inheritance using a + different syntax and different semantics. SQL:1999-style + inheritance is not yet supported by + PostgreSQL. + + + + + Zero-column Tables + + + PostgreSQL allows a table of no columns + to be created (for example, CREATE TABLE foo();). This + is an extension from the SQL standard, which does not allow zero-column + tables. Zero-column tables are not in themselves very useful, but + disallowing them creates odd special cases for ALTER TABLE + DROP COLUMN, so it seems cleaner to ignore this spec restriction. 
+ + + + + Multiple Identity Columns + + + PostgreSQL allows a table to have more than one + identity column. The standard specifies that a table can have at most one + identity column. This is relaxed mainly to give more flexibility for + doing schema changes or migrations. Note that + the INSERT command supports only one override clause + that applies to the entire statement, so having multiple identity columns + with different behaviors is not well supported. + + + + + <literal>LIKE</literal> Clause + + + While a LIKE clause exists in the SQL standard, many of the + options that PostgreSQL accepts for it are not + in the standard, and some of the standard's options are not implemented + by PostgreSQL. + + + + + <literal>WITH</literal> Clause + + + The WITH clause is a PostgreSQL + extension; neither storage parameters nor OIDs are in the standard. + + + + + Tablespaces + + + The PostgreSQL concept of tablespaces is not + part of the standard. Hence, the clauses TABLESPACE + and USING INDEX TABLESPACE are extensions. + + + + + Typed Tables + + + Typed tables implement a subset of the SQL standard. According to + the standard, a typed table has columns corresponding to the + underlying composite type as well as one other column that is + the self-referencing column. PostgreSQL does not + support these self-referencing columns explicitly, but the same + effect can be had using the OID feature. + + + + + <literal>PARTITION BY</literal> Clause + + + The PARTITION BY clause is a + PostgreSQL extension. + + + + + <literal>PARTITION OF</literal> Clause + + + The PARTITION OF clause is a + PostgreSQL extension. + + + + + + + + See Also + + + + + + + + + + diff --git a/src/backend/catalog/partition.c b/src/backend/catalog/partition.c index b498716e..9832a333 100644 --- a/src/backend/catalog/partition.c +++ b/src/backend/catalog/partition.c @@ -15,6 +15,7 @@ #include "postgres.h" +#include "access/hash.h" #include "access/heapam.h" #include "access/htup_details.h" #include "access/nbtree.h" @@ -46,6 +47,7 @@ #include "utils/datum.h" #include "utils/memutils.h" #include "utils/fmgroids.h" +#include "utils/hashutils.h" #include "utils/inval.h" #include "utils/lsyscache.h" #include "utils/rel.h" @@ -61,26 +63,35 @@ * In the case of range partitioning, ndatums will typically be far less than * 2 * nparts, because a partition's upper bound and the next partition's lower * bound are the same in most common cases, and we only store one of them (the - * upper bound). + * upper bound). In case of hash partitioning, ndatums will be same as the + * number of partitions. + * + * For range and list partitioned tables, datums is an array of datum-tuples + * with key->partnatts datums each. For hash partitioned tables, it is an array + * of datum-tuples with 2 datums, modulus and remainder, corresponding to a + * given partition. * * In the case of list partitioning, the indexes array stores one entry for * every datum, which is the index of the partition that accepts a given datum. * In case of range partitioning, it stores one entry per distinct range * datum, which is the index of the partition for which a given datum - * is an upper bound. + * is an upper bound. In the case of hash partitioning, the number of the + * entries in the indexes array is same as the greatest modulus amongst all + * partitions. 
For a given partition key datum-tuple, the index of the + * partition which would accept that datum-tuple would be given by the entry + * pointed by remainder produced when hash value of the datum-tuple is divided + * by the greatest modulus. */ typedef struct PartitionBoundInfoData { - char strategy; /* list or range bounds? */ + char strategy; /* hash, list or range? */ int ndatums; /* Length of the datums following array */ - Datum **datums; /* Array of datum-tuples with key->partnatts - * datums each */ + Datum **datums; PartitionRangeDatumKind **kind; /* The kind of each range bound datum; - * NULL for list partitioned tables */ - int *indexes; /* Partition indexes; one entry per member of - * the datums array (plus one if range - * partitioned table) */ + * NULL for hash and list partitioned + * tables */ + int *indexes; /* Partition indexes */ int null_index; /* Index of the null-accepting partition; -1 * if there isn't one */ int default_index; /* Index of the default partition; -1 if there @@ -95,6 +106,14 @@ typedef struct PartitionBoundInfoData * is represented with one of the following structs. */ +/* One bound of a hash partition */ +typedef struct PartitionHashBound +{ + int modulus; + int remainder; + int index; +} PartitionHashBound; + /* One value coming from some (index'th) list partition */ typedef struct PartitionListValue { @@ -111,6 +130,7 @@ typedef struct PartitionRangeBound bool lower; /* this is the lower (vs upper) bound */ } PartitionRangeBound; +static int32 qsort_partition_hbound_cmp(const void *a, const void *b); static int32 qsort_partition_list_value_cmp(const void *a, const void *b, void *arg); static int32 qsort_partition_rbound_cmp(const void *a, const void *b, @@ -126,6 +146,7 @@ static void get_range_key_properties(PartitionKey key, int keynum, ListCell **partexprs_item, Expr **keyCol, Const **lower_val, Const **upper_val); +static List *get_qual_for_hash(Relation parent, PartitionBoundSpec *spec); static List *get_qual_for_list(Relation parent, PartitionBoundSpec *spec); static List *get_qual_for_range(Relation parent, PartitionBoundSpec *spec, bool for_default); @@ -134,6 +155,8 @@ static List *generate_partition_qual(Relation rel); static PartitionRangeBound *make_one_range_bound(PartitionKey key, int index, List *datums, bool lower); +static int32 partition_hbound_cmp(int modulus1, int remainder1, int modulus2, + int remainder2); static int32 partition_rbound_cmp(PartitionKey key, Datum *datums1, PartitionRangeDatumKind *kind1, bool lower1, PartitionRangeBound *b2); @@ -149,6 +172,12 @@ static int partition_bound_bsearch(PartitionKey key, void *probe, bool probe_is_bound, bool *is_equal); static void get_partition_dispatch_recurse(Relation rel, Relation parent, List **pds, List **leaf_part_oids); +static int get_partition_bound_num_indexes(PartitionBoundInfo b); +static int get_greatest_modulus(PartitionBoundInfo b); +static uint64 compute_hash_value(PartitionKey key, Datum *values, bool *isnull); + +/* SQL-callable function for use in hash partition CHECK constraints */ +PG_FUNCTION_INFO_V1(satisfies_hash_partition); /* * RelationBuildPartitionDesc @@ -174,6 +203,9 @@ RelationBuildPartitionDesc(Relation rel) int ndatums = 0; int default_index = -1; + /* Hash partitioning specific */ + PartitionHashBound **hbounds = NULL; + /* List partitioning specific */ PartitionListValue **all_values = NULL; int null_index = -1; @@ -267,7 +299,35 @@ RelationBuildPartitionDesc(Relation rel) oids[i++] = lfirst_oid(cell); /* Convert from node to the internal 
representation */ - if (key->strategy == PARTITION_STRATEGY_LIST) + if (key->strategy == PARTITION_STRATEGY_HASH) + { + ndatums = nparts; + hbounds = (PartitionHashBound **) + palloc(nparts * sizeof(PartitionHashBound *)); + + i = 0; + foreach(cell, boundspecs) + { + PartitionBoundSpec *spec = castNode(PartitionBoundSpec, + lfirst(cell)); + + if (spec->strategy != PARTITION_STRATEGY_HASH) + elog(ERROR, "invalid strategy in partition bound spec"); + + hbounds[i] = (PartitionHashBound *) + palloc(sizeof(PartitionHashBound)); + + hbounds[i]->modulus = spec->modulus; + hbounds[i]->remainder = spec->remainder; + hbounds[i]->index = i; + i++; + } + + /* Sort all the bounds in ascending order */ + qsort(hbounds, nparts, sizeof(PartitionHashBound *), + qsort_partition_hbound_cmp); + } + else if (key->strategy == PARTITION_STRATEGY_LIST) { List *non_null_values = NIL; @@ -517,6 +577,42 @@ RelationBuildPartitionDesc(Relation rel) switch (key->strategy) { + case PARTITION_STRATEGY_HASH: + { + /* Modulus are stored in ascending order */ + int greatest_modulus = hbounds[ndatums - 1]->modulus; + + boundinfo->indexes = (int *) palloc(greatest_modulus * + sizeof(int)); + + for (i = 0; i < greatest_modulus; i++) + boundinfo->indexes[i] = -1; + + for (i = 0; i < nparts; i++) + { + int modulus = hbounds[i]->modulus; + int remainder = hbounds[i]->remainder; + + boundinfo->datums[i] = (Datum *) palloc(2 * + sizeof(Datum)); + boundinfo->datums[i][0] = Int32GetDatum(modulus); + boundinfo->datums[i][1] = Int32GetDatum(remainder); + + while (remainder < greatest_modulus) + { + /* overlap? */ + Assert(boundinfo->indexes[remainder] == -1); + boundinfo->indexes[remainder] = i; + remainder += modulus; + } + + mapping[hbounds[i]->index] = i; + pfree(hbounds[i]); + } + pfree(hbounds); + break; + } + case PARTITION_STRATEGY_LIST: { boundinfo->indexes = (int *) palloc(ndatums * sizeof(int)); @@ -650,8 +746,7 @@ RelationBuildPartitionDesc(Relation rel) * Now assign OIDs from the original array into mapped indexes of the * result array. Order of OIDs in the former is defined by the * catalog scan that retrieved them, whereas that in the latter is - * defined by canonicalized representation of the list values or the - * range bounds. + * defined by canonicalized representation of the partition bounds. */ for (i = 0; i < nparts; i++) result->oids[mapping[i]] = oids[i]; @@ -688,6 +783,49 @@ partition_bounds_equal(int partnatts, int16 *parttyplen, bool *parttypbyval, if (b1->default_index != b2->default_index) return false; + if (b1->strategy == PARTITION_STRATEGY_HASH) + { + int greatest_modulus; + + /* + * If two hash partitioned tables have different greatest moduli, + * their partition schemes don't match. For hash partitioned table, + * the greatest modulus is given by the last datum and number of + * partitions is given by ndatums. + */ + if (b1->datums[b1->ndatums - 1][0] != b2->datums[b2->ndatums - 1][0]) + return false; + + /* + * We arrange the partitions in the ascending order of their modulus + * and remainders. Also every modulus is factor of next larger + * modulus. Therefore we can safely store index of a given partition + * in indexes array at remainder of that partition. Also entries at + * (remainder + N * modulus) positions in indexes array are all same + * for (modulus, remainder) specification for any partition. Thus + * datums array from both the given bounds are same, if and only if + * their indexes array will be same. So, it suffices to compare + * indexes array. 
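+		 *
+		 * For example, with three partitions bounded by (modulus 2,
+		 * remainder 0), (modulus 4, remainder 1) and (modulus 4,
+		 * remainder 3), the greatest modulus is 4 and the indexes
+		 * array is {0, 1, 0, 2}: remainders 0 and 2 map to the
+		 * modulus-2 partition, remainder 1 to the second partition
+		 * and remainder 3 to the third.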
+ */ + greatest_modulus = get_greatest_modulus(b1); + for (i = 0; i < greatest_modulus; i++) + if (b1->indexes[i] != b2->indexes[i]) + return false; + +#ifdef USE_ASSERT_CHECKING + + /* + * Nonetheless make sure that the bounds are indeed same when the + * indexes match. Hash partition bound stores modulus and remainder + * at b1->datums[i][0] and b1->datums[i][1] position respectively. + */ + for (i = 0; i < b1->ndatums; i++) + Assert((b1->datums[i][0] == b2->datums[i][0] && + b1->datums[i][1] == b2->datums[i][1])); +#endif + } + else + { for (i = 0; i < b1->ndatums; i++) { int j; @@ -701,21 +839,26 @@ partition_bounds_equal(int partnatts, int16 *parttyplen, bool *parttypbyval, if (b1->kind[i][j] != b2->kind[i][j]) return false; - /* Non-finite bounds are equal without further examination. */ + /* + * Non-finite bounds are equal without further + * examination. + */ if (b1->kind[i][j] != PARTITION_RANGE_DATUM_VALUE) continue; } /* - * Compare the actual values. Note that it would be both incorrect - * and unsafe to invoke the comparison operator derived from the - * partitioning specification here. It would be incorrect because - * we want the relcache entry to be updated for ANY change to the - * partition bounds, not just those that the partitioning operator - * thinks are significant. It would be unsafe because we might - * reach this code in the context of an aborted transaction, and - * an arbitrary partitioning operator might not be safe in that - * context. datumIsEqual() should be simple enough to be safe. + * Compare the actual values. Note that it would be both + * incorrect and unsafe to invoke the comparison operator + * derived from the partitioning specification here. It would + * be incorrect because we want the relcache entry to be + * updated for ANY change to the partition bounds, not just + * those that the partitioning operator thinks are + * significant. It would be unsafe because we might reach + * this code in the context of an aborted transaction, and an + * arbitrary partitioning operator might not be safe in that + * context. datumIsEqual() should be simple enough to be + * safe. */ if (!datumIsEqual(b1->datums[i][j], b2->datums[i][j], parttypbyval[j], parttyplen[j])) @@ -730,10 +873,100 @@ partition_bounds_equal(int partnatts, int16 *parttyplen, bool *parttypbyval, if (b1->strategy == PARTITION_STRATEGY_RANGE && b1->indexes[i] != b2->indexes[i]) return false; - + } return true; } +/* + * Return a copy of given PartitionBoundInfo structure. The data types of bounds + * are described by given partition key specificiation. + */ +extern PartitionBoundInfo +partition_bounds_copy(PartitionBoundInfo src, + PartitionKey key) +{ + PartitionBoundInfo dest; + int i; + int ndatums; + int partnatts; + int num_indexes; + + dest = (PartitionBoundInfo) palloc(sizeof(PartitionBoundInfoData)); + + dest->strategy = src->strategy; + ndatums = dest->ndatums = src->ndatums; + partnatts = key->partnatts; + + /* Range partitioned table has an extra index. */ + num_indexes = get_partition_bound_num_indexes(src); + + /* List partitioned tables have only a single partition key. 
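+	 * Hash and range partitioned tables may use several key columns;
+	 * for hash partitions each datums entry nevertheless holds exactly
+	 * two values, the modulus and the remainder, as handled below.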
*/ + Assert(key->strategy != PARTITION_STRATEGY_LIST || partnatts == 1); + + dest->datums = (Datum **) palloc(sizeof(Datum *) * ndatums); + + if (src->kind != NULL) + { + dest->kind = (PartitionRangeDatumKind **) palloc(ndatums * + sizeof(PartitionRangeDatumKind *)); + for (i = 0; i < ndatums; i++) + { + dest->kind[i] = (PartitionRangeDatumKind *) palloc(partnatts * + sizeof(PartitionRangeDatumKind)); + + memcpy(dest->kind[i], src->kind[i], + sizeof(PartitionRangeDatumKind) * key->partnatts); + } + } + else + dest->kind = NULL; + + for (i = 0; i < ndatums; i++) + { + int j; + + /* + * For a corresponding to hash partition, datums array will have two + * elements - modulus and remainder. + */ + bool hash_part = (key->strategy == PARTITION_STRATEGY_HASH); + int natts = hash_part ? 2 : partnatts; + + dest->datums[i] = (Datum *) palloc(sizeof(Datum) * natts); + + for (j = 0; j < natts; j++) + { + bool byval; + int typlen; + + if (hash_part) + { + typlen = sizeof(int32); /* Always int4 */ + byval = true; /* int4 is pass-by-value */ + } + else + { + byval = key->parttypbyval[j]; + typlen = key->parttyplen[j]; + } + + if (dest->kind == NULL || + dest->kind[i][j] == PARTITION_RANGE_DATUM_VALUE) + dest->datums[i][j] = datumCopy(src->datums[i][j], + byval, typlen); + + } + } + + dest->indexes = (int *) palloc(sizeof(int) * num_indexes); + memcpy(dest->indexes, src->indexes, sizeof(int) * num_indexes); + + dest->null_index = src->null_index; + dest->default_index = src->default_index; + + return dest; +} + /* * check_new_partition_bound * @@ -766,6 +999,89 @@ check_new_partition_bound(char *relname, Relation parent, switch (key->strategy) { + case PARTITION_STRATEGY_HASH: + { + Assert(spec->strategy == PARTITION_STRATEGY_HASH); + Assert(spec->remainder >= 0 && spec->remainder < spec->modulus); + + if (partdesc->nparts > 0) + { + PartitionBoundInfo boundinfo = partdesc->boundinfo; + Datum **datums = boundinfo->datums; + int ndatums = boundinfo->ndatums; + int greatest_modulus; + int remainder; + int offset; + bool equal, + valid_modulus = true; + int prev_modulus, /* Previous largest modulus */ + next_modulus; /* Next largest modulus */ + + /* + * Check rule that every modulus must be a factor of the + * next larger modulus. For example, if you have a bunch + * of partitions that all have modulus 5, you can add a + * new partition with modulus 10 or a new partition with + * modulus 15, but you cannot add both a partition with + * modulus 10 and a partition with modulus 15, because 10 + * is not a factor of 15. + * + * Get greatest bound in array boundinfo->datums which is + * less than or equal to spec->modulus and + * spec->remainder. 
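+					 *
+					 * For instance, if the existing partitions all have
+					 * modulus 4 and the new partition specifies modulus 6,
+					 * the search lands on a modulus-4 bound; since
+					 * 6 % 4 != 0, the new bound is rejected with the
+					 * error below.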
+ */ + offset = partition_bound_bsearch(key, boundinfo, spec, + true, &equal); + if (offset < 0) + { + next_modulus = DatumGetInt32(datums[0][0]); + valid_modulus = (next_modulus % spec->modulus) == 0; + } + else + { + prev_modulus = DatumGetInt32(datums[offset][0]); + valid_modulus = (spec->modulus % prev_modulus) == 0; + + if (valid_modulus && (offset + 1) < ndatums) + { + next_modulus = DatumGetInt32(datums[offset + 1][0]); + valid_modulus = (next_modulus % spec->modulus) == 0; + } + } + + if (!valid_modulus) + ereport(ERROR, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("every hash partition modulus must be a factor of the next larger modulus"))); + + greatest_modulus = get_greatest_modulus(boundinfo); + remainder = spec->remainder; + + /* + * Normally, the lowest remainder that could conflict with + * the new partition is equal to the remainder specified + * for the new partition, but when the new partition has a + * modulus higher than any used so far, we need to adjust. + */ + if (remainder >= greatest_modulus) + remainder = remainder % greatest_modulus; + + /* Check every potentially-conflicting remainder. */ + do + { + if (boundinfo->indexes[remainder] != -1) + { + overlap = true; + with = boundinfo->indexes[remainder]; + break; + } + remainder += spec->modulus; + } while (remainder < greatest_modulus); + } + + break; + } + case PARTITION_STRATEGY_LIST: { Assert(spec->strategy == PARTITION_STRATEGY_LIST); @@ -1136,6 +1452,11 @@ get_qual_from_partbound(Relation rel, Relation parent, switch (key->strategy) { + case PARTITION_STRATEGY_HASH: + Assert(spec->strategy == PARTITION_STRATEGY_HASH); + my_qual = get_qual_for_hash(parent, spec); + break; + case PARTITION_STRATEGY_LIST: Assert(spec->strategy == PARTITION_STRATEGY_LIST); my_qual = get_qual_for_list(parent, spec); @@ -1506,6 +1827,92 @@ make_partition_op_expr(PartitionKey key, int keynum, return result; } +/* + * get_qual_for_hash + * + * Given a list of partition columns, modulus and remainder corresponding to a + * partition, this function returns CHECK constraint expression Node for that + * partition. + * + * The partition constraint for a hash partition is always a call to the + * built-in function satisfies_hash_partition(). The first two arguments are + * the modulus and remainder for the partition; the remaining arguments are the + * values to be hashed. + */ +static List * +get_qual_for_hash(Relation parent, PartitionBoundSpec *spec) +{ + PartitionKey key = RelationGetPartitionKey(parent); + FuncExpr *fexpr; + Node *relidConst; + Node *modulusConst; + Node *remainderConst; + List *args; + ListCell *partexprs_item; + int i; + + /* Fixed arguments. */ + relidConst = (Node *) makeConst(OIDOID, + -1, + InvalidOid, + sizeof(Oid), + ObjectIdGetDatum(RelationGetRelid(parent)), + false, + true); + + modulusConst = (Node *) makeConst(INT4OID, + -1, + InvalidOid, + sizeof(int32), + Int32GetDatum(spec->modulus), + false, + true); + + remainderConst = (Node *) makeConst(INT4OID, + -1, + InvalidOid, + sizeof(int32), + Int32GetDatum(spec->remainder), + false, + true); + + args = list_make3(relidConst, modulusConst, remainderConst); + partexprs_item = list_head(key->partexprs); + + /* Add an argument for each key column. 
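+	 * Together with the fixed arguments assembled above, the generated
+	 * qual is therefore a call of the form
+	 * satisfies_hash_partition(<parent relid>, <modulus>, <remainder>,
+	 * <key column 1>, ..., <key column N>).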
*/ + for (i = 0; i < key->partnatts; i++) + { + Node *keyCol; + + /* Left operand */ + if (key->partattrs[i] != 0) + { + keyCol = (Node *) makeVar(1, + key->partattrs[i], + key->parttypid[i], + key->parttypmod[i], + key->parttypcoll[i], + 0); + } + else + { + keyCol = (Node *) copyObject(lfirst(partexprs_item)); + partexprs_item = lnext(partexprs_item); + } + + args = lappend(args, keyCol); + } + + fexpr = makeFuncExpr(F_SATISFIES_HASH_PARTITION, + BOOLOID, + args, + InvalidOid, + InvalidOid, + COERCE_EXPLICIT_CALL); + + return list_make1(fexpr); +} + /* * get_qual_for_list * @@ -2371,6 +2778,17 @@ get_partition_for_tuple(PartitionDispatch *pd, /* Route as appropriate based on partitioning strategy. */ switch (key->strategy) { + case PARTITION_STRATEGY_HASH: + { + PartitionBoundInfo boundinfo = partdesc->boundinfo; + int greatest_modulus = get_greatest_modulus(boundinfo); + uint64 rowHash = compute_hash_value(key, values, + isnull); + + cur_index = boundinfo->indexes[rowHash % greatest_modulus]; + } + break; + case PARTITION_STRATEGY_LIST: if (isnull[0]) @@ -2483,6 +2901,38 @@ get_partition_for_tuple(PartitionDispatch *pd, return result; } +/* + * qsort_partition_hbound_cmp + * + * We sort hash bounds by modulus, then by remainder. + */ +static int32 +qsort_partition_hbound_cmp(const void *a, const void *b) +{ + PartitionHashBound *h1 = (*(PartitionHashBound *const *) a); + PartitionHashBound *h2 = (*(PartitionHashBound *const *) b); + + return partition_hbound_cmp(h1->modulus, h1->remainder, + h2->modulus, h2->remainder); +} + +/* + * partition_hbound_cmp + * + * Compares modulus first, then remainder if modulus are equal. + */ +static int32 +partition_hbound_cmp(int modulus1, int remainder1, int modulus2, int remainder2) +{ + if (modulus1 < modulus2) + return -1; + if (modulus1 > modulus2) + return 1; + if (modulus1 == modulus2 && remainder1 != remainder2) + return (remainder1 > remainder2) ? 1 : -1; + return 0; +} + /* * qsort_partition_list_value_cmp * @@ -2669,6 +3119,15 @@ partition_bound_cmp(PartitionKey key, PartitionBoundInfo boundinfo, switch (key->strategy) { + case PARTITION_STRATEGY_HASH: + { + PartitionBoundSpec *spec = (PartitionBoundSpec *) probe; + + cmpval = partition_hbound_cmp(DatumGetInt32(bound_datums[0]), + DatumGetInt32(bound_datums[1]), + spec->modulus, spec->remainder); + break; + } case PARTITION_STRATEGY_LIST: cmpval = DatumGetInt32(FunctionCall2Coll(&key->partsupfunc[0], key->partcollation[0], @@ -2853,3 +3312,182 @@ get_proposed_default_constraint(List *new_part_constraints) return list_make1(defPartConstraint); } + +/* + * get_partition_bound_num_indexes + * + * Returns the number of the entries in the partition bound indexes array. + */ +static int +get_partition_bound_num_indexes(PartitionBoundInfo bound) +{ + int num_indexes; + + Assert(bound); + + switch (bound->strategy) + { + case PARTITION_STRATEGY_HASH: + + /* + * The number of the entries in the indexes array is same as the + * greatest modulus. + */ + num_indexes = get_greatest_modulus(bound); + break; + + case PARTITION_STRATEGY_LIST: + num_indexes = bound->ndatums; + break; + + case PARTITION_STRATEGY_RANGE: + /* Range partitioned table has an extra index. */ + num_indexes = bound->ndatums + 1; + break; + + default: + elog(ERROR, "unexpected partition strategy: %d", + (int) bound->strategy); + } + + return num_indexes; +} + +/* + * get_greatest_modulus + * + * Returns the greatest modulus of the hash partition bound. 
The greatest + * modulus will be at the end of the datums array because hash partitions are + * arranged in the ascending order of their modulus and remainders. + */ +static int +get_greatest_modulus(PartitionBoundInfo bound) +{ + Assert(bound && bound->strategy == PARTITION_STRATEGY_HASH); + Assert(bound->datums && bound->ndatums > 0); + Assert(DatumGetInt32(bound->datums[bound->ndatums - 1][0]) > 0); + + return DatumGetInt32(bound->datums[bound->ndatums - 1][0]); +} + +/* + * compute_hash_value + * + * Compute the hash value for given not null partition key values. + */ +static uint64 +compute_hash_value(PartitionKey key, Datum *values, bool *isnull) +{ + int i; + int nkeys = key->partnatts; + uint64 rowHash = 0; + Datum seed = UInt64GetDatum(HASH_PARTITION_SEED); + + for (i = 0; i < nkeys; i++) + { + if (!isnull[i]) + { + Datum hash; + + Assert(OidIsValid(key->partsupfunc[i].fn_oid)); + + /* + * Compute hash for each datum value by calling respective + * datatype-specific hash functions of each partition key + * attribute. + */ + hash = FunctionCall2(&key->partsupfunc[i], values[i], seed); + + /* Form a single 64-bit hash value */ + rowHash = hash_combine64(rowHash, DatumGetUInt64(hash)); + } + } + + return rowHash; +} + +/* + * satisfies_hash_partition + * + * This is a SQL-callable function for use in hash partition constraints takes + * an already computed hash values of each partition key attribute, and combine + * them into a single hash value by calling hash_combine64. + * + * Returns true if remainder produced when this computed single hash value is + * divided by the given modulus is equal to given remainder, otherwise false. + * + * See get_qual_for_hash() for usage. + */ +Datum +satisfies_hash_partition(PG_FUNCTION_ARGS) +{ + typedef struct ColumnsHashData + { + Oid relid; + int16 nkeys; + FmgrInfo partsupfunc[PARTITION_MAX_KEYS]; + } ColumnsHashData; + Oid parentId = PG_GETARG_OID(0); + int modulus = PG_GETARG_INT32(1); + int remainder = PG_GETARG_INT32(2); + short nkeys = PG_NARGS() - 3; + int i; + Datum seed = UInt64GetDatum(HASH_PARTITION_SEED); + ColumnsHashData *my_extra; + uint64 rowHash = 0; + + /* + * Cache hash function information. + */ + my_extra = (ColumnsHashData *) fcinfo->flinfo->fn_extra; + if (my_extra == NULL || my_extra->nkeys != nkeys || + my_extra->relid != parentId) + { + Relation parent; + PartitionKey key; + int j; + + fcinfo->flinfo->fn_extra = + MemoryContextAllocZero(fcinfo->flinfo->fn_mcxt, + offsetof(ColumnsHashData, partsupfunc) + + sizeof(FmgrInfo) * nkeys); + my_extra = (ColumnsHashData *) fcinfo->flinfo->fn_extra; + my_extra->nkeys = nkeys; + my_extra->relid = parentId; + + /* Open parent relation and fetch partition keyinfo */ + parent = heap_open(parentId, AccessShareLock); + key = RelationGetPartitionKey(parent); + + Assert(key->partnatts == nkeys); + for (j = 0; j < nkeys; ++j) + fmgr_info_copy(&my_extra->partsupfunc[j], + key->partsupfunc, + fcinfo->flinfo->fn_mcxt); + + /* Hold lock until commit */ + heap_close(parent, NoLock); + } + + for (i = 0; i < nkeys; i++) + { + /* keys start from fourth argument of function. 
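+		 * (The first three arguments, read above, are the parent's OID,
+		 * the modulus and the remainder.)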
*/ + int argno = i + 3; + + if (!PG_ARGISNULL(argno)) + { + Datum hash; + + Assert(OidIsValid(my_extra->partsupfunc[i].fn_oid)); + + hash = FunctionCall2(&my_extra->partsupfunc[i], + PG_GETARG_DATUM(argno), + seed); + + /* Form a single 64-bit hash value */ + rowHash = hash_combine64(rowHash, DatumGetUInt64(hash)); + } + } + + PG_RETURN_BOOL(rowHash % modulus == remainder); +} diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index 360fd0ee..74b82ebf 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -535,7 +535,7 @@ static void RangeVarCallbackForAlterRelation(const RangeVar *rv, Oid relid, static bool is_partition_attr(Relation rel, AttrNumber attnum, bool *used_in_expr); static PartitionSpec *transformPartitionSpec(Relation rel, PartitionSpec *partspec, char *strategy); static void ComputePartitionAttrs(Relation rel, List *partParams, AttrNumber *partattrs, - List **partexprs, Oid *partopclass, Oid *partcollation); + List **partexprs, Oid *partopclass, Oid *partcollation, char strategy); static void CreateInheritance(Relation child_rel, Relation parent_rel); static void RemoveInheritance(Relation child_rel, Relation parent_rel); static ObjectAddress ATExecAttachPartition(List **wqueue, Relation rel, @@ -1167,7 +1167,7 @@ DefineRelation(CreateStmt *stmt, char relkind, Oid ownerId, #endif ComputePartitionAttrs(rel, stmt->partspec->partParams, partattrs, &partexprs, partopclass, - partcollation); + partcollation, strategy); StorePartitionKey(rel, strategy, partnatts, partattrs, partexprs, partopclass, partcollation); @@ -16134,7 +16134,9 @@ transformPartitionSpec(Relation rel, PartitionSpec *partspec, char *strategy) newspec->location = partspec->location; /* Parse partitioning strategy name */ - if (pg_strcasecmp(partspec->strategy, "list") == 0) + if (pg_strcasecmp(partspec->strategy, "hash") == 0) + *strategy = PARTITION_STRATEGY_HASH; + else if (pg_strcasecmp(partspec->strategy, "list") == 0) *strategy = PARTITION_STRATEGY_LIST; else if (pg_strcasecmp(partspec->strategy, "range") == 0) *strategy = PARTITION_STRATEGY_RANGE; @@ -16211,10 +16213,12 @@ transformPartitionSpec(Relation rel, PartitionSpec *partspec, char *strategy) */ static void ComputePartitionAttrs(Relation rel, List *partParams, AttrNumber *partattrs, - List **partexprs, Oid *partopclass, Oid *partcollation) -{// #lizard forgives + List **partexprs, Oid *partopclass, Oid *partcollation, + char strategy) +{ int attn; ListCell *lc; + Oid am_oid; attn = 0; foreach(lc, partParams) @@ -16374,25 +16378,41 @@ ComputePartitionAttrs(Relation rel, List *partParams, AttrNumber *partattrs, partcollation[attn] = attcollation; /* - * Identify a btree opclass to use. Currently, we use only btree - * operators, which seems enough for list and range partitioning. + * Identify the appropriate operator class. For list and range + * partitioning, we use a btree operator class; hash partitioning uses + * a hash operator class. 
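+		 * The hash operator class is what later supplies the extended
+		 * hash support function used to route rows to hash partitions
+		 * (see the HASHEXTENDED_PROC lookup in RelationBuildPartitionKey).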
*/ + if (strategy == PARTITION_STRATEGY_HASH) + am_oid = HASH_AM_OID; + else + am_oid = BTREE_AM_OID; + if (!pelem->opclass) { - partopclass[attn] = GetDefaultOpClass(atttype, BTREE_AM_OID); + partopclass[attn] = GetDefaultOpClass(atttype, am_oid); if (!OidIsValid(partopclass[attn])) + { + if (strategy == PARTITION_STRATEGY_HASH) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("data type %s has no default hash operator class", + format_type_be(atttype)), + errhint("You must specify a hash operator class or define a default hash operator class for the data type."))); + else ereport(ERROR, (errcode(ERRCODE_UNDEFINED_OBJECT), errmsg("data type %s has no default btree operator class", format_type_be(atttype)), errhint("You must specify a btree operator class or define a default btree operator class for the data type."))); + + } } else partopclass[attn] = ResolveOpClass(pelem->opclass, atttype, - "btree", - BTREE_AM_OID); + am_oid == HASH_AM_OID ? "hash" : "btree", + am_oid); attn++; } diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c index a7e1d32a..1e57fedd 100644 --- a/src/backend/nodes/copyfuncs.c +++ b/src/backend/nodes/copyfuncs.c @@ -4829,6 +4829,8 @@ _copyPartitionBoundSpec(const PartitionBoundSpec *from) COPY_SCALAR_FIELD(strategy); COPY_SCALAR_FIELD(is_default); + COPY_SCALAR_FIELD(modulus); + COPY_SCALAR_FIELD(remainder); COPY_NODE_FIELD(listdatums); COPY_NODE_FIELD(lowerdatums); COPY_NODE_FIELD(upperdatums); diff --git a/src/backend/nodes/equalfuncs.c b/src/backend/nodes/equalfuncs.c index 6efee4a8..c05b411c 100644 --- a/src/backend/nodes/equalfuncs.c +++ b/src/backend/nodes/equalfuncs.c @@ -2953,6 +2953,8 @@ _equalPartitionBoundSpec(const PartitionBoundSpec *a, const PartitionBoundSpec * { COMPARE_SCALAR_FIELD(strategy); COMPARE_SCALAR_FIELD(is_default); + COMPARE_SCALAR_FIELD(modulus); + COMPARE_SCALAR_FIELD(remainder); COMPARE_NODE_FIELD(listdatums); COMPARE_NODE_FIELD(lowerdatums); COMPARE_NODE_FIELD(upperdatums); diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c index daf0445f..fb711230 100644 --- a/src/backend/nodes/outfuncs.c +++ b/src/backend/nodes/outfuncs.c @@ -5003,6 +5003,8 @@ _outPartitionBoundSpec(StringInfo str, const PartitionBoundSpec *node) WRITE_CHAR_FIELD(strategy); WRITE_BOOL_FIELD(is_default); + WRITE_INT_FIELD(modulus); + WRITE_INT_FIELD(remainder); WRITE_NODE_FIELD(listdatums); WRITE_NODE_FIELD(lowerdatums); WRITE_NODE_FIELD(upperdatums); diff --git a/src/backend/nodes/readfuncs.c b/src/backend/nodes/readfuncs.c index 32c879f7..2bdc5067 100644 --- a/src/backend/nodes/readfuncs.c +++ b/src/backend/nodes/readfuncs.c @@ -4077,6 +4077,8 @@ _readPartitionBoundSpec(void) READ_CHAR_FIELD(strategy); READ_BOOL_FIELD(is_default); + READ_INT_FIELD(modulus); + READ_INT_FIELD(remainder); READ_NODE_FIELD(listdatums); READ_NODE_FIELD(lowerdatums); READ_NODE_FIELD(upperdatums); diff --git a/src/backend/optimizer/path/joinrels.c b/src/backend/optimizer/path/joinrels.c index ad902dcf..d6fad96c 100644 --- a/src/backend/optimizer/path/joinrels.c +++ b/src/backend/optimizer/path/joinrels.c @@ -1484,7 +1484,7 @@ have_partkey_equi_join(RelOptInfo *rel1, RelOptInfo *rel2, JoinType jointype, continue; /* Skip clauses which are not equality conditions. 
*/ - if (!rinfo->mergeopfamilies) + if (!rinfo->mergeopfamilies && !OidIsValid(rinfo->hashjoinoperator)) continue; opexpr = (OpExpr *) rinfo->clause; @@ -1536,7 +1536,13 @@ have_partkey_equi_join(RelOptInfo *rel1, RelOptInfo *rel2, JoinType jointype, * The clause allows partition-wise join if only it uses the same * operator family as that specified by the partition key. */ - if (!list_member_oid(rinfo->mergeopfamilies, + if (rel1->part_scheme->strategy == PARTITION_STRATEGY_HASH) + { + if (!op_in_opfamily(rinfo->hashjoinoperator, + part_scheme->partopfamily[ipk1])) + continue; + } + else if (!list_member_oid(rinfo->mergeopfamilies, part_scheme->partopfamily[ipk1])) continue; diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c index 55ea9c8c..fc680b63 100644 --- a/src/backend/optimizer/util/plancat.c +++ b/src/backend/optimizer/util/plancat.c @@ -1978,13 +1978,15 @@ set_relation_partition_info(PlannerInfo *root, RelOptInfo *rel, Relation relation) { PartitionDesc partdesc; + PartitionKey partkey; Assert(relation->rd_rel->relkind == RELKIND_PARTITIONED_TABLE); partdesc = RelationGetPartitionDesc(relation); + partkey = RelationGetPartitionKey(relation); rel->part_scheme = find_partition_scheme(root, relation); Assert(partdesc != NULL && rel->part_scheme != NULL); - rel->boundinfo = partdesc->boundinfo; + rel->boundinfo = partition_bounds_copy(partdesc->boundinfo, partkey); rel->nparts = partdesc->nparts; set_baserel_partition_key_exprs(relation, rel); } @@ -2041,18 +2043,33 @@ find_partition_scheme(PlannerInfo *root, Relation relation) /* * Did not find matching partition scheme. Create one copying relevant - * information from the relcache. Instead of copying whole arrays, copy - * the pointers in relcache. It's safe to do so since - * RelationClearRelation() wouldn't change it while planner is using it. + * information from the relcache. We need to copy the contents of the array + * since the relcache entry may not survive after we have closed the + * relation. */ part_scheme = (PartitionScheme) palloc0(sizeof(PartitionSchemeData)); part_scheme->strategy = partkey->strategy; part_scheme->partnatts = partkey->partnatts; - part_scheme->partopfamily = partkey->partopfamily; - part_scheme->partopcintype = partkey->partopcintype; - part_scheme->parttypcoll = partkey->parttypcoll; - part_scheme->parttyplen = partkey->parttyplen; - part_scheme->parttypbyval = partkey->parttypbyval; + + part_scheme->partopfamily = (Oid *) palloc(sizeof(Oid) * partnatts); + memcpy(part_scheme->partopfamily, partkey->partopfamily, + sizeof(Oid) * partnatts); + + part_scheme->partopcintype = (Oid *) palloc(sizeof(Oid) * partnatts); + memcpy(part_scheme->partopcintype, partkey->partopcintype, + sizeof(Oid) * partnatts); + + part_scheme->parttypcoll = (Oid *) palloc(sizeof(Oid) * partnatts); + memcpy(part_scheme->parttypcoll, partkey->parttypcoll, + sizeof(Oid) * partnatts); + + part_scheme->parttyplen = (int16 *) palloc(sizeof(int16) * partnatts); + memcpy(part_scheme->parttyplen, partkey->parttyplen, + sizeof(int16) * partnatts); + + part_scheme->parttypbyval = (bool *) palloc(sizeof(bool) * partnatts); + memcpy(part_scheme->parttypbyval, partkey->parttypbyval, + sizeof(bool) * partnatts); /* Add the partitioning scheme to PlannerInfo. 
 */
 	root->part_schemes = lappend(root->part_schemes, part_scheme);
diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y
index 41b045c3..1cf77960 100644
--- a/src/backend/parser/gram.y
+++ b/src/backend/parser/gram.y
@@ -620,7 +620,8 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query);
 %type <list>		part_params
 %type <partboundspec> PartitionBoundSpec
 %type <node>		partbound_datum PartitionRangeDatum
-%type <list>		partbound_datum_list range_datum_list
+%type <list>		hash_partbound partbound_datum_list range_datum_list
+%type <defelt>		hash_partbound_elem
 %type		lock_param
@@ -2834,8 +2835,61 @@ alter_identity_column_option:
 		;
 
 PartitionBoundSpec:
+			/* a HASH partition*/
+			FOR VALUES WITH '(' hash_partbound ')'
+				{
+					ListCell   *lc;
+					PartitionBoundSpec *n = makeNode(PartitionBoundSpec);
+
+					n->strategy = PARTITION_STRATEGY_HASH;
+					n->modulus = n->remainder = -1;
+
+					foreach (lc, $5)
+					{
+						DefElem    *opt = lfirst_node(DefElem, lc);
+
+						if (strcmp(opt->defname, "modulus") == 0)
+						{
+							if (n->modulus != -1)
+								ereport(ERROR,
+										(errcode(ERRCODE_DUPLICATE_OBJECT),
+										 errmsg("modulus for hash partition provided more than once"),
+										 parser_errposition(opt->location)));
+							n->modulus = defGetInt32(opt);
+						}
+						else if (strcmp(opt->defname, "remainder") == 0)
+						{
+							if (n->remainder != -1)
+								ereport(ERROR,
+										(errcode(ERRCODE_DUPLICATE_OBJECT),
+										 errmsg("remainder for hash partition provided more than once"),
+										 parser_errposition(opt->location)));
+							n->remainder = defGetInt32(opt);
+						}
+						else
+							ereport(ERROR,
+									(errcode(ERRCODE_SYNTAX_ERROR),
+									 errmsg("unrecognized hash partition bound specification \"%s\"",
+											opt->defname),
+									 parser_errposition(opt->location)));
+					}
+
+					if (n->modulus == -1)
+						ereport(ERROR,
+								(errcode(ERRCODE_SYNTAX_ERROR),
+								 errmsg("modulus for hash partition must be specified")));
+					if (n->remainder == -1)
+						ereport(ERROR,
+								(errcode(ERRCODE_SYNTAX_ERROR),
+								 errmsg("remainder for hash partition must be specified")));
+
+					n->location = @3;
+
+					$$ = n;
+				}
+
 			/* a LIST partition */
-			FOR VALUES IN_P '(' partbound_datum_list ')'
+			| FOR VALUES IN_P '(' partbound_datum_list ')'
 				{
 					PartitionBoundSpec *n = makeNode(PartitionBoundSpec);
 
@@ -2873,6 +2927,24 @@ PartitionBoundSpec:
 				}
 		;
 
+hash_partbound_elem:
+		NonReservedWord Iconst
+			{
+				$$ = makeDefElem($1, (Node *)makeInteger($2), @1);
+			}
+		;
+
+hash_partbound:
+		hash_partbound_elem
+			{
+				$$ = list_make1($1);
+			}
+		| hash_partbound ',' hash_partbound_elem
+			{
+				$$ = lappend($1, $3);
+			}
+		;
+
 partbound_datum:
 			Sconst			{ $$ = makeStringConst($1, @1); }
 			| NumericOnly	{ $$ = makeAConst($1, @1); }
diff --git a/src/backend/parser/parse_utilcmd.c b/src/backend/parser/parse_utilcmd.c
index 3695d9dc..e75e6b5e 100644
--- a/src/backend/parser/parse_utilcmd.c
+++ b/src/backend/parser/parse_utilcmd.c
@@ -196,6 +196,7 @@ static PGXCSubCluster *makeSubCluster(List *nodelist);
 static PGXCSubCluster *makeShardSubCluster(Oid groupId, Oid coldGroupId);
 #endif
 static void transformPartitionCmd(CreateStmtContext *cxt, PartitionCmd *cmd);
+static void validateInfiniteBounds(ParseState *pstate, List *blist);
 static Const *transformPartitionBoundValue(ParseState *pstate, A_Const *con,
 						 const char *colName, Oid colType, int32 colTypmod);
@@ -4965,6 +4966,11 @@ transformPartitionBound(ParseState *pstate, Relation parent,
 
 	if (spec->is_default)
 	{
+		if (strategy == PARTITION_STRATEGY_HASH)
+			ereport(ERROR,
+					(errcode(ERRCODE_INVALID_TABLE_DEFINITION),
+					 errmsg("a hash-partitioned table may not have a default partition")));
+
 		/*
 		 * In case of the default partition, parser had no way to identify the
 		 * partition strategy. Assign the parent's strategy to the default
@@ -4975,7 +4981,27 @@ transformPartitionBound(ParseState *pstate, Relation parent,
 		return result_spec;
 	}
 
-	if (strategy == PARTITION_STRATEGY_LIST)
+	if (strategy == PARTITION_STRATEGY_HASH)
+	{
+		if (spec->strategy != PARTITION_STRATEGY_HASH)
+			ereport(ERROR,
+					(errcode(ERRCODE_INVALID_TABLE_DEFINITION),
+					 errmsg("invalid bound specification for a hash partition"),
+					 parser_errposition(pstate, exprLocation((Node *) spec))));
+
+		if (spec->modulus <= 0)
+			ereport(ERROR,
+					(errcode(ERRCODE_INVALID_TABLE_DEFINITION),
+					 errmsg("modulus for hash partition must be a positive integer")));
+
+		Assert(spec->remainder >= 0);
+
+		if (spec->remainder >= spec->modulus)
+			ereport(ERROR,
+					(errcode(ERRCODE_INVALID_TABLE_DEFINITION),
+					 errmsg("remainder for hash partition must be less than modulus")));
+	}
+	else if (strategy == PARTITION_STRATEGY_LIST)
 	{
 		ListCell   *cell;
 		char	   *colname;
@@ -5053,6 +5079,13 @@
 					(errcode(ERRCODE_INVALID_TABLE_DEFINITION),
 					 errmsg("TO must specify exactly one value per partitioning column")));
 
+		/*
+		 * Once we see MINVALUE or MAXVALUE for one column, the remaining
+		 * columns must be the same.
+		 */
+		validateInfiniteBounds(pstate, spec->lowerdatums);
+		validateInfiniteBounds(pstate, spec->upperdatums);
+
 		/* Transform all the constants */
 		i = j = 0;
 		result_spec->lowerdatums = result_spec->upperdatums = NIL;
@@ -5124,6 +5157,46 @@ transformPartitionBound(ParseState *pstate, Relation parent,
 	return result_spec;
 }
 
+/*
+ * validateInfiniteBounds
+ *
+ * Check that a MAXVALUE or MINVALUE specification in a partition bound is
+ * followed only by more of the same.
+ */
+static void
+validateInfiniteBounds(ParseState *pstate, List *blist)
+{
+	ListCell   *lc;
+	PartitionRangeDatumKind kind = PARTITION_RANGE_DATUM_VALUE;
+
+	foreach(lc, blist)
+	{
+		PartitionRangeDatum *prd = castNode(PartitionRangeDatum, lfirst(lc));
+
+		if (kind == prd->kind)
+			continue;
+
+		switch (kind)
+		{
+			case PARTITION_RANGE_DATUM_VALUE:
+				kind = prd->kind;
+				break;
+
+			case PARTITION_RANGE_DATUM_MAXVALUE:
+				ereport(ERROR,
+						(errcode(ERRCODE_DATATYPE_MISMATCH),
+						 errmsg("every bound following MAXVALUE must also be MAXVALUE"),
+						 parser_errposition(pstate, exprLocation((Node *) prd))));
+
+			case PARTITION_RANGE_DATUM_MINVALUE:
+				ereport(ERROR,
+						(errcode(ERRCODE_DATATYPE_MISMATCH),
+						 errmsg("every bound following MINVALUE must also be MINVALUE"),
+						 parser_errposition(pstate, exprLocation((Node *) prd))));
+		}
+	}
+}
+
 /*
  * Transform one constant in a partition bound spec
  */
diff --git a/src/backend/utils/adt/ruleutils.c b/src/backend/utils/adt/ruleutils.c
index 2b83875e..984ace45 100644
--- a/src/backend/utils/adt/ruleutils.c
+++ b/src/backend/utils/adt/ruleutils.c
@@ -1645,7 +1645,7 @@ pg_get_statisticsobj_worker(Oid statextid, bool missing_ok)
  *
  * Returns the partition key specification, ie, the following:
  *
- * PARTITION BY { RANGE | LIST } (column opt_collation opt_opclass [, ...])
+ * PARTITION BY { RANGE | LIST | HASH } (column opt_collation opt_opclass [, ...])
  */
 Datum
 pg_get_partkeydef(PG_FUNCTION_ARGS)
@@ -1749,6 +1749,10 @@ pg_get_partkeydef_worker(Oid relid, int prettyFlags,
 	switch (form->partstrat)
 	{
+		case PARTITION_STRATEGY_HASH:
+			if (!attrsOnly)
+				appendStringInfo(&buf, "HASH");
+			break;
 		case PARTITION_STRATEGY_LIST:
 			if (!attrsOnly)
 				appendStringInfo(&buf, "LIST");
@@ -9379,6 +9383,15 @@ get_rule_expr(Node *node, deparse_context *context,
 		switch (spec->strategy)
 		{
+			case PARTITION_STRATEGY_HASH:
+				Assert(spec->modulus > 0 && spec->remainder >= 0);
+				Assert(spec->modulus > spec->remainder);
+
+				appendStringInfoString(buf, "FOR VALUES");
+				appendStringInfo(buf, " WITH (modulus %d, remainder %d)",
+								 spec->modulus, spec->remainder);
+				break;
+
 			case PARTITION_STRATEGY_LIST:
 				Assert(spec->listdatums != NIL);
diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c
index 66aebfe9..f6acc9f0 100644
--- a/src/backend/utils/cache/relcache.c
+++ b/src/backend/utils/cache/relcache.c
@@ -32,6 +32,7 @@
 #include <sys/file.h>
 #include <unistd.h>
 
+#include "access/hash.h"
 #include "access/htup_details.h"
 #include "access/multixact.h"
 #include "access/nbtree.h"
@@ -981,6 +982,7 @@ RelationBuildPartitionKey(Relation relation)
 	Datum		datum;
 	MemoryContext partkeycxt,
 				oldcxt;
+	int16		procnum;
 
 	tuple = SearchSysCache1(PARTRELID,
 							ObjectIdGetDatum(RelationGetRelid(relation)));
@@ -1060,6 +1062,10 @@ RelationBuildPartitionKey(Relation relation)
 	key->parttypalign = (char *) palloc0(key->partnatts * sizeof(char));
 	key->parttypcoll = (Oid *) palloc0(key->partnatts * sizeof(Oid));
 
+	/* For the hash partitioning, an extended hash function will be used. */
+	procnum = (key->strategy == PARTITION_STRATEGY_HASH) ?
+		HASHEXTENDED_PROC : BTORDER_PROC;
+
 	/* Copy partattrs and fill other per-attribute info */
 	memcpy(key->partattrs, attrs, key->partnatts * sizeof(int16));
 	partexprs_item = list_head(key->partexprs);
@@ -1080,18 +1086,20 @@ RelationBuildPartitionKey(Relation relation)
 		key->partopfamily[i] = opclassform->opcfamily;
 		key->partopcintype[i] = opclassform->opcintype;
 
-		/*
-		 * A btree support function covers the cases of list and range methods
-		 * currently supported.
-		 */
+		/* Get a support function for the specified opfamily and datatypes */
 		funcid = get_opfamily_proc(opclassform->opcfamily,
 								   opclassform->opcintype,
 								   opclassform->opcintype,
-								   BTORDER_PROC);
-		if (!OidIsValid(funcid))	/* should not happen */
-			elog(ERROR, "missing support function %d(%u,%u) in opfamily %u",
-				 BTORDER_PROC, opclassform->opcintype, opclassform->opcintype,
-				 opclassform->opcfamily);
+								   procnum);
+		if (!OidIsValid(funcid))
+			ereport(ERROR,
+					(errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
+					 errmsg("operator class \"%s\" of access method %s is missing support function %d for data type \"%s\"",
+							NameStr(opclassform->opcname),
+							(key->strategy == PARTITION_STRATEGY_HASH) ?
+							"hash" : "btree",
+							procnum,
+							format_type_be(opclassform->opcintype))));
 
 		fmgr_info(funcid, &key->partsupfunc[i]);
diff --git a/src/bin/psql/tab-complete.c b/src/bin/psql/tab-complete.c
index 4ce5a90e..49305e4a 100644
--- a/src/bin/psql/tab-complete.c
+++ b/src/bin/psql/tab-complete.c
@@ -2074,7 +2074,7 @@ psql_completion(const char *text, int start, int end)
 	else if (TailMatches3("ATTACH", "PARTITION", MatchAny))
 		COMPLETE_WITH_LIST2("FOR VALUES", "DEFAULT");
 	else if (TailMatches2("FOR", "VALUES"))
-		COMPLETE_WITH_LIST2("FROM (", "IN (");
+		COMPLETE_WITH_LIST3("FROM (", "IN (", "WITH (");
 
 	/*
 	 * If we have ALTER TABLE <foo> DETACH PARTITION, provide a list of
diff --git a/src/include/catalog/partition.h b/src/include/catalog/partition.h
index 454a940a..8acc01a8 100644
--- a/src/include/catalog/partition.h
+++ b/src/include/catalog/partition.h
@@ -19,6 +19,9 @@
 #include "parser/parse_node.h"
 #include "utils/rel.h"
 
+/* Seed for the extended hash function */
+#define HASH_PARTITION_SEED UINT64CONST(0x7A5B22367996DCFD)
+
 /*
  * PartitionBoundInfo encapsulates a set of partition bounds.
It is usually * associated with partitioned tables as part of its partition descriptor. @@ -74,6 +77,8 @@ extern void RelationBuildPartitionDesc(Relation relation); extern bool partition_bounds_equal(int partnatts, int16 *parttyplen, bool *parttypbyval, PartitionBoundInfo b1, PartitionBoundInfo b2); +extern PartitionBoundInfo partition_bounds_copy(PartitionBoundInfo src, + PartitionKey key); extern void check_new_partition_bound(char *relname, Relation parent, PartitionBoundSpec *spec); diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h index e5bcf8ae..27c9ef26 100644 --- a/src/include/catalog/pg_proc.h +++ b/src/include/catalog/pg_proc.h @@ -5707,6 +5707,9 @@ DATA(insert OID = 3353 ( pg_ls_logdir PGNSP PGUID 12 10 20 0 0 DESCR("list files in the log directory"); DATA(insert OID = 3354 ( pg_ls_waldir PGNSP PGUID 12 10 20 0 0 f f f f t t v s 0 0 2249 "" "{25,20,1184}" "{o,o,o}" "{name,size,modification}" _null_ _null_ pg_ls_waldir _null_ _null_ _null_ )); DESCR("list of files in the WAL directory"); +/* hash partitioning constraint function */ +DATA(insert OID = 4687 ( satisfies_hash_partition PGNSP PGUID 12 1 0 2276 0 f f f f f f i s 4 0 16 "26 23 23 2276" _null_ _null_ _null_ _null_ _null_ satisfies_hash_partition _null_ _null_ _null_ )); +DESCR("hash partition CHECK constraint"); DATA(insert OID = 3410 ( pg_extent_info PGNSP PGUID 12 10 20 0 0 f f f f f t v s 1 0 2249 "2205" "{23,16,23,23,23,23,23,23,23}" "{o,o,o,o,o,o,o,o,o}" "{eid,is_occupied,shardid,freespace_cat,hwm,scan_next,scan_prev,alloc_next,alloc_prev}" _null_ _null_ pg_extent_info_oid _null_ _null_ _null_ )); DESCR("get extent info of a relation"); DATA(insert OID = 3411 ( pg_shard_scan_list PGNSP PGUID 12 10 20 0 0 f f f f f t v s 2 0 2249 "2205 23" "{23,16,23,23,23,23}" "{o,o,o,o,o,o}" "{eid,is_occupied,shardid,freespace_cat,hwm,scan_next}" _null_ _null_ pg_shard_scan_list_oid _null_ _null_ _null_ )); diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h index c508a87d..983a1ab0 100644 --- a/src/include/nodes/parsenodes.h +++ b/src/include/nodes/parsenodes.h @@ -849,7 +849,8 @@ typedef struct PartitionBy typedef struct PartitionSpec { NodeTag type; - char *strategy; /* partitioning strategy ('list' or 'range') */ + char *strategy; /* partitioning strategy ('hash', 'list' or + * 'range') */ List *partParams; /* List of PartitionElems */ #ifdef __TBASE__ PartitionBy *interval; /* used for interval partition */ @@ -858,6 +859,7 @@ typedef struct PartitionSpec } PartitionSpec; /* Internal codes for partitioning strategies */ +#define PARTITION_STRATEGY_HASH 'h' #define PARTITION_STRATEGY_LIST 'l' #define PARTITION_STRATEGY_RANGE 'r' #ifdef __TBASE__ @@ -878,6 +880,10 @@ typedef struct PartitionBoundSpec char strategy; /* see PARTITION_STRATEGY codes above */ bool is_default; /* is it a default partition bound? 
*/ + /* Partitioning info for HASH strategy: */ + int modulus; + int remainder; + /* Partitioning info for LIST strategy: */ List *listdatums; /* List of Consts (or A_Consts in raw tree) */ diff --git a/src/test/regress/expected/alter_table.out b/src/test/regress/expected/alter_table.out index d112f403..455cee74 100644 --- a/src/test/regress/expected/alter_table.out +++ b/src/test/regress/expected/alter_table.out @@ -3304,6 +3304,7 @@ SELECT conislocal, coninhcount FROM pg_constraint WHERE conrelid = 'part_1'::reg CREATE TABLE fail_part (LIKE part_1 INCLUDING CONSTRAINTS); ALTER TABLE list_parted ATTACH PARTITION fail_part FOR VALUES IN (1); ERROR: partition "fail_part" would overlap partition "part_1" +DROP TABLE fail_part; -- check that an existing table can be attached as a default partition CREATE TABLE def_part (LIKE list_parted INCLUDING CONSTRAINTS); ALTER TABLE list_parted ATTACH PARTITION def_part DEFAULT; @@ -3503,6 +3504,59 @@ CREATE TABLE quuux1 PARTITION OF quuux FOR VALUES IN (1); CREATE TABLE quuux2 PARTITION OF quuux FOR VALUES IN (2); INFO: partition constraint for table "quuux_default1" is implied by existing constraints DROP TABLE quuux; +-- check validation when attaching hash partitions +-- The default hash functions as they exist today aren't portable; they can +-- return different results on different machines. Depending upon how the +-- values are hashed, the row may map to different partitions, which result in +-- regression failure. To avoid this, let's create a non-default hash function +-- that just returns the input value unchanged. +CREATE OR REPLACE FUNCTION dummy_hashint4(a int4, seed int8) RETURNS int8 AS +$$ BEGIN RETURN (a + 1 + seed); END; $$ LANGUAGE 'plpgsql' IMMUTABLE; +CREATE OPERATOR CLASS custom_opclass FOR TYPE int4 USING HASH AS +OPERATOR 1 = , FUNCTION 2 dummy_hashint4(int4, int8); +-- check that the new partition won't overlap with an existing partition +CREATE TABLE hash_parted ( + a int, + b int +) PARTITION BY HASH (a custom_opclass); +CREATE TABLE hpart_1 PARTITION OF hash_parted FOR VALUES WITH (MODULUS 4, REMAINDER 0); +CREATE TABLE fail_part (LIKE hpart_1); +ALTER TABLE hash_parted ATTACH PARTITION fail_part FOR VALUES WITH (MODULUS 8, REMAINDER 4); +ERROR: partition "fail_part" would overlap partition "hpart_1" +ALTER TABLE hash_parted ATTACH PARTITION fail_part FOR VALUES WITH (MODULUS 8, REMAINDER 0); +ERROR: partition "fail_part" would overlap partition "hpart_1" +DROP TABLE fail_part; +-- check validation when attaching hash partitions +-- check that violating rows are correctly reported +CREATE TABLE hpart_2 (LIKE hash_parted); +INSERT INTO hpart_2 VALUES (3, 0); +ALTER TABLE hash_parted ATTACH PARTITION hpart_2 FOR VALUES WITH (MODULUS 4, REMAINDER 1); +ERROR: partition constraint is violated by some row +-- should be ok after deleting the bad row +DELETE FROM hpart_2; +ALTER TABLE hash_parted ATTACH PARTITION hpart_2 FOR VALUES WITH (MODULUS 4, REMAINDER 1); +-- check that leaf partitions are scanned when attaching a partitioned +-- table +CREATE TABLE hpart_5 ( + LIKE hash_parted +) PARTITION BY LIST (b); +-- check that violating rows are correctly reported +CREATE TABLE hpart_5_a PARTITION OF hpart_5 FOR VALUES IN ('1', '2', '3'); +INSERT INTO hpart_5_a (a, b) VALUES (7, 1); +ALTER TABLE hash_parted ATTACH PARTITION hpart_5 FOR VALUES WITH (MODULUS 4, REMAINDER 2); +ERROR: partition constraint is violated by some row +-- should be ok after deleting the bad row +DELETE FROM hpart_5_a; +ALTER TABLE hash_parted ATTACH PARTITION 
hpart_5 FOR VALUES WITH (MODULUS 4, REMAINDER 2); +-- check that the table being attach is with valid modulus and remainder value +CREATE TABLE fail_part(LIKE hash_parted); +ALTER TABLE hash_parted ATTACH PARTITION fail_part FOR VALUES WITH (MODULUS 0, REMAINDER 1); +ERROR: modulus for hash partition must be a positive integer +ALTER TABLE hash_parted ATTACH PARTITION fail_part FOR VALUES WITH (MODULUS 8, REMAINDER 8); +ERROR: remainder for hash partition must be less than modulus +ALTER TABLE hash_parted ATTACH PARTITION fail_part FOR VALUES WITH (MODULUS 3, REMAINDER 2); +ERROR: every hash partition modulus must be a factor of the next larger modulus +DROP TABLE fail_part; -- -- DETACH PARTITION -- @@ -3514,12 +3568,17 @@ DROP TABLE regular_table; -- check that the partition being detached exists at all ALTER TABLE list_parted2 DETACH PARTITION part_4; ERROR: relation "part_4" does not exist +ALTER TABLE hash_parted DETACH PARTITION hpart_4; +ERROR: relation "hpart_4" does not exist -- check that the partition being detached is actually a partition of the parent CREATE TABLE not_a_part (a int); ALTER TABLE list_parted2 DETACH PARTITION not_a_part; ERROR: relation "not_a_part" is not a partition of relation "list_parted2" ALTER TABLE list_parted2 DETACH PARTITION part_1; ERROR: relation "part_1" is not a partition of relation "list_parted2" +ALTER TABLE hash_parted DETACH PARTITION not_a_part; +ERROR: relation "not_a_part" is not a partition of relation "hash_parted" +DROP TABLE not_a_part; -- check that, after being detached, attinhcount/coninhcount is dropped to 0 and -- attislocal/conislocal is set to true ALTER TABLE list_parted2 DETACH PARTITION part_3_4; @@ -3623,6 +3682,9 @@ SELECT * FROM list_parted; -- cleanup DROP TABLE list_parted, list_parted2, range_parted; DROP TABLE fail_def_part; +DROP TABLE hash_parted; +DROP OPERATOR CLASS custom_opclass USING HASH; +DROP FUNCTION dummy_hashint4(a int4, seed int8); -- more tests for certain multi-level partitioning scenarios create table p (a int, b int) partition by range (a, b); create table p1 (b int, a int not null) partition by range (b); diff --git a/src/test/regress/expected/alter_table_1.out b/src/test/regress/expected/alter_table_1.out index 9d508790..357e16da 100644 --- a/src/test/regress/expected/alter_table_1.out +++ b/src/test/regress/expected/alter_table_1.out @@ -3303,6 +3303,7 @@ SELECT conislocal, coninhcount FROM pg_constraint WHERE conrelid = 'part_1'::reg CREATE TABLE fail_part (LIKE part_1 INCLUDING CONSTRAINTS); ALTER TABLE list_parted ATTACH PARTITION fail_part FOR VALUES IN (1); ERROR: partition "fail_part" would overlap partition "part_1" +DROP TABLE fail_part; -- check validation when attaching list partitions CREATE TABLE list_parted2 ( a int, @@ -3434,6 +3435,59 @@ DETAIL: "part_5" is already a child of "list_parted2". ALTER TABLE list_parted2 ATTACH PARTITION list_parted2 FOR VALUES IN (0); ERROR: circular inheritance not allowed DETAIL: "list_parted2" is already a child of "list_parted2". +-- check validation when attaching hash partitions +-- The default hash functions as they exist today aren't portable; they can +-- return different results on different machines. Depending upon how the +-- values are hashed, the row may map to different partitions, which result in +-- regression failure. To avoid this, let's create a non-default hash function +-- that just returns the input value unchanged. 
+CREATE OR REPLACE FUNCTION dummy_hashint4(a int4, seed int8) RETURNS int8 AS +$$ BEGIN RETURN (a + 1 + seed); END; $$ LANGUAGE 'plpgsql' IMMUTABLE; +CREATE OPERATOR CLASS custom_opclass FOR TYPE int4 USING HASH AS +OPERATOR 1 = , FUNCTION 2 dummy_hashint4(int4, int8); +-- check that the new partition won't overlap with an existing partition +CREATE TABLE hash_parted ( + a int, + b int +) PARTITION BY HASH (a custom_opclass); +CREATE TABLE hpart_1 PARTITION OF hash_parted FOR VALUES WITH (MODULUS 4, REMAINDER 0); +CREATE TABLE fail_part (LIKE hpart_1); +ALTER TABLE hash_parted ATTACH PARTITION fail_part FOR VALUES WITH (MODULUS 8, REMAINDER 4); +ERROR: partition "fail_part" would overlap partition "hpart_1" +ALTER TABLE hash_parted ATTACH PARTITION fail_part FOR VALUES WITH (MODULUS 8, REMAINDER 0); +ERROR: partition "fail_part" would overlap partition "hpart_1" +DROP TABLE fail_part; +-- check validation when attaching hash partitions +-- check that violating rows are correctly reported +CREATE TABLE hpart_2 (LIKE hash_parted); +INSERT INTO hpart_2 VALUES (3, 0); +ALTER TABLE hash_parted ATTACH PARTITION hpart_2 FOR VALUES WITH (MODULUS 4, REMAINDER 1); +ERROR: partition constraint is violated by some row +-- should be ok after deleting the bad row +DELETE FROM hpart_2; +ALTER TABLE hash_parted ATTACH PARTITION hpart_2 FOR VALUES WITH (MODULUS 4, REMAINDER 1); +-- check that leaf partitions are scanned when attaching a partitioned +-- table +CREATE TABLE hpart_5 ( + LIKE hash_parted +) PARTITION BY LIST (b); +-- check that violating rows are correctly reported +CREATE TABLE hpart_5_a PARTITION OF hpart_5 FOR VALUES IN ('1', '2', '3'); +INSERT INTO hpart_5_a (a, b) VALUES (7, 1); +ALTER TABLE hash_parted ATTACH PARTITION hpart_5 FOR VALUES WITH (MODULUS 4, REMAINDER 2); +ERROR: partition constraint is violated by some row +-- should be ok after deleting the bad row +DELETE FROM hpart_5_a; +ALTER TABLE hash_parted ATTACH PARTITION hpart_5 FOR VALUES WITH (MODULUS 4, REMAINDER 2); +-- check that the table being attach is with valid modulus and remainder value +CREATE TABLE fail_part(LIKE hash_parted); +ALTER TABLE hash_parted ATTACH PARTITION fail_part FOR VALUES WITH (MODULUS 0, REMAINDER 1); +ERROR: modulus for hash partition must be a positive integer +ALTER TABLE hash_parted ATTACH PARTITION fail_part FOR VALUES WITH (MODULUS 8, REMAINDER 8); +ERROR: remainder for hash partition must be less than modulus +ALTER TABLE hash_parted ATTACH PARTITION fail_part FOR VALUES WITH (MODULUS 3, REMAINDER 2); +ERROR: every hash partition modulus must be a factor of the next larger modulus +DROP TABLE fail_part; -- -- DETACH PARTITION -- @@ -3445,12 +3499,17 @@ DROP TABLE regular_table; -- check that the partition being detached exists at all ALTER TABLE list_parted2 DETACH PARTITION part_4; ERROR: relation "part_4" does not exist +ALTER TABLE hash_parted DETACH PARTITION hpart_4; +ERROR: relation "hpart_4" does not exist -- check that the partition being detached is actually a partition of the parent CREATE TABLE not_a_part (a int); ALTER TABLE list_parted2 DETACH PARTITION not_a_part; ERROR: relation "not_a_part" is not a partition of relation "list_parted2" ALTER TABLE list_parted2 DETACH PARTITION part_1; ERROR: relation "part_1" is not a partition of relation "list_parted2" +ALTER TABLE hash_parted DETACH PARTITION not_a_part; +ERROR: relation "not_a_part" is not a partition of relation "hash_parted" +DROP TABLE not_a_part; -- check that, after being detached, attinhcount/coninhcount is dropped 
to 0 and -- attislocal/conislocal is set to true ALTER TABLE list_parted2 DETACH PARTITION part_3_4; @@ -3546,6 +3605,9 @@ ALTER TABLE list_parted2 ALTER COLUMN b TYPE text; ERROR: cannot alter type of column named in partition key -- cleanup DROP TABLE list_parted, list_parted2, range_parted; +DROP TABLE hash_parted; +DROP OPERATOR CLASS custom_opclass USING HASH; +DROP FUNCTION dummy_hashint4(a int4, seed int8); -- more tests for certain multi-level partitioning scenarios create table p (a int, b int) partition by range (a, b); create table p1 (b int, a int not null) partition by range (b); diff --git a/src/test/regress/expected/alter_table_2.out b/src/test/regress/expected/alter_table_2.out index c25de7bc..88f9f851 100644 --- a/src/test/regress/expected/alter_table_2.out +++ b/src/test/regress/expected/alter_table_2.out @@ -3303,6 +3303,7 @@ SELECT conislocal, coninhcount FROM pg_constraint WHERE conrelid = 'part_1'::reg CREATE TABLE fail_part (LIKE part_1 INCLUDING CONSTRAINTS); ALTER TABLE list_parted ATTACH PARTITION fail_part FOR VALUES IN (1); ERROR: partition "fail_part" would overlap partition "part_1" +DROP TABLE fail_part; -- check validation when attaching list partitions CREATE TABLE list_parted2 ( a int, @@ -3434,6 +3435,59 @@ DETAIL: "part_5" is already a child of "list_parted2". ALTER TABLE list_parted2 ATTACH PARTITION list_parted2 FOR VALUES IN (0); ERROR: circular inheritance not allowed DETAIL: "list_parted2" is already a child of "list_parted2". +-- check validation when attaching hash partitions +-- The default hash functions as they exist today aren't portable; they can +-- return different results on different machines. Depending upon how the +-- values are hashed, the row may map to different partitions, which result in +-- regression failure. To avoid this, let's create a non-default hash function +-- that just returns the input value unchanged. 
+CREATE OR REPLACE FUNCTION dummy_hashint4(a int4, seed int8) RETURNS int8 AS +$$ BEGIN RETURN (a + 1 + seed); END; $$ LANGUAGE 'plpgsql' IMMUTABLE; +CREATE OPERATOR CLASS custom_opclass FOR TYPE int4 USING HASH AS +OPERATOR 1 = , FUNCTION 2 dummy_hashint4(int4, int8); +-- check that the new partition won't overlap with an existing partition +CREATE TABLE hash_parted ( + a int, + b int +) PARTITION BY HASH (a custom_opclass); +CREATE TABLE hpart_1 PARTITION OF hash_parted FOR VALUES WITH (MODULUS 4, REMAINDER 0); +CREATE TABLE fail_part (LIKE hpart_1); +ALTER TABLE hash_parted ATTACH PARTITION fail_part FOR VALUES WITH (MODULUS 8, REMAINDER 4); +ERROR: partition "fail_part" would overlap partition "hpart_1" +ALTER TABLE hash_parted ATTACH PARTITION fail_part FOR VALUES WITH (MODULUS 8, REMAINDER 0); +ERROR: partition "fail_part" would overlap partition "hpart_1" +DROP TABLE fail_part; +-- check validation when attaching hash partitions +-- check that violating rows are correctly reported +CREATE TABLE hpart_2 (LIKE hash_parted); +INSERT INTO hpart_2 VALUES (3, 0); +ALTER TABLE hash_parted ATTACH PARTITION hpart_2 FOR VALUES WITH (MODULUS 4, REMAINDER 1); +ERROR: partition constraint is violated by some row +-- should be ok after deleting the bad row +DELETE FROM hpart_2; +ALTER TABLE hash_parted ATTACH PARTITION hpart_2 FOR VALUES WITH (MODULUS 4, REMAINDER 1); +-- check that leaf partitions are scanned when attaching a partitioned +-- table +CREATE TABLE hpart_5 ( + LIKE hash_parted +) PARTITION BY LIST (b); +-- check that violating rows are correctly reported +CREATE TABLE hpart_5_a PARTITION OF hpart_5 FOR VALUES IN ('1', '2', '3'); +INSERT INTO hpart_5_a (a, b) VALUES (7, 1); +ALTER TABLE hash_parted ATTACH PARTITION hpart_5 FOR VALUES WITH (MODULUS 4, REMAINDER 2); +ERROR: partition constraint is violated by some row +-- should be ok after deleting the bad row +DELETE FROM hpart_5_a; +ALTER TABLE hash_parted ATTACH PARTITION hpart_5 FOR VALUES WITH (MODULUS 4, REMAINDER 2); +-- check that the table being attach is with valid modulus and remainder value +CREATE TABLE fail_part(LIKE hash_parted); +ALTER TABLE hash_parted ATTACH PARTITION fail_part FOR VALUES WITH (MODULUS 0, REMAINDER 1); +ERROR: modulus for hash partition must be a positive integer +ALTER TABLE hash_parted ATTACH PARTITION fail_part FOR VALUES WITH (MODULUS 8, REMAINDER 8); +ERROR: remainder for hash partition must be less than modulus +ALTER TABLE hash_parted ATTACH PARTITION fail_part FOR VALUES WITH (MODULUS 3, REMAINDER 2); +ERROR: every hash partition modulus must be a factor of the next larger modulus +DROP TABLE fail_part; -- -- DETACH PARTITION -- @@ -3445,12 +3499,17 @@ DROP TABLE regular_table; -- check that the partition being detached exists at all ALTER TABLE list_parted2 DETACH PARTITION part_4; ERROR: relation "part_4" does not exist +ALTER TABLE hash_parted DETACH PARTITION hpart_4; +ERROR: relation "hpart_4" does not exist -- check that the partition being detached is actually a partition of the parent CREATE TABLE not_a_part (a int); ALTER TABLE list_parted2 DETACH PARTITION not_a_part; ERROR: relation "not_a_part" is not a partition of relation "list_parted2" ALTER TABLE list_parted2 DETACH PARTITION part_1; ERROR: relation "part_1" is not a partition of relation "list_parted2" +ALTER TABLE hash_parted DETACH PARTITION not_a_part; +ERROR: relation "not_a_part" is not a partition of relation "hash_parted" +DROP TABLE not_a_part; -- check that, after being detached, attinhcount/coninhcount is dropped 
to 0 and -- attislocal/conislocal is set to true ALTER TABLE list_parted2 DETACH PARTITION part_3_4; @@ -3546,6 +3605,9 @@ ALTER TABLE list_parted2 ALTER COLUMN b TYPE text; ERROR: cannot alter type of column named in partition key -- cleanup DROP TABLE list_parted, list_parted2, range_parted; +DROP TABLE hash_parted; +DROP OPERATOR CLASS custom_opclass USING HASH; +DROP FUNCTION dummy_hashint4(a int4, seed int8); -- more tests for certain multi-level partitioning scenarios create table p (a int, b int) partition by range (a, b); create table p1 (b int, a int not null) partition by range (b); diff --git a/src/test/regress/expected/alter_table_3.out b/src/test/regress/expected/alter_table_3.out index 031cc211..345150e0 100644 --- a/src/test/regress/expected/alter_table_3.out +++ b/src/test/regress/expected/alter_table_3.out @@ -3303,6 +3303,7 @@ SELECT conislocal, coninhcount FROM pg_constraint WHERE conrelid = 'part_1'::reg CREATE TABLE fail_part (LIKE part_1 INCLUDING CONSTRAINTS); ALTER TABLE list_parted ATTACH PARTITION fail_part FOR VALUES IN (1); ERROR: partition "fail_part" would overlap partition "part_1" +DROP TABLE fail_part; -- check validation when attaching list partitions CREATE TABLE list_parted2 ( a int, @@ -3434,6 +3435,59 @@ DETAIL: "part_5" is already a child of "list_parted2". ALTER TABLE list_parted2 ATTACH PARTITION list_parted2 FOR VALUES IN (0); ERROR: circular inheritance not allowed DETAIL: "list_parted2" is already a child of "list_parted2". +-- check validation when attaching hash partitions +-- The default hash functions as they exist today aren't portable; they can +-- return different results on different machines. Depending upon how the +-- values are hashed, the row may map to different partitions, which result in +-- regression failure. To avoid this, let's create a non-default hash function +-- that just returns the input value unchanged. 
+CREATE OR REPLACE FUNCTION dummy_hashint4(a int4, seed int8) RETURNS int8 AS +$$ BEGIN RETURN (a + 1 + seed); END; $$ LANGUAGE 'plpgsql' IMMUTABLE; +CREATE OPERATOR CLASS custom_opclass FOR TYPE int4 USING HASH AS +OPERATOR 1 = , FUNCTION 2 dummy_hashint4(int4, int8); +-- check that the new partition won't overlap with an existing partition +CREATE TABLE hash_parted ( + a int, + b int +) PARTITION BY HASH (a custom_opclass); +CREATE TABLE hpart_1 PARTITION OF hash_parted FOR VALUES WITH (MODULUS 4, REMAINDER 0); +CREATE TABLE fail_part (LIKE hpart_1); +ALTER TABLE hash_parted ATTACH PARTITION fail_part FOR VALUES WITH (MODULUS 8, REMAINDER 4); +ERROR: partition "fail_part" would overlap partition "hpart_1" +ALTER TABLE hash_parted ATTACH PARTITION fail_part FOR VALUES WITH (MODULUS 8, REMAINDER 0); +ERROR: partition "fail_part" would overlap partition "hpart_1" +DROP TABLE fail_part; +-- check validation when attaching hash partitions +-- check that violating rows are correctly reported +CREATE TABLE hpart_2 (LIKE hash_parted); +INSERT INTO hpart_2 VALUES (3, 0); +ALTER TABLE hash_parted ATTACH PARTITION hpart_2 FOR VALUES WITH (MODULUS 4, REMAINDER 1); +ERROR: partition constraint is violated by some row +-- should be ok after deleting the bad row +DELETE FROM hpart_2; +ALTER TABLE hash_parted ATTACH PARTITION hpart_2 FOR VALUES WITH (MODULUS 4, REMAINDER 1); +-- check that leaf partitions are scanned when attaching a partitioned +-- table +CREATE TABLE hpart_5 ( + LIKE hash_parted +) PARTITION BY LIST (b); +-- check that violating rows are correctly reported +CREATE TABLE hpart_5_a PARTITION OF hpart_5 FOR VALUES IN ('1', '2', '3'); +INSERT INTO hpart_5_a (a, b) VALUES (7, 1); +ALTER TABLE hash_parted ATTACH PARTITION hpart_5 FOR VALUES WITH (MODULUS 4, REMAINDER 2); +ERROR: partition constraint is violated by some row +-- should be ok after deleting the bad row +DELETE FROM hpart_5_a; +ALTER TABLE hash_parted ATTACH PARTITION hpart_5 FOR VALUES WITH (MODULUS 4, REMAINDER 2); +-- check that the table being attach is with valid modulus and remainder value +CREATE TABLE fail_part(LIKE hash_parted); +ALTER TABLE hash_parted ATTACH PARTITION fail_part FOR VALUES WITH (MODULUS 0, REMAINDER 1); +ERROR: modulus for hash partition must be a positive integer +ALTER TABLE hash_parted ATTACH PARTITION fail_part FOR VALUES WITH (MODULUS 8, REMAINDER 8); +ERROR: remainder for hash partition must be less than modulus +ALTER TABLE hash_parted ATTACH PARTITION fail_part FOR VALUES WITH (MODULUS 3, REMAINDER 2); +ERROR: every hash partition modulus must be a factor of the next larger modulus +DROP TABLE fail_part; -- -- DETACH PARTITION -- @@ -3445,12 +3499,17 @@ DROP TABLE regular_table; -- check that the partition being detached exists at all ALTER TABLE list_parted2 DETACH PARTITION part_4; ERROR: relation "part_4" does not exist +ALTER TABLE hash_parted DETACH PARTITION hpart_4; +ERROR: relation "hpart_4" does not exist -- check that the partition being detached is actually a partition of the parent CREATE TABLE not_a_part (a int); ALTER TABLE list_parted2 DETACH PARTITION not_a_part; ERROR: relation "not_a_part" is not a partition of relation "list_parted2" ALTER TABLE list_parted2 DETACH PARTITION part_1; ERROR: relation "part_1" is not a partition of relation "list_parted2" +ALTER TABLE hash_parted DETACH PARTITION not_a_part; +ERROR: relation "not_a_part" is not a partition of relation "hash_parted" +DROP TABLE not_a_part; -- check that, after being detached, attinhcount/coninhcount is dropped 
to 0 and -- attislocal/conislocal is set to true ALTER TABLE list_parted2 DETACH PARTITION part_3_4; @@ -3546,6 +3605,9 @@ ALTER TABLE list_parted2 ALTER COLUMN b TYPE text; ERROR: cannot alter type of column named in partition key -- cleanup DROP TABLE list_parted, list_parted2, range_parted; +DROP TABLE hash_parted; +DROP OPERATOR CLASS custom_opclass USING HASH; +DROP FUNCTION dummy_hashint4(a int4, seed int8); -- more tests for certain multi-level partitioning scenarios create table p (a int, b int) partition by range (a, b); create table p1 (b int, a int not null) partition by range (b); diff --git a/src/test/regress/expected/create_table.out b/src/test/regress/expected/create_table.out index 982e28f0..4ae86d8c 100644 --- a/src/test/regress/expected/create_table.out +++ b/src/test/regress/expected/create_table.out @@ -343,11 +343,11 @@ CREATE TABLE partitioned ( ) PARTITION BY RANGE (const_func()); ERROR: cannot use constant expression as partition key DROP FUNCTION const_func(); --- only accept "list" and "range" as partitioning strategy +-- only accept valid partitioning strategy CREATE TABLE partitioned ( a int -) PARTITION BY HASH (a); -ERROR: unrecognized partitioning strategy "hash" +) PARTITION BY MAGIC (a); +ERROR: unrecognized partitioning strategy "magic" -- specified column must be present in the table CREATE TABLE partitioned ( a int @@ -470,6 +470,11 @@ CREATE TABLE fail_part PARTITION OF list_parted FOR VALUES FROM (1) TO (2); ERROR: invalid bound specification for a list partition LINE 1: ...BLE fail_part PARTITION OF list_parted FOR VALUES FROM (1) T... ^ +-- trying to specify modulus and remainder for list partitioned table +CREATE TABLE fail_part PARTITION OF list_parted FOR VALUES WITH (MODULUS 10, REMAINDER 1); +ERROR: invalid bound specification for a list partition +LINE 1: ...BLE fail_part PARTITION OF list_parted FOR VALUES WITH (MODU... + ^ -- check default partition cannot be created more than once CREATE TABLE part_default PARTITION OF list_parted DEFAULT; CREATE TABLE fail_default_part PARTITION OF list_parted DEFAULT; @@ -512,6 +517,11 @@ CREATE TABLE fail_part PARTITION OF range_parted FOR VALUES IN ('a'); ERROR: invalid bound specification for a range partition LINE 1: ...BLE fail_part PARTITION OF range_parted FOR VALUES IN ('a'); ^ +-- trying to specify modulus and remainder for range partitioned table +CREATE TABLE fail_part PARTITION OF range_parted FOR VALUES WITH (MODULUS 10, REMAINDER 1); +ERROR: invalid bound specification for a range partition +LINE 1: ...LE fail_part PARTITION OF range_parted FOR VALUES WITH (MODU... + ^ -- each of start and end bounds must have same number of values as the -- length of the partition key CREATE TABLE fail_part PARTITION OF range_parted FOR VALUES FROM ('a', 1) TO ('z'); @@ -521,6 +531,37 @@ ERROR: TO must specify exactly one value per partitioning column -- cannot specify null values in range bounds CREATE TABLE fail_part PARTITION OF range_parted FOR VALUES FROM (null) TO (maxvalue); ERROR: cannot specify NULL in range bound +-- trying to specify modulus and remainder for range partitioned table +CREATE TABLE fail_part PARTITION OF range_parted FOR VALUES WITH (MODULUS 10, REMAINDER 1); +ERROR: invalid bound specification for a range partition +LINE 1: ...LE fail_part PARTITION OF range_parted FOR VALUES WITH (MODU... 
+ ^ +-- check partition bound syntax for the hash partition +CREATE TABLE hash_parted ( + a int +) PARTITION BY HASH (a); +CREATE TABLE hpart_1 PARTITION OF hash_parted FOR VALUES WITH (MODULUS 10, REMAINDER 0); +CREATE TABLE hpart_2 PARTITION OF hash_parted FOR VALUES WITH (MODULUS 50, REMAINDER 1); +CREATE TABLE hpart_3 PARTITION OF hash_parted FOR VALUES WITH (MODULUS 200, REMAINDER 2); +-- modulus 25 is factor of modulus of 50 but 10 is not factor of 25. +CREATE TABLE fail_part PARTITION OF hash_parted FOR VALUES WITH (MODULUS 25, REMAINDER 3); +ERROR: every hash partition modulus must be a factor of the next larger modulus +-- previous modulus 50 is factor of 150 but this modulus is not factor of next modulus 200. +CREATE TABLE fail_part PARTITION OF hash_parted FOR VALUES WITH (MODULUS 150, REMAINDER 3); +ERROR: every hash partition modulus must be a factor of the next larger modulus +-- trying to specify range for the hash partitioned table +CREATE TABLE fail_part PARTITION OF hash_parted FOR VALUES FROM ('a', 1) TO ('z'); +ERROR: invalid bound specification for a hash partition +LINE 1: ...BLE fail_part PARTITION OF hash_parted FOR VALUES FROM ('a',... + ^ +-- trying to specify list value for the hash partitioned table +CREATE TABLE fail_part PARTITION OF hash_parted FOR VALUES IN (1000); +ERROR: invalid bound specification for a hash partition +LINE 1: ...BLE fail_part PARTITION OF hash_parted FOR VALUES IN (1000); + ^ +-- trying to create default partition for the hash partitioned table +CREATE TABLE fail_default_part PARTITION OF hash_parted DEFAULT; +ERROR: a hash-partitioned table may not have a default partition -- check if compatible with the specified parent -- cannot create as partition of a non-partitioned table CREATE TABLE unparted ( @@ -528,6 +569,8 @@ CREATE TABLE unparted ( ); CREATE TABLE fail_part PARTITION OF unparted FOR VALUES IN ('a'); ERROR: "unparted" is not partitioned +CREATE TABLE fail_part PARTITION OF unparted FOR VALUES WITH (MODULUS 2, REMAINDER 1); +ERROR: "unparted" is not partitioned DROP TABLE unparted; -- cannot create a permanent rel as partition of a temp rel CREATE TEMP TABLE temp_parted ( @@ -628,6 +671,23 @@ CREATE TABLE range3_default PARTITION OF range_parted3 DEFAULT; -- more specific ranges CREATE TABLE fail_part PARTITION OF range_parted3 FOR VALUES FROM (1, minvalue) TO (1, maxvalue); ERROR: partition "fail_part" would overlap partition "part10" +-- check for partition bound overlap and other invalid specifications for the hash partition +CREATE TABLE hash_parted2 ( + a varchar +) PARTITION BY HASH (a); +CREATE TABLE h2part_1 PARTITION OF hash_parted2 FOR VALUES WITH (MODULUS 4, REMAINDER 2); +CREATE TABLE h2part_2 PARTITION OF hash_parted2 FOR VALUES WITH (MODULUS 8, REMAINDER 0); +CREATE TABLE h2part_3 PARTITION OF hash_parted2 FOR VALUES WITH (MODULUS 8, REMAINDER 4); +CREATE TABLE h2part_4 PARTITION OF hash_parted2 FOR VALUES WITH (MODULUS 8, REMAINDER 5); +-- overlap with part_4 +CREATE TABLE fail_part PARTITION OF hash_parted2 FOR VALUES WITH (MODULUS 2, REMAINDER 1); +ERROR: partition "fail_part" would overlap partition "h2part_4" +-- modulus must be greater than zero +CREATE TABLE fail_part PARTITION OF hash_parted2 FOR VALUES WITH (MODULUS 0, REMAINDER 1); +ERROR: modulus for hash partition must be a positive integer +-- remainder must be greater than or equal to zero and less than modulus +CREATE TABLE fail_part PARTITION OF hash_parted2 FOR VALUES WITH (MODULUS 8, REMAINDER 8); +ERROR: remainder for hash partition must be 
less than modulus -- check schema propagation from parent CREATE TABLE parted ( a text, @@ -732,9 +792,17 @@ Check constraints: "check_a" CHECK (length(a) > 0) Number of partitions: 3 (Use \d+ to list them.) +\d hash_parted + Table "public.hash_parted" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | +Partition key: HASH (a) +Number of partitions: 3 (Use \d+ to list them.) + -- check that we get the expected partition constraints CREATE TABLE range_parted4 (a int, b int, c int) PARTITION BY RANGE (abs(a), abs(b), c); -CREATE TABLE unbounded_range_part PARTITION OF range_parted4 FOR VALUES FROM (MINVALUE, 0, 0) TO (MAXVALUE, 0, 0); +CREATE TABLE unbounded_range_part PARTITION OF range_parted4 FOR VALUES FROM (MINVALUE, MINVALUE, MINVALUE) TO (MAXVALUE, MAXVALUE, MAXVALUE); \d+ unbounded_range_part Table "public.unbounded_range_part" Column | Type | Collation | Nullable | Default | Storage | Stats target | Description @@ -742,13 +810,13 @@ CREATE TABLE unbounded_range_part PARTITION OF range_parted4 FOR VALUES FROM (MI a | integer | | | | plain | | b | integer | | | | plain | | c | integer | | | | plain | | -Partition of: range_parted4 FOR VALUES FROM (MINVALUE, 0, 0) TO (MAXVALUE, 0, 0) +Partition of: range_parted4 FOR VALUES FROM (MINVALUE, MINVALUE, MINVALUE) TO (MAXVALUE, MAXVALUE, MAXVALUE) Partition constraint: ((abs(a) IS NOT NULL) AND (abs(b) IS NOT NULL) AND (c IS NOT NULL)) Distribute By: HASH(a) Location Nodes: ALL DATANODES DROP TABLE unbounded_range_part; -CREATE TABLE range_parted4_1 PARTITION OF range_parted4 FOR VALUES FROM (MINVALUE, 0, 0) TO (1, MAXVALUE, 0); +CREATE TABLE range_parted4_1 PARTITION OF range_parted4 FOR VALUES FROM (MINVALUE, MINVALUE, MINVALUE) TO (1, MAXVALUE, MAXVALUE); \d+ range_parted4_1 Table "public.range_parted4_1" Column | Type | Collation | Nullable | Default | Storage | Stats target | Description @@ -756,7 +824,7 @@ CREATE TABLE range_parted4_1 PARTITION OF range_parted4 FOR VALUES FROM (MINVALU a | integer | | | | plain | | b | integer | | | | plain | | c | integer | | | | plain | | -Partition of: range_parted4 FOR VALUES FROM (MINVALUE, 0, 0) TO (1, MAXVALUE, 0) +Partition of: range_parted4 FOR VALUES FROM (MINVALUE, MINVALUE, MINVALUE) TO (1, MAXVALUE, MAXVALUE) Partition constraint: ((abs(a) IS NOT NULL) AND (abs(b) IS NOT NULL) AND (c IS NOT NULL) AND (abs(a) <= 1)) Distribute By: HASH(a) Location Nodes: ALL DATANODES @@ -774,7 +842,7 @@ Partition constraint: ((abs(a) IS NOT NULL) AND (abs(b) IS NOT NULL) AND (c IS N Distribute By: HASH(a) Location Nodes: ALL DATANODES -CREATE TABLE range_parted4_3 PARTITION OF range_parted4 FOR VALUES FROM (6, 8, MINVALUE) TO (9, MAXVALUE, 0); +CREATE TABLE range_parted4_3 PARTITION OF range_parted4 FOR VALUES FROM (6, 8, MINVALUE) TO (9, MAXVALUE, MAXVALUE); \d+ range_parted4_3 Table "public.range_parted4_3" Column | Type | Collation | Nullable | Default | Storage | Stats target | Description @@ -782,7 +850,7 @@ CREATE TABLE range_parted4_3 PARTITION OF range_parted4 FOR VALUES FROM (6, 8, M a | integer | | | | plain | | b | integer | | | | plain | | c | integer | | | | plain | | -Partition of: range_parted4 FOR VALUES FROM (6, 8, MINVALUE) TO (9, MAXVALUE, 0) +Partition of: range_parted4 FOR VALUES FROM (6, 8, MINVALUE) TO (9, MAXVALUE, MAXVALUE) Partition constraint: ((abs(a) IS NOT NULL) AND (abs(b) IS NOT NULL) AND (c IS NOT NULL) AND ((abs(a) > 6) OR ((abs(a) = 6) AND (abs(b) >= 8))) AND (abs(a) <= 9)) Distribute By: HASH(a) 
Location Nodes: ALL DATANODES @@ -790,6 +858,8 @@ Location Nodes: ALL DATANODES DROP TABLE range_parted4; -- cleanup DROP TABLE parted, list_parted, range_parted, list_parted2, range_parted2, range_parted3; +DROP TABLE hash_parted; +DROP TABLE hash_parted2; -- comments on partitioned tables columns CREATE TABLE parted_col_comment (a int, b text) PARTITION BY LIST (a); COMMENT ON TABLE parted_col_comment IS 'Am partitioned table'; diff --git a/src/test/regress/expected/inherit.out b/src/test/regress/expected/inherit.out index cabe7df5..6e287dc4 100644 --- a/src/test/regress/expected/inherit.out +++ b/src/test/regress/expected/inherit.out @@ -2166,12 +2166,12 @@ drop table range_list_parted; -- check that constraint exclusion is able to cope with the partition -- constraint emitted for multi-column range partitioned tables create table mcrparted (a int, b int, c int) partition by range (a, abs(b), c); -create table mcrparted0 partition of mcrparted for values from (minvalue, 0, 0) to (1, 1, 1); +create table mcrparted0 partition of mcrparted for values from (minvalue, minvalue, minvalue) to (1, 1, 1); create table mcrparted1 partition of mcrparted for values from (1, 1, 1) to (10, 5, 10); create table mcrparted2 partition of mcrparted for values from (10, 5, 10) to (10, 10, 10); create table mcrparted3 partition of mcrparted for values from (11, 1, 1) to (20, 10, 10); create table mcrparted4 partition of mcrparted for values from (20, 10, 10) to (20, 20, 20); -create table mcrparted5 partition of mcrparted for values from (20, 20, 20) to (maxvalue, 0, 0); +create table mcrparted5 partition of mcrparted for values from (20, 20, 20) to (maxvalue, maxvalue, maxvalue); explain (costs off) select * from mcrparted where a = 0; -- scans mcrparted0 QUERY PLAN ------------------------------------------ diff --git a/src/test/regress/expected/inherit_1.out b/src/test/regress/expected/inherit_1.out index c087db53..ff38ed79 100644 --- a/src/test/regress/expected/inherit_1.out +++ b/src/test/regress/expected/inherit_1.out @@ -2160,12 +2160,12 @@ drop table range_list_parted; -- check that constraint exclusion is able to cope with the partition -- constraint emitted for multi-column range partitioned tables create table mcrparted (a int, b int, c int) partition by range (a, abs(b), c); -create table mcrparted0 partition of mcrparted for values from (minvalue, 0, 0) to (1, 1, 1); +create table mcrparted0 partition of mcrparted for values from (minvalue, minvalue, minvalue) to (1, 1, 1); create table mcrparted1 partition of mcrparted for values from (1, 1, 1) to (10, 5, 10); create table mcrparted2 partition of mcrparted for values from (10, 5, 10) to (10, 10, 10); create table mcrparted3 partition of mcrparted for values from (11, 1, 1) to (20, 10, 10); create table mcrparted4 partition of mcrparted for values from (20, 10, 10) to (20, 20, 20); -create table mcrparted5 partition of mcrparted for values from (20, 20, 20) to (maxvalue, 0, 0); +create table mcrparted5 partition of mcrparted for values from (20, 20, 20) to (maxvalue, maxvalue, maxvalue); explain (costs off) select * from mcrparted where a = 0; -- scans mcrparted0 QUERY PLAN ------------------------------------ diff --git a/src/test/regress/expected/inherit_2.out b/src/test/regress/expected/inherit_2.out index 9b61d3c6..8d97e116 100644 --- a/src/test/regress/expected/inherit_2.out +++ b/src/test/regress/expected/inherit_2.out @@ -2131,12 +2131,12 @@ drop table range_list_parted; -- check that constraint exclusion is able to cope with the 
partition -- constraint emitted for multi-column range partitioned tables create table mcrparted (a int, b int, c int) partition by range (a, abs(b), c); -create table mcrparted0 partition of mcrparted for values from (minvalue, 0, 0) to (1, 1, 1); +create table mcrparted0 partition of mcrparted for values from (minvalue, minvalue, minvalue) to (1, 1, 1); create table mcrparted1 partition of mcrparted for values from (1, 1, 1) to (10, 5, 10); create table mcrparted2 partition of mcrparted for values from (10, 5, 10) to (10, 10, 10); create table mcrparted3 partition of mcrparted for values from (11, 1, 1) to (20, 10, 10); create table mcrparted4 partition of mcrparted for values from (20, 10, 10) to (20, 20, 20); -create table mcrparted5 partition of mcrparted for values from (20, 20, 20) to (maxvalue, 0, 0); +create table mcrparted5 partition of mcrparted for values from (20, 20, 20) to (maxvalue, maxvalue, maxvalue); explain (costs off) select * from mcrparted where a = 0; -- scans mcrparted0 QUERY PLAN ------------------------------------------ diff --git a/src/test/regress/expected/inherit_3.out b/src/test/regress/expected/inherit_3.out index d0ff897f..402c6a51 100644 --- a/src/test/regress/expected/inherit_3.out +++ b/src/test/regress/expected/inherit_3.out @@ -2147,12 +2147,12 @@ drop table range_list_parted; -- check that constraint exclusion is able to cope with the partition -- constraint emitted for multi-column range partitioned tables create table mcrparted (a int, b int, c int) partition by range (a, abs(b), c); -create table mcrparted0 partition of mcrparted for values from (minvalue, 0, 0) to (1, 1, 1); +create table mcrparted0 partition of mcrparted for values from (minvalue, minvalue, minvalue) to (1, 1, 1); create table mcrparted1 partition of mcrparted for values from (1, 1, 1) to (10, 5, 10); create table mcrparted2 partition of mcrparted for values from (10, 5, 10) to (10, 10, 10); create table mcrparted3 partition of mcrparted for values from (11, 1, 1) to (20, 10, 10); create table mcrparted4 partition of mcrparted for values from (20, 10, 10) to (20, 20, 20); -create table mcrparted5 partition of mcrparted for values from (20, 20, 20) to (maxvalue, 0, 0); +create table mcrparted5 partition of mcrparted for values from (20, 20, 20) to (maxvalue, maxvalue, maxvalue); explain (costs off) select * from mcrparted where a = 0; -- scans mcrparted0 QUERY PLAN ------------------------------------ diff --git a/src/test/regress/expected/insert.out b/src/test/regress/expected/insert.out index 9d5b125e..96b99abb 100644 --- a/src/test/regress/expected/insert.out +++ b/src/test/regress/expected/insert.out @@ -384,8 +384,54 @@ select tableoid::regclass::text, a, min(b) as min_b, max(b) as max_b from list_p part_null | | 1 | 1 (9 rows) +-- direct partition inserts should check hash partition bound constraint +-- create custom operator class and hash function, for the same reason +-- explained in alter_table.sql +create or replace function dummy_hashint4(a int4, seed int8) returns int8 as +$$ begin return (a + seed); end; $$ language 'plpgsql' immutable; +create operator class custom_opclass for type int4 using hash as +operator 1 = , function 2 dummy_hashint4(int4, int8); +create table hash_parted ( + a int +) partition by hash (a custom_opclass); +create table hpart0 partition of hash_parted for values with (modulus 4, remainder 0); +create table hpart1 partition of hash_parted for values with (modulus 4, remainder 1); +create table hpart2 partition of hash_parted for values with 
(modulus 4, remainder 2); +create table hpart3 partition of hash_parted for values with (modulus 4, remainder 3); +insert into hash_parted values(generate_series(1,10)); +-- direct insert of values divisible by 4 - ok; +insert into hpart0 values(12),(16); +-- fail; +insert into hpart0 values(11); +ERROR: new row for relation "hpart0" violates partition constraint +DETAIL: Failing row contains (11). +-- 11 % 4 -> 3 remainder i.e. valid data for hpart3 partition +insert into hpart3 values(11); +-- view data +select tableoid::regclass as part, a, a%4 as "remainder = a % 4" +from hash_parted order by part; + part | a | remainder = a % 4 +--------+----+------------------- + hpart0 | 4 | 0 + hpart0 | 8 | 0 + hpart0 | 12 | 0 + hpart0 | 16 | 0 + hpart1 | 1 | 1 + hpart1 | 5 | 1 + hpart1 | 9 | 1 + hpart2 | 2 | 2 + hpart2 | 6 | 2 + hpart2 | 10 | 2 + hpart3 | 3 | 3 + hpart3 | 7 | 3 + hpart3 | 11 | 3 +(13 rows) + -- cleanup drop table range_parted, list_parted; +drop table hash_parted; +drop operator class custom_opclass using hash; +drop function dummy_hashint4(a int4, seed int8); -- test that a default partition added as the first partition accepts any value -- including null create table list_parted (a int) partition by list (a); @@ -607,15 +653,28 @@ revoke all on key_desc from someone_else; revoke all on key_desc_1 from someone_else; drop role someone_else; drop table key_desc, key_desc_1; +-- test minvalue/maxvalue restrictions +create table mcrparted (a int, b int, c int) partition by range (a, abs(b), c); +create table mcrparted0 partition of mcrparted for values from (minvalue, 0, 0) to (1, maxvalue, maxvalue); +ERROR: every bound following MINVALUE must also be MINVALUE +LINE 1: ...partition of mcrparted for values from (minvalue, 0, 0) to (... + ^ +create table mcrparted2 partition of mcrparted for values from (10, 6, minvalue) to (10, maxvalue, minvalue); +ERROR: every bound following MAXVALUE must also be MAXVALUE +LINE 1: ...r values from (10, 6, minvalue) to (10, maxvalue, minvalue); + ^ +create table mcrparted4 partition of mcrparted for values from (21, minvalue, 0) to (30, 20, minvalue); +ERROR: every bound following MINVALUE must also be MINVALUE +LINE 1: ...ition of mcrparted for values from (21, minvalue, 0) to (30,... 
+ ^ -- check multi-column range partitioning expression enforces the same -- constraint as what tuple-routing would determine it to be -create table mcrparted (a int, b int, c int) partition by range (a, abs(b), c); -create table mcrparted0 partition of mcrparted for values from (minvalue, 0, 0) to (1, maxvalue, 0); +create table mcrparted0 partition of mcrparted for values from (minvalue, minvalue, minvalue) to (1, maxvalue, maxvalue); create table mcrparted1 partition of mcrparted for values from (2, 1, minvalue) to (10, 5, 10); -create table mcrparted2 partition of mcrparted for values from (10, 6, minvalue) to (10, maxvalue, 0); +create table mcrparted2 partition of mcrparted for values from (10, 6, minvalue) to (10, maxvalue, maxvalue); create table mcrparted3 partition of mcrparted for values from (11, 1, 1) to (20, 10, 10); -create table mcrparted4 partition of mcrparted for values from (21, minvalue, 0) to (30, 20, maxvalue); -create table mcrparted5 partition of mcrparted for values from (30, 21, 20) to (maxvalue, 0, 0); +create table mcrparted4 partition of mcrparted for values from (21, minvalue, minvalue) to (30, 20, maxvalue); +create table mcrparted5 partition of mcrparted for values from (30, 21, 20) to (maxvalue, maxvalue, maxvalue); -- routed to mcrparted0 insert into mcrparted values (0, 1, 1); insert into mcrparted0 values (0, 1, 1); @@ -696,14 +755,14 @@ drop table brtrigpartcon; drop function brtrigpartcon1trigf(); -- check multi-column range partitioning with minvalue/maxvalue constraints create table mcrparted (a text, b int) partition by range(a, b); -create table mcrparted1_lt_b partition of mcrparted for values from (minvalue, 0) to ('b', minvalue); +create table mcrparted1_lt_b partition of mcrparted for values from (minvalue, minvalue) to ('b', minvalue); create table mcrparted2_b partition of mcrparted for values from ('b', minvalue) to ('c', minvalue); create table mcrparted3_c_to_common partition of mcrparted for values from ('c', minvalue) to ('common', minvalue); create table mcrparted4_common_lt_0 partition of mcrparted for values from ('common', minvalue) to ('common', 0); create table mcrparted5_common_0_to_10 partition of mcrparted for values from ('common', 0) to ('common', 10); create table mcrparted6_common_ge_10 partition of mcrparted for values from ('common', 10) to ('common', maxvalue); create table mcrparted7_gt_common_lt_d partition of mcrparted for values from ('common', maxvalue) to ('d', minvalue); -create table mcrparted8_ge_d partition of mcrparted for values from ('d', minvalue) to (maxvalue, 0); +create table mcrparted8_ge_d partition of mcrparted for values from ('d', minvalue) to (maxvalue, maxvalue); \d+ mcrparted Table "public.mcrparted" Column | Type | Collation | Nullable | Default | Storage | Stats target | Description @@ -711,14 +770,14 @@ create table mcrparted8_ge_d partition of mcrparted for values from ('d', minval a | text | | | | extended | | b | integer | | | | plain | | Partition key: RANGE (a, b) -Partitions: mcrparted1_lt_b FOR VALUES FROM (MINVALUE, 0) TO ('b', MINVALUE), +Partitions: mcrparted1_lt_b FOR VALUES FROM (MINVALUE, MINVALUE) TO ('b', MINVALUE), mcrparted2_b FOR VALUES FROM ('b', MINVALUE) TO ('c', MINVALUE), mcrparted3_c_to_common FOR VALUES FROM ('c', MINVALUE) TO ('common', MINVALUE), mcrparted4_common_lt_0 FOR VALUES FROM ('common', MINVALUE) TO ('common', 0), mcrparted5_common_0_to_10 FOR VALUES FROM ('common', 0) TO ('common', 10), mcrparted6_common_ge_10 FOR VALUES FROM ('common', 10) TO ('common', 
MAXVALUE), mcrparted7_gt_common_lt_d FOR VALUES FROM ('common', MAXVALUE) TO ('d', MINVALUE), - mcrparted8_ge_d FOR VALUES FROM ('d', MINVALUE) TO (MAXVALUE, 0) + mcrparted8_ge_d FOR VALUES FROM ('d', MINVALUE) TO (MAXVALUE, MAXVALUE) Distribute By: HASH(a) Location Nodes: ALL DATANODES @@ -728,7 +787,7 @@ Location Nodes: ALL DATANODES --------+---------+-----------+----------+---------+----------+--------------+------------- a | text | | | | extended | | b | integer | | | | plain | | -Partition of: mcrparted FOR VALUES FROM (MINVALUE, 0) TO ('b', MINVALUE) +Partition of: mcrparted FOR VALUES FROM (MINVALUE, MINVALUE) TO ('b', MINVALUE) Partition constraint: ((a IS NOT NULL) AND (b IS NOT NULL) AND (a < 'b'::text)) Distribute By: HASH(a) Location Nodes: ALL DATANODES @@ -805,7 +864,7 @@ Location Nodes: ALL DATANODES --------+---------+-----------+----------+---------+----------+--------------+------------- a | text | | | | extended | | b | integer | | | | plain | | -Partition of: mcrparted FOR VALUES FROM ('d', MINVALUE) TO (MAXVALUE, 0) +Partition of: mcrparted FOR VALUES FROM ('d', MINVALUE) TO (MAXVALUE, MAXVALUE) Partition constraint: ((a IS NOT NULL) AND (b IS NOT NULL) AND (a >= 'd'::text)) Distribute By: HASH(a) Location Nodes: ALL DATANODES diff --git a/src/test/regress/expected/insert_1.out b/src/test/regress/expected/insert_1.out index 66cffedd..3528769d 100644 --- a/src/test/regress/expected/insert_1.out +++ b/src/test/regress/expected/insert_1.out @@ -607,15 +607,28 @@ revoke all on key_desc from someone_else; revoke all on key_desc_1 from someone_else; drop role someone_else; drop table key_desc, key_desc_1; +-- test minvalue/maxvalue restrictions +create table mcrparted (a int, b int, c int) partition by range (a, abs(b), c); +create table mcrparted0 partition of mcrparted for values from (minvalue, 0, 0) to (1, maxvalue, maxvalue); +ERROR: every bound following MINVALUE must also be MINVALUE +LINE 1: ...partition of mcrparted for values from (minvalue, 0, 0) to (... + ^ +create table mcrparted2 partition of mcrparted for values from (10, 6, minvalue) to (10, maxvalue, minvalue); +ERROR: every bound following MAXVALUE must also be MAXVALUE +LINE 1: ...r values from (10, 6, minvalue) to (10, maxvalue, minvalue); + ^ +create table mcrparted4 partition of mcrparted for values from (21, minvalue, 0) to (30, 20, minvalue); +ERROR: every bound following MINVALUE must also be MINVALUE +LINE 1: ...ition of mcrparted for values from (21, minvalue, 0) to (30,... 
+ ^
 -- check multi-column range partitioning expression enforces the same
 -- constraint as what tuple-routing would determine it to be
-create table mcrparted (a int, b int, c int) partition by range (a, abs(b), c);
-create table mcrparted0 partition of mcrparted for values from (minvalue, 0, 0) to (1, maxvalue, 0);
+create table mcrparted0 partition of mcrparted for values from (minvalue, minvalue, minvalue) to (1, maxvalue, maxvalue);
 create table mcrparted1 partition of mcrparted for values from (2, 1, minvalue) to (10, 5, 10);
-create table mcrparted2 partition of mcrparted for values from (10, 6, minvalue) to (10, maxvalue, 0);
+create table mcrparted2 partition of mcrparted for values from (10, 6, minvalue) to (10, maxvalue, maxvalue);
 create table mcrparted3 partition of mcrparted for values from (11, 1, 1) to (20, 10, 10);
-create table mcrparted4 partition of mcrparted for values from (21, minvalue, 0) to (30, 20, maxvalue);
-create table mcrparted5 partition of mcrparted for values from (30, 21, 20) to (maxvalue, 0, 0);
+create table mcrparted4 partition of mcrparted for values from (21, minvalue, minvalue) to (30, 20, maxvalue);
+create table mcrparted5 partition of mcrparted for values from (30, 21, 20) to (maxvalue, maxvalue, maxvalue);
 -- routed to mcrparted0
 insert into mcrparted values (0, 1, 1);
 insert into mcrparted0 values (0, 1, 1);
@@ -696,14 +709,14 @@ drop table brtrigpartcon;
 drop function brtrigpartcon1trigf();
 -- check multi-column range partitioning with minvalue/maxvalue constraints
 create table mcrparted (a text, b int) partition by range(a, b);
-create table mcrparted1_lt_b partition of mcrparted for values from (minvalue, 0) to ('b', minvalue);
+create table mcrparted1_lt_b partition of mcrparted for values from (minvalue, minvalue) to ('b', minvalue);
 create table mcrparted2_b partition of mcrparted for values from ('b', minvalue) to ('c', minvalue);
 create table mcrparted3_c_to_common partition of mcrparted for values from ('c', minvalue) to ('common', minvalue);
 create table mcrparted4_common_lt_0 partition of mcrparted for values from ('common', minvalue) to ('common', 0);
 create table mcrparted5_common_0_to_10 partition of mcrparted for values from ('common', 0) to ('common', 10);
 create table mcrparted6_common_ge_10 partition of mcrparted for values from ('common', 10) to ('common', maxvalue);
 create table mcrparted7_gt_common_lt_d partition of mcrparted for values from ('common', maxvalue) to ('d', minvalue);
-create table mcrparted8_ge_d partition of mcrparted for values from ('d', minvalue) to (maxvalue, 0);
+create table mcrparted8_ge_d partition of mcrparted for values from ('d', minvalue) to (maxvalue, maxvalue);
 \d+ mcrparted
 Table "public.mcrparted"
 Column | Type | Collation | Nullable | Default | Storage | Stats target | Description
@@ -711,14 +724,14 @@ create table mcrparted8_ge_d partition of mcrparted for values from ('d', minval
 a | text | | | | extended | |
 b | integer | | | | plain | |
 Partition key: RANGE (a, b)
-Partitions: mcrparted1_lt_b FOR VALUES FROM (MINVALUE, 0) TO ('b', MINVALUE),
+Partitions: mcrparted1_lt_b FOR VALUES FROM (MINVALUE, MINVALUE) TO ('b', MINVALUE),
 mcrparted2_b FOR VALUES FROM ('b', MINVALUE) TO ('c', MINVALUE),
 mcrparted3_c_to_common FOR VALUES FROM ('c', MINVALUE) TO ('common', MINVALUE),
 mcrparted4_common_lt_0 FOR VALUES FROM ('common', MINVALUE) TO ('common', 0),
 mcrparted5_common_0_to_10 FOR VALUES FROM ('common', 0) TO ('common', 10),
 mcrparted6_common_ge_10 FOR VALUES FROM ('common', 10) TO ('common', 
MAXVALUE), mcrparted7_gt_common_lt_d FOR VALUES FROM ('common', MAXVALUE) TO ('d', MINVALUE), - mcrparted8_ge_d FOR VALUES FROM ('d', MINVALUE) TO (MAXVALUE, 0) + mcrparted8_ge_d FOR VALUES FROM ('d', MINVALUE) TO (MAXVALUE, MAXVALUE) Distribute By: HASH(a) Location Nodes: ALL DATANODES @@ -728,7 +741,7 @@ Location Nodes: ALL DATANODES --------+---------+-----------+----------+---------+----------+--------------+------------- a | text | | | | extended | | b | integer | | | | plain | | -Partition of: mcrparted FOR VALUES FROM (MINVALUE, 0) TO ('b', MINVALUE) +Partition of: mcrparted FOR VALUES FROM (MINVALUE, MINVALUE) TO ('b', MINVALUE) Partition constraint: ((a IS NOT NULL) AND (b IS NOT NULL) AND (a < 'b'::text)) Distribute By: HASH(a) Location Nodes: ALL DATANODES @@ -805,7 +818,7 @@ Location Nodes: ALL DATANODES --------+---------+-----------+----------+---------+----------+--------------+------------- a | text | | | | extended | | b | integer | | | | plain | | -Partition of: mcrparted FOR VALUES FROM ('d', MINVALUE) TO (MAXVALUE, 0) +Partition of: mcrparted FOR VALUES FROM ('d', MINVALUE) TO (MAXVALUE, MAXVALUE) Partition constraint: ((a IS NOT NULL) AND (b IS NOT NULL) AND (a >= 'd'::text)) Distribute By: HASH(a) Location Nodes: ALL DATANODES diff --git a/src/test/regress/expected/partition_join.out b/src/test/regress/expected/partition_join.out index 234b8b53..1c8cdb34 100644 --- a/src/test/regress/expected/partition_join.out +++ b/src/test/regress/expected/partition_join.out @@ -1256,6 +1256,87 @@ SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a = 1 AND a = 2) t1 One-Time Filter: false (14 rows) +-- +-- tests for hash partitioned tables. +-- +CREATE TABLE pht1 (a int, b int, c text) PARTITION BY HASH(c); +CREATE TABLE pht1_p1 PARTITION OF pht1 FOR VALUES WITH (MODULUS 3, REMAINDER 0); +CREATE TABLE pht1_p2 PARTITION OF pht1 FOR VALUES WITH (MODULUS 3, REMAINDER 1); +CREATE TABLE pht1_p3 PARTITION OF pht1 FOR VALUES WITH (MODULUS 3, REMAINDER 2); +INSERT INTO pht1 SELECT i, i, to_char(i/50, 'FM0000') FROM generate_series(0, 599, 2) i; +ANALYZE pht1; +CREATE TABLE pht2 (a int, b int, c text) PARTITION BY HASH(c); +CREATE TABLE pht2_p1 PARTITION OF pht2 FOR VALUES WITH (MODULUS 3, REMAINDER 0); +CREATE TABLE pht2_p2 PARTITION OF pht2 FOR VALUES WITH (MODULUS 3, REMAINDER 1); +CREATE TABLE pht2_p3 PARTITION OF pht2 FOR VALUES WITH (MODULUS 3, REMAINDER 2); +INSERT INTO pht2 SELECT i, i, to_char(i/50, 'FM0000') FROM generate_series(0, 599, 3) i; +ANALYZE pht2; +-- +-- hash partitioned by expression +-- +CREATE TABLE pht1_e (a int, b int, c text) PARTITION BY HASH(ltrim(c, 'A')); +CREATE TABLE pht1_e_p1 PARTITION OF pht1_e FOR VALUES WITH (MODULUS 3, REMAINDER 0); +CREATE TABLE pht1_e_p2 PARTITION OF pht1_e FOR VALUES WITH (MODULUS 3, REMAINDER 1); +CREATE TABLE pht1_e_p3 PARTITION OF pht1_e FOR VALUES WITH (MODULUS 3, REMAINDER 2); +INSERT INTO pht1_e SELECT i, i, 'A' || to_char(i/50, 'FM0000') FROM generate_series(0, 599, 2) i; +ANALYZE pht1_e; +-- test partition matching with N-way join +EXPLAIN (COSTS OFF) +SELECT avg(t1.a), avg(t2.b), avg(t3.a + t3.b), t1.c, t2.c, t3.c FROM pht1 t1, pht2 t2, pht1_e t3 WHERE t1.c = t2.c AND ltrim(t3.c, 'A') = t1.c GROUP BY t1.c, t2.c, t3.c ORDER BY t1.c, t2.c, t3.c; + QUERY PLAN +-------------------------------------------------------------------------------------- + Sort + Sort Key: t1.c, t3.c + -> HashAggregate + Group Key: t1.c, t2.c, t3.c + -> Result + -> Append + -> Hash Join + Hash Cond: (t1.c = t2.c) + -> Seq Scan on pht1_p1 t1 + 
-> Hash + -> Hash Join + Hash Cond: (t2.c = ltrim(t3.c, 'A'::text)) + -> Seq Scan on pht2_p1 t2 + -> Hash + -> Seq Scan on pht1_e_p1 t3 + -> Hash Join + Hash Cond: (t1_1.c = t2_1.c) + -> Seq Scan on pht1_p2 t1_1 + -> Hash + -> Hash Join + Hash Cond: (t2_1.c = ltrim(t3_1.c, 'A'::text)) + -> Seq Scan on pht2_p2 t2_1 + -> Hash + -> Seq Scan on pht1_e_p2 t3_1 + -> Hash Join + Hash Cond: (t1_2.c = t2_2.c) + -> Seq Scan on pht1_p3 t1_2 + -> Hash + -> Hash Join + Hash Cond: (t2_2.c = ltrim(t3_2.c, 'A'::text)) + -> Seq Scan on pht2_p3 t2_2 + -> Hash + -> Seq Scan on pht1_e_p3 t3_2 +(33 rows) + +SELECT avg(t1.a), avg(t2.b), avg(t3.a + t3.b), t1.c, t2.c, t3.c FROM pht1 t1, pht2 t2, pht1_e t3 WHERE t1.c = t2.c AND ltrim(t3.c, 'A') = t1.c GROUP BY t1.c, t2.c, t3.c ORDER BY t1.c, t2.c, t3.c; + avg | avg | avg | c | c | c +----------------------+----------------------+-----------------------+------+------+------- + 24.0000000000000000 | 24.0000000000000000 | 48.0000000000000000 | 0000 | 0000 | A0000 + 74.0000000000000000 | 75.0000000000000000 | 148.0000000000000000 | 0001 | 0001 | A0001 + 124.0000000000000000 | 124.5000000000000000 | 248.0000000000000000 | 0002 | 0002 | A0002 + 174.0000000000000000 | 174.0000000000000000 | 348.0000000000000000 | 0003 | 0003 | A0003 + 224.0000000000000000 | 225.0000000000000000 | 448.0000000000000000 | 0004 | 0004 | A0004 + 274.0000000000000000 | 274.5000000000000000 | 548.0000000000000000 | 0005 | 0005 | A0005 + 324.0000000000000000 | 324.0000000000000000 | 648.0000000000000000 | 0006 | 0006 | A0006 + 374.0000000000000000 | 375.0000000000000000 | 748.0000000000000000 | 0007 | 0007 | A0007 + 424.0000000000000000 | 424.5000000000000000 | 848.0000000000000000 | 0008 | 0008 | A0008 + 474.0000000000000000 | 474.0000000000000000 | 948.0000000000000000 | 0009 | 0009 | A0009 + 524.0000000000000000 | 525.0000000000000000 | 1048.0000000000000000 | 0010 | 0010 | A0010 + 574.0000000000000000 | 574.5000000000000000 | 1148.0000000000000000 | 0011 | 0011 | A0011 +(12 rows) + -- -- multiple levels of partitioning -- diff --git a/src/test/regress/expected/update.out b/src/test/regress/expected/update.out index 2989db8f..0aae60ac 100644 --- a/src/test/regress/expected/update.out +++ b/src/test/regress/expected/update.out @@ -250,6 +250,35 @@ ERROR: new row for relation "list_default" violates partition constraint DETAIL: Failing row contains (a, 10). -- ok update list_default set a = 'x' where a = 'd'; +-- create custom operator class and hash function, for the same reason +-- explained in alter_table.sql +create or replace function dummy_hashint4(a int4, seed int8) returns int8 as +$$ begin return (a + seed); end; $$ language 'plpgsql' immutable; +create operator class custom_opclass for type int4 using hash as +operator 1 = , function 2 dummy_hashint4(int4, int8); +create table hash_parted ( + a int, + b int +) partition by hash (a custom_opclass, b custom_opclass); +create table hpart1 partition of hash_parted for values with (modulus 2, remainder 1); +create table hpart2 partition of hash_parted for values with (modulus 4, remainder 2); +create table hpart3 partition of hash_parted for values with (modulus 8, remainder 0); +create table hpart4 partition of hash_parted for values with (modulus 8, remainder 4); +insert into hpart1 values (1, 1); +insert into hpart2 values (2, 5); +insert into hpart4 values (3, 4); +-- fail +update hpart1 set a = 3, b=4 where a = 1; +ERROR: new row for relation "hpart1" violates partition constraint +DETAIL: Failing row contains (3, 4). 
+update hash_parted set b = b - 1 where b = 1; +ERROR: new row for relation "hpart1" violates partition constraint +DETAIL: Failing row contains (1, 0). +-- ok +update hash_parted set b = b + 8 where b = 1; -- cleanup drop table range_parted; drop table list_parted; +drop table hash_parted; +drop operator class custom_opclass using hash; +drop function dummy_hashint4(a int4, seed int8); diff --git a/src/test/regress/sql/alter_table.sql b/src/test/regress/sql/alter_table.sql index 97d2d9bf..a32521f1 100644 --- a/src/test/regress/sql/alter_table.sql +++ b/src/test/regress/sql/alter_table.sql @@ -2112,6 +2112,7 @@ SELECT conislocal, coninhcount FROM pg_constraint WHERE conrelid = 'part_1'::reg -- check that the new partition won't overlap with an existing partition CREATE TABLE fail_part (LIKE part_1 INCLUDING CONSTRAINTS); ALTER TABLE list_parted ATTACH PARTITION fail_part FOR VALUES IN (1); +DROP TABLE fail_part; -- check that an existing table can be attached as a default partition CREATE TABLE def_part (LIKE list_parted INCLUDING CONSTRAINTS); ALTER TABLE list_parted ATTACH PARTITION def_part DEFAULT; @@ -2307,6 +2308,62 @@ CREATE TABLE quuux1 PARTITION OF quuux FOR VALUES IN (1); CREATE TABLE quuux2 PARTITION OF quuux FOR VALUES IN (2); DROP TABLE quuux; +-- check validation when attaching hash partitions + +-- The default hash functions as they exist today aren't portable; they can +-- return different results on different machines. Depending upon how the +-- values are hashed, the row may map to different partitions, which result in +-- regression failure. To avoid this, let's create a non-default hash function +-- that just returns the input value unchanged. +CREATE OR REPLACE FUNCTION dummy_hashint4(a int4, seed int8) RETURNS int8 AS +$$ BEGIN RETURN (a + 1 + seed); END; $$ LANGUAGE 'plpgsql' IMMUTABLE; +CREATE OPERATOR CLASS custom_opclass FOR TYPE int4 USING HASH AS +OPERATOR 1 = , FUNCTION 2 dummy_hashint4(int4, int8); + +-- check that the new partition won't overlap with an existing partition +CREATE TABLE hash_parted ( + a int, + b int +) PARTITION BY HASH (a custom_opclass); +CREATE TABLE hpart_1 PARTITION OF hash_parted FOR VALUES WITH (MODULUS 4, REMAINDER 0); +CREATE TABLE fail_part (LIKE hpart_1); +ALTER TABLE hash_parted ATTACH PARTITION fail_part FOR VALUES WITH (MODULUS 8, REMAINDER 4); +ALTER TABLE hash_parted ATTACH PARTITION fail_part FOR VALUES WITH (MODULUS 8, REMAINDER 0); +DROP TABLE fail_part; + +-- check validation when attaching hash partitions + +-- check that violating rows are correctly reported +CREATE TABLE hpart_2 (LIKE hash_parted); +INSERT INTO hpart_2 VALUES (3, 0); +ALTER TABLE hash_parted ATTACH PARTITION hpart_2 FOR VALUES WITH (MODULUS 4, REMAINDER 1); + +-- should be ok after deleting the bad row +DELETE FROM hpart_2; +ALTER TABLE hash_parted ATTACH PARTITION hpart_2 FOR VALUES WITH (MODULUS 4, REMAINDER 1); + +-- check that leaf partitions are scanned when attaching a partitioned +-- table +CREATE TABLE hpart_5 ( + LIKE hash_parted +) PARTITION BY LIST (b); + +-- check that violating rows are correctly reported +CREATE TABLE hpart_5_a PARTITION OF hpart_5 FOR VALUES IN ('1', '2', '3'); +INSERT INTO hpart_5_a (a, b) VALUES (7, 1); +ALTER TABLE hash_parted ATTACH PARTITION hpart_5 FOR VALUES WITH (MODULUS 4, REMAINDER 2); + +-- should be ok after deleting the bad row +DELETE FROM hpart_5_a; +ALTER TABLE hash_parted ATTACH PARTITION hpart_5 FOR VALUES WITH (MODULUS 4, REMAINDER 2); + +-- check that the table being attach is with valid modulus and 
remainder value +CREATE TABLE fail_part(LIKE hash_parted); +ALTER TABLE hash_parted ATTACH PARTITION fail_part FOR VALUES WITH (MODULUS 0, REMAINDER 1); +ALTER TABLE hash_parted ATTACH PARTITION fail_part FOR VALUES WITH (MODULUS 8, REMAINDER 8); +ALTER TABLE hash_parted ATTACH PARTITION fail_part FOR VALUES WITH (MODULUS 3, REMAINDER 2); +DROP TABLE fail_part; + -- -- DETACH PARTITION -- @@ -2318,12 +2375,16 @@ DROP TABLE regular_table; -- check that the partition being detached exists at all ALTER TABLE list_parted2 DETACH PARTITION part_4; +ALTER TABLE hash_parted DETACH PARTITION hpart_4; -- check that the partition being detached is actually a partition of the parent CREATE TABLE not_a_part (a int); ALTER TABLE list_parted2 DETACH PARTITION not_a_part; ALTER TABLE list_parted2 DETACH PARTITION part_1; +ALTER TABLE hash_parted DETACH PARTITION not_a_part; +DROP TABLE not_a_part; + -- check that, after being detached, attinhcount/coninhcount is dropped to 0 and -- attislocal/conislocal is set to true ALTER TABLE list_parted2 DETACH PARTITION part_3_4; @@ -2400,6 +2461,9 @@ SELECT * FROM list_parted; -- cleanup DROP TABLE list_parted, list_parted2, range_parted; DROP TABLE fail_def_part; +DROP TABLE hash_parted; +DROP OPERATOR CLASS custom_opclass USING HASH; +DROP FUNCTION dummy_hashint4(a int4, seed int8); -- more tests for certain multi-level partitioning scenarios create table p (a int, b int) partition by range (a, b); diff --git a/src/test/regress/sql/create_table.sql b/src/test/regress/sql/create_table.sql index 1a74fdd1..c1cf6ee1 100644 --- a/src/test/regress/sql/create_table.sql +++ b/src/test/regress/sql/create_table.sql @@ -352,10 +352,10 @@ CREATE TABLE partitioned ( ) PARTITION BY RANGE (const_func()); DROP FUNCTION const_func(); --- only accept "list" and "range" as partitioning strategy +-- only accept valid partitioning strategy CREATE TABLE partitioned ( - a int -) PARTITION BY HASH (a); + a int +) PARTITION BY MAGIC (a); -- specified column must be present in the table CREATE TABLE partitioned ( @@ -448,6 +448,8 @@ CREATE TABLE fail_part PARTITION OF list_parted FOR VALUES IN ('1'::int); CREATE TABLE fail_part PARTITION OF list_parted FOR VALUES IN (); -- trying to specify range for list partitioned table CREATE TABLE fail_part PARTITION OF list_parted FOR VALUES FROM (1) TO (2); +-- trying to specify modulus and remainder for list partitioned table +CREATE TABLE fail_part PARTITION OF list_parted FOR VALUES WITH (MODULUS 10, REMAINDER 1); -- check default partition cannot be created more than once CREATE TABLE part_default PARTITION OF list_parted DEFAULT; @@ -483,6 +485,8 @@ CREATE TABLE range_parted ( -- trying to specify list for range partitioned table CREATE TABLE fail_part PARTITION OF range_parted FOR VALUES IN ('a'); +-- trying to specify modulus and remainder for range partitioned table +CREATE TABLE fail_part PARTITION OF range_parted FOR VALUES WITH (MODULUS 10, REMAINDER 1); -- each of start and end bounds must have same number of values as the -- length of the partition key CREATE TABLE fail_part PARTITION OF range_parted FOR VALUES FROM ('a', 1) TO ('z'); @@ -491,6 +495,28 @@ CREATE TABLE fail_part PARTITION OF range_parted FOR VALUES FROM ('a') TO ('z', -- cannot specify null values in range bounds CREATE TABLE fail_part PARTITION OF range_parted FOR VALUES FROM (null) TO (maxvalue); +-- trying to specify modulus and remainder for range partitioned table +CREATE TABLE fail_part PARTITION OF range_parted FOR VALUES WITH (MODULUS 10, REMAINDER 1); + +-- 
check partition bound syntax for the hash partition +CREATE TABLE hash_parted ( + a int +) PARTITION BY HASH (a); +CREATE TABLE hpart_1 PARTITION OF hash_parted FOR VALUES WITH (MODULUS 10, REMAINDER 0); +CREATE TABLE hpart_2 PARTITION OF hash_parted FOR VALUES WITH (MODULUS 50, REMAINDER 1); +CREATE TABLE hpart_3 PARTITION OF hash_parted FOR VALUES WITH (MODULUS 200, REMAINDER 2); +-- modulus 25 is factor of modulus of 50 but 10 is not factor of 25. +CREATE TABLE fail_part PARTITION OF hash_parted FOR VALUES WITH (MODULUS 25, REMAINDER 3); +-- previous modulus 50 is factor of 150 but this modulus is not factor of next modulus 200. +CREATE TABLE fail_part PARTITION OF hash_parted FOR VALUES WITH (MODULUS 150, REMAINDER 3); +-- trying to specify range for the hash partitioned table +CREATE TABLE fail_part PARTITION OF hash_parted FOR VALUES FROM ('a', 1) TO ('z'); +-- trying to specify list value for the hash partitioned table +CREATE TABLE fail_part PARTITION OF hash_parted FOR VALUES IN (1000); + +-- trying to create default partition for the hash partitioned table +CREATE TABLE fail_default_part PARTITION OF hash_parted DEFAULT; + -- check if compatible with the specified parent -- cannot create as partition of a non-partitioned table @@ -498,6 +524,7 @@ CREATE TABLE unparted ( a int ); CREATE TABLE fail_part PARTITION OF unparted FOR VALUES IN ('a'); +CREATE TABLE fail_part PARTITION OF unparted FOR VALUES WITH (MODULUS 2, REMAINDER 1); DROP TABLE unparted; -- cannot create a permanent rel as partition of a temp rel @@ -587,6 +614,21 @@ CREATE TABLE range3_default PARTITION OF range_parted3 DEFAULT; -- more specific ranges CREATE TABLE fail_part PARTITION OF range_parted3 FOR VALUES FROM (1, minvalue) TO (1, maxvalue); +-- check for partition bound overlap and other invalid specifications for the hash partition +CREATE TABLE hash_parted2 ( + a varchar +) PARTITION BY HASH (a); +CREATE TABLE h2part_1 PARTITION OF hash_parted2 FOR VALUES WITH (MODULUS 4, REMAINDER 2); +CREATE TABLE h2part_2 PARTITION OF hash_parted2 FOR VALUES WITH (MODULUS 8, REMAINDER 0); +CREATE TABLE h2part_3 PARTITION OF hash_parted2 FOR VALUES WITH (MODULUS 8, REMAINDER 4); +CREATE TABLE h2part_4 PARTITION OF hash_parted2 FOR VALUES WITH (MODULUS 8, REMAINDER 5); +-- overlap with part_4 +CREATE TABLE fail_part PARTITION OF hash_parted2 FOR VALUES WITH (MODULUS 2, REMAINDER 1); +-- modulus must be greater than zero +CREATE TABLE fail_part PARTITION OF hash_parted2 FOR VALUES WITH (MODULUS 0, REMAINDER 1); +-- remainder must be greater than or equal to zero and less than modulus +CREATE TABLE fail_part PARTITION OF hash_parted2 FOR VALUES WITH (MODULUS 8, REMAINDER 8); + -- check schema propagation from parent CREATE TABLE parted ( @@ -640,22 +682,25 @@ CREATE TABLE part_c_1_10 PARTITION OF part_c FOR VALUES FROM (1) TO (10); -- output could vary depending on the order in which partition oids are -- returned. 
\d parted +\d hash_parted -- check that we get the expected partition constraints CREATE TABLE range_parted4 (a int, b int, c int) PARTITION BY RANGE (abs(a), abs(b), c); -CREATE TABLE unbounded_range_part PARTITION OF range_parted4 FOR VALUES FROM (MINVALUE, 0, 0) TO (MAXVALUE, 0, 0); +CREATE TABLE unbounded_range_part PARTITION OF range_parted4 FOR VALUES FROM (MINVALUE, MINVALUE, MINVALUE) TO (MAXVALUE, MAXVALUE, MAXVALUE); \d+ unbounded_range_part DROP TABLE unbounded_range_part; -CREATE TABLE range_parted4_1 PARTITION OF range_parted4 FOR VALUES FROM (MINVALUE, 0, 0) TO (1, MAXVALUE, 0); +CREATE TABLE range_parted4_1 PARTITION OF range_parted4 FOR VALUES FROM (MINVALUE, MINVALUE, MINVALUE) TO (1, MAXVALUE, MAXVALUE); \d+ range_parted4_1 CREATE TABLE range_parted4_2 PARTITION OF range_parted4 FOR VALUES FROM (3, 4, 5) TO (6, 7, MAXVALUE); \d+ range_parted4_2 -CREATE TABLE range_parted4_3 PARTITION OF range_parted4 FOR VALUES FROM (6, 8, MINVALUE) TO (9, MAXVALUE, 0); +CREATE TABLE range_parted4_3 PARTITION OF range_parted4 FOR VALUES FROM (6, 8, MINVALUE) TO (9, MAXVALUE, MAXVALUE); \d+ range_parted4_3 DROP TABLE range_parted4; -- cleanup DROP TABLE parted, list_parted, range_parted, list_parted2, range_parted2, range_parted3; +DROP TABLE hash_parted; +DROP TABLE hash_parted2; -- comments on partitioned tables columns CREATE TABLE parted_col_comment (a int, b text) PARTITION BY LIST (a); diff --git a/src/test/regress/sql/inherit.sql b/src/test/regress/sql/inherit.sql index ff11dbcb..58f7f523 100644 --- a/src/test/regress/sql/inherit.sql +++ b/src/test/regress/sql/inherit.sql @@ -734,12 +734,12 @@ drop table range_list_parted; -- check that constraint exclusion is able to cope with the partition -- constraint emitted for multi-column range partitioned tables create table mcrparted (a int, b int, c int) partition by range (a, abs(b), c); -create table mcrparted0 partition of mcrparted for values from (minvalue, 0, 0) to (1, 1, 1); +create table mcrparted0 partition of mcrparted for values from (minvalue, minvalue, minvalue) to (1, 1, 1); create table mcrparted1 partition of mcrparted for values from (1, 1, 1) to (10, 5, 10); create table mcrparted2 partition of mcrparted for values from (10, 5, 10) to (10, 10, 10); create table mcrparted3 partition of mcrparted for values from (11, 1, 1) to (20, 10, 10); create table mcrparted4 partition of mcrparted for values from (20, 10, 10) to (20, 20, 20); -create table mcrparted5 partition of mcrparted for values from (20, 20, 20) to (maxvalue, 0, 0); +create table mcrparted5 partition of mcrparted for values from (20, 20, 20) to (maxvalue, maxvalue, maxvalue); explain (costs off) select * from mcrparted where a = 0; -- scans mcrparted0 explain (costs off) select * from mcrparted where a = 10 and abs(b) < 5; -- scans mcrparted1 explain (costs off) select * from mcrparted where a = 10 and abs(b) = 5; -- scans mcrparted1, mcrparted2 diff --git a/src/test/regress/sql/insert.sql b/src/test/regress/sql/insert.sql index bbfc03c4..ef7abf94 100644 --- a/src/test/regress/sql/insert.sql +++ b/src/test/regress/sql/insert.sql @@ -221,8 +221,41 @@ insert into list_parted select 'gg', s.a from generate_series(1, 9) s(a); insert into list_parted (b) values (1); select tableoid::regclass::text, a, min(b) as min_b, max(b) as max_b from list_parted group by 1, 2 order by 1; +-- direct partition inserts should check hash partition bound constraint + +-- create custom operator class and hash function, for the same reason +-- explained in alter_table.sql +create or 
replace function dummy_hashint4(a int4, seed int8) returns int8 as +$$ begin return (a + seed); end; $$ language 'plpgsql' immutable; +create operator class custom_opclass for type int4 using hash as +operator 1 = , function 2 dummy_hashint4(int4, int8); + +create table hash_parted ( + a int +) partition by hash (a custom_opclass); +create table hpart0 partition of hash_parted for values with (modulus 4, remainder 0); +create table hpart1 partition of hash_parted for values with (modulus 4, remainder 1); +create table hpart2 partition of hash_parted for values with (modulus 4, remainder 2); +create table hpart3 partition of hash_parted for values with (modulus 4, remainder 3); + +insert into hash_parted values(generate_series(1,10)); + +-- direct insert of values divisible by 4 - ok; +insert into hpart0 values(12),(16); +-- fail; +insert into hpart0 values(11); +-- 11 % 4 -> 3 remainder i.e. valid data for hpart3 partition +insert into hpart3 values(11); + +-- view data +select tableoid::regclass as part, a, a%4 as "remainder = a % 4" +from hash_parted order by part; + -- cleanup drop table range_parted, list_parted; +drop table hash_parted; +drop operator class custom_opclass using hash; +drop function dummy_hashint4(a int4, seed int8); -- test that a default partition added as the first partition accepts any value -- including null @@ -363,15 +396,20 @@ revoke all on key_desc_1 from someone_else; drop role someone_else; drop table key_desc, key_desc_1; +-- test minvalue/maxvalue restrictions +create table mcrparted (a int, b int, c int) partition by range (a, abs(b), c); +create table mcrparted0 partition of mcrparted for values from (minvalue, 0, 0) to (1, maxvalue, maxvalue); +create table mcrparted2 partition of mcrparted for values from (10, 6, minvalue) to (10, maxvalue, minvalue); +create table mcrparted4 partition of mcrparted for values from (21, minvalue, 0) to (30, 20, minvalue); + -- check multi-column range partitioning expression enforces the same -- constraint as what tuple-routing would determine it to be -create table mcrparted (a int, b int, c int) partition by range (a, abs(b), c); -create table mcrparted0 partition of mcrparted for values from (minvalue, 0, 0) to (1, maxvalue, 0); +create table mcrparted0 partition of mcrparted for values from (minvalue, minvalue, minvalue) to (1, maxvalue, maxvalue); create table mcrparted1 partition of mcrparted for values from (2, 1, minvalue) to (10, 5, 10); -create table mcrparted2 partition of mcrparted for values from (10, 6, minvalue) to (10, maxvalue, 0); +create table mcrparted2 partition of mcrparted for values from (10, 6, minvalue) to (10, maxvalue, maxvalue); create table mcrparted3 partition of mcrparted for values from (11, 1, 1) to (20, 10, 10); -create table mcrparted4 partition of mcrparted for values from (21, minvalue, 0) to (30, 20, maxvalue); -create table mcrparted5 partition of mcrparted for values from (30, 21, 20) to (maxvalue, 0, 0); +create table mcrparted4 partition of mcrparted for values from (21, minvalue, minvalue) to (30, 20, maxvalue); +create table mcrparted5 partition of mcrparted for values from (30, 21, 20) to (maxvalue, maxvalue, maxvalue); -- routed to mcrparted0 insert into mcrparted values (0, 1, 1); @@ -436,14 +474,14 @@ drop function brtrigpartcon1trigf(); -- check multi-column range partitioning with minvalue/maxvalue constraints create table mcrparted (a text, b int) partition by range(a, b); -create table mcrparted1_lt_b partition of mcrparted for values from (minvalue, 0) to ('b', 
minvalue); +create table mcrparted1_lt_b partition of mcrparted for values from (minvalue, minvalue) to ('b', minvalue); create table mcrparted2_b partition of mcrparted for values from ('b', minvalue) to ('c', minvalue); create table mcrparted3_c_to_common partition of mcrparted for values from ('c', minvalue) to ('common', minvalue); create table mcrparted4_common_lt_0 partition of mcrparted for values from ('common', minvalue) to ('common', 0); create table mcrparted5_common_0_to_10 partition of mcrparted for values from ('common', 0) to ('common', 10); create table mcrparted6_common_ge_10 partition of mcrparted for values from ('common', 10) to ('common', maxvalue); create table mcrparted7_gt_common_lt_d partition of mcrparted for values from ('common', maxvalue) to ('d', minvalue); -create table mcrparted8_ge_d partition of mcrparted for values from ('d', minvalue) to (maxvalue, 0); +create table mcrparted8_ge_d partition of mcrparted for values from ('d', minvalue) to (maxvalue, maxvalue); \d+ mcrparted \d+ mcrparted1_lt_b diff --git a/src/test/regress/sql/partition_join.sql b/src/test/regress/sql/partition_join.sql index ca525d99..2316bbdc 100644 --- a/src/test/regress/sql/partition_join.sql +++ b/src/test/regress/sql/partition_join.sql @@ -229,6 +229,38 @@ SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a = 1 AND a = 2) t1 EXPLAIN (COSTS OFF) SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a = 1 AND a = 2) t1 FULL JOIN prt2 t2 ON t1.a = t2.b WHERE t2.a = 0 ORDER BY t1.a, t2.b; +-- +-- tests for hash partitioned tables. +-- +CREATE TABLE pht1 (a int, b int, c text) PARTITION BY HASH(c); +CREATE TABLE pht1_p1 PARTITION OF pht1 FOR VALUES WITH (MODULUS 3, REMAINDER 0); +CREATE TABLE pht1_p2 PARTITION OF pht1 FOR VALUES WITH (MODULUS 3, REMAINDER 1); +CREATE TABLE pht1_p3 PARTITION OF pht1 FOR VALUES WITH (MODULUS 3, REMAINDER 2); +INSERT INTO pht1 SELECT i, i, to_char(i/50, 'FM0000') FROM generate_series(0, 599, 2) i; +ANALYZE pht1; + +CREATE TABLE pht2 (a int, b int, c text) PARTITION BY HASH(c); +CREATE TABLE pht2_p1 PARTITION OF pht2 FOR VALUES WITH (MODULUS 3, REMAINDER 0); +CREATE TABLE pht2_p2 PARTITION OF pht2 FOR VALUES WITH (MODULUS 3, REMAINDER 1); +CREATE TABLE pht2_p3 PARTITION OF pht2 FOR VALUES WITH (MODULUS 3, REMAINDER 2); +INSERT INTO pht2 SELECT i, i, to_char(i/50, 'FM0000') FROM generate_series(0, 599, 3) i; +ANALYZE pht2; + +-- +-- hash partitioned by expression +-- +CREATE TABLE pht1_e (a int, b int, c text) PARTITION BY HASH(ltrim(c, 'A')); +CREATE TABLE pht1_e_p1 PARTITION OF pht1_e FOR VALUES WITH (MODULUS 3, REMAINDER 0); +CREATE TABLE pht1_e_p2 PARTITION OF pht1_e FOR VALUES WITH (MODULUS 3, REMAINDER 1); +CREATE TABLE pht1_e_p3 PARTITION OF pht1_e FOR VALUES WITH (MODULUS 3, REMAINDER 2); +INSERT INTO pht1_e SELECT i, i, 'A' || to_char(i/50, 'FM0000') FROM generate_series(0, 599, 2) i; +ANALYZE pht1_e; + +-- test partition matching with N-way join +EXPLAIN (COSTS OFF) +SELECT avg(t1.a), avg(t2.b), avg(t3.a + t3.b), t1.c, t2.c, t3.c FROM pht1 t1, pht2 t2, pht1_e t3 WHERE t1.c = t2.c AND ltrim(t3.c, 'A') = t1.c GROUP BY t1.c, t2.c, t3.c ORDER BY t1.c, t2.c, t3.c; +SELECT avg(t1.a), avg(t2.b), avg(t3.a + t3.b), t1.c, t2.c, t3.c FROM pht1 t1, pht2 t2, pht1_e t3 WHERE t1.c = t2.c AND ltrim(t3.c, 'A') = t1.c GROUP BY t1.c, t2.c, t3.c ORDER BY t1.c, t2.c, t3.c; + -- -- multiple levels of partitioning -- diff --git a/src/test/regress/sql/update.sql b/src/test/regress/sql/update.sql index 42c5e405..9d673de4 100644 --- 
a/src/test/regress/sql/update.sql +++ b/src/test/regress/sql/update.sql @@ -148,6 +148,34 @@ update list_default set a = 'a' where a = 'd'; -- ok update list_default set a = 'x' where a = 'd'; +-- create custom operator class and hash function, for the same reason +-- explained in alter_table.sql +create or replace function dummy_hashint4(a int4, seed int8) returns int8 as +$$ begin return (a + seed); end; $$ language 'plpgsql' immutable; +create operator class custom_opclass for type int4 using hash as +operator 1 = , function 2 dummy_hashint4(int4, int8); + +create table hash_parted ( + a int, + b int +) partition by hash (a custom_opclass, b custom_opclass); +create table hpart1 partition of hash_parted for values with (modulus 2, remainder 1); +create table hpart2 partition of hash_parted for values with (modulus 4, remainder 2); +create table hpart3 partition of hash_parted for values with (modulus 8, remainder 0); +create table hpart4 partition of hash_parted for values with (modulus 8, remainder 4); +insert into hpart1 values (1, 1); +insert into hpart2 values (2, 5); +insert into hpart4 values (3, 4); + +-- fail +update hpart1 set a = 3, b=4 where a = 1; +update hash_parted set b = b - 1 where b = 1; +-- ok +update hash_parted set b = b + 8 where b = 1; + -- cleanup drop table range_parted; drop table list_parted; +drop table hash_parted; +drop operator class custom_opclass using hash; +drop function dummy_hashint4(a int4, seed int8); diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index dedefbdf..ba6ce916 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -1562,6 +1562,7 @@ PartitionDispatch PartitionDispatchData PartitionElem PartitionKey +PartitionHashBound PartitionListValue PartitionRangeBound PartitionRangeDatum From 4932c4931e3de93f30dd4eaeed00a8b2ddd78570 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Thu, 25 Jun 2020 20:14:34 +0800 Subject: [PATCH 204/578] Centralize executor-related partitioning code. 
http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/backend/catalog/partition.c | 353 ++-------------- src/backend/commands/copy.c | 1 + src/backend/executor/Makefile | 2 +- src/backend/executor/execMain.c | 261 +----------- src/backend/executor/execPartition.c | 560 +++++++++++++++++++++++++ src/backend/executor/nodeModifyTable.c | 1 + src/include/catalog/partition.h | 48 +-- src/include/executor/execPartition.h | 65 +++ src/include/executor/executor.h | 13 +- 9 files changed, 666 insertions(+), 638 deletions(-) create mode 100644 src/backend/executor/execPartition.c create mode 100644 src/include/executor/execPartition.h diff --git a/src/backend/catalog/partition.c b/src/backend/catalog/partition.c index 9832a333..ae0bbfbe 100644 --- a/src/backend/catalog/partition.c +++ b/src/backend/catalog/partition.c @@ -170,8 +170,6 @@ static int32 partition_bound_cmp(PartitionKey key, static int partition_bound_bsearch(PartitionKey key, PartitionBoundInfo boundinfo, void *probe, bool probe_is_bound, bool *is_equal); -static void get_partition_dispatch_recurse(Relation rel, Relation parent, - List **pds, List **leaf_part_oids); static int get_partition_bound_num_indexes(PartitionBoundInfo b); static int get_greatest_modulus(PartitionBoundInfo b); static uint64 compute_hash_value(PartitionKey key, Datum *values, bool *isnull); @@ -1565,148 +1563,6 @@ get_partition_qual_relid(Oid relid) return result; } -/* - * RelationGetPartitionDispatchInfo - * Returns information necessary to route tuples down a partition tree - * - * The number of elements in the returned array (that is, the number of - * PartitionDispatch objects for the partitioned tables in the partition tree) - * is returned in *num_parted and a list of the OIDs of all the leaf - * partitions of rel is returned in *leaf_part_oids. - * - * All the relations in the partition tree (including 'rel') must have been - * locked (using at least the AccessShareLock) by the caller. - */ -PartitionDispatch * -RelationGetPartitionDispatchInfo(Relation rel, - int *num_parted, List **leaf_part_oids) -{ - List *pdlist = NIL; - PartitionDispatchData **pd; - ListCell *lc; - int i; - - Assert(rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE); - - *num_parted = 0; - *leaf_part_oids = NIL; - - get_partition_dispatch_recurse(rel, NULL, &pdlist, leaf_part_oids); - *num_parted = list_length(pdlist); - pd = (PartitionDispatchData **) palloc(*num_parted * - sizeof(PartitionDispatchData *)); - i = 0; - foreach(lc, pdlist) - { - pd[i++] = lfirst(lc); - } - - return pd; - } - - /* - * get_partition_dispatch_recurse - * Recursively expand partition tree rooted at rel - * - * As the partition tree is expanded in a depth-first manner, we mantain two - * global lists: of PartitionDispatch objects corresponding to partitioned - * tables in *pds and of the leaf partition OIDs in *leaf_part_oids. - * - * Note that the order of OIDs of leaf partitions in leaf_part_oids matches - * the order in which the planner's expand_partitioned_rtentry() processes - * them. It's not necessarily the case that the offsets match up exactly, - * because constraint exclusion might prune away some partitions on the - * planner side, whereas we'll always have the complete list; but unpruned - * partitions will appear in the same order in the plan as they are returned - * here. 
- */ -static void -get_partition_dispatch_recurse(Relation rel, Relation parent, - List **pds, List **leaf_part_oids) -{ - TupleDesc tupdesc = RelationGetDescr(rel); - PartitionDesc partdesc = RelationGetPartitionDesc(rel); - PartitionKey partkey = RelationGetPartitionKey(rel); - PartitionDispatch pd; - int i; - - check_stack_depth(); - - /* Build a PartitionDispatch for this table and add it to *pds. */ - pd = (PartitionDispatch) palloc(sizeof(PartitionDispatchData)); - *pds = lappend(*pds, pd); - pd->reldesc = rel; - pd->key = partkey; - pd->keystate = NIL; - pd->partdesc = partdesc; - if (parent != NULL) - { - /* - * For every partitioned table other than the root, we must store a - * tuple table slot initialized with its tuple descriptor and a tuple - * conversion map to convert a tuple from its parent's rowtype to its - * own. That is to make sure that we are looking at the correct row - * using the correct tuple descriptor when computing its partition key - * for tuple routing. - */ - pd->tupslot = MakeSingleTupleTableSlot(tupdesc); - pd->tupmap = convert_tuples_by_name(RelationGetDescr(parent), - tupdesc, - gettext_noop("could not convert row type")); - } - else - { - /* Not required for the root partitioned table */ - pd->tupslot = NULL; - pd->tupmap = NULL; - } - - /* - * Go look at each partition of this table. If it's a leaf partition, - * simply add its OID to *leaf_part_oids. If it's a partitioned table, - * recursively call get_partition_dispatch_recurse(), so that its - * partitions are processed as well and a corresponding PartitionDispatch - * object gets added to *pds. - * - * About the values in pd->indexes: for a leaf partition, it contains the - * leaf partition's position in the global list *leaf_part_oids minus 1, - * whereas for a partitioned table partition, it contains the partition's - * position in the global list *pds multiplied by -1. The latter is - * multiplied by -1 to distinguish partitioned tables from leaf partitions - * when going through the values in pd->indexes. So, for example, when - * using it during tuple-routing, encountering a value >= 0 means we found - * a leaf partition. It is immediately returned as the index in the array - * of ResultRelInfos of all the leaf partitions, using which we insert the - * tuple into that leaf partition. A negative value means we found a - * partitioned table. The value multiplied by -1 is returned as the index - * in the array of PartitionDispatch objects of all partitioned tables in - * the tree. This value is used to continue the search in the next level - * of the partition tree. - */ - pd->indexes = (int *) palloc(partdesc->nparts * sizeof(int)); - for (i = 0; i < partdesc->nparts; i++) - { - Oid partrelid = partdesc->oids[i]; - - if (get_rel_relkind(partrelid) != RELKIND_PARTITIONED_TABLE) - { - *leaf_part_oids = lappend_oid(*leaf_part_oids, partrelid); - pd->indexes[i] = list_length(*leaf_part_oids) - 1; - } - else - { - /* - * We assume all tables in the partition tree were already locked - * by the caller. - */ - Relation partrel = heap_open(partrelid, NoLock); - - pd->indexes[i] = -list_length(*pds); - get_partition_dispatch_recurse(partrel, rel, pds, leaf_part_oids); - } - } -} - /* Module-local functions */ /* @@ -2646,134 +2502,21 @@ generate_partition_qual(Relation rel) return result; } -/* ---------------- - * FormPartitionKeyDatum - * Construct values[] and isnull[] arrays for the partition key - * of a tuple. 
- * - * pd Partition dispatch object of the partitioned table - * slot Heap tuple from which to extract partition key - * estate executor state for evaluating any partition key - * expressions (must be non-NULL) - * values Array of partition key Datums (output area) - * isnull Array of is-null indicators (output area) - * - * the ecxt_scantuple slot of estate's per-tuple expr context must point to - * the heap tuple passed in. - * ---------------- - */ -void -FormPartitionKeyDatum(PartitionDispatch pd, - TupleTableSlot *slot, - EState *estate, - Datum *values, - bool *isnull) -{// #lizard forgives - ListCell *partexpr_item; - int i; - - if (pd->key->partexprs != NIL && pd->keystate == NIL) - { - /* Check caller has set up context correctly */ - Assert(estate != NULL && - GetPerTupleExprContext(estate)->ecxt_scantuple == slot); - - /* First time through, set up expression evaluation state */ - pd->keystate = ExecPrepareExprList(pd->key->partexprs, estate); - } - - partexpr_item = list_head(pd->keystate); - for (i = 0; i < pd->key->partnatts; i++) - { - AttrNumber keycol = pd->key->partattrs[i]; - Datum datum; - bool isNull; - - if (keycol != 0) - { - /* Plain column; get the value directly from the heap tuple */ - datum = slot_getattr(slot, keycol, &isNull); - } - else - { - /* Expression; need to evaluate it */ - if (partexpr_item == NULL) - elog(ERROR, "wrong number of partition key expressions"); - datum = ExecEvalExprSwitchContext((ExprState *) lfirst(partexpr_item), - GetPerTupleExprContext(estate), - &isNull); - partexpr_item = lnext(partexpr_item); - } - values[i] = datum; - isnull[i] = isNull; - } - - if (partexpr_item != NULL) - elog(ERROR, "wrong number of partition key expressions"); -} - /* * get_partition_for_tuple - * Finds a leaf partition for tuple contained in *slot + * Finds partition of relation which accepts the partition key specified + * in values and isnull * - * Returned value is the sequence number of the leaf partition thus found, - * or -1 if no leaf partition is found for the tuple. *failed_at is set - * to the OID of the partitioned table whose partition was not found in - * the latter case. + * Return value is index of the partition (>= 0 and < partdesc->nparts) if one + * found or -1 if none found. */ int -get_partition_for_tuple(PartitionDispatch *pd, - TupleTableSlot *slot, - EState *estate, - PartitionDispatchData **failed_at, - TupleTableSlot **failed_slot) -{// #lizard forgives - PartitionDispatch parent; - Datum values[PARTITION_MAX_KEYS]; - bool isnull[PARTITION_MAX_KEYS]; - int result; - ExprContext *ecxt = GetPerTupleExprContext(estate); - TupleTableSlot *ecxt_scantuple_old = ecxt->ecxt_scantuple; - - /* start with the root partitioned table */ - parent = pd[0]; - while (true) - { - PartitionKey key = parent->key; - PartitionDesc partdesc = parent->partdesc; - TupleTableSlot *myslot = parent->tupslot; - TupleConversionMap *map = parent->tupmap; - int cur_index = -1; - - if (myslot != NULL && map != NULL) +get_partition_for_tuple(Relation relation, Datum *values, bool *isnull) { - HeapTuple tuple = ExecFetchSlotTuple(slot); - - ExecClearTuple(myslot); - tuple = do_convert_tuple(tuple, map, NULL); - ExecStoreTuple(tuple, myslot, InvalidBuffer, true); - slot = myslot; - } - - /* Quick exit */ - if (partdesc->nparts == 0) - { - *failed_at = parent; - *failed_slot = slot; - result = -1; - goto error_exit; - } - - /* - * Extract partition key from tuple. 
Expression evaluation machinery - * that FormPartitionKeyDatum() invokes expects ecxt_scantuple to - * point to the correct tuple slot. The slot might have changed from - * what was used for the parent table if the table of the current - * partitioning level has different tuple descriptor from the parent. - * So update ecxt_scantuple accordingly. - */ - ecxt->ecxt_scantuple = slot; - FormPartitionKeyDatum(parent, slot, estate, values, isnull); + int bound_offset; + int part_index = -1; + PartitionKey key = RelationGetPartitionKey(relation); + PartitionDesc partdesc = RelationGetPartitionDesc(relation); /* Route as appropriate based on partitioning strategy. */ switch (key->strategy) @@ -2782,32 +2525,29 @@ get_partition_for_tuple(PartitionDispatch *pd, { PartitionBoundInfo boundinfo = partdesc->boundinfo; int greatest_modulus = get_greatest_modulus(boundinfo); - uint64 rowHash = compute_hash_value(key, values, - isnull); + uint64 rowHash = compute_hash_value(key, values, isnull); - cur_index = boundinfo->indexes[rowHash % greatest_modulus]; + part_index = boundinfo->indexes[rowHash % greatest_modulus]; } break; case PARTITION_STRATEGY_LIST: - if (isnull[0]) { if (partition_bound_accepts_nulls(partdesc->boundinfo)) - cur_index = partdesc->boundinfo->null_index; + part_index = partdesc->boundinfo->null_index; } else { bool equal = false; - int cur_offset; - cur_offset = partition_bound_bsearch(key, + bound_offset = partition_bound_bsearch(key, partdesc->boundinfo, values, false, &equal); - if (cur_offset >= 0 && equal) - cur_index = partdesc->boundinfo->indexes[cur_offset]; + if (bound_offset >= 0 && equal) + part_index = partdesc->boundinfo->indexes[bound_offset]; } break; @@ -2815,7 +2555,6 @@ get_partition_for_tuple(PartitionDispatch *pd, { bool equal = false, range_partkey_has_null = false; - int cur_offset; int i; /* @@ -2829,36 +2568,26 @@ get_partition_for_tuple(PartitionDispatch *pd, partition_bound_has_default(partdesc->boundinfo)) { range_partkey_has_null = true; - break; - } - else if (isnull[i]) - { - *failed_at = parent; - *failed_slot = slot; - result = -1; - goto error_exit; + part_index = partdesc->boundinfo->default_index; } } - /* - * No need to search for partition, as the null key will - * be routed to the default partition. - */ - if (range_partkey_has_null) - break; - - cur_offset = partition_bound_bsearch(key, + if (!range_partkey_has_null) + { + bound_offset = partition_bound_bsearch(key, partdesc->boundinfo, values, false, &equal); /* - * The offset returned is such that the bound at - * cur_offset is less than or equal to the tuple value, so - * the bound at offset+1 is the upper bound. + * The bound at bound_offset is less than or equal to the + * tuple value, so the bound at offset+1 is the upper + * bound of the partition we're looking for, if there + * actually exists one. */ - cur_index = partdesc->boundinfo->indexes[cur_offset + 1]; + part_index = partdesc->boundinfo->indexes[bound_offset + 1]; + } } break; @@ -2868,37 +2597,13 @@ get_partition_for_tuple(PartitionDispatch *pd, } /* - * cur_index < 0 means we failed to find a partition of this parent. + * part_index < 0 means we failed to find a partition of this parent. * Use the default partition, if there is one. */ - if (cur_index < 0) - cur_index = partdesc->boundinfo->default_index; + if (part_index < 0) + part_index = partdesc->boundinfo->default_index; - /* - * If cur_index is still less than 0 at this point, there's no - * partition for this tuple. 
Otherwise, we either found the leaf - * partition, or a child partitioned table through which we have to - * route the tuple. - */ - if (cur_index < 0) - { - result = -1; - *failed_at = parent; - *failed_slot = slot; - break; - } - else if (parent->indexes[cur_index] >= 0) - { - result = parent->indexes[cur_index]; - break; - } - else - parent = pd[-parent->indexes[cur_index]]; - } - -error_exit: - ecxt->ecxt_scantuple = ecxt_scantuple_old; - return result; + return part_index; } /* diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c index 9e5aec9f..e376f863 100644 --- a/src/backend/commands/copy.c +++ b/src/backend/commands/copy.c @@ -38,6 +38,7 @@ #include "commands/copy.h" #include "commands/defrem.h" #include "commands/trigger.h" +#include "executor/execPartition.h" #include "executor/executor.h" #include "libpq/libpq.h" #include "libpq/pqformat.h" diff --git a/src/backend/executor/Makefile b/src/backend/executor/Makefile index fef60fb4..b7d58365 100644 --- a/src/backend/executor/Makefile +++ b/src/backend/executor/Makefile @@ -14,7 +14,7 @@ include $(top_builddir)/src/Makefile.global OBJS = execAmi.o execCurrent.o execExpr.o execExprInterp.o \ execGrouping.o execIndexing.o execJunk.o \ - execMain.o execParallel.o execProcnode.o \ + execMain.o execParallel.o execPartition.o execProcnode.o \ execReplication.o execScan.o execSRF.o execTuples.o \ execUtils.o functions.o instrument.o nodeAppend.o nodeAgg.o \ nodeBitmapAnd.o nodeBitmapOr.o \ diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c index 776c9d41..a4978497 100644 --- a/src/backend/executor/execMain.c +++ b/src/backend/executor/execMain.c @@ -44,7 +44,6 @@ #include "access/xact.h" #include "catalog/namespace.h" #include "catalog/partition.h" -#include "catalog/pg_inherits_fn.h" #include "catalog/pg_publication.h" #ifdef _MLS_ #include "catalog/pg_class.h" @@ -133,14 +132,8 @@ static char *ExecBuildSlotValueDescription(Oid reloid, TupleDesc tupdesc, Bitmapset *modifiedCols, int maxfieldlen); -static char *ExecBuildSlotPartitionKeyDescription(Relation rel, - Datum *values, - bool *isnull, - int maxfieldlen); static void EvalPlanQualStart(EPQState *epqstate, EState *parentestate, Plan *planTree); -static void ExecPartitionCheck(ResultRelInfo *resultRelInfo, - TupleTableSlot *slot, EState *estate); #ifdef _MLS_ static int ExecCheckRTERelkindextPerms(RangeTblEntry *rte); #endif @@ -2215,8 +2208,10 @@ ExecRelCheck(ResultRelInfo *resultRelInfo, /* * ExecPartitionCheck --- check that tuple meets the partition constraint. + * + * Exported in executor.h for outside use. 
*/ -static void +void ExecPartitionCheck(ResultRelInfo *resultRelInfo, TupleTableSlot *slot, EState *estate) { @@ -3657,256 +3652,6 @@ EvalPlanQualEnd(EPQState *epqstate) epqstate->origslot = NULL; } -/* - * ExecSetupPartitionTupleRouting - set up information needed during - * tuple routing for partitioned tables - * - * Output arguments: - * 'pd' receives an array of PartitionDispatch objects with one entry for - * every partitioned table in the partition tree - * 'partitions' receives an array of ResultRelInfo objects with one entry for - * every leaf partition in the partition tree - * 'tup_conv_maps' receives an array of TupleConversionMap objects with one - * entry for every leaf partition (required to convert input tuple based - * on the root table's rowtype to a leaf partition's rowtype after tuple - * routing is done) - * 'partition_tuple_slot' receives a standalone TupleTableSlot to be used - * to manipulate any given leaf partition's rowtype after that partition - * is chosen by tuple-routing. - * 'num_parted' receives the number of partitioned tables in the partition - * tree (= the number of entries in the 'pd' output array) - * 'num_partitions' receives the number of leaf partitions in the partition - * tree (= the number of entries in the 'partitions' and 'tup_conv_maps' - * output arrays - * - * Note that all the relations in the partition tree are locked using the - * RowExclusiveLock mode upon return from this function. - */ -void -ExecSetupPartitionTupleRouting(Relation rel, - Index resultRTindex, - PartitionDispatch **pd, - ResultRelInfo **partitions, - TupleConversionMap ***tup_conv_maps, - TupleTableSlot **partition_tuple_slot, - int *num_parted, int *num_partitions) -{ - TupleDesc tupDesc = RelationGetDescr(rel); - List *leaf_parts; - ListCell *cell; - int i; - ResultRelInfo *leaf_part_rri; - - /* - * Get the information about the partition tree after locking all the - * partitions. - */ - (void) find_all_inheritors(RelationGetRelid(rel), RowExclusiveLock, NULL); - *pd = RelationGetPartitionDispatchInfo(rel, num_parted, &leaf_parts); - *num_partitions = list_length(leaf_parts); - *partitions = (ResultRelInfo *) palloc0(*num_partitions * - sizeof(ResultRelInfo)); - *tup_conv_maps = (TupleConversionMap **) palloc0(*num_partitions * - sizeof(TupleConversionMap *)); - - /* - * Initialize an empty slot that will be used to manipulate tuples of any - * given partition's rowtype. It is attached to the caller-specified node - * (such as ModifyTableState) and released when the node finishes - * processing. - */ - *partition_tuple_slot = MakeTupleTableSlot(); - - leaf_part_rri = *partitions; - i = 0; - foreach(cell, leaf_parts) - { - Relation partrel; - TupleDesc part_tupdesc; - - /* - * We locked all the partitions above including the leaf partitions. - * Note that each of the relations in *partitions are eventually - * closed by the caller. - */ - partrel = heap_open(lfirst_oid(cell), NoLock); - part_tupdesc = RelationGetDescr(partrel); - - /* - * Verify result relation is a valid target for the current operation. - */ - CheckValidResultRel(partrel, CMD_INSERT); - - /* - * Save a tuple conversion map to convert a tuple routed to this - * partition from the parent's type to the partition's. 
- */ - (*tup_conv_maps)[i] = convert_tuples_by_name(tupDesc, part_tupdesc, - gettext_noop("could not convert row type")); - - InitResultRelInfo(leaf_part_rri, - partrel, - resultRTindex, - rel, - 0); - - /* - * Open partition indices (remember we do not support ON CONFLICT in - * case of partitioned tables, so we do not need support information - * for speculative insertion) - */ - if (leaf_part_rri->ri_RelationDesc->rd_rel->relhasindex && - leaf_part_rri->ri_IndexRelationDescs == NULL) - ExecOpenIndices(leaf_part_rri, false); - - leaf_part_rri++; - i++; - } -} - -/* - * ExecFindPartition -- Find a leaf partition in the partition tree rooted - * at parent, for the heap tuple contained in *slot - * - * estate must be non-NULL; we'll need it to compute any expressions in the - * partition key(s) - * - * If no leaf partition is found, this routine errors out with the appropriate - * error message, else it returns the leaf partition sequence number returned - * by get_partition_for_tuple() unchanged. - */ -int -ExecFindPartition(ResultRelInfo *resultRelInfo, PartitionDispatch *pd, - TupleTableSlot *slot, EState *estate) -{ - int result; - PartitionDispatchData *failed_at; - TupleTableSlot *failed_slot; - - /* - * First check the root table's partition constraint, if any. No point in - * routing the tuple if it doesn't belong in the root table itself. - */ - if (resultRelInfo->ri_PartitionCheck) - ExecPartitionCheck(resultRelInfo, slot, estate); - - result = get_partition_for_tuple(pd, slot, estate, - &failed_at, &failed_slot); - if (result < 0) - { - Relation failed_rel; - Datum key_values[PARTITION_MAX_KEYS]; - bool key_isnull[PARTITION_MAX_KEYS]; - char *val_desc; - ExprContext *ecxt = GetPerTupleExprContext(estate); - - failed_rel = failed_at->reldesc; - ecxt->ecxt_scantuple = failed_slot; - FormPartitionKeyDatum(failed_at, failed_slot, estate, - key_values, key_isnull); - val_desc = ExecBuildSlotPartitionKeyDescription(failed_rel, - key_values, - key_isnull, - 64); - Assert(OidIsValid(RelationGetRelid(failed_rel))); - ereport(ERROR, - (errcode(ERRCODE_CHECK_VIOLATION), - errmsg("no partition of relation \"%s\" found for row", - RelationGetRelationName(failed_rel)), - val_desc ? errdetail("Partition key of the failing row contains %s.", val_desc) : 0)); - } - - return result; -} - -/* - * BuildSlotPartitionKeyDescription - * - * This works very much like BuildIndexValueDescription() and is currently - * used for building error messages when ExecFindPartition() fails to find - * partition for a row. - */ -static char * -ExecBuildSlotPartitionKeyDescription(Relation rel, - Datum *values, - bool *isnull, - int maxfieldlen) -{// #lizard forgives - StringInfoData buf; - PartitionKey key = RelationGetPartitionKey(rel); - int partnatts = get_partition_natts(key); - int i; - Oid relid = RelationGetRelid(rel); - AclResult aclresult; - - if (check_enable_rls(relid, InvalidOid, true) == RLS_ENABLED) - return NULL; - - /* If the user has table-level access, just go build the description. */ - aclresult = pg_class_aclcheck(relid, GetUserId(), ACL_SELECT); - if (aclresult != ACLCHECK_OK) - { - /* - * Step through the columns of the partition key and make sure the - * user has SELECT rights on all of them. - */ - for (i = 0; i < partnatts; i++) - { - AttrNumber attnum = get_partition_col_attnum(key, i); - - /* - * If this partition key column is an expression, we return no - * detail rather than try to figure out what column(s) the - * expression includes and if the user has SELECT rights on them. 
- */ - if (attnum == InvalidAttrNumber || - pg_attribute_aclcheck(relid, attnum, GetUserId(), - ACL_SELECT) != ACLCHECK_OK) - return NULL; - } - } - - initStringInfo(&buf); - appendStringInfo(&buf, "(%s) = (", - pg_get_partkeydef_columns(relid, true)); - - for (i = 0; i < partnatts; i++) - { - char *val; - int vallen; - - if (isnull[i]) - val = "null"; - else - { - Oid foutoid; - bool typisvarlena; - - getTypeOutputInfo(get_partition_col_typid(key, i), - &foutoid, &typisvarlena); - val = OidOutputFunctionCall(foutoid, values[i]); - } - - if (i > 0) - appendStringInfoString(&buf, ", "); - - /* truncate if needed */ - vallen = strlen(val); - if (vallen <= maxfieldlen) - appendStringInfoString(&buf, val); - else - { - vallen = pg_mbcliplen(val, vallen, maxfieldlen); - appendBinaryStringInfo(&buf, val, vallen); - appendStringInfoString(&buf, "..."); - } - } - - appendStringInfoChar(&buf, ')'); - - return buf.data; -} - #ifdef _MLS_ /* * cls user could access cls system tables and original system tables, and no more access limit diff --git a/src/backend/executor/execPartition.c b/src/backend/executor/execPartition.c new file mode 100644 index 00000000..d275cefe --- /dev/null +++ b/src/backend/executor/execPartition.c @@ -0,0 +1,560 @@ +/*------------------------------------------------------------------------- + * + * execPartition.c + * Support routines for partitioning. + * + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/executor/execPartition.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "catalog/pg_inherits_fn.h" +#include "executor/execPartition.h" +#include "executor/executor.h" +#include "mb/pg_wchar.h" +#include "miscadmin.h" +#include "utils/lsyscache.h" +#include "utils/rls.h" +#include "utils/ruleutils.h" + +static PartitionDispatch *RelationGetPartitionDispatchInfo(Relation rel, + int *num_parted, List **leaf_part_oids); +static void get_partition_dispatch_recurse(Relation rel, Relation parent, + List **pds, List **leaf_part_oids); +static void FormPartitionKeyDatum(PartitionDispatch pd, + TupleTableSlot *slot, + EState *estate, + Datum *values, + bool *isnull); +static char *ExecBuildSlotPartitionKeyDescription(Relation rel, + Datum *values, + bool *isnull, + int maxfieldlen); + +/* + * ExecSetupPartitionTupleRouting - set up information needed during + * tuple routing for partitioned tables + * + * Output arguments: + * 'pd' receives an array of PartitionDispatch objects with one entry for + * every partitioned table in the partition tree + * 'partitions' receives an array of ResultRelInfo* objects with one entry for + * every leaf partition in the partition tree + * 'tup_conv_maps' receives an array of TupleConversionMap objects with one + * entry for every leaf partition (required to convert input tuple based + * on the root table's rowtype to a leaf partition's rowtype after tuple + * routing is done) + * 'partition_tuple_slot' receives a standalone TupleTableSlot to be used + * to manipulate any given leaf partition's rowtype after that partition + * is chosen by tuple-routing. 
+ * 'num_parted' receives the number of partitioned tables in the partition + * tree (= the number of entries in the 'pd' output array) + * 'num_partitions' receives the number of leaf partitions in the partition + * tree (= the number of entries in the 'partitions' and 'tup_conv_maps' + * output arrays + * + * Note that all the relations in the partition tree are locked using the + * RowExclusiveLock mode upon return from this function. + */ +void +ExecSetupPartitionTupleRouting(Relation rel, + Index resultRTindex, + EState *estate, + PartitionDispatch **pd, + ResultRelInfo ***partitions, + TupleConversionMap ***tup_conv_maps, + TupleTableSlot **partition_tuple_slot, + int *num_parted, int *num_partitions) +{ + TupleDesc tupDesc = RelationGetDescr(rel); + List *leaf_parts; + ListCell *cell; + int i; + ResultRelInfo *leaf_part_rri; + + /* + * Get the information about the partition tree after locking all the + * partitions. + */ + (void) find_all_inheritors(RelationGetRelid(rel), RowExclusiveLock, NULL); + *pd = RelationGetPartitionDispatchInfo(rel, num_parted, &leaf_parts); + *num_partitions = list_length(leaf_parts); + *partitions = (ResultRelInfo **) palloc(*num_partitions * + sizeof(ResultRelInfo *)); + *tup_conv_maps = (TupleConversionMap **) palloc0(*num_partitions * + sizeof(TupleConversionMap *)); + + /* + * Initialize an empty slot that will be used to manipulate tuples of any + * given partition's rowtype. It is attached to the caller-specified node + * (such as ModifyTableState) and released when the node finishes + * processing. + */ + *partition_tuple_slot = MakeTupleTableSlot(); + + leaf_part_rri = (ResultRelInfo *) palloc0(*num_partitions * + sizeof(ResultRelInfo)); + i = 0; + foreach(cell, leaf_parts) + { + Relation partrel; + TupleDesc part_tupdesc; + + /* + * We locked all the partitions above including the leaf partitions. + * Note that each of the relations in *partitions are eventually + * closed by the caller. + */ + partrel = heap_open(lfirst_oid(cell), NoLock); + part_tupdesc = RelationGetDescr(partrel); + + /* + * Save a tuple conversion map to convert a tuple routed to this + * partition from the parent's type to the partition's. + */ + (*tup_conv_maps)[i] = convert_tuples_by_name(tupDesc, part_tupdesc, + gettext_noop("could not convert row type")); + + InitResultRelInfo(leaf_part_rri, + partrel, + resultRTindex, + rel, + estate->es_instrument); + + /* + * Verify result relation is a valid target for INSERT. + */ + CheckValidResultRel(leaf_part_rri, CMD_INSERT); + + /* + * Open partition indices (remember we do not support ON CONFLICT in + * case of partitioned tables, so we do not need support information + * for speculative insertion) + */ + if (leaf_part_rri->ri_RelationDesc->rd_rel->relhasindex && + leaf_part_rri->ri_IndexRelationDescs == NULL) + ExecOpenIndices(leaf_part_rri, false); + + estate->es_leaf_result_relations = + lappend(estate->es_leaf_result_relations, leaf_part_rri); + + (*partitions)[i] = leaf_part_rri++; + i++; + } +} + +/* + * ExecFindPartition -- Find a leaf partition in the partition tree rooted + * at parent, for the heap tuple contained in *slot + * + * estate must be non-NULL; we'll need it to compute any expressions in the + * partition key(s) + * + * If no leaf partition is found, this routine errors out with the appropriate + * error message, else it returns the leaf partition sequence number + * as an index into the array of (ResultRelInfos of) all leaf partitions in + * the partition tree. 
+ */ +int +ExecFindPartition(ResultRelInfo *resultRelInfo, PartitionDispatch *pd, + TupleTableSlot *slot, EState *estate) +{ + int result; + Datum values[PARTITION_MAX_KEYS]; + bool isnull[PARTITION_MAX_KEYS]; + Relation rel; + PartitionDispatch parent; + ExprContext *ecxt = GetPerTupleExprContext(estate); + TupleTableSlot *ecxt_scantuple_old = ecxt->ecxt_scantuple; + + /* + * First check the root table's partition constraint, if any. No point in + * routing the tuple if it doesn't belong in the root table itself. + */ + if (resultRelInfo->ri_PartitionCheck) + ExecPartitionCheck(resultRelInfo, slot, estate); + + /* start with the root partitioned table */ + parent = pd[0]; + while (true) + { + PartitionDesc partdesc; + TupleTableSlot *myslot = parent->tupslot; + TupleConversionMap *map = parent->tupmap; + int cur_index = -1; + + rel = parent->reldesc; + partdesc = RelationGetPartitionDesc(rel); + + /* + * Convert the tuple to this parent's layout so that we can do certain + * things we do below. + */ + if (myslot != NULL && map != NULL) + { + HeapTuple tuple = ExecFetchSlotTuple(slot); + + ExecClearTuple(myslot); + tuple = do_convert_tuple(tuple, map); + ExecStoreTuple(tuple, myslot, InvalidBuffer, true); + slot = myslot; + } + + /* Quick exit */ + if (partdesc->nparts == 0) + { + result = -1; + break; + } + + /* + * Extract partition key from tuple. Expression evaluation machinery + * that FormPartitionKeyDatum() invokes expects ecxt_scantuple to + * point to the correct tuple slot. The slot might have changed from + * what was used for the parent table if the table of the current + * partitioning level has different tuple descriptor from the parent. + * So update ecxt_scantuple accordingly. + */ + ecxt->ecxt_scantuple = slot; + FormPartitionKeyDatum(parent, slot, estate, values, isnull); + cur_index = get_partition_for_tuple(rel, values, isnull); + + /* + * cur_index < 0 means we failed to find a partition of this parent. + * cur_index >= 0 means we either found the leaf partition, or the + * next parent to find a partition of. + */ + if (cur_index < 0) + { + result = -1; + break; + } + else if (parent->indexes[cur_index] >= 0) + { + result = parent->indexes[cur_index]; + break; + } + else + parent = pd[-parent->indexes[cur_index]]; + } + + /* A partition was not found. */ + if (result < 0) + { + char *val_desc; + + val_desc = ExecBuildSlotPartitionKeyDescription(rel, + values, isnull, 64); + Assert(OidIsValid(RelationGetRelid(rel))); + ereport(ERROR, + (errcode(ERRCODE_CHECK_VIOLATION), + errmsg("no partition of relation \"%s\" found for row", + RelationGetRelationName(rel)), + val_desc ? errdetail("Partition key of the failing row contains %s.", val_desc) : 0)); + } + + ecxt->ecxt_scantuple = ecxt_scantuple_old; + return result; +} + +/* + * RelationGetPartitionDispatchInfo + * Returns information necessary to route tuples down a partition tree + * + * The number of elements in the returned array (that is, the number of + * PartitionDispatch objects for the partitioned tables in the partition tree) + * is returned in *num_parted and a list of the OIDs of all the leaf + * partitions of rel is returned in *leaf_part_oids. + * + * All the relations in the partition tree (including 'rel') must have been + * locked (using at least the AccessShareLock) by the caller. 
+ */ +static PartitionDispatch * +RelationGetPartitionDispatchInfo(Relation rel, + int *num_parted, List **leaf_part_oids) +{ + List *pdlist = NIL; + PartitionDispatchData **pd; + ListCell *lc; + int i; + + Assert(rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE); + + *num_parted = 0; + *leaf_part_oids = NIL; + + get_partition_dispatch_recurse(rel, NULL, &pdlist, leaf_part_oids); + *num_parted = list_length(pdlist); + pd = (PartitionDispatchData **) palloc(*num_parted * + sizeof(PartitionDispatchData *)); + i = 0; + foreach(lc, pdlist) + { + pd[i++] = lfirst(lc); + } + + return pd; +} + +/* + * get_partition_dispatch_recurse + * Recursively expand partition tree rooted at rel + * + * As the partition tree is expanded in a depth-first manner, we maintain two + * global lists: of PartitionDispatch objects corresponding to partitioned + * tables in *pds and of the leaf partition OIDs in *leaf_part_oids. + * + * Note that the order of OIDs of leaf partitions in leaf_part_oids matches + * the order in which the planner's expand_partitioned_rtentry() processes + * them. It's not necessarily the case that the offsets match up exactly, + * because constraint exclusion might prune away some partitions on the + * planner side, whereas we'll always have the complete list; but unpruned + * partitions will appear in the same order in the plan as they are returned + * here. + */ +static void +get_partition_dispatch_recurse(Relation rel, Relation parent, + List **pds, List **leaf_part_oids) +{ + TupleDesc tupdesc = RelationGetDescr(rel); + PartitionDesc partdesc = RelationGetPartitionDesc(rel); + PartitionKey partkey = RelationGetPartitionKey(rel); + PartitionDispatch pd; + int i; + + check_stack_depth(); + + /* Build a PartitionDispatch for this table and add it to *pds. */ + pd = (PartitionDispatch) palloc(sizeof(PartitionDispatchData)); + *pds = lappend(*pds, pd); + pd->reldesc = rel; + pd->key = partkey; + pd->keystate = NIL; + pd->partdesc = partdesc; + if (parent != NULL) + { + /* + * For every partitioned table other than the root, we must store a + * tuple table slot initialized with its tuple descriptor and a tuple + * conversion map to convert a tuple from its parent's rowtype to its + * own. That is to make sure that we are looking at the correct row + * using the correct tuple descriptor when computing its partition key + * for tuple routing. + */ + pd->tupslot = MakeSingleTupleTableSlot(tupdesc); + pd->tupmap = convert_tuples_by_name(RelationGetDescr(parent), + tupdesc, + gettext_noop("could not convert row type")); + } + else + { + /* Not required for the root partitioned table */ + pd->tupslot = NULL; + pd->tupmap = NULL; + } + + /* + * Go look at each partition of this table. If it's a leaf partition, + * simply add its OID to *leaf_part_oids. If it's a partitioned table, + * recursively call get_partition_dispatch_recurse(), so that its + * partitions are processed as well and a corresponding PartitionDispatch + * object gets added to *pds. + * + * About the values in pd->indexes: for a leaf partition, it contains the + * leaf partition's position in the global list *leaf_part_oids minus 1, + * whereas for a partitioned table partition, it contains the partition's + * position in the global list *pds multiplied by -1. The latter is + * multiplied by -1 to distinguish partitioned tables from leaf partitions + * when going through the values in pd->indexes. So, for example, when + * using it during tuple-routing, encountering a value >= 0 means we found + * a leaf partition. 
It is immediately returned as the index in the array + * of ResultRelInfos of all the leaf partitions, using which we insert the + * tuple into that leaf partition. A negative value means we found a + * partitioned table. The value multiplied by -1 is returned as the index + * in the array of PartitionDispatch objects of all partitioned tables in + * the tree. This value is used to continue the search in the next level + * of the partition tree. + */ + pd->indexes = (int *) palloc(partdesc->nparts * sizeof(int)); + for (i = 0; i < partdesc->nparts; i++) + { + Oid partrelid = partdesc->oids[i]; + + if (get_rel_relkind(partrelid) != RELKIND_PARTITIONED_TABLE) + { + *leaf_part_oids = lappend_oid(*leaf_part_oids, partrelid); + pd->indexes[i] = list_length(*leaf_part_oids) - 1; + } + else + { + /* + * We assume all tables in the partition tree were already locked + * by the caller. + */ + Relation partrel = heap_open(partrelid, NoLock); + + pd->indexes[i] = -list_length(*pds); + get_partition_dispatch_recurse(partrel, rel, pds, leaf_part_oids); + } + } +} + +/* ---------------- + * FormPartitionKeyDatum + * Construct values[] and isnull[] arrays for the partition key + * of a tuple. + * + * pd Partition dispatch object of the partitioned table + * slot Heap tuple from which to extract partition key + * estate executor state for evaluating any partition key + * expressions (must be non-NULL) + * values Array of partition key Datums (output area) + * isnull Array of is-null indicators (output area) + * + * the ecxt_scantuple slot of estate's per-tuple expr context must point to + * the heap tuple passed in. + * ---------------- + */ +static void +FormPartitionKeyDatum(PartitionDispatch pd, + TupleTableSlot *slot, + EState *estate, + Datum *values, + bool *isnull) +{ + ListCell *partexpr_item; + int i; + + if (pd->key->partexprs != NIL && pd->keystate == NIL) + { + /* Check caller has set up context correctly */ + Assert(estate != NULL && + GetPerTupleExprContext(estate)->ecxt_scantuple == slot); + + /* First time through, set up expression evaluation state */ + pd->keystate = ExecPrepareExprList(pd->key->partexprs, estate); + } + + partexpr_item = list_head(pd->keystate); + for (i = 0; i < pd->key->partnatts; i++) + { + AttrNumber keycol = pd->key->partattrs[i]; + Datum datum; + bool isNull; + + if (keycol != 0) + { + /* Plain column; get the value directly from the heap tuple */ + datum = slot_getattr(slot, keycol, &isNull); + } + else + { + /* Expression; need to evaluate it */ + if (partexpr_item == NULL) + elog(ERROR, "wrong number of partition key expressions"); + datum = ExecEvalExprSwitchContext((ExprState *) lfirst(partexpr_item), + GetPerTupleExprContext(estate), + &isNull); + partexpr_item = lnext(partexpr_item); + } + values[i] = datum; + isnull[i] = isNull; + } + + if (partexpr_item != NULL) + elog(ERROR, "wrong number of partition key expressions"); +} + +/* + * BuildSlotPartitionKeyDescription + * + * This works very much like BuildIndexValueDescription() and is currently + * used for building error messages when ExecFindPartition() fails to find + * partition for a row. 
+ */ +static char * +ExecBuildSlotPartitionKeyDescription(Relation rel, + Datum *values, + bool *isnull, + int maxfieldlen) +{ + StringInfoData buf; + PartitionKey key = RelationGetPartitionKey(rel); + int partnatts = get_partition_natts(key); + int i; + Oid relid = RelationGetRelid(rel); + AclResult aclresult; + + if (check_enable_rls(relid, InvalidOid, true) == RLS_ENABLED) + return NULL; + + /* If the user has table-level access, just go build the description. */ + aclresult = pg_class_aclcheck(relid, GetUserId(), ACL_SELECT); + if (aclresult != ACLCHECK_OK) + { + /* + * Step through the columns of the partition key and make sure the + * user has SELECT rights on all of them. + */ + for (i = 0; i < partnatts; i++) + { + AttrNumber attnum = get_partition_col_attnum(key, i); + + /* + * If this partition key column is an expression, we return no + * detail rather than try to figure out what column(s) the + * expression includes and if the user has SELECT rights on them. + */ + if (attnum == InvalidAttrNumber || + pg_attribute_aclcheck(relid, attnum, GetUserId(), + ACL_SELECT) != ACLCHECK_OK) + return NULL; + } + } + + initStringInfo(&buf); + appendStringInfo(&buf, "(%s) = (", + pg_get_partkeydef_columns(relid, true)); + + for (i = 0; i < partnatts; i++) + { + char *val; + int vallen; + + if (isnull[i]) + val = "null"; + else + { + Oid foutoid; + bool typisvarlena; + + getTypeOutputInfo(get_partition_col_typid(key, i), + &foutoid, &typisvarlena); + val = OidOutputFunctionCall(foutoid, values[i]); + } + + if (i > 0) + appendStringInfoString(&buf, ", "); + + /* truncate if needed */ + vallen = strlen(val); + if (vallen <= maxfieldlen) + appendStringInfoString(&buf, val); + else + { + vallen = pg_mbcliplen(val, vallen, maxfieldlen); + appendBinaryStringInfo(&buf, val, vallen); + appendStringInfoString(&buf, "..."); + } + } + + appendStringInfoChar(&buf, ')'); + + return buf.data; +} diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c index 4a03adb3..660bfd4b 100644 --- a/src/backend/executor/nodeModifyTable.c +++ b/src/backend/executor/nodeModifyTable.c @@ -41,6 +41,7 @@ #include "access/htup_details.h" #include "access/xact.h" #include "commands/trigger.h" +#include "executor/execPartition.h" #include "executor/executor.h" #include "executor/nodeModifyTable.h" #include "foreign/fdwapi.h" diff --git a/src/include/catalog/partition.h b/src/include/catalog/partition.h index 8acc01a8..295e9d22 100644 --- a/src/include/catalog/partition.h +++ b/src/include/catalog/partition.h @@ -42,37 +42,6 @@ typedef struct PartitionDescData typedef struct PartitionDescData *PartitionDesc; -/*----------------------- - * PartitionDispatch - information about one partitioned table in a partition - * hierarchy required to route a tuple to one of its partitions - * - * reldesc Relation descriptor of the table - * key Partition key information of the table - * keystate Execution state required for expressions in the partition key - * partdesc Partition descriptor of the table - * tupslot A standalone TupleTableSlot initialized with this table's tuple - * descriptor - * tupmap TupleConversionMap to convert from the parent's rowtype to - * this table's rowtype (when extracting the partition key of a - * tuple just before routing it through this table) - * indexes Array with partdesc->nparts members (for details on what - * individual members represent, see how they are set in - * RelationGetPartitionDispatchInfo()) - *----------------------- - */ -typedef struct 
PartitionDispatchData -{ - Relation reldesc; - PartitionKey key; - List *keystate; /* list of ExprState */ - PartitionDesc partdesc; - TupleTableSlot *tupslot; - TupleConversionMap *tupmap; - int *indexes; -} PartitionDispatchData; - -typedef struct PartitionDispatchData *PartitionDispatch; - extern void RelationBuildPartitionDesc(Relation relation); extern bool partition_bounds_equal(int partnatts, int16 *parttyplen, bool *parttypbyval, PartitionBoundInfo b1, @@ -91,19 +60,6 @@ extern List *map_partition_varattnos(List *expr, int target_varno, extern List *RelationGetPartitionQual(Relation rel); extern Expr *get_partition_qual_relid(Oid relid); -/* For tuple routing */ -extern PartitionDispatch *RelationGetPartitionDispatchInfo(Relation rel, - int *num_parted, List **leaf_part_oids); -extern void FormPartitionKeyDatum(PartitionDispatch pd, - TupleTableSlot *slot, - EState *estate, - Datum *values, - bool *isnull); -extern int get_partition_for_tuple(PartitionDispatch *pd, - TupleTableSlot *slot, - EState *estate, - PartitionDispatchData **failed_at, - TupleTableSlot **failed_slot); extern Oid get_default_oid_from_partdesc(PartitionDesc partdesc); extern Oid get_default_partition_oid(Oid parentId); extern void update_default_partition_oid(Oid parentId, Oid defaultPartId); @@ -111,4 +67,8 @@ extern void check_default_allows_bound(Relation parent, Relation defaultRel, PartitionBoundSpec *new_spec); extern List *get_proposed_default_constraint(List *new_part_constaints); +/* For tuple routing */ +extern int get_partition_for_tuple(Relation relation, Datum *values, + bool *isnull); + #endif /* PARTITION_H */ diff --git a/src/include/executor/execPartition.h b/src/include/executor/execPartition.h new file mode 100644 index 00000000..64e5aab4 --- /dev/null +++ b/src/include/executor/execPartition.h @@ -0,0 +1,65 @@ +/*-------------------------------------------------------------------- + * execPartition.h + * POSTGRES partitioning executor interface + * + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/include/executor/execPartition.h + *-------------------------------------------------------------------- + */ + +#ifndef EXECPARTITION_H +#define EXECPARTITION_H + +#include "catalog/partition.h" +#include "nodes/execnodes.h" +#include "nodes/parsenodes.h" +#include "nodes/plannodes.h" + +/*----------------------- + * PartitionDispatch - information about one partitioned table in a partition + * hierarchy required to route a tuple to one of its partitions + * + * reldesc Relation descriptor of the table + * key Partition key information of the table + * keystate Execution state required for expressions in the partition key + * partdesc Partition descriptor of the table + * tupslot A standalone TupleTableSlot initialized with this table's tuple + * descriptor + * tupmap TupleConversionMap to convert from the parent's rowtype to + * this table's rowtype (when extracting the partition key of a + * tuple just before routing it through this table) + * indexes Array with partdesc->nparts members (for details on what + * individual members represent, see how they are set in + * get_partition_dispatch_recurse()) + *----------------------- + */ +typedef struct PartitionDispatchData +{ + Relation reldesc; + PartitionKey key; + List *keystate; /* list of ExprState */ + PartitionDesc partdesc; + TupleTableSlot *tupslot; + TupleConversionMap *tupmap; + int *indexes; +} 
PartitionDispatchData; + +typedef struct PartitionDispatchData *PartitionDispatch; + +extern void ExecSetupPartitionTupleRouting(Relation rel, + Index resultRTindex, + EState *estate, + PartitionDispatch **pd, + ResultRelInfo ***partitions, + TupleConversionMap ***tup_conv_maps, + TupleTableSlot **partition_tuple_slot, + int *num_parted, int *num_partitions); +extern int ExecFindPartition(ResultRelInfo *resultRelInfo, + PartitionDispatch *pd, + TupleTableSlot *slot, + EState *estate); + +#endif /* EXECPARTITION_H */ diff --git a/src/include/executor/executor.h b/src/include/executor/executor.h index 7fb94908..4ea9ef52 100644 --- a/src/include/executor/executor.h +++ b/src/include/executor/executor.h @@ -206,6 +206,8 @@ extern void ExecCleanUpTriggerState(EState *estate); extern bool ExecContextForcesOids(PlanState *planstate, bool *hasoids); extern void ExecConstraints(ResultRelInfo *resultRelInfo, TupleTableSlot *slot, EState *estate); +extern void ExecPartitionCheck(ResultRelInfo *resultRelInfo, + TupleTableSlot *slot, EState *estate); extern void ExecWithCheckOptions(WCOKind kind, ResultRelInfo *resultRelInfo, TupleTableSlot *slot, EState *estate); extern LockTupleMode ExecUpdateLockMode(EState *estate, ResultRelInfo *relinfo); @@ -224,17 +226,6 @@ extern void EvalPlanQualSetPlan(EPQState *epqstate, extern void EvalPlanQualSetTuple(EPQState *epqstate, Index rti, HeapTuple tuple); extern HeapTuple EvalPlanQualGetTuple(EPQState *epqstate, Index rti); -extern void ExecSetupPartitionTupleRouting(Relation rel, - Index resultRTindex, - PartitionDispatch **pd, - ResultRelInfo **partitions, - TupleConversionMap ***tup_conv_maps, - TupleTableSlot **partition_tuple_slot, - int *num_parted, int *num_partitions); -extern int ExecFindPartition(ResultRelInfo *resultRelInfo, - PartitionDispatch *pd, - TupleTableSlot *slot, - EState *estate); #define EvalPlanQualSetSlot(epqstate, slot) ((epqstate)->origslot = (slot)) extern void EvalPlanQualFetchRowMarks(EPQState *epqstate); From 80e7f9733058c4ffaee6c9fb9c865a2d135c8b02 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Thu, 25 Jun 2020 20:33:15 +0800 Subject: [PATCH 205/578] Set proargmodes for satisfies_hash_partition. 
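For illustration only (hypothetical example, not part of this patch): with
proargmodes set to "{i,i,i,v}", the last argument of
satisfies_hash_partition() is exposed as VARIADIC "any", so, together with
the variadic handling added in the following patch, callers can supply the
partition-key values as a single array when all key columns share a type.
A call against a hypothetical hash-partitioned parent "t" with two int4 key
columns might look like:

    SELECT satisfies_hash_partition('t'::regclass, 4, 0,
                                    variadic array[1, 2]);
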
http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/include/catalog/pg_proc.h | 2 +- src/test/regress/expected/type_sanity.out | 11 +++++++++++ src/test/regress/sql/type_sanity.sql | 8 ++++++++ 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h index 27c9ef26..51adc65a 100644 --- a/src/include/catalog/pg_proc.h +++ b/src/include/catalog/pg_proc.h @@ -5708,7 +5708,7 @@ DESCR("list files in the log directory"); DATA(insert OID = 3354 ( pg_ls_waldir PGNSP PGUID 12 10 20 0 0 f f f f t t v s 0 0 2249 "" "{25,20,1184}" "{o,o,o}" "{name,size,modification}" _null_ _null_ pg_ls_waldir _null_ _null_ _null_ )); DESCR("list of files in the WAL directory"); /* hash partitioning constraint function */ -DATA(insert OID = 4687 ( satisfies_hash_partition PGNSP PGUID 12 1 0 2276 0 f f f f f f i s 4 0 16 "26 23 23 2276" _null_ _null_ _null_ _null_ _null_ satisfies_hash_partition _null_ _null_ _null_ )); +DATA(insert OID = 4687 ( satisfies_hash_partition PGNSP PGUID 12 1 0 2276 0 f f f f f f i s 4 0 16 "26 23 23 2276" _null_ "{i,i,i,v}" _null_ _null_ _null_ satisfies_hash_partition _null_ _null_ _null_ )); DESCR("hash partition CHECK constraint"); DATA(insert OID = 3410 ( pg_extent_info PGNSP PGUID 12 10 20 0 0 f f f f f t v s 1 0 2249 "2205" "{23,16,23,23,23,23,23,23,23}" "{o,o,o,o,o,o,o,o,o}" "{eid,is_occupied,shardid,freespace_cat,hwm,scan_next,scan_prev,alloc_next,alloc_prev}" _null_ _null_ pg_extent_info_oid _null_ _null_ _null_ )); DESCR("get extent info of a relation"); diff --git a/src/test/regress/expected/type_sanity.out b/src/test/regress/expected/type_sanity.out index 7b200bae..16af46e1 100644 --- a/src/test/regress/expected/type_sanity.out +++ b/src/test/regress/expected/type_sanity.out @@ -129,6 +129,17 @@ WHERE p1.typinput = p2.oid AND NOT -----+---------+-----+--------- (0 rows) +-- Check that all and only those functions with a variadic type have +-- a variadic argument. +SELECT oid::regprocedure, proargmodes, provariadic +FROM pg_proc +WHERE (proargmodes IS NOT NULL AND 'v' = any(proargmodes)) + IS DISTINCT FROM + (provariadic != 0); + oid | proargmodes | provariadic +-----+-------------+------------- +(0 rows) + -- As of 8.0, this check finds refcursor, which is borrowing -- other types' I/O routines SELECT p1.oid, p1.typname, p2.oid, p2.proname diff --git a/src/test/regress/sql/type_sanity.sql b/src/test/regress/sql/type_sanity.sql index 4c658140..4e38f3e7 100644 --- a/src/test/regress/sql/type_sanity.sql +++ b/src/test/regress/sql/type_sanity.sql @@ -104,6 +104,14 @@ WHERE p1.typinput = p2.oid AND NOT p2.proargtypes[1] = 'oid'::regtype AND p2.proargtypes[2] = 'int4'::regtype)); +-- Check that all and only those functions with a variadic type have +-- a variadic argument. 
+SELECT oid::regprocedure, proargmodes, provariadic +FROM pg_proc +WHERE (proargmodes IS NOT NULL AND 'v' = any(proargmodes)) + IS DISTINCT FROM + (provariadic != 0); + -- As of 8.0, this check finds refcursor, which is borrowing -- other types' I/O routines SELECT p1.oid, p1.typname, p2.oid, p2.proname From f37a2b1de4b369a5eccecf26b380bf6aea98ccd7 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Thu, 25 Jun 2020 20:38:25 +0800 Subject: [PATCH 206/578] Fix multiple problems with satisfies_hash_partition.http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/backend/catalog/partition.c | 188 +++++++++++++++++++++--- src/test/regress/expected/hash_part.out | 113 ++++++++++++++ src/test/regress/parallel_schedule | 2 +- src/test/regress/serial_schedule | 1 + src/test/regress/sql/hash_part.sql | 90 ++++++++++++ 5 files changed, 370 insertions(+), 24 deletions(-) create mode 100644 src/test/regress/expected/hash_part.out create mode 100644 src/test/regress/sql/hash_part.sql diff --git a/src/backend/catalog/partition.c b/src/backend/catalog/partition.c index ae0bbfbe..092e925e 100644 --- a/src/backend/catalog/partition.c +++ b/src/backend/catalog/partition.c @@ -40,6 +40,7 @@ #include "optimizer/planmain.h" #include "optimizer/prep.h" #include "optimizer/var.h" +#include "parser/parse_coerce.h" #include "rewrite/rewriteManip.h" #include "storage/lmgr.h" #include "utils/array.h" @@ -3114,9 +3115,11 @@ compute_hash_value(PartitionKey key, Datum *values, bool *isnull) /* * satisfies_hash_partition * - * This is a SQL-callable function for use in hash partition constraints takes - * an already computed hash values of each partition key attribute, and combine - * them into a single hash value by calling hash_combine64. + * This is an SQL-callable function for use in hash partition constraints. + * The first three arguments are the parent table OID, modulus, and remainder. + * The remaining arguments are the value of the partitioning columns (or + * expressions); these are hashed and the results are combined into a single + * hash value by calling hash_combine64. * * Returns true if remainder produced when this computed single hash value is * divided by the given modulus is equal to given remainder, otherwise false. @@ -3129,59 +3132,159 @@ satisfies_hash_partition(PG_FUNCTION_ARGS) typedef struct ColumnsHashData { Oid relid; - int16 nkeys; + int nkeys; + Oid variadic_type; + int16 variadic_typlen; + bool variadic_typbyval; + char variadic_typalign; FmgrInfo partsupfunc[PARTITION_MAX_KEYS]; } ColumnsHashData; - Oid parentId = PG_GETARG_OID(0); - int modulus = PG_GETARG_INT32(1); - int remainder = PG_GETARG_INT32(2); - short nkeys = PG_NARGS() - 3; - int i; + Oid parentId; + int modulus; + int remainder; Datum seed = UInt64GetDatum(HASH_PARTITION_SEED); ColumnsHashData *my_extra; uint64 rowHash = 0; + /* Return null if the parent OID, modulus, or remainder is NULL. */ + if (PG_ARGISNULL(0) || PG_ARGISNULL(1) || PG_ARGISNULL(2)) + PG_RETURN_NULL(); + parentId = PG_GETARG_OID(0); + modulus = PG_GETARG_INT32(1); + remainder = PG_GETARG_INT32(2); + + /* Sanity check modulus and remainder. 
*/ + if (modulus <= 0) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("modulus for hash partition must be a positive integer"))); + if (remainder < 0) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("remainder for hash partition must be a non-negative integer"))); + if (remainder >= modulus) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("remainder for hash partition must be less than modulus"))); + /* * Cache hash function information. */ my_extra = (ColumnsHashData *) fcinfo->flinfo->fn_extra; - if (my_extra == NULL || my_extra->nkeys != nkeys || - my_extra->relid != parentId) + if (my_extra == NULL || my_extra->relid != parentId) { Relation parent; PartitionKey key; int j; + /* Open parent relation and fetch partition keyinfo */ + parent = try_relation_open(parentId, AccessShareLock); + if (parent == NULL) + PG_RETURN_NULL(); + key = RelationGetPartitionKey(parent); + + /* Reject parent table that is not hash-partitioned. */ + if (parent->rd_rel->relkind != RELKIND_PARTITIONED_TABLE || + key->strategy != PARTITION_STRATEGY_HASH) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("\"%s\" is not a hash partitioned table", + get_rel_name(parentId)))); + + if (!get_fn_expr_variadic(fcinfo->flinfo)) + { + int nargs = PG_NARGS() - 3; + + /* complain if wrong number of column values */ + if (key->partnatts != nargs) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("number of partitioning columns (%d) does not match number of partition keys provided (%d)", + key->partnatts, nargs))); + + /* allocate space for our cache */ fcinfo->flinfo->fn_extra = MemoryContextAllocZero(fcinfo->flinfo->fn_mcxt, offsetof(ColumnsHashData, partsupfunc) + - sizeof(FmgrInfo) * nkeys); + sizeof(FmgrInfo) * nargs); my_extra = (ColumnsHashData *) fcinfo->flinfo->fn_extra; - my_extra->nkeys = nkeys; my_extra->relid = parentId; + my_extra->nkeys = key->partnatts; - /* Open parent relation and fetch partition keyinfo */ - parent = heap_open(parentId, AccessShareLock); - key = RelationGetPartitionKey(parent); + /* check argument types and save fmgr_infos */ + for (j = 0; j < key->partnatts; ++j) + { + Oid argtype = get_fn_expr_argtype(fcinfo->flinfo, j + 3); + + if (argtype != key->parttypid[j] && !IsBinaryCoercible(argtype, key->parttypid[j])) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("column %d of the partition key has type \"%s\", but supplied value is of type \"%s\"", + j + 1, format_type_be(key->parttypid[j]), format_type_be(argtype)))); - Assert(key->partnatts == nkeys); - for (j = 0; j < nkeys; ++j) fmgr_info_copy(&my_extra->partsupfunc[j], - key->partsupfunc, + &key->partsupfunc[j], + fcinfo->flinfo->fn_mcxt); + } + + } + else + { + ArrayType *variadic_array = PG_GETARG_ARRAYTYPE_P(3); + + /* allocate space for our cache -- just one FmgrInfo in this case */ + fcinfo->flinfo->fn_extra = + MemoryContextAllocZero(fcinfo->flinfo->fn_mcxt, + offsetof(ColumnsHashData, partsupfunc) + + sizeof(FmgrInfo)); + my_extra = (ColumnsHashData *) fcinfo->flinfo->fn_extra; + my_extra->relid = parentId; + my_extra->nkeys = key->partnatts; + my_extra->variadic_type = ARR_ELEMTYPE(variadic_array); + get_typlenbyvalalign(my_extra->variadic_type, + &my_extra->variadic_typlen, + &my_extra->variadic_typbyval, + &my_extra->variadic_typalign); + + /* check argument types */ + for (j = 0; j < key->partnatts; ++j) + if (key->parttypid[j] != my_extra->variadic_type) + ereport(ERROR, + 
(errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("column %d of the partition key has type \"%s\", but supplied value is of type \"%s\"", + j + 1, + format_type_be(key->parttypid[j]), + format_type_be(my_extra->variadic_type)))); + + fmgr_info_copy(&my_extra->partsupfunc[0], + &key->partsupfunc[0], fcinfo->flinfo->fn_mcxt); + } /* Hold lock until commit */ - heap_close(parent, NoLock); + relation_close(parent, NoLock); } + if (!OidIsValid(my_extra->variadic_type)) + { + int nkeys = my_extra->nkeys; + int i; + + /* + * For a non-variadic call, neither the number of arguments nor their + * types can change across calls, so avoid the expense of rechecking + * here. + */ + for (i = 0; i < nkeys; i++) { + Datum hash; + /* keys start from fourth argument of function. */ int argno = i + 3; - if (!PG_ARGISNULL(argno)) - { - Datum hash; + if (PG_ARGISNULL(argno)) + continue; Assert(OidIsValid(my_extra->partsupfunc[i].fn_oid)); @@ -3193,6 +3296,45 @@ satisfies_hash_partition(PG_FUNCTION_ARGS) rowHash = hash_combine64(rowHash, DatumGetUInt64(hash)); } } + else + { + ArrayType *variadic_array = PG_GETARG_ARRAYTYPE_P(3); + int i; + int nelems; + Datum *datum; + bool *isnull; + + deconstruct_array(variadic_array, + my_extra->variadic_type, + my_extra->variadic_typlen, + my_extra->variadic_typbyval, + my_extra->variadic_typalign, + &datum, &isnull, &nelems); + + /* complain if wrong number of column values */ + if (nelems != my_extra->nkeys) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("number of partitioning columns (%d) does not match number of partition keys provided (%d)", + my_extra->nkeys, nelems))); + + for (i = 0; i < nelems; i++) + { + Datum hash; + + if (isnull[i]) + continue; + + Assert(OidIsValid(my_extra->partsupfunc[0].fn_oid)); + + hash = FunctionCall2(&my_extra->partsupfunc[0], + datum[i], + seed); + + /* Form a single 64-bit hash value */ + rowHash = hash_combine64(rowHash, DatumGetUInt64(hash)); + } + } PG_RETURN_BOOL(rowHash % modulus == remainder); } diff --git a/src/test/regress/expected/hash_part.out b/src/test/regress/expected/hash_part.out new file mode 100644 index 00000000..9e9e56f6 --- /dev/null +++ b/src/test/regress/expected/hash_part.out @@ -0,0 +1,113 @@ +-- +-- Hash partitioning. 
+-- +CREATE OR REPLACE FUNCTION hashint4_noop(int4, int8) RETURNS int8 AS +$$SELECT coalesce($1,0)::int8$$ LANGUAGE sql IMMUTABLE; +CREATE OPERATOR CLASS test_int4_ops FOR TYPE int4 USING HASH AS +OPERATOR 1 = , FUNCTION 2 hashint4_noop(int4, int8); +CREATE OR REPLACE FUNCTION hashtext_length(text, int8) RETURNS int8 AS +$$SELECT length(coalesce($1,''))::int8$$ LANGUAGE sql IMMUTABLE; +CREATE OPERATOR CLASS test_text_ops FOR TYPE text USING HASH AS +OPERATOR 1 = , FUNCTION 2 hashtext_length(text, int8); +CREATE TABLE mchash (a int, b text, c jsonb) + PARTITION BY HASH (a test_int4_ops, b test_text_ops); +CREATE TABLE mchash1 + PARTITION OF mchash FOR VALUES WITH (MODULUS 4, REMAINDER 0); +-- invalid OID, no such table +SELECT satisfies_hash_partition(0, 4, 0, NULL); + satisfies_hash_partition +-------------------------- + +(1 row) + +-- not partitioned +SELECT satisfies_hash_partition('tenk1'::regclass, 4, 0, NULL); +ERROR: "tenk1" is not a hash partitioned table +-- partition rather than the parent +SELECT satisfies_hash_partition('mchash1'::regclass, 4, 0, NULL); +ERROR: "mchash1" is not a hash partitioned table +-- invalid modulus +SELECT satisfies_hash_partition('mchash'::regclass, 0, 0, NULL); +ERROR: modulus for hash partition must be a positive integer +-- remainder too small +SELECT satisfies_hash_partition('mchash'::regclass, 1, -1, NULL); +ERROR: remainder for hash partition must be a non-negative integer +-- remainder too large +SELECT satisfies_hash_partition('mchash'::regclass, 1, 1, NULL); +ERROR: remainder for hash partition must be less than modulus +-- modulus is null +SELECT satisfies_hash_partition('mchash'::regclass, NULL, 0, NULL); + satisfies_hash_partition +-------------------------- + +(1 row) + +-- remainder is null +SELECT satisfies_hash_partition('mchash'::regclass, 4, NULL, NULL); + satisfies_hash_partition +-------------------------- + +(1 row) + +-- too many arguments +SELECT satisfies_hash_partition('mchash'::regclass, 4, 0, NULL::int, NULL::text, NULL::json); +ERROR: number of partitioning columns (2) does not match number of partition keys provided (3) +-- too few arguments +SELECT satisfies_hash_partition('mchash'::regclass, 3, 1, NULL::int); +ERROR: number of partitioning columns (2) does not match number of partition keys provided (1) +-- wrong argument type +SELECT satisfies_hash_partition('mchash'::regclass, 2, 1, NULL::int, NULL::int); +ERROR: column 2 of the partition key has type "text", but supplied value is of type "integer" +-- ok, should be false +SELECT satisfies_hash_partition('mchash'::regclass, 4, 0, 0, ''::text); + satisfies_hash_partition +-------------------------- + f +(1 row) + +-- ok, should be true +SELECT satisfies_hash_partition('mchash'::regclass, 4, 0, 1, ''::text); + satisfies_hash_partition +-------------------------- + t +(1 row) + +-- argument via variadic syntax, should fail because not all partitioning +-- columns are of the correct type +SELECT satisfies_hash_partition('mchash'::regclass, 2, 1, + variadic array[1,2]::int[]); +ERROR: column 2 of the partition key has type "text", but supplied value is of type "integer" +-- multiple partitioning columns of the same type +CREATE TABLE mcinthash (a int, b int, c jsonb) + PARTITION BY HASH (a test_int4_ops, b test_int4_ops); +-- now variadic should work, should be false +SELECT satisfies_hash_partition('mcinthash'::regclass, 4, 0, + variadic array[0, 0]); + satisfies_hash_partition +-------------------------- + f +(1 row) + +-- should be true +SELECT 
satisfies_hash_partition('mcinthash'::regclass, 4, 0, + variadic array[1, 0]); + satisfies_hash_partition +-------------------------- + t +(1 row) + +-- wrong length +SELECT satisfies_hash_partition('mcinthash'::regclass, 4, 0, + variadic array[]::int[]); +ERROR: number of partitioning columns (2) does not match number of partition keys provided (0) +-- wrong type +SELECT satisfies_hash_partition('mcinthash'::regclass, 4, 0, + variadic array[now(), now()]); +ERROR: column 1 of the partition key has type "integer", but supplied value is of type "timestamp with time zone" +-- cleanup +DROP TABLE mchash; +DROP TABLE mcinthash; +DROP OPERATOR CLASS test_text_ops USING hash; +DROP OPERATOR CLASS test_int4_ops USING hash; +DROP FUNCTION hashint4_noop(int4, int8); +DROP FUNCTION hashtext_length(text, int8); diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule index ab868e3a..7c3fa29e 100644 --- a/src/test/regress/parallel_schedule +++ b/src/test/regress/parallel_schedule @@ -134,7 +134,7 @@ test: plancache limit plpgsql copy2 temp domain prepare without_oid conversion t # ---------- # Another group of parallel tests # ---------- -test: identity partition_join +test: identity partition_join hash_part # event triggers cannot run concurrently with any test that runs DDL test: event_trigger diff --git a/src/test/regress/serial_schedule b/src/test/regress/serial_schedule index 28d7802d..890742ef 100644 --- a/src/test/regress/serial_schedule +++ b/src/test/regress/serial_schedule @@ -192,6 +192,7 @@ test: with test: xml test: identity test: partition_join +test: hash_part test: event_trigger test: fast_default test: stats diff --git a/src/test/regress/sql/hash_part.sql b/src/test/regress/sql/hash_part.sql new file mode 100644 index 00000000..94c5eaab --- /dev/null +++ b/src/test/regress/sql/hash_part.sql @@ -0,0 +1,90 @@ +-- +-- Hash partitioning. 
+-- + +CREATE OR REPLACE FUNCTION hashint4_noop(int4, int8) RETURNS int8 AS +$$SELECT coalesce($1,0)::int8$$ LANGUAGE sql IMMUTABLE; +CREATE OPERATOR CLASS test_int4_ops FOR TYPE int4 USING HASH AS +OPERATOR 1 = , FUNCTION 2 hashint4_noop(int4, int8); + +CREATE OR REPLACE FUNCTION hashtext_length(text, int8) RETURNS int8 AS +$$SELECT length(coalesce($1,''))::int8$$ LANGUAGE sql IMMUTABLE; +CREATE OPERATOR CLASS test_text_ops FOR TYPE text USING HASH AS +OPERATOR 1 = , FUNCTION 2 hashtext_length(text, int8); + +CREATE TABLE mchash (a int, b text, c jsonb) + PARTITION BY HASH (a test_int4_ops, b test_text_ops); +CREATE TABLE mchash1 + PARTITION OF mchash FOR VALUES WITH (MODULUS 4, REMAINDER 0); + +-- invalid OID, no such table +SELECT satisfies_hash_partition(0, 4, 0, NULL); + +-- not partitioned +SELECT satisfies_hash_partition('tenk1'::regclass, 4, 0, NULL); + +-- partition rather than the parent +SELECT satisfies_hash_partition('mchash1'::regclass, 4, 0, NULL); + +-- invalid modulus +SELECT satisfies_hash_partition('mchash'::regclass, 0, 0, NULL); + +-- remainder too small +SELECT satisfies_hash_partition('mchash'::regclass, 1, -1, NULL); + +-- remainder too large +SELECT satisfies_hash_partition('mchash'::regclass, 1, 1, NULL); + +-- modulus is null +SELECT satisfies_hash_partition('mchash'::regclass, NULL, 0, NULL); + +-- remainder is null +SELECT satisfies_hash_partition('mchash'::regclass, 4, NULL, NULL); + +-- too many arguments +SELECT satisfies_hash_partition('mchash'::regclass, 4, 0, NULL::int, NULL::text, NULL::json); + +-- too few arguments +SELECT satisfies_hash_partition('mchash'::regclass, 3, 1, NULL::int); + +-- wrong argument type +SELECT satisfies_hash_partition('mchash'::regclass, 2, 1, NULL::int, NULL::int); + +-- ok, should be false +SELECT satisfies_hash_partition('mchash'::regclass, 4, 0, 0, ''::text); + +-- ok, should be true +SELECT satisfies_hash_partition('mchash'::regclass, 4, 0, 1, ''::text); + +-- argument via variadic syntax, should fail because not all partitioning +-- columns are of the correct type +SELECT satisfies_hash_partition('mchash'::regclass, 2, 1, + variadic array[1,2]::int[]); + +-- multiple partitioning columns of the same type +CREATE TABLE mcinthash (a int, b int, c jsonb) + PARTITION BY HASH (a test_int4_ops, b test_int4_ops); + +-- now variadic should work, should be false +SELECT satisfies_hash_partition('mcinthash'::regclass, 4, 0, + variadic array[0, 0]); + +-- should be true +SELECT satisfies_hash_partition('mcinthash'::regclass, 4, 0, + variadic array[1, 0]); + +-- wrong length +SELECT satisfies_hash_partition('mcinthash'::regclass, 4, 0, + variadic array[]::int[]); + +-- wrong type +SELECT satisfies_hash_partition('mcinthash'::regclass, 4, 0, + variadic array[now(), now()]); + +-- cleanup +DROP TABLE mchash; +DROP TABLE mcinthash; +DROP OPERATOR CLASS test_text_ops USING hash; +DROP OPERATOR CLASS test_int4_ops USING hash; +DROP FUNCTION hashint4_noop(int4, int8); +DROP FUNCTION hashtext_length(text, int8); From 89b1b9957e6e6c18797efa9fcfacbfa19ee8d284 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Thu, 25 Jun 2020 20:51:35 +0800 Subject: [PATCH 207/578] Show partition info from psql \d+.http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/bin/psql/describe.c | 34 ++++++++++++++++++---- src/test/regress/expected/create_table.out | 11 ++++--- src/test/regress/expected/foreign_data.out | 3 ++ src/test/regress/expected/insert.out | 17 +++++++++++ src/test/regress/sql/create_table.sql | 2 +- 
src/test/regress/sql/insert.sql | 4 +++ 6 files changed, 60 insertions(+), 11 deletions(-) diff --git a/src/bin/psql/describe.c b/src/bin/psql/describe.c index 266c3c31..00cd59b0 100644 --- a/src/bin/psql/describe.c +++ b/src/bin/psql/describe.c @@ -2844,7 +2844,9 @@ describeOneTableDetails(const char *schemaname, /* print child tables (with additional info if partitions) */ if (pset.sversion >= 100000) printfPQExpBuffer(&buf, - "SELECT c.oid::pg_catalog.regclass, pg_catalog.pg_get_expr(c.relpartbound, c.oid)" + "SELECT c.oid::pg_catalog.regclass," + " pg_catalog.pg_get_expr(c.relpartbound, c.oid)," + " c.relkind" " FROM pg_catalog.pg_class c, pg_catalog.pg_inherits i" " WHERE c.oid=i.inhrelid AND i.inhparent = '%s'" " ORDER BY c.oid::pg_catalog.regclass::pg_catalog.text;", oid); @@ -2867,7 +2869,18 @@ describeOneTableDetails(const char *schemaname, else tuples = PQntuples(result); - if (!verbose) + /* + * For a partitioned table with no partitions, always print the number + * of partitions as zero, even when verbose output is expected. + * Otherwise, we will not print "Partitions" section for a partitioned + * table without any partitions. + */ + if (tableinfo.relkind == RELKIND_PARTITIONED_TABLE && tuples == 0) + { + printfPQExpBuffer(&buf, _("Number of partitions: %d"), tuples); + printTableAddFooter(&cont, buf.data); + } + else if (!verbose) { /* print the number of child tables, if any */ if (tuples > 0) @@ -2899,12 +2912,21 @@ describeOneTableDetails(const char *schemaname, } else { + char *partitioned_note; + + if (*PQgetvalue(result, i, 2) == RELKIND_PARTITIONED_TABLE) + partitioned_note = ", PARTITIONED"; + else + partitioned_note = ""; + if (i == 0) - printfPQExpBuffer(&buf, "%s: %s %s", - ct, PQgetvalue(result, i, 0), PQgetvalue(result, i, 1)); + printfPQExpBuffer(&buf, "%s: %s %s%s", + ct, PQgetvalue(result, i, 0), PQgetvalue(result, i, 1), + partitioned_note); else - printfPQExpBuffer(&buf, "%*s %s %s", - ctw, "", PQgetvalue(result, i, 0), PQgetvalue(result, i, 1)); + printfPQExpBuffer(&buf, "%*s %s %s%s", + ctw, "", PQgetvalue(result, i, 0), PQgetvalue(result, i, 1), + partitioned_note); } if (i < tuples - 1) appendPQExpBufferChar(&buf, ','); diff --git a/src/test/regress/expected/create_table.out b/src/test/regress/expected/create_table.out index 4ae86d8c..86c347be 100644 --- a/src/test/regress/expected/create_table.out +++ b/src/test/regress/expected/create_table.out @@ -431,13 +431,15 @@ ERROR: cannot inherit from partitioned table "partitioned2" c | text | | | d | text | | | Partition key: RANGE (a oid_ops, plusone(b), c, d COLLATE "C") +Number of partitions: 0 -\d partitioned2 +\d+ partitioned2 Table "public.partitioned2" - Column | Type | Collation | Nullable | Default ---------+---------+-----------+----------+--------- - a | integer | | | + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +--------+---------+-----------+----------+---------+---------+--------------+------------- + a | integer | | | | plain | | Partition key: LIST (((a + 1))) +Number of partitions: 0 DROP TABLE partitioned, partitioned2; -- @@ -877,6 +879,7 @@ SELECT obj_description('parted_col_comment'::regclass); a | integer | | | | plain | | Partition key b | text | | | | extended | | Partition key: LIST (a) +Number of partitions: 0 Distribute By: HASH(a) Location Nodes: ALL DATANODES diff --git a/src/test/regress/expected/foreign_data.out b/src/test/regress/expected/foreign_data.out index 1ba6f02c..a5326254 100644 --- a/src/test/regress/expected/foreign_data.out 
+++ b/src/test/regress/expected/foreign_data.out @@ -1518,6 +1518,7 @@ ERROR: foreign table "pt2_1" does not exist c2 | text | | | | extended | | c3 | date | | | | plain | | Partition key: LIST (c1) +Number of partitions: 0 Distribute By: HASH(c1) Location Nodes: ALL DATANODES @@ -1578,6 +1579,7 @@ ALTER TABLE pt2 ALTER c2 SET NOT NULL; c2 | text | | not null | | extended | | c3 | date | | | | plain | | Partition key: LIST (c1) +Number of partitions: 0 Distribute By: HASH(c1) Location Nodes: ALL DATANODES @@ -1601,6 +1603,7 @@ ALTER TABLE pt2 ADD CONSTRAINT pt2chk1 CHECK (c1 > 0); Partition key: LIST (c1) Check constraints: "pt2chk1" CHECK (c1 > 0) +Number of partitions: 0 Distribute By: HASH(c1) Location Nodes: ALL DATANODES diff --git a/src/test/regress/expected/insert.out b/src/test/regress/expected/insert.out index 96b99abb..d7fd4ee7 100644 --- a/src/test/regress/expected/insert.out +++ b/src/test/regress/expected/insert.out @@ -427,6 +427,23 @@ from hash_parted order by part; hpart3 | 11 | 3 (13 rows) +-- test \d+ output on a table which has both partitioned and unpartitioned +-- partitions +\d+ list_parted + Table "public.list_parted" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +--------+---------+-----------+----------+---------+----------+--------------+------------- + a | text | | | | extended | | + b | integer | | | | plain | | +Partition key: LIST (lower(a)) +Partitions: part_aa_bb FOR VALUES IN ('aa', 'bb'), + part_cc_dd FOR VALUES IN ('cc', 'dd'), + part_default DEFAULT, PARTITIONED, + part_ee_ff FOR VALUES IN ('ee', 'ff'), PARTITIONED, + part_gg FOR VALUES IN ('gg'), PARTITIONED, + part_null FOR VALUES IN (NULL), + part_xx_yy FOR VALUES IN ('xx', 'yy'), PARTITIONED + -- cleanup drop table range_parted, list_parted; drop table hash_parted; diff --git a/src/test/regress/sql/create_table.sql b/src/test/regress/sql/create_table.sql index c1cf6ee1..43ada6b3 100644 --- a/src/test/regress/sql/create_table.sql +++ b/src/test/regress/sql/create_table.sql @@ -423,7 +423,7 @@ CREATE TABLE fail () INHERITS (partitioned2); -- Partition key in describe output \d partitioned -\d partitioned2 +\d+ partitioned2 DROP TABLE partitioned, partitioned2; diff --git a/src/test/regress/sql/insert.sql b/src/test/regress/sql/insert.sql index ef7abf94..491af082 100644 --- a/src/test/regress/sql/insert.sql +++ b/src/test/regress/sql/insert.sql @@ -251,6 +251,10 @@ insert into hpart3 values(11); select tableoid::regclass as part, a, a%4 as "remainder = a % 4" from hash_parted order by part; +-- test \d+ output on a table which has both partitioned and unpartitioned +-- partitions +\d+ list_parted + -- cleanup drop table range_parted, list_parted; drop table hash_parted; From f54767eb3d7b462886c9d37e6055e94c07fd2ec5 Mon Sep 17 00:00:00 2001 From: Simon Riggs Date: Thu, 23 Nov 2017 05:17:47 +1100 Subject: [PATCH 208/578] Sort default partition to bottom of psql \d+ MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Minor patch to change sort order only Author: Ashutosh Bapat Reviewed-by: Álvaro Herrera, Simon Riggs --- src/bin/psql/describe.c | 3 ++- src/test/regress/expected/insert.out | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/bin/psql/describe.c b/src/bin/psql/describe.c index 00cd59b0..f198c238 100644 --- a/src/bin/psql/describe.c +++ b/src/bin/psql/describe.c @@ -2849,7 +2849,8 @@ describeOneTableDetails(const char *schemaname, " c.relkind" " FROM pg_catalog.pg_class c, 
pg_catalog.pg_inherits i" " WHERE c.oid=i.inhrelid AND i.inhparent = '%s'" - " ORDER BY c.oid::pg_catalog.regclass::pg_catalog.text;", oid); + " ORDER BY pg_catalog.pg_get_expr(c.relpartbound, c.oid) = 'DEFAULT'," + " c.oid::pg_catalog.regclass::pg_catalog.text;", oid); else if (pset.sversion >= 80300) printfPQExpBuffer(&buf, "SELECT c.oid::pg_catalog.regclass" diff --git a/src/test/regress/expected/insert.out b/src/test/regress/expected/insert.out index d7fd4ee7..503221fc 100644 --- a/src/test/regress/expected/insert.out +++ b/src/test/regress/expected/insert.out @@ -438,11 +438,11 @@ from hash_parted order by part; Partition key: LIST (lower(a)) Partitions: part_aa_bb FOR VALUES IN ('aa', 'bb'), part_cc_dd FOR VALUES IN ('cc', 'dd'), - part_default DEFAULT, PARTITIONED, part_ee_ff FOR VALUES IN ('ee', 'ff'), PARTITIONED, part_gg FOR VALUES IN ('gg'), PARTITIONED, part_null FOR VALUES IN (NULL), - part_xx_yy FOR VALUES IN ('xx', 'yy'), PARTITIONED + part_xx_yy FOR VALUES IN ('xx', 'yy'), PARTITIONED, + part_default DEFAULT, PARTITIONED -- cleanup drop table range_parted, list_parted; From ee4678a1e765d821e5561cc86b332f27f43fdc08 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Thu, 25 Jun 2020 20:58:08 +0800 Subject: [PATCH 209/578] Fix assorted syscache lookup sloppiness in partition-related code. --- src/backend/catalog/heap.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/backend/catalog/heap.c b/src/backend/catalog/heap.c index 0c382fe7..2f135c95 100644 --- a/src/backend/catalog/heap.c +++ b/src/backend/catalog/heap.c @@ -2797,6 +2797,8 @@ heap_drop_with_catalog(Oid relid) * shared-cache-inval notice that will make them update their index lists. */ tuple = SearchSysCache1(RELOID, ObjectIdGetDatum(relid)); + if (!HeapTupleIsValid(tuple)) + elog(ERROR, "cache lookup failed for relation %u", relid); if (((Form_pg_class) GETSTRUCT(tuple))->relispartition) { parentOid = get_partition_parent(relid); @@ -4243,9 +4245,6 @@ StorePartitionKey(Relation rel, Assert(rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE); - tuple = SearchSysCache1(PARTRELID, - ObjectIdGetDatum(RelationGetRelid(rel))); - /* Copy the partition attribute numbers, opclass OIDs into arrays */ partattrs_vec = buildint2vector(partattrs, partnatts); partopclass_vec = buildoidvector(partopclass, partnatts); From 367cc1a194bae456316c32e29ad73173164bdb86 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Thu, 25 Jun 2020 21:19:33 +0800 Subject: [PATCH 210/578] Add null test to partition constraint for default range partitions. 
http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/backend/catalog/partition.c | 27 ++++++++++++++++++++----- src/test/regress/expected/inherit.out | 9 +++++---- src/test/regress/expected/inherit_1.out | 9 +++++---- src/test/regress/expected/inherit_2.out | 7 ++++--- src/test/regress/expected/inherit_3.out | 9 +++++---- src/test/regress/expected/update.out | 2 +- src/test/regress/sql/inherit.sql | 9 +++++---- 7 files changed, 47 insertions(+), 25 deletions(-) diff --git a/src/backend/catalog/partition.c b/src/backend/catalog/partition.c index 092e925e..45dca13c 100644 --- a/src/backend/catalog/partition.c +++ b/src/backend/catalog/partition.c @@ -2163,12 +2163,29 @@ get_qual_for_range(Relation parent, PartitionBoundSpec *spec, if (or_expr_args != NIL) { - /* OR all the non-default partition constraints; then negate it */ - result = lappend(result, + Expr *other_parts_constr; + + /* + * Combine the constraints obtained for non-default partitions + * using OR. As requested, each of the OR's args doesn't include + * the NOT NULL test for partition keys (which is to avoid its + * useless repetition). Add the same now. + */ + other_parts_constr = + makeBoolExpr(AND_EXPR, + lappend(get_range_nulltest(key), list_length(or_expr_args) > 1 - ? makeBoolExpr(OR_EXPR, or_expr_args, -1) - : linitial(or_expr_args)); - result = list_make1(makeBoolExpr(NOT_EXPR, result, -1)); + ? makeBoolExpr(OR_EXPR, or_expr_args, + -1) + : linitial(or_expr_args)), + -1); + + /* + * Finally, the default partition contains everything *NOT* + * contained in the non-default partitions. + */ + result = list_make1(makeBoolExpr(NOT_EXPR, + list_make1(other_parts_constr), -1)); } return result; diff --git a/src/test/regress/expected/inherit.out b/src/test/regress/expected/inherit.out index 6e287dc4..51d9903d 100644 --- a/src/test/regress/expected/inherit.out +++ b/src/test/regress/expected/inherit.out @@ -2166,13 +2166,14 @@ drop table range_list_parted; -- check that constraint exclusion is able to cope with the partition -- constraint emitted for multi-column range partitioned tables create table mcrparted (a int, b int, c int) partition by range (a, abs(b), c); +create table mcrparted_def partition of mcrparted default; create table mcrparted0 partition of mcrparted for values from (minvalue, minvalue, minvalue) to (1, 1, 1); create table mcrparted1 partition of mcrparted for values from (1, 1, 1) to (10, 5, 10); create table mcrparted2 partition of mcrparted for values from (10, 5, 10) to (10, 10, 10); create table mcrparted3 partition of mcrparted for values from (11, 1, 1) to (20, 10, 10); create table mcrparted4 partition of mcrparted for values from (20, 10, 10) to (20, 20, 20); create table mcrparted5 partition of mcrparted for values from (20, 20, 20) to (maxvalue, maxvalue, maxvalue); -explain (costs off) select * from mcrparted where a = 0; -- scans mcrparted0 +explain (costs off) select * from mcrparted where a = 0; -- scans mcrparted0, mcrparted_def QUERY PLAN ------------------------------------------ Remote Subquery Scan on all (datanode_2) @@ -2181,7 +2182,7 @@ explain (costs off) select * from mcrparted where a = 0; -- scans mcrparted0 Filter: (a = 0) (4 rows) -explain (costs off) select * from mcrparted where a = 10 and abs(b) < 5; -- scans mcrparted1 +explain (costs off) select * from mcrparted where a = 10 and abs(b) < 5; -- scans mcrparted1, mcrparted_def QUERY PLAN --------------------------------------------------- Remote Subquery Scan on all (datanode_2) @@ -2190,7 
+2191,7 @@ explain (costs off) select * from mcrparted where a = 10 and abs(b) < 5; -- scan Filter: ((a = 10) AND (abs(b) < 5)) (4 rows) -explain (costs off) select * from mcrparted where a = 10 and abs(b) = 5; -- scans mcrparted1, mcrparted2 +explain (costs off) select * from mcrparted where a = 10 and abs(b) = 5; -- scans mcrparted1, mcrparted2, mcrparted_def QUERY PLAN --------------------------------------------------- Remote Subquery Scan on all (datanode_2) @@ -2246,7 +2247,7 @@ explain (costs off) select * from mcrparted where a = 20 and abs(b) = 10 and c > Filter: ((c > 10) AND (a = 20) AND (abs(b) = 10)) (4 rows) -explain (costs off) select * from mcrparted where a = 20 and c > 20; -- scans mcrparted3, mcrparte4, mcrparte5 +explain (costs off) select * from mcrparted where a = 20 and c > 20; -- scans mcrparted3, mcrparte4, mcrparte5, mcrparted_def QUERY PLAN ----------------------------------------------- Remote Subquery Scan on all (datanode_2) diff --git a/src/test/regress/expected/inherit_1.out b/src/test/regress/expected/inherit_1.out index ff38ed79..a6b99b17 100644 --- a/src/test/regress/expected/inherit_1.out +++ b/src/test/regress/expected/inherit_1.out @@ -2160,13 +2160,14 @@ drop table range_list_parted; -- check that constraint exclusion is able to cope with the partition -- constraint emitted for multi-column range partitioned tables create table mcrparted (a int, b int, c int) partition by range (a, abs(b), c); +create table mcrparted_def partition of mcrparted default; create table mcrparted0 partition of mcrparted for values from (minvalue, minvalue, minvalue) to (1, 1, 1); create table mcrparted1 partition of mcrparted for values from (1, 1, 1) to (10, 5, 10); create table mcrparted2 partition of mcrparted for values from (10, 5, 10) to (10, 10, 10); create table mcrparted3 partition of mcrparted for values from (11, 1, 1) to (20, 10, 10); create table mcrparted4 partition of mcrparted for values from (20, 10, 10) to (20, 20, 20); create table mcrparted5 partition of mcrparted for values from (20, 20, 20) to (maxvalue, maxvalue, maxvalue); -explain (costs off) select * from mcrparted where a = 0; -- scans mcrparted0 +explain (costs off) select * from mcrparted where a = 0; -- scans mcrparted0, mcrparted_def QUERY PLAN ------------------------------------ Remote Fast Query Execution @@ -2176,7 +2177,7 @@ explain (costs off) select * from mcrparted where a = 0; -- scans mcrparted0 Filter: (a = 0) (5 rows) -explain (costs off) select * from mcrparted where a = 10 and abs(b) < 5; -- scans mcrparted1 +explain (costs off) select * from mcrparted where a = 10 and abs(b) < 5; -- scans mcrparted1, mcrparted_def QUERY PLAN --------------------------------------------------- Remote Fast Query Execution @@ -2186,7 +2187,7 @@ explain (costs off) select * from mcrparted where a = 10 and abs(b) < 5; -- scan Filter: ((a = 10) AND (abs(b) < 5)) (5 rows) -explain (costs off) select * from mcrparted where a = 10 and abs(b) = 5; -- scans mcrparted1, mcrparted2 +explain (costs off) select * from mcrparted where a = 10 and abs(b) = 5; -- scans mcrparted1, mcrparted2, mcrparted_def QUERY PLAN --------------------------------------------------- Remote Fast Query Execution @@ -2246,7 +2247,7 @@ explain (costs off) select * from mcrparted where a = 20 and abs(b) = 10 and c > Filter: ((c > 10) AND (a = 20) AND (abs(b) = 10)) (5 rows) -explain (costs off) select * from mcrparted where a = 20 and c > 20; -- scans mcrparted3, mcrparte4, mcrparte5 +explain (costs off) select * from mcrparted 
where a = 20 and c > 20; -- scans mcrparted3, mcrparte4, mcrparte5, mcrparted_def QUERY PLAN ----------------------------------------------- Remote Fast Query Execution diff --git a/src/test/regress/expected/inherit_2.out b/src/test/regress/expected/inherit_2.out index 8d97e116..ef08ec3e 100644 --- a/src/test/regress/expected/inherit_2.out +++ b/src/test/regress/expected/inherit_2.out @@ -2131,13 +2131,14 @@ drop table range_list_parted; -- check that constraint exclusion is able to cope with the partition -- constraint emitted for multi-column range partitioned tables create table mcrparted (a int, b int, c int) partition by range (a, abs(b), c); +create table mcrparted_def partition of mcrparted default; create table mcrparted0 partition of mcrparted for values from (minvalue, minvalue, minvalue) to (1, 1, 1); create table mcrparted1 partition of mcrparted for values from (1, 1, 1) to (10, 5, 10); create table mcrparted2 partition of mcrparted for values from (10, 5, 10) to (10, 10, 10); create table mcrparted3 partition of mcrparted for values from (11, 1, 1) to (20, 10, 10); create table mcrparted4 partition of mcrparted for values from (20, 10, 10) to (20, 20, 20); create table mcrparted5 partition of mcrparted for values from (20, 20, 20) to (maxvalue, maxvalue, maxvalue); -explain (costs off) select * from mcrparted where a = 0; -- scans mcrparted0 +explain (costs off) select * from mcrparted where a = 0; -- scans mcrparted0, mcrparted_def QUERY PLAN ------------------------------------------ Remote Subquery Scan on all (datanode_2) @@ -2146,7 +2147,7 @@ explain (costs off) select * from mcrparted where a = 0; -- scans mcrparted0 Filter: (a = 0) (4 rows) -explain (costs off) select * from mcrparted where a = 10 and abs(b) < 5; -- scans mcrparted1 +explain (costs off) select * from mcrparted where a = 10 and abs(b) < 5; -- scans mcrparted1, mcrparted_def QUERY PLAN --------------------------------------------------- Remote Subquery Scan on all (datanode_2) @@ -2155,7 +2156,7 @@ explain (costs off) select * from mcrparted where a = 10 and abs(b) < 5; -- scan Filter: ((a = 10) AND (abs(b) < 5)) (4 rows) -explain (costs off) select * from mcrparted where a = 10 and abs(b) = 5; -- scans mcrparted1, mcrparted2 +explain (costs off) select * from mcrparted where a = 10 and abs(b) = 5; -- scans mcrparted1, mcrparted2, mcrparted_def, mcrparted_def QUERY PLAN --------------------------------------------------- Remote Subquery Scan on all (datanode_2) diff --git a/src/test/regress/expected/inherit_3.out b/src/test/regress/expected/inherit_3.out index 402c6a51..9a33a70d 100644 --- a/src/test/regress/expected/inherit_3.out +++ b/src/test/regress/expected/inherit_3.out @@ -2147,13 +2147,14 @@ drop table range_list_parted; -- check that constraint exclusion is able to cope with the partition -- constraint emitted for multi-column range partitioned tables create table mcrparted (a int, b int, c int) partition by range (a, abs(b), c); +create table mcrparted_def partition of mcrparted default; create table mcrparted0 partition of mcrparted for values from (minvalue, minvalue, minvalue) to (1, 1, 1); create table mcrparted1 partition of mcrparted for values from (1, 1, 1) to (10, 5, 10); create table mcrparted2 partition of mcrparted for values from (10, 5, 10) to (10, 10, 10); create table mcrparted3 partition of mcrparted for values from (11, 1, 1) to (20, 10, 10); create table mcrparted4 partition of mcrparted for values from (20, 10, 10) to (20, 20, 20); create table mcrparted5 partition of 
mcrparted for values from (20, 20, 20) to (maxvalue, maxvalue, maxvalue); -explain (costs off) select * from mcrparted where a = 0; -- scans mcrparted0 +explain (costs off) select * from mcrparted where a = 0; -- scans mcrparted0, mcrparted_def QUERY PLAN ------------------------------------ Remote Fast Query Execution @@ -2163,7 +2164,7 @@ explain (costs off) select * from mcrparted where a = 0; -- scans mcrparted0 Filter: (a = 0) (5 rows) -explain (costs off) select * from mcrparted where a = 10 and abs(b) < 5; -- scans mcrparted1 +explain (costs off) select * from mcrparted where a = 10 and abs(b) < 5; -- scans mcrparted1, mcrparted_def QUERY PLAN --------------------------------------------------- Remote Fast Query Execution @@ -2173,7 +2174,7 @@ explain (costs off) select * from mcrparted where a = 10 and abs(b) < 5; -- scan Filter: ((a = 10) AND (abs(b) < 5)) (5 rows) -explain (costs off) select * from mcrparted where a = 10 and abs(b) = 5; -- scans mcrparted1, mcrparted2 +explain (costs off) select * from mcrparted where a = 10 and abs(b) = 5; -- scans mcrparted1, mcrparted2, mcrparted_def QUERY PLAN --------------------------------------------------- Remote Fast Query Execution @@ -2233,7 +2234,7 @@ explain (costs off) select * from mcrparted where a = 20 and abs(b) = 10 and c > Filter: ((c > 10) AND (a = 20) AND (abs(b) = 10)) (5 rows) -explain (costs off) select * from mcrparted where a = 20 and c > 20; -- scans mcrparted3, mcrparte4, mcrparte5 +explain (costs off) select * from mcrparted where a = 20 and c > 20; -- scans mcrparted3, mcrparte4, mcrparte5, mcrparted_def QUERY PLAN ----------------------------------------------- Remote Fast Query Execution diff --git a/src/test/regress/expected/update.out b/src/test/regress/expected/update.out index 0aae60ac..9cdaf10f 100644 --- a/src/test/regress/expected/update.out +++ b/src/test/regress/expected/update.out @@ -227,7 +227,7 @@ create table part_def partition of range_parted default; a | text | | | | extended | | b | integer | | | | plain | | Partition of: range_parted DEFAULT -Partition constraint: (NOT (((a = 'a'::text) AND (b >= 1) AND (b < 10)) OR ((a = 'a'::text) AND (b >= 10) AND (b < 20)) OR ((a = 'b'::text) AND (b >= 1) AND (b < 10)) OR ((a = 'b'::text) AND (b >= 10) AND (b < 20)))) +Partition constraint: (NOT ((a IS NOT NULL) AND (b IS NOT NULL) AND (((a = 'a'::text) AND (b >= 1) AND (b < 10)) OR ((a = 'a'::text) AND (b >= 10) AND (b < 20)) OR ((a = 'b'::text) AND (b >= 1) AND (b < 10)) OR ((a = 'b'::text) AND (b >= 10) AND (b < 20))))) insert into range_parted values ('c', 9); -- ok diff --git a/src/test/regress/sql/inherit.sql b/src/test/regress/sql/inherit.sql index 58f7f523..ea17dd86 100644 --- a/src/test/regress/sql/inherit.sql +++ b/src/test/regress/sql/inherit.sql @@ -734,19 +734,20 @@ drop table range_list_parted; -- check that constraint exclusion is able to cope with the partition -- constraint emitted for multi-column range partitioned tables create table mcrparted (a int, b int, c int) partition by range (a, abs(b), c); +create table mcrparted_def partition of mcrparted default; create table mcrparted0 partition of mcrparted for values from (minvalue, minvalue, minvalue) to (1, 1, 1); create table mcrparted1 partition of mcrparted for values from (1, 1, 1) to (10, 5, 10); create table mcrparted2 partition of mcrparted for values from (10, 5, 10) to (10, 10, 10); create table mcrparted3 partition of mcrparted for values from (11, 1, 1) to (20, 10, 10); create table mcrparted4 partition of mcrparted for values 
from (20, 10, 10) to (20, 20, 20); create table mcrparted5 partition of mcrparted for values from (20, 20, 20) to (maxvalue, maxvalue, maxvalue); -explain (costs off) select * from mcrparted where a = 0; -- scans mcrparted0 -explain (costs off) select * from mcrparted where a = 10 and abs(b) < 5; -- scans mcrparted1 -explain (costs off) select * from mcrparted where a = 10 and abs(b) = 5; -- scans mcrparted1, mcrparted2 +explain (costs off) select * from mcrparted where a = 0; -- scans mcrparted0, mcrparted_def +explain (costs off) select * from mcrparted where a = 10 and abs(b) < 5; -- scans mcrparted1, mcrparted_def +explain (costs off) select * from mcrparted where a = 10 and abs(b) = 5; -- scans mcrparted1, mcrparted2, mcrparted_def explain (costs off) select * from mcrparted where abs(b) = 5; -- scans all partitions explain (costs off) select * from mcrparted where a > -1; -- scans all partitions explain (costs off) select * from mcrparted where a = 20 and abs(b) = 10 and c > 10; -- scans mcrparted4 -explain (costs off) select * from mcrparted where a = 20 and c > 20; -- scans mcrparted3, mcrparte4, mcrparte5 +explain (costs off) select * from mcrparted where a = 20 and c > 20; -- scans mcrparted3, mcrparte4, mcrparte5, mcrparted_def drop table mcrparted; -- check that partitioned table Appends cope with being referenced in From febad44a0674ef540a69bf37f3f9e240f1b1d3d0 Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Tue, 28 Nov 2017 14:11:16 -0500 Subject: [PATCH 211/578] If a range-partitioned table has no default partition, reject null keys. Commit 4e5fe9ad19e14af360de7970caa8b150436c9dec introduced this problem. Also add a test so it doesn't get broken again. Report by Rushabh Lathia. Fix by Amit Langote. Reviewed by Rushabh Lathia and Amul Sul. Tweaked by me. Discussion: http://postgr.es/m/CAGPqQf0Y1iJyk4QJBdMf=pS9i6Q0JUMM_h5-qkR3OMJ-e04PyA@mail.gmail.com --- src/backend/catalog/partition.c | 5 ++--- src/test/regress/expected/insert.out | 4 ++++ src/test/regress/sql/insert.sql | 3 +++ 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/src/backend/catalog/partition.c b/src/backend/catalog/partition.c index 45dca13c..544e3365 100644 --- a/src/backend/catalog/partition.c +++ b/src/backend/catalog/partition.c @@ -2582,11 +2582,10 @@ get_partition_for_tuple(Relation relation, Datum *values, bool *isnull) */ for (i = 0; i < key->partnatts; i++) { - if (isnull[i] && - partition_bound_has_default(partdesc->boundinfo)) + if (isnull[i]) { range_partkey_has_null = true; - part_index = partdesc->boundinfo->default_index; + break; } } diff --git a/src/test/regress/expected/insert.out b/src/test/regress/expected/insert.out index 503221fc..a3649273 100644 --- a/src/test/regress/expected/insert.out +++ b/src/test/regress/expected/insert.out @@ -692,6 +692,10 @@ create table mcrparted2 partition of mcrparted for values from (10, 6, minvalue) create table mcrparted3 partition of mcrparted for values from (11, 1, 1) to (20, 10, 10); create table mcrparted4 partition of mcrparted for values from (21, minvalue, minvalue) to (30, 20, maxvalue); create table mcrparted5 partition of mcrparted for values from (30, 21, 20) to (maxvalue, maxvalue, maxvalue); +-- null not allowed in range partition +insert into mcrparted values (null, null, null); +ERROR: no partition of relation "mcrparted" found for row +DETAIL: Partition key of the failing row contains (a, abs(b), c) = (null, null, null). 
-- routed to mcrparted0 insert into mcrparted values (0, 1, 1); insert into mcrparted0 values (0, 1, 1); diff --git a/src/test/regress/sql/insert.sql b/src/test/regress/sql/insert.sql index 491af082..e276954e 100644 --- a/src/test/regress/sql/insert.sql +++ b/src/test/regress/sql/insert.sql @@ -415,6 +415,9 @@ create table mcrparted3 partition of mcrparted for values from (11, 1, 1) to (20 create table mcrparted4 partition of mcrparted for values from (21, minvalue, minvalue) to (30, 20, maxvalue); create table mcrparted5 partition of mcrparted for values from (30, 21, 20) to (maxvalue, maxvalue, maxvalue); +-- null not allowed in range partition +insert into mcrparted values (null, null, null); + -- routed to mcrparted0 insert into mcrparted values (0, 1, 1); insert into mcrparted0 values (0, 1, 1); From 27f838fa4c17efe70c86406d96835ffe48c489f0 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Thu, 25 Jun 2020 21:28:02 +0800 Subject: [PATCH 212/578] Add extensive tests for partition pruning. http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/test/regress/expected/partition_prune.out | 1095 +++++++++++++++++ src/test/regress/parallel_schedule | 2 +- src/test/regress/serial_schedule | 1 + src/test/regress/sql/partition_prune.sql | 155 +++ 4 files changed, 1252 insertions(+), 1 deletion(-) create mode 100644 src/test/regress/expected/partition_prune.out create mode 100644 src/test/regress/sql/partition_prune.sql diff --git a/src/test/regress/expected/partition_prune.out b/src/test/regress/expected/partition_prune.out new file mode 100644 index 00000000..aabb0240 --- /dev/null +++ b/src/test/regress/expected/partition_prune.out @@ -0,0 +1,1095 @@ +-- +-- Test partitioning planner code +-- +create table lp (a char) partition by list (a); +create table lp_default partition of lp default; +create table lp_ef partition of lp for values in ('e', 'f'); +create table lp_ad partition of lp for values in ('a', 'd'); +create table lp_bc partition of lp for values in ('b', 'c'); +create table lp_g partition of lp for values in ('g'); +create table lp_null partition of lp for values in (null); +explain (costs off) select * from lp; + QUERY PLAN +------------------------------ + Append + -> Seq Scan on lp_ad + -> Seq Scan on lp_bc + -> Seq Scan on lp_ef + -> Seq Scan on lp_g + -> Seq Scan on lp_null + -> Seq Scan on lp_default +(7 rows) + +explain (costs off) select * from lp where a > 'a' and a < 'd'; + QUERY PLAN +----------------------------------------------------------- + Append + -> Seq Scan on lp_bc + Filter: ((a > 'a'::bpchar) AND (a < 'd'::bpchar)) + -> Seq Scan on lp_default + Filter: ((a > 'a'::bpchar) AND (a < 'd'::bpchar)) +(5 rows) + +explain (costs off) select * from lp where a > 'a' and a <= 'd'; + QUERY PLAN +------------------------------------------------------------ + Append + -> Seq Scan on lp_ad + Filter: ((a > 'a'::bpchar) AND (a <= 'd'::bpchar)) + -> Seq Scan on lp_bc + Filter: ((a > 'a'::bpchar) AND (a <= 'd'::bpchar)) + -> Seq Scan on lp_default + Filter: ((a > 'a'::bpchar) AND (a <= 'd'::bpchar)) +(7 rows) + +explain (costs off) select * from lp where a = 'a'; + QUERY PLAN +----------------------------------- + Append + -> Seq Scan on lp_ad + Filter: (a = 'a'::bpchar) +(3 rows) + +explain (costs off) select * from lp where 'a' = a; /* commuted */ + QUERY PLAN +----------------------------------- + Append + -> Seq Scan on lp_ad + Filter: ('a'::bpchar = a) +(3 rows) + +explain (costs off) select * from lp where a is not null; + QUERY PLAN 
+--------------------------------- + Append + -> Seq Scan on lp_ad + Filter: (a IS NOT NULL) + -> Seq Scan on lp_bc + Filter: (a IS NOT NULL) + -> Seq Scan on lp_ef + Filter: (a IS NOT NULL) + -> Seq Scan on lp_g + Filter: (a IS NOT NULL) + -> Seq Scan on lp_default + Filter: (a IS NOT NULL) +(11 rows) + +explain (costs off) select * from lp where a is null; + QUERY PLAN +----------------------------- + Append + -> Seq Scan on lp_null + Filter: (a IS NULL) +(3 rows) + +explain (costs off) select * from lp where a = 'a' or a = 'c'; + QUERY PLAN +---------------------------------------------------------- + Append + -> Seq Scan on lp_ad + Filter: ((a = 'a'::bpchar) OR (a = 'c'::bpchar)) + -> Seq Scan on lp_bc + Filter: ((a = 'a'::bpchar) OR (a = 'c'::bpchar)) +(5 rows) + +explain (costs off) select * from lp where a is not null and (a = 'a' or a = 'c'); + QUERY PLAN +-------------------------------------------------------------------------------- + Append + -> Seq Scan on lp_ad + Filter: ((a IS NOT NULL) AND ((a = 'a'::bpchar) OR (a = 'c'::bpchar))) + -> Seq Scan on lp_bc + Filter: ((a IS NOT NULL) AND ((a = 'a'::bpchar) OR (a = 'c'::bpchar))) +(5 rows) + +explain (costs off) select * from lp where a <> 'g'; + QUERY PLAN +------------------------------------ + Append + -> Seq Scan on lp_ad + Filter: (a <> 'g'::bpchar) + -> Seq Scan on lp_bc + Filter: (a <> 'g'::bpchar) + -> Seq Scan on lp_ef + Filter: (a <> 'g'::bpchar) + -> Seq Scan on lp_default + Filter: (a <> 'g'::bpchar) +(9 rows) + +explain (costs off) select * from lp where a <> 'a' and a <> 'd'; + QUERY PLAN +------------------------------------------------------------- + Append + -> Seq Scan on lp_bc + Filter: ((a <> 'a'::bpchar) AND (a <> 'd'::bpchar)) + -> Seq Scan on lp_ef + Filter: ((a <> 'a'::bpchar) AND (a <> 'd'::bpchar)) + -> Seq Scan on lp_g + Filter: ((a <> 'a'::bpchar) AND (a <> 'd'::bpchar)) + -> Seq Scan on lp_default + Filter: ((a <> 'a'::bpchar) AND (a <> 'd'::bpchar)) +(9 rows) + +explain (costs off) select * from lp where a not in ('a', 'd'); + QUERY PLAN +------------------------------------------------ + Append + -> Seq Scan on lp_bc + Filter: (a <> ALL ('{a,d}'::bpchar[])) + -> Seq Scan on lp_ef + Filter: (a <> ALL ('{a,d}'::bpchar[])) + -> Seq Scan on lp_g + Filter: (a <> ALL ('{a,d}'::bpchar[])) + -> Seq Scan on lp_default + Filter: (a <> ALL ('{a,d}'::bpchar[])) +(9 rows) + +-- collation matches the partitioning collation, pruning works +create table coll_pruning (a text collate "C") partition by list (a); +create table coll_pruning_a partition of coll_pruning for values in ('a'); +create table coll_pruning_b partition of coll_pruning for values in ('b'); +create table coll_pruning_def partition of coll_pruning default; +explain (costs off) select * from coll_pruning where a collate "C" = 'a' collate "C"; + QUERY PLAN +--------------------------------------------- + Append + -> Seq Scan on coll_pruning_a + Filter: (a = 'a'::text COLLATE "C") +(3 rows) + +-- collation doesn't match the partitioning collation, no pruning occurs +explain (costs off) select * from coll_pruning where a collate "POSIX" = 'a' collate "POSIX"; + QUERY PLAN +--------------------------------------------------------- + Append + -> Seq Scan on coll_pruning_a + Filter: ((a)::text = 'a'::text COLLATE "POSIX") + -> Seq Scan on coll_pruning_b + Filter: ((a)::text = 'a'::text COLLATE "POSIX") + -> Seq Scan on coll_pruning_def + Filter: ((a)::text = 'a'::text COLLATE "POSIX") +(7 rows) + +create table rlp (a int, b varchar) partition by range 
(a); +create table rlp_default partition of rlp default partition by list (a); +create table rlp_default_default partition of rlp_default default; +create table rlp_default_10 partition of rlp_default for values in (10); +create table rlp_default_30 partition of rlp_default for values in (30); +create table rlp_default_null partition of rlp_default for values in (null); +create table rlp1 partition of rlp for values from (minvalue) to (1); +create table rlp2 partition of rlp for values from (1) to (10); +create table rlp3 (b varchar, a int) partition by list (b varchar_ops); +create table rlp3_default partition of rlp3 default; +create table rlp3abcd partition of rlp3 for values in ('ab', 'cd'); +create table rlp3efgh partition of rlp3 for values in ('ef', 'gh'); +create table rlp3nullxy partition of rlp3 for values in (null, 'xy'); +alter table rlp attach partition rlp3 for values from (15) to (20); +create table rlp4 partition of rlp for values from (20) to (30) partition by range (a); +create table rlp4_default partition of rlp4 default; +create table rlp4_1 partition of rlp4 for values from (20) to (25); +create table rlp4_2 partition of rlp4 for values from (25) to (29); +create table rlp5 partition of rlp for values from (31) to (maxvalue) partition by range (a); +create table rlp5_default partition of rlp5 default; +create table rlp5_1 partition of rlp5 for values from (31) to (40); +explain (costs off) select * from rlp where a < 1; + QUERY PLAN +------------------------- + Append + -> Seq Scan on rlp1 + Filter: (a < 1) +(3 rows) + +explain (costs off) select * from rlp where 1 > a; /* commuted */ + QUERY PLAN +------------------------- + Append + -> Seq Scan on rlp1 + Filter: (1 > a) +(3 rows) + +explain (costs off) select * from rlp where a <= 1; + QUERY PLAN +--------------------------------------- + Append + -> Seq Scan on rlp1 + Filter: (a <= 1) + -> Seq Scan on rlp2 + Filter: (a <= 1) + -> Seq Scan on rlp_default_default + Filter: (a <= 1) +(7 rows) + +explain (costs off) select * from rlp where a = 1; + QUERY PLAN +------------------------- + Append + -> Seq Scan on rlp2 + Filter: (a = 1) +(3 rows) + +explain (costs off) select * from rlp where a = 1::bigint; /* same as above */ + QUERY PLAN +----------------------------------- + Append + -> Seq Scan on rlp2 + Filter: (a = '1'::bigint) +(3 rows) + +explain (costs off) select * from rlp where a = 1::numeric; /* no pruning */ + QUERY PLAN +----------------------------------------------- + Append + -> Seq Scan on rlp1 + Filter: ((a)::numeric = '1'::numeric) + -> Seq Scan on rlp2 + Filter: ((a)::numeric = '1'::numeric) + -> Seq Scan on rlp3abcd + Filter: ((a)::numeric = '1'::numeric) + -> Seq Scan on rlp3efgh + Filter: ((a)::numeric = '1'::numeric) + -> Seq Scan on rlp3nullxy + Filter: ((a)::numeric = '1'::numeric) + -> Seq Scan on rlp3_default + Filter: ((a)::numeric = '1'::numeric) + -> Seq Scan on rlp4_1 + Filter: ((a)::numeric = '1'::numeric) + -> Seq Scan on rlp4_2 + Filter: ((a)::numeric = '1'::numeric) + -> Seq Scan on rlp4_default + Filter: ((a)::numeric = '1'::numeric) + -> Seq Scan on rlp5_1 + Filter: ((a)::numeric = '1'::numeric) + -> Seq Scan on rlp5_default + Filter: ((a)::numeric = '1'::numeric) + -> Seq Scan on rlp_default_10 + Filter: ((a)::numeric = '1'::numeric) + -> Seq Scan on rlp_default_30 + Filter: ((a)::numeric = '1'::numeric) + -> Seq Scan on rlp_default_null + Filter: ((a)::numeric = '1'::numeric) + -> Seq Scan on rlp_default_default + Filter: ((a)::numeric = '1'::numeric) +(31 rows) + +explain (costs 
off) select * from rlp where a <= 10; + QUERY PLAN +--------------------------------------- + Append + -> Seq Scan on rlp1 + Filter: (a <= 10) + -> Seq Scan on rlp2 + Filter: (a <= 10) + -> Seq Scan on rlp_default_10 + Filter: (a <= 10) + -> Seq Scan on rlp_default_default + Filter: (a <= 10) +(9 rows) + +explain (costs off) select * from rlp where a > 10; + QUERY PLAN +--------------------------------------- + Append + -> Seq Scan on rlp3abcd + Filter: (a > 10) + -> Seq Scan on rlp3efgh + Filter: (a > 10) + -> Seq Scan on rlp3nullxy + Filter: (a > 10) + -> Seq Scan on rlp3_default + Filter: (a > 10) + -> Seq Scan on rlp4_1 + Filter: (a > 10) + -> Seq Scan on rlp4_2 + Filter: (a > 10) + -> Seq Scan on rlp4_default + Filter: (a > 10) + -> Seq Scan on rlp5_1 + Filter: (a > 10) + -> Seq Scan on rlp5_default + Filter: (a > 10) + -> Seq Scan on rlp_default_30 + Filter: (a > 10) + -> Seq Scan on rlp_default_default + Filter: (a > 10) +(23 rows) + +explain (costs off) select * from rlp where a < 15; + QUERY PLAN +--------------------------------------- + Append + -> Seq Scan on rlp1 + Filter: (a < 15) + -> Seq Scan on rlp2 + Filter: (a < 15) + -> Seq Scan on rlp_default_10 + Filter: (a < 15) + -> Seq Scan on rlp_default_default + Filter: (a < 15) +(9 rows) + +explain (costs off) select * from rlp where a <= 15; + QUERY PLAN +--------------------------------------- + Append + -> Seq Scan on rlp1 + Filter: (a <= 15) + -> Seq Scan on rlp2 + Filter: (a <= 15) + -> Seq Scan on rlp3abcd + Filter: (a <= 15) + -> Seq Scan on rlp3efgh + Filter: (a <= 15) + -> Seq Scan on rlp3nullxy + Filter: (a <= 15) + -> Seq Scan on rlp3_default + Filter: (a <= 15) + -> Seq Scan on rlp_default_10 + Filter: (a <= 15) + -> Seq Scan on rlp_default_default + Filter: (a <= 15) +(17 rows) + +explain (costs off) select * from rlp where a > 15 and b = 'ab'; + QUERY PLAN +--------------------------------------------------------- + Append + -> Seq Scan on rlp3abcd + Filter: ((a > 15) AND ((b)::text = 'ab'::text)) + -> Seq Scan on rlp4_1 + Filter: ((a > 15) AND ((b)::text = 'ab'::text)) + -> Seq Scan on rlp4_2 + Filter: ((a > 15) AND ((b)::text = 'ab'::text)) + -> Seq Scan on rlp4_default + Filter: ((a > 15) AND ((b)::text = 'ab'::text)) + -> Seq Scan on rlp5_1 + Filter: ((a > 15) AND ((b)::text = 'ab'::text)) + -> Seq Scan on rlp5_default + Filter: ((a > 15) AND ((b)::text = 'ab'::text)) + -> Seq Scan on rlp_default_30 + Filter: ((a > 15) AND ((b)::text = 'ab'::text)) + -> Seq Scan on rlp_default_default + Filter: ((a > 15) AND ((b)::text = 'ab'::text)) +(17 rows) + +explain (costs off) select * from rlp where a = 16; + QUERY PLAN +-------------------------------- + Append + -> Seq Scan on rlp3abcd + Filter: (a = 16) + -> Seq Scan on rlp3efgh + Filter: (a = 16) + -> Seq Scan on rlp3nullxy + Filter: (a = 16) + -> Seq Scan on rlp3_default + Filter: (a = 16) +(9 rows) + +explain (costs off) select * from rlp where a = 16 and b in ('not', 'in', 'here'); + QUERY PLAN +---------------------------------------------------------------------------- + Append + -> Seq Scan on rlp3_default + Filter: ((a = 16) AND ((b)::text = ANY ('{not,in,here}'::text[]))) +(3 rows) + +explain (costs off) select * from rlp where a = 16 and b < 'ab'; + QUERY PLAN +--------------------------------------------------------- + Append + -> Seq Scan on rlp3_default + Filter: (((b)::text < 'ab'::text) AND (a = 16)) +(3 rows) + +explain (costs off) select * from rlp where a = 16 and b <= 'ab'; + QUERY PLAN +---------------------------------------------------------- + 
Append + -> Seq Scan on rlp3abcd + Filter: (((b)::text <= 'ab'::text) AND (a = 16)) + -> Seq Scan on rlp3_default + Filter: (((b)::text <= 'ab'::text) AND (a = 16)) +(5 rows) + +explain (costs off) select * from rlp where a = 16 and b is null; + QUERY PLAN +-------------------------------------------- + Append + -> Seq Scan on rlp3nullxy + Filter: ((b IS NULL) AND (a = 16)) +(3 rows) + +explain (costs off) select * from rlp where a = 16 and b is not null; + QUERY PLAN +------------------------------------------------ + Append + -> Seq Scan on rlp3abcd + Filter: ((b IS NOT NULL) AND (a = 16)) + -> Seq Scan on rlp3efgh + Filter: ((b IS NOT NULL) AND (a = 16)) + -> Seq Scan on rlp3nullxy + Filter: ((b IS NOT NULL) AND (a = 16)) + -> Seq Scan on rlp3_default + Filter: ((b IS NOT NULL) AND (a = 16)) +(9 rows) + +explain (costs off) select * from rlp where a is null; + QUERY PLAN +------------------------------------ + Append + -> Seq Scan on rlp_default_null + Filter: (a IS NULL) +(3 rows) + +explain (costs off) select * from rlp where a is not null; + QUERY PLAN +--------------------------------------- + Append + -> Seq Scan on rlp1 + Filter: (a IS NOT NULL) + -> Seq Scan on rlp2 + Filter: (a IS NOT NULL) + -> Seq Scan on rlp3abcd + Filter: (a IS NOT NULL) + -> Seq Scan on rlp3efgh + Filter: (a IS NOT NULL) + -> Seq Scan on rlp3nullxy + Filter: (a IS NOT NULL) + -> Seq Scan on rlp3_default + Filter: (a IS NOT NULL) + -> Seq Scan on rlp4_1 + Filter: (a IS NOT NULL) + -> Seq Scan on rlp4_2 + Filter: (a IS NOT NULL) + -> Seq Scan on rlp4_default + Filter: (a IS NOT NULL) + -> Seq Scan on rlp5_1 + Filter: (a IS NOT NULL) + -> Seq Scan on rlp5_default + Filter: (a IS NOT NULL) + -> Seq Scan on rlp_default_10 + Filter: (a IS NOT NULL) + -> Seq Scan on rlp_default_30 + Filter: (a IS NOT NULL) + -> Seq Scan on rlp_default_default + Filter: (a IS NOT NULL) +(29 rows) + +explain (costs off) select * from rlp where a > 30; + QUERY PLAN +--------------------------------------- + Append + -> Seq Scan on rlp5_1 + Filter: (a > 30) + -> Seq Scan on rlp5_default + Filter: (a > 30) + -> Seq Scan on rlp_default_default + Filter: (a > 30) +(7 rows) + +explain (costs off) select * from rlp where a = 30; /* only default is scanned */ + QUERY PLAN +---------------------------------- + Append + -> Seq Scan on rlp_default_30 + Filter: (a = 30) +(3 rows) + +explain (costs off) select * from rlp where a <= 31; + QUERY PLAN +--------------------------------------- + Append + -> Seq Scan on rlp1 + Filter: (a <= 31) + -> Seq Scan on rlp2 + Filter: (a <= 31) + -> Seq Scan on rlp3abcd + Filter: (a <= 31) + -> Seq Scan on rlp3efgh + Filter: (a <= 31) + -> Seq Scan on rlp3nullxy + Filter: (a <= 31) + -> Seq Scan on rlp3_default + Filter: (a <= 31) + -> Seq Scan on rlp4_1 + Filter: (a <= 31) + -> Seq Scan on rlp4_2 + Filter: (a <= 31) + -> Seq Scan on rlp4_default + Filter: (a <= 31) + -> Seq Scan on rlp5_1 + Filter: (a <= 31) + -> Seq Scan on rlp5_default + Filter: (a <= 31) + -> Seq Scan on rlp_default_10 + Filter: (a <= 31) + -> Seq Scan on rlp_default_30 + Filter: (a <= 31) + -> Seq Scan on rlp_default_default + Filter: (a <= 31) +(29 rows) + +explain (costs off) select * from rlp where a = 1 or a = 7; + QUERY PLAN +-------------------------------------- + Append + -> Seq Scan on rlp2 + Filter: ((a = 1) OR (a = 7)) +(3 rows) + +explain (costs off) select * from rlp where a = 1 or b = 'ab'; + QUERY PLAN +------------------------------------------------------- + Append + -> Seq Scan on rlp1 + Filter: ((a = 1) OR ((b)::text = 
'ab'::text)) + -> Seq Scan on rlp2 + Filter: ((a = 1) OR ((b)::text = 'ab'::text)) + -> Seq Scan on rlp3abcd + Filter: ((a = 1) OR ((b)::text = 'ab'::text)) + -> Seq Scan on rlp4_1 + Filter: ((a = 1) OR ((b)::text = 'ab'::text)) + -> Seq Scan on rlp4_2 + Filter: ((a = 1) OR ((b)::text = 'ab'::text)) + -> Seq Scan on rlp4_default + Filter: ((a = 1) OR ((b)::text = 'ab'::text)) + -> Seq Scan on rlp5_1 + Filter: ((a = 1) OR ((b)::text = 'ab'::text)) + -> Seq Scan on rlp5_default + Filter: ((a = 1) OR ((b)::text = 'ab'::text)) + -> Seq Scan on rlp_default_10 + Filter: ((a = 1) OR ((b)::text = 'ab'::text)) + -> Seq Scan on rlp_default_30 + Filter: ((a = 1) OR ((b)::text = 'ab'::text)) + -> Seq Scan on rlp_default_null + Filter: ((a = 1) OR ((b)::text = 'ab'::text)) + -> Seq Scan on rlp_default_default + Filter: ((a = 1) OR ((b)::text = 'ab'::text)) +(25 rows) + +explain (costs off) select * from rlp where a > 20 and a < 27; + QUERY PLAN +----------------------------------------- + Append + -> Seq Scan on rlp4_1 + Filter: ((a > 20) AND (a < 27)) + -> Seq Scan on rlp4_2 + Filter: ((a > 20) AND (a < 27)) + -> Seq Scan on rlp4_default + Filter: ((a > 20) AND (a < 27)) +(7 rows) + +explain (costs off) select * from rlp where a = 29; + QUERY PLAN +-------------------------------- + Append + -> Seq Scan on rlp4_default + Filter: (a = 29) +(3 rows) + +explain (costs off) select * from rlp where a >= 29; + QUERY PLAN +--------------------------------------- + Append + -> Seq Scan on rlp4_default + Filter: (a >= 29) + -> Seq Scan on rlp5_1 + Filter: (a >= 29) + -> Seq Scan on rlp5_default + Filter: (a >= 29) + -> Seq Scan on rlp_default_30 + Filter: (a >= 29) + -> Seq Scan on rlp_default_default + Filter: (a >= 29) +(11 rows) + +-- redundant clauses are eliminated +explain (costs off) select * from rlp where a > 1 and a = 10; /* only default */ + QUERY PLAN +---------------------------------------- + Append + -> Seq Scan on rlp_default_10 + Filter: ((a > 1) AND (a = 10)) +(3 rows) + +explain (costs off) select * from rlp where a > 1 and a >=15; /* rlp3 onwards, including default */ + QUERY PLAN +----------------------------------------- + Append + -> Seq Scan on rlp3abcd + Filter: ((a > 1) AND (a >= 15)) + -> Seq Scan on rlp3efgh + Filter: ((a > 1) AND (a >= 15)) + -> Seq Scan on rlp3nullxy + Filter: ((a > 1) AND (a >= 15)) + -> Seq Scan on rlp3_default + Filter: ((a > 1) AND (a >= 15)) + -> Seq Scan on rlp4_1 + Filter: ((a > 1) AND (a >= 15)) + -> Seq Scan on rlp4_2 + Filter: ((a > 1) AND (a >= 15)) + -> Seq Scan on rlp4_default + Filter: ((a > 1) AND (a >= 15)) + -> Seq Scan on rlp5_1 + Filter: ((a > 1) AND (a >= 15)) + -> Seq Scan on rlp5_default + Filter: ((a > 1) AND (a >= 15)) + -> Seq Scan on rlp_default_30 + Filter: ((a > 1) AND (a >= 15)) + -> Seq Scan on rlp_default_default + Filter: ((a > 1) AND (a >= 15)) +(23 rows) + +explain (costs off) select * from rlp where a = 1 and a = 3; /* empty */ + QUERY PLAN +-------------------------- + Result + One-Time Filter: false +(2 rows) + +explain (costs off) select * from rlp where (a = 1 and a = 3) or (a > 1 and a = 15); + QUERY PLAN +------------------------------------------------------------------- + Append + -> Seq Scan on rlp2 + Filter: (((a = 1) AND (a = 3)) OR ((a > 1) AND (a = 15))) + -> Seq Scan on rlp3abcd + Filter: (((a = 1) AND (a = 3)) OR ((a > 1) AND (a = 15))) + -> Seq Scan on rlp3efgh + Filter: (((a = 1) AND (a = 3)) OR ((a > 1) AND (a = 15))) + -> Seq Scan on rlp3nullxy + Filter: (((a = 1) AND (a = 3)) OR ((a > 1) AND (a = 15))) + -> 
Seq Scan on rlp3_default + Filter: (((a = 1) AND (a = 3)) OR ((a > 1) AND (a = 15))) +(11 rows) + +-- multi-column keys +create table mc3p (a int, b int, c int) partition by range (a, abs(b), c); +create table mc3p_default partition of mc3p default; +create table mc3p0 partition of mc3p for values from (minvalue, minvalue, minvalue) to (1, 1, 1); +create table mc3p1 partition of mc3p for values from (1, 1, 1) to (10, 5, 10); +create table mc3p2 partition of mc3p for values from (10, 5, 10) to (10, 10, 10); +create table mc3p3 partition of mc3p for values from (10, 10, 10) to (10, 10, 20); +create table mc3p4 partition of mc3p for values from (10, 10, 20) to (10, maxvalue, maxvalue); +create table mc3p5 partition of mc3p for values from (11, 1, 1) to (20, 10, 10); +create table mc3p6 partition of mc3p for values from (20, 10, 10) to (20, 20, 20); +create table mc3p7 partition of mc3p for values from (20, 20, 20) to (maxvalue, maxvalue, maxvalue); +explain (costs off) select * from mc3p where a = 1; + QUERY PLAN +-------------------------------- + Append + -> Seq Scan on mc3p0 + Filter: (a = 1) + -> Seq Scan on mc3p1 + Filter: (a = 1) + -> Seq Scan on mc3p_default + Filter: (a = 1) +(7 rows) + +explain (costs off) select * from mc3p where a = 1 and abs(b) < 1; + QUERY PLAN +-------------------------------------------- + Append + -> Seq Scan on mc3p0 + Filter: ((a = 1) AND (abs(b) < 1)) + -> Seq Scan on mc3p_default + Filter: ((a = 1) AND (abs(b) < 1)) +(5 rows) + +explain (costs off) select * from mc3p where a = 1 and abs(b) = 1; + QUERY PLAN +-------------------------------------------- + Append + -> Seq Scan on mc3p0 + Filter: ((a = 1) AND (abs(b) = 1)) + -> Seq Scan on mc3p1 + Filter: ((a = 1) AND (abs(b) = 1)) + -> Seq Scan on mc3p_default + Filter: ((a = 1) AND (abs(b) = 1)) +(7 rows) + +explain (costs off) select * from mc3p where a = 1 and abs(b) = 1 and c < 8; + QUERY PLAN +-------------------------------------------------------- + Append + -> Seq Scan on mc3p0 + Filter: ((c < 8) AND (a = 1) AND (abs(b) = 1)) + -> Seq Scan on mc3p1 + Filter: ((c < 8) AND (a = 1) AND (abs(b) = 1)) + -> Seq Scan on mc3p_default + Filter: ((c < 8) AND (a = 1) AND (abs(b) = 1)) +(7 rows) + +explain (costs off) select * from mc3p where a = 10 and abs(b) between 5 and 35; + QUERY PLAN +----------------------------------------------------------------- + Append + -> Seq Scan on mc3p1 + Filter: ((a = 10) AND (abs(b) >= 5) AND (abs(b) <= 35)) + -> Seq Scan on mc3p2 + Filter: ((a = 10) AND (abs(b) >= 5) AND (abs(b) <= 35)) + -> Seq Scan on mc3p3 + Filter: ((a = 10) AND (abs(b) >= 5) AND (abs(b) <= 35)) + -> Seq Scan on mc3p4 + Filter: ((a = 10) AND (abs(b) >= 5) AND (abs(b) <= 35)) + -> Seq Scan on mc3p_default + Filter: ((a = 10) AND (abs(b) >= 5) AND (abs(b) <= 35)) +(11 rows) + +explain (costs off) select * from mc3p where a > 10; + QUERY PLAN +-------------------------------- + Append + -> Seq Scan on mc3p5 + Filter: (a > 10) + -> Seq Scan on mc3p6 + Filter: (a > 10) + -> Seq Scan on mc3p7 + Filter: (a > 10) + -> Seq Scan on mc3p_default + Filter: (a > 10) +(9 rows) + +explain (costs off) select * from mc3p where a >= 10; + QUERY PLAN +-------------------------------- + Append + -> Seq Scan on mc3p1 + Filter: (a >= 10) + -> Seq Scan on mc3p2 + Filter: (a >= 10) + -> Seq Scan on mc3p3 + Filter: (a >= 10) + -> Seq Scan on mc3p4 + Filter: (a >= 10) + -> Seq Scan on mc3p5 + Filter: (a >= 10) + -> Seq Scan on mc3p6 + Filter: (a >= 10) + -> Seq Scan on mc3p7 + Filter: (a >= 10) + -> Seq Scan on mc3p_default + 
Filter: (a >= 10) +(17 rows) + +explain (costs off) select * from mc3p where a < 10; + QUERY PLAN +-------------------------------- + Append + -> Seq Scan on mc3p0 + Filter: (a < 10) + -> Seq Scan on mc3p1 + Filter: (a < 10) + -> Seq Scan on mc3p_default + Filter: (a < 10) +(7 rows) + +explain (costs off) select * from mc3p where a <= 10 and abs(b) < 10; + QUERY PLAN +----------------------------------------------- + Append + -> Seq Scan on mc3p0 + Filter: ((a <= 10) AND (abs(b) < 10)) + -> Seq Scan on mc3p1 + Filter: ((a <= 10) AND (abs(b) < 10)) + -> Seq Scan on mc3p2 + Filter: ((a <= 10) AND (abs(b) < 10)) + -> Seq Scan on mc3p_default + Filter: ((a <= 10) AND (abs(b) < 10)) +(9 rows) + +explain (costs off) select * from mc3p where a = 11 and abs(b) = 0; + QUERY PLAN +--------------------------------------------- + Append + -> Seq Scan on mc3p_default + Filter: ((a = 11) AND (abs(b) = 0)) +(3 rows) + +explain (costs off) select * from mc3p where a = 20 and abs(b) = 10 and c = 100; + QUERY PLAN +------------------------------------------------------------ + Append + -> Seq Scan on mc3p6 + Filter: ((a = 20) AND (c = 100) AND (abs(b) = 10)) +(3 rows) + +explain (costs off) select * from mc3p where a > 20; + QUERY PLAN +-------------------------------- + Append + -> Seq Scan on mc3p7 + Filter: (a > 20) + -> Seq Scan on mc3p_default + Filter: (a > 20) +(5 rows) + +explain (costs off) select * from mc3p where a >= 20; + QUERY PLAN +-------------------------------- + Append + -> Seq Scan on mc3p5 + Filter: (a >= 20) + -> Seq Scan on mc3p6 + Filter: (a >= 20) + -> Seq Scan on mc3p7 + Filter: (a >= 20) + -> Seq Scan on mc3p_default + Filter: (a >= 20) +(9 rows) + +explain (costs off) select * from mc3p where (a = 1 and abs(b) = 1 and c = 1) or (a = 10 and abs(b) = 5 and c = 10) or (a > 11 and a < 20); + QUERY PLAN +--------------------------------------------------------------------------------------------------------------------------------- + Append + -> Seq Scan on mc3p1 + Filter: (((a = 1) AND (abs(b) = 1) AND (c = 1)) OR ((a = 10) AND (abs(b) = 5) AND (c = 10)) OR ((a > 11) AND (a < 20))) + -> Seq Scan on mc3p2 + Filter: (((a = 1) AND (abs(b) = 1) AND (c = 1)) OR ((a = 10) AND (abs(b) = 5) AND (c = 10)) OR ((a > 11) AND (a < 20))) + -> Seq Scan on mc3p5 + Filter: (((a = 1) AND (abs(b) = 1) AND (c = 1)) OR ((a = 10) AND (abs(b) = 5) AND (c = 10)) OR ((a > 11) AND (a < 20))) + -> Seq Scan on mc3p_default + Filter: (((a = 1) AND (abs(b) = 1) AND (c = 1)) OR ((a = 10) AND (abs(b) = 5) AND (c = 10)) OR ((a > 11) AND (a < 20))) +(9 rows) + +explain (costs off) select * from mc3p where (a = 1 and abs(b) = 1 and c = 1) or (a = 10 and abs(b) = 5 and c = 10) or (a > 11 and a < 20) or a < 1; + QUERY PLAN +-------------------------------------------------------------------------------------------------------------------------------------------- + Append + -> Seq Scan on mc3p0 + Filter: (((a = 1) AND (abs(b) = 1) AND (c = 1)) OR ((a = 10) AND (abs(b) = 5) AND (c = 10)) OR ((a > 11) AND (a < 20)) OR (a < 1)) + -> Seq Scan on mc3p1 + Filter: (((a = 1) AND (abs(b) = 1) AND (c = 1)) OR ((a = 10) AND (abs(b) = 5) AND (c = 10)) OR ((a > 11) AND (a < 20)) OR (a < 1)) + -> Seq Scan on mc3p2 + Filter: (((a = 1) AND (abs(b) = 1) AND (c = 1)) OR ((a = 10) AND (abs(b) = 5) AND (c = 10)) OR ((a > 11) AND (a < 20)) OR (a < 1)) + -> Seq Scan on mc3p5 + Filter: (((a = 1) AND (abs(b) = 1) AND (c = 1)) OR ((a = 10) AND (abs(b) = 5) AND (c = 10)) OR ((a > 11) AND (a < 20)) OR (a < 1)) + -> Seq Scan on mc3p_default + 
Filter: (((a = 1) AND (abs(b) = 1) AND (c = 1)) OR ((a = 10) AND (abs(b) = 5) AND (c = 10)) OR ((a > 11) AND (a < 20)) OR (a < 1)) +(11 rows) + +explain (costs off) select * from mc3p where (a = 1 and abs(b) = 1 and c = 1) or (a = 10 and abs(b) = 5 and c = 10) or (a > 11 and a < 20) or a < 1 or a = 1; + QUERY PLAN +------------------------------------------------------------------------------------------------------------------------------------------------------- + Append + -> Seq Scan on mc3p0 + Filter: (((a = 1) AND (abs(b) = 1) AND (c = 1)) OR ((a = 10) AND (abs(b) = 5) AND (c = 10)) OR ((a > 11) AND (a < 20)) OR (a < 1) OR (a = 1)) + -> Seq Scan on mc3p1 + Filter: (((a = 1) AND (abs(b) = 1) AND (c = 1)) OR ((a = 10) AND (abs(b) = 5) AND (c = 10)) OR ((a > 11) AND (a < 20)) OR (a < 1) OR (a = 1)) + -> Seq Scan on mc3p2 + Filter: (((a = 1) AND (abs(b) = 1) AND (c = 1)) OR ((a = 10) AND (abs(b) = 5) AND (c = 10)) OR ((a > 11) AND (a < 20)) OR (a < 1) OR (a = 1)) + -> Seq Scan on mc3p5 + Filter: (((a = 1) AND (abs(b) = 1) AND (c = 1)) OR ((a = 10) AND (abs(b) = 5) AND (c = 10)) OR ((a > 11) AND (a < 20)) OR (a < 1) OR (a = 1)) + -> Seq Scan on mc3p_default + Filter: (((a = 1) AND (abs(b) = 1) AND (c = 1)) OR ((a = 10) AND (abs(b) = 5) AND (c = 10)) OR ((a > 11) AND (a < 20)) OR (a < 1) OR (a = 1)) +(11 rows) + +explain (costs off) select * from mc3p where a = 1 or abs(b) = 1 or c = 1; + QUERY PLAN +------------------------------------------------------ + Append + -> Seq Scan on mc3p0 + Filter: ((a = 1) OR (abs(b) = 1) OR (c = 1)) + -> Seq Scan on mc3p1 + Filter: ((a = 1) OR (abs(b) = 1) OR (c = 1)) + -> Seq Scan on mc3p2 + Filter: ((a = 1) OR (abs(b) = 1) OR (c = 1)) + -> Seq Scan on mc3p4 + Filter: ((a = 1) OR (abs(b) = 1) OR (c = 1)) + -> Seq Scan on mc3p5 + Filter: ((a = 1) OR (abs(b) = 1) OR (c = 1)) + -> Seq Scan on mc3p6 + Filter: ((a = 1) OR (abs(b) = 1) OR (c = 1)) + -> Seq Scan on mc3p7 + Filter: ((a = 1) OR (abs(b) = 1) OR (c = 1)) + -> Seq Scan on mc3p_default + Filter: ((a = 1) OR (abs(b) = 1) OR (c = 1)) +(17 rows) + +explain (costs off) select * from mc3p where (a = 1 and abs(b) = 1) or (a = 10 and abs(b) = 10); + QUERY PLAN +------------------------------------------------------------------------------ + Append + -> Seq Scan on mc3p0 + Filter: (((a = 1) AND (abs(b) = 1)) OR ((a = 10) AND (abs(b) = 10))) + -> Seq Scan on mc3p1 + Filter: (((a = 1) AND (abs(b) = 1)) OR ((a = 10) AND (abs(b) = 10))) + -> Seq Scan on mc3p2 + Filter: (((a = 1) AND (abs(b) = 1)) OR ((a = 10) AND (abs(b) = 10))) + -> Seq Scan on mc3p3 + Filter: (((a = 1) AND (abs(b) = 1)) OR ((a = 10) AND (abs(b) = 10))) + -> Seq Scan on mc3p4 + Filter: (((a = 1) AND (abs(b) = 1)) OR ((a = 10) AND (abs(b) = 10))) + -> Seq Scan on mc3p_default + Filter: (((a = 1) AND (abs(b) = 1)) OR ((a = 10) AND (abs(b) = 10))) +(13 rows) + +explain (costs off) select * from mc3p where (a = 1 and abs(b) = 1) or (a = 10 and abs(b) = 9); + QUERY PLAN +----------------------------------------------------------------------------- + Append + -> Seq Scan on mc3p0 + Filter: (((a = 1) AND (abs(b) = 1)) OR ((a = 10) AND (abs(b) = 9))) + -> Seq Scan on mc3p1 + Filter: (((a = 1) AND (abs(b) = 1)) OR ((a = 10) AND (abs(b) = 9))) + -> Seq Scan on mc3p2 + Filter: (((a = 1) AND (abs(b) = 1)) OR ((a = 10) AND (abs(b) = 9))) + -> Seq Scan on mc3p_default + Filter: (((a = 1) AND (abs(b) = 1)) OR ((a = 10) AND (abs(b) = 9))) +(9 rows) + +-- a simpler multi-column keys case +create table mc2p (a int, b int) partition by range (a, b); +create table 
mc2p_default partition of mc2p default; +create table mc2p0 partition of mc2p for values from (minvalue, minvalue) to (1, minvalue); +create table mc2p1 partition of mc2p for values from (1, minvalue) to (1, 1); +create table mc2p2 partition of mc2p for values from (1, 1) to (2, minvalue); +create table mc2p3 partition of mc2p for values from (2, minvalue) to (2, 1); +create table mc2p4 partition of mc2p for values from (2, 1) to (2, maxvalue); +create table mc2p5 partition of mc2p for values from (2, maxvalue) to (maxvalue, maxvalue); +explain (costs off) select * from mc2p where a < 2; + QUERY PLAN +-------------------------------- + Append + -> Seq Scan on mc2p0 + Filter: (a < 2) + -> Seq Scan on mc2p1 + Filter: (a < 2) + -> Seq Scan on mc2p2 + Filter: (a < 2) + -> Seq Scan on mc2p_default + Filter: (a < 2) +(9 rows) + +explain (costs off) select * from mc2p where a = 2 and b < 1; + QUERY PLAN +--------------------------------------- + Append + -> Seq Scan on mc2p3 + Filter: ((b < 1) AND (a = 2)) +(3 rows) + +explain (costs off) select * from mc2p where a > 1; + QUERY PLAN +-------------------------------- + Append + -> Seq Scan on mc2p2 + Filter: (a > 1) + -> Seq Scan on mc2p3 + Filter: (a > 1) + -> Seq Scan on mc2p4 + Filter: (a > 1) + -> Seq Scan on mc2p5 + Filter: (a > 1) + -> Seq Scan on mc2p_default + Filter: (a > 1) +(11 rows) + +explain (costs off) select * from mc2p where a = 1 and b > 1; + QUERY PLAN +--------------------------------------- + Append + -> Seq Scan on mc2p2 + Filter: ((b > 1) AND (a = 1)) +(3 rows) + +-- boolean partitioning +create table boolpart (a bool) partition by list (a); +create table boolpart_default partition of boolpart default; +create table boolpart_t partition of boolpart for values in ('true'); +create table boolpart_f partition of boolpart for values in ('false'); +explain (costs off) select * from boolpart where a in (true, false); + QUERY PLAN +------------------------------------------------ + Append + -> Seq Scan on boolpart_f + Filter: (a = ANY ('{t,f}'::boolean[])) + -> Seq Scan on boolpart_t + Filter: (a = ANY ('{t,f}'::boolean[])) +(5 rows) + +explain (costs off) select * from boolpart where a = false; + QUERY PLAN +------------------------------------ + Append + -> Seq Scan on boolpart_f + Filter: (NOT a) + -> Seq Scan on boolpart_t + Filter: (NOT a) + -> Seq Scan on boolpart_default + Filter: (NOT a) +(7 rows) + +explain (costs off) select * from boolpart where not a = false; + QUERY PLAN +------------------------------------ + Append + -> Seq Scan on boolpart_f + Filter: a + -> Seq Scan on boolpart_t + Filter: a + -> Seq Scan on boolpart_default + Filter: a +(7 rows) + +explain (costs off) select * from boolpart where a is true or a is not true; + QUERY PLAN +-------------------------------------------------- + Append + -> Seq Scan on boolpart_f + Filter: ((a IS TRUE) OR (a IS NOT TRUE)) + -> Seq Scan on boolpart_t + Filter: ((a IS TRUE) OR (a IS NOT TRUE)) + -> Seq Scan on boolpart_default + Filter: ((a IS TRUE) OR (a IS NOT TRUE)) +(7 rows) + +explain (costs off) select * from boolpart where a is not true; + QUERY PLAN +------------------------------------ + Append + -> Seq Scan on boolpart_f + Filter: (a IS NOT TRUE) + -> Seq Scan on boolpart_t + Filter: (a IS NOT TRUE) + -> Seq Scan on boolpart_default + Filter: (a IS NOT TRUE) +(7 rows) + +explain (costs off) select * from boolpart where a is not true and a is not false; + QUERY PLAN +-------------------------------------------------------- + Append + -> Seq Scan on boolpart_f + 
Filter: ((a IS NOT TRUE) AND (a IS NOT FALSE)) + -> Seq Scan on boolpart_t + Filter: ((a IS NOT TRUE) AND (a IS NOT FALSE)) + -> Seq Scan on boolpart_default + Filter: ((a IS NOT TRUE) AND (a IS NOT FALSE)) +(7 rows) + +explain (costs off) select * from boolpart where a is unknown; + QUERY PLAN +------------------------------------ + Append + -> Seq Scan on boolpart_f + Filter: (a IS UNKNOWN) + -> Seq Scan on boolpart_t + Filter: (a IS UNKNOWN) + -> Seq Scan on boolpart_default + Filter: (a IS UNKNOWN) +(7 rows) + +explain (costs off) select * from boolpart where a is not unknown; + QUERY PLAN +------------------------------------ + Append + -> Seq Scan on boolpart_f + Filter: (a IS NOT UNKNOWN) + -> Seq Scan on boolpart_t + Filter: (a IS NOT UNKNOWN) + -> Seq Scan on boolpart_default + Filter: (a IS NOT UNKNOWN) +(7 rows) + +drop table lp, coll_pruning, rlp, mc3p, mc2p, boolpart; diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule index 7c3fa29e..d8a925ca 100644 --- a/src/test/regress/parallel_schedule +++ b/src/test/regress/parallel_schedule @@ -134,7 +134,7 @@ test: plancache limit plpgsql copy2 temp domain prepare without_oid conversion t # ---------- # Another group of parallel tests # ---------- -test: identity partition_join hash_part +test: identity partition_join partition_prune hash_part # event triggers cannot run concurrently with any test that runs DDL test: event_trigger diff --git a/src/test/regress/serial_schedule b/src/test/regress/serial_schedule index 890742ef..f0989763 100644 --- a/src/test/regress/serial_schedule +++ b/src/test/regress/serial_schedule @@ -192,6 +192,7 @@ test: with test: xml test: identity test: partition_join +test: partition_prune test: hash_part test: event_trigger test: fast_default diff --git a/src/test/regress/sql/partition_prune.sql b/src/test/regress/sql/partition_prune.sql new file mode 100644 index 00000000..514f8e5c --- /dev/null +++ b/src/test/regress/sql/partition_prune.sql @@ -0,0 +1,155 @@ +-- +-- Test partitioning planner code +-- +create table lp (a char) partition by list (a); +create table lp_default partition of lp default; +create table lp_ef partition of lp for values in ('e', 'f'); +create table lp_ad partition of lp for values in ('a', 'd'); +create table lp_bc partition of lp for values in ('b', 'c'); +create table lp_g partition of lp for values in ('g'); +create table lp_null partition of lp for values in (null); +explain (costs off) select * from lp; +explain (costs off) select * from lp where a > 'a' and a < 'd'; +explain (costs off) select * from lp where a > 'a' and a <= 'd'; +explain (costs off) select * from lp where a = 'a'; +explain (costs off) select * from lp where 'a' = a; /* commuted */ +explain (costs off) select * from lp where a is not null; +explain (costs off) select * from lp where a is null; +explain (costs off) select * from lp where a = 'a' or a = 'c'; +explain (costs off) select * from lp where a is not null and (a = 'a' or a = 'c'); +explain (costs off) select * from lp where a <> 'g'; +explain (costs off) select * from lp where a <> 'a' and a <> 'd'; +explain (costs off) select * from lp where a not in ('a', 'd'); + +-- collation matches the partitioning collation, pruning works +create table coll_pruning (a text collate "C") partition by list (a); +create table coll_pruning_a partition of coll_pruning for values in ('a'); +create table coll_pruning_b partition of coll_pruning for values in ('b'); +create table coll_pruning_def partition of coll_pruning default; 
+explain (costs off) select * from coll_pruning where a collate "C" = 'a' collate "C"; +-- collation doesn't match the partitioning collation, no pruning occurs +explain (costs off) select * from coll_pruning where a collate "POSIX" = 'a' collate "POSIX"; + +create table rlp (a int, b varchar) partition by range (a); +create table rlp_default partition of rlp default partition by list (a); +create table rlp_default_default partition of rlp_default default; +create table rlp_default_10 partition of rlp_default for values in (10); +create table rlp_default_30 partition of rlp_default for values in (30); +create table rlp_default_null partition of rlp_default for values in (null); +create table rlp1 partition of rlp for values from (minvalue) to (1); +create table rlp2 partition of rlp for values from (1) to (10); + +create table rlp3 (b varchar, a int) partition by list (b varchar_ops); +create table rlp3_default partition of rlp3 default; +create table rlp3abcd partition of rlp3 for values in ('ab', 'cd'); +create table rlp3efgh partition of rlp3 for values in ('ef', 'gh'); +create table rlp3nullxy partition of rlp3 for values in (null, 'xy'); +alter table rlp attach partition rlp3 for values from (15) to (20); + +create table rlp4 partition of rlp for values from (20) to (30) partition by range (a); +create table rlp4_default partition of rlp4 default; +create table rlp4_1 partition of rlp4 for values from (20) to (25); +create table rlp4_2 partition of rlp4 for values from (25) to (29); + +create table rlp5 partition of rlp for values from (31) to (maxvalue) partition by range (a); +create table rlp5_default partition of rlp5 default; +create table rlp5_1 partition of rlp5 for values from (31) to (40); + +explain (costs off) select * from rlp where a < 1; +explain (costs off) select * from rlp where 1 > a; /* commuted */ +explain (costs off) select * from rlp where a <= 1; +explain (costs off) select * from rlp where a = 1; +explain (costs off) select * from rlp where a = 1::bigint; /* same as above */ +explain (costs off) select * from rlp where a = 1::numeric; /* no pruning */ +explain (costs off) select * from rlp where a <= 10; +explain (costs off) select * from rlp where a > 10; +explain (costs off) select * from rlp where a < 15; +explain (costs off) select * from rlp where a <= 15; +explain (costs off) select * from rlp where a > 15 and b = 'ab'; +explain (costs off) select * from rlp where a = 16; +explain (costs off) select * from rlp where a = 16 and b in ('not', 'in', 'here'); +explain (costs off) select * from rlp where a = 16 and b < 'ab'; +explain (costs off) select * from rlp where a = 16 and b <= 'ab'; +explain (costs off) select * from rlp where a = 16 and b is null; +explain (costs off) select * from rlp where a = 16 and b is not null; +explain (costs off) select * from rlp where a is null; +explain (costs off) select * from rlp where a is not null; +explain (costs off) select * from rlp where a > 30; +explain (costs off) select * from rlp where a = 30; /* only default is scanned */ +explain (costs off) select * from rlp where a <= 31; +explain (costs off) select * from rlp where a = 1 or a = 7; +explain (costs off) select * from rlp where a = 1 or b = 'ab'; + +explain (costs off) select * from rlp where a > 20 and a < 27; +explain (costs off) select * from rlp where a = 29; +explain (costs off) select * from rlp where a >= 29; + +-- redundant clauses are eliminated +explain (costs off) select * from rlp where a > 1 and a = 10; /* only default */ +explain (costs off) 
select * from rlp where a > 1 and a >=15; /* rlp3 onwards, including default */ +explain (costs off) select * from rlp where a = 1 and a = 3; /* empty */ +explain (costs off) select * from rlp where (a = 1 and a = 3) or (a > 1 and a = 15); + +-- multi-column keys +create table mc3p (a int, b int, c int) partition by range (a, abs(b), c); +create table mc3p_default partition of mc3p default; +create table mc3p0 partition of mc3p for values from (minvalue, minvalue, minvalue) to (1, 1, 1); +create table mc3p1 partition of mc3p for values from (1, 1, 1) to (10, 5, 10); +create table mc3p2 partition of mc3p for values from (10, 5, 10) to (10, 10, 10); +create table mc3p3 partition of mc3p for values from (10, 10, 10) to (10, 10, 20); +create table mc3p4 partition of mc3p for values from (10, 10, 20) to (10, maxvalue, maxvalue); +create table mc3p5 partition of mc3p for values from (11, 1, 1) to (20, 10, 10); +create table mc3p6 partition of mc3p for values from (20, 10, 10) to (20, 20, 20); +create table mc3p7 partition of mc3p for values from (20, 20, 20) to (maxvalue, maxvalue, maxvalue); + +explain (costs off) select * from mc3p where a = 1; +explain (costs off) select * from mc3p where a = 1 and abs(b) < 1; +explain (costs off) select * from mc3p where a = 1 and abs(b) = 1; +explain (costs off) select * from mc3p where a = 1 and abs(b) = 1 and c < 8; +explain (costs off) select * from mc3p where a = 10 and abs(b) between 5 and 35; +explain (costs off) select * from mc3p where a > 10; +explain (costs off) select * from mc3p where a >= 10; +explain (costs off) select * from mc3p where a < 10; +explain (costs off) select * from mc3p where a <= 10 and abs(b) < 10; +explain (costs off) select * from mc3p where a = 11 and abs(b) = 0; +explain (costs off) select * from mc3p where a = 20 and abs(b) = 10 and c = 100; +explain (costs off) select * from mc3p where a > 20; +explain (costs off) select * from mc3p where a >= 20; +explain (costs off) select * from mc3p where (a = 1 and abs(b) = 1 and c = 1) or (a = 10 and abs(b) = 5 and c = 10) or (a > 11 and a < 20); +explain (costs off) select * from mc3p where (a = 1 and abs(b) = 1 and c = 1) or (a = 10 and abs(b) = 5 and c = 10) or (a > 11 and a < 20) or a < 1; +explain (costs off) select * from mc3p where (a = 1 and abs(b) = 1 and c = 1) or (a = 10 and abs(b) = 5 and c = 10) or (a > 11 and a < 20) or a < 1 or a = 1; +explain (costs off) select * from mc3p where a = 1 or abs(b) = 1 or c = 1; +explain (costs off) select * from mc3p where (a = 1 and abs(b) = 1) or (a = 10 and abs(b) = 10); +explain (costs off) select * from mc3p where (a = 1 and abs(b) = 1) or (a = 10 and abs(b) = 9); + +-- a simpler multi-column keys case +create table mc2p (a int, b int) partition by range (a, b); +create table mc2p_default partition of mc2p default; +create table mc2p0 partition of mc2p for values from (minvalue, minvalue) to (1, minvalue); +create table mc2p1 partition of mc2p for values from (1, minvalue) to (1, 1); +create table mc2p2 partition of mc2p for values from (1, 1) to (2, minvalue); +create table mc2p3 partition of mc2p for values from (2, minvalue) to (2, 1); +create table mc2p4 partition of mc2p for values from (2, 1) to (2, maxvalue); +create table mc2p5 partition of mc2p for values from (2, maxvalue) to (maxvalue, maxvalue); + +explain (costs off) select * from mc2p where a < 2; +explain (costs off) select * from mc2p where a = 2 and b < 1; +explain (costs off) select * from mc2p where a > 1; +explain (costs off) select * from mc2p where a = 1 and b 
> 1; + +-- boolean partitioning +create table boolpart (a bool) partition by list (a); +create table boolpart_default partition of boolpart default; +create table boolpart_t partition of boolpart for values in ('true'); +create table boolpart_f partition of boolpart for values in ('false'); + +explain (costs off) select * from boolpart where a in (true, false); +explain (costs off) select * from boolpart where a = false; +explain (costs off) select * from boolpart where not a = false; +explain (costs off) select * from boolpart where a is true or a is not true; +explain (costs off) select * from boolpart where a is not true; +explain (costs off) select * from boolpart where a is not true and a is not false; +explain (costs off) select * from boolpart where a is unknown; +explain (costs off) select * from boolpart where a is not unknown; + +drop table lp, coll_pruning, rlp, mc3p, mc2p, boolpart; From baa79c39fd912cefd2ed9f77a4f75c380be58983 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Thu, 25 Jun 2020 21:34:24 +0800 Subject: [PATCH 213/578] New C function: bms_add_range. http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/backend/nodes/bitmapset.c | 72 +++++++++++++++++++++++++++++++++++ src/include/nodes/bitmapset.h | 1 + 2 files changed, 73 insertions(+) diff --git a/src/backend/nodes/bitmapset.c b/src/backend/nodes/bitmapset.c index 70a53947..8ec465d2 100644 --- a/src/backend/nodes/bitmapset.c +++ b/src/backend/nodes/bitmapset.c @@ -943,6 +943,78 @@ bms_clean_members(Bitmapset *a) #endif +/* + * bms_add_range + * Add members in the range of 'lower' to 'upper' to the set. + * + * Note this could also be done by calling bms_add_member in a loop, however, + * using this function will be faster when the range is large as we work with + * at the bitmapword level rather than at bit level. + */ +Bitmapset * +bms_add_range(Bitmapset *a, int lower, int upper) +{ + int lwordnum, + lbitnum, + uwordnum, + ushiftbits, + wordnum; + + if (lower < 0 || upper < 0) + elog(ERROR, "negative bitmapset member not allowed"); + if (lower > upper) + elog(ERROR, "lower range must not be above upper range"); + uwordnum = WORDNUM(upper); + + if (a == NULL) + { + a = (Bitmapset *) palloc0(BITMAPSET_SIZE(uwordnum + 1)); + a->nwords = uwordnum + 1; + } + + /* ensure we have enough words to store the upper bit */ + else if (uwordnum >= a->nwords) + { + int oldnwords = a->nwords; + int i; + + a = (Bitmapset *) repalloc(a, BITMAPSET_SIZE(uwordnum + 1)); + a->nwords = uwordnum + 1; + /* zero out the enlarged portion */ + for (i = oldnwords; i < a->nwords; i++) + a->words[i] = 0; + } + + wordnum = lwordnum = WORDNUM(lower); + + lbitnum = BITNUM(lower); + ushiftbits = BITS_PER_BITMAPWORD - (BITNUM(upper) + 1); + + /* + * Special case when lwordnum is the same as uwordnum we must perform the + * upper and lower masking on the word. + */ + if (lwordnum == uwordnum) + { + a->words[lwordnum] |= ~(bitmapword) (((bitmapword) 1 << lbitnum) - 1) + & (~(bitmapword) 0) >> ushiftbits; + } + else + { + /* turn on lbitnum and all bits left of it */ + a->words[wordnum++] |= ~(bitmapword) (((bitmapword) 1 << lbitnum) - 1); + + /* turn on all bits for any intermediate words */ + while (wordnum < uwordnum) + a->words[wordnum++] = ~(bitmapword) 0; + + /* turn on upper's bit and all bits right of it. 
*/ + a->words[uwordnum] |= (~(bitmapword) 0) >> ushiftbits; + } + + return a; +} + /* * bms_int_members - like bms_intersect, but left input is recycled */ diff --git a/src/include/nodes/bitmapset.h b/src/include/nodes/bitmapset.h index 800dcb52..fc101c8f 100644 --- a/src/include/nodes/bitmapset.h +++ b/src/include/nodes/bitmapset.h @@ -165,6 +165,7 @@ extern bool bms_is_empty(const Bitmapset *a); extern Bitmapset *bms_add_member(Bitmapset *a, int x); extern Bitmapset *bms_del_member(Bitmapset *a, int x); extern Bitmapset *bms_add_members(Bitmapset *a, const Bitmapset *b); +extern Bitmapset *bms_add_range(Bitmapset *a, int lower, int upper); extern Bitmapset *bms_int_members(Bitmapset *a, const Bitmapset *b); extern Bitmapset *bms_del_members(Bitmapset *a, const Bitmapset *b); extern Bitmapset *bms_join(Bitmapset *a, Bitmapset *b); From c092b1ee10419e26a151e29269e410aee1eb809f Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Fri, 1 Dec 2017 10:01:50 -0500 Subject: [PATCH 214/578] Fix uninitialized memory reference. Without this, when partdesc->nparts == 0, we end up calling ExecBuildSlotPartitionKeyDescription without initializing values and isnull. Reported by Coverity via Michael Paquier. Patch by Michael Paquier, reviewed and revised by Amit Langote. Discussion: http://postgr.es/m/CAB7nPqQ3mwkdMoPY-ocgTpPnjd8TKOadMxdTtMLvEzF8480Zfg@mail.gmail.com --- src/backend/executor/execPartition.c | 18 +++++++++++------- src/test/regress/expected/insert.out | 4 ++++ src/test/regress/sql/insert.sql | 4 ++++ 3 files changed, 19 insertions(+), 7 deletions(-) diff --git a/src/backend/executor/execPartition.c b/src/backend/executor/execPartition.c index d275cefe..537d8986 100644 --- a/src/backend/executor/execPartition.c +++ b/src/backend/executor/execPartition.c @@ -206,13 +206,6 @@ ExecFindPartition(ResultRelInfo *resultRelInfo, PartitionDispatch *pd, slot = myslot; } - /* Quick exit */ - if (partdesc->nparts == 0) - { - result = -1; - break; - } - /* * Extract partition key from tuple. Expression evaluation machinery * that FormPartitionKeyDatum() invokes expects ecxt_scantuple to @@ -223,6 +216,17 @@ ExecFindPartition(ResultRelInfo *resultRelInfo, PartitionDispatch *pd, */ ecxt->ecxt_scantuple = slot; FormPartitionKeyDatum(parent, slot, estate, values, isnull); + + /* + * Nothing for get_partition_for_tuple() to do if there are no + * partitions to begin with. + */ + if (partdesc->nparts == 0) + { + result = -1; + break; + } + cur_index = get_partition_for_tuple(rel, values, isnull); /* diff --git a/src/test/regress/expected/insert.out b/src/test/regress/expected/insert.out index a3649273..a671b345 100644 --- a/src/test/regress/expected/insert.out +++ b/src/test/regress/expected/insert.out @@ -167,6 +167,10 @@ create table range_parted ( a text, b int ) partition by range (a, (b+0)); +-- no partitions, so fail +insert into range_parted values ('a', 11); +ERROR: no partition of relation "range_parted" found for row +DETAIL: Partition key of the failing row contains (a, (b + 0)) = (a, 11). 
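The crux of the fix above is ordering: FormPartitionKeyDatum() must run before the quick exit for a parent that has no partitions, because the error that ExecFindPartition() raises afterwards passes values/isnull to ExecBuildSlotPartitionKeyDescription(). A condensed sketch of the corrected sequence follows; it only restates the execPartition.c hunk above with explanatory comments, it is not additional code.

    /*
     * Extract the partition key first, so that values[]/isnull[] are always
     * initialized before any error message is built from them.
     */
    ecxt->ecxt_scantuple = slot;
    FormPartitionKeyDatum(parent, slot, estate, values, isnull);

    /* Only now is it safe to give up when the parent has no partitions. */
    if (partdesc->nparts == 0)
    {
        result = -1;            /* reported as "no partition found" below */
        break;
    }

    cur_index = get_partition_for_tuple(rel, values, isnull);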
create table part1 partition of range_parted for values from ('a', 1) to ('a', 10); create table part2 partition of range_parted for values from ('a', 10) to ('a', 20); create table part3 partition of range_parted for values from ('b', 1) to ('b', 10); diff --git a/src/test/regress/sql/insert.sql b/src/test/regress/sql/insert.sql index e276954e..21d04de1 100644 --- a/src/test/regress/sql/insert.sql +++ b/src/test/regress/sql/insert.sql @@ -90,6 +90,10 @@ create table range_parted ( a text, b int ) partition by range (a, (b+0)); + +-- no partitions, so fail +insert into range_parted values ('a', 11); + create table part1 partition of range_parted for values from ('a', 1) to ('a', 10); create table part2 partition of range_parted for values from ('a', 10) to ('a', 20); create table part3 partition of range_parted for values from ('b', 1) to ('b', 10); From 7fb1e744f91ba38be271464b7029d8af9026176f Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Fri, 1 Dec 2017 10:58:08 -0500 Subject: [PATCH 215/578] Try to exclude partitioned tables in toto. Ashutosh Bapat, reviewed by Jeevan Chalke. Comment by me. Discussion: http://postgr.es/m/CAFjFpRcuRaydz88CY_aQekmuvmN2A9ax5z0k=ppT+s8KS8xMRA@mail.gmail.com --- src/backend/optimizer/util/plancat.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c index fc680b63..dba8d09d 100644 --- a/src/backend/optimizer/util/plancat.c +++ b/src/backend/optimizer/util/plancat.c @@ -1564,8 +1564,18 @@ relation_excluded_by_constraints(PlannerInfo *root, if (predicate_refuted_by(safe_restrictions, safe_restrictions, false)) return true; - /* Only plain relations have constraints */ - if (rte->rtekind != RTE_RELATION || rte->inh) + /* + * Only plain relations have constraints. In a partitioning hierarchy, + * but not with regular table inheritance, it's OK to assume that any + * constraints that hold for the parent also hold for every child; for + * instance, table inheritance allows the parent to have constraints + * marked NO INHERIT, but table partitioning does not. We choose to check + * whether the partitioning parents can be excluded here; doing so + * consumes some cycles, but potentially saves us the work of excluding + * each child individually. + */ + if (rte->rtekind != RTE_RELATION || + (rte->inh && rte->relkind != RELKIND_PARTITIONED_TABLE)) return false; /* From 10aba7a168d0338278f20948149d8d45f15ad5c8 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Thu, 25 Jun 2020 21:48:02 +0800 Subject: [PATCH 216/578] Re-allow INSERT .. 
ON CONFLICT DO NOTHING on partitioned tables.http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- doc/src/sgml/ddl.sgml | 13 +++++++++---- src/backend/commands/copy.c | 3 ++- src/backend/executor/execPartition.c | 15 ++++++++++----- src/backend/executor/nodeModifyTable.c | 3 ++- src/backend/parser/analyze.c | 8 -------- src/include/executor/execPartition.h | 3 ++- src/test/regress/expected/insert_conflict.out | 13 +++++++++++++ src/test/regress/sql/insert_conflict.sql | 13 +++++++++++++ 8 files changed, 51 insertions(+), 20 deletions(-) diff --git a/doc/src/sgml/ddl.sgml b/doc/src/sgml/ddl.sgml index a65a130f..168c5f54 100644 --- a/doc/src/sgml/ddl.sgml +++ b/doc/src/sgml/ddl.sgml @@ -3558,10 +3558,15 @@ ALTER TABLE measurement ATTACH PARTITION measurement_y2008m02 Using the ON CONFLICT clause with partitioned tables - will cause an error, because unique or exclusion constraints can only be - created on individual partitions. There is no support for enforcing - uniqueness (or an exclusion constraint) across an entire partitioning - hierarchy. + will cause an error if the conflict target is specified (see + for more details on how the clause + works). Therefore, it is not possible to specify + DO UPDATE as the alternative action, because + specifying the conflict target is mandatory in that case. On the other + hand, specifying DO NOTHING as the alternative action + works fine provided the conflict target is not specified. In that case, + unique constraints (or exclusion constraints) of the individual leaf + partitions are considered. diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c index e376f863..d0f69503 100644 --- a/src/backend/commands/copy.c +++ b/src/backend/commands/copy.c @@ -1643,7 +1643,8 @@ BeginCopy(ParseState *pstate, int num_parted, num_partitions; - ExecSetupPartitionTupleRouting(rel, + ExecSetupPartitionTupleRouting(NULL, + rel, 1, &partition_dispatch_info, &partitions, diff --git a/src/backend/executor/execPartition.c b/src/backend/executor/execPartition.c index 537d8986..08a27d71 100644 --- a/src/backend/executor/execPartition.c +++ b/src/backend/executor/execPartition.c @@ -63,7 +63,8 @@ static char *ExecBuildSlotPartitionKeyDescription(Relation rel, * RowExclusiveLock mode upon return from this function. */ void -ExecSetupPartitionTupleRouting(Relation rel, +ExecSetupPartitionTupleRouting(ModifyTableState *mtstate, + Relation rel, Index resultRTindex, EState *estate, PartitionDispatch **pd, @@ -133,13 +134,17 @@ ExecSetupPartitionTupleRouting(Relation rel, CheckValidResultRel(leaf_part_rri, CMD_INSERT); /* - * Open partition indices (remember we do not support ON CONFLICT in - * case of partitioned tables, so we do not need support information - * for speculative insertion) + * Open partition indices. The user may have asked to check for + * conflicts within this leaf partition and do "nothing" instead of + * throwing an error. Be prepared in that case by initializing the + * index information needed by ExecInsert() to perform speculative + * insertions. 
*/ if (leaf_part_rri->ri_RelationDesc->rd_rel->relhasindex && leaf_part_rri->ri_IndexRelationDescs == NULL) - ExecOpenIndices(leaf_part_rri, false); + ExecOpenIndices(leaf_part_rri, + mtstate != NULL && + mtstate->mt_onconflict != ONCONFLICT_NONE); estate->es_leaf_result_relations = lappend(estate->es_leaf_result_relations, leaf_part_rri); diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c index 660bfd4b..749c320e 100644 --- a/src/backend/executor/nodeModifyTable.c +++ b/src/backend/executor/nodeModifyTable.c @@ -2704,7 +2704,8 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) int num_parted, num_partitions; - ExecSetupPartitionTupleRouting(rel, + ExecSetupPartitionTupleRouting(mtstate, + rel, node->nominalRelation, &partition_dispatch_info, &partitions, diff --git a/src/backend/parser/analyze.c b/src/backend/parser/analyze.c index fb5e27f1..62db0557 100644 --- a/src/backend/parser/analyze.c +++ b/src/backend/parser/analyze.c @@ -1307,16 +1307,8 @@ transformInsertStmt(ParseState *pstate, InsertStmt *stmt) /* Process ON CONFLICT, if any. */ if (stmt->onConflictClause) - { - /* Bail out if target relation is partitioned table */ - if (pstate->p_target_rangetblentry->relkind == RELKIND_PARTITIONED_TABLE) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("ON CONFLICT clause is not supported with partitioned tables"))); - qry->onConflict = transformOnConflictClause(pstate, stmt->onConflictClause); - } /* * If we have a RETURNING clause, we need to add the target relation to diff --git a/src/include/executor/execPartition.h b/src/include/executor/execPartition.h index 64e5aab4..703ff4f7 100644 --- a/src/include/executor/execPartition.h +++ b/src/include/executor/execPartition.h @@ -49,7 +49,8 @@ typedef struct PartitionDispatchData typedef struct PartitionDispatchData *PartitionDispatch; -extern void ExecSetupPartitionTupleRouting(Relation rel, +extern void ExecSetupPartitionTupleRouting(ModifyTableState *mtstate, + Relation rel, Index resultRTindex, EState *estate, PartitionDispatch **pd, diff --git a/src/test/regress/expected/insert_conflict.out b/src/test/regress/expected/insert_conflict.out index d316b344..f10974de 100644 --- a/src/test/regress/expected/insert_conflict.out +++ b/src/test/regress/expected/insert_conflict.out @@ -814,3 +814,16 @@ select * from selfconflict; (3 rows) drop table selfconflict; +-- check that the following works: +-- insert into partitioned_table on conflict do nothing +create table parted_conflict_test (a int, b char) partition by list (a); +create table parted_conflict_test_1 partition of parted_conflict_test (b unique) for values in (1); +insert into parted_conflict_test values (1, 'a') on conflict do nothing; +insert into parted_conflict_test values (1, 'a') on conflict do nothing; +-- however, on conflict do update is not supported yet +insert into parted_conflict_test values (1) on conflict (b) do update set a = excluded.a; +ERROR: there is no unique or exclusion constraint matching the ON CONFLICT specification +-- but it works OK if we target the partition directly +insert into parted_conflict_test_1 values (1) on conflict (b) do +update set a = excluded.a; +drop table parted_conflict_test; diff --git a/src/test/regress/sql/insert_conflict.sql b/src/test/regress/sql/insert_conflict.sql index 58518bf2..92dfdd85 100644 --- a/src/test/regress/sql/insert_conflict.sql +++ b/src/test/regress/sql/insert_conflict.sql @@ -475,3 +475,16 @@ commit; select * from selfconflict order by 1; 
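On the executor side, the change above reduces to one decision per leaf partition: open the partition's indexes with speculative-insertion support only when the statement actually carries an ON CONFLICT clause. A condensed sketch of that gate, using the mtstate and ONCONFLICT_NONE symbols from the hunks above (the local variable is introduced here purely for readability; the committed code writes the condition inline):

    /*
     * mtstate is NULL when tuple routing is set up for COPY, which never
     * uses ON CONFLICT, so COPY keeps skipping speculative-insertion
     * support for leaf-partition indexes.
     */
    bool    speculative = (mtstate != NULL &&
                           mtstate->mt_onconflict != ONCONFLICT_NONE);

    if (leaf_part_rri->ri_RelationDesc->rd_rel->relhasindex &&
        leaf_part_rri->ri_IndexRelationDescs == NULL)
        ExecOpenIndices(leaf_part_rri, speculative);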
drop table selfconflict; + +-- check that the following works: +-- insert into partitioned_table on conflict do nothing +create table parted_conflict_test (a int, b char) partition by list (a); +create table parted_conflict_test_1 partition of parted_conflict_test (b unique) for values in (1); +insert into parted_conflict_test values (1, 'a') on conflict do nothing; +insert into parted_conflict_test values (1, 'a') on conflict do nothing; +-- however, on conflict do update is not supported yet +insert into parted_conflict_test values (1) on conflict (b) do update set a = excluded.a; +-- but it works OK if we target the partition directly +insert into parted_conflict_test_1 values (1) on conflict (b) do +update set a = excluded.a; +drop table parted_conflict_test; From b659c7636b4ec937749f00fb8131427e456e035c Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Fri, 1 Dec 2017 13:52:59 -0500 Subject: [PATCH 217/578] Minor code beautification in partition_bounds_equal. Use get_greatest_modulus more consistently, instead of doing the same thing in an ad-hoc manner in this one place. Ashutosh Bapat Discussion: http://postgr.es/m/CAFjFpReT9L4RCiJBKOyWC2=i02kv9uG2fx=4Fv7kFY2t0SPCgw@mail.gmail.com --- src/backend/catalog/partition.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/backend/catalog/partition.c b/src/backend/catalog/partition.c index 544e3365..c5550d1e 100644 --- a/src/backend/catalog/partition.c +++ b/src/backend/catalog/partition.c @@ -784,15 +784,13 @@ partition_bounds_equal(int partnatts, int16 *parttyplen, bool *parttypbyval, if (b1->strategy == PARTITION_STRATEGY_HASH) { - int greatest_modulus; + int greatest_modulus = get_greatest_modulus(b1); /* * If two hash partitioned tables have different greatest moduli, - * their partition schemes don't match. For hash partitioned table, - * the greatest modulus is given by the last datum and number of - * partitions is given by ndatums. + * their partition schemes don't match. */ - if (b1->datums[b1->ndatums - 1][0] != b2->datums[b2->ndatums - 1][0]) + if (greatest_modulus != get_greatest_modulus(b2)) return false; /* @@ -806,7 +804,6 @@ partition_bounds_equal(int partnatts, int16 *parttyplen, bool *parttypbyval, * their indexes array will be same. So, it suffices to compare * indexes array. */ - greatest_modulus = get_greatest_modulus(b1); for (i = 0; i < greatest_modulus; i++) if (b1->indexes[i] != b2->indexes[i]) return false; From c4e1c445a6c4f40ae10680c50bca603e3b26d8c8 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Fri, 26 Jun 2020 10:36:13 +0800 Subject: [PATCH 218/578] Prohibit identity columns on typed tables and partitions. 
http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/backend/parser/parse_utilcmd.c | 13 +++++++++++++ src/test/regress/expected/identity.out | 12 ++++++++++++ src/test/regress/sql/identity.sql | 16 ++++++++++++++++ 3 files changed, 41 insertions(+) diff --git a/src/backend/parser/parse_utilcmd.c b/src/backend/parser/parse_utilcmd.c index e75e6b5e..a5c17b8b 100644 --- a/src/backend/parser/parse_utilcmd.c +++ b/src/backend/parser/parse_utilcmd.c @@ -137,6 +137,7 @@ typedef struct #endif bool ispartitioned; /* true if table is partitioned */ PartitionBoundSpec *partbound; /* transformed FOR VALUES */ + bool ofType; /* true if statement contains OF typename */ } CreateStmtContext; /* State shared by transformCreateSchemaStmt and its subroutines */ @@ -375,6 +376,8 @@ transformCreateStmt(CreateStmt *stmt, const char *queryString) #else cxt.ispartitioned = stmt->partspec != NULL; #endif + cxt.partbound = stmt->partbound; + cxt.ofType = (stmt->ofTypename != NULL); /* * Notice that we allow OIDs here only for plain tables, even though * foreign tables also support them. This is necessary because the @@ -1049,6 +1052,15 @@ transformColumnDefinition(CreateStmtContext *cxt, ColumnDef *column) Type ctype; Oid typeOid; + if (cxt->ofType) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("identity columns are not supported on typed tables"))); + if (cxt->partbound) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("identity columns are not supported on partitions"))); + ctype = typenameType(cxt->pstate, column->typeName, NULL); typeOid = HeapTupleGetOid(ctype); ReleaseSysCache(ctype); @@ -3316,6 +3328,7 @@ transformAlterTableStmt(Oid relid, AlterTableStmt *stmt, #endif cxt.ispartitioned = (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE); cxt.partbound = NULL; + cxt.ofType = false; /* * The only subtypes that currently require parse transformation handling
diff --git a/src/test/regress/expected/identity.out b/src/test/regress/expected/identity.out index 7844395d..5dd9909b 100644 --- a/src/test/regress/expected/identity.out +++ b/src/test/regress/expected/identity.out @@ -326,3 +326,15 @@ SELECT * FROM itest8; RESET ROLE; DROP TABLE itest8; DROP USER regress_user1; +-- typed tables (currently not supported) +CREATE TYPE itest_type AS (f1 integer, f2 text, f3 bigint); +CREATE TABLE itest12 OF itest_type (f1 WITH OPTIONS GENERATED ALWAYS AS IDENTITY); -- error +ERROR: identity columns are not supported on typed tables +DROP TYPE itest_type CASCADE; +-- table partitions (currently not supported) +CREATE TABLE itest_parent (f1 date NOT NULL, f2 text, f3 bigint) PARTITION BY RANGE (f1); +CREATE TABLE itest_child PARTITION OF itest_parent ( + f3 WITH OPTIONS GENERATED ALWAYS AS IDENTITY +) FOR VALUES FROM ('2016-07-01') TO ('2016-08-01'); -- error +ERROR: identity columns are not supported on partitions +DROP TABLE itest_parent;
diff --git a/src/test/regress/sql/identity.sql b/src/test/regress/sql/identity.sql index 4e19fde2..13d9e4c1 100644 --- a/src/test/regress/sql/identity.sql +++ b/src/test/regress/sql/identity.sql @@ -190,3 +190,19 @@ SELECT * FROM itest8; RESET ROLE; DROP TABLE itest8; DROP USER regress_user1; + + +-- typed tables (currently not supported) + +CREATE TYPE itest_type AS (f1 integer, f2 text, f3 bigint); +CREATE TABLE itest12 OF itest_type (f1 WITH OPTIONS GENERATED ALWAYS AS IDENTITY); -- error +DROP TYPE itest_type CASCADE; + + +-- table partitions (currently not supported) + +CREATE TABLE itest_parent (f1 date
NOT NULL, f2 text, f3 bigint) PARTITION BY RANGE (f1); +CREATE TABLE itest_child PARTITION OF itest_parent ( + f3 WITH OPTIONS GENERATED ALWAYS AS IDENTITY +) FOR VALUES FROM ('2016-07-01') TO ('2016-08-01'); -- error +DROP TABLE itest_parent; From 7764c73d5add98dc648c949ec2573c2eb32b09ce Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Mon, 11 Dec 2017 12:48:40 -0500 Subject: [PATCH 219/578] Improve comment about PartitionBoundInfoData. Ashutosh Bapat, per discussion with Julien Rouhaund, who also reviewed this patch. Discussion: http://postgr.es/m/CAFjFpReBR3ftK9C23LLCZY_TDXhhjB_dgE-L9+mfTnA=gkvdvQ@mail.gmail.com --- src/backend/catalog/partition.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/backend/catalog/partition.c b/src/backend/catalog/partition.c index c5550d1e..8bac934b 100644 --- a/src/backend/catalog/partition.c +++ b/src/backend/catalog/partition.c @@ -72,6 +72,13 @@ * of datum-tuples with 2 datums, modulus and remainder, corresponding to a * given partition. * + * The datums in datums array are arranged in increasing order as defined by + * functions qsort_partition_rbound_cmp(), qsort_partition_list_value_cmp() and + * qsort_partition_hbound_cmp() for range, list and hash partitioned tables + * respectively. For range and list partitions this simply means that the + * datums in the datums array are arranged in increasing order as defined by + * the partition key's operator classes and collations. + * * In the case of list partitioning, the indexes array stores one entry for * every datum, which is the index of the partition that accepts a given datum. * In case of range partitioning, it stores one entry per distinct range From 721fbc782ed010b64281a700b21da8698a9dfd9e Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Tue, 12 Dec 2017 10:52:15 -0500 Subject: [PATCH 220/578] Remove bug from OPTIMIZER_DEBUG code for partition-wise join. Etsuro Fujita, reviewed by Ashutosh Bapat Discussion: http://postgr.es/m/5A2A60E6.6000008@lab.ntt.co.jp --- src/backend/optimizer/path/allpaths.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c index ba5a4418..439e98ed 100644 --- a/src/backend/optimizer/path/allpaths.c +++ b/src/backend/optimizer/path/allpaths.c @@ -3436,7 +3436,7 @@ generate_partition_wise_join_paths(PlannerInfo *root, RelOptInfo *rel) set_cheapest(child_rel); #ifdef OPTIMIZER_DEBUG - debug_print_rel(root, rel); + debug_print_rel(root, child_rel); #endif live_children = lappend(live_children, child_rel); From f7c14be86e6f9801a1638b5332b634e431ef8e11 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Fri, 26 Jun 2020 15:21:42 +0800 Subject: [PATCH 221/578] Simplify and encapsulate tuple routing support code. --- src/backend/commands/copy.c | 86 ++++------------ src/backend/executor/execPartition.c | 108 +++++++++++-------- src/backend/executor/nodeModifyTable.c | 137 ++++++++++--------------- src/include/executor/execPartition.h | 47 +++++++-- src/include/nodes/execnodes.h | 9 +- 5 files changed, 182 insertions(+), 205 deletions(-) diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c index d0f69503..316356d8 100644 --- a/src/backend/commands/copy.c +++ b/src/backend/commands/copy.c @@ -208,12 +208,9 @@ typedef struct CopyStateData bool volatile_defexprs; /* is any of defexprs volatile? 
*/ List *range_table; - PartitionDispatch *partition_dispatch_info; - int num_dispatch; /* Number of entries in the above array */ - int num_partitions; /* Number of members in the following arrays */ - ResultRelInfo *partitions; /* Per partition result relation */ - TupleConversionMap **partition_tupconv_maps; - TupleTableSlot *partition_tuple_slot; + /* Tuple-routing support info */ + PartitionTupleRouting *partition_tuple_routing; + TransitionCaptureState *transition_capture; TupleConversionMap **transition_tupconv_maps; @@ -1636,27 +1633,10 @@ BeginCopy(ParseState *pstate, /* Initialize state for CopyFrom tuple routing. */ if (is_from && rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) { - PartitionDispatch *partition_dispatch_info; - ResultRelInfo *partitions; - TupleConversionMap **partition_tupconv_maps; - TupleTableSlot *partition_tuple_slot; - int num_parted, - num_partitions; - - ExecSetupPartitionTupleRouting(NULL, - rel, - 1, - &partition_dispatch_info, - &partitions, - &partition_tupconv_maps, - &partition_tuple_slot, - &num_parted, &num_partitions); - cstate->partition_dispatch_info = partition_dispatch_info; - cstate->num_dispatch = num_parted; - cstate->partitions = partitions; - cstate->num_partitions = num_partitions; - cstate->partition_tupconv_maps = partition_tupconv_maps; - cstate->partition_tuple_slot = partition_tuple_slot; + PartitionTupleRouting *proute; + + proute = cstate->partition_tuple_routing = + ExecSetupPartitionTupleRouting(NULL, cstate->rel, 1, estate); /* * If we are capturing transition tuples, they may need to be @@ -1669,12 +1649,11 @@ BeginCopy(ParseState *pstate, int i; cstate->transition_tupconv_maps = (TupleConversionMap **) - palloc0(sizeof(TupleConversionMap *) * - cstate->num_partitions); - for (i = 0; i < cstate->num_partitions; ++i) + palloc0(sizeof(TupleConversionMap *) * proute->num_partitions); + for (i = 0; i < proute->num_partitions; ++i) { cstate->transition_tupconv_maps[i] = - convert_tuples_by_name(RelationGetDescr(cstate->partitions[i].ri_RelationDesc), + convert_tuples_by_name(RelationGetDescr(proute->partitions[i]->ri_RelationDesc), RelationGetDescr(rel), gettext_noop("could not convert row type")); } @@ -3184,7 +3163,7 @@ CopyFrom(CopyState cstate) if ((resultRelInfo->ri_TrigDesc != NULL && (resultRelInfo->ri_TrigDesc->trig_insert_before_row || resultRelInfo->ri_TrigDesc->trig_insert_instead_row)) || - cstate->partition_dispatch_info != NULL || + cstate->partition_tuple_routing != NULL || cstate->volatile_defexprs) { useHeapMultiInsert = false; @@ -3459,10 +3438,11 @@ CopyFrom(CopyState cstate) ExecStoreTuple(tuple, slot, InvalidBuffer, false); /* Determine the partition to heap_insert the tuple into */ - if (cstate->partition_dispatch_info) + if (cstate->partition_tuple_routing) { int leaf_part_index; TupleConversionMap *map; + PartitionTupleRouting *proute = cstate->partition_tuple_routing; /* * Away we go ... If we end up not finding a partition after all, @@ -3473,11 +3453,11 @@ CopyFrom(CopyState cstate) * partition, respectively. */ leaf_part_index = ExecFindPartition(resultRelInfo, - cstate->partition_dispatch_info, + proute->partition_dispatch_info, slot, estate); Assert(leaf_part_index >= 0 && - leaf_part_index < cstate->num_partitions); + leaf_part_index < proute->num_partitions); /* * If this tuple is mapped to a partition that is not same as the @@ -3495,7 +3475,7 @@ CopyFrom(CopyState cstate) * to the selected partition. 
*/ saved_resultRelInfo = resultRelInfo; - resultRelInfo = cstate->partitions + leaf_part_index; + resultRelInfo = proute->partitions[leaf_part_index]; /* We do not yet have a way to insert into a foreign partition */ if (resultRelInfo->ri_FdwRoutine) @@ -3542,7 +3522,7 @@ CopyFrom(CopyState cstate) * We might need to convert from the parent rowtype to the * partition rowtype. */ - map = cstate->partition_tupconv_maps[leaf_part_index]; + map = proute->partition_tupconv_maps[leaf_part_index]; if (map) { Relation partrel = resultRelInfo->ri_RelationDesc; @@ -3554,7 +3534,7 @@ CopyFrom(CopyState cstate) * point on. Use a dedicated slot from this point on until * we're finished dealing with the partition. */ - slot = cstate->partition_tuple_slot; + slot = proute->partition_tuple_slot; Assert(slot != NULL); ExecSetSlotDescriptor(slot, RelationGetDescr(partrel)); ExecStoreTuple(tuple, slot, InvalidBuffer, true); @@ -4012,34 +3992,8 @@ CopyFrom(CopyState cstate) ExecCloseIndices(resultRelInfo); /* Close all the partitioned tables, leaf partitions, and their indices */ - if (cstate->partition_dispatch_info) - { - int i; - - /* - * Remember cstate->partition_dispatch_info[0] corresponds to the root - * partitioned table, which we must not try to close, because it is - * the main target table of COPY that will be closed eventually by - * DoCopy(). Also, tupslot is NULL for the root partitioned table. - */ - for (i = 1; i < cstate->num_dispatch; i++) - { - PartitionDispatch pd = cstate->partition_dispatch_info[i]; - - heap_close(pd->reldesc, NoLock); - ExecDropSingleTupleTableSlot(pd->tupslot); - } - for (i = 0; i < cstate->num_partitions; i++) - { - ResultRelInfo *resultRelInfo = cstate->partitions + i; - - ExecCloseIndices(resultRelInfo); - heap_close(resultRelInfo->ri_RelationDesc, NoLock); - } - - /* Release the standalone partition tuple descriptor */ - ExecDropSingleTupleTableSlot(cstate->partition_tuple_slot); - } + if (cstate->partition_tuple_routing) + ExecCleanupTupleRouting(cstate->partition_tuple_routing); /* Close any trigger target relations */ ExecCleanUpTriggerState(estate); diff --git a/src/backend/executor/execPartition.c b/src/backend/executor/execPartition.c index 08a27d71..0f4c8db3 100644 --- a/src/backend/executor/execPartition.c +++ b/src/backend/executor/execPartition.c @@ -38,58 +38,40 @@ static char *ExecBuildSlotPartitionKeyDescription(Relation rel, int maxfieldlen); /* - * ExecSetupPartitionTupleRouting - set up information needed during - * tuple routing for partitioned tables - * - * Output arguments: - * 'pd' receives an array of PartitionDispatch objects with one entry for - * every partitioned table in the partition tree - * 'partitions' receives an array of ResultRelInfo* objects with one entry for - * every leaf partition in the partition tree - * 'tup_conv_maps' receives an array of TupleConversionMap objects with one - * entry for every leaf partition (required to convert input tuple based - * on the root table's rowtype to a leaf partition's rowtype after tuple - * routing is done) - * 'partition_tuple_slot' receives a standalone TupleTableSlot to be used - * to manipulate any given leaf partition's rowtype after that partition - * is chosen by tuple-routing. 
- * 'num_parted' receives the number of partitioned tables in the partition - * tree (= the number of entries in the 'pd' output array) - * 'num_partitions' receives the number of leaf partitions in the partition - * tree (= the number of entries in the 'partitions' and 'tup_conv_maps' - * output arrays + * ExecSetupPartitionTupleRouting - sets up information needed during + * tuple routing for partitioned tables, encapsulates it in + * PartitionTupleRouting, and returns it. * * Note that all the relations in the partition tree are locked using the * RowExclusiveLock mode upon return from this function. */ -void +PartitionTupleRouting * ExecSetupPartitionTupleRouting(ModifyTableState *mtstate, - Relation rel, - Index resultRTindex, - EState *estate, - PartitionDispatch **pd, - ResultRelInfo ***partitions, - TupleConversionMap ***tup_conv_maps, - TupleTableSlot **partition_tuple_slot, - int *num_parted, int *num_partitions) + Relation rel, Index resultRTindex, + EState *estate) { TupleDesc tupDesc = RelationGetDescr(rel); List *leaf_parts; ListCell *cell; int i; ResultRelInfo *leaf_part_rri; + PartitionTupleRouting *proute; /* * Get the information about the partition tree after locking all the * partitions. */ (void) find_all_inheritors(RelationGetRelid(rel), RowExclusiveLock, NULL); - *pd = RelationGetPartitionDispatchInfo(rel, num_parted, &leaf_parts); - *num_partitions = list_length(leaf_parts); - *partitions = (ResultRelInfo **) palloc(*num_partitions * - sizeof(ResultRelInfo *)); - *tup_conv_maps = (TupleConversionMap **) palloc0(*num_partitions * - sizeof(TupleConversionMap *)); + proute = (PartitionTupleRouting *) palloc0(sizeof(PartitionTupleRouting)); + proute->partition_dispatch_info = + RelationGetPartitionDispatchInfo(rel, &proute->num_dispatch, + &leaf_parts); + proute->num_partitions = list_length(leaf_parts); + proute->partitions = (ResultRelInfo **) palloc(proute->num_partitions * + sizeof(ResultRelInfo *)); + proute->partition_tupconv_maps = + (TupleConversionMap **) palloc0(proute->num_partitions * + sizeof(TupleConversionMap *)); /* * Initialize an empty slot that will be used to manipulate tuples of any @@ -97,9 +79,9 @@ ExecSetupPartitionTupleRouting(ModifyTableState *mtstate, * (such as ModifyTableState) and released when the node finishes * processing. */ - *partition_tuple_slot = MakeTupleTableSlot(); + proute->partition_tuple_slot = MakeTupleTableSlot(); - leaf_part_rri = (ResultRelInfo *) palloc0(*num_partitions * + leaf_part_rri = (ResultRelInfo *) palloc0(proute->num_partitions * sizeof(ResultRelInfo)); i = 0; foreach(cell, leaf_parts) @@ -109,8 +91,8 @@ ExecSetupPartitionTupleRouting(ModifyTableState *mtstate, /* * We locked all the partitions above including the leaf partitions. - * Note that each of the relations in *partitions are eventually - * closed by the caller. + * Note that each of the relations in proute->partitions are + * eventually closed by the caller. */ partrel = heap_open(lfirst_oid(cell), NoLock); part_tupdesc = RelationGetDescr(partrel); @@ -119,8 +101,9 @@ ExecSetupPartitionTupleRouting(ModifyTableState *mtstate, * Save a tuple conversion map to convert a tuple routed to this * partition from the parent's type to the partition's. 
*/ - (*tup_conv_maps)[i] = convert_tuples_by_name(tupDesc, part_tupdesc, - gettext_noop("could not convert row type")); + proute->partition_tupconv_maps[i] = + convert_tuples_by_name(tupDesc, part_tupdesc, + gettext_noop("could not convert row type")); InitResultRelInfo(leaf_part_rri, partrel, @@ -149,9 +132,11 @@ ExecSetupPartitionTupleRouting(ModifyTableState *mtstate, estate->es_leaf_result_relations = lappend(estate->es_leaf_result_relations, leaf_part_rri); - (*partitions)[i] = leaf_part_rri++; + proute->partitions[i] = leaf_part_rri++; i++; } + + return proute; } /* @@ -272,6 +257,45 @@ ExecFindPartition(ResultRelInfo *resultRelInfo, PartitionDispatch *pd, return result; } +/* + * ExecCleanupTupleRouting -- Clean up objects allocated for partition tuple + * routing. + * + * Close all the partitioned tables, leaf partitions, and their indices. + */ +void +ExecCleanupTupleRouting(PartitionTupleRouting * proute) +{ + int i; + + /* + * Remember, proute->partition_dispatch_info[0] corresponds to the root + * partitioned table, which we must not try to close, because it is the + * main target table of the query that will be closed by callers such as + * ExecEndPlan() or DoCopy(). Also, tupslot is NULL for the root + * partitioned table. + */ + for (i = 1; i < proute->num_dispatch; i++) + { + PartitionDispatch pd = proute->partition_dispatch_info[i]; + + heap_close(pd->reldesc, NoLock); + ExecDropSingleTupleTableSlot(pd->tupslot); + } + + for (i = 0; i < proute->num_partitions; i++) + { + ResultRelInfo *resultRelInfo = proute->partitions[i]; + + ExecCloseIndices(resultRelInfo); + heap_close(resultRelInfo->ri_RelationDesc, NoLock); + } + + /* Release the standalone partition tuple descriptor, if any */ + if (proute->partition_tuple_slot) + ExecDropSingleTupleTableSlot(proute->partition_tuple_slot); +} + /* * RelationGetPartitionDispatchInfo * Returns information necessary to route tuples down a partition tree diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c index 749c320e..4c873634 100644 --- a/src/backend/executor/nodeModifyTable.c +++ b/src/backend/executor/nodeModifyTable.c @@ -318,32 +318,33 @@ ExecInsert(ModifyTableState *mtstate, } #endif /* Determine the partition to heap_insert the tuple into */ - if (mtstate->mt_partition_dispatch_info) + if (mtstate->mt_partition_tuple_routing) { int leaf_part_index; + PartitionTupleRouting *proute = mtstate->mt_partition_tuple_routing; TupleConversionMap *map; /* * Away we go ... If we end up not finding a partition after all, * ExecFindPartition() does not return and errors out instead. * Otherwise, the returned value is to be used as an index into arrays - * mt_partitions[] and mt_partition_tupconv_maps[] that will get us - * the ResultRelInfo and TupleConversionMap for the partition, + * proute->partitions[] and proute->partition_tupconv_maps[] that will + * get us the ResultRelInfo and TupleConversionMap for the partition, * respectively. */ leaf_part_index = ExecFindPartition(resultRelInfo, - mtstate->mt_partition_dispatch_info, + proute->partition_dispatch_info, slot, estate); Assert(leaf_part_index >= 0 && - leaf_part_index < mtstate->mt_num_partitions); + leaf_part_index < proute->num_partitions); /* * Save the old ResultRelInfo and switch to the one corresponding to * the selected partition. 
*/ saved_resultRelInfo = resultRelInfo; - resultRelInfo = mtstate->mt_partitions + leaf_part_index; + resultRelInfo = proute->partitions[leaf_part_index]; /* We do not yet have a way to insert into a foreign partition */ if (resultRelInfo->ri_FdwRoutine) @@ -388,7 +389,7 @@ ExecInsert(ModifyTableState *mtstate, * We might need to convert from the parent rowtype to the partition * rowtype. */ - map = mtstate->mt_partition_tupconv_maps[leaf_part_index]; + map = proute->partition_tupconv_maps[leaf_part_index]; if (map) { Relation partrel = resultRelInfo->ri_RelationDesc; @@ -400,7 +401,7 @@ ExecInsert(ModifyTableState *mtstate, * on, until we're finished dealing with the partition. Use the * dedicated slot for that. */ - slot = mtstate->mt_partition_tuple_slot; + slot = proute->partition_tuple_slot; Assert(slot != NULL); ExecSetSlotDescriptor(slot, RelationGetDescr(partrel)); ExecStoreTuple(tuple, slot, InvalidBuffer, true); @@ -1834,25 +1835,12 @@ ExecSetupTransitionCaptureState(ModifyTableState *mtstate, EState *estate) */ if (mtstate->mt_transition_capture != NULL) { - ResultRelInfo *resultRelInfos; int numResultRelInfos; + PartitionTupleRouting *proute = mtstate->mt_partition_tuple_routing; - /* Find the set of partitions so that we can find their TupleDescs. */ - if (mtstate->mt_partition_dispatch_info != NULL) - { - /* - * For INSERT via partitioned table, so we need TupleDescs based - * on the partition routing table. - */ - resultRelInfos = mtstate->mt_partitions; - numResultRelInfos = mtstate->mt_num_partitions; - } - else - { - /* Otherwise we need the ResultRelInfo for each subplan. */ - resultRelInfos = mtstate->resultRelInfo; - numResultRelInfos = mtstate->mt_nplans; - } + numResultRelInfos = (proute != NULL ? + proute->num_partitions : + mtstate->mt_nplans); /* * Build array of conversion maps from each child's TupleDesc to the @@ -1862,6 +1850,29 @@ ExecSetupTransitionCaptureState(ModifyTableState *mtstate, EState *estate) */ mtstate->mt_transition_tupconv_maps = (TupleConversionMap **) palloc0(sizeof(TupleConversionMap *) * numResultRelInfos); + + /* Choose the right set of partitions */ + if (proute != NULL) + { + /* + * For tuple routing among partitions, we need TupleDescs based + * on the partition routing table. + */ + ResultRelInfo **resultRelInfos = proute->partitions; + + for (i = 0; i < numResultRelInfos; ++i) + { + mtstate->mt_transition_tupconv_maps[i] = + convert_tuples_by_name(RelationGetDescr(resultRelInfos[i]->ri_RelationDesc), + RelationGetDescr(targetRelInfo->ri_RelationDesc), + gettext_noop("could not convert row type")); + } + } + else + { + /* Otherwise we need the ResultRelInfo for each subplan. 
*/ + ResultRelInfo *resultRelInfos = mtstate->resultRelInfo; + for (i = 0; i < numResultRelInfos; ++i) { mtstate->mt_transition_tupconv_maps[i] = @@ -1869,6 +1880,7 @@ ExecSetupTransitionCaptureState(ModifyTableState *mtstate, EState *estate) RelationGetDescr(targetRelInfo->ri_RelationDesc), gettext_noop("could not convert row type")); } + } /* * Install the conversion map for the first plan for UPDATE and DELETE @@ -2470,6 +2482,8 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) ListCell *l; int i; Relation rel; + PartitionTupleRouting *proute = NULL; + int num_partitions = 0; #ifdef __TBASE__ bool remote_dml = false; #endif @@ -2697,27 +2711,11 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) if (operation == CMD_INSERT && rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) { - PartitionDispatch *partition_dispatch_info; - ResultRelInfo *partitions; - TupleConversionMap **partition_tupconv_maps; - TupleTableSlot *partition_tuple_slot; - int num_parted, - num_partitions; - + proute = mtstate->mt_partition_tuple_routing = ExecSetupPartitionTupleRouting(mtstate, - rel, - node->nominalRelation, - &partition_dispatch_info, - &partitions, - &partition_tupconv_maps, - &partition_tuple_slot, - &num_parted, &num_partitions); - mtstate->mt_partition_dispatch_info = partition_dispatch_info; - mtstate->mt_num_dispatch = num_parted; - mtstate->mt_partitions = partitions; - mtstate->mt_num_partitions = num_partitions; - mtstate->mt_partition_tupconv_maps = partition_tupconv_maps; - mtstate->mt_partition_tuple_slot = partition_tuple_slot; + rel, node->nominalRelation, + estate); + num_partitions = proute->num_partitions; } /* Build state for collecting transition tuples */ @@ -2777,7 +2775,7 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) * will suffice. This only occurs for the INSERT case; UPDATE/DELETE * cases are handled above. */ - if (node->withCheckOptionLists != NIL && mtstate->mt_num_partitions > 0) + if (node->withCheckOptionLists != NIL && num_partitions > 0) { List *wcoList; PlanState *plan; @@ -2794,14 +2792,16 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) mtstate->mt_nplans == 1); wcoList = linitial(node->withCheckOptionLists); plan = mtstate->mt_plans[0]; - resultRelInfo = mtstate->mt_partitions; - for (i = 0; i < mtstate->mt_num_partitions; i++) + for (i = 0; i < num_partitions; i++) { - Relation partrel = resultRelInfo->ri_RelationDesc; + Relation partrel; List *mapped_wcoList; List *wcoExprs = NIL; ListCell *ll; + resultRelInfo = proute->partitions[i]; + partrel = resultRelInfo->ri_RelationDesc; + /* varno = node->nominalRelation */ mapped_wcoList = map_partition_varattnos(wcoList, node->nominalRelation, @@ -2817,7 +2817,6 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) resultRelInfo->ri_WithCheckOptions = mapped_wcoList; resultRelInfo->ri_WithCheckOptionExprs = wcoExprs; - resultRelInfo++; } } @@ -2879,13 +2878,15 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) * will suffice. This only occurs for the INSERT case; UPDATE/DELETE * are handled above. 
*/ - resultRelInfo = mtstate->mt_partitions; returningList = linitial(node->returningLists); - for (i = 0; i < mtstate->mt_num_partitions; i++) + for (i = 0; i < num_partitions; i++) { - Relation partrel = resultRelInfo->ri_RelationDesc; + Relation partrel; List *rlist; + resultRelInfo = proute->partitions[i]; + partrel = resultRelInfo->ri_RelationDesc; + /* varno = node->nominalRelation */ rlist = map_partition_varattnos(returningList, node->nominalRelation, @@ -2893,7 +2894,6 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) resultRelInfo->ri_projectReturning = ExecBuildProjectionInfo(rlist, econtext, slot, &mtstate->ps, resultRelInfo->ri_RelationDesc->rd_att); - resultRelInfo++; } } else @@ -3227,32 +3227,9 @@ ExecEndModifyTable(ModifyTableState *node) } #endif - /* - * Close all the partitioned tables, leaf partitions, and their indices - * - * Remember node->mt_partition_dispatch_info[0] corresponds to the root - * partitioned table, which we must not try to close, because it is the - * main target table of the query that will be closed by ExecEndPlan(). - * Also, tupslot is NULL for the root partitioned table. - */ - for (i = 1; i < node->mt_num_dispatch; i++) - { - PartitionDispatch pd = node->mt_partition_dispatch_info[i]; - - heap_close(pd->reldesc, NoLock); - ExecDropSingleTupleTableSlot(pd->tupslot); - } - for (i = 0; i < node->mt_num_partitions; i++) - { - ResultRelInfo *resultRelInfo = node->mt_partitions + i; - - ExecCloseIndices(resultRelInfo); - heap_close(resultRelInfo->ri_RelationDesc, NoLock); - } - - /* Release the standalone partition tuple descriptor, if any */ - if (node->mt_partition_tuple_slot) - ExecDropSingleTupleTableSlot(node->mt_partition_tuple_slot); + /* Close all the partitioned tables, leaf partitions, and their indices */ + if (node->mt_partition_tuple_routing) + ExecCleanupTupleRouting(node->mt_partition_tuple_routing); /* * Free the exprcontext diff --git a/src/include/executor/execPartition.h b/src/include/executor/execPartition.h index 703ff4f7..bea189c5 100644 --- a/src/include/executor/execPartition.h +++ b/src/include/executor/execPartition.h @@ -49,18 +49,47 @@ typedef struct PartitionDispatchData typedef struct PartitionDispatchData *PartitionDispatch; -extern void ExecSetupPartitionTupleRouting(ModifyTableState *mtstate, - Relation rel, - Index resultRTindex, - EState *estate, - PartitionDispatch **pd, - ResultRelInfo ***partitions, - TupleConversionMap ***tup_conv_maps, - TupleTableSlot **partition_tuple_slot, - int *num_parted, int *num_partitions); +/*----------------------- + * PartitionTupleRouting - Encapsulates all information required to execute + * tuple-routing between partitions. + * + * partition_dispatch_info Array of PartitionDispatch objects with one + * entry for every partitioned table in the + * partition tree. + * num_dispatch number of partitioned tables in the partition + * tree (= length of partition_dispatch_info[]) + * partitions Array of ResultRelInfo* objects with one entry + * for every leaf partition in the partition tree. 
+ * num_partitions Number of leaf partitions in the partition tree + * (= 'partitions' array length) + * partition_tupconv_maps Array of TupleConversionMap objects with one + * entry for every leaf partition (required to + * convert input tuple based on the root table's + * rowtype to a leaf partition's rowtype after + * tuple routing is done) + * partition_tuple_slot TupleTableSlot to be used to manipulate any + * given leaf partition's rowtype after that + * partition is chosen for insertion by + * tuple-routing. + *----------------------- + */ +typedef struct PartitionTupleRouting +{ + PartitionDispatch *partition_dispatch_info; + int num_dispatch; + ResultRelInfo **partitions; + int num_partitions; + TupleConversionMap **partition_tupconv_maps; + TupleTableSlot *partition_tuple_slot; +} PartitionTupleRouting; + +extern PartitionTupleRouting *ExecSetupPartitionTupleRouting(ModifyTableState *mtstate, + Relation rel, Index resultRTindex, + EState *estate); extern int ExecFindPartition(ResultRelInfo *resultRelInfo, PartitionDispatch *pd, TupleTableSlot *slot, EState *estate); +extern void ExecCleanupTupleRouting(PartitionTupleRouting *proute); #endif /* EXECPARTITION_H */ diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index 087b2223..3d7ece62 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -1066,15 +1066,8 @@ typedef struct ModifyTableState TupleTableSlot *mt_existing; /* slot to store existing target tuple in */ List *mt_excludedtlist; /* the excluded pseudo relation's tlist */ TupleTableSlot *mt_conflproj; /* CONFLICT ... SET ... projection target */ - struct PartitionDispatchData **mt_partition_dispatch_info; + struct PartitionTupleRouting *mt_partition_tuple_routing; /* Tuple-routing support info */ - int mt_num_dispatch; /* Number of entries in the above array */ - int mt_num_partitions; /* Number of members in the following - * arrays */ - ResultRelInfo *mt_partitions; /* Per partition result relation */ - TupleConversionMap **mt_partition_tupconv_maps; - /* Per partition tuple conversion map */ - TupleTableSlot *mt_partition_tuple_slot; struct TransitionCaptureState *mt_transition_capture; /* controls transition table population */ TupleConversionMap **mt_transition_tupconv_maps; From 39aff787446819eb96a5e4f3c7a1858cc3d7ae68 Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Thu, 4 Jan 2018 16:25:49 -0500 Subject: [PATCH 222/578] Minor preparatory refactoring for UPDATE row movement. Generalize is_partition_attr to has_partition_attrs and make it accessible from outside tablecmds.c. Change map_partition_varattnos to clarify that it can be used for mapping between any two relations in a partitioning hierarchy, not just parent -> child. Amit Khandekar, reviewed by Amit Langote, David Rowley, and me. Some comment changes by me. Discussion: http://postgr.es/m/CAJ3gD9fWfxgKC+PfJZF3hkgAcNOy-LpfPxVYitDEXKHjeieWQQ@mail.gmail.com --- src/backend/catalog/partition.c | 87 ++++++++++++++++++++++++++++---- src/backend/commands/tablecmds.c | 71 +++----------------------- src/include/catalog/partition.h | 6 ++- 3 files changed, 87 insertions(+), 77 deletions(-) diff --git a/src/backend/catalog/partition.c b/src/backend/catalog/partition.c index 8bac934b..08b58d74 100644 --- a/src/backend/catalog/partition.c +++ b/src/backend/catalog/partition.c @@ -1480,10 +1480,13 @@ get_qual_from_partbound(Relation rel, Relation parent, /* * map_partition_varattnos - maps varattno of any Vars in expr from the - * parent attno to partition attno. 
+ * attno's of 'from_rel' to the attno's of 'to_rel' partition, each of which + * may be either a leaf partition or a partitioned table, but both of which + * must be from the same partitioning hierarchy. * - * We must allow for cases where physical attnos of a partition can be - * different from the parent's. + * Even though all of the same column names must be present in all relations + * in the hierarchy, and they must also have the same types, the attnos may + * be different. * * If found_whole_row is not NULL, *found_whole_row returns whether a * whole-row variable was found in the input expression. @@ -1493,8 +1496,8 @@ get_qual_from_partbound(Relation rel, Relation parent, * are working on Lists, so it's less messy to do the casts internally. */ List * -map_partition_varattnos(List *expr, int target_varno, - Relation partrel, Relation parent, +map_partition_varattnos(List *expr, int fromrel_varno, + Relation to_rel, Relation from_rel, bool *found_whole_row) { bool my_found_whole_row = false; @@ -1503,14 +1506,14 @@ map_partition_varattnos(List *expr, int target_varno, { AttrNumber *part_attnos; - part_attnos = convert_tuples_by_name_map(RelationGetDescr(partrel), - RelationGetDescr(parent), + part_attnos = convert_tuples_by_name_map(RelationGetDescr(to_rel), + RelationGetDescr(from_rel), gettext_noop("could not convert row type")); expr = (List *) map_variable_attnos((Node *) expr, - target_varno, 0, + fromrel_varno, 0, part_attnos, - RelationGetDescr(parent)->natts, - RelationGetForm(partrel)->reltype, + RelationGetDescr(from_rel)->natts, + RelationGetForm(to_rel)->reltype, &my_found_whole_row); } @@ -2627,6 +2630,70 @@ get_partition_for_tuple(Relation relation, Datum *values, bool *isnull) return part_index; } +/* + * Checks if any of the 'attnums' is a partition key attribute for rel + * + * Sets *used_in_expr if any of the 'attnums' is found to be referenced in some + * partition key expression. It's possible for a column to be both used + * directly and as part of an expression; if that happens, *used_in_expr may + * end up as either true or false. That's OK for current uses of this + * function, because *used_in_expr is only used to tailor the error message + * text. 
+ */ +bool +has_partition_attrs(Relation rel, Bitmapset *attnums, + bool *used_in_expr) +{ + PartitionKey key; + int partnatts; + List *partexprs; + ListCell *partexprs_item; + int i; + + if (attnums == NULL || rel->rd_rel->relkind != RELKIND_PARTITIONED_TABLE) + return false; + + key = RelationGetPartitionKey(rel); + partnatts = get_partition_natts(key); + partexprs = get_partition_exprs(key); + + partexprs_item = list_head(partexprs); + for (i = 0; i < partnatts; i++) + { + AttrNumber partattno = get_partition_col_attnum(key, i); + + if (partattno != 0) + { + if (bms_is_member(partattno - FirstLowInvalidHeapAttributeNumber, + attnums)) + { + if (used_in_expr) + *used_in_expr = false; + return true; + } + } + else + { + /* Arbitrary expression */ + Node *expr = (Node *) lfirst(partexprs_item); + Bitmapset *expr_attrs = NULL; + + /* Find all attributes referenced */ + pull_varattnos(expr, 1, &expr_attrs); + partexprs_item = lnext(partexprs_item); + + if (bms_overlap(attnums, expr_attrs)) + { + if (used_in_expr) + *used_in_expr = true; + return true; + } + } + } + + return false; +} + /* * qsort_partition_hbound_cmp * diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index 74b82ebf..536b8661 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -532,7 +532,6 @@ static void RangeVarCallbackForDropRelation(const RangeVar *rel, Oid relOid, Oid oldRelOid, void *arg); static void RangeVarCallbackForAlterRelation(const RangeVar *rv, Oid relid, Oid oldrelid, void *arg); -static bool is_partition_attr(Relation rel, AttrNumber attnum, bool *used_in_expr); static PartitionSpec *transformPartitionSpec(Relation rel, PartitionSpec *partspec, char *strategy); static void ComputePartitionAttrs(Relation rel, List *partParams, AttrNumber *partattrs, List **partexprs, Oid *partopclass, Oid *partcollation, char strategy); @@ -8128,68 +8127,6 @@ ATPrepDropColumn(List **wqueue, Relation rel, bool recurse, bool recursing, cmd->subtype = AT_DropColumnRecurse; } -/* - * Checks if attnum is a partition attribute for rel - * - * Sets *used_in_expr if attnum is found to be referenced in some partition - * key expression. It's possible for a column to be both used directly and - * as part of an expression; if that happens, *used_in_expr may end up as - * either true or false. That's OK for current uses of this function, because - * *used_in_expr is only used to tailor the error message text. 
- */ -static bool -is_partition_attr(Relation rel, AttrNumber attnum, bool *used_in_expr) -{// #lizard forgives - PartitionKey key; - int partnatts; - List *partexprs; - ListCell *partexprs_item; - int i; - - if (rel->rd_rel->relkind != RELKIND_PARTITIONED_TABLE) - return false; - - key = RelationGetPartitionKey(rel); - partnatts = get_partition_natts(key); - partexprs = get_partition_exprs(key); - - partexprs_item = list_head(partexprs); - for (i = 0; i < partnatts; i++) - { - AttrNumber partattno = get_partition_col_attnum(key, i); - - if (partattno != 0) - { - if (attnum == partattno) - { - if (used_in_expr) - *used_in_expr = false; - return true; - } - } - else - { - /* Arbitrary expression */ - Node *expr = (Node *) lfirst(partexprs_item); - Bitmapset *expr_attrs = NULL; - - /* Find all attributes referenced */ - pull_varattnos(expr, 1, &expr_attrs); - partexprs_item = lnext(partexprs_item); - - if (bms_is_member(attnum - FirstLowInvalidHeapAttributeNumber, - expr_attrs)) - { - if (used_in_expr) - *used_in_expr = true; - return true; - } - } - } - - return false; -} - /* * Return value is the address of the dropped column. */ @@ -8250,7 +8187,9 @@ ATExecDropColumn(List **wqueue, Relation rel, const char *colName, colName))); /* Don't drop columns used in the partition key */ - if (is_partition_attr(rel, attnum, &is_expr)) + if (has_partition_attrs(rel, + bms_make_singleton(attnum - FirstLowInvalidHeapAttributeNumber), + &is_expr)) { if (!is_expr) ereport(ERROR, @@ -10699,7 +10638,9 @@ ATPrepAlterColumnType(List **wqueue, colName))); /* Don't alter columns used in the partition key */ - if (is_partition_attr(rel, attnum, &is_expr)) + if (has_partition_attrs(rel, + bms_make_singleton(attnum - FirstLowInvalidHeapAttributeNumber), + &is_expr)) { if (!is_expr) ereport(ERROR, diff --git a/src/include/catalog/partition.h b/src/include/catalog/partition.h index 295e9d22..3d8b08ba 100644 --- a/src/include/catalog/partition.h +++ b/src/include/catalog/partition.h @@ -54,11 +54,13 @@ extern void check_new_partition_bound(char *relname, Relation parent, extern Oid get_partition_parent(Oid relid); extern List *get_qual_from_partbound(Relation rel, Relation parent, PartitionBoundSpec *spec); -extern List *map_partition_varattnos(List *expr, int target_varno, - Relation partrel, Relation parent, +extern List *map_partition_varattnos(List *expr, int fromrel_varno, + Relation to_rel, Relation from_rel, bool *found_whole_row); extern List *RelationGetPartitionQual(Relation rel); extern Expr *get_partition_qual_relid(Oid relid); +extern bool has_partition_attrs(Relation rel, Bitmapset *attnums, + bool *used_in_expr); extern Oid get_default_oid_from_partdesc(PartitionDesc partdesc); extern Oid get_default_partition_oid(Oid parentId); From abe8c72d16cc3a352fa34825d0ab6d719717f373 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Fri, 26 Jun 2020 16:11:19 +0800 Subject: [PATCH 223/578] Factor error generation out of ExecPartitionCheck.http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/backend/commands/copy.c | 2 +- src/backend/executor/execMain.c | 50 ++++++++++++++++---------- src/backend/executor/execPartition.c | 5 +-- src/backend/executor/execReplication.c | 4 +-- src/backend/executor/nodeModifyTable.c | 4 +-- src/include/executor/executor.h | 5 ++- 6 files changed, 44 insertions(+), 26 deletions(-) diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c index 316356d8..ab6834e0 100644 --- a/src/backend/commands/copy.c +++ 
b/src/backend/commands/copy.c @@ -3585,7 +3585,7 @@ CopyFrom(CopyState cstate) /* Check the constraints of the tuple */ if (cstate->rel->rd_att->constr || check_partition_constr) - ExecConstraints(resultRelInfo, slot, estate); + ExecConstraints(resultRelInfo, slot, estate, true); #ifdef _MLS_ if (is_mls_user()) diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c index a4978497..63de1a27 100644 --- a/src/backend/executor/execMain.c +++ b/src/backend/executor/execMain.c @@ -2210,16 +2210,12 @@ ExecRelCheck(ResultRelInfo *resultRelInfo, * ExecPartitionCheck --- check that tuple meets the partition constraint. * * Exported in executor.h for outside use. + * Returns true if it meets the partition constraint, else returns false. */ -void +bool ExecPartitionCheck(ResultRelInfo *resultRelInfo, TupleTableSlot *slot, EState *estate) { - Relation rel = resultRelInfo->ri_RelationDesc; - TupleDesc tupdesc = RelationGetDescr(rel); - Bitmapset *modifiedCols; - Bitmapset *insertedCols; - Bitmapset *updatedCols; ExprContext *econtext; /* @@ -2247,12 +2243,29 @@ ExecPartitionCheck(ResultRelInfo *resultRelInfo, TupleTableSlot *slot, * As in case of the catalogued constraints, we treat a NULL result as * success here, not a failure. */ - if (!ExecCheck(resultRelInfo->ri_PartitionCheckExpr, econtext)) + return ExecCheck(resultRelInfo->ri_PartitionCheckExpr, econtext); +} + +/* + * ExecPartitionCheckEmitError - Form and emit an error message after a failed + * partition constraint check. + */ +void +ExecPartitionCheckEmitError(ResultRelInfo *resultRelInfo, + TupleTableSlot *slot, + EState *estate) { - char *val_desc; + Relation rel = resultRelInfo->ri_RelationDesc; Relation orig_rel = rel; - - /* See the comment above. */ + TupleDesc tupdesc = RelationGetDescr(rel); + char *val_desc; + Bitmapset *modifiedCols; + Bitmapset *insertedCols; + Bitmapset *updatedCols; + /* + * Need to first convert the tuple to the root partitioned table's row + * type. For details, check similar comments in ExecConstraints(). + */ if (resultRelInfo->ri_PartitionRoot) { HeapTuple tuple = ExecFetchSlotTuple(slot); @@ -2266,7 +2279,7 @@ ExecPartitionCheck(ResultRelInfo *resultRelInfo, TupleTableSlot *slot, gettext_noop("could not convert row type")); if (map != NULL) { - tuple = do_convert_tuple(tuple, map, rel); + tuple = do_convert_tuple(tuple, map); ExecSetSlotDescriptor(slot, tupdesc); ExecStoreTuple(tuple, slot, InvalidBuffer, false); } @@ -2286,13 +2299,12 @@ ExecPartitionCheck(ResultRelInfo *resultRelInfo, TupleTableSlot *slot, RelationGetRelationName(orig_rel)), val_desc ? errdetail("Failing row contains %s.", val_desc) : 0)); } -} /* * ExecConstraints - check constraints of the tuple in 'slot' * - * This checks the traditional NOT NULL and check constraints, as well as - * the partition constraint, if any. + * This checks the traditional NOT NULL and check constraints, and if + * requested, checks the partition constraint. * * Note: 'slot' contains the tuple to check the constraints of, which may * have been converted from the original input tuple after tuple routing. 
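Taken together, the new entry points split what used to be a single behaviour into a question and an action: ExecPartitionCheck() now only answers whether the tuple satisfies the partition constraint, and ExecPartitionCheckEmitError() produces the traditional error when the caller wants it. The sketch below shows the resulting caller patterns using the signatures introduced in this patch; it is illustrative only, and the mention of row movement anticipates later patches in this series rather than anything committed here.

    /*
     * A caller that can react to a misrouted tuple checks the boolean
     * itself; emitting the error reproduces the pre-patch behaviour, while
     * a different branch here could instead re-route the tuple, as the
     * UPDATE row-movement work this series builds toward is expected to do.
     */
    if (resultRelInfo->ri_PartitionCheck &&
        !ExecPartitionCheck(resultRelInfo, slot, estate))
        ExecPartitionCheckEmitError(resultRelInfo, slot, estate);

    /*
     * A caller that wants the old all-or-nothing behaviour passes
     * check_partition_constraint = true and lets ExecConstraints() raise
     * the error on its own.
     */
    ExecConstraints(resultRelInfo, slot, estate, true);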
@@ -2300,8 +2312,9 @@ ExecPartitionCheck(ResultRelInfo *resultRelInfo, TupleTableSlot *slot, */ void ExecConstraints(ResultRelInfo *resultRelInfo, - TupleTableSlot *slot, EState *estate) -{// #lizard forgives + TupleTableSlot *slot, EState *estate, + bool check_partition_constraint) +{ Relation rel = resultRelInfo->ri_RelationDesc; TupleDesc tupdesc = RelationGetDescr(rel); TupleConstr *constr = tupdesc->constr; @@ -2415,8 +2428,9 @@ ExecConstraints(ResultRelInfo *resultRelInfo, } } - if (resultRelInfo->ri_PartitionCheck) - ExecPartitionCheck(resultRelInfo, slot, estate); + if (check_partition_constraint && resultRelInfo->ri_PartitionCheck && + !ExecPartitionCheck(resultRelInfo, slot, estate)) + ExecPartitionCheckEmitError(resultRelInfo, slot, estate); } diff --git a/src/backend/executor/execPartition.c b/src/backend/executor/execPartition.c index 0f4c8db3..b1dfe5a9 100644 --- a/src/backend/executor/execPartition.c +++ b/src/backend/executor/execPartition.c @@ -167,8 +167,9 @@ ExecFindPartition(ResultRelInfo *resultRelInfo, PartitionDispatch *pd, * First check the root table's partition constraint, if any. No point in * routing the tuple if it doesn't belong in the root table itself. */ - if (resultRelInfo->ri_PartitionCheck) - ExecPartitionCheck(resultRelInfo, slot, estate); + if (resultRelInfo->ri_PartitionCheck && + !ExecPartitionCheck(resultRelInfo, slot, estate)) + ExecPartitionCheckEmitError(resultRelInfo, slot, estate); /* start with the root partitioned table */ parent = pd[0]; diff --git a/src/backend/executor/execReplication.c b/src/backend/executor/execReplication.c index 837b903f..c0b6f4a0 100644 --- a/src/backend/executor/execReplication.c +++ b/src/backend/executor/execReplication.c @@ -404,7 +404,7 @@ ExecSimpleRelationInsert(EState *estate, TupleTableSlot *slot) /* Check the constraints of the tuple */ if (rel->rd_att->constr) - ExecConstraints(resultRelInfo, slot, estate); + ExecConstraints(resultRelInfo, slot, estate, true); #ifdef _MLS_ if (is_mls_user()) @@ -491,7 +491,7 @@ ExecSimpleRelationUpdate(EState *estate, EPQState *epqstate, /* Check the constraints of the tuple */ if (rel->rd_att->constr) - ExecConstraints(resultRelInfo, slot, estate); + ExecConstraints(resultRelInfo, slot, estate, true); #ifdef _MLS_ if (is_mls_user()) diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c index 4c873634..9eba56d3 100644 --- a/src/backend/executor/nodeModifyTable.c +++ b/src/backend/executor/nodeModifyTable.c @@ -628,7 +628,7 @@ ExecInsert(ModifyTableState *mtstate, /* Check the constraints of the tuple */ if (resultRelationDesc->rd_att->constr || check_partition_constr) - ExecConstraints(resultRelInfo, slot, estate); + ExecConstraints(resultRelInfo, slot, estate, true); #ifdef _MLS_ if (is_mls_user()) @@ -1367,7 +1367,7 @@ lreplace:; * tuple-routing is performed here, hence the slot remains unchanged. 
*/ if (resultRelationDesc->rd_att->constr || resultRelInfo->ri_PartitionCheck) - ExecConstraints(resultRelInfo, slot, estate); + ExecConstraints(resultRelInfo, slot, estate, true); #ifdef _MLS_ if (is_mls_user()) diff --git a/src/include/executor/executor.h b/src/include/executor/executor.h index 4ea9ef52..a143cd77 100644 --- a/src/include/executor/executor.h +++ b/src/include/executor/executor.h @@ -205,8 +205,11 @@ extern ResultRelInfo *ExecGetTriggerResultRel(EState *estate, Oid relid); extern void ExecCleanUpTriggerState(EState *estate); extern bool ExecContextForcesOids(PlanState *planstate, bool *hasoids); extern void ExecConstraints(ResultRelInfo *resultRelInfo, + TupleTableSlot *slot, EState *estate, + bool check_partition_constraint); +extern bool ExecPartitionCheck(ResultRelInfo *resultRelInfo, TupleTableSlot *slot, EState *estate); -extern void ExecPartitionCheck(ResultRelInfo *resultRelInfo, +extern void ExecPartitionCheckEmitError(ResultRelInfo *resultRelInfo, TupleTableSlot *slot, EState *estate); extern void ExecWithCheckOptions(WCOKind kind, ResultRelInfo *resultRelInfo, TupleTableSlot *slot, EState *estate); From 16b9bc37f1e1abccb9422ad3ad5572ee6792eafb Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Wed, 17 Jan 2018 14:44:15 -0500 Subject: [PATCH 224/578] Remove useless lookup of root partitioned rel in ExecInitModifyTable(). node->partitioned_rels is only set in UPDATE/DELETE cases, but ExecInitModifyTable only uses its "rel" variable in INSERT cases, so the extra logic to find the root rel is just a waste of complexity and cycles. Etsuro Fujita, reviewed by Amit Langote Discussion: https://postgr.es/m/93cf9816-2f7d-0f67-8ed2-4a4e497a6ab8@lab.ntt.co.jp --- src/backend/executor/nodeModifyTable.c | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c index 9eba56d3..a7ed98e7 100644 --- a/src/backend/executor/nodeModifyTable.c +++ b/src/backend/executor/nodeModifyTable.c @@ -47,7 +47,6 @@ #include "foreign/fdwapi.h" #include "miscadmin.h" #include "nodes/nodeFuncs.h" -#include "parser/parsetree.h" #include "storage/bufmgr.h" #include "storage/lmgr.h" #include "utils/builtins.h" @@ -2694,20 +2693,8 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) estate->es_result_relation_info = saved_resultRelInfo; - /* The root table RT index is at the head of the partitioned_rels list */ - if (node->partitioned_rels) - { - Index root_rti; - Oid root_oid; - - root_rti = linitial_int(node->partitioned_rels); - root_oid = getrelid(root_rti, estate->es_range_table); - rel = heap_open(root_oid, NoLock); /* locked by InitPlan */ - } - else - rel = mtstate->resultRelInfo->ri_RelationDesc; - /* Build state for INSERT tuple routing */ + rel = mtstate->resultRelInfo->ri_RelationDesc; if (operation == CMD_INSERT && rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) { @@ -2909,10 +2896,6 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) mtstate->ps.ps_ExprContext = NULL; } - /* Close the root partitioned rel if we opened it above. */ - if (rel != mtstate->resultRelInfo->ri_RelationDesc) - heap_close(rel, NoLock); - /* * If needed, Initialize target list, projection and qual for ON CONFLICT * DO UPDATE. From d503d3212ce54a7cc3f725a0ec867cc745b08315 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Fri, 26 Jun 2020 17:57:02 +0800 Subject: [PATCH 225/578] Allow UPDATE to move rows between partitions. 
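For illustration (this example is not taken from the original commit message; the table and column names are invented), the user-visible effect is:

    CREATE TABLE range_parted (a int, b int) PARTITION BY RANGE (b);
    CREATE TABLE part_b_1_10  PARTITION OF range_parted FOR VALUES FROM (1) TO (10);
    CREATE TABLE part_b_10_20 PARTITION OF range_parted FOR VALUES FROM (10) TO (20);
    INSERT INTO range_parted VALUES (1, 5);

    -- Previously this failed with a partition constraint violation; now the row
    -- is deleted from part_b_1_10 and re-inserted into part_b_10_20.
    UPDATE range_parted SET b = 15 WHERE a = 1;

    -- If no partition accepts the new key value, the UPDATE still raises an error.
    UPDATE range_parted SET b = 100 WHERE a = 1;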
http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- doc/src/sgml/ddl.sgml | 24 +- doc/src/sgml/ref/update.sgml | 13 +- doc/src/sgml/trigger.sgml | 23 + src/backend/commands/copy.c | 40 +- src/backend/commands/trigger.c | 52 +- src/backend/executor/execPartition.c | 241 ++++++++- src/backend/executor/nodeModifyTable.c | 558 +++++++++++++++---- src/backend/nodes/copyfuncs.c | 2 + src/backend/nodes/equalfuncs.c | 1 + src/backend/nodes/outfuncs.c | 3 + src/backend/nodes/readfuncs.c | 1 + src/backend/optimizer/path/allpaths.c | 4 +- src/backend/optimizer/plan/createplan.c | 4 + src/backend/optimizer/plan/planner.c | 19 +- src/backend/optimizer/prep/prepunion.c | 28 +- src/backend/optimizer/util/pathnode.c | 4 + src/include/executor/execPartition.h | 34 +- src/include/nodes/execnodes.h | 4 +- src/include/nodes/plannodes.h | 1 + src/include/nodes/relation.h | 3 + src/include/optimizer/pathnode.h | 1 + src/include/optimizer/planner.h | 3 +- src/test/regress/expected/update.out | 683 ++++++++++++++++++++++-- src/test/regress/sql/update.sql | 458 +++++++++++++++- src/tools/pgindent/typedefs.list | 1 + 25 files changed, 1945 insertions(+), 260 deletions(-) diff --git a/doc/src/sgml/ddl.sgml b/doc/src/sgml/ddl.sgml index 168c5f54..cf41e569 100644 --- a/doc/src/sgml/ddl.sgml +++ b/doc/src/sgml/ddl.sgml @@ -3275,6 +3275,11 @@ VALUES ('Albany', NULL, NULL, 'NY'); foreign table partitions. + + Updating the partition key of a row might cause it to be moved into a + different partition where this row satisfies its partition constraint. + + Example @@ -3572,9 +3577,22 @@ ALTER TABLE measurement ATTACH PARTITION measurement_y2008m02 - An UPDATE that causes a row to move from one partition to - another fails, because the new value of the row fails to satisfy the - implicit partition constraint of the original partition. + When an UPDATE causes a row to move from one + partition to another, there is a chance that another concurrent + UPDATE or DELETE misses this row. + Suppose session 1 is performing an UPDATE on a + partition key, and meanwhile a concurrent session 2 for which this row + is visible performs an UPDATE or + DELETE operation on this row. Session 2 can silently + miss the row if the row is deleted from the partition due to session + 1's activity. In such case, session 2's + UPDATE or DELETE, being unaware of + the row movement thinks that the row has just been deleted and concludes + that there is nothing to be done for this row. In the usual case where + the table is not partitioned, or where there is no row movement, + session 2 would have identified the newly updated row and carried out + the UPDATE/DELETE on this new row + version. diff --git a/doc/src/sgml/ref/update.sgml b/doc/src/sgml/ref/update.sgml index b84fd93a..2fd709b8 100644 --- a/doc/src/sgml/ref/update.sgml +++ b/doc/src/sgml/ref/update.sgml @@ -288,10 +288,15 @@ UPDATE count In the case of a partitioned table, updating a row might cause it to no - longer satisfy the partition constraint. Since there is no provision to - move the row to the partition appropriate to the new value of its - partitioning key, an error will occur in this case. This can also happen - when updating a partition directly. + longer satisfy the partition constraint of the containing partition. In that + case, if there is some other partition in the partition tree for which this + row satisfies its partition constraint, then the row is moved to that + partition. If there is no such partition, an error will occur. 
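To make the concurrency caveat above concrete, here is a sketch of the interleaving being described (hypothetical table and column names, default READ COMMITTED isolation):

    -- Session 1: update the partition key, which moves the row to another partition.
    BEGIN;
    UPDATE parted SET b = 15 WHERE id = 1;

    -- Session 2, concurrently: blocks waiting on session 1's lock on the old row.
    UPDATE parted SET note = 'x' WHERE id = 1;

    -- Session 1:
    COMMIT;

    -- Session 2 resumes, but the old row version has been deleted by the row
    -- movement, so it reports UPDATE 0; the moved row in the other partition is
    -- left unmodified.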
Behind the + scenes, the row movement is actually a DELETE and + INSERT operation. However, there is a possibility that a + concurrent UPDATE or DELETE on the + same row may miss this row. For details see the section + . diff --git a/doc/src/sgml/trigger.sgml b/doc/src/sgml/trigger.sgml index ea29a097..44a9a3c9 100644 --- a/doc/src/sgml/trigger.sgml +++ b/doc/src/sgml/trigger.sgml @@ -163,6 +163,29 @@ triggers. + + If an UPDATE on a partitioned table causes a row to move + to another partition, it will be performed as a DELETE + from the original partition followed by an INSERT into + the new partition. In this case, all row-level BEFORE + UPDATE triggers and all row-level + BEFORE DELETE triggers are fired on + the original partition. Then all row-level BEFORE + INSERT triggers are fired on the destination partition. + The possibility of surprising outcomes should be considered when all these + triggers affect the row being moved. As far as AFTER ROW + triggers are concerned, AFTER DELETE + and AFTER INSERT triggers are + applied; but AFTER UPDATE triggers + are not applied because the UPDATE has been converted to + a DELETE and an INSERT. As far as + statement-level triggers are concerned, none of the + DELETE or INSERT triggers are fired, + even if row movement occurs; only the UPDATE triggers + defined on the target table used in the UPDATE statement + will be fired. + + Trigger functions invoked by per-statement triggers should always return NULL. Trigger functions invoked by per-row diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c index ab6834e0..cf770f46 100644 --- a/src/backend/commands/copy.c +++ b/src/backend/commands/copy.c @@ -212,7 +212,6 @@ typedef struct CopyStateData PartitionTupleRouting *partition_tuple_routing; TransitionCaptureState *transition_capture; - TupleConversionMap **transition_tupconv_maps; /* * These variables are used to reduce overhead in textual COPY FROM. @@ -1645,19 +1644,7 @@ BeginCopy(ParseState *pstate, * modifies the tuple). */ if (cstate->transition_capture != NULL) - { - int i; - - cstate->transition_tupconv_maps = (TupleConversionMap **) - palloc0(sizeof(TupleConversionMap *) * proute->num_partitions); - for (i = 0; i < proute->num_partitions; ++i) - { - cstate->transition_tupconv_maps[i] = - convert_tuples_by_name(RelationGetDescr(proute->partitions[i]->ri_RelationDesc), - RelationGetDescr(rel), - gettext_noop("could not convert row type")); - } - } + ExecSetupChildParentMapForLeaf(proute); } #ifdef PGXC /* Get copy statement and execution node information */ @@ -3441,7 +3428,6 @@ CopyFrom(CopyState cstate) if (cstate->partition_tuple_routing) { int leaf_part_index; - TupleConversionMap *map; PartitionTupleRouting *proute = cstate->partition_tuple_routing; /* @@ -3505,7 +3491,8 @@ CopyFrom(CopyState cstate) */ cstate->transition_capture->tcs_original_insert_tuple = NULL; cstate->transition_capture->tcs_map = - cstate->transition_tupconv_maps[leaf_part_index]; + TupConvMapForLeaf(proute, saved_resultRelInfo, + leaf_part_index); } else { @@ -3522,23 +3509,10 @@ CopyFrom(CopyState cstate) * We might need to convert from the parent rowtype to the * partition rowtype. */ - map = proute->partition_tupconv_maps[leaf_part_index]; - if (map) - { - Relation partrel = resultRelInfo->ri_RelationDesc; - - tuple = do_convert_tuple(tuple, map, partrel); - - /* - * We must use the partition's tuple descriptor from this - * point on. Use a dedicated slot from this point on until - * we're finished dealing with the partition. 
- */ - slot = proute->partition_tuple_slot; - Assert(slot != NULL); - ExecSetSlotDescriptor(slot, RelationGetDescr(partrel)); - ExecStoreTuple(tuple, slot, InvalidBuffer, true); - } + tuple = ConvertPartitionTupleSlot(proute->parent_child_tupconv_maps[leaf_part_index], + tuple, + proute->partition_tuple_slot, + &slot); tuple->t_tableOid = RelationGetRelid(resultRelInfo->ri_RelationDesc); } diff --git a/src/backend/commands/trigger.c b/src/backend/commands/trigger.c index 6412550b..7a2a05ff 100644 --- a/src/backend/commands/trigger.c +++ b/src/backend/commands/trigger.c @@ -2994,8 +2994,13 @@ ExecARUpdateTriggers(EState *estate, ResultRelInfo *relinfo, { HeapTuple trigtuple; - Assert(HeapTupleIsValid(fdw_trigtuple) ^ ItemPointerIsValid(tupleid)); - if (fdw_trigtuple == NULL) + /* + * Note: if the UPDATE is converted into a DELETE+INSERT as part of + * update-partition-key operation, then this function is also called + * separately for DELETE and INSERT to capture transition table rows. + * In such case, either old tuple or new tuple can be NULL. + */ + if (fdw_trigtuple == NULL && ItemPointerIsValid(tupleid)) trigtuple = GetTupleForTrigger(estate, NULL, relinfo, @@ -5559,7 +5564,12 @@ AfterTriggerPendingOnRel(Oid relid) * triggers actually need to be queued. It is also called after each row, * even if there are no triggers for that event, if there are any AFTER * STATEMENT triggers for the statement which use transition tables, so that - * the transition tuplestores can be built. + * the transition tuplestores can be built. Furthermore, if the transition + * capture is happening for UPDATEd rows being moved to another partition due + * to the partition-key being changed, then this function is called once when + * the row is deleted (to capture OLD row), and once when the row is inserted + * into another partition (to capture NEW row). This is done separately because + * DELETE and INSERT happen on different tables. * * Transition tuplestores are built now, rather than when events are pulled * off of the queue because AFTER ROW triggers are allowed to select from the @@ -5612,12 +5622,25 @@ AfterTriggerSaveEvent(EState *estate, ResultRelInfo *relinfo, bool update_new_table = transition_capture->tcs_update_new_table; bool insert_new_table = transition_capture->tcs_insert_new_table;; - if ((event == TRIGGER_EVENT_DELETE && delete_old_table) || - (event == TRIGGER_EVENT_UPDATE && update_old_table)) + /* + * For INSERT events newtup should be non-NULL, for DELETE events + * oldtup should be non-NULL, whereas for UPDATE events normally both + * oldtup and newtup are non-NULL. But for UPDATE events fired for + * capturing transition tuples during UPDATE partition-key row + * movement, oldtup is NULL when the event is for a row being inserted, + * whereas newtup is NULL when the event is for a row being deleted. 
+ */ + Assert(!(event == TRIGGER_EVENT_DELETE && delete_old_table && + oldtup == NULL)); + Assert(!(event == TRIGGER_EVENT_INSERT && insert_new_table && + newtup == NULL)); + + if (oldtup != NULL && + ((event == TRIGGER_EVENT_DELETE && delete_old_table) || + (event == TRIGGER_EVENT_UPDATE && update_old_table))) { Tuplestorestate *old_tuplestore; - Assert(oldtup != NULL); old_tuplestore = transition_capture->tcs_old_tuplestore; if (map != NULL) @@ -5630,12 +5653,12 @@ AfterTriggerSaveEvent(EState *estate, ResultRelInfo *relinfo, else tuplestore_puttuple(old_tuplestore, oldtup); } - if ((event == TRIGGER_EVENT_INSERT && insert_new_table) || - (event == TRIGGER_EVENT_UPDATE && update_new_table)) + if (newtup != NULL && + ((event == TRIGGER_EVENT_INSERT && insert_new_table) || + (event == TRIGGER_EVENT_UPDATE && update_new_table))) { Tuplestorestate *new_tuplestore; - Assert(newtup != NULL); if (event == TRIGGER_EVENT_INSERT) new_tuplestore = transition_capture->tcs_insert_tuplestore; else @@ -5654,11 +5677,18 @@ AfterTriggerSaveEvent(EState *estate, ResultRelInfo *relinfo, tuplestore_puttuple(new_tuplestore, newtup); } - /* If transition tables are the only reason we're here, return. */ + /* + * If transition tables are the only reason we're here, return. As + * mentioned above, we can also be here during update tuple routing in + * presence of transition tables, in which case this function is called + * separately for oldtup and newtup, so we expect exactly one of them + * to be NULL. + */ if (trigdesc == NULL || (event == TRIGGER_EVENT_DELETE && !trigdesc->trig_delete_after_row) || (event == TRIGGER_EVENT_INSERT && !trigdesc->trig_insert_after_row) || - (event == TRIGGER_EVENT_UPDATE && !trigdesc->trig_update_after_row)) + (event == TRIGGER_EVENT_UPDATE && !trigdesc->trig_update_after_row) || + (event == TRIGGER_EVENT_UPDATE && ((oldtup == NULL) ^ (newtup == NULL)))) return; } diff --git a/src/backend/executor/execPartition.c b/src/backend/executor/execPartition.c index b1dfe5a9..a08f308f 100644 --- a/src/backend/executor/execPartition.c +++ b/src/backend/executor/execPartition.c @@ -54,7 +54,11 @@ ExecSetupPartitionTupleRouting(ModifyTableState *mtstate, List *leaf_parts; ListCell *cell; int i; - ResultRelInfo *leaf_part_rri; + ResultRelInfo *leaf_part_arr = NULL, + *update_rri = NULL; + int num_update_rri = 0, + update_rri_index = 0; + bool is_update = false; PartitionTupleRouting *proute; /* @@ -69,10 +73,38 @@ ExecSetupPartitionTupleRouting(ModifyTableState *mtstate, proute->num_partitions = list_length(leaf_parts); proute->partitions = (ResultRelInfo **) palloc(proute->num_partitions * sizeof(ResultRelInfo *)); - proute->partition_tupconv_maps = + proute->parent_child_tupconv_maps = (TupleConversionMap **) palloc0(proute->num_partitions * sizeof(TupleConversionMap *)); + /* Set up details specific to the type of tuple routing we are doing. */ + if (mtstate && mtstate->operation == CMD_UPDATE) + { + ModifyTable *node = (ModifyTable *) mtstate->ps.plan; + + is_update = true; + update_rri = mtstate->resultRelInfo; + num_update_rri = list_length(node->plans); + proute->subplan_partition_offsets = + palloc(num_update_rri * sizeof(int)); + + /* + * We need an additional tuple slot for storing transient tuples that + * are converted to the root table descriptor. + */ + proute->root_tuple_slot = MakeTupleTableSlot(); + } + else + { + /* + * Since we are inserting tuples, we need to create all new result + * rels. Avoid repeated pallocs by allocating memory for all the + * result rels in bulk. 
+ */ + leaf_part_arr = (ResultRelInfo *) palloc0(proute->num_partitions * + sizeof(ResultRelInfo)); + } + /* * Initialize an empty slot that will be used to manipulate tuples of any * given partition's rowtype. It is attached to the caller-specified node @@ -81,38 +113,86 @@ ExecSetupPartitionTupleRouting(ModifyTableState *mtstate, */ proute->partition_tuple_slot = MakeTupleTableSlot(); - leaf_part_rri = (ResultRelInfo *) palloc0(proute->num_partitions * - sizeof(ResultRelInfo)); i = 0; foreach(cell, leaf_parts) { - Relation partrel; + ResultRelInfo *leaf_part_rri; + Relation partrel = NULL; TupleDesc part_tupdesc; + Oid leaf_oid = lfirst_oid(cell); + + if (is_update) + { + /* + * If the leaf partition is already present in the per-subplan + * result rels, we re-use that rather than initialize a new result + * rel. The per-subplan resultrels and the resultrels of the leaf + * partitions are both in the same canonical order. So while going + * through the leaf partition oids, we need to keep track of the + * next per-subplan result rel to be looked for in the leaf + * partition resultrels. + */ + if (update_rri_index < num_update_rri && + RelationGetRelid(update_rri[update_rri_index].ri_RelationDesc) == leaf_oid) + { + leaf_part_rri = &update_rri[update_rri_index]; + partrel = leaf_part_rri->ri_RelationDesc; + + /* + * This is required in order to we convert the partition's + * tuple to be compatible with the root partitioned table's + * tuple descriptor. When generating the per-subplan result + * rels, this was not set. + */ + leaf_part_rri->ri_PartitionRoot = rel; + + /* Remember the subplan offset for this ResultRelInfo */ + proute->subplan_partition_offsets[update_rri_index] = i; + + update_rri_index++; + } + else + leaf_part_rri = (ResultRelInfo *) palloc0(sizeof(ResultRelInfo)); + } + else + { + /* For INSERTs, we already have an array of result rels allocated */ + leaf_part_rri = &leaf_part_arr[i]; + } /* - * We locked all the partitions above including the leaf partitions. - * Note that each of the relations in proute->partitions are - * eventually closed by the caller. + * If we didn't open the partition rel, it means we haven't + * initialized the result rel either. */ - partrel = heap_open(lfirst_oid(cell), NoLock); + if (!partrel) + { + /* + * We locked all the partitions above including the leaf + * partitions. Note that each of the newly opened relations in + * proute->partitions are eventually closed by the caller. + */ + partrel = heap_open(leaf_oid, NoLock); + InitResultRelInfo(leaf_part_rri, + partrel, + resultRTindex, + rel, + estate->es_instrument); + } + part_tupdesc = RelationGetDescr(partrel); /* * Save a tuple conversion map to convert a tuple routed to this * partition from the parent's type to the partition's. */ - proute->partition_tupconv_maps[i] = + proute->parent_child_tupconv_maps[i] = convert_tuples_by_name(tupDesc, part_tupdesc, gettext_noop("could not convert row type")); - InitResultRelInfo(leaf_part_rri, - partrel, - resultRTindex, - rel, - estate->es_instrument); - /* - * Verify result relation is a valid target for INSERT. + * Verify result relation is a valid target for an INSERT. An UPDATE + * of a partition-key becomes a DELETE+INSERT operation, so this check + * is still required when the operation is CMD_UPDATE. 
*/ CheckValidResultRel(leaf_part_rri, CMD_INSERT); @@ -132,10 +212,16 @@ ExecSetupPartitionTupleRouting(ModifyTableState *mtstate, estate->es_leaf_result_relations = lappend(estate->es_leaf_result_relations, leaf_part_rri); - proute->partitions[i] = leaf_part_rri++; + proute->partitions[i] = leaf_part_rri; i++; } + /* + * For UPDATE, we should have found all the per-subplan resultrels in the + * leaf partitions. + */ + Assert(!is_update || update_rri_index == num_update_rri); + return proute; } @@ -258,6 +344,101 @@ ExecFindPartition(ResultRelInfo *resultRelInfo, PartitionDispatch *pd, return result; } +/* + * ExecSetupChildParentMapForLeaf -- Initialize the per-leaf-partition + * child-to-root tuple conversion map array. + * + * This map is required for capturing transition tuples when the target table + * is a partitioned table. For a tuple that is routed by an INSERT or UPDATE, + * we need to convert it from the leaf partition to the target table + * descriptor. + */ +void +ExecSetupChildParentMapForLeaf(PartitionTupleRouting *proute) +{ + Assert(proute != NULL); + + /* + * These array elements gets filled up with maps on an on-demand basis. + * Initially just set all of them to NULL. + */ + proute->child_parent_tupconv_maps = + (TupleConversionMap **) palloc0(sizeof(TupleConversionMap *) * + proute->num_partitions); + + /* Same is the case for this array. All the values are set to false */ + proute->child_parent_map_not_required = + (bool *) palloc0(sizeof(bool) * proute->num_partitions); +} + +/* + * TupConvMapForLeaf -- Get the tuple conversion map for a given leaf partition + * index. + */ +TupleConversionMap * +TupConvMapForLeaf(PartitionTupleRouting *proute, + ResultRelInfo *rootRelInfo, int leaf_index) +{ + ResultRelInfo **resultRelInfos = proute->partitions; + TupleConversionMap **map; + TupleDesc tupdesc; + + /* Don't call this if we're not supposed to be using this type of map. */ + Assert(proute->child_parent_tupconv_maps != NULL); + + /* If it's already known that we don't need a map, return NULL. */ + if (proute->child_parent_map_not_required[leaf_index]) + return NULL; + + /* If we've already got a map, return it. */ + map = &proute->child_parent_tupconv_maps[leaf_index]; + if (*map != NULL) + return *map; + + /* No map yet; try to create one. */ + tupdesc = RelationGetDescr(resultRelInfos[leaf_index]->ri_RelationDesc); + *map = + convert_tuples_by_name(tupdesc, + RelationGetDescr(rootRelInfo->ri_RelationDesc), + gettext_noop("could not convert row type")); + + /* If it turns out no map is needed, remember for next time. */ + proute->child_parent_map_not_required[leaf_index] = (*map == NULL); + + return *map; +} + +/* + * ConvertPartitionTupleSlot -- convenience function for tuple conversion. + * The tuple, if converted, is stored in new_slot, and *p_my_slot is + * updated to point to it. new_slot typically should be one of the + * dedicated partition tuple slots. If map is NULL, *p_my_slot is not changed. + * + * Returns the converted tuple, unless map is NULL, in which case original + * tuple is returned unmodified. + */ +HeapTuple +ConvertPartitionTupleSlot(TupleConversionMap *map, + HeapTuple tuple, + TupleTableSlot *new_slot, + TupleTableSlot **p_my_slot) +{ + if (!map) + return tuple; + + tuple = do_convert_tuple(tuple, map); + + /* + * Change the partition tuple slot descriptor, as per converted tuple. 
+ */ + *p_my_slot = new_slot; + Assert(new_slot != NULL); + ExecSetSlotDescriptor(new_slot, map->outdesc); + ExecStoreTuple(tuple, new_slot, InvalidBuffer, true); + + return tuple; +} + /* * ExecCleanupTupleRouting -- Clean up objects allocated for partition tuple * routing. @@ -265,9 +446,10 @@ ExecFindPartition(ResultRelInfo *resultRelInfo, PartitionDispatch *pd, * Close all the partitioned tables, leaf partitions, and their indices. */ void -ExecCleanupTupleRouting(PartitionTupleRouting * proute) +ExecCleanupTupleRouting(PartitionTupleRouting *proute) { int i; + int subplan_index = 0; /* * Remember, proute->partition_dispatch_info[0] corresponds to the root @@ -288,11 +470,30 @@ ExecCleanupTupleRouting(PartitionTupleRouting * proute) { ResultRelInfo *resultRelInfo = proute->partitions[i]; + /* + * If this result rel is one of the UPDATE subplan result rels, let + * ExecEndPlan() close it. For INSERT or COPY, + * proute->subplan_partition_offsets will always be NULL. Note that + * the subplan_partition_offsets array and the partitions array have + * the partitions in the same order. So, while we iterate over + * partitions array, we also iterate over the + * subplan_partition_offsets array in order to figure out which of the + * result rels are present in the UPDATE subplans. + */ + if (proute->subplan_partition_offsets && + proute->subplan_partition_offsets[subplan_index] == i) + { + subplan_index++; + continue; + } + ExecCloseIndices(resultRelInfo); heap_close(resultRelInfo->ri_RelationDesc, NoLock); } - /* Release the standalone partition tuple descriptor, if any */ + /* Release the standalone partition tuple descriptors, if any */ + if (proute->root_tuple_slot) + ExecDropSingleTupleTableSlot(proute->root_tuple_slot); if (proute->partition_tuple_slot) ExecDropSingleTupleTableSlot(proute->partition_tuple_slot); } diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c index a7ed98e7..f04ef73d 100644 --- a/src/backend/executor/nodeModifyTable.c +++ b/src/backend/executor/nodeModifyTable.c @@ -79,6 +79,11 @@ static bool ExecOnConflictUpdate(ModifyTableState *mtstate, EState *estate, bool canSetTag, TupleTableSlot **returning); +static ResultRelInfo *getTargetResultRelInfo(ModifyTableState *node); +static void ExecSetupChildParentMapForTcs(ModifyTableState *mtstate); +static void ExecSetupChildParentMapForSubplan(ModifyTableState *mtstate); +static TupleConversionMap *tupconv_map_for_subplan(ModifyTableState *node, + int whichplan); /* * Verify that the tuples to be produced by INSERT or UPDATE match the @@ -281,6 +286,7 @@ ExecInsert(ModifyTableState *mtstate, Oid newId; List *recheckIndexes = NIL; TupleTableSlot *result = NULL; + TransitionCaptureState *ar_insert_trig_tcs; #ifdef __TBASE__ bool has_unshippable_trigger = false; int remoterel_index = 0; @@ -321,7 +327,6 @@ ExecInsert(ModifyTableState *mtstate, { int leaf_part_index; PartitionTupleRouting *proute = mtstate->mt_partition_tuple_routing; - TupleConversionMap *map; /* * Away we go ... If we end up not finding a partition after all, @@ -370,8 +375,10 @@ ExecInsert(ModifyTableState *mtstate, * back to tuplestore format. 
*/ mtstate->mt_transition_capture->tcs_original_insert_tuple = NULL; + mtstate->mt_transition_capture->tcs_map = - mtstate->mt_transition_tupconv_maps[leaf_part_index]; + TupConvMapForLeaf(proute, saved_resultRelInfo, + leaf_part_index); } else { @@ -388,23 +395,10 @@ ExecInsert(ModifyTableState *mtstate, * We might need to convert from the parent rowtype to the partition * rowtype. */ - map = proute->partition_tupconv_maps[leaf_part_index]; - if (map) - { - Relation partrel = resultRelInfo->ri_RelationDesc; - - tuple = do_convert_tuple(tuple, map, partrel); - - /* - * We must use the partition's tuple descriptor from this point - * on, until we're finished dealing with the partition. Use the - * dedicated slot for that. - */ - slot = proute->partition_tuple_slot; - Assert(slot != NULL); - ExecSetSlotDescriptor(slot, RelationGetDescr(partrel)); - ExecStoreTuple(tuple, slot, InvalidBuffer, true); - } + tuple = ConvertPartitionTupleSlot(proute->parent_child_tupconv_maps[leaf_part_index], + tuple, + proute->partition_tuple_slot, + &slot); } #ifdef __TBASE__ /* Determine the interval partition to heap_insert the tuple into */ @@ -590,6 +584,8 @@ ExecInsert(ModifyTableState *mtstate, } else { + WCOKind wco_kind; + /* * We always check the partition constraint, including when the tuple * got here via tuple-routing. However we don't need to in the latter @@ -607,14 +603,23 @@ ExecInsert(ModifyTableState *mtstate, tuple->t_tableOid = RelationGetRelid(resultRelationDesc); /* - * Check any RLS INSERT WITH CHECK policies + * Check any RLS WITH CHECK policies. * + * Normally we should check INSERT policies. But if the insert is the + * result of a partition key update that moved the tuple to a new + * partition, we should instead check UPDATE policies, because we are + * executing policies defined on the target table, and not those + * defined on the child partitions. + */ + wco_kind = (mtstate->operation == CMD_UPDATE) ? + WCO_RLS_UPDATE_CHECK : WCO_RLS_INSERT_CHECK; + + /* * ExecWithCheckOptions() will skip any WCOs which are not of the kind * we are looking for at this point. */ if (resultRelInfo->ri_WithCheckOptions != NIL) - ExecWithCheckOptions(WCO_RLS_INSERT_CHECK, - resultRelInfo, slot, estate); + ExecWithCheckOptions(wco_kind, resultRelInfo, slot, estate); /* * No need though if the tuple has been routed, and a BR trigger @@ -830,9 +835,32 @@ ExecInsert(ModifyTableState *mtstate, setLastTid(&(tuple->t_self)); } + /* + * If this insert is the result of a partition key update that moved the + * tuple to a new partition, put this row into the transition NEW TABLE, + * if there is one. We need to do this separately for DELETE and INSERT + * because they happen on different tables. + */ + ar_insert_trig_tcs = mtstate->mt_transition_capture; + if (mtstate->operation == CMD_UPDATE && mtstate->mt_transition_capture + && mtstate->mt_transition_capture->tcs_update_new_table) + { + ExecARUpdateTriggers(estate, resultRelInfo, NULL, + NULL, + tuple, + NULL, + mtstate->mt_transition_capture); + + /* + * We've already captured the NEW TABLE row, so make sure any AR + * INSERT trigger fired below doesn't capture it again. 
+ */ + ar_insert_trig_tcs = NULL; + } + /* AFTER ROW INSERT Triggers */ ExecARInsertTriggers(estate, resultRelInfo, tuple, recheckIndexes, - mtstate->mt_transition_capture); + ar_insert_trig_tcs); list_free(recheckIndexes); @@ -888,6 +916,8 @@ ExecDelete(ModifyTableState *mtstate, TupleTableSlot *planSlot, EPQState *epqstate, EState *estate, + bool *tupleDeleted, + bool processReturning, bool canSetTag) #else static TupleTableSlot * @@ -897,6 +927,8 @@ ExecDelete(ModifyTableState *mtstate, TupleTableSlot *planSlot, EPQState *epqstate, EState *estate, + bool *tupleDeleted, + bool processReturning, bool canSetTag) #endif {// #lizard forgives @@ -910,6 +942,11 @@ ExecDelete(ModifyTableState *mtstate, ModifyTable *mt = (ModifyTable *)mtstate->ps.plan; #endif + TransitionCaptureState *ar_delete_trig_tcs; + + if (tupleDeleted) + *tupleDeleted = false; + /* * get information on the (current) result relation */ @@ -1122,12 +1159,40 @@ ldelete:; if (canSetTag) (estate->es_processed)++; + /* Tell caller that the delete actually happened. */ + if (tupleDeleted) + *tupleDeleted = true; + + /* + * If this delete is the result of a partition key update that moved the + * tuple to a new partition, put this row into the transition OLD TABLE, + * if there is one. We need to do this separately for DELETE and INSERT + * because they happen on different tables. + */ + ar_delete_trig_tcs = mtstate->mt_transition_capture; + if (mtstate->operation == CMD_UPDATE && mtstate->mt_transition_capture + && mtstate->mt_transition_capture->tcs_update_old_table) + { + ExecARUpdateTriggers(estate, resultRelInfo, + tupleid, + oldtuple, + NULL, + NULL, + mtstate->mt_transition_capture); + + /* + * We've already captured the NEW TABLE row, so make sure any AR + * DELETE trigger fired below doesn't capture it again. + */ + ar_delete_trig_tcs = NULL; + } + /* AFTER ROW DELETE Triggers */ ExecARDeleteTriggers(estate, resultRelInfo, tupleid, oldtuple, - mtstate->mt_transition_capture); + ar_delete_trig_tcs); - /* Process RETURNING if present */ - if (resultRelInfo->ri_projectReturning) + /* Process RETURNING if present and if requested */ + if (processReturning && resultRelInfo->ri_projectReturning) { /* * We have to put the target tuple into a slot, which means first we @@ -1220,6 +1285,7 @@ ExecUpdate(ModifyTableState *mtstate, HTSU_Result result; HeapUpdateFailureData hufd; List *recheckIndexes = NIL; + TupleConversionMap *saved_tcs_map = NULL; #ifdef __TBASE__ int remoterel_index = 0; ModifyTable *mt = (ModifyTable *)mtstate->ps.plan; @@ -1336,6 +1402,7 @@ ExecUpdate(ModifyTableState *mtstate, else { LockTupleMode lockmode; + bool partition_constraint_failed; /* * Constraints might reference the tableoid column, so initialize @@ -1351,22 +1418,143 @@ ExecUpdate(ModifyTableState *mtstate, * (We don't need to redo triggers, however. If there are any BEFORE * triggers then trigger.c will have done heap_lock_tuple to lock the * correct tuple, so there's no need to do them again.) - * - * ExecWithCheckOptions() will skip any WCOs which are not of the kind - * we are looking for at this point. */ lreplace:; - if (resultRelInfo->ri_WithCheckOptions != NIL) + + /* + * If partition constraint fails, this row might get moved to another + * partition, in which case we should check the RLS CHECK policy just + * before inserting into the new partition, rather than doing it here. + * This is because a trigger on that partition might again change the + * row. So skip the WCO checks if the partition constraint fails. 
+ */ + partition_constraint_failed = + resultRelInfo->ri_PartitionCheck && + !ExecPartitionCheck(resultRelInfo, slot, estate); + + if (!partition_constraint_failed && + resultRelInfo->ri_WithCheckOptions != NIL) + { + /* + * ExecWithCheckOptions() will skip any WCOs which are not of the + * kind we are looking for at this point. + */ + ExecWithCheckOptions(WCO_RLS_UPDATE_CHECK, resultRelInfo, slot, estate); + } + + /* + * If a partition check failed, try to move the row into the right + * partition. + */ + if (partition_constraint_failed) + { + bool tuple_deleted; + TupleTableSlot *ret_slot; + PartitionTupleRouting *proute = mtstate->mt_partition_tuple_routing; + int map_index; + TupleConversionMap *tupconv_map; + + /* + * When an UPDATE is run on a leaf partition, we will not have + * partition tuple routing set up. In that case, fail with + * partition constraint violation error. + */ + if (proute == NULL) + ExecPartitionCheckEmitError(resultRelInfo, slot, estate); + + /* + * Row movement, part 1. Delete the tuple, but skip RETURNING + * processing. We want to return rows from INSERT. + */ + ExecDelete(mtstate, tupleid, oldtuple, planSlot, epqstate, estate, + &tuple_deleted, false, false); + + /* + * For some reason if DELETE didn't happen (e.g. trigger prevented + * it, or it was already deleted by self, or it was concurrently + * deleted by another transaction), then we should skip the insert + * as well; otherwise, an UPDATE could cause an increase in the + * total number of rows across all partitions, which is clearly + * wrong. + * + * For a normal UPDATE, the case where the tuple has been the + * subject of a concurrent UPDATE or DELETE would be handled by + * the EvalPlanQual machinery, but for an UPDATE that we've + * translated into a DELETE from this partition and an INSERT into + * some other partition, that's not available, because CTID chains + * can't span relation boundaries. We mimic the semantics to a + * limited extent by skipping the INSERT if the DELETE fails to + * find a tuple. This ensures that two concurrent attempts to + * UPDATE the same tuple at the same time can't turn one tuple + * into two, and that an UPDATE of a just-deleted tuple can't + * resurrect it. + */ + if (!tuple_deleted) + return NULL; + + /* + * Updates set the transition capture map only when a new subplan + * is chosen. But for inserts, it is set for each row. So after + * INSERT, we need to revert back to the map created for UPDATE; + * otherwise the next UPDATE will incorrectly use the one created + * for INSERT. So first save the one created for UPDATE. + */ + if (mtstate->mt_transition_capture) + saved_tcs_map = mtstate->mt_transition_capture->tcs_map; + + /* + * resultRelInfo is one of the per-subplan resultRelInfos. So we + * should convert the tuple into root's tuple descriptor, since + * ExecInsert() starts the search from root. The tuple conversion + * map list is in the order of mtstate->resultRelInfo[], so to + * retrieve the one for this resultRel, we need to know the + * position of the resultRel in mtstate->resultRelInfo[]. + */ + map_index = resultRelInfo - mtstate->resultRelInfo; + Assert(map_index >= 0 && map_index < mtstate->mt_nplans); + tupconv_map = tupconv_map_for_subplan(mtstate, map_index); + tuple = ConvertPartitionTupleSlot(tupconv_map, + tuple, + proute->root_tuple_slot, + &slot); + + + /* + * For ExecInsert(), make it look like we are inserting into the + * root. 
+ */ + Assert(mtstate->rootResultRelInfo != NULL); + estate->es_result_relation_info = mtstate->rootResultRelInfo; + + ret_slot = ExecInsert(mtstate, slot, planSlot, NULL, + ONCONFLICT_NONE, estate, canSetTag); + + /* + * Revert back the active result relation and the active + * transition capture map that we changed above. + */ + estate->es_result_relation_info = resultRelInfo; + if (mtstate->mt_transition_capture) + { + mtstate->mt_transition_capture->tcs_original_insert_tuple = NULL; + mtstate->mt_transition_capture->tcs_map = saved_tcs_map; + } + return ret_slot; + } /* * Check the constraints of the tuple. Note that we pass the same * slot for the orig_slot argument, because unlike ExecInsert(), no * tuple-routing is performed here, hence the slot remains unchanged. + * We've already checked the partition constraint above; however, we + * must still ensure the tuple passes all other constraints, so we + * will call ExecConstraints() and have it validate all remaining + * checks. */ - if (resultRelationDesc->rd_att->constr || resultRelInfo->ri_PartitionCheck) - ExecConstraints(resultRelInfo, slot, estate, true); + if (resultRelationDesc->rd_att->constr) + ExecConstraints(resultRelInfo, slot, estate, false); #ifdef _MLS_ if (is_mls_user()) @@ -1763,17 +1951,20 @@ fireBSTriggers(ModifyTableState *node) } /* - * Return the ResultRelInfo for which we will fire AFTER STATEMENT triggers. - * This is also the relation into whose tuple format all captured transition - * tuples must be converted. + * Return the target rel ResultRelInfo. + * + * This relation is the same as : + * - the relation for which we will fire AFTER STATEMENT triggers. + * - the relation into whose tuple format all captured transition tuples must + * be converted. + * - the root partitioned table. */ static ResultRelInfo * -getASTriggerResultRelInfo(ModifyTableState *node) +getTargetResultRelInfo(ModifyTableState *node) { /* - * If the node modifies a partitioned table, we must fire its triggers. - * Note that in that case, node->resultRelInfo points to the first leaf - * partition, not the root table. + * Note that if the node modifies a partitioned table, node->resultRelInfo + * points to the first leaf partition, not the root table. */ if (node->rootResultRelInfo != NULL) return node->rootResultRelInfo; @@ -1787,7 +1978,7 @@ getASTriggerResultRelInfo(ModifyTableState *node) static void fireASTriggers(ModifyTableState *node) { - ResultRelInfo *resultRelInfo = getASTriggerResultRelInfo(node); + ResultRelInfo *resultRelInfo = getTargetResultRelInfo(node); switch (node->operation) { @@ -1820,8 +2011,7 @@ fireASTriggers(ModifyTableState *node) static void ExecSetupTransitionCaptureState(ModifyTableState *mtstate, EState *estate) { - ResultRelInfo *targetRelInfo = getASTriggerResultRelInfo(mtstate); - int i; + ResultRelInfo *targetRelInfo = getTargetResultRelInfo(mtstate); /* Check for transition tables on the directly targeted relation. */ mtstate->mt_transition_capture = @@ -1834,60 +2024,141 @@ ExecSetupTransitionCaptureState(ModifyTableState *mtstate, EState *estate) */ if (mtstate->mt_transition_capture != NULL) { - int numResultRelInfos; - PartitionTupleRouting *proute = mtstate->mt_partition_tuple_routing; - - numResultRelInfos = (proute != NULL ? - proute->num_partitions : - mtstate->mt_nplans); + ExecSetupChildParentMapForTcs(mtstate); /* - * Build array of conversion maps from each child's TupleDesc to the - * one used in the tuplestore. 
The map pointers may be NULL when no - * conversion is necessary, which is hopefully a common case for - * partitions. + * Install the conversion map for the first plan for UPDATE and DELETE + * operations. It will be advanced each time we switch to the next + * plan. (INSERT operations set it every time, so we need not update + * mtstate->mt_oc_transition_capture here.) */ - mtstate->mt_transition_tupconv_maps = (TupleConversionMap **) - palloc0(sizeof(TupleConversionMap *) * numResultRelInfos); + if (mtstate->mt_transition_capture && mtstate->operation != CMD_INSERT) + mtstate->mt_transition_capture->tcs_map = + tupconv_map_for_subplan(mtstate, 0); + } +} - /* Choose the right set of partitions */ - if (proute != NULL) +/* + * Initialize the child-to-root tuple conversion map array for UPDATE subplans. + * + * This map array is required to convert the tuple from the subplan result rel + * to the target table descriptor. This requirement arises for two independent + * scenarios: + * 1. For update-tuple-routing. + * 2. For capturing tuples in transition tables. + */ +void +ExecSetupChildParentMapForSubplan(ModifyTableState *mtstate) { + ResultRelInfo *targetRelInfo = getTargetResultRelInfo(mtstate); + ResultRelInfo *resultRelInfos = mtstate->resultRelInfo; + TupleDesc outdesc; + int numResultRelInfos = mtstate->mt_nplans; + int i; + + /* + * First check if there is already a per-subplan array allocated. Even if + * there is already a per-leaf map array, we won't require a per-subplan + * one, since we will use the subplan offset array to convert the subplan + * index to per-leaf index. + */ + if (mtstate->mt_per_subplan_tupconv_maps || + (mtstate->mt_partition_tuple_routing && + mtstate->mt_partition_tuple_routing->child_parent_tupconv_maps)) + return; + /* - * For tuple routing among partitions, we need TupleDescs based - * on the partition routing table. + * Build array of conversion maps from each child's TupleDesc to the one + * used in the target relation. The map pointers may be NULL when no + * conversion is necessary, which is hopefully a common case. */ - ResultRelInfo **resultRelInfos = proute->partitions; + + /* Get tuple descriptor of the target rel. */ + outdesc = RelationGetDescr(targetRelInfo->ri_RelationDesc); + + mtstate->mt_per_subplan_tupconv_maps = (TupleConversionMap **) + palloc(sizeof(TupleConversionMap *) * numResultRelInfos); for (i = 0; i < numResultRelInfos; ++i) { - mtstate->mt_transition_tupconv_maps[i] = - convert_tuples_by_name(RelationGetDescr(resultRelInfos[i]->ri_RelationDesc), - RelationGetDescr(targetRelInfo->ri_RelationDesc), + mtstate->mt_per_subplan_tupconv_maps[i] = + convert_tuples_by_name(RelationGetDescr(resultRelInfos[i].ri_RelationDesc), + outdesc, gettext_noop("could not convert row type")); } } - else - { - /* Otherwise we need the ResultRelInfo for each subplan. */ - ResultRelInfo *resultRelInfos = mtstate->resultRelInfo; - for (i = 0; i < numResultRelInfos; ++i) +/* + * Initialize the child-to-root tuple conversion map array required for + * capturing transition tuples. + * + * The map array can be indexed either by subplan index or by leaf-partition + * index. For transition tables, we need a subplan-indexed access to the map, + * and where tuple-routing is present, we also require a leaf-indexed access. 
+ */ +static void +ExecSetupChildParentMapForTcs(ModifyTableState *mtstate) + { + PartitionTupleRouting *proute = mtstate->mt_partition_tuple_routing; + + /* + * If partition tuple routing is set up, we will require partition-indexed + * access. In that case, create the map array indexed by partition; we + * will still be able to access the maps using a subplan index by + * converting the subplan index to a partition index using + * subplan_partition_offsets. If tuple routing is not set up, it means we + * don't require partition-indexed access. In that case, create just a + * subplan-indexed map. + */ + if (proute) { - mtstate->mt_transition_tupconv_maps[i] = - convert_tuples_by_name(RelationGetDescr(resultRelInfos[i].ri_RelationDesc), - RelationGetDescr(targetRelInfo->ri_RelationDesc), - gettext_noop("could not convert row type")); + /* + * If a partition-indexed map array is to be created, the subplan map + * array has to be NULL. If the subplan map array is already created, + * we won't be able to access the map using a partition index. + */ + Assert(mtstate->mt_per_subplan_tupconv_maps == NULL); + + ExecSetupChildParentMapForLeaf(proute); } + else + ExecSetupChildParentMapForSubplan(mtstate); } /* - * Install the conversion map for the first plan for UPDATE and DELETE - * operations. It will be advanced each time we switch to the next - * plan. (INSERT operations set it every time.) + * For a given subplan index, get the tuple conversion map. */ - mtstate->mt_transition_capture->tcs_map = - mtstate->mt_transition_tupconv_maps[0]; +static TupleConversionMap * +tupconv_map_for_subplan(ModifyTableState *mtstate, int whichplan) +{ + /* + * If a partition-index tuple conversion map array is allocated, we need + * to first get the index into the partition array. Exactly *one* of the + * two arrays is allocated. This is because if there is a partition array + * required, we don't require subplan-indexed array since we can translate + * subplan index into partition index. And, we create a subplan-indexed + * array *only* if partition-indexed array is not required. + */ + if (mtstate->mt_per_subplan_tupconv_maps == NULL) + { + int leaf_index; + PartitionTupleRouting *proute = mtstate->mt_partition_tuple_routing; + + /* + * If subplan-indexed array is NULL, things should have been arranged + * to convert the subplan index to partition index. + */ + Assert(proute && proute->subplan_partition_offsets != NULL); + + leaf_index = proute->subplan_partition_offsets[whichplan]; + + return TupConvMapForLeaf(proute, getTargetResultRelInfo(mtstate), + leaf_index); + } + else + { + Assert(whichplan >= 0 && whichplan < mtstate->mt_nplans); + return mtstate->mt_per_subplan_tupconv_maps[whichplan]; } } @@ -2134,9 +2405,8 @@ ExecModifyTable(PlanState *pstate) if (node->mt_transition_capture != NULL) { /* Prepare to convert transition tuples from this child. 
*/ - Assert(node->mt_transition_tupconv_maps != NULL); node->mt_transition_capture->tcs_map = - node->mt_transition_tupconv_maps[node->mt_whichplan]; + tupconv_map_for_subplan(node, node->mt_whichplan); } continue; } @@ -2400,10 +2670,12 @@ ExecModifyTable(PlanState *pstate) case CMD_DELETE: #ifdef __TBASE__ slot = ExecDelete(node, tupleid, oldtuple, slot, planSlot, - &node->mt_epqstate, estate, node->canSetTag); + &node->mt_epqstate, estate, + NULL, true, node->canSetTag); #else slot = ExecDelete(node, tupleid, oldtuple, planSlot, - &node->mt_epqstate, estate, node->canSetTag); + &node->mt_epqstate, estate, + NULL, true, node->canSetTag); #endif break; default: @@ -2478,9 +2750,12 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) ResultRelInfo *resultRelInfo; TupleDesc tupDesc; Plan *subplan; + int firstVarno = 0; + Relation firstResultRel = NULL; ListCell *l; int i; Relation rel; + bool update_tuple_routing_needed = node->partColsUpdated; PartitionTupleRouting *proute = NULL; int num_partitions = 0; #ifdef __TBASE__ @@ -2572,6 +2847,16 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) resultRelInfo->ri_IndexRelationDescs == NULL) ExecOpenIndices(resultRelInfo, mtstate->mt_onconflict != ONCONFLICT_NONE); + /* + * If this is an UPDATE and a BEFORE UPDATE trigger is present, the + * trigger itself might modify the partition-key values. So arrange + * for tuple routing. + */ + if (resultRelInfo->ri_TrigDesc && + resultRelInfo->ri_TrigDesc->trig_update_before_row && + operation == CMD_UPDATE) + update_tuple_routing_needed = true; + /* Now init the plan for this result rel */ #ifdef __TBASE__ if (resultRelInfo->ispartparent && node->arbiterIndexes) @@ -2693,22 +2978,52 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) estate->es_result_relation_info = saved_resultRelInfo; - /* Build state for INSERT tuple routing */ - rel = mtstate->resultRelInfo->ri_RelationDesc; - if (operation == CMD_INSERT && - rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) + /* Get the target relation */ + rel = (getTargetResultRelInfo(mtstate))->ri_RelationDesc; + + /* + * If it's not a partitioned table after all, UPDATE tuple routing should + * not be attempted. + */ + if (rel->rd_rel->relkind != RELKIND_PARTITIONED_TABLE) + update_tuple_routing_needed = false; + + /* + * Build state for tuple routing if it's an INSERT or if it's an UPDATE of + * partition key. + */ + if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE && + (operation == CMD_INSERT || update_tuple_routing_needed)) { proute = mtstate->mt_partition_tuple_routing = ExecSetupPartitionTupleRouting(mtstate, rel, node->nominalRelation, estate); num_partitions = proute->num_partitions; + + /* + * Below are required as reference objects for mapping partition + * attno's in expressions such as WithCheckOptions and RETURNING. + */ + firstVarno = mtstate->resultRelInfo[0].ri_RangeTableIndex; + firstResultRel = mtstate->resultRelInfo[0].ri_RelationDesc; } /* Build state for collecting transition tuples */ ExecSetupTransitionCaptureState(mtstate, estate); /* + * Construct mapping from each of the per-subplan partition attnos to the + * root attno. This is required when during update row movement the tuple + * descriptor of a source partition does not match the root partitioned + * table descriptor. In such a case we need to convert tuples to the root + * tuple descriptor, because the search for destination partition starts + * from the root. Skip this setup if it's not a partition key update. 
+ */ + if (update_tuple_routing_needed) + ExecSetupChildParentMapForSubplan(mtstate); + + /* * Initialize any WITH CHECK OPTION constraints if needed. */ resultRelInfo = mtstate->resultRelInfo; @@ -2759,26 +3074,29 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) * Build WITH CHECK OPTION constraints for each leaf partition rel. Note * that we didn't build the withCheckOptionList for each partition within * the planner, but simple translation of the varattnos for each partition - * will suffice. This only occurs for the INSERT case; UPDATE/DELETE - * cases are handled above. + * will suffice. This only occurs for the INSERT case or for UPDATE row + * movement. DELETEs and local UPDATEs are handled above. */ if (node->withCheckOptionLists != NIL && num_partitions > 0) { - List *wcoList; - PlanState *plan; + List *first_wcoList; /* * In case of INSERT on partitioned tables, there is only one plan. * Likewise, there is only one WITH CHECK OPTIONS list, not one per - * partition. We make a copy of the WCO qual for each partition; note - * that, if there are SubPlans in there, they all end up attached to - * the one parent Plan node. - */ - Assert(operation == CMD_INSERT && + * partition. Whereas for UPDATE, there are as many WCOs as there are + * plans. So in either case, use the WCO expression of the first + * resultRelInfo as a reference to calculate attno's for the WCO + * expression of each of the partitions. We make a copy of the WCO + * qual for each partition. Note that, if there are SubPlans in there, + * they all end up attached to the one parent Plan node. + */ + Assert(update_tuple_routing_needed || + (operation == CMD_INSERT && list_length(node->withCheckOptionLists) == 1 && - mtstate->mt_nplans == 1); - wcoList = linitial(node->withCheckOptionLists); - plan = mtstate->mt_plans[0]; + mtstate->mt_nplans == 1)); + + first_wcoList = linitial(node->withCheckOptionLists); for (i = 0; i < num_partitions; i++) { Relation partrel; @@ -2787,17 +3105,26 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) ListCell *ll; resultRelInfo = proute->partitions[i]; + + /* + * If we are referring to a resultRelInfo from one of the update + * result rels, that result rel would already have + * WithCheckOptions initialized. + */ + if (resultRelInfo->ri_WithCheckOptions) + continue; + partrel = resultRelInfo->ri_RelationDesc; - /* varno = node->nominalRelation */ - mapped_wcoList = map_partition_varattnos(wcoList, - node->nominalRelation, - partrel, rel, NULL); + mapped_wcoList = map_partition_varattnos(first_wcoList, + firstVarno, + partrel, firstResultRel, + NULL); foreach(ll, mapped_wcoList) { WithCheckOption *wco = castNode(WithCheckOption, lfirst(ll)); ExprState *wcoExpr = ExecInitQual(castNode(List, wco->qual), - plan); + &mtstate->ps); wcoExprs = lappend(wcoExprs, wcoExpr); } @@ -2814,7 +3141,7 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) { TupleTableSlot *slot; ExprContext *econtext; - List *returningList; + List *firstReturningList; /* * Initialize result tuple slot and assign its rowtype using the first @@ -2862,22 +3189,35 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) * Build a projection for each leaf partition rel. Note that we * didn't build the returningList for each partition within the * planner, but simple translation of the varattnos for each partition - * will suffice. This only occurs for the INSERT case; UPDATE/DELETE - * are handled above. + * will suffice. 
This only occurs for the INSERT case or for UPDATE + * row movement. DELETEs and local UPDATEs are handled above. */ - returningList = linitial(node->returningLists); + firstReturningList = linitial(node->returningLists); for (i = 0; i < num_partitions; i++) { Relation partrel; List *rlist; resultRelInfo = proute->partitions[i]; + + /* + * If we are referring to a resultRelInfo from one of the update + * result rels, that result rel would already have a returningList + * built. + */ + if (resultRelInfo->ri_projectReturning) + continue; + partrel = resultRelInfo->ri_RelationDesc; - /* varno = node->nominalRelation */ - rlist = map_partition_varattnos(returningList, - node->nominalRelation, - partrel, rel, NULL); + /* + * Use the returning expression of the first resultRelInfo as a + * reference to calculate attno's for the returning expression of + * each of the partitions. + */ + rlist = map_partition_varattnos(firstReturningList, + firstVarno, + partrel, firstResultRel, NULL); resultRelInfo->ri_projectReturning = ExecBuildProjectionInfo(rlist, econtext, slot, &mtstate->ps, resultRelInfo->ri_RelationDesc->rd_att); diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c index 1e57fedd..b55431d6 100644 --- a/src/backend/nodes/copyfuncs.c +++ b/src/backend/nodes/copyfuncs.c @@ -246,6 +246,7 @@ _copyModifyTable(const ModifyTable *from) COPY_SCALAR_FIELD(canSetTag); COPY_SCALAR_FIELD(nominalRelation); COPY_NODE_FIELD(partitioned_rels); + COPY_SCALAR_FIELD(partColsUpdated); COPY_NODE_FIELD(resultRelations); COPY_SCALAR_FIELD(resultRelIndex); COPY_SCALAR_FIELD(rootResultRelIndex); @@ -2536,6 +2537,7 @@ _copyPartitionedChildRelInfo(const PartitionedChildRelInfo *from) COPY_SCALAR_FIELD(parent_relid); COPY_NODE_FIELD(child_rels); + COPY_SCALAR_FIELD(part_cols_updated); return newnode; } diff --git a/src/backend/nodes/equalfuncs.c b/src/backend/nodes/equalfuncs.c index c05b411c..0e47737a 100644 --- a/src/backend/nodes/equalfuncs.c +++ b/src/backend/nodes/equalfuncs.c @@ -936,6 +936,7 @@ _equalPartitionedChildRelInfo(const PartitionedChildRelInfo *a, const Partitione { COMPARE_SCALAR_FIELD(parent_relid); COMPARE_NODE_FIELD(child_rels); + COMPARE_SCALAR_FIELD(part_cols_updated); return true; } diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c index fb711230..610c2fae 100644 --- a/src/backend/nodes/outfuncs.c +++ b/src/backend/nodes/outfuncs.c @@ -661,6 +661,7 @@ _outModifyTable(StringInfo str, const ModifyTable *node) WRITE_BOOL_FIELD(canSetTag); WRITE_UINT_FIELD(nominalRelation); WRITE_NODE_FIELD(partitioned_rels); + WRITE_BOOL_FIELD(partColsUpdated); WRITE_NODE_FIELD(resultRelations); WRITE_INT_FIELD(resultRelIndex); WRITE_INT_FIELD(rootResultRelIndex); @@ -3408,6 +3409,7 @@ _outModifyTablePath(StringInfo str, const ModifyTablePath *node) WRITE_BOOL_FIELD(canSetTag); WRITE_UINT_FIELD(nominalRelation); WRITE_NODE_FIELD(partitioned_rels); + WRITE_BOOL_FIELD(partColsUpdated); WRITE_NODE_FIELD(resultRelations); WRITE_NODE_FIELD(subpaths); WRITE_NODE_FIELD(subroots); @@ -3859,6 +3861,7 @@ _outPartitionedChildRelInfo(StringInfo str, const PartitionedChildRelInfo *node) WRITE_UINT_FIELD(parent_relid); WRITE_NODE_FIELD(child_rels); + WRITE_BOOL_FIELD(part_cols_updated); } static void diff --git a/src/backend/nodes/readfuncs.c b/src/backend/nodes/readfuncs.c index 2bdc5067..e0744408 100644 --- a/src/backend/nodes/readfuncs.c +++ b/src/backend/nodes/readfuncs.c @@ -2381,6 +2381,7 @@ _readModifyTable(void) READ_BOOL_FIELD(canSetTag); READ_UINT_FIELD(nominalRelation); 
READ_NODE_FIELD(partitioned_rels); + READ_BOOL_FIELD(partColsUpdated); READ_NODE_FIELD(resultRelations); READ_INT_FIELD(resultRelIndex); READ_INT_FIELD(rootResultRelIndex); diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c index 439e98ed..9020a606 100644 --- a/src/backend/optimizer/path/allpaths.c +++ b/src/backend/optimizer/path/allpaths.c @@ -1379,7 +1379,7 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel, case RTE_RELATION: if (rte->relkind == RELKIND_PARTITIONED_TABLE) partitioned_rels = - get_partitioned_child_rels(root, rel->relid); + get_partitioned_child_rels(root, rel->relid, NULL); break; case RTE_SUBQUERY: build_partitioned_rels = true; @@ -1417,7 +1417,7 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel, { List *cprels; - cprels = get_partitioned_child_rels(root, childrel->relid); + cprels = get_partitioned_child_rels(root, childrel->relid, NULL); partitioned_rels = list_concat(partitioned_rels, list_copy(cprels)); } diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index 45880a2f..bf38bafc 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -329,6 +329,7 @@ static ProjectSet *make_project_set(List *tlist, Plan *subplan); static ModifyTable *make_modifytable(PlannerInfo *root, CmdType operation, bool canSetTag, Index nominalRelation, List *partitioned_rels, + bool partColsUpdated, List *resultRelations, List *subplans, List *withCheckOptionLists, List *returningLists, List *rowMarks, OnConflictExpr *onconflict, int epqParam); @@ -2972,6 +2973,7 @@ create_modifytable_plan(PlannerInfo *root, ModifyTablePath *best_path) best_path->canSetTag, best_path->nominalRelation, best_path->partitioned_rels, + best_path->partColsUpdated, best_path->resultRelations, subplans, best_path->withCheckOptionLists, @@ -8471,6 +8473,7 @@ static ModifyTable * make_modifytable(PlannerInfo *root, CmdType operation, bool canSetTag, Index nominalRelation, List *partitioned_rels, + bool partColsUpdated, List *resultRelations, List *subplans, List *withCheckOptionLists, List *returningLists, List *rowMarks, OnConflictExpr *onconflict, int epqParam) @@ -8500,6 +8503,7 @@ make_modifytable(PlannerInfo *root, node->canSetTag = canSetTag; node->nominalRelation = nominalRelation; node->partitioned_rels = partitioned_rels; + node->partColsUpdated = partColsUpdated; node->resultRelations = resultRelations; node->resultRelIndex = -1; /* will be set correctly in setrefs.c */ node->rootResultRelIndex = -1; /* will be set correctly in setrefs.c */ diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index 498b1d5e..b22d678d 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -1235,6 +1235,7 @@ inheritance_planner(PlannerInfo *root) Query *parent_parse; Bitmapset *parent_relids = bms_make_singleton(top_parentRTindex); PlannerInfo **parent_roots = NULL; + bool partColsUpdated = false; Assert(parse->commandType != CMD_INSERT); @@ -1306,7 +1307,8 @@ inheritance_planner(PlannerInfo *root) if (parent_rte->relkind == RELKIND_PARTITIONED_TABLE) { nominalRelation = top_parentRTindex; - partitioned_rels = get_partitioned_child_rels(root, top_parentRTindex); + partitioned_rels = get_partitioned_child_rels(root, top_parentRTindex, + &partColsUpdated); /* The root partitioned table is included as a child rel */ Assert(list_length(partitioned_rels) >= 1); } @@ -1680,6 +1682,7 @@ 
inheritance_planner(PlannerInfo *root) parse->canSetTag, nominalRelation, partitioned_rels, + partColsUpdated, resultRelations, subpaths, subroots, @@ -2354,6 +2357,7 @@ grouping_planner(PlannerInfo *root, bool inheritance_update, parse->canSetTag, parse->resultRelation, NIL, + false, list_make1_int(parse->resultRelation), list_make1(path), list_make1(root), @@ -7840,17 +7844,24 @@ grouping_distribution_match(PlannerInfo *root, Query *parse, Path *path, /* * get_partitioned_child_rels * Returns a list of the RT indexes of the partitioned child relations - * with rti as the root parent RT index. + * with rti as the root parent RT index. Also sets + * *part_cols_updated to true if any of the root rte's updated + * columns is used in the partition key either of the relation whose RTI + * is specified or of any child relation. * * Note: This function might get called even for range table entries that * are not partitioned tables; in such a case, it will simply return NIL. */ List * -get_partitioned_child_rels(PlannerInfo *root, Index rti) +get_partitioned_child_rels(PlannerInfo *root, Index rti, + bool *part_cols_updated) { List *result = NIL; ListCell *l; + if (part_cols_updated) + *part_cols_updated = false; + foreach(l, root->pcinfo_list) { PartitionedChildRelInfo *pc = lfirst(l); @@ -7858,6 +7869,8 @@ get_partitioned_child_rels(PlannerInfo *root, Index rti) if (pc->parent_relid == rti) { result = pc->child_rels; + if (part_cols_updated) + *part_cols_updated = pc->part_cols_updated; break; } } diff --git a/src/backend/optimizer/prep/prepunion.c b/src/backend/optimizer/prep/prepunion.c index 1fe5a341..c40a38ee 100644 --- a/src/backend/optimizer/prep/prepunion.c +++ b/src/backend/optimizer/prep/prepunion.c @@ -105,7 +105,8 @@ static void expand_partitioned_rtentry(PlannerInfo *root, RangeTblEntry *parentrte, Index parentRTindex, Relation parentrel, PlanRowMark *top_parentrc, LOCKMODE lockmode, - List **appinfos, List **partitioned_child_rels); + List **appinfos, List **partitioned_child_rels, + bool *part_cols_updated); static void expand_single_inheritance_child(PlannerInfo *root, RangeTblEntry *parentrte, Index parentRTindex, Relation parentrel, @@ -1543,16 +1544,19 @@ expand_inherited_rtentry(PlannerInfo *root, RangeTblEntry *rte, Index rti) if (RelationGetPartitionDesc(oldrelation) != NULL) { List *partitioned_child_rels = NIL; + bool part_cols_updated = false; Assert(rte->relkind == RELKIND_PARTITIONED_TABLE); /* * If this table has partitions, recursively expand them in the order - * in which they appear in the PartitionDesc. + * in which they appear in the PartitionDesc. While at it, also + * extract the partition key columns of all the partitioned tables. 
*/ expand_partitioned_rtentry(root, rte, rti, oldrelation, oldrc, lockmode, &root->append_rel_list, - &partitioned_child_rels); + &partitioned_child_rels, + &part_cols_updated); /* * We keep a list of objects in root, each of which maps a root @@ -1569,6 +1573,7 @@ expand_inherited_rtentry(PlannerInfo *root, RangeTblEntry *rte, Index rti) pcinfo = makeNode(PartitionedChildRelInfo); pcinfo->parent_relid = rti; pcinfo->child_rels = partitioned_child_rels; + pcinfo->part_cols_updated = part_cols_updated; root->pcinfo_list = lappend(root->pcinfo_list, pcinfo); } } @@ -1645,7 +1650,8 @@ static void expand_partitioned_rtentry(PlannerInfo *root, RangeTblEntry *parentrte, Index parentRTindex, Relation parentrel, PlanRowMark *top_parentrc, LOCKMODE lockmode, - List **appinfos, List **partitioned_child_rels) + List **appinfos, List **partitioned_child_rels, + bool *part_cols_updated) { int i; RangeTblEntry *childrte; @@ -1660,6 +1666,17 @@ expand_partitioned_rtentry(PlannerInfo *root, RangeTblEntry *parentrte, Assert(parentrte->inh); + /* + * Note down whether any partition key cols are being updated. Though it's + * the root partitioned table's updatedCols we are interested in, we + * instead use parentrte to get the updatedCols. This is convenient because + * parentrte already has the root partrel's updatedCols translated to match + * the attribute ordering of parentrel. + */ + if (!*part_cols_updated) + *part_cols_updated = + has_partition_attrs(parentrel, parentrte->updatedCols, NULL); + /* First expand the partitioned table itself. */ expand_single_inheritance_child(root, parentrte, parentRTindex, parentrel, top_parentrc, parentrel, @@ -1699,7 +1716,8 @@ expand_partitioned_rtentry(PlannerInfo *root, RangeTblEntry *parentrte, if (childrel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) expand_partitioned_rtentry(root, childrte, childRTindex, childrel, top_parentrc, lockmode, - appinfos, partitioned_child_rels); + appinfos, partitioned_child_rels, + part_cols_updated); /* Close child relation, but keep locks */ heap_close(childrel, NoLock); diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index c2d27db7..0a6735d1 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -6650,6 +6650,8 @@ create_lockrows_path(PlannerInfo *root, RelOptInfo *rel, * 'partitioned_rels' is an integer list of RT indexes of non-leaf tables in * the partition tree, if this is an UPDATE/DELETE to a partitioned table. * Otherwise NIL. + * 'partColsUpdated' is true if any partitioning columns are being updated, + * either from the target relation or a descendent partitioned table. 
* 'resultRelations' is an integer list of actual RT indexes of target rel(s) * 'subpaths' is a list of Path(s) producing source data (one per rel) * 'subroots' is a list of PlannerInfo structs (one per rel) @@ -6663,6 +6665,7 @@ ModifyTablePath * create_modifytable_path(PlannerInfo *root, RelOptInfo *rel, CmdType operation, bool canSetTag, Index nominalRelation, List *partitioned_rels, + bool partColsUpdated, List *resultRelations, List *subpaths, List *subroots, List *withCheckOptionLists, List *returningLists, @@ -6730,6 +6733,7 @@ create_modifytable_path(PlannerInfo *root, RelOptInfo *rel, pathnode->canSetTag = canSetTag; pathnode->nominalRelation = nominalRelation; pathnode->partitioned_rels = list_copy(partitioned_rels); + pathnode->partColsUpdated = partColsUpdated; pathnode->resultRelations = resultRelations; pathnode->subpaths = subpaths; pathnode->subroots = subroots; diff --git a/src/include/executor/execPartition.h b/src/include/executor/execPartition.h index bea189c5..45acfa92 100644 --- a/src/include/executor/execPartition.h +++ b/src/include/executor/execPartition.h @@ -62,11 +62,24 @@ typedef struct PartitionDispatchData *PartitionDispatch; * for every leaf partition in the partition tree. * num_partitions Number of leaf partitions in the partition tree * (= 'partitions' array length) - * partition_tupconv_maps Array of TupleConversionMap objects with one + * parent_child_tupconv_maps Array of TupleConversionMap objects with one * entry for every leaf partition (required to - * convert input tuple based on the root table's - * rowtype to a leaf partition's rowtype after - * tuple routing is done) + * convert tuple from the root table's rowtype to + * a leaf partition's rowtype after tuple routing + * is done) + * child_parent_tupconv_maps Array of TupleConversionMap objects with one + * entry for every leaf partition (required to + * convert an updated tuple from the leaf + * partition's rowtype to the root table's rowtype + * so that tuple routing can be done) + * child_parent_map_not_required Array of bool. True value means that a map is + * determined to be not required for the given + * partition. False means either we haven't yet + * checked if a map is required, or it was + * determined to be required. + * subplan_partition_offsets Integer array ordered by UPDATE subplans. Each + * element of this array has the index into the + * corresponding partition in partitions array. 
* partition_tuple_slot TupleTableSlot to be used to manipulate any * given leaf partition's rowtype after that * partition is chosen for insertion by @@ -79,8 +92,12 @@ typedef struct PartitionTupleRouting int num_dispatch; ResultRelInfo **partitions; int num_partitions; - TupleConversionMap **partition_tupconv_maps; + TupleConversionMap **parent_child_tupconv_maps; + TupleConversionMap **child_parent_tupconv_maps; + bool *child_parent_map_not_required; + int *subplan_partition_offsets; TupleTableSlot *partition_tuple_slot; + TupleTableSlot *root_tuple_slot; } PartitionTupleRouting; extern PartitionTupleRouting *ExecSetupPartitionTupleRouting(ModifyTableState *mtstate, @@ -90,6 +107,13 @@ extern int ExecFindPartition(ResultRelInfo *resultRelInfo, PartitionDispatch *pd, TupleTableSlot *slot, EState *estate); +extern void ExecSetupChildParentMapForLeaf(PartitionTupleRouting *proute); +extern TupleConversionMap *TupConvMapForLeaf(PartitionTupleRouting *proute, + ResultRelInfo *rootRelInfo, int leaf_index); +extern HeapTuple ConvertPartitionTupleSlot(TupleConversionMap *map, + HeapTuple tuple, + TupleTableSlot *new_slot, + TupleTableSlot **p_my_slot); extern void ExecCleanupTupleRouting(PartitionTupleRouting *proute); #endif /* EXECPARTITION_H */ diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index 3d7ece62..74475d60 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -1070,8 +1070,8 @@ typedef struct ModifyTableState /* Tuple-routing support info */ struct TransitionCaptureState *mt_transition_capture; /* controls transition table population */ - TupleConversionMap **mt_transition_tupconv_maps; - /* Per plan/partition tuple conversion */ + TupleConversionMap **mt_per_subplan_tupconv_maps; + /* Per plan map for tuple conversion from child to root */ #ifdef __TBASE__ /* used for interval partition */ bool haspartparent; diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h index ce1f6719..4b3c49d2 100644 --- a/src/include/nodes/plannodes.h +++ b/src/include/nodes/plannodes.h @@ -256,6 +256,7 @@ typedef struct ModifyTable Index nominalRelation; /* Parent RT index for use of EXPLAIN */ /* RT indexes of non-leaf tables in a partition tree */ List *partitioned_rels; + bool partColsUpdated; /* some part key in hierarchy updated */ List *resultRelations; /* integer list of RT indexes */ int resultRelIndex; /* index of first resultRel in plan's list */ int rootResultRelIndex; /* index of the partitioned table root */ diff --git a/src/include/nodes/relation.h b/src/include/nodes/relation.h index 96258106..e2af7ebc 100644 --- a/src/include/nodes/relation.h +++ b/src/include/nodes/relation.h @@ -1815,6 +1815,7 @@ typedef struct ModifyTablePath Index nominalRelation; /* Parent RT index for use of EXPLAIN */ /* RT indexes of non-leaf tables in a partition tree */ List *partitioned_rels; + bool partColsUpdated; /* some part key in hierarchy updated */ List *resultRelations; /* integer list of RT indexes */ List *subpaths; /* Path(s) producing source data */ List *subroots; /* per-target-table PlannerInfos */ @@ -2263,6 +2264,8 @@ typedef struct PartitionedChildRelInfo Index parent_relid; List *child_rels; + bool part_cols_updated; /* is the partition key of any of + * the partitioned tables updated? 
*/ } PartitionedChildRelInfo; /* diff --git a/src/include/optimizer/pathnode.h b/src/include/optimizer/pathnode.h index d6e8ffdb..4097e568 100644 --- a/src/include/optimizer/pathnode.h +++ b/src/include/optimizer/pathnode.h @@ -299,6 +299,7 @@ extern ModifyTablePath *create_modifytable_path(PlannerInfo *root, RelOptInfo *rel, CmdType operation, bool canSetTag, Index nominalRelation, List *partitioned_rels, + bool partColsUpdated, List *resultRelations, List *subpaths, List *subroots, List *withCheckOptionLists, List *returningLists, diff --git a/src/include/optimizer/planner.h b/src/include/optimizer/planner.h index 464efbe4..1425e543 100644 --- a/src/include/optimizer/planner.h +++ b/src/include/optimizer/planner.h @@ -121,7 +121,8 @@ extern Expr *preprocess_phv_expression(PlannerInfo *root, Expr *expr); extern bool plan_cluster_use_sort(Oid tableOid, Oid indexOid); -extern List *get_partitioned_child_rels(PlannerInfo *root, Index rti); +extern List *get_partitioned_child_rels(PlannerInfo *root, Index rti, + bool *part_cols_updated); extern List *get_partitioned_child_rels_for_join(PlannerInfo *root, Relids join_relids); diff --git a/src/test/regress/expected/update.out b/src/test/regress/expected/update.out index 9cdaf10f..ed21a142 100644 --- a/src/test/regress/expected/update.out +++ b/src/test/regress/expected/update.out @@ -198,58 +198,668 @@ INSERT INTO upsert_test VALUES (1, 'Bat') ON CONFLICT(a) DROP TABLE update_test; DROP TABLE upsert_test; --- update to a partition should check partition bound constraint for the new tuple -create table range_parted ( +--------------------------- +-- UPDATE with row movement +--------------------------- +-- When a partitioned table receives an UPDATE to the partitioned key and the +-- new values no longer meet the partition's bound, the row must be moved to +-- the correct partition for the new partition key (if one exists). We must +-- also ensure that updatable views on partitioned tables properly enforce any +-- WITH CHECK OPTION that is defined. The situation with triggers in this case +-- also requires thorough testing as partition key updates causing row +-- movement convert UPDATEs into DELETE+INSERT. +CREATE TABLE range_parted ( a text, - b int -) partition by range (a, b); -create table part_a_1_a_10 partition of range_parted for values from ('a', 1) to ('a', 10); -create table part_a_10_a_20 partition of range_parted for values from ('a', 10) to ('a', 20); -create table part_b_1_b_10 partition of range_parted for values from ('b', 1) to ('b', 10); -create table part_b_10_b_20 partition of range_parted for values from ('b', 10) to ('b', 20); -insert into part_a_1_a_10 values ('a', 1); -insert into part_b_10_b_20 values ('b', 10); --- fail -update part_a_1_a_10 set a = 'b' where a = 'a'; -ERROR: could not plan this distributed update -DETAIL: correlated UPDATE or updating distribution column currently not supported in Postgres-XL. -update range_parted set b = b - 1 where b = 10; -ERROR: new row for relation "part_b_10_b_20" violates partition constraint -DETAIL: Failing row contains (b, 9). + b bigint, + c numeric, + d int, + e varchar +) PARTITION BY RANGE (a, b); +-- Create partitions intentionally in descending bound order, so as to test +-- that update-row-movement works with the leaf partitions not in bound order. 
+CREATE TABLE part_b_20_b_30 (e varchar, c numeric, a text, b bigint, d int); +ALTER TABLE range_parted ATTACH PARTITION part_b_20_b_30 FOR VALUES FROM ('b', 20) TO ('b', 30); +CREATE TABLE part_b_10_b_20 (e varchar, c numeric, a text, b bigint, d int) PARTITION BY RANGE (c); +CREATE TABLE part_b_1_b_10 PARTITION OF range_parted FOR VALUES FROM ('b', 1) TO ('b', 10); +ALTER TABLE range_parted ATTACH PARTITION part_b_10_b_20 FOR VALUES FROM ('b', 10) TO ('b', 20); +CREATE TABLE part_a_10_a_20 PARTITION OF range_parted FOR VALUES FROM ('a', 10) TO ('a', 20); +CREATE TABLE part_a_1_a_10 PARTITION OF range_parted FOR VALUES FROM ('a', 1) TO ('a', 10); +-- Check that partition-key UPDATE works sanely on a partitioned table that +-- does not have any child partitions. +UPDATE part_b_10_b_20 set b = b - 6; +-- Create some more partitions following the above pattern of descending bound +-- order, but let's make the situation a bit more complex by having the +-- attribute numbers of the columns vary from their parent partition. +CREATE TABLE part_c_100_200 (e varchar, c numeric, a text, b bigint, d int) PARTITION BY range (abs(d)); +ALTER TABLE part_c_100_200 DROP COLUMN e, DROP COLUMN c, DROP COLUMN a; +ALTER TABLE part_c_100_200 ADD COLUMN c numeric, ADD COLUMN e varchar, ADD COLUMN a text; +ALTER TABLE part_c_100_200 DROP COLUMN b; +ALTER TABLE part_c_100_200 ADD COLUMN b bigint; +CREATE TABLE part_d_1_15 PARTITION OF part_c_100_200 FOR VALUES FROM (1) TO (15); +CREATE TABLE part_d_15_20 PARTITION OF part_c_100_200 FOR VALUES FROM (15) TO (20); +ALTER TABLE part_b_10_b_20 ATTACH PARTITION part_c_100_200 FOR VALUES FROM (100) TO (200); +CREATE TABLE part_c_1_100 (e varchar, d int, c numeric, b bigint, a text); +ALTER TABLE part_b_10_b_20 ATTACH PARTITION part_c_1_100 FOR VALUES FROM (1) TO (100); +\set init_range_parted 'truncate range_parted; insert into range_parted VALUES (''a'', 1, 1, 1), (''a'', 10, 200, 1), (''b'', 12, 96, 1), (''b'', 13, 97, 2), (''b'', 15, 105, 16), (''b'', 17, 105, 19)' +\set show_data 'select tableoid::regclass::text COLLATE "C" partname, * from range_parted ORDER BY 1, 2, 3, 4, 5, 6' +:init_range_parted; +:show_data; + partname | a | b | c | d | e +----------------+---+----+-----+----+--- + part_a_10_a_20 | a | 10 | 200 | 1 | + part_a_1_a_10 | a | 1 | 1 | 1 | + part_c_1_100 | b | 12 | 96 | 1 | + part_c_1_100 | b | 13 | 97 | 2 | + part_d_15_20 | b | 15 | 105 | 16 | + part_d_15_20 | b | 17 | 105 | 19 | +(6 rows) + +-- The order of subplans should be in bound order +EXPLAIN (costs off) UPDATE range_parted set c = c - 50 WHERE c > 97; + QUERY PLAN +------------------------------------- + Update on range_parted + Update on part_a_1_a_10 + Update on part_a_10_a_20 + Update on part_b_1_b_10 + Update on part_c_1_100 + Update on part_d_1_15 + Update on part_d_15_20 + Update on part_b_20_b_30 + -> Seq Scan on part_a_1_a_10 + Filter: (c > '97'::numeric) + -> Seq Scan on part_a_10_a_20 + Filter: (c > '97'::numeric) + -> Seq Scan on part_b_1_b_10 + Filter: (c > '97'::numeric) + -> Seq Scan on part_c_1_100 + Filter: (c > '97'::numeric) + -> Seq Scan on part_d_1_15 + Filter: (c > '97'::numeric) + -> Seq Scan on part_d_15_20 + Filter: (c > '97'::numeric) + -> Seq Scan on part_b_20_b_30 + Filter: (c > '97'::numeric) +(22 rows) + +-- fail, row movement happens only within the partition subtree. +UPDATE part_c_100_200 set c = c - 20, d = c WHERE c = 105; +ERROR: new row for relation "part_c_100_200" violates partition constraint +DETAIL: Failing row contains (105, 85, null, b, 15). 
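-- [Illustrative sketch; not part of this patch or its regression suite.]
-- Minimal shape of the update-row-movement behaviour exercised above, using
-- hypothetical names (demo_parted, demo_p1, demo_p2):
CREATE TABLE demo_parted (k int, v text) PARTITION BY RANGE (k);
CREATE TABLE demo_p1 PARTITION OF demo_parted FOR VALUES FROM (0) TO (10);
CREATE TABLE demo_p2 PARTITION OF demo_parted FOR VALUES FROM (10) TO (20);
INSERT INTO demo_parted VALUES (5, 'x');
-- With this patch, pushing the key past demo_p1's bound is carried out as a
-- DELETE from demo_p1 plus an INSERT routed into demo_p2 through the root,
-- instead of failing with a partition constraint violation.
UPDATE demo_parted SET k = 15 WHERE k = 5;
SELECT tableoid::regclass, * FROM demo_parted;  -- the row now lives in demo_p2
DROP TABLE demo_parted;
-- As the failing UPDATE on part_c_100_200 just above shows, movement is still
-- confined to the partition subtree of the table actually named in the UPDATE.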
+-- fail, no partition key update, so no attempt to move tuple, +-- but "a = 'a'" violates partition constraint enforced by root partition) +UPDATE part_b_10_b_20 set a = 'a'; +ERROR: new row for relation "part_c_1_100" violates partition constraint +DETAIL: Failing row contains (null, 1, 96, 12, a). +-- ok, partition key update, no constraint violation +UPDATE range_parted set d = d - 10 WHERE d > 10; +-- ok, no partition key update, no constraint violation +UPDATE range_parted set e = d; +-- No row found +UPDATE part_c_1_100 set c = c + 20 WHERE c = 98; +-- ok, row movement +UPDATE part_b_10_b_20 set c = c + 20 returning c, b, a; + c | b | a +-----+----+--- + 116 | 12 | b + 117 | 13 | b + 125 | 15 | b + 125 | 17 | b +(4 rows) + +:show_data; + partname | a | b | c | d | e +----------------+---+----+-----+---+--- + part_a_10_a_20 | a | 10 | 200 | 1 | 1 + part_a_1_a_10 | a | 1 | 1 | 1 | 1 + part_d_1_15 | b | 12 | 116 | 1 | 1 + part_d_1_15 | b | 13 | 117 | 2 | 2 + part_d_1_15 | b | 15 | 125 | 6 | 6 + part_d_1_15 | b | 17 | 125 | 9 | 9 +(6 rows) + +-- fail, row movement happens only within the partition subtree. +UPDATE part_b_10_b_20 set b = b - 6 WHERE c > 116 returning *; +ERROR: new row for relation "part_d_1_15" violates partition constraint +DETAIL: Failing row contains (2, 117, 2, b, 7). +-- ok, row movement, with subset of rows moved into different partition. +UPDATE range_parted set b = b - 6 WHERE c > 116 returning a, b + c; + a | ?column? +---+---------- + a | 204 + b | 124 + b | 134 + b | 136 +(4 rows) + +:show_data; + partname | a | b | c | d | e +---------------+---+----+-----+---+--- + part_a_1_a_10 | a | 1 | 1 | 1 | 1 + part_a_1_a_10 | a | 4 | 200 | 1 | 1 + part_b_1_b_10 | b | 7 | 117 | 2 | 2 + part_b_1_b_10 | b | 9 | 125 | 6 | 6 + part_d_1_15 | b | 11 | 125 | 9 | 9 + part_d_1_15 | b | 12 | 116 | 1 | 1 +(6 rows) + +-- Common table needed for multiple test scenarios. +CREATE TABLE mintab(c1 int); +INSERT into mintab VALUES (120); +-- update partition key using updatable view. +CREATE VIEW upview AS SELECT * FROM range_parted WHERE (select c > c1 FROM mintab) WITH CHECK OPTION; +-- ok +UPDATE upview set c = 199 WHERE b = 4; +-- fail, check option violation +UPDATE upview set c = 120 WHERE b = 4; +ERROR: new row violates check option for view "upview" +DETAIL: Failing row contains (a, 4, 120, 1, 1). +-- fail, row movement with check option violation +UPDATE upview set a = 'b', b = 15, c = 120 WHERE b = 4; +ERROR: new row violates check option for view "upview" +DETAIL: Failing row contains (b, 15, 120, 1, 1). +-- ok, row movement, check option passes +UPDATE upview set a = 'b', b = 15 WHERE b = 4; +:show_data; + partname | a | b | c | d | e +---------------+---+----+-----+---+--- + part_a_1_a_10 | a | 1 | 1 | 1 | 1 + part_b_1_b_10 | b | 7 | 117 | 2 | 2 + part_b_1_b_10 | b | 9 | 125 | 6 | 6 + part_d_1_15 | b | 11 | 125 | 9 | 9 + part_d_1_15 | b | 12 | 116 | 1 | 1 + part_d_1_15 | b | 15 | 199 | 1 | 1 +(6 rows) + +-- cleanup +DROP VIEW upview; +-- RETURNING having whole-row vars. 
+:init_range_parted; +UPDATE range_parted set c = 95 WHERE a = 'b' and b > 10 and c > 100 returning (range_parted), *; + range_parted | a | b | c | d | e +---------------+---+----+----+----+--- + (b,15,95,16,) | b | 15 | 95 | 16 | + (b,17,95,19,) | b | 17 | 95 | 19 | +(2 rows) + +:show_data; + partname | a | b | c | d | e +----------------+---+----+-----+----+--- + part_a_10_a_20 | a | 10 | 200 | 1 | + part_a_1_a_10 | a | 1 | 1 | 1 | + part_c_1_100 | b | 12 | 96 | 1 | + part_c_1_100 | b | 13 | 97 | 2 | + part_c_1_100 | b | 15 | 95 | 16 | + part_c_1_100 | b | 17 | 95 | 19 | +(6 rows) + +-- Transition tables with update row movement +:init_range_parted; +CREATE FUNCTION trans_updatetrigfunc() RETURNS trigger LANGUAGE plpgsql AS +$$ + begin + raise notice 'trigger = %, old table = %, new table = %', + TG_NAME, + (select string_agg(old_table::text, ', ' ORDER BY a) FROM old_table), + (select string_agg(new_table::text, ', ' ORDER BY a) FROM new_table); + return null; + end; +$$; +CREATE TRIGGER trans_updatetrig + AFTER UPDATE ON range_parted REFERENCING OLD TABLE AS old_table NEW TABLE AS new_table + FOR EACH STATEMENT EXECUTE PROCEDURE trans_updatetrigfunc(); +UPDATE range_parted set c = (case when c = 96 then 110 else c + 1 end ) WHERE a = 'b' and b > 10 and c >= 96; +NOTICE: trigger = trans_updatetrig, old table = (b,12,96,1,), (b,13,97,2,), (b,15,105,16,), (b,17,105,19,), new table = (b,12,110,1,), (b,13,98,2,), (b,15,106,16,), (b,17,106,19,) +:show_data; + partname | a | b | c | d | e +----------------+---+----+-----+----+--- + part_a_10_a_20 | a | 10 | 200 | 1 | + part_a_1_a_10 | a | 1 | 1 | 1 | + part_c_1_100 | b | 13 | 98 | 2 | + part_d_15_20 | b | 15 | 106 | 16 | + part_d_15_20 | b | 17 | 106 | 19 | + part_d_1_15 | b | 12 | 110 | 1 | +(6 rows) + +:init_range_parted; +-- Enabling OLD TABLE capture for both DELETE as well as UPDATE stmt triggers +-- should not cause DELETEd rows to be captured twice. Similar thing for +-- INSERT triggers and inserted rows. +CREATE TRIGGER trans_deletetrig + AFTER DELETE ON range_parted REFERENCING OLD TABLE AS old_table + FOR EACH STATEMENT EXECUTE PROCEDURE trans_updatetrigfunc(); +CREATE TRIGGER trans_inserttrig + AFTER INSERT ON range_parted REFERENCING NEW TABLE AS new_table + FOR EACH STATEMENT EXECUTE PROCEDURE trans_updatetrigfunc(); +UPDATE range_parted set c = c + 50 WHERE a = 'b' and b > 10 and c >= 96; +NOTICE: trigger = trans_updatetrig, old table = (b,12,96,1,), (b,13,97,2,), (b,15,105,16,), (b,17,105,19,), new table = (b,12,146,1,), (b,13,147,2,), (b,15,155,16,), (b,17,155,19,) +:show_data; + partname | a | b | c | d | e +----------------+---+----+-----+----+--- + part_a_10_a_20 | a | 10 | 200 | 1 | + part_a_1_a_10 | a | 1 | 1 | 1 | + part_d_15_20 | b | 15 | 155 | 16 | + part_d_15_20 | b | 17 | 155 | 19 | + part_d_1_15 | b | 12 | 146 | 1 | + part_d_1_15 | b | 13 | 147 | 2 | +(6 rows) + +DROP TRIGGER trans_deletetrig ON range_parted; +DROP TRIGGER trans_inserttrig ON range_parted; +-- Don't drop trans_updatetrig yet. It is required below. +-- Test with transition tuple conversion happening for rows moved into the +-- new partition. This requires a trigger that references transition table +-- (we already have trans_updatetrig). For inserted rows, the conversion +-- is not usually needed, because the original tuple is already compatible with +-- the desired transition tuple format. But conversion happens when there is a +-- BR trigger because the trigger can change the inserted row. 
So install a +-- BR triggers on those child partitions where the rows will be moved. +CREATE FUNCTION func_parted_mod_b() RETURNS trigger AS $$ +BEGIN + NEW.b = NEW.b + 1; + return NEW; +END $$ language plpgsql; +CREATE TRIGGER trig_c1_100 BEFORE UPDATE OR INSERT ON part_c_1_100 + FOR EACH ROW EXECUTE PROCEDURE func_parted_mod_b(); +CREATE TRIGGER trig_d1_15 BEFORE UPDATE OR INSERT ON part_d_1_15 + FOR EACH ROW EXECUTE PROCEDURE func_parted_mod_b(); +CREATE TRIGGER trig_d15_20 BEFORE UPDATE OR INSERT ON part_d_15_20 + FOR EACH ROW EXECUTE PROCEDURE func_parted_mod_b(); +:init_range_parted; +UPDATE range_parted set c = (case when c = 96 then 110 else c + 1 end) WHERE a = 'b' and b > 10 and c >= 96; +NOTICE: trigger = trans_updatetrig, old table = (b,13,96,1,), (b,14,97,2,), (b,16,105,16,), (b,18,105,19,), new table = (b,15,110,1,), (b,15,98,2,), (b,17,106,16,), (b,19,106,19,) +:show_data; + partname | a | b | c | d | e +----------------+---+----+-----+----+--- + part_a_10_a_20 | a | 10 | 200 | 1 | + part_a_1_a_10 | a | 1 | 1 | 1 | + part_c_1_100 | b | 15 | 98 | 2 | + part_d_15_20 | b | 17 | 106 | 16 | + part_d_15_20 | b | 19 | 106 | 19 | + part_d_1_15 | b | 15 | 110 | 1 | +(6 rows) + +:init_range_parted; +UPDATE range_parted set c = c + 50 WHERE a = 'b' and b > 10 and c >= 96; +NOTICE: trigger = trans_updatetrig, old table = (b,13,96,1,), (b,14,97,2,), (b,16,105,16,), (b,18,105,19,), new table = (b,15,146,1,), (b,16,147,2,), (b,17,155,16,), (b,19,155,19,) +:show_data; + partname | a | b | c | d | e +----------------+---+----+-----+----+--- + part_a_10_a_20 | a | 10 | 200 | 1 | + part_a_1_a_10 | a | 1 | 1 | 1 | + part_d_15_20 | b | 17 | 155 | 16 | + part_d_15_20 | b | 19 | 155 | 19 | + part_d_1_15 | b | 15 | 146 | 1 | + part_d_1_15 | b | 16 | 147 | 2 | +(6 rows) + +-- Case where per-partition tuple conversion map array is allocated, but the +-- map is not required for the particular tuple that is routed, thanks to +-- matching table attributes of the partition and the target table. +:init_range_parted; +UPDATE range_parted set b = 15 WHERE b = 1; +NOTICE: trigger = trans_updatetrig, old table = (a,1,1,1,), new table = (a,15,1,1,) +:show_data; + partname | a | b | c | d | e +----------------+---+----+-----+----+--- + part_a_10_a_20 | a | 10 | 200 | 1 | + part_a_10_a_20 | a | 15 | 1 | 1 | + part_c_1_100 | b | 13 | 96 | 1 | + part_c_1_100 | b | 14 | 97 | 2 | + part_d_15_20 | b | 16 | 105 | 16 | + part_d_15_20 | b | 18 | 105 | 19 | +(6 rows) + +DROP TRIGGER trans_updatetrig ON range_parted; +DROP TRIGGER trig_c1_100 ON part_c_1_100; +DROP TRIGGER trig_d1_15 ON part_d_1_15; +DROP TRIGGER trig_d15_20 ON part_d_15_20; +DROP FUNCTION func_parted_mod_b(); +-- RLS policies with update-row-movement +----------------------------------------- +ALTER TABLE range_parted ENABLE ROW LEVEL SECURITY; +CREATE USER regress_range_parted_user; +GRANT ALL ON range_parted, mintab TO regress_range_parted_user; +CREATE POLICY seeall ON range_parted AS PERMISSIVE FOR SELECT USING (true); +CREATE POLICY policy_range_parted ON range_parted for UPDATE USING (true) WITH CHECK (c % 2 = 0); +:init_range_parted; +SET SESSION AUTHORIZATION regress_range_parted_user; +-- This should fail with RLS violation error while moving row from +-- part_a_10_a_20 to part_d_1_15, because we are setting 'c' to an odd number. 
+UPDATE range_parted set a = 'b', c = 151 WHERE a = 'a' and c = 200; +ERROR: new row violates row-level security policy for table "range_parted" +RESET SESSION AUTHORIZATION; +-- Create a trigger on part_d_1_15 +CREATE FUNCTION func_d_1_15() RETURNS trigger AS $$ +BEGIN + NEW.c = NEW.c + 1; -- Make even numbers odd, or vice versa + return NEW; +END $$ LANGUAGE plpgsql; +CREATE TRIGGER trig_d_1_15 BEFORE INSERT ON part_d_1_15 + FOR EACH ROW EXECUTE PROCEDURE func_d_1_15(); +:init_range_parted; +SET SESSION AUTHORIZATION regress_range_parted_user; +-- Here, RLS checks should succeed while moving row from part_a_10_a_20 to +-- part_d_1_15. Even though the UPDATE is setting 'c' to an odd number, the +-- trigger at the destination partition again makes it an even number. +UPDATE range_parted set a = 'b', c = 151 WHERE a = 'a' and c = 200; +RESET SESSION AUTHORIZATION; +:init_range_parted; +SET SESSION AUTHORIZATION regress_range_parted_user; +-- This should fail with RLS violation error. Even though the UPDATE is setting +-- 'c' to an even number, the trigger at the destination partition again makes +-- it an odd number. +UPDATE range_parted set a = 'b', c = 150 WHERE a = 'a' and c = 200; +ERROR: new row violates row-level security policy for table "range_parted" +-- Cleanup +RESET SESSION AUTHORIZATION; +DROP TRIGGER trig_d_1_15 ON part_d_1_15; +DROP FUNCTION func_d_1_15(); +-- Policy expression contains SubPlan +RESET SESSION AUTHORIZATION; +:init_range_parted; +CREATE POLICY policy_range_parted_subplan on range_parted + AS RESTRICTIVE for UPDATE USING (true) + WITH CHECK ((SELECT range_parted.c <= c1 FROM mintab)); +SET SESSION AUTHORIZATION regress_range_parted_user; +-- fail, mintab has row with c1 = 120 +UPDATE range_parted set a = 'b', c = 122 WHERE a = 'a' and c = 200; +ERROR: new row violates row-level security policy "policy_range_parted_subplan" for table "range_parted" -- ok -update range_parted set b = b + 1 where b = 10; +UPDATE range_parted set a = 'b', c = 120 WHERE a = 'a' and c = 200; +-- RLS policy expression contains whole row. 
+RESET SESSION AUTHORIZATION; +:init_range_parted; +CREATE POLICY policy_range_parted_wholerow on range_parted AS RESTRICTIVE for UPDATE USING (true) + WITH CHECK (range_parted = row('b', 10, 112, 1, NULL)::range_parted); +SET SESSION AUTHORIZATION regress_range_parted_user; +-- ok, should pass the RLS check +UPDATE range_parted set a = 'b', c = 112 WHERE a = 'a' and c = 200; +RESET SESSION AUTHORIZATION; +:init_range_parted; +SET SESSION AUTHORIZATION regress_range_parted_user; +-- fail, the whole row RLS check should fail +UPDATE range_parted set a = 'b', c = 116 WHERE a = 'a' and c = 200; +ERROR: new row violates row-level security policy "policy_range_parted_wholerow" for table "range_parted" +-- Cleanup +RESET SESSION AUTHORIZATION; +DROP POLICY policy_range_parted ON range_parted; +DROP POLICY policy_range_parted_subplan ON range_parted; +DROP POLICY policy_range_parted_wholerow ON range_parted; +REVOKE ALL ON range_parted, mintab FROM regress_range_parted_user; +DROP USER regress_range_parted_user; +DROP TABLE mintab; +-- statement triggers with update row movement +--------------------------------------------------- +:init_range_parted; +CREATE FUNCTION trigfunc() returns trigger language plpgsql as +$$ + begin + raise notice 'trigger = % fired on table % during %', + TG_NAME, TG_TABLE_NAME, TG_OP; + return null; + end; +$$; +-- Triggers on root partition +CREATE TRIGGER parent_delete_trig + AFTER DELETE ON range_parted for each statement execute procedure trigfunc(); +CREATE TRIGGER parent_update_trig + AFTER UPDATE ON range_parted for each statement execute procedure trigfunc(); +CREATE TRIGGER parent_insert_trig + AFTER INSERT ON range_parted for each statement execute procedure trigfunc(); +-- Triggers on leaf partition part_c_1_100 +CREATE TRIGGER c1_delete_trig + AFTER DELETE ON part_c_1_100 for each statement execute procedure trigfunc(); +CREATE TRIGGER c1_update_trig + AFTER UPDATE ON part_c_1_100 for each statement execute procedure trigfunc(); +CREATE TRIGGER c1_insert_trig + AFTER INSERT ON part_c_1_100 for each statement execute procedure trigfunc(); +-- Triggers on leaf partition part_d_1_15 +CREATE TRIGGER d1_delete_trig + AFTER DELETE ON part_d_1_15 for each statement execute procedure trigfunc(); +CREATE TRIGGER d1_update_trig + AFTER UPDATE ON part_d_1_15 for each statement execute procedure trigfunc(); +CREATE TRIGGER d1_insert_trig + AFTER INSERT ON part_d_1_15 for each statement execute procedure trigfunc(); +-- Triggers on leaf partition part_d_15_20 +CREATE TRIGGER d15_delete_trig + AFTER DELETE ON part_d_15_20 for each statement execute procedure trigfunc(); +CREATE TRIGGER d15_update_trig + AFTER UPDATE ON part_d_15_20 for each statement execute procedure trigfunc(); +CREATE TRIGGER d15_insert_trig + AFTER INSERT ON part_d_15_20 for each statement execute procedure trigfunc(); +-- Move all rows from part_c_100_200 to part_c_1_100. None of the delete or +-- insert statement triggers should be fired. 
+UPDATE range_parted set c = c - 50 WHERE c > 97; +NOTICE: trigger = parent_update_trig fired on table range_parted during UPDATE +:show_data; + partname | a | b | c | d | e +----------------+---+----+-----+----+--- + part_a_10_a_20 | a | 10 | 150 | 1 | + part_a_1_a_10 | a | 1 | 1 | 1 | + part_c_1_100 | b | 12 | 96 | 1 | + part_c_1_100 | b | 13 | 97 | 2 | + part_c_1_100 | b | 15 | 55 | 16 | + part_c_1_100 | b | 17 | 55 | 19 | +(6 rows) + +DROP TRIGGER parent_delete_trig ON range_parted; +DROP TRIGGER parent_update_trig ON range_parted; +DROP TRIGGER parent_insert_trig ON range_parted; +DROP TRIGGER c1_delete_trig ON part_c_1_100; +DROP TRIGGER c1_update_trig ON part_c_1_100; +DROP TRIGGER c1_insert_trig ON part_c_1_100; +DROP TRIGGER d1_delete_trig ON part_d_1_15; +DROP TRIGGER d1_update_trig ON part_d_1_15; +DROP TRIGGER d1_insert_trig ON part_d_1_15; +DROP TRIGGER d15_delete_trig ON part_d_15_20; +DROP TRIGGER d15_update_trig ON part_d_15_20; +DROP TRIGGER d15_insert_trig ON part_d_15_20; -- Creating default partition for range +:init_range_parted; create table part_def partition of range_parted default; \d+ part_def - Table "public.part_def" - Column | Type | Collation | Nullable | Default | Storage | Stats target | Description ---------+---------+-----------+----------+---------+----------+--------------+------------- - a | text | | | | extended | | - b | integer | | | | plain | | + Table "public.part_def" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +--------+-------------------+-----------+----------+---------+----------+--------------+------------- + a | text | | | | extended | | + b | bigint | | | | plain | | + c | numeric | | | | main | | + d | integer | | | | plain | | + e | character varying | | | | extended | | Partition of: range_parted DEFAULT -Partition constraint: (NOT ((a IS NOT NULL) AND (b IS NOT NULL) AND (((a = 'a'::text) AND (b >= 1) AND (b < 10)) OR ((a = 'a'::text) AND (b >= 10) AND (b < 20)) OR ((a = 'b'::text) AND (b >= 1) AND (b < 10)) OR ((a = 'b'::text) AND (b >= 10) AND (b < 20))))) - +Partition constraint: (NOT ((a IS NOT NULL) AND (b IS NOT NULL) AND (((a = 'a'::text) AND (b >= '1'::bigint) AND (b < '10'::bigint)) OR ((a = 'a'::text) AND (b >= '10'::bigint) AND (b < '20'::bigint)) OR ((a = 'b'::text) AND (b >= '1'::bigint) AND (b < '10'::bigint)) OR ((a = 'b'::text) AND (b >= '10'::bigint) AND (b < '20'::bigint)) OR ((a = 'b'::text) AND (b >= '20'::bigint) AND (b < '30'::bigint))))) + insert into range_parted values ('c', 9); -- ok update part_def set a = 'd' where a = 'c'; -- fail update part_def set a = 'a' where a = 'd'; ERROR: new row for relation "part_def" violates partition constraint -DETAIL: Failing row contains (a, 9). -create table list_parted ( +DETAIL: Failing row contains (a, 9, null, null, null). +:show_data; + partname | a | b | c | d | e +----------------+---+----+-----+----+--- + part_a_10_a_20 | a | 10 | 200 | 1 | + part_a_1_a_10 | a | 1 | 1 | 1 | + part_c_1_100 | b | 12 | 96 | 1 | + part_c_1_100 | b | 13 | 97 | 2 | + part_d_15_20 | b | 15 | 105 | 16 | + part_d_15_20 | b | 17 | 105 | 19 | + part_def | d | 9 | | | +(7 rows) + +-- Update row movement from non-default to default partition. +-- fail, default partition is not under part_a_10_a_20; +UPDATE part_a_10_a_20 set a = 'ad' WHERE a = 'a'; +ERROR: new row for relation "part_a_10_a_20" violates partition constraint +DETAIL: Failing row contains (ad, 10, 200, 1, null). 
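-- [Illustrative sketch; not part of this patch.] The same subtree rule applies
-- to the DEFAULT partition: a row can only be moved into it by updating through
-- a table that has it in its subtree, i.e. the root. Hypothetical names
-- (demo_list, demo_list_a, demo_list_def):
CREATE TABLE demo_list (a text) PARTITION BY LIST (a);
CREATE TABLE demo_list_a PARTITION OF demo_list FOR VALUES IN ('a');
CREATE TABLE demo_list_def PARTITION OF demo_list DEFAULT;
INSERT INTO demo_list VALUES ('a');
UPDATE demo_list_a SET a = 'z';  -- fails: demo_list_def is not in demo_list_a's subtree
UPDATE demo_list SET a = 'z';    -- ok: row is routed from demo_list_a into demo_list_def
DROP TABLE demo_list;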
+-- ok +UPDATE range_parted set a = 'ad' WHERE a = 'a'; +UPDATE range_parted set a = 'bd' WHERE a = 'b'; +:show_data; + partname | a | b | c | d | e +----------+----+----+-----+----+--- + part_def | ad | 1 | 1 | 1 | + part_def | ad | 10 | 200 | 1 | + part_def | bd | 12 | 96 | 1 | + part_def | bd | 13 | 97 | 2 | + part_def | bd | 15 | 105 | 16 | + part_def | bd | 17 | 105 | 19 | + part_def | d | 9 | | | +(7 rows) + +-- Update row movement from default to non-default partitions. +-- ok +UPDATE range_parted set a = 'a' WHERE a = 'ad'; +UPDATE range_parted set a = 'b' WHERE a = 'bd'; +:show_data; + partname | a | b | c | d | e +----------------+---+----+-----+----+--- + part_a_10_a_20 | a | 10 | 200 | 1 | + part_a_1_a_10 | a | 1 | 1 | 1 | + part_c_1_100 | b | 12 | 96 | 1 | + part_c_1_100 | b | 13 | 97 | 2 | + part_d_15_20 | b | 15 | 105 | 16 | + part_d_15_20 | b | 17 | 105 | 19 | + part_def | d | 9 | | | +(7 rows) + +-- Cleanup: range_parted no longer needed. +DROP TABLE range_parted; +CREATE TABLE list_parted ( a text, b int -) partition by list (a); -create table list_part1 partition of list_parted for values in ('a', 'b'); -create table list_default partition of list_parted default; -insert into list_part1 values ('a', 1); -insert into list_default values ('d', 10); +) PARTITION BY list (a); +CREATE TABLE list_part1 PARTITION OF list_parted for VALUES in ('a', 'b'); +CREATE TABLE list_default PARTITION OF list_parted default; +INSERT into list_part1 VALUES ('a', 1); +INSERT into list_default VALUES ('d', 10); -- fail -update list_default set a = 'a' where a = 'd'; +UPDATE list_default set a = 'a' WHERE a = 'd'; ERROR: new row for relation "list_default" violates partition constraint DETAIL: Failing row contains (a, 10). -- ok -update list_default set a = 'x' where a = 'd'; +UPDATE list_default set a = 'x' WHERE a = 'd'; +DROP TABLE list_parted; +-------------- +-- Some more update-partition-key test scenarios below. This time use list +-- partitions. +-------------- +-- Setup for list partitions +CREATE TABLE list_parted (a numeric, b int, c int8) PARTITION BY list (a); +CREATE TABLE sub_parted PARTITION OF list_parted for VALUES in (1) PARTITION BY list (b); +CREATE TABLE sub_part1(b int, c int8, a numeric); +ALTER TABLE sub_parted ATTACH PARTITION sub_part1 for VALUES in (1); +CREATE TABLE sub_part2(b int, c int8, a numeric); +ALTER TABLE sub_parted ATTACH PARTITION sub_part2 for VALUES in (2); +CREATE TABLE list_part1(a numeric, b int, c int8); +ALTER TABLE list_parted ATTACH PARTITION list_part1 for VALUES in (2,3); +INSERT into list_parted VALUES (2,5,50); +INSERT into list_parted VALUES (3,6,60); +INSERT into sub_parted VALUES (1,1,60); +INSERT into sub_parted VALUES (1,2,10); +-- Test partition constraint violation when intermediate ancestor is used and +-- constraint is inherited from upper root. +UPDATE sub_parted set a = 2 WHERE c = 10; +ERROR: new row for relation "sub_part2" violates partition constraint +DETAIL: Failing row contains (2, 10, 2). +-- Test update-partition-key, where the unpruned partitions do not have their +-- partition keys updated. 
+SELECT tableoid::regclass::text, * FROM list_parted WHERE a = 2 ORDER BY 1; + tableoid | a | b | c +------------+---+---+---- + list_part1 | 2 | 5 | 50 +(1 row) + +UPDATE list_parted set b = c + a WHERE a = 2; +SELECT tableoid::regclass::text, * FROM list_parted WHERE a = 2 ORDER BY 1; + tableoid | a | b | c +------------+---+----+---- + list_part1 | 2 | 52 | 50 +(1 row) + +-- Test the case where BR UPDATE triggers change the partition key. +CREATE FUNCTION func_parted_mod_b() returns trigger as $$ +BEGIN + NEW.b = 2; -- This is changing partition key column. + return NEW; +END $$ LANGUAGE plpgsql; +CREATE TRIGGER parted_mod_b before update on sub_part1 + for each row execute procedure func_parted_mod_b(); +SELECT tableoid::regclass::text, * FROM list_parted ORDER BY 1, 2, 3, 4; + tableoid | a | b | c +------------+---+----+---- + list_part1 | 2 | 52 | 50 + list_part1 | 3 | 6 | 60 + sub_part1 | 1 | 1 | 60 + sub_part2 | 1 | 2 | 10 +(4 rows) + +-- This should do the tuple routing even though there is no explicit +-- partition-key update, because there is a trigger on sub_part1. +UPDATE list_parted set c = 70 WHERE b = 1; +SELECT tableoid::regclass::text, * FROM list_parted ORDER BY 1, 2, 3, 4; + tableoid | a | b | c +------------+---+----+---- + list_part1 | 2 | 52 | 50 + list_part1 | 3 | 6 | 60 + sub_part2 | 1 | 2 | 10 + sub_part2 | 1 | 2 | 70 +(4 rows) + +DROP TRIGGER parted_mod_b ON sub_part1; +-- If BR DELETE trigger prevented DELETE from happening, we should also skip +-- the INSERT if that delete is part of UPDATE=>DELETE+INSERT. +CREATE OR REPLACE FUNCTION func_parted_mod_b() returns trigger as $$ +BEGIN + raise notice 'Trigger: Got OLD row %, but returning NULL', OLD; + return NULL; +END $$ LANGUAGE plpgsql; +CREATE TRIGGER trig_skip_delete before delete on sub_part2 + for each row execute procedure func_parted_mod_b(); +UPDATE list_parted set b = 1 WHERE c = 70; +NOTICE: Trigger: Got OLD row (2,70,1), but returning NULL +SELECT tableoid::regclass::text, * FROM list_parted ORDER BY 1, 2, 3, 4; + tableoid | a | b | c +------------+---+----+---- + list_part1 | 2 | 52 | 50 + list_part1 | 3 | 6 | 60 + sub_part2 | 1 | 2 | 10 + sub_part2 | 1 | 2 | 70 +(4 rows) + +-- Drop the trigger. Now the row should be moved. +DROP TRIGGER trig_skip_delete ON sub_part2; +UPDATE list_parted set b = 1 WHERE c = 70; +SELECT tableoid::regclass::text, * FROM list_parted ORDER BY 1, 2, 3, 4; + tableoid | a | b | c +------------+---+----+---- + list_part1 | 2 | 52 | 50 + list_part1 | 3 | 6 | 60 + sub_part1 | 1 | 1 | 70 + sub_part2 | 1 | 2 | 10 +(4 rows) + +DROP FUNCTION func_parted_mod_b(); +-- UPDATE partition-key with FROM clause. If join produces multiple output +-- rows for the same row to be modified, we should tuple-route the row only +-- once. There should not be any rows inserted. +CREATE TABLE non_parted (id int); +INSERT into non_parted VALUES (1), (1), (1), (2), (2), (2), (3), (3), (3); +UPDATE list_parted t1 set a = 2 FROM non_parted t2 WHERE t1.a = t2.id and a = 1; +SELECT tableoid::regclass::text, * FROM list_parted ORDER BY 1, 2, 3, 4; + tableoid | a | b | c +------------+---+----+---- + list_part1 | 2 | 1 | 70 + list_part1 | 2 | 2 | 10 + list_part1 | 2 | 52 | 50 + list_part1 | 3 | 6 | 60 +(4 rows) + +DROP TABLE non_parted; +-- Cleanup: list_parted no longer needed. 
+DROP TABLE list_parted; -- create custom operator class and hash function, for the same reason -- explained in alter_table.sql create or replace function dummy_hashint4(a int4, seed int8) returns int8 as @@ -271,14 +881,11 @@ insert into hpart4 values (3, 4); update hpart1 set a = 3, b=4 where a = 1; ERROR: new row for relation "hpart1" violates partition constraint DETAIL: Failing row contains (3, 4). +-- ok, row movement update hash_parted set b = b - 1 where b = 1; -ERROR: new row for relation "hpart1" violates partition constraint -DETAIL: Failing row contains (1, 0). -- ok update hash_parted set b = b + 8 where b = 1; -- cleanup -drop table range_parted; -drop table list_parted; drop table hash_parted; drop operator class custom_opclass using hash; drop function dummy_hashint4(a int4, seed int8); diff --git a/src/test/regress/sql/update.sql b/src/test/regress/sql/update.sql index 9d673de4..a4f2f161 100644 --- a/src/test/regress/sql/update.sql +++ b/src/test/regress/sql/update.sql @@ -107,25 +107,336 @@ INSERT INTO upsert_test VALUES (1, 'Bat') ON CONFLICT(a) DROP TABLE update_test; DROP TABLE upsert_test; --- update to a partition should check partition bound constraint for the new tuple -create table range_parted ( + +--------------------------- +-- UPDATE with row movement +--------------------------- + +-- When a partitioned table receives an UPDATE to the partitioned key and the +-- new values no longer meet the partition's bound, the row must be moved to +-- the correct partition for the new partition key (if one exists). We must +-- also ensure that updatable views on partitioned tables properly enforce any +-- WITH CHECK OPTION that is defined. The situation with triggers in this case +-- also requires thorough testing as partition key updates causing row +-- movement convert UPDATEs into DELETE+INSERT. + +CREATE TABLE range_parted ( a text, - b int -) partition by range (a, b); -create table part_a_1_a_10 partition of range_parted for values from ('a', 1) to ('a', 10); -create table part_a_10_a_20 partition of range_parted for values from ('a', 10) to ('a', 20); -create table part_b_1_b_10 partition of range_parted for values from ('b', 1) to ('b', 10); -create table part_b_10_b_20 partition of range_parted for values from ('b', 10) to ('b', 20); -insert into part_a_1_a_10 values ('a', 1); -insert into part_b_10_b_20 values ('b', 10); + b bigint, + c numeric, + d int, + e varchar +) PARTITION BY RANGE (a, b); --- fail -update part_a_1_a_10 set a = 'b' where a = 'a'; -update range_parted set b = b - 1 where b = 10; +-- Create partitions intentionally in descending bound order, so as to test +-- that update-row-movement works with the leaf partitions not in bound order. +CREATE TABLE part_b_20_b_30 (e varchar, c numeric, a text, b bigint, d int); +ALTER TABLE range_parted ATTACH PARTITION part_b_20_b_30 FOR VALUES FROM ('b', 20) TO ('b', 30); +CREATE TABLE part_b_10_b_20 (e varchar, c numeric, a text, b bigint, d int) PARTITION BY RANGE (c); +CREATE TABLE part_b_1_b_10 PARTITION OF range_parted FOR VALUES FROM ('b', 1) TO ('b', 10); +ALTER TABLE range_parted ATTACH PARTITION part_b_10_b_20 FOR VALUES FROM ('b', 10) TO ('b', 20); +CREATE TABLE part_a_10_a_20 PARTITION OF range_parted FOR VALUES FROM ('a', 10) TO ('a', 20); +CREATE TABLE part_a_1_a_10 PARTITION OF range_parted FOR VALUES FROM ('a', 1) TO ('a', 10); + +-- Check that partition-key UPDATE works sanely on a partitioned table that +-- does not have any child partitions. 
+UPDATE part_b_10_b_20 set b = b - 6; + +-- Create some more partitions following the above pattern of descending bound +-- order, but let's make the situation a bit more complex by having the +-- attribute numbers of the columns vary from their parent partition. +CREATE TABLE part_c_100_200 (e varchar, c numeric, a text, b bigint, d int) PARTITION BY range (abs(d)); +ALTER TABLE part_c_100_200 DROP COLUMN e, DROP COLUMN c, DROP COLUMN a; +ALTER TABLE part_c_100_200 ADD COLUMN c numeric, ADD COLUMN e varchar, ADD COLUMN a text; +ALTER TABLE part_c_100_200 DROP COLUMN b; +ALTER TABLE part_c_100_200 ADD COLUMN b bigint; +CREATE TABLE part_d_1_15 PARTITION OF part_c_100_200 FOR VALUES FROM (1) TO (15); +CREATE TABLE part_d_15_20 PARTITION OF part_c_100_200 FOR VALUES FROM (15) TO (20); + +ALTER TABLE part_b_10_b_20 ATTACH PARTITION part_c_100_200 FOR VALUES FROM (100) TO (200); + +CREATE TABLE part_c_1_100 (e varchar, d int, c numeric, b bigint, a text); +ALTER TABLE part_b_10_b_20 ATTACH PARTITION part_c_1_100 FOR VALUES FROM (1) TO (100); + +\set init_range_parted 'truncate range_parted; insert into range_parted VALUES (''a'', 1, 1, 1), (''a'', 10, 200, 1), (''b'', 12, 96, 1), (''b'', 13, 97, 2), (''b'', 15, 105, 16), (''b'', 17, 105, 19)' +\set show_data 'select tableoid::regclass::text COLLATE "C" partname, * from range_parted ORDER BY 1, 2, 3, 4, 5, 6' +:init_range_parted; +:show_data; + +-- The order of subplans should be in bound order +EXPLAIN (costs off) UPDATE range_parted set c = c - 50 WHERE c > 97; + +-- fail, row movement happens only within the partition subtree. +UPDATE part_c_100_200 set c = c - 20, d = c WHERE c = 105; +-- fail, no partition key update, so no attempt to move tuple, +-- but "a = 'a'" violates partition constraint enforced by root partition) +UPDATE part_b_10_b_20 set a = 'a'; +-- ok, partition key update, no constraint violation +UPDATE range_parted set d = d - 10 WHERE d > 10; +-- ok, no partition key update, no constraint violation +UPDATE range_parted set e = d; +-- No row found +UPDATE part_c_1_100 set c = c + 20 WHERE c = 98; +-- ok, row movement +UPDATE part_b_10_b_20 set c = c + 20 returning c, b, a; +:show_data; + +-- fail, row movement happens only within the partition subtree. +UPDATE part_b_10_b_20 set b = b - 6 WHERE c > 116 returning *; +-- ok, row movement, with subset of rows moved into different partition. +UPDATE range_parted set b = b - 6 WHERE c > 116 returning a, b + c; + +:show_data; + +-- Common table needed for multiple test scenarios. +CREATE TABLE mintab(c1 int); +INSERT into mintab VALUES (120); + +-- update partition key using updatable view. +CREATE VIEW upview AS SELECT * FROM range_parted WHERE (select c > c1 FROM mintab) WITH CHECK OPTION; +-- ok +UPDATE upview set c = 199 WHERE b = 4; +-- fail, check option violation +UPDATE upview set c = 120 WHERE b = 4; +-- fail, row movement with check option violation +UPDATE upview set a = 'b', b = 15, c = 120 WHERE b = 4; +-- ok, row movement, check option passes +UPDATE upview set a = 'b', b = 15 WHERE b = 4; + +:show_data; + +-- cleanup +DROP VIEW upview; + +-- RETURNING having whole-row vars. 
+:init_range_parted; +UPDATE range_parted set c = 95 WHERE a = 'b' and b > 10 and c > 100 returning (range_parted), *; +:show_data; + + +-- Transition tables with update row movement +:init_range_parted; + +CREATE FUNCTION trans_updatetrigfunc() RETURNS trigger LANGUAGE plpgsql AS +$$ + begin + raise notice 'trigger = %, old table = %, new table = %', + TG_NAME, + (select string_agg(old_table::text, ', ' ORDER BY a) FROM old_table), + (select string_agg(new_table::text, ', ' ORDER BY a) FROM new_table); + return null; + end; +$$; + +CREATE TRIGGER trans_updatetrig + AFTER UPDATE ON range_parted REFERENCING OLD TABLE AS old_table NEW TABLE AS new_table + FOR EACH STATEMENT EXECUTE PROCEDURE trans_updatetrigfunc(); + +UPDATE range_parted set c = (case when c = 96 then 110 else c + 1 end ) WHERE a = 'b' and b > 10 and c >= 96; +:show_data; +:init_range_parted; + +-- Enabling OLD TABLE capture for both DELETE as well as UPDATE stmt triggers +-- should not cause DELETEd rows to be captured twice. Similar thing for +-- INSERT triggers and inserted rows. +CREATE TRIGGER trans_deletetrig + AFTER DELETE ON range_parted REFERENCING OLD TABLE AS old_table + FOR EACH STATEMENT EXECUTE PROCEDURE trans_updatetrigfunc(); +CREATE TRIGGER trans_inserttrig + AFTER INSERT ON range_parted REFERENCING NEW TABLE AS new_table + FOR EACH STATEMENT EXECUTE PROCEDURE trans_updatetrigfunc(); +UPDATE range_parted set c = c + 50 WHERE a = 'b' and b > 10 and c >= 96; +:show_data; +DROP TRIGGER trans_deletetrig ON range_parted; +DROP TRIGGER trans_inserttrig ON range_parted; +-- Don't drop trans_updatetrig yet. It is required below. + +-- Test with transition tuple conversion happening for rows moved into the +-- new partition. This requires a trigger that references transition table +-- (we already have trans_updatetrig). For inserted rows, the conversion +-- is not usually needed, because the original tuple is already compatible with +-- the desired transition tuple format. But conversion happens when there is a +-- BR trigger because the trigger can change the inserted row. So install a +-- BR triggers on those child partitions where the rows will be moved. +CREATE FUNCTION func_parted_mod_b() RETURNS trigger AS $$ +BEGIN + NEW.b = NEW.b + 1; + return NEW; +END $$ language plpgsql; +CREATE TRIGGER trig_c1_100 BEFORE UPDATE OR INSERT ON part_c_1_100 + FOR EACH ROW EXECUTE PROCEDURE func_parted_mod_b(); +CREATE TRIGGER trig_d1_15 BEFORE UPDATE OR INSERT ON part_d_1_15 + FOR EACH ROW EXECUTE PROCEDURE func_parted_mod_b(); +CREATE TRIGGER trig_d15_20 BEFORE UPDATE OR INSERT ON part_d_15_20 + FOR EACH ROW EXECUTE PROCEDURE func_parted_mod_b(); +:init_range_parted; +UPDATE range_parted set c = (case when c = 96 then 110 else c + 1 end) WHERE a = 'b' and b > 10 and c >= 96; +:show_data; +:init_range_parted; +UPDATE range_parted set c = c + 50 WHERE a = 'b' and b > 10 and c >= 96; +:show_data; + +-- Case where per-partition tuple conversion map array is allocated, but the +-- map is not required for the particular tuple that is routed, thanks to +-- matching table attributes of the partition and the target table. 
+:init_range_parted; +UPDATE range_parted set b = 15 WHERE b = 1; +:show_data; + +DROP TRIGGER trans_updatetrig ON range_parted; +DROP TRIGGER trig_c1_100 ON part_c_1_100; +DROP TRIGGER trig_d1_15 ON part_d_1_15; +DROP TRIGGER trig_d15_20 ON part_d_15_20; +DROP FUNCTION func_parted_mod_b(); + +-- RLS policies with update-row-movement +----------------------------------------- + +ALTER TABLE range_parted ENABLE ROW LEVEL SECURITY; +CREATE USER regress_range_parted_user; +GRANT ALL ON range_parted, mintab TO regress_range_parted_user; +CREATE POLICY seeall ON range_parted AS PERMISSIVE FOR SELECT USING (true); +CREATE POLICY policy_range_parted ON range_parted for UPDATE USING (true) WITH CHECK (c % 2 = 0); + +:init_range_parted; +SET SESSION AUTHORIZATION regress_range_parted_user; +-- This should fail with RLS violation error while moving row from +-- part_a_10_a_20 to part_d_1_15, because we are setting 'c' to an odd number. +UPDATE range_parted set a = 'b', c = 151 WHERE a = 'a' and c = 200; + +RESET SESSION AUTHORIZATION; +-- Create a trigger on part_d_1_15 +CREATE FUNCTION func_d_1_15() RETURNS trigger AS $$ +BEGIN + NEW.c = NEW.c + 1; -- Make even numbers odd, or vice versa + return NEW; +END $$ LANGUAGE plpgsql; +CREATE TRIGGER trig_d_1_15 BEFORE INSERT ON part_d_1_15 + FOR EACH ROW EXECUTE PROCEDURE func_d_1_15(); + +:init_range_parted; +SET SESSION AUTHORIZATION regress_range_parted_user; + +-- Here, RLS checks should succeed while moving row from part_a_10_a_20 to +-- part_d_1_15. Even though the UPDATE is setting 'c' to an odd number, the +-- trigger at the destination partition again makes it an even number. +UPDATE range_parted set a = 'b', c = 151 WHERE a = 'a' and c = 200; + +RESET SESSION AUTHORIZATION; +:init_range_parted; +SET SESSION AUTHORIZATION regress_range_parted_user; +-- This should fail with RLS violation error. Even though the UPDATE is setting +-- 'c' to an even number, the trigger at the destination partition again makes +-- it an odd number. +UPDATE range_parted set a = 'b', c = 150 WHERE a = 'a' and c = 200; + +-- Cleanup +RESET SESSION AUTHORIZATION; +DROP TRIGGER trig_d_1_15 ON part_d_1_15; +DROP FUNCTION func_d_1_15(); + +-- Policy expression contains SubPlan +RESET SESSION AUTHORIZATION; +:init_range_parted; +CREATE POLICY policy_range_parted_subplan on range_parted + AS RESTRICTIVE for UPDATE USING (true) + WITH CHECK ((SELECT range_parted.c <= c1 FROM mintab)); +SET SESSION AUTHORIZATION regress_range_parted_user; +-- fail, mintab has row with c1 = 120 +UPDATE range_parted set a = 'b', c = 122 WHERE a = 'a' and c = 200; -- ok -update range_parted set b = b + 1 where b = 10; +UPDATE range_parted set a = 'b', c = 120 WHERE a = 'a' and c = 200; + +-- RLS policy expression contains whole row. 
+ +RESET SESSION AUTHORIZATION; +:init_range_parted; +CREATE POLICY policy_range_parted_wholerow on range_parted AS RESTRICTIVE for UPDATE USING (true) + WITH CHECK (range_parted = row('b', 10, 112, 1, NULL)::range_parted); +SET SESSION AUTHORIZATION regress_range_parted_user; +-- ok, should pass the RLS check +UPDATE range_parted set a = 'b', c = 112 WHERE a = 'a' and c = 200; +RESET SESSION AUTHORIZATION; +:init_range_parted; +SET SESSION AUTHORIZATION regress_range_parted_user; +-- fail, the whole row RLS check should fail +UPDATE range_parted set a = 'b', c = 116 WHERE a = 'a' and c = 200; + +-- Cleanup +RESET SESSION AUTHORIZATION; +DROP POLICY policy_range_parted ON range_parted; +DROP POLICY policy_range_parted_subplan ON range_parted; +DROP POLICY policy_range_parted_wholerow ON range_parted; +REVOKE ALL ON range_parted, mintab FROM regress_range_parted_user; +DROP USER regress_range_parted_user; +DROP TABLE mintab; + + +-- statement triggers with update row movement +--------------------------------------------------- + +:init_range_parted; + +CREATE FUNCTION trigfunc() returns trigger language plpgsql as +$$ + begin + raise notice 'trigger = % fired on table % during %', + TG_NAME, TG_TABLE_NAME, TG_OP; + return null; + end; +$$; +-- Triggers on root partition +CREATE TRIGGER parent_delete_trig + AFTER DELETE ON range_parted for each statement execute procedure trigfunc(); +CREATE TRIGGER parent_update_trig + AFTER UPDATE ON range_parted for each statement execute procedure trigfunc(); +CREATE TRIGGER parent_insert_trig + AFTER INSERT ON range_parted for each statement execute procedure trigfunc(); + +-- Triggers on leaf partition part_c_1_100 +CREATE TRIGGER c1_delete_trig + AFTER DELETE ON part_c_1_100 for each statement execute procedure trigfunc(); +CREATE TRIGGER c1_update_trig + AFTER UPDATE ON part_c_1_100 for each statement execute procedure trigfunc(); +CREATE TRIGGER c1_insert_trig + AFTER INSERT ON part_c_1_100 for each statement execute procedure trigfunc(); + +-- Triggers on leaf partition part_d_1_15 +CREATE TRIGGER d1_delete_trig + AFTER DELETE ON part_d_1_15 for each statement execute procedure trigfunc(); +CREATE TRIGGER d1_update_trig + AFTER UPDATE ON part_d_1_15 for each statement execute procedure trigfunc(); +CREATE TRIGGER d1_insert_trig + AFTER INSERT ON part_d_1_15 for each statement execute procedure trigfunc(); +-- Triggers on leaf partition part_d_15_20 +CREATE TRIGGER d15_delete_trig + AFTER DELETE ON part_d_15_20 for each statement execute procedure trigfunc(); +CREATE TRIGGER d15_update_trig + AFTER UPDATE ON part_d_15_20 for each statement execute procedure trigfunc(); +CREATE TRIGGER d15_insert_trig + AFTER INSERT ON part_d_15_20 for each statement execute procedure trigfunc(); + +-- Move all rows from part_c_100_200 to part_c_1_100. None of the delete or +-- insert statement triggers should be fired. 
+UPDATE range_parted set c = c - 50 WHERE c > 97; +:show_data; + +DROP TRIGGER parent_delete_trig ON range_parted; +DROP TRIGGER parent_update_trig ON range_parted; +DROP TRIGGER parent_insert_trig ON range_parted; +DROP TRIGGER c1_delete_trig ON part_c_1_100; +DROP TRIGGER c1_update_trig ON part_c_1_100; +DROP TRIGGER c1_insert_trig ON part_c_1_100; +DROP TRIGGER d1_delete_trig ON part_d_1_15; +DROP TRIGGER d1_update_trig ON part_d_1_15; +DROP TRIGGER d1_insert_trig ON part_d_1_15; +DROP TRIGGER d15_delete_trig ON part_d_15_20; +DROP TRIGGER d15_update_trig ON part_d_15_20; +DROP TRIGGER d15_insert_trig ON part_d_15_20; + -- Creating default partition for range +:init_range_parted; create table part_def partition of range_parted default; \d+ part_def insert into range_parted values ('c', 9); @@ -134,19 +445,119 @@ update part_def set a = 'd' where a = 'c'; -- fail update part_def set a = 'a' where a = 'd'; -create table list_parted ( +:show_data; + +-- Update row movement from non-default to default partition. +-- fail, default partition is not under part_a_10_a_20; +UPDATE part_a_10_a_20 set a = 'ad' WHERE a = 'a'; +-- ok +UPDATE range_parted set a = 'ad' WHERE a = 'a'; +UPDATE range_parted set a = 'bd' WHERE a = 'b'; +:show_data; +-- Update row movement from default to non-default partitions. +-- ok +UPDATE range_parted set a = 'a' WHERE a = 'ad'; +UPDATE range_parted set a = 'b' WHERE a = 'bd'; +:show_data; + +-- Cleanup: range_parted no longer needed. +DROP TABLE range_parted; + +CREATE TABLE list_parted ( a text, b int -) partition by list (a); -create table list_part1 partition of list_parted for values in ('a', 'b'); -create table list_default partition of list_parted default; -insert into list_part1 values ('a', 1); -insert into list_default values ('d', 10); +) PARTITION BY list (a); +CREATE TABLE list_part1 PARTITION OF list_parted for VALUES in ('a', 'b'); +CREATE TABLE list_default PARTITION OF list_parted default; +INSERT into list_part1 VALUES ('a', 1); +INSERT into list_default VALUES ('d', 10); -- fail -update list_default set a = 'a' where a = 'd'; +UPDATE list_default set a = 'a' WHERE a = 'd'; -- ok -update list_default set a = 'x' where a = 'd'; +UPDATE list_default set a = 'x' WHERE a = 'd'; + +DROP TABLE list_parted; + +-------------- +-- Some more update-partition-key test scenarios below. This time use list +-- partitions. +-------------- + +-- Setup for list partitions +CREATE TABLE list_parted (a numeric, b int, c int8) PARTITION BY list (a); +CREATE TABLE sub_parted PARTITION OF list_parted for VALUES in (1) PARTITION BY list (b); + +CREATE TABLE sub_part1(b int, c int8, a numeric); +ALTER TABLE sub_parted ATTACH PARTITION sub_part1 for VALUES in (1); +CREATE TABLE sub_part2(b int, c int8, a numeric); +ALTER TABLE sub_parted ATTACH PARTITION sub_part2 for VALUES in (2); + +CREATE TABLE list_part1(a numeric, b int, c int8); +ALTER TABLE list_parted ATTACH PARTITION list_part1 for VALUES in (2,3); + +INSERT into list_parted VALUES (2,5,50); +INSERT into list_parted VALUES (3,6,60); +INSERT into sub_parted VALUES (1,1,60); +INSERT into sub_parted VALUES (1,2,10); + +-- Test partition constraint violation when intermediate ancestor is used and +-- constraint is inherited from upper root. +UPDATE sub_parted set a = 2 WHERE c = 10; + +-- Test update-partition-key, where the unpruned partitions do not have their +-- partition keys updated. 
+SELECT tableoid::regclass::text, * FROM list_parted WHERE a = 2 ORDER BY 1; +UPDATE list_parted set b = c + a WHERE a = 2; +SELECT tableoid::regclass::text, * FROM list_parted WHERE a = 2 ORDER BY 1; + + +-- Test the case where BR UPDATE triggers change the partition key. +CREATE FUNCTION func_parted_mod_b() returns trigger as $$ +BEGIN + NEW.b = 2; -- This is changing partition key column. + return NEW; +END $$ LANGUAGE plpgsql; +CREATE TRIGGER parted_mod_b before update on sub_part1 + for each row execute procedure func_parted_mod_b(); + +SELECT tableoid::regclass::text, * FROM list_parted ORDER BY 1, 2, 3, 4; + +-- This should do the tuple routing even though there is no explicit +-- partition-key update, because there is a trigger on sub_part1. +UPDATE list_parted set c = 70 WHERE b = 1; +SELECT tableoid::regclass::text, * FROM list_parted ORDER BY 1, 2, 3, 4; + +DROP TRIGGER parted_mod_b ON sub_part1; + +-- If BR DELETE trigger prevented DELETE from happening, we should also skip +-- the INSERT if that delete is part of UPDATE=>DELETE+INSERT. +CREATE OR REPLACE FUNCTION func_parted_mod_b() returns trigger as $$ +BEGIN + raise notice 'Trigger: Got OLD row %, but returning NULL', OLD; + return NULL; +END $$ LANGUAGE plpgsql; +CREATE TRIGGER trig_skip_delete before delete on sub_part2 + for each row execute procedure func_parted_mod_b(); +UPDATE list_parted set b = 1 WHERE c = 70; +SELECT tableoid::regclass::text, * FROM list_parted ORDER BY 1, 2, 3, 4; +-- Drop the trigger. Now the row should be moved. +DROP TRIGGER trig_skip_delete ON sub_part2; +UPDATE list_parted set b = 1 WHERE c = 70; +SELECT tableoid::regclass::text, * FROM list_parted ORDER BY 1, 2, 3, 4; +DROP FUNCTION func_parted_mod_b(); + +-- UPDATE partition-key with FROM clause. If join produces multiple output +-- rows for the same row to be modified, we should tuple-route the row only +-- once. There should not be any rows inserted. +CREATE TABLE non_parted (id int); +INSERT into non_parted VALUES (1), (1), (1), (2), (2), (2), (3), (3), (3); +UPDATE list_parted t1 set a = 2 FROM non_parted t2 WHERE t1.a = t2.id and a = 1; +SELECT tableoid::regclass::text, * FROM list_parted ORDER BY 1, 2, 3, 4; +DROP TABLE non_parted; + +-- Cleanup: list_parted no longer needed. +DROP TABLE list_parted; -- create custom operator class and hash function, for the same reason -- explained in alter_table.sql @@ -169,13 +580,12 @@ insert into hpart4 values (3, 4); -- fail update hpart1 set a = 3, b=4 where a = 1; +-- ok, row movement update hash_parted set b = b - 1 where b = 1; -- ok update hash_parted set b = b + 8 where b = 1; -- cleanup -drop table range_parted; -drop table list_parted; drop table hash_parted; drop operator class custom_opclass using hash; drop function dummy_hashint4(a int4, seed int8); diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index ba6ce916..a3cb20f8 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -1568,6 +1568,7 @@ PartitionRangeBound PartitionRangeDatum PartitionRangeDatumKind PartitionSpec +PartitionTupleRouting PartitionedChildRelInfo PasswordType Path From 07e28f3b981287f20c2259ae1b7c1182210551ca Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Wed, 24 Jan 2018 16:34:51 -0500 Subject: [PATCH 226/578] Avoid referencing off the end of subplan_partition_offsets. Report by buildfarm member skink and Tom Lane. Analysis by me. Patch by Amit Khandekar. 
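The shape of the fix, reduced to a stand-alone sketch: keep the length of the palloc'd offsets array next to the array itself and test it before every lookup. The struct and function names below are invented for illustration and are not the executor code.

    #include <stdbool.h>
    #include <stddef.h>

    /* Sketch only: an index array paired with its length, checked on lookup. */
    typedef struct OffsetsSketch
    {
        int    *offsets;        /* stands in for subplan_partition_offsets */
        int     num_offsets;    /* its length, recorded when it is allocated */
    } OffsetsSketch;

    static bool
    offsets_match(const OffsetsSketch *s, int subplan_index, int partition_index)
    {
        /* Bounds check first, so we never read past the end of the array. */
        return s->offsets != NULL &&
               subplan_index < s->num_offsets &&
               s->offsets[subplan_index] == partition_index;
    }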
Discussion: http://postgr.es/m/CAJ3gD9fVA1iXQYhfqHP5n_TEd4U9=V8TL_cc-oKRnRmxgdvJrQ@mail.gmail.com --- src/backend/executor/execPartition.c | 2 ++ src/backend/executor/nodeModifyTable.c | 3 ++- src/include/executor/execPartition.h | 2 ++ 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src/backend/executor/execPartition.c b/src/backend/executor/execPartition.c index a08f308f..e312167f 100644 --- a/src/backend/executor/execPartition.c +++ b/src/backend/executor/execPartition.c @@ -87,6 +87,7 @@ ExecSetupPartitionTupleRouting(ModifyTableState *mtstate, num_update_rri = list_length(node->plans); proute->subplan_partition_offsets = palloc(num_update_rri * sizeof(int)); + proute->num_subplan_partition_offsets = num_update_rri; /* * We need an additional tuple slot for storing transient tuples that @@ -481,6 +482,7 @@ ExecCleanupTupleRouting(PartitionTupleRouting *proute) * result rels are present in the UPDATE subplans. */ if (proute->subplan_partition_offsets && + subplan_index < proute->num_subplan_partition_offsets && proute->subplan_partition_offsets[subplan_index] == i) { subplan_index++; diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c index f04ef73d..003ff4b8 100644 --- a/src/backend/executor/nodeModifyTable.c +++ b/src/backend/executor/nodeModifyTable.c @@ -2148,7 +2148,8 @@ tupconv_map_for_subplan(ModifyTableState *mtstate, int whichplan) * If subplan-indexed array is NULL, things should have been arranged * to convert the subplan index to partition index. */ - Assert(proute && proute->subplan_partition_offsets != NULL); + Assert(proute && proute->subplan_partition_offsets != NULL && + whichplan < proute->num_subplan_partition_offsets); leaf_index = proute->subplan_partition_offsets[whichplan]; diff --git a/src/include/executor/execPartition.h b/src/include/executor/execPartition.h index 45acfa92..4e0bdc35 100644 --- a/src/include/executor/execPartition.h +++ b/src/include/executor/execPartition.h @@ -80,6 +80,7 @@ typedef struct PartitionDispatchData *PartitionDispatch; * subplan_partition_offsets Integer array ordered by UPDATE subplans. Each * element of this array has the index into the * corresponding partition in partitions array. + * num_subplan_partition_offsets Length of 'subplan_partition_offsets' array * partition_tuple_slot TupleTableSlot to be used to manipulate any * given leaf partition's rowtype after that * partition is chosen for insertion by @@ -96,6 +97,7 @@ typedef struct PartitionTupleRouting TupleConversionMap **child_parent_tupconv_maps; bool *child_parent_map_not_required; int *subplan_partition_offsets; + int num_subplan_partition_offsets; TupleTableSlot *partition_tuple_slot; TupleTableSlot *root_tuple_slot; } PartitionTupleRouting; From 45402b03030f45de48fbca8b4ecc6f35c748f06c Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Fri, 2 Feb 2018 09:23:42 -0500 Subject: [PATCH 227/578] Refactor code for partition bound searching Remove partition_bound_cmp() and partition_bound_bsearch(), whose void * argument could be, depending on the situation, of any of three different types: PartitionBoundSpec *, PartitionRangeBound *, Datum *. Instead, introduce separate bound-searching functions for each situation: partition_list_bsearch, partition_range_bsearch, partition_range_datum_bsearch, and partition_hash_bsearch. This requires duplicating the code for binary search, but it makes the code much more type safe, involves fewer branches at runtime, and at least in my opinion, is much easier to understand. 
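All of the new functions share one search shape: find the greatest bound that is less than or equal to the probe, or report -1 if every bound is greater. A generic sketch of that shape over a plain int array (the function name and element type are invented; the real functions compare partition bounds through the key's support functions) is:

    #include <stdbool.h>

    /*
     * Sketch of the common shape: 'vals' is sorted ascending with 'nvals'
     * distinct entries.  Return the index of the greatest entry <= 'probe',
     * or -1 if every entry is greater; *is_equal says whether it was equal.
     */
    static int
    greatest_le_bsearch(const int *vals, int nvals, int probe, bool *is_equal)
    {
        int     lo = -1;
        int     hi = nvals - 1;

        *is_equal = false;
        while (lo < hi)
        {
            int     mid = (lo + hi + 1) / 2;   /* round up so 'lo' always advances */

            if (vals[mid] <= probe)
            {
                lo = mid;
                *is_equal = (vals[mid] == probe);
                if (*is_equal)
                    break;
            }
            else
                hi = mid - 1;
        }
        return lo;
    }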
Along the way, add an option to partition_range_datum_bsearch allowing the number of keys to be specified, so that we can search for partitions based on a prefix of the full list of partition keys. This is important for pending work to improve partition pruning. Amit Langote, per a suggestion from me. Discussion: http://postgr.es/m/CA+TgmoaVLDLc8=YESRwD32gPhodU_ELmXyKs77gveiYp+JE4vQ@mail.gmail.com --- src/backend/catalog/partition.c | 243 +++++++++++++++++++++----------- 1 file changed, 159 insertions(+), 84 deletions(-) diff --git a/src/backend/catalog/partition.c b/src/backend/catalog/partition.c index 08b58d74..e185a7ee 100644 --- a/src/backend/catalog/partition.c +++ b/src/backend/catalog/partition.c @@ -170,14 +170,21 @@ static int32 partition_rbound_cmp(PartitionKey key, bool lower1, PartitionRangeBound *b2); static int32 partition_rbound_datum_cmp(PartitionKey key, Datum *rb_datums, PartitionRangeDatumKind *rb_kind, - Datum *tuple_datums); + Datum *tuple_datums, int n_tuple_datums); -static int32 partition_bound_cmp(PartitionKey key, +static int partition_list_bsearch(PartitionKey key, PartitionBoundInfo boundinfo, - int offset, void *probe, bool probe_is_bound); -static int partition_bound_bsearch(PartitionKey key, + Datum value, bool *is_equal); +static int partition_range_bsearch(PartitionKey key, PartitionBoundInfo boundinfo, - void *probe, bool probe_is_bound, bool *is_equal); + PartitionRangeBound *probe, bool *is_equal); +static int partition_range_datum_bsearch(PartitionKey key, + PartitionBoundInfo boundinfo, + int nvalues, Datum *values, bool *is_equal); +static int partition_hash_bsearch(PartitionKey key, + PartitionBoundInfo boundinfo, + int modulus, int remainder); + static int get_partition_bound_num_indexes(PartitionBoundInfo b); static int get_greatest_modulus(PartitionBoundInfo b); static uint64 compute_hash_value(PartitionKey key, Datum *values, bool *isnull); @@ -1015,8 +1022,7 @@ check_new_partition_bound(char *relname, Relation parent, int greatest_modulus; int remainder; int offset; - bool equal, - valid_modulus = true; + bool valid_modulus = true; int prev_modulus, /* Previous largest modulus */ next_modulus; /* Next largest modulus */ @@ -1029,12 +1035,13 @@ check_new_partition_bound(char *relname, Relation parent, * modulus 10 and a partition with modulus 15, because 10 * is not a factor of 15. * - * Get greatest bound in array boundinfo->datums which is - * less than or equal to spec->modulus and - * spec->remainder. + * Get the greatest (modulus, remainder) pair contained in + * boundinfo->datums that is less than or equal to the + * (spec->modulus, spec->remainder) pair. */ - offset = partition_bound_bsearch(key, boundinfo, spec, - true, &equal); + offset = partition_hash_bsearch(key, boundinfo, + spec->modulus, + spec->remainder); if (offset < 0) { next_modulus = DatumGetInt32(datums[0][0]); @@ -1108,9 +1115,9 @@ check_new_partition_bound(char *relname, Relation parent, int offset; bool equal; - offset = partition_bound_bsearch(key, boundinfo, - &val->constvalue, - true, &equal); + offset = partition_list_bsearch(key, boundinfo, + val->constvalue, + &equal); if (offset >= 0 && equal) { overlap = true; @@ -1182,8 +1189,8 @@ check_new_partition_bound(char *relname, Relation parent, * since the index array is initialised with an extra -1 * at the end. 
*/ - offset = partition_bound_bsearch(key, boundinfo, lower, - true, &equal); + offset = partition_range_bsearch(key, boundinfo, lower, + &equal); if (boundinfo->indexes[offset + 1] < 0) { @@ -1196,10 +1203,16 @@ check_new_partition_bound(char *relname, Relation parent, if (offset + 1 < boundinfo->ndatums) { int32 cmpval; + Datum *datums; + PartitionRangeDatumKind *kind; + bool is_lower; + + datums = boundinfo->datums[offset + 1]; + kind = boundinfo->kind[offset + 1]; + is_lower = (boundinfo->indexes[offset + 1] == -1); - cmpval = partition_bound_cmp(key, boundinfo, - offset + 1, upper, - true); + cmpval = partition_rbound_cmp(key, datums, kind, + is_lower, upper); if (cmpval < 0) { /* @@ -2566,11 +2579,9 @@ get_partition_for_tuple(Relation relation, Datum *values, bool *isnull) { bool equal = false; - bound_offset = partition_bound_bsearch(key, + bound_offset = partition_list_bsearch(key, partdesc->boundinfo, - values, - false, - &equal); + values[0], &equal); if (bound_offset >= 0 && equal) part_index = partdesc->boundinfo->indexes[bound_offset]; } @@ -2598,12 +2609,11 @@ get_partition_for_tuple(Relation relation, Datum *values, bool *isnull) if (!range_partkey_has_null) { - bound_offset = partition_bound_bsearch(key, + bound_offset = partition_range_datum_bsearch(key, partdesc->boundinfo, + key->partnatts, values, - false, &equal); - /* * The bound at bound_offset is less than or equal to the * tuple value, so the bound at offset+1 is the upper @@ -2874,12 +2884,12 @@ partition_rbound_cmp(PartitionKey key, static int32 partition_rbound_datum_cmp(PartitionKey key, Datum *rb_datums, PartitionRangeDatumKind *rb_kind, - Datum *tuple_datums) + Datum *tuple_datums, int n_tuple_datums) { int i; int32 cmpval = -1; - for (i = 0; i < key->partnatts; i++) + for (i = 0; i < n_tuple_datums; i++) { if (rb_kind[i] == PARTITION_RANGE_DATUM_MINVALUE) return -1; @@ -2898,84 +2908,104 @@ partition_rbound_datum_cmp(PartitionKey key, } /* - * partition_bound_cmp + * partition_list_bsearch + * Returns the index of the greatest bound datum that is less than equal + * to the given value or -1 if all of the bound datums are greater * - * Return whether the bound at offset in boundinfo is <, =, or > the argument - * specified in *probe. + * *is_equal is set to true if the bound datum at the returned index is equal + * to the input value. 
*/ -static int32 -partition_bound_cmp(PartitionKey key, PartitionBoundInfo boundinfo, - int offset, void *probe, bool probe_is_bound) +static int +partition_list_bsearch(PartitionKey key, + PartitionBoundInfo boundinfo, + Datum value, bool *is_equal) { - Datum *bound_datums = boundinfo->datums[offset]; - int32 cmpval = -1; + int lo, + hi, + mid; - switch (key->strategy) - { - case PARTITION_STRATEGY_HASH: + lo = -1; + hi = boundinfo->ndatums - 1; + while (lo < hi) { - PartitionBoundSpec *spec = (PartitionBoundSpec *) probe; + int32 cmpval; - cmpval = partition_hbound_cmp(DatumGetInt32(bound_datums[0]), - DatumGetInt32(bound_datums[1]), - spec->modulus, spec->remainder); - break; - } - case PARTITION_STRATEGY_LIST: + mid = (lo + hi + 1) / 2; cmpval = DatumGetInt32(FunctionCall2Coll(&key->partsupfunc[0], key->partcollation[0], - bound_datums[0], - *(Datum *) probe)); + boundinfo->datums[mid][0], + value)); + if (cmpval <= 0) + { + lo = mid; + *is_equal = (cmpval == 0); + if (*is_equal) break; + } + else + hi = mid - 1; + } - case PARTITION_STRATEGY_RANGE: - { - PartitionRangeDatumKind *kind = boundinfo->kind[offset]; + return lo; +} - if (probe_is_bound) - { /* - * We need to pass whether the existing bound is a lower - * bound, so that two equal-valued lower and upper bounds - * are not regarded equal. + * partition_range_bsearch + * Returns the index of the greatest range bound that is less than or + * equal to the given range bound or -1 if all of the range bounds are + * greater + * + * *is_equal is set to true if the range bound at the returned index is equal + * to the input range bound */ - bool lower = boundinfo->indexes[offset] < 0; +static int +partition_range_bsearch(PartitionKey key, + PartitionBoundInfo boundinfo, + PartitionRangeBound *probe, bool *is_equal) +{ + int lo, + hi, + mid; + + lo = -1; + hi = boundinfo->ndatums - 1; + while (lo < hi) + { + int32 cmpval; + mid = (lo + hi + 1) / 2; cmpval = partition_rbound_cmp(key, - bound_datums, kind, lower, - (PartitionRangeBound *) probe); - } - else - cmpval = partition_rbound_datum_cmp(key, - bound_datums, kind, - (Datum *) probe); + boundinfo->datums[mid], + boundinfo->kind[mid], + (boundinfo->indexes[mid] == -1), + probe); + if (cmpval <= 0) + { + lo = mid; + *is_equal = (cmpval == 0); + + if (*is_equal) break; } - - default: - elog(ERROR, "unexpected partition strategy: %d", - (int) key->strategy); + else + hi = mid - 1; } - return cmpval; + return lo; } /* - * Binary search on a collection of partition bounds. Returns greatest - * bound in array boundinfo->datums which is less than or equal to *probe. - * If all bounds in the array are greater than *probe, -1 is returned. + * partition_range_bsearch + * Returns the index of the greatest range bound that is less than or + * equal to the given tuple or -1 if all of the range bounds are greater * - * *probe could either be a partition bound or a Datum array representing - * the partition key of a tuple being routed; probe_is_bound tells which. - * We pass that down to the comparison function so that it can interpret the - * contents of *probe accordingly. - * - * *is_equal is set to whether the bound at the returned index is equal with - * *probe. + * *is_equal is set to true if the range bound at the returned index is equal + * to the input tuple. 
*/ static int -partition_bound_bsearch(PartitionKey key, PartitionBoundInfo boundinfo, - void *probe, bool probe_is_bound, bool *is_equal) +partition_range_datum_bsearch(PartitionKey key, + PartitionBoundInfo boundinfo, + int nvalues, Datum *values, bool *is_equal) { int lo, hi, @@ -2988,8 +3018,11 @@ partition_bound_bsearch(PartitionKey key, PartitionBoundInfo boundinfo, int32 cmpval; mid = (lo + hi + 1) / 2; - cmpval = partition_bound_cmp(key, boundinfo, mid, probe, - probe_is_bound); + cmpval = partition_rbound_datum_cmp(key, + boundinfo->datums[mid], + boundinfo->kind[mid], + values, + nvalues); if (cmpval <= 0) { lo = mid; @@ -3005,6 +3038,48 @@ partition_bound_bsearch(PartitionKey key, PartitionBoundInfo boundinfo, return lo; } +/* + * partition_hash_bsearch + * Returns the index of the greatest (modulus, remainder) pair that is + * less than or equal to the given (modulus, remainder) pair or -1 if + * all of them are greater + */ +static int +partition_hash_bsearch(PartitionKey key, + PartitionBoundInfo boundinfo, + int modulus, int remainder) +{ + int lo, + hi, + mid; + + lo = -1; + hi = boundinfo->ndatums - 1; + while (lo < hi) + { + int32 cmpval, + bound_modulus, + bound_remainder; + + mid = (lo + hi + 1) / 2; + bound_modulus = DatumGetInt32(boundinfo->datums[mid][0]); + bound_remainder = DatumGetInt32(boundinfo->datums[mid][1]); + cmpval = partition_hbound_cmp(bound_modulus, bound_remainder, + modulus, remainder); + if (cmpval <= 0) + { + lo = mid; + + if (cmpval == 0) + break; + } + else + hi = mid - 1; + } + + return lo; +} + /* * get_default_oid_from_partdesc * From dfe5a49c10508ff1d203c8008703b20f7f1465db Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Mon, 5 Feb 2018 10:37:30 -0500 Subject: [PATCH 228/578] Fix RelationBuildPartitionKey's processing of partition key expressions. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Failure to advance the list pointer while reading partition expressions from a list results in invoking an input function with inappropriate data, possibly leading to crashes or, with carefully crafted input, disclosure of arbitrary backend memory. Bug discovered independently by Álvaro Herrera and David Rowley. This patch is by Álvaro but owes something to David's proposed fix. Back-patch to v10 where the issue was introduced. 
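The underlying bug class -- walking two parallel sequences while advancing only one cursor -- is easy to show in isolation. A stand-alone sketch of the corrected loop (plain C with invented names; not the relcache code itself) is:

    #include <stdio.h>
    #include <stdlib.h>

    struct expr_cell { int value; struct expr_cell *next; };

    /*
     * Sketch: each slot that needs an expression consumes the next cell of a
     * side list and then advances the cursor.  Forgetting the advance -- the
     * bug this patch fixes -- would feed the same cell to every slot; running
     * out of cells is reported rather than read past.
     */
    static void
    fill_slots_from_exprs(int *slots, int nslots, struct expr_cell *exprs)
    {
        struct expr_cell *item = exprs;
        int         i;

        for (i = 0; i < nslots; i++)
        {
            if (item == NULL)
            {
                fprintf(stderr, "wrong number of expressions\n");
                exit(EXIT_FAILURE);
            }
            slots[i] = item->value;
            item = item->next;      /* advance in step with the slot index */
        }
    }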
Security: CVE-2018-1052 --- src/backend/utils/cache/relcache.c | 5 +++++ src/test/regress/expected/create_table.out | 23 ++++++++++++++++++---- src/test/regress/sql/create_table.sql | 9 +++++++-- 3 files changed, 31 insertions(+), 6 deletions(-) diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c index f6acc9f0..f9520010 100644 --- a/src/backend/utils/cache/relcache.c +++ b/src/backend/utils/cache/relcache.c @@ -1115,9 +1115,14 @@ RelationBuildPartitionKey(Relation relation) } else { + if (partexprs_item == NULL) + elog(ERROR, "wrong number of partition key expressions"); + key->parttypid[i] = exprType(lfirst(partexprs_item)); key->parttypmod[i] = exprTypmod(lfirst(partexprs_item)); key->parttypcoll[i] = exprCollation(lfirst(partexprs_item)); + + partexprs_item = lnext(partexprs_item); } get_typlenbyvalalign(key->parttypid[i], &key->parttyplen[i], diff --git a/src/test/regress/expected/create_table.out b/src/test/regress/expected/create_table.out index 86c347be..7fa55adb 100644 --- a/src/test/regress/expected/create_table.out +++ b/src/test/regress/expected/create_table.out @@ -417,8 +417,9 @@ DETAIL: table partitioned depends on function plusone(integer) HINT: Use DROP ... CASCADE to drop the dependent objects too. -- partitioned table cannot participate in regular inheritance CREATE TABLE partitioned2 ( - a int -) PARTITION BY LIST ((a+1)); + a int, + b text +) PARTITION BY RANGE ((a+1), substr(b, 1, 5)); CREATE TABLE fail () INHERITS (partitioned2); ERROR: cannot inherit from partitioned table "partitioned2" -- Partition key in describe output @@ -436,11 +437,25 @@ Number of partitions: 0 \d+ partitioned2 Table "public.partitioned2" Column | Type | Collation | Nullable | Default | Storage | Stats target | Description ---------+---------+-----------+----------+---------+---------+--------------+------------- +--------+---------+-----------+----------+---------+----------+--------------+------------- a | integer | | | | plain | | -Partition key: LIST (((a + 1))) + b | text | | | | extended | | +Partition key: RANGE (((a + 1)), substr(b, 1, 5)) Number of partitions: 0 +INSERT INTO partitioned2 VALUES (1, 'hello'); +ERROR: no partition of relation "partitioned2" found for row +DETAIL: Partition key of the failing row contains ((a + 1), substr(b, 1, 5)) = (2, hello). 
+CREATE TABLE part2_1 PARTITION OF partitioned2 FOR VALUES FROM (-1, 'aaaaa') TO (100, 'ccccc'); +\d+ part2_1 + Table "public.part2_1" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +--------+---------+-----------+----------+---------+----------+--------------+------------- + a | integer | | | | plain | | + b | text | | | | extended | | +Partition of: partitioned2 FOR VALUES FROM ('-1', 'aaaaa') TO (100, 'ccccc') +Partition constraint: (((a + 1) IS NOT NULL) AND (substr(b, 1, 5) IS NOT NULL) AND (((a + 1) > '-1'::integer) OR (((a + 1) = '-1'::integer) AND (substr(b, 1, 5) >= 'aaaaa'::text))) AND (((a + 1) < 100) OR (((a + 1) = 100) AND (substr(b, 1, 5) < 'ccccc'::text)))) + DROP TABLE partitioned, partitioned2; -- -- Partitions diff --git a/src/test/regress/sql/create_table.sql b/src/test/regress/sql/create_table.sql index 43ada6b3..b125fa50 100644 --- a/src/test/regress/sql/create_table.sql +++ b/src/test/regress/sql/create_table.sql @@ -417,14 +417,19 @@ DROP FUNCTION plusone(int); -- partitioned table cannot participate in regular inheritance CREATE TABLE partitioned2 ( - a int -) PARTITION BY LIST ((a+1)); + a int, + b text +) PARTITION BY RANGE ((a+1), substr(b, 1, 5)); CREATE TABLE fail () INHERITS (partitioned2); -- Partition key in describe output \d partitioned \d+ partitioned2 +INSERT INTO partitioned2 VALUES (1, 'hello'); +CREATE TABLE part2_1 PARTITION OF partitioned2 FOR VALUES FROM (-1, 'aaaaa') TO (100, 'ccccc'); +\d+ part2_1 + DROP TABLE partitioned, partitioned2; -- From 98279f99acf04227f628476f31c4a045bb3fb641 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Fri, 26 Jun 2020 19:53:18 +0800 Subject: [PATCH 229/578] Fix possible crash in partition-wise join. http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/backend/optimizer/path/allpaths.c | 18 ++------ src/backend/optimizer/path/joinrels.c | 16 -------- src/backend/optimizer/util/relnode.c | 5 ++- src/include/nodes/relation.h | 14 ++++--- src/test/regress/expected/partition_join.out | 43 ++++++++++++-------- src/test/regress/sql/partition_join.sql | 2 +- 6 files changed, 43 insertions(+), 55 deletions(-) diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c index 9020a606..947c75f3 100644 --- a/src/backend/optimizer/path/allpaths.c +++ b/src/backend/optimizer/path/allpaths.c @@ -3399,20 +3399,8 @@ generate_partition_wise_join_paths(PlannerInfo *root, RelOptInfo *rel) if (!IS_JOIN_REL(rel)) return; - /* - * If we've already proven this join is empty, we needn't consider any - * more paths for it. - */ - if (IS_DUMMY_REL(rel)) - return; - - /* - * Nothing to do if the relation is not partitioned. An outer join - * relation which had empty inner relation in every pair will have rest of - * the partitioning properties set except the child-join RelOptInfos. See - * try_partition_wise_join() for more explanation. - */ - if (rel->nparts <= 0 || rel->part_rels == NULL) + /* We've nothing to do if the relation is not partitioned. */ + if (!IS_PARTITIONED_REL(rel)) return; /* Guard against stack overflow due to overly deep partition hierarchy. */ @@ -3426,6 +3414,8 @@ generate_partition_wise_join_paths(PlannerInfo *root, RelOptInfo *rel) { RelOptInfo *child_rel = part_rels[cnt_parts]; + Assert(child_rel != NULL); + /* Add partition-wise join paths for partitioned child-joins. 
*/ generate_partition_wise_join_paths(root, child_rel); diff --git a/src/backend/optimizer/path/joinrels.c b/src/backend/optimizer/path/joinrels.c index d6fad96c..d8afa3ef 100644 --- a/src/backend/optimizer/path/joinrels.c +++ b/src/backend/optimizer/path/joinrels.c @@ -1340,17 +1340,6 @@ try_partition_wise_join(PlannerInfo *root, RelOptInfo *rel1, RelOptInfo *rel2, if (!IS_PARTITIONED_REL(joinrel)) return; - /* - * set_rel_pathlist() may not create paths in children of an empty - * partitioned table and so we can not add paths to child-joins. So, deem - * such a join as unpartitioned. When a partitioned relation is deemed - * empty because all its children are empty, dummy path will be set in - * each of the children. In such a case we could still consider the join - * as partitioned, but it might not help much. - */ - if (IS_DUMMY_REL(rel1) || IS_DUMMY_REL(rel2)) - return; - /* * Since this join relation is partitioned, all the base relations * participating in this join must be partitioned and so are all the @@ -1382,11 +1371,6 @@ try_partition_wise_join(PlannerInfo *root, RelOptInfo *rel1, RelOptInfo *rel2, nparts = joinrel->nparts; - /* Allocate space to hold child-joins RelOptInfos, if not already done. */ - if (!joinrel->part_rels) - joinrel->part_rels = - (RelOptInfo **) palloc0(sizeof(RelOptInfo *) * nparts); - /* * Create child-join relations for this partitioned join, if those don't * exist. Add paths to child-joins for a pair of child relations diff --git a/src/backend/optimizer/util/relnode.c b/src/backend/optimizer/util/relnode.c index 0896b4c2..70acf299 100644 --- a/src/backend/optimizer/util/relnode.c +++ b/src/backend/optimizer/util/relnode.c @@ -1721,11 +1721,14 @@ build_joinrel_partition_info(RelOptInfo *joinrel, RelOptInfo *outer_rel, */ joinrel->part_scheme = part_scheme; joinrel->boundinfo = outer_rel->boundinfo; - joinrel->nparts = outer_rel->nparts; partnatts = joinrel->part_scheme->partnatts; joinrel->partexprs = (List **) palloc0(sizeof(List *) * partnatts); joinrel->nullable_partexprs = (List **) palloc0(sizeof(List *) *partnatts); + joinrel->nparts = outer_rel->nparts; + joinrel->part_rels = + (RelOptInfo **) palloc0(sizeof(RelOptInfo *) * joinrel->nparts); + /* * Construct partition keys for the join. diff --git a/src/include/nodes/relation.h b/src/include/nodes/relation.h index e2af7ebc..ee843fad 100644 --- a/src/include/nodes/relation.h +++ b/src/include/nodes/relation.h @@ -791,13 +791,17 @@ typedef struct RelOptInfo /* * Is given relation partitioned? * - * A join between two partitioned relations with same partitioning scheme - * without any matching partitions will not have any partition in it but will - * have partition scheme set. So a relation is deemed to be partitioned if it - * has a partitioning scheme, bounds and positive number of partitions. + * It's not enough to test whether rel->part_scheme is set, because it might + * be that the basic partitioning properties of the input relations matched + * but the partition bounds did not. + * + * We treat dummy relations as unpartitioned. We could alternatively + * treat them as partitioned, but it's not clear whether that's a useful thing + * to do. 
*/ #define IS_PARTITIONED_REL(rel) \ - ((rel)->part_scheme && (rel)->boundinfo && (rel)->nparts > 0) + ((rel)->part_scheme && (rel)->boundinfo && (rel)->nparts > 0 && \ + (rel)->part_rels && !(IS_DUMMY_REL(rel))) /* * Convenience macro to make sure that a partitioned relation has all the diff --git a/src/test/regress/expected/partition_join.out b/src/test/regress/expected/partition_join.out index 1c8cdb34..4e1cfedd 100644 --- a/src/test/regress/expected/partition_join.out +++ b/src/test/regress/expected/partition_join.out @@ -1217,24 +1217,31 @@ SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a = 1 AND a = 2) t1 (2 rows) EXPLAIN (COSTS OFF) -SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a = 1 AND a = 2) t1 RIGHT JOIN prt2 t2 ON t1.a = t2.b WHERE t2.a = 0 ORDER BY t1.a, t2.b; - QUERY PLAN --------------------------------------------- - Sort - Sort Key: a, t2.b - -> Hash Left Join - Hash Cond: (t2.b = a) - -> Append - -> Seq Scan on prt2_p1 t2 - Filter: (a = 0) - -> Seq Scan on prt2_p2 t2_1 - Filter: (a = 0) - -> Seq Scan on prt2_p3 t2_2 - Filter: (a = 0) - -> Hash - -> Result - One-Time Filter: false -(14 rows) +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a = 1 AND a = 2) t1 RIGHT JOIN prt2 t2 ON t1.a = t2.b, prt1 t3 WHERE t2.b = t3.a; + QUERY PLAN +-------------------------------------------------- + Hash Left Join + Hash Cond: (t2.b = a) + -> Append + -> Hash Join + Hash Cond: (t3.a = t2.b) + -> Seq Scan on prt1_p1 t3 + -> Hash + -> Seq Scan on prt2_p1 t2 + -> Hash Join + Hash Cond: (t3_1.a = t2_1.b) + -> Seq Scan on prt1_p2 t3_1 + -> Hash + -> Seq Scan on prt2_p2 t2_1 + -> Hash Join + Hash Cond: (t3_2.a = t2_2.b) + -> Seq Scan on prt1_p3 t3_2 + -> Hash + -> Seq Scan on prt2_p3 t2_2 + -> Hash + -> Result + One-Time Filter: false +(21 rows) EXPLAIN (COSTS OFF) SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a = 1 AND a = 2) t1 FULL JOIN prt2 t2 ON t1.a = t2.b WHERE t2.a = 0 ORDER BY t1.a, t2.b; diff --git a/src/test/regress/sql/partition_join.sql b/src/test/regress/sql/partition_join.sql index 2316bbdc..4aa775e7 100644 --- a/src/test/regress/sql/partition_join.sql +++ b/src/test/regress/sql/partition_join.sql @@ -224,7 +224,7 @@ EXPLAIN (COSTS OFF) SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a = 1 AND a = 2) t1 LEFT JOIN prt2 t2 ON t1.a = t2.b; EXPLAIN (COSTS OFF) -SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a = 1 AND a = 2) t1 RIGHT JOIN prt2 t2 ON t1.a = t2.b WHERE t2.a = 0 ORDER BY t1.a, t2.b; +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a = 1 AND a = 2) t1 RIGHT JOIN prt2 t2 ON t1.a = t2.b, prt1 t3 WHERE t2.b = t3.a; EXPLAIN (COSTS OFF) SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a = 1 AND a = 2) t1 FULL JOIN prt2 t2 ON t1.a = t2.b WHERE t2.a = 0 ORDER BY t1.a, t2.b; From 340ca8d212c4c5f70e5c17bad46f4ced485dae32 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Fri, 26 Jun 2020 21:30:42 +0800 Subject: [PATCH 230/578] Be lazier about partition tuple routing. 
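The idea of the change, sketched with invented names: setup only records the leaf partition OIDs and leaves an array of NULL pointers, and the per-partition executor state is built the first time a tuple is actually routed to that partition. The snippet below illustrates the pattern only and is not the TBase code.

    #include <stdlib.h>

    typedef struct PartStateSketch
    {
        int     partidx;
        /* ...stands in for the per-partition ResultRelInfo and maps... */
    } PartStateSketch;

    /*
     * Sketch: return the state for partition 'partidx', building it on first
     * use.  'parts' was allocated as an array of NULL pointers at setup time.
     */
    static PartStateSketch *
    get_partition_state(PartStateSketch **parts, int partidx)
    {
        if (parts[partidx] == NULL)
        {
            parts[partidx] = calloc(1, sizeof(PartStateSketch));
            if (parts[partidx] == NULL)
                abort();
            parts[partidx]->partidx = partidx;
            /* ...open the partition, build tuple-conversion maps, etc. ... */
        }
        return parts[partidx];
    }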
http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/backend/commands/copy.c | 10 +- src/backend/executor/execPartition.c | 367 +++++++++++++++++++-------- src/include/executor/execPartition.h | 9 +- 3 files changed, 275 insertions(+), 111 deletions(-) diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c index cf770f46..8ef1f6cd 100644 --- a/src/backend/commands/copy.c +++ b/src/backend/commands/copy.c @@ -1635,7 +1635,7 @@ BeginCopy(ParseState *pstate, PartitionTupleRouting *proute; proute = cstate->partition_tuple_routing = - ExecSetupPartitionTupleRouting(NULL, cstate->rel, 1, estate); + ExecSetupPartitionTupleRouting(NULL, cstate->rel); /* * If we are capturing transition tuples, they may need to be @@ -3462,6 +3462,14 @@ CopyFrom(CopyState cstate) */ saved_resultRelInfo = resultRelInfo; resultRelInfo = proute->partitions[leaf_part_index]; + if (resultRelInfo == NULL) + { + resultRelInfo = ExecInitPartitionInfo(NULL, + saved_resultRelInfo, + proute, estate, + leaf_part_index); + Assert(resultRelInfo != NULL); + } /* We do not yet have a way to insert into a foreign partition */ if (resultRelInfo->ri_FdwRoutine) diff --git a/src/backend/executor/execPartition.c b/src/backend/executor/execPartition.c index e312167f..ad72c3cf 100644 --- a/src/backend/executor/execPartition.c +++ b/src/backend/executor/execPartition.c @@ -44,21 +44,25 @@ static char *ExecBuildSlotPartitionKeyDescription(Relation rel, * * Note that all the relations in the partition tree are locked using the * RowExclusiveLock mode upon return from this function. + * + * While we allocate the arrays of pointers of ResultRelInfo and + * TupleConversionMap for all partitions here, actual objects themselves are + * lazily allocated for a given partition if a tuple is actually routed to it; + * see ExecInitPartitionInfo. However, if the function is invoked for update + * tuple routing, caller would already have initialized ResultRelInfo's for + * some of the partitions, which are reused and assigned to their respective + * slot in the aforementioned array. */ PartitionTupleRouting * -ExecSetupPartitionTupleRouting(ModifyTableState *mtstate, - Relation rel, Index resultRTindex, - EState *estate) +ExecSetupPartitionTupleRouting(ModifyTableState *mtstate, Relation rel) { TupleDesc tupDesc = RelationGetDescr(rel); List *leaf_parts; ListCell *cell; int i; - ResultRelInfo *leaf_part_arr = NULL, - *update_rri = NULL; + ResultRelInfo *update_rri = NULL; int num_update_rri = 0, update_rri_index = 0; - bool is_update = false; PartitionTupleRouting *proute; /* @@ -76,13 +80,14 @@ ExecSetupPartitionTupleRouting(ModifyTableState *mtstate, proute->parent_child_tupconv_maps = (TupleConversionMap **) palloc0(proute->num_partitions * sizeof(TupleConversionMap *)); + proute->partition_oids = (Oid *) palloc(proute->num_partitions * + sizeof(Oid)); /* Set up details specific to the type of tuple routing we are doing. */ if (mtstate && mtstate->operation == CMD_UPDATE) { ModifyTable *node = (ModifyTable *) mtstate->ps.plan; - is_update = true; update_rri = mtstate->resultRelInfo; num_update_rri = list_length(node->plans); proute->subplan_partition_offsets = @@ -95,16 +100,6 @@ ExecSetupPartitionTupleRouting(ModifyTableState *mtstate, */ proute->root_tuple_slot = MakeTupleTableSlot(); } - else - { - /* - * Since we are inserting tuples, we need to create all new result - * rels. Avoid repeated pallocs by allocating memory for all the - * result rels in bulk. 
- */ - leaf_part_arr = (ResultRelInfo *) palloc0(proute->num_partitions * - sizeof(ResultRelInfo)); - } /* * Initialize an empty slot that will be used to manipulate tuples of any @@ -117,101 +112,58 @@ ExecSetupPartitionTupleRouting(ModifyTableState *mtstate, i = 0; foreach(cell, leaf_parts) { - ResultRelInfo *leaf_part_rri; - Relation partrel = NULL; - TupleDesc part_tupdesc; + ResultRelInfo *leaf_part_rri = NULL; Oid leaf_oid = lfirst_oid(cell); - if (is_update) - { - /* - * If the leaf partition is already present in the per-subplan - * result rels, we re-use that rather than initialize a new result - * rel. The per-subplan resultrels and the resultrels of the leaf - * partitions are both in the same canonical order. So while going - * through the leaf partition oids, we need to keep track of the - * next per-subplan result rel to be looked for in the leaf - * partition resultrels. - */ - if (update_rri_index < num_update_rri && - RelationGetRelid(update_rri[update_rri_index].ri_RelationDesc) == leaf_oid) - { - leaf_part_rri = &update_rri[update_rri_index]; - partrel = leaf_part_rri->ri_RelationDesc; - - /* - * This is required in order to we convert the partition's - * tuple to be compatible with the root partitioned table's - * tuple descriptor. When generating the per-subplan result - * rels, this was not set. - */ - leaf_part_rri->ri_PartitionRoot = rel; - - /* Remember the subplan offset for this ResultRelInfo */ - proute->subplan_partition_offsets[update_rri_index] = i; - - update_rri_index++; - } - else - leaf_part_rri = (ResultRelInfo *) palloc0(sizeof(ResultRelInfo)); - } - else - { - /* For INSERTs, we already have an array of result rels allocated */ - leaf_part_rri = &leaf_part_arr[i]; - } + proute->partition_oids[i] = leaf_oid; /* - * If we didn't open the partition rel, it means we haven't - * initialized the result rel either. + * If the leaf partition is already present in the per-subplan result + * rels, we re-use that rather than initialize a new result rel. The + * per-subplan resultrels and the resultrels of the leaf partitions + * are both in the same canonical order. So while going through the + * leaf partition oids, we need to keep track of the next per-subplan + * result rel to be looked for in the leaf partition resultrels. */ - if (!partrel) + if (update_rri_index < num_update_rri && + RelationGetRelid(update_rri[update_rri_index].ri_RelationDesc) == leaf_oid) { - /* - * We locked all the partitions above including the leaf - * partitions. Note that each of the newly opened relations in - * proute->partitions are eventually closed by the caller. - */ - partrel = heap_open(leaf_oid, NoLock); - InitResultRelInfo(leaf_part_rri, - partrel, - resultRTindex, - rel, - estate->es_instrument); - } - - part_tupdesc = RelationGetDescr(partrel); - - /* - * Save a tuple conversion map to convert a tuple routed to this - * partition from the parent's type to the partition's. - */ - proute->parent_child_tupconv_maps[i] = - convert_tuples_by_name(tupDesc, part_tupdesc, - gettext_noop("could not convert row type")); - - /* - * Verify result relation is a valid target for an INSERT. An UPDATE - * of a partition-key becomes a DELETE+INSERT operation, so this check - * is still required when the operation is CMD_UPDATE. - */ - CheckValidResultRel(leaf_part_rri, CMD_INSERT); - - /* - * Open partition indices. The user may have asked to check for - * conflicts within this leaf partition and do "nothing" instead of - * throwing an error. 
Be prepared in that case by initializing the - * index information needed by ExecInsert() to perform speculative - * insertions. - */ - if (leaf_part_rri->ri_RelationDesc->rd_rel->relhasindex && - leaf_part_rri->ri_IndexRelationDescs == NULL) - ExecOpenIndices(leaf_part_rri, - mtstate != NULL && - mtstate->mt_onconflict != ONCONFLICT_NONE); - - estate->es_leaf_result_relations = - lappend(estate->es_leaf_result_relations, leaf_part_rri); + Relation partrel; + TupleDesc part_tupdesc; + + leaf_part_rri = &update_rri[update_rri_index]; + partrel = leaf_part_rri->ri_RelationDesc; + + /* + * This is required in order to convert the partition's tuple to + * be compatible with the root partitioned table's tuple + * descriptor. When generating the per-subplan result rels, this + * was not set. + */ + leaf_part_rri->ri_PartitionRoot = rel; + + /* Remember the subplan offset for this ResultRelInfo */ + proute->subplan_partition_offsets[update_rri_index] = i; + + update_rri_index++; + + part_tupdesc = RelationGetDescr(partrel); + + /* + * Save a tuple conversion map to convert a tuple routed to this + * partition from the parent's type to the partition's. + */ + proute->parent_child_tupconv_maps[i] = + convert_tuples_by_name(tupDesc, part_tupdesc, + gettext_noop("could not convert row type")); + + /* + * Verify result relation is a valid target for an INSERT. An + * UPDATE of a partition-key becomes a DELETE+INSERT operation, so + * this check is required even when the operation is CMD_UPDATE. + */ + CheckValidResultRel(leaf_part_rri, CMD_INSERT); + } proute->partitions[i] = leaf_part_rri; i++; @@ -219,9 +171,9 @@ ExecSetupPartitionTupleRouting(ModifyTableState *mtstate, /* * For UPDATE, we should have found all the per-subplan resultrels in the - * leaf partitions. + * leaf partitions. (If this is an INSERT, both values will be zero.) */ - Assert(!is_update || update_rri_index == num_update_rri); + Assert(update_rri_index == num_update_rri); return proute; } @@ -345,6 +297,201 @@ ExecFindPartition(ResultRelInfo *resultRelInfo, PartitionDispatch *pd, return result; } +/* + * ExecInitPartitionInfo + * Initialize ResultRelInfo and other information for a partition if not + * already done + * + * Returns the ResultRelInfo + */ +ResultRelInfo * +ExecInitPartitionInfo(ModifyTableState *mtstate, + ResultRelInfo *resultRelInfo, + PartitionTupleRouting *proute, + EState *estate, int partidx) +{ + Relation rootrel = resultRelInfo->ri_RelationDesc, + partrel; + ResultRelInfo *leaf_part_rri; + ModifyTable *node = mtstate ? (ModifyTable *) mtstate->ps.plan : NULL; + MemoryContext oldContext; + + /* + * We locked all the partitions in ExecSetupPartitionTupleRouting + * including the leaf partitions. + */ + partrel = heap_open(proute->partition_oids[partidx], NoLock); + + /* + * Keep ResultRelInfo and other information for this partition in the + * per-query memory context so they'll survive throughout the query. + */ + oldContext = MemoryContextSwitchTo(estate->es_query_cxt); + + leaf_part_rri = (ResultRelInfo *) palloc0(sizeof(ResultRelInfo)); + InitResultRelInfo(leaf_part_rri, + partrel, + node ? node->nominalRelation : 1, + rootrel, + estate->es_instrument); + + /* + * Verify result relation is a valid target for an INSERT. An UPDATE of a + * partition-key becomes a DELETE+INSERT operation, so this check is still + * required when the operation is CMD_UPDATE. 
+ */ + CheckValidResultRel(leaf_part_rri, CMD_INSERT); + + /* + * Since we've just initialized this ResultRelInfo, it's not in any list + * attached to the estate as yet. Add it, so that it can be found later. + * + * Note that the entries in this list appear in no predetermined order, + * because partition result rels are initialized as and when they're + * needed. + */ + estate->es_tuple_routing_result_relations = + lappend(estate->es_tuple_routing_result_relations, + leaf_part_rri); + + /* + * Open partition indices. The user may have asked to check for conflicts + * within this leaf partition and do "nothing" instead of throwing an + * error. Be prepared in that case by initializing the index information + * needed by ExecInsert() to perform speculative insertions. + */ + if (partrel->rd_rel->relhasindex && + leaf_part_rri->ri_IndexRelationDescs == NULL) + ExecOpenIndices(leaf_part_rri, + (mtstate != NULL && + mtstate->mt_onconflict != ONCONFLICT_NONE)); + + /* + * Build WITH CHECK OPTION constraints for the partition. Note that we + * didn't build the withCheckOptionList for partitions within the planner, + * but simple translation of varattnos will suffice. This only occurs for + * the INSERT case or in the case of UPDATE tuple routing where we didn't + * find a result rel to reuse in ExecSetupPartitionTupleRouting(). + */ + if (node && node->withCheckOptionLists != NIL) + { + List *wcoList; + List *wcoExprs = NIL; + ListCell *ll; + int firstVarno = mtstate->resultRelInfo[0].ri_RangeTableIndex; + Relation firstResultRel = mtstate->resultRelInfo[0].ri_RelationDesc; + + /* + * In the case of INSERT on a partitioned table, there is only one + * plan. Likewise, there is only one WCO list, not one per partition. + * For UPDATE, there are as many WCO lists as there are plans. + */ + Assert((node->operation == CMD_INSERT && + list_length(node->withCheckOptionLists) == 1 && + list_length(node->plans) == 1) || + (node->operation == CMD_UPDATE && + list_length(node->withCheckOptionLists) == + list_length(node->plans))); + + /* + * Use the WCO list of the first plan as a reference to calculate + * attno's for the WCO list of this partition. In the INSERT case, + * that refers to the root partitioned table, whereas in the UPDATE + * tuple routing case, that refers to the first partition in the + * mtstate->resultRelInfo array. In any case, both that relation and + * this partition should have the same columns, so we should be able + * to map attributes successfully. + */ + wcoList = linitial(node->withCheckOptionLists); + + /* + * Convert Vars in it to contain this partition's attribute numbers. + */ + wcoList = map_partition_varattnos(wcoList, firstVarno, + partrel, firstResultRel, NULL); + foreach(ll, wcoList) + { + WithCheckOption *wco = castNode(WithCheckOption, lfirst(ll)); + ExprState *wcoExpr = ExecInitQual(castNode(List, wco->qual), + mtstate->mt_plans[0]); + + wcoExprs = lappend(wcoExprs, wcoExpr); + } + + leaf_part_rri->ri_WithCheckOptions = wcoList; + leaf_part_rri->ri_WithCheckOptionExprs = wcoExprs; + } + + /* + * Build the RETURNING projection for the partition. Note that we didn't + * build the returningList for partitions within the planner, but simple + * translation of varattnos will suffice. This only occurs for the INSERT + * case or in the case of UPDATE tuple routing where we didn't find a + * result rel to reuse in ExecSetupPartitionTupleRouting(). 
+ */ + if (node && node->returningLists != NIL) + { + TupleTableSlot *slot; + ExprContext *econtext; + List *returningList; + int firstVarno = mtstate->resultRelInfo[0].ri_RangeTableIndex; + Relation firstResultRel = mtstate->resultRelInfo[0].ri_RelationDesc; + + /* See the comment above for WCO lists. */ + Assert((node->operation == CMD_INSERT && + list_length(node->returningLists) == 1 && + list_length(node->plans) == 1) || + (node->operation == CMD_UPDATE && + list_length(node->returningLists) == + list_length(node->plans))); + + /* + * Use the RETURNING list of the first plan as a reference to + * calculate attno's for the RETURNING list of this partition. See + * the comment above for WCO lists for more details on why this is + * okay. + */ + returningList = linitial(node->returningLists); + + /* + * Convert Vars in it to contain this partition's attribute numbers. + */ + returningList = map_partition_varattnos(returningList, firstVarno, + partrel, firstResultRel, + NULL); + + /* + * Initialize the projection itself. + * + * Use the slot and the expression context that would have been set up + * in ExecInitModifyTable() for projection's output. + */ + Assert(mtstate->ps.ps_ResultTupleSlot != NULL); + slot = mtstate->ps.ps_ResultTupleSlot; + Assert(mtstate->ps.ps_ExprContext != NULL); + econtext = mtstate->ps.ps_ExprContext; + leaf_part_rri->ri_projectReturning = + ExecBuildProjectionInfo(returningList, econtext, slot, + &mtstate->ps, RelationGetDescr(partrel)); + } + + Assert(proute->partitions[partidx] == NULL); + proute->partitions[partidx] = leaf_part_rri; + + /* + * Save a tuple conversion map to convert a tuple routed to this partition + * from the parent's type to the partition's. + */ + proute->parent_child_tupconv_maps[partidx] = + convert_tuples_by_name(RelationGetDescr(rootrel), + RelationGetDescr(partrel), + gettext_noop("could not convert row type")); + + MemoryContextSwitchTo(oldContext); + + return leaf_part_rri; +} + /* * ExecSetupChildParentMapForLeaf -- Initialize the per-leaf-partition * child-to-root tuple conversion map array. @@ -471,6 +618,10 @@ ExecCleanupTupleRouting(PartitionTupleRouting *proute) { ResultRelInfo *resultRelInfo = proute->partitions[i]; + /* skip further processsing for uninitialized partitions */ + if (resultRelInfo == NULL) + continue; + /* * If this result rel is one of the UPDATE subplan result rels, let * ExecEndPlan() close it. For INSERT or COPY, diff --git a/src/include/executor/execPartition.h b/src/include/executor/execPartition.h index 4e0bdc35..40a67ea3 100644 --- a/src/include/executor/execPartition.h +++ b/src/include/executor/execPartition.h @@ -58,6 +58,7 @@ typedef struct PartitionDispatchData *PartitionDispatch; * partition tree. * num_dispatch number of partitioned tables in the partition * tree (= length of partition_dispatch_info[]) + * partition_oids Array of leaf partitions OIDs * partitions Array of ResultRelInfo* objects with one entry * for every leaf partition in the partition tree. 
* num_partitions Number of leaf partitions in the partition tree @@ -91,6 +92,7 @@ typedef struct PartitionTupleRouting { PartitionDispatch *partition_dispatch_info; int num_dispatch; + Oid *partition_oids; ResultRelInfo **partitions; int num_partitions; TupleConversionMap **parent_child_tupconv_maps; @@ -103,12 +105,15 @@ typedef struct PartitionTupleRouting } PartitionTupleRouting; extern PartitionTupleRouting *ExecSetupPartitionTupleRouting(ModifyTableState *mtstate, - Relation rel, Index resultRTindex, - EState *estate); + Relation rel); extern int ExecFindPartition(ResultRelInfo *resultRelInfo, PartitionDispatch *pd, TupleTableSlot *slot, EState *estate); +extern ResultRelInfo *ExecInitPartitionInfo(ModifyTableState *mtstate, + ResultRelInfo *resultRelInfo, + PartitionTupleRouting *proute, + EState *estate, int partidx); extern void ExecSetupChildParentMapForLeaf(PartitionTupleRouting *proute); extern TupleConversionMap *TupConvMapForLeaf(PartitionTupleRouting *proute, ResultRelInfo *rootRelInfo, int leaf_index); From 77056eb0dd53aed29fa1a63459432a77c7bc4841 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Fri, 26 Jun 2020 21:31:38 +0800 Subject: [PATCH 231/578] Be lazier about partition tuple routing. http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/backend/executor/nodeModifyTable.c | 134 +++---------------------- 1 file changed, 12 insertions(+), 122 deletions(-) diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c index 003ff4b8..8d8b816d 100644 --- a/src/backend/executor/nodeModifyTable.c +++ b/src/backend/executor/nodeModifyTable.c @@ -345,10 +345,18 @@ ExecInsert(ModifyTableState *mtstate, /* * Save the old ResultRelInfo and switch to the one corresponding to - * the selected partition. + * the selected partition. (We might need to initialize it first.) */ saved_resultRelInfo = resultRelInfo; resultRelInfo = proute->partitions[leaf_part_index]; + if (resultRelInfo == NULL) + { + resultRelInfo = ExecInitPartitionInfo(mtstate, + saved_resultRelInfo, + proute, estate, + leaf_part_index); + Assert(resultRelInfo != NULL); + } /* We do not yet have a way to insert into a foreign partition */ if (resultRelInfo->ri_FdwRoutine) @@ -2751,14 +2759,11 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) ResultRelInfo *resultRelInfo; TupleDesc tupDesc; Plan *subplan; - int firstVarno = 0; - Relation firstResultRel = NULL; ListCell *l; int i; Relation rel; bool update_tuple_routing_needed = node->partColsUpdated; - PartitionTupleRouting *proute = NULL; - int num_partitions = 0; + #ifdef __TBASE__ bool remote_dml = false; #endif @@ -2995,20 +3000,8 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) */ if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE && (operation == CMD_INSERT || update_tuple_routing_needed)) - { - proute = mtstate->mt_partition_tuple_routing = - ExecSetupPartitionTupleRouting(mtstate, - rel, node->nominalRelation, - estate); - num_partitions = proute->num_partitions; - - /* - * Below are required as reference objects for mapping partition - * attno's in expressions such as WithCheckOptions and RETURNING. 
- */ - firstVarno = mtstate->resultRelInfo[0].ri_RangeTableIndex; - firstResultRel = mtstate->resultRelInfo[0].ri_RelationDesc; - } + mtstate->mt_partition_tuple_routing = + ExecSetupPartitionTupleRouting(mtstate, rel); /* Build state for collecting transition tuples */ ExecSetupTransitionCaptureState(mtstate, estate); @@ -3071,70 +3064,6 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) } #endif - /* - * Build WITH CHECK OPTION constraints for each leaf partition rel. Note - * that we didn't build the withCheckOptionList for each partition within - * the planner, but simple translation of the varattnos for each partition - * will suffice. This only occurs for the INSERT case or for UPDATE row - * movement. DELETEs and local UPDATEs are handled above. - */ - if (node->withCheckOptionLists != NIL && num_partitions > 0) - { - List *first_wcoList; - - /* - * In case of INSERT on partitioned tables, there is only one plan. - * Likewise, there is only one WITH CHECK OPTIONS list, not one per - * partition. Whereas for UPDATE, there are as many WCOs as there are - * plans. So in either case, use the WCO expression of the first - * resultRelInfo as a reference to calculate attno's for the WCO - * expression of each of the partitions. We make a copy of the WCO - * qual for each partition. Note that, if there are SubPlans in there, - * they all end up attached to the one parent Plan node. - */ - Assert(update_tuple_routing_needed || - (operation == CMD_INSERT && - list_length(node->withCheckOptionLists) == 1 && - mtstate->mt_nplans == 1)); - - first_wcoList = linitial(node->withCheckOptionLists); - for (i = 0; i < num_partitions; i++) - { - Relation partrel; - List *mapped_wcoList; - List *wcoExprs = NIL; - ListCell *ll; - - resultRelInfo = proute->partitions[i]; - - /* - * If we are referring to a resultRelInfo from one of the update - * result rels, that result rel would already have - * WithCheckOptions initialized. - */ - if (resultRelInfo->ri_WithCheckOptions) - continue; - - partrel = resultRelInfo->ri_RelationDesc; - - mapped_wcoList = map_partition_varattnos(first_wcoList, - firstVarno, - partrel, firstResultRel, - NULL); - foreach(ll, mapped_wcoList) - { - WithCheckOption *wco = castNode(WithCheckOption, lfirst(ll)); - ExprState *wcoExpr = ExecInitQual(castNode(List, wco->qual), - &mtstate->ps); - - wcoExprs = lappend(wcoExprs, wcoExpr); - } - - resultRelInfo->ri_WithCheckOptions = mapped_wcoList; - resultRelInfo->ri_WithCheckOptionExprs = wcoExprs; - } - } - /* * Initialize RETURNING projections if needed. */ @@ -3142,7 +3071,6 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) { TupleTableSlot *slot; ExprContext *econtext; - List *firstReturningList; /* * Initialize result tuple slot and assign its rowtype using the first @@ -3185,44 +3113,6 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) resultRelInfo++; } - - /* - * Build a projection for each leaf partition rel. Note that we - * didn't build the returningList for each partition within the - * planner, but simple translation of the varattnos for each partition - * will suffice. This only occurs for the INSERT case or for UPDATE - * row movement. DELETEs and local UPDATEs are handled above. 
- */ - firstReturningList = linitial(node->returningLists); - for (i = 0; i < num_partitions; i++) - { - Relation partrel; - List *rlist; - - resultRelInfo = proute->partitions[i]; - - /* - * If we are referring to a resultRelInfo from one of the update - * result rels, that result rel would already have a returningList - * built. - */ - if (resultRelInfo->ri_projectReturning) - continue; - - partrel = resultRelInfo->ri_RelationDesc; - - /* - * Use the returning expression of the first resultRelInfo as a - * reference to calculate attno's for the returning expression of - * each of the partitions. - */ - rlist = map_partition_varattnos(firstReturningList, - firstVarno, - partrel, firstResultRel, NULL); - resultRelInfo->ri_projectReturning = - ExecBuildProjectionInfo(rlist, econtext, slot, &mtstate->ps, - resultRelInfo->ri_RelationDesc->rd_att); - } } else { From 4485fe66fd7dcfd7f4b2ccaedd92bc279d090fdc Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Fri, 26 Jun 2020 21:37:39 +0800 Subject: [PATCH 232/578] Revise API for partition_rbound_cmp/partition_rbound_datum_cmp. http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/backend/catalog/partition.c | 54 ++++++++++++++++++++++----------- 1 file changed, 37 insertions(+), 17 deletions(-) diff --git a/src/backend/catalog/partition.c b/src/backend/catalog/partition.c index e185a7ee..01715488 100644 --- a/src/backend/catalog/partition.c +++ b/src/backend/catalog/partition.c @@ -165,10 +165,12 @@ static PartitionRangeBound *make_one_range_bound(PartitionKey key, int index, List *datums, bool lower); static int32 partition_hbound_cmp(int modulus1, int remainder1, int modulus2, int remainder2); -static int32 partition_rbound_cmp(PartitionKey key, - Datum *datums1, PartitionRangeDatumKind *kind1, - bool lower1, PartitionRangeBound *b2); -static int32 partition_rbound_datum_cmp(PartitionKey key, +static int32 partition_rbound_cmp(int partnatts, FmgrInfo *partsupfunc, + Oid *partcollation, Datum *datums1, + PartitionRangeDatumKind *kind1, bool lower1, + PartitionRangeBound *b2); +static int32 partition_rbound_datum_cmp(FmgrInfo *partsupfunc, + Oid *partcollation, Datum *rb_datums, PartitionRangeDatumKind *rb_kind, Datum *tuple_datums, int n_tuple_datums); @@ -1150,8 +1152,9 @@ check_new_partition_bound(char *relname, Relation parent, * First check if the resulting range would be empty with * specified lower and upper bounds */ - if (partition_rbound_cmp(key, lower->datums, lower->kind, true, - upper) >= 0) + if (partition_rbound_cmp(key->partnatts, key->partsupfunc, + key->partcollation, lower->datums, + lower->kind, true, upper) >= 0) { ereport(ERROR, (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), @@ -1211,7 +1214,10 @@ check_new_partition_bound(char *relname, Relation parent, kind = boundinfo->kind[offset + 1]; is_lower = (boundinfo->indexes[offset + 1] == -1); - cmpval = partition_rbound_cmp(key, datums, kind, + cmpval = partition_rbound_cmp(key->partnatts, + key->partsupfunc, + key->partcollation, + datums, kind, is_lower, upper); if (cmpval < 0) { @@ -2614,6 +2620,7 @@ get_partition_for_tuple(Relation relation, Datum *values, bool *isnull) key->partnatts, values, &equal); + /* * The bound at bound_offset is less than or equal to the * tuple value, so the bound at offset+1 is the upper @@ -2807,7 +2814,9 @@ qsort_partition_rbound_cmp(const void *a, const void *b, void *arg) PartitionRangeBound *b2 = (*(PartitionRangeBound *const *) b); PartitionKey key = (PartitionKey) arg; - return partition_rbound_cmp(key, 
b1->datums, b1->kind, b1->lower, b2); + return partition_rbound_cmp(key->partnatts, key->partsupfunc, + key->partcollation, b1->datums, b1->kind, + b1->lower, b2); } /* @@ -2816,6 +2825,10 @@ qsort_partition_rbound_cmp(const void *a, const void *b, void *arg) * Return for two range bounds whether the 1st one (specified in datum1, * kind1, and lower1) is <, =, or > the bound specified in *b2. * + * partnatts, partsupfunc and partcollation give the number of attributes in the + * bounds to be compared, comparison function to be used and the collations of + * attributes, respectively. + * * Note that if the values of the two range bounds compare equal, then we take * into account whether they are upper or lower bounds, and an upper bound is * considered to be smaller than a lower bound. This is important to the way @@ -2824,7 +2837,7 @@ qsort_partition_rbound_cmp(const void *a, const void *b, void *arg) * two contiguous partitions. */ static int32 -partition_rbound_cmp(PartitionKey key, +partition_rbound_cmp(int partnatts, FmgrInfo *partsupfunc, Oid *partcollation, Datum *datums1, PartitionRangeDatumKind *kind1, bool lower1, PartitionRangeBound *b2) {// #lizard forgives @@ -2834,7 +2847,7 @@ partition_rbound_cmp(PartitionKey key, PartitionRangeDatumKind *kind2 = b2->kind; bool lower2 = b2->lower; - for (i = 0; i < key->partnatts; i++) + for (i = 0; i < partnatts; i++) { /* * First, handle cases where the column is unbounded, which should not @@ -2855,8 +2868,8 @@ partition_rbound_cmp(PartitionKey key, */ break; - cmpval = DatumGetInt32(FunctionCall2Coll(&key->partsupfunc[i], - key->partcollation[i], + cmpval = DatumGetInt32(FunctionCall2Coll(&partsupfunc[i], + partcollation[i], datums1[i], datums2[i])); if (cmpval != 0) @@ -2880,9 +2893,14 @@ partition_rbound_cmp(PartitionKey key, * * Return whether range bound (specified in rb_datums, rb_kind, and rb_lower) * is <, =, or > partition key of tuple (tuple_datums) + * + * n_tuple_datums, partsupfunc and partcollation give number of attributes in + * the bounds to be compared, comparison function to be used and the collations + * of attributes resp. 
+ * */ static int32 -partition_rbound_datum_cmp(PartitionKey key, +partition_rbound_datum_cmp(FmgrInfo *partsupfunc, Oid *partcollation, Datum *rb_datums, PartitionRangeDatumKind *rb_kind, Datum *tuple_datums, int n_tuple_datums) { @@ -2896,8 +2914,8 @@ partition_rbound_datum_cmp(PartitionKey key, else if (rb_kind[i] == PARTITION_RANGE_DATUM_MAXVALUE) return 1; - cmpval = DatumGetInt32(FunctionCall2Coll(&key->partsupfunc[i], - key->partcollation[i], + cmpval = DatumGetInt32(FunctionCall2Coll(&partsupfunc[i], + partcollation[i], rb_datums[i], tuple_datums[i])); if (cmpval != 0) @@ -2974,7 +2992,8 @@ partition_range_bsearch(PartitionKey key, int32 cmpval; mid = (lo + hi + 1) / 2; - cmpval = partition_rbound_cmp(key, + cmpval = partition_rbound_cmp(key->partnatts, key->partsupfunc, + key->partcollation, boundinfo->datums[mid], boundinfo->kind[mid], (boundinfo->indexes[mid] == -1), @@ -3018,7 +3037,8 @@ partition_range_datum_bsearch(PartitionKey key, int32 cmpval; mid = (lo + hi + 1) / 2; - cmpval = partition_rbound_datum_cmp(key, + cmpval = partition_rbound_datum_cmp(key->partsupfunc, + key->partcollation, boundinfo->datums[mid], boundinfo->kind[mid], values, From ba7473f1c0f0e6e4aa5f460c9268caa3857e362a Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Fri, 26 Jun 2020 21:43:47 +0800 Subject: [PATCH 233/578] Revise API for partition bound search functions.http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/backend/catalog/partition.c | 66 +++++++++++++++++++-------------- 1 file changed, 38 insertions(+), 28 deletions(-) diff --git a/src/backend/catalog/partition.c b/src/backend/catalog/partition.c index 01715488..bf765697 100644 --- a/src/backend/catalog/partition.c +++ b/src/backend/catalog/partition.c @@ -174,22 +174,24 @@ static int32 partition_rbound_datum_cmp(FmgrInfo *partsupfunc, Datum *rb_datums, PartitionRangeDatumKind *rb_kind, Datum *tuple_datums, int n_tuple_datums); -static int partition_list_bsearch(PartitionKey key, +static int partition_list_bsearch(FmgrInfo *partsupfunc, Oid *partcollation, PartitionBoundInfo boundinfo, Datum value, bool *is_equal); -static int partition_range_bsearch(PartitionKey key, +static int partition_range_bsearch(int partnatts, FmgrInfo *partsupfunc, + Oid *partcollation, PartitionBoundInfo boundinfo, PartitionRangeBound *probe, bool *is_equal); -static int partition_range_datum_bsearch(PartitionKey key, +static int partition_range_datum_bsearch(FmgrInfo *partsupfunc, + Oid *partcollation, PartitionBoundInfo boundinfo, int nvalues, Datum *values, bool *is_equal); -static int partition_hash_bsearch(PartitionKey key, - PartitionBoundInfo boundinfo, +static int partition_hash_bsearch(PartitionBoundInfo boundinfo, int modulus, int remainder); static int get_partition_bound_num_indexes(PartitionBoundInfo b); static int get_greatest_modulus(PartitionBoundInfo b); -static uint64 compute_hash_value(PartitionKey key, Datum *values, bool *isnull); +static uint64 compute_hash_value(int partnatts, FmgrInfo *partsupfunc, + Datum *values, bool *isnull); /* SQL-callable function for use in hash partition CHECK constraints */ PG_FUNCTION_INFO_V1(satisfies_hash_partition); @@ -1041,7 +1043,7 @@ check_new_partition_bound(char *relname, Relation parent, * boundinfo->datums that is less than or equal to the * (spec->modulus, spec->remainder) pair. 
*/ - offset = partition_hash_bsearch(key, boundinfo, + offset = partition_hash_bsearch(boundinfo, spec->modulus, spec->remainder); if (offset < 0) @@ -1117,7 +1119,9 @@ check_new_partition_bound(char *relname, Relation parent, int offset; bool equal; - offset = partition_list_bsearch(key, boundinfo, + offset = partition_list_bsearch(key->partsupfunc, + key->partcollation, + boundinfo, val->constvalue, &equal); if (offset >= 0 && equal) @@ -1192,7 +1196,10 @@ check_new_partition_bound(char *relname, Relation parent, * since the index array is initialised with an extra -1 * at the end. */ - offset = partition_range_bsearch(key, boundinfo, lower, + offset = partition_range_bsearch(key->partnatts, + key->partsupfunc, + key->partcollation, + boundinfo, lower, &equal); if (boundinfo->indexes[offset + 1] < 0) @@ -2569,7 +2576,9 @@ get_partition_for_tuple(Relation relation, Datum *values, bool *isnull) { PartitionBoundInfo boundinfo = partdesc->boundinfo; int greatest_modulus = get_greatest_modulus(boundinfo); - uint64 rowHash = compute_hash_value(key, values, isnull); + uint64 rowHash = compute_hash_value(key->partnatts, + key->partsupfunc, + values, isnull); part_index = boundinfo->indexes[rowHash % greatest_modulus]; } @@ -2585,7 +2594,8 @@ get_partition_for_tuple(Relation relation, Datum *values, bool *isnull) { bool equal = false; - bound_offset = partition_list_bsearch(key, + bound_offset = partition_list_bsearch(key->partsupfunc, + key->partcollation, partdesc->boundinfo, values[0], &equal); if (bound_offset >= 0 && equal) @@ -2615,7 +2625,8 @@ get_partition_for_tuple(Relation relation, Datum *values, bool *isnull) if (!range_partkey_has_null) { - bound_offset = partition_range_datum_bsearch(key, + bound_offset = partition_range_datum_bsearch(key->partsupfunc, + key->partcollation, partdesc->boundinfo, key->partnatts, values, @@ -2934,7 +2945,7 @@ partition_rbound_datum_cmp(FmgrInfo *partsupfunc, Oid *partcollation, * to the input value. */ static int -partition_list_bsearch(PartitionKey key, +partition_list_bsearch(FmgrInfo *partsupfunc, Oid *partcollation, PartitionBoundInfo boundinfo, Datum value, bool *is_equal) { @@ -2949,8 +2960,8 @@ partition_list_bsearch(PartitionKey key, int32 cmpval; mid = (lo + hi + 1) / 2; - cmpval = DatumGetInt32(FunctionCall2Coll(&key->partsupfunc[0], - key->partcollation[0], + cmpval = DatumGetInt32(FunctionCall2Coll(&partsupfunc[0], + partcollation[0], boundinfo->datums[mid][0], value)); if (cmpval <= 0) @@ -2977,7 +2988,8 @@ partition_list_bsearch(PartitionKey key, * to the input range bound */ static int -partition_range_bsearch(PartitionKey key, +partition_range_bsearch(int partnatts, FmgrInfo *partsupfunc, + Oid *partcollation, PartitionBoundInfo boundinfo, PartitionRangeBound *probe, bool *is_equal) { @@ -2992,8 +3004,7 @@ partition_range_bsearch(PartitionKey key, int32 cmpval; mid = (lo + hi + 1) / 2; - cmpval = partition_rbound_cmp(key->partnatts, key->partsupfunc, - key->partcollation, + cmpval = partition_rbound_cmp(partnatts, partsupfunc, partcollation, boundinfo->datums[mid], boundinfo->kind[mid], (boundinfo->indexes[mid] == -1), @@ -3022,7 +3033,7 @@ partition_range_bsearch(PartitionKey key, * to the input tuple. 
*/ static int -partition_range_datum_bsearch(PartitionKey key, +partition_range_datum_bsearch(FmgrInfo *partsupfunc, Oid *partcollation, PartitionBoundInfo boundinfo, int nvalues, Datum *values, bool *is_equal) { @@ -3037,8 +3048,8 @@ partition_range_datum_bsearch(PartitionKey key, int32 cmpval; mid = (lo + hi + 1) / 2; - cmpval = partition_rbound_datum_cmp(key->partsupfunc, - key->partcollation, + cmpval = partition_rbound_datum_cmp(partsupfunc, + partcollation, boundinfo->datums[mid], boundinfo->kind[mid], values, @@ -3065,8 +3076,7 @@ partition_range_datum_bsearch(PartitionKey key, * all of them are greater */ static int -partition_hash_bsearch(PartitionKey key, - PartitionBoundInfo boundinfo, +partition_hash_bsearch(PartitionBoundInfo boundinfo, int modulus, int remainder) { int lo, @@ -3264,27 +3274,27 @@ get_greatest_modulus(PartitionBoundInfo bound) * Compute the hash value for given not null partition key values. */ static uint64 -compute_hash_value(PartitionKey key, Datum *values, bool *isnull) +compute_hash_value(int partnatts, FmgrInfo *partsupfunc, + Datum *values, bool *isnull) { int i; - int nkeys = key->partnatts; uint64 rowHash = 0; Datum seed = UInt64GetDatum(HASH_PARTITION_SEED); - for (i = 0; i < nkeys; i++) + for (i = 0; i < partnatts; i++) { if (!isnull[i]) { Datum hash; - Assert(OidIsValid(key->partsupfunc[i].fn_oid)); + Assert(OidIsValid(partsupfunc[i].fn_oid)); /* * Compute hash for each datum value by calling respective * datatype-specific hash functions of each partition key * attribute. */ - hash = FunctionCall2(&key->partsupfunc[i], values[i], seed); + hash = FunctionCall2(&partsupfunc[i], values[i], seed); /* Form a single 64-bit hash value */ rowHash = hash_combine64(rowHash, DatumGetUInt64(hash)); From 6b542dcaf20bbda6f501310a50f1864e01b5e7e9 Mon Sep 17 00:00:00 2001 From: Alvaro Herrera Date: Mon, 26 Feb 2018 17:05:46 -0300 Subject: [PATCH 234/578] Update PartitionTupleRouting struct comment Small review on edd44738bc88. Discussion: https://postgr.es/m/20180222165315.k27qfn4goskhoswj@alvherre.pgsql Reviewed-by: Robert Haas, Amit Langote --- src/include/executor/execPartition.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/include/executor/execPartition.h b/src/include/executor/execPartition.h index 40a67ea3..6996258a 100644 --- a/src/include/executor/execPartition.h +++ b/src/include/executor/execPartition.h @@ -58,11 +58,15 @@ typedef struct PartitionDispatchData *PartitionDispatch; * partition tree. * num_dispatch number of partitioned tables in the partition * tree (= length of partition_dispatch_info[]) - * partition_oids Array of leaf partitions OIDs + * partition_oids Array of leaf partitions OIDs with one entry + * for every leaf partition in the partition tree, + * initialized in full by + * ExecSetupPartitionTupleRouting. * partitions Array of ResultRelInfo* objects with one entry - * for every leaf partition in the partition tree. + * for every leaf partition in the partition tree, + * initialized lazily by ExecInitPartitionInfo. 
* num_partitions Number of leaf partitions in the partition tree - * (= 'partitions' array length) + * (= 'partitions_oid'/'partitions' array length) * parent_child_tupconv_maps Array of TupleConversionMap objects with one * entry for every leaf partition (required to * convert tuple from the root table's rowtype to From 59dc8ae73967f58329719c0b6f807610b38f8b67 Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Wed, 28 Feb 2018 12:16:09 -0500 Subject: [PATCH 235/578] For partitionwise join, match on partcollation, not parttypcoll. The previous code considered two tables to have the partition scheme if the underlying columns had the same collation, but what we actually need to compare is not the collations associated with the column but the collation used for partitioning. Fix that. Robert Haas and Amit Langote Discussion: http://postgr.es/m/0f95f924-0efa-4cf5-eb5f-9a3d1bc3c33d@lab.ntt.co.jp --- src/backend/optimizer/util/plancat.c | 6 +++--- src/include/nodes/relation.h | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c index dba8d09d..e310e85b 100644 --- a/src/backend/optimizer/util/plancat.c +++ b/src/backend/optimizer/util/plancat.c @@ -2034,7 +2034,7 @@ find_partition_scheme(PlannerInfo *root, Relation relation) sizeof(Oid) * partnatts) != 0 || memcmp(partkey->partopcintype, part_scheme->partopcintype, sizeof(Oid) * partnatts) != 0 || - memcmp(partkey->parttypcoll, part_scheme->parttypcoll, + memcmp(partkey->partcollation, part_scheme->partcollation, sizeof(Oid) * partnatts) != 0) continue; @@ -2069,8 +2069,8 @@ find_partition_scheme(PlannerInfo *root, Relation relation) memcpy(part_scheme->partopcintype, partkey->partopcintype, sizeof(Oid) * partnatts); - part_scheme->parttypcoll = (Oid *) palloc(sizeof(Oid) * partnatts); - memcpy(part_scheme->parttypcoll, partkey->parttypcoll, + part_scheme->partcollation = (Oid *) palloc(sizeof(Oid) * partnatts); + memcpy(part_scheme->partcollation, partkey->partcollation, sizeof(Oid) * partnatts); part_scheme->parttyplen = (int16 *) palloc(sizeof(int16) * partnatts); diff --git a/src/include/nodes/relation.h b/src/include/nodes/relation.h index ee843fad..6172b31e 100644 --- a/src/include/nodes/relation.h +++ b/src/include/nodes/relation.h @@ -463,7 +463,7 @@ typedef struct PartitionSchemeData int16 partnatts; /* number of partition attributes */ Oid *partopfamily; /* OIDs of operator families */ Oid *partopcintype; /* OIDs of opclass declared input data types */ - Oid *parttypcoll; /* OIDs of collations of partition keys. */ + Oid *partcollation; /* OIDs of partitioning collations */ /* Cached information about partition key data types. */ int16 *parttyplen; From e1047e742f714523a6bb715645987dc9f3db0512 Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Mon, 5 Mar 2018 17:49:59 -0800 Subject: [PATCH 236/578] Fix parent node of WCO expressions in partitioned tables. Since edd44738bc8814 WCO expressions of partitioned tables are initialized with the first subplan as parent. That's not correct, as the correct context is the ModifyTableState node. That's also what is used for RETURNING processing, initialized nearby. This appears not to cause any visible problems for in core code, but is problematic for in development patch. 
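For reference, a minimal sketch of the intended initialization (not part of the patch; the names mapped_wcoList, wcoExprs, ll, rlist, econtext, slot, partrel and resultRelInfo are taken from the surrounding ExecInitPartitionInfo code and are shown here only for illustration): the WCO quals of a routed partition are initialized with the ModifyTableState's PlanState (&mtstate->ps) as parent, the same parent used for the nearby RETURNING projection, so that any SubPlans inside those expressions are attached to the ModifyTable node rather than to the first subplan.

    /* Initialize WCO quals of the routed partition with the ModifyTable
     * node as parent, mirroring how the RETURNING projection is built. */
    foreach(ll, mapped_wcoList)
    {
        WithCheckOption *wco = castNode(WithCheckOption, lfirst(ll));
        ExprState  *wcoExpr = ExecInitQual(castNode(List, wco->qual),
                                           &mtstate->ps);

        wcoExprs = lappend(wcoExprs, wcoExpr);
    }

    /* The RETURNING projection uses the same parent PlanState. */
    resultRelInfo->ri_projectReturning =
        ExecBuildProjectionInfo(rlist, econtext, slot, &mtstate->ps,
                                RelationGetDescr(partrel));

Using one consistent parent matters because the parent PlanState owns the per-node state (including any initialized SubPlans) for expressions built under it; mixing parents, as the pre-fix code did, only works by accident for in-core callers.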
Discussion: https://postgr.es/m/20180303043818.tnvlo243bgy7una3@alap3.anarazel.de --- src/backend/executor/execPartition.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/executor/execPartition.c b/src/backend/executor/execPartition.c index ad72c3cf..50bc3754 100644 --- a/src/backend/executor/execPartition.c +++ b/src/backend/executor/execPartition.c @@ -413,7 +413,7 @@ ExecInitPartitionInfo(ModifyTableState *mtstate, { WithCheckOption *wco = castNode(WithCheckOption, lfirst(ll)); ExprState *wcoExpr = ExecInitQual(castNode(List, wco->qual), - mtstate->mt_plans[0]); + &mtstate->ps); wcoExprs = lappend(wcoExprs, wcoExpr); } From 2e2ee0bfff790eebde820e65734e61df02bf35cc Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Sun, 28 Jun 2020 14:49:58 +0800 Subject: [PATCH 237/578] Fix state reversal after partition tuple routing --- src/backend/commands/copy.c | 13 +- src/backend/executor/nodeModifyTable.c | 216 +++++++++++++------------ src/include/nodes/execnodes.h | 9 +- src/test/regress/expected/insert.out | 26 +++ src/test/regress/sql/insert.sql | 23 +++ 5 files changed, 177 insertions(+), 110 deletions(-) diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c index 8ef1f6cd..321b44a1 100644 --- a/src/backend/commands/copy.c +++ b/src/backend/commands/copy.c @@ -3489,13 +3489,12 @@ CopyFrom(CopyState cstate) if (cstate->transition_capture != NULL) { if (resultRelInfo->ri_TrigDesc && - (resultRelInfo->ri_TrigDesc->trig_insert_before_row || - resultRelInfo->ri_TrigDesc->trig_insert_instead_row)) + resultRelInfo->ri_TrigDesc->trig_insert_before_row) { /* - * If there are any BEFORE or INSTEAD triggers on the - * partition, we'll have to be ready to convert their - * result back to tuplestore format. + * If there are any BEFORE triggers on the partition, + * we'll have to be ready to convert their result back to + * tuplestore format. */ cstate->transition_capture->tcs_original_insert_tuple = NULL; cstate->transition_capture->tcs_map = @@ -3772,18 +3771,18 @@ CopyFrom(CopyState cstate) * tuples inserted by an INSERT command. 
*/ processed++; + } + /* Restore the saved ResultRelInfo */ if (saved_resultRelInfo) { resultRelInfo = saved_resultRelInfo; estate->es_result_relation_info = resultRelInfo; } - } #ifdef PGXC } #endif } - /* Flush any remaining buffered tuples */ #ifdef __TBASE__ if(IS_PGXC_DATANODE && npart > 0) diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c index 8d8b816d..a8cb0df0 100644 --- a/src/backend/executor/nodeModifyTable.c +++ b/src/backend/executor/nodeModifyTable.c @@ -79,6 +79,11 @@ static bool ExecOnConflictUpdate(ModifyTableState *mtstate, EState *estate, bool canSetTag, TupleTableSlot **returning); +static TupleTableSlot *ExecPrepareTupleRouting(ModifyTableState *mtstate, + EState *estate, + PartitionTupleRouting *proute, + ResultRelInfo *targetRelInfo, + TupleTableSlot *slot); static ResultRelInfo *getTargetResultRelInfo(ModifyTableState *node); static void ExecSetupChildParentMapForTcs(ModifyTableState *mtstate); static void ExecSetupChildParentMapForSubplan(ModifyTableState *mtstate); @@ -281,7 +286,6 @@ ExecInsert(ModifyTableState *mtstate, {// #lizard forgives HeapTuple tuple; ResultRelInfo *resultRelInfo; - ResultRelInfo *saved_resultRelInfo = NULL; Relation resultRelationDesc; Oid newId; List *recheckIndexes = NIL; @@ -322,92 +326,7 @@ ExecInsert(ModifyTableState *mtstate, tuple = ExecMaterializeSlot_shard(slot, hasshard, diskey, secdiskey, RelationGetRelid(resultRelationDesc)); } #endif - /* Determine the partition to heap_insert the tuple into */ - if (mtstate->mt_partition_tuple_routing) - { - int leaf_part_index; - PartitionTupleRouting *proute = mtstate->mt_partition_tuple_routing; - - /* - * Away we go ... If we end up not finding a partition after all, - * ExecFindPartition() does not return and errors out instead. - * Otherwise, the returned value is to be used as an index into arrays - * proute->partitions[] and proute->partition_tupconv_maps[] that will - * get us the ResultRelInfo and TupleConversionMap for the partition, - * respectively. - */ - leaf_part_index = ExecFindPartition(resultRelInfo, - proute->partition_dispatch_info, - slot, - estate); - Assert(leaf_part_index >= 0 && - leaf_part_index < proute->num_partitions); - - /* - * Save the old ResultRelInfo and switch to the one corresponding to - * the selected partition. (We might need to initialize it first.) - */ - saved_resultRelInfo = resultRelInfo; - resultRelInfo = proute->partitions[leaf_part_index]; - if (resultRelInfo == NULL) - { - resultRelInfo = ExecInitPartitionInfo(mtstate, - saved_resultRelInfo, - proute, estate, - leaf_part_index); - Assert(resultRelInfo != NULL); - } - /* We do not yet have a way to insert into a foreign partition */ - if (resultRelInfo->ri_FdwRoutine) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("cannot route inserted tuples to a foreign table"))); - - /* For ExecInsertIndexTuples() to work on the partition's indexes */ - estate->es_result_relation_info = resultRelInfo; - - /* - * If we're capturing transition tuples, we might need to convert from - * the partition rowtype to parent rowtype. - */ - if (mtstate->mt_transition_capture != NULL) - { - if (resultRelInfo->ri_TrigDesc && - (resultRelInfo->ri_TrigDesc->trig_insert_before_row || - resultRelInfo->ri_TrigDesc->trig_insert_instead_row)) - { - /* - * If there are any BEFORE or INSTEAD triggers on the - * partition, we'll have to be ready to convert their result - * back to tuplestore format. 
- */ - mtstate->mt_transition_capture->tcs_original_insert_tuple = NULL; - - mtstate->mt_transition_capture->tcs_map = - TupConvMapForLeaf(proute, saved_resultRelInfo, - leaf_part_index); - } - else - { - /* - * Otherwise, just remember the original unconverted tuple, to - * avoid a needless round trip conversion. - */ - mtstate->mt_transition_capture->tcs_original_insert_tuple = tuple; - mtstate->mt_transition_capture->tcs_map = NULL; - } - } - - /* - * We might need to convert from the parent rowtype to the partition - * rowtype. - */ - tuple = ConvertPartitionTupleSlot(proute->parent_child_tupconv_maps[leaf_part_index], - tuple, - proute->partition_tuple_slot, - &slot); - } #ifdef __TBASE__ /* Determine the interval partition to heap_insert the tuple into */ else if (resultRelInfo->ispartparent) @@ -633,7 +552,7 @@ ExecInsert(ModifyTableState *mtstate, * No need though if the tuple has been routed, and a BR trigger * doesn't exist. */ - if (saved_resultRelInfo != NULL && + if (resultRelInfo->ri_PartitionRoot != NULL && !(resultRelInfo->ri_TrigDesc && resultRelInfo->ri_TrigDesc->trig_insert_before_row)) check_partition_constr = false; @@ -891,9 +810,6 @@ ExecInsert(ModifyTableState *mtstate, if (resultRelInfo->ri_projectReturning) result = ExecProcessReturning(resultRelInfo, slot, planSlot); - if (saved_resultRelInfo) - estate->es_result_relation_info = saved_resultRelInfo; - return result; } @@ -1528,27 +1444,22 @@ lreplace:; proute->root_tuple_slot, &slot); - - /* - * For ExecInsert(), make it look like we are inserting into the - * root. - */ + /* Prepare for tuple routing */ Assert(mtstate->rootResultRelInfo != NULL); - estate->es_result_relation_info = mtstate->rootResultRelInfo; + slot = ExecPrepareTupleRouting(mtstate, estate, proute, + mtstate->rootResultRelInfo, slot); ret_slot = ExecInsert(mtstate, slot, planSlot, NULL, ONCONFLICT_NONE, estate, canSetTag); - /* - * Revert back the active result relation and the active - * transition capture map that we changed above. - */ + /* Revert ExecPrepareTupleRouting's node change. */ estate->es_result_relation_info = resultRelInfo; if (mtstate->mt_transition_capture) { mtstate->mt_transition_capture->tcs_original_insert_tuple = NULL; mtstate->mt_transition_capture->tcs_map = saved_tcs_map; } + return ret_slot; } @@ -2046,6 +1957,103 @@ ExecSetupTransitionCaptureState(ModifyTableState *mtstate, EState *estate) } } +/* + * ExecPrepareTupleRouting --- prepare for routing one tuple + * + * Determine the partition in which the tuple in slot is to be inserted, + * and modify mtstate and estate to prepare for it. + * + * Caller must revert the estate changes after executing the insertion! + * In mtstate, transition capture changes may also need to be reverted. + * + * Returns a slot holding the tuple of the partition rowtype. + */ +static TupleTableSlot * +ExecPrepareTupleRouting(ModifyTableState *mtstate, + EState *estate, + PartitionTupleRouting *proute, + ResultRelInfo *targetRelInfo, + TupleTableSlot *slot) +{ + int partidx; + ResultRelInfo *partrel; + HeapTuple tuple; + + /* + * Determine the target partition. If ExecFindPartition does not find + * a partition after all, it doesn't return here; otherwise, the returned + * value is to be used as an index into the arrays for the ResultRelInfo + * and TupleConversionMap for the partition. 
+ */ + partidx = ExecFindPartition(targetRelInfo, + proute->partition_dispatch_info, + slot, + estate); + Assert(partidx >= 0 && partidx < proute->num_partitions); + + /* + * Get the ResultRelInfo corresponding to the selected partition; if not + * yet there, initialize it. + */ + partrel = proute->partitions[partidx]; + if (partrel == NULL) + partrel = ExecInitPartitionInfo(mtstate, targetRelInfo, + proute, estate, + partidx); + + /* We do not yet have a way to insert into a foreign partition */ + if (partrel->ri_FdwRoutine) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot route inserted tuples to a foreign table"))); + + /* + * Make it look like we are inserting into the partition. + */ + estate->es_result_relation_info = partrel; + + /* Get the heap tuple out of the given slot. */ + tuple = ExecMaterializeSlot(slot); + + /* + * If we're capturing transition tuples, we might need to convert from the + * partition rowtype to parent rowtype. + */ + if (mtstate->mt_transition_capture != NULL) + { + if (partrel->ri_TrigDesc && + partrel->ri_TrigDesc->trig_insert_before_row) + { + /* + * If there are any BEFORE triggers on the partition, we'll have + * to be ready to convert their result back to tuplestore format. + */ + mtstate->mt_transition_capture->tcs_original_insert_tuple = NULL; + mtstate->mt_transition_capture->tcs_map = + TupConvMapForLeaf(proute, targetRelInfo, partidx); + } + else + { + /* + * Otherwise, just remember the original unconverted tuple, to + * avoid a needless round trip conversion. + */ + mtstate->mt_transition_capture->tcs_original_insert_tuple = tuple; + mtstate->mt_transition_capture->tcs_map = NULL; + } + } + + /* + * Convert the tuple, if necessary. + */ + ConvertPartitionTupleSlot(proute->parent_child_tupconv_maps[partidx], + tuple, + proute->partition_tuple_slot, + &slot); + + return slot; +} + /* * Initialize the child-to-root tuple conversion map array for UPDATE subplans. * @@ -2182,6 +2190,7 @@ static TupleTableSlot * ExecModifyTable(PlanState *pstate) {// #lizard forgives ModifyTableState *node = castNode(ModifyTableState, pstate); + PartitionTupleRouting *proute = node->mt_partition_tuple_routing; EState *estate = node->ps.state; CmdType operation = node->operation; ResultRelInfo *saved_resultRelInfo; @@ -2658,9 +2667,16 @@ ExecModifyTable(PlanState *pstate) oldtag = mls_command_tag_switch_to(CLS_CMD_ROW); } #endif + /* Prepare for tuple routing if needed. */ + if (proute) + slot = ExecPrepareTupleRouting(node, estate, proute, + resultRelInfo, slot); slot = ExecInsert(node, slot, planSlot, node->mt_arbiterindexes, node->mt_onconflict, estate, node->canSetTag); + /* Revert ExecPrepareTupleRouting's state change. */ + if (proute) + estate->es_result_relation_info = resultRelInfo; #ifdef _MLS_ if (IsA(subplanstate, ResultState) || IsA(subplanstate, RemoteSubplanState) ) { diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index 74475d60..bed56a23 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -1066,12 +1066,15 @@ typedef struct ModifyTableState TupleTableSlot *mt_existing; /* slot to store existing target tuple in */ List *mt_excludedtlist; /* the excluded pseudo relation's tlist */ TupleTableSlot *mt_conflproj; /* CONFLICT ... SET ... 
projection target */ - struct PartitionTupleRouting *mt_partition_tuple_routing; + /* Tuple-routing support info */ + struct PartitionTupleRouting *mt_partition_tuple_routing; + + /* controls transition table population for specified operation */ struct TransitionCaptureState *mt_transition_capture; - /* controls transition table population */ - TupleConversionMap **mt_per_subplan_tupconv_maps; + /* Per plan map for tuple conversion from child to root */ + TupleConversionMap **mt_per_subplan_tupconv_maps; #ifdef __TBASE__ /* used for interval partition */ bool haspartparent; diff --git a/src/test/regress/expected/insert.out b/src/test/regress/expected/insert.out index a671b345..e1a74c4a 100644 --- a/src/test/regress/expected/insert.out +++ b/src/test/regress/expected/insert.out @@ -778,6 +778,32 @@ drop role regress_coldesc_role; drop table inserttest3; drop table brtrigpartcon; drop function brtrigpartcon1trigf(); +-- check that "do nothing" BR triggers work with tuple-routing (this checks +-- that estate->es_result_relation_info is appropriately set/reset for each +-- routed tuple) +create table donothingbrtrig_test (a int, b text) partition by list (a); +create table donothingbrtrig_test1 (b text, a int); +create table donothingbrtrig_test2 (c text, b text, a int); +alter table donothingbrtrig_test2 drop column c; +create or replace function donothingbrtrig_func() returns trigger as $$begin raise notice 'b: %', new.b; return NULL; end$$ language plpgsql; +create trigger donothingbrtrig1 before insert on donothingbrtrig_test1 for each row execute procedure donothingbrtrig_func(); +create trigger donothingbrtrig2 before insert on donothingbrtrig_test2 for each row execute procedure donothingbrtrig_func(); +alter table donothingbrtrig_test attach partition donothingbrtrig_test1 for values in (1); +alter table donothingbrtrig_test attach partition donothingbrtrig_test2 for values in (2); +insert into donothingbrtrig_test values (1, 'foo'), (2, 'bar'); +NOTICE: b: foo +NOTICE: b: bar +copy donothingbrtrig_test from stdout; +NOTICE: b: baz +NOTICE: b: qux +select tableoid::regclass, * from donothingbrtrig_test; + tableoid | a | b +----------+---+--- +(0 rows) + +-- cleanup +drop table donothingbrtrig_test; +drop function donothingbrtrig_func(); -- check multi-column range partitioning with minvalue/maxvalue constraints create table mcrparted (a text, b int) partition by range(a, b); create table mcrparted1_lt_b partition of mcrparted for values from (minvalue, minvalue) to ('b', minvalue); diff --git a/src/test/regress/sql/insert.sql b/src/test/regress/sql/insert.sql index 21d04de1..9a561519 100644 --- a/src/test/regress/sql/insert.sql +++ b/src/test/regress/sql/insert.sql @@ -483,6 +483,29 @@ drop table inserttest3; drop table brtrigpartcon; drop function brtrigpartcon1trigf(); +-- check that "do nothing" BR triggers work with tuple-routing (this checks +-- that estate->es_result_relation_info is appropriately set/reset for each +-- routed tuple) +create table donothingbrtrig_test (a int, b text) partition by list (a); +create table donothingbrtrig_test1 (b text, a int); +create table donothingbrtrig_test2 (c text, b text, a int); +alter table donothingbrtrig_test2 drop column c; +create or replace function donothingbrtrig_func() returns trigger as $$begin raise notice 'b: %', new.b; return NULL; end$$ language plpgsql; +create trigger donothingbrtrig1 before insert on donothingbrtrig_test1 for each row execute procedure donothingbrtrig_func(); +create trigger donothingbrtrig2 before insert 
on donothingbrtrig_test2 for each row execute procedure donothingbrtrig_func(); +alter table donothingbrtrig_test attach partition donothingbrtrig_test1 for values in (1); +alter table donothingbrtrig_test attach partition donothingbrtrig_test2 for values in (2); +insert into donothingbrtrig_test values (1, 'foo'), (2, 'bar'); +copy donothingbrtrig_test from stdout; +1 baz +2 qux +\. +select tableoid::regclass, * from donothingbrtrig_test; + +-- cleanup +drop table donothingbrtrig_test; +drop function donothingbrtrig_func(); + -- check multi-column range partitioning with minvalue/maxvalue constraints create table mcrparted (a text, b int) partition by range(a, b); create table mcrparted1_lt_b partition of mcrparted for values from (minvalue, minvalue) to ('b', minvalue); From 5827d1e618561143ec6a746817a8accf7410a018 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Sun, 28 Jun 2020 20:53:20 +0800 Subject: [PATCH 238/578] Factor some code out of create_grouping_paths.http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/backend/optimizer/plan/planner.c | 6146 +++++++++++++------------- 1 file changed, 3112 insertions(+), 3034 deletions(-) diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index b22d678d..640dcc4d 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -203,6 +203,26 @@ static bool can_push_down_grouping(PlannerInfo *root, Query *parse, Path *path); static bool can_push_down_window(PlannerInfo *root, Path *path); static void adjust_paths_for_srfs(PlannerInfo *root, RelOptInfo *rel, List *targets, List *targets_contain_srfs); +static void add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel, + RelOptInfo *grouped_rel, PathTarget *target, + PathTarget *partial_grouping_target, + const AggClauseCosts *agg_costs, + const AggClauseCosts *agg_final_costs, + grouping_sets_data *gd, bool can_sort, bool can_hash, + double dNumGroups, List *havingQual); +static void add_partial_paths_to_grouping_rel(PlannerInfo *root, + RelOptInfo *input_rel, + RelOptInfo *grouped_rel, + PathTarget *target, + PathTarget *partial_grouping_target, + AggClauseCosts *agg_partial_costs, + AggClauseCosts *agg_final_costs, + grouping_sets_data *gd, + bool can_sort, + bool can_hash, + List *havingQual); +static bool can_parallel_agg(PlannerInfo *root, RelOptInfo *input_rel, + RelOptInfo *grouped_rel, const AggClauseCosts *agg_costs); #ifdef __TBASE__ static Path *adjust_modifytable_subpath(PlannerInfo *root, Query *parse, Path *path); #endif @@ -3899,15 +3919,12 @@ create_grouping_paths(PlannerInfo *root, PathTarget *partial_grouping_target = NULL; AggClauseCosts agg_partial_costs; /* parallel only */ AggClauseCosts agg_final_costs; /* parallel only */ - Size hashaggtablesize; double dNumGroups; - double dNumPartialGroups = 0; bool can_hash; bool can_sort; bool try_parallel_aggregation; bool try_distributed_aggregation; - ListCell *lc; /* For now, do all work in the (GROUP_AGG, NULL) upperrel */ grouped_rel = fetch_upper_rel(root, UPPERREL_GROUP_AGG, NULL); @@ -4041,44 +4058,11 @@ create_grouping_paths(PlannerInfo *root, (gd ? gd->any_hashable : grouping_is_hashable(parse->groupClause))); /* - * If grouped_rel->consider_parallel is true, then paths that we generate - * for this grouping relation could be run inside of a worker, but that - * doesn't mean we can actually use the PartialAggregate/FinalizeAggregate - * execution strategy. Figure that out. 
- */ - if (!grouped_rel->consider_parallel) - { - /* Not even parallel-safe. */ - try_parallel_aggregation = false; - } - else if (input_rel->partial_pathlist == NIL) - { - /* Nothing to use as input for partial aggregate. */ - try_parallel_aggregation = false; - } - else if (!parse->hasAggs && parse->groupClause == NIL) - { - /* - * We don't know how to do parallel aggregation unless we have either - * some aggregates or a grouping clause. + * Figure out whether a PartialAggregate/Finalize Aggregate execution + * strategy is viable. */ - try_parallel_aggregation = false; - } - else if (parse->groupingSets) - { - /* We don't know how to do grouping sets in parallel. */ - try_parallel_aggregation = false; - } - else if (agg_costs->hasNonPartial || agg_costs->hasNonSerial) - { - /* Insufficient support for partial mode. */ - try_parallel_aggregation = false; - } - else - { - /* Everything looks good. */ - try_parallel_aggregation = true; - } + try_parallel_aggregation = can_parallel_agg(root, input_rel, grouped_rel, + agg_costs); /* * The distributed aggregation however works even if there are no partial @@ -4133,8 +4117,6 @@ create_grouping_paths(PlannerInfo *root, */ if (try_parallel_aggregation) { - Path *cheapest_partial_path = linitial(input_rel->partial_pathlist); - /* * Build target list for partial aggregate paths. These paths cannot * just emit the same tlist as regular aggregate paths, because (1) we @@ -4144,11 +4126,6 @@ create_grouping_paths(PlannerInfo *root, */ partial_grouping_target = make_partial_grouping_target(root, target); - /* Estimate number of partial groups. */ - dNumPartialGroups = get_number_of_groups(root, - cheapest_partial_path->rows, - gd); - /* * Collect statistics about aggregates for estimating costs of * performing aggregation in parallel. @@ -4171,97 +4148,11 @@ create_grouping_paths(PlannerInfo *root, &agg_final_costs); } - if (can_sort) - { - /* This was checked before setting try_parallel_aggregation */ - Assert(parse->hasAggs || parse->groupClause); - - /* - * Use any available suitably-sorted path as input, and also - * consider sorting the cheapest partial path. - */ - foreach(lc, input_rel->partial_pathlist) - { - Path *path = (Path *) lfirst(lc); - bool is_sorted; - - is_sorted = pathkeys_contained_in(root->group_pathkeys, - path->pathkeys); - if (path == cheapest_partial_path || is_sorted) - { - /* Sort the cheapest partial path, if it isn't already */ - if (!is_sorted) - path = (Path *) create_sort_path(root, - grouped_rel, - path, - root->group_pathkeys, - -1.0); - - if (parse->hasAggs) - add_partial_path(grouped_rel, (Path *) - create_agg_path(root, - grouped_rel, - path, - partial_grouping_target, - parse->groupClause ? AGG_SORTED : AGG_PLAIN, - AGGSPLIT_INITIAL_SERIAL, - parse->groupClause, - NIL, - &agg_partial_costs, - dNumPartialGroups)); - else - add_partial_path(grouped_rel, (Path *) - create_group_path(root, - grouped_rel, - path, - partial_grouping_target, - parse->groupClause, - NIL, - dNumPartialGroups)); - } - } - } - - if (can_hash) - { - /* Checked above */ - Assert(parse->hasAggs || parse->groupClause); - - hashaggtablesize = - estimate_hashagg_tablesize(cheapest_partial_path, - &agg_partial_costs, - dNumPartialGroups); - - /* - * Tentatively produce a partial HashAgg Path, depending on if it - * looks as if the hash table will fit in work_mem. 
- */ -#ifdef __TBASE__ - if (hashaggtablesize < work_mem * 1024L || g_hybrid_hash_agg) -#else - if (hashaggtablesize < work_mem * 1024L) -#endif - { - AggPath *aggpath = (AggPath *) - create_agg_path(root, - grouped_rel, - cheapest_partial_path, + add_partial_paths_to_grouping_rel(root, input_rel, grouped_rel, target, partial_grouping_target, - AGG_HASHED, - AGGSPLIT_INITIAL_SERIAL, - parse->groupClause, - NIL, - &agg_partial_costs, - dNumPartialGroups); -#ifdef __TBASE__ - if (hashaggtablesize >= work_mem * 1024L) - { - aggpath->hybrid = true; - } -#endif - add_partial_path(grouped_rel, (Path *)aggpath); - } - } + &agg_partial_costs, &agg_final_costs, + gd, can_sort, can_hash, + (List *) parse->havingQual); } #ifdef __TBASE__ else @@ -4300,6 +4191,67 @@ create_grouping_paths(PlannerInfo *root, */ /* Build final grouping paths */ + add_paths_to_grouping_rel(root, input_rel, grouped_rel, target, + partial_grouping_target, agg_costs, + &agg_final_costs, gd, can_sort, can_hash, + dNumGroups, (List *) parse->havingQual); + + /* Generate XL aggregate paths, with distributed 2-phase aggregation. */ + + /* + * If there were no partial paths, we did not initialize any of the + * partial paths above. If that's the case, initialize here. + * + * XXX The reason why the initialization block at the beginning is not + * simply performed unconditionally is that we may skip it if we've been + * successful in fully pushing down any of the aggregates, and entirely + * skip generating the XL paths. + * + * XXX Can we simply use the same estimates as regular partial aggregates, + * or do we need to invent something else? It might be a better idea to + * use estimates for the whole result here (e.g. total number of groups) + * instead of the partial ones. Underestimates often have more severe + * consequences (e.g. OOM with HashAggregate) than overestimates, so this + * seems like a more defensive approach. + * + * XXX After thinking a bit more about the estimation, it may depend on + * pushdown - if the aggregate is fully pushed down (as above, we can + * probably use dNumGroups/numberOfNodes as a cardinality estimate, as + * we know the per-node groupings won't overlap. But here we need to be + * more careful. + */ + if (try_distributed_aggregation) + { + partial_grouping_target = make_partial_grouping_target(root, target); + + /* Estimate number of partial groups. */ + dNumPartialGroups = get_number_of_groups(root, + cheapest_path->rows, + gd); + + /* + * Collect statistics about aggregates for estimating costs of + * performing aggregation in parallel. + */ + MemSet(&agg_partial_costs, 0, sizeof(AggClauseCosts)); + MemSet(&agg_final_costs, 0, sizeof(AggClauseCosts)); + if (parse->hasAggs) + { + /* partial phase */ + get_agg_clause_costs(root, (Node *) partial_grouping_target->exprs, + AGGSPLIT_INITIAL_SERIAL, + &agg_partial_costs); + + /* final phase */ + get_agg_clause_costs(root, (Node *) target->exprs, + AGGSPLIT_FINAL_DESERIAL, + &agg_final_costs); + get_agg_clause_costs(root, parse->havingQual, + AGGSPLIT_FINAL_DESERIAL, + &agg_final_costs); + } + + /* Build final XL grouping paths */ if (can_sort) { /* @@ -4324,115 +4276,119 @@ create_grouping_paths(PlannerInfo *root, */ if (path == cheapest_path || is_sorted) { -#ifdef __TBASE__ - bool try_redistribute_grouping = false; - double dNumLocalGroups; - PathTarget * local_grouping_target = make_partial_grouping_target(root, target); - - grouped_rel->reltarget = local_grouping_target; - - /* Estimate number of partial groups. 
*/ - dNumLocalGroups = get_number_of_groups(root, - cheapest_path->rows, - gd); -#endif - -#ifdef __TBASE__ - if (olap_optimizer && !has_cold_hot_table) - { - if (!is_sorted && !agg_costs->hasOnlyDistinct) + /* + * We can't really beat paths that we managed to fully push + * down above, so we can skip them entirely. + * + * XXX Not constructing any paths, so we can do this before + * adding the Sort path. + */ + if (can_push_down_grouping(root, parse, path)) + continue; + + /* Sort the cheapest-total path if it isn't already sorted */ + if (!is_sorted) path = (Path *) create_sort_path(root, grouped_rel, path, root->group_pathkeys, -1.0); - } - else - { + + /* Now decide what to stick atop it */ + if (parse->groupingSets) + { + /* + * TODO 2-phase aggregation for grouping sets paths not + * supported yet, but this the place where such paths + * should be constructed. + */ + } + else if (parse->hasAggs) + { + /* + * We have aggregation, possibly with plain GROUP BY. Make + * an AggPath. + */ + + path = (Path *) create_agg_path(root, + grouped_rel, + path, + partial_grouping_target, + parse->groupClause ? AGG_SORTED : AGG_PLAIN, + AGGSPLIT_INITIAL_SERIAL, + parse->groupClause, + NIL, + &agg_partial_costs, + dNumPartialGroups); + +#ifdef __TBASE__ + if (olap_optimizer && !has_cold_hot_table) + { + /* redistribute local grouping results among datanodes */ + path = create_redistribute_grouping_path(root, parse, path); + } + else + { + path = create_remotesubplan_path(root, path, NULL); + } +#else + path = create_remotesubplan_path(root, path, NULL); #endif - /* Sort the cheapest-total path if it isn't already sorted */ - if (!is_sorted) - path = (Path *) create_sort_path(root, + +#ifdef __TBASE__ + if (parse->groupClause && olap_optimizer && !has_cold_hot_table && + (!is_sorted || root->group_pathkeys)) + { + path = (Path *) create_sort_path(root, grouped_rel, path, root->group_pathkeys, -1.0); -#ifdef __TBASE__ - } + } #endif - /* - * If the grouping can't be fully pushed down, redistribute the - * path on top of the (sorted) path. If if can be pushed down, - * disable construction of complex distributed paths. - */ - if (! can_push_down_grouping(root, parse, path)) -#ifdef __TBASE__ - { - /* some special aggs cannot be parallel executed, such as count(distinct) */ - if(agg_costs->hasNonPartial || agg_costs->hasNonSerial || - parse->groupingSets || path->pathtype == T_Agg || - path->pathtype == T_Group || !olap_optimizer || has_cold_hot_table) - { - if (agg_costs->hasOnlyDistinct && olap_optimizer && !parse->groupingSets - && !has_cold_hot_table) - path = create_redistribute_grouping_path(root, parse, path); - else - path = create_remotesubplan_path(root, path, NULL); - - if (agg_costs->hasOnlyDistinct && olap_optimizer && - !has_cold_hot_table) - { - if (root->group_pathkeys) - { - path = (Path *) create_sort_path(root, + /* + * We generate two paths, differing in the second phase + * implementation (sort and hash). + */ + { + Path *agg_path = (Path *) + create_agg_path(root, grouped_rel, path, - root->group_pathkeys, - -1.0); - } - } - } - else - { - /* - * If the grouping can not be fully pushed down, we adopt another - * strategy instead. - * 1. do grouping on each datanode locally - * 2. re-distribute grouping results among datanodes, then do the - * final grouping - */ + target, + parse->groupClause ? 
AGG_SORTED : AGG_PLAIN, + AGGSPLIT_FINAL_DESERIAL, + parse->groupClause, + (List *) parse->havingQual, + &agg_final_costs, + dNumGroups); - try_redistribute_grouping = true; + //agg_path->parallel_safe = true; - /* step 1 */ - if (parse->groupingSets) - { - /* - * TODO 2-phase aggregation for grouping sets paths not - * supported yet, but this the place where such paths - * should be constructed. - */ + add_path(grouped_rel, agg_path); } - else if (parse->hasAggs) - { - /* - * We have aggregation, possibly with plain GROUP BY. Make - * an AggPath. - */ - path = (Path *) create_agg_path(root, + if (can_hash) + { + Path *agg_path = (Path *) + create_agg_path(root, grouped_rel, path, - local_grouping_target, - parse->groupClause ? AGG_SORTED : AGG_PLAIN, - AGGSPLIT_INITIAL_SERIAL, + target, + AGG_HASHED, + AGGSPLIT_FINAL_DESERIAL, parse->groupClause, - NIL, - &agg_partial_costs, - dNumLocalGroups); - } - else if (parse->groupClause) - { + (List *) parse->havingQual, + &agg_final_costs, + dNumGroups); + + //agg_path->parallel_safe = true; + + add_path(grouped_rel, agg_path); + } + } + else if (parse->groupClause) + { /* * We have GROUP BY without aggregation or grouping sets. * Make a GroupPath. @@ -4440,85 +4396,126 @@ create_grouping_paths(PlannerInfo *root, path = (Path *) create_group_path(root, grouped_rel, path, - local_grouping_target, + partial_grouping_target, parse->groupClause, NIL, - dNumLocalGroups); - } - else - { - /* Other cases should have been handled above */ - Assert(false); - } + dNumPartialGroups); - /* step 2*/ +#ifdef __TBASE__ + if (olap_optimizer && !has_cold_hot_table) + { + /* redistribute local grouping results among datanodes */ path = create_redistribute_grouping_path(root, parse, path); } - } + else + { + path = create_remotesubplan_path(root, path, NULL); + } #else - path = create_remotesubplan_path(root, path, NULL); + path = create_remotesubplan_path(root, path, NULL); #endif - else - try_distributed_aggregation = false; - #ifdef __TBASE__ - if(try_redistribute_grouping) - { - /* - * do final grouping at each datanode - */ - - /* Now decide what to stick atop it */ - if (parse->groupingSets) - { - /* - * TODO 2-phase aggregation for grouping sets paths not - * supported yet, but this the place where such paths - * should be constructed. - */ - } - else if (parse->hasAggs) - { - /* - * We generate two paths, differing in the second phase - * implementation (sort and hash). - */ - Path *remote_path = path; - - if (parse->groupClause) + if (olap_optimizer && !has_cold_hot_table && (!is_sorted || root->group_pathkeys)) { - if (!is_sorted || root->group_pathkeys) - { path = (Path *) create_sort_path(root, grouped_rel, path, root->group_pathkeys, -1.0); - } } - - path = (Path *)create_agg_path(root, +#endif + { + Path *group_path = (Path *) + create_group_path(root, grouped_rel, path, target, - parse->groupClause ? 
AGG_SORTED : AGG_PLAIN, - AGGSPLIT_FINAL_DESERIAL, parse->groupClause, (List *) parse->havingQual, - &agg_final_costs, dNumGroups); - //path->parallel_safe = true; + //group_path->parallel_safe = true; - add_path(grouped_rel, path); + add_path(grouped_rel, group_path); + } - if (can_hash) - { - path = (Path *) + } + else + { + /* Other cases should have been handled above */ + Assert(false); + } + } + } + } + + if (can_hash) + { + hashaggtablesize = estimate_hashagg_tablesize(cheapest_path, + agg_costs, + dNumGroups); + + /* + * Provided that the estimated size of the hashtable does not exceed + * work_mem, we'll generate a HashAgg Path, although if we were unable + * to sort above, then we'd better generate a Path, so that we at + * least have one. + */ +#ifdef __TBASE__ + if (hashaggtablesize < work_mem * 1024L || g_hybrid_hash_agg || + grouped_rel->pathlist == NIL) +#else + if (hashaggtablesize < work_mem * 1024L || + grouped_rel->pathlist == NIL) +#endif + { + /* If the whole aggregate was pushed down, we're done. */ + if (! can_push_down_grouping(root, parse, cheapest_path)) + { + Path *path, *agg_path; + + path = (Path *) create_agg_path(root, + grouped_rel, + cheapest_path, + partial_grouping_target, + AGG_HASHED, + AGGSPLIT_INITIAL_SERIAL, + parse->groupClause, + NIL, + &agg_partial_costs, + dNumPartialGroups); + + /* keep partially aggregated path for the can_sort branch */ + agg_path = path; +#ifdef __TBASE__ + if (hashaggtablesize >= work_mem * 1024L && g_hybrid_hash_agg) + { + AggPath *aggpath = (AggPath *)agg_path; + + aggpath->hybrid = true; + } +#endif + +#ifdef __TBASE__ + if (olap_optimizer && !has_cold_hot_table) + { + /* redistribute local grouping results among datanodes */ + path = create_redistribute_grouping_path(root, parse, path); + } + else + { + path = create_remotesubplan_path(root, path, NULL); + } +#else + path = create_remotesubplan_path(root, path, NULL); +#endif + /* Generate paths with both hash and sort second phase. */ + { + Path *agg_path = (Path *) create_agg_path(root, grouped_rel, - remote_path, + path, target, AGG_HASHED, AGGSPLIT_FINAL_DESERIAL, @@ -4526,3381 +4523,3462 @@ create_grouping_paths(PlannerInfo *root, (List *) parse->havingQual, &agg_final_costs, dNumGroups); - //path->parallel_safe = true; - if (g_hybrid_hash_agg) + + //agg_path->parallel_safe = true; +#ifdef __TBASE__ + if (hashaggtablesize >= work_mem * 1024L && g_hybrid_hash_agg) { - AggPath *agg = (AggPath *)path; - agg->hybrid = true; - } + AggPath *aggpath = (AggPath *)agg_path; - add_path(grouped_rel, path); + aggpath->hybrid = true; } +#endif + add_path(grouped_rel, agg_path); } - else if (parse->groupClause) + + if (can_sort) { - if (!is_sorted || root->group_pathkeys) - { +#ifdef __TBASE__ + if (!olap_optimizer || has_cold_hot_table) +#endif path = (Path *) create_sort_path(root, grouped_rel, - path, + agg_path, root->group_pathkeys, -1.0); - } - - path = (Path *) - create_group_path(root, - grouped_rel, - path, - target, - parse->groupClause, - (List *) parse->havingQual, - dNumGroups); - - //path->parallel_safe = true; - - /* - * We have GROUP BY without aggregation or grouping sets. - * Make a GroupPath. 
- */ - add_path(grouped_rel, path); - } - else - { - /* Other cases should have been handled above */ - Assert(false); - } - } - else - { +#ifdef __TBASE__ + if (olap_optimizer && !has_cold_hot_table) + { + /* redistribute local grouping results among datanodes */ + path = create_redistribute_grouping_path(root, parse, agg_path); + } + else + { + path = create_remotesubplan_path(root, path, NULL); + } +#else + path = create_remotesubplan_path(root, path, NULL); #endif - /* Now decide what to stick atop it */ - if (parse->groupingSets) - { - consider_groupingsets_paths(root, grouped_rel, - path, true, can_hash, target, - gd, agg_costs, dNumGroups); - } - else if (parse->hasAggs) - { #ifdef __TBASE__ - bool parallel_aware = false; - bool parallel_safe = false; - Path *agg_path = NULL; - - if (root->group_pathkeys && olap_optimizer && - !has_cold_hot_table && agg_costs->hasOnlyDistinct) - { - if (!pathkeys_contained_in(root->group_pathkeys, - path->pathkeys)) + if (olap_optimizer && !has_cold_hot_table) { + /* + * AGG_HASHED aggregate paths are always unsorted, so add + * a Sorted node for the final AGG_SORTED step. + */ path = (Path *) create_sort_path(root, grouped_rel, path, root->group_pathkeys, -1.0); } - } - - if (path->pathtype == T_Sort && olap_optimizer && !has_cold_hot_table) - { - SortPath *pathnode = (SortPath *)path; - - if (pathnode->subpath->pathtype == T_Gather || agg_costs->hasOnlyDistinct) - { - path->parallel_aware = true; - parallel_aware = true; - parallel_safe = true; - } - } +#endif - agg_path = (Path *) + path = (Path *) create_agg_path(root, grouped_rel, path, target, parse->groupClause ? AGG_SORTED : AGG_PLAIN, - AGGSPLIT_SIMPLE, + AGGSPLIT_FINAL_DESERIAL, parse->groupClause, (List *) parse->havingQual, - agg_costs, + &agg_final_costs, dNumGroups); - agg_path->parallel_aware = parallel_aware; - agg_path->parallel_safe = parallel_safe; - add_path(grouped_rel, agg_path); -#else - /* - * We have aggregation, possibly with plain GROUP BY. Make - * an AggPath. - */ - add_path(grouped_rel, (Path *) - create_agg_path(root, - grouped_rel, - path, - target, - parse->groupClause ? 
AGG_SORTED : AGG_PLAIN, - AGGSPLIT_SIMPLE, - parse->groupClause, - (List *) parse->havingQual, - agg_costs, - dNumGroups)); -#endif - } - else if (parse->groupClause) - { -#ifdef __TBASE__ - bool parallel_aware = false; - bool parallel_safe = false; - Path *group_path = NULL; + //path->parallel_safe = true; - if (root->group_pathkeys && olap_optimizer && - !has_cold_hot_table && agg_costs->hasOnlyDistinct) - { - if (!pathkeys_contained_in(root->group_pathkeys, - path->pathkeys)) - { - path = (Path *) create_sort_path(root, - grouped_rel, - path, - root->group_pathkeys, - -1.0); - } + add_path(grouped_rel, path); } + } + } + } + } - if (path->pathtype == T_Sort && olap_optimizer && !has_cold_hot_table) - { - SortPath *pathnode = (SortPath *)path; - - if (pathnode->subpath->pathtype == T_Gather) - { - path->parallel_aware = true; - parallel_aware = true; - parallel_safe = true; - } - } - - group_path = (Path *) - create_group_path(root, - grouped_rel, - path, - target, - parse->groupClause, - (List *) parse->havingQual, - dNumGroups); - group_path->parallel_aware = parallel_aware; - group_path->parallel_safe = parallel_safe; - add_path(grouped_rel, group_path); -#else + /* Give a helpful error if we failed to find any implementation */ + if (grouped_rel->pathlist == NIL) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("could not implement GROUP BY"), + errdetail("Some of the datatypes only support hashing, while others only support sorting."))); /* - * We have GROUP BY without aggregation or grouping sets. - * Make a GroupPath. + * If there is an FDW that's responsible for all baserels of the query, + * let it consider adding ForeignPaths. */ - add_path(grouped_rel, (Path *) - create_group_path(root, - grouped_rel, - path, - target, - parse->groupClause, - (List *) parse->havingQual, - dNumGroups)); -#endif - } - else - { - /* Other cases should have been handled above */ - Assert(false); - } -#ifdef __TBASE__ - } -#endif - } - } - + if (grouped_rel->fdwroutine && + grouped_rel->fdwroutine->GetForeignUpperPaths) + grouped_rel->fdwroutine->GetForeignUpperPaths(root, UPPERREL_GROUP_AGG, + input_rel, grouped_rel); + + /* Let extensions possibly add some more paths */ + if (create_upper_paths_hook) + (*create_upper_paths_hook) (root, UPPERREL_GROUP_AGG, + input_rel, grouped_rel); + + /* Now choose the best path(s) */ + set_cheapest(grouped_rel); /* - * Now generate a complete GroupAgg Path atop of the cheapest partial - * path. We can do this using either Gather or Gather Merge. + * We've been using the partial pathlist for the grouped relation to hold + * partially aggregated paths, but that's actually a little bit bogus + * because it's unsafe for later planning stages -- like ordered_rel --- + * to get the idea that they can use these partial paths as if they didn't + * need a FinalizeAggregate step. Zap the partial pathlist at this stage + * so we don't get confused. 
*/ - if (grouped_rel->partial_pathlist) - { -#ifdef __TBASE__ - bool redistribute_group PG_USED_FOR_ASSERTS_ONLY = false; -#endif - Path *path = (Path *) linitial(grouped_rel->partial_pathlist); -#ifdef __TBASE__ - double total_groups = 0; + grouped_rel->partial_pathlist = NIL; - if (olap_optimizer && !has_cold_hot_table) - { - total_groups = path->rows; - } - else - total_groups = path->rows * path->parallel_workers; -#else - double total_groups = path->rows * path->parallel_workers; -#endif - path = (Path *) create_gather_path(root, - grouped_rel, - path, - partial_grouping_target, - NULL, - &total_groups); - /* - * Since Gather's output is always unsorted, we'll need to sort, - * unless there's no GROUP BY clause or a degenerate (constant) - * one, in which case there will only be a single group. - */ -#ifdef __TBASE__ - if (!olap_optimizer || has_cold_hot_table) - { -#endif - if (root->group_pathkeys) - path = (Path *) create_sort_path(root, - grouped_rel, - path, - root->group_pathkeys, - -1.0); -#ifdef __TBASE__ - } -#endif - /* - * If the grouping can't be fully pushed down, we'll push down the - * first phase of the aggregate, and redistribute only the partial - * results. - * - * If if can be pushed down, disable construction of complex - * distributed paths. - * - * XXX Keep this after the Sort node, to make the path sorted. - */ - if (! can_push_down_grouping(root, parse, path)) -#ifdef __TBASE__ - { - if (olap_optimizer && !has_cold_hot_table) - { - /* redistribute local grouping results among datanodes */ - path = create_redistribute_grouping_path(root, parse, path); - redistribute_group = true; - } - else - path = create_remotesubplan_path(root, path, NULL); - } -#else - path = create_remotesubplan_path(root, path, NULL); -#endif + return grouped_rel; +} - else - try_distributed_aggregation = false; -#ifdef __TBASE__ +/* + * For a given input path, consider the possible ways of doing grouping sets on + * it, by combinations of hashing and sorting. This can be called multiple + * times, so it's important that it not scribble on input. No result is + * returned, but any generated paths are added to grouped_rel. + */ +static void +consider_groupingsets_paths(PlannerInfo *root, + RelOptInfo *grouped_rel, + Path *path, + bool is_sorted, + bool can_hash, + PathTarget *target, + grouping_sets_data *gd, + const AggClauseCosts *agg_costs, + double dNumGroups) +{ + Query *parse = root->parse; + /* - * Since Gather's output is always unsorted, we'll need to sort, - * unless there's no GROUP BY clause or a degenerate (constant) - * one, in which case there will only be a single group. + * If we're not being offered sorted input, then only consider plans that + * can be done entirely by hashing. + * + * We can hash everything if it looks like it'll fit in work_mem. But if + * the input is actually sorted despite not being advertised as such, we + * prefer to make use of that in order to use less memory. + * + * If none of the grouping sets are sortable, then ignore the work_mem + * limit and generate a path anyway, since otherwise we'll just fail. */ - if (olap_optimizer && !has_cold_hot_table) - { - if (root->group_pathkeys) - { - path = (Path *) create_sort_path(root, - grouped_rel, - path, - root->group_pathkeys, - -1.0); - path->parallel_aware = true; - } - } -#endif - if (parse->hasAggs) - { - Path *agg_path = NULL; - - agg_path = (Path *) - create_agg_path(root, - grouped_rel, - path, - target, - parse->groupClause ? 
AGG_SORTED : AGG_PLAIN, - AGGSPLIT_FINAL_DESERIAL, - parse->groupClause, - (List *) parse->havingQual, - &agg_final_costs, - dNumGroups); - - if (olap_optimizer && !has_cold_hot_table) + if (!is_sorted) { - agg_path->parallel_safe = true; - agg_path->parallel_aware = true; - } - - add_path(grouped_rel, agg_path); - } - else - { - Path *group_path = NULL; - - group_path = (Path *) - create_group_path(root, - grouped_rel, - path, - target, - parse->groupClause, - (List *) parse->havingQual, - dNumGroups); - - if (olap_optimizer && !has_cold_hot_table) + List *new_rollups = NIL; + RollupData *unhashed_rollup = NULL; + List *sets_data; + List *empty_sets_data = NIL; + List *empty_sets = NIL; + ListCell *lc; + ListCell *l_start = list_head(gd->rollups); + AggStrategy strat = AGG_HASHED; + Size hashsize; + double exclude_groups = 0.0; + + Assert(can_hash); + + if (pathkeys_contained_in(root->group_pathkeys, path->pathkeys)) { - group_path->parallel_safe = true; - group_path->parallel_aware = true; + unhashed_rollup = lfirst(l_start); + exclude_groups = unhashed_rollup->numGroups; + l_start = lnext(l_start); } - add_path(grouped_rel, group_path); - } + hashsize = estimate_hashagg_tablesize(path, + agg_costs, + dNumGroups - exclude_groups); /* - * The point of using Gather Merge rather than Gather is that it - * can preserve the ordering of the input path, so there's no - * reason to try it unless (1) it's possible to produce more than - * one output row and (2) we want the output path to be ordered. + * gd->rollups is empty if we have only unsortable columns to work + * with. Override work_mem in that case; otherwise, we'll rely on the + * sorted-input case to generate usable mixed paths. */ - if (parse->groupClause != NIL && root->group_pathkeys != NIL) - { - foreach(lc, grouped_rel->partial_pathlist) - { - Path *subpath = (Path *) lfirst(lc); - Path *gmpath; - double total_groups; + if (hashsize > work_mem * 1024L && gd->rollups) + return; /* nope, won't fit */ /* - * It's useful to consider paths that are already properly - * ordered for Gather Merge, because those don't need a - * sort. It's also useful to consider the cheapest path, - * because sorting it in parallel and then doing Gather - * Merge may be better than doing an unordered Gather - * followed by a sort. But there's no point in - * considering non-cheapest paths that aren't already - * sorted correctly. + * We need to burst the existing rollups list into individual grouping + * sets and recompute a groupClause for each set. */ - if (path != subpath && - !pathkeys_contained_in(root->group_pathkeys, - subpath->pathkeys)) - continue; + sets_data = list_copy(gd->unsortable_sets); -#ifdef __TBASE__ - if (olap_optimizer && !has_cold_hot_table) - total_groups = subpath->rows; - else -#endif - total_groups = subpath->rows * subpath->parallel_workers; - - -#ifdef __TBASE__ - if (olap_optimizer && !has_cold_hot_table) + for_each_cell(lc, l_start) { - gmpath = (Path *) create_gather_path(root, - grouped_rel, - subpath, - partial_grouping_target, - NULL, - &total_groups); - } - else -#endif - gmpath = (Path *) - create_gather_merge_path(root, - grouped_rel, - subpath, - partial_grouping_target, - root->group_pathkeys, - NULL, - &total_groups); + RollupData *rollup = lfirst(lc); /* - * If the grouping can't be fully pushed down, we'll push down the - * first phase of the aggregate, and redistribute only the partial - * results. 
+ * If we find an unhashable rollup that's not been skipped by the + * "actually sorted" check above, we can't cope; we'd need sorted + * input (with a different sort order) but we can't get that here. + * So bail out; we'll get a valid path from the is_sorted case + * instead. + * + * The mere presence of empty grouping sets doesn't make a rollup + * unhashable (see preprocess_grouping_sets), we handle those + * specially below. */ - redistribute_group = false; - - if (! can_push_down_grouping(root, parse, gmpath)) -#ifdef __TBASE__ + if (!rollup->hashable) + return; + else + sets_data = list_concat(sets_data, list_copy(rollup->gsets_data)); + } + foreach(lc, sets_data) { - if (olap_optimizer && !has_cold_hot_table) + GroupingSetData *gs = lfirst(lc); + List *gset = gs->set; + RollupData *rollup; + + if (gset == NIL) { - /* redistribute local grouping results among datanodes */ - gmpath = create_redistribute_grouping_path(root, parse, gmpath); - redistribute_group = true; + /* Empty grouping sets can't be hashed. */ + empty_sets_data = lappend(empty_sets_data, gs); + empty_sets = lappend(empty_sets, NIL); } else { - gmpath = create_remotesubplan_path(root, gmpath, NULL); + rollup = makeNode(RollupData); + + rollup->groupClause = preprocess_groupclause(root, gset); + rollup->gsets_data = list_make1(gs); + rollup->gsets = remap_to_groupclause_idx(rollup->groupClause, + rollup->gsets_data, + gd->tleref_to_colnum_map); + rollup->numGroups = gs->numGroups; + rollup->hashable = true; + rollup->is_hashed = true; + new_rollups = lappend(new_rollups, rollup); } } -#else - gmpath = create_remotesubplan_path(root, gmpath, NULL); -#endif -#ifdef __TBASE__ /* - * Since Gather's output is always unsorted, we'll need to sort, - * unless there's no GROUP BY clause or a degenerate (constant) - * one, in which case there will only be a single group. + * If we didn't find anything nonempty to hash, then bail. We'll + * generate a path from the is_sorted case. */ - if (olap_optimizer && !has_cold_hot_table) + if (new_rollups == NIL) + return; + + /* + * If there were empty grouping sets they should have been in the + * first rollup. + */ + Assert(!unhashed_rollup || !empty_sets); + + if (unhashed_rollup) { - if (root->group_pathkeys) + new_rollups = lappend(new_rollups, unhashed_rollup); + strat = AGG_MIXED; + } + else if (empty_sets) { - gmpath = (Path *) create_sort_path(root, - grouped_rel, - gmpath, - root->group_pathkeys, - -1.0); - - gmpath->parallel_aware = true; - } + RollupData *rollup = makeNode(RollupData); + + rollup->groupClause = NIL; + rollup->gsets_data = empty_sets_data; + rollup->gsets = empty_sets; + rollup->numGroups = list_length(empty_sets); + rollup->hashable = false; + rollup->is_hashed = false; + new_rollups = lappend(new_rollups, rollup); + strat = AGG_MIXED; } -#endif - if (parse->hasAggs) - { - Path *agg_path = NULL; - agg_path = (Path *) - create_agg_path(root, + /* + * If the grouping can't be fully pushed down, redistribute the + * path on top of the (sorted) path. If if can be pushed down, + * disable construction of complex distributed paths. + */ + if (! can_push_down_grouping(root, parse, path)) + path = create_remotesubplan_path(root, path, NULL); + + add_path(grouped_rel, (Path *) + create_groupingsets_path(root, grouped_rel, - gmpath, + path, target, - parse->groupClause ? 
AGG_SORTED : AGG_PLAIN, - AGGSPLIT_FINAL_DESERIAL, - parse->groupClause, (List *) parse->havingQual, - &agg_final_costs, - dNumGroups); - if (olap_optimizer && !has_cold_hot_table) - { - agg_path->parallel_safe = true; - agg_path->parallel_aware = true; + strat, + new_rollups, + agg_costs, + dNumGroups)); + return; } - add_path(grouped_rel, agg_path); - } - else + /* + * If we have sorted input but nothing we can do with it, bail. + */ + if (list_length(gd->rollups) == 0) + return; + + /* + * Given sorted input, we try and make two paths: one sorted and one mixed + * sort/hash. (We need to try both because hashagg might be disabled, or + * some columns might not be sortable.) + * + * can_hash is passed in as false if some obstacle elsewhere (such as + * ordered aggs) means that we shouldn't consider hashing at all. + */ + if (can_hash && gd->any_hashable) { - Path *group_path = NULL; - - group_path = (Path *) - create_group_path(root, - grouped_rel, - gmpath, - target, - parse->groupClause, - (List *) parse->havingQual, - dNumGroups); - - if (olap_optimizer && !has_cold_hot_table) + List *rollups = NIL; + List *hash_sets = list_copy(gd->unsortable_sets); + double availspace = (work_mem * 1024.0); + ListCell *lc; + + /* + * Account first for space needed for groups we can't sort at all. + */ + availspace -= (double) estimate_hashagg_tablesize(path, + agg_costs, + gd->dNumHashGroups); + + if (availspace > 0 && list_length(gd->rollups) > 1) { - group_path->parallel_safe = true; - group_path->parallel_aware = true; - } + double scale; + int num_rollups = list_length(gd->rollups); + int k_capacity; + int *k_weights = palloc(num_rollups * sizeof(int)); + Bitmapset *hash_items = NULL; + int i; - add_path(grouped_rel, group_path); - } - } - } - } - } + /* + * We treat this as a knapsack problem: the knapsack capacity + * represents work_mem, the item weights are the estimated memory + * usage of the hashtables needed to implement a single rollup, + * and we really ought to use the cost saving as the item value; + * however, currently the costs assigned to sort nodes don't + * reflect the comparison costs well, and so we treat all items as + * of equal value (each rollup we hash instead saves us one sort). + * + * To use the discrete knapsack, we need to scale the values to a + * reasonably small bounded range. We choose to allow a 5% error + * margin; we have no more than 4096 rollups in the worst possible + * case, which with a 5% error margin will require a bit over 42MB + * of workspace. (Anyone wanting to plan queries that complex had + * better have the memory for it. In more reasonable cases, with + * no more than a couple of dozen rollups, the memory usage will + * be negligible.) + * + * k_capacity is naturally bounded, but we clamp the values for + * scale and weight (below) to avoid overflows or underflows (or + * uselessly trying to use a scale factor less than 1 byte). + */ + scale = Max(availspace / (20.0 * num_rollups), 1.0); + k_capacity = (int) floor(availspace / scale); - if (can_hash) - { - if (parse->groupingSets) - { /* - * Try for a hash-only groupingsets path over unsorted input. + * We leave the first rollup out of consideration since it's the + * one that matches the input sort order. We assign indexes "i" + * to only those entries considered for hashing; the second loop, + * below, must use the same condition. 
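
The comment above frames the choice of which rollups to hash as a discrete knapsack problem: the capacity is work_mem, each item's weight is the estimated hashtable size of one rollup, every item has equal value (each hashed rollup saves one sort), and the sizes are scaled down so the dynamic-programming table stays small. Below is a minimal standalone sketch of that idea, assuming invented per-rollup size estimates, an invented work_mem, and a 4096-unit capacity; the planner itself delegates to estimate_hashagg_tablesize() and DiscreteKnapsack() rather than the toy DP shown here.

    /*
     * Standalone illustration (not PostgreSQL code): pick which rollups to
     * hash so that their combined hashtable size fits within "work_mem",
     * maximizing the number of sorts saved (one per hashed rollup).
     * All sizes and the work_mem value are invented for the example.
     */
    #include <stdio.h>
    #include <string.h>

    #define MAX_CAP 4096        /* capacity after scaling, kept small */

    int
    main(void)
    {
        double  work_mem_bytes = 4.0 * 1024 * 1024;              /* pretend 4MB */
        double  est_hash_bytes[] = { 1.5e6, 2.8e6, 0.7e6, 3.9e6 };  /* per rollup */
        int     n = 4;
        double  scale;
        int     capacity;
        int     weights[4];
        int     i, c;
        static int best[MAX_CAP + 1];   /* best[c] = max rollups hashed within c */

        /* Scale sizes so the DP table stays small (mirrors the 5% error idea). */
        scale = work_mem_bytes / MAX_CAP;
        if (scale < 1.0)
            scale = 1.0;
        capacity = (int) (work_mem_bytes / scale);

        for (i = 0; i < n; i++)
            weights[i] = (int) (est_hash_bytes[i] / scale) + 1;

        /* 0/1 knapsack, every item worth 1 (each hashed rollup saves one sort). */
        memset(best, 0, sizeof(best));
        for (i = 0; i < n; i++)
            for (c = capacity; c >= weights[i]; c--)
                if (best[c - weights[i]] + 1 > best[c])
                    best[c] = best[c - weights[i]] + 1;

        printf("capacity=%d units, best=%d rollups hashed\n",
               capacity, best[capacity]);
        return 0;
    }

With these made-up numbers only two of the four rollups fit, so two sorts are saved and the remaining rollups fall back to the sorted path.
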
*/ - consider_groupingsets_paths(root, grouped_rel, - cheapest_path, false, true, target, - gd, agg_costs, dNumGroups); - } - else - { - hashaggtablesize = estimate_hashagg_tablesize(cheapest_path, + i = 0; + for_each_cell(lc, lnext(list_head(gd->rollups))) + { + RollupData *rollup = lfirst(lc); + + if (rollup->hashable) + { + double sz = estimate_hashagg_tablesize(path, agg_costs, - dNumGroups); + rollup->numGroups); - /* - * Provided that the estimated size of the hashtable does not - * exceed work_mem, we'll generate a HashAgg Path, although if we - * were unable to sort above, then we'd better generate a Path, so - * that we at least have one. - */ -#ifdef __TBASE__ - if (hashaggtablesize < work_mem * 1024L || g_hybrid_hash_agg || - grouped_rel->pathlist == NIL) -#else - if (hashaggtablesize < work_mem * 1024L || - grouped_rel->pathlist == NIL) -#endif - { - /* Don't mess with the cheapest path directly. */ - Path *path = cheapest_path; -#ifdef __TBASE__ - bool try_redistribute_grouping = false; -#endif + /* + * If sz is enormous, but work_mem (and hence scale) is + * small, avoid integer overflow here. + */ + k_weights[i] = (int) Min(floor(sz / scale), + k_capacity + 1.0); + ++i; + } + } /* - * If the grouping can't be fully pushed down, we'll push down the - * first phase of the aggregate, and redistribute only the partial - * results. - * - * If if can be pushed down, disable construction of complex - * distributed paths. + * Apply knapsack algorithm; compute the set of items which + * maximizes the value stored (in this case the number of sorts + * saved) while keeping the total size (approximately) within + * capacity. */ - if (! can_push_down_grouping(root, parse, path)) -#ifdef XCP + if (i > 0) + hash_items = DiscreteKnapsack(k_capacity, i, k_weights, NULL); + + if (!bms_is_empty(hash_items)) + { + rollups = list_make1(linitial(gd->rollups)); + + i = 0; + for_each_cell(lc, lnext(list_head(gd->rollups))) { - /* some special aggs cannot be parallel executed, such as count(distinct) */ - if(agg_costs->hasNonPartial || agg_costs->hasNonSerial || - path->pathtype == T_Agg || path->pathtype == T_Group || - !olap_optimizer || has_cold_hot_table) - { - if (agg_costs->hasOnlyDistinct && olap_optimizer && !has_cold_hot_table) - path = create_redistribute_grouping_path(root, parse, path); + RollupData *rollup = lfirst(lc); + + if (rollup->hashable) + { + if (bms_is_member(i, hash_items)) + hash_sets = list_concat(hash_sets, + list_copy(rollup->gsets_data)); else - path = create_remotesubplan_path(root, path, NULL); + rollups = lappend(rollups, rollup); + ++i; } else - { - /* - * If the grouping can not be fully pushed down, we adopt another - * strategy instead. - * 1. do grouping on each datanode locally - * 2. re-distribute grouping results among datanodes, then do the - * final grouping - */ - AggClauseCosts hashagg_partial_costs; - PathTarget * local_grouping_target = make_partial_grouping_target(root, target); + rollups = lappend(rollups, rollup); + } + } + } - /* Estimate number of partial groups. 
*/ - double dNumLocalGroups = get_number_of_groups(root, - cheapest_path->rows, - gd); - try_redistribute_grouping = true; + if (!rollups && hash_sets) + rollups = list_copy(gd->rollups); - MemSet(&hashagg_partial_costs, 0, sizeof(AggClauseCosts)); + foreach(lc, hash_sets) + { + GroupingSetData *gs = lfirst(lc); + RollupData *rollup = makeNode(RollupData); - get_agg_clause_costs(root, (Node *) local_grouping_target->exprs, - AGGSPLIT_INITIAL_SERIAL, - &hashagg_partial_costs); + Assert(gs->set != NIL); - /* step 1 */ - path = (Path *) create_agg_path(root, - grouped_rel, - cheapest_path, - local_grouping_target, - AGG_HASHED, - AGGSPLIT_INITIAL_SERIAL, - parse->groupClause, - NIL, - &hashagg_partial_costs, - dNumLocalGroups); + rollup->groupClause = preprocess_groupclause(root, gs->set); + rollup->gsets_data = list_make1(gs); + rollup->gsets = remap_to_groupclause_idx(rollup->groupClause, + rollup->gsets_data, + gd->tleref_to_colnum_map); + rollup->numGroups = gs->numGroups; + rollup->hashable = true; + rollup->is_hashed = true; + rollups = lcons(rollup, rollups); + } -#ifdef __TBASE__ - if (hashaggtablesize >= work_mem * 1024L && g_hybrid_hash_agg) + if (rollups) { - AggPath *aggpath = (AggPath *)path; - - aggpath->hybrid = true; - } -#endif - - /* step 2 */ - path = create_redistribute_grouping_path(root, parse, path); + /* + * If the grouping can't be fully pushed down, redistribute the + * path on top of the (sorted) path. If if can be pushed down, + * disable construction of complex distributed paths. + */ + if (! can_push_down_grouping(root, parse, path)) + path = create_remotesubplan_path(root, path, NULL); + + add_path(grouped_rel, (Path *) + create_groupingsets_path(root, + grouped_rel, + path, + target, + (List *) parse->havingQual, + AGG_MIXED, + rollups, + agg_costs, + dNumGroups)); } } -#else - path = create_remotesubplan_path(root, path, NULL); -#endif - else - try_distributed_aggregation = false; /* - * We just need an Agg over the cheapest-total input path, - * since input order won't matter. + * Now try the simple sorted case. */ -#ifdef __TBASE__ - if(try_redistribute_grouping) + if (!gd->unsortable_sets) { - AggClauseCosts hashagg_final_costs; - Path *agg_path; - - MemSet(&hashagg_final_costs, 0, sizeof(AggClauseCosts)); - - get_agg_clause_costs(root, (Node *) target->exprs, - AGGSPLIT_FINAL_DESERIAL, - &hashagg_final_costs); - get_agg_clause_costs(root, parse->havingQual, - AGGSPLIT_FINAL_DESERIAL, - &hashagg_final_costs); + /* + * If the grouping can't be fully pushed down, redistribute the + * path on top of the (sorted) path. If if can be pushed down, + * disable construction of complex distributed paths. + */ + if (! 
can_push_down_grouping(root, parse, path)) + path = create_remotesubplan_path(root, path, NULL); - agg_path = (Path *) - create_agg_path(root, + add_path(grouped_rel, (Path *) + create_groupingsets_path(root, grouped_rel, path, target, - AGG_HASHED, - AGGSPLIT_FINAL_DESERIAL, - parse->groupClause, (List *) parse->havingQual, - &hashagg_final_costs, - dNumGroups); -#ifdef __TBASE__ - if (hashaggtablesize >= work_mem * 1024L && g_hybrid_hash_agg) - { - AggPath *aggpath = (AggPath *)agg_path; - - aggpath->hybrid = true; + AGG_SORTED, + gd->rollups, + agg_costs, + dNumGroups)); } -#endif - //agg_path->parallel_safe = true; - - add_path(grouped_rel, agg_path); } - else - { - bool parallel_aware = false; - bool parallel_safe = false; - Path *agg_path = NULL; - if ((path->pathtype == T_Gather || agg_costs->hasOnlyDistinct) && olap_optimizer - && !has_cold_hot_table) +/* + * create_window_paths + * + * Build a new upperrel containing Paths for window-function evaluation. + * + * input_rel: contains the source-data Paths + * input_target: result of make_window_input_target + * output_target: what the topmost WindowAggPath should return + * tlist: query's target list (needed to look up pathkeys) + * wflists: result of find_window_functions + * activeWindows: result of select_active_windows + * + * Note: all Paths in input_rel are expected to return input_target. + */ +static RelOptInfo * +create_window_paths(PlannerInfo *root, + RelOptInfo *input_rel, + PathTarget *input_target, + PathTarget *output_target, + List *tlist, + WindowFuncLists *wflists, + List *activeWindows) { - parallel_safe = true; - parallel_aware = true; - } + RelOptInfo *window_rel; + ListCell *lc; - agg_path = (Path *) - create_agg_path(root, grouped_rel, - path, - target, - AGG_HASHED, - AGGSPLIT_SIMPLE, - parse->groupClause, - (List *) parse->havingQual, - agg_costs, - dNumGroups); - agg_path->parallel_aware = parallel_aware; - agg_path->parallel_safe = parallel_safe; -#ifdef __TBASE__ - if (hashaggtablesize >= work_mem * 1024L && g_hybrid_hash_agg) + /* For now, do all work in the (WINDOW, NULL) upperrel */ + window_rel = fetch_upper_rel(root, UPPERREL_WINDOW, NULL); + + /* + * If the input relation is not parallel-safe, then the window relation + * can't be parallel-safe, either. Otherwise, we need to examine the + * target list and active windows for non-parallel-safe constructs. + */ + if (input_rel->consider_parallel && + is_parallel_safe(root, (Node *) output_target->exprs) && + is_parallel_safe(root, (Node *) activeWindows)) + window_rel->consider_parallel = true; + + /* + * If the input rel belongs to a single FDW, so does the window rel. + */ + window_rel->serverid = input_rel->serverid; + window_rel->userid = input_rel->userid; + window_rel->useridiscurrent = input_rel->useridiscurrent; + window_rel->fdwroutine = input_rel->fdwroutine; + + /* + * Consider computing window functions starting from the existing + * cheapest-total path (which will likely require a sort) as well as any + * existing paths that satisfy root->window_pathkeys (which won't). 
+ */ + foreach(lc, input_rel->pathlist) { - AggPath *aggpath = (AggPath *)agg_path; + Path *path = (Path *) lfirst(lc); - aggpath->hybrid = true; - } -#endif - add_path(grouped_rel, agg_path); - } -#else - add_path(grouped_rel, (Path *) - create_agg_path(root, grouped_rel, + if (path == input_rel->cheapest_total_path || + pathkeys_contained_in(root->window_pathkeys, path->pathkeys)) + create_one_window_path(root, + window_rel, path, - target, - AGG_HASHED, - AGGSPLIT_SIMPLE, - parse->groupClause, - (List *) parse->havingQual, - agg_costs, - dNumGroups)); -#endif + input_target, + output_target, + tlist, + wflists, + activeWindows); } + + /* + * If there is an FDW that's responsible for all baserels of the query, + * let it consider adding ForeignPaths. + */ + if (window_rel->fdwroutine && + window_rel->fdwroutine->GetForeignUpperPaths) + window_rel->fdwroutine->GetForeignUpperPaths(root, UPPERREL_WINDOW, + input_rel, window_rel); + + /* Let extensions possibly add some more paths */ + if (create_upper_paths_hook) + (*create_upper_paths_hook) (root, UPPERREL_WINDOW, + input_rel, window_rel); + + /* Now choose the best path(s) */ + set_cheapest(window_rel); + + return window_rel; } /* - * Generate a HashAgg Path atop of the cheapest partial path. Once - * again, we'll only do this if it looks as though the hash table - * won't exceed work_mem. + * Stack window-function implementation steps atop the given Path, and + * add the result to window_rel. + * + * window_rel: upperrel to contain result + * path: input Path to use (must return input_target) + * input_target: result of make_window_input_target + * output_target: what the topmost WindowAggPath should return + * tlist: query's target list (needed to look up pathkeys) + * wflists: result of find_window_functions + * activeWindows: result of select_active_windows */ - if (grouped_rel->partial_pathlist) +static void +create_one_window_path(PlannerInfo *root, + RelOptInfo *window_rel, + Path *path, + PathTarget *input_target, + PathTarget *output_target, + List *tlist, + WindowFuncLists *wflists, + List *activeWindows) { - bool redistribute_group = false; - Path *path = (Path *) linitial(grouped_rel->partial_pathlist); + PathTarget *window_target; + ListCell *l; - hashaggtablesize = estimate_hashagg_tablesize(path, - &agg_final_costs, - dNumGroups); + /* + * Since each window clause could require a different sort order, we stack + * up a WindowAgg node for each clause, with sort steps between them as + * needed. (We assume that select_active_windows chose a good order for + * executing the clauses in.) + * + * input_target should contain all Vars and Aggs needed for the result. + * (In some cases we wouldn't need to propagate all of these all the way + * to the top, since they might only be needed as inputs to WindowFuncs. + * It's probably not worth trying to optimize that though.) It must also + * contain all window partitioning and sorting expressions, to ensure + * they're computed only once at the bottom of the stack (that's critical + * for volatile functions). As we climb up the stack, we'll add outputs + * for the WindowFuncs computed at each level. 
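
The stacking described above only inserts a Sort when the next window clause's required ordering is not already satisfied by the input ordering (the pathkeys_contained_in test ahead of create_sort_path in the loop below). A minimal standalone sketch of that decision follows, with sort keys reduced to invented integer ids and the prefix test standing in for the real pathkey machinery.

    /*
     * Standalone illustration (not PostgreSQL code) of the "sort only when
     * needed" decision made for each window clause: if the keys required by
     * the next clause already form a prefix of the input ordering, no Sort
     * node has to be stacked underneath the next WindowAgg.
     */
    #include <stdbool.h>
    #include <stdio.h>

    /* true if "required" (length nreq) is a prefix of "current" (length ncur) */
    static bool
    ordering_satisfied(const int *required, int nreq, const int *current, int ncur)
    {
        int i;

        if (nreq > ncur)
            return false;
        for (i = 0; i < nreq; i++)
            if (required[i] != current[i])
                return false;
        return true;
    }

    int
    main(void)
    {
        /* invented sort-key ids: input is sorted by (1, 2) */
        int current[] = {1, 2};
        /* two invented window clauses: one over (1), one over (3, 1) */
        int wc1[] = {1};
        int wc2[] = {3, 1};

        printf("clause 1 needs sort: %s\n",
               ordering_satisfied(wc1, 1, current, 2) ? "no" : "yes");
        printf("clause 2 needs sort: %s\n",
               ordering_satisfied(wc2, 2, current, 2) ? "no" : "yes");
        return 0;
    }

Ordering the active windows well (select_active_windows, per the comment above) is what keeps the number of such Sort steps down.
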
+ */ + window_target = input_target; -#ifdef __TBASE__ - if (hashaggtablesize < work_mem * 1024L || g_hybrid_hash_agg) -#else - if (hashaggtablesize < work_mem * 1024L) -#endif + foreach(l, activeWindows) { -#ifdef __TBASE__ - double total_groups = 0; + WindowClause *wc = (WindowClause *) lfirst(l); + List *window_pathkeys; + + window_pathkeys = make_pathkeys_for_window(root, + wc, + tlist); - if (olap_optimizer && !has_cold_hot_table) + /* Sort if necessary */ + if (!pathkeys_contained_in(window_pathkeys, path->pathkeys)) { - total_groups = path->rows; - } - else - total_groups = path->rows * path->parallel_workers; -#else - double total_groups = path->rows * path->parallel_workers; -#endif - path = (Path *) create_gather_path(root, - grouped_rel, + path = (Path *) create_sort_path(root, window_rel, path, - partial_grouping_target, - NULL, - &total_groups); + window_pathkeys, + -1.0); + } + + if (lnext(l)) + { /* - * If the grouping can't be fully pushed down, we'll push down the - * first phase of the aggregate, and redistribute only the partial - * results. + * Add the current WindowFuncs to the output target for this + * intermediate WindowAggPath. We must copy window_target to + * avoid changing the previous path's target. * - * If if can be pushed down, disable construction of complex - * distributed paths. + * Note: a WindowFunc adds nothing to the target's eval costs; but + * we do need to account for the increase in tlist width. */ - if (! can_push_down_grouping(root, parse, path)) -#ifdef __TBASE__ - { - if (olap_optimizer && !has_cold_hot_table) - { - /* redistribute local grouping results among datanodes */ - path = create_redistribute_grouping_path(root, parse, path); - redistribute_group = true; - } - else - { - path = create_remotesubplan_path(root, path, NULL); - } - } -#else - path = create_remotesubplan_path(root, path, NULL); -#endif - else - try_distributed_aggregation = false; + ListCell *lc2; -#ifdef __TBASE__ - if (!redistribute_group) - { - Path *agg_path = (Path *) - create_agg_path(root, - grouped_rel, - path, - target, - AGG_HASHED, - AGGSPLIT_FINAL_DESERIAL, - parse->groupClause, - (List *) parse->havingQual, - &agg_final_costs, - dNumGroups); - - if (olap_optimizer && !has_cold_hot_table) - { - agg_path->parallel_aware = true; - agg_path->parallel_safe = true; - } -#ifdef __TBASE__ - if (hashaggtablesize >= work_mem * 1024L && g_hybrid_hash_agg) + window_target = copy_pathtarget(window_target); + foreach(lc2, wflists->windowFuncs[wc->winref]) { - AggPath *aggpath = (AggPath *)agg_path; + WindowFunc *wfunc = lfirst_node(WindowFunc, lc2); - aggpath->hybrid = true; + add_column_to_pathtarget(window_target, (Expr *) wfunc, 0); + window_target->width += get_typavgwidth(wfunc->wintype, -1); } -#endif - add_path(grouped_rel, agg_path); } else { -#endif - Path *agg_path = (Path *) - create_agg_path(root, - grouped_rel, - path, - target, - AGG_HASHED, - AGGSPLIT_FINAL_DESERIAL, - parse->groupClause, - (List *) parse->havingQual, - &agg_final_costs, - dNumGroups); - - if (olap_optimizer && !has_cold_hot_table) - { - agg_path->parallel_aware = true; - agg_path->parallel_safe = true; + /* Install the goal target in the topmost WindowAgg */ + window_target = output_target; } -#ifdef __TBASE__ - if (hashaggtablesize >= work_mem * 1024L && g_hybrid_hash_agg) - { - AggPath *aggpath = (AggPath *)agg_path; - aggpath->hybrid = true; - } -#endif - add_path(grouped_rel, agg_path); -#ifdef __TBASE__ - } -#endif - } - } + /* We can't really push down window functions for now. 
*/ + if (!can_push_down_window(root, path)) + path = create_remotesubplan_path(root, path, NULL); + + path = (Path *) + create_windowagg_path(root, window_rel, path, window_target, + wflists->windowFuncs[wc->winref], + wc, + window_pathkeys); } - /* Generate XL aggregate paths, with distributed 2-phase aggregation. */ + add_path(window_rel, path); +} /* - * If there were no partial paths, we did not initialize any of the - * partial paths above. If that's the case, initialize here. + * create_distinct_paths * - * XXX The reason why the initialization block at the beginning is not - * simply performed unconditionally is that we may skip it if we've been - * successful in fully pushing down any of the aggregates, and entirely - * skip generating the XL paths. + * Build a new upperrel containing Paths for SELECT DISTINCT evaluation. * - * XXX Can we simply use the same estimates as regular partial aggregates, - * or do we need to invent something else? It might be a better idea to - * use estimates for the whole result here (e.g. total number of groups) - * instead of the partial ones. Underestimates often have more severe - * consequences (e.g. OOM with HashAggregate) than overestimates, so this - * seems like a more defensive approach. + * input_rel: contains the source-data Paths * - * XXX After thinking a bit more about the estimation, it may depend on - * pushdown - if the aggregate is fully pushed down (as above, we can - * probably use dNumGroups/numberOfNodes as a cardinality estimate, as - * we know the per-node groupings won't overlap. But here we need to be - * more careful. + * Note: input paths should already compute the desired pathtarget, since + * Sort/Unique won't project anything. */ - if (try_distributed_aggregation) +static RelOptInfo * +create_distinct_paths(PlannerInfo *root, + RelOptInfo *input_rel) { - partial_grouping_target = make_partial_grouping_target(root, target); + Query *parse = root->parse; + Path *cheapest_input_path = input_rel->cheapest_total_path; + RelOptInfo *distinct_rel; + double numDistinctRows; + bool allow_hash; + Path *path; + ListCell *lc; -#ifdef __TBASE__ - grouped_rel->reltarget = partial_grouping_target; -#endif - /* Estimate number of partial groups. */ - dNumPartialGroups = get_number_of_groups(root, - cheapest_path->rows, - gd); + /* For now, do all work in the (DISTINCT, NULL) upperrel */ + distinct_rel = fetch_upper_rel(root, UPPERREL_DISTINCT, NULL); /* - * Collect statistics about aggregates for estimating costs of - * performing aggregation in parallel. + * We don't compute anything at this level, so distinct_rel will be + * parallel-safe if the input rel is parallel-safe. In particular, if + * there is a DISTINCT ON (...) clause, any path for the input_rel will + * output those expressions, and will not be parallel-safe unless those + * expressions are parallel-safe. 
*/ - MemSet(&agg_partial_costs, 0, sizeof(AggClauseCosts)); - MemSet(&agg_final_costs, 0, sizeof(AggClauseCosts)); - if (parse->hasAggs) - { - /* partial phase */ - get_agg_clause_costs(root, (Node *) partial_grouping_target->exprs, - AGGSPLIT_INITIAL_SERIAL, - &agg_partial_costs); - - /* final phase */ - get_agg_clause_costs(root, (Node *) target->exprs, - AGGSPLIT_FINAL_DESERIAL, - &agg_final_costs); - get_agg_clause_costs(root, parse->havingQual, - AGGSPLIT_FINAL_DESERIAL, - &agg_final_costs); - } + distinct_rel->consider_parallel = input_rel->consider_parallel; - /* Build final XL grouping paths */ - if (can_sort) + /* + * If the input rel belongs to a single FDW, so does the distinct_rel. + */ + distinct_rel->serverid = input_rel->serverid; + distinct_rel->userid = input_rel->userid; + distinct_rel->useridiscurrent = input_rel->useridiscurrent; + distinct_rel->fdwroutine = input_rel->fdwroutine; + + /* Estimate number of distinct rows there will be */ + if (parse->groupClause || parse->groupingSets || parse->hasAggs || + root->hasHavingQual) { /* - * Use any available suitably-sorted path as input, and also consider - * sorting the cheapest-total path. + * If there was grouping or aggregation, use the number of input rows + * as the estimated number of DISTINCT rows (ie, assume the input is + * already mostly unique). */ - foreach(lc, input_rel->pathlist) + numDistinctRows = cheapest_input_path->rows; + } + else { - Path *path = (Path *) lfirst(lc); - bool is_sorted; + /* + * Otherwise, the UNIQUE filter has effects comparable to GROUP BY. + */ + List *distinctExprs; - is_sorted = pathkeys_contained_in(root->group_pathkeys, - path->pathkeys); + distinctExprs = get_sortgrouplist_exprs(parse->distinctClause, + parse->targetList); + numDistinctRows = estimate_num_groups(root, distinctExprs, + cheapest_input_path->rows, + NULL); + } /* - * XL: Can it happen that the cheapest path can't be pushed down, - * while some other path could be? Perhaps we should move the check - * if a path can be pushed down up, and add another OR condition - * to consider all paths that can be pushed down? - * - * if (path == cheapest_path || is_sorted || can_push_down) + * Consider sort-based implementations of DISTINCT, if possible. */ - if (path == cheapest_path || is_sorted) + if (grouping_is_sortable(parse->distinctClause)) { /* - * We can't really beat paths that we managed to fully push - * down above, so we can skip them entirely. + * First, if we have any adequately-presorted paths, just stick a + * Unique node on those. Then consider doing an explicit sort of the + * cheapest input path and Unique'ing that. * - * XXX Not constructing any paths, so we can do this before - * adding the Sort path. + * When we have DISTINCT ON, we must sort by the more rigorous of + * DISTINCT and ORDER BY, else it won't have the desired behavior. + * Also, if we do have to do an explicit sort, we might as well use + * the more rigorous ordering to avoid a second sort later. (Note + * that the parser will have ensured that one clause is a prefix of + * the other.) 
*/ - if (can_push_down_grouping(root, parse, path)) - continue; + List *needed_pathkeys; - /* Sort the cheapest-total path if it isn't already sorted */ - if (!is_sorted) - path = (Path *) create_sort_path(root, - grouped_rel, - path, - root->group_pathkeys, - -1.0); + if (parse->hasDistinctOn && + list_length(root->distinct_pathkeys) < + list_length(root->sort_pathkeys)) + needed_pathkeys = root->sort_pathkeys; + else + needed_pathkeys = root->distinct_pathkeys; - /* Now decide what to stick atop it */ - if (parse->groupingSets) + foreach(lc, input_rel->pathlist) { - /* - * TODO 2-phase aggregation for grouping sets paths not - * supported yet, but this the place where such paths - * should be constructed. - */ - } - else if (parse->hasAggs) + Path *path = (Path *) lfirst(lc); + + if (pathkeys_contained_in(needed_pathkeys, path->pathkeys)) { /* - * We have aggregation, possibly with plain GROUP BY. Make - * an AggPath. + * Make sure the distribution matches the distinct clause, + * needed by the UNIQUE path. + * + * FIXME This could probably benefit from pushing a UNIQUE + * to the remote side, and only doing a merge locally. */ + if (!grouping_distribution_match(root, parse, path, parse->distinctClause)) + path = create_remotesubplan_path(root, path, NULL); - path = (Path *) create_agg_path(root, - grouped_rel, + add_path(distinct_rel, (Path *) + create_upper_unique_path(root, distinct_rel, path, - partial_grouping_target, - parse->groupClause ? AGG_SORTED : AGG_PLAIN, - AGGSPLIT_INITIAL_SERIAL, - parse->groupClause, - NIL, - &agg_partial_costs, - dNumPartialGroups); + list_length(root->distinct_pathkeys), + numDistinctRows)); + } + } -#ifdef __TBASE__ - if (olap_optimizer && !has_cold_hot_table) + /* For explicit-sort case, always use the more rigorous clause */ + if (list_length(root->distinct_pathkeys) < + list_length(root->sort_pathkeys)) { - /* redistribute local grouping results among datanodes */ - path = create_redistribute_grouping_path(root, parse, path); + needed_pathkeys = root->sort_pathkeys; + /* Assert checks that parser didn't mess up... */ + Assert(pathkeys_contained_in(root->distinct_pathkeys, + needed_pathkeys)); } else - { - path = create_remotesubplan_path(root, path, NULL); - } -#else - path = create_remotesubplan_path(root, path, NULL); -#endif + needed_pathkeys = root->distinct_pathkeys; -#ifdef __TBASE__ - if (parse->groupClause && olap_optimizer && !has_cold_hot_table && - (!is_sorted || root->group_pathkeys)) - { - path = (Path *) create_sort_path(root, - grouped_rel, + path = cheapest_input_path; + if (!pathkeys_contained_in(needed_pathkeys, path->pathkeys)) + path = (Path *) create_sort_path(root, distinct_rel, path, - root->group_pathkeys, + needed_pathkeys, -1.0); + + /* In case of grouping / distribution mismatch, inject remote scan. */ + if (!grouping_distribution_match(root, parse, path, parse->distinctClause)) + path = create_remotesubplan_path(root, path, NULL); + + add_path(distinct_rel, (Path *) + create_upper_unique_path(root, distinct_rel, + path, + list_length(root->distinct_pathkeys), + numDistinctRows)); } -#endif + /* - * We generate two paths, differing in the second phase - * implementation (sort and hash). + * Consider hash-based implementations of DISTINCT, if possible. + * + * If we were not able to make any other types of path, we *must* hash or + * die trying. 
If we do have other choices, there are several things that + * should prevent selection of hashing: if the query uses DISTINCT ON + * (because it won't really have the expected behavior if we hash), or if + * enable_hashagg is off, or if it looks like the hashtable will exceed + * work_mem. + * + * Note: grouping_is_hashable() is much more expensive to check than the + * other gating conditions, so we want to do it last. */ + if (distinct_rel->pathlist == NIL) + allow_hash = true; /* we have no alternatives */ + else if (parse->hasDistinctOn || !enable_hashagg) + allow_hash = false; /* policy-based decision not to hash */ + else { - Path *agg_path = (Path *) - create_agg_path(root, - grouped_rel, - path, - target, - parse->groupClause ? AGG_SORTED : AGG_PLAIN, - AGGSPLIT_FINAL_DESERIAL, - parse->groupClause, - (List *) parse->havingQual, - &agg_final_costs, - dNumGroups); + Size hashentrysize; - //agg_path->parallel_safe = true; + /* Estimate per-hash-entry space at tuple width... */ + hashentrysize = MAXALIGN(cheapest_input_path->pathtarget->width) + + MAXALIGN(SizeofMinimalTupleHeader); + /* plus the per-hash-entry overhead */ + hashentrysize += hash_agg_entry_size(0); - add_path(grouped_rel, agg_path); + /* Allow hashing only if hashtable is predicted to fit in work_mem */ + allow_hash = (hashentrysize * numDistinctRows <= work_mem * 1024L); } - if (can_hash) + if (allow_hash && grouping_is_hashable(parse->distinctClause)) { - Path *agg_path = (Path *) + Path *input_path = cheapest_input_path; + + /* If needed, inject RemoteSubplan redistributing the data. */ + if (!grouping_distribution_match(root, parse, input_path, parse->distinctClause)) + input_path = create_remotesubplan_path(root, input_path, NULL); + + /* XXX Maybe we can make this a 2-phase aggregate too? */ + + /* Generate hashed aggregate path --- no sort needed */ + add_path(distinct_rel, (Path *) create_agg_path(root, - grouped_rel, - path, - target, + distinct_rel, + input_path, + input_path->pathtarget, AGG_HASHED, - AGGSPLIT_FINAL_DESERIAL, - parse->groupClause, - (List *) parse->havingQual, - &agg_final_costs, - dNumGroups); + AGGSPLIT_SIMPLE, + parse->distinctClause, + NIL, + NULL, + numDistinctRows)); + } - //agg_path->parallel_safe = true; + /* Give a helpful error if we failed to find any implementation */ + if (distinct_rel->pathlist == NIL) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("could not implement DISTINCT"), + errdetail("Some of the datatypes only support hashing, while others only support sorting."))); - add_path(grouped_rel, agg_path); - } + /* + * If there is an FDW that's responsible for all baserels of the query, + * let it consider adding ForeignPaths. + */ + if (distinct_rel->fdwroutine && + distinct_rel->fdwroutine->GetForeignUpperPaths) + distinct_rel->fdwroutine->GetForeignUpperPaths(root, UPPERREL_DISTINCT, + input_rel, distinct_rel); + + /* Let extensions possibly add some more paths */ + if (create_upper_paths_hook) + (*create_upper_paths_hook) (root, UPPERREL_DISTINCT, + input_rel, distinct_rel); + + /* Now choose the best path(s) */ + set_cheapest(distinct_rel); + + return distinct_rel; } - else if (parse->groupClause) + +/* + * create_ordered_paths + * + * Build a new upperrel containing Paths for ORDER BY evaluation. + * + * All paths in the result must satisfy the ORDER BY ordering. + * The only new path we need consider is an explicit sort on the + * cheapest-total existing path. 
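
The allow_hash test earlier in this hunk gates hashed DISTINCT on whether the estimated hashtable fits in work_mem: hashentrysize is the MAXALIGN'd tuple width plus the MAXALIGN'd minimal tuple header plus the per-entry overhead from hash_agg_entry_size(0), and hashing is allowed only if that size times the estimated number of distinct rows stays within work_mem. A standalone sketch of that arithmetic follows; the widths, overheads, row count, and work_mem setting are all invented, and MAXALIGN is approximated by rounding up to 8 bytes.

    /*
     * Standalone illustration (not PostgreSQL code) of the work_mem gate for
     * hashed DISTINCT: estimate one hash entry's footprint and allow hashing
     * only if all estimated distinct rows fit in work_mem.
     */
    #include <stdbool.h>
    #include <stddef.h>
    #include <stdio.h>

    /* round up to an 8-byte boundary, standing in for MAXALIGN */
    static size_t
    align8(size_t sz)
    {
        return (sz + 7) & ~(size_t) 7;
    }

    int
    main(void)
    {
        size_t  tuple_width = 37;          /* pretend average tuple width */
        size_t  tuple_header = 23;         /* pretend minimal tuple header */
        size_t  per_entry_overhead = 56;   /* pretend hash entry overhead */
        double  num_distinct_rows = 250000.0;
        long    work_mem_kb = 4096;        /* pretend work_mem = 4MB */
        size_t  hashentrysize;
        bool    allow_hash;

        hashentrysize = align8(tuple_width) + align8(tuple_header) +
                        per_entry_overhead;
        allow_hash = hashentrysize * num_distinct_rows <= work_mem_kb * 1024.0;

        printf("entry=%zu bytes, total=%.0f bytes, allow_hash=%s\n",
               hashentrysize, hashentrysize * num_distinct_rows,
               allow_hash ? "yes" : "no");
        return 0;
    }

With these invented figures the table would need roughly 30MB against a 4MB work_mem, so the hashed path would not be allowed unless it is the only alternative.
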
+ * + * input_rel: contains the source-data Paths + * target: the output tlist the result Paths must emit + * limit_tuples: estimated bound on the number of output tuples, + * or -1 if no LIMIT or couldn't estimate + */ +static RelOptInfo * +create_ordered_paths(PlannerInfo *root, + RelOptInfo *input_rel, + PathTarget *target, + double limit_tuples) { + Path *cheapest_input_path = input_rel->cheapest_total_path; + RelOptInfo *ordered_rel; + ListCell *lc; + + /* For now, do all work in the (ORDERED, NULL) upperrel */ + ordered_rel = fetch_upper_rel(root, UPPERREL_ORDERED, NULL); + /* - * We have GROUP BY without aggregation or grouping sets. - * Make a GroupPath. + * If the input relation is not parallel-safe, then the ordered relation + * can't be parallel-safe, either. Otherwise, it's parallel-safe if the + * target list is parallel-safe. */ - path = (Path *) create_group_path(root, - grouped_rel, - path, - partial_grouping_target, - parse->groupClause, - NIL, - dNumPartialGroups); + if (input_rel->consider_parallel && + is_parallel_safe(root, (Node *) target->exprs)) + ordered_rel->consider_parallel = true; -#ifdef __TBASE__ - if (olap_optimizer && !has_cold_hot_table) - { - /* redistribute local grouping results among datanodes */ - path = create_redistribute_grouping_path(root, parse, path); - } - else + /* + * If the input rel belongs to a single FDW, so does the ordered_rel. + */ + ordered_rel->serverid = input_rel->serverid; + ordered_rel->userid = input_rel->userid; + ordered_rel->useridiscurrent = input_rel->useridiscurrent; + ordered_rel->fdwroutine = input_rel->fdwroutine; + + foreach(lc, input_rel->pathlist) { - path = create_remotesubplan_path(root, path, NULL); - } -#else - path = create_remotesubplan_path(root, path, NULL); -#endif + Path *path = (Path *) lfirst(lc); + bool is_sorted; -#ifdef __TBASE__ - if (olap_optimizer && !has_cold_hot_table && (!is_sorted || root->group_pathkeys)) + is_sorted = pathkeys_contained_in(root->sort_pathkeys, + path->pathkeys); + if (path == cheapest_input_path || is_sorted) + { + if (!is_sorted) { + /* An explicit sort here can take advantage of LIMIT */ path = (Path *) create_sort_path(root, - grouped_rel, + ordered_rel, path, - root->group_pathkeys, - -1.0); + root->sort_pathkeys, + limit_tuples); } -#endif - { - Path *group_path = (Path *) - create_group_path(root, - grouped_rel, - path, - target, - parse->groupClause, - (List *) parse->havingQual, - dNumGroups); - //group_path->parallel_safe = true; + /* Add projection step if needed */ + if (path->pathtarget != target) + path = apply_projection_to_path(root, ordered_rel, + path, target); - add_path(grouped_rel, group_path); + add_path(ordered_rel, path); } - } - else + + /* + * generate_gather_paths() will have already generated a simple Gather + * path for the best parallel path, if any, and the loop above will have + * considered sorting it. Similarly, generate_gather_paths() will also + * have generated order-preserving Gather Merge plans which can be used + * without sorting if they happen to match the sort_pathkeys, and the loop + * above will have handled those as well. However, there's one more + * possibility: it may make sense to sort the cheapest partial path + * according to the required output order and then use Gather Merge. 
+ */ + if (ordered_rel->consider_parallel && root->sort_pathkeys != NIL && + input_rel->partial_pathlist != NIL) { - /* Other cases should have been handled above */ - Assert(false); - } - } - } - } - - if (can_hash) - { - hashaggtablesize = estimate_hashagg_tablesize(cheapest_path, - agg_costs, - dNumGroups); - - /* - * Provided that the estimated size of the hashtable does not exceed - * work_mem, we'll generate a HashAgg Path, although if we were unable - * to sort above, then we'd better generate a Path, so that we at - * least have one. - */ -#ifdef __TBASE__ - if (hashaggtablesize < work_mem * 1024L || g_hybrid_hash_agg || - grouped_rel->pathlist == NIL) -#else - if (hashaggtablesize < work_mem * 1024L || - grouped_rel->pathlist == NIL) -#endif - { - /* If the whole aggregate was pushed down, we're done. */ - if (! can_push_down_grouping(root, parse, cheapest_path)) - { - Path *path, *agg_path; - - path = (Path *) create_agg_path(root, - grouped_rel, - cheapest_path, - partial_grouping_target, - AGG_HASHED, - AGGSPLIT_INITIAL_SERIAL, - parse->groupClause, - NIL, - &agg_partial_costs, - dNumPartialGroups); - - /* keep partially aggregated path for the can_sort branch */ - agg_path = path; -#ifdef __TBASE__ - if (hashaggtablesize >= work_mem * 1024L && g_hybrid_hash_agg) - { - AggPath *aggpath = (AggPath *)agg_path; - - aggpath->hybrid = true; - } -#endif - -#ifdef __TBASE__ - if (olap_optimizer && !has_cold_hot_table) - { - /* redistribute local grouping results among datanodes */ - path = create_redistribute_grouping_path(root, parse, path); - } - else - { - path = create_remotesubplan_path(root, path, NULL); - } -#else - path = create_remotesubplan_path(root, path, NULL); -#endif - /* Generate paths with both hash and sort second phase. */ - { - Path *agg_path = (Path *) - create_agg_path(root, - grouped_rel, - path, - target, - AGG_HASHED, - AGGSPLIT_FINAL_DESERIAL, - parse->groupClause, - (List *) parse->havingQual, - &agg_final_costs, - dNumGroups); - - //agg_path->parallel_safe = true; -#ifdef __TBASE__ - if (hashaggtablesize >= work_mem * 1024L && g_hybrid_hash_agg) - { - AggPath *aggpath = (AggPath *)agg_path; + Path *cheapest_partial_path; - aggpath->hybrid = true; - } -#endif - add_path(grouped_rel, agg_path); - } - - if (can_sort) - { -#ifdef __TBASE__ - if (!olap_optimizer || has_cold_hot_table) -#endif - path = (Path *) create_sort_path(root, - grouped_rel, - agg_path, - root->group_pathkeys, - -1.0); - -#ifdef __TBASE__ - if (olap_optimizer && !has_cold_hot_table) - { - /* redistribute local grouping results among datanodes */ - path = create_redistribute_grouping_path(root, parse, agg_path); - } - else - { - path = create_remotesubplan_path(root, path, NULL); - } -#else - path = create_remotesubplan_path(root, path, NULL); -#endif + cheapest_partial_path = linitial(input_rel->partial_pathlist); -#ifdef __TBASE__ - if (olap_optimizer && !has_cold_hot_table) - { /* - * AGG_HASHED aggregate paths are always unsorted, so add - * a Sorted node for the final AGG_SORTED step. + * If cheapest partial path doesn't need a sort, this is redundant + * with what's already been tried. 
*/ + if (!pathkeys_contained_in(root->sort_pathkeys, + cheapest_partial_path->pathkeys)) + { + Path *path; + double total_groups; + path = (Path *) create_sort_path(root, - grouped_rel, - path, - root->group_pathkeys, + ordered_rel, + cheapest_partial_path, + root->sort_pathkeys, -1.0); - } -#endif + total_groups = cheapest_partial_path->rows * + cheapest_partial_path->parallel_workers; path = (Path *) - create_agg_path(root, - grouped_rel, + create_gather_merge_path(root, ordered_rel, path, - target, - parse->groupClause ? AGG_SORTED : AGG_PLAIN, - AGGSPLIT_FINAL_DESERIAL, - parse->groupClause, - (List *) parse->havingQual, - &agg_final_costs, - dNumGroups); + target, root->sort_pathkeys, NULL, + &total_groups); - //path->parallel_safe = true; + /* Add projection step if needed */ + if (path->pathtarget != target) + path = apply_projection_to_path(root, ordered_rel, + path, target); - add_path(grouped_rel, path); - } - } - } + add_path(ordered_rel, path); } } - /* Give a helpful error if we failed to find any implementation */ - if (grouped_rel->pathlist == NIL) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("could not implement GROUP BY"), - errdetail("Some of the datatypes only support hashing, while others only support sorting."))); - /* * If there is an FDW that's responsible for all baserels of the query, * let it consider adding ForeignPaths. */ - if (grouped_rel->fdwroutine && - grouped_rel->fdwroutine->GetForeignUpperPaths) - grouped_rel->fdwroutine->GetForeignUpperPaths(root, UPPERREL_GROUP_AGG, - input_rel, grouped_rel); + if (ordered_rel->fdwroutine && + ordered_rel->fdwroutine->GetForeignUpperPaths) + ordered_rel->fdwroutine->GetForeignUpperPaths(root, UPPERREL_ORDERED, + input_rel, ordered_rel); /* Let extensions possibly add some more paths */ if (create_upper_paths_hook) - (*create_upper_paths_hook) (root, UPPERREL_GROUP_AGG, - input_rel, grouped_rel); + (*create_upper_paths_hook) (root, UPPERREL_ORDERED, + input_rel, ordered_rel); - /* Now choose the best path(s) */ - set_cheapest(grouped_rel); /* - * We've been using the partial pathlist for the grouped relation to hold - * partially aggregated paths, but that's actually a little bit bogus - * because it's unsafe for later planning stages -- like ordered_rel --- - * to get the idea that they can use these partial paths as if they didn't - * need a FinalizeAggregate step. Zap the partial pathlist at this stage - * so we don't get confused. + * No need to bother with set_cheapest here; grouping_planner does not + * need us to do it. */ - grouped_rel->partial_pathlist = NIL; + Assert(ordered_rel->pathlist != NIL); - return grouped_rel; + return ordered_rel; } /* - * For a given input path, consider the possible ways of doing grouping sets on - * it, by combinations of hashing and sorting. This can be called multiple - * times, so it's important that it not scribble on input. No result is - * returned, but any generated paths are added to grouped_rel. + * make_group_input_target + * Generate appropriate PathTarget for initial input to grouping nodes. + * + * If there is grouping or aggregation, the scan/join subplan cannot emit + * the query's final targetlist; for example, it certainly can't emit any + * aggregate function calls. This routine generates the correct target + * for the scan/join subplan. 
+ * + * The query target list passed from the parser already contains entries + * for all ORDER BY and GROUP BY expressions, but it will not have entries + * for variables used only in HAVING clauses; so we need to add those + * variables to the subplan target list. Also, we flatten all expressions + * except GROUP BY items into their component variables; other expressions + * will be computed by the upper plan nodes rather than by the subplan. + * For example, given a query like + * SELECT a+b,SUM(c+d) FROM table GROUP BY a+b; + * we want to pass this targetlist to the subplan: + * a+b,c,d + * where the a+b target will be used by the Sort/Group steps, and the + * other targets will be used for computing the final results. + * + * 'final_target' is the query's final target list (in PathTarget form) + * + * The result is the PathTarget to be computed by the Paths returned from + * query_planner(). */ -static void -consider_groupingsets_paths(PlannerInfo *root, - RelOptInfo *grouped_rel, - Path *path, - bool is_sorted, - bool can_hash, - PathTarget *target, - grouping_sets_data *gd, - const AggClauseCosts *agg_costs, - double dNumGroups) -{// #lizard forgives +static PathTarget * +make_group_input_target(PlannerInfo *root, PathTarget *final_target) +{ Query *parse = root->parse; + PathTarget *input_target; + List *non_group_cols; + List *non_group_vars; + int i; + ListCell *lc; /* - * If we're not being offered sorted input, then only consider plans that - * can be done entirely by hashing. - * - * We can hash everything if it looks like it'll fit in work_mem. But if - * the input is actually sorted despite not being advertised as such, we - * prefer to make use of that in order to use less memory. - * - * If none of the grouping sets are sortable, then ignore the work_mem - * limit and generate a path anyway, since otherwise we'll just fail. - */ - if (!is_sorted) - { - List *new_rollups = NIL; - RollupData *unhashed_rollup = NULL; - List *sets_data; - List *empty_sets_data = NIL; - List *empty_sets = NIL; - ListCell *lc; - ListCell *l_start = list_head(gd->rollups); - AggStrategy strat = AGG_HASHED; - Size hashsize; - double exclude_groups = 0.0; + * We must build a target containing all grouping columns, plus any other + * Vars mentioned in the query's targetlist and HAVING qual. + */ + input_target = create_empty_pathtarget(); + non_group_cols = NIL; - Assert(can_hash); + i = 0; + foreach(lc, final_target->exprs) + { + Expr *expr = (Expr *) lfirst(lc); + Index sgref = get_pathtarget_sortgroupref(final_target, i); - if (pathkeys_contained_in(root->group_pathkeys, path->pathkeys)) + if (sgref && parse->groupClause && + get_sortgroupref_clause_noerr(sgref, parse->groupClause) != NULL) { - unhashed_rollup = lfirst(l_start); - exclude_groups = unhashed_rollup->numGroups; - l_start = lnext(l_start); + /* + * It's a grouping column, so add it to the input target as-is. + */ + add_column_to_pathtarget(input_target, expr, sgref); + } + else + { + /* + * Non-grouping column, so just remember the expression for later + * call to pull_var_clause. + */ + non_group_cols = lappend(non_group_cols, expr); } - hashsize = estimate_hashagg_tablesize(path, - agg_costs, - dNumGroups - exclude_groups); + i++; + } /* - * gd->rollups is empty if we have only unsortable columns to work - * with. Override work_mem in that case; otherwise, we'll rely on the - * sorted-input case to generate usable mixed paths. + * If there's a HAVING clause, we'll need the Vars it uses, too. 
*/ - if (hashsize > work_mem * 1024L && gd->rollups) - return; /* nope, won't fit */ + if (parse->havingQual) + non_group_cols = lappend(non_group_cols, parse->havingQual); /* - * We need to burst the existing rollups list into individual grouping - * sets and recompute a groupClause for each set. + * Pull out all the Vars mentioned in non-group cols (plus HAVING), and + * add them to the input target if not already present. (A Var used + * directly as a GROUP BY item will be present already.) Note this + * includes Vars used in resjunk items, so we are covering the needs of + * ORDER BY and window specifications. Vars used within Aggrefs and + * WindowFuncs will be pulled out here, too. */ - sets_data = list_copy(gd->unsortable_sets); - - for_each_cell(lc, l_start) - { - RollupData *rollup = lfirst(lc); + non_group_vars = pull_var_clause((Node *) non_group_cols, + PVC_RECURSE_AGGREGATES | + PVC_RECURSE_WINDOWFUNCS | + PVC_INCLUDE_PLACEHOLDERS); + add_new_columns_to_pathtarget(input_target, non_group_vars); + + /* clean up cruft */ + list_free(non_group_vars); + list_free(non_group_cols); + + /* XXX this causes some redundant cost calculation ... */ + return set_pathtarget_cost_width(root, input_target); +} /* - * If we find an unhashable rollup that's not been skipped by the - * "actually sorted" check above, we can't cope; we'd need sorted - * input (with a different sort order) but we can't get that here. - * So bail out; we'll get a valid path from the is_sorted case - * instead. + * make_partial_grouping_target + * Generate appropriate PathTarget for output of partial aggregate + * (or partial grouping, if there are no aggregates) nodes. * - * The mere presence of empty grouping sets doesn't make a rollup - * unhashable (see preprocess_grouping_sets), we handle those - * specially below. + * A partial aggregation node needs to emit all the same aggregates that + * a regular aggregation node would, plus any aggregates used in HAVING; + * except that the Aggref nodes should be marked as partial aggregates. + * + * In addition, we'd better emit any Vars and PlaceholderVars that are + * used outside of Aggrefs in the aggregation tlist and HAVING. (Presumably, + * these would be Vars that are grouped by or used in grouping expressions.) + * + * grouping_target is the tlist to be emitted by the topmost aggregation step. + * We get the HAVING clause out of *root. */ - if (!rollup->hashable) - return; - else - sets_data = list_concat(sets_data, list_copy(rollup->gsets_data)); - } - foreach(lc, sets_data) +static PathTarget * +make_partial_grouping_target(PlannerInfo *root, PathTarget *grouping_target) { - GroupingSetData *gs = lfirst(lc); - List *gset = gs->set; - RollupData *rollup; - - if (gset == NIL) + Query *parse = root->parse; + PathTarget *partial_target; + List *non_group_cols; + List *non_group_exprs; + int i; + ListCell *lc; + + partial_target = create_empty_pathtarget(); + non_group_cols = NIL; + + i = 0; + foreach(lc, grouping_target->exprs) { - /* Empty grouping sets can't be hashed. */ - empty_sets_data = lappend(empty_sets_data, gs); - empty_sets = lappend(empty_sets, NIL); + Expr *expr = (Expr *) lfirst(lc); + Index sgref = get_pathtarget_sortgroupref(grouping_target, i); + + if (sgref && parse->groupClause && + get_sortgroupref_clause_noerr(sgref, parse->groupClause) != NULL) + { + /* + * It's a grouping column, so add it to the partial_target as-is. + * (This allows the upper agg step to repeat the grouping calcs.) 
+ */ + add_column_to_pathtarget(partial_target, expr, sgref); } else { - rollup = makeNode(RollupData); - - rollup->groupClause = preprocess_groupclause(root, gset); - rollup->gsets_data = list_make1(gs); - rollup->gsets = remap_to_groupclause_idx(rollup->groupClause, - rollup->gsets_data, - gd->tleref_to_colnum_map); - rollup->numGroups = gs->numGroups; - rollup->hashable = true; - rollup->is_hashed = true; - new_rollups = lappend(new_rollups, rollup); + /* + * Non-grouping column, so just remember the expression for later + * call to pull_var_clause. + */ + non_group_cols = lappend(non_group_cols, expr); } + + i++; } /* - * If we didn't find anything nonempty to hash, then bail. We'll - * generate a path from the is_sorted case. + * If there's a HAVING clause, we'll need the Vars/Aggrefs it uses, too. */ - if (new_rollups == NIL) - return; + if (parse->havingQual) + non_group_cols = lappend(non_group_cols, parse->havingQual); /* - * If there were empty grouping sets they should have been in the - * first rollup. + * Pull out all the Vars, PlaceHolderVars, and Aggrefs mentioned in + * non-group cols (plus HAVING), and add them to the partial_target if not + * already present. (An expression used directly as a GROUP BY item will + * be present already.) Note this includes Vars used in resjunk items, so + * we are covering the needs of ORDER BY and window specifications. */ - Assert(!unhashed_rollup || !empty_sets); + non_group_exprs = pull_var_clause((Node *) non_group_cols, + PVC_INCLUDE_AGGREGATES | + PVC_RECURSE_WINDOWFUNCS | + PVC_INCLUDE_PLACEHOLDERS); - if (unhashed_rollup) - { - new_rollups = lappend(new_rollups, unhashed_rollup); - strat = AGG_MIXED; - } - else if (empty_sets) + add_new_columns_to_pathtarget(partial_target, non_group_exprs); + + /* + * Adjust Aggrefs to put them in partial mode. At this point all Aggrefs + * are at the top level of the target list, so we can just scan the list + * rather than recursing through the expression trees. + */ + foreach(lc, partial_target->exprs) { - RollupData *rollup = makeNode(RollupData); + Aggref *aggref = (Aggref *) lfirst(lc); - rollup->groupClause = NIL; - rollup->gsets_data = empty_sets_data; - rollup->gsets = empty_sets; - rollup->numGroups = list_length(empty_sets); - rollup->hashable = false; - rollup->is_hashed = false; - new_rollups = lappend(new_rollups, rollup); - strat = AGG_MIXED; - } + if (IsA(aggref, Aggref)) + { + Aggref *newaggref; /* - * If the grouping can't be fully pushed down, redistribute the - * path on top of the (sorted) path. If if can be pushed down, - * disable construction of complex distributed paths. + * We shouldn't need to copy the substructure of the Aggref node, + * but flat-copy the node itself to avoid damaging other trees. */ - if (! can_push_down_grouping(root, parse, path)) - path = create_remotesubplan_path(root, path, NULL); - - add_path(grouped_rel, (Path *) - create_groupingsets_path(root, - grouped_rel, - path, - target, - (List *) parse->havingQual, - strat, - new_rollups, - agg_costs, - dNumGroups)); - return; + newaggref = makeNode(Aggref); + memcpy(newaggref, aggref, sizeof(Aggref)); + + /* For now, assume serialization is required */ + mark_partial_aggref(newaggref, AGGSPLIT_INITIAL_SERIAL); + + lfirst(lc) = newaggref; + } } - /* - * If we have sorted input but nothing we can do with it, bail. - */ - if (list_length(gd->rollups) == 0) - return; + /* clean up cruft */ + list_free(non_group_exprs); + list_free(non_group_cols); + + /* XXX this causes some redundant cost calculation ... 
*/ + return set_pathtarget_cost_width(root, partial_target); +} /* - * Given sorted input, we try and make two paths: one sorted and one mixed - * sort/hash. (We need to try both because hashagg might be disabled, or - * some columns might not be sortable.) + * mark_partial_aggref + * Adjust an Aggref to make it represent a partial-aggregation step. * - * can_hash is passed in as false if some obstacle elsewhere (such as - * ordered aggs) means that we shouldn't consider hashing at all. + * The Aggref node is modified in-place; caller must do any copying required. */ - if (can_hash && gd->any_hashable) +void +mark_partial_aggref(Aggref *agg, AggSplit aggsplit) { - List *rollups = NIL; - List *hash_sets = list_copy(gd->unsortable_sets); - double availspace = (work_mem * 1024.0); - ListCell *lc; - - /* - * Account first for space needed for groups we can't sort at all. - */ - availspace -= (double) estimate_hashagg_tablesize(path, - agg_costs, - gd->dNumHashGroups); + /* aggtranstype should be computed by this point */ + Assert(OidIsValid(agg->aggtranstype)); + /* ... but aggsplit should still be as the parser left it */ + Assert(agg->aggsplit == AGGSPLIT_SIMPLE); - if (availspace > 0 && list_length(gd->rollups) > 1) - { - double scale; - int num_rollups = list_length(gd->rollups); - int k_capacity; - int *k_weights = palloc(num_rollups * sizeof(int)); - Bitmapset *hash_items = NULL; - int i; + /* Mark the Aggref with the intended partial-aggregation mode */ + agg->aggsplit = aggsplit; /* - * We treat this as a knapsack problem: the knapsack capacity - * represents work_mem, the item weights are the estimated memory - * usage of the hashtables needed to implement a single rollup, - * and we really ought to use the cost saving as the item value; - * however, currently the costs assigned to sort nodes don't - * reflect the comparison costs well, and so we treat all items as - * of equal value (each rollup we hash instead saves us one sort). - * - * To use the discrete knapsack, we need to scale the values to a - * reasonably small bounded range. We choose to allow a 5% error - * margin; we have no more than 4096 rollups in the worst possible - * case, which with a 5% error margin will require a bit over 42MB - * of workspace. (Anyone wanting to plan queries that complex had - * better have the memory for it. In more reasonable cases, with - * no more than a couple of dozen rollups, the memory usage will - * be negligible.) - * - * k_capacity is naturally bounded, but we clamp the values for - * scale and weight (below) to avoid overflows or underflows (or - * uselessly trying to use a scale factor less than 1 byte). + * Adjust result type if needed. Normally, a partial aggregate returns + * the aggregate's transition type; but if that's INTERNAL and we're + * serializing, it returns BYTEA instead. */ - scale = Max(availspace / (20.0 * num_rollups), 1.0); - k_capacity = (int) floor(availspace / scale); + if (DO_AGGSPLIT_SKIPFINAL(aggsplit)) + { + if (agg->aggtranstype == INTERNALOID && DO_AGGSPLIT_SERIALIZE(aggsplit)) + agg->aggtype = BYTEAOID; + else + agg->aggtype = agg->aggtranstype; + } +} /* - * We leave the first rollup out of consideration since it's the - * one that matches the input sort order. We assign indexes "i" - * to only those entries considered for hashing; the second loop, - * below, must use the same condition. + * postprocess_setop_tlist + * Fix up targetlist returned by plan_set_operations(). + * + * We need to transpose sort key info from the orig_tlist into new_tlist. 
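[Illustration, not part of the patch: mark_partial_aggref above switches an Aggref into partial mode so the lower aggregation step emits the transition state (serialized to BYTEA when it is INTERNAL) instead of a finalized value. The standalone sketch below shows why that matters, using two-phase AVG: each partial step returns (sum, count) and only the final step divides. AvgState, partial_avg and combine_avg are invented names for this sketch.]

/*
 * Illustrative sketch: two-phase (partial + final) AVG.  The partial step
 * must return the transition state (sum, count), not sum/count, or the
 * combining step could not produce a correct overall average.
 */
#include <stdio.h>

typedef struct AvgState
{
    double      sum;
    long        count;
} AvgState;

/* partial aggregation over one chunk of the data (one "datanode") */
static AvgState
partial_avg(const double *vals, int n)
{
    AvgState    st = {0.0, 0};
    int         i;

    for (i = 0; i < n; i++)
    {
        st.sum += vals[i];
        st.count++;
    }
    return st;
}

/* combine two transition states, as the finalizing step would */
static AvgState
combine_avg(AvgState a, AvgState b)
{
    AvgState    st;

    st.sum = a.sum + b.sum;
    st.count = a.count + b.count;
    return st;
}

int
main(void)
{
    double      node1[] = {1.0, 2.0, 3.0};
    double      node2[] = {10.0, 20.0};
    AvgState    s1 = partial_avg(node1, 3);
    AvgState    s2 = partial_avg(node2, 2);
    AvgState    total = combine_avg(s1, s2);

    /* prints 7.200000: (1+2+3+10+20) / 5 */
    printf("%f\n", total.sum / total.count);
    return 0;
}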
+ * NOTE: this would not be good enough if we supported resjunk sort keys + * for results of set operations --- then, we'd need to project a whole + * new tlist to evaluate the resjunk columns. For now, just ereport if we + * find any resjunk columns in orig_tlist. */ - i = 0; - for_each_cell(lc, lnext(list_head(gd->rollups))) +static List * +postprocess_setop_tlist(List *new_tlist, List *orig_tlist) { - RollupData *rollup = lfirst(lc); + ListCell *l; + ListCell *orig_tlist_item = list_head(orig_tlist); - if (rollup->hashable) + foreach(l, new_tlist) { - double sz = estimate_hashagg_tablesize(path, - agg_costs, - rollup->numGroups); + TargetEntry *new_tle = (TargetEntry *) lfirst(l); + TargetEntry *orig_tle; - /* - * If sz is enormous, but work_mem (and hence scale) is - * small, avoid integer overflow here. - */ - k_weights[i] = (int) Min(floor(sz / scale), - k_capacity + 1.0); - ++i; + /* ignore resjunk columns in setop result */ + if (new_tle->resjunk) + continue; + + Assert(orig_tlist_item != NULL); + orig_tle = (TargetEntry *) lfirst(orig_tlist_item); + orig_tlist_item = lnext(orig_tlist_item); + if (orig_tle->resjunk) /* should not happen */ + elog(ERROR, "resjunk output columns are not implemented"); + Assert(new_tle->resno == orig_tle->resno); + new_tle->ressortgroupref = orig_tle->ressortgroupref; } + if (orig_tlist_item != NULL) + elog(ERROR, "resjunk output columns are not implemented"); + return new_tlist; } /* - * Apply knapsack algorithm; compute the set of items which - * maximizes the value stored (in this case the number of sorts - * saved) while keeping the total size (approximately) within - * capacity. + * select_active_windows + * Create a list of the "active" window clauses (ie, those referenced + * by non-deleted WindowFuncs) in the order they are to be executed. */ - if (i > 0) - hash_items = DiscreteKnapsack(k_capacity, i, k_weights, NULL); - - if (!bms_is_empty(hash_items)) +static List * +select_active_windows(PlannerInfo *root, WindowFuncLists *wflists) { - rollups = list_make1(linitial(gd->rollups)); + List *result; + List *actives; + ListCell *lc; - i = 0; - for_each_cell(lc, lnext(list_head(gd->rollups))) + /* First, make a list of the active windows */ + actives = NIL; + foreach(lc, root->parse->windowClause) { - RollupData *rollup = lfirst(lc); + WindowClause *wc = (WindowClause *) lfirst(lc); - if (rollup->hashable) - { - if (bms_is_member(i, hash_items)) - hash_sets = list_concat(hash_sets, - list_copy(rollup->gsets_data)); - else - rollups = lappend(rollups, rollup); - ++i; - } - else - rollups = lappend(rollups, rollup); - } - } + /* It's only active if wflists shows some related WindowFuncs */ + Assert(wc->winref <= wflists->maxWinRef); + if (wflists->windowFuncs[wc->winref] != NIL) + actives = lappend(actives, wc); } - if (!rollups && hash_sets) - rollups = list_copy(gd->rollups); - - foreach(lc, hash_sets) + /* + * Now, ensure that windows with identical partitioning/ordering clauses + * are adjacent in the list. This is required by the SQL standard, which + * says that only one sort is to be used for such windows, even if they + * are otherwise distinct (eg, different names or framing clauses). + * + * There is room to be much smarter here, for example detecting whether + * one window's sort keys are a prefix of another's (so that sorting for + * the latter would do for the former), or putting windows first that + * match a sort order available for the underlying query. For the moment + * we are content with meeting the spec. 
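[Illustration, not part of the patch: the select_active_windows comment above requires that windows with identical PARTITION BY/ORDER BY clauses end up adjacent, so a single sort can serve all of them. The standalone sketch below reproduces that reordering with plain strings standing in for the clause lists; ToyWindow and the window names are invented.]

/*
 * Illustrative sketch: reorder window clauses so entries with the same
 * (partition, order) key become adjacent, preserving the order of first
 * appearance -- the same effect as the loop in select_active_windows.
 */
#include <stdio.h>
#include <string.h>

typedef struct ToyWindow
{
    const char *name;
    const char *sortkey;        /* stands in for PARTITION BY + ORDER BY */
} ToyWindow;

int
main(void)
{
    ToyWindow   actives[] = {
        {"w1", "p=a,o=b"},
        {"w2", "p=x,o=y"},
        {"w3", "p=a,o=b"},      /* same keys as w1: should follow it */
        {"w4", "p=x,o=y"},      /* same keys as w2 */
    };
    int         n = 4;
    int         done[4] = {0, 0, 0, 0};
    int         i, j;

    for (i = 0; i < n; i++)
    {
        if (done[i])
            continue;
        printf("%s ", actives[i].name);
        done[i] = 1;
        /* pull forward every later window with matching keys */
        for (j = i + 1; j < n; j++)
        {
            if (!done[j] &&
                strcmp(actives[i].sortkey, actives[j].sortkey) == 0)
            {
                printf("%s ", actives[j].name);
                done[j] = 1;
            }
        }
    }
    /* prints: w1 w3 w2 w4 */
    printf("\n");
    return 0;
}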
+ */ + result = NIL; + while (actives != NIL) { - GroupingSetData *gs = lfirst(lc); - RollupData *rollup = makeNode(RollupData); + WindowClause *wc = (WindowClause *) linitial(actives); + ListCell *prev; + ListCell *next; - Assert(gs->set != NIL); - - rollup->groupClause = preprocess_groupclause(root, gs->set); - rollup->gsets_data = list_make1(gs); - rollup->gsets = remap_to_groupclause_idx(rollup->groupClause, - rollup->gsets_data, - gd->tleref_to_colnum_map); - rollup->numGroups = gs->numGroups; - rollup->hashable = true; - rollup->is_hashed = true; - rollups = lcons(rollup, rollups); - } + /* Move wc from actives to result */ + actives = list_delete_first(actives); + result = lappend(result, wc); - if (rollups) + /* Now move any matching windows from actives to result */ + prev = NULL; + for (lc = list_head(actives); lc; lc = next) { - /* - * If the grouping can't be fully pushed down, redistribute the - * path on top of the (sorted) path. If if can be pushed down, - * disable construction of complex distributed paths. - */ - if (! can_push_down_grouping(root, parse, path)) - path = create_remotesubplan_path(root, path, NULL); - - add_path(grouped_rel, (Path *) - create_groupingsets_path(root, - grouped_rel, - path, - target, - (List *) parse->havingQual, - AGG_MIXED, - rollups, - agg_costs, - dNumGroups)); + WindowClause *wc2 = (WindowClause *) lfirst(lc); + + next = lnext(lc); + /* framing options are NOT to be compared here! */ + if (equal(wc->partitionClause, wc2->partitionClause) && + equal(wc->orderClause, wc2->orderClause)) + { + actives = list_delete_cell(actives, lc, prev); + result = lappend(result, wc2); } + else + prev = lc; } - - /* - * Now try the simple sorted case. - */ - if (!gd->unsortable_sets) - { - /* - * If the grouping can't be fully pushed down, redistribute the - * path on top of the (sorted) path. If if can be pushed down, - * disable construction of complex distributed paths. - */ - if (! can_push_down_grouping(root, parse, path)) - path = create_remotesubplan_path(root, path, NULL); - - add_path(grouped_rel, (Path *) - create_groupingsets_path(root, - grouped_rel, - path, - target, - (List *) parse->havingQual, - AGG_SORTED, - gd->rollups, - agg_costs, - dNumGroups)); } + + return result; } /* - * create_window_paths - * - * Build a new upperrel containing Paths for window-function evaluation. + * make_window_input_target + * Generate appropriate PathTarget for initial input to WindowAgg nodes. * - * input_rel: contains the source-data Paths - * input_target: result of make_window_input_target - * output_target: what the topmost WindowAggPath should return - * tlist: query's target list (needed to look up pathkeys) - * wflists: result of find_window_functions - * activeWindows: result of select_active_windows + * When the query has window functions, this function computes the desired + * target to be computed by the node just below the first WindowAgg. + * This tlist must contain all values needed to evaluate the window functions, + * compute the final target list, and perform any required final sort step. + * If multiple WindowAggs are needed, each intermediate one adds its window + * function results onto this base tlist; only the topmost WindowAgg computes + * the actual desired target list. * - * Note: all Paths in input_rel are expected to return input_target. 
- */ -static RelOptInfo * -create_window_paths(PlannerInfo *root, - RelOptInfo *input_rel, - PathTarget *input_target, - PathTarget *output_target, - List *tlist, - WindowFuncLists *wflists, + * This function is much like make_group_input_target, though not quite enough + * like it to share code. As in that function, we flatten most expressions + * into their component variables. But we do not want to flatten window + * PARTITION BY/ORDER BY clauses, since that might result in multiple + * evaluations of them, which would be bad (possibly even resulting in + * inconsistent answers, if they contain volatile functions). + * Also, we must not flatten GROUP BY clauses that were left unflattened by + * make_group_input_target, because we may no longer have access to the + * individual Vars in them. + * + * Another key difference from make_group_input_target is that we don't + * flatten Aggref expressions, since those are to be computed below the + * window functions and just referenced like Vars above that. + * + * 'final_target' is the query's final target list (in PathTarget form) + * 'activeWindows' is the list of active windows previously identified by + * select_active_windows. + * + * The result is the PathTarget to be computed by the plan node immediately + * below the first WindowAgg node. + */ +static PathTarget * +make_window_input_target(PlannerInfo *root, + PathTarget *final_target, List *activeWindows) -{// #lizard forgives - RelOptInfo *window_rel; +{ + Query *parse = root->parse; + PathTarget *input_target; + Bitmapset *sgrefs; + List *flattenable_cols; + List *flattenable_vars; + int i; ListCell *lc; - /* For now, do all work in the (WINDOW, NULL) upperrel */ - window_rel = fetch_upper_rel(root, UPPERREL_WINDOW, NULL); - - /* - * If the input relation is not parallel-safe, then the window relation - * can't be parallel-safe, either. Otherwise, we need to examine the - * target list and active windows for non-parallel-safe constructs. - */ - if (input_rel->consider_parallel && - is_parallel_safe(root, (Node *) output_target->exprs) && - is_parallel_safe(root, (Node *) activeWindows)) - window_rel->consider_parallel = true; + Assert(parse->hasWindowFuncs); /* - * If the input rel belongs to a single FDW, so does the window rel. + * Collect the sortgroupref numbers of window PARTITION/ORDER BY clauses + * into a bitmapset for convenient reference below. */ - window_rel->serverid = input_rel->serverid; - window_rel->userid = input_rel->userid; - window_rel->useridiscurrent = input_rel->useridiscurrent; - window_rel->fdwroutine = input_rel->fdwroutine; + sgrefs = NULL; + foreach(lc, activeWindows) + { + WindowClause *wc = (WindowClause *) lfirst(lc); + ListCell *lc2; - /* - * Consider computing window functions starting from the existing - * cheapest-total path (which will likely require a sort) as well as any - * existing paths that satisfy root->window_pathkeys (which won't). 
- */ - foreach(lc, input_rel->pathlist) + foreach(lc2, wc->partitionClause) { - Path *path = (Path *) lfirst(lc); + SortGroupClause *sortcl = (SortGroupClause *) lfirst(lc2); - if (path == input_rel->cheapest_total_path || - pathkeys_contained_in(root->window_pathkeys, path->pathkeys)) - create_one_window_path(root, - window_rel, - path, - input_target, - output_target, - tlist, - wflists, - activeWindows); + sgrefs = bms_add_member(sgrefs, sortcl->tleSortGroupRef); } + foreach(lc2, wc->orderClause) + { + SortGroupClause *sortcl = (SortGroupClause *) lfirst(lc2); - /* - * If there is an FDW that's responsible for all baserels of the query, - * let it consider adding ForeignPaths. - */ - if (window_rel->fdwroutine && - window_rel->fdwroutine->GetForeignUpperPaths) - window_rel->fdwroutine->GetForeignUpperPaths(root, UPPERREL_WINDOW, - input_rel, window_rel); - - /* Let extensions possibly add some more paths */ - if (create_upper_paths_hook) - (*create_upper_paths_hook) (root, UPPERREL_WINDOW, - input_rel, window_rel); + sgrefs = bms_add_member(sgrefs, sortcl->tleSortGroupRef); + } + } - /* Now choose the best path(s) */ - set_cheapest(window_rel); + /* Add in sortgroupref numbers of GROUP BY clauses, too */ + foreach(lc, parse->groupClause) + { + SortGroupClause *grpcl = (SortGroupClause *) lfirst(lc); - return window_rel; + sgrefs = bms_add_member(sgrefs, grpcl->tleSortGroupRef); } /* - * Stack window-function implementation steps atop the given Path, and - * add the result to window_rel. - * - * window_rel: upperrel to contain result - * path: input Path to use (must return input_target) - * input_target: result of make_window_input_target - * output_target: what the topmost WindowAggPath should return - * tlist: query's target list (needed to look up pathkeys) - * wflists: result of find_window_functions - * activeWindows: result of select_active_windows + * Construct a target containing all the non-flattenable targetlist items, + * and save aside the others for a moment. */ -static void -create_one_window_path(PlannerInfo *root, - RelOptInfo *window_rel, - Path *path, - PathTarget *input_target, - PathTarget *output_target, - List *tlist, - WindowFuncLists *wflists, - List *activeWindows) + input_target = create_empty_pathtarget(); + flattenable_cols = NIL; + + i = 0; + foreach(lc, final_target->exprs) { - PathTarget *window_target; - ListCell *l; + Expr *expr = (Expr *) lfirst(lc); + Index sgref = get_pathtarget_sortgroupref(final_target, i); /* - * Since each window clause could require a different sort order, we stack - * up a WindowAgg node for each clause, with sort steps between them as - * needed. (We assume that select_active_windows chose a good order for - * executing the clauses in.) - * - * input_target should contain all Vars and Aggs needed for the result. - * (In some cases we wouldn't need to propagate all of these all the way - * to the top, since they might only be needed as inputs to WindowFuncs. - * It's probably not worth trying to optimize that though.) It must also - * contain all window partitioning and sorting expressions, to ensure - * they're computed only once at the bottom of the stack (that's critical - * for volatile functions). As we climb up the stack, we'll add outputs - * for the WindowFuncs computed at each level. + * Don't want to deconstruct window clauses or GROUP BY items. (Note + * that such items can't contain window functions, so it's okay to + * compute them below the WindowAgg nodes.) 
*/ - window_target = input_target; - - foreach(l, activeWindows) - { - WindowClause *wc = (WindowClause *) lfirst(l); - List *window_pathkeys; - - window_pathkeys = make_pathkeys_for_window(root, - wc, - tlist); - - /* Sort if necessary */ - if (!pathkeys_contained_in(window_pathkeys, path->pathkeys)) - { - path = (Path *) create_sort_path(root, window_rel, - path, - window_pathkeys, - -1.0); - } - - if (lnext(l)) + if (sgref != 0 && bms_is_member(sgref, sgrefs)) { /* - * Add the current WindowFuncs to the output target for this - * intermediate WindowAggPath. We must copy window_target to - * avoid changing the previous path's target. - * - * Note: a WindowFunc adds nothing to the target's eval costs; but - * we do need to account for the increase in tlist width. + * Don't want to deconstruct this value, so add it to the input + * target as-is. */ - ListCell *lc2; - - window_target = copy_pathtarget(window_target); - foreach(lc2, wflists->windowFuncs[wc->winref]) - { - WindowFunc *wfunc = lfirst_node(WindowFunc, lc2); - - add_column_to_pathtarget(window_target, (Expr *) wfunc, 0); - window_target->width += get_typavgwidth(wfunc->wintype, -1); - } + add_column_to_pathtarget(input_target, expr, sgref); } else { - /* Install the goal target in the topmost WindowAgg */ - window_target = output_target; + /* + * Column is to be flattened, so just remember the expression for + * later call to pull_var_clause. + */ + flattenable_cols = lappend(flattenable_cols, expr); } - /* We can't really push down window functions for now. */ - if (!can_push_down_window(root, path)) - path = create_remotesubplan_path(root, path, NULL); - - path = (Path *) - create_windowagg_path(root, window_rel, path, window_target, - wflists->windowFuncs[wc->winref], - wc, - window_pathkeys); + i++; } - add_path(window_rel, path); + /* + * Pull out all the Vars and Aggrefs mentioned in flattenable columns, and + * add them to the input target if not already present. (Some might be + * there already because they're used directly as window/group clauses.) + * + * Note: it's essential to use PVC_INCLUDE_AGGREGATES here, so that any + * Aggrefs are placed in the Agg node's tlist and not left to be computed + * at higher levels. On the other hand, we should recurse into + * WindowFuncs to make sure their input expressions are available. + */ + flattenable_vars = pull_var_clause((Node *) flattenable_cols, + PVC_INCLUDE_AGGREGATES | + PVC_RECURSE_WINDOWFUNCS | + PVC_INCLUDE_PLACEHOLDERS); + add_new_columns_to_pathtarget(input_target, flattenable_vars); + + /* clean up cruft */ + list_free(flattenable_vars); + list_free(flattenable_cols); + + /* XXX this causes some redundant cost calculation ... */ + return set_pathtarget_cost_width(root, input_target); } /* - * create_distinct_paths - * - * Build a new upperrel containing Paths for SELECT DISTINCT evaluation. + * make_pathkeys_for_window + * Create a pathkeys list describing the required input ordering + * for the given WindowClause. * - * input_rel: contains the source-data Paths + * The required ordering is first the PARTITION keys, then the ORDER keys. + * In the future we might try to implement windowing using hashing, in which + * case the ordering could be relaxed, but for now we always sort. * - * Note: input paths should already compute the desired pathtarget, since - * Sort/Unique won't project anything. + * Caution: if you change this, see createplan.c's get_column_info_for_window! 
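[Illustration, not part of the patch: as the make_pathkeys_for_window comment above states, the required input ordering is the PARTITION BY keys followed by the ORDER BY keys. The standalone sketch below shows what that combined ordering means for the rows feeding a WindowAgg, as a plain qsort comparator; Row and the field names are invented for the example.]

/*
 * Illustrative sketch: sorting rows by (partition key, then order key),
 * the ordering make_pathkeys_for_window asks for.
 */
#include <stdio.h>
#include <stdlib.h>

typedef struct Row
{
    int         part;           /* PARTITION BY column */
    int         ord;            /* ORDER BY column */
} Row;

static int
cmp_window_input(const void *a, const void *b)
{
    const Row  *ra = (const Row *) a;
    const Row  *rb = (const Row *) b;

    /* partition keys first ... */
    if (ra->part != rb->part)
        return (ra->part > rb->part) - (ra->part < rb->part);
    /* ... then ordering keys within each partition */
    return (ra->ord > rb->ord) - (ra->ord < rb->ord);
}

int
main(void)
{
    Row         rows[] = {{2, 1}, {1, 9}, {1, 3}, {2, 0}};
    int         i;

    qsort(rows, 4, sizeof(Row), cmp_window_input);

    /* prints: (1,3) (1,9) (2,0) (2,1) */
    for (i = 0; i < 4; i++)
        printf("(%d,%d) ", rows[i].part, rows[i].ord);
    printf("\n");
    return 0;
}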
*/ -static RelOptInfo * -create_distinct_paths(PlannerInfo *root, - RelOptInfo *input_rel) -{// #lizard forgives - Query *parse = root->parse; - Path *cheapest_input_path = input_rel->cheapest_total_path; - RelOptInfo *distinct_rel; - double numDistinctRows; - bool allow_hash; - Path *path; - ListCell *lc; - - /* For now, do all work in the (DISTINCT, NULL) upperrel */ - distinct_rel = fetch_upper_rel(root, UPPERREL_DISTINCT, NULL); +static List * +make_pathkeys_for_window(PlannerInfo *root, WindowClause *wc, + List *tlist) +{ + List *window_pathkeys; + List *window_sortclauses; + + /* Throw error if can't sort */ + if (!grouping_is_sortable(wc->partitionClause)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("could not implement window PARTITION BY"), + errdetail("Window partitioning columns must be of sortable datatypes."))); + if (!grouping_is_sortable(wc->orderClause)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("could not implement window ORDER BY"), + errdetail("Window ordering columns must be of sortable datatypes."))); + + /* Okay, make the combined pathkeys */ + window_sortclauses = list_concat(list_copy(wc->partitionClause), + list_copy(wc->orderClause)); + window_pathkeys = make_pathkeys_for_sortclauses(root, + window_sortclauses, + tlist); + list_free(window_sortclauses); + return window_pathkeys; +} /* - * We don't compute anything at this level, so distinct_rel will be - * parallel-safe if the input rel is parallel-safe. In particular, if - * there is a DISTINCT ON (...) clause, any path for the input_rel will - * output those expressions, and will not be parallel-safe unless those - * expressions are parallel-safe. + * make_sort_input_target + * Generate appropriate PathTarget for initial input to Sort step. + * + * If the query has ORDER BY, this function chooses the target to be computed + * by the node just below the Sort (and DISTINCT, if any, since Unique can't + * project) steps. This might or might not be identical to the query's final + * output target. + * + * The main argument for keeping the sort-input tlist the same as the final + * is that we avoid a separate projection node (which will be needed if + * they're different, because Sort can't project). However, there are also + * advantages to postponing tlist evaluation till after the Sort: it ensures + * a consistent order of evaluation for any volatile functions in the tlist, + * and if there's also a LIMIT, we can stop the query without ever computing + * tlist functions for later rows, which is beneficial for both volatile and + * expensive functions. + * + * Our current policy is to postpone volatile expressions till after the sort + * unconditionally (assuming that that's possible, ie they are in plain tlist + * columns and not ORDER BY/GROUP BY/DISTINCT columns). We also prefer to + * postpone set-returning expressions, because running them beforehand would + * bloat the sort dataset, and because it might cause unexpected output order + * if the sort isn't stable. However there's a constraint on that: all SRFs + * in the tlist should be evaluated at the same plan step, so that they can + * run in sync in nodeProjectSet. So if any SRFs are in sort columns, we + * mustn't postpone any SRFs. (Note that in principle that policy should + * probably get applied to the group/window input targetlists too, but we + * have not done that historically.) 
Lastly, expensive expressions are + * postponed if there is a LIMIT, or if root->tuple_fraction shows that + * partial evaluation of the query is possible (if neither is true, we expect + * to have to evaluate the expressions for every row anyway), or if there are + * any volatile or set-returning expressions (since once we've put in a + * projection at all, it won't cost any more to postpone more stuff). + * + * Another issue that could potentially be considered here is that + * evaluating tlist expressions could result in data that's either wider + * or narrower than the input Vars, thus changing the volume of data that + * has to go through the Sort. However, we usually have only a very bad + * idea of the output width of any expression more complex than a Var, + * so for now it seems too risky to try to optimize on that basis. + * + * Note that if we do produce a modified sort-input target, and then the + * query ends up not using an explicit Sort, no particular harm is done: + * we'll initially use the modified target for the preceding path nodes, + * but then change them to the final target with apply_projection_to_path. + * Moreover, in such a case the guarantees about evaluation order of + * volatile functions still hold, since the rows are sorted already. + * + * This function has some things in common with make_group_input_target and + * make_window_input_target, though the detailed rules for what to do are + * different. We never flatten/postpone any grouping or ordering columns; + * those are needed before the sort. If we do flatten a particular + * expression, we leave Aggref and WindowFunc nodes alone, since those were + * computed earlier. + * + * 'final_target' is the query's final target list (in PathTarget form) + * 'have_postponed_srfs' is an output argument, see below + * + * The result is the PathTarget to be computed by the plan node immediately + * below the Sort step (and the Distinct step, if any). This will be + * exactly final_target if we decide a projection step wouldn't be helpful. + * + * In addition, *have_postponed_srfs is set to TRUE if we choose to postpone + * any set-returning functions to after the Sort. */ - distinct_rel->consider_parallel = input_rel->consider_parallel; +static PathTarget * +make_sort_input_target(PlannerInfo *root, + PathTarget *final_target, + bool *have_postponed_srfs) +{ + Query *parse = root->parse; + PathTarget *input_target; + int ncols; + bool *col_is_srf; + bool *postpone_col; + bool have_srf; + bool have_volatile; + bool have_expensive; + bool have_srf_sortcols; + bool postpone_srfs; + List *postponable_cols; + List *postponable_vars; + int i; + ListCell *lc; + + /* Shouldn't get here unless query has ORDER BY */ + Assert(parse->sortClause); + + *have_postponed_srfs = false; /* default result */ + + /* Inspect tlist and collect per-column information */ + ncols = list_length(final_target->exprs); + col_is_srf = (bool *) palloc0(ncols * sizeof(bool)); + postpone_col = (bool *) palloc0(ncols * sizeof(bool)); + have_srf = have_volatile = have_expensive = have_srf_sortcols = false; + + i = 0; + foreach(lc, final_target->exprs) + { + Expr *expr = (Expr *) lfirst(lc); /* - * If the input rel belongs to a single FDW, so does the distinct_rel. + * If the column has a sortgroupref, assume it has to be evaluated + * before sorting. Generally such columns would be ORDER BY, GROUP + * BY, etc targets. One exception is columns that were removed from + * GROUP BY by remove_useless_groupby_columns() ... 
but those would + * only be Vars anyway. There don't seem to be any cases where it + * would be worth the trouble to double-check. */ - distinct_rel->serverid = input_rel->serverid; - distinct_rel->userid = input_rel->userid; - distinct_rel->useridiscurrent = input_rel->useridiscurrent; - distinct_rel->fdwroutine = input_rel->fdwroutine; - - /* Estimate number of distinct rows there will be */ - if (parse->groupClause || parse->groupingSets || parse->hasAggs || - root->hasHavingQual) + if (get_pathtarget_sortgroupref(final_target, i) == 0) { /* - * If there was grouping or aggregation, use the number of input rows - * as the estimated number of DISTINCT rows (ie, assume the input is - * already mostly unique). + * Check for SRF or volatile functions. Check the SRF case first + * because we must know whether we have any postponed SRFs. */ - numDistinctRows = cheapest_input_path->rows; + if (parse->hasTargetSRFs && + expression_returns_set((Node *) expr)) + { + /* We'll decide below whether these are postponable */ + col_is_srf[i] = true; + have_srf = true; + } + else if (contain_volatile_functions((Node *) expr)) + { + /* Unconditionally postpone */ + postpone_col[i] = true; + have_volatile = true; } else { /* - * Otherwise, the UNIQUE filter has effects comparable to GROUP BY. + * Else check the cost. XXX it's annoying to have to do this + * when set_pathtarget_cost_width() just did it. Refactor to + * allow sharing the work? */ - List *distinctExprs; + QualCost cost; - distinctExprs = get_sortgrouplist_exprs(parse->distinctClause, - parse->targetList); - numDistinctRows = estimate_num_groups(root, distinctExprs, - cheapest_input_path->rows, - NULL); - } + cost_qual_eval_node(&cost, (Node *) expr, root); /* - * Consider sort-based implementations of DISTINCT, if possible. + * We arbitrarily define "expensive" as "more than 10X + * cpu_operator_cost". Note this will take in any PL function + * with default cost. */ - if (grouping_is_sortable(parse->distinctClause)) + if (cost.per_tuple > 10 * cpu_operator_cost) + { + postpone_col[i] = true; + have_expensive = true; + } + } + } + else { + /* For sortgroupref cols, just check if any contain SRFs */ + if (!have_srf_sortcols && + parse->hasTargetSRFs && + expression_returns_set((Node *) expr)) + have_srf_sortcols = true; + } + + i++; + } + /* - * First, if we have any adequately-presorted paths, just stick a - * Unique node on those. Then consider doing an explicit sort of the - * cheapest input path and Unique'ing that. - * - * When we have DISTINCT ON, we must sort by the more rigorous of - * DISTINCT and ORDER BY, else it won't have the desired behavior. - * Also, if we do have to do an explicit sort, we might as well use - * the more rigorous ordering to avoid a second sort later. (Note - * that the parser will have ensured that one clause is a prefix of - * the other.) + * We can postpone SRFs if we have some but none are in sortgroupref cols. */ - List *needed_pathkeys; + postpone_srfs = (have_srf && !have_srf_sortcols); - if (parse->hasDistinctOn && - list_length(root->distinct_pathkeys) < - list_length(root->sort_pathkeys)) - needed_pathkeys = root->sort_pathkeys; - else - needed_pathkeys = root->distinct_pathkeys; + /* + * If we don't need a post-sort projection, just return final_target. 
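[Illustration, not part of the patch: the classification loop above tags each tlist column as an SRF, volatile, or expensive candidate for postponement past the Sort, unless it carries a sortgroupref. The standalone sketch below is a simplified restatement of that decision rule on toy data; the Col struct, the column names, and the "postpone expensive columns only when a LIMIT is expected" shortcut are assumptions of the sketch, not a copy of the real control flow.]

/*
 * Illustrative sketch of the postponement rule described above: volatile
 * columns are always postponed past the Sort, SRF columns are postponed
 * unless some sort/group column itself contains an SRF, and expensive
 * columns only justify a projection when a partial fetch (LIMIT) is
 * expected.  Toy data only.
 */
#include <stdio.h>

typedef struct Col
{
    const char *name;
    int         sortgroupref;   /* used by ORDER BY/GROUP BY? */
    int         is_srf;
    int         is_volatile;
    int         is_expensive;   /* > 10 * cpu_operator_cost */
} Col;

int
main(void)
{
    Col         cols[] = {
        {"x", 1, 0, 0, 0},      /* ORDER BY column */
        {"random()", 0, 0, 1, 0},       /* volatile */
        {"generate_series(1,2)", 0, 1, 0, 0},   /* SRF */
        {"slow_func(x)", 0, 0, 0, 1},   /* expensive */
    };
    int         ncols = 4;
    int         have_limit = 1; /* pretend the query has a LIMIT */
    int         have_srf_sortcols = 0;
    int         i;

    /* an SRF in a sort/group column forbids postponing any SRFs */
    for (i = 0; i < ncols; i++)
        if (cols[i].sortgroupref && cols[i].is_srf)
            have_srf_sortcols = 1;

    for (i = 0; i < ncols; i++)
    {
        int         postpone = 0;

        if (!cols[i].sortgroupref)
        {
            if (cols[i].is_volatile)
                postpone = 1;
            else if (cols[i].is_srf && !have_srf_sortcols)
                postpone = 1;
            else if (cols[i].is_expensive && have_limit)
                postpone = 1;
        }
        printf("%-24s %s\n", cols[i].name,
               postpone ? "evaluate after Sort" : "evaluate before Sort");
    }
    return 0;
}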
+ */ + if (!(postpone_srfs || have_volatile || + (have_expensive && + (parse->limitCount || root->tuple_fraction > 0)))) + return final_target; - foreach(lc, input_rel->pathlist) - { - Path *path = (Path *) lfirst(lc); + /* + * Report whether the post-sort projection will contain set-returning + * functions. This is important because it affects whether the Sort can + * rely on the query's LIMIT (if any) to bound the number of rows it needs + * to return. + */ + *have_postponed_srfs = postpone_srfs; - if (pathkeys_contained_in(needed_pathkeys, path->pathkeys)) - { /* - * Make sure the distribution matches the distinct clause, - * needed by the UNIQUE path. - * - * FIXME This could probably benefit from pushing a UNIQUE - * to the remote side, and only doing a merge locally. + * Construct the sort-input target, taking all non-postponable columns and + * then adding Vars, PlaceHolderVars, Aggrefs, and WindowFuncs found in + * the postponable ones. */ - if (!grouping_distribution_match(root, parse, path, parse->distinctClause)) - path = create_remotesubplan_path(root, path, NULL); - - add_path(distinct_rel, (Path *) - create_upper_unique_path(root, distinct_rel, - path, - list_length(root->distinct_pathkeys), - numDistinctRows)); - } - } + input_target = create_empty_pathtarget(); + postponable_cols = NIL; - /* For explicit-sort case, always use the more rigorous clause */ - if (list_length(root->distinct_pathkeys) < - list_length(root->sort_pathkeys)) + i = 0; + foreach(lc, final_target->exprs) { - needed_pathkeys = root->sort_pathkeys; - /* Assert checks that parser didn't mess up... */ - Assert(pathkeys_contained_in(root->distinct_pathkeys, - needed_pathkeys)); - } + Expr *expr = (Expr *) lfirst(lc); + + if (postpone_col[i] || (postpone_srfs && col_is_srf[i])) + postponable_cols = lappend(postponable_cols, expr); else - needed_pathkeys = root->distinct_pathkeys; + add_column_to_pathtarget(input_target, expr, + get_pathtarget_sortgroupref(final_target, i)); - path = cheapest_input_path; - if (!pathkeys_contained_in(needed_pathkeys, path->pathkeys)) - path = (Path *) create_sort_path(root, distinct_rel, - path, - needed_pathkeys, - -1.0); - - /* In case of grouping / distribution mismatch, inject remote scan. */ - if (!grouping_distribution_match(root, parse, path, parse->distinctClause)) - path = create_remotesubplan_path(root, path, NULL); - - add_path(distinct_rel, (Path *) - create_upper_unique_path(root, distinct_rel, - path, - list_length(root->distinct_pathkeys), - numDistinctRows)); + i++; + } + + /* + * Pull out all the Vars, Aggrefs, and WindowFuncs mentioned in + * postponable columns, and add them to the sort-input target if not + * already present. (Some might be there already.) We mustn't + * deconstruct Aggrefs or WindowFuncs here, since the projection node + * would be unable to recompute them. + */ + postponable_vars = pull_var_clause((Node *) postponable_cols, + PVC_INCLUDE_AGGREGATES | + PVC_INCLUDE_WINDOWFUNCS | + PVC_INCLUDE_PLACEHOLDERS); + add_new_columns_to_pathtarget(input_target, postponable_vars); + + /* clean up cruft */ + list_free(postponable_vars); + list_free(postponable_cols); + + /* XXX this represents even more redundant cost calculation ... */ + return set_pathtarget_cost_width(root, input_target); } /* - * Consider hash-based implementations of DISTINCT, if possible. + * get_cheapest_fractional_path + * Find the cheapest path for retrieving a specified fraction of all + * the tuples expected to be returned by the given relation. 
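[Illustration, not part of the patch: choosing a path for only a fraction of the output amounts to comparing costs interpolated between startup and total cost. The sketch below assumes the linear interpolation cost = startup + fraction * (total - startup); the real comparison lives in compare_fractional_path_costs elsewhere in the planner, so treat that formula and the ToyPath type as assumptions of the sketch.]

/*
 * Illustrative sketch: comparing two paths on the cost of fetching only a
 * fraction of their output, assuming linear interpolation between startup
 * and total cost.
 */
#include <stdio.h>

typedef struct ToyPath
{
    const char *name;
    double      startup_cost;
    double      total_cost;
} ToyPath;

static double
fractional_cost(const ToyPath *p, double fraction)
{
    return p->startup_cost + fraction * (p->total_cost - p->startup_cost);
}

int
main(void)
{
    /* a fast-start path (e.g. index scan) vs. a cheaper-total path */
    ToyPath     idx = {"fast-start", 0.5, 200.0};
    ToyPath     seq = {"cheap-total", 50.0, 120.0};
    double      fractions[] = {0.05, 1.0};
    int         i;

    for (i = 0; i < 2; i++)
    {
        double      ci = fractional_cost(&idx, fractions[i]);
        double      cs = fractional_cost(&seq, fractions[i]);

        printf("fraction %.2f: %s wins (%.1f vs %.1f)\n",
               fractions[i],
               ci < cs ? idx.name : seq.name,
               ci < cs ? ci : cs,
               ci < cs ? cs : ci);
    }
    /*
     * With 5 percent of the rows the fast-start path wins; when fetching
     * everything, the cheaper-total path wins.
     */
    return 0;
}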
* - * If we were not able to make any other types of path, we *must* hash or - * die trying. If we do have other choices, there are several things that - * should prevent selection of hashing: if the query uses DISTINCT ON - * (because it won't really have the expected behavior if we hash), or if - * enable_hashagg is off, or if it looks like the hashtable will exceed - * work_mem. + * We interpret tuple_fraction the same way as grouping_planner. * - * Note: grouping_is_hashable() is much more expensive to check than the - * other gating conditions, so we want to do it last. + * We assume set_cheapest() has been run on the given rel. */ - if (distinct_rel->pathlist == NIL) - allow_hash = true; /* we have no alternatives */ - else if (parse->hasDistinctOn || !enable_hashagg) - allow_hash = false; /* policy-based decision not to hash */ - else +Path * +get_cheapest_fractional_path(RelOptInfo *rel, double tuple_fraction) { - Size hashentrysize; + Path *best_path = rel->cheapest_total_path; + ListCell *l; - /* Estimate per-hash-entry space at tuple width... */ - hashentrysize = MAXALIGN(cheapest_input_path->pathtarget->width) + - MAXALIGN(SizeofMinimalTupleHeader); - /* plus the per-hash-entry overhead */ - hashentrysize += hash_agg_entry_size(0); + /* If all tuples will be retrieved, just return the cheapest-total path */ + if (tuple_fraction <= 0.0) + return best_path; - /* Allow hashing only if hashtable is predicted to fit in work_mem */ - allow_hash = (hashentrysize * numDistinctRows <= work_mem * 1024L); - } + /* Convert absolute # of tuples to a fraction; no need to clamp to 0..1 */ + if (tuple_fraction >= 1.0 && best_path->rows > 0) + tuple_fraction /= best_path->rows; - if (allow_hash && grouping_is_hashable(parse->distinctClause)) + foreach(l, rel->pathlist) { - Path *input_path = cheapest_input_path; - - /* If needed, inject RemoteSubplan redistributing the data. */ - if (!grouping_distribution_match(root, parse, input_path, parse->distinctClause)) - input_path = create_remotesubplan_path(root, input_path, NULL); - - /* XXX Maybe we can make this a 2-phase aggregate too? */ - - /* Generate hashed aggregate path --- no sort needed */ - add_path(distinct_rel, (Path *) - create_agg_path(root, - distinct_rel, - input_path, - input_path->pathtarget, - AGG_HASHED, - AGGSPLIT_SIMPLE, - parse->distinctClause, - NIL, - NULL, - numDistinctRows)); - } - - /* Give a helpful error if we failed to find any implementation */ - if (distinct_rel->pathlist == NIL) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("could not implement DISTINCT"), - errdetail("Some of the datatypes only support hashing, while others only support sorting."))); - - /* - * If there is an FDW that's responsible for all baserels of the query, - * let it consider adding ForeignPaths. 
- */ - if (distinct_rel->fdwroutine && - distinct_rel->fdwroutine->GetForeignUpperPaths) - distinct_rel->fdwroutine->GetForeignUpperPaths(root, UPPERREL_DISTINCT, - input_rel, distinct_rel); + Path *path = (Path *) lfirst(l); - /* Let extensions possibly add some more paths */ - if (create_upper_paths_hook) - (*create_upper_paths_hook) (root, UPPERREL_DISTINCT, - input_rel, distinct_rel); + if (path == rel->cheapest_total_path || + compare_fractional_path_costs(best_path, path, tuple_fraction) <= 0) + continue; - /* Now choose the best path(s) */ - set_cheapest(distinct_rel); + best_path = path; + } - return distinct_rel; + return best_path; } /* - * create_ordered_paths - * - * Build a new upperrel containing Paths for ORDER BY evaluation. + * adjust_paths_for_srfs + * Fix up the Paths of the given upperrel to handle tSRFs properly. * - * All paths in the result must satisfy the ORDER BY ordering. - * The only new path we need consider is an explicit sort on the - * cheapest-total existing path. + * The executor can only handle set-returning functions that appear at the + * top level of the targetlist of a ProjectSet plan node. If we have any SRFs + * that are not at top level, we need to split up the evaluation into multiple + * plan levels in which each level satisfies this constraint. This function + * modifies each Path of an upperrel that (might) compute any SRFs in its + * output tlist to insert appropriate projection steps. * - * input_rel: contains the source-data Paths - * target: the output tlist the result Paths must emit - * limit_tuples: estimated bound on the number of output tuples, - * or -1 if no LIMIT or couldn't estimate + * The given targets and targets_contain_srfs lists are from + * split_pathtarget_at_srfs(). We assume the existing Paths emit the first + * target in targets. */ -static RelOptInfo * -create_ordered_paths(PlannerInfo *root, - RelOptInfo *input_rel, - PathTarget *target, - double limit_tuples) -{// #lizard forgives - Path *cheapest_input_path = input_rel->cheapest_total_path; - RelOptInfo *ordered_rel; +static void +adjust_paths_for_srfs(PlannerInfo *root, RelOptInfo *rel, + List *targets, List *targets_contain_srfs) +{ ListCell *lc; - /* For now, do all work in the (ORDERED, NULL) upperrel */ - ordered_rel = fetch_upper_rel(root, UPPERREL_ORDERED, NULL); + Assert(list_length(targets) == list_length(targets_contain_srfs)); + Assert(!linitial_int(targets_contain_srfs)); - /* - * If the input relation is not parallel-safe, then the ordered relation - * can't be parallel-safe, either. Otherwise, it's parallel-safe if the - * target list is parallel-safe. - */ - if (input_rel->consider_parallel && - is_parallel_safe(root, (Node *) target->exprs)) - ordered_rel->consider_parallel = true; + /* If no SRFs appear at this plan level, nothing to do */ + if (list_length(targets) == 1) + return; /* - * If the input rel belongs to a single FDW, so does the ordered_rel. + * Stack SRF-evaluation nodes atop each path for the rel. + * + * In principle we should re-run set_cheapest() here to identify the + * cheapest path, but it seems unlikely that adding the same tlist eval + * costs to all the paths would change that, so we don't bother. Instead, + * just assume that the cheapest-startup and cheapest-total paths remain + * so. (There should be no parameterized paths anymore, so we needn't + * worry about updating cheapest_parameterized_paths.) 
*/ - ordered_rel->serverid = input_rel->serverid; - ordered_rel->userid = input_rel->userid; - ordered_rel->useridiscurrent = input_rel->useridiscurrent; - ordered_rel->fdwroutine = input_rel->fdwroutine; - - foreach(lc, input_rel->pathlist) + foreach(lc, rel->pathlist) { - Path *path = (Path *) lfirst(lc); - bool is_sorted; + Path *subpath = (Path *) lfirst(lc); + Path *newpath = subpath; + ListCell *lc1, + *lc2; - is_sorted = pathkeys_contained_in(root->sort_pathkeys, - path->pathkeys); - if (path == cheapest_input_path || is_sorted) - { - if (!is_sorted) + Assert(subpath->param_info == NULL); + forboth(lc1, targets, lc2, targets_contain_srfs) { - /* An explicit sort here can take advantage of LIMIT */ - path = (Path *) create_sort_path(root, - ordered_rel, - path, - root->sort_pathkeys, - limit_tuples); - } - - /* Add projection step if needed */ - if (path->pathtarget != target) - path = apply_projection_to_path(root, ordered_rel, - path, target); - - add_path(ordered_rel, path); + PathTarget *thistarget = (PathTarget *) lfirst(lc1); + bool contains_srfs = (bool) lfirst_int(lc2); + + /* If this level doesn't contain SRFs, do regular projection */ + if (contains_srfs) + newpath = (Path *) create_set_projection_path(root, + rel, + newpath, + thistarget); + else + newpath = (Path *) apply_projection_to_path(root, + rel, + newpath, + thistarget); } + lfirst(lc) = newpath; + if (subpath == rel->cheapest_startup_path) + rel->cheapest_startup_path = newpath; + if (subpath == rel->cheapest_total_path) + rel->cheapest_total_path = newpath; } - /* - * generate_gather_paths() will have already generated a simple Gather - * path for the best parallel path, if any, and the loop above will have - * considered sorting it. Similarly, generate_gather_paths() will also - * have generated order-preserving Gather Merge plans which can be used - * without sorting if they happen to match the sort_pathkeys, and the loop - * above will have handled those as well. However, there's one more - * possibility: it may make sense to sort the cheapest partial path - * according to the required output order and then use Gather Merge. - */ - if (ordered_rel->consider_parallel && root->sort_pathkeys != NIL && - input_rel->partial_pathlist != NIL) + /* Likewise for partial paths, if any */ + foreach(lc, rel->partial_pathlist) { - Path *cheapest_partial_path; + Path *subpath = (Path *) lfirst(lc); + Path *newpath = subpath; + ListCell *lc1, + *lc2; - cheapest_partial_path = linitial(input_rel->partial_pathlist); - - /* - * If cheapest partial path doesn't need a sort, this is redundant - * with what's already been tried. 
- */ - if (!pathkeys_contained_in(root->sort_pathkeys, - cheapest_partial_path->pathkeys)) + Assert(subpath->param_info == NULL); + forboth(lc1, targets, lc2, targets_contain_srfs) { - Path *path; - double total_groups; - - path = (Path *) create_sort_path(root, - ordered_rel, - cheapest_partial_path, - root->sort_pathkeys, - -1.0); - - total_groups = cheapest_partial_path->rows * - cheapest_partial_path->parallel_workers; - path = (Path *) - create_gather_merge_path(root, ordered_rel, - path, - target, root->sort_pathkeys, NULL, - &total_groups); - - /* Add projection step if needed */ - if (path->pathtarget != target) - path = apply_projection_to_path(root, ordered_rel, - path, target); - - add_path(ordered_rel, path); + PathTarget *thistarget = (PathTarget *) lfirst(lc1); + bool contains_srfs = (bool) lfirst_int(lc2); + + /* If this level doesn't contain SRFs, do regular projection */ + if (contains_srfs) + newpath = (Path *) create_set_projection_path(root, + rel, + newpath, + thistarget); + else + { + /* avoid apply_projection_to_path, in case of multiple refs */ + newpath = (Path *) create_projection_path(root, + rel, + newpath, + thistarget); + } + } + lfirst(lc) = newpath; } } /* - * If there is an FDW that's responsible for all baserels of the query, - * let it consider adding ForeignPaths. + * expression_planner + * Perform planner's transformations on a standalone expression. + * + * Various utility commands need to evaluate expressions that are not part + * of a plannable query. They can do so using the executor's regular + * expression-execution machinery, but first the expression has to be fed + * through here to transform it from parser output to something executable. + * + * Currently, we disallow sublinks in standalone expressions, so there's no + * real "planning" involved here. (That might not always be true though.) + * What we must do is run eval_const_expressions to ensure that any function + * calls are converted to positional notation and function default arguments + * get inserted. The fact that constant subexpressions get simplified is a + * side-effect that is useful when the expression will get evaluated more than + * once. Also, we must fix operator function IDs. + * + * Note: this must not make any damaging changes to the passed-in expression + * tree. (It would actually be okay to apply fix_opfuncids to it, but since + * we first do an expression_tree_mutator-based walk, what is returned will + * be a new node tree.) */ - if (ordered_rel->fdwroutine && - ordered_rel->fdwroutine->GetForeignUpperPaths) - ordered_rel->fdwroutine->GetForeignUpperPaths(root, UPPERREL_ORDERED, - input_rel, ordered_rel); - - /* Let extensions possibly add some more paths */ - if (create_upper_paths_hook) - (*create_upper_paths_hook) (root, UPPERREL_ORDERED, - input_rel, ordered_rel); +Expr * +expression_planner(Expr *expr) +{ + Node *result; /* - * No need to bother with set_cheapest here; grouping_planner does not - * need us to do it. + * Convert named-argument function calls, insert default arguments and + * simplify constant subexprs */ - Assert(ordered_rel->pathlist != NIL); + result = eval_const_expressions(NULL, (Node *) expr); - return ordered_rel; + /* Fill in opfuncid values if missing */ + fix_opfuncids(result); + + return (Expr *) result; } /* - * make_group_input_target - * Generate appropriate PathTarget for initial input to grouping nodes. 
- * - * If there is grouping or aggregation, the scan/join subplan cannot emit - * the query's final targetlist; for example, it certainly can't emit any - * aggregate function calls. This routine generates the correct target - * for the scan/join subplan. - * - * The query target list passed from the parser already contains entries - * for all ORDER BY and GROUP BY expressions, but it will not have entries - * for variables used only in HAVING clauses; so we need to add those - * variables to the subplan target list. Also, we flatten all expressions - * except GROUP BY items into their component variables; other expressions - * will be computed by the upper plan nodes rather than by the subplan. - * For example, given a query like - * SELECT a+b,SUM(c+d) FROM table GROUP BY a+b; - * we want to pass this targetlist to the subplan: - * a+b,c,d - * where the a+b target will be used by the Sort/Group steps, and the - * other targets will be used for computing the final results. + * plan_cluster_use_sort + * Use the planner to decide how CLUSTER should implement sorting * - * 'final_target' is the query's final target list (in PathTarget form) + * tableOid is the OID of a table to be clustered on its index indexOid + * (which is already known to be a btree index). Decide whether it's + * cheaper to do an indexscan or a seqscan-plus-sort to execute the CLUSTER. + * Return TRUE to use sorting, FALSE to use an indexscan. * - * The result is the PathTarget to be computed by the Paths returned from - * query_planner(). + * Note: caller had better already hold some type of lock on the table. */ -static PathTarget * -make_group_input_target(PlannerInfo *root, PathTarget *final_target) +bool +plan_cluster_use_sort(Oid tableOid, Oid indexOid) { - Query *parse = root->parse; - PathTarget *input_target; - List *non_group_cols; - List *non_group_vars; - int i; + PlannerInfo *root; + Query *query; + PlannerGlobal *glob; + RangeTblEntry *rte; + RelOptInfo *rel; + IndexOptInfo *indexInfo; + QualCost indexExprCost; + Cost comparisonCost; + Path *seqScanPath; + Path seqScanAndSortPath; + IndexPath *indexScanPath; ListCell *lc; - /* - * We must build a target containing all grouping columns, plus any other - * Vars mentioned in the query's targetlist and HAVING qual. - */ - input_target = create_empty_pathtarget(); - non_group_cols = NIL; - - i = 0; - foreach(lc, final_target->exprs) - { - Expr *expr = (Expr *) lfirst(lc); - Index sgref = get_pathtarget_sortgroupref(final_target, i); - - if (sgref && parse->groupClause && - get_sortgroupref_clause_noerr(sgref, parse->groupClause) != NULL) + /* We can short-circuit the cost comparison if indexscans are disabled */ + if (!enable_indexscan) + return true; /* use sort */ + + /* Set up mostly-dummy planner state */ + query = makeNode(Query); + query->commandType = CMD_SELECT; + + glob = makeNode(PlannerGlobal); + + root = makeNode(PlannerInfo); + root->parse = query; + root->glob = glob; + root->query_level = 1; + root->planner_cxt = CurrentMemoryContext; + root->wt_param_id = -1; + root->recursiveOk = true; + + /* Build a minimal RTE for the rel */ + rte = makeNode(RangeTblEntry); + rte->rtekind = RTE_RELATION; + rte->relid = tableOid; + rte->relkind = RELKIND_RELATION; /* Don't be too picky. 
*/ + rte->lateral = false; + rte->inh = false; + rte->inFromCl = true; + query->rtable = list_make1(rte); + + /* Set up RTE/RelOptInfo arrays */ + setup_simple_rel_arrays(root); + + /* Build RelOptInfo */ + rel = build_simple_rel(root, 1, NULL); + + /* Locate IndexOptInfo for the target index */ + indexInfo = NULL; + foreach(lc, rel->indexlist) { - /* - * It's a grouping column, so add it to the input target as-is. - */ - add_column_to_pathtarget(input_target, expr, sgref); + indexInfo = (IndexOptInfo *) lfirst(lc); + if (indexInfo->indexoid == indexOid) + break; } - else - { + /* - * Non-grouping column, so just remember the expression for later - * call to pull_var_clause. + * It's possible that get_relation_info did not generate an IndexOptInfo + * for the desired index; this could happen if it's not yet reached its + * indcheckxmin usability horizon, or if it's a system index and we're + * ignoring system indexes. In such cases we should tell CLUSTER to not + * trust the index contents but use seqscan-and-sort. */ - non_group_cols = lappend(non_group_cols, expr); - } - - i++; - } + if (lc == NULL) /* not in the list? */ + return true; /* use sort */ /* - * If there's a HAVING clause, we'll need the Vars it uses, too. + * Rather than doing all the pushups that would be needed to use + * set_baserel_size_estimates, just do a quick hack for rows and width. */ - if (parse->havingQual) - non_group_cols = lappend(non_group_cols, parse->havingQual); + rel->rows = rel->tuples; + rel->reltarget->width = get_relation_data_width(tableOid, NULL); + + root->total_table_pages = rel->pages; /* - * Pull out all the Vars mentioned in non-group cols (plus HAVING), and - * add them to the input target if not already present. (A Var used - * directly as a GROUP BY item will be present already.) Note this - * includes Vars used in resjunk items, so we are covering the needs of - * ORDER BY and window specifications. Vars used within Aggrefs and - * WindowFuncs will be pulled out here, too. + * Determine eval cost of the index expressions, if any. We need to + * charge twice that amount for each tuple comparison that happens during + * the sort, since tuplesort.c will have to re-evaluate the index + * expressions each time. (XXX that's pretty inefficient...) */ - non_group_vars = pull_var_clause((Node *) non_group_cols, - PVC_RECURSE_AGGREGATES | - PVC_RECURSE_WINDOWFUNCS | - PVC_INCLUDE_PLACEHOLDERS); - add_new_columns_to_pathtarget(input_target, non_group_vars); - - /* clean up cruft */ - list_free(non_group_vars); - list_free(non_group_cols); - - /* XXX this causes some redundant cost calculation ... */ - return set_pathtarget_cost_width(root, input_target); + cost_qual_eval(&indexExprCost, indexInfo->indexprs, root); + comparisonCost = 2.0 * (indexExprCost.startup + indexExprCost.per_tuple); + + /* Estimate the cost of seq scan + sort */ + seqScanPath = create_seqscan_path(root, rel, NULL, 0); + cost_sort(&seqScanAndSortPath, root, NIL, + seqScanPath->total_cost, rel->tuples, rel->reltarget->width, + comparisonCost, maintenance_work_mem, -1.0); + + /* Estimate the cost of index scan */ + indexScanPath = create_index_path(root, indexInfo, + NIL, NIL, NIL, NIL, NIL, + ForwardScanDirection, false, + NULL, 1.0, false); + + return (seqScanAndSortPath.total_cost < indexScanPath->path.total_cost); } + /* - * make_partial_grouping_target - * Generate appropriate PathTarget for output of partial aggregate - * (or partial grouping, if there are no aggregates) nodes. 
+ * grouping_distribution_match + * Check if the path distribution matches grouping distribution. * - * A partial aggregation node needs to emit all the same aggregates that - * a regular aggregation node would, plus any aggregates used in HAVING; - * except that the Aggref nodes should be marked as partial aggregates. + * Grouping preserves distribution if the distribution key is on of the + * grouping keys (arbitrary one). In that case it's guaranteed that groups + * on different nodes do not overlap, and we can push the aggregation to + * remote nodes as a whole. * - * In addition, we'd better emit any Vars and PlaceholderVars that are - * used outside of Aggrefs in the aggregation tlist and HAVING. (Presumably, - * these would be Vars that are grouped by or used in grouping expressions.) + * Otherwise we need to either fetch all the data to the coordinator and + * perform the aggregation there, or use two-phase aggregation, with the + * first phase (partial aggregation) pushed down, and the second phase + * (combining and finalizing the results) executed on the coordinator. * - * grouping_target is the tlist to be emitted by the topmost aggregation step. - * We get the HAVING clause out of *root. + * XXX This is used not only for plain aggregation, but also for various + * other paths, relying on grouping infrastructure (DISTINCT ON, UNIQUE). */ -static PathTarget * -make_partial_grouping_target(PlannerInfo *root, PathTarget *grouping_target) +static bool +grouping_distribution_match(PlannerInfo *root, Query *parse, Path *path, + List *clauses) { - Query *parse = root->parse; - PathTarget *partial_target; - List *non_group_cols; - List *non_group_exprs; int i; - ListCell *lc; + bool matches_key = false; + Distribution *distribution = path->distribution; - partial_target = create_empty_pathtarget(); - non_group_cols = NIL; - - i = 0; - foreach(lc, grouping_target->exprs) - { - Expr *expr = (Expr *) lfirst(lc); - Index sgref = get_pathtarget_sortgroupref(grouping_target, i); + int numGroupCols = list_length(clauses); + AttrNumber *groupColIdx = extract_grouping_cols(clauses, + parse->targetList); - if (sgref && parse->groupClause && - get_sortgroupref_clause_noerr(sgref, parse->groupClause) != NULL) +#ifdef __COLD_HOT__ + if (has_cold_hot_table) { - /* - * It's a grouping column, so add it to the partial_target as-is. - * (This allows the upper agg step to repeat the grouping calcs.) - */ - add_column_to_pathtarget(partial_target, expr, sgref); - } - else + if (! path->distribution) { - /* - * Non-grouping column, so just remember the expression for later - * call to pull_var_clause. - */ - non_group_cols = lappend(non_group_cols, expr); + return true; } - i++; + return false; } +#endif /* - * If there's a HAVING clause, we'll need the Vars/Aggrefs it uses, too. - */ - if (parse->havingQual) - non_group_cols = lappend(non_group_cols, parse->havingQual); - - /* - * Pull out all the Vars, PlaceHolderVars, and Aggrefs mentioned in - * non-group cols (plus HAVING), and add them to the partial_target if not - * already present. (An expression used directly as a GROUP BY item will - * be present already.) Note this includes Vars used in resjunk items, so - * we are covering the needs of ORDER BY and window specifications. + * With no explicit data distribution or replicated tables, we can simply + * push down the whole aggregation to the remote node, without any sort + * of redistribution. So consider this to be a match. 
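[Illustration, not part of the patch: the rule described above, that the whole aggregation can be pushed down when the table's distribution key is one of the grouping keys and must otherwise be redistributed or done in two phases, reduces to a membership test. The standalone sketch below uses strings as stand-ins for expression trees; the function and column names are invented.]

/*
 * Illustrative sketch: the aggregation can be fully pushed down when the
 * distribution expression matches one of the grouping keys; otherwise a
 * group may span nodes and a redistribution (or two-phase aggregate) is
 * needed.
 */
#include <stdio.h>
#include <string.h>

static int
distribution_matches_grouping(const char *distkey,
                              const char *const *groupkeys, int nkeys)
{
    int         i;

    for (i = 0; i < nkeys; i++)
        if (strcmp(distkey, groupkeys[i]) == 0)
            return 1;
    return 0;
}

int
main(void)
{
    const char *groupkeys[] = {"dept_id", "year"};

    /* distributed by dept_id: every group lives on a single node */
    printf("%d\n", distribution_matches_grouping("dept_id", groupkeys, 2));
    /* distributed by emp_id: groups can span nodes, no full push-down */
    printf("%d\n", distribution_matches_grouping("emp_id", groupkeys, 2));
    return 0;
}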
*/ - non_group_exprs = pull_var_clause((Node *) non_group_cols, - PVC_INCLUDE_AGGREGATES | - PVC_RECURSE_WINDOWFUNCS | - PVC_INCLUDE_PLACEHOLDERS); + if ((distribution == NULL) || + IsLocatorReplicated(distribution->distributionType)) + return true; - add_new_columns_to_pathtarget(partial_target, non_group_exprs); + /* But no distribution expression means 'no match'. */ + if (distribution->distributionExpr == NULL) + return false; /* - * Adjust Aggrefs to put them in partial mode. At this point all Aggrefs - * are at the top level of the target list, so we can just scan the list - * rather than recursing through the expression trees. + * With distributed data and table distributed using an expression, we + * need to check if the distribution expression matches one of the + * grouping keys (arbitrary one). */ - foreach(lc, partial_target->exprs) + for (i = 0; i < numGroupCols; i++) { - Aggref *aggref = (Aggref *) lfirst(lc); + TargetEntry *te = (TargetEntry *)list_nth(parse->targetList, + groupColIdx[i]-1); - if (IsA(aggref, Aggref)) + if (equal(te->expr, distribution->distributionExpr)) { - Aggref *newaggref; - - /* - * We shouldn't need to copy the substructure of the Aggref node, - * but flat-copy the node itself to avoid damaging other trees. - */ - newaggref = makeNode(Aggref); - memcpy(newaggref, aggref, sizeof(Aggref)); - - /* For now, assume serialization is required */ - mark_partial_aggref(newaggref, AGGSPLIT_INITIAL_SERIAL); - - lfirst(lc) = newaggref; + matches_key = true; + break; } } - /* clean up cruft */ - list_free(non_group_exprs); - list_free(non_group_cols); - - /* XXX this causes some redundant cost calculation ... */ - return set_pathtarget_cost_width(root, partial_target); + return matches_key; } /* - * mark_partial_aggref - * Adjust an Aggref to make it represent a partial-aggregation step. + * get_partitioned_child_rels + * Returns a list of the RT indexes of the partitioned child relations + * with rti as the root parent RT index. Also sets + * *part_cols_updated to true if any of the root rte's updated + * columns is used in the partition key either of the relation whose RTI + * is specified or of any child relation. * - * The Aggref node is modified in-place; caller must do any copying required. + * Note: This function might get called even for range table entries that + * are not partitioned tables; in such a case, it will simply return NIL. */ -void -mark_partial_aggref(Aggref *agg, AggSplit aggsplit) +List * +get_partitioned_child_rels(PlannerInfo *root, Index rti, + bool *part_cols_updated) { - /* aggtranstype should be computed by this point */ - Assert(OidIsValid(agg->aggtranstype)); - /* ... but aggsplit should still be as the parser left it */ - Assert(agg->aggsplit == AGGSPLIT_SIMPLE); + List *result = NIL; + ListCell *l; - /* Mark the Aggref with the intended partial-aggregation mode */ - agg->aggsplit = aggsplit; + if (part_cols_updated) + *part_cols_updated = false; - /* - * Adjust result type if needed. Normally, a partial aggregate returns - * the aggregate's transition type; but if that's INTERNAL and we're - * serializing, it returns BYTEA instead. 
- */ - if (DO_AGGSPLIT_SKIPFINAL(aggsplit)) + foreach(l, root->pcinfo_list) { - if (agg->aggtranstype == INTERNALOID && DO_AGGSPLIT_SERIALIZE(aggsplit)) - agg->aggtype = BYTEAOID; - else - agg->aggtype = agg->aggtranstype; + PartitionedChildRelInfo *pc = lfirst(l); + + if (pc->parent_relid == rti) + { + result = pc->child_rels; + if (part_cols_updated) + *part_cols_updated = pc->part_cols_updated; + break; } } + return result; +} + + /* - * postprocess_setop_tlist - * Fix up targetlist returned by plan_set_operations(). - * - * We need to transpose sort key info from the orig_tlist into new_tlist. - * NOTE: this would not be good enough if we supported resjunk sort keys - * for results of set operations --- then, we'd need to project a whole - * new tlist to evaluate the resjunk columns. For now, just ereport if we - * find any resjunk columns in orig_tlist. + * get_partitioned_child_rels_for_join + * Build and return a list containing the RTI of every partitioned + * relation which is a child of some rel included in the join. */ -static List * -postprocess_setop_tlist(List *new_tlist, List *orig_tlist) +List * +get_partitioned_child_rels_for_join(PlannerInfo *root, Relids join_relids) { + List *result = NIL; ListCell *l; - ListCell *orig_tlist_item = list_head(orig_tlist); - foreach(l, new_tlist) + foreach(l, root->pcinfo_list) { - TargetEntry *new_tle = (TargetEntry *) lfirst(l); - TargetEntry *orig_tle; - - /* ignore resjunk columns in setop result */ - if (new_tle->resjunk) - continue; + PartitionedChildRelInfo *pc = lfirst(l); - Assert(orig_tlist_item != NULL); - orig_tle = (TargetEntry *) lfirst(orig_tlist_item); - orig_tlist_item = lnext(orig_tlist_item); - if (orig_tle->resjunk) /* should not happen */ - elog(ERROR, "resjunk output columns are not implemented"); - Assert(new_tle->resno == orig_tle->resno); - new_tle->ressortgroupref = orig_tle->ressortgroupref; + if (bms_is_member(pc->parent_relid, join_relids)) + result = list_concat(result, list_copy(pc->child_rels)); } - if (orig_tlist_item != NULL) - elog(ERROR, "resjunk output columns are not implemented"); - return new_tlist; + + return result; } /* - * select_active_windows - * Create a list of the "active" window clauses (ie, those referenced - * by non-deleted WindowFuncs) in the order they are to be executed. + * add_paths_to_grouping_rel + * + * Add non-partial paths to grouping relation. */ -static List * -select_active_windows(PlannerInfo *root, WindowFuncLists *wflists) +static void +add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel, + RelOptInfo *grouped_rel, PathTarget *target, + PathTarget *partial_grouping_target, + const AggClauseCosts *agg_costs, + const AggClauseCosts *agg_final_costs, + grouping_sets_data *gd, bool can_sort, bool can_hash, + double dNumGroups, List *havingQual) { - List *result; - List *actives; + Query *parse = root->parse; + Path *cheapest_path = input_rel->cheapest_total_path; ListCell *lc; - /* First, make a list of the active windows */ - actives = NIL; - foreach(lc, root->parse->windowClause) - { - WindowClause *wc = (WindowClause *) lfirst(lc); - - /* It's only active if wflists shows some related WindowFuncs */ - Assert(wc->winref <= wflists->maxWinRef); - if (wflists->windowFuncs[wc->winref] != NIL) - actives = lappend(actives, wc); - } - - /* - * Now, ensure that windows with identical partitioning/ordering clauses - * are adjacent in the list. 
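A self-contained analogue of the push-down test in grouping_distribution_match() above, with plain column numbers standing in for the expression trees the real code compares via equal(); names are illustrative only. The point is that the aggregation can run entirely on the datanodes when the data is replicated or the distribution key is one of the grouping keys, because no group can then span nodes. (The real function additionally returns false when a distribution exists but has no distribution expression.)

#include <stdbool.h>

/* dist_attno < 0 stands for "replicated or no explicit distribution". */
static bool
grouping_matches_distribution(int dist_attno, const int *group_attnos, int ngroups)
{
    int i;

    if (dist_attno < 0)
        return true;            /* whole aggregate can be pushed down */

    for (i = 0; i < ngroups; i++)
    {
        if (group_attnos[i] == dist_attno)
            return true;        /* groups on different nodes cannot overlap */
    }

    return false;               /* needs redistribution or two-phase aggregation */
}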
This is required by the SQL standard, which - * says that only one sort is to be used for such windows, even if they - * are otherwise distinct (eg, different names or framing clauses). - * - * There is room to be much smarter here, for example detecting whether - * one window's sort keys are a prefix of another's (so that sorting for - * the latter would do for the former), or putting windows first that - * match a sort order available for the underlying query. For the moment - * we are content with meeting the spec. - */ - result = NIL; - while (actives != NIL) + if (can_sort) { - WindowClause *wc = (WindowClause *) linitial(actives); - ListCell *prev; - ListCell *next; - - /* Move wc from actives to result */ - actives = list_delete_first(actives); - result = lappend(result, wc); - - /* Now move any matching windows from actives to result */ - prev = NULL; - for (lc = list_head(actives); lc; lc = next) + /* + * Use any available suitably-sorted path as input, and also consider + * sorting the cheapest-total path. + */ + foreach(lc, input_rel->pathlist) { - WindowClause *wc2 = (WindowClause *) lfirst(lc); - - next = lnext(lc); - /* framing options are NOT to be compared here! */ - if (equal(wc->partitionClause, wc2->partitionClause) && - equal(wc->orderClause, wc2->orderClause)) + Path *path = (Path *) lfirst(lc); + bool is_sorted; + + is_sorted = pathkeys_contained_in(root->group_pathkeys, + path->pathkeys); + + /* + * XL: Can it happen that the cheapest path can't be pushed down, + * while some other path could be? Perhaps we should move the check + * if a path can be pushed down up, and add another OR condition + * to consider all paths that can be pushed down? + * + * if (path == cheapest_path || is_sorted || can_push_down) + */ + if (path == cheapest_path || is_sorted) { - actives = list_delete_cell(actives, lc, prev); - result = lappend(result, wc2); - } - else - prev = lc; - } - } +#ifdef __TBASE__ + bool try_redistribute_grouping = false; + PathTarget * local_grouping_target = make_partial_grouping_target(root, target); - return result; -} + /* Estimate number of partial groups. */ + double dNumLocalGroups = get_number_of_groups(root, + cheapest_path->rows, + gd); +#endif +#ifdef __TBASE__ + if (olap_optimizer && !has_cold_hot_table) + { + if (!is_sorted && !agg_costs->hasOnlyDistinct) + path = (Path *) create_sort_path(root, + grouped_rel, + path, + root->group_pathkeys, + -1.0); + } + else + { +#endif + /* Sort the cheapest-total path if it isn't already sorted */ + if (!is_sorted) + path = (Path *) create_sort_path(root, + grouped_rel, + path, + root->group_pathkeys, + -1.0); +#ifdef __TBASE__ + } +#endif /* - * make_window_input_target - * Generate appropriate PathTarget for initial input to WindowAgg nodes. - * - * When the query has window functions, this function computes the desired - * target to be computed by the node just below the first WindowAgg. - * This tlist must contain all values needed to evaluate the window functions, - * compute the final target list, and perform any required final sort step. - * If multiple WindowAggs are needed, each intermediate one adds its window - * function results onto this base tlist; only the topmost WindowAgg computes - * the actual desired target list. - * - * This function is much like make_group_input_target, though not quite enough - * like it to share code. As in that function, we flatten most expressions - * into their component variables. 
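For readers outside the planner: pathkeys_contained_in(), used in the hunk above, is a prefix test, which is why only two kinds of input paths are worth considering there: paths already sorted by the grouping keys, and the cheapest-total path (the best candidate to sort explicitly). A rough, self-contained analogue of the containment test, using integers in place of PathKey nodes:

#include <stdbool.h>

/* required[] must appear, in order, as a prefix of provided[]. */
static bool
keys_contained_in(const int *required, int nrequired,
                  const int *provided, int nprovided)
{
    int i;

    if (nrequired > nprovided)
        return false;

    for (i = 0; i < nrequired; i++)
    {
        if (required[i] != provided[i])
            return false;
    }

    return true;
}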
But we do not want to flatten window - * PARTITION BY/ORDER BY clauses, since that might result in multiple - * evaluations of them, which would be bad (possibly even resulting in - * inconsistent answers, if they contain volatile functions). - * Also, we must not flatten GROUP BY clauses that were left unflattened by - * make_group_input_target, because we may no longer have access to the - * individual Vars in them. - * - * Another key difference from make_group_input_target is that we don't - * flatten Aggref expressions, since those are to be computed below the - * window functions and just referenced like Vars above that. - * - * 'final_target' is the query's final target list (in PathTarget form) - * 'activeWindows' is the list of active windows previously identified by - * select_active_windows. - * - * The result is the PathTarget to be computed by the plan node immediately - * below the first WindowAgg node. + * If the grouping can't be fully pushed down, redistribute the + * path on top of the (sorted) path. If if can be pushed down, + * disable construction of complex distributed paths. */ -static PathTarget * -make_window_input_target(PlannerInfo *root, - PathTarget *final_target, - List *activeWindows) -{ - Query *parse = root->parse; - PathTarget *input_target; - Bitmapset *sgrefs; - List *flattenable_cols; - List *flattenable_vars; - int i; - ListCell *lc; + if (! can_push_down_grouping(root, parse, path)) +#ifdef __TBASE__ + { + /* some special aggs cannot be parallel executed, such as count(distinct) */ + if(agg_costs->hasNonPartial || agg_costs->hasNonSerial || + parse->groupingSets || path->pathtype == T_Agg || + path->pathtype == T_Group || !olap_optimizer || has_cold_hot_table) + { + if (agg_costs->hasOnlyDistinct && olap_optimizer && !parse->groupingSets + && !has_cold_hot_table) + path = create_redistribute_grouping_path(root, parse, path); + else + path = create_remotesubplan_path(root, path, NULL); - Assert(parse->hasWindowFuncs); + if (agg_costs->hasOnlyDistinct && olap_optimizer && + !has_cold_hot_table) + { + if (root->group_pathkeys) + { + path = (Path *) create_sort_path(root, + grouped_rel, + path, + root->group_pathkeys, + -1.0); + } + } + } + else + { + /* + * If the grouping can not be fully pushed down, we adopt another + * strategy instead. + * 1. do grouping on each datanode locally + * 2. re-distribute grouping results among datanodes, then do the + * final grouping + */ + + try_redistribute_grouping = true; + + /* step 1 */ + if (parse->groupingSets) + { + /* + * TODO 2-phase aggregation for grouping sets paths not + * supported yet, but this the place where such paths + * should be constructed. + */ + } + else if (parse->hasAggs) + { + /* + * We have aggregation, possibly with plain GROUP BY. Make + * an AggPath. + */ - /* - * Collect the sortgroupref numbers of window PARTITION/ORDER BY clauses - * into a bitmapset for convenient reference below. - */ - sgrefs = NULL; - foreach(lc, activeWindows) - { - WindowClause *wc = (WindowClause *) lfirst(lc); - ListCell *lc2; + path = (Path *) create_agg_path(root, + grouped_rel, + path, + local_grouping_target, + parse->groupClause ? AGG_SORTED : AGG_PLAIN, + AGGSPLIT_INITIAL_SERIAL, + parse->groupClause, + NIL, + &agg_partial_costs, + dNumLocalGroups); + } + else if (parse->groupClause) + { + /* + * We have GROUP BY without aggregation or grouping sets. + * Make a GroupPath. 
+ */ + path = (Path *) create_group_path(root, + grouped_rel, + path, + local_grouping_target, + parse->groupClause, + NIL, + dNumLocalGroups); + } + else + { + /* Other cases should have been handled above */ + Assert(false); + } - foreach(lc2, wc->partitionClause) - { - SortGroupClause *sortcl = (SortGroupClause *) lfirst(lc2); + /* step 2*/ + path = create_redistribute_grouping_path(root, parse, path); + } + } +#else + path = create_remotesubplan_path(root, path, NULL); +#endif - sgrefs = bms_add_member(sgrefs, sortcl->tleSortGroupRef); - } - foreach(lc2, wc->orderClause) - { - SortGroupClause *sortcl = (SortGroupClause *) lfirst(lc2); + else + try_distributed_aggregation = false; - sgrefs = bms_add_member(sgrefs, sortcl->tleSortGroupRef); - } - } +#ifdef __TBASE__ + if(try_redistribute_grouping) + { + /* + * do final grouping at each datanode + */ - /* Add in sortgroupref numbers of GROUP BY clauses, too */ - foreach(lc, parse->groupClause) - { - SortGroupClause *grpcl = (SortGroupClause *) lfirst(lc); + /* Now decide what to stick atop it */ + if (parse->groupingSets) + { + /* + * TODO 2-phase aggregation for grouping sets paths not + * supported yet, but this the place where such paths + * should be constructed. + */ + } + else if (parse->hasAggs) + { + /* + * We generate two paths, differing in the second phase + * implementation (sort and hash). + */ + Path *remote_path = path; - sgrefs = bms_add_member(sgrefs, grpcl->tleSortGroupRef); - } + if (parse->groupClause) + { + if (!is_sorted || root->group_pathkeys) + { + path = (Path *) create_sort_path(root, + grouped_rel, + path, + root->group_pathkeys, + -1.0); + } + } - /* - * Construct a target containing all the non-flattenable targetlist items, - * and save aside the others for a moment. - */ - input_target = create_empty_pathtarget(); - flattenable_cols = NIL; + path = (Path *)create_agg_path(root, + grouped_rel, + path, + target, + parse->groupClause ? AGG_SORTED : AGG_PLAIN, + AGGSPLIT_FINAL_DESERIAL, + parse->groupClause, + havingQual, + agg_final_costs, + dNumGroups); - i = 0; - foreach(lc, final_target->exprs) - { - Expr *expr = (Expr *) lfirst(lc); - Index sgref = get_pathtarget_sortgroupref(final_target, i); + //path->parallel_safe = true; - /* - * Don't want to deconstruct window clauses or GROUP BY items. (Note - * that such items can't contain window functions, so it's okay to - * compute them below the WindowAgg nodes.) - */ - if (sgref != 0 && bms_is_member(sgref, sgrefs)) - { - /* - * Don't want to deconstruct this value, so add it to the input - * target as-is. - */ - add_column_to_pathtarget(input_target, expr, sgref); - } - else - { - /* - * Column is to be flattened, so just remember the expression for - * later call to pull_var_clause. - */ - flattenable_cols = lappend(flattenable_cols, expr); - } + add_path(grouped_rel, path); - i++; - } + if (can_hash) + { + path = (Path *) + create_agg_path(root, + grouped_rel, + remote_path, + target, + AGG_HASHED, + AGGSPLIT_FINAL_DESERIAL, + parse->groupClause, + havingQual, + agg_final_costs, + dNumGroups); + //path->parallel_safe = true; + if (g_hybrid_hash_agg) + { + AggPath *agg = (AggPath *)path; + agg->hybrid = true; + } - /* - * Pull out all the Vars and Aggrefs mentioned in flattenable columns, and - * add them to the input target if not already present. (Some might be - * there already because they're used directly as window/group clauses.) 
- * - * Note: it's essential to use PVC_INCLUDE_AGGREGATES here, so that any - * Aggrefs are placed in the Agg node's tlist and not left to be computed - * at higher levels. On the other hand, we should recurse into - * WindowFuncs to make sure their input expressions are available. - */ - flattenable_vars = pull_var_clause((Node *) flattenable_cols, - PVC_INCLUDE_AGGREGATES | - PVC_RECURSE_WINDOWFUNCS | - PVC_INCLUDE_PLACEHOLDERS); - add_new_columns_to_pathtarget(input_target, flattenable_vars); - - /* clean up cruft */ - list_free(flattenable_vars); - list_free(flattenable_cols); - - /* XXX this causes some redundant cost calculation ... */ - return set_pathtarget_cost_width(root, input_target); -} + add_path(grouped_rel, path); + } + } + else if (parse->groupClause) + { + if (!is_sorted || root->group_pathkeys) + { + path = (Path *) create_sort_path(root, + grouped_rel, + path, + root->group_pathkeys, + -1.0); + } -/* - * make_pathkeys_for_window - * Create a pathkeys list describing the required input ordering - * for the given WindowClause. - * - * The required ordering is first the PARTITION keys, then the ORDER keys. - * In the future we might try to implement windowing using hashing, in which - * case the ordering could be relaxed, but for now we always sort. - * - * Caution: if you change this, see createplan.c's get_column_info_for_window! - */ -static List * -make_pathkeys_for_window(PlannerInfo *root, WindowClause *wc, - List *tlist) -{ - List *window_pathkeys; - List *window_sortclauses; + path = (Path *) + create_group_path(root, + grouped_rel, + path, + target, + parse->groupClause, + havingQual, + dNumGroups); - /* Throw error if can't sort */ - if (!grouping_is_sortable(wc->partitionClause)) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("could not implement window PARTITION BY"), - errdetail("Window partitioning columns must be of sortable datatypes."))); - if (!grouping_is_sortable(wc->orderClause)) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("could not implement window ORDER BY"), - errdetail("Window ordering columns must be of sortable datatypes."))); - - /* Okay, make the combined pathkeys */ - window_sortclauses = list_concat(list_copy(wc->partitionClause), - list_copy(wc->orderClause)); - window_pathkeys = make_pathkeys_for_sortclauses(root, - window_sortclauses, - tlist); - list_free(window_sortclauses); - return window_pathkeys; -} + //path->parallel_safe = true; -/* - * make_sort_input_target - * Generate appropriate PathTarget for initial input to Sort step. - * - * If the query has ORDER BY, this function chooses the target to be computed - * by the node just below the Sort (and DISTINCT, if any, since Unique can't - * project) steps. This might or might not be identical to the query's final - * output target. - * - * The main argument for keeping the sort-input tlist the same as the final - * is that we avoid a separate projection node (which will be needed if - * they're different, because Sort can't project). However, there are also - * advantages to postponing tlist evaluation till after the Sort: it ensures - * a consistent order of evaluation for any volatile functions in the tlist, - * and if there's also a LIMIT, we can stop the query without ever computing - * tlist functions for later rows, which is beneficial for both volatile and - * expensive functions. 
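Condensed sketch of the two-phase plan shape wired up above when the grouping cannot be pushed down whole. It reuses the variable names of add_paths_to_grouping_rel() and is illustrative rather than literal patch code; the real code also inserts Sort nodes where AGG_SORTED needs sorted input and falls back to a GroupPath when there is GROUP BY without aggregates.

/* phase 1: partial aggregation, executed locally on every datanode */
path = (Path *) create_agg_path(root, grouped_rel, path,
                                local_grouping_target,
                                parse->groupClause ? AGG_SORTED : AGG_PLAIN,
                                AGGSPLIT_INITIAL_SERIAL,
                                parse->groupClause, NIL,
                                &agg_partial_costs, dNumLocalGroups);

/* redistribute the partial results by the grouping key */
path = create_redistribute_grouping_path(root, parse, path);

/* phase 2: combine and finalize the partial results */
path = (Path *) create_agg_path(root, grouped_rel, path, target,
                                parse->groupClause ? AGG_SORTED : AGG_PLAIN,
                                AGGSPLIT_FINAL_DESERIAL,
                                parse->groupClause, havingQual,
                                agg_final_costs, dNumGroups);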
- * - * Our current policy is to postpone volatile expressions till after the sort - * unconditionally (assuming that that's possible, ie they are in plain tlist - * columns and not ORDER BY/GROUP BY/DISTINCT columns). We also prefer to - * postpone set-returning expressions, because running them beforehand would - * bloat the sort dataset, and because it might cause unexpected output order - * if the sort isn't stable. However there's a constraint on that: all SRFs - * in the tlist should be evaluated at the same plan step, so that they can - * run in sync in nodeProjectSet. So if any SRFs are in sort columns, we - * mustn't postpone any SRFs. (Note that in principle that policy should - * probably get applied to the group/window input targetlists too, but we - * have not done that historically.) Lastly, expensive expressions are - * postponed if there is a LIMIT, or if root->tuple_fraction shows that - * partial evaluation of the query is possible (if neither is true, we expect - * to have to evaluate the expressions for every row anyway), or if there are - * any volatile or set-returning expressions (since once we've put in a - * projection at all, it won't cost any more to postpone more stuff). - * - * Another issue that could potentially be considered here is that - * evaluating tlist expressions could result in data that's either wider - * or narrower than the input Vars, thus changing the volume of data that - * has to go through the Sort. However, we usually have only a very bad - * idea of the output width of any expression more complex than a Var, - * so for now it seems too risky to try to optimize on that basis. - * - * Note that if we do produce a modified sort-input target, and then the - * query ends up not using an explicit Sort, no particular harm is done: - * we'll initially use the modified target for the preceding path nodes, - * but then change them to the final target with apply_projection_to_path. - * Moreover, in such a case the guarantees about evaluation order of - * volatile functions still hold, since the rows are sorted already. - * - * This function has some things in common with make_group_input_target and - * make_window_input_target, though the detailed rules for what to do are - * different. We never flatten/postpone any grouping or ordering columns; - * those are needed before the sort. If we do flatten a particular - * expression, we leave Aggref and WindowFunc nodes alone, since those were - * computed earlier. - * - * 'final_target' is the query's final target list (in PathTarget form) - * 'have_postponed_srfs' is an output argument, see below - * - * The result is the PathTarget to be computed by the plan node immediately - * below the Sort step (and the Distinct step, if any). This will be - * exactly final_target if we decide a projection step wouldn't be helpful. - * - * In addition, *have_postponed_srfs is set to TRUE if we choose to postpone - * any set-returning functions to after the Sort. - */ -static PathTarget * -make_sort_input_target(PlannerInfo *root, - PathTarget *final_target, - bool *have_postponed_srfs) -{// #lizard forgives - Query *parse = root->parse; - PathTarget *input_target; - int ncols; - bool *col_is_srf; - bool *postpone_col; - bool have_srf; - bool have_volatile; - bool have_expensive; - bool have_srf_sortcols; - bool postpone_srfs; - List *postponable_cols; - List *postponable_vars; - int i; - ListCell *lc; + /* + * We have GROUP BY without aggregation or grouping sets. + * Make a GroupPath. 
+ */ + add_path(grouped_rel, path); - /* Shouldn't get here unless query has ORDER BY */ - Assert(parse->sortClause); + } + else + { + /* Other cases should have been handled above */ + Assert(false); + } + } + else + { +#endif - *have_postponed_srfs = false; /* default result */ + /* Now decide what to stick atop it */ + if (parse->groupingSets) + { + consider_groupingsets_paths(root, grouped_rel, + path, true, can_hash, target, + gd, agg_costs, dNumGroups); + } + else if (parse->hasAggs) + { +#ifdef __TBASE__ + bool parallel_aware = false; + bool parallel_safe = false; + Path *agg_path = NULL; - /* Inspect tlist and collect per-column information */ - ncols = list_length(final_target->exprs); - col_is_srf = (bool *) palloc0(ncols * sizeof(bool)); - postpone_col = (bool *) palloc0(ncols * sizeof(bool)); - have_srf = have_volatile = have_expensive = have_srf_sortcols = false; + if (root->group_pathkeys && olap_optimizer && + !has_cold_hot_table && agg_costs->hasOnlyDistinct) + { + if (!pathkeys_contained_in(root->group_pathkeys, + path->pathkeys)) + { + path = (Path *) create_sort_path(root, + grouped_rel, + path, + root->group_pathkeys, + -1.0); + } + } - i = 0; - foreach(lc, final_target->exprs) - { - Expr *expr = (Expr *) lfirst(lc); + if (path->pathtype == T_Sort && olap_optimizer && !has_cold_hot_table) + { + SortPath *pathnode = (SortPath *)path; + + if (pathnode->subpath->pathtype == T_Gather || agg_costs->hasOnlyDistinct) + { + path->parallel_aware = true; + parallel_aware = true; + parallel_safe = true; + } + } + + agg_path = (Path *) + create_agg_path(root, + grouped_rel, + path, + target, + parse->groupClause ? AGG_SORTED : AGG_PLAIN, + AGGSPLIT_SIMPLE, + parse->groupClause, + havingQual, + agg_costs, + dNumGroups); + agg_path->parallel_aware = parallel_aware; + agg_path->parallel_safe = parallel_safe; + + add_path(grouped_rel, agg_path); +#else + /* + * We have aggregation, possibly with plain GROUP BY. Make + * an AggPath. + */ + add_path(grouped_rel, (Path *) + create_agg_path(root, + grouped_rel, + path, + target, + parse->groupClause ? AGG_SORTED : AGG_PLAIN, + AGGSPLIT_SIMPLE, + parse->groupClause, + havingQual, + agg_costs, + dNumGroups)); +#endif + } + else if (parse->groupClause) + { +#ifdef __TBASE__ + bool parallel_aware = false; + bool parallel_safe = false; + Path *group_path = NULL; + + if (root->group_pathkeys && olap_optimizer && + !has_cold_hot_table && agg_costs->hasOnlyDistinct) + { + if (!pathkeys_contained_in(root->group_pathkeys, + path->pathkeys)) + { + path = (Path *) create_sort_path(root, + grouped_rel, + path, + root->group_pathkeys, + -1.0); + } + } + + if (path->pathtype == T_Sort && olap_optimizer && !has_cold_hot_table) + { + SortPath *pathnode = (SortPath *)path; + + if (pathnode->subpath->pathtype == T_Gather) + { + path->parallel_aware = true; + parallel_aware = true; + parallel_safe = true; + } + } + + group_path = (Path *) + create_group_path(root, + grouped_rel, + path, + target, + parse->groupClause, + havingQual, + dNumGroups); + group_path->parallel_aware = parallel_aware; + group_path->parallel_safe = parallel_safe; + add_path(grouped_rel, group_path); +#else + + /* + * We have GROUP BY without aggregation or grouping sets. + * Make a GroupPath. 
+ */ + add_path(grouped_rel, (Path *) + create_group_path(root, + grouped_rel, + path, + target, + parse->groupClause, + havingQual, + dNumGroups)); +#endif + } + else + { + /* Other cases should have been handled above */ + Assert(false); + } +#ifdef __TBASE__ + } +#endif + } + } /* - * If the column has a sortgroupref, assume it has to be evaluated - * before sorting. Generally such columns would be ORDER BY, GROUP - * BY, etc targets. One exception is columns that were removed from - * GROUP BY by remove_useless_groupby_columns() ... but those would - * only be Vars anyway. There don't seem to be any cases where it - * would be worth the trouble to double-check. + * Now generate a complete GroupAgg Path atop of the cheapest partial + * path. We can do this using either Gather or Gather Merge. + */ + if (grouped_rel->partial_pathlist) + { +#ifdef __TBASE__ + bool redistribute_group PG_USED_FOR_ASSERTS_ONLY = false; +#endif + Path *path = (Path *) linitial(grouped_rel->partial_pathlist); +#ifdef __TBASE__ + double total_groups = 0; + + if (olap_optimizer && !has_cold_hot_table) + { + total_groups = path->rows; + } + else + total_groups = path->rows * path->parallel_workers; +#else + double total_groups = path->rows * path->parallel_workers; +#endif + path = (Path *) create_gather_path(root, + grouped_rel, + path, + partial_grouping_target, + NULL, + &total_groups); + /* + * Since Gather's output is always unsorted, we'll need to sort, + * unless there's no GROUP BY clause or a degenerate (constant) + * one, in which case there will only be a single group. */ - if (get_pathtarget_sortgroupref(final_target, i) == 0) - { +#ifdef __TBASE__ + if (!olap_optimizer || has_cold_hot_table) + { +#endif + if (root->group_pathkeys) + path = (Path *) create_sort_path(root, + grouped_rel, + path, + root->group_pathkeys, + -1.0); +#ifdef __TBASE__ + } +#endif /* - * Check for SRF or volatile functions. Check the SRF case first - * because we must know whether we have any postponed SRFs. + * If the grouping can't be fully pushed down, we'll push down the + * first phase of the aggregate, and redistribute only the partial + * results. + * + * If if can be pushed down, disable construction of complex + * distributed paths. + * + * XXX Keep this after the Sort node, to make the path sorted. */ - if (parse->hasTargetSRFs && - expression_returns_set((Node *) expr)) - { - /* We'll decide below whether these are postponable */ - col_is_srf[i] = true; - have_srf = true; - } - else if (contain_volatile_functions((Node *) expr)) - { - /* Unconditionally postpone */ - postpone_col[i] = true; - have_volatile = true; - } - else + if (! can_push_down_grouping(root, parse, path)) +#ifdef __TBASE__ { - /* - * Else check the cost. XXX it's annoying to have to do this - * when set_pathtarget_cost_width() just did it. Refactor to - * allow sharing the work? - */ - QualCost cost; + if (olap_optimizer && !has_cold_hot_table) + { + /* redistribute local grouping results among datanodes */ + path = create_redistribute_grouping_path(root, parse, path); + redistribute_group = true; + } + else + path = create_remotesubplan_path(root, path, NULL); + } +#else + path = create_remotesubplan_path(root, path, NULL); +#endif - cost_qual_eval_node(&cost, (Node *) expr, root); + else + try_distributed_aggregation = false; +#ifdef __TBASE__ /* - * We arbitrarily define "expensive" as "more than 10X - * cpu_operator_cost". Note this will take in any PL function - * with default cost. 
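Illustrative arithmetic behind the total_groups estimate passed to create_gather_path() above; the helper name is hypothetical, the real code computes this inline.

static double
gather_total_groups(double partial_path_rows, int parallel_workers,
                    bool olap_optimizer, bool has_cold_hot_table)
{
    /*
     * Under the TBase OLAP optimizer (and without cold/hot tables) the
     * partial path's row estimate is taken as-is; otherwise it is scaled
     * by the number of parallel workers feeding the Gather.
     */
    if (olap_optimizer && !has_cold_hot_table)
        return partial_path_rows;

    return partial_path_rows * parallel_workers;
}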
+ * Since Gather's output is always unsorted, we'll need to sort, + * unless there's no GROUP BY clause or a degenerate (constant) + * one, in which case there will only be a single group. */ - if (cost.per_tuple > 10 * cpu_operator_cost) - { - postpone_col[i] = true; - have_expensive = true; + if (olap_optimizer && !has_cold_hot_table) + { + if (root->group_pathkeys) + { + path = (Path *) create_sort_path(root, + grouped_rel, + path, + root->group_pathkeys, + -1.0); + path->parallel_aware = true; } } - } - else - { - /* For sortgroupref cols, just check if any contain SRFs */ - if (!have_srf_sortcols && - parse->hasTargetSRFs && - expression_returns_set((Node *) expr)) - have_srf_sortcols = true; - } - - i++; - } - - /* - * We can postpone SRFs if we have some but none are in sortgroupref cols. - */ - postpone_srfs = (have_srf && !have_srf_sortcols); - - /* - * If we don't need a post-sort projection, just return final_target. - */ - if (!(postpone_srfs || have_volatile || - (have_expensive && - (parse->limitCount || root->tuple_fraction > 0)))) - return final_target; - - /* - * Report whether the post-sort projection will contain set-returning - * functions. This is important because it affects whether the Sort can - * rely on the query's LIMIT (if any) to bound the number of rows it needs - * to return. - */ - *have_postponed_srfs = postpone_srfs; - - /* - * Construct the sort-input target, taking all non-postponable columns and - * then adding Vars, PlaceHolderVars, Aggrefs, and WindowFuncs found in - * the postponable ones. - */ - input_target = create_empty_pathtarget(); - postponable_cols = NIL; - - i = 0; - foreach(lc, final_target->exprs) - { - Expr *expr = (Expr *) lfirst(lc); - - if (postpone_col[i] || (postpone_srfs && col_is_srf[i])) - postponable_cols = lappend(postponable_cols, expr); - else - add_column_to_pathtarget(input_target, expr, - get_pathtarget_sortgroupref(final_target, i)); +#endif + if (parse->hasAggs) + { + Path *agg_path = NULL; + + agg_path = (Path *) + create_agg_path(root, + grouped_rel, + path, + target, + parse->groupClause ? AGG_SORTED : AGG_PLAIN, + AGGSPLIT_FINAL_DESERIAL, + parse->groupClause, + havingQual, + agg_final_costs, + dNumGroups); + + if (olap_optimizer && !has_cold_hot_table) + { + agg_path->parallel_safe = true; + agg_path->parallel_aware = true; + } - i++; - } + add_path(grouped_rel, agg_path); + } + else + { + Path *group_path = NULL; + + group_path = (Path *) + create_group_path(root, + grouped_rel, + path, + target, + parse->groupClause, + havingQual, + dNumGroups); + + if (olap_optimizer && !has_cold_hot_table) + { + group_path->parallel_safe = true; + group_path->parallel_aware = true; + } - /* - * Pull out all the Vars, Aggrefs, and WindowFuncs mentioned in - * postponable columns, and add them to the sort-input target if not - * already present. (Some might be there already.) We mustn't - * deconstruct Aggrefs or WindowFuncs here, since the projection node - * would be unable to recompute them. - */ - postponable_vars = pull_var_clause((Node *) postponable_cols, - PVC_INCLUDE_AGGREGATES | - PVC_INCLUDE_WINDOWFUNCS | - PVC_INCLUDE_PLACEHOLDERS); - add_new_columns_to_pathtarget(input_target, postponable_vars); - - /* clean up cruft */ - list_free(postponable_vars); - list_free(postponable_cols); - - /* XXX this represents even more redundant cost calculation ... 
*/ - return set_pathtarget_cost_width(root, input_target); -} + add_path(grouped_rel, group_path); + } -/* - * get_cheapest_fractional_path - * Find the cheapest path for retrieving a specified fraction of all - * the tuples expected to be returned by the given relation. - * - * We interpret tuple_fraction the same way as grouping_planner. - * - * We assume set_cheapest() has been run on the given rel. - */ -Path * -get_cheapest_fractional_path(RelOptInfo *rel, double tuple_fraction) -{ - Path *best_path = rel->cheapest_total_path; - ListCell *l; + /* + * The point of using Gather Merge rather than Gather is that it + * can preserve the ordering of the input path, so there's no + * reason to try it unless (1) it's possible to produce more than + * one output row and (2) we want the output path to be ordered. + */ + if (parse->groupClause != NIL && root->group_pathkeys != NIL) + { + foreach(lc, grouped_rel->partial_pathlist) + { + Path *subpath = (Path *) lfirst(lc); + Path *gmpath; + double total_groups; - /* If all tuples will be retrieved, just return the cheapest-total path */ - if (tuple_fraction <= 0.0) - return best_path; + /* + * It's useful to consider paths that are already properly + * ordered for Gather Merge, because those don't need a + * sort. It's also useful to consider the cheapest path, + * because sorting it in parallel and then doing Gather + * Merge may be better than doing an unordered Gather + * followed by a sort. But there's no point in + * considering non-cheapest paths that aren't already + * sorted correctly. + */ + if (path != subpath && + !pathkeys_contained_in(root->group_pathkeys, + subpath->pathkeys)) + continue; - /* Convert absolute # of tuples to a fraction; no need to clamp to 0..1 */ - if (tuple_fraction >= 1.0 && best_path->rows > 0) - tuple_fraction /= best_path->rows; +#ifdef __TBASE__ + if (olap_optimizer && !has_cold_hot_table) + total_groups = subpath->rows; + else +#endif + total_groups = subpath->rows * subpath->parallel_workers; - foreach(l, rel->pathlist) - { - Path *path = (Path *) lfirst(l); - if (path == rel->cheapest_total_path || - compare_fractional_path_costs(best_path, path, tuple_fraction) <= 0) - continue; +#ifdef __TBASE__ + if (olap_optimizer && !has_cold_hot_table) + { + gmpath = (Path *) create_gather_path(root, + grouped_rel, + subpath, + partial_grouping_target, + NULL, + &total_groups); + } + else +#endif + gmpath = (Path *) + create_gather_merge_path(root, + grouped_rel, + subpath, + partial_grouping_target, + root->group_pathkeys, + NULL, + &total_groups); - best_path = path; - } + /* + * If the grouping can't be fully pushed down, we'll push down the + * first phase of the aggregate, and redistribute only the partial + * results. + */ + redistribute_group = false; - return best_path; -} + if (! can_push_down_grouping(root, parse, gmpath)) +#ifdef __TBASE__ + { + if (olap_optimizer && !has_cold_hot_table) + { + /* redistribute local grouping results among datanodes */ + gmpath = create_redistribute_grouping_path(root, parse, gmpath); + redistribute_group = true; + } + else + { + gmpath = create_remotesubplan_path(root, gmpath, NULL); + } + } +#else + gmpath = create_remotesubplan_path(root, gmpath, NULL); +#endif -/* - * adjust_paths_for_srfs - * Fix up the Paths of the given upperrel to handle tSRFs properly. - * - * The executor can only handle set-returning functions that appear at the - * top level of the targetlist of a ProjectSet plan node. 
If we have any SRFs - * that are not at top level, we need to split up the evaluation into multiple - * plan levels in which each level satisfies this constraint. This function - * modifies each Path of an upperrel that (might) compute any SRFs in its - * output tlist to insert appropriate projection steps. - * - * The given targets and targets_contain_srfs lists are from - * split_pathtarget_at_srfs(). We assume the existing Paths emit the first - * target in targets. - */ -static void -adjust_paths_for_srfs(PlannerInfo *root, RelOptInfo *rel, - List *targets, List *targets_contain_srfs) -{ - ListCell *lc; +#ifdef __TBASE__ + /* + * Since Gather's output is always unsorted, we'll need to sort, + * unless there's no GROUP BY clause or a degenerate (constant) + * one, in which case there will only be a single group. + */ + if (olap_optimizer && !has_cold_hot_table) + { + if (root->group_pathkeys) + { + gmpath = (Path *) create_sort_path(root, + grouped_rel, + gmpath, + root->group_pathkeys, + -1.0); - Assert(list_length(targets) == list_length(targets_contain_srfs)); - Assert(!linitial_int(targets_contain_srfs)); + gmpath->parallel_aware = true; + } + } +#endif + if (parse->hasAggs) + { + Path *agg_path = NULL; - /* If no SRFs appear at this plan level, nothing to do */ - if (list_length(targets) == 1) - return; + agg_path = (Path *) + create_agg_path(root, + grouped_rel, + gmpath, + target, + parse->groupClause ? AGG_SORTED : AGG_PLAIN, + AGGSPLIT_FINAL_DESERIAL, + parse->groupClause, + havingQual, + agg_final_costs, + dNumGroups); + if (olap_optimizer && !has_cold_hot_table) + { + agg_path->parallel_safe = true; + agg_path->parallel_aware = true; + } - /* - * Stack SRF-evaluation nodes atop each path for the rel. - * - * In principle we should re-run set_cheapest() here to identify the - * cheapest path, but it seems unlikely that adding the same tlist eval - * costs to all the paths would change that, so we don't bother. Instead, - * just assume that the cheapest-startup and cheapest-total paths remain - * so. (There should be no parameterized paths anymore, so we needn't - * worry about updating cheapest_parameterized_paths.) 
- */ - foreach(lc, rel->pathlist) - { - Path *subpath = (Path *) lfirst(lc); - Path *newpath = subpath; - ListCell *lc1, - *lc2; + add_path(grouped_rel, agg_path); + } + else + { + Path *group_path = NULL; + + group_path = (Path *) + create_group_path(root, + grouped_rel, + gmpath, + target, + parse->groupClause, + havingQual, + dNumGroups); + + if (olap_optimizer && !has_cold_hot_table) + { + group_path->parallel_safe = true; + group_path->parallel_aware = true; + } - Assert(subpath->param_info == NULL); - forboth(lc1, targets, lc2, targets_contain_srfs) - { - PathTarget *thistarget = (PathTarget *) lfirst(lc1); - bool contains_srfs = (bool) lfirst_int(lc2); - - /* If this level doesn't contain SRFs, do regular projection */ - if (contains_srfs) - newpath = (Path *) create_set_projection_path(root, - rel, - newpath, - thistarget); - else - newpath = (Path *) apply_projection_to_path(root, - rel, - newpath, - thistarget); + add_path(grouped_rel, group_path); + } + } + } } - lfirst(lc) = newpath; - if (subpath == rel->cheapest_startup_path) - rel->cheapest_startup_path = newpath; - if (subpath == rel->cheapest_total_path) - rel->cheapest_total_path = newpath; } - /* Likewise for partial paths, if any */ - foreach(lc, rel->partial_pathlist) + if (can_hash) { - Path *subpath = (Path *) lfirst(lc); - Path *newpath = subpath; - ListCell *lc1, - *lc2; + Size hashaggtablesize; - Assert(subpath->param_info == NULL); - forboth(lc1, targets, lc2, targets_contain_srfs) + if (parse->groupingSets) { - PathTarget *thistarget = (PathTarget *) lfirst(lc1); - bool contains_srfs = (bool) lfirst_int(lc2); - - /* If this level doesn't contain SRFs, do regular projection */ - if (contains_srfs) - newpath = (Path *) create_set_projection_path(root, - rel, - newpath, - thistarget); - else + /* + * Try for a hash-only groupingsets path over unsorted input. + */ + consider_groupingsets_paths(root, grouped_rel, + cheapest_path, false, true, target, + gd, agg_costs, dNumGroups); + } + else + { + hashaggtablesize = estimate_hashagg_tablesize(cheapest_path, + agg_costs, + dNumGroups); + + /* + * Provided that the estimated size of the hashtable does not + * exceed work_mem, we'll generate a HashAgg Path, although if we + * were unable to sort above, then we'd better generate a Path, so + * that we at least have one. + */ +#ifdef __TBASE__ + if (hashaggtablesize < work_mem * 1024L || g_hybrid_hash_agg || + grouped_rel->pathlist == NIL) +#else + if (hashaggtablesize < work_mem * 1024L || + grouped_rel->pathlist == NIL) +#endif { - /* avoid apply_projection_to_path, in case of multiple refs */ - newpath = (Path *) create_projection_path(root, - rel, - newpath, - thistarget); - } - } - lfirst(lc) = newpath; - } -} + /* Don't mess with the cheapest path directly. */ + Path *path = cheapest_path; +#ifdef __TBASE__ + bool try_redistribute_grouping = false; +#endif -/* - * expression_planner - * Perform planner's transformations on a standalone expression. - * - * Various utility commands need to evaluate expressions that are not part - * of a plannable query. They can do so using the executor's regular - * expression-execution machinery, but first the expression has to be fed - * through here to transform it from parser output to something executable. - * - * Currently, we disallow sublinks in standalone expressions, so there's no - * real "planning" involved here. (That might not always be true though.) 
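Self-contained restatement of the hash-aggregation gate applied above: work_mem is expressed in kilobytes while the hash-table estimate is in bytes, hence the multiplication by 1024. In this TBase build an oversized estimate is still let through when hybrid (spillable) hash aggregation is enabled, or when no other grouped path exists at all. The helper name is hypothetical.

#include <stdbool.h>

static bool
hashagg_path_allowed(long hashtable_size_bytes, int work_mem_kb,
                     bool hybrid_hash_agg, bool have_other_grouped_path)
{
    return hashtable_size_bytes < (long) work_mem_kb * 1024L ||
           hybrid_hash_agg ||
           !have_other_grouped_path;
}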
- * What we must do is run eval_const_expressions to ensure that any function - * calls are converted to positional notation and function default arguments - * get inserted. The fact that constant subexpressions get simplified is a - * side-effect that is useful when the expression will get evaluated more than - * once. Also, we must fix operator function IDs. - * - * Note: this must not make any damaging changes to the passed-in expression - * tree. (It would actually be okay to apply fix_opfuncids to it, but since - * we first do an expression_tree_mutator-based walk, what is returned will - * be a new node tree.) - */ -Expr * -expression_planner(Expr *expr) -{ - Node *result; + /* + * If the grouping can't be fully pushed down, we'll push down the + * first phase of the aggregate, and redistribute only the partial + * results. + * + * If if can be pushed down, disable construction of complex + * distributed paths. + */ + if (! can_push_down_grouping(root, parse, path)) +#ifdef XCP + { + /* some special aggs cannot be parallel executed, such as count(distinct) */ + if(agg_costs->hasNonPartial || agg_costs->hasNonSerial || + path->pathtype == T_Agg || path->pathtype == T_Group || + !olap_optimizer || has_cold_hot_table) + { + if (agg_costs->hasOnlyDistinct && olap_optimizer && !has_cold_hot_table) + path = create_redistribute_grouping_path(root, parse, path); + else + path = create_remotesubplan_path(root, path, NULL); + } + else + { + /* + * If the grouping can not be fully pushed down, we adopt another + * strategy instead. + * 1. do grouping on each datanode locally + * 2. re-distribute grouping results among datanodes, then do the + * final grouping + */ + AggClauseCosts hashagg_partial_costs; + PathTarget * local_grouping_target = make_partial_grouping_target(root, target); - /* - * Convert named-argument function calls, insert default arguments and - * simplify constant subexprs - */ - result = eval_const_expressions(NULL, (Node *) expr); + /* Estimate number of partial groups. */ + double dNumLocalGroups = get_number_of_groups(root, + cheapest_path->rows, + gd); + try_redistribute_grouping = true; - /* Fill in opfuncid values if missing */ - fix_opfuncids(result); + MemSet(&hashagg_partial_costs, 0, sizeof(AggClauseCosts)); - return (Expr *) result; -} + get_agg_clause_costs(root, (Node *) local_grouping_target->exprs, + AGGSPLIT_INITIAL_SERIAL, + &hashagg_partial_costs); + + /* step 1 */ + path = (Path *) create_agg_path(root, + grouped_rel, + cheapest_path, + local_grouping_target, + AGG_HASHED, + AGGSPLIT_INITIAL_SERIAL, + parse->groupClause, + NIL, + &hashagg_partial_costs, + dNumLocalGroups); + +#ifdef __TBASE__ + if (hashaggtablesize >= work_mem * 1024L && g_hybrid_hash_agg) + { + AggPath *aggpath = (AggPath *)path; + aggpath->hybrid = true; + } +#endif + + /* step 2 */ + path = create_redistribute_grouping_path(root, parse, path); + } + } +#else + path = create_remotesubplan_path(root, path, NULL); +#endif + else + try_distributed_aggregation = false; /* - * plan_cluster_use_sort - * Use the planner to decide how CLUSTER should implement sorting - * - * tableOid is the OID of a table to be clustered on its index indexOid - * (which is already known to be a btree index). Decide whether it's - * cheaper to do an indexscan or a seqscan-plus-sort to execute the CLUSTER. - * Return TRUE to use sorting, FALSE to use an indexscan. - * - * Note: caller had better already hold some type of lock on the table. 
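Condensed sketch of the hashed two-phase variant built above, reusing the patch's variable names for illustration only: the partial phase hashes locally on each datanode (and is marked hybrid when the size estimate exceeds work_mem, so it may spill), the partial results are redistributed by the grouping key, and a second hashed phase finalizes them using costs gathered under AGGSPLIT_FINAL_DESERIAL.

/* phase 1: local hashed partial aggregation on every datanode */
path = (Path *) create_agg_path(root, grouped_rel, cheapest_path,
                                local_grouping_target,
                                AGG_HASHED, AGGSPLIT_INITIAL_SERIAL,
                                parse->groupClause, NIL,
                                &hashagg_partial_costs, dNumLocalGroups);

/* mark as hybrid when the estimated hash table exceeds work_mem */
if (hashaggtablesize >= work_mem * 1024L && g_hybrid_hash_agg)
    ((AggPath *) path)->hybrid = true;

/* redistribute partial results by the grouping key, then finalize */
path = create_redistribute_grouping_path(root, parse, path);
path = (Path *) create_agg_path(root, grouped_rel, path, target,
                                AGG_HASHED, AGGSPLIT_FINAL_DESERIAL,
                                parse->groupClause, havingQual,
                                &hashagg_final_costs, dNumGroups);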
+ * We just need an Agg over the cheapest-total input path, + * since input order won't matter. */ -bool -plan_cluster_use_sort(Oid tableOid, Oid indexOid) -{ - PlannerInfo *root; - Query *query; - PlannerGlobal *glob; - RangeTblEntry *rte; - RelOptInfo *rel; - IndexOptInfo *indexInfo; - QualCost indexExprCost; - Cost comparisonCost; - Path *seqScanPath; - Path seqScanAndSortPath; - IndexPath *indexScanPath; - ListCell *lc; +#ifdef __TBASE__ + if(try_redistribute_grouping) + { + AggClauseCosts hashagg_final_costs; + Path *agg_path; - /* We can short-circuit the cost comparison if indexscans are disabled */ - if (!enable_indexscan) - return true; /* use sort */ + MemSet(&hashagg_final_costs, 0, sizeof(AggClauseCosts)); - /* Set up mostly-dummy planner state */ - query = makeNode(Query); - query->commandType = CMD_SELECT; + get_agg_clause_costs(root, (Node *) target->exprs, + AGGSPLIT_FINAL_DESERIAL, + &hashagg_final_costs); + get_agg_clause_costs(root, parse->havingQual, + AGGSPLIT_FINAL_DESERIAL, + &hashagg_final_costs); - glob = makeNode(PlannerGlobal); + agg_path = (Path *) + create_agg_path(root, + grouped_rel, + path, + target, + AGG_HASHED, + AGGSPLIT_FINAL_DESERIAL, + parse->groupClause, + havingQual, + &hashagg_final_costs, + dNumGroups); +#ifdef __TBASE__ + if (hashaggtablesize >= work_mem * 1024L && g_hybrid_hash_agg) + { + AggPath *aggpath = (AggPath *)agg_path; - root = makeNode(PlannerInfo); - root->parse = query; - root->glob = glob; - root->query_level = 1; - root->planner_cxt = CurrentMemoryContext; - root->wt_param_id = -1; - root->recursiveOk = true; + aggpath->hybrid = true; + } +#endif + //agg_path->parallel_safe = true; - /* Build a minimal RTE for the rel */ - rte = makeNode(RangeTblEntry); - rte->rtekind = RTE_RELATION; - rte->relid = tableOid; - rte->relkind = RELKIND_RELATION; /* Don't be too picky. */ - rte->lateral = false; - rte->inh = false; - rte->inFromCl = true; - query->rtable = list_make1(rte); - - /* Set up RTE/RelOptInfo arrays */ - setup_simple_rel_arrays(root); - - /* Build RelOptInfo */ - rel = build_simple_rel(root, 1, NULL); - - /* Locate IndexOptInfo for the target index */ - indexInfo = NULL; - foreach(lc, rel->indexlist) - { - indexInfo = (IndexOptInfo *) lfirst(lc); - if (indexInfo->indexoid == indexOid) - break; - } + add_path(grouped_rel, agg_path); + } + else + { + bool parallel_aware = false; + bool parallel_safe = false; + Path *agg_path = NULL; - /* - * It's possible that get_relation_info did not generate an IndexOptInfo - * for the desired index; this could happen if it's not yet reached its - * indcheckxmin usability horizon, or if it's a system index and we're - * ignoring system indexes. In such cases we should tell CLUSTER to not - * trust the index contents but use seqscan-and-sort. - */ - if (lc == NULL) /* not in the list? */ - return true; /* use sort */ + if ((path->pathtype == T_Gather || agg_costs->hasOnlyDistinct) && olap_optimizer + && !has_cold_hot_table) + { + parallel_safe = true; + parallel_aware = true; + } - /* - * Rather than doing all the pushups that would be needed to use - * set_baserel_size_estimates, just do a quick hack for rows and width. 
- */ - rel->rows = rel->tuples; - rel->reltarget->width = get_relation_data_width(tableOid, NULL); + agg_path = (Path *) + create_agg_path(root, grouped_rel, + path, + target, + AGG_HASHED, + AGGSPLIT_SIMPLE, + parse->groupClause, + havingQual, + agg_costs, + dNumGroups); + agg_path->parallel_aware = parallel_aware; + agg_path->parallel_safe = parallel_safe; +#ifdef __TBASE__ + if (hashaggtablesize >= work_mem * 1024L && g_hybrid_hash_agg) + { + AggPath *aggpath = (AggPath *)agg_path; + + aggpath->hybrid = true; + } +#endif + add_path(grouped_rel, agg_path); + } +#else + add_path(grouped_rel, (Path *) + create_agg_path(root, grouped_rel, + path, + target, + AGG_HASHED, + AGGSPLIT_SIMPLE, + parse->groupClause, + havingQual, + agg_costs, + dNumGroups)); +#endif + } + } - root->total_table_pages = rel->pages; + /* + * Generate a HashAgg Path atop of the cheapest partial path. Once + * again, we'll only do this if it looks as though the hash table + * won't exceed work_mem. + */ + if (grouped_rel->partial_pathlist) + { + bool redistribute_group = false; + Path *path = (Path *) linitial(grouped_rel->partial_pathlist); - /* - * Determine eval cost of the index expressions, if any. We need to - * charge twice that amount for each tuple comparison that happens during - * the sort, since tuplesort.c will have to re-evaluate the index - * expressions each time. (XXX that's pretty inefficient...) - */ - cost_qual_eval(&indexExprCost, indexInfo->indexprs, root); - comparisonCost = 2.0 * (indexExprCost.startup + indexExprCost.per_tuple); - - /* Estimate the cost of seq scan + sort */ - seqScanPath = create_seqscan_path(root, rel, NULL, 0); - cost_sort(&seqScanAndSortPath, root, NIL, - seqScanPath->total_cost, rel->tuples, rel->reltarget->width, - comparisonCost, maintenance_work_mem, -1.0); - - /* Estimate the cost of index scan */ - indexScanPath = create_index_path(root, indexInfo, - NIL, NIL, NIL, NIL, NIL, - ForwardScanDirection, false, - NULL, 1.0, false); - - return (seqScanAndSortPath.total_cost < indexScanPath->path.total_cost); -} + hashaggtablesize = estimate_hashagg_tablesize(path, + agg_final_costs, + dNumGroups); +#ifdef __TBASE__ + if (hashaggtablesize < work_mem * 1024L || g_hybrid_hash_agg) +#else + if (hashaggtablesize < work_mem * 1024L) +#endif + { +#ifdef __TBASE__ + double total_groups = 0; -/* - * grouping_distribution_match - * Check if the path distribution matches grouping distribution. - * - * Grouping preserves distribution if the distribution key is on of the - * grouping keys (arbitrary one). In that case it's guaranteed that groups - * on different nodes do not overlap, and we can push the aggregation to - * remote nodes as a whole. - * - * Otherwise we need to either fetch all the data to the coordinator and - * perform the aggregation there, or use two-phase aggregation, with the - * first phase (partial aggregation) pushed down, and the second phase - * (combining and finalizing the results) executed on the coordinator. - * - * XXX This is used not only for plain aggregation, but also for various - * other paths, relying on grouping infrastructure (DISTINCT ON, UNIQUE). 
- */ -static bool -grouping_distribution_match(PlannerInfo *root, Query *parse, Path *path, - List *clauses) -{// #lizard forgives - int i; - bool matches_key = false; - Distribution *distribution = path->distribution; + if (olap_optimizer && !has_cold_hot_table) + { + total_groups = path->rows; + } + else + total_groups = path->rows * path->parallel_workers; +#else + double total_groups = path->rows * path->parallel_workers; +#endif + path = (Path *) create_gather_path(root, + grouped_rel, + path, + partial_grouping_target, + NULL, + &total_groups); + /* + * If the grouping can't be fully pushed down, we'll push down the + * first phase of the aggregate, and redistribute only the partial + * results. + * + * If if can be pushed down, disable construction of complex + * distributed paths. + */ + if (! can_push_down_grouping(root, parse, path)) +#ifdef __TBASE__ + { + if (olap_optimizer && !has_cold_hot_table) + { + /* redistribute local grouping results among datanodes */ + path = create_redistribute_grouping_path(root, parse, path); + redistribute_group = true; + } + else + { + path = create_remotesubplan_path(root, path, NULL); + } + } +#else + path = create_remotesubplan_path(root, path, NULL); +#endif + else + try_distributed_aggregation = false; - int numGroupCols = list_length(clauses); - AttrNumber *groupColIdx = extract_grouping_cols(clauses, - parse->targetList); +#ifdef __TBASE__ + if (!redistribute_group) + { + Path *agg_path = (Path *) + create_agg_path(root, + grouped_rel, + path, + target, + AGG_HASHED, + AGGSPLIT_FINAL_DESERIAL, + parse->groupClause, + havingQual, + agg_final_costs, + dNumGroups); + + if (olap_optimizer && !has_cold_hot_table) + { + agg_path->parallel_aware = true; + agg_path->parallel_safe = true; + } +#ifdef __TBASE__ + if (hashaggtablesize >= work_mem * 1024L && g_hybrid_hash_agg) + { + AggPath *aggpath = (AggPath *)agg_path; -#ifdef __COLD_HOT__ - if (has_cold_hot_table) - { - if (! path->distribution) - { - return true; - } + aggpath->hybrid = true; + } +#endif + add_path(grouped_rel, agg_path); + } + else + { +#endif + Path *agg_path = (Path *) + create_agg_path(root, + grouped_rel, + path, + target, + AGG_HASHED, + AGGSPLIT_FINAL_DESERIAL, + parse->groupClause, + havingQual, + agg_final_costs, + dNumGroups); + + if (olap_optimizer && !has_cold_hot_table) + { + agg_path->parallel_aware = true; + agg_path->parallel_safe = true; + } +#ifdef __TBASE__ + if (hashaggtablesize >= work_mem * 1024L && g_hybrid_hash_agg) + { + AggPath *aggpath = (AggPath *)agg_path; - return false; - } + aggpath->hybrid = true; + } +#endif + add_path(grouped_rel, agg_path); +#ifdef __TBASE__ + } #endif + } + } + } +} /* - * With no explicit data distribution or replicated tables, we can simply - * push down the whole aggregation to the remote node, without any sort - * of redistribution. So consider this to be a match. + * add_partial_paths_to_grouping_rel + * + * Add partial paths to grouping relation. These paths are not fully + * aggregated; a FinalizeAggregate step is still required. 
*/ - if ((distribution == NULL) || - IsLocatorReplicated(distribution->distributionType)) - return true; +static void +add_partial_paths_to_grouping_rel(PlannerInfo *root, + RelOptInfo *input_rel, + RelOptInfo *grouped_rel, + PathTarget *target, + PathTarget *partial_grouping_target, + AggClauseCosts *agg_partial_costs, + AggClauseCosts *agg_final_costs, + grouping_sets_data *gd, + bool can_sort, + bool can_hash, + List *havingQual) +{ + Query *parse = root->parse; + Path *cheapest_partial_path = linitial(input_rel->partial_pathlist); + Size hashaggtablesize; + double dNumPartialGroups = 0; + ListCell *lc; - /* But no distribution expression means 'no match'. */ - if (distribution->distributionExpr == NULL) - return false; + /* Estimate number of partial groups. */ + dNumPartialGroups = get_number_of_groups(root, + cheapest_partial_path->rows, + gd); + + if (can_sort) + { + /* This should have been checked previously */ + Assert(parse->hasAggs || parse->groupClause); /* - * With distributed data and table distributed using an expression, we - * need to check if the distribution expression matches one of the - * grouping keys (arbitrary one). + * Use any available suitably-sorted path as input, and also consider + * sorting the cheapest partial path. */ - for (i = 0; i < numGroupCols; i++) + foreach(lc, input_rel->partial_pathlist) { - TargetEntry *te = (TargetEntry *)list_nth(parse->targetList, - groupColIdx[i]-1); + Path *path = (Path *) lfirst(lc); + bool is_sorted; - if (equal(te->expr, distribution->distributionExpr)) + is_sorted = pathkeys_contained_in(root->group_pathkeys, + path->pathkeys); + if (path == cheapest_partial_path || is_sorted) { - matches_key = true; - break; + /* Sort the cheapest partial path, if it isn't already */ + if (!is_sorted) + path = (Path *) create_sort_path(root, + grouped_rel, + path, + root->group_pathkeys, + -1.0); + + if (parse->hasAggs) + add_partial_path(grouped_rel, (Path *) + create_agg_path(root, + grouped_rel, + path, + partial_grouping_target, + parse->groupClause ? AGG_SORTED : AGG_PLAIN, + AGGSPLIT_INITIAL_SERIAL, + parse->groupClause, + NIL, + agg_partial_costs, + dNumPartialGroups)); + else + add_partial_path(grouped_rel, (Path *) + create_group_path(root, + grouped_rel, + path, + partial_grouping_target, + parse->groupClause, + NIL, + dNumPartialGroups)); } } - - return matches_key; } -/* - * get_partitioned_child_rels - * Returns a list of the RT indexes of the partitioned child relations - * with rti as the root parent RT index. Also sets - * *part_cols_updated to true if any of the root rte's updated - * columns is used in the partition key either of the relation whose RTI - * is specified or of any child relation. - * - * Note: This function might get called even for range table entries that - * are not partitioned tables; in such a case, it will simply return NIL. - */ -List * -get_partitioned_child_rels(PlannerInfo *root, Index rti, - bool *part_cols_updated) + if (can_hash) { - List *result = NIL; - ListCell *l; + /* Checked above */ + Assert(parse->hasAggs || parse->groupClause); - if (part_cols_updated) - *part_cols_updated = false; + hashaggtablesize = + estimate_hashagg_tablesize(cheapest_partial_path, + agg_partial_costs, + dNumPartialGroups); - foreach(l, root->pcinfo_list) + /* + * Tentatively produce a partial HashAgg Path, depending on if it + * looks as if the hash table will fit in work_mem. 
+ */ +#ifdef __TBASE__ + if (hashaggtablesize < work_mem * 1024L || g_hybrid_hash_agg) +#else + if (hashaggtablesize < work_mem * 1024L) +#endif { - PartitionedChildRelInfo *pc = lfirst(l); - - if (pc->parent_relid == rti) + AggPath *aggpath = (AggPath *) + create_agg_path(root, + grouped_rel, + cheapest_partial_path, + partial_grouping_target, + AGG_HASHED, + AGGSPLIT_INITIAL_SERIAL, + parse->groupClause, + NIL, + agg_partial_costs, + dNumPartialGroups); +#ifdef __TBASE__ + if (hashaggtablesize >= work_mem * 1024L) { - result = pc->child_rels; - if (part_cols_updated) - *part_cols_updated = pc->part_cols_updated; - break; + aggpath->hybrid = true; + } +#endif + add_partial_path(grouped_rel, (Path *)aggpath); } } - - return result; } - /* - * get_partitioned_child_rels_for_join - * Build and return a list containing the RTI of every partitioned - * relation which is a child of some rel included in the join. + * can_parallel_agg + * + * Determines whether or not parallel grouping and/or aggregation is possible. + * Returns true when possible, false otherwise. */ -List * -get_partitioned_child_rels_for_join(PlannerInfo *root, Relids join_relids) +static bool +can_parallel_agg(PlannerInfo *root, RelOptInfo *input_rel, + RelOptInfo *grouped_rel, const AggClauseCosts *agg_costs) { - List *result = NIL; - ListCell *l; + Query *parse = root->parse; - foreach(l, root->pcinfo_list) + if (!grouped_rel->consider_parallel) { - PartitionedChildRelInfo *pc = lfirst(l); - - if (bms_is_member(pc->parent_relid, join_relids)) - result = list_concat(result, list_copy(pc->child_rels)); + /* Not even parallel-safe. */ + return false; } - - return result; + else if (input_rel->partial_pathlist == NIL) + { + /* Nothing to use as input for partial aggregate. */ + return false; + } + else if (!parse->hasAggs && parse->groupClause == NIL) + { + /* + * We don't know how to do parallel aggregation unless we have either + * some aggregates or a grouping clause. + */ + return false; + } + else if (parse->groupingSets) + { + /* We don't know how to do grouping sets in parallel. */ + return false; + } + else if (agg_costs->hasNonPartial || agg_costs->hasNonSerial) + { + /* Insufficient support for partial mode. */ + return false; } + /* Everything looks good. */ + return true; +} static bool groupingsets_distribution_match(PlannerInfo *root, Query *parse, Path *path) From 1500a993bd9318d4e09353db29d6512fcbe0c32c Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Sun, 28 Jun 2020 21:53:18 +0800 Subject: [PATCH 239/578] Correctly assess parallel-safety of tlists when SRFs are used. 
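As a purely illustrative sketch (not taken from this patch or its regression tests): the code paths touched here decide whether a scan/join or grouping target list may be evaluated under a Gather. The parallel-safety of each target is now computed once, right where that target is built, and carried along as a flag, instead of being re-derived later from target lists that may already have been rewritten for set-returning-function evaluation. A query of roughly this shape exercises that decision; tenk1 and its columns are the regression-test table used elsewhere in this series, and generate_series stands in for any SRF in the target list:

    EXPLAIN (COSTS OFF)
    SELECT two, count(*), generate_series(1, 3)
    FROM tenk1
    GROUP BY two;

Whether a Gather actually appears in the resulting plan depends on the parallel cost settings in effect.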
http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/backend/optimizer/plan/planner.c | 52 +++++++++++++++++++++++----- 1 file changed, 44 insertions(+), 8 deletions(-) diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index 640dcc4d..e2cf7b37 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -147,6 +147,7 @@ static Size estimate_hashagg_tablesize(Path *path, static RelOptInfo *create_grouping_paths(PlannerInfo *root, RelOptInfo *input_rel, PathTarget *target, + bool target_parallel_safe, const AggClauseCosts *agg_costs, grouping_sets_data *gd); static void consider_groupingsets_paths(PlannerInfo *root, @@ -162,6 +163,7 @@ static RelOptInfo *create_window_paths(PlannerInfo *root, RelOptInfo *input_rel, PathTarget *input_target, PathTarget *output_target, + bool output_target_parallel_safe, List *tlist, WindowFuncLists *wflists, List *activeWindows); @@ -178,6 +180,7 @@ static RelOptInfo *create_distinct_paths(PlannerInfo *root, static RelOptInfo *create_ordered_paths(PlannerInfo *root, RelOptInfo *input_rel, PathTarget *target, + bool target_parallel_safe, double limit_tuples); static PathTarget *make_group_input_target(PlannerInfo *root, PathTarget *final_target); @@ -1754,6 +1757,7 @@ grouping_planner(PlannerInfo *root, bool inheritance_update, PathTarget *final_target; List *final_targets; List *final_targets_contain_srfs; + bool final_target_parallel_safe; RelOptInfo *current_rel; RelOptInfo *final_rel; ListCell *lc; @@ -1816,6 +1820,10 @@ grouping_planner(PlannerInfo *root, bool inheritance_update, /* Also extract the PathTarget form of the setop result tlist */ final_target = current_rel->cheapest_total_path->pathtarget; + /* And check whether it's parallel safe */ + final_target_parallel_safe = + is_parallel_safe(root, (Node *) final_target->exprs); + /* The setop result tlist couldn't contain any SRFs */ Assert(!parse->hasTargetSRFs); final_targets = final_targets_contain_srfs = NIL; @@ -1847,12 +1855,15 @@ grouping_planner(PlannerInfo *root, bool inheritance_update, PathTarget *sort_input_target; List *sort_input_targets; List *sort_input_targets_contain_srfs; + bool sort_input_target_parallel_safe; PathTarget *grouping_target; List *grouping_targets; List *grouping_targets_contain_srfs; + bool grouping_target_parallel_safe; PathTarget *scanjoin_target; List *scanjoin_targets; List *scanjoin_targets_contain_srfs; + bool scanjoin_target_parallel_safe; bool have_grouping; AggClauseCosts agg_costs; WindowFuncLists *wflists = NULL; @@ -1982,6 +1993,8 @@ grouping_planner(PlannerInfo *root, bool inheritance_update, * that were obtained within query_planner(). */ final_target = create_pathtarget(root, tlist); + final_target_parallel_safe = + is_parallel_safe(root, (Node *) final_target->exprs); /* * If ORDER BY was given, consider whether we should use a post-sort @@ -1989,11 +2002,18 @@ grouping_planner(PlannerInfo *root, bool inheritance_update, * so. */ if (parse->sortClause) + { sort_input_target = make_sort_input_target(root, final_target, &have_postponed_srfs); + sort_input_target_parallel_safe = + is_parallel_safe(root, (Node *) sort_input_target->exprs); + } else + { sort_input_target = final_target; + sort_input_target_parallel_safe = final_target_parallel_safe; + } /* * If we have window functions to deal with, the output from any @@ -2001,11 +2021,18 @@ grouping_planner(PlannerInfo *root, bool inheritance_update, * otherwise, it should be sort_input_target. 
*/ if (activeWindows) + { grouping_target = make_window_input_target(root, final_target, activeWindows); + grouping_target_parallel_safe = + is_parallel_safe(root, (Node *) grouping_target->exprs); + } else + { grouping_target = sort_input_target; + grouping_target_parallel_safe = sort_input_target_parallel_safe; + } /* * If we have grouping or aggregation to do, the topmost scan/join @@ -2015,9 +2042,16 @@ grouping_planner(PlannerInfo *root, bool inheritance_update, have_grouping = (parse->groupClause || parse->groupingSets || parse->hasAggs || root->hasHavingQual); if (have_grouping) + { scanjoin_target = make_group_input_target(root, final_target); + scanjoin_target_parallel_safe = + is_parallel_safe(root, (Node *) grouping_target->exprs); + } else + { scanjoin_target = grouping_target; + scanjoin_target_parallel_safe = grouping_target_parallel_safe; + } /* * If there are any SRFs in the targetlist, we must separate each of @@ -2099,8 +2133,7 @@ grouping_planner(PlannerInfo *root, bool inheritance_update, * for partial paths. But only parallel-safe expressions can be * computed by partial paths. */ - if (current_rel->partial_pathlist && - is_parallel_safe(root, (Node *) scanjoin_target->exprs)) + if (current_rel->partial_pathlist && scanjoin_target_parallel_safe) { /* Apply the scan/join target to each partial path */ foreach(lc, current_rel->partial_pathlist) @@ -2161,6 +2194,7 @@ grouping_planner(PlannerInfo *root, bool inheritance_update, current_rel = create_grouping_paths(root, current_rel, grouping_target, + grouping_target_parallel_safe, &agg_costs, gset_data); /* Fix things up if grouping_target contains SRFs */ @@ -2180,6 +2214,7 @@ grouping_planner(PlannerInfo *root, bool inheritance_update, current_rel, grouping_target, sort_input_target, + sort_input_target_parallel_safe, tlist, wflists, activeWindows); @@ -2213,6 +2248,7 @@ grouping_planner(PlannerInfo *root, bool inheritance_update, current_rel = create_ordered_paths(root, current_rel, final_target, + final_target_parallel_safe, have_postponed_srfs ? -1.0 : limit_tuples); /* Fix things up if final_target contains SRFs */ @@ -3910,6 +3946,7 @@ static RelOptInfo * create_grouping_paths(PlannerInfo *root, RelOptInfo *input_rel, PathTarget *target, + bool target_parallel_safe, const AggClauseCosts *agg_costs, grouping_sets_data *gd) {// #lizard forgives @@ -3934,8 +3971,7 @@ create_grouping_paths(PlannerInfo *root, * can't be parallel-safe, either. Otherwise, it's parallel-safe if the * target list and HAVING quals are parallel-safe. */ - if (input_rel->consider_parallel && - is_parallel_safe(root, (Node *) target->exprs) && + if (input_rel->consider_parallel && target_parallel_safe && is_parallel_safe(root, (Node *) parse->havingQual)) grouped_rel->consider_parallel = true; @@ -5016,6 +5052,7 @@ create_window_paths(PlannerInfo *root, RelOptInfo *input_rel, PathTarget *input_target, PathTarget *output_target, + bool output_target_parallel_safe, List *tlist, WindowFuncLists *wflists, List *activeWindows) @@ -5031,8 +5068,7 @@ create_window_paths(PlannerInfo *root, * can't be parallel-safe, either. Otherwise, we need to examine the * target list and active windows for non-parallel-safe constructs. 
*/ - if (input_rel->consider_parallel && - is_parallel_safe(root, (Node *) output_target->exprs) && + if (input_rel->consider_parallel && output_target_parallel_safe && is_parallel_safe(root, (Node *) activeWindows)) window_rel->consider_parallel = true; @@ -5433,6 +5469,7 @@ static RelOptInfo * create_ordered_paths(PlannerInfo *root, RelOptInfo *input_rel, PathTarget *target, + bool target_parallel_safe, double limit_tuples) { Path *cheapest_input_path = input_rel->cheapest_total_path; @@ -5447,8 +5484,7 @@ create_ordered_paths(PlannerInfo *root, * can't be parallel-safe, either. Otherwise, it's parallel-safe if the * target list is parallel-safe. */ - if (input_rel->consider_parallel && - is_parallel_safe(root, (Node *) target->exprs)) + if (input_rel->consider_parallel && target_parallel_safe) ordered_rel->consider_parallel = true; /* From 03bb7b4fdf67123a740a34947cd59a2c9623925f Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Mon, 29 Jun 2020 10:30:39 +0800 Subject: [PATCH 240/578] Let Parallel Append over simple UNION ALL have partial subpaths. http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/backend/optimizer/path/allpaths.c | 22 +++++++ src/backend/optimizer/plan/planner.c | 16 +++++ src/backend/optimizer/plan/subselect.c | 17 ++++- src/test/regress/expected/select_parallel.out | 65 +++++++++++++++++++ .../regress/expected/select_parallel_1.out | 65 +++++++++++++++++++ .../regress/expected/select_parallel_2.out | 65 +++++++++++++++++++ .../regress/expected/select_parallel_3.out | 65 +++++++++++++++++++ .../regress/expected/select_parallel_4.out | 65 +++++++++++++++++++ .../regress/expected/select_parallel_5.out | 65 +++++++++++++++++++ src/test/regress/sql/select_parallel.sql | 25 +++++++ 10 files changed, 468 insertions(+), 2 deletions(-) diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c index 947c75f3..f5516316 100644 --- a/src/backend/optimizer/path/allpaths.c +++ b/src/backend/optimizer/path/allpaths.c @@ -2106,6 +2106,28 @@ set_subquery_pathlist(PlannerInfo *root, RelOptInfo *rel, pathkeys, required_outer, distribution)); } + + /* If consider_parallel is false, there should be no partial paths. */ + Assert(sub_final_rel->consider_parallel || + sub_final_rel->partial_pathlist == NIL); + + /* Same for partial paths. */ + foreach(lc, sub_final_rel->partial_pathlist) + { + Path *subpath = (Path *) lfirst(lc); + List *pathkeys; + + /* Convert subpath's pathkeys to outer representation */ + pathkeys = convert_subquery_pathkeys(root, + rel, + subpath->pathkeys, + make_tlist_from_pathtarget(subpath->pathtarget)); + + /* Generate outer path using this subpath */ + add_partial_path(rel, (Path *) + create_subqueryscan_path(root, rel, subpath, + pathkeys, required_outer)); + } } /* diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index e2cf7b37..aa85504d 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -2432,6 +2432,22 @@ grouping_planner(PlannerInfo *root, bool inheritance_update, } /* + * Generate partial paths for final_rel, too, if outer query levels might + * be able to make use of them. 
+ */ + if (final_rel->consider_parallel && root->query_level > 1 && + !limit_needed(parse)) + { + Assert(!parse->rowMarks && parse->commandType == CMD_SELECT); + foreach(lc, current_rel->partial_pathlist) + { + Path *partial_path = (Path *) lfirst(lc); + + add_partial_path(final_rel, partial_path); + } + } + + /* * If there is an FDW that's responsible for all baserels of the query, * let it consider adding ForeignPaths. */ diff --git a/src/backend/optimizer/plan/subselect.c b/src/backend/optimizer/plan/subselect.c index 98ed5c26..bcdbe3da 100644 --- a/src/backend/optimizer/plan/subselect.c +++ b/src/backend/optimizer/plan/subselect.c @@ -4934,6 +4934,13 @@ SS_charge_for_initplans(PlannerInfo *root, RelOptInfo *final_rel) path->parallel_safe = false; } + /* + * Forget about any partial paths and clear consider_parallel, too; + * they're not usable if we attached an initPlan. + */ + final_rel->partial_pathlist = NIL; + final_rel->consider_parallel = false; + /* We needn't do set_cheapest() here, caller will do it */ } @@ -5134,10 +5141,16 @@ finalize_plan(PlannerInfo *root, Plan *plan, Bitmapset *valid_params, { SubqueryScan *sscan = (SubqueryScan *) plan; RelOptInfo *rel; + Bitmapset *subquery_params; - /* We must run SS_finalize_plan on the subquery */ + /* We must run finalize_plan on the subquery */ rel = find_base_rel(root, sscan->scan.scanrelid); - SS_finalize_plan(rel->subroot, sscan->subplan); + subquery_params = rel->subroot->outer_params; + if (gather_param >= 0) + subquery_params = bms_add_member(bms_copy(subquery_params), + gather_param); + finalize_plan(rel->subroot, sscan->subplan, gather_param, + subquery_params, NULL); /* Now we can add its extParams to the parent's params */ context.paramids = bms_add_members(context.paramids, diff --git a/src/test/regress/expected/select_parallel.out b/src/test/regress/expected/select_parallel.out index ef57a7a7..69b17579 100644 --- a/src/test/regress/expected/select_parallel.out +++ b/src/test/regress/expected/select_parallel.out @@ -338,4 +338,69 @@ EXPLAIN (analyze, timing off, summary off, costs off) SELECT * FROM tenk1; select stringu1::int2 from tenk1 where unique1 = 1; ERROR: invalid input syntax for integer: "BAAAAA" CONTEXT: parallel worker +-- test interaction with set-returning functions +SAVEPOINT settings; +-- multiple subqueries under a single Gather node +-- must set parallel_setup_cost > 0 to discourage multiple Gather nodes +SET LOCAL parallel_setup_cost = 10; +EXPLAIN (COSTS OFF) +SELECT unique1 FROM tenk1 WHERE fivethous = tenthous + 1 +UNION ALL +SELECT unique1 FROM tenk1 WHERE fivethous = tenthous + 1; + QUERY PLAN +---------------------------------------------------- + Gather + Workers Planned: 4 + -> Parallel Append + -> Parallel Seq Scan on tenk1 + Filter: (fivethous = (tenthous + 1)) + -> Parallel Seq Scan on tenk1 tenk1_1 + Filter: (fivethous = (tenthous + 1)) +(7 rows) + +ROLLBACK TO SAVEPOINT settings; +-- can't use multiple subqueries under a single Gather node due to initPlans +EXPLAIN (COSTS OFF) +SELECT unique1 FROM tenk1 WHERE fivethous = + (SELECT unique1 FROM tenk1 WHERE fivethous = 1 LIMIT 1) +UNION ALL +SELECT unique1 FROM tenk1 WHERE fivethous = + (SELECT unique2 FROM tenk1 WHERE fivethous = 1 LIMIT 1) +ORDER BY 1; + QUERY PLAN +-------------------------------------------------------------------- + Sort + Sort Key: tenk1.unique1 + -> Append + -> Gather + Workers Planned: 4 + Params Evaluated: $1 + InitPlan 1 (returns $1) + -> Limit + -> Gather + Workers Planned: 4 + -> Parallel Seq Scan on tenk1 
tenk1_2 + Filter: (fivethous = 1) + -> Parallel Seq Scan on tenk1 + Filter: (fivethous = $1) + -> Gather + Workers Planned: 4 + Params Evaluated: $3 + InitPlan 2 (returns $3) + -> Limit + -> Gather + Workers Planned: 4 + -> Parallel Seq Scan on tenk1 tenk1_3 + Filter: (fivethous = 1) + -> Parallel Seq Scan on tenk1 tenk1_1 + Filter: (fivethous = $3) +(25 rows) + +-- test interaction with SRFs +SELECT * FROM information_schema.foreign_data_wrapper_options +ORDER BY 1, 2, 3; + foreign_data_wrapper_catalog | foreign_data_wrapper_name | option_name | option_value +------------------------------+---------------------------+-------------+-------------- +(0 rows) + rollback; diff --git a/src/test/regress/expected/select_parallel_1.out b/src/test/regress/expected/select_parallel_1.out index 2f089381..0bc4ec2a 100644 --- a/src/test/regress/expected/select_parallel_1.out +++ b/src/test/regress/expected/select_parallel_1.out @@ -343,4 +343,69 @@ EXPLAIN (analyze, timing off, summary off, costs off) SELECT * FROM tenk1; -- provoke error in worker select stringu1::int2 from tenk1 where unique1 = 1; ERROR: invalid input syntax for integer: "BAAAAA" +-- test interaction with set-returning functions +SAVEPOINT settings; +-- multiple subqueries under a single Gather node +-- must set parallel_setup_cost > 0 to discourage multiple Gather nodes +SET LOCAL parallel_setup_cost = 10; +EXPLAIN (COSTS OFF) +SELECT unique1 FROM tenk1 WHERE fivethous = tenthous + 1 +UNION ALL +SELECT unique1 FROM tenk1 WHERE fivethous = tenthous + 1; + QUERY PLAN +---------------------------------------------------- + Gather + Workers Planned: 4 + -> Parallel Append + -> Parallel Seq Scan on tenk1 + Filter: (fivethous = (tenthous + 1)) + -> Parallel Seq Scan on tenk1 tenk1_1 + Filter: (fivethous = (tenthous + 1)) +(7 rows) + +ROLLBACK TO SAVEPOINT settings; +-- can't use multiple subqueries under a single Gather node due to initPlans +EXPLAIN (COSTS OFF) +SELECT unique1 FROM tenk1 WHERE fivethous = + (SELECT unique1 FROM tenk1 WHERE fivethous = 1 LIMIT 1) +UNION ALL +SELECT unique1 FROM tenk1 WHERE fivethous = + (SELECT unique2 FROM tenk1 WHERE fivethous = 1 LIMIT 1) +ORDER BY 1; + QUERY PLAN +-------------------------------------------------------------------- + Sort + Sort Key: tenk1.unique1 + -> Append + -> Gather + Workers Planned: 4 + Params Evaluated: $1 + InitPlan 1 (returns $1) + -> Limit + -> Gather + Workers Planned: 4 + -> Parallel Seq Scan on tenk1 tenk1_2 + Filter: (fivethous = 1) + -> Parallel Seq Scan on tenk1 + Filter: (fivethous = $1) + -> Gather + Workers Planned: 4 + Params Evaluated: $3 + InitPlan 2 (returns $3) + -> Limit + -> Gather + Workers Planned: 4 + -> Parallel Seq Scan on tenk1 tenk1_3 + Filter: (fivethous = 1) + -> Parallel Seq Scan on tenk1 tenk1_1 + Filter: (fivethous = $3) +(25 rows) + +-- test interaction with SRFs +SELECT * FROM information_schema.foreign_data_wrapper_options +ORDER BY 1, 2, 3; + foreign_data_wrapper_catalog | foreign_data_wrapper_name | option_name | option_value +------------------------------+---------------------------+-------------+-------------- +(0 rows) + rollback; diff --git a/src/test/regress/expected/select_parallel_2.out b/src/test/regress/expected/select_parallel_2.out index 36bff2d7..0c81d6f4 100644 --- a/src/test/regress/expected/select_parallel_2.out +++ b/src/test/regress/expected/select_parallel_2.out @@ -347,4 +347,69 @@ EXPLAIN (analyze, timing off, summary off, costs off) SELECT * FROM tenk1; -- provoke error in worker select stringu1::int2 from tenk1 where 
unique1 = 1; ERROR: invalid input syntax for integer: "BAAAAA" +-- test interaction with set-returning functions +SAVEPOINT settings; +-- multiple subqueries under a single Gather node +-- must set parallel_setup_cost > 0 to discourage multiple Gather nodes +SET LOCAL parallel_setup_cost = 10; +EXPLAIN (COSTS OFF) +SELECT unique1 FROM tenk1 WHERE fivethous = tenthous + 1 +UNION ALL +SELECT unique1 FROM tenk1 WHERE fivethous = tenthous + 1; + QUERY PLAN +---------------------------------------------------- + Gather + Workers Planned: 4 + -> Parallel Append + -> Parallel Seq Scan on tenk1 + Filter: (fivethous = (tenthous + 1)) + -> Parallel Seq Scan on tenk1 tenk1_1 + Filter: (fivethous = (tenthous + 1)) +(7 rows) + +ROLLBACK TO SAVEPOINT settings; +-- can't use multiple subqueries under a single Gather node due to initPlans +EXPLAIN (COSTS OFF) +SELECT unique1 FROM tenk1 WHERE fivethous = + (SELECT unique1 FROM tenk1 WHERE fivethous = 1 LIMIT 1) +UNION ALL +SELECT unique1 FROM tenk1 WHERE fivethous = + (SELECT unique2 FROM tenk1 WHERE fivethous = 1 LIMIT 1) +ORDER BY 1; + QUERY PLAN +-------------------------------------------------------------------- + Sort + Sort Key: tenk1.unique1 + -> Append + -> Gather + Workers Planned: 4 + Params Evaluated: $1 + InitPlan 1 (returns $1) + -> Limit + -> Gather + Workers Planned: 4 + -> Parallel Seq Scan on tenk1 tenk1_2 + Filter: (fivethous = 1) + -> Parallel Seq Scan on tenk1 + Filter: (fivethous = $1) + -> Gather + Workers Planned: 4 + Params Evaluated: $3 + InitPlan 2 (returns $3) + -> Limit + -> Gather + Workers Planned: 4 + -> Parallel Seq Scan on tenk1 tenk1_3 + Filter: (fivethous = 1) + -> Parallel Seq Scan on tenk1 tenk1_1 + Filter: (fivethous = $3) +(25 rows) + +-- test interaction with SRFs +SELECT * FROM information_schema.foreign_data_wrapper_options +ORDER BY 1, 2, 3; + foreign_data_wrapper_catalog | foreign_data_wrapper_name | option_name | option_value +------------------------------+---------------------------+-------------+-------------- +(0 rows) + rollback; diff --git a/src/test/regress/expected/select_parallel_3.out b/src/test/regress/expected/select_parallel_3.out index a4717f62..8566355f 100644 --- a/src/test/regress/expected/select_parallel_3.out +++ b/src/test/regress/expected/select_parallel_3.out @@ -343,4 +343,69 @@ EXPLAIN (analyze, timing off, summary off, costs off) SELECT * FROM tenk1; -- provoke error in worker select stringu1::int2 from tenk1 where unique1 = 1; ERROR: invalid input syntax for integer: "BAAAAA" +-- test interaction with set-returning functions +SAVEPOINT settings; +-- multiple subqueries under a single Gather node +-- must set parallel_setup_cost > 0 to discourage multiple Gather nodes +SET LOCAL parallel_setup_cost = 10; +EXPLAIN (COSTS OFF) +SELECT unique1 FROM tenk1 WHERE fivethous = tenthous + 1 +UNION ALL +SELECT unique1 FROM tenk1 WHERE fivethous = tenthous + 1; + QUERY PLAN +---------------------------------------------------- + Gather + Workers Planned: 4 + -> Parallel Append + -> Parallel Seq Scan on tenk1 + Filter: (fivethous = (tenthous + 1)) + -> Parallel Seq Scan on tenk1 tenk1_1 + Filter: (fivethous = (tenthous + 1)) +(7 rows) + +ROLLBACK TO SAVEPOINT settings; +-- can't use multiple subqueries under a single Gather node due to initPlans +EXPLAIN (COSTS OFF) +SELECT unique1 FROM tenk1 WHERE fivethous = + (SELECT unique1 FROM tenk1 WHERE fivethous = 1 LIMIT 1) +UNION ALL +SELECT unique1 FROM tenk1 WHERE fivethous = + (SELECT unique2 FROM tenk1 WHERE fivethous = 1 LIMIT 1) +ORDER BY 1; + 
QUERY PLAN +-------------------------------------------------------------------- + Sort + Sort Key: tenk1.unique1 + -> Append + -> Gather + Workers Planned: 4 + Params Evaluated: $1 + InitPlan 1 (returns $1) + -> Limit + -> Gather + Workers Planned: 4 + -> Parallel Seq Scan on tenk1 tenk1_2 + Filter: (fivethous = 1) + -> Parallel Seq Scan on tenk1 + Filter: (fivethous = $1) + -> Gather + Workers Planned: 4 + Params Evaluated: $3 + InitPlan 2 (returns $3) + -> Limit + -> Gather + Workers Planned: 4 + -> Parallel Seq Scan on tenk1 tenk1_3 + Filter: (fivethous = 1) + -> Parallel Seq Scan on tenk1 tenk1_1 + Filter: (fivethous = $3) +(25 rows) + +-- test interaction with SRFs +SELECT * FROM information_schema.foreign_data_wrapper_options +ORDER BY 1, 2, 3; + foreign_data_wrapper_catalog | foreign_data_wrapper_name | option_name | option_value +------------------------------+---------------------------+-------------+-------------- +(0 rows) + rollback; diff --git a/src/test/regress/expected/select_parallel_4.out b/src/test/regress/expected/select_parallel_4.out index 684d4989..93228c2e 100644 --- a/src/test/regress/expected/select_parallel_4.out +++ b/src/test/regress/expected/select_parallel_4.out @@ -380,4 +380,69 @@ SELECT xc_node_id != 0 FROM t_worker_identifier; -- provoke error in worker select stringu1::int2 from tenk1 where unique1 = 1; ERROR: invalid input syntax for integer: "BAAAAA" +-- test interaction with set-returning functions +SAVEPOINT settings; +-- multiple subqueries under a single Gather node +-- must set parallel_setup_cost > 0 to discourage multiple Gather nodes +SET LOCAL parallel_setup_cost = 10; +EXPLAIN (COSTS OFF) +SELECT unique1 FROM tenk1 WHERE fivethous = tenthous + 1 +UNION ALL +SELECT unique1 FROM tenk1 WHERE fivethous = tenthous + 1; + QUERY PLAN +---------------------------------------------------- + Gather + Workers Planned: 4 + -> Parallel Append + -> Parallel Seq Scan on tenk1 + Filter: (fivethous = (tenthous + 1)) + -> Parallel Seq Scan on tenk1 tenk1_1 + Filter: (fivethous = (tenthous + 1)) +(7 rows) + +ROLLBACK TO SAVEPOINT settings; +-- can't use multiple subqueries under a single Gather node due to initPlans +EXPLAIN (COSTS OFF) +SELECT unique1 FROM tenk1 WHERE fivethous = + (SELECT unique1 FROM tenk1 WHERE fivethous = 1 LIMIT 1) +UNION ALL +SELECT unique1 FROM tenk1 WHERE fivethous = + (SELECT unique2 FROM tenk1 WHERE fivethous = 1 LIMIT 1) +ORDER BY 1; + QUERY PLAN +-------------------------------------------------------------------- + Sort + Sort Key: tenk1.unique1 + -> Append + -> Gather + Workers Planned: 4 + Params Evaluated: $1 + InitPlan 1 (returns $1) + -> Limit + -> Gather + Workers Planned: 4 + -> Parallel Seq Scan on tenk1 tenk1_2 + Filter: (fivethous = 1) + -> Parallel Seq Scan on tenk1 + Filter: (fivethous = $1) + -> Gather + Workers Planned: 4 + Params Evaluated: $3 + InitPlan 2 (returns $3) + -> Limit + -> Gather + Workers Planned: 4 + -> Parallel Seq Scan on tenk1 tenk1_3 + Filter: (fivethous = 1) + -> Parallel Seq Scan on tenk1 tenk1_1 + Filter: (fivethous = $3) +(25 rows) + +-- test interaction with SRFs +SELECT * FROM information_schema.foreign_data_wrapper_options +ORDER BY 1, 2, 3; + foreign_data_wrapper_catalog | foreign_data_wrapper_name | option_name | option_value +------------------------------+---------------------------+-------------+-------------- +(0 rows) + rollback; diff --git a/src/test/regress/expected/select_parallel_5.out b/src/test/regress/expected/select_parallel_5.out index 94ab46f1..6b20689d 100644 --- 
a/src/test/regress/expected/select_parallel_5.out +++ b/src/test/regress/expected/select_parallel_5.out @@ -343,4 +343,69 @@ EXPLAIN (analyze, timing off, summary off, costs off) SELECT * FROM tenk1; -- provoke error in worker select stringu1::int2 from tenk1 where unique1 = 1; ERROR: invalid input syntax for integer: "BAAAAA" +-- test interaction with set-returning functions +SAVEPOINT settings; +-- multiple subqueries under a single Gather node +-- must set parallel_setup_cost > 0 to discourage multiple Gather nodes +SET LOCAL parallel_setup_cost = 10; +EXPLAIN (COSTS OFF) +SELECT unique1 FROM tenk1 WHERE fivethous = tenthous + 1 +UNION ALL +SELECT unique1 FROM tenk1 WHERE fivethous = tenthous + 1; + QUERY PLAN +---------------------------------------------------- + Gather + Workers Planned: 4 + -> Parallel Append + -> Parallel Seq Scan on tenk1 + Filter: (fivethous = (tenthous + 1)) + -> Parallel Seq Scan on tenk1 tenk1_1 + Filter: (fivethous = (tenthous + 1)) +(7 rows) + +ROLLBACK TO SAVEPOINT settings; +-- can't use multiple subqueries under a single Gather node due to initPlans +EXPLAIN (COSTS OFF) +SELECT unique1 FROM tenk1 WHERE fivethous = + (SELECT unique1 FROM tenk1 WHERE fivethous = 1 LIMIT 1) +UNION ALL +SELECT unique1 FROM tenk1 WHERE fivethous = + (SELECT unique2 FROM tenk1 WHERE fivethous = 1 LIMIT 1) +ORDER BY 1; + QUERY PLAN +-------------------------------------------------------------------- + Sort + Sort Key: tenk1.unique1 + -> Append + -> Gather + Workers Planned: 4 + Params Evaluated: $1 + InitPlan 1 (returns $1) + -> Limit + -> Gather + Workers Planned: 4 + -> Parallel Seq Scan on tenk1 tenk1_2 + Filter: (fivethous = 1) + -> Parallel Seq Scan on tenk1 + Filter: (fivethous = $1) + -> Gather + Workers Planned: 4 + Params Evaluated: $3 + InitPlan 2 (returns $3) + -> Limit + -> Gather + Workers Planned: 4 + -> Parallel Seq Scan on tenk1 tenk1_3 + Filter: (fivethous = 1) + -> Parallel Seq Scan on tenk1 tenk1_1 + Filter: (fivethous = $3) +(25 rows) + +-- test interaction with SRFs +SELECT * FROM information_schema.foreign_data_wrapper_options +ORDER BY 1, 2, 3; + foreign_data_wrapper_catalog | foreign_data_wrapper_name | option_name | option_value +------------------------------+---------------------------+-------------+-------------- +(0 rows) + rollback; diff --git a/src/test/regress/sql/select_parallel.sql b/src/test/regress/sql/select_parallel.sql index d2cca20f..25ee90a1 100644 --- a/src/test/regress/sql/select_parallel.sql +++ b/src/test/regress/sql/select_parallel.sql @@ -145,4 +145,29 @@ SELECT xc_node_id != 0 FROM t_worker_identifier; -- provoke error in worker select stringu1::int2 from tenk1 where unique1 = 1; +-- test interaction with set-returning functions +SAVEPOINT settings; + +-- multiple subqueries under a single Gather node +-- must set parallel_setup_cost > 0 to discourage multiple Gather nodes +SET LOCAL parallel_setup_cost = 10; +EXPLAIN (COSTS OFF) +SELECT unique1 FROM tenk1 WHERE fivethous = tenthous + 1 +UNION ALL +SELECT unique1 FROM tenk1 WHERE fivethous = tenthous + 1; +ROLLBACK TO SAVEPOINT settings; + +-- can't use multiple subqueries under a single Gather node due to initPlans +EXPLAIN (COSTS OFF) +SELECT unique1 FROM tenk1 WHERE fivethous = + (SELECT unique1 FROM tenk1 WHERE fivethous = 1 LIMIT 1) +UNION ALL +SELECT unique1 FROM tenk1 WHERE fivethous = + (SELECT unique2 FROM tenk1 WHERE fivethous = 1 LIMIT 1) +ORDER BY 1; + +-- test interaction with SRFs +SELECT * FROM information_schema.foreign_data_wrapper_options +ORDER BY 1, 2, 3; + 
rollback; From 9fb4ae6d1c7a16bdd422c5fd79529c780a7a4d7f Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Mon, 29 Jun 2020 10:45:01 +0800 Subject: [PATCH 241/578] Pass additional arguments to a couple of grouping-related functions. http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/backend/optimizer/plan/planner.c | 40 +++++++++++++++++----------- 1 file changed, 25 insertions(+), 15 deletions(-) diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index aa85504d..d7357766 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -140,7 +140,8 @@ static List *reorder_grouping_sets(List *groupingSets, List *sortclause); static void standard_qp_callback(PlannerInfo *root, void *extra); static double get_number_of_groups(PlannerInfo *root, double path_rows, - grouping_sets_data *gd); + grouping_sets_data *gd, + List *target_list); static Size estimate_hashagg_tablesize(Path *path, const AggClauseCosts *agg_costs, double dNumGroups); @@ -185,7 +186,8 @@ static RelOptInfo *create_ordered_paths(PlannerInfo *root, static PathTarget *make_group_input_target(PlannerInfo *root, PathTarget *final_target); static PathTarget *make_partial_grouping_target(PlannerInfo *root, - PathTarget *grouping_target); + PathTarget *grouping_target, + Node *havingQual); static List *postprocess_setop_tlist(List *new_tlist, List *orig_tlist); static List *select_active_windows(PlannerInfo *root, WindowFuncLists *wflists); static PathTarget *make_window_input_target(PlannerInfo *root, @@ -3768,7 +3770,8 @@ standard_qp_callback(PlannerInfo *root, void *extra) * Estimate number of groups produced by grouping clauses (1 if not grouping) * * path_rows: number of output rows from scan/join step - * gsets: grouping set data, or NULL if not doing grouping sets + * gd: grouping sets data including list of grouping sets and their clauses + * target_list: target list containing group clause references * * If doing grouping sets, we also annotate the gsets data with the estimates * for each set and each individual rollup list, with a view to later @@ -3777,7 +3780,8 @@ standard_qp_callback(PlannerInfo *root, void *extra) static double get_number_of_groups(PlannerInfo *root, double path_rows, - grouping_sets_data *gd) + grouping_sets_data *gd, + List *target_list) { Query *parse = root->parse; double dNumGroups; @@ -3802,7 +3806,7 @@ get_number_of_groups(PlannerInfo *root, ListCell *lc; groupExprs = get_sortgrouplist_exprs(rollup->groupClause, - parse->targetList); + target_list); rollup->numGroups = 0.0; @@ -3829,7 +3833,7 @@ get_number_of_groups(PlannerInfo *root, gd->dNumHashGroups = 0; groupExprs = get_sortgrouplist_exprs(parse->groupClause, - parse->targetList); + target_list); forboth(lc, gd->hash_sets_idx, lc2, gd->unsortable_sets) { @@ -3851,7 +3855,7 @@ get_number_of_groups(PlannerInfo *root, { /* Plain GROUP BY */ groupExprs = get_sortgrouplist_exprs(parse->groupClause, - parse->targetList); + target_list); dNumGroups = estimate_num_groups(root, groupExprs, path_rows, NULL); @@ -4070,7 +4074,8 @@ create_grouping_paths(PlannerInfo *root, */ dNumGroups = get_number_of_groups(root, cheapest_path->rows, - gd); + gd, + parse->targetList); /* * Determine whether it's possible to perform sort-based implementations @@ -4176,7 +4181,8 @@ create_grouping_paths(PlannerInfo *root, * appear in the result tlist, and (2) the Aggrefs must be set in * partial mode. 
*/ - partial_grouping_target = make_partial_grouping_target(root, target); + partial_grouping_target = make_partial_grouping_target(root, target, + (Node *) parse->havingQual); /* * Collect statistics about aggregates for estimating costs of @@ -4274,7 +4280,8 @@ create_grouping_paths(PlannerInfo *root, */ if (try_distributed_aggregation) { - partial_grouping_target = make_partial_grouping_target(root, target); + partial_grouping_target = make_partial_grouping_target(root, target, + (Node *) parse->havingQual); /* Estimate number of partial groups. */ dNumPartialGroups = get_number_of_groups(root, @@ -5725,10 +5732,12 @@ make_group_input_target(PlannerInfo *root, PathTarget *final_target) * these would be Vars that are grouped by or used in grouping expressions.) * * grouping_target is the tlist to be emitted by the topmost aggregation step. - * We get the HAVING clause out of *root. + * havingQual represents the HAVING clause. */ static PathTarget * -make_partial_grouping_target(PlannerInfo *root, PathTarget *grouping_target) +make_partial_grouping_target(PlannerInfo *root, + PathTarget *grouping_target, + Node *havingQual) { Query *parse = root->parse; PathTarget *partial_target; @@ -5770,8 +5779,8 @@ make_partial_grouping_target(PlannerInfo *root, PathTarget *grouping_target) /* * If there's a HAVING clause, we'll need the Vars/Aggrefs it uses, too. */ - if (parse->havingQual) - non_group_cols = lappend(non_group_cols, parse->havingQual); + if (havingQual) + non_group_cols = lappend(non_group_cols, havingQual); /* * Pull out all the Vars, PlaceHolderVars, and Aggrefs mentioned in @@ -7892,7 +7901,8 @@ add_partial_paths_to_grouping_rel(PlannerInfo *root, /* Estimate number of partial groups. */ dNumPartialGroups = get_number_of_groups(root, cheapest_partial_path->rows, - gd); + gd, + parse->targetList); if (can_sort) { From 7e471cc3bf984e351786a8171a9809fa4535ebf3 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Mon, 29 Jun 2020 11:22:23 +0800 Subject: [PATCH 242/578] Split create_grouping_paths into degenerate and non-degenerate cases. --- src/backend/optimizer/plan/planner.c | 161 ++++++++++++++++++--------- 1 file changed, 109 insertions(+), 52 deletions(-) diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index d7357766..8723fe38 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -151,6 +151,16 @@ static RelOptInfo *create_grouping_paths(PlannerInfo *root, bool target_parallel_safe, const AggClauseCosts *agg_costs, grouping_sets_data *gd); +static bool is_degenerate_grouping(PlannerInfo *root); +static void create_degenerate_grouping_paths(PlannerInfo *root, + RelOptInfo *input_rel, + PathTarget *target, RelOptInfo *grouped_rel); +static void create_ordinary_grouping_paths(PlannerInfo *root, + RelOptInfo *input_rel, + PathTarget *target, RelOptInfo *grouped_rel, + RelOptInfo *partially_grouped_rel, + const AggClauseCosts *agg_costs, + grouping_sets_data *gd); static void consider_groupingsets_paths(PlannerInfo *root, RelOptInfo *grouped_rel, Path *path, @@ -3956,11 +3966,6 @@ estimate_hashagg_entrysize(Path *path, const AggClauseCosts *agg_costs, * * Note: all Paths in input_rel are expected to return the target computed * by make_group_input_target. 
- * - * We need to consider sorted and hashed aggregation in the same function, - * because otherwise (1) it would be harder to throw an appropriate error - * message if neither way works, and (2) we should not allow hashtable size - * considerations to dissuade us from using hashing if sorting is not possible. */ static RelOptInfo * create_grouping_paths(PlannerInfo *root, @@ -3971,15 +3976,8 @@ create_grouping_paths(PlannerInfo *root, grouping_sets_data *gd) {// #lizard forgives Query *parse = root->parse; - Path *cheapest_path = input_rel->cheapest_total_path; RelOptInfo *grouped_rel; PathTarget *partial_grouping_target = NULL; - AggClauseCosts agg_partial_costs; /* parallel only */ - AggClauseCosts agg_final_costs; /* parallel only */ - double dNumGroups; - bool can_hash; - bool can_sort; - bool try_parallel_aggregation; bool try_distributed_aggregation; @@ -4004,35 +4002,85 @@ create_grouping_paths(PlannerInfo *root, grouped_rel->fdwroutine = input_rel->fdwroutine; /* - * Check for degenerate grouping. + * Create either paths for a degenerate grouping or paths for ordinary + * grouping, as appropriate. */ - if ((root->hasHavingQual || parse->groupingSets) && - !parse->hasAggs && parse->groupClause == NIL) + if (is_degenerate_grouping(root)) + { + create_degenerate_grouping_paths(root, input_rel, target, grouped_rel); + + /* No need to consider any other alternatives. */ + set_cheapest(grouped_rel); + return grouped_rel; + } + else { + create_ordinary_grouping_paths(root, input_rel, target, grouped_rel, + partially_grouped_rel, agg_costs, gd); + + /* Now choose the best path(s) */ + set_cheapest(grouped_rel); + + /* + * We've been using the partial pathlist for the grouped relation to hold + * partially aggregated paths, but that's actually a little bit bogus + * because it's unsafe for later planning stages -- like ordered_rel --- + * to get the idea that they can use these partial paths as if they didn't + * need a FinalizeAggregate step. Zap the partial pathlist at this stage + * so we don't get confused. + */ + grouped_rel->partial_pathlist = NIL; + + return grouped_rel; + + } +} + +/* + * is_degenerate_grouping + * + * A degenerate grouping is one in which the query has a HAVING qual and/or + * grouping sets, but no aggregates and no GROUP BY (which implies that the + * grouping sets are all empty). + */ +static bool +is_degenerate_grouping(PlannerInfo *root) +{ + Query *parse = root->parse; + + return (root->hasHavingQual || parse->groupingSets) && + !parse->hasAggs && parse->groupClause == NIL; +} + /* - * We have a HAVING qual and/or grouping sets, but no aggregates and - * no GROUP BY (which implies that the grouping sets are all empty). + * create_degenerate_grouping_paths * - * This is a degenerate case in which we are supposed to emit either - * zero or one row for each grouping set depending on whether HAVING - * succeeds. Furthermore, there cannot be any variables in either - * HAVING or the targetlist, so we actually do not need the FROM table - * at all! We can just throw away the plan-so-far and generate a - * Result node. This is a sufficiently unusual corner case that it's - * not worth contorting the structure of this module to avoid having - * to generate the earlier paths in the first place. + * When the grouping is degenerate (see is_degenerate_grouping), we are + * supposed to emit either zero or one row for each grouping set depending on + * whether HAVING succeeds. 
Furthermore, there cannot be any variables in + * either HAVING or the targetlist, so we actually do not need the FROM table + * at all! We can just throw away the plan-so-far and generate a Result node. + * This is a sufficiently unusual corner case that it's not worth contorting + * the structure of this module to avoid having to generate the earlier paths + * in the first place. */ - int nrows = list_length(parse->groupingSets); +static void +create_degenerate_grouping_paths(PlannerInfo *root, RelOptInfo *input_rel, + PathTarget *target, RelOptInfo *grouped_rel) +{ + Query *parse = root->parse; + int nrows; Path *path; + nrows = list_length(parse->groupingSets); if (nrows > 1) { /* - * Doesn't seem worthwhile writing code to cons up a - * generate_series or a values scan to emit multiple rows. Instead - * just make N clones and append them. (With a volatile HAVING - * clause, this means you might get between 0 and N output rows. - * Offhand I think that's desired.) + * Doesn't seem worthwhile writing code to cons up a generate_series + * or a values scan to emit multiple rows. Instead just make N clones + * and append them. (With a volatile HAVING clause, this means you + * might get between 0 and N output rows. Offhand I think that's + * desired.) */ List *paths = NIL; @@ -4047,9 +4095,12 @@ create_grouping_paths(PlannerInfo *root, path = (Path *) create_append_path(grouped_rel, paths, + NIL, NULL, 0, - NIL); + false, + NIL, + -1); path->pathtarget = target; } else @@ -4060,16 +4111,36 @@ create_grouping_paths(PlannerInfo *root, target, (List *) parse->havingQual); } - add_path(grouped_rel, path); - - /* No need to consider any other alternatives. */ - set_cheapest(grouped_rel); - - return grouped_rel; } /* + * create_ordinary_grouping_paths + * + * Create grouping paths for the ordinary (that is, non-degenerate) case. + * + * We need to consider sorted and hashed aggregation in the same function, + * because otherwise (1) it would be harder to throw an appropriate error + * message if neither way works, and (2) we should not allow hashtable size + * considerations to dissuade us from using hashing if sorting is not possible. + */ +static void +create_ordinary_grouping_paths(PlannerInfo *root, RelOptInfo *input_rel, + PathTarget *target, RelOptInfo *grouped_rel, + RelOptInfo *partially_grouped_rel, + const AggClauseCosts *agg_costs, + grouping_sets_data *gd) +{ + Query *parse = root->parse; + Path *cheapest_path = input_rel->cheapest_total_path; + AggClauseCosts agg_partial_costs; /* parallel only */ + AggClauseCosts agg_final_costs; /* parallel only */ + double dNumGroups; + bool can_hash; + bool can_sort; + bool try_parallel_aggregation; + + /* * Estimate number of groups. */ dNumGroups = get_number_of_groups(root, @@ -4676,20 +4747,6 @@ create_grouping_paths(PlannerInfo *root, if (create_upper_paths_hook) (*create_upper_paths_hook) (root, UPPERREL_GROUP_AGG, input_rel, grouped_rel); - - /* Now choose the best path(s) */ - set_cheapest(grouped_rel); - /* - * We've been using the partial pathlist for the grouped relation to hold - * partially aggregated paths, but that's actually a little bit bogus - * because it's unsafe for later planning stages -- like ordered_rel --- - * to get the idea that they can use these partial paths as if they didn't - * need a FinalizeAggregate step. Zap the partial pathlist at this stage - * so we don't get confused. 
- */ - grouped_rel->partial_pathlist = NIL; - - return grouped_rel; } From d9302fbfd46acc360a28df0f725a7f35f4bc6276 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Mon, 29 Jun 2020 12:56:01 +0800 Subject: [PATCH 243/578] Avoid creating a TOAST table for a partitioned table. 2. fix planner.c bug. http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/backend/optimizer/plan/planner.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index 8723fe38..7538b6be 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -158,7 +158,6 @@ static void create_degenerate_grouping_paths(PlannerInfo *root, static void create_ordinary_grouping_paths(PlannerInfo *root, RelOptInfo *input_rel, PathTarget *target, RelOptInfo *grouped_rel, - RelOptInfo *partially_grouped_rel, const AggClauseCosts *agg_costs, grouping_sets_data *gd); static void consider_groupingsets_paths(PlannerInfo *root, @@ -3977,8 +3976,6 @@ create_grouping_paths(PlannerInfo *root, {// #lizard forgives Query *parse = root->parse; RelOptInfo *grouped_rel; - PathTarget *partial_grouping_target = NULL; - bool try_distributed_aggregation; /* For now, do all work in the (GROUP_AGG, NULL) upperrel */ @@ -4016,7 +4013,7 @@ create_grouping_paths(PlannerInfo *root, else { create_ordinary_grouping_paths(root, input_rel, target, grouped_rel, - partially_grouped_rel, agg_costs, gd); + agg_costs, gd); /* Now choose the best path(s) */ set_cheapest(grouped_rel); @@ -4127,7 +4124,6 @@ create_degenerate_grouping_paths(PlannerInfo *root, RelOptInfo *input_rel, static void create_ordinary_grouping_paths(PlannerInfo *root, RelOptInfo *input_rel, PathTarget *target, RelOptInfo *grouped_rel, - RelOptInfo *partially_grouped_rel, const AggClauseCosts *agg_costs, grouping_sets_data *gd) { @@ -4139,6 +4135,8 @@ create_ordinary_grouping_paths(PlannerInfo *root, RelOptInfo *input_rel, bool can_hash; bool can_sort; bool try_parallel_aggregation; + bool try_distributed_aggregation; + PathTarget *partial_grouping_target = NULL; /* * Estimate number of groups. From d72d845d02dd43ad0341e0c362fa94271ae86da3 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Mon, 29 Jun 2020 14:38:59 +0800 Subject: [PATCH 244/578] Enforce child constraints during COPY TO a partitioned table. 
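A minimal sketch of the behaviour this change is about, using invented table names (it is not part of the patch or its tests): when COPY loads rows through a partitioned parent, the constraints that get checked must be those of the partition each row is routed to, since the parent's own tuple descriptor may carry no constraints at all.

    CREATE TABLE measurement (city int, reading int) PARTITION BY LIST (city);
    CREATE TABLE measurement_p1 PARTITION OF measurement FOR VALUES IN (1);
    ALTER TABLE measurement_p1 ADD CHECK (reading >= 0);

    COPY measurement FROM STDIN;
    1	-5
    \.

With the child's constraint enforced, the negative reading should be rejected even though the COPY targets the parent.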
--- src/backend/commands/copy.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c index 321b44a1..e3e67988 100644 --- a/src/backend/commands/copy.c +++ b/src/backend/commands/copy.c @@ -3565,7 +3565,8 @@ CopyFrom(CopyState cstate) check_partition_constr = false; /* Check the constraints of the tuple */ - if (cstate->rel->rd_att->constr || check_partition_constr) + if (resultRelInfo->ri_RelationDesc->rd_att->constr || + check_partition_constr) ExecConstraints(resultRelInfo, slot, estate, true); #ifdef _MLS_ From 54e24d887537475c3d8f1053f53c838cfdfddf8a Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Mon, 29 Jun 2020 15:50:24 +0800 Subject: [PATCH 245/578] Faster partition pruning.http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/backend/Makefile | 2 +- src/backend/catalog/partition.c | 133 +- src/backend/nodes/copyfuncs.c | 56 +- src/backend/nodes/equalfuncs.c | 13 - src/backend/nodes/nodeFuncs.c | 25 + src/backend/nodes/outfuncs.c | 44 +- src/backend/nodes/readfuncs.c | 31 + src/backend/optimizer/path/allpaths.c | 112 +- src/backend/optimizer/path/indxpath.c | 4 +- src/backend/optimizer/plan/planner.c | 100 +- src/backend/optimizer/prep/prepunion.c | 47 +- src/backend/optimizer/util/plancat.c | 48 +- src/backend/optimizer/util/relnode.c | 8 + src/backend/partitioning/Makefile | 17 + src/backend/partitioning/partprune.c | 2782 +++++++++++++++++ src/include/catalog/partition.h | 3 +- src/include/catalog/pg_opfamily.h | 3 + src/include/nodes/nodes.h | 4 +- src/include/nodes/primnodes.h | 75 + src/include/nodes/relation.h | 38 +- src/include/optimizer/planner.h | 5 - src/include/partitioning/partbounds.h | 124 + src/include/partitioning/partprune.h | 49 + src/test/regress/expected/inherit.out | 4 +- src/test/regress/expected/inherit_1.out | 4 +- src/test/regress/expected/inherit_2.out | 4 +- src/test/regress/expected/inherit_3.out | 4 +- src/test/regress/expected/partition_prune.out | 493 ++- src/test/regress/sql/partition_prune.sql | 123 +- 29 files changed, 3971 insertions(+), 384 deletions(-) create mode 100644 src/backend/partitioning/Makefile create mode 100644 src/backend/partitioning/partprune.c create mode 100644 src/include/partitioning/partbounds.h create mode 100644 src/include/partitioning/partprune.h diff --git a/src/backend/Makefile b/src/backend/Makefile index 4c35043b..75b093d8 100644 --- a/src/backend/Makefile +++ b/src/backend/Makefile @@ -22,7 +22,7 @@ override CFLAGS += $(PTHREAD_CFLAGS) endif SUBDIRS = access audit bootstrap catalog contrib parser commands executor foreign lib libpq \ - pgxc main nodes optimizer oracle port postmaster regex replication rewrite \ + pgxc main nodes optimizer partitioning oracle port postmaster regex replication rewrite \ statistics storage tcop tsearch utils $(top_builddir)/src/timezone $(top_builddir)/src/interfaces/libpq include $(srcdir)/common.mk diff --git a/src/backend/catalog/partition.c b/src/backend/catalog/partition.c index bf765697..f74a88f0 100644 --- a/src/backend/catalog/partition.c +++ b/src/backend/catalog/partition.c @@ -41,6 +41,7 @@ #include "optimizer/prep.h" #include "optimizer/var.h" #include "parser/parse_coerce.h" +#include "partitioning/partbounds.h" #include "rewrite/rewriteManip.h" #include "storage/lmgr.h" #include "utils/array.h" @@ -55,88 +56,6 @@ #include "utils/ruleutils.h" #include "utils/syscache.h" -/* - * Information about bounds of a partitioned relation - * - * A list partition datum 
that is known to be NULL is never put into the - * datums array. Instead, it is tracked using the null_index field. - * - * In the case of range partitioning, ndatums will typically be far less than - * 2 * nparts, because a partition's upper bound and the next partition's lower - * bound are the same in most common cases, and we only store one of them (the - * upper bound). In case of hash partitioning, ndatums will be same as the - * number of partitions. - * - * For range and list partitioned tables, datums is an array of datum-tuples - * with key->partnatts datums each. For hash partitioned tables, it is an array - * of datum-tuples with 2 datums, modulus and remainder, corresponding to a - * given partition. - * - * The datums in datums array are arranged in increasing order as defined by - * functions qsort_partition_rbound_cmp(), qsort_partition_list_value_cmp() and - * qsort_partition_hbound_cmp() for range, list and hash partitioned tables - * respectively. For range and list partitions this simply means that the - * datums in the datums array are arranged in increasing order as defined by - * the partition key's operator classes and collations. - * - * In the case of list partitioning, the indexes array stores one entry for - * every datum, which is the index of the partition that accepts a given datum. - * In case of range partitioning, it stores one entry per distinct range - * datum, which is the index of the partition for which a given datum - * is an upper bound. In the case of hash partitioning, the number of the - * entries in the indexes array is same as the greatest modulus amongst all - * partitions. For a given partition key datum-tuple, the index of the - * partition which would accept that datum-tuple would be given by the entry - * pointed by remainder produced when hash value of the datum-tuple is divided - * by the greatest modulus. - */ - -typedef struct PartitionBoundInfoData -{ - char strategy; /* hash, list or range? */ - int ndatums; /* Length of the datums following array */ - Datum **datums; - PartitionRangeDatumKind **kind; /* The kind of each range bound datum; - * NULL for hash and list partitioned - * tables */ - int *indexes; /* Partition indexes */ - int null_index; /* Index of the null-accepting partition; -1 - * if there isn't one */ - int default_index; /* Index of the default partition; -1 if there - * isn't one */ -} PartitionBoundInfoData; - -#define partition_bound_accepts_nulls(bi) ((bi)->null_index != -1) -#define partition_bound_has_default(bi) ((bi)->default_index != -1) - -/* - * When qsort'ing partition bounds after reading from the catalog, each bound - * is represented with one of the following structs. 
- */ - -/* One bound of a hash partition */ -typedef struct PartitionHashBound -{ - int modulus; - int remainder; - int index; -} PartitionHashBound; - -/* One value coming from some (index'th) list partition */ -typedef struct PartitionListValue -{ - int index; - Datum value; -} PartitionListValue; - -/* One bound of a range partition */ -typedef struct PartitionRangeBound -{ - int index; - Datum *datums; /* range bound datums */ - PartitionRangeDatumKind *kind; /* the kind of each datum */ - bool lower; /* this is the lower (vs upper) bound */ -} PartitionRangeBound; static int32 qsort_partition_hbound_cmp(const void *a, const void *b); static int32 qsort_partition_list_value_cmp(const void *a, const void *b, @@ -169,29 +88,8 @@ static int32 partition_rbound_cmp(int partnatts, FmgrInfo *partsupfunc, Oid *partcollation, Datum *datums1, PartitionRangeDatumKind *kind1, bool lower1, PartitionRangeBound *b2); -static int32 partition_rbound_datum_cmp(FmgrInfo *partsupfunc, - Oid *partcollation, - Datum *rb_datums, PartitionRangeDatumKind *rb_kind, - Datum *tuple_datums, int n_tuple_datums); - -static int partition_list_bsearch(FmgrInfo *partsupfunc, Oid *partcollation, - PartitionBoundInfo boundinfo, - Datum value, bool *is_equal); -static int partition_range_bsearch(int partnatts, FmgrInfo *partsupfunc, - Oid *partcollation, - PartitionBoundInfo boundinfo, - PartitionRangeBound *probe, bool *is_equal); -static int partition_range_datum_bsearch(FmgrInfo *partsupfunc, - Oid *partcollation, - PartitionBoundInfo boundinfo, - int nvalues, Datum *values, bool *is_equal); -static int partition_hash_bsearch(PartitionBoundInfo boundinfo, - int modulus, int remainder); static int get_partition_bound_num_indexes(PartitionBoundInfo b); -static int get_greatest_modulus(PartitionBoundInfo b); -static uint64 compute_hash_value(int partnatts, FmgrInfo *partsupfunc, - Datum *values, bool *isnull); /* SQL-callable function for use in hash partition CHECK constraints */ PG_FUNCTION_INFO_V1(satisfies_hash_partition); @@ -802,13 +700,13 @@ partition_bounds_equal(int partnatts, int16 *parttyplen, bool *parttypbyval, if (b1->strategy == PARTITION_STRATEGY_HASH) { - int greatest_modulus = get_greatest_modulus(b1); + int greatest_modulus = get_hash_partition_greatest_modulus(b1); /* * If two hash partitioned tables have different greatest moduli, * their partition schemes don't match. 
*/ - if (greatest_modulus != get_greatest_modulus(b2)) + if (greatest_modulus != get_hash_partition_greatest_modulus(b2)) return false; /* @@ -1068,7 +966,7 @@ check_new_partition_bound(char *relname, Relation parent, (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), errmsg("every hash partition modulus must be a factor of the next larger modulus"))); - greatest_modulus = get_greatest_modulus(boundinfo); + greatest_modulus = get_hash_partition_greatest_modulus(boundinfo); remainder = spec->remainder; /* @@ -1597,7 +1495,6 @@ get_partition_qual_relid(Oid relid) return result; } -/* Module-local functions */ /* * get_partition_operator @@ -2575,7 +2472,7 @@ get_partition_for_tuple(Relation relation, Datum *values, bool *isnull) case PARTITION_STRATEGY_HASH: { PartitionBoundInfo boundinfo = partdesc->boundinfo; - int greatest_modulus = get_greatest_modulus(boundinfo); + int greatest_modulus = get_hash_partition_greatest_modulus(boundinfo); uint64 rowHash = compute_hash_value(key->partnatts, key->partsupfunc, values, isnull); @@ -2910,7 +2807,7 @@ partition_rbound_cmp(int partnatts, FmgrInfo *partsupfunc, Oid *partcollation, * of attributes resp. * */ -static int32 +int32 partition_rbound_datum_cmp(FmgrInfo *partsupfunc, Oid *partcollation, Datum *rb_datums, PartitionRangeDatumKind *rb_kind, Datum *tuple_datums, int n_tuple_datums) @@ -2944,7 +2841,7 @@ partition_rbound_datum_cmp(FmgrInfo *partsupfunc, Oid *partcollation, * *is_equal is set to true if the bound datum at the returned index is equal * to the input value. */ -static int +int partition_list_bsearch(FmgrInfo *partsupfunc, Oid *partcollation, PartitionBoundInfo boundinfo, Datum value, bool *is_equal) @@ -2987,7 +2884,7 @@ partition_list_bsearch(FmgrInfo *partsupfunc, Oid *partcollation, * *is_equal is set to true if the range bound at the returned index is equal * to the input range bound */ -static int +int partition_range_bsearch(int partnatts, FmgrInfo *partsupfunc, Oid *partcollation, PartitionBoundInfo boundinfo, @@ -3032,7 +2929,7 @@ partition_range_bsearch(int partnatts, FmgrInfo *partsupfunc, * *is_equal is set to true if the range bound at the returned index is equal * to the input tuple. */ -static int +int partition_range_datum_bsearch(FmgrInfo *partsupfunc, Oid *partcollation, PartitionBoundInfo boundinfo, int nvalues, Datum *values, bool *is_equal) @@ -3075,7 +2972,7 @@ partition_range_datum_bsearch(FmgrInfo *partsupfunc, Oid *partcollation, * less than or equal to the given (modulus, remainder) pair or -1 if * all of them are greater */ -static int +int partition_hash_bsearch(PartitionBoundInfo boundinfo, int modulus, int remainder) { @@ -3231,7 +3128,7 @@ get_partition_bound_num_indexes(PartitionBoundInfo bound) * The number of the entries in the indexes array is same as the * greatest modulus. */ - num_indexes = get_greatest_modulus(bound); + num_indexes = get_hash_partition_greatest_modulus(bound); break; case PARTITION_STRATEGY_LIST: @@ -3252,14 +3149,14 @@ get_partition_bound_num_indexes(PartitionBoundInfo bound) } /* - * get_greatest_modulus + * get_hash_partition_greatest_modulus * * Returns the greatest modulus of the hash partition bound. The greatest * modulus will be at the end of the datums array because hash partitions are * arranged in the ascending order of their modulus and remainders. 
*/ -static int -get_greatest_modulus(PartitionBoundInfo bound) +int +get_hash_partition_greatest_modulus(PartitionBoundInfo bound) { Assert(bound && bound->strategy == PARTITION_STRATEGY_HASH); Assert(bound->datums && bound->ndatums > 0); @@ -3273,7 +3170,7 @@ get_greatest_modulus(PartitionBoundInfo bound) * * Compute the hash value for given not null partition key values. */ -static uint64 +uint64 compute_hash_value(int partnatts, FmgrInfo *partsupfunc, Datum *values, bool *isnull) { diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c index b55431d6..e87c8463 100644 --- a/src/backend/nodes/copyfuncs.c +++ b/src/backend/nodes/copyfuncs.c @@ -2402,6 +2402,38 @@ _copyOnConflictExpr(const OnConflictExpr *from) return newnode; } +/* + * _copyPartitionPruneStepOp + */ +static PartitionPruneStepOp * +_copyPartitionPruneStepOp(const PartitionPruneStepOp *from) +{ + PartitionPruneStepOp *newnode = makeNode(PartitionPruneStepOp); + + COPY_SCALAR_FIELD(step.step_id); + COPY_SCALAR_FIELD(opstrategy); + COPY_NODE_FIELD(exprs); + COPY_NODE_FIELD(cmpfns); + COPY_BITMAPSET_FIELD(nullkeys); + + return newnode; +} + +/* + * _copyPartitionPruneStepCombine + */ +static PartitionPruneStepCombine * +_copyPartitionPruneStepCombine(const PartitionPruneStepCombine *from) +{ + PartitionPruneStepCombine *newnode = makeNode(PartitionPruneStepCombine); + + COPY_SCALAR_FIELD(step.step_id); + COPY_SCALAR_FIELD(combineOp); + COPY_NODE_FIELD(source_stepids); + + return newnode; +} + /* **************************************************************** * relation.h copy functions * @@ -2527,21 +2559,6 @@ _copyAppendRelInfo(const AppendRelInfo *from) return newnode; } -/* - * _copyPartitionedChildRelInfo - */ -static PartitionedChildRelInfo * -_copyPartitionedChildRelInfo(const PartitionedChildRelInfo *from) -{ - PartitionedChildRelInfo *newnode = makeNode(PartitionedChildRelInfo); - - COPY_SCALAR_FIELD(parent_relid); - COPY_NODE_FIELD(child_rels); - COPY_SCALAR_FIELD(part_cols_updated); - - return newnode; -} - /* * _copyPlaceHolderInfo */ @@ -5649,6 +5666,12 @@ copyObjectImpl(const void *from) case T_OnConflictExpr: retval = _copyOnConflictExpr(from); break; + case T_PartitionPruneStepOp: + retval = _copyPartitionPruneStepOp(from); + break; + case T_PartitionPruneStepCombine: + retval = _copyPartitionPruneStepCombine(from); + break; /* * RELATION NODES @@ -5668,9 +5691,6 @@ copyObjectImpl(const void *from) case T_AppendRelInfo: retval = _copyAppendRelInfo(from); break; - case T_PartitionedChildRelInfo: - retval = _copyPartitionedChildRelInfo(from); - break; case T_PlaceHolderInfo: retval = _copyPlaceHolderInfo(from); break; diff --git a/src/backend/nodes/equalfuncs.c b/src/backend/nodes/equalfuncs.c index 0e47737a..8abab4bb 100644 --- a/src/backend/nodes/equalfuncs.c +++ b/src/backend/nodes/equalfuncs.c @@ -931,16 +931,6 @@ _equalAppendRelInfo(const AppendRelInfo *a, const AppendRelInfo *b) return true; } -static bool -_equalPartitionedChildRelInfo(const PartitionedChildRelInfo *a, const PartitionedChildRelInfo *b) -{ - COMPARE_SCALAR_FIELD(parent_relid); - COMPARE_NODE_FIELD(child_rels); - COMPARE_SCALAR_FIELD(part_cols_updated); - - return true; -} - static bool _equalPlaceHolderInfo(const PlaceHolderInfo *a, const PlaceHolderInfo *b) { @@ -3397,9 +3387,6 @@ equal(const void *a, const void *b) case T_AppendRelInfo: retval = _equalAppendRelInfo(a, b); break; - case T_PartitionedChildRelInfo: - retval = _equalPartitionedChildRelInfo(a, b); - break; case T_PlaceHolderInfo: retval = 
_equalPlaceHolderInfo(a, b); break; diff --git a/src/backend/nodes/nodeFuncs.c b/src/backend/nodes/nodeFuncs.c index a7ab020e..8a10e344 100644 --- a/src/backend/nodes/nodeFuncs.c +++ b/src/backend/nodes/nodeFuncs.c @@ -2147,6 +2147,17 @@ expression_tree_walker(Node *node, return true; } break; + case T_PartitionPruneStepOp: + { + PartitionPruneStepOp *opstep = (PartitionPruneStepOp *) node; + + if (walker((Node *) opstep->exprs, context)) + return true; + } + break; + case T_PartitionPruneStepCombine: + /* no expression subnodes */ + break; case T_JoinExpr: { bool left_arg_ret = false; @@ -2977,6 +2988,20 @@ expression_tree_mutator(Node *node, return (Node *) newnode; } break; + case T_PartitionPruneStepOp: + { + PartitionPruneStepOp *opstep = (PartitionPruneStepOp *) node; + PartitionPruneStepOp *newnode; + + FLATCOPY(newnode, opstep, PartitionPruneStepOp); + MUTATE(newnode->exprs, opstep->exprs, List *); + + return (Node *) newnode; + } + break; + case T_PartitionPruneStepCombine: + /* no expression sub-nodes */ + return (Node *) copyObject(node); case T_JoinExpr: { JoinExpr *join = (JoinExpr *) node; diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c index 610c2fae..24ca2109 100644 --- a/src/backend/nodes/outfuncs.c +++ b/src/backend/nodes/outfuncs.c @@ -2978,6 +2978,28 @@ _outFromExpr(StringInfo str, const FromExpr *node) WRITE_NODE_FIELD(quals); } +static void +_outPartitionPruneStepOp(StringInfo str, const PartitionPruneStepOp *node) +{ + WRITE_NODE_TYPE("PARTITIONPRUNESTEPOP"); + + WRITE_INT_FIELD(step.step_id); + WRITE_INT_FIELD(opstrategy); + WRITE_NODE_FIELD(exprs); + WRITE_NODE_FIELD(cmpfns); + WRITE_BITMAPSET_FIELD(nullkeys); +} + +static void +_outPartitionPruneStepCombine(StringInfo str, const PartitionPruneStepCombine *node) +{ + WRITE_NODE_TYPE("PARTITIONPRUNESTEPCOMBINE"); + + WRITE_INT_FIELD(step.step_id); + WRITE_ENUM_FIELD(combineOp, PartitionPruneCombineOp); + WRITE_NODE_FIELD(source_stepids); +} + static void _outOnConflictExpr(StringInfo str, const OnConflictExpr *node) { @@ -3527,7 +3549,6 @@ _outPlannerInfo(StringInfo str, const PlannerInfo *node) WRITE_NODE_FIELD(full_join_clauses); WRITE_NODE_FIELD(join_info_list); WRITE_NODE_FIELD(append_rel_list); - WRITE_NODE_FIELD(pcinfo_list); WRITE_NODE_FIELD(rowMarks); WRITE_NODE_FIELD(placeholder_list); WRITE_NODE_FIELD(fkey_list); @@ -3552,6 +3573,7 @@ _outPlannerInfo(StringInfo str, const PlannerInfo *node) WRITE_INT_FIELD(wt_param_id); WRITE_BITMAPSET_FIELD(curOuterRels); WRITE_NODE_FIELD(curOuterParams); + WRITE_BOOL_FIELD(partColsUpdated); #ifdef __TBASE__ WRITE_BOOL_FIELD(haspart_tobe_modify); WRITE_UINT_FIELD(partrelindex); @@ -3606,6 +3628,7 @@ _outRelOptInfo(StringInfo str, const RelOptInfo *node) WRITE_NODE_FIELD(joininfo); WRITE_BOOL_FIELD(has_eclass_joins); WRITE_BITMAPSET_FIELD(top_parent_relids); + WRITE_NODE_FIELD(partitioned_child_rels); #ifdef __TBASE__ WRITE_BOOL_FIELD(intervalparent); WRITE_BOOL_FIELD(isdefault); @@ -3854,16 +3877,6 @@ _outAppendRelInfo(StringInfo str, const AppendRelInfo *node) WRITE_OID_FIELD(parent_reloid); } -static void -_outPartitionedChildRelInfo(StringInfo str, const PartitionedChildRelInfo *node) -{ - WRITE_NODE_TYPE("PARTITIONEDCHILDRELINFO"); - - WRITE_UINT_FIELD(parent_relid); - WRITE_NODE_FIELD(child_rels); - WRITE_BOOL_FIELD(part_cols_updated); -} - static void _outPlaceHolderInfo(StringInfo str, const PlaceHolderInfo *node) { @@ -5423,6 +5436,12 @@ outNode(StringInfo str, const void *obj) case T_OnConflictExpr: _outOnConflictExpr(str, obj); break; + 
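The new pruning-step nodes get the usual _copy/_out/_read and tree-walker support so they behave like any other Node; the only invariant worth calling out is that the field lists written by the _out functions and consumed by the _read functions must mirror each other exactly. The toy roundtrip below illustrates that pairing with a made-up two-field node; it is a standalone sketch, not the real outfuncs/readfuncs machinery.

#include <stdio.h>

/* Toy stand-in with the same two leading fields the new
 * PartitionPruneStepOp serializes: step_id and opstrategy. */
typedef struct { int step_id; int opstrategy; } ToyStep;

static void
out_step(char *buf, size_t len, const ToyStep *s)
{
    snprintf(buf, len, "{TOYSTEP :step_id %d :opstrategy %d}",
             s->step_id, s->opstrategy);
}

static ToyStep
read_step(const char *buf)
{
    ToyStep s = {0, 0};

    /* The read side must consume fields in exactly the order the
     * write side emitted them. */
    sscanf(buf, "{TOYSTEP :step_id %d :opstrategy %d}",
           &s.step_id, &s.opstrategy);
    return s;
}

int
main(void)
{
    ToyStep in = {3, 1}, out;
    char buf[64];

    out_step(buf, sizeof(buf), &in);
    out = read_step(buf);
    printf("%s -> step_id=%d opstrategy=%d\n", buf, out.step_id, out.opstrategy);
    return 0;
}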
case T_PartitionPruneStepOp: + _outPartitionPruneStepOp(str, obj); + break; + case T_PartitionPruneStepCombine: + _outPartitionPruneStepCombine(str, obj); + break; case T_Path: _outPath(str, obj); break; @@ -5564,9 +5583,6 @@ outNode(StringInfo str, const void *obj) case T_AppendRelInfo: _outAppendRelInfo(str, obj); break; - case T_PartitionedChildRelInfo: - _outPartitionedChildRelInfo(str, obj); - break; case T_PlaceHolderInfo: _outPlaceHolderInfo(str, obj); break; diff --git a/src/backend/nodes/readfuncs.c b/src/backend/nodes/readfuncs.c index e0744408..72e9a6fa 100644 --- a/src/backend/nodes/readfuncs.c +++ b/src/backend/nodes/readfuncs.c @@ -2074,6 +2074,33 @@ _readOnConflictExpr(void) READ_DONE(); } +static PartitionPruneStepOp * +_readPartitionPruneStepOp(void) +{ + READ_LOCALS(PartitionPruneStepOp); + + READ_INT_FIELD(step.step_id); + READ_INT_FIELD(opstrategy); + READ_NODE_FIELD(exprs); + READ_NODE_FIELD(cmpfns); + READ_BITMAPSET_FIELD(nullkeys); + + READ_DONE(); +} + +static PartitionPruneStepCombine * +_readPartitionPruneStepCombine(void) +{ + READ_LOCALS(PartitionPruneStepCombine); + + READ_INT_FIELD(step.step_id); + READ_ENUM_FIELD(combineOp, PartitionPruneCombineOp); + READ_NODE_FIELD(source_stepids); + + READ_DONE(); +} + + /* * Stuff from parsenodes.h. */ @@ -4349,6 +4376,10 @@ parseNodeString(void) return_value = _readFromExpr(); else if (MATCH("ONCONFLICTEXPR", 14)) return_value = _readOnConflictExpr(); + else if (MATCH("PARTITIONPRUNESTEPOP", 20)) + return_value = _readPartitionPruneStepOp(); + else if (MATCH("PARTITIONPRUNESTEPCOMBINE", 25)) + return_value = _readPartitionPruneStepCombine(); else if (MATCH("RTE", 3)) return_value = _readRangeTblEntry(); else if (MATCH("RANGETBLFUNCTION", 16)) diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c index f5516316..59663d81 100644 --- a/src/backend/optimizer/path/allpaths.c +++ b/src/backend/optimizer/path/allpaths.c @@ -45,6 +45,7 @@ #include "optimizer/var.h" #include "parser/parse_clause.h" #include "parser/parsetree.h" +#include "partitioning/partprune.h" #include "pgxc/nodemgr.h" #ifdef PGXC #include "nodes/makefuncs.h" @@ -887,6 +888,8 @@ set_append_rel_size(PlannerInfo *root, RelOptInfo *rel, double *parent_attrsizes; int nattrs; ListCell *l; + Relids live_children = NULL; + bool did_pruning = false; /* Guard against stack overflow due to overly deep inheritance tree. */ check_stack_depth(); @@ -894,6 +897,31 @@ set_append_rel_size(PlannerInfo *root, RelOptInfo *rel, Assert(IS_SIMPLE_REL(rel)); /* + * Initialize partitioned_child_rels to contain this RT index. + * + * Note that during the set_append_rel_pathlist() phase, we will bubble up + * the indexes of partitioned relations that appear down in the tree, so + * that when we've created Paths for all the children, the root + * partitioned table's list will contain all such indexes. + */ + if (rte->relkind == RELKIND_PARTITIONED_TABLE) + rel->partitioned_child_rels = list_make1_int(rti); + + /* + * If the partitioned relation has any baserestrictinfo quals then we + * attempt to use these quals to prune away partitions that cannot + * possibly contain any tuples matching these quals. In this case we'll + * store the relids of all partitions which could possibly contain a + * matching tuple, and skip anything else in the loop below. 
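Once the live set has been computed, the per-child work in set_append_rel_size reduces to a membership test: a child outside the set is marked dummy exactly like a child excluded by constraints, and its size estimation is skipped. A rough standalone sketch of that filtering pattern follows; the bitmask and child count are hypothetical and this is not the planner's real RelOptInfo handling.

#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>

#define NCHILDREN 6

/* Hypothetical "live children" set produced by the pruning pass,
 * represented here as a plain bitmask over child indexes. */
static uint32_t live_children = (1u << 1) | (1u << 4);
static bool did_pruning = true;

int
main(void)
{
    int child;

    for (child = 0; child < NCHILDREN; child++)
    {
        if (did_pruning && !(live_children & (1u << child)))
        {
            /* Pruned: the patch gives such a child an empty ("dummy")
             * path list via set_dummy_rel_pathlist() and moves on. */
            printf("child %d: pruned, marked dummy\n", child);
            continue;
        }
        printf("child %d: kept, planned normally\n", child);
    }
    return 0;
}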
+ */ + if (rte->relkind == RELKIND_PARTITIONED_TABLE && + rel->baserestrictinfo != NIL) + { + live_children = prune_append_rel_partitions(rel); + did_pruning = true; + } + + /* * Initialize to compute size estimates for whole append relation. * * We handle width estimates by weighting the widths of different child @@ -1141,6 +1169,13 @@ set_append_rel_size(PlannerInfo *root, RelOptInfo *rel, continue; } + if (did_pruning && !bms_is_member(appinfo->child_relid, live_children)) + { + /* This partition was pruned; skip it. */ + set_dummy_rel_pathlist(childrel); + continue; + } + if (relation_excluded_by_constraints(root, childrel, childRTE)) { /* @@ -1322,6 +1357,12 @@ set_append_rel_pathlist(PlannerInfo *root, RelOptInfo *rel, if (IS_DUMMY_REL(childrel)) continue; + /* Bubble up childrel's partitioned children. */ + if (rel->part_scheme) + rel->partitioned_child_rels = + list_concat(rel->partitioned_child_rels, + list_copy(childrel->partitioned_child_rels)); + /* * Child is live, so add it to the live_childrels list for use below. */ @@ -1356,48 +1397,54 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel, List *all_child_outers = NIL; ListCell *l; List *partitioned_rels = NIL; - RangeTblEntry *rte; bool build_partitioned_rels = false; + /* + * AppendPath generated for partitioned tables must record the RT indexes + * of partitioned tables that are direct or indirect children of this + * Append rel. + * + * AppendPath may be for a sub-query RTE (UNION ALL), in which case, 'rel' + * itself does not represent a partitioned relation, but the child sub- + * queries may contain references to partitioned relations. The loop + * below will look for such children and collect them in a list to be + * passed to the path creation function. (This assumes that we don't need + * to look through multiple levels of subquery RTEs; if we ever do, we + * could consider stuffing the list we generate here into sub-query RTE's + * RelOptInfo, just like we do for partitioned rels, which would be used + * when populating our parent rel with paths. For the present, that + * appears to be unnecessary.) + */ + if (rel->part_scheme != NULL) + { if (IS_SIMPLE_REL(rel)) + partitioned_rels = rel->partitioned_child_rels; + else if (IS_JOIN_REL(rel)) { + int relid = -1; + /* - * A root partition will already have a PartitionedChildRelInfo, and a - * non-root partitioned table doesn't need one, because its Append - * paths will get flattened into the parent anyway. For a subquery - * RTE, no PartitionedChildRelInfo exists; we collect all - * partitioned_rels associated with any child. (This assumes that we - * don't need to look through multiple levels of subquery RTEs; if we - * ever do, we could create a PartitionedChildRelInfo with the - * accumulated list of partitioned_rels which would then be found when - * populated our parent rel with paths. For the present, that appears - * to be unnecessary.) + * For a partitioned joinrel, concatenate the component rels' + * partitioned_child_rels lists. 
*/ - rte = planner_rt_fetch(rel->relid, root); - switch (rte->rtekind) + while ((relid = bms_next_member(rel->relids, relid)) >= 0) { - case RTE_RELATION: - if (rte->relkind == RELKIND_PARTITIONED_TABLE) + RelOptInfo *component; + + Assert(relid >= 1 && relid < root->simple_rel_array_size); + component = root->simple_rel_array[relid]; + Assert(component->part_scheme != NULL); + Assert(list_length(component->partitioned_child_rels) >= 1); partitioned_rels = - get_partitioned_child_rels(root, rel->relid, NULL); - break; - case RTE_SUBQUERY: - build_partitioned_rels = true; - break; - default: - elog(ERROR, "unexpcted rtekind: %d", (int) rte->rtekind); + list_concat(partitioned_rels, + list_copy(component->partitioned_child_rels)); } } - else if (rel->reloptkind == RELOPT_JOINREL && rel->part_scheme) - { - /* - * Associate PartitionedChildRelInfo of the root partitioned tables - * being joined with the root partitioned join (indicated by - * RELOPT_JOINREL). - */ - partitioned_rels = get_partitioned_child_rels_for_join(root, - rel->relids); + + Assert(list_length(partitioned_rels) >= 1); } + else if (rel->rtekind == RTE_SUBQUERY) + build_partitioned_rels = true; /* * For every non-dummy child, remember the cheapest path. Also, identify @@ -1415,9 +1462,8 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel, */ if (build_partitioned_rels) { - List *cprels; + List *cprels = childrel->partitioned_child_rels; - cprels = get_partitioned_child_rels(root, childrel->relid, NULL); partitioned_rels = list_concat(partitioned_rels, list_copy(cprels)); } diff --git a/src/backend/optimizer/path/indxpath.c b/src/backend/optimizer/path/indxpath.c index 4d3b3cce..40d0757d 100644 --- a/src/backend/optimizer/path/indxpath.c +++ b/src/backend/optimizer/path/indxpath.c @@ -40,9 +40,7 @@ #include "utils/selfuncs.h" -#define IsBooleanOpfamily(opfamily) \ - ((opfamily) == BOOL_BTREE_FAM_OID || (opfamily) == BOOL_HASH_FAM_OID) - +/* XXX see PartCollMatchesExprColl */ #define IndexCollMatchesExprColl(idxcollation, exprcollation) \ ((idxcollation) == InvalidOid || (idxcollation) == (exprcollation)) diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index 7538b6be..3e3065cd 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -628,7 +628,6 @@ subquery_planner(PlannerGlobal *glob, Query *parse, root->multiexpr_params = NIL; root->eq_classes = NIL; root->append_rel_list = NIL; - root->pcinfo_list = NIL; root->rowMarks = NIL; memset(root->upper_rels, 0, sizeof(root->upper_rels)); memset(root->upper_targets, 0, sizeof(root->upper_targets)); @@ -648,6 +647,7 @@ subquery_planner(PlannerGlobal *glob, Query *parse, else root->wt_param_id = -1; root->non_recursive_path = NULL; + root->partColsUpdated = false; /* * If there is a WITH list, process each WITH query and either convert it @@ -1264,12 +1264,12 @@ inheritance_planner(PlannerInfo *root) ListCell *lc; Index rti; RangeTblEntry *parent_rte; + Relids partitioned_relids = NULL; List *partitioned_rels = NIL; PlannerInfo *parent_root; Query *parent_parse; Bitmapset *parent_relids = bms_make_singleton(top_parentRTindex); PlannerInfo **parent_roots = NULL; - bool partColsUpdated = false; Assert(parse->commandType != CMD_INSERT); @@ -1341,10 +1341,12 @@ inheritance_planner(PlannerInfo *root) if (parent_rte->relkind == RELKIND_PARTITIONED_TABLE) { nominalRelation = top_parentRTindex; - partitioned_rels = get_partitioned_child_rels(root, top_parentRTindex, - &partColsUpdated); - /* The root 
partitioned table is included as a child rel */ - Assert(list_length(partitioned_rels) >= 1); + + /* + * Root parent's RT index is always present in the partitioned_rels of + * the ModifyTable node, if one is needed at all. + */ + partitioned_relids = bms_make_singleton(top_parentRTindex); } /* @@ -1575,6 +1577,15 @@ inheritance_planner(PlannerInfo *root) if (IS_DUMMY_PATH(subpath)) continue; + /* + * Add the current parent's RT index to the partitione_rels set if + * we're going to create the ModifyTable path for a partitioned root + * table. + */ + if (partitioned_relids) + partitioned_relids = bms_add_member(partitioned_relids, + appinfo->parent_relid); + #ifdef XCP /* * All subplans should have the same distribution, except may be @@ -1709,6 +1720,21 @@ inheritance_planner(PlannerInfo *root) else rowMarks = root->rowMarks; + if (partitioned_relids) + { + int i; + + i = -1; + while ((i = bms_next_member(partitioned_relids, i)) >= 0) + partitioned_rels = lappend_int(partitioned_rels, i); + + /* + * If we're going to create ModifyTable at all, the list should + * contain at least one member, that is, the root parent's index. + */ + Assert(list_length(partitioned_rels) >= 1); + } + /* Create Path representing a ModifyTable to do the UPDATE/DELETE work */ add_path(final_rel, (Path *) create_modifytable_path(root, final_rel, @@ -1716,7 +1742,7 @@ inheritance_planner(PlannerInfo *root) parse->canSetTag, nominalRelation, partitioned_rels, - partColsUpdated, + root->partColsUpdated, resultRelations, subpaths, subroots, @@ -6802,66 +6828,6 @@ grouping_distribution_match(PlannerInfo *root, Query *parse, Path *path, return matches_key; } -/* - * get_partitioned_child_rels - * Returns a list of the RT indexes of the partitioned child relations - * with rti as the root parent RT index. Also sets - * *part_cols_updated to true if any of the root rte's updated - * columns is used in the partition key either of the relation whose RTI - * is specified or of any child relation. - * - * Note: This function might get called even for range table entries that - * are not partitioned tables; in such a case, it will simply return NIL. - */ -List * -get_partitioned_child_rels(PlannerInfo *root, Index rti, - bool *part_cols_updated) -{ - List *result = NIL; - ListCell *l; - - if (part_cols_updated) - *part_cols_updated = false; - - foreach(l, root->pcinfo_list) - { - PartitionedChildRelInfo *pc = lfirst(l); - - if (pc->parent_relid == rti) - { - result = pc->child_rels; - if (part_cols_updated) - *part_cols_updated = pc->part_cols_updated; - break; - } -} - - return result; -} - - -/* - * get_partitioned_child_rels_for_join - * Build and return a list containing the RTI of every partitioned - * relation which is a child of some rel included in the join. 
- */ -List * -get_partitioned_child_rels_for_join(PlannerInfo *root, Relids join_relids) -{ - List *result = NIL; - ListCell *l; - - foreach(l, root->pcinfo_list) - { - PartitionedChildRelInfo *pc = lfirst(l); - - if (bms_is_member(pc->parent_relid, join_relids)) - result = list_concat(result, list_copy(pc->child_rels)); - } - - return result; -} - /* * add_paths_to_grouping_rel * diff --git a/src/backend/optimizer/prep/prepunion.c b/src/backend/optimizer/prep/prepunion.c index c40a38ee..d2e6c3c6 100644 --- a/src/backend/optimizer/prep/prepunion.c +++ b/src/backend/optimizer/prep/prepunion.c @@ -105,8 +105,7 @@ static void expand_partitioned_rtentry(PlannerInfo *root, RangeTblEntry *parentrte, Index parentRTindex, Relation parentrel, PlanRowMark *top_parentrc, LOCKMODE lockmode, - List **appinfos, List **partitioned_child_rels, - bool *part_cols_updated); + List **appinfos); static void expand_single_inheritance_child(PlannerInfo *root, RangeTblEntry *parentrte, Index parentRTindex, Relation parentrel, @@ -1543,9 +1542,6 @@ expand_inherited_rtentry(PlannerInfo *root, RangeTblEntry *rte, Index rti) /* Scan the inheritance set and expand it */ if (RelationGetPartitionDesc(oldrelation) != NULL) { - List *partitioned_child_rels = NIL; - bool part_cols_updated = false; - Assert(rte->relkind == RELKIND_PARTITIONED_TABLE); /* @@ -1554,28 +1550,7 @@ expand_inherited_rtentry(PlannerInfo *root, RangeTblEntry *rte, Index rti) * extract the partition key columns of all the partitioned tables. */ expand_partitioned_rtentry(root, rte, rti, oldrelation, oldrc, - lockmode, &root->append_rel_list, - &partitioned_child_rels, - &part_cols_updated); - - /* - * We keep a list of objects in root, each of which maps a root - * partitioned parent RT index to the list of RT indexes of descendant - * partitioned child tables. When creating an Append or a ModifyTable - * path for the parent, we copy the child RT index list verbatim to - * the path so that it could be carried over to the executor so that - * the latter could identify the partitioned child tables. - */ - if (rte->inh && partitioned_child_rels != NIL) - { - PartitionedChildRelInfo *pcinfo; - - pcinfo = makeNode(PartitionedChildRelInfo); - pcinfo->parent_relid = rti; - pcinfo->child_rels = partitioned_child_rels; - pcinfo->part_cols_updated = part_cols_updated; - root->pcinfo_list = lappend(root->pcinfo_list, pcinfo); - } + lockmode, &root->append_rel_list); } else { @@ -1650,8 +1625,7 @@ static void expand_partitioned_rtentry(PlannerInfo *root, RangeTblEntry *parentrte, Index parentRTindex, Relation parentrel, PlanRowMark *top_parentrc, LOCKMODE lockmode, - List **appinfos, List **partitioned_child_rels, - bool *part_cols_updated) + List **appinfos) { int i; RangeTblEntry *childrte; @@ -1673,8 +1647,8 @@ expand_partitioned_rtentry(PlannerInfo *root, RangeTblEntry *parentrte, * parentrte already has the root partrel's updatedCols translated to match * the attribute ordering of parentrel. */ - if (!*part_cols_updated) - *part_cols_updated = + if (!root->partColsUpdated) + root->partColsUpdated = has_partition_attrs(parentrel, parentrte->updatedCols, NULL); /* First expand the partitioned table itself. */ @@ -1682,14 +1656,6 @@ expand_partitioned_rtentry(PlannerInfo *root, RangeTblEntry *parentrte, top_parentrc, parentrel, appinfos, &childrte, &childRTindex); - /* - * The partitioned table does not have data for itself but still need to - * be locked. Update given list of partitioned children with RTI of this - * partitioned relation. 
- */ - *partitioned_child_rels = lappend_int(*partitioned_child_rels, - childRTindex); - for (i = 0; i < partdesc->nparts; i++) { Oid childOID = partdesc->oids[i]; @@ -1716,8 +1682,7 @@ expand_partitioned_rtentry(PlannerInfo *root, RangeTblEntry *parentrte, if (childrel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) expand_partitioned_rtentry(root, childrte, childRTindex, childrel, top_parentrc, lockmode, - appinfos, partitioned_child_rels, - part_cols_updated); + appinfos); /* Close child relation, but keep locks */ heap_close(childrel, NoLock); diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c index e310e85b..6abe45f5 100644 --- a/src/backend/optimizer/util/plancat.c +++ b/src/backend/optimizer/util/plancat.c @@ -1297,7 +1297,6 @@ get_relation_constraints(PlannerInfo *root, Index varno = rel->relid; Relation relation; TupleConstr *constr; - List *pcqual; /* * We assume the relation has already been safely locked. @@ -1383,16 +1382,27 @@ get_relation_constraints(PlannerInfo *root, } } - /* Append partition predicates, if any */ - pcqual = RelationGetPartitionQual(relation); + /* + * Append partition predicates, if any. + * + * For selects, partition pruning uses the parent table's partition bound + * descriptor, instead of constraint exclusion which is driven by the + * individual partition's partition constraint. + */ + if (root->parse->commandType != CMD_SELECT) + { + List *pcqual = RelationGetPartitionQual(relation); + if (pcqual) { /* - * Run each expression through const-simplification and - * canonicalization similar to check constraints. + * Run the partition quals through const-simplification similar to + * check constraints. We skip canonicalize_qual, though, because + * partition quals should be in canonical form already; also, + * since the qual is in implicit-AND format, we'd have to + * explicitly convert it to explicit-AND format and back again. */ pcqual = (List *) eval_const_expressions(root, (Node *) pcqual); - pcqual = (List *) canonicalize_qual((Expr *) pcqual); /* Fix Vars to have the desired varno */ if (varno != 1) @@ -1400,6 +1410,7 @@ get_relation_constraints(PlannerInfo *root, result = list_concat(result, pcqual); } + } heap_close(relation, NoLock); @@ -1999,6 +2010,7 @@ set_relation_partition_info(PlannerInfo *root, RelOptInfo *rel, rel->boundinfo = partition_bounds_copy(partdesc->boundinfo, partkey); rel->nparts = partdesc->nparts; set_baserel_partition_key_exprs(relation, rel); + rel->partition_qual = RelationGetPartitionQual(relation); } /* @@ -2011,7 +2023,8 @@ find_partition_scheme(PlannerInfo *root, Relation relation) { PartitionKey partkey = RelationGetPartitionKey(relation); ListCell *lc; - int partnatts; + int partnatts, + i; PartitionScheme part_scheme; /* A partitioned table should have a partition key. */ @@ -2029,7 +2042,7 @@ find_partition_scheme(PlannerInfo *root, Relation relation) partnatts != part_scheme->partnatts) continue; - /* Match the partition key types. */ + /* Match partition key type properties. */ if (memcmp(partkey->partopfamily, part_scheme->partopfamily, sizeof(Oid) * partnatts) != 0 || memcmp(partkey->partopcintype, part_scheme->partopcintype, @@ -2047,6 +2060,19 @@ find_partition_scheme(PlannerInfo *root, Relation relation) Assert(memcmp(partkey->parttypbyval, part_scheme->parttypbyval, sizeof(bool) * partnatts) == 0); + /* + * If partopfamily and partopcintype matched, must have the same + * partition comparison functions. 
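find_partition_scheme canonicalizes the partitioning metadata so that relations partitioned the same way end up sharing one PartitionScheme pointer (and, with this patch, a copied set of partition support functions), which lets later code test "same partitioning" by pointer equality. The following bare-bones sketch shows only the lookup-or-create pattern, with plain arrays and memcmp standing in for the catalog data; the values are invented and bounds checks are omitted.

#include <stdio.h>
#include <string.h>

#define MAX_SCHEMES 8
#define PARTNATTS   2

typedef struct
{
    char strategy;
    unsigned int opfamily[PARTNATTS];    /* stand-ins for Oid arrays */
    unsigned int opcintype[PARTNATTS];
} Scheme;

static Scheme schemes[MAX_SCHEMES];
static int    nschemes = 0;

/* Return the canonical scheme matching the given key info, creating it
 * on first use -- the same lookup-or-create idea as the real function. */
static Scheme *
find_scheme(char strategy, const unsigned int *opfamily,
            const unsigned int *opcintype)
{
    int i;

    for (i = 0; i < nschemes; i++)
    {
        if (schemes[i].strategy == strategy &&
            memcmp(schemes[i].opfamily, opfamily,
                   sizeof(schemes[i].opfamily)) == 0 &&
            memcmp(schemes[i].opcintype, opcintype,
                   sizeof(schemes[i].opcintype)) == 0)
            return &schemes[i];
    }

    schemes[nschemes].strategy = strategy;
    memcpy(schemes[nschemes].opfamily, opfamily, sizeof(schemes[0].opfamily));
    memcpy(schemes[nschemes].opcintype, opcintype, sizeof(schemes[0].opcintype));
    return &schemes[nschemes++];
}

int
main(void)
{
    unsigned int fam[PARTNATTS] = {1976, 1976};   /* made-up values */
    unsigned int typ[PARTNATTS] = {23, 23};
    Scheme *a = find_scheme('r', fam, typ);
    Scheme *b = find_scheme('r', fam, typ);

    printf("same partition scheme: %s\n", a == b ? "yes" : "no");
    return 0;
}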
Note that we cannot reliably + * Assert the equality of function structs themselves for they might + * be different across PartitionKey's, so just Assert for the function + * OIDs. + */ +#ifdef USE_ASSERT_CHECKING + for (i = 0; i < partkey->partnatts; i++) + Assert(partkey->partsupfunc[i].fn_oid == + part_scheme->partsupfunc[i].fn_oid); +#endif + /* Found matching partition scheme. */ return part_scheme; } @@ -2081,6 +2107,12 @@ find_partition_scheme(PlannerInfo *root, Relation relation) memcpy(part_scheme->parttypbyval, partkey->parttypbyval, sizeof(bool) * partnatts); + part_scheme->partsupfunc = (FmgrInfo *) + palloc(sizeof(FmgrInfo) * partnatts); + for (i = 0; i < partnatts; i++) + fmgr_info_copy(&part_scheme->partsupfunc[i], &partkey->partsupfunc[i], + CurrentMemoryContext); + /* Add the partitioning scheme to PlannerInfo. */ root->part_schemes = lappend(root->part_schemes, part_scheme); diff --git a/src/backend/optimizer/util/relnode.c b/src/backend/optimizer/util/relnode.c index 70acf299..1f6fb286 100644 --- a/src/backend/optimizer/util/relnode.c +++ b/src/backend/optimizer/util/relnode.c @@ -163,9 +163,11 @@ build_simple_rel(PlannerInfo *root, int relid, RelOptInfo *parent) rel->part_scheme = NULL; rel->nparts = 0; rel->boundinfo = NULL; + rel->partition_qual = NIL; rel->part_rels = NULL; rel->partexprs = NULL; rel->nullable_partexprs = NULL; + rel->partitioned_child_rels = NIL; #ifdef __TBASE__ rel->intervalparent = false; rel->isdefault = rte->isdefault; @@ -622,9 +624,11 @@ build_join_rel(PlannerInfo *root, joinrel->part_scheme = NULL; joinrel->nparts = 0; joinrel->boundinfo = NULL; + joinrel->partition_qual = NIL; joinrel->part_rels = NULL; joinrel->partexprs = NULL; joinrel->nullable_partexprs = NULL; + joinrel->partitioned_child_rels = NIL; #ifdef __TBASE__ joinrel->resultRelLoc = RESULT_REL_NONE; #endif @@ -793,9 +797,13 @@ build_child_join_rel(PlannerInfo *root, RelOptInfo *outer_rel, joinrel->has_eclass_joins = false; joinrel->top_parent_relids = NULL; joinrel->part_scheme = NULL; + joinrel->nparts = 0; + joinrel->boundinfo = NULL; + joinrel->partition_qual = NIL; joinrel->part_rels = NULL; joinrel->partexprs = NULL; joinrel->nullable_partexprs = NULL; + joinrel->partitioned_child_rels = NIL; joinrel->top_parent_relids = bms_union(outer_rel->top_parent_relids, inner_rel->top_parent_relids); diff --git a/src/backend/partitioning/Makefile b/src/backend/partitioning/Makefile new file mode 100644 index 00000000..429207c4 --- /dev/null +++ b/src/backend/partitioning/Makefile @@ -0,0 +1,17 @@ +#------------------------------------------------------------------------- +# +# Makefile-- +# Makefile for backend/partitioning +# +# IDENTIFICATION +# src/backend/partitioning/Makefile +# +#------------------------------------------------------------------------- + +subdir = src/backend/partitioning +top_builddir = ../../.. 
+include $(top_builddir)/src/Makefile.global + +OBJS = partprune.o + +include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/partitioning/partprune.c b/src/backend/partitioning/partprune.c new file mode 100644 index 00000000..959ee164 --- /dev/null +++ b/src/backend/partitioning/partprune.c @@ -0,0 +1,2782 @@ +/*------------------------------------------------------------------------- + * + * partprune.c + * Parses clauses attempting to match them up to partition keys of a + * given relation and generates a set of "pruning steps", which can be + * later "executed" either from the planner or the executor to determine + * the minimum set of partitions which match the given clauses. + * + * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/partitioning/partprune.c + * + *------------------------------------------------------------------------- +*/ +#include "postgres.h" + +#include "access/hash.h" +#include "access/nbtree.h" +#include "catalog/pg_operator.h" +#include "catalog/pg_opfamily.h" +#include "catalog/pg_type.h" +#include "miscadmin.h" +#include "nodes/makefuncs.h" +#include "nodes/nodeFuncs.h" +#include "optimizer/clauses.h" +#include "optimizer/planner.h" +#include "optimizer/predtest.h" +#include "optimizer/prep.h" +#include "partitioning/partprune.h" +#include "partitioning/partbounds.h" +#include "rewrite/rewriteManip.h" +#include "utils/lsyscache.h" + + +/* + * Information about a clause matched with a partition key. + */ +typedef struct PartClauseInfo +{ + int keyno; /* Partition key number (0 to partnatts - 1) */ + Oid opno; /* operator used to compare partkey to 'expr' */ + bool op_is_ne; /* is clause's original operator <> ? */ + Expr *expr; /* expr the partition key is compared to */ + Oid cmpfn; /* Oid of function to compare 'expr' to the + * partition key */ + int op_strategy; /* cached info. */ +} PartClauseInfo; + +/* + * PartClauseMatchStatus + * Describes the result match_clause_to_partition_key produces for a + * given clause and the partition key to match with that are passed to it + */ +typedef enum PartClauseMatchStatus +{ + PARTCLAUSE_NOMATCH, + PARTCLAUSE_MATCH_CLAUSE, + PARTCLAUSE_MATCH_NULLNESS, + PARTCLAUSE_MATCH_STEPS, + PARTCLAUSE_MATCH_CONTRADICT, + PARTCLAUSE_UNSUPPORTED +} PartClauseMatchStatus; + +/* + * GeneratePruningStepsContext + * Information about the current state of generation of "pruning steps" + * for a given set of clauses + * + * gen_partprune_steps() initializes an instance of this struct, which is used + * throughout the step generation process. + */ +typedef struct GeneratePruningStepsContext +{ + int next_step_id; + List *steps; +} GeneratePruningStepsContext; + +/* The result of performing one PartitionPruneStep */ +typedef struct PruneStepResult +{ + /* + * The offsets of bounds (in a table's boundinfo) whose partition is + * selected by the pruning step. + */ + Bitmapset *bound_offsets; + + bool scan_default; /* Scan the default partition? */ + bool scan_null; /* Scan the partition for NULL values? 
*/ +} PruneStepResult; + + +static List *gen_partprune_steps_internal(GeneratePruningStepsContext *context, + RelOptInfo *rel, List *clauses, + bool *contradictory); +static PartitionPruneStep *gen_prune_step_op(GeneratePruningStepsContext *context, + StrategyNumber opstrategy, bool op_is_ne, + List *exprs, List *cmpfns, Bitmapset *nullkeys); +static PartitionPruneStep *gen_prune_step_combine(GeneratePruningStepsContext *context, + List *source_stepids, + PartitionPruneCombineOp combineOp); +static PartitionPruneStep *gen_prune_steps_from_opexps(PartitionScheme part_scheme, + GeneratePruningStepsContext *context, + List **keyclauses, Bitmapset *nullkeys); +static PartClauseMatchStatus match_clause_to_partition_key(RelOptInfo *rel, + GeneratePruningStepsContext *context, + Expr *clause, Expr *partkey, int partkeyidx, + bool *clause_is_not_null, + PartClauseInfo **pc, List **clause_steps); +static List *get_steps_using_prefix(GeneratePruningStepsContext *context, + StrategyNumber step_opstrategy, + bool step_op_is_ne, + Expr *step_lastexpr, + Oid step_lastcmpfn, + int step_lastkeyno, + Bitmapset *step_nullkeys, + List *prefix); +static List *get_steps_using_prefix_recurse(GeneratePruningStepsContext *context, + StrategyNumber step_opstrategy, + bool step_op_is_ne, + Expr *step_lastexpr, + Oid step_lastcmpfn, + int step_lastkeyno, + Bitmapset *step_nullkeys, + ListCell *start, + List *step_exprs, + List *step_cmpfns); +static PruneStepResult *get_matching_hash_bounds(PartitionPruneContext *context, + StrategyNumber opstrategy, Datum *values, int nvalues, + FmgrInfo *partsupfunc, Bitmapset *nullkeys); +static PruneStepResult *get_matching_list_bounds(PartitionPruneContext *context, + StrategyNumber opstrategy, Datum value, int nvalues, + FmgrInfo *partsupfunc, Bitmapset *nullkeys); +static PruneStepResult *get_matching_range_bounds(PartitionPruneContext *context, + StrategyNumber opstrategy, Datum *values, int nvalues, + FmgrInfo *partsupfunc, Bitmapset *nullkeys); +static PruneStepResult *perform_pruning_base_step(PartitionPruneContext *context, + PartitionPruneStepOp *opstep); +static PruneStepResult *perform_pruning_combine_step(PartitionPruneContext *context, + PartitionPruneStepCombine *cstep, + PruneStepResult **step_results); +static bool match_boolean_partition_clause(Oid partopfamily, Expr *clause, + Expr *partkey, Expr **outconst); +static bool partkey_datum_from_expr(PartitionPruneContext *context, + Expr *expr, Datum *value); + + +/* + * gen_partprune_steps + * Process 'clauses' (a rel's baserestrictinfo list of clauses) and return + * a list of "partition pruning steps" + * + * If the clauses in the input list are contradictory or there is a + * pseudo-constant "false", *contradictory is set to true upon return. + */ +List * +gen_partprune_steps(RelOptInfo *rel, List *clauses, bool *contradictory) +{ + GeneratePruningStepsContext context; + + context.next_step_id = 0; + context.steps = NIL; + + /* The clauses list may be modified below, so better make a copy. */ + clauses = list_copy(clauses); + + /* + * For sub-partitioned tables there's a corner case where if the + * sub-partitioned table shares any partition keys with its parent, then + * it's possible that the partitioning hierarchy allows the parent + * partition to only contain a narrower range of values than the + * sub-partitioned table does. In this case it is possible that we'd + * include partitions that could not possibly have any tuples matching + * 'clauses'. 
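The reason the parent's own partition constraint is folded into the clause list is that a default sub-partition can only ever receive values the parent accepts; ANDing that constraint with the query's clauses may therefore prove the default partition empty for this query. A toy standalone illustration of that reasoning, with made-up ranges and helper names (this is not PostgreSQL code): the parent accepts key in [0, 100), the query asks for key = 150, so even the default sub-partition cannot contain a match.

#include <stdio.h>
#include <stdbool.h>

/* Hypothetical sub-partitioned parent whose own partition constraint is
 * key >= 0 AND key < 100; the query clause is key = 150. */
static bool
parent_qual(int key)  { return key >= 0 && key < 100; }

static bool
query_clause(int key) { return key == 150; }

int
main(void)
{
    /* Combining the two as the step generation does: a value that could
     * land in the default sub-partition must satisfy both, and here no
     * value does, so the default partition can be pruned. */
    bool possible = false;
    int key;

    for (key = -10; key < 200; key++)
        if (parent_qual(key) && query_clause(key))
            possible = true;

    printf("default partition can contain a match: %s\n",
           possible ? "yes" : "no");
    return 0;
}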
The possibility of such a partition arrangement is perhaps + * unlikely for non-default partitions, but it may be more likely in the + * case of default partitions, so we'll add the parent partition table's + * partition qual to the clause list in this case only. This may result + * in the default partition being eliminated. + */ + if (partition_bound_has_default(rel->boundinfo) && + rel->partition_qual != NIL) + { + List *partqual = rel->partition_qual; + + partqual = (List *) expression_planner((Expr *) partqual); + + /* Fix Vars to have the desired varno */ + if (rel->relid != 1) + ChangeVarNodes((Node *) partqual, 1, rel->relid, 0); + + clauses = list_concat(clauses, partqual); + } + + /* Down into the rabbit-hole. */ + gen_partprune_steps_internal(&context, rel, clauses, contradictory); + + return context.steps; +} + +/* + * prune_append_rel_partitions + * Returns RT indexes of the minimum set of child partitions which must + * be scanned to satisfy rel's baserestrictinfo quals. + * + * Callers must ensure that 'rel' is a partitioned table. + */ +Relids +prune_append_rel_partitions(RelOptInfo *rel) +{ + Relids result; + List *clauses = rel->baserestrictinfo; + List *pruning_steps; + bool contradictory; + PartitionPruneContext context; + Bitmapset *partindexes; + int i; + + Assert(clauses != NIL); + Assert(rel->part_scheme != NULL); + + /* If there are no partitions, return the empty set */ + if (rel->nparts == 0) + return NULL; + + /* + * Process clauses. If the clauses are found to be contradictory, we can + * return the empty set. + */ + pruning_steps = gen_partprune_steps(rel, clauses, &contradictory); + if (contradictory) + return NULL; + + context.strategy = rel->part_scheme->strategy; + context.partnatts = rel->part_scheme->partnatts; + context.partopfamily = rel->part_scheme->partopfamily; + context.partopcintype = rel->part_scheme->partopcintype; + context.partcollation = rel->part_scheme->partcollation; + context.partsupfunc = rel->part_scheme->partsupfunc; + context.nparts = rel->nparts; + context.boundinfo = rel->boundinfo; + + /* Actual pruning happens here. */ + partindexes = get_matching_partitions(&context, pruning_steps); + + /* Add selected partitions' RT indexes to result. */ + i = -1; + result = NULL; + while ((i = bms_next_member(partindexes, i)) >= 0) + result = bms_add_member(result, rel->part_rels[i]->relid); + + return result; +} + +/* + * get_matching_partitions + * Determine partitions that survive partition pruning + * + * Returns a Bitmapset of indexes of surviving partitions. + */ +Bitmapset * +get_matching_partitions(PartitionPruneContext *context, List *pruning_steps) +{ + Bitmapset *result; + int num_steps = list_length(pruning_steps), + i; + PruneStepResult **results, + *final_result; + ListCell *lc; + + /* If there are no pruning steps then all partitions match. */ + if (num_steps == 0) + return bms_add_range(NULL, 0, context->nparts - 1); + + /* + * Allocate space for individual pruning steps to store its result. Each + * slot will hold a PruneStepResult after performing a given pruning step. + * Later steps may use the result of one or more earlier steps. The + * result of applying all pruning steps is the value contained in the slot + * of the last pruning step. 
+ */ + results = (PruneStepResult **) + palloc0(num_steps * sizeof(PruneStepResult *)); + foreach(lc, pruning_steps) + { + PartitionPruneStep *step = lfirst(lc); + + switch (nodeTag(step)) + { + case T_PartitionPruneStepOp: + results[step->step_id] = + perform_pruning_base_step(context, + (PartitionPruneStepOp *) step); + break; + + case T_PartitionPruneStepCombine: + results[step->step_id] = + perform_pruning_combine_step(context, + (PartitionPruneStepCombine *) step, + results); + break; + + default: + elog(ERROR, "invalid pruning step type: %d", + (int) nodeTag(step)); + } + } + + /* + * At this point we know the offsets of all the datums whose corresponding + * partitions need to be in the result, including special null-accepting + * and default partitions. Collect the actual partition indexes now. + */ + final_result = results[num_steps - 1]; + Assert(final_result != NULL); + i = -1; + result = NULL; + while ((i = bms_next_member(final_result->bound_offsets, i)) >= 0) + { + int partindex = context->boundinfo->indexes[i]; + + /* + * In range and hash partitioning cases, some slots may contain -1, + * indicating that no partition has been defined to accept a given + * range of data or for a given remainder, respectively. The default + * partition, if any, in case of range partitioning, will be added to + * the result, because the specified range still satisfies the query's + * conditions. + */ + if (partindex >= 0) + result = bms_add_member(result, partindex); + } + + /* Add the null and/or default partition if needed and if present. */ + if (final_result->scan_null) + { + Assert(context->strategy == PARTITION_STRATEGY_LIST); + Assert(partition_bound_accepts_nulls(context->boundinfo)); + result = bms_add_member(result, context->boundinfo->null_index); + } + if (final_result->scan_default) + { + Assert(context->strategy == PARTITION_STRATEGY_LIST || + context->strategy == PARTITION_STRATEGY_RANGE); + Assert(partition_bound_has_default(context->boundinfo)); + result = bms_add_member(result, context->boundinfo->default_index); + } + + return result; +} + +/* + * gen_partprune_steps_internal + * Processes 'clauses' to generate partition pruning steps. + * + * From OpExpr clauses that are mutually AND'd, we find combinations of those + * that match to the partition key columns and for every such combination, + * we emit a PartitionPruneStepOp containing a vector of expressions whose + * values are used as a look up key to search partitions by comparing the + * values with partition bounds. Relevant details of the operator and a + * vector of (possibly cross-type) comparison functions is also included with + * each step. + * + * For BoolExpr clauses, we recursively generate steps for each argument, and + * return a PartitionPruneStepCombine of their results. + * + * The generated steps are added to the context's steps list. Each step is + * assigned a step identifier, unique even across recursive calls. + * + * If we find clauses that are mutually contradictory, or a pseudoconstant + * clause that contains false, we set *contradictory to true and return NIL + * (that is, no pruning steps). Caller should consider all partitions as + * pruned in that case. Otherwise, *contradictory is set to false. + * + * Note: the 'clauses' List may be modified inside this function. Callers may + * like to make a copy of it before passing them to this function. 
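Step execution is a small interpreter: each op step yields a set of bound offsets, each combine step folds the results of earlier steps with union or intersection, and the slot of the last step holds the final answer. The standalone model below mirrors that control flow with bitmasks standing in for the partition sets; the plan, the struct, and the names are all made up, and note how a union over zero sources selects every partition, which is how the dummy combine step emitted for an unprunable OR arm behaves.

#include <stdio.h>
#include <stdint.h>

#define ALL_PARTS 0xFFu        /* pretend there are 8 partitions */

typedef enum { STEP_OP, STEP_COMBINE_UNION, STEP_COMBINE_INTERSECT } StepKind;

typedef struct
{
    StepKind kind;
    uint32_t op_result;        /* STEP_OP only: offsets selected by the step */
    int      nsources;
    int      sources[4];       /* combine steps: ids of earlier steps */
} Step;

int
main(void)
{
    /*
     * Made-up plan for "a = 1 OR b > 10":
     *   step 0: op step for a = 1           -> {partition 2}
     *   step 1: union of no sources         -> all partitions (the arm on b
     *           matched no partition key, so it cannot prune anything)
     *   step 2: union of steps 0 and 1      -> final result
     */
    Step steps[] = {
        { STEP_OP,            (1u << 2), 0, {0} },
        { STEP_COMBINE_UNION, 0,         0, {0} },
        { STEP_COMBINE_UNION, 0,         2, {0, 1} },
    };
    uint32_t results[3];
    int nsteps = 3, i, j;

    for (i = 0; i < nsteps; i++)
    {
        switch (steps[i].kind)
        {
            case STEP_OP:
                results[i] = steps[i].op_result;
                break;
            case STEP_COMBINE_UNION:
                /* A union over no sources selects every partition. */
                results[i] = steps[i].nsources ? 0 : ALL_PARTS;
                for (j = 0; j < steps[i].nsources; j++)
                    results[i] |= results[steps[i].sources[j]];
                break;
            case STEP_COMBINE_INTERSECT:
                results[i] = ALL_PARTS;
                for (j = 0; j < steps[i].nsources; j++)
                    results[i] &= results[steps[i].sources[j]];
                break;
        }
    }

    /* The answer is whatever the last step produced. */
    printf("surviving partitions bitmask: 0x%02x\n", results[nsteps - 1]);
    return 0;
}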
+ */ +static List * +gen_partprune_steps_internal(GeneratePruningStepsContext *context, + RelOptInfo *rel, List *clauses, + bool *contradictory) +{ + PartitionScheme part_scheme = rel->part_scheme; + List *keyclauses[PARTITION_MAX_KEYS]; + Bitmapset *nullkeys = NULL, + *notnullkeys = NULL; + bool generate_opsteps = false; + List *result = NIL; + ListCell *lc; + + *contradictory = false; + + memset(keyclauses, 0, sizeof(keyclauses)); + foreach(lc, clauses) + { + Expr *clause = (Expr *) lfirst(lc); + int i; + + if (IsA(clause, RestrictInfo)) + { + RestrictInfo *rinfo = (RestrictInfo *) clause; + + clause = rinfo->clause; + if (rinfo->pseudoconstant && + IsA(rinfo->clause, Const) && + !DatumGetBool(((Const *) clause)->constvalue)) + { + *contradictory = true; + return NIL; + } + } + + /* Get the BoolExpr's out of the way. */ + if (IsA(clause, BoolExpr)) + { + /* + * Generate steps for arguments. + * + * While steps generated for the arguments themselves will be + * added to context->steps during recursion and will be evaluated + * independently, collect their step IDs to be stored in the + * combine step we'll be creating. + */ + if (or_clause((Node *) clause)) + { + List *arg_stepids = NIL; + bool all_args_contradictory = true; + ListCell *lc1; + + /* + * Get pruning step for each arg. If we get contradictory for + * all args, it means the OR expression is false as a whole. + */ + foreach(lc1, ((BoolExpr *) clause)->args) + { + Expr *arg = lfirst(lc1); + bool arg_contradictory; + List *argsteps; + + argsteps = + gen_partprune_steps_internal(context, rel, + list_make1(arg), + &arg_contradictory); + if (!arg_contradictory) + all_args_contradictory = false; + + if (argsteps != NIL) + { + PartitionPruneStep *step; + + Assert(list_length(argsteps) == 1); + step = (PartitionPruneStep *) linitial(argsteps); + arg_stepids = lappend_int(arg_stepids, step->step_id); + } + else + { + /* + * No steps either means that arg_contradictory is + * true or the arg didn't contain a clause matching + * this partition key. + * + * In case of the latter, we cannot prune using such + * an arg. To indicate that to the pruning code, we + * must construct a dummy PartitionPruneStepCombine + * whose source_stepids is set to an empty List. + * However, if we can prove using constraint exclusion + * that the clause refutes the table's partition + * constraint (if it's sub-partitioned), we need not + * bother with that. That is, we effectively ignore + * this OR arm. + */ + List *partconstr = rel->partition_qual; + PartitionPruneStep *orstep; + + /* Just ignore this argument. 
*/ + if (arg_contradictory) + continue; + + if (partconstr) + { + partconstr = (List *) + expression_planner((Expr *) partconstr); + if (rel->relid != 1) + ChangeVarNodes((Node *) partconstr, 1, + rel->relid, 0); + if (predicate_refuted_by(partconstr, + list_make1(arg), + false)) + continue; + } + + orstep = gen_prune_step_combine(context, NIL, + PARTPRUNE_COMBINE_UNION); + arg_stepids = lappend_int(arg_stepids, orstep->step_id); + } + } + + *contradictory = all_args_contradictory; + + /* Check if any contradicting clauses were found */ + if (*contradictory) + return NIL; + + if (arg_stepids != NIL) + { + PartitionPruneStep *step; + + step = gen_prune_step_combine(context, arg_stepids, + PARTPRUNE_COMBINE_UNION); + result = lappend(result, step); + } + continue; + } + else if (and_clause((Node *) clause)) + { + List *args = ((BoolExpr *) clause)->args; + List *argsteps, + *arg_stepids = NIL; + ListCell *lc1; + + /* + * args may itself contain clauses of arbitrary type, so just + * recurse and later combine the component partitions sets + * using a combine step. + */ + argsteps = gen_partprune_steps_internal(context, rel, args, + contradictory); + if (*contradictory) + return NIL; + + foreach(lc1, argsteps) + { + PartitionPruneStep *step = lfirst(lc1); + + arg_stepids = lappend_int(arg_stepids, step->step_id); + } + + if (arg_stepids != NIL) + { + PartitionPruneStep *step; + + step = gen_prune_step_combine(context, arg_stepids, + PARTPRUNE_COMBINE_INTERSECT); + result = lappend(result, step); + } + continue; + } + + /* + * Fall-through for a NOT clause, which if it's a Boolean clause, + * will be handled in match_clause_to_partition_key(). We + * currently don't perform any pruning for more complex NOT + * clauses. + */ + } + + /* + * Must be a clause for which we can check if one of its args matches + * the partition key. + */ + for (i = 0; i < part_scheme->partnatts; i++) + { + Expr *partkey = linitial(rel->partexprs[i]); + bool clause_is_not_null = false; + PartClauseInfo *pc = NULL; + List *clause_steps = NIL; + + switch (match_clause_to_partition_key(rel, context, + clause, partkey, i, + &clause_is_not_null, + &pc, &clause_steps)) + { + case PARTCLAUSE_MATCH_CLAUSE: + Assert(pc != NULL); + + /* + * Since we only allow strict operators, check for any + * contradicting IS NULL. + */ + if (bms_is_member(i, nullkeys)) + { + *contradictory = true; + return NIL; + } + generate_opsteps = true; + keyclauses[i] = lappend(keyclauses[i], pc); + break; + + case PARTCLAUSE_MATCH_NULLNESS: + if (!clause_is_not_null) + { + /* check for conflicting IS NOT NULL */ + if (bms_is_member(i, notnullkeys)) + { + *contradictory = true; + return NIL; + } + nullkeys = bms_add_member(nullkeys, i); + } + else + { + /* check for conflicting IS NULL */ + if (bms_is_member(i, nullkeys)) + { + *contradictory = true; + return NIL; + } + notnullkeys = bms_add_member(notnullkeys, i); + } + break; + + case PARTCLAUSE_MATCH_STEPS: + Assert(clause_steps != NIL); + result = list_concat(result, clause_steps); + break; + + case PARTCLAUSE_MATCH_CONTRADICT: + /* We've nothing more to do if a contradiction was found. */ + *contradictory = true; + return NIL; + + case PARTCLAUSE_NOMATCH: + + /* + * Clause didn't match this key, but it might match the + * next one. + */ + continue; + + case PARTCLAUSE_UNSUPPORTED: + /* This clause cannot be used for pruning. */ + break; + + default: + Assert(false); + break; + } + + /* done; go check the next clause. 
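Because only strict operators are accepted, a key constrained by IS NULL cannot also satisfy a comparison or an IS NOT NULL clause, so the loop keeps two sets and reports a contradiction, which callers treat as "no partition can match". A small standalone model of that bookkeeping, with bitmasks standing in for the Bitmapsets and a made-up clause list:

#include <stdio.h>
#include <stdint.h>

typedef enum { CL_OPEXPR, CL_IS_NULL, CL_IS_NOT_NULL } ClauseKind;

typedef struct { int keyno; ClauseKind kind; } KeyClause;

int
main(void)
{
    /* Hypothetical matched clauses: key0 IS NULL AND key0 = 5. */
    KeyClause clauses[] = { {0, CL_IS_NULL}, {0, CL_OPEXPR} };
    uint32_t nullkeys = 0, notnullkeys = 0;
    int contradictory = 0, i;

    for (i = 0; i < 2 && !contradictory; i++)
    {
        KeyClause *c = &clauses[i];

        switch (c->kind)
        {
            case CL_OPEXPR:
                /* A strict comparison on a key already known to be NULL
                 * can never be true. */
                if (nullkeys & (1u << c->keyno))
                    contradictory = 1;
                break;
            case CL_IS_NULL:
                if (notnullkeys & (1u << c->keyno))
                    contradictory = 1;
                else
                    nullkeys |= (1u << c->keyno);
                break;
            case CL_IS_NOT_NULL:
                if (nullkeys & (1u << c->keyno))
                    contradictory = 1;
                else
                    notnullkeys |= (1u << c->keyno);
                break;
        }
    }

    printf("contradictory: %s\n", contradictory ? "yes" : "no");
    return 0;
}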
*/ + break; + } + } + + /* + * If generate_opsteps is set to false it means no OpExprs were directly + * present in the input list. + */ + if (!generate_opsteps) + { + /* + * Generate one prune step for the information derived from IS NULL, + * if any. To prune hash partitions, we must have found IS NULL + * clauses for all partition keys. + */ + if (!bms_is_empty(nullkeys) && + (part_scheme->strategy != PARTITION_STRATEGY_HASH || + bms_num_members(nullkeys) == part_scheme->partnatts)) + { + PartitionPruneStep *step; + + step = gen_prune_step_op(context, InvalidStrategy, + false, NIL, NIL, nullkeys); + result = lappend(result, step); + } + + /* + * Note that for IS NOT NULL clauses, simply having step suffices; + * there is no need to propagate the exact details of which keys are + * required to be NOT NULL. Hash partitioning expects to see actual + * values to perform any pruning. + */ + if (!bms_is_empty(notnullkeys) && + part_scheme->strategy != PARTITION_STRATEGY_HASH) + { + PartitionPruneStep *step; + + step = gen_prune_step_op(context, InvalidStrategy, + false, NIL, NIL, NULL); + result = lappend(result, step); + } + } + else + { + PartitionPruneStep *step; + + /* Generate pruning steps from OpExpr clauses in keyclauses. */ + step = gen_prune_steps_from_opexps(part_scheme, context, + keyclauses, nullkeys); + if (step != NULL) + result = lappend(result, step); + } + + /* + * Finally, results from all entries appearing in result should be + * combined using an INTERSECT combine step, if more than one. + */ + if (list_length(result) > 1) + { + List *step_ids = NIL; + + foreach(lc, result) + { + PartitionPruneStep *step = lfirst(lc); + + step_ids = lappend_int(step_ids, step->step_id); + } + + if (step_ids != NIL) + { + PartitionPruneStep *step; + + step = gen_prune_step_combine(context, step_ids, + PARTPRUNE_COMBINE_INTERSECT); + result = lappend(result, step); + } + } + + return result; +} + +/* + * gen_prune_step_op + * Generate a pruning step for a specific operator + * + * The step is assigned a unique step identifier and added to context's 'steps' + * list. + */ +static PartitionPruneStep * +gen_prune_step_op(GeneratePruningStepsContext *context, + StrategyNumber opstrategy, bool op_is_ne, + List *exprs, List *cmpfns, + Bitmapset *nullkeys) +{ + PartitionPruneStepOp *opstep = makeNode(PartitionPruneStepOp); + + opstep->step.step_id = context->next_step_id++; + + /* + * For clauses that contain an <> operator, set opstrategy to + * InvalidStrategy to signal get_matching_list_bounds to do the right + * thing. + */ + if (op_is_ne) + { + Assert(opstrategy == BTEqualStrategyNumber); + opstep->opstrategy = InvalidStrategy; + } + else + opstep->opstrategy = opstrategy; + Assert(list_length(exprs) == list_length(cmpfns)); + opstep->exprs = exprs; + opstep->cmpfns = cmpfns; + opstep->nullkeys = nullkeys; + + context->steps = lappend(context->steps, opstep); + + return (PartitionPruneStep *) opstep; +} + +/* + * gen_prune_step_combine + * Generate a pruning step for a combination of several other steps + * + * The step is assigned a unique step identifier and added to context's + * 'steps' list. 
+ */ +static PartitionPruneStep * +gen_prune_step_combine(GeneratePruningStepsContext *context, + List *source_stepids, + PartitionPruneCombineOp combineOp) +{ + PartitionPruneStepCombine *cstep = makeNode(PartitionPruneStepCombine); + + cstep->step.step_id = context->next_step_id++; + cstep->combineOp = combineOp; + cstep->source_stepids = source_stepids; + + context->steps = lappend(context->steps, cstep); + + return (PartitionPruneStep *) cstep; +} + +/* + * gen_prune_steps_from_opexps + * Generate pruning steps based on clauses for partition keys + * + * 'keyclauses' contains one list of clauses per partition key. We check here + * if we have found clauses for a valid subset of the partition key. In some + * cases, (depending on the type of partitioning being used) if we didn't + * find clauses for a given key, we discard clauses that may have been + * found for any subsequent keys; see specific notes below. + */ +static PartitionPruneStep * +gen_prune_steps_from_opexps(PartitionScheme part_scheme, + GeneratePruningStepsContext *context, + List **keyclauses, Bitmapset *nullkeys) +{ + ListCell *lc; + List *opsteps = NIL; + List *btree_clauses[BTMaxStrategyNumber + 1], + *hash_clauses[HTMaxStrategyNumber + 1]; + bool need_next_less, + need_next_eq, + need_next_greater; + int i; + + memset(btree_clauses, 0, sizeof(btree_clauses)); + memset(hash_clauses, 0, sizeof(hash_clauses)); + for (i = 0; i < part_scheme->partnatts; i++) + { + List *clauselist = keyclauses[i]; + bool consider_next_key = true; + + /* + * To be useful for pruning, we must have clauses for a prefix of + * partition keys in the case of range partitioning. So, ignore + * clauses for keys after this one. + */ + if (part_scheme->strategy == PARTITION_STRATEGY_RANGE && + clauselist == NIL) + break; + + /* + * For hash partitioning, if a column doesn't have the necessary + * equality clause, there should be an IS NULL clause, otherwise + * pruning is not possible. + */ + if (part_scheme->strategy == PARTITION_STRATEGY_HASH && + clauselist == NIL && !bms_is_member(i, nullkeys)) + return NULL; + + need_next_eq = need_next_less = need_next_greater = true; + foreach(lc, clauselist) + { + PartClauseInfo *pc = (PartClauseInfo *) lfirst(lc); + Oid lefttype, + righttype; + + /* Look up the operator's btree/hash strategy number. */ + if (pc->op_strategy == InvalidStrategy) + get_op_opfamily_properties(pc->opno, + part_scheme->partopfamily[i], + false, + &pc->op_strategy, + &lefttype, + &righttype); + + switch (part_scheme->strategy) + { + case PARTITION_STRATEGY_LIST: + case PARTITION_STRATEGY_RANGE: + { + PartClauseInfo *last = NULL; + bool inclusive = false; + + /* + * Add this clause to the list of clauses to be used + * for pruning if this is the first such key for this + * operator strategy or if it is consecutively next to + * the last column for which a clause with this + * operator strategy was matched. + */ + if (btree_clauses[pc->op_strategy] != NIL) + last = llast(btree_clauses[pc->op_strategy]); + + if (last == NULL || + i == last->keyno || i == last->keyno + 1) + btree_clauses[pc->op_strategy] = + lappend(btree_clauses[pc->op_strategy], pc); + + /* + * We may not need the next clause if they're of + * certain strategy. + */ + switch (pc->op_strategy) + { + case BTLessEqualStrategyNumber: + inclusive = true; + /* fall through */ + case BTLessStrategyNumber: + if (!inclusive) + need_next_eq = need_next_less = false; + break; + case BTEqualStrategyNumber: + /* always accept clauses for the next key. 
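The reason clauses after a missing range key are ignored is that range bounds are ordered lexicographically: a restriction on a later key alone says nothing about where matching rows fall in bound order, whereas an equality or inequality on the leading key does. The short standalone example below makes that concrete with made-up two-key partitions whose lower bounds differ only on the leading key; it is an illustration, not the real bound-search code.

#include <stdio.h>
#include <stdbool.h>

/*
 * Hypothetical range partitions on keys (a, b) with lower bounds
 * (0,0), (10,0), (20,0); partition i spans [lower[i], lower[i+1]).
 */
static const int lower_a[] = {0, 10, 20};
#define NPARTS 3

int
main(void)
{
    int i;

    /* "a = 10": only the partition whose a-range covers 10 survives. */
    for (i = 0; i < NPARTS; i++)
    {
        int lo = lower_a[i];
        int hi = (i + 1 < NPARTS) ? lower_a[i + 1] : 1000000;
        bool can_match = (lo <= 10 && 10 < hi);

        printf("a = 10      : partition %d %s\n", i,
               can_match ? "kept" : "pruned");
    }

    /* "b < 60" alone: rows with any value of a can have b < 60, so no
     * partition can be pruned -- which is why clauses for keys after a
     * missing leading range key are discarded. */
    for (i = 0; i < NPARTS; i++)
        printf("b < 60 only : partition %d kept\n", i);

    return 0;
}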
*/ + break; + case BTGreaterEqualStrategyNumber: + inclusive = true; + /* fall through */ + case BTGreaterStrategyNumber: + if (!inclusive) + need_next_eq = need_next_greater = false; + break; + } + + /* We may want to change our mind. */ + if (consider_next_key) + consider_next_key = (need_next_eq || + need_next_less || + need_next_greater); + break; + } + + case PARTITION_STRATEGY_HASH: + if (pc->op_strategy != HTEqualStrategyNumber) + elog(ERROR, "invalid clause for hash partitioning"); + hash_clauses[pc->op_strategy] = + lappend(hash_clauses[pc->op_strategy], pc); + break; + + default: + elog(ERROR, "invalid partition strategy: %c", + part_scheme->strategy); + break; + } + } + + /* + * If we've decided that clauses for subsequent partition keys + * wouldn't be useful for pruning, don't search any further. + */ + if (!consider_next_key) + break; + } + + /* + * Now, we have divided clauses according to their operator strategies. + * Check for each strategy if we can generate pruning step(s) by + * collecting a list of expressions whose values will constitute a vector + * that can be used as a lookup key by a partition bound searching + * function. + */ + switch (part_scheme->strategy) + { + case PARTITION_STRATEGY_LIST: + case PARTITION_STRATEGY_RANGE: + { + List *eq_clauses = btree_clauses[BTEqualStrategyNumber]; + List *le_clauses = btree_clauses[BTLessEqualStrategyNumber]; + List *ge_clauses = btree_clauses[BTGreaterEqualStrategyNumber]; + int strat; + + /* + * For each clause under consideration for a given strategy, + * we collect expressions from clauses for earlier keys, whose + * operator strategy is inclusive, into a list called + * 'prefix'. By appending the clause's own expression to the + * 'prefix', we'll generate one step using the so generated + * vector and assign the current strategy to it. Actually, + * 'prefix' might contain multiple clauses for the same key, + * in which case, we must generate steps for various + * combinations of expressions of different keys, which + * get_steps_using_prefix takes care of for us. + */ + for (strat = 1; strat <= BTMaxStrategyNumber; strat++) + { + foreach(lc, btree_clauses[strat]) + { + PartClauseInfo *pc = lfirst(lc); + ListCell *lc1; + List *prefix = NIL; + List *pc_steps; + + /* + * Expressions from = clauses can always be in the + * prefix, provided they're from an earlier key. + */ + foreach(lc1, eq_clauses) + { + PartClauseInfo *eqpc = lfirst(lc1); + + if (eqpc->keyno == pc->keyno) + break; + if (eqpc->keyno < pc->keyno) + prefix = lappend(prefix, eqpc); + } + + /* + * If we're generating steps for keyno == pc->keyno) + break; + if (lepc->keyno < pc->keyno) + prefix = lappend(prefix, lepc); + } + } + + /* + * If we're generating steps for >/>= strategy, we can + * add other >= clauses to the prefix, provided + * they're from an earlier key. + */ + if (strat == BTGreaterStrategyNumber || + strat == BTGreaterEqualStrategyNumber) + { + foreach(lc1, ge_clauses) + { + PartClauseInfo *gepc = lfirst(lc1); + + if (gepc->keyno == pc->keyno) + break; + if (gepc->keyno < pc->keyno) + prefix = lappend(prefix, gepc); + } + } + + /* + * As mentioned above, if 'prefix' contains multiple + * expressions for the same key, the following will + * generate multiple steps, one for each combination + * of the expressions for different keys. + * + * Note that we pass NULL for step_nullkeys, because + * we don't search list/range partition bounds where + * some keys are NULL. 
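When the prefix holds more than one usable expression for the same earlier key, one pruning step is emitted per combination of choices across the keys, each producing its own lookup vector. A tiny standalone sketch of that cross-product expansion follows; the expression strings are purely illustrative stand-ins for the clause expressions.

#include <stdio.h>

int
main(void)
{
    /* Hypothetical prefix: two candidate expressions each for key 0 and
     * key 1, plus the single expression of the clause being processed
     * for key 2.  One step is emitted per combination. */
    const char *key0_exprs[] = {"1", "x"};
    const char *key1_exprs[] = {"y", "z + 1"};
    const char *key2_expr    = "10";
    int i, j, step_id = 0;

    for (i = 0; i < 2; i++)
        for (j = 0; j < 2; j++)
            printf("step %d: lookup key = (%s, %s, %s)\n",
                   step_id++, key0_exprs[i], key1_exprs[j], key2_expr);
    return 0;
}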
+ */ + Assert(pc->op_strategy == strat); + pc_steps = get_steps_using_prefix(context, strat, + pc->op_is_ne, + pc->expr, + pc->cmpfn, + pc->keyno, + NULL, + prefix); + opsteps = list_concat(opsteps, list_copy(pc_steps)); + } + } + break; + } + + case PARTITION_STRATEGY_HASH: + { + List *eq_clauses = hash_clauses[HTEqualStrategyNumber]; + + /* For hash partitioning, we have just the = strategy. */ + if (eq_clauses != NIL) + { + PartClauseInfo *pc; + List *pc_steps; + List *prefix = NIL; + int last_keyno; + ListCell *lc1; + + /* + * Locate the clause for the greatest column. This may + * not belong to the last partition key, but it is the + * clause belonging to the last partition key we found a + * clause for above. + */ + pc = llast(eq_clauses); + + /* + * There might be multiple clauses which matched to that + * partition key; find the first such clause. While at + * it, add all the clauses before that one to 'prefix'. + */ + last_keyno = pc->keyno; + foreach(lc, eq_clauses) + { + pc = lfirst(lc); + if (pc->keyno == last_keyno) + break; + prefix = lappend(prefix, pc); + } + + /* + * For each clause for the "last" column, after appending + * the clause's own expression to the 'prefix', we'll + * generate one step using the so generated vector and and + * assign = as its strategy. Actually, 'prefix' might + * contain multiple clauses for the same key, in which + * case, we must generate steps for various combinations + * of expressions of different keys, which + * get_steps_using_prefix will take care of for us. + */ + for_each_cell(lc1, lc) + { + pc = lfirst(lc1); + + /* + * Note that we pass nullkeys for step_nullkeys, + * because we need to tell hash partition bound search + * function which of the keys we found IS NULL clauses + * for. + */ + Assert(pc->op_strategy == HTEqualStrategyNumber); + pc_steps = + get_steps_using_prefix(context, + HTEqualStrategyNumber, + false, + pc->expr, + pc->cmpfn, + pc->keyno, + nullkeys, + prefix); + opsteps = list_concat(opsteps, list_copy(pc_steps)); + } + } + break; + } + + default: + elog(ERROR, "invalid partition strategy: %c", + part_scheme->strategy); + break; + } + + /* Lastly, add a combine step to mutually AND these op steps, if needed */ + if (list_length(opsteps) > 1) + { + List *opstep_ids = NIL; + + foreach(lc, opsteps) + { + PartitionPruneStep *step = lfirst(lc); + + opstep_ids = lappend_int(opstep_ids, step->step_id); + } + + if (opstep_ids != NIL) + return gen_prune_step_combine(context, opstep_ids, + PARTPRUNE_COMBINE_INTERSECT); + return NULL; + } + else if (opsteps != NIL) + return linitial(opsteps); + + return NULL; +} + +/* + * If the partition key has a collation, then the clause must have the same + * input collation. If the partition key is non-collatable, we assume the + * collation doesn't matter, because while collation wasn't considered when + * performing partitioning, the clause still may have a collation assigned + * due to the other input being of a collatable type. + * + * See also IndexCollMatchesExprColl. + */ +#define PartCollMatchesExprColl(partcoll, exprcoll) \ + ((partcoll) == InvalidOid || (partcoll) == (exprcoll)) + +/* + * match_clause_to_partition_key + * Attempt to match the given 'clause' with the specified partition key. + * + * Return value is: + * * PARTCLAUSE_NOMATCH if the clause doesn't match this partition key (but + * caller should keep trying, because it might match a subsequent key). + * Output arguments: none set. + * + * * PARTCLAUSE_MATCH_CLAUSE if there is a match. 
+ * Output arguments: *pc is set to a PartClauseInfo constructed for the + * matched clause. + * + * * PARTCLAUSE_MATCH_NULLNESS if there is a match, and the matched clause was + * either a "a IS NULL" or "a IS NOT NULL" clause. + * Output arguments: *clause_is_not_null is set to false in the former case + * true otherwise. + * + * * PARTCLAUSE_MATCH_STEPS if there is a match. + * Output arguments: *clause_steps is set to a list of PartitionPruneStep + * generated for the clause. + * + * * PARTCLAUSE_MATCH_CONTRADICT if the clause is self-contradictory. This can + * only happen if it's a BoolExpr whose arguments are self-contradictory. + * Output arguments: none set. + * + * * PARTCLAUSE_UNSUPPORTED if the clause cannot be used for pruning at all + * due to one of its properties, such as argument volatility, even if it may + * have been matched with a key. + * Output arguments: none set. + */ +static PartClauseMatchStatus +match_clause_to_partition_key(RelOptInfo *rel, + GeneratePruningStepsContext *context, + Expr *clause, Expr *partkey, int partkeyidx, + bool *clause_is_not_null, PartClauseInfo **pc, + List **clause_steps) +{ + PartitionScheme part_scheme = rel->part_scheme; + Expr *expr; + Oid partopfamily = part_scheme->partopfamily[partkeyidx], + partcoll = part_scheme->partcollation[partkeyidx]; + + /* + * Recognize specially shaped clauses that match with the Boolean + * partition key. + */ + if (match_boolean_partition_clause(partopfamily, clause, partkey, &expr)) + { + PartClauseInfo *partclause; + + partclause = (PartClauseInfo *) palloc(sizeof(PartClauseInfo)); + partclause->keyno = partkeyidx; + /* Do pruning with the Boolean equality operator. */ + partclause->opno = BooleanEqualOperator; + partclause->op_is_ne = false; + partclause->expr = expr; + /* We know that expr is of Boolean type. */ + partclause->cmpfn = rel->part_scheme->partsupfunc[partkeyidx].fn_oid; + partclause->op_strategy = InvalidStrategy; + + *pc = partclause; + + return PARTCLAUSE_MATCH_CLAUSE; + } + else if (IsA(clause, OpExpr) && + list_length(((OpExpr *) clause)->args) == 2) + { + OpExpr *opclause = (OpExpr *) clause; + Expr *leftop, + *rightop; + Oid commutator = InvalidOid, + negator = InvalidOid; + Oid cmpfn; + Oid exprtype; + bool is_opne_listp = false; + PartClauseInfo *partclause; + + leftop = (Expr *) get_leftop(clause); + if (IsA(leftop, RelabelType)) + leftop = ((RelabelType *) leftop)->arg; + rightop = (Expr *) get_rightop(clause); + if (IsA(rightop, RelabelType)) + rightop = ((RelabelType *) rightop)->arg; + + /* check if the clause matches this partition key */ + if (equal(leftop, partkey)) + expr = rightop; + else if (equal(rightop, partkey)) + { + expr = leftop; + commutator = get_commutator(opclause->opno); + + /* nothing we can do unless we can swap the operands */ + if (!OidIsValid(commutator)) + return PARTCLAUSE_UNSUPPORTED; + } + else + /* clause does not match this partition key, but perhaps next. */ + return PARTCLAUSE_NOMATCH; + + /* + * Partition key also consists of a collation that's specified for it, + * so try to match it too. There may be multiple keys with the same + * expression but different collations. + */ + if (!PartCollMatchesExprColl(partcoll, opclause->inputcollid)) + return PARTCLAUSE_NOMATCH; + + /* + * Matched with this key. Now check various properties of the clause + * to see if it's sane to use it for pruning. If any of the + * properties makes it unsuitable for pruning, then the clause is + * useless no matter which key it's matched to. 
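+		 *
+		 * (Editor's illustration, not part of the original patch: a clause
+		 * such as "partkey = random()" is rejected below because the
+		 * comparison value is volatile and has no stable value at plan time;
+		 * non-strict operators are likewise rejected because they could
+		 * return true for NULL inputs, which would make it unsafe to skip
+		 * the null-accepting partition.)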
+ */ + + /* + * Only allow strict operators. This will guarantee nulls are + * filtered. + */ + if (!op_strict(opclause->opno)) + return PARTCLAUSE_UNSUPPORTED; + + /* We can't use any volatile expressions to prune partitions. */ + if (contain_volatile_functions((Node *) expr)) + return PARTCLAUSE_UNSUPPORTED; + + /* + * Normally we only bother with operators that are listed as being + * part of the partitioning operator family. But we make an exception + * in one case -- operators named '<>' are not listed in any operator + * family whatsoever, in which case, we try to perform partition + * pruning with it only if list partitioning is in use. + */ + if (!op_in_opfamily(opclause->opno, partopfamily)) + { + if (part_scheme->strategy != PARTITION_STRATEGY_LIST) + return PARTCLAUSE_UNSUPPORTED; + + /* + * To confirm if the operator is really '<>', check if its negator + * is a btree equality operator. + */ + negator = get_negator(opclause->opno); + if (OidIsValid(negator) && op_in_opfamily(negator, partopfamily)) + { + Oid lefttype; + Oid righttype; + int strategy; + + get_op_opfamily_properties(negator, partopfamily, false, + &strategy, &lefttype, &righttype); + + if (strategy == BTEqualStrategyNumber) + is_opne_listp = true; + } + + /* Operator isn't really what we were hoping it'd be. */ + if (!is_opne_listp) + return PARTCLAUSE_UNSUPPORTED; + } + + /* Check if we're going to need a cross-type comparison function. */ + exprtype = exprType((Node *) expr); + if (exprtype != part_scheme->partopcintype[partkeyidx]) + { + switch (part_scheme->strategy) + { + case PARTITION_STRATEGY_LIST: + case PARTITION_STRATEGY_RANGE: + cmpfn = + get_opfamily_proc(part_scheme->partopfamily[partkeyidx], + part_scheme->partopcintype[partkeyidx], + exprtype, BTORDER_PROC); + break; + + case PARTITION_STRATEGY_HASH: + cmpfn = + get_opfamily_proc(part_scheme->partopfamily[partkeyidx], + exprtype, exprtype, HASHEXTENDED_PROC); + break; + + default: + elog(ERROR, "invalid partition strategy: %c", + part_scheme->strategy); + break; + } + + /* If we couldn't find one, we cannot use this expression. */ + if (!OidIsValid(cmpfn)) + return PARTCLAUSE_UNSUPPORTED; + } + else + cmpfn = part_scheme->partsupfunc[partkeyidx].fn_oid; + + partclause = (PartClauseInfo *) palloc(sizeof(PartClauseInfo)); + partclause->keyno = partkeyidx; + + /* For <> operator clauses, pass on the negator. */ + partclause->op_is_ne = false; + partclause->op_strategy = InvalidStrategy; + + if (is_opne_listp) + { + Assert(OidIsValid(negator)); + partclause->opno = negator; + partclause->op_is_ne = true; + + /* + * We already know the strategy in this case, so may as well set + * it rather than having to look it up later. 
+ */ + partclause->op_strategy = BTEqualStrategyNumber; + } + /* And if commuted before matching, pass on the commutator */ + else if (OidIsValid(commutator)) + partclause->opno = commutator; + else + partclause->opno = opclause->opno; + + partclause->expr = expr; + partclause->cmpfn = cmpfn; + + *pc = partclause; + + return PARTCLAUSE_MATCH_CLAUSE; + } + else if (IsA(clause, ScalarArrayOpExpr)) + { + ScalarArrayOpExpr *saop = (ScalarArrayOpExpr *) clause; + Oid saop_op = saop->opno; + Oid saop_coll = saop->inputcollid; + Expr *leftop = (Expr *) linitial(saop->args), + *rightop = (Expr *) lsecond(saop->args); + List *elem_exprs, + *elem_clauses; + ListCell *lc1; + + if (IsA(leftop, RelabelType)) + leftop = ((RelabelType *) leftop)->arg; + + /* Check it matches this partition key */ + if (!equal(leftop, partkey) || + !PartCollMatchesExprColl(partcoll, saop->inputcollid)) + return PARTCLAUSE_NOMATCH; + + /* + * Matched with this key. Check various properties of the clause to + * see if it can sanely be used for partition pruning. + */ + + /* + * Only allow strict operators. This will guarantee nulls are + * filtered. + */ + if (!op_strict(saop->opno)) + return PARTCLAUSE_UNSUPPORTED; + + /* Useless if the array has any volatile functions. */ + if (contain_volatile_functions((Node *) rightop)) + return PARTCLAUSE_UNSUPPORTED; + + /* + * In case of NOT IN (..), we get a '<>', which we handle if list + * partitioning is in use and we're able to confirm that it's negator + * is a btree equality operator belonging to the partitioning operator + * family. + */ + if (!op_in_opfamily(saop_op, partopfamily)) + { + Oid negator; + + if (part_scheme->strategy != PARTITION_STRATEGY_LIST) + return PARTCLAUSE_UNSUPPORTED; + + negator = get_negator(saop_op); + if (OidIsValid(negator) && op_in_opfamily(negator, partopfamily)) + { + int strategy; + Oid lefttype, + righttype; + + get_op_opfamily_properties(negator, partopfamily, + false, &strategy, + &lefttype, &righttype); + if (strategy != BTEqualStrategyNumber) + return PARTCLAUSE_UNSUPPORTED; + } + } + + /* + * First generate a list of Const nodes, one for each array element + * (excepting nulls). + */ + elem_exprs = NIL; + if (IsA(rightop, Const)) + { + Const *arr = castNode(Const, rightop); + ArrayType *arrval = DatumGetArrayTypeP(arr->constvalue); + int16 elemlen; + bool elembyval; + char elemalign; + Datum *elem_values; + bool *elem_nulls; + int num_elems, + i; + + get_typlenbyvalalign(ARR_ELEMTYPE(arrval), + &elemlen, &elembyval, &elemalign); + deconstruct_array(arrval, + ARR_ELEMTYPE(arrval), + elemlen, elembyval, elemalign, + &elem_values, &elem_nulls, + &num_elems); + for (i = 0; i < num_elems; i++) + { + Const *elem_expr; + + /* Only consider non-null values. */ + if (elem_nulls[i]) + continue; + + elem_expr = makeConst(ARR_ELEMTYPE(arrval), -1, + arr->constcollid, elemlen, + elem_values[i], false, elembyval); + elem_exprs = lappend(elem_exprs, elem_expr); + } + } + else + { + ArrayExpr *arrexpr = castNode(ArrayExpr, rightop); + + /* + * For a nested ArrayExpr, we don't know how to get the actual + * scalar values out into a flat list, so we give up doing + * anything with this ScalarArrayOpExpr. 
+ */ + if (arrexpr->multidims) + return PARTCLAUSE_UNSUPPORTED; + + elem_exprs = arrexpr->elements; + } + + /* + * Now generate a list of clauses, one for each array element, of the + * form saop_leftop saop_op elem_expr + */ + elem_clauses = NIL; + foreach(lc1, elem_exprs) + { + Expr *rightop = (Expr *) lfirst(lc1), + *elem_clause; + + elem_clause = make_opclause(saop_op, BOOLOID, false, + leftop, rightop, + InvalidOid, saop_coll); + elem_clauses = lappend(elem_clauses, elem_clause); + } + + /* + * Build a combine step as if for an OR clause or add the clauses to + * the end of the list that's being processed currently. + */ + if (saop->useOr && list_length(elem_clauses) > 1) + { + Expr *orexpr; + bool contradictory; + + orexpr = makeBoolExpr(OR_EXPR, elem_clauses, -1); + *clause_steps = + gen_partprune_steps_internal(context, rel, list_make1(orexpr), + &contradictory); + if (contradictory) + return PARTCLAUSE_MATCH_CONTRADICT; + + Assert(list_length(*clause_steps) == 1); + return PARTCLAUSE_MATCH_STEPS; + } + else + { + bool contradictory; + + *clause_steps = + gen_partprune_steps_internal(context, rel, elem_clauses, + &contradictory); + if (contradictory) + return PARTCLAUSE_MATCH_CONTRADICT; + Assert(list_length(*clause_steps) >= 1); + return PARTCLAUSE_MATCH_STEPS; + } + } + else if (IsA(clause, NullTest)) + { + NullTest *nulltest = (NullTest *) clause; + Expr *arg = nulltest->arg; + + if (IsA(arg, RelabelType)) + arg = ((RelabelType *) arg)->arg; + + /* Does arg match with this partition key column? */ + if (!equal(arg, partkey)) + return PARTCLAUSE_NOMATCH; + + *clause_is_not_null = nulltest->nulltesttype == IS_NOT_NULL; + + return PARTCLAUSE_MATCH_NULLNESS; + } + + return PARTCLAUSE_UNSUPPORTED; +} + +/* + * get_steps_using_prefix + * Generate list of PartitionPruneStepOp steps each consisting of given + * opstrategy + * + * To generate steps, step_lastexpr and step_lastcmpfn are appended to + * expressions and cmpfns, respectively, extracted from the clauses in + * 'prefix'. Actually, since 'prefix' may contain multiple clauses for the + * same partition key column, we must generate steps for various combinations + * of the clauses of different keys. + */ +static List * +get_steps_using_prefix(GeneratePruningStepsContext *context, + StrategyNumber step_opstrategy, + bool step_op_is_ne, + Expr *step_lastexpr, + Oid step_lastcmpfn, + int step_lastkeyno, + Bitmapset *step_nullkeys, + List *prefix) +{ + /* Quick exit if there are no values to prefix with. */ + if (list_length(prefix) == 0) + { + PartitionPruneStep *step; + + step = gen_prune_step_op(context, + step_opstrategy, + step_op_is_ne, + list_make1(step_lastexpr), + list_make1_oid(step_lastcmpfn), + step_nullkeys); + return list_make1(step); + } + + /* Recurse to generate steps for various combinations. */ + return get_steps_using_prefix_recurse(context, + step_opstrategy, + step_op_is_ne, + step_lastexpr, + step_lastcmpfn, + step_lastkeyno, + step_nullkeys, + list_head(prefix), + NIL, NIL); +} + +/* + * get_steps_using_prefix_recurse + * Recursively generate combinations of clauses for different partition + * keys and start generating steps upon reaching clauses for the greatest + * column that is less than the one for which we're currently generating + * steps (that is, step_lastkeyno) + * + * 'start' is where we should start iterating for the current invocation. + * 'step_exprs' and 'step_cmpfns' each contains the expressions and cmpfns + * we've generated so far from the clauses for the previous part keys. 
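+ *
+ * (Editor's illustration, not part of the original patch: with clauses a = 1
+ * and a = 2 on the first partition key, b = 3 on the second, and a step being
+ * generated for c < 4 on the third, the recursion below emits two steps, one
+ * for the lookup vector (1, 3, expr-of-c) and one for (2, 3, expr-of-c); each
+ * combination of per-key expressions becomes its own step.)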
+ */ +static List * +get_steps_using_prefix_recurse(GeneratePruningStepsContext *context, + StrategyNumber step_opstrategy, + bool step_op_is_ne, + Expr *step_lastexpr, + Oid step_lastcmpfn, + int step_lastkeyno, + Bitmapset *step_nullkeys, + ListCell *start, + List *step_exprs, + List *step_cmpfns) +{ + List *result = NIL; + ListCell *lc; + int cur_keyno; + + /* Actually, recursion would be limited by PARTITION_MAX_KEYS. */ + check_stack_depth(); + + /* Check if we need to recurse. */ + Assert(start != NULL); + cur_keyno = ((PartClauseInfo *) lfirst(start))->keyno; + if (cur_keyno < step_lastkeyno - 1) + { + PartClauseInfo *pc; + ListCell *next_start; + + /* + * For each clause with cur_keyno, adds its expr and cmpfn to + * step_exprs and step_cmpfns, respectively, and recurse after setting + * next_start to the ListCell of the first clause for the next + * partition key. + */ + for_each_cell(lc, start) + { + pc = lfirst(lc); + + if (pc->keyno > cur_keyno) + break; + } + next_start = lc; + + for_each_cell(lc, start) + { + List *moresteps; + + pc = lfirst(lc); + if (pc->keyno == cur_keyno) + { + /* clean up before starting a new recursion cycle. */ + if (cur_keyno == 0) + { + list_free(step_exprs); + list_free(step_cmpfns); + step_exprs = list_make1(pc->expr); + step_cmpfns = list_make1_oid(pc->cmpfn); + } + else + { + step_exprs = lappend(step_exprs, pc->expr); + step_cmpfns = lappend_oid(step_cmpfns, pc->cmpfn); + } + } + else + { + Assert(pc->keyno > cur_keyno); + break; + } + + moresteps = get_steps_using_prefix_recurse(context, + step_opstrategy, + step_op_is_ne, + step_lastexpr, + step_lastcmpfn, + step_lastkeyno, + step_nullkeys, + next_start, + step_exprs, + step_cmpfns); + result = list_concat(result, moresteps); + } + } + else + { + /* + * End the current recursion cycle and start generating steps, one for + * each clause with cur_keyno, which is all clauses from here onward + * till the end of the list. + */ + Assert(list_length(step_exprs) == cur_keyno); + for_each_cell(lc, start) + { + PartClauseInfo *pc = lfirst(lc); + PartitionPruneStep *step; + List *step_exprs1, + *step_cmpfns1; + + Assert(pc->keyno == cur_keyno); + + /* Leave the original step_exprs unmodified. */ + step_exprs1 = list_copy(step_exprs); + step_exprs1 = lappend(step_exprs1, pc->expr); + step_exprs1 = lappend(step_exprs1, step_lastexpr); + + /* Leave the original step_cmpfns unmodified. */ + step_cmpfns1 = list_copy(step_cmpfns); + step_cmpfns1 = lappend_oid(step_cmpfns1, pc->cmpfn); + step_cmpfns1 = lappend_oid(step_cmpfns1, step_lastcmpfn); + + step = gen_prune_step_op(context, + step_opstrategy, step_op_is_ne, + step_exprs1, step_cmpfns1, + step_nullkeys); + result = lappend(result, step); + } + } + + return result; +} + +/* + * get_matching_hash_bounds + * Determine offset of the hash bound matching the specified values, + * considering that all the non-null values come from clauses containing + * a compatible hash equality operator and any keys that are null come + * from an IS NULL clause. + * + * Generally this function will return a single matching bound offset, + * although if a partition has not been setup for a given modulus then we may + * return no matches. If the number of clauses found don't cover the entire + * partition key, then we'll need to return all offsets. + * + * 'opstrategy' if non-zero must be HTEqualStrategyNumber. + * + * 'values' contains Datums indexed by the partition key to use for pruning. + * + * 'nvalues', the number of Datums in the 'values' array. 
+ * + * 'partsupfunc' contains partition hashing functions that can produce correct + * hash for the type of the values contained in 'values'. + * + * 'nullkeys' is the set of partition keys that are null. + */ +static PruneStepResult * +get_matching_hash_bounds(PartitionPruneContext *context, + StrategyNumber opstrategy, Datum *values, int nvalues, + FmgrInfo *partsupfunc, Bitmapset *nullkeys) +{ + PruneStepResult *result = (PruneStepResult *) palloc0(sizeof(PruneStepResult)); + PartitionBoundInfo boundinfo = context->boundinfo; + int *partindices = boundinfo->indexes; + int partnatts = context->partnatts; + bool isnull[PARTITION_MAX_KEYS]; + int i; + uint64 rowHash; + int greatest_modulus; + + Assert(context->strategy == PARTITION_STRATEGY_HASH); + + /* + * For hash partitioning we can only perform pruning based on equality + * clauses to the partition key or IS NULL clauses. We also can only + * prune if we got values for all keys. + */ + if (nvalues + bms_num_members(nullkeys) == partnatts) + { + /* + * If there are any values, they must have come from clauses + * containing an equality operator compatible with hash partitioning. + */ + Assert(opstrategy == HTEqualStrategyNumber || nvalues == 0); + + for (i = 0; i < partnatts; i++) + isnull[i] = bms_is_member(i, nullkeys); + + greatest_modulus = get_hash_partition_greatest_modulus(boundinfo); + rowHash = compute_hash_value(partnatts, partsupfunc, values, isnull); + + if (partindices[rowHash % greatest_modulus] >= 0) + result->bound_offsets = + bms_make_singleton(rowHash % greatest_modulus); + } + else + result->bound_offsets = bms_add_range(NULL, 0, + boundinfo->ndatums - 1); + + /* + * There is neither a special hash null partition or the default hash + * partition. + */ + result->scan_null = result->scan_default = false; + + return result; +} + +/* + * get_matching_list_bounds + * Determine the offsets of list bounds matching the specified value, + * according to the semantics of the given operator strategy + * 'opstrategy' if non-zero must be a btree strategy number. + * + * 'value' contains the value to use for pruning. + * + * 'nvalues', if non-zero, should be exactly 1, because of list partitioning. + * + * 'partsupfunc' contains the list partitioning comparison function to be used + * to perform partition_list_bsearch + * + * 'nullkeys' is the set of partition keys that are null. + */ +static PruneStepResult * +get_matching_list_bounds(PartitionPruneContext *context, + StrategyNumber opstrategy, Datum value, int nvalues, + FmgrInfo *partsupfunc, Bitmapset *nullkeys) +{ + PruneStepResult *result = (PruneStepResult *) palloc0(sizeof(PruneStepResult)); + PartitionBoundInfo boundinfo = context->boundinfo; + int off, + minoff, + maxoff; + bool is_equal; + bool inclusive = false; + Oid *partcollation = context->partcollation; + + Assert(context->strategy == PARTITION_STRATEGY_LIST); + Assert(context->partnatts == 1); + + result->scan_null = result->scan_default = false; + + if (!bms_is_empty(nullkeys)) + { + /* + * Nulls may exist in only one partition - the partition whose + * accepted set of values includes null or the default partition if + * the former doesn't exist. + */ + if (partition_bound_accepts_nulls(boundinfo)) + result->scan_null = true; + else + result->scan_default = partition_bound_has_default(boundinfo); + return result; + } + + /* + * If there are no datums to compare keys with, but there are partitions, + * just return the default partition if one exists. 
+ */ + if (boundinfo->ndatums == 0) + { + result->scan_default = partition_bound_has_default(boundinfo); + return result; + } + + minoff = 0; + maxoff = boundinfo->ndatums - 1; + + /* + * If there are no values to compare with the datums in boundinfo, it + * means the caller asked for partitions for all non-null datums. Add + * indexes of *all* partitions, including the default if any. + */ + if (nvalues == 0) + { + result->bound_offsets = bms_add_range(NULL, 0, + boundinfo->ndatums - 1); + result->scan_default = partition_bound_has_default(boundinfo); + return result; + } + + /* Special case handling of values coming from a <> operator clause. */ + if (opstrategy == InvalidStrategy) + { + /* + * First match to all bounds. We'll remove any matching datums below. + */ + result->bound_offsets = bms_add_range(NULL, 0, + boundinfo->ndatums - 1); + + off = partition_list_bsearch(partsupfunc, partcollation, boundinfo, + value, &is_equal); + if (off >= 0 && is_equal) + { + + /* We have a match. Remove from the result. */ + Assert(boundinfo->indexes[off] >= 0); + result->bound_offsets = bms_del_member(result->bound_offsets, + off); + } + + /* Always include the default partition if any. */ + result->scan_default = partition_bound_has_default(boundinfo); + + return result; + } + + /* + * With range queries, always include the default list partition, because + * list partitions divide the key space in a discontinuous manner, not all + * values in the given range will have a partition assigned. This may not + * technically be true for some data types (e.g. integer types), however, + * we currently lack any sort of infrastructure to provide us with proofs + * that would allow us to do anything smarter here. + */ + if (opstrategy != BTEqualStrategyNumber) + result->scan_default = partition_bound_has_default(boundinfo); + + switch (opstrategy) + { + case BTEqualStrategyNumber: + off = partition_list_bsearch(partsupfunc, + partcollation, + boundinfo, value, + &is_equal); + if (off >= 0 && is_equal) + { + Assert(boundinfo->indexes[off] >= 0); + result->bound_offsets = bms_make_singleton(off); + } + else + result->scan_default = partition_bound_has_default(boundinfo); + return result; + + case BTGreaterEqualStrategyNumber: + inclusive = true; + /* fall through */ + case BTGreaterStrategyNumber: + off = partition_list_bsearch(partsupfunc, + partcollation, + boundinfo, value, + &is_equal); + if (off >= 0) + { + /* We don't want the matched datum to be in the result. */ + if (!is_equal || !inclusive) + off++; + } + else + { + /* + * This case means all partition bounds are greater, which in + * turn means that all partitions satisfy this key. + */ + off = 0; + } + + /* + * off is greater than the numbers of datums we have partitions + * for. The only possible partition that could contain a match is + * the default partition, but we must've set context->scan_default + * above anyway if one exists. + */ + if (off > boundinfo->ndatums - 1) + return result; + + minoff = off; + break; + + case BTLessEqualStrategyNumber: + inclusive = true; + /* fall through */ + case BTLessStrategyNumber: + off = partition_list_bsearch(partsupfunc, + partcollation, + boundinfo, value, + &is_equal); + if (off >= 0 && is_equal && !inclusive) + off--; + + /* + * off is smaller than the datums of all non-default partitions. + * The only possible partition that could contain a match is the + * default partition, but we must've set context->scan_default + * above anyway if one exists. 
+ */ + if (off < 0) + return result; + + maxoff = off; + break; + + default: + elog(ERROR, "invalid strategy number %d", opstrategy); + break; + } + + result->bound_offsets = bms_add_range(NULL, minoff, maxoff); + return result; +} + + +/* + * get_matching_range_datums + * Determine the offsets of range bounds matching the specified values, + * according to the semantics of the given operator strategy + * + * Each datum whose offset is in result is to be treated as the upper bound of + * the partition that will contain the desired values. + * + * If default partition needs to be scanned for given values, set scan_default + * in result if present. + * + * 'opstrategy' if non-zero must be a btree strategy number. + * + * 'values' contains Datums indexed by the partition key to use for pruning. + * + * 'nvalues', number of Datums in 'values' array. Must be <= context->partnatts. + * + * 'partsupfunc' contains the range partitioning comparison functions to be + * used to perform partition_range_datum_bsearch or partition_rbound_datum_cmp + * using. + * + * 'nullkeys' is the set of partition keys that are null. + */ +static PruneStepResult * +get_matching_range_bounds(PartitionPruneContext *context, + StrategyNumber opstrategy, Datum *values, int nvalues, + FmgrInfo *partsupfunc, Bitmapset *nullkeys) +{ + PruneStepResult *result = (PruneStepResult *) palloc0(sizeof(PruneStepResult)); + PartitionBoundInfo boundinfo = context->boundinfo; + Oid *partcollation = context->partcollation; + int partnatts = context->partnatts; + int *partindices = boundinfo->indexes; + int off, + minoff, + maxoff, + i; + bool is_equal; + bool inclusive = false; + + Assert(context->strategy == PARTITION_STRATEGY_RANGE); + Assert(nvalues <= partnatts); + + result->scan_null = result->scan_default = false; + + /* + * If there are no datums to compare keys with, or if we got an IS NULL + * clause just return the default partition, if it exists. + */ + if (boundinfo->ndatums == 0 || !bms_is_empty(nullkeys)) + { + result->scan_default = partition_bound_has_default(boundinfo); + return result; + } + + minoff = 0; + maxoff = boundinfo->ndatums; + + /* + * If there are no values to compare with the datums in boundinfo, it + * means the caller asked for partitions for all non-null datums. Add + * indexes of *all* partitions, including the default partition if one + * exists. + */ + if (nvalues == 0) + { + if (partindices[minoff] < 0) + minoff++; + if (partindices[maxoff] < 0) + maxoff--; + + result->scan_default = partition_bound_has_default(boundinfo); + result->bound_offsets = bms_add_range(NULL, minoff, maxoff); + + return result; + } + + /* + * If the query does not constrain all key columns, we'll need to scan the + * the default partition, if any. + */ + if (nvalues < partnatts) + result->scan_default = partition_bound_has_default(boundinfo); + + switch (opstrategy) + { + case BTEqualStrategyNumber: + /* Look for the smallest bound that is = lookup value. */ + off = partition_range_datum_bsearch(partsupfunc, + partcollation, + boundinfo, + nvalues, values, + &is_equal); + + if (off >= 0 && is_equal) + { + if (nvalues == partnatts) + { + /* There can only be zero or one matching partition. 
*/ + if (partindices[off + 1] >= 0) + result->bound_offsets = bms_make_singleton(off + 1); + else + result->scan_default = + partition_bound_has_default(boundinfo); + return result; + } + else + { + int saved_off = off; + + /* + * Since the lookup value contains only a prefix of keys, + * we must find other bounds that may also match the + * prefix. partition_range_datum_bsearch() returns the + * offset of one of them, find others by checking adjacent + * bounds. + */ + + /* + * First find greatest bound that's smaller than the + * lookup value. + */ + while (off >= 1) + { + int32 cmpval; + + cmpval = + partition_rbound_datum_cmp(partsupfunc, + partcollation, + boundinfo->datums[off - 1], + boundinfo->kind[off - 1], + values, nvalues); + if (cmpval != 0) + break; + off--; + } + + Assert(0 == + partition_rbound_datum_cmp(partsupfunc, + partcollation, + boundinfo->datums[off], + boundinfo->kind[off], + values, nvalues)); + + /* + * We can treat 'off' as the offset of the smallest bound + * to be included in the result, if we know it is the + * upper bound of the partition in which the lookup value + * could possibly exist. One case it couldn't is if the + * bound, or precisely the matched portion of its prefix, + * is not inclusive. + */ + if (boundinfo->kind[off][nvalues] == + PARTITION_RANGE_DATUM_MINVALUE) + off++; + + minoff = off; + + /* + * Now find smallest bound that's greater than the lookup + * value. + */ + off = saved_off; + while (off < boundinfo->ndatums - 1) + { + int32 cmpval; + + cmpval = partition_rbound_datum_cmp(partsupfunc, + partcollation, + boundinfo->datums[off + 1], + boundinfo->kind[off + 1], + values, nvalues); + if (cmpval != 0) + break; + off++; + } + + Assert(0 == + partition_rbound_datum_cmp(partsupfunc, + partcollation, + boundinfo->datums[off], + boundinfo->kind[off], + values, nvalues)); + + /* + * off + 1, then would be the offset of the greatest bound + * to be included in the result. + */ + maxoff = off + 1; + } + + /* + * Skip if minoff/maxoff are actually the upper bound of a + * un-assigned portion of values. + */ + if (partindices[minoff] < 0 && minoff < boundinfo->ndatums) + minoff++; + if (partindices[maxoff] < 0 && maxoff >= 1) + maxoff--; + + /* + * There may exist a range of values unassigned to any + * non-default partition between the datums at minoff and + * maxoff. Add the default partition in that case. + */ + if (partition_bound_has_default(boundinfo)) + { + for (i = minoff; i <= maxoff; i++) + { + if (partindices[i] < 0) + { + result->scan_default = true; + break; + } + } + } + + Assert(minoff >= 0 && maxoff >= 0); + result->bound_offsets = bms_add_range(NULL, minoff, maxoff); + } + else if (off >= 0) /* !is_equal */ + { + /* + * The lookup value falls in the range between some bounds in + * boundinfo. 'off' would be the offset of the greatest bound + * that is <= lookup value, so add off + 1 to the result + * instead as the offset of the upper bound of the only + * partition that may contain the lookup value. + */ + if (partindices[off + 1] >= 0) + result->bound_offsets = bms_make_singleton(off + 1); + else + result->scan_default = + partition_bound_has_default(boundinfo); + } + else + { + /* + * off < 0: the lookup value is smaller than all bounds, so + * only the default partition qualifies, if there is one. 
+ */ + result->scan_default = partition_bound_has_default(boundinfo); + } + + return result; + + case BTGreaterEqualStrategyNumber: + inclusive = true; + /* fall through */ + case BTGreaterStrategyNumber: + + /* + * Look for the smallest bound that is > or >= lookup value and + * set minoff to its offset. + */ + off = partition_range_datum_bsearch(partsupfunc, + partcollation, + boundinfo, + nvalues, values, + &is_equal); + if (off < 0) + { + /* + * All bounds are greater than the lookup value, so include + * all of them in the result. + */ + minoff = 0; + } + else + { + if (is_equal && nvalues < partnatts) + { + /* + * Since the lookup value contains only a prefix of keys, + * we must find other bounds that may also match the + * prefix. partition_range_datum_bsearch() returns the + * offset of one of them, find others by checking adjacent + * bounds. + * + * Based on whether the lookup values are inclusive or + * not, we must either include the indexes of all such + * bounds in the result (that is, set minoff to the index + * of smallest such bound) or find the smallest one that's + * greater than the lookup values and set minoff to that. + */ + while (off >= 1 && off < boundinfo->ndatums - 1) + { + int32 cmpval; + int nextoff; + + nextoff = inclusive ? off - 1 : off + 1; + cmpval = + partition_rbound_datum_cmp(partsupfunc, + partcollation, + boundinfo->datums[nextoff], + boundinfo->kind[nextoff], + values, nvalues); + if (cmpval != 0) + break; + + off = nextoff; + } + + Assert(0 == + partition_rbound_datum_cmp(partsupfunc, + partcollation, + boundinfo->datums[off], + boundinfo->kind[off], + values, nvalues)); + + minoff = inclusive ? off : off + 1; + } + + /* + * lookup value falls in the range between some bounds in + * boundinfo. off would be the offset of the greatest bound + * that is <= lookup value, so add off + 1 to the result + * instead as the offset of the upper bound of the smallest + * partition that may contain the lookup value. + */ + else + minoff = off + 1; + } + break; + + case BTLessEqualStrategyNumber: + inclusive = true; + /* fall through */ + case BTLessStrategyNumber: + + /* + * Look for the greatest bound that is < or <= lookup value and + * set minoff to its offset. + */ + off = partition_range_datum_bsearch(partsupfunc, + partcollation, + boundinfo, + nvalues, values, + &is_equal); + if (off < 0) + { + /* + * All bounds are greater than the key, so we could only + * expect to find the lookup key in the default partition. + */ + result->scan_default = partition_bound_has_default(boundinfo); + return result; + } + else + { + /* + * See the comment above. + */ + if (is_equal && nvalues < partnatts) + { + while (off >= 1 && off < boundinfo->ndatums - 1) + { + int32 cmpval; + int nextoff; + + nextoff = inclusive ? off + 1 : off - 1; + cmpval = partition_rbound_datum_cmp(partsupfunc, + partcollation, + boundinfo->datums[nextoff], + boundinfo->kind[nextoff], + values, nvalues); + if (cmpval != 0) + break; + + off = nextoff; + } + + Assert(0 == + partition_rbound_datum_cmp(partsupfunc, + partcollation, + boundinfo->datums[off], + boundinfo->kind[off], + values, nvalues)); + + maxoff = inclusive ? off + 1 : off; + } + + /* + * The lookup value falls in the range between some bounds in + * boundinfo. 'off' would be the offset of the greatest bound + * that is <= lookup value, so add off + 1 to the result + * instead as the offset of the upper bound of the greatest + * partition that may contain lookup value. 
If the lookup + * value had exactly matched the bound, but it isn't + * inclusive, no need add the adjacent partition. + */ + else if (!is_equal || inclusive) + maxoff = off + 1; + else + maxoff = off; + } + break; + + default: + elog(ERROR, "invalid strategy number %d", opstrategy); + break; + } + + /* + * Skip a gap and when doing so, check if the bound contains a finite + * value to decide if we need to add the default partition. If it's an + * infinite bound, we need not add the default partition, as having an + * infinite bound means the partition in question catches any values that + * would otherwise be in the default partition. + */ + if (partindices[minoff] < 0) + { + int lastkey = nvalues - 1; + + if (minoff >= 0 && + minoff < boundinfo->ndatums && + boundinfo->kind[minoff][lastkey] == + PARTITION_RANGE_DATUM_VALUE) + result->scan_default = partition_bound_has_default(boundinfo); + + minoff++; + } + + /* + * Skip a gap. See the above comment about how we decide whether or or + * not to scan the default partition based whether the datum that will + * become the maximum datum is finite or not. + */ + if (maxoff >= 1 && partindices[maxoff] < 0) + { + int lastkey = nvalues - 1; + + if (maxoff >= 0 && + maxoff <= boundinfo->ndatums && + boundinfo->kind[maxoff - 1][lastkey] == + PARTITION_RANGE_DATUM_VALUE) + result->scan_default = partition_bound_has_default(boundinfo); + + maxoff--; + } + + if (partition_bound_has_default(boundinfo)) + { + /* + * There may exist a range of values unassigned to any non-default + * partition between the datums at minoff and maxoff. Add the default + * partition in that case. + */ + for (i = minoff; i <= maxoff; i++) + { + if (partindices[i] < 0) + { + result->scan_default = true; + break; + } + } + } + + Assert(minoff >= 0 && maxoff >= 0); + if (minoff <= maxoff) + result->bound_offsets = bms_add_range(NULL, minoff, maxoff); + + return result; +} + +/* + * perform_pruning_base_step + * Determines the indexes of datums that satisfy conditions specified in + * 'opstep'. + * + * Result also contains whether special null-accepting and/or default + * partition need to be scanned. + */ +static PruneStepResult * +perform_pruning_base_step(PartitionPruneContext *context, + PartitionPruneStepOp *opstep) +{ + ListCell *lc1, + *lc2; + int keyno, + nvalues; + Datum values[PARTITION_MAX_KEYS]; + FmgrInfo partsupfunc[PARTITION_MAX_KEYS]; + + /* + * There better be the same number of expressions and compare functions. + */ + Assert(list_length(opstep->exprs) == list_length(opstep->cmpfns)); + + nvalues = 0; + lc1 = list_head(opstep->exprs); + lc2 = list_head(opstep->cmpfns); + + /* + * Generate the partition lookup key that will be used by one of the + * get_matching_*_bounds functions called below. + */ + for (keyno = 0; keyno < context->partnatts; keyno++) + { + /* + * For hash partitioning, it is possible that values of some keys are + * not provided in operator clauses, but instead the planner found + * that they appeared in a IS NULL clause. + */ + if (bms_is_member(keyno, opstep->nullkeys)) + continue; + + /* + * For range partitioning, we must only perform pruning with values + * for either all partition keys or a prefix thereof. 
+ */ + if (keyno > nvalues && context->strategy == PARTITION_STRATEGY_RANGE) + break; + + if (lc1 != NULL) + { + Expr *expr; + Datum datum; + + expr = lfirst(lc1); + if (partkey_datum_from_expr(context, expr, &datum)) + { + Oid cmpfn; + + /* + * If we're going to need a different comparison function than + * the one cached in the PartitionKey, we'll need to look up + * the FmgrInfo. + */ + cmpfn = lfirst_oid(lc2); + Assert(OidIsValid(cmpfn)); + if (cmpfn != context->partsupfunc[keyno].fn_oid) + fmgr_info(cmpfn, &partsupfunc[keyno]); + else + fmgr_info_copy(&partsupfunc[keyno], + &context->partsupfunc[keyno], + CurrentMemoryContext); + + values[keyno] = datum; + nvalues++; + } + + lc1 = lnext(lc1); + lc2 = lnext(lc2); + } + } + + switch (context->strategy) + { + case PARTITION_STRATEGY_HASH: + return get_matching_hash_bounds(context, + opstep->opstrategy, + values, nvalues, + partsupfunc, + opstep->nullkeys); + + case PARTITION_STRATEGY_LIST: + return get_matching_list_bounds(context, + opstep->opstrategy, + values[0], nvalues, + &partsupfunc[0], + opstep->nullkeys); + + case PARTITION_STRATEGY_RANGE: + return get_matching_range_bounds(context, + opstep->opstrategy, + values, nvalues, + partsupfunc, + opstep->nullkeys); + + default: + elog(ERROR, "unexpected partition strategy: %d", + (int) context->strategy); + break; + } + + return NULL; +} + +/* + * perform_pruning_combine_step + * Determines the indexes of datums obtained by combining those given + * by the steps identified by cstep->source_stepids using the specified + * combination method + * + * Since cstep may refer to the result of earlier steps, we also receive + * step_results here. + */ +static PruneStepResult * +perform_pruning_combine_step(PartitionPruneContext *context, + PartitionPruneStepCombine *cstep, + PruneStepResult **step_results) +{ + ListCell *lc1; + PruneStepResult *result = NULL; + bool firststep; + + /* + * A combine step without any source steps is an indication to not perform + * any partition pruning, we just return all partitions. + */ + result = (PruneStepResult *) palloc0(sizeof(PruneStepResult)); + if (list_length(cstep->source_stepids) == 0) + { + PartitionBoundInfo boundinfo = context->boundinfo; + + result->bound_offsets = bms_add_range(NULL, 0, boundinfo->ndatums - 1); + result->scan_default = partition_bound_has_default(boundinfo); + result->scan_null = partition_bound_accepts_nulls(boundinfo); + return result; + } + + switch (cstep->combineOp) + { + case PARTPRUNE_COMBINE_UNION: + foreach(lc1, cstep->source_stepids) + { + int step_id = lfirst_int(lc1); + PruneStepResult *step_result; + + /* + * step_results[step_id] must contain a valid result, which is + * confirmed by the fact that cstep's step_id is greater than + * step_id and the fact that results of the individual steps + * are evaluated in sequence of their step_ids. + */ + if (step_id >= cstep->step.step_id) + elog(ERROR, "invalid pruning combine step argument"); + step_result = step_results[step_id]; + Assert(step_result != NULL); + + /* Record any additional datum indexes from this step */ + result->bound_offsets = bms_add_members(result->bound_offsets, + step_result->bound_offsets); + + /* Update whether to scan null and default partitions. 
*/ + if (!result->scan_null) + result->scan_null = step_result->scan_null; + if (!result->scan_default) + result->scan_default = step_result->scan_default; + } + break; + + case PARTPRUNE_COMBINE_INTERSECT: + firststep = true; + foreach(lc1, cstep->source_stepids) + { + int step_id = lfirst_int(lc1); + PruneStepResult *step_result; + + if (step_id >= cstep->step.step_id) + elog(ERROR, "invalid pruning combine step argument"); + step_result = step_results[step_id]; + Assert(step_result != NULL); + + if (firststep) + { + /* Copy step's result the first time. */ + result->bound_offsets = step_result->bound_offsets; + result->scan_null = step_result->scan_null; + result->scan_default = step_result->scan_default; + firststep = false; + } + else + { + /* Record datum indexes common to both steps */ + result->bound_offsets = + bms_int_members(result->bound_offsets, + step_result->bound_offsets); + + /* Update whether to scan null and default partitions. */ + if (result->scan_null) + result->scan_null = step_result->scan_null; + if (result->scan_default) + result->scan_default = step_result->scan_default; + } + } + break; + + default: + elog(ERROR, "invalid pruning combine op: %d", + (int) cstep->combineOp); + } + + return result; +} + +/* + * match_boolean_partition_clause + * + * Sets *outconst to a Const containing true or false value and returns true if + * we're able to match the clause to the partition key as specially-shaped + * Boolean clause. Returns false otherwise with *outconst set to NULL. + */ +static bool +match_boolean_partition_clause(Oid partopfamily, Expr *clause, Expr *partkey, + Expr **outconst) +{ + Expr *leftop; + + *outconst = NULL; + + if (!IsBooleanOpfamily(partopfamily)) + return false; + + if (IsA(clause, BooleanTest)) + { + BooleanTest *btest = (BooleanTest *) clause; + + /* Only IS [NOT] TRUE/FALSE are any good to us */ + if (btest->booltesttype == IS_UNKNOWN || + btest->booltesttype == IS_NOT_UNKNOWN) + return false; + + leftop = btest->arg; + if (IsA(leftop, RelabelType)) + leftop = ((RelabelType *) leftop)->arg; + + if (equal(leftop, partkey)) + *outconst = (btest->booltesttype == IS_TRUE || + btest->booltesttype == IS_NOT_FALSE) + ? (Expr *) makeBoolConst(true, false) + : (Expr *) makeBoolConst(false, false); + + if (*outconst) + return true; + } + else + { + bool is_not_clause = not_clause((Node *) clause); + + leftop = is_not_clause ? get_notclausearg(clause) : clause; + + if (IsA(leftop, RelabelType)) + leftop = ((RelabelType *) leftop)->arg; + + /* Compare to the partition key, and make up a clause ... */ + if (equal(leftop, partkey)) + *outconst = is_not_clause ? + (Expr *) makeBoolConst(false, false) : + (Expr *) makeBoolConst(true, false); + else if (equal(negate_clause((Node *) leftop), partkey)) + *outconst = (Expr *) makeBoolConst(false, false); + + if (*outconst) + return true; + } + + return false; +} + +/* + * partkey_datum_from_expr + * Evaluate 'expr', set *value to the resulting Datum. Return true if + * evaluation was possible, otherwise false. 
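+ *
+ * (Editor's note, not part of the original patch: only Const expressions are
+ * evaluated here, so a clause like "a = 5" produces a usable Datum, while
+ * "a = $1" or a comparison against a stable function's result does not and
+ * therefore cannot contribute to plan-time pruning in this version.)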
+ */ +static bool +partkey_datum_from_expr(PartitionPruneContext *context, + Expr *expr, Datum *value) +{ + switch (nodeTag(expr)) + { + case T_Const: + *value = ((Const *) expr)->constvalue; + return true; + + default: + break; + } + + return false; +} diff --git a/src/include/catalog/partition.h b/src/include/catalog/partition.h index 3d8b08ba..6cade9aa 100644 --- a/src/include/catalog/partition.h +++ b/src/include/catalog/partition.h @@ -26,7 +26,7 @@ * PartitionBoundInfo encapsulates a set of partition bounds. It is usually * associated with partitioned tables as part of its partition descriptor. * - * The internal structure is opaque outside partition.c. + * The internal structure appears in partbounds.h. */ typedef struct PartitionBoundInfoData *PartitionBoundInfo; @@ -69,7 +69,6 @@ extern void check_default_allows_bound(Relation parent, Relation defaultRel, PartitionBoundSpec *new_spec); extern List *get_proposed_default_constraint(List *new_part_constaints); -/* For tuple routing */ extern int get_partition_for_tuple(Relation relation, Datum *values, bool *isnull); diff --git a/src/include/catalog/pg_opfamily.h b/src/include/catalog/pg_opfamily.h index 044e4076..599fdd41 100644 --- a/src/include/catalog/pg_opfamily.h +++ b/src/include/catalog/pg_opfamily.h @@ -53,6 +53,9 @@ typedef FormData_pg_opfamily *Form_pg_opfamily; #define Anum_pg_opfamily_opfnamespace 3 #define Anum_pg_opfamily_opfowner 4 +#define IsBooleanOpfamily(opfamily) \ + ((opfamily) == BOOL_BTREE_FAM_OID || (opfamily) == BOOL_HASH_FAM_OID) + /* ---------------- * initial contents of pg_opfamily * ---------------- diff --git a/src/include/nodes/nodes.h b/src/include/nodes/nodes.h index 43f90ba9..2f585807 100644 --- a/src/include/nodes/nodes.h +++ b/src/include/nodes/nodes.h @@ -246,6 +246,9 @@ typedef enum NodeTag T_FromExpr, T_OnConflictExpr, T_IntoClause, + T_PartitionPruneStep, + T_PartitionPruneStepOp, + T_PartitionPruneStepCombine, #ifdef PGXC T_DistributeBy, T_PGXCSubCluster, @@ -328,7 +331,6 @@ typedef enum NodeTag T_PlaceHolderVar, T_SpecialJoinInfo, T_AppendRelInfo, - T_PartitionedChildRelInfo, T_PlaceHolderInfo, T_MinMaxAggInfo, T_PlannerParamItem, diff --git a/src/include/nodes/primnodes.h b/src/include/nodes/primnodes.h index faee37ff..a8df3382 100644 --- a/src/include/nodes/primnodes.h +++ b/src/include/nodes/primnodes.h @@ -20,6 +20,7 @@ #define PRIMNODES_H #include "access/attnum.h" +#include "access/stratnum.h" #include "nodes/bitmapset.h" #include "nodes/pg_list.h" @@ -1597,4 +1598,78 @@ typedef struct OnConflictExpr List *exclRelTlist; /* tlist of the EXCLUDED pseudo relation */ } OnConflictExpr; + +/* + * Node types to represent a partition pruning step. + */ + +/* + * The base Node type. step_id is the global identifier of a given step + * within a given pruning context. + */ +typedef struct PartitionPruneStep +{ + NodeTag type; + int step_id; +} PartitionPruneStep; + +/*---------- + * PartitionPruneStepOp - Information to prune using a set of mutually AND'd + * OpExpr clauses + * + * This contains information extracted from up to partnatts OpExpr clauses, + * where partnatts is the number of partition key columns. 'opstrategy' is the + * strategy of the operator in the clause matched to the last partition key. + * 'exprs' contains expressions which comprise the lookup key to be passed to + * the partition bound search function. 'cmpfns' contains the OIDs of + * comparison function used to compare aforementioned expressions with + * partition bounds. 
Both 'exprs' and 'cmpfns' contain the same number of + * items up to partnatts items. + * + * Once we find the offset of a partition bound using the lookup key, we + * determine which partitions to include in the result based on the value of + * 'opstrategy'. For example, if it were equality, we'd return just the + * partition that would contain that key or a set of partitions if the key + * didn't consist of all partitioning columns. For non-equality strategies, + * we'd need to include other partitions as appropriate. + * + * 'nullkeys' is the set containing the offset of the partition keys (0 to + * partnatts - 1) that were matched to an IS NULL clause. This is only + * considered for hash partitioning as we need to pass which keys are null + * to the hash partition bound search function. It is never possible to + * have an expression be present in 'exprs' for a given partition key and + * the corresponding bit set in 'nullkeys'. + *---------- + */ +typedef struct PartitionPruneStepOp +{ + PartitionPruneStep step; + + StrategyNumber opstrategy; + List *exprs; + List *cmpfns; + Bitmapset *nullkeys; +} PartitionPruneStepOp; + +/*---------- + * PartitionPruneStepCombine - Information to prune using a BoolExpr clause + * + * For BoolExpr clauses, we combine the set of partitions determined for each + * of its argument clauses. + *---------- + */ +typedef enum PartitionPruneCombineOp +{ + PARTPRUNE_COMBINE_UNION, + PARTPRUNE_COMBINE_INTERSECT +} PartitionPruneCombineOp; + +typedef struct PartitionPruneStepCombine +{ + PartitionPruneStep step; + + PartitionPruneCombineOp combineOp; + List *source_stepids; +} PartitionPruneStepCombine; + #endif /* PRIMNODES_H */ diff --git a/src/include/nodes/relation.h b/src/include/nodes/relation.h index 6172b31e..e49bc1a0 100644 --- a/src/include/nodes/relation.h +++ b/src/include/nodes/relation.h @@ -76,6 +76,7 @@ #define RELATION_H #include "access/sdir.h" +#include "fmgr.h" #include "lib/stringinfo.h" #include "nodes/params.h" #include "nodes/parsenodes.h" @@ -345,8 +346,6 @@ typedef struct PlannerInfo List *append_rel_list; /* list of AppendRelInfos */ - List *pcinfo_list; /* list of PartitionedChildRelInfos */ - List *rowMarks; /* list of PlanRowMarks */ List *placeholder_list; /* list of PlaceHolderInfos */ @@ -417,6 +416,9 @@ typedef struct PlannerInfo /* optional private data for join_search_hook, e.g., GEQO */ void *join_search_private; + + /* Does this query modify any partition key columns? */ + bool partColsUpdated; #ifdef XCP /* * This is NULL for a SELECT query (NULL distribution means "Coordinator" @@ -468,6 +470,9 @@ typedef struct PartitionSchemeData /* Cached information about partition key data types. */ int16 *parttyplen; bool *parttypbyval; + + /* Cached information about partition comparison functions. 
*/ + FmgrInfo *partsupfunc; } PartitionSchemeData; typedef struct PartitionSchemeData *PartitionScheme; @@ -641,10 +646,14 @@ typedef struct PartitionSchemeData *PartitionScheme; * If the relation is partitioned, these fields will be set: * * part_scheme - Partitioning scheme of the relation - * boundinfo - Partition bounds * nparts - Number of partitions + * boundinfo - Partition bounds + * partition_qual - Partition constraint if not the root * part_rels - RelOptInfos for each partition * partexprs, nullable_partexprs - Partition key expressions + * partitioned_child_rels - RT indexes of unpruned partitions of + * relation that are partitioned tables + * themselves * * Note: A base relation always has only one set of partition keys, but a join * relation may have as many sets of partition keys as the number of relations @@ -771,10 +780,12 @@ typedef struct RelOptInfo PartitionScheme part_scheme; /* Partitioning scheme. */ int nparts; /* number of partitions */ struct PartitionBoundInfoData *boundinfo; /* Partition bounds */ + List *partition_qual; /* partition constraint */ struct RelOptInfo **part_rels; /* Array of RelOptInfos of partitions, * stored in the same order of bounds */ List **partexprs; /* Non-nullable partition key expressions. */ List **nullable_partexprs; /* Nullable partition key expressions. */ + List *partitioned_child_rels; /* List of RT indexes. */ #ifdef __TBASE__ /* used for interval partition */ bool intervalparent; /* is interval partition */ @@ -2251,27 +2262,6 @@ typedef struct AppendRelInfo Oid parent_reloid; /* OID of parent relation */ } AppendRelInfo; -/* - * For a partitioned table, this maps its RT index to the list of RT indexes - * of the partitioned child tables in the partition tree. We need to - * separately store this information, because we do not create AppendRelInfos - * for the partitioned child tables of a parent table, since AppendRelInfos - * contain information that is unnecessary for the partitioned child tables. - * The child_rels list must contain at least one element, because the parent - * partitioned table is itself counted as a child. - * - * These structs are kept in the PlannerInfo node's pcinfo_list. - */ -typedef struct PartitionedChildRelInfo -{ - NodeTag type; - - Index parent_relid; - List *child_rels; - bool part_cols_updated; /* is the partition key of any of - * the partitioned tables updated? */ -} PartitionedChildRelInfo; - /* * For each distinct placeholder expression generated during planning, we * store a PlaceHolderInfo node in the PlannerInfo node's placeholder_list. 
diff --git a/src/include/optimizer/planner.h b/src/include/optimizer/planner.h index 1425e543..2e47c1e3 100644 --- a/src/include/optimizer/planner.h +++ b/src/include/optimizer/planner.h @@ -121,11 +121,6 @@ extern Expr *preprocess_phv_expression(PlannerInfo *root, Expr *expr); extern bool plan_cluster_use_sort(Oid tableOid, Oid indexOid); -extern List *get_partitioned_child_rels(PlannerInfo *root, Index rti, - bool *part_cols_updated); -extern List *get_partitioned_child_rels_for_join(PlannerInfo *root, - Relids join_relids); - extern void preprocess_rowmarks(PlannerInfo *root); #ifdef __TBASE__ diff --git a/src/include/partitioning/partbounds.h b/src/include/partitioning/partbounds.h new file mode 100644 index 00000000..c76014d4 --- /dev/null +++ b/src/include/partitioning/partbounds.h @@ -0,0 +1,124 @@ +/*------------------------------------------------------------------------- + * + * partbounds.h + * + * Copyright (c) 2007-2018, PostgreSQL Global Development Group + * + * src/include/partitioning/partbounds.h + * + *------------------------------------------------------------------------- + */ +#ifndef PARTBOUNDS_H +#define PARTBOUNDS_H + +#include "catalog/partition.h" + + +/* + * PartitionBoundInfoData encapsulates a set of partition bounds. It is + * usually associated with partitioned tables as part of its partition + * descriptor, but may also be used to represent a virtual partitioned + * table such as a partitioned joinrel within the planner. + * + * A list partition datum that is known to be NULL is never put into the + * datums array. Instead, it is tracked using the null_index field. + * + * In the case of range partitioning, ndatums will typically be far less than + * 2 * nparts, because a partition's upper bound and the next partition's lower + * bound are the same in most common cases, and we only store one of them (the + * upper bound). In case of hash partitioning, ndatums will be same as the + * number of partitions. + * + * For range and list partitioned tables, datums is an array of datum-tuples + * with key->partnatts datums each. For hash partitioned tables, it is an array + * of datum-tuples with 2 datums, modulus and remainder, corresponding to a + * given partition. + * + * The datums in datums array are arranged in increasing order as defined by + * functions qsort_partition_rbound_cmp(), qsort_partition_list_value_cmp() and + * qsort_partition_hbound_cmp() for range, list and hash partitioned tables + * respectively. For range and list partitions this simply means that the + * datums in the datums array are arranged in increasing order as defined by + * the partition key's operator classes and collations. + * + * In the case of list partitioning, the indexes array stores one entry for + * every datum, which is the index of the partition that accepts a given datum. + * In case of range partitioning, it stores one entry per distinct range + * datum, which is the index of the partition for which a given datum + * is an upper bound. In the case of hash partitioning, the number of the + * entries in the indexes array is same as the greatest modulus amongst all + * partitions. For a given partition key datum-tuple, the index of the + * partition which would accept that datum-tuple would be given by the entry + * pointed by remainder produced when hash value of the datum-tuple is divided + * by the greatest modulus. + */ + +typedef struct PartitionBoundInfoData +{ + char strategy; /* hash, list or range? 
*/ + int ndatums; /* Length of the datums following array */ + Datum **datums; + PartitionRangeDatumKind **kind; /* The kind of each range bound datum; + * NULL for hash and list partitioned + * tables */ + int *indexes; /* Partition indexes */ + int null_index; /* Index of the null-accepting partition; -1 + * if there isn't one */ + int default_index; /* Index of the default partition; -1 if there + * isn't one */ +} PartitionBoundInfoData; + +#define partition_bound_accepts_nulls(bi) ((bi)->null_index != -1) +#define partition_bound_has_default(bi) ((bi)->default_index != -1) + +/* + * When qsort'ing partition bounds after reading from the catalog, each bound + * is represented with one of the following structs. + */ + +/* One bound of a hash partition */ +typedef struct PartitionHashBound +{ + int modulus; + int remainder; + int index; +} PartitionHashBound; + +/* One value coming from some (index'th) list partition */ +typedef struct PartitionListValue +{ + int index; + Datum value; +} PartitionListValue; + +/* One bound of a range partition */ +typedef struct PartitionRangeBound +{ + int index; + Datum *datums; /* range bound datums */ + PartitionRangeDatumKind *kind; /* the kind of each datum */ + bool lower; /* this is the lower (vs upper) bound */ +} PartitionRangeBound; + +extern int get_hash_partition_greatest_modulus(PartitionBoundInfo b); +extern int partition_list_bsearch(FmgrInfo *partsupfunc, Oid *partcollation, + PartitionBoundInfo boundinfo, + Datum value, bool *is_equal); +extern int partition_range_bsearch(int partnatts, FmgrInfo *partsupfunc, + Oid *partcollation, + PartitionBoundInfo boundinfo, + PartitionRangeBound *probe, bool *is_equal); +extern int partition_range_datum_bsearch(FmgrInfo *partsupfunc, + Oid *partcollation, + PartitionBoundInfo boundinfo, + int nvalues, Datum *values, bool *is_equal); +extern int partition_hash_bsearch(PartitionBoundInfo boundinfo, + int modulus, int remainder); +extern uint64 compute_hash_value(int partnatts, FmgrInfo *partsupfunc, + Datum *values, bool *isnull); +extern int32 partition_rbound_datum_cmp(FmgrInfo *partsupfunc, + Oid *partcollation, + Datum *rb_datums, PartitionRangeDatumKind *rb_kind, + Datum *tuple_datums, int n_tuple_datums); + +#endif /* PARTBOUNDS_H */ diff --git a/src/include/partitioning/partprune.h b/src/include/partitioning/partprune.h new file mode 100644 index 00000000..52fadc7c --- /dev/null +++ b/src/include/partitioning/partprune.h @@ -0,0 +1,49 @@ +/*------------------------------------------------------------------------- + * + * partprune.h + * prototypes for partprune.c + * + * + * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/partitioning/partprune.h + * + *------------------------------------------------------------------------- + */ +#ifndef PARTPRUNE_H +#define PARTPRUNE_H + +#include "catalog/partition.h" +#include "nodes/relation.h" + +/* + * PartitionPruneContext + * + * Information about a partitioned table needed to perform partition pruning. 
+ */ +typedef struct PartitionPruneContext +{ + /* Partition key information */ + char strategy; + int partnatts; + Oid *partopfamily; + Oid *partopcintype; + Oid *partcollation; + FmgrInfo *partsupfunc; + + /* Number of partitions */ + int nparts; + + /* Partition boundary info */ + PartitionBoundInfo boundinfo; +} PartitionPruneContext; + + +extern Relids prune_append_rel_partitions(RelOptInfo *rel); +extern Bitmapset *get_matching_partitions(PartitionPruneContext *context, + List *pruning_steps); +extern List *gen_partprune_steps(RelOptInfo *rel, List *clauses, + bool *contradictory); + +#endif /* PARTPRUNE_H */ diff --git a/src/test/regress/expected/inherit.out b/src/test/regress/expected/inherit.out index 51d9903d..be0a774d 100644 --- a/src/test/regress/expected/inherit.out +++ b/src/test/regress/expected/inherit.out @@ -2215,9 +2215,11 @@ explain (costs off) select * from mcrparted where abs(b) = 5; -- scans all parti Filter: (abs(b) = 5) -> Seq Scan on mcrparted3 Filter: (abs(b) = 5) + -> Seq Scan on mcrparted4 + Filter: (abs(b) = 5) -> Seq Scan on mcrparted5 Filter: (abs(b) = 5) -(12 rows) +(14 rows) explain (costs off) select * from mcrparted where a > -1; -- scans all partitions QUERY PLAN diff --git a/src/test/regress/expected/inherit_1.out b/src/test/regress/expected/inherit_1.out index a6b99b17..d16ab5d6 100644 --- a/src/test/regress/expected/inherit_1.out +++ b/src/test/regress/expected/inherit_1.out @@ -2213,9 +2213,11 @@ explain (costs off) select * from mcrparted where abs(b) = 5; -- scans all parti Filter: (abs(b) = 5) -> Seq Scan on mcrparted3 Filter: (abs(b) = 5) + -> Seq Scan on mcrparted4 + Filter: (abs(b) = 5) -> Seq Scan on mcrparted5 Filter: (abs(b) = 5) -(13 rows) +(15 rows) explain (costs off) select * from mcrparted where a > -1; -- scans all partitions QUERY PLAN diff --git a/src/test/regress/expected/inherit_2.out b/src/test/regress/expected/inherit_2.out index ef08ec3e..0502f335 100644 --- a/src/test/regress/expected/inherit_2.out +++ b/src/test/regress/expected/inherit_2.out @@ -2180,9 +2180,11 @@ explain (costs off) select * from mcrparted where abs(b) = 5; -- scans all parti Filter: (abs(b) = 5) -> Seq Scan on mcrparted3 Filter: (abs(b) = 5) + -> Seq Scan on mcrparted4 + Filter: (abs(b) = 5) -> Seq Scan on mcrparted5 Filter: (abs(b) = 5) -(12 rows) +(14 rows) explain (costs off) select * from mcrparted where a > -1; -- scans all partitions QUERY PLAN diff --git a/src/test/regress/expected/inherit_3.out b/src/test/regress/expected/inherit_3.out index 9a33a70d..955a1170 100644 --- a/src/test/regress/expected/inherit_3.out +++ b/src/test/regress/expected/inherit_3.out @@ -2200,9 +2200,11 @@ explain (costs off) select * from mcrparted where abs(b) = 5; -- scans all parti Filter: (abs(b) = 5) -> Seq Scan on mcrparted3 Filter: (abs(b) = 5) + -> Seq Scan on mcrparted4 + Filter: (abs(b) = 5) -> Seq Scan on mcrparted5 Filter: (abs(b) = 5) -(13 rows) +(15 rows) explain (costs off) select * from mcrparted where a > -1; -- scans all partitions QUERY PLAN diff --git a/src/test/regress/expected/partition_prune.out b/src/test/regress/expected/partition_prune.out index aabb0240..fe195e31 100644 --- a/src/test/regress/expected/partition_prune.out +++ b/src/test/regress/expected/partition_prune.out @@ -208,16 +208,14 @@ explain (costs off) select * from rlp where 1 > a; /* commuted */ (3 rows) explain (costs off) select * from rlp where a <= 1; - QUERY PLAN ---------------------------------------- + QUERY PLAN +-------------------------- Append -> Seq Scan on rlp1 Filter: 
(a <= 1) -> Seq Scan on rlp2 Filter: (a <= 1) - -> Seq Scan on rlp_default_default - Filter: (a <= 1) -(7 rows) +(5 rows) explain (costs off) select * from rlp where a = 1; QUERY PLAN @@ -577,7 +575,9 @@ explain (costs off) select * from rlp where a > 20 and a < 27; Filter: ((a > 20) AND (a < 27)) -> Seq Scan on rlp4_default Filter: ((a > 20) AND (a < 27)) -(7 rows) + -> Seq Scan on rlp_default_default + Filter: ((a > 20) AND (a < 27)) +(9 rows) explain (costs off) select * from rlp where a = 29; QUERY PLAN @@ -716,9 +716,7 @@ explain (costs off) select * from mc3p where a = 1 and abs(b) = 1 and c < 8; Filter: ((c < 8) AND (a = 1) AND (abs(b) = 1)) -> Seq Scan on mc3p1 Filter: ((c < 8) AND (a = 1) AND (abs(b) = 1)) - -> Seq Scan on mc3p_default - Filter: ((c < 8) AND (a = 1) AND (abs(b) = 1)) -(7 rows) +(5 rows) explain (costs off) select * from mc3p where a = 10 and abs(b) between 5 and 35; QUERY PLAN @@ -894,6 +892,8 @@ explain (costs off) select * from mc3p where a = 1 or abs(b) = 1 or c = 1; Filter: ((a = 1) OR (abs(b) = 1) OR (c = 1)) -> Seq Scan on mc3p2 Filter: ((a = 1) OR (abs(b) = 1) OR (c = 1)) + -> Seq Scan on mc3p3 + Filter: ((a = 1) OR (abs(b) = 1) OR (c = 1)) -> Seq Scan on mc3p4 Filter: ((a = 1) OR (abs(b) = 1) OR (c = 1)) -> Seq Scan on mc3p5 @@ -904,7 +904,7 @@ explain (costs off) select * from mc3p where a = 1 or abs(b) = 1 or c = 1; Filter: ((a = 1) OR (abs(b) = 1) OR (c = 1)) -> Seq Scan on mc3p_default Filter: ((a = 1) OR (abs(b) = 1) OR (c = 1)) -(17 rows) +(19 rows) explain (costs off) select * from mc3p where (a = 1 and abs(b) = 1) or (a = 10 and abs(b) = 10); QUERY PLAN @@ -1040,33 +1040,22 @@ explain (costs off) select * from boolpart where a is true or a is not true; Filter: ((a IS TRUE) OR (a IS NOT TRUE)) -> Seq Scan on boolpart_t Filter: ((a IS TRUE) OR (a IS NOT TRUE)) - -> Seq Scan on boolpart_default - Filter: ((a IS TRUE) OR (a IS NOT TRUE)) -(7 rows) +(5 rows) explain (costs off) select * from boolpart where a is not true; - QUERY PLAN ------------------------------------- + QUERY PLAN +--------------------------------- Append -> Seq Scan on boolpart_f Filter: (a IS NOT TRUE) - -> Seq Scan on boolpart_t - Filter: (a IS NOT TRUE) - -> Seq Scan on boolpart_default - Filter: (a IS NOT TRUE) -(7 rows) +(3 rows) explain (costs off) select * from boolpart where a is not true and a is not false; - QUERY PLAN --------------------------------------------------------- - Append - -> Seq Scan on boolpart_f - Filter: ((a IS NOT TRUE) AND (a IS NOT FALSE)) - -> Seq Scan on boolpart_t - Filter: ((a IS NOT TRUE) AND (a IS NOT FALSE)) - -> Seq Scan on boolpart_default - Filter: ((a IS NOT TRUE) AND (a IS NOT FALSE)) -(7 rows) + QUERY PLAN +-------------------------- + Result + One-Time Filter: false +(2 rows) explain (costs off) select * from boolpart where a is unknown; QUERY PLAN @@ -1092,4 +1081,446 @@ explain (costs off) select * from boolpart where a is not unknown; Filter: (a IS NOT UNKNOWN) (7 rows) -drop table lp, coll_pruning, rlp, mc3p, mc2p, boolpart; +-- +-- some more cases +-- +-- +-- pruning for partitioned table appearing inside a sub-query +-- +-- pruning won't work for mc3p, because some keys are Params +explain (costs off) select * from mc2p t1, lateral (select count(*) from mc3p t2 where t2.a = t1.b and abs(t2.b) = 1 and t2.c = 1) s where t1.a = 1; + QUERY PLAN +----------------------------------------------------------------------- + Nested Loop + -> Append + -> Seq Scan on mc2p1 t1 + Filter: (a = 1) + -> Seq Scan on mc2p2 t1_1 + Filter: (a = 1) + -> 
Seq Scan on mc2p_default t1_2 + Filter: (a = 1) + -> Aggregate + -> Append + -> Seq Scan on mc3p0 t2 + Filter: ((a = t1.b) AND (c = 1) AND (abs(b) = 1)) + -> Seq Scan on mc3p1 t2_1 + Filter: ((a = t1.b) AND (c = 1) AND (abs(b) = 1)) + -> Seq Scan on mc3p2 t2_2 + Filter: ((a = t1.b) AND (c = 1) AND (abs(b) = 1)) + -> Seq Scan on mc3p3 t2_3 + Filter: ((a = t1.b) AND (c = 1) AND (abs(b) = 1)) + -> Seq Scan on mc3p4 t2_4 + Filter: ((a = t1.b) AND (c = 1) AND (abs(b) = 1)) + -> Seq Scan on mc3p5 t2_5 + Filter: ((a = t1.b) AND (c = 1) AND (abs(b) = 1)) + -> Seq Scan on mc3p6 t2_6 + Filter: ((a = t1.b) AND (c = 1) AND (abs(b) = 1)) + -> Seq Scan on mc3p7 t2_7 + Filter: ((a = t1.b) AND (c = 1) AND (abs(b) = 1)) + -> Seq Scan on mc3p_default t2_8 + Filter: ((a = t1.b) AND (c = 1) AND (abs(b) = 1)) +(28 rows) + +-- pruning should work fine, because values for a prefix of keys (a, b) are +-- available +explain (costs off) select * from mc2p t1, lateral (select count(*) from mc3p t2 where t2.c = t1.b and abs(t2.b) = 1 and t2.a = 1) s where t1.a = 1; + QUERY PLAN +----------------------------------------------------------------------- + Nested Loop + -> Append + -> Seq Scan on mc2p1 t1 + Filter: (a = 1) + -> Seq Scan on mc2p2 t1_1 + Filter: (a = 1) + -> Seq Scan on mc2p_default t1_2 + Filter: (a = 1) + -> Aggregate + -> Append + -> Seq Scan on mc3p0 t2 + Filter: ((c = t1.b) AND (a = 1) AND (abs(b) = 1)) + -> Seq Scan on mc3p1 t2_1 + Filter: ((c = t1.b) AND (a = 1) AND (abs(b) = 1)) + -> Seq Scan on mc3p_default t2_2 + Filter: ((c = t1.b) AND (a = 1) AND (abs(b) = 1)) +(16 rows) + +-- also here, because values for all keys are provided +explain (costs off) select * from mc2p t1, lateral (select count(*) from mc3p t2 where t2.a = 1 and abs(t2.b) = 1 and t2.c = 1) s where t1.a = 1; + QUERY PLAN +-------------------------------------------------------------------- + Nested Loop + -> Aggregate + -> Append + -> Seq Scan on mc3p1 t2 + Filter: ((a = 1) AND (c = 1) AND (abs(b) = 1)) + -> Append + -> Seq Scan on mc2p1 t1 + Filter: (a = 1) + -> Seq Scan on mc2p2 t1_1 + Filter: (a = 1) + -> Seq Scan on mc2p_default t1_2 + Filter: (a = 1) +(12 rows) + +-- +-- pruning with clauses containing <> operator +-- +-- doesn't prune range partitions +create table rp (a int) partition by range (a); +create table rp0 partition of rp for values from (minvalue) to (1); +create table rp1 partition of rp for values from (1) to (2); +create table rp2 partition of rp for values from (2) to (maxvalue); +explain (costs off) select * from rp where a <> 1; + QUERY PLAN +-------------------------- + Append + -> Seq Scan on rp0 + Filter: (a <> 1) + -> Seq Scan on rp1 + Filter: (a <> 1) + -> Seq Scan on rp2 + Filter: (a <> 1) +(7 rows) + +explain (costs off) select * from rp where a <> 1 and a <> 2; + QUERY PLAN +----------------------------------------- + Append + -> Seq Scan on rp0 + Filter: ((a <> 1) AND (a <> 2)) + -> Seq Scan on rp1 + Filter: ((a <> 1) AND (a <> 2)) + -> Seq Scan on rp2 + Filter: ((a <> 1) AND (a <> 2)) +(7 rows) + +-- null partition should be eliminated due to strict <> clause. 
+explain (costs off) select * from lp where a <> 'a'; + QUERY PLAN +------------------------------------ + Append + -> Seq Scan on lp_ad + Filter: (a <> 'a'::bpchar) + -> Seq Scan on lp_bc + Filter: (a <> 'a'::bpchar) + -> Seq Scan on lp_ef + Filter: (a <> 'a'::bpchar) + -> Seq Scan on lp_g + Filter: (a <> 'a'::bpchar) + -> Seq Scan on lp_default + Filter: (a <> 'a'::bpchar) +(11 rows) + +-- ensure we detect contradictions in clauses; a can't be NULL and NOT NULL. +explain (costs off) select * from lp where a <> 'a' and a is null; + QUERY PLAN +-------------------------- + Result + One-Time Filter: false +(2 rows) + +explain (costs off) select * from lp where (a <> 'a' and a <> 'd') or a is null; + QUERY PLAN +------------------------------------------------------------------------------ + Append + -> Seq Scan on lp_bc + Filter: (((a <> 'a'::bpchar) AND (a <> 'd'::bpchar)) OR (a IS NULL)) + -> Seq Scan on lp_ef + Filter: (((a <> 'a'::bpchar) AND (a <> 'd'::bpchar)) OR (a IS NULL)) + -> Seq Scan on lp_g + Filter: (((a <> 'a'::bpchar) AND (a <> 'd'::bpchar)) OR (a IS NULL)) + -> Seq Scan on lp_null + Filter: (((a <> 'a'::bpchar) AND (a <> 'd'::bpchar)) OR (a IS NULL)) + -> Seq Scan on lp_default + Filter: (((a <> 'a'::bpchar) AND (a <> 'd'::bpchar)) OR (a IS NULL)) +(11 rows) + +-- check that it also works for a partitioned table that's not root, +-- which in this case are partitions of rlp that are themselves +-- list-partitioned on b +explain (costs off) select * from rlp where a = 15 and b <> 'ab' and b <> 'cd' and b <> 'xy' and b is not null; + QUERY PLAN +------------------------------------------------------------------------------------------------------------------------------------------ + Append + -> Seq Scan on rlp3efgh + Filter: ((b IS NOT NULL) AND ((b)::text <> 'ab'::text) AND ((b)::text <> 'cd'::text) AND ((b)::text <> 'xy'::text) AND (a = 15)) + -> Seq Scan on rlp3_default + Filter: ((b IS NOT NULL) AND ((b)::text <> 'ab'::text) AND ((b)::text <> 'cd'::text) AND ((b)::text <> 'xy'::text) AND (a = 15)) +(5 rows) + +-- +-- different collations for different keys with same expression +-- +create table coll_pruning_multi (a text) partition by range (substr(a, 1) collate "POSIX", substr(a, 1) collate "C"); +create table coll_pruning_multi1 partition of coll_pruning_multi for values from ('a', 'a') to ('a', 'e'); +create table coll_pruning_multi2 partition of coll_pruning_multi for values from ('a', 'e') to ('a', 'z'); +create table coll_pruning_multi3 partition of coll_pruning_multi for values from ('b', 'a') to ('b', 'e'); +-- no pruning, because no value for the leading key +explain (costs off) select * from coll_pruning_multi where substr(a, 1) = 'e' collate "C"; + QUERY PLAN +-------------------------------------------------------- + Append + -> Seq Scan on coll_pruning_multi1 + Filter: (substr(a, 1) = 'e'::text COLLATE "C") + -> Seq Scan on coll_pruning_multi2 + Filter: (substr(a, 1) = 'e'::text COLLATE "C") + -> Seq Scan on coll_pruning_multi3 + Filter: (substr(a, 1) = 'e'::text COLLATE "C") +(7 rows) + +-- pruning, with a value provided for the leading key +explain (costs off) select * from coll_pruning_multi where substr(a, 1) = 'a' collate "POSIX"; + QUERY PLAN +------------------------------------------------------------ + Append + -> Seq Scan on coll_pruning_multi1 + Filter: (substr(a, 1) = 'a'::text COLLATE "POSIX") + -> Seq Scan on coll_pruning_multi2 + Filter: (substr(a, 1) = 'a'::text COLLATE "POSIX") +(5 rows) + +-- pruning, with values provided for both keys 
+explain (costs off) select * from coll_pruning_multi where substr(a, 1) = 'e' collate "C" and substr(a, 1) = 'a' collate "POSIX"; + QUERY PLAN +--------------------------------------------------------------------------------------------------------- + Append + -> Seq Scan on coll_pruning_multi2 + Filter: ((substr(a, 1) = 'e'::text COLLATE "C") AND (substr(a, 1) = 'a'::text COLLATE "POSIX")) +(3 rows) + +-- +-- LIKE operators don't prune +-- +create table like_op_noprune (a text) partition by list (a); +create table like_op_noprune1 partition of like_op_noprune for values in ('ABC'); +create table like_op_noprune2 partition of like_op_noprune for values in ('BCD'); +explain (costs off) select * from like_op_noprune where a like '%BC'; + QUERY PLAN +------------------------------------ + Append + -> Seq Scan on like_op_noprune1 + Filter: (a ~~ '%BC'::text) + -> Seq Scan on like_op_noprune2 + Filter: (a ~~ '%BC'::text) +(5 rows) + +-- +-- tests wherein clause value requires a cross-type comparison function +-- +create table lparted_by_int2 (a smallint) partition by list (a); +create table lparted_by_int2_1 partition of lparted_by_int2 for values in (1); +create table lparted_by_int2_16384 partition of lparted_by_int2 for values in (16384); +explain (costs off) select * from lparted_by_int2 where a = 100000000000000; + QUERY PLAN +-------------------------- + Result + One-Time Filter: false +(2 rows) + +create table rparted_by_int2 (a smallint) partition by range (a); +create table rparted_by_int2_1 partition of rparted_by_int2 for values from (1) to (10); +create table rparted_by_int2_16384 partition of rparted_by_int2 for values from (10) to (16384); +-- all partitions pruned +explain (costs off) select * from rparted_by_int2 where a > 100000000000000; + QUERY PLAN +-------------------------- + Result + One-Time Filter: false +(2 rows) + +create table rparted_by_int2_maxvalue partition of rparted_by_int2 for values from (16384) to (maxvalue); +-- all partitions but rparted_by_int2_maxvalue pruned +explain (costs off) select * from rparted_by_int2 where a > 100000000000000; + QUERY PLAN +------------------------------------------------- + Append + -> Seq Scan on rparted_by_int2_maxvalue + Filter: (a > '100000000000000'::bigint) +(3 rows) + +drop table lp, coll_pruning, rlp, mc3p, mc2p, boolpart, rp, coll_pruning_multi, like_op_noprune, lparted_by_int2, rparted_by_int2; +-- hash partitioning +create table hp (a int, b text) partition by hash (a, b); +create table hp0 partition of hp for values with (modulus 4, remainder 0); +create table hp3 partition of hp for values with (modulus 4, remainder 3); +create table hp1 partition of hp for values with (modulus 4, remainder 1); +create table hp2 partition of hp for values with (modulus 4, remainder 2); +insert into hp values (null, null); +insert into hp values (1, null); +insert into hp values (1, 'xxx'); +insert into hp values (null, 'xxx'); +insert into hp values (10, 'xxx'); +insert into hp values (10, 'yyy'); +select tableoid::regclass, * from hp order by 1; + tableoid | a | b +----------+----+----- + hp0 | | + hp0 | 1 | + hp0 | 1 | xxx + hp3 | 10 | yyy + hp1 | | xxx + hp2 | 10 | xxx +(6 rows) + +-- partial keys won't prune, nor would non-equality conditions +explain (costs off) select * from hp where a = 1; + QUERY PLAN +------------------------- + Append + -> Seq Scan on hp0 + Filter: (a = 1) + -> Seq Scan on hp1 + Filter: (a = 1) + -> Seq Scan on hp2 + Filter: (a = 1) + -> Seq Scan on hp3 + Filter: (a = 1) +(9 rows) + +explain (costs off) 
select * from hp where b = 'xxx'; + QUERY PLAN +----------------------------------- + Append + -> Seq Scan on hp0 + Filter: (b = 'xxx'::text) + -> Seq Scan on hp1 + Filter: (b = 'xxx'::text) + -> Seq Scan on hp2 + Filter: (b = 'xxx'::text) + -> Seq Scan on hp3 + Filter: (b = 'xxx'::text) +(9 rows) + +explain (costs off) select * from hp where a is null; + QUERY PLAN +----------------------------- + Append + -> Seq Scan on hp0 + Filter: (a IS NULL) + -> Seq Scan on hp1 + Filter: (a IS NULL) + -> Seq Scan on hp2 + Filter: (a IS NULL) + -> Seq Scan on hp3 + Filter: (a IS NULL) +(9 rows) + +explain (costs off) select * from hp where b is null; + QUERY PLAN +----------------------------- + Append + -> Seq Scan on hp0 + Filter: (b IS NULL) + -> Seq Scan on hp1 + Filter: (b IS NULL) + -> Seq Scan on hp2 + Filter: (b IS NULL) + -> Seq Scan on hp3 + Filter: (b IS NULL) +(9 rows) + +explain (costs off) select * from hp where a < 1 and b = 'xxx'; + QUERY PLAN +------------------------------------------------- + Append + -> Seq Scan on hp0 + Filter: ((a < 1) AND (b = 'xxx'::text)) + -> Seq Scan on hp1 + Filter: ((a < 1) AND (b = 'xxx'::text)) + -> Seq Scan on hp2 + Filter: ((a < 1) AND (b = 'xxx'::text)) + -> Seq Scan on hp3 + Filter: ((a < 1) AND (b = 'xxx'::text)) +(9 rows) + +explain (costs off) select * from hp where a <> 1 and b = 'yyy'; + QUERY PLAN +-------------------------------------------------- + Append + -> Seq Scan on hp0 + Filter: ((a <> 1) AND (b = 'yyy'::text)) + -> Seq Scan on hp1 + Filter: ((a <> 1) AND (b = 'yyy'::text)) + -> Seq Scan on hp2 + Filter: ((a <> 1) AND (b = 'yyy'::text)) + -> Seq Scan on hp3 + Filter: ((a <> 1) AND (b = 'yyy'::text)) +(9 rows) + +-- pruning should work if non-null values are provided for all the keys +explain (costs off) select * from hp where a is null and b is null; + QUERY PLAN +----------------------------------------------- + Append + -> Seq Scan on hp0 + Filter: ((a IS NULL) AND (b IS NULL)) +(3 rows) + +explain (costs off) select * from hp where a = 1 and b is null; + QUERY PLAN +------------------------------------------- + Append + -> Seq Scan on hp0 + Filter: ((b IS NULL) AND (a = 1)) +(3 rows) + +explain (costs off) select * from hp where a = 1 and b = 'xxx'; + QUERY PLAN +------------------------------------------------- + Append + -> Seq Scan on hp0 + Filter: ((a = 1) AND (b = 'xxx'::text)) +(3 rows) + +explain (costs off) select * from hp where a is null and b = 'xxx'; + QUERY PLAN +----------------------------------------------------- + Append + -> Seq Scan on hp1 + Filter: ((a IS NULL) AND (b = 'xxx'::text)) +(3 rows) + +explain (costs off) select * from hp where a = 10 and b = 'xxx'; + QUERY PLAN +-------------------------------------------------- + Append + -> Seq Scan on hp2 + Filter: ((a = 10) AND (b = 'xxx'::text)) +(3 rows) + +explain (costs off) select * from hp where a = 10 and b = 'yyy'; + QUERY PLAN +-------------------------------------------------- + Append + -> Seq Scan on hp3 + Filter: ((a = 10) AND (b = 'yyy'::text)) +(3 rows) + +explain (costs off) select * from hp where (a = 10 and b = 'yyy') or (a = 10 and b = 'xxx') or (a is null and b is null); + QUERY PLAN +------------------------------------------------------------------------------------------------------------------------- + Append + -> Seq Scan on hp0 + Filter: (((a = 10) AND (b = 'yyy'::text)) OR ((a = 10) AND (b = 'xxx'::text)) OR ((a IS NULL) AND (b IS NULL))) + -> Seq Scan on hp2 + Filter: (((a = 10) AND (b = 'yyy'::text)) OR ((a = 10) AND (b = 
'xxx'::text)) OR ((a IS NULL) AND (b IS NULL))) + -> Seq Scan on hp3 + Filter: (((a = 10) AND (b = 'yyy'::text)) OR ((a = 10) AND (b = 'xxx'::text)) OR ((a IS NULL) AND (b IS NULL))) +(7 rows) + +-- hash partitiong pruning doesn't occur with <> operator clauses +explain (costs off) select * from hp where a <> 1 and b <> 'xxx'; + QUERY PLAN +--------------------------------------------------- + Append + -> Seq Scan on hp0 + Filter: ((a <> 1) AND (b <> 'xxx'::text)) + -> Seq Scan on hp1 + Filter: ((a <> 1) AND (b <> 'xxx'::text)) + -> Seq Scan on hp2 + Filter: ((a <> 1) AND (b <> 'xxx'::text)) + -> Seq Scan on hp3 + Filter: ((a <> 1) AND (b <> 'xxx'::text)) +(9 rows) + +drop table hp; diff --git a/src/test/regress/sql/partition_prune.sql b/src/test/regress/sql/partition_prune.sql index 514f8e5c..974e62c3 100644 --- a/src/test/regress/sql/partition_prune.sql +++ b/src/test/regress/sql/partition_prune.sql @@ -152,4 +152,125 @@ explain (costs off) select * from boolpart where a is not true and a is not fals explain (costs off) select * from boolpart where a is unknown; explain (costs off) select * from boolpart where a is not unknown; -drop table lp, coll_pruning, rlp, mc3p, mc2p, boolpart; +-- +-- some more cases +-- + +-- +-- pruning for partitioned table appearing inside a sub-query +-- +-- pruning won't work for mc3p, because some keys are Params +explain (costs off) select * from mc2p t1, lateral (select count(*) from mc3p t2 where t2.a = t1.b and abs(t2.b) = 1 and t2.c = 1) s where t1.a = 1; + +-- pruning should work fine, because values for a prefix of keys (a, b) are +-- available +explain (costs off) select * from mc2p t1, lateral (select count(*) from mc3p t2 where t2.c = t1.b and abs(t2.b) = 1 and t2.a = 1) s where t1.a = 1; + +-- also here, because values for all keys are provided +explain (costs off) select * from mc2p t1, lateral (select count(*) from mc3p t2 where t2.a = 1 and abs(t2.b) = 1 and t2.c = 1) s where t1.a = 1; + +-- +-- pruning with clauses containing <> operator +-- + +-- doesn't prune range partitions +create table rp (a int) partition by range (a); +create table rp0 partition of rp for values from (minvalue) to (1); +create table rp1 partition of rp for values from (1) to (2); +create table rp2 partition of rp for values from (2) to (maxvalue); + +explain (costs off) select * from rp where a <> 1; +explain (costs off) select * from rp where a <> 1 and a <> 2; + +-- null partition should be eliminated due to strict <> clause. +explain (costs off) select * from lp where a <> 'a'; + +-- ensure we detect contradictions in clauses; a can't be NULL and NOT NULL. 
+explain (costs off) select * from lp where a <> 'a' and a is null; +explain (costs off) select * from lp where (a <> 'a' and a <> 'd') or a is null; + +-- check that it also works for a partitioned table that's not root, +-- which in this case are partitions of rlp that are themselves +-- list-partitioned on b +explain (costs off) select * from rlp where a = 15 and b <> 'ab' and b <> 'cd' and b <> 'xy' and b is not null; + +-- +-- different collations for different keys with same expression +-- +create table coll_pruning_multi (a text) partition by range (substr(a, 1) collate "POSIX", substr(a, 1) collate "C"); +create table coll_pruning_multi1 partition of coll_pruning_multi for values from ('a', 'a') to ('a', 'e'); +create table coll_pruning_multi2 partition of coll_pruning_multi for values from ('a', 'e') to ('a', 'z'); +create table coll_pruning_multi3 partition of coll_pruning_multi for values from ('b', 'a') to ('b', 'e'); + +-- no pruning, because no value for the leading key +explain (costs off) select * from coll_pruning_multi where substr(a, 1) = 'e' collate "C"; + +-- pruning, with a value provided for the leading key +explain (costs off) select * from coll_pruning_multi where substr(a, 1) = 'a' collate "POSIX"; + +-- pruning, with values provided for both keys +explain (costs off) select * from coll_pruning_multi where substr(a, 1) = 'e' collate "C" and substr(a, 1) = 'a' collate "POSIX"; + +-- +-- LIKE operators don't prune +-- +create table like_op_noprune (a text) partition by list (a); +create table like_op_noprune1 partition of like_op_noprune for values in ('ABC'); +create table like_op_noprune2 partition of like_op_noprune for values in ('BCD'); +explain (costs off) select * from like_op_noprune where a like '%BC'; + +-- +-- tests wherein clause value requires a cross-type comparison function +-- +create table lparted_by_int2 (a smallint) partition by list (a); +create table lparted_by_int2_1 partition of lparted_by_int2 for values in (1); +create table lparted_by_int2_16384 partition of lparted_by_int2 for values in (16384); +explain (costs off) select * from lparted_by_int2 where a = 100000000000000; + +create table rparted_by_int2 (a smallint) partition by range (a); +create table rparted_by_int2_1 partition of rparted_by_int2 for values from (1) to (10); +create table rparted_by_int2_16384 partition of rparted_by_int2 for values from (10) to (16384); +-- all partitions pruned +explain (costs off) select * from rparted_by_int2 where a > 100000000000000; +create table rparted_by_int2_maxvalue partition of rparted_by_int2 for values from (16384) to (maxvalue); +-- all partitions but rparted_by_int2_maxvalue pruned +explain (costs off) select * from rparted_by_int2 where a > 100000000000000; + +drop table lp, coll_pruning, rlp, mc3p, mc2p, boolpart, rp, coll_pruning_multi, like_op_noprune, lparted_by_int2, rparted_by_int2; + +-- hash partitioning +create table hp (a int, b text) partition by hash (a, b); +create table hp0 partition of hp for values with (modulus 4, remainder 0); +create table hp3 partition of hp for values with (modulus 4, remainder 3); +create table hp1 partition of hp for values with (modulus 4, remainder 1); +create table hp2 partition of hp for values with (modulus 4, remainder 2); + +insert into hp values (null, null); +insert into hp values (1, null); +insert into hp values (1, 'xxx'); +insert into hp values (null, 'xxx'); +insert into hp values (10, 'xxx'); +insert into hp values (10, 'yyy'); +select tableoid::regclass, * from hp order by 1; + 
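The hash-partition cases below only collapse to a single partition once non-null equality clauses cover every key column. As a rough sketch of why, assuming the PartitionBoundInfoData layout and the prototypes introduced in partbounds.h and partprune.h earlier in this patch: with a value for each key, the row's hash maps directly to one entry of indexes[]. The function name hash_partition_for_values is invented; the real pruning path goes through gen_partprune_steps() and get_matching_partitions().

#include "postgres.h"
#include "partitioning/partbounds.h"
#include "partitioning/partprune.h"

/*
 * Rough sketch, not the actual pruning code: once a non-null Datum is known
 * for every hash key column, the matching partition can be resolved from
 * PartitionBoundInfoData.  indexes[] has one entry per remainder up to the
 * greatest modulus; -1 means no partition accepts that remainder.
 */
static int
hash_partition_for_values(PartitionPruneContext *context,
						  Datum *values, bool *isnull)
{
	PartitionBoundInfo boundinfo = context->boundinfo;
	int			greatest_modulus = get_hash_partition_greatest_modulus(boundinfo);
	uint64		rowhash = compute_hash_value(context->partnatts,
											 context->partsupfunc,
											 values, isnull);

	return boundinfo->indexes[rowhash % greatest_modulus];
}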
+-- partial keys won't prune, nor would non-equality conditions +explain (costs off) select * from hp where a = 1; +explain (costs off) select * from hp where b = 'xxx'; +explain (costs off) select * from hp where a is null; +explain (costs off) select * from hp where b is null; +explain (costs off) select * from hp where a < 1 and b = 'xxx'; +explain (costs off) select * from hp where a <> 1 and b = 'yyy'; + +-- pruning should work if non-null values are provided for all the keys +explain (costs off) select * from hp where a is null and b is null; +explain (costs off) select * from hp where a = 1 and b is null; +explain (costs off) select * from hp where a = 1 and b = 'xxx'; +explain (costs off) select * from hp where a is null and b = 'xxx'; +explain (costs off) select * from hp where a = 10 and b = 'xxx'; +explain (costs off) select * from hp where a = 10 and b = 'yyy'; +explain (costs off) select * from hp where (a = 10 and b = 'yyy') or (a = 10 and b = 'xxx') or (a is null and b is null); + +-- hash partitiong pruning doesn't occur with <> operator clauses +explain (costs off) select * from hp where a <> 1 and b <> 'xxx'; + +drop table hp; From 7973f781bbd20ee24129f28116e608f4c11ea4c7 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Mon, 29 Jun 2020 15:55:26 +0800 Subject: [PATCH 246/578] Attempt to fix endianess issues in new hash partition test. http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/test/regress/expected/partition_prune.out | 185 ----------------- .../regress/expected/partition_prune_hash.out | 189 ++++++++++++++++++ .../expected/partition_prune_hash_1.out | 187 +++++++++++++++++ src/test/regress/parallel_schedule | 2 +- src/test/regress/serial_schedule | 1 + src/test/regress/sql/partition_prune.sql | 37 ---- src/test/regress/sql/partition_prune_hash.sql | 41 ++++ 7 files changed, 419 insertions(+), 223 deletions(-) create mode 100644 src/test/regress/expected/partition_prune_hash.out create mode 100644 src/test/regress/expected/partition_prune_hash_1.out create mode 100644 src/test/regress/sql/partition_prune_hash.sql diff --git a/src/test/regress/expected/partition_prune.out b/src/test/regress/expected/partition_prune.out index fe195e31..3e0a196e 100644 --- a/src/test/regress/expected/partition_prune.out +++ b/src/test/regress/expected/partition_prune.out @@ -1339,188 +1339,3 @@ explain (costs off) select * from rparted_by_int2 where a > 100000000000000; (3 rows) drop table lp, coll_pruning, rlp, mc3p, mc2p, boolpart, rp, coll_pruning_multi, like_op_noprune, lparted_by_int2, rparted_by_int2; --- hash partitioning -create table hp (a int, b text) partition by hash (a, b); -create table hp0 partition of hp for values with (modulus 4, remainder 0); -create table hp3 partition of hp for values with (modulus 4, remainder 3); -create table hp1 partition of hp for values with (modulus 4, remainder 1); -create table hp2 partition of hp for values with (modulus 4, remainder 2); -insert into hp values (null, null); -insert into hp values (1, null); -insert into hp values (1, 'xxx'); -insert into hp values (null, 'xxx'); -insert into hp values (10, 'xxx'); -insert into hp values (10, 'yyy'); -select tableoid::regclass, * from hp order by 1; - tableoid | a | b -----------+----+----- - hp0 | | - hp0 | 1 | - hp0 | 1 | xxx - hp3 | 10 | yyy - hp1 | | xxx - hp2 | 10 | xxx -(6 rows) - --- partial keys won't prune, nor would non-equality conditions -explain (costs off) select * from hp where a = 1; - QUERY PLAN -------------------------- - Append 
- -> Seq Scan on hp0 - Filter: (a = 1) - -> Seq Scan on hp1 - Filter: (a = 1) - -> Seq Scan on hp2 - Filter: (a = 1) - -> Seq Scan on hp3 - Filter: (a = 1) -(9 rows) - -explain (costs off) select * from hp where b = 'xxx'; - QUERY PLAN ------------------------------------ - Append - -> Seq Scan on hp0 - Filter: (b = 'xxx'::text) - -> Seq Scan on hp1 - Filter: (b = 'xxx'::text) - -> Seq Scan on hp2 - Filter: (b = 'xxx'::text) - -> Seq Scan on hp3 - Filter: (b = 'xxx'::text) -(9 rows) - -explain (costs off) select * from hp where a is null; - QUERY PLAN ------------------------------ - Append - -> Seq Scan on hp0 - Filter: (a IS NULL) - -> Seq Scan on hp1 - Filter: (a IS NULL) - -> Seq Scan on hp2 - Filter: (a IS NULL) - -> Seq Scan on hp3 - Filter: (a IS NULL) -(9 rows) - -explain (costs off) select * from hp where b is null; - QUERY PLAN ------------------------------ - Append - -> Seq Scan on hp0 - Filter: (b IS NULL) - -> Seq Scan on hp1 - Filter: (b IS NULL) - -> Seq Scan on hp2 - Filter: (b IS NULL) - -> Seq Scan on hp3 - Filter: (b IS NULL) -(9 rows) - -explain (costs off) select * from hp where a < 1 and b = 'xxx'; - QUERY PLAN -------------------------------------------------- - Append - -> Seq Scan on hp0 - Filter: ((a < 1) AND (b = 'xxx'::text)) - -> Seq Scan on hp1 - Filter: ((a < 1) AND (b = 'xxx'::text)) - -> Seq Scan on hp2 - Filter: ((a < 1) AND (b = 'xxx'::text)) - -> Seq Scan on hp3 - Filter: ((a < 1) AND (b = 'xxx'::text)) -(9 rows) - -explain (costs off) select * from hp where a <> 1 and b = 'yyy'; - QUERY PLAN --------------------------------------------------- - Append - -> Seq Scan on hp0 - Filter: ((a <> 1) AND (b = 'yyy'::text)) - -> Seq Scan on hp1 - Filter: ((a <> 1) AND (b = 'yyy'::text)) - -> Seq Scan on hp2 - Filter: ((a <> 1) AND (b = 'yyy'::text)) - -> Seq Scan on hp3 - Filter: ((a <> 1) AND (b = 'yyy'::text)) -(9 rows) - --- pruning should work if non-null values are provided for all the keys -explain (costs off) select * from hp where a is null and b is null; - QUERY PLAN ------------------------------------------------ - Append - -> Seq Scan on hp0 - Filter: ((a IS NULL) AND (b IS NULL)) -(3 rows) - -explain (costs off) select * from hp where a = 1 and b is null; - QUERY PLAN -------------------------------------------- - Append - -> Seq Scan on hp0 - Filter: ((b IS NULL) AND (a = 1)) -(3 rows) - -explain (costs off) select * from hp where a = 1 and b = 'xxx'; - QUERY PLAN -------------------------------------------------- - Append - -> Seq Scan on hp0 - Filter: ((a = 1) AND (b = 'xxx'::text)) -(3 rows) - -explain (costs off) select * from hp where a is null and b = 'xxx'; - QUERY PLAN ------------------------------------------------------ - Append - -> Seq Scan on hp1 - Filter: ((a IS NULL) AND (b = 'xxx'::text)) -(3 rows) - -explain (costs off) select * from hp where a = 10 and b = 'xxx'; - QUERY PLAN --------------------------------------------------- - Append - -> Seq Scan on hp2 - Filter: ((a = 10) AND (b = 'xxx'::text)) -(3 rows) - -explain (costs off) select * from hp where a = 10 and b = 'yyy'; - QUERY PLAN --------------------------------------------------- - Append - -> Seq Scan on hp3 - Filter: ((a = 10) AND (b = 'yyy'::text)) -(3 rows) - -explain (costs off) select * from hp where (a = 10 and b = 'yyy') or (a = 10 and b = 'xxx') or (a is null and b is null); - QUERY PLAN -------------------------------------------------------------------------------------------------------------------------- - Append - -> Seq Scan on hp0 - Filter: (((a = 10) AND 
(b = 'yyy'::text)) OR ((a = 10) AND (b = 'xxx'::text)) OR ((a IS NULL) AND (b IS NULL))) - -> Seq Scan on hp2 - Filter: (((a = 10) AND (b = 'yyy'::text)) OR ((a = 10) AND (b = 'xxx'::text)) OR ((a IS NULL) AND (b IS NULL))) - -> Seq Scan on hp3 - Filter: (((a = 10) AND (b = 'yyy'::text)) OR ((a = 10) AND (b = 'xxx'::text)) OR ((a IS NULL) AND (b IS NULL))) -(7 rows) - --- hash partitiong pruning doesn't occur with <> operator clauses -explain (costs off) select * from hp where a <> 1 and b <> 'xxx'; - QUERY PLAN ---------------------------------------------------- - Append - -> Seq Scan on hp0 - Filter: ((a <> 1) AND (b <> 'xxx'::text)) - -> Seq Scan on hp1 - Filter: ((a <> 1) AND (b <> 'xxx'::text)) - -> Seq Scan on hp2 - Filter: ((a <> 1) AND (b <> 'xxx'::text)) - -> Seq Scan on hp3 - Filter: ((a <> 1) AND (b <> 'xxx'::text)) -(9 rows) - -drop table hp; diff --git a/src/test/regress/expected/partition_prune_hash.out b/src/test/regress/expected/partition_prune_hash.out new file mode 100644 index 00000000..fbba3f1f --- /dev/null +++ b/src/test/regress/expected/partition_prune_hash.out @@ -0,0 +1,189 @@ +-- +-- Test Partition pruning for HASH partitioning +-- We keep this as a seperate test as hash functions return +-- values will vary based on CPU architecture. +-- +create table hp (a int, b text) partition by hash (a, b); +create table hp0 partition of hp for values with (modulus 4, remainder 0); +create table hp3 partition of hp for values with (modulus 4, remainder 3); +create table hp1 partition of hp for values with (modulus 4, remainder 1); +create table hp2 partition of hp for values with (modulus 4, remainder 2); +insert into hp values (null, null); +insert into hp values (1, null); +insert into hp values (1, 'xxx'); +insert into hp values (null, 'xxx'); +insert into hp values (10, 'xxx'); +insert into hp values (10, 'yyy'); +select tableoid::regclass, * from hp order by 1; + tableoid | a | b +----------+----+----- + hp0 | | + hp0 | 1 | + hp0 | 1 | xxx + hp3 | 10 | yyy + hp1 | | xxx + hp2 | 10 | xxx +(6 rows) + +-- partial keys won't prune, nor would non-equality conditions +explain (costs off) select * from hp where a = 1; + QUERY PLAN +------------------------- + Append + -> Seq Scan on hp0 + Filter: (a = 1) + -> Seq Scan on hp1 + Filter: (a = 1) + -> Seq Scan on hp2 + Filter: (a = 1) + -> Seq Scan on hp3 + Filter: (a = 1) +(9 rows) + +explain (costs off) select * from hp where b = 'xxx'; + QUERY PLAN +----------------------------------- + Append + -> Seq Scan on hp0 + Filter: (b = 'xxx'::text) + -> Seq Scan on hp1 + Filter: (b = 'xxx'::text) + -> Seq Scan on hp2 + Filter: (b = 'xxx'::text) + -> Seq Scan on hp3 + Filter: (b = 'xxx'::text) +(9 rows) + +explain (costs off) select * from hp where a is null; + QUERY PLAN +----------------------------- + Append + -> Seq Scan on hp0 + Filter: (a IS NULL) + -> Seq Scan on hp1 + Filter: (a IS NULL) + -> Seq Scan on hp2 + Filter: (a IS NULL) + -> Seq Scan on hp3 + Filter: (a IS NULL) +(9 rows) + +explain (costs off) select * from hp where b is null; + QUERY PLAN +----------------------------- + Append + -> Seq Scan on hp0 + Filter: (b IS NULL) + -> Seq Scan on hp1 + Filter: (b IS NULL) + -> Seq Scan on hp2 + Filter: (b IS NULL) + -> Seq Scan on hp3 + Filter: (b IS NULL) +(9 rows) + +explain (costs off) select * from hp where a < 1 and b = 'xxx'; + QUERY PLAN +------------------------------------------------- + Append + -> Seq Scan on hp0 + Filter: ((a < 1) AND (b = 'xxx'::text)) + -> Seq Scan on hp1 + Filter: ((a < 1) AND (b = 
'xxx'::text)) + -> Seq Scan on hp2 + Filter: ((a < 1) AND (b = 'xxx'::text)) + -> Seq Scan on hp3 + Filter: ((a < 1) AND (b = 'xxx'::text)) +(9 rows) + +explain (costs off) select * from hp where a <> 1 and b = 'yyy'; + QUERY PLAN +-------------------------------------------------- + Append + -> Seq Scan on hp0 + Filter: ((a <> 1) AND (b = 'yyy'::text)) + -> Seq Scan on hp1 + Filter: ((a <> 1) AND (b = 'yyy'::text)) + -> Seq Scan on hp2 + Filter: ((a <> 1) AND (b = 'yyy'::text)) + -> Seq Scan on hp3 + Filter: ((a <> 1) AND (b = 'yyy'::text)) +(9 rows) + +-- pruning should work if non-null values are provided for all the keys +explain (costs off) select * from hp where a is null and b is null; + QUERY PLAN +----------------------------------------------- + Append + -> Seq Scan on hp0 + Filter: ((a IS NULL) AND (b IS NULL)) +(3 rows) + +explain (costs off) select * from hp where a = 1 and b is null; + QUERY PLAN +------------------------------------------- + Append + -> Seq Scan on hp0 + Filter: ((b IS NULL) AND (a = 1)) +(3 rows) + +explain (costs off) select * from hp where a = 1 and b = 'xxx'; + QUERY PLAN +------------------------------------------------- + Append + -> Seq Scan on hp0 + Filter: ((a = 1) AND (b = 'xxx'::text)) +(3 rows) + +explain (costs off) select * from hp where a is null and b = 'xxx'; + QUERY PLAN +----------------------------------------------------- + Append + -> Seq Scan on hp1 + Filter: ((a IS NULL) AND (b = 'xxx'::text)) +(3 rows) + +explain (costs off) select * from hp where a = 10 and b = 'xxx'; + QUERY PLAN +-------------------------------------------------- + Append + -> Seq Scan on hp2 + Filter: ((a = 10) AND (b = 'xxx'::text)) +(3 rows) + +explain (costs off) select * from hp where a = 10 and b = 'yyy'; + QUERY PLAN +-------------------------------------------------- + Append + -> Seq Scan on hp3 + Filter: ((a = 10) AND (b = 'yyy'::text)) +(3 rows) + +explain (costs off) select * from hp where (a = 10 and b = 'yyy') or (a = 10 and b = 'xxx') or (a is null and b is null); + QUERY PLAN +------------------------------------------------------------------------------------------------------------------------- + Append + -> Seq Scan on hp0 + Filter: (((a = 10) AND (b = 'yyy'::text)) OR ((a = 10) AND (b = 'xxx'::text)) OR ((a IS NULL) AND (b IS NULL))) + -> Seq Scan on hp2 + Filter: (((a = 10) AND (b = 'yyy'::text)) OR ((a = 10) AND (b = 'xxx'::text)) OR ((a IS NULL) AND (b IS NULL))) + -> Seq Scan on hp3 + Filter: (((a = 10) AND (b = 'yyy'::text)) OR ((a = 10) AND (b = 'xxx'::text)) OR ((a IS NULL) AND (b IS NULL))) +(7 rows) + +-- hash partitiong pruning doesn't occur with <> operator clauses +explain (costs off) select * from hp where a <> 1 and b <> 'xxx'; + QUERY PLAN +--------------------------------------------------- + Append + -> Seq Scan on hp0 + Filter: ((a <> 1) AND (b <> 'xxx'::text)) + -> Seq Scan on hp1 + Filter: ((a <> 1) AND (b <> 'xxx'::text)) + -> Seq Scan on hp2 + Filter: ((a <> 1) AND (b <> 'xxx'::text)) + -> Seq Scan on hp3 + Filter: ((a <> 1) AND (b <> 'xxx'::text)) +(9 rows) + +drop table hp; diff --git a/src/test/regress/expected/partition_prune_hash_1.out b/src/test/regress/expected/partition_prune_hash_1.out new file mode 100644 index 00000000..4a26a0e2 --- /dev/null +++ b/src/test/regress/expected/partition_prune_hash_1.out @@ -0,0 +1,187 @@ +-- +-- Test Partition pruning for HASH partitioning +-- We keep this as a seperate test as hash functions return +-- values will vary based on CPU architecture. 
+-- +create table hp (a int, b text) partition by hash (a, b); +create table hp0 partition of hp for values with (modulus 4, remainder 0); +create table hp3 partition of hp for values with (modulus 4, remainder 3); +create table hp1 partition of hp for values with (modulus 4, remainder 1); +create table hp2 partition of hp for values with (modulus 4, remainder 2); +insert into hp values (null, null); +insert into hp values (1, null); +insert into hp values (1, 'xxx'); +insert into hp values (null, 'xxx'); +insert into hp values (10, 'xxx'); +insert into hp values (10, 'yyy'); +select tableoid::regclass, * from hp order by 1; + tableoid | a | b +----------+----+----- + hp0 | | + hp0 | 1 | + hp0 | 10 | xxx + hp3 | | xxx + hp3 | 10 | yyy + hp2 | 1 | xxx +(6 rows) + +-- partial keys won't prune, nor would non-equality conditions +explain (costs off) select * from hp where a = 1; + QUERY PLAN +------------------------- + Append + -> Seq Scan on hp0 + Filter: (a = 1) + -> Seq Scan on hp1 + Filter: (a = 1) + -> Seq Scan on hp2 + Filter: (a = 1) + -> Seq Scan on hp3 + Filter: (a = 1) +(9 rows) + +explain (costs off) select * from hp where b = 'xxx'; + QUERY PLAN +----------------------------------- + Append + -> Seq Scan on hp0 + Filter: (b = 'xxx'::text) + -> Seq Scan on hp1 + Filter: (b = 'xxx'::text) + -> Seq Scan on hp2 + Filter: (b = 'xxx'::text) + -> Seq Scan on hp3 + Filter: (b = 'xxx'::text) +(9 rows) + +explain (costs off) select * from hp where a is null; + QUERY PLAN +----------------------------- + Append + -> Seq Scan on hp0 + Filter: (a IS NULL) + -> Seq Scan on hp1 + Filter: (a IS NULL) + -> Seq Scan on hp2 + Filter: (a IS NULL) + -> Seq Scan on hp3 + Filter: (a IS NULL) +(9 rows) + +explain (costs off) select * from hp where b is null; + QUERY PLAN +----------------------------- + Append + -> Seq Scan on hp0 + Filter: (b IS NULL) + -> Seq Scan on hp1 + Filter: (b IS NULL) + -> Seq Scan on hp2 + Filter: (b IS NULL) + -> Seq Scan on hp3 + Filter: (b IS NULL) +(9 rows) + +explain (costs off) select * from hp where a < 1 and b = 'xxx'; + QUERY PLAN +------------------------------------------------- + Append + -> Seq Scan on hp0 + Filter: ((a < 1) AND (b = 'xxx'::text)) + -> Seq Scan on hp1 + Filter: ((a < 1) AND (b = 'xxx'::text)) + -> Seq Scan on hp2 + Filter: ((a < 1) AND (b = 'xxx'::text)) + -> Seq Scan on hp3 + Filter: ((a < 1) AND (b = 'xxx'::text)) +(9 rows) + +explain (costs off) select * from hp where a <> 1 and b = 'yyy'; + QUERY PLAN +-------------------------------------------------- + Append + -> Seq Scan on hp0 + Filter: ((a <> 1) AND (b = 'yyy'::text)) + -> Seq Scan on hp1 + Filter: ((a <> 1) AND (b = 'yyy'::text)) + -> Seq Scan on hp2 + Filter: ((a <> 1) AND (b = 'yyy'::text)) + -> Seq Scan on hp3 + Filter: ((a <> 1) AND (b = 'yyy'::text)) +(9 rows) + +-- pruning should work if non-null values are provided for all the keys +explain (costs off) select * from hp where a is null and b is null; + QUERY PLAN +----------------------------------------------- + Append + -> Seq Scan on hp0 + Filter: ((a IS NULL) AND (b IS NULL)) +(3 rows) + +explain (costs off) select * from hp where a = 1 and b is null; + QUERY PLAN +------------------------------------------- + Append + -> Seq Scan on hp0 + Filter: ((b IS NULL) AND (a = 1)) +(3 rows) + +explain (costs off) select * from hp where a = 1 and b = 'xxx'; + QUERY PLAN +------------------------------------------------- + Append + -> Seq Scan on hp2 + Filter: ((a = 1) AND (b = 'xxx'::text)) +(3 rows) + +explain (costs off) select * 
from hp where a is null and b = 'xxx'; + QUERY PLAN +----------------------------------------------------- + Append + -> Seq Scan on hp3 + Filter: ((a IS NULL) AND (b = 'xxx'::text)) +(3 rows) + +explain (costs off) select * from hp where a = 10 and b = 'xxx'; + QUERY PLAN +-------------------------------------------------- + Append + -> Seq Scan on hp0 + Filter: ((a = 10) AND (b = 'xxx'::text)) +(3 rows) + +explain (costs off) select * from hp where a = 10 and b = 'yyy'; + QUERY PLAN +-------------------------------------------------- + Append + -> Seq Scan on hp3 + Filter: ((a = 10) AND (b = 'yyy'::text)) +(3 rows) + +explain (costs off) select * from hp where (a = 10 and b = 'yyy') or (a = 10 and b = 'xxx') or (a is null and b is null); + QUERY PLAN +------------------------------------------------------------------------------------------------------------------------- + Append + -> Seq Scan on hp0 + Filter: (((a = 10) AND (b = 'yyy'::text)) OR ((a = 10) AND (b = 'xxx'::text)) OR ((a IS NULL) AND (b IS NULL))) + -> Seq Scan on hp3 + Filter: (((a = 10) AND (b = 'yyy'::text)) OR ((a = 10) AND (b = 'xxx'::text)) OR ((a IS NULL) AND (b IS NULL))) +(5 rows) + +-- hash partitiong pruning doesn't occur with <> operator clauses +explain (costs off) select * from hp where a <> 1 and b <> 'xxx'; + QUERY PLAN +--------------------------------------------------- + Append + -> Seq Scan on hp0 + Filter: ((a <> 1) AND (b <> 'xxx'::text)) + -> Seq Scan on hp1 + Filter: ((a <> 1) AND (b <> 'xxx'::text)) + -> Seq Scan on hp2 + Filter: ((a <> 1) AND (b <> 'xxx'::text)) + -> Seq Scan on hp3 + Filter: ((a <> 1) AND (b <> 'xxx'::text)) +(9 rows) + +drop table hp; diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule index d8a925ca..905cb00a 100644 --- a/src/test/regress/parallel_schedule +++ b/src/test/regress/parallel_schedule @@ -134,7 +134,7 @@ test: plancache limit plpgsql copy2 temp domain prepare without_oid conversion t # ---------- # Another group of parallel tests # ---------- -test: identity partition_join partition_prune hash_part +test: identity partition_join partition_prune partition_prune_hash hash_part # event triggers cannot run concurrently with any test that runs DDL test: event_trigger diff --git a/src/test/regress/serial_schedule b/src/test/regress/serial_schedule index f0989763..1f00bfbc 100644 --- a/src/test/regress/serial_schedule +++ b/src/test/regress/serial_schedule @@ -193,6 +193,7 @@ test: xml test: identity test: partition_join test: partition_prune +test: partition_prune_hash test: hash_part test: event_trigger test: fast_default diff --git a/src/test/regress/sql/partition_prune.sql b/src/test/regress/sql/partition_prune.sql index 974e62c3..ca313897 100644 --- a/src/test/regress/sql/partition_prune.sql +++ b/src/test/regress/sql/partition_prune.sql @@ -237,40 +237,3 @@ create table rparted_by_int2_maxvalue partition of rparted_by_int2 for values fr explain (costs off) select * from rparted_by_int2 where a > 100000000000000; drop table lp, coll_pruning, rlp, mc3p, mc2p, boolpart, rp, coll_pruning_multi, like_op_noprune, lparted_by_int2, rparted_by_int2; - --- hash partitioning -create table hp (a int, b text) partition by hash (a, b); -create table hp0 partition of hp for values with (modulus 4, remainder 0); -create table hp3 partition of hp for values with (modulus 4, remainder 3); -create table hp1 partition of hp for values with (modulus 4, remainder 1); -create table hp2 partition of hp for values with (modulus 4, remainder 2); - -insert into 
hp values (null, null); -insert into hp values (1, null); -insert into hp values (1, 'xxx'); -insert into hp values (null, 'xxx'); -insert into hp values (10, 'xxx'); -insert into hp values (10, 'yyy'); -select tableoid::regclass, * from hp order by 1; - --- partial keys won't prune, nor would non-equality conditions -explain (costs off) select * from hp where a = 1; -explain (costs off) select * from hp where b = 'xxx'; -explain (costs off) select * from hp where a is null; -explain (costs off) select * from hp where b is null; -explain (costs off) select * from hp where a < 1 and b = 'xxx'; -explain (costs off) select * from hp where a <> 1 and b = 'yyy'; - --- pruning should work if non-null values are provided for all the keys -explain (costs off) select * from hp where a is null and b is null; -explain (costs off) select * from hp where a = 1 and b is null; -explain (costs off) select * from hp where a = 1 and b = 'xxx'; -explain (costs off) select * from hp where a is null and b = 'xxx'; -explain (costs off) select * from hp where a = 10 and b = 'xxx'; -explain (costs off) select * from hp where a = 10 and b = 'yyy'; -explain (costs off) select * from hp where (a = 10 and b = 'yyy') or (a = 10 and b = 'xxx') or (a is null and b is null); - --- hash partitiong pruning doesn't occur with <> operator clauses -explain (costs off) select * from hp where a <> 1 and b <> 'xxx'; - -drop table hp; diff --git a/src/test/regress/sql/partition_prune_hash.sql b/src/test/regress/sql/partition_prune_hash.sql new file mode 100644 index 00000000..fd1783bf --- /dev/null +++ b/src/test/regress/sql/partition_prune_hash.sql @@ -0,0 +1,41 @@ +-- +-- Test Partition pruning for HASH partitioning +-- We keep this as a seperate test as hash functions return +-- values will vary based on CPU architecture. 
+-- + +create table hp (a int, b text) partition by hash (a, b); +create table hp0 partition of hp for values with (modulus 4, remainder 0); +create table hp3 partition of hp for values with (modulus 4, remainder 3); +create table hp1 partition of hp for values with (modulus 4, remainder 1); +create table hp2 partition of hp for values with (modulus 4, remainder 2); + +insert into hp values (null, null); +insert into hp values (1, null); +insert into hp values (1, 'xxx'); +insert into hp values (null, 'xxx'); +insert into hp values (10, 'xxx'); +insert into hp values (10, 'yyy'); +select tableoid::regclass, * from hp order by 1; + +-- partial keys won't prune, nor would non-equality conditions +explain (costs off) select * from hp where a = 1; +explain (costs off) select * from hp where b = 'xxx'; +explain (costs off) select * from hp where a is null; +explain (costs off) select * from hp where b is null; +explain (costs off) select * from hp where a < 1 and b = 'xxx'; +explain (costs off) select * from hp where a <> 1 and b = 'yyy'; + +-- pruning should work if non-null values are provided for all the keys +explain (costs off) select * from hp where a is null and b is null; +explain (costs off) select * from hp where a = 1 and b is null; +explain (costs off) select * from hp where a = 1 and b = 'xxx'; +explain (costs off) select * from hp where a is null and b = 'xxx'; +explain (costs off) select * from hp where a = 10 and b = 'xxx'; +explain (costs off) select * from hp where a = 10 and b = 'yyy'; +explain (costs off) select * from hp where (a = 10 and b = 'yyy') or (a = 10 and b = 'xxx') or (a is null and b is null); + +-- hash partitiong pruning doesn't occur with <> operator clauses +explain (costs off) select * from hp where a <> 1 and b <> 'xxx'; + +drop table hp; From 9c6403bca6d1ef8073b18996b368d63d58d3d549 Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Fri, 6 Apr 2018 20:54:22 -0700 Subject: [PATCH 247/578] Blindly attempt to fix sepgsql tests broken due to 9fdb675fc5. The failure appears to solely be caused by the changed partition pruning logic. 
Author: Andres Freund Discussion: https://postgr.es/m/20180406210330.wmqw42wqgiicktli@alap3.anarazel.de Signed-off-by: JennyJennyChen --- contrib/sepgsql/expected/misc.out | 3 --- 1 file changed, 3 deletions(-) diff --git a/contrib/sepgsql/expected/misc.out b/contrib/sepgsql/expected/misc.out index 98f8005a..128f6bd0 100644 --- a/contrib/sepgsql/expected/misc.out +++ b/contrib/sepgsql/expected/misc.out @@ -32,9 +32,6 @@ LOG: SELinux: allowed { execute } scontext=unconfined_u:unconfined_r:sepgsql_re (6 rows) SELECT * FROM t1p WHERE o > 50 AND p like '%64%'; -LOG: SELinux: allowed { execute } scontext=unconfined_u:unconfined_r:sepgsql_regtest_superuser_t:s0-s0:c0.c255 tcontext=system_u:object_r:sepgsql_proc_exec_t:s0 tclass=db_procedure name="pg_catalog.int4le(integer,integer)" -LOG: SELinux: allowed { execute } scontext=unconfined_u:unconfined_r:sepgsql_regtest_superuser_t:s0-s0:c0.c255 tcontext=system_u:object_r:sepgsql_proc_exec_t:s0 tclass=db_procedure name="pg_catalog.int4le(integer,integer)" -LOG: SELinux: allowed { execute } scontext=unconfined_u:unconfined_r:sepgsql_regtest_superuser_t:s0-s0:c0.c255 tcontext=system_u:object_r:sepgsql_proc_exec_t:s0 tclass=db_procedure name="pg_catalog.int4le(integer,integer)" LOG: SELinux: allowed { select } scontext=unconfined_u:unconfined_r:sepgsql_regtest_superuser_t:s0-s0:c0.c255 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_table name="public.t1p" LOG: SELinux: allowed { select } scontext=unconfined_u:unconfined_r:sepgsql_regtest_superuser_t:s0-s0:c0.c255 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table t1p column o" LOG: SELinux: allowed { select } scontext=unconfined_u:unconfined_r:sepgsql_regtest_superuser_t:s0-s0:c0.c255 tcontext=unconfined_u:object_r:sepgsql_table_t:s0 tclass=db_column name="table t1p column p" From deb429fef69b810afcfd9b22cd8c59d76acbfaa5 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Mon, 29 Jun 2020 17:37:11 +0800 Subject: [PATCH 248/578] Fix ALTER TABLE .. ATTACH PARTITION ... DEFAULT .http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/backend/commands/tablecmds.c | 153 +++++++++----------- src/test/regress/expected/alter_table.out | 16 ++ src/test/regress/expected/alter_table_1.out | 16 ++ src/test/regress/expected/alter_table_2.out | 16 ++ src/test/regress/expected/alter_table_3.out | 16 ++ src/test/regress/sql/alter_table.sql | 18 +++ 6 files changed, 147 insertions(+), 88 deletions(-) diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index 536b8661..dbdd156e 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -539,8 +539,7 @@ static void CreateInheritance(Relation child_rel, Relation parent_rel); static void RemoveInheritance(Relation child_rel, Relation parent_rel); static ObjectAddress ATExecAttachPartition(List **wqueue, Relation rel, PartitionCmd *cmd); -static void ValidatePartitionConstraints(List **wqueue, Relation scanrel, - List *scanrel_children, +static void QueuePartitionConstraintValidation(List **wqueue, Relation scanrel, List *partConstraint, bool validate_default); static ObjectAddress ATExecDetachPartition(Relation rel, RangeVar *name); @@ -16442,29 +16441,23 @@ PartConstraintImpliedByRelConstraint(Relation scanrel, } /* - * ValidatePartitionConstraints + * QueuePartitionConstraintValidation * - * Check whether all rows in the given table obey the given partition - * constraint; if so, it can be attached as a partition.  
We do this by - * scanning the table (or all of its leaf partitions) row by row, except when - * the existing constraints are sufficient to prove that the new partitioning - * constraint must already hold. + * Add an entry to wqueue to have the given partition constraint validated by + * Phase 3, for the given relation, and all its children. + * + * We first verify whether the given constraint is implied by pre-existing + * relation constraints; if it is, there's no need to scan the table to + * validate, so don't queue in that case. */ static void -ValidatePartitionConstraints(List **wqueue, Relation scanrel, - List *scanrel_children, +QueuePartitionConstraintValidation(List **wqueue, Relation scanrel, List *partConstraint, bool validate_default) { - bool found_whole_row; - ListCell *lc; - - if (partConstraint == NIL) - return; - /* - * Based on the table's existing constraints, determine if we can skip - * scanning the table to validate the partition constraint. + * Based on the table's existing constraints, determine whether or not we + * may skip scanning the table. */ if (PartConstraintImpliedByRelConstraint(scanrel, partConstraint)) { @@ -16479,69 +16472,54 @@ ValidatePartitionConstraints(List **wqueue, Relation scanrel, return; } - /* Constraints proved insufficient, so we need to scan the table. */ - foreach(lc, scanrel_children) + /* + * Constraints proved insufficient. For plain relations, queue a validation + * item now; for partitioned tables, recurse to process each partition. + */ + if (scanrel->rd_rel->relkind == RELKIND_RELATION) { AlteredTableInfo *tab; - Oid part_relid = lfirst_oid(lc); - Relation part_rel; - List *my_partconstr = partConstraint; - /* Lock already taken */ - if (part_relid != RelationGetRelid(scanrel)) - part_rel = heap_open(part_relid, NoLock); - else - part_rel = scanrel; + /* Grab a work queue entry. */ + tab = ATGetQueueEntry(wqueue, scanrel); + Assert(tab->partition_constraint == NULL); + tab->partition_constraint = (Expr *) linitial(partConstraint); + tab->validate_default = validate_default; + } + else if (scanrel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) + { + PartitionDesc partdesc = RelationGetPartitionDesc(scanrel); + int i; + + for (i = 0; i < partdesc->nparts; i++) + { + Relation part_rel; + bool found_whole_row; + List *thisPartConstraint; /* - * Skip if the partition is itself a partitioned table. We can only - * ever scan RELKIND_RELATION relations. + * This is the minimum lock we need to prevent concurrent data + * additions. */ - if (part_rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) - { - if (part_rel != scanrel) - heap_close(part_rel, NoLock); - continue; - } + part_rel = heap_open(partdesc->oids[i], ShareLock); - if (part_rel != scanrel) - { /* * Adjust the constraint for scanrel so that it matches this * partition's attribute numbers. */ - my_partconstr = map_partition_varattnos(my_partconstr, 1, - part_rel, scanrel, - &found_whole_row); + thisPartConstraint = + map_partition_varattnos(partConstraint, 1, + part_rel, scanrel, &found_whole_row); /* There can never be a whole-row reference here */ if (found_whole_row) - elog(ERROR, "unexpected whole-row reference found in partition key"); + elog(ERROR, "unexpected whole-row reference found in partition constraint"); - /* Can we skip scanning this part_rel? 
*/ - if (PartConstraintImpliedByRelConstraint(part_rel, my_partconstr)) - { - if (!validate_default) - ereport(INFO, - (errmsg("partition constraint for table \"%s\" is implied by existing constraints", - RelationGetRelationName(part_rel)))); - else - ereport(INFO, - (errmsg("updated partition constraint for default partition \"%s\" is implied by existing constraints", - RelationGetRelationName(part_rel)))); - heap_close(part_rel, NoLock); - continue; + QueuePartitionConstraintValidation(wqueue, part_rel, + thisPartConstraint, + validate_default); + heap_close(part_rel, NoLock); /* keep lock till commit */ } } - - /* Grab a work queue entry. */ - tab = ATGetQueueEntry(wqueue, part_rel); - tab->partition_constraint = (Expr *) linitial(my_partconstr); - tab->validate_default = validate_default; - - /* keep our lock until commit */ - if (part_rel != scanrel) - heap_close(part_rel, NoLock); - } } /* @@ -16568,8 +16546,8 @@ ATExecAttachPartition(List **wqueue, Relation rel, PartitionCmd *cmd) List *partBoundConstraint; /* - * We must lock the default partition, because attaching a new partition - * will change its partition constraint. + * We must lock the default partition if one exists, because attaching a + * new partition will change its partition constraint. */ defaultPartOid = get_default_oid_from_partdesc(RelationGetPartitionDesc(rel)); @@ -16634,17 +16612,18 @@ ATExecAttachPartition(List **wqueue, Relation rel, PartitionCmd *cmd) * * We do that by checking if rel is a member of the list of attachRel's * partitions provided the latter is partitioned at all. We want to avoid - * having to construct this list again, so we request the strongest lock - * on all partitions. We need the strongest lock, because we may decide - * to scan them if we find out that the table being attached (or its leaf - * partitions) may contain rows that violate the partition constraint. If - * the table has a constraint that would prevent such rows, which by - * definition is present in all the partitions, we need not scan the - * table, nor its partitions. But we cannot risk a deadlock by taking a + * having to construct this list again, so we request a lock on all + * partitions. We need ShareLock, preventing data changes, because we + * may decide to scan them if we find out that the table being attached (or + * its leaf partitions) may contain rows that violate the partition + * constraint. If the table has a constraint that would prevent such rows, + * which by definition is present in all the partitions, we need not scan + * the table, nor its partitions. But we cannot risk a deadlock by taking + * a weaker lock now and the stronger one only when needed. * weaker lock now and the stronger one only when needed. */ attachrel_children = find_all_inheritors(RelationGetRelid(attachrel), - AccessExclusiveLock, NULL); + ShareLock, NULL); if (list_member_oid(attachrel_children, RelationGetRelid(rel))) ereport(ERROR, (errcode(ERRCODE_DUPLICATE_TABLE), @@ -16777,31 +16756,29 @@ ATExecAttachPartition(List **wqueue, Relation rel, PartitionCmd *cmd) "unexpected whole-row reference found in partition key"); /* Validate partition constraints against the table being attached. */ - ValidatePartitionConstraints(wqueue, attachrel, attachrel_children, - partConstraint, false); + QueuePartitionConstraintValidation(wqueue, attachrel, partConstraint, + false); } /* - * Check whether default partition has a row that would fit the partition - * being attached. 
+ * If we're attaching a partition other than the default partition and a + * default one exists, then that partition's partition constraint changes, + * so add an entry to the work queue to validate it, too. (We must not + * do this when the partition being attached is the default one; we + * already did it above!) */ - defaultPartOid = - get_default_oid_from_partdesc(RelationGetPartitionDesc(rel)); if (OidIsValid(defaultPartOid)) { Relation defaultrel; - List *defaultrel_children; List *defPartConstraint; - /* We already have taken a lock on default partition. */ + Assert(!cmd->bound->is_default); + + /* we already hold a lock on the default partition */ defaultrel = heap_open(defaultPartOid, NoLock); defPartConstraint = get_proposed_default_constraint(partBoundConstraint); - defaultrel_children = - find_all_inheritors(defaultPartOid, - AccessExclusiveLock, NULL); - ValidatePartitionConstraints(wqueue, defaultrel, - defaultrel_children, + QueuePartitionConstraintValidation(wqueue, defaultrel, defPartConstraint, true); /* keep our lock until commit. */ diff --git a/src/test/regress/expected/alter_table.out b/src/test/regress/expected/alter_table.out index 455cee74..088474cf 100644 --- a/src/test/regress/expected/alter_table.out +++ b/src/test/regress/expected/alter_table.out @@ -3729,3 +3729,19 @@ create table parted_validate_test_1 partition of parted_validate_test for values alter table parted_validate_test add constraint parted_validate_test_chka check (a > 0) not valid; alter table parted_validate_test validate constraint parted_validate_test_chka; drop table parted_validate_test; +-- check that violating rows are correctly reported when attaching as the +-- default partition +create table defpart_attach_test (a int) partition by list (a); +create table defpart_attach_test1 partition of defpart_attach_test for values in (1); +create table defpart_attach_test_d (like defpart_attach_test); +insert into defpart_attach_test_d values (1), (2); +-- error because its constraint as the default partition would be violated +-- by the row containing 1 +alter table defpart_attach_test attach partition defpart_attach_test_d default; +ERROR: partition constraint is violated by some row +delete from defpart_attach_test_d where a = 1; +alter table defpart_attach_test_d add check (a > 1); +-- should be attached successfully and without needing to be scanned +alter table defpart_attach_test attach partition defpart_attach_test_d default; +INFO: partition constraint for table "defpart_attach_test_d" is implied by existing constraints +drop table defpart_attach_test; diff --git a/src/test/regress/expected/alter_table_1.out b/src/test/regress/expected/alter_table_1.out index 357e16da..8e1053bc 100644 --- a/src/test/regress/expected/alter_table_1.out +++ b/src/test/regress/expected/alter_table_1.out @@ -3652,3 +3652,19 @@ create table parted_validate_test_1 partition of parted_validate_test for values alter table parted_validate_test add constraint parted_validate_test_chka check (a > 0) not valid; alter table parted_validate_test validate constraint parted_validate_test_chka; drop table parted_validate_test; +-- check that violating rows are correctly reported when attaching as the +-- default partition +create table defpart_attach_test (a int) partition by list (a); +create table defpart_attach_test1 partition of defpart_attach_test for values in (1); +create table defpart_attach_test_d (like defpart_attach_test); +insert into defpart_attach_test_d values (1), (2); +-- error because its constraint as the 
default partition would be violated +-- by the row containing 1 +alter table defpart_attach_test attach partition defpart_attach_test_d default; +ERROR: partition constraint is violated by some row +delete from defpart_attach_test_d where a = 1; +alter table defpart_attach_test_d add check (a > 1); +-- should be attached successfully and without needing to be scanned +alter table defpart_attach_test attach partition defpart_attach_test_d default; +INFO: partition constraint for table "defpart_attach_test_d" is implied by existing constraints +drop table defpart_attach_test; diff --git a/src/test/regress/expected/alter_table_2.out b/src/test/regress/expected/alter_table_2.out index 88f9f851..19a9d000 100644 --- a/src/test/regress/expected/alter_table_2.out +++ b/src/test/regress/expected/alter_table_2.out @@ -3652,3 +3652,19 @@ create table parted_validate_test_1 partition of parted_validate_test for values alter table parted_validate_test add constraint parted_validate_test_chka check (a > 0) not valid; alter table parted_validate_test validate constraint parted_validate_test_chka; drop table parted_validate_test; +-- check that violating rows are correctly reported when attaching as the +-- default partition +create table defpart_attach_test (a int) partition by list (a); +create table defpart_attach_test1 partition of defpart_attach_test for values in (1); +create table defpart_attach_test_d (like defpart_attach_test); +insert into defpart_attach_test_d values (1), (2); +-- error because its constraint as the default partition would be violated +-- by the row containing 1 +alter table defpart_attach_test attach partition defpart_attach_test_d default; +ERROR: partition constraint is violated by some row +delete from defpart_attach_test_d where a = 1; +alter table defpart_attach_test_d add check (a > 1); +-- should be attached successfully and without needing to be scanned +alter table defpart_attach_test attach partition defpart_attach_test_d default; +INFO: partition constraint for table "defpart_attach_test_d" is implied by existing constraints +drop table defpart_attach_test; diff --git a/src/test/regress/expected/alter_table_3.out b/src/test/regress/expected/alter_table_3.out index 345150e0..5cdf3e7a 100644 --- a/src/test/regress/expected/alter_table_3.out +++ b/src/test/regress/expected/alter_table_3.out @@ -3652,3 +3652,19 @@ create table parted_validate_test_1 partition of parted_validate_test for values alter table parted_validate_test add constraint parted_validate_test_chka check (a > 0) not valid; alter table parted_validate_test validate constraint parted_validate_test_chka; drop table parted_validate_test; +-- check that violating rows are correctly reported when attaching as the +-- default partition +create table defpart_attach_test (a int) partition by list (a); +create table defpart_attach_test1 partition of defpart_attach_test for values in (1); +create table defpart_attach_test_d (like defpart_attach_test); +insert into defpart_attach_test_d values (1), (2); +-- error because its constraint as the default partition would be violated +-- by the row containing 1 +alter table defpart_attach_test attach partition defpart_attach_test_d default; +ERROR: partition constraint is violated by some row +delete from defpart_attach_test_d where a = 1; +alter table defpart_attach_test_d add check (a > 1); +-- should be attached successfully and without needing to be scanned +alter table defpart_attach_test attach partition defpart_attach_test_d default; +INFO: partition constraint 
for table "defpart_attach_test_d" is implied by existing constraints +drop table defpart_attach_test; diff --git a/src/test/regress/sql/alter_table.sql b/src/test/regress/sql/alter_table.sql index a32521f1..e1c6772c 100644 --- a/src/test/regress/sql/alter_table.sql +++ b/src/test/regress/sql/alter_table.sql @@ -2498,3 +2498,21 @@ create table parted_validate_test_1 partition of parted_validate_test for values alter table parted_validate_test add constraint parted_validate_test_chka check (a > 0) not valid; alter table parted_validate_test validate constraint parted_validate_test_chka; drop table parted_validate_test; + +-- check that violating rows are correctly reported when attaching as the +-- default partition +create table defpart_attach_test (a int) partition by list (a); +create table defpart_attach_test1 partition of defpart_attach_test for values in (1); +create table defpart_attach_test_d (like defpart_attach_test); +insert into defpart_attach_test_d values (1), (2); + +-- error because its constraint as the default partition would be violated +-- by the row containing 1 +alter table defpart_attach_test attach partition defpart_attach_test_d default; +delete from defpart_attach_test_d where a = 1; +alter table defpart_attach_test_d add check (a > 1); + +-- should be attached successfully and without needing to be scanned +alter table defpart_attach_test attach partition defpart_attach_test_d default; + +drop table defpart_attach_test; \ No newline at end of file From 84d1ef9a84dff1b7bfcadfb042922920853ba8bd Mon Sep 17 00:00:00 2001 From: Alvaro Herrera Date: Thu, 12 Apr 2018 16:51:55 -0300 Subject: [PATCH 249/578] Add comment about default partition in check_new_partition_bound The intention of the test is not immediately obvious, so we need this much. --- src/backend/catalog/partition.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/backend/catalog/partition.c b/src/backend/catalog/partition.c index f74a88f0..5c85918f 100644 --- a/src/backend/catalog/partition.c +++ b/src/backend/catalog/partition.c @@ -898,6 +898,12 @@ check_new_partition_bound(char *relname, Relation parent, if (spec->is_default) { + /* + * The default partition bound never conflicts with any other + * partition's; if that's what we're attaching, the only possible + * problem is that one already exists, so check for that and we're + * done. + */ if (boundinfo == NULL || !partition_bound_has_default(boundinfo)) return; From b58f50ddbdd49c6d0c0e89b25963a68c48dd886b Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Mon, 29 Jun 2020 17:46:24 +0800 Subject: [PATCH 250/578] Revert lowering of lock level for ATTACH PARTITION.http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/backend/commands/tablecmds.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index dbdd156e..afa19507 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -16498,10 +16498,9 @@ QueuePartitionConstraintValidation(List **wqueue, Relation scanrel, List *thisPartConstraint; /* - * This is the minimum lock we need to prevent concurrent data - * additions. + * This is the minimum lock we need to prevent deadlocks. 
*/ - part_rel = heap_open(partdesc->oids[i], ShareLock); + part_rel = heap_open(partdesc->oids[i], AccessExclusiveLock); /* * Adjust the constraint for scanrel so that it matches this @@ -16612,18 +16611,17 @@ ATExecAttachPartition(List **wqueue, Relation rel, PartitionCmd *cmd) * * We do that by checking if rel is a member of the list of attachRel's * partitions provided the latter is partitioned at all. We want to avoid - * having to construct this list again, so we request a lock on all - * partitions. We need ShareLock, preventing data changes, because we - * may decide to scan them if we find out that the table being attached (or - * its leaf partitions) may contain rows that violate the partition - * constraint. If the table has a constraint that would prevent such rows, - * which by definition is present in all the partitions, we need not scan - * the table, nor its partitions. But we cannot risk a deadlock by taking - * a weaker lock now and the stronger one only when needed. + * having to construct this list again, so we request the strongest lock + * on all partitions. We need the strongest lock, because we may decide + * to scan them if we find out that the table being attached (or its leaf + * partitions) may contain rows that violate the partition constraint. If + * the table has a constraint that would prevent such rows, which by + * definition is present in all the partitions, we need not scan the + * table, nor its partitions. But we cannot risk a deadlock by taking a * weaker lock now and the stronger one only when needed. */ attachrel_children = find_all_inheritors(RelationGetRelid(attachrel), - ShareLock, NULL); + AccessExclusiveLock, NULL); if (list_member_oid(attachrel_children, RelationGetRelid(rel))) ereport(ERROR, (errcode(ERRCODE_DUPLICATE_TABLE), From 33216c2612714f287ff0403f0ede13d48facb22d Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Mon, 29 Jun 2020 19:47:39 +0800 Subject: [PATCH 251/578] Fix handling of partition bounds for boolean partitioning columns. 
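The grammar change below adds TRUE_P and FALSE_P to the partbound_datum rule, so unquoted boolean keywords are accepted as partition bound values; the rule turns them into the string constants 'true' and 'false', which are later coerced to the partition key's type. A minimal sketch of the difference, using a list-partitioned boolean column as in the new boolspart regression test (previously the unquoted keyword was rejected by the grammar and the bound had to be spelled as a quoted string):

-- parent table, as in the test added below
create table boolspart (a bool) partition by list (a);
-- this quoted spelling already worked before the change
create table boolspart_t partition of boolspart for values in ('true');
-- with TRUE_P/FALSE_P accepted in partbound_datum, the natural spelling works too
create table boolspart_f partition of boolspart for values in (false);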
http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/backend/parser/gram.y | 2 ++ src/test/regress/expected/create_table.out | 14 ++++++++++++++ src/test/regress/sql/create_table.sql | 7 +++++++ 3 files changed, 23 insertions(+) diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y index 1cf77960..a3eb7514 100644 --- a/src/backend/parser/gram.y +++ b/src/backend/parser/gram.y @@ -2948,6 +2948,8 @@ hash_partbound: partbound_datum: Sconst { $$ = makeStringConst($1, @1); } | NumericOnly { $$ = makeAConst($1, @1); } + | TRUE_P { $$ = makeStringConst(pstrdup("true"), @1); } + | FALSE_P { $$ = makeStringConst(pstrdup("false"), @1); } | NULL_P { $$ = makeNullAConst(@1); } ; diff --git a/src/test/regress/expected/create_table.out b/src/test/regress/expected/create_table.out index 7fa55adb..55e9e44d 100644 --- a/src/test/regress/expected/create_table.out +++ b/src/test/regress/expected/create_table.out @@ -899,3 +899,17 @@ Distribute By: HASH(a) Location Nodes: ALL DATANODES DROP TABLE parted_col_comment; +-- partition on boolean column +create table boolspart (a bool) partition by list (a); +create table boolspart_t partition of boolspart for values in (true); +create table boolspart_f partition of boolspart for values in (false); +\d+ boolspart + Table "public.boolspart" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +--------+---------+-----------+----------+---------+---------+--------------+------------- + a | boolean | | | | plain | | +Partition key: LIST (a) +Partitions: boolspart_f FOR VALUES IN (false), + boolspart_t FOR VALUES IN (true) + +drop table boolspart; diff --git a/src/test/regress/sql/create_table.sql b/src/test/regress/sql/create_table.sql index b125fa50..68482d79 100644 --- a/src/test/regress/sql/create_table.sql +++ b/src/test/regress/sql/create_table.sql @@ -714,3 +714,10 @@ COMMENT ON COLUMN parted_col_comment.a IS 'Partition key'; SELECT obj_description('parted_col_comment'::regclass); \d+ parted_col_comment DROP TABLE parted_col_comment; + +-- partition on boolean column +create table boolspart (a bool) partition by list (a); +create table boolspart_t partition of boolspart for values in (true); +create table boolspart_f partition of boolspart for values in (false); +\d+ boolspart +drop table boolspart; From 1ff5704b1e0393d1f53502027d65150d7f974059 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Mon, 29 Jun 2020 19:59:52 +0800 Subject: [PATCH 252/578] Fix assorted partition pruning bugs. http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/backend/partitioning/partprune.c | 58 ++++++++-------- src/test/regress/expected/partition_prune.out | 66 +++++++++++++++++++ src/test/regress/sql/partition_prune.sql | 14 ++++ 3 files changed, 106 insertions(+), 32 deletions(-) diff --git a/src/backend/partitioning/partprune.c b/src/backend/partitioning/partprune.c index 959ee164..03bacd1f 100644 --- a/src/backend/partitioning/partprune.c +++ b/src/backend/partitioning/partprune.c @@ -364,8 +364,9 @@ get_matching_partitions(PartitionPruneContext *context, List *pruning_steps) * For BoolExpr clauses, we recursively generate steps for each argument, and * return a PartitionPruneStepCombine of their results. * - * The generated steps are added to the context's steps list. Each step is - * assigned a step identifier, unique even across recursive calls. + * The return value is a list of the steps generated, which are also added to + * the context's steps list. 
Each step is assigned a step identifier, unique + * even across recursive calls. * * If we find clauses that are mutually contradictory, or a pseudoconstant * clause that contains false, we set *contradictory to true and return NIL @@ -1386,6 +1387,7 @@ match_clause_to_partition_key(RelOptInfo *rel, List *elem_exprs, *elem_clauses; ListCell *lc1; + bool contradictory; if (IsA(leftop, RelabelType)) leftop = ((RelabelType *) leftop)->arg; @@ -1404,7 +1406,7 @@ match_clause_to_partition_key(RelOptInfo *rel, * Only allow strict operators. This will guarantee nulls are * filtered. */ - if (!op_strict(saop->opno)) + if (!op_strict(saop_op)) return PARTCLAUSE_UNSUPPORTED; /* Useless if the array has any volatile functions. */ @@ -1437,6 +1439,8 @@ match_clause_to_partition_key(RelOptInfo *rel, if (strategy != BTEqualStrategyNumber) return PARTCLAUSE_UNSUPPORTED; } + else + return PARTCLAUSE_UNSUPPORTED; /* no useful negator */ } /* @@ -1477,7 +1481,7 @@ match_clause_to_partition_key(RelOptInfo *rel, elem_exprs = lappend(elem_exprs, elem_expr); } } - else + else if (IsA(rightop, ArrayExpr)) { ArrayExpr *arrexpr = castNode(ArrayExpr, rightop); @@ -1491,6 +1495,11 @@ match_clause_to_partition_key(RelOptInfo *rel, elem_exprs = arrexpr->elements; } + else + { + /* Give up on any other clause types. */ + return PARTCLAUSE_UNSUPPORTED; + } /* * Now generate a list of clauses, one for each array element, of the @@ -1509,36 +1518,21 @@ match_clause_to_partition_key(RelOptInfo *rel, } /* - * Build a combine step as if for an OR clause or add the clauses to - * the end of the list that's being processed currently. + * If we have an ANY clause and multiple elements, first turn the list + * of clauses into an OR expression. */ if (saop->useOr && list_length(elem_clauses) > 1) - { - Expr *orexpr; - bool contradictory; - - orexpr = makeBoolExpr(OR_EXPR, elem_clauses, -1); - *clause_steps = - gen_partprune_steps_internal(context, rel, list_make1(orexpr), - &contradictory); - if (contradictory) - return PARTCLAUSE_MATCH_CONTRADICT; - - Assert(list_length(*clause_steps) == 1); - return PARTCLAUSE_MATCH_STEPS; - } - else - { - bool contradictory; - - *clause_steps = - gen_partprune_steps_internal(context, rel, elem_clauses, - &contradictory); - if (contradictory) - return PARTCLAUSE_MATCH_CONTRADICT; - Assert(list_length(*clause_steps) >= 1); - return PARTCLAUSE_MATCH_STEPS; - } + elem_clauses = list_make1(makeBoolExpr(OR_EXPR, elem_clauses, -1)); + + /* Finally, generate steps */ + *clause_steps = + gen_partprune_steps_internal(context, rel, elem_clauses, + &contradictory); + if (contradictory) + return PARTCLAUSE_MATCH_CONTRADICT; + else if (*clause_steps == NIL) + return PARTCLAUSE_UNSUPPORTED; /* step generation failed */ + return PARTCLAUSE_MATCH_STEPS; } else if (IsA(clause, NullTest)) { diff --git a/src/test/regress/expected/partition_prune.out b/src/test/regress/expected/partition_prune.out index 3e0a196e..b91cac4b 100644 --- a/src/test/regress/expected/partition_prune.out +++ b/src/test/regress/expected/partition_prune.out @@ -1081,6 +1081,72 @@ explain (costs off) select * from boolpart where a is not unknown; Filter: (a IS NOT UNKNOWN) (7 rows) +-- test scalar-to-array operators +create table coercepart (a varchar) partition by list (a); +create table coercepart_ab partition of coercepart for values in ('ab'); +create table coercepart_bc partition of coercepart for values in ('bc'); +create table coercepart_cd partition of coercepart for values in ('cd'); +explain (costs off) select * from coercepart 
where a in ('ab', to_char(125, '999')); + QUERY PLAN +------------------------------------------------------------------------------------------------------------------------------ + Append + -> Seq Scan on coercepart_ab + Filter: ((a)::text = ANY ((ARRAY['ab'::character varying, (to_char(125, '999'::text))::character varying])::text[])) + -> Seq Scan on coercepart_bc + Filter: ((a)::text = ANY ((ARRAY['ab'::character varying, (to_char(125, '999'::text))::character varying])::text[])) + -> Seq Scan on coercepart_cd + Filter: ((a)::text = ANY ((ARRAY['ab'::character varying, (to_char(125, '999'::text))::character varying])::text[])) +(7 rows) + +explain (costs off) select * from coercepart where a ~ any ('{ab}'); + QUERY PLAN +---------------------------------------------------- + Append + -> Seq Scan on coercepart_ab + Filter: ((a)::text ~ ANY ('{ab}'::text[])) + -> Seq Scan on coercepart_bc + Filter: ((a)::text ~ ANY ('{ab}'::text[])) + -> Seq Scan on coercepart_cd + Filter: ((a)::text ~ ANY ('{ab}'::text[])) +(7 rows) + +explain (costs off) select * from coercepart where a !~ all ('{ab}'); + QUERY PLAN +----------------------------------------------------- + Append + -> Seq Scan on coercepart_ab + Filter: ((a)::text !~ ALL ('{ab}'::text[])) + -> Seq Scan on coercepart_bc + Filter: ((a)::text !~ ALL ('{ab}'::text[])) + -> Seq Scan on coercepart_cd + Filter: ((a)::text !~ ALL ('{ab}'::text[])) +(7 rows) + +explain (costs off) select * from coercepart where a ~ any ('{ab,bc}'); + QUERY PLAN +------------------------------------------------------- + Append + -> Seq Scan on coercepart_ab + Filter: ((a)::text ~ ANY ('{ab,bc}'::text[])) + -> Seq Scan on coercepart_bc + Filter: ((a)::text ~ ANY ('{ab,bc}'::text[])) + -> Seq Scan on coercepart_cd + Filter: ((a)::text ~ ANY ('{ab,bc}'::text[])) +(7 rows) + +explain (costs off) select * from coercepart where a !~ all ('{ab,bc}'); + QUERY PLAN +-------------------------------------------------------- + Append + -> Seq Scan on coercepart_ab + Filter: ((a)::text !~ ALL ('{ab,bc}'::text[])) + -> Seq Scan on coercepart_bc + Filter: ((a)::text !~ ALL ('{ab,bc}'::text[])) + -> Seq Scan on coercepart_cd + Filter: ((a)::text !~ ALL ('{ab,bc}'::text[])) +(7 rows) + +drop table coercepart; -- -- some more cases -- diff --git a/src/test/regress/sql/partition_prune.sql b/src/test/regress/sql/partition_prune.sql index ca313897..164b74ee 100644 --- a/src/test/regress/sql/partition_prune.sql +++ b/src/test/regress/sql/partition_prune.sql @@ -152,6 +152,20 @@ explain (costs off) select * from boolpart where a is not true and a is not fals explain (costs off) select * from boolpart where a is unknown; explain (costs off) select * from boolpart where a is not unknown; +-- test scalar-to-array operators +create table coercepart (a varchar) partition by list (a); +create table coercepart_ab partition of coercepart for values in ('ab'); +create table coercepart_bc partition of coercepart for values in ('bc'); +create table coercepart_cd partition of coercepart for values in ('cd'); + +explain (costs off) select * from coercepart where a in ('ab', to_char(125, '999')); +explain (costs off) select * from coercepart where a ~ any ('{ab}'); +explain (costs off) select * from coercepart where a !~ all ('{ab}'); +explain (costs off) select * from coercepart where a ~ any ('{ab,bc}'); +explain (costs off) select * from coercepart where a !~ all ('{ab,bc}'); + +drop table coercepart; + -- -- some more cases -- From a74e161eec95f99a0f9800d40c627100783fe873 Mon Sep 17 00:00:00 
2001 From: JennyJennyChen Date: Mon, 29 Jun 2020 20:02:58 +0800 Subject: [PATCH 253/578] pgstatindex, pageinspect: handle partitioned indexes --- contrib/pageinspect/expected/page.out | 6 +- contrib/pageinspect/rawpage.c | 391 +++++----- contrib/pageinspect/sql/page.sql | 5 +- contrib/pgstattuple/expected/pgstattuple.out | 3 + contrib/pgstattuple/pgstattuple.c | 779 ++++++++++--------- contrib/pgstattuple/sql/pgstattuple.sql | 2 + 6 files changed, 603 insertions(+), 583 deletions(-) diff --git a/contrib/pageinspect/expected/page.out b/contrib/pageinspect/expected/page.out index 8e15947a..5cbe2203 100644 --- a/contrib/pageinspect/expected/page.out +++ b/contrib/pageinspect/expected/page.out @@ -83,10 +83,14 @@ SELECT * FROM fsm_page_contents(get_raw_page('test1', 'fsm', 0)); (1 row) DROP TABLE test1; --- check that using any of these functions with a partitioned table would fail +-- check that using any of these functions with a partitioned table or index +-- would fail create table test_partitioned (a int) partition by range (a); +create index test_partitioned_index on test_partitioned (a); select get_raw_page('test_partitioned', 0); -- error about partitioned table ERROR: cannot get raw page from partitioned table "test_partitioned" +select get_raw_page('test_partitioned_index', 0); -- error about partitioned index +ERROR: cannot get raw page from partitioned index "test_partitioned_index" -- a regular table which is a member of a partition set should work though create table test_part1 partition of test_partitioned for values from ( 1 ) to (100); select get_raw_page('test_part1', 0); -- get farther and error about empty table diff --git a/contrib/pageinspect/rawpage.c b/contrib/pageinspect/rawpage.c index 9682498d..999c4b45 100644 --- a/contrib/pageinspect/rawpage.c +++ b/contrib/pageinspect/rawpage.c @@ -1,14 +1,14 @@ /*------------------------------------------------------------------------- * * rawpage.c - * Functions to extract a raw page as bytea and inspect it + * Functions to extract a raw page as bytea and inspect it * * Access-method specific inspection functions are in separate files. * * Copyright (c) 2007-2017, PostgreSQL Global Development Group * * IDENTIFICATION - * contrib/pageinspect/rawpage.c + * contrib/pageinspect/rawpage.c * *------------------------------------------------------------------------- */ @@ -33,7 +33,7 @@ PG_MODULE_MAGIC; static bytea *get_raw_page_internal(text *relname, ForkNumber forknum, - BlockNumber blkno); + BlockNumber blkno); /* @@ -46,23 +46,23 @@ PG_FUNCTION_INFO_V1(get_raw_page); Datum get_raw_page(PG_FUNCTION_ARGS) { - text *relname = PG_GETARG_TEXT_PP(0); - uint32 blkno = PG_GETARG_UINT32(1); - bytea *raw_page; - - /* - * We don't normally bother to check the number of arguments to a C - * function, but here it's needed for safety because early 8.4 beta - * releases mistakenly redefined get_raw_page() as taking three arguments. - */ - if (PG_NARGS() != 2) - ereport(ERROR, - (errmsg("wrong number of arguments to get_raw_page()"), - errhint("Run the updated pageinspect.sql script."))); - - raw_page = get_raw_page_internal(relname, MAIN_FORKNUM, blkno); - - PG_RETURN_BYTEA_P(raw_page); + text *relname = PG_GETARG_TEXT_PP(0); + uint32 blkno = PG_GETARG_UINT32(1); + bytea *raw_page; + + /* + * We don't normally bother to check the number of arguments to a C + * function, but here it's needed for safety because early 8.4 beta + * releases mistakenly redefined get_raw_page() as taking three arguments. 
+ */ + if (PG_NARGS() != 2) + ereport(ERROR, + (errmsg("wrong number of arguments to get_raw_page()"), + errhint("Run the updated pageinspect.sql script."))); + + raw_page = get_raw_page_internal(relname, MAIN_FORKNUM, blkno); + + PG_RETURN_BYTEA_P(raw_page); } /* @@ -75,17 +75,17 @@ PG_FUNCTION_INFO_V1(get_raw_page_fork); Datum get_raw_page_fork(PG_FUNCTION_ARGS) { - text *relname = PG_GETARG_TEXT_PP(0); - text *forkname = PG_GETARG_TEXT_PP(1); - uint32 blkno = PG_GETARG_UINT32(2); - bytea *raw_page; - ForkNumber forknum; + text *relname = PG_GETARG_TEXT_PP(0); + text *forkname = PG_GETARG_TEXT_PP(1); + uint32 blkno = PG_GETARG_UINT32(2); + bytea *raw_page; + ForkNumber forknum; - forknum = forkname_to_number(text_to_cstring(forkname)); + forknum = forkname_to_number(text_to_cstring(forkname)); - raw_page = get_raw_page_internal(relname, forknum, blkno); + raw_page = get_raw_page_internal(relname, forknum, blkno); - PG_RETURN_BYTEA_P(raw_page); + PG_RETURN_BYTEA_P(raw_page); } /* @@ -94,76 +94,81 @@ get_raw_page_fork(PG_FUNCTION_ARGS) static bytea * get_raw_page_internal(text *relname, ForkNumber forknum, BlockNumber blkno) { - bytea *raw_page; - RangeVar *relrv; - Relation rel; - char *raw_page_data; - Buffer buf; - - if (!superuser()) - ereport(ERROR, - (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), - (errmsg("must be superuser to use raw functions")))); - - relrv = makeRangeVarFromNameList(textToQualifiedNameList(relname)); - rel = relation_openrv(relrv, AccessShareLock); - - /* Check that this relation has storage */ - if (rel->rd_rel->relkind == RELKIND_VIEW) - ereport(ERROR, - (errcode(ERRCODE_WRONG_OBJECT_TYPE), - errmsg("cannot get raw page from view \"%s\"", - RelationGetRelationName(rel)))); - if (rel->rd_rel->relkind == RELKIND_COMPOSITE_TYPE) - ereport(ERROR, - (errcode(ERRCODE_WRONG_OBJECT_TYPE), - errmsg("cannot get raw page from composite type \"%s\"", - RelationGetRelationName(rel)))); - if (rel->rd_rel->relkind == RELKIND_FOREIGN_TABLE) - ereport(ERROR, - (errcode(ERRCODE_WRONG_OBJECT_TYPE), - errmsg("cannot get raw page from foreign table \"%s\"", - RelationGetRelationName(rel)))); - if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) - ereport(ERROR, - (errcode(ERRCODE_WRONG_OBJECT_TYPE), - errmsg("cannot get raw page from partitioned table \"%s\"", - RelationGetRelationName(rel)))); - - /* - * Reject attempts to read non-local temporary relations; we would be - * likely to get wrong data since we have no visibility into the owning - * session's local buffers. 
- */ - if (RELATION_IS_OTHER_TEMP(rel)) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("cannot access temporary tables of other sessions"))); - - if (blkno >= RelationGetNumberOfBlocksInFork(rel, forknum)) - ereport(ERROR, - (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("block number %u is out of range for relation \"%s\"", - blkno, RelationGetRelationName(rel)))); - - /* Initialize buffer to copy to */ - raw_page = (bytea *) palloc(BLCKSZ + VARHDRSZ); - SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ); - raw_page_data = VARDATA(raw_page); - - /* Take a verbatim copy of the page */ - - buf = ReadBufferExtended(rel, forknum, blkno, RBM_NORMAL, NULL); - LockBuffer(buf, BUFFER_LOCK_SHARE); - - memcpy(raw_page_data, BufferGetPage(buf), BLCKSZ); - - LockBuffer(buf, BUFFER_LOCK_UNLOCK); - ReleaseBuffer(buf); - - relation_close(rel, AccessShareLock); - - return raw_page; + bytea *raw_page; + RangeVar *relrv; + Relation rel; + char *raw_page_data; + Buffer buf; + + if (!superuser()) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + (errmsg("must be superuser to use raw functions")))); + + relrv = makeRangeVarFromNameList(textToQualifiedNameList(relname)); + rel = relation_openrv(relrv, AccessShareLock); + + /* Check that this relation has storage */ + if (rel->rd_rel->relkind == RELKIND_VIEW) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("cannot get raw page from view \"%s\"", + RelationGetRelationName(rel)))); + if (rel->rd_rel->relkind == RELKIND_COMPOSITE_TYPE) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("cannot get raw page from composite type \"%s\"", + RelationGetRelationName(rel)))); + if (rel->rd_rel->relkind == RELKIND_FOREIGN_TABLE) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("cannot get raw page from foreign table \"%s\"", + RelationGetRelationName(rel)))); + if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("cannot get raw page from partitioned table \"%s\"", + RelationGetRelationName(rel)))); + if (rel->rd_rel->relkind == RELKIND_PARTITIONED_INDEX) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("cannot get raw page from partitioned index \"%s\"", + RelationGetRelationName(rel)))); + + /* + * Reject attempts to read non-local temporary relations; we would be + * likely to get wrong data since we have no visibility into the owning + * session's local buffers. 
+ */ + if (RELATION_IS_OTHER_TEMP(rel)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot access temporary tables of other sessions"))); + + if (blkno >= RelationGetNumberOfBlocksInFork(rel, forknum)) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("block number %u is out of range for relation \"%s\"", + blkno, RelationGetRelationName(rel)))); + + /* Initialize buffer to copy to */ + raw_page = (bytea *) palloc(BLCKSZ + VARHDRSZ); + SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ); + raw_page_data = VARDATA(raw_page); + + /* Take a verbatim copy of the page */ + + buf = ReadBufferExtended(rel, forknum, blkno, RBM_NORMAL, NULL); + LockBuffer(buf, BUFFER_LOCK_SHARE); + + memcpy(raw_page_data, BufferGetPage(buf), BLCKSZ); + + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + ReleaseBuffer(buf); + + relation_close(rel, AccessShareLock); + + return raw_page; } @@ -182,23 +187,23 @@ get_raw_page_internal(text *relname, ForkNumber forknum, BlockNumber blkno) Page get_page_from_raw(bytea *raw_page) { - Page page; - int raw_page_size; + Page page; + int raw_page_size; - raw_page_size = VARSIZE_ANY_EXHDR(raw_page); + raw_page_size = VARSIZE_ANY_EXHDR(raw_page); - if (raw_page_size != BLCKSZ) - ereport(ERROR, - (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("invalid page size"), - errdetail("Expected %d bytes, got %d.", - BLCKSZ, raw_page_size))); + if (raw_page_size != BLCKSZ) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid page size"), + errdetail("Expected %d bytes, got %d.", + BLCKSZ, raw_page_size))); - page = palloc(raw_page_size); + page = palloc(raw_page_size); - memcpy(page, VARDATA_ANY(raw_page), raw_page_size); + memcpy(page, VARDATA_ANY(raw_page), raw_page_size); - return page; + return page; } @@ -213,76 +218,76 @@ PG_FUNCTION_INFO_V1(page_header); Datum page_header(PG_FUNCTION_ARGS) { - bytea *raw_page = PG_GETARG_BYTEA_P(0); - int raw_page_size; - - TupleDesc tupdesc; - - Datum result; - HeapTuple tuple; - Datum values[10]; - bool nulls[10]; - - PageHeader page; - XLogRecPtr lsn; - - if (!superuser()) - ereport(ERROR, - (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), - (errmsg("must be superuser to use raw page functions")))); - - raw_page_size = VARSIZE(raw_page) - VARHDRSZ; - - /* - * Check that enough data was supplied, so that we don't try to access - * fields outside the supplied buffer. - */ - if (raw_page_size < SizeOfPageHeaderData) - ereport(ERROR, - (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("input page too small (%d bytes)", raw_page_size))); - - page = (PageHeader) VARDATA(raw_page); - - /* Build a tuple descriptor for our result type */ - if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) - elog(ERROR, "return type must be a row type"); - - /* Extract information from the page header */ - - lsn = PageGetLSN(page); - - /* pageinspect >= 1.2 uses pg_lsn instead of text for the LSN field. 
*/ - if (tupdesc->attrs[0]->atttypid == TEXTOID) - { - char lsnchar[64]; - - snprintf(lsnchar, sizeof(lsnchar), "%X/%X", - (uint32) (lsn >> 32), (uint32) lsn); - values[0] = CStringGetTextDatum(lsnchar); - } - else - values[0] = LSNGetDatum(lsn); - values[1] = UInt16GetDatum(page->pd_checksum); - values[2] = UInt16GetDatum(page->pd_flags); -#ifdef _SHARDING_ - values[3] = UInt16GetDatum(page->pd_shard); + bytea *raw_page = PG_GETARG_BYTEA_P(0); + int raw_page_size; + + TupleDesc tupdesc; + + Datum result; + HeapTuple tuple; + Datum values[10]; + bool nulls[10]; + + PageHeader page; + XLogRecPtr lsn; + + if (!superuser()) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + (errmsg("must be superuser to use raw page functions")))); + + raw_page_size = VARSIZE(raw_page) - VARHDRSZ; + + /* + * Check that enough data was supplied, so that we don't try to access + * fields outside the supplied buffer. + */ + if (raw_page_size < SizeOfPageHeaderData) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("input page too small (%d bytes)", raw_page_size))); + + page = (PageHeader) VARDATA(raw_page); + + /* Build a tuple descriptor for our result type */ + if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) + elog(ERROR, "return type must be a row type"); + + /* Extract information from the page header */ + + lsn = PageGetLSN(page); + + /* pageinspect >= 1.2 uses pg_lsn instead of text for the LSN field. */ + if (tupdesc->attrs[0]->atttypid == TEXTOID) + { + char lsnchar[64]; + + snprintf(lsnchar, sizeof(lsnchar), "%X/%X", + (uint32) (lsn >> 32), (uint32) lsn); + values[0] = CStringGetTextDatum(lsnchar); + } + else + values[0] = LSNGetDatum(lsn); + values[1] = UInt16GetDatum(page->pd_checksum); + values[2] = UInt16GetDatum(page->pd_flags); +#ifdef _SHARDING_ + values[3] = UInt16GetDatum(page->pd_shard); #endif - values[4] = UInt16GetDatum(page->pd_lower); - values[5] = UInt16GetDatum(page->pd_upper); - values[6] = UInt16GetDatum(page->pd_special); - values[7] = UInt16GetDatum(PageGetPageSize(page)); - values[8] = UInt16GetDatum(PageGetPageLayoutVersion(page)); - values[9] = TransactionIdGetDatum(page->pd_prune_xid); + values[4] = UInt16GetDatum(page->pd_lower); + values[5] = UInt16GetDatum(page->pd_upper); + values[6] = UInt16GetDatum(page->pd_special); + values[7] = UInt16GetDatum(PageGetPageSize(page)); + values[8] = UInt16GetDatum(PageGetPageLayoutVersion(page)); + values[9] = TransactionIdGetDatum(page->pd_prune_xid); - /* Build and return the tuple. */ + /* Build and return the tuple. 
*/ - memset(nulls, 0, sizeof(nulls)); + memset(nulls, 0, sizeof(nulls)); - tuple = heap_form_tuple(tupdesc, values, nulls); - result = HeapTupleGetDatum(tuple); + tuple = heap_form_tuple(tupdesc, values, nulls); + result = HeapTupleGetDatum(tuple); - PG_RETURN_DATUM(result); + PG_RETURN_DATUM(result); } /* @@ -296,27 +301,27 @@ PG_FUNCTION_INFO_V1(page_checksum); Datum page_checksum(PG_FUNCTION_ARGS) { - bytea *raw_page = PG_GETARG_BYTEA_P(0); - uint32 blkno = PG_GETARG_INT32(1); - int raw_page_size; - PageHeader page; + bytea *raw_page = PG_GETARG_BYTEA_P(0); + uint32 blkno = PG_GETARG_INT32(1); + int raw_page_size; + PageHeader page; - if (!superuser()) - ereport(ERROR, - (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), - (errmsg("must be superuser to use raw page functions")))); + if (!superuser()) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + (errmsg("must be superuser to use raw page functions")))); - raw_page_size = VARSIZE(raw_page) - VARHDRSZ; + raw_page_size = VARSIZE(raw_page) - VARHDRSZ; - /* - * Check that the supplied page is of the right size. - */ - if (raw_page_size != BLCKSZ) - ereport(ERROR, - (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("incorrect size of input page (%d bytes)", raw_page_size))); + /* + * Check that the supplied page is of the right size. + */ + if (raw_page_size != BLCKSZ) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("incorrect size of input page (%d bytes)", raw_page_size))); - page = (PageHeader) VARDATA(raw_page); + page = (PageHeader) VARDATA(raw_page); - PG_RETURN_INT16(pg_checksum_page((char *) page, blkno)); + PG_RETURN_INT16(pg_checksum_page((char *) page, blkno)); } diff --git a/contrib/pageinspect/sql/page.sql b/contrib/pageinspect/sql/page.sql index 493ca9b2..518d5fba 100644 --- a/contrib/pageinspect/sql/page.sql +++ b/contrib/pageinspect/sql/page.sql @@ -33,9 +33,12 @@ SELECT * FROM fsm_page_contents(get_raw_page('test1', 'fsm', 0)); DROP TABLE test1; --- check that using any of these functions with a partitioned table would fail +-- check that using any of these functions with a partitioned table or index +-- would fail create table test_partitioned (a int) partition by range (a); +create index test_partitioned_index on test_partitioned (a); select get_raw_page('test_partitioned', 0); -- error about partitioned table +select get_raw_page('test_partitioned_index', 0); -- error about partitioned index -- a regular table which is a member of a partition set should work though create table test_part1 partition of test_partitioned for values from ( 1 ) to (100); diff --git a/contrib/pgstattuple/expected/pgstattuple.out b/contrib/pgstattuple/expected/pgstattuple.out index 129b29c6..770e73ed 100644 --- a/contrib/pgstattuple/expected/pgstattuple.out +++ b/contrib/pgstattuple/expected/pgstattuple.out @@ -152,9 +152,12 @@ select pgstatginindex('test_hashidx'); ERROR: relation "test_hashidx" is not a GIN index -- check that using any of these functions with unsupported relations will fail create table test_partitioned (a int) partition by range (a); +create index test_partitioned_index on test_partitioned(a); -- these should all fail select pgstattuple('test_partitioned'); ERROR: "test_partitioned" (partitioned table) is not supported +select pgstattuple('test_partitioned_index'); +ERROR: "test_partitioned_index" (partitioned index) is not supported select pgstattuple_approx('test_partitioned'); ERROR: "test_partitioned" is not a table or materialized view select pg_relpages('test_partitioned'); diff --git 
a/contrib/pgstattuple/pgstattuple.c b/contrib/pgstattuple/pgstattuple.c index ae26d6fe..da993bae 100644 --- a/contrib/pgstattuple/pgstattuple.c +++ b/contrib/pgstattuple/pgstattuple.c @@ -1,7 +1,7 @@ /* * contrib/pgstattuple/pgstattuple.c * - * Copyright (c) 2001,2002 Tatsuo Ishii + * Copyright (c) 2001,2002 Tatsuo Ishii * * Permission to use, copy, modify, and distribute this software and * its documentation for any purpose, without fee, and without a @@ -53,34 +53,34 @@ PG_FUNCTION_INFO_V1(pgstattuplebyid_v1_5); */ typedef struct pgstattuple_type { - uint64 table_len; - uint64 tuple_count; - uint64 tuple_len; - uint64 dead_tuple_count; - uint64 dead_tuple_len; - uint64 free_space; /* free/reusable space in bytes */ + uint64 table_len; + uint64 tuple_count; + uint64 tuple_len; + uint64 dead_tuple_count; + uint64 dead_tuple_len; + uint64 free_space; /* free/reusable space in bytes */ } pgstattuple_type; typedef void (*pgstat_page) (pgstattuple_type *, Relation, BlockNumber, - BufferAccessStrategy); + BufferAccessStrategy); static Datum build_pgstattuple_type(pgstattuple_type *stat, - FunctionCallInfo fcinfo); + FunctionCallInfo fcinfo); static Datum pgstat_relation(Relation rel, FunctionCallInfo fcinfo); static Datum pgstat_heap(Relation rel, FunctionCallInfo fcinfo); static void pgstat_btree_page(pgstattuple_type *stat, - Relation rel, BlockNumber blkno, - BufferAccessStrategy bstrategy); + Relation rel, BlockNumber blkno, + BufferAccessStrategy bstrategy); static void pgstat_hash_page(pgstattuple_type *stat, - Relation rel, BlockNumber blkno, - BufferAccessStrategy bstrategy); + Relation rel, BlockNumber blkno, + BufferAccessStrategy bstrategy); static void pgstat_gist_page(pgstattuple_type *stat, - Relation rel, BlockNumber blkno, - BufferAccessStrategy bstrategy); + Relation rel, BlockNumber blkno, + BufferAccessStrategy bstrategy); static Datum pgstat_index(Relation rel, BlockNumber start, - pgstat_page pagefn, FunctionCallInfo fcinfo); + pgstat_page pagefn, FunctionCallInfo fcinfo); static void pgstat_index_page(pgstattuple_type *stat, Page page, - OffsetNumber minoff, OffsetNumber maxoff); + OffsetNumber minoff, OffsetNumber maxoff); /* * build_pgstattuple_type -- build a pgstattuple_type tuple @@ -88,65 +88,65 @@ static void pgstat_index_page(pgstattuple_type *stat, Page page, static Datum build_pgstattuple_type(pgstattuple_type *stat, FunctionCallInfo fcinfo) { -#define NCOLUMNS 9 -#define NCHARS 32 - - HeapTuple tuple; - char *values[NCOLUMNS]; - char values_buf[NCOLUMNS][NCHARS]; - int i; - double tuple_percent; - double dead_tuple_percent; - double free_percent; /* free/reusable space in % */ - TupleDesc tupdesc; - AttInMetadata *attinmeta; - - /* Build a tuple descriptor for our result type */ - if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) - elog(ERROR, "return type must be a row type"); - - /* - * Generate attribute metadata needed later to produce tuples from raw C - * strings - */ - attinmeta = TupleDescGetAttInMetadata(tupdesc); - - if (stat->table_len == 0) - { - tuple_percent = 0.0; - dead_tuple_percent = 0.0; - free_percent = 0.0; - } - else - { - tuple_percent = 100.0 * stat->tuple_len / stat->table_len; - dead_tuple_percent = 100.0 * stat->dead_tuple_len / stat->table_len; - free_percent = 100.0 * stat->free_space / stat->table_len; - } - - /* - * Prepare a values array for constructing the tuple. This should be an - * array of C strings which will be processed later by the appropriate - * "in" functions. 
- */ - for (i = 0; i < NCOLUMNS; i++) - values[i] = values_buf[i]; - i = 0; - snprintf(values[i++], NCHARS, INT64_FORMAT, stat->table_len); - snprintf(values[i++], NCHARS, INT64_FORMAT, stat->tuple_count); - snprintf(values[i++], NCHARS, INT64_FORMAT, stat->tuple_len); - snprintf(values[i++], NCHARS, "%.2f", tuple_percent); - snprintf(values[i++], NCHARS, INT64_FORMAT, stat->dead_tuple_count); - snprintf(values[i++], NCHARS, INT64_FORMAT, stat->dead_tuple_len); - snprintf(values[i++], NCHARS, "%.2f", dead_tuple_percent); - snprintf(values[i++], NCHARS, INT64_FORMAT, stat->free_space); - snprintf(values[i++], NCHARS, "%.2f", free_percent); - - /* build a tuple */ - tuple = BuildTupleFromCStrings(attinmeta, values); - - /* make the tuple into a datum */ - return HeapTupleGetDatum(tuple); +#define NCOLUMNS 9 +#define NCHARS 32 + + HeapTuple tuple; + char *values[NCOLUMNS]; + char values_buf[NCOLUMNS][NCHARS]; + int i; + double tuple_percent; + double dead_tuple_percent; + double free_percent; /* free/reusable space in % */ + TupleDesc tupdesc; + AttInMetadata *attinmeta; + + /* Build a tuple descriptor for our result type */ + if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) + elog(ERROR, "return type must be a row type"); + + /* + * Generate attribute metadata needed later to produce tuples from raw C + * strings + */ + attinmeta = TupleDescGetAttInMetadata(tupdesc); + + if (stat->table_len == 0) + { + tuple_percent = 0.0; + dead_tuple_percent = 0.0; + free_percent = 0.0; + } + else + { + tuple_percent = 100.0 * stat->tuple_len / stat->table_len; + dead_tuple_percent = 100.0 * stat->dead_tuple_len / stat->table_len; + free_percent = 100.0 * stat->free_space / stat->table_len; + } + + /* + * Prepare a values array for constructing the tuple. This should be an + * array of C strings which will be processed later by the appropriate + * "in" functions. 
+ */ + for (i = 0; i < NCOLUMNS; i++) + values[i] = values_buf[i]; + i = 0; + snprintf(values[i++], NCHARS, INT64_FORMAT, stat->table_len); + snprintf(values[i++], NCHARS, INT64_FORMAT, stat->tuple_count); + snprintf(values[i++], NCHARS, INT64_FORMAT, stat->tuple_len); + snprintf(values[i++], NCHARS, "%.2f", tuple_percent); + snprintf(values[i++], NCHARS, INT64_FORMAT, stat->dead_tuple_count); + snprintf(values[i++], NCHARS, INT64_FORMAT, stat->dead_tuple_len); + snprintf(values[i++], NCHARS, "%.2f", dead_tuple_percent); + snprintf(values[i++], NCHARS, INT64_FORMAT, stat->free_space); + snprintf(values[i++], NCHARS, "%.2f", free_percent); + + /* build a tuple */ + tuple = BuildTupleFromCStrings(attinmeta, values); + + /* make the tuple into a datum */ + return HeapTupleGetDatum(tuple); } /* ---------- @@ -165,20 +165,20 @@ build_pgstattuple_type(pgstattuple_type *stat, FunctionCallInfo fcinfo) Datum pgstattuple(PG_FUNCTION_ARGS) { - text *relname = PG_GETARG_TEXT_PP(0); - RangeVar *relrv; - Relation rel; + text *relname = PG_GETARG_TEXT_PP(0); + RangeVar *relrv; + Relation rel; - if (!superuser()) - ereport(ERROR, - (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), - (errmsg("must be superuser to use pgstattuple functions")))); + if (!superuser()) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + (errmsg("must be superuser to use pgstattuple functions")))); - /* open relation */ - relrv = makeRangeVarFromNameList(textToQualifiedNameList(relname)); - rel = relation_openrv(relrv, AccessShareLock); + /* open relation */ + relrv = makeRangeVarFromNameList(textToQualifiedNameList(relname)); + rel = relation_openrv(relrv, AccessShareLock); - PG_RETURN_DATUM(pgstat_relation(rel, fcinfo)); + PG_RETURN_DATUM(pgstat_relation(rel, fcinfo)); } /* @@ -191,46 +191,46 @@ pgstattuple(PG_FUNCTION_ARGS) Datum pgstattuple_v1_5(PG_FUNCTION_ARGS) { - text *relname = PG_GETARG_TEXT_PP(0); - RangeVar *relrv; - Relation rel; + text *relname = PG_GETARG_TEXT_PP(0); + RangeVar *relrv; + Relation rel; - /* open relation */ - relrv = makeRangeVarFromNameList(textToQualifiedNameList(relname)); - rel = relation_openrv(relrv, AccessShareLock); + /* open relation */ + relrv = makeRangeVarFromNameList(textToQualifiedNameList(relname)); + rel = relation_openrv(relrv, AccessShareLock); - PG_RETURN_DATUM(pgstat_relation(rel, fcinfo)); + PG_RETURN_DATUM(pgstat_relation(rel, fcinfo)); } /* Must keep superuser() check, see above. 
*/ Datum pgstattuplebyid(PG_FUNCTION_ARGS) { - Oid relid = PG_GETARG_OID(0); - Relation rel; + Oid relid = PG_GETARG_OID(0); + Relation rel; - if (!superuser()) - ereport(ERROR, - (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), - (errmsg("must be superuser to use pgstattuple functions")))); + if (!superuser()) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + (errmsg("must be superuser to use pgstattuple functions")))); - /* open relation */ - rel = relation_open(relid, AccessShareLock); + /* open relation */ + rel = relation_open(relid, AccessShareLock); - PG_RETURN_DATUM(pgstat_relation(rel, fcinfo)); + PG_RETURN_DATUM(pgstat_relation(rel, fcinfo)); } /* Remove superuser() check for 1.5 version, see above */ Datum pgstattuplebyid_v1_5(PG_FUNCTION_ARGS) { - Oid relid = PG_GETARG_OID(0); - Relation rel; + Oid relid = PG_GETARG_OID(0); + Relation rel; - /* open relation */ - rel = relation_open(relid, AccessShareLock); + /* open relation */ + rel = relation_open(relid, AccessShareLock); - PG_RETURN_DATUM(pgstat_relation(rel, fcinfo)); + PG_RETURN_DATUM(pgstat_relation(rel, fcinfo)); } /* @@ -239,73 +239,76 @@ pgstattuplebyid_v1_5(PG_FUNCTION_ARGS) static Datum pgstat_relation(Relation rel, FunctionCallInfo fcinfo) { - const char *err; - - /* - * Reject attempts to read non-local temporary relations; we would be - * likely to get wrong data since we have no visibility into the owning - * session's local buffers. - */ - if (RELATION_IS_OTHER_TEMP(rel)) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("cannot access temporary tables of other sessions"))); - - switch (rel->rd_rel->relkind) - { - case RELKIND_RELATION: - case RELKIND_MATVIEW: - case RELKIND_TOASTVALUE: - case RELKIND_SEQUENCE: - return pgstat_heap(rel, fcinfo); - case RELKIND_INDEX: - switch (rel->rd_rel->relam) - { - case BTREE_AM_OID: - return pgstat_index(rel, BTREE_METAPAGE + 1, - pgstat_btree_page, fcinfo); - case HASH_AM_OID: - return pgstat_index(rel, HASH_METAPAGE + 1, - pgstat_hash_page, fcinfo); - case GIST_AM_OID: - return pgstat_index(rel, GIST_ROOT_BLKNO + 1, - pgstat_gist_page, fcinfo); - case GIN_AM_OID: - err = "gin index"; - break; - case SPGIST_AM_OID: - err = "spgist index"; - break; - case BRIN_AM_OID: - err = "brin index"; - break; - default: - err = "unknown index"; - break; - } - break; - case RELKIND_VIEW: - err = "view"; - break; - case RELKIND_COMPOSITE_TYPE: - err = "composite type"; - break; - case RELKIND_FOREIGN_TABLE: - err = "foreign table"; - break; - case RELKIND_PARTITIONED_TABLE: - err = "partitioned table"; - break; - default: - err = "unknown"; - break; - } - - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("\"%s\" (%s) is not supported", - RelationGetRelationName(rel), err))); - return 0; /* should not happen */ + const char *err; + + /* + * Reject attempts to read non-local temporary relations; we would be + * likely to get wrong data since we have no visibility into the owning + * session's local buffers. 
+ */ + if (RELATION_IS_OTHER_TEMP(rel)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot access temporary tables of other sessions"))); + + switch (rel->rd_rel->relkind) + { + case RELKIND_RELATION: + case RELKIND_MATVIEW: + case RELKIND_TOASTVALUE: + case RELKIND_SEQUENCE: + return pgstat_heap(rel, fcinfo); + case RELKIND_INDEX: + switch (rel->rd_rel->relam) + { + case BTREE_AM_OID: + return pgstat_index(rel, BTREE_METAPAGE + 1, + pgstat_btree_page, fcinfo); + case HASH_AM_OID: + return pgstat_index(rel, HASH_METAPAGE + 1, + pgstat_hash_page, fcinfo); + case GIST_AM_OID: + return pgstat_index(rel, GIST_ROOT_BLKNO + 1, + pgstat_gist_page, fcinfo); + case GIN_AM_OID: + err = "gin index"; + break; + case SPGIST_AM_OID: + err = "spgist index"; + break; + case BRIN_AM_OID: + err = "brin index"; + break; + default: + err = "unknown index"; + break; + } + break; + case RELKIND_VIEW: + err = "view"; + break; + case RELKIND_COMPOSITE_TYPE: + err = "composite type"; + break; + case RELKIND_FOREIGN_TABLE: + err = "foreign table"; + break; + case RELKIND_PARTITIONED_TABLE: + err = "partitioned table"; + break; + case RELKIND_PARTITIONED_INDEX: + err = "partitioned index"; + break; + default: + err = "unknown"; + break; + } + + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("\"%s\" (%s) is not supported", + RelationGetRelationName(rel), err))); + return 0; /* should not happen */ } /* @@ -314,81 +317,81 @@ pgstat_relation(Relation rel, FunctionCallInfo fcinfo) static Datum pgstat_heap(Relation rel, FunctionCallInfo fcinfo) { - HeapScanDesc scan; - HeapTuple tuple; - BlockNumber nblocks; - BlockNumber block = 0; /* next block to count free space in */ - BlockNumber tupblock; - Buffer buffer; - pgstattuple_type stat = {0}; - SnapshotData SnapshotDirty; - - /* Disable syncscan because we assume we scan from block zero upwards */ - scan = heap_beginscan_strat(rel, SnapshotAny, 0, NULL, true, false); - InitDirtySnapshot(SnapshotDirty); - - nblocks = scan->rs_nblocks; /* # blocks to be scanned */ - - /* scan the relation */ - while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL) - { - CHECK_FOR_INTERRUPTS(); - - /* must hold a buffer lock to call HeapTupleSatisfiesVisibility */ - LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE); - - if (HeapTupleSatisfiesVisibility(tuple, &SnapshotDirty, scan->rs_cbuf)) - { - stat.tuple_len += tuple->t_len; - stat.tuple_count++; - } - else - { - stat.dead_tuple_len += tuple->t_len; - stat.dead_tuple_count++; - } - - LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK); - - /* - * To avoid physically reading the table twice, try to do the - * free-space scan in parallel with the heap scan. However, - * heap_getnext may find no tuples on a given page, so we cannot - * simply examine the pages returned by the heap scan. 
- */ - tupblock = ItemPointerGetBlockNumber(&tuple->t_self); - - while (block <= tupblock) - { - CHECK_FOR_INTERRUPTS(); - - buffer = ReadBufferExtended(rel, MAIN_FORKNUM, block, - RBM_NORMAL, scan->rs_strategy); - LockBuffer(buffer, BUFFER_LOCK_SHARE); - stat.free_space += PageGetHeapFreeSpace((Page) BufferGetPage(buffer)); - UnlockReleaseBuffer(buffer); - block++; - } - } - - while (block < nblocks) - { - CHECK_FOR_INTERRUPTS(); - - buffer = ReadBufferExtended(rel, MAIN_FORKNUM, block, - RBM_NORMAL, scan->rs_strategy); - LockBuffer(buffer, BUFFER_LOCK_SHARE); - stat.free_space += PageGetHeapFreeSpace((Page) BufferGetPage(buffer)); - UnlockReleaseBuffer(buffer); - block++; - } - - heap_endscan(scan); - relation_close(rel, AccessShareLock); - - stat.table_len = (uint64) nblocks * BLCKSZ; - - return build_pgstattuple_type(&stat, fcinfo); + HeapScanDesc scan; + HeapTuple tuple; + BlockNumber nblocks; + BlockNumber block = 0; /* next block to count free space in */ + BlockNumber tupblock; + Buffer buffer; + pgstattuple_type stat = {0}; + SnapshotData SnapshotDirty; + + /* Disable syncscan because we assume we scan from block zero upwards */ + scan = heap_beginscan_strat(rel, SnapshotAny, 0, NULL, true, false); + InitDirtySnapshot(SnapshotDirty); + + nblocks = scan->rs_nblocks; /* # blocks to be scanned */ + + /* scan the relation */ + while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL) + { + CHECK_FOR_INTERRUPTS(); + + /* must hold a buffer lock to call HeapTupleSatisfiesVisibility */ + LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE); + + if (HeapTupleSatisfiesVisibility(tuple, &SnapshotDirty, scan->rs_cbuf)) + { + stat.tuple_len += tuple->t_len; + stat.tuple_count++; + } + else + { + stat.dead_tuple_len += tuple->t_len; + stat.dead_tuple_count++; + } + + LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK); + + /* + * To avoid physically reading the table twice, try to do the + * free-space scan in parallel with the heap scan. However, + * heap_getnext may find no tuples on a given page, so we cannot + * simply examine the pages returned by the heap scan. 
+ */ + tupblock = ItemPointerGetBlockNumber(&tuple->t_self); + + while (block <= tupblock) + { + CHECK_FOR_INTERRUPTS(); + + buffer = ReadBufferExtended(rel, MAIN_FORKNUM, block, + RBM_NORMAL, scan->rs_strategy); + LockBuffer(buffer, BUFFER_LOCK_SHARE); + stat.free_space += PageGetHeapFreeSpace((Page) BufferGetPage(buffer)); + UnlockReleaseBuffer(buffer); + block++; + } + } + + while (block < nblocks) + { + CHECK_FOR_INTERRUPTS(); + + buffer = ReadBufferExtended(rel, MAIN_FORKNUM, block, + RBM_NORMAL, scan->rs_strategy); + LockBuffer(buffer, BUFFER_LOCK_SHARE); + stat.free_space += PageGetHeapFreeSpace((Page) BufferGetPage(buffer)); + UnlockReleaseBuffer(buffer); + block++; + } + + heap_endscan(scan); + relation_close(rel, AccessShareLock); + + stat.table_len = (uint64) nblocks * BLCKSZ; + + return build_pgstattuple_type(&stat, fcinfo); } /* @@ -396,43 +399,43 @@ pgstat_heap(Relation rel, FunctionCallInfo fcinfo) */ static void pgstat_btree_page(pgstattuple_type *stat, Relation rel, BlockNumber blkno, - BufferAccessStrategy bstrategy) + BufferAccessStrategy bstrategy) { - Buffer buf; - Page page; - - buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, bstrategy); - LockBuffer(buf, BT_READ); - page = BufferGetPage(buf); - - /* Page is valid, see what to do with it */ - if (PageIsNew(page)) - { - /* fully empty page */ - stat->free_space += BLCKSZ; - } - else - { - BTPageOpaque opaque; - - opaque = (BTPageOpaque) PageGetSpecialPointer(page); - if (opaque->btpo_flags & (BTP_DELETED | BTP_HALF_DEAD)) - { - /* recyclable page */ - stat->free_space += BLCKSZ; - } - else if (P_ISLEAF(opaque)) - { - pgstat_index_page(stat, page, P_FIRSTDATAKEY(opaque), - PageGetMaxOffsetNumber(page)); - } - else - { - /* root or node */ - } - } - - _bt_relbuf(rel, buf); + Buffer buf; + Page page; + + buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, bstrategy); + LockBuffer(buf, BT_READ); + page = BufferGetPage(buf); + + /* Page is valid, see what to do with it */ + if (PageIsNew(page)) + { + /* fully empty page */ + stat->free_space += BLCKSZ; + } + else + { + BTPageOpaque opaque; + + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + if (opaque->btpo_flags & (BTP_DELETED | BTP_HALF_DEAD)) + { + /* recyclable page */ + stat->free_space += BLCKSZ; + } + else if (P_ISLEAF(opaque)) + { + pgstat_index_page(stat, page, P_FIRSTDATAKEY(opaque), + PageGetMaxOffsetNumber(page)); + } + else + { + /* root or node */ + } + } + + _bt_relbuf(rel, buf); } /* @@ -440,41 +443,41 @@ pgstat_btree_page(pgstattuple_type *stat, Relation rel, BlockNumber blkno, */ static void pgstat_hash_page(pgstattuple_type *stat, Relation rel, BlockNumber blkno, - BufferAccessStrategy bstrategy) + BufferAccessStrategy bstrategy) { - Buffer buf; - Page page; - - buf = _hash_getbuf_with_strategy(rel, blkno, HASH_READ, 0, bstrategy); - page = BufferGetPage(buf); - - if (PageGetSpecialSize(page) == MAXALIGN(sizeof(HashPageOpaqueData))) - { - HashPageOpaque opaque; - - opaque = (HashPageOpaque) PageGetSpecialPointer(page); - switch (opaque->hasho_flag & LH_PAGE_TYPE) - { - case LH_UNUSED_PAGE: - stat->free_space += BLCKSZ; - break; - case LH_BUCKET_PAGE: - case LH_OVERFLOW_PAGE: - pgstat_index_page(stat, page, FirstOffsetNumber, - PageGetMaxOffsetNumber(page)); - break; - case LH_BITMAP_PAGE: - case LH_META_PAGE: - default: - break; - } - } - else - { - /* maybe corrupted */ - } - - _hash_relbuf(rel, buf); + Buffer buf; + Page page; + + buf = _hash_getbuf_with_strategy(rel, blkno, HASH_READ, 0, bstrategy); + page = 
BufferGetPage(buf); + + if (PageGetSpecialSize(page) == MAXALIGN(sizeof(HashPageOpaqueData))) + { + HashPageOpaque opaque; + + opaque = (HashPageOpaque) PageGetSpecialPointer(page); + switch (opaque->hasho_flag & LH_PAGE_TYPE) + { + case LH_UNUSED_PAGE: + stat->free_space += BLCKSZ; + break; + case LH_BUCKET_PAGE: + case LH_OVERFLOW_PAGE: + pgstat_index_page(stat, page, FirstOffsetNumber, + PageGetMaxOffsetNumber(page)); + break; + case LH_BITMAP_PAGE: + case LH_META_PAGE: + default: + break; + } + } + else + { + /* maybe corrupted */ + } + + _hash_relbuf(rel, buf); } /* @@ -482,27 +485,27 @@ pgstat_hash_page(pgstattuple_type *stat, Relation rel, BlockNumber blkno, */ static void pgstat_gist_page(pgstattuple_type *stat, Relation rel, BlockNumber blkno, - BufferAccessStrategy bstrategy) + BufferAccessStrategy bstrategy) { - Buffer buf; - Page page; - - buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, bstrategy); - LockBuffer(buf, GIST_SHARE); - gistcheckpage(rel, buf); - page = BufferGetPage(buf); - - if (GistPageIsLeaf(page)) - { - pgstat_index_page(stat, page, FirstOffsetNumber, - PageGetMaxOffsetNumber(page)); - } - else - { - /* root or node */ - } - - UnlockReleaseBuffer(buf); + Buffer buf; + Page page; + + buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, bstrategy); + LockBuffer(buf, GIST_SHARE); + gistcheckpage(rel, buf); + page = BufferGetPage(buf); + + if (GistPageIsLeaf(page)) + { + pgstat_index_page(stat, page, FirstOffsetNumber, + PageGetMaxOffsetNumber(page)); + } + else + { + /* root or node */ + } + + UnlockReleaseBuffer(buf); } /* @@ -510,43 +513,43 @@ pgstat_gist_page(pgstattuple_type *stat, Relation rel, BlockNumber blkno, */ static Datum pgstat_index(Relation rel, BlockNumber start, pgstat_page pagefn, - FunctionCallInfo fcinfo) + FunctionCallInfo fcinfo) { - BlockNumber nblocks; - BlockNumber blkno; - BufferAccessStrategy bstrategy; - pgstattuple_type stat = {0}; + BlockNumber nblocks; + BlockNumber blkno; + BufferAccessStrategy bstrategy; + pgstattuple_type stat = {0}; - /* prepare access strategy for this index */ - bstrategy = GetAccessStrategy(BAS_BULKREAD); + /* prepare access strategy for this index */ + bstrategy = GetAccessStrategy(BAS_BULKREAD); - blkno = start; - for (;;) - { - /* Get the current relation length */ - LockRelationForExtension(rel, ExclusiveLock); - nblocks = RelationGetNumberOfBlocks(rel); - UnlockRelationForExtension(rel, ExclusiveLock); + blkno = start; + for (;;) + { + /* Get the current relation length */ + LockRelationForExtension(rel, ExclusiveLock); + nblocks = RelationGetNumberOfBlocks(rel); + UnlockRelationForExtension(rel, ExclusiveLock); - /* Quit if we've scanned the whole relation */ - if (blkno >= nblocks) - { - stat.table_len = (uint64) nblocks * BLCKSZ; + /* Quit if we've scanned the whole relation */ + if (blkno >= nblocks) + { + stat.table_len = (uint64) nblocks * BLCKSZ; - break; - } + break; + } - for (; blkno < nblocks; blkno++) - { - CHECK_FOR_INTERRUPTS(); + for (; blkno < nblocks; blkno++) + { + CHECK_FOR_INTERRUPTS(); - pagefn(&stat, rel, blkno, bstrategy); - } - } + pagefn(&stat, rel, blkno, bstrategy); + } + } - relation_close(rel, AccessShareLock); + relation_close(rel, AccessShareLock); - return build_pgstattuple_type(&stat, fcinfo); + return build_pgstattuple_type(&stat, fcinfo); } /* @@ -554,25 +557,25 @@ pgstat_index(Relation rel, BlockNumber start, pgstat_page pagefn, */ static void pgstat_index_page(pgstattuple_type *stat, Page page, - OffsetNumber minoff, OffsetNumber maxoff) + 
OffsetNumber minoff, OffsetNumber maxoff) { - OffsetNumber i; - - stat->free_space += PageGetFreeSpace(page); - - for (i = minoff; i <= maxoff; i = OffsetNumberNext(i)) - { - ItemId itemid = PageGetItemId(page, i); - - if (ItemIdIsDead(itemid)) - { - stat->dead_tuple_count++; - stat->dead_tuple_len += ItemIdGetLength(itemid); - } - else - { - stat->tuple_count++; - stat->tuple_len += ItemIdGetLength(itemid); - } - } + OffsetNumber i; + + stat->free_space += PageGetFreeSpace(page); + + for (i = minoff; i <= maxoff; i = OffsetNumberNext(i)) + { + ItemId itemid = PageGetItemId(page, i); + + if (ItemIdIsDead(itemid)) + { + stat->dead_tuple_count++; + stat->dead_tuple_len += ItemIdGetLength(itemid); + } + else + { + stat->tuple_count++; + stat->tuple_len += ItemIdGetLength(itemid); + } + } } diff --git a/contrib/pgstattuple/sql/pgstattuple.sql b/contrib/pgstattuple/sql/pgstattuple.sql index 8eb5fd2c..8b7beb0c 100644 --- a/contrib/pgstattuple/sql/pgstattuple.sql +++ b/contrib/pgstattuple/sql/pgstattuple.sql @@ -64,8 +64,10 @@ select pgstatginindex('test_hashidx'); -- check that using any of these functions with unsupported relations will fail create table test_partitioned (a int) partition by range (a); +create index test_partitioned_index on test_partitioned(a); -- these should all fail select pgstattuple('test_partitioned'); +select pgstattuple('test_partitioned_index'); select pgstattuple_approx('test_partitioned'); select pg_relpages('test_partitioned'); select pgstatindex('test_partitioned'); From 7a29480302c53e7c94fd85f0ec06656f26fb2e45 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Mon, 29 Jun 2020 20:04:50 +0800 Subject: [PATCH 254/578] psql: have \d show FKs on partitioned tables --- src/bin/psql/describe.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/bin/psql/describe.c b/src/bin/psql/describe.c index f198c238..1c671aca 100644 --- a/src/bin/psql/describe.c +++ b/src/bin/psql/describe.c @@ -2240,8 +2240,13 @@ describeOneTableDetails(const char *schemaname, PQclear(result); } - /* print foreign-key constraints (there are none if no triggers) */ - if (tableinfo.hastriggers) + /* + * Print foreign-key constraints (there are none if no triggers, + * except if the table is partitioned, in which case the triggers + * appear in the partitions) + */ + if (tableinfo.hastriggers || + tableinfo.relkind == RELKIND_PARTITIONED_TABLE) { printfPQExpBuffer(&buf, "SELECT conname,\n" From c44038c0a8e33fdd5c282cd4f1fe80ff17647186 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Mon, 29 Jun 2020 20:41:58 +0800 Subject: [PATCH 255/578] Don't needlessly check the partition contraint twice.http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/backend/commands/copy.c | 31 ++++++++++------------ src/backend/executor/execMain.c | 31 ++++++++++++---------- src/backend/executor/execPartition.c | 5 ++-- src/backend/executor/execReplication.c | 8 ++++-- src/backend/executor/nodeModifyTable.c | 36 +++++++++++--------------- src/include/executor/executor.h | 5 ++-- 6 files changed, 55 insertions(+), 61 deletions(-) diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c index e3e67988..5b7eb4b9 100644 --- a/src/backend/commands/copy.c +++ b/src/backend/commands/copy.c @@ -3548,26 +3548,21 @@ CopyFrom(CopyState cstate) } else { + /* Check the constraints of the tuple */ + if (resultRelInfo->ri_RelationDesc->rd_att->constr) + ExecConstraints(resultRelInfo, slot, estate); + /* - * We always check the partition constraint, 
including when - * the tuple got here via tuple-routing. However we don't - * need to in the latter case if no BR trigger is defined on - * the partition. Note that a BR trigger might modify the - * tuple such that the partition constraint is no longer - * satisfied, so we need to check in that case. + * Also check the tuple against the partition constraint, if + * there is one; except that if we got here via tuple-routing, + * we don't need to if there's no BR trigger defined on the + * partition. */ - bool check_partition_constr = - (resultRelInfo->ri_PartitionCheck != NIL); - - if (saved_resultRelInfo != NULL && - !(resultRelInfo->ri_TrigDesc && - resultRelInfo->ri_TrigDesc->trig_insert_before_row)) - check_partition_constr = false; - - /* Check the constraints of the tuple */ - if (resultRelInfo->ri_RelationDesc->rd_att->constr || - check_partition_constr) - ExecConstraints(resultRelInfo, slot, estate, true); + if (resultRelInfo->ri_PartitionCheck && + (saved_resultRelInfo == NULL || + (resultRelInfo->ri_TrigDesc && + resultRelInfo->ri_TrigDesc->trig_insert_before_row))) + ExecPartitionCheck(resultRelInfo, slot, estate, true); #ifdef _MLS_ if (is_mls_user()) diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c index 63de1a27..5a082133 100644 --- a/src/backend/executor/execMain.c +++ b/src/backend/executor/execMain.c @@ -2209,14 +2209,16 @@ ExecRelCheck(ResultRelInfo *resultRelInfo, /* * ExecPartitionCheck --- check that tuple meets the partition constraint. * - * Exported in executor.h for outside use. - * Returns true if it meets the partition constraint, else returns false. + * Returns true if it meets the partition constraint. If the constraint + * fails and we're asked to emit to error, do so and don't return; otherwise + * return false. */ bool ExecPartitionCheck(ResultRelInfo *resultRelInfo, TupleTableSlot *slot, - EState *estate) + EState *estate, bool emitError) { ExprContext *econtext; + bool success; /* * If first time through, build expression state tree for the partition @@ -2243,7 +2245,13 @@ ExecPartitionCheck(ResultRelInfo *resultRelInfo, TupleTableSlot *slot, * As in case of the catalogued constraints, we treat a NULL result as * success here, not a failure. */ - return ExecCheck(resultRelInfo->ri_PartitionCheckExpr, econtext); + success = ExecCheck(resultRelInfo->ri_PartitionCheckExpr, econtext); + + /* if asked to emit error, don't actually return on failure */ + if (!success && emitError) + ExecPartitionCheckEmitError(resultRelInfo, slot, estate); + + return success; } /* @@ -2303,17 +2311,17 @@ ExecPartitionCheckEmitError(ResultRelInfo *resultRelInfo, /* * ExecConstraints - check constraints of the tuple in 'slot' * - * This checks the traditional NOT NULL and check constraints, and if - * requested, checks the partition constraint. + * This checks the traditional NOT NULL and check constraints. + * + * The partition constraint is *NOT* checked. * * Note: 'slot' contains the tuple to check the constraints of, which may * have been converted from the original input tuple after tuple routing. - * 'resultRelInfo' is the original result relation, before tuple routing. + * 'resultRelInfo' is the final result relation, after tuple routing. 
*/ void ExecConstraints(ResultRelInfo *resultRelInfo, - TupleTableSlot *slot, EState *estate, - bool check_partition_constraint) + TupleTableSlot *slot, EState *estate) { Relation rel = resultRelInfo->ri_RelationDesc; TupleDesc tupdesc = RelationGetDescr(rel); @@ -2427,13 +2435,8 @@ ExecConstraints(ResultRelInfo *resultRelInfo, errtableconstraint(orig_rel, failed))); } } - - if (check_partition_constraint && resultRelInfo->ri_PartitionCheck && - !ExecPartitionCheck(resultRelInfo, slot, estate)) - ExecPartitionCheckEmitError(resultRelInfo, slot, estate); } - /* * ExecWithCheckOptions -- check that tuple satisfies any WITH CHECK OPTIONs * of the specified kind. diff --git a/src/backend/executor/execPartition.c b/src/backend/executor/execPartition.c index 50bc3754..1835d52a 100644 --- a/src/backend/executor/execPartition.c +++ b/src/backend/executor/execPartition.c @@ -206,9 +206,8 @@ ExecFindPartition(ResultRelInfo *resultRelInfo, PartitionDispatch *pd, * First check the root table's partition constraint, if any. No point in * routing the tuple if it doesn't belong in the root table itself. */ - if (resultRelInfo->ri_PartitionCheck && - !ExecPartitionCheck(resultRelInfo, slot, estate)) - ExecPartitionCheckEmitError(resultRelInfo, slot, estate); + if (resultRelInfo->ri_PartitionCheck) + ExecPartitionCheck(resultRelInfo, slot, estate, true); /* start with the root partitioned table */ parent = pd[0]; diff --git a/src/backend/executor/execReplication.c b/src/backend/executor/execReplication.c index c0b6f4a0..1cf551a0 100644 --- a/src/backend/executor/execReplication.c +++ b/src/backend/executor/execReplication.c @@ -404,7 +404,9 @@ ExecSimpleRelationInsert(EState *estate, TupleTableSlot *slot) /* Check the constraints of the tuple */ if (rel->rd_att->constr) - ExecConstraints(resultRelInfo, slot, estate, true); + ExecConstraints(resultRelInfo, slot, estate); + if (resultRelInfo->ri_PartitionCheck) + ExecPartitionCheck(resultRelInfo, slot, estate, true); #ifdef _MLS_ if (is_mls_user()) @@ -491,7 +493,9 @@ ExecSimpleRelationUpdate(EState *estate, EPQState *epqstate, /* Check the constraints of the tuple */ if (rel->rd_att->constr) - ExecConstraints(resultRelInfo, slot, estate, true); + ExecConstraints(resultRelInfo, slot, estate); + if (resultRelInfo->ri_PartitionCheck) + ExecPartitionCheck(resultRelInfo, slot, estate, true); #ifdef _MLS_ if (is_mls_user()) diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c index a8cb0df0..34a53370 100644 --- a/src/backend/executor/nodeModifyTable.c +++ b/src/backend/executor/nodeModifyTable.c @@ -513,16 +513,6 @@ ExecInsert(ModifyTableState *mtstate, { WCOKind wco_kind; - /* - * We always check the partition constraint, including when the tuple - * got here via tuple-routing. However we don't need to in the latter - * case if no BR trigger is defined on the partition. Note that a BR - * trigger might modify the tuple such that the partition constraint - * is no longer satisfied, so we need to check in that case. - */ - bool check_partition_constr = - (resultRelInfo->ri_PartitionCheck != NIL); - /* * Constraints might reference the tableoid column, so initialize * t_tableOid before evaluating them. @@ -549,17 +539,21 @@ ExecInsert(ModifyTableState *mtstate, ExecWithCheckOptions(wco_kind, resultRelInfo, slot, estate); /* - * No need though if the tuple has been routed, and a BR trigger - * doesn't exist. + * Check the constraints of the tuple. 
*/ - if (resultRelInfo->ri_PartitionRoot != NULL && - !(resultRelInfo->ri_TrigDesc && - resultRelInfo->ri_TrigDesc->trig_insert_before_row)) - check_partition_constr = false; + if (resultRelationDesc->rd_att->constr) + ExecConstraints(resultRelInfo, slot, estate); - /* Check the constraints of the tuple */ - if (resultRelationDesc->rd_att->constr || check_partition_constr) - ExecConstraints(resultRelInfo, slot, estate, true); + /* + * Also check the tuple against the partition constraint, if there is + * one; except that if we got here via tuple-routing, we don't need to + * if there's no BR trigger defined on the partition. + */ + if (resultRelInfo->ri_PartitionCheck && + (resultRelInfo->ri_PartitionRoot == NULL || + (resultRelInfo->ri_TrigDesc && + resultRelInfo->ri_TrigDesc->trig_insert_before_row))) + ExecPartitionCheck(resultRelInfo, slot, estate, true); #ifdef _MLS_ if (is_mls_user()) @@ -1354,7 +1348,7 @@ lreplace:; */ partition_constraint_failed = resultRelInfo->ri_PartitionCheck && - !ExecPartitionCheck(resultRelInfo, slot, estate); + !ExecPartitionCheck(resultRelInfo, slot, estate, false); if (!partition_constraint_failed && resultRelInfo->ri_WithCheckOptions != NIL) @@ -1473,7 +1467,7 @@ lreplace:; * checks. */ if (resultRelationDesc->rd_att->constr) - ExecConstraints(resultRelInfo, slot, estate, false); + ExecConstraints(resultRelInfo, slot, estate); #ifdef _MLS_ if (is_mls_user()) diff --git a/src/include/executor/executor.h b/src/include/executor/executor.h index a143cd77..72f62666 100644 --- a/src/include/executor/executor.h +++ b/src/include/executor/executor.h @@ -205,10 +205,9 @@ extern ResultRelInfo *ExecGetTriggerResultRel(EState *estate, Oid relid); extern void ExecCleanUpTriggerState(EState *estate); extern bool ExecContextForcesOids(PlanState *planstate, bool *hasoids); extern void ExecConstraints(ResultRelInfo *resultRelInfo, - TupleTableSlot *slot, EState *estate, - bool check_partition_constraint); -extern bool ExecPartitionCheck(ResultRelInfo *resultRelInfo, TupleTableSlot *slot, EState *estate); +extern bool ExecPartitionCheck(ResultRelInfo *resultRelInfo, + TupleTableSlot *slot, EState *estate, bool emitError); extern void ExecPartitionCheckEmitError(ResultRelInfo *resultRelInfo, TupleTableSlot *slot, EState *estate); extern void ExecWithCheckOptions(WCOKind kind, ResultRelInfo *resultRelInfo, From 19fcfde92868cd1a246fd418e08f0d5bbfed2c7e Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Mon, 29 Jun 2020 21:08:48 +0800 Subject: [PATCH 256/578] Fix FK checks of TRUNCATE involving partitioned tables.http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/backend/catalog/heap.c | 7 ++- src/test/regress/expected/truncate.out | 75 ++++++++++++++++++++++++++ src/test/regress/sql/truncate.sql | 47 ++++++++++++++++ 3 files changed, 127 insertions(+), 2 deletions(-) diff --git a/src/backend/catalog/heap.c b/src/backend/catalog/heap.c index 2f135c95..57e486f1 100644 --- a/src/backend/catalog/heap.c +++ b/src/backend/catalog/heap.c @@ -4057,13 +4057,16 @@ heap_truncate_check_FKs(List *relations, bool tempTables) * Build a list of OIDs of the interesting relations. * * If a relation has no triggers, then it can neither have FKs nor be - * referenced by a FK from another table, so we can ignore it. + * referenced by a FK from another table, so we can ignore it. For + * partitioned tables, FKs have no triggers, so we must include them + * anyway. 
*/ foreach(cell, relations) { Relation rel = lfirst(cell); - if (rel->rd_rel->relhastriggers) + if (rel->rd_rel->relhastriggers || + rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) oids = lappend_oid(oids, RelationGetRelid(rel)); } diff --git a/src/test/regress/expected/truncate.out b/src/test/regress/expected/truncate.out index d47b1080..168fc0cc 100644 --- a/src/test/regress/expected/truncate.out +++ b/src/test/regress/expected/truncate.out @@ -481,3 +481,78 @@ ERROR: cannot truncate only a partitioned table HINT: Do not specify the ONLY keyword, or use truncate only on the partitions directly. TRUNCATE truncparted; DROP TABLE truncparted; +-- foreign key on partitioned table: partition key is referencing column. +-- Make sure truncate did execute on all tables +CREATE FUNCTION tp_ins_data() RETURNS void LANGUAGE plpgsql AS $$ + BEGIN + INSERT INTO truncprim VALUES (1), (100), (150); + INSERT INTO truncpart VALUES (1), (100), (150); + END +$$; +CREATE FUNCTION tp_chk_data(OUT pktb regclass, OUT pkval int, OUT fktb regclass, OUT fkval int) + RETURNS SETOF record LANGUAGE plpgsql AS $$ + BEGIN + RETURN QUERY SELECT + pk.tableoid::regclass, pk.a, fk.tableoid::regclass, fk.a + FROM truncprim pk FULL JOIN truncpart fk USING (a) + ORDER BY 2, 4; + END +$$; +CREATE TABLE truncprim (a int PRIMARY KEY); +CREATE TABLE truncpart (a int REFERENCES truncprim) + PARTITION BY RANGE (a); +CREATE TABLE truncpart_1 PARTITION OF truncpart FOR VALUES FROM (0) TO (100); +CREATE TABLE truncpart_2 PARTITION OF truncpart FOR VALUES FROM (100) TO (200) + PARTITION BY RANGE (a); +CREATE TABLE truncpart_2_1 PARTITION OF truncpart_2 FOR VALUES FROM (100) TO (150); +CREATE TABLE truncpart_2_d PARTITION OF truncpart_2 DEFAULT; +TRUNCATE TABLE truncprim; -- should fail +ERROR: cannot truncate a table referenced in a foreign key constraint +DETAIL: Table "truncpart" references "truncprim". +HINT: Truncate table "truncpart" at the same time, or use TRUNCATE ... CASCADE. +select tp_ins_data(); + tp_ins_data +------------- + +(1 row) + +-- should truncate everything +TRUNCATE TABLE truncprim, truncpart; +select * from tp_chk_data(); + pktb | pkval | fktb | fkval +------+-------+------+------- +(0 rows) + +select tp_ins_data(); + tp_ins_data +------------- + +(1 row) + +-- should truncate everything +SET client_min_messages TO WARNING; -- suppress cascading notices +TRUNCATE TABLE truncprim CASCADE; +RESET client_min_messages; +SELECT * FROM tp_chk_data(); + pktb | pkval | fktb | fkval +------+-------+------+------- +(0 rows) + +SELECT tp_ins_data(); + tp_ins_data +------------- + +(1 row) + +-- should truncate all partitions +TRUNCATE TABLE truncpart; +SELECT * FROM tp_chk_data(); + pktb | pkval | fktb | fkval +-----------+-------+------+------- + truncprim | 1 | | + truncprim | 100 | | + truncprim | 150 | | +(3 rows) + +DROP TABLE truncprim, truncpart; +DROP FUNCTION tp_ins_data(), tp_chk_data(); diff --git a/src/test/regress/sql/truncate.sql b/src/test/regress/sql/truncate.sql index 7d25bc0b..afde2f66 100644 --- a/src/test/regress/sql/truncate.sql +++ b/src/test/regress/sql/truncate.sql @@ -244,3 +244,50 @@ INSERT INTO truncparted VALUES (1, 'a'); TRUNCATE ONLY truncparted; TRUNCATE truncparted; DROP TABLE truncparted; + +-- foreign key on partitioned table: partition key is referencing column. 
+-- Make sure truncate did execute on all tables +CREATE FUNCTION tp_ins_data() RETURNS void LANGUAGE plpgsql AS $$ + BEGIN + INSERT INTO truncprim VALUES (1), (100), (150); + INSERT INTO truncpart VALUES (1), (100), (150); + END +$$; +CREATE FUNCTION tp_chk_data(OUT pktb regclass, OUT pkval int, OUT fktb regclass, OUT fkval int) + RETURNS SETOF record LANGUAGE plpgsql AS $$ + BEGIN + RETURN QUERY SELECT + pk.tableoid::regclass, pk.a, fk.tableoid::regclass, fk.a + FROM truncprim pk FULL JOIN truncpart fk USING (a) + ORDER BY 2, 4; + END +$$; +CREATE TABLE truncprim (a int PRIMARY KEY); +CREATE TABLE truncpart (a int REFERENCES truncprim) + PARTITION BY RANGE (a); +CREATE TABLE truncpart_1 PARTITION OF truncpart FOR VALUES FROM (0) TO (100); +CREATE TABLE truncpart_2 PARTITION OF truncpart FOR VALUES FROM (100) TO (200) + PARTITION BY RANGE (a); +CREATE TABLE truncpart_2_1 PARTITION OF truncpart_2 FOR VALUES FROM (100) TO (150); +CREATE TABLE truncpart_2_d PARTITION OF truncpart_2 DEFAULT; + +TRUNCATE TABLE truncprim; -- should fail + +select tp_ins_data(); +-- should truncate everything +TRUNCATE TABLE truncprim, truncpart; +select * from tp_chk_data(); + +select tp_ins_data(); +-- should truncate everything +SET client_min_messages TO WARNING; -- suppress cascading notices +TRUNCATE TABLE truncprim CASCADE; +RESET client_min_messages; +SELECT * FROM tp_chk_data(); + +SELECT tp_ins_data(); +-- should truncate all partitions +TRUNCATE TABLE truncpart; +SELECT * FROM tp_chk_data(); +DROP TABLE truncprim, truncpart; +DROP FUNCTION tp_ins_data(), tp_chk_data(); From 0c76ec2f0e70cf39b142c418b1577141e5deebb8 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Mon, 29 Jun 2020 21:34:41 +0800 Subject: [PATCH 257/578] Improve performance of tuple conversion map generation --- src/backend/access/common/tupconvert.c | 53 ++++++++++++++++++-------- src/backend/optimizer/prep/prepunion.c | 41 ++++++++++---------- 2 files changed, 57 insertions(+), 37 deletions(-) diff --git a/src/backend/access/common/tupconvert.c b/src/backend/access/common/tupconvert.c index 0092ce37..6812689b 100644 --- a/src/backend/access/common/tupconvert.c +++ b/src/backend/access/common/tupconvert.c @@ -290,33 +290,55 @@ convert_tuples_by_name_map(TupleDesc indesc, const char *msg) {// #lizard forgives AttrNumber *attrMap; - int n; + int outnatts; + int innatts; int i; + int nextindesc = -1; - n = outdesc->natts; - attrMap = (AttrNumber *) palloc0(n * sizeof(AttrNumber)); - for (i = 0; i < n; i++) + outnatts = outdesc->natts; + innatts = indesc->natts; + + attrMap = (AttrNumber *) palloc0(outnatts * sizeof(AttrNumber)); + for (i = 0; i < outnatts; i++) { - Form_pg_attribute att = outdesc->attrs[i]; + Form_pg_attribute outatt = TupleDescAttr(outdesc, i); char *attname; Oid atttypid; int32 atttypmod; int j; - if (att->attisdropped) + if (outatt->attisdropped) continue; /* attrMap[i] is already 0 */ - attname = NameStr(att->attname); - atttypid = att->atttypid; - atttypmod = att->atttypmod; - for (j = 0; j < indesc->natts; j++) + attname = NameStr(outatt->attname); + atttypid = outatt->atttypid; + atttypmod = outatt->atttypmod; + + /* + * Now search for an attribute with the same name in the indesc. It + * seems likely that a partitioned table will have the attributes in + * the same order as the partition, so the search below is optimized + * for that case. It is possible that columns are dropped in one of + * the relations, but not the other, so we use the 'nextindesc' + * counter to track the starting point of the search. 
If the inner + * loop encounters dropped columns then it will have to skip over + * them, but it should leave 'nextindesc' at the correct position for + * the next outer loop. + */ + for (j = 0; j < innatts; j++) { - att = indesc->attrs[j]; - if (att->attisdropped) + Form_pg_attribute inatt; + + nextindesc++; + if (nextindesc >= innatts) + nextindesc = 0; + + inatt = TupleDescAttr(indesc, nextindesc); + if (inatt->attisdropped) continue; - if (strcmp(attname, NameStr(att->attname)) == 0) + if (strcmp(attname, NameStr(inatt->attname)) == 0) { /* Found it, check type */ - if (atttypid != att->atttypid || atttypmod != att->atttypmod) + if (atttypid != inatt->atttypid || atttypmod != inatt->atttypmod) ereport(ERROR, (errcode(ERRCODE_DATATYPE_MISMATCH), errmsg_internal("%s", _(msg)), @@ -324,7 +346,7 @@ convert_tuples_by_name_map(TupleDesc indesc, attname, format_type_be(outdesc->tdtypeid), format_type_be(indesc->tdtypeid)))); - attrMap[i] = (AttrNumber) (j + 1); + attrMap[i] = inatt->attnum; break; } } @@ -337,7 +359,6 @@ convert_tuples_by_name_map(TupleDesc indesc, format_type_be(outdesc->tdtypeid), format_type_be(indesc->tdtypeid)))); } - return attrMap; } diff --git a/src/backend/optimizer/prep/prepunion.c b/src/backend/optimizer/prep/prepunion.c index d2e6c3c6..a9c117f1 100644 --- a/src/backend/optimizer/prep/prepunion.c +++ b/src/backend/optimizer/prep/prepunion.c @@ -52,6 +52,7 @@ #include "utils/lsyscache.h" #include "utils/rel.h" #include "utils/selfuncs.h" +#include "utils/syscache.h" typedef struct @@ -1848,9 +1849,11 @@ make_inh_translation_list(Relation oldrelation, Relation newrelation, List *vars = NIL; TupleDesc old_tupdesc = RelationGetDescr(oldrelation); TupleDesc new_tupdesc = RelationGetDescr(newrelation); + Oid new_relid = RelationGetRelid(newrelation); int oldnatts = old_tupdesc->natts; int newnatts = new_tupdesc->natts; int old_attno; + int new_attno = 0; for (old_attno = 0; old_attno < oldnatts; old_attno++) { @@ -1859,7 +1862,6 @@ make_inh_translation_list(Relation oldrelation, Relation newrelation, Oid atttypid; int32 atttypmod; Oid attcollation; - int new_attno; att = old_tupdesc->attrs[old_attno]; if (att->attisdropped) @@ -1892,29 +1894,25 @@ make_inh_translation_list(Relation oldrelation, Relation newrelation, * Otherwise we have to search for the matching column by name. * There's no guarantee it'll have the same column position, because * of cases like ALTER TABLE ADD COLUMN and multiple inheritance. - * However, in simple cases it will be the same column number, so try - * that before we go groveling through all the columns. - * - * Note: the test for (att = ...) != NULL cannot fail, it's just a - * notational device to include the assignment into the if-clause. - */ - if (old_attno < newnatts && - (att = new_tupdesc->attrs[old_attno]) != NULL && - !att->attisdropped && att->attinhcount != 0 && - strcmp(attname, NameStr(att->attname)) == 0) - new_attno = old_attno; - else - { - for (new_attno = 0; new_attno < newnatts; new_attno++) + * However, in simple cases, the relative order of columns is mostly + * the same in both relations, so try the column of newrelation that + * follows immediately after the one that we just found, and if that + * fails, let syscache handle it. 
+ */ + if (new_attno >= newnatts || + (att = TupleDescAttr(new_tupdesc, new_attno))->attisdropped || + strcmp(attname, NameStr(att->attname)) != 0) { - att = new_tupdesc->attrs[new_attno]; - if (!att->attisdropped && att->attinhcount != 0 && - strcmp(attname, NameStr(att->attname)) == 0) - break; - } - if (new_attno >= newnatts) + HeapTuple newtup; + + newtup = SearchSysCacheAttName(new_relid, attname); + if (!newtup) elog(ERROR, "could not find inherited attribute \"%s\" of relation \"%s\"", attname, RelationGetRelationName(newrelation)); + new_attno = ((Form_pg_attribute) GETSTRUCT(newtup))->attnum - 1; + ReleaseSysCache(newtup); + + att = TupleDescAttr(new_tupdesc, new_attno); } /* Found it, check type and collation match */ @@ -1931,6 +1929,7 @@ make_inh_translation_list(Relation oldrelation, Relation newrelation, atttypmod, attcollation, 0)); + new_attno++; } *translated_vars = vars; From dbd5825b1fce442c2535ebb5fa2eded27efdef79 Mon Sep 17 00:00:00 2001 From: Alvaro Herrera Date: Mon, 16 Jul 2018 18:38:09 -0400 Subject: [PATCH 258/578] Fix partition pruning with IS [NOT] NULL clauses MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The original code was unable to prune partitions that could not possibly contain NULL values, when the query specified less than all columns in a multicolumn partition key. Reorder the if-tests so that it is, and add more commentary and regression tests. Reported-by: Ashutosh Bapat Co-authored-by: Dilip Kumar Co-authored-by: Amit Langote Co-authored-by: Álvaro Herrera Reviewed-by: Ashutosh Bapat Reviewed-by: amul sul Discussion: https://postgr.es/m/CAFjFpRc7qjLUfXLVBBC_HAnx644sjTYM=qVoT3TJ840HPbsTXw@mail.gmail.com --- src/backend/partitioning/partprune.c | 78 ++++++++++--------- src/test/regress/expected/partition_prune.out | 41 ++++++++++ src/test/regress/sql/partition_prune.sql | 7 ++ 3 files changed, 90 insertions(+), 36 deletions(-) diff --git a/src/backend/partitioning/partprune.c b/src/backend/partitioning/partprune.c index 03bacd1f..242267f2 100644 --- a/src/backend/partitioning/partprune.c +++ b/src/backend/partitioning/partprune.c @@ -642,54 +642,60 @@ gen_partprune_steps_internal(GeneratePruningStepsContext *context, } } - /* - * If generate_opsteps is set to false it means no OpExprs were directly - * present in the input list. + /*----------- + * Now generate some (more) pruning steps. We have three strategies: + * + * 1) Generate pruning steps based on IS NULL clauses: + * a) For list partitioning, null partition keys can only be found in + * the designated null-accepting partition, so if there are IS NULL + * clauses containing partition keys we should generate a pruning + * step that gets rid of all partitions but that one. We can + * disregard any OpExpr we may have found. + * b) For range partitioning, only the default partition can contain + * NULL values, so the same rationale applies. + * c) For hash partitioning, we only apply this strategy if we have + * IS NULL clauses for all the keys. Strategy 2 below will take + * care of the case where some keys have OpExprs and others have + * IS NULL clauses. + * + * 2) If not, generate steps based on OpExprs we have (if any). + * + * 3) If this doesn't work either, we may be able to generate steps to + * prune just the null-accepting partition (if one exists), if we have + * IS NOT NULL clauses for all partition keys. 
*/ - if (!generate_opsteps) + if (!bms_is_empty(nullkeys) && + (part_scheme->strategy == PARTITION_STRATEGY_LIST || + part_scheme->strategy == PARTITION_STRATEGY_RANGE || + (part_scheme->strategy == PARTITION_STRATEGY_HASH && + bms_num_members(nullkeys) == part_scheme->partnatts))) { - /* - * Generate one prune step for the information derived from IS NULL, - * if any. To prune hash partitions, we must have found IS NULL - * clauses for all partition keys. - */ - if (!bms_is_empty(nullkeys) && - (part_scheme->strategy != PARTITION_STRATEGY_HASH || - bms_num_members(nullkeys) == part_scheme->partnatts)) - { - PartitionPruneStep *step; - - step = gen_prune_step_op(context, InvalidStrategy, - false, NIL, NIL, nullkeys); - result = lappend(result, step); - } - - /* - * Note that for IS NOT NULL clauses, simply having step suffices; - * there is no need to propagate the exact details of which keys are - * required to be NOT NULL. Hash partitioning expects to see actual - * values to perform any pruning. - */ - if (!bms_is_empty(notnullkeys) && - part_scheme->strategy != PARTITION_STRATEGY_HASH) - { - PartitionPruneStep *step; + PartitionPruneStep *step; - step = gen_prune_step_op(context, InvalidStrategy, - false, NIL, NIL, NULL); - result = lappend(result, step); - } + /* Strategy 1 */ + step = gen_prune_step_op(context, InvalidStrategy, + false, NIL, NIL, nullkeys); + result = lappend(result, step); } - else + else if (generate_opsteps) { PartitionPruneStep *step; - /* Generate pruning steps from OpExpr clauses in keyclauses. */ + /* Strategy 2 */ step = gen_prune_steps_from_opexps(part_scheme, context, keyclauses, nullkeys); if (step != NULL) result = lappend(result, step); } + else if (bms_num_members(notnullkeys) == part_scheme->partnatts) + { + PartitionPruneStep *step; + + /* Strategy 3 */ + step = gen_prune_step_op(context, InvalidStrategy, + false, NIL, NIL, NULL); + result = lappend(result, step); + } /* * Finally, results from all entries appearing in result should be diff --git a/src/test/regress/expected/partition_prune.out b/src/test/regress/expected/partition_prune.out index b91cac4b..5db871b4 100644 --- a/src/test/regress/expected/partition_prune.out +++ b/src/test/regress/expected/partition_prune.out @@ -993,6 +993,47 @@ explain (costs off) select * from mc2p where a = 1 and b > 1; Filter: ((b > 1) AND (a = 1)) (3 rows) +-- all partitions but the default one should be pruned +explain (costs off) select * from mc2p where a = 1 and b is null; + QUERY PLAN +------------------------------------------- + Append + -> Seq Scan on mc2p_default + Filter: ((b IS NULL) AND (a = 1)) +(3 rows) + +explain (costs off) select * from mc2p where a is null and b is null; + QUERY PLAN +----------------------------------------------- + Append + -> Seq Scan on mc2p_default + Filter: ((a IS NULL) AND (b IS NULL)) +(3 rows) + +explain (costs off) select * from mc2p where a is null and b = 1; + QUERY PLAN +------------------------------------------- + Append + -> Seq Scan on mc2p_default + Filter: ((a IS NULL) AND (b = 1)) +(3 rows) + +explain (costs off) select * from mc2p where a is null; + QUERY PLAN +-------------------------------- + Append + -> Seq Scan on mc2p_default + Filter: (a IS NULL) +(3 rows) + +explain (costs off) select * from mc2p where b is null; + QUERY PLAN +-------------------------------- + Append + -> Seq Scan on mc2p_default + Filter: (b IS NULL) +(3 rows) + -- boolean partitioning create table boolpart (a bool) partition by list (a); create table boolpart_default partition of 
boolpart default; diff --git a/src/test/regress/sql/partition_prune.sql b/src/test/regress/sql/partition_prune.sql index 164b74ee..0a812c9c 100644 --- a/src/test/regress/sql/partition_prune.sql +++ b/src/test/regress/sql/partition_prune.sql @@ -137,6 +137,13 @@ explain (costs off) select * from mc2p where a = 2 and b < 1; explain (costs off) select * from mc2p where a > 1; explain (costs off) select * from mc2p where a = 1 and b > 1; +-- all partitions but the default one should be pruned +explain (costs off) select * from mc2p where a = 1 and b is null; +explain (costs off) select * from mc2p where a is null and b is null; +explain (costs off) select * from mc2p where a is null and b = 1; +explain (costs off) select * from mc2p where a is null; +explain (costs off) select * from mc2p where b is null; + -- boolean partitioning create table boolpart (a bool) partition by list (a); create table boolpart_default partition of boolpart default; From 0b2016b79f6bfb0d23d7d1a01fd3ddee25c1b881 Mon Sep 17 00:00:00 2001 From: Michael Paquier Date: Thu, 19 Jul 2018 09:01:57 +0900 Subject: [PATCH 259/578] Fix re-parameterize of MergeAppendPath Instead of MergeAppendPath, MergeAppend nodes were considered. This code is not covered by any tests now, which should be addressed at some point. This is an oversight from f49842d, which introduced partition-wise joins in v11, so back-patch down to that. Author: Michael Paquier Reviewed-by: Ashutosh Bapat Discussion: https://postgr.es/m/20180718062202.GC8565@paquier.xyz --- src/backend/optimizer/util/pathnode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index 0a6735d1..4d2a1f32 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -7153,7 +7153,7 @@ do { \ } break; - case T_MergeAppend: + case T_MergeAppendPath: { MergeAppendPath *mapath; From 4e4a47bdd8fb342f1a50575648b00c3c2facc751 Mon Sep 17 00:00:00 2001 From: Alvaro Herrera Date: Mon, 30 Jul 2018 17:18:42 -0400 Subject: [PATCH 260/578] Change bms_add_range to be a no-op for empty ranges In commit 84940644de93, bms_add_range was added with an API to fail with an error if an empty range was specified. This seems arbitrary and unhelpful, so turn that case into a no-op instead. Callers that require further verification on the arguments or result can apply them by themselves. This fixes the bug that partition pruning throws an API error for a case involving the default partition of a default partition, as in the included test case. 
Reported-by: Rajkumar Raghuwanshi Diagnosed-by: Tom Lane Discussion: https://postgr.es/m/16590.1532622503@sss.pgh.pa.us --- src/backend/nodes/bitmapset.c | 7 +++++-- src/test/regress/expected/partition_prune.out | 15 +++++++++++++++ src/test/regress/sql/partition_prune.sql | 7 +++++++ 3 files changed, 27 insertions(+), 2 deletions(-) diff --git a/src/backend/nodes/bitmapset.c b/src/backend/nodes/bitmapset.c index 8ec465d2..f4b56e9f 100644 --- a/src/backend/nodes/bitmapset.c +++ b/src/backend/nodes/bitmapset.c @@ -960,6 +960,10 @@ bms_add_range(Bitmapset *a, int lower, int upper) ushiftbits, wordnum; + /* do nothing if nothing is called for, without further checking */ + if (upper < lower) + return a; + if (lower < 0 || upper < 0) elog(ERROR, "negative bitmapset member not allowed"); if (lower > upper) @@ -971,13 +975,12 @@ bms_add_range(Bitmapset *a, int lower, int upper) a = (Bitmapset *) palloc0(BITMAPSET_SIZE(uwordnum + 1)); a->nwords = uwordnum + 1; } - - /* ensure we have enough words to store the upper bit */ else if (uwordnum >= a->nwords) { int oldnwords = a->nwords; int i; + /* ensure we have enough words to store the upper bit */ a = (Bitmapset *) repalloc(a, BITMAPSET_SIZE(uwordnum + 1)); a->nwords = uwordnum + 1; /* zero out the enlarged portion */ diff --git a/src/test/regress/expected/partition_prune.out b/src/test/regress/expected/partition_prune.out index 5db871b4..94bceb8d 100644 --- a/src/test/regress/expected/partition_prune.out +++ b/src/test/regress/expected/partition_prune.out @@ -1188,6 +1188,21 @@ explain (costs off) select * from coercepart where a !~ all ('{ab,bc}'); (7 rows) drop table coercepart; +CREATE TABLE part (a INT, b INT) PARTITION BY LIST (a); +CREATE TABLE part_p1 PARTITION OF part FOR VALUES IN (-2,-1,0,1,2); +CREATE TABLE part_p2 PARTITION OF part DEFAULT PARTITION BY RANGE(a); +CREATE TABLE part_p2_p1 PARTITION OF part_p2 DEFAULT; +INSERT INTO part VALUES (-1,-1), (1,1), (2,NULL), (NULL,-2),(NULL,NULL); +EXPLAIN (COSTS OFF) SELECT tableoid::regclass as part, a, b FROM part WHERE a IS NULL ORDER BY 1, 2, 3; + QUERY PLAN +--------------------------------------------------------------------------- + Sort + Sort Key: ((part_p2_p1.tableoid)::regclass), part_p2_p1.a, part_p2_p1.b + -> Append + -> Seq Scan on part_p2_p1 + Filter: (a IS NULL) +(5 rows) + -- -- some more cases -- diff --git a/src/test/regress/sql/partition_prune.sql b/src/test/regress/sql/partition_prune.sql index 0a812c9c..4862cdfd 100644 --- a/src/test/regress/sql/partition_prune.sql +++ b/src/test/regress/sql/partition_prune.sql @@ -173,6 +173,13 @@ explain (costs off) select * from coercepart where a !~ all ('{ab,bc}'); drop table coercepart; +CREATE TABLE part (a INT, b INT) PARTITION BY LIST (a); +CREATE TABLE part_p1 PARTITION OF part FOR VALUES IN (-2,-1,0,1,2); +CREATE TABLE part_p2 PARTITION OF part DEFAULT PARTITION BY RANGE(a); +CREATE TABLE part_p2_p1 PARTITION OF part_p2 DEFAULT; +INSERT INTO part VALUES (-1,-1), (1,1), (2,NULL), (NULL,-2),(NULL,NULL); +EXPLAIN (COSTS OFF) SELECT tableoid::regclass as part, a, b FROM part WHERE a IS NULL ORDER BY 1, 2, 3; + -- -- some more cases -- From 8054542bdeca32046c9dbb62f5612f77994224b9 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Tue, 30 Jun 2020 11:30:25 +0800 Subject: [PATCH 261/578] Error position support for partition specifications --- src/backend/commands/tablecmds.c | 16 +++++++++++----- src/test/regress/expected/create_table.out | 6 ++++++ 2 files changed, 17 insertions(+), 5 deletions(-) diff --git 
a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index afa19507..d585d5e9 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -533,7 +533,7 @@ static void RangeVarCallbackForDropRelation(const RangeVar *rel, Oid relOid, static void RangeVarCallbackForAlterRelation(const RangeVar *rv, Oid relid, Oid oldrelid, void *arg); static PartitionSpec *transformPartitionSpec(Relation rel, PartitionSpec *partspec, char *strategy); -static void ComputePartitionAttrs(Relation rel, List *partParams, AttrNumber *partattrs, +static void ComputePartitionAttrs(ParseState *pstate, Relation rel, List *partParams, AttrNumber *partattrs, List **partexprs, Oid *partopclass, Oid *partcollation, char strategy); static void CreateInheritance(Relation child_rel, Relation parent_rel); static void RemoveInheritance(Relation child_rel, Relation parent_rel); @@ -1124,6 +1124,7 @@ DefineRelation(CreateStmt *stmt, char relkind, Oid ownerId, */ if (stmt->partspec) { + ParseState *pstate; char strategy; int partnatts; AttrNumber partattrs[PARTITION_MAX_KEYS]; @@ -1131,6 +1132,9 @@ DefineRelation(CreateStmt *stmt, char relkind, Oid ownerId, Oid partcollation[PARTITION_MAX_KEYS]; List *partexprs = NIL; + pstate = make_parsestate(NULL); + pstate->p_sourcetext = queryString; + partnatts = list_length(stmt->partspec->partParams); /* Protect fixed-size arrays here and in executor */ @@ -1163,7 +1167,7 @@ DefineRelation(CreateStmt *stmt, char relkind, Oid ownerId, else { #endif - ComputePartitionAttrs(rel, stmt->partspec->partParams, + ComputePartitionAttrs(pstate, rel, stmt->partspec->partParams, partattrs, &partexprs, partopclass, partcollation, strategy); @@ -16152,7 +16156,7 @@ transformPartitionSpec(Relation rel, PartitionSpec *partspec, char *strategy) * Expressions in the PartitionElems must be parse-analyzed already. 
*/ static void -ComputePartitionAttrs(Relation rel, List *partParams, AttrNumber *partattrs, +ComputePartitionAttrs(ParseState *pstate, Relation rel, List *partParams, AttrNumber *partattrs, List **partexprs, Oid *partopclass, Oid *partcollation, char strategy) { @@ -16179,14 +16183,16 @@ ComputePartitionAttrs(Relation rel, List *partParams, AttrNumber *partattrs, ereport(ERROR, (errcode(ERRCODE_UNDEFINED_COLUMN), errmsg("column \"%s\" named in partition key does not exist", - pelem->name))); + pelem->name), + parser_errposition(pstate, pelem->location))); attform = (Form_pg_attribute) GETSTRUCT(atttuple); if (attform->attnum <= 0) ereport(ERROR, (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), errmsg("cannot use system column \"%s\" in partition key", - pelem->name))); + pelem->name), + parser_errposition(pstate, pelem->location))); partattrs[attn] = attform->attnum; atttype = attform->atttypid; diff --git a/src/test/regress/expected/create_table.out b/src/test/regress/expected/create_table.out index 55e9e44d..e9bf8784 100644 --- a/src/test/regress/expected/create_table.out +++ b/src/test/regress/expected/create_table.out @@ -353,11 +353,15 @@ CREATE TABLE partitioned ( a int ) PARTITION BY RANGE (b); ERROR: column "b" named in partition key does not exist +LINE 3: ) PARTITION BY RANGE (b); + ^ -- cannot use system columns in partition key CREATE TABLE partitioned ( a int ) PARTITION BY RANGE (xmin); ERROR: cannot use system column "xmin" in partition key +LINE 3: ) PARTITION BY RANGE (xmin); + ^ -- functions in key must be immutable CREATE FUNCTION immut_func (a int) RETURNS int AS $$ SELECT a + random()::int; $$ LANGUAGE SQL; CREATE TABLE partitioned ( @@ -746,6 +750,8 @@ SELECT conislocal, coninhcount FROM pg_constraint WHERE conrelid = 'part_b'::reg -- specify PARTITION BY for a partition CREATE TABLE fail_part_col_not_found PARTITION OF parted FOR VALUES IN ('c') PARTITION BY RANGE (c); ERROR: column "c" named in partition key does not exist +LINE 1: ...TITION OF parted FOR VALUES IN ('c') PARTITION BY RANGE (c); + ^ CREATE TABLE part_c PARTITION OF parted (b WITH OPTIONS NOT NULL DEFAULT 0) FOR VALUES IN ('c') PARTITION BY RANGE ((b)); -- create a level-2 partition CREATE TABLE part_c_1_10 PARTITION OF part_c FOR VALUES FROM (1) TO (10); From b62a02dd517ac39bc0adb576989e97dc20aae877 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Tue, 30 Jun 2020 11:38:05 +0800 Subject: [PATCH 262/578] Minor fixes for psql tab completion. --- src/bin/psql/tab-complete.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/bin/psql/tab-complete.c b/src/bin/psql/tab-complete.c index 49305e4a..29eaf18b 100644 --- a/src/bin/psql/tab-complete.c +++ b/src/bin/psql/tab-complete.c @@ -558,6 +558,7 @@ static const SchemaQuery Query_for_list_of_tmf = { "pg_catalog.pg_class c", /* selcondition */ "c.relkind IN (" CppAsString2(RELKIND_RELATION) ", " + CppAsString2(RELKIND_PARTITIONED_TABLE) ", " CppAsString2(RELKIND_MATVIEW) ", " CppAsString2(RELKIND_FOREIGN_TABLE) ")", /* viscondition */ @@ -2034,6 +2035,7 @@ psql_completion(const char *text, int start, int end) "fillfactor", "parallel_workers", "log_autovacuum_min_duration", + "toast_tuple_target", "toast.autovacuum_enabled", "toast.autovacuum_freeze_max_age", "toast.autovacuum_freeze_min_age", @@ -2535,7 +2537,7 @@ psql_completion(const char *text, int start, int end) COMPLETE_WITH_LIST2("TABLE", "MATERIALIZED VIEW"); /* Complete PARTITION BY with RANGE ( or LIST ( or ... 
*/ else if (TailMatches2("PARTITION", "BY")) - COMPLETE_WITH_LIST2("RANGE (", "LIST ("); + COMPLETE_WITH_LIST3("RANGE (", "LIST (", "HASH ("); /* If we have xxx PARTITION OF, provide a list of partitioned tables */ else if (TailMatches2("PARTITION", "OF")) COMPLETE_WITH_SCHEMA_QUERY(Query_for_list_of_partitioned_tables, ""); From d554203e6baefb1092b6d240946dddb875248f1c Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Tue, 30 Jun 2020 11:50:39 +0800 Subject: [PATCH 263/578] Fix ALTER/TYPE on columns referenced by FKs in partitioned tables --- src/backend/commands/tablecmds.c | 59 +++++++-------------- src/test/regress/expected/foreign_key.out | 12 +++++ src/test/regress/expected/foreign_key_1.out | 12 +++++ src/test/regress/expected/foreign_key_2.out | 12 +++++ src/test/regress/sql/foreign_key.sql | 11 ++++ 5 files changed, 65 insertions(+), 41 deletions(-) diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index d585d5e9..b4f3ddb8 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -11080,26 +11080,6 @@ ATExecAlterColumnType(AlteredTableInfo *tab, Relation rel, { char *defstring = pg_get_constraintdef_command(foundObject.objectId); - /* - * Put NORMAL dependencies at the front of the list and - * AUTO dependencies at the back. This makes sure that - * foreign-key constraints depending on this column will - * be dropped before unique or primary-key constraints of - * the column; which we must have because the FK - * constraints depend on the indexes belonging to the - * unique constraints. - */ - if (foundDep->deptype == DEPENDENCY_NORMAL) - { - tab->changedConstraintOids = - lcons_oid(foundObject.objectId, - tab->changedConstraintOids); - tab->changedConstraintDefs = - lcons(defstring, - tab->changedConstraintDefs); - } - else - { tab->changedConstraintOids = lappend_oid(tab->changedConstraintOids, foundObject.objectId); @@ -11107,7 +11087,6 @@ ATExecAlterColumnType(AlteredTableInfo *tab, Relation rel, lappend(tab->changedConstraintDefs, defstring); } - } break; case OCLASS_REWRITE: @@ -11462,10 +11441,18 @@ static void ATPostAlterTypeCleanup(List **wqueue, AlteredTableInfo *tab, LOCKMODE lockmode) { ObjectAddress obj; + ObjectAddresses *objects; ListCell *def_item; ListCell *oid_item; /* + * Collect all the constraints and indexes to drop so we can process them + * in a single call. That way we don't have to worry about dependencies + * among them. + */ + objects = new_object_addresses(); + + /* * Re-parse the index and constraint definitions, and attach them to the * appropriate work queue entries. 
We do this before dropping because in * the case of a FOREIGN KEY constraint, we might not yet have exclusive @@ -11498,6 +11485,9 @@ ATPostAlterTypeCleanup(List **wqueue, AlteredTableInfo *tab, LOCKMODE lockmode) conislocal = con->conislocal; ReleaseSysCache(tup); + ObjectAddressSet(obj, ConstraintRelationId, lfirst_oid(oid_item)); + add_exact_object_address(&obj, objects); + /* * If the constraint is inherited (only), we don't want to inject a * new definition here; it'll get recreated when ATAddCheckConstraint @@ -11521,31 +11511,18 @@ ATPostAlterTypeCleanup(List **wqueue, AlteredTableInfo *tab, LOCKMODE lockmode) ATPostAlterTypeParse(oldId, relid, InvalidOid, (char *) lfirst(def_item), wqueue, lockmode, tab->rewrite); + + ObjectAddressSet(obj, RelationRelationId, lfirst_oid(oid_item)); + add_exact_object_address(&obj, objects); } /* - * Now we can drop the existing constraints and indexes --- constraints - * first, since some of them might depend on the indexes. In fact, we - * have to delete FOREIGN KEY constraints before UNIQUE constraints, but - * we already ordered the constraint list to ensure that would happen. It - * should be okay to use DROP_RESTRICT here, since nothing else should be - * depending on these objects. + * It should be okay to use DROP_RESTRICT here, since nothing else should + * be depending on these objects. */ - foreach(oid_item, tab->changedConstraintOids) - { - obj.classId = ConstraintRelationId; - obj.objectId = lfirst_oid(oid_item); - obj.objectSubId = 0; - performDeletion(&obj, DROP_RESTRICT, PERFORM_DELETION_INTERNAL); - } + performMultipleDeletions(objects, DROP_RESTRICT, PERFORM_DELETION_INTERNAL); - foreach(oid_item, tab->changedIndexOids) - { - obj.classId = RelationRelationId; - obj.objectId = lfirst_oid(oid_item); - obj.objectSubId = 0; - performDeletion(&obj, DROP_RESTRICT, PERFORM_DELETION_INTERNAL); - } + free_object_addresses(objects); /* * The objects will get recreated during subsequent passes over the work diff --git a/src/test/regress/expected/foreign_key.out b/src/test/regress/expected/foreign_key.out index 59e95190..5a958f3a 100644 --- a/src/test/regress/expected/foreign_key.out +++ b/src/test/regress/expected/foreign_key.out @@ -1431,3 +1431,15 @@ alter table fktable2 drop constraint fktable2_f1_fkey; ERROR: cannot ALTER TABLE "pktable2" because it has pending trigger events commit; drop table pktable2, fktable2; +-- Altering a type referenced by a foreign key needs to drop/recreate the FK. +-- Ensure that works. +CREATE TABLE fk_notpartitioned_pk (a INT, PRIMARY KEY(a), CHECK (a > 0)); +CREATE TABLE fk_partitioned_fk (a INT REFERENCES fk_notpartitioned_pk(a) PRIMARY KEY) PARTITION BY RANGE(a); +CREATE TABLE fk_partitioned_fk_1 PARTITION OF fk_partitioned_fk FOR VALUES FROM (MINVALUE) TO (MAXVALUE); +INSERT INTO fk_notpartitioned_pk VALUES (1); +INSERT INTO fk_partitioned_fk VALUES (1); +ALTER TABLE fk_notpartitioned_pk ALTER COLUMN a TYPE bigint; +DELETE FROM fk_notpartitioned_pk WHERE a = 1; +ERROR: update or delete on table "fk_notpartitioned_pk" violates foreign key constraint "fk_partitioned_fk_a_fkey" on table "fk_partitioned_fk" +DETAIL: Key (a)=(1) is still referenced from table "fk_partitioned_fk". 
+DROP TABLE fk_notpartitioned_pk, fk_partitioned_fk; diff --git a/src/test/regress/expected/foreign_key_1.out b/src/test/regress/expected/foreign_key_1.out index e5861d11..cb069e3a 100644 --- a/src/test/regress/expected/foreign_key_1.out +++ b/src/test/regress/expected/foreign_key_1.out @@ -1426,3 +1426,15 @@ alter table fktable2 drop constraint fktable2_f1_fkey; ERROR: cannot ALTER TABLE "pktable2" because it has pending trigger events commit; drop table pktable2, fktable2; +-- Altering a type referenced by a foreign key needs to drop/recreate the FK. +-- Ensure that works. +CREATE TABLE fk_notpartitioned_pk (a INT, PRIMARY KEY(a), CHECK (a > 0)); +CREATE TABLE fk_partitioned_fk (a INT REFERENCES fk_notpartitioned_pk(a) PRIMARY KEY) PARTITION BY RANGE(a); +CREATE TABLE fk_partitioned_fk_1 PARTITION OF fk_partitioned_fk FOR VALUES FROM (MINVALUE) TO (MAXVALUE); +INSERT INTO fk_notpartitioned_pk VALUES (1); +INSERT INTO fk_partitioned_fk VALUES (1); +ALTER TABLE fk_notpartitioned_pk ALTER COLUMN a TYPE bigint; +DELETE FROM fk_notpartitioned_pk WHERE a = 1; +ERROR: update or delete on table "fk_notpartitioned_pk" violates foreign key constraint "fk_partitioned_fk_a_fkey" on table "fk_partitioned_fk" +DETAIL: Key (a)=(1) is still referenced from table "fk_partitioned_fk". +DROP TABLE fk_notpartitioned_pk, fk_partitioned_fk; diff --git a/src/test/regress/expected/foreign_key_2.out b/src/test/regress/expected/foreign_key_2.out index 8b8ac8ac..27e9026d 100644 --- a/src/test/regress/expected/foreign_key_2.out +++ b/src/test/regress/expected/foreign_key_2.out @@ -1438,3 +1438,15 @@ alter table fktable2 drop constraint fktable2_f1_fkey; ERROR: cannot ALTER TABLE "pktable2" because it has pending trigger events commit; drop table pktable2, fktable2; +-- Altering a type referenced by a foreign key needs to drop/recreate the FK. +-- Ensure that works. +CREATE TABLE fk_notpartitioned_pk (a INT, PRIMARY KEY(a), CHECK (a > 0)); +CREATE TABLE fk_partitioned_fk (a INT REFERENCES fk_notpartitioned_pk(a) PRIMARY KEY) PARTITION BY RANGE(a); +CREATE TABLE fk_partitioned_fk_1 PARTITION OF fk_partitioned_fk FOR VALUES FROM (MINVALUE) TO (MAXVALUE); +INSERT INTO fk_notpartitioned_pk VALUES (1); +INSERT INTO fk_partitioned_fk VALUES (1); +ALTER TABLE fk_notpartitioned_pk ALTER COLUMN a TYPE bigint; +DELETE FROM fk_notpartitioned_pk WHERE a = 1; +ERROR: update or delete on table "fk_notpartitioned_pk" violates foreign key constraint "fk_partitioned_fk_a_fkey" on table "fk_partitioned_fk" +DETAIL: Key (a)=(1) is still referenced from table "fk_partitioned_fk". +DROP TABLE fk_notpartitioned_pk, fk_partitioned_fk; diff --git a/src/test/regress/sql/foreign_key.sql b/src/test/regress/sql/foreign_key.sql index 2fcd7d60..8c887eb9 100644 --- a/src/test/regress/sql/foreign_key.sql +++ b/src/test/regress/sql/foreign_key.sql @@ -1060,3 +1060,14 @@ alter table fktable2 drop constraint fktable2_f1_fkey; commit; drop table pktable2, fktable2; + +-- Altering a type referenced by a foreign key needs to drop/recreate the FK. +-- Ensure that works. 
+CREATE TABLE fk_notpartitioned_pk (a INT, PRIMARY KEY(a), CHECK (a > 0)); +CREATE TABLE fk_partitioned_fk (a INT REFERENCES fk_notpartitioned_pk(a) PRIMARY KEY) PARTITION BY RANGE(a); +CREATE TABLE fk_partitioned_fk_1 PARTITION OF fk_partitioned_fk FOR VALUES FROM (MINVALUE) TO (MAXVALUE); +INSERT INTO fk_notpartitioned_pk VALUES (1); +INSERT INTO fk_partitioned_fk VALUES (1); +ALTER TABLE fk_notpartitioned_pk ALTER COLUMN a TYPE bigint; +DELETE FROM fk_notpartitioned_pk WHERE a = 1; +DROP TABLE fk_notpartitioned_pk, fk_partitioned_fk; From 873bafeff4e7918c0f3ba710d3f43a7a38be32b0 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Tue, 30 Jun 2020 15:36:08 +0800 Subject: [PATCH 264/578] Fix event triggers for partitioned tables. http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/backend/catalog/index.c | 10 +- src/backend/commands/event_trigger.c | 13 +- src/backend/commands/indexcmds.c | 3 +- src/backend/commands/tablecmds.c | 2 +- src/backend/commands/view.c | 4 + src/backend/executor/execPartition.c | 31 +++++ src/include/catalog/index.h | 3 +- src/include/executor/execPartition.h | 30 +---- src/include/tcop/deparse_utility.h | 122 +++++++++--------- .../test_ddl_deparse/expected/alter_table.out | 12 ++ .../test_ddl_deparse/sql/alter_table.sql | 8 ++ src/test/regress/expected/event_trigger.out | 20 ++- src/test/regress/sql/event_trigger.sql | 13 ++ 13 files changed, 170 insertions(+), 101 deletions(-) diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c index 04d3d181..7f01e417 100644 --- a/src/backend/catalog/index.c +++ b/src/backend/catalog/index.c @@ -48,6 +48,7 @@ #include "catalog/pg_type.h" #include "catalog/storage.h" #include "commands/tablecmds.h" +#include "commands/event_trigger.h" #include "commands/trigger.h" #include "executor/executor.h" #include "miscadmin.h" @@ -198,8 +199,9 @@ relationHasPrimaryKey(Relation rel) void index_check_primary_key(Relation heapRel, IndexInfo *indexInfo, - bool is_alter_table) -{// #lizard forgives + bool is_alter_table, + IndexStmt *stmt) +{ List *cmds; int i; #ifdef __TBASE__ @@ -295,7 +297,11 @@ index_check_primary_key(Relation heapRel, * unduly. */ if (cmds) + { + EventTriggerAlterTableStart((Node *) stmt); AlterTableInternal(RelationGetRelid(heapRel), cmds, true); + EventTriggerAlterTableEnd(); + } } /* diff --git a/src/backend/commands/event_trigger.c b/src/backend/commands/event_trigger.c index d289c395..742e23d4 100644 --- a/src/backend/commands/event_trigger.c +++ b/src/backend/commands/event_trigger.c @@ -1813,11 +1813,6 @@ EventTriggerCollectSimpleCommand(ObjectAddress address, * Note we don't collect the command immediately; instead we keep it in * currentCommand, and only when we're done processing the subcommands we will * add it to the command list. - * - * XXX -- this API isn't considering the possibility of an ALTER TABLE command - * being called reentrantly by an event trigger function. Do we need stackable - * commands at this level? Perhaps at least we should detect the condition and - * raise an error. 
*/ void EventTriggerAlterTableStart(Node *parsetree) @@ -1842,6 +1837,7 @@ EventTriggerAlterTableStart(Node *parsetree) command->d.alterTable.subcmds = NIL; command->parsetree = copyObject(parsetree); + command->parent = currentEventTriggerState->currentCommand; currentEventTriggerState->currentCommand = command; MemoryContextSwitchTo(oldcxt); @@ -1882,6 +1878,7 @@ EventTriggerCollectAlterTableSubcmd(Node *subcmd, ObjectAddress address) return; Assert(IsA(subcmd, AlterTableCmd)); + Assert(OidIsValid(currentEventTriggerState->currentCommand)); Assert(OidIsValid(currentEventTriggerState->currentCommand->d.alterTable.objectId)); oldcxt = MemoryContextSwitchTo(currentEventTriggerState->cxt); @@ -1907,11 +1904,15 @@ EventTriggerCollectAlterTableSubcmd(Node *subcmd, ObjectAddress address) void EventTriggerAlterTableEnd(void) { + CollectedCommand *parent; + /* ignore if event trigger context not set, or collection disabled */ if (!currentEventTriggerState || currentEventTriggerState->commandCollectionInhibited) return; + parent = currentEventTriggerState->currentCommand->parent; + /* If no subcommands, don't collect */ if (list_length(currentEventTriggerState->currentCommand->d.alterTable.subcmds) != 0) { @@ -1922,7 +1923,7 @@ EventTriggerAlterTableEnd(void) else pfree(currentEventTriggerState->currentCommand); - currentEventTriggerState->currentCommand = NULL; + currentEventTriggerState->currentCommand = parent; } /* diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c index 234d4e26..ad99f3e2 100644 --- a/src/backend/commands/indexcmds.c +++ b/src/backend/commands/indexcmds.c @@ -33,6 +33,7 @@ #include "commands/comment.h" #include "commands/dbcommands.h" #include "commands/defrem.h" +#include "commands/event_trigger.h" #include "commands/tablecmds.h" #include "commands/tablespace.h" #include "mb/pg_wchar.h" @@ -664,7 +665,7 @@ DefineIndex(Oid relationId, * Extra checks when creating a PRIMARY KEY index. */ if (stmt->primary) - index_check_primary_key(rel, indexInfo, is_alter_table); + index_check_primary_key(rel, indexInfo, is_alter_table, stmt); /* * We disallow indexes on system columns other than OID. They would not diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index b4f3ddb8..cf6b6896 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -8569,7 +8569,7 @@ ATExecAddIndexConstraint(AlteredTableInfo *tab, Relation rel, /* Extra checks needed if making primary key */ if (stmt->primary) - index_check_primary_key(rel, indexInfo, true); + index_check_primary_key(rel, indexInfo, true, stmt); /* Note we currently don't support EXCLUSION constraints here */ if (stmt->primary) diff --git a/src/backend/commands/view.c b/src/backend/commands/view.c index ae927e9e..b2a9ebc6 100644 --- a/src/backend/commands/view.c +++ b/src/backend/commands/view.c @@ -65,6 +65,8 @@ validateWithCheckOption(char *value) * * Create a view relation and use the rules system to store the query * for the view. + * + * EventTriggerAlterTableStart must have been called already. 
*--------------------------------------------------------------------- */ static ObjectAddress @@ -190,6 +192,7 @@ DefineVirtualRelation(RangeVar *relation, List *tlist, bool replace, atcmds = lappend(atcmds, atcmd); } + /* EventTriggerAlterTableStart called by ProcessUtilitySlow */ AlterTableInternal(viewOid, atcmds, true); /* Make the new view columns visible */ @@ -221,6 +224,7 @@ DefineVirtualRelation(RangeVar *relation, List *tlist, bool replace, atcmd->def = (Node *) options; atcmds = list_make1(atcmd); + /* EventTriggerAlterTableStart called by ProcessUtilitySlow */ AlterTableInternal(viewOid, atcmds, true); ObjectAddressSet(address, RelationRelationId, viewOid); diff --git a/src/backend/executor/execPartition.c b/src/backend/executor/execPartition.c index 1835d52a..60221c6b 100644 --- a/src/backend/executor/execPartition.c +++ b/src/backend/executor/execPartition.c @@ -23,6 +23,37 @@ #include "utils/rls.h" #include "utils/ruleutils.h" + +/*----------------------- + * PartitionDispatch - information about one partitioned table in a partition + * hierarchy required to route a tuple to one of its partitions + * + * reldesc Relation descriptor of the table + * key Partition key information of the table + * keystate Execution state required for expressions in the partition key + * partdesc Partition descriptor of the table + * tupslot A standalone TupleTableSlot initialized with this table's tuple + * descriptor + * tupmap TupleConversionMap to convert from the parent's rowtype to + * this table's rowtype (when extracting the partition key of a + * tuple just before routing it through this table) + * indexes Array with partdesc->nparts members (for details on what + * individual members represent, see how they are set in + * get_partition_dispatch_recurse()) + *----------------------- + */ +typedef struct PartitionDispatchData +{ + Relation reldesc; + PartitionKey key; + List *keystate; /* list of ExprState */ + PartitionDesc partdesc; + TupleTableSlot *tupslot; + TupleConversionMap *tupmap; + int *indexes; +} PartitionDispatchData; + + static PartitionDispatch *RelationGetPartitionDispatchInfo(Relation rel, int *num_parted, List **leaf_part_oids); static void get_partition_dispatch_recurse(Relation rel, Relation parent, diff --git a/src/include/catalog/index.h b/src/include/catalog/index.h index c2ee59fa..3afe88f8 100644 --- a/src/include/catalog/index.h +++ b/src/include/catalog/index.h @@ -100,7 +100,8 @@ typedef enum extern void index_check_primary_key(Relation heapRel, IndexInfo *indexInfo, - bool is_alter_table); + bool is_alter_table, + IndexStmt *stmt); extern Oid index_create(Relation heapRelation, const char *indexRelationName, diff --git a/src/include/executor/execPartition.h b/src/include/executor/execPartition.h index 6996258a..d4d1be1d 100644 --- a/src/include/executor/execPartition.h +++ b/src/include/executor/execPartition.h @@ -18,35 +18,7 @@ #include "nodes/parsenodes.h" #include "nodes/plannodes.h" -/*----------------------- - * PartitionDispatch - information about one partitioned table in a partition - * hierarchy required to route a tuple to one of its partitions - * - * reldesc Relation descriptor of the table - * key Partition key information of the table - * keystate Execution state required for expressions in the partition key - * partdesc Partition descriptor of the table - * tupslot A standalone TupleTableSlot initialized with this table's tuple - * descriptor - * tupmap TupleConversionMap to convert from the parent's rowtype to - * this table's rowtype (when 
extracting the partition key of a - * tuple just before routing it through this table) - * indexes Array with partdesc->nparts members (for details on what - * individual members represent, see how they are set in - * get_partition_dispatch_recurse()) - *----------------------- - */ -typedef struct PartitionDispatchData -{ - Relation reldesc; - PartitionKey key; - List *keystate; /* list of ExprState */ - PartitionDesc partdesc; - TupleTableSlot *tupslot; - TupleConversionMap *tupmap; - int *indexes; -} PartitionDispatchData; - +/* See execPartition.c for the definition. */ typedef struct PartitionDispatchData *PartitionDispatch; /*----------------------- diff --git a/src/include/tcop/deparse_utility.h b/src/include/tcop/deparse_utility.h index 3818a858..0d709d06 100644 --- a/src/include/tcop/deparse_utility.h +++ b/src/include/tcop/deparse_utility.h @@ -23,13 +23,13 @@ */ typedef enum CollectedCommandType { - SCT_Simple, - SCT_AlterTable, - SCT_Grant, - SCT_AlterOpFamily, - SCT_AlterDefaultPrivileges, - SCT_CreateOpClass, - SCT_AlterTSConfig + SCT_Simple, + SCT_AlterTable, + SCT_Grant, + SCT_AlterOpFamily, + SCT_AlterDefaultPrivileges, + SCT_CreateOpClass, + SCT_AlterTSConfig } CollectedCommandType; /* @@ -37,69 +37,71 @@ typedef enum CollectedCommandType */ typedef struct CollectedATSubcmd { - ObjectAddress address; /* affected column, constraint, index, ... */ - Node *parsetree; + ObjectAddress address; /* affected column, constraint, index, ... */ + Node *parsetree; } CollectedATSubcmd; typedef struct CollectedCommand { - CollectedCommandType type; - bool in_extension; - Node *parsetree; + CollectedCommandType type; - union - { - /* most commands */ - struct - { - ObjectAddress address; - ObjectAddress secondaryObject; - } simple; + bool in_extension; + Node *parsetree; - /* ALTER TABLE, and internal uses thereof */ - struct - { - Oid objectId; - Oid classId; - List *subcmds; - } alterTable; + union + { + /* most commands */ + struct + { + ObjectAddress address; + ObjectAddress secondaryObject; + } simple; - /* GRANT / REVOKE */ - struct - { - InternalGrant *istmt; - } grant; + /* ALTER TABLE, and internal uses thereof */ + struct + { + Oid objectId; + Oid classId; + List *subcmds; + } alterTable; - /* ALTER OPERATOR FAMILY */ - struct - { - ObjectAddress address; - List *operators; - List *procedures; - } opfam; + /* GRANT / REVOKE */ + struct + { + InternalGrant *istmt; + } grant; - /* CREATE OPERATOR CLASS */ - struct - { - ObjectAddress address; - List *operators; - List *procedures; - } createopc; + /* ALTER OPERATOR FAMILY */ + struct + { + ObjectAddress address; + List *operators; + List *procedures; + } opfam; - /* ALTER TEXT SEARCH CONFIGURATION ADD/ALTER/DROP MAPPING */ - struct - { - ObjectAddress address; - Oid *dictIds; - int ndicts; - } atscfg; + /* CREATE OPERATOR CLASS */ + struct + { + ObjectAddress address; + List *operators; + List *procedures; + } createopc; - /* ALTER DEFAULT PRIVILEGES */ - struct - { - GrantObjectType objtype; - } defprivs; - } d; + /* ALTER TEXT SEARCH CONFIGURATION ADD/ALTER/DROP MAPPING */ + struct + { + ObjectAddress address; + Oid *dictIds; + int ndicts; + } atscfg; + + /* ALTER DEFAULT PRIVILEGES */ + struct + { + GrantObjectType objtype; + } defprivs; + } d; + struct CollectedCommand *parent; /* when nested */ } CollectedCommand; -#endif /* DEPARSE_UTILITY_H */ +#endif /* DEPARSE_UTILITY_H */ diff --git a/src/test/modules/test_ddl_deparse/expected/alter_table.out b/src/test/modules/test_ddl_deparse/expected/alter_table.out index 
e304787b..7da847d4 100644 --- a/src/test/modules/test_ddl_deparse/expected/alter_table.out +++ b/src/test/modules/test_ddl_deparse/expected/alter_table.out @@ -16,3 +16,15 @@ NOTICE: DDL test: type simple, tag ALTER TABLE ALTER TABLE parent ADD CONSTRAINT a_pos CHECK (a > 0); NOTICE: DDL test: type alter table, tag ALTER TABLE NOTICE: subcommand: ADD CONSTRAINT (and recurse) +CREATE TABLE part ( + a int +) PARTITION BY RANGE (a); +NOTICE: DDL test: type simple, tag CREATE TABLE +CREATE TABLE part1 PARTITION OF part FOR VALUES FROM (1) to (100); +NOTICE: DDL test: type simple, tag CREATE TABLE +ALTER TABLE part ADD PRIMARY KEY (a); +NOTICE: DDL test: type alter table, tag CREATE INDEX +NOTICE: subcommand: SET NOT NULL +NOTICE: subcommand: SET NOT NULL +NOTICE: DDL test: type alter table, tag ALTER TABLE +NOTICE: subcommand: ADD INDEX diff --git a/src/test/modules/test_ddl_deparse/sql/alter_table.sql b/src/test/modules/test_ddl_deparse/sql/alter_table.sql index 6e2cca75..dec53a06 100644 --- a/src/test/modules/test_ddl_deparse/sql/alter_table.sql +++ b/src/test/modules/test_ddl_deparse/sql/alter_table.sql @@ -11,3 +11,11 @@ ALTER TABLE parent ADD COLUMN b serial; ALTER TABLE parent RENAME COLUMN b TO c; ALTER TABLE parent ADD CONSTRAINT a_pos CHECK (a > 0); + +CREATE TABLE part ( + a int +) PARTITION BY RANGE (a); + +CREATE TABLE part1 PARTITION OF part FOR VALUES FROM (1) to (100); + +ALTER TABLE part ADD PRIMARY KEY (a); diff --git a/src/test/regress/expected/event_trigger.out b/src/test/regress/expected/event_trigger.out index 085eb207..2537e6f1 100644 --- a/src/test/regress/expected/event_trigger.out +++ b/src/test/regress/expected/event_trigger.out @@ -283,14 +283,32 @@ CREATE SCHEMA evttrig CREATE TABLE one (col_a SERIAL PRIMARY KEY, col_b text DEFAULT 'forty two') CREATE INDEX one_idx ON one (col_b) CREATE TABLE two (col_c INTEGER CHECK (col_c > 0) REFERENCES one DEFAULT 42); +-- Partitioned tables with a partitioned index +CREATE TABLE evttrig.parted ( + id int PRIMARY KEY) + PARTITION BY RANGE (id); +CREATE TABLE evttrig.part_1_10 PARTITION OF evttrig.parted (id) + FOR VALUES FROM (1) TO (10); +CREATE TABLE evttrig.part_10_20 PARTITION OF evttrig.parted (id) + FOR VALUES FROM (10) TO (20) PARTITION BY RANGE (id); +CREATE TABLE evttrig.part_10_15 PARTITION OF evttrig.part_10_20 (id) + FOR VALUES FROM (10) TO (15); +CREATE TABLE evttrig.part_15_20 PARTITION OF evttrig.part_10_20 (id) + FOR VALUES FROM (15) TO (20); ALTER TABLE evttrig.two DROP COLUMN col_c; ALTER TABLE evttrig.one ALTER COLUMN col_b DROP DEFAULT; ALTER TABLE evttrig.one DROP CONSTRAINT one_pkey; DROP INDEX evttrig.one_idx; DROP SCHEMA evttrig CASCADE; -NOTICE: drop cascades to 2 other objects +NOTICE: drop cascades to 3 other objects DETAIL: drop cascades to table evttrig.one drop cascades to table evttrig.two +drop cascades to table evttrig.parted +NOTICE: NORMAL: orig=f normal=t istemp=f type=table identity=evttrig.parted name={evttrig,parted} args={} +NOTICE: NORMAL: orig=f normal=t istemp=f type=table identity=evttrig.part_1_10 name={evttrig,part_1_10} args={} +NOTICE: NORMAL: orig=f normal=t istemp=f type=table identity=evttrig.part_10_20 name={evttrig,part_10_20} args={} +NOTICE: NORMAL: orig=f normal=t istemp=f type=table identity=evttrig.part_10_15 name={evttrig,part_10_15} args={} +NOTICE: NORMAL: orig=f normal=t istemp=f type=table identity=evttrig.part_15_20 name={evttrig,part_15_20} args={} DROP TABLE a_temp_tbl; DROP EVENT TRIGGER regress_event_trigger_report_dropped; ERROR: event trigger 
"regress_event_trigger_report_dropped" does not exist diff --git a/src/test/regress/sql/event_trigger.sql b/src/test/regress/sql/event_trigger.sql index b65bf3ec..9c8fa5f6 100644 --- a/src/test/regress/sql/event_trigger.sql +++ b/src/test/regress/sql/event_trigger.sql @@ -263,6 +263,19 @@ CREATE SCHEMA evttrig CREATE INDEX one_idx ON one (col_b) CREATE TABLE two (col_c INTEGER CHECK (col_c > 0) REFERENCES one DEFAULT 42); +-- Partitioned tables with a partitioned index +CREATE TABLE evttrig.parted ( + id int PRIMARY KEY) + PARTITION BY RANGE (id); +CREATE TABLE evttrig.part_1_10 PARTITION OF evttrig.parted (id) + FOR VALUES FROM (1) TO (10); +CREATE TABLE evttrig.part_10_20 PARTITION OF evttrig.parted (id) + FOR VALUES FROM (10) TO (20) PARTITION BY RANGE (id); +CREATE TABLE evttrig.part_10_15 PARTITION OF evttrig.part_10_20 (id) + FOR VALUES FROM (10) TO (15); +CREATE TABLE evttrig.part_15_20 PARTITION OF evttrig.part_10_20 (id) + FOR VALUES FROM (15) TO (20); + ALTER TABLE evttrig.two DROP COLUMN col_c; ALTER TABLE evttrig.one ALTER COLUMN col_b DROP DEFAULT; ALTER TABLE evttrig.one DROP CONSTRAINT one_pkey; From 40d30a952b94ef13273936fedb953c0ad3df9e66 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Tue, 30 Jun 2020 15:53:50 +0800 Subject: [PATCH 265/578] Fix catalog insertion order for ATTACH PARTITION. http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/backend/commands/tablecmds.c | 6 +++--- src/test/regress/expected/alter_table.out | 15 +++++++++++++++ src/test/regress/expected/alter_table_1.out | 15 +++++++++++++++ src/test/regress/expected/alter_table_2.out | 15 +++++++++++++++ src/test/regress/expected/alter_table_3.out | 15 +++++++++++++++ src/test/regress/sql/alter_table.sql | 18 +++++++++++++++++- 6 files changed, 80 insertions(+), 4 deletions(-) diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index cf6b6896..f976badd 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -16688,9 +16688,6 @@ ATExecAttachPartition(List **wqueue, Relation rel, PartitionCmd *cmd) trigger_name, RelationGetRelationName(attachrel)), errdetail("ROW triggers with transition tables are not supported on partitions"))); - /* OK to create inheritance. Rest of the checks performed there */ - CreateInheritance(attachrel, rel); - /* Update the default partition oid */ if (cmd->bound->is_default) update_default_partition_oid(RelationGetRelid(rel), @@ -16704,6 +16701,9 @@ ATExecAttachPartition(List **wqueue, Relation rel, PartitionCmd *cmd) check_new_partition_bound(RelationGetRelationName(attachrel), rel, cmd->bound); + /* OK to create inheritance. Rest of the checks performed there */ + CreateInheritance(attachrel, rel); + /* Update the pg_class entry. 
*/ StorePartitionBound(attachrel, rel, cmd->bound); diff --git a/src/test/regress/expected/alter_table.out b/src/test/regress/expected/alter_table.out index 088474cf..ea00b3ae 100644 --- a/src/test/regress/expected/alter_table.out +++ b/src/test/regress/expected/alter_table.out @@ -3745,3 +3745,18 @@ alter table defpart_attach_test_d add check (a > 1); alter table defpart_attach_test attach partition defpart_attach_test_d default; INFO: partition constraint for table "defpart_attach_test_d" is implied by existing constraints drop table defpart_attach_test; +-- test case where the partitioning operator is a SQL function whose +-- evaluation results in the table's relcache being rebuilt partway through +-- the execution of an ATTACH PARTITION command +create function at_test_sql_partop (int4, int4) returns int language sql +as $$ select case when $1 = $2 then 0 when $1 > $2 then 1 else -1 end; $$; +create operator class at_test_sql_partop for type int4 using btree as + operator 1 < (int4, int4), operator 2 <= (int4, int4), + operator 3 = (int4, int4), operator 4 >= (int4, int4), + operator 5 > (int4, int4), function 1 at_test_sql_partop(int4, int4); +create table at_test_sql_partop (a int) partition by range (a at_test_sql_partop); +create table at_test_sql_partop_1 (a int); +alter table at_test_sql_partop attach partition at_test_sql_partop_1 for values from (0) to (10); +drop table at_test_sql_partop; +drop operator class at_test_sql_partop using btree; +drop function at_test_sql_partop; diff --git a/src/test/regress/expected/alter_table_1.out b/src/test/regress/expected/alter_table_1.out index 8e1053bc..744691c9 100644 --- a/src/test/regress/expected/alter_table_1.out +++ b/src/test/regress/expected/alter_table_1.out @@ -3668,3 +3668,18 @@ alter table defpart_attach_test_d add check (a > 1); alter table defpart_attach_test attach partition defpart_attach_test_d default; INFO: partition constraint for table "defpart_attach_test_d" is implied by existing constraints drop table defpart_attach_test; +-- test case where the partitioning operator is a SQL function whose +-- evaluation results in the table's relcache being rebuilt partway through +-- the execution of an ATTACH PARTITION command +create function at_test_sql_partop (int4, int4) returns int language sql +as $$ select case when $1 = $2 then 0 when $1 > $2 then 1 else -1 end; $$; +create operator class at_test_sql_partop for type int4 using btree as + operator 1 < (int4, int4), operator 2 <= (int4, int4), + operator 3 = (int4, int4), operator 4 >= (int4, int4), + operator 5 > (int4, int4), function 1 at_test_sql_partop(int4, int4); +create table at_test_sql_partop (a int) partition by range (a at_test_sql_partop); +create table at_test_sql_partop_1 (a int); +alter table at_test_sql_partop attach partition at_test_sql_partop_1 for values from (0) to (10); +drop table at_test_sql_partop; +drop operator class at_test_sql_partop using btree; +drop function at_test_sql_partop; diff --git a/src/test/regress/expected/alter_table_2.out b/src/test/regress/expected/alter_table_2.out index 19a9d000..a958aa64 100644 --- a/src/test/regress/expected/alter_table_2.out +++ b/src/test/regress/expected/alter_table_2.out @@ -3668,3 +3668,18 @@ alter table defpart_attach_test_d add check (a > 1); alter table defpart_attach_test attach partition defpart_attach_test_d default; INFO: partition constraint for table "defpart_attach_test_d" is implied by existing constraints drop table defpart_attach_test; +-- test case where the partitioning operator is a 
SQL function whose +-- evaluation results in the table's relcache being rebuilt partway through +-- the execution of an ATTACH PARTITION command +create function at_test_sql_partop (int4, int4) returns int language sql +as $$ select case when $1 = $2 then 0 when $1 > $2 then 1 else -1 end; $$; +create operator class at_test_sql_partop for type int4 using btree as + operator 1 < (int4, int4), operator 2 <= (int4, int4), + operator 3 = (int4, int4), operator 4 >= (int4, int4), + operator 5 > (int4, int4), function 1 at_test_sql_partop(int4, int4); +create table at_test_sql_partop (a int) partition by range (a at_test_sql_partop); +create table at_test_sql_partop_1 (a int); +alter table at_test_sql_partop attach partition at_test_sql_partop_1 for values from (0) to (10); +drop table at_test_sql_partop; +drop operator class at_test_sql_partop using btree; +drop function at_test_sql_partop; diff --git a/src/test/regress/expected/alter_table_3.out b/src/test/regress/expected/alter_table_3.out index 5cdf3e7a..9d426e3c 100644 --- a/src/test/regress/expected/alter_table_3.out +++ b/src/test/regress/expected/alter_table_3.out @@ -3668,3 +3668,18 @@ alter table defpart_attach_test_d add check (a > 1); alter table defpart_attach_test attach partition defpart_attach_test_d default; INFO: partition constraint for table "defpart_attach_test_d" is implied by existing constraints drop table defpart_attach_test; +-- test case where the partitioning operator is a SQL function whose +-- evaluation results in the table's relcache being rebuilt partway through +-- the execution of an ATTACH PARTITION command +create function at_test_sql_partop (int4, int4) returns int language sql +as $$ select case when $1 = $2 then 0 when $1 > $2 then 1 else -1 end; $$; +create operator class at_test_sql_partop for type int4 using btree as + operator 1 < (int4, int4), operator 2 <= (int4, int4), + operator 3 = (int4, int4), operator 4 >= (int4, int4), + operator 5 > (int4, int4), function 1 at_test_sql_partop(int4, int4); +create table at_test_sql_partop (a int) partition by range (a at_test_sql_partop); +create table at_test_sql_partop_1 (a int); +alter table at_test_sql_partop attach partition at_test_sql_partop_1 for values from (0) to (10); +drop table at_test_sql_partop; +drop operator class at_test_sql_partop using btree; +drop function at_test_sql_partop; diff --git a/src/test/regress/sql/alter_table.sql b/src/test/regress/sql/alter_table.sql index e1c6772c..7b5f2409 100644 --- a/src/test/regress/sql/alter_table.sql +++ b/src/test/regress/sql/alter_table.sql @@ -2515,4 +2515,20 @@ alter table defpart_attach_test_d add check (a > 1); -- should be attached successfully and without needing to be scanned alter table defpart_attach_test attach partition defpart_attach_test_d default; -drop table defpart_attach_test; \ No newline at end of file +drop table defpart_attach_test; + +-- test case where the partitioning operator is a SQL function whose +-- evaluation results in the table's relcache being rebuilt partway through +-- the execution of an ATTACH PARTITION command +create function at_test_sql_partop (int4, int4) returns int language sql +as $$ select case when $1 = $2 then 0 when $1 > $2 then 1 else -1 end; $$; +create operator class at_test_sql_partop for type int4 using btree as + operator 1 < (int4, int4), operator 2 <= (int4, int4), + operator 3 = (int4, int4), operator 4 >= (int4, int4), + operator 5 > (int4, int4), function 1 at_test_sql_partop(int4, int4); +create table at_test_sql_partop (a int) partition by 
range (a at_test_sql_partop); +create table at_test_sql_partop_1 (a int); +alter table at_test_sql_partop attach partition at_test_sql_partop_1 for values from (0) to (10); +drop table at_test_sql_partop; +drop operator class at_test_sql_partop using btree; +drop function at_test_sql_partop; \ No newline at end of file From 58fd2898036f5773284c5c92efaffc4361917106 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Tue, 30 Jun 2020 17:20:43 +0800 Subject: [PATCH 266/578] Add pg_partition_tree to display information about partitions. http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- doc/src/sgml/func.sgml | 43 ++++++ src/backend/utils/adt/Makefile | 2 +- src/backend/utils/adt/partitionfuncs.c | 154 +++++++++++++++++++ src/include/catalog/pg_proc.h | 3 + src/test/regress/expected/partition_info.out | 114 ++++++++++++++ src/test/regress/parallel_schedule | 2 +- src/test/regress/serial_schedule | 1 + src/test/regress/sql/partition_info.sql | 68 ++++++++ 8 files changed, 385 insertions(+), 2 deletions(-) create mode 100644 src/backend/utils/adt/partitionfuncs.c create mode 100644 src/test/regress/expected/partition_info.out create mode 100644 src/test/regress/sql/partition_info.sql diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml index 6729c562..cb4821d5 100644 --- a/doc/src/sgml/func.sgml +++ b/doc/src/sgml/func.sgml @@ -19824,6 +19824,49 @@ postgres=# SELECT * FROM pg_walfile_name_offset(pg_stop_backup()); The function returns the number of new collation objects it created. + + Partitioning Information Functions + + + Name Return Type Description + + + + + pg_partition_tree(regclass) + setof record + + List information about tables or indexes in a partition tree for a + given partitioned table or partitioned index, with one row for each + partition. Information provided includes the name of the partition, + the name of its immediate parent, a boolean value telling if the + partition is a leaf, and an integer telling its level in the hierarchy. + The value of level begins at 0 for the input table + or index in its role as the root of the partition tree, + 1 for its partitions, 2 for + their partitions, and so on. + + + + +
+ + + To check the total size of the data contained in + measurement table described in + , one could use the + following query: + + + +=# SELECT pg_size_pretty(sum(pg_relation_size(relid))) AS total_size + FROM pg_partition_tree('measurement'); + total_size +------------ + 24 kB +(1 row) + + diff --git a/src/backend/utils/adt/Makefile b/src/backend/utils/adt/Makefile index 1fb01841..1e6765fc 100644 --- a/src/backend/utils/adt/Makefile +++ b/src/backend/utils/adt/Makefile @@ -19,7 +19,7 @@ OBJS = acl.o amutils.o arrayfuncs.o array_expanded.o array_selfuncs.o \ jsonfuncs.o like.o lockfuncs.o mac.o mac8.o misc.o nabstime.o name.o \ network.o network_gist.o network_selfuncs.o network_spgist.o \ numeric.o numutils.o oid.o oracle_compat.o \ - orderedsetaggs.o pg_locale.o pg_lsn.o pg_upgrade_support.o \ + orderedsetaggs.o partitionfuncs.o pg_locale.o pg_lsn.o pg_upgrade_support.o \ pgstatfuncs.o \ pseudotypes.o quote.o rangetypes.o rangetypes_gist.o \ rangetypes_selfuncs.o rangetypes_spgist.o rangetypes_typanalyze.o \ diff --git a/src/backend/utils/adt/partitionfuncs.c b/src/backend/utils/adt/partitionfuncs.c new file mode 100644 index 00000000..8f9218ad --- /dev/null +++ b/src/backend/utils/adt/partitionfuncs.c @@ -0,0 +1,154 @@ +/*------------------------------------------------------------------------- + * + * partitionfuncs.c + * Functions for accessing partition-related metadata + * + * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/utils/adt/partitionfuncs.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/htup_details.h" +#include "catalog/partition.h" +#include "catalog/pg_class.h" +#include "catalog/pg_inherits.h" +#include "catalog/pg_type.h" +#include "funcapi.h" +#include "utils/fmgrprotos.h" +#include "utils/lsyscache.h" + + +/* + * pg_partition_tree + * + * Produce a view with one row per member of a partition tree, beginning + * from the top-most parent given by the caller. This gives information + * about each partition, its immediate partitioned parent, if it is + * a leaf partition and its level in the hierarchy. + */ +Datum +pg_partition_tree(PG_FUNCTION_ARGS) +{ +#define PG_PARTITION_TREE_COLS 4 + Oid rootrelid = PG_GETARG_OID(0); + char relkind = get_rel_relkind(rootrelid); + FuncCallContext *funcctx; + ListCell **next; + + /* Only allow relation types that can appear in partition trees. */ + if (relkind != RELKIND_RELATION && + relkind != RELKIND_FOREIGN_TABLE && + relkind != RELKIND_INDEX && + relkind != RELKIND_PARTITIONED_TABLE && + relkind != RELKIND_PARTITIONED_INDEX) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("\"%s\" is not a table, a foreign table, or an index", + get_rel_name(rootrelid)))); + + /* stuff done only on the first call of the function */ + if (SRF_IS_FIRSTCALL()) + { + MemoryContext oldcxt; + TupleDesc tupdesc; + List *partitions; + + /* create a function context for cross-call persistence */ + funcctx = SRF_FIRSTCALL_INIT(); + + /* switch to memory context appropriate for multiple function calls */ + oldcxt = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); + + /* + * Find all members of inheritance set. We only need AccessShareLock + * on the children for the partition information lookup. 
+ */ + partitions = find_all_inheritors(rootrelid, AccessShareLock, NULL); + + tupdesc = CreateTemplateTupleDesc(PG_PARTITION_TREE_COLS, false); + TupleDescInitEntry(tupdesc, (AttrNumber) 1, "relid", + REGCLASSOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 2, "parentid", + REGCLASSOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 3, "isleaf", + BOOLOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 4, "level", + INT4OID, -1, 0); + + funcctx->tuple_desc = BlessTupleDesc(tupdesc); + + /* allocate memory for user context */ + next = (ListCell **) palloc(sizeof(ListCell *)); + *next = list_head(partitions); + funcctx->user_fctx = (void *) next; + + MemoryContextSwitchTo(oldcxt); + } + + /* stuff done on every call of the function */ + funcctx = SRF_PERCALL_SETUP(); + next = (ListCell **) funcctx->user_fctx; + + if (*next != NULL) + { + Datum result; + Datum values[PG_PARTITION_TREE_COLS]; + bool nulls[PG_PARTITION_TREE_COLS]; + HeapTuple tuple; + Oid parentid = InvalidOid; + Oid relid = lfirst_oid(*next); + char relkind = get_rel_relkind(relid); + int level = 0; + List *ancestors = get_partition_ancestors(lfirst_oid(*next)); + ListCell *lc; + + /* + * Form tuple with appropriate data. + */ + MemSet(nulls, 0, sizeof(nulls)); + MemSet(values, 0, sizeof(values)); + + /* relid */ + values[0] = ObjectIdGetDatum(relid); + + /* parentid */ + if (ancestors != NIL) + parentid = linitial_oid(ancestors); + if (OidIsValid(parentid)) + values[1] = ObjectIdGetDatum(parentid); + else + nulls[1] = true; + + /* isleaf */ + values[2] = BoolGetDatum(relkind != RELKIND_PARTITIONED_TABLE && + relkind != RELKIND_PARTITIONED_INDEX); + + /* level */ + if (relid != rootrelid) + { + foreach(lc, ancestors) + { + level++; + if (lfirst_oid(lc) == rootrelid) + break; + } + } + values[3] = Int32GetDatum(level); + + *next = lnext(*next); + + tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls); + result = HeapTupleGetDatum(tuple); + SRF_RETURN_NEXT(funcctx, result); + } + + /* done when there are no more elements left */ + SRF_RETURN_DONE(funcctx); +} diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h index 51adc65a..bd1481cf 100644 --- a/src/include/catalog/pg_proc.h +++ b/src/include/catalog/pg_proc.h @@ -5710,6 +5710,9 @@ DESCR("list of files in the WAL directory"); /* hash partitioning constraint function */ DATA(insert OID = 4687 ( satisfies_hash_partition PGNSP PGUID 12 1 0 2276 0 f f f f f f i s 4 0 16 "26 23 23 2276" _null_ "{i,i,i,v}" _null_ _null_ _null_ satisfies_hash_partition _null_ _null_ _null_ )); DESCR("hash partition CHECK constraint"); +/* information about a partition tree */ +DATA(insert OID = 4688 ( pg_partition_tree PGNSP PGUID 12 1 1000 0 0 f f f f t t v s 1 0 2249 "2205" "{2205,2205,2205,16,23}" "{i,o,o,o,o}" "{rootrelid,relid,parentrelid,isleaf,level}" _null_ _null_ pg_partition_tree _null_ _null_ _null_ )); +DESCR("view partition tree tables"); DATA(insert OID = 3410 ( pg_extent_info PGNSP PGUID 12 10 20 0 0 f f f f f t v s 1 0 2249 "2205" "{23,16,23,23,23,23,23,23,23}" "{o,o,o,o,o,o,o,o,o}" "{eid,is_occupied,shardid,freespace_cat,hwm,scan_next,scan_prev,alloc_next,alloc_prev}" _null_ _null_ pg_extent_info_oid _null_ _null_ _null_ )); DESCR("get extent info of a relation"); DATA(insert OID = 3411 ( pg_shard_scan_list PGNSP PGUID 12 10 20 0 0 f f f f f t v s 2 0 2249 "2205 23" "{23,16,23,23,23,23}" "{o,o,o,o,o,o}" "{eid,is_occupied,shardid,freespace_cat,hwm,scan_next}" _null_ _null_ pg_shard_scan_list_oid _null_ _null_ _null_ )); diff --git 
a/src/test/regress/expected/partition_info.out b/src/test/regress/expected/partition_info.out new file mode 100644 index 00000000..6b116125 --- /dev/null +++ b/src/test/regress/expected/partition_info.out @@ -0,0 +1,114 @@ +-- +-- Tests for pg_partition_tree +-- +SELECT * FROM pg_partition_tree(NULL); + relid | parentrelid | isleaf | level +-------+-------------+--------+------- +(0 rows) + +-- Test table partition trees +CREATE TABLE ptif_test (a int, b int) PARTITION BY range (a); +CREATE TABLE ptif_test0 PARTITION OF ptif_test + FOR VALUES FROM (minvalue) TO (0) PARTITION BY list (b); +CREATE TABLE ptif_test01 PARTITION OF ptif_test0 FOR VALUES IN (1); +CREATE TABLE ptif_test1 PARTITION OF ptif_test + FOR VALUES FROM (0) TO (100) PARTITION BY list (b); +CREATE TABLE ptif_test11 PARTITION OF ptif_test1 FOR VALUES IN (1); +CREATE TABLE ptif_test2 PARTITION OF ptif_test + FOR VALUES FROM (100) TO (maxvalue); +-- Test index partition tree +CREATE INDEX ptif_test_index ON ONLY ptif_test (a); +CREATE INDEX ptif_test0_index ON ONLY ptif_test0 (a); +ALTER INDEX ptif_test_index ATTACH PARTITION ptif_test0_index; +CREATE INDEX ptif_test01_index ON ptif_test01 (a); +ALTER INDEX ptif_test0_index ATTACH PARTITION ptif_test01_index; +CREATE INDEX ptif_test1_index ON ONLY ptif_test1 (a); +ALTER INDEX ptif_test_index ATTACH PARTITION ptif_test1_index; +CREATE INDEX ptif_test11_index ON ptif_test11 (a); +ALTER INDEX ptif_test1_index ATTACH PARTITION ptif_test11_index; +CREATE INDEX ptif_test2_index ON ptif_test2 (a); +ALTER INDEX ptif_test_index ATTACH PARTITION ptif_test2_index; +-- List all tables members of the tree +SELECT relid, parentrelid, level, isleaf + FROM pg_partition_tree('ptif_test'); + relid | parentrelid | level | isleaf +-------------+-------------+-------+-------- + ptif_test | | 0 | f + ptif_test0 | ptif_test | 1 | f + ptif_test1 | ptif_test | 1 | f + ptif_test2 | ptif_test | 1 | t + ptif_test01 | ptif_test0 | 2 | t + ptif_test11 | ptif_test1 | 2 | t +(6 rows) + +-- List tables from an intermediate level +SELECT relid, parentrelid, level, isleaf + FROM pg_partition_tree('ptif_test0') p + JOIN pg_class c ON (p.relid = c.oid); + relid | parentrelid | level | isleaf +-------------+-------------+-------+-------- + ptif_test0 | ptif_test | 0 | f + ptif_test01 | ptif_test0 | 1 | t +(2 rows) + +-- List from leaf table +SELECT relid, parentrelid, level, isleaf + FROM pg_partition_tree('ptif_test01') p + JOIN pg_class c ON (p.relid = c.oid); + relid | parentrelid | level | isleaf +-------------+-------------+-------+-------- + ptif_test01 | ptif_test0 | 0 | t +(1 row) + +-- List all indexes members of the tree +SELECT relid, parentrelid, level, isleaf + FROM pg_partition_tree('ptif_test_index'); + relid | parentrelid | level | isleaf +-------------------+------------------+-------+-------- + ptif_test_index | | 0 | f + ptif_test0_index | ptif_test_index | 1 | f + ptif_test1_index | ptif_test_index | 1 | f + ptif_test2_index | ptif_test_index | 1 | t + ptif_test01_index | ptif_test0_index | 2 | t + ptif_test11_index | ptif_test1_index | 2 | t +(6 rows) + +-- List indexes from an intermediate level +SELECT relid, parentrelid, level, isleaf + FROM pg_partition_tree('ptif_test0_index') p + JOIN pg_class c ON (p.relid = c.oid); + relid | parentrelid | level | isleaf +-------------------+------------------+-------+-------- + ptif_test0_index | ptif_test_index | 0 | f + ptif_test01_index | ptif_test0_index | 1 | t +(2 rows) + +-- List from leaf index +SELECT relid, parentrelid, level, isleaf + FROM 
pg_partition_tree('ptif_test01_index') p + JOIN pg_class c ON (p.relid = c.oid); + relid | parentrelid | level | isleaf +-------------------+------------------+-------+-------- + ptif_test01_index | ptif_test0_index | 0 | t +(1 row) + +DROP TABLE ptif_test; +-- A table not part of a partition tree works is the only member listed. +CREATE TABLE ptif_normal_table(a int); +SELECT relid, parentrelid, level, isleaf + FROM pg_partition_tree('ptif_normal_table'); + relid | parentrelid | level | isleaf +-------------------+-------------+-------+-------- + ptif_normal_table | | 0 | t +(1 row) + +DROP TABLE ptif_normal_table; +-- Views and materialized viewS cannot be part of a partition tree. +CREATE VIEW ptif_test_view AS SELECT 1; +CREATE MATERIALIZED VIEW ptif_test_matview AS SELECT 1; +SELECT * FROM pg_partition_tree('ptif_test_view'); +ERROR: "ptif_test_view" is not a table, a foreign table, or an index +SELECT * FROM pg_partition_tree('ptif_test_matview'); +ERROR: "ptif_test_matview" is not a table, a foreign table, or an index +DROP VIEW ptif_test_view; +DROP MATERIALIZED VIEW ptif_test_matview; diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule index 905cb00a..95fafcd7 100644 --- a/src/test/regress/parallel_schedule +++ b/src/test/regress/parallel_schedule @@ -134,7 +134,7 @@ test: plancache limit plpgsql copy2 temp domain prepare without_oid conversion t # ---------- # Another group of parallel tests # ---------- -test: identity partition_join partition_prune partition_prune_hash hash_part +test: identity partition_join partition_prune partition_prune_hash hash_part partition_info # event triggers cannot run concurrently with any test that runs DDL test: event_trigger diff --git a/src/test/regress/serial_schedule b/src/test/regress/serial_schedule index 1f00bfbc..f91b37b9 100644 --- a/src/test/regress/serial_schedule +++ b/src/test/regress/serial_schedule @@ -194,6 +194,7 @@ test: identity test: partition_join test: partition_prune test: partition_prune_hash +test: partition_info test: hash_part test: event_trigger test: fast_default diff --git a/src/test/regress/sql/partition_info.sql b/src/test/regress/sql/partition_info.sql new file mode 100644 index 00000000..5a76f22b --- /dev/null +++ b/src/test/regress/sql/partition_info.sql @@ -0,0 +1,68 @@ +-- +-- Tests for pg_partition_tree +-- +SELECT * FROM pg_partition_tree(NULL); + +-- Test table partition trees +CREATE TABLE ptif_test (a int, b int) PARTITION BY range (a); +CREATE TABLE ptif_test0 PARTITION OF ptif_test + FOR VALUES FROM (minvalue) TO (0) PARTITION BY list (b); +CREATE TABLE ptif_test01 PARTITION OF ptif_test0 FOR VALUES IN (1); +CREATE TABLE ptif_test1 PARTITION OF ptif_test + FOR VALUES FROM (0) TO (100) PARTITION BY list (b); +CREATE TABLE ptif_test11 PARTITION OF ptif_test1 FOR VALUES IN (1); +CREATE TABLE ptif_test2 PARTITION OF ptif_test + FOR VALUES FROM (100) TO (maxvalue); + +-- Test index partition tree +CREATE INDEX ptif_test_index ON ONLY ptif_test (a); +CREATE INDEX ptif_test0_index ON ONLY ptif_test0 (a); +ALTER INDEX ptif_test_index ATTACH PARTITION ptif_test0_index; +CREATE INDEX ptif_test01_index ON ptif_test01 (a); +ALTER INDEX ptif_test0_index ATTACH PARTITION ptif_test01_index; +CREATE INDEX ptif_test1_index ON ONLY ptif_test1 (a); +ALTER INDEX ptif_test_index ATTACH PARTITION ptif_test1_index; +CREATE INDEX ptif_test11_index ON ptif_test11 (a); +ALTER INDEX ptif_test1_index ATTACH PARTITION ptif_test11_index; +CREATE INDEX ptif_test2_index ON ptif_test2 (a); +ALTER INDEX 
ptif_test_index ATTACH PARTITION ptif_test2_index; + +-- List all tables members of the tree +SELECT relid, parentrelid, level, isleaf + FROM pg_partition_tree('ptif_test'); +-- List tables from an intermediate level +SELECT relid, parentrelid, level, isleaf + FROM pg_partition_tree('ptif_test0') p + JOIN pg_class c ON (p.relid = c.oid); +-- List from leaf table +SELECT relid, parentrelid, level, isleaf + FROM pg_partition_tree('ptif_test01') p + JOIN pg_class c ON (p.relid = c.oid); + +-- List all indexes members of the tree +SELECT relid, parentrelid, level, isleaf + FROM pg_partition_tree('ptif_test_index'); +-- List indexes from an intermediate level +SELECT relid, parentrelid, level, isleaf + FROM pg_partition_tree('ptif_test0_index') p + JOIN pg_class c ON (p.relid = c.oid); +-- List from leaf index +SELECT relid, parentrelid, level, isleaf + FROM pg_partition_tree('ptif_test01_index') p + JOIN pg_class c ON (p.relid = c.oid); + +DROP TABLE ptif_test; + +-- A table not part of a partition tree works is the only member listed. +CREATE TABLE ptif_normal_table(a int); +SELECT relid, parentrelid, level, isleaf + FROM pg_partition_tree('ptif_normal_table'); +DROP TABLE ptif_normal_table; + +-- Views and materialized viewS cannot be part of a partition tree. +CREATE VIEW ptif_test_view AS SELECT 1; +CREATE MATERIALIZED VIEW ptif_test_matview AS SELECT 1; +SELECT * FROM pg_partition_tree('ptif_test_view'); +SELECT * FROM pg_partition_tree('ptif_test_matview'); +DROP VIEW ptif_test_view; +DROP MATERIALIZED VIEW ptif_test_matview; From 62b8223aa66cb27f23294f6cd9f184a12de4c41b Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Tue, 30 Jun 2020 17:52:18 +0800 Subject: [PATCH 267/578] Fix tablespace handling for partitioned indexes --- src/backend/catalog/heap.c | 9 ++++ src/backend/commands/tablecmds.c | 60 +++++++++++++++++++++-- src/test/regress/input/tablespace.source | 10 ++++ src/test/regress/output/tablespace.source | 19 ++++++- 4 files changed, 94 insertions(+), 4 deletions(-) diff --git a/src/backend/catalog/heap.c b/src/backend/catalog/heap.c index 57e486f1..ff83af36 100644 --- a/src/backend/catalog/heap.c +++ b/src/backend/catalog/heap.c @@ -419,6 +419,15 @@ heap_create(const char *relname, */ reltablespace = InvalidOid; break; + + case RELKIND_PARTITIONED_INDEX: + /* + * Preserve tablespace so that it's used as tablespace for indexes + * on future partitions. 
+ */ + create_storage = false; + break; + case RELKIND_SEQUENCE: create_storage = true; diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index f976badd..fa871fd8 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -495,6 +495,7 @@ static bool ATPrepChangePersistence(Relation rel, bool toLogged); static void ATPrepSetTableSpace(AlteredTableInfo *tab, Relation rel, char *tablespacename, LOCKMODE lockmode); static void ATExecSetTableSpace(Oid tableOid, Oid newTableSpace, LOCKMODE lockmode); +static void ATExecPartedIdxSetTableSpace(Relation rel, Oid newTableSpace); static void ATExecSetRelOptions(Relation rel, List *defList, AlterTableType operation, LOCKMODE lockmode); @@ -4904,7 +4905,8 @@ ATPrepCmd(List **wqueue, Relation rel, AlterTableCmd *cmd, pass = AT_PASS_DROP; break; case AT_SetTableSpace: /* SET TABLESPACE */ - ATSimplePermissions(rel, ATT_TABLE | ATT_MATVIEW | ATT_INDEX); + ATSimplePermissions(rel, ATT_TABLE | ATT_MATVIEW | ATT_INDEX | + ATT_PARTITIONED_INDEX); /* This command never recurses */ ATPrepSetTableSpace(tab, rel, cmd->name, lockmode); pass = AT_PASS_MISC; /* doesn't actually matter */ @@ -5278,10 +5280,13 @@ ATExecCmd(List **wqueue, AlteredTableInfo *tab, Relation rel, */ break; case AT_SetTableSpace: /* SET TABLESPACE */ - /* - * Nothing to do here; Phase 3 does the work + * Only do this for partitioned indexes, for which this is just + * a catalog change. Other relation types are handled by Phase 3. */ + if (rel->rd_rel->relkind == RELKIND_PARTITIONED_INDEX) + ATExecPartedIdxSetTableSpace(rel, tab->newTableSpace); + break; case AT_SetRelOptions: /* SET (...) */ case AT_ResetRelOptions: /* RESET (...) */ @@ -12590,6 +12595,55 @@ ATExecSetTableSpace(Oid tableOid, Oid newTableSpace, LOCKMODE lockmode) list_free(reltoastidxids); } +/* + * Special handling of ALTER TABLE SET TABLESPACE for partitioned indexes, + * which have no storage (so not handled in Phase 3 like other relation types) + */ +static void +ATExecPartedIdxSetTableSpace(Relation rel, Oid newTableSpace) +{ + HeapTuple tuple; + Oid oldTableSpace; + Relation pg_class; + Form_pg_class rd_rel; + Oid indexOid = RelationGetRelid(rel); + + Assert(rel->rd_rel->relkind == RELKIND_PARTITIONED_INDEX); + + /* + * No work if no change in tablespace. + */ + oldTableSpace = rel->rd_rel->reltablespace; + if (newTableSpace == oldTableSpace || + (newTableSpace == MyDatabaseTableSpace && oldTableSpace == 0)) + { + InvokeObjectPostAlterHook(RelationRelationId, + indexOid, 0); + return; + } + + /* Get a modifiable copy of the relation's pg_class row */ + pg_class = heap_open(RelationRelationId, RowExclusiveLock); + + tuple = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(indexOid)); + if (!HeapTupleIsValid(tuple)) + elog(ERROR, "cache lookup failed for relation %u", indexOid); + rd_rel = (Form_pg_class) GETSTRUCT(tuple); + + /* update the pg_class row */ + rd_rel->reltablespace = (newTableSpace == MyDatabaseTableSpace) ? InvalidOid : newTableSpace; + CatalogTupleUpdate(pg_class, &tuple->t_self, tuple); + + InvokeObjectPostAlterHook(RelationRelationId, indexOid, 0); + + heap_freetuple(tuple); + + heap_close(pg_class, RowExclusiveLock); + + /* Make sure the reltablespace change is visible */ + CommandCounterIncrement(); +} + /* * Alter Table ALL ... 
SET TABLESPACE * diff --git a/src/test/regress/input/tablespace.source b/src/test/regress/input/tablespace.source index 03a62bd7..1454e433 100644 --- a/src/test/regress/input/tablespace.source +++ b/src/test/regress/input/tablespace.source @@ -44,6 +44,14 @@ CREATE INDEX foo_idx on testschema.foo(i) TABLESPACE regress_tblspace; SELECT relname, spcname FROM pg_catalog.pg_tablespace t, pg_catalog.pg_class c where c.reltablespace = t.oid AND c.relname = 'foo_idx'; +-- partitioned index +CREATE TABLE testschema.part (a int) PARTITION BY LIST (a); +CREATE TABLE testschema.part1 PARTITION OF testschema.part FOR VALUES IN (1); +CREATE INDEX part_a_idx ON testschema.part (a) TABLESPACE regress_tblspace; +CREATE TABLE testschema.part2 PARTITION OF testschema.part FOR VALUES IN (2); +SELECT relname, spcname FROM pg_catalog.pg_tablespace t, pg_catalog.pg_class c + where c.reltablespace = t.oid AND c.relname LIKE 'part%_idx'; + -- check that default_tablespace doesn't affect ALTER TABLE index rebuilds CREATE TABLE testschema.test_default_tab(id bigint) TABLESPACE regress_tblspace; INSERT INTO testschema.test_default_tab VALUES (1); @@ -93,6 +101,8 @@ CREATE UNIQUE INDEX anindex ON testschema.atable(column1); ALTER TABLE testschema.atable SET TABLESPACE regress_tblspace; ALTER INDEX testschema.anindex SET TABLESPACE regress_tblspace; +ALTER INDEX testschema.part_a_idx SET TABLESPACE pg_default; +ALTER INDEX testschema.part_a_idx SET TABLESPACE regress_tblspace; INSERT INTO testschema.atable VALUES(3); -- ok INSERT INTO testschema.atable VALUES(1); -- fail (checks index) diff --git a/src/test/regress/output/tablespace.source b/src/test/regress/output/tablespace.source index 40f8a72f..a1a615de 100644 --- a/src/test/regress/output/tablespace.source +++ b/src/test/regress/output/tablespace.source @@ -61,6 +61,20 @@ SELECT relname, spcname FROM pg_catalog.pg_tablespace t, pg_catalog.pg_class c foo_idx | regress_tblspace (1 row) +-- partitioned index +CREATE TABLE testschema.part (a int) PARTITION BY LIST (a); +CREATE TABLE testschema.part1 PARTITION OF testschema.part FOR VALUES IN (1); +CREATE INDEX part_a_idx ON testschema.part (a) TABLESPACE regress_tblspace; +CREATE TABLE testschema.part2 PARTITION OF testschema.part FOR VALUES IN (2); +SELECT relname, spcname FROM pg_catalog.pg_tablespace t, pg_catalog.pg_class c + where c.reltablespace = t.oid AND c.relname LIKE 'part%_idx'; + relname | spcname +-------------+------------------ + part1_a_idx | regress_tblspace + part2_a_idx | regress_tblspace + part_a_idx | regress_tblspace +(3 rows) + -- check that default_tablespace doesn't affect ALTER TABLE index rebuilds CREATE TABLE testschema.test_default_tab(id bigint) TABLESPACE regress_tblspace; INSERT INTO testschema.test_default_tab VALUES (1); @@ -200,6 +214,8 @@ CREATE TABLE testschema.atable AS VALUES (1), (2); CREATE UNIQUE INDEX anindex ON testschema.atable(column1); ALTER TABLE testschema.atable SET TABLESPACE regress_tblspace; ALTER INDEX testschema.anindex SET TABLESPACE regress_tblspace; +ALTER INDEX testschema.part_a_idx SET TABLESPACE pg_default; +ALTER INDEX testschema.part_a_idx SET TABLESPACE regress_tblspace; INSERT INTO testschema.atable VALUES(3); -- ok INSERT INTO testschema.atable VALUES(1); -- fail (checks index) ERROR: duplicate key value violates unique constraint "anindex" @@ -241,9 +257,10 @@ NOTICE: no matching relations in tablespace "regress_tblspace_renamed" found -- Should succeed DROP TABLESPACE regress_tblspace_renamed; DROP SCHEMA testschema CASCADE; -NOTICE: drop cascades 
to 4 other objects +NOTICE: drop cascades to 5 other objects DETAIL: drop cascades to table testschema.foo drop cascades to table testschema.asselect +drop cascades to table testschema.part drop cascades to table testschema.atable drop cascades to table testschema.tablespace_acl DROP ROLE regress_tablespace_user1; From e07fc7615b31017ae3554d2a7a8166b247389765 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Tue, 30 Jun 2020 19:16:12 +0800 Subject: [PATCH 268/578] Optimize nested ConvertRowtypeExpr nodes. http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/backend/optimizer/util/clauses.c | 46 +++++++++++++++++++++++++ src/test/regress/expected/inherit.out | 18 ++++++++++ src/test/regress/expected/inherit_1.out | 16 +++++++++ src/test/regress/expected/inherit_2.out | 16 +++++++++ src/test/regress/expected/inherit_3.out | 16 +++++++++ src/test/regress/sql/inherit.sql | 5 +++ 6 files changed, 117 insertions(+) diff --git a/src/backend/optimizer/util/clauses.c b/src/backend/optimizer/util/clauses.c index 82a9f2ba..697b7dcc 100644 --- a/src/backend/optimizer/util/clauses.c +++ b/src/backend/optimizer/util/clauses.c @@ -3614,6 +3614,52 @@ eval_const_expressions_mutator(Node *node, context); } break; + case T_ConvertRowtypeExpr: + { + ConvertRowtypeExpr *cre = castNode(ConvertRowtypeExpr, node); + Node *arg; + ConvertRowtypeExpr *newcre; + + arg = eval_const_expressions_mutator((Node *) cre->arg, + context); + + newcre = makeNode(ConvertRowtypeExpr); + newcre->resulttype = cre->resulttype; + newcre->convertformat = cre->convertformat; + newcre->location = cre->location; + + /* + * In case of a nested ConvertRowtypeExpr, we can convert the + * leaf row directly to the topmost row format without any + * intermediate conversions. (This works because + * ConvertRowtypeExpr is used only for child->parent + * conversion in inheritance trees, which works by exact match + * of column name, and a column absent in an intermediate + * result can't be present in the final result.) + * + * No need to check more than one level deep, because the + * above recursion will have flattened anything else. + */ + if (arg != NULL && IsA(arg, ConvertRowtypeExpr)) + { + ConvertRowtypeExpr *argcre = (ConvertRowtypeExpr *) arg; + + arg = (Node *) argcre->arg; + + /* + * Make sure an outer implicit conversion can't hide an + * inner explicit one. + */ + if (newcre->convertformat == COERCE_IMPLICIT_CAST) + newcre->convertformat = argcre->convertformat; + } + + newcre->arg = (Expr *) arg; + + if (arg != NULL && IsA(arg, Const)) + return ece_evaluate_expr((Node *) newcre); + return (Node *) newcre; + } default: break; } diff --git a/src/test/regress/expected/inherit.out b/src/test/regress/expected/inherit.out index be0a774d..91c39448 100644 --- a/src/test/regress/expected/inherit.out +++ b/src/test/regress/expected/inherit.out @@ -1001,6 +1001,8 @@ NOTICE: drop cascades to table c1 -- tables. See the pgsql-hackers thread beginning Dec. 4/04 create table base (i integer); create table derived () inherits (base); +create table more_derived (like derived, b int) inherits (derived); +NOTICE: merging column "i" with inherited definition insert into derived (i) values (0); select derived::base from derived; derived @@ -1014,6 +1016,22 @@ select NULL::derived::base; (1 row) +-- remove redundant conversions. 
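-- Why the flattening is safe: each child->parent conversion matches columns by
-- name, so the column b added by more_derived can never survive into the base
-- result.  A hypothetical spot check of that claim (it reuses the tables above
-- but is not part of the regression test itself):
--   select (row(1, 2)::more_derived::derived)::base = row(1, 2)::more_derived::base;
-- Both sides evaluate to the same '(1)'::base row, which is what allows the
-- intermediate ::derived conversion to be dropped in the plans below.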
+explain (verbose on, costs off) select row(i, b)::more_derived::derived::base from more_derived; + QUERY PLAN +------------------------------------------- + Seq Scan on public.more_derived + Output: (ROW(i, b)::more_derived)::base +(2 rows) + +explain (verbose on, costs off) select (1, 2)::more_derived::derived::base; + QUERY PLAN +----------------------- + Result + Output: '(1)'::base +(2 rows) + +drop table more_derived; drop table derived; drop table base; create table p1(ff1 int) distribute by roundrobin; diff --git a/src/test/regress/expected/inherit_1.out b/src/test/regress/expected/inherit_1.out index d16ab5d6..8f7d0e3a 100644 --- a/src/test/regress/expected/inherit_1.out +++ b/src/test/regress/expected/inherit_1.out @@ -994,6 +994,22 @@ select NULL::derived::base; (1 row) +-- remove redundant conversions. +explain (verbose on, costs off) select row(i, b)::more_derived::derived::base from more_derived; + QUERY PLAN +------------------------------------------- + Seq Scan on public.more_derived + Output: (ROW(i, b)::more_derived)::base +(2 rows) + +explain (verbose on, costs off) select (1, 2)::more_derived::derived::base; + QUERY PLAN +----------------------- + Result + Output: '(1)'::base +(2 rows) + +drop table more_derived; drop table derived; drop table base; create table p1(ff1 int) distribute by roundrobin; diff --git a/src/test/regress/expected/inherit_2.out b/src/test/regress/expected/inherit_2.out index 0502f335..65ff71fe 100644 --- a/src/test/regress/expected/inherit_2.out +++ b/src/test/regress/expected/inherit_2.out @@ -996,6 +996,22 @@ select NULL::derived::base; (1 row) +-- remove redundant conversions. +explain (verbose on, costs off) select row(i, b)::more_derived::derived::base from more_derived; + QUERY PLAN +------------------------------------------- + Seq Scan on public.more_derived + Output: (ROW(i, b)::more_derived)::base +(2 rows) + +explain (verbose on, costs off) select (1, 2)::more_derived::derived::base; + QUERY PLAN +----------------------- + Result + Output: '(1)'::base +(2 rows) + +drop table more_derived; drop table derived; drop table base; create table p1(ff1 int) distribute by roundrobin; diff --git a/src/test/regress/expected/inherit_3.out b/src/test/regress/expected/inherit_3.out index 955a1170..707a6f63 100644 --- a/src/test/regress/expected/inherit_3.out +++ b/src/test/regress/expected/inherit_3.out @@ -994,6 +994,22 @@ select NULL::derived::base; (1 row) +-- remove redundant conversions. +explain (verbose on, costs off) select row(i, b)::more_derived::derived::base from more_derived; + QUERY PLAN +------------------------------------------- + Seq Scan on public.more_derived + Output: (ROW(i, b)::more_derived)::base +(2 rows) + +explain (verbose on, costs off) select (1, 2)::more_derived::derived::base; + QUERY PLAN +----------------------- + Result + Output: '(1)'::base +(2 rows) + +drop table more_derived; drop table derived; drop table base; create table p1(ff1 int) distribute by roundrobin; diff --git a/src/test/regress/sql/inherit.sql b/src/test/regress/sql/inherit.sql index ea17dd86..e58bfd36 100644 --- a/src/test/regress/sql/inherit.sql +++ b/src/test/regress/sql/inherit.sql @@ -306,9 +306,14 @@ drop table p1 cascade; -- tables. See the pgsql-hackers thread beginning Dec. 
4/04 create table base (i integer); create table derived () inherits (base); +create table more_derived (like derived, b int) inherits (derived); insert into derived (i) values (0); select derived::base from derived; select NULL::derived::base; +-- remove redundant conversions. +explain (verbose on, costs off) select row(i, b)::more_derived::derived::base from more_derived; +explain (verbose on, costs off) select (1, 2)::more_derived::derived::base; +drop table more_derived; drop table derived; drop table base; From 98a2dfe642fc9aa07b7550fee520bb13f4b3e847 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Tue, 30 Jun 2020 19:40:52 +0800 Subject: [PATCH 269/578] Fix dependency handling of partitions and inheritance for ON COMMIT. http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- doc/src/sgml/ref/create_table.sgml | 7 ++- src/backend/commands/tablecmds.c | 63 ++++++++++++++++------ src/test/regress/expected/temp.out | 85 ++++++++++++++++++++++++++++++ src/test/regress/sql/temp.sql | 59 +++++++++++++++++++++ 4 files changed, 196 insertions(+), 18 deletions(-) diff --git a/doc/src/sgml/ref/create_table.sgml b/doc/src/sgml/ref/create_table.sgml index 62792897..70d3dcfc 100644 --- a/doc/src/sgml/ref/create_table.sgml +++ b/doc/src/sgml/ref/create_table.sgml @@ -1107,7 +1107,8 @@ All rows in the temporary table will be deleted at the end of each transaction block. Essentially, an automatic is done - at each commit. + at each commit. When used on a partitioned table, this + is not cascaded to its partitions. @@ -1117,7 +1118,9 @@ The temporary table will be dropped at the end of the current - transaction block. + transaction block. When used on a partitioned table, this action + drops its partitions and when used on tables with inheritance + children, it drops the dependent children. diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index fa871fd8..062a6439 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -15617,6 +15617,7 @@ PreCommit_on_commit_actions(void) {// #lizard forgives ListCell *l; List *oids_to_truncate = NIL; + List *oids_to_drop = NIL; #ifdef XCP /* @@ -15659,35 +15660,65 @@ PreCommit_on_commit_actions(void) oids_to_truncate = lappend_oid(oids_to_truncate, oc->relid); break; case ONCOMMIT_DROP: + oids_to_drop = lappend_oid(oids_to_drop, oc->relid); + break; + } + } + + /* + * Truncate relations before dropping so that all dependencies between + * relations are removed after they are worked on. Doing it like this + * might be a waste as it is possible that a relation being truncated will + * be dropped anyway due to its parent being dropped, but this makes the + * code more robust because of not having to re-check that the relation + * exists at truncation time. + */ + if (oids_to_truncate != NIL) + { + heap_truncate(oids_to_truncate); + CommandCounterIncrement(); /* XXX needed? */ + } + if (oids_to_drop != NIL) + { + ObjectAddresses *targetObjects = new_object_addresses(); + ListCell *l; + + foreach(l, oids_to_drop) { ObjectAddress object; object.classId = RelationRelationId; - object.objectId = oc->relid; + object.objectId = lfirst_oid(l); object.objectSubId = 0; + Assert(!object_address_present(&object, targetObjects)); + + add_exact_object_address(&object, targetObjects); + } + /* - * Since this is an automatic drop, rather than one - * directly initiated by the user, we pass the - * PERFORM_DELETION_INTERNAL flag. 
+ * Since this is an automatic drop, rather than one directly initiated + * by the user, we pass the PERFORM_DELETION_INTERNAL flag. */ - performDeletion(&object, - DROP_CASCADE, PERFORM_DELETION_INTERNAL); + performMultipleDeletions(targetObjects, DROP_CASCADE, + PERFORM_DELETION_INTERNAL | PERFORM_DELETION_QUIETLY); + +#ifdef USE_ASSERT_CHECKING /* - * Note that table deletion will call - * remove_on_commit_action, so the entry should get marked - * as deleted. + * Note that table deletion will call remove_on_commit_action, so the + * entry should get marked as deleted. */ + foreach(l, on_commits) + { + OnCommitItem *oc = (OnCommitItem *) lfirst(l); + + if (oc->oncommit != ONCOMMIT_DROP) + continue; + Assert(oc->deleting_subid != InvalidSubTransactionId); - break; - } } - } - if (oids_to_truncate != NIL) - { - heap_truncate(oids_to_truncate); - CommandCounterIncrement(); /* XXX needed? */ +#endif } } diff --git a/src/test/regress/expected/temp.out b/src/test/regress/expected/temp.out index 0c4ac2ea..ee8f251d 100644 --- a/src/test/regress/expected/temp.out +++ b/src/test/regress/expected/temp.out @@ -191,3 +191,88 @@ select pg_temp.whoami(); (1 row) drop table public.whereami; +-- Check dependencies between ON COMMIT actions with a partitioned +-- table and its partitions. Using ON COMMIT DROP on a parent removes +-- the whole set. +begin; +create temp table temp_parted_oncommit_test (a int) + partition by list (a) on commit drop; +create temp table temp_parted_oncommit_test1 + partition of temp_parted_oncommit_test + for values in (1) on commit delete rows; +create temp table temp_parted_oncommit_test2 + partition of temp_parted_oncommit_test + for values in (2) on commit drop; +insert into temp_parted_oncommit_test values (1), (2); +commit; +-- no relations remain in this case. +select relname from pg_class where relname like 'temp_parted_oncommit_test%'; + relname +--------- +(0 rows) + +-- Using ON COMMIT DELETE on a partitioned table does not remove +-- all rows if partitions preserve their data. +begin; +create temp table temp_parted_oncommit_test (a int) + partition by list (a) on commit delete rows; +create temp table temp_parted_oncommit_test1 + partition of temp_parted_oncommit_test + for values in (1) on commit preserve rows; +create temp table temp_parted_oncommit_test2 + partition of temp_parted_oncommit_test + for values in (2) on commit drop; +insert into temp_parted_oncommit_test values (1), (2); +commit; +-- Data from the remaining partition is still here as its rows are +-- preserved. +select * from temp_parted_oncommit_test; + a +--- + 1 +(1 row) + +-- two relations remain in this case. +select relname from pg_class where relname like 'temp_parted_oncommit_test%'; + relname +---------------------------- + temp_parted_oncommit_test + temp_parted_oncommit_test1 +(2 rows) + +drop table temp_parted_oncommit_test; +-- Check dependencies between ON COMMIT actions with inheritance trees. +-- Using ON COMMIT DROP on a parent removes the whole set. +begin; +create temp table temp_inh_oncommit_test (a int) on commit drop; +create temp table temp_inh_oncommit_test1 () + inherits(temp_inh_oncommit_test) on commit delete rows; +insert into temp_inh_oncommit_test1 values (1); +commit; +-- no relations remain in this case +select relname from pg_class where relname like 'temp_inh_oncommit_test%'; + relname +--------- +(0 rows) + +-- Data on the parent is removed, and the child goes away. 
+begin; +create temp table temp_inh_oncommit_test (a int) on commit delete rows; +create temp table temp_inh_oncommit_test1 () + inherits(temp_inh_oncommit_test) on commit drop; +insert into temp_inh_oncommit_test1 values (1); +insert into temp_inh_oncommit_test values (1); +commit; +select * from temp_inh_oncommit_test; + a +--- +(0 rows) + +-- one relation remains +select relname from pg_class where relname like 'temp_inh_oncommit_test%'; + relname +------------------------ + temp_inh_oncommit_test +(1 row) + +drop table temp_inh_oncommit_test; diff --git a/src/test/regress/sql/temp.sql b/src/test/regress/sql/temp.sql index 6c3fc018..efac176f 100644 --- a/src/test/regress/sql/temp.sql +++ b/src/test/regress/sql/temp.sql @@ -151,3 +151,62 @@ select whoami(); select pg_temp.whoami(); drop table public.whereami; + +-- Check dependencies between ON COMMIT actions with a partitioned +-- table and its partitions. Using ON COMMIT DROP on a parent removes +-- the whole set. +begin; +create temp table temp_parted_oncommit_test (a int) + partition by list (a) on commit drop; +create temp table temp_parted_oncommit_test1 + partition of temp_parted_oncommit_test + for values in (1) on commit delete rows; +create temp table temp_parted_oncommit_test2 + partition of temp_parted_oncommit_test + for values in (2) on commit drop; +insert into temp_parted_oncommit_test values (1), (2); +commit; +-- no relations remain in this case. +select relname from pg_class where relname like 'temp_parted_oncommit_test%'; +-- Using ON COMMIT DELETE on a partitioned table does not remove +-- all rows if partitions preserve their data. +begin; +create temp table temp_parted_oncommit_test (a int) + partition by list (a) on commit delete rows; +create temp table temp_parted_oncommit_test1 + partition of temp_parted_oncommit_test + for values in (1) on commit preserve rows; +create temp table temp_parted_oncommit_test2 + partition of temp_parted_oncommit_test + for values in (2) on commit drop; +insert into temp_parted_oncommit_test values (1), (2); +commit; +-- Data from the remaining partition is still here as its rows are +-- preserved. +select * from temp_parted_oncommit_test; +-- two relations remain in this case. +select relname from pg_class where relname like 'temp_parted_oncommit_test%'; +drop table temp_parted_oncommit_test; + +-- Check dependencies between ON COMMIT actions with inheritance trees. +-- Using ON COMMIT DROP on a parent removes the whole set. +begin; +create temp table temp_inh_oncommit_test (a int) on commit drop; +create temp table temp_inh_oncommit_test1 () + inherits(temp_inh_oncommit_test) on commit delete rows; +insert into temp_inh_oncommit_test1 values (1); +commit; +-- no relations remain in this case +select relname from pg_class where relname like 'temp_inh_oncommit_test%'; +-- Data on the parent is removed, and the child goes away. 
+begin; +create temp table temp_inh_oncommit_test (a int) on commit delete rows; +create temp table temp_inh_oncommit_test1 () + inherits(temp_inh_oncommit_test) on commit drop; +insert into temp_inh_oncommit_test1 values (1); +insert into temp_inh_oncommit_test values (1); +commit; +select * from temp_inh_oncommit_test; +-- one relation remains +select relname from pg_class where relname like 'temp_inh_oncommit_test%'; +drop table temp_inh_oncommit_test; \ No newline at end of file From 10f8d2a0f55abe970a3c9ae8bcf2df4f994d83f8 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Tue, 30 Jun 2020 20:09:23 +0800 Subject: [PATCH 270/578] Disallow COPY FREEZE on partitioned tables.http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- doc/src/sgml/perform.sgml | 3 ++- doc/src/sgml/ref/copy.sgml | 4 +++- src/backend/commands/copy.c | 29 +++++++++++++++++++++++++++-- 3 files changed, 32 insertions(+), 4 deletions(-) diff --git a/doc/src/sgml/perform.sgml b/doc/src/sgml/perform.sgml index 454c3f1f..70483f73 100644 --- a/doc/src/sgml/perform.sgml +++ b/doc/src/sgml/perform.sgml @@ -1546,7 +1546,8 @@ SELECT * FROM x, y, a, b, c WHERE something AND somethingelse; needs to be written, because in case of an error, the files containing the newly loaded data will be removed anyway. However, this consideration only applies when - is minimal as all commands + is minimal for + non-partitioned tables as all commands must write WAL otherwise. diff --git a/doc/src/sgml/ref/copy.sgml b/doc/src/sgml/ref/copy.sgml index 48f0c5c7..84edcac5 100644 --- a/doc/src/sgml/ref/copy.sgml +++ b/doc/src/sgml/ref/copy.sgml @@ -230,7 +230,9 @@ COPY { table_name [ ( COPY FREEZE on + a partitioned table. Note that all other sessions will immediately be able to see the data diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c index 5b7eb4b9..533187a0 100644 --- a/src/backend/commands/copy.c +++ b/src/backend/commands/copy.c @@ -3064,11 +3064,20 @@ CopyFrom(CopyState cstate) * go into pages containing tuples from any other transactions --- but this * must be the case if we have a new table or new relfilenode, so we need * no additional work to enforce that. + * + * We currently don't support this optimization if the COPY target is a + * partitioned table as we currently only lazily initialize partition + * information when routing the first tuple to the partition. We cannot + * know at this stage if we can perform this optimization. It should be + * possible to improve on this, but it does mean maintaining heap insert + * option flags per partition and setting them when we first open the + * partition. *---------- */ /* createSubid is creation check, newRelfilenodeSubid is truncation check */ - if (cstate->rel->rd_createSubid != InvalidSubTransactionId || - cstate->rel->rd_newRelfilenodeSubid != InvalidSubTransactionId) + if (cstate->rel->rd_rel->relkind != RELKIND_PARTITIONED_TABLE && + (cstate->rel->rd_createSubid != InvalidSubTransactionId || + cstate->rel->rd_newRelfilenodeSubid != InvalidSubTransactionId)) { hi_options |= HEAP_INSERT_SKIP_FSM; if (!XLogIsNeeded()) @@ -3085,6 +3094,22 @@ CopyFrom(CopyState cstate) */ if (cstate->freeze) { + /* + * We currently disallow COPY FREEZE on partitioned tables. The + * reason for this is that we've simply not yet opened the partitions + * to determine if the optimization can be applied to them. We could + * go and open them all here, but doing so may be quite a costly + * overhead for small copies. 
In any case, we may just end up routing + * tuples to a small number of partitions. It seems better just to + * raise an ERROR for partitioned tables. + */ + if (cstate->rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) + { + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot perform FREEZE on a partitioned table"))); + } + if (!ThereAreNoPriorRegisteredSnapshots() || !ThereAreNoReadyPortals()) ereport(ERROR, (errcode(ERRCODE_INVALID_TRANSACTION_STATE), From c2687280400bd3d646ccc43e5fd4bd33a9dd9cc1 Mon Sep 17 00:00:00 2001 From: Alvaro Herrera Date: Fri, 23 Nov 2018 08:44:15 -0300 Subject: [PATCH 271/578] Don't allow partitioned indexes in pg_global tablespace Missing in dfa608141982. Author: David Rowley Discussion: https://postgr.es/m/CAKJS1f-M3NMTCpv=vDfkoqHbMPFf=3-Z1ud=+1DHH00tC+zLaQ@mail.gmail.com --- src/backend/commands/tablecmds.c | 6 ++++++ src/test/regress/input/tablespace.source | 1 + src/test/regress/output/tablespace.source | 2 ++ 3 files changed, 9 insertions(+) diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index 062a6439..0bc4e296 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -12610,6 +12610,12 @@ ATExecPartedIdxSetTableSpace(Relation rel, Oid newTableSpace) Assert(rel->rd_rel->relkind == RELKIND_PARTITIONED_INDEX); + /* Can't allow a non-shared relation in pg_global */ + if (newTableSpace == GLOBALTABLESPACE_OID) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("only shared relations can be placed in pg_global tablespace"))); + /* * No work if no change in tablespace. */ diff --git a/src/test/regress/input/tablespace.source b/src/test/regress/input/tablespace.source index 1454e433..e4e4cf0e 100644 --- a/src/test/regress/input/tablespace.source +++ b/src/test/regress/input/tablespace.source @@ -101,6 +101,7 @@ CREATE UNIQUE INDEX anindex ON testschema.atable(column1); ALTER TABLE testschema.atable SET TABLESPACE regress_tblspace; ALTER INDEX testschema.anindex SET TABLESPACE regress_tblspace; +ALTER INDEX testschema.part_a_idx SET TABLESPACE pg_global; ALTER INDEX testschema.part_a_idx SET TABLESPACE pg_default; ALTER INDEX testschema.part_a_idx SET TABLESPACE regress_tblspace; diff --git a/src/test/regress/output/tablespace.source b/src/test/regress/output/tablespace.source index a1a615de..8fa26db8 100644 --- a/src/test/regress/output/tablespace.source +++ b/src/test/regress/output/tablespace.source @@ -214,6 +214,8 @@ CREATE TABLE testschema.atable AS VALUES (1), (2); CREATE UNIQUE INDEX anindex ON testschema.atable(column1); ALTER TABLE testschema.atable SET TABLESPACE regress_tblspace; ALTER INDEX testschema.anindex SET TABLESPACE regress_tblspace; +ALTER INDEX testschema.part_a_idx SET TABLESPACE pg_global; +ERROR: only shared relations can be placed in pg_global tablespace ALTER INDEX testschema.part_a_idx SET TABLESPACE pg_default; ALTER INDEX testschema.part_a_idx SET TABLESPACE regress_tblspace; INSERT INTO testschema.atable VALUES(3); -- ok From eab56b88c0a7264119e14f0ea99822b79bb846fc Mon Sep 17 00:00:00 2001 From: Michael Paquier Date: Wed, 12 Dec 2018 09:49:39 +0900 Subject: [PATCH 272/578] Tweak pg_partition_tree for undefined relations and unsupported relkinds This fixes a crash which happened when calling the function directly with a relation OID referring to a non-existing object, and changes the behavior so as NULL is returned for unsupported relkinds instead of generating an error. 
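With the lenient behavior, a query that applies the function to every row of
pg_class simply gets NULL columns back for relations that cannot be part of a
partition tree, so they are easy to filter out.  One possible formulation of
such a scan (a sketch only, not part of this patch or its tests):

    SELECT c.oid::regclass AS rel, p.*
      FROM pg_class c, LATERAL pg_partition_tree(c.oid) p
     WHERE p.relid IS NOT NULL;
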
This puts the new function in line with many other system functions, and eases actions like full scans of pg_class. Author: Michael Paquier Reviewed-by: Amit Langote, Stephen Frost Discussion: https://postgr.es/m/20181207010406.GO2407@paquier.xyz --- src/backend/utils/adt/partitionfuncs.c | 11 ++++++----- src/test/regress/expected/partition_info.out | 18 ++++++++++++++++-- src/test/regress/sql/partition_info.sql | 1 + 3 files changed, 23 insertions(+), 7 deletions(-) diff --git a/src/backend/utils/adt/partitionfuncs.c b/src/backend/utils/adt/partitionfuncs.c index 8f9218ad..2c9fcd1f 100644 --- a/src/backend/utils/adt/partitionfuncs.c +++ b/src/backend/utils/adt/partitionfuncs.c @@ -23,6 +23,7 @@ #include "funcapi.h" #include "utils/fmgrprotos.h" #include "utils/lsyscache.h" +#include "utils/syscache.h" /* @@ -42,16 +43,16 @@ pg_partition_tree(PG_FUNCTION_ARGS) FuncCallContext *funcctx; ListCell **next; - /* Only allow relation types that can appear in partition trees. */ + if (!SearchSysCacheExists1(RELOID, ObjectIdGetDatum(rootrelid))) + PG_RETURN_NULL(); + + /* Return NULL for relation types that cannot appear in partition trees */ if (relkind != RELKIND_RELATION && relkind != RELKIND_FOREIGN_TABLE && relkind != RELKIND_INDEX && relkind != RELKIND_PARTITIONED_TABLE && relkind != RELKIND_PARTITIONED_INDEX) - ereport(ERROR, - (errcode(ERRCODE_WRONG_OBJECT_TYPE), - errmsg("\"%s\" is not a table, a foreign table, or an index", - get_rel_name(rootrelid)))); + PG_RETURN_NULL(); /* stuff done only on the first call of the function */ if (SRF_IS_FIRSTCALL()) diff --git a/src/test/regress/expected/partition_info.out b/src/test/regress/expected/partition_info.out index 6b116125..202d8208 100644 --- a/src/test/regress/expected/partition_info.out +++ b/src/test/regress/expected/partition_info.out @@ -6,6 +6,12 @@ SELECT * FROM pg_partition_tree(NULL); -------+-------------+--------+------- (0 rows) +SELECT * FROM pg_partition_tree(0); + relid | parentrelid | isleaf | level +-------+-------------+--------+------- + | | | +(1 row) + -- Test table partition trees CREATE TABLE ptif_test (a int, b int) PARTITION BY range (a); CREATE TABLE ptif_test0 PARTITION OF ptif_test @@ -107,8 +113,16 @@ DROP TABLE ptif_normal_table; CREATE VIEW ptif_test_view AS SELECT 1; CREATE MATERIALIZED VIEW ptif_test_matview AS SELECT 1; SELECT * FROM pg_partition_tree('ptif_test_view'); -ERROR: "ptif_test_view" is not a table, a foreign table, or an index + relid | parentrelid | isleaf | level +-------+-------------+--------+------- + | | | +(1 row) + SELECT * FROM pg_partition_tree('ptif_test_matview'); -ERROR: "ptif_test_matview" is not a table, a foreign table, or an index + relid | parentrelid | isleaf | level +-------+-------------+--------+------- + | | | +(1 row) + DROP VIEW ptif_test_view; DROP MATERIALIZED VIEW ptif_test_matview; diff --git a/src/test/regress/sql/partition_info.sql b/src/test/regress/sql/partition_info.sql index 5a76f22b..9b55a7fe 100644 --- a/src/test/regress/sql/partition_info.sql +++ b/src/test/regress/sql/partition_info.sql @@ -2,6 +2,7 @@ -- Tests for pg_partition_tree -- SELECT * FROM pg_partition_tree(NULL); +SELECT * FROM pg_partition_tree(0); -- Test table partition trees CREATE TABLE ptif_test (a int, b int) PARTITION BY range (a); From 4e7e2780297c3c933d2350fbf794d6836a2c4ec7 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Tue, 30 Jun 2020 20:46:08 +0800 Subject: [PATCH 273/578] Fix tablespace handling for partitioned tables. 
http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- doc/src/sgml/ref/create_table.sgml | 8 +- src/backend/catalog/heap.c | 6 +- src/backend/commands/tablecmds.c | 152 ++++++++++++++-------- src/include/catalog/pg_class.h | 13 ++ src/test/regress/input/tablespace.source | 12 ++ src/test/regress/output/tablespace.source | 19 +++ 6 files changed, 148 insertions(+), 62 deletions(-) diff --git a/doc/src/sgml/ref/create_table.sgml b/doc/src/sgml/ref/create_table.sgml index 70d3dcfc..47f82c50 100644 --- a/doc/src/sgml/ref/create_table.sgml +++ b/doc/src/sgml/ref/create_table.sgml @@ -1136,8 +1136,12 @@ of the tablespace in which the new table is to be created. If not specified, is consulted, or - if the table is temporary. - + if the table is temporary. For + partitioned tables, since no storage is required for the table itself, + the tablespace specified here only serves to mark the default tablespace + for any newly created partitions when no other tablespace is explicitly + specified. + diff --git a/src/backend/catalog/heap.c b/src/backend/catalog/heap.c index ff83af36..56e4d7f1 100644 --- a/src/backend/catalog/heap.c +++ b/src/backend/catalog/heap.c @@ -410,7 +410,6 @@ heap_create(const char *relname, case RELKIND_VIEW: case RELKIND_COMPOSITE_TYPE: case RELKIND_FOREIGN_TABLE: - case RELKIND_PARTITIONED_TABLE: create_storage = false; /* @@ -420,10 +419,11 @@ heap_create(const char *relname, reltablespace = InvalidOid; break; + case RELKIND_PARTITIONED_TABLE: case RELKIND_PARTITIONED_INDEX: /* - * Preserve tablespace so that it's used as tablespace for indexes - * on future partitions. + * For partitioned tables and indexes, preserve tablespace so that + * it's used as the tablespace for future partitions. */ create_storage = false; break; diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index 0bc4e296..b24611be 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -340,7 +340,7 @@ struct DropRelationCallbackState static void truncate_check_rel(Relation rel); static List *MergeAttributes(List *schema, List *supers, char relpersistence, - bool is_partition, List **supOids, List **supconstr, + bool is_partition, List **supconstr, int *supOidCount); static bool MergeCheckConstraint(List *constraints, char *name, Node *expr); static void MergeAttributesIntoExisting(Relation child_rel, Relation parent_rel); @@ -495,7 +495,7 @@ static bool ATPrepChangePersistence(Relation rel, bool toLogged); static void ATPrepSetTableSpace(AlteredTableInfo *tab, Relation rel, char *tablespacename, LOCKMODE lockmode); static void ATExecSetTableSpace(Oid tableOid, Oid newTableSpace, LOCKMODE lockmode); -static void ATExecPartedIdxSetTableSpace(Relation rel, Oid newTableSpace); +static void ATExecSetTableSpaceNoStorage(Relation rel, Oid newTableSpace); static void ATExecSetRelOptions(Relation rel, List *defList, AlterTableType operation, LOCKMODE lockmode); @@ -593,6 +593,7 @@ DefineRelation(CreateStmt *stmt, char relkind, Oid ownerId, static char *validnsps[] = HEAP_RELOPT_NAMESPACES; Oid ofTypeId; ObjectAddress address; + LOCKMODE parentLockmode; #ifdef _SHARDING_ bool has_extent = false; @@ -668,6 +669,46 @@ DefineRelation(CreateStmt *stmt, char relkind, Oid ownerId, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), errmsg("cannot create temporary table within security-restricted operation"))); + /* + * Determine the lockmode to use when scanning parents. A self-exclusive + * lock is needed here. 
+ * + * For regular inheritance, if two backends attempt to add children to the + * same parent simultaneously, and that parent has no pre-existing + * children, then both will attempt to update the parent's relhassubclass + * field, leading to a "tuple concurrently updated" error. Also, this + * interlocks against a concurrent ANALYZE on the parent table, which + * might otherwise be attempting to clear the parent's relhassubclass + * field, if its previous children were recently dropped. + * + * If the child table is a partition, then we instead grab an exclusive + * lock on the parent because its partition descriptor will be changed by + * addition of the new partition. + */ + parentLockmode = (stmt->partbound != NULL ? AccessExclusiveLock : + ShareUpdateExclusiveLock); + + /* Determine the list of OIDs of the parents. */ + inheritOids = NIL; + foreach(listptr, stmt->inhRelations) + { + RangeVar *rv = (RangeVar *) lfirst(listptr); + Oid parentOid; + + parentOid = RangeVarGetRelid(rv, parentLockmode, false); + + /* + * Reject duplications in the list of parents. + */ + if (list_member_oid(inheritOids, parentOid)) + ereport(ERROR, + (errcode(ERRCODE_DUPLICATE_TABLE), + errmsg("relation \"%s\" would be inherited from more than once", + get_rel_name(parentOid)))); + + inheritOids = lappend_oid(inheritOids, parentOid); + } + /* * Select tablespace to use. If not specified, use default tablespace * (which may in turn default to database's default). @@ -676,6 +717,25 @@ DefineRelation(CreateStmt *stmt, char relkind, Oid ownerId, { tablespaceId = get_tablespace_oid(stmt->tablespacename, false); } + else if (stmt->partbound) + { + HeapTuple tup; + + /* + * For partitions, when no other tablespace is specified, we default + * the tablespace to the parent partitioned table's. + */ + Assert(list_length(inheritOids) == 1); + tup = SearchSysCache1(RELOID, + DatumGetObjectId(linitial_oid(inheritOids))); + + tablespaceId = ((Form_pg_class) GETSTRUCT(tup))->reltablespace; + + if (!OidIsValid(tablespaceId)) + tablespaceId = GetDefaultTablespace(stmt->relation->relpersistence); + + ReleaseSysCache(tup); + } else { tablespaceId = GetDefaultTablespace(stmt->relation->relpersistence); @@ -734,10 +794,10 @@ DefineRelation(CreateStmt *stmt, char relkind, Oid ownerId, * modified by MergeAttributes.) */ stmt->tableElts = - MergeAttributes(stmt->tableElts, stmt->inhRelations, + MergeAttributes(stmt->tableElts, inheritOids, stmt->relation->relpersistence, stmt->partbound != NULL, - &inheritOids, &old_constraints, &parentOidCount); + &old_constraints, &parentOidCount); /* * Create a tuple descriptor from the relation schema. Note that this @@ -2206,12 +2266,11 @@ storage_name(char c) * Input arguments: * 'schema' is the column/attribute definition for the table. (It's a list * of ColumnDef's.) It is destructively changed. - * 'supers' is a list of names (as RangeVar nodes) of parent relations. + * 'supers' is a list of OIDs of parent relations, already locked by caller. * 'relpersistence' is a persistence type of the table. * 'is_partition' tells if the table is a partition * * Output arguments: - * 'supOids' receives a list of the OIDs of the parent relations. * 'supconstr' receives a list of constraints belonging to the parents, * updated as necessary to be valid for the child. * 'supOidCount' is set to the number of parents that have OID columns. 
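The user-visible effect of the DefineRelation and create_table.sgml changes
above is that a partition created without an explicit TABLESPACE clause now
defaults to its partitioned parent's tablespace instead of default_tablespace.
A minimal sketch of that behaviour (the tablespace name and location are
placeholders; the regression test added below exercises the same thing with
regress_tblspace):

    CREATE TABLESPACE some_tblspc LOCATION '/path/to/dir';  -- placeholder location
    CREATE TABLE parted (a int) PARTITION BY LIST (a) TABLESPACE some_tblspc;
    -- no TABLESPACE clause here, so the new partition lands in some_tblspc
    CREATE TABLE parted_1 PARTITION OF parted FOR VALUES IN (1);
    SELECT c.relname, t.spcname
      FROM pg_catalog.pg_class c
      LEFT JOIN pg_catalog.pg_tablespace t ON t.oid = c.reltablespace
     WHERE c.relname IN ('parted', 'parted_1');
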
@@ -2260,12 +2319,11 @@ storage_name(char c) */ static List * MergeAttributes(List *schema, List *supers, char relpersistence, - bool is_partition, List **supOids, List **supconstr, + bool is_partition, List **supconstr, int *supOidCount) {// #lizard forgives ListCell *entry; List *inhSchema = NIL; - List *parentOids = NIL; List *constraints = NIL; int parentsWithOids = 0; bool have_bogus_defaults = false; @@ -2372,31 +2430,15 @@ MergeAttributes(List *schema, List *supers, char relpersistence, child_attno = 0; foreach(entry, supers) { - RangeVar *parent = (RangeVar *) lfirst(entry); + Oid parent = lfirst_oid(entry); Relation relation; TupleDesc tupleDesc; TupleConstr *constr; AttrNumber *newattno; AttrNumber parent_attno; - /* - * A self-exclusive lock is needed here. If two backends attempt to - * add children to the same parent simultaneously, and that parent has - * no pre-existing children, then both will attempt to update the - * parent's relhassubclass field, leading to a "tuple concurrently - * updated" error. Also, this interlocks against a concurrent ANALYZE - * on the parent table, which might otherwise be attempting to clear - * the parent's relhassubclass field, if its previous children were - * recently dropped. - * - * If the child table is a partition, then we instead grab an - * exclusive lock on the parent because its partition descriptor will - * be changed by addition of the new partition. - */ - if (!is_partition) - relation = heap_openrv(parent, ShareUpdateExclusiveLock); - else - relation = heap_openrv(parent, AccessExclusiveLock); + /* caller already got lock */ + relation = heap_open(parent, NoLock); /* * We do not allow partitioned tables and partitions to participate in @@ -2407,12 +2449,12 @@ MergeAttributes(List *schema, List *supers, char relpersistence, ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), errmsg("cannot inherit from partitioned table \"%s\"", - parent->relname))); + RelationGetRelationName(relation)))); if (relation->rd_rel->relispartition && !is_partition) ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), errmsg("cannot inherit from partition \"%s\"", - parent->relname))); + RelationGetRelationName(relation)))); if (relation->rd_rel->relkind != RELKIND_RELATION && relation->rd_rel->relkind != RELKIND_FOREIGN_TABLE && @@ -2420,7 +2462,7 @@ MergeAttributes(List *schema, List *supers, char relpersistence, ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), errmsg("inherited relation \"%s\" is not a table or foreign table", - parent->relname))); + RelationGetRelationName(relation)))); /* Permanent rels cannot inherit from temporary ones */ if (relpersistence != RELPERSISTENCE_TEMP && relation->rd_rel->relpersistence == RELPERSISTENCE_TEMP) @@ -2429,7 +2471,7 @@ MergeAttributes(List *schema, List *supers, char relpersistence, errmsg(!is_partition ? "cannot inherit from temporary relation \"%s\"" : "cannot create a permanent relation as partition of temporary relation \"%s\"", - parent->relname))); + RelationGetRelationName(relation)))); /* If existing rel is temp, it must belong to this session */ if (relation->rd_rel->relpersistence == RELPERSISTENCE_TEMP && @@ -2448,17 +2490,6 @@ MergeAttributes(List *schema, List *supers, char relpersistence, aclcheck_error(ACLCHECK_NOT_OWNER, ACL_KIND_CLASS, RelationGetRelationName(relation)); - /* - * Reject duplications in the list of parents. 
- */ - if (list_member_oid(parentOids, RelationGetRelid(relation))) - ereport(ERROR, - (errcode(ERRCODE_DUPLICATE_TABLE), - errmsg("relation \"%s\" would be inherited from more than once", - parent->relname))); - - parentOids = lappend_oid(parentOids, RelationGetRelid(relation)); - if (relation->rd_rel->relhasoids) parentsWithOids++; @@ -2926,7 +2957,6 @@ MergeAttributes(List *schema, List *supers, char relpersistence, } } - *supOids = parentOids; *supconstr = constraints; *supOidCount = parentsWithOids; return schema; @@ -5281,11 +5311,13 @@ ATExecCmd(List **wqueue, AlteredTableInfo *tab, Relation rel, break; case AT_SetTableSpace: /* SET TABLESPACE */ /* - * Only do this for partitioned indexes, for which this is just - * a catalog change. Other relation types are handled by Phase 3. + * Only do this for partitioned tables and indexes, for which this + * is just a catalog change. Other relation types which have + * storage are handled by Phase 3. */ - if (rel->rd_rel->relkind == RELKIND_PARTITIONED_INDEX) - ATExecPartedIdxSetTableSpace(rel, tab->newTableSpace); + if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE || + rel->rd_rel->relkind == RELKIND_PARTITIONED_INDEX) + ATExecSetTableSpaceNoStorage(rel, tab->newTableSpace); break; case AT_SetRelOptions: /* SET (...) */ @@ -12596,19 +12628,26 @@ ATExecSetTableSpace(Oid tableOid, Oid newTableSpace, LOCKMODE lockmode) } /* - * Special handling of ALTER TABLE SET TABLESPACE for partitioned indexes, - * which have no storage (so not handled in Phase 3 like other relation types) + * Special handling of ALTER TABLE SET TABLESPACE for relations with no + * storage that have an interest in preserving tablespace. + * + * Since these have no storage the tablespace can be updated with a simple + * metadata only operation to update the tablespace. */ static void -ATExecPartedIdxSetTableSpace(Relation rel, Oid newTableSpace) +ATExecSetTableSpaceNoStorage(Relation rel, Oid newTableSpace) { HeapTuple tuple; Oid oldTableSpace; Relation pg_class; Form_pg_class rd_rel; - Oid indexOid = RelationGetRelid(rel); + Oid reloid = RelationGetRelid(rel); - Assert(rel->rd_rel->relkind == RELKIND_PARTITIONED_INDEX); + /* + * Shouldn't be called on relations having storage; these are processed + * in phase 3. + */ + Assert(!RELKIND_CAN_HAVE_STORAGE(rel->rd_rel->relkind)); /* Can't allow a non-shared relation in pg_global */ if (newTableSpace == GLOBALTABLESPACE_OID) @@ -12623,24 +12662,23 @@ ATExecPartedIdxSetTableSpace(Relation rel, Oid newTableSpace) if (newTableSpace == oldTableSpace || (newTableSpace == MyDatabaseTableSpace && oldTableSpace == 0)) { - InvokeObjectPostAlterHook(RelationRelationId, - indexOid, 0); + InvokeObjectPostAlterHook(RelationRelationId, reloid, 0); return; } /* Get a modifiable copy of the relation's pg_class row */ pg_class = heap_open(RelationRelationId, RowExclusiveLock); - tuple = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(indexOid)); + tuple = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(reloid)); if (!HeapTupleIsValid(tuple)) - elog(ERROR, "cache lookup failed for relation %u", indexOid); + elog(ERROR, "cache lookup failed for relation %u", reloid); rd_rel = (Form_pg_class) GETSTRUCT(tuple); /* update the pg_class row */ rd_rel->reltablespace = (newTableSpace == MyDatabaseTableSpace) ? 
InvalidOid : newTableSpace; CatalogTupleUpdate(pg_class, &tuple->t_self, tuple); - InvokeObjectPostAlterHook(RelationRelationId, indexOid, 0); + InvokeObjectPostAlterHook(RelationRelationId, reloid, 0); heap_freetuple(tuple); diff --git a/src/include/catalog/pg_class.h b/src/include/catalog/pg_class.h index f69dd5e3..15929163 100644 --- a/src/include/catalog/pg_class.h +++ b/src/include/catalog/pg_class.h @@ -205,6 +205,19 @@ DESCR(""); */ #define REPLICA_IDENTITY_INDEX 'i' +/* + * Relation kinds that have physical storage. These relations normally have + * relfilenode set to non-zero, but it can also be zero if the relation is + * mapped. + */ +#define RELKIND_CAN_HAVE_STORAGE(relkind) \ + ((relkind) == RELKIND_RELATION || \ + (relkind) == RELKIND_INDEX || \ + (relkind) == RELKIND_SEQUENCE || \ + (relkind) == RELKIND_TOASTVALUE || \ + (relkind) == RELKIND_MATVIEW) + + #ifdef _MLS_ /* enum for relkindext column */ #define RELKIND_AUDIT_SYS_TABLE 'a' diff --git a/src/test/regress/input/tablespace.source b/src/test/regress/input/tablespace.source index e4e4cf0e..abad2716 100644 --- a/src/test/regress/input/tablespace.source +++ b/src/test/regress/input/tablespace.source @@ -44,6 +44,18 @@ CREATE INDEX foo_idx on testschema.foo(i) TABLESPACE regress_tblspace; SELECT relname, spcname FROM pg_catalog.pg_tablespace t, pg_catalog.pg_class c where c.reltablespace = t.oid AND c.relname = 'foo_idx'; +-- partitioned table +CREATE TABLE testschema.part (a int) PARTITION BY LIST (a); +CREATE TABLE testschema.part12 PARTITION OF testschema.part FOR VALUES IN(1,2) PARTITION BY LIST (a) TABLESPACE regress_tblspace; +CREATE TABLE testschema.part12_1 PARTITION OF testschema.part12 FOR VALUES IN (1); +ALTER TABLE testschema.part12 SET TABLESPACE pg_default; +CREATE TABLE testschema.part12_2 PARTITION OF testschema.part12 FOR VALUES IN (2); +-- Ensure part12_1 defaulted to regress_tblspace and part12_2 defaulted to pg_default. +SELECT relname, spcname FROM pg_catalog.pg_class c + LEFT JOIN pg_catalog.pg_tablespace t ON c.reltablespace = t.oid + where c.relname LIKE 'part%' order by relname; +DROP TABLE testschema.part; + -- partitioned index CREATE TABLE testschema.part (a int) PARTITION BY LIST (a); CREATE TABLE testschema.part1 PARTITION OF testschema.part FOR VALUES IN (1); diff --git a/src/test/regress/output/tablespace.source b/src/test/regress/output/tablespace.source index 8fa26db8..03383fd4 100644 --- a/src/test/regress/output/tablespace.source +++ b/src/test/regress/output/tablespace.source @@ -61,6 +61,25 @@ SELECT relname, spcname FROM pg_catalog.pg_tablespace t, pg_catalog.pg_class c foo_idx | regress_tblspace (1 row) +-- partitioned table +CREATE TABLE testschema.part (a int) PARTITION BY LIST (a); +CREATE TABLE testschema.part12 PARTITION OF testschema.part FOR VALUES IN(1,2) PARTITION BY LIST (a) TABLESPACE regress_tblspace; +CREATE TABLE testschema.part12_1 PARTITION OF testschema.part12 FOR VALUES IN (1); +ALTER TABLE testschema.part12 SET TABLESPACE pg_default; +CREATE TABLE testschema.part12_2 PARTITION OF testschema.part12 FOR VALUES IN (2); +-- Ensure part12_1 defaulted to regress_tblspace and part12_2 defaulted to pg_default. 
+SELECT relname, spcname FROM pg_catalog.pg_class c + LEFT JOIN pg_catalog.pg_tablespace t ON c.reltablespace = t.oid + where c.relname LIKE 'part%' order by relname; + relname | spcname +----------+------------------ + part | + part12 | + part12_1 | regress_tblspace + part12_2 | +(4 rows) + +DROP TABLE testschema.part; -- partitioned index CREATE TABLE testschema.part (a int) PARTITION BY LIST (a); CREATE TABLE testschema.part1 PARTITION OF testschema.part FOR VALUES IN (1); From 0548b6623b865cac379d7f8ad30f328062dc1412 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Tue, 30 Jun 2020 20:50:55 +0800 Subject: [PATCH 274/578] Include partitioned indexes to system view pg_indexes.http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/backend/catalog/system_views.sql | 2 +- src/test/regress/expected/rules.out | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql index 0e7d14ff..c5326deb 100644 --- a/src/backend/catalog/system_views.sql +++ b/src/backend/catalog/system_views.sql @@ -162,7 +162,7 @@ CREATE VIEW pg_indexes AS JOIN pg_class I ON (I.oid = X.indexrelid) LEFT JOIN pg_namespace N ON (N.oid = C.relnamespace) LEFT JOIN pg_tablespace T ON (T.oid = I.reltablespace) - WHERE C.relkind IN ('r', 'm') AND I.relkind = 'i'; + WHERE C.relkind IN ('r', 'm', 'p') AND I.relkind IN ('i', 'I'); CREATE OR REPLACE VIEW pg_sequences AS SELECT diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out index f750332d..ba5666ef 100644 --- a/src/test/regress/expected/rules.out +++ b/src/test/regress/expected/rules.out @@ -1414,7 +1414,7 @@ pg_indexes| SELECT n.nspname AS schemaname, JOIN pg_class i ON ((i.oid = x.indexrelid))) LEFT JOIN pg_namespace n ON ((n.oid = c.relnamespace))) LEFT JOIN pg_tablespace t ON ((t.oid = i.reltablespace))) - WHERE ((c.relkind = ANY (ARRAY['r'::"char", 'm'::"char"])) AND (i.relkind = 'i'::"char")); + WHERE ((c.relkind = ANY (ARRAY['r'::"char", 'm'::"char", 'p'::"char"])) AND (i.relkind = ANY (ARRAY['i'::"char", 'I'::"char"]))); pg_locks| SELECT l.locktype, l.database, l.relation, From af0fcaefa0f8b0bc253cf9e36e9e605e3c44577b Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Tue, 30 Jun 2020 21:21:09 +0800 Subject: [PATCH 275/578] Delay lock acquisition for partitions until we route a tuple to them.http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/backend/executor/execPartition.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/src/backend/executor/execPartition.c b/src/backend/executor/execPartition.c index 60221c6b..dd60cbc8 100644 --- a/src/backend/executor/execPartition.c +++ b/src/backend/executor/execPartition.c @@ -73,9 +73,6 @@ static char *ExecBuildSlotPartitionKeyDescription(Relation rel, * tuple routing for partitioned tables, encapsulates it in * PartitionTupleRouting, and returns it. * - * Note that all the relations in the partition tree are locked using the - * RowExclusiveLock mode upon return from this function. - * * While we allocate the arrays of pointers of ResultRelInfo and * TupleConversionMap for all partitions here, actual objects themselves are * lazily allocated for a given partition if a tuple is actually routed to it; @@ -100,7 +97,6 @@ ExecSetupPartitionTupleRouting(ModifyTableState *mtstate, Relation rel) * Get the information about the partition tree after locking all the * partitions. 
*/ - (void) find_all_inheritors(RelationGetRelid(rel), RowExclusiveLock, NULL); proute = (PartitionTupleRouting *) palloc0(sizeof(PartitionTupleRouting)); proute->partition_dispatch_info = RelationGetPartitionDispatchInfo(rel, &proute->num_dispatch, @@ -329,8 +325,9 @@ ExecFindPartition(ResultRelInfo *resultRelInfo, PartitionDispatch *pd, /* * ExecInitPartitionInfo - * Initialize ResultRelInfo and other information for a partition if not - * already done + * Lock the partition and initialize ResultRelInfo. Also setup other + * information for the partition and store it in the next empty slot in + * the proute->partitions array. * * Returns the ResultRelInfo */ @@ -346,11 +343,7 @@ ExecInitPartitionInfo(ModifyTableState *mtstate, ModifyTable *node = mtstate ? (ModifyTable *) mtstate->ps.plan : NULL; MemoryContext oldContext; - /* - * We locked all the partitions in ExecSetupPartitionTupleRouting - * including the leaf partitions. - */ - partrel = heap_open(proute->partition_oids[partidx], NoLock); + partrel = table_open(dispatch->partdesc->oids[partidx], RowExclusiveLock); /* * Keep ResultRelInfo and other information for this partition in the From a376a42f535439761e491db12c8dbfff406c91d4 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Wed, 1 Jul 2020 10:40:31 +0800 Subject: [PATCH 276/578] pg_partition_ancestors. http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- doc/src/sgml/func.sgml | 11 ++++ src/backend/utils/adt/partitionfuncs.c | 49 +++++++++++++++++ src/include/catalog/pg_proc.h | 2 + src/test/regress/expected/partition_info.out | 55 ++++++++++++++++++++ src/test/regress/sql/partition_info.sql | 11 ++++ 5 files changed, 128 insertions(+) diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml index cb4821d5..8cefea34 100644 --- a/doc/src/sgml/func.sgml +++ b/doc/src/sgml/func.sgml @@ -19847,6 +19847,17 @@ postgres=# SELECT * FROM pg_walfile_name_offset(pg_stop_backup()); their partitions, and so on. + + + pg_partition_ancestors + pg_partition_ancestors(regclass) + + setof regclass + + List the ancestor relations of the given partition, + including the partition itself. + + diff --git a/src/backend/utils/adt/partitionfuncs.c b/src/backend/utils/adt/partitionfuncs.c index 2c9fcd1f..1020c2c3 100644 --- a/src/backend/utils/adt/partitionfuncs.c +++ b/src/backend/utils/adt/partitionfuncs.c @@ -153,3 +153,52 @@ pg_partition_tree(PG_FUNCTION_ARGS) /* done when there are no more elements left */ SRF_RETURN_DONE(funcctx); } + +/* + * pg_partition_ancestors + * + * Produces a view with one row per ancestor of the given partition, + * including the input relation itself. 
+ */ +Datum +pg_partition_ancestors(PG_FUNCTION_ARGS) +{ + Oid relid = PG_GETARG_OID(0); + FuncCallContext *funcctx; + ListCell **next; + + if (SRF_IS_FIRSTCALL()) + { + MemoryContext oldcxt; + List *ancestors; + + funcctx = SRF_FIRSTCALL_INIT(); + + if (!check_rel_can_be_partition(relid)) + SRF_RETURN_DONE(funcctx); + + oldcxt = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); + + ancestors = get_partition_ancestors(relid); + ancestors = lcons_oid(relid, ancestors); + + next = (ListCell **) palloc(sizeof(ListCell *)); + *next = list_head(ancestors); + funcctx->user_fctx = (void *) next; + + MemoryContextSwitchTo(oldcxt); + } + + funcctx = SRF_PERCALL_SETUP(); + next = (ListCell **) funcctx->user_fctx; + + if (*next != NULL) + { + Oid relid = lfirst_oid(*next); + + *next = lnext(*next); + SRF_RETURN_NEXT(funcctx, ObjectIdGetDatum(relid)); + } + + SRF_RETURN_DONE(funcctx); +} diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h index bd1481cf..9af8050d 100644 --- a/src/include/catalog/pg_proc.h +++ b/src/include/catalog/pg_proc.h @@ -5713,6 +5713,8 @@ DESCR("hash partition CHECK constraint"); /* information about a partition tree */ DATA(insert OID = 4688 ( pg_partition_tree PGNSP PGUID 12 1 1000 0 0 f f f f t t v s 1 0 2249 "2205" "{2205,2205,2205,16,23}" "{i,o,o,o,o}" "{rootrelid,relid,parentrelid,isleaf,level}" _null_ _null_ pg_partition_tree _null_ _null_ _null_ )); DESCR("view partition tree tables"); +DATA(insert OID = 4689 ( pg_partition_ancestors PGNSP PGUID 12 1 10 0 0 f f f f t t v s 1 0 2205 "2205" "{2205,2205}" "{i,o}" "{partitionid,relid}" _null_ _null_ pg_partition_ancestors _null_ _null_ _null_ )); +DESCR("view ancestors of the partition"); DATA(insert OID = 3410 ( pg_extent_info PGNSP PGUID 12 10 20 0 0 f f f f f t v s 1 0 2249 "2205" "{23,16,23,23,23,23,23,23,23}" "{o,o,o,o,o,o,o,o,o}" "{eid,is_occupied,shardid,freespace_cat,hwm,scan_next,scan_prev,alloc_next,alloc_prev}" _null_ _null_ pg_extent_info_oid _null_ _null_ _null_ )); DESCR("get extent info of a relation"); DATA(insert OID = 3411 ( pg_shard_scan_list PGNSP PGUID 12 10 20 0 0 f f f f f t v s 2 0 2249 "2205 23" "{23,16,23,23,23,23}" "{o,o,o,o,o,o}" "{eid,is_occupied,shardid,freespace_cat,hwm,scan_next}" _null_ _null_ pg_shard_scan_list_oid _null_ _null_ _null_ )); diff --git a/src/test/regress/expected/partition_info.out b/src/test/regress/expected/partition_info.out index 202d8208..5916eca2 100644 --- a/src/test/regress/expected/partition_info.out +++ b/src/test/regress/expected/partition_info.out @@ -12,6 +12,16 @@ SELECT * FROM pg_partition_tree(0); | | | (1 row) +SELECT * FROM pg_partition_ancestors(NULL); + relid +------- +(0 rows) + +SELECT * FROM pg_partition_ancestors(0); + relid +------- +(0 rows) + -- Test table partition trees CREATE TABLE ptif_test (a int, b int) PARTITION BY range (a); CREATE TABLE ptif_test0 PARTITION OF ptif_test @@ -66,6 +76,21 @@ SELECT relid, parentrelid, level, isleaf ptif_test01 | ptif_test0 | 0 | t (1 row) +-- List all ancestors of root and leaf tables +SELECT * FROM pg_partition_ancestors('ptif_test01'); + relid +------------- + ptif_test01 + ptif_test0 + ptif_test +(3 rows) + +SELECT * FROM pg_partition_ancestors('ptif_test'); + relid +----------- + ptif_test +(1 row) + -- List all indexes members of the tree SELECT relid, parentrelid, level, isleaf FROM pg_partition_tree('ptif_test_index'); @@ -98,6 +123,21 @@ SELECT relid, parentrelid, level, isleaf ptif_test01_index | ptif_test0_index | 0 | t (1 row) +-- List all ancestors of root and leaf 
indexes +SELECT * FROM pg_partition_ancestors('ptif_test01_index'); + relid +------------------- + ptif_test01_index + ptif_test0_index + ptif_test_index +(3 rows) + +SELECT * FROM pg_partition_ancestors('ptif_test_index'); + relid +----------------- + ptif_test_index +(1 row) + DROP TABLE ptif_test; -- A table not part of a partition tree works is the only member listed. CREATE TABLE ptif_normal_table(a int); @@ -108,6 +148,11 @@ SELECT relid, parentrelid, level, isleaf ptif_normal_table | | 0 | t (1 row) +SELECT * FROM pg_partition_ancestors('ptif_normal_table'); + relid +------- +(0 rows) + DROP TABLE ptif_normal_table; -- Views and materialized viewS cannot be part of a partition tree. CREATE VIEW ptif_test_view AS SELECT 1; @@ -124,5 +169,15 @@ SELECT * FROM pg_partition_tree('ptif_test_matview'); | | | (1 row) +SELECT * FROM pg_partition_ancestors('ptif_test_view'); + relid +------- +(0 rows) + +SELECT * FROM pg_partition_ancestors('ptif_test_matview'); + relid +------- +(0 rows) + DROP VIEW ptif_test_view; DROP MATERIALIZED VIEW ptif_test_matview; diff --git a/src/test/regress/sql/partition_info.sql b/src/test/regress/sql/partition_info.sql index 9b55a7fe..6e2ec675 100644 --- a/src/test/regress/sql/partition_info.sql +++ b/src/test/regress/sql/partition_info.sql @@ -3,6 +3,8 @@ -- SELECT * FROM pg_partition_tree(NULL); SELECT * FROM pg_partition_tree(0); +SELECT * FROM pg_partition_ancestors(NULL); +SELECT * FROM pg_partition_ancestors(0); -- Test table partition trees CREATE TABLE ptif_test (a int, b int) PARTITION BY range (a); @@ -39,6 +41,9 @@ SELECT relid, parentrelid, level, isleaf SELECT relid, parentrelid, level, isleaf FROM pg_partition_tree('ptif_test01') p JOIN pg_class c ON (p.relid = c.oid); +-- List all ancestors of root and leaf tables +SELECT * FROM pg_partition_ancestors('ptif_test01'); +SELECT * FROM pg_partition_ancestors('ptif_test'); -- List all indexes members of the tree SELECT relid, parentrelid, level, isleaf @@ -51,6 +56,9 @@ SELECT relid, parentrelid, level, isleaf SELECT relid, parentrelid, level, isleaf FROM pg_partition_tree('ptif_test01_index') p JOIN pg_class c ON (p.relid = c.oid); +-- List all ancestors of root and leaf indexes +SELECT * FROM pg_partition_ancestors('ptif_test01_index'); +SELECT * FROM pg_partition_ancestors('ptif_test_index'); DROP TABLE ptif_test; @@ -58,6 +66,7 @@ DROP TABLE ptif_test; CREATE TABLE ptif_normal_table(a int); SELECT relid, parentrelid, level, isleaf FROM pg_partition_tree('ptif_normal_table'); +SELECT * FROM pg_partition_ancestors('ptif_normal_table'); DROP TABLE ptif_normal_table; -- Views and materialized viewS cannot be part of a partition tree. 
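-- A note on output order, useful when composing the function into larger
-- queries: the partition itself is emitted first, then each ancestor up to the
-- top-most parent (the ordering the expected output above relies on).  That
-- lets the root of a tree be picked out directly, e.g. with a sketch such as
--   SELECT relid
--     FROM pg_partition_ancestors('some_partition') WITH ORDINALITY AS a(relid, ord)
--    ORDER BY ord DESC LIMIT 1;
-- where 'some_partition' is a placeholder; a later patch adds pg_partition_root,
-- which returns the same relation directly.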
@@ -65,5 +74,7 @@ CREATE VIEW ptif_test_view AS SELECT 1; CREATE MATERIALIZED VIEW ptif_test_matview AS SELECT 1; SELECT * FROM pg_partition_tree('ptif_test_view'); SELECT * FROM pg_partition_tree('ptif_test_matview'); +SELECT * FROM pg_partition_ancestors('ptif_test_view'); +SELECT * FROM pg_partition_ancestors('ptif_test_matview'); DROP VIEW ptif_test_view; DROP MATERIALIZED VIEW ptif_test_matview; From 2e3345d3059a8936cb688e1f72b0aa9c52959343 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Wed, 1 Jul 2020 11:00:50 +0800 Subject: [PATCH 277/578] pg_upgrade: Ignore TOAST for partitioned tables --- src/bin/pg_dump/pg_dump.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c index 10a8ce5f..97384c01 100644 --- a/src/bin/pg_dump/pg_dump.c +++ b/src/bin/pg_dump/pg_dump.c @@ -4054,14 +4054,20 @@ binary_upgrade_set_type_oids_by_rel_oid(Archive *fout, Oid pg_type_oid; bool toast_set = false; - /* we only support old >= 8.3 for binary upgrades */ + /* + * We only support old >= 8.3 for binary upgrades. + * + * We purposefully ignore toast OIDs for partitioned tables; the reason is + * that versions 10 and 11 have them, but 12 does not, so emitting them + * causes the upgrade to fail. + */ appendPQExpBuffer(upgrade_query, "SELECT c.reltype AS crel, t.reltype AS trel " "FROM pg_catalog.pg_class c " "LEFT JOIN pg_catalog.pg_class t ON " - " (c.reltoastrelid = t.oid) " + " (c.reltoastrelid = t.oid AND c.relkind <> '%c') " "WHERE c.oid = '%u'::pg_catalog.oid;", - pg_rel_oid); + RELKIND_PARTITIONED_TABLE, pg_rel_oid); upgrade_res = ExecuteSqlQueryForSingleRow(fout, upgrade_query->data); @@ -5789,6 +5795,10 @@ getTables(Archive *fout, int *numTables) * information about each table, basically just enough to decide if it is * interesting. We must fetch all tables in this phase because otherwise * we cannot correctly identify inherited columns, owned sequences, etc. + * + * We purposefully ignore toast OIDs for partitioned tables; the reason is + * that versions 10 and 11 have them, but 12 does not, so emitting them + * causes the upgrade to fail. */ if (fout->remoteVersion >= 90600) @@ -5902,7 +5912,7 @@ getTables(Archive *fout, int *numTables) "d.classid = c.tableoid AND d.objid = c.oid AND " "d.objsubid = 0 AND " "d.refclassid = c.tableoid AND d.deptype IN ('a', 'i')) " - "LEFT JOIN pg_class tc ON (c.reltoastrelid = tc.oid) " + "LEFT JOIN pg_class tc ON (c.reltoastrelid = tc.oid AND c.relkind <> '%c') " "LEFT JOIN pg_init_privs pip ON " "(c.oid = pip.objoid " "AND pip.classoid = 'pg_class'::regclass " @@ -5929,6 +5939,7 @@ getTables(Archive *fout, int *numTables) ispartition, partbound, RELKIND_SEQUENCE, + RELKIND_PARTITIONED_TABLE, RELKIND_RELATION, RELKIND_SEQUENCE, RELKIND_VIEW, RELKIND_COMPOSITE_TYPE, RELKIND_MATVIEW, RELKIND_FOREIGN_TABLE, From 1740d570fac7cf324d7ff7f0253afd227c90c03f Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Wed, 1 Jul 2020 11:43:19 +0800 Subject: [PATCH 278/578] Add pg_partition_root to display top-most parent of a partition tree. 
http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- doc/src/sgml/func.sgml | 11 +++ src/backend/utils/adt/partitionfuncs.c | 75 +++++++++++++++++--- src/include/catalog/pg_proc.h | 7 +- src/test/regress/expected/partition_info.out | 58 +++++++++++++++ src/test/regress/sql/partition_info.sql | 13 ++++ 5 files changed, 153 insertions(+), 11 deletions(-) diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml index 8cefea34..08a9ec2a 100644 --- a/doc/src/sgml/func.sgml +++ b/doc/src/sgml/func.sgml @@ -19846,6 +19846,17 @@ postgres=# SELECT * FROM pg_walfile_name_offset(pg_stop_backup()); 1 for its partitions, 2 for their partitions, and so on. + + + pg_partition_root + pg_partition_root(regclass) + + regclass + + Return the top-most parent of a partition tree to which the given + relation belongs. + + diff --git a/src/backend/utils/adt/partitionfuncs.c b/src/backend/utils/adt/partitionfuncs.c index 1020c2c3..13ddec59 100644 --- a/src/backend/utils/adt/partitionfuncs.c +++ b/src/backend/utils/adt/partitionfuncs.c @@ -25,6 +25,33 @@ #include "utils/lsyscache.h" #include "utils/syscache.h" +/* +* Checks if a given relation can be part of a partition tree. Returns +* false if the relation cannot be processed, in which case it is up to +* the caller to decide what to do, by either raising an error or doing +* something else. +*/ +static bool +check_rel_can_be_partition(Oid relid) +{ +char relkind; + +/* Check if relation exists */ +if (!SearchSysCacheExists1(RELOID, ObjectIdGetDatum(relid))) + return false; + +relkind = get_rel_relkind(relid); + +/* Only allow relation types that can appear in partition trees. */ +if (relkind != RELKIND_RELATION && + relkind != RELKIND_FOREIGN_TABLE && + relkind != RELKIND_INDEX && + relkind != RELKIND_PARTITIONED_TABLE && + relkind != RELKIND_PARTITIONED_INDEX) + return false; + +return true; +} /* * pg_partition_tree @@ -39,19 +66,10 @@ pg_partition_tree(PG_FUNCTION_ARGS) { #define PG_PARTITION_TREE_COLS 4 Oid rootrelid = PG_GETARG_OID(0); - char relkind = get_rel_relkind(rootrelid); FuncCallContext *funcctx; ListCell **next; - if (!SearchSysCacheExists1(RELOID, ObjectIdGetDatum(rootrelid))) - PG_RETURN_NULL(); - - /* Return NULL for relation types that cannot appear in partition trees */ - if (relkind != RELKIND_RELATION && - relkind != RELKIND_FOREIGN_TABLE && - relkind != RELKIND_INDEX && - relkind != RELKIND_PARTITIONED_TABLE && - relkind != RELKIND_PARTITIONED_INDEX) + if (!check_rel_can_be_partition(rootrelid)) PG_RETURN_NULL(); /* stuff done only on the first call of the function */ @@ -154,6 +172,43 @@ pg_partition_tree(PG_FUNCTION_ARGS) SRF_RETURN_DONE(funcctx); } +/* + * pg_partition_root + * + * Returns the top-most parent of the partition tree to which a given + * relation belongs, or NULL if it's not (or cannot be) part of any + * partition tree. + */ +Datum +pg_partition_root(PG_FUNCTION_ARGS) +{ + Oid relid = PG_GETARG_OID(0); + Oid rootrelid; + List *ancestors; + + if (!check_rel_can_be_partition(relid)) + PG_RETURN_NULL(); + + /* + * If the relation is not a partition (it may be the partition parent), + * return itself as a result. + */ + if (!get_rel_relispartition(relid)) + PG_RETURN_OID(relid); + + /* Fetch the top-most parent */ + ancestors = get_partition_ancestors(relid); + rootrelid = llast_oid(ancestors); + list_free(ancestors); + + /* + * "rootrelid" must contain a valid OID, given that the input relation is + * a valid partition tree member as checked above. 
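+ *
+ * For illustration (not taken from this patch): with a leaf partition p11
+ * attached under p1, itself under root p, get_partition_ancestors(p11)
+ * yields the list (p1, p), so llast_oid() returns p.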
+ */ + Assert(OidIsValid(rootrelid)); + PG_RETURN_OID(rootrelid); +} + /* * pg_partition_ancestors * diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h index 9af8050d..76881d68 100644 --- a/src/include/catalog/pg_proc.h +++ b/src/include/catalog/pg_proc.h @@ -5713,7 +5713,12 @@ DESCR("hash partition CHECK constraint"); /* information about a partition tree */ DATA(insert OID = 4688 ( pg_partition_tree PGNSP PGUID 12 1 1000 0 0 f f f f t t v s 1 0 2249 "2205" "{2205,2205,2205,16,23}" "{i,o,o,o,o}" "{rootrelid,relid,parentrelid,isleaf,level}" _null_ _null_ pg_partition_tree _null_ _null_ _null_ )); DESCR("view partition tree tables"); -DATA(insert OID = 4689 ( pg_partition_ancestors PGNSP PGUID 12 1 10 0 0 f f f f t t v s 1 0 2205 "2205" "{2205,2205}" "{i,o}" "{partitionid,relid}" _null_ _null_ pg_partition_ancestors _null_ _null_ _null_ )); + +/* function to get the top-most partition root parent */ +DATA(insert OID = 4689 ( pg_partition_root PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 2205 "2205" _null_ _null_ _null_ _null_ _null_ pg_partition_root _null_ _null_ _null_ )); +DESCR("get top-most partition root parent"); + +DATA(insert OID = 4690 ( pg_partition_ancestors PGNSP PGUID 12 1 10 0 0 f f f f t t v s 1 0 2205 "2205" "{2205,2205}" "{i,o}" "{partitionid,relid}" _null_ _null_ pg_partition_ancestors _null_ _null_ _null_ )); DESCR("view ancestors of the partition"); DATA(insert OID = 3410 ( pg_extent_info PGNSP PGUID 12 10 20 0 0 f f f f f t v s 1 0 2249 "2205" "{23,16,23,23,23,23,23,23,23}" "{o,o,o,o,o,o,o,o,o}" "{eid,is_occupied,shardid,freespace_cat,hwm,scan_next,scan_prev,alloc_next,alloc_prev}" _null_ _null_ pg_extent_info_oid _null_ _null_ _null_ )); DESCR("get extent info of a relation"); diff --git a/src/test/regress/expected/partition_info.out b/src/test/regress/expected/partition_info.out index 5916eca2..00a0ed80 100644 --- a/src/test/regress/expected/partition_info.out +++ b/src/test/regress/expected/partition_info.out @@ -12,6 +12,18 @@ SELECT * FROM pg_partition_tree(0); | | | (1 row) +SELECT pg_partition_root(NULL); + pg_partition_root +------------------- + +(1 row) + +SELECT pg_partition_root(0); + pg_partition_root +------------------- + +(1 row) + SELECT * FROM pg_partition_ancestors(NULL); relid ------- @@ -76,6 +88,20 @@ SELECT relid, parentrelid, level, isleaf ptif_test01 | ptif_test0 | 0 | t (1 row) +-- List all members using pg_partition_root with leaf table reference +SELECT relid, parentrelid, level, isleaf + FROM pg_partition_tree(pg_partition_root('ptif_test01')) p + JOIN pg_class c ON (p.relid = c.oid); + relid | parentrelid | level | isleaf +-------------+-------------+-------+-------- + ptif_test | | 0 | f + ptif_test0 | ptif_test | 1 | f + ptif_test1 | ptif_test | 1 | f + ptif_test2 | ptif_test | 1 | t + ptif_test01 | ptif_test0 | 2 | t + ptif_test11 | ptif_test1 | 2 | t +(6 rows) + -- List all ancestors of root and leaf tables SELECT * FROM pg_partition_ancestors('ptif_test01'); relid @@ -123,6 +149,20 @@ SELECT relid, parentrelid, level, isleaf ptif_test01_index | ptif_test0_index | 0 | t (1 row) +-- List all members using pg_partition_root with leaf index reference +SELECT relid, parentrelid, level, isleaf + FROM pg_partition_tree(pg_partition_root('ptif_test01_index')) p + JOIN pg_class c ON (p.relid = c.oid); + relid | parentrelid | level | isleaf +-------------------+------------------+-------+-------- + ptif_test_index | | 0 | f + ptif_test0_index | ptif_test_index | 1 | f + ptif_test1_index | ptif_test_index | 1 | f + 
ptif_test2_index | ptif_test_index | 1 | t + ptif_test01_index | ptif_test0_index | 2 | t + ptif_test11_index | ptif_test1_index | 2 | t +(6 rows) + -- List all ancestors of root and leaf indexes SELECT * FROM pg_partition_ancestors('ptif_test01_index'); relid @@ -148,6 +188,12 @@ SELECT relid, parentrelid, level, isleaf ptif_normal_table | | 0 | t (1 row) +SELECT pg_partition_root('ptif_normal_table'); + pg_partition_root +------------------- + ptif_normal_table +(1 row) + SELECT * FROM pg_partition_ancestors('ptif_normal_table'); relid ------- @@ -169,6 +215,18 @@ SELECT * FROM pg_partition_tree('ptif_test_matview'); | | | (1 row) +SELECT pg_partition_root('ptif_test_view'); + pg_partition_root +------------------- + +(1 row) + +SELECT pg_partition_root('ptif_test_matview'); + pg_partition_root +------------------- + +(1 row) + SELECT * FROM pg_partition_ancestors('ptif_test_view'); relid ------- diff --git a/src/test/regress/sql/partition_info.sql b/src/test/regress/sql/partition_info.sql index 6e2ec675..f49688eb 100644 --- a/src/test/regress/sql/partition_info.sql +++ b/src/test/regress/sql/partition_info.sql @@ -3,6 +3,8 @@ -- SELECT * FROM pg_partition_tree(NULL); SELECT * FROM pg_partition_tree(0); +SELECT pg_partition_root(NULL); +SELECT pg_partition_root(0); SELECT * FROM pg_partition_ancestors(NULL); SELECT * FROM pg_partition_ancestors(0); @@ -41,6 +43,10 @@ SELECT relid, parentrelid, level, isleaf SELECT relid, parentrelid, level, isleaf FROM pg_partition_tree('ptif_test01') p JOIN pg_class c ON (p.relid = c.oid); +-- List all members using pg_partition_root with leaf table reference +SELECT relid, parentrelid, level, isleaf + FROM pg_partition_tree(pg_partition_root('ptif_test01')) p + JOIN pg_class c ON (p.relid = c.oid); -- List all ancestors of root and leaf tables SELECT * FROM pg_partition_ancestors('ptif_test01'); SELECT * FROM pg_partition_ancestors('ptif_test'); @@ -56,6 +62,10 @@ SELECT relid, parentrelid, level, isleaf SELECT relid, parentrelid, level, isleaf FROM pg_partition_tree('ptif_test01_index') p JOIN pg_class c ON (p.relid = c.oid); +-- List all members using pg_partition_root with leaf index reference +SELECT relid, parentrelid, level, isleaf + FROM pg_partition_tree(pg_partition_root('ptif_test01_index')) p + JOIN pg_class c ON (p.relid = c.oid); -- List all ancestors of root and leaf indexes SELECT * FROM pg_partition_ancestors('ptif_test01_index'); SELECT * FROM pg_partition_ancestors('ptif_test_index'); @@ -66,6 +76,7 @@ DROP TABLE ptif_test; CREATE TABLE ptif_normal_table(a int); SELECT relid, parentrelid, level, isleaf FROM pg_partition_tree('ptif_normal_table'); +SELECT pg_partition_root('ptif_normal_table'); SELECT * FROM pg_partition_ancestors('ptif_normal_table'); DROP TABLE ptif_normal_table; @@ -74,6 +85,8 @@ CREATE VIEW ptif_test_view AS SELECT 1; CREATE MATERIALIZED VIEW ptif_test_matview AS SELECT 1; SELECT * FROM pg_partition_tree('ptif_test_view'); SELECT * FROM pg_partition_tree('ptif_test_matview'); +SELECT pg_partition_root('ptif_test_view'); +SELECT pg_partition_root('ptif_test_matview'); SELECT * FROM pg_partition_ancestors('ptif_test_view'); SELECT * FROM pg_partition_ancestors('ptif_test_matview'); DROP VIEW ptif_test_view; From e98d386320bdbf17cffb0249d8e79bee099b6fad Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Wed, 1 Jul 2020 13:09:41 +0800 Subject: [PATCH 279/578] Fix crash with pg_partition_root.http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/backend/utils/adt/partitionfuncs.c | 43 
+++---- src/test/regress/expected/partition_info.out | 113 +++++++++++++++---- src/test/regress/sql/partition_info.sql | 29 ++++- 3 files changed, 142 insertions(+), 43 deletions(-) diff --git a/src/backend/utils/adt/partitionfuncs.c b/src/backend/utils/adt/partitionfuncs.c index 13ddec59..87f1cced 100644 --- a/src/backend/utils/adt/partitionfuncs.c +++ b/src/backend/utils/adt/partitionfuncs.c @@ -34,23 +34,23 @@ static bool check_rel_can_be_partition(Oid relid) { -char relkind; + char relkind; + bool relispartition; -/* Check if relation exists */ -if (!SearchSysCacheExists1(RELOID, ObjectIdGetDatum(relid))) - return false; + /* Check if relation exists */ + if (!SearchSysCacheExists1(RELOID, ObjectIdGetDatum(relid))) + return false; -relkind = get_rel_relkind(relid); + relkind = get_rel_relkind(relid); + relispartition = get_rel_relispartition(relid); -/* Only allow relation types that can appear in partition trees. */ -if (relkind != RELKIND_RELATION && - relkind != RELKIND_FOREIGN_TABLE && - relkind != RELKIND_INDEX && - relkind != RELKIND_PARTITIONED_TABLE && - relkind != RELKIND_PARTITIONED_INDEX) - return false; + /* Only allow relation types that can appear in partition trees. */ + if (!relispartition && + relkind != RELKIND_PARTITIONED_TABLE && + relkind != RELKIND_PARTITIONED_INDEX) + return false; -return true; + return true; } /* @@ -69,9 +69,6 @@ pg_partition_tree(PG_FUNCTION_ARGS) FuncCallContext *funcctx; ListCell **next; - if (!check_rel_can_be_partition(rootrelid)) - PG_RETURN_NULL(); - /* stuff done only on the first call of the function */ if (SRF_IS_FIRSTCALL()) { @@ -82,6 +79,9 @@ pg_partition_tree(PG_FUNCTION_ARGS) /* create a function context for cross-call persistence */ funcctx = SRF_FIRSTCALL_INIT(); + if (!check_rel_can_be_partition(rootrelid)) + SRF_RETURN_DONE(funcctx); + /* switch to memory context appropriate for multiple function calls */ oldcxt = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); @@ -189,15 +189,16 @@ pg_partition_root(PG_FUNCTION_ARGS) if (!check_rel_can_be_partition(relid)) PG_RETURN_NULL(); + /* fetch the list of ancestors */ + ancestors = get_partition_ancestors(relid); + /* - * If the relation is not a partition (it may be the partition parent), - * return itself as a result. + * If the input relation is already the top-most parent, just return + * itself. */ - if (!get_rel_relispartition(relid)) + if (ancestors == NIL) PG_RETURN_OID(relid); - /* Fetch the top-most parent */ - ancestors = get_partition_ancestors(relid); rootrelid = llast_oid(ancestors); list_free(ancestors); diff --git a/src/test/regress/expected/partition_info.out b/src/test/regress/expected/partition_info.out index 00a0ed80..c26d02a5 100644 --- a/src/test/regress/expected/partition_info.out +++ b/src/test/regress/expected/partition_info.out @@ -8,9 +8,8 @@ SELECT * FROM pg_partition_tree(NULL); SELECT * FROM pg_partition_tree(0); relid | parentrelid | isleaf | level --------+-------------+--------+------- - | | | -(1 row) +-------+-------------+--------+------- +(0 row) SELECT pg_partition_root(NULL); pg_partition_root @@ -43,7 +42,35 @@ CREATE TABLE ptif_test1 PARTITION OF ptif_test FOR VALUES FROM (0) TO (100) PARTITION BY list (b); CREATE TABLE ptif_test11 PARTITION OF ptif_test1 FOR VALUES IN (1); CREATE TABLE ptif_test2 PARTITION OF ptif_test - FOR VALUES FROM (100) TO (maxvalue); + FOR VALUES FROM (100) TO (200); +-- This partitioned table should remain with no partitions. 
+CREATE TABLE ptif_test3 PARTITION OF ptif_test + FOR VALUES FROM (200) TO (maxvalue) PARTITION BY list (b); +-- Test pg_partition_root for tables +SELECT pg_partition_root('ptif_test'); + pg_partition_root +------------------- + ptif_test +(1 row) + +SELECT pg_partition_root('ptif_test0'); + pg_partition_root +------------------- + ptif_test +(1 row) + +SELECT pg_partition_root('ptif_test01'); + pg_partition_root +------------------- + ptif_test +(1 row) + +SELECT pg_partition_root('ptif_test3'); + pg_partition_root +------------------- + ptif_test +(1 row) + -- Test index partition tree CREATE INDEX ptif_test_index ON ONLY ptif_test (a); CREATE INDEX ptif_test0_index ON ONLY ptif_test0 (a); @@ -56,6 +83,33 @@ CREATE INDEX ptif_test11_index ON ptif_test11 (a); ALTER INDEX ptif_test1_index ATTACH PARTITION ptif_test11_index; CREATE INDEX ptif_test2_index ON ptif_test2 (a); ALTER INDEX ptif_test_index ATTACH PARTITION ptif_test2_index; +CREATE INDEX ptif_test3_index ON ptif_test3 (a); +ALTER INDEX ptif_test_index ATTACH PARTITION ptif_test3_index; +-- Test pg_partition_root for indexes +SELECT pg_partition_root('ptif_test_index'); + pg_partition_root +------------------- + ptif_test_index +(1 row) + +SELECT pg_partition_root('ptif_test0_index'); + pg_partition_root +------------------- + ptif_test_index +(1 row) + +SELECT pg_partition_root('ptif_test01_index'); + pg_partition_root +------------------- + ptif_test_index +(1 row) + +SELECT pg_partition_root('ptif_test3_index'); + pg_partition_root +------------------- + ptif_test_index +(1 row) + -- List all tables members of the tree SELECT relid, parentrelid, level, isleaf FROM pg_partition_tree('ptif_test'); @@ -65,9 +119,10 @@ SELECT relid, parentrelid, level, isleaf ptif_test0 | ptif_test | 1 | f ptif_test1 | ptif_test | 1 | f ptif_test2 | ptif_test | 1 | t + ptif_test3 | ptif_test | 1 | f ptif_test01 | ptif_test0 | 2 | t ptif_test11 | ptif_test1 | 2 | t -(6 rows) +(7 rows) -- List tables from an intermediate level SELECT relid, parentrelid, level, isleaf @@ -88,6 +143,15 @@ SELECT relid, parentrelid, level, isleaf ptif_test01 | ptif_test0 | 0 | t (1 row) +-- List from partitioned table with no partitions +SELECT relid, parentrelid, level, isleaf + FROM pg_partition_tree('ptif_test3') p + JOIN pg_class c ON (p.relid = c.oid); + relid | parentrelid | level | isleaf +------------+-------------+-------+-------- + ptif_test3 | ptif_test | 0 | f +(1 row) + -- List all members using pg_partition_root with leaf table reference SELECT relid, parentrelid, level, isleaf FROM pg_partition_tree(pg_partition_root('ptif_test01')) p @@ -98,9 +162,10 @@ SELECT relid, parentrelid, level, isleaf ptif_test0 | ptif_test | 1 | f ptif_test1 | ptif_test | 1 | f ptif_test2 | ptif_test | 1 | t + ptif_test3 | ptif_test | 1 | f ptif_test01 | ptif_test0 | 2 | t ptif_test11 | ptif_test1 | 2 | t -(6 rows) +(7 rows) -- List all ancestors of root and leaf tables SELECT * FROM pg_partition_ancestors('ptif_test01'); @@ -126,9 +191,10 @@ SELECT relid, parentrelid, level, isleaf ptif_test0_index | ptif_test_index | 1 | f ptif_test1_index | ptif_test_index | 1 | f ptif_test2_index | ptif_test_index | 1 | t + ptif_test3_index | ptif_test_index | 1 | f ptif_test01_index | ptif_test0_index | 2 | t ptif_test11_index | ptif_test1_index | 2 | t -(6 rows) +(7 rows) -- List indexes from an intermediate level SELECT relid, parentrelid, level, isleaf @@ -149,6 +215,15 @@ SELECT relid, parentrelid, level, isleaf ptif_test01_index | ptif_test0_index | 0 | t (1 row) +-- List from 
partitioned index with no partitions +SELECT relid, parentrelid, level, isleaf + FROM pg_partition_tree('ptif_test3_index') p + JOIN pg_class c ON (p.relid = c.oid); + relid | parentrelid | level | isleaf +------------------+-----------------+-------+-------- + ptif_test3_index | ptif_test_index | 0 | f +(1 row) + -- List all members using pg_partition_root with leaf index reference SELECT relid, parentrelid, level, isleaf FROM pg_partition_tree(pg_partition_root('ptif_test01_index')) p @@ -159,9 +234,10 @@ SELECT relid, parentrelid, level, isleaf ptif_test0_index | ptif_test_index | 1 | f ptif_test1_index | ptif_test_index | 1 | f ptif_test2_index | ptif_test_index | 1 | t + ptif_test3_index | ptif_test_index | 1 | f ptif_test01_index | ptif_test0_index | 2 | t ptif_test11_index | ptif_test1_index | 2 | t -(6 rows) +(7 rows) -- List all ancestors of root and leaf indexes SELECT * FROM pg_partition_ancestors('ptif_test01_index'); @@ -179,19 +255,18 @@ SELECT * FROM pg_partition_ancestors('ptif_test_index'); (1 row) DROP TABLE ptif_test; --- A table not part of a partition tree works is the only member listed. +-- A table not part of a partition tree works is not listed. CREATE TABLE ptif_normal_table(a int); SELECT relid, parentrelid, level, isleaf FROM pg_partition_tree('ptif_normal_table'); - relid | parentrelid | level | isleaf --------------------+-------------+-------+-------- - ptif_normal_table | | 0 | t -(1 row) + relid | parentrelid | level | isleaf +-------+-------------+-------+-------- +(0 rows) SELECT pg_partition_root('ptif_normal_table'); pg_partition_root ------------------- - ptif_normal_table + (1 row) SELECT * FROM pg_partition_ancestors('ptif_normal_table'); @@ -205,15 +280,13 @@ CREATE VIEW ptif_test_view AS SELECT 1; CREATE MATERIALIZED VIEW ptif_test_matview AS SELECT 1; SELECT * FROM pg_partition_tree('ptif_test_view'); relid | parentrelid | isleaf | level --------+-------------+--------+------- - | | | -(1 row) +-------+-------------+--------+------- +(0 row) SELECT * FROM pg_partition_tree('ptif_test_matview'); relid | parentrelid | isleaf | level --------+-------------+--------+------- - | | | -(1 row) +-------+-------------+--------+------- +(0 row) SELECT pg_partition_root('ptif_test_view'); pg_partition_root diff --git a/src/test/regress/sql/partition_info.sql b/src/test/regress/sql/partition_info.sql index f49688eb..afa16c07 100644 --- a/src/test/regress/sql/partition_info.sql +++ b/src/test/regress/sql/partition_info.sql @@ -17,7 +17,16 @@ CREATE TABLE ptif_test1 PARTITION OF ptif_test FOR VALUES FROM (0) TO (100) PARTITION BY list (b); CREATE TABLE ptif_test11 PARTITION OF ptif_test1 FOR VALUES IN (1); CREATE TABLE ptif_test2 PARTITION OF ptif_test - FOR VALUES FROM (100) TO (maxvalue); + FOR VALUES FROM (100) TO (200); +-- This partitioned table should remain with no partitions. 
+CREATE TABLE ptif_test3 PARTITION OF ptif_test + FOR VALUES FROM (200) TO (maxvalue) PARTITION BY list (b); + +-- Test pg_partition_root for tables +SELECT pg_partition_root('ptif_test'); +SELECT pg_partition_root('ptif_test0'); +SELECT pg_partition_root('ptif_test01'); +SELECT pg_partition_root('ptif_test3'); -- Test index partition tree CREATE INDEX ptif_test_index ON ONLY ptif_test (a); @@ -31,6 +40,14 @@ CREATE INDEX ptif_test11_index ON ptif_test11 (a); ALTER INDEX ptif_test1_index ATTACH PARTITION ptif_test11_index; CREATE INDEX ptif_test2_index ON ptif_test2 (a); ALTER INDEX ptif_test_index ATTACH PARTITION ptif_test2_index; +CREATE INDEX ptif_test3_index ON ptif_test3 (a); +ALTER INDEX ptif_test_index ATTACH PARTITION ptif_test3_index; + +-- Test pg_partition_root for indexes +SELECT pg_partition_root('ptif_test_index'); +SELECT pg_partition_root('ptif_test0_index'); +SELECT pg_partition_root('ptif_test01_index'); +SELECT pg_partition_root('ptif_test3_index'); -- List all tables members of the tree SELECT relid, parentrelid, level, isleaf @@ -43,6 +60,10 @@ SELECT relid, parentrelid, level, isleaf SELECT relid, parentrelid, level, isleaf FROM pg_partition_tree('ptif_test01') p JOIN pg_class c ON (p.relid = c.oid); +-- List from partitioned table with no partitions +SELECT relid, parentrelid, level, isleaf + FROM pg_partition_tree('ptif_test3') p + JOIN pg_class c ON (p.relid = c.oid); -- List all members using pg_partition_root with leaf table reference SELECT relid, parentrelid, level, isleaf FROM pg_partition_tree(pg_partition_root('ptif_test01')) p @@ -62,6 +83,10 @@ SELECT relid, parentrelid, level, isleaf SELECT relid, parentrelid, level, isleaf FROM pg_partition_tree('ptif_test01_index') p JOIN pg_class c ON (p.relid = c.oid); +-- List from partitioned index with no partitions +SELECT relid, parentrelid, level, isleaf + FROM pg_partition_tree('ptif_test3_index') p + JOIN pg_class c ON (p.relid = c.oid); -- List all members using pg_partition_root with leaf index reference SELECT relid, parentrelid, level, isleaf FROM pg_partition_tree(pg_partition_root('ptif_test01_index')) p @@ -72,7 +97,7 @@ SELECT * FROM pg_partition_ancestors('ptif_test_index'); DROP TABLE ptif_test; --- A table not part of a partition tree works is the only member listed. +-- A table not part of a partition tree works is not listed. CREATE TABLE ptif_normal_table(a int); SELECT relid, parentrelid, level, isleaf FROM pg_partition_tree('ptif_normal_table'); From 31f9b1f23520319f12e2a075ef0608854ba45cca Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Wed, 1 Jul 2020 19:27:24 +0800 Subject: [PATCH 280/578] psql \dP: list partitioned tables and indexes. http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- doc/src/sgml/ref/psql-ref.sgml | 33 +++++ src/bin/psql/command.c | 17 +++ src/bin/psql/describe.c | 214 +++++++++++++++++++++++++++++ src/bin/psql/describe.h | 11 +- src/bin/psql/help.c | 3 +- src/bin/psql/tab-complete.c | 41 +++++- src/test/regress/expected/psql.out | 131 ++++++++++++++++++ src/test/regress/sql/psql.sql | 69 ++++++++++ 8 files changed, 513 insertions(+), 6 deletions(-) diff --git a/doc/src/sgml/ref/psql-ref.sgml b/doc/src/sgml/ref/psql-ref.sgml index c592edac..db3109f7 100644 --- a/doc/src/sgml/ref/psql-ref.sgml +++ b/doc/src/sgml/ref/psql-ref.sgml @@ -1609,6 +1609,39 @@ testdb=> + + + \dP[itn+] [ pattern ] + + + Lists partitioned relations. + If pattern + is specified, only entries whose name matches the pattern are listed. 
+ The modifiers t (tables) and i + (indexes) can be appended to the command, filtering the kind of + relations to list. By default, partitioned tables and indexes are + listed. + + + + If the modifier n (nested) is used, + or a pattern is specified, then non-root partitioned tables are + included, and a column is shown displaying the parent of each + partitioned relation. + + + + If + is appended to the command, the sum of sizes of + table's partitions (including that of their indexes) is also displayed, + along with the associated description. + If n is combined with +, two + sizes are shown: one including the total size of directly-attached + leaf partitions, and another showing the total size of all partitions, + including indirectly attached sub-partitions. + + + + \drds [ role-pattern [ database-pattern ] ] diff --git a/src/bin/psql/command.c b/src/bin/psql/command.c index 49813637..30ee6793 100644 --- a/src/bin/psql/command.c +++ b/src/bin/psql/command.c @@ -797,6 +797,23 @@ exec_command_d(PsqlScanState scan_state, bool active_branch, const char *cmd) case 'p': success = permissionsList(pattern); break; + case 'P': + { + switch (cmd[2]) + { + case '\0': + case '+': + case 't': + case 'i': + case 'n': + success = listPartitionedTables(&cmd[2], pattern, show_verbose); + break; + default: + status = PSQL_CMD_UNKNOWN; + break; + } + } + break; case 'T': success = describeTypes(pattern, show_verbose, show_system); break; diff --git a/src/bin/psql/describe.c b/src/bin/psql/describe.c index 1c671aca..b7023ae5 100644 --- a/src/bin/psql/describe.c +++ b/src/bin/psql/describe.c @@ -3771,6 +3771,220 @@ listTables(const char *tabtypes, const char *pattern, bool verbose, bool showSys return true; } +/* + * \dP + * Takes an optional regexp to select particular relations + * + * As with \d, you can specify the kinds of relations you want: + * + * t for tables + * i for indexes + * + * And there's additional flags: + * + * n to list non-leaf partitioned tables + * + * and you can mix and match these in any order. + */ +bool +listPartitionedTables(const char *reltypes, const char *pattern, bool verbose) +{ + bool showTables = strchr(reltypes, 't') != NULL; + bool showIndexes = strchr(reltypes, 'i') != NULL; + bool showNested = strchr(reltypes, 'n') != NULL; + PQExpBufferData buf; + PQExpBufferData title; + PGresult *res; + printQueryOpt myopt = pset.popt; + bool translate_columns[] = {false, false, false, false, false, false, false, false, false}; + const char *tabletitle; + bool mixed_output = false; + + /* + * Note: Declarative table partitioning is only supported as of Pg 10.0. 
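+ * On an older server the command just reports an error and runs no query.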
+ */ + if (pset.sversion < 100000) + { + char sverbuf[32]; + + pg_log_error("The server (version %s) does not support declarative table partitioning.", + formatPGVersionNumber(pset.sversion, false, + sverbuf, sizeof(sverbuf))); + return true; + } + + /* If no relation kind was selected, show them all */ + if (!showTables && !showIndexes) + showTables = showIndexes = true; + + if (showIndexes && !showTables) + tabletitle = _("List of partitioned indexes"); /* \dPi */ + else if (showTables && !showIndexes) + tabletitle = _("List of partitioned tables"); /* \dPt */ + else + { + /* show all kinds */ + tabletitle = _("List of partitioned relations"); + mixed_output = true; + } + + initPQExpBuffer(&buf); + + printfPQExpBuffer(&buf, + "SELECT n.nspname as \"%s\",\n" + " c.relname as \"%s\",\n" + " pg_catalog.pg_get_userbyid(c.relowner) as \"%s\"", + gettext_noop("Schema"), + gettext_noop("Name"), + gettext_noop("Owner")); + + if (mixed_output) + { + appendPQExpBuffer(&buf, + ",\n CASE c.relkind" + " WHEN " CppAsString2(RELKIND_PARTITIONED_TABLE) " THEN '%s'" + " WHEN " CppAsString2(RELKIND_PARTITIONED_INDEX) " THEN '%s'" + " END as \"%s\"", + gettext_noop("partitioned table"), + gettext_noop("partitioned index"), + gettext_noop("Type")); + + translate_columns[3] = true; + } + + if (showNested || pattern) + appendPQExpBuffer(&buf, + ",\n c3.oid::regclass as \"%s\"", + gettext_noop("Parent name")); + + if (showIndexes) + appendPQExpBuffer(&buf, + ",\n c2.oid::regclass as \"%s\"", + gettext_noop("On table")); + + if (verbose) + { + if (showNested) + { + appendPQExpBuffer(&buf, + ",\n s.dps as \"%s\"", + gettext_noop("Leaf partition size")); + appendPQExpBuffer(&buf, + ",\n s.tps as \"%s\"", + gettext_noop("Total size")); + } + else + /* Sizes of all partitions are considered in this case. 
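Only the single "Total size" column is shown, summing pg_catalog.pg_table_size() across the partitions of the tree.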
*/ + appendPQExpBuffer(&buf, + ",\n s.tps as \"%s\"", + gettext_noop("Total size")); + + appendPQExpBuffer(&buf, + ",\n pg_catalog.obj_description(c.oid, 'pg_class') as \"%s\"", + gettext_noop("Description")); + } + + appendPQExpBufferStr(&buf, + "\nFROM pg_catalog.pg_class c" + "\n LEFT JOIN pg_catalog.pg_namespace n ON n.oid = c.relnamespace"); + + if (showIndexes) + appendPQExpBufferStr(&buf, + "\n LEFT JOIN pg_catalog.pg_index i ON i.indexrelid = c.oid" + "\n LEFT JOIN pg_catalog.pg_class c2 ON i.indrelid = c2.oid"); + + if (showNested || pattern) + appendPQExpBufferStr(&buf, + "\n LEFT JOIN pg_catalog.pg_inherits inh ON c.oid = inh.inhrelid" + "\n LEFT JOIN pg_catalog.pg_class c3 ON c3.oid = inh.inhparent"); + + if (verbose) + { + if (pset.sversion < 120000) + { + appendPQExpBuffer(&buf, + ",\n LATERAL (WITH RECURSIVE d\n" + " AS (SELECT inhrelid AS oid, 1 AS level\n" + " FROM pg_catalog.pg_inherits\n" + " WHERE inhparent = c.oid\n" + " UNION ALL\n" + " SELECT inhrelid, level + 1\n" + " FROM pg_catalog.pg_inherits i\n" + " JOIN d ON i.inhparent = d.oid)\n" + " SELECT pg_catalog.pg_size_pretty(sum(pg_catalog.pg_table_size(" + "d.oid))) AS tps,\n" + " pg_catalog.pg_size_pretty(sum(" + "\n CASE WHEN d.level = 1" + " THEN pg_catalog.pg_table_size(d.oid) ELSE 0 END)) AS dps\n" + " FROM d) s"); + } + else + { + /* PostgreSQL 12 has pg_partition_tree function */ + appendPQExpBuffer(&buf, + ",\n LATERAL (SELECT pg_catalog.pg_size_pretty(sum(" + "\n CASE WHEN ppt.isleaf AND ppt.level = 1" + "\n THEN pg_catalog.pg_table_size(ppt.relid)" + " ELSE 0 END)) AS dps" + ",\n pg_catalog.pg_size_pretty(sum(" + "pg_catalog.pg_table_size(ppt.relid))) AS tps" + "\n FROM pg_catalog.pg_partition_tree(c.oid) ppt) s"); + } + } + + appendPQExpBufferStr(&buf, "\nWHERE c.relkind IN ("); + if (showTables) + appendPQExpBufferStr(&buf, CppAsString2(RELKIND_PARTITIONED_TABLE) ","); + if (showIndexes) + appendPQExpBufferStr(&buf, CppAsString2(RELKIND_PARTITIONED_INDEX) ","); + appendPQExpBufferStr(&buf, "''"); /* dummy */ + appendPQExpBufferStr(&buf, ")\n"); + + appendPQExpBufferStr(&buf, !showNested && !pattern ? + " AND NOT c.relispartition\n" : ""); + + if (!pattern) + appendPQExpBufferStr(&buf, " AND n.nspname <> 'pg_catalog'\n" + " AND n.nspname <> 'information_schema'\n"); + + /* + * TOAST objects are suppressed unconditionally. Since we don't provide + * any way to select RELKIND_TOASTVALUE above, we would never show toast + * tables in any case; it seems a bit confusing to allow their indexes to + * be shown. Use plain \d if you really need to look at a TOAST + * table/index. + */ + appendPQExpBufferStr(&buf, " AND n.nspname !~ '^pg_toast'\n"); + + processSQLNamePattern(pset.db, &buf, pattern, true, false, + "n.nspname", "c.relname", NULL, + "pg_catalog.pg_table_is_visible(c.oid)"); + + appendPQExpBuffer(&buf, "ORDER BY \"Schema\", %s%s\"Name\";", + mixed_output ? "\"Type\" DESC, " : "", + showNested || pattern ? 
"\"Parent name\" NULLS FIRST, " : ""); + + res = PSQLexec(buf.data); + termPQExpBuffer(&buf); + if (!res) + return false; + + initPQExpBuffer(&title); + appendPQExpBuffer(&title, "%s", tabletitle); + + myopt.nullPrint = NULL; + myopt.title = title.data; + myopt.translate_header = true; + myopt.translate_columns = translate_columns; + myopt.n_translate_columns = lengthof(translate_columns); + + printQuery(res, &myopt, pset.queryFout, false, pset.logfile); + + termPQExpBuffer(&title); + + PQclear(res); + return true; +} /* * \dL diff --git a/src/bin/psql/describe.h b/src/bin/psql/describe.h index da6046c9..2224397f 100644 --- a/src/bin/psql/describe.h +++ b/src/bin/psql/describe.h @@ -63,6 +63,9 @@ extern bool listAllDbs(const char *pattern, bool verbose); /* \dt, \di, \ds, \dS, etc. */ extern bool listTables(const char *tabtypes, const char *pattern, bool verbose, bool showSystem); +/* \dP */ +extern bool listPartitionedTables(const char *reltypes, const char *pattern, bool verbose); + /* \dD */ extern bool listDomains(const char *pattern, bool verbose, bool showSystem); @@ -103,12 +106,12 @@ extern bool listExtensionContents(const char *pattern); extern bool listEventTriggers(const char *pattern, bool verbose); /* \dRp */ -bool listPublications(const char *pattern); +bool listPublications(const char *pattern); /* \dRp+ */ -bool describePublications(const char *pattern); +bool describePublications(const char *pattern); /* \dRs */ -bool describeSubscriptions(const char *pattern, bool verbose); +bool describeSubscriptions(const char *pattern, bool verbose); -#endif /* DESCRIBE_H */ +#endif /* DESCRIBE_H */ diff --git a/src/bin/psql/help.c b/src/bin/psql/help.c index a793c9f9..ec7d0efe 100644 --- a/src/bin/psql/help.c +++ b/src/bin/psql/help.c @@ -167,7 +167,7 @@ slashUsage(unsigned short int pager) * Use "psql --help=commands | wc" to count correctly. It's okay to count * the USE_READLINE line even in builds without that. */ - output = PageOutput(122, pager ? &(pset.popt.topt) : NULL); + output = PageOutput(123, pager ? 
&(pset.popt.topt) : NULL); fprintf(output, _("General\n")); fprintf(output, _(" \\copyright show PostgreSQL usage and distribution terms\n")); @@ -248,6 +248,7 @@ slashUsage(unsigned short int pager) fprintf(output, _(" \\do[S] [PATTERN] list operators\n")); fprintf(output, _(" \\dO[S+] [PATTERN] list collations\n")); fprintf(output, _(" \\dp [PATTERN] list table, view, and sequence access privileges\n")); + fprintf(output, _(" \\dP[tin+] [PATTERN] list [only table/index] partitioned relations\n")); fprintf(output, _(" \\drds [PATRN1 [PATRN2]] list per-database role settings\n")); fprintf(output, _(" \\dRp[+] [PATTERN] list replication publications\n")); fprintf(output, _(" \\dRs[+] [PATTERN] list replication subscriptions\n")); diff --git a/src/bin/psql/tab-complete.c b/src/bin/psql/tab-complete.c index 29eaf18b..fd1c4a5e 100644 --- a/src/bin/psql/tab-complete.c +++ b/src/bin/psql/tab-complete.c @@ -499,6 +499,23 @@ static const SchemaQuery Query_for_list_of_constraints_with_schema = { NULL }; +/* partitioned relations */ +static const SchemaQuery Query_for_list_of_partitioned_relations = { + /* catname */ + "pg_catalog.pg_class c", + /* selcondition */ + "c.relkind IN (" CppAsString2(RELKIND_PARTITIONED_TABLE) + ", " CppAsString2(RELKIND_PARTITIONED_INDEX) ")", + /* viscondition */ + "pg_catalog.pg_table_is_visible(c.oid)", + /* namespace */ + "c.relnamespace", + /* result */ + "pg_catalog.quote_ident(c.relname)", + /* qualresult */ + NULL +}; + /* Relations supporting INSERT, UPDATE or DELETE */ static const SchemaQuery Query_for_list_of_updatables = { /* catname */ @@ -518,6 +535,22 @@ static const SchemaQuery Query_for_list_of_updatables = { NULL }; +static const SchemaQuery Query_for_list_of_partitioned_indexes = { + /* catname */ + "pg_catalog.pg_class c", + /* selcondition */ + "c.relkind = " CppAsString2(RELKIND_PARTITIONED_INDEX), + /* viscondition */ + "pg_catalog.pg_table_is_visible(c.oid)", + /* namespace */ + "c.relnamespace", + /* result */ + "pg_catalog.quote_ident(c.relname)", + /* qualresult */ + NULL +}; + +/* All relations */ static const SchemaQuery Query_for_list_of_relations = { /* catname */ "pg_catalog.pg_class c", @@ -1444,7 +1477,7 @@ psql_completion(const char *text, int start, int end) "\\d", "\\da", "\\dA", "\\db", "\\dc", "\\dC", "\\dd", "\\ddp", "\\dD", "\\des", "\\det", "\\deu", "\\dew", "\\dE", "\\df", "\\dF", "\\dFd", "\\dFp", "\\dFt", "\\dg", "\\di", "\\dl", "\\dL", - "\\dm", "\\dn", "\\do", "\\dO", "\\dp", + "\\dm", "\\dn", "\\do", "\\dO", "\\dp", "\\dP", "\\dPi", "\\dPt", "\\drds", "\\dRs", "\\dRp", "\\ds", "\\dS", "\\dt", "\\dT", "\\dv", "\\du", "\\dx", "\\dy", "\\e", "\\echo", "\\ef", "\\elif", "\\else", "\\encoding", @@ -3471,6 +3504,12 @@ psql_completion(const char *text, int start, int end) COMPLETE_WITH_QUERY(Query_for_list_of_schemas); else if (TailMatchesCS1("\\dp") || TailMatchesCS1("\\z")) COMPLETE_WITH_SCHEMA_QUERY(Query_for_list_of_tsvmf, NULL); + else if (TailMatchesCS("\\dPi*")) + COMPLETE_WITH_SCHEMA_QUERY(Query_for_list_of_partitioned_indexes, NULL); + else if (TailMatchesCS("\\dPt*")) + COMPLETE_WITH_SCHEMA_QUERY(Query_for_list_of_partitioned_tables, NULL); + else if (TailMatchesCS("\\dP*")) + COMPLETE_WITH_SCHEMA_QUERY(Query_for_list_of_partitioned_relations, NULL); else if (TailMatchesCS1("\\ds*")) COMPLETE_WITH_SCHEMA_QUERY(Query_for_list_of_sequences, NULL); else if (TailMatchesCS1("\\dt*")) diff --git a/src/test/regress/expected/psql.out b/src/test/regress/expected/psql.out index d602aeef..3e0eae21 100644 --- 
a/src/test/regress/expected/psql.out +++ b/src/test/regress/expected/psql.out @@ -2964,3 +2964,134 @@ SELECT 3 UNION SELECT 4 UNION SELECT 5 ORDER BY 1; +create schema testpart; +create role testrole_partitioning; +alter schema testpart owner to testrole_partitioning; +set role to testrole_partitioning; +-- run test inside own schema and hide other partitions +set search_path to testpart; +create table testtable_apple(logdate date); +create table testtable_orange(logdate date); +create index testtable_apple_index on testtable_apple(logdate); +create index testtable_orange_index on testtable_orange(logdate); +create table testpart_apple(logdate date) partition by range(logdate); +create table testpart_orange(logdate date) partition by range(logdate); +create index testpart_apple_index on testpart_apple(logdate); +create index testpart_orange_index on testpart_orange(logdate); +-- only partition related object should be displayed +\dP test*apple* + List of partitioned relations + Schema | Name | Owner | Type | Parent name | On table +----------+----------------------+-----------------------+-------------------+-------------+---------------- + testpart | testpart_apple | testrole_partitioning | partitioned table | | + testpart | testpart_apple_index | testrole_partitioning | partitioned index | | testpart_apple +(2 rows) + +\dPt test*apple* + List of partitioned tables + Schema | Name | Owner | Parent name +----------+----------------+-----------------------+------------- + testpart | testpart_apple | testrole_partitioning | +(1 row) + +\dPi test*apple* + List of partitioned indexes + Schema | Name | Owner | Parent name | On table +----------+----------------------+-----------------------+-------------+---------------- + testpart | testpart_apple_index | testrole_partitioning | | testpart_apple +(1 row) + +drop table testtable_apple; +drop table testtable_orange; +drop table testpart_apple; +drop table testpart_orange; +create table parent_tab (id int) partition by range (id); +create index parent_index on parent_tab (id); +create table child_0_10 partition of parent_tab + for values from (0) to (10); +create table child_10_20 partition of parent_tab + for values from (10) to (20); +create table child_20_30 partition of parent_tab + for values from (20) to (30); +insert into parent_tab values (generate_series(0,29)); +create table child_30_40 partition of parent_tab +for values from (30) to (40) + partition by range(id); +create table child_30_35 partition of child_30_40 + for values from (30) to (35); +create table child_35_40 partition of child_30_40 + for values from (35) to (40); +insert into parent_tab values (generate_series(30,39)); +\dPt + List of partitioned tables + Schema | Name | Owner +----------+------------+----------------------- + testpart | parent_tab | testrole_partitioning +(1 row) + +\dPi + List of partitioned indexes + Schema | Name | Owner | On table +----------+--------------+-----------------------+------------ + testpart | parent_index | testrole_partitioning | parent_tab +(1 row) + +\dP testpart.* + List of partitioned relations + Schema | Name | Owner | Type | Parent name | On table +----------+--------------------+-----------------------+-------------------+--------------+------------- + testpart | parent_tab | testrole_partitioning | partitioned table | | + testpart | child_30_40 | testrole_partitioning | partitioned table | parent_tab | + testpart | parent_index | testrole_partitioning | partitioned index | | parent_tab + testpart | child_30_40_id_idx | 
testrole_partitioning | partitioned index | parent_index | child_30_40 +(4 rows) + +\dP + List of partitioned relations + Schema | Name | Owner | Type | On table +----------+--------------+-----------------------+-------------------+------------ + testpart | parent_tab | testrole_partitioning | partitioned table | + testpart | parent_index | testrole_partitioning | partitioned index | parent_tab +(2 rows) + +\dPtn + List of partitioned tables + Schema | Name | Owner | Parent name +----------+-------------+-----------------------+------------- + testpart | parent_tab | testrole_partitioning | + testpart | child_30_40 | testrole_partitioning | parent_tab +(2 rows) + +\dPin + List of partitioned indexes + Schema | Name | Owner | Parent name | On table +----------+--------------------+-----------------------+--------------+------------- + testpart | parent_index | testrole_partitioning | | parent_tab + testpart | child_30_40_id_idx | testrole_partitioning | parent_index | child_30_40 +(2 rows) + +\dPn + List of partitioned relations + Schema | Name | Owner | Type | Parent name | On table +----------+--------------------+-----------------------+-------------------+--------------+------------- + testpart | parent_tab | testrole_partitioning | partitioned table | | + testpart | child_30_40 | testrole_partitioning | partitioned table | parent_tab | + testpart | parent_index | testrole_partitioning | partitioned index | | parent_tab + testpart | child_30_40_id_idx | testrole_partitioning | partitioned index | parent_index | child_30_40 +(4 rows) + +\dPn testpart.* + List of partitioned relations + Schema | Name | Owner | Type | Parent name | On table +----------+--------------------+-----------------------+-------------------+--------------+------------- + testpart | parent_tab | testrole_partitioning | partitioned table | | + testpart | child_30_40 | testrole_partitioning | partitioned table | parent_tab | + testpart | parent_index | testrole_partitioning | partitioned index | | parent_tab + testpart | child_30_40_id_idx | testrole_partitioning | partitioned index | parent_index | child_30_40 +(4 rows) + +drop table parent_tab cascade; +drop schema testpart; +set search_path to default; +set role to default; +drop role testrole_partitioning; \ No newline at end of file diff --git a/src/test/regress/sql/psql.sql b/src/test/regress/sql/psql.sql index b56a05f7..60b34177 100644 --- a/src/test/regress/sql/psql.sql +++ b/src/test/regress/sql/psql.sql @@ -560,3 +560,72 @@ UNION SELECT 5 ORDER BY 1; \r \p + +create schema testpart; +create role testrole_partitioning; + +alter schema testpart owner to testrole_partitioning; + +set role to testrole_partitioning; + +-- run test inside own schema and hide other partitions +set search_path to testpart; + +create table testtable_apple(logdate date); +create table testtable_orange(logdate date); +create index testtable_apple_index on testtable_apple(logdate); +create index testtable_orange_index on testtable_orange(logdate); + +create table testpart_apple(logdate date) partition by range(logdate); +create table testpart_orange(logdate date) partition by range(logdate); + +create index testpart_apple_index on testpart_apple(logdate); +create index testpart_orange_index on testpart_orange(logdate); + +-- only partition related object should be displayed +\dP test*apple* +\dPt test*apple* +\dPi test*apple* + +drop table testtable_apple; +drop table testtable_orange; +drop table testpart_apple; +drop table testpart_orange; + +create table parent_tab (id int) 
partition by range (id); +create index parent_index on parent_tab (id); +create table child_0_10 partition of parent_tab + for values from (0) to (10); +create table child_10_20 partition of parent_tab + for values from (10) to (20); +create table child_20_30 partition of parent_tab + for values from (20) to (30); +insert into parent_tab values (generate_series(0,29)); +create table child_30_40 partition of parent_tab +for values from (30) to (40) + partition by range(id); +create table child_30_35 partition of child_30_40 + for values from (30) to (35); +create table child_35_40 partition of child_30_40 + for values from (35) to (40); +insert into parent_tab values (generate_series(30,39)); + +\dPt +\dPi + +\dP testpart.* +\dP + +\dPtn +\dPin +\dPn +\dPn testpart.* + +drop table parent_tab cascade; + +drop schema testpart; + +set search_path to default; + +set role to default; +drop role testrole_partitioning; From 219faaf03d6ca29fce4eb774d27de63ca68cc53c Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Wed, 1 Jul 2020 20:07:49 +0800 Subject: [PATCH 281/578] Fix EvalPlanQualStart to handle partitioned result rels correctly. http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/backend/executor/execMain.c | 14 +++++++++++++- src/test/isolation/expected/eval-plan-qual.out | 12 ++++++++++++ src/test/isolation/specs/eval-plan-qual.spec | 18 ++++++++++++++++++ 3 files changed, 43 insertions(+), 1 deletion(-) diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c index 5a082133..4140f135 100644 --- a/src/backend/executor/execMain.c +++ b/src/backend/executor/execMain.c @@ -3510,7 +3510,7 @@ EvalPlanQualStart(EPQState *epqstate, EState *parentestate, Plan *planTree) * es_param_exec_vals, etc. * * The ResultRelInfo array management is trickier than it looks. We - * create a fresh array for the child but copy all the content from the + * create fresh arrays for the child but copy all the content from the * parent. This is because it's okay for the child to share any * per-relation state the parent has already created --- but if the child * sets up any ResultRelInfo fields, such as its own junkfilter, that @@ -3527,6 +3527,7 @@ EvalPlanQualStart(EPQState *epqstate, EState *parentestate, Plan *planTree) if (parentestate->es_num_result_relations > 0) { int numResultRelations = parentestate->es_num_result_relations; + int numRootResultRels = parentestate->es_num_root_result_relations; ResultRelInfo *resultRelInfos; resultRelInfos = (ResultRelInfo *) @@ -3535,6 +3536,17 @@ EvalPlanQualStart(EPQState *epqstate, EState *parentestate, Plan *planTree) numResultRelations * sizeof(ResultRelInfo)); estate->es_result_relations = resultRelInfos; estate->es_num_result_relations = numResultRelations; + + /* Also transfer partitioned root result relations. 
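These need the same shallow copy as es_result_relations above; otherwise EPQ rechecks on partitioned result relations fail, which the new eval-plan-qual isolation permutation below exercises.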
*/ + if (numRootResultRels > 0) + { + resultRelInfos = (ResultRelInfo *) + palloc(numRootResultRels * sizeof(ResultRelInfo)); + memcpy(resultRelInfos, parentestate->es_root_result_relations, + numRootResultRels * sizeof(ResultRelInfo)); + estate->es_root_result_relations = resultRelInfos; + estate->es_num_root_result_relations = numRootResultRels; + } } /* es_result_relation_info must NOT be copied */ /* es_trig_target_relations must NOT be copied */ diff --git a/src/test/isolation/expected/eval-plan-qual.out b/src/test/isolation/expected/eval-plan-qual.out index 10c784a0..6be164fe 100644 --- a/src/test/isolation/expected/eval-plan-qual.out +++ b/src/test/isolation/expected/eval-plan-qual.out @@ -184,3 +184,15 @@ step readwcte: <... completed> id value 1 tableAValue2 + +starting permutation: simplepartupdate complexpartupdate c1 c2 +step simplepartupdate: + update parttbl set a = a; + +step complexpartupdate: + with u as (update parttbl set a = a returning parttbl.*) + update parttbl set a = u.a from u; + +step c1: COMMIT; +step complexpartupdate: <... completed> +step c2: COMMIT; \ No newline at end of file diff --git a/src/test/isolation/specs/eval-plan-qual.spec b/src/test/isolation/specs/eval-plan-qual.spec index 5e1fce05..a2f6948b 100644 --- a/src/test/isolation/specs/eval-plan-qual.spec +++ b/src/test/isolation/specs/eval-plan-qual.spec @@ -47,10 +47,17 @@ setup INSERT INTO table_a VALUES (1, 'tableAValue'); INSERT INTO table_b VALUES (1, 'tableBValue'); } +setup +{ + CREATE TABLE parttbl (a int) PARTITION BY LIST (a); + CREATE TABLE parttbl1 PARTITION OF parttbl FOR VALUES IN (1); + INSERT INTO parttbl VALUES (1); +} teardown { DROP TABLE accounts, p, table_a, table_b CASCADE; + DROP TABLE parttbl; } session "s1" @@ -101,6 +108,11 @@ step "updateforss" { UPDATE table_b SET value = 'newTableBValue' WHERE id = 1; } +# test for EPQ on a partitioned result table + +step "simplepartupdate" { + update parttbl set a = a; +} session "s2" setup { BEGIN ISOLATION LEVEL READ COMMITTED; } @@ -127,6 +139,10 @@ step "readforss" { WHERE ta.id = 1 FOR UPDATE OF ta; } step "wrtwcte" { UPDATE table_a SET value = 'tableAValue2' WHERE id = 1; } +step "complexpartupdate" { + with u as (update parttbl set a = a returning parttbl.*) + update parttbl set a = u.a from u; +} step "c2" { COMMIT; } session "s3" @@ -158,3 +174,5 @@ permutation "wx2" "partiallock" "c2" "c1" "read" permutation "wx2" "lockwithvalues" "c2" "c1" "read" permutation "updateforss" "readforss" "c1" "c2" permutation "wrtwcte" "readwcte" "c1" "c2" + +permutation "simplepartupdate" "complexpartupdate" "c1" "c2" From c8c453ee8547a85a445bc5718e94058cf38487f6 Mon Sep 17 00:00:00 2001 From: Alvaro Herrera Date: Wed, 17 Apr 2019 18:17:43 -0400 Subject: [PATCH 282/578] psql: display tablespace for partitioned indexes Nothing was shown previously. 
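A sketch of the resulting behavior, using the regression objects touched below: \d on a partitioned index stored in a non-default tablespace now ends with a Tablespace footer.

\d testschema.part_a_idx
-- footer now includes: Tablespace: "regress_tblspace"

Previously add_tablespace_footer() skipped RELKIND_PARTITIONED_INDEX, so nothing was printed.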
--- src/bin/psql/describe.c | 3 ++- src/test/regress/input/tablespace.source | 1 + src/test/regress/output/tablespace.source | 8 ++++++++ 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/src/bin/psql/describe.c b/src/bin/psql/describe.c index b7023ae5..4fd3864b 100644 --- a/src/bin/psql/describe.c +++ b/src/bin/psql/describe.c @@ -3281,7 +3281,8 @@ add_tablespace_footer(printTableContent *const cont, char relkind, if (relkind == RELKIND_RELATION || relkind == RELKIND_MATVIEW || relkind == RELKIND_INDEX || - relkind == RELKIND_PARTITIONED_TABLE) + relkind == RELKIND_PARTITIONED_TABLE || + relkind == RELKIND_PARTITIONED_INDEX) { /* * We ignore the database default tablespace so that users not using diff --git a/src/test/regress/input/tablespace.source b/src/test/regress/input/tablespace.source index abad2716..4bf5302d 100644 --- a/src/test/regress/input/tablespace.source +++ b/src/test/regress/input/tablespace.source @@ -63,6 +63,7 @@ CREATE INDEX part_a_idx ON testschema.part (a) TABLESPACE regress_tblspace; CREATE TABLE testschema.part2 PARTITION OF testschema.part FOR VALUES IN (2); SELECT relname, spcname FROM pg_catalog.pg_tablespace t, pg_catalog.pg_class c where c.reltablespace = t.oid AND c.relname LIKE 'part%_idx'; +\d testschema.part_a_idx -- check that default_tablespace doesn't affect ALTER TABLE index rebuilds CREATE TABLE testschema.test_default_tab(id bigint) TABLESPACE regress_tblspace; diff --git a/src/test/regress/output/tablespace.source b/src/test/regress/output/tablespace.source index 03383fd4..6688ae7e 100644 --- a/src/test/regress/output/tablespace.source +++ b/src/test/regress/output/tablespace.source @@ -94,6 +94,14 @@ SELECT relname, spcname FROM pg_catalog.pg_tablespace t, pg_catalog.pg_class c part_a_idx | regress_tblspace (3 rows) +\d testschema.part_a_idx +Partitioned index "testschema.part_a_idx" + Column | Type | Key? | Definition +--------+---------+------+------------ + a | integer | yes | a +btree, for table "testschema.part" +Tablespace: "regress_tblspace" + -- check that default_tablespace doesn't affect ALTER TABLE index rebuilds CREATE TABLE testschema.test_default_tab(id bigint) TABLESPACE regress_tblspace; INSERT INTO testschema.test_default_tab VALUES (1); From aac238dfe88c46dfaa5c234f90973a1dea5358eb Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Wed, 1 Jul 2020 21:43:00 +0800 Subject: [PATCH 283/578] Fix tablespace inheritance for partitioned rels. 
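A minimal sketch of the behavior this patch establishes (p, p1 and q are illustrative names; regress_tblspace comes from the regression tests, and pg_default is assumed to be the database's default tablespace):

CREATE TABLE p (a int) PARTITION BY LIST (a) TABLESPACE regress_tblspace;
CREATE TABLE p1 PARTITION OF p FOR VALUES IN (1);
-- p1 is placed in regress_tblspace, inherited from its parent
CREATE TABLE q (a int) PARTITION BY LIST (a) TABLESPACE pg_default;
-- ERROR:  cannot specify default tablespace for partitioned relations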
http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/backend/bootstrap/bootparse.y | 2 + src/backend/commands/indexcmds.c | 28 +- src/backend/commands/matview.c | 2 +- src/backend/commands/tablecmds.c | 32 +-- src/backend/commands/tablespace.c | 16 +- src/backend/nodes/copyfuncs.c | 2 + src/backend/nodes/equalfuncs.c | 2 + src/backend/nodes/outfuncs.c | 4 + src/backend/parser/gram.y | 2 + src/backend/parser/parse_utilcmd.c | 2 + src/backend/utils/adt/ruleutils.c | 11 +- src/include/commands/tablespace.h | 30 +- src/include/nodes/parsenodes.h | 4 + src/test/regress/input/tablespace.source | 90 ++++++ src/test/regress/output/tablespace.source | 334 ++++++++++++++++++++++ 15 files changed, 523 insertions(+), 38 deletions(-) diff --git a/src/backend/bootstrap/bootparse.y b/src/backend/bootstrap/bootparse.y index 78267925..137c2dad 100644 --- a/src/backend/bootstrap/bootparse.y +++ b/src/backend/bootstrap/bootparse.y @@ -377,6 +377,7 @@ Boot_DeclareIndexStmt: stmt->transformed = false; stmt->concurrent = false; stmt->if_not_exists = false; + stmt->reset_default_tblspc = false; /* locks and races need not concern us in bootstrap mode */ relationId = RangeVarGetRelid(stmt->relation, NoLock, @@ -421,6 +422,7 @@ Boot_DeclareUniqueIndexStmt: stmt->transformed = false; stmt->concurrent = false; stmt->if_not_exists = false; + stmt->reset_default_tblspc = false; /* locks and races need not concern us in bootstrap mode */ relationId = RangeVarGetRelid(stmt->relation, NoLock, diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c index ad99f3e2..4596a9f4 100644 --- a/src/backend/commands/indexcmds.c +++ b/src/backend/commands/indexcmds.c @@ -349,9 +349,23 @@ DefineIndex(Oid relationId, LOCKTAG heaplocktag; LOCKMODE lockmode; Snapshot snapshot; + int save_nestlevel = -1; int i; /* + * Some callers need us to run with an empty default_tablespace; this is a + * necessary hack to be able to reproduce catalog state accurately when + * recreating indexes after table-rewriting ALTER TABLE. + */ + if (stmt->reset_default_tblspc) + { + save_nestlevel = NewGUCNestLevel(); + (void) set_config_option("default_tablespace", "", + PGC_USERSET, PGC_S_SESSION, + GUC_ACTION_SAVE, true, 0, false); + } + + /* * count attributes in index */ numberOfAttributes = list_length(stmt->indexParams); @@ -448,10 +462,15 @@ DefineIndex(Oid relationId, if (stmt->tableSpace) { tablespaceId = get_tablespace_oid(stmt->tableSpace, false); + if (partitioned && tablespaceId == MyDatabaseTableSpace) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot specify default tablespace for partitioned relation"))); } else { - tablespaceId = GetDefaultTablespace(rel->rd_rel->relpersistence); + tablespaceId = GetDefaultTablespace(rel->rd_rel->relpersistence, + partitioned); /* note InvalidOid is OK in this case */ } @@ -753,6 +772,13 @@ DefineIndex(Oid relationId, ObjectAddressSet(address, RelationRelationId, indexRelationId); + /* + * Revert to original default_tablespace. Must do this before any return + * from this function, but after index_create, so this is a good time. 
+ */ + if (save_nestlevel >= 0) + AtEOXact_GUC(true, save_nestlevel); + if (!OidIsValid(indexRelationId)) { heap_close(rel, NoLock); diff --git a/src/backend/commands/matview.c b/src/backend/commands/matview.c index 0144ee68..102e2f36 100644 --- a/src/backend/commands/matview.c +++ b/src/backend/commands/matview.c @@ -303,7 +303,7 @@ ExecRefreshMatView(RefreshMatViewStmt *stmt, const char *queryString, /* Concurrent refresh builds new data in temp tablespace, and does diff. */ if (concurrent) { - tableSpace = GetDefaultTablespace(RELPERSISTENCE_TEMP); + tableSpace = GetDefaultTablespace(RELPERSISTENCE_TEMP, false); relpersistence = RELPERSISTENCE_TEMP; } else diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index b24611be..8f8a76d8 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -590,6 +590,7 @@ DefineRelation(CreateStmt *stmt, char relkind, Oid ownerId, Datum reloptions; ListCell *listptr; AttrNumber attnum; + bool partitioned; static char *validnsps[] = HEAP_RELOPT_NAMESPACES; Oid ofTypeId; ObjectAddress address; @@ -634,7 +635,10 @@ DefineRelation(CreateStmt *stmt, char relkind, Oid ownerId, else #endif relkind = RELKIND_PARTITIONED_TABLE; + partitioned = true; } + else + partitioned = false; /* * Look up the namespace in which we are supposed to create the relation, @@ -716,31 +720,24 @@ DefineRelation(CreateStmt *stmt, char relkind, Oid ownerId, if (stmt->tablespacename) { tablespaceId = get_tablespace_oid(stmt->tablespacename, false); + + if (partitioned && tablespaceId == MyDatabaseTableSpace) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot specify default tablespace for partitioned relations"))); } else if (stmt->partbound) { - HeapTuple tup; - /* * For partitions, when no other tablespace is specified, we default * the tablespace to the parent partitioned table's. */ Assert(list_length(inheritOids) == 1); - tup = SearchSysCache1(RELOID, - DatumGetObjectId(linitial_oid(inheritOids))); - - tablespaceId = ((Form_pg_class) GETSTRUCT(tup))->reltablespace; - - if (!OidIsValid(tablespaceId)) - tablespaceId = GetDefaultTablespace(stmt->relation->relpersistence); - - ReleaseSysCache(tup); + tablespaceId = get_rel_tablespace(linitial_oid(inheritOids)); } else - { - tablespaceId = GetDefaultTablespace(stmt->relation->relpersistence); - /* note InvalidOid is OK in this case */ - } + tablespaceId = GetDefaultTablespace(stmt->relation->relpersistence, + partitioned); /* Check permissions except when using database's default */ if (OidIsValid(tablespaceId) && tablespaceId != MyDatabaseTableSpace) @@ -1183,7 +1180,7 @@ DefineRelation(CreateStmt *stmt, char relkind, Oid ownerId, * Process the partitioning specification (if any) and store the partition * key information into the catalog. 
*/ - if (stmt->partspec) + if (partitioned) { ParseState *pstate; char strategy; @@ -11628,6 +11625,7 @@ ATPostAlterTypeParse(Oid oldId, Oid oldRelId, Oid refRelId, char *cmd, if (!rewrite) TryReuseIndex(oldId, stmt); + stmt->reset_default_tblspc = true; /* keep the index's comment */ stmt->idxcomment = GetComment(oldId, RelationRelationId, 0); @@ -11659,6 +11657,7 @@ ATPostAlterTypeParse(Oid oldId, Oid oldRelId, Oid refRelId, char *cmd, /* keep any comment on the index */ indstmt->idxcomment = GetComment(indoid, RelationRelationId, 0); + indstmt->reset_default_tblspc = true; cmd->subtype = AT_ReAddIndex; tab->subcmds[AT_PASS_OLD_INDEX] = @@ -11680,6 +11679,7 @@ ATPostAlterTypeParse(Oid oldId, Oid oldRelId, Oid refRelId, char *cmd, if (con->contype == CONSTR_FOREIGN && !rewrite && tab->rewrite == 0) TryReuseForeignKey(oldId, con); + con->reset_default_tblspc = true; cmd->subtype = AT_ReAddConstraint; tab->subcmds[AT_PASS_OLD_CONSTR] = lappend(tab->subcmds[AT_PASS_OLD_CONSTR], cmd); diff --git a/src/backend/commands/tablespace.c b/src/backend/commands/tablespace.c index 28892609..1b208c6c 100644 --- a/src/backend/commands/tablespace.c +++ b/src/backend/commands/tablespace.c @@ -1206,7 +1206,9 @@ check_default_tablespace(char **newval, void **extra, GucSource source) * GetDefaultTablespace -- get the OID of the current default tablespace * * Temporary objects have different default tablespaces, hence the - * relpersistence parameter must be specified. + * relpersistence parameter must be specified. Also, for partitioned tables, + * we disallow specifying the database default, so that needs to be specified + * too. * * May return InvalidOid to indicate "use the database's default tablespace". * @@ -1217,7 +1219,7 @@ check_default_tablespace(char **newval, void **extra, GucSource source) * default_tablespace GUC variable. */ Oid -GetDefaultTablespace(char relpersistence) +GetDefaultTablespace(char relpersistence, bool partitioned) { Oid result; @@ -1243,10 +1245,18 @@ GetDefaultTablespace(char relpersistence) /* * Allow explicit specification of database's default tablespace in - * default_tablespace without triggering permissions checks. + * default_tablespace without triggering permissions checks. Don't + * allow specifying that when creating a partitioned table, however, + * since the result is confusing. 
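In SQL terms, both ways of naming the database default for a partitioned table are now rejected. A condensed sketch of the two failure modes (same pattern as the testschema.dflt cases added below, minus the PRIMARY KEY):

CREATE TABLE testschema.dflt (a int) PARTITION BY LIST (a) TABLESPACE pg_default;
-- ERROR:  cannot specify default tablespace for partitioned relations
SET default_tablespace TO 'pg_default';
CREATE TABLE testschema.dflt (a int) PARTITION BY LIST (a);
-- ERROR:  cannot specify default tablespace for partitioned relations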
*/ if (result == MyDatabaseTableSpace) + { + if (partitioned) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot specify default tablespace for partitioned relations"))); result = InvalidOid; + } return result; } diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c index e87c8463..ea4a0c71 100644 --- a/src/backend/nodes/copyfuncs.c +++ b/src/backend/nodes/copyfuncs.c @@ -3153,6 +3153,7 @@ _copyConstraint(const Constraint *from) COPY_NODE_FIELD(options); COPY_STRING_FIELD(indexname); COPY_STRING_FIELD(indexspace); + COPY_SCALAR_FIELD(reset_default_tblspc); COPY_STRING_FIELD(access_method); COPY_NODE_FIELD(where_clause); COPY_NODE_FIELD(pktable); @@ -3745,6 +3746,7 @@ _copyIndexStmt(const IndexStmt *from) COPY_SCALAR_FIELD(transformed); COPY_SCALAR_FIELD(concurrent); COPY_SCALAR_FIELD(if_not_exists); + COPY_SCALAR_FIELD(reset_default_tblspc); #ifdef __TBASE__ COPY_SCALAR_FIELD(parentIndexOid); #endif diff --git a/src/backend/nodes/equalfuncs.c b/src/backend/nodes/equalfuncs.c index 8abab4bb..c92cbd30 100644 --- a/src/backend/nodes/equalfuncs.c +++ b/src/backend/nodes/equalfuncs.c @@ -1365,6 +1365,7 @@ _equalIndexStmt(const IndexStmt *a, const IndexStmt *b) COMPARE_SCALAR_FIELD(transformed); COMPARE_SCALAR_FIELD(concurrent); COMPARE_SCALAR_FIELD(if_not_exists); + COMPARE_SCALAR_FIELD(reset_default_tblspc); return true; } @@ -2644,6 +2645,7 @@ _equalConstraint(const Constraint *a, const Constraint *b) COMPARE_NODE_FIELD(options); COMPARE_STRING_FIELD(indexname); COMPARE_STRING_FIELD(indexspace); + COMPARE_SCALAR_FIELD(reset_default_tblspc); COMPARE_STRING_FIELD(access_method); COMPARE_NODE_FIELD(where_clause); COMPARE_NODE_FIELD(pktable); diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c index 24ca2109..092a7dd5 100644 --- a/src/backend/nodes/outfuncs.c +++ b/src/backend/nodes/outfuncs.c @@ -4027,6 +4027,7 @@ _outIndexStmt(StringInfo str, const IndexStmt *node) WRITE_BOOL_FIELD(transformed); WRITE_BOOL_FIELD(concurrent); WRITE_BOOL_FIELD(if_not_exists); + WRITE_BOOL_FIELD(reset_default_tblspc); } static void @@ -4898,6 +4899,7 @@ _outConstraint(StringInfo str, const Constraint *node) WRITE_NODE_FIELD(options); WRITE_STRING_FIELD(indexname); WRITE_STRING_FIELD(indexspace); + WRITE_BOOL_FIELD(reset_default_tblspc); /* access_method and where_clause not currently used */ break; @@ -4907,6 +4909,7 @@ _outConstraint(StringInfo str, const Constraint *node) WRITE_NODE_FIELD(options); WRITE_STRING_FIELD(indexname); WRITE_STRING_FIELD(indexspace); + WRITE_BOOL_FIELD(reset_default_tblspc); /* access_method and where_clause not currently used */ break; @@ -4916,6 +4919,7 @@ _outConstraint(StringInfo str, const Constraint *node) WRITE_NODE_FIELD(options); WRITE_STRING_FIELD(indexname); WRITE_STRING_FIELD(indexspace); + WRITE_BOOL_FIELD(reset_default_tblspc); WRITE_STRING_FIELD(access_method); WRITE_NODE_FIELD(where_clause); break; diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y index a3eb7514..f8e17e4e 100644 --- a/src/backend/parser/gram.y +++ b/src/backend/parser/gram.y @@ -7758,6 +7758,7 @@ IndexStmt: CREATE opt_unique INDEX opt_concurrently opt_index_name n->initdeferred = false; n->transformed = false; n->if_not_exists = false; + n->reset_default_tblspc = false; $$ = (Node *)n; } | CREATE opt_unique INDEX opt_concurrently IF_P NOT EXISTS index_name @@ -7784,6 +7785,7 @@ IndexStmt: CREATE opt_unique INDEX opt_concurrently opt_index_name n->initdeferred = false; n->transformed = false; n->if_not_exists = true; + 
n->reset_default_tblspc = false; $$ = (Node *)n; } ; diff --git a/src/backend/parser/parse_utilcmd.c b/src/backend/parser/parse_utilcmd.c index a5c17b8b..af249f71 100644 --- a/src/backend/parser/parse_utilcmd.c +++ b/src/backend/parser/parse_utilcmd.c @@ -1795,6 +1795,7 @@ generateClonedIndexStmt(CreateStmtContext *cxt, Relation source_idx, index->transformed = true; /* don't need transformIndexStmt */ index->concurrent = false; index->if_not_exists = false; + index->reset_default_tblspc = false; /* * We don't try to preserve the name of the source index; instead, just @@ -2268,6 +2269,7 @@ transformIndexConstraint(Constraint *constraint, CreateStmtContext *cxt) index->transformed = false; index->concurrent = false; index->if_not_exists = false; + index->reset_default_tblspc = constraint->reset_default_tblspc; /* * If it's ALTER TABLE ADD CONSTRAINT USING INDEX, look up the index and diff --git a/src/backend/utils/adt/ruleutils.c b/src/backend/utils/adt/ruleutils.c index 984ace45..9acf184a 100644 --- a/src/backend/utils/adt/ruleutils.c +++ b/src/backend/utils/adt/ruleutils.c @@ -1472,13 +1472,14 @@ pg_get_indexdef_worker(Oid indexrelid, int colno, Oid tblspc; tblspc = get_rel_tablespace(indexrelid); - if (!OidIsValid(tblspc)) - tblspc = MyDatabaseTableSpace; + if (OidIsValid(tblspc)) + { if (isConstraint) appendStringInfoString(&buf, " USING INDEX"); appendStringInfo(&buf, " TABLESPACE %s", quote_identifier(get_tablespace_name(tblspc))); } + } /* * If it's a partial index, decompile and append the predicate @@ -2126,6 +2127,12 @@ pg_get_constraintdef_worker(Oid constraintId, bool fullCommand, pfree(options); } + /* + * Print the tablespace, unless it's the database default. + * This is to help ALTER TABLE usage of this facility, + * which needs this behavior to recreate exact catalog + * state. + */ tblspc = get_rel_tablespace(indexId); if (OidIsValid(tblspc)) appendStringInfo(&buf, " USING INDEX TABLESPACE %s", diff --git a/src/include/commands/tablespace.h b/src/include/commands/tablespace.h index 9a933cca..32805ab4 100644 --- a/src/include/commands/tablespace.h +++ b/src/include/commands/tablespace.h @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * * tablespace.h - * Tablespace management commands (create/drop tablespace). + * Tablespace management commands (create/drop tablespace). * * * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group @@ -20,40 +20,40 @@ #include "nodes/parsenodes.h" /* XLOG stuff */ -#define XLOG_TBLSPC_CREATE 0x00 -#define XLOG_TBLSPC_DROP 0x10 +#define XLOG_TBLSPC_CREATE 0x00 +#define XLOG_TBLSPC_DROP 0x10 typedef struct xl_tblspc_create_rec { - Oid ts_id; - char ts_path[FLEXIBLE_ARRAY_MEMBER]; /* null-terminated string */ + Oid ts_id; + char ts_path[FLEXIBLE_ARRAY_MEMBER]; /* null-terminated string */ } xl_tblspc_create_rec; typedef struct xl_tblspc_drop_rec { - Oid ts_id; + Oid ts_id; } xl_tblspc_drop_rec; typedef struct TableSpaceOpts { - int32 vl_len_; /* varlena header (do not touch directly!) */ - float8 random_page_cost; - float8 seq_page_cost; - int effective_io_concurrency; + int32 vl_len_; /* varlena header (do not touch directly!) 
*/ + float8 random_page_cost; + float8 seq_page_cost; + int effective_io_concurrency; } TableSpaceOpts; -extern Oid CreateTableSpace(CreateTableSpaceStmt *stmt); +extern Oid CreateTableSpace(CreateTableSpaceStmt *stmt); extern void DropTableSpace(DropTableSpaceStmt *stmt); extern ObjectAddress RenameTableSpace(const char *oldname, const char *newname); -extern Oid AlterTableSpaceOptions(AlterTableSpaceOptionsStmt *stmt); +extern Oid AlterTableSpaceOptions(AlterTableSpaceOptionsStmt *stmt); extern void TablespaceCreateDbspace(Oid spcNode, Oid dbNode, bool isRedo); -extern Oid GetDefaultTablespace(char relpersistence); +extern Oid GetDefaultTablespace(char relpersistence, bool partitioned); extern void PrepareTempTablespaces(void); -extern Oid get_tablespace_oid(const char *tablespacename, bool missing_ok); +extern Oid get_tablespace_oid(const char *tablespacename, bool missing_ok); extern char *get_tablespace_name(Oid spc_oid); extern bool directory_is_empty(const char *path); @@ -63,4 +63,4 @@ extern void tblspc_redo(XLogReaderState *rptr); extern void tblspc_desc(StringInfo buf, XLogReaderState *rptr); extern const char *tblspc_identify(uint8 info); -#endif /* TABLESPACE_H */ +#endif /* TABLESPACE_H */ diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h index 983a1ab0..acc64eb0 100644 --- a/src/include/nodes/parsenodes.h +++ b/src/include/nodes/parsenodes.h @@ -2252,6 +2252,8 @@ typedef struct Constraint List *options; /* options from WITH clause */ char *indexname; /* existing index to use; otherwise NULL */ char *indexspace; /* index tablespace; NULL for default */ + bool reset_default_tblspc; /* reset default_tablespace prior to + * creating the index */ /* These could be, but currently are not, used for UNIQUE/PKEY: */ char *access_method; /* index access method; NULL for default */ Node *where_clause; /* partial index predicate */ @@ -2881,6 +2883,8 @@ typedef struct IndexStmt bool transformed; /* true when transformIndexStmt is finished */ bool concurrent; /* should this be a concurrent index build? */ bool if_not_exists; /* just do nothing if index already exists? */ + bool reset_default_tblspc; /* reset default_tablespace prior to + * executing */ #ifdef __TBASE__ /* used for interval partition */ Oid parentIndexOid; diff --git a/src/test/regress/input/tablespace.source b/src/test/regress/input/tablespace.source index 4bf5302d..5323f07e 100644 --- a/src/test/regress/input/tablespace.source +++ b/src/test/regress/input/tablespace.source @@ -65,24 +65,45 @@ SELECT relname, spcname FROM pg_catalog.pg_tablespace t, pg_catalog.pg_class c where c.reltablespace = t.oid AND c.relname LIKE 'part%_idx'; \d testschema.part_a_idx +-- partitioned rels cannot specify the default tablespace. 
These fail: +CREATE TABLE testschema.dflt (a int PRIMARY KEY) PARTITION BY LIST (a) TABLESPACE pg_default; +CREATE TABLE testschema.dflt (a int PRIMARY KEY USING INDEX TABLESPACE pg_default) PARTITION BY LIST (a); +SET default_tablespace TO 'pg_default'; +CREATE TABLE testschema.dflt (a int PRIMARY KEY) PARTITION BY LIST (a) TABLESPACE regress_tblspace; +CREATE TABLE testschema.dflt (a int PRIMARY KEY USING INDEX TABLESPACE regress_tblspace) PARTITION BY LIST (a); +-- but these work: +CREATE TABLE testschema.dflt (a int PRIMARY KEY USING INDEX TABLESPACE regress_tblspace) PARTITION BY LIST (a) TABLESPACE regress_tblspace; +SET default_tablespace TO ''; +CREATE TABLE testschema.dflt2 (a int PRIMARY KEY) PARTITION BY LIST (a); +DROP TABLE testschema.dflt, testschema.dflt2; + -- check that default_tablespace doesn't affect ALTER TABLE index rebuilds CREATE TABLE testschema.test_default_tab(id bigint) TABLESPACE regress_tblspace; INSERT INTO testschema.test_default_tab VALUES (1); CREATE INDEX test_index1 on testschema.test_default_tab (id); CREATE INDEX test_index2 on testschema.test_default_tab (id) TABLESPACE regress_tblspace; +ALTER TABLE testschema.test_default_tab ADD CONSTRAINT test_index3 PRIMARY KEY (id); +ALTER TABLE testschema.test_default_tab ADD CONSTRAINT test_index4 UNIQUE (id) USING INDEX TABLESPACE regress_tblspace; + \d testschema.test_index1 \d testschema.test_index2 +\d testschema.test_index3 +\d testschema.test_index4 -- use a custom tablespace for default_tablespace SET default_tablespace TO regress_tblspace; -- tablespace should not change if no rewrite ALTER TABLE testschema.test_default_tab ALTER id TYPE bigint; \d testschema.test_index1 \d testschema.test_index2 +\d testschema.test_index3 +\d testschema.test_index4 SELECT * FROM testschema.test_default_tab; -- tablespace should not change even if there is an index rewrite ALTER TABLE testschema.test_default_tab ALTER id TYPE int; \d testschema.test_index1 \d testschema.test_index2 +\d testschema.test_index3 +\d testschema.test_index4 SELECT * FROM testschema.test_default_tab; -- now use the default tablespace for default_tablespace SET default_tablespace TO ''; @@ -90,12 +111,64 @@ SET default_tablespace TO ''; ALTER TABLE testschema.test_default_tab ALTER id TYPE int; \d testschema.test_index1 \d testschema.test_index2 +\d testschema.test_index3 +\d testschema.test_index4 -- tablespace should not change even if there is an index rewrite ALTER TABLE testschema.test_default_tab ALTER id TYPE bigint; \d testschema.test_index1 \d testschema.test_index2 +\d testschema.test_index3 +\d testschema.test_index4 DROP TABLE testschema.test_default_tab; +-- check that default_tablespace doesn't affect ALTER TABLE index rebuilds +-- (this time with a partitioned table) +CREATE TABLE testschema.test_default_tab_p(id bigint, val bigint) + PARTITION BY LIST (id) TABLESPACE regress_tblspace; +CREATE TABLE testschema.test_default_tab_p1 PARTITION OF testschema.test_default_tab_p + FOR VALUES IN (1); +INSERT INTO testschema.test_default_tab_p VALUES (1); +CREATE INDEX test_index1 on testschema.test_default_tab_p (val); +CREATE INDEX test_index2 on testschema.test_default_tab_p (val) TABLESPACE regress_tblspace; +ALTER TABLE testschema.test_default_tab_p ADD CONSTRAINT test_index3 PRIMARY KEY (id); +ALTER TABLE testschema.test_default_tab_p ADD CONSTRAINT test_index4 UNIQUE (id) USING INDEX TABLESPACE regress_tblspace; + +\d testschema.test_index1 +\d testschema.test_index2 +\d testschema.test_index3 +\d testschema.test_index4 +-- use a 
custom tablespace for default_tablespace +SET default_tablespace TO regress_tblspace; +-- tablespace should not change if no rewrite +ALTER TABLE testschema.test_default_tab_p ALTER val TYPE bigint; +\d testschema.test_index1 +\d testschema.test_index2 +\d testschema.test_index3 +\d testschema.test_index4 +SELECT * FROM testschema.test_default_tab_p; +-- tablespace should not change even if there is an index rewrite +ALTER TABLE testschema.test_default_tab_p ALTER val TYPE int; +\d testschema.test_index1 +\d testschema.test_index2 +\d testschema.test_index3 +\d testschema.test_index4 +SELECT * FROM testschema.test_default_tab_p; +-- now use the default tablespace for default_tablespace +SET default_tablespace TO ''; +-- tablespace should not change if no rewrite +ALTER TABLE testschema.test_default_tab_p ALTER val TYPE int; +\d testschema.test_index1 +\d testschema.test_index2 +\d testschema.test_index3 +\d testschema.test_index4 +-- tablespace should not change even if there is an index rewrite +ALTER TABLE testschema.test_default_tab_p ALTER val TYPE bigint; +\d testschema.test_index1 +\d testschema.test_index2 +\d testschema.test_index3 +\d testschema.test_index4 +DROP TABLE testschema.test_default_tab_p; + -- check that default_tablespace affects index additions in ALTER TABLE CREATE TABLE testschema.test_tab(id int) TABLESPACE regress_tblspace; INSERT INTO testschema.test_tab VALUES (1); @@ -108,6 +181,23 @@ ALTER TABLE testschema.test_tab ADD CONSTRAINT test_tab_pkey PRIMARY KEY (id); SELECT * FROM testschema.test_tab; DROP TABLE testschema.test_tab; +-- check that default_tablespace is handled correctly by multi-command +-- ALTER TABLE that includes a tablespace-preserving rewrite +CREATE TABLE testschema.test_tab(a int, b int, c int); +SET default_tablespace TO regress_tblspace; +ALTER TABLE testschema.test_tab ADD CONSTRAINT test_tab_unique UNIQUE (a); +CREATE INDEX test_tab_a_idx ON testschema.test_tab (a); +SET default_tablespace TO ''; +CREATE INDEX test_tab_b_idx ON testschema.test_tab (b); +\d testschema.test_tab_unique +\d testschema.test_tab_a_idx +\d testschema.test_tab_b_idx +ALTER TABLE testschema.test_tab ALTER b TYPE bigint, ADD UNIQUE (c); +\d testschema.test_tab_unique +\d testschema.test_tab_a_idx +\d testschema.test_tab_b_idx +DROP TABLE testschema.test_tab; + -- let's try moving a table from one place to another CREATE TABLE testschema.atable AS VALUES (1), (2); CREATE UNIQUE INDEX anindex ON testschema.atable(column1); diff --git a/src/test/regress/output/tablespace.source b/src/test/regress/output/tablespace.source index 6688ae7e..21e8baff 100644 --- a/src/test/regress/output/tablespace.source +++ b/src/test/regress/output/tablespace.source @@ -102,11 +102,28 @@ Partitioned index "testschema.part_a_idx" btree, for table "testschema.part" Tablespace: "regress_tblspace" +-- partitioned rels cannot specify the default tablespace. 
These fail: +CREATE TABLE testschema.dflt (a int PRIMARY KEY) PARTITION BY LIST (a) TABLESPACE pg_default; +ERROR: cannot specify default tablespace for partitioned relations +CREATE TABLE testschema.dflt (a int PRIMARY KEY USING INDEX TABLESPACE pg_default) PARTITION BY LIST (a); +ERROR: cannot specify default tablespace for partitioned relation +SET default_tablespace TO 'pg_default'; +CREATE TABLE testschema.dflt (a int PRIMARY KEY) PARTITION BY LIST (a) TABLESPACE regress_tblspace; +ERROR: cannot specify default tablespace for partitioned relations +CREATE TABLE testschema.dflt (a int PRIMARY KEY USING INDEX TABLESPACE regress_tblspace) PARTITION BY LIST (a); +ERROR: cannot specify default tablespace for partitioned relations +-- but these work: +CREATE TABLE testschema.dflt (a int PRIMARY KEY USING INDEX TABLESPACE regress_tblspace) PARTITION BY LIST (a) TABLESPACE regress_tblspace; +SET default_tablespace TO ''; +CREATE TABLE testschema.dflt2 (a int PRIMARY KEY) PARTITION BY LIST (a); +DROP TABLE testschema.dflt, testschema.dflt2; -- check that default_tablespace doesn't affect ALTER TABLE index rebuilds CREATE TABLE testschema.test_default_tab(id bigint) TABLESPACE regress_tblspace; INSERT INTO testschema.test_default_tab VALUES (1); CREATE INDEX test_index1 on testschema.test_default_tab (id); CREATE INDEX test_index2 on testschema.test_default_tab (id) TABLESPACE regress_tblspace; +ALTER TABLE testschema.test_default_tab ADD CONSTRAINT test_index3 PRIMARY KEY (id); +ALTER TABLE testschema.test_default_tab ADD CONSTRAINT test_index4 UNIQUE (id) USING INDEX TABLESPACE regress_tblspace; \d testschema.test_index1 Index "testschema.test_index1" Column | Type | Definition @@ -122,6 +139,21 @@ Index "testschema.test_index2" btree, for table "testschema.test_default_tab" Tablespace: "regress_tblspace" +\d testschema.test_index3 + Index "testschema.test_index3" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +primary key, btree, for table "testschema.test_default_tab" + +\d testschema.test_index4 + Index "testschema.test_index4" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +unique, btree, for table "testschema.test_default_tab" +Tablespace: "regress_tblspace" + -- use a custom tablespace for default_tablespace SET default_tablespace TO regress_tblspace; -- tablespace should not change if no rewrite @@ -141,6 +173,21 @@ Index "testschema.test_index2" btree, for table "testschema.test_default_tab" Tablespace: "regress_tblspace" +\d testschema.test_index3 + Index "testschema.test_index3" + Column | Type | Key? | Definition +--------+---------+------+------------ + id | integer | yes | id +primary key, btree, for table "testschema.test_default_tab" + +\d testschema.test_index4 + Index "testschema.test_index4" + Column | Type | Key? | Definition +--------+---------+------+------------ + id | integer | yes | id +unique, btree, for table "testschema.test_default_tab" +Tablespace: "regress_tblspace" + SELECT * FROM testschema.test_default_tab; id ---- @@ -164,6 +211,21 @@ Index "testschema.test_index2" btree, for table "testschema.test_default_tab" Tablespace: "regress_tblspace" +\d testschema.test_index3 + Index "testschema.test_index3" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +primary key, btree, for table "testschema.test_default_tab" + +\d testschema.test_index4 + Index "testschema.test_index4" + Column | Type | Key? 
| Definition +--------+--------+------+------------ + id | bigint | yes | id +unique, btree, for table "testschema.test_default_tab" +Tablespace: "regress_tblspace" + SELECT * FROM testschema.test_default_tab; id ---- @@ -189,6 +251,21 @@ Index "testschema.test_index2" btree, for table "testschema.test_default_tab" Tablespace: "regress_tblspace" +\d testschema.test_index3 + Index "testschema.test_index3" + Column | Type | Key? | Definition +--------+---------+------+------------ + id | integer | yes | id +primary key, btree, for table "testschema.test_default_tab" + +\d testschema.test_index4 + Index "testschema.test_index4" + Column | Type | Key? | Definition +--------+---------+------+------------ + id | integer | yes | id +unique, btree, for table "testschema.test_default_tab" +Tablespace: "regress_tblspace" + -- tablespace should not change even if there is an index rewrite ALTER TABLE testschema.test_default_tab ALTER id TYPE bigint; \d testschema.test_index1 @@ -206,7 +283,208 @@ Index "testschema.test_index2" btree, for table "testschema.test_default_tab" Tablespace: "regress_tblspace" +\d testschema.test_index3 + Index "testschema.test_index3" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +primary key, btree, for table "testschema.test_default_tab" + +\d testschema.test_index4 + Index "testschema.test_index4" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +unique, btree, for table "testschema.test_default_tab" +Tablespace: "regress_tblspace" + DROP TABLE testschema.test_default_tab; +-- check that default_tablespace doesn't affect ALTER TABLE index rebuilds +-- (this time with a partitioned table) +CREATE TABLE testschema.test_default_tab_p(id bigint, val bigint) + PARTITION BY LIST (id) TABLESPACE regress_tblspace; +CREATE TABLE testschema.test_default_tab_p1 PARTITION OF testschema.test_default_tab_p + FOR VALUES IN (1); +INSERT INTO testschema.test_default_tab_p VALUES (1); +CREATE INDEX test_index1 on testschema.test_default_tab_p (val); +CREATE INDEX test_index2 on testschema.test_default_tab_p (val) TABLESPACE regress_tblspace; +ALTER TABLE testschema.test_default_tab_p ADD CONSTRAINT test_index3 PRIMARY KEY (id); +ALTER TABLE testschema.test_default_tab_p ADD CONSTRAINT test_index4 UNIQUE (id) USING INDEX TABLESPACE regress_tblspace; +\d testschema.test_index1 +Partitioned index "testschema.test_index1" + Column | Type | Key? | Definition +--------+--------+------+------------ + val | bigint | yes | val +btree, for table "testschema.test_default_tab_p" + +\d testschema.test_index2 +Partitioned index "testschema.test_index2" + Column | Type | Key? | Definition +--------+--------+------+------------ + val | bigint | yes | val +btree, for table "testschema.test_default_tab_p" +Tablespace: "regress_tblspace" + +\d testschema.test_index3 +Partitioned index "testschema.test_index3" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +primary key, btree, for table "testschema.test_default_tab_p" + +\d testschema.test_index4 +Partitioned index "testschema.test_index4" + Column | Type | Key? 
| Definition +--------+--------+------+------------ + id | bigint | yes | id +unique, btree, for table "testschema.test_default_tab_p" +Tablespace: "regress_tblspace" + +-- use a custom tablespace for default_tablespace +SET default_tablespace TO regress_tblspace; +-- tablespace should not change if no rewrite +ALTER TABLE testschema.test_default_tab_p ALTER val TYPE bigint; +\d testschema.test_index1 +Partitioned index "testschema.test_index1" + Column | Type | Key? | Definition +--------+--------+------+------------ + val | bigint | yes | val +btree, for table "testschema.test_default_tab_p" + +\d testschema.test_index2 +Partitioned index "testschema.test_index2" + Column | Type | Key? | Definition +--------+--------+------+------------ + val | bigint | yes | val +btree, for table "testschema.test_default_tab_p" +Tablespace: "regress_tblspace" + +\d testschema.test_index3 +Partitioned index "testschema.test_index3" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +primary key, btree, for table "testschema.test_default_tab_p" + +\d testschema.test_index4 +Partitioned index "testschema.test_index4" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +unique, btree, for table "testschema.test_default_tab_p" +Tablespace: "regress_tblspace" + +SELECT * FROM testschema.test_default_tab_p; + id | val +----+----- + 1 | +(1 row) + +-- tablespace should not change even if there is an index rewrite +ALTER TABLE testschema.test_default_tab_p ALTER val TYPE int; +\d testschema.test_index1 +Partitioned index "testschema.test_index1" + Column | Type | Key? | Definition +--------+---------+------+------------ + val | integer | yes | val +btree, for table "testschema.test_default_tab_p" + +\d testschema.test_index2 +Partitioned index "testschema.test_index2" + Column | Type | Key? | Definition +--------+---------+------+------------ + val | integer | yes | val +btree, for table "testschema.test_default_tab_p" +Tablespace: "regress_tblspace" + +\d testschema.test_index3 +Partitioned index "testschema.test_index3" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +primary key, btree, for table "testschema.test_default_tab_p" + +\d testschema.test_index4 +Partitioned index "testschema.test_index4" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +unique, btree, for table "testschema.test_default_tab_p" +Tablespace: "regress_tblspace" + +SELECT * FROM testschema.test_default_tab_p; + id | val +----+----- + 1 | +(1 row) + +-- now use the default tablespace for default_tablespace +SET default_tablespace TO ''; +-- tablespace should not change if no rewrite +ALTER TABLE testschema.test_default_tab_p ALTER val TYPE int; +\d testschema.test_index1 +Partitioned index "testschema.test_index1" + Column | Type | Key? | Definition +--------+---------+------+------------ + val | integer | yes | val +btree, for table "testschema.test_default_tab_p" + +\d testschema.test_index2 +Partitioned index "testschema.test_index2" + Column | Type | Key? | Definition +--------+---------+------+------------ + val | integer | yes | val +btree, for table "testschema.test_default_tab_p" +Tablespace: "regress_tblspace" + +\d testschema.test_index3 +Partitioned index "testschema.test_index3" + Column | Type | Key? 
| Definition +--------+--------+------+------------ + id | bigint | yes | id +primary key, btree, for table "testschema.test_default_tab_p" + +\d testschema.test_index4 +Partitioned index "testschema.test_index4" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +unique, btree, for table "testschema.test_default_tab_p" +Tablespace: "regress_tblspace" + +-- tablespace should not change even if there is an index rewrite +ALTER TABLE testschema.test_default_tab_p ALTER val TYPE bigint; +\d testschema.test_index1 +Partitioned index "testschema.test_index1" + Column | Type | Key? | Definition +--------+--------+------+------------ + val | bigint | yes | val +btree, for table "testschema.test_default_tab_p" + +\d testschema.test_index2 +Partitioned index "testschema.test_index2" + Column | Type | Key? | Definition +--------+--------+------+------------ + val | bigint | yes | val +btree, for table "testschema.test_default_tab_p" +Tablespace: "regress_tblspace" + +\d testschema.test_index3 +Partitioned index "testschema.test_index3" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +primary key, btree, for table "testschema.test_default_tab_p" + +\d testschema.test_index4 +Partitioned index "testschema.test_index4" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +unique, btree, for table "testschema.test_default_tab_p" +Tablespace: "regress_tblspace" + +DROP TABLE testschema.test_default_tab_p; -- check that default_tablespace affects index additions in ALTER TABLE CREATE TABLE testschema.test_tab(id int) TABLESPACE regress_tblspace; INSERT INTO testschema.test_tab VALUES (1); @@ -235,6 +513,62 @@ SELECT * FROM testschema.test_tab; 1 (1 row) +DROP TABLE testschema.test_tab; +-- check that default_tablespace is handled correctly by multi-command +-- ALTER TABLE that includes a tablespace-preserving rewrite +CREATE TABLE testschema.test_tab(a int, b int, c int); +SET default_tablespace TO regress_tblspace; +ALTER TABLE testschema.test_tab ADD CONSTRAINT test_tab_unique UNIQUE (a); +CREATE INDEX test_tab_a_idx ON testschema.test_tab (a); +SET default_tablespace TO ''; +CREATE INDEX test_tab_b_idx ON testschema.test_tab (b); +\d testschema.test_tab_unique + Index "testschema.test_tab_unique" + Column | Type | Key? | Definition +--------+---------+------+------------ + a | integer | yes | a +unique, btree, for table "testschema.test_tab" +Tablespace: "regress_tblspace" + +\d testschema.test_tab_a_idx + Index "testschema.test_tab_a_idx" + Column | Type | Key? | Definition +--------+---------+------+------------ + a | integer | yes | a +btree, for table "testschema.test_tab" +Tablespace: "regress_tblspace" + +\d testschema.test_tab_b_idx + Index "testschema.test_tab_b_idx" + Column | Type | Key? | Definition +--------+---------+------+------------ + b | integer | yes | b +btree, for table "testschema.test_tab" + +ALTER TABLE testschema.test_tab ALTER b TYPE bigint, ADD UNIQUE (c); +\d testschema.test_tab_unique + Index "testschema.test_tab_unique" + Column | Type | Key? | Definition +--------+---------+------+------------ + a | integer | yes | a +unique, btree, for table "testschema.test_tab" +Tablespace: "regress_tblspace" + +\d testschema.test_tab_a_idx + Index "testschema.test_tab_a_idx" + Column | Type | Key? 
| Definition +--------+---------+------+------------ + a | integer | yes | a +btree, for table "testschema.test_tab" +Tablespace: "regress_tblspace" + +\d testschema.test_tab_b_idx + Index "testschema.test_tab_b_idx" + Column | Type | Key? | Definition +--------+--------+------+------------ + b | bigint | yes | b +btree, for table "testschema.test_tab" + DROP TABLE testschema.test_tab; -- let's try moving a table from one place to another CREATE TABLE testschema.atable AS VALUES (1), (2); From 577e6c7d3b530dbc9dee23db132f5aeba33da48b Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Thu, 2 Jul 2020 10:31:37 +0800 Subject: [PATCH 284/578] Fix bogus logic for combining range-partitioned columns during pruning. http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/backend/partitioning/partprune.c | 46 ++++--------------- src/test/regress/expected/partition_prune.out | 27 +++++++++++ src/test/regress/sql/partition_prune.sql | 19 ++++++++ 3 files changed, 55 insertions(+), 37 deletions(-) diff --git a/src/backend/partitioning/partprune.c b/src/backend/partitioning/partprune.c index 242267f2..38c1e8ea 100644 --- a/src/backend/partitioning/partprune.c +++ b/src/backend/partitioning/partprune.c @@ -806,9 +806,6 @@ gen_prune_steps_from_opexps(PartitionScheme part_scheme, List *opsteps = NIL; List *btree_clauses[BTMaxStrategyNumber + 1], *hash_clauses[HTMaxStrategyNumber + 1]; - bool need_next_less, - need_next_eq, - need_next_greater; int i; memset(btree_clauses, 0, sizeof(btree_clauses)); @@ -819,9 +816,8 @@ gen_prune_steps_from_opexps(PartitionScheme part_scheme, bool consider_next_key = true; /* - * To be useful for pruning, we must have clauses for a prefix of - * partition keys in the case of range partitioning. So, ignore - * clauses for keys after this one. + * For range partitioning, if we have no clauses for the current key, + * we can't consider any later keys either, so we can stop here. */ if (part_scheme->strategy == PARTITION_STRATEGY_RANGE && clauselist == NIL) @@ -836,7 +832,6 @@ gen_prune_steps_from_opexps(PartitionScheme part_scheme, clauselist == NIL && !bms_is_member(i, nullkeys)) return NULL; - need_next_eq = need_next_less = need_next_greater = true; foreach(lc, clauselist) { PartClauseInfo *pc = (PartClauseInfo *) lfirst(lc); @@ -858,7 +853,6 @@ gen_prune_steps_from_opexps(PartitionScheme part_scheme, case PARTITION_STRATEGY_RANGE: { PartClauseInfo *last = NULL; - bool inclusive = false; /* * Add this clause to the list of clauses to be used @@ -876,35 +870,13 @@ gen_prune_steps_from_opexps(PartitionScheme part_scheme, lappend(btree_clauses[pc->op_strategy], pc); /* - * We may not need the next clause if they're of - * certain strategy. + * We can't consider subsequent partition keys if the + * clause for the current key contains a non-inclusive + * operator. */ - switch (pc->op_strategy) - { - case BTLessEqualStrategyNumber: - inclusive = true; - /* fall through */ - case BTLessStrategyNumber: - if (!inclusive) - need_next_eq = need_next_less = false; - break; - case BTEqualStrategyNumber: - /* always accept clauses for the next key. */ - break; - case BTGreaterEqualStrategyNumber: - inclusive = true; - /* fall through */ - case BTGreaterStrategyNumber: - if (!inclusive) - need_next_eq = need_next_greater = false; - break; - } - - /* We may want to change our mind. 
*/ - if (consider_next_key) - consider_next_key = (need_next_eq || - need_next_less || - need_next_greater); + if (pc->op_strategy == BTLessStrategyNumber || + pc->op_strategy == BTGreaterStrategyNumber) + consider_next_key = false; break; } @@ -2340,7 +2312,7 @@ get_matching_range_bounds(PartitionPruneContext *context, /* * Look for the greatest bound that is < or <= lookup value and - * set minoff to its offset. + * set maxoff to its offset. */ off = partition_range_datum_bsearch(partsupfunc, partcollation, diff --git a/src/test/regress/expected/partition_prune.out b/src/test/regress/expected/partition_prune.out index 94bceb8d..95a64972 100644 --- a/src/test/regress/expected/partition_prune.out +++ b/src/test/regress/expected/partition_prune.out @@ -1461,3 +1461,30 @@ explain (costs off) select * from rparted_by_int2 where a > 100000000000000; (3 rows) drop table lp, coll_pruning, rlp, mc3p, mc2p, boolpart, rp, coll_pruning_multi, like_op_noprune, lparted_by_int2, rparted_by_int2; +-- +-- Check that pruning with composite range partitioning works correctly when +-- it must ignore clauses for trailing keys once it has seen a clause with +-- non-inclusive operator for an earlier key +-- +create table mc3p (a int, b int, c int) partition by range (a, abs(b), c); +create table mc3p0 partition of mc3p + for values from (0, 0, 0) to (0, maxvalue, maxvalue); +create table mc3p1 partition of mc3p + for values from (1, 1, 1) to (2, minvalue, minvalue); +create table mc3p2 partition of mc3p + for values from (2, minvalue, minvalue) to (3, maxvalue, maxvalue); +insert into mc3p values (0, 1, 1), (1, 1, 1), (2, 1, 1); +explain (analyze, costs off, summary off, timing off) +select * from mc3p where a < 3 and abs(b) = 1; + QUERY PLAN +------------------------------------------------- + Append (actual rows=3 loops=1) + -> Seq Scan on mc3p0 (actual rows=1 loops=1) + Filter: ((a < 3) AND (abs(b) = 1)) + -> Seq Scan on mc3p1 (actual rows=1 loops=1) + Filter: ((a < 3) AND (abs(b) = 1)) + -> Seq Scan on mc3p2 (actual rows=1 loops=1) + Filter: ((a < 3) AND (abs(b) = 1)) +(7 rows) + +drop table mc3p; diff --git a/src/test/regress/sql/partition_prune.sql b/src/test/regress/sql/partition_prune.sql index 4862cdfd..4b5acbe1 100644 --- a/src/test/regress/sql/partition_prune.sql +++ b/src/test/regress/sql/partition_prune.sql @@ -265,3 +265,22 @@ create table rparted_by_int2_maxvalue partition of rparted_by_int2 for values fr explain (costs off) select * from rparted_by_int2 where a > 100000000000000; drop table lp, coll_pruning, rlp, mc3p, mc2p, boolpart, rp, coll_pruning_multi, like_op_noprune, lparted_by_int2, rparted_by_int2; + +-- +-- Check that pruning with composite range partitioning works correctly when +-- it must ignore clauses for trailing keys once it has seen a clause with +-- non-inclusive operator for an earlier key +-- +create table mc3p (a int, b int, c int) partition by range (a, abs(b), c); +create table mc3p0 partition of mc3p + for values from (0, 0, 0) to (0, maxvalue, maxvalue); +create table mc3p1 partition of mc3p + for values from (1, 1, 1) to (2, minvalue, minvalue); +create table mc3p2 partition of mc3p + for values from (2, minvalue, minvalue) to (3, maxvalue, maxvalue); +insert into mc3p values (0, 1, 1), (1, 1, 1), (2, 1, 1); + +explain (analyze, costs off, summary off, timing off) +select * from mc3p where a < 3 and abs(b) = 1; + +drop table mc3p; From f756f4e13415ba36120130930d38466f05512010 Mon Sep 17 00:00:00 2001 From: Alvaro Herrera Date: Fri, 7 Jun 2019 00:44:17 -0400 Subject: 
[PATCH 285/578] Fix default_tablespace usage for partitioned tables MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In commit 87259588d0ab I (Álvaro) tried to rationalize the determination of tablespace to use for partitioned tables, but failed to handle the default_tablespace case. Repair and add proper tests. Author: Amit Langote, Rushabh Lathia Reported-by: Rushabh Lathia Reviewed-by: Amit Langote, Álvaro Herrera Discussion: https://postgr.es/m/CAGPqQf0cYjm1=rjxk_6gU0SjUS70=yFUAdCJLwWzh9bhNJnyVg@mail.gmail.com --- src/backend/commands/tablecmds.c | 8 +++- src/test/regress/input/tablespace.source | 34 ++++++++++++--- src/test/regress/output/tablespace.source | 50 ++++++++++++++++++----- 3 files changed, 73 insertions(+), 19 deletions(-) diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index 8f8a76d8..393d30c4 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -714,8 +714,8 @@ DefineRelation(CreateStmt *stmt, char relkind, Oid ownerId, } /* - * Select tablespace to use. If not specified, use default tablespace - * (which may in turn default to database's default). + * Select tablespace to use: an explicitly indicated one, or (in the case + * of a partitioned table) the parent's, if it has one. */ if (stmt->tablespacename) { @@ -736,6 +736,10 @@ DefineRelation(CreateStmt *stmt, char relkind, Oid ownerId, tablespaceId = get_rel_tablespace(linitial_oid(inheritOids)); } else + tablespaceId = InvalidOid; + + /* still nothing? use the default */ + if (!OidIsValid(tablespaceId)) tablespaceId = GetDefaultTablespace(stmt->relation->relpersistence, partitioned); diff --git a/src/test/regress/input/tablespace.source b/src/test/regress/input/tablespace.source index 5323f07e..d46f0e4c 100644 --- a/src/test/regress/input/tablespace.source +++ b/src/test/regress/input/tablespace.source @@ -44,16 +44,38 @@ CREATE INDEX foo_idx on testschema.foo(i) TABLESPACE regress_tblspace; SELECT relname, spcname FROM pg_catalog.pg_tablespace t, pg_catalog.pg_class c where c.reltablespace = t.oid AND c.relname = 'foo_idx'; +-- -- partitioned table +-- CREATE TABLE testschema.part (a int) PARTITION BY LIST (a); -CREATE TABLE testschema.part12 PARTITION OF testschema.part FOR VALUES IN(1,2) PARTITION BY LIST (a) TABLESPACE regress_tblspace; -CREATE TABLE testschema.part12_1 PARTITION OF testschema.part12 FOR VALUES IN (1); -ALTER TABLE testschema.part12 SET TABLESPACE pg_default; -CREATE TABLE testschema.part12_2 PARTITION OF testschema.part12 FOR VALUES IN (2); --- Ensure part12_1 defaulted to regress_tblspace and part12_2 defaulted to pg_default. 
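The precedence this establishes is: an explicit TABLESPACE clause, else the parent partitioned table's tablespace, else default_tablespace, else the database default. A condensed sketch of that ordering (p, p1..p3 are hypothetical names; regress_tblspace as in the tests):

CREATE TABLE p (a int) PARTITION BY LIST (a) TABLESPACE regress_tblspace;
CREATE TABLE p1 PARTITION OF p FOR VALUES IN (1);  -- regress_tblspace, inherited from the parent
ALTER TABLE p SET TABLESPACE pg_default;           -- parent no longer pins a tablespace
SET default_tablespace TO regress_tblspace;
CREATE TABLE p2 PARTITION OF p FOR VALUES IN (2);  -- regress_tblspace, via default_tablespace
RESET default_tablespace;
CREATE TABLE p3 PARTITION OF p FOR VALUES IN (3);  -- database default tablespace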
+SET default_tablespace TO pg_global; +CREATE TABLE testschema.part_1 PARTITION OF testschema.part FOR VALUES IN (1); +RESET default_tablespace; +CREATE TABLE testschema.part_1 PARTITION OF testschema.part FOR VALUES IN (1); +SET default_tablespace TO regress_tblspace; +CREATE TABLE testschema.part_2 PARTITION OF testschema.part FOR VALUES IN (2); +SET default_tablespace TO pg_global; +CREATE TABLE testschema.part_3 PARTITION OF testschema.part FOR VALUES IN (3); +ALTER TABLE testschema.part SET TABLESPACE regress_tblspace; +CREATE TABLE testschema.part_3 PARTITION OF testschema.part FOR VALUES IN (3); +CREATE TABLE testschema.part_4 PARTITION OF testschema.part FOR VALUES IN (4) + TABLESPACE pg_default; +CREATE TABLE testschema.part_56 PARTITION OF testschema.part FOR VALUES IN (5, 6) + PARTITION BY LIST (a); +ALTER TABLE testschema.part SET TABLESPACE pg_default; +CREATE TABLE testschema.part_78 PARTITION OF testschema.part FOR VALUES IN (7, 8) + PARTITION BY LIST (a); +CREATE TABLE testschema.part_910 PARTITION OF testschema.part FOR VALUES IN (9, 10) + PARTITION BY LIST (a) TABLESPACE regress_tblspace; +RESET default_tablespace; +CREATE TABLE testschema.part_78 PARTITION OF testschema.part FOR VALUES IN (7, 8) + PARTITION BY LIST (a); + SELECT relname, spcname FROM pg_catalog.pg_class c + JOIN pg_catalog.pg_namespace n ON (c.relnamespace = n.oid) LEFT JOIN pg_catalog.pg_tablespace t ON c.reltablespace = t.oid - where c.relname LIKE 'part%' order by relname; + where c.relname LIKE 'part%' AND n.nspname = 'testschema' order by relname; +RESET default_tablespace; DROP TABLE testschema.part; -- partitioned index diff --git a/src/test/regress/output/tablespace.source b/src/test/regress/output/tablespace.source index 21e8baff..15c0d3e0 100644 --- a/src/test/regress/output/tablespace.source +++ b/src/test/regress/output/tablespace.source @@ -61,24 +61,52 @@ SELECT relname, spcname FROM pg_catalog.pg_tablespace t, pg_catalog.pg_class c foo_idx | regress_tblspace (1 row) +-- -- partitioned table +-- CREATE TABLE testschema.part (a int) PARTITION BY LIST (a); -CREATE TABLE testschema.part12 PARTITION OF testschema.part FOR VALUES IN(1,2) PARTITION BY LIST (a) TABLESPACE regress_tblspace; -CREATE TABLE testschema.part12_1 PARTITION OF testschema.part12 FOR VALUES IN (1); -ALTER TABLE testschema.part12 SET TABLESPACE pg_default; -CREATE TABLE testschema.part12_2 PARTITION OF testschema.part12 FOR VALUES IN (2); --- Ensure part12_1 defaulted to regress_tblspace and part12_2 defaulted to pg_default. 
+SET default_tablespace TO pg_global; +CREATE TABLE testschema.part_1 PARTITION OF testschema.part FOR VALUES IN (1); +ERROR: only shared relations can be placed in pg_global tablespace +RESET default_tablespace; +CREATE TABLE testschema.part_1 PARTITION OF testschema.part FOR VALUES IN (1); +SET default_tablespace TO regress_tblspace; +CREATE TABLE testschema.part_2 PARTITION OF testschema.part FOR VALUES IN (2); +SET default_tablespace TO pg_global; +CREATE TABLE testschema.part_3 PARTITION OF testschema.part FOR VALUES IN (3); +ERROR: only shared relations can be placed in pg_global tablespace +ALTER TABLE testschema.part SET TABLESPACE regress_tblspace; +CREATE TABLE testschema.part_3 PARTITION OF testschema.part FOR VALUES IN (3); +CREATE TABLE testschema.part_4 PARTITION OF testschema.part FOR VALUES IN (4) + TABLESPACE pg_default; +CREATE TABLE testschema.part_56 PARTITION OF testschema.part FOR VALUES IN (5, 6) + PARTITION BY LIST (a); +ALTER TABLE testschema.part SET TABLESPACE pg_default; +CREATE TABLE testschema.part_78 PARTITION OF testschema.part FOR VALUES IN (7, 8) + PARTITION BY LIST (a); +ERROR: only shared relations can be placed in pg_global tablespace +CREATE TABLE testschema.part_910 PARTITION OF testschema.part FOR VALUES IN (9, 10) + PARTITION BY LIST (a) TABLESPACE regress_tblspace; +RESET default_tablespace; +CREATE TABLE testschema.part_78 PARTITION OF testschema.part FOR VALUES IN (7, 8) + PARTITION BY LIST (a); SELECT relname, spcname FROM pg_catalog.pg_class c + JOIN pg_catalog.pg_namespace n ON (c.relnamespace = n.oid) LEFT JOIN pg_catalog.pg_tablespace t ON c.reltablespace = t.oid - where c.relname LIKE 'part%' order by relname; + where c.relname LIKE 'part%' AND n.nspname = 'testschema' order by relname; relname | spcname ----------+------------------ part | - part12 | - part12_1 | regress_tblspace - part12_2 | -(4 rows) - + part_1 | + part_2 | regress_tblspace + part_3 | regress_tblspace + part_4 | + part_56 | regress_tblspace + part_78 | + part_910 | regress_tblspace +(8 rows) + +RESET default_tablespace; DROP TABLE testschema.part; -- partitioned index CREATE TABLE testschema.part (a int) PARTITION BY LIST (a); From 015a1b4073770b195ec00941ce95d9467d5833e3 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Thu, 2 Jul 2020 11:04:21 +0800 Subject: [PATCH 286/578] Fix RANGE partition pruning with multiple boolean partition keys.http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/backend/partitioning/partprune.c | 50 ++++++++++++++----- src/test/regress/expected/partition_prune.out | 15 +++++- src/test/regress/sql/partition_prune.sql | 11 +++- 3 files changed, 61 insertions(+), 15 deletions(-) diff --git a/src/backend/partitioning/partprune.c b/src/backend/partitioning/partprune.c index 38c1e8ea..7adf41f3 100644 --- a/src/backend/partitioning/partprune.c +++ b/src/backend/partitioning/partprune.c @@ -140,8 +140,10 @@ static PruneStepResult *perform_pruning_base_step(PartitionPruneContext *context static PruneStepResult *perform_pruning_combine_step(PartitionPruneContext *context, PartitionPruneStepCombine *cstep, PruneStepResult **step_results); -static bool match_boolean_partition_clause(Oid partopfamily, Expr *clause, - Expr *partkey, Expr **outconst); +static PartClauseMatchStatus match_boolean_partition_clause(Oid partopfamily, + Expr *clause, + Expr *partkey, + Expr **outconst); static bool partkey_datum_from_expr(PartitionPruneContext *context, Expr *expr, Datum *value); @@ -1166,6 +1168,7 @@ 
match_clause_to_partition_key(RelOptInfo *rel, bool *clause_is_not_null, PartClauseInfo **pc, List **clause_steps) { + PartClauseMatchStatus boolmatchstatus; PartitionScheme part_scheme = rel->part_scheme; Expr *expr; Oid partopfamily = part_scheme->partopfamily[partkeyidx], @@ -1175,7 +1178,10 @@ match_clause_to_partition_key(RelOptInfo *rel, * Recognize specially shaped clauses that match with the Boolean * partition key. */ - if (match_boolean_partition_clause(partopfamily, clause, partkey, &expr)) + boolmatchstatus = match_boolean_partition_clause(partopfamily, clause, + partkey, &expr); + + if (boolmatchstatus == PARTCLAUSE_MATCH_CLAUSE) { PartClauseInfo *partclause; @@ -1529,7 +1535,21 @@ match_clause_to_partition_key(RelOptInfo *rel, return PARTCLAUSE_MATCH_NULLNESS; } - return PARTCLAUSE_UNSUPPORTED; + /* + * If we get here then the return value depends on the result of the + * match_boolean_partition_clause call above. If the call returned + * PARTCLAUSE_UNSUPPORTED then we're either not dealing with a bool qual + * or the bool qual is not suitable for pruning. Since the qual didn't + * match up to any of the other qual types supported here, then trying to + * match it against any other partition key is a waste of time, so just + * return PARTCLAUSE_UNSUPPORTED. If the qual just couldn't be matched to + * this partition key, then it may match another, so return + * PARTCLAUSE_NOMATCH. The only other value that + * match_boolean_partition_clause can return is PARTCLAUSE_MATCH_CLAUSE, + * and since that value was already dealt with above, then we can just + * return boolmatchstatus. + */ + return boolmatchstatus; } /* @@ -2670,11 +2690,15 @@ perform_pruning_combine_step(PartitionPruneContext *context, /* * match_boolean_partition_clause * - * Sets *outconst to a Const containing true or false value and returns true if - * we're able to match the clause to the partition key as specially-shaped - * Boolean clause. Returns false otherwise with *outconst set to NULL. + * If we're able to match the clause to the partition key as specially-shaped + * boolean clause, set *outconst to a Const containing a true or false value + * and return PARTCLAUSE_MATCH_CLAUSE. Returns PARTCLAUSE_UNSUPPORTED if the + * clause is not a boolean clause or if the boolean clause is unsuitable for + * partition pruning. Returns PARTCLAUSE_NOMATCH if it's a bool quals but + * just does not match this partition key. *outconst is set to NULL in the + * latter two cases. 
*/ -static bool +static PartClauseMatchStatus match_boolean_partition_clause(Oid partopfamily, Expr *clause, Expr *partkey, Expr **outconst) { @@ -2683,7 +2707,7 @@ match_boolean_partition_clause(Oid partopfamily, Expr *clause, Expr *partkey, *outconst = NULL; if (!IsBooleanOpfamily(partopfamily)) - return false; + return PARTCLAUSE_UNSUPPORTED; if (IsA(clause, BooleanTest)) { @@ -2692,7 +2716,7 @@ match_boolean_partition_clause(Oid partopfamily, Expr *clause, Expr *partkey, /* Only IS [NOT] TRUE/FALSE are any good to us */ if (btest->booltesttype == IS_UNKNOWN || btest->booltesttype == IS_NOT_UNKNOWN) - return false; + return PARTCLAUSE_UNSUPPORTED; leftop = btest->arg; if (IsA(leftop, RelabelType)) @@ -2705,7 +2729,7 @@ match_boolean_partition_clause(Oid partopfamily, Expr *clause, Expr *partkey, : (Expr *) makeBoolConst(false, false); if (*outconst) - return true; + return PARTCLAUSE_MATCH_CLAUSE; } else { @@ -2725,10 +2749,10 @@ match_boolean_partition_clause(Oid partopfamily, Expr *clause, Expr *partkey, *outconst = (Expr *) makeBoolConst(false, false); if (*outconst) - return true; + return PARTCLAUSE_MATCH_CLAUSE; } - return false; + return PARTCLAUSE_NOMATCH; } /* diff --git a/src/test/regress/expected/partition_prune.out b/src/test/regress/expected/partition_prune.out index 95a64972..a08f303d 100644 --- a/src/test/regress/expected/partition_prune.out +++ b/src/test/regress/expected/partition_prune.out @@ -1122,6 +1122,19 @@ explain (costs off) select * from boolpart where a is not unknown; Filter: (a IS NOT UNKNOWN) (7 rows) +create table boolrangep (a bool, b bool, c int) partition by range (a,b,c); +create table boolrangep_tf partition of boolrangep for values from ('true', 'false', 0) to ('true', 'false', 100); +create table boolrangep_ft partition of boolrangep for values from ('false', 'true', 0) to ('false', 'true', 100); +create table boolrangep_ff1 partition of boolrangep for values from ('false', 'false', 0) to ('false', 'false', 50); +create table boolrangep_ff2 partition of boolrangep for values from ('false', 'false', 50) to ('false', 'false', 100); +-- try a more complex case that's been known to trip up pruning in the past +explain (costs off) select * from boolrangep where not a and not b and c = 25; + QUERY PLAN +---------------------------------------------- + Seq Scan on boolrangep_ff1 + Filter: ((NOT a) AND (NOT b) AND (c = 25)) +(2 rows) + -- test scalar-to-array operators create table coercepart (a varchar) partition by list (a); create table coercepart_ab partition of coercepart for values in ('ab'); @@ -1460,7 +1473,7 @@ explain (costs off) select * from rparted_by_int2 where a > 100000000000000; Filter: (a > '100000000000000'::bigint) (3 rows) -drop table lp, coll_pruning, rlp, mc3p, mc2p, boolpart, rp, coll_pruning_multi, like_op_noprune, lparted_by_int2, rparted_by_int2; +drop table lp, coll_pruning, rlp, mc3p, mc2p, boolpart, boolrangep, rp, coll_pruning_multi, like_op_noprune, lparted_by_int2, rparted_by_int2; -- -- Check that pruning with composite range partitioning works correctly when -- it must ignore clauses for trailing keys once it has seen a clause with diff --git a/src/test/regress/sql/partition_prune.sql b/src/test/regress/sql/partition_prune.sql index 4b5acbe1..1cd151e2 100644 --- a/src/test/regress/sql/partition_prune.sql +++ b/src/test/regress/sql/partition_prune.sql @@ -159,6 +159,15 @@ explain (costs off) select * from boolpart where a is not true and a is not fals explain (costs off) select * from boolpart where a is unknown; explain 
(costs off) select * from boolpart where a is not unknown; +create table boolrangep (a bool, b bool, c int) partition by range (a,b,c); +create table boolrangep_tf partition of boolrangep for values from ('true', 'false', 0) to ('true', 'false', 100); +create table boolrangep_ft partition of boolrangep for values from ('false', 'true', 0) to ('false', 'true', 100); +create table boolrangep_ff1 partition of boolrangep for values from ('false', 'false', 0) to ('false', 'false', 50); +create table boolrangep_ff2 partition of boolrangep for values from ('false', 'false', 50) to ('false', 'false', 100); + +-- try a more complex case that's been known to trip up pruning in the past +explain (costs off) select * from boolrangep where not a and not b and c = 25; + -- test scalar-to-array operators create table coercepart (a varchar) partition by list (a); create table coercepart_ab partition of coercepart for values in ('ab'); @@ -264,7 +273,7 @@ create table rparted_by_int2_maxvalue partition of rparted_by_int2 for values fr -- all partitions but rparted_by_int2_maxvalue pruned explain (costs off) select * from rparted_by_int2 where a > 100000000000000; -drop table lp, coll_pruning, rlp, mc3p, mc2p, boolpart, rp, coll_pruning_multi, like_op_noprune, lparted_by_int2, rparted_by_int2; +drop table lp, coll_pruning, rlp, mc3p, mc2p, boolpart, boolrangep, rp, coll_pruning_multi, like_op_noprune, lparted_by_int2, rparted_by_int2; -- -- Check that pruning with composite range partitioning works correctly when From 80d2048d3e70e4e31ec32accf51fb626b43fbc60 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Thu, 2 Jul 2020 11:28:00 +0800 Subject: [PATCH 287/578] Install dependencies to prevent dropping partition key columns. http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/backend/catalog/dependency.c | 53 ++++++++++++++++++--- src/backend/catalog/heap.c | 28 +++++++++-- src/backend/commands/tablecmds.c | 27 +++++------ src/include/catalog/dependency.h | 2 +- src/test/regress/expected/alter_table.out | 10 ++-- src/test/regress/expected/alter_table_1.out | 6 +-- src/test/regress/expected/alter_table_2.out | 6 +-- src/test/regress/expected/alter_table_3.out | 6 +-- src/test/regress/expected/create_table.out | 36 ++++++++++++++ src/test/regress/sql/create_table.sql | 33 +++++++++++++ 10 files changed, 168 insertions(+), 39 deletions(-) diff --git a/src/backend/catalog/dependency.c b/src/backend/catalog/dependency.c index 5975a9bb..924d7f35 100644 --- a/src/backend/catalog/dependency.c +++ b/src/backend/catalog/dependency.c @@ -656,6 +656,7 @@ findDependentObjects(const ObjectAddress *object, ObjectIdGetDatum(object->objectId)); if (object->objectSubId != 0) { + /* Consider only dependencies of this sub-object */ ScanKeyInit(&key[2], Anum_pg_depend_objsubid, BTEqualStrategyNumber, F_INT4EQ, @@ -663,7 +664,10 @@ findDependentObjects(const ObjectAddress *object, nkeys = 3; } else + { + /* Consider dependencies of this object and any sub-objects it has */ nkeys = 2; + } scan = systable_beginscan(*depRel, DependDependerIndexId, true, NULL, nkeys, key); @@ -676,6 +680,18 @@ findDependentObjects(const ObjectAddress *object, otherObject.objectId = foundDep->refobjid; otherObject.objectSubId = foundDep->refobjsubid; + /* + * When scanning dependencies of a whole object, we may find rows + * linking sub-objects of the object to the object itself. (Normally, + * such a dependency is implicit, but we must make explicit ones in + * some cases involving partitioning.) 
We must ignore such rows to + * avoid infinite recursion. + */ + if (otherObject.classId == object->classId && + otherObject.objectId == object->objectId && + object->objectSubId == 0) + continue; + switch (foundDep->deptype) { case DEPENDENCY_NORMAL: @@ -863,6 +879,16 @@ findDependentObjects(const ObjectAddress *object, otherObject.objectSubId = foundDep->objsubid; /* + * If what we found is a sub-object of the current object, just ignore + * it. (Normally, such a dependency is implicit, but we must make + * explicit ones in some cases involving partitioning.) + */ + if (otherObject.classId == object->classId && + otherObject.objectId == object->objectId && + object->objectSubId == 0) + continue; + + /* * Must lock the dependent object before recursing to it. */ AcquireDeletionLock(&otherObject, 0); @@ -1601,8 +1627,10 @@ recordDependencyOnExpr(const ObjectAddress *depender, * As above, but only one relation is expected to be referenced (with * varno = 1 and varlevelsup = 0). Pass the relation OID instead of a * range table. An additional frammish is that dependencies on that - * relation (or its component columns) will be marked with 'self_behavior', - * whereas 'behavior' is used for everything else. + * relation's component columns will be marked with 'self_behavior', + * whereas 'behavior' is used for everything else; also, if 'reverse_self' + * is true, those dependencies are reversed so that the columns are made + * to depend on the table not vice versa. * * NOTE: the caller should ensure that a whole-table dependency on the * specified relation is created separately, if one is needed. In particular, @@ -1615,7 +1643,7 @@ recordDependencyOnSingleRelExpr(const ObjectAddress *depender, Node *expr, Oid relId, DependencyType behavior, DependencyType self_behavior, - bool ignore_self) + bool reverse_self) { find_expr_references_context context; RangeTblEntry rte; @@ -1638,7 +1666,8 @@ recordDependencyOnSingleRelExpr(const ObjectAddress *depender, eliminate_duplicate_dependencies(context.addrs); /* Separate self-dependencies if necessary */ - if (behavior != self_behavior && context.addrs->numrefs > 0) + if ((behavior != self_behavior || reverse_self) && + context.addrs->numrefs > 0) { ObjectAddresses *self_addrs; ObjectAddress *outobj; @@ -1669,11 +1698,23 @@ recordDependencyOnSingleRelExpr(const ObjectAddress *depender, } context.addrs->numrefs = outrefs; - /* Record the self-dependencies */ - if (!ignore_self) + /* Record the self-dependencies with the appropriate direction */ + if (!reverse_self) recordMultipleDependencies(depender, self_addrs->refs, self_addrs->numrefs, self_behavior); + else + { + /* Can't use recordMultipleDependencies, so do it the hard way */ + int selfref; + + for (selfref = 0; selfref < self_addrs->numrefs; selfref++) + { + ObjectAddress *thisobj = self_addrs->refs + selfref; + + recordDependencyOn(thisobj, depender, self_behavior); + } + } free_object_addresses(self_addrs); } diff --git a/src/backend/catalog/heap.c b/src/backend/catalog/heap.c index 56e4d7f1..39a9c235 100644 --- a/src/backend/catalog/heap.c +++ b/src/backend/catalog/heap.c @@ -4323,16 +4323,36 @@ StorePartitionKey(Relation rel, } /* - * Anything mentioned in the expressions. We must ignore the column - * references, which will depend on the table itself; there is no separate - * partition key object. + * The partitioning columns are made internally dependent on the table, + * because we cannot drop any of them without dropping the whole table. 
+ * (ATExecDropColumn independently enforces that, but it's not bulletproof + * so we need the dependencies too.) + */ + for (i = 0; i < partnatts; i++) + { + if (partattrs[i] == 0) + continue; /* ignore expressions here */ + + referenced.classId = RelationRelationId; + referenced.objectId = RelationGetRelid(rel); + referenced.objectSubId = partattrs[i]; + + recordDependencyOn(&referenced, &myself, DEPENDENCY_INTERNAL); + } + + /* + * Also consider anything mentioned in partition expressions. External + * references (e.g. functions) get NORMAL dependencies. Table columns + * mentioned in the expressions are handled the same as plain partitioning + * columns, i.e. they become internally dependent on the whole table. */ if (partexprs) recordDependencyOnSingleRelExpr(&myself, (Node *) partexprs, RelationGetRelid(rel), DEPENDENCY_NORMAL, - DEPENDENCY_AUTO, true); + DEPENDENCY_INTERNAL, + true /* reverse the self-deps */ ); /* * We must invalidate the relcache so that the next diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index 393d30c4..43ccc2f3 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -8220,26 +8220,29 @@ ATExecDropColumn(List **wqueue, Relation rel, const char *colName, errmsg("cannot drop system column \"%s\"", colName))); - /* Don't drop inherited columns */ + /* + * Don't drop inherited columns, unless recursing (presumably from a drop + * of the parent column) + */ if (targetatt->attinhcount > 0 && !recursing) ereport(ERROR, (errcode(ERRCODE_INVALID_TABLE_DEFINITION), errmsg("cannot drop inherited column \"%s\"", colName))); - /* Don't drop columns used in the partition key */ + /* + * Don't drop columns used in the partition key, either. (If we let this + * go through, the key column's dependencies would cause a cascaded drop + * of the whole table, which is surely not what the user expected.) 
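+	 *
+	 * Illustrative sketch (editor's note, not part of the original change;
+	 * the table name is hypothetical):
+	 *
+	 *   CREATE TABLE pt (a int, b text) PARTITION BY RANGE (a);
+	 *   ALTER TABLE pt DROP COLUMN a;
+	 *   -- now rejected with the error reported just below, instead of the
+	 *   -- cascaded drop of the whole table described above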
+ */ if (has_partition_attrs(rel, bms_make_singleton(attnum - FirstLowInvalidHeapAttributeNumber), &is_expr)) { - if (!is_expr) ereport(ERROR, (errcode(ERRCODE_INVALID_TABLE_DEFINITION), - errmsg("cannot drop column named in partition key"))); - else - ereport(ERROR, - (errcode(ERRCODE_INVALID_TABLE_DEFINITION), - errmsg("cannot drop column referenced in partition key expression"))); + errmsg("cannot drop column \"%s\" because it is part of the partition key of relation \"%s\"", + colName, RelationGetRelationName(rel)))); } #ifdef __TBASE__ @@ -10683,14 +10686,10 @@ ATPrepAlterColumnType(List **wqueue, bms_make_singleton(attnum - FirstLowInvalidHeapAttributeNumber), &is_expr)) { - if (!is_expr) - ereport(ERROR, - (errcode(ERRCODE_INVALID_TABLE_DEFINITION), - errmsg("cannot alter type of column named in partition key"))); - else ereport(ERROR, (errcode(ERRCODE_INVALID_TABLE_DEFINITION), - errmsg("cannot alter type of column referenced in partition key expression"))); + errmsg("cannot alter column \"%s\" because it is part of the partition key of relation \"%s\"", + colName, RelationGetRelationName(rel)))); } #ifdef __TBASE__ diff --git a/src/include/catalog/dependency.h b/src/include/catalog/dependency.h index 83aa4d05..6af0f85d 100644 --- a/src/include/catalog/dependency.h +++ b/src/include/catalog/dependency.h @@ -279,7 +279,7 @@ extern void recordDependencyOnSingleRelExpr(const ObjectAddress *depender, Node *expr, Oid relId, DependencyType behavior, DependencyType self_behavior, - bool ignore_self); + bool reverse_self); extern ObjectClass getObjectClass(const ObjectAddress *object); diff --git a/src/test/regress/expected/alter_table.out b/src/test/regress/expected/alter_table.out index ea00b3ae..9ace1f7a 100644 --- a/src/test/regress/expected/alter_table.out +++ b/src/test/regress/expected/alter_table.out @@ -3141,11 +3141,11 @@ LINE 1: ALTER TABLE partitioned ADD EXCLUDE USING gist (a WITH &&); ALTER TABLE partitioned DROP COLUMN a; ERROR: Distribution column cannot be dropped ALTER TABLE partitioned ALTER COLUMN a TYPE char(5); -ERROR: cannot alter type of column named in partition key +ERROR: cannot alter column "a" because it is part of the partition key of relation "partitioned" ALTER TABLE partitioned DROP COLUMN b; -ERROR: cannot drop column referenced in partition key expression +ERROR: cannot drop column "b" because it is part of the partition key of relation "partitioned" ALTER TABLE partitioned ALTER COLUMN b TYPE char(5); -ERROR: cannot alter type of column referenced in partition key expression +ERROR: cannot alter column "b" because it is part of the partition key of relation "partitioned" -- partitioned table cannot participate in regular inheritance CREATE TABLE nonpartitioned ( a int, @@ -3669,9 +3669,9 @@ ERROR: cannot change inheritance of a partition -- partitioned tables; for example, part_5, which is list_parted2's -- partition, is partitioned on b; ALTER TABLE list_parted2 DROP COLUMN b; -ERROR: cannot drop column named in partition key +ERROR: cannot drop column "b" because it is part of the partition key of relation "part_5" ALTER TABLE list_parted2 ALTER COLUMN b TYPE text; -ERROR: cannot alter type of column named in partition key +ERROR: cannot alter column "b" because it is part of the partition key of relation "part_5" -- dropping non-partition key columns should be allowed on the parent table. 
ALTER TABLE list_parted DROP COLUMN b; SELECT * FROM list_parted; diff --git a/src/test/regress/expected/alter_table_1.out b/src/test/regress/expected/alter_table_1.out index 744691c9..aa4082c8 100644 --- a/src/test/regress/expected/alter_table_1.out +++ b/src/test/regress/expected/alter_table_1.out @@ -3140,11 +3140,11 @@ LINE 1: ALTER TABLE partitioned ADD EXCLUDE USING gist (a WITH &&); ALTER TABLE partitioned DROP COLUMN a; ERROR: Distribution column cannot be dropped ALTER TABLE partitioned ALTER COLUMN a TYPE char(5); -ERROR: cannot alter type of column named in partition key +ERROR: cannot alter column "a" because it is part of the partition key of relation "partitioned" ALTER TABLE partitioned DROP COLUMN b; -ERROR: cannot drop column referenced in partition key expression +ERROR: cannot drop column "b" because it is part of the partition key of relation "partitioned" ALTER TABLE partitioned ALTER COLUMN b TYPE char(5); -ERROR: cannot alter type of column referenced in partition key expression +ERROR: cannot alter column "b" because it is part of the partition key of relation "partitioned" -- partitioned table cannot participate in regular inheritance CREATE TABLE nonpartitioned ( a int, diff --git a/src/test/regress/expected/alter_table_2.out b/src/test/regress/expected/alter_table_2.out index a958aa64..773bd445 100644 --- a/src/test/regress/expected/alter_table_2.out +++ b/src/test/regress/expected/alter_table_2.out @@ -3140,11 +3140,11 @@ LINE 1: ALTER TABLE partitioned ADD EXCLUDE USING gist (a WITH &&); ALTER TABLE partitioned DROP COLUMN a; ERROR: Distribution column cannot be dropped ALTER TABLE partitioned ALTER COLUMN a TYPE char(5); -ERROR: cannot alter type of column named in partition key +ERROR: cannot alter column "a" because it is part of the partition key of relation "partitioned" ALTER TABLE partitioned DROP COLUMN b; -ERROR: cannot drop column referenced in partition key expression +ERROR: cannot drop column "b" because it is part of the partition key of relation "partitioned" ALTER TABLE partitioned ALTER COLUMN b TYPE char(5); -ERROR: cannot alter type of column referenced in partition key expression +ERROR: cannot alter column "b" because it is part of the partition key of relation "partitioned" -- partitioned table cannot participate in regular inheritance CREATE TABLE nonpartitioned ( a int, diff --git a/src/test/regress/expected/alter_table_3.out b/src/test/regress/expected/alter_table_3.out index 9d426e3c..3287e360 100644 --- a/src/test/regress/expected/alter_table_3.out +++ b/src/test/regress/expected/alter_table_3.out @@ -3140,11 +3140,11 @@ LINE 1: ALTER TABLE partitioned ADD EXCLUDE USING gist (a WITH &&); ALTER TABLE partitioned DROP COLUMN a; ERROR: Distribution column cannot be dropped ALTER TABLE partitioned ALTER COLUMN a TYPE char(5); -ERROR: cannot alter type of column named in partition key +ERROR: cannot alter column "a" because it is part of the partition key of relation "partitioned" ALTER TABLE partitioned DROP COLUMN b; -ERROR: cannot drop column referenced in partition key expression +ERROR: cannot drop column "b" because it is part of the partition key of relation "partitioned" ALTER TABLE partitioned ALTER COLUMN b TYPE char(5); -ERROR: cannot alter type of column referenced in partition key expression +ERROR: cannot alter column "b" because it is part of the partition key of relation "partitioned" -- partitioned table cannot participate in regular inheritance CREATE TABLE nonpartitioned ( a int, diff --git 
a/src/test/regress/expected/create_table.out b/src/test/regress/expected/create_table.out index e9bf8784..3290fe55 100644 --- a/src/test/regress/expected/create_table.out +++ b/src/test/regress/expected/create_table.out @@ -461,6 +461,42 @@ Partition of: partitioned2 FOR VALUES FROM ('-1', 'aaaaa') TO (100, 'ccccc') Partition constraint: (((a + 1) IS NOT NULL) AND (substr(b, 1, 5) IS NOT NULL) AND (((a + 1) > '-1'::integer) OR (((a + 1) = '-1'::integer) AND (substr(b, 1, 5) >= 'aaaaa'::text))) AND (((a + 1) < 100) OR (((a + 1) = 100) AND (substr(b, 1, 5) < 'ccccc'::text)))) DROP TABLE partitioned, partitioned2; +-- check that dependencies of partition columns are handled correctly +create domain intdom1 as int; +create table partitioned ( + a intdom1, + b text +) partition by range (a); +alter table partitioned drop column a; -- fail +ERROR: cannot drop column "a" because it is part of the partition key of relation "partitioned" +drop domain intdom1; -- fail, requires cascade +ERROR: cannot drop type intdom1 because other objects depend on it +DETAIL: table partitioned depends on type intdom1 +HINT: Use DROP ... CASCADE to drop the dependent objects too. +drop domain intdom1 cascade; +NOTICE: drop cascades to table partitioned +table partitioned; -- gone +ERROR: relation "partitioned" does not exist +LINE 1: table partitioned; + ^ +-- likewise for columns used in partition expressions +create domain intdom1 as int; +create table partitioned ( + a intdom1, + b text +) partition by range (plusone(a)); +alter table partitioned drop column a; -- fail +ERROR: cannot drop column "a" because it is part of the partition key of relation "partitioned" +drop domain intdom1; -- fail, requires cascade +ERROR: cannot drop type intdom1 because other objects depend on it +DETAIL: table partitioned depends on type intdom1 +HINT: Use DROP ... CASCADE to drop the dependent objects too. +drop domain intdom1 cascade; +NOTICE: drop cascades to table partitioned +table partitioned; -- gone +ERROR: relation "partitioned" does not exist +LINE 1: table partitioned; + ^ -- -- Partitions -- diff --git a/src/test/regress/sql/create_table.sql b/src/test/regress/sql/create_table.sql index 68482d79..d00a5935 100644 --- a/src/test/regress/sql/create_table.sql +++ b/src/test/regress/sql/create_table.sql @@ -432,6 +432,39 @@ CREATE TABLE part2_1 PARTITION OF partitioned2 FOR VALUES FROM (-1, 'aaaaa') TO DROP TABLE partitioned, partitioned2; +-- check that dependencies of partition columns are handled correctly +create domain intdom1 as int; + +create table partitioned ( + a intdom1, + b text +) partition by range (a); + +alter table partitioned drop column a; -- fail + +drop domain intdom1; -- fail, requires cascade + +drop domain intdom1 cascade; + +table partitioned; -- gone + +-- likewise for columns used in partition expressions +create domain intdom1 as int; + +create table partitioned ( + a intdom1, + b text +) partition by range (plusone(a)); + +alter table partitioned drop column a; -- fail + +drop domain intdom1; -- fail, requires cascade + +drop domain intdom1 cascade; + +table partitioned; -- gone + + -- -- Partitions -- From 3e54f3574fad0481d1946d019a074e0a33e39d69 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Thu, 2 Jul 2020 11:49:42 +0800 Subject: [PATCH 288/578] Check that partitions are not in use when dropping constraints. 
http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/backend/commands/tablecmds.c | 56 +++++++++++++++++---- src/test/regress/expected/foreign_key.out | 15 ++++++ src/test/regress/expected/foreign_key_1.out | 15 ++++++ src/test/regress/expected/foreign_key_2.out | 15 ++++++ src/test/regress/sql/foreign_key.sql | 12 +++++ 5 files changed, 102 insertions(+), 11 deletions(-) diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index 43ccc2f3..31c4aa91 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -405,6 +405,7 @@ static void ATSimplePermissions(Relation rel, int allowed_targets); static void ATWrongRelkindError(Relation rel, int allowed_targets); static void ATSimpleRecursion(List **wqueue, Relation rel, AlterTableCmd *cmd, bool recurse, LOCKMODE lockmode); +static void ATCheckPartitionsNotInUse(Relation rel, LOCKMODE lockmode); static void ATTypedTableRecursion(List **wqueue, Relation rel, AlterTableCmd *cmd, LOCKMODE lockmode); static List *find_typed_table_dependencies(Oid typeOid, const char *typeName, @@ -4151,8 +4152,7 @@ CheckTableNotInUse(Relation rel, const char *stmt) ereport(ERROR, (errcode(ERRCODE_OBJECT_IN_USE), /* translator: first %s is a SQL command, eg ALTER TABLE */ - errmsg("cannot %s \"%s\" because " - "it is being used by active queries in this session", + errmsg("cannot %s \"%s\" because it is being used by active queries in this session", stmt, RelationGetRelationName(rel)))); if (rel->rd_rel->relkind != RELKIND_INDEX && @@ -4160,8 +4160,7 @@ CheckTableNotInUse(Relation rel, const char *stmt) ereport(ERROR, (errcode(ERRCODE_OBJECT_IN_USE), /* translator: first %s is a SQL command, eg ALTER TABLE */ - errmsg("cannot %s \"%s\" because " - "it has pending trigger events", + errmsg("cannot %s \"%s\" because it has pending trigger events", stmt, RelationGetRelationName(rel)))); } @@ -4790,16 +4789,19 @@ ATPrepCmd(List **wqueue, Relation rel, AlterTableCmd *cmd, break; case AT_AddIdentity: ATSimplePermissions(rel, ATT_TABLE | ATT_VIEW | ATT_FOREIGN_TABLE); + /* This command never recurses */ pass = AT_PASS_ADD_CONSTR; break; - case AT_DropIdentity: - ATSimplePermissions(rel, ATT_TABLE | ATT_VIEW | ATT_FOREIGN_TABLE); - pass = AT_PASS_DROP; - break; case AT_SetIdentity: ATSimplePermissions(rel, ATT_TABLE | ATT_VIEW | ATT_FOREIGN_TABLE); + /* This command never recurses */ pass = AT_PASS_COL_ATTRS; break; + case AT_DropIdentity: + ATSimplePermissions(rel, ATT_TABLE | ATT_VIEW | ATT_FOREIGN_TABLE); + /* This command never recurses */ + pass = AT_PASS_DROP; + break; case AT_DropNotNull: /* ALTER COLUMN DROP NOT NULL */ ATSimplePermissions(rel, ATT_TABLE | ATT_FOREIGN_TABLE); ATPrepDropNotNull(rel, recurse, recursing); @@ -4861,7 +4863,8 @@ ATPrepCmd(List **wqueue, Relation rel, AlterTableCmd *cmd, break; case AT_DropConstraint: /* DROP CONSTRAINT */ ATSimplePermissions(rel, ATT_TABLE | ATT_FOREIGN_TABLE); - /* Recursion occurs during execution phase */ + ATCheckPartitionsNotInUse(rel, lockmode); + /* Other recursion occurs during execution phase */ /* No command-specific prep needed except saving recurse flag */ if (recurse) cmd->subtype = AT_DropConstraintRecurse; @@ -6477,8 +6480,9 @@ ATSimpleRecursion(List **wqueue, Relation rel, AlterTableCmd *cmd, bool recurse, LOCKMODE lockmode) {// #lizard forgives /* - * Propagate to children if desired. Only plain tables and foreign tables - * have children, so no need to search for other relkinds. 
+ * Propagate to children if desired. Only plain tables, foreign tables + * and partitioned tables have children, so no need to search for other + * relkinds. */ if (recurse && (rel->rd_rel->relkind == RELKIND_RELATION || @@ -6550,6 +6554,36 @@ ATSimpleRecursion(List **wqueue, Relation rel, } } +/* + * Obtain list of partitions of the given table, locking them all at the given + * lockmode and ensuring that they all pass CheckTableNotInUse. + * + * This function is a no-op if the given relation is not a partitioned table; + * in particular, nothing is done if it's a legacy inheritance parent. + */ +static void +ATCheckPartitionsNotInUse(Relation rel, LOCKMODE lockmode) +{ + if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) + { + List *inh; + ListCell *cell; + + inh = find_all_inheritors(RelationGetRelid(rel), lockmode, NULL); + /* first element is the parent rel; must ignore it */ + for_each_cell(cell, lnext(list_head(inh))) + { + Relation childrel; + + /* find_all_inheritors already got lock */ + childrel = table_open(lfirst_oid(cell), NoLock); + CheckTableNotInUse(childrel, "ALTER TABLE"); + table_close(childrel, NoLock); + } + list_free(inh); + } +} + /* * ATTypedTableRecursion * diff --git a/src/test/regress/expected/foreign_key.out b/src/test/regress/expected/foreign_key.out index 5a958f3a..c3d40a45 100644 --- a/src/test/regress/expected/foreign_key.out +++ b/src/test/regress/expected/foreign_key.out @@ -1443,3 +1443,18 @@ DELETE FROM fk_notpartitioned_pk WHERE a = 1; ERROR: update or delete on table "fk_notpartitioned_pk" violates foreign key constraint "fk_partitioned_fk_a_fkey" on table "fk_partitioned_fk" DETAIL: Key (a)=(1) is still referenced from table "fk_partitioned_fk". DROP TABLE fk_notpartitioned_pk, fk_partitioned_fk; +-- ensure we check partitions are "not used" when dropping constraints +CREATE SCHEMA fkpart8 + CREATE TABLE tbl1(f1 int PRIMARY KEY) + CREATE TABLE tbl2(f1 int REFERENCES tbl1 DEFERRABLE INITIALLY DEFERRED) PARTITION BY RANGE(f1) + CREATE TABLE tbl2_p1 PARTITION OF tbl2 FOR VALUES FROM (minvalue) TO (maxvalue); +INSERT INTO fkpart8.tbl1 VALUES(1); +BEGIN; +INSERT INTO fkpart8.tbl2 VALUES(1); +ALTER TABLE fkpart8.tbl2 DROP CONSTRAINT tbl2_f1_fkey; +ERROR: cannot ALTER TABLE "tbl2_p1" because it has pending trigger events +COMMIT; +DROP SCHEMA fkpart8 CASCADE; +NOTICE: drop cascades to 2 other objects +DETAIL: drop cascades to table fkpart8.tbl1 +drop cascades to table fkpart8.tbl2 diff --git a/src/test/regress/expected/foreign_key_1.out b/src/test/regress/expected/foreign_key_1.out index cb069e3a..2d28c086 100644 --- a/src/test/regress/expected/foreign_key_1.out +++ b/src/test/regress/expected/foreign_key_1.out @@ -1438,3 +1438,18 @@ DELETE FROM fk_notpartitioned_pk WHERE a = 1; ERROR: update or delete on table "fk_notpartitioned_pk" violates foreign key constraint "fk_partitioned_fk_a_fkey" on table "fk_partitioned_fk" DETAIL: Key (a)=(1) is still referenced from table "fk_partitioned_fk". 
DROP TABLE fk_notpartitioned_pk, fk_partitioned_fk; +-- ensure we check partitions are "not used" when dropping constraints +CREATE SCHEMA fkpart8 + CREATE TABLE tbl1(f1 int PRIMARY KEY) + CREATE TABLE tbl2(f1 int REFERENCES tbl1 DEFERRABLE INITIALLY DEFERRED) PARTITION BY RANGE(f1) + CREATE TABLE tbl2_p1 PARTITION OF tbl2 FOR VALUES FROM (minvalue) TO (maxvalue); +INSERT INTO fkpart8.tbl1 VALUES(1); +BEGIN; +INSERT INTO fkpart8.tbl2 VALUES(1); +ALTER TABLE fkpart8.tbl2 DROP CONSTRAINT tbl2_f1_fkey; +ERROR: cannot ALTER TABLE "tbl2_p1" because it has pending trigger events +COMMIT; +DROP SCHEMA fkpart8 CASCADE; +NOTICE: drop cascades to 2 other objects +DETAIL: drop cascades to table fkpart8.tbl1 +drop cascades to table fkpart8.tbl2 diff --git a/src/test/regress/expected/foreign_key_2.out b/src/test/regress/expected/foreign_key_2.out index 27e9026d..e3b7210b 100644 --- a/src/test/regress/expected/foreign_key_2.out +++ b/src/test/regress/expected/foreign_key_2.out @@ -1450,3 +1450,18 @@ DELETE FROM fk_notpartitioned_pk WHERE a = 1; ERROR: update or delete on table "fk_notpartitioned_pk" violates foreign key constraint "fk_partitioned_fk_a_fkey" on table "fk_partitioned_fk" DETAIL: Key (a)=(1) is still referenced from table "fk_partitioned_fk". DROP TABLE fk_notpartitioned_pk, fk_partitioned_fk; +-- ensure we check partitions are "not used" when dropping constraints +CREATE SCHEMA fkpart8 + CREATE TABLE tbl1(f1 int PRIMARY KEY) + CREATE TABLE tbl2(f1 int REFERENCES tbl1 DEFERRABLE INITIALLY DEFERRED) PARTITION BY RANGE(f1) + CREATE TABLE tbl2_p1 PARTITION OF tbl2 FOR VALUES FROM (minvalue) TO (maxvalue); +INSERT INTO fkpart8.tbl1 VALUES(1); +BEGIN; +INSERT INTO fkpart8.tbl2 VALUES(1); +ALTER TABLE fkpart8.tbl2 DROP CONSTRAINT tbl2_f1_fkey; +ERROR: cannot ALTER TABLE "tbl2_p1" because it has pending trigger events +COMMIT; +DROP SCHEMA fkpart8 CASCADE; +NOTICE: drop cascades to 2 other objects +DETAIL: drop cascades to table fkpart8.tbl1 +drop cascades to table fkpart8.tbl2 diff --git a/src/test/regress/sql/foreign_key.sql b/src/test/regress/sql/foreign_key.sql index 8c887eb9..a1d62828 100644 --- a/src/test/regress/sql/foreign_key.sql +++ b/src/test/regress/sql/foreign_key.sql @@ -1071,3 +1071,15 @@ INSERT INTO fk_partitioned_fk VALUES (1); ALTER TABLE fk_notpartitioned_pk ALTER COLUMN a TYPE bigint; DELETE FROM fk_notpartitioned_pk WHERE a = 1; DROP TABLE fk_notpartitioned_pk, fk_partitioned_fk; + +-- ensure we check partitions are "not used" when dropping constraints +CREATE SCHEMA fkpart8 + CREATE TABLE tbl1(f1 int PRIMARY KEY) + CREATE TABLE tbl2(f1 int REFERENCES tbl1 DEFERRABLE INITIALLY DEFERRED) PARTITION BY RANGE(f1) + CREATE TABLE tbl2_p1 PARTITION OF tbl2 FOR VALUES FROM (minvalue) TO (maxvalue); +INSERT INTO fkpart8.tbl1 VALUES(1); +BEGIN; +INSERT INTO fkpart8.tbl2 VALUES(1); +ALTER TABLE fkpart8.tbl2 DROP CONSTRAINT tbl2_f1_fkey; +COMMIT; +DROP SCHEMA fkpart8 CASCADE; \ No newline at end of file From 1f127fe76c8020f68bbcd02140df46bf9a8a9828 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Thu, 2 Jul 2020 12:56:51 +0800 Subject: [PATCH 289/578] Fix infelicities in describeOneTableDetails' partitioned-table handling. 
http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/bin/psql/describe.c | 39 ++++++++++++++++++++------------------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/src/bin/psql/describe.c b/src/bin/psql/describe.c index 4fd3864b..c67d2570 100644 --- a/src/bin/psql/describe.c +++ b/src/bin/psql/describe.c @@ -1877,42 +1877,40 @@ describeOneTableDetails(const char *schemaname, } /* Make footers */ - if (pset.sversion >= 100000) + + if (tableinfo.ispartition) { - /* Get the partition information */ + /* Footer information for a partition child table */ PGresult *result; - char *parent_name; - char *partdef; - char *partconstraintdef = NULL; printfPQExpBuffer(&buf, "SELECT inhparent::pg_catalog.regclass,\n" - " pg_catalog.pg_get_expr(c.relpartbound, inhrelid)"); + " pg_catalog.pg_get_expr(c.relpartbound, c.oid)"); /* If verbose, also request the partition constraint definition */ if (verbose) - appendPQExpBuffer(&buf, - ",\n pg_catalog.pg_get_partition_constraintdef(inhrelid)"); + appendPQExpBufferStr(&buf, + ",\n pg_catalog.pg_get_partition_constraintdef(c.oid)"); appendPQExpBuffer(&buf, "\nFROM pg_catalog.pg_class c" " JOIN pg_catalog.pg_inherits i" " ON c.oid = inhrelid" - "\nWHERE c.oid = '%s' AND c.relispartition;", oid); + "\nWHERE c.oid = '%s';", oid); result = PSQLexec(buf.data); if (!result) goto error_return; if (PQntuples(result) > 0) { - parent_name = PQgetvalue(result, 0, 0); - partdef = PQgetvalue(result, 0, 1); - - if (PQnfields(result) == 3 && !PQgetisnull(result, 0, 2)) - partconstraintdef = PQgetvalue(result, 0, 2); + char *parent_name = PQgetvalue(result, 0, 0); + char *partdef = PQgetvalue(result, 0, 1); + char *partconstraintdef = NULL; printfPQExpBuffer(&tmpbuf, _("Partition of: %s %s"), parent_name, partdef); printTableAddFooter(&cont, tmpbuf.data); + if (!PQgetisnull(result, 0, 2)) + partconstraintdef = PQgetvalue(result, 0, 2); /* If there isn't any constraint, show that explicitly */ if (partconstraintdef == NULL || partconstraintdef[0] == '\0') printfPQExpBuffer(&tmpbuf, _("No partition constraint")); @@ -1921,26 +1919,29 @@ describeOneTableDetails(const char *schemaname, partconstraintdef); printTableAddFooter(&cont, tmpbuf.data); - PQclear(result); } + PQclear(result); } if (tableinfo.relkind == RELKIND_PARTITIONED_TABLE) { - /* Get the partition key information */ + /* Footer information for a partitioned table (partitioning parent) */ PGresult *result; - char *partkeydef; printfPQExpBuffer(&buf, "SELECT pg_catalog.pg_get_partkeydef('%s'::pg_catalog.oid);", oid); result = PSQLexec(buf.data); - if (!result || PQntuples(result) != 1) + if (!result) goto error_return; - partkeydef = PQgetvalue(result, 0, 0); + if (PQntuples(result) == 1) + { + char *partkeydef = PQgetvalue(result, 0, 0); + printfPQExpBuffer(&tmpbuf, _("Partition key: %s"), partkeydef); printTableAddFooter(&cont, tmpbuf.data); + } PQclear(result); } From 9813e44a44ee359b8799dbf43f55eaed09b06972 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Thu, 2 Jul 2020 15:12:56 +0800 Subject: [PATCH 290/578] Improve pruning of a default partition.http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/backend/partitioning/partprune.c | 218 ++++++++---------- src/include/partitioning/partbounds.h | 1 - src/test/regress/expected/partition_prune.out | 20 +- src/test/regress/sql/partition_prune.sql | 1 + 4 files changed, 111 insertions(+), 129 deletions(-) diff --git a/src/backend/partitioning/partprune.c 
b/src/backend/partitioning/partprune.c index 7adf41f3..5d84bf2a 100644 --- a/src/backend/partitioning/partprune.c +++ b/src/backend/partitioning/partprune.c @@ -269,6 +269,7 @@ get_matching_partitions(PartitionPruneContext *context, List *pruning_steps) PruneStepResult **results, *final_result; ListCell *lc; + bool scan_default; /* If there are no pruning steps then all partitions match. */ if (num_steps == 0) @@ -317,30 +318,39 @@ get_matching_partitions(PartitionPruneContext *context, List *pruning_steps) Assert(final_result != NULL); i = -1; result = NULL; + scan_default = final_result->scan_default; while ((i = bms_next_member(final_result->bound_offsets, i)) >= 0) { int partindex = context->boundinfo->indexes[i]; - /* - * In range and hash partitioning cases, some slots may contain -1, - * indicating that no partition has been defined to accept a given - * range of data or for a given remainder, respectively. The default - * partition, if any, in case of range partitioning, will be added to - * the result, because the specified range still satisfies the query's - * conditions. - */ - if (partindex >= 0) - result = bms_add_member(result, partindex); + if (partindex < 0) + { + /* + * In range partitioning cases, if a partition index is -1 it + * means that the bound at the offset is the upper bound for a + * range not covered by any partition (other than a possible + * default partition). In hash partitioning, the same means no + * partition has been defined for the corresponding remainder + * value. + * + * In either case, the value is still part of the queried range of + * values, so mark to scan the default partition if one exists. + */ + scan_default |= partition_bound_has_default(context->boundinfo); + continue; + } + + result = bms_add_member(result, partindex); } - /* Add the null and/or default partition if needed and if present. */ + /* Add the null and/or default partition if needed and present. */ if (final_result->scan_null) { Assert(context->strategy == PARTITION_STRATEGY_LIST); Assert(partition_bound_accepts_nulls(context->boundinfo)); result = bms_add_member(result, context->boundinfo->null_index); } - if (final_result->scan_default) + if (scan_default) { Assert(context->strategy == PARTITION_STRATEGY_LIST || context->strategy == PARTITION_STRATEGY_RANGE); @@ -1809,6 +1819,11 @@ get_matching_hash_bounds(PartitionPruneContext *context, * get_matching_list_bounds * Determine the offsets of list bounds matching the specified value, * according to the semantics of the given operator strategy + * + * scan_default will be set in the returned struct, if the default partition + * needs to be scanned, provided one exists at all. scan_null will be set if + * the special null-accepting partition needs to be scanned. + * * 'opstrategy' if non-zero must be a btree strategy number. * * 'value' contains the value to use for pruning. @@ -2008,8 +2023,13 @@ get_matching_list_bounds(PartitionPruneContext *context, * Each datum whose offset is in result is to be treated as the upper bound of * the partition that will contain the desired values. * - * If default partition needs to be scanned for given values, set scan_default - * in result if present. + * scan_default is set in the returned struct if a default partition exists + * and we're absolutely certain that it needs to be scanned. 
We do *not* set + * it just because values match portions of the key space uncovered by + * partitions other than default (space which we normally assume to belong to + * the default partition): the final set of bounds obtained after combining + * multiple pruning steps might exclude it, so we infer its inclusion + * elsewhere. * * 'opstrategy' if non-zero must be a btree strategy number. * @@ -2035,8 +2055,7 @@ get_matching_range_bounds(PartitionPruneContext *context, int *partindices = boundinfo->indexes; int off, minoff, - maxoff, - i; + maxoff; bool is_equal; bool inclusive = false; @@ -2066,12 +2085,15 @@ get_matching_range_bounds(PartitionPruneContext *context, */ if (nvalues == 0) { + /* ignore key space not covered by any partitions */ if (partindices[minoff] < 0) minoff++; if (partindices[maxoff] < 0) maxoff--; result->scan_default = partition_bound_has_default(boundinfo); + Assert(partindices[minoff] >= 0 && + partindices[maxoff] >= 0); result->bound_offsets = bms_add_range(NULL, minoff, maxoff); return result; @@ -2099,11 +2121,7 @@ get_matching_range_bounds(PartitionPruneContext *context, if (nvalues == partnatts) { /* There can only be zero or one matching partition. */ - if (partindices[off + 1] >= 0) - result->bound_offsets = bms_make_singleton(off + 1); - else - result->scan_default = - partition_bound_has_default(boundinfo); + result->bound_offsets = bms_make_singleton(off + 1); return result; } else @@ -2191,57 +2209,21 @@ get_matching_range_bounds(PartitionPruneContext *context, maxoff = off + 1; } - /* - * Skip if minoff/maxoff are actually the upper bound of a - * un-assigned portion of values. - */ - if (partindices[minoff] < 0 && minoff < boundinfo->ndatums) - minoff++; - if (partindices[maxoff] < 0 && maxoff >= 1) - maxoff--; - - /* - * There may exist a range of values unassigned to any - * non-default partition between the datums at minoff and - * maxoff. Add the default partition in that case. - */ - if (partition_bound_has_default(boundinfo)) - { - for (i = minoff; i <= maxoff; i++) - { - if (partindices[i] < 0) - { - result->scan_default = true; - break; - } - } - } - Assert(minoff >= 0 && maxoff >= 0); result->bound_offsets = bms_add_range(NULL, minoff, maxoff); } - else if (off >= 0) /* !is_equal */ + else { /* * The lookup value falls in the range between some bounds in * boundinfo. 'off' would be the offset of the greatest bound * that is <= lookup value, so add off + 1 to the result * instead as the offset of the upper bound of the only - * partition that may contain the lookup value. - */ - if (partindices[off + 1] >= 0) - result->bound_offsets = bms_make_singleton(off + 1); - else - result->scan_default = - partition_bound_has_default(boundinfo); - } - else - { - /* - * off < 0: the lookup value is smaller than all bounds, so - * only the default partition qualifies, if there is one. + * partition that may contain the lookup value. If 'off' is + * -1 indicating that all bounds are greater, then we simply + * end up adding the first bound's offset, that is, 0. */ - result->scan_default = partition_bound_has_default(boundinfo); + result->bound_offsets = bms_make_singleton(off + 1); } return result; @@ -2312,16 +2294,18 @@ get_matching_range_bounds(PartitionPruneContext *context, minoff = inclusive ? off : off + 1; } - - /* - * lookup value falls in the range between some bounds in - * boundinfo. 
off would be the offset of the greatest bound - * that is <= lookup value, so add off + 1 to the result - * instead as the offset of the upper bound of the smallest - * partition that may contain the lookup value. - */ else + { + + /* + * lookup value falls in the range between some bounds in + * boundinfo. off would be the offset of the greatest + * bound that is <= lookup value, so add off + 1 to the + * result instead as the offset of the upper bound of the + * smallest partition that may contain the lookup value. + */ minoff = off + 1; + } } break; @@ -2339,16 +2323,7 @@ get_matching_range_bounds(PartitionPruneContext *context, boundinfo, nvalues, values, &is_equal); - if (off < 0) - { - /* - * All bounds are greater than the key, so we could only - * expect to find the lookup key in the default partition. - */ - result->scan_default = partition_bound_has_default(boundinfo); - return result; - } - else + if (off >= 0) { /* * See the comment above. @@ -2396,6 +2371,14 @@ get_matching_range_bounds(PartitionPruneContext *context, else maxoff = off; } + else + { + /* + * 'off' is -1 indicating that all bounds are greater, so just + * set the first bound's offset as maxoff. + */ + maxoff = off + 1; + } break; default: @@ -2403,58 +2386,43 @@ get_matching_range_bounds(PartitionPruneContext *context, break; } + Assert(minoff >= 0 && minoff <= boundinfo->ndatums); + Assert(maxoff >= 0 && maxoff <= boundinfo->ndatums); + /* - * Skip a gap and when doing so, check if the bound contains a finite - * value to decide if we need to add the default partition. If it's an - * infinite bound, we need not add the default partition, as having an - * infinite bound means the partition in question catches any values that - * would otherwise be in the default partition. + * If the smallest partition to return has MINVALUE (negative infinity) as + * its lower bound, increment it to point to the next finite bound + * (supposedly its upper bound), so that we don't advertently end up + * scanning the default partition. */ - if (partindices[minoff] < 0) + if (minoff < boundinfo->ndatums && partindices[minoff] < 0) { int lastkey = nvalues - 1; - if (minoff >= 0 && - minoff < boundinfo->ndatums && - boundinfo->kind[minoff][lastkey] == - PARTITION_RANGE_DATUM_VALUE) - result->scan_default = partition_bound_has_default(boundinfo); - - minoff++; + if (boundinfo->kind[minoff][lastkey] == + PARTITION_RANGE_DATUM_MINVALUE) + { + minoff++; + Assert(boundinfo->indexes[minoff] >= 0); + } } /* - * Skip a gap. See the above comment about how we decide whether or or - * not to scan the default partition based whether the datum that will - * become the maximum datum is finite or not. + * If the previous greatest partition has MAXVALUE (positive infinity) as + * its upper bound (something only possible to do with multi-column range + * partitioning), we scan switch to it as the greatest partition to + * return. Again, so that we don't advertently end up scanning the + * default partition. */ if (maxoff >= 1 && partindices[maxoff] < 0) { int lastkey = nvalues - 1; - if (maxoff >= 0 && - maxoff <= boundinfo->ndatums && - boundinfo->kind[maxoff - 1][lastkey] == - PARTITION_RANGE_DATUM_VALUE) - result->scan_default = partition_bound_has_default(boundinfo); - - maxoff--; - } - - if (partition_bound_has_default(boundinfo)) - { - /* - * There may exist a range of values unassigned to any non-default - * partition between the datums at minoff and maxoff. Add the default - * partition in that case. 
- */ - for (i = minoff; i <= maxoff; i++) + if (boundinfo->kind[maxoff - 1][lastkey] == + PARTITION_RANGE_DATUM_MAXVALUE) { - if (partindices[i] < 0) - { - result->scan_default = true; - break; - } + maxoff--; + Assert(boundinfo->indexes[maxoff] >= 0); } } @@ -2599,14 +2567,24 @@ perform_pruning_combine_step(PartitionPruneContext *context, /* * A combine step without any source steps is an indication to not perform - * any partition pruning, we just return all partitions. + * any partition pruning. Return all datum indexes in that case. */ result = (PruneStepResult *) palloc0(sizeof(PruneStepResult)); if (list_length(cstep->source_stepids) == 0) { PartitionBoundInfo boundinfo = context->boundinfo; + int rangemax; + + /* + * Add all valid offsets into the boundinfo->indexes array. For range + * partitioning, boundinfo->indexes contains (boundinfo->ndatums + 1) + * valid entries; otherwise there are boundinfo->ndatums. + */ + rangemax = context->strategy == PARTITION_STRATEGY_RANGE ? + boundinfo->ndatums : boundinfo->ndatums - 1; - result->bound_offsets = bms_add_range(NULL, 0, boundinfo->ndatums - 1); + result->bound_offsets = + bms_add_range(result->bound_offsets, 0, rangemax); result->scan_default = partition_bound_has_default(boundinfo); result->scan_null = partition_bound_accepts_nulls(boundinfo); return result; diff --git a/src/include/partitioning/partbounds.h b/src/include/partitioning/partbounds.h index c76014d4..45df3fb8 100644 --- a/src/include/partitioning/partbounds.h +++ b/src/include/partitioning/partbounds.h @@ -52,7 +52,6 @@ * pointed by remainder produced when hash value of the datum-tuple is divided * by the greatest modulus. */ - typedef struct PartitionBoundInfoData { char strategy; /* hash, list or range? */ diff --git a/src/test/regress/expected/partition_prune.out b/src/test/regress/expected/partition_prune.out index a08f303d..ff388472 100644 --- a/src/test/regress/expected/partition_prune.out +++ b/src/test/regress/expected/partition_prune.out @@ -517,15 +517,13 @@ explain (costs off) select * from rlp where a <= 31; Filter: (a <= 31) -> Seq Scan on rlp5_1 Filter: (a <= 31) - -> Seq Scan on rlp5_default - Filter: (a <= 31) -> Seq Scan on rlp_default_10 Filter: (a <= 31) -> Seq Scan on rlp_default_30 Filter: (a <= 31) -> Seq Scan on rlp_default_default Filter: (a <= 31) -(29 rows) +(27 rows) explain (costs off) select * from rlp where a = 1 or a = 7; QUERY PLAN @@ -573,11 +571,7 @@ explain (costs off) select * from rlp where a > 20 and a < 27; Filter: ((a > 20) AND (a < 27)) -> Seq Scan on rlp4_2 Filter: ((a > 20) AND (a < 27)) - -> Seq Scan on rlp4_default - Filter: ((a > 20) AND (a < 27)) - -> Seq Scan on rlp_default_default - Filter: ((a > 20) AND (a < 27)) -(9 rows) +(5 rows) explain (costs off) select * from rlp where a = 29; QUERY PLAN @@ -603,6 +597,16 @@ explain (costs off) select * from rlp where a >= 29; Filter: (a >= 29) (11 rows) +explain (costs off) select * from rlp where a < 1 or (a > 20 and a < 25); + QUERY PLAN +------------------------------------------------------ + Append + -> Seq Scan on rlp1 + Filter: ((a < 1) OR ((a > 20) AND (a < 25))) + -> Seq Scan on rlp4_1 + Filter: ((a < 1) OR ((a > 20) AND (a < 25))) +(5 rows) + -- redundant clauses are eliminated explain (costs off) select * from rlp where a > 1 and a = 10; /* only default */ QUERY PLAN diff --git a/src/test/regress/sql/partition_prune.sql b/src/test/regress/sql/partition_prune.sql index 1cd151e2..55fda489 100644 --- a/src/test/regress/sql/partition_prune.sql +++ 
b/src/test/regress/sql/partition_prune.sql @@ -83,6 +83,7 @@ explain (costs off) select * from rlp where a = 1 or b = 'ab'; explain (costs off) select * from rlp where a > 20 and a < 27; explain (costs off) select * from rlp where a = 29; explain (costs off) select * from rlp where a >= 29; +explain (costs off) select * from rlp where a < 1 or (a > 20 and a < 25); -- redundant clauses are eliminated explain (costs off) select * from rlp where a > 1 and a = 10; /* only default */ From c072d696d1b205959f88bb169ab62dbaf44fb393 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Thu, 2 Jul 2020 15:25:18 +0800 Subject: [PATCH 291/578] Fix dependency handling of column drop with partitioned tables.http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/backend/commands/tablecmds.c | 43 ++++++++++++++++++++++++-------- 1 file changed, 32 insertions(+), 11 deletions(-) diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index 31c4aa91..9ab2e3ec 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -447,7 +447,8 @@ static void ATPrepDropColumn(List **wqueue, Relation rel, bool recurse, bool rec static ObjectAddress ATExecDropColumn(List **wqueue, Relation rel, const char *colName, DropBehavior behavior, bool recurse, bool recursing, - bool missing_ok, LOCKMODE lockmode); + bool missing_ok, LOCKMODE lockmode, + ObjectAddresses *addrs); static ObjectAddress ATExecAddIndex(AlteredTableInfo *tab, Relation rel, IndexStmt *stmt, bool is_rebuild, LOCKMODE lockmode); static ObjectAddress ATExecAddConstraint(List **wqueue, @@ -5209,12 +5210,14 @@ ATExecCmd(List **wqueue, AlteredTableInfo *tab, Relation rel, case AT_DropColumn: /* DROP COLUMN */ address = ATExecDropColumn(wqueue, rel, cmd->name, cmd->behavior, false, false, - cmd->missing_ok, lockmode); + cmd->missing_ok, lockmode, + NULL); break; case AT_DropColumnRecurse: /* DROP COLUMN with recursion */ address = ATExecDropColumn(wqueue, rel, cmd->name, cmd->behavior, true, false, - cmd->missing_ok, lockmode); + cmd->missing_ok, lockmode, + NULL); break; case AT_AddIndex: /* ADD INDEX */ address = ATExecAddIndex(tab, rel, (IndexStmt *) cmd->def, false, @@ -8203,14 +8206,23 @@ ATPrepDropColumn(List **wqueue, Relation rel, bool recurse, bool recursing, } /* - * Return value is the address of the dropped column. + * Drops column 'colName' from relation 'rel' and returns the address of the + * dropped column. The column is also dropped (or marked as no longer + * inherited from relation) from the relation's inheritance children, if any. + * + * In the recursive invocations for inheritance child relations, instead of + * dropping the column directly (if to be dropped at all), its object address + * is added to 'addrs', which must be non-NULL in such invocations. All + * columns are dropped at the same time after all the children have been + * checked recursively. 
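+ *
+ * Rough call pattern (editor's sketch of the code below, for a parent with
+ * one child that also drops the column):
+ *
+ *   ATExecDropColumn(parent, recursing=false, addrs=NULL)
+ *     addrs = new_object_addresses();
+ *     ATExecDropColumn(child, recursing=true, addrs);   collects child column
+ *     add_exact_object_address(parent column, addrs);
+ *     performMultipleDeletions(addrs, behavior, 0);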
*/ static ObjectAddress ATExecDropColumn(List **wqueue, Relation rel, const char *colName, DropBehavior behavior, bool recurse, bool recursing, - bool missing_ok, LOCKMODE lockmode) -{// #lizard forgives + bool missing_ok, LOCKMODE lockmode, + ObjectAddresses *addrs) +{ HeapTuple tuple; Form_pg_attribute targetatt; AttrNumber attnum; @@ -8222,6 +8234,11 @@ ATExecDropColumn(List **wqueue, Relation rel, const char *colName, if (recursing) ATSimplePermissions(rel, ATT_TABLE | ATT_FOREIGN_TABLE); + /* Initialize addrs on the first invocation */ + Assert(!recursing || addrs != NULL); + if (!recursing) + addrs = new_object_addresses(); + /* * get the number of the attribute */ @@ -8362,7 +8379,7 @@ ATExecDropColumn(List **wqueue, Relation rel, const char *colName, /* Time to delete this child column, too */ ATExecDropColumn(wqueue, childrel, colName, behavior, true, true, - false, lockmode); + false, lockmode, addrs); } else { @@ -8421,14 +8438,18 @@ ATExecDropColumn(List **wqueue, Relation rel, const char *colName, heap_close(attr_rel, RowExclusiveLock); } - /* - * Perform the actual column deletion - */ + /* Add object to delete */ object.classId = RelationRelationId; object.objectId = RelationGetRelid(rel); object.objectSubId = attnum; + add_exact_object_address(&object, addrs); - performDeletion(&object, behavior, 0); + if (!recursing) + { + /* Recursion has ended, drop everything that was collected */ + performMultipleDeletions(addrs, behavior, 0); + free_object_addresses(addrs); + } /* * If we dropped the OID column, must adjust pg_class.relhasoids and tell From 26b10a96796d14d7b62b1749edec738b3c75f36f Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Thu, 2 Jul 2020 17:04:48 +0800 Subject: [PATCH 292/578] Simplify index_[constraint_]create API. http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/backend/catalog/index.c | 107 ++++++++++++++++--------------- src/backend/catalog/toasting.c | 3 +- src/backend/commands/indexcmds.c | 33 +++++++--- src/backend/commands/tablecmds.c | 13 ++-- src/include/catalog/index.h | 29 +++++---- 5 files changed, 105 insertions(+), 80 deletions(-) diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c index 7f01e417..db5d16ee 100644 --- a/src/backend/catalog/index.c +++ b/src/backend/catalog/index.c @@ -719,19 +719,25 @@ UpdateIndexRelation(Oid indexoid, * classObjectId: array of index opclass OIDs, one per index column * coloptions: array of per-index-column indoption settings * reloptions: AM-specific options - * isprimary: index is a PRIMARY KEY - * isconstraint: index is owned by PRIMARY KEY, UNIQUE, or EXCLUSION constraint - * deferrable: constraint is DEFERRABLE - * initdeferred: constraint is INITIALLY DEFERRED - * allow_system_table_mods: allow table to be a system catalog - * skip_build: true to skip the index_build() step for the moment; caller - * must do it later (typically via reindex_index()) - * concurrent: if true, do not lock the table against writers. The index - * will be marked "invalid" and the caller must take additional steps + * flags: bitmask that can include any combination of these bits: + * INDEX_CREATE_IS_PRIMARY + * the index is a primary key + * INDEX_CREATE_ADD_CONSTRAINT: + * invoke index_constraint_create also + * INDEX_CREATE_SKIP_BUILD: + * skip the index_build() step for the moment; caller must do it + * later (typically via reindex_index()) + * INDEX_CREATE_CONCURRENT: + * do not lock the table against writers. 
The index will be + * marked "invalid" and the caller must take additional steps * to fix it up. + * INDEX_CREATE_IF_NOT_EXISTS: + * do not throw an error if a relation with the same name + * already exists. + * constr_flags: flags passed to index_constraint_create + * (only if INDEX_CREATE_ADD_CONSTRAINT is set) + * allow_system_table_mods: allow table to be a system catalog * is_internal: if true, post creation hook for new index - * if_not_exists: if true, do not throw an error if a relation with - * the same name already exists. * * Returns the OID of the created index. */ @@ -748,16 +754,11 @@ index_create(Relation heapRelation, Oid *classObjectId, int16 *coloptions, Datum reloptions, - bool isprimary, - bool isconstraint, - bool deferrable, - bool initdeferred, + bits16 flags, + bits16 constr_flags, bool allow_system_table_mods, - bool skip_build, - bool concurrent, - bool is_internal, - bool if_not_exists) -{// #lizard forgives + bool is_internal) +{ Oid heapRelationId = RelationGetRelid(heapRelation); Relation pg_class; Relation indexRelation; @@ -768,6 +769,12 @@ index_create(Relation heapRelation, Oid namespaceId; int i; char relpersistence; + bool isprimary = (flags & INDEX_CREATE_IS_PRIMARY) != 0; + bool concurrent = (flags & INDEX_CREATE_CONCURRENT) != 0; + + /* constraint flags can only be set when a constraint is requested */ + Assert((constr_flags == 0) || + ((flags & INDEX_CREATE_ADD_CONSTRAINT) != 0)); is_exclusion = (indexInfo->ii_ExclusionOps != NULL); @@ -833,7 +840,7 @@ index_create(Relation heapRelation, if (get_relname_relid(indexRelationName, namespaceId)) { - if (if_not_exists) + if ((flags & INDEX_CREATE_IF_NOT_EXISTS) != 0) { ereport(NOTICE, (errcode(ERRCODE_DUPLICATE_TABLE), @@ -956,7 +963,7 @@ index_create(Relation heapRelation, UpdateIndexRelation(indexRelationId, heapRelationId, indexInfo, collationObjectId, classObjectId, coloptions, isprimary, is_exclusion, - !deferrable, + (constr_flags & INDEX_CONSTR_CREATE_DEFERRABLE) == 0, !concurrent); /* @@ -982,7 +989,7 @@ index_create(Relation heapRelation, myself.objectId = indexRelationId; myself.objectSubId = 0; - if (isconstraint) + if ((flags & INDEX_CREATE_ADD_CONSTRAINT) != 0) { char constraintType; @@ -1003,11 +1010,7 @@ index_create(Relation heapRelation, indexInfo, indexRelationName, constraintType, - deferrable, - initdeferred, - false, /* already marked primary */ - false, /* pg_index entry is OK */ - false, /* no old dependencies */ + constr_flags, allow_system_table_mods, is_internal); } @@ -1044,10 +1047,6 @@ index_create(Relation heapRelation, recordDependencyOn(&myself, &referenced, DEPENDENCY_AUTO); } - - /* Non-constraint indexes can't be deferrable */ - Assert(!deferrable); - Assert(!initdeferred); } /* Store dependency on collations */ @@ -1098,9 +1097,7 @@ index_create(Relation heapRelation, else { /* Bootstrap mode - assert we weren't asked for constraint support */ - Assert(!isconstraint); - Assert(!deferrable); - Assert(!initdeferred); + Assert((flags & INDEX_CREATE_ADD_CONSTRAINT) == 0); } /* Post creation hook for new index */ @@ -1128,15 +1125,16 @@ index_create(Relation heapRelation, * If this is bootstrap (initdb) time, then we don't actually fill in the * index yet. We'll be creating more indexes and classes later, so we * delay filling them in until just before we're done with bootstrapping. - * Similarly, if the caller specified skip_build then filling the index is - * delayed till later (ALTER TABLE can save work in some cases with this). 
- * Otherwise, we call the AM routine that constructs the index. + * Similarly, if the caller specified to skip the build then filling the + * index is delayed till later (ALTER TABLE can save work in some cases + * with this). Otherwise, we call the AM routine that constructs the + * index. */ if (IsBootstrapProcessingMode()) { index_register(heapRelationId, indexRelationId, indexInfo); } - else if (skip_build) + else if ((flags & INDEX_CREATE_SKIP_BUILD) != 0) { /* * Caller is responsible for filling the index later on. However, @@ -1176,12 +1174,13 @@ index_create(Relation heapRelation, * constraintName: what it say (generally, should match name of index) * constraintType: one of CONSTRAINT_PRIMARY, CONSTRAINT_UNIQUE, or * CONSTRAINT_EXCLUSION - * deferrable: constraint is DEFERRABLE - * initdeferred: constraint is INITIALLY DEFERRED - * mark_as_primary: if true, set flags to mark index as primary key - * update_pgindex: if true, update pg_index row (else caller's done that) - * remove_old_dependencies: if true, remove existing dependencies of index - * on table's columns + * flags: bitmask that can include any combination of these bits: + * INDEX_CONSTR_CREATE_MARK_AS_PRIMARY: index is a PRIMARY KEY + * INDEX_CONSTR_CREATE_DEFERRABLE: constraint is DEFERRABLE + * INDEX_CONSTR_CREATE_INIT_DEFERRED: constraint is INITIALLY DEFERRED + * INDEX_CONSTR_CREATE_UPDATE_INDEX: update the pg_index row + * INDEX_CONSTR_CREATE_REMOVE_OLD_DEPS: remove existing dependencies + * of index on table's columns * allow_system_table_mods: allow table to be a system catalog * is_internal: index is constructed due to internal process */ @@ -1191,11 +1190,7 @@ index_constraint_create(Relation heapRelation, IndexInfo *indexInfo, const char *constraintName, char constraintType, - bool deferrable, - bool initdeferred, - bool mark_as_primary, - bool update_pgindex, - bool remove_old_dependencies, + bits16 constr_flags, bool allow_system_table_mods, bool is_internal) {// #lizard forgives @@ -1203,6 +1198,13 @@ index_constraint_create(Relation heapRelation, ObjectAddress myself, referenced; Oid conOid; + bool deferrable; + bool initdeferred; + bool mark_as_primary; + + deferrable = (constr_flags & INDEX_CONSTR_CREATE_DEFERRABLE) != 0; + initdeferred = (constr_flags & INDEX_CONSTR_CREATE_INIT_DEFERRED) != 0; + mark_as_primary = (constr_flags & INDEX_CONSTR_CREATE_MARK_AS_PRIMARY) != 0; /* constraint creation support doesn't work while bootstrapping */ Assert(!IsBootstrapProcessingMode()); @@ -1229,7 +1231,7 @@ index_constraint_create(Relation heapRelation, * has any expressions or predicate, but we'd never be turning such an * index into a UNIQUE or PRIMARY KEY constraint. */ - if (remove_old_dependencies) + if (constr_flags & INDEX_CONSTR_CREATE_REMOVE_OLD_DEPS) deleteDependencyRecordsForClass(RelationRelationId, indexRelationId, RelationRelationId, DEPENDENCY_AUTO); @@ -1334,7 +1336,8 @@ index_constraint_create(Relation heapRelation, * is a risk that concurrent readers of the table will miss seeing this * index at all. 
*/ - if (update_pgindex && (mark_as_primary || deferrable)) + if ((constr_flags & INDEX_CONSTR_CREATE_UPDATE_INDEX) && + (mark_as_primary || deferrable)) { Relation pg_index; HeapTuple indexTuple; diff --git a/src/backend/catalog/toasting.c b/src/backend/catalog/toasting.c index d908bfc3..325b72e9 100644 --- a/src/backend/catalog/toasting.c +++ b/src/backend/catalog/toasting.c @@ -396,8 +396,7 @@ create_toast_table(Relation rel, Oid toastOid, Oid toastIndexOid, BTREE_AM_OID, rel->rd_rel->reltablespace, collationObjectId, classObjectId, coloptions, (Datum) 0, - true, false, false, false, - true, false, false, true, false); + INDEX_CREATE_IS_PRIMARY, 0, true, true); heap_close(toast_rel, NoLock); diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c index 4596a9f4..af0b9947 100644 --- a/src/backend/commands/indexcmds.c +++ b/src/backend/commands/indexcmds.c @@ -340,6 +340,8 @@ DefineIndex(Oid relationId, Datum reloptions; int16 *coloptions; IndexInfo *indexInfo; + bits16 flags; + bits16 constr_flags; int numberOfAttributes; TransactionId limitXmin; VirtualTransactionId *old_snapshots; @@ -755,20 +757,35 @@ DefineIndex(Oid relationId, Assert(!OidIsValid(stmt->oldNode) || (skip_build && !stmt->concurrent)); /* - * Make the catalog entries for the index, including constraints. Then, if - * not skip_build || concurrent, actually build the index. + * Make the catalog entries for the index, including constraints. This + * step also actually builds the index, except if caller requested not to + * or in concurrent mode, in which case it'll be done later. */ + flags = constr_flags = 0; + if (stmt->isconstraint) + flags |= INDEX_CREATE_ADD_CONSTRAINT; + if (skip_build || stmt->concurrent) + flags |= INDEX_CREATE_SKIP_BUILD; + if (stmt->if_not_exists) + flags |= INDEX_CREATE_IF_NOT_EXISTS; + if (stmt->concurrent) + flags |= INDEX_CREATE_CONCURRENT; + if (stmt->primary) + flags |= INDEX_CREATE_IS_PRIMARY; + + if (stmt->deferrable) + constr_flags |= INDEX_CONSTR_CREATE_DEFERRABLE; + if (stmt->initdeferred) + constr_flags |= INDEX_CONSTR_CREATE_INIT_DEFERRED; + indexRelationId = index_create(rel, indexRelationName, indexRelationId, stmt->oldNode, indexInfo, indexColNames, accessMethodId, tablespaceId, collationObjectId, classObjectId, - coloptions, reloptions, stmt->primary, - stmt->isconstraint, stmt->deferrable, stmt->initdeferred, - allowSystemTableMods, - skip_build || stmt->concurrent, - stmt->concurrent, !check_rights, - stmt->if_not_exists); + coloptions, reloptions, + flags, constr_flags, + allowSystemTableMods, !check_rights); ObjectAddressSet(address, RelationRelationId, indexRelationId); diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index 9ab2e3ec..c6e42c3d 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -8630,6 +8630,7 @@ ATExecAddIndexConstraint(AlteredTableInfo *tab, Relation rel, char *constraintName; char constraintType; ObjectAddress address; + bits16 flags; Assert(IsA(stmt, IndexStmt)); Assert(OidIsValid(index_oid)); @@ -8674,16 +8675,18 @@ ATExecAddIndexConstraint(AlteredTableInfo *tab, Relation rel, constraintType = CONSTRAINT_UNIQUE; /* Create the catalog entries for the constraint */ + flags = INDEX_CONSTR_CREATE_UPDATE_INDEX | + INDEX_CONSTR_CREATE_REMOVE_OLD_DEPS | + (stmt->initdeferred ? INDEX_CONSTR_CREATE_INIT_DEFERRED : 0) | + (stmt->deferrable ? INDEX_CONSTR_CREATE_DEFERRABLE : 0) | + (stmt->primary ? 
INDEX_CONSTR_CREATE_MARK_AS_PRIMARY : 0); + address = index_constraint_create(rel, index_oid, indexInfo, constraintName, constraintType, - stmt->deferrable, - stmt->initdeferred, - stmt->primary, - true, /* update pg_index */ - true, /* remove old dependencies */ + flags, allowSystemTableMods, false); /* is_internal */ diff --git a/src/include/catalog/index.h b/src/include/catalog/index.h index 3afe88f8..4928dfd1 100644 --- a/src/include/catalog/index.h +++ b/src/include/catalog/index.h @@ -103,6 +103,12 @@ extern void index_check_primary_key(Relation heapRel, bool is_alter_table, IndexStmt *stmt); +#define INDEX_CREATE_IS_PRIMARY (1 << 0) +#define INDEX_CREATE_ADD_CONSTRAINT (1 << 1) +#define INDEX_CREATE_SKIP_BUILD (1 << 2) +#define INDEX_CREATE_CONCURRENT (1 << 3) +#define INDEX_CREATE_IF_NOT_EXISTS (1 << 4) + extern Oid index_create(Relation heapRelation, const char *indexRelationName, Oid indexRelationId, @@ -115,26 +121,23 @@ extern Oid index_create(Relation heapRelation, Oid *classObjectId, int16 *coloptions, Datum reloptions, - bool isprimary, - bool isconstraint, - bool deferrable, - bool initdeferred, + bits16 flags, + bits16 constr_flags, bool allow_system_table_mods, - bool skip_build, - bool concurrent, - bool is_internal, - bool if_not_exists); + bool is_internal); + +#define INDEX_CONSTR_CREATE_MARK_AS_PRIMARY (1 << 0) +#define INDEX_CONSTR_CREATE_DEFERRABLE (1 << 1) +#define INDEX_CONSTR_CREATE_INIT_DEFERRED (1 << 2) +#define INDEX_CONSTR_CREATE_UPDATE_INDEX (1 << 3) +#define INDEX_CONSTR_CREATE_REMOVE_OLD_DEPS (1 << 4) extern ObjectAddress index_constraint_create(Relation heapRelation, Oid indexRelationId, IndexInfo *indexInfo, const char *constraintName, char constraintType, - bool deferrable, - bool initdeferred, - bool mark_as_primary, - bool update_pgindex, - bool remove_old_dependencies, + bits16 constr_flags, bool allow_system_table_mods, bool is_internal); From 521f269f55bd55e22da864eae943816ebf9cf6ee Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Thu, 2 Jul 2020 19:59:12 +0800 Subject: [PATCH 293/578] Local partitioned indexes. 
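The feature added by this patch makes CREATE INDEX on a partitioned table recurse to every partition, and lets an index that already exists on a partition be attached to the parent index afterwards. A minimal sketch of the user-visible behavior, with hypothetical table and index names (distribution clauses omitted):

    CREATE TABLE measurement (city_id int, logdate date)
        PARTITION BY RANGE (logdate);
    CREATE TABLE measurement_y2020 PARTITION OF measurement
        FOR VALUES FROM ('2020-01-01') TO ('2021-01-01');

    -- Recurses: a matching index is created on (or attached for) each
    -- existing partition, and future partitions get one automatically.
    CREATE INDEX measurement_logdate_idx ON measurement (logdate);

    -- ON ONLY creates just the parent index, marked invalid; attaching a
    -- matching index for every partition turns it valid.
    CREATE INDEX measurement_city_idx ON ONLY measurement (city_id);
    CREATE INDEX measurement_y2020_city_idx ON measurement_y2020 (city_id);
    ALTER INDEX measurement_city_idx
        ATTACH PARTITION measurement_y2020_city_idx;
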
http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- doc/src/sgml/catalogs.sgml | 23 + doc/src/sgml/ref/alter_index.sgml | 14 + doc/src/sgml/ref/alter_table.sgml | 8 +- doc/src/sgml/ref/create_index.sgml | 33 +- doc/src/sgml/ref/reindex.sgml | 5 + src/backend/access/common/reloptions.c | 1 + src/backend/access/heap/heapam.c | 9 +- src/backend/access/index/indexam.c | 3 +- src/backend/bootstrap/bootparse.y | 2 + src/backend/catalog/aclchk.c | 9 +- src/backend/catalog/dependency.c | 14 +- src/backend/catalog/index.c | 201 +++++- src/backend/catalog/objectaddress.c | 5 +- src/backend/catalog/pg_depend.c | 13 +- src/backend/catalog/pg_inherits.c | 80 +++ src/backend/catalog/toasting.c | 2 + src/backend/commands/indexcmds.c | 394 ++++++++++- src/backend/commands/tablecmds.c | 665 +++++++++++++++++-- src/backend/nodes/copyfuncs.c | 1 + src/backend/nodes/equalfuncs.c | 1 + src/backend/nodes/outfuncs.c | 1 + src/backend/parser/gram.y | 33 +- src/backend/parser/parse_utilcmd.c | 55 +- src/backend/tcop/utility.c | 22 + src/backend/utils/adt/amutils.c | 3 +- src/backend/utils/adt/ruleutils.c | 17 +- src/backend/utils/cache/relcache.c | 40 +- src/bin/pg_dump/common.c | 102 +++ src/bin/pg_dump/pg_dump.c | 102 ++- src/bin/pg_dump/pg_dump.h | 11 + src/bin/pg_dump/pg_dump_sort.c | 56 +- src/bin/pg_dump/t/002_pg_dump.pl | 95 +++ src/bin/psql/describe.c | 20 +- src/bin/psql/tab-complete.c | 34 +- src/include/catalog/dependency.h | 15 + src/include/catalog/index.h | 10 + src/include/catalog/pg_class.h | 1 + src/include/catalog/pg_inherits_fn.h | 9 +- src/include/commands/defrem.h | 3 +- src/include/nodes/execnodes.h | 1 + src/include/nodes/parsenodes.h | 7 +- src/include/parser/parse_utilcmd.h | 3 + src/test/regress/expected/alter_table.out | 65 +- src/test/regress/expected/indexing.out | 757 ++++++++++++++++++++++ src/test/regress/parallel_schedule | 2 +- src/test/regress/serial_schedule | 1 + src/test/regress/sql/alter_table.sql | 16 + src/test/regress/sql/indexing.sql | 388 +++++++++++ 48 files changed, 3176 insertions(+), 176 deletions(-) create mode 100644 src/test/regress/expected/indexing.out create mode 100644 src/test/regress/sql/indexing.sql diff --git a/doc/src/sgml/catalogs.sgml b/doc/src/sgml/catalogs.sgml index fdac2074..399f8275 100644 --- a/doc/src/sgml/catalogs.sgml +++ b/doc/src/sgml/catalogs.sgml @@ -3008,6 +3008,29 @@ SCRAM-SHA-256$<iteration count>:<salt>< + + DEPENDENCY_INTERNAL_AUTO (I) + + + The dependent object was created as part of creation of the + referenced object, and is really just a part of its internal + implementation. A DROP of the dependent object + will be disallowed outright (we'll tell the user to issue a + DROP against the referenced object, instead). + While a regular internal dependency will prevent + the dependent object from being dropped while any such dependencies + remain, DEPENDENCY_INTERNAL_AUTO will allow such + a drop as long as the object can be found by following any of such + dependencies. + Example: an index on a partition is made internal-auto-dependent on + both the partition itself as well as on the index on the parent + partitioned table; so the partition index is dropped together with + either the partition it indexes, or with the parent index it is + attached to. 
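Continuing the hypothetical example above, the internal-auto dependency is what produces this drop behavior for a partition's index:

    -- Rejected: the partition index can only go away via an object it
    -- depends on (the parent index or the partition itself).
    DROP INDEX measurement_y2020_city_idx;

    -- Either of these removes it implicitly:
    DROP INDEX measurement_city_idx;   -- drop the parent index tree
    DROP TABLE measurement_y2020;      -- drop the partition it indexes
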
+ + + + DEPENDENCY_EXTENSION (e) diff --git a/doc/src/sgml/ref/alter_index.sgml b/doc/src/sgml/ref/alter_index.sgml index ad77b574..149a16bc 100644 --- a/doc/src/sgml/ref/alter_index.sgml +++ b/doc/src/sgml/ref/alter_index.sgml @@ -23,6 +23,7 @@ PostgreSQL documentation ALTER INDEX [ IF EXISTS ] name RENAME TO new_name ALTER INDEX [ IF EXISTS ] name SET TABLESPACE tablespace_name +ALTER INDEX name ATTACH PARTITION index_name ALTER INDEX name DEPENDS ON EXTENSION extension_name ALTER INDEX [ IF EXISTS ] name SET ( storage_parameter = value [, ... ] ) ALTER INDEX [ IF EXISTS ] name RESET ( storage_parameter [, ... ] ) @@ -73,6 +74,19 @@ ALTER INDEX ALL IN TABLESPACE name + + ATTACH PARTITION + + + Causes the named index to become attached to the altered index. + The named index must be on a partition of the table containing the + index being altered, and have an equivalent definition. An attached + index cannot be dropped by itself, and will automatically be dropped + if its parent index is dropped. + + + + DEPENDS ON EXTENSION diff --git a/doc/src/sgml/ref/alter_table.sgml b/doc/src/sgml/ref/alter_table.sgml index d9ddbd01..ba4c3d04 100644 --- a/doc/src/sgml/ref/alter_table.sgml +++ b/doc/src/sgml/ref/alter_table.sgml @@ -837,7 +837,10 @@ ALTER TABLE [ IF EXISTS ] name as a partition of the target table. The table can be attached as a partition for specific values using FOR VALUES or as a default partition by using DEFAULT - . + . For each index in the target table, a corresponding + one will be created in the attached table; or, if an equivalent + index already exists, will be attached to the target table's index, + as if ALTER INDEX ATTACH PARTITION had been executed. @@ -929,7 +932,8 @@ ALTER TABLE [ IF EXISTS ] name This form detaches specified partition of the target table. The detached partition continues to exist as a standalone table, but no longer has any - ties to the table from which it was detached. + ties to the table from which it was detached. Any indexes that were + attached to the target table's indexes are detached. diff --git a/doc/src/sgml/ref/create_index.sgml b/doc/src/sgml/ref/create_index.sgml index 6e59d73a..85634e5f 100644 --- a/doc/src/sgml/ref/create_index.sgml +++ b/doc/src/sgml/ref/create_index.sgml @@ -21,7 +21,7 @@ PostgreSQL documentation -CREATE [ UNIQUE ] INDEX [ CONCURRENTLY ] [ [ IF NOT EXISTS ] name ] ON table_name [ USING method ] +CREATE [ UNIQUE ] INDEX [ CONCURRENTLY ] [ [ IF NOT EXISTS ] name ] ON [ ONLY ] table_name [ USING method ] ( { column_name | ( expression ) } [ COLLATE collation ] [ opclass ] [ ASC | DESC ] [ NULLS { FIRST | LAST } ] [, ...] ) [ WITH ( storage_parameter = value [, ... ] ) ] [ TABLESPACE tablespace_name ] @@ -155,6 +155,16 @@ CREATE [ UNIQUE ] INDEX [ CONCURRENTLY ] [ [ IF NOT EXISTS ] + + ONLY + + + Indicates not to recurse creating indexes on partitions, if the + table is partitioned. The default is to recurse. + + + + table_name @@ -549,6 +559,27 @@ Indexes: linkend="xindex">. + + When CREATE INDEX is invoked on a partitioned + table, the default behavior is to recurse to all partitions to ensure + they all have matching indexes. + Each partition is first checked to determine whether an equivalent + index already exists, and if so, that index will become attached as a + partition index to the index being created, which will become its + parent index. 
+ If no matching index exists, a new index will be created and + automatically attached; the name of the new index in each partition + will be determined as if no index name had been specified in the + command. + If the ONLY option is specified, no recursion + is done, and the index is marked invalid + (ALTER INDEX ... ATTACH PARTITION turns the index + valid, once all partitions acquire the index.) Note, however, that + any partition that is created in the future using + CREATE TABLE ... PARTITION OF will automatically + contain the index regardless of whether this option was specified. + + For index methods that support ordered scans (currently, only B-tree), the optional clauses ASC, DESC, NULLS diff --git a/doc/src/sgml/ref/reindex.sgml b/doc/src/sgml/ref/reindex.sgml index 3908ade3..61d4c1e1 100644 --- a/doc/src/sgml/ref/reindex.sgml +++ b/doc/src/sgml/ref/reindex.sgml @@ -231,6 +231,11 @@ REINDEX [ ( VERBOSE ) ] { INDEX | TABLE | SCHEMA | DATABASE | SYSTEM } + + Reindexing partitioned tables or partitioned indexes is not supported. + Each individual partition can be reindexed separately instead. + + diff --git a/src/backend/access/common/reloptions.c b/src/backend/access/common/reloptions.c index 25b6394c..f3602fb6 100644 --- a/src/backend/access/common/reloptions.c +++ b/src/backend/access/common/reloptions.c @@ -1052,6 +1052,7 @@ extractRelOptions(HeapTuple tuple, TupleDesc tupdesc, options = view_reloptions(datum, false); break; case RELKIND_INDEX: + case RELKIND_PARTITIONED_INDEX: options = index_reloptions(amoptions, datum, false); break; case RELKIND_FOREIGN_TABLE: diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index 97064050..b0129032 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -1484,7 +1484,8 @@ heap_open(Oid relationId, LOCKMODE lockmode) r = relation_open(relationId, lockmode); - if (r->rd_rel->relkind == RELKIND_INDEX) + if (r->rd_rel->relkind == RELKIND_INDEX || + r->rd_rel->relkind == RELKIND_PARTITIONED_INDEX) ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), errmsg("\"%s\" is an index", @@ -1512,7 +1513,8 @@ heap_openrv(const RangeVar *relation, LOCKMODE lockmode) r = relation_openrv(relation, lockmode); - if (r->rd_rel->relkind == RELKIND_INDEX) + if (r->rd_rel->relkind == RELKIND_INDEX || + r->rd_rel->relkind == RELKIND_PARTITIONED_INDEX) ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), errmsg("\"%s\" is an index", @@ -1544,7 +1546,8 @@ heap_openrv_extended(const RangeVar *relation, LOCKMODE lockmode, if (r) { - if (r->rd_rel->relkind == RELKIND_INDEX) + if (r->rd_rel->relkind == RELKIND_INDEX || + r->rd_rel->relkind == RELKIND_PARTITIONED_INDEX) ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), errmsg("\"%s\" is an index", diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c index c7be5d3a..931f71cc 100644 --- a/src/backend/access/index/indexam.c +++ b/src/backend/access/index/indexam.c @@ -215,7 +215,8 @@ index_open(Oid relationId, LOCKMODE lockmode) r = relation_open(relationId, lockmode); - if (r->rd_rel->relkind != RELKIND_INDEX) + if (r->rd_rel->relkind != RELKIND_INDEX && + r->rd_rel->relkind != RELKIND_PARTITIONED_INDEX) ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), errmsg("\"%s\" is not an index", diff --git a/src/backend/bootstrap/bootparse.y b/src/backend/bootstrap/bootparse.y index 137c2dad..128b2e6c 100644 --- a/src/backend/bootstrap/bootparse.y +++ b/src/backend/bootstrap/bootparse.y @@ -386,6 +386,7 @@ Boot_DeclareIndexStmt: 
DefineIndex(relationId, stmt, $4, + InvalidOid, false, false, false, @@ -431,6 +432,7 @@ Boot_DeclareUniqueIndexStmt: DefineIndex(relationId, stmt, $5, + InvalidOid, false, false, false, diff --git a/src/backend/catalog/aclchk.c b/src/backend/catalog/aclchk.c index 73fdd150..26e9dfb9 100644 --- a/src/backend/catalog/aclchk.c +++ b/src/backend/catalog/aclchk.c @@ -1789,7 +1789,8 @@ ExecGrant_Relation(InternalGrant *istmt) pg_class_tuple = (Form_pg_class) GETSTRUCT(tuple); /* Not sensible to grant on an index */ - if (pg_class_tuple->relkind == RELKIND_INDEX) + if (pg_class_tuple->relkind == RELKIND_INDEX || + pg_class_tuple->relkind == RELKIND_PARTITIONED_INDEX) ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), errmsg("\"%s\" is an index", @@ -5384,7 +5385,8 @@ recordExtObjInitPriv(Oid objoid, Oid classoid) pg_class_tuple = (Form_pg_class) GETSTRUCT(tuple); /* Indexes don't have permissions */ - if (pg_class_tuple->relkind == RELKIND_INDEX) + if (pg_class_tuple->relkind == RELKIND_INDEX || + pg_class_tuple->relkind == RELKIND_PARTITIONED_INDEX) return; /* Composite types don't have permissions either */ @@ -5669,7 +5671,8 @@ removeExtObjInitPriv(Oid objoid, Oid classoid) pg_class_tuple = (Form_pg_class) GETSTRUCT(tuple); /* Indexes don't have permissions */ - if (pg_class_tuple->relkind == RELKIND_INDEX) + if (pg_class_tuple->relkind == RELKIND_INDEX || + pg_class_tuple->relkind == RELKIND_PARTITIONED_INDEX) return; /* Composite types don't have permissions either */ diff --git a/src/backend/catalog/dependency.c b/src/backend/catalog/dependency.c index 924d7f35..737e549d 100644 --- a/src/backend/catalog/dependency.c +++ b/src/backend/catalog/dependency.c @@ -728,6 +728,7 @@ findDependentObjects(const ObjectAddress *object, /* FALL THRU */ case DEPENDENCY_INTERNAL: + case DEPENDENCY_INTERNAL_AUTO: /* * This object is part of the internal implementation of @@ -779,6 +780,14 @@ findDependentObjects(const ObjectAddress *object, * transform this deletion request into a delete of this * owning object. * + * For INTERNAL_AUTO dependencies, we don't enforce this; + * in other words, we don't follow the links back to the + * owning object. + */ + if (foundDep->deptype == DEPENDENCY_INTERNAL_AUTO) + break; + + /* * First, release caller's lock on this object and get * deletion lock on the owning object. (We must release * caller's lock to avoid deadlock against a concurrent @@ -821,6 +830,7 @@ findDependentObjects(const ObjectAddress *object, /* And we're done here. 
*/ systable_endscan(scan); return; + case DEPENDENCY_PIN: /* @@ -918,6 +928,7 @@ findDependentObjects(const ObjectAddress *object, case DEPENDENCY_AUTO_EXTENSION: subflags = DEPFLAG_AUTO; break; + case DEPENDENCY_INTERNAL_AUTO: case DEPENDENCY_INTERNAL: subflags = DEPFLAG_INTERNAL; break; @@ -1267,7 +1278,8 @@ doDeletion(const ObjectAddress *object, int flags) { char relKind = get_rel_relkind(object->objectId); - if (relKind == RELKIND_INDEX) + if (relKind == RELKIND_INDEX || + relKind == RELKIND_PARTITIONED_INDEX) { bool concurrent = ((flags & PERFORM_DELETION_CONCURRENTLY) != 0); diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c index db5d16ee..81c91015 100644 --- a/src/backend/catalog/index.c +++ b/src/backend/catalog/index.c @@ -41,6 +41,8 @@ #include "catalog/pg_collation.h" #include "catalog/pg_constraint.h" #include "catalog/pg_constraint_fn.h" +#include "catalog/pg_depend.h" +#include "catalog/pg_inherits_fn.h" #include "catalog/pg_operator.h" #include "catalog/pg_opclass.h" #include "catalog/pg_tablespace.h" @@ -56,6 +58,7 @@ #include "nodes/nodeFuncs.h" #include "optimizer/clauses.h" #include "parser/parser.h" +#include "rewrite/rewriteManip.h" #include "pgxc/pgxc.h" #include "storage/bufmgr.h" #include "storage/lmgr.h" @@ -103,6 +106,7 @@ static void InitializeAttributeOids(Relation indexRelation, int numatts, Oid indexoid); static void AppendAttributeTuples(Relation indexRelation, int numatts); static void UpdateIndexRelation(Oid indexoid, Oid heapoid, + Oid parentIndexId, IndexInfo *indexInfo, Oid *collationOids, Oid *classOids, @@ -110,7 +114,8 @@ static void UpdateIndexRelation(Oid indexoid, Oid heapoid, bool primary, bool isexclusion, bool immediate, - bool isvalid); + bool isvalid, + bool isready); static void index_update_stats(Relation rel, bool hasindex, bool isprimary, double reltuples); @@ -590,6 +595,7 @@ AppendAttributeTuples(Relation indexRelation, int numatts) static void UpdateIndexRelation(Oid indexoid, Oid heapoid, + Oid parentIndexOid, IndexInfo *indexInfo, Oid *collationOids, Oid *classOids, @@ -597,7 +603,8 @@ UpdateIndexRelation(Oid indexoid, bool primary, bool isexclusion, bool immediate, - bool isvalid) + bool isvalid, + bool isready) { int2vector *indkey; oidvector *indcollation; @@ -671,8 +678,7 @@ UpdateIndexRelation(Oid indexoid, values[Anum_pg_index_indisclustered - 1] = BoolGetDatum(false); values[Anum_pg_index_indisvalid - 1] = BoolGetDatum(isvalid); values[Anum_pg_index_indcheckxmin - 1] = BoolGetDatum(false); - /* we set isvalid and isready the same way */ - values[Anum_pg_index_indisready - 1] = BoolGetDatum(isvalid); + values[Anum_pg_index_indisready - 1] = BoolGetDatum(isready); values[Anum_pg_index_indislive - 1] = BoolGetDatum(true); values[Anum_pg_index_indisreplident - 1] = BoolGetDatum(false); values[Anum_pg_index_indkey - 1] = PointerGetDatum(indkey); @@ -709,6 +715,8 @@ UpdateIndexRelation(Oid indexoid, * indexRelationId: normally, pass InvalidOid to let this routine * generate an OID for the index. During bootstrap this may be * nonzero to specify a preselected OID. + * parentIndexRelid: if creating an index partition, the OID of the + * parent index; otherwise InvalidOid. * relFileNode: normally, pass InvalidOid to get new storage. May be * nonzero to attach an existing valid build. * indexInfo: same info executor uses to insert into the index @@ -734,6 +742,8 @@ UpdateIndexRelation(Oid indexoid, * INDEX_CREATE_IF_NOT_EXISTS: * do not throw an error if a relation with the same name * already exists. 
+ * INDEX_CREATE_PARTITIONED: + * create a partitioned index (table must be partitioned) * constr_flags: flags passed to index_constraint_create * (only if INDEX_CREATE_ADD_CONSTRAINT is set) * allow_system_table_mods: allow table to be a system catalog @@ -745,6 +755,7 @@ Oid index_create(Relation heapRelation, const char *indexRelationName, Oid indexRelationId, + Oid parentIndexRelid, Oid relFileNode, IndexInfo *indexInfo, List *indexColNames, @@ -770,12 +781,18 @@ index_create(Relation heapRelation, int i; char relpersistence; bool isprimary = (flags & INDEX_CREATE_IS_PRIMARY) != 0; + bool invalid = (flags & INDEX_CREATE_INVALID) != 0; bool concurrent = (flags & INDEX_CREATE_CONCURRENT) != 0; + bool partitioned = (flags & INDEX_CREATE_PARTITIONED) != 0; + char relkind; /* constraint flags can only be set when a constraint is requested */ Assert((constr_flags == 0) || ((flags & INDEX_CREATE_ADD_CONSTRAINT) != 0)); + /* partitioned indexes must never be "built" by themselves */ + Assert(!partitioned || (flags & INDEX_CREATE_SKIP_BUILD)); + relkind = partitioned ? RELKIND_PARTITIONED_INDEX : RELKIND_INDEX; is_exclusion = (indexInfo->ii_ExclusionOps != NULL); pg_class = heap_open(RelationRelationId, RowExclusiveLock); @@ -893,9 +910,9 @@ index_create(Relation heapRelation, } /* - * create the index relation's relcache entry and physical disk file. (If - * we fail further down, it's the smgr's responsibility to remove the disk - * file again.) + * create the index relation's relcache entry and, if necessary, the + * physical disk file. (If we fail further down, it's the smgr's + * responsibility to remove the disk file again, if any.) */ indexRelation = heap_create(indexRelationName, namespaceId, @@ -903,7 +920,7 @@ index_create(Relation heapRelation, indexRelationId, relFileNode, indexTupDesc, - RELKIND_INDEX, + relkind, relpersistence, shared_relation, mapped_relation, @@ -960,12 +977,18 @@ index_create(Relation heapRelation, * (Or, could define a rule to maintain the predicate) --Nels, Feb '92 * ---------------- */ - UpdateIndexRelation(indexRelationId, heapRelationId, indexInfo, + UpdateIndexRelation(indexRelationId, heapRelationId, parentIndexRelid, + indexInfo, collationObjectId, classObjectId, coloptions, isprimary, is_exclusion, (constr_flags & INDEX_CONSTR_CREATE_DEFERRABLE) == 0, + !concurrent && !invalid, !concurrent); + /* update pg_inherits, if needed */ + if (OidIsValid(parentIndexRelid)) + StoreSingleInheritance(indexRelationId, parentIndexRelid, 1); + /* * Register constraint and dependencies for the index. * @@ -1017,6 +1040,9 @@ index_create(Relation heapRelation, else { bool have_simple_col = false; + DependencyType deptype; + + deptype = OidIsValid(parentIndexRelid) ? 
DEPENDENCY_INTERNAL_AUTO : DEPENDENCY_AUTO; /* Create auto dependencies on simply-referenced columns */ for (i = 0; i < indexInfo->ii_NumIndexAttrs; i++) @@ -1027,7 +1053,7 @@ index_create(Relation heapRelation, referenced.objectId = heapRelationId; referenced.objectSubId = indexInfo->ii_KeyAttrNumbers[i]; - recordDependencyOn(&myself, &referenced, DEPENDENCY_AUTO); + recordDependencyOn(&myself, &referenced, deptype); have_simple_col = true; } @@ -1045,8 +1071,18 @@ index_create(Relation heapRelation, referenced.objectId = heapRelationId; referenced.objectSubId = 0; - recordDependencyOn(&myself, &referenced, DEPENDENCY_AUTO); + recordDependencyOn(&myself, &referenced, deptype); + } } + + /* Store dependency on parent index, if any */ + if (OidIsValid(parentIndexRelid)) + { + referenced.classId = RelationRelationId; + referenced.objectId = parentIndexRelid; + referenced.objectSubId = 0; + + recordDependencyOn(&myself, &referenced, DEPENDENCY_INTERNAL_AUTO); } /* Store dependency on collations */ @@ -1598,8 +1634,9 @@ index_drop(Oid indexId, bool concurrent) } /* - * Schedule physical removal of the files + * Schedule physical removal of the files (if any) */ + if (userIndexRelation->rd_rel->relkind != RELKIND_PARTITIONED_INDEX) RelationDropStorage(userIndexRelation); /* @@ -1649,6 +1686,11 @@ index_drop(Oid indexId, bool concurrent) DeleteRelationTuple(indexId); /* + * fix INHERITS relation + */ + DeleteInheritsTuple(indexId, InvalidOid); + + /* * We are presently too lazy to attempt to compute the new correct value * of relhasindex (the next VACUUM will fix it if necessary). So there is * no need to update the pg_class tuple for the owning relation. But we @@ -1741,12 +1783,120 @@ BuildIndexInfo(Relation index) ii->ii_BrokenHotChain = false; /* set up for possible use by index AM */ + ii->ii_Am = index->rd_rel->relam; ii->ii_AmCache = NULL; ii->ii_Context = CurrentMemoryContext; return ii; } +/* + * CompareIndexInfo + * Return whether the properties of two indexes (in different tables) + * indicate that they have the "same" definitions. + * + * Note: passing collations and opfamilies separately is a kludge. Adding + * them to IndexInfo may result in better coding here and elsewhere. + * + * Use convert_tuples_by_name_map(index2, index1) to build the attmap. + */ +bool +CompareIndexInfo(IndexInfo *info1, IndexInfo *info2, + Oid *collations1, Oid *collations2, + Oid *opfamilies1, Oid *opfamilies2, + AttrNumber *attmap, int maplen) +{ + int i; + + if (info1->ii_Unique != info2->ii_Unique) + return false; + + /* indexes are only equivalent if they have the same access method */ + if (info1->ii_Am != info2->ii_Am) + return false; + + /* and same number of attributes */ + if (info1->ii_NumIndexAttrs != info2->ii_NumIndexAttrs) + return false; + + /* + * and columns match through the attribute map (actual attribute numbers + * might differ!) Note that this implies that index columns that are + * expressions appear in the same positions. We will next compare the + * expressions themselves. + */ + for (i = 0; i < info1->ii_NumIndexAttrs; i++) + { + if (maplen < info2->ii_KeyAttrNumbers[i]) + elog(ERROR, "incorrect attribute map"); + + if (attmap[info2->ii_KeyAttrNumbers[i] - 1] != + info1->ii_KeyAttrNumbers[i]) + return false; + + if (collations1[i] != collations2[i]) + return false; + if (opfamilies1[i] != opfamilies2[i]) + return false; + } + + /* + * For expression indexes: either both are expression indexes, or neither + * is; if they are, make sure the expressions match. 
+ */ + if ((info1->ii_Expressions != NIL) != (info2->ii_Expressions != NIL)) + return false; + if (info1->ii_Expressions != NIL) + { + bool found_whole_row; + Node *mapped; + + mapped = map_variable_attnos((Node *) info2->ii_Expressions, + 1, 0, attmap, maplen, + InvalidOid, &found_whole_row); + if (found_whole_row) + { + /* + * we could throw an error here, but seems out of scope for this + * routine. + */ + return false; + } + + if (!equal(info1->ii_Expressions, mapped)) + return false; + } + + /* Partial index predicates must be identical, if they exist */ + if ((info1->ii_Predicate == NULL) != (info2->ii_Predicate == NULL)) + return false; + if (info1->ii_Predicate != NULL) + { + bool found_whole_row; + Node *mapped; + + mapped = map_variable_attnos((Node *) info2->ii_Predicate, + 1, 0, attmap, maplen, + InvalidOid, &found_whole_row); + if (found_whole_row) + { + /* + * we could throw an error here, but seems out of scope for this + * routine. + */ + return false; + } + if (!equal(info1->ii_Predicate, mapped)) + return false; + } + + /* No support currently for comparing exclusion indexes. */ + if (info1->ii_ExclusionOps != NULL || info2->ii_ExclusionOps != NULL) + return false; + + return true; +} + /* ---------------- * BuildSpeculativeIndexInfo * Add extra state to IndexInfo record @@ -1969,6 +2119,9 @@ index_update_stats(Relation rel, elog(ERROR, "could not find tuple for relation %u", relid); rd_rel = (Form_pg_class) GETSTRUCT(tuple); + /* Should this be a more comprehensive test? */ + Assert(rd_rel->relkind != RELKIND_PARTITIONED_INDEX); + /* Apply required updates, if any, to copied tuple */ dirty = false; @@ -3416,6 +3569,14 @@ reindex_index(Oid indexId, bool skip_constraint_checks, char persistence, iRel = index_open(indexId, AccessExclusiveLock); /* + * The case of reindexing partitioned tables and indexes is handled + * differently by upper layers, so this case shouldn't arise. + */ + if (iRel->rd_rel->relkind == RELKIND_PARTITIONED_INDEX) + elog(ERROR, "unsupported relation kind for index \"%s\"", + RelationGetRelationName(iRel)); + + /* * Don't allow reindex on temp tables of other backends ... their local * buffer manager is not going to cope. */ @@ -3614,6 +3775,22 @@ reindex_relation(Oid relid, int flags, int options) */ rel = heap_open(relid, ShareLock); + /* + * This may be useful when implemented someday; but that day is not today. + * For now, avoid erroring out when called in a multi-table context + * (REINDEX SCHEMA) and happen to come across a partitioned table. The + * partitions may be reindexed on their own anyway. 
+ */ + if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) + { + ereport(WARNING, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("REINDEX of partitioned tables is not yet implemented, skipping \"%s\"", + RelationGetRelationName(rel)))); + heap_close(rel, ShareLock); + return false; + } + toast_relid = rel->rd_rel->reltoastrelid; /* diff --git a/src/backend/catalog/objectaddress.c b/src/backend/catalog/objectaddress.c index 7ff21d2d..1a45c53b 100644 --- a/src/backend/catalog/objectaddress.c +++ b/src/backend/catalog/objectaddress.c @@ -1299,7 +1299,8 @@ get_relation_by_qualified_name(ObjectType objtype, List *object, switch (objtype) { case OBJECT_INDEX: - if (relation->rd_rel->relkind != RELKIND_INDEX) + if (relation->rd_rel->relkind != RELKIND_INDEX && + relation->rd_rel->relkind != RELKIND_PARTITIONED_INDEX) ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), errmsg("\"%s\" is not an index", @@ -3628,6 +3629,7 @@ getRelationDescription(StringInfo buffer, Oid relid) relname); break; case RELKIND_INDEX: + case RELKIND_PARTITIONED_INDEX: appendStringInfo(buffer, _("index %s"), relname); break; @@ -4144,6 +4146,7 @@ getRelationTypeDescription(StringInfo buffer, Oid relid, int32 objectSubId) appendStringInfoString(buffer, "table"); break; case RELKIND_INDEX: + case RELKIND_PARTITIONED_INDEX: appendStringInfoString(buffer, "index"); break; case RELKIND_SEQUENCE: diff --git a/src/backend/catalog/pg_depend.c b/src/backend/catalog/pg_depend.c index 68711d22..bf20cc54 100644 --- a/src/backend/catalog/pg_depend.c +++ b/src/backend/catalog/pg_depend.c @@ -656,14 +656,19 @@ get_constraint_index(Oid constraintId) /* * We assume any internal dependency of an index on the constraint - * must be what we are looking for. (The relkind test is just - * paranoia; there shouldn't be any such dependencies otherwise.) + * must be what we are looking for. */ if (deprec->classid == RelationRelationId && deprec->objsubid == 0 && - deprec->deptype == DEPENDENCY_INTERNAL && - get_rel_relkind(deprec->objid) == RELKIND_INDEX) + deprec->deptype == DEPENDENCY_INTERNAL) { + char relkind = get_rel_relkind(deprec->objid); + + /* This is pure paranoia; there shouldn't be any such */ + if (relkind != RELKIND_INDEX && + relkind != RELKIND_PARTITIONED_INDEX) + break; + indexId = deprec->objid; break; } diff --git a/src/backend/catalog/pg_inherits.c b/src/backend/catalog/pg_inherits.c index 84e2fa04..351bd788 100644 --- a/src/backend/catalog/pg_inherits.c +++ b/src/backend/catalog/pg_inherits.c @@ -400,3 +400,83 @@ typeInheritsFrom(Oid subclassTypeId, Oid superclassTypeId) return result; } + +/* + * Create a single pg_inherits row with the given data + */ +void +StoreSingleInheritance(Oid relationId, Oid parentOid, int32 seqNumber) +{ + Datum values[Natts_pg_inherits]; + bool nulls[Natts_pg_inherits]; + HeapTuple tuple; + Relation inhRelation; + + inhRelation = heap_open(InheritsRelationId, RowExclusiveLock); + + /* + * Make the pg_inherits entry + */ + values[Anum_pg_inherits_inhrelid - 1] = ObjectIdGetDatum(relationId); + values[Anum_pg_inherits_inhparent - 1] = ObjectIdGetDatum(parentOid); + values[Anum_pg_inherits_inhseqno - 1] = Int32GetDatum(seqNumber); + + memset(nulls, 0, sizeof(nulls)); + + tuple = heap_form_tuple(RelationGetDescr(inhRelation), values, nulls); + + CatalogTupleInsert(inhRelation, tuple); + + heap_freetuple(tuple); + + heap_close(inhRelation, RowExclusiveLock); +} + +/* + * DeleteInheritsTuple + * + * Delete pg_inherits tuples with the given inhrelid. 
inhparent may be given + * as InvalidOid, in which case all tuples matching inhrelid are deleted; + * otherwise only delete tuples with the specified inhparent. + * + * Returns whether at least one row was deleted. + */ +bool +DeleteInheritsTuple(Oid inhrelid, Oid inhparent) +{ + bool found = false; + Relation catalogRelation; + ScanKeyData key; + SysScanDesc scan; + HeapTuple inheritsTuple; + + /* + * Find pg_inherits entries by inhrelid. + */ + catalogRelation = heap_open(InheritsRelationId, RowExclusiveLock); + ScanKeyInit(&key, + Anum_pg_inherits_inhrelid, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(inhrelid)); + scan = systable_beginscan(catalogRelation, InheritsRelidSeqnoIndexId, + true, NULL, 1, &key); + + while (HeapTupleIsValid(inheritsTuple = systable_getnext(scan))) + { + Oid parent; + + /* Compare inhparent if it was given, and do the actual deletion. */ + parent = ((Form_pg_inherits) GETSTRUCT(inheritsTuple))->inhparent; + if (!OidIsValid(inhparent) || parent == inhparent) + { + CatalogTupleDelete(catalogRelation, &inheritsTuple->t_self); + found = true; + } + } + + /* Done */ + systable_endscan(scan); + heap_close(catalogRelation, RowExclusiveLock); + + return found; +} diff --git a/src/backend/catalog/toasting.c b/src/backend/catalog/toasting.c index 325b72e9..a82b2037 100644 --- a/src/backend/catalog/toasting.c +++ b/src/backend/catalog/toasting.c @@ -378,6 +378,7 @@ create_toast_table(Relation rel, Oid toastOid, Oid toastIndexOid, indexInfo->ii_ReadyForInserts = true; indexInfo->ii_Concurrent = false; indexInfo->ii_BrokenHotChain = false; + indexInfo->ii_Am = BTREE_AM_OID; indexInfo->ii_AmCache = NULL; indexInfo->ii_Context = CurrentMemoryContext; @@ -391,6 +392,7 @@ create_toast_table(Relation rel, Oid toastOid, Oid toastIndexOid, coloptions[1] = 0; index_create(toast_rel, toast_idxname, toastIndexOid, InvalidOid, + InvalidOid, indexInfo, list_make2("chunk_id", "chunk_seq"), BTREE_AM_OID, diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c index af0b9947..22c2348e 100644 --- a/src/backend/commands/indexcmds.c +++ b/src/backend/commands/indexcmds.c @@ -25,7 +25,10 @@ #include "catalog/catalog.h" #include "catalog/index.h" #include "catalog/indexing.h" +#include "catalog/partition.h" #include "catalog/pg_am.h" +#include "catalog/pg_inherits.h" +#include "catalog/pg_inherits_fn.h" #include "catalog/pg_opclass.h" #include "catalog/pg_opfamily.h" #include "catalog/pg_tablespace.h" @@ -38,6 +41,7 @@ #include "commands/tablespace.h" #include "mb/pg_wchar.h" #include "miscadmin.h" +#include "nodes/makefuncs.h" #include "nodes/nodeFuncs.h" #include "optimizer/clauses.h" #include "optimizer/planner.h" @@ -45,6 +49,7 @@ #include "parser/parse_coerce.h" #include "parser/parse_func.h" #include "parser/parse_oper.h" +#include "rewrite/rewriteManip.h" #ifdef PGXC #include "parser/parse_utilcmd.h" #include "pgxc/pgxc.h" @@ -84,6 +89,7 @@ static char *ChooseIndexNameAddition(List *colnames); static List *ChooseIndexColumnNames(List *indexElems); static void RangeVarCallbackForReindexIndex(const RangeVar *relation, Oid relId, Oid oldRelId, void *arg); +static void ReindexPartitionedIndex(Relation parentIdx); /* * CheckIndexCompatible @@ -190,6 +196,7 @@ CheckIndexCompatible(Oid oldId, indexInfo->ii_ExclusionOps = NULL; indexInfo->ii_ExclusionProcs = NULL; indexInfo->ii_ExclusionStrats = NULL; + indexInfo->ii_Am = accessMethodId; indexInfo->ii_AmCache = NULL; indexInfo->ii_Context = CurrentMemoryContext; typeObjectId = (Oid *) palloc(numberOfAttributes 
* sizeof(Oid)); @@ -299,14 +306,15 @@ CheckIndexCompatible(Oid oldId, * 'stmt': IndexStmt describing the properties of the new index. * 'indexRelationId': normally InvalidOid, but during bootstrap can be * nonzero to specify a preselected OID for the index. + * 'parentIndexId': the OID of the parent index; InvalidOid if not the child + * of a partitioned index. * 'is_alter_table': this is due to an ALTER rather than a CREATE operation. * 'check_rights': check for CREATE rights in namespace and tablespace. (This * should be true except when ALTER is deleting/recreating an index.) * 'check_not_in_use': check for table not already in use in current session. * This should be true unless caller is holding the table open, in which * case the caller had better have checked it earlier. - * 'skip_build': make the catalog entries but leave the index file empty; - * it will be filled later. + * 'skip_build': make the catalog entries but don't create the index files * 'quiet': suppress the NOTICE chatter ordinarily provided for constraints. * * Returns the object address of the created index. @@ -315,6 +323,7 @@ ObjectAddress DefineIndex(Oid relationId, IndexStmt *stmt, Oid indexRelationId, + Oid parentIndexId, bool is_alter_table, bool check_rights, bool check_not_in_use, @@ -337,6 +346,7 @@ DefineIndex(Oid relationId, IndexAmRoutine *amRoutine; bool amcanorder; amoptions_function amoptions; + bool partitioned; Datum reloptions; int16 *coloptions; IndexInfo *indexInfo; @@ -399,7 +409,8 @@ DefineIndex(Oid relationId, namespaceId = RelationGetNamespace(rel); if (rel->rd_rel->relkind != RELKIND_RELATION && - rel->rd_rel->relkind != RELKIND_MATVIEW) + rel->rd_rel->relkind != RELKIND_MATVIEW && + rel->rd_rel->relkind != RELKIND_PARTITIONED_TABLE) { if (rel->rd_rel->relkind == RELKIND_FOREIGN_TABLE) @@ -411,11 +422,6 @@ DefineIndex(Oid relationId, (errcode(ERRCODE_WRONG_OBJECT_TYPE), errmsg("cannot create index on foreign table \"%s\"", RelationGetRelationName(rel)))); - else if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) - ereport(ERROR, - (errcode(ERRCODE_WRONG_OBJECT_TYPE), - errmsg("cannot create index on partitioned table \"%s\"", - RelationGetRelationName(rel)))); else ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), @@ -424,6 +430,38 @@ DefineIndex(Oid relationId, } /* + * Establish behavior for partitioned tables, and verify sanity of + * parameters. + * + * We do not build an actual index in this case; we only create a few + * catalog entries. The actual indexes are built by recursing for each + * partition. + */ + partitioned = rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE; + if (partitioned) + { + if (stmt->concurrent) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot create index on partitioned table \"%s\" concurrently", + RelationGetRelationName(rel)))); + if (stmt->unique) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot create unique index on partitioned table \"%s\"", + RelationGetRelationName(rel)))); + if (stmt->excludeOpNames) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot create exclusion constraints on partitioned table \"%s\"", + RelationGetRelationName(rel)))); + if (stmt->primary || stmt->isconstraint) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot create constraints on partitioned tables"))); + } + + /* * Don't try to CREATE INDEX on temp tables of other backends. 
*/ if (RELATION_IS_OTHER_TEMP(rel)) @@ -668,6 +706,7 @@ DefineIndex(Oid relationId, indexInfo->ii_ReadyForInserts = !stmt->concurrent; indexInfo->ii_Concurrent = stmt->concurrent; indexInfo->ii_BrokenHotChain = false; + indexInfo->ii_Am = accessMethodId; indexInfo->ii_AmCache = NULL; indexInfo->ii_Context = CurrentMemoryContext; @@ -759,19 +798,24 @@ DefineIndex(Oid relationId, /* * Make the catalog entries for the index, including constraints. This * step also actually builds the index, except if caller requested not to - * or in concurrent mode, in which case it'll be done later. + * or in concurrent mode, in which case it'll be done later, or + * doing a partitioned index (because those don't have storage). */ flags = constr_flags = 0; if (stmt->isconstraint) flags |= INDEX_CREATE_ADD_CONSTRAINT; - if (skip_build || stmt->concurrent) + if (skip_build || stmt->concurrent || partitioned) flags |= INDEX_CREATE_SKIP_BUILD; if (stmt->if_not_exists) flags |= INDEX_CREATE_IF_NOT_EXISTS; if (stmt->concurrent) flags |= INDEX_CREATE_CONCURRENT; + if (partitioned) + flags |= INDEX_CREATE_PARTITIONED; if (stmt->primary) flags |= INDEX_CREATE_IS_PRIMARY; + if (partitioned && stmt->relation && !stmt->relation->inh) + flags |= INDEX_CREATE_INVALID; if (stmt->deferrable) constr_flags |= INDEX_CONSTR_CREATE_DEFERRABLE; @@ -779,8 +823,8 @@ DefineIndex(Oid relationId, constr_flags |= INDEX_CONSTR_CREATE_INIT_DEFERRED; indexRelationId = - index_create(rel, indexRelationName, indexRelationId, stmt->oldNode, - indexInfo, indexColNames, + index_create(rel, indexRelationName, indexRelationId, parentIndexId, + stmt->oldNode, indexInfo, indexColNames, accessMethodId, tablespaceId, collationObjectId, classObjectId, coloptions, reloptions, @@ -807,6 +851,160 @@ DefineIndex(Oid relationId, CreateComments(indexRelationId, RelationRelationId, 0, stmt->idxcomment); + if (partitioned) + { + /* + * Unless caller specified to skip this step (via ONLY), process + * each partition to make sure they all contain a corresponding index. + * + * If we're called internally (no stmt->relation), recurse always. + */ + if (!stmt->relation || stmt->relation->inh) + { + PartitionDesc partdesc = RelationGetPartitionDesc(rel); + int nparts = partdesc->nparts; + Oid *part_oids = palloc(sizeof(Oid) * nparts); + bool invalidate_parent = false; + TupleDesc parentDesc; + Oid *opfamOids; + + memcpy(part_oids, partdesc->oids, sizeof(Oid) * nparts); + + parentDesc = CreateTupleDescCopy(RelationGetDescr(rel)); + opfamOids = palloc(sizeof(Oid) * numberOfAttributes); + for (i = 0; i < numberOfAttributes; i++) + opfamOids[i] = get_opclass_family(classObjectId[i]); + + heap_close(rel, NoLock); + + /* + * For each partition, scan all existing indexes; if one matches + * our index definition and is not already attached to some other + * parent index, attach it to the one we just created. + * + * If none matches, build a new index by calling ourselves + * recursively with the same options (except for the index name). 
+ */ + for (i = 0; i < nparts; i++) + { + Oid childRelid = part_oids[i]; + Relation childrel; + List *childidxs; + ListCell *cell; + AttrNumber *attmap; + bool found = false; + int maplen; + + childrel = heap_open(childRelid, lockmode); + childidxs = RelationGetIndexList(childrel); + attmap = + convert_tuples_by_name_map(RelationGetDescr(childrel), + parentDesc, + gettext_noop("could not convert row type")); + maplen = parentDesc->natts; + + + foreach(cell, childidxs) + { + Oid cldidxid = lfirst_oid(cell); + Relation cldidx; + IndexInfo *cldIdxInfo; + + /* this index is already partition of another one */ + if (has_superclass(cldidxid)) + continue; + + cldidx = index_open(cldidxid, lockmode); + cldIdxInfo = BuildIndexInfo(cldidx); + if (CompareIndexInfo(cldIdxInfo, indexInfo, + cldidx->rd_indcollation, + collationObjectId, + cldidx->rd_opfamily, + opfamOids, + attmap, maplen)) + { + /* + * Found a match. Attach index to parent and we're + * done, but keep lock till commit. + */ + IndexSetParentIndex(cldidx, indexRelationId); + + if (!IndexIsValid(cldidx->rd_index)) + invalidate_parent = true; + + found = true; + index_close(cldidx, NoLock); + break; + } + + index_close(cldidx, lockmode); + } + + list_free(childidxs); + heap_close(childrel, NoLock); + + /* + * If no matching index was found, create our own. + */ + if (!found) + { + IndexStmt *childStmt = copyObject(stmt); + bool found_whole_row; + + childStmt->whereClause = + map_variable_attnos(stmt->whereClause, 1, 0, + attmap, maplen, + InvalidOid, &found_whole_row); + if (found_whole_row) + elog(ERROR, "cannot convert whole-row table reference"); + + childStmt->idxname = NULL; + childStmt->relationId = childRelid; + DefineIndex(childRelid, childStmt, + InvalidOid, /* no predefined OID */ + indexRelationId, /* this is our child */ + false, check_rights, check_not_in_use, + false, quiet); + } + + pfree(attmap); + } + + /* + * The pg_index row we inserted for this index was marked + * indisvalid=true. But if we attached an existing index that + * is invalid, this is incorrect, so update our row to + * invalid too. + */ + if (invalidate_parent) + { + Relation pg_index = heap_open(IndexRelationId, RowExclusiveLock); + HeapTuple tup, + newtup; + + tup = SearchSysCache1(INDEXRELID, + ObjectIdGetDatum(indexRelationId)); + if (!tup) + elog(ERROR, "cache lookup failed for index %u", + indexRelationId); + newtup = heap_copytuple(tup); + ((Form_pg_index) GETSTRUCT(newtup))->indisvalid = false; + CatalogTupleUpdate(pg_index, &tup->t_self, newtup); + ReleaseSysCache(tup); + heap_close(pg_index, RowExclusiveLock); + heap_freetuple(newtup); + } + } + else + heap_close(rel, NoLock); + + /* + * Indexes on partitioned tables are not themselves built, so we're + * done here. + */ + return address; + } + if (!stmt->concurrent) { /* Close the heap and we're done, in the non-concurrent case */ @@ -1904,7 +2102,7 @@ ChooseIndexColumnNames(List *indexElems) * ReindexIndex * Recreate a specific index. */ -Oid +void ReindexIndex(RangeVar *indexRelation, int options) { Oid indOid; @@ -1927,12 +2125,17 @@ ReindexIndex(RangeVar *indexRelation, int options) * lock on the index. 
*/ irel = index_open(indOid, NoLock); + + if (irel->rd_rel->relkind == RELKIND_PARTITIONED_INDEX) + { + ReindexPartitionedIndex(irel); + return; + } + persistence = irel->rd_rel->relpersistence; index_close(irel, NoLock); reindex_index(indOid, false, persistence, options); - - return indOid; } /* @@ -1971,7 +2174,8 @@ RangeVarCallbackForReindexIndex(const RangeVar *relation, relkind = get_rel_relkind(relId); if (!relkind) return; - if (relkind != RELKIND_INDEX) + if (relkind != RELKIND_INDEX && + relkind != RELKIND_PARTITIONED_INDEX) ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), errmsg("\"%s\" is not an index", relation->relname))); @@ -2115,6 +2319,12 @@ ReindexMultipleTables(const char *objectName, ReindexObjectType objectKind, /* * Only regular tables and matviews can have indexes, so ignore any * other kind of relation. + * + * It is tempting to also consider partitioned tables here, but that + * has the problem that if the children are in the same schema, they + * would be processed twice. Maybe we could have a separate list of + * partitioned tables, and expand that afterwards into relids, + * ignoring any duplicates. */ if (classtuple->relkind != RELKIND_RELATION && classtuple->relkind != RELKIND_MATVIEW) @@ -2177,3 +2387,155 @@ ReindexMultipleTables(const char *objectName, ReindexObjectType objectKind, MemoryContextDelete(private_context); } + +/* + * ReindexPartitionedIndex + * Reindex each child of the given partitioned index. + * + * Not yet implemented. + */ +static void +ReindexPartitionedIndex(Relation parentIdx) +{ + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("REINDEX is not yet implemented for partitioned indexes"))); +} + +/* + * Insert or delete an appropriate pg_inherits tuple to make the given index + * be a partition of the indicated parent index. + * + * This also corrects the pg_depend information for the affected index. + */ +void +IndexSetParentIndex(Relation partitionIdx, Oid parentOid) +{ + Relation pg_inherits; + ScanKeyData key[2]; + SysScanDesc scan; + Oid partRelid = RelationGetRelid(partitionIdx); + HeapTuple tuple; + bool fix_dependencies; + + /* Make sure this is an index */ + Assert(partitionIdx->rd_rel->relkind == RELKIND_INDEX || + partitionIdx->rd_rel->relkind == RELKIND_PARTITIONED_INDEX); + + /* + * Scan pg_inherits for rows linking our index to some parent. + */ + pg_inherits = relation_open(InheritsRelationId, RowExclusiveLock); + ScanKeyInit(&key[0], + Anum_pg_inherits_inhrelid, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(partRelid)); + ScanKeyInit(&key[1], + Anum_pg_inherits_inhseqno, + BTEqualStrategyNumber, F_INT4EQ, + Int32GetDatum(1)); + scan = systable_beginscan(pg_inherits, InheritsRelidSeqnoIndexId, true, + NULL, 2, key); + tuple = systable_getnext(scan); + + if (!HeapTupleIsValid(tuple)) + { + if (parentOid == InvalidOid) + { + /* + * No pg_inherits row, and no parent wanted: nothing to do in + * this case. + */ + fix_dependencies = false; + } + else + { + Datum values[Natts_pg_inherits]; + bool isnull[Natts_pg_inherits]; + + /* + * No pg_inherits row exists, and we want a parent for this index, + * so insert it. 
+ */ + values[Anum_pg_inherits_inhrelid - 1] = ObjectIdGetDatum(partRelid); + values[Anum_pg_inherits_inhparent - 1] = + ObjectIdGetDatum(parentOid); + values[Anum_pg_inherits_inhseqno - 1] = Int32GetDatum(1); + memset(isnull, false, sizeof(isnull)); + + tuple = heap_form_tuple(RelationGetDescr(pg_inherits), + values, isnull); + CatalogTupleInsert(pg_inherits, tuple); + + fix_dependencies = true; + } + } + else + { + Form_pg_inherits inhForm = (Form_pg_inherits) GETSTRUCT(tuple); + + if (parentOid == InvalidOid) + { + /* + * There exists a pg_inherits row, which we want to clear; do so. + */ + CatalogTupleDelete(pg_inherits, &tuple->t_self); + fix_dependencies = true; + } + else + { + /* + * A pg_inherits row exists. If it's the same we want, then we're + * good; if it differs, that amounts to a corrupt catalog and + * should not happen. + */ + if (inhForm->inhparent != parentOid) + { + /* unexpected: we should not get called in this case */ + elog(ERROR, "bogus pg_inherit row: inhrelid %u inhparent %u", + inhForm->inhrelid, inhForm->inhparent); + } + + /* already in the right state */ + fix_dependencies = false; + } + } + + /* done with pg_inherits */ + systable_endscan(scan); + relation_close(pg_inherits, RowExclusiveLock); + + if (fix_dependencies) + { + ObjectAddress partIdx; + + /* + * Insert/delete pg_depend rows. If setting a parent, add an + * INTERNAL_AUTO dependency to the parent index; if making standalone, + * remove all existing rows and put back the regular dependency on the + * table. + */ + ObjectAddressSet(partIdx, RelationRelationId, partRelid); + + if (OidIsValid(parentOid)) + { + ObjectAddress parentIdx; + + ObjectAddressSet(parentIdx, RelationRelationId, parentOid); + recordDependencyOn(&partIdx, &parentIdx, DEPENDENCY_INTERNAL_AUTO); + } + else + { + ObjectAddress partitionTbl; + + ObjectAddressSet(partitionTbl, RelationRelationId, + partitionIdx->rd_index->indrelid); + + deleteDependencyRecordsForClass(RelationRelationId, partRelid, + RelationRelationId, + DEPENDENCY_INTERNAL_AUTO); + + recordDependencyOn(&partIdx, &partitionTbl, DEPENDENCY_AUTO); + } + } +} diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index c6e42c3d..834f2840 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -311,6 +311,12 @@ static const struct dropmsgstrings dropmsgstringarray[] = { gettext_noop("table \"%s\" does not exist, skipping"), gettext_noop("\"%s\" is not a table"), gettext_noop("Use DROP TABLE to remove a table.")}, + {RELKIND_PARTITIONED_INDEX, + ERRCODE_UNDEFINED_OBJECT, + gettext_noop("index \"%s\" does not exist"), + gettext_noop("index \"%s\" does not exist, skipping"), + gettext_noop("\"%s\" is not an index"), + gettext_noop("Use DROP INDEX to remove an index.")}, {'\0', 0, NULL, NULL, NULL, NULL} }; @@ -329,6 +335,7 @@ struct DropRelationCallbackState #define ATT_INDEX 0x0008 #define ATT_COMPOSITE_TYPE 0x0010 #define ATT_FOREIGN_TABLE 0x0020 +#define ATT_PARTITIONED_INDEX 0x0040 /* * Partition tables are expected to be dropped when the parent partitioned @@ -542,10 +549,16 @@ static void CreateInheritance(Relation child_rel, Relation parent_rel); static void RemoveInheritance(Relation child_rel, Relation parent_rel); static ObjectAddress ATExecAttachPartition(List **wqueue, Relation rel, PartitionCmd *cmd); +static void AttachPartitionEnsureIndexes(Relation rel, Relation attachrel); static void QueuePartitionConstraintValidation(List **wqueue, Relation scanrel, List *partConstraint, bool validate_default); 
static ObjectAddress ATExecDetachPartition(Relation rel, RangeVar *name); +static ObjectAddress ATExecAttachPartitionIdx(List **wqueue, Relation rel, + RangeVar *name); +static void validatePartitionedIndex(Relation partedIdx, Relation partedTbl); +static void refuseDupeIndexAttach(Relation parentIdx, Relation partIdx, + Relation partitionTbl); #ifdef _SHARDING_ static void AtExecRebuildExtent(Relation rel); #endif @@ -1237,12 +1250,59 @@ DefineRelation(CreateStmt *stmt, char relkind, Oid ownerId, StorePartitionKey(rel, strategy, partnatts, partattrs, partexprs, partopclass, partcollation); + + /* make it all visible */ + CommandCounterIncrement(); #ifdef __TBASE__ } #endif } /* + * If we're creating a partition, create now all the indexes defined in + * the parent. We can't do it earlier, because DefineIndex wants to know + * the partition key which we just stored. + */ + if (stmt->partbound) + { + Oid parentId = linitial_oid(inheritOids); + Relation parent; + List *idxlist; + ListCell *cell; + + /* Already have strong enough lock on the parent */ + parent = heap_open(parentId, NoLock); + idxlist = RelationGetIndexList(parent); + + /* + * For each index in the parent table, create one in the partition + */ + foreach(cell, idxlist) + { + Relation idxRel = index_open(lfirst_oid(cell), AccessShareLock); + AttrNumber *attmap; + IndexStmt *idxstmt; + + attmap = convert_tuples_by_name_map(RelationGetDescr(rel), + RelationGetDescr(parent), + gettext_noop("could not convert row type")); + idxstmt = + generateClonedIndexStmt(NULL, RelationGetRelid(rel), idxRel, + attmap, RelationGetDescr(rel)->natts); + DefineIndex(RelationGetRelid(rel), + idxstmt, + InvalidOid, + RelationGetRelid(idxRel), + false, false, false, false, false); + + index_close(idxRel, AccessShareLock); + } + + list_free(idxlist); + heap_close(parent, NoLock); + } + + /* * Now add any newly specified column default values and CHECK constraints * to the new relation. These are passed to us in the form of raw * parsetrees; we need to transform them to executable expression trees @@ -1728,10 +1788,13 @@ RangeVarCallbackForDropRelation(const RangeVar *rel, Oid relOid, Oid oldRelOid, * but RemoveRelations() can only pass one relkind for a given relation. * It chooses RELKIND_RELATION for both regular and partitioned tables. * That means we must be careful before giving the wrong type error when - * the relation is RELKIND_PARTITIONED_TABLE. + * the relation is RELKIND_PARTITIONED_TABLE. An equivalent problem + * exists with indexes. */ if (classform->relkind == RELKIND_PARTITIONED_TABLE) expected_relkind = RELKIND_RELATION; + else if (classform->relkind == RELKIND_PARTITIONED_INDEX) + expected_relkind = RELKIND_INDEX; else expected_relkind = classform->relkind; @@ -1759,7 +1822,8 @@ RangeVarCallbackForDropRelation(const RangeVar *rel, Oid relOid, Oid oldRelOid, * we do it the other way around. No error if we don't find a pg_index * entry, though --- the relation may have been dropped. 
*/ - if (relkind == RELKIND_INDEX && relOid != oldRelOid) + if ((relkind == RELKIND_INDEX || relkind == RELKIND_PARTITIONED_INDEX) && + relOid != oldRelOid) { state->heapOid = IndexGetRelation(relOid, true); if (OidIsValid(state->heapOid)) @@ -3065,27 +3129,11 @@ StoreCatalogInheritance1(Oid relationId, Oid parentOid, int16 seqNumber, Relation inhRelation, bool child_is_partition) { - TupleDesc desc = RelationGetDescr(inhRelation); - Datum values[Natts_pg_inherits]; - bool nulls[Natts_pg_inherits]; ObjectAddress childobject, parentobject; - HeapTuple tuple; - - /* - * Make the pg_inherits entry - */ - values[Anum_pg_inherits_inhrelid - 1] = ObjectIdGetDatum(relationId); - values[Anum_pg_inherits_inhparent - 1] = ObjectIdGetDatum(parentOid); - values[Anum_pg_inherits_inhseqno - 1] = Int16GetDatum(seqNumber); - - memset(nulls, 0, sizeof(nulls)); - - tuple = heap_form_tuple(desc, values, nulls); - CatalogTupleInsert(inhRelation, tuple); - - heap_freetuple(tuple); + /* store the pg_inherits row */ + StoreSingleInheritance(relationId, parentOid, seqNumber); /* * Store a dependency too @@ -3415,6 +3463,7 @@ renameatt_check(Oid myrelid, Form_pg_class classform, bool recursing) relkind != RELKIND_MATVIEW && relkind != RELKIND_COMPOSITE_TYPE && relkind != RELKIND_INDEX && + relkind != RELKIND_PARTITIONED_INDEX && relkind != RELKIND_FOREIGN_TABLE && relkind != RELKIND_PARTITIONED_TABLE) ereport(ERROR, @@ -4077,7 +4126,8 @@ RenameRelationInternal(Oid myrelid, const char *newrelname, bool is_internal) /* * Also rename the associated constraint, if any. */ - if (targetrelation->rd_rel->relkind == RELKIND_INDEX) + if (targetrelation->rd_rel->relkind == RELKIND_INDEX || + targetrelation->rd_rel->relkind == RELKIND_PARTITIONED_INDEX) { Oid constraintId = get_index_constraint(myrelid); @@ -4157,6 +4207,7 @@ CheckTableNotInUse(Relation rel, const char *stmt) stmt, RelationGetRelationName(rel)))); if (rel->rd_rel->relkind != RELKIND_INDEX && + rel->rd_rel->relkind != RELKIND_PARTITIONED_INDEX && AfterTriggerPendingOnRel(RelationGetRelid(rel))) ereport(ERROR, (errcode(ERRCODE_OBJECT_IN_USE), @@ -5051,6 +5102,10 @@ ATPrepCmd(List **wqueue, Relation rel, AlterTableCmd *cmd, break; #endif case AT_AttachPartition: + ATSimplePermissions(rel, ATT_TABLE | ATT_PARTITIONED_INDEX); + /* No command-specific prep needed */ + pass = AT_PASS_MISC; + break; case AT_DetachPartition: ATSimplePermissions(rel, ATT_TABLE); /* No command-specific prep needed */ @@ -5430,9 +5485,15 @@ ATExecCmd(List **wqueue, AlteredTableInfo *tab, Relation rel, break; #endif case AT_AttachPartition: + if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) ATExecAttachPartition(wqueue, rel, (PartitionCmd *) cmd->def); + else + ATExecAttachPartitionIdx(wqueue, rel, + ((PartitionCmd *) cmd->def)->name); break; case AT_DetachPartition: + /* ATPrepCmd ensures it must be a table */ + Assert(rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE); ATExecDetachPartition(rel, ((PartitionCmd *) cmd->def)->name); break; #ifdef __TBASE__ @@ -5750,9 +5811,13 @@ ATRewriteTables(AlterTableStmt *parsetree, List **wqueue, LOCKMODE lockmode) errmsg("Incompatible operation with data redistribution"))); #endif - /* Foreign tables have no storage, nor do partitioned tables. */ + /* + * Foreign tables have no storage, nor do partitioned tables and + * indexes. 
+ */ if (tab->relkind == RELKIND_FOREIGN_TABLE || - tab->relkind == RELKIND_PARTITIONED_TABLE) + tab->relkind == RELKIND_PARTITIONED_TABLE || + tab->relkind == RELKIND_PARTITIONED_INDEX) continue; /* @@ -6383,6 +6448,9 @@ ATSimplePermissions(Relation rel, int allowed_targets) case RELKIND_INDEX: actual_target = ATT_INDEX; break; + case RELKIND_PARTITIONED_INDEX: + actual_target = ATT_PARTITIONED_INDEX; + break; case RELKIND_COMPOSITE_TYPE: actual_target = ATT_COMPOSITE_TYPE; break; @@ -7940,6 +8008,7 @@ ATPrepSetStatistics(Relation rel, const char *colName, Node *newValue, LOCKMODE if (rel->rd_rel->relkind != RELKIND_RELATION && rel->rd_rel->relkind != RELKIND_MATVIEW && rel->rd_rel->relkind != RELKIND_INDEX && + rel->rd_rel->relkind != RELKIND_PARTITIONED_INDEX && rel->rd_rel->relkind != RELKIND_FOREIGN_TABLE && rel->rd_rel->relkind != RELKIND_PARTITIONED_TABLE) ereport(ERROR, @@ -7947,6 +8016,17 @@ ATPrepSetStatistics(Relation rel, const char *colName, Node *newValue, LOCKMODE errmsg("\"%s\" is not a table, materialized view, index, or foreign table", RelationGetRelationName(rel)))); + /* + * We allow referencing columns by numbers only for indexes, since table + * column numbers could contain gaps if columns are later dropped. + */ + if (rel->rd_rel->relkind != RELKIND_INDEX && + rel->rd_rel->relkind != RELKIND_PARTITIONED_INDEX && + !colName) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot refer to non-index column by number"))); + /* Permissions checks */ if (!pg_class_ownercheck(RelationGetRelid(rel), GetUserId())) aclcheck_error(ACLCHECK_NOT_OWNER, ACL_KIND_CLASS, @@ -8006,6 +8086,15 @@ ATExecSetStatistics(Relation rel, const char *colName, Node *newValue, LOCKMODE errmsg("cannot alter system column \"%s\"", colName))); + if ((rel->rd_rel->relkind == RELKIND_INDEX || + rel->rd_rel->relkind == RELKIND_PARTITIONED_INDEX) && + rel->rd_index->indkey.values[attnum - 1] != 0) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot alter statistics on non-expression column \"%s\" of index \"%s\"", + NameStr(attrtuple->attname), RelationGetRelationName(rel)), + errhint("Alter statistics on table column instead."))); + attrtuple->attstattarget = newtarget; CatalogTupleUpdate(attrelation, &tuple->t_self, tuple); @@ -8521,6 +8610,7 @@ ATExecAddIndex(AlteredTableInfo *tab, Relation rel, address = DefineIndex(RelationGetRelid(rel), stmt, InvalidOid, /* no predefined OID */ + InvalidOid, /* no parent index */ true, /* is_alter_table */ check_rights, false, /* check_not_in_use - we did it already */ @@ -11140,7 +11230,8 @@ ATExecAlterColumnType(AlteredTableInfo *tab, Relation rel, { char relKind = get_rel_relkind(foundObject.objectId); - if (relKind == RELKIND_INDEX) + if (relKind == RELKIND_INDEX || + relKind == RELKIND_PARTITIONED_INDEX) { Assert(foundObject.objectSubId == 0); if (!list_member_oid(tab->changedIndexOids, foundObject.objectId)) @@ -11924,6 +12015,15 @@ ATExecChangeOwner(Oid relationOid, Oid newOwnerId, bool recursing, LOCKMODE lock newOwnerId = tuple_class->relowner; } break; + case RELKIND_PARTITIONED_INDEX: + if (recursing) + break; + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("cannot change owner of index \"%s\"", + NameStr(tuple_class->relname)), + errhint("Change the ownership of the index's table, instead."))); + break; case RELKIND_SEQUENCE: if (!recursing && tuple_class->relowner != newOwnerId) @@ -12045,6 +12145,7 @@ ATExecChangeOwner(Oid relationOid, Oid newOwnerId, bool recursing, LOCKMODE lock */ if 
(tuple_class->relkind != RELKIND_COMPOSITE_TYPE && tuple_class->relkind != RELKIND_INDEX && + tuple_class->relkind != RELKIND_PARTITIONED_INDEX && tuple_class->relkind != RELKIND_TOASTVALUE) changeDependencyOnOwner(RelationRelationId, relationOid, newOwnerId); @@ -12052,7 +12153,8 @@ ATExecChangeOwner(Oid relationOid, Oid newOwnerId, bool recursing, LOCKMODE lock /* * Also change the ownership of the table's row type, if it has one */ - if (tuple_class->relkind != RELKIND_INDEX) + if (tuple_class->relkind != RELKIND_INDEX && + tuple_class->relkind != RELKIND_PARTITIONED_INDEX) AlterTypeOwnerInternal(tuple_class->reltype, newOwnerId); /* @@ -12061,6 +12163,7 @@ ATExecChangeOwner(Oid relationOid, Oid newOwnerId, bool recursing, LOCKMODE lock * relation, as well as its toast table (if it has one). */ if (tuple_class->relkind == RELKIND_RELATION || + tuple_class->relkind == RELKIND_PARTITIONED_TABLE || tuple_class->relkind == RELKIND_MATVIEW || tuple_class->relkind == RELKIND_TOASTVALUE) { @@ -12386,6 +12489,7 @@ ATExecSetRelOptions(Relation rel, List *defList, AlterTableType operation, (void) view_reloptions(newOptions, true); break; case RELKIND_INDEX: + case RELKIND_PARTITIONED_INDEX: (void) index_reloptions(rel->rd_amroutine->amoptions, newOptions, true); break; default: @@ -12859,7 +12963,8 @@ AlterTableMoveAll(AlterTableMoveAllStmt *stmt) relForm->relkind != RELKIND_RELATION && relForm->relkind != RELKIND_PARTITIONED_TABLE) || (stmt->objtype == OBJECT_INDEX && - relForm->relkind != RELKIND_INDEX) || + relForm->relkind != RELKIND_INDEX && + relForm->relkind != RELKIND_PARTITIONED_INDEX) || (stmt->objtype == OBJECT_MATVIEW && relForm->relkind != RELKIND_MATVIEW)) continue; @@ -13797,45 +13902,18 @@ RemoveInheritance(Relation child_rel, Relation parent_rel) Relation catalogRelation; SysScanDesc scan; ScanKeyData key[3]; - HeapTuple inheritsTuple, - attributeTuple, + HeapTuple attributeTuple, constraintTuple; List *connames; - bool found = false; + bool found; bool child_is_partition = false; /* If parent_rel is a partitioned table, child_rel must be a partition */ if (parent_rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) child_is_partition = true; - /* - * Find and destroy the pg_inherits entry linking the two, or error out if - * there is none. 
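With this patch, partition indexes are linked to their parent index through pg_inherits, just as partitions are linked to their parent table, which is why RemoveInheritance can delegate to DeleteInheritsTuple() here and why IndexSetParentIndex() is used later in the series. An illustrative catalog query, assuming the measurement objects sketched earlier:

    SELECT inhrelid::regclass AS child, inhparent::regclass AS parent, inhseqno
    FROM   pg_inherits
    WHERE  inhparent IN ('measurement'::regclass,
                         'measurement_logdate_idx'::regclass);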
- */ - catalogRelation = heap_open(InheritsRelationId, RowExclusiveLock); - ScanKeyInit(&key[0], - Anum_pg_inherits_inhrelid, - BTEqualStrategyNumber, F_OIDEQ, - ObjectIdGetDatum(RelationGetRelid(child_rel))); - scan = systable_beginscan(catalogRelation, InheritsRelidSeqnoIndexId, - true, NULL, 1, key); - - while (HeapTupleIsValid(inheritsTuple = systable_getnext(scan))) - { - Oid inhparent; - - inhparent = ((Form_pg_inherits) GETSTRUCT(inheritsTuple))->inhparent; - if (inhparent == RelationGetRelid(parent_rel)) - { - CatalogTupleDelete(catalogRelation, &inheritsTuple->t_self); - found = true; - break; - } - } - - systable_endscan(scan); - heap_close(catalogRelation, RowExclusiveLock); - + found = DeleteInheritsTuple(RelationGetRelid(child_rel), + RelationGetRelid(parent_rel)); if (!found) { if (child_is_partition) @@ -16073,7 +16151,8 @@ RangeVarCallbackForAlterRelation(const RangeVar *rv, Oid relid, Oid oldrelid, (errcode(ERRCODE_WRONG_OBJECT_TYPE), errmsg("\"%s\" is not a composite type", rv->relname))); - if (reltype == OBJECT_INDEX && relkind != RELKIND_INDEX + if (reltype == OBJECT_INDEX && relkind != RELKIND_INDEX && + relkind != RELKIND_PARTITIONED_INDEX && !IsA(stmt, RenameStmt)) ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), @@ -16897,6 +16976,9 @@ ATExecAttachPartition(List **wqueue, Relation rel, PartitionCmd *cmd) /* Update the pg_class entry. */ StorePartitionBound(attachrel, rel, cmd->bound); + /* Ensure there exists a correct set of indexes in the partition. */ + AttachPartitionEnsureIndexes(rel, attachrel); + /* * Generate partition constraint from the partition bound specification. * If the parent itself is a partition, make sure to include its @@ -16964,6 +17046,127 @@ ATExecAttachPartition(List **wqueue, Relation rel, PartitionCmd *cmd) return address; } +/* + * AttachPartitionEnsureIndexes + * subroutine for ATExecAttachPartition to create/match indexes + * + * Enforce the indexing rule for partitioned tables during ALTER TABLE / ATTACH + * PARTITION: every partition must have an index attached to each index on the + * partitioned table. + */ +static void +AttachPartitionEnsureIndexes(Relation rel, Relation attachrel) +{ + List *idxes; + List *attachRelIdxs; + Relation *attachrelIdxRels; + IndexInfo **attachInfos; + int i; + ListCell *cell; + MemoryContext cxt; + MemoryContext oldcxt; + + cxt = AllocSetContextCreate(CurrentMemoryContext, + "AttachPartitionEnsureIndexes", + ALLOCSET_DEFAULT_SIZES); + oldcxt = MemoryContextSwitchTo(cxt); + + idxes = RelationGetIndexList(rel); + attachRelIdxs = RelationGetIndexList(attachrel); + attachrelIdxRels = palloc(sizeof(Relation) * list_length(attachRelIdxs)); + attachInfos = palloc(sizeof(IndexInfo *) * list_length(attachRelIdxs)); + + /* Build arrays of all existing indexes and their IndexInfos */ + i = 0; + foreach(cell, attachRelIdxs) + { + Oid cldIdxId = lfirst_oid(cell); + + attachrelIdxRels[i] = index_open(cldIdxId, AccessShareLock); + attachInfos[i] = BuildIndexInfo(attachrelIdxRels[i]); + i++; + } + + /* + * For each index on the partitioned table, find a matching one in the + * partition-to-be; if one is not found, create one. + */ + foreach(cell, idxes) + { + Oid idx = lfirst_oid(cell); + Relation idxRel = index_open(idx, AccessShareLock); + IndexInfo *info; + AttrNumber *attmap; + bool found = false; + + /* + * Ignore indexes in the partitioned table other than partitioned + * indexes. 
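Before the function body continues below, a rough sketch of the behaviour AttachPartitionEnsureIndexes() implements during ALTER TABLE ... ATTACH PARTITION (names illustrative, continuing the measurement example):

    CREATE TABLE measurement_y2006m4 (LIKE measurement);
    CREATE INDEX ON measurement_y2006m4 (logdate);     -- compatible, unattached
    ALTER TABLE measurement ATTACH PARTITION measurement_y2006m4
        FOR VALUES FROM ('2006-04-01') TO ('2006-05-01');
    -- The pre-existing index is matched by CompareIndexInfo and attached to the
    -- parent index; had no compatible index existed, ATTACH PARTITION would
    -- have built one.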
+ */ + if (idxRel->rd_rel->relkind != RELKIND_PARTITIONED_INDEX) + { + index_close(idxRel, AccessShareLock); + continue; + } + + /* construct an indexinfo to compare existing indexes against */ + info = BuildIndexInfo(idxRel); + attmap = convert_tuples_by_name_map(RelationGetDescr(attachrel), + RelationGetDescr(rel), + gettext_noop("could not convert row type")); + + /* + * Scan the list of existing indexes in the partition-to-be, and mark + * the first matching, unattached one we find, if any, as partition of + * the parent index. If we find one, we're done. + */ + for (i = 0; i < list_length(attachRelIdxs); i++) + { + /* does this index have a parent? if so, can't use it */ + if (has_superclass(RelationGetRelid(attachrelIdxRels[i]))) + continue; + + if (CompareIndexInfo(attachInfos[i], info, + attachrelIdxRels[i]->rd_indcollation, + idxRel->rd_indcollation, + attachrelIdxRels[i]->rd_opfamily, + idxRel->rd_opfamily, + attmap, + RelationGetDescr(rel)->natts)) + { + /* bingo. */ + IndexSetParentIndex(attachrelIdxRels[i], idx); + found = true; + break; + } + } + + /* + * If no suitable index was found in the partition-to-be, create one + * now. + */ + if (!found) + { + IndexStmt *stmt; + + stmt = generateClonedIndexStmt(NULL, RelationGetRelid(attachrel), + idxRel, attmap, + RelationGetDescr(rel)->natts); + DefineIndex(RelationGetRelid(attachrel), stmt, InvalidOid, + RelationGetRelid(idxRel), + false, false, false, false, false); + } + + index_close(idxRel, AccessShareLock); + } + + /* Clean up. */ + for (i = 0; i < list_length(attachRelIdxs); i++) + index_close(attachrelIdxRels[i], AccessShareLock); + MemoryContextSwitchTo(oldcxt); + MemoryContextDelete(cxt); +} + /* * ALTER TABLE DETACH PARTITION * @@ -16982,6 +17185,8 @@ ATExecDetachPartition(Relation rel, RangeVar *name) new_repl[Natts_pg_class]; ObjectAddress address; Oid defaultPartOid; + List *indexes; + ListCell *cell; #ifdef _MLS_ bool schema_bound; Oid partoid; @@ -17067,6 +17272,24 @@ ATExecDetachPartition(Relation rel, RangeVar *name) } } + /* detach indexes too */ + indexes = RelationGetIndexList(partRel); + foreach(cell, indexes) + { + Oid idxid = lfirst_oid(cell); + Relation idx; + + if (!has_superclass(idxid)) + continue; + + Assert((IndexGetRelation(get_partition_parent(idxid), false) == + RelationGetRelid(rel))); + + idx = index_open(idxid, AccessExclusiveLock); + IndexSetParentIndex(idx, InvalidOid); + relation_close(idx, AccessExclusiveLock); + } + /* * Invalidate the parent's relcache so that the partition is no longer * included in its partition descriptor. @@ -17081,6 +17304,332 @@ ATExecDetachPartition(Relation rel, RangeVar *name) return address; } + +/* + * Before acquiring lock on an index, acquire the same lock on the owning + * table. + */ +struct AttachIndexCallbackState +{ + Oid partitionOid; + Oid parentTblOid; + bool lockedParentTbl; +}; + +static void +RangeVarCallbackForAttachIndex(const RangeVar *rv, Oid relOid, Oid oldRelOid, + void *arg) +{ + struct AttachIndexCallbackState *state; + Form_pg_class classform; + HeapTuple tuple; + + state = (struct AttachIndexCallbackState *) arg; + + if (!state->lockedParentTbl) + { + LockRelationOid(state->parentTblOid, AccessShareLock); + state->lockedParentTbl = true; + } + + /* + * If we previously locked some other heap, and the name we're looking up + * no longer refers to an index on that relation, release the now-useless + * lock. XXX maybe we should do *after* we verify whether the index does + * not actually belong to the same relation ... 
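The ATExecDetachPartition hunk above is the mirror image: the partition's indexes are kept, but IndexSetParentIndex(idx, InvalidOid) removes their pg_inherits link, leaving ordinary standalone indexes. Sketch:

    ALTER TABLE measurement DETACH PARTITION measurement_y2006m2;
    -- measurement_y2006m2's index on (logdate) still exists, but it is no
    -- longer attached to measurement_logdate_idx.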
+ */ + if (relOid != oldRelOid && OidIsValid(state->partitionOid)) + { + UnlockRelationOid(state->partitionOid, AccessShareLock); + state->partitionOid = InvalidOid; + } + + /* Didn't find a relation, so no need for locking or permission checks. */ + if (!OidIsValid(relOid)) + return; + + tuple = SearchSysCache1(RELOID, ObjectIdGetDatum(relOid)); + if (!HeapTupleIsValid(tuple)) + return; /* concurrently dropped, so nothing to do */ + classform = (Form_pg_class) GETSTRUCT(tuple); + if (classform->relkind != RELKIND_PARTITIONED_INDEX && + classform->relkind != RELKIND_INDEX) + ereport(ERROR, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("\"%s\" is not an index", rv->relname))); + ReleaseSysCache(tuple); + + /* + * Since we need only examine the heap's tupledesc, an access share lock + * on it (preventing any DDL) is sufficient. + */ + state->partitionOid = IndexGetRelation(relOid, false); + LockRelationOid(state->partitionOid, AccessShareLock); +} + +/* + * ALTER INDEX i1 ATTACH PARTITION i2 + */ +static ObjectAddress +ATExecAttachPartitionIdx(List **wqueue, Relation parentIdx, RangeVar *name) +{ + Relation partIdx; + Relation partTbl; + Relation parentTbl; + ObjectAddress address; + Oid partIdxId; + Oid currParent; + struct AttachIndexCallbackState state; + + /* + * We need to obtain lock on the index 'name' to modify it, but we also + * need to read its owning table's tuple descriptor -- so we need to lock + * both. To avoid deadlocks, obtain lock on the table before doing so on + * the index. Furthermore, we need to examine the parent table of the + * partition, so lock that one too. + */ + state.partitionOid = InvalidOid; + state.parentTblOid = parentIdx->rd_index->indrelid; + state.lockedParentTbl = false; + partIdxId = + RangeVarGetRelidExtended(name, AccessExclusiveLock, false, false, + RangeVarCallbackForAttachIndex, + (void *) &state); + /* Not there? */ + if (!OidIsValid(partIdxId)) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("index \"%s\" does not exist", name->relname))); + + /* no deadlock risk: RangeVarGetRelidExtended already acquired the lock */ + partIdx = relation_open(partIdxId, AccessExclusiveLock); + + /* we already hold locks on both tables, so this is safe: */ + parentTbl = relation_open(parentIdx->rd_index->indrelid, AccessShareLock); + partTbl = relation_open(partIdx->rd_index->indrelid, NoLock); + + ObjectAddressSet(address, RelationRelationId, RelationGetRelid(partIdx)); + + /* Silently do nothing if already in the right state */ + currParent = !has_superclass(partIdxId) ? InvalidOid : + get_partition_parent(partIdxId); + if (currParent != RelationGetRelid(parentIdx)) + { + IndexInfo *childInfo; + IndexInfo *parentInfo; + AttrNumber *attmap; + bool found; + int i; + PartitionDesc partDesc; + + /* + * If this partition already has an index attached, refuse the operation. 
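RangeVarCallbackForAttachIndex above also rejects targets that are not indexes at all, so, illustratively, naming a table on the partition side fails early:

    ALTER INDEX measurement_logdate_idx ATTACH PARTITION measurement_y2006m2;
    -- ERROR:  "measurement_y2006m2" is not an index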
+ */ + refuseDupeIndexAttach(parentIdx, partIdx, partTbl); + + if (OidIsValid(currParent)) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("cannot attach index \"%s\" as a partition of index \"%s\"", + RelationGetRelationName(partIdx), + RelationGetRelationName(parentIdx)), + errdetail("Index \"%s\" is already attached to another index.", + RelationGetRelationName(partIdx)))); + + /* Make sure it indexes a partition of the other index's table */ + partDesc = RelationGetPartitionDesc(parentTbl); + found = false; + for (i = 0; i < partDesc->nparts; i++) + { + if (partDesc->oids[i] == state.partitionOid) + { + found = true; + break; + } + } + if (!found) + ereport(ERROR, + (errmsg("cannot attach index \"%s\" as a partition of index \"%s\"", + RelationGetRelationName(partIdx), + RelationGetRelationName(parentIdx)), + errdetail("Index \"%s\" is not an index on any partition of table \"%s\".", + RelationGetRelationName(partIdx), + RelationGetRelationName(parentTbl)))); + + /* Ensure the indexes are compatible */ + childInfo = BuildIndexInfo(partIdx); + parentInfo = BuildIndexInfo(parentIdx); + attmap = convert_tuples_by_name_map(RelationGetDescr(partTbl), + RelationGetDescr(parentTbl), + gettext_noop("could not convert row type")); + if (!CompareIndexInfo(childInfo, parentInfo, + partIdx->rd_indcollation, + parentIdx->rd_indcollation, + partIdx->rd_opfamily, + parentIdx->rd_opfamily, + attmap, + RelationGetDescr(partTbl)->natts)) + ereport(ERROR, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("cannot attach index \"%s\" as a partition of index \"%s\"", + RelationGetRelationName(partIdx), + RelationGetRelationName(parentIdx)), + errdetail("The index definitions do not match."))); + + /* All good -- do it */ + IndexSetParentIndex(partIdx, RelationGetRelid(parentIdx)); + pfree(attmap); + + CommandCounterIncrement(); + + validatePartitionedIndex(parentIdx, parentTbl); + } + + relation_close(parentTbl, AccessShareLock); + /* keep these locks till commit */ + relation_close(partTbl, NoLock); + relation_close(partIdx, NoLock); + + return address; +} + +/* + * Verify whether the given partition already contains an index attached + * to the given partitioned index. If so, raise an error. + */ +static void +refuseDupeIndexAttach(Relation parentIdx, Relation partIdx, Relation partitionTbl) +{ + Relation pg_inherits; + ScanKeyData key; + HeapTuple tuple; + SysScanDesc scan; + + pg_inherits = heap_open(InheritsRelationId, AccessShareLock); + ScanKeyInit(&key, Anum_pg_inherits_inhparent, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(RelationGetRelid(parentIdx))); + scan = systable_beginscan(pg_inherits, InheritsParentIndexId, true, + NULL, 1, &key); + while (HeapTupleIsValid(tuple = systable_getnext(scan))) + { + Form_pg_inherits inhForm; + Oid tab; + + inhForm = (Form_pg_inherits) GETSTRUCT(tuple); + tab = IndexGetRelation(inhForm->inhrelid, false); + if (tab == RelationGetRelid(partitionTbl)) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("cannot attach index \"%s\" as a partition of index \"%s\"", + RelationGetRelationName(partIdx), + RelationGetRelationName(parentIdx)), + errdetail("Another index is already attached for partition \"%s\".", + RelationGetRelationName(partitionTbl)))); + } + + systable_endscan(scan); + heap_close(pg_inherits, AccessShareLock); +} + +/* + * Verify whether the set of attached partition indexes to a parent index on + * a partitioned table is complete. If it is, mark the parent index valid. 
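The checks in ATExecAttachPartitionIdx and refuseDupeIndexAttach above produce errors along these lines (illustrative object names; wording taken from the errdetail strings in the hunk):

    ALTER INDEX measurement_logdate_idx ATTACH PARTITION unrelated_tbl_idx;
    -- ERROR:  cannot attach index "unrelated_tbl_idx" as a partition of index
    --         "measurement_logdate_idx"
    -- DETAIL: Index "unrelated_tbl_idx" is not an index on any partition of
    --         table "measurement".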
+ * + * This should be called each time a partition index is attached. + */ +static void +validatePartitionedIndex(Relation partedIdx, Relation partedTbl) +{ + Relation inheritsRel; + SysScanDesc scan; + ScanKeyData key; + int tuples = 0; + HeapTuple inhTup; + bool updated = false; + + Assert(partedIdx->rd_rel->relkind == RELKIND_PARTITIONED_INDEX); + + /* + * Scan pg_inherits for this parent index. Count each valid index we find + * (verifying the pg_index entry for each), and if we reach the total + * amount we expect, we can mark this parent index as valid. + */ + inheritsRel = heap_open(InheritsRelationId, AccessShareLock); + ScanKeyInit(&key, Anum_pg_inherits_inhparent, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(RelationGetRelid(partedIdx))); + scan = systable_beginscan(inheritsRel, InheritsParentIndexId, true, + NULL, 1, &key); + while ((inhTup = systable_getnext(scan)) != NULL) + { + Form_pg_inherits inhForm = (Form_pg_inherits) GETSTRUCT(inhTup); + HeapTuple indTup; + Form_pg_index indexForm; + + indTup = SearchSysCache1(INDEXRELID, + ObjectIdGetDatum(inhForm->inhrelid)); + if (!indTup) + elog(ERROR, "cache lookup failed for index %u", + inhForm->inhrelid); + indexForm = (Form_pg_index) GETSTRUCT(indTup); + if (IndexIsValid(indexForm)) + tuples += 1; + ReleaseSysCache(indTup); + } + + /* Done with pg_inherits */ + systable_endscan(scan); + heap_close(inheritsRel, AccessShareLock); + + /* + * If we found as many inherited indexes as the partitioned table has + * partitions, we're good; update pg_index to set indisvalid. + */ + if (tuples == RelationGetPartitionDesc(partedTbl)->nparts) + { + Relation idxRel; + HeapTuple newtup; + + idxRel = heap_open(IndexRelationId, RowExclusiveLock); + + newtup = heap_copytuple(partedIdx->rd_indextuple); + ((Form_pg_index) GETSTRUCT(newtup))->indisvalid = true; + updated = true; + + CatalogTupleUpdate(idxRel, &partedIdx->rd_indextuple->t_self, newtup); + + heap_close(idxRel, RowExclusiveLock); + } + + /* + * If this index is in turn a partition of a larger index, validating it + * might cause the parent to become valid also. Try that. 
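A sketch of what validatePartitionedIndex means for users, assuming measurement already has partitions that lack a matching index:

    CREATE INDEX measurement_city_id_idx ON ONLY measurement (city_id);
    SELECT indisvalid FROM pg_index
    WHERE  indexrelid = 'measurement_city_id_idx'::regclass;   -- f
    -- Once a matching index has been created or attached for every partition,
    -- validatePartitionedIndex marks the parent index valid and the query
    -- returns t.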
+ */ + if (updated && + has_superclass(RelationGetRelid(partedIdx))) + { + Oid parentIdxId, + parentTblId; + Relation parentIdx, + parentTbl; + + /* make sure we see the validation we just did */ + CommandCounterIncrement(); + + parentIdxId = get_partition_parent(RelationGetRelid(partedIdx)); + parentTblId = get_partition_parent(RelationGetRelid(partedTbl)); + parentIdx = relation_open(parentIdxId, AccessExclusiveLock); + parentTbl = relation_open(parentTblId, AccessExclusiveLock); + Assert(!parentIdx->rd_index->indisvalid); + + validatePartitionedIndex(parentIdx, parentTbl); + + relation_close(parentIdx, AccessExclusiveLock); + relation_close(parentTbl, AccessExclusiveLock); + } +} + #ifdef _MIGRATE_ bool oidarray_contian_oid(Oid *old_oids, int old_num, Oid new_oid) diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c index ea4a0c71..8bc360f1 100644 --- a/src/backend/nodes/copyfuncs.c +++ b/src/backend/nodes/copyfuncs.c @@ -3729,6 +3729,7 @@ _copyIndexStmt(const IndexStmt *from) COPY_STRING_FIELD(idxname); COPY_NODE_FIELD(relation); + COPY_SCALAR_FIELD(relationId); COPY_STRING_FIELD(accessMethod); COPY_STRING_FIELD(tableSpace); COPY_NODE_FIELD(indexParams); diff --git a/src/backend/nodes/equalfuncs.c b/src/backend/nodes/equalfuncs.c index c92cbd30..7bbe8255 100644 --- a/src/backend/nodes/equalfuncs.c +++ b/src/backend/nodes/equalfuncs.c @@ -1348,6 +1348,7 @@ _equalIndexStmt(const IndexStmt *a, const IndexStmt *b) { COMPARE_STRING_FIELD(idxname); COMPARE_NODE_FIELD(relation); + COMPARE_SCALAR_FIELD(relationId); COMPARE_STRING_FIELD(accessMethod); COMPARE_STRING_FIELD(tableSpace); COMPARE_NODE_FIELD(indexParams); diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c index 092a7dd5..7df4571b 100644 --- a/src/backend/nodes/outfuncs.c +++ b/src/backend/nodes/outfuncs.c @@ -4010,6 +4010,7 @@ _outIndexStmt(StringInfo str, const IndexStmt *node) WRITE_STRING_FIELD(idxname); WRITE_NODE_FIELD(relation); + WRITE_OID_FIELD(relationId); WRITE_STRING_FIELD(accessMethod); WRITE_STRING_FIELD(tableSpace); WRITE_NODE_FIELD(indexParams); diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y index f8e17e4e..4e74a77d 100644 --- a/src/backend/parser/gram.y +++ b/src/backend/parser/gram.y @@ -316,7 +316,7 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); %type add_drop opt_asc_desc opt_nulls_order %type alter_table_cmd alter_type_cmd opt_collate_clause - replica_identity partition_cmd alter_group_cmd + replica_identity partition_cmd alter_group_cmd index_partition_cmd %type alter_table_cmds alter_type_cmds alter_group_cmds %type alter_identity_column_option_list %type alter_identity_column_option @@ -1996,6 +1996,15 @@ AlterTableStmt: n->missing_ok = true; $$ = (Node *)n; } + | ALTER INDEX qualified_name index_partition_cmd + { + AlterTableStmt *n = makeNode(AlterTableStmt); + n->relation = $3; + n->cmds = list_make1($4); + n->relkind = OBJECT_INDEX; + n->missing_ok = false; + $$ = (Node *)n; + } | ALTER INDEX ALL IN_P TABLESPACE name SET TABLESPACE name opt_nowait { AlterTableMoveAllStmt *n = @@ -2146,6 +2155,22 @@ alter_group_cmd: } ; +index_partition_cmd: + /* ALTER INDEX ATTACH PARTITION */ + ATTACH PARTITION qualified_name + { + AlterTableCmd *n = makeNode(AlterTableCmd); + PartitionCmd *cmd = makeNode(PartitionCmd); + + n->subtype = AT_AttachPartition; + cmd->name = $3; + cmd->bound = NULL; + n->def = (Node *) cmd; + + $$ = (Node *) n; + } + ; + alter_table_cmd: /* ALTER TABLE ADD */ ADD_P columnDef @@ -7735,7 +7760,7 
@@ defacl_privilege_target: *****************************************************************************/ IndexStmt: CREATE opt_unique INDEX opt_concurrently opt_index_name - ON qualified_name access_method_clause '(' index_params ')' + ON relation_expr access_method_clause '(' index_params ')' opt_reloptions OptTableSpace where_clause { IndexStmt *n = makeNode(IndexStmt); @@ -7743,6 +7768,7 @@ IndexStmt: CREATE opt_unique INDEX opt_concurrently opt_index_name n->concurrent = $4; n->idxname = $5; n->relation = $7; + n->relationId = InvalidOid; n->accessMethod = $8; n->indexParams = $10; n->options = $12; @@ -7762,7 +7788,7 @@ IndexStmt: CREATE opt_unique INDEX opt_concurrently opt_index_name $$ = (Node *)n; } | CREATE opt_unique INDEX opt_concurrently IF_P NOT EXISTS index_name - ON qualified_name access_method_clause '(' index_params ')' + ON relation_expr access_method_clause '(' index_params ')' opt_reloptions OptTableSpace where_clause { IndexStmt *n = makeNode(IndexStmt); @@ -7770,6 +7796,7 @@ IndexStmt: CREATE opt_unique INDEX opt_concurrently opt_index_name n->concurrent = $4; n->idxname = $8; n->relation = $10; + n->relationId = InvalidOid; n->accessMethod = $11; n->indexParams = $13; n->options = $15; diff --git a/src/backend/parser/parse_utilcmd.c b/src/backend/parser/parse_utilcmd.c index af249f71..cf1bc20a 100644 --- a/src/backend/parser/parse_utilcmd.c +++ b/src/backend/parser/parse_utilcmd.c @@ -169,9 +169,6 @@ static void transformTableLikeClause(CreateStmtContext *cxt, TableLikeClause *table_like_clause); static void transformOfType(CreateStmtContext *cxt, TypeName *ofTypename); -static IndexStmt *generateClonedIndexStmt(CreateStmtContext *cxt, - Relation source_idx, - const AttrNumber *attmap, int attmap_length); static List *get_collation(Oid collation, Oid actual_datatype); static List *get_opclass(Oid opclass, Oid actual_datatype); static void transformIndexConstraints(CreateStmtContext *cxt); @@ -1632,7 +1629,8 @@ transformTableLikeClause(CreateStmtContext *cxt, TableLikeClause *table_like_cla parent_index = index_open(parent_index_oid, AccessShareLock); /* Build CREATE INDEX statement to recreate the parent_index */ - index_stmt = generateClonedIndexStmt(cxt, parent_index, + index_stmt = generateClonedIndexStmt(cxt->relation, InvalidOid, + parent_index, attmap, tupleDesc->natts); #ifdef __TBASE__ @@ -1720,10 +1718,12 @@ transformOfType(CreateStmtContext *cxt, TypeName *ofTypename) /* * Generate an IndexStmt node using information from an already existing index - * "source_idx". Attribute numbers should be adjusted according to attmap. + * "source_idx", for the rel identified either by heapRel or heapRelid. + * + * Attribute numbers should be adjusted according to attmap. */ -static IndexStmt * -generateClonedIndexStmt(CreateStmtContext *cxt, Relation source_idx, +IndexStmt * +generateClonedIndexStmt(RangeVar *heapRel, Oid heapRelid, Relation source_idx, const AttrNumber *attmap, int attmap_length) {// #lizard forgives Oid source_relid = RelationGetRelid(source_idx); @@ -1745,6 +1745,9 @@ generateClonedIndexStmt(CreateStmtContext *cxt, Relation source_idx, Datum datum; bool isnull; + Assert((heapRel == NULL && OidIsValid(heapRelid)) || + (heapRel != NULL && !OidIsValid(heapRelid))); + /* * Fetch pg_class tuple of source index. We can't use the copy in the * relcache entry because it doesn't include optional fields. 
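The grammar change above (qualified_name -> relation_expr in the ON clause) is what lets ONLY appear there, giving the two behaviours this series relies on (sketch):

    CREATE INDEX ON measurement (city_id, logdate);    -- recurses to partitions
    CREATE INDEX ON ONLY measurement (city_id);        -- parent only, left invalid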
@@ -1780,7 +1783,8 @@ generateClonedIndexStmt(CreateStmtContext *cxt, Relation source_idx, /* Begin building the IndexStmt */ index = makeNode(IndexStmt); - index->relation = cxt->relation; + index->relation = heapRel; + index->relationId = heapRelid; index->accessMethod = pstrdup(NameStr(amrec->amname)); if (OidIsValid(idxrelrec->reltablespace)) index->tableSpace = get_tablespace_name(idxrelrec->reltablespace); @@ -4947,18 +4951,39 @@ transformPartitionCmd(CreateStmtContext *cxt, PartitionCmd *cmd) { Relation parentRel = cxt->rel; - /* the table must be partitioned */ - if (parentRel->rd_rel->relkind != RELKIND_PARTITIONED_TABLE) - ereport(ERROR, - (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), - errmsg("\"%s\" is not partitioned", - RelationGetRelationName(parentRel)))); - + switch (parentRel->rd_rel->relkind) + { + case RELKIND_PARTITIONED_TABLE: /* transform the partition bound, if any */ Assert(RelationGetPartitionKey(parentRel) != NULL); if (cmd->bound != NULL) cxt->partbound = transformPartitionBound(cxt->pstate, parentRel, cmd->bound); + break; + case RELKIND_PARTITIONED_INDEX: + /* nothing to check */ + Assert(cmd->bound == NULL); + break; + case RELKIND_RELATION: + /* the table must be partitioned */ + ereport(ERROR, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("table \"%s\" is not partitioned", + RelationGetRelationName(parentRel)))); + break; + case RELKIND_INDEX: + /* the index must be partitioned */ + ereport(ERROR, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("index \"%s\" is not partitioned", + RelationGetRelationName(parentRel)))); + break; + default: + /* parser shouldn't let this case through */ + elog(ERROR, "\"%s\" is not a partitioned table or index", + RelationGetRelationName(parentRel)); + break; + } } /* diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c index c99d090e..a6536b13 100644 --- a/src/backend/tcop/utility.c +++ b/src/backend/tcop/utility.c @@ -25,6 +25,7 @@ #include "access/xlog.h" #include "catalog/catalog.h" #include "catalog/namespace.h" +#include "catalog/pg_inherits_fn.h" #include "catalog/toasting.h" #include "commands/alter.h" #include "commands/async.h" @@ -3677,6 +3678,7 @@ ProcessUtilitySlow(ParseState *pstate, IndexStmt *stmt = (IndexStmt *) parsetree; Oid relid; LOCKMODE lockmode; + List *inheritors = NIL; #ifdef __TBASE__ Relation rel = NULL; #endif @@ -3719,6 +3721,23 @@ ProcessUtilitySlow(ParseState *pstate, } #endif + /* + * CREATE INDEX on partitioned tables (but not regular + * inherited tables) recurses to partitions, so we must + * acquire locks early to avoid deadlocks. + */ + if (stmt->relation->inh) + { + Relation rel; + + /* already locked by RangeVarGetRelidExtended */ + rel = heap_open(relid, NoLock); + if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) + inheritors = find_all_inheritors(relid, lockmode, + NULL); + heap_close(rel, NoLock); + } + /* Run parse analysis ... 
*/ stmt = transformIndexStmt(relid, stmt, queryString); @@ -3728,6 +3747,7 @@ ProcessUtilitySlow(ParseState *pstate, DefineIndex(relid, /* OID of heap relation */ stmt, InvalidOid, /* no predefined OID */ + InvalidOid, /* no parent index */ false, /* is_alter_table */ true, /* check_rights */ true, /* check_not_in_use */ @@ -3879,6 +3899,8 @@ ProcessUtilitySlow(ParseState *pstate, parsetree); commandCollected = true; EventTriggerAlterTableEnd(); + + list_free(inheritors); } break; diff --git a/src/backend/utils/adt/amutils.c b/src/backend/utils/adt/amutils.c index b05d24d0..31367fb1 100644 --- a/src/backend/utils/adt/amutils.c +++ b/src/backend/utils/adt/amutils.c @@ -183,7 +183,8 @@ indexam_property(FunctionCallInfo fcinfo, if (!HeapTupleIsValid(tuple)) PG_RETURN_NULL(); rd_rel = (Form_pg_class) GETSTRUCT(tuple); - if (rd_rel->relkind != RELKIND_INDEX) + if (rd_rel->relkind != RELKIND_INDEX && + rd_rel->relkind != RELKIND_PARTITIONED_INDEX) { ReleaseSysCache(tuple); PG_RETURN_NULL(); diff --git a/src/backend/utils/adt/ruleutils.c b/src/backend/utils/adt/ruleutils.c index 9acf184a..8ce8cefe 100644 --- a/src/backend/utils/adt/ruleutils.c +++ b/src/backend/utils/adt/ruleutils.c @@ -360,7 +360,7 @@ static void decompile_column_index_array(Datum column_index_array, Oid relId, static char *pg_get_ruledef_worker(Oid ruleoid, int prettyFlags); static char *pg_get_indexdef_worker(Oid indexrelid, int colno, const Oid *excludeOps, - bool attrsOnly, bool showTblSpc, + bool attrsOnly, bool showTblSpc, bool inherits, int prettyFlags, bool missing_ok); static char *pg_get_statisticsobj_worker(Oid statextid, bool missing_ok); static char *pg_get_partkeydef_worker(Oid relid, int prettyFlags, @@ -1142,7 +1142,7 @@ pg_get_indexdef(PG_FUNCTION_ARGS) prettyFlags = PRETTYFLAG_INDENT; - res = pg_get_indexdef_worker(indexrelid, 0, NULL, false, false, + res = pg_get_indexdef_worker(indexrelid, 0, NULL, false, false, false, prettyFlags, true); if (res == NULL) @@ -1163,7 +1163,7 @@ pg_get_indexdef_ext(PG_FUNCTION_ARGS) prettyFlags = pretty ? PRETTYFLAG_PAREN | PRETTYFLAG_INDENT : PRETTYFLAG_INDENT; res = pg_get_indexdef_worker(indexrelid, colno, NULL, colno != 0, false, - prettyFlags, true); + false, prettyFlags, true); if (res == NULL) PG_RETURN_NULL(); @@ -1179,7 +1179,7 @@ pg_get_indexdef_ext(PG_FUNCTION_ARGS) char * pg_get_indexdef_string(Oid indexrelid) { - return pg_get_indexdef_worker(indexrelid, 0, NULL, false, true, 0, false); + return pg_get_indexdef_worker(indexrelid, 0, NULL, false, true, true, 0, false); } /* Internal version that just reports the column definitions */ @@ -1189,7 +1189,7 @@ pg_get_indexdef_columns(Oid indexrelid, bool pretty) int prettyFlags; prettyFlags = pretty ? 
PRETTYFLAG_PAREN | PRETTYFLAG_INDENT : PRETTYFLAG_INDENT; - return pg_get_indexdef_worker(indexrelid, 0, NULL, true, false, + return pg_get_indexdef_worker(indexrelid, 0, NULL, true, false, false, prettyFlags, false); } @@ -1202,7 +1202,7 @@ pg_get_indexdef_columns(Oid indexrelid, bool pretty) static char * pg_get_indexdef_worker(Oid indexrelid, int colno, const Oid *excludeOps, - bool attrsOnly, bool showTblSpc, + bool attrsOnly, bool showTblSpc, bool inherits, int prettyFlags, bool missing_ok) {// #lizard forgives /* might want a separate isConstraint parameter later */ @@ -1318,9 +1318,11 @@ pg_get_indexdef_worker(Oid indexrelid, int colno, if (!attrsOnly) { if (!isConstraint) - appendStringInfo(&buf, "CREATE %sINDEX %s ON %s USING %s (", + appendStringInfo(&buf, "CREATE %sINDEX %s ON %s%s USING %s (", idxrec->indisunique ? "UNIQUE " : "", quote_identifier(NameStr(idxrelrec->relname)), + idxrelrec->relkind == RELKIND_PARTITIONED_INDEX + && !inherits ? "ONLY " : "", generate_relation_name(indrelid, NIL), quote_identifier(NameStr(amrec->amname))); else /* currently, must be EXCLUDE constraint */ @@ -2237,6 +2239,7 @@ pg_get_constraintdef_worker(Oid constraintId, bool fullCommand, operators, false, false, + false, prettyFlags, false)); break; diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c index f9520010..55943dff 100644 --- a/src/backend/utils/cache/relcache.c +++ b/src/backend/utils/cache/relcache.c @@ -490,18 +490,26 @@ static void RelationParseRelOptions(Relation relation, HeapTuple tuple) {// #lizard forgives bytea *options; + amoptions_function amoptsfn; relation->rd_options = NULL; - /* Fall out if relkind should not have options */ + /* + * Look up any AM-specific parse function; fall out if relkind should not + * have options. + */ switch (relation->rd_rel->relkind) { case RELKIND_RELATION: case RELKIND_TOASTVALUE: - case RELKIND_INDEX: case RELKIND_VIEW: case RELKIND_MATVIEW: case RELKIND_PARTITIONED_TABLE: + amoptsfn = NULL; + break; + case RELKIND_INDEX: + case RELKIND_PARTITIONED_INDEX: + amoptsfn = relation->rd_amroutine->amoptions; break; default: return; @@ -516,10 +524,8 @@ RelationParseRelOptions(Relation relation, HeapTuple tuple) { return; } - options = extractRelOptions(tuple, - GetPgClassDescriptor(), - relation->rd_rel->relkind == RELKIND_INDEX ? - relation->rd_amroutine->amoptions : NULL); + + options = extractRelOptions(tuple, GetPgClassDescriptor(), amoptsfn); /* * Copy parsed data into CacheMemoryContext. To guard against the @@ -2379,7 +2385,8 @@ RelationIdGetRelation(Oid relationId) * and we don't want to use the full-blown procedure because it's * a headache for indexes that reload itself depends on. 
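With the pg_get_indexdef_worker change above, the deparsed definition of a partitioned index carries ONLY unless the new "inherits" flag is passed, roughly:

    SELECT pg_get_indexdef('measurement_city_id_logdate_idx'::regclass);
    -- CREATE INDEX measurement_city_id_logdate_idx ON ONLY measurement
    --     USING btree (city_id, logdate)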
*/ - if (rd->rd_rel->relkind == RELKIND_INDEX) + if (rd->rd_rel->relkind == RELKIND_INDEX || + rd->rd_rel->relkind == RELKIND_PARTITIONED_INDEX) RelationReloadIndexInfo(rd); else RelationClearRelation(rd, true); @@ -2585,7 +2592,8 @@ RelationReloadIndexInfo(Relation relation) Form_pg_class relp; /* Should be called only for invalidated indexes */ - Assert(relation->rd_rel->relkind == RELKIND_INDEX && + Assert((relation->rd_rel->relkind == RELKIND_INDEX || + relation->rd_rel->relkind == RELKIND_PARTITIONED_INDEX) && !relation->rd_isvalid); /* Ensure it's closed at smgr level */ @@ -2816,7 +2824,8 @@ RelationClearRelation(Relation relation, bool rebuild) { RelationInitPhysicalAddr(relation); - if (relation->rd_rel->relkind == RELKIND_INDEX) + if (relation->rd_rel->relkind == RELKIND_INDEX || + relation->rd_rel->relkind == RELKIND_PARTITIONED_INDEX) { relation->rd_isvalid = false; /* needs to be revalidated */ if (relation->rd_refcnt > 1 && IsTransactionState()) @@ -2832,7 +2841,8 @@ RelationClearRelation(Relation relation, bool rebuild) * re-read the pg_class row to handle possible physical relocation of the * index, and we check for pg_index updates too. */ - if (relation->rd_rel->relkind == RELKIND_INDEX && + if ((relation->rd_rel->relkind == RELKIND_INDEX || + relation->rd_rel->relkind == RELKIND_PARTITIONED_INDEX) && relation->rd_refcnt > 0 && relation->rd_indexcxt != NULL) { @@ -5904,7 +5914,10 @@ load_relcache_init_file(bool shared) rel->rd_att->constr = constr; } - /* If it's an index, there's more to do */ + /* + * If it's an index, there's more to do. Note we explicitly ignore + * partitioned indexes here. + */ if (rel->rd_rel->relkind == RELKIND_INDEX) { MemoryContext indexcxt; @@ -6268,7 +6281,10 @@ write_relcache_init_file(bool shared) (rel->rd_options ? VARSIZE(rel->rd_options) : 0), fp); - /* If it's an index, there's more to do */ + /* + * If it's an index, there's more to do. Note we explicitly ignore + * partitioned indexes here. + */ if (rel->rd_rel->relkind == RELKIND_INDEX) { /* write the pg_index tuple */ diff --git a/src/bin/pg_dump/common.c b/src/bin/pg_dump/common.c index a03dd76c..0942fa5b 100644 --- a/src/bin/pg_dump/common.c +++ b/src/bin/pg_dump/common.c @@ -68,6 +68,7 @@ static int numextmembers; static void flagInhTables(TableInfo *tbinfo, int numTables, InhInfo *inhinfo, int numInherits); +static void flagInhIndexes(Archive *fout, TableInfo *tblinfo, int numTables); static void flagInhAttrs(DumpOptions *dopt, TableInfo *tblinfo, int numTables); static DumpableObject **buildIndexArray(void *objArray, int numObjs, Size objSize); @@ -76,6 +77,8 @@ static int ExtensionMemberIdCompare(const void *p1, const void *p2); static void findParentsByOid(TableInfo *self, InhInfo *inhinfo, int numInherits); static int strInArray(const char *pattern, char **arr, int arr_size); +static IndxInfo *findIndexByOid(Oid oid, DumpableObject **idxinfoindex, + int numIndexes); /* @@ -258,6 +261,10 @@ getSchemaData(Archive *fout, int *numTablesPtr) getIndexes(fout, tblinfo, numTables); if (g_verbose) + write_msg(NULL, "flagging indexes in partitioned tables\n"); + flagInhIndexes(fout, tblinfo, numTables); + + if (g_verbose) write_msg(NULL, "reading extended statistics\n"); getExtendedStatistics(fout, tblinfo, numTables); @@ -335,6 +342,89 @@ flagInhTables(TableInfo *tblinfo, int numTables, } } +/* + * flagInhIndexes - + * Create AttachIndexInfo objects for partitioned indexes, and add + * appropriate dependency links. 
+ */ +static void +flagInhIndexes(Archive *fout, TableInfo tblinfo[], int numTables) +{ + int i, + j, + k; + DumpableObject ***parentIndexArray; + + parentIndexArray = (DumpableObject ***) + pg_malloc0(getMaxDumpId() * sizeof(DumpableObject **)); + + for (i = 0; i < numTables; i++) + { + TableInfo *parenttbl; + IndexAttachInfo *attachinfo; + + if (!tblinfo[i].ispartition || tblinfo[i].numParents == 0) + continue; + + Assert(tblinfo[i].numParents == 1); + parenttbl = tblinfo[i].parents[0]; + + /* + * We need access to each parent table's index list, but there is no + * index to cover them outside of this function. To avoid having to + * sort every parent table's indexes each time we come across each of + * its partitions, create an indexed array for each parent the first + * time it is required. + */ + if (parentIndexArray[parenttbl->dobj.dumpId] == NULL) + parentIndexArray[parenttbl->dobj.dumpId] = + buildIndexArray(parenttbl->indexes, + parenttbl->numIndexes, + sizeof(IndxInfo)); + + attachinfo = (IndexAttachInfo *) + pg_malloc0(tblinfo[i].numIndexes * sizeof(IndexAttachInfo)); + for (j = 0, k = 0; j < tblinfo[i].numIndexes; j++) + { + IndxInfo *index = &(tblinfo[i].indexes[j]); + IndxInfo *parentidx; + + if (index->parentidx == 0) + continue; + + parentidx = findIndexByOid(index->parentidx, + parentIndexArray[parenttbl->dobj.dumpId], + parenttbl->numIndexes); + if (parentidx == NULL) + continue; + + attachinfo[k].dobj.objType = DO_INDEX_ATTACH; + attachinfo[k].dobj.catId.tableoid = 0; + attachinfo[k].dobj.catId.oid = 0; + AssignDumpId(&attachinfo[k].dobj); + attachinfo[k].dobj.name = pg_strdup(index->dobj.name); + attachinfo[k].parentIdx = parentidx; + attachinfo[k].partitionIdx = index; + + /* + * We want dependencies from parent to partition (so that the + * partition index is created first), and another one from + * attach object to parent (so that the partition index is + * attached once the parent index has been created). + */ + addObjectDependency(&parentidx->dobj, index->dobj.dumpId); + addObjectDependency(&attachinfo[k].dobj, parentidx->dobj.dumpId); + + k++; + } + } + + for (i = 0; i < numTables; i++) + if (parentIndexArray[i]) + pg_free(parentIndexArray[i]); + pg_free(parentIndexArray); +} + /* flagInhAttrs - * for each dumpable table in tblinfo, flag its inherited attributes * @@ -808,6 +898,18 @@ findExtensionByOid(Oid oid) return (ExtensionInfo *) findObjectByOid(oid, extinfoindex, numExtensions); } +/* + * findIndexByOid + * find the entry of the index with the given oid + * + * This one's signature is different from the previous ones because we lack a + * global array of all indexes, so caller must pass their array as argument. 
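With flagInhIndexes wiring these dependencies, a dump of the measurement example ends up containing statements along these lines (order simplified, schema qualification dropped; compare the TAP test expectations later in this patch):

    CREATE INDEX measurement_city_id_logdate_idx ON ONLY measurement
        USING btree (city_id, logdate);
    CREATE INDEX measurement_y2006m2_city_id_logdate_idx ON measurement_y2006m2
        USING btree (city_id, logdate);
    -- emitted as an "INDEX ATTACH" entry in the post-data section:
    ALTER INDEX measurement_city_id_logdate_idx
        ATTACH PARTITION measurement_y2006m2_city_id_logdate_idx;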
+ */ +static IndxInfo * +findIndexByOid(Oid oid, DumpableObject **idxinfoindex, int numIndexes) +{ + return (IndxInfo *) findObjectByOid(oid, idxinfoindex, numIndexes); +} /* * setExtensionMembership diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c index 97384c01..b11b02ae 100644 --- a/src/bin/pg_dump/pg_dump.c +++ b/src/bin/pg_dump/pg_dump.c @@ -200,6 +200,7 @@ static void dumpAttrDef(Archive *fout, AttrDefInfo *adinfo); static void dumpSequence(Archive *fout, TableInfo *tbinfo); static void dumpSequenceData(Archive *fout, TableDataInfo *tdinfo); static void dumpIndex(Archive *fout, IndxInfo *indxinfo); +static void dumpIndexAttach(Archive *fout, IndexAttachInfo *attachinfo); static void dumpStatisticsExt(Archive *fout, StatsExtInfo *statsextinfo); static void dumpConstraint(Archive *fout, ConstraintInfo *coninfo); static void dumpTableConstraintComment(Archive *fout, ConstraintInfo *coninfo); @@ -6770,6 +6771,7 @@ getIndexes(Archive *fout, TableInfo tblinfo[], int numTables) int i_tableoid, i_oid, i_indexname, + i_parentidx, i_indexdef, i_indnkeys, i_indkey, @@ -6791,15 +6793,17 @@ getIndexes(Archive *fout, TableInfo tblinfo[], int numTables) { TableInfo *tbinfo = &tblinfo[i]; - /* Only plain tables and materialized views have indexes. */ - if (tbinfo->relkind != RELKIND_RELATION && - tbinfo->relkind != RELKIND_MATVIEW) - continue; if (!tbinfo->hasindex) continue; - /* Ignore indexes of tables whose definitions are not to be dumped */ - if (!(tbinfo->dobj.dump & DUMP_COMPONENT_DEFINITION)) + /* + * Ignore indexes of tables whose definitions are not to be dumped. + * + * We also need indexes on partitioned tables which have partitions to + * be dumped, in order to dump the indexes on the partitions. + */ + if (!(tbinfo->dobj.dump & DUMP_COMPONENT_DEFINITION) && + !tbinfo->interesting) continue; if (g_verbose) @@ -6822,7 +6826,39 @@ getIndexes(Archive *fout, TableInfo tblinfo[], int numTables) * is not. 
*/ resetPQExpBuffer(query); - if (fout->remoteVersion >= 90400) + if (fout->remoteVersion >= 11000) + { + appendPQExpBuffer(query, + "SELECT t.tableoid, t.oid, " + "t.relname AS indexname, " + "inh.inhparent AS parentidx, " + "pg_catalog.pg_get_indexdef(i.indexrelid) AS indexdef, " + "t.relnatts AS indnkeys, " + "i.indkey, i.indisclustered, " + "i.indisreplident, t.relpages, " + "c.contype, c.conname, " + "c.condeferrable, c.condeferred, " + "c.tableoid AS contableoid, " + "c.oid AS conoid, " + "pg_catalog.pg_get_constraintdef(c.oid, false) AS condef, " + "(SELECT spcname FROM pg_catalog.pg_tablespace s WHERE s.oid = t.reltablespace) AS tablespace, " + "t.reloptions AS indreloptions " + "FROM pg_catalog.pg_index i " + "JOIN pg_catalog.pg_class t ON (t.oid = i.indexrelid) " + "JOIN pg_catalog.pg_class t2 ON (t2.oid = i.indrelid) " + "LEFT JOIN pg_catalog.pg_constraint c " + "ON (i.indrelid = c.conrelid AND " + "i.indexrelid = c.conindid AND " + "c.contype IN ('p','u','x')) " + "LEFT JOIN pg_catalog.pg_inherits inh " + "ON (inh.inhrelid = indexrelid) " + "WHERE i.indrelid = '%u'::pg_catalog.oid " + "AND (i.indisvalid OR t2.relkind = 'p') " + "AND i.indisready " + "ORDER BY indexname", + tbinfo->dobj.catId.oid); + } + else if (fout->remoteVersion >= 90400) { /* * the test on indisready is necessary in 9.2, and harmless in @@ -6831,6 +6867,7 @@ getIndexes(Archive *fout, TableInfo tblinfo[], int numTables) appendPQExpBuffer(query, "SELECT t.tableoid, t.oid, " "t.relname AS indexname, " + "0 AS parentidx, " "pg_catalog.pg_get_indexdef(i.indexrelid) AS indexdef, " "t.relnatts AS indnkeys, " "i.indkey, i.indisclustered, " @@ -6862,6 +6899,7 @@ getIndexes(Archive *fout, TableInfo tblinfo[], int numTables) appendPQExpBuffer(query, "SELECT t.tableoid, t.oid, " "t.relname AS indexname, " + "0 AS parentidx, " "pg_catalog.pg_get_indexdef(i.indexrelid) AS indexdef, " "t.relnatts AS indnkeys, " "i.indkey, i.indisclustered, " @@ -6889,6 +6927,7 @@ getIndexes(Archive *fout, TableInfo tblinfo[], int numTables) appendPQExpBuffer(query, "SELECT t.tableoid, t.oid, " "t.relname AS indexname, " + "0 AS parentidx, " "pg_catalog.pg_get_indexdef(i.indexrelid) AS indexdef, " "t.relnatts AS indnkeys, " "i.indkey, i.indisclustered, " @@ -6919,6 +6958,7 @@ getIndexes(Archive *fout, TableInfo tblinfo[], int numTables) appendPQExpBuffer(query, "SELECT t.tableoid, t.oid, " "t.relname AS indexname, " + "0 AS parentidx, " "pg_catalog.pg_get_indexdef(i.indexrelid) AS indexdef, " "t.relnatts AS indnkeys, " "i.indkey, i.indisclustered, " @@ -6951,6 +6991,7 @@ getIndexes(Archive *fout, TableInfo tblinfo[], int numTables) i_tableoid = PQfnumber(res, "tableoid"); i_oid = PQfnumber(res, "oid"); i_indexname = PQfnumber(res, "indexname"); + i_parentidx = PQfnumber(res, "parentidx"); i_indexdef = PQfnumber(res, "indexdef"); i_indnkeys = PQfnumber(res, "indnkeys"); i_indkey = PQfnumber(res, "indkey"); @@ -6967,8 +7008,10 @@ getIndexes(Archive *fout, TableInfo tblinfo[], int numTables) i_tablespace = PQfnumber(res, "tablespace"); i_indreloptions = PQfnumber(res, "indreloptions"); - indxinfo = (IndxInfo *) pg_malloc(ntups * sizeof(IndxInfo)); + tbinfo->indexes = indxinfo = + (IndxInfo *) pg_malloc(ntups * sizeof(IndxInfo)); constrinfo = (ConstraintInfo *) pg_malloc(ntups * sizeof(ConstraintInfo)); + tbinfo->numIndexes = ntups; for (j = 0; j < ntups; j++) { @@ -6978,6 +7021,7 @@ getIndexes(Archive *fout, TableInfo tblinfo[], int numTables) indxinfo[j].dobj.catId.tableoid = atooid(PQgetvalue(res, j, i_tableoid)); indxinfo[j].dobj.catId.oid 
= atooid(PQgetvalue(res, j, i_oid)); AssignDumpId(&indxinfo[j].dobj); + indxinfo[j].dobj.dump = tbinfo->dobj.dump; indxinfo[j].dobj.name = pg_strdup(PQgetvalue(res, j, i_indexname)); indxinfo[j].dobj.namespace = tbinfo->dobj.namespace; indxinfo[j].indextable = tbinfo; @@ -6990,6 +7034,7 @@ getIndexes(Archive *fout, TableInfo tblinfo[], int numTables) indxinfo[j].indkeys, indxinfo[j].indnkeys); indxinfo[j].indisclustered = (PQgetvalue(res, j, i_indisclustered)[0] == 't'); indxinfo[j].indisreplident = (PQgetvalue(res, j, i_indisreplident)[0] == 't'); + indxinfo[j].parentidx = atooid(PQgetvalue(res, j, i_parentidx)); indxinfo[j].relpages = atoi(PQgetvalue(res, j, i_relpages)); contype = *(PQgetvalue(res, j, i_contype)); @@ -7003,6 +7048,7 @@ getIndexes(Archive *fout, TableInfo tblinfo[], int numTables) constrinfo[j].dobj.catId.tableoid = atooid(PQgetvalue(res, j, i_contableoid)); constrinfo[j].dobj.catId.oid = atooid(PQgetvalue(res, j, i_conoid)); AssignDumpId(&constrinfo[j].dobj); + constrinfo[j].dobj.dump = tbinfo->dobj.dump; constrinfo[j].dobj.name = pg_strdup(PQgetvalue(res, j, i_conname)); constrinfo[j].dobj.namespace = tbinfo->dobj.namespace; constrinfo[j].contable = tbinfo; @@ -9773,6 +9819,9 @@ dumpDumpableObject(Archive *fout, DumpableObject *dobj) case DO_INDEX: dumpIndex(fout, (IndxInfo *) dobj); break; + case DO_INDEX_ATTACH: + dumpIndexAttach(fout, (IndexAttachInfo *) dobj); + break; case DO_STATSEXT: dumpStatisticsExt(fout, (StatsExtInfo *) dobj); break; @@ -16528,6 +16577,42 @@ dumpIndex(Archive *fout, IndxInfo *indxinfo) destroyPQExpBuffer(labelq); } +/* + * dumpIndexAttach + * write out to fout a partitioned-index attachment clause + */ +void +dumpIndexAttach(Archive *fout, IndexAttachInfo *attachinfo) +{ + if (fout->dopt->dataOnly) + return; + + if (attachinfo->partitionIdx->dobj.dump & DUMP_COMPONENT_DEFINITION) + { + PQExpBuffer q = createPQExpBuffer(); + + appendPQExpBuffer(q, "\nALTER INDEX %s ", + fmtQualifiedId(fout->remoteVersion, + attachinfo->parentIdx->dobj.namespace->dobj.name, + attachinfo->parentIdx->dobj.name)); + appendPQExpBuffer(q, "ATTACH PARTITION %s;\n", + fmtQualifiedId(fout->remoteVersion, + attachinfo->partitionIdx->dobj.namespace->dobj.name, + attachinfo->partitionIdx->dobj.name)); + + ArchiveEntry(fout, attachinfo->dobj.catId, attachinfo->dobj.dumpId, + attachinfo->dobj.name, + NULL, NULL, + "", + false, "INDEX ATTACH", SECTION_POST_DATA, + q->data, "", NULL, + NULL, 0, + NULL, NULL); + + destroyPQExpBuffer(q); + } +} + /* * dumpStatisticsExt * write out to fout an extended statistics object @@ -18188,6 +18273,7 @@ addBoundaryDependencies(DumpableObject **dobjs, int numObjs, addObjectDependency(postDataBound, dobj->dumpId); break; case DO_INDEX: + case DO_INDEX_ATTACH: case DO_STATSEXT: case DO_REFRESH_MATVIEW: case DO_TRIGGER: diff --git a/src/bin/pg_dump/pg_dump.h b/src/bin/pg_dump/pg_dump.h index 1cff1a8b..133a66a9 100644 --- a/src/bin/pg_dump/pg_dump.h +++ b/src/bin/pg_dump/pg_dump.h @@ -116,6 +116,7 @@ typedef enum DO_TABLE, DO_ATTRDEF, DO_INDEX, + DO_INDEX_ATTACH, DO_STATSEXT, DO_RULE, DO_TRIGGER, @@ -418,6 +419,8 @@ typedef struct _tableInfo */ int numParents; /* number of (immediate) parent tables */ struct _tableInfo **parents; /* TableInfos of immediate parents */ + int numIndexes; /* number of indexes */ + struct _indxInfo *indexes; /* indexes */ struct _tableDataInfo *dataObj; /* TableDataInfo, if dumping its data */ int numTriggers; /* number of triggers for table */ struct _triggerInfo *triggers; /* array of TriggerInfo structs */ @@ 
-451,11 +454,19 @@ typedef struct _indxInfo Oid *indkeys; bool indisclustered; bool indisreplident; + Oid parentidx; /* if partitioned, parent index OID */ /* if there is an associated constraint object, its dumpId: */ DumpId indexconstraint; int relpages; /* relpages of the underlying table */ } IndxInfo; +typedef struct _indexAttachInfo +{ + DumpableObject dobj; + IndxInfo *parentIdx; /* link to index on partitioned table */ + IndxInfo *partitionIdx; /* link to index on partition */ +} IndexAttachInfo; + typedef struct _statsExtInfo { DumpableObject dobj; diff --git a/src/bin/pg_dump/pg_dump_sort.c b/src/bin/pg_dump/pg_dump_sort.c index 3bf17ece..46b38750 100644 --- a/src/bin/pg_dump/pg_dump_sort.c +++ b/src/bin/pg_dump/pg_dump_sort.c @@ -35,6 +35,10 @@ static const char *modulename = gettext_noop("sorter"); * pg_dump.c; that is, PRE_DATA objects must sort before DO_PRE_DATA_BOUNDARY, * POST_DATA objects must sort after DO_POST_DATA_BOUNDARY, and DATA objects * must sort between them. + * + * Note: sortDataAndIndexObjectsBySize wants to have all DO_TABLE_DATA and + * DO_INDEX objects in contiguous chunks, so do not reuse the values for those + * for other object types. */ static const int dbObjectTypePriority[] = { @@ -53,11 +57,12 @@ static const int dbObjectTypePriority[] = 18, /* DO_TABLE */ 20, /* DO_ATTRDEF */ 28, /* DO_INDEX */ - 29, /* DO_STATSEXT */ - 30, /* DO_RULE */ - 31, /* DO_TRIGGER */ + 29, /* DO_INDEX_ATTACH */ + 30, /* DO_STATSEXT */ + 31, /* DO_RULE */ + 32, /* DO_TRIGGER */ 27, /* DO_CONSTRAINT */ - 32, /* DO_FK_CONSTRAINT */ + 33, /* DO_FK_CONSTRAINT */ 2, /* DO_PROCLANG */ 10, /* DO_CAST */ 23, /* DO_TABLE_DATA */ @@ -69,18 +74,18 @@ static const int dbObjectTypePriority[] = 15, /* DO_TSCONFIG */ 16, /* DO_FDW */ 17, /* DO_FOREIGN_SERVER */ - 32, /* DO_DEFAULT_ACL */ + 33, /* DO_DEFAULT_ACL */ 3, /* DO_TRANSFORM */ 21, /* DO_BLOB */ 25, /* DO_BLOB_DATA */ 22, /* DO_PRE_DATA_BOUNDARY */ 26, /* DO_POST_DATA_BOUNDARY */ - 33, /* DO_EVENT_TRIGGER */ - 38, /* DO_REFRESH_MATVIEW */ - 34, /* DO_POLICY */ - 35, /* DO_PUBLICATION */ - 36, /* DO_PUBLICATION_REL */ - 37 /* DO_SUBSCRIPTION */ + 34, /* DO_EVENT_TRIGGER */ + 39, /* DO_REFRESH_MATVIEW */ + 35, /* DO_POLICY */ + 36, /* DO_PUBLICATION */ + 37, /* DO_PUBLICATION_REL */ + 38 /* DO_SUBSCRIPTION */ }; static DumpId preDataBoundId; @@ -937,6 +942,13 @@ repairDomainConstraintMultiLoop(DumpableObject *domainobj, addObjectDependency(constraintobj, postDataBoundId); } +static void +repairIndexLoop(DumpableObject *partedindex, + DumpableObject *partindex) +{ + removeObjectDependency(partedindex, partindex->dumpId); +} + /* * Fix a dependency loop, or die trying ... 
* @@ -1099,6 +1111,23 @@ repairDependencyLoop(DumpableObject **loop, return; } + /* index on partitioned table and corresponding index on partition */ + if (nLoop == 2 && + loop[0]->objType == DO_INDEX && + loop[1]->objType == DO_INDEX) + { + if (((IndxInfo *) loop[0])->parentidx == loop[1]->catId.oid) + { + repairIndexLoop(loop[0], loop[1]); + return; + } + else if (((IndxInfo *) loop[1])->parentidx == loop[0]->catId.oid) + { + repairIndexLoop(loop[1], loop[0]); + return; + } + } + /* Indirect loop involving table and attribute default */ if (nLoop > 2) { @@ -1292,6 +1321,11 @@ describeDumpableObject(DumpableObject *obj, char *buf, int bufsize) "INDEX %s (ID %d OID %u)", obj->name, obj->dumpId, obj->catId.oid); return; + case DO_INDEX_ATTACH: + snprintf(buf, bufsize, + "INDEX ATTACH %s (ID %d)", + obj->name, obj->dumpId); + return; case DO_STATSEXT: snprintf(buf, bufsize, "STATISTICS %s (ID %d OID %u)", diff --git a/src/bin/pg_dump/t/002_pg_dump.pl b/src/bin/pg_dump/t/002_pg_dump.pl index c492fbdc..360d5954 100644 --- a/src/bin/pg_dump/t/002_pg_dump.pl +++ b/src/bin/pg_dump/t/002_pg_dump.pl @@ -5163,6 +5163,101 @@ section_pre_data => 1, test_schema_plus_blobs => 1, }, }, + 'CREATE INDEX ON ONLY measurement' => { + all_runs => 1, + catch_all => 'CREATE ... commands', + create_order => 92, + create_sql => 'CREATE INDEX ON dump_test.measurement (city_id, logdate);', + regexp => qr/^ + \QCREATE INDEX measurement_city_id_logdate_idx ON ONLY measurement USING\E + /xm, + like => { + binary_upgrade => 1, + clean => 1, + clean_if_exists => 1, + createdb => 1, + defaults => 1, + exclude_test_table => 1, + exclude_test_table_data => 1, + no_blobs => 1, + no_privs => 1, + no_owner => 1, + only_dump_test_schema => 1, + pg_dumpall_dbprivs => 1, + schema_only => 1, + section_post_data => 1, + test_schema_plus_blobs => 1, + with_oids => 1, }, + unlike => { + exclude_dump_test_schema => 1, + only_dump_test_table => 1, + pg_dumpall_globals => 1, + pg_dumpall_globals_clean => 1, + role => 1, + section_pre_data => 1, }, }, + + 'CREATE INDEX ... ON measurement_y2006_m2' => { + all_runs => 1, + catch_all => 'CREATE ... commands', + regexp => qr/^ + \QCREATE INDEX measurement_y2006m2_city_id_logdate_idx ON measurement_y2006m2 \E + /xm, + like => { + binary_upgrade => 1, + clean => 1, + clean_if_exists => 1, + createdb => 1, + defaults => 1, + exclude_dump_test_schema => 1, + exclude_test_table => 1, + exclude_test_table_data => 1, + no_blobs => 1, + no_privs => 1, + no_owner => 1, + pg_dumpall_dbprivs => 1, + role => 1, + schema_only => 1, + section_post_data => 1, + with_oids => 1, }, + unlike => { + only_dump_test_schema => 1, + only_dump_test_table => 1, + pg_dumpall_globals => 1, + pg_dumpall_globals_clean => 1, + section_pre_data => 1, + test_schema_plus_blobs => 1, }, }, + + 'ALTER INDEX ... ATTACH PARTITION' => { + all_runs => 1, + catch_all => 'CREATE ... 
commands', + regexp => qr/^ + \QALTER INDEX dump_test.measurement_city_id_logdate_idx ATTACH PARTITION dump_test_second_schema.measurement_y2006m2_city_id_logdate_idx\E + /xm, + like => { + binary_upgrade => 1, + clean => 1, + clean_if_exists => 1, + createdb => 1, + defaults => 1, + exclude_dump_test_schema => 1, + exclude_test_table => 1, + exclude_test_table_data => 1, + no_blobs => 1, + no_privs => 1, + no_owner => 1, + pg_dumpall_dbprivs => 1, + role => 1, + schema_only => 1, + section_post_data => 1, + with_oids => 1, }, + unlike => { + only_dump_test_schema => 1, + only_dump_test_table => 1, + pg_dumpall_globals => 1, + pg_dumpall_globals_clean => 1, + section_pre_data => 1, + test_schema_plus_blobs => 1, }, }, + 'CREATE VIEW test_view' => { all_runs => 1, catch_all => 'CREATE ... commands', diff --git a/src/bin/psql/describe.c b/src/bin/psql/describe.c index c67d2570..64bb8794 100644 --- a/src/bin/psql/describe.c +++ b/src/bin/psql/describe.c @@ -1622,7 +1622,8 @@ describeOneTableDetails(const char *schemaname, appendPQExpBufferStr(&buf, ",\n a.attidentity"); else appendPQExpBufferStr(&buf, ",\n ''::pg_catalog.char AS attidentity"); - if (tableinfo.relkind == RELKIND_INDEX) + if (tableinfo.relkind == RELKIND_INDEX || + tableinfo.relkind == RELKIND_PARTITIONED_INDEX) appendPQExpBufferStr(&buf, ",\n pg_catalog.pg_get_indexdef(a.attrelid, a.attnum, TRUE) AS indexdef"); else appendPQExpBufferStr(&buf, ",\n NULL AS indexdef"); @@ -1687,6 +1688,7 @@ describeOneTableDetails(const char *schemaname, schemaname, relationname); break; case RELKIND_INDEX: + case RELKIND_PARTITIONED_INDEX: if (tableinfo.relpersistence == 'u') printfPQExpBuffer(&title, _("Unlogged index \"%s.%s\""), schemaname, relationname); @@ -1747,7 +1749,8 @@ describeOneTableDetails(const char *schemaname, if (tableinfo.relkind == RELKIND_SEQUENCE) headers[cols++] = gettext_noop("Value"); - if (tableinfo.relkind == RELKIND_INDEX) + if (tableinfo.relkind == RELKIND_INDEX || + tableinfo.relkind == RELKIND_PARTITIONED_INDEX) headers[cols++] = gettext_noop("Definition"); if (tableinfo.relkind == RELKIND_FOREIGN_TABLE && pset.sversion >= 90200) @@ -1757,6 +1760,7 @@ describeOneTableDetails(const char *schemaname, { headers[cols++] = gettext_noop("Storage"); if (tableinfo.relkind == RELKIND_RELATION || + tableinfo.relkind == RELKIND_PARTITIONED_INDEX || tableinfo.relkind == RELKIND_MATVIEW || tableinfo.relkind == RELKIND_FOREIGN_TABLE || tableinfo.relkind == RELKIND_PARTITIONED_TABLE) @@ -1833,7 +1837,8 @@ describeOneTableDetails(const char *schemaname, printTableAddCell(&cont, seq_values[i], false, false); /* Expression for index column */ - if (tableinfo.relkind == RELKIND_INDEX) + if (tableinfo.relkind == RELKIND_INDEX || + tableinfo.relkind == RELKIND_PARTITIONED_INDEX) printTableAddCell(&cont, PQgetvalue(res, i, 7), false, false); /* FDW options for foreign table column, only for 9.2 or later */ @@ -1856,6 +1861,7 @@ describeOneTableDetails(const char *schemaname, /* Statistics target, if the relkind supports this feature */ if (tableinfo.relkind == RELKIND_RELATION || + tableinfo.relkind == RELKIND_PARTITIONED_INDEX || tableinfo.relkind == RELKIND_MATVIEW || tableinfo.relkind == RELKIND_FOREIGN_TABLE || tableinfo.relkind == RELKIND_PARTITIONED_TABLE) @@ -1945,7 +1951,8 @@ describeOneTableDetails(const char *schemaname, PQclear(result); } - if (tableinfo.relkind == RELKIND_INDEX) + if (tableinfo.relkind == RELKIND_INDEX || + tableinfo.relkind == RELKIND_PARTITIONED_INDEX) { /* Footer information about an index */ PGresult 
*result; @@ -3631,6 +3638,7 @@ listTables(const char *tabtypes, const char *pattern, bool verbose, bool showSys " WHEN 's' THEN '%s'" " WHEN " CppAsString2(RELKIND_FOREIGN_TABLE) " THEN '%s'" " WHEN " CppAsString2(RELKIND_PARTITIONED_TABLE) " THEN '%s'" + " WHEN " CppAsString2(RELKIND_PARTITIONED_INDEX) " THEN '%s'" " END as \"%s\",\n" " pg_catalog.pg_get_userbyid(c.relowner) as \"%s\"", gettext_noop("Schema"), @@ -3643,6 +3651,7 @@ listTables(const char *tabtypes, const char *pattern, bool verbose, bool showSys gettext_noop("special"), gettext_noop("foreign table"), gettext_noop("table"), /* partitioned table */ + gettext_noop("index"), /* partitioned index */ gettext_noop("Type"), gettext_noop("Owner")); @@ -3699,7 +3708,8 @@ listTables(const char *tabtypes, const char *pattern, bool verbose, bool showSys if (showMatViews) appendPQExpBufferStr(&buf, CppAsString2(RELKIND_MATVIEW) ","); if (showIndexes) - appendPQExpBufferStr(&buf, CppAsString2(RELKIND_INDEX) ","); + appendPQExpBufferStr(&buf, CppAsString2(RELKIND_INDEX) "," + CppAsString2(RELKIND_PARTITIONED_INDEX) ","); if (showSeq) appendPQExpBufferStr(&buf, CppAsString2(RELKIND_SEQUENCE) ","); if (showSystem || pattern) diff --git a/src/bin/psql/tab-complete.c b/src/bin/psql/tab-complete.c index fd1c4a5e..db21cc50 100644 --- a/src/bin/psql/tab-complete.c +++ b/src/bin/psql/tab-complete.c @@ -412,7 +412,8 @@ static const SchemaQuery Query_for_list_of_indexes = { /* catname */ "pg_catalog.pg_class c", /* selcondition */ - "c.relkind IN (" CppAsString2(RELKIND_INDEX) ")", + "c.relkind IN (" CppAsString2(RELKIND_INDEX) ", " + CppAsString2(RELKIND_PARTITIONED_INDEX) ")", /* viscondition */ "pg_catalog.pg_table_is_visible(c.oid)", /* namespace */ @@ -604,6 +605,23 @@ static const SchemaQuery Query_for_list_of_tmf = { NULL }; +static const SchemaQuery Query_for_list_of_tpm = { + /* catname */ + "pg_catalog.pg_class c", + /* selcondition */ + "c.relkind IN (" CppAsString2(RELKIND_RELATION) ", " + CppAsString2(RELKIND_PARTITIONED_TABLE) ", " + CppAsString2(RELKIND_MATVIEW) ")", + /* viscondition */ + "pg_catalog.pg_table_is_visible(c.oid)", + /* namespace */ + "c.relnamespace", + /* result */ + "pg_catalog.quote_ident(c.relname)", + /* qualresult */ + NULL +}; + static const SchemaQuery Query_for_list_of_tm = { /* catname */ "pg_catalog.pg_class c", @@ -1706,7 +1724,12 @@ psql_completion(const char *text, int start, int end) "UNION SELECT 'ALL IN TABLESPACE'"); /* ALTER INDEX */ else if (Matches3("ALTER", "INDEX", MatchAny)) - COMPLETE_WITH_LIST4("OWNER TO", "RENAME TO", "SET", "RESET"); + COMPLETE_WITH_LIST5("OWNER TO", "RENAME TO", "SET", + "RESET", "ATTACH PARTITION"); + else if (Matches4("ALTER", "INDEX", MatchAny, "ATTACH")) + COMPLETE_WITH_CONST("PARTITION"); + else if (Matches5("ALTER", "INDEX", MatchAny, "ATTACH", "PARTITION")) + COMPLETE_WITH_SCHEMA_QUERY(Query_for_list_of_indexes, NULL); /* ALTER INDEX SET */ else if (Matches4("ALTER", "INDEX", MatchAny, "SET")) COMPLETE_WITH_LIST2("(", "TABLESPACE"); @@ -2372,10 +2395,13 @@ psql_completion(const char *text, int start, int end) COMPLETE_WITH_SCHEMA_QUERY(Query_for_list_of_indexes, " UNION SELECT 'ON'" " UNION SELECT 'CONCURRENTLY'"); - /* Complete ... INDEX|CONCURRENTLY [] ON with a list of tables */ + /* + * Complete ... 
INDEX|CONCURRENTLY [] ON with a list of relations + * that can indexes can be created on + */ else if (TailMatches3("INDEX|CONCURRENTLY", MatchAny, "ON") || TailMatches2("INDEX|CONCURRENTLY", "ON")) - COMPLETE_WITH_SCHEMA_QUERY(Query_for_list_of_tm, NULL); + COMPLETE_WITH_SCHEMA_QUERY(Query_for_list_of_tpm, NULL); /* * Complete CREATE|UNIQUE INDEX CONCURRENTLY with "ON" and existing diff --git a/src/include/catalog/dependency.h b/src/include/catalog/dependency.h index 6af0f85d..c2c0c9b1 100644 --- a/src/include/catalog/dependency.h +++ b/src/include/catalog/dependency.h @@ -110,6 +110,20 @@ * Example: a trigger that's created to enforce a foreign-key constraint * is made internally dependent on the constraint's pg_constraint entry. * + * DEPENDENCY_INTERNAL_AUTO ('I'): the dependent object was created as + * part of creation of the referenced object, and is really just a part + * of its internal implementation. A DROP of the dependent object will + * be disallowed outright (we'll tell the user to issue a DROP against the + * referenced object, instead). While a regular internal dependency will + * prevent the dependent object from being dropped while any such + * dependencies remain, DEPENDENCY_INTERNAL_AUTO will allow such a drop as + * long as the object can be found by following any of such dependencies. + * Example: an index on a partition is made internal-auto-dependent on + * both the partition itself as well as on the index on the parent + * partitioned table; so the partition index is dropped together with + * either the partition it indexes, or with the parent index it is attached + * to. + * DEPENDENCY_EXTENSION ('e'): the dependent object is a member of the * extension that is the referenced object. The dependent object can be * dropped only via DROP EXTENSION on the referenced object. 
Functionally @@ -136,6 +150,7 @@ typedef enum DependencyType DEPENDENCY_NORMAL = 'n', DEPENDENCY_AUTO = 'a', DEPENDENCY_INTERNAL = 'i', + DEPENDENCY_INTERNAL_AUTO = 'I', DEPENDENCY_EXTENSION = 'e', DEPENDENCY_AUTO_EXTENSION = 'x', DEPENDENCY_PIN = 'p' diff --git a/src/include/catalog/index.h b/src/include/catalog/index.h index 4928dfd1..3a7ed05f 100644 --- a/src/include/catalog/index.h +++ b/src/include/catalog/index.h @@ -108,10 +108,13 @@ extern void index_check_primary_key(Relation heapRel, #define INDEX_CREATE_SKIP_BUILD (1 << 2) #define INDEX_CREATE_CONCURRENT (1 << 3) #define INDEX_CREATE_IF_NOT_EXISTS (1 << 4) +#define INDEX_CREATE_PARTITIONED (1 << 5) +#define INDEX_CREATE_INVALID (1 << 6) extern Oid index_create(Relation heapRelation, const char *indexRelationName, Oid indexRelationId, + Oid parentIndexRelid, Oid relFileNode, IndexInfo *indexInfo, List *indexColNames, @@ -145,6 +148,11 @@ extern void index_drop(Oid indexId, bool concurrent); extern IndexInfo *BuildIndexInfo(Relation index); +extern bool CompareIndexInfo(IndexInfo *info1, IndexInfo *info2, + Oid *collations1, Oid *collations2, + Oid *opfamilies1, Oid *opfamilies2, + AttrNumber *attmap, int maplen); + extern void BuildSpeculativeIndexInfo(Relation index, IndexInfo *ii); extern void FormIndexDatum(IndexInfo *indexInfo, @@ -199,4 +207,6 @@ extern Oid IndexGetRelation(Oid indexId, bool missing_ok); extern bool index_is_interval(Oid indexId); #endif +extern void IndexSetParentIndex(Relation idx, Oid parentOid); + #endif /* INDEX_H */ diff --git a/src/include/catalog/pg_class.h b/src/include/catalog/pg_class.h index 15929163..4ad3a5a6 100644 --- a/src/include/catalog/pg_class.h +++ b/src/include/catalog/pg_class.h @@ -183,6 +183,7 @@ DESCR(""); #define RELKIND_COMPOSITE_TYPE 'c' /* composite type */ #define RELKIND_FOREIGN_TABLE 'f' /* foreign table */ #define RELKIND_PARTITIONED_TABLE 'p' /* partitioned table */ +#define RELKIND_PARTITIONED_INDEX 'I' /* partitioned index */ #define RELPERSISTENCE_PERMANENT 'p' /* regular table */ #define RELPERSISTENCE_UNLOGGED 'u' /* unlogged permanent table */ diff --git a/src/include/catalog/pg_inherits_fn.h b/src/include/catalog/pg_inherits_fn.h index 3c371890..09663312 100644 --- a/src/include/catalog/pg_inherits_fn.h +++ b/src/include/catalog/pg_inherits_fn.h @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * * pg_inherits_fn.h - * prototypes for functions in catalog/pg_inherits.c + * prototypes for functions in catalog/pg_inherits.c * * * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group @@ -19,9 +19,12 @@ extern List *find_inheritance_children(Oid parentrelId, LOCKMODE lockmode); extern List *find_all_inheritors(Oid parentrelId, LOCKMODE lockmode, - List **parents); + List **parents); extern bool has_subclass(Oid relationId); extern bool has_superclass(Oid relationId); extern bool typeInheritsFrom(Oid subclassTypeId, Oid superclassTypeId); +extern void StoreSingleInheritance(Oid relationId, Oid parentOid, + int32 seqNumber); +extern bool DeleteInheritsTuple(Oid inhrelid, Oid inhparent); -#endif /* PG_INHERITS_FN_H */ +#endif /* PG_INHERITS_FN_H */ diff --git a/src/include/commands/defrem.h b/src/include/commands/defrem.h index 2e4f2c44..377f9f94 100644 --- a/src/include/commands/defrem.h +++ b/src/include/commands/defrem.h @@ -25,12 +25,13 @@ extern void RemoveObjects(DropStmt *stmt); extern ObjectAddress DefineIndex(Oid relationId, IndexStmt *stmt, Oid indexRelationId, + Oid parentIndexId, bool is_alter_table, bool 
check_rights, bool check_not_in_use, bool skip_build, bool quiet); -extern Oid ReindexIndex(RangeVar *indexRelation, int options); +extern void ReindexIndex(RangeVar *indexRelation, int options); extern Oid ReindexTable(RangeVar *relation, int options); extern void ReindexMultipleTables(const char *objectName, ReindexObjectType objectKind, int options); diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index bed56a23..ddb99ddf 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -152,6 +152,7 @@ typedef struct IndexInfo bool ii_ReadyForInserts; bool ii_Concurrent; bool ii_BrokenHotChain; + Oid ii_Am; void *ii_AmCache; MemoryContext ii_Context; } IndexInfo; diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h index acc64eb0..df2746c9 100644 --- a/src/include/nodes/parsenodes.h +++ b/src/include/nodes/parsenodes.h @@ -918,7 +918,7 @@ typedef struct PartitionRangeDatum } PartitionRangeDatum; /* - * PartitionCmd - info for ALTER TABLE ATTACH/DETACH PARTITION commands + * PartitionCmd - info for ALTER TABLE/INDEX ATTACH/DETACH PARTITION commands */ typedef struct PartitionCmd { @@ -2859,6 +2859,10 @@ typedef struct FetchStmt * index, just a UNIQUE/PKEY constraint using an existing index. isconstraint * must always be true in this case, and the fields describing the index * properties are empty. + * + * The relation to build the index on can be represented either by name + * (in which case the RangeVar indicates whether to recurse or not) or by OID + * (in which case the command is always recursive). * ---------------------- */ typedef struct IndexStmt @@ -2866,6 +2870,7 @@ typedef struct IndexStmt NodeTag type; char *idxname; /* name of new index, or NULL for default */ RangeVar *relation; /* relation to build index on */ + Oid relationId; /* OID of relation to build index on */ char *accessMethod; /* name of access method (eg. 
btree) */ char *tableSpace; /* tablespace, or NULL for default */ List *indexParams; /* columns to index: a list of IndexElem */ diff --git a/src/include/parser/parse_utilcmd.h b/src/include/parser/parse_utilcmd.h index adde3238..e527a119 100644 --- a/src/include/parser/parse_utilcmd.h +++ b/src/include/parser/parse_utilcmd.h @@ -101,5 +101,8 @@ extern bool CheckLocalIndexColumn (char loctype, char *partcolname, char *indexc #endif extern PartitionBoundSpec *transformPartitionBound(ParseState *pstate, Relation parent, PartitionBoundSpec *spec); +extern IndexStmt *generateClonedIndexStmt(RangeVar *heapRel, Oid heapOid, + Relation source_idx, + const AttrNumber *attmap, int attmap_length); #endif /* PARSE_UTILCMD_H */ diff --git a/src/test/regress/expected/alter_table.out b/src/test/regress/expected/alter_table.out index 9ace1f7a..626f34f7 100644 --- a/src/test/regress/expected/alter_table.out +++ b/src/test/regress/expected/alter_table.out @@ -1967,6 +1967,67 @@ create table tab1 (a int, b text); create table tab2 (x int, y tab1); alter table tab1 alter column b type varchar; -- fails ERROR: cannot alter table "tab1" because column "tab2.y" uses its row type +-- Alter column type that's part of a partitioned index +create table at_partitioned (a int, b text) partition by range (a); +create table at_part_1 partition of at_partitioned for values from (0) to (1000); +insert into at_partitioned values (512, '0.123'); +create table at_part_2 (b text, a int); +insert into at_part_2 values ('1.234', 1024); +create index on at_partitioned (b); +create index on at_partitioned (a); +\d at_part_1 + Table "public.at_part_1" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | text | | | +Partition of: at_partitioned FOR VALUES FROM (0) TO (1000) +Indexes: + "at_part_1_a_idx" btree (a) + "at_part_1_b_idx" btree (b) + +\d at_part_2 + Table "public.at_part_2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + b | text | | | + a | integer | | | + +alter table at_partitioned attach partition at_part_2 for values from (1000) to (2000); +\d at_part_2 + Table "public.at_part_2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + b | text | | | + a | integer | | | +Partition of: at_partitioned FOR VALUES FROM (1000) TO (2000) +Indexes: + "at_part_2_a_idx" btree (a) + "at_part_2_b_idx" btree (b) + +alter table at_partitioned alter column b type numeric using b::numeric; +\d at_part_1 + Table "public.at_part_1" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | numeric | | | +Partition of: at_partitioned FOR VALUES FROM (0) TO (1000) +Indexes: + "at_part_1_a_idx" btree (a) + "at_part_1_b_idx" btree (b) + +\d at_part_2 + Table "public.at_part_2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + b | numeric | | | + a | integer | | | +Partition of: at_partitioned FOR VALUES FROM (1000) TO (2000) +Indexes: + "at_part_2_a_idx" btree (a) + "at_part_2_b_idx" btree (b) + -- disallow recursive containment of row types create temp table recur1 (f1 int); alter table recur1 add column f2 recur1; -- fails @@ -3168,7 +3229,7 @@ CREATE TABLE unparted ( ); CREATE TABLE fail_part (like unparted); ALTER TABLE unparted ATTACH PARTITION fail_part FOR VALUES IN ('a'); -ERROR: "unparted" is not partitioned +ERROR: table 
"unparted" is not partitioned DROP TABLE unparted, fail_part; -- check that partition bound is compatible CREATE TABLE list_parted ( @@ -3563,7 +3624,7 @@ DROP TABLE fail_part; -- check that the table is partitioned at all CREATE TABLE regular_table (a int); ALTER TABLE regular_table DETACH PARTITION any_name; -ERROR: "regular_table" is not partitioned +ERROR: table "regular_table" is not partitioned DROP TABLE regular_table; -- check that the partition being detached exists at all ALTER TABLE list_parted2 DETACH PARTITION part_4; diff --git a/src/test/regress/expected/indexing.out b/src/test/regress/expected/indexing.out new file mode 100644 index 00000000..e9cccca8 --- /dev/null +++ b/src/test/regress/expected/indexing.out @@ -0,0 +1,757 @@ +-- Creating an index on a partitioned table makes the partitions +-- automatically get the index +create table idxpart (a int, b int, c text) partition by range (a); +create table idxpart1 partition of idxpart for values from (0) to (10); +create table idxpart2 partition of idxpart for values from (10) to (100) + partition by range (b); +create table idxpart21 partition of idxpart2 for values from (0) to (100); +create index on idxpart (a); +select relname, relkind, inhparent::regclass + from pg_class left join pg_index ix on (indexrelid = oid) + left join pg_inherits on (ix.indexrelid = inhrelid) + where relname like 'idxpart%' order by relname; + relname | relkind | inhparent +-----------------+---------+---------------- + idxpart | p | + idxpart1 | r | + idxpart1_a_idx | i | idxpart_a_idx + idxpart2 | p | + idxpart21 | r | + idxpart21_a_idx | i | idxpart2_a_idx + idxpart2_a_idx | I | idxpart_a_idx + idxpart_a_idx | I | +(8 rows) + +drop table idxpart; +-- Some unsupported features +create table idxpart (a int, b int, c text) partition by range (a); +create table idxpart1 partition of idxpart for values from (0) to (10); +create unique index on idxpart (a); +ERROR: cannot create unique index on partitioned table "idxpart" +create index concurrently on idxpart (a); +ERROR: cannot create index on partitioned table "idxpart" concurrently +drop table idxpart; +-- If a table without index is attached as partition to a table with +-- an index, the index is automatically created +create table idxpart (a int, b int, c text) partition by range (a); +create index idxparti on idxpart (a); +create index idxparti2 on idxpart (b, c); +create table idxpart1 (like idxpart); +\d idxpart1 + Table "public.idxpart1" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | integer | | | + c | text | | | + +alter table idxpart attach partition idxpart1 for values from (0) to (10); +\d idxpart1 + Table "public.idxpart1" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | integer | | | + c | text | | | +Partition of: idxpart FOR VALUES FROM (0) TO (10) +Indexes: + "idxpart1_a_idx" btree (a) + "idxpart1_b_c_idx" btree (b, c) + +drop table idxpart; +-- If a partition already has an index, don't create a duplicative one +create table idxpart (a int, b int) partition by range (a, b); +create table idxpart1 partition of idxpart for values from (0, 0) to (10, 10); +create index on idxpart1 (a, b); +create index on idxpart (a, b); +\d idxpart1 + Table "public.idxpart1" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | integer | | | +Partition of: idxpart FOR 
VALUES FROM (0, 0) TO (10, 10) +Indexes: + "idxpart1_a_b_idx" btree (a, b) + +select relname, relkind, inhparent::regclass + from pg_class left join pg_index ix on (indexrelid = oid) + left join pg_inherits on (ix.indexrelid = inhrelid) + where relname like 'idxpart%' order by relname; + relname | relkind | inhparent +------------------+---------+----------------- + idxpart | p | + idxpart1 | r | + idxpart1_a_b_idx | i | idxpart_a_b_idx + idxpart_a_b_idx | I | +(4 rows) + +drop table idxpart; +-- DROP behavior for partitioned indexes +create table idxpart (a int) partition by range (a); +create index on idxpart (a); +create table idxpart1 partition of idxpart for values from (0) to (10); +drop index idxpart1_a_idx; -- no way +ERROR: cannot drop index idxpart1_a_idx because index idxpart_a_idx requires it +HINT: You can drop index idxpart_a_idx instead. +drop index idxpart_a_idx; -- both indexes go away +select relname, relkind from pg_class + where relname like 'idxpart%' order by relname; + relname | relkind +----------+--------- + idxpart | p + idxpart1 | r +(2 rows) + +create index on idxpart (a); +drop table idxpart1; -- the index on partition goes away too +select relname, relkind from pg_class + where relname like 'idxpart%' order by relname; + relname | relkind +---------------+--------- + idxpart | p + idxpart_a_idx | I +(2 rows) + +drop table idxpart; +-- ALTER INDEX .. ATTACH, error cases +create table idxpart (a int, b int) partition by range (a, b); +create table idxpart1 partition of idxpart for values from (0, 0) to (10, 10); +create index idxpart_a_b_idx on only idxpart (a, b); +create index idxpart1_a_b_idx on idxpart1 (a, b); +create index idxpart1_tst1 on idxpart1 (b, a); +create index idxpart1_tst2 on idxpart1 using hash (a); +create index idxpart1_tst3 on idxpart1 (a, b) where a > 10; +alter index idxpart attach partition idxpart1; +ERROR: "idxpart" is not an index +alter index idxpart_a_b_idx attach partition idxpart1; +ERROR: "idxpart1" is not an index +alter index idxpart_a_b_idx attach partition idxpart_a_b_idx; +ERROR: cannot attach index "idxpart_a_b_idx" as a partition of index "idxpart_a_b_idx" +DETAIL: Index "idxpart_a_b_idx" is not an index on any partition of table "idxpart". +alter index idxpart_a_b_idx attach partition idxpart1_b_idx; +ERROR: relation "idxpart1_b_idx" does not exist +alter index idxpart_a_b_idx attach partition idxpart1_tst1; +ERROR: cannot attach index "idxpart1_tst1" as a partition of index "idxpart_a_b_idx" +DETAIL: The index definitions do not match. +alter index idxpart_a_b_idx attach partition idxpart1_tst2; +ERROR: cannot attach index "idxpart1_tst2" as a partition of index "idxpart_a_b_idx" +DETAIL: The index definitions do not match. +alter index idxpart_a_b_idx attach partition idxpart1_tst3; +ERROR: cannot attach index "idxpart1_tst3" as a partition of index "idxpart_a_b_idx" +DETAIL: The index definitions do not match. +-- OK +alter index idxpart_a_b_idx attach partition idxpart1_a_b_idx; +alter index idxpart_a_b_idx attach partition idxpart1_a_b_idx; -- quiet +-- reject dupe +create index idxpart1_2_a_b on idxpart1 (a, b); +alter index idxpart_a_b_idx attach partition idxpart1_2_a_b; +ERROR: cannot attach index "idxpart1_2_a_b" as a partition of index "idxpart_a_b_idx" +DETAIL: Another index is already attached for partition "idxpart1". 
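-- (Editorial aside, not part of the captured test output: a minimal sketch of the
-- successful ALTER INDEX ... ATTACH PARTITION workflow that the error cases above
-- are probing.  All object names here are hypothetical.)
create table sk_parted (a int, b int) partition by range (a, b);
create table sk_part1 partition of sk_parted for values from (0, 0) to (10, 10);
create index sk_parted_a_b_idx on only sk_parted (a, b);   -- ON ONLY: created INVALID on the parent
create index sk_part1_a_b_idx on sk_part1 (a, b);          -- matching index on the partition
alter index sk_parted_a_b_idx attach partition sk_part1_a_b_idx;  -- parent index becomes valid
drop table sk_parted;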
+drop table idxpart; +-- make sure everything's gone +select indexrelid::regclass, indrelid::regclass + from pg_index where indexrelid::regclass::text like 'idxpart%'; + indexrelid | indrelid +------------+---------- +(0 rows) + +-- Don't auto-attach incompatible indexes +create table idxpart (a int, b int) partition by range (a); +create table idxpart1 (a int, b int); +create index on idxpart1 using hash (a); +create index on idxpart1 (a) where b > 1; +create index on idxpart1 ((a + 0)); +create index on idxpart1 (a, a); +create index on idxpart (a); +alter table idxpart attach partition idxpart1 for values from (0) to (1000); +\d idxpart1 + Table "public.idxpart1" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | integer | | | +Partition of: idxpart FOR VALUES FROM (0) TO (1000) +Indexes: + "idxpart1_a_a1_idx" btree (a, a) + "idxpart1_a_idx" hash (a) + "idxpart1_a_idx1" btree (a) WHERE b > 1 + "idxpart1_a_idx2" btree (a) + "idxpart1_expr_idx" btree ((a + 0)) + +drop table idxpart; +-- If CREATE INDEX ONLY, don't create indexes on partitions; and existing +-- indexes on partitions don't change parent. ALTER INDEX ATTACH can change +-- the parent after the fact. +create table idxpart (a int) partition by range (a); +create table idxpart1 partition of idxpart for values from (0) to (100); +create table idxpart2 partition of idxpart for values from (100) to (1000) + partition by range (a); +create table idxpart21 partition of idxpart2 for values from (100) to (200); +create table idxpart22 partition of idxpart2 for values from (200) to (300); +create index on idxpart22 (a); +create index on only idxpart2 (a); +create index on idxpart (a); +-- Here we expect that idxpart1 and idxpart2 have a new index, but idxpart21 +-- does not; also, idxpart22 is not attached. +\d idxpart1 + Table "public.idxpart1" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | +Partition of: idxpart FOR VALUES FROM (0) TO (100) +Indexes: + "idxpart1_a_idx" btree (a) + +\d idxpart2 + Table "public.idxpart2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | +Partition of: idxpart FOR VALUES FROM (100) TO (1000) +Partition key: RANGE (a) +Indexes: + "idxpart2_a_idx" btree (a) INVALID +Number of partitions: 2 (Use \d+ to list them.) 
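-- (Aside, not part of the captured output: idxpart2_a_idx shows as INVALID above
-- because it was created with CREATE INDEX ... ON ONLY and no index has yet been
-- attached for its partitions idxpart21 and idxpart22.  An illustrative way to
-- check validity from the catalogs:)
select c.relname, i.indisvalid
  from pg_class c
  join pg_index i on i.indexrelid = c.oid
 where c.relname = 'idxpart2_a_idx';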
+ +\d idxpart21 + Table "public.idxpart21" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | +Partition of: idxpart2 FOR VALUES FROM (100) TO (200) + +select indexrelid::regclass, indrelid::regclass, inhparent::regclass + from pg_index idx left join pg_inherits inh on (idx.indexrelid = inh.inhrelid) +where indexrelid::regclass::text like 'idxpart%' + order by indrelid::regclass::text collate "C"; + indexrelid | indrelid | inhparent +-----------------+-----------+--------------- + idxpart_a_idx | idxpart | + idxpart1_a_idx | idxpart1 | idxpart_a_idx + idxpart2_a_idx | idxpart2 | idxpart_a_idx + idxpart22_a_idx | idxpart22 | +(4 rows) + +alter index idxpart2_a_idx attach partition idxpart22_a_idx; +select indexrelid::regclass, indrelid::regclass, inhparent::regclass + from pg_index idx left join pg_inherits inh on (idx.indexrelid = inh.inhrelid) +where indexrelid::regclass::text like 'idxpart%' + order by indrelid::regclass::text collate "C"; + indexrelid | indrelid | inhparent +-----------------+-----------+---------------- + idxpart_a_idx | idxpart | + idxpart1_a_idx | idxpart1 | idxpart_a_idx + idxpart2_a_idx | idxpart2 | idxpart_a_idx + idxpart22_a_idx | idxpart22 | idxpart2_a_idx +(4 rows) + +-- attaching idxpart22 is not enough to set idxpart22_a_idx valid ... +alter index idxpart2_a_idx attach partition idxpart22_a_idx; +\d idxpart2 + Table "public.idxpart2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | +Partition of: idxpart FOR VALUES FROM (100) TO (1000) +Partition key: RANGE (a) +Indexes: + "idxpart2_a_idx" btree (a) INVALID +Number of partitions: 2 (Use \d+ to list them.) + +-- ... but this one is. +create index on idxpart21 (a); +alter index idxpart2_a_idx attach partition idxpart21_a_idx; +\d idxpart2 + Table "public.idxpart2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | +Partition of: idxpart FOR VALUES FROM (100) TO (1000) +Partition key: RANGE (a) +Indexes: + "idxpart2_a_idx" btree (a) +Number of partitions: 2 (Use \d+ to list them.) + +drop table idxpart; +-- When a table is attached a partition and it already has an index, a +-- duplicate index should not get created, but rather the index becomes +-- attached to the parent's index. 
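-- (Compact, hypothetical illustration of the rule stated above; the actual
-- regression test follows below.)
create table demo_parted (a int) partition by range (a);
create index demo_parted_a_idx on demo_parted (a);
create table demo_part1 (a int);
create index demo_part1_a_idx on demo_part1 (a);   -- pre-existing, matching index
alter table demo_parted attach partition demo_part1 for values from (0) to (10);
-- demo_part1_a_idx is now attached under demo_parted_a_idx; no duplicate index is created.
drop table demo_parted;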
+create table idxpart (a int, b int, c text) partition by range (a); +create index idxparti on idxpart (a); +create index idxparti2 on idxpart (b, c); +create table idxpart1 (like idxpart including indexes); +\d idxpart1 + Table "public.idxpart1" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | integer | | | + c | text | | | +Indexes: + "idxpart1_a_idx" btree (a) + "idxpart1_b_c_idx" btree (b, c) + +select relname, relkind, inhparent::regclass + from pg_class left join pg_index ix on (indexrelid = oid) + left join pg_inherits on (ix.indexrelid = inhrelid) + where relname like 'idxpart%' order by relname; + relname | relkind | inhparent +------------------+---------+----------- + idxpart | p | + idxpart1 | r | + idxpart1_a_idx | i | + idxpart1_b_c_idx | i | + idxparti | I | + idxparti2 | I | +(6 rows) + +alter table idxpart attach partition idxpart1 for values from (0) to (10); +\d idxpart1 + Table "public.idxpart1" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | integer | | | + c | text | | | +Partition of: idxpart FOR VALUES FROM (0) TO (10) +Indexes: + "idxpart1_a_idx" btree (a) + "idxpart1_b_c_idx" btree (b, c) + +select relname, relkind, inhparent::regclass + from pg_class left join pg_index ix on (indexrelid = oid) + left join pg_inherits on (ix.indexrelid = inhrelid) + where relname like 'idxpart%' order by relname; + relname | relkind | inhparent +------------------+---------+----------- + idxpart | p | + idxpart1 | r | + idxpart1_a_idx | i | idxparti + idxpart1_b_c_idx | i | idxparti2 + idxparti | I | + idxparti2 | I | +(6 rows) + +drop table idxpart; +-- Verify that attaching an invalid index does not mark the parent index valid. 
+-- On the other hand, attaching a valid index marks not only its direct +-- ancestor valid, but also any indirect ancestor that was only missing the one +-- that was just made valid +create table idxpart (a int, b int) partition by range (a); +create table idxpart1 partition of idxpart for values from (1) to (1000) partition by range (a); +create table idxpart11 partition of idxpart1 for values from (1) to (100); +create index on only idxpart1 (a); +create index on only idxpart (a); +-- this results in two invalid indexes: +select relname, indisvalid from pg_class join pg_index on indexrelid = oid + where relname like 'idxpart%' order by relname; + relname | indisvalid +----------------+------------ + idxpart1_a_idx | f + idxpart_a_idx | f +(2 rows) + +-- idxpart1_a_idx is not valid, so idxpart_a_idx should not become valid: +alter index idxpart_a_idx attach partition idxpart1_a_idx; +select relname, indisvalid from pg_class join pg_index on indexrelid = oid + where relname like 'idxpart%' order by relname; + relname | indisvalid +----------------+------------ + idxpart1_a_idx | f + idxpart_a_idx | f +(2 rows) + +-- after creating and attaching this, both idxpart1_a_idx and idxpart_a_idx +-- should become valid +create index on idxpart11 (a); +alter index idxpart1_a_idx attach partition idxpart11_a_idx; +select relname, indisvalid from pg_class join pg_index on indexrelid = oid + where relname like 'idxpart%' order by relname; + relname | indisvalid +-----------------+------------ + idxpart11_a_idx | t + idxpart1_a_idx | t + idxpart_a_idx | t +(3 rows) + +drop table idxpart; +-- verify dependency handling during ALTER TABLE DETACH PARTITION +create table idxpart (a int) partition by range (a); +create table idxpart1 (like idxpart); +create index on idxpart1 (a); +create index on idxpart (a); +create table idxpart2 (like idxpart); +alter table idxpart attach partition idxpart1 for values from (0000) to (1000); +alter table idxpart attach partition idxpart2 for values from (1000) to (2000); +create table idxpart3 partition of idxpart for values from (2000) to (3000); +select relname, relkind from pg_class where relname like 'idxpart%' order by relname; + relname | relkind +----------------+--------- + idxpart | p + idxpart1 | r + idxpart1_a_idx | i + idxpart2 | r + idxpart2_a_idx | i + idxpart3 | r + idxpart3_a_idx | i + idxpart_a_idx | I +(8 rows) + +-- a) after detaching partitions, the indexes can be dropped independently +alter table idxpart detach partition idxpart1; +alter table idxpart detach partition idxpart2; +alter table idxpart detach partition idxpart3; +drop index idxpart1_a_idx; +drop index idxpart2_a_idx; +drop index idxpart3_a_idx; +select relname, relkind from pg_class where relname like 'idxpart%' order by relname; + relname | relkind +---------------+--------- + idxpart | p + idxpart1 | r + idxpart2 | r + idxpart3 | r + idxpart_a_idx | I +(5 rows) + +drop table idxpart, idxpart1, idxpart2, idxpart3; +select relname, relkind from pg_class where relname like 'idxpart%' order by relname; + relname | relkind +---------+--------- +(0 rows) + +create table idxpart (a int) partition by range (a); +create table idxpart1 (like idxpart); +create index on idxpart1 (a); +create index on idxpart (a); +create table idxpart2 (like idxpart); +alter table idxpart attach partition idxpart1 for values from (0000) to (1000); +alter table idxpart attach partition idxpart2 for values from (1000) to (2000); +create table idxpart3 partition of idxpart for values from (2000) to (3000); +-- b) 
after detaching, dropping the index on parent does not remove the others +select relname, relkind from pg_class where relname like 'idxpart%' order by relname; + relname | relkind +----------------+--------- + idxpart | p + idxpart1 | r + idxpart1_a_idx | i + idxpart2 | r + idxpart2_a_idx | i + idxpart3 | r + idxpart3_a_idx | i + idxpart_a_idx | I +(8 rows) + +alter table idxpart detach partition idxpart1; +alter table idxpart detach partition idxpart2; +alter table idxpart detach partition idxpart3; +drop index idxpart_a_idx; +select relname, relkind from pg_class where relname like 'idxpart%' order by relname; + relname | relkind +----------------+--------- + idxpart | p + idxpart1 | r + idxpart1_a_idx | i + idxpart2 | r + idxpart2_a_idx | i + idxpart3 | r + idxpart3_a_idx | i +(7 rows) + +drop table idxpart, idxpart1, idxpart2, idxpart3; +select relname, relkind from pg_class where relname like 'idxpart%' order by relname; + relname | relkind +---------+--------- +(0 rows) + +-- Verify that expression indexes inherit correctly +create table idxpart (a int, b int) partition by range (a); +create table idxpart1 (like idxpart); +create index on idxpart1 ((a + b)); +create index on idxpart ((a + b)); +create table idxpart2 (like idxpart); +alter table idxpart attach partition idxpart1 for values from (0000) to (1000); +alter table idxpart attach partition idxpart2 for values from (1000) to (2000); +create table idxpart3 partition of idxpart for values from (2000) to (3000); +select relname as child, inhparent::regclass as parent, pg_get_indexdef as childdef + from pg_class join pg_inherits on inhrelid = oid, + lateral pg_get_indexdef(pg_class.oid) + where relkind in ('i', 'I') and relname like 'idxpart%' order by relname; + child | parent | childdef +-------------------+------------------+-------------------------------------------------------------------- + idxpart1_expr_idx | idxpart_expr_idx | CREATE INDEX idxpart1_expr_idx ON idxpart1 USING btree (((a + b))) + idxpart2_expr_idx | idxpart_expr_idx | CREATE INDEX idxpart2_expr_idx ON idxpart2 USING btree (((a + b))) + idxpart3_expr_idx | idxpart_expr_idx | CREATE INDEX idxpart3_expr_idx ON idxpart3 USING btree (((a + b))) +(3 rows) + +drop table idxpart; +-- Verify behavior for collation (mis)matches +create table idxpart (a text) partition by range (a); +create table idxpart1 (like idxpart); +create table idxpart2 (like idxpart); +create index on idxpart2 (a collate "POSIX"); +create index on idxpart2 (a); +create index on idxpart2 (a collate "C"); +alter table idxpart attach partition idxpart1 for values from ('aaa') to ('bbb'); +alter table idxpart attach partition idxpart2 for values from ('bbb') to ('ccc'); +create table idxpart3 partition of idxpart for values from ('ccc') to ('ddd'); +create index on idxpart (a collate "C"); +create table idxpart4 partition of idxpart for values from ('ddd') to ('eee'); +select relname as child, inhparent::regclass as parent, pg_get_indexdef as childdef + from pg_class left join pg_inherits on inhrelid = oid, + lateral pg_get_indexdef(pg_class.oid) + where relkind in ('i', 'I') and relname like 'idxpart%' order by relname; + child | parent | childdef +-----------------+---------------+------------------------------------------------------------------------- + idxpart1_a_idx | idxpart_a_idx | CREATE INDEX idxpart1_a_idx ON idxpart1 USING btree (a COLLATE "C") + idxpart2_a_idx | | CREATE INDEX idxpart2_a_idx ON idxpart2 USING btree (a COLLATE "POSIX") + idxpart2_a_idx1 | | CREATE INDEX 
idxpart2_a_idx1 ON idxpart2 USING btree (a) + idxpart2_a_idx2 | idxpart_a_idx | CREATE INDEX idxpart2_a_idx2 ON idxpart2 USING btree (a COLLATE "C") + idxpart3_a_idx | idxpart_a_idx | CREATE INDEX idxpart3_a_idx ON idxpart3 USING btree (a COLLATE "C") + idxpart4_a_idx | idxpart_a_idx | CREATE INDEX idxpart4_a_idx ON idxpart4 USING btree (a COLLATE "C") + idxpart_a_idx | | CREATE INDEX idxpart_a_idx ON ONLY idxpart USING btree (a COLLATE "C") +(7 rows) + +drop table idxpart; +-- Verify behavior for opclass (mis)matches +create table idxpart (a text) partition by range (a); +create table idxpart1 (like idxpart); +create table idxpart2 (like idxpart); +create index on idxpart2 (a); +alter table idxpart attach partition idxpart1 for values from ('aaa') to ('bbb'); +alter table idxpart attach partition idxpart2 for values from ('bbb') to ('ccc'); +create table idxpart3 partition of idxpart for values from ('ccc') to ('ddd'); +create index on idxpart (a text_pattern_ops); +create table idxpart4 partition of idxpart for values from ('ddd') to ('eee'); +-- must *not* have attached the index we created on idxpart2 +select relname as child, inhparent::regclass as parent, pg_get_indexdef as childdef + from pg_class left join pg_inherits on inhrelid = oid, + lateral pg_get_indexdef(pg_class.oid) + where relkind in ('i', 'I') and relname like 'idxpart%' order by relname; + child | parent | childdef +-----------------+---------------+----------------------------------------------------------------------------- + idxpart1_a_idx | idxpart_a_idx | CREATE INDEX idxpart1_a_idx ON idxpart1 USING btree (a text_pattern_ops) + idxpart2_a_idx | | CREATE INDEX idxpart2_a_idx ON idxpart2 USING btree (a) + idxpart2_a_idx1 | idxpart_a_idx | CREATE INDEX idxpart2_a_idx1 ON idxpart2 USING btree (a text_pattern_ops) + idxpart3_a_idx | idxpart_a_idx | CREATE INDEX idxpart3_a_idx ON idxpart3 USING btree (a text_pattern_ops) + idxpart4_a_idx | idxpart_a_idx | CREATE INDEX idxpart4_a_idx ON idxpart4 USING btree (a text_pattern_ops) + idxpart_a_idx | | CREATE INDEX idxpart_a_idx ON ONLY idxpart USING btree (a text_pattern_ops) +(6 rows) + +drop index idxpart_a_idx; +create index on only idxpart (a text_pattern_ops); +-- must reject +alter index idxpart_a_idx attach partition idxpart2_a_idx; +ERROR: cannot attach index "idxpart2_a_idx" as a partition of index "idxpart_a_idx" +DETAIL: The index definitions do not match. +drop table idxpart; +-- Verify that attaching indexes maps attribute numbers correctly +create table idxpart (col1 int, a int, col2 int, b int) partition by range (a); +create table idxpart1 (b int, col1 int, col2 int, col3 int, a int); +alter table idxpart drop column col1, drop column col2; +alter table idxpart1 drop column col1, drop column col2, drop column col3; +alter table idxpart attach partition idxpart1 for values from (0) to (1000); +create index idxpart_1_idx on only idxpart (b, a); +create index idxpart1_1_idx on idxpart1 (b, a); +create index idxpart1_1b_idx on idxpart1 (b); +-- test expressions and partial-index predicate, too +create index idxpart_2_idx on only idxpart ((b + a)) where a > 1; +create index idxpart1_2_idx on idxpart1 ((b + a)) where a > 1; +create index idxpart1_2b_idx on idxpart1 ((a + b)) where a > 1; +create index idxpart1_2c_idx on idxpart1 ((b + a)) where b > 1; +alter index idxpart_1_idx attach partition idxpart1_1b_idx; -- fail +ERROR: cannot attach index "idxpart1_1b_idx" as a partition of index "idxpart_1_idx" +DETAIL: The index definitions do not match. 
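-- (Aside: the ATTACH above is rejected because idxpart1_1b_idx indexes (b) only,
-- while idxpart_1_idx is defined on (b, a); the column list, any expressions and
-- the predicate must all match once attribute numbers are mapped.  Illustrative
-- comparison, not part of the test output:)
select pg_get_indexdef('idxpart_1_idx'::regclass)   as parent_def,
       pg_get_indexdef('idxpart1_1b_idx'::regclass) as partition_def;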
+alter index idxpart_1_idx attach partition idxpart1_1_idx; +alter index idxpart_2_idx attach partition idxpart1_2b_idx; -- fail +ERROR: cannot attach index "idxpart1_2b_idx" as a partition of index "idxpart_2_idx" +DETAIL: The index definitions do not match. +alter index idxpart_2_idx attach partition idxpart1_2c_idx; -- fail +ERROR: cannot attach index "idxpart1_2c_idx" as a partition of index "idxpart_2_idx" +DETAIL: The index definitions do not match. +alter index idxpart_2_idx attach partition idxpart1_2_idx; -- ok +select relname as child, inhparent::regclass as parent, pg_get_indexdef as childdef + from pg_class left join pg_inherits on inhrelid = oid, + lateral pg_get_indexdef(pg_class.oid) + where relkind in ('i', 'I') and relname like 'idxpart%' order by relname; + child | parent | childdef +-----------------+---------------+---------------------------------------------------------------------------------- + idxpart1_1_idx | idxpart_1_idx | CREATE INDEX idxpart1_1_idx ON idxpart1 USING btree (b, a) + idxpart1_1b_idx | | CREATE INDEX idxpart1_1b_idx ON idxpart1 USING btree (b) + idxpart1_2_idx | idxpart_2_idx | CREATE INDEX idxpart1_2_idx ON idxpart1 USING btree (((b + a))) WHERE (a > 1) + idxpart1_2b_idx | | CREATE INDEX idxpart1_2b_idx ON idxpart1 USING btree (((a + b))) WHERE (a > 1) + idxpart1_2c_idx | | CREATE INDEX idxpart1_2c_idx ON idxpart1 USING btree (((b + a))) WHERE (b > 1) + idxpart_1_idx | | CREATE INDEX idxpart_1_idx ON ONLY idxpart USING btree (b, a) + idxpart_2_idx | | CREATE INDEX idxpart_2_idx ON ONLY idxpart USING btree (((b + a))) WHERE (a > 1) +(7 rows) + +drop table idxpart; +-- Make sure the partition columns are mapped correctly +create table idxpart (a int, b int, c text) partition by range (a); +create index idxparti on idxpart (a); +create index idxparti2 on idxpart (c, b); +create table idxpart1 (c text, a int, b int); +alter table idxpart attach partition idxpart1 for values from (0) to (10); +create table idxpart2 (c text, a int, b int); +create index on idxpart2 (a); +create index on idxpart2 (c, b); +alter table idxpart attach partition idxpart2 for values from (10) to (20); +select c.relname, pg_get_indexdef(indexrelid) + from pg_class c join pg_index i on c.oid = i.indexrelid + where indrelid::regclass::text like 'idxpart%' + order by indrelid::regclass::text collate "C"; + relname | pg_get_indexdef +------------------+-------------------------------------------------------------- + idxparti | CREATE INDEX idxparti ON ONLY idxpart USING btree (a) + idxparti2 | CREATE INDEX idxparti2 ON ONLY idxpart USING btree (c, b) + idxpart1_a_idx | CREATE INDEX idxpart1_a_idx ON idxpart1 USING btree (a) + idxpart1_c_b_idx | CREATE INDEX idxpart1_c_b_idx ON idxpart1 USING btree (c, b) + idxpart2_a_idx | CREATE INDEX idxpart2_a_idx ON idxpart2 USING btree (a) + idxpart2_c_b_idx | CREATE INDEX idxpart2_c_b_idx ON idxpart2 USING btree (c, b) +(6 rows) + +drop table idxpart; +-- Verify that columns are mapped correctly in expression indexes +create table idxpart (col1 int, col2 int, a int, b int) partition by range (a); +create table idxpart1 (col2 int, b int, col1 int, a int); +create table idxpart2 (col1 int, col2 int, b int, a int); +alter table idxpart drop column col1, drop column col2; +alter table idxpart1 drop column col1, drop column col2; +alter table idxpart2 drop column col1, drop column col2; +create index on idxpart2 (abs(b)); +alter table idxpart attach partition idxpart2 for values from (0) to (1); +create index on idxpart (abs(b)); +alter table 
idxpart attach partition idxpart1 for values from (1) to (2); +select c.relname, pg_get_indexdef(indexrelid) + from pg_class c join pg_index i on c.oid = i.indexrelid + where indrelid::regclass::text like 'idxpart%' + order by indrelid::regclass::text collate "C"; + relname | pg_get_indexdef +------------------+------------------------------------------------------------------- + idxpart_abs_idx | CREATE INDEX idxpart_abs_idx ON ONLY idxpart USING btree (abs(b)) + idxpart1_abs_idx | CREATE INDEX idxpart1_abs_idx ON idxpart1 USING btree (abs(b)) + idxpart2_abs_idx | CREATE INDEX idxpart2_abs_idx ON idxpart2 USING btree (abs(b)) +(3 rows) + +drop table idxpart; +-- Verify that columns are mapped correctly for WHERE in a partial index +create table idxpart (col1 int, a int, col3 int, b int) partition by range (a); +alter table idxpart drop column col1, drop column col3; +create table idxpart1 (col1 int, col2 int, col3 int, col4 int, b int, a int); +alter table idxpart1 drop column col1, drop column col2, drop column col3, drop column col4; +alter table idxpart attach partition idxpart1 for values from (0) to (1000); +create table idxpart2 (col1 int, col2 int, b int, a int); +create index on idxpart2 (a) where b > 1000; +alter table idxpart2 drop column col1, drop column col2; +alter table idxpart attach partition idxpart2 for values from (1000) to (2000); +create index on idxpart (a) where b > 1000; +select c.relname, pg_get_indexdef(indexrelid) + from pg_class c join pg_index i on c.oid = i.indexrelid + where indrelid::regclass::text like 'idxpart%' + order by indrelid::regclass::text collate "C"; + relname | pg_get_indexdef +----------------+----------------------------------------------------------------------------- + idxpart_a_idx | CREATE INDEX idxpart_a_idx ON ONLY idxpart USING btree (a) WHERE (b > 1000) + idxpart1_a_idx | CREATE INDEX idxpart1_a_idx ON idxpart1 USING btree (a) WHERE (b > 1000) + idxpart2_a_idx | CREATE INDEX idxpart2_a_idx ON idxpart2 USING btree (a) WHERE (b > 1000) +(3 rows) + +drop table idxpart; +-- Column number mapping: dropped columns in the partition +create table idxpart1 (drop_1 int, drop_2 int, col_keep int, drop_3 int); +alter table idxpart1 drop column drop_1; +alter table idxpart1 drop column drop_2; +alter table idxpart1 drop column drop_3; +create index on idxpart1 (col_keep); +create table idxpart (col_keep int) partition by range (col_keep); +create index on idxpart (col_keep); +alter table idxpart attach partition idxpart1 for values from (0) to (1000); +\d idxpart + Table "public.idxpart" + Column | Type | Collation | Nullable | Default +----------+---------+-----------+----------+--------- + col_keep | integer | | | +Partition key: RANGE (col_keep) +Indexes: + "idxpart_col_keep_idx" btree (col_keep) +Number of partitions: 1 (Use \d+ to list them.) + +\d idxpart1 + Table "public.idxpart1" + Column | Type | Collation | Nullable | Default +----------+---------+-----------+----------+--------- + col_keep | integer | | | +Partition of: idxpart FOR VALUES FROM (0) TO (1000) +Indexes: + "idxpart1_col_keep_idx" btree (col_keep) + +select attrelid::regclass, attname, attnum from pg_attribute + where attrelid::regclass::text like 'idxpart%' and attnum > 0 + order by attrelid::regclass, attnum; + attrelid | attname | attnum +-----------------------+------------------------------+-------- + idxpart1 | ........pg.dropped.1........ | 1 + idxpart1 | ........pg.dropped.2........ | 2 + idxpart1 | col_keep | 3 + idxpart1 | ........pg.dropped.4........ 
| 4 + idxpart1_col_keep_idx | col_keep | 1 + idxpart | col_keep | 1 + idxpart_col_keep_idx | col_keep | 1 +(7 rows) + +drop table idxpart; +-- Column number mapping: dropped columns in the parent table +create table idxpart(drop_1 int, drop_2 int, col_keep int, drop_3 int) partition by range (col_keep); +alter table idxpart drop column drop_1; +alter table idxpart drop column drop_2; +alter table idxpart drop column drop_3; +create table idxpart1 (col_keep int); +create index on idxpart1 (col_keep); +create index on idxpart (col_keep); +alter table idxpart attach partition idxpart1 for values from (0) to (1000); +\d idxpart + Table "public.idxpart" + Column | Type | Collation | Nullable | Default +----------+---------+-----------+----------+--------- + col_keep | integer | | | +Partition key: RANGE (col_keep) +Indexes: + "idxpart_col_keep_idx" btree (col_keep) +Number of partitions: 1 (Use \d+ to list them.) + +\d idxpart1 + Table "public.idxpart1" + Column | Type | Collation | Nullable | Default +----------+---------+-----------+----------+--------- + col_keep | integer | | | +Partition of: idxpart FOR VALUES FROM (0) TO (1000) +Indexes: + "idxpart1_col_keep_idx" btree (col_keep) + +select attrelid::regclass, attname, attnum from pg_attribute + where attrelid::regclass::text like 'idxpart%' and attnum > 0 + order by attrelid::regclass, attnum; + attrelid | attname | attnum +-----------------------+------------------------------+-------- + idxpart | ........pg.dropped.1........ | 1 + idxpart | ........pg.dropped.2........ | 2 + idxpart | col_keep | 3 + idxpart | ........pg.dropped.4........ | 4 + idxpart1 | col_keep | 1 + idxpart1_col_keep_idx | col_keep | 1 + idxpart_col_keep_idx | col_keep | 1 +(7 rows) + +drop table idxpart; +-- intentionally leave some objects around +create table idxpart (a int) partition by range (a); +create table idxpart1 partition of idxpart for values from (0) to (100); +create table idxpart2 partition of idxpart for values from (100) to (1000) + partition by range (a); +create table idxpart21 partition of idxpart2 for values from (100) to (200); +create table idxpart22 partition of idxpart2 for values from (200) to (300); +create index on idxpart22 (a); +create index on only idxpart2 (a); +alter index idxpart2_a_idx attach partition idxpart22_a_idx; +create index on idxpart (a); diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule index 95fafcd7..58485cf1 100644 --- a/src/test/regress/parallel_schedule +++ b/src/test/regress/parallel_schedule @@ -134,7 +134,7 @@ test: plancache limit plpgsql copy2 temp domain prepare without_oid conversion t # ---------- # Another group of parallel tests # ---------- -test: identity partition_join partition_prune partition_prune_hash hash_part partition_info +test: identity partition_join partition_prune partition_prune_hash hash_part partition_info indexing # event triggers cannot run concurrently with any test that runs DDL test: event_trigger diff --git a/src/test/regress/serial_schedule b/src/test/regress/serial_schedule index f91b37b9..3e5da44f 100644 --- a/src/test/regress/serial_schedule +++ b/src/test/regress/serial_schedule @@ -196,6 +196,7 @@ test: partition_prune test: partition_prune_hash test: partition_info test: hash_part +test: indexing test: event_trigger test: fast_default test: stats diff --git a/src/test/regress/sql/alter_table.sql b/src/test/regress/sql/alter_table.sql index 7b5f2409..b53af1c0 100644 --- a/src/test/regress/sql/alter_table.sql +++ 
b/src/test/regress/sql/alter_table.sql @@ -1328,6 +1328,22 @@ create table tab1 (a int, b text); create table tab2 (x int, y tab1); alter table tab1 alter column b type varchar; -- fails +-- Alter column type that's part of a partitioned index +create table at_partitioned (a int, b text) partition by range (a); +create table at_part_1 partition of at_partitioned for values from (0) to (1000); +insert into at_partitioned values (512, '0.123'); +create table at_part_2 (b text, a int); +insert into at_part_2 values ('1.234', 1024); +create index on at_partitioned (b); +create index on at_partitioned (a); +\d at_part_1 +\d at_part_2 +alter table at_partitioned attach partition at_part_2 for values from (1000) to (2000); +\d at_part_2 +alter table at_partitioned alter column b type numeric using b::numeric; +\d at_part_1 +\d at_part_2 + -- disallow recursive containment of row types create temp table recur1 (f1 int); alter table recur1 add column f2 recur1; -- fails diff --git a/src/test/regress/sql/indexing.sql b/src/test/regress/sql/indexing.sql new file mode 100644 index 00000000..33be7186 --- /dev/null +++ b/src/test/regress/sql/indexing.sql @@ -0,0 +1,388 @@ +-- Creating an index on a partitioned table makes the partitions +-- automatically get the index +create table idxpart (a int, b int, c text) partition by range (a); +create table idxpart1 partition of idxpart for values from (0) to (10); +create table idxpart2 partition of idxpart for values from (10) to (100) + partition by range (b); +create table idxpart21 partition of idxpart2 for values from (0) to (100); +create index on idxpart (a); +select relname, relkind, inhparent::regclass + from pg_class left join pg_index ix on (indexrelid = oid) + left join pg_inherits on (ix.indexrelid = inhrelid) + where relname like 'idxpart%' order by relname; +drop table idxpart; + +-- Some unsupported features +create table idxpart (a int, b int, c text) partition by range (a); +create table idxpart1 partition of idxpart for values from (0) to (10); +create unique index on idxpart (a); +create index concurrently on idxpart (a); +drop table idxpart; + +-- If a table without index is attached as partition to a table with +-- an index, the index is automatically created +create table idxpart (a int, b int, c text) partition by range (a); +create index idxparti on idxpart (a); +create index idxparti2 on idxpart (b, c); +create table idxpart1 (like idxpart); +\d idxpart1 +alter table idxpart attach partition idxpart1 for values from (0) to (10); +\d idxpart1 +drop table idxpart; + +-- If a partition already has an index, don't create a duplicative one +create table idxpart (a int, b int) partition by range (a, b); +create table idxpart1 partition of idxpart for values from (0, 0) to (10, 10); +create index on idxpart1 (a, b); +create index on idxpart (a, b); +\d idxpart1 +select relname, relkind, inhparent::regclass + from pg_class left join pg_index ix on (indexrelid = oid) + left join pg_inherits on (ix.indexrelid = inhrelid) + where relname like 'idxpart%' order by relname; +drop table idxpart; + +-- DROP behavior for partitioned indexes +create table idxpart (a int) partition by range (a); +create index on idxpart (a); +create table idxpart1 partition of idxpart for values from (0) to (10); +drop index idxpart1_a_idx; -- no way +drop index idxpart_a_idx; -- both indexes go away +select relname, relkind from pg_class + where relname like 'idxpart%' order by relname; +create index on idxpart (a); +drop table idxpart1; -- the index on partition goes 
away too +select relname, relkind from pg_class + where relname like 'idxpart%' order by relname; +drop table idxpart; + +-- ALTER INDEX .. ATTACH, error cases +create table idxpart (a int, b int) partition by range (a, b); +create table idxpart1 partition of idxpart for values from (0, 0) to (10, 10); +create index idxpart_a_b_idx on only idxpart (a, b); +create index idxpart1_a_b_idx on idxpart1 (a, b); +create index idxpart1_tst1 on idxpart1 (b, a); +create index idxpart1_tst2 on idxpart1 using hash (a); +create index idxpart1_tst3 on idxpart1 (a, b) where a > 10; + +alter index idxpart attach partition idxpart1; +alter index idxpart_a_b_idx attach partition idxpart1; +alter index idxpart_a_b_idx attach partition idxpart_a_b_idx; +alter index idxpart_a_b_idx attach partition idxpart1_b_idx; +alter index idxpart_a_b_idx attach partition idxpart1_tst1; +alter index idxpart_a_b_idx attach partition idxpart1_tst2; +alter index idxpart_a_b_idx attach partition idxpart1_tst3; +-- OK +alter index idxpart_a_b_idx attach partition idxpart1_a_b_idx; +alter index idxpart_a_b_idx attach partition idxpart1_a_b_idx; -- quiet + +-- reject dupe +create index idxpart1_2_a_b on idxpart1 (a, b); +alter index idxpart_a_b_idx attach partition idxpart1_2_a_b; +drop table idxpart; +-- make sure everything's gone +select indexrelid::regclass, indrelid::regclass + from pg_index where indexrelid::regclass::text like 'idxpart%'; + +-- Don't auto-attach incompatible indexes +create table idxpart (a int, b int) partition by range (a); +create table idxpart1 (a int, b int); +create index on idxpart1 using hash (a); +create index on idxpart1 (a) where b > 1; +create index on idxpart1 ((a + 0)); +create index on idxpart1 (a, a); +create index on idxpart (a); +alter table idxpart attach partition idxpart1 for values from (0) to (1000); +\d idxpart1 +drop table idxpart; + +-- If CREATE INDEX ONLY, don't create indexes on partitions; and existing +-- indexes on partitions don't change parent. ALTER INDEX ATTACH can change +-- the parent after the fact. +create table idxpart (a int) partition by range (a); +create table idxpart1 partition of idxpart for values from (0) to (100); +create table idxpart2 partition of idxpart for values from (100) to (1000) + partition by range (a); +create table idxpart21 partition of idxpart2 for values from (100) to (200); +create table idxpart22 partition of idxpart2 for values from (200) to (300); +create index on idxpart22 (a); +create index on only idxpart2 (a); +create index on idxpart (a); +-- Here we expect that idxpart1 and idxpart2 have a new index, but idxpart21 +-- does not; also, idxpart22 is not attached. +\d idxpart1 +\d idxpart2 +\d idxpart21 +select indexrelid::regclass, indrelid::regclass, inhparent::regclass + from pg_index idx left join pg_inherits inh on (idx.indexrelid = inh.inhrelid) +where indexrelid::regclass::text like 'idxpart%' + order by indrelid::regclass::text collate "C"; +alter index idxpart2_a_idx attach partition idxpart22_a_idx; +select indexrelid::regclass, indrelid::regclass, inhparent::regclass + from pg_index idx left join pg_inherits inh on (idx.indexrelid = inh.inhrelid) +where indexrelid::regclass::text like 'idxpart%' + order by indrelid::regclass::text collate "C"; +-- attaching idxpart22 is not enough to set idxpart22_a_idx valid ... +alter index idxpart2_a_idx attach partition idxpart22_a_idx; +\d idxpart2 +-- ... but this one is. 
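-- (Aside: creating the missing index on idxpart21 below and attaching it gives the
-- parent index idxpart2_a_idx an attached partition index for every partition, so
-- it is finally marked valid.)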
+create index on idxpart21 (a); +alter index idxpart2_a_idx attach partition idxpart21_a_idx; +\d idxpart2 +drop table idxpart; + +-- When a table is attached a partition and it already has an index, a +-- duplicate index should not get created, but rather the index becomes +-- attached to the parent's index. +create table idxpart (a int, b int, c text) partition by range (a); +create index idxparti on idxpart (a); +create index idxparti2 on idxpart (b, c); +create table idxpart1 (like idxpart including indexes); +\d idxpart1 +select relname, relkind, inhparent::regclass + from pg_class left join pg_index ix on (indexrelid = oid) + left join pg_inherits on (ix.indexrelid = inhrelid) + where relname like 'idxpart%' order by relname; +alter table idxpart attach partition idxpart1 for values from (0) to (10); +\d idxpart1 +select relname, relkind, inhparent::regclass + from pg_class left join pg_index ix on (indexrelid = oid) + left join pg_inherits on (ix.indexrelid = inhrelid) + where relname like 'idxpart%' order by relname; +drop table idxpart; + +-- Verify that attaching an invalid index does not mark the parent index valid. +-- On the other hand, attaching a valid index marks not only its direct +-- ancestor valid, but also any indirect ancestor that was only missing the one +-- that was just made valid +create table idxpart (a int, b int) partition by range (a); +create table idxpart1 partition of idxpart for values from (1) to (1000) partition by range (a); +create table idxpart11 partition of idxpart1 for values from (1) to (100); +create index on only idxpart1 (a); +create index on only idxpart (a); +-- this results in two invalid indexes: +select relname, indisvalid from pg_class join pg_index on indexrelid = oid + where relname like 'idxpart%' order by relname; +-- idxpart1_a_idx is not valid, so idxpart_a_idx should not become valid: +alter index idxpart_a_idx attach partition idxpart1_a_idx; +select relname, indisvalid from pg_class join pg_index on indexrelid = oid + where relname like 'idxpart%' order by relname; +-- after creating and attaching this, both idxpart1_a_idx and idxpart_a_idx +-- should become valid +create index on idxpart11 (a); +alter index idxpart1_a_idx attach partition idxpart11_a_idx; +select relname, indisvalid from pg_class join pg_index on indexrelid = oid + where relname like 'idxpart%' order by relname; +drop table idxpart; + +-- verify dependency handling during ALTER TABLE DETACH PARTITION +create table idxpart (a int) partition by range (a); +create table idxpart1 (like idxpart); +create index on idxpart1 (a); +create index on idxpart (a); +create table idxpart2 (like idxpart); +alter table idxpart attach partition idxpart1 for values from (0000) to (1000); +alter table idxpart attach partition idxpart2 for values from (1000) to (2000); +create table idxpart3 partition of idxpart for values from (2000) to (3000); +select relname, relkind from pg_class where relname like 'idxpart%' order by relname; +-- a) after detaching partitions, the indexes can be dropped independently +alter table idxpart detach partition idxpart1; +alter table idxpart detach partition idxpart2; +alter table idxpart detach partition idxpart3; +drop index idxpart1_a_idx; +drop index idxpart2_a_idx; +drop index idxpart3_a_idx; +select relname, relkind from pg_class where relname like 'idxpart%' order by relname; +drop table idxpart, idxpart1, idxpart2, idxpart3; +select relname, relkind from pg_class where relname like 'idxpart%' order by relname; + +create table idxpart (a int) 
partition by range (a); +create table idxpart1 (like idxpart); +create index on idxpart1 (a); +create index on idxpart (a); +create table idxpart2 (like idxpart); +alter table idxpart attach partition idxpart1 for values from (0000) to (1000); +alter table idxpart attach partition idxpart2 for values from (1000) to (2000); +create table idxpart3 partition of idxpart for values from (2000) to (3000); +-- b) after detaching, dropping the index on parent does not remove the others +select relname, relkind from pg_class where relname like 'idxpart%' order by relname; +alter table idxpart detach partition idxpart1; +alter table idxpart detach partition idxpart2; +alter table idxpart detach partition idxpart3; +drop index idxpart_a_idx; +select relname, relkind from pg_class where relname like 'idxpart%' order by relname; +drop table idxpart, idxpart1, idxpart2, idxpart3; +select relname, relkind from pg_class where relname like 'idxpart%' order by relname; + +-- Verify that expression indexes inherit correctly +create table idxpart (a int, b int) partition by range (a); +create table idxpart1 (like idxpart); +create index on idxpart1 ((a + b)); +create index on idxpart ((a + b)); +create table idxpart2 (like idxpart); +alter table idxpart attach partition idxpart1 for values from (0000) to (1000); +alter table idxpart attach partition idxpart2 for values from (1000) to (2000); +create table idxpart3 partition of idxpart for values from (2000) to (3000); +select relname as child, inhparent::regclass as parent, pg_get_indexdef as childdef + from pg_class join pg_inherits on inhrelid = oid, + lateral pg_get_indexdef(pg_class.oid) + where relkind in ('i', 'I') and relname like 'idxpart%' order by relname; +drop table idxpart; + +-- Verify behavior for collation (mis)matches +create table idxpart (a text) partition by range (a); +create table idxpart1 (like idxpart); +create table idxpart2 (like idxpart); +create index on idxpart2 (a collate "POSIX"); +create index on idxpart2 (a); +create index on idxpart2 (a collate "C"); +alter table idxpart attach partition idxpart1 for values from ('aaa') to ('bbb'); +alter table idxpart attach partition idxpart2 for values from ('bbb') to ('ccc'); +create table idxpart3 partition of idxpart for values from ('ccc') to ('ddd'); +create index on idxpart (a collate "C"); +create table idxpart4 partition of idxpart for values from ('ddd') to ('eee'); +select relname as child, inhparent::regclass as parent, pg_get_indexdef as childdef + from pg_class left join pg_inherits on inhrelid = oid, + lateral pg_get_indexdef(pg_class.oid) + where relkind in ('i', 'I') and relname like 'idxpart%' order by relname; +drop table idxpart; + +-- Verify behavior for opclass (mis)matches +create table idxpart (a text) partition by range (a); +create table idxpart1 (like idxpart); +create table idxpart2 (like idxpart); +create index on idxpart2 (a); +alter table idxpart attach partition idxpart1 for values from ('aaa') to ('bbb'); +alter table idxpart attach partition idxpart2 for values from ('bbb') to ('ccc'); +create table idxpart3 partition of idxpart for values from ('ccc') to ('ddd'); +create index on idxpart (a text_pattern_ops); +create table idxpart4 partition of idxpart for values from ('ddd') to ('eee'); +-- must *not* have attached the index we created on idxpart2 +select relname as child, inhparent::regclass as parent, pg_get_indexdef as childdef + from pg_class left join pg_inherits on inhrelid = oid, + lateral pg_get_indexdef(pg_class.oid) + where relkind in ('i', 'I') 
and relname like 'idxpart%' order by relname; +drop index idxpart_a_idx; +create index on only idxpart (a text_pattern_ops); +-- must reject +alter index idxpart_a_idx attach partition idxpart2_a_idx; +drop table idxpart; + +-- Verify that attaching indexes maps attribute numbers correctly +create table idxpart (col1 int, a int, col2 int, b int) partition by range (a); +create table idxpart1 (b int, col1 int, col2 int, col3 int, a int); +alter table idxpart drop column col1, drop column col2; +alter table idxpart1 drop column col1, drop column col2, drop column col3; +alter table idxpart attach partition idxpart1 for values from (0) to (1000); +create index idxpart_1_idx on only idxpart (b, a); +create index idxpart1_1_idx on idxpart1 (b, a); +create index idxpart1_1b_idx on idxpart1 (b); +-- test expressions and partial-index predicate, too +create index idxpart_2_idx on only idxpart ((b + a)) where a > 1; +create index idxpart1_2_idx on idxpart1 ((b + a)) where a > 1; +create index idxpart1_2b_idx on idxpart1 ((a + b)) where a > 1; +create index idxpart1_2c_idx on idxpart1 ((b + a)) where b > 1; +alter index idxpart_1_idx attach partition idxpart1_1b_idx; -- fail +alter index idxpart_1_idx attach partition idxpart1_1_idx; +alter index idxpart_2_idx attach partition idxpart1_2b_idx; -- fail +alter index idxpart_2_idx attach partition idxpart1_2c_idx; -- fail +alter index idxpart_2_idx attach partition idxpart1_2_idx; -- ok +select relname as child, inhparent::regclass as parent, pg_get_indexdef as childdef + from pg_class left join pg_inherits on inhrelid = oid, + lateral pg_get_indexdef(pg_class.oid) + where relkind in ('i', 'I') and relname like 'idxpart%' order by relname; +drop table idxpart; + +-- Make sure the partition columns are mapped correctly +create table idxpart (a int, b int, c text) partition by range (a); +create index idxparti on idxpart (a); +create index idxparti2 on idxpart (c, b); +create table idxpart1 (c text, a int, b int); +alter table idxpart attach partition idxpart1 for values from (0) to (10); +create table idxpart2 (c text, a int, b int); +create index on idxpart2 (a); +create index on idxpart2 (c, b); +alter table idxpart attach partition idxpart2 for values from (10) to (20); +select c.relname, pg_get_indexdef(indexrelid) + from pg_class c join pg_index i on c.oid = i.indexrelid + where indrelid::regclass::text like 'idxpart%' + order by indrelid::regclass::text collate "C"; +drop table idxpart; + +-- Verify that columns are mapped correctly in expression indexes +create table idxpart (col1 int, col2 int, a int, b int) partition by range (a); +create table idxpart1 (col2 int, b int, col1 int, a int); +create table idxpart2 (col1 int, col2 int, b int, a int); +alter table idxpart drop column col1, drop column col2; +alter table idxpart1 drop column col1, drop column col2; +alter table idxpart2 drop column col1, drop column col2; +create index on idxpart2 (abs(b)); +alter table idxpart attach partition idxpart2 for values from (0) to (1); +create index on idxpart (abs(b)); +alter table idxpart attach partition idxpart1 for values from (1) to (2); +select c.relname, pg_get_indexdef(indexrelid) + from pg_class c join pg_index i on c.oid = i.indexrelid + where indrelid::regclass::text like 'idxpart%' + order by indrelid::regclass::text collate "C"; +drop table idxpart; + +-- Verify that columns are mapped correctly for WHERE in a partial index +create table idxpart (col1 int, a int, col3 int, b int) partition by range (a); +alter table idxpart drop column col1, 
drop column col3; +create table idxpart1 (col1 int, col2 int, col3 int, col4 int, b int, a int); +alter table idxpart1 drop column col1, drop column col2, drop column col3, drop column col4; +alter table idxpart attach partition idxpart1 for values from (0) to (1000); +create table idxpart2 (col1 int, col2 int, b int, a int); +create index on idxpart2 (a) where b > 1000; +alter table idxpart2 drop column col1, drop column col2; +alter table idxpart attach partition idxpart2 for values from (1000) to (2000); +create index on idxpart (a) where b > 1000; +select c.relname, pg_get_indexdef(indexrelid) + from pg_class c join pg_index i on c.oid = i.indexrelid + where indrelid::regclass::text like 'idxpart%' + order by indrelid::regclass::text collate "C"; +drop table idxpart; + +-- Column number mapping: dropped columns in the partition +create table idxpart1 (drop_1 int, drop_2 int, col_keep int, drop_3 int); +alter table idxpart1 drop column drop_1; +alter table idxpart1 drop column drop_2; +alter table idxpart1 drop column drop_3; +create index on idxpart1 (col_keep); +create table idxpart (col_keep int) partition by range (col_keep); +create index on idxpart (col_keep); +alter table idxpart attach partition idxpart1 for values from (0) to (1000); +\d idxpart +\d idxpart1 +select attrelid::regclass, attname, attnum from pg_attribute + where attrelid::regclass::text like 'idxpart%' and attnum > 0 + order by attrelid::regclass, attnum; +drop table idxpart; + +-- Column number mapping: dropped columns in the parent table +create table idxpart(drop_1 int, drop_2 int, col_keep int, drop_3 int) partition by range (col_keep); +alter table idxpart drop column drop_1; +alter table idxpart drop column drop_2; +alter table idxpart drop column drop_3; +create table idxpart1 (col_keep int); +create index on idxpart1 (col_keep); +create index on idxpart (col_keep); +alter table idxpart attach partition idxpart1 for values from (0) to (1000); +\d idxpart +\d idxpart1 +select attrelid::regclass, attname, attnum from pg_attribute + where attrelid::regclass::text like 'idxpart%' and attnum > 0 + order by attrelid::regclass, attnum; +drop table idxpart; + +-- intentionally leave some objects around +create table idxpart (a int) partition by range (a); +create table idxpart1 partition of idxpart for values from (0) to (100); +create table idxpart2 partition of idxpart for values from (100) to (1000) + partition by range (a); +create table idxpart21 partition of idxpart2 for values from (100) to (200); +create table idxpart22 partition of idxpart2 for values from (200) to (300); +create index on idxpart22 (a); +create index on only idxpart2 (a); +alter index idxpart2_a_idx attach partition idxpart22_a_idx; +create index on idxpart (a); From 2d2f13f0a28a10c9186765837ff0e53ae3941fd3 Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Tue, 22 Aug 2017 15:36:49 -0700 Subject: [PATCH 294/578] Add a hash_combine function for mixing hash values. This hash function is derived from Boost's function of the same name. 
Author: Andres Freund, Thomas Munro Discussion: https://postgr.es/m/CAEepm%3D3rdgjfxW4cKvJ0OEmya2-34B0qHNG1xV0vK7TGPJGMUQ%40mail.gmail.com Discussion: https://postgr.es/m/20170731210844.3cwrkmsmbbpt4rjc%40alap3.anarazel.de --- src/include/utils/hashutils.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 src/include/utils/hashutils.h diff --git a/src/include/utils/hashutils.h b/src/include/utils/hashutils.h new file mode 100644 index 00000000..56b7bfc9 --- /dev/null +++ b/src/include/utils/hashutils.h @@ -0,0 +1,23 @@ +/* + * Utilities for working with hash values. + * + * Portions Copyright (c) 2017, PostgreSQL Global Development Group + */ + +#ifndef HASHUTILS_H +#define HASHUTILS_H + +/* + * Combine two hash values, resulting in another hash value, with decent bit + * mixing. + * + * Similar to boost's hash_combine(). + */ +static inline uint32 +hash_combine(uint32 a, uint32 b) +{ + a ^= b + 0x9e3779b9 + (a << 6) + (a >> 2); + return a; +} + +#endif /* HASHUTILS_H */ From fbed5652ec7a2d4189f265659fb494374b22bfd4 Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Fri, 22 Sep 2017 13:38:42 -0700 Subject: [PATCH 295/578] Add inline murmurhash32(uint32) function. The function already existed in tidbitmap.c but more users requiring fast hashing of 32bit ints are coming up. Author: Andres Freund Discussion: https://postgr.es/m/20170914061207.zxotvyopetm7lrrp@alap3.anarazel.de --- src/backend/nodes/tidbitmap.c | 20 ++------------------ src/include/utils/hashutils.h | 18 ++++++++++++++++++ 2 files changed, 20 insertions(+), 18 deletions(-) diff --git a/src/backend/nodes/tidbitmap.c b/src/backend/nodes/tidbitmap.c index 5be82536..73820707 100644 --- a/src/backend/nodes/tidbitmap.c +++ b/src/backend/nodes/tidbitmap.c @@ -45,6 +45,7 @@ #include "nodes/tidbitmap.h" #include "storage/lwlock.h" #include "utils/dsa.h" +#include "utils/hashutils.h" /* * The maximum number of tuples per page is not large (typically 256 with @@ -237,30 +238,13 @@ static int tbm_comparator(const void *left, const void *right); static int tbm_shared_comparator(const void *left, const void *right, void *arg); -/* - * Simple inline murmur hash implementation for the exact width required, for - * performance. - */ -static inline uint32 -hash_blockno(BlockNumber b) -{ - uint32 h = b; - - h ^= h >> 16; - h *= 0x85ebca6b; - h ^= h >> 13; - h *= 0xc2b2ae35; - h ^= h >> 16; - return h; -} - /* define hashtable mapping block numbers to PagetableEntry's */ #define SH_USE_NONDEFAULT_ALLOCATOR #define SH_PREFIX pagetable #define SH_ELEMENT_TYPE PagetableEntry #define SH_KEY_TYPE BlockNumber #define SH_KEY blockno -#define SH_HASH_KEY(tb, key) hash_blockno(key) +#define SH_HASH_KEY(tb, key) murmurhash32(key) #define SH_EQUAL(tb, a, b) a == b #define SH_SCOPE static inline #define SH_DEFINE diff --git a/src/include/utils/hashutils.h b/src/include/utils/hashutils.h index 56b7bfc9..35281689 100644 --- a/src/include/utils/hashutils.h +++ b/src/include/utils/hashutils.h @@ -20,4 +20,22 @@ hash_combine(uint32 a, uint32 b) return a; } + +/* + * Simple inline murmur hash implementation hashing a 32 bit ingeger, for + * performance. + */ +static inline uint32 +murmurhash32(uint32 data) +{ + uint32 h = data; + + h ^= h >> 16; + h *= 0x85ebca6b; + h ^= h >> 13; + h *= 0xc2b2ae35; + h ^= h >> 16; + return h; +} + #endif /* HASHUTILS_H */ From 330f9f9673a3f9bac14bc4bf241fc0e0f3f69f14 Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Fri, 29 Sep 2017 15:52:55 -0700 Subject: [PATCH 296/578] Fix typo. 
Reported-By: Thomas Munro and Jesper Pedersen --- src/include/utils/hashutils.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/include/utils/hashutils.h b/src/include/utils/hashutils.h index 35281689..366bd0e7 100644 --- a/src/include/utils/hashutils.h +++ b/src/include/utils/hashutils.h @@ -22,7 +22,7 @@ hash_combine(uint32 a, uint32 b) /* - * Simple inline murmur hash implementation hashing a 32 bit ingeger, for + * Simple inline murmur hash implementation hashing a 32 bit integer, for * performance. */ static inline uint32 From a974c3771ccc22b6bf8387f09f41d92ff27ef12f Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Sun, 29 Oct 2017 12:41:43 +0530 Subject: [PATCH 297/578] Add hash_combine64. Extracted from a larger patch by Amul Sul, with some comment additions by me. Discussion: http://postgr.es/m/20171024113004.hn5qajypin4dy5sw@alap3.anarazel.de --- src/include/utils/hashutils.h | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/src/include/utils/hashutils.h b/src/include/utils/hashutils.h index 366bd0e7..3a5c21f5 100644 --- a/src/include/utils/hashutils.h +++ b/src/include/utils/hashutils.h @@ -8,8 +8,8 @@ #define HASHUTILS_H /* - * Combine two hash values, resulting in another hash value, with decent bit - * mixing. + * Combine two 32-bit hash values, resulting in another hash value, with + * decent bit mixing. * * Similar to boost's hash_combine(). */ @@ -20,6 +20,18 @@ hash_combine(uint32 a, uint32 b) return a; } +/* + * Combine two 64-bit hash values, resulting in another hash value, using the + * same kind of technique as hash_combine(). Testing shows that this also + * produces good bit mixing. + */ +static inline uint64 +hash_combine64(uint64 a, uint64 b) +{ + /* 0x49a0f4dd15e5a8e3 is 64bit random data */ + a ^= b + 0x49a0f4dd15e5a8e3 + (a << 54) + (a >> 7); + return a; +} /* * Simple inline murmur hash implementation hashing a 32 bit integer, for From bff015b6c1b4bd4fcc154514215024b9608bb9e5 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Tue, 7 Nov 2017 13:54:36 -0500 Subject: [PATCH 298/578] Fix unportable spelling of int64 constant. Per buildfarm member pademelon. 
--- src/include/utils/hashutils.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/include/utils/hashutils.h b/src/include/utils/hashutils.h index 3a5c21f5..0a2620be 100644 --- a/src/include/utils/hashutils.h +++ b/src/include/utils/hashutils.h @@ -29,7 +29,7 @@ static inline uint64 hash_combine64(uint64 a, uint64 b) { /* 0x49a0f4dd15e5a8e3 is 64bit random data */ - a ^= b + 0x49a0f4dd15e5a8e3 + (a << 54) + (a >> 7); + a ^= b + UINT64CONST(0x49a0f4dd15e5a8e3) + (a << 54) + (a >> 7); return a; } From 8d4128c29b72c99f855d7598f1e293b7de35eb8a Mon Sep 17 00:00:00 2001 From: ericxwu Date: Fri, 3 Jul 2020 19:30:35 +0800 Subject: [PATCH 299/578] update select_parallel expect result --- .../regress/expected/select_parallel_5.out | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/src/test/regress/expected/select_parallel_5.out b/src/test/regress/expected/select_parallel_5.out index 6b20689d..d5be6ecb 100644 --- a/src/test/regress/expected/select_parallel_5.out +++ b/src/test/regress/expected/select_parallel_5.out @@ -95,6 +95,30 @@ explain (costs off) -> Parallel Seq Scan on tenk1 (10 rows) +explain (costs off) + select count(stringu1) as num, (CASE WHEN length(stringu1) > 5 THEN 'LONG' ELSE 'SHORT' END) as islong + from tenk1 group by islong order by num; + QUERY PLAN +------------------------------------------------------------------------------------------------------------------------ + Sort + Sort Key: (count(stringu1)) + -> Finalize HashAggregate + Group Key: CASE WHEN (length((stringu1)::text) > 5) THEN 'LONG'::text ELSE 'SHORT'::text END + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Gather + Workers Planned: 4 + -> Partial HashAggregate + Group Key: CASE WHEN (length((stringu1)::text) > 5) THEN 'LONG'::text ELSE 'SHORT'::text END + -> Parallel Seq Scan on tenk1 +(10 rows) + +select count(stringu1) as num, (CASE WHEN length(stringu1) > 5 THEN 'LONG' ELSE 'SHORT' END) as islong + from tenk1 group by islong order by num; + num | islong +-------+-------- + 10000 | LONG +(1 row) + -- test that parallel plan for aggregates is not selected when -- target list contains parallel restricted clause. explain (costs off) From fbda1355519b5c6a5275fb419eefcd1aa3630b5a Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Fri, 3 Jul 2020 20:17:32 +0800 Subject: [PATCH 300/578] fix compile errors.20200703. http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/backend/commands/copy.c | 2 +- src/backend/commands/indexcmds.c | 1 + src/backend/commands/tablecmds.c | 7 +-- src/backend/executor/execMain.c | 2 +- src/backend/executor/execPartition.c | 24 +++------ src/backend/executor/nodeModifyTable.c | 9 ++-- src/backend/optimizer/path/allpaths.c | 35 ++++++++++++- src/backend/optimizer/path/joinrels.c | 2 +- src/backend/optimizer/prep/prepunion.c | 65 +++++++++++++++++++++++- src/backend/optimizer/util/placeholder.c | 2 +- src/backend/optimizer/util/relnode.c | 2 +- src/include/executor/execPartition.h | 2 +- src/include/optimizer/prep.h | 3 ++ 13 files changed, 122 insertions(+), 34 deletions(-) diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c index 533187a0..8bf02419 100644 --- a/src/backend/commands/copy.c +++ b/src/backend/commands/copy.c @@ -3541,7 +3541,7 @@ CopyFrom(CopyState cstate) * We might need to convert from the parent rowtype to the * partition rowtype. 
*/ - tuple = ConvertPartitionTupleSlot(proute->parent_child_tupconv_maps[leaf_part_index], + tuple = ConvertPartitionTupleSlot(resultRelInfo->ri_RelationDesc, proute->parent_child_tupconv_maps[leaf_part_index], tuple, proute->partition_tuple_slot, &slot); diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c index 22c2348e..be45e453 100644 --- a/src/backend/commands/indexcmds.c +++ b/src/backend/commands/indexcmds.c @@ -67,6 +67,7 @@ #include "utils/snapmgr.h" #include "utils/syscache.h" #include "utils/tqual.h" +#include "utils/guc.h" /* non-export function prototypes */ diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index 834f2840..430141a2 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -6647,9 +6647,9 @@ ATCheckPartitionsNotInUse(Relation rel, LOCKMODE lockmode) Relation childrel; /* find_all_inheritors already got lock */ - childrel = table_open(lfirst_oid(cell), NoLock); + childrel = heap_open(lfirst_oid(cell), NoLock); CheckTableNotInUse(childrel, "ALTER TABLE"); - table_close(childrel, NoLock); + heap_close(childrel, NoLock); } list_free(inh); } @@ -8489,7 +8489,7 @@ ATExecDropColumn(List **wqueue, Relation rel, const char *colName, /* Time to delete this child column, too */ ATExecDropColumn(wqueue, childrel, colName, behavior, true, true, - false, lockmode); + false, lockmode, addrs); } else { @@ -8658,6 +8658,7 @@ ATExecAddIndex(AlteredTableInfo *tab, Relation rel, addr = DefineIndex(partOid, /* OID of heap relation */ partidxstmt, InvalidOid, /* no predefined OID */ + InvalidOid, /* no parent index */ true, /* is_alter_table */ check_rights, /* check_rights */ false, /* check_not_in_use */ diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c index 4140f135..3bc95f7d 100644 --- a/src/backend/executor/execMain.c +++ b/src/backend/executor/execMain.c @@ -2287,7 +2287,7 @@ ExecPartitionCheckEmitError(ResultRelInfo *resultRelInfo, gettext_noop("could not convert row type")); if (map != NULL) { - tuple = do_convert_tuple(tuple, map); + tuple = do_convert_tuple(tuple, map, rel); ExecSetSlotDescriptor(slot, tupdesc); ExecStoreTuple(tuple, slot, InvalidBuffer, false); } diff --git a/src/backend/executor/execPartition.c b/src/backend/executor/execPartition.c index dd60cbc8..f5d69874 100644 --- a/src/backend/executor/execPartition.c +++ b/src/backend/executor/execPartition.c @@ -189,7 +189,7 @@ ExecSetupPartitionTupleRouting(ModifyTableState *mtstate, Relation rel) * UPDATE of a partition-key becomes a DELETE+INSERT operation, so * this check is required even when the operation is CMD_UPDATE. */ - CheckValidResultRel(leaf_part_rri, CMD_INSERT); + CheckValidResultRel(leaf_part_rri->ri_RelationDesc, CMD_INSERT); } proute->partitions[i] = leaf_part_rri; @@ -257,7 +257,7 @@ ExecFindPartition(ResultRelInfo *resultRelInfo, PartitionDispatch *pd, HeapTuple tuple = ExecFetchSlotTuple(slot); ExecClearTuple(myslot); - tuple = do_convert_tuple(tuple, map); + tuple = do_convert_tuple(tuple, map, NULL); ExecStoreTuple(tuple, myslot, InvalidBuffer, true); slot = myslot; } @@ -343,7 +343,7 @@ ExecInitPartitionInfo(ModifyTableState *mtstate, ModifyTable *node = mtstate ? 
(ModifyTable *) mtstate->ps.plan : NULL; MemoryContext oldContext; - partrel = table_open(dispatch->partdesc->oids[partidx], RowExclusiveLock); + partrel = heap_open(proute->partition_oids[partidx], RowExclusiveLock); /* * Keep ResultRelInfo and other information for this partition in the @@ -363,19 +363,7 @@ ExecInitPartitionInfo(ModifyTableState *mtstate, * partition-key becomes a DELETE+INSERT operation, so this check is still * required when the operation is CMD_UPDATE. */ - CheckValidResultRel(leaf_part_rri, CMD_INSERT); - - /* - * Since we've just initialized this ResultRelInfo, it's not in any list - * attached to the estate as yet. Add it, so that it can be found later. - * - * Note that the entries in this list appear in no predetermined order, - * because partition result rels are initialized as and when they're - * needed. - */ - estate->es_tuple_routing_result_relations = - lappend(estate->es_tuple_routing_result_relations, - leaf_part_rri); + CheckValidResultRel(leaf_part_rri->ri_RelationDesc, CMD_INSERT); /* * Open partition indices. The user may have asked to check for conflicts @@ -589,7 +577,7 @@ TupConvMapForLeaf(PartitionTupleRouting *proute, * tuple is returned unmodified. */ HeapTuple -ConvertPartitionTupleSlot(TupleConversionMap *map, +ConvertPartitionTupleSlot(Relation partrel, TupleConversionMap *map, HeapTuple tuple, TupleTableSlot *new_slot, TupleTableSlot **p_my_slot) @@ -597,7 +585,7 @@ ConvertPartitionTupleSlot(TupleConversionMap *map, if (!map) return tuple; - tuple = do_convert_tuple(tuple, map); + tuple = do_convert_tuple(tuple, map, partrel); /* * Change the partition tuple slot descriptor, as per converted tuple. diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c index 34a53370..3d6b9769 100644 --- a/src/backend/executor/nodeModifyTable.c +++ b/src/backend/executor/nodeModifyTable.c @@ -286,6 +286,7 @@ ExecInsert(ModifyTableState *mtstate, {// #lizard forgives HeapTuple tuple; ResultRelInfo *resultRelInfo; + ResultRelInfo *saved_resultRelInfo = NULL; Relation resultRelationDesc; Oid newId; List *recheckIndexes = NIL; @@ -329,7 +330,7 @@ ExecInsert(ModifyTableState *mtstate, #ifdef __TBASE__ /* Determine the interval partition to heap_insert the tuple into */ - else if (resultRelInfo->ispartparent) + if (resultRelInfo->ispartparent) { AttrNumber partkey; Datum partvalue; @@ -1386,7 +1387,7 @@ lreplace:; * Row movement, part 1. Delete the tuple, but skip RETURNING * processing. We want to return rows from INSERT. */ - ExecDelete(mtstate, tupleid, oldtuple, planSlot, epqstate, estate, + ExecDelete(mtstate, tupleid, oldtuple, slot, planSlot, epqstate, estate, &tuple_deleted, false, false); /* @@ -1433,7 +1434,7 @@ lreplace:; map_index = resultRelInfo - mtstate->resultRelInfo; Assert(map_index >= 0 && map_index < mtstate->mt_nplans); tupconv_map = tupconv_map_for_subplan(mtstate, map_index); - tuple = ConvertPartitionTupleSlot(tupconv_map, + tuple = ConvertPartitionTupleSlot(resultRelInfo->ri_RelationDesc, tupconv_map, tuple, proute->root_tuple_slot, &slot); @@ -2040,7 +2041,7 @@ ExecPrepareTupleRouting(ModifyTableState *mtstate, /* * Convert the tuple, if necessary. 
*/ - ConvertPartitionTupleSlot(proute->parent_child_tupconv_maps[partidx], + ConvertPartitionTupleSlot(partrel->ri_RelationDesc, proute->parent_child_tupconv_maps[partidx], tuple, proute->partition_tuple_slot, &slot); diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c index 59663d81..73159be1 100644 --- a/src/backend/optimizer/path/allpaths.c +++ b/src/backend/optimizer/path/allpaths.c @@ -1031,7 +1031,7 @@ set_append_rel_size(PlannerInfo *root, RelOptInfo *rel, childrel->reltarget->exprs = (List *) adjust_appendrel_attrs(root, (Node *) rel->reltarget->exprs, - 1, &appinfo); + &appinfo); /* * We have to make child entries in the EquivalenceClass data @@ -2169,10 +2169,41 @@ set_subquery_pathlist(PlannerInfo *root, RelOptInfo *rel, subpath->pathkeys, make_tlist_from_pathtarget(subpath->pathtarget)); + if (subpath->distribution && subpath->distribution->distributionExpr) + { + ListCell *lc; + + /* FIXME Could we use pathtarget directly? */ + List *targetlist = make_tlist_from_pathtarget(subpath->pathtarget); + + /* + * The distribution expression from the subplan's tlist, but it should + * be from the rel, need conversion. + */ + distribution = makeNode(Distribution); + distribution->distributionType = subpath->distribution->distributionType; + distribution->nodes = bms_copy(subpath->distribution->nodes); + distribution->restrictNodes = bms_copy(subpath->distribution->restrictNodes); + + foreach(lc, targetlist) + { + TargetEntry *tle = (TargetEntry *) lfirst(lc); + if (equal(tle->expr, subpath->distribution->distributionExpr)) + { + distribution->distributionExpr = (Node *) + makeVarFromTargetEntry(rel->relid, tle); + break; + } + } + } + else + distribution = subpath->distribution; + /* Generate outer path using this subpath */ add_partial_path(rel, (Path *) create_subqueryscan_path(root, rel, subpath, - pathkeys, required_outer)); + pathkeys, required_outer, + distribution)); } } diff --git a/src/backend/optimizer/path/joinrels.c b/src/backend/optimizer/path/joinrels.c index d8afa3ef..715036b9 100644 --- a/src/backend/optimizer/path/joinrels.c +++ b/src/backend/optimizer/path/joinrels.c @@ -1405,7 +1405,7 @@ try_partition_wise_join(PlannerInfo *root, RelOptInfo *rel1, RelOptInfo *rel2, * applicable to the parent join. */ child_restrictlist = - (List *) adjust_appendrel_attrs(root, + (List *) adjust_appendrel_attrs_nappinfos(root, (Node *) parent_restrictlist, nappinfos, appinfos); pfree(appinfos); diff --git a/src/backend/optimizer/prep/prepunion.c b/src/backend/optimizer/prep/prepunion.c index a9c117f1..ea16dfee 100644 --- a/src/backend/optimizer/prep/prepunion.c +++ b/src/backend/optimizer/prep/prepunion.c @@ -2037,6 +2037,69 @@ adjust_appendrel_attrs(PlannerInfo *root, Node *node, AppendRelInfo *appinfo) return result; } +/* + * adjust_appendrel_attrs + * Copy the specified query or expression and translate Vars referring to a + * parent rel to refer to the corresponding child rel instead. We also + * update rtindexes appearing outside Vars, such as resultRelation and + * jointree relids. + * + * Note: this is only applied after conversion of sublinks to subplans, + * so we don't need to cope with recursion into sub-queries. + * + * Note: this is not hugely different from what pullup_replace_vars() does; + * maybe we should try to fold the two routines together. 
+ */ +Node * +adjust_appendrel_attrs_nappinfos(PlannerInfo *root, Node *node, int nappinfos, + AppendRelInfo **appinfos) +{ + Node *result; + adjust_appendrel_attrs_context context; + + context.root = root; + context.nappinfos = nappinfos; + context.appinfos = appinfos; + + /* If there's nothing to adjust, don't call this function. */ + Assert(nappinfos >= 1 && appinfos != NULL); + + /* + * Must be prepared to start with a Query or a bare expression tree. + */ + if (node && IsA(node, Query)) + { + Query *newnode; + int cnt; + + newnode = query_tree_mutator((Query *) node, + adjust_appendrel_attrs_mutator, + (void *) &context, + QTW_IGNORE_RC_SUBQUERIES); + for (cnt = 0; cnt < nappinfos; cnt++) + { + AppendRelInfo *appinfo = appinfos[cnt]; + + if (newnode->resultRelation == appinfo->parent_relid) + { + newnode->resultRelation = appinfo->child_relid; + /* Fix tlist resnos too, if it's inherited UPDATE */ + if (newnode->commandType == CMD_UPDATE) + newnode->targetList = + adjust_inherited_tlist(newnode->targetList, + appinfo); + break; + } + } + + result = (Node *) newnode; + } + else + result = adjust_appendrel_attrs_mutator(node, &context); + + return result; +} + static Node * adjust_appendrel_attrs_mutator(Node *node, adjust_appendrel_attrs_context *context) @@ -2467,7 +2530,7 @@ build_child_join_sjinfo(PlannerInfo *root, SpecialJoinInfo *parent_sjinfo, sjinfo->syn_righthand = adjust_child_relids(sjinfo->syn_righthand, right_nappinfos, right_appinfos); - sjinfo->semi_rhs_exprs = (List *) adjust_appendrel_attrs(root, + sjinfo->semi_rhs_exprs = (List *) adjust_appendrel_attrs_nappinfos(root, (Node *) sjinfo->semi_rhs_exprs, right_nappinfos, right_appinfos); diff --git a/src/backend/optimizer/util/placeholder.c b/src/backend/optimizer/util/placeholder.c index 0d5351a6..a344dbe8 100644 --- a/src/backend/optimizer/util/placeholder.c +++ b/src/backend/optimizer/util/placeholder.c @@ -499,7 +499,7 @@ add_placeholders_to_child_joinrel(PlannerInfo *root, RelOptInfo *childrel, if (bms_overlap(phv->phrels, parentrel->relids) && childrel->reloptkind == RELOPT_OTHER_JOINREL) { - phv = (PlaceHolderVar *) adjust_appendrel_attrs(root, + phv = (PlaceHolderVar *) adjust_appendrel_attrs_nappinfos(root, (Node *) phv, nappinfos, appinfos); diff --git a/src/backend/optimizer/util/relnode.c b/src/backend/optimizer/util/relnode.c index 1f6fb286..39100cae 100644 --- a/src/backend/optimizer/util/relnode.c +++ b/src/backend/optimizer/util/relnode.c @@ -819,7 +819,7 @@ build_child_join_rel(PlannerInfo *root, RelOptInfo *outer_rel, /* Construct joininfo list. 
*/ appinfos = find_appinfos_by_relids(root, joinrel->relids, &nappinfos); - joinrel->joininfo = (List *) adjust_appendrel_attrs(root, + joinrel->joininfo = (List *) adjust_appendrel_attrs_nappinfos(root, (Node *) parent_joinrel->joininfo, nappinfos, appinfos); diff --git a/src/include/executor/execPartition.h b/src/include/executor/execPartition.h index d4d1be1d..0cd7b1b5 100644 --- a/src/include/executor/execPartition.h +++ b/src/include/executor/execPartition.h @@ -93,7 +93,7 @@ extern ResultRelInfo *ExecInitPartitionInfo(ModifyTableState *mtstate, extern void ExecSetupChildParentMapForLeaf(PartitionTupleRouting *proute); extern TupleConversionMap *TupConvMapForLeaf(PartitionTupleRouting *proute, ResultRelInfo *rootRelInfo, int leaf_index); -extern HeapTuple ConvertPartitionTupleSlot(TupleConversionMap *map, +extern HeapTuple ConvertPartitionTupleSlot(Relation partrel, TupleConversionMap *map, HeapTuple tuple, TupleTableSlot *new_slot, TupleTableSlot **p_my_slot); diff --git a/src/include/optimizer/prep.h b/src/include/optimizer/prep.h index e51066ed..f560052d 100644 --- a/src/include/optimizer/prep.h +++ b/src/include/optimizer/prep.h @@ -55,6 +55,9 @@ extern void expand_inherited_tables(PlannerInfo *root); extern Node *adjust_appendrel_attrs(PlannerInfo *root, Node *node, AppendRelInfo *appinfo); +extern Node *adjust_appendrel_attrs_nappinfos(PlannerInfo *root, Node *node, int nappinfos, + AppendRelInfo **appinfos); + extern Node *adjust_appendrel_attrs_multilevel(PlannerInfo *root, Node *node, RelOptInfo *child_rel); From cafa1ab65a4455970dd1c2f5fb6d09f4c261c9b4 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Tue, 7 Jul 2020 21:35:37 +0800 Subject: [PATCH 301/578] fix compile errors.20200707. http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/backend/catalog/partition.c | 85 +++++++++++++++++++++---- src/backend/optimizer/plan/createplan.c | 2 +- src/backend/optimizer/plan/planner.c | 38 ++++++----- src/backend/optimizer/plan/subselect.c | 19 +++++- src/backend/optimizer/prep/prepunion.c | 74 +++++++++++++++++++++ src/backend/optimizer/util/clauses.c | 7 ++ src/backend/optimizer/util/pathnode.c | 3 +- src/backend/tcop/utility.c | 1 + src/backend/utils/adt/partitionfuncs.c | 1 + src/include/catalog/partition.h | 1 + src/include/optimizer/prep.h | 3 + 11 files changed, 200 insertions(+), 34 deletions(-) diff --git a/src/backend/catalog/partition.c b/src/backend/catalog/partition.c index 5c85918f..74d045dc 100644 --- a/src/backend/catalog/partition.c +++ b/src/backend/catalog/partition.c @@ -56,7 +56,9 @@ #include "utils/ruleutils.h" #include "utils/syscache.h" - +static Oid get_partition_parent_worker(Relation inhRel, Oid relid); +static void get_partition_ancestors_worker(Relation inhRel, Oid relid, + List **ancestors); static int32 qsort_partition_hbound_cmp(const void *a, const void *b); static int32 qsort_partition_list_value_cmp(const void *a, const void *b, void *arg); @@ -1335,15 +1337,34 @@ check_default_allows_bound(Relation parent, Relation default_rel, Oid get_partition_parent(Oid relid) { - Form_pg_inherits form; Relation catalogRelation; - SysScanDesc scan; - ScanKeyData key[2]; - HeapTuple tuple; Oid result; catalogRelation = heap_open(InheritsRelationId, AccessShareLock); + result = get_partition_parent_worker(catalogRelation, relid); + + if (!OidIsValid(result)) + elog(ERROR, "could not find tuple for parent of relation %u", relid); + + heap_close(catalogRelation, AccessShareLock); + + return result; +} + +/* + * 
get_partition_parent_worker + * Scan the pg_inherits relation to return the OID of the parent of the + * given relation + */ +static Oid +get_partition_parent_worker(Relation inhRel, Oid relid) +{ + SysScanDesc scan; + ScanKeyData key[2]; + Oid result = InvalidOid; + HeapTuple tuple; + ScanKeyInit(&key[0], Anum_pg_inherits_inhrelid, BTEqualStrategyNumber, F_OIDEQ, @@ -1353,22 +1374,64 @@ get_partition_parent(Oid relid) BTEqualStrategyNumber, F_INT4EQ, Int32GetDatum(1)); - scan = systable_beginscan(catalogRelation, InheritsRelidSeqnoIndexId, true, + scan = systable_beginscan(inhRel, InheritsRelidSeqnoIndexId, true, NULL, 2, key); - tuple = systable_getnext(scan); - if (!HeapTupleIsValid(tuple)) - elog(ERROR, "could not find tuple for parent of relation %u", relid); + if (HeapTupleIsValid(tuple)) + { + Form_pg_inherits form = (Form_pg_inherits) GETSTRUCT(tuple); - form = (Form_pg_inherits) GETSTRUCT(tuple); result = form->inhparent; + } systable_endscan(scan); - heap_close(catalogRelation, AccessShareLock); return result; } +/* + * get_partition_ancestors + * Obtain ancestors of given relation + * + * Returns a list of ancestors of the given relation. + * + * Note: Because this function assumes that the relation whose OID is passed + * as an argument and each ancestor will have precisely one parent, it should + * only be called when it is known that the relation is a partition. + */ +List * +get_partition_ancestors(Oid relid) +{ + List *result = NIL; + Relation inhRel; + + inhRel = heap_open(InheritsRelationId, AccessShareLock); + + get_partition_ancestors_worker(inhRel, relid, &result); + + heap_close(inhRel, AccessShareLock); + + return result; +} + +/* + * get_partition_ancestors_worker + * recursive worker for get_partition_ancestors + */ +static void +get_partition_ancestors_worker(Relation inhRel, Oid relid, List **ancestors) +{ + Oid parentOid; + + /* Recursion ends at the topmost level, ie., when there's no parent */ + parentOid = get_partition_parent_worker(inhRel, relid); + if (parentOid == InvalidOid) + return; + + *ancestors = lappend_oid(*ancestors, parentOid); + get_partition_ancestors_worker(inhRel, parentOid, ancestors); +} + /* * get_qual_from_partbound * Given a parser node for partition bound, return the list of executable diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index bf38bafc..706b3340 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -6728,7 +6728,7 @@ make_remotesubplan(PlannerInfo *root, /* need sort */ if (distributionType == LOCATOR_TYPE_NONE && pathkeys && need_sort) { - subplan = (Plan *)make_sort_from_pathkeys(subplan, pathkeys); + subplan = (Plan *)make_sort_from_pathkeys(subplan, pathkeys, NULL); subplan->startup_cost = gather_plan->plan.startup_cost; subplan->total_cost = gather_plan->plan.total_cost; diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index 3e3065cd..2b736fd4 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -223,7 +223,7 @@ static void add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel, const AggClauseCosts *agg_costs, const AggClauseCosts *agg_final_costs, grouping_sets_data *gd, bool can_sort, bool can_hash, - double dNumGroups, List *havingQual); + double dNumGroups, List *havingQual, bool *try_distributed_aggregation); static void add_partial_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel, RelOptInfo *grouped_rel, @@ 
-4118,12 +4118,9 @@ create_degenerate_grouping_paths(PlannerInfo *root, RelOptInfo *input_rel, path = (Path *) create_append_path(grouped_rel, paths, - NIL, NULL, 0, - false, - NIL, - -1); + NIL); path->pathtarget = target; } else @@ -4157,13 +4154,17 @@ create_ordinary_grouping_paths(PlannerInfo *root, RelOptInfo *input_rel, Path *cheapest_path = input_rel->cheapest_total_path; AggClauseCosts agg_partial_costs; /* parallel only */ AggClauseCosts agg_final_costs; /* parallel only */ + Size hashaggtablesize; double dNumGroups; + double dNumPartialGroups = 0; bool can_hash; bool can_sort; bool try_parallel_aggregation; bool try_distributed_aggregation; PathTarget *partial_grouping_target = NULL; + ListCell *lc; + /* * Estimate number of groups. */ @@ -4347,7 +4348,7 @@ create_ordinary_grouping_paths(PlannerInfo *root, RelOptInfo *input_rel, add_paths_to_grouping_rel(root, input_rel, grouped_rel, target, partial_grouping_target, agg_costs, &agg_final_costs, gd, can_sort, can_hash, - dNumGroups, (List *) parse->havingQual); + dNumGroups, (List *) parse->havingQual, &try_distributed_aggregation); /* Generate XL aggregate paths, with distributed 2-phase aggregation. */ @@ -4381,7 +4382,8 @@ create_ordinary_grouping_paths(PlannerInfo *root, RelOptInfo *input_rel, /* Estimate number of partial groups. */ dNumPartialGroups = get_number_of_groups(root, cheapest_path->rows, - gd); + gd, + parse->targetList); /* * Collect statistics about aggregates for estimating costs of @@ -6840,7 +6842,7 @@ add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel, const AggClauseCosts *agg_costs, const AggClauseCosts *agg_final_costs, grouping_sets_data *gd, bool can_sort, bool can_hash, - double dNumGroups, List *havingQual) + double dNumGroups, List *havingQual, bool *try_distributed_aggregation) { Query *parse = root->parse; Path *cheapest_path = input_rel->cheapest_total_path; @@ -6872,12 +6874,13 @@ add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel, { #ifdef __TBASE__ bool try_redistribute_grouping = false; - PathTarget * local_grouping_target = make_partial_grouping_target(root, target); + PathTarget * local_grouping_target = make_partial_grouping_target(root, target, (Node *) parse->havingQual); /* Estimate number of partial groups. */ double dNumLocalGroups = get_number_of_groups(root, cheapest_path->rows, - gd); + gd, + parse->targetList); #endif #ifdef __TBASE__ @@ -6971,7 +6974,7 @@ add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel, AGGSPLIT_INITIAL_SERIAL, parse->groupClause, NIL, - &agg_partial_costs, + agg_costs, dNumLocalGroups); } else if (parse->groupClause) @@ -7003,7 +7006,7 @@ add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel, #endif else - try_distributed_aggregation = false; + *try_distributed_aggregation = false; #ifdef __TBASE__ if(try_redistribute_grouping) @@ -7335,7 +7338,7 @@ add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel, #endif else - try_distributed_aggregation = false; + *try_distributed_aggregation = false; #ifdef __TBASE__ /* @@ -7624,12 +7627,13 @@ add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel, * final grouping */ AggClauseCosts hashagg_partial_costs; - PathTarget * local_grouping_target = make_partial_grouping_target(root, target); + PathTarget * local_grouping_target = make_partial_grouping_target(root, target, (Node *) parse->havingQual); /* Estimate number of partial groups. 
*/ double dNumLocalGroups = get_number_of_groups(root, cheapest_path->rows, - gd); + gd, + parse->targetList); try_redistribute_grouping = true; MemSet(&hashagg_partial_costs, 0, sizeof(AggClauseCosts)); @@ -7667,7 +7671,7 @@ add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel, path = create_remotesubplan_path(root, path, NULL); #endif else - try_distributed_aggregation = false; + *try_distributed_aggregation = false; /* * We just need an Agg over the cheapest-total input path, @@ -7825,7 +7829,7 @@ add_paths_to_grouping_rel(PlannerInfo *root, RelOptInfo *input_rel, path = create_remotesubplan_path(root, path, NULL); #endif else - try_distributed_aggregation = false; + *try_distributed_aggregation = false; #ifdef __TBASE__ if (!redistribute_group) diff --git a/src/backend/optimizer/plan/subselect.c b/src/backend/optimizer/plan/subselect.c index bcdbe3da..3aa0c9f4 100644 --- a/src/backend/optimizer/plan/subselect.c +++ b/src/backend/optimizer/plan/subselect.c @@ -172,6 +172,7 @@ static Node *process_sublinks_mutator(Node *node, process_sublinks_context *context); static Bitmapset *finalize_plan(PlannerInfo *root, Plan *plan, + int gather_param, Bitmapset *valid_params, Bitmapset *scan_params); static bool finalize_primnode(Node *node, finalize_primnode_context *context); @@ -4974,12 +4975,15 @@ void SS_finalize_plan(PlannerInfo *root, Plan *plan) { /* No setup needed, just recurse through plan tree. */ - (void) finalize_plan(root, plan, root->outer_params, NULL); + (void) finalize_plan(root, plan, -1, root->outer_params, NULL); } /* * Recursive processing of all nodes in the plan tree * + * gather_param is the rescan_param of an ancestral Gather/GatherMerge, + * or -1 if there is none. + * * valid_params is the set of param IDs supplied by outer plan levels * that are valid to reference in this plan node or its children. * @@ -5006,7 +5010,7 @@ SS_finalize_plan(PlannerInfo *root, Plan *plan) * can be handled more cleanly. 
*/ static Bitmapset * -finalize_plan(PlannerInfo *root, Plan *plan, Bitmapset *valid_params, +finalize_plan(PlannerInfo *root, Plan *plan, int gather_param, Bitmapset *valid_params, Bitmapset *scan_params) {// #lizard forgives finalize_primnode_context context; @@ -5137,7 +5141,7 @@ finalize_plan(PlannerInfo *root, Plan *plan, Bitmapset *valid_params, context.paramids = bms_add_members(context.paramids, scan_params); break; - case T_SubqueryScan: + { SubqueryScan *sscan = (SubqueryScan *) plan; RelOptInfo *rel; @@ -5287,6 +5291,7 @@ finalize_plan(PlannerInfo *root, Plan *plan, Bitmapset *valid_params, bms_add_members(context.paramids, finalize_plan(root, (Plan *) lfirst(lc), + gather_param, valid_params, scan_params)); } @@ -5317,6 +5322,7 @@ finalize_plan(PlannerInfo *root, Plan *plan, Bitmapset *valid_params, bms_add_members(context.paramids, finalize_plan(root, (Plan *) lfirst(l), + gather_param, valid_params, scan_params)); } @@ -5344,6 +5350,7 @@ finalize_plan(PlannerInfo *root, Plan *plan, Bitmapset *valid_params, bms_add_members(context.paramids, finalize_plan(root, (Plan *) lfirst(l), + gather_param, valid_params, scan_params)); } @@ -5360,6 +5367,7 @@ finalize_plan(PlannerInfo *root, Plan *plan, Bitmapset *valid_params, bms_add_members(context.paramids, finalize_plan(root, (Plan *) lfirst(l), + gather_param, valid_params, scan_params)); } @@ -5376,6 +5384,7 @@ finalize_plan(PlannerInfo *root, Plan *plan, Bitmapset *valid_params, bms_add_members(context.paramids, finalize_plan(root, (Plan *) lfirst(l), + gather_param, valid_params, scan_params)); } @@ -5392,6 +5401,7 @@ finalize_plan(PlannerInfo *root, Plan *plan, Bitmapset *valid_params, bms_add_members(context.paramids, finalize_plan(root, (Plan *) lfirst(l), + gather_param, valid_params, scan_params)); } @@ -5503,6 +5513,7 @@ finalize_plan(PlannerInfo *root, Plan *plan, Bitmapset *valid_params, /* Process left and right child plans, if any */ child_params = finalize_plan(root, plan->lefttree, + gather_param, valid_params, scan_params); context.paramids = bms_add_members(context.paramids, child_params); @@ -5512,6 +5523,7 @@ finalize_plan(PlannerInfo *root, Plan *plan, Bitmapset *valid_params, /* right child can reference nestloop_params as well as valid_params */ child_params = finalize_plan(root, plan->righttree, + gather_param, bms_union(nestloop_params, valid_params), scan_params); /* ... 
and they don't count as parameters used at my level */ @@ -5523,6 +5535,7 @@ finalize_plan(PlannerInfo *root, Plan *plan, Bitmapset *valid_params, /* easy case */ child_params = finalize_plan(root, plan->righttree, + gather_param, valid_params, scan_params); } diff --git a/src/backend/optimizer/prep/prepunion.c b/src/backend/optimizer/prep/prepunion.c index ea16dfee..62bc5200 100644 --- a/src/backend/optimizer/prep/prepunion.c +++ b/src/backend/optimizer/prep/prepunion.c @@ -59,6 +59,8 @@ typedef struct { PlannerInfo *root; AppendRelInfo *appinfo; + AppendRelInfo **appinfos; + int nappinfos; } adjust_appendrel_attrs_context; static Path *recurse_set_operations(Node *setOp, PlannerInfo *root, @@ -121,6 +123,8 @@ static Bitmapset *translate_col_privs(const Bitmapset *parent_privs, List *translated_vars); static Node *adjust_appendrel_attrs_mutator(Node *node, adjust_appendrel_attrs_context *context); +static Relids adjust_child_relids(Relids relids, int nappinfos, + AppendRelInfo **appinfos); static Relids adjust_relid_set(Relids relids, Index oldrelid, Index newrelid); static List *adjust_inherited_tlist(List *tlist, AppendRelInfo *context); @@ -2309,6 +2313,40 @@ adjust_appendrel_attrs_mutator(Node *node, (void *) context); } +/* + * Substitute child relids for parent relids in a Relid set. The array of + * appinfos specifies the substitutions to be performed. + */ +static Relids +adjust_child_relids(Relids relids, int nappinfos, AppendRelInfo **appinfos) +{ + Bitmapset *result = NULL; + int cnt; + + for (cnt = 0; cnt < nappinfos; cnt++) + { + AppendRelInfo *appinfo = appinfos[cnt]; + + /* Remove parent, add child */ + if (bms_is_member(appinfo->parent_relid, relids)) + { + /* Make a copy if we are changing the set. */ + if (!result) + result = bms_copy(relids); + + result = bms_del_member(result, appinfo->parent_relid); + result = bms_add_member(result, appinfo->child_relid); + } + } + + /* If we made any changes, return the modified copy. */ + if (result) + return result; + + /* Otherwise, return the original set without modification. */ + return relids; +} + /* * Substitute newrelid for oldrelid in a Relid set */ @@ -2541,3 +2579,39 @@ build_child_join_sjinfo(PlannerInfo *root, SpecialJoinInfo *parent_sjinfo, return sjinfo; } +/* + * find_appinfos_by_relids + * Find AppendRelInfo structures for all relations specified by relids. + * + * The AppendRelInfos are returned in an array, which can be pfree'd by the + * caller. *nappinfos is set to the number of entries in the array. + */ +AppendRelInfo ** +find_appinfos_by_relids(PlannerInfo *root, Relids relids, int *nappinfos) +{ + ListCell *lc; + AppendRelInfo **appinfos; + int cnt = 0; + + *nappinfos = bms_num_members(relids); + appinfos = (AppendRelInfo **) palloc(sizeof(AppendRelInfo *) * *nappinfos); + + foreach(lc, root->append_rel_list) + { + AppendRelInfo *appinfo = lfirst(lc); + + if (bms_is_member(appinfo->child_relid, relids)) + { + appinfos[cnt] = appinfo; + cnt++; + + /* Stop when we have gathered all the AppendRelInfos. */ + if (cnt == *nappinfos) + return appinfos; + } + } + + /* Should have found the entries ... 
*/ + elog(ERROR, "did not find all requested child rels in append_rel_list"); + return NULL; /* not reached */ +} diff --git a/src/backend/optimizer/util/clauses.c b/src/backend/optimizer/util/clauses.c index 697b7dcc..ef96602f 100644 --- a/src/backend/optimizer/util/clauses.c +++ b/src/backend/optimizer/util/clauses.c @@ -2510,6 +2510,13 @@ estimate_expression_value(PlannerInfo *root, Node *node) return eval_const_expressions_mutator(node, &context); } +/* Generic macro for applying evaluate_expr */ +#define ece_evaluate_expr(node) \ + ((Node *) evaluate_expr((Expr *) (node), \ + exprType((Node *) (node)), \ + exprTypmod((Node *) (node)), \ + exprCollation((Node *) (node)))) + static Node * eval_const_expressions_mutator(Node *node, eval_const_expressions_context *context) diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index 4d2a1f32..2d4a5d2b 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -6967,8 +6967,7 @@ reparameterize_path_by_child(PlannerInfo *root, Path *path, #define ADJUST_CHILD_ATTRS(node) \ ((node) = \ (List *) adjust_appendrel_attrs_multilevel(root, (Node *) (node), \ - child_rel->relids, \ - child_rel->top_parent_relids)) + child_rel)) #define REPARAMETERIZE_CHILD_PATH(path) \ do { \ diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c index a6536b13..1ef4a799 100644 --- a/src/backend/tcop/utility.c +++ b/src/backend/tcop/utility.c @@ -3817,6 +3817,7 @@ ProcessUtilitySlow(ParseState *pstate, addr = DefineIndex(partOid, /* OID of heap relation */ partidxstmt, InvalidOid, /* no predefined OID */ + InvalidOid, false, /* is_alter_table */ true, /* check_rights */ true, /* check_not_in_use */ diff --git a/src/backend/utils/adt/partitionfuncs.c b/src/backend/utils/adt/partitionfuncs.c index 87f1cced..1e77f172 100644 --- a/src/backend/utils/adt/partitionfuncs.c +++ b/src/backend/utils/adt/partitionfuncs.c @@ -19,6 +19,7 @@ #include "catalog/partition.h" #include "catalog/pg_class.h" #include "catalog/pg_inherits.h" +#include "catalog/pg_inherits_fn.h" #include "catalog/pg_type.h" #include "funcapi.h" #include "utils/fmgrprotos.h" diff --git a/src/include/catalog/partition.h b/src/include/catalog/partition.h index 6cade9aa..4265fd50 100644 --- a/src/include/catalog/partition.h +++ b/src/include/catalog/partition.h @@ -52,6 +52,7 @@ extern PartitionBoundInfo partition_bounds_copy(PartitionBoundInfo src, extern void check_new_partition_bound(char *relname, Relation parent, PartitionBoundSpec *spec); extern Oid get_partition_parent(Oid relid); +extern List *get_partition_ancestors(Oid relid); extern List *get_qual_from_partbound(Relation rel, Relation parent, PartitionBoundSpec *spec); extern List *map_partition_varattnos(List *expr, int fromrel_varno, diff --git a/src/include/optimizer/prep.h b/src/include/optimizer/prep.h index f560052d..99c87a2d 100644 --- a/src/include/optimizer/prep.h +++ b/src/include/optimizer/prep.h @@ -61,6 +61,9 @@ extern Node *adjust_appendrel_attrs_nappinfos(PlannerInfo *root, Node *node, int extern Node *adjust_appendrel_attrs_multilevel(PlannerInfo *root, Node *node, RelOptInfo *child_rel); +extern AppendRelInfo **find_appinfos_by_relids(PlannerInfo *root, + Relids relids, int *nappinfos); + extern SpecialJoinInfo *build_child_join_sjinfo(PlannerInfo *root, SpecialJoinInfo *parent_sjinfo, Relids left_relids, Relids right_relids); From 369c85820770ca15d2a36a777e0302ffc3608a29 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Tue, 7 Jul 2020 
21:47:49 +0800 Subject: [PATCH 302/578] Fix assorted bugs in pg_get_partition_constraintdef(). http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/backend/catalog/partition.c | 43 ++++++++++++++++---------- src/backend/utils/cache/lsyscache.c | 24 ++++++++++++++ src/include/utils/lsyscache.h | 1 + src/test/regress/expected/indexing.out | 19 ++++++++++++ src/test/regress/sql/indexing.sql | 2 ++ 5 files changed, 73 insertions(+), 16 deletions(-) diff --git a/src/backend/catalog/partition.c b/src/backend/catalog/partition.c index 74d045dc..1b46a503 100644 --- a/src/backend/catalog/partition.c +++ b/src/backend/catalog/partition.c @@ -1535,31 +1535,38 @@ RelationGetPartitionQual(Relation rel) * get_partition_qual_relid * * Returns an expression tree describing the passed-in relation's partition - * constraint. If there is no partition constraint returns NULL; this can - * happen if the default partition is the only partition. + * constraint. + * + * If the relation is not found, or is not a partition, or there is no + * partition constraint, return NULL. We must guard against the first two + * cases because this supports a SQL function that could be passed any OID. + * The last case can happen even if relispartition is true, when a default + * partition is the only partition. */ Expr * get_partition_qual_relid(Oid relid) { - Relation rel = heap_open(relid, AccessShareLock); Expr *result = NULL; - List *and_args; - /* Do the work only if this relation is a partition. */ - if (rel->rd_rel->relispartition) + /* Do the work only if this relation exists and is a partition. */ + if (get_rel_relispartition(relid)) { + Relation rel = relation_open(relid, AccessShareLock); + List *and_args; + and_args = generate_partition_qual(rel); + /* Convert implicit-AND list format to boolean expression */ if (and_args == NIL) result = NULL; else if (list_length(and_args) > 1) result = makeBoolExpr(AND_EXPR, and_args, -1); else result = linitial(and_args); - } - /* Keep the lock. */ - heap_close(rel, NoLock); + /* Keep the lock, to allow safe deparsing against the rel by caller. 
*/ + relation_close(rel, NoLock); + } return result; } @@ -2455,7 +2462,6 @@ generate_partition_qual(Relation rel) MemoryContext oldcxt; Datum boundDatum; bool isnull; - PartitionBoundSpec *bound; List *my_qual = NIL, *result = NIL; Relation parent; @@ -2469,7 +2475,7 @@ generate_partition_qual(Relation rel) return copyObject(rel->rd_partcheck); /* Grab at least an AccessShareLock on the parent table */ - parent = heap_open(get_partition_parent(RelationGetRelid(rel)), + parent = relation_open(get_partition_parent(RelationGetRelid(rel)), AccessShareLock); /* Get pg_class.relpartbound */ @@ -2481,14 +2487,19 @@ generate_partition_qual(Relation rel) boundDatum = SysCacheGetAttr(RELOID, tuple, Anum_pg_class_relpartbound, &isnull); - if (isnull) /* should not happen */ - elog(ERROR, "relation \"%s\" has relpartbound = null", - RelationGetRelationName(rel)); + + if (!isnull) + { + PartitionBoundSpec *bound; + bound = castNode(PartitionBoundSpec, stringToNode(TextDatumGetCString(boundDatum))); - ReleaseSysCache(tuple); my_qual = get_qual_from_partbound(rel, parent, bound); + } + + ReleaseSysCache(tuple); + /* Add the parent's quals to the list (if any) */ if (parent->rd_rel->relispartition) @@ -2514,7 +2525,7 @@ generate_partition_qual(Relation rel) MemoryContextSwitchTo(oldcxt); /* Keep the parent locked until commit */ - heap_close(parent, NoLock); + relation_close(parent, NoLock); return result; } diff --git a/src/backend/utils/cache/lsyscache.c b/src/backend/utils/cache/lsyscache.c index d8a59308..9061c0ed 100644 --- a/src/backend/utils/cache/lsyscache.c +++ b/src/backend/utils/cache/lsyscache.c @@ -1934,6 +1934,30 @@ get_rel_relkind(Oid relid) return '\0'; } +/* + * get_rel_relispartition + * + * Returns the relispartition flag associated with a given relation. + */ +bool +get_rel_relispartition(Oid relid) +{ + HeapTuple tp; + + tp = SearchSysCache1(RELOID, ObjectIdGetDatum(relid)); + if (HeapTupleIsValid(tp)) + { + Form_pg_class reltup = (Form_pg_class) GETSTRUCT(tp); + bool result; + + result = reltup->relispartition; + ReleaseSysCache(tp); + return result; + } + else + return false; +} + /* * get_rel_tablespace * diff --git a/src/include/utils/lsyscache.h b/src/include/utils/lsyscache.h index e0d757b0..e94c510b 100644 --- a/src/include/utils/lsyscache.h +++ b/src/include/utils/lsyscache.h @@ -141,6 +141,7 @@ extern char *get_rel_name(Oid relid); extern Oid get_rel_namespace(Oid relid); extern Oid get_rel_type_id(Oid relid); extern char get_rel_relkind(Oid relid); +extern bool get_rel_relispartition(Oid relid); extern Oid get_rel_tablespace(Oid relid); extern char get_rel_persistence(Oid relid); extern Oid get_transform_fromsql(Oid typid, Oid langid, List *trftypes); diff --git a/src/test/regress/expected/indexing.out b/src/test/regress/expected/indexing.out index e9cccca8..804aa2eb 100644 --- a/src/test/regress/expected/indexing.out +++ b/src/test/regress/expected/indexing.out @@ -58,6 +58,25 @@ Indexes: "idxpart1_a_idx" btree (a) "idxpart1_b_c_idx" btree (b, c) +\d+ idxpart1_a_idx + Index "public.idxpart1_a_idx" + Column | Type | Key? | Definition | Storage | Stats target +--------+---------+------+------------+---------+-------------- + a | integer | yes | a | plain | +Partition of: idxparti +No partition constraint +btree, for table "public.idxpart1" + +\d+ idxpart1_b_c_idx + Index "public.idxpart1_b_c_idx" + Column | Type | Key? 
| Definition | Storage | Stats target +--------+---------+------+------------+----------+-------------- + b | integer | yes | b | plain | + c | text | yes | c | extended | +Partition of: idxparti2 +No partition constraint +btree, for table "public.idxpart1" + drop table idxpart; -- If a partition already has an index, don't create a duplicative one create table idxpart (a int, b int) partition by range (a, b); diff --git a/src/test/regress/sql/indexing.sql b/src/test/regress/sql/indexing.sql index 33be7186..cd1dd3b0 100644 --- a/src/test/regress/sql/indexing.sql +++ b/src/test/regress/sql/indexing.sql @@ -28,6 +28,8 @@ create table idxpart1 (like idxpart); \d idxpart1 alter table idxpart attach partition idxpart1 for values from (0) to (10); \d idxpart1 +\d+ idxpart1_a_idx +\d+ idxpart1_b_c_idx drop table idxpart; -- If a partition already has an index, don't create a duplicative one From b2297103dc452b257132e86e152d7f679bb512d4 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Thu, 9 Jul 2020 19:50:00 +0800 Subject: [PATCH 303/578] fix pg_amproc.h error,\d+ partition table error, default partition error.http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/backend/catalog/partition.c | 57 +++++++------------- src/backend/optimizer/path/allpaths.c | 2 +- src/backend/optimizer/plan/subselect.c | 2 +- src/bin/psql/describe.c | 47 ++++++++++------- src/bin/psql/tab-complete.c | 6 +-- src/include/catalog/pg_amproc.h | 72 +++++++++++++------------- 6 files changed, 88 insertions(+), 98 deletions(-) diff --git a/src/backend/catalog/partition.c b/src/backend/catalog/partition.c index 1b46a503..263a426a 100644 --- a/src/backend/catalog/partition.c +++ b/src/backend/catalog/partition.c @@ -331,21 +331,18 @@ RelationBuildPartitionDesc(Relation rel) } else if (key->strategy == PARTITION_STRATEGY_RANGE) { - int j, - k; + int k; PartitionRangeBound **all_bounds, *prev; - bool *distinct_indexes; all_bounds = (PartitionRangeBound **) palloc0(2 * nparts * sizeof(PartitionRangeBound *)); - distinct_indexes = (bool *) palloc(2 * nparts * sizeof(bool)); /* * Create a unified list of range bounds across all the * partitions. */ - i = j = 0; + i = ndatums = 0; foreach(cell, boundspecs) { PartitionBoundSpec *spec = castNode(PartitionBoundSpec, @@ -371,28 +368,26 @@ RelationBuildPartitionDesc(Relation rel) true); upper = make_one_range_bound(key, i, spec->upperdatums, false); - all_bounds[j] = lower; - all_bounds[j + 1] = upper; - j += 2; + all_bounds[ndatums++] = lower; + all_bounds[ndatums++] = upper; i++; } - Assert(j == nparts * 2 || - (default_index != -1 && j == (nparts - 1) * 2)); + Assert(ndatums == nparts * 2 || + (default_index != -1 && ndatums == (nparts - 1) * 2)); /* Sort all the bounds in ascending order */ - qsort_arg(all_bounds, j, + qsort_arg(all_bounds, ndatums, sizeof(PartitionRangeBound *), qsort_partition_rbound_cmp, (void *) key); - /* - * Count the number of distinct bounds to allocate an array of - * that size. - */ - ndatums = 0; + /* Save distinct bounds from all_bounds into rbounds. */ + rbounds = (PartitionRangeBound **) + palloc(ndatums * sizeof(PartitionRangeBound *)); + k = 0; prev = NULL; - for (i = 0; i < 2 * nparts; i++) + for (i = 0; i < ndatums; i++) { PartitionRangeBound *cur = all_bounds[i]; bool is_distinct = false; @@ -429,34 +424,18 @@ RelationBuildPartitionDesc(Relation rel) } /* - * Count the current bound if it is distinct from the previous - * one. 
Also, store if the index i contains a distinct bound - * that we'd like put in the relcache array. + * Only if the bound is distinct save it into a temporary + * array i.e. rbounds which is later copied into boundinfo + * datums array. */ if (is_distinct) - { - distinct_indexes[i] = true; - ndatums++; - } - else - distinct_indexes[i] = false; + rbounds[k++] = all_bounds[i]; prev = cur; } - /* - * Finally save them in an array from where they will be copied - * into the relcache. - */ - rbounds = (PartitionRangeBound **) palloc(ndatums * - sizeof(PartitionRangeBound *)); - k = 0; - for (i = 0; i < 2 * nparts; i++) - { - if (distinct_indexes[i]) - rbounds[k++] = all_bounds[i]; - } - Assert(k == ndatums); + /* Update ndatums to hold the count of distinct datums. */ + ndatums = k; } else elog(ERROR, "unexpected partition strategy: %d", diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c index 73159be1..fbeeb3e5 100644 --- a/src/backend/optimizer/path/allpaths.c +++ b/src/backend/optimizer/path/allpaths.c @@ -1031,7 +1031,7 @@ set_append_rel_size(PlannerInfo *root, RelOptInfo *rel, childrel->reltarget->exprs = (List *) adjust_appendrel_attrs(root, (Node *) rel->reltarget->exprs, - &appinfo); + appinfo); /* * We have to make child entries in the EquivalenceClass data diff --git a/src/backend/optimizer/plan/subselect.c b/src/backend/optimizer/plan/subselect.c index 3aa0c9f4..c1583491 100644 --- a/src/backend/optimizer/plan/subselect.c +++ b/src/backend/optimizer/plan/subselect.c @@ -5141,7 +5141,7 @@ finalize_plan(PlannerInfo *root, Plan *plan, int gather_param, Bitmapset *valid_ context.paramids = bms_add_members(context.paramids, scan_params); break; - + case T_SubqueryScan: { SubqueryScan *sscan = (SubqueryScan *) plan; RelOptInfo *rel; diff --git a/src/bin/psql/describe.c b/src/bin/psql/describe.c index 64bb8794..4c2e07ac 100644 --- a/src/bin/psql/describe.c +++ b/src/bin/psql/describe.c @@ -1400,6 +1400,7 @@ describeOneTableDetails(const char *schemaname, bool rowsecurity; bool forcerowsecurity; bool hasoids; + bool ispartition; Oid tablespace; char *reloptions; char *reloftype; @@ -1428,7 +1429,7 @@ describeOneTableDetails(const char *schemaname, printfPQExpBuffer(&buf, "SELECT c.relchecks, c.relkind, c.relhasindex, c.relhasrules, " "c.relhastriggers, c.relrowsecurity, c.relforcerowsecurity, " - "c.relhasoids, %s, c.reltablespace, " + "c.relhasoids, c.relispartition, %s, c.reltablespace, " "CASE WHEN c.reloftype = 0 THEN '' ELSE c.reloftype::pg_catalog.regtype::pg_catalog.text END, " #ifdef __TBASE__ "c.relpersistence, c.relreplident, c.relpartkind\n" @@ -1559,20 +1560,21 @@ describeOneTableDetails(const char *schemaname, tableinfo.rowsecurity = strcmp(PQgetvalue(res, 0, 5), "t") == 0; tableinfo.forcerowsecurity = strcmp(PQgetvalue(res, 0, 6), "t") == 0; tableinfo.hasoids = strcmp(PQgetvalue(res, 0, 7), "t") == 0; + tableinfo.ispartition = strcmp(PQgetvalue(res, 0, 8), "t") == 0; tableinfo.reloptions = (pset.sversion >= 80200) ? - pg_strdup(PQgetvalue(res, 0, 8)) : NULL; + pg_strdup(PQgetvalue(res, 0, 9)) : NULL; tableinfo.tablespace = (pset.sversion >= 80000) ? - atooid(PQgetvalue(res, 0, 9)) : 0; + atooid(PQgetvalue(res, 0, 10)) : 0; tableinfo.reloftype = (pset.sversion >= 90000 && - strcmp(PQgetvalue(res, 0, 10), "") != 0) ? - pg_strdup(PQgetvalue(res, 0, 10)) : NULL; + strcmp(PQgetvalue(res, 0, 11), "") != 0) ? + pg_strdup(PQgetvalue(res, 0, 11)) : NULL; tableinfo.relpersistence = (pset.sversion >= 90100) ? 
- *(PQgetvalue(res, 0, 11)) : 0; + *(PQgetvalue(res, 0, 12)) : 0; tableinfo.relreplident = (pset.sversion >= 90400) ? - *(PQgetvalue(res, 0, 12)) : 'd'; + *(PQgetvalue(res, 0, 13)) : 'd'; #ifdef __TBASE__ tableinfo.relpartkind = (pset.sversion >= 90500)? - *PQgetvalue(res, 0, 13) : 'n'; + *PQgetvalue(res, 0, 14) : 'n'; #endif PQclear(res); res = NULL; @@ -2257,11 +2259,15 @@ describeOneTableDetails(const char *schemaname, tableinfo.relkind == RELKIND_PARTITIONED_TABLE) { printfPQExpBuffer(&buf, - "SELECT conname,\n" - " pg_catalog.pg_get_constraintdef(r.oid, true) as condef\n" - "FROM pg_catalog.pg_constraint r\n" - "WHERE r.conrelid = '%s' AND r.contype = 'f' ORDER BY 1;", - oid); + "SELECT conrelid = '%s'::pg_catalog.regclass AS sametable,\n" + " conname,\n" + " pg_catalog.pg_get_constraintdef(oid, true) as condef,\n" + " conrelid::pg_catalog.regclass AS ontable\n" + " FROM pg_catalog.pg_constraint,\n" + " pg_catalog.pg_partition_ancestors('%s')\n" + " WHERE conrelid = relid AND contype = 'f'\n" + " ORDER BY sametable DESC, conname;", + oid, oid); result = PSQLexec(buf.data); if (!result) goto error_return; @@ -2273,10 +2279,15 @@ describeOneTableDetails(const char *schemaname, printTableAddFooter(&cont, _("Foreign-key constraints:")); for (i = 0; i < tuples; i++) { - /* untranslated constraint name and def */ - printfPQExpBuffer(&buf, " \"%s\" %s", - PQgetvalue(result, i, 0), - PQgetvalue(result, i, 1)); + /* + * Print untranslated constraint name and definition. Use + * a "TABLE tab" prefix when the constraint is defined in + * a parent partitioned table. + */ + printfPQExpBuffer(&buf, " TABLE \"%s\" CONSTRAINT \"%s\" %s", + PQgetvalue(result, i, 1), + PQgetvalue(result, i, 2), + PQgetvalue(result, i, 3)); printTableAddFooter(&cont, buf.data); } @@ -3819,7 +3830,7 @@ listPartitionedTables(const char *reltypes, const char *pattern, bool verbose) { char sverbuf[32]; - pg_log_error("The server (version %s) does not support declarative table partitioning.", + psql_error("The server (version %s) does not support declarative table partitioning.", formatPGVersionNumber(pset.sversion, false, sverbuf, sizeof(sverbuf))); return true; diff --git a/src/bin/psql/tab-complete.c b/src/bin/psql/tab-complete.c index db21cc50..638f04f5 100644 --- a/src/bin/psql/tab-complete.c +++ b/src/bin/psql/tab-complete.c @@ -3530,11 +3530,11 @@ psql_completion(const char *text, int start, int end) COMPLETE_WITH_QUERY(Query_for_list_of_schemas); else if (TailMatchesCS1("\\dp") || TailMatchesCS1("\\z")) COMPLETE_WITH_SCHEMA_QUERY(Query_for_list_of_tsvmf, NULL); - else if (TailMatchesCS("\\dPi*")) + else if (TailMatchesCS1("\\dPi*")) COMPLETE_WITH_SCHEMA_QUERY(Query_for_list_of_partitioned_indexes, NULL); - else if (TailMatchesCS("\\dPt*")) + else if (TailMatchesCS1("\\dPt*")) COMPLETE_WITH_SCHEMA_QUERY(Query_for_list_of_partitioned_tables, NULL); - else if (TailMatchesCS("\\dP*")) + else if (TailMatchesCS1("\\dP*")) COMPLETE_WITH_SCHEMA_QUERY(Query_for_list_of_partitioned_relations, NULL); else if (TailMatchesCS1("\\ds*")) COMPLETE_WITH_SCHEMA_QUERY(Query_for_list_of_sequences, NULL); diff --git a/src/include/catalog/pg_amproc.h b/src/include/catalog/pg_amproc.h index b239bbec..b02d0d0d 100644 --- a/src/include/catalog/pg_amproc.h +++ b/src/include/catalog/pg_amproc.h @@ -153,77 +153,77 @@ DATA(insert ( 4033 3802 3802 1 4044 )); /* hash */ DATA(insert ( 427 1042 1042 1 1080 )); -DATA(insert ( 427 1042 1042 2 972 )); +DATA(insert ( 427 1042 1042 2 4676 )); DATA(insert ( 431 18 18 1 454 )); -DATA(insert ( 431 18 18 2 
446 )); +DATA(insert ( 431 18 18 2 4666 )); DATA(insert ( 435 1082 1082 1 450 )); -DATA(insert ( 435 1082 1082 2 425 )); +DATA(insert ( 435 1082 1082 2 4661 )); DATA(insert ( 627 2277 2277 1 626 )); -DATA(insert ( 627 2277 2277 2 782 )); +DATA(insert ( 627 2277 2277 2 4686 )); DATA(insert ( 1971 700 700 1 451 )); -DATA(insert ( 1971 700 700 2 443 )); +DATA(insert ( 1971 700 700 2 4663 )); DATA(insert ( 1971 701 701 1 452 )); -DATA(insert ( 1971 701 701 2 444 )); +DATA(insert ( 1971 701 701 2 4664 )); DATA(insert ( 1975 869 869 1 422 )); -DATA(insert ( 1975 869 869 2 779 )); +DATA(insert ( 1975 869 869 2 4673 )); DATA(insert ( 1977 21 21 1 449 )); -DATA(insert ( 1977 21 21 2 441 )); +DATA(insert ( 1977 21 21 2 4660 )); DATA(insert ( 1977 23 23 1 450 )); -DATA(insert ( 1977 23 23 2 425 )); +DATA(insert ( 1977 23 23 2 4661 )); DATA(insert ( 1977 20 20 1 949 )); -DATA(insert ( 1977 20 20 2 442 )); +DATA(insert ( 1977 20 20 2 4662 )); DATA(insert ( 1983 1186 1186 1 1697 )); -DATA(insert ( 1983 1186 1186 2 3418 )); +DATA(insert ( 1983 1186 1186 2 4679 )); DATA(insert ( 1985 829 829 1 399 )); -DATA(insert ( 1985 829 829 2 778 )); +DATA(insert ( 1985 829 829 2 4672 )); DATA(insert ( 1987 19 19 1 455 )); -DATA(insert ( 1987 19 19 2 447 )); +DATA(insert ( 1987 19 19 2 4667 )); DATA(insert ( 1990 26 26 1 453 )); -DATA(insert ( 1990 26 26 2 445 )); +DATA(insert ( 1990 26 26 2 4665 )); DATA(insert ( 1992 30 30 1 457 )); -DATA(insert ( 1992 30 30 2 776 )); +DATA(insert ( 1992 30 30 2 4670 )); DATA(insert ( 1995 25 25 1 400 )); -DATA(insert ( 1995 25 25 2 448)); +DATA(insert ( 1995 25 25 2 4668)); DATA(insert ( 1997 1083 1083 1 1688 )); -DATA(insert ( 1997 1083 1083 2 3409 )); +DATA(insert ( 1997 1083 1083 2 4677 )); DATA(insert ( 1998 1700 1700 1 432 )); -DATA(insert ( 1998 1700 1700 2 780 )); +DATA(insert ( 1998 1700 1700 2 4674 )); DATA(insert ( 1999 1184 1184 1 2039 )); -DATA(insert ( 1999 1184 1184 2 3411 )); +DATA(insert ( 1999 1184 1184 2 4680 )); DATA(insert ( 2001 1266 1266 1 1696 )); -DATA(insert ( 2001 1266 1266 2 3410 )); +DATA(insert ( 2001 1266 1266 2 4678 )); DATA(insert ( 2040 1114 1114 1 2039 )); -DATA(insert ( 2040 1114 1114 2 3411 )); +DATA(insert ( 2040 1114 1114 2 4680 )); DATA(insert ( 2222 16 16 1 454 )); -DATA(insert ( 2222 16 16 2 446 )); +DATA(insert ( 2222 16 16 2 4666 )); DATA(insert ( 2223 17 17 1 456 )); -DATA(insert ( 2223 17 17 2 772 )); +DATA(insert ( 2223 17 17 2 4669 )); DATA(insert ( 2225 28 28 1 450 )); -DATA(insert ( 2225 28 28 2 425)); +DATA(insert ( 2225 28 28 2 4661)); DATA(insert ( 2226 29 29 1 450 )); -DATA(insert ( 2226 29 29 2 425 )); +DATA(insert ( 2226 29 29 2 4661 )); DATA(insert ( 2227 702 702 1 450 )); -DATA(insert ( 2227 702 702 2 425 )); +DATA(insert ( 2227 702 702 2 4661 )); DATA(insert ( 2228 703 703 1 450 )); -DATA(insert ( 2228 703 703 2 425 )); +DATA(insert ( 2228 703 703 2 4661 )); DATA(insert ( 2229 25 25 1 400 )); -DATA(insert ( 2229 25 25 2 448 )); +DATA(insert ( 2229 25 25 2 4668 )); DATA(insert ( 2231 1042 1042 1 1080 )); -DATA(insert ( 2231 1042 1042 2 972 )); +DATA(insert ( 2231 1042 1042 2 4676 )); DATA(insert ( 2235 1033 1033 1 329 )); -DATA(insert ( 2235 1033 1033 2 777 )); +DATA(insert ( 2235 1033 1033 2 4671 )); DATA(insert ( 2969 2950 2950 1 2963 )); -DATA(insert ( 2969 2950 2950 2 3412 )); +DATA(insert ( 2969 2950 2950 2 4681 )); DATA(insert ( 3254 3220 3220 1 3252 )); -DATA(insert ( 3254 3220 3220 2 3413 )); +DATA(insert ( 3254 3220 3220 2 4682 )); DATA(insert ( 3372 774 774 1 328 )); -DATA(insert ( 3372 774 774 2 781 )); 
+DATA(insert ( 3372 774 774 2 4675 )); DATA(insert ( 3523 3500 3500 1 3515 )); -DATA(insert ( 3523 3500 3500 2 3414 )); +DATA(insert ( 3523 3500 3500 2 4683 )); DATA(insert ( 3903 3831 3831 1 3902 )); -DATA(insert ( 3903 3831 3831 2 3417 )); +DATA(insert ( 3903 3831 3831 2 4685 )); DATA(insert ( 4034 3802 3802 1 4045 )); -DATA(insert ( 4034 3802 3802 2 3416)); +DATA(insert ( 4034 3802 3802 2 4684)); /* gist */ DATA(insert ( 1029 600 600 1 2179 )); From ac8c2434c5dc5be3570e150ea6ca3c4d78082bdb Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Fri, 10 Jul 2020 11:58:37 +0800 Subject: [PATCH 304/578] Fix crash when ALTER TABLE recreates indexes on partitions. http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/backend/commands/indexcmds.c | 4 ++-- src/backend/commands/tablecmds.c | 2 +- src/test/regress/expected/indexing.out | 8 ++++++++ src/test/regress/sql/indexing.sql | 9 +++++++++ 4 files changed, 20 insertions(+), 3 deletions(-) diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c index be45e453..76701f4a 100644 --- a/src/backend/commands/indexcmds.c +++ b/src/backend/commands/indexcmds.c @@ -964,8 +964,8 @@ DefineIndex(Oid relationId, DefineIndex(childRelid, childStmt, InvalidOid, /* no predefined OID */ indexRelationId, /* this is our child */ - false, check_rights, check_not_in_use, - false, quiet); + is_alter_table, check_rights, check_not_in_use, + skip_build, quiet); } pfree(attmap); diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index 430141a2..989fb062 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -17155,7 +17155,7 @@ AttachPartitionEnsureIndexes(Relation rel, Relation attachrel) RelationGetDescr(rel)->natts); DefineIndex(RelationGetRelid(attachrel), stmt, InvalidOid, RelationGetRelid(idxRel), - false, false, false, false, false); + true, false, false, false, false); } index_close(idxRel, AccessShareLock); diff --git a/src/test/regress/expected/indexing.out b/src/test/regress/expected/indexing.out index 804aa2eb..d4326a87 100644 --- a/src/test/regress/expected/indexing.out +++ b/src/test/regress/expected/indexing.out @@ -31,6 +31,14 @@ ERROR: cannot create unique index on partitioned table "idxpart" create index concurrently on idxpart (a); ERROR: cannot create index on partitioned table "idxpart" concurrently drop table idxpart; +-- Verify bugfix with index rewrite on ALTER TABLE / SET DATA TYPE +-- https://postgr.es/m/CAKcux6mxNCGsgATwf5CGMF8g4WSupCXicCVMeKUTuWbyxHOMsQ@mail.gmail.com +CREATE TABLE idxpart (a INT, b TEXT, c INT) PARTITION BY RANGE(a); +CREATE TABLE idxpart1 PARTITION OF idxpart FOR VALUES FROM (MINVALUE) TO (MAXVALUE); +CREATE INDEX partidx_abc_idx ON idxpart (a, b, c); +INSERT INTO idxpart (a, b, c) SELECT i, i, i FROM generate_series(1, 50) i; +ALTER TABLE idxpart ALTER COLUMN c TYPE numeric; +DROP TABLE idxpart; -- If a table without index is attached as partition to a table with -- an index, the index is automatically created create table idxpart (a int, b int, c text) partition by range (a); diff --git a/src/test/regress/sql/indexing.sql b/src/test/regress/sql/indexing.sql index cd1dd3b0..4762e687 100644 --- a/src/test/regress/sql/indexing.sql +++ b/src/test/regress/sql/indexing.sql @@ -19,6 +19,15 @@ create unique index on idxpart (a); create index concurrently on idxpart (a); drop table idxpart; +-- Verify bugfix with index rewrite on ALTER TABLE / SET DATA TYPE +-- 
https://postgr.es/m/CAKcux6mxNCGsgATwf5CGMF8g4WSupCXicCVMeKUTuWbyxHOMsQ@mail.gmail.com +CREATE TABLE idxpart (a INT, b TEXT, c INT) PARTITION BY RANGE(a); +CREATE TABLE idxpart1 PARTITION OF idxpart FOR VALUES FROM (MINVALUE) TO (MAXVALUE); +CREATE INDEX partidx_abc_idx ON idxpart (a, b, c); +INSERT INTO idxpart (a, b, c) SELECT i, i, i FROM generate_series(1, 50) i; +ALTER TABLE idxpart ALTER COLUMN c TYPE numeric; +DROP TABLE idxpart; + -- If a table without index is attached as partition to a table with -- an index, the index is automatically created create table idxpart (a int, b int, c text) partition by range (a); From d10527dd94a348df085f2c41864b0b0a2b0088f1 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Thu, 16 Jul 2020 19:30:57 +0800 Subject: [PATCH 305/578] fix regress error related partition table. http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/backend/optimizer/path/allpaths.c | 6 +- src/backend/optimizer/path/equivclass.c | 5 +- src/backend/optimizer/path/joinrels.c | 2 +- src/backend/optimizer/plan/planner.c | 2 +- src/backend/optimizer/prep/prepunion.c | 273 +- src/backend/optimizer/util/pathnode.c | 3 +- src/backend/optimizer/util/placeholder.c | 2 +- src/backend/optimizer/util/relnode.c | 2 +- src/bin/psql/describe.c | 45 +- src/include/optimizer/prep.h | 8 +- src/test/regress/expected/alter_table.out | 21 +- src/test/regress/expected/alter_table_3.out | 3 +- src/test/regress/expected/create_table.out | 6 + src/test/regress/expected/event_trigger.out | 7 +- src/test/regress/expected/foreign_data.out | 3 + src/test/regress/expected/foreign_key_2.out | 25 +- src/test/regress/expected/identity_1.out | 12 + src/test/regress/expected/indexing.out | 65 +- src/test/regress/expected/inherit_2.out | 24 + src/test/regress/expected/inherit_3.out | 78 +- src/test/regress/expected/insert.out | 50 +- .../regress/expected/insert_conflict_1.out | 21 + src/test/regress/expected/partition_info.out | 94 +- .../regress/expected/partition_join_1.out | 2102 ++++++++++++++ src/test/regress/expected/partition_prune.out | 2477 +++++++++-------- .../regress/expected/partition_prune_hash.out | 288 +- src/test/regress/expected/psql.out | 26 +- src/test/regress/expected/sanity_check_1.out | 6 + src/test/regress/expected/sysviews.out | 2 +- src/test/regress/expected/temp.out | 2 +- src/test/regress/expected/truncate.out | 22 +- src/test/regress/expected/update.out | 355 +-- src/test/regress/input/tablespace.source | 29 +- src/test/regress/output/tablespace.source | 326 +-- src/test/regress/sql/alter_table.sql | 4 +- src/test/regress/sql/event_trigger.sql | 2 +- src/test/regress/sql/indexing.sql | 32 +- src/test/regress/sql/inherit.sql | 2 +- src/test/regress/sql/insert.sql | 6 +- src/test/regress/sql/partition_info.sql | 24 - src/test/regress/sql/partition_prune.sql | 2 +- src/test/regress/sql/select_parallel.sql | 2 + src/test/regress/sql/temp.sql | 2 +- src/test/regress/sql/truncate.sql | 4 +- src/test/regress/sql/update.sql | 23 +- 45 files changed, 4348 insertions(+), 2147 deletions(-) create mode 100644 src/test/regress/expected/partition_join_1.out diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c index fbeeb3e5..4326a646 100644 --- a/src/backend/optimizer/path/allpaths.c +++ b/src/backend/optimizer/path/allpaths.c @@ -1031,7 +1031,7 @@ set_append_rel_size(PlannerInfo *root, RelOptInfo *rel, childrel->reltarget->exprs = (List *) adjust_appendrel_attrs(root, (Node *) rel->reltarget->exprs, - appinfo); + 1, 
&appinfo); /* * We have to make child entries in the EquivalenceClass data @@ -1073,7 +1073,7 @@ set_append_rel_size(PlannerInfo *root, RelOptInfo *rel, Assert(IsA(rinfo, RestrictInfo)); childqual = adjust_appendrel_attrs(root, (Node *) rinfo->clause, - appinfo); + 1, &appinfo); childqual = eval_const_expressions(root, childqual); /* check for flat-out constant */ if (childqual && IsA(childqual, Const)) @@ -1190,7 +1190,7 @@ set_append_rel_size(PlannerInfo *root, RelOptInfo *rel, childrel->joininfo = (List *) adjust_appendrel_attrs(root, (Node *) rel->joininfo, - appinfo); + 1, &appinfo); /* * If parallelism is allowable for this query in general, see whether diff --git a/src/backend/optimizer/path/equivclass.c b/src/backend/optimizer/path/equivclass.c index 4ad19a55..bb16aff0 100644 --- a/src/backend/optimizer/path/equivclass.c +++ b/src/backend/optimizer/path/equivclass.c @@ -1344,7 +1344,8 @@ generate_join_implied_equalities_broken(PlannerInfo *root, if (IS_OTHER_REL(inner_rel) && result != NIL) result = (List *) adjust_appendrel_attrs_multilevel(root, (Node *) result, - inner_rel); + inner_rel->relids, + inner_rel->top_parent_relids); return result; } @@ -2127,7 +2128,7 @@ add_child_rel_equivalences(PlannerInfo *root, child_expr = (Expr *) adjust_appendrel_attrs(root, (Node *) cur_em->em_expr, - appinfo); + 1, &appinfo); /* * Transform em_relids to match. Note we do *not* do diff --git a/src/backend/optimizer/path/joinrels.c b/src/backend/optimizer/path/joinrels.c index 715036b9..d8afa3ef 100644 --- a/src/backend/optimizer/path/joinrels.c +++ b/src/backend/optimizer/path/joinrels.c @@ -1405,7 +1405,7 @@ try_partition_wise_join(PlannerInfo *root, RelOptInfo *rel1, RelOptInfo *rel2, * applicable to the parent join. */ child_restrictlist = - (List *) adjust_appendrel_attrs_nappinfos(root, + (List *) adjust_appendrel_attrs(root, (Node *) parent_restrictlist, nappinfos, appinfos); pfree(appinfos); diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index 2b736fd4..55c28ea3 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -1402,7 +1402,7 @@ inheritance_planner(PlannerInfo *root) subroot->parse = (Query *) adjust_appendrel_attrs(parent_root, (Node *) parent_parse, - appinfo); + 1, &appinfo); /* * If there are securityQuals attached to the parent, move them to the diff --git a/src/backend/optimizer/prep/prepunion.c b/src/backend/optimizer/prep/prepunion.c index 62bc5200..e5ddbc5c 100644 --- a/src/backend/optimizer/prep/prepunion.c +++ b/src/backend/optimizer/prep/prepunion.c @@ -58,9 +58,8 @@ typedef struct { PlannerInfo *root; - AppendRelInfo *appinfo; - AppendRelInfo **appinfos; int nappinfos; + AppendRelInfo **appinfos; } adjust_appendrel_attrs_context; static Path *recurse_set_operations(Node *setOp, PlannerInfo *root, @@ -125,7 +124,8 @@ static Node *adjust_appendrel_attrs_mutator(Node *node, adjust_appendrel_attrs_context *context); static Relids adjust_child_relids(Relids relids, int nappinfos, AppendRelInfo **appinfos); -static Relids adjust_relid_set(Relids relids, Index oldrelid, Index newrelid); +static Relids adjust_child_relids(Relids relids, int nappinfos, + AppendRelInfo **appinfos); static List *adjust_inherited_tlist(List *tlist, AppendRelInfo *context); @@ -1991,56 +1991,6 @@ translate_col_privs(const Bitmapset *parent_privs, return child_privs; } -/* - * adjust_appendrel_attrs - * Copy the specified query or expression and translate Vars referring - * to the parent rel of the specified 
AppendRelInfo to refer to the - * child rel instead. We also update rtindexes appearing outside Vars, - * such as resultRelation and jointree relids. - * - * Note: this is only applied after conversion of sublinks to subplans, - * so we don't need to cope with recursion into sub-queries. - * - * Note: this is not hugely different from what pullup_replace_vars() does; - * maybe we should try to fold the two routines together. - */ -Node * -adjust_appendrel_attrs(PlannerInfo *root, Node *node, AppendRelInfo *appinfo) -{ - Node *result; - adjust_appendrel_attrs_context context; - - context.root = root; - context.appinfo = appinfo; - - /* - * Must be prepared to start with a Query or a bare expression tree. - */ - if (node && IsA(node, Query)) - { - Query *newnode; - - newnode = query_tree_mutator((Query *) node, - adjust_appendrel_attrs_mutator, - (void *) &context, - QTW_IGNORE_RC_SUBQUERIES); - if (newnode->resultRelation == appinfo->parent_relid) - { - newnode->resultRelation = appinfo->child_relid; - /* Fix tlist resnos too, if it's inherited UPDATE */ - if (newnode->commandType == CMD_UPDATE) - newnode->targetList = - adjust_inherited_tlist(newnode->targetList, - appinfo); - } - result = (Node *) newnode; - } - else - result = adjust_appendrel_attrs_mutator(node, &context); - - return result; -} - /* * adjust_appendrel_attrs * Copy the specified query or expression and translate Vars referring to a @@ -2055,7 +2005,7 @@ adjust_appendrel_attrs(PlannerInfo *root, Node *node, AppendRelInfo *appinfo) * maybe we should try to fold the two routines together. */ Node * -adjust_appendrel_attrs_nappinfos(PlannerInfo *root, Node *node, int nappinfos, +adjust_appendrel_attrs(PlannerInfo *root, Node *node, int nappinfos, AppendRelInfo **appinfos) { Node *result; @@ -2107,17 +2057,28 @@ adjust_appendrel_attrs_nappinfos(PlannerInfo *root, Node *node, int nappinfos, static Node * adjust_appendrel_attrs_mutator(Node *node, adjust_appendrel_attrs_context *context) -{// #lizard forgives - AppendRelInfo *appinfo = context->appinfo; +{ + AppendRelInfo **appinfos = context->appinfos; + int nappinfos = context->nappinfos; + int cnt; if (node == NULL) return NULL; if (IsA(node, Var)) { Var *var = (Var *) copyObject(node); + AppendRelInfo *appinfo = NULL; + + for (cnt = 0; cnt < nappinfos; cnt++) + { + if (var->varno == appinfos[cnt]->parent_relid) + { + appinfo = appinfos[cnt]; + break; + } + } - if (var->varlevelsup == 0 && - var->varno == appinfo->parent_relid) + if (var->varlevelsup == 0 && appinfo) { var->varno = appinfo->child_relid; var->varnoold = appinfo->child_relid; @@ -2197,29 +2158,54 @@ adjust_appendrel_attrs_mutator(Node *node, { CurrentOfExpr *cexpr = (CurrentOfExpr *) copyObject(node); + for (cnt = 0; cnt < nappinfos; cnt++) + { + AppendRelInfo *appinfo = appinfos[cnt]; + if (cexpr->cvarno == appinfo->parent_relid) + { cexpr->cvarno = appinfo->child_relid; + break; + } + } return (Node *) cexpr; } if (IsA(node, RangeTblRef)) { RangeTblRef *rtr = (RangeTblRef *) copyObject(node); + for (cnt = 0; cnt < nappinfos; cnt++) + { + AppendRelInfo *appinfo = appinfos[cnt]; + if (rtr->rtindex == appinfo->parent_relid) + { rtr->rtindex = appinfo->child_relid; + break; + } + } return (Node *) rtr; } if (IsA(node, JoinExpr)) { /* Copy the JoinExpr node with correct mutation of subnodes */ JoinExpr *j; + AppendRelInfo *appinfo; j = (JoinExpr *) expression_tree_mutator(node, adjust_appendrel_attrs_mutator, (void *) context); /* now fix JoinExpr's rtindex (probably never happens) */ + for (cnt = 0; cnt < 
nappinfos; cnt++) + { + appinfo = appinfos[cnt]; + if (j->rtindex == appinfo->parent_relid) + { j->rtindex = appinfo->child_relid; + break; + } + } return (Node *) j; } if (IsA(node, PlaceHolderVar)) @@ -2232,9 +2218,8 @@ adjust_appendrel_attrs_mutator(Node *node, (void *) context); /* now fix PlaceHolderVar's relid sets */ if (phv->phlevelsup == 0) - phv->phrels = adjust_relid_set(phv->phrels, - appinfo->parent_relid, - appinfo->child_relid); + phv->phrels = adjust_child_relids(phv->phrels, context->nappinfos, + context->appinfos); return (Node *) phv; } /* Shouldn't need to handle planner auxiliary nodes here */ @@ -2265,24 +2250,24 @@ adjust_appendrel_attrs_mutator(Node *node, adjust_appendrel_attrs_mutator((Node *) oldinfo->orclause, context); /* adjust relid sets too */ - newinfo->clause_relids = adjust_relid_set(oldinfo->clause_relids, - appinfo->parent_relid, - appinfo->child_relid); - newinfo->required_relids = adjust_relid_set(oldinfo->required_relids, - appinfo->parent_relid, - appinfo->child_relid); - newinfo->outer_relids = adjust_relid_set(oldinfo->outer_relids, - appinfo->parent_relid, - appinfo->child_relid); - newinfo->nullable_relids = adjust_relid_set(oldinfo->nullable_relids, - appinfo->parent_relid, - appinfo->child_relid); - newinfo->left_relids = adjust_relid_set(oldinfo->left_relids, - appinfo->parent_relid, - appinfo->child_relid); - newinfo->right_relids = adjust_relid_set(oldinfo->right_relids, - appinfo->parent_relid, - appinfo->child_relid); + newinfo->clause_relids = adjust_child_relids(oldinfo->clause_relids, + context->nappinfos, + context->appinfos); + newinfo->required_relids = adjust_child_relids(oldinfo->required_relids, + context->nappinfos, + context->appinfos); + newinfo->outer_relids = adjust_child_relids(oldinfo->outer_relids, + context->nappinfos, + context->appinfos); + newinfo->nullable_relids = adjust_child_relids(oldinfo->nullable_relids, + context->nappinfos, + context->appinfos); + newinfo->left_relids = adjust_child_relids(oldinfo->left_relids, + context->nappinfos, + context->appinfos); + newinfo->right_relids = adjust_child_relids(oldinfo->right_relids, + context->nappinfos, + context->appinfos); /* * Reset cached derivative fields, since these might need to have @@ -2347,23 +2332,6 @@ adjust_child_relids(Relids relids, int nappinfos, AppendRelInfo **appinfos) return relids; } -/* - * Substitute newrelid for oldrelid in a Relid set - */ -static Relids -adjust_relid_set(Relids relids, Index oldrelid, Index newrelid) -{ - if (bms_is_member(oldrelid, relids)) - { - /* Ensure we have a modifiable copy */ - relids = bms_copy(relids); - /* Remove old, add new */ - relids = bms_del_member(relids, oldrelid); - relids = bms_add_member(relids, newrelid); - } - return relids; -} - /* * Replace any relid present in top_parent_relids with its child in * child_relids. Members of child_relids can be multiple levels below top @@ -2518,65 +2486,42 @@ adjust_inherited_tlist(List *tlist, AppendRelInfo *context) * adjust_appendrel_attrs_multilevel * Apply Var translations from a toplevel appendrel parent down to a child. * - * In some cases we need to translate expressions referencing a baserel + * In some cases we need to translate expressions referencing a parent relation * to reference an appendrel child that's multiple levels removed from it. 
*/ Node * adjust_appendrel_attrs_multilevel(PlannerInfo *root, Node *node, - RelOptInfo *child_rel) + Relids child_relids, + Relids top_parent_relids) { - AppendRelInfo *appinfo = find_childrel_appendrelinfo(root, child_rel); - RelOptInfo *parent_rel = find_base_rel(root, appinfo->parent_relid); + AppendRelInfo **appinfos; + Bitmapset *parent_relids = NULL; + int nappinfos; + int cnt; - /* If parent is also a child, first recurse to apply its translations */ - if (IS_OTHER_REL(parent_rel)) - node = adjust_appendrel_attrs_multilevel(root, node, parent_rel); - else - Assert(parent_rel->reloptkind == RELOPT_BASEREL); - /* Now translate for this child */ - return adjust_appendrel_attrs(root, node, appinfo); -} + Assert(bms_num_members(child_relids) == bms_num_members(top_parent_relids)); -/* - * Construct the SpecialJoinInfo for a child-join by translating - * SpecialJoinInfo for the join between parents. left_relids and right_relids - * are the relids of left and right side of the join respectively. - */ -SpecialJoinInfo * -build_child_join_sjinfo(PlannerInfo *root, SpecialJoinInfo *parent_sjinfo, - Relids left_relids, Relids right_relids) + appinfos = find_appinfos_by_relids(root, child_relids, &nappinfos); + + /* Construct relids set for the immediate parent of given child. */ + for (cnt = 0; cnt < nappinfos; cnt++) { - SpecialJoinInfo *sjinfo = makeNode(SpecialJoinInfo); - AppendRelInfo **left_appinfos; - int left_nappinfos; - AppendRelInfo **right_appinfos; - int right_nappinfos; + AppendRelInfo *appinfo = appinfos[cnt]; - memcpy(sjinfo, parent_sjinfo, sizeof(SpecialJoinInfo)); - left_appinfos = find_appinfos_by_relids(root, left_relids, - &left_nappinfos); - right_appinfos = find_appinfos_by_relids(root, right_relids, - &right_nappinfos); + parent_relids = bms_add_member(parent_relids, appinfo->parent_relid); + } - sjinfo->min_lefthand = adjust_child_relids(sjinfo->min_lefthand, - left_nappinfos, left_appinfos); - sjinfo->min_righthand = adjust_child_relids(sjinfo->min_righthand, - right_nappinfos, - right_appinfos); - sjinfo->syn_lefthand = adjust_child_relids(sjinfo->syn_lefthand, - left_nappinfos, left_appinfos); - sjinfo->syn_righthand = adjust_child_relids(sjinfo->syn_righthand, - right_nappinfos, - right_appinfos); - sjinfo->semi_rhs_exprs = (List *) adjust_appendrel_attrs_nappinfos(root, - (Node *) sjinfo->semi_rhs_exprs, - right_nappinfos, - right_appinfos); + /* Recurse if immediate parent is not the top parent. */ + if (!bms_equal(parent_relids, top_parent_relids)) + node = adjust_appendrel_attrs_multilevel(root, node, parent_relids, + top_parent_relids); - pfree(left_appinfos); - pfree(right_appinfos); + /* Now translate for this child */ + node = adjust_appendrel_attrs(root, node, nappinfos, appinfos); - return sjinfo; + pfree(appinfos); + + return node; } /* @@ -2615,3 +2560,45 @@ find_appinfos_by_relids(PlannerInfo *root, Relids relids, int *nappinfos) elog(ERROR, "did not find all requested child rels in append_rel_list"); return NULL; /* not reached */ } + +/* + * Construct the SpecialJoinInfo for a child-join by translating + * SpecialJoinInfo for the join between parents. left_relids and right_relids + * are the relids of left and right side of the join respectively. 
+ */ +SpecialJoinInfo * +build_child_join_sjinfo(PlannerInfo *root, SpecialJoinInfo *parent_sjinfo, + Relids left_relids, Relids right_relids) +{ + SpecialJoinInfo *sjinfo = makeNode(SpecialJoinInfo); + AppendRelInfo **left_appinfos; + int left_nappinfos; + AppendRelInfo **right_appinfos; + int right_nappinfos; + + memcpy(sjinfo, parent_sjinfo, sizeof(SpecialJoinInfo)); + left_appinfos = find_appinfos_by_relids(root, left_relids, + &left_nappinfos); + right_appinfos = find_appinfos_by_relids(root, right_relids, + &right_nappinfos); + + sjinfo->min_lefthand = adjust_child_relids(sjinfo->min_lefthand, + left_nappinfos, left_appinfos); + sjinfo->min_righthand = adjust_child_relids(sjinfo->min_righthand, + right_nappinfos, + right_appinfos); + sjinfo->syn_lefthand = adjust_child_relids(sjinfo->syn_lefthand, + left_nappinfos, left_appinfos); + sjinfo->syn_righthand = adjust_child_relids(sjinfo->syn_righthand, + right_nappinfos, + right_appinfos); + sjinfo->semi_rhs_exprs = (List *) adjust_appendrel_attrs(root, + (Node *) sjinfo->semi_rhs_exprs, + right_nappinfos, + right_appinfos); + + pfree(left_appinfos); + pfree(right_appinfos); + + return sjinfo; +} diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index 2d4a5d2b..4d2a1f32 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -6967,7 +6967,8 @@ reparameterize_path_by_child(PlannerInfo *root, Path *path, #define ADJUST_CHILD_ATTRS(node) \ ((node) = \ (List *) adjust_appendrel_attrs_multilevel(root, (Node *) (node), \ - child_rel)) + child_rel->relids, \ + child_rel->top_parent_relids)) #define REPARAMETERIZE_CHILD_PATH(path) \ do { \ diff --git a/src/backend/optimizer/util/placeholder.c b/src/backend/optimizer/util/placeholder.c index a344dbe8..0d5351a6 100644 --- a/src/backend/optimizer/util/placeholder.c +++ b/src/backend/optimizer/util/placeholder.c @@ -499,7 +499,7 @@ add_placeholders_to_child_joinrel(PlannerInfo *root, RelOptInfo *childrel, if (bms_overlap(phv->phrels, parentrel->relids) && childrel->reloptkind == RELOPT_OTHER_JOINREL) { - phv = (PlaceHolderVar *) adjust_appendrel_attrs_nappinfos(root, + phv = (PlaceHolderVar *) adjust_appendrel_attrs(root, (Node *) phv, nappinfos, appinfos); diff --git a/src/backend/optimizer/util/relnode.c b/src/backend/optimizer/util/relnode.c index 39100cae..1f6fb286 100644 --- a/src/backend/optimizer/util/relnode.c +++ b/src/backend/optimizer/util/relnode.c @@ -819,7 +819,7 @@ build_child_join_rel(PlannerInfo *root, RelOptInfo *outer_rel, /* Construct joininfo list. 
*/ appinfos = find_appinfos_by_relids(root, joinrel->relids, &nappinfos); - joinrel->joininfo = (List *) adjust_appendrel_attrs_nappinfos(root, + joinrel->joininfo = (List *) adjust_appendrel_attrs(root, (Node *) parent_joinrel->joininfo, nappinfos, appinfos); diff --git a/src/bin/psql/describe.c b/src/bin/psql/describe.c index 4c2e07ac..ff427084 100644 --- a/src/bin/psql/describe.c +++ b/src/bin/psql/describe.c @@ -1911,12 +1911,15 @@ describeOneTableDetails(const char *schemaname, { char *parent_name = PQgetvalue(result, 0, 0); char *partdef = PQgetvalue(result, 0, 1); - char *partconstraintdef = NULL; printfPQExpBuffer(&tmpbuf, _("Partition of: %s %s"), parent_name, partdef); printTableAddFooter(&cont, tmpbuf.data); + if (verbose) + { + char *partconstraintdef = NULL; + if (!PQgetisnull(result, 0, 2)) partconstraintdef = PQgetvalue(result, 0, 2); /* If there isn't any constraint, show that explicitly */ @@ -1926,7 +1929,7 @@ describeOneTableDetails(const char *schemaname, printfPQExpBuffer(&tmpbuf, _("Partition constraint: %s"), partconstraintdef); printTableAddFooter(&cont, tmpbuf.data); - + } } PQclear(result); } @@ -2258,6 +2261,13 @@ describeOneTableDetails(const char *schemaname, if (tableinfo.hastriggers || tableinfo.relkind == RELKIND_PARTITIONED_TABLE) { + if (pset.sversion >= 100000 && + (tableinfo.ispartition || tableinfo.relkind == RELKIND_PARTITIONED_TABLE)) + { + /* + * Put the constraints defined in this table first, followed + * by the constraints defined in ancestor partitioned tables. + */ printfPQExpBuffer(&buf, "SELECT conrelid = '%s'::pg_catalog.regclass AS sametable,\n" " conname,\n" @@ -2268,6 +2278,21 @@ describeOneTableDetails(const char *schemaname, " WHERE conrelid = relid AND contype = 'f'\n" " ORDER BY sametable DESC, conname;", oid, oid); + + } + else + { + printfPQExpBuffer(&buf, + "SELECT true as sametable, conname,\n" + " pg_catalog.pg_get_constraintdef(r.oid, true) as condef,\n" + " conrelid::pg_catalog.regclass AS ontable\n" + " FROM pg_catalog.pg_constraint r\n" + " WHERE r.conrelid = '%s' AND r.contype = 'f'\n", + oid); + + appendPQExpBuffer(&buf, " ORDER BY conname"); + } + result = PSQLexec(buf.data); if (!result) goto error_return; @@ -2276,6 +2301,11 @@ describeOneTableDetails(const char *schemaname, if (tuples > 0) { + int i_sametable = PQfnumber(result, "sametable"), + i_conname = PQfnumber(result, "conname"), + i_condef = PQfnumber(result, "condef"), + i_ontable = PQfnumber(result, "ontable"); + printTableAddFooter(&cont, _("Foreign-key constraints:")); for (i = 0; i < tuples; i++) { @@ -2284,10 +2314,15 @@ describeOneTableDetails(const char *schemaname, * a "TABLE tab" prefix when the constraint is defined in * a parent partitioned table. 
*/ + if (strcmp(PQgetvalue(result, i, i_sametable), "f") == 0) printfPQExpBuffer(&buf, " TABLE \"%s\" CONSTRAINT \"%s\" %s", - PQgetvalue(result, i, 1), - PQgetvalue(result, i, 2), - PQgetvalue(result, i, 3)); + PQgetvalue(result, i, i_ontable), + PQgetvalue(result, i, i_conname), + PQgetvalue(result, i, i_condef)); + else + printfPQExpBuffer(&buf, " \"%s\" %s", + PQgetvalue(result, i, i_conname), + PQgetvalue(result, i, i_condef)); printTableAddFooter(&cont, buf.data); } diff --git a/src/include/optimizer/prep.h b/src/include/optimizer/prep.h index 99c87a2d..e35330d7 100644 --- a/src/include/optimizer/prep.h +++ b/src/include/optimizer/prep.h @@ -53,13 +53,11 @@ extern RelOptInfo *plan_set_operations(PlannerInfo *root); extern void expand_inherited_tables(PlannerInfo *root); extern Node *adjust_appendrel_attrs(PlannerInfo *root, Node *node, - AppendRelInfo *appinfo); - -extern Node *adjust_appendrel_attrs_nappinfos(PlannerInfo *root, Node *node, int nappinfos, - AppendRelInfo **appinfos); + int nappinfos, AppendRelInfo **appinfos); extern Node *adjust_appendrel_attrs_multilevel(PlannerInfo *root, Node *node, - RelOptInfo *child_rel); + Relids child_relids, + Relids top_parent_relids); extern AppendRelInfo **find_appinfos_by_relids(PlannerInfo *root, Relids relids, int *nappinfos); diff --git a/src/test/regress/expected/alter_table.out b/src/test/regress/expected/alter_table.out index 626f34f7..737710bc 100644 --- a/src/test/regress/expected/alter_table.out +++ b/src/test/regress/expected/alter_table.out @@ -1448,7 +1448,7 @@ copy test("........pg.dropped.1........") to stdout; ERROR: column "........pg.dropped.1........" of relation "test" does not exist copy test from stdin; ERROR: extra data after last expected column -CONTEXT: COPY test, line 1: "10 11 12" +CONTEXT: COPY test, line 1: "10 11 12", nodetype:1(1:cn,0:dn) select * from test order by b; b | c ---+--- @@ -1971,8 +1971,8 @@ ERROR: cannot alter table "tab1" because column "tab2.y" uses its row type create table at_partitioned (a int, b text) partition by range (a); create table at_part_1 partition of at_partitioned for values from (0) to (1000); insert into at_partitioned values (512, '0.123'); -create table at_part_2 (b text, a int); -insert into at_part_2 values ('1.234', 1024); +create table at_part_2 (a int, b text); +insert into at_part_2 values (1024, '1.234'); create index on at_partitioned (b); create index on at_partitioned (a); \d at_part_1 @@ -1990,16 +1990,16 @@ Indexes: Table "public.at_part_2" Column | Type | Collation | Nullable | Default --------+---------+-----------+----------+--------- - b | text | | | a | integer | | | + b | text | | | alter table at_partitioned attach partition at_part_2 for values from (1000) to (2000); \d at_part_2 Table "public.at_part_2" Column | Type | Collation | Nullable | Default --------+---------+-----------+----------+--------- - b | text | | | a | integer | | | + b | text | | | Partition of: at_partitioned FOR VALUES FROM (1000) TO (2000) Indexes: "at_part_2_a_idx" btree (a) @@ -2021,8 +2021,8 @@ Indexes: Table "public.at_part_2" Column | Type | Collation | Nullable | Default --------+---------+-----------+----------+--------- - b | numeric | | | a | integer | | | + b | numeric | | | Partition of: at_partitioned FOR VALUES FROM (1000) TO (2000) Indexes: "at_part_2_a_idx" btree (a) @@ -2992,8 +2992,7 @@ ALTER TABLE new_system_table RENAME TO old_system_table; CREATE INDEX old_system_table__othercol ON old_system_table (othercol); INSERT INTO old_system_table(othercol) VALUES 
('somedata'), ('otherdata'); UPDATE old_system_table SET id = -id; -ERROR: could not plan this distributed update -DETAIL: correlated UPDATE or updating distribution column currently not supported in Postgres-XL. +ERROR: Distributed column or partition column "id" can't be updated in current version DELETE FROM old_system_table WHERE othercol = 'somedata'; TRUNCATE old_system_table; ALTER TABLE old_system_table DROP CONSTRAINT new_system_table_pkey; @@ -3577,8 +3576,8 @@ CREATE OPERATOR CLASS custom_opclass FOR TYPE int4 USING HASH AS OPERATOR 1 = , FUNCTION 2 dummy_hashint4(int4, int8); -- check that the new partition won't overlap with an existing partition CREATE TABLE hash_parted ( - a int, - b int + a int, + b int ) PARTITION BY HASH (a custom_opclass); CREATE TABLE hpart_1 PARTITION OF hash_parted FOR VALUES WITH (MODULUS 4, REMAINDER 0); CREATE TABLE fail_part (LIKE hpart_1); @@ -3599,7 +3598,7 @@ ALTER TABLE hash_parted ATTACH PARTITION hpart_2 FOR VALUES WITH (MODULUS 4, REM -- check that leaf partitions are scanned when attaching a partitioned -- table CREATE TABLE hpart_5 ( - LIKE hash_parted + LIKE hash_parted ) PARTITION BY LIST (b); -- check that violating rows are correctly reported CREATE TABLE hpart_5_a PARTITION OF hpart_5 FOR VALUES IN ('1', '2', '3'); diff --git a/src/test/regress/expected/alter_table_3.out b/src/test/regress/expected/alter_table_3.out index 3287e360..50bc6605 100644 --- a/src/test/regress/expected/alter_table_3.out +++ b/src/test/regress/expected/alter_table_3.out @@ -2931,7 +2931,8 @@ ALTER TABLE new_system_table RENAME TO old_system_table; CREATE INDEX old_system_table__othercol ON old_system_table (othercol); INSERT INTO old_system_table(othercol) VALUES ('somedata'), ('otherdata'); UPDATE old_system_table SET id = -id; -ERROR: Distributed column or partition column "id" can't be updated in current version +ERROR: could not plan this distributed update +DETAIL: correlated UPDATE or updating distribution column currently not supported in Postgres-XL. 
DELETE FROM old_system_table WHERE othercol = 'somedata'; TRUNCATE old_system_table; ALTER TABLE old_system_table DROP CONSTRAINT new_system_table_pkey; diff --git a/src/test/regress/expected/create_table.out b/src/test/regress/expected/create_table.out index 3290fe55..4f679633 100644 --- a/src/test/regress/expected/create_table.out +++ b/src/test/regress/expected/create_table.out @@ -446,6 +446,8 @@ Number of partitions: 0 b | text | | | | extended | | Partition key: RANGE (((a + 1)), substr(b, 1, 5)) Number of partitions: 0 +Distribute By: HASH(a) +Location Nodes: ALL DATANODES INSERT INTO partitioned2 VALUES (1, 'hello'); ERROR: no partition of relation "partitioned2" found for row @@ -459,6 +461,8 @@ CREATE TABLE part2_1 PARTITION OF partitioned2 FOR VALUES FROM (-1, 'aaaaa') TO b | text | | | | extended | | Partition of: partitioned2 FOR VALUES FROM ('-1', 'aaaaa') TO (100, 'ccccc') Partition constraint: (((a + 1) IS NOT NULL) AND (substr(b, 1, 5) IS NOT NULL) AND (((a + 1) > '-1'::integer) OR (((a + 1) = '-1'::integer) AND (substr(b, 1, 5) >= 'aaaaa'::text))) AND (((a + 1) < 100) OR (((a + 1) = 100) AND (substr(b, 1, 5) < 'ccccc'::text)))) +Distribute By: HASH(a) +Location Nodes: ALL DATANODES DROP TABLE partitioned, partitioned2; -- check that dependencies of partition columns are handled correctly @@ -953,5 +957,7 @@ create table boolspart_f partition of boolspart for values in (false); Partition key: LIST (a) Partitions: boolspart_f FOR VALUES IN (false), boolspart_t FOR VALUES IN (true) +Distribute By: HASH(a) +Location Nodes: ALL DATANODES drop table boolspart; diff --git a/src/test/regress/expected/event_trigger.out b/src/test/regress/expected/event_trigger.out index 2537e6f1..6ff64a59 100644 --- a/src/test/regress/expected/event_trigger.out +++ b/src/test/regress/expected/event_trigger.out @@ -285,7 +285,7 @@ CREATE SCHEMA evttrig CREATE TABLE two (col_c INTEGER CHECK (col_c > 0) REFERENCES one DEFAULT 42); -- Partitioned tables with a partitioned index CREATE TABLE evttrig.parted ( - id int PRIMARY KEY) + id int) PARTITION BY RANGE (id); CREATE TABLE evttrig.part_1_10 PARTITION OF evttrig.parted (id) FOR VALUES FROM (1) TO (10); @@ -304,11 +304,6 @@ NOTICE: drop cascades to 3 other objects DETAIL: drop cascades to table evttrig.one drop cascades to table evttrig.two drop cascades to table evttrig.parted -NOTICE: NORMAL: orig=f normal=t istemp=f type=table identity=evttrig.parted name={evttrig,parted} args={} -NOTICE: NORMAL: orig=f normal=t istemp=f type=table identity=evttrig.part_1_10 name={evttrig,part_1_10} args={} -NOTICE: NORMAL: orig=f normal=t istemp=f type=table identity=evttrig.part_10_20 name={evttrig,part_10_20} args={} -NOTICE: NORMAL: orig=f normal=t istemp=f type=table identity=evttrig.part_10_15 name={evttrig,part_10_15} args={} -NOTICE: NORMAL: orig=f normal=t istemp=f type=table identity=evttrig.part_15_20 name={evttrig,part_15_20} args={} DROP TABLE a_temp_tbl; DROP EVENT TRIGGER regress_event_trigger_report_dropped; ERROR: event trigger "regress_event_trigger_report_dropped" does not exist diff --git a/src/test/regress/expected/foreign_data.out b/src/test/regress/expected/foreign_data.out index a5326254..dec9af7f 100644 --- a/src/test/regress/expected/foreign_data.out +++ b/src/test/regress/expected/foreign_data.out @@ -1491,6 +1491,7 @@ ERROR: server "s0" does not exist c2 | text | | | | extended | | c3 | date | | | | plain | | Partition key: LIST (c1) +Number of partitions: 0 Distribute By: HASH(c1) Location Nodes: ALL DATANODES @@ -1540,6 +1541,7 @@ 
ERROR: relation "pt2_1" does not exist c2 | text | | | | extended | | c3 | date | | | | plain | | Partition key: LIST (c1) +Number of partitions: 0 Distribute By: HASH(c1) Location Nodes: ALL DATANODES @@ -1560,6 +1562,7 @@ ERROR: relation "pt2_1" does not exist c2 | text | | | | extended | | c3 | date | | | | plain | | Partition key: LIST (c1) +Number of partitions: 0 Distribute By: HASH(c1) Location Nodes: ALL DATANODES diff --git a/src/test/regress/expected/foreign_key_2.out b/src/test/regress/expected/foreign_key_2.out index e3b7210b..c64ced2f 100644 --- a/src/test/regress/expected/foreign_key_2.out +++ b/src/test/regress/expected/foreign_key_2.out @@ -1442,26 +1442,39 @@ drop table pktable2, fktable2; -- Ensure that works. CREATE TABLE fk_notpartitioned_pk (a INT, PRIMARY KEY(a), CHECK (a > 0)); CREATE TABLE fk_partitioned_fk (a INT REFERENCES fk_notpartitioned_pk(a) PRIMARY KEY) PARTITION BY RANGE(a); +ERROR: foreign key constraints are not supported on partitioned tables +LINE 1: CREATE TABLE fk_partitioned_fk (a INT REFERENCES fk_notparti... + ^ CREATE TABLE fk_partitioned_fk_1 PARTITION OF fk_partitioned_fk FOR VALUES FROM (MINVALUE) TO (MAXVALUE); +ERROR: relation "fk_partitioned_fk" does not exist INSERT INTO fk_notpartitioned_pk VALUES (1); INSERT INTO fk_partitioned_fk VALUES (1); +ERROR: relation "fk_partitioned_fk" does not exist +LINE 1: INSERT INTO fk_partitioned_fk VALUES (1); + ^ ALTER TABLE fk_notpartitioned_pk ALTER COLUMN a TYPE bigint; DELETE FROM fk_notpartitioned_pk WHERE a = 1; -ERROR: update or delete on table "fk_notpartitioned_pk" violates foreign key constraint "fk_partitioned_fk_a_fkey" on table "fk_partitioned_fk" -DETAIL: Key (a)=(1) is still referenced from table "fk_partitioned_fk". DROP TABLE fk_notpartitioned_pk, fk_partitioned_fk; +ERROR: table "fk_partitioned_fk" does not exist -- ensure we check partitions are "not used" when dropping constraints CREATE SCHEMA fkpart8 CREATE TABLE tbl1(f1 int PRIMARY KEY) CREATE TABLE tbl2(f1 int REFERENCES tbl1 DEFERRABLE INITIALLY DEFERRED) PARTITION BY RANGE(f1) CREATE TABLE tbl2_p1 PARTITION OF tbl2 FOR VALUES FROM (minvalue) TO (maxvalue); +ERROR: foreign key constraints are not supported on partitioned tables +LINE 3: CREATE TABLE tbl2(f1 int REFERENCES tbl1 DEFERRABLE INITIA... 
+ ^ INSERT INTO fkpart8.tbl1 VALUES(1); +ERROR: relation "fkpart8.tbl1" does not exist +LINE 1: INSERT INTO fkpart8.tbl1 VALUES(1); + ^ BEGIN; INSERT INTO fkpart8.tbl2 VALUES(1); +ERROR: relation "fkpart8.tbl2" does not exist +LINE 1: INSERT INTO fkpart8.tbl2 VALUES(1); + ^ ALTER TABLE fkpart8.tbl2 DROP CONSTRAINT tbl2_f1_fkey; -ERROR: cannot ALTER TABLE "tbl2_p1" because it has pending trigger events +ERROR: current transaction is aborted, commands ignored until end of transaction block COMMIT; DROP SCHEMA fkpart8 CASCADE; -NOTICE: drop cascades to 2 other objects -DETAIL: drop cascades to table fkpart8.tbl1 -drop cascades to table fkpart8.tbl2 +ERROR: schema "fkpart8" does not exist diff --git a/src/test/regress/expected/identity_1.out b/src/test/regress/expected/identity_1.out index facf2230..e07bcfa2 100644 --- a/src/test/regress/expected/identity_1.out +++ b/src/test/regress/expected/identity_1.out @@ -322,3 +322,15 @@ SELECT * FROM itest8; RESET ROLE; DROP TABLE itest8; DROP USER regress_user1; +-- typed tables (currently not supported) +CREATE TYPE itest_type AS (f1 integer, f2 text, f3 bigint); +CREATE TABLE itest12 OF itest_type (f1 WITH OPTIONS GENERATED ALWAYS AS IDENTITY); -- error +ERROR: identity colums are not supported on typed tables +DROP TYPE itest_type CASCADE; +-- table partitions (currently not supported) +CREATE TABLE itest_parent (f1 date NOT NULL, f2 text, f3 bigint) PARTITION BY RANGE (f1); +CREATE TABLE itest_child PARTITION OF itest_parent ( + f3 WITH OPTIONS GENERATED ALWAYS AS IDENTITY +) FOR VALUES FROM ('2016-07-01') TO ('2016-08-01'); -- error +ERROR: identify columns are not supported on partitions +DROP TABLE itest_parent; diff --git a/src/test/regress/expected/indexing.out b/src/test/regress/expected/indexing.out index d4326a87..f996a88d 100644 --- a/src/test/regress/expected/indexing.out +++ b/src/test/regress/expected/indexing.out @@ -29,7 +29,8 @@ create table idxpart1 partition of idxpart for values from (0) to (10); create unique index on idxpart (a); ERROR: cannot create unique index on partitioned table "idxpart" create index concurrently on idxpart (a); -ERROR: cannot create index on partitioned table "idxpart" concurrently +ERROR: PGXC does not support concurrent INDEX yet +DETAIL: The feature is not currently supported drop table idxpart; -- Verify bugfix with index rewrite on ALTER TABLE / SET DATA TYPE -- https://postgr.es/m/CAKcux6mxNCGsgATwf5CGMF8g4WSupCXicCVMeKUTuWbyxHOMsQ@mail.gmail.com @@ -67,22 +68,18 @@ Indexes: "idxpart1_b_c_idx" btree (b, c) \d+ idxpart1_a_idx - Index "public.idxpart1_a_idx" - Column | Type | Key? | Definition | Storage | Stats target ---------+---------+------+------------+---------+-------------- - a | integer | yes | a | plain | -Partition of: idxparti -No partition constraint + Index "public.idxpart1_a_idx" + Column | Type | Definition | Storage +--------+---------+------------+--------- + a | integer | a | plain btree, for table "public.idxpart1" \d+ idxpart1_b_c_idx - Index "public.idxpart1_b_c_idx" - Column | Type | Key? 
| Definition | Storage | Stats target ---------+---------+------+------------+----------+-------------- - b | integer | yes | b | plain | - c | text | yes | c | extended | -Partition of: idxparti2 -No partition constraint + Index "public.idxpart1_b_c_idx" + Column | Type | Definition | Storage +--------+---------+------------+---------- + b | integer | b | plain + c | text | c | extended btree, for table "public.idxpart1" drop table idxpart; @@ -574,10 +571,8 @@ ERROR: cannot attach index "idxpart2_a_idx" as a partition of index "idxpart_a_ DETAIL: The index definitions do not match. drop table idxpart; -- Verify that attaching indexes maps attribute numbers correctly -create table idxpart (col1 int, a int, col2 int, b int) partition by range (a); -create table idxpart1 (b int, col1 int, col2 int, col3 int, a int); -alter table idxpart drop column col1, drop column col2; -alter table idxpart1 drop column col1, drop column col2, drop column col3; +create table idxpart (a int, b int) partition by range (a); +create table idxpart1 (a int, b int); alter table idxpart attach partition idxpart1 for values from (0) to (1000); create index idxpart_1_idx on only idxpart (b, a); create index idxpart1_1_idx on idxpart1 (b, a); @@ -618,9 +613,9 @@ drop table idxpart; create table idxpart (a int, b int, c text) partition by range (a); create index idxparti on idxpart (a); create index idxparti2 on idxpart (c, b); -create table idxpart1 (c text, a int, b int); +create table idxpart1 (a int, b int, c text); alter table idxpart attach partition idxpart1 for values from (0) to (10); -create table idxpart2 (c text, a int, b int); +create table idxpart2 (a int, b int, c text); create index on idxpart2 (a); create index on idxpart2 (c, b); alter table idxpart attach partition idxpart2 for values from (10) to (20); @@ -640,12 +635,9 @@ select c.relname, pg_get_indexdef(indexrelid) drop table idxpart; -- Verify that columns are mapped correctly in expression indexes -create table idxpart (col1 int, col2 int, a int, b int) partition by range (a); -create table idxpart1 (col2 int, b int, col1 int, a int); -create table idxpart2 (col1 int, col2 int, b int, a int); -alter table idxpart drop column col1, drop column col2; -alter table idxpart1 drop column col1, drop column col2; -alter table idxpart2 drop column col1, drop column col2; +create table idxpart (a int, b int) partition by range (a); +create table idxpart1 (a int, b int); +create table idxpart2 (a int, b int); create index on idxpart2 (abs(b)); alter table idxpart attach partition idxpart2 for values from (0) to (1); create index on idxpart (abs(b)); @@ -663,14 +655,11 @@ select c.relname, pg_get_indexdef(indexrelid) drop table idxpart; -- Verify that columns are mapped correctly for WHERE in a partial index -create table idxpart (col1 int, a int, col3 int, b int) partition by range (a); -alter table idxpart drop column col1, drop column col3; -create table idxpart1 (col1 int, col2 int, col3 int, col4 int, b int, a int); -alter table idxpart1 drop column col1, drop column col2, drop column col3, drop column col4; +create table idxpart (a int, b int) partition by range (a); +create table idxpart1 (a int, b int); alter table idxpart attach partition idxpart1 for values from (0) to (1000); -create table idxpart2 (col1 int, col2 int, b int, a int); +create table idxpart2 (a int, b int); create index on idxpart2 (a) where b > 1000; -alter table idxpart2 drop column col1, drop column col2; alter table idxpart attach partition idxpart2 for values from (1000) to 
(2000); create index on idxpart (a) where b > 1000; select c.relname, pg_get_indexdef(indexrelid) @@ -686,7 +675,7 @@ select c.relname, pg_get_indexdef(indexrelid) drop table idxpart; -- Column number mapping: dropped columns in the partition -create table idxpart1 (drop_1 int, drop_2 int, col_keep int, drop_3 int); +create table idxpart1 (col_keep int, drop_1 int, drop_2 int, drop_3 int); alter table idxpart1 drop column drop_1; alter table idxpart1 drop column drop_2; alter table idxpart1 drop column drop_3; @@ -718,9 +707,9 @@ select attrelid::regclass, attname, attnum from pg_attribute order by attrelid::regclass, attnum; attrelid | attname | attnum -----------------------+------------------------------+-------- - idxpart1 | ........pg.dropped.1........ | 1 + idxpart1 | col_keep | 1 idxpart1 | ........pg.dropped.2........ | 2 - idxpart1 | col_keep | 3 + idxpart1 | ........pg.dropped.3........ | 3 idxpart1 | ........pg.dropped.4........ | 4 idxpart1_col_keep_idx | col_keep | 1 idxpart | col_keep | 1 @@ -729,7 +718,7 @@ select attrelid::regclass, attname, attnum from pg_attribute drop table idxpart; -- Column number mapping: dropped columns in the parent table -create table idxpart(drop_1 int, drop_2 int, col_keep int, drop_3 int) partition by range (col_keep); +create table idxpart(col_keep int, drop_1 int, drop_2 int, drop_3 int) partition by range (col_keep); alter table idxpart drop column drop_1; alter table idxpart drop column drop_2; alter table idxpart drop column drop_3; @@ -761,9 +750,9 @@ select attrelid::regclass, attname, attnum from pg_attribute order by attrelid::regclass, attnum; attrelid | attname | attnum -----------------------+------------------------------+-------- - idxpart | ........pg.dropped.1........ | 1 + idxpart | col_keep | 1 idxpart | ........pg.dropped.2........ | 2 - idxpart | col_keep | 3 + idxpart | ........pg.dropped.3........ | 3 idxpart | ........pg.dropped.4........ 
| 4 idxpart1 | col_keep | 1 idxpart1_col_keep_idx | col_keep | 1 diff --git a/src/test/regress/expected/inherit_2.out b/src/test/regress/expected/inherit_2.out index 65ff71fe..b6a72418 100644 --- a/src/test/regress/expected/inherit_2.out +++ b/src/test/regress/expected/inherit_2.out @@ -866,6 +866,28 @@ select tableoid::regclass::text as relname, parted_tab.* from parted_tab order b (3 rows) drop table parted_tab; +-- Check UPDATE with multi-level partitioned inherited target +create table mlparted_tab (a int, b char, c text) partition by list (a); +create table mlparted_tab_part1 partition of mlparted_tab for values in (1); +create table mlparted_tab_part2 partition of mlparted_tab for values in (2) partition by list (b); +create table mlparted_tab_part3 partition of mlparted_tab for values in (3); +create table mlparted_tab_part2a partition of mlparted_tab_part2 for values in ('a'); +create table mlparted_tab_part2b partition of mlparted_tab_part2 for values in ('b'); +insert into mlparted_tab values (1, 'a'), (2, 'a'), (2, 'b'), (3, 'a'); +update mlparted_tab mlp set c = 'xxx' +from + (select a from some_tab union all select a+1 from some_tab) ss (a) +where (mlp.a = ss.a and mlp.b = 'b') or mlp.a = 3; +select tableoid::regclass::text as relname, mlparted_tab.* from mlparted_tab order by 1,2; + relname | a | b | c +---------------------+---+---+----- + mlparted_tab_part1 | 1 | a | + mlparted_tab_part2a | 2 | a | + mlparted_tab_part2b | 2 | b | xxx + mlparted_tab_part3 | 3 | a | xxx +(4 rows) + +drop table mlparted_tab; drop table some_tab cascade; NOTICE: drop cascades to table some_tab_child /* Test multiple inheritance of column defaults */ @@ -983,6 +1005,8 @@ NOTICE: drop cascades to table c1 -- tables. See the pgsql-hackers thread beginning Dec. 4/04 create table base (i integer); create table derived () inherits (base); +create table more_derived (like derived, b int) inherits (derived); +NOTICE: merging column "i" with inherited definition insert into derived (i) values (0); select derived::base from derived; derived diff --git a/src/test/regress/expected/inherit_3.out b/src/test/regress/expected/inherit_3.out index 707a6f63..251ee257 100644 --- a/src/test/regress/expected/inherit_3.out +++ b/src/test/regress/expected/inherit_3.out @@ -864,6 +864,30 @@ select tableoid::regclass::text as relname, parted_tab.* from parted_tab order b (3 rows) drop table parted_tab; +-- Check UPDATE with multi-level partitioned inherited target +create table mlparted_tab (a int, b char, c text) partition by list (a); +create table mlparted_tab_part1 partition of mlparted_tab for values in (1); +create table mlparted_tab_part2 partition of mlparted_tab for values in (2) partition by list (b); +create table mlparted_tab_part3 partition of mlparted_tab for values in (3); +create table mlparted_tab_part2a partition of mlparted_tab_part2 for values in ('a'); +create table mlparted_tab_part2b partition of mlparted_tab_part2 for values in ('b'); +insert into mlparted_tab values (1, 'a'), (2, 'a'), (2, 'b'), (3, 'a'); +update mlparted_tab mlp set c = 'xxx' +from + (select a from some_tab union all select a+1 from some_tab) ss (a) +where (mlp.a = ss.a and mlp.b = 'b') or mlp.a = 3; +ERROR: could not plan this distributed update +DETAIL: correlated UPDATE or updating distribution column currently not supported in Postgres-XL. 
+select tableoid::regclass::text as relname, mlparted_tab.* from mlparted_tab order by 1,2; + relname | a | b | c +---------------------+---+---+--- + mlparted_tab_part1 | 1 | a | + mlparted_tab_part2a | 2 | a | + mlparted_tab_part2b | 2 | b | + mlparted_tab_part3 | 3 | a | +(4 rows) + +drop table mlparted_tab; drop table some_tab cascade; NOTICE: drop cascades to table some_tab_child /* Test multiple inheritance of column defaults */ @@ -981,6 +1005,8 @@ NOTICE: drop cascades to table c1 -- tables. See the pgsql-hackers thread beginning Dec. 4/04 create table base (i integer); create table derived () inherits (base); +create table more_derived (like derived, b int) inherits (derived); +NOTICE: merging column "i" with inherited definition insert into derived (i) values (0); select derived::base from derived; derived @@ -997,16 +1023,20 @@ select NULL::derived::base; -- remove redundant conversions. explain (verbose on, costs off) select row(i, b)::more_derived::derived::base from more_derived; QUERY PLAN -------------------------------------------- - Seq Scan on public.more_derived +---------------------------------------------------------------------------------------------- + Remote Fast Query Execution + Output: ((ROW(more_derived.i, more_derived.b)::more_derived)::derived)::base + Node/s: datanode_1, datanode_2 + Remote query: SELECT ((ROW(i, b)::more_derived)::derived)::base AS "row" FROM more_derived + -> Seq Scan on public.more_derived Output: (ROW(i, b)::more_derived)::base -(2 rows) +(6 rows) explain (verbose on, costs off) select (1, 2)::more_derived::derived::base; QUERY PLAN ------------------------ +------------------------------------------- Result - Output: '(1)'::base + Output: (ROW(1, 2)::more_derived)::base (2 rows) drop table more_derived; @@ -1997,19 +2027,17 @@ explain (costs off) select * from list_parted where a in ('ab', 'cd', 'ef'); Filter: ((a)::text = ANY ('{ab,cd,ef}'::text[])) (7 rows) -explain (costs off) select * from list_parted where a = 'ab' or a in (null, 'cd'); +explain (costs off) select * from list_parted where a = 'ab' or a is null or a ='cd'; QUERY PLAN --------------------------------------------------------------------------------------------- Remote Fast Query Execution Node/s: datanode_1, datanode_2 -> Append -> Seq Scan on part_ab_cd - Filter: (((a)::text = 'ab'::text) OR ((a)::text = ANY ('{NULL,cd}'::text[]))) - -> Seq Scan on part_ef_gh - Filter: (((a)::text = 'ab'::text) OR ((a)::text = ANY ('{NULL,cd}'::text[]))) + Filter: (((a)::text = 'ab'::text) OR (a IS NULL) OR ((a)::text = 'cd'::text)) -> Seq Scan on part_null_xy - Filter: (((a)::text = 'ab'::text) OR ((a)::text = ANY ('{NULL,cd}'::text[]))) -(9 rows) + Filter: (((a)::text = 'ab'::text) OR (a IS NULL) OR ((a)::text = 'cd'::text)) +(7 rows) explain (costs off) select * from list_parted where a = 'ab'; QUERY PLAN @@ -2172,13 +2200,15 @@ create table mcrparted4 partition of mcrparted for values from (20, 10, 10) to ( create table mcrparted5 partition of mcrparted for values from (20, 20, 20) to (maxvalue, maxvalue, maxvalue); explain (costs off) select * from mcrparted where a = 0; -- scans mcrparted0, mcrparted_def QUERY PLAN ------------------------------------- +--------------------------------------- Remote Fast Query Execution Node/s: datanode_2 -> Append -> Seq Scan on mcrparted0 Filter: (a = 0) -(5 rows) + -> Seq Scan on mcrparted_def + Filter: (a = 0) +(7 rows) explain (costs off) select * from mcrparted where a = 10 and abs(b) < 5; -- scans mcrparted1, mcrparted_def QUERY PLAN 
@@ -2188,7 +2218,9 @@ explain (costs off) select * from mcrparted where a = 10 and abs(b) < 5; -- scan -> Append -> Seq Scan on mcrparted1 Filter: ((a = 10) AND (abs(b) < 5)) -(5 rows) + -> Seq Scan on mcrparted_def + Filter: ((a = 10) AND (abs(b) < 5)) +(7 rows) explain (costs off) select * from mcrparted where a = 10 and abs(b) = 5; -- scans mcrparted1, mcrparted2, mcrparted_def QUERY PLAN @@ -2200,11 +2232,13 @@ explain (costs off) select * from mcrparted where a = 10 and abs(b) = 5; -- scan Filter: ((a = 10) AND (abs(b) = 5)) -> Seq Scan on mcrparted2 Filter: ((a = 10) AND (abs(b) = 5)) -(7 rows) + -> Seq Scan on mcrparted_def + Filter: ((a = 10) AND (abs(b) = 5)) +(9 rows) explain (costs off) select * from mcrparted where abs(b) = 5; -- scans all partitions QUERY PLAN ------------------------------------- +--------------------------------------- Remote Fast Query Execution Node/s: datanode_1, datanode_2 -> Append @@ -2220,7 +2254,9 @@ explain (costs off) select * from mcrparted where abs(b) = 5; -- scans all parti Filter: (abs(b) = 5) -> Seq Scan on mcrparted5 Filter: (abs(b) = 5) -(15 rows) + -> Seq Scan on mcrparted_def + Filter: (abs(b) = 5) +(17 rows) explain (costs off) select * from mcrparted where a > -1; -- scans all partitions QUERY PLAN @@ -2240,7 +2276,9 @@ explain (costs off) select * from mcrparted where a > -1; -- scans all partition Filter: (a > '-1'::integer) -> Seq Scan on mcrparted5 Filter: (a > '-1'::integer) -(15 rows) + -> Seq Scan on mcrparted_def + Filter: (a > '-1'::integer) +(17 rows) explain (costs off) select * from mcrparted where a = 20 and abs(b) = 10 and c > 10; -- scans mcrparted4 QUERY PLAN @@ -2264,7 +2302,9 @@ explain (costs off) select * from mcrparted where a = 20 and c > 20; -- scans mc Filter: ((c > 20) AND (a = 20)) -> Seq Scan on mcrparted5 Filter: ((c > 20) AND (a = 20)) -(9 rows) + -> Seq Scan on mcrparted_def + Filter: ((c > 20) AND (a = 20)) +(11 rows) drop table mcrparted; -- check that partitioned table Appends cope with being referenced in diff --git a/src/test/regress/expected/insert.out b/src/test/regress/expected/insert.out index e1a74c4a..d12e3494 100644 --- a/src/test/regress/expected/insert.out +++ b/src/test/regress/expected/insert.out @@ -268,17 +268,17 @@ insert into part_default_p2 values ('de', 35); insert into list_parted values ('ab', 21); insert into list_parted values ('xx', 1); insert into list_parted values ('yy', 2); -select tableoid::regclass, * from list_parted; +select tableoid::regclass, * from list_parted order by 1,2,3; tableoid | a | b --------------------+----+---- part_cc_dd | cC | 1 + part_null | | 0 part_ee_ff1 | ff | 1 part_ee_ff2 | ff | 11 part_xx_yy_p1 | xx | 1 part_xx_yy_defpart | yy | 2 - part_null | | 0 - part_default_p1 | cd | 25 part_default_p1 | ab | 21 + part_default_p1 | cd | 25 part_default_p2 | de | 35 (9 rows) @@ -322,11 +322,11 @@ select tableoid::regclass, * from range_parted order by 1, 2, 3; part3 | b | 1 part4 | b | 10 part4 | b | 10 - part_def | c | 10 - part_def | | part_def | a | - part_def | | 19 part_def | b | 20 + part_def | c | 10 + part_def | | 19 + part_def | | (11 rows) -- ok @@ -342,21 +342,21 @@ DETAIL: Partition key of the failing row contains (b) = (0). 
-- ok insert into list_parted values ('EE', 1); insert into part_ee_ff values ('EE', 10); -select tableoid::regclass, * from list_parted; +select tableoid::regclass, * from list_parted order by 1,2,3; tableoid | a | b --------------------+----+---- part_aa_bb | aA | part_cc_dd | cC | 1 - part_ee_ff1 | ff | 1 + part_null | | 0 + part_null | | 1 part_ee_ff1 | EE | 1 - part_ee_ff2 | ff | 11 + part_ee_ff1 | ff | 1 part_ee_ff2 | EE | 10 + part_ee_ff2 | ff | 11 part_xx_yy_p1 | xx | 1 part_xx_yy_defpart | yy | 2 - part_null | | 0 - part_null | | 1 - part_default_p1 | cd | 25 part_default_p1 | ab | 21 + part_default_p1 | cd | 25 part_default_p2 | de | 35 (13 rows) @@ -413,7 +413,7 @@ DETAIL: Failing row contains (11). insert into hpart3 values(11); -- view data select tableoid::regclass as part, a, a%4 as "remainder = a % 4" -from hash_parted order by part; +from hash_parted order by part,a; part | a | remainder = a % 4 --------+----+------------------- hpart0 | 4 | 0 @@ -447,6 +447,8 @@ Partitions: part_aa_bb FOR VALUES IN ('aa', 'bb'), part_null FOR VALUES IN (NULL), part_xx_yy FOR VALUES IN ('xx', 'yy'), PARTITIONED, part_default DEFAULT, PARTITIONED +Distribute By: HASH(a) +Location Nodes: ALL DATANODES -- cleanup drop table range_parted, list_parted; @@ -464,6 +466,8 @@ create table part_default partition of list_parted default; a | integer | | | | plain | | Partition of: list_parted DEFAULT No partition constraint +Distribute By: HASH(a) +Location Nodes: ALL DATANODES insert into part_default values (null); insert into part_default values (1); @@ -639,7 +643,7 @@ DETAIL: Failing row contains (34, 50, null). -- ok create table mlparted_defd partition of mlparted_def default; insert into mlparted values (70, 100); -select tableoid::regclass, * from mlparted_def; +select tableoid::regclass, * from mlparted_def order by 1; tableoid | a | b | c ---------------+----+-----+--- mlparted_def1 | 40 | 100 | @@ -785,17 +789,27 @@ create table donothingbrtrig_test (a int, b text) partition by list (a); create table donothingbrtrig_test1 (b text, a int); create table donothingbrtrig_test2 (c text, b text, a int); alter table donothingbrtrig_test2 drop column c; +ERROR: Distribution column cannot be dropped create or replace function donothingbrtrig_func() returns trigger as $$begin raise notice 'b: %', new.b; return NULL; end$$ language plpgsql; create trigger donothingbrtrig1 before insert on donothingbrtrig_test1 for each row execute procedure donothingbrtrig_func(); +ERROR: Postgres-XL does not support TRIGGER yet +DETAIL: The feature is not currently supported create trigger donothingbrtrig2 before insert on donothingbrtrig_test2 for each row execute procedure donothingbrtrig_func(); +ERROR: Postgres-XL does not support TRIGGER yet +DETAIL: The feature is not currently supported alter table donothingbrtrig_test attach partition donothingbrtrig_test1 for values in (1); +ERROR: table "donothingbrtrig_test1" contains column "a" at position 2, but parent "donothingbrtrig_test" has it at position 1 +DETAIL: Postgres-XL requires attribute positions to match +HINT: Check for column ordering and dropped columns, if any alter table donothingbrtrig_test attach partition donothingbrtrig_test2 for values in (2); +ERROR: table "donothingbrtrig_test2" contains column "c" not found in parent "donothingbrtrig_test" +DETAIL: New partition should contain only the columns present in parent. 
insert into donothingbrtrig_test values (1, 'foo'), (2, 'bar'); -NOTICE: b: foo -NOTICE: b: bar +ERROR: no partition of relation "donothingbrtrig_test" found for row +DETAIL: Partition key of the failing row contains (a) = (1). copy donothingbrtrig_test from stdout; -NOTICE: b: baz -NOTICE: b: qux +ERROR: no partition of relation "donothingbrtrig_test" found for row +DETAIL: Partition key of the failing row contains (a) = (1). select tableoid::regclass, * from donothingbrtrig_test; tableoid | a | b ----------+---+--- diff --git a/src/test/regress/expected/insert_conflict_1.out b/src/test/regress/expected/insert_conflict_1.out index 1a544406..40048bfb 100644 --- a/src/test/regress/expected/insert_conflict_1.out +++ b/src/test/regress/expected/insert_conflict_1.out @@ -830,3 +830,24 @@ select * from selfconflict order by 1; (3 rows) drop table selfconflict; +-- check that the following works: +-- insert into partitioned_table on conflict do nothing +create table parted_conflict_test (a int, b char) partition by list (a); +create table parted_conflict_test_1 partition of parted_conflict_test (b unique) for values in (1); +ERROR: Unique index of partitioned table must contain the hash/modulo distribution column. +insert into parted_conflict_test values (1, 'a') on conflict do nothing; +ERROR: no partition of relation "parted_conflict_test" found for row +DETAIL: Partition key of the failing row contains (a) = (1). +insert into parted_conflict_test values (1, 'a') on conflict do nothing; +ERROR: no partition of relation "parted_conflict_test" found for row +DETAIL: Partition key of the failing row contains (a) = (1). +-- however, on conflict do update is not supported yet +insert into parted_conflict_test values (1) on conflict (b) do update set a = excluded.a; +ERROR: Distributed column or partition column "a" can't be updated in current version +-- but it works OK if we target the partition directly +insert into parted_conflict_test_1 values (1) on conflict (b) do +update set a = excluded.a; +ERROR: relation "parted_conflict_test_1" does not exist +LINE 1: insert into parted_conflict_test_1 values (1) on conflict (b... + ^ +drop table parted_conflict_test; diff --git a/src/test/regress/expected/partition_info.out b/src/test/regress/expected/partition_info.out index c26d02a5..d26fb257 100644 --- a/src/test/regress/expected/partition_info.out +++ b/src/test/regress/expected/partition_info.out @@ -8,8 +8,8 @@ SELECT * FROM pg_partition_tree(NULL); SELECT * FROM pg_partition_tree(0); relid | parentrelid | isleaf | level --------+-------------+--------+------- -(0 row) +-------+-------------+--------+------- +(0 rows) SELECT pg_partition_root(NULL); pg_partition_root @@ -45,7 +45,7 @@ CREATE TABLE ptif_test2 PARTITION OF ptif_test FOR VALUES FROM (100) TO (200); -- This partitioned table should remain with no partitions. 
CREATE TABLE ptif_test3 PARTITION OF ptif_test - FOR VALUES FROM (200) TO (maxvalue) PARTITION BY list (b); + FOR VALUES FROM (200) TO (maxvalue) PARTITION BY list (b); -- Test pg_partition_root for tables SELECT pg_partition_root('ptif_test'); pg_partition_root @@ -98,12 +98,6 @@ SELECT pg_partition_root('ptif_test0_index'); ptif_test_index (1 row) -SELECT pg_partition_root('ptif_test01_index'); - pg_partition_root -------------------- - ptif_test_index -(1 row) - SELECT pg_partition_root('ptif_test3_index'); pg_partition_root ------------------- @@ -182,78 +176,6 @@ SELECT * FROM pg_partition_ancestors('ptif_test'); ptif_test (1 row) --- List all indexes members of the tree -SELECT relid, parentrelid, level, isleaf - FROM pg_partition_tree('ptif_test_index'); - relid | parentrelid | level | isleaf --------------------+------------------+-------+-------- - ptif_test_index | | 0 | f - ptif_test0_index | ptif_test_index | 1 | f - ptif_test1_index | ptif_test_index | 1 | f - ptif_test2_index | ptif_test_index | 1 | t - ptif_test3_index | ptif_test_index | 1 | f - ptif_test01_index | ptif_test0_index | 2 | t - ptif_test11_index | ptif_test1_index | 2 | t -(7 rows) - --- List indexes from an intermediate level -SELECT relid, parentrelid, level, isleaf - FROM pg_partition_tree('ptif_test0_index') p - JOIN pg_class c ON (p.relid = c.oid); - relid | parentrelid | level | isleaf --------------------+------------------+-------+-------- - ptif_test0_index | ptif_test_index | 0 | f - ptif_test01_index | ptif_test0_index | 1 | t -(2 rows) - --- List from leaf index -SELECT relid, parentrelid, level, isleaf - FROM pg_partition_tree('ptif_test01_index') p - JOIN pg_class c ON (p.relid = c.oid); - relid | parentrelid | level | isleaf --------------------+------------------+-------+-------- - ptif_test01_index | ptif_test0_index | 0 | t -(1 row) - --- List from partitioned index with no partitions -SELECT relid, parentrelid, level, isleaf - FROM pg_partition_tree('ptif_test3_index') p - JOIN pg_class c ON (p.relid = c.oid); - relid | parentrelid | level | isleaf -------------------+-----------------+-------+-------- - ptif_test3_index | ptif_test_index | 0 | f -(1 row) - --- List all members using pg_partition_root with leaf index reference -SELECT relid, parentrelid, level, isleaf - FROM pg_partition_tree(pg_partition_root('ptif_test01_index')) p - JOIN pg_class c ON (p.relid = c.oid); - relid | parentrelid | level | isleaf --------------------+------------------+-------+-------- - ptif_test_index | | 0 | f - ptif_test0_index | ptif_test_index | 1 | f - ptif_test1_index | ptif_test_index | 1 | f - ptif_test2_index | ptif_test_index | 1 | t - ptif_test3_index | ptif_test_index | 1 | f - ptif_test01_index | ptif_test0_index | 2 | t - ptif_test11_index | ptif_test1_index | 2 | t -(7 rows) - --- List all ancestors of root and leaf indexes -SELECT * FROM pg_partition_ancestors('ptif_test01_index'); - relid -------------------- - ptif_test01_index - ptif_test0_index - ptif_test_index -(3 rows) - -SELECT * FROM pg_partition_ancestors('ptif_test_index'); - relid ------------------ - ptif_test_index -(1 row) - DROP TABLE ptif_test; -- A table not part of a partition tree works is not listed. 
CREATE TABLE ptif_normal_table(a int); @@ -266,7 +188,7 @@ SELECT relid, parentrelid, level, isleaf SELECT pg_partition_root('ptif_normal_table'); pg_partition_root ------------------- - + (1 row) SELECT * FROM pg_partition_ancestors('ptif_normal_table'); @@ -280,13 +202,13 @@ CREATE VIEW ptif_test_view AS SELECT 1; CREATE MATERIALIZED VIEW ptif_test_matview AS SELECT 1; SELECT * FROM pg_partition_tree('ptif_test_view'); relid | parentrelid | isleaf | level --------+-------------+--------+------- -(0 row) +-------+-------------+--------+------- +(0 rows) SELECT * FROM pg_partition_tree('ptif_test_matview'); relid | parentrelid | isleaf | level --------+-------------+--------+------- -(0 row) +-------+-------------+--------+------- +(0 rows) SELECT pg_partition_root('ptif_test_view'); pg_partition_root diff --git a/src/test/regress/expected/partition_join_1.out b/src/test/regress/expected/partition_join_1.out new file mode 100644 index 00000000..83d35561 --- /dev/null +++ b/src/test/regress/expected/partition_join_1.out @@ -0,0 +1,2102 @@ +-- +-- PARTITION_JOIN +-- Test partition-wise join between partitioned tables +-- +-- Enable partition-wise join, which by default is disabled. +SET enable_partition_wise_join to true; +-- +-- partitioned by a single column +-- +CREATE TABLE prt1 (a int, b int, c varchar) PARTITION BY RANGE(a); +CREATE TABLE prt1_p1 PARTITION OF prt1 FOR VALUES FROM (0) TO (250); +CREATE TABLE prt1_p3 PARTITION OF prt1 FOR VALUES FROM (500) TO (600); +CREATE TABLE prt1_p2 PARTITION OF prt1 FOR VALUES FROM (250) TO (500); +INSERT INTO prt1 SELECT i, i % 25, to_char(i, 'FM0000') FROM generate_series(0, 599) i WHERE i % 2 = 0; +CREATE INDEX iprt1_p1_a on prt1_p1(a); +CREATE INDEX iprt1_p2_a on prt1_p2(a); +CREATE INDEX iprt1_p3_a on prt1_p3(a); +ANALYZE prt1; +CREATE TABLE prt2 (a int, b int, c varchar) PARTITION BY RANGE(b); +CREATE TABLE prt2_p1 PARTITION OF prt2 FOR VALUES FROM (0) TO (250); +CREATE TABLE prt2_p2 PARTITION OF prt2 FOR VALUES FROM (250) TO (500); +CREATE TABLE prt2_p3 PARTITION OF prt2 FOR VALUES FROM (500) TO (600); +INSERT INTO prt2 SELECT i % 25, i, to_char(i, 'FM0000') FROM generate_series(0, 599) i WHERE i % 3 = 0; +CREATE INDEX iprt2_p1_b on prt2_p1(b); +CREATE INDEX iprt2_p2_b on prt2_p2(b); +CREATE INDEX iprt2_p3_b on prt2_p3(b); +ANALYZE prt2; +-- inner join +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1, prt2 t2 WHERE t1.a = t2.b AND t1.b = 0 ORDER BY t1.a, t2.b; + QUERY PLAN +----------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: t1.a + -> Append + -> Nested Loop + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on prt1_p1 t1 + Filter: (b = 0) + -> Index Scan using iprt2_p1_b on prt2_p1 t2 + Index Cond: (b = t1.a) + -> Nested Loop + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on prt1_p2 t1_1 + Filter: (b = 0) + -> Index Scan using iprt2_p2_b on prt2_p2 t2_1 + Index Cond: (b = t1_1.a) + -> Nested Loop + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on prt1_p3 t1_2 + Filter: (b = 0) + -> Bitmap Heap Scan on prt2_p3 t2_2 + Recheck Cond: (b = t1_2.a) + -> Bitmap Index Scan on iprt2_p3_b + Index Cond: (b = t1_2.a) +(24 rows) + +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1, prt2 t2 WHERE t1.a = t2.b AND t1.b = 0 ORDER BY t1.a, t2.b; + a | c | b | c +-----+------+-----+------ + 0 | 0000 | 0 | 0000 + 150 | 0150 | 150 | 0150 + 300 | 0300 | 300 | 0300 + 450 | 0450 | 450 | 
0450 +(4 rows) + +-- left outer join, with whole-row reference +EXPLAIN (COSTS OFF) +SELECT t1, t2 FROM prt1 t1 LEFT JOIN prt2 t2 ON t1.a = t2.b WHERE t1.b = 0 ORDER BY t1.a, t2.b; + QUERY PLAN +----------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: t1.a, t2.b + -> Result + -> Append + -> Nested Loop Left Join + -> Seq Scan on prt1_p1 t1 + Filter: (b = 0) + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Index Scan using iprt2_p1_b on prt2_p1 t2 + Index Cond: (t1.a = b) + -> Nested Loop Left Join + -> Seq Scan on prt1_p2 t1_1 + Filter: (b = 0) + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Index Scan using iprt2_p2_b on prt2_p2 t2_1 + Index Cond: (t1_1.a = b) + -> Nested Loop Left Join + -> Seq Scan on prt1_p3 t1_2 + Filter: (b = 0) + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Bitmap Heap Scan on prt2_p3 t2_2 + Recheck Cond: (t1_2.a = b) + -> Bitmap Index Scan on iprt2_p3_b + Index Cond: (t1_2.a = b) +(31 rows) + +SELECT t1, t2 FROM prt1 t1 LEFT JOIN prt2 t2 ON t1.a = t2.b WHERE t1.b = 0 ORDER BY t1.a, t2.b; + t1 | t2 +--------------+-------------- + (0,0,0000) | (0,0,0000) + (50,0,0050) | + (100,0,0100) | + (150,0,0150) | (0,150,0150) + (200,0,0200) | + (250,0,0250) | + (300,0,0300) | (0,300,0300) + (350,0,0350) | + (400,0,0400) | + (450,0,0450) | (0,450,0450) + (500,0,0500) | + (550,0,0550) | +(12 rows) + +-- right outer join +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1 RIGHT JOIN prt2 t2 ON t1.a = t2.b WHERE t2.a = 0 ORDER BY t1.a, t2.b; + QUERY PLAN +--------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: t1.a, t2.b + -> Result + -> Append + -> Nested Loop Left Join + -> Remote Subquery Scan on all (datanode_2) + Distribute results by H: b + -> Seq Scan on prt2_p1 t2 + Filter: (a = 0) + -> Index Scan using iprt1_p1_a on prt1_p1 t1 + Index Cond: (a = t2.b) + -> Nested Loop Left Join + -> Remote Subquery Scan on all (datanode_2) + Distribute results by H: b + -> Seq Scan on prt2_p2 t2_1 + Filter: (a = 0) + -> Index Scan using iprt1_p2_a on prt1_p2 t1_1 + Index Cond: (a = t2_1.b) + -> Nested Loop Left Join + -> Remote Subquery Scan on all (datanode_2) + Distribute results by H: b + -> Seq Scan on prt2_p3 t2_2 + Filter: (a = 0) + -> Index Scan using iprt1_p3_a on prt1_p3 t1_2 + Index Cond: (a = t2_2.b) +(26 rows) + +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1 RIGHT JOIN prt2 t2 ON t1.a = t2.b WHERE t2.a = 0 ORDER BY t1.a, t2.b; + a | c | b | c +-----+------+-----+------ + 0 | 0000 | 0 | 0000 + 150 | 0150 | 150 | 0150 + 300 | 0300 | 300 | 0300 + 450 | 0450 | 450 | 0450 + | | 75 | 0075 + | | 225 | 0225 + | | 375 | 0375 + | | 525 | 0525 +(8 rows) + +-- full outer join, with placeholder vars +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT 50 phv, * FROM prt1 WHERE prt1.b = 0) t1 FULL JOIN (SELECT 75 phv, * FROM prt2 WHERE prt2.a = 0) t2 ON (t1.a = t2.b) WHERE t1.phv = t1.a OR t2.phv = t2.b ORDER BY t1.a, t2.b; + QUERY PLAN +------------------------------------------------------------------------ + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: prt1_p1.a, prt2_p1.b + -> Append + -> Hash Full Join + Hash Cond: (prt1_p1.a = prt2_p1.b) + Filter: (((50) = 
prt1_p1.a) OR ((75) = prt2_p1.b)) + -> Seq Scan on prt1_p1 + Filter: (b = 0) + -> Hash + -> Remote Subquery Scan on all (datanode_2) + Distribute results by H: b + -> Seq Scan on prt2_p1 + Filter: (a = 0) + -> Hash Full Join + Hash Cond: (prt1_p2.a = prt2_p2.b) + Filter: (((50) = prt1_p2.a) OR ((75) = prt2_p2.b)) + -> Seq Scan on prt1_p2 + Filter: (b = 0) + -> Hash + -> Remote Subquery Scan on all (datanode_2) + Distribute results by H: b + -> Seq Scan on prt2_p2 + Filter: (a = 0) + -> Hash Full Join + Hash Cond: (prt1_p3.a = prt2_p3.b) + Filter: (((50) = prt1_p3.a) OR ((75) = prt2_p3.b)) + -> Seq Scan on prt1_p3 + Filter: (b = 0) + -> Hash + -> Remote Subquery Scan on all (datanode_2) + Distribute results by H: b + -> Seq Scan on prt2_p3 + Filter: (a = 0) +(34 rows) + +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT 50 phv, * FROM prt1 WHERE prt1.b = 0) t1 FULL JOIN (SELECT 75 phv, * FROM prt2 WHERE prt2.a = 0) t2 ON (t1.a = t2.b) WHERE t1.phv = t1.a OR t2.phv = t2.b ORDER BY t1.a, t2.b; + a | c | b | c +----+------+----+------ + 50 | 0050 | | + | | 75 | 0075 +(2 rows) + +-- Join with pruned partitions from joining relations +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1, prt2 t2 WHERE t1.a = t2.b AND t1.a < 450 AND t2.b > 250 AND t1.b = 0 ORDER BY t1.a, t2.b; + QUERY PLAN +----------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: t1.a + -> Append + -> Nested Loop + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on prt1_p2 t1 + Filter: ((a < 450) AND (b = 0)) + -> Index Scan using iprt2_p2_b on prt2_p2 t2 + Index Cond: ((b = t1.a) AND (b > 250)) +(10 rows) + +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1, prt2 t2 WHERE t1.a = t2.b AND t1.a < 450 AND t2.b > 250 AND t1.b = 0 ORDER BY t1.a, t2.b; + a | c | b | c +-----+------+-----+------ + 300 | 0300 | 300 | 0300 +(1 row) + +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a < 450) t1 LEFT JOIN (SELECT * FROM prt2 WHERE b > 250) t2 ON t1.a = t2.b WHERE t1.b = 0 ORDER BY t1.a, t2.b; + QUERY PLAN +----------------------------------------------------------------------------- + Sort + Sort Key: prt1_p1.a, b + -> Append + -> Hash Left Join + Hash Cond: (prt1_p1.a = b) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on prt1_p1 + Filter: ((a < 450) AND (b = 0)) + -> Hash + -> Result + One-Time Filter: false + -> Nested Loop Left Join + -> Seq Scan on prt1_p2 + Filter: ((a < 450) AND (b = 0)) + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Index Scan using iprt2_p2_b on prt2_p2 + Index Cond: ((prt1_p2.a = b) AND (b > 250)) +(19 rows) + +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a < 450) t1 LEFT JOIN (SELECT * FROM prt2 WHERE b > 250) t2 ON t1.a = t2.b WHERE t1.b = 0 ORDER BY t1.a, t2.b; + a | c | b | c +-----+------+-----+------ + 0 | 0000 | | + 50 | 0050 | | + 100 | 0100 | | + 150 | 0150 | | + 200 | 0200 | | + 250 | 0250 | | + 300 | 0300 | 300 | 0300 + 350 | 0350 | | + 400 | 0400 | | +(9 rows) + +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a < 450) t1 FULL JOIN (SELECT * FROM prt2 WHERE b > 250) t2 ON t1.a = t2.b WHERE t1.b = 0 OR t2.a = 0 ORDER BY t1.a, t2.b; + QUERY PLAN +----------------------------------------------------------------------------- + Sort + Sort Key: prt1_p1.a, b + -> Append + -> Hash Full Join + Hash Cond: (prt1_p1.a = b) + Filter: 
((prt1_p1.b = 0) OR (a = 0)) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on prt1_p1 + Filter: (a < 450) + -> Hash + -> Result + One-Time Filter: false + -> Hash Full Join + Hash Cond: (prt1_p2.a = prt2_p2.b) + Filter: ((prt1_p2.b = 0) OR (prt2_p2.a = 0)) + -> Seq Scan on prt1_p2 + Filter: (a < 450) + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Seq Scan on prt2_p2 + Filter: (b > 250) + -> Hash Full Join + Hash Cond: (prt2_p3.b = a) + Filter: ((b = 0) OR (prt2_p3.a = 0)) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on prt2_p3 + Filter: (b > 250) + -> Hash + -> Result + One-Time Filter: false +(31 rows) + +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a < 450) t1 FULL JOIN (SELECT * FROM prt2 WHERE b > 250) t2 ON t1.a = t2.b WHERE t1.b = 0 OR t2.a = 0 ORDER BY t1.a, t2.b; + a | c | b | c +-----+------+-----+------ + 0 | 0000 | | + 50 | 0050 | | + 100 | 0100 | | + 150 | 0150 | | + 200 | 0200 | | + 250 | 0250 | | + 300 | 0300 | 300 | 0300 + 350 | 0350 | | + 400 | 0400 | | + | | 375 | 0375 + | | 450 | 0450 + | | 525 | 0525 +(12 rows) + +-- Semi-join +EXPLAIN (COSTS OFF) +SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t2.b FROM prt2 t2 WHERE t2.a = 0) AND t1.b = 0 ORDER BY t1.a; + QUERY PLAN +------------------------------------------------------------------------------ + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: t1.a + -> Append + -> Nested Loop + -> Remote Subquery Scan on all (datanode_2) + Distribute results by H: b + -> HashAggregate + Group Key: t2.b + -> Remote Subquery Scan on all (datanode_2) + Distribute results by H: b + -> HashAggregate + Group Key: t2.b + -> Seq Scan on prt2_p1 t2 + Filter: (a = 0) + -> Index Scan using iprt1_p1_a on prt1_p1 t1 + Index Cond: (a = t2.b) + Filter: (b = 0) + -> Nested Loop + -> Remote Subquery Scan on all (datanode_2) + Distribute results by H: b + -> HashAggregate + Group Key: t2_1.b + -> Remote Subquery Scan on all (datanode_2) + Distribute results by H: b + -> HashAggregate + Group Key: t2_1.b + -> Seq Scan on prt2_p2 t2_1 + Filter: (a = 0) + -> Index Scan using iprt1_p2_a on prt1_p2 t1_1 + Index Cond: (a = t2_1.b) + Filter: (b = 0) + -> Nested Loop + -> Remote Subquery Scan on all (datanode_2) + Distribute results by H: b + -> HashAggregate + Group Key: t2_2.b + -> Remote Subquery Scan on all (datanode_2) + Distribute results by H: b + -> HashAggregate + Group Key: t2_2.b + -> Seq Scan on prt2_p3 t2_2 + Filter: (a = 0) + -> Index Scan using iprt1_p3_a on prt1_p3 t1_2 + Index Cond: (a = t2_2.b) + Filter: (b = 0) +(46 rows) + +SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t2.b FROM prt2 t2 WHERE t2.a = 0) AND t1.b = 0 ORDER BY t1.a; + a | b | c +-----+---+------ + 0 | 0 | 0000 + 150 | 0 | 0150 + 300 | 0 | 0300 + 450 | 0 | 0450 +(4 rows) + +-- Anti-join with aggregates +EXPLAIN (COSTS OFF) +SELECT sum(t1.a), avg(t1.a), sum(t1.b), avg(t1.b) FROM prt1 t1 WHERE NOT EXISTS (SELECT 1 FROM prt2 t2 WHERE t1.a = t2.b); + QUERY PLAN +-------------------------------------------------------------------------------------------- + Finalize Aggregate + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Partial Aggregate + -> Append + -> Hash Anti Join + Hash Cond: (t1.a = t2.b) + -> Seq Scan on prt1_p1 t1 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Seq Scan on prt2_p1 t2 + -> Hash Anti Join + Hash Cond: (t1_1.a = t2_1.b) + -> Seq Scan on prt1_p2 
t1_1 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Seq Scan on prt2_p2 t2_1 + -> Nested Loop Anti Join + -> Seq Scan on prt1_p3 t1_2 + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Index Only Scan using iprt2_p3_b on prt2_p3 t2_2 + Index Cond: (b = t1_2.a) +(25 rows) + +SELECT sum(t1.a), avg(t1.a), sum(t1.b), avg(t1.b) FROM prt1 t1 WHERE NOT EXISTS (SELECT 1 FROM prt2 t2 WHERE t1.a = t2.b); + sum | avg | sum | avg +-------+----------------------+------+--------------------- + 60000 | 300.0000000000000000 | 2400 | 12.0000000000000000 +(1 row) + +-- lateral reference +EXPLAIN (COSTS OFF) +SELECT * FROM prt1 t1 LEFT JOIN LATERAL + (SELECT t2.a AS t2a, t3.a AS t3a, least(t1.a,t2.a,t3.b) FROM prt1 t2 JOIN prt2 t3 ON (t2.a = t3.b)) ss + ON t1.a = ss.t2a WHERE t1.b = 0 ORDER BY t1.a; +ERROR: could not devise a query plan for the given query +SELECT * FROM prt1 t1 LEFT JOIN LATERAL + (SELECT t2.a AS t2a, t3.a AS t3a, least(t1.a,t2.a,t3.b) FROM prt1 t2 JOIN prt2 t3 ON (t2.a = t3.b)) ss + ON t1.a = ss.t2a WHERE t1.b = 0 ORDER BY t1.a; + a | b | c | t2a | t3a | least +-----+---+------+-----+-----+------- + 0 | 0 | 0000 | 0 | 0 | 0 + 50 | 0 | 0050 | | | + 100 | 0 | 0100 | | | + 150 | 0 | 0150 | 150 | 0 | 150 + 200 | 0 | 0200 | | | + 250 | 0 | 0250 | | | + 300 | 0 | 0300 | 300 | 0 | 300 + 350 | 0 | 0350 | | | + 400 | 0 | 0400 | | | + 450 | 0 | 0450 | 450 | 0 | 450 + 500 | 0 | 0500 | | | + 550 | 0 | 0550 | | | +(12 rows) + +EXPLAIN (COSTS OFF) +SELECT t1.a, ss.t2a, ss.t2c FROM prt1 t1 LEFT JOIN LATERAL + (SELECT t2.a AS t2a, t3.a AS t3a, t2.b t2b, t2.c t2c, least(t1.a,t2.a,t3.b) FROM prt1 t2 JOIN prt2 t3 ON (t2.a = t3.b)) ss + ON t1.c = ss.t2c WHERE (t1.b + coalesce(ss.t2b, 0)) = 0 ORDER BY t1.a; + QUERY PLAN +----------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: a + -> Hash Right Join + Hash Cond: ((c)::text = (c)::text) + Filter: ((b + COALESCE(b, 0)) = 0) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: c + -> Append + -> Hash Join + Hash Cond: (t2.a = t3.b) + -> Seq Scan on prt1_p1 t2 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Seq Scan on prt2_p1 t3 + -> Hash Join + Hash Cond: (t2_1.a = t3_1.b) + -> Seq Scan on prt1_p2 t2_1 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Seq Scan on prt2_p2 t3_1 + -> Nested Loop + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Seq Scan on prt2_p3 t3_2 + -> Index Scan using iprt1_p3_a on prt1_p3 t2_2 + Index Cond: (a = t3_2.b) + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: c + -> Append + -> Seq Scan on prt1_p1 t1 + -> Seq Scan on prt1_p2 t1_1 + -> Seq Scan on prt1_p3 t1_2 +(36 rows) + +SELECT t1.a, ss.t2a, ss.t2c FROM prt1 t1 LEFT JOIN LATERAL + (SELECT t2.a AS t2a, t3.a AS t3a, t2.b t2b, t2.c t2c, least(t1.a,t2.a,t3.a) FROM prt1 t2 JOIN prt2 t3 ON (t2.a = t3.b)) ss + ON t1.c = ss.t2c WHERE (t1.b + coalesce(ss.t2b, 0)) = 0 ORDER BY t1.a; + a | t2a | t2c +-----+-----+------ + 0 | 0 | 0000 + 50 | | + 100 | | + 150 | 150 | 0150 + 200 | | + 250 | | + 300 | 300 | 0300 + 350 | | + 400 | | + 450 | 450 | 0450 + 500 | | + 550 | | +(12 rows) + +-- +-- partitioned by expression +-- +CREATE TABLE prt1_e (a int, b int, 
c int) PARTITION BY RANGE(((a + b)/2)); +CREATE TABLE prt1_e_p1 PARTITION OF prt1_e FOR VALUES FROM (0) TO (250); +CREATE TABLE prt1_e_p2 PARTITION OF prt1_e FOR VALUES FROM (250) TO (500); +CREATE TABLE prt1_e_p3 PARTITION OF prt1_e FOR VALUES FROM (500) TO (600); +INSERT INTO prt1_e SELECT i, i, i % 25 FROM generate_series(0, 599, 2) i; +CREATE INDEX iprt1_e_p1_ab2 on prt1_e_p1(((a+b)/2)); +CREATE INDEX iprt1_e_p2_ab2 on prt1_e_p2(((a+b)/2)); +CREATE INDEX iprt1_e_p3_ab2 on prt1_e_p3(((a+b)/2)); +ANALYZE prt1_e; +CREATE TABLE prt2_e (a int, b int, c int) PARTITION BY RANGE(((b + a)/2)); +CREATE TABLE prt2_e_p1 PARTITION OF prt2_e FOR VALUES FROM (0) TO (250); +CREATE TABLE prt2_e_p2 PARTITION OF prt2_e FOR VALUES FROM (250) TO (500); +CREATE TABLE prt2_e_p3 PARTITION OF prt2_e FOR VALUES FROM (500) TO (600); +INSERT INTO prt2_e SELECT i, i, i % 25 FROM generate_series(0, 599, 3) i; +ANALYZE prt2_e; +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_e t1, prt2_e t2 WHERE (t1.a + t1.b)/2 = (t2.b + t2.a)/2 AND t1.c = 0 ORDER BY t1.a, t2.b; + QUERY PLAN +--------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: t1.a, t2.b + -> Append + -> Hash Join + Hash Cond: (((t2.b + t2.a) / 2) = ((t1.a + t1.b) / 2)) + -> Seq Scan on prt2_e_p1 t2 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on prt1_e_p1 t1 + Filter: (c = 0) + -> Hash Join + Hash Cond: (((t2_1.b + t2_1.a) / 2) = ((t1_1.a + t1_1.b) / 2)) + -> Seq Scan on prt2_e_p2 t2_1 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on prt1_e_p2 t1_1 + Filter: (c = 0) + -> Nested Loop + -> Seq Scan on prt2_e_p3 t2_2 + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Index Scan using iprt1_e_p3_ab2 on prt1_e_p3 t1_2 + Index Cond: (((a + b) / 2) = ((t2_2.b + t2_2.a) / 2)) + Filter: (c = 0) +(25 rows) + +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_e t1, prt2_e t2 WHERE (t1.a + t1.b)/2 = (t2.b + t2.a)/2 AND t1.c = 0 ORDER BY t1.a, t2.b; + a | c | b | c +-----+---+-----+--- + 0 | 0 | 0 | 0 + 150 | 0 | 150 | 0 + 300 | 0 | 300 | 0 + 450 | 0 | 450 | 0 +(4 rows) + +-- +-- N-way join +-- +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c, t3.a + t3.b, t3.c FROM prt1 t1, prt2 t2, prt1_e t3 WHERE t1.a = t2.b AND t1.a = (t3.a + t3.b)/2 AND t1.b = 0 ORDER BY t1.a, t2.b; + QUERY PLAN +----------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: t1.a + -> Result + -> Append + -> Nested Loop + Join Filter: (t1.a = (((t3.a + t3.b) / 2))) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Nested Loop + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on prt1_p1 t1 + Filter: (b = 0) + -> Index Scan using iprt2_p1_b on prt2_p1 t2 + Index Cond: (b = t1.a) + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: ((a + b) / 2) + -> Index Scan using iprt1_e_p1_ab2 on prt1_e_p1 t3 + Index Cond: (((a + b) / 2) = t2.b) + -> Nested Loop + Join Filter: (t1_1.a = (((t3_1.a + t3_1.b) / 2))) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Nested Loop + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on prt1_p2 t1_1 + Filter: (b = 0) + -> Index Scan using iprt2_p2_b on prt2_p2 t2_1 + Index Cond: (b = 
t1_1.a) + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: ((a + b) / 2) + -> Index Scan using iprt1_e_p2_ab2 on prt1_e_p2 t3_1 + Index Cond: (((a + b) / 2) = t2_1.b) + -> Nested Loop + Join Filter: (t1_2.a = t2_2.b) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Nested Loop + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on prt1_p3 t1_2 + Filter: (b = 0) + -> Index Scan using iprt1_e_p3_ab2 on prt1_e_p3 t3_2 + Index Cond: (((a + b) / 2) = t1_2.a) + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Index Scan using iprt2_p3_b on prt2_p3 t2_2 + Index Cond: (b = ((t3_2.a + t3_2.b) / 2)) +(50 rows) + +SELECT t1.a, t1.c, t2.b, t2.c, t3.a + t3.b, t3.c FROM prt1 t1, prt2 t2, prt1_e t3 WHERE t1.a = t2.b AND t1.a = (t3.a + t3.b)/2 AND t1.b = 0 ORDER BY t1.a, t2.b; + a | c | b | c | ?column? | c +-----+------+-----+------+----------+--- + 0 | 0000 | 0 | 0000 | 0 | 0 + 150 | 0150 | 150 | 0150 | 300 | 0 + 300 | 0300 | 300 | 0300 | 600 | 0 + 450 | 0450 | 450 | 0450 | 900 | 0 +(4 rows) + +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c, t3.a + t3.b, t3.c FROM (prt1 t1 LEFT JOIN prt2 t2 ON t1.a = t2.b) LEFT JOIN prt1_e t3 ON (t1.a = (t3.a + t3.b)/2) WHERE t1.b = 0 ORDER BY t1.a, t2.b, t3.a + t3.b; + QUERY PLAN +----------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: t1.a, t2.b, ((t3.a + t3.b)) + -> Result + -> Append + -> Nested Loop Left Join + -> Nested Loop Left Join + -> Seq Scan on prt1_p1 t1 + Filter: (b = 0) + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Index Scan using iprt2_p1_b on prt2_p1 t2 + Index Cond: (t1.a = b) + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: ((a + b) / 2) + -> Index Scan using iprt1_e_p1_ab2 on prt1_e_p1 t3 + Index Cond: (t1.a = ((a + b) / 2)) + -> Nested Loop Left Join + -> Nested Loop Left Join + -> Seq Scan on prt1_p2 t1_1 + Filter: (b = 0) + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Index Scan using iprt2_p2_b on prt2_p2 t2_1 + Index Cond: (t1_1.a = b) + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: ((a + b) / 2) + -> Index Scan using iprt1_e_p2_ab2 on prt1_e_p2 t3_1 + Index Cond: (t1_1.a = ((a + b) / 2)) + -> Nested Loop Left Join + -> Nested Loop Left Join + -> Seq Scan on prt1_p3 t1_2 + Filter: (b = 0) + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Bitmap Heap Scan on prt2_p3 t2_2 + Recheck Cond: (t1_2.a = b) + -> Bitmap Index Scan on iprt2_p3_b + Index Cond: (t1_2.a = b) + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: ((a + b) / 2) + -> Index Scan using iprt1_e_p3_ab2 on prt1_e_p3 t3_2 + Index Cond: (t1_2.a = ((a + b) / 2)) +(49 rows) + +SELECT t1.a, t1.c, t2.b, t2.c, t3.a + t3.b, t3.c FROM (prt1 t1 LEFT JOIN prt2 t2 ON t1.a = t2.b) LEFT JOIN prt1_e t3 ON (t1.a = (t3.a + t3.b)/2) WHERE t1.b = 0 ORDER BY t1.a, t2.b, t3.a + t3.b; + a | c | b | c | ?column? 
| c +-----+------+-----+------+----------+--- + 0 | 0000 | 0 | 0000 | 0 | 0 + 50 | 0050 | | | 100 | 0 + 100 | 0100 | | | 200 | 0 + 150 | 0150 | 150 | 0150 | 300 | 0 + 200 | 0200 | | | 400 | 0 + 250 | 0250 | | | 500 | 0 + 300 | 0300 | 300 | 0300 | 600 | 0 + 350 | 0350 | | | 700 | 0 + 400 | 0400 | | | 800 | 0 + 450 | 0450 | 450 | 0450 | 900 | 0 + 500 | 0500 | | | 1000 | 0 + 550 | 0550 | | | 1100 | 0 +(12 rows) + +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c, t3.a + t3.b, t3.c FROM (prt1 t1 LEFT JOIN prt2 t2 ON t1.a = t2.b) RIGHT JOIN prt1_e t3 ON (t1.a = (t3.a + t3.b)/2) WHERE t3.c = 0 ORDER BY t1.a, t2.b, t3.a + t3.b; + QUERY PLAN +----------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: t1.a, t2.b, ((t3.a + t3.b)) + -> Result + -> Append + -> Nested Loop Left Join + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Nested Loop Left Join + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: ((a + b) / 2) + -> Seq Scan on prt1_e_p1 t3 + Filter: (c = 0) + -> Index Scan using iprt1_p1_a on prt1_p1 t1 + Index Cond: (a = ((t3.a + t3.b) / 2)) + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Index Scan using iprt2_p1_b on prt2_p1 t2 + Index Cond: (t1.a = b) + -> Nested Loop Left Join + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Nested Loop Left Join + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: ((a + b) / 2) + -> Seq Scan on prt1_e_p2 t3_1 + Filter: (c = 0) + -> Index Scan using iprt1_p2_a on prt1_p2 t1_1 + Index Cond: (a = ((t3_1.a + t3_1.b) / 2)) + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Index Scan using iprt2_p2_b on prt2_p2 t2_1 + Index Cond: (t1_1.a = b) + -> Nested Loop Left Join + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Nested Loop Left Join + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: ((a + b) / 2) + -> Seq Scan on prt1_e_p3 t3_2 + Filter: (c = 0) + -> Index Scan using iprt1_p3_a on prt1_p3 t1_2 + Index Cond: (a = ((t3_2.a + t3_2.b) / 2)) + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Index Scan using iprt2_p3_b on prt2_p3 t2_2 + Index Cond: (t1_2.a = b) +(50 rows) + +SELECT t1.a, t1.c, t2.b, t2.c, t3.a + t3.b, t3.c FROM (prt1 t1 LEFT JOIN prt2 t2 ON t1.a = t2.b) RIGHT JOIN prt1_e t3 ON (t1.a = (t3.a + t3.b)/2) WHERE t3.c = 0 ORDER BY t1.a, t2.b, t3.a + t3.b; + a | c | b | c | ?column? 
| c +-----+------+-----+------+----------+--- + 0 | 0000 | 0 | 0000 | 0 | 0 + 50 | 0050 | | | 100 | 0 + 100 | 0100 | | | 200 | 0 + 150 | 0150 | 150 | 0150 | 300 | 0 + 200 | 0200 | | | 400 | 0 + 250 | 0250 | | | 500 | 0 + 300 | 0300 | 300 | 0300 | 600 | 0 + 350 | 0350 | | | 700 | 0 + 400 | 0400 | | | 800 | 0 + 450 | 0450 | 450 | 0450 | 900 | 0 + 500 | 0500 | | | 1000 | 0 + 550 | 0550 | | | 1100 | 0 +(12 rows) + +-- Cases with non-nullable expressions in subquery results; +-- make sure these go to null as expected +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.phv, t2.b, t2.phv, t3.a + t3.b, t3.phv FROM ((SELECT 50 phv, * FROM prt1 WHERE prt1.b = 0) t1 FULL JOIN (SELECT 75 phv, * FROM prt2 WHERE prt2.a = 0) t2 ON (t1.a = t2.b)) FULL JOIN (SELECT 50 phv, * FROM prt1_e WHERE prt1_e.c = 0) t3 ON (t1.a = (t3.a + t3.b)/2) WHERE t1.a = t1.phv OR t2.b = t2.phv OR (t3.a + t3.b)/2 = t3.phv ORDER BY t1.a, t2.b, t3.a + t3.b; + QUERY PLAN +---------------------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: prt1_p1.a, prt2_p1.b, ((prt1_e_p1.a + prt1_e_p1.b)) + -> Result + -> Append + -> Hash Full Join + Hash Cond: (prt1_p1.a = (((prt1_e_p1.a + prt1_e_p1.b) / 2))) + Filter: ((prt1_p1.a = (50)) OR (prt2_p1.b = (75)) OR (((prt1_e_p1.a + prt1_e_p1.b) / 2) = (50))) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Hash Full Join + Hash Cond: (prt1_p1.a = prt2_p1.b) + -> Seq Scan on prt1_p1 + Filter: (b = 0) + -> Hash + -> Remote Subquery Scan on all (datanode_2) + Distribute results by H: b + -> Seq Scan on prt2_p1 + Filter: (a = 0) + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: ((a + b) / 2) + -> Seq Scan on prt1_e_p1 + Filter: (c = 0) + -> Hash Full Join + Hash Cond: (prt1_p2.a = (((prt1_e_p2.a + prt1_e_p2.b) / 2))) + Filter: ((prt1_p2.a = (50)) OR (prt2_p2.b = (75)) OR (((prt1_e_p2.a + prt1_e_p2.b) / 2) = (50))) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Hash Full Join + Hash Cond: (prt1_p2.a = prt2_p2.b) + -> Seq Scan on prt1_p2 + Filter: (b = 0) + -> Hash + -> Remote Subquery Scan on all (datanode_2) + Distribute results by H: b + -> Seq Scan on prt2_p2 + Filter: (a = 0) + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: ((a + b) / 2) + -> Seq Scan on prt1_e_p2 + Filter: (c = 0) + -> Hash Full Join + Hash Cond: (prt1_p3.a = (((prt1_e_p3.a + prt1_e_p3.b) / 2))) + Filter: ((prt1_p3.a = (50)) OR (prt2_p3.b = (75)) OR (((prt1_e_p3.a + prt1_e_p3.b) / 2) = (50))) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Hash Full Join + Hash Cond: (prt1_p3.a = prt2_p3.b) + -> Seq Scan on prt1_p3 + Filter: (b = 0) + -> Hash + -> Remote Subquery Scan on all (datanode_2) + Distribute results by H: b + -> Seq Scan on prt2_p3 + Filter: (a = 0) + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: ((a + b) / 2) + -> Seq Scan on prt1_e_p3 + Filter: (c = 0) +(62 rows) + +SELECT t1.a, t1.phv, t2.b, t2.phv, t3.a + t3.b, t3.phv FROM ((SELECT 50 phv, * FROM prt1 WHERE prt1.b = 0) t1 FULL JOIN (SELECT 75 phv, * FROM prt2 WHERE prt2.a = 0) t2 ON (t1.a = t2.b)) FULL JOIN (SELECT 50 phv, * FROM prt1_e WHERE prt1_e.c = 0) t3 ON (t1.a = (t3.a + t3.b)/2) WHERE t1.a = t1.phv OR t2.b = t2.phv OR (t3.a + t3.b)/2 = t3.phv ORDER BY t1.a, t2.b, t3.a + t3.b; + a | 
phv | b | phv | ?column? | phv +----+-----+----+-----+----------+----- + 50 | 50 | | | 100 | 50 + | | 75 | 75 | | +(2 rows) + +-- Semi-join +EXPLAIN (COSTS OFF) +SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t1.b FROM prt2 t1, prt1_e t2 WHERE t1.a = 0 AND t1.b = (t2.a + t2.b)/2) AND t1.b = 0 ORDER BY t1.a; + QUERY PLAN +--------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: t1.a + -> Append + -> Nested Loop + Join Filter: (t1.a = t1_3.b) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> HashAggregate + Group Key: t1_3.b + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> HashAggregate + Group Key: t1_3.b + -> Nested Loop + -> Remote Subquery Scan on all (datanode_2) + -> Seq Scan on prt2_p1 t1_3 + Filter: (a = 0) + -> Index Scan using iprt1_e_p1_ab2 on prt1_e_p1 t2 + Index Cond: (((a + b) / 2) = t1_3.b) + -> Index Scan using iprt1_p1_a on prt1_p1 t1 + Index Cond: (a = ((t2.a + t2.b) / 2)) + Filter: (b = 0) + -> Nested Loop + Join Filter: (t1_1.a = t1_4.b) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> HashAggregate + Group Key: t1_4.b + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> HashAggregate + Group Key: t1_4.b + -> Nested Loop + -> Remote Subquery Scan on all (datanode_2) + -> Seq Scan on prt2_p2 t1_4 + Filter: (a = 0) + -> Index Scan using iprt1_e_p2_ab2 on prt1_e_p2 t2_1 + Index Cond: (((a + b) / 2) = t1_4.b) + -> Index Scan using iprt1_p2_a on prt1_p2 t1_1 + Index Cond: (a = ((t2_1.a + t2_1.b) / 2)) + Filter: (b = 0) + -> Nested Loop + Join Filter: (t1_2.a = t1_5.b) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> HashAggregate + Group Key: t1_5.b + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> HashAggregate + Group Key: t1_5.b + -> Nested Loop + -> Remote Subquery Scan on all (datanode_2) + -> Seq Scan on prt2_p3 t1_5 + Filter: (a = 0) + -> Index Scan using iprt1_e_p3_ab2 on prt1_e_p3 t2_2 + Index Cond: (((a + b) / 2) = t1_5.b) + -> Index Scan using iprt1_p3_a on prt1_p3 t1_2 + Index Cond: (a = ((t2_2.a + t2_2.b) / 2)) + Filter: (b = 0) +(61 rows) + +SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t1.b FROM prt2 t1, prt1_e t2 WHERE t1.a = 0 AND t1.b = (t2.a + t2.b)/2) AND t1.b = 0 ORDER BY t1.a; + a | b | c +-----+---+------ + 0 | 0 | 0000 + 150 | 0 | 0150 + 300 | 0 | 0300 + 450 | 0 | 0450 +(4 rows) + +EXPLAIN (COSTS OFF) +SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t1.b FROM prt2 t1 WHERE t1.b IN (SELECT (t1.a + t1.b)/2 FROM prt1_e t1 WHERE t1.c = 0)) AND t1.b = 0 ORDER BY t1.a; + QUERY PLAN +--------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: t1.a + -> Append + -> Nested Loop Semi Join + -> Seq Scan on prt1_p1 t1 + Filter: (b = 0) + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Nested Loop Semi Join + -> Index Only Scan using iprt2_p1_b on prt2_p1 t1_3 + Index Cond: (b = t1.a) + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Index Scan using iprt1_e_p1_ab2 on prt1_e_p1 t1_6 + Index Cond: (((a + b) / 2) = t1_3.b) + Filter: (c = 0) + -> Nested Loop Semi Join + -> Seq Scan on prt1_p2 t1_1 + 
Filter: (b = 0) + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Nested Loop Semi Join + -> Index Only Scan using iprt2_p2_b on prt2_p2 t1_4 + Index Cond: (b = t1_1.a) + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Index Scan using iprt1_e_p2_ab2 on prt1_e_p2 t1_7 + Index Cond: (((a + b) / 2) = t1_4.b) + Filter: (c = 0) + -> Nested Loop Semi Join + -> Seq Scan on prt1_p3 t1_2 + Filter: (b = 0) + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Nested Loop Semi Join + -> Bitmap Heap Scan on prt2_p3 t1_5 + Recheck Cond: (b = t1_2.a) + -> Bitmap Index Scan on iprt2_p3_b + Index Cond: (b = t1_2.a) + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Index Scan using iprt1_e_p3_ab2 on prt1_e_p3 t1_8 + Index Cond: (((a + b) / 2) = t1_5.b) + Filter: (c = 0) +(48 rows) + +SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t1.b FROM prt2 t1 WHERE t1.b IN (SELECT (t1.a + t1.b)/2 FROM prt1_e t1 WHERE t1.c = 0)) AND t1.b = 0 ORDER BY t1.a; + a | b | c +-----+---+------ + 0 | 0 | 0000 + 150 | 0 | 0150 + 300 | 0 | 0300 + 450 | 0 | 0450 +(4 rows) + +-- test merge joins +SET enable_hashjoin TO off; +SET enable_nestloop TO off; +EXPLAIN (COSTS OFF) +SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t1.b FROM prt2 t1 WHERE t1.b IN (SELECT (t1.a + t1.b)/2 FROM prt1_e t1 WHERE t1.c = 0)) AND t1.b = 0 ORDER BY t1.a; + QUERY PLAN +----------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: t1.a + -> Append + -> Merge Semi Join + Merge Cond: (t1.a = t1_3.b) + -> Sort + Sort Key: t1.a + -> Seq Scan on prt1_p1 t1 + Filter: (b = 0) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Merge Semi Join + Merge Cond: (t1_3.b = (((t1_6.a + t1_6.b) / 2))) + -> Sort + Sort Key: t1_3.b + -> Seq Scan on prt2_p1 t1_3 + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: (((t1_6.a + t1_6.b) / 2)) + -> Seq Scan on prt1_e_p1 t1_6 + Filter: (c = 0) + -> Merge Semi Join + Merge Cond: (t1_1.a = t1_4.b) + -> Sort + Sort Key: t1_1.a + -> Seq Scan on prt1_p2 t1_1 + Filter: (b = 0) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Merge Semi Join + Merge Cond: (t1_4.b = (((t1_7.a + t1_7.b) / 2))) + -> Sort + Sort Key: t1_4.b + -> Seq Scan on prt2_p2 t1_4 + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: (((t1_7.a + t1_7.b) / 2)) + -> Seq Scan on prt1_e_p2 t1_7 + Filter: (c = 0) + -> Merge Semi Join + Merge Cond: (t1_2.a = t1_5.b) + -> Sort + Sort Key: t1_2.a + -> Seq Scan on prt1_p3 t1_2 + Filter: (b = 0) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Merge Semi Join + Merge Cond: (t1_5.b = (((t1_8.a + t1_8.b) / 2))) + -> Sort + Sort Key: t1_5.b + -> Seq Scan on prt2_p3 t1_5 + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: (((t1_8.a + t1_8.b) / 2)) + -> Seq Scan on prt1_e_p3 t1_8 + Filter: (c = 0) +(58 rows) + +SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t1.b FROM prt2 t1 WHERE t1.b IN (SELECT (t1.a + t1.b)/2 FROM prt1_e t1 WHERE t1.c = 0)) AND t1.b = 0 ORDER BY t1.a; + a | b | c +-----+---+------ + 0 | 0 | 0000 + 150 | 0 | 0150 + 300 | 0 | 0300 + 450 | 0 | 0450 +(4 rows) + +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c, t3.a + t3.b, t3.c FROM (prt1 t1 
LEFT JOIN prt2 t2 ON t1.a = t2.b) RIGHT JOIN prt1_e t3 ON (t1.a = (t3.a + t3.b)/2) WHERE t3.c = 0 ORDER BY t1.a, t2.b, t3.a + t3.b; + QUERY PLAN +----------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: t1.a, t2.b, ((t3.a + t3.b)) + -> Result + -> Append + -> Merge Right Join + Merge Cond: (t1.a = (((t3.a + t3.b) / 2))) + -> Merge Left Join + Merge Cond: (t1.a = t2.b) + -> Sort + Sort Key: t1.a + -> Seq Scan on prt1_p1 t1 + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Sort + Sort Key: t2.b + -> Seq Scan on prt2_p1 t2 + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: ((a + b) / 2) + -> Sort + Sort Key: (((t3.a + t3.b) / 2)) + -> Seq Scan on prt1_e_p1 t3 + Filter: (c = 0) + -> Merge Right Join + Merge Cond: (t1_1.a = (((t3_1.a + t3_1.b) / 2))) + -> Merge Left Join + Merge Cond: (t1_1.a = t2_1.b) + -> Sort + Sort Key: t1_1.a + -> Seq Scan on prt1_p2 t1_1 + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Sort + Sort Key: t2_1.b + -> Seq Scan on prt2_p2 t2_1 + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: ((a + b) / 2) + -> Sort + Sort Key: (((t3_1.a + t3_1.b) / 2)) + -> Seq Scan on prt1_e_p2 t3_1 + Filter: (c = 0) + -> Merge Right Join + Merge Cond: (t2_2.b = t1_2.a) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Sort + Sort Key: t2_2.b + -> Seq Scan on prt2_p3 t2_2 + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Sort + Sort Key: t1_2.a + -> Merge Left Join + Merge Cond: ((((t3_2.a + t3_2.b) / 2)) = t1_2.a) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: ((a + b) / 2) + -> Sort + Sort Key: (((t3_2.a + t3_2.b) / 2)) + -> Seq Scan on prt1_e_p3 t3_2 + Filter: (c = 0) + -> Sort + Sort Key: t1_2.a + -> Seq Scan on prt1_p3 t1_2 +(68 rows) + +SELECT t1.a, t1.c, t2.b, t2.c, t3.a + t3.b, t3.c FROM (prt1 t1 LEFT JOIN prt2 t2 ON t1.a = t2.b) RIGHT JOIN prt1_e t3 ON (t1.a = (t3.a + t3.b)/2) WHERE t3.c = 0 ORDER BY t1.a, t2.b, t3.a + t3.b; + a | c | b | c | ?column? 
| c +-----+------+-----+------+----------+--- + 0 | 0000 | 0 | 0000 | 0 | 0 + 50 | 0050 | | | 100 | 0 + 100 | 0100 | | | 200 | 0 + 150 | 0150 | 150 | 0150 | 300 | 0 + 200 | 0200 | | | 400 | 0 + 250 | 0250 | | | 500 | 0 + 300 | 0300 | 300 | 0300 | 600 | 0 + 350 | 0350 | | | 700 | 0 + 400 | 0400 | | | 800 | 0 + 450 | 0450 | 450 | 0450 | 900 | 0 + 500 | 0500 | | | 1000 | 0 + 550 | 0550 | | | 1100 | 0 +(12 rows) + +-- MergeAppend on nullable column +EXPLAIN (COSTS OFF) +SELECT t1.a, t2.b FROM (SELECT * FROM prt1 WHERE a < 450) t1 LEFT JOIN (SELECT * FROM prt2 WHERE b > 250) t2 ON t1.a = t2.b WHERE t1.b = 0 ORDER BY t1.a, t2.b; + QUERY PLAN +----------------------------------------------------------------------- + Sort + Sort Key: prt1_p1.a, b + -> Append + -> Merge Left Join + Merge Cond: (prt1_p1.a = b) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: prt1_p1.a + -> Seq Scan on prt1_p1 + Filter: ((a < 450) AND (b = 0)) + -> Sort + Sort Key: b + -> Result + One-Time Filter: false + -> Merge Right Join + Merge Cond: (prt2_p2.b = prt1_p2.a) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Sort + Sort Key: prt2_p2.b + -> Seq Scan on prt2_p2 + Filter: (b > 250) + -> Sort + Sort Key: prt1_p2.a + -> Seq Scan on prt1_p2 + Filter: ((a < 450) AND (b = 0)) +(26 rows) + +SELECT t1.a, t2.b FROM (SELECT * FROM prt1 WHERE a < 450) t1 LEFT JOIN (SELECT * FROM prt2 WHERE b > 250) t2 ON t1.a = t2.b WHERE t1.b = 0 ORDER BY t1.a, t2.b; + a | b +-----+----- + 0 | + 50 | + 100 | + 150 | + 200 | + 250 | + 300 | 300 + 350 | + 400 | +(9 rows) + +RESET enable_hashjoin; +RESET enable_nestloop; +-- +-- partitioned by multiple columns +-- +CREATE TABLE prt1_m (a int, b int, c int) PARTITION BY RANGE(a, ((a + b)/2)); +CREATE TABLE prt1_m_p1 PARTITION OF prt1_m FOR VALUES FROM (0, 0) TO (250, 250); +CREATE TABLE prt1_m_p2 PARTITION OF prt1_m FOR VALUES FROM (250, 250) TO (500, 500); +CREATE TABLE prt1_m_p3 PARTITION OF prt1_m FOR VALUES FROM (500, 500) TO (600, 600); +INSERT INTO prt1_m SELECT i, i, i % 25 FROM generate_series(0, 599, 2) i; +ANALYZE prt1_m; +CREATE TABLE prt2_m (a int, b int, c int) PARTITION BY RANGE(((b + a)/2), b); +CREATE TABLE prt2_m_p1 PARTITION OF prt2_m FOR VALUES FROM (0, 0) TO (250, 250); +CREATE TABLE prt2_m_p2 PARTITION OF prt2_m FOR VALUES FROM (250, 250) TO (500, 500); +CREATE TABLE prt2_m_p3 PARTITION OF prt2_m FOR VALUES FROM (500, 500) TO (600, 600); +INSERT INTO prt2_m SELECT i, i, i % 25 FROM generate_series(0, 599, 3) i; +ANALYZE prt2_m; +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1_m WHERE prt1_m.c = 0) t1 FULL JOIN (SELECT * FROM prt2_m WHERE prt2_m.c = 0) t2 ON (t1.a = (t2.b + t2.a)/2 AND t2.b = (t1.a + t1.b)/2) ORDER BY t1.a, t2.b; + QUERY PLAN +-------------------------------------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: prt1_m_p1.a, prt2_m_p1.b + -> Append + -> Hash Full Join + Hash Cond: ((prt1_m_p1.a = (((prt2_m_p1.b + prt2_m_p1.a) / 2))) AND (((prt1_m_p1.a + prt1_m_p1.b) / 2) = prt2_m_p1.b)) + -> Seq Scan on prt1_m_p1 + Filter: (c = 0) + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: ((b + a) / 2) + -> Seq Scan on prt2_m_p1 + Filter: (c = 0) + -> Hash Full Join + Hash Cond: ((prt1_m_p2.a = (((prt2_m_p2.b + prt2_m_p2.a) / 2))) AND (((prt1_m_p2.a + prt1_m_p2.b) / 2) = prt2_m_p2.b)) + -> Seq Scan on 
prt1_m_p2 + Filter: (c = 0) + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: ((b + a) / 2) + -> Seq Scan on prt2_m_p2 + Filter: (c = 0) + -> Hash Full Join + Hash Cond: ((prt1_m_p3.a = (((prt2_m_p3.b + prt2_m_p3.a) / 2))) AND (((prt1_m_p3.a + prt1_m_p3.b) / 2) = prt2_m_p3.b)) + -> Seq Scan on prt1_m_p3 + Filter: (c = 0) + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: ((b + a) / 2) + -> Seq Scan on prt2_m_p3 + Filter: (c = 0) +(31 rows) + +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1_m WHERE prt1_m.c = 0) t1 FULL JOIN (SELECT * FROM prt2_m WHERE prt2_m.c = 0) t2 ON (t1.a = (t2.b + t2.a)/2 AND t2.b = (t1.a + t1.b)/2) ORDER BY t1.a, t2.b; + a | c | b | c +-----+---+-----+--- + 0 | 0 | 0 | 0 + 50 | 0 | | + 100 | 0 | | + 150 | 0 | 150 | 0 + 200 | 0 | | + 250 | 0 | | + 300 | 0 | 300 | 0 + 350 | 0 | | + 400 | 0 | | + 450 | 0 | 450 | 0 + 500 | 0 | | + 550 | 0 | | + | | 75 | 0 + | | 225 | 0 + | | 375 | 0 + | | 525 | 0 +(16 rows) + +-- +-- tests for list partitioned tables. +-- +CREATE TABLE plt1 (a int, b int, c text) PARTITION BY LIST(c); +CREATE TABLE plt1_p1 PARTITION OF plt1 FOR VALUES IN ('0000', '0003', '0004', '0010'); +CREATE TABLE plt1_p2 PARTITION OF plt1 FOR VALUES IN ('0001', '0005', '0002', '0009'); +CREATE TABLE plt1_p3 PARTITION OF plt1 FOR VALUES IN ('0006', '0007', '0008', '0011'); +INSERT INTO plt1 SELECT i, i, to_char(i/50, 'FM0000') FROM generate_series(0, 599, 2) i; +ANALYZE plt1; +CREATE TABLE plt2 (a int, b int, c text) PARTITION BY LIST(c); +CREATE TABLE plt2_p1 PARTITION OF plt2 FOR VALUES IN ('0000', '0003', '0004', '0010'); +CREATE TABLE plt2_p2 PARTITION OF plt2 FOR VALUES IN ('0001', '0005', '0002', '0009'); +CREATE TABLE plt2_p3 PARTITION OF plt2 FOR VALUES IN ('0006', '0007', '0008', '0011'); +INSERT INTO plt2 SELECT i, i, to_char(i/50, 'FM0000') FROM generate_series(0, 599, 3) i; +ANALYZE plt2; +-- +-- list partitioned by expression +-- +CREATE TABLE plt1_e (a int, b int, c text) PARTITION BY LIST(ltrim(c, 'A')); +CREATE TABLE plt1_e_p1 PARTITION OF plt1_e FOR VALUES IN ('0000', '0003', '0004', '0010'); +CREATE TABLE plt1_e_p2 PARTITION OF plt1_e FOR VALUES IN ('0001', '0005', '0002', '0009'); +CREATE TABLE plt1_e_p3 PARTITION OF plt1_e FOR VALUES IN ('0006', '0007', '0008', '0011'); +INSERT INTO plt1_e SELECT i, i, 'A' || to_char(i/50, 'FM0000') FROM generate_series(0, 599, 2) i; +ANALYZE plt1_e; +-- test partition matching with N-way join +EXPLAIN (COSTS OFF) +SELECT avg(t1.a), avg(t2.b), avg(t3.a + t3.b), t1.c, t2.c, t3.c FROM plt1 t1, plt2 t2, plt1_e t3 WHERE t1.c = t2.c AND ltrim(t3.c, 'A') = t1.c GROUP BY t1.c, t2.c, t3.c ORDER BY t1.c, t2.c, t3.c; + QUERY PLAN +----------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Finalize GroupAggregate + Group Key: c, c, c + -> Sort + Sort Key: c, c + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: c + -> Partial HashAggregate + Group Key: t1.c, t2.c, t3.c + -> Result + -> Append + -> Hash Join + Hash Cond: (t1.c = t2.c) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on plt1_p1 t1 + -> Hash + -> Hash Join + Hash Cond: (t2.c = ltrim(t3.c, 'A'::text)) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on plt2_p1 t2 + -> Hash + -> Seq Scan on plt1_e_p1 t3 + -> Hash Join + Hash Cond: (t1_1.c = t2_1.c) + -> Remote Subquery Scan on all 
(datanode_1,datanode_2) + -> Seq Scan on plt1_p2 t1_1 + -> Hash + -> Hash Join + Hash Cond: (t2_1.c = ltrim(t3_1.c, 'A'::text)) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on plt2_p2 t2_1 + -> Hash + -> Seq Scan on plt1_e_p2 t3_1 + -> Hash Join + Hash Cond: (t1_2.c = t2_2.c) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on plt1_p3 t1_2 + -> Hash + -> Hash Join + Hash Cond: (t2_2.c = ltrim(t3_2.c, 'A'::text)) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on plt2_p3 t2_2 + -> Hash + -> Seq Scan on plt1_e_p3 t3_2 +(44 rows) + +SELECT avg(t1.a), avg(t2.b), avg(t3.a + t3.b), t1.c, t2.c, t3.c FROM plt1 t1, plt2 t2, plt1_e t3 WHERE t1.c = t2.c AND ltrim(t3.c, 'A') = t1.c GROUP BY t1.c, t2.c, t3.c ORDER BY t1.c, t2.c, t3.c; + avg | avg | avg | c | c | c +----------------------+----------------------+-----------------------+------+------+------- + 24.0000000000000000 | 24.0000000000000000 | 48.0000000000000000 | 0000 | 0000 | A0000 + 74.0000000000000000 | 75.0000000000000000 | 148.0000000000000000 | 0001 | 0001 | A0001 + 124.0000000000000000 | 124.5000000000000000 | 248.0000000000000000 | 0002 | 0002 | A0002 + 174.0000000000000000 | 174.0000000000000000 | 348.0000000000000000 | 0003 | 0003 | A0003 + 224.0000000000000000 | 225.0000000000000000 | 448.0000000000000000 | 0004 | 0004 | A0004 + 274.0000000000000000 | 274.5000000000000000 | 548.0000000000000000 | 0005 | 0005 | A0005 + 324.0000000000000000 | 324.0000000000000000 | 648.0000000000000000 | 0006 | 0006 | A0006 + 374.0000000000000000 | 375.0000000000000000 | 748.0000000000000000 | 0007 | 0007 | A0007 + 424.0000000000000000 | 424.5000000000000000 | 848.0000000000000000 | 0008 | 0008 | A0008 + 474.0000000000000000 | 474.0000000000000000 | 948.0000000000000000 | 0009 | 0009 | A0009 + 524.0000000000000000 | 525.0000000000000000 | 1048.0000000000000000 | 0010 | 0010 | A0010 + 574.0000000000000000 | 574.5000000000000000 | 1148.0000000000000000 | 0011 | 0011 | A0011 +(12 rows) + +-- joins where one of the relations is proven empty +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1, prt2 t2 WHERE t1.a = t2.b AND t1.a = 1 AND t1.a = 2; + QUERY PLAN +-------------------------- + Result + One-Time Filter: false +(2 rows) + +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a = 1 AND a = 2) t1 LEFT JOIN prt2 t2 ON t1.a = t2.b; + QUERY PLAN +-------------------------- + Result + One-Time Filter: false +(2 rows) + +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a = 1 AND a = 2) t1 RIGHT JOIN prt2 t2 ON t1.a = t2.b, prt1 t3 WHERE t2.b = t3.a; + QUERY PLAN +----------------------------------------------------------------------------------- + Hash Left Join + Hash Cond: (b = a) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Hash Join + Hash Cond: (t3.a = t2.b) + -> Seq Scan on prt1_p1 t3 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Seq Scan on prt2_p1 t2 + -> Hash Join + Hash Cond: (t3_1.a = t2_1.b) + -> Seq Scan on prt1_p2 t3_1 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Seq Scan on prt2_p2 t2_1 + -> Nested Loop + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Seq Scan on prt2_p3 t2_2 + -> Index Only Scan using iprt1_p3_a on prt1_p3 t3_2 + Index Cond: (a = t2_2.b) + -> Hash + -> Result + One-Time Filter: false +(27 rows) + 
+EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a = 1 AND a = 2) t1 FULL JOIN prt2 t2 ON t1.a = t2.b WHERE t2.a = 0 ORDER BY t1.a, t2.b; + QUERY PLAN +------------------------------------------------------ + Sort + Sort Key: a, b + -> Hash Left Join + Hash Cond: (b = a) + -> Remote Subquery Scan on all (datanode_2) + -> Append + -> Seq Scan on prt2_p1 t2 + Filter: (a = 0) + -> Seq Scan on prt2_p2 t2_1 + Filter: (a = 0) + -> Seq Scan on prt2_p3 t2_2 + Filter: (a = 0) + -> Hash + -> Result + One-Time Filter: false +(15 rows) + +-- +-- tests for hash partitioned tables. +-- +CREATE TABLE pht1 (a int, b int, c text) PARTITION BY HASH(c); +CREATE TABLE pht1_p1 PARTITION OF pht1 FOR VALUES WITH (MODULUS 3, REMAINDER 0); +CREATE TABLE pht1_p2 PARTITION OF pht1 FOR VALUES WITH (MODULUS 3, REMAINDER 1); +CREATE TABLE pht1_p3 PARTITION OF pht1 FOR VALUES WITH (MODULUS 3, REMAINDER 2); +INSERT INTO pht1 SELECT i, i, to_char(i/50, 'FM0000') FROM generate_series(0, 599, 2) i; +ANALYZE pht1; +CREATE TABLE pht2 (a int, b int, c text) PARTITION BY HASH(c); +CREATE TABLE pht2_p1 PARTITION OF pht2 FOR VALUES WITH (MODULUS 3, REMAINDER 0); +CREATE TABLE pht2_p2 PARTITION OF pht2 FOR VALUES WITH (MODULUS 3, REMAINDER 1); +CREATE TABLE pht2_p3 PARTITION OF pht2 FOR VALUES WITH (MODULUS 3, REMAINDER 2); +INSERT INTO pht2 SELECT i, i, to_char(i/50, 'FM0000') FROM generate_series(0, 599, 3) i; +ANALYZE pht2; +-- +-- hash partitioned by expression +-- +CREATE TABLE pht1_e (a int, b int, c text) PARTITION BY HASH(ltrim(c, 'A')); +CREATE TABLE pht1_e_p1 PARTITION OF pht1_e FOR VALUES WITH (MODULUS 3, REMAINDER 0); +CREATE TABLE pht1_e_p2 PARTITION OF pht1_e FOR VALUES WITH (MODULUS 3, REMAINDER 1); +CREATE TABLE pht1_e_p3 PARTITION OF pht1_e FOR VALUES WITH (MODULUS 3, REMAINDER 2); +INSERT INTO pht1_e SELECT i, i, 'A' || to_char(i/50, 'FM0000') FROM generate_series(0, 599, 2) i; +ANALYZE pht1_e; +-- test partition matching with N-way join +EXPLAIN (COSTS OFF) +SELECT avg(t1.a), avg(t2.b), avg(t3.a + t3.b), t1.c, t2.c, t3.c FROM pht1 t1, pht2 t2, pht1_e t3 WHERE t1.c = t2.c AND ltrim(t3.c, 'A') = t1.c GROUP BY t1.c, t2.c, t3.c ORDER BY t1.c, t2.c, t3.c; + QUERY PLAN +----------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Finalize GroupAggregate + Group Key: c, c, c + -> Sort + Sort Key: c, c + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: c + -> Partial HashAggregate + Group Key: t1.c, t2.c, t3.c + -> Result + -> Append + -> Hash Join + Hash Cond: (t1.c = t2.c) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on pht1_p1 t1 + -> Hash + -> Hash Join + Hash Cond: (t2.c = ltrim(t3.c, 'A'::text)) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on pht2_p1 t2 + -> Hash + -> Seq Scan on pht1_e_p1 t3 + -> Hash Join + Hash Cond: (t1_1.c = t2_1.c) + -> Hash Join + Hash Cond: (t1_1.c = ltrim(t3_1.c, 'A'::text)) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on pht1_p2 t1_1 + -> Hash + -> Seq Scan on pht1_e_p2 t3_1 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on pht2_p2 t2_1 + -> Hash Join + Hash Cond: (t1_2.c = t2_2.c) + -> Hash Join + Hash Cond: (t1_2.c = ltrim(t3_2.c, 'A'::text)) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on pht1_p3 t1_2 + -> Hash + -> Seq Scan on pht1_e_p3 t3_2 + -> Hash + -> Remote Subquery 
Scan on all (datanode_1,datanode_2) + -> Seq Scan on pht2_p3 t2_2 +(44 rows) + +SELECT avg(t1.a), avg(t2.b), avg(t3.a + t3.b), t1.c, t2.c, t3.c FROM pht1 t1, pht2 t2, pht1_e t3 WHERE t1.c = t2.c AND ltrim(t3.c, 'A') = t1.c GROUP BY t1.c, t2.c, t3.c ORDER BY t1.c, t2.c, t3.c; + avg | avg | avg | c | c | c +----------------------+----------------------+-----------------------+------+------+------- + 24.0000000000000000 | 24.0000000000000000 | 48.0000000000000000 | 0000 | 0000 | A0000 + 74.0000000000000000 | 75.0000000000000000 | 148.0000000000000000 | 0001 | 0001 | A0001 + 124.0000000000000000 | 124.5000000000000000 | 248.0000000000000000 | 0002 | 0002 | A0002 + 174.0000000000000000 | 174.0000000000000000 | 348.0000000000000000 | 0003 | 0003 | A0003 + 224.0000000000000000 | 225.0000000000000000 | 448.0000000000000000 | 0004 | 0004 | A0004 + 274.0000000000000000 | 274.5000000000000000 | 548.0000000000000000 | 0005 | 0005 | A0005 + 324.0000000000000000 | 324.0000000000000000 | 648.0000000000000000 | 0006 | 0006 | A0006 + 374.0000000000000000 | 375.0000000000000000 | 748.0000000000000000 | 0007 | 0007 | A0007 + 424.0000000000000000 | 424.5000000000000000 | 848.0000000000000000 | 0008 | 0008 | A0008 + 474.0000000000000000 | 474.0000000000000000 | 948.0000000000000000 | 0009 | 0009 | A0009 + 524.0000000000000000 | 525.0000000000000000 | 1048.0000000000000000 | 0010 | 0010 | A0010 + 574.0000000000000000 | 574.5000000000000000 | 1148.0000000000000000 | 0011 | 0011 | A0011 +(12 rows) + +-- +-- multiple levels of partitioning +-- +CREATE TABLE prt1_l (a int, b int, c varchar) PARTITION BY RANGE(a); +CREATE TABLE prt1_l_p1 PARTITION OF prt1_l FOR VALUES FROM (0) TO (250); +CREATE TABLE prt1_l_p2 PARTITION OF prt1_l FOR VALUES FROM (250) TO (500) PARTITION BY LIST (c); +CREATE TABLE prt1_l_p2_p1 PARTITION OF prt1_l_p2 FOR VALUES IN ('0000', '0001'); +CREATE TABLE prt1_l_p2_p2 PARTITION OF prt1_l_p2 FOR VALUES IN ('0002', '0003'); +CREATE TABLE prt1_l_p3 PARTITION OF prt1_l FOR VALUES FROM (500) TO (600) PARTITION BY RANGE (b); +CREATE TABLE prt1_l_p3_p1 PARTITION OF prt1_l_p3 FOR VALUES FROM (0) TO (13); +CREATE TABLE prt1_l_p3_p2 PARTITION OF prt1_l_p3 FOR VALUES FROM (13) TO (25); +INSERT INTO prt1_l SELECT i, i % 25, to_char(i % 4, 'FM0000') FROM generate_series(0, 599, 2) i; +ANALYZE prt1_l; +CREATE TABLE prt2_l (a int, b int, c varchar) PARTITION BY RANGE(b); +CREATE TABLE prt2_l_p1 PARTITION OF prt2_l FOR VALUES FROM (0) TO (250); +CREATE TABLE prt2_l_p2 PARTITION OF prt2_l FOR VALUES FROM (250) TO (500) PARTITION BY LIST (c); +CREATE TABLE prt2_l_p2_p1 PARTITION OF prt2_l_p2 FOR VALUES IN ('0000', '0001'); +CREATE TABLE prt2_l_p2_p2 PARTITION OF prt2_l_p2 FOR VALUES IN ('0002', '0003'); +CREATE TABLE prt2_l_p3 PARTITION OF prt2_l FOR VALUES FROM (500) TO (600) PARTITION BY RANGE (a); +CREATE TABLE prt2_l_p3_p1 PARTITION OF prt2_l_p3 FOR VALUES FROM (0) TO (13); +CREATE TABLE prt2_l_p3_p2 PARTITION OF prt2_l_p3 FOR VALUES FROM (13) TO (25); +INSERT INTO prt2_l SELECT i % 25, i, to_char(i % 4, 'FM0000') FROM generate_series(0, 599, 3) i; +ANALYZE prt2_l; +-- inner join, qual covering only top-level partitions +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_l t1, prt2_l t2 WHERE t1.a = t2.b AND t1.b = 0 ORDER BY t1.a, t2.b; + QUERY PLAN +----------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: t1.a + -> Append + -> Hash Join + Hash Cond: (t2.b = t1.a) + -> Seq Scan on prt2_l_p1 t2 + 
-> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on prt1_l_p1 t1 + Filter: (b = 0) + -> Hash Join + Hash Cond: (t2_1.b = a) + -> Append + -> Seq Scan on prt2_l_p2_p1 t2_1 + -> Seq Scan on prt2_l_p2_p2 t2_2 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on prt1_l_p2_p1 t1_1 + Filter: (b = 0) + -> Seq Scan on prt1_l_p2_p2 t1_2 + Filter: (b = 0) + -> Nested Loop + Join Filter: (a = t2_3.b) + -> Append + -> Seq Scan on prt2_l_p3_p1 t2_3 + -> Seq Scan on prt2_l_p3_p2 t2_4 + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on prt1_l_p3_p1 t1_3 + Filter: (b = 0) +(33 rows) + +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_l t1, prt2_l t2 WHERE t1.a = t2.b AND t1.b = 0 ORDER BY t1.a, t2.b; + a | c | b | c +-----+------+-----+------ + 0 | 0000 | 0 | 0000 + 150 | 0002 | 150 | 0002 + 300 | 0000 | 300 | 0000 + 450 | 0002 | 450 | 0002 +(4 rows) + +-- left join +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_l t1 LEFT JOIN prt2_l t2 ON t1.a = t2.b AND t1.c = t2.c WHERE t1.b = 0 ORDER BY t1.a, t2.b; + QUERY PLAN +------------------------------------------------------------------------------------------ + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: t1.a, t2.b + -> Append + -> Hash Right Join + Hash Cond: ((t2.b = t1.a) AND ((t2.c)::text = (t1.c)::text)) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Seq Scan on prt2_l_p1 t2 + -> Hash + -> Seq Scan on prt1_l_p1 t1 + Filter: (b = 0) + -> Hash Right Join + Hash Cond: ((t2_1.b = t1_1.a) AND ((t2_1.c)::text = (t1_1.c)::text)) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Seq Scan on prt2_l_p2_p1 t2_1 + -> Hash + -> Seq Scan on prt1_l_p2_p1 t1_1 + Filter: (b = 0) + -> Hash Right Join + Hash Cond: ((t2_2.b = t1_2.a) AND ((t2_2.c)::text = (t1_2.c)::text)) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Seq Scan on prt2_l_p2_p2 t2_2 + -> Hash + -> Seq Scan on prt1_l_p2_p2 t1_2 + Filter: (b = 0) + -> Nested Loop Left Join + Join Filter: ((a = b) AND ((c)::text = (c)::text)) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Append + -> Seq Scan on prt1_l_p3_p1 t1_3 + Filter: (b = 0) + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Append + -> Seq Scan on prt2_l_p3_p1 t2_3 + -> Seq Scan on prt2_l_p3_p2 t2_4 +(41 rows) + +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_l t1 LEFT JOIN prt2_l t2 ON t1.a = t2.b AND t1.c = t2.c WHERE t1.b = 0 ORDER BY t1.a, t2.b; + a | c | b | c +-----+------+-----+------ + 0 | 0000 | 0 | 0000 + 50 | 0002 | | + 100 | 0000 | | + 150 | 0002 | 150 | 0002 + 200 | 0000 | | + 250 | 0002 | | + 300 | 0000 | 300 | 0000 + 350 | 0002 | | + 400 | 0000 | | + 450 | 0002 | 450 | 0002 + 500 | 0000 | | + 550 | 0002 | | +(12 rows) + +-- right join +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_l t1 RIGHT JOIN prt2_l t2 ON t1.a = t2.b AND t1.c = t2.c WHERE t2.a = 0 ORDER BY t1.a, t2.b; + QUERY PLAN +-------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: t1.a, t2.b + -> Result + -> Append + -> Hash Right Join + Hash Cond: ((t1.a = t2.b) AND ((t1.c)::text = (t2.c)::text)) + -> Seq Scan on prt1_l_p1 t1 + -> Hash + -> Remote Subquery Scan on all 
(datanode_2) + Distribute results by H: b + -> Seq Scan on prt2_l_p1 t2 + Filter: (a = 0) + -> Nested Loop Left Join + Join Filter: ((t1_1.a = t2_1.b) AND ((t1_1.c)::text = (t2_1.c)::text)) + -> Remote Subquery Scan on all (datanode_2) + Distribute results by H: b + -> Seq Scan on prt2_l_p2_p1 t2_1 + Filter: (a = 0) + -> Seq Scan on prt1_l_p2_p1 t1_1 + -> Hash Right Join + Hash Cond: ((t1_2.a = t2_2.b) AND ((t1_2.c)::text = (t2_2.c)::text)) + -> Seq Scan on prt1_l_p2_p2 t1_2 + -> Hash + -> Remote Subquery Scan on all (datanode_2) + Distribute results by H: b + -> Seq Scan on prt2_l_p2_p2 t2_2 + Filter: (a = 0) + -> Nested Loop Left Join + Join Filter: ((a = b) AND ((c)::text = (c)::text)) + -> Remote Subquery Scan on all (datanode_2) + Distribute results by H: b + -> Append + -> Seq Scan on prt2_l_p3_p1 t2_3 + Filter: (a = 0) + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Append + -> Seq Scan on prt1_l_p3_p1 t1_3 + -> Seq Scan on prt1_l_p3_p2 t1_4 +(41 rows) + +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_l t1 RIGHT JOIN prt2_l t2 ON t1.a = t2.b AND t1.c = t2.c WHERE t2.a = 0 ORDER BY t1.a, t2.b; + a | c | b | c +-----+------+-----+------ + 0 | 0000 | 0 | 0000 + 150 | 0002 | 150 | 0002 + 300 | 0000 | 300 | 0000 + 450 | 0002 | 450 | 0002 + | | 75 | 0003 + | | 225 | 0001 + | | 375 | 0003 + | | 525 | 0001 +(8 rows) + +-- full join +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1_l WHERE prt1_l.b = 0) t1 FULL JOIN (SELECT * FROM prt2_l WHERE prt2_l.a = 0) t2 ON (t1.a = t2.b AND t1.c = t2.c) ORDER BY t1.a, t2.b; + QUERY PLAN +-------------------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: prt1_l_p1.a, prt2_l_p1.b + -> Append + -> Hash Full Join + Hash Cond: ((prt1_l_p1.a = prt2_l_p1.b) AND ((prt1_l_p1.c)::text = (prt2_l_p1.c)::text)) + -> Seq Scan on prt1_l_p1 + Filter: (b = 0) + -> Hash + -> Remote Subquery Scan on all (datanode_2) + Distribute results by H: b + -> Seq Scan on prt2_l_p1 + Filter: (a = 0) + -> Hash Full Join + Hash Cond: ((prt1_l_p2_p1.a = prt2_l_p2_p1.b) AND ((prt1_l_p2_p1.c)::text = (prt2_l_p2_p1.c)::text)) + -> Seq Scan on prt1_l_p2_p1 + Filter: (b = 0) + -> Hash + -> Remote Subquery Scan on all (datanode_2) + Distribute results by H: b + -> Seq Scan on prt2_l_p2_p1 + Filter: (a = 0) + -> Hash Full Join + Hash Cond: ((prt1_l_p2_p2.a = prt2_l_p2_p2.b) AND ((prt1_l_p2_p2.c)::text = (prt2_l_p2_p2.c)::text)) + -> Seq Scan on prt1_l_p2_p2 + Filter: (b = 0) + -> Hash + -> Remote Subquery Scan on all (datanode_2) + Distribute results by H: b + -> Seq Scan on prt2_l_p2_p2 + Filter: (a = 0) + -> Hash Full Join + Hash Cond: ((a = b) AND ((c)::text = (c)::text)) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Append + -> Seq Scan on prt1_l_p3_p1 + Filter: (b = 0) + -> Hash + -> Remote Subquery Scan on all (datanode_2) + Distribute results by H: b + -> Append + -> Seq Scan on prt2_l_p3_p1 + Filter: (a = 0) +(44 rows) + +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1_l WHERE prt1_l.b = 0) t1 FULL JOIN (SELECT * FROM prt2_l WHERE prt2_l.a = 0) t2 ON (t1.a = t2.b AND t1.c = t2.c) ORDER BY t1.a, t2.b; + a | c | b | c +-----+------+-----+------ + 0 | 0000 | 0 | 0000 + 50 | 0002 | | + 100 | 0000 | | + 150 | 0002 | 150 | 0002 + 200 | 0000 | | + 250 | 0002 | | + 300 | 0000 | 300 | 0000 + 350 | 0002 | | + 400 | 0000 | | + 450 | 
0002 | 450 | 0002 + 500 | 0000 | | + 550 | 0002 | | + | | 75 | 0003 + | | 225 | 0001 + | | 375 | 0003 + | | 525 | 0001 +(16 rows) + +-- lateral partition-wise join +EXPLAIN (COSTS OFF) +SELECT * FROM prt1_l t1 LEFT JOIN LATERAL + (SELECT t2.a AS t2a, t2.c AS t2c, t2.b AS t2b, t3.b AS t3b, least(t1.a,t2.a,t3.b) FROM prt1_l t2 JOIN prt2_l t3 ON (t2.a = t3.b AND t2.c = t3.c)) ss + ON t1.a = ss.t2a AND t1.c = ss.t2c WHERE t1.b = 0 ORDER BY t1.a; +ERROR: could not devise a query plan for the given query +SELECT * FROM prt1_l t1 LEFT JOIN LATERAL + (SELECT t2.a AS t2a, t2.c AS t2c, t2.b AS t2b, t3.b AS t3b, least(t1.a,t2.a,t3.b) FROM prt1_l t2 JOIN prt2_l t3 ON (t2.a = t3.b AND t2.c = t3.c)) ss + ON t1.a = ss.t2a AND t1.c = ss.t2c WHERE t1.b = 0 ORDER BY t1.a; + a | b | c | t2a | t2c | t2b | t3b | least +-----+---+------+-----+------+-----+-----+------- + 0 | 0 | 0000 | 0 | 0000 | 0 | 0 | 0 + 50 | 0 | 0002 | | | | | + 100 | 0 | 0000 | | | | | + 150 | 0 | 0002 | 150 | 0002 | 0 | 150 | 150 + 200 | 0 | 0000 | | | | | + 250 | 0 | 0002 | | | | | + 300 | 0 | 0000 | 300 | 0000 | 0 | 300 | 300 + 350 | 0 | 0002 | | | | | + 400 | 0 | 0000 | | | | | + 450 | 0 | 0002 | 450 | 0002 | 0 | 450 | 450 + 500 | 0 | 0000 | | | | | + 550 | 0 | 0002 | | | | | +(12 rows) + +-- join with one side empty +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1_l WHERE a = 1 AND a = 2) t1 RIGHT JOIN prt2_l t2 ON t1.a = t2.b AND t1.b = t2.a AND t1.c = t2.c; + QUERY PLAN +---------------------------------------------------------------- + Hash Left Join + Hash Cond: ((b = a) AND (a = b) AND ((c)::text = (c)::text)) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on prt2_l_p1 t2 + -> Seq Scan on prt2_l_p2_p1 t2_1 + -> Seq Scan on prt2_l_p2_p2 t2_2 + -> Seq Scan on prt2_l_p3_p1 t2_3 + -> Seq Scan on prt2_l_p3_p2 t2_4 + -> Hash + -> Result + One-Time Filter: false +(12 rows) + +-- +-- negative testcases +-- +CREATE TABLE prt1_n (a int, b int, c varchar) PARTITION BY RANGE(c); +CREATE TABLE prt1_n_p1 PARTITION OF prt1_n FOR VALUES FROM ('0000') TO ('0250'); +CREATE TABLE prt1_n_p2 PARTITION OF prt1_n FOR VALUES FROM ('0250') TO ('0500'); +INSERT INTO prt1_n SELECT i, i, to_char(i, 'FM0000') FROM generate_series(0, 499, 2) i; +ANALYZE prt1_n; +CREATE TABLE prt2_n (a int, b int, c text) PARTITION BY LIST(c); +CREATE TABLE prt2_n_p1 PARTITION OF prt2_n FOR VALUES IN ('0000', '0003', '0004', '0010', '0006', '0007'); +CREATE TABLE prt2_n_p2 PARTITION OF prt2_n FOR VALUES IN ('0001', '0005', '0002', '0009', '0008', '0011'); +INSERT INTO prt2_n SELECT i, i, to_char(i/50, 'FM0000') FROM generate_series(0, 599, 2) i; +ANALYZE prt2_n; +CREATE TABLE prt3_n (a int, b int, c text) PARTITION BY LIST(c); +CREATE TABLE prt3_n_p1 PARTITION OF prt3_n FOR VALUES IN ('0000', '0004', '0006', '0007'); +CREATE TABLE prt3_n_p2 PARTITION OF prt3_n FOR VALUES IN ('0001', '0002', '0008', '0010'); +CREATE TABLE prt3_n_p3 PARTITION OF prt3_n FOR VALUES IN ('0003', '0005', '0009', '0011'); +INSERT INTO prt2_n SELECT i, i, to_char(i/50, 'FM0000') FROM generate_series(0, 599, 2) i; +ANALYZE prt3_n; +CREATE TABLE prt4_n (a int, b int, c text) PARTITION BY RANGE(a); +CREATE TABLE prt4_n_p1 PARTITION OF prt4_n FOR VALUES FROM (0) TO (300); +CREATE TABLE prt4_n_p2 PARTITION OF prt4_n FOR VALUES FROM (300) TO (500); +CREATE TABLE prt4_n_p3 PARTITION OF prt4_n FOR VALUES FROM (500) TO (600); +INSERT INTO prt4_n SELECT i, i, to_char(i, 'FM0000') FROM generate_series(0, 599, 2) i; +ANALYZE prt4_n; +-- 
partition-wise join can not be applied if the partition ranges differ +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1, prt4_n t2 WHERE t1.a = t2.a; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Hash Join + Hash Cond: (t1.a = t2.a) + -> Append + -> Seq Scan on prt1_p1 t1 + -> Seq Scan on prt1_p2 t1_1 + -> Seq Scan on prt1_p3 t1_2 + -> Hash + -> Append + -> Seq Scan on prt4_n_p1 t2 + -> Seq Scan on prt4_n_p2 t2_1 + -> Seq Scan on prt4_n_p3 t2_2 +(12 rows) + +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1, prt4_n t2, prt2 t3 WHERE t1.a = t2.a and t1.a = t3.b; + QUERY PLAN +----------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Hash Join + Hash Cond: (t1.a = t2.a) + -> Append + -> Hash Join + Hash Cond: (t1.a = t3.b) + -> Seq Scan on prt1_p1 t1 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Seq Scan on prt2_p1 t3 + -> Hash Join + Hash Cond: (t1_1.a = t3_1.b) + -> Seq Scan on prt1_p2 t1_1 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Seq Scan on prt2_p2 t3_1 + -> Nested Loop + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Seq Scan on prt2_p3 t3_2 + -> Index Scan using iprt1_p3_a on prt1_p3 t1_2 + Index Cond: (a = t3_2.b) + -> Hash + -> Append + -> Seq Scan on prt4_n_p1 t2 + -> Seq Scan on prt4_n_p2 t2_1 + -> Seq Scan on prt4_n_p3 t2_2 +(29 rows) + +-- partition-wise join can not be applied if there are no equi-join conditions +-- between partition keys +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1 LEFT JOIN prt2 t2 ON (t1.a < t2.b); + QUERY PLAN +----------------------------------------------------------------- + Nested Loop Left Join + Join Filter: (a < b) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on prt1_p1 t1 + -> Seq Scan on prt1_p2 t1_1 + -> Seq Scan on prt1_p3 t1_2 + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on prt2_p1 t2 + -> Seq Scan on prt2_p2 t2_1 + -> Seq Scan on prt2_p3 t2_2 +(13 rows) + +-- equi-join with join condition on partial keys does not qualify for +-- partition-wise join +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_m t1, prt2_m t2 WHERE t1.a = (t2.b + t2.a)/2; + QUERY PLAN +----------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Hash Join + Hash Cond: ((((b + a) / 2)) = a) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: ((b + a) / 2) + -> Result + -> Append + -> Seq Scan on prt2_m_p1 t2 + -> Seq Scan on prt2_m_p2 t2_1 + -> Seq Scan on prt2_m_p3 t2_2 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Append + -> Seq Scan on prt1_m_p1 t1 + -> Seq Scan on prt1_m_p2 t1_1 + -> Seq Scan on prt1_m_p3 t1_2 +(17 rows) + +-- equi-join between out-of-order partition key columns does not qualify for +-- partition-wise join +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_m t1 LEFT JOIN prt2_m t2 ON t1.a = t2.b; + QUERY PLAN +----------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Hash Left Join + Hash Cond: (t1.a = b) + -> Append + -> Seq Scan on prt1_m_p1 t1 + 
-> Seq Scan on prt1_m_p2 t1_1 + -> Seq Scan on prt1_m_p3 t1_2 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on prt2_m_p1 t2 + -> Seq Scan on prt2_m_p2 t2_1 + -> Seq Scan on prt2_m_p3 t2_2 +(13 rows) + +-- equi-join between non-key columns does not qualify for partition-wise join +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_m t1 LEFT JOIN prt2_m t2 ON t1.c = t2.c; + QUERY PLAN +----------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Hash Left Join + Hash Cond: (t1.c = c) + -> Append + -> Seq Scan on prt1_m_p1 t1 + -> Seq Scan on prt1_m_p2 t1_1 + -> Seq Scan on prt1_m_p3 t1_2 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on prt2_m_p1 t2 + -> Seq Scan on prt2_m_p2 t2_1 + -> Seq Scan on prt2_m_p3 t2_2 +(13 rows) + +-- partition-wise join can not be applied between tables with different +-- partition lists +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_n t1 LEFT JOIN prt2_n t2 ON (t1.c = t2.c); + QUERY PLAN +----------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Hash Right Join + Hash Cond: (c = (c)::text) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: c + -> Append + -> Seq Scan on prt2_n_p1 t2 + -> Seq Scan on prt2_n_p2 t2_1 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: c + -> Append + -> Seq Scan on prt1_n_p1 t1 + -> Seq Scan on prt1_n_p2 t1_1 +(14 rows) + +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_n t1 JOIN prt2_n t2 ON (t1.c = t2.c) JOIN plt1 t3 ON (t1.c = t3.c); + QUERY PLAN +----------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Hash Join + Hash Cond: (c = (c)::text) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on plt1_p1 t3 + -> Seq Scan on plt1_p2 t3_1 + -> Seq Scan on plt1_p3 t3_2 + -> Hash + -> Hash Join + Hash Cond: (t2.c = (c)::text) + -> Append + -> Seq Scan on prt2_n_p1 t2 + -> Seq Scan on prt2_n_p2 t2_1 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on prt1_n_p1 t1 + -> Seq Scan on prt1_n_p2 t1_1 +(19 rows) + +-- partition-wise join can not be applied for a join between list and range +-- partitioned table +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_n t1 FULL JOIN prt1 t2 ON (t1.c = t2.c); + QUERY PLAN +----------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Hash Full Join + Hash Cond: ((c)::text = (c)::text) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: c + -> Append + -> Seq Scan on prt1_p1 t2 + -> Seq Scan on prt1_p2 t2_1 + -> Seq Scan on prt1_p3 t2_2 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: c + -> Append + -> Seq Scan on prt1_n_p1 t1 + -> Seq Scan on prt1_n_p2 t1_1 +(15 rows) + diff --git a/src/test/regress/expected/partition_prune.out b/src/test/regress/expected/partition_prune.out index ff388472..61bbdf23 100644 --- a/src/test/regress/expected/partition_prune.out +++ b/src/test/regress/expected/partition_prune.out @@ -9,140 +9,164 @@ create table lp_bc partition of lp for values in ('b', 'c'); create table lp_g partition of lp for values in ('g'); 
create table lp_null partition of lp for values in (null); explain (costs off) select * from lp; - QUERY PLAN ------------------------------- - Append - -> Seq Scan on lp_ad - -> Seq Scan on lp_bc - -> Seq Scan on lp_ef - -> Seq Scan on lp_g - -> Seq Scan on lp_null - -> Seq Scan on lp_default -(7 rows) + QUERY PLAN +------------------------------------ + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on lp_ad + -> Seq Scan on lp_bc + -> Seq Scan on lp_ef + -> Seq Scan on lp_g + -> Seq Scan on lp_null + -> Seq Scan on lp_default +(9 rows) explain (costs off) select * from lp where a > 'a' and a < 'd'; - QUERY PLAN ------------------------------------------------------------ - Append - -> Seq Scan on lp_bc - Filter: ((a > 'a'::bpchar) AND (a < 'd'::bpchar)) - -> Seq Scan on lp_default - Filter: ((a > 'a'::bpchar) AND (a < 'd'::bpchar)) -(5 rows) + QUERY PLAN +----------------------------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on lp_bc + Filter: ((a > 'a'::bpchar) AND (a < 'd'::bpchar)) + -> Seq Scan on lp_default + Filter: ((a > 'a'::bpchar) AND (a < 'd'::bpchar)) +(7 rows) explain (costs off) select * from lp where a > 'a' and a <= 'd'; - QUERY PLAN ------------------------------------------------------------- - Append - -> Seq Scan on lp_ad - Filter: ((a > 'a'::bpchar) AND (a <= 'd'::bpchar)) - -> Seq Scan on lp_bc - Filter: ((a > 'a'::bpchar) AND (a <= 'd'::bpchar)) - -> Seq Scan on lp_default - Filter: ((a > 'a'::bpchar) AND (a <= 'd'::bpchar)) -(7 rows) + QUERY PLAN +------------------------------------------------------------------ + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on lp_ad + Filter: ((a > 'a'::bpchar) AND (a <= 'd'::bpchar)) + -> Seq Scan on lp_bc + Filter: ((a > 'a'::bpchar) AND (a <= 'd'::bpchar)) + -> Seq Scan on lp_default + Filter: ((a > 'a'::bpchar) AND (a <= 'd'::bpchar)) +(9 rows) explain (costs off) select * from lp where a = 'a'; - QUERY PLAN ------------------------------------ - Append - -> Seq Scan on lp_ad - Filter: (a = 'a'::bpchar) -(3 rows) + QUERY PLAN +----------------------------------------- + Remote Fast Query Execution + Node/s: datanode_2 + -> Append + -> Seq Scan on lp_ad + Filter: (a = 'a'::bpchar) +(5 rows) explain (costs off) select * from lp where 'a' = a; /* commuted */ - QUERY PLAN ------------------------------------ - Append - -> Seq Scan on lp_ad - Filter: ('a'::bpchar = a) -(3 rows) + QUERY PLAN +----------------------------------------- + Remote Fast Query Execution + Node/s: datanode_2 + -> Append + -> Seq Scan on lp_ad + Filter: ('a'::bpchar = a) +(5 rows) explain (costs off) select * from lp where a is not null; - QUERY PLAN ---------------------------------- - Append - -> Seq Scan on lp_ad - Filter: (a IS NOT NULL) - -> Seq Scan on lp_bc - Filter: (a IS NOT NULL) - -> Seq Scan on lp_ef - Filter: (a IS NOT NULL) - -> Seq Scan on lp_g - Filter: (a IS NOT NULL) - -> Seq Scan on lp_default - Filter: (a IS NOT NULL) -(11 rows) + QUERY PLAN +--------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on lp_ad + Filter: (a IS NOT NULL) + -> Seq Scan on lp_bc + Filter: (a IS NOT NULL) + -> Seq Scan on lp_ef + Filter: (a IS NOT NULL) + -> Seq Scan on lp_g + Filter: (a IS NOT NULL) + -> Seq Scan on lp_default + Filter: (a IS NOT NULL) +(13 rows) explain (costs off) select * from lp where a is null; - QUERY 
PLAN ------------------------------ - Append - -> Seq Scan on lp_null - Filter: (a IS NULL) -(3 rows) + QUERY PLAN +----------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on lp_null + Filter: (a IS NULL) +(5 rows) explain (costs off) select * from lp where a = 'a' or a = 'c'; - QUERY PLAN ----------------------------------------------------------- - Append - -> Seq Scan on lp_ad - Filter: ((a = 'a'::bpchar) OR (a = 'c'::bpchar)) - -> Seq Scan on lp_bc - Filter: ((a = 'a'::bpchar) OR (a = 'c'::bpchar)) -(5 rows) + QUERY PLAN +---------------------------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on lp_ad + Filter: ((a = 'a'::bpchar) OR (a = 'c'::bpchar)) + -> Seq Scan on lp_bc + Filter: ((a = 'a'::bpchar) OR (a = 'c'::bpchar)) +(7 rows) explain (costs off) select * from lp where a is not null and (a = 'a' or a = 'c'); - QUERY PLAN --------------------------------------------------------------------------------- - Append - -> Seq Scan on lp_ad - Filter: ((a IS NOT NULL) AND ((a = 'a'::bpchar) OR (a = 'c'::bpchar))) - -> Seq Scan on lp_bc - Filter: ((a IS NOT NULL) AND ((a = 'a'::bpchar) OR (a = 'c'::bpchar))) -(5 rows) + QUERY PLAN +-------------------------------------------------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on lp_ad + Filter: ((a IS NOT NULL) AND ((a = 'a'::bpchar) OR (a = 'c'::bpchar))) + -> Seq Scan on lp_bc + Filter: ((a IS NOT NULL) AND ((a = 'a'::bpchar) OR (a = 'c'::bpchar))) +(7 rows) explain (costs off) select * from lp where a <> 'g'; - QUERY PLAN ------------------------------------- - Append - -> Seq Scan on lp_ad - Filter: (a <> 'g'::bpchar) - -> Seq Scan on lp_bc - Filter: (a <> 'g'::bpchar) - -> Seq Scan on lp_ef - Filter: (a <> 'g'::bpchar) - -> Seq Scan on lp_default - Filter: (a <> 'g'::bpchar) -(9 rows) + QUERY PLAN +------------------------------------------ + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on lp_ad + Filter: (a <> 'g'::bpchar) + -> Seq Scan on lp_bc + Filter: (a <> 'g'::bpchar) + -> Seq Scan on lp_ef + Filter: (a <> 'g'::bpchar) + -> Seq Scan on lp_default + Filter: (a <> 'g'::bpchar) +(11 rows) explain (costs off) select * from lp where a <> 'a' and a <> 'd'; - QUERY PLAN -------------------------------------------------------------- - Append - -> Seq Scan on lp_bc - Filter: ((a <> 'a'::bpchar) AND (a <> 'd'::bpchar)) - -> Seq Scan on lp_ef - Filter: ((a <> 'a'::bpchar) AND (a <> 'd'::bpchar)) - -> Seq Scan on lp_g - Filter: ((a <> 'a'::bpchar) AND (a <> 'd'::bpchar)) - -> Seq Scan on lp_default - Filter: ((a <> 'a'::bpchar) AND (a <> 'd'::bpchar)) -(9 rows) + QUERY PLAN +------------------------------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on lp_bc + Filter: ((a <> 'a'::bpchar) AND (a <> 'd'::bpchar)) + -> Seq Scan on lp_ef + Filter: ((a <> 'a'::bpchar) AND (a <> 'd'::bpchar)) + -> Seq Scan on lp_g + Filter: ((a <> 'a'::bpchar) AND (a <> 'd'::bpchar)) + -> Seq Scan on lp_default + Filter: ((a <> 'a'::bpchar) AND (a <> 'd'::bpchar)) +(11 rows) explain (costs off) select * from lp where a not in ('a', 'd'); - QUERY PLAN ------------------------------------------------- - Append - -> Seq Scan on lp_bc - Filter: (a <> ALL ('{a,d}'::bpchar[])) - -> Seq Scan on lp_ef - Filter: (a <> ALL 
('{a,d}'::bpchar[])) - -> Seq Scan on lp_g - Filter: (a <> ALL ('{a,d}'::bpchar[])) - -> Seq Scan on lp_default - Filter: (a <> ALL ('{a,d}'::bpchar[])) -(9 rows) + QUERY PLAN +------------------------------------------------------ + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on lp_bc + Filter: (a <> ALL ('{a,d}'::bpchar[])) + -> Seq Scan on lp_ef + Filter: (a <> ALL ('{a,d}'::bpchar[])) + -> Seq Scan on lp_g + Filter: (a <> ALL ('{a,d}'::bpchar[])) + -> Seq Scan on lp_default + Filter: (a <> ALL ('{a,d}'::bpchar[])) +(11 rows) -- collation matches the partitioning collation, pruning works create table coll_pruning (a text collate "C") partition by list (a); @@ -150,25 +174,29 @@ create table coll_pruning_a partition of coll_pruning for values in ('a'); create table coll_pruning_b partition of coll_pruning for values in ('b'); create table coll_pruning_def partition of coll_pruning default; explain (costs off) select * from coll_pruning where a collate "C" = 'a' collate "C"; - QUERY PLAN ---------------------------------------------- - Append - -> Seq Scan on coll_pruning_a - Filter: (a = 'a'::text COLLATE "C") -(3 rows) + QUERY PLAN +--------------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on coll_pruning_a + Filter: (a = 'a'::text COLLATE "C") +(5 rows) -- collation doesn't match the partitioning collation, no pruning occurs explain (costs off) select * from coll_pruning where a collate "POSIX" = 'a' collate "POSIX"; - QUERY PLAN ---------------------------------------------------------- - Append - -> Seq Scan on coll_pruning_a - Filter: ((a)::text = 'a'::text COLLATE "POSIX") - -> Seq Scan on coll_pruning_b - Filter: ((a)::text = 'a'::text COLLATE "POSIX") - -> Seq Scan on coll_pruning_def - Filter: ((a)::text = 'a'::text COLLATE "POSIX") -(7 rows) + QUERY PLAN +--------------------------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on coll_pruning_a + Filter: ((a)::text = 'a'::text COLLATE "POSIX") + -> Seq Scan on coll_pruning_b + Filter: ((a)::text = 'a'::text COLLATE "POSIX") + -> Seq Scan on coll_pruning_def + Filter: ((a)::text = 'a'::text COLLATE "POSIX") +(9 rows) create table rlp (a int, b varchar) partition by range (a); create table rlp_default partition of rlp default partition by list (a); @@ -178,7 +206,7 @@ create table rlp_default_30 partition of rlp_default for values in (30); create table rlp_default_null partition of rlp_default for values in (null); create table rlp1 partition of rlp for values from (minvalue) to (1); create table rlp2 partition of rlp for values from (1) to (10); -create table rlp3 (b varchar, a int) partition by list (b varchar_ops); +create table rlp3 (a int, b varchar) partition by list (b varchar_ops); create table rlp3_default partition of rlp3 default; create table rlp3abcd partition of rlp3 for values in ('ab', 'cd'); create table rlp3efgh partition of rlp3 for values in ('ef', 'gh'); @@ -192,480 +220,544 @@ create table rlp5 partition of rlp for values from (31) to (maxvalue) partition create table rlp5_default partition of rlp5 default; create table rlp5_1 partition of rlp5 for values from (31) to (40); explain (costs off) select * from rlp where a < 1; - QUERY PLAN -------------------------- - Append - -> Seq Scan on rlp1 - Filter: (a < 1) -(3 rows) + QUERY PLAN +---------------------------------- + Remote Fast Query Execution + 
Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on rlp1 + Filter: (a < 1) +(5 rows) explain (costs off) select * from rlp where 1 > a; /* commuted */ - QUERY PLAN -------------------------- - Append - -> Seq Scan on rlp1 - Filter: (1 > a) -(3 rows) + QUERY PLAN +---------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on rlp1 + Filter: (1 > a) +(5 rows) explain (costs off) select * from rlp where a <= 1; - QUERY PLAN --------------------------- - Append - -> Seq Scan on rlp1 - Filter: (a <= 1) - -> Seq Scan on rlp2 - Filter: (a <= 1) -(5 rows) + QUERY PLAN +---------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on rlp1 + Filter: (a <= 1) + -> Seq Scan on rlp2 + Filter: (a <= 1) +(7 rows) explain (costs off) select * from rlp where a = 1; - QUERY PLAN -------------------------- - Append - -> Seq Scan on rlp2 - Filter: (a = 1) -(3 rows) + QUERY PLAN +------------------------------- + Remote Fast Query Execution + Node/s: datanode_1 + -> Append + -> Seq Scan on rlp2 + Filter: (a = 1) +(5 rows) explain (costs off) select * from rlp where a = 1::bigint; /* same as above */ - QUERY PLAN ------------------------------------ - Append - -> Seq Scan on rlp2 - Filter: (a = '1'::bigint) -(3 rows) + QUERY PLAN +----------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1 + -> Append + -> Seq Scan on rlp2 + Filter: (a = '1'::bigint) +(5 rows) explain (costs off) select * from rlp where a = 1::numeric; /* no pruning */ - QUERY PLAN ------------------------------------------------ - Append - -> Seq Scan on rlp1 - Filter: ((a)::numeric = '1'::numeric) - -> Seq Scan on rlp2 - Filter: ((a)::numeric = '1'::numeric) - -> Seq Scan on rlp3abcd - Filter: ((a)::numeric = '1'::numeric) - -> Seq Scan on rlp3efgh - Filter: ((a)::numeric = '1'::numeric) - -> Seq Scan on rlp3nullxy - Filter: ((a)::numeric = '1'::numeric) - -> Seq Scan on rlp3_default - Filter: ((a)::numeric = '1'::numeric) - -> Seq Scan on rlp4_1 - Filter: ((a)::numeric = '1'::numeric) - -> Seq Scan on rlp4_2 - Filter: ((a)::numeric = '1'::numeric) - -> Seq Scan on rlp4_default - Filter: ((a)::numeric = '1'::numeric) - -> Seq Scan on rlp5_1 - Filter: ((a)::numeric = '1'::numeric) - -> Seq Scan on rlp5_default - Filter: ((a)::numeric = '1'::numeric) - -> Seq Scan on rlp_default_10 - Filter: ((a)::numeric = '1'::numeric) - -> Seq Scan on rlp_default_30 - Filter: ((a)::numeric = '1'::numeric) - -> Seq Scan on rlp_default_null - Filter: ((a)::numeric = '1'::numeric) - -> Seq Scan on rlp_default_default - Filter: ((a)::numeric = '1'::numeric) -(31 rows) + QUERY PLAN +----------------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on rlp1 + Filter: ((a)::numeric = '1'::numeric) + -> Seq Scan on rlp2 + Filter: ((a)::numeric = '1'::numeric) + -> Seq Scan on rlp3abcd + Filter: ((a)::numeric = '1'::numeric) + -> Seq Scan on rlp3efgh + Filter: ((a)::numeric = '1'::numeric) + -> Seq Scan on rlp3nullxy + Filter: ((a)::numeric = '1'::numeric) + -> Seq Scan on rlp3_default + Filter: ((a)::numeric = '1'::numeric) + -> Seq Scan on rlp4_1 + Filter: ((a)::numeric = '1'::numeric) + -> Seq Scan on rlp4_2 + Filter: ((a)::numeric = '1'::numeric) + -> Seq Scan on rlp4_default + Filter: ((a)::numeric = '1'::numeric) + -> Seq Scan on rlp5_1 + Filter: ((a)::numeric = '1'::numeric) + -> Seq Scan on rlp5_default + Filter: 
((a)::numeric = '1'::numeric) + -> Seq Scan on rlp_default_10 + Filter: ((a)::numeric = '1'::numeric) + -> Seq Scan on rlp_default_30 + Filter: ((a)::numeric = '1'::numeric) + -> Seq Scan on rlp_default_null + Filter: ((a)::numeric = '1'::numeric) + -> Seq Scan on rlp_default_default + Filter: ((a)::numeric = '1'::numeric) +(33 rows) explain (costs off) select * from rlp where a <= 10; - QUERY PLAN ---------------------------------------- - Append - -> Seq Scan on rlp1 - Filter: (a <= 10) - -> Seq Scan on rlp2 - Filter: (a <= 10) - -> Seq Scan on rlp_default_10 - Filter: (a <= 10) - -> Seq Scan on rlp_default_default - Filter: (a <= 10) -(9 rows) + QUERY PLAN +--------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on rlp1 + Filter: (a <= 10) + -> Seq Scan on rlp2 + Filter: (a <= 10) + -> Seq Scan on rlp_default_10 + Filter: (a <= 10) + -> Seq Scan on rlp_default_default + Filter: (a <= 10) +(11 rows) explain (costs off) select * from rlp where a > 10; - QUERY PLAN ---------------------------------------- - Append - -> Seq Scan on rlp3abcd - Filter: (a > 10) - -> Seq Scan on rlp3efgh - Filter: (a > 10) - -> Seq Scan on rlp3nullxy - Filter: (a > 10) - -> Seq Scan on rlp3_default - Filter: (a > 10) - -> Seq Scan on rlp4_1 - Filter: (a > 10) - -> Seq Scan on rlp4_2 - Filter: (a > 10) - -> Seq Scan on rlp4_default - Filter: (a > 10) - -> Seq Scan on rlp5_1 - Filter: (a > 10) - -> Seq Scan on rlp5_default - Filter: (a > 10) - -> Seq Scan on rlp_default_30 - Filter: (a > 10) - -> Seq Scan on rlp_default_default - Filter: (a > 10) -(23 rows) + QUERY PLAN +--------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on rlp3abcd + Filter: (a > 10) + -> Seq Scan on rlp3efgh + Filter: (a > 10) + -> Seq Scan on rlp3nullxy + Filter: (a > 10) + -> Seq Scan on rlp3_default + Filter: (a > 10) + -> Seq Scan on rlp4_1 + Filter: (a > 10) + -> Seq Scan on rlp4_2 + Filter: (a > 10) + -> Seq Scan on rlp4_default + Filter: (a > 10) + -> Seq Scan on rlp5_1 + Filter: (a > 10) + -> Seq Scan on rlp5_default + Filter: (a > 10) + -> Seq Scan on rlp_default_30 + Filter: (a > 10) + -> Seq Scan on rlp_default_default + Filter: (a > 10) +(25 rows) explain (costs off) select * from rlp where a < 15; - QUERY PLAN ---------------------------------------- - Append - -> Seq Scan on rlp1 - Filter: (a < 15) - -> Seq Scan on rlp2 - Filter: (a < 15) - -> Seq Scan on rlp_default_10 - Filter: (a < 15) - -> Seq Scan on rlp_default_default - Filter: (a < 15) -(9 rows) + QUERY PLAN +--------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on rlp1 + Filter: (a < 15) + -> Seq Scan on rlp2 + Filter: (a < 15) + -> Seq Scan on rlp_default_10 + Filter: (a < 15) + -> Seq Scan on rlp_default_default + Filter: (a < 15) +(11 rows) explain (costs off) select * from rlp where a <= 15; - QUERY PLAN ---------------------------------------- - Append - -> Seq Scan on rlp1 - Filter: (a <= 15) - -> Seq Scan on rlp2 - Filter: (a <= 15) - -> Seq Scan on rlp3abcd - Filter: (a <= 15) - -> Seq Scan on rlp3efgh - Filter: (a <= 15) - -> Seq Scan on rlp3nullxy - Filter: (a <= 15) - -> Seq Scan on rlp3_default - Filter: (a <= 15) - -> Seq Scan on rlp_default_10 - Filter: (a <= 15) - -> Seq Scan on rlp_default_default - Filter: (a <= 15) -(17 rows) + QUERY PLAN +--------------------------------------------- + Remote Fast Query Execution 
+ Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on rlp1 + Filter: (a <= 15) + -> Seq Scan on rlp2 + Filter: (a <= 15) + -> Seq Scan on rlp3abcd + Filter: (a <= 15) + -> Seq Scan on rlp3efgh + Filter: (a <= 15) + -> Seq Scan on rlp3nullxy + Filter: (a <= 15) + -> Seq Scan on rlp3_default + Filter: (a <= 15) + -> Seq Scan on rlp_default_10 + Filter: (a <= 15) + -> Seq Scan on rlp_default_default + Filter: (a <= 15) +(19 rows) explain (costs off) select * from rlp where a > 15 and b = 'ab'; - QUERY PLAN ---------------------------------------------------------- - Append - -> Seq Scan on rlp3abcd - Filter: ((a > 15) AND ((b)::text = 'ab'::text)) - -> Seq Scan on rlp4_1 - Filter: ((a > 15) AND ((b)::text = 'ab'::text)) - -> Seq Scan on rlp4_2 - Filter: ((a > 15) AND ((b)::text = 'ab'::text)) - -> Seq Scan on rlp4_default - Filter: ((a > 15) AND ((b)::text = 'ab'::text)) - -> Seq Scan on rlp5_1 - Filter: ((a > 15) AND ((b)::text = 'ab'::text)) - -> Seq Scan on rlp5_default - Filter: ((a > 15) AND ((b)::text = 'ab'::text)) - -> Seq Scan on rlp_default_30 - Filter: ((a > 15) AND ((b)::text = 'ab'::text)) - -> Seq Scan on rlp_default_default - Filter: ((a > 15) AND ((b)::text = 'ab'::text)) -(17 rows) + QUERY PLAN +--------------------------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on rlp3abcd + Filter: ((a > 15) AND ((b)::text = 'ab'::text)) + -> Seq Scan on rlp4_1 + Filter: ((a > 15) AND ((b)::text = 'ab'::text)) + -> Seq Scan on rlp4_2 + Filter: ((a > 15) AND ((b)::text = 'ab'::text)) + -> Seq Scan on rlp4_default + Filter: ((a > 15) AND ((b)::text = 'ab'::text)) + -> Seq Scan on rlp5_1 + Filter: ((a > 15) AND ((b)::text = 'ab'::text)) + -> Seq Scan on rlp5_default + Filter: ((a > 15) AND ((b)::text = 'ab'::text)) + -> Seq Scan on rlp_default_30 + Filter: ((a > 15) AND ((b)::text = 'ab'::text)) + -> Seq Scan on rlp_default_default + Filter: ((a > 15) AND ((b)::text = 'ab'::text)) +(19 rows) explain (costs off) select * from rlp where a = 16; - QUERY PLAN --------------------------------- - Append - -> Seq Scan on rlp3abcd - Filter: (a = 16) - -> Seq Scan on rlp3efgh - Filter: (a = 16) - -> Seq Scan on rlp3nullxy - Filter: (a = 16) - -> Seq Scan on rlp3_default - Filter: (a = 16) -(9 rows) + QUERY PLAN +-------------------------------------- + Remote Fast Query Execution + Node/s: datanode_2 + -> Append + -> Seq Scan on rlp3abcd + Filter: (a = 16) + -> Seq Scan on rlp3efgh + Filter: (a = 16) + -> Seq Scan on rlp3nullxy + Filter: (a = 16) + -> Seq Scan on rlp3_default + Filter: (a = 16) +(11 rows) explain (costs off) select * from rlp where a = 16 and b in ('not', 'in', 'here'); - QUERY PLAN ----------------------------------------------------------------------------- - Append - -> Seq Scan on rlp3_default - Filter: ((a = 16) AND ((b)::text = ANY ('{not,in,here}'::text[]))) -(3 rows) + QUERY PLAN +---------------------------------------------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_2 + -> Append + -> Seq Scan on rlp3_default + Filter: ((a = 16) AND ((b)::text = ANY ('{not,in,here}'::text[]))) +(5 rows) explain (costs off) select * from rlp where a = 16 and b < 'ab'; - QUERY PLAN ---------------------------------------------------------- - Append - -> Seq Scan on rlp3_default - Filter: (((b)::text < 'ab'::text) AND (a = 16)) -(3 rows) + QUERY PLAN +--------------------------------------------------------------- + Remote Fast Query Execution + Node/s: 
datanode_2 + -> Append + -> Seq Scan on rlp3_default + Filter: (((b)::text < 'ab'::text) AND (a = 16)) +(5 rows) explain (costs off) select * from rlp where a = 16 and b <= 'ab'; - QUERY PLAN ----------------------------------------------------------- - Append - -> Seq Scan on rlp3abcd - Filter: (((b)::text <= 'ab'::text) AND (a = 16)) - -> Seq Scan on rlp3_default - Filter: (((b)::text <= 'ab'::text) AND (a = 16)) -(5 rows) + QUERY PLAN +---------------------------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_2 + -> Append + -> Seq Scan on rlp3abcd + Filter: (((b)::text <= 'ab'::text) AND (a = 16)) + -> Seq Scan on rlp3_default + Filter: (((b)::text <= 'ab'::text) AND (a = 16)) +(7 rows) explain (costs off) select * from rlp where a = 16 and b is null; - QUERY PLAN --------------------------------------------- - Append - -> Seq Scan on rlp3nullxy - Filter: ((b IS NULL) AND (a = 16)) -(3 rows) + QUERY PLAN +-------------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_2 + -> Append + -> Seq Scan on rlp3nullxy + Filter: ((b IS NULL) AND (a = 16)) +(5 rows) explain (costs off) select * from rlp where a = 16 and b is not null; - QUERY PLAN ------------------------------------------------- - Append - -> Seq Scan on rlp3abcd - Filter: ((b IS NOT NULL) AND (a = 16)) - -> Seq Scan on rlp3efgh - Filter: ((b IS NOT NULL) AND (a = 16)) - -> Seq Scan on rlp3nullxy - Filter: ((b IS NOT NULL) AND (a = 16)) - -> Seq Scan on rlp3_default - Filter: ((b IS NOT NULL) AND (a = 16)) -(9 rows) + QUERY PLAN +------------------------------------------------------ + Remote Fast Query Execution + Node/s: datanode_2 + -> Append + -> Seq Scan on rlp3abcd + Filter: ((b IS NOT NULL) AND (a = 16)) + -> Seq Scan on rlp3efgh + Filter: ((b IS NOT NULL) AND (a = 16)) + -> Seq Scan on rlp3nullxy + Filter: ((b IS NOT NULL) AND (a = 16)) + -> Seq Scan on rlp3_default + Filter: ((b IS NOT NULL) AND (a = 16)) +(11 rows) explain (costs off) select * from rlp where a is null; - QUERY PLAN ------------------------------------- - Append - -> Seq Scan on rlp_default_null - Filter: (a IS NULL) -(3 rows) + QUERY PLAN +------------------------------------------ + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on rlp_default_null + Filter: (a IS NULL) +(5 rows) explain (costs off) select * from rlp where a is not null; - QUERY PLAN ---------------------------------------- - Append - -> Seq Scan on rlp1 - Filter: (a IS NOT NULL) - -> Seq Scan on rlp2 - Filter: (a IS NOT NULL) - -> Seq Scan on rlp3abcd - Filter: (a IS NOT NULL) - -> Seq Scan on rlp3efgh - Filter: (a IS NOT NULL) - -> Seq Scan on rlp3nullxy - Filter: (a IS NOT NULL) - -> Seq Scan on rlp3_default - Filter: (a IS NOT NULL) - -> Seq Scan on rlp4_1 - Filter: (a IS NOT NULL) - -> Seq Scan on rlp4_2 - Filter: (a IS NOT NULL) - -> Seq Scan on rlp4_default - Filter: (a IS NOT NULL) - -> Seq Scan on rlp5_1 - Filter: (a IS NOT NULL) - -> Seq Scan on rlp5_default - Filter: (a IS NOT NULL) - -> Seq Scan on rlp_default_10 - Filter: (a IS NOT NULL) - -> Seq Scan on rlp_default_30 - Filter: (a IS NOT NULL) - -> Seq Scan on rlp_default_default - Filter: (a IS NOT NULL) -(29 rows) + QUERY PLAN +--------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on rlp1 + Filter: (a IS NOT NULL) + -> Seq Scan on rlp2 + Filter: (a IS NOT NULL) + -> Seq Scan on rlp3abcd + Filter: (a IS NOT NULL) + -> Seq Scan 
on rlp3efgh + Filter: (a IS NOT NULL) + -> Seq Scan on rlp3nullxy + Filter: (a IS NOT NULL) + -> Seq Scan on rlp3_default + Filter: (a IS NOT NULL) + -> Seq Scan on rlp4_1 + Filter: (a IS NOT NULL) + -> Seq Scan on rlp4_2 + Filter: (a IS NOT NULL) + -> Seq Scan on rlp4_default + Filter: (a IS NOT NULL) + -> Seq Scan on rlp5_1 + Filter: (a IS NOT NULL) + -> Seq Scan on rlp5_default + Filter: (a IS NOT NULL) + -> Seq Scan on rlp_default_10 + Filter: (a IS NOT NULL) + -> Seq Scan on rlp_default_30 + Filter: (a IS NOT NULL) + -> Seq Scan on rlp_default_default + Filter: (a IS NOT NULL) +(31 rows) explain (costs off) select * from rlp where a > 30; - QUERY PLAN ---------------------------------------- - Append - -> Seq Scan on rlp5_1 - Filter: (a > 30) - -> Seq Scan on rlp5_default - Filter: (a > 30) - -> Seq Scan on rlp_default_default - Filter: (a > 30) -(7 rows) + QUERY PLAN +--------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on rlp5_1 + Filter: (a > 30) + -> Seq Scan on rlp5_default + Filter: (a > 30) + -> Seq Scan on rlp_default_default + Filter: (a > 30) +(9 rows) explain (costs off) select * from rlp where a = 30; /* only default is scanned */ - QUERY PLAN ----------------------------------- - Append - -> Seq Scan on rlp_default_30 - Filter: (a = 30) -(3 rows) + QUERY PLAN +---------------------------------------- + Remote Fast Query Execution + Node/s: datanode_2 + -> Append + -> Seq Scan on rlp_default_30 + Filter: (a = 30) +(5 rows) explain (costs off) select * from rlp where a <= 31; - QUERY PLAN ---------------------------------------- - Append - -> Seq Scan on rlp1 - Filter: (a <= 31) - -> Seq Scan on rlp2 - Filter: (a <= 31) - -> Seq Scan on rlp3abcd - Filter: (a <= 31) - -> Seq Scan on rlp3efgh - Filter: (a <= 31) - -> Seq Scan on rlp3nullxy - Filter: (a <= 31) - -> Seq Scan on rlp3_default - Filter: (a <= 31) - -> Seq Scan on rlp4_1 - Filter: (a <= 31) - -> Seq Scan on rlp4_2 - Filter: (a <= 31) - -> Seq Scan on rlp4_default - Filter: (a <= 31) - -> Seq Scan on rlp5_1 - Filter: (a <= 31) - -> Seq Scan on rlp_default_10 - Filter: (a <= 31) - -> Seq Scan on rlp_default_30 - Filter: (a <= 31) - -> Seq Scan on rlp_default_default - Filter: (a <= 31) -(27 rows) + QUERY PLAN +--------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on rlp1 + Filter: (a <= 31) + -> Seq Scan on rlp2 + Filter: (a <= 31) + -> Seq Scan on rlp3abcd + Filter: (a <= 31) + -> Seq Scan on rlp3efgh + Filter: (a <= 31) + -> Seq Scan on rlp3nullxy + Filter: (a <= 31) + -> Seq Scan on rlp3_default + Filter: (a <= 31) + -> Seq Scan on rlp4_1 + Filter: (a <= 31) + -> Seq Scan on rlp4_2 + Filter: (a <= 31) + -> Seq Scan on rlp4_default + Filter: (a <= 31) + -> Seq Scan on rlp5_1 + Filter: (a <= 31) + -> Seq Scan on rlp_default_10 + Filter: (a <= 31) + -> Seq Scan on rlp_default_30 + Filter: (a <= 31) + -> Seq Scan on rlp_default_default + Filter: (a <= 31) +(29 rows) explain (costs off) select * from rlp where a = 1 or a = 7; - QUERY PLAN --------------------------------------- - Append - -> Seq Scan on rlp2 - Filter: ((a = 1) OR (a = 7)) -(3 rows) + QUERY PLAN +-------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on rlp2 + Filter: ((a = 1) OR (a = 7)) +(5 rows) explain (costs off) select * from rlp where a = 1 or b = 'ab'; - QUERY PLAN 
-------------------------------------------------------- - Append - -> Seq Scan on rlp1 - Filter: ((a = 1) OR ((b)::text = 'ab'::text)) - -> Seq Scan on rlp2 - Filter: ((a = 1) OR ((b)::text = 'ab'::text)) - -> Seq Scan on rlp3abcd - Filter: ((a = 1) OR ((b)::text = 'ab'::text)) - -> Seq Scan on rlp4_1 - Filter: ((a = 1) OR ((b)::text = 'ab'::text)) - -> Seq Scan on rlp4_2 - Filter: ((a = 1) OR ((b)::text = 'ab'::text)) - -> Seq Scan on rlp4_default - Filter: ((a = 1) OR ((b)::text = 'ab'::text)) - -> Seq Scan on rlp5_1 - Filter: ((a = 1) OR ((b)::text = 'ab'::text)) - -> Seq Scan on rlp5_default - Filter: ((a = 1) OR ((b)::text = 'ab'::text)) - -> Seq Scan on rlp_default_10 - Filter: ((a = 1) OR ((b)::text = 'ab'::text)) - -> Seq Scan on rlp_default_30 - Filter: ((a = 1) OR ((b)::text = 'ab'::text)) - -> Seq Scan on rlp_default_null - Filter: ((a = 1) OR ((b)::text = 'ab'::text)) - -> Seq Scan on rlp_default_default - Filter: ((a = 1) OR ((b)::text = 'ab'::text)) -(25 rows) + QUERY PLAN +------------------------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on rlp1 + Filter: ((a = 1) OR ((b)::text = 'ab'::text)) + -> Seq Scan on rlp2 + Filter: ((a = 1) OR ((b)::text = 'ab'::text)) + -> Seq Scan on rlp3abcd + Filter: ((a = 1) OR ((b)::text = 'ab'::text)) + -> Seq Scan on rlp4_1 + Filter: ((a = 1) OR ((b)::text = 'ab'::text)) + -> Seq Scan on rlp4_2 + Filter: ((a = 1) OR ((b)::text = 'ab'::text)) + -> Seq Scan on rlp4_default + Filter: ((a = 1) OR ((b)::text = 'ab'::text)) + -> Seq Scan on rlp5_1 + Filter: ((a = 1) OR ((b)::text = 'ab'::text)) + -> Seq Scan on rlp5_default + Filter: ((a = 1) OR ((b)::text = 'ab'::text)) + -> Seq Scan on rlp_default_10 + Filter: ((a = 1) OR ((b)::text = 'ab'::text)) + -> Seq Scan on rlp_default_30 + Filter: ((a = 1) OR ((b)::text = 'ab'::text)) + -> Seq Scan on rlp_default_null + Filter: ((a = 1) OR ((b)::text = 'ab'::text)) + -> Seq Scan on rlp_default_default + Filter: ((a = 1) OR ((b)::text = 'ab'::text)) +(27 rows) explain (costs off) select * from rlp where a > 20 and a < 27; - QUERY PLAN ------------------------------------------ - Append - -> Seq Scan on rlp4_1 - Filter: ((a > 20) AND (a < 27)) - -> Seq Scan on rlp4_2 - Filter: ((a > 20) AND (a < 27)) -(5 rows) + QUERY PLAN +----------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on rlp4_1 + Filter: ((a > 20) AND (a < 27)) + -> Seq Scan on rlp4_2 + Filter: ((a > 20) AND (a < 27)) +(7 rows) explain (costs off) select * from rlp where a = 29; - QUERY PLAN --------------------------------- - Append - -> Seq Scan on rlp4_default - Filter: (a = 29) -(3 rows) + QUERY PLAN +-------------------------------------- + Remote Fast Query Execution + Node/s: datanode_2 + -> Append + -> Seq Scan on rlp4_default + Filter: (a = 29) +(5 rows) explain (costs off) select * from rlp where a >= 29; - QUERY PLAN ---------------------------------------- - Append - -> Seq Scan on rlp4_default - Filter: (a >= 29) - -> Seq Scan on rlp5_1 - Filter: (a >= 29) - -> Seq Scan on rlp5_default - Filter: (a >= 29) - -> Seq Scan on rlp_default_30 - Filter: (a >= 29) - -> Seq Scan on rlp_default_default - Filter: (a >= 29) -(11 rows) + QUERY PLAN +--------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on rlp4_default + Filter: (a >= 29) + -> Seq Scan on rlp5_1 + Filter: (a >= 29) + -> Seq Scan on 
rlp5_default + Filter: (a >= 29) + -> Seq Scan on rlp_default_30 + Filter: (a >= 29) + -> Seq Scan on rlp_default_default + Filter: (a >= 29) +(13 rows) explain (costs off) select * from rlp where a < 1 or (a > 20 and a < 25); - QUERY PLAN ------------------------------------------------------- - Append - -> Seq Scan on rlp1 - Filter: ((a < 1) OR ((a > 20) AND (a < 25))) - -> Seq Scan on rlp4_1 - Filter: ((a < 1) OR ((a > 20) AND (a < 25))) -(5 rows) + QUERY PLAN +------------------------------------------------------------ + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on rlp1 + Filter: ((a < 1) OR ((a > 20) AND (a < 25))) + -> Seq Scan on rlp4_1 + Filter: ((a < 1) OR ((a > 20) AND (a < 25))) +(7 rows) -- redundant clauses are eliminated explain (costs off) select * from rlp where a > 1 and a = 10; /* only default */ - QUERY PLAN ----------------------------------------- - Append - -> Seq Scan on rlp_default_10 - Filter: ((a > 1) AND (a = 10)) -(3 rows) + QUERY PLAN +---------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_2 + -> Append + -> Seq Scan on rlp_default_10 + Filter: ((a > 1) AND (a = 10)) +(5 rows) explain (costs off) select * from rlp where a > 1 and a >=15; /* rlp3 onwards, including default */ - QUERY PLAN ------------------------------------------ - Append - -> Seq Scan on rlp3abcd - Filter: ((a > 1) AND (a >= 15)) - -> Seq Scan on rlp3efgh - Filter: ((a > 1) AND (a >= 15)) - -> Seq Scan on rlp3nullxy - Filter: ((a > 1) AND (a >= 15)) - -> Seq Scan on rlp3_default - Filter: ((a > 1) AND (a >= 15)) - -> Seq Scan on rlp4_1 - Filter: ((a > 1) AND (a >= 15)) - -> Seq Scan on rlp4_2 - Filter: ((a > 1) AND (a >= 15)) - -> Seq Scan on rlp4_default - Filter: ((a > 1) AND (a >= 15)) - -> Seq Scan on rlp5_1 - Filter: ((a > 1) AND (a >= 15)) - -> Seq Scan on rlp5_default - Filter: ((a > 1) AND (a >= 15)) - -> Seq Scan on rlp_default_30 - Filter: ((a > 1) AND (a >= 15)) - -> Seq Scan on rlp_default_default - Filter: ((a > 1) AND (a >= 15)) -(23 rows) + QUERY PLAN +----------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on rlp3abcd + Filter: ((a > 1) AND (a >= 15)) + -> Seq Scan on rlp3efgh + Filter: ((a > 1) AND (a >= 15)) + -> Seq Scan on rlp3nullxy + Filter: ((a > 1) AND (a >= 15)) + -> Seq Scan on rlp3_default + Filter: ((a > 1) AND (a >= 15)) + -> Seq Scan on rlp4_1 + Filter: ((a > 1) AND (a >= 15)) + -> Seq Scan on rlp4_2 + Filter: ((a > 1) AND (a >= 15)) + -> Seq Scan on rlp4_default + Filter: ((a > 1) AND (a >= 15)) + -> Seq Scan on rlp5_1 + Filter: ((a > 1) AND (a >= 15)) + -> Seq Scan on rlp5_default + Filter: ((a > 1) AND (a >= 15)) + -> Seq Scan on rlp_default_30 + Filter: ((a > 1) AND (a >= 15)) + -> Seq Scan on rlp_default_default + Filter: ((a > 1) AND (a >= 15)) +(25 rows) explain (costs off) select * from rlp where a = 1 and a = 3; /* empty */ - QUERY PLAN --------------------------- - Result - One-Time Filter: false -(2 rows) + QUERY PLAN +-------------------------------- + Remote Fast Query Execution + Node/s: datanode_1 + -> Result + One-Time Filter: false +(4 rows) explain (costs off) select * from rlp where (a = 1 and a = 3) or (a > 1 and a = 15); - QUERY PLAN -------------------------------------------------------------------- - Append - -> Seq Scan on rlp2 - Filter: (((a = 1) AND (a = 3)) OR ((a > 1) AND (a = 15))) - -> Seq Scan on rlp3abcd - Filter: (((a = 1) AND (a = 3)) OR ((a > 1) AND (a = 15))) - 
-> Seq Scan on rlp3efgh - Filter: (((a = 1) AND (a = 3)) OR ((a > 1) AND (a = 15))) - -> Seq Scan on rlp3nullxy - Filter: (((a = 1) AND (a = 3)) OR ((a > 1) AND (a = 15))) - -> Seq Scan on rlp3_default - Filter: (((a = 1) AND (a = 3)) OR ((a > 1) AND (a = 15))) -(11 rows) + QUERY PLAN +------------------------------------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on rlp2 + Filter: (((a = 1) AND (a = 3)) OR ((a > 1) AND (a = 15))) + -> Seq Scan on rlp3abcd + Filter: (((a = 1) AND (a = 3)) OR ((a > 1) AND (a = 15))) + -> Seq Scan on rlp3efgh + Filter: (((a = 1) AND (a = 3)) OR ((a > 1) AND (a = 15))) + -> Seq Scan on rlp3nullxy + Filter: (((a = 1) AND (a = 3)) OR ((a > 1) AND (a = 15))) + -> Seq Scan on rlp3_default + Filter: (((a = 1) AND (a = 3)) OR ((a > 1) AND (a = 15))) +(13 rows) -- multi-column keys create table mc3p (a int, b int, c int) partition by range (a, abs(b), c); @@ -679,268 +771,306 @@ create table mc3p5 partition of mc3p for values from (11, 1, 1) to (20, 10, 10); create table mc3p6 partition of mc3p for values from (20, 10, 10) to (20, 20, 20); create table mc3p7 partition of mc3p for values from (20, 20, 20) to (maxvalue, maxvalue, maxvalue); explain (costs off) select * from mc3p where a = 1; - QUERY PLAN --------------------------------- - Append - -> Seq Scan on mc3p0 - Filter: (a = 1) - -> Seq Scan on mc3p1 - Filter: (a = 1) - -> Seq Scan on mc3p_default - Filter: (a = 1) -(7 rows) + QUERY PLAN +-------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1 + -> Append + -> Seq Scan on mc3p0 + Filter: (a = 1) + -> Seq Scan on mc3p1 + Filter: (a = 1) + -> Seq Scan on mc3p_default + Filter: (a = 1) +(9 rows) explain (costs off) select * from mc3p where a = 1 and abs(b) < 1; - QUERY PLAN --------------------------------------------- - Append - -> Seq Scan on mc3p0 - Filter: ((a = 1) AND (abs(b) < 1)) - -> Seq Scan on mc3p_default - Filter: ((a = 1) AND (abs(b) < 1)) -(5 rows) + QUERY PLAN +-------------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1 + -> Append + -> Seq Scan on mc3p0 + Filter: ((a = 1) AND (abs(b) < 1)) + -> Seq Scan on mc3p_default + Filter: ((a = 1) AND (abs(b) < 1)) +(7 rows) explain (costs off) select * from mc3p where a = 1 and abs(b) = 1; - QUERY PLAN --------------------------------------------- - Append - -> Seq Scan on mc3p0 - Filter: ((a = 1) AND (abs(b) = 1)) - -> Seq Scan on mc3p1 - Filter: ((a = 1) AND (abs(b) = 1)) - -> Seq Scan on mc3p_default - Filter: ((a = 1) AND (abs(b) = 1)) -(7 rows) + QUERY PLAN +-------------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1 + -> Append + -> Seq Scan on mc3p0 + Filter: ((a = 1) AND (abs(b) = 1)) + -> Seq Scan on mc3p1 + Filter: ((a = 1) AND (abs(b) = 1)) + -> Seq Scan on mc3p_default + Filter: ((a = 1) AND (abs(b) = 1)) +(9 rows) explain (costs off) select * from mc3p where a = 1 and abs(b) = 1 and c < 8; - QUERY PLAN --------------------------------------------------------- - Append - -> Seq Scan on mc3p0 - Filter: ((c < 8) AND (a = 1) AND (abs(b) = 1)) - -> Seq Scan on mc3p1 - Filter: ((c < 8) AND (a = 1) AND (abs(b) = 1)) -(5 rows) + QUERY PLAN +-------------------------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1 + -> Append + -> Seq Scan on mc3p0 + Filter: ((c < 8) AND (a = 1) AND (abs(b) = 1)) + -> Seq Scan on mc3p1 + Filter: ((c < 8) AND (a = 1) AND (abs(b) = 
1)) +(7 rows) explain (costs off) select * from mc3p where a = 10 and abs(b) between 5 and 35; - QUERY PLAN ------------------------------------------------------------------ - Append - -> Seq Scan on mc3p1 - Filter: ((a = 10) AND (abs(b) >= 5) AND (abs(b) <= 35)) - -> Seq Scan on mc3p2 - Filter: ((a = 10) AND (abs(b) >= 5) AND (abs(b) <= 35)) - -> Seq Scan on mc3p3 - Filter: ((a = 10) AND (abs(b) >= 5) AND (abs(b) <= 35)) - -> Seq Scan on mc3p4 - Filter: ((a = 10) AND (abs(b) >= 5) AND (abs(b) <= 35)) - -> Seq Scan on mc3p_default - Filter: ((a = 10) AND (abs(b) >= 5) AND (abs(b) <= 35)) -(11 rows) + QUERY PLAN +----------------------------------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_2 + -> Append + -> Seq Scan on mc3p1 + Filter: ((a = 10) AND (abs(b) >= 5) AND (abs(b) <= 35)) + -> Seq Scan on mc3p2 + Filter: ((a = 10) AND (abs(b) >= 5) AND (abs(b) <= 35)) + -> Seq Scan on mc3p3 + Filter: ((a = 10) AND (abs(b) >= 5) AND (abs(b) <= 35)) + -> Seq Scan on mc3p4 + Filter: ((a = 10) AND (abs(b) >= 5) AND (abs(b) <= 35)) + -> Seq Scan on mc3p_default + Filter: ((a = 10) AND (abs(b) >= 5) AND (abs(b) <= 35)) +(13 rows) explain (costs off) select * from mc3p where a > 10; - QUERY PLAN --------------------------------- - Append - -> Seq Scan on mc3p5 - Filter: (a > 10) - -> Seq Scan on mc3p6 - Filter: (a > 10) - -> Seq Scan on mc3p7 - Filter: (a > 10) - -> Seq Scan on mc3p_default - Filter: (a > 10) -(9 rows) + QUERY PLAN +-------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on mc3p5 + Filter: (a > 10) + -> Seq Scan on mc3p6 + Filter: (a > 10) + -> Seq Scan on mc3p7 + Filter: (a > 10) + -> Seq Scan on mc3p_default + Filter: (a > 10) +(11 rows) explain (costs off) select * from mc3p where a >= 10; - QUERY PLAN --------------------------------- - Append - -> Seq Scan on mc3p1 - Filter: (a >= 10) - -> Seq Scan on mc3p2 - Filter: (a >= 10) - -> Seq Scan on mc3p3 - Filter: (a >= 10) - -> Seq Scan on mc3p4 - Filter: (a >= 10) - -> Seq Scan on mc3p5 - Filter: (a >= 10) - -> Seq Scan on mc3p6 - Filter: (a >= 10) - -> Seq Scan on mc3p7 - Filter: (a >= 10) - -> Seq Scan on mc3p_default - Filter: (a >= 10) -(17 rows) + QUERY PLAN +-------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on mc3p1 + Filter: (a >= 10) + -> Seq Scan on mc3p2 + Filter: (a >= 10) + -> Seq Scan on mc3p3 + Filter: (a >= 10) + -> Seq Scan on mc3p4 + Filter: (a >= 10) + -> Seq Scan on mc3p5 + Filter: (a >= 10) + -> Seq Scan on mc3p6 + Filter: (a >= 10) + -> Seq Scan on mc3p7 + Filter: (a >= 10) + -> Seq Scan on mc3p_default + Filter: (a >= 10) +(19 rows) explain (costs off) select * from mc3p where a < 10; - QUERY PLAN --------------------------------- - Append - -> Seq Scan on mc3p0 - Filter: (a < 10) - -> Seq Scan on mc3p1 - Filter: (a < 10) - -> Seq Scan on mc3p_default - Filter: (a < 10) -(7 rows) + QUERY PLAN +-------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on mc3p0 + Filter: (a < 10) + -> Seq Scan on mc3p1 + Filter: (a < 10) + -> Seq Scan on mc3p_default + Filter: (a < 10) +(9 rows) explain (costs off) select * from mc3p where a <= 10 and abs(b) < 10; - QUERY PLAN ------------------------------------------------ - Append - -> Seq Scan on mc3p0 - Filter: ((a <= 10) AND (abs(b) < 10)) - -> Seq Scan on mc3p1 - Filter: ((a <= 10) AND (abs(b) < 10)) - -> Seq Scan 
on mc3p2 - Filter: ((a <= 10) AND (abs(b) < 10)) - -> Seq Scan on mc3p_default - Filter: ((a <= 10) AND (abs(b) < 10)) -(9 rows) + QUERY PLAN +----------------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on mc3p0 + Filter: ((a <= 10) AND (abs(b) < 10)) + -> Seq Scan on mc3p1 + Filter: ((a <= 10) AND (abs(b) < 10)) + -> Seq Scan on mc3p2 + Filter: ((a <= 10) AND (abs(b) < 10)) + -> Seq Scan on mc3p_default + Filter: ((a <= 10) AND (abs(b) < 10)) +(11 rows) explain (costs off) select * from mc3p where a = 11 and abs(b) = 0; - QUERY PLAN ---------------------------------------------- - Append - -> Seq Scan on mc3p_default - Filter: ((a = 11) AND (abs(b) = 0)) -(3 rows) + QUERY PLAN +--------------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_2 + -> Append + -> Seq Scan on mc3p_default + Filter: ((a = 11) AND (abs(b) = 0)) +(5 rows) explain (costs off) select * from mc3p where a = 20 and abs(b) = 10 and c = 100; - QUERY PLAN ------------------------------------------------------------- - Append - -> Seq Scan on mc3p6 - Filter: ((a = 20) AND (c = 100) AND (abs(b) = 10)) -(3 rows) + QUERY PLAN +------------------------------------------------------------------ + Remote Fast Query Execution + Node/s: datanode_2 + -> Append + -> Seq Scan on mc3p6 + Filter: ((a = 20) AND (c = 100) AND (abs(b) = 10)) +(5 rows) explain (costs off) select * from mc3p where a > 20; - QUERY PLAN --------------------------------- - Append - -> Seq Scan on mc3p7 - Filter: (a > 20) - -> Seq Scan on mc3p_default - Filter: (a > 20) -(5 rows) + QUERY PLAN +-------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on mc3p7 + Filter: (a > 20) + -> Seq Scan on mc3p_default + Filter: (a > 20) +(7 rows) explain (costs off) select * from mc3p where a >= 20; - QUERY PLAN --------------------------------- - Append - -> Seq Scan on mc3p5 - Filter: (a >= 20) - -> Seq Scan on mc3p6 - Filter: (a >= 20) - -> Seq Scan on mc3p7 - Filter: (a >= 20) - -> Seq Scan on mc3p_default - Filter: (a >= 20) -(9 rows) + QUERY PLAN +-------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on mc3p5 + Filter: (a >= 20) + -> Seq Scan on mc3p6 + Filter: (a >= 20) + -> Seq Scan on mc3p7 + Filter: (a >= 20) + -> Seq Scan on mc3p_default + Filter: (a >= 20) +(11 rows) explain (costs off) select * from mc3p where (a = 1 and abs(b) = 1 and c = 1) or (a = 10 and abs(b) = 5 and c = 10) or (a > 11 and a < 20); - QUERY PLAN ---------------------------------------------------------------------------------------------------------------------------------- - Append - -> Seq Scan on mc3p1 - Filter: (((a = 1) AND (abs(b) = 1) AND (c = 1)) OR ((a = 10) AND (abs(b) = 5) AND (c = 10)) OR ((a > 11) AND (a < 20))) - -> Seq Scan on mc3p2 - Filter: (((a = 1) AND (abs(b) = 1) AND (c = 1)) OR ((a = 10) AND (abs(b) = 5) AND (c = 10)) OR ((a > 11) AND (a < 20))) - -> Seq Scan on mc3p5 - Filter: (((a = 1) AND (abs(b) = 1) AND (c = 1)) OR ((a = 10) AND (abs(b) = 5) AND (c = 10)) OR ((a > 11) AND (a < 20))) - -> Seq Scan on mc3p_default - Filter: (((a = 1) AND (abs(b) = 1) AND (c = 1)) OR ((a = 10) AND (abs(b) = 5) AND (c = 10)) OR ((a > 11) AND (a < 20))) -(9 rows) + QUERY PLAN +--------------------------------------------------------------------------------------------------------------------------------------- + Remote Fast 
Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on mc3p1 + Filter: (((a = 1) AND (abs(b) = 1) AND (c = 1)) OR ((a = 10) AND (abs(b) = 5) AND (c = 10)) OR ((a > 11) AND (a < 20))) + -> Seq Scan on mc3p2 + Filter: (((a = 1) AND (abs(b) = 1) AND (c = 1)) OR ((a = 10) AND (abs(b) = 5) AND (c = 10)) OR ((a > 11) AND (a < 20))) + -> Seq Scan on mc3p5 + Filter: (((a = 1) AND (abs(b) = 1) AND (c = 1)) OR ((a = 10) AND (abs(b) = 5) AND (c = 10)) OR ((a > 11) AND (a < 20))) + -> Seq Scan on mc3p_default + Filter: (((a = 1) AND (abs(b) = 1) AND (c = 1)) OR ((a = 10) AND (abs(b) = 5) AND (c = 10)) OR ((a > 11) AND (a < 20))) +(11 rows) explain (costs off) select * from mc3p where (a = 1 and abs(b) = 1 and c = 1) or (a = 10 and abs(b) = 5 and c = 10) or (a > 11 and a < 20) or a < 1; - QUERY PLAN --------------------------------------------------------------------------------------------------------------------------------------------- - Append - -> Seq Scan on mc3p0 - Filter: (((a = 1) AND (abs(b) = 1) AND (c = 1)) OR ((a = 10) AND (abs(b) = 5) AND (c = 10)) OR ((a > 11) AND (a < 20)) OR (a < 1)) - -> Seq Scan on mc3p1 - Filter: (((a = 1) AND (abs(b) = 1) AND (c = 1)) OR ((a = 10) AND (abs(b) = 5) AND (c = 10)) OR ((a > 11) AND (a < 20)) OR (a < 1)) - -> Seq Scan on mc3p2 - Filter: (((a = 1) AND (abs(b) = 1) AND (c = 1)) OR ((a = 10) AND (abs(b) = 5) AND (c = 10)) OR ((a > 11) AND (a < 20)) OR (a < 1)) - -> Seq Scan on mc3p5 - Filter: (((a = 1) AND (abs(b) = 1) AND (c = 1)) OR ((a = 10) AND (abs(b) = 5) AND (c = 10)) OR ((a > 11) AND (a < 20)) OR (a < 1)) - -> Seq Scan on mc3p_default - Filter: (((a = 1) AND (abs(b) = 1) AND (c = 1)) OR ((a = 10) AND (abs(b) = 5) AND (c = 10)) OR ((a > 11) AND (a < 20)) OR (a < 1)) -(11 rows) + QUERY PLAN +-------------------------------------------------------------------------------------------------------------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on mc3p0 + Filter: (((a = 1) AND (abs(b) = 1) AND (c = 1)) OR ((a = 10) AND (abs(b) = 5) AND (c = 10)) OR ((a > 11) AND (a < 20)) OR (a < 1)) + -> Seq Scan on mc3p1 + Filter: (((a = 1) AND (abs(b) = 1) AND (c = 1)) OR ((a = 10) AND (abs(b) = 5) AND (c = 10)) OR ((a > 11) AND (a < 20)) OR (a < 1)) + -> Seq Scan on mc3p2 + Filter: (((a = 1) AND (abs(b) = 1) AND (c = 1)) OR ((a = 10) AND (abs(b) = 5) AND (c = 10)) OR ((a > 11) AND (a < 20)) OR (a < 1)) + -> Seq Scan on mc3p5 + Filter: (((a = 1) AND (abs(b) = 1) AND (c = 1)) OR ((a = 10) AND (abs(b) = 5) AND (c = 10)) OR ((a > 11) AND (a < 20)) OR (a < 1)) + -> Seq Scan on mc3p_default + Filter: (((a = 1) AND (abs(b) = 1) AND (c = 1)) OR ((a = 10) AND (abs(b) = 5) AND (c = 10)) OR ((a > 11) AND (a < 20)) OR (a < 1)) +(13 rows) explain (costs off) select * from mc3p where (a = 1 and abs(b) = 1 and c = 1) or (a = 10 and abs(b) = 5 and c = 10) or (a > 11 and a < 20) or a < 1 or a = 1; - QUERY PLAN -------------------------------------------------------------------------------------------------------------------------------------------------------- - Append - -> Seq Scan on mc3p0 - Filter: (((a = 1) AND (abs(b) = 1) AND (c = 1)) OR ((a = 10) AND (abs(b) = 5) AND (c = 10)) OR ((a > 11) AND (a < 20)) OR (a < 1) OR (a = 1)) - -> Seq Scan on mc3p1 - Filter: (((a = 1) AND (abs(b) = 1) AND (c = 1)) OR ((a = 10) AND (abs(b) = 5) AND (c = 10)) OR ((a > 11) AND (a < 20)) OR (a < 1) OR (a = 1)) - -> Seq Scan on mc3p2 - Filter: (((a = 1) AND (abs(b) = 1) AND (c = 1)) OR ((a = 
10) AND (abs(b) = 5) AND (c = 10)) OR ((a > 11) AND (a < 20)) OR (a < 1) OR (a = 1)) - -> Seq Scan on mc3p5 - Filter: (((a = 1) AND (abs(b) = 1) AND (c = 1)) OR ((a = 10) AND (abs(b) = 5) AND (c = 10)) OR ((a > 11) AND (a < 20)) OR (a < 1) OR (a = 1)) - -> Seq Scan on mc3p_default - Filter: (((a = 1) AND (abs(b) = 1) AND (c = 1)) OR ((a = 10) AND (abs(b) = 5) AND (c = 10)) OR ((a > 11) AND (a < 20)) OR (a < 1) OR (a = 1)) -(11 rows) + QUERY PLAN +------------------------------------------------------------------------------------------------------------------------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on mc3p0 + Filter: (((a = 1) AND (abs(b) = 1) AND (c = 1)) OR ((a = 10) AND (abs(b) = 5) AND (c = 10)) OR ((a > 11) AND (a < 20)) OR (a < 1) OR (a = 1)) + -> Seq Scan on mc3p1 + Filter: (((a = 1) AND (abs(b) = 1) AND (c = 1)) OR ((a = 10) AND (abs(b) = 5) AND (c = 10)) OR ((a > 11) AND (a < 20)) OR (a < 1) OR (a = 1)) + -> Seq Scan on mc3p2 + Filter: (((a = 1) AND (abs(b) = 1) AND (c = 1)) OR ((a = 10) AND (abs(b) = 5) AND (c = 10)) OR ((a > 11) AND (a < 20)) OR (a < 1) OR (a = 1)) + -> Seq Scan on mc3p5 + Filter: (((a = 1) AND (abs(b) = 1) AND (c = 1)) OR ((a = 10) AND (abs(b) = 5) AND (c = 10)) OR ((a > 11) AND (a < 20)) OR (a < 1) OR (a = 1)) + -> Seq Scan on mc3p_default + Filter: (((a = 1) AND (abs(b) = 1) AND (c = 1)) OR ((a = 10) AND (abs(b) = 5) AND (c = 10)) OR ((a > 11) AND (a < 20)) OR (a < 1) OR (a = 1)) +(13 rows) explain (costs off) select * from mc3p where a = 1 or abs(b) = 1 or c = 1; - QUERY PLAN ------------------------------------------------------- - Append - -> Seq Scan on mc3p0 - Filter: ((a = 1) OR (abs(b) = 1) OR (c = 1)) - -> Seq Scan on mc3p1 - Filter: ((a = 1) OR (abs(b) = 1) OR (c = 1)) - -> Seq Scan on mc3p2 - Filter: ((a = 1) OR (abs(b) = 1) OR (c = 1)) - -> Seq Scan on mc3p3 - Filter: ((a = 1) OR (abs(b) = 1) OR (c = 1)) - -> Seq Scan on mc3p4 - Filter: ((a = 1) OR (abs(b) = 1) OR (c = 1)) - -> Seq Scan on mc3p5 - Filter: ((a = 1) OR (abs(b) = 1) OR (c = 1)) - -> Seq Scan on mc3p6 - Filter: ((a = 1) OR (abs(b) = 1) OR (c = 1)) - -> Seq Scan on mc3p7 - Filter: ((a = 1) OR (abs(b) = 1) OR (c = 1)) - -> Seq Scan on mc3p_default - Filter: ((a = 1) OR (abs(b) = 1) OR (c = 1)) -(19 rows) + QUERY PLAN +------------------------------------------------------------ + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on mc3p0 + Filter: ((a = 1) OR (abs(b) = 1) OR (c = 1)) + -> Seq Scan on mc3p1 + Filter: ((a = 1) OR (abs(b) = 1) OR (c = 1)) + -> Seq Scan on mc3p2 + Filter: ((a = 1) OR (abs(b) = 1) OR (c = 1)) + -> Seq Scan on mc3p3 + Filter: ((a = 1) OR (abs(b) = 1) OR (c = 1)) + -> Seq Scan on mc3p4 + Filter: ((a = 1) OR (abs(b) = 1) OR (c = 1)) + -> Seq Scan on mc3p5 + Filter: ((a = 1) OR (abs(b) = 1) OR (c = 1)) + -> Seq Scan on mc3p6 + Filter: ((a = 1) OR (abs(b) = 1) OR (c = 1)) + -> Seq Scan on mc3p7 + Filter: ((a = 1) OR (abs(b) = 1) OR (c = 1)) + -> Seq Scan on mc3p_default + Filter: ((a = 1) OR (abs(b) = 1) OR (c = 1)) +(21 rows) explain (costs off) select * from mc3p where (a = 1 and abs(b) = 1) or (a = 10 and abs(b) = 10); - QUERY PLAN ------------------------------------------------------------------------------- - Append - -> Seq Scan on mc3p0 - Filter: (((a = 1) AND (abs(b) = 1)) OR ((a = 10) AND (abs(b) = 10))) - -> Seq Scan on mc3p1 - Filter: (((a = 1) AND (abs(b) = 1)) OR ((a = 10) AND (abs(b) = 10))) - -> Seq Scan on mc3p2 - 
Filter: (((a = 1) AND (abs(b) = 1)) OR ((a = 10) AND (abs(b) = 10))) - -> Seq Scan on mc3p3 - Filter: (((a = 1) AND (abs(b) = 1)) OR ((a = 10) AND (abs(b) = 10))) - -> Seq Scan on mc3p4 - Filter: (((a = 1) AND (abs(b) = 1)) OR ((a = 10) AND (abs(b) = 10))) - -> Seq Scan on mc3p_default - Filter: (((a = 1) AND (abs(b) = 1)) OR ((a = 10) AND (abs(b) = 10))) -(13 rows) + QUERY PLAN +------------------------------------------------------------------------------------ + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on mc3p0 + Filter: (((a = 1) AND (abs(b) = 1)) OR ((a = 10) AND (abs(b) = 10))) + -> Seq Scan on mc3p1 + Filter: (((a = 1) AND (abs(b) = 1)) OR ((a = 10) AND (abs(b) = 10))) + -> Seq Scan on mc3p2 + Filter: (((a = 1) AND (abs(b) = 1)) OR ((a = 10) AND (abs(b) = 10))) + -> Seq Scan on mc3p3 + Filter: (((a = 1) AND (abs(b) = 1)) OR ((a = 10) AND (abs(b) = 10))) + -> Seq Scan on mc3p4 + Filter: (((a = 1) AND (abs(b) = 1)) OR ((a = 10) AND (abs(b) = 10))) + -> Seq Scan on mc3p_default + Filter: (((a = 1) AND (abs(b) = 1)) OR ((a = 10) AND (abs(b) = 10))) +(15 rows) explain (costs off) select * from mc3p where (a = 1 and abs(b) = 1) or (a = 10 and abs(b) = 9); - QUERY PLAN ------------------------------------------------------------------------------ - Append - -> Seq Scan on mc3p0 - Filter: (((a = 1) AND (abs(b) = 1)) OR ((a = 10) AND (abs(b) = 9))) - -> Seq Scan on mc3p1 - Filter: (((a = 1) AND (abs(b) = 1)) OR ((a = 10) AND (abs(b) = 9))) - -> Seq Scan on mc3p2 - Filter: (((a = 1) AND (abs(b) = 1)) OR ((a = 10) AND (abs(b) = 9))) - -> Seq Scan on mc3p_default - Filter: (((a = 1) AND (abs(b) = 1)) OR ((a = 10) AND (abs(b) = 9))) -(9 rows) + QUERY PLAN +----------------------------------------------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on mc3p0 + Filter: (((a = 1) AND (abs(b) = 1)) OR ((a = 10) AND (abs(b) = 9))) + -> Seq Scan on mc3p1 + Filter: (((a = 1) AND (abs(b) = 1)) OR ((a = 10) AND (abs(b) = 9))) + -> Seq Scan on mc3p2 + Filter: (((a = 1) AND (abs(b) = 1)) OR ((a = 10) AND (abs(b) = 9))) + -> Seq Scan on mc3p_default + Filter: (((a = 1) AND (abs(b) = 1)) OR ((a = 10) AND (abs(b) = 9))) +(11 rows) -- a simpler multi-column keys case create table mc2p (a int, b int) partition by range (a, b); @@ -952,91 +1082,109 @@ create table mc2p3 partition of mc2p for values from (2, minvalue) to (2, 1); create table mc2p4 partition of mc2p for values from (2, 1) to (2, maxvalue); create table mc2p5 partition of mc2p for values from (2, maxvalue) to (maxvalue, maxvalue); explain (costs off) select * from mc2p where a < 2; - QUERY PLAN --------------------------------- - Append - -> Seq Scan on mc2p0 - Filter: (a < 2) - -> Seq Scan on mc2p1 - Filter: (a < 2) - -> Seq Scan on mc2p2 - Filter: (a < 2) - -> Seq Scan on mc2p_default - Filter: (a < 2) -(9 rows) + QUERY PLAN +-------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on mc2p0 + Filter: (a < 2) + -> Seq Scan on mc2p1 + Filter: (a < 2) + -> Seq Scan on mc2p2 + Filter: (a < 2) + -> Seq Scan on mc2p_default + Filter: (a < 2) +(11 rows) explain (costs off) select * from mc2p where a = 2 and b < 1; - QUERY PLAN ---------------------------------------- - Append - -> Seq Scan on mc2p3 - Filter: ((b < 1) AND (a = 2)) -(3 rows) + QUERY PLAN +--------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1 + -> Append 
+ -> Seq Scan on mc2p3 + Filter: ((b < 1) AND (a = 2)) +(5 rows) explain (costs off) select * from mc2p where a > 1; - QUERY PLAN --------------------------------- - Append - -> Seq Scan on mc2p2 - Filter: (a > 1) - -> Seq Scan on mc2p3 - Filter: (a > 1) - -> Seq Scan on mc2p4 - Filter: (a > 1) - -> Seq Scan on mc2p5 - Filter: (a > 1) - -> Seq Scan on mc2p_default - Filter: (a > 1) -(11 rows) + QUERY PLAN +-------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on mc2p2 + Filter: (a > 1) + -> Seq Scan on mc2p3 + Filter: (a > 1) + -> Seq Scan on mc2p4 + Filter: (a > 1) + -> Seq Scan on mc2p5 + Filter: (a > 1) + -> Seq Scan on mc2p_default + Filter: (a > 1) +(13 rows) explain (costs off) select * from mc2p where a = 1 and b > 1; - QUERY PLAN ---------------------------------------- - Append - -> Seq Scan on mc2p2 - Filter: ((b > 1) AND (a = 1)) -(3 rows) + QUERY PLAN +--------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1 + -> Append + -> Seq Scan on mc2p2 + Filter: ((b > 1) AND (a = 1)) +(5 rows) -- all partitions but the default one should be pruned explain (costs off) select * from mc2p where a = 1 and b is null; - QUERY PLAN -------------------------------------------- - Append - -> Seq Scan on mc2p_default - Filter: ((b IS NULL) AND (a = 1)) -(3 rows) + QUERY PLAN +------------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1 + -> Append + -> Seq Scan on mc2p_default + Filter: ((b IS NULL) AND (a = 1)) +(5 rows) explain (costs off) select * from mc2p where a is null and b is null; - QUERY PLAN ------------------------------------------------ - Append - -> Seq Scan on mc2p_default - Filter: ((a IS NULL) AND (b IS NULL)) -(3 rows) + QUERY PLAN +----------------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on mc2p_default + Filter: ((a IS NULL) AND (b IS NULL)) +(5 rows) explain (costs off) select * from mc2p where a is null and b = 1; - QUERY PLAN -------------------------------------------- - Append - -> Seq Scan on mc2p_default - Filter: ((a IS NULL) AND (b = 1)) -(3 rows) + QUERY PLAN +------------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on mc2p_default + Filter: ((a IS NULL) AND (b = 1)) +(5 rows) explain (costs off) select * from mc2p where a is null; - QUERY PLAN --------------------------------- - Append - -> Seq Scan on mc2p_default - Filter: (a IS NULL) -(3 rows) + QUERY PLAN +-------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on mc2p_default + Filter: (a IS NULL) +(5 rows) explain (costs off) select * from mc2p where b is null; - QUERY PLAN --------------------------------- - Append - -> Seq Scan on mc2p_default - Filter: (b IS NULL) -(3 rows) + QUERY PLAN +-------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on mc2p_default + Filter: (b IS NULL) +(5 rows) -- boolean partitioning create table boolpart (a bool) partition by list (a); @@ -1044,87 +1192,95 @@ create table boolpart_default partition of boolpart default; create table boolpart_t partition of boolpart for values in ('true'); create table boolpart_f partition of boolpart for values in ('false'); explain (costs off) select * from boolpart where a in (true, 
false); - QUERY PLAN ------------------------------------------------- - Append - -> Seq Scan on boolpart_f - Filter: (a = ANY ('{t,f}'::boolean[])) - -> Seq Scan on boolpart_t - Filter: (a = ANY ('{t,f}'::boolean[])) -(5 rows) + QUERY PLAN +------------------------------------------------------ + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on boolpart_f + Filter: (a = ANY ('{t,f}'::boolean[])) + -> Seq Scan on boolpart_t + Filter: (a = ANY ('{t,f}'::boolean[])) +(7 rows) explain (costs off) select * from boolpart where a = false; QUERY PLAN ------------------------------------ - Append - -> Seq Scan on boolpart_f - Filter: (NOT a) - -> Seq Scan on boolpart_t - Filter: (NOT a) - -> Seq Scan on boolpart_default - Filter: (NOT a) -(7 rows) + Remote Fast Query Execution + Node/s: datanode_2 + -> Append + -> Seq Scan on boolpart_f + Filter: (NOT a) +(5 rows) explain (costs off) select * from boolpart where not a = false; QUERY PLAN ------------------------------------ - Append - -> Seq Scan on boolpart_f - Filter: a - -> Seq Scan on boolpart_t - Filter: a - -> Seq Scan on boolpart_default - Filter: a -(7 rows) + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on boolpart_t + Filter: a +(5 rows) explain (costs off) select * from boolpart where a is true or a is not true; - QUERY PLAN --------------------------------------------------- - Append - -> Seq Scan on boolpart_f - Filter: ((a IS TRUE) OR (a IS NOT TRUE)) - -> Seq Scan on boolpart_t - Filter: ((a IS TRUE) OR (a IS NOT TRUE)) -(5 rows) + QUERY PLAN +-------------------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on boolpart_f + Filter: ((a IS TRUE) OR (a IS NOT TRUE)) + -> Seq Scan on boolpart_t + Filter: ((a IS TRUE) OR (a IS NOT TRUE)) +(7 rows) explain (costs off) select * from boolpart where a is not true; - QUERY PLAN ---------------------------------- - Append - -> Seq Scan on boolpart_f - Filter: (a IS NOT TRUE) -(3 rows) + QUERY PLAN +--------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on boolpart_f + Filter: (a IS NOT TRUE) +(5 rows) explain (costs off) select * from boolpart where a is not true and a is not false; - QUERY PLAN --------------------------- - Result - One-Time Filter: false -(2 rows) + QUERY PLAN +---------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Result + One-Time Filter: false +(4 rows) explain (costs off) select * from boolpart where a is unknown; - QUERY PLAN ------------------------------------- - Append - -> Seq Scan on boolpart_f - Filter: (a IS UNKNOWN) - -> Seq Scan on boolpart_t - Filter: (a IS UNKNOWN) - -> Seq Scan on boolpart_default - Filter: (a IS UNKNOWN) -(7 rows) + QUERY PLAN +------------------------------------------ + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on boolpart_f + Filter: (a IS UNKNOWN) + -> Seq Scan on boolpart_t + Filter: (a IS UNKNOWN) + -> Seq Scan on boolpart_default + Filter: (a IS UNKNOWN) +(9 rows) explain (costs off) select * from boolpart where a is not unknown; - QUERY PLAN ------------------------------------- - Append - -> Seq Scan on boolpart_f - Filter: (a IS NOT UNKNOWN) - -> Seq Scan on boolpart_t - Filter: (a IS NOT UNKNOWN) - -> Seq Scan on boolpart_default - Filter: (a IS NOT UNKNOWN) -(7 rows) + QUERY PLAN 
+------------------------------------------ + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on boolpart_f + Filter: (a IS NOT UNKNOWN) + -> Seq Scan on boolpart_t + Filter: (a IS NOT UNKNOWN) + -> Seq Scan on boolpart_default + Filter: (a IS NOT UNKNOWN) +(9 rows) create table boolrangep (a bool, b bool, c int) partition by range (a,b,c); create table boolrangep_tf partition of boolrangep for values from ('true', 'false', 0) to ('true', 'false', 100); @@ -1133,11 +1289,14 @@ create table boolrangep_ff1 partition of boolrangep for values from ('false', 'f create table boolrangep_ff2 partition of boolrangep for values from ('false', 'false', 50) to ('false', 'false', 100); -- try a more complex case that's been known to trip up pruning in the past explain (costs off) select * from boolrangep where not a and not b and c = 25; - QUERY PLAN ----------------------------------------------- - Seq Scan on boolrangep_ff1 - Filter: ((NOT a) AND (NOT b) AND (c = 25)) -(2 rows) + QUERY PLAN +---------------------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on boolrangep_ff1 + Filter: ((NOT a) AND (NOT b) AND (c = 25)) +(5 rows) -- test scalar-to-array operators create table coercepart (a varchar) partition by list (a); @@ -1145,64 +1304,74 @@ create table coercepart_ab partition of coercepart for values in ('ab'); create table coercepart_bc partition of coercepart for values in ('bc'); create table coercepart_cd partition of coercepart for values in ('cd'); explain (costs off) select * from coercepart where a in ('ab', to_char(125, '999')); - QUERY PLAN ------------------------------------------------------------------------------------------------------------------------------- - Append - -> Seq Scan on coercepart_ab - Filter: ((a)::text = ANY ((ARRAY['ab'::character varying, (to_char(125, '999'::text))::character varying])::text[])) - -> Seq Scan on coercepart_bc - Filter: ((a)::text = ANY ((ARRAY['ab'::character varying, (to_char(125, '999'::text))::character varying])::text[])) - -> Seq Scan on coercepart_cd - Filter: ((a)::text = ANY ((ARRAY['ab'::character varying, (to_char(125, '999'::text))::character varying])::text[])) -(7 rows) + QUERY PLAN +------------------------------------------------------------------------------------------------ + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on coercepart_ab + Filter: ((a)::text = ANY (ARRAY['ab'::text, (to_char(125, '999'::text))::text])) + -> Seq Scan on coercepart_bc + Filter: ((a)::text = ANY (ARRAY['ab'::text, (to_char(125, '999'::text))::text])) + -> Seq Scan on coercepart_cd + Filter: ((a)::text = ANY (ARRAY['ab'::text, (to_char(125, '999'::text))::text])) +(9 rows) explain (costs off) select * from coercepart where a ~ any ('{ab}'); - QUERY PLAN ----------------------------------------------------- - Append - -> Seq Scan on coercepart_ab - Filter: ((a)::text ~ ANY ('{ab}'::text[])) - -> Seq Scan on coercepart_bc - Filter: ((a)::text ~ ANY ('{ab}'::text[])) - -> Seq Scan on coercepart_cd - Filter: ((a)::text ~ ANY ('{ab}'::text[])) -(7 rows) + QUERY PLAN +---------------------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on coercepart_ab + Filter: ((a)::text ~ ANY ('{ab}'::text[])) + -> Seq Scan on coercepart_bc + Filter: ((a)::text ~ ANY ('{ab}'::text[])) + -> Seq Scan on coercepart_cd + Filter: 
((a)::text ~ ANY ('{ab}'::text[])) +(9 rows) explain (costs off) select * from coercepart where a !~ all ('{ab}'); - QUERY PLAN ------------------------------------------------------ - Append - -> Seq Scan on coercepart_ab - Filter: ((a)::text !~ ALL ('{ab}'::text[])) - -> Seq Scan on coercepart_bc - Filter: ((a)::text !~ ALL ('{ab}'::text[])) - -> Seq Scan on coercepart_cd - Filter: ((a)::text !~ ALL ('{ab}'::text[])) -(7 rows) + QUERY PLAN +----------------------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on coercepart_ab + Filter: ((a)::text !~ ALL ('{ab}'::text[])) + -> Seq Scan on coercepart_bc + Filter: ((a)::text !~ ALL ('{ab}'::text[])) + -> Seq Scan on coercepart_cd + Filter: ((a)::text !~ ALL ('{ab}'::text[])) +(9 rows) explain (costs off) select * from coercepart where a ~ any ('{ab,bc}'); - QUERY PLAN -------------------------------------------------------- - Append - -> Seq Scan on coercepart_ab - Filter: ((a)::text ~ ANY ('{ab,bc}'::text[])) - -> Seq Scan on coercepart_bc - Filter: ((a)::text ~ ANY ('{ab,bc}'::text[])) - -> Seq Scan on coercepart_cd - Filter: ((a)::text ~ ANY ('{ab,bc}'::text[])) -(7 rows) + QUERY PLAN +------------------------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on coercepart_ab + Filter: ((a)::text ~ ANY ('{ab,bc}'::text[])) + -> Seq Scan on coercepart_bc + Filter: ((a)::text ~ ANY ('{ab,bc}'::text[])) + -> Seq Scan on coercepart_cd + Filter: ((a)::text ~ ANY ('{ab,bc}'::text[])) +(9 rows) explain (costs off) select * from coercepart where a !~ all ('{ab,bc}'); - QUERY PLAN --------------------------------------------------------- - Append - -> Seq Scan on coercepart_ab - Filter: ((a)::text !~ ALL ('{ab,bc}'::text[])) - -> Seq Scan on coercepart_bc - Filter: ((a)::text !~ ALL ('{ab,bc}'::text[])) - -> Seq Scan on coercepart_cd - Filter: ((a)::text !~ ALL ('{ab,bc}'::text[])) -(7 rows) + QUERY PLAN +-------------------------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on coercepart_ab + Filter: ((a)::text !~ ALL ('{ab,bc}'::text[])) + -> Seq Scan on coercepart_bc + Filter: ((a)::text !~ ALL ('{ab,bc}'::text[])) + -> Seq Scan on coercepart_cd + Filter: ((a)::text !~ ALL ('{ab,bc}'::text[])) +(9 rows) drop table coercepart; CREATE TABLE part (a INT, b INT) PARTITION BY LIST (a); @@ -1211,14 +1380,16 @@ CREATE TABLE part_p2 PARTITION OF part DEFAULT PARTITION BY RANGE(a); CREATE TABLE part_p2_p1 PARTITION OF part_p2 DEFAULT; INSERT INTO part VALUES (-1,-1), (1,1), (2,NULL), (NULL,-2),(NULL,NULL); EXPLAIN (COSTS OFF) SELECT tableoid::regclass as part, a, b FROM part WHERE a IS NULL ORDER BY 1, 2, 3; - QUERY PLAN ---------------------------------------------------------------------------- - Sort - Sort Key: ((part_p2_p1.tableoid)::regclass), part_p2_p1.a, part_p2_p1.b - -> Append - -> Seq Scan on part_p2_p1 - Filter: (a IS NULL) -(5 rows) + QUERY PLAN +--------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: ((part_p2_p1.tableoid)::regclass), part_p2_p1.a, part_p2_p1.b + -> Result + -> Append + -> Seq Scan on part_p2_p1 + Filter: (a IS NULL) +(7 rows) -- -- some more cases @@ -1228,78 +1399,90 @@ EXPLAIN (COSTS OFF) SELECT tableoid::regclass as part, a, b FROM part WHERE a IS -- -- pruning won't work for mc3p, 
because some keys are Params explain (costs off) select * from mc2p t1, lateral (select count(*) from mc3p t2 where t2.a = t1.b and abs(t2.b) = 1 and t2.c = 1) s where t1.a = 1; - QUERY PLAN ------------------------------------------------------------------------ + QUERY PLAN +-------------------------------------------------------------------------------------- Nested Loop - -> Append - -> Seq Scan on mc2p1 t1 - Filter: (a = 1) - -> Seq Scan on mc2p2 t1_1 - Filter: (a = 1) - -> Seq Scan on mc2p_default t1_2 - Filter: (a = 1) - -> Aggregate + -> Remote Subquery Scan on all (datanode_1) -> Append - -> Seq Scan on mc3p0 t2 - Filter: ((a = t1.b) AND (c = 1) AND (abs(b) = 1)) - -> Seq Scan on mc3p1 t2_1 - Filter: ((a = t1.b) AND (c = 1) AND (abs(b) = 1)) - -> Seq Scan on mc3p2 t2_2 - Filter: ((a = t1.b) AND (c = 1) AND (abs(b) = 1)) - -> Seq Scan on mc3p3 t2_3 - Filter: ((a = t1.b) AND (c = 1) AND (abs(b) = 1)) - -> Seq Scan on mc3p4 t2_4 - Filter: ((a = t1.b) AND (c = 1) AND (abs(b) = 1)) - -> Seq Scan on mc3p5 t2_5 - Filter: ((a = t1.b) AND (c = 1) AND (abs(b) = 1)) - -> Seq Scan on mc3p6 t2_6 - Filter: ((a = t1.b) AND (c = 1) AND (abs(b) = 1)) - -> Seq Scan on mc3p7 t2_7 - Filter: ((a = t1.b) AND (c = 1) AND (abs(b) = 1)) - -> Seq Scan on mc3p_default t2_8 - Filter: ((a = t1.b) AND (c = 1) AND (abs(b) = 1)) -(28 rows) + -> Seq Scan on mc2p1 t1 + Filter: (a = 1) + -> Seq Scan on mc2p2 t1_1 + Filter: (a = 1) + -> Seq Scan on mc2p_default t1_2 + Filter: (a = 1) + -> Materialize + -> Finalize Aggregate + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Partial Aggregate + -> Append + -> Seq Scan on mc3p0 t2 + Filter: ((a = b) AND (c = 1) AND (abs(b) = 1)) + -> Seq Scan on mc3p1 t2_1 + Filter: ((a = b) AND (c = 1) AND (abs(b) = 1)) + -> Seq Scan on mc3p2 t2_2 + Filter: ((a = b) AND (c = 1) AND (abs(b) = 1)) + -> Seq Scan on mc3p3 t2_3 + Filter: ((a = b) AND (c = 1) AND (abs(b) = 1)) + -> Seq Scan on mc3p4 t2_4 + Filter: ((a = b) AND (c = 1) AND (abs(b) = 1)) + -> Seq Scan on mc3p5 t2_5 + Filter: ((a = b) AND (c = 1) AND (abs(b) = 1)) + -> Seq Scan on mc3p6 t2_6 + Filter: ((a = b) AND (c = 1) AND (abs(b) = 1)) + -> Seq Scan on mc3p7 t2_7 + Filter: ((a = b) AND (c = 1) AND (abs(b) = 1)) + -> Seq Scan on mc3p_default t2_8 + Filter: ((a = b) AND (c = 1) AND (abs(b) = 1)) +(32 rows) -- pruning should work fine, because values for a prefix of keys (a, b) are -- available explain (costs off) select * from mc2p t1, lateral (select count(*) from mc3p t2 where t2.c = t1.b and abs(t2.b) = 1 and t2.a = 1) s where t1.a = 1; - QUERY PLAN ------------------------------------------------------------------------ + QUERY PLAN +-------------------------------------------------------------------------------------- Nested Loop - -> Append - -> Seq Scan on mc2p1 t1 - Filter: (a = 1) - -> Seq Scan on mc2p2 t1_1 - Filter: (a = 1) - -> Seq Scan on mc2p_default t1_2 - Filter: (a = 1) - -> Aggregate + -> Remote Subquery Scan on all (datanode_1) -> Append - -> Seq Scan on mc3p0 t2 - Filter: ((c = t1.b) AND (a = 1) AND (abs(b) = 1)) - -> Seq Scan on mc3p1 t2_1 - Filter: ((c = t1.b) AND (a = 1) AND (abs(b) = 1)) - -> Seq Scan on mc3p_default t2_2 - Filter: ((c = t1.b) AND (a = 1) AND (abs(b) = 1)) -(16 rows) + -> Seq Scan on mc2p1 t1 + Filter: (a = 1) + -> Seq Scan on mc2p2 t1_1 + Filter: (a = 1) + -> Seq Scan on mc2p_default t1_2 + Filter: (a = 1) + -> Materialize + -> Finalize Aggregate + -> Remote Subquery Scan on all (datanode_1) + -> Partial Aggregate + -> Append + -> Seq Scan on mc3p0 t2 + Filter: ((c = b) 
AND (a = 1) AND (abs(b) = 1)) + -> Seq Scan on mc3p1 t2_1 + Filter: ((c = b) AND (a = 1) AND (abs(b) = 1)) + -> Seq Scan on mc3p_default t2_2 + Filter: ((c = b) AND (a = 1) AND (abs(b) = 1)) +(20 rows) -- also here, because values for all keys are provided explain (costs off) select * from mc2p t1, lateral (select count(*) from mc3p t2 where t2.a = 1 and abs(t2.b) = 1 and t2.c = 1) s where t1.a = 1; - QUERY PLAN --------------------------------------------------------------------- + QUERY PLAN +-------------------------------------------------------------------------------- Nested Loop - -> Aggregate - -> Append - -> Seq Scan on mc3p1 t2 - Filter: ((a = 1) AND (c = 1) AND (abs(b) = 1)) - -> Append - -> Seq Scan on mc2p1 t1 - Filter: (a = 1) - -> Seq Scan on mc2p2 t1_1 - Filter: (a = 1) - -> Seq Scan on mc2p_default t1_2 - Filter: (a = 1) -(12 rows) + -> Finalize Aggregate + -> Remote Subquery Scan on all (datanode_1) + -> Partial Aggregate + -> Append + -> Seq Scan on mc3p1 t2 + Filter: ((a = 1) AND (c = 1) AND (abs(b) = 1)) + -> Materialize + -> Remote Subquery Scan on all (datanode_1) + -> Append + -> Seq Scan on mc2p1 t1 + Filter: (a = 1) + -> Seq Scan on mc2p2 t1_1 + Filter: (a = 1) + -> Seq Scan on mc2p_default t1_2 + Filter: (a = 1) +(16 rows) -- -- pruning with clauses containing <> operator @@ -1310,82 +1493,94 @@ create table rp0 partition of rp for values from (minvalue) to (1); create table rp1 partition of rp for values from (1) to (2); create table rp2 partition of rp for values from (2) to (maxvalue); explain (costs off) select * from rp where a <> 1; - QUERY PLAN --------------------------- - Append - -> Seq Scan on rp0 - Filter: (a <> 1) - -> Seq Scan on rp1 - Filter: (a <> 1) - -> Seq Scan on rp2 - Filter: (a <> 1) -(7 rows) + QUERY PLAN +---------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on rp0 + Filter: (a <> 1) + -> Seq Scan on rp1 + Filter: (a <> 1) + -> Seq Scan on rp2 + Filter: (a <> 1) +(9 rows) explain (costs off) select * from rp where a <> 1 and a <> 2; - QUERY PLAN ------------------------------------------ - Append - -> Seq Scan on rp0 - Filter: ((a <> 1) AND (a <> 2)) - -> Seq Scan on rp1 - Filter: ((a <> 1) AND (a <> 2)) - -> Seq Scan on rp2 - Filter: ((a <> 1) AND (a <> 2)) -(7 rows) + QUERY PLAN +----------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on rp0 + Filter: ((a <> 1) AND (a <> 2)) + -> Seq Scan on rp1 + Filter: ((a <> 1) AND (a <> 2)) + -> Seq Scan on rp2 + Filter: ((a <> 1) AND (a <> 2)) +(9 rows) -- null partition should be eliminated due to strict <> clause. 
explain (costs off) select * from lp where a <> 'a'; - QUERY PLAN ------------------------------------- - Append - -> Seq Scan on lp_ad - Filter: (a <> 'a'::bpchar) - -> Seq Scan on lp_bc - Filter: (a <> 'a'::bpchar) - -> Seq Scan on lp_ef - Filter: (a <> 'a'::bpchar) - -> Seq Scan on lp_g - Filter: (a <> 'a'::bpchar) - -> Seq Scan on lp_default - Filter: (a <> 'a'::bpchar) -(11 rows) + QUERY PLAN +------------------------------------------ + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on lp_ad + Filter: (a <> 'a'::bpchar) + -> Seq Scan on lp_bc + Filter: (a <> 'a'::bpchar) + -> Seq Scan on lp_ef + Filter: (a <> 'a'::bpchar) + -> Seq Scan on lp_g + Filter: (a <> 'a'::bpchar) + -> Seq Scan on lp_default + Filter: (a <> 'a'::bpchar) +(13 rows) -- ensure we detect contradictions in clauses; a can't be NULL and NOT NULL. explain (costs off) select * from lp where a <> 'a' and a is null; - QUERY PLAN --------------------------- - Result - One-Time Filter: false -(2 rows) + QUERY PLAN +---------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Result + One-Time Filter: false +(4 rows) explain (costs off) select * from lp where (a <> 'a' and a <> 'd') or a is null; - QUERY PLAN ------------------------------------------------------------------------------- - Append - -> Seq Scan on lp_bc - Filter: (((a <> 'a'::bpchar) AND (a <> 'd'::bpchar)) OR (a IS NULL)) - -> Seq Scan on lp_ef - Filter: (((a <> 'a'::bpchar) AND (a <> 'd'::bpchar)) OR (a IS NULL)) - -> Seq Scan on lp_g - Filter: (((a <> 'a'::bpchar) AND (a <> 'd'::bpchar)) OR (a IS NULL)) - -> Seq Scan on lp_null - Filter: (((a <> 'a'::bpchar) AND (a <> 'd'::bpchar)) OR (a IS NULL)) - -> Seq Scan on lp_default - Filter: (((a <> 'a'::bpchar) AND (a <> 'd'::bpchar)) OR (a IS NULL)) -(11 rows) + QUERY PLAN +------------------------------------------------------------------------------------ + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on lp_bc + Filter: (((a <> 'a'::bpchar) AND (a <> 'd'::bpchar)) OR (a IS NULL)) + -> Seq Scan on lp_ef + Filter: (((a <> 'a'::bpchar) AND (a <> 'd'::bpchar)) OR (a IS NULL)) + -> Seq Scan on lp_g + Filter: (((a <> 'a'::bpchar) AND (a <> 'd'::bpchar)) OR (a IS NULL)) + -> Seq Scan on lp_null + Filter: (((a <> 'a'::bpchar) AND (a <> 'd'::bpchar)) OR (a IS NULL)) + -> Seq Scan on lp_default + Filter: (((a <> 'a'::bpchar) AND (a <> 'd'::bpchar)) OR (a IS NULL)) +(13 rows) -- check that it also works for a partitioned table that's not root, -- which in this case are partitions of rlp that are themselves -- list-partitioned on b explain (costs off) select * from rlp where a = 15 and b <> 'ab' and b <> 'cd' and b <> 'xy' and b is not null; - QUERY PLAN ------------------------------------------------------------------------------------------------------------------------------------------- - Append - -> Seq Scan on rlp3efgh - Filter: ((b IS NOT NULL) AND ((b)::text <> 'ab'::text) AND ((b)::text <> 'cd'::text) AND ((b)::text <> 'xy'::text) AND (a = 15)) - -> Seq Scan on rlp3_default - Filter: ((b IS NOT NULL) AND ((b)::text <> 'ab'::text) AND ((b)::text <> 'cd'::text) AND ((b)::text <> 'xy'::text) AND (a = 15)) -(5 rows) + QUERY PLAN +------------------------------------------------------------------------------------------------------------------------------------------------ + Remote Fast Query Execution + Node/s: datanode_1 + -> Append + -> Seq Scan on rlp3efgh + Filter: ((b IS NOT NULL) AND 
((b)::text <> 'ab'::text) AND ((b)::text <> 'cd'::text) AND ((b)::text <> 'xy'::text) AND (a = 15)) + -> Seq Scan on rlp3_default + Filter: ((b IS NOT NULL) AND ((b)::text <> 'ab'::text) AND ((b)::text <> 'cd'::text) AND ((b)::text <> 'xy'::text) AND (a = 15)) +(7 rows) -- -- different collations for different keys with same expression @@ -1396,36 +1591,42 @@ create table coll_pruning_multi2 partition of coll_pruning_multi for values from create table coll_pruning_multi3 partition of coll_pruning_multi for values from ('b', 'a') to ('b', 'e'); -- no pruning, because no value for the leading key explain (costs off) select * from coll_pruning_multi where substr(a, 1) = 'e' collate "C"; - QUERY PLAN --------------------------------------------------------- - Append - -> Seq Scan on coll_pruning_multi1 - Filter: (substr(a, 1) = 'e'::text COLLATE "C") - -> Seq Scan on coll_pruning_multi2 - Filter: (substr(a, 1) = 'e'::text COLLATE "C") - -> Seq Scan on coll_pruning_multi3 - Filter: (substr(a, 1) = 'e'::text COLLATE "C") -(7 rows) + QUERY PLAN +-------------------------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on coll_pruning_multi1 + Filter: (substr(a, 1) = 'e'::text COLLATE "C") + -> Seq Scan on coll_pruning_multi2 + Filter: (substr(a, 1) = 'e'::text COLLATE "C") + -> Seq Scan on coll_pruning_multi3 + Filter: (substr(a, 1) = 'e'::text COLLATE "C") +(9 rows) -- pruning, with a value provided for the leading key explain (costs off) select * from coll_pruning_multi where substr(a, 1) = 'a' collate "POSIX"; - QUERY PLAN ------------------------------------------------------------- - Append - -> Seq Scan on coll_pruning_multi1 - Filter: (substr(a, 1) = 'a'::text COLLATE "POSIX") - -> Seq Scan on coll_pruning_multi2 - Filter: (substr(a, 1) = 'a'::text COLLATE "POSIX") -(5 rows) + QUERY PLAN +------------------------------------------------------------------ + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on coll_pruning_multi1 + Filter: (substr(a, 1) = 'a'::text COLLATE "POSIX") + -> Seq Scan on coll_pruning_multi2 + Filter: (substr(a, 1) = 'a'::text COLLATE "POSIX") +(7 rows) -- pruning, with values provided for both keys explain (costs off) select * from coll_pruning_multi where substr(a, 1) = 'e' collate "C" and substr(a, 1) = 'a' collate "POSIX"; - QUERY PLAN ---------------------------------------------------------------------------------------------------------- - Append - -> Seq Scan on coll_pruning_multi2 - Filter: ((substr(a, 1) = 'e'::text COLLATE "C") AND (substr(a, 1) = 'a'::text COLLATE "POSIX")) -(3 rows) + QUERY PLAN +--------------------------------------------------------------------------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on coll_pruning_multi2 + Filter: ((substr(a, 1) = 'e'::text COLLATE "C") AND (substr(a, 1) = 'a'::text COLLATE "POSIX")) +(5 rows) -- -- LIKE operators don't prune @@ -1434,14 +1635,16 @@ create table like_op_noprune (a text) partition by list (a); create table like_op_noprune1 partition of like_op_noprune for values in ('ABC'); create table like_op_noprune2 partition of like_op_noprune for values in ('BCD'); explain (costs off) select * from like_op_noprune where a like '%BC'; - QUERY PLAN ------------------------------------- - Append - -> Seq Scan on like_op_noprune1 - Filter: (a ~~ '%BC'::text) - -> Seq Scan on like_op_noprune2 - Filter: (a ~~ 
'%BC'::text) -(5 rows) + QUERY PLAN +------------------------------------------ + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on like_op_noprune1 + Filter: (a ~~ '%BC'::text) + -> Seq Scan on like_op_noprune2 + Filter: (a ~~ '%BC'::text) +(7 rows) -- -- tests wherein clause value requires a cross-type comparison function @@ -1450,32 +1653,31 @@ create table lparted_by_int2 (a smallint) partition by list (a); create table lparted_by_int2_1 partition of lparted_by_int2 for values in (1); create table lparted_by_int2_16384 partition of lparted_by_int2 for values in (16384); explain (costs off) select * from lparted_by_int2 where a = 100000000000000; - QUERY PLAN --------------------------- - Result - One-Time Filter: false -(2 rows) - +ERROR: smallint out of range create table rparted_by_int2 (a smallint) partition by range (a); create table rparted_by_int2_1 partition of rparted_by_int2 for values from (1) to (10); create table rparted_by_int2_16384 partition of rparted_by_int2 for values from (10) to (16384); -- all partitions pruned explain (costs off) select * from rparted_by_int2 where a > 100000000000000; - QUERY PLAN --------------------------- - Result - One-Time Filter: false -(2 rows) + QUERY PLAN +---------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Result + One-Time Filter: false +(4 rows) create table rparted_by_int2_maxvalue partition of rparted_by_int2 for values from (16384) to (maxvalue); -- all partitions but rparted_by_int2_maxvalue pruned explain (costs off) select * from rparted_by_int2 where a > 100000000000000; - QUERY PLAN -------------------------------------------------- - Append - -> Seq Scan on rparted_by_int2_maxvalue - Filter: (a > '100000000000000'::bigint) -(3 rows) + QUERY PLAN +------------------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on rparted_by_int2_maxvalue + Filter: (a > '100000000000000'::bigint) +(5 rows) drop table lp, coll_pruning, rlp, mc3p, mc2p, boolpart, boolrangep, rp, coll_pruning_multi, like_op_noprune, lparted_by_int2, rparted_by_int2; -- @@ -1493,15 +1695,10 @@ create table mc3p2 partition of mc3p insert into mc3p values (0, 1, 1), (1, 1, 1), (2, 1, 1); explain (analyze, costs off, summary off, timing off) select * from mc3p where a < 3 and abs(b) = 1; - QUERY PLAN -------------------------------------------------- - Append (actual rows=3 loops=1) - -> Seq Scan on mc3p0 (actual rows=1 loops=1) - Filter: ((a < 3) AND (abs(b) = 1)) - -> Seq Scan on mc3p1 (actual rows=1 loops=1) - Filter: ((a < 3) AND (abs(b) = 1)) - -> Seq Scan on mc3p2 (actual rows=1 loops=1) - Filter: ((a < 3) AND (abs(b) = 1)) -(7 rows) + QUERY PLAN +----------------------------------------------------- + Remote Fast Query Execution (actual rows=3 loops=1) + Node/s: datanode_1, datanode_2 +(2 rows) drop table mc3p; diff --git a/src/test/regress/expected/partition_prune_hash.out b/src/test/regress/expected/partition_prune_hash.out index fbba3f1f..60122369 100644 --- a/src/test/regress/expected/partition_prune_hash.out +++ b/src/test/regress/expected/partition_prune_hash.out @@ -27,163 +27,191 @@ select tableoid::regclass, * from hp order by 1; -- partial keys won't prune, nor would non-equality conditions explain (costs off) select * from hp where a = 1; - QUERY PLAN -------------------------- - Append - -> Seq Scan on hp0 - Filter: (a = 1) - -> Seq Scan on hp1 - Filter: (a = 1) - -> Seq Scan on 
hp2 - Filter: (a = 1) - -> Seq Scan on hp3 - Filter: (a = 1) -(9 rows) + QUERY PLAN +------------------------------- + Remote Fast Query Execution + Node/s: datanode_1 + -> Append + -> Seq Scan on hp0 + Filter: (a = 1) + -> Seq Scan on hp1 + Filter: (a = 1) + -> Seq Scan on hp2 + Filter: (a = 1) + -> Seq Scan on hp3 + Filter: (a = 1) +(11 rows) explain (costs off) select * from hp where b = 'xxx'; - QUERY PLAN ------------------------------------ - Append - -> Seq Scan on hp0 - Filter: (b = 'xxx'::text) - -> Seq Scan on hp1 - Filter: (b = 'xxx'::text) - -> Seq Scan on hp2 - Filter: (b = 'xxx'::text) - -> Seq Scan on hp3 - Filter: (b = 'xxx'::text) -(9 rows) + QUERY PLAN +----------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on hp0 + Filter: (b = 'xxx'::text) + -> Seq Scan on hp1 + Filter: (b = 'xxx'::text) + -> Seq Scan on hp2 + Filter: (b = 'xxx'::text) + -> Seq Scan on hp3 + Filter: (b = 'xxx'::text) +(11 rows) explain (costs off) select * from hp where a is null; - QUERY PLAN ------------------------------ - Append - -> Seq Scan on hp0 - Filter: (a IS NULL) - -> Seq Scan on hp1 - Filter: (a IS NULL) - -> Seq Scan on hp2 - Filter: (a IS NULL) - -> Seq Scan on hp3 - Filter: (a IS NULL) -(9 rows) + QUERY PLAN +----------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on hp0 + Filter: (a IS NULL) + -> Seq Scan on hp1 + Filter: (a IS NULL) + -> Seq Scan on hp2 + Filter: (a IS NULL) + -> Seq Scan on hp3 + Filter: (a IS NULL) +(11 rows) explain (costs off) select * from hp where b is null; - QUERY PLAN ------------------------------ - Append - -> Seq Scan on hp0 - Filter: (b IS NULL) - -> Seq Scan on hp1 - Filter: (b IS NULL) - -> Seq Scan on hp2 - Filter: (b IS NULL) - -> Seq Scan on hp3 - Filter: (b IS NULL) -(9 rows) + QUERY PLAN +----------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on hp0 + Filter: (b IS NULL) + -> Seq Scan on hp1 + Filter: (b IS NULL) + -> Seq Scan on hp2 + Filter: (b IS NULL) + -> Seq Scan on hp3 + Filter: (b IS NULL) +(11 rows) explain (costs off) select * from hp where a < 1 and b = 'xxx'; - QUERY PLAN -------------------------------------------------- - Append - -> Seq Scan on hp0 - Filter: ((a < 1) AND (b = 'xxx'::text)) - -> Seq Scan on hp1 - Filter: ((a < 1) AND (b = 'xxx'::text)) - -> Seq Scan on hp2 - Filter: ((a < 1) AND (b = 'xxx'::text)) - -> Seq Scan on hp3 - Filter: ((a < 1) AND (b = 'xxx'::text)) -(9 rows) + QUERY PLAN +------------------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on hp0 + Filter: ((a < 1) AND (b = 'xxx'::text)) + -> Seq Scan on hp1 + Filter: ((a < 1) AND (b = 'xxx'::text)) + -> Seq Scan on hp2 + Filter: ((a < 1) AND (b = 'xxx'::text)) + -> Seq Scan on hp3 + Filter: ((a < 1) AND (b = 'xxx'::text)) +(11 rows) explain (costs off) select * from hp where a <> 1 and b = 'yyy'; - QUERY PLAN --------------------------------------------------- - Append - -> Seq Scan on hp0 - Filter: ((a <> 1) AND (b = 'yyy'::text)) - -> Seq Scan on hp1 - Filter: ((a <> 1) AND (b = 'yyy'::text)) - -> Seq Scan on hp2 - Filter: ((a <> 1) AND (b = 'yyy'::text)) - -> Seq Scan on hp3 - Filter: ((a <> 1) AND (b = 'yyy'::text)) -(9 rows) + QUERY PLAN +-------------------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> 
Append + -> Seq Scan on hp0 + Filter: ((a <> 1) AND (b = 'yyy'::text)) + -> Seq Scan on hp1 + Filter: ((a <> 1) AND (b = 'yyy'::text)) + -> Seq Scan on hp2 + Filter: ((a <> 1) AND (b = 'yyy'::text)) + -> Seq Scan on hp3 + Filter: ((a <> 1) AND (b = 'yyy'::text)) +(11 rows) -- pruning should work if non-null values are provided for all the keys explain (costs off) select * from hp where a is null and b is null; - QUERY PLAN ------------------------------------------------ - Append - -> Seq Scan on hp0 - Filter: ((a IS NULL) AND (b IS NULL)) -(3 rows) + QUERY PLAN +----------------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on hp0 + Filter: ((a IS NULL) AND (b IS NULL)) +(5 rows) explain (costs off) select * from hp where a = 1 and b is null; - QUERY PLAN -------------------------------------------- - Append - -> Seq Scan on hp0 - Filter: ((b IS NULL) AND (a = 1)) -(3 rows) - -explain (costs off) select * from hp where a = 1 and b = 'xxx'; QUERY PLAN ------------------------------------------------- - Append - -> Seq Scan on hp0 - Filter: ((a = 1) AND (b = 'xxx'::text)) -(3 rows) + Remote Fast Query Execution + Node/s: datanode_1 + -> Append + -> Seq Scan on hp0 + Filter: ((b IS NULL) AND (a = 1)) +(5 rows) + +explain (costs off) select * from hp where a = 1 and b = 'xxx'; + QUERY PLAN +------------------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1 + -> Append + -> Seq Scan on hp0 + Filter: ((a = 1) AND (b = 'xxx'::text)) +(5 rows) explain (costs off) select * from hp where a is null and b = 'xxx'; - QUERY PLAN ------------------------------------------------------ - Append - -> Seq Scan on hp1 - Filter: ((a IS NULL) AND (b = 'xxx'::text)) -(3 rows) + QUERY PLAN +----------------------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on hp1 + Filter: ((a IS NULL) AND (b = 'xxx'::text)) +(5 rows) explain (costs off) select * from hp where a = 10 and b = 'xxx'; - QUERY PLAN --------------------------------------------------- - Append - -> Seq Scan on hp2 - Filter: ((a = 10) AND (b = 'xxx'::text)) -(3 rows) + QUERY PLAN +-------------------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_2 + -> Append + -> Seq Scan on hp2 + Filter: ((a = 10) AND (b = 'xxx'::text)) +(5 rows) explain (costs off) select * from hp where a = 10 and b = 'yyy'; - QUERY PLAN --------------------------------------------------- - Append - -> Seq Scan on hp3 - Filter: ((a = 10) AND (b = 'yyy'::text)) -(3 rows) + QUERY PLAN +-------------------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_2 + -> Append + -> Seq Scan on hp3 + Filter: ((a = 10) AND (b = 'yyy'::text)) +(5 rows) explain (costs off) select * from hp where (a = 10 and b = 'yyy') or (a = 10 and b = 'xxx') or (a is null and b is null); - QUERY PLAN -------------------------------------------------------------------------------------------------------------------------- - Append - -> Seq Scan on hp0 - Filter: (((a = 10) AND (b = 'yyy'::text)) OR ((a = 10) AND (b = 'xxx'::text)) OR ((a IS NULL) AND (b IS NULL))) - -> Seq Scan on hp2 - Filter: (((a = 10) AND (b = 'yyy'::text)) OR ((a = 10) AND (b = 'xxx'::text)) OR ((a IS NULL) AND (b IS NULL))) - -> Seq Scan on hp3 - Filter: (((a = 10) AND (b = 'yyy'::text)) OR ((a = 10) AND (b = 'xxx'::text)) OR ((a IS NULL) AND (b IS NULL))) -(7 rows) + 
QUERY PLAN +------------------------------------------------------------------------------------------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on hp0 + Filter: (((a = 10) AND (b = 'yyy'::text)) OR ((a = 10) AND (b = 'xxx'::text)) OR ((a IS NULL) AND (b IS NULL))) + -> Seq Scan on hp2 + Filter: (((a = 10) AND (b = 'yyy'::text)) OR ((a = 10) AND (b = 'xxx'::text)) OR ((a IS NULL) AND (b IS NULL))) + -> Seq Scan on hp3 + Filter: (((a = 10) AND (b = 'yyy'::text)) OR ((a = 10) AND (b = 'xxx'::text)) OR ((a IS NULL) AND (b IS NULL))) +(9 rows) -- hash partitiong pruning doesn't occur with <> operator clauses explain (costs off) select * from hp where a <> 1 and b <> 'xxx'; - QUERY PLAN ---------------------------------------------------- - Append - -> Seq Scan on hp0 - Filter: ((a <> 1) AND (b <> 'xxx'::text)) - -> Seq Scan on hp1 - Filter: ((a <> 1) AND (b <> 'xxx'::text)) - -> Seq Scan on hp2 - Filter: ((a <> 1) AND (b <> 'xxx'::text)) - -> Seq Scan on hp3 - Filter: ((a <> 1) AND (b <> 'xxx'::text)) -(9 rows) + QUERY PLAN +--------------------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on hp0 + Filter: ((a <> 1) AND (b <> 'xxx'::text)) + -> Seq Scan on hp1 + Filter: ((a <> 1) AND (b <> 'xxx'::text)) + -> Seq Scan on hp2 + Filter: ((a <> 1) AND (b <> 'xxx'::text)) + -> Seq Scan on hp3 + Filter: ((a <> 1) AND (b <> 'xxx'::text)) +(11 rows) drop table hp; diff --git a/src/test/regress/expected/psql.out b/src/test/regress/expected/psql.out index 3e0eae21..f89bfdda 100644 --- a/src/test/regress/expected/psql.out +++ b/src/test/regress/expected/psql.out @@ -3030,11 +3030,12 @@ insert into parent_tab values (generate_series(30,39)); (1 row) \dPi - List of partitioned indexes - Schema | Name | Owner | On table -----------+--------------+-----------------------+------------ - testpart | parent_index | testrole_partitioning | parent_tab -(1 row) + List of partitioned indexes + Schema | Name | Owner | On table +----------+--------------------+-----------------------+------------- + testpart | child_30_40_id_idx | testrole_partitioning | child_30_40 + testpart | parent_index | testrole_partitioning | parent_tab +(2 rows) \dP testpart.* List of partitioned relations @@ -3047,12 +3048,13 @@ insert into parent_tab values (generate_series(30,39)); (4 rows) \dP - List of partitioned relations - Schema | Name | Owner | Type | On table -----------+--------------+-----------------------+-------------------+------------ - testpart | parent_tab | testrole_partitioning | partitioned table | - testpart | parent_index | testrole_partitioning | partitioned index | parent_tab -(2 rows) + List of partitioned relations + Schema | Name | Owner | Type | On table +----------+--------------------+-----------------------+-------------------+------------- + testpart | parent_tab | testrole_partitioning | partitioned table | + testpart | child_30_40_id_idx | testrole_partitioning | partitioned index | child_30_40 + testpart | parent_index | testrole_partitioning | partitioned index | parent_tab +(3 rows) \dPtn List of partitioned tables @@ -3094,4 +3096,4 @@ drop table parent_tab cascade; drop schema testpart; set search_path to default; set role to default; -drop role testrole_partitioning; \ No newline at end of file +drop role testrole_partitioning; diff --git a/src/test/regress/expected/sanity_check_1.out b/src/test/regress/expected/sanity_check_1.out index 
90bcd228..8b55f563 100644 --- a/src/test/regress/expected/sanity_check_1.out +++ b/src/test/regress/expected/sanity_check_1.out @@ -39,6 +39,8 @@ date_tbl|f default_tbl|f defaultexpr_tbl|f dept|f +donothingbrtrig_test1|f +donothingbrtrig_test2|f dupindexcols|t e_star|f emp|f @@ -77,6 +79,10 @@ mlparted12|f mlparted2|f mlparted3|f mlparted4|f +mlparted_def|f +mlparted_def1|f +mlparted_def2|f +mlparted_defd|f money_data|f num_data|f num_exp_add|t diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out index d098ccb4..65fd3d80 100644 --- a/src/test/regress/expected/sysviews.out +++ b/src/test/regress/expected/sysviews.out @@ -113,10 +113,10 @@ select name, setting from pg_settings where name like 'enable%'; enable_multi_cluster_print | off enable_nestloop | on enable_nestloop_suppression | off - enable_partition_wise_join | off enable_null_string | off enable_oracle_compatible | off enable_parallel_ddl | off + enable_partition_wise_join | off enable_pgbouncer | off enable_plpgsql_debug_print | off enable_pooler_debug_print | on diff --git a/src/test/regress/expected/temp.out b/src/test/regress/expected/temp.out index ee8f251d..82e23805 100644 --- a/src/test/regress/expected/temp.out +++ b/src/test/regress/expected/temp.out @@ -215,7 +215,7 @@ select relname from pg_class where relname like 'temp_parted_oncommit_test%'; -- all rows if partitions preserve their data. begin; create temp table temp_parted_oncommit_test (a int) - partition by list (a) on commit delete rows; + partition by list (a) on commit preserve rows; create temp table temp_parted_oncommit_test1 partition of temp_parted_oncommit_test for values in (1) on commit preserve rows; diff --git a/src/test/regress/expected/truncate.out b/src/test/regress/expected/truncate.out index 168fc0cc..60a6822c 100644 --- a/src/test/regress/expected/truncate.out +++ b/src/test/regress/expected/truncate.out @@ -485,8 +485,8 @@ DROP TABLE truncparted; -- Make sure truncate did execute on all tables CREATE FUNCTION tp_ins_data() RETURNS void LANGUAGE plpgsql AS $$ BEGIN - INSERT INTO truncprim VALUES (1), (100), (150); - INSERT INTO truncpart VALUES (1), (100), (150); + INSERT INTO truncprim VALUES (1), (100), (150); + INSERT INTO truncpart VALUES (1), (100), (150); END $$; CREATE FUNCTION tp_chk_data(OUT pktb regclass, OUT pkval int, OUT fktb regclass, OUT fkval int) @@ -499,17 +499,14 @@ CREATE FUNCTION tp_chk_data(OUT pktb regclass, OUT pkval int, OUT fktb regclass, END $$; CREATE TABLE truncprim (a int PRIMARY KEY); -CREATE TABLE truncpart (a int REFERENCES truncprim) +CREATE TABLE truncpart (a int) PARTITION BY RANGE (a); CREATE TABLE truncpart_1 PARTITION OF truncpart FOR VALUES FROM (0) TO (100); CREATE TABLE truncpart_2 PARTITION OF truncpart FOR VALUES FROM (100) TO (200) PARTITION BY RANGE (a); CREATE TABLE truncpart_2_1 PARTITION OF truncpart_2 FOR VALUES FROM (100) TO (150); CREATE TABLE truncpart_2_d PARTITION OF truncpart_2 DEFAULT; -TRUNCATE TABLE truncprim; -- should fail -ERROR: cannot truncate a table referenced in a foreign key constraint -DETAIL: Table "truncpart" references "truncprim". -HINT: Truncate table "truncpart" at the same time, or use TRUNCATE ... CASCADE. 
+TRUNCATE TABLE truncprim; select tp_ins_data(); tp_ins_data ------------- @@ -530,13 +527,16 @@ select tp_ins_data(); (1 row) -- should truncate everything -SET client_min_messages TO WARNING; -- suppress cascading notices +SET client_min_messages TO WARNING; -- suppress cascading notices TRUNCATE TABLE truncprim CASCADE; RESET client_min_messages; SELECT * FROM tp_chk_data(); - pktb | pkval | fktb | fkval -------+-------+------+------- -(0 rows) + pktb | pkval | fktb | fkval +------+-------+---------------+------- + | | truncpart_1 | 1 + | | truncpart_2_1 | 100 + | | truncpart_2_d | 150 +(3 rows) SELECT tp_ins_data(); tp_ins_data diff --git a/src/test/regress/expected/update.out b/src/test/regress/expected/update.out index ed21a142..bac5f4ed 100644 --- a/src/test/regress/expected/update.out +++ b/src/test/regress/expected/update.out @@ -103,26 +103,26 @@ ERROR: multiple assignments to same column "b" UPDATE update_test SET (b,a) = (select a,b from update_test where b = 41 and c = 'car') WHERE a = 100 AND b = 20; -SELECT * FROM update_test; +SELECT * FROM update_test order by 1; a | b | c -----+----+----- - 100 | 21 | 11 | 41 | car 11 | 42 | car 41 | 11 | + 100 | 21 | (4 rows) -- correlated sub-select: UPDATE update_test o SET (b,a) = (select a+1,b from update_test i where i.a=o.a and i.b=o.b and i.c is not distinct from o.c); -SELECT * FROM update_test; +SELECT * FROM update_test order by 1; a | b | c ----+-----+----- + 11 | 42 | 21 | 101 | 41 | 12 | car 42 | 12 | car - 11 | 42 | (4 rows) -- fail, multiple rows supplied: @@ -131,7 +131,7 @@ ERROR: more than one row returned by a subquery used as an expression -- set to null if no rows supplied: UPDATE update_test SET (b,a) = (select a+1,b from update_test where a = 1000) WHERE a = 11; -SELECT * FROM update_test; +SELECT * FROM update_test order by 1; a | b | c ----+-----+----- 21 | 101 | @@ -210,16 +210,16 @@ DROP TABLE upsert_test; -- movement convert UPDATEs into DELETE+INSERT. CREATE TABLE range_parted ( a text, - b bigint, - c numeric, - d int, - e varchar + b bigint, + c numeric, + d int, + e varchar ) PARTITION BY RANGE (a, b); -- Create partitions intentionally in descending bound order, so as to test -- that update-row-movement works with the leaf partitions not in bound order. -CREATE TABLE part_b_20_b_30 (e varchar, c numeric, a text, b bigint, d int); +CREATE TABLE part_b_20_b_30 (a text, b bigint, c numeric, d int, e varchar); ALTER TABLE range_parted ATTACH PARTITION part_b_20_b_30 FOR VALUES FROM ('b', 20) TO ('b', 30); -CREATE TABLE part_b_10_b_20 (e varchar, c numeric, a text, b bigint, d int) PARTITION BY RANGE (c); +CREATE TABLE part_b_10_b_20 (a text, b bigint, c numeric, d int, e varchar) PARTITION BY RANGE (c); CREATE TABLE part_b_1_b_10 PARTITION OF range_parted FOR VALUES FROM ('b', 1) TO ('b', 10); ALTER TABLE range_parted ATTACH PARTITION part_b_10_b_20 FOR VALUES FROM ('b', 10) TO ('b', 20); CREATE TABLE part_a_10_a_20 PARTITION OF range_parted FOR VALUES FROM ('a', 10) TO ('a', 20); @@ -230,15 +230,11 @@ UPDATE part_b_10_b_20 set b = b - 6; -- Create some more partitions following the above pattern of descending bound -- order, but let's make the situation a bit more complex by having the -- attribute numbers of the columns vary from their parent partition. 
-CREATE TABLE part_c_100_200 (e varchar, c numeric, a text, b bigint, d int) PARTITION BY range (abs(d)); -ALTER TABLE part_c_100_200 DROP COLUMN e, DROP COLUMN c, DROP COLUMN a; -ALTER TABLE part_c_100_200 ADD COLUMN c numeric, ADD COLUMN e varchar, ADD COLUMN a text; -ALTER TABLE part_c_100_200 DROP COLUMN b; -ALTER TABLE part_c_100_200 ADD COLUMN b bigint; +CREATE TABLE part_c_100_200 (a text, b bigint, c numeric, d int, e varchar) PARTITION BY range (abs(d)); CREATE TABLE part_d_1_15 PARTITION OF part_c_100_200 FOR VALUES FROM (1) TO (15); CREATE TABLE part_d_15_20 PARTITION OF part_c_100_200 FOR VALUES FROM (15) TO (20); ALTER TABLE part_b_10_b_20 ATTACH PARTITION part_c_100_200 FOR VALUES FROM (100) TO (200); -CREATE TABLE part_c_1_100 (e varchar, d int, c numeric, b bigint, a text); +CREATE TABLE part_c_1_100 (a text, b bigint, c numeric, d int, e varchar); ALTER TABLE part_b_10_b_20 ATTACH PARTITION part_c_1_100 FOR VALUES FROM (1) TO (100); \set init_range_parted 'truncate range_parted; insert into range_parted VALUES (''a'', 1, 1, 1), (''a'', 10, 200, 1), (''b'', 12, 96, 1), (''b'', 13, 97, 2), (''b'', 15, 105, 16), (''b'', 17, 105, 19)' \set show_data 'select tableoid::regclass::text COLLATE "C" partname, * from range_parted ORDER BY 1, 2, 3, 4, 5, 6' @@ -256,91 +252,57 @@ ALTER TABLE part_b_10_b_20 ATTACH PARTITION part_c_1_100 FOR VALUES FROM (1) TO -- The order of subplans should be in bound order EXPLAIN (costs off) UPDATE range_parted set c = c - 50 WHERE c > 97; - QUERY PLAN -------------------------------------- - Update on range_parted - Update on part_a_1_a_10 - Update on part_a_10_a_20 - Update on part_b_1_b_10 - Update on part_c_1_100 - Update on part_d_1_15 - Update on part_d_15_20 - Update on part_b_20_b_30 - -> Seq Scan on part_a_1_a_10 - Filter: (c > '97'::numeric) - -> Seq Scan on part_a_10_a_20 - Filter: (c > '97'::numeric) - -> Seq Scan on part_b_1_b_10 - Filter: (c > '97'::numeric) - -> Seq Scan on part_c_1_100 - Filter: (c > '97'::numeric) - -> Seq Scan on part_d_1_15 - Filter: (c > '97'::numeric) - -> Seq Scan on part_d_15_20 - Filter: (c > '97'::numeric) - -> Seq Scan on part_b_20_b_30 - Filter: (c > '97'::numeric) -(22 rows) + QUERY PLAN +------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Update on range_parted + Update on part_a_1_a_10 + Update on part_a_10_a_20 + Update on part_b_1_b_10 + Update on part_c_1_100 + Update on part_d_1_15 + Update on part_d_15_20 + Update on part_b_20_b_30 + -> Seq Scan on part_a_1_a_10 + Filter: (c > '97'::numeric) + -> Seq Scan on part_a_10_a_20 + Filter: (c > '97'::numeric) + -> Seq Scan on part_b_1_b_10 + Filter: (c > '97'::numeric) + -> Seq Scan on part_c_1_100 + Filter: (c > '97'::numeric) + -> Seq Scan on part_d_1_15 + Filter: (c > '97'::numeric) + -> Seq Scan on part_d_15_20 + Filter: (c > '97'::numeric) + -> Seq Scan on part_b_20_b_30 + Filter: (c > '97'::numeric) +(24 rows) -- fail, row movement happens only within the partition subtree. UPDATE part_c_100_200 set c = c - 20, d = c WHERE c = 105; ERROR: new row for relation "part_c_100_200" violates partition constraint -DETAIL: Failing row contains (105, 85, null, b, 15). +DETAIL: Failing row contains (b, 15, 85, 105, null). 
-- fail, no partition key update, so no attempt to move tuple, -- but "a = 'a'" violates partition constraint enforced by root partition) UPDATE part_b_10_b_20 set a = 'a'; -ERROR: new row for relation "part_c_1_100" violates partition constraint -DETAIL: Failing row contains (null, 1, 96, 12, a). +ERROR: Distributed column or partition column "a" can't be updated in current version -- ok, partition key update, no constraint violation UPDATE range_parted set d = d - 10 WHERE d > 10; -- ok, no partition key update, no constraint violation UPDATE range_parted set e = d; -- No row found UPDATE part_c_1_100 set c = c + 20 WHERE c = 98; --- ok, row movement -UPDATE part_b_10_b_20 set c = c + 20 returning c, b, a; - c | b | a ------+----+--- - 116 | 12 | b - 117 | 13 | b - 125 | 15 | b - 125 | 17 | b -(4 rows) - :show_data; partname | a | b | c | d | e ----------------+---+----+-----+---+--- part_a_10_a_20 | a | 10 | 200 | 1 | 1 part_a_1_a_10 | a | 1 | 1 | 1 | 1 - part_d_1_15 | b | 12 | 116 | 1 | 1 - part_d_1_15 | b | 13 | 117 | 2 | 2 - part_d_1_15 | b | 15 | 125 | 6 | 6 - part_d_1_15 | b | 17 | 125 | 9 | 9 -(6 rows) - --- fail, row movement happens only within the partition subtree. -UPDATE part_b_10_b_20 set b = b - 6 WHERE c > 116 returning *; -ERROR: new row for relation "part_d_1_15" violates partition constraint -DETAIL: Failing row contains (2, 117, 2, b, 7). --- ok, row movement, with subset of rows moved into different partition. -UPDATE range_parted set b = b - 6 WHERE c > 116 returning a, b + c; - a | ?column? ----+---------- - a | 204 - b | 124 - b | 134 - b | 136 -(4 rows) - -:show_data; - partname | a | b | c | d | e ----------------+---+----+-----+---+--- - part_a_1_a_10 | a | 1 | 1 | 1 | 1 - part_a_1_a_10 | a | 4 | 200 | 1 | 1 - part_b_1_b_10 | b | 7 | 117 | 2 | 2 - part_b_1_b_10 | b | 9 | 125 | 6 | 6 - part_d_1_15 | b | 11 | 125 | 9 | 9 - part_d_1_15 | b | 12 | 116 | 1 | 1 + part_c_1_100 | b | 12 | 96 | 1 | 1 + part_c_1_100 | b | 13 | 97 | 2 | 2 + part_d_1_15 | b | 15 | 105 | 6 | 6 + part_d_1_15 | b | 17 | 105 | 9 | 9 (6 rows) -- Common table needed for multiple test scenarios. @@ -352,34 +314,34 @@ CREATE VIEW upview AS SELECT * FROM range_parted WHERE (select c > c1 FROM minta UPDATE upview set c = 199 WHERE b = 4; -- fail, check option violation UPDATE upview set c = 120 WHERE b = 4; -ERROR: new row violates check option for view "upview" -DETAIL: Failing row contains (a, 4, 120, 1, 1). -- fail, row movement with check option violation UPDATE upview set a = 'b', b = 15, c = 120 WHERE b = 4; -ERROR: new row violates check option for view "upview" -DETAIL: Failing row contains (b, 15, 120, 1, 1). +ERROR: could not plan this distributed update +DETAIL: correlated UPDATE or updating distribution column currently not supported in Postgres-XL. -- ok, row movement, check option passes UPDATE upview set a = 'b', b = 15 WHERE b = 4; +ERROR: could not plan this distributed update +DETAIL: correlated UPDATE or updating distribution column currently not supported in Postgres-XL. 
:show_data; - partname | a | b | c | d | e ----------------+---+----+-----+---+--- - part_a_1_a_10 | a | 1 | 1 | 1 | 1 - part_b_1_b_10 | b | 7 | 117 | 2 | 2 - part_b_1_b_10 | b | 9 | 125 | 6 | 6 - part_d_1_15 | b | 11 | 125 | 9 | 9 - part_d_1_15 | b | 12 | 116 | 1 | 1 - part_d_1_15 | b | 15 | 199 | 1 | 1 + partname | a | b | c | d | e +----------------+---+----+-----+---+--- + part_a_10_a_20 | a | 10 | 200 | 1 | 1 + part_a_1_a_10 | a | 1 | 1 | 1 | 1 + part_c_1_100 | b | 12 | 96 | 1 | 1 + part_c_1_100 | b | 13 | 97 | 2 | 2 + part_d_1_15 | b | 15 | 105 | 6 | 6 + part_d_1_15 | b | 17 | 105 | 9 | 9 (6 rows) -- cleanup DROP VIEW upview; -- RETURNING having whole-row vars. :init_range_parted; -UPDATE range_parted set c = 95 WHERE a = 'b' and b > 10 and c > 100 returning (range_parted), *; - range_parted | a | b | c | d | e ----------------+---+----+----+----+--- - (b,15,95,16,) | b | 15 | 95 | 16 | - (b,17,95,19,) | b | 17 | 95 | 19 | +UPDATE range_parted set c = 95 WHERE a = 'b' and b > 10 and c < 100 returning (range_parted), *; + range_parted | a | b | c | d | e +--------------+---+----+----+---+--- + (b,12,95,1,) | b | 12 | 95 | 1 | + (b,13,95,2,) | b | 13 | 95 | 2 | (2 rows) :show_data; @@ -387,10 +349,10 @@ UPDATE range_parted set c = 95 WHERE a = 'b' and b > 10 and c > 100 returning (r ----------------+---+----+-----+----+--- part_a_10_a_20 | a | 10 | 200 | 1 | part_a_1_a_10 | a | 1 | 1 | 1 | - part_c_1_100 | b | 12 | 96 | 1 | - part_c_1_100 | b | 13 | 97 | 2 | - part_c_1_100 | b | 15 | 95 | 16 | - part_c_1_100 | b | 17 | 95 | 19 | + part_c_1_100 | b | 12 | 95 | 1 | + part_c_1_100 | b | 13 | 95 | 2 | + part_d_15_20 | b | 15 | 105 | 16 | + part_d_15_20 | b | 17 | 105 | 19 | (6 rows) -- Transition tables with update row movement @@ -408,8 +370,9 @@ $$; CREATE TRIGGER trans_updatetrig AFTER UPDATE ON range_parted REFERENCING OLD TABLE AS old_table NEW TABLE AS new_table FOR EACH STATEMENT EXECUTE PROCEDURE trans_updatetrigfunc(); +ERROR: Postgres-XL does not support TRIGGER yet +DETAIL: The feature is not currently supported UPDATE range_parted set c = (case when c = 96 then 110 else c + 1 end ) WHERE a = 'b' and b > 10 and c >= 96; -NOTICE: trigger = trans_updatetrig, old table = (b,12,96,1,), (b,13,97,2,), (b,15,105,16,), (b,17,105,19,), new table = (b,12,110,1,), (b,13,98,2,), (b,15,106,16,), (b,17,106,19,) :show_data; partname | a | b | c | d | e ----------------+---+----+-----+----+--- @@ -428,11 +391,14 @@ NOTICE: trigger = trans_updatetrig, old table = (b,12,96,1,), (b,13,97,2,), (b, CREATE TRIGGER trans_deletetrig AFTER DELETE ON range_parted REFERENCING OLD TABLE AS old_table FOR EACH STATEMENT EXECUTE PROCEDURE trans_updatetrigfunc(); +ERROR: Postgres-XL does not support TRIGGER yet +DETAIL: The feature is not currently supported CREATE TRIGGER trans_inserttrig AFTER INSERT ON range_parted REFERENCING NEW TABLE AS new_table FOR EACH STATEMENT EXECUTE PROCEDURE trans_updatetrigfunc(); +ERROR: Postgres-XL does not support TRIGGER yet +DETAIL: The feature is not currently supported UPDATE range_parted set c = c + 50 WHERE a = 'b' and b > 10 and c >= 96; -NOTICE: trigger = trans_updatetrig, old table = (b,12,96,1,), (b,13,97,2,), (b,15,105,16,), (b,17,105,19,), new table = (b,12,146,1,), (b,13,147,2,), (b,15,155,16,), (b,17,155,19,) :show_data; partname | a | b | c | d | e ----------------+---+----+-----+----+--- @@ -445,7 +411,9 @@ NOTICE: trigger = trans_updatetrig, old table = (b,12,96,1,), (b,13,97,2,), (b, (6 rows) DROP TRIGGER trans_deletetrig ON range_parted; +ERROR: trigger 
"trans_deletetrig" for table "range_parted" does not exist DROP TRIGGER trans_inserttrig ON range_parted; +ERROR: trigger "trans_inserttrig" for table "range_parted" does not exist -- Don't drop trans_updatetrig yet. It is required below. -- Test with transition tuple conversion happening for rows moved into the -- new partition. This requires a trigger that references transition table @@ -461,36 +429,40 @@ BEGIN END $$ language plpgsql; CREATE TRIGGER trig_c1_100 BEFORE UPDATE OR INSERT ON part_c_1_100 FOR EACH ROW EXECUTE PROCEDURE func_parted_mod_b(); +ERROR: Postgres-XL does not support TRIGGER yet +DETAIL: The feature is not currently supported CREATE TRIGGER trig_d1_15 BEFORE UPDATE OR INSERT ON part_d_1_15 FOR EACH ROW EXECUTE PROCEDURE func_parted_mod_b(); +ERROR: Postgres-XL does not support TRIGGER yet +DETAIL: The feature is not currently supported CREATE TRIGGER trig_d15_20 BEFORE UPDATE OR INSERT ON part_d_15_20 FOR EACH ROW EXECUTE PROCEDURE func_parted_mod_b(); +ERROR: Postgres-XL does not support TRIGGER yet +DETAIL: The feature is not currently supported :init_range_parted; UPDATE range_parted set c = (case when c = 96 then 110 else c + 1 end) WHERE a = 'b' and b > 10 and c >= 96; -NOTICE: trigger = trans_updatetrig, old table = (b,13,96,1,), (b,14,97,2,), (b,16,105,16,), (b,18,105,19,), new table = (b,15,110,1,), (b,15,98,2,), (b,17,106,16,), (b,19,106,19,) :show_data; partname | a | b | c | d | e ----------------+---+----+-----+----+--- part_a_10_a_20 | a | 10 | 200 | 1 | part_a_1_a_10 | a | 1 | 1 | 1 | - part_c_1_100 | b | 15 | 98 | 2 | - part_d_15_20 | b | 17 | 106 | 16 | - part_d_15_20 | b | 19 | 106 | 19 | - part_d_1_15 | b | 15 | 110 | 1 | + part_c_1_100 | b | 13 | 98 | 2 | + part_d_15_20 | b | 15 | 106 | 16 | + part_d_15_20 | b | 17 | 106 | 19 | + part_d_1_15 | b | 12 | 110 | 1 | (6 rows) :init_range_parted; UPDATE range_parted set c = c + 50 WHERE a = 'b' and b > 10 and c >= 96; -NOTICE: trigger = trans_updatetrig, old table = (b,13,96,1,), (b,14,97,2,), (b,16,105,16,), (b,18,105,19,), new table = (b,15,146,1,), (b,16,147,2,), (b,17,155,16,), (b,19,155,19,) :show_data; partname | a | b | c | d | e ----------------+---+----+-----+----+--- part_a_10_a_20 | a | 10 | 200 | 1 | part_a_1_a_10 | a | 1 | 1 | 1 | - part_d_15_20 | b | 17 | 155 | 16 | - part_d_15_20 | b | 19 | 155 | 19 | - part_d_1_15 | b | 15 | 146 | 1 | - part_d_1_15 | b | 16 | 147 | 2 | + part_d_15_20 | b | 15 | 155 | 16 | + part_d_15_20 | b | 17 | 155 | 19 | + part_d_1_15 | b | 12 | 146 | 1 | + part_d_1_15 | b | 13 | 147 | 2 | (6 rows) -- Case where per-partition tuple conversion map array is allocated, but the @@ -498,22 +470,25 @@ NOTICE: trigger = trans_updatetrig, old table = (b,13,96,1,), (b,14,97,2,), (b, -- matching table attributes of the partition and the target table. 
:init_range_parted; UPDATE range_parted set b = 15 WHERE b = 1; -NOTICE: trigger = trans_updatetrig, old table = (a,1,1,1,), new table = (a,15,1,1,) :show_data; partname | a | b | c | d | e ----------------+---+----+-----+----+--- part_a_10_a_20 | a | 10 | 200 | 1 | part_a_10_a_20 | a | 15 | 1 | 1 | - part_c_1_100 | b | 13 | 96 | 1 | - part_c_1_100 | b | 14 | 97 | 2 | - part_d_15_20 | b | 16 | 105 | 16 | - part_d_15_20 | b | 18 | 105 | 19 | + part_c_1_100 | b | 12 | 96 | 1 | + part_c_1_100 | b | 13 | 97 | 2 | + part_d_15_20 | b | 15 | 105 | 16 | + part_d_15_20 | b | 17 | 105 | 19 | (6 rows) DROP TRIGGER trans_updatetrig ON range_parted; +ERROR: trigger "trans_updatetrig" for table "range_parted" does not exist DROP TRIGGER trig_c1_100 ON part_c_1_100; +ERROR: trigger "trig_c1_100" for table "part_c_1_100" does not exist DROP TRIGGER trig_d1_15 ON part_d_1_15; +ERROR: trigger "trig_d1_15" for table "part_d_1_15" does not exist DROP TRIGGER trig_d15_20 ON part_d_15_20; +ERROR: trigger "trig_d15_20" for table "part_d_15_20" does not exist DROP FUNCTION func_parted_mod_b(); -- RLS policies with update-row-movement ----------------------------------------- @@ -527,7 +502,7 @@ SET SESSION AUTHORIZATION regress_range_parted_user; -- This should fail with RLS violation error while moving row from -- part_a_10_a_20 to part_d_1_15, because we are setting 'c' to an odd number. UPDATE range_parted set a = 'b', c = 151 WHERE a = 'a' and c = 200; -ERROR: new row violates row-level security policy for table "range_parted" +ERROR: Distributed column or partition column "a" can't be updated in current version RESET SESSION AUTHORIZATION; -- Create a trigger on part_d_1_15 CREATE FUNCTION func_d_1_15() RETURNS trigger AS $$ @@ -537,12 +512,15 @@ BEGIN END $$ LANGUAGE plpgsql; CREATE TRIGGER trig_d_1_15 BEFORE INSERT ON part_d_1_15 FOR EACH ROW EXECUTE PROCEDURE func_d_1_15(); +ERROR: Postgres-XL does not support TRIGGER yet +DETAIL: The feature is not currently supported :init_range_parted; SET SESSION AUTHORIZATION regress_range_parted_user; -- Here, RLS checks should succeed while moving row from part_a_10_a_20 to -- part_d_1_15. Even though the UPDATE is setting 'c' to an odd number, the -- trigger at the destination partition again makes it an even number. UPDATE range_parted set a = 'b', c = 151 WHERE a = 'a' and c = 200; +ERROR: Distributed column or partition column "a" can't be updated in current version RESET SESSION AUTHORIZATION; :init_range_parted; SET SESSION AUTHORIZATION regress_range_parted_user; @@ -550,10 +528,11 @@ SET SESSION AUTHORIZATION regress_range_parted_user; -- 'c' to an even number, the trigger at the destination partition again makes -- it an odd number. 
UPDATE range_parted set a = 'b', c = 150 WHERE a = 'a' and c = 200; -ERROR: new row violates row-level security policy for table "range_parted" +ERROR: Distributed column or partition column "a" can't be updated in current version -- Cleanup RESET SESSION AUTHORIZATION; DROP TRIGGER trig_d_1_15 ON part_d_1_15; +ERROR: trigger "trig_d_1_15" for table "part_d_1_15" does not exist DROP FUNCTION func_d_1_15(); -- Policy expression contains SubPlan RESET SESSION AUTHORIZATION; @@ -564,9 +543,10 @@ CREATE POLICY policy_range_parted_subplan on range_parted SET SESSION AUTHORIZATION regress_range_parted_user; -- fail, mintab has row with c1 = 120 UPDATE range_parted set a = 'b', c = 122 WHERE a = 'a' and c = 200; -ERROR: new row violates row-level security policy "policy_range_parted_subplan" for table "range_parted" +ERROR: Distributed column or partition column "a" can't be updated in current version -- ok UPDATE range_parted set a = 'b', c = 120 WHERE a = 'a' and c = 200; +ERROR: Distributed column or partition column "a" can't be updated in current version -- RLS policy expression contains whole row. RESET SESSION AUTHORIZATION; :init_range_parted; @@ -575,12 +555,13 @@ CREATE POLICY policy_range_parted_wholerow on range_parted AS RESTRICTIVE for UP SET SESSION AUTHORIZATION regress_range_parted_user; -- ok, should pass the RLS check UPDATE range_parted set a = 'b', c = 112 WHERE a = 'a' and c = 200; +ERROR: Distributed column or partition column "a" can't be updated in current version RESET SESSION AUTHORIZATION; :init_range_parted; SET SESSION AUTHORIZATION regress_range_parted_user; -- fail, the whole row RLS check should fail UPDATE range_parted set a = 'b', c = 116 WHERE a = 'a' and c = 200; -ERROR: new row violates row-level security policy "policy_range_parted_wholerow" for table "range_parted" +ERROR: Distributed column or partition column "a" can't be updated in current version -- Cleanup RESET SESSION AUTHORIZATION; DROP POLICY policy_range_parted ON range_parted; @@ -603,35 +584,58 @@ $$; -- Triggers on root partition CREATE TRIGGER parent_delete_trig AFTER DELETE ON range_parted for each statement execute procedure trigfunc(); +ERROR: Postgres-XL does not support TRIGGER yet +DETAIL: The feature is not currently supported CREATE TRIGGER parent_update_trig AFTER UPDATE ON range_parted for each statement execute procedure trigfunc(); +ERROR: Postgres-XL does not support TRIGGER yet +DETAIL: The feature is not currently supported CREATE TRIGGER parent_insert_trig AFTER INSERT ON range_parted for each statement execute procedure trigfunc(); +ERROR: Postgres-XL does not support TRIGGER yet +DETAIL: The feature is not currently supported -- Triggers on leaf partition part_c_1_100 CREATE TRIGGER c1_delete_trig AFTER DELETE ON part_c_1_100 for each statement execute procedure trigfunc(); +ERROR: Postgres-XL does not support TRIGGER yet +DETAIL: The feature is not currently supported CREATE TRIGGER c1_update_trig AFTER UPDATE ON part_c_1_100 for each statement execute procedure trigfunc(); +ERROR: Postgres-XL does not support TRIGGER yet +DETAIL: The feature is not currently supported CREATE TRIGGER c1_insert_trig AFTER INSERT ON part_c_1_100 for each statement execute procedure trigfunc(); +ERROR: Postgres-XL does not support TRIGGER yet +DETAIL: The feature is not currently supported -- Triggers on leaf partition part_d_1_15 CREATE TRIGGER d1_delete_trig AFTER DELETE ON part_d_1_15 for each statement execute procedure trigfunc(); +ERROR: Postgres-XL does not support TRIGGER yet +DETAIL: 
The feature is not currently supported CREATE TRIGGER d1_update_trig AFTER UPDATE ON part_d_1_15 for each statement execute procedure trigfunc(); +ERROR: Postgres-XL does not support TRIGGER yet +DETAIL: The feature is not currently supported CREATE TRIGGER d1_insert_trig AFTER INSERT ON part_d_1_15 for each statement execute procedure trigfunc(); +ERROR: Postgres-XL does not support TRIGGER yet +DETAIL: The feature is not currently supported -- Triggers on leaf partition part_d_15_20 CREATE TRIGGER d15_delete_trig AFTER DELETE ON part_d_15_20 for each statement execute procedure trigfunc(); +ERROR: Postgres-XL does not support TRIGGER yet +DETAIL: The feature is not currently supported CREATE TRIGGER d15_update_trig AFTER UPDATE ON part_d_15_20 for each statement execute procedure trigfunc(); +ERROR: Postgres-XL does not support TRIGGER yet +DETAIL: The feature is not currently supported CREATE TRIGGER d15_insert_trig AFTER INSERT ON part_d_15_20 for each statement execute procedure trigfunc(); +ERROR: Postgres-XL does not support TRIGGER yet +DETAIL: The feature is not currently supported -- Move all rows from part_c_100_200 to part_c_1_100. None of the delete or -- insert statement triggers should be fired. UPDATE range_parted set c = c - 50 WHERE c > 97; -NOTICE: trigger = parent_update_trig fired on table range_parted during UPDATE :show_data; partname | a | b | c | d | e ----------------+---+----+-----+----+--- @@ -644,17 +648,29 @@ NOTICE: trigger = parent_update_trig fired on table range_parted during UPDATE (6 rows) DROP TRIGGER parent_delete_trig ON range_parted; +ERROR: trigger "parent_delete_trig" for table "range_parted" does not exist DROP TRIGGER parent_update_trig ON range_parted; +ERROR: trigger "parent_update_trig" for table "range_parted" does not exist DROP TRIGGER parent_insert_trig ON range_parted; +ERROR: trigger "parent_insert_trig" for table "range_parted" does not exist DROP TRIGGER c1_delete_trig ON part_c_1_100; +ERROR: trigger "c1_delete_trig" for table "part_c_1_100" does not exist DROP TRIGGER c1_update_trig ON part_c_1_100; +ERROR: trigger "c1_update_trig" for table "part_c_1_100" does not exist DROP TRIGGER c1_insert_trig ON part_c_1_100; +ERROR: trigger "c1_insert_trig" for table "part_c_1_100" does not exist DROP TRIGGER d1_delete_trig ON part_d_1_15; +ERROR: trigger "d1_delete_trig" for table "part_d_1_15" does not exist DROP TRIGGER d1_update_trig ON part_d_1_15; +ERROR: trigger "d1_update_trig" for table "part_d_1_15" does not exist DROP TRIGGER d1_insert_trig ON part_d_1_15; +ERROR: trigger "d1_insert_trig" for table "part_d_1_15" does not exist DROP TRIGGER d15_delete_trig ON part_d_15_20; +ERROR: trigger "d15_delete_trig" for table "part_d_15_20" does not exist DROP TRIGGER d15_update_trig ON part_d_15_20; +ERROR: trigger "d15_update_trig" for table "part_d_15_20" does not exist DROP TRIGGER d15_insert_trig ON part_d_15_20; +ERROR: trigger "d15_insert_trig" for table "part_d_15_20" does not exist -- Creating default partition for range :init_range_parted; create table part_def partition of range_parted default; @@ -669,14 +685,16 @@ create table part_def partition of range_parted default; e | character varying | | | | extended | | Partition of: range_parted DEFAULT Partition constraint: (NOT ((a IS NOT NULL) AND (b IS NOT NULL) AND (((a = 'a'::text) AND (b >= '1'::bigint) AND (b < '10'::bigint)) OR ((a = 'a'::text) AND (b >= '10'::bigint) AND (b < '20'::bigint)) OR ((a = 'b'::text) AND (b >= '1'::bigint) AND (b < '10'::bigint)) OR ((a = 'b'::text) 
AND (b >= '10'::bigint) AND (b < '20'::bigint)) OR ((a = 'b'::text) AND (b >= '20'::bigint) AND (b < '30'::bigint))))) - +Distribute By: HASH(a) +Location Nodes: ALL DATANODES + insert into range_parted values ('c', 9); -- ok update part_def set a = 'd' where a = 'c'; +ERROR: Distributed column or partition column "a" can't be updated in current version -- fail update part_def set a = 'a' where a = 'd'; -ERROR: new row for relation "part_def" violates partition constraint -DETAIL: Failing row contains (a, 9, null, null, null). +ERROR: Distributed column or partition column "a" can't be updated in current version :show_data; partname | a | b | c | d | e ----------------+---+----+-----+----+--- @@ -686,33 +704,36 @@ DETAIL: Failing row contains (a, 9, null, null, null). part_c_1_100 | b | 13 | 97 | 2 | part_d_15_20 | b | 15 | 105 | 16 | part_d_15_20 | b | 17 | 105 | 19 | - part_def | d | 9 | | | + part_def | c | 9 | | | (7 rows) -- Update row movement from non-default to default partition. -- fail, default partition is not under part_a_10_a_20; UPDATE part_a_10_a_20 set a = 'ad' WHERE a = 'a'; -ERROR: new row for relation "part_a_10_a_20" violates partition constraint -DETAIL: Failing row contains (ad, 10, 200, 1, null). +ERROR: Distributed column or partition column "a" can't be updated in current version -- ok UPDATE range_parted set a = 'ad' WHERE a = 'a'; +ERROR: Distributed column or partition column "a" can't be updated in current version UPDATE range_parted set a = 'bd' WHERE a = 'b'; +ERROR: Distributed column or partition column "a" can't be updated in current version :show_data; - partname | a | b | c | d | e -----------+----+----+-----+----+--- - part_def | ad | 1 | 1 | 1 | - part_def | ad | 10 | 200 | 1 | - part_def | bd | 12 | 96 | 1 | - part_def | bd | 13 | 97 | 2 | - part_def | bd | 15 | 105 | 16 | - part_def | bd | 17 | 105 | 19 | - part_def | d | 9 | | | + partname | a | b | c | d | e +----------------+---+----+-----+----+--- + part_a_10_a_20 | a | 10 | 200 | 1 | + part_a_1_a_10 | a | 1 | 1 | 1 | + part_c_1_100 | b | 12 | 96 | 1 | + part_c_1_100 | b | 13 | 97 | 2 | + part_d_15_20 | b | 15 | 105 | 16 | + part_d_15_20 | b | 17 | 105 | 19 | + part_def | c | 9 | | | (7 rows) -- Update row movement from default to non-default partitions. -- ok UPDATE range_parted set a = 'a' WHERE a = 'ad'; +ERROR: Distributed column or partition column "a" can't be updated in current version UPDATE range_parted set a = 'b' WHERE a = 'bd'; +ERROR: Distributed column or partition column "a" can't be updated in current version :show_data; partname | a | b | c | d | e ----------------+---+----+-----+----+--- @@ -722,7 +743,7 @@ UPDATE range_parted set a = 'b' WHERE a = 'bd'; part_c_1_100 | b | 13 | 97 | 2 | part_d_15_20 | b | 15 | 105 | 16 | part_d_15_20 | b | 17 | 105 | 19 | - part_def | d | 9 | | | + part_def | c | 9 | | | (7 rows) -- Cleanup: range_parted no longer needed. @@ -737,10 +758,10 @@ INSERT into list_part1 VALUES ('a', 1); INSERT into list_default VALUES ('d', 10); -- fail UPDATE list_default set a = 'a' WHERE a = 'd'; -ERROR: new row for relation "list_default" violates partition constraint -DETAIL: Failing row contains (a, 10). +ERROR: Distributed column or partition column "a" can't be updated in current version -- ok UPDATE list_default set a = 'x' WHERE a = 'd'; +ERROR: Distributed column or partition column "a" can't be updated in current version DROP TABLE list_parted; -------------- -- Some more update-partition-key test scenarios below. 
This time use list @@ -751,19 +772,28 @@ CREATE TABLE list_parted (a numeric, b int, c int8) PARTITION BY list (a); CREATE TABLE sub_parted PARTITION OF list_parted for VALUES in (1) PARTITION BY list (b); CREATE TABLE sub_part1(b int, c int8, a numeric); ALTER TABLE sub_parted ATTACH PARTITION sub_part1 for VALUES in (1); +ERROR: table "sub_part1" contains column "a" at position 3, but parent "sub_parted" has it at position 1 +DETAIL: Postgres-XL requires attribute positions to match +HINT: Check for column ordering and dropped columns, if any CREATE TABLE sub_part2(b int, c int8, a numeric); ALTER TABLE sub_parted ATTACH PARTITION sub_part2 for VALUES in (2); +ERROR: table "sub_part2" contains column "a" at position 3, but parent "sub_parted" has it at position 1 +DETAIL: Postgres-XL requires attribute positions to match +HINT: Check for column ordering and dropped columns, if any CREATE TABLE list_part1(a numeric, b int, c int8); ALTER TABLE list_parted ATTACH PARTITION list_part1 for VALUES in (2,3); INSERT into list_parted VALUES (2,5,50); INSERT into list_parted VALUES (3,6,60); INSERT into sub_parted VALUES (1,1,60); +ERROR: no partition of relation "sub_parted" found for row +DETAIL: Partition key of the failing row contains (b) = (1). INSERT into sub_parted VALUES (1,2,10); +ERROR: no partition of relation "sub_parted" found for row +DETAIL: Partition key of the failing row contains (b) = (2). -- Test partition constraint violation when intermediate ancestor is used and -- constraint is inherited from upper root. UPDATE sub_parted set a = 2 WHERE c = 10; -ERROR: new row for relation "sub_part2" violates partition constraint -DETAIL: Failing row contains (2, 10, 2). +ERROR: Distributed column or partition column "a" can't be updated in current version -- Test update-partition-key, where the unpruned partitions do not have their -- partition keys updated. SELECT tableoid::regclass::text, * FROM list_parted WHERE a = 2 ORDER BY 1; @@ -787,14 +817,14 @@ BEGIN END $$ LANGUAGE plpgsql; CREATE TRIGGER parted_mod_b before update on sub_part1 for each row execute procedure func_parted_mod_b(); +ERROR: Postgres-XL does not support TRIGGER yet +DETAIL: The feature is not currently supported SELECT tableoid::regclass::text, * FROM list_parted ORDER BY 1, 2, 3, 4; tableoid | a | b | c ------------+---+----+---- list_part1 | 2 | 52 | 50 list_part1 | 3 | 6 | 60 - sub_part1 | 1 | 1 | 60 - sub_part2 | 1 | 2 | 10 -(4 rows) +(2 rows) -- This should do the tuple routing even though there is no explicit -- partition-key update, because there is a trigger on sub_part1. @@ -804,11 +834,10 @@ SELECT tableoid::regclass::text, * FROM list_parted ORDER BY 1, 2, 3, 4; ------------+---+----+---- list_part1 | 2 | 52 | 50 list_part1 | 3 | 6 | 60 - sub_part2 | 1 | 2 | 10 - sub_part2 | 1 | 2 | 70 -(4 rows) +(2 rows) DROP TRIGGER parted_mod_b ON sub_part1; +ERROR: trigger "parted_mod_b" for table "sub_part1" does not exist -- If BR DELETE trigger prevented DELETE from happening, we should also skip -- the INSERT if that delete is part of UPDATE=>DELETE+INSERT. 
CREATE OR REPLACE FUNCTION func_parted_mod_b() returns trigger as $$ @@ -818,28 +847,26 @@ BEGIN END $$ LANGUAGE plpgsql; CREATE TRIGGER trig_skip_delete before delete on sub_part2 for each row execute procedure func_parted_mod_b(); +ERROR: Postgres-XL does not support TRIGGER yet +DETAIL: The feature is not currently supported UPDATE list_parted set b = 1 WHERE c = 70; -NOTICE: Trigger: Got OLD row (2,70,1), but returning NULL SELECT tableoid::regclass::text, * FROM list_parted ORDER BY 1, 2, 3, 4; tableoid | a | b | c ------------+---+----+---- list_part1 | 2 | 52 | 50 list_part1 | 3 | 6 | 60 - sub_part2 | 1 | 2 | 10 - sub_part2 | 1 | 2 | 70 -(4 rows) +(2 rows) -- Drop the trigger. Now the row should be moved. DROP TRIGGER trig_skip_delete ON sub_part2; +ERROR: trigger "trig_skip_delete" for table "sub_part2" does not exist UPDATE list_parted set b = 1 WHERE c = 70; SELECT tableoid::regclass::text, * FROM list_parted ORDER BY 1, 2, 3, 4; tableoid | a | b | c ------------+---+----+---- list_part1 | 2 | 52 | 50 list_part1 | 3 | 6 | 60 - sub_part1 | 1 | 1 | 70 - sub_part2 | 1 | 2 | 10 -(4 rows) +(2 rows) DROP FUNCTION func_parted_mod_b(); -- UPDATE partition-key with FROM clause. If join produces multiple output @@ -848,14 +875,13 @@ DROP FUNCTION func_parted_mod_b(); CREATE TABLE non_parted (id int); INSERT into non_parted VALUES (1), (1), (1), (2), (2), (2), (3), (3), (3); UPDATE list_parted t1 set a = 2 FROM non_parted t2 WHERE t1.a = t2.id and a = 1; +ERROR: Distributed column or partition column "a" can't be updated in current version SELECT tableoid::regclass::text, * FROM list_parted ORDER BY 1, 2, 3, 4; tableoid | a | b | c ------------+---+----+---- - list_part1 | 2 | 1 | 70 - list_part1 | 2 | 2 | 10 list_part1 | 2 | 52 | 50 list_part1 | 3 | 6 | 60 -(4 rows) +(2 rows) DROP TABLE non_parted; -- Cleanup: list_parted no longer needed. @@ -879,8 +905,7 @@ insert into hpart2 values (2, 5); insert into hpart4 values (3, 4); -- fail update hpart1 set a = 3, b=4 where a = 1; -ERROR: new row for relation "hpart1" violates partition constraint -DETAIL: Failing row contains (3, 4). +ERROR: Distributed column or partition column "a" can't be updated in current version -- ok, row movement update hash_parted set b = b - 1 where b = 1; -- ok diff --git a/src/test/regress/input/tablespace.source b/src/test/regress/input/tablespace.source index d46f0e4c..49ba6200 100644 --- a/src/test/regress/input/tablespace.source +++ b/src/test/regress/input/tablespace.source @@ -87,7 +87,7 @@ SELECT relname, spcname FROM pg_catalog.pg_tablespace t, pg_catalog.pg_class c where c.reltablespace = t.oid AND c.relname LIKE 'part%_idx'; \d testschema.part_a_idx --- partitioned rels cannot specify the default tablespace. These fail: +-- partitioned rels cannot specify the primary key. 
These fail: CREATE TABLE testschema.dflt (a int PRIMARY KEY) PARTITION BY LIST (a) TABLESPACE pg_default; CREATE TABLE testschema.dflt (a int PRIMARY KEY USING INDEX TABLESPACE pg_default) PARTITION BY LIST (a); SET default_tablespace TO 'pg_default'; @@ -109,23 +109,17 @@ ALTER TABLE testschema.test_default_tab ADD CONSTRAINT test_index4 UNIQUE (id) U \d testschema.test_index1 \d testschema.test_index2 -\d testschema.test_index3 -\d testschema.test_index4 -- use a custom tablespace for default_tablespace SET default_tablespace TO regress_tblspace; -- tablespace should not change if no rewrite ALTER TABLE testschema.test_default_tab ALTER id TYPE bigint; \d testschema.test_index1 \d testschema.test_index2 -\d testschema.test_index3 -\d testschema.test_index4 SELECT * FROM testschema.test_default_tab; -- tablespace should not change even if there is an index rewrite ALTER TABLE testschema.test_default_tab ALTER id TYPE int; \d testschema.test_index1 \d testschema.test_index2 -\d testschema.test_index3 -\d testschema.test_index4 SELECT * FROM testschema.test_default_tab; -- now use the default tablespace for default_tablespace SET default_tablespace TO ''; @@ -133,14 +127,10 @@ SET default_tablespace TO ''; ALTER TABLE testschema.test_default_tab ALTER id TYPE int; \d testschema.test_index1 \d testschema.test_index2 -\d testschema.test_index3 -\d testschema.test_index4 -- tablespace should not change even if there is an index rewrite ALTER TABLE testschema.test_default_tab ALTER id TYPE bigint; \d testschema.test_index1 \d testschema.test_index2 -\d testschema.test_index3 -\d testschema.test_index4 DROP TABLE testschema.test_default_tab; -- check that default_tablespace doesn't affect ALTER TABLE index rebuilds @@ -157,23 +147,12 @@ ALTER TABLE testschema.test_default_tab_p ADD CONSTRAINT test_index4 UNIQUE (id) \d testschema.test_index1 \d testschema.test_index2 -\d testschema.test_index3 -\d testschema.test_index4 -- use a custom tablespace for default_tablespace SET default_tablespace TO regress_tblspace; --- tablespace should not change if no rewrite -ALTER TABLE testschema.test_default_tab_p ALTER val TYPE bigint; -\d testschema.test_index1 -\d testschema.test_index2 -\d testschema.test_index3 -\d testschema.test_index4 -SELECT * FROM testschema.test_default_tab_p; -- tablespace should not change even if there is an index rewrite ALTER TABLE testschema.test_default_tab_p ALTER val TYPE int; \d testschema.test_index1 \d testschema.test_index2 -\d testschema.test_index3 -\d testschema.test_index4 SELECT * FROM testschema.test_default_tab_p; -- now use the default tablespace for default_tablespace SET default_tablespace TO ''; @@ -181,14 +160,10 @@ SET default_tablespace TO ''; ALTER TABLE testschema.test_default_tab_p ALTER val TYPE int; \d testschema.test_index1 \d testschema.test_index2 -\d testschema.test_index3 -\d testschema.test_index4 -- tablespace should not change even if there is an index rewrite ALTER TABLE testschema.test_default_tab_p ALTER val TYPE bigint; \d testschema.test_index1 \d testschema.test_index2 -\d testschema.test_index3 -\d testschema.test_index4 DROP TABLE testschema.test_default_tab_p; -- check that default_tablespace affects index additions in ALTER TABLE @@ -214,7 +189,7 @@ CREATE INDEX test_tab_b_idx ON testschema.test_tab (b); \d testschema.test_tab_unique \d testschema.test_tab_a_idx \d testschema.test_tab_b_idx -ALTER TABLE testschema.test_tab ALTER b TYPE bigint, ADD UNIQUE (c); +ALTER TABLE testschema.test_tab ALTER b TYPE bigint; \d 
testschema.test_tab_unique \d testschema.test_tab_a_idx \d testschema.test_tab_b_idx diff --git a/src/test/regress/output/tablespace.source b/src/test/regress/output/tablespace.source index 15c0d3e0..1a5dc4d1 100644 --- a/src/test/regress/output/tablespace.source +++ b/src/test/regress/output/tablespace.source @@ -123,28 +123,43 @@ SELECT relname, spcname FROM pg_catalog.pg_tablespace t, pg_catalog.pg_class c (3 rows) \d testschema.part_a_idx -Partitioned index "testschema.part_a_idx" - Column | Type | Key? | Definition ---------+---------+------+------------ - a | integer | yes | a + Index "testschema.part_a_idx" + Column | Type | Definition +--------+---------+------------ + a | integer | a btree, for table "testschema.part" Tablespace: "regress_tblspace" --- partitioned rels cannot specify the default tablespace. These fail: +-- partitioned rels cannot specify the primary key. These fail: CREATE TABLE testschema.dflt (a int PRIMARY KEY) PARTITION BY LIST (a) TABLESPACE pg_default; -ERROR: cannot specify default tablespace for partitioned relations +ERROR: primary key constraints are not supported on partitioned tables +LINE 1: CREATE TABLE testschema.dflt (a int PRIMARY KEY) PARTITION B... + ^ CREATE TABLE testschema.dflt (a int PRIMARY KEY USING INDEX TABLESPACE pg_default) PARTITION BY LIST (a); -ERROR: cannot specify default tablespace for partitioned relation +ERROR: primary key constraints are not supported on partitioned tables +LINE 1: CREATE TABLE testschema.dflt (a int PRIMARY KEY USING INDEX ... + ^ SET default_tablespace TO 'pg_default'; CREATE TABLE testschema.dflt (a int PRIMARY KEY) PARTITION BY LIST (a) TABLESPACE regress_tblspace; -ERROR: cannot specify default tablespace for partitioned relations +ERROR: primary key constraints are not supported on partitioned tables +LINE 1: CREATE TABLE testschema.dflt (a int PRIMARY KEY) PARTITION B... + ^ CREATE TABLE testschema.dflt (a int PRIMARY KEY USING INDEX TABLESPACE regress_tblspace) PARTITION BY LIST (a); -ERROR: cannot specify default tablespace for partitioned relations +ERROR: primary key constraints are not supported on partitioned tables +LINE 1: CREATE TABLE testschema.dflt (a int PRIMARY KEY USING INDEX ... + ^ -- but these work: CREATE TABLE testschema.dflt (a int PRIMARY KEY USING INDEX TABLESPACE regress_tblspace) PARTITION BY LIST (a) TABLESPACE regress_tblspace; +ERROR: primary key constraints are not supported on partitioned tables +LINE 1: CREATE TABLE testschema.dflt (a int PRIMARY KEY USING INDEX ... + ^ SET default_tablespace TO ''; CREATE TABLE testschema.dflt2 (a int PRIMARY KEY) PARTITION BY LIST (a); +ERROR: primary key constraints are not supported on partitioned tables +LINE 1: CREATE TABLE testschema.dflt2 (a int PRIMARY KEY) PARTITION ... + ^ DROP TABLE testschema.dflt, testschema.dflt2; +ERROR: table "dflt" does not exist -- check that default_tablespace doesn't affect ALTER TABLE index rebuilds CREATE TABLE testschema.test_default_tab(id bigint) TABLESPACE regress_tblspace; INSERT INTO testschema.test_default_tab VALUES (1); @@ -167,21 +182,6 @@ Index "testschema.test_index2" btree, for table "testschema.test_default_tab" Tablespace: "regress_tblspace" -\d testschema.test_index3 - Index "testschema.test_index3" - Column | Type | Key? | Definition ---------+--------+------+------------ - id | bigint | yes | id -primary key, btree, for table "testschema.test_default_tab" - -\d testschema.test_index4 - Index "testschema.test_index4" - Column | Type | Key? 
| Definition ---------+--------+------+------------ - id | bigint | yes | id -unique, btree, for table "testschema.test_default_tab" -Tablespace: "regress_tblspace" - -- use a custom tablespace for default_tablespace SET default_tablespace TO regress_tblspace; -- tablespace should not change if no rewrite @@ -201,21 +201,6 @@ Index "testschema.test_index2" btree, for table "testschema.test_default_tab" Tablespace: "regress_tblspace" -\d testschema.test_index3 - Index "testschema.test_index3" - Column | Type | Key? | Definition ---------+---------+------+------------ - id | integer | yes | id -primary key, btree, for table "testschema.test_default_tab" - -\d testschema.test_index4 - Index "testschema.test_index4" - Column | Type | Key? | Definition ---------+---------+------+------------ - id | integer | yes | id -unique, btree, for table "testschema.test_default_tab" -Tablespace: "regress_tblspace" - SELECT * FROM testschema.test_default_tab; id ---- @@ -239,21 +224,6 @@ Index "testschema.test_index2" btree, for table "testschema.test_default_tab" Tablespace: "regress_tblspace" -\d testschema.test_index3 - Index "testschema.test_index3" - Column | Type | Key? | Definition ---------+--------+------+------------ - id | bigint | yes | id -primary key, btree, for table "testschema.test_default_tab" - -\d testschema.test_index4 - Index "testschema.test_index4" - Column | Type | Key? | Definition ---------+--------+------+------------ - id | bigint | yes | id -unique, btree, for table "testschema.test_default_tab" -Tablespace: "regress_tblspace" - SELECT * FROM testschema.test_default_tab; id ---- @@ -279,21 +249,6 @@ Index "testschema.test_index2" btree, for table "testschema.test_default_tab" Tablespace: "regress_tblspace" -\d testschema.test_index3 - Index "testschema.test_index3" - Column | Type | Key? | Definition ---------+---------+------+------------ - id | integer | yes | id -primary key, btree, for table "testschema.test_default_tab" - -\d testschema.test_index4 - Index "testschema.test_index4" - Column | Type | Key? | Definition ---------+---------+------+------------ - id | integer | yes | id -unique, btree, for table "testschema.test_default_tab" -Tablespace: "regress_tblspace" - -- tablespace should not change even if there is an index rewrite ALTER TABLE testschema.test_default_tab ALTER id TYPE bigint; \d testschema.test_index1 @@ -311,21 +266,6 @@ Index "testschema.test_index2" btree, for table "testschema.test_default_tab" Tablespace: "regress_tblspace" -\d testschema.test_index3 - Index "testschema.test_index3" - Column | Type | Key? | Definition ---------+--------+------+------------ - id | bigint | yes | id -primary key, btree, for table "testschema.test_default_tab" - -\d testschema.test_index4 - Index "testschema.test_index4" - Column | Type | Key? 
| Definition ---------+--------+------+------------ - id | bigint | yes | id -unique, btree, for table "testschema.test_default_tab" -Tablespace: "regress_tblspace" - DROP TABLE testschema.test_default_tab; -- check that default_tablespace doesn't affect ALTER TABLE index rebuilds -- (this time with a partitioned table) @@ -337,109 +277,47 @@ INSERT INTO testschema.test_default_tab_p VALUES (1); CREATE INDEX test_index1 on testschema.test_default_tab_p (val); CREATE INDEX test_index2 on testschema.test_default_tab_p (val) TABLESPACE regress_tblspace; ALTER TABLE testschema.test_default_tab_p ADD CONSTRAINT test_index3 PRIMARY KEY (id); +ERROR: primary key constraints are not supported on partitioned tables +LINE 1: ALTER TABLE testschema.test_default_tab_p ADD CONSTRAINT tes... + ^ ALTER TABLE testschema.test_default_tab_p ADD CONSTRAINT test_index4 UNIQUE (id) USING INDEX TABLESPACE regress_tblspace; +ERROR: unique constraints are not supported on partitioned tables +LINE 1: ALTER TABLE testschema.test_default_tab_p ADD CONSTRAINT tes... + ^ \d testschema.test_index1 -Partitioned index "testschema.test_index1" - Column | Type | Key? | Definition ---------+--------+------+------------ - val | bigint | yes | val +Index "testschema.test_index1" + Column | Type | Definition +--------+--------+------------ + val | bigint | val btree, for table "testschema.test_default_tab_p" \d testschema.test_index2 -Partitioned index "testschema.test_index2" - Column | Type | Key? | Definition ---------+--------+------+------------ - val | bigint | yes | val +Index "testschema.test_index2" + Column | Type | Definition +--------+--------+------------ + val | bigint | val btree, for table "testschema.test_default_tab_p" Tablespace: "regress_tblspace" -\d testschema.test_index3 -Partitioned index "testschema.test_index3" - Column | Type | Key? | Definition ---------+--------+------+------------ - id | bigint | yes | id -primary key, btree, for table "testschema.test_default_tab_p" - -\d testschema.test_index4 -Partitioned index "testschema.test_index4" - Column | Type | Key? | Definition ---------+--------+------+------------ - id | bigint | yes | id -unique, btree, for table "testschema.test_default_tab_p" -Tablespace: "regress_tblspace" - -- use a custom tablespace for default_tablespace SET default_tablespace TO regress_tblspace; --- tablespace should not change if no rewrite -ALTER TABLE testschema.test_default_tab_p ALTER val TYPE bigint; -\d testschema.test_index1 -Partitioned index "testschema.test_index1" - Column | Type | Key? | Definition ---------+--------+------+------------ - val | bigint | yes | val -btree, for table "testschema.test_default_tab_p" - -\d testschema.test_index2 -Partitioned index "testschema.test_index2" - Column | Type | Key? | Definition ---------+--------+------+------------ - val | bigint | yes | val -btree, for table "testschema.test_default_tab_p" -Tablespace: "regress_tblspace" - -\d testschema.test_index3 -Partitioned index "testschema.test_index3" - Column | Type | Key? | Definition ---------+--------+------+------------ - id | bigint | yes | id -primary key, btree, for table "testschema.test_default_tab_p" - -\d testschema.test_index4 -Partitioned index "testschema.test_index4" - Column | Type | Key? 
| Definition ---------+--------+------+------------ - id | bigint | yes | id -unique, btree, for table "testschema.test_default_tab_p" -Tablespace: "regress_tblspace" - -SELECT * FROM testschema.test_default_tab_p; - id | val -----+----- - 1 | -(1 row) - -- tablespace should not change even if there is an index rewrite ALTER TABLE testschema.test_default_tab_p ALTER val TYPE int; \d testschema.test_index1 -Partitioned index "testschema.test_index1" - Column | Type | Key? | Definition ---------+---------+------+------------ - val | integer | yes | val +Index "testschema.test_index1" + Column | Type | Definition +--------+---------+------------ + val | integer | val btree, for table "testschema.test_default_tab_p" \d testschema.test_index2 -Partitioned index "testschema.test_index2" - Column | Type | Key? | Definition ---------+---------+------+------------ - val | integer | yes | val +Index "testschema.test_index2" + Column | Type | Definition +--------+---------+------------ + val | integer | val btree, for table "testschema.test_default_tab_p" Tablespace: "regress_tblspace" -\d testschema.test_index3 -Partitioned index "testschema.test_index3" - Column | Type | Key? | Definition ---------+--------+------+------------ - id | bigint | yes | id -primary key, btree, for table "testschema.test_default_tab_p" - -\d testschema.test_index4 -Partitioned index "testschema.test_index4" - Column | Type | Key? | Definition ---------+--------+------+------------ - id | bigint | yes | id -unique, btree, for table "testschema.test_default_tab_p" -Tablespace: "regress_tblspace" - SELECT * FROM testschema.test_default_tab_p; id | val ----+----- @@ -451,67 +329,37 @@ SET default_tablespace TO ''; -- tablespace should not change if no rewrite ALTER TABLE testschema.test_default_tab_p ALTER val TYPE int; \d testschema.test_index1 -Partitioned index "testschema.test_index1" - Column | Type | Key? | Definition ---------+---------+------+------------ - val | integer | yes | val +Index "testschema.test_index1" + Column | Type | Definition +--------+---------+------------ + val | integer | val btree, for table "testschema.test_default_tab_p" \d testschema.test_index2 -Partitioned index "testschema.test_index2" - Column | Type | Key? | Definition ---------+---------+------+------------ - val | integer | yes | val +Index "testschema.test_index2" + Column | Type | Definition +--------+---------+------------ + val | integer | val btree, for table "testschema.test_default_tab_p" Tablespace: "regress_tblspace" -\d testschema.test_index3 -Partitioned index "testschema.test_index3" - Column | Type | Key? | Definition ---------+--------+------+------------ - id | bigint | yes | id -primary key, btree, for table "testschema.test_default_tab_p" - -\d testschema.test_index4 -Partitioned index "testschema.test_index4" - Column | Type | Key? | Definition ---------+--------+------+------------ - id | bigint | yes | id -unique, btree, for table "testschema.test_default_tab_p" -Tablespace: "regress_tblspace" - -- tablespace should not change even if there is an index rewrite ALTER TABLE testschema.test_default_tab_p ALTER val TYPE bigint; \d testschema.test_index1 -Partitioned index "testschema.test_index1" - Column | Type | Key? 
| Definition ---------+--------+------+------------ - val | bigint | yes | val +Index "testschema.test_index1" + Column | Type | Definition +--------+--------+------------ + val | bigint | val btree, for table "testschema.test_default_tab_p" \d testschema.test_index2 -Partitioned index "testschema.test_index2" - Column | Type | Key? | Definition ---------+--------+------+------------ - val | bigint | yes | val +Index "testschema.test_index2" + Column | Type | Definition +--------+--------+------------ + val | bigint | val btree, for table "testschema.test_default_tab_p" Tablespace: "regress_tblspace" -\d testschema.test_index3 -Partitioned index "testschema.test_index3" - Column | Type | Key? | Definition ---------+--------+------+------------ - id | bigint | yes | id -primary key, btree, for table "testschema.test_default_tab_p" - -\d testschema.test_index4 -Partitioned index "testschema.test_index4" - Column | Type | Key? | Definition ---------+--------+------+------------ - id | bigint | yes | id -unique, btree, for table "testschema.test_default_tab_p" -Tablespace: "regress_tblspace" - DROP TABLE testschema.test_default_tab_p; -- check that default_tablespace affects index additions in ALTER TABLE CREATE TABLE testschema.test_tab(id int) TABLESPACE regress_tblspace; @@ -551,50 +399,50 @@ CREATE INDEX test_tab_a_idx ON testschema.test_tab (a); SET default_tablespace TO ''; CREATE INDEX test_tab_b_idx ON testschema.test_tab (b); \d testschema.test_tab_unique - Index "testschema.test_tab_unique" - Column | Type | Key? | Definition ---------+---------+------+------------ - a | integer | yes | a +Index "testschema.test_tab_unique" + Column | Type | Definition +--------+---------+------------ + a | integer | a unique, btree, for table "testschema.test_tab" Tablespace: "regress_tblspace" \d testschema.test_tab_a_idx - Index "testschema.test_tab_a_idx" - Column | Type | Key? | Definition ---------+---------+------+------------ - a | integer | yes | a +Index "testschema.test_tab_a_idx" + Column | Type | Definition +--------+---------+------------ + a | integer | a btree, for table "testschema.test_tab" Tablespace: "regress_tblspace" \d testschema.test_tab_b_idx - Index "testschema.test_tab_b_idx" - Column | Type | Key? | Definition ---------+---------+------+------------ - b | integer | yes | b +Index "testschema.test_tab_b_idx" + Column | Type | Definition +--------+---------+------------ + b | integer | b btree, for table "testschema.test_tab" -ALTER TABLE testschema.test_tab ALTER b TYPE bigint, ADD UNIQUE (c); +ALTER TABLE testschema.test_tab ALTER b TYPE bigint; \d testschema.test_tab_unique - Index "testschema.test_tab_unique" - Column | Type | Key? | Definition ---------+---------+------+------------ - a | integer | yes | a +Index "testschema.test_tab_unique" + Column | Type | Definition +--------+---------+------------ + a | integer | a unique, btree, for table "testschema.test_tab" Tablespace: "regress_tblspace" \d testschema.test_tab_a_idx - Index "testschema.test_tab_a_idx" - Column | Type | Key? | Definition ---------+---------+------+------------ - a | integer | yes | a +Index "testschema.test_tab_a_idx" + Column | Type | Definition +--------+---------+------------ + a | integer | a btree, for table "testschema.test_tab" Tablespace: "regress_tblspace" \d testschema.test_tab_b_idx - Index "testschema.test_tab_b_idx" - Column | Type | Key? 
| Definition ---------+--------+------+------------ - b | bigint | yes | b +Index "testschema.test_tab_b_idx" + Column | Type | Definition +--------+--------+------------ + b | bigint | b btree, for table "testschema.test_tab" DROP TABLE testschema.test_tab; diff --git a/src/test/regress/sql/alter_table.sql b/src/test/regress/sql/alter_table.sql index b53af1c0..c0e41d0f 100644 --- a/src/test/regress/sql/alter_table.sql +++ b/src/test/regress/sql/alter_table.sql @@ -1332,8 +1332,8 @@ alter table tab1 alter column b type varchar; -- fails create table at_partitioned (a int, b text) partition by range (a); create table at_part_1 partition of at_partitioned for values from (0) to (1000); insert into at_partitioned values (512, '0.123'); -create table at_part_2 (b text, a int); -insert into at_part_2 values ('1.234', 1024); +create table at_part_2 (a int, b text); +insert into at_part_2 values (1024, '1.234'); create index on at_partitioned (b); create index on at_partitioned (a); \d at_part_1 diff --git a/src/test/regress/sql/event_trigger.sql b/src/test/regress/sql/event_trigger.sql index 9c8fa5f6..dc8b017a 100644 --- a/src/test/regress/sql/event_trigger.sql +++ b/src/test/regress/sql/event_trigger.sql @@ -265,7 +265,7 @@ CREATE SCHEMA evttrig -- Partitioned tables with a partitioned index CREATE TABLE evttrig.parted ( - id int PRIMARY KEY) + id int) PARTITION BY RANGE (id); CREATE TABLE evttrig.part_1_10 PARTITION OF evttrig.parted (id) FOR VALUES FROM (1) TO (10); diff --git a/src/test/regress/sql/indexing.sql b/src/test/regress/sql/indexing.sql index 4762e687..130ee7cc 100644 --- a/src/test/regress/sql/indexing.sql +++ b/src/test/regress/sql/indexing.sql @@ -280,10 +280,8 @@ alter index idxpart_a_idx attach partition idxpart2_a_idx; drop table idxpart; -- Verify that attaching indexes maps attribute numbers correctly -create table idxpart (col1 int, a int, col2 int, b int) partition by range (a); -create table idxpart1 (b int, col1 int, col2 int, col3 int, a int); -alter table idxpart drop column col1, drop column col2; -alter table idxpart1 drop column col1, drop column col2, drop column col3; +create table idxpart (a int, b int) partition by range (a); +create table idxpart1 (a int, b int); alter table idxpart attach partition idxpart1 for values from (0) to (1000); create index idxpart_1_idx on only idxpart (b, a); create index idxpart1_1_idx on idxpart1 (b, a); @@ -308,9 +306,9 @@ drop table idxpart; create table idxpart (a int, b int, c text) partition by range (a); create index idxparti on idxpart (a); create index idxparti2 on idxpart (c, b); -create table idxpart1 (c text, a int, b int); +create table idxpart1 (a int, b int, c text); alter table idxpart attach partition idxpart1 for values from (0) to (10); -create table idxpart2 (c text, a int, b int); +create table idxpart2 (a int, b int, c text); create index on idxpart2 (a); create index on idxpart2 (c, b); alter table idxpart attach partition idxpart2 for values from (10) to (20); @@ -321,12 +319,9 @@ select c.relname, pg_get_indexdef(indexrelid) drop table idxpart; -- Verify that columns are mapped correctly in expression indexes -create table idxpart (col1 int, col2 int, a int, b int) partition by range (a); -create table idxpart1 (col2 int, b int, col1 int, a int); -create table idxpart2 (col1 int, col2 int, b int, a int); -alter table idxpart drop column col1, drop column col2; -alter table idxpart1 drop column col1, drop column col2; -alter table idxpart2 drop column col1, drop column col2; +create table idxpart (a int, b 
int) partition by range (a); +create table idxpart1 (a int, b int); +create table idxpart2 (a int, b int); create index on idxpart2 (abs(b)); alter table idxpart attach partition idxpart2 for values from (0) to (1); create index on idxpart (abs(b)); @@ -338,14 +333,11 @@ select c.relname, pg_get_indexdef(indexrelid) drop table idxpart; -- Verify that columns are mapped correctly for WHERE in a partial index -create table idxpart (col1 int, a int, col3 int, b int) partition by range (a); -alter table idxpart drop column col1, drop column col3; -create table idxpart1 (col1 int, col2 int, col3 int, col4 int, b int, a int); -alter table idxpart1 drop column col1, drop column col2, drop column col3, drop column col4; +create table idxpart (a int, b int) partition by range (a); +create table idxpart1 (a int, b int); alter table idxpart attach partition idxpart1 for values from (0) to (1000); -create table idxpart2 (col1 int, col2 int, b int, a int); +create table idxpart2 (a int, b int); create index on idxpart2 (a) where b > 1000; -alter table idxpart2 drop column col1, drop column col2; alter table idxpart attach partition idxpart2 for values from (1000) to (2000); create index on idxpart (a) where b > 1000; select c.relname, pg_get_indexdef(indexrelid) @@ -355,7 +347,7 @@ select c.relname, pg_get_indexdef(indexrelid) drop table idxpart; -- Column number mapping: dropped columns in the partition -create table idxpart1 (drop_1 int, drop_2 int, col_keep int, drop_3 int); +create table idxpart1 (col_keep int, drop_1 int, drop_2 int, drop_3 int); alter table idxpart1 drop column drop_1; alter table idxpart1 drop column drop_2; alter table idxpart1 drop column drop_3; @@ -371,7 +363,7 @@ select attrelid::regclass, attname, attnum from pg_attribute drop table idxpart; -- Column number mapping: dropped columns in the parent table -create table idxpart(drop_1 int, drop_2 int, col_keep int, drop_3 int) partition by range (col_keep); +create table idxpart(col_keep int, drop_1 int, drop_2 int, drop_3 int) partition by range (col_keep); alter table idxpart drop column drop_1; alter table idxpart drop column drop_2; alter table idxpart drop column drop_3; diff --git a/src/test/regress/sql/inherit.sql b/src/test/regress/sql/inherit.sql index e58bfd36..5e8a2215 100644 --- a/src/test/regress/sql/inherit.sql +++ b/src/test/regress/sql/inherit.sql @@ -699,7 +699,7 @@ explain (costs off) select * from list_parted; explain (costs off) select * from list_parted where a is null; explain (costs off) select * from list_parted where a is not null; explain (costs off) select * from list_parted where a in ('ab', 'cd', 'ef'); -explain (costs off) select * from list_parted where a = 'ab' or a in (null, 'cd'); +explain (costs off) select * from list_parted where a = 'ab' or a is null or a ='cd'; explain (costs off) select * from list_parted where a = 'ab'; create table range_list_parted ( diff --git a/src/test/regress/sql/insert.sql b/src/test/regress/sql/insert.sql index 9a561519..70bfcbe8 100644 --- a/src/test/regress/sql/insert.sql +++ b/src/test/regress/sql/insert.sql @@ -168,7 +168,7 @@ insert into part_default_p2 values ('de', 35); insert into list_parted values ('ab', 21); insert into list_parted values ('xx', 1); insert into list_parted values ('yy', 2); -select tableoid::regclass, * from list_parted; +select tableoid::regclass, * from list_parted order by 1,2,3; -- Check tuple routing for partitioned tables @@ -253,7 +253,7 @@ insert into hpart3 values(11); -- view data select tableoid::regclass as part, a, a%4 as 
"remainder = a % 4" -from hash_parted order by part; +from hash_parted order by part,a; -- test \d+ output on a table which has both partitioned and unpartitioned -- partitions @@ -375,7 +375,7 @@ insert into mlparted_def2 values (34, 50); create table mlparted_defd partition of mlparted_def default; insert into mlparted values (70, 100); -select tableoid::regclass, * from mlparted_def; +select tableoid::regclass, * from mlparted_def order by 1; -- check that message shown after failure to find a partition shows the -- appropriate key description (or none) in various situations diff --git a/src/test/regress/sql/partition_info.sql b/src/test/regress/sql/partition_info.sql index afa16c07..bd62ec81 100644 --- a/src/test/regress/sql/partition_info.sql +++ b/src/test/regress/sql/partition_info.sql @@ -46,7 +46,6 @@ ALTER INDEX ptif_test_index ATTACH PARTITION ptif_test3_index; -- Test pg_partition_root for indexes SELECT pg_partition_root('ptif_test_index'); SELECT pg_partition_root('ptif_test0_index'); -SELECT pg_partition_root('ptif_test01_index'); SELECT pg_partition_root('ptif_test3_index'); -- List all tables members of the tree @@ -72,29 +71,6 @@ SELECT relid, parentrelid, level, isleaf SELECT * FROM pg_partition_ancestors('ptif_test01'); SELECT * FROM pg_partition_ancestors('ptif_test'); --- List all indexes members of the tree -SELECT relid, parentrelid, level, isleaf - FROM pg_partition_tree('ptif_test_index'); --- List indexes from an intermediate level -SELECT relid, parentrelid, level, isleaf - FROM pg_partition_tree('ptif_test0_index') p - JOIN pg_class c ON (p.relid = c.oid); --- List from leaf index -SELECT relid, parentrelid, level, isleaf - FROM pg_partition_tree('ptif_test01_index') p - JOIN pg_class c ON (p.relid = c.oid); --- List from partitioned index with no partitions -SELECT relid, parentrelid, level, isleaf - FROM pg_partition_tree('ptif_test3_index') p - JOIN pg_class c ON (p.relid = c.oid); --- List all members using pg_partition_root with leaf index reference -SELECT relid, parentrelid, level, isleaf - FROM pg_partition_tree(pg_partition_root('ptif_test01_index')) p - JOIN pg_class c ON (p.relid = c.oid); --- List all ancestors of root and leaf indexes -SELECT * FROM pg_partition_ancestors('ptif_test01_index'); -SELECT * FROM pg_partition_ancestors('ptif_test_index'); - DROP TABLE ptif_test; -- A table not part of a partition tree works is not listed. 
diff --git a/src/test/regress/sql/partition_prune.sql b/src/test/regress/sql/partition_prune.sql index 55fda489..d6875fb9 100644 --- a/src/test/regress/sql/partition_prune.sql +++ b/src/test/regress/sql/partition_prune.sql @@ -39,7 +39,7 @@ create table rlp_default_null partition of rlp_default for values in (null); create table rlp1 partition of rlp for values from (minvalue) to (1); create table rlp2 partition of rlp for values from (1) to (10); -create table rlp3 (b varchar, a int) partition by list (b varchar_ops); +create table rlp3 (a int, b varchar) partition by list (b varchar_ops); create table rlp3_default partition of rlp3 default; create table rlp3abcd partition of rlp3 for values in ('ab', 'cd'); create table rlp3efgh partition of rlp3 for values in ('ef', 'gh'); diff --git a/src/test/regress/sql/select_parallel.sql b/src/test/regress/sql/select_parallel.sql index 25ee90a1..fdc504e0 100644 --- a/src/test/regress/sql/select_parallel.sql +++ b/src/test/regress/sql/select_parallel.sql @@ -143,7 +143,9 @@ EXPLAIN (costs off) SELECT xc_node_id != 0 FROM t_worker_identifier; SELECT xc_node_id != 0 FROM t_worker_identifier; -- provoke error in worker +SAVEPOINT settings; select stringu1::int2 from tenk1 where unique1 = 1; +ROLLBACK TO SAVEPOINT settings; -- test interaction with set-returning functions SAVEPOINT settings; diff --git a/src/test/regress/sql/temp.sql b/src/test/regress/sql/temp.sql index efac176f..0ef81583 100644 --- a/src/test/regress/sql/temp.sql +++ b/src/test/regress/sql/temp.sql @@ -172,7 +172,7 @@ select relname from pg_class where relname like 'temp_parted_oncommit_test%'; -- all rows if partitions preserve their data. begin; create temp table temp_parted_oncommit_test (a int) - partition by list (a) on commit delete rows; + partition by list (a) on commit preserve rows; create temp table temp_parted_oncommit_test1 partition of temp_parted_oncommit_test for values in (1) on commit preserve rows; diff --git a/src/test/regress/sql/truncate.sql b/src/test/regress/sql/truncate.sql index afde2f66..681fac5d 100644 --- a/src/test/regress/sql/truncate.sql +++ b/src/test/regress/sql/truncate.sql @@ -263,7 +263,7 @@ CREATE FUNCTION tp_chk_data(OUT pktb regclass, OUT pkval int, OUT fktb regclass, END $$; CREATE TABLE truncprim (a int PRIMARY KEY); -CREATE TABLE truncpart (a int REFERENCES truncprim) +CREATE TABLE truncpart (a int) PARTITION BY RANGE (a); CREATE TABLE truncpart_1 PARTITION OF truncpart FOR VALUES FROM (0) TO (100); CREATE TABLE truncpart_2 PARTITION OF truncpart FOR VALUES FROM (100) TO (200) @@ -271,7 +271,7 @@ CREATE TABLE truncpart_2 PARTITION OF truncpart FOR VALUES FROM (100) TO (200) CREATE TABLE truncpart_2_1 PARTITION OF truncpart_2 FOR VALUES FROM (100) TO (150); CREATE TABLE truncpart_2_d PARTITION OF truncpart_2 DEFAULT; -TRUNCATE TABLE truncprim; -- should fail +TRUNCATE TABLE truncprim; select tp_ins_data(); -- should truncate everything diff --git a/src/test/regress/sql/update.sql b/src/test/regress/sql/update.sql index a4f2f161..c97218b3 100644 --- a/src/test/regress/sql/update.sql +++ b/src/test/regress/sql/update.sql @@ -130,9 +130,9 @@ CREATE TABLE range_parted ( -- Create partitions intentionally in descending bound order, so as to test -- that update-row-movement works with the leaf partitions not in bound order. 
-CREATE TABLE part_b_20_b_30 (e varchar, c numeric, a text, b bigint, d int); +CREATE TABLE part_b_20_b_30 (a text, b bigint, c numeric, d int, e varchar); ALTER TABLE range_parted ATTACH PARTITION part_b_20_b_30 FOR VALUES FROM ('b', 20) TO ('b', 30); -CREATE TABLE part_b_10_b_20 (e varchar, c numeric, a text, b bigint, d int) PARTITION BY RANGE (c); +CREATE TABLE part_b_10_b_20 (a text, b bigint, c numeric, d int, e varchar) PARTITION BY RANGE (c); CREATE TABLE part_b_1_b_10 PARTITION OF range_parted FOR VALUES FROM ('b', 1) TO ('b', 10); ALTER TABLE range_parted ATTACH PARTITION part_b_10_b_20 FOR VALUES FROM ('b', 10) TO ('b', 20); CREATE TABLE part_a_10_a_20 PARTITION OF range_parted FOR VALUES FROM ('a', 10) TO ('a', 20); @@ -145,17 +145,13 @@ UPDATE part_b_10_b_20 set b = b - 6; -- Create some more partitions following the above pattern of descending bound -- order, but let's make the situation a bit more complex by having the -- attribute numbers of the columns vary from their parent partition. -CREATE TABLE part_c_100_200 (e varchar, c numeric, a text, b bigint, d int) PARTITION BY range (abs(d)); -ALTER TABLE part_c_100_200 DROP COLUMN e, DROP COLUMN c, DROP COLUMN a; -ALTER TABLE part_c_100_200 ADD COLUMN c numeric, ADD COLUMN e varchar, ADD COLUMN a text; -ALTER TABLE part_c_100_200 DROP COLUMN b; -ALTER TABLE part_c_100_200 ADD COLUMN b bigint; +CREATE TABLE part_c_100_200 (a text, b bigint, c numeric, d int, e varchar) PARTITION BY range (abs(d)); CREATE TABLE part_d_1_15 PARTITION OF part_c_100_200 FOR VALUES FROM (1) TO (15); CREATE TABLE part_d_15_20 PARTITION OF part_c_100_200 FOR VALUES FROM (15) TO (20); ALTER TABLE part_b_10_b_20 ATTACH PARTITION part_c_100_200 FOR VALUES FROM (100) TO (200); -CREATE TABLE part_c_1_100 (e varchar, d int, c numeric, b bigint, a text); +CREATE TABLE part_c_1_100 (a text, b bigint, c numeric, d int, e varchar); ALTER TABLE part_b_10_b_20 ATTACH PARTITION part_c_1_100 FOR VALUES FROM (1) TO (100); \set init_range_parted 'truncate range_parted; insert into range_parted VALUES (''a'', 1, 1, 1), (''a'', 10, 200, 1), (''b'', 12, 96, 1), (''b'', 13, 97, 2), (''b'', 15, 105, 16), (''b'', 17, 105, 19)' @@ -177,15 +173,6 @@ UPDATE range_parted set d = d - 10 WHERE d > 10; UPDATE range_parted set e = d; -- No row found UPDATE part_c_1_100 set c = c + 20 WHERE c = 98; --- ok, row movement -UPDATE part_b_10_b_20 set c = c + 20 returning c, b, a; -:show_data; - --- fail, row movement happens only within the partition subtree. -UPDATE part_b_10_b_20 set b = b - 6 WHERE c > 116 returning *; --- ok, row movement, with subset of rows moved into different partition. -UPDATE range_parted set b = b - 6 WHERE c > 116 returning a, b + c; - :show_data; -- Common table needed for multiple test scenarios. @@ -210,7 +197,7 @@ DROP VIEW upview; -- RETURNING having whole-row vars. :init_range_parted; -UPDATE range_parted set c = 95 WHERE a = 'b' and b > 10 and c > 100 returning (range_parted), *; +UPDATE range_parted set c = 95 WHERE a = 'b' and b > 10 and c < 100 returning (range_parted), *; :show_data; From da7b6686fdcfccfbf92ac58e1663a9b80a160e73 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Thu, 16 Jul 2020 20:29:14 +0800 Subject: [PATCH 306/578] Postpone generate_gather_paths for topmost scan/join rel. 
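The change below defers generate_gather_paths() for the topmost scan/join
relation: set_rel_pathlist() skips it when the rel is the only baserel,
standard_join_search() and the GEQO merge_clump() skip it for the final
join level, and grouping_planner() generates the Gather / Gather Merge
paths once the final scan/join targetlist is known, re-running
set_cheapest() afterwards.

As a rough illustration of which plan node this concerns (a sketch only:
"big_tab" and its column are placeholders, not objects touched by this
patch, and the shape assumes the planner picks a parallel plan):

    -- hypothetical table, not part of this patch or the regression suite
    EXPLAIN (COSTS OFF) SELECT * FROM big_tab WHERE b < 1000;
    --  Gather
    --    Workers Planned: 2
    --    ->  Parallel Seq Scan on big_tab
    --          Filter: (b < 1000)

The topmost Gather in a plan like this is the path that is now created in
grouping_planner() rather than during scan/join path generation.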
http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- src/backend/optimizer/geqo/geqo_eval.c | 21 ++++++++++++++------- src/backend/optimizer/path/allpaths.c | 24 ++++++++++++++++++------ src/backend/optimizer/plan/planner.c | 9 +++++++++ 3 files changed, 41 insertions(+), 13 deletions(-) diff --git a/src/backend/optimizer/geqo/geqo_eval.c b/src/backend/optimizer/geqo/geqo_eval.c index 108b866c..87be2b76 100644 --- a/src/backend/optimizer/geqo/geqo_eval.c +++ b/src/backend/optimizer/geqo/geqo_eval.c @@ -40,7 +40,7 @@ typedef struct } Clump; static List *merge_clump(PlannerInfo *root, List *clumps, Clump *new_clump, - bool force); + int num_gene, bool force); static bool desirable_join(PlannerInfo *root, RelOptInfo *outer_rel, RelOptInfo *inner_rel); @@ -196,7 +196,7 @@ gimme_tree(PlannerInfo *root, Gene *tour, int num_gene) cur_clump->size = 1; /* Merge it into the clumps list, using only desirable joins */ - clumps = merge_clump(root, clumps, cur_clump, false); + clumps = merge_clump(root, clumps, cur_clump, num_gene, false); } if (list_length(clumps) > 1) @@ -210,7 +210,7 @@ gimme_tree(PlannerInfo *root, Gene *tour, int num_gene) { Clump *clump = (Clump *) lfirst(lc); - fclumps = merge_clump(root, fclumps, clump, true); + fclumps = merge_clump(root, fclumps, clump, num_gene, true); } clumps = fclumps; } @@ -235,8 +235,9 @@ gimme_tree(PlannerInfo *root, Gene *tour, int num_gene) * "desirable" joins. */ static List * -merge_clump(PlannerInfo *root, List *clumps, Clump *new_clump, bool force) -{// #lizard forgives +merge_clump(PlannerInfo *root, List *clumps, Clump *new_clump, int num_gene, + bool force) +{ ListCell *prev; ListCell *lc; @@ -267,7 +268,13 @@ merge_clump(PlannerInfo *root, List *clumps, Clump *new_clump, bool force) /* Create paths for partition-wise joins. */ generate_partition_wise_join_paths(root, joinrel); - /* Create GatherPaths for any useful partial paths for rel */ + /* + * Except for the topmost scan/join rel, consider gathering + * partial paths. We'll do the same for the topmost scan/join + * rel once we know the final targetlist (see + * grouping_planner). + */ + if (old_clump->size + new_clump->size < num_gene) generate_gather_paths(root, joinrel); /* Find and save the cheapest paths for this joinrel */ @@ -286,7 +293,7 @@ merge_clump(PlannerInfo *root, List *clumps, Clump *new_clump, bool force) * others. When no further merge is possible, we'll reinsert * it into the list. */ - return merge_clump(root, clumps, old_clump, force); + return merge_clump(root, clumps, old_clump, num_gene, force); } } prev = lc; diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c index 4326a646..40bd2cf0 100644 --- a/src/backend/optimizer/path/allpaths.c +++ b/src/backend/optimizer/path/allpaths.c @@ -497,13 +497,20 @@ set_rel_pathlist(PlannerInfo *root, RelOptInfo *rel, } /* - * If this is a baserel, consider gathering any partial paths we may have - * created for it. (If we tried to gather inheritance children, we could + * If this is a baserel, we should normally consider gathering any partial + * paths we may have created for it. + * + * However, if this is an inheritance child, skip it. Otherwise, we could * end up with a very large number of gather nodes, each trying to grab - * its own pool of workers, so don't do this for otherrels. Instead, - * we'll consider gathering partial paths for the parent appendrel.) + * its own pool of workers. 
Instead, we'll consider gathering partial + * paths for the parent appendrel. + * + * Also, if this is the topmost scan/join rel (that is, the only baserel), + * we postpone this until the final scan/join targelist is available (see + * grouping_planner). */ - if (rel->reloptkind == RELOPT_BASEREL) + if (rel->reloptkind == RELOPT_BASEREL && + bms_membership(root->all_baserels) != BMS_SINGLETON) generate_gather_paths(root, rel); /* @@ -2730,7 +2737,12 @@ standard_join_search(PlannerInfo *root, int levels_needed, List *initial_rels) /* Create paths for partition-wise joins. */ generate_partition_wise_join_paths(root, rel); - /* Create GatherPaths for any useful partial paths for rel */ + /* + * Except for the topmost scan/join rel, consider gathering + * partial paths. We'll do the same for the topmost scan/join rel + * once we know the final targetlist (see grouping_planner). + */ + if (lev < levels_needed) generate_gather_paths(root, rel); /* Find and save the cheapest paths for this rel */ diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index 55c28ea3..7ee9d475 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -2133,6 +2133,15 @@ grouping_planner(PlannerInfo *root, bool inheritance_update, } /* + * Generate Gather or Gather Merge paths for the topmost scan/join + * relation. Once that's done, we must re-determine which paths are + * cheapest. (The previously-cheapest path might even have been + * pfree'd!) + */ + generate_gather_paths(root, current_rel); + set_cheapest(current_rel); + + /* * Forcibly apply SRF-free scan/join target to all the Paths for the * scan/join rel. * From a5465dd281c813cbbefa5f4ded13490339e1d6d7 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Fri, 17 Jul 2020 08:57:36 +0800 Subject: [PATCH 307/578] fix regress error related partition join . http://tapd.oa.com/TBase_Oracle_Migration/prong/stories/view/1020421696858794233 --- .../regress/expected/partition_join_1.out | 1838 +++++++---------- src/test/regress/sql/partition_join.sql | 2 +- 2 files changed, 791 insertions(+), 1049 deletions(-) diff --git a/src/test/regress/expected/partition_join_1.out b/src/test/regress/expected/partition_join_1.out index 83d35561..a9a52a07 100644 --- a/src/test/regress/expected/partition_join_1.out +++ b/src/test/regress/expected/partition_join_1.out @@ -3,7 +3,7 @@ -- Test partition-wise join between partitioned tables -- -- Enable partition-wise join, which by default is disabled. 
-SET enable_partition_wise_join to true; +--SET enable_partition_wise_join to true; -- -- partitioned by a single column -- @@ -32,29 +32,23 @@ SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1, prt2 t2 WHERE t1.a = t2.b AND t1.b = ----------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) -> Sort - Sort Key: t1.a - -> Append - -> Nested Loop - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Seq Scan on prt1_p1 t1 - Filter: (b = 0) - -> Index Scan using iprt2_p1_b on prt2_p1 t2 - Index Cond: (b = t1.a) - -> Nested Loop - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Seq Scan on prt1_p2 t1_1 - Filter: (b = 0) - -> Index Scan using iprt2_p2_b on prt2_p2 t2_1 - Index Cond: (b = t1_1.a) - -> Nested Loop + Sort Key: a + -> Hash Join + Hash Cond: (t2.b = a) + -> Append + -> Seq Scan on prt2_p1 t2 + -> Seq Scan on prt2_p2 t2_1 + -> Seq Scan on prt2_p3 t2_2 + -> Hash -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Seq Scan on prt1_p3 t1_2 - Filter: (b = 0) - -> Bitmap Heap Scan on prt2_p3 t2_2 - Recheck Cond: (b = t1_2.a) - -> Bitmap Index Scan on iprt2_p3_b - Index Cond: (b = t1_2.a) -(24 rows) + -> Append + -> Seq Scan on prt1_p1 t1 + Filter: (b = 0) + -> Seq Scan on prt1_p2 t1_1 + Filter: (b = 0) + -> Seq Scan on prt1_p3 t1_2 + Filter: (b = 0) +(18 rows) SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1, prt2 t2 WHERE t1.a = t2.b AND t1.b = 0 ORDER BY t1.a, t2.b; a | c | b | c @@ -68,40 +62,30 @@ SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1, prt2 t2 WHERE t1.a = t2.b AND t1.b = -- left outer join, with whole-row reference EXPLAIN (COSTS OFF) SELECT t1, t2 FROM prt1 t1 LEFT JOIN prt2 t2 ON t1.a = t2.b WHERE t1.b = 0 ORDER BY t1.a, t2.b; - QUERY PLAN ------------------------------------------------------------------------------------------ + QUERY PLAN +----------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) -> Sort - Sort Key: t1.a, t2.b - -> Result - -> Append - -> Nested Loop Left Join - -> Seq Scan on prt1_p1 t1 - Filter: (b = 0) - -> Materialize - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: b - -> Index Scan using iprt2_p1_b on prt2_p1 t2 - Index Cond: (t1.a = b) - -> Nested Loop Left Join - -> Seq Scan on prt1_p2 t1_1 - Filter: (b = 0) - -> Materialize - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: b - -> Index Scan using iprt2_p2_b on prt2_p2 t2_1 - Index Cond: (t1_1.a = b) - -> Nested Loop Left Join - -> Seq Scan on prt1_p3 t1_2 - Filter: (b = 0) - -> Materialize - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: b - -> Bitmap Heap Scan on prt2_p3 t2_2 - Recheck Cond: (t1_2.a = b) - -> Bitmap Index Scan on iprt2_p3_b - Index Cond: (t1_2.a = b) -(31 rows) + Sort Key: a, b + -> Hash Right Join + Hash Cond: (b = a) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Append + -> Seq Scan on prt2_p1 t2 + -> Seq Scan on prt2_p2 t2_1 + -> Seq Scan on prt2_p3 t2_2 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Append + -> Seq Scan on prt1_p1 t1 + Filter: (b = 0) + -> Seq Scan on prt1_p2 t1_1 + Filter: (b = 0) + -> Seq Scan on prt1_p3 t1_2 + Filter: (b = 0) +(21 rows) SELECT t1, t2 FROM prt1 t1 LEFT JOIN prt2 t2 ON t1.a = t2.b WHERE t1.b = 0 ORDER BY t1.a, t2.b; t1 | t2 @@ -123,35 +107,32 @@ SELECT t1, t2 FROM prt1 t1 LEFT JOIN prt2 t2 
ON t1.a = t2.b WHERE t1.b = 0 ORDER -- right outer join EXPLAIN (COSTS OFF) SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1 RIGHT JOIN prt2 t2 ON t1.a = t2.b WHERE t2.a = 0 ORDER BY t1.a, t2.b; - QUERY PLAN ---------------------------------------------------------------------------- + QUERY PLAN +--------------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) -> Sort - Sort Key: t1.a, t2.b - -> Result - -> Append - -> Nested Loop Left Join - -> Remote Subquery Scan on all (datanode_2) - Distribute results by H: b - -> Seq Scan on prt2_p1 t2 - Filter: (a = 0) - -> Index Scan using iprt1_p1_a on prt1_p1 t1 - Index Cond: (a = t2.b) - -> Nested Loop Left Join - -> Remote Subquery Scan on all (datanode_2) - Distribute results by H: b - -> Seq Scan on prt2_p2 t2_1 - Filter: (a = 0) - -> Index Scan using iprt1_p2_a on prt1_p2 t1_1 - Index Cond: (a = t2_1.b) - -> Nested Loop Left Join - -> Remote Subquery Scan on all (datanode_2) - Distribute results by H: b - -> Seq Scan on prt2_p3 t2_2 - Filter: (a = 0) - -> Index Scan using iprt1_p3_a on prt1_p3 t1_2 - Index Cond: (a = t2_2.b) -(26 rows) + Sort Key: a, b + -> Nested Loop Left Join + -> Remote Subquery Scan on all (datanode_2) + Distribute results by H: b + -> Append + -> Seq Scan on prt2_p1 t2 + Filter: (a = 0) + -> Seq Scan on prt2_p2 t2_1 + Filter: (a = 0) + -> Seq Scan on prt2_p3 t2_2 + Filter: (a = 0) + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Append + -> Index Scan using iprt1_p1_a on prt1_p1 t1 + Index Cond: (a = b) + -> Index Scan using iprt1_p2_a on prt1_p2 t1_1 + Index Cond: (a = b) + -> Index Scan using iprt1_p3_a on prt1_p3 t1_2 + Index Cond: (a = b) +(23 rows) SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1 RIGHT JOIN prt2 t2 ON t1.a = t2.b WHERE t2.a = 0 ORDER BY t1.a, t2.b; a | c | b | c @@ -169,43 +150,34 @@ SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1 RIGHT JOIN prt2 t2 ON t1.a = t2.b WHE -- full outer join, with placeholder vars EXPLAIN (COSTS OFF) SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT 50 phv, * FROM prt1 WHERE prt1.b = 0) t1 FULL JOIN (SELECT 75 phv, * FROM prt2 WHERE prt2.a = 0) t2 ON (t1.a = t2.b) WHERE t1.phv = t1.a OR t2.phv = t2.b ORDER BY t1.a, t2.b; - QUERY PLAN ------------------------------------------------------------------------- + QUERY PLAN +----------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) -> Sort - Sort Key: prt1_p1.a, prt2_p1.b - -> Append - -> Hash Full Join - Hash Cond: (prt1_p1.a = prt2_p1.b) - Filter: (((50) = prt1_p1.a) OR ((75) = prt2_p1.b)) - -> Seq Scan on prt1_p1 - Filter: (b = 0) - -> Hash - -> Remote Subquery Scan on all (datanode_2) - Distribute results by H: b + Sort Key: a, b + -> Hash Full Join + Hash Cond: (a = b) + Filter: (((50) = a) OR ((75) = b)) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Append + -> Seq Scan on prt1_p1 + Filter: (b = 0) + -> Seq Scan on prt1_p2 + Filter: (b = 0) + -> Seq Scan on prt1_p3 + Filter: (b = 0) + -> Hash + -> Remote Subquery Scan on all (datanode_2) + Distribute results by H: b + -> Append -> Seq Scan on prt2_p1 Filter: (a = 0) - -> Hash Full Join - Hash Cond: (prt1_p2.a = prt2_p2.b) - Filter: (((50) = prt1_p2.a) OR ((75) = prt2_p2.b)) - -> Seq Scan on prt1_p2 - Filter: (b = 0) - -> Hash - -> Remote Subquery Scan on all (datanode_2) - Distribute results by H: b -> Seq Scan on prt2_p2 Filter: (a = 0) - -> Hash Full 
Join - Hash Cond: (prt1_p3.a = prt2_p3.b) - Filter: (((50) = prt1_p3.a) OR ((75) = prt2_p3.b)) - -> Seq Scan on prt1_p3 - Filter: (b = 0) - -> Hash - -> Remote Subquery Scan on all (datanode_2) - Distribute results by H: b -> Seq Scan on prt2_p3 Filter: (a = 0) -(34 rows) +(25 rows) SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT 50 phv, * FROM prt1 WHERE prt1.b = 0) t1 FULL JOIN (SELECT 75 phv, * FROM prt2 WHERE prt2.a = 0) t2 ON (t1.a = t2.b) WHERE t1.phv = t1.a OR t2.phv = t2.b ORDER BY t1.a, t2.b; a | c | b | c @@ -217,19 +189,26 @@ SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT 50 phv, * FROM prt1 WHERE prt1.b = 0) -- Join with pruned partitions from joining relations EXPLAIN (COSTS OFF) SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1, prt2 t2 WHERE t1.a = t2.b AND t1.a < 450 AND t2.b > 250 AND t1.b = 0 ORDER BY t1.a, t2.b; - QUERY PLAN ------------------------------------------------------------------------------ + QUERY PLAN +----------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) -> Sort - Sort Key: t1.a - -> Append - -> Nested Loop - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Seq Scan on prt1_p2 t1 + Sort Key: a + -> Nested Loop + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on prt1_p1 t1 + Filter: ((a < 450) AND (b = 0)) + -> Seq Scan on prt1_p2 t1_1 Filter: ((a < 450) AND (b = 0)) + -> Append -> Index Scan using iprt2_p2_b on prt2_p2 t2 - Index Cond: ((b = t1.a) AND (b > 250)) -(10 rows) + Index Cond: ((b = a) AND (b > 250)) + -> Bitmap Heap Scan on prt2_p3 t2_1 + Recheck Cond: ((b = a) AND (b > 250)) + -> Bitmap Index Scan on iprt2_p3_b + Index Cond: ((b = a) AND (b > 250)) +(17 rows) SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1, prt2 t2 WHERE t1.a = t2.b AND t1.a < 450 AND t2.b > 250 AND t1.b = 0 ORDER BY t1.a, t2.b; a | c | b | c @@ -239,92 +218,84 @@ SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1, prt2 t2 WHERE t1.a = t2.b AND t1.a < EXPLAIN (COSTS OFF) SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a < 450) t1 LEFT JOIN (SELECT * FROM prt2 WHERE b > 250) t2 ON t1.a = t2.b WHERE t1.b = 0 ORDER BY t1.a, t2.b; - QUERY PLAN ------------------------------------------------------------------------------ - Sort - Sort Key: prt1_p1.a, b - -> Append - -> Hash Left Join - Hash Cond: (prt1_p1.a = b) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Seq Scan on prt1_p1 - Filter: ((a < 450) AND (b = 0)) - -> Hash - -> Result - One-Time Filter: false + QUERY PLAN +--------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: a, b -> Nested Loop Left Join - -> Seq Scan on prt1_p2 - Filter: ((a < 450) AND (b = 0)) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Append + -> Seq Scan on prt1_p1 + Filter: ((a < 450) AND (b = 0)) + -> Seq Scan on prt1_p2 + Filter: ((a < 450) AND (b = 0)) -> Materialize -> Remote Subquery Scan on all (datanode_1,datanode_2) Distribute results by H: b - -> Index Scan using iprt2_p2_b on prt2_p2 - Index Cond: ((prt1_p2.a = b) AND (b > 250)) -(19 rows) + -> Append + -> Index Scan using iprt2_p2_b on prt2_p2 + Index Cond: ((a = b) AND (b > 250)) + -> Bitmap Heap Scan on prt2_p3 + Recheck Cond: ((a = b) AND (b > 250)) + -> Bitmap Index Scan on iprt2_p3_b + Index Cond: ((a = b) AND (b > 250)) +(21 rows) SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a < 450) t1 LEFT JOIN (SELECT * FROM prt2 
WHERE b > 250) t2 ON t1.a = t2.b WHERE t1.b = 0 ORDER BY t1.a, t2.b; - a | c | b | c + a | c | b | c -----+------+-----+------ - 0 | 0000 | | - 50 | 0050 | | - 100 | 0100 | | - 150 | 0150 | | - 200 | 0200 | | - 250 | 0250 | | + 0 | 0000 | | + 50 | 0050 | | + 100 | 0100 | | + 150 | 0150 | | + 200 | 0200 | | + 250 | 0250 | | 300 | 0300 | 300 | 0300 - 350 | 0350 | | - 400 | 0400 | | + 350 | 0350 | | + 400 | 0400 | | (9 rows) EXPLAIN (COSTS OFF) SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a < 450) t1 FULL JOIN (SELECT * FROM prt2 WHERE b > 250) t2 ON t1.a = t2.b WHERE t1.b = 0 OR t2.a = 0 ORDER BY t1.a, t2.b; QUERY PLAN ----------------------------------------------------------------------------- - Sort - Sort Key: prt1_p1.a, b - -> Append + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: a, b -> Hash Full Join - Hash Cond: (prt1_p1.a = b) - Filter: ((prt1_p1.b = 0) OR (a = 0)) + Hash Cond: (a = b) + Filter: ((b = 0) OR (a = 0)) -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Seq Scan on prt1_p1 - Filter: (a < 450) - -> Hash - -> Result - One-Time Filter: false - -> Hash Full Join - Hash Cond: (prt1_p2.a = prt2_p2.b) - Filter: ((prt1_p2.b = 0) OR (prt2_p2.a = 0)) - -> Seq Scan on prt1_p2 - Filter: (a < 450) + Distribute results by H: a + -> Append + -> Seq Scan on prt1_p1 + Filter: (a < 450) + -> Seq Scan on prt1_p2 + Filter: (a < 450) -> Hash -> Remote Subquery Scan on all (datanode_1,datanode_2) Distribute results by H: b - -> Seq Scan on prt2_p2 - Filter: (b > 250) - -> Hash Full Join - Hash Cond: (prt2_p3.b = a) - Filter: ((b = 0) OR (prt2_p3.a = 0)) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Seq Scan on prt2_p3 - Filter: (b > 250) - -> Hash - -> Result - One-Time Filter: false -(31 rows) + -> Append + -> Seq Scan on prt2_p2 + Filter: (b > 250) + -> Seq Scan on prt2_p3 + Filter: (b > 250) +(21 rows) SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a < 450) t1 FULL JOIN (SELECT * FROM prt2 WHERE b > 250) t2 ON t1.a = t2.b WHERE t1.b = 0 OR t2.a = 0 ORDER BY t1.a, t2.b; - a | c | b | c + a | c | b | c -----+------+-----+------ - 0 | 0000 | | - 50 | 0050 | | - 100 | 0100 | | - 150 | 0150 | | - 200 | 0200 | | - 250 | 0250 | | + 0 | 0000 | | + 50 | 0050 | | + 100 | 0100 | | + 150 | 0150 | | + 200 | 0200 | | + 250 | 0250 | | 300 | 0300 | 300 | 0300 - 350 | 0350 | | - 400 | 0400 | | + 350 | 0350 | | + 400 | 0400 | | | | 375 | 0375 | | 450 | 0450 | | 525 | 0525 @@ -333,55 +304,37 @@ SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a < 450) t1 FULL JO -- Semi-join EXPLAIN (COSTS OFF) SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t2.b FROM prt2 t2 WHERE t2.a = 0) AND t1.b = 0 ORDER BY t1.a; - QUERY PLAN ------------------------------------------------------------------------------- + QUERY PLAN +-------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) -> Sort Sort Key: t1.a - -> Append - -> Nested Loop - -> Remote Subquery Scan on all (datanode_2) - Distribute results by H: b - -> HashAggregate - Group Key: t2.b - -> Remote Subquery Scan on all (datanode_2) - Distribute results by H: b - -> HashAggregate - Group Key: t2.b + -> Nested Loop + -> Remote Subquery Scan on all (datanode_2) + -> HashAggregate + Group Key: b + -> Remote Subquery Scan on all (datanode_2) + Distribute results by H: b + -> HashAggregate + Group Key: t2.b + -> Append -> Seq Scan on prt2_p1 t2 Filter: (a = 0) - -> Index Scan using iprt1_p1_a on prt1_p1 t1 - Index 
Cond: (a = t2.b) - Filter: (b = 0) - -> Nested Loop - -> Remote Subquery Scan on all (datanode_2) - Distribute results by H: b - -> HashAggregate - Group Key: t2_1.b - -> Remote Subquery Scan on all (datanode_2) - Distribute results by H: b - -> HashAggregate - Group Key: t2_1.b -> Seq Scan on prt2_p2 t2_1 Filter: (a = 0) - -> Index Scan using iprt1_p2_a on prt1_p2 t1_1 - Index Cond: (a = t2_1.b) - Filter: (b = 0) - -> Nested Loop - -> Remote Subquery Scan on all (datanode_2) - Distribute results by H: b - -> HashAggregate - Group Key: t2_2.b - -> Remote Subquery Scan on all (datanode_2) - Distribute results by H: b - -> HashAggregate - Group Key: t2_2.b -> Seq Scan on prt2_p3 t2_2 Filter: (a = 0) + -> Append + -> Index Scan using iprt1_p1_a on prt1_p1 t1 + Index Cond: (a = b) + Filter: (b = 0) + -> Index Scan using iprt1_p2_a on prt1_p2 t1_1 + Index Cond: (a = b) + Filter: (b = 0) -> Index Scan using iprt1_p3_a on prt1_p3 t1_2 - Index Cond: (a = t2_2.b) + Index Cond: (a = b) Filter: (b = 0) -(46 rows) +(28 rows) SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t2.b FROM prt2 t2 WHERE t2.a = 0) AND t1.b = 0 ORDER BY t1.a; a | b | c @@ -395,34 +348,24 @@ SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t2.b FROM prt2 t2 WHERE t2.a = 0) -- Anti-join with aggregates EXPLAIN (COSTS OFF) SELECT sum(t1.a), avg(t1.a), sum(t1.b), avg(t1.b) FROM prt1 t1 WHERE NOT EXISTS (SELECT 1 FROM prt2 t2 WHERE t1.a = t2.b); - QUERY PLAN --------------------------------------------------------------------------------------------- + QUERY PLAN +----------------------------------------------------------------------------------- Finalize Aggregate -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Partial Aggregate - -> Append - -> Hash Anti Join - Hash Cond: (t1.a = t2.b) + -> Hash Anti Join + Hash Cond: (t1.a = b) + -> Append -> Seq Scan on prt1_p1 t1 - -> Hash - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: b - -> Seq Scan on prt2_p1 t2 - -> Hash Anti Join - Hash Cond: (t1_1.a = t2_1.b) -> Seq Scan on prt1_p2 t1_1 - -> Hash - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: b - -> Seq Scan on prt2_p2 t2_1 - -> Nested Loop Anti Join -> Seq Scan on prt1_p3 t1_2 - -> Materialize - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: b - -> Index Only Scan using iprt2_p3_b on prt2_p3 t2_2 - Index Cond: (b = t1_2.a) -(25 rows) + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on prt2_p1 t2 + -> Seq Scan on prt2_p2 t2_1 + -> Seq Scan on prt2_p3 t2_2 +(15 rows) SELECT sum(t1.a), avg(t1.a), sum(t1.b), avg(t1.b) FROM prt1 t1 WHERE NOT EXISTS (SELECT 1 FROM prt2 t2 WHERE t1.a = t2.b); sum | avg | sum | avg @@ -435,24 +378,61 @@ EXPLAIN (COSTS OFF) SELECT * FROM prt1 t1 LEFT JOIN LATERAL (SELECT t2.a AS t2a, t3.a AS t3a, least(t1.a,t2.a,t3.b) FROM prt1 t2 JOIN prt2 t3 ON (t2.a = t3.b)) ss ON t1.a = ss.t2a WHERE t1.b = 0 ORDER BY t1.a; -ERROR: could not devise a query plan for the given query + QUERY PLAN +-------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: a + -> Nested Loop Left Join + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Append + -> Seq Scan on prt1_p1 t1 + Filter: (b = 0) + -> Seq Scan on prt1_p2 t1_1 + Filter: (b = 0) + -> Seq Scan on prt1_p3 t1_2 + Filter: (b = 0) + -> Materialize + -> Remote Subquery 
Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Nested Loop + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Index Only Scan using iprt1_p1_a on prt1_p1 t2 + Index Cond: (a = a) + -> Index Only Scan using iprt1_p2_a on prt1_p2 t2_1 + Index Cond: (a = a) + -> Index Only Scan using iprt1_p3_a on prt1_p3 t2_2 + Index Cond: (a = a) + -> Append + -> Index Scan using iprt2_p1_b on prt2_p1 t3 + Index Cond: (b = a) + -> Index Scan using iprt2_p2_b on prt2_p2 t3_1 + Index Cond: (b = a) + -> Bitmap Heap Scan on prt2_p3 t3_2 + Recheck Cond: (b = a) + -> Bitmap Index Scan on iprt2_p3_b + Index Cond: (b = a) +(34 rows) + SELECT * FROM prt1 t1 LEFT JOIN LATERAL (SELECT t2.a AS t2a, t3.a AS t3a, least(t1.a,t2.a,t3.b) FROM prt1 t2 JOIN prt2 t3 ON (t2.a = t3.b)) ss ON t1.a = ss.t2a WHERE t1.b = 0 ORDER BY t1.a; - a | b | c | t2a | t3a | least + a | b | c | t2a | t3a | least -----+---+------+-----+-----+------- 0 | 0 | 0000 | 0 | 0 | 0 - 50 | 0 | 0050 | | | - 100 | 0 | 0100 | | | + 50 | 0 | 0050 | | | + 100 | 0 | 0100 | | | 150 | 0 | 0150 | 150 | 0 | 150 - 200 | 0 | 0200 | | | - 250 | 0 | 0250 | | | + 200 | 0 | 0200 | | | + 250 | 0 | 0250 | | | 300 | 0 | 0300 | 300 | 0 | 300 - 350 | 0 | 0350 | | | - 400 | 0 | 0400 | | | + 350 | 0 | 0350 | | | + 400 | 0 | 0400 | | | 450 | 0 | 0450 | 450 | 0 | 450 - 500 | 0 | 0500 | | | - 550 | 0 | 0550 | | | + 500 | 0 | 0500 | | | + 550 | 0 | 0550 | | | (12 rows) EXPLAIN (COSTS OFF) @@ -464,42 +444,33 @@ SELECT t1.a, ss.t2a, ss.t2c FROM prt1 t1 LEFT JOIN LATERAL Remote Subquery Scan on all (datanode_1,datanode_2) -> Sort Sort Key: a - -> Hash Right Join + -> Hash Left Join Hash Cond: ((c)::text = (c)::text) Filter: ((b + COALESCE(b, 0)) = 0) -> Remote Subquery Scan on all (datanode_1,datanode_2) Distribute results by H: c -> Append - -> Hash Join - Hash Cond: (t2.a = t3.b) - -> Seq Scan on prt1_p1 t2 - -> Hash - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: b - -> Seq Scan on prt2_p1 t3 - -> Hash Join - Hash Cond: (t2_1.a = t3_1.b) - -> Seq Scan on prt1_p2 t2_1 - -> Hash - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: b - -> Seq Scan on prt2_p2 t3_1 - -> Nested Loop - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: b - -> Seq Scan on prt2_p3 t3_2 - -> Index Scan using iprt1_p3_a on prt1_p3 t2_2 - Index Cond: (a = t3_2.b) + -> Seq Scan on prt1_p1 t1 + -> Seq Scan on prt1_p2 t1_1 + -> Seq Scan on prt1_p3 t1_2 -> Hash -> Remote Subquery Scan on all (datanode_1,datanode_2) Distribute results by H: c - -> Append - -> Seq Scan on prt1_p1 t1 - -> Seq Scan on prt1_p2 t1_1 - -> Seq Scan on prt1_p3 t1_2 -(36 rows) - -SELECT t1.a, ss.t2a, ss.t2c FROM prt1 t1 LEFT JOIN LATERAL + -> Hash Join + Hash Cond: (t2.a = b) + -> Append + -> Seq Scan on prt1_p1 t2 + -> Seq Scan on prt1_p2 t2_1 + -> Seq Scan on prt1_p3 t2_2 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on prt2_p1 t3 + -> Seq Scan on prt2_p2 t3_1 + -> Seq Scan on prt2_p3 t3_2 +(27 rows) + +SELECT t1.a, ss.t2a, ss.t2c FROM prt1 t1 LEFT JOIN LATERAL (SELECT t2.a AS t2a, t3.a AS t3a, t2.b t2b, t2.c t2c, least(t1.a,t2.a,t3.a) FROM prt1 t2 JOIN prt2 t3 ON (t2.a = t3.b)) ss ON t1.c = ss.t2c WHERE (t1.b + coalesce(ss.t2b, 0)) = 0 ORDER BY t1.a; a | t2a | t2c @@ -538,34 +509,27 @@ INSERT INTO prt2_e SELECT i, i, i % 25 FROM generate_series(0, 599, 3) i; ANALYZE prt2_e; EXPLAIN (COSTS OFF) SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_e t1, prt2_e t2 
WHERE (t1.a + t1.b)/2 = (t2.b + t2.a)/2 AND t1.c = 0 ORDER BY t1.a, t2.b; - QUERY PLAN ---------------------------------------------------------------------------------------------- + QUERY PLAN +----------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) -> Sort - Sort Key: t1.a, t2.b - -> Append - -> Hash Join - Hash Cond: (((t2.b + t2.a) / 2) = ((t1.a + t1.b) / 2)) + Sort Key: a, t2.b + -> Hash Join + Hash Cond: (((t2.b + t2.a) / 2) = ((a + b) / 2)) + -> Append -> Seq Scan on prt2_e_p1 t2 - -> Hash - -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on prt2_e_p2 t2_1 + -> Seq Scan on prt2_e_p3 t2_2 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append -> Seq Scan on prt1_e_p1 t1 Filter: (c = 0) - -> Hash Join - Hash Cond: (((t2_1.b + t2_1.a) / 2) = ((t1_1.a + t1_1.b) / 2)) - -> Seq Scan on prt2_e_p2 t2_1 - -> Hash - -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Seq Scan on prt1_e_p2 t1_1 Filter: (c = 0) - -> Nested Loop - -> Seq Scan on prt2_e_p3 t2_2 - -> Materialize - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Index Scan using iprt1_e_p3_ab2 on prt1_e_p3 t1_2 - Index Cond: (((a + b) / 2) = ((t2_2.b + t2_2.a) / 2)) + -> Seq Scan on prt1_e_p3 t1_2 Filter: (c = 0) -(25 rows) +(18 rows) SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_e t1, prt2_e t2 WHERE (t1.a + t1.b)/2 = (t2.b + t2.a)/2 AND t1.c = 0 ORDER BY t1.a, t2.b; a | c | b | c @@ -581,59 +545,42 @@ SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_e t1, prt2_e t2 WHERE (t1.a + t1.b)/2 = -- EXPLAIN (COSTS OFF) SELECT t1.a, t1.c, t2.b, t2.c, t3.a + t3.b, t3.c FROM prt1 t1, prt2 t2, prt1_e t3 WHERE t1.a = t2.b AND t1.a = (t3.a + t3.b)/2 AND t1.b = 0 ORDER BY t1.a, t2.b; - QUERY PLAN ------------------------------------------------------------------------------------------------ + QUERY PLAN +--------------------------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) -> Sort - Sort Key: t1.a - -> Result - -> Append - -> Nested Loop - Join Filter: (t1.a = (((t3.a + t3.b) / 2))) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: a - -> Nested Loop - -> Remote Subquery Scan on all (datanode_1,datanode_2) + Sort Key: a + -> Nested Loop + Join Filter: (a = (((a + b) / 2))) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Hash Join + Hash Cond: (t2.b = a) + -> Append + -> Seq Scan on prt2_p1 t2 + -> Seq Scan on prt2_p2 t2_1 + -> Seq Scan on prt2_p3 t2_2 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append -> Seq Scan on prt1_p1 t1 Filter: (b = 0) - -> Index Scan using iprt2_p1_b on prt2_p1 t2 - Index Cond: (b = t1.a) - -> Materialize - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: ((a + b) / 2) - -> Index Scan using iprt1_e_p1_ab2 on prt1_e_p1 t3 - Index Cond: (((a + b) / 2) = t2.b) - -> Nested Loop - Join Filter: (t1_1.a = (((t3_1.a + t3_1.b) / 2))) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: a - -> Nested Loop - -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Seq Scan on prt1_p2 t1_1 Filter: (b = 0) - -> Index Scan using iprt2_p2_b on prt2_p2 t2_1 - Index Cond: (b = t1_1.a) - -> Materialize - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: ((a + b) / 2) - -> Index Scan using iprt1_e_p2_ab2 on prt1_e_p2 t3_1 - Index Cond: (((a + 
b) / 2) = t2_1.b) - -> Nested Loop - Join Filter: (t1_2.a = t2_2.b) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: a - -> Nested Loop - -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Seq Scan on prt1_p3 t1_2 Filter: (b = 0) + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: ((a + b) / 2) + -> Result + -> Append + -> Index Scan using iprt1_e_p1_ab2 on prt1_e_p1 t3 + Index Cond: (((a + b) / 2) = b) + -> Index Scan using iprt1_e_p2_ab2 on prt1_e_p2 t3_1 + Index Cond: (((a + b) / 2) = b) -> Index Scan using iprt1_e_p3_ab2 on prt1_e_p3 t3_2 - Index Cond: (((a + b) / 2) = t1_2.a) - -> Materialize - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: b - -> Index Scan using iprt2_p3_b on prt2_p3 t2_2 - Index Cond: (b = ((t3_2.a + t3_2.b) / 2)) -(50 rows) + Index Cond: (((a + b) / 2) = b) +(33 rows) SELECT t1.a, t1.c, t2.b, t2.c, t3.a + t3.b, t3.c FROM prt1 t1, prt2 t2, prt1_e t3 WHERE t1.a = t2.b AND t1.a = (t3.a + t3.b)/2 AND t1.b = 0 ORDER BY t1.a, t2.b; a | c | b | c | ?column? | c @@ -646,58 +593,40 @@ SELECT t1.a, t1.c, t2.b, t2.c, t3.a + t3.b, t3.c FROM prt1 t1, prt2 t2, prt1_e t EXPLAIN (COSTS OFF) SELECT t1.a, t1.c, t2.b, t2.c, t3.a + t3.b, t3.c FROM (prt1 t1 LEFT JOIN prt2 t2 ON t1.a = t2.b) LEFT JOIN prt1_e t3 ON (t1.a = (t3.a + t3.b)/2) WHERE t1.b = 0 ORDER BY t1.a, t2.b, t3.a + t3.b; - QUERY PLAN ------------------------------------------------------------------------------------------------ + QUERY PLAN +----------------------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) -> Sort - Sort Key: t1.a, t2.b, ((t3.a + t3.b)) - -> Result - -> Append - -> Nested Loop Left Join - -> Nested Loop Left Join - -> Seq Scan on prt1_p1 t1 - Filter: (b = 0) - -> Materialize - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: b - -> Index Scan using iprt2_p1_b on prt2_p1 t2 - Index Cond: (t1.a = b) - -> Materialize - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: ((a + b) / 2) - -> Index Scan using iprt1_e_p1_ab2 on prt1_e_p1 t3 - Index Cond: (t1.a = ((a + b) / 2)) - -> Nested Loop Left Join - -> Nested Loop Left Join - -> Seq Scan on prt1_p2 t1_1 - Filter: (b = 0) - -> Materialize - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: b - -> Index Scan using iprt2_p2_b on prt2_p2 t2_1 - Index Cond: (t1_1.a = b) - -> Materialize - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: ((a + b) / 2) - -> Index Scan using iprt1_e_p2_ab2 on prt1_e_p2 t3_1 - Index Cond: (t1_1.a = ((a + b) / 2)) - -> Nested Loop Left Join - -> Nested Loop Left Join - -> Seq Scan on prt1_p3 t1_2 - Filter: (b = 0) - -> Materialize - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: b - -> Bitmap Heap Scan on prt2_p3 t2_2 - Recheck Cond: (t1_2.a = b) - -> Bitmap Index Scan on iprt2_p3_b - Index Cond: (t1_2.a = b) - -> Materialize + Sort Key: a, b, ((a + b)) + -> Hash Right Join + Hash Cond: ((((a + b) / 2)) = a) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: ((a + b) / 2) + -> Result + -> Append + -> Seq Scan on prt1_e_p1 t3 + -> Seq Scan on prt1_e_p2 t3_1 + -> Seq Scan on prt1_e_p3 t3_2 + -> Hash + -> Hash Right Join + Hash Cond: (b = a) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Append + -> 
Seq Scan on prt2_p1 t2 + -> Seq Scan on prt2_p2 t2_1 + -> Seq Scan on prt2_p3 t2_2 + -> Hash -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: ((a + b) / 2) - -> Index Scan using iprt1_e_p3_ab2 on prt1_e_p3 t3_2 - Index Cond: (t1_2.a = ((a + b) / 2)) -(49 rows) + Distribute results by H: a + -> Append + -> Seq Scan on prt1_p1 t1 + Filter: (b = 0) + -> Seq Scan on prt1_p2 t1_1 + Filter: (b = 0) + -> Seq Scan on prt1_p3 t1_2 + Filter: (b = 0) +(31 rows) SELECT t1.a, t1.c, t2.b, t2.c, t3.a + t3.b, t3.c FROM (prt1 t1 LEFT JOIN prt2 t2 ON t1.a = t2.b) LEFT JOIN prt1_e t3 ON (t1.a = (t3.a + t3.b)/2) WHERE t1.b = 0 ORDER BY t1.a, t2.b, t3.a + t3.b; a | c | b | c | ?column? | c @@ -718,59 +647,44 @@ SELECT t1.a, t1.c, t2.b, t2.c, t3.a + t3.b, t3.c FROM (prt1 t1 LEFT JOIN prt2 t2 EXPLAIN (COSTS OFF) SELECT t1.a, t1.c, t2.b, t2.c, t3.a + t3.b, t3.c FROM (prt1 t1 LEFT JOIN prt2 t2 ON t1.a = t2.b) RIGHT JOIN prt1_e t3 ON (t1.a = (t3.a + t3.b)/2) WHERE t3.c = 0 ORDER BY t1.a, t2.b, t3.a + t3.b; - QUERY PLAN ------------------------------------------------------------------------------------------------ + QUERY PLAN +----------------------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) -> Sort - Sort Key: t1.a, t2.b, ((t3.a + t3.b)) - -> Result - -> Append - -> Nested Loop Left Join - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: a - -> Nested Loop Left Join - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: ((a + b) / 2) - -> Seq Scan on prt1_e_p1 t3 - Filter: (c = 0) - -> Index Scan using iprt1_p1_a on prt1_p1 t1 - Index Cond: (a = ((t3.a + t3.b) / 2)) - -> Materialize - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: b - -> Index Scan using iprt2_p1_b on prt2_p1 t2 - Index Cond: (t1.a = b) - -> Nested Loop Left Join - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: a - -> Nested Loop Left Join - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: ((a + b) / 2) - -> Seq Scan on prt1_e_p2 t3_1 - Filter: (c = 0) - -> Index Scan using iprt1_p2_a on prt1_p2 t1_1 - Index Cond: (a = ((t3_1.a + t3_1.b) / 2)) - -> Materialize - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: b - -> Index Scan using iprt2_p2_b on prt2_p2 t2_1 - Index Cond: (t1_1.a = b) - -> Nested Loop Left Join + Sort Key: a, b, ((a + b)) + -> Nested Loop Left Join + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Hash Right Join + Hash Cond: (a = (((a + b) / 2))) -> Remote Subquery Scan on all (datanode_1,datanode_2) Distribute results by H: a - -> Nested Loop Left Join - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: ((a + b) / 2) - -> Seq Scan on prt1_e_p3 t3_2 - Filter: (c = 0) - -> Index Scan using iprt1_p3_a on prt1_p3 t1_2 - Index Cond: (a = ((t3_2.a + t3_2.b) / 2)) - -> Materialize + -> Append + -> Seq Scan on prt1_p1 t1 + -> Seq Scan on prt1_p2 t1_1 + -> Seq Scan on prt1_p3 t1_2 + -> Hash -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: b - -> Index Scan using iprt2_p3_b on prt2_p3 t2_2 - Index Cond: (t1_2.a = b) -(50 rows) + Distribute results by H: ((a + b) / 2) + -> Result + -> Append + -> Seq Scan on prt1_e_p1 t3 + Filter: (c = 0) + -> Seq Scan on prt1_e_p2 t3_1 + Filter: (c = 0) + -> Seq Scan on prt1_e_p3 t3_2 + Filter: (c 
= 0) + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Append + -> Index Scan using iprt2_p1_b on prt2_p1 t2 + Index Cond: (a = b) + -> Index Scan using iprt2_p2_b on prt2_p2 t2_1 + Index Cond: (a = b) + -> Index Scan using iprt2_p3_b on prt2_p3 t2_2 + Index Cond: (a = b) +(35 rows) SELECT t1.a, t1.c, t2.b, t2.c, t3.a + t3.b, t3.c FROM (prt1 t1 LEFT JOIN prt2 t2 ON t1.a = t2.b) RIGHT JOIN prt1_e t3 ON (t1.a = (t3.a + t3.b)/2) WHERE t3.c = 0 ORDER BY t1.a, t2.b, t3.a + t3.b; a | c | b | c | ?column? | c @@ -793,71 +707,49 @@ SELECT t1.a, t1.c, t2.b, t2.c, t3.a + t3.b, t3.c FROM (prt1 t1 LEFT JOIN prt2 t2 -- make sure these go to null as expected EXPLAIN (COSTS OFF) SELECT t1.a, t1.phv, t2.b, t2.phv, t3.a + t3.b, t3.phv FROM ((SELECT 50 phv, * FROM prt1 WHERE prt1.b = 0) t1 FULL JOIN (SELECT 75 phv, * FROM prt2 WHERE prt2.a = 0) t2 ON (t1.a = t2.b)) FULL JOIN (SELECT 50 phv, * FROM prt1_e WHERE prt1_e.c = 0) t3 ON (t1.a = (t3.a + t3.b)/2) WHERE t1.a = t1.phv OR t2.b = t2.phv OR (t3.a + t3.b)/2 = t3.phv ORDER BY t1.a, t2.b, t3.a + t3.b; - QUERY PLAN ----------------------------------------------------------------------------------------------------------------------------- + QUERY PLAN +----------------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) -> Sort - Sort Key: prt1_p1.a, prt2_p1.b, ((prt1_e_p1.a + prt1_e_p1.b)) - -> Result - -> Append + Sort Key: a, b, ((a + b)) + -> Hash Full Join + Hash Cond: (a = (((a + b) / 2))) + Filter: ((a = (50)) OR (b = (75)) OR (((a + b) / 2) = (50))) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a -> Hash Full Join - Hash Cond: (prt1_p1.a = (((prt1_e_p1.a + prt1_e_p1.b) / 2))) - Filter: ((prt1_p1.a = (50)) OR (prt2_p1.b = (75)) OR (((prt1_e_p1.a + prt1_e_p1.b) / 2) = (50))) + Hash Cond: (a = b) -> Remote Subquery Scan on all (datanode_1,datanode_2) Distribute results by H: a - -> Hash Full Join - Hash Cond: (prt1_p1.a = prt2_p1.b) + -> Append -> Seq Scan on prt1_p1 Filter: (b = 0) - -> Hash - -> Remote Subquery Scan on all (datanode_2) - Distribute results by H: b - -> Seq Scan on prt2_p1 - Filter: (a = 0) - -> Hash - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: ((a + b) / 2) - -> Seq Scan on prt1_e_p1 - Filter: (c = 0) - -> Hash Full Join - Hash Cond: (prt1_p2.a = (((prt1_e_p2.a + prt1_e_p2.b) / 2))) - Filter: ((prt1_p2.a = (50)) OR (prt2_p2.b = (75)) OR (((prt1_e_p2.a + prt1_e_p2.b) / 2) = (50))) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: a - -> Hash Full Join - Hash Cond: (prt1_p2.a = prt2_p2.b) -> Seq Scan on prt1_p2 Filter: (b = 0) - -> Hash - -> Remote Subquery Scan on all (datanode_2) - Distribute results by H: b - -> Seq Scan on prt2_p2 - Filter: (a = 0) - -> Hash - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: ((a + b) / 2) - -> Seq Scan on prt1_e_p2 - Filter: (c = 0) - -> Hash Full Join - Hash Cond: (prt1_p3.a = (((prt1_e_p3.a + prt1_e_p3.b) / 2))) - Filter: ((prt1_p3.a = (50)) OR (prt2_p3.b = (75)) OR (((prt1_e_p3.a + prt1_e_p3.b) / 2) = (50))) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: a - -> Hash Full Join - Hash Cond: (prt1_p3.a = prt2_p3.b) -> Seq Scan on prt1_p3 Filter: (b = 0) - -> Hash - -> Remote Subquery Scan on all (datanode_2) - Distribute results by H: b - -> Seq Scan on prt2_p3 - Filter: (a = 0) -> Hash - -> Remote 
Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: ((a + b) / 2) + -> Remote Subquery Scan on all (datanode_2) + Distribute results by H: b + -> Append + -> Seq Scan on prt2_p1 + Filter: (a = 0) + -> Seq Scan on prt2_p2 + Filter: (a = 0) + -> Seq Scan on prt2_p3 + Filter: (a = 0) + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: ((a + b) / 2) + -> Result + -> Append + -> Seq Scan on prt1_e_p1 + Filter: (c = 0) + -> Seq Scan on prt1_e_p2 + Filter: (c = 0) -> Seq Scan on prt1_e_p3 Filter: (c = 0) -(62 rows) +(40 rows) SELECT t1.a, t1.phv, t2.b, t2.phv, t3.a + t3.b, t3.phv FROM ((SELECT 50 phv, * FROM prt1 WHERE prt1.b = 0) t1 FULL JOIN (SELECT 75 phv, * FROM prt2 WHERE prt2.a = 0) t2 ON (t1.a = t2.b)) FULL JOIN (SELECT 50 phv, * FROM prt1_e WHERE prt1_e.c = 0) t3 ON (t1.a = (t3.a + t3.b)/2) WHERE t1.a = t1.phv OR t2.b = t2.phv OR (t3.a + t3.b)/2 = t3.phv ORDER BY t1.a, t2.b, t3.a + t3.b; a | phv | b | phv | ?column? | phv @@ -873,66 +765,47 @@ SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t1.b FROM prt2 t1, prt1_e t2 WHER --------------------------------------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) -> Sort - Sort Key: t1.a - -> Append - -> Nested Loop - Join Filter: (t1.a = t1_3.b) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: b - -> HashAggregate - Group Key: t1_3.b - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: b - -> HashAggregate - Group Key: t1_3.b - -> Nested Loop - -> Remote Subquery Scan on all (datanode_2) + Sort Key: a + -> Nested Loop + Join Filter: (a = b) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> HashAggregate + Group Key: b + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> HashAggregate + Group Key: b + -> Nested Loop + -> Remote Subquery Scan on all (datanode_2) + -> Append -> Seq Scan on prt2_p1 t1_3 Filter: (a = 0) - -> Index Scan using iprt1_e_p1_ab2 on prt1_e_p1 t2 - Index Cond: (((a + b) / 2) = t1_3.b) - -> Index Scan using iprt1_p1_a on prt1_p1 t1 - Index Cond: (a = ((t2.a + t2.b) / 2)) - Filter: (b = 0) - -> Nested Loop - Join Filter: (t1_1.a = t1_4.b) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: b - -> HashAggregate - Group Key: t1_4.b - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: b - -> HashAggregate - Group Key: t1_4.b - -> Nested Loop - -> Remote Subquery Scan on all (datanode_2) -> Seq Scan on prt2_p2 t1_4 Filter: (a = 0) - -> Index Scan using iprt1_e_p2_ab2 on prt1_e_p2 t2_1 - Index Cond: (((a + b) / 2) = t1_4.b) - -> Index Scan using iprt1_p2_a on prt1_p2 t1_1 - Index Cond: (a = ((t2_1.a + t2_1.b) / 2)) - Filter: (b = 0) - -> Nested Loop - Join Filter: (t1_2.a = t1_5.b) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: b - -> HashAggregate - Group Key: t1_5.b - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: b - -> HashAggregate - Group Key: t1_5.b - -> Nested Loop - -> Remote Subquery Scan on all (datanode_2) -> Seq Scan on prt2_p3 t1_5 Filter: (a = 0) + -> Append + -> Index Scan using iprt1_e_p1_ab2 on prt1_e_p1 t2 + Index Cond: (((a + b) / 2) = b) + -> Index Scan using iprt1_e_p2_ab2 on prt1_e_p2 t2_1 + Index Cond: (((a + b) / 2) = b) -> Index Scan using iprt1_e_p3_ab2 on prt1_e_p3 t2_2 - Index Cond: (((a + b) / 
2) = t1_5.b) - -> Index Scan using iprt1_p3_a on prt1_p3 t1_2 - Index Cond: (a = ((t2_2.a + t2_2.b) / 2)) - Filter: (b = 0) -(61 rows) + Index Cond: (((a + b) / 2) = b) + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Append + -> Index Scan using iprt1_p1_a on prt1_p1 t1 + Index Cond: (a = ((a + b) / 2)) + Filter: (b = 0) + -> Index Scan using iprt1_p2_a on prt1_p2 t1_1 + Index Cond: (a = ((a + b) / 2)) + Filter: (b = 0) + -> Index Scan using iprt1_p3_a on prt1_p3 t1_2 + Index Cond: (a = ((a + b) / 2)) + Filter: (b = 0) +(42 rows) SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t1.b FROM prt2 t1, prt1_e t2 WHERE t1.a = 0 AND t1.b = (t2.a + t2.b)/2) AND t1.b = 0 ORDER BY t1.a; a | b | c @@ -945,57 +818,49 @@ SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t1.b FROM prt2 t1, prt1_e t2 WHER EXPLAIN (COSTS OFF) SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t1.b FROM prt2 t1 WHERE t1.b IN (SELECT (t1.a + t1.b)/2 FROM prt1_e t1 WHERE t1.c = 0)) AND t1.b = 0 ORDER BY t1.a; - QUERY PLAN ---------------------------------------------------------------------------------------------------------- + QUERY PLAN +----------------------------------------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) -> Sort - Sort Key: t1.a - -> Append - -> Nested Loop Semi Join - -> Seq Scan on prt1_p1 t1 - Filter: (b = 0) - -> Materialize - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: b - -> Nested Loop Semi Join - -> Index Only Scan using iprt2_p1_b on prt2_p1 t1_3 - Index Cond: (b = t1.a) - -> Materialize - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Index Scan using iprt1_e_p1_ab2 on prt1_e_p1 t1_6 - Index Cond: (((a + b) / 2) = t1_3.b) - Filter: (c = 0) - -> Nested Loop Semi Join - -> Seq Scan on prt1_p2 t1_1 - Filter: (b = 0) - -> Materialize - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: b - -> Nested Loop Semi Join - -> Index Only Scan using iprt2_p2_b on prt2_p2 t1_4 - Index Cond: (b = t1_1.a) - -> Materialize - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Index Scan using iprt1_e_p2_ab2 on prt1_e_p2 t1_7 - Index Cond: (((a + b) / 2) = t1_4.b) - Filter: (c = 0) - -> Nested Loop Semi Join - -> Seq Scan on prt1_p3 t1_2 - Filter: (b = 0) - -> Materialize + Sort Key: a + -> Nested Loop + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> HashAggregate + Group Key: b -> Remote Subquery Scan on all (datanode_1,datanode_2) Distribute results by H: b - -> Nested Loop Semi Join - -> Bitmap Heap Scan on prt2_p3 t1_5 - Recheck Cond: (b = t1_2.a) - -> Bitmap Index Scan on iprt2_p3_b - Index Cond: (b = t1_2.a) - -> Materialize - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Index Scan using iprt1_e_p3_ab2 on prt1_e_p3 t1_8 - Index Cond: (((a + b) / 2) = t1_5.b) - Filter: (c = 0) -(48 rows) + -> HashAggregate + Group Key: t1_3.b + -> Hash Semi Join + Hash Cond: (t1_3.b = ((a + b) / 2)) + -> Append + -> Seq Scan on prt2_p1 t1_3 + -> Seq Scan on prt2_p2 t1_4 + -> Seq Scan on prt2_p3 t1_5 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on prt1_e_p1 t1_6 + Filter: (c = 0) + -> Seq Scan on prt1_e_p2 t1_7 + Filter: (c = 0) + -> Seq Scan on prt1_e_p3 t1_8 + Filter: (c = 0) + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Append + -> Index Scan using 
iprt1_p1_a on prt1_p1 t1 + Index Cond: (a = b) + Filter: (b = 0) + -> Index Scan using iprt1_p2_a on prt1_p2 t1_1 + Index Cond: (a = b) + Filter: (b = 0) + -> Index Scan using iprt1_p3_a on prt1_p3 t1_2 + Index Cond: (a = b) + Filter: (b = 0) +(40 rows) SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t1.b FROM prt2 t1 WHERE t1.b IN (SELECT (t1.a + t1.b)/2 FROM prt1_e t1 WHERE t1.c = 0)) AND t1.b = 0 ORDER BY t1.a; a | b | c @@ -1011,67 +876,44 @@ SET enable_hashjoin TO off; SET enable_nestloop TO off; EXPLAIN (COSTS OFF) SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t1.b FROM prt2 t1 WHERE t1.b IN (SELECT (t1.a + t1.b)/2 FROM prt1_e t1 WHERE t1.c = 0)) AND t1.b = 0 ORDER BY t1.a; - QUERY PLAN ------------------------------------------------------------------------------------------ + QUERY PLAN +----------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) - -> Sort - Sort Key: t1.a - -> Append - -> Merge Semi Join - Merge Cond: (t1.a = t1_3.b) - -> Sort - Sort Key: t1.a + -> Merge Semi Join + Merge Cond: (a = b) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Sort + Sort Key: t1.a + -> Append -> Seq Scan on prt1_p1 t1 Filter: (b = 0) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: b - -> Merge Semi Join - Merge Cond: (t1_3.b = (((t1_6.a + t1_6.b) / 2))) - -> Sort - Sort Key: t1_3.b - -> Seq Scan on prt2_p1 t1_3 - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Sort - Sort Key: (((t1_6.a + t1_6.b) / 2)) - -> Seq Scan on prt1_e_p1 t1_6 - Filter: (c = 0) - -> Merge Semi Join - Merge Cond: (t1_1.a = t1_4.b) - -> Sort - Sort Key: t1_1.a -> Seq Scan on prt1_p2 t1_1 Filter: (b = 0) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: b - -> Merge Semi Join - Merge Cond: (t1_4.b = (((t1_7.a + t1_7.b) / 2))) - -> Sort - Sort Key: t1_4.b - -> Seq Scan on prt2_p2 t1_4 - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Sort - Sort Key: (((t1_7.a + t1_7.b) / 2)) - -> Seq Scan on prt1_e_p2 t1_7 - Filter: (c = 0) - -> Merge Semi Join - Merge Cond: (t1_2.a = t1_5.b) - -> Sort - Sort Key: t1_2.a -> Seq Scan on prt1_p3 t1_2 Filter: (b = 0) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Merge Semi Join + Merge Cond: (t1_3.b = (((a + b) / 2))) + -> Sort + Sort Key: t1_3.b + -> Append + -> Seq Scan on prt2_p1 t1_3 + -> Seq Scan on prt2_p2 t1_4 + -> Seq Scan on prt2_p3 t1_5 -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: b - -> Merge Semi Join - Merge Cond: (t1_5.b = (((t1_8.a + t1_8.b) / 2))) - -> Sort - Sort Key: t1_5.b - -> Seq Scan on prt2_p3 t1_5 - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Sort - Sort Key: (((t1_8.a + t1_8.b) / 2)) + -> Sort + Sort Key: (((t1_6.a + t1_6.b) / 2)) + -> Result + -> Append + -> Seq Scan on prt1_e_p1 t1_6 + Filter: (c = 0) + -> Seq Scan on prt1_e_p2 t1_7 + Filter: (c = 0) -> Seq Scan on prt1_e_p3 t1_8 Filter: (c = 0) -(58 rows) +(35 rows) SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t1.b FROM prt2 t1 WHERE t1.b IN (SELECT (t1.a + t1.b)/2 FROM prt1_e t1 WHERE t1.c = 0)) AND t1.b = 0 ORDER BY t1.a; a | b | c @@ -1084,77 +926,50 @@ SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t1.b FROM prt2 t1 WHERE t1.b IN ( EXPLAIN (COSTS OFF) SELECT t1.a, t1.c, t2.b, t2.c, t3.a + t3.b, t3.c FROM (prt1 t1 LEFT JOIN prt2 t2 ON t1.a = t2.b) RIGHT JOIN prt1_e t3 ON (t1.a = (t3.a + t3.b)/2) WHERE t3.c = 
0 ORDER BY t1.a, t2.b, t3.a + t3.b; - QUERY PLAN ------------------------------------------------------------------------------------------------------------ + QUERY PLAN +----------------------------------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) -> Sort - Sort Key: t1.a, t2.b, ((t3.a + t3.b)) - -> Result - -> Append - -> Merge Right Join - Merge Cond: (t1.a = (((t3.a + t3.b) / 2))) - -> Merge Left Join - Merge Cond: (t1.a = t2.b) - -> Sort - Sort Key: t1.a - -> Seq Scan on prt1_p1 t1 - -> Materialize - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: b - -> Sort - Sort Key: t2.b - -> Seq Scan on prt2_p1 t2 - -> Materialize - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: ((a + b) / 2) - -> Sort - Sort Key: (((t3.a + t3.b) / 2)) - -> Seq Scan on prt1_e_p1 t3 - Filter: (c = 0) - -> Merge Right Join - Merge Cond: (t1_1.a = (((t3_1.a + t3_1.b) / 2))) - -> Merge Left Join - Merge Cond: (t1_1.a = t2_1.b) - -> Sort - Sort Key: t1_1.a - -> Seq Scan on prt1_p2 t1_1 - -> Materialize + Sort Key: a, b, ((a + b)) + -> Merge Right Join + Merge Cond: (b = a) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Sort + Sort Key: t2.b + -> Append + -> Seq Scan on prt2_p1 t2 + -> Seq Scan on prt2_p2 t2_1 + -> Seq Scan on prt2_p3 t2_2 + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Sort + Sort Key: a + -> Merge Right Join + Merge Cond: (a = (((a + b) / 2))) -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: b + Distribute results by H: a -> Sort - Sort Key: t2_1.b - -> Seq Scan on prt2_p2 t2_1 - -> Materialize - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: ((a + b) / 2) - -> Sort - Sort Key: (((t3_1.a + t3_1.b) / 2)) - -> Seq Scan on prt1_e_p2 t3_1 - Filter: (c = 0) - -> Merge Right Join - Merge Cond: (t2_2.b = t1_2.a) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: b - -> Sort - Sort Key: t2_2.b - -> Seq Scan on prt2_p3 t2_2 - -> Materialize - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: a - -> Sort - Sort Key: t1_2.a - -> Merge Left Join - Merge Cond: ((((t3_2.a + t3_2.b) / 2)) = t1_2.a) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: ((a + b) / 2) - -> Sort - Sort Key: (((t3_2.a + t3_2.b) / 2)) - -> Seq Scan on prt1_e_p3 t3_2 - Filter: (c = 0) - -> Sort - Sort Key: t1_2.a + Sort Key: t1.a + -> Append + -> Seq Scan on prt1_p1 t1 + -> Seq Scan on prt1_p2 t1_1 -> Seq Scan on prt1_p3 t1_2 -(68 rows) + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: ((a + b) / 2) + -> Sort + Sort Key: (((t3.a + t3.b) / 2)) + -> Result + -> Append + -> Seq Scan on prt1_e_p1 t3 + Filter: (c = 0) + -> Seq Scan on prt1_e_p2 t3_1 + Filter: (c = 0) + -> Seq Scan on prt1_e_p3 t3_2 + Filter: (c = 0) +(41 rows) SELECT t1.a, t1.c, t2.b, t2.c, t3.a + t3.b, t3.c FROM (prt1 t1 LEFT JOIN prt2 t2 ON t1.a = t2.b) RIGHT JOIN prt1_e t3 ON (t1.a = (t3.a + t3.b)/2) WHERE t3.c = 0 ORDER BY t1.a, t2.b, t3.a + t3.b; a | c | b | c | ?column? 
| c @@ -1176,48 +991,46 @@ SELECT t1.a, t1.c, t2.b, t2.c, t3.a + t3.b, t3.c FROM (prt1 t1 LEFT JOIN prt2 t2 -- MergeAppend on nullable column EXPLAIN (COSTS OFF) SELECT t1.a, t2.b FROM (SELECT * FROM prt1 WHERE a < 450) t1 LEFT JOIN (SELECT * FROM prt2 WHERE b > 250) t2 ON t1.a = t2.b WHERE t1.b = 0 ORDER BY t1.a, t2.b; - QUERY PLAN ------------------------------------------------------------------------ - Sort - Sort Key: prt1_p1.a, b - -> Append - -> Merge Left Join - Merge Cond: (prt1_p1.a = b) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Sort - Sort Key: prt1_p1.a - -> Seq Scan on prt1_p1 - Filter: ((a < 450) AND (b = 0)) - -> Sort - Sort Key: b - -> Result - One-Time Filter: false + QUERY PLAN +----------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: a, b -> Merge Right Join - Merge Cond: (prt2_p2.b = prt1_p2.a) + Merge Cond: (b = a) -> Remote Subquery Scan on all (datanode_1,datanode_2) Distribute results by H: b -> Sort Sort Key: prt2_p2.b - -> Seq Scan on prt2_p2 - Filter: (b > 250) - -> Sort - Sort Key: prt1_p2.a - -> Seq Scan on prt1_p2 - Filter: ((a < 450) AND (b = 0)) -(26 rows) + -> Append + -> Seq Scan on prt2_p2 + Filter: (b > 250) + -> Seq Scan on prt2_p3 + Filter: (b > 250) + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Sort + Sort Key: prt1_p1.a + -> Append + -> Seq Scan on prt1_p1 + Filter: ((a < 450) AND (b = 0)) + -> Seq Scan on prt1_p2 + Filter: ((a < 450) AND (b = 0)) +(24 rows) SELECT t1.a, t2.b FROM (SELECT * FROM prt1 WHERE a < 450) t1 LEFT JOIN (SELECT * FROM prt2 WHERE b > 250) t2 ON t1.a = t2.b WHERE t1.b = 0 ORDER BY t1.a, t2.b; - a | b + a | b -----+----- - 0 | - 50 | - 100 | - 150 | - 200 | - 250 | + 0 | + 50 | + 100 | + 150 | + 200 | + 250 | 300 | 300 - 350 | - 400 | + 350 | + 400 | (9 rows) RESET enable_hashjoin; @@ -1239,40 +1052,34 @@ INSERT INTO prt2_m SELECT i, i, i % 25 FROM generate_series(0, 599, 3) i; ANALYZE prt2_m; EXPLAIN (COSTS OFF) SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1_m WHERE prt1_m.c = 0) t1 FULL JOIN (SELECT * FROM prt2_m WHERE prt2_m.c = 0) t2 ON (t1.a = (t2.b + t2.a)/2 AND t2.b = (t1.a + t1.b)/2) ORDER BY t1.a, t2.b; - QUERY PLAN --------------------------------------------------------------------------------------------------------------------------------------------- - Remote Subquery Scan on all (datanode_1,datanode_2) - -> Sort - Sort Key: prt1_m_p1.a, prt2_m_p1.b - -> Append - -> Hash Full Join - Hash Cond: ((prt1_m_p1.a = (((prt2_m_p1.b + prt2_m_p1.a) / 2))) AND (((prt1_m_p1.a + prt1_m_p1.b) / 2) = prt2_m_p1.b)) - -> Seq Scan on prt1_m_p1 - Filter: (c = 0) - -> Hash - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: ((b + a) / 2) - -> Seq Scan on prt2_m_p1 - Filter: (c = 0) - -> Hash Full Join - Hash Cond: ((prt1_m_p2.a = (((prt2_m_p2.b + prt2_m_p2.a) / 2))) AND (((prt1_m_p2.a + prt1_m_p2.b) / 2) = prt2_m_p2.b)) - -> Seq Scan on prt1_m_p2 - Filter: (c = 0) - -> Hash - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: ((b + a) / 2) - -> Seq Scan on prt2_m_p2 - Filter: (c = 0) - -> Hash Full Join - Hash Cond: ((prt1_m_p3.a = (((prt2_m_p3.b + prt2_m_p3.a) / 2))) AND (((prt1_m_p3.a + prt1_m_p3.b) / 2) = prt2_m_p3.b)) - -> Seq Scan on prt1_m_p3 - Filter: (c = 0) - -> Hash - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: ((b + a) / 2) - -> Seq Scan on 
prt2_m_p3 - Filter: (c = 0) -(31 rows) + QUERY PLAN +----------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: a, b + -> Hash Full Join + Hash Cond: ((a = (((b + a) / 2))) AND (((a + b) / 2) = b)) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Append + -> Seq Scan on prt1_m_p1 + Filter: (c = 0) + -> Seq Scan on prt1_m_p2 + Filter: (c = 0) + -> Seq Scan on prt1_m_p3 + Filter: (c = 0) + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: ((b + a) / 2) + -> Result + -> Append + -> Seq Scan on prt2_m_p1 + Filter: (c = 0) + -> Seq Scan on prt2_m_p2 + Filter: (c = 0) + -> Seq Scan on prt2_m_p3 + Filter: (c = 0) +(25 rows) SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1_m WHERE prt1_m.c = 0) t1 FULL JOIN (SELECT * FROM prt2_m WHERE prt2_m.c = 0) t2 ON (t1.a = (t2.b + t2.a)/2 AND t2.b = (t1.a + t1.b)/2) ORDER BY t1.a, t2.b; a | c | b | c @@ -1322,8 +1129,8 @@ ANALYZE plt1_e; -- test partition matching with N-way join EXPLAIN (COSTS OFF) SELECT avg(t1.a), avg(t2.b), avg(t3.a + t3.b), t1.c, t2.c, t3.c FROM plt1 t1, plt2 t2, plt1_e t3 WHERE t1.c = t2.c AND ltrim(t3.c, 'A') = t1.c GROUP BY t1.c, t2.c, t3.c ORDER BY t1.c, t2.c, t3.c; - QUERY PLAN ------------------------------------------------------------------------------------------------------------------ + QUERY PLAN +----------------------------------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) -> Finalize GroupAggregate Group Key: c, c, c @@ -1332,43 +1139,28 @@ SELECT avg(t1.a), avg(t2.b), avg(t3.a + t3.b), t1.c, t2.c, t3.c FROM plt1 t1, pl -> Remote Subquery Scan on all (datanode_1,datanode_2) Distribute results by H: c -> Partial HashAggregate - Group Key: t1.c, t2.c, t3.c - -> Result - -> Append - -> Hash Join - Hash Cond: (t1.c = t2.c) + Group Key: c, c, t3.c + -> Hash Join + Hash Cond: (c = c) + -> Hash Join + Hash Cond: (ltrim(t3.c, 'A'::text) = c) + -> Append + -> Seq Scan on plt1_e_p1 t3 + -> Seq Scan on plt1_e_p2 t3_1 + -> Seq Scan on plt1_e_p3 t3_2 + -> Hash -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on plt2_p1 t2 + -> Seq Scan on plt2_p2 t2_1 + -> Seq Scan on plt2_p3 t2_2 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append -> Seq Scan on plt1_p1 t1 - -> Hash - -> Hash Join - Hash Cond: (t2.c = ltrim(t3.c, 'A'::text)) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Seq Scan on plt2_p1 t2 - -> Hash - -> Seq Scan on plt1_e_p1 t3 - -> Hash Join - Hash Cond: (t1_1.c = t2_1.c) - -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Seq Scan on plt1_p2 t1_1 - -> Hash - -> Hash Join - Hash Cond: (t2_1.c = ltrim(t3_1.c, 'A'::text)) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Seq Scan on plt2_p2 t2_1 - -> Hash - -> Seq Scan on plt1_e_p2 t3_1 - -> Hash Join - Hash Cond: (t1_2.c = t2_2.c) - -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Seq Scan on plt1_p3 t1_2 - -> Hash - -> Hash Join - Hash Cond: (t2_2.c = ltrim(t3_2.c, 'A'::text)) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Seq Scan on plt2_p3 t2_2 - -> Hash - -> Seq Scan on plt1_e_p3 t3_2 -(44 rows) +(29 rows) SELECT avg(t1.a), avg(t2.b), avg(t3.a + t3.b), t1.c, t2.c, t3.c FROM plt1 t1, plt2 t2, plt1_e t3 WHERE t1.c = t2.c AND ltrim(t3.c, 'A') = t1.c GROUP BY t1.c, t2.c, t3.c ORDER BY t1.c, t2.c, t3.c; avg 
| avg | avg | c | c | c @@ -1406,36 +1198,27 @@ SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a = 1 AND a = 2) t1 EXPLAIN (COSTS OFF) SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a = 1 AND a = 2) t1 RIGHT JOIN prt2 t2 ON t1.a = t2.b, prt1 t3 WHERE t2.b = t3.a; - QUERY PLAN ------------------------------------------------------------------------------------ + QUERY PLAN +----------------------------------------------------------------------- Hash Left Join Hash Cond: (b = a) -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Append - -> Hash Join - Hash Cond: (t3.a = t2.b) - -> Seq Scan on prt1_p1 t3 - -> Hash - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: b - -> Seq Scan on prt2_p1 t2 - -> Hash Join - Hash Cond: (t3_1.a = t2_1.b) - -> Seq Scan on prt1_p2 t3_1 - -> Hash - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: b - -> Seq Scan on prt2_p2 t2_1 - -> Nested Loop - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: b + -> Hash Join + Hash Cond: (a = t2.b) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on prt1_p1 t3 + -> Seq Scan on prt1_p2 t3_1 + -> Seq Scan on prt1_p3 t3_2 + -> Hash + -> Append + -> Seq Scan on prt2_p1 t2 + -> Seq Scan on prt2_p2 t2_1 -> Seq Scan on prt2_p3 t2_2 - -> Index Only Scan using iprt1_p3_a on prt1_p3 t3_2 - Index Cond: (a = t2_2.b) -> Hash -> Result One-Time Filter: false -(27 rows) +(18 rows) EXPLAIN (COSTS OFF) SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a = 1 AND a = 2) t1 FULL JOIN prt2 t2 ON t1.a = t2.b WHERE t2.a = 0 ORDER BY t1.a, t2.b; @@ -1485,8 +1268,8 @@ ANALYZE pht1_e; -- test partition matching with N-way join EXPLAIN (COSTS OFF) SELECT avg(t1.a), avg(t2.b), avg(t3.a + t3.b), t1.c, t2.c, t3.c FROM pht1 t1, pht2 t2, pht1_e t3 WHERE t1.c = t2.c AND ltrim(t3.c, 'A') = t1.c GROUP BY t1.c, t2.c, t3.c ORDER BY t1.c, t2.c, t3.c; - QUERY PLAN ------------------------------------------------------------------------------------------------------------------ + QUERY PLAN +----------------------------------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) -> Finalize GroupAggregate Group Key: c, c, c @@ -1495,43 +1278,28 @@ SELECT avg(t1.a), avg(t2.b), avg(t3.a + t3.b), t1.c, t2.c, t3.c FROM pht1 t1, ph -> Remote Subquery Scan on all (datanode_1,datanode_2) Distribute results by H: c -> Partial HashAggregate - Group Key: t1.c, t2.c, t3.c - -> Result - -> Append - -> Hash Join - Hash Cond: (t1.c = t2.c) + Group Key: c, c, t3.c + -> Hash Join + Hash Cond: (c = c) + -> Hash Join + Hash Cond: (ltrim(t3.c, 'A'::text) = c) + -> Append + -> Seq Scan on pht1_e_p1 t3 + -> Seq Scan on pht1_e_p2 t3_1 + -> Seq Scan on pht1_e_p3 t3_2 + -> Hash -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Seq Scan on pht1_p1 t1 - -> Hash - -> Hash Join - Hash Cond: (t2.c = ltrim(t3.c, 'A'::text)) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Seq Scan on pht2_p1 t2 - -> Hash - -> Seq Scan on pht1_e_p1 t3 - -> Hash Join - Hash Cond: (t1_1.c = t2_1.c) - -> Hash Join - Hash Cond: (t1_1.c = ltrim(t3_1.c, 'A'::text)) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Seq Scan on pht1_p2 t1_1 - -> Hash - -> Seq Scan on pht1_e_p2 t3_1 - -> Hash - -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on pht2_p1 t2 -> Seq Scan on pht2_p2 t2_1 - -> Hash Join - Hash Cond: 
(t1_2.c = t2_2.c) - -> Hash Join - Hash Cond: (t1_2.c = ltrim(t3_2.c, 'A'::text)) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Seq Scan on pht1_p3 t1_2 - -> Hash - -> Seq Scan on pht1_e_p3 t3_2 - -> Hash - -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Seq Scan on pht2_p3 t2_2 -(44 rows) + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on pht1_p1 t1 + -> Seq Scan on pht1_p2 t1_1 + -> Seq Scan on pht1_p3 t1_2 +(29 rows) SELECT avg(t1.a), avg(t2.b), avg(t3.a + t3.b), t1.c, t2.c, t3.c FROM pht1 t1, pht2 t2, pht1_e t3 WHERE t1.c = t2.c AND ltrim(t3.c, 'A') = t1.c GROUP BY t1.c, t2.c, t3.c ORDER BY t1.c, t2.c, t3.c; avg | avg | avg | c | c | c @@ -1576,42 +1344,31 @@ ANALYZE prt2_l; -- inner join, qual covering only top-level partitions EXPLAIN (COSTS OFF) SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_l t1, prt2_l t2 WHERE t1.a = t2.b AND t1.b = 0 ORDER BY t1.a, t2.b; - QUERY PLAN ------------------------------------------------------------------------------------ + QUERY PLAN +----------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) -> Sort - Sort Key: t1.a - -> Append - -> Hash Join - Hash Cond: (t2.b = t1.a) + Sort Key: a + -> Hash Join + Hash Cond: (t2.b = a) + -> Append -> Seq Scan on prt2_l_p1 t2 - -> Hash - -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on prt2_l_p2_p1 t2_1 + -> Seq Scan on prt2_l_p2_p2 t2_2 + -> Seq Scan on prt2_l_p3_p1 t2_3 + -> Seq Scan on prt2_l_p3_p2 t2_4 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append -> Seq Scan on prt1_l_p1 t1 Filter: (b = 0) - -> Hash Join - Hash Cond: (t2_1.b = a) - -> Append - -> Seq Scan on prt2_l_p2_p1 t2_1 - -> Seq Scan on prt2_l_p2_p2 t2_2 - -> Hash - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Append - -> Seq Scan on prt1_l_p2_p1 t1_1 - Filter: (b = 0) - -> Seq Scan on prt1_l_p2_p2 t1_2 - Filter: (b = 0) - -> Nested Loop - Join Filter: (a = t2_3.b) - -> Append - -> Seq Scan on prt2_l_p3_p1 t2_3 - -> Seq Scan on prt2_l_p3_p2 t2_4 - -> Materialize - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Append - -> Seq Scan on prt1_l_p3_p1 t1_3 - Filter: (b = 0) -(33 rows) + -> Seq Scan on prt1_l_p2_p1 t1_1 + Filter: (b = 0) + -> Seq Scan on prt1_l_p2_p2 t1_2 + Filter: (b = 0) + -> Seq Scan on prt1_l_p3_p1 t1_3 + Filter: (b = 0) +(22 rows) SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_l t1, prt2_l t2 WHERE t1.a = t2.b AND t1.b = 0 ORDER BY t1.a, t2.b; a | c | b | c @@ -1625,50 +1382,34 @@ SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_l t1, prt2_l t2 WHERE t1.a = t2.b AND t1 -- left join EXPLAIN (COSTS OFF) SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_l t1 LEFT JOIN prt2_l t2 ON t1.a = t2.b AND t1.c = t2.c WHERE t1.b = 0 ORDER BY t1.a, t2.b; - QUERY PLAN ------------------------------------------------------------------------------------------- + QUERY PLAN +----------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) -> Sort - Sort Key: t1.a, t2.b - -> Append - -> Hash Right Join - Hash Cond: ((t2.b = t1.a) AND ((t2.c)::text = (t1.c)::text)) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: b + Sort Key: a, b + -> Hash Right Join + Hash Cond: ((b = a) AND ((c)::text = (c)::text)) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Append -> Seq Scan on prt2_l_p1 t2 - -> Hash - -> Seq Scan on prt1_l_p1 t1 - 
Filter: (b = 0) - -> Hash Right Join - Hash Cond: ((t2_1.b = t1_1.a) AND ((t2_1.c)::text = (t1_1.c)::text)) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: b -> Seq Scan on prt2_l_p2_p1 t2_1 - -> Hash - -> Seq Scan on prt1_l_p2_p1 t1_1 - Filter: (b = 0) - -> Hash Right Join - Hash Cond: ((t2_2.b = t1_2.a) AND ((t2_2.c)::text = (t1_2.c)::text)) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: b -> Seq Scan on prt2_l_p2_p2 t2_2 - -> Hash - -> Seq Scan on prt1_l_p2_p2 t1_2 - Filter: (b = 0) - -> Nested Loop Left Join - Join Filter: ((a = b) AND ((c)::text = (c)::text)) + -> Seq Scan on prt2_l_p3_p1 t2_3 + -> Seq Scan on prt2_l_p3_p2 t2_4 + -> Hash -> Remote Subquery Scan on all (datanode_1,datanode_2) Distribute results by H: a -> Append + -> Seq Scan on prt1_l_p1 t1 + Filter: (b = 0) + -> Seq Scan on prt1_l_p2_p1 t1_1 + Filter: (b = 0) + -> Seq Scan on prt1_l_p2_p2 t1_2 + Filter: (b = 0) -> Seq Scan on prt1_l_p3_p1 t1_3 Filter: (b = 0) - -> Materialize - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: b - -> Append - -> Seq Scan on prt2_l_p3_p1 t2_3 - -> Seq Scan on prt2_l_p3_p2 t2_4 -(41 rows) +(25 rows) SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_l t1 LEFT JOIN prt2_l t2 ON t1.a = t2.b AND t1.c = t2.c WHERE t1.b = 0 ORDER BY t1.a, t2.b; a | c | b | c @@ -1690,50 +1431,34 @@ SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_l t1 LEFT JOIN prt2_l t2 ON t1.a = t2.b -- right join EXPLAIN (COSTS OFF) SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_l t1 RIGHT JOIN prt2_l t2 ON t1.a = t2.b AND t1.c = t2.c WHERE t2.a = 0 ORDER BY t1.a, t2.b; - QUERY PLAN --------------------------------------------------------------------------------------------------- + QUERY PLAN +----------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) -> Sort - Sort Key: t1.a, t2.b - -> Result - -> Append - -> Hash Right Join - Hash Cond: ((t1.a = t2.b) AND ((t1.c)::text = (t2.c)::text)) + Sort Key: a, b + -> Hash Right Join + Hash Cond: ((a = b) AND ((c)::text = (c)::text)) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Append -> Seq Scan on prt1_l_p1 t1 - -> Hash - -> Remote Subquery Scan on all (datanode_2) - Distribute results by H: b - -> Seq Scan on prt2_l_p1 t2 - Filter: (a = 0) - -> Nested Loop Left Join - Join Filter: ((t1_1.a = t2_1.b) AND ((t1_1.c)::text = (t2_1.c)::text)) - -> Remote Subquery Scan on all (datanode_2) - Distribute results by H: b - -> Seq Scan on prt2_l_p2_p1 t2_1 - Filter: (a = 0) -> Seq Scan on prt1_l_p2_p1 t1_1 - -> Hash Right Join - Hash Cond: ((t1_2.a = t2_2.b) AND ((t1_2.c)::text = (t2_2.c)::text)) -> Seq Scan on prt1_l_p2_p2 t1_2 - -> Hash - -> Remote Subquery Scan on all (datanode_2) - Distribute results by H: b - -> Seq Scan on prt2_l_p2_p2 t2_2 - Filter: (a = 0) - -> Nested Loop Left Join - Join Filter: ((a = b) AND ((c)::text = (c)::text)) - -> Remote Subquery Scan on all (datanode_2) - Distribute results by H: b - -> Append - -> Seq Scan on prt2_l_p3_p1 t2_3 - Filter: (a = 0) - -> Materialize - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: a - -> Append - -> Seq Scan on prt1_l_p3_p1 t1_3 - -> Seq Scan on prt1_l_p3_p2 t1_4 -(41 rows) + -> Seq Scan on prt1_l_p3_p1 t1_3 + -> Seq Scan on prt1_l_p3_p2 t1_4 + -> Hash + -> Remote Subquery Scan on all (datanode_2) + Distribute results by H: b + -> Append + -> Seq Scan on prt2_l_p1 t2 + Filter: (a = 0) + -> Seq Scan on 
prt2_l_p2_p1 t2_1 + Filter: (a = 0) + -> Seq Scan on prt2_l_p2_p2 t2_2 + Filter: (a = 0) + -> Seq Scan on prt2_l_p3_p1 t2_3 + Filter: (a = 0) +(25 rows) SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_l t1 RIGHT JOIN prt2_l t2 ON t1.a = t2.b AND t1.c = t2.c WHERE t2.a = 0 ORDER BY t1.a, t2.b; a | c | b | c @@ -1751,53 +1476,37 @@ SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_l t1 RIGHT JOIN prt2_l t2 ON t1.a = t2.b -- full join EXPLAIN (COSTS OFF) SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1_l WHERE prt1_l.b = 0) t1 FULL JOIN (SELECT * FROM prt2_l WHERE prt2_l.a = 0) t2 ON (t1.a = t2.b AND t1.c = t2.c) ORDER BY t1.a, t2.b; - QUERY PLAN --------------------------------------------------------------------------------------------------------------------------- + QUERY PLAN +----------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) -> Sort - Sort Key: prt1_l_p1.a, prt2_l_p1.b - -> Append - -> Hash Full Join - Hash Cond: ((prt1_l_p1.a = prt2_l_p1.b) AND ((prt1_l_p1.c)::text = (prt2_l_p1.c)::text)) - -> Seq Scan on prt1_l_p1 - Filter: (b = 0) - -> Hash - -> Remote Subquery Scan on all (datanode_2) - Distribute results by H: b + Sort Key: a, b + -> Hash Full Join + Hash Cond: ((a = b) AND ((c)::text = (c)::text)) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Append + -> Seq Scan on prt1_l_p1 + Filter: (b = 0) + -> Seq Scan on prt1_l_p2_p1 + Filter: (b = 0) + -> Seq Scan on prt1_l_p2_p2 + Filter: (b = 0) + -> Seq Scan on prt1_l_p3_p1 + Filter: (b = 0) + -> Hash + -> Remote Subquery Scan on all (datanode_2) + Distribute results by H: b + -> Append -> Seq Scan on prt2_l_p1 Filter: (a = 0) - -> Hash Full Join - Hash Cond: ((prt1_l_p2_p1.a = prt2_l_p2_p1.b) AND ((prt1_l_p2_p1.c)::text = (prt2_l_p2_p1.c)::text)) - -> Seq Scan on prt1_l_p2_p1 - Filter: (b = 0) - -> Hash - -> Remote Subquery Scan on all (datanode_2) - Distribute results by H: b -> Seq Scan on prt2_l_p2_p1 Filter: (a = 0) - -> Hash Full Join - Hash Cond: ((prt1_l_p2_p2.a = prt2_l_p2_p2.b) AND ((prt1_l_p2_p2.c)::text = (prt2_l_p2_p2.c)::text)) - -> Seq Scan on prt1_l_p2_p2 - Filter: (b = 0) - -> Hash - -> Remote Subquery Scan on all (datanode_2) - Distribute results by H: b -> Seq Scan on prt2_l_p2_p2 Filter: (a = 0) - -> Hash Full Join - Hash Cond: ((a = b) AND ((c)::text = (c)::text)) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: a - -> Append - -> Seq Scan on prt1_l_p3_p1 - Filter: (b = 0) - -> Hash - -> Remote Subquery Scan on all (datanode_2) - Distribute results by H: b - -> Append - -> Seq Scan on prt2_l_p3_p1 - Filter: (a = 0) -(44 rows) + -> Seq Scan on prt2_l_p3_p1 + Filter: (a = 0) +(28 rows) SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1_l WHERE prt1_l.b = 0) t1 FULL JOIN (SELECT * FROM prt2_l WHERE prt2_l.a = 0) t2 ON (t1.a = t2.b AND t1.c = t2.c) ORDER BY t1.a, t2.b; a | c | b | c @@ -1825,24 +1534,66 @@ EXPLAIN (COSTS OFF) SELECT * FROM prt1_l t1 LEFT JOIN LATERAL (SELECT t2.a AS t2a, t2.c AS t2c, t2.b AS t2b, t3.b AS t3b, least(t1.a,t2.a,t3.b) FROM prt1_l t2 JOIN prt2_l t3 ON (t2.a = t3.b AND t2.c = t3.c)) ss ON t1.a = ss.t2a AND t1.c = ss.t2c WHERE t1.b = 0 ORDER BY t1.a; -ERROR: could not devise a query plan for the given query + QUERY PLAN +------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: a + -> Nested Loop Left Join + -> Remote Subquery Scan on 
all (datanode_1,datanode_2) + Distribute results by H: a + -> Append + -> Seq Scan on prt1_l_p1 t1 + Filter: (b = 0) + -> Seq Scan on prt1_l_p2_p1 t1_1 + Filter: (b = 0) + -> Seq Scan on prt1_l_p2_p2 t1_2 + Filter: (b = 0) + -> Seq Scan on prt1_l_p3_p1 t1_3 + Filter: (b = 0) + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Hash Join + Hash Cond: ((b = t2.a) AND ((c)::text = (t2.c)::text)) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on prt2_l_p1 t3 + -> Seq Scan on prt2_l_p2_p1 t3_1 + -> Seq Scan on prt2_l_p2_p2 t3_2 + -> Seq Scan on prt2_l_p3_p1 t3_3 + -> Seq Scan on prt2_l_p3_p2 t3_4 + -> Hash + -> Append + -> Seq Scan on prt1_l_p1 t2 + Filter: ((a = a) AND ((c)::text = (c)::text)) + -> Seq Scan on prt1_l_p2_p1 t2_1 + Filter: ((a = a) AND ((c)::text = (c)::text)) + -> Seq Scan on prt1_l_p2_p2 t2_2 + Filter: ((a = a) AND ((c)::text = (c)::text)) + -> Seq Scan on prt1_l_p3_p1 t2_3 + Filter: ((a = a) AND ((c)::text = (c)::text)) + -> Seq Scan on prt1_l_p3_p2 t2_4 + Filter: ((a = a) AND ((c)::text = (c)::text)) +(39 rows) + SELECT * FROM prt1_l t1 LEFT JOIN LATERAL (SELECT t2.a AS t2a, t2.c AS t2c, t2.b AS t2b, t3.b AS t3b, least(t1.a,t2.a,t3.b) FROM prt1_l t2 JOIN prt2_l t3 ON (t2.a = t3.b AND t2.c = t3.c)) ss ON t1.a = ss.t2a AND t1.c = ss.t2c WHERE t1.b = 0 ORDER BY t1.a; - a | b | c | t2a | t2c | t2b | t3b | least + a | b | c | t2a | t2c | t2b | t3b | least -----+---+------+-----+------+-----+-----+------- 0 | 0 | 0000 | 0 | 0000 | 0 | 0 | 0 - 50 | 0 | 0002 | | | | | - 100 | 0 | 0000 | | | | | + 50 | 0 | 0002 | | | | | + 100 | 0 | 0000 | | | | | 150 | 0 | 0002 | 150 | 0002 | 0 | 150 | 150 - 200 | 0 | 0000 | | | | | - 250 | 0 | 0002 | | | | | + 200 | 0 | 0000 | | | | | + 250 | 0 | 0002 | | | | | 300 | 0 | 0000 | 300 | 0000 | 0 | 300 | 300 - 350 | 0 | 0002 | | | | | - 400 | 0 | 0000 | | | | | + 350 | 0 | 0002 | | | | | + 400 | 0 | 0000 | | | | | 450 | 0 | 0002 | 450 | 0002 | 0 | 450 | 450 - 500 | 0 | 0000 | | | | | - 550 | 0 | 0002 | | | | | + 500 | 0 | 0000 | | | | | + 550 | 0 | 0002 | | | | | (12 rows) -- join with one side empty @@ -1914,34 +1665,25 @@ SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1, prt4_n t2, prt2 t3 WHERE t1.a = t2.a ----------------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) -> Hash Join - Hash Cond: (t1.a = t2.a) + Hash Cond: (t2.a = t1.a) -> Append + -> Seq Scan on prt4_n_p1 t2 + -> Seq Scan on prt4_n_p2 t2_1 + -> Seq Scan on prt4_n_p3 t2_2 + -> Hash -> Hash Join - Hash Cond: (t1.a = t3.b) - -> Seq Scan on prt1_p1 t1 - -> Hash - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: b - -> Seq Scan on prt2_p1 t3 - -> Hash Join - Hash Cond: (t1_1.a = t3_1.b) - -> Seq Scan on prt1_p2 t1_1 + Hash Cond: (t1.a = b) + -> Append + -> Seq Scan on prt1_p1 t1 + -> Seq Scan on prt1_p2 t1_1 + -> Seq Scan on prt1_p3 t1_2 -> Hash -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: b - -> Seq Scan on prt2_p2 t3_1 - -> Nested Loop - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: b - -> Seq Scan on prt2_p3 t3_2 - -> Index Scan using iprt1_p3_a on prt1_p3 t1_2 - Index Cond: (a = t3_2.b) - -> Hash - -> Append - -> Seq Scan on prt4_n_p1 t2 - -> Seq Scan on prt4_n_p2 t2_1 - -> Seq Scan on prt4_n_p3 t2_2 -(29 rows) + -> Append + -> Seq Scan on prt2_p1 t3 + -> Seq Scan on prt2_p2 t3_1 + -> Seq Scan on prt2_p3 t3_2 +(20 rows) -- 
partition-wise join can not be applied if there are no equi-join conditions -- between partition keys diff --git a/src/test/regress/sql/partition_join.sql b/src/test/regress/sql/partition_join.sql index 4aa775e7..f84075e7 100644 --- a/src/test/regress/sql/partition_join.sql +++ b/src/test/regress/sql/partition_join.sql @@ -4,7 +4,7 @@ -- -- Enable partition-wise join, which by default is disabled. -SET enable_partition_wise_join to true; +--SET enable_partition_wise_join to true; -- -- partitioned by a single column From 622bf643e530d597d4897f26504e61f75120c848 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Fri, 17 Jul 2020 11:33:08 +0800 Subject: [PATCH 308/578] fix regress executor plan error --- src/test/regress/expected/join_6.out | 6227 ++++++++++++++ .../regress/expected/partition_join_2.out | 1819 ++++ src/test/regress/expected/subselect_2.out | 1164 +++ src/test/regress/expected/xc_groupby_3.out | 7513 +++++++++++++++++ 4 files changed, 16723 insertions(+) create mode 100644 src/test/regress/expected/join_6.out create mode 100644 src/test/regress/expected/partition_join_2.out create mode 100644 src/test/regress/expected/subselect_2.out create mode 100644 src/test/regress/expected/xc_groupby_3.out diff --git a/src/test/regress/expected/join_6.out b/src/test/regress/expected/join_6.out new file mode 100644 index 00000000..58736aa0 --- /dev/null +++ b/src/test/regress/expected/join_6.out @@ -0,0 +1,6227 @@ +-- +-- JOIN +-- Test JOIN clauses +-- +CREATE TABLE J1_TBL ( + i integer, + j integer, + t text +); +CREATE TABLE J2_TBL ( + i integer, + k integer +); +INSERT INTO J1_TBL VALUES (1, 4, 'one'); +INSERT INTO J1_TBL VALUES (2, 3, 'two'); +INSERT INTO J1_TBL VALUES (3, 2, 'three'); +INSERT INTO J1_TBL VALUES (4, 1, 'four'); +INSERT INTO J1_TBL VALUES (5, 0, 'five'); +INSERT INTO J1_TBL VALUES (6, 6, 'six'); +INSERT INTO J1_TBL VALUES (7, 7, 'seven'); +INSERT INTO J1_TBL VALUES (8, 8, 'eight'); +INSERT INTO J1_TBL VALUES (0, NULL, 'zero'); +INSERT INTO J1_TBL VALUES (NULL, NULL, 'null'); +INSERT INTO J1_TBL VALUES (NULL, 0, 'zero'); +INSERT INTO J2_TBL VALUES (1, -1); +INSERT INTO J2_TBL VALUES (2, 2); +INSERT INTO J2_TBL VALUES (3, -3); +INSERT INTO J2_TBL VALUES (2, 4); +INSERT INTO J2_TBL VALUES (5, -5); +INSERT INTO J2_TBL VALUES (5, -5); +INSERT INTO J2_TBL VALUES (0, NULL); +INSERT INTO J2_TBL VALUES (NULL, NULL); +INSERT INTO J2_TBL VALUES (NULL, 0); +-- +-- CORRELATION NAMES +-- Make sure that table/column aliases are supported +-- before diving into more complex join syntax. 
+-- +SELECT '' AS "xxx", * + FROM J1_TBL AS tx + ORDER BY i, j, t; + xxx | i | j | t +-----+---+---+------- + | 0 | | zero + | 1 | 4 | one + | 2 | 3 | two + | 3 | 2 | three + | 4 | 1 | four + | 5 | 0 | five + | 6 | 6 | six + | 7 | 7 | seven + | 8 | 8 | eight + | | 0 | zero + | | | null +(11 rows) + +SELECT '' AS "xxx", * + FROM J1_TBL tx + ORDER BY i, j, t; + xxx | i | j | t +-----+---+---+------- + | 0 | | zero + | 1 | 4 | one + | 2 | 3 | two + | 3 | 2 | three + | 4 | 1 | four + | 5 | 0 | five + | 6 | 6 | six + | 7 | 7 | seven + | 8 | 8 | eight + | | 0 | zero + | | | null +(11 rows) + +SELECT '' AS "xxx", * + FROM J1_TBL AS t1 (a, b, c) + ORDER BY a, b, c; + xxx | a | b | c +-----+---+---+------- + | 0 | | zero + | 1 | 4 | one + | 2 | 3 | two + | 3 | 2 | three + | 4 | 1 | four + | 5 | 0 | five + | 6 | 6 | six + | 7 | 7 | seven + | 8 | 8 | eight + | | 0 | zero + | | | null +(11 rows) + +SELECT '' AS "xxx", * + FROM J1_TBL t1 (a, b, c) + ORDER BY a, b, c; + xxx | a | b | c +-----+---+---+------- + | 0 | | zero + | 1 | 4 | one + | 2 | 3 | two + | 3 | 2 | three + | 4 | 1 | four + | 5 | 0 | five + | 6 | 6 | six + | 7 | 7 | seven + | 8 | 8 | eight + | | 0 | zero + | | | null +(11 rows) + +SELECT '' AS "xxx", * + FROM J1_TBL t1 (a, b, c), J2_TBL t2 (d, e) + ORDER BY a, b, c, d, e; + xxx | a | b | c | d | e +-----+---+---+-------+---+---- + | 0 | | zero | 0 | + | 0 | | zero | 1 | -1 + | 0 | | zero | 2 | 2 + | 0 | | zero | 2 | 4 + | 0 | | zero | 3 | -3 + | 0 | | zero | 5 | -5 + | 0 | | zero | 5 | -5 + | 0 | | zero | | 0 + | 0 | | zero | | + | 1 | 4 | one | 0 | + | 1 | 4 | one | 1 | -1 + | 1 | 4 | one | 2 | 2 + | 1 | 4 | one | 2 | 4 + | 1 | 4 | one | 3 | -3 + | 1 | 4 | one | 5 | -5 + | 1 | 4 | one | 5 | -5 + | 1 | 4 | one | | 0 + | 1 | 4 | one | | + | 2 | 3 | two | 0 | + | 2 | 3 | two | 1 | -1 + | 2 | 3 | two | 2 | 2 + | 2 | 3 | two | 2 | 4 + | 2 | 3 | two | 3 | -3 + | 2 | 3 | two | 5 | -5 + | 2 | 3 | two | 5 | -5 + | 2 | 3 | two | | 0 + | 2 | 3 | two | | + | 3 | 2 | three | 0 | + | 3 | 2 | three | 1 | -1 + | 3 | 2 | three | 2 | 2 + | 3 | 2 | three | 2 | 4 + | 3 | 2 | three | 3 | -3 + | 3 | 2 | three | 5 | -5 + | 3 | 2 | three | 5 | -5 + | 3 | 2 | three | | 0 + | 3 | 2 | three | | + | 4 | 1 | four | 0 | + | 4 | 1 | four | 1 | -1 + | 4 | 1 | four | 2 | 2 + | 4 | 1 | four | 2 | 4 + | 4 | 1 | four | 3 | -3 + | 4 | 1 | four | 5 | -5 + | 4 | 1 | four | 5 | -5 + | 4 | 1 | four | | 0 + | 4 | 1 | four | | + | 5 | 0 | five | 0 | + | 5 | 0 | five | 1 | -1 + | 5 | 0 | five | 2 | 2 + | 5 | 0 | five | 2 | 4 + | 5 | 0 | five | 3 | -3 + | 5 | 0 | five | 5 | -5 + | 5 | 0 | five | 5 | -5 + | 5 | 0 | five | | 0 + | 5 | 0 | five | | + | 6 | 6 | six | 0 | + | 6 | 6 | six | 1 | -1 + | 6 | 6 | six | 2 | 2 + | 6 | 6 | six | 2 | 4 + | 6 | 6 | six | 3 | -3 + | 6 | 6 | six | 5 | -5 + | 6 | 6 | six | 5 | -5 + | 6 | 6 | six | | 0 + | 6 | 6 | six | | + | 7 | 7 | seven | 0 | + | 7 | 7 | seven | 1 | -1 + | 7 | 7 | seven | 2 | 2 + | 7 | 7 | seven | 2 | 4 + | 7 | 7 | seven | 3 | -3 + | 7 | 7 | seven | 5 | -5 + | 7 | 7 | seven | 5 | -5 + | 7 | 7 | seven | | 0 + | 7 | 7 | seven | | + | 8 | 8 | eight | 0 | + | 8 | 8 | eight | 1 | -1 + | 8 | 8 | eight | 2 | 2 + | 8 | 8 | eight | 2 | 4 + | 8 | 8 | eight | 3 | -3 + | 8 | 8 | eight | 5 | -5 + | 8 | 8 | eight | 5 | -5 + | 8 | 8 | eight | | 0 + | 8 | 8 | eight | | + | | 0 | zero | 0 | + | | 0 | zero | 1 | -1 + | | 0 | zero | 2 | 2 + | | 0 | zero | 2 | 4 + | | 0 | zero | 3 | -3 + | | 0 | zero | 5 | -5 + | | 0 | zero | 5 | -5 + | | 0 | zero | | 0 + | | 0 | zero | | + | | | null | 0 | + | | | 
null | 1 | -1 + | | | null | 2 | 2 + | | | null | 2 | 4 + | | | null | 3 | -3 + | | | null | 5 | -5 + | | | null | 5 | -5 + | | | null | | 0 + | | | null | | +(99 rows) + +SELECT '' AS "xxx", t1.a, t2.e + FROM J1_TBL t1 (a, b, c), J2_TBL t2 (d, e) + WHERE t1.a = t2.d + ORDER BY a, e; + xxx | a | e +-----+---+---- + | 0 | + | 1 | -1 + | 2 | 2 + | 2 | 4 + | 3 | -3 + | 5 | -5 + | 5 | -5 +(7 rows) + +-- +-- CROSS JOIN +-- Qualifications are not allowed on cross joins, +-- which degenerate into a standard unqualified inner join. +-- +SELECT '' AS "xxx", * + FROM J1_TBL CROSS JOIN J2_TBL + ORDER BY J1_TBL.i, J1_TBL.j, J1_TBL.t, J2_TBL.i, J2_TBL.k; + xxx | i | j | t | i | k +-----+---+---+-------+---+---- + | 0 | | zero | 0 | + | 0 | | zero | 1 | -1 + | 0 | | zero | 2 | 2 + | 0 | | zero | 2 | 4 + | 0 | | zero | 3 | -3 + | 0 | | zero | 5 | -5 + | 0 | | zero | 5 | -5 + | 0 | | zero | | 0 + | 0 | | zero | | + | 1 | 4 | one | 0 | + | 1 | 4 | one | 1 | -1 + | 1 | 4 | one | 2 | 2 + | 1 | 4 | one | 2 | 4 + | 1 | 4 | one | 3 | -3 + | 1 | 4 | one | 5 | -5 + | 1 | 4 | one | 5 | -5 + | 1 | 4 | one | | 0 + | 1 | 4 | one | | + | 2 | 3 | two | 0 | + | 2 | 3 | two | 1 | -1 + | 2 | 3 | two | 2 | 2 + | 2 | 3 | two | 2 | 4 + | 2 | 3 | two | 3 | -3 + | 2 | 3 | two | 5 | -5 + | 2 | 3 | two | 5 | -5 + | 2 | 3 | two | | 0 + | 2 | 3 | two | | + | 3 | 2 | three | 0 | + | 3 | 2 | three | 1 | -1 + | 3 | 2 | three | 2 | 2 + | 3 | 2 | three | 2 | 4 + | 3 | 2 | three | 3 | -3 + | 3 | 2 | three | 5 | -5 + | 3 | 2 | three | 5 | -5 + | 3 | 2 | three | | 0 + | 3 | 2 | three | | + | 4 | 1 | four | 0 | + | 4 | 1 | four | 1 | -1 + | 4 | 1 | four | 2 | 2 + | 4 | 1 | four | 2 | 4 + | 4 | 1 | four | 3 | -3 + | 4 | 1 | four | 5 | -5 + | 4 | 1 | four | 5 | -5 + | 4 | 1 | four | | 0 + | 4 | 1 | four | | + | 5 | 0 | five | 0 | + | 5 | 0 | five | 1 | -1 + | 5 | 0 | five | 2 | 2 + | 5 | 0 | five | 2 | 4 + | 5 | 0 | five | 3 | -3 + | 5 | 0 | five | 5 | -5 + | 5 | 0 | five | 5 | -5 + | 5 | 0 | five | | 0 + | 5 | 0 | five | | + | 6 | 6 | six | 0 | + | 6 | 6 | six | 1 | -1 + | 6 | 6 | six | 2 | 2 + | 6 | 6 | six | 2 | 4 + | 6 | 6 | six | 3 | -3 + | 6 | 6 | six | 5 | -5 + | 6 | 6 | six | 5 | -5 + | 6 | 6 | six | | 0 + | 6 | 6 | six | | + | 7 | 7 | seven | 0 | + | 7 | 7 | seven | 1 | -1 + | 7 | 7 | seven | 2 | 2 + | 7 | 7 | seven | 2 | 4 + | 7 | 7 | seven | 3 | -3 + | 7 | 7 | seven | 5 | -5 + | 7 | 7 | seven | 5 | -5 + | 7 | 7 | seven | | 0 + | 7 | 7 | seven | | + | 8 | 8 | eight | 0 | + | 8 | 8 | eight | 1 | -1 + | 8 | 8 | eight | 2 | 2 + | 8 | 8 | eight | 2 | 4 + | 8 | 8 | eight | 3 | -3 + | 8 | 8 | eight | 5 | -5 + | 8 | 8 | eight | 5 | -5 + | 8 | 8 | eight | | 0 + | 8 | 8 | eight | | + | | 0 | zero | 0 | + | | 0 | zero | 1 | -1 + | | 0 | zero | 2 | 2 + | | 0 | zero | 2 | 4 + | | 0 | zero | 3 | -3 + | | 0 | zero | 5 | -5 + | | 0 | zero | 5 | -5 + | | 0 | zero | | 0 + | | 0 | zero | | + | | | null | 0 | + | | | null | 1 | -1 + | | | null | 2 | 2 + | | | null | 2 | 4 + | | | null | 3 | -3 + | | | null | 5 | -5 + | | | null | 5 | -5 + | | | null | | 0 + | | | null | | +(99 rows) + +-- ambiguous column +SELECT '' AS "xxx", i, k, t + FROM J1_TBL CROSS JOIN J2_TBL; +ERROR: column reference "i" is ambiguous +LINE 1: SELECT '' AS "xxx", i, k, t + ^ +-- resolve previous ambiguity by specifying the table name +SELECT '' AS "xxx", t1.i, k, t + FROM J1_TBL t1 CROSS JOIN J2_TBL t2 + ORDER BY i, k, t; + xxx | i | k | t +-----+---+----+------- + | 0 | -5 | zero + | 0 | -5 | zero + | 0 | -3 | zero + | 0 | -1 | zero + | 0 | 0 | zero + | 0 | 2 | zero + | 0 | 4 
| zero + | 0 | | zero + | 0 | | zero + | 1 | -5 | one + | 1 | -5 | one + | 1 | -3 | one + | 1 | -1 | one + | 1 | 0 | one + | 1 | 2 | one + | 1 | 4 | one + | 1 | | one + | 1 | | one + | 2 | -5 | two + | 2 | -5 | two + | 2 | -3 | two + | 2 | -1 | two + | 2 | 0 | two + | 2 | 2 | two + | 2 | 4 | two + | 2 | | two + | 2 | | two + | 3 | -5 | three + | 3 | -5 | three + | 3 | -3 | three + | 3 | -1 | three + | 3 | 0 | three + | 3 | 2 | three + | 3 | 4 | three + | 3 | | three + | 3 | | three + | 4 | -5 | four + | 4 | -5 | four + | 4 | -3 | four + | 4 | -1 | four + | 4 | 0 | four + | 4 | 2 | four + | 4 | 4 | four + | 4 | | four + | 4 | | four + | 5 | -5 | five + | 5 | -5 | five + | 5 | -3 | five + | 5 | -1 | five + | 5 | 0 | five + | 5 | 2 | five + | 5 | 4 | five + | 5 | | five + | 5 | | five + | 6 | -5 | six + | 6 | -5 | six + | 6 | -3 | six + | 6 | -1 | six + | 6 | 0 | six + | 6 | 2 | six + | 6 | 4 | six + | 6 | | six + | 6 | | six + | 7 | -5 | seven + | 7 | -5 | seven + | 7 | -3 | seven + | 7 | -1 | seven + | 7 | 0 | seven + | 7 | 2 | seven + | 7 | 4 | seven + | 7 | | seven + | 7 | | seven + | 8 | -5 | eight + | 8 | -5 | eight + | 8 | -3 | eight + | 8 | -1 | eight + | 8 | 0 | eight + | 8 | 2 | eight + | 8 | 4 | eight + | 8 | | eight + | 8 | | eight + | | -5 | null + | | -5 | null + | | -5 | zero + | | -5 | zero + | | -3 | null + | | -3 | zero + | | -1 | null + | | -1 | zero + | | 0 | null + | | 0 | zero + | | 2 | null + | | 2 | zero + | | 4 | null + | | 4 | zero + | | | null + | | | null + | | | zero + | | | zero +(99 rows) + +SELECT '' AS "xxx", ii, tt, kk + FROM (J1_TBL CROSS JOIN J2_TBL) + AS tx (ii, jj, tt, ii2, kk) + ORDER BY ii, tt, kk; + xxx | ii | tt | kk +-----+----+-------+---- + | 0 | zero | -5 + | 0 | zero | -5 + | 0 | zero | -3 + | 0 | zero | -1 + | 0 | zero | 0 + | 0 | zero | 2 + | 0 | zero | 4 + | 0 | zero | + | 0 | zero | + | 1 | one | -5 + | 1 | one | -5 + | 1 | one | -3 + | 1 | one | -1 + | 1 | one | 0 + | 1 | one | 2 + | 1 | one | 4 + | 1 | one | + | 1 | one | + | 2 | two | -5 + | 2 | two | -5 + | 2 | two | -3 + | 2 | two | -1 + | 2 | two | 0 + | 2 | two | 2 + | 2 | two | 4 + | 2 | two | + | 2 | two | + | 3 | three | -5 + | 3 | three | -5 + | 3 | three | -3 + | 3 | three | -1 + | 3 | three | 0 + | 3 | three | 2 + | 3 | three | 4 + | 3 | three | + | 3 | three | + | 4 | four | -5 + | 4 | four | -5 + | 4 | four | -3 + | 4 | four | -1 + | 4 | four | 0 + | 4 | four | 2 + | 4 | four | 4 + | 4 | four | + | 4 | four | + | 5 | five | -5 + | 5 | five | -5 + | 5 | five | -3 + | 5 | five | -1 + | 5 | five | 0 + | 5 | five | 2 + | 5 | five | 4 + | 5 | five | + | 5 | five | + | 6 | six | -5 + | 6 | six | -5 + | 6 | six | -3 + | 6 | six | -1 + | 6 | six | 0 + | 6 | six | 2 + | 6 | six | 4 + | 6 | six | + | 6 | six | + | 7 | seven | -5 + | 7 | seven | -5 + | 7 | seven | -3 + | 7 | seven | -1 + | 7 | seven | 0 + | 7 | seven | 2 + | 7 | seven | 4 + | 7 | seven | + | 7 | seven | + | 8 | eight | -5 + | 8 | eight | -5 + | 8 | eight | -3 + | 8 | eight | -1 + | 8 | eight | 0 + | 8 | eight | 2 + | 8 | eight | 4 + | 8 | eight | + | 8 | eight | + | | null | -5 + | | null | -5 + | | null | -3 + | | null | -1 + | | null | 0 + | | null | 2 + | | null | 4 + | | null | + | | null | + | | zero | -5 + | | zero | -5 + | | zero | -3 + | | zero | -1 + | | zero | 0 + | | zero | 2 + | | zero | 4 + | | zero | + | | zero | +(99 rows) + +SELECT '' AS "xxx", tx.ii, tx.jj, tx.kk + FROM (J1_TBL t1 (a, b, c) CROSS JOIN J2_TBL t2 (d, e)) + AS tx (ii, jj, tt, ii2, kk) + ORDER BY ii, jj, kk; + xxx | ii | jj | kk 
+-----+----+----+---- + | 0 | | -5 + | 0 | | -5 + | 0 | | -3 + | 0 | | -1 + | 0 | | 0 + | 0 | | 2 + | 0 | | 4 + | 0 | | + | 0 | | + | 1 | 4 | -5 + | 1 | 4 | -5 + | 1 | 4 | -3 + | 1 | 4 | -1 + | 1 | 4 | 0 + | 1 | 4 | 2 + | 1 | 4 | 4 + | 1 | 4 | + | 1 | 4 | + | 2 | 3 | -5 + | 2 | 3 | -5 + | 2 | 3 | -3 + | 2 | 3 | -1 + | 2 | 3 | 0 + | 2 | 3 | 2 + | 2 | 3 | 4 + | 2 | 3 | + | 2 | 3 | + | 3 | 2 | -5 + | 3 | 2 | -5 + | 3 | 2 | -3 + | 3 | 2 | -1 + | 3 | 2 | 0 + | 3 | 2 | 2 + | 3 | 2 | 4 + | 3 | 2 | + | 3 | 2 | + | 4 | 1 | -5 + | 4 | 1 | -5 + | 4 | 1 | -3 + | 4 | 1 | -1 + | 4 | 1 | 0 + | 4 | 1 | 2 + | 4 | 1 | 4 + | 4 | 1 | + | 4 | 1 | + | 5 | 0 | -5 + | 5 | 0 | -5 + | 5 | 0 | -3 + | 5 | 0 | -1 + | 5 | 0 | 0 + | 5 | 0 | 2 + | 5 | 0 | 4 + | 5 | 0 | + | 5 | 0 | + | 6 | 6 | -5 + | 6 | 6 | -5 + | 6 | 6 | -3 + | 6 | 6 | -1 + | 6 | 6 | 0 + | 6 | 6 | 2 + | 6 | 6 | 4 + | 6 | 6 | + | 6 | 6 | + | 7 | 7 | -5 + | 7 | 7 | -5 + | 7 | 7 | -3 + | 7 | 7 | -1 + | 7 | 7 | 0 + | 7 | 7 | 2 + | 7 | 7 | 4 + | 7 | 7 | + | 7 | 7 | + | 8 | 8 | -5 + | 8 | 8 | -5 + | 8 | 8 | -3 + | 8 | 8 | -1 + | 8 | 8 | 0 + | 8 | 8 | 2 + | 8 | 8 | 4 + | 8 | 8 | + | 8 | 8 | + | | 0 | -5 + | | 0 | -5 + | | 0 | -3 + | | 0 | -1 + | | 0 | 0 + | | 0 | 2 + | | 0 | 4 + | | 0 | + | | 0 | + | | | -5 + | | | -5 + | | | -3 + | | | -1 + | | | 0 + | | | 2 + | | | 4 + | | | + | | | +(99 rows) + +SELECT '' AS "xxx", * + FROM J1_TBL CROSS JOIN J2_TBL a CROSS JOIN J2_TBL b + ORDER BY J1_TBL.i,J1_TBL.j,J1_TBL.t,a.i,a.k,b.i,b.k; + xxx | i | j | t | i | k | i | k +-----+---+---+-------+---+----+---+---- + | 0 | | zero | 0 | | 0 | + | 0 | | zero | 0 | | 1 | -1 + | 0 | | zero | 0 | | 2 | 2 + | 0 | | zero | 0 | | 2 | 4 + | 0 | | zero | 0 | | 3 | -3 + | 0 | | zero | 0 | | 5 | -5 + | 0 | | zero | 0 | | 5 | -5 + | 0 | | zero | 0 | | | 0 + | 0 | | zero | 0 | | | + | 0 | | zero | 1 | -1 | 0 | + | 0 | | zero | 1 | -1 | 1 | -1 + | 0 | | zero | 1 | -1 | 2 | 2 + | 0 | | zero | 1 | -1 | 2 | 4 + | 0 | | zero | 1 | -1 | 3 | -3 + | 0 | | zero | 1 | -1 | 5 | -5 + | 0 | | zero | 1 | -1 | 5 | -5 + | 0 | | zero | 1 | -1 | | 0 + | 0 | | zero | 1 | -1 | | + | 0 | | zero | 2 | 2 | 0 | + | 0 | | zero | 2 | 2 | 1 | -1 + | 0 | | zero | 2 | 2 | 2 | 2 + | 0 | | zero | 2 | 2 | 2 | 4 + | 0 | | zero | 2 | 2 | 3 | -3 + | 0 | | zero | 2 | 2 | 5 | -5 + | 0 | | zero | 2 | 2 | 5 | -5 + | 0 | | zero | 2 | 2 | | 0 + | 0 | | zero | 2 | 2 | | + | 0 | | zero | 2 | 4 | 0 | + | 0 | | zero | 2 | 4 | 1 | -1 + | 0 | | zero | 2 | 4 | 2 | 2 + | 0 | | zero | 2 | 4 | 2 | 4 + | 0 | | zero | 2 | 4 | 3 | -3 + | 0 | | zero | 2 | 4 | 5 | -5 + | 0 | | zero | 2 | 4 | 5 | -5 + | 0 | | zero | 2 | 4 | | 0 + | 0 | | zero | 2 | 4 | | + | 0 | | zero | 3 | -3 | 0 | + | 0 | | zero | 3 | -3 | 1 | -1 + | 0 | | zero | 3 | -3 | 2 | 2 + | 0 | | zero | 3 | -3 | 2 | 4 + | 0 | | zero | 3 | -3 | 3 | -3 + | 0 | | zero | 3 | -3 | 5 | -5 + | 0 | | zero | 3 | -3 | 5 | -5 + | 0 | | zero | 3 | -3 | | 0 + | 0 | | zero | 3 | -3 | | + | 0 | | zero | 5 | -5 | 0 | + | 0 | | zero | 5 | -5 | 0 | + | 0 | | zero | 5 | -5 | 1 | -1 + | 0 | | zero | 5 | -5 | 1 | -1 + | 0 | | zero | 5 | -5 | 2 | 2 + | 0 | | zero | 5 | -5 | 2 | 2 + | 0 | | zero | 5 | -5 | 2 | 4 + | 0 | | zero | 5 | -5 | 2 | 4 + | 0 | | zero | 5 | -5 | 3 | -3 + | 0 | | zero | 5 | -5 | 3 | -3 + | 0 | | zero | 5 | -5 | 5 | -5 + | 0 | | zero | 5 | -5 | 5 | -5 + | 0 | | zero | 5 | -5 | 5 | -5 + | 0 | | zero | 5 | -5 | 5 | -5 + | 0 | | zero | 5 | -5 | | 0 + | 0 | | zero | 5 | -5 | | 0 + | 0 | | zero | 5 | -5 | | + | 0 | | zero | 5 | -5 | | + | 0 | | zero | | 0 | 0 | + | 0 | | zero | | 0 | 1 | 
-1 + | 0 | | zero | | 0 | 2 | 2 + | 0 | | zero | | 0 | 2 | 4 + | 0 | | zero | | 0 | 3 | -3 + | 0 | | zero | | 0 | 5 | -5 + | 0 | | zero | | 0 | 5 | -5 + | 0 | | zero | | 0 | | 0 + | 0 | | zero | | 0 | | + | 0 | | zero | | | 0 | + | 0 | | zero | | | 1 | -1 + | 0 | | zero | | | 2 | 2 + | 0 | | zero | | | 2 | 4 + | 0 | | zero | | | 3 | -3 + | 0 | | zero | | | 5 | -5 + | 0 | | zero | | | 5 | -5 + | 0 | | zero | | | | 0 + | 0 | | zero | | | | + | 1 | 4 | one | 0 | | 0 | + | 1 | 4 | one | 0 | | 1 | -1 + | 1 | 4 | one | 0 | | 2 | 2 + | 1 | 4 | one | 0 | | 2 | 4 + | 1 | 4 | one | 0 | | 3 | -3 + | 1 | 4 | one | 0 | | 5 | -5 + | 1 | 4 | one | 0 | | 5 | -5 + | 1 | 4 | one | 0 | | | 0 + | 1 | 4 | one | 0 | | | + | 1 | 4 | one | 1 | -1 | 0 | + | 1 | 4 | one | 1 | -1 | 1 | -1 + | 1 | 4 | one | 1 | -1 | 2 | 2 + | 1 | 4 | one | 1 | -1 | 2 | 4 + | 1 | 4 | one | 1 | -1 | 3 | -3 + | 1 | 4 | one | 1 | -1 | 5 | -5 + | 1 | 4 | one | 1 | -1 | 5 | -5 + | 1 | 4 | one | 1 | -1 | | 0 + | 1 | 4 | one | 1 | -1 | | + | 1 | 4 | one | 2 | 2 | 0 | + | 1 | 4 | one | 2 | 2 | 1 | -1 + | 1 | 4 | one | 2 | 2 | 2 | 2 + | 1 | 4 | one | 2 | 2 | 2 | 4 + | 1 | 4 | one | 2 | 2 | 3 | -3 + | 1 | 4 | one | 2 | 2 | 5 | -5 + | 1 | 4 | one | 2 | 2 | 5 | -5 + | 1 | 4 | one | 2 | 2 | | 0 + | 1 | 4 | one | 2 | 2 | | + | 1 | 4 | one | 2 | 4 | 0 | + | 1 | 4 | one | 2 | 4 | 1 | -1 + | 1 | 4 | one | 2 | 4 | 2 | 2 + | 1 | 4 | one | 2 | 4 | 2 | 4 + | 1 | 4 | one | 2 | 4 | 3 | -3 + | 1 | 4 | one | 2 | 4 | 5 | -5 + | 1 | 4 | one | 2 | 4 | 5 | -5 + | 1 | 4 | one | 2 | 4 | | 0 + | 1 | 4 | one | 2 | 4 | | + | 1 | 4 | one | 3 | -3 | 0 | + | 1 | 4 | one | 3 | -3 | 1 | -1 + | 1 | 4 | one | 3 | -3 | 2 | 2 + | 1 | 4 | one | 3 | -3 | 2 | 4 + | 1 | 4 | one | 3 | -3 | 3 | -3 + | 1 | 4 | one | 3 | -3 | 5 | -5 + | 1 | 4 | one | 3 | -3 | 5 | -5 + | 1 | 4 | one | 3 | -3 | | 0 + | 1 | 4 | one | 3 | -3 | | + | 1 | 4 | one | 5 | -5 | 0 | + | 1 | 4 | one | 5 | -5 | 0 | + | 1 | 4 | one | 5 | -5 | 1 | -1 + | 1 | 4 | one | 5 | -5 | 1 | -1 + | 1 | 4 | one | 5 | -5 | 2 | 2 + | 1 | 4 | one | 5 | -5 | 2 | 2 + | 1 | 4 | one | 5 | -5 | 2 | 4 + | 1 | 4 | one | 5 | -5 | 2 | 4 + | 1 | 4 | one | 5 | -5 | 3 | -3 + | 1 | 4 | one | 5 | -5 | 3 | -3 + | 1 | 4 | one | 5 | -5 | 5 | -5 + | 1 | 4 | one | 5 | -5 | 5 | -5 + | 1 | 4 | one | 5 | -5 | 5 | -5 + | 1 | 4 | one | 5 | -5 | 5 | -5 + | 1 | 4 | one | 5 | -5 | | 0 + | 1 | 4 | one | 5 | -5 | | 0 + | 1 | 4 | one | 5 | -5 | | + | 1 | 4 | one | 5 | -5 | | + | 1 | 4 | one | | 0 | 0 | + | 1 | 4 | one | | 0 | 1 | -1 + | 1 | 4 | one | | 0 | 2 | 2 + | 1 | 4 | one | | 0 | 2 | 4 + | 1 | 4 | one | | 0 | 3 | -3 + | 1 | 4 | one | | 0 | 5 | -5 + | 1 | 4 | one | | 0 | 5 | -5 + | 1 | 4 | one | | 0 | | 0 + | 1 | 4 | one | | 0 | | + | 1 | 4 | one | | | 0 | + | 1 | 4 | one | | | 1 | -1 + | 1 | 4 | one | | | 2 | 2 + | 1 | 4 | one | | | 2 | 4 + | 1 | 4 | one | | | 3 | -3 + | 1 | 4 | one | | | 5 | -5 + | 1 | 4 | one | | | 5 | -5 + | 1 | 4 | one | | | | 0 + | 1 | 4 | one | | | | + | 2 | 3 | two | 0 | | 0 | + | 2 | 3 | two | 0 | | 1 | -1 + | 2 | 3 | two | 0 | | 2 | 2 + | 2 | 3 | two | 0 | | 2 | 4 + | 2 | 3 | two | 0 | | 3 | -3 + | 2 | 3 | two | 0 | | 5 | -5 + | 2 | 3 | two | 0 | | 5 | -5 + | 2 | 3 | two | 0 | | | 0 + | 2 | 3 | two | 0 | | | + | 2 | 3 | two | 1 | -1 | 0 | + | 2 | 3 | two | 1 | -1 | 1 | -1 + | 2 | 3 | two | 1 | -1 | 2 | 2 + | 2 | 3 | two | 1 | -1 | 2 | 4 + | 2 | 3 | two | 1 | -1 | 3 | -3 + | 2 | 3 | two | 1 | -1 | 5 | -5 + | 2 | 3 | two | 1 | -1 | 5 | -5 + | 2 | 3 | two | 1 | -1 | | 0 + | 2 | 3 | two | 1 | -1 | | + | 2 | 3 | two | 2 | 2 | 0 | + | 
2 | 3 | two | 2 | 2 | 1 | -1 + | 2 | 3 | two | 2 | 2 | 2 | 2 + | 2 | 3 | two | 2 | 2 | 2 | 4 + | 2 | 3 | two | 2 | 2 | 3 | -3 + | 2 | 3 | two | 2 | 2 | 5 | -5 + | 2 | 3 | two | 2 | 2 | 5 | -5 + | 2 | 3 | two | 2 | 2 | | 0 + | 2 | 3 | two | 2 | 2 | | + | 2 | 3 | two | 2 | 4 | 0 | + | 2 | 3 | two | 2 | 4 | 1 | -1 + | 2 | 3 | two | 2 | 4 | 2 | 2 + | 2 | 3 | two | 2 | 4 | 2 | 4 + | 2 | 3 | two | 2 | 4 | 3 | -3 + | 2 | 3 | two | 2 | 4 | 5 | -5 + | 2 | 3 | two | 2 | 4 | 5 | -5 + | 2 | 3 | two | 2 | 4 | | 0 + | 2 | 3 | two | 2 | 4 | | + | 2 | 3 | two | 3 | -3 | 0 | + | 2 | 3 | two | 3 | -3 | 1 | -1 + | 2 | 3 | two | 3 | -3 | 2 | 2 + | 2 | 3 | two | 3 | -3 | 2 | 4 + | 2 | 3 | two | 3 | -3 | 3 | -3 + | 2 | 3 | two | 3 | -3 | 5 | -5 + | 2 | 3 | two | 3 | -3 | 5 | -5 + | 2 | 3 | two | 3 | -3 | | 0 + | 2 | 3 | two | 3 | -3 | | + | 2 | 3 | two | 5 | -5 | 0 | + | 2 | 3 | two | 5 | -5 | 0 | + | 2 | 3 | two | 5 | -5 | 1 | -1 + | 2 | 3 | two | 5 | -5 | 1 | -1 + | 2 | 3 | two | 5 | -5 | 2 | 2 + | 2 | 3 | two | 5 | -5 | 2 | 2 + | 2 | 3 | two | 5 | -5 | 2 | 4 + | 2 | 3 | two | 5 | -5 | 2 | 4 + | 2 | 3 | two | 5 | -5 | 3 | -3 + | 2 | 3 | two | 5 | -5 | 3 | -3 + | 2 | 3 | two | 5 | -5 | 5 | -5 + | 2 | 3 | two | 5 | -5 | 5 | -5 + | 2 | 3 | two | 5 | -5 | 5 | -5 + | 2 | 3 | two | 5 | -5 | 5 | -5 + | 2 | 3 | two | 5 | -5 | | 0 + | 2 | 3 | two | 5 | -5 | | 0 + | 2 | 3 | two | 5 | -5 | | + | 2 | 3 | two | 5 | -5 | | + | 2 | 3 | two | | 0 | 0 | + | 2 | 3 | two | | 0 | 1 | -1 + | 2 | 3 | two | | 0 | 2 | 2 + | 2 | 3 | two | | 0 | 2 | 4 + | 2 | 3 | two | | 0 | 3 | -3 + | 2 | 3 | two | | 0 | 5 | -5 + | 2 | 3 | two | | 0 | 5 | -5 + | 2 | 3 | two | | 0 | | 0 + | 2 | 3 | two | | 0 | | + | 2 | 3 | two | | | 0 | + | 2 | 3 | two | | | 1 | -1 + | 2 | 3 | two | | | 2 | 2 + | 2 | 3 | two | | | 2 | 4 + | 2 | 3 | two | | | 3 | -3 + | 2 | 3 | two | | | 5 | -5 + | 2 | 3 | two | | | 5 | -5 + | 2 | 3 | two | | | | 0 + | 2 | 3 | two | | | | + | 3 | 2 | three | 0 | | 0 | + | 3 | 2 | three | 0 | | 1 | -1 + | 3 | 2 | three | 0 | | 2 | 2 + | 3 | 2 | three | 0 | | 2 | 4 + | 3 | 2 | three | 0 | | 3 | -3 + | 3 | 2 | three | 0 | | 5 | -5 + | 3 | 2 | three | 0 | | 5 | -5 + | 3 | 2 | three | 0 | | | 0 + | 3 | 2 | three | 0 | | | + | 3 | 2 | three | 1 | -1 | 0 | + | 3 | 2 | three | 1 | -1 | 1 | -1 + | 3 | 2 | three | 1 | -1 | 2 | 2 + | 3 | 2 | three | 1 | -1 | 2 | 4 + | 3 | 2 | three | 1 | -1 | 3 | -3 + | 3 | 2 | three | 1 | -1 | 5 | -5 + | 3 | 2 | three | 1 | -1 | 5 | -5 + | 3 | 2 | three | 1 | -1 | | 0 + | 3 | 2 | three | 1 | -1 | | + | 3 | 2 | three | 2 | 2 | 0 | + | 3 | 2 | three | 2 | 2 | 1 | -1 + | 3 | 2 | three | 2 | 2 | 2 | 2 + | 3 | 2 | three | 2 | 2 | 2 | 4 + | 3 | 2 | three | 2 | 2 | 3 | -3 + | 3 | 2 | three | 2 | 2 | 5 | -5 + | 3 | 2 | three | 2 | 2 | 5 | -5 + | 3 | 2 | three | 2 | 2 | | 0 + | 3 | 2 | three | 2 | 2 | | + | 3 | 2 | three | 2 | 4 | 0 | + | 3 | 2 | three | 2 | 4 | 1 | -1 + | 3 | 2 | three | 2 | 4 | 2 | 2 + | 3 | 2 | three | 2 | 4 | 2 | 4 + | 3 | 2 | three | 2 | 4 | 3 | -3 + | 3 | 2 | three | 2 | 4 | 5 | -5 + | 3 | 2 | three | 2 | 4 | 5 | -5 + | 3 | 2 | three | 2 | 4 | | 0 + | 3 | 2 | three | 2 | 4 | | + | 3 | 2 | three | 3 | -3 | 0 | + | 3 | 2 | three | 3 | -3 | 1 | -1 + | 3 | 2 | three | 3 | -3 | 2 | 2 + | 3 | 2 | three | 3 | -3 | 2 | 4 + | 3 | 2 | three | 3 | -3 | 3 | -3 + | 3 | 2 | three | 3 | -3 | 5 | -5 + | 3 | 2 | three | 3 | -3 | 5 | -5 + | 3 | 2 | three | 3 | -3 | | 0 + | 3 | 2 | three | 3 | -3 | | + | 3 | 2 | three | 5 | -5 | 0 | + | 3 | 2 | three | 5 | -5 | 0 | + | 3 | 2 | three | 5 | -5 | 1 | -1 + | 3 | 2 | 
three | 5 | -5 | 1 | -1 + | 3 | 2 | three | 5 | -5 | 2 | 2 + | 3 | 2 | three | 5 | -5 | 2 | 2 + | 3 | 2 | three | 5 | -5 | 2 | 4 + | 3 | 2 | three | 5 | -5 | 2 | 4 + | 3 | 2 | three | 5 | -5 | 3 | -3 + | 3 | 2 | three | 5 | -5 | 3 | -3 + | 3 | 2 | three | 5 | -5 | 5 | -5 + | 3 | 2 | three | 5 | -5 | 5 | -5 + | 3 | 2 | three | 5 | -5 | 5 | -5 + | 3 | 2 | three | 5 | -5 | 5 | -5 + | 3 | 2 | three | 5 | -5 | | 0 + | 3 | 2 | three | 5 | -5 | | 0 + | 3 | 2 | three | 5 | -5 | | + | 3 | 2 | three | 5 | -5 | | + | 3 | 2 | three | | 0 | 0 | + | 3 | 2 | three | | 0 | 1 | -1 + | 3 | 2 | three | | 0 | 2 | 2 + | 3 | 2 | three | | 0 | 2 | 4 + | 3 | 2 | three | | 0 | 3 | -3 + | 3 | 2 | three | | 0 | 5 | -5 + | 3 | 2 | three | | 0 | 5 | -5 + | 3 | 2 | three | | 0 | | 0 + | 3 | 2 | three | | 0 | | + | 3 | 2 | three | | | 0 | + | 3 | 2 | three | | | 1 | -1 + | 3 | 2 | three | | | 2 | 2 + | 3 | 2 | three | | | 2 | 4 + | 3 | 2 | three | | | 3 | -3 + | 3 | 2 | three | | | 5 | -5 + | 3 | 2 | three | | | 5 | -5 + | 3 | 2 | three | | | | 0 + | 3 | 2 | three | | | | + | 4 | 1 | four | 0 | | 0 | + | 4 | 1 | four | 0 | | 1 | -1 + | 4 | 1 | four | 0 | | 2 | 2 + | 4 | 1 | four | 0 | | 2 | 4 + | 4 | 1 | four | 0 | | 3 | -3 + | 4 | 1 | four | 0 | | 5 | -5 + | 4 | 1 | four | 0 | | 5 | -5 + | 4 | 1 | four | 0 | | | 0 + | 4 | 1 | four | 0 | | | + | 4 | 1 | four | 1 | -1 | 0 | + | 4 | 1 | four | 1 | -1 | 1 | -1 + | 4 | 1 | four | 1 | -1 | 2 | 2 + | 4 | 1 | four | 1 | -1 | 2 | 4 + | 4 | 1 | four | 1 | -1 | 3 | -3 + | 4 | 1 | four | 1 | -1 | 5 | -5 + | 4 | 1 | four | 1 | -1 | 5 | -5 + | 4 | 1 | four | 1 | -1 | | 0 + | 4 | 1 | four | 1 | -1 | | + | 4 | 1 | four | 2 | 2 | 0 | + | 4 | 1 | four | 2 | 2 | 1 | -1 + | 4 | 1 | four | 2 | 2 | 2 | 2 + | 4 | 1 | four | 2 | 2 | 2 | 4 + | 4 | 1 | four | 2 | 2 | 3 | -3 + | 4 | 1 | four | 2 | 2 | 5 | -5 + | 4 | 1 | four | 2 | 2 | 5 | -5 + | 4 | 1 | four | 2 | 2 | | 0 + | 4 | 1 | four | 2 | 2 | | + | 4 | 1 | four | 2 | 4 | 0 | + | 4 | 1 | four | 2 | 4 | 1 | -1 + | 4 | 1 | four | 2 | 4 | 2 | 2 + | 4 | 1 | four | 2 | 4 | 2 | 4 + | 4 | 1 | four | 2 | 4 | 3 | -3 + | 4 | 1 | four | 2 | 4 | 5 | -5 + | 4 | 1 | four | 2 | 4 | 5 | -5 + | 4 | 1 | four | 2 | 4 | | 0 + | 4 | 1 | four | 2 | 4 | | + | 4 | 1 | four | 3 | -3 | 0 | + | 4 | 1 | four | 3 | -3 | 1 | -1 + | 4 | 1 | four | 3 | -3 | 2 | 2 + | 4 | 1 | four | 3 | -3 | 2 | 4 + | 4 | 1 | four | 3 | -3 | 3 | -3 + | 4 | 1 | four | 3 | -3 | 5 | -5 + | 4 | 1 | four | 3 | -3 | 5 | -5 + | 4 | 1 | four | 3 | -3 | | 0 + | 4 | 1 | four | 3 | -3 | | + | 4 | 1 | four | 5 | -5 | 0 | + | 4 | 1 | four | 5 | -5 | 0 | + | 4 | 1 | four | 5 | -5 | 1 | -1 + | 4 | 1 | four | 5 | -5 | 1 | -1 + | 4 | 1 | four | 5 | -5 | 2 | 2 + | 4 | 1 | four | 5 | -5 | 2 | 2 + | 4 | 1 | four | 5 | -5 | 2 | 4 + | 4 | 1 | four | 5 | -5 | 2 | 4 + | 4 | 1 | four | 5 | -5 | 3 | -3 + | 4 | 1 | four | 5 | -5 | 3 | -3 + | 4 | 1 | four | 5 | -5 | 5 | -5 + | 4 | 1 | four | 5 | -5 | 5 | -5 + | 4 | 1 | four | 5 | -5 | 5 | -5 + | 4 | 1 | four | 5 | -5 | 5 | -5 + | 4 | 1 | four | 5 | -5 | | 0 + | 4 | 1 | four | 5 | -5 | | 0 + | 4 | 1 | four | 5 | -5 | | + | 4 | 1 | four | 5 | -5 | | + | 4 | 1 | four | | 0 | 0 | + | 4 | 1 | four | | 0 | 1 | -1 + | 4 | 1 | four | | 0 | 2 | 2 + | 4 | 1 | four | | 0 | 2 | 4 + | 4 | 1 | four | | 0 | 3 | -3 + | 4 | 1 | four | | 0 | 5 | -5 + | 4 | 1 | four | | 0 | 5 | -5 + | 4 | 1 | four | | 0 | | 0 + | 4 | 1 | four | | 0 | | + | 4 | 1 | four | | | 0 | + | 4 | 1 | four | | | 1 | -1 + | 4 | 1 | four | | | 2 | 2 + | 4 | 1 | four | | | 2 | 4 + | 4 | 1 | four | | | 3 | -3 + | 4 | 
1 | four | | | 5 | -5 + | 4 | 1 | four | | | 5 | -5 + | 4 | 1 | four | | | | 0 + | 4 | 1 | four | | | | + | 5 | 0 | five | 0 | | 0 | + | 5 | 0 | five | 0 | | 1 | -1 + | 5 | 0 | five | 0 | | 2 | 2 + | 5 | 0 | five | 0 | | 2 | 4 + | 5 | 0 | five | 0 | | 3 | -3 + | 5 | 0 | five | 0 | | 5 | -5 + | 5 | 0 | five | 0 | | 5 | -5 + | 5 | 0 | five | 0 | | | 0 + | 5 | 0 | five | 0 | | | + | 5 | 0 | five | 1 | -1 | 0 | + | 5 | 0 | five | 1 | -1 | 1 | -1 + | 5 | 0 | five | 1 | -1 | 2 | 2 + | 5 | 0 | five | 1 | -1 | 2 | 4 + | 5 | 0 | five | 1 | -1 | 3 | -3 + | 5 | 0 | five | 1 | -1 | 5 | -5 + | 5 | 0 | five | 1 | -1 | 5 | -5 + | 5 | 0 | five | 1 | -1 | | 0 + | 5 | 0 | five | 1 | -1 | | + | 5 | 0 | five | 2 | 2 | 0 | + | 5 | 0 | five | 2 | 2 | 1 | -1 + | 5 | 0 | five | 2 | 2 | 2 | 2 + | 5 | 0 | five | 2 | 2 | 2 | 4 + | 5 | 0 | five | 2 | 2 | 3 | -3 + | 5 | 0 | five | 2 | 2 | 5 | -5 + | 5 | 0 | five | 2 | 2 | 5 | -5 + | 5 | 0 | five | 2 | 2 | | 0 + | 5 | 0 | five | 2 | 2 | | + | 5 | 0 | five | 2 | 4 | 0 | + | 5 | 0 | five | 2 | 4 | 1 | -1 + | 5 | 0 | five | 2 | 4 | 2 | 2 + | 5 | 0 | five | 2 | 4 | 2 | 4 + | 5 | 0 | five | 2 | 4 | 3 | -3 + | 5 | 0 | five | 2 | 4 | 5 | -5 + | 5 | 0 | five | 2 | 4 | 5 | -5 + | 5 | 0 | five | 2 | 4 | | 0 + | 5 | 0 | five | 2 | 4 | | + | 5 | 0 | five | 3 | -3 | 0 | + | 5 | 0 | five | 3 | -3 | 1 | -1 + | 5 | 0 | five | 3 | -3 | 2 | 2 + | 5 | 0 | five | 3 | -3 | 2 | 4 + | 5 | 0 | five | 3 | -3 | 3 | -3 + | 5 | 0 | five | 3 | -3 | 5 | -5 + | 5 | 0 | five | 3 | -3 | 5 | -5 + | 5 | 0 | five | 3 | -3 | | 0 + | 5 | 0 | five | 3 | -3 | | + | 5 | 0 | five | 5 | -5 | 0 | + | 5 | 0 | five | 5 | -5 | 0 | + | 5 | 0 | five | 5 | -5 | 1 | -1 + | 5 | 0 | five | 5 | -5 | 1 | -1 + | 5 | 0 | five | 5 | -5 | 2 | 2 + | 5 | 0 | five | 5 | -5 | 2 | 2 + | 5 | 0 | five | 5 | -5 | 2 | 4 + | 5 | 0 | five | 5 | -5 | 2 | 4 + | 5 | 0 | five | 5 | -5 | 3 | -3 + | 5 | 0 | five | 5 | -5 | 3 | -3 + | 5 | 0 | five | 5 | -5 | 5 | -5 + | 5 | 0 | five | 5 | -5 | 5 | -5 + | 5 | 0 | five | 5 | -5 | 5 | -5 + | 5 | 0 | five | 5 | -5 | 5 | -5 + | 5 | 0 | five | 5 | -5 | | 0 + | 5 | 0 | five | 5 | -5 | | 0 + | 5 | 0 | five | 5 | -5 | | + | 5 | 0 | five | 5 | -5 | | + | 5 | 0 | five | | 0 | 0 | + | 5 | 0 | five | | 0 | 1 | -1 + | 5 | 0 | five | | 0 | 2 | 2 + | 5 | 0 | five | | 0 | 2 | 4 + | 5 | 0 | five | | 0 | 3 | -3 + | 5 | 0 | five | | 0 | 5 | -5 + | 5 | 0 | five | | 0 | 5 | -5 + | 5 | 0 | five | | 0 | | 0 + | 5 | 0 | five | | 0 | | + | 5 | 0 | five | | | 0 | + | 5 | 0 | five | | | 1 | -1 + | 5 | 0 | five | | | 2 | 2 + | 5 | 0 | five | | | 2 | 4 + | 5 | 0 | five | | | 3 | -3 + | 5 | 0 | five | | | 5 | -5 + | 5 | 0 | five | | | 5 | -5 + | 5 | 0 | five | | | | 0 + | 5 | 0 | five | | | | + | 6 | 6 | six | 0 | | 0 | + | 6 | 6 | six | 0 | | 1 | -1 + | 6 | 6 | six | 0 | | 2 | 2 + | 6 | 6 | six | 0 | | 2 | 4 + | 6 | 6 | six | 0 | | 3 | -3 + | 6 | 6 | six | 0 | | 5 | -5 + | 6 | 6 | six | 0 | | 5 | -5 + | 6 | 6 | six | 0 | | | 0 + | 6 | 6 | six | 0 | | | + | 6 | 6 | six | 1 | -1 | 0 | + | 6 | 6 | six | 1 | -1 | 1 | -1 + | 6 | 6 | six | 1 | -1 | 2 | 2 + | 6 | 6 | six | 1 | -1 | 2 | 4 + | 6 | 6 | six | 1 | -1 | 3 | -3 + | 6 | 6 | six | 1 | -1 | 5 | -5 + | 6 | 6 | six | 1 | -1 | 5 | -5 + | 6 | 6 | six | 1 | -1 | | 0 + | 6 | 6 | six | 1 | -1 | | + | 6 | 6 | six | 2 | 2 | 0 | + | 6 | 6 | six | 2 | 2 | 1 | -1 + | 6 | 6 | six | 2 | 2 | 2 | 2 + | 6 | 6 | six | 2 | 2 | 2 | 4 + | 6 | 6 | six | 2 | 2 | 3 | -3 + | 6 | 6 | six | 2 | 2 | 5 | -5 + | 6 | 6 | six | 2 | 2 | 5 | -5 + | 6 | 6 | six | 2 | 2 | | 0 + | 6 | 6 | six | 2 | 2 | | + | 6 
| 6 | six | 2 | 4 | 0 | + | 6 | 6 | six | 2 | 4 | 1 | -1 + | 6 | 6 | six | 2 | 4 | 2 | 2 + | 6 | 6 | six | 2 | 4 | 2 | 4 + | 6 | 6 | six | 2 | 4 | 3 | -3 + | 6 | 6 | six | 2 | 4 | 5 | -5 + | 6 | 6 | six | 2 | 4 | 5 | -5 + | 6 | 6 | six | 2 | 4 | | 0 + | 6 | 6 | six | 2 | 4 | | + | 6 | 6 | six | 3 | -3 | 0 | + | 6 | 6 | six | 3 | -3 | 1 | -1 + | 6 | 6 | six | 3 | -3 | 2 | 2 + | 6 | 6 | six | 3 | -3 | 2 | 4 + | 6 | 6 | six | 3 | -3 | 3 | -3 + | 6 | 6 | six | 3 | -3 | 5 | -5 + | 6 | 6 | six | 3 | -3 | 5 | -5 + | 6 | 6 | six | 3 | -3 | | 0 + | 6 | 6 | six | 3 | -3 | | + | 6 | 6 | six | 5 | -5 | 0 | + | 6 | 6 | six | 5 | -5 | 0 | + | 6 | 6 | six | 5 | -5 | 1 | -1 + | 6 | 6 | six | 5 | -5 | 1 | -1 + | 6 | 6 | six | 5 | -5 | 2 | 2 + | 6 | 6 | six | 5 | -5 | 2 | 2 + | 6 | 6 | six | 5 | -5 | 2 | 4 + | 6 | 6 | six | 5 | -5 | 2 | 4 + | 6 | 6 | six | 5 | -5 | 3 | -3 + | 6 | 6 | six | 5 | -5 | 3 | -3 + | 6 | 6 | six | 5 | -5 | 5 | -5 + | 6 | 6 | six | 5 | -5 | 5 | -5 + | 6 | 6 | six | 5 | -5 | 5 | -5 + | 6 | 6 | six | 5 | -5 | 5 | -5 + | 6 | 6 | six | 5 | -5 | | 0 + | 6 | 6 | six | 5 | -5 | | 0 + | 6 | 6 | six | 5 | -5 | | + | 6 | 6 | six | 5 | -5 | | + | 6 | 6 | six | | 0 | 0 | + | 6 | 6 | six | | 0 | 1 | -1 + | 6 | 6 | six | | 0 | 2 | 2 + | 6 | 6 | six | | 0 | 2 | 4 + | 6 | 6 | six | | 0 | 3 | -3 + | 6 | 6 | six | | 0 | 5 | -5 + | 6 | 6 | six | | 0 | 5 | -5 + | 6 | 6 | six | | 0 | | 0 + | 6 | 6 | six | | 0 | | + | 6 | 6 | six | | | 0 | + | 6 | 6 | six | | | 1 | -1 + | 6 | 6 | six | | | 2 | 2 + | 6 | 6 | six | | | 2 | 4 + | 6 | 6 | six | | | 3 | -3 + | 6 | 6 | six | | | 5 | -5 + | 6 | 6 | six | | | 5 | -5 + | 6 | 6 | six | | | | 0 + | 6 | 6 | six | | | | + | 7 | 7 | seven | 0 | | 0 | + | 7 | 7 | seven | 0 | | 1 | -1 + | 7 | 7 | seven | 0 | | 2 | 2 + | 7 | 7 | seven | 0 | | 2 | 4 + | 7 | 7 | seven | 0 | | 3 | -3 + | 7 | 7 | seven | 0 | | 5 | -5 + | 7 | 7 | seven | 0 | | 5 | -5 + | 7 | 7 | seven | 0 | | | 0 + | 7 | 7 | seven | 0 | | | + | 7 | 7 | seven | 1 | -1 | 0 | + | 7 | 7 | seven | 1 | -1 | 1 | -1 + | 7 | 7 | seven | 1 | -1 | 2 | 2 + | 7 | 7 | seven | 1 | -1 | 2 | 4 + | 7 | 7 | seven | 1 | -1 | 3 | -3 + | 7 | 7 | seven | 1 | -1 | 5 | -5 + | 7 | 7 | seven | 1 | -1 | 5 | -5 + | 7 | 7 | seven | 1 | -1 | | 0 + | 7 | 7 | seven | 1 | -1 | | + | 7 | 7 | seven | 2 | 2 | 0 | + | 7 | 7 | seven | 2 | 2 | 1 | -1 + | 7 | 7 | seven | 2 | 2 | 2 | 2 + | 7 | 7 | seven | 2 | 2 | 2 | 4 + | 7 | 7 | seven | 2 | 2 | 3 | -3 + | 7 | 7 | seven | 2 | 2 | 5 | -5 + | 7 | 7 | seven | 2 | 2 | 5 | -5 + | 7 | 7 | seven | 2 | 2 | | 0 + | 7 | 7 | seven | 2 | 2 | | + | 7 | 7 | seven | 2 | 4 | 0 | + | 7 | 7 | seven | 2 | 4 | 1 | -1 + | 7 | 7 | seven | 2 | 4 | 2 | 2 + | 7 | 7 | seven | 2 | 4 | 2 | 4 + | 7 | 7 | seven | 2 | 4 | 3 | -3 + | 7 | 7 | seven | 2 | 4 | 5 | -5 + | 7 | 7 | seven | 2 | 4 | 5 | -5 + | 7 | 7 | seven | 2 | 4 | | 0 + | 7 | 7 | seven | 2 | 4 | | + | 7 | 7 | seven | 3 | -3 | 0 | + | 7 | 7 | seven | 3 | -3 | 1 | -1 + | 7 | 7 | seven | 3 | -3 | 2 | 2 + | 7 | 7 | seven | 3 | -3 | 2 | 4 + | 7 | 7 | seven | 3 | -3 | 3 | -3 + | 7 | 7 | seven | 3 | -3 | 5 | -5 + | 7 | 7 | seven | 3 | -3 | 5 | -5 + | 7 | 7 | seven | 3 | -3 | | 0 + | 7 | 7 | seven | 3 | -3 | | + | 7 | 7 | seven | 5 | -5 | 0 | + | 7 | 7 | seven | 5 | -5 | 0 | + | 7 | 7 | seven | 5 | -5 | 1 | -1 + | 7 | 7 | seven | 5 | -5 | 1 | -1 + | 7 | 7 | seven | 5 | -5 | 2 | 2 + | 7 | 7 | seven | 5 | -5 | 2 | 2 + | 7 | 7 | seven | 5 | -5 | 2 | 4 + | 7 | 7 | seven | 5 | -5 | 2 | 4 + | 7 | 7 | seven | 5 | -5 | 3 | -3 + | 7 | 7 | seven | 5 | -5 | 3 | -3 + | 7 | 7 | seven | 5 | 
-5 | 5 | -5 + | 7 | 7 | seven | 5 | -5 | 5 | -5 + | 7 | 7 | seven | 5 | -5 | 5 | -5 + | 7 | 7 | seven | 5 | -5 | 5 | -5 + | 7 | 7 | seven | 5 | -5 | | 0 + | 7 | 7 | seven | 5 | -5 | | 0 + | 7 | 7 | seven | 5 | -5 | | + | 7 | 7 | seven | 5 | -5 | | + | 7 | 7 | seven | | 0 | 0 | + | 7 | 7 | seven | | 0 | 1 | -1 + | 7 | 7 | seven | | 0 | 2 | 2 + | 7 | 7 | seven | | 0 | 2 | 4 + | 7 | 7 | seven | | 0 | 3 | -3 + | 7 | 7 | seven | | 0 | 5 | -5 + | 7 | 7 | seven | | 0 | 5 | -5 + | 7 | 7 | seven | | 0 | | 0 + | 7 | 7 | seven | | 0 | | + | 7 | 7 | seven | | | 0 | + | 7 | 7 | seven | | | 1 | -1 + | 7 | 7 | seven | | | 2 | 2 + | 7 | 7 | seven | | | 2 | 4 + | 7 | 7 | seven | | | 3 | -3 + | 7 | 7 | seven | | | 5 | -5 + | 7 | 7 | seven | | | 5 | -5 + | 7 | 7 | seven | | | | 0 + | 7 | 7 | seven | | | | + | 8 | 8 | eight | 0 | | 0 | + | 8 | 8 | eight | 0 | | 1 | -1 + | 8 | 8 | eight | 0 | | 2 | 2 + | 8 | 8 | eight | 0 | | 2 | 4 + | 8 | 8 | eight | 0 | | 3 | -3 + | 8 | 8 | eight | 0 | | 5 | -5 + | 8 | 8 | eight | 0 | | 5 | -5 + | 8 | 8 | eight | 0 | | | 0 + | 8 | 8 | eight | 0 | | | + | 8 | 8 | eight | 1 | -1 | 0 | + | 8 | 8 | eight | 1 | -1 | 1 | -1 + | 8 | 8 | eight | 1 | -1 | 2 | 2 + | 8 | 8 | eight | 1 | -1 | 2 | 4 + | 8 | 8 | eight | 1 | -1 | 3 | -3 + | 8 | 8 | eight | 1 | -1 | 5 | -5 + | 8 | 8 | eight | 1 | -1 | 5 | -5 + | 8 | 8 | eight | 1 | -1 | | 0 + | 8 | 8 | eight | 1 | -1 | | + | 8 | 8 | eight | 2 | 2 | 0 | + | 8 | 8 | eight | 2 | 2 | 1 | -1 + | 8 | 8 | eight | 2 | 2 | 2 | 2 + | 8 | 8 | eight | 2 | 2 | 2 | 4 + | 8 | 8 | eight | 2 | 2 | 3 | -3 + | 8 | 8 | eight | 2 | 2 | 5 | -5 + | 8 | 8 | eight | 2 | 2 | 5 | -5 + | 8 | 8 | eight | 2 | 2 | | 0 + | 8 | 8 | eight | 2 | 2 | | + | 8 | 8 | eight | 2 | 4 | 0 | + | 8 | 8 | eight | 2 | 4 | 1 | -1 + | 8 | 8 | eight | 2 | 4 | 2 | 2 + | 8 | 8 | eight | 2 | 4 | 2 | 4 + | 8 | 8 | eight | 2 | 4 | 3 | -3 + | 8 | 8 | eight | 2 | 4 | 5 | -5 + | 8 | 8 | eight | 2 | 4 | 5 | -5 + | 8 | 8 | eight | 2 | 4 | | 0 + | 8 | 8 | eight | 2 | 4 | | + | 8 | 8 | eight | 3 | -3 | 0 | + | 8 | 8 | eight | 3 | -3 | 1 | -1 + | 8 | 8 | eight | 3 | -3 | 2 | 2 + | 8 | 8 | eight | 3 | -3 | 2 | 4 + | 8 | 8 | eight | 3 | -3 | 3 | -3 + | 8 | 8 | eight | 3 | -3 | 5 | -5 + | 8 | 8 | eight | 3 | -3 | 5 | -5 + | 8 | 8 | eight | 3 | -3 | | 0 + | 8 | 8 | eight | 3 | -3 | | + | 8 | 8 | eight | 5 | -5 | 0 | + | 8 | 8 | eight | 5 | -5 | 0 | + | 8 | 8 | eight | 5 | -5 | 1 | -1 + | 8 | 8 | eight | 5 | -5 | 1 | -1 + | 8 | 8 | eight | 5 | -5 | 2 | 2 + | 8 | 8 | eight | 5 | -5 | 2 | 2 + | 8 | 8 | eight | 5 | -5 | 2 | 4 + | 8 | 8 | eight | 5 | -5 | 2 | 4 + | 8 | 8 | eight | 5 | -5 | 3 | -3 + | 8 | 8 | eight | 5 | -5 | 3 | -3 + | 8 | 8 | eight | 5 | -5 | 5 | -5 + | 8 | 8 | eight | 5 | -5 | 5 | -5 + | 8 | 8 | eight | 5 | -5 | 5 | -5 + | 8 | 8 | eight | 5 | -5 | 5 | -5 + | 8 | 8 | eight | 5 | -5 | | 0 + | 8 | 8 | eight | 5 | -5 | | 0 + | 8 | 8 | eight | 5 | -5 | | + | 8 | 8 | eight | 5 | -5 | | + | 8 | 8 | eight | | 0 | 0 | + | 8 | 8 | eight | | 0 | 1 | -1 + | 8 | 8 | eight | | 0 | 2 | 2 + | 8 | 8 | eight | | 0 | 2 | 4 + | 8 | 8 | eight | | 0 | 3 | -3 + | 8 | 8 | eight | | 0 | 5 | -5 + | 8 | 8 | eight | | 0 | 5 | -5 + | 8 | 8 | eight | | 0 | | 0 + | 8 | 8 | eight | | 0 | | + | 8 | 8 | eight | | | 0 | + | 8 | 8 | eight | | | 1 | -1 + | 8 | 8 | eight | | | 2 | 2 + | 8 | 8 | eight | | | 2 | 4 + | 8 | 8 | eight | | | 3 | -3 + | 8 | 8 | eight | | | 5 | -5 + | 8 | 8 | eight | | | 5 | -5 + | 8 | 8 | eight | | | | 0 + | 8 | 8 | eight | | | | + | | 0 | zero | 0 | | 0 | + | | 0 | zero | 0 | | 1 | -1 + | | 0 | zero | 
0 | | 2 | 2 + | | 0 | zero | 0 | | 2 | 4 + | | 0 | zero | 0 | | 3 | -3 + | | 0 | zero | 0 | | 5 | -5 + | | 0 | zero | 0 | | 5 | -5 + | | 0 | zero | 0 | | | 0 + | | 0 | zero | 0 | | | + | | 0 | zero | 1 | -1 | 0 | + | | 0 | zero | 1 | -1 | 1 | -1 + | | 0 | zero | 1 | -1 | 2 | 2 + | | 0 | zero | 1 | -1 | 2 | 4 + | | 0 | zero | 1 | -1 | 3 | -3 + | | 0 | zero | 1 | -1 | 5 | -5 + | | 0 | zero | 1 | -1 | 5 | -5 + | | 0 | zero | 1 | -1 | | 0 + | | 0 | zero | 1 | -1 | | + | | 0 | zero | 2 | 2 | 0 | + | | 0 | zero | 2 | 2 | 1 | -1 + | | 0 | zero | 2 | 2 | 2 | 2 + | | 0 | zero | 2 | 2 | 2 | 4 + | | 0 | zero | 2 | 2 | 3 | -3 + | | 0 | zero | 2 | 2 | 5 | -5 + | | 0 | zero | 2 | 2 | 5 | -5 + | | 0 | zero | 2 | 2 | | 0 + | | 0 | zero | 2 | 2 | | + | | 0 | zero | 2 | 4 | 0 | + | | 0 | zero | 2 | 4 | 1 | -1 + | | 0 | zero | 2 | 4 | 2 | 2 + | | 0 | zero | 2 | 4 | 2 | 4 + | | 0 | zero | 2 | 4 | 3 | -3 + | | 0 | zero | 2 | 4 | 5 | -5 + | | 0 | zero | 2 | 4 | 5 | -5 + | | 0 | zero | 2 | 4 | | 0 + | | 0 | zero | 2 | 4 | | + | | 0 | zero | 3 | -3 | 0 | + | | 0 | zero | 3 | -3 | 1 | -1 + | | 0 | zero | 3 | -3 | 2 | 2 + | | 0 | zero | 3 | -3 | 2 | 4 + | | 0 | zero | 3 | -3 | 3 | -3 + | | 0 | zero | 3 | -3 | 5 | -5 + | | 0 | zero | 3 | -3 | 5 | -5 + | | 0 | zero | 3 | -3 | | 0 + | | 0 | zero | 3 | -3 | | + | | 0 | zero | 5 | -5 | 0 | + | | 0 | zero | 5 | -5 | 0 | + | | 0 | zero | 5 | -5 | 1 | -1 + | | 0 | zero | 5 | -5 | 1 | -1 + | | 0 | zero | 5 | -5 | 2 | 2 + | | 0 | zero | 5 | -5 | 2 | 2 + | | 0 | zero | 5 | -5 | 2 | 4 + | | 0 | zero | 5 | -5 | 2 | 4 + | | 0 | zero | 5 | -5 | 3 | -3 + | | 0 | zero | 5 | -5 | 3 | -3 + | | 0 | zero | 5 | -5 | 5 | -5 + | | 0 | zero | 5 | -5 | 5 | -5 + | | 0 | zero | 5 | -5 | 5 | -5 + | | 0 | zero | 5 | -5 | 5 | -5 + | | 0 | zero | 5 | -5 | | 0 + | | 0 | zero | 5 | -5 | | 0 + | | 0 | zero | 5 | -5 | | + | | 0 | zero | 5 | -5 | | + | | 0 | zero | | 0 | 0 | + | | 0 | zero | | 0 | 1 | -1 + | | 0 | zero | | 0 | 2 | 2 + | | 0 | zero | | 0 | 2 | 4 + | | 0 | zero | | 0 | 3 | -3 + | | 0 | zero | | 0 | 5 | -5 + | | 0 | zero | | 0 | 5 | -5 + | | 0 | zero | | 0 | | 0 + | | 0 | zero | | 0 | | + | | 0 | zero | | | 0 | + | | 0 | zero | | | 1 | -1 + | | 0 | zero | | | 2 | 2 + | | 0 | zero | | | 2 | 4 + | | 0 | zero | | | 3 | -3 + | | 0 | zero | | | 5 | -5 + | | 0 | zero | | | 5 | -5 + | | 0 | zero | | | | 0 + | | 0 | zero | | | | + | | | null | 0 | | 0 | + | | | null | 0 | | 1 | -1 + | | | null | 0 | | 2 | 2 + | | | null | 0 | | 2 | 4 + | | | null | 0 | | 3 | -3 + | | | null | 0 | | 5 | -5 + | | | null | 0 | | 5 | -5 + | | | null | 0 | | | 0 + | | | null | 0 | | | + | | | null | 1 | -1 | 0 | + | | | null | 1 | -1 | 1 | -1 + | | | null | 1 | -1 | 2 | 2 + | | | null | 1 | -1 | 2 | 4 + | | | null | 1 | -1 | 3 | -3 + | | | null | 1 | -1 | 5 | -5 + | | | null | 1 | -1 | 5 | -5 + | | | null | 1 | -1 | | 0 + | | | null | 1 | -1 | | + | | | null | 2 | 2 | 0 | + | | | null | 2 | 2 | 1 | -1 + | | | null | 2 | 2 | 2 | 2 + | | | null | 2 | 2 | 2 | 4 + | | | null | 2 | 2 | 3 | -3 + | | | null | 2 | 2 | 5 | -5 + | | | null | 2 | 2 | 5 | -5 + | | | null | 2 | 2 | | 0 + | | | null | 2 | 2 | | + | | | null | 2 | 4 | 0 | + | | | null | 2 | 4 | 1 | -1 + | | | null | 2 | 4 | 2 | 2 + | | | null | 2 | 4 | 2 | 4 + | | | null | 2 | 4 | 3 | -3 + | | | null | 2 | 4 | 5 | -5 + | | | null | 2 | 4 | 5 | -5 + | | | null | 2 | 4 | | 0 + | | | null | 2 | 4 | | + | | | null | 3 | -3 | 0 | + | | | null | 3 | -3 | 1 | -1 + | | | null | 3 | -3 | 2 | 2 + | | | null | 3 | -3 | 2 | 4 + | | | null | 3 | -3 | 3 | -3 + | | | null | 3 | 
-3 | 5 | -5 + | | | null | 3 | -3 | 5 | -5 + | | | null | 3 | -3 | | 0 + | | | null | 3 | -3 | | + | | | null | 5 | -5 | 0 | + | | | null | 5 | -5 | 0 | + | | | null | 5 | -5 | 1 | -1 + | | | null | 5 | -5 | 1 | -1 + | | | null | 5 | -5 | 2 | 2 + | | | null | 5 | -5 | 2 | 2 + | | | null | 5 | -5 | 2 | 4 + | | | null | 5 | -5 | 2 | 4 + | | | null | 5 | -5 | 3 | -3 + | | | null | 5 | -5 | 3 | -3 + | | | null | 5 | -5 | 5 | -5 + | | | null | 5 | -5 | 5 | -5 + | | | null | 5 | -5 | 5 | -5 + | | | null | 5 | -5 | 5 | -5 + | | | null | 5 | -5 | | 0 + | | | null | 5 | -5 | | 0 + | | | null | 5 | -5 | | + | | | null | 5 | -5 | | + | | | null | | 0 | 0 | + | | | null | | 0 | 1 | -1 + | | | null | | 0 | 2 | 2 + | | | null | | 0 | 2 | 4 + | | | null | | 0 | 3 | -3 + | | | null | | 0 | 5 | -5 + | | | null | | 0 | 5 | -5 + | | | null | | 0 | | 0 + | | | null | | 0 | | + | | | null | | | 0 | + | | | null | | | 1 | -1 + | | | null | | | 2 | 2 + | | | null | | | 2 | 4 + | | | null | | | 3 | -3 + | | | null | | | 5 | -5 + | | | null | | | 5 | -5 + | | | null | | | | 0 + | | | null | | | | +(891 rows) + +-- +-- +-- Inner joins (equi-joins) +-- +-- +-- +-- Inner joins (equi-joins) with USING clause +-- The USING syntax changes the shape of the resulting table +-- by including a column in the USING clause only once in the result. +-- +-- Inner equi-join on specified column +SELECT '' AS "xxx", * + FROM J1_TBL INNER JOIN J2_TBL USING (i) + ORDER BY i, j, k, t; + xxx | i | j | t | k +-----+---+---+-------+---- + | 0 | | zero | + | 1 | 4 | one | -1 + | 2 | 3 | two | 2 + | 2 | 3 | two | 4 + | 3 | 2 | three | -3 + | 5 | 0 | five | -5 + | 5 | 0 | five | -5 +(7 rows) + +-- Same as above, slightly different syntax +SELECT '' AS "xxx", * + FROM J1_TBL JOIN J2_TBL USING (i) + ORDER BY i, j, k, t; + xxx | i | j | t | k +-----+---+---+-------+---- + | 0 | | zero | + | 1 | 4 | one | -1 + | 2 | 3 | two | 2 + | 2 | 3 | two | 4 + | 3 | 2 | three | -3 + | 5 | 0 | five | -5 + | 5 | 0 | five | -5 +(7 rows) + +SELECT '' AS "xxx", * + FROM J1_TBL t1 (a, b, c) JOIN J2_TBL t2 (a, d) USING (a) + ORDER BY a, d; + xxx | a | b | c | d +-----+---+---+-------+---- + | 0 | | zero | + | 1 | 4 | one | -1 + | 2 | 3 | two | 2 + | 2 | 3 | two | 4 + | 3 | 2 | three | -3 + | 5 | 0 | five | -5 + | 5 | 0 | five | -5 +(7 rows) + +SELECT '' AS "xxx", * + FROM J1_TBL t1 (a, b, c) JOIN J2_TBL t2 (a, b) USING (b) + ORDER BY b, t1.a; + xxx | b | a | c | a +-----+---+---+-------+--- + | 0 | 5 | five | + | 0 | | zero | + | 2 | 3 | three | 2 + | 4 | 1 | one | 2 +(4 rows) + +-- +-- NATURAL JOIN +-- Inner equi-join on all columns with the same name +-- +SELECT '' AS "xxx", * + FROM J1_TBL NATURAL JOIN J2_TBL + ORDER BY i, j, k, t; + xxx | i | j | t | k +-----+---+---+-------+---- + | 0 | | zero | + | 1 | 4 | one | -1 + | 2 | 3 | two | 2 + | 2 | 3 | two | 4 + | 3 | 2 | three | -3 + | 5 | 0 | five | -5 + | 5 | 0 | five | -5 +(7 rows) + +SELECT '' AS "xxx", * + FROM J1_TBL t1 (a, b, c) NATURAL JOIN J2_TBL t2 (a, d) + ORDER BY a, b, c, d; + xxx | a | b | c | d +-----+---+---+-------+---- + | 0 | | zero | + | 1 | 4 | one | -1 + | 2 | 3 | two | 2 + | 2 | 3 | two | 4 + | 3 | 2 | three | -3 + | 5 | 0 | five | -5 + | 5 | 0 | five | -5 +(7 rows) + +SELECT '' AS "xxx", * + FROM J1_TBL t1 (a, b, c) NATURAL JOIN J2_TBL t2 (d, a) + ORDER BY a, b, c, d; + xxx | a | b | c | d +-----+---+---+------+--- + | 0 | | zero | + | 2 | 3 | two | 2 + | 4 | 1 | four | 2 +(3 rows) + +-- mismatch number of columns +-- currently, Postgres will fill in with underlying names +SELECT '' AS 
"xxx", * + FROM J1_TBL t1 (a, b) NATURAL JOIN J2_TBL t2 (a) + ORDER BY a, b, t, k; + xxx | a | b | t | k +-----+---+---+-------+---- + | 0 | | zero | + | 1 | 4 | one | -1 + | 2 | 3 | two | 2 + | 2 | 3 | two | 4 + | 3 | 2 | three | -3 + | 5 | 0 | five | -5 + | 5 | 0 | five | -5 +(7 rows) + +-- +-- Inner joins (equi-joins) +-- +SELECT '' AS "xxx", * + FROM J1_TBL JOIN J2_TBL ON (J1_TBL.i = J2_TBL.i) + ORDER BY J1_TBL.i, J1_TBL.j, J1_TBL.t, J2_TBL.i, J2_TBL.k; + xxx | i | j | t | i | k +-----+---+---+-------+---+---- + | 0 | | zero | 0 | + | 1 | 4 | one | 1 | -1 + | 2 | 3 | two | 2 | 2 + | 2 | 3 | two | 2 | 4 + | 3 | 2 | three | 3 | -3 + | 5 | 0 | five | 5 | -5 + | 5 | 0 | five | 5 | -5 +(7 rows) + +SELECT '' AS "xxx", * + FROM J1_TBL JOIN J2_TBL ON (J1_TBL.i = J2_TBL.k) + ORDER BY J1_TBL.i, J1_TBL.j, J1_TBL.t, J2_TBL.i, J2_TBL.k; + xxx | i | j | t | i | k +-----+---+---+------+---+--- + | 0 | | zero | | 0 + | 2 | 3 | two | 2 | 2 + | 4 | 1 | four | 2 | 4 +(3 rows) + +-- +-- Non-equi-joins +-- +SELECT '' AS "xxx", * + FROM J1_TBL JOIN J2_TBL ON (J1_TBL.i <= J2_TBL.k) + ORDER BY J1_TBL.i, J1_TBL.j, J1_TBL.t, J2_TBL.i, J2_TBL.k; + xxx | i | j | t | i | k +-----+---+---+-------+---+--- + | 0 | | zero | 2 | 2 + | 0 | | zero | 2 | 4 + | 0 | | zero | | 0 + | 1 | 4 | one | 2 | 2 + | 1 | 4 | one | 2 | 4 + | 2 | 3 | two | 2 | 2 + | 2 | 3 | two | 2 | 4 + | 3 | 2 | three | 2 | 4 + | 4 | 1 | four | 2 | 4 +(9 rows) + +-- +-- Outer joins +-- Note that OUTER is a noise word +-- +SELECT '' AS "xxx", * + FROM J1_TBL LEFT OUTER JOIN J2_TBL USING (i) + ORDER BY i, k, t; + xxx | i | j | t | k +-----+---+---+-------+---- + | 0 | | zero | + | 1 | 4 | one | -1 + | 2 | 3 | two | 2 + | 2 | 3 | two | 4 + | 3 | 2 | three | -3 + | 4 | 1 | four | + | 5 | 0 | five | -5 + | 5 | 0 | five | -5 + | 6 | 6 | six | + | 7 | 7 | seven | + | 8 | 8 | eight | + | | | null | + | | 0 | zero | +(13 rows) + +SELECT '' AS "xxx", * + FROM J1_TBL LEFT JOIN J2_TBL USING (i) + ORDER BY i, k, t; + xxx | i | j | t | k +-----+---+---+-------+---- + | 0 | | zero | + | 1 | 4 | one | -1 + | 2 | 3 | two | 2 + | 2 | 3 | two | 4 + | 3 | 2 | three | -3 + | 4 | 1 | four | + | 5 | 0 | five | -5 + | 5 | 0 | five | -5 + | 6 | 6 | six | + | 7 | 7 | seven | + | 8 | 8 | eight | + | | | null | + | | 0 | zero | +(13 rows) + +SELECT '' AS "xxx", * + FROM J1_TBL RIGHT OUTER JOIN J2_TBL USING (i) + ORDER BY i, j, k, t; + xxx | i | j | t | k +-----+---+---+-------+---- + | 0 | | zero | + | 1 | 4 | one | -1 + | 2 | 3 | two | 2 + | 2 | 3 | two | 4 + | 3 | 2 | three | -3 + | 5 | 0 | five | -5 + | 5 | 0 | five | -5 + | | | | 0 + | | | | +(9 rows) + +SELECT '' AS "xxx", * + FROM J1_TBL RIGHT JOIN J2_TBL USING (i) + ORDER BY i, j, k, t; + xxx | i | j | t | k +-----+---+---+-------+---- + | 0 | | zero | + | 1 | 4 | one | -1 + | 2 | 3 | two | 2 + | 2 | 3 | two | 4 + | 3 | 2 | three | -3 + | 5 | 0 | five | -5 + | 5 | 0 | five | -5 + | | | | 0 + | | | | +(9 rows) + +SELECT '' AS "xxx", * + FROM J1_TBL FULL OUTER JOIN J2_TBL USING (i) + ORDER BY i, k, t; + xxx | i | j | t | k +-----+---+---+-------+---- + | 0 | | zero | + | 1 | 4 | one | -1 + | 2 | 3 | two | 2 + | 2 | 3 | two | 4 + | 3 | 2 | three | -3 + | 4 | 1 | four | + | 5 | 0 | five | -5 + | 5 | 0 | five | -5 + | 6 | 6 | six | + | 7 | 7 | seven | + | 8 | 8 | eight | + | | | | 0 + | | | null | + | | 0 | zero | + | | | | +(15 rows) + +SELECT '' AS "xxx", * + FROM J1_TBL FULL JOIN J2_TBL USING (i) + ORDER BY i, k, t; + xxx | i | j | t | k +-----+---+---+-------+---- + | 0 | | zero | + | 1 | 4 | one | -1 + | 2 | 3 | two | 2 + 
| 2 | 3 | two | 4 + | 3 | 2 | three | -3 + | 4 | 1 | four | + | 5 | 0 | five | -5 + | 5 | 0 | five | -5 + | 6 | 6 | six | + | 7 | 7 | seven | + | 8 | 8 | eight | + | | | | 0 + | | | null | + | | 0 | zero | + | | | | +(15 rows) + +SELECT '' AS "xxx", * + FROM J1_TBL LEFT JOIN J2_TBL USING (i) WHERE (k = 1); + xxx | i | j | t | k +-----+---+---+---+--- +(0 rows) + +SELECT '' AS "xxx", * + FROM J1_TBL LEFT JOIN J2_TBL USING (i) WHERE (i = 1); + xxx | i | j | t | k +-----+---+---+-----+---- + | 1 | 4 | one | -1 +(1 row) + +-- +-- More complicated constructs +-- +-- +-- Multiway full join +-- +CREATE TABLE t1 (name TEXT, n INTEGER); +CREATE TABLE t2 (name TEXT, n INTEGER); +CREATE TABLE t3 (name TEXT, n INTEGER); +INSERT INTO t1 VALUES ( 'bb', 11 ); +INSERT INTO t2 VALUES ( 'bb', 12 ); +INSERT INTO t2 VALUES ( 'cc', 22 ); +INSERT INTO t2 VALUES ( 'ee', 42 ); +INSERT INTO t3 VALUES ( 'bb', 13 ); +INSERT INTO t3 VALUES ( 'cc', 23 ); +INSERT INTO t3 VALUES ( 'dd', 33 ); +SELECT * FROM t1 FULL JOIN t2 USING (name) FULL JOIN t3 USING (name) +ORDER BY name,t1.n, t2.n, t3.n; + name | n | n | n +------+----+----+---- + bb | 11 | 12 | 13 + cc | | 22 | 23 + dd | | | 33 + ee | | 42 | +(4 rows) + +-- +-- Test interactions of join syntax and subqueries +-- +-- Basic cases (we expect planner to pull up the subquery here) +SELECT * FROM +(SELECT * FROM t2) as s2 +INNER JOIN +(SELECT * FROM t3) s3 +USING (name) +ORDER BY name, s2.n, s3.n; + name | n | n +------+----+---- + bb | 12 | 13 + cc | 22 | 23 +(2 rows) + +SELECT * FROM +(SELECT * FROM t2) as s2 +LEFT JOIN +(SELECT * FROM t3) s3 +USING (name) +ORDER BY name, s2.n, s3.n; + name | n | n +------+----+---- + bb | 12 | 13 + cc | 22 | 23 + ee | 42 | +(3 rows) + +SELECT * FROM +(SELECT * FROM t2) as s2 +FULL JOIN +(SELECT * FROM t3) s3 +USING (name) +ORDER BY name, s2.n, s3.n; + name | n | n +------+----+---- + bb | 12 | 13 + cc | 22 | 23 + dd | | 33 + ee | 42 | +(4 rows) + +-- Cases with non-nullable expressions in subquery results; +-- make sure these go to null as expected +SELECT * FROM +(SELECT name, n as s2_n, 2 as s2_2 FROM t2) as s2 +NATURAL INNER JOIN +(SELECT name, n as s3_n, 3 as s3_2 FROM t3) s3 +ORDER BY name, s2_n, s3_n; + name | s2_n | s2_2 | s3_n | s3_2 +------+------+------+------+------ + bb | 12 | 2 | 13 | 3 + cc | 22 | 2 | 23 | 3 +(2 rows) + +SELECT * FROM +(SELECT name, n as s2_n, 2 as s2_2 FROM t2) as s2 +NATURAL LEFT JOIN +(SELECT name, n as s3_n, 3 as s3_2 FROM t3) s3 +ORDER BY name, s2_n, s3_n; + name | s2_n | s2_2 | s3_n | s3_2 +------+------+------+------+------ + bb | 12 | 2 | 13 | 3 + cc | 22 | 2 | 23 | 3 + ee | 42 | 2 | | +(3 rows) + +SELECT * FROM +(SELECT name, n as s2_n, 2 as s2_2 FROM t2) as s2 +NATURAL FULL JOIN +(SELECT name, n as s3_n, 3 as s3_2 FROM t3) s3 +ORDER BY name, s2_n, s3_n; + name | s2_n | s2_2 | s3_n | s3_2 +------+------+------+------+------ + bb | 12 | 2 | 13 | 3 + cc | 22 | 2 | 23 | 3 + dd | | | 33 | 3 + ee | 42 | 2 | | +(4 rows) + +SELECT * FROM +(SELECT name, n as s1_n, 1 as s1_1 FROM t1) as s1 +NATURAL INNER JOIN +(SELECT name, n as s2_n, 2 as s2_2 FROM t2) as s2 +NATURAL INNER JOIN +(SELECT name, n as s3_n, 3 as s3_2 FROM t3) s3 +ORDER BY name, s1_n, s2_n, s3_n; + name | s1_n | s1_1 | s2_n | s2_2 | s3_n | s3_2 +------+------+------+------+------+------+------ + bb | 11 | 1 | 12 | 2 | 13 | 3 +(1 row) + +SELECT * FROM +(SELECT name, n as s1_n, 1 as s1_1 FROM t1) as s1 +NATURAL FULL JOIN +(SELECT name, n as s2_n, 2 as s2_2 FROM t2) as s2 +NATURAL FULL JOIN +(SELECT name, n as s3_n, 3 as s3_2 FROM t3) s3 
+ORDER BY name, s1_n, s2_n, s3_n; + name | s1_n | s1_1 | s2_n | s2_2 | s3_n | s3_2 +------+------+------+------+------+------+------ + bb | 11 | 1 | 12 | 2 | 13 | 3 + cc | | | 22 | 2 | 23 | 3 + dd | | | | | 33 | 3 + ee | | | 42 | 2 | | +(4 rows) + +SELECT * FROM +(SELECT name, n as s1_n FROM t1) as s1 +NATURAL FULL JOIN + (SELECT * FROM + (SELECT name, n as s2_n FROM t2) as s2 + NATURAL FULL JOIN + (SELECT name, n as s3_n FROM t3) as s3 + ) ss2 + ORDER BY name, s1_n, s2_n, s3_n; + name | s1_n | s2_n | s3_n +------+------+------+------ + bb | 11 | 12 | 13 + cc | | 22 | 23 + dd | | | 33 + ee | | 42 | +(4 rows) + +SELECT * FROM +(SELECT name, n as s1_n FROM t1) as s1 +NATURAL FULL JOIN + (SELECT * FROM + (SELECT name, n as s2_n, 2 as s2_2 FROM t2) as s2 + NATURAL FULL JOIN + (SELECT name, n as s3_n FROM t3) as s3 + ) ss2 + ORDER BY name, s1_n, s2_n, s3_n; + name | s1_n | s2_n | s2_2 | s3_n +------+------+------+------+------ + bb | 11 | 12 | 2 | 13 + cc | | 22 | 2 | 23 + dd | | | | 33 + ee | | 42 | 2 | +(4 rows) + +-- Test for propagation of nullability constraints into sub-joins +create temp table x (x1 int, x2 int); +insert into x values (1,11); +insert into x values (2,22); +insert into x values (3,null); +insert into x values (4,44); +insert into x values (5,null); +create temp table y (y1 int, y2 int); +insert into y values (1,111); +insert into y values (2,222); +insert into y values (3,333); +insert into y values (4,null); +select * from x ORDER BY x1; + x1 | x2 +----+---- + 1 | 11 + 2 | 22 + 3 | + 4 | 44 + 5 | +(5 rows) + +select * from y ORDER BY y1; + y1 | y2 +----+----- + 1 | 111 + 2 | 222 + 3 | 333 + 4 | +(4 rows) + +select * from x left join y on (x1 = y1 and x2 is not null) ORDER BY x1, x2, y1, y2; + x1 | x2 | y1 | y2 +----+----+----+----- + 1 | 11 | 1 | 111 + 2 | 22 | 2 | 222 + 3 | | | + 4 | 44 | 4 | + 5 | | | +(5 rows) + +select * from x left join y on (x1 = y1 and y2 is not null) ORDER BY x1, x2, y1, y2; + x1 | x2 | y1 | y2 +----+----+----+----- + 1 | 11 | 1 | 111 + 2 | 22 | 2 | 222 + 3 | | 3 | 333 + 4 | 44 | | + 5 | | | +(5 rows) + +select * from (x left join y on (x1 = y1)) left join x xx(xx1,xx2) +on (x1 = xx1) ORDER BY x1, x2, y1, y2; + x1 | x2 | y1 | y2 | xx1 | xx2 +----+----+----+-----+-----+----- + 1 | 11 | 1 | 111 | 1 | 11 + 2 | 22 | 2 | 222 | 2 | 22 + 3 | | 3 | 333 | 3 | + 4 | 44 | 4 | | 4 | 44 + 5 | | | | 5 | +(5 rows) + +select * from (x left join y on (x1 = y1)) left join x xx(xx1,xx2) +on (x1 = xx1 and x2 is not null) ORDER BY x1, x2, y1, y2; + x1 | x2 | y1 | y2 | xx1 | xx2 +----+----+----+-----+-----+----- + 1 | 11 | 1 | 111 | 1 | 11 + 2 | 22 | 2 | 222 | 2 | 22 + 3 | | 3 | 333 | | + 4 | 44 | 4 | | 4 | 44 + 5 | | | | | +(5 rows) + +select * from (x left join y on (x1 = y1)) left join x xx(xx1,xx2) +on (x1 = xx1 and y2 is not null) ORDER BY x1, x2, y1, y2; + x1 | x2 | y1 | y2 | xx1 | xx2 +----+----+----+-----+-----+----- + 1 | 11 | 1 | 111 | 1 | 11 + 2 | 22 | 2 | 222 | 2 | 22 + 3 | | 3 | 333 | 3 | + 4 | 44 | 4 | | | + 5 | | | | | +(5 rows) + +select * from (x left join y on (x1 = y1)) left join x xx(xx1,xx2) +on (x1 = xx1 and xx2 is not null) ORDER BY x1, x2, y1, y2; + x1 | x2 | y1 | y2 | xx1 | xx2 +----+----+----+-----+-----+----- + 1 | 11 | 1 | 111 | 1 | 11 + 2 | 22 | 2 | 222 | 2 | 22 + 3 | | 3 | 333 | | + 4 | 44 | 4 | | 4 | 44 + 5 | | | | | +(5 rows) + +-- these should NOT give the same answers as above +select * from (x left join y on (x1 = y1)) left join x xx(xx1,xx2) +on (x1 = xx1) where (x2 is not null) +ORDER BY x1, x2, y1, y2; + x1 | x2 | y1 | y2 | xx1 | 
xx2 +----+----+----+-----+-----+----- + 1 | 11 | 1 | 111 | 1 | 11 + 2 | 22 | 2 | 222 | 2 | 22 + 4 | 44 | 4 | | 4 | 44 +(3 rows) + +select * from (x left join y on (x1 = y1)) left join x xx(xx1,xx2) +on (x1 = xx1) where (y2 is not null) +ORDER BY x1, x2, y1, y2; + x1 | x2 | y1 | y2 | xx1 | xx2 +----+----+----+-----+-----+----- + 1 | 11 | 1 | 111 | 1 | 11 + 2 | 22 | 2 | 222 | 2 | 22 + 3 | | 3 | 333 | 3 | +(3 rows) + +select * from (x left join y on (x1 = y1)) left join x xx(xx1,xx2) +on (x1 = xx1) where (xx2 is not null) +ORDER BY x1, x2, y1, y2; + x1 | x2 | y1 | y2 | xx1 | xx2 +----+----+----+-----+-----+----- + 1 | 11 | 1 | 111 | 1 | 11 + 2 | 22 | 2 | 222 | 2 | 22 + 4 | 44 | 4 | | 4 | 44 +(3 rows) + +-- +-- regression test: check for bug with propagation of implied equality +-- to outside an IN +-- +select count(*) from tenk1 a where unique1 in + (select unique1 from tenk1 b join tenk1 c using (unique1) + where b.unique2 = 42); + count +------- + 1 +(1 row) + +-- +-- regression test: check for failure to generate a plan with multiple +-- degenerate IN clauses +-- +select count(*) from tenk1 x where + x.unique1 in (select a.f1 from int4_tbl a,float8_tbl b where a.f1=b.f1) and + x.unique1 = 0 and + x.unique1 in (select aa.f1 from int4_tbl aa,float8_tbl bb where aa.f1=bb.f1); + count +------- + 1 +(1 row) + +-- try that with GEQO too +begin; +set geqo = on; +set geqo_threshold = 2; +select count(*) from tenk1 x where + x.unique1 in (select a.f1 from int4_tbl a,float8_tbl b where a.f1=b.f1) and + x.unique1 = 0 and + x.unique1 in (select aa.f1 from int4_tbl aa,float8_tbl bb where aa.f1=bb.f1); + count +------- + 1 +(1 row) + +rollback; +-- +-- regression test: be sure we cope with proven-dummy append rels +-- +explain (costs off) +select aa, bb, unique1, unique1 + from tenk1 right join b on aa = unique1 + where bb < bb and bb is null; + QUERY PLAN +-------------------------- + Result + One-Time Filter: false +(2 rows) + +select aa, bb, unique1, unique1 + from tenk1 right join b on aa = unique1 + where bb < bb and bb is null; + aa | bb | unique1 | unique1 +----+----+---------+--------- +(0 rows) + +-- +-- regression test: check handling of empty-FROM subquery underneath outer join +-- +set enable_nestloop to off; +explain (costs off) +select * from int8_tbl i1 left join (int8_tbl i2 join + (select 123 as x) ss on i2.q1 = x) on i1.q2 = i2.q2 +order by 1, 2; + QUERY PLAN +------------------------------------------------------------------ + Sort + Sort Key: i1.q1, i1.q2 + -> Hash Left Join + Hash Cond: (i1.q2 = i2.q2) + -> Remote Subquery Scan on all (datanode_1) + -> Seq Scan on int8_tbl i1 + -> Hash + -> Hash Join + Hash Cond: (i2.q1 = (123)) + -> Remote Subquery Scan on all (datanode_1) + -> Seq Scan on int8_tbl i2 + -> Hash + -> Result +(13 rows) + +select * from int8_tbl i1 left join (int8_tbl i2 join + (select 123 as x) ss on i2.q1 = x) on i1.q2 = i2.q2 +order by 1, 2; + q1 | q2 | q1 | q2 | x +------------------+-------------------+-----+------------------+----- + 123 | 456 | 123 | 456 | 123 + 123 | 4567890123456789 | 123 | 4567890123456789 | 123 + 4567890123456789 | -4567890123456789 | | | + 4567890123456789 | 123 | | | + 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 | 123 +(5 rows) + +reset enable_nestloop; +-- +-- regression test: check a case where join_clause_is_movable_into() gives +-- an imprecise result, causing an assertion failure +-- +select count(*) +from + (select t3.tenthous as x1, coalesce(t1.stringu1, t2.stringu1) as x2 + from tenk1 t1 + left join tenk1 t2 on 
t1.unique1 = t2.unique1 + join tenk1 t3 on t1.unique2 = t3.unique2) ss, + tenk1 t4, + tenk1 t5 +where t4.thousand = t5.unique1 and ss.x1 = t4.tenthous and ss.x2 = t5.stringu1; + count +------- + 1000 +(1 row) + +-- +-- regression test: check a case where we formerly missed including an EC +-- enforcement clause because it was expected to be handled at scan level +-- +explain (costs off) +select a.f1, b.f1, t.thousand, t.tenthous from + tenk1 t, + (select sum(f1)+1 as f1 from int4_tbl i4a) a, + (select sum(f1) as f1 from int4_tbl i4b) b +where b.f1 = t.thousand and a.f1 = b.f1 and (a.f1+b.f1+999) = t.tenthous; + QUERY PLAN +----------------------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Nested Loop + -> Aggregate + -> Seq Scan on int4_tbl i4b + -> Nested Loop + Join Filter: ((sum(i4b.f1)) = ((sum(i4a.f1) + 1))) + -> Aggregate + -> Seq Scan on int4_tbl i4a + -> Index Only Scan using tenk1_thous_tenthous on tenk1 t + Index Cond: ((thousand = (sum(i4b.f1))) AND (tenthous = ((((sum(i4a.f1) + 1)) + (sum(i4b.f1))) + 999))) +(10 rows) + +select a.f1, b.f1, t.thousand, t.tenthous from + tenk1 t, + (select sum(f1)+1 as f1 from int4_tbl i4a) a, + (select sum(f1) as f1 from int4_tbl i4b) b +where b.f1 = t.thousand and a.f1 = b.f1 and (a.f1+b.f1+999) = t.tenthous; + f1 | f1 | thousand | tenthous +----+----+----------+---------- +(0 rows) + +-- +-- Clean up +-- +DROP TABLE t1; +DROP TABLE t2; +DROP TABLE t3; +DROP TABLE J1_TBL; +DROP TABLE J2_TBL; +-- Both DELETE and UPDATE allow the specification of additional tables +-- to "join" against to determine which rows should be modified. +CREATE TEMP TABLE t1 (a int, b int); +CREATE TEMP TABLE t2 (a int, b int); +CREATE TEMP TABLE t3 (x int, y int); +INSERT INTO t1 VALUES (5, 10); +INSERT INTO t1 VALUES (15, 20); +INSERT INTO t1 VALUES (100, 100); +INSERT INTO t1 VALUES (200, 1000); +INSERT INTO t2 VALUES (200, 2000); +INSERT INTO t3 VALUES (5, 20); +INSERT INTO t3 VALUES (6, 7); +INSERT INTO t3 VALUES (7, 8); +INSERT INTO t3 VALUES (500, 100); +DELETE FROM t3 USING t1 table1 WHERE t3.x = table1.a; +SELECT * FROM t3 ORDER By x, y; + x | y +-----+----- + 6 | 7 + 7 | 8 + 500 | 100 +(3 rows) + +DELETE FROM t3 USING t1 JOIN t2 USING (a) WHERE t3.x > t1.a; +ERROR: could not plan this distributed delete +DETAIL: correlated or complex DELETE is currently not supported in Postgres-XL. +SELECT * FROM t3 ORDER By x, y; + x | y +-----+----- + 6 | 7 + 7 | 8 + 500 | 100 +(3 rows) + +DELETE FROM t3 USING t3 t3_other WHERE t3.x = t3_other.x AND t3.y = t3_other.y; +SELECT * FROM t3 ORDER By x, y; + x | y +---+--- +(0 rows) + +-- Test join against inheritance tree +create temp table t2a () inherits (t2); +insert into t2a values (200, 2001); +select * from t1 left join t2 on (t1.a = t2.a) order by 1,2,3,4; + a | b | a | b +-----+------+-----+------ + 5 | 10 | | + 15 | 20 | | + 100 | 100 | | + 200 | 1000 | 200 | 2000 + 200 | 1000 | 200 | 2001 +(5 rows) + +-- Test matching of column name with wrong alias +select t1.x from t1 join t3 on (t1.a = t3.x); +ERROR: column t1.x does not exist +LINE 1: select t1.x from t1 join t3 on (t1.a = t3.x); + ^ +HINT: Perhaps you meant to reference the column "t3.x". 
+-- +-- regression test for 8.1 merge right join bug +-- +CREATE TEMP TABLE tt1 ( tt1_id int4, joincol int4 ); +INSERT INTO tt1 VALUES (1, 11); +INSERT INTO tt1 VALUES (2, NULL); +CREATE TEMP TABLE tt2 ( tt2_id int4, joincol int4 ); +INSERT INTO tt2 VALUES (21, 11); +INSERT INTO tt2 VALUES (22, 11); +set enable_hashjoin to off; +set enable_nestloop to off; +-- these should give the same results +select tt1.*, tt2.* from tt1 left join tt2 on tt1.joincol = tt2.joincol + ORDER BY tt1_id, tt2_id; + tt1_id | joincol | tt2_id | joincol +--------+---------+--------+--------- + 1 | 11 | 21 | 11 + 1 | 11 | 22 | 11 + 2 | | | +(3 rows) + +select tt1.*, tt2.* from tt2 right join tt1 on tt1.joincol = tt2.joincol + ORDER BY tt1_id, tt2_id; + tt1_id | joincol | tt2_id | joincol +--------+---------+--------+--------- + 1 | 11 | 21 | 11 + 1 | 11 | 22 | 11 + 2 | | | +(3 rows) + +reset enable_hashjoin; +reset enable_nestloop; +-- +-- regression test for bug #13908 (hash join with skew tuples & nbatch increase) +-- +set work_mem to '64kB'; +set enable_mergejoin to off; +explain (costs off) +select count(*) from tenk1 a, tenk1 b + where a.hundred = b.thousand and (b.fivethous % 10) < 10; + QUERY PLAN +----------------------------------------------------------------------------------- + Finalize Aggregate + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Partial Aggregate + -> Hash Join + Hash Cond: (a.hundred = b.thousand) + -> Index Only Scan using tenk1_hundred on tenk1 a + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on tenk1 b + Filter: ((fivethous % 10) < 10) +(10 rows) + +select count(*) from tenk1 a, tenk1 b + where a.hundred = b.thousand and (b.fivethous % 10) < 10; + count +-------- + 100000 +(1 row) + +reset work_mem; +reset enable_mergejoin; +-- +-- regression test for 8.2 bug with improper re-ordering of left joins +-- +create temp table tt3(f1 int, f2 text); +insert into tt3 select x, repeat('xyzzy', 100) from generate_series(1,10000) x; +create index tt3i on tt3(f1); +analyze tt3; +create temp table tt4(f1 int); +insert into tt4 values (0),(1),(9999); +analyze tt4; +SELECT a.f1 +FROM tt4 a +LEFT JOIN ( + SELECT b.f1 + FROM tt3 b LEFT JOIN tt3 c ON (b.f1 = c.f1) + WHERE c.f1 IS NULL +) AS d ON (a.f1 = d.f1) +WHERE d.f1 IS NULL ORDER BY f1; + f1 +------ + 0 + 1 + 9999 +(3 rows) + +-- +-- regression test for proper handling of outer joins within antijoins +-- +create temp table tt4x(c1 int, c2 int, c3 int); +explain (costs off) +select * from tt4x t1 +where not exists ( + select 1 from tt4x t2 + left join tt4x t3 on t2.c3 = t3.c1 + left join ( select t5.c1 as c1 + from tt4x t4 left join tt4x t5 on t4.c2 = t5.c1 + ) a1 on t3.c2 = a1.c1 + where t1.c1 = t2.c2 +); + QUERY PLAN +----------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Hash Anti Join + Hash Cond: (t1.c1 = t2.c2) + -> Seq Scan on tt4x t1 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: c2 + -> Merge Right Join + Merge Cond: (t5.c1 = t3.c2) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: c1 + -> Merge Join + Merge Cond: (t4.c2 = t5.c1) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: c2 + -> Sort + Sort Key: t4.c2 + -> Seq Scan on tt4x t4 + -> Sort + Sort Key: t5.c1 + -> Seq Scan on tt4x t5 + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + 
Distribute results by H: c2 + -> Sort + Sort Key: t3.c2 + -> Hash Left Join + Hash Cond: (t2.c3 = t3.c1) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: c3 + -> Seq Scan on tt4x t2 + -> Hash + -> Seq Scan on tt4x t3 +(33 rows) + +-- +-- regression test for problems of the sort depicted in bug #3494 +-- +create temp table tt5(f1 int, f2 int); +create temp table tt6(f1 int, f2 int); +insert into tt5 values(1, 10); +insert into tt5 values(1, 11); +insert into tt6 values(1, 9); +insert into tt6 values(1, 2); +insert into tt6 values(2, 9); +select * from tt5,tt6 where tt5.f1 = tt6.f1 and tt5.f1 = tt5.f2 - tt6.f2 + ORDER BY tt5.f1, tt5.f2, tt6.f1, tt6.f2; + f1 | f2 | f1 | f2 +----+----+----+---- + 1 | 10 | 1 | 9 +(1 row) + +-- +-- regression test for problems of the sort depicted in bug #3588 +-- +create temp table xx (pkxx int); +create temp table yy (pkyy int, pkxx int); +insert into xx values (1); +insert into xx values (2); +insert into xx values (3); +insert into yy values (101, 1); +insert into yy values (201, 2); +insert into yy values (301, NULL); +select yy.pkyy as yy_pkyy, yy.pkxx as yy_pkxx, yya.pkyy as yya_pkyy, + xxa.pkxx as xxa_pkxx, xxb.pkxx as xxb_pkxx +from yy + left join (SELECT * FROM yy where pkyy = 101) as yya ON yy.pkyy = yya.pkyy + left join xx xxa on yya.pkxx = xxa.pkxx + left join xx xxb on coalesce (xxa.pkxx, 1) = xxb.pkxx + ORDER BY yy_pkyy, yy_pkxx, yya_pkyy, xxa_pkxx, xxb_pkxx; + yy_pkyy | yy_pkxx | yya_pkyy | xxa_pkxx | xxb_pkxx +---------+---------+----------+----------+---------- + 101 | 1 | 101 | 1 | 1 + 201 | 2 | | | 1 + 301 | | | | 1 +(3 rows) + +-- +-- regression test for improper pushing of constants across outer-join clauses +-- (as seen in early 8.2.x releases) +-- +create temp table zt1 (f1 int primary key); +create temp table zt2 (f2 int primary key); +create temp table zt3 (f3 int primary key); +insert into zt1 values(53); +insert into zt2 values(53); +select * from + zt2 left join zt3 on (f2 = f3) + left join zt1 on (f3 = f1) +where f2 = 53 +ORDER BY f1, f2, f3; + f2 | f3 | f1 +----+----+---- + 53 | | +(1 row) + +create temp view zv1 as select *,'dummy'::text AS junk from zt1; +select * from + zt2 left join zt3 on (f2 = f3) + left join zv1 on (f3 = f1) +where f2 = 53 +ORDER BY f1, f2, f3; + f2 | f3 | f1 | junk +----+----+----+------ + 53 | | | +(1 row) + +-- +-- regression test for improper extraction of OR indexqual conditions +-- (as seen in early 8.3.x releases) +-- +select a.unique2, a.ten, b.tenthous, b.unique2, b.hundred +from tenk1 a left join tenk1 b on a.unique2 = b.tenthous +where a.unique1 = 42 and + ((b.unique2 is null and a.ten = 2) or b.hundred = 3); + unique2 | ten | tenthous | unique2 | hundred +---------+-----+----------+---------+--------- +(0 rows) + +-- +-- test proper positioning of one-time quals in EXISTS (8.4devel bug) +-- +prepare foo(bool) as + select count(*) from tenk1 a left join tenk1 b + on (a.unique2 = b.unique1 and exists + (select 1 from tenk1 c where c.thousand = b.unique2 and $1)); +execute foo(true); + count +------- + 10000 +(1 row) + +execute foo(false); + count +------- + 10000 +(1 row) + +-- +-- test for sane behavior with noncanonical merge clauses, per bug #4926 +-- +begin; +set enable_mergejoin = 1; +set enable_hashjoin = 0; +set enable_nestloop = 0; +create temp table a (i integer); +create temp table b (x integer, y integer); +select * from a left join b on i = x and i = y and x = i; + i | x | y +---+---+--- +(0 rows) + +rollback; +-- +-- test NULL behavior of whole-row 
Vars, per bug #5025 +-- +select t1.q2, count(t2.*) +from int8_tbl t1 left join int8_tbl t2 on (t1.q2 = t2.q1) +group by t1.q2 order by 1; + q2 | count +-------------------+------- + -4567890123456789 | 0 + 123 | 2 + 456 | 0 + 4567890123456789 | 6 +(4 rows) + +select t1.q2, count(t2.*) +from int8_tbl t1 left join (select * from int8_tbl) t2 on (t1.q2 = t2.q1) +group by t1.q2 order by 1; + q2 | count +-------------------+------- + -4567890123456789 | 0 + 123 | 2 + 456 | 0 + 4567890123456789 | 6 +(4 rows) + +select t1.q2, count(t2.*) +from int8_tbl t1 left join (select * from int8_tbl offset 0) t2 on (t1.q2 = t2.q1) +group by t1.q2 order by 1; + q2 | count +-------------------+------- + -4567890123456789 | 0 + 123 | 2 + 456 | 0 + 4567890123456789 | 6 +(4 rows) + +select t1.q2, count(t2.*) +from int8_tbl t1 left join + (select q1, case when q2=1 then 1 else q2 end as q2 from int8_tbl) t2 + on (t1.q2 = t2.q1) +group by t1.q2 order by 1; + q2 | count +-------------------+------- + -4567890123456789 | 0 + 123 | 2 + 456 | 0 + 4567890123456789 | 6 +(4 rows) + +-- +-- test incorrect failure to NULL pulled-up subexpressions +-- +begin; +create temp table a ( + code char not null, + constraint a_pk primary key (code) +); +create temp table b ( + a char not null, + num integer not null, + constraint b_pk primary key (a, num) +); +create temp table c ( + name char not null, + a char, + constraint c_pk primary key (name) +); +insert into a (code) values ('p'); +insert into a (code) values ('q'); +insert into b (a, num) values ('p', 1); +insert into b (a, num) values ('p', 2); +insert into c (name, a) values ('A', 'p'); +insert into c (name, a) values ('B', 'q'); +insert into c (name, a) values ('C', null); +select c.name, ss.code, ss.b_cnt, ss.const +from c left join + (select a.code, coalesce(b_grp.cnt, 0) as b_cnt, -1 as const + from a left join + (select count(1) as cnt, b.a from b group by b.a) as b_grp + on a.code = b_grp.a + ) as ss + on (c.a = ss.code) +order by c.name; + name | code | b_cnt | const +------+------+-------+------- + A | p | 2 | -1 + B | q | 0 | -1 + C | | | +(3 rows) + +rollback; +-- +-- test incorrect handling of placeholders that only appear in targetlists, +-- per bug #6154 +-- +SELECT * FROM +( SELECT 1 as key1 ) sub1 +LEFT JOIN +( SELECT sub3.key3, sub4.value2, COALESCE(sub4.value2, 66) as value3 FROM + ( SELECT 1 as key3 ) sub3 + LEFT JOIN + ( SELECT sub5.key5, COALESCE(sub6.value1, 1) as value2 FROM + ( SELECT 1 as key5 ) sub5 + LEFT JOIN + ( SELECT 2 as key6, 42 as value1 ) sub6 + ON sub5.key5 = sub6.key6 + ) sub4 + ON sub4.key5 = sub3.key3 +) sub2 +ON sub1.key1 = sub2.key3; + key1 | key3 | value2 | value3 +------+------+--------+-------- + 1 | 1 | 1 | 1 +(1 row) + +-- test the path using join aliases, too +SELECT * FROM +( SELECT 1 as key1 ) sub1 +LEFT JOIN +( SELECT sub3.key3, value2, COALESCE(value2, 66) as value3 FROM + ( SELECT 1 as key3 ) sub3 + LEFT JOIN + ( SELECT sub5.key5, COALESCE(sub6.value1, 1) as value2 FROM + ( SELECT 1 as key5 ) sub5 + LEFT JOIN + ( SELECT 2 as key6, 42 as value1 ) sub6 + ON sub5.key5 = sub6.key6 + ) sub4 + ON sub4.key5 = sub3.key3 +) sub2 +ON sub1.key1 = sub2.key3; + key1 | key3 | value2 | value3 +------+------+--------+-------- + 1 | 1 | 1 | 1 +(1 row) + +-- +-- test case where a PlaceHolderVar is used as a nestloop parameter +-- +EXPLAIN (NUM_NODES OFF, NODES OFF, COSTS OFF) +SELECT qq, unique1 + FROM + ( SELECT COALESCE(q1, 0) AS qq FROM int8_tbl a ) AS ss1 + FULL OUTER JOIN + ( SELECT COALESCE(q2, -1) AS qq FROM int8_tbl b ) AS ss2 + 
USING (qq) + INNER JOIN tenk1 c ON qq = unique2; + QUERY PLAN +--------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + -> Nested Loop + -> Hash Full Join + Hash Cond: (COALESCE(a.q1, '0'::bigint) = COALESCE(b.q2, '-1'::bigint)) + -> Seq Scan on int8_tbl a + -> Hash + -> Seq Scan on int8_tbl b + -> Index Scan using tenk1_unique2 on tenk1 c + Index Cond: (unique2 = COALESCE((COALESCE(a.q1, '0'::bigint)), (COALESCE(b.q2, '-1'::bigint)))) +(9 rows) + +SELECT qq, unique1 + FROM + ( SELECT COALESCE(q1, 0) AS qq FROM int8_tbl a ) AS ss1 + FULL OUTER JOIN + ( SELECT COALESCE(q2, -1) AS qq FROM int8_tbl b ) AS ss2 + USING (qq) + INNER JOIN tenk1 c ON qq = unique2; + qq | unique1 +-----+--------- + 123 | 4596 + 123 | 4596 + 456 | 7318 +(3 rows) + +-- +-- nested nestloops can require nested PlaceHolderVars +-- +create temp table nt1 ( + id int primary key, + a1 boolean, + a2 boolean +); +create temp table nt2 ( + id int primary key, + nt1_id int, + b1 boolean, + b2 boolean +); +create temp table nt3 ( + id int primary key, + nt2_id int, + c1 boolean +); +insert into nt1 values (1,true,true); +insert into nt1 values (2,true,false); +insert into nt1 values (3,false,false); +insert into nt2 values (1,1,true,true); +insert into nt2 values (2,2,true,false); +insert into nt2 values (3,3,false,false); +insert into nt3 values (1,1,true); +insert into nt3 values (2,2,false); +insert into nt3 values (3,3,true); +explain(num_nodes off, nodes off, costs off) +select nt3.id +from nt3 as nt3 + left join + (select nt2.*, (nt2.b1 and ss1.a3) AS b3 + from nt2 as nt2 + left join + (select nt1.*, (nt1.id is not null) as a3 from nt1) as ss1 + on ss1.id = nt2.nt1_id + ) as ss2 + on ss2.id = nt3.nt2_id +where nt3.id = 1 and ss2.b3; + QUERY PLAN +---------------------------------------------------------------- + Remote Subquery Scan on all + -> Nested Loop + -> Remote Subquery Scan on all + Distribute results by H: nt1_id + -> Nested Loop + -> Remote Subquery Scan on all + Distribute results by H: nt2_id + -> Index Scan using nt3_pkey on nt3 + Index Cond: (id = 1) + -> Index Scan using nt2_pkey on nt2 + Index Cond: (id = nt3.nt2_id) + -> Index Only Scan using nt1_pkey on nt1 + Index Cond: (id = nt2.nt1_id) + Filter: (nt2.b1 AND (id IS NOT NULL)) +(14 rows) + +select nt3.id +from nt3 as nt3 + left join + (select nt2.*, (nt2.b1 and ss1.a3) AS b3 + from nt2 as nt2 + left join + (select nt1.*, (nt1.id is not null) as a3 from nt1) as ss1 + on ss1.id = nt2.nt1_id + ) as ss2 + on ss2.id = nt3.nt2_id +where nt3.id = 1 and ss2.b3; + id +---- + 1 +(1 row) + +-- +-- test case where a PlaceHolderVar is propagated into a subquery +-- +explain (num_nodes off, nodes off, costs off) +select * from + int8_tbl t1 left join + (select q1 as x, 42 as y from int8_tbl t2) ss + on t1.q2 = ss.x +where + 1 = (select 1 from int8_tbl t3 where ss.y is not null limit 1) +order by 1,2; + QUERY PLAN +----------------------------------------------------------------------------- + Remote Subquery Scan on all + -> Sort + Sort Key: t1.q1, t1.q2 + -> Hash Left Join + Hash Cond: (t1.q2 = t2.q1) + Filter: (1 = (SubPlan 1)) + -> Seq Scan on int8_tbl t1 + -> Hash + -> Seq Scan on int8_tbl t2 + SubPlan 1 + -> Limit + -> Remote Subquery Scan on all + -> Limit + -> Result + One-Time Filter: ((42) IS NOT NULL) + -> Seq Scan on int8_tbl t3 +(16 rows) + +select * from + int8_tbl t1 left join + (select q1 as x, 42 as y from int8_tbl t2) ss + on t1.q2 = ss.x +where + 1 = (select 1 
from int8_tbl t3 where ss.y is not null limit 1) +order by 1,2; + q1 | q2 | x | y +------------------+------------------+------------------+---- + 123 | 4567890123456789 | 4567890123456789 | 42 + 123 | 4567890123456789 | 4567890123456789 | 42 + 123 | 4567890123456789 | 4567890123456789 | 42 + 4567890123456789 | 123 | 123 | 42 + 4567890123456789 | 123 | 123 | 42 + 4567890123456789 | 4567890123456789 | 4567890123456789 | 42 + 4567890123456789 | 4567890123456789 | 4567890123456789 | 42 + 4567890123456789 | 4567890123456789 | 4567890123456789 | 42 +(8 rows) + +-- +-- test the corner cases FULL JOIN ON TRUE and FULL JOIN ON FALSE +-- +select * from int4_tbl a full join int4_tbl b on true order by 1,2; + f1 | f1 +-------------+------------- + -2147483647 | -2147483647 + -2147483647 | -123456 + -2147483647 | 0 + -2147483647 | 123456 + -2147483647 | 2147483647 + -123456 | -2147483647 + -123456 | -123456 + -123456 | 0 + -123456 | 123456 + -123456 | 2147483647 + 0 | -2147483647 + 0 | -123456 + 0 | 0 + 0 | 123456 + 0 | 2147483647 + 123456 | -2147483647 + 123456 | -123456 + 123456 | 0 + 123456 | 123456 + 123456 | 2147483647 + 2147483647 | -2147483647 + 2147483647 | -123456 + 2147483647 | 0 + 2147483647 | 123456 + 2147483647 | 2147483647 +(25 rows) + +select * from int4_tbl a full join int4_tbl b on false order by 1,2; + f1 | f1 +-------------+------------- + -2147483647 | + -123456 | + 0 | + 123456 | + 2147483647 | + | -2147483647 + | -123456 + | 0 + | 123456 + | 2147483647 +(10 rows) + +-- +-- test for ability to use a cartesian join when necessary +-- +explain (num_nodes off, nodes off, costs off) +select * from + tenk1 join int4_tbl on f1 = twothousand, + int4(sin(1)) q1, + int4(sin(0)) q2 +where q1 = thousand or q2 = thousand; + QUERY PLAN +------------------------------------------------------------------------------------ + Nested Loop + Join Filter: (tenk1.twothousand = int4_tbl.f1) + -> Nested Loop + -> Nested Loop + -> Function Scan on q1 + -> Function Scan on q2 + -> Materialize + -> Remote Subquery Scan on all + -> Bitmap Heap Scan on tenk1 + Recheck Cond: ((q1.q1 = thousand) OR (q2.q2 = thousand)) + -> BitmapOr + -> Bitmap Index Scan on tenk1_thous_tenthous + Index Cond: (q1.q1 = thousand) + -> Bitmap Index Scan on tenk1_thous_tenthous + Index Cond: (q2.q2 = thousand) + -> Materialize + -> Remote Subquery Scan on all + -> Seq Scan on int4_tbl +(18 rows) + +explain (num_nodes off, nodes off, costs off) +select * from + tenk1 join int4_tbl on f1 = twothousand, + int4(sin(1)) q1, + int4(sin(0)) q2 +where thousand = (q1 + q2); + QUERY PLAN +-------------------------------------------------------------------------- + Nested Loop + Join Filter: (tenk1.twothousand = int4_tbl.f1) + -> Nested Loop + -> Nested Loop + -> Function Scan on q1 + -> Function Scan on q2 + -> Materialize + -> Remote Subquery Scan on all + -> Bitmap Heap Scan on tenk1 + Recheck Cond: (thousand = (q1.q1 + q2.q2)) + -> Bitmap Index Scan on tenk1_thous_tenthous + Index Cond: (thousand = (q1.q1 + q2.q2)) + -> Materialize + -> Remote Subquery Scan on all + -> Seq Scan on int4_tbl +(15 rows) + +-- +-- test ability to generate a suitable plan for a star-schema query +-- +explain (costs off) +select * from + tenk1, int8_tbl a, int8_tbl b +where thousand = a.q1 and tenthous = b.q1 and a.q2 = 1 and b.q2 = 2; + QUERY PLAN +--------------------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Merge Join + Merge Cond: (tenk1.thousand = a.q1) + -> Sort + Sort Key: tenk1.thousand + 
-> Merge Join + Merge Cond: (tenk1.tenthous = b.q1) + -> Sort + Sort Key: tenk1.tenthous + -> Seq Scan on tenk1 + -> Sort + Sort Key: b.q1 + -> Seq Scan on int8_tbl b + Filter: (q2 = 2) + -> Sort + Sort Key: a.q1 + -> Seq Scan on int8_tbl a + Filter: (q2 = 1) +(19 rows) + +-- +-- test a corner case in which we shouldn't apply the star-schema optimization +-- +explain (costs off, nodes off) +select t1.unique2, t1.stringu1, t2.unique1, t2.stringu2 from + tenk1 t1 + inner join int4_tbl i1 + left join (select v1.x2, v2.y1, 11 AS d1 + from (values(1,0)) v1(x1,x2) + left join (values(3,1)) v2(y1,y2) + on v1.x1 = v2.y2) subq1 + on (i1.f1 = subq1.x2) + on (t1.unique2 = subq1.d1) + left join tenk1 t2 + on (subq1.y1 = t2.unique1) +where t1.unique2 < 42 and t1.stringu1 > t2.stringu2; + QUERY PLAN +----------------------------------------------------------------------------- + Nested Loop + Join Filter: (t1.stringu1 > t2.stringu2) + -> Nested Loop + -> Nested Loop + Join Filter: ((1) = (1)) + -> Hash Join + Hash Cond: (i1.f1 = (0)) + -> Remote Subquery Scan on all + -> Seq Scan on int4_tbl i1 + -> Hash + -> Result + -> Result + -> Materialize + -> Remote Subquery Scan on all + -> Index Scan using tenk1_unique2 on tenk1 t1 + Index Cond: ((unique2 = (11)) AND (unique2 < 42)) + -> Materialize + -> Remote Subquery Scan on all + -> Index Scan using tenk1_unique1 on tenk1 t2 + Index Cond: (unique1 = (3)) +(20 rows) + +select t1.unique2, t1.stringu1, t2.unique1, t2.stringu2 from + tenk1 t1 + inner join int4_tbl i1 + left join (select v1.x2, v2.y1, 11 AS d1 + from (values(1,0)) v1(x1,x2) + left join (values(3,1)) v2(y1,y2) + on v1.x1 = v2.y2) subq1 + on (i1.f1 = subq1.x2) + on (t1.unique2 = subq1.d1) + left join tenk1 t2 + on (subq1.y1 = t2.unique1) +where t1.unique2 < 42 and t1.stringu1 > t2.stringu2; + unique2 | stringu1 | unique1 | stringu2 +---------+----------+---------+---------- + 11 | WFAAAA | 3 | LKIAAA +(1 row) + +-- variant that isn't quite a star-schema case +select ss1.d1 from + tenk1 as t1 + inner join tenk1 as t2 + on t1.tenthous = t2.ten + inner join + int8_tbl as i8 + left join int4_tbl as i4 + inner join (select 64::information_schema.cardinal_number as d1 + from tenk1 t3, + lateral (select abs(t3.unique1) + random()) ss0(x) + where t3.fivethous < 0) as ss1 + on i4.f1 = ss1.d1 + on i8.q1 = i4.f1 + on t1.tenthous = ss1.d1 +where t1.unique1 < i4.f1; + d1 +---- +(0 rows) + +-- +-- test extraction of restriction OR clauses from join OR clause +-- (we used to only do this for indexable clauses) +-- +explain (costs off) +select * from tenk1 a join tenk1 b on + (a.unique1 = 1 and b.unique1 = 2) or (a.unique2 = 3 and b.hundred = 4); + QUERY PLAN +------------------------------------------------------------------------------------------------- + Nested Loop + Join Filter: (((a.unique1 = 1) AND (b.unique1 = 2)) OR ((a.unique2 = 3) AND (b.hundred = 4))) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Bitmap Heap Scan on tenk1 b + Recheck Cond: ((unique1 = 2) OR (hundred = 4)) + -> BitmapOr + -> Bitmap Index Scan on tenk1_unique1 + Index Cond: (unique1 = 2) + -> Bitmap Index Scan on tenk1_hundred + Index Cond: (hundred = 4) + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Bitmap Heap Scan on tenk1 a + Recheck Cond: ((unique1 = 1) OR (unique2 = 3)) + -> BitmapOr + -> Bitmap Index Scan on tenk1_unique1 + Index Cond: (unique1 = 1) + -> Bitmap Index Scan on tenk1_unique2 + Index Cond: (unique2 = 3) +(19 rows) + +explain (costs off) +select * from tenk1 a join 
tenk1 b on + (a.unique1 = 1 and b.unique1 = 2) or (a.unique2 = 3 and b.ten = 4); + QUERY PLAN +--------------------------------------------------------------------------------------------- + Nested Loop + Join Filter: (((a.unique1 = 1) AND (b.unique1 = 2)) OR ((a.unique2 = 3) AND (b.ten = 4))) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on tenk1 b + Filter: ((unique1 = 2) OR (ten = 4)) + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Bitmap Heap Scan on tenk1 a + Recheck Cond: ((unique1 = 1) OR (unique2 = 3)) + -> BitmapOr + -> Bitmap Index Scan on tenk1_unique1 + Index Cond: (unique1 = 1) + -> Bitmap Index Scan on tenk1_unique2 + Index Cond: (unique2 = 3) +(14 rows) + +explain (costs off) +select * from tenk1 a join tenk1 b on + (a.unique1 = 1 and b.unique1 = 2) or + ((a.unique2 = 3 or a.unique2 = 7) and b.hundred = 4); + QUERY PLAN +---------------------------------------------------------------------------------------------------------------------- + Nested Loop + Join Filter: (((a.unique1 = 1) AND (b.unique1 = 2)) OR (((a.unique2 = 3) OR (a.unique2 = 7)) AND (b.hundred = 4))) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Bitmap Heap Scan on tenk1 b + Recheck Cond: ((unique1 = 2) OR (hundred = 4)) + -> BitmapOr + -> Bitmap Index Scan on tenk1_unique1 + Index Cond: (unique1 = 2) + -> Bitmap Index Scan on tenk1_hundred + Index Cond: (hundred = 4) + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Bitmap Heap Scan on tenk1 a + Recheck Cond: ((unique1 = 1) OR (unique2 = 3) OR (unique2 = 7)) + -> BitmapOr + -> Bitmap Index Scan on tenk1_unique1 + Index Cond: (unique1 = 1) + -> Bitmap Index Scan on tenk1_unique2 + Index Cond: (unique2 = 3) + -> Bitmap Index Scan on tenk1_unique2 + Index Cond: (unique2 = 7) +(21 rows) + +-- +-- test placement of movable quals in a parameterized join tree +-- +explain (num_nodes off, nodes off, costs off) +select * from tenk1 t1 left join + (tenk1 t2 join tenk1 t3 on t2.thousand = t3.unique2) + on t1.hundred = t2.hundred and t1.ten = t3.ten +where t1.unique1 = 1; + QUERY PLAN +-------------------------------------------------------------------------------------- + Remote Subquery Scan on all + -> Nested Loop Left Join + -> Remote Subquery Scan on all + Distribute results by H: hundred + -> Index Scan using tenk1_unique1 on tenk1 t1 + Index Cond: (unique1 = 1) + -> Materialize + -> Remote Subquery Scan on all + Distribute results by H: hundred + -> Hash Join + Hash Cond: (t3.unique2 = t2.thousand) + Join Filter: (t1.ten = t3.ten) + -> Remote Subquery Scan on all + Distribute results by H: unique2 + -> Seq Scan on tenk1 t3 + -> Hash + -> Remote Subquery Scan on all + Distribute results by H: thousand + -> Bitmap Heap Scan on tenk1 t2 + Recheck Cond: (t1.hundred = hundred) + -> Bitmap Index Scan on tenk1_hundred + Index Cond: (t1.hundred = hundred) +(22 rows) + +explain (num_nodes off, nodes off, costs off) +select * from tenk1 t1 left join + (tenk1 t2 join tenk1 t3 on t2.thousand = t3.unique2) + on t1.hundred = t2.hundred and t1.ten + t2.ten = t3.ten +where t1.unique1 = 1; + QUERY PLAN +-------------------------------------------------------------------------------------- + Remote Subquery Scan on all + -> Nested Loop Left Join + -> Remote Subquery Scan on all + Distribute results by H: hundred + -> Index Scan using tenk1_unique1 on tenk1 t1 + Index Cond: (unique1 = 1) + -> Materialize + -> Remote Subquery Scan on all + Distribute results by H: hundred + -> Hash 
Join + Hash Cond: (t3.unique2 = t2.thousand) + Join Filter: ((t1.ten + t2.ten) = t3.ten) + -> Remote Subquery Scan on all + Distribute results by H: unique2 + -> Seq Scan on tenk1 t3 + -> Hash + -> Remote Subquery Scan on all + Distribute results by H: thousand + -> Bitmap Heap Scan on tenk1 t2 + Recheck Cond: (t1.hundred = hundred) + -> Bitmap Index Scan on tenk1_hundred + Index Cond: (t1.hundred = hundred) +(22 rows) + +explain (num_nodes off, nodes off, costs off) +select count(*) from + tenk1 a join tenk1 b on a.unique1 = b.unique2 + left join tenk1 c on a.unique2 = b.unique1 and c.thousand = a.thousand + join int4_tbl on b.thousand = f1; + QUERY PLAN +------------------------------------------------------------------------------------------------------- + Finalize Aggregate + -> Remote Subquery Scan on all + -> Partial Aggregate + -> Hash Right Join + Hash Cond: (c.thousand = a.thousand) + Join Filter: (a.unique2 = b.unique1) + -> Remote Subquery Scan on all + Distribute results by H: thousand + -> Index Only Scan using tenk1_thous_tenthous on tenk1 c + -> Hash + -> Remote Subquery Scan on all + Distribute results by H: thousand + -> Nested Loop + -> Remote Subquery Scan on all + Distribute results by H: unique2 + -> Nested Loop + -> Seq Scan on int4_tbl + -> Bitmap Heap Scan on tenk1 b + Recheck Cond: (thousand = int4_tbl.f1) + -> Bitmap Index Scan on tenk1_thous_tenthous + Index Cond: (thousand = int4_tbl.f1) + -> Index Scan using tenk1_unique1 on tenk1 a + Index Cond: (unique1 = b.unique2) +(23 rows) + +select count(*) from + tenk1 a join tenk1 b on a.unique1 = b.unique2 + left join tenk1 c on a.unique2 = b.unique1 and c.thousand = a.thousand + join int4_tbl on b.thousand = f1; + count +------- + 10 +(1 row) + +explain (num_nodes off, nodes off, costs off) +select b.unique1 from + tenk1 a join tenk1 b on a.unique1 = b.unique2 + left join tenk1 c on b.unique1 = 42 and c.thousand = a.thousand + join int4_tbl i1 on b.thousand = f1 + right join int4_tbl i2 on i2.f1 = b.tenthous + order by 1; + QUERY PLAN +------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + -> Sort + Sort Key: b.unique1 + -> Hash Right Join + Hash Cond: (b.tenthous = i2.f1) + -> Remote Subquery Scan on all + Distribute results by H: tenthous + -> Hash Right Join + Hash Cond: (c.thousand = a.thousand) + Join Filter: (b.unique1 = 42) + -> Remote Subquery Scan on all + Distribute results by H: 42 + -> Index Only Scan using tenk1_thous_tenthous on tenk1 c + -> Hash + -> Remote Subquery Scan on all + Distribute results by H: unique1 + -> Nested Loop + -> Remote Subquery Scan on all + Distribute results by H: unique2 + -> Nested Loop + -> Seq Scan on int4_tbl i1 + -> Bitmap Heap Scan on tenk1 b + Recheck Cond: (thousand = i1.f1) + -> Bitmap Index Scan on tenk1_thous_tenthous + Index Cond: (thousand = i1.f1) + -> Index Scan using tenk1_unique1 on tenk1 a + Index Cond: (unique1 = b.unique2) + -> Hash + -> Remote Subquery Scan on all + Distribute results by H: f1 + -> Seq Scan on int4_tbl i2 +(31 rows) + +select b.unique1 from + tenk1 a join tenk1 b on a.unique1 = b.unique2 + left join tenk1 c on b.unique1 = 42 and c.thousand = a.thousand + join int4_tbl i1 on b.thousand = f1 + right join int4_tbl i2 on i2.f1 = b.tenthous + order by 1; + unique1 +--------- + 0 + + + + +(5 rows) + +explain (num_nodes off, nodes off, costs off) +select * from +( + select unique1, q1, coalesce(unique1, -1) + q1 as fault + from int8_tbl left join tenk1 on (q2 = 
unique2) +) ss +where fault = 122 +order by fault; + QUERY PLAN +-------------------------------------------------------------------------------- + Remote Subquery Scan on all + -> Nested Loop Left Join + Filter: ((COALESCE(tenk1.unique1, '-1'::integer) + int8_tbl.q1) = 122) + -> Remote Subquery Scan on all + Distribute results by H: q2 + -> Seq Scan on int8_tbl + -> Materialize + -> Remote Subquery Scan on all + Distribute results by H: unique2 + -> Index Scan using tenk1_unique2 on tenk1 + Index Cond: (int8_tbl.q2 = unique2) +(11 rows) + +select * from +( + select unique1, q1, coalesce(unique1, -1) + q1 as fault + from int8_tbl left join tenk1 on (q2 = unique2) +) ss +where fault = 122 +order by fault; + unique1 | q1 | fault +---------+-----+------- + | 123 | 122 +(1 row) + +-- +-- test handling of potential equivalence clauses above outer joins +-- +explain (num_nodes off, nodes off, costs off) +select q1, unique2, thousand, hundred + from int8_tbl a left join tenk1 b on q1 = unique2 + where coalesce(thousand,123) = q1 and q1 = coalesce(hundred,123); + QUERY PLAN +-------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + -> Nested Loop Left Join + Filter: ((COALESCE(b.thousand, 123) = a.q1) AND (a.q1 = COALESCE(b.hundred, 123))) + -> Remote Subquery Scan on all + Distribute results by H: q1 + -> Seq Scan on int8_tbl a + -> Materialize + -> Remote Subquery Scan on all + Distribute results by H: COALESCE(thousand, 123) + -> Index Scan using tenk1_unique2 on tenk1 b + Index Cond: (a.q1 = unique2) +(11 rows) + +select q1, unique2, thousand, hundred + from int8_tbl a left join tenk1 b on q1 = unique2 + where coalesce(thousand,123) = q1 and q1 = coalesce(hundred,123); + q1 | unique2 | thousand | hundred +----+---------+----------+--------- +(0 rows) + +explain (num_nodes off, nodes off, costs off) +select f1, unique2, case when unique2 is null then f1 else 0 end + from int4_tbl a left join tenk1 b on f1 = unique2 + where (case when unique2 is null then f1 else 0 end) = 0; + QUERY PLAN +-------------------------------------------------------------------------- + Remote Subquery Scan on all + -> Merge Right Join + Merge Cond: (b.unique2 = a.f1) + Filter: (CASE WHEN (b.unique2 IS NULL) THEN a.f1 ELSE 0 END = 0) + -> Remote Subquery Scan on all + Distribute results by H: unique2 + -> Index Only Scan using tenk1_unique2 on tenk1 b + -> Materialize + -> Remote Subquery Scan on all + Distribute results by H: f1 + -> Sort + Sort Key: a.f1 + -> Seq Scan on int4_tbl a +(13 rows) + +select f1, unique2, case when unique2 is null then f1 else 0 end + from int4_tbl a left join tenk1 b on f1 = unique2 + where (case when unique2 is null then f1 else 0 end) = 0; + f1 | unique2 | case +----+---------+------ + 0 | 0 | 0 +(1 row) + +-- +-- another case with equivalence clauses above outer joins (bug #8591) +-- +explain (costs off) +select a.unique1, b.unique1, c.unique1, coalesce(b.twothousand, a.twothousand) + from tenk1 a left join tenk1 b on b.thousand = a.unique1 left join tenk1 c on c.unique2 = coalesce(b.twothousand, a.twothousand) + where a.unique2 < 10 and coalesce(b.twothousand, a.twothousand) = 44; + QUERY PLAN +--------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Nested Loop Left Join + -> Hash Right Join + Hash Cond: (b.thousand = a.unique1) + Filter: (COALESCE(b.twothousand, a.twothousand) = 44) + -> Remote Subquery 
Scan on all (datanode_1,datanode_2) + Distribute results by H: thousand + -> Seq Scan on tenk1 b + -> Hash + -> Index Scan using tenk1_unique2 on tenk1 a + Index Cond: (unique2 < 10) + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Index Scan using tenk1_unique2 on tenk1 c + Index Cond: ((unique2 = COALESCE(b.twothousand, a.twothousand)) AND (unique2 = 44)) +(15 rows) + +select a.unique1, b.unique1, c.unique1, coalesce(b.twothousand, a.twothousand) + from tenk1 a left join tenk1 b on b.thousand = a.unique1 left join tenk1 c on c.unique2 = coalesce(b.twothousand, a.twothousand) + where a.unique2 < 10 and coalesce(b.twothousand, a.twothousand) = 44; + unique1 | unique1 | unique1 | coalesce +---------+---------+---------+---------- +(0 rows) + +-- +-- check handling of join aliases when flattening multiple levels of subquery +-- +explain (verbose, costs off) +select foo1.join_key as foo1_id, foo3.join_key AS foo3_id, bug_field from + (values (0),(1)) foo1(join_key) +left join + (select join_key, bug_field from + (select ss1.join_key, ss1.bug_field from + (select f1 as join_key, 666 as bug_field from int4_tbl i1) ss1 + ) foo2 + left join + (select unique2 as join_key from tenk1 i2) ss2 + using (join_key) + ) foo3 +using (join_key); + QUERY PLAN +-------------------------------------------------------------------------------- + Hash Right Join + Output: "*VALUES*".column1, i1.f1, (666) + Hash Cond: (i1.f1 = "*VALUES*".column1) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: i1.f1, 666 + -> Merge Right Join + Output: i1.f1, 666 + Merge Cond: (i2.unique2 = i1.f1) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: i2.unique2 + Distribute results by H: unique2 + Sort Key: i2.unique2 + -> Index Only Scan using tenk1_unique2 on public.tenk1 i2 + Output: i2.unique2 + -> Materialize + Output: i1.f1 + -> Remote Subquery Scan on all (datanode_1) + Output: i1.f1 + Distribute results by H: f1 + -> Sort + Output: i1.f1 + Sort Key: i1.f1 + -> Seq Scan on public.int4_tbl i1 + Output: i1.f1 + -> Hash + Output: "*VALUES*".column1 + -> Values Scan on "*VALUES*" + Output: "*VALUES*".column1 +(28 rows) + +select foo1.join_key as foo1_id, foo3.join_key AS foo3_id, bug_field from + (values (0),(1)) foo1(join_key) +left join + (select join_key, bug_field from + (select ss1.join_key, ss1.bug_field from + (select f1 as join_key, 666 as bug_field from int4_tbl i1) ss1 + ) foo2 + left join + (select unique2 as join_key from tenk1 i2) ss2 + using (join_key) + ) foo3 +using (join_key); + foo1_id | foo3_id | bug_field +---------+---------+----------- + 0 | 0 | 666 + 1 | | +(2 rows) + +-- +-- test successful handling of nested outer joins with degenerate join quals +-- +set enable_nestloop to on; +set enable_hashjoin to off; +set enable_mergejoin to off; +explain (verbose, costs off) +select t1.* from + text_tbl t1 + left join (select *, '***'::text as d1 from int8_tbl i8b1) b1 + left join int8_tbl i8 + left join (select *, null::int as d2 from int8_tbl i8b2) b2 + on (i8.q1 = b2.q1) + on (b2.d2 = b1.q2) + on (t1.f1 = b1.d1) + left join int4_tbl i4 + on (i8.q2 = i4.f1); + QUERY PLAN +---------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + Output: t1.f1 + -> Nested Loop Left Join + Output: t1.f1 + Join Filter: (i8.q2 = i4.f1) + -> Nested Loop Left Join + Output: t1.f1, i8.q2 + Join Filter: (t1.f1 = '***'::text) + -> Seq Scan on public.text_tbl t1 + Output: t1.f1 + -> Materialize + 
Output: i8.q2 + -> Nested Loop Left Join + Output: i8.q2 + Join Filter: ((NULL::integer) = i8b1.q2) + -> Seq Scan on public.int8_tbl i8b1 + Output: i8b1.q1, i8b1.q2 + -> Materialize + Output: i8.q2, (NULL::integer) + -> Nested Loop Left Join + Output: i8.q2, (NULL::integer) + Join Filter: (i8.q1 = i8b2.q1) + -> Seq Scan on public.int8_tbl i8 + Output: i8.q1, i8.q2 + -> Materialize + Output: i8b2.q1, (NULL::integer) + -> Seq Scan on public.int8_tbl i8b2 + Output: i8b2.q1, NULL::integer + -> Materialize + Output: i4.f1 + -> Seq Scan on public.int4_tbl i4 + Output: i4.f1 +(32 rows) + +select t1.* from + text_tbl t1 + left join (select *, '***'::text as d1 from int8_tbl i8b1) b1 + left join int8_tbl i8 + left join (select *, null::int as d2 from int8_tbl i8b2) b2 + on (i8.q1 = b2.q1) + on (b2.d2 = b1.q2) + on (t1.f1 = b1.d1) + left join int4_tbl i4 + on (i8.q2 = i4.f1); + f1 +------------------- + hi de ho neighbor + doh! +(2 rows) + +explain (verbose, costs off) +select t1.* from + text_tbl t1 + left join (select *, '***'::text as d1 from int8_tbl i8b1) b1 + left join int8_tbl i8 + left join (select *, null::int as d2 from int8_tbl i8b2, int4_tbl i4b2) b2 + on (i8.q1 = b2.q1) + on (b2.d2 = b1.q2) + on (t1.f1 = b1.d1) + left join int4_tbl i4 + on (i8.q2 = i4.f1); + QUERY PLAN +---------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + Output: t1.f1 + -> Nested Loop Left Join + Output: t1.f1 + Join Filter: (i8.q2 = i4.f1) + -> Nested Loop Left Join + Output: t1.f1, i8.q2 + Join Filter: (t1.f1 = '***'::text) + -> Seq Scan on public.text_tbl t1 + Output: t1.f1 + -> Materialize + Output: i8.q2 + -> Nested Loop Left Join + Output: i8.q2 + Join Filter: ((NULL::integer) = i8b1.q2) + -> Seq Scan on public.int8_tbl i8b1 + Output: i8b1.q1, i8b1.q2 + -> Materialize + Output: i8.q2, (NULL::integer) + -> Nested Loop Left Join + Output: i8.q2, (NULL::integer) + Join Filter: (i8.q1 = i8b2.q1) + -> Seq Scan on public.int8_tbl i8 + Output: i8.q1, i8.q2 + -> Materialize + Output: i8b2.q1, (NULL::integer) + -> Nested Loop + Output: i8b2.q1, NULL::integer + -> Seq Scan on public.int8_tbl i8b2 + Output: i8b2.q1, i8b2.q2 + -> Materialize + -> Seq Scan on public.int4_tbl i4b2 + -> Materialize + Output: i4.f1 + -> Seq Scan on public.int4_tbl i4 + Output: i4.f1 +(36 rows) + +select t1.* from + text_tbl t1 + left join (select *, '***'::text as d1 from int8_tbl i8b1) b1 + left join int8_tbl i8 + left join (select *, null::int as d2 from int8_tbl i8b2, int4_tbl i4b2) b2 + on (i8.q1 = b2.q1) + on (b2.d2 = b1.q2) + on (t1.f1 = b1.d1) + left join int4_tbl i4 + on (i8.q2 = i4.f1); + f1 +------------------- + hi de ho neighbor + doh! 
+(2 rows) + +explain (verbose, costs off) +select t1.* from + text_tbl t1 + left join (select *, '***'::text as d1 from int8_tbl i8b1) b1 + left join int8_tbl i8 + left join (select *, null::int as d2 from int8_tbl i8b2, int4_tbl i4b2 + where q1 = f1) b2 + on (i8.q1 = b2.q1) + on (b2.d2 = b1.q2) + on (t1.f1 = b1.d1) + left join int4_tbl i4 + on (i8.q2 = i4.f1); + QUERY PLAN +---------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + Output: t1.f1 + -> Nested Loop Left Join + Output: t1.f1 + Join Filter: (i8.q2 = i4.f1) + -> Nested Loop Left Join + Output: t1.f1, i8.q2 + Join Filter: (t1.f1 = '***'::text) + -> Seq Scan on public.text_tbl t1 + Output: t1.f1 + -> Materialize + Output: i8.q2 + -> Nested Loop Left Join + Output: i8.q2 + Join Filter: ((NULL::integer) = i8b1.q2) + -> Seq Scan on public.int8_tbl i8b1 + Output: i8b1.q1, i8b1.q2 + -> Materialize + Output: i8.q2, (NULL::integer) + -> Nested Loop Left Join + Output: i8.q2, (NULL::integer) + Join Filter: (i8.q1 = i8b2.q1) + -> Seq Scan on public.int8_tbl i8 + Output: i8.q1, i8.q2 + -> Materialize + Output: i8b2.q1, (NULL::integer) + -> Nested Loop + Output: i8b2.q1, NULL::integer + Join Filter: (i8b2.q1 = i4b2.f1) + -> Seq Scan on public.int8_tbl i8b2 + Output: i8b2.q1, i8b2.q2 + -> Materialize + Output: i4b2.f1 + -> Seq Scan on public.int4_tbl i4b2 + Output: i4b2.f1 + -> Materialize + Output: i4.f1 + -> Seq Scan on public.int4_tbl i4 + Output: i4.f1 +(39 rows) + +select t1.* from + text_tbl t1 + left join (select *, '***'::text as d1 from int8_tbl i8b1) b1 + left join int8_tbl i8 + left join (select *, null::int as d2 from int8_tbl i8b2, int4_tbl i4b2 + where q1 = f1) b2 + on (i8.q1 = b2.q1) + on (b2.d2 = b1.q2) + on (t1.f1 = b1.d1) + left join int4_tbl i4 + on (i8.q2 = i4.f1); + f1 +------------------- + hi de ho neighbor + doh! +(2 rows) + +explain (verbose, costs off) +select * from + text_tbl t1 + inner join int8_tbl i8 + on i8.q2 = 456 + right join text_tbl t2 + on t1.f1 = 'doh!' + left join int4_tbl i4 + on i8.q1 = i4.f1; + QUERY PLAN +-------------------------------------------------------------- + Nested Loop Left Join + Output: t1.f1, i8.q1, i8.q2, t2.f1, i4.f1 + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: t2.f1 + -> Seq Scan on public.text_tbl t2 + Output: t2.f1 + -> Materialize + Output: i8.q1, i8.q2, i4.f1, t1.f1 + -> Remote Subquery Scan on all (datanode_2) + Output: i8.q1, i8.q2, i4.f1, t1.f1 + -> Nested Loop + Output: i8.q1, i8.q2, i4.f1, t1.f1 + -> Nested Loop Left Join + Output: i8.q1, i8.q2, i4.f1 + Join Filter: (i8.q1 = i4.f1) + -> Seq Scan on public.int8_tbl i8 + Output: i8.q1, i8.q2 + Filter: (i8.q2 = 456) + -> Seq Scan on public.int4_tbl i4 + Output: i4.f1 + -> Seq Scan on public.text_tbl t1 + Output: t1.f1 + Filter: (t1.f1 = 'doh!'::text) +(23 rows) + +select * from + text_tbl t1 + inner join int8_tbl i8 + on i8.q2 = 456 + right join text_tbl t2 + on t1.f1 = 'doh!' + left join int4_tbl i4 + on i8.q1 = i4.f1; + f1 | q1 | q2 | f1 | f1 +------+-----+-----+-------------------+---- + doh! | 123 | 456 | hi de ho neighbor | + doh! | 123 | 456 | doh! 
| +(2 rows) + +reset enable_nestloop; +reset enable_hashjoin; +reset enable_mergejoin; +-- +-- test for appropriate join order in the presence of lateral references +-- +explain (verbose, costs off) +select * from + text_tbl t1 + left join int8_tbl i8 + on i8.q2 = 123, + lateral (select i8.q1, t2.f1 from text_tbl t2 limit 1) as ss +where t1.f1 = ss.f1; + QUERY PLAN +----------------------------------------------------------------------- + Nested Loop + Output: t1.f1, i8.q1, i8.q2, (i8.q1), t2.f1 + Join Filter: (t1.f1 = t2.f1) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: t1.f1, i8.q1, i8.q2 + -> Nested Loop Left Join + Output: t1.f1, i8.q1, i8.q2 + -> Seq Scan on public.text_tbl t1 + Output: t1.f1 + -> Materialize + Output: i8.q1, i8.q2 + -> Seq Scan on public.int8_tbl i8 + Output: i8.q1, i8.q2 + Filter: (i8.q2 = 123) + -> Materialize + Output: (i8.q1), t2.f1 + -> Limit + Output: (i8.q1), t2.f1 + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: i8.q1, t2.f1 + -> Limit + Output: (i8.q1), t2.f1 + -> Seq Scan on public.text_tbl t2 + Output: i8.q1, t2.f1 +(24 rows) + +select * from + text_tbl t1 + left join int8_tbl i8 + on i8.q2 = 123, + lateral (select i8.q1, t2.f1 from text_tbl t2 limit 1) as ss +where t1.f1 = ss.f1; + f1 | q1 | q2 | q1 | f1 +-------------------+------------------+-----+------------------+------------------- + hi de ho neighbor | 4567890123456789 | 123 | 4567890123456789 | hi de ho neighbor +(1 row) + +explain (verbose, costs off) +select * from + text_tbl t1 + left join int8_tbl i8 + on i8.q2 = 123, + lateral (select i8.q1, t2.f1 from text_tbl t2 limit 1) as ss1, + lateral (select ss1.* from text_tbl t3 limit 1) as ss2 +where t1.f1 = ss2.f1; + QUERY PLAN +----------------------------------------------------------------------------- + Nested Loop + Output: t1.f1, i8.q1, i8.q2, (i8.q1), t2.f1, ((i8.q1)), (t2.f1) + Join Filter: (t1.f1 = (t2.f1)) + -> Nested Loop + Output: t1.f1, i8.q1, i8.q2, (i8.q1), t2.f1 + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: t1.f1, i8.q1, i8.q2 + -> Nested Loop Left Join + Output: t1.f1, i8.q1, i8.q2 + -> Seq Scan on public.text_tbl t1 + Output: t1.f1 + -> Materialize + Output: i8.q1, i8.q2 + -> Seq Scan on public.int8_tbl i8 + Output: i8.q1, i8.q2 + Filter: (i8.q2 = 123) + -> Materialize + Output: (i8.q1), t2.f1 + -> Limit + Output: (i8.q1), t2.f1 + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: i8.q1, t2.f1 + -> Limit + Output: (i8.q1), t2.f1 + -> Seq Scan on public.text_tbl t2 + Output: i8.q1, t2.f1 + -> Materialize + Output: ((i8.q1)), (t2.f1) + -> Limit + Output: ((i8.q1)), (t2.f1) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: (i8.q1), t2.f1 + -> Limit + Output: ((i8.q1)), (t2.f1) + -> Seq Scan on public.text_tbl t3 + Output: (i8.q1), t2.f1 +(36 rows) + +select * from + text_tbl t1 + left join int8_tbl i8 + on i8.q2 = 123, + lateral (select i8.q1, t2.f1 from text_tbl t2 limit 1) as ss1, + lateral (select ss1.* from text_tbl t3 limit 1) as ss2 +where t1.f1 = ss2.f1; + f1 | q1 | q2 | q1 | f1 | q1 | f1 +-------------------+------------------+-----+------------------+-------------------+------------------+------------------- + hi de ho neighbor | 4567890123456789 | 123 | 4567890123456789 | hi de ho neighbor | 4567890123456789 | hi de ho neighbor +(1 row) + +explain (verbose, costs off) +select 1 from + text_tbl as tt1 + inner join text_tbl as tt2 on (tt1.f1 = 'foo') + left join text_tbl as tt3 on (tt3.f1 = 'foo') + left join text_tbl as tt4 on (tt3.f1 
= tt4.f1), + lateral (select tt4.f1 as c0 from text_tbl as tt5 limit 1) as ss1 +where tt1.f1 = ss1.c0; + QUERY PLAN +----------------------------------------------------------------------------- + Nested Loop + Output: 1 + -> Nested Loop Left Join + Output: tt1.f1, tt4.f1 + -> Nested Loop + Output: tt1.f1 + -> Remote Subquery Scan on all (datanode_2) + Output: tt1.f1 + -> Seq Scan on public.text_tbl tt1 + Output: tt1.f1 + Filter: (tt1.f1 = 'foo'::text) + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on public.text_tbl tt2 + -> Materialize + Output: tt4.f1 + -> Remote Subquery Scan on all (datanode_2) + Output: tt4.f1 + -> Nested Loop Left Join + Output: tt4.f1 + Join Filter: (tt3.f1 = tt4.f1) + -> Seq Scan on public.text_tbl tt3 + Output: tt3.f1 + Filter: (tt3.f1 = 'foo'::text) + -> Seq Scan on public.text_tbl tt4 + Output: tt4.f1 + Filter: (tt4.f1 = 'foo'::text) + -> Materialize + Output: ss1.c0 + -> Subquery Scan on ss1 + Output: ss1.c0 + Filter: (ss1.c0 = 'foo'::text) + -> Limit + Output: (tt4.f1) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: tt4.f1 + -> Limit + Output: (tt4.f1) + -> Seq Scan on public.text_tbl tt5 + Output: tt4.f1 +(40 rows) + +select 1 from + text_tbl as tt1 + inner join text_tbl as tt2 on (tt1.f1 = 'foo') + left join text_tbl as tt3 on (tt3.f1 = 'foo') + left join text_tbl as tt4 on (tt3.f1 = tt4.f1), + lateral (select tt4.f1 as c0 from text_tbl as tt5 limit 1) as ss1 +where tt1.f1 = ss1.c0; + ?column? +---------- +(0 rows) + +-- +-- check a case in which a PlaceHolderVar forces join order +-- +explain (verbose, costs off) +select ss2.* from + int4_tbl i41 + left join int8_tbl i8 + join (select i42.f1 as c1, i43.f1 as c2, 42 as c3 + from int4_tbl i42, int4_tbl i43) ss1 + on i8.q1 = ss1.c2 + on i41.f1 = ss1.c1, + lateral (select i41.*, i8.*, ss1.* from text_tbl limit 1) ss2 +where ss1.c2 = 0; + QUERY PLAN +------------------------------------------------------------------------------------------ + Nested Loop + Output: (i41.f1), (i8.q1), (i8.q2), (i42.f1), (i43.f1), ((42)) + -> Remote Subquery Scan on all (datanode_1) + Output: i41.f1, i42.f1, i8.q1, i8.q2, i43.f1, 42 + -> Hash Join + Output: i41.f1, i42.f1, i8.q1, i8.q2, i43.f1, 42 + Hash Cond: (i41.f1 = i42.f1) + -> Nested Loop + Output: i8.q1, i8.q2, i43.f1, i41.f1 + -> Nested Loop + Output: i8.q1, i8.q2, i43.f1 + -> Seq Scan on public.int8_tbl i8 + Output: i8.q1, i8.q2 + Filter: (i8.q1 = 0) + -> Seq Scan on public.int4_tbl i43 + Output: i43.f1 + Filter: (i43.f1 = 0) + -> Seq Scan on public.int4_tbl i41 + Output: i41.f1 + -> Hash + Output: i42.f1 + -> Seq Scan on public.int4_tbl i42 + Output: i42.f1 + -> Materialize + Output: (i41.f1), (i8.q1), (i8.q2), (i42.f1), (i43.f1), ((42)) + -> Limit + Output: (i41.f1), (i8.q1), (i8.q2), (i42.f1), (i43.f1), ((42)) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: i41.f1, i8.q1, i8.q2, i42.f1, i43.f1, (42) + -> Limit + Output: (i41.f1), (i8.q1), (i8.q2), (i42.f1), (i43.f1), ((42)) + -> Seq Scan on public.text_tbl + Output: i41.f1, i8.q1, i8.q2, i42.f1, i43.f1, (42) +(33 rows) + +select ss2.* from + int4_tbl i41 + left join int8_tbl i8 + join (select i42.f1 as c1, i43.f1 as c2, 42 as c3 + from int4_tbl i42, int4_tbl i43) ss1 + on i8.q1 = ss1.c2 + on i41.f1 = ss1.c1, + lateral (select i41.*, i8.*, ss1.* from text_tbl limit 1) ss2 +where ss1.c2 = 0; + f1 | q1 | q2 | c1 | c2 | c3 +----+----+----+----+----+---- +(0 rows) + +-- +-- test successful handling of full join underneath left join (bug #14105) 
+-- +explain (costs off) +select * from + (select 1 as id) as xx + left join + (tenk1 as a1 full join (select 1 as id) as yy on (a1.unique1 = yy.id)) + on (xx.id = coalesce(yy.id)); + QUERY PLAN +----------------------------------------------------------------------- + Nested Loop Left Join + Join Filter: ((1) = COALESCE((1))) + -> Result + -> Materialize + -> Hash Full Join + Hash Cond: (a1.unique1 = (1)) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on tenk1 a1 + -> Hash + -> Result +(10 rows) + +select * from + (select 1 as id) as xx + left join + (tenk1 as a1 full join (select 1 as id) as yy on (a1.unique1 = yy.id)) + on (xx.id = coalesce(yy.id)); + id | unique1 | unique2 | two | four | ten | twenty | hundred | thousand | twothousand | fivethous | tenthous | odd | even | stringu1 | stringu2 | string4 | id +----+---------+---------+-----+------+-----+--------+---------+----------+-------------+-----------+----------+-----+------+----------+----------+---------+---- + 1 | 1 | 2838 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 2 | 3 | BAAAAA | EFEAAA | OOOOxx | 1 +(1 row) + +-- +-- test ability to push constants through outer join clauses +-- +explain (num_nodes off, nodes off, costs off) + select * from int4_tbl a left join tenk1 b on f1 = unique2 where f1 = 0; + QUERY PLAN +------------------------------------------------------------------- + Remote Subquery Scan on all + -> Nested Loop Left Join + Join Filter: (a.f1 = b.unique2) + -> Remote Subquery Scan on all + Distribute results by H: f1 + -> Seq Scan on int4_tbl a + Filter: (f1 = 0) + -> Materialize + -> Remote Subquery Scan on all + Distribute results by H: unique2 + -> Index Scan using tenk1_unique2 on tenk1 b + Index Cond: (unique2 = 0) +(12 rows) + +explain (num_nodes off, nodes off, costs off) + select * from tenk1 a full join tenk1 b using(unique2) where unique2 = 42; + QUERY PLAN +------------------------------------------------------------------- + Remote Subquery Scan on all + -> Merge Full Join + Merge Cond: (a.unique2 = b.unique2) + -> Remote Subquery Scan on all + Distribute results by H: unique2 + -> Index Scan using tenk1_unique2 on tenk1 a + Index Cond: (unique2 = 42) + -> Materialize + -> Remote Subquery Scan on all + Distribute results by H: unique2 + -> Index Scan using tenk1_unique2 on tenk1 b + Index Cond: (unique2 = 42) +(12 rows) + +-- +-- test that quals attached to an outer join have correct semantics, +-- specifically that they don't re-use expressions computed below the join; +-- we force a mergejoin so that coalesce(b.q1, 1) appears as a join input +-- +set enable_hashjoin to off; +set enable_nestloop to off; +explain (verbose, costs off) + select a.q2, b.q1 + from int8_tbl a left join int8_tbl b on a.q2 = coalesce(b.q1, 1) + where coalesce(b.q1, 1) > 0; + QUERY PLAN +------------------------------------------------------------------------------------------------------------------------------------------------------------ + Remote Fast Query Execution + Output: a.q2, b.q1 + Node/s: datanode_1 + Remote query: SELECT a.q2, b.q1 FROM (int8_tbl a LEFT JOIN int8_tbl b ON ((a.q2 = COALESCE(b.q1, (1)::bigint)))) WHERE (COALESCE(b.q1, (1)::bigint) > 0) + -> Merge Left Join + Output: a.q2, b.q1 + Merge Cond: (a.q2 = (COALESCE(b.q1, '1'::bigint))) + Filter: (COALESCE(b.q1, '1'::bigint) > 0) + -> Sort + Output: a.q2 + Sort Key: a.q2 + -> Seq Scan on public.int8_tbl a + Output: a.q2 + -> Sort + Output: b.q1, (COALESCE(b.q1, '1'::bigint)) + Sort Key: (COALESCE(b.q1, '1'::bigint)) + -> Seq Scan on 
public.int8_tbl b + Output: b.q1, COALESCE(b.q1, '1'::bigint) +(18 rows) + +select a.q2, b.q1 + from int8_tbl a left join int8_tbl b on a.q2 = coalesce(b.q1, 1) + where coalesce(b.q1, 1) > 0; + q2 | q1 +-------------------+------------------ + -4567890123456789 | + 123 | 123 + 123 | 123 + 456 | + 4567890123456789 | 4567890123456789 + 4567890123456789 | 4567890123456789 + 4567890123456789 | 4567890123456789 + 4567890123456789 | 4567890123456789 + 4567890123456789 | 4567890123456789 + 4567890123456789 | 4567890123456789 +(10 rows) + +reset enable_hashjoin; +reset enable_nestloop; +-- +-- test join removal +-- +begin; +CREATE TEMP TABLE a (id int PRIMARY KEY, b_id int); +CREATE TEMP TABLE b (id int PRIMARY KEY, c_id int); +CREATE TEMP TABLE c (id int PRIMARY KEY); +CREATE TEMP TABLE d (a int, b int); +INSERT INTO a VALUES (0, 0), (1, NULL); +INSERT INTO b VALUES (0, 0), (1, NULL); +INSERT INTO c VALUES (0), (1); +INSERT INTO d VALUES (1,3), (2,2), (3,1); +-- all three cases should be optimizable into a simple seqscan +explain (verbose false, costs false, nodes false) SELECT a.* FROM a LEFT JOIN b ON a.b_id = b.id; + QUERY PLAN +----------------------------- + Remote Subquery Scan on all + -> Seq Scan on a +(2 rows) + +explain (verbose false, costs false, nodes false) SELECT b.* FROM b LEFT JOIN c ON b.c_id = c.id; + QUERY PLAN +----------------------------- + Remote Subquery Scan on all + -> Seq Scan on b +(2 rows) + +explain (verbose false, costs false, nodes false) + SELECT a.* FROM a LEFT JOIN (b left join c on b.c_id = c.id) + ON (a.b_id = b.id); + QUERY PLAN +----------------------------- + Remote Subquery Scan on all + -> Seq Scan on a +(2 rows) + +-- check optimization of outer join within another special join +explain (verbose false, costs false, nodes false) +select id from a where id in ( + select b.id from b left join c on b.id = c.id +); + QUERY PLAN +---------------------------------- + Remote Subquery Scan on all + -> Hash Join + Hash Cond: (a.id = b.id) + -> Seq Scan on a + -> Hash + -> Seq Scan on b +(6 rows) + +-- check that join removal works for a left join when joining a subquery +-- that is guaranteed to be unique by its GROUP BY clause +explain (costs off) +select d.* from d left join (select * from b group by b.id, b.c_id) s + on d.a = s.id and d.b = s.c_id; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on d +(2 rows) + +-- similarly, but keying off a DISTINCT clause +explain (costs off) +select d.* from d left join (select distinct * from b) s + on d.a = s.id and d.b = s.c_id; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on d +(2 rows) + +-- join removal is not possible when the GROUP BY contains a column that is +-- not in the join condition. (Note: as of 9.6, we notice that b.id is a +-- primary key and so drop b.c_id from the GROUP BY of the resulting plan; +-- but this happens too late for join removal in the outer plan level.) 
+explain (costs off) +select d.* from d left join (select * from b group by b.id, b.c_id) s + on d.a = s.id; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Merge Right Join + Merge Cond: (b.id = d.a) + -> Group + Group Key: b.id + -> Index Scan using b_pkey on b + -> Sort + Sort Key: d.a + -> Seq Scan on d +(9 rows) + +-- similarly, but keying off a DISTINCT clause +explain (costs off) +select d.* from d left join (select distinct * from b) s + on d.a = s.id; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Merge Right Join + Merge Cond: (b.id = d.a) + -> Unique + -> Sort + Sort Key: b.id, b.c_id + -> Seq Scan on b + -> Sort + Sort Key: d.a + -> Seq Scan on d +(10 rows) + +-- check join removal works when uniqueness of the join condition is enforced +-- by a UNION +explain (costs off) +select d.* from d left join (select id from a union select id from b) s + on d.a = s.id; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on d +(2 rows) + +-- check join removal with a cross-type comparison operator +-- commenting out queries on replicated tables +-- as they can go either on datanode_1 or datanode_2 +--explain (costs off) +--select i8.* from int8_tbl i8 left join (select f1 from int4_tbl group by f1) i4 + --on i8.q1 = i4.f1; +rollback; +create temp table parent (k int primary key, pd int); +create temp table child (k int unique, cd int); +insert into parent values (1, 10), (2, 20), (3, 30); +insert into child values (1, 100), (4, 400); +-- this case is optimizable +select p.* from parent p left join child c on (p.k = c.k) order by 1,2; + k | pd +---+---- + 1 | 10 + 2 | 20 + 3 | 30 +(3 rows) + +explain (verbose false, costs false, nodes false) + select p.* from parent p left join child c on (p.k = c.k) order by 1,2; + QUERY PLAN +---------------------------------- + Remote Subquery Scan on all + -> Sort + Sort Key: p.k, p.pd + -> Seq Scan on parent p +(4 rows) + +-- this case is not +select p.*, linked from parent p + left join (select c.*, true as linked from child c) as ss + on (p.k = ss.k) order by p.k; + k | pd | linked +---+----+-------- + 1 | 10 | t + 2 | 20 | + 3 | 30 | +(3 rows) + +explain (verbose false, costs false, nodes false) + select p.*, linked from parent p + left join (select c.*, true as linked from child c) as ss + on (p.k = ss.k) order by p.k; + QUERY PLAN +---------------------------------------------------------- + Remote Subquery Scan on all + -> Merge Left Join + Merge Cond: (p.k = c.k) + -> Index Scan using parent_pkey on parent p + -> Index Only Scan using child_k_key on child c +(5 rows) + +-- check for a 9.0rc1 bug: join removal breaks pseudoconstant qual handling +select p.* from + parent p left join child c on (p.k = c.k) + where p.k = 1 and p.k = 2; + k | pd +---+---- +(0 rows) + +explain (verbose false, costs false, nodes false) +select p.* from + parent p left join child c on (p.k = c.k) + where p.k = 1 and p.k = 2; + QUERY PLAN +------------------------------------------------------ + Remote Fast Query Execution + -> Result + One-Time Filter: false + -> Index Scan using parent_pkey on parent p + Index Cond: (k = 1) +(5 rows) + +select p.* from + (parent p left join child c on (p.k = c.k)) join parent x on p.k = x.k + where p.k = 1 and p.k = 2; + k | pd +---+---- +(0 rows) + +explain (verbose false, costs false, nodes 
false) +select p.* from + (parent p left join child c on (p.k = c.k)) join parent x on p.k = x.k + where p.k = 1 and p.k = 2; + QUERY PLAN +-------------------------------- + Remote Fast Query Execution + -> Result + One-Time Filter: false +(3 rows) + +-- bug 5255: this is not optimizable by join removal +begin; +CREATE TEMP TABLE a (id int PRIMARY KEY); +CREATE TEMP TABLE b (id int PRIMARY KEY, a_id int); +INSERT INTO a VALUES (0), (1); +INSERT INTO b VALUES (0, 0), (1, NULL); +SELECT * FROM b LEFT JOIN a ON (b.a_id = a.id) WHERE (a.id IS NULL OR a.id > 0); + id | a_id | id +----+------+---- + 1 | | +(1 row) + +SELECT b.* FROM b LEFT JOIN a ON (b.a_id = a.id) WHERE (a.id IS NULL OR a.id > 0); + id | a_id +----+------ + 1 | +(1 row) + +rollback; +-- another join removal bug: this is not optimizable, either +begin; +create temp table innertab (id int8 primary key, dat1 int8); +insert into innertab values(123, 42); +SELECT * FROM + (SELECT 1 AS x) ss1 + LEFT JOIN + (SELECT q1, q2, COALESCE(dat1, q1) AS y + FROM int8_tbl LEFT JOIN innertab ON q2 = id) ss2 + ON true order by 1, 2, 3, 4; + x | q1 | q2 | y +---+------------------+-------------------+------------------ + 1 | 123 | 456 | 123 + 1 | 123 | 4567890123456789 | 123 + 1 | 4567890123456789 | -4567890123456789 | 4567890123456789 + 1 | 4567890123456789 | 123 | 42 + 1 | 4567890123456789 | 4567890123456789 | 4567890123456789 +(5 rows) + +rollback; +-- another join removal bug: we must clean up correctly when removing a PHV +begin; +create temp table uniquetbl (f1 text unique); +explain (costs off) +select t1.* from + uniquetbl as t1 + left join (select *, '***'::text as d1 from uniquetbl) t2 + on t1.f1 = t2.f1 + left join uniquetbl t3 + on t2.d1 = t3.f1; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on uniquetbl t1 +(2 rows) + +explain (costs off) +select t0.* +from + text_tbl t0 + left join + (select case t1.ten when 0 then 'doh!'::text else null::text end as case1, + t1.stringu2 + from tenk1 t1 + join int4_tbl i4 ON i4.f1 = t1.unique2 + left join uniquetbl u1 ON u1.f1 = t1.string4) ss + on t0.f1 = ss.case1 +where ss.stringu2 !~* ss.case1; + QUERY PLAN +-------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Nested Loop + Join Filter: ((CASE t1.ten WHEN 0 THEN 'doh!'::text ELSE NULL::text END) = t0.f1) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: CASE ten WHEN 0 THEN 'doh!'::text ELSE NULL::text END + -> Nested Loop + -> Seq Scan on int4_tbl i4 + -> Index Scan using tenk1_unique2 on tenk1 t1 + Index Cond: (unique2 = i4.f1) + Filter: (stringu2 !~* CASE ten WHEN 0 THEN 'doh!'::text ELSE NULL::text END) + -> Materialize + -> Seq Scan on text_tbl t0 +(12 rows) + +select t0.* +from + text_tbl t0 + left join + (select case t1.ten when 0 then 'doh!'::text else null::text end as case1, + t1.stringu2 + from tenk1 t1 + join int4_tbl i4 ON i4.f1 = t1.unique2 + left join uniquetbl u1 ON u1.f1 = t1.string4) ss + on t0.f1 = ss.case1 +where ss.stringu2 !~* ss.case1; + f1 +------ + doh! 
+(1 row) + +rollback; +-- bug #8444: we've historically allowed duplicate aliases within aliased JOINs +select * from + int8_tbl x join (int4_tbl x cross join int4_tbl y) j on q1 = f1; -- error +ERROR: column reference "f1" is ambiguous +LINE 2: ..._tbl x join (int4_tbl x cross join int4_tbl y) j on q1 = f1; + ^ +select * from + int8_tbl x join (int4_tbl x cross join int4_tbl y) j on q1 = y.f1; -- error +ERROR: invalid reference to FROM-clause entry for table "y" +LINE 2: ...bl x join (int4_tbl x cross join int4_tbl y) j on q1 = y.f1; + ^ +HINT: There is an entry for table "y", but it cannot be referenced from this part of the query. +select * from + int8_tbl x join (int4_tbl x cross join int4_tbl y(ff)) j on q1 = f1; -- ok + q1 | q2 | f1 | ff +----+----+----+---- +(0 rows) + +-- +-- Test hints given on incorrect column references are useful +-- +select t1.uunique1 from + tenk1 t1 join tenk2 t2 on t1.two = t2.two; -- error, prefer "t1" suggestion +ERROR: column t1.uunique1 does not exist +LINE 1: select t1.uunique1 from + ^ +HINT: Perhaps you meant to reference the column "t1.unique1". +select t2.uunique1 from + tenk1 t1 join tenk2 t2 on t1.two = t2.two; -- error, prefer "t2" suggestion +ERROR: column t2.uunique1 does not exist +LINE 1: select t2.uunique1 from + ^ +HINT: Perhaps you meant to reference the column "t2.unique1". +select uunique1 from + tenk1 t1 join tenk2 t2 on t1.two = t2.two; -- error, suggest both at once +ERROR: column "uunique1" does not exist +LINE 1: select uunique1 from + ^ +HINT: Perhaps you meant to reference the column "t1.unique1" or the column "t2.unique1". +-- +-- Take care to reference the correct RTE +-- +select atts.relid::regclass, s.* from pg_stats s join + pg_attribute a on s.attname = a.attname and s.tablename = + a.attrelid::regclass::text join (select unnest(indkey) attnum, + indexrelid from pg_index i) atts on atts.attnum = a.attnum where + schemaname != 'pg_catalog'; +ERROR: column atts.relid does not exist +LINE 1: select atts.relid::regclass, s.* from pg_stats s join + ^ +-- +-- Test LATERAL +-- +select unique2, x.* +from tenk1 a, lateral (select * from int4_tbl b where f1 = a.unique1) x; + unique2 | f1 +---------+---- + 9998 | 0 +(1 row) + +explain (costs off) + select unique2, x.* + from tenk1 a, lateral (select * from int4_tbl b where f1 = a.unique1) x; + QUERY PLAN +------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Nested Loop + -> Seq Scan on int4_tbl b + -> Index Scan using tenk1_unique1 on tenk1 a + Index Cond: (unique1 = b.f1) +(5 rows) + +select unique2, x.* +from int4_tbl x, lateral (select unique2 from tenk1 where f1 = unique1) ss; + unique2 | f1 +---------+---- + 9998 | 0 +(1 row) + +explain (costs off) + select unique2, x.* + from int4_tbl x, lateral (select unique2 from tenk1 where f1 = unique1) ss; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Nested Loop + -> Seq Scan on int4_tbl x + -> Index Scan using tenk1_unique1 on tenk1 + Index Cond: (unique1 = x.f1) +(5 rows) + +explain (costs off) + select unique2, x.* + from int4_tbl x cross join lateral (select unique2 from tenk1 where f1 = unique1) ss; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Nested Loop + -> Seq Scan on int4_tbl x + -> Index Scan using tenk1_unique1 on tenk1 + Index Cond: (unique1 = x.f1) +(5 rows) + +select unique2, x.* +from int4_tbl x left 
join lateral (select unique1, unique2 from tenk1 where f1 = unique1) ss on true order by 1; + unique2 | f1 +---------+------------- + 9998 | 0 + | -123456 + | 2147483647 + | 123456 + | -2147483647 +(5 rows) + +--explain (costs off) + --select unique2, x.* + --from int4_tbl x left join lateral (select unique1, unique2 from tenk1 where f1 = unique1) ss on true; +-- check scoping of lateral versus parent references +-- the first of these should return int8_tbl.q2, the second int8_tbl.q1 +select *, (select r from (select q1 as q2) x, (select q2 as r) y) from int8_tbl; + q1 | q2 | r +------------------+-------------------+------------------- + 123 | 456 | 456 + 123 | 4567890123456789 | 4567890123456789 + 4567890123456789 | 123 | 123 + 4567890123456789 | 4567890123456789 | 4567890123456789 + 4567890123456789 | -4567890123456789 | -4567890123456789 +(5 rows) + +select *, (select r from (select q1 as q2) x, lateral (select q2 as r) y) from int8_tbl; + q1 | q2 | r +------------------+-------------------+------------------ + 123 | 456 | 123 + 123 | 4567890123456789 | 123 + 4567890123456789 | 123 | 4567890123456789 + 4567890123456789 | 4567890123456789 | 4567890123456789 + 4567890123456789 | -4567890123456789 | 4567890123456789 +(5 rows) + +-- lateral with function in FROM +select count(*) from tenk1 a, lateral generate_series(1,two) g; + count +------- + 5000 +(1 row) + +explain (costs off) + select count(*) from tenk1 a, lateral generate_series(1,two) g; + QUERY PLAN +----------------------------------------------------------------- + Aggregate + -> Nested Loop + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on tenk1 a + -> Function Scan on generate_series g +(5 rows) + +explain (costs off) + select count(*) from tenk1 a cross join lateral generate_series(1,two) g; + QUERY PLAN +----------------------------------------------------------------- + Aggregate + -> Nested Loop + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on tenk1 a + -> Function Scan on generate_series g +(5 rows) + +-- don't need the explicit LATERAL keyword for functions +explain (costs off) + select count(*) from tenk1 a, generate_series(1,two) g; + QUERY PLAN +----------------------------------------------------------------- + Aggregate + -> Nested Loop + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on tenk1 a + -> Function Scan on generate_series g +(5 rows) + +-- lateral with UNION ALL subselect +explain (num_nodes off, nodes off, costs off) + select * from generate_series(100,200) g, + lateral (select * from int8_tbl a where g = q1 union all + select * from int8_tbl b where g = q2) ss; + QUERY PLAN +------------------------------------------------ + Nested Loop + -> Function Scan on generate_series g + -> Materialize + -> Remote Subquery Scan on all + -> Append + -> Seq Scan on int8_tbl a + Filter: (g.g = q1) + -> Seq Scan on int8_tbl b + Filter: (g.g = q2) +(9 rows) + +select * from generate_series(100,200) g, + lateral (select * from int8_tbl a where g = q1 union all + select * from int8_tbl b where g = q2) ss; + g | q1 | q2 +-----+------------------+------------------ + 123 | 123 | 456 + 123 | 123 | 4567890123456789 + 123 | 4567890123456789 | 123 +(3 rows) + +-- lateral with VALUES +explain (num_nodes off, nodes off, costs off) + select count(*) from tenk1 a, + tenk1 b join lateral (values(a.unique1)) ss(x) on b.unique2 = ss.x; + QUERY PLAN +------------------------------------------------------------------------------ + Finalize Aggregate + -> Remote 
Subquery Scan on all + -> Partial Aggregate + -> Merge Join + Merge Cond: (b.unique2 = a.unique1) + -> Remote Subquery Scan on all + Distribute results by H: unique2 + -> Index Only Scan using tenk1_unique2 on tenk1 b + -> Index Only Scan using tenk1_unique1 on tenk1 a +(9 rows) + +select count(*) from tenk1 a, + tenk1 b join lateral (values(a.unique1)) ss(x) on b.unique2 = ss.x; + count +------- + 10000 +(1 row) + +-- lateral with VALUES, no flattening possible +explain (num_nodes off, nodes off, costs off) + select count(*) from tenk1 a, + tenk1 b join lateral (values(a.unique1),(-1)) ss(x) on b.unique2 = ss.x; + QUERY PLAN +------------------------------------------------------------------------ + Aggregate + -> Hash Join + Hash Cond: ("*VALUES*".column1 = b.unique2) + -> Nested Loop + -> Remote Subquery Scan on all + -> Index Only Scan using tenk1_unique1 on tenk1 a + -> Values Scan on "*VALUES*" + -> Hash + -> Remote Subquery Scan on all + -> Index Only Scan using tenk1_unique2 on tenk1 b +(10 rows) + +select count(*) from tenk1 a, + tenk1 b join lateral (values(a.unique1),(-1)) ss(x) on b.unique2 = ss.x; + count +------- + 10000 +(1 row) + +-- lateral injecting a strange outer join condition +set enable_hashjoin to off; +set enable_mergejoin to off; +explain (num_nodes off, nodes off, costs off) + select * from int8_tbl a, + int8_tbl x left join lateral (select a.q1 from int4_tbl y) ss(z) + on x.q2 = ss.z + order by a.q1, a.q2, x.q1, x.q2, ss.z; + QUERY PLAN +------------------------------------------------------ + Remote Subquery Scan on all + -> Sort + Sort Key: a.q1, a.q2, x.q1, x.q2, (a.q1) + -> Nested Loop + -> Seq Scan on int8_tbl a + -> Nested Loop Left Join + Join Filter: (x.q2 = (a.q1)) + -> Seq Scan on int8_tbl x + -> Materialize + -> Seq Scan on int4_tbl y +(10 rows) + +select * from int8_tbl a, + int8_tbl x left join lateral (select a.q1 from int4_tbl y) ss(z) + on x.q2 = ss.z + order by a.q1, a.q2, x.q1, x.q2, ss.z; + q1 | q2 | q1 | q2 | z +------------------+-------------------+------------------+-------------------+------------------ + 123 | 456 | 123 | 456 | + 123 | 456 | 123 | 4567890123456789 | + 123 | 456 | 4567890123456789 | -4567890123456789 | + 123 | 456 | 4567890123456789 | 123 | 123 + 123 | 456 | 4567890123456789 | 123 | 123 + 123 | 456 | 4567890123456789 | 123 | 123 + 123 | 456 | 4567890123456789 | 123 | 123 + 123 | 456 | 4567890123456789 | 123 | 123 + 123 | 456 | 4567890123456789 | 4567890123456789 | + 123 | 4567890123456789 | 123 | 456 | + 123 | 4567890123456789 | 123 | 4567890123456789 | + 123 | 4567890123456789 | 4567890123456789 | -4567890123456789 | + 123 | 4567890123456789 | 4567890123456789 | 123 | 123 + 123 | 4567890123456789 | 4567890123456789 | 123 | 123 + 123 | 4567890123456789 | 4567890123456789 | 123 | 123 + 123 | 4567890123456789 | 4567890123456789 | 123 | 123 + 123 | 4567890123456789 | 4567890123456789 | 123 | 123 + 123 | 4567890123456789 | 4567890123456789 | 4567890123456789 | + 4567890123456789 | -4567890123456789 | 123 | 456 | + 4567890123456789 | -4567890123456789 | 123 | 4567890123456789 | 4567890123456789 + 4567890123456789 | -4567890123456789 | 123 | 4567890123456789 | 4567890123456789 + 4567890123456789 | -4567890123456789 | 123 | 4567890123456789 | 4567890123456789 + 4567890123456789 | -4567890123456789 | 123 | 4567890123456789 | 4567890123456789 + 4567890123456789 | -4567890123456789 | 123 | 4567890123456789 | 4567890123456789 + 4567890123456789 | -4567890123456789 | 4567890123456789 | -4567890123456789 | + 4567890123456789 | 
-4567890123456789 | 4567890123456789 | 123 | + 4567890123456789 | -4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 + 4567890123456789 | -4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 + 4567890123456789 | -4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 + 4567890123456789 | -4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 + 4567890123456789 | -4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 + 4567890123456789 | 123 | 123 | 456 | + 4567890123456789 | 123 | 123 | 4567890123456789 | 4567890123456789 + 4567890123456789 | 123 | 123 | 4567890123456789 | 4567890123456789 + 4567890123456789 | 123 | 123 | 4567890123456789 | 4567890123456789 + 4567890123456789 | 123 | 123 | 4567890123456789 | 4567890123456789 + 4567890123456789 | 123 | 123 | 4567890123456789 | 4567890123456789 + 4567890123456789 | 123 | 4567890123456789 | -4567890123456789 | + 4567890123456789 | 123 | 4567890123456789 | 123 | + 4567890123456789 | 123 | 4567890123456789 | 4567890123456789 | 4567890123456789 + 4567890123456789 | 123 | 4567890123456789 | 4567890123456789 | 4567890123456789 + 4567890123456789 | 123 | 4567890123456789 | 4567890123456789 | 4567890123456789 + 4567890123456789 | 123 | 4567890123456789 | 4567890123456789 | 4567890123456789 + 4567890123456789 | 123 | 4567890123456789 | 4567890123456789 | 4567890123456789 + 4567890123456789 | 4567890123456789 | 123 | 456 | + 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 | 4567890123456789 + 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 | 4567890123456789 + 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 | 4567890123456789 + 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 | 4567890123456789 + 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 | 4567890123456789 + 4567890123456789 | 4567890123456789 | 4567890123456789 | -4567890123456789 | + 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 | + 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 + 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 + 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 + 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 + 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 +(57 rows) + +reset enable_hashjoin; +reset enable_mergejoin; +-- lateral reference to a join alias variable +select * from (select f1/2 as x from int4_tbl) ss1 join int4_tbl i4 on x = f1, + lateral (select x) ss2(y) order by 1,2,3; + x | f1 | y +---+----+--- + 0 | 0 | 0 +(1 row) + +select * from (select f1 as x from int4_tbl) ss1 join int4_tbl i4 on x = f1, + lateral (values(x)) ss2(y) order by 1,2,3; + x | f1 | y +-------------+-------------+------------- + -2147483647 | -2147483647 | -2147483647 + -123456 | -123456 | -123456 + 0 | 0 | 0 + 123456 | 123456 | 123456 + 2147483647 | 2147483647 | 2147483647 +(5 rows) + +select * from ((select f1/2 as x from int4_tbl) ss1 join int4_tbl i4 on x = f1) j, + lateral (select x) ss2(y) order by 1,2,3; + x | f1 | y +---+----+--- + 0 | 0 | 0 +(1 row) + +-- lateral references requiring pullup +select * from (values(1)) x(lb), + lateral generate_series(lb,4) x4 order by 1,2; + lb | x4 +----+---- + 1 | 1 + 1 | 2 + 1 | 3 + 1 | 4 +(4 rows) + +select * from (select f1/1000000000 from 
int4_tbl) x(lb), + lateral generate_series(lb,4) x4 order by 1,2; + lb | x4 +----+---- + -2 | -2 + -2 | -1 + -2 | 0 + -2 | 1 + -2 | 2 + -2 | 3 + -2 | 4 + 0 | 0 + 0 | 0 + 0 | 0 + 0 | 1 + 0 | 1 + 0 | 1 + 0 | 2 + 0 | 2 + 0 | 2 + 0 | 3 + 0 | 3 + 0 | 3 + 0 | 4 + 0 | 4 + 0 | 4 + 2 | 2 + 2 | 3 + 2 | 4 +(25 rows) + +select * from (values(1)) x(lb), + lateral (values(lb)) y(lbcopy) order by 1,2; + lb | lbcopy +----+-------- + 1 | 1 +(1 row) + +select * from (values(1)) x(lb), + lateral (select lb from int4_tbl) y(lbcopy); + lb | lbcopy +----+-------- + 1 | 1 + 1 | 1 + 1 | 1 + 1 | 1 + 1 | 1 +(5 rows) + +select * from + int8_tbl x left join (select q1,coalesce(q2,0) q2 from int8_tbl) y on x.q2 = y.q1, + lateral (values(x.q1,y.q1,y.q2)) v(xq1,yq1,yq2); + q1 | q2 | q1 | q2 | xq1 | yq1 | yq2 +------------------+-------------------+------------------+-------------------+------------------+------------------+------------------- + 123 | 456 | | | 123 | | + 123 | 4567890123456789 | 4567890123456789 | -4567890123456789 | 123 | 4567890123456789 | -4567890123456789 + 123 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 | 4567890123456789 + 123 | 4567890123456789 | 4567890123456789 | 123 | 123 | 4567890123456789 | 123 + 4567890123456789 | 123 | 123 | 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 + 4567890123456789 | 123 | 123 | 456 | 4567890123456789 | 123 | 456 + 4567890123456789 | 4567890123456789 | 4567890123456789 | -4567890123456789 | 4567890123456789 | 4567890123456789 | -4567890123456789 + 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 + 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 | 4567890123456789 | 123 + 4567890123456789 | -4567890123456789 | | | 4567890123456789 | | +(10 rows) + +select * from + int8_tbl x left join (select q1,coalesce(q2,0) q2 from int8_tbl) y on x.q2 = y.q1, + lateral (select x.q1,y.q1,y.q2) v(xq1,yq1,yq2); + q1 | q2 | q1 | q2 | xq1 | yq1 | yq2 +------------------+-------------------+------------------+-------------------+------------------+------------------+------------------- + 123 | 456 | | | 123 | | + 123 | 4567890123456789 | 4567890123456789 | -4567890123456789 | 123 | 4567890123456789 | -4567890123456789 + 123 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 | 4567890123456789 + 123 | 4567890123456789 | 4567890123456789 | 123 | 123 | 4567890123456789 | 123 + 4567890123456789 | 123 | 123 | 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 + 4567890123456789 | 123 | 123 | 456 | 4567890123456789 | 123 | 456 + 4567890123456789 | 4567890123456789 | 4567890123456789 | -4567890123456789 | 4567890123456789 | 4567890123456789 | -4567890123456789 + 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 + 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 | 4567890123456789 | 123 + 4567890123456789 | -4567890123456789 | | | 4567890123456789 | | +(10 rows) + +select x.* from + int8_tbl x left join (select q1,coalesce(q2,0) q2 from int8_tbl) y on x.q2 = y.q1, + lateral (select x.q1,y.q1,y.q2) v(xq1,yq1,yq2); + q1 | q2 +------------------+------------------- + 123 | 456 + 123 | 4567890123456789 + 123 | 4567890123456789 + 123 | 4567890123456789 + 4567890123456789 | 123 + 4567890123456789 | 123 + 4567890123456789 | 4567890123456789 + 4567890123456789 | 4567890123456789 + 
4567890123456789 | 4567890123456789 + 4567890123456789 | -4567890123456789 +(10 rows) + +select v.* from + (int8_tbl x left join (select q1,coalesce(q2,0) q2 from int8_tbl) y on x.q2 = y.q1) + left join int4_tbl z on z.f1 = x.q2, + lateral (select x.q1,y.q1 union all select x.q2,y.q2) v(vx,vy) + order by vx, vy; + vx | vy +-------------------+------------------- + -4567890123456789 | + 123 | 456 + 123 | 4567890123456789 + 123 | 4567890123456789 + 123 | 4567890123456789 + 123 | 4567890123456789 + 123 | + 456 | + 4567890123456789 | -4567890123456789 + 4567890123456789 | -4567890123456789 + 4567890123456789 | 123 + 4567890123456789 | 123 + 4567890123456789 | 123 + 4567890123456789 | 123 + 4567890123456789 | 4567890123456789 + 4567890123456789 | 4567890123456789 + 4567890123456789 | 4567890123456789 + 4567890123456789 | 4567890123456789 + 4567890123456789 | 4567890123456789 + 4567890123456789 | +(20 rows) + +select v.* from + (int8_tbl x left join (select q1,(select coalesce(q2,0)) q2 from int8_tbl) y on x.q2 = y.q1) + left join int4_tbl z on z.f1 = x.q2, + lateral (select x.q1,y.q1 union all select x.q2,y.q2) v(vx,vy) + order by vx, vy; + vx | vy +-------------------+------------------- + -4567890123456789 | + 123 | 456 + 123 | 4567890123456789 + 123 | 4567890123456789 + 123 | 4567890123456789 + 123 | 4567890123456789 + 123 | + 456 | + 4567890123456789 | -4567890123456789 + 4567890123456789 | -4567890123456789 + 4567890123456789 | 123 + 4567890123456789 | 123 + 4567890123456789 | 123 + 4567890123456789 | 123 + 4567890123456789 | 4567890123456789 + 4567890123456789 | 4567890123456789 + 4567890123456789 | 4567890123456789 + 4567890123456789 | 4567890123456789 + 4567890123456789 | 4567890123456789 + 4567890123456789 | +(20 rows) + +create temp table dual(); +insert into dual default values; +analyze dual; +select v.* from + (int8_tbl x left join (select q1,(select coalesce(q2,0)) q2 from int8_tbl) y on x.q2 = y.q1) + left join int4_tbl z on z.f1 = x.q2, + lateral (select x.q1,y.q1 from dual union all select x.q2,y.q2 from dual) v(vx,vy) + order by vx, vy; + vx | vy +-------------------+------------------- + -4567890123456789 | + 123 | 456 + 123 | 4567890123456789 + 123 | 4567890123456789 + 123 | 4567890123456789 + 123 | 4567890123456789 + 123 | + 456 | + 4567890123456789 | -4567890123456789 + 4567890123456789 | -4567890123456789 + 4567890123456789 | 123 + 4567890123456789 | 123 + 4567890123456789 | 123 + 4567890123456789 | 123 + 4567890123456789 | 4567890123456789 + 4567890123456789 | 4567890123456789 + 4567890123456789 | 4567890123456789 + 4567890123456789 | 4567890123456789 + 4567890123456789 | 4567890123456789 + 4567890123456789 | +(20 rows) + +explain (verbose, num_nodes off, nodes off, costs off) +select * from + int8_tbl a left join + lateral (select *, a.q2 as x from int8_tbl b) ss on a.q2 = ss.q1; + QUERY PLAN +------------------------------------------------ + Remote Subquery Scan on all + Output: a.q1, a.q2, b.q1, b.q2, a.q2 + -> Nested Loop Left Join + Output: a.q1, a.q2, b.q1, b.q2, (a.q2) + -> Seq Scan on public.int8_tbl a + Output: a.q1, a.q2 + -> Seq Scan on public.int8_tbl b + Output: b.q1, b.q2, a.q2 + Filter: (a.q2 = b.q1) +(9 rows) + +select * from + int8_tbl a left join + lateral (select *, a.q2 as x from int8_tbl b) ss on a.q2 = ss.q1; + q1 | q2 | q1 | q2 | x +------------------+-------------------+------------------+-------------------+------------------ + 123 | 456 | | | + 123 | 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 + 123 | 4567890123456789 | 
4567890123456789 | 4567890123456789 | 4567890123456789 + 123 | 4567890123456789 | 4567890123456789 | -4567890123456789 | 4567890123456789 + 4567890123456789 | 123 | 123 | 456 | 123 + 4567890123456789 | 123 | 123 | 4567890123456789 | 123 + 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 + 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 + 4567890123456789 | 4567890123456789 | 4567890123456789 | -4567890123456789 | 4567890123456789 + 4567890123456789 | -4567890123456789 | | | +(10 rows) + +--explain (verbose, costs off) +--select * from + --int8_tbl a left join + --lateral (select *, coalesce(a.q2, 42) as x from int8_tbl b) ss on a.q2 = ss.q1; +select * from + int8_tbl a left join + lateral (select *, coalesce(a.q2, 42) as x from int8_tbl b) ss on a.q2 = ss.q1; + q1 | q2 | q1 | q2 | x +------------------+-------------------+------------------+-------------------+------------------ + 123 | 456 | | | + 123 | 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 + 123 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 + 123 | 4567890123456789 | 4567890123456789 | -4567890123456789 | 4567890123456789 + 4567890123456789 | 123 | 123 | 456 | 123 + 4567890123456789 | 123 | 123 | 4567890123456789 | 123 + 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 + 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 + 4567890123456789 | 4567890123456789 | 4567890123456789 | -4567890123456789 | 4567890123456789 + 4567890123456789 | -4567890123456789 | | | +(10 rows) + +-- lateral can result in join conditions appearing below their +-- real semantic level +set enable_nestloop to on; +set enable_hashjoin to off; +set enable_mergejoin to off; +explain (num_nodes off, nodes off, verbose, costs off) +select * from int4_tbl i left join + lateral (select * from int2_tbl j where i.f1 = j.f1) k on true; + QUERY PLAN +------------------------------------------------- + Remote Subquery Scan on all + Output: i.f1, j.f1 + -> Nested Loop Left Join + Output: i.f1, j.f1 + Join Filter: (i.f1 = j.f1) + -> Remote Subquery Scan on all + Output: i.f1 + Distribute results by H: f1 + -> Seq Scan on public.int4_tbl i + Output: i.f1 + -> Materialize + Output: j.f1 + -> Seq Scan on public.int2_tbl j + Output: j.f1 +(14 rows) + +select * from int4_tbl i left join + lateral (select * from int2_tbl j where i.f1 = j.f1) k on true order by 1; + f1 | f1 +-------------+---- + -2147483647 | + -123456 | + 0 | 0 + 123456 | + 2147483647 | +(5 rows) + +reset enable_nestloop; +reset enable_hashjoin; +reset enable_mergejoin +explain (num_nodes off, nodes off, verbose, costs off) +select * from int4_tbl i left join + lateral (select coalesce(i) from int2_tbl j where i.f1 = j.f1) k on true; +ERROR: syntax error at or near "explain" +LINE 2: explain (num_nodes off, nodes off, verbose, costs off) + ^ +select * from int4_tbl i left join + lateral (select coalesce(i) from int2_tbl j where i.f1 = j.f1) k on true order by 1; + f1 | coalesce +-------------+---------- + -2147483647 | + -123456 | + 0 | (0) + 123456 | + 2147483647 | +(5 rows) + +set enable_hashjoin to off; +set enable_mergejoin to off; +explain (num_nodes off, nodes off, verbose, costs off) +select * from int4_tbl a, + lateral ( + select * from int4_tbl b left join int8_tbl c on (b.f1 = q1 and a.f1 = q2) + ) ss; + QUERY PLAN +------------------------------------------------------- + Remote Subquery Scan on all + Output: 
a.f1, f1, q1, q2 + -> Nested Loop + Output: a.f1, b.f1, c.q1, c.q2 + -> Seq Scan on public.int4_tbl a + Output: a.f1 + -> Nested Loop Left Join + Output: b.f1, c.q1, c.q2 + Join Filter: (b.f1 = c.q1) + -> Seq Scan on public.int4_tbl b + Output: b.f1 + -> Materialize + Output: c.q1, c.q2 + -> Seq Scan on public.int8_tbl c + Output: c.q1, c.q2 + Filter: (a.f1 = c.q2) +(16 rows) + +select * from int4_tbl a, + lateral ( + select * from int4_tbl b left join int8_tbl c on (b.f1 = q1 and a.f1 = q2) + ) ss order by 1,2,3,4; + f1 | f1 | q1 | q2 +-------------+-------------+----+---- + -2147483647 | -2147483647 | | + -2147483647 | -123456 | | + -2147483647 | 0 | | + -2147483647 | 123456 | | + -2147483647 | 2147483647 | | + -123456 | -2147483647 | | + -123456 | -123456 | | + -123456 | 0 | | + -123456 | 123456 | | + -123456 | 2147483647 | | + 0 | -2147483647 | | + 0 | -123456 | | + 0 | 0 | | + 0 | 123456 | | + 0 | 2147483647 | | + 123456 | -2147483647 | | + 123456 | -123456 | | + 123456 | 0 | | + 123456 | 123456 | | + 123456 | 2147483647 | | + 2147483647 | -2147483647 | | + 2147483647 | -123456 | | + 2147483647 | 0 | | + 2147483647 | 123456 | | + 2147483647 | 2147483647 | | +(25 rows) + +reset enable_hashjoin; +reset enable_mergejoin; +-- lateral reference in a PlaceHolderVar evaluated at join level +explain (num_nodes off, nodes off, verbose, costs off) +select * from + int8_tbl a left join lateral + (select b.q1 as bq1, c.q1 as cq1, least(a.q1,b.q1,c.q1) from + int8_tbl b cross join int8_tbl c) ss + on a.q2 = ss.bq1; + QUERY PLAN +------------------------------------------------------------------- + Remote Subquery Scan on all + Output: a.q1, a.q2, b.q1, c.q1, LEAST(a.q1, b.q1, c.q1) + -> Nested Loop Left Join + Output: a.q1, a.q2, b.q1, c.q1, (LEAST(a.q1, b.q1, c.q1)) + -> Seq Scan on public.int8_tbl a + Output: a.q1, a.q2 + -> Nested Loop + Output: b.q1, c.q1, LEAST(a.q1, b.q1, c.q1) + -> Seq Scan on public.int8_tbl b + Output: b.q1, b.q2 + Filter: (a.q2 = b.q1) + -> Seq Scan on public.int8_tbl c + Output: c.q1, c.q2 +(13 rows) + +select * from + int8_tbl a left join lateral + (select b.q1 as bq1, c.q1 as cq1, least(a.q1,b.q1,c.q1) from + int8_tbl b cross join int8_tbl c) ss + on a.q2 = ss.bq1; + q1 | q2 | bq1 | cq1 | least +------------------+-------------------+------------------+------------------+------------------ + 123 | 456 | | | + 123 | 4567890123456789 | 4567890123456789 | 123 | 123 + 123 | 4567890123456789 | 4567890123456789 | 123 | 123 + 123 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 + 123 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 + 123 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 + 123 | 4567890123456789 | 4567890123456789 | 123 | 123 + 123 | 4567890123456789 | 4567890123456789 | 123 | 123 + 123 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 + 123 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 + 123 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 + 123 | 4567890123456789 | 4567890123456789 | 123 | 123 + 123 | 4567890123456789 | 4567890123456789 | 123 | 123 + 123 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 + 123 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 + 123 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 + 4567890123456789 | 123 | 123 | 123 | 123 + 4567890123456789 | 123 | 123 | 123 | 123 + 4567890123456789 | 123 | 123 | 4567890123456789 | 123 + 4567890123456789 | 123 | 123 | 4567890123456789 | 123 + 4567890123456789 | 123 | 
123 | 4567890123456789 | 123 + 4567890123456789 | 123 | 123 | 123 | 123 + 4567890123456789 | 123 | 123 | 123 | 123 + 4567890123456789 | 123 | 123 | 4567890123456789 | 123 + 4567890123456789 | 123 | 123 | 4567890123456789 | 123 + 4567890123456789 | 123 | 123 | 4567890123456789 | 123 + 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 | 123 + 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 | 123 + 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 + 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 + 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 + 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 | 123 + 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 | 123 + 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 + 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 + 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 + 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 | 123 + 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 | 123 + 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 + 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 + 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 + 4567890123456789 | -4567890123456789 | | | +(42 rows) + +-- case requiring nested PlaceHolderVars +explain (num_nodes off, nodes off, verbose, costs off) +select * from + int8_tbl c left join ( + int8_tbl a left join (select q1, coalesce(q2,42) as x from int8_tbl b) ss1 + on a.q2 = ss1.q1 + cross join + lateral (select q1, coalesce(ss1.x,q2) as y from int8_tbl d) ss2 + ) on c.q2 = ss2.q1, + lateral (select ss2.y offset 0) ss3; + QUERY PLAN +-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + Nested Loop + Output: c.q1, c.q2, a.q1, a.q2, b.q1, (COALESCE(b.q2, '42'::bigint)), d.q1, (COALESCE(COALESCE(b.q2, '42'::bigint), d.q2)), ((COALESCE(COALESCE(b.q2, '42'::bigint), d.q2))) + -> Remote Subquery Scan on all + Output: c.q1, c.q2, a.q1, a.q2, b.q1, d.q1, COALESCE(b.q2, '42'::bigint), COALESCE(COALESCE(b.q2, '42'::bigint), d.q2) + -> Hash Right Join + Output: c.q1, c.q2, a.q1, a.q2, b.q1, d.q1, (COALESCE(b.q2, '42'::bigint)), (COALESCE((COALESCE(b.q2, '42'::bigint)), d.q2)) + Hash Cond: (d.q1 = c.q2) + -> Nested Loop + Output: a.q1, a.q2, b.q1, d.q1, (COALESCE(b.q2, '42'::bigint)), (COALESCE((COALESCE(b.q2, '42'::bigint)), d.q2)) + -> Hash Left Join + Output: a.q1, a.q2, b.q1, (COALESCE(b.q2, '42'::bigint)) + Hash Cond: (a.q2 = b.q1) + -> Seq Scan on public.int8_tbl a + Output: a.q1, a.q2 + -> Hash + Output: b.q1, (COALESCE(b.q2, '42'::bigint)) + -> Seq Scan on public.int8_tbl b + Output: b.q1, COALESCE(b.q2, '42'::bigint) + -> Seq Scan on public.int8_tbl d + Output: d.q1, COALESCE((COALESCE(b.q2, '42'::bigint)), d.q2) + -> Hash + Output: c.q1, c.q2 + -> Seq Scan on public.int8_tbl c + Output: c.q1, c.q2 + -> Result + Output: (COALESCE(COALESCE(b.q2, '42'::bigint), d.q2)) +(26 rows) + +-- case that breaks the old ph_may_need optimization +explain (num_nodes off, nodes off, verbose, costs off) +select c.*,a.*,ss1.q1,ss2.q1,ss3.* from + int8_tbl 
c left join ( + int8_tbl a left join + (select q1, coalesce(q2,f1) as x from int8_tbl b, int4_tbl b2 + where q1 < f1) ss1 + on a.q2 = ss1.q1 + cross join + lateral (select q1, coalesce(ss1.x,q2) as y from int8_tbl d) ss2 + ) on c.q2 = ss2.q1, + lateral (select * from int4_tbl i where ss2.y > f1) ss3; + QUERY PLAN +--------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: c.q1, c.q2, a.q1, a.q2, b.q1, d.q1, i.f1 + -> Nested Loop + Output: c.q1, c.q2, a.q1, a.q2, b.q1, d.q1, i.f1 + Join Filter: ((COALESCE((COALESCE(b.q2, (b2.f1)::bigint)), d.q2)) > i.f1) + -> Hash Right Join + Output: c.q1, c.q2, a.q1, a.q2, b.q1, d.q1, (COALESCE((COALESCE(b.q2, (b2.f1)::bigint)), d.q2)) + Hash Cond: (d.q1 = c.q2) + -> Nested Loop + Output: a.q1, a.q2, b.q1, d.q1, (COALESCE((COALESCE(b.q2, (b2.f1)::bigint)), d.q2)) + -> Hash Right Join + Output: a.q1, a.q2, b.q1, (COALESCE(b.q2, (b2.f1)::bigint)) + Hash Cond: (b.q1 = a.q2) + -> Nested Loop + Output: b.q1, COALESCE(b.q2, (b2.f1)::bigint) + Join Filter: (b.q1 < b2.f1) + -> Seq Scan on public.int8_tbl b + Output: b.q1, b.q2 + -> Materialize + Output: b2.f1 + -> Seq Scan on public.int4_tbl b2 + Output: b2.f1 + -> Hash + Output: a.q1, a.q2 + -> Seq Scan on public.int8_tbl a + Output: a.q1, a.q2 + -> Seq Scan on public.int8_tbl d + Output: d.q1, COALESCE((COALESCE(b.q2, (b2.f1)::bigint)), d.q2) + -> Hash + Output: c.q1, c.q2 + -> Seq Scan on public.int8_tbl c + Output: c.q1, c.q2 + -> Materialize + Output: i.f1 + -> Seq Scan on public.int4_tbl i + Output: i.f1 +(36 rows) + +-- check processing of postponed quals (bug #9041) +explain (num_nodes off, nodes off, verbose, costs off) +select * from + (select 1 as x offset 0) x cross join (select 2 as y offset 0) y + left join lateral ( + select * from (select 3 as z offset 0) z where z.z = x.x + ) zz on zz.z = y.y; + QUERY PLAN +---------------------------------------------- + Nested Loop Left Join + Output: (1), (2), (3) + Join Filter: (((3) = (1)) AND ((3) = (2))) + -> Nested Loop + Output: (1), (2) + -> Result + Output: 1 + -> Result + Output: 2 + -> Result + Output: 3 +(11 rows) + +-- check we don't try to do a unique-ified semijoin with LATERAL +explain (verbose, costs off, nodes off) +select * from + (values (0,9998), (1,1000)) v(id,x), + lateral (select f1 from int4_tbl + where f1 = any (select unique1 from tenk1 + where unique2 = v.x offset 0)) ss; + QUERY PLAN +---------------------------------------------------------------------------------- + Nested Loop + Output: "*VALUES*".column1, "*VALUES*".column2, int4_tbl.f1 + -> Values Scan on "*VALUES*" + Output: "*VALUES*".column1, "*VALUES*".column2 + -> Materialize + Output: int4_tbl.f1 + -> Remote Subquery Scan on all + Output: int4_tbl.f1 + -> Nested Loop + Output: int4_tbl.f1 + Join Filter: (int4_tbl.f1 = tenk1.unique1) + -> HashAggregate + Output: tenk1.unique1 + Group Key: tenk1.unique1 + -> Index Scan using tenk1_unique2 on public.tenk1 + Output: tenk1.unique1 + Index Cond: (tenk1.unique2 = "*VALUES*".column2) + -> Seq Scan on public.int4_tbl + Output: int4_tbl.f1 +(19 rows) + +select * from + (values (0,9998), (1,1000)) v(id,x), + lateral (select f1 from int4_tbl + where f1 = any (select unique1 from tenk1 + where unique2 = v.x offset 0)) ss; + id | x | f1 +----+------+---- + 0 | 9998 | 0 +(1 row) + +-- check proper extParam/allParam handling (this isn't exactly a LATERAL issue, +-- but we can make the test case much more compact with LATERAL) +explain (verbose, 
costs off) +select * from (values (0), (1)) v(id), +lateral (select * from int8_tbl t1, + lateral (select * from + (select * from int8_tbl t2 + where q1 = any (select q2 from int8_tbl t3 + where q2 = (select greatest(t1.q1,t2.q2)) + and (select v.id=0)) offset 0) ss2) ss + where t1.q1 = ss.q2) ss0; + QUERY PLAN +----------------------------------------------------------------------------------- + Nested Loop + Output: "*VALUES*".column1, t1.q1, t1.q2, ss2.q1, ss2.q2 + -> Values Scan on "*VALUES*" + Output: "*VALUES*".column1 + -> Materialize + Output: t1.q1, t1.q2, ss2.q1, ss2.q2 + -> Remote Subquery Scan on all (datanode_1) + Output: t1.q1, t1.q2, ss2.q1, ss2.q2 + -> Nested Loop + Output: t1.q1, t1.q2, ss2.q1, ss2.q2 + -> Seq Scan on public.int8_tbl t1 + Output: t1.q1, t1.q2 + -> Subquery Scan on ss2 + Output: ss2.q1, ss2.q2 + Filter: (t1.q1 = ss2.q2) + -> Seq Scan on public.int8_tbl t2 + Output: t2.q1, t2.q2 + Filter: (SubPlan 3) + SubPlan 3 + -> Remote Subquery Scan on all (datanode_1) + Output: t3.q2 + -> Result + Output: t3.q2 + One-Time Filter: $4 + InitPlan 1 (returns $2) + -> Result + Output: GREATEST($0, t2.q2) + InitPlan 2 (returns $4) + -> Result + Output: ($3 = 0) + -> Seq Scan on public.int8_tbl t3 + Output: t3.q1, t3.q2 + Filter: (t3.q2 = $2) +(33 rows) + +select * from (values (0), (1)) v(id), +lateral (select * from int8_tbl t1, + lateral (select * from + (select * from int8_tbl t2 + where q1 = any (select q2 from int8_tbl t3 + where q2 = (select greatest(t1.q1,t2.q2)) + and (select v.id=0)) offset 0) ss2) ss + where t1.q1 = ss.q2) ss0; + id | q1 | q2 | q1 | q2 +----+------------------+-------------------+------------------+------------------ + 0 | 4567890123456789 | 123 | 4567890123456789 | 4567890123456789 + 0 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 + 0 | 4567890123456789 | -4567890123456789 | 4567890123456789 | 4567890123456789 +(3 rows) + +-- test some error cases where LATERAL should have been used but wasn't +select f1,g from int4_tbl a, (select f1 as g) ss; +ERROR: column "f1" does not exist +LINE 1: select f1,g from int4_tbl a, (select f1 as g) ss; + ^ +HINT: There is a column named "f1" in table "a", but it cannot be referenced from this part of the query. +select f1,g from int4_tbl a, (select a.f1 as g) ss; +ERROR: invalid reference to FROM-clause entry for table "a" +LINE 1: select f1,g from int4_tbl a, (select a.f1 as g) ss; + ^ +HINT: There is an entry for table "a", but it cannot be referenced from this part of the query. +select f1,g from int4_tbl a cross join (select f1 as g) ss; +ERROR: column "f1" does not exist +LINE 1: select f1,g from int4_tbl a cross join (select f1 as g) ss; + ^ +HINT: There is a column named "f1" in table "a", but it cannot be referenced from this part of the query. +select f1,g from int4_tbl a cross join (select a.f1 as g) ss; +ERROR: invalid reference to FROM-clause entry for table "a" +LINE 1: select f1,g from int4_tbl a cross join (select a.f1 as g) ss... + ^ +HINT: There is an entry for table "a", but it cannot be referenced from this part of the query. +-- SQL:2008 says the left table is in scope but illegal to access here +select f1,g from int4_tbl a right join lateral generate_series(0, a.f1) g on true; +ERROR: invalid reference to FROM-clause entry for table "a" +LINE 1: ... int4_tbl a right join lateral generate_series(0, a.f1) g on... + ^ +DETAIL: The combining JOIN type must be INNER or LEFT for a LATERAL reference. 
+select f1,g from int4_tbl a full join lateral generate_series(0, a.f1) g on true; +ERROR: invalid reference to FROM-clause entry for table "a" +LINE 1: ...m int4_tbl a full join lateral generate_series(0, a.f1) g on... + ^ +DETAIL: The combining JOIN type must be INNER or LEFT for a LATERAL reference. +-- check we complain about ambiguous table references +select * from + int8_tbl x cross join (int4_tbl x cross join lateral (select x.f1) ss); +ERROR: table reference "x" is ambiguous +LINE 2: ...cross join (int4_tbl x cross join lateral (select x.f1) ss); + ^ +-- LATERAL can be used to put an aggregate into the FROM clause of its query +select 1 from tenk1 a, lateral (select max(a.unique1) from int4_tbl b) ss; +ERROR: aggregate functions are not allowed in FROM clause of their own query level +LINE 1: select 1 from tenk1 a, lateral (select max(a.unique1) from i... + ^ +-- check behavior of LATERAL in UPDATE/DELETE +create temp table xx1 as select f1 as x1, -f1 as x2 from int4_tbl; +-- error, can't do this: +update xx1 set x2 = f1 from (select * from int4_tbl where f1 = x1) ss; +ERROR: column "x1" does not exist +LINE 1: ... set x2 = f1 from (select * from int4_tbl where f1 = x1) ss; + ^ +HINT: There is a column named "x1" in table "xx1", but it cannot be referenced from this part of the query. +update xx1 set x2 = f1 from (select * from int4_tbl where f1 = xx1.x1) ss; +ERROR: invalid reference to FROM-clause entry for table "xx1" +LINE 1: ...t x2 = f1 from (select * from int4_tbl where f1 = xx1.x1) ss... + ^ +HINT: There is an entry for table "xx1", but it cannot be referenced from this part of the query. +-- can't do it even with LATERAL: +update xx1 set x2 = f1 from lateral (select * from int4_tbl where f1 = x1) ss; +ERROR: invalid reference to FROM-clause entry for table "xx1" +LINE 1: ...= f1 from lateral (select * from int4_tbl where f1 = x1) ss; + ^ +HINT: There is an entry for table "xx1", but it cannot be referenced from this part of the query. +-- we might in future allow something like this, but for now it's an error: +update xx1 set x2 = f1 from xx1, lateral (select * from int4_tbl where f1 = x1) ss; +ERROR: table name "xx1" specified more than once +-- also errors: +delete from xx1 using (select * from int4_tbl where f1 = x1) ss; +ERROR: column "x1" does not exist +LINE 1: ...te from xx1 using (select * from int4_tbl where f1 = x1) ss; + ^ +HINT: There is a column named "x1" in table "xx1", but it cannot be referenced from this part of the query. +delete from xx1 using (select * from int4_tbl where f1 = xx1.x1) ss; +ERROR: invalid reference to FROM-clause entry for table "xx1" +LINE 1: ...from xx1 using (select * from int4_tbl where f1 = xx1.x1) ss... + ^ +HINT: There is an entry for table "xx1", but it cannot be referenced from this part of the query. +delete from xx1 using lateral (select * from int4_tbl where f1 = x1) ss; +ERROR: invalid reference to FROM-clause entry for table "xx1" +LINE 1: ...xx1 using lateral (select * from int4_tbl where f1 = x1) ss; + ^ +HINT: There is an entry for table "xx1", but it cannot be referenced from this part of the query. 
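The failing statements above all try to reach the update target xx1 from a FROM/USING item, which is exactly what the parser rejects here, even with LATERAL. A correlated subquery in the SET list expresses the same update without referencing xx1 in FROM; the following is only an illustrative sketch using the same xx1 and int4_tbl tables and is not part of the regression script:

-- set x2 from the matching int4_tbl row; rows without a match are left unchanged
update xx1
   set x2 = (select f1 from int4_tbl where f1 = xx1.x1)
 where exists (select 1 from int4_tbl where f1 = xx1.x1);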
+-- demonstrate problem with extrememly slow join +CREATE TABLE testr (a int, b int) DISTRIBUTE BY REPLICATION; +INSERT INTO testr SELECT generate_series(1, 10000), generate_series(5001, 15000); +CREATE TABLE testh (a int, b int); +INSERT INTO testh SELECT generate_series(1, 10000), generate_series(8001, 18000); +set enable_mergejoin TO false; +set enable_hashjoin TO false; +EXPLAIN (VERBOSE, COSTS OFF) SELECT count(*) FROM testr WHERE NOT EXISTS (SELECT * FROM testh WHERE testr.b = testh.b); + QUERY PLAN +----------------------------------------------------------------------------------- + Finalize Aggregate + Output: count(*) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: PARTIAL count(*) + -> Partial Aggregate + Output: PARTIAL count(*) + -> Nested Loop Anti Join + Join Filter: (testr.b = testh.b) + -> Remote Subquery Scan on all (datanode_1) + Output: testr.b + Distribute results by H: b + -> Seq Scan on public.testr + Output: testr.b + -> Materialize + Output: testh.b + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: testh.b + Distribute results by H: b + -> Seq Scan on public.testh + Output: testh.b +(20 rows) + +SELECT count(*) FROM testr WHERE NOT EXISTS (SELECT * FROM testh WHERE testr.b = testh.b); + count +------- + 3000 +(1 row) + +-- +-- test LATERAL reference propagation down a multi-level inheritance hierarchy +-- produced for a multi-level partitioned table hierarchy. +-- +create table pt1 (a int, b int, c varchar) partition by range(a); +create table pt1p1 partition of pt1 for values from (0) to (100) partition by range(b); +create table pt1p2 partition of pt1 for values from (100) to (200); +create table pt1p1p1 partition of pt1p1 for values from (0) to (100); +insert into pt1 values (1, 1, 'x'), (101, 101, 'y'); +create table ut1 (a int, b int, c varchar); +insert into ut1 values (101, 101, 'y'), (2, 2, 'z'); +explain (verbose, costs off) +select t1.b, ss.phv from ut1 t1 left join lateral + (select t2.a as t2a, t3.a t3a, least(t1.a, t2.a, t3.a) phv + from pt1 t2 join ut1 t3 on t2.a = t3.b) ss + on t1.a = ss.t2a order by t1.a; + QUERY PLAN +----------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + Output: t1.b, LEAST(t1.a, a, t3.a), t1.a + Sort Key: t1.a + -> Sort + Output: t1.b, (LEAST(t1.a, a, t3.a)), t1.a + Sort Key: t1.a + -> Nested Loop Left Join + Output: t1.b, (LEAST(t1.a, a, t3.a)), t1.a + -> Seq Scan on public.ut1 t1 + Output: t1.a, t1.b, t1.c + -> Materialize + Output: a, (LEAST(t1.a, a, t3.a)) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: a, LEAST(t1.a, a, t3.a) + Distribute results by H: a + -> Nested Loop + Output: a, LEAST(t1.a, a, t3.a) + Join Filter: (a = t3.b) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: t3.b, t3.a + Distribute results by H: b + -> Seq Scan on public.ut1 t3 + Output: t3.b, t3.a + -> Materialize + Output: a + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: a + Distribute results by H: a + -> Append + -> Seq Scan on public.pt1p1p1 t2 + Output: t2.a + Filter: (t1.a = t2.a) + -> Seq Scan on public.pt1p2 t2_1 + Output: t2_1.a + Filter: (t1.a = t2_1.a) +(35 rows) + +select t1.b, ss.phv from ut1 t1 left join lateral + (select t2.a as t2a, t3.a t3a, least(t1.a, t2.a, t3.a) phv + from pt1 t2 join ut1 t3 on t2.a = t3.b) ss + on t1.a = ss.t2a order by t1.a; + b | phv +-----+----- + 2 | + 101 | 101 +(2 rows) + +drop table pt1; +drop table ut1; +-- +-- test that 
foreign key join estimation performs sanely for outer joins +-- +begin; +create table fkest (a int, b int, c int unique, primary key(a,b)); +create table fkest1 (a int, b int, primary key(a,b)); +insert into fkest select x/10, x%10, x from generate_series(1,2000) x; +insert into fkest1 select x/10, x%10 from generate_series(1,2000) x; +alter table fkest1 + add constraint fkest1_a_b_fkey foreign key (a,b) references fkest; +analyze fkest; +analyze fkest1; +explain (costs off) +select * +from fkest f + left join fkest1 f1 on f.a = f1.a and f.b = f1.b + left join fkest1 f2 on f.a = f2.a and f.b = f2.b + left join fkest1 f3 on f.a = f3.a and f.b = f3.b +where f.c = 1; + QUERY PLAN +------------------------------------------------------------------------ + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Nested Loop Left Join + -> Nested Loop Left Join + -> Nested Loop Left Join + -> Remote Subquery Scan on all (datanode_1) + Distribute results by H: a + -> Index Scan using fkest_c_key on fkest f + Index Cond: (c = 1) + -> Index Only Scan using fkest1_pkey on fkest1 f1 + Index Cond: ((a = f.a) AND (b = f.b)) + -> Index Only Scan using fkest1_pkey on fkest1 f2 + Index Cond: ((a = f.a) AND (b = f.b)) + -> Index Only Scan using fkest1_pkey on fkest1 f3 + Index Cond: ((a = f.a) AND (b = f.b)) +(14 rows) + +rollback; +-- +-- test planner's ability to mark joins as unique +-- +create table j1 (id int primary key); +create table j2 (id int primary key); +create table j3 (id int); +insert into j1 values(1),(2),(3); +insert into j2 values(1),(2),(3); +insert into j3 values(1),(1); +analyze j1; +analyze j2; +analyze j3; +-- ensure join is properly marked as unique +explain (verbose, costs off) +select * from j1 inner join j2 on j1.id = j2.id; + QUERY PLAN +---------------------------------------------------------------------------- + Remote Fast Query Execution + Output: j1.id, j2.id + Node/s: datanode_1, datanode_2 + Remote query: SELECT j1.id, j2.id FROM (j1 JOIN j2 ON ((j1.id = j2.id))) + -> Nested Loop + Output: j1.id, j2.id + Inner Unique: true + -> Seq Scan on public.j1 + Output: j1.id + -> Bitmap Heap Scan on public.j2 + Output: j2.id + Recheck Cond: (j2.id = j1.id) + -> Bitmap Index Scan on j2_pkey + Index Cond: (j2.id = j1.id) +(14 rows) + +-- ensure join is not unique when not an equi-join +explain (verbose, costs off) +select * from j1 inner join j2 on j1.id > j2.id; + QUERY PLAN +----------------------------------------------------------------- + Nested Loop + Output: j1.id, j2.id + Join Filter: (j1.id > j2.id) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: j1.id + -> Bitmap Heap Scan on public.j1 + Output: j1.id + -> Bitmap Index Scan on j1_pkey + -> Materialize + Output: j2.id + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: j2.id + -> Bitmap Heap Scan on public.j2 + Output: j2.id + -> Bitmap Index Scan on j2_pkey +(15 rows) + +-- ensure non-unique rel is not chosen as inner +explain (verbose, costs off) +select * from j1 inner join j3 on j1.id = j3.id; + QUERY PLAN +---------------------------------------------------------------------------- + Remote Fast Query Execution + Output: j1.id, j3.id + Node/s: datanode_1, datanode_2 + Remote query: SELECT j1.id, j3.id FROM (j1 JOIN j3 ON ((j1.id = j3.id))) + -> Nested Loop + Output: j1.id, j3.id + Inner Unique: true + -> Seq Scan on public.j3 + Output: j3.id + -> Bitmap Heap Scan on public.j1 + Output: j1.id + Recheck Cond: (j1.id = j3.id) + -> Bitmap Index Scan on j1_pkey + Index Cond: (j1.id = 
j3.id) +(14 rows) + +-- ensure left join is marked as unique +explain (verbose, costs off) +select * from j1 left join j2 on j1.id = j2.id; + QUERY PLAN +--------------------------------------------------------------------------------- + Remote Fast Query Execution + Output: j1.id, j2.id + Node/s: datanode_1, datanode_2 + Remote query: SELECT j1.id, j2.id FROM (j1 LEFT JOIN j2 ON ((j1.id = j2.id))) + -> Nested Loop Left Join + Output: j1.id, j2.id + Inner Unique: true + -> Seq Scan on public.j1 + Output: j1.id + -> Bitmap Heap Scan on public.j2 + Output: j2.id + Recheck Cond: (j1.id = j2.id) + -> Bitmap Index Scan on j2_pkey + Index Cond: (j1.id = j2.id) +(14 rows) + +-- ensure right join is marked as unique +explain (verbose, costs off) +select * from j1 right join j2 on j1.id = j2.id; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + Output: j1.id, j2.id + -> Nested Loop Left Join + Output: j1.id, j2.id + Inner Unique: true + -> Seq Scan on public.j2 + Output: j2.id + -> Bitmap Heap Scan on public.j1 + Output: j1.id + Recheck Cond: (j1.id = j2.id) + -> Bitmap Index Scan on j1_pkey + Index Cond: (j1.id = j2.id) +(12 rows) + +-- ensure full join is marked as unique +explain (verbose, costs off) +select * from j1 full join j2 on j1.id = j2.id; + QUERY PLAN +--------------------------------------------------------------------------------- + Remote Fast Query Execution + Output: j1.id, j2.id + Node/s: datanode_1, datanode_2 + Remote query: SELECT j1.id, j2.id FROM (j1 FULL JOIN j2 ON ((j1.id = j2.id))) + -> Hash Full Join + Output: j1.id, j2.id + Inner Unique: true + Hash Cond: (j1.id = j2.id) + -> Seq Scan on public.j1 + Output: j1.id + -> Hash + Output: j2.id + -> Seq Scan on public.j2 + Output: j2.id +(14 rows) + +-- a clauseless (cross) join can't be unique +explain (verbose, costs off) +select * from j1 cross join j2; + QUERY PLAN +----------------------------------------------------------------- + Nested Loop + Output: j1.id, j2.id + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: j1.id + -> Bitmap Heap Scan on public.j1 + Output: j1.id + -> Bitmap Index Scan on j1_pkey + -> Materialize + Output: j2.id + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: j2.id + -> Bitmap Heap Scan on public.j2 + Output: j2.id + -> Bitmap Index Scan on j2_pkey +(14 rows) + +-- ensure a natural join is marked as unique +explain (verbose, costs off) +select * from j1 natural join j2; + QUERY PLAN +----------------------------------------------------------- + Remote Fast Query Execution + Output: j1.id + Node/s: datanode_1, datanode_2 + Remote query: SELECT j1.id FROM (j1 JOIN j2 USING (id)) + -> Nested Loop + Output: j1.id + Inner Unique: true + -> Seq Scan on public.j1 + Output: j1.id + -> Bitmap Heap Scan on public.j2 + Output: j2.id + Recheck Cond: (j2.id = j1.id) + -> Bitmap Index Scan on j2_pkey + Index Cond: (j2.id = j1.id) +(14 rows) + +-- ensure a distinct clause allows the inner to become unique +explain (verbose, costs off) +select * from j1 +inner join (select distinct id from j3) j3 on j1.id = j3.id; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + Output: j1.id, id + -> Nested Loop + Output: j1.id, j3.id + Inner Unique: true + -> Unique + Output: j3.id + -> Sort + Output: j3.id + Sort Key: j3.id + -> Seq Scan on public.j3 + Output: j3.id + -> Bitmap Heap Scan on public.j1 + Output: j1.id + Recheck Cond: (j1.id = 
j3.id) + -> Bitmap Index Scan on j1_pkey + Index Cond: (j1.id = j3.id) +(17 rows) + +-- ensure group by clause allows the inner to become unique +explain (verbose, costs off) +select * from j1 +inner join (select id from j3 group by id) j3 on j1.id = j3.id; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + Output: j1.id, id + -> Nested Loop + Output: j1.id, j3.id + Inner Unique: true + -> Group + Output: j3.id + Group Key: j3.id + -> Sort + Output: j3.id + Sort Key: j3.id + -> Seq Scan on public.j3 + Output: j3.id + -> Bitmap Heap Scan on public.j1 + Output: j1.id + Recheck Cond: (j1.id = j3.id) + -> Bitmap Index Scan on j1_pkey + Index Cond: (j1.id = j3.id) +(18 rows) + +drop table j1; +drop table j2; +drop table j3; +-- test more complex permutations of unique joins +create table j1 (id1 int, id2 int, primary key(id1,id2)); +create table j2 (id1 int, id2 int, primary key(id1,id2)); +create table j3 (id1 int, id2 int, primary key(id1,id2)); +insert into j1 values(1,1),(1,2); +insert into j2 values(1,1); +insert into j3 values(1,1); +analyze j1; +analyze j2; +analyze j3; +-- ensure there's no unique join when not all columns which are part of the +-- unique index are seen in the join clause +explain (verbose, costs off) +select * from j1 +inner join j2 on j1.id1 = j2.id1; + QUERY PLAN +------------------------------------------------------------------------------------------------ + Remote Fast Query Execution + Output: j1.id1, j1.id2, j2.id1, j2.id2 + Node/s: datanode_1, datanode_2 + Remote query: SELECT j1.id1, j1.id2, j2.id1, j2.id2 FROM (j1 JOIN j2 ON ((j1.id1 = j2.id1))) + -> Nested Loop + Output: j1.id1, j1.id2, j2.id1, j2.id2 + Join Filter: (j1.id1 = j2.id1) + -> Index Only Scan using j2_pkey on public.j2 + Output: j2.id1, j2.id2 + -> Seq Scan on public.j1 + Output: j1.id1, j1.id2 +(11 rows) + +-- ensure proper unique detection with multiple join quals +explain (verbose, costs off) +select * from j1 +inner join j2 on j1.id1 = j2.id1 and j1.id2 = j2.id2; + QUERY PLAN +------------------------------------------------------------------------------------------------------------------------ + Remote Fast Query Execution + Output: j1.id1, j1.id2, j2.id1, j2.id2 + Node/s: datanode_1, datanode_2 + Remote query: SELECT j1.id1, j1.id2, j2.id1, j2.id2 FROM (j1 JOIN j2 ON (((j1.id1 = j2.id1) AND (j1.id2 = j2.id2)))) + -> Nested Loop + Output: j1.id1, j1.id2, j2.id1, j2.id2 + Inner Unique: true + -> Index Only Scan using j2_pkey on public.j2 + Output: j2.id1, j2.id2 + -> Bitmap Heap Scan on public.j1 + Output: j1.id1, j1.id2 + Recheck Cond: ((j1.id1 = j2.id1) AND (j1.id2 = j2.id2)) + -> Bitmap Index Scan on j1_pkey + Index Cond: ((j1.id1 = j2.id1) AND (j1.id2 = j2.id2)) +(14 rows) + +-- ensure we don't detect the join to be unique when quals are not part of the +-- join condition +explain (verbose, costs off) +select * from j1 +inner join j2 on j1.id1 = j2.id1 where j1.id2 = 1; + QUERY PLAN +------------------------------------------------------------------------------------------------------------------- + Remote Fast Query Execution + Output: j1.id1, j1.id2, j2.id1, j2.id2 + Node/s: datanode_1, datanode_2 + Remote query: SELECT j1.id1, j1.id2, j2.id1, j2.id2 FROM (j1 JOIN j2 ON ((j1.id1 = j2.id1))) WHERE (j1.id2 = 1) + -> Nested Loop + Output: j1.id1, j1.id2, j2.id1, j2.id2 + Inner Unique: true + -> Index Only Scan using j2_pkey on public.j2 + Output: j2.id1, j2.id2 + -> Bitmap Heap Scan on public.j1 + Output: j1.id1, 
j1.id2 + Recheck Cond: ((j1.id1 = j2.id1) AND (j1.id2 = 1)) + -> Bitmap Index Scan on j1_pkey + Index Cond: ((j1.id1 = j2.id1) AND (j1.id2 = 1)) +(14 rows) + +-- as above, but for left joins. +explain (verbose, costs off) +select * from j1 +left join j2 on j1.id1 = j2.id1 where j1.id2 = 1; + QUERY PLAN +------------------------------------------------------------------------------------------------------------------------ + Remote Fast Query Execution + Output: j1.id1, j1.id2, j2.id1, j2.id2 + Node/s: datanode_1, datanode_2 + Remote query: SELECT j1.id1, j1.id2, j2.id1, j2.id2 FROM (j1 LEFT JOIN j2 ON ((j1.id1 = j2.id1))) WHERE (j1.id2 = 1) + -> Nested Loop Left Join + Output: j1.id1, j1.id2, j2.id1, j2.id2 + Join Filter: (j1.id1 = j2.id1) + -> Bitmap Heap Scan on public.j1 + Output: j1.id1, j1.id2 + Recheck Cond: (j1.id2 = 1) + -> Bitmap Index Scan on j1_pkey + Index Cond: (j1.id2 = 1) + -> Index Only Scan using j2_pkey on public.j2 + Output: j2.id1, j2.id2 +(14 rows) + +-- validate logic in merge joins which skips mark and restore. +-- it should only do this if all quals which were used to detect the unique +-- are present as join quals, and not plain quals. +set enable_nestloop to 0; +set enable_hashjoin to 0; +set enable_sort to 0; +-- create an index that will be preferred over the PK to perform the join +create index j1_id1_idx on j1 (id1) where id1 % 1000 = 1; +explain (costs off) select * from j1 j1 +inner join j1 j2 on j1.id1 = j2.id1 and j1.id2 = j2.id2 +where j1.id1 % 1000 = 1 and j2.id1 % 1000 = 1; + QUERY PLAN +---------------------------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Nested Loop + Join Filter: ((j1.id1 = j2.id1) AND (j1.id2 = j2.id2)) + -> Bitmap Heap Scan on j1 + Recheck Cond: ((id1 % 1000) = 1) + -> Bitmap Index Scan on j1_id1_idx + -> Bitmap Heap Scan on j1 j2 + Recheck Cond: ((id1 % 1000) = 1) + -> Bitmap Index Scan on j1_id1_idx +(10 rows) + +select * from j1 j1 +inner join j1 j2 on j1.id1 = j2.id1 and j1.id2 = j2.id2 +where j1.id1 % 1000 = 1 and j2.id1 % 1000 = 1; + id1 | id2 | id1 | id2 +-----+-----+-----+----- + 1 | 1 | 1 | 1 + 1 | 2 | 1 | 2 +(2 rows) + +reset enable_nestloop; +reset enable_hashjoin; +reset enable_sort; +drop table j1; +drop table j2; +drop table j3; +-- check that semijoin inner is not seen as unique for a portion of the outerrel +explain (verbose, costs off) +select t1.unique1, t2.hundred +from onek t1, tenk1 t2 +where exists (select 1 from tenk1 t3 + where t3.thousand = t1.unique1 and t3.tenthous = t2.hundred) + and t1.unique1 < 1; + QUERY PLAN +--------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + Output: t1.unique1, t2.hundred + -> Nested Loop + Output: t1.unique1, t2.hundred + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: t1.unique1, t3.tenthous + Distribute results by H: tenthous + -> Hash Join + Output: t1.unique1, t3.tenthous + Hash Cond: (t3.thousand = t1.unique1) + -> HashAggregate + Output: t3.thousand, t3.tenthous + Group Key: t3.thousand, t3.tenthous + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: t3.thousand, t3.tenthous + Distribute results by H: thousand + -> HashAggregate + Output: t3.thousand, t3.tenthous + Group Key: t3.thousand, t3.tenthous + -> Index Only Scan using tenk1_thous_tenthous on public.tenk1 t3 + Output: t3.thousand, t3.tenthous + -> Hash + Output: t1.unique1 + -> Remote Subquery Scan on all 
(datanode_1,datanode_2) + Output: t1.unique1 + Sort Key: t1.unique1 + -> Index Only Scan using onek_unique1 on public.onek t1 + Output: t1.unique1 + Index Cond: (t1.unique1 < 1) + -> Materialize + Output: t2.hundred + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: t2.hundred + Distribute results by H: hundred + Sort Key: t2.hundred + -> Index Only Scan using tenk1_hundred on public.tenk1 t2 + Output: t2.hundred + Index Cond: (t2.hundred = t3.tenthous) +(38 rows) + +-- ... unless it actually is unique +create table j3 as select unique1, tenthous from onek; +vacuum analyze j3; +create unique index on j3(unique1, tenthous); +explain (verbose, costs off) +select t1.unique1, t2.hundred +from onek t1, tenk1 t2 +where exists (select 1 from j3 + where j3.unique1 = t1.unique1 and j3.tenthous = t2.hundred) + and t1.unique1 < 1; + QUERY PLAN +------------------------------------------------------------------------------------ + Remote Subquery Scan on all (datanode_1,datanode_2) + Output: t1.unique1, t2.hundred + -> Nested Loop + Output: t1.unique1, t2.hundred + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: t1.unique1, j3.tenthous + -> Nested Loop + Output: t1.unique1, j3.tenthous + -> Index Only Scan using onek_unique1 on public.onek t1 + Output: t1.unique1 + Index Cond: (t1.unique1 < 1) + -> Index Only Scan using j3_unique1_tenthous_idx on public.j3 + Output: j3.unique1, j3.tenthous + Index Cond: (j3.unique1 = t1.unique1) + -> Index Only Scan using tenk1_hundred on public.tenk1 t2 + Output: t2.hundred + Index Cond: (t2.hundred = j3.tenthous) +(17 rows) + +drop table j3; diff --git a/src/test/regress/expected/partition_join_2.out b/src/test/regress/expected/partition_join_2.out new file mode 100644 index 00000000..8a414251 --- /dev/null +++ b/src/test/regress/expected/partition_join_2.out @@ -0,0 +1,1819 @@ +-- +-- PARTITION_JOIN +-- Test partition-wise join between partitioned tables +-- +-- Enable partition-wise join, which by default is disabled. 
+--SET enable_partition_wise_join to true; +-- +-- partitioned by a single column +-- +CREATE TABLE prt1 (a int, b int, c varchar) PARTITION BY RANGE(a); +CREATE TABLE prt1_p1 PARTITION OF prt1 FOR VALUES FROM (0) TO (250); +CREATE TABLE prt1_p3 PARTITION OF prt1 FOR VALUES FROM (500) TO (600); +CREATE TABLE prt1_p2 PARTITION OF prt1 FOR VALUES FROM (250) TO (500); +INSERT INTO prt1 SELECT i, i % 25, to_char(i, 'FM0000') FROM generate_series(0, 599) i WHERE i % 2 = 0; +CREATE INDEX iprt1_p1_a on prt1_p1(a); +CREATE INDEX iprt1_p2_a on prt1_p2(a); +CREATE INDEX iprt1_p3_a on prt1_p3(a); +ANALYZE prt1; +CREATE TABLE prt2 (a int, b int, c varchar) PARTITION BY RANGE(b); +CREATE TABLE prt2_p1 PARTITION OF prt2 FOR VALUES FROM (0) TO (250); +CREATE TABLE prt2_p2 PARTITION OF prt2 FOR VALUES FROM (250) TO (500); +CREATE TABLE prt2_p3 PARTITION OF prt2 FOR VALUES FROM (500) TO (600); +INSERT INTO prt2 SELECT i % 25, i, to_char(i, 'FM0000') FROM generate_series(0, 599) i WHERE i % 3 = 0; +CREATE INDEX iprt2_p1_b on prt2_p1(b); +CREATE INDEX iprt2_p2_b on prt2_p2(b); +CREATE INDEX iprt2_p3_b on prt2_p3(b); +ANALYZE prt2; +-- inner join +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1, prt2 t2 WHERE t1.a = t2.b AND t1.b = 0 ORDER BY t1.a, t2.b; + QUERY PLAN +----------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: a + -> Hash Join + Hash Cond: (t2.b = a) + -> Append + -> Seq Scan on prt2_p1 t2 + -> Seq Scan on prt2_p2 t2_1 + -> Seq Scan on prt2_p3 t2_2 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on prt1_p1 t1 + Filter: (b = 0) + -> Seq Scan on prt1_p2 t1_1 + Filter: (b = 0) + -> Seq Scan on prt1_p3 t1_2 + Filter: (b = 0) +(18 rows) + +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1, prt2 t2 WHERE t1.a = t2.b AND t1.b = 0 ORDER BY t1.a, t2.b; + a | c | b | c +-----+------+-----+------ + 0 | 0000 | 0 | 0000 + 150 | 0150 | 150 | 0150 + 300 | 0300 | 300 | 0300 + 450 | 0450 | 450 | 0450 +(4 rows) + +-- left outer join, with whole-row reference +EXPLAIN (COSTS OFF) +SELECT t1, t2 FROM prt1 t1 LEFT JOIN prt2 t2 ON t1.a = t2.b WHERE t1.b = 0 ORDER BY t1.a, t2.b; + QUERY PLAN +----------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: a, b + -> Hash Right Join + Hash Cond: (b = a) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Append + -> Seq Scan on prt2_p1 t2 + -> Seq Scan on prt2_p2 t2_1 + -> Seq Scan on prt2_p3 t2_2 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Append + -> Seq Scan on prt1_p1 t1 + Filter: (b = 0) + -> Seq Scan on prt1_p2 t1_1 + Filter: (b = 0) + -> Seq Scan on prt1_p3 t1_2 + Filter: (b = 0) +(21 rows) + +SELECT t1, t2 FROM prt1 t1 LEFT JOIN prt2 t2 ON t1.a = t2.b WHERE t1.b = 0 ORDER BY t1.a, t2.b; + t1 | t2 +--------------+-------------- + (0,0,0000) | (0,0,0000) + (50,0,0050) | + (100,0,0100) | + (150,0,0150) | (0,150,0150) + (200,0,0200) | + (250,0,0250) | + (300,0,0300) | (0,300,0300) + (350,0,0350) | + (400,0,0400) | + (450,0,0450) | (0,450,0450) + (500,0,0500) | + (550,0,0550) | +(12 rows) + +-- right outer join +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1 RIGHT JOIN prt2 t2 ON t1.a = t2.b WHERE t2.a = 0 ORDER BY t1.a, t2.b; + QUERY PLAN +----------------------------------------------------------------------- + Remote 
Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: a, b + -> Hash Right Join + Hash Cond: (a = b) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Append + -> Seq Scan on prt1_p1 t1 + -> Seq Scan on prt1_p2 t1_1 + -> Seq Scan on prt1_p3 t1_2 + -> Hash + -> Remote Subquery Scan on all (datanode_2) + Distribute results by H: b + -> Append + -> Seq Scan on prt2_p1 t2 + Filter: (a = 0) + -> Seq Scan on prt2_p2 t2_1 + Filter: (a = 0) + -> Seq Scan on prt2_p3 t2_2 + Filter: (a = 0) +(21 rows) + +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1 RIGHT JOIN prt2 t2 ON t1.a = t2.b WHERE t2.a = 0 ORDER BY t1.a, t2.b; + a | c | b | c +-----+------+-----+------ + 0 | 0000 | 0 | 0000 + 150 | 0150 | 150 | 0150 + 300 | 0300 | 300 | 0300 + 450 | 0450 | 450 | 0450 + | | 75 | 0075 + | | 225 | 0225 + | | 375 | 0375 + | | 525 | 0525 +(8 rows) + +-- full outer join, with placeholder vars +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT 50 phv, * FROM prt1 WHERE prt1.b = 0) t1 FULL JOIN (SELECT 75 phv, * FROM prt2 WHERE prt2.a = 0) t2 ON (t1.a = t2.b) WHERE t1.phv = t1.a OR t2.phv = t2.b ORDER BY t1.a, t2.b; + QUERY PLAN +----------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: a, b + -> Hash Full Join + Hash Cond: (a = b) + Filter: (((50) = a) OR ((75) = b)) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Append + -> Seq Scan on prt1_p1 + Filter: (b = 0) + -> Seq Scan on prt1_p2 + Filter: (b = 0) + -> Seq Scan on prt1_p3 + Filter: (b = 0) + -> Hash + -> Remote Subquery Scan on all (datanode_2) + Distribute results by H: b + -> Append + -> Seq Scan on prt2_p1 + Filter: (a = 0) + -> Seq Scan on prt2_p2 + Filter: (a = 0) + -> Seq Scan on prt2_p3 + Filter: (a = 0) +(25 rows) + +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT 50 phv, * FROM prt1 WHERE prt1.b = 0) t1 FULL JOIN (SELECT 75 phv, * FROM prt2 WHERE prt2.a = 0) t2 ON (t1.a = t2.b) WHERE t1.phv = t1.a OR t2.phv = t2.b ORDER BY t1.a, t2.b; + a | c | b | c +----+------+----+------ + 50 | 0050 | | + | | 75 | 0075 +(2 rows) + +-- Join with pruned partitions from joining relations +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1, prt2 t2 WHERE t1.a = t2.b AND t1.a < 450 AND t2.b > 250 AND t1.b = 0 ORDER BY t1.a, t2.b; + QUERY PLAN +----------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: a + -> Nested Loop + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on prt1_p1 t1 + Filter: ((a < 450) AND (b = 0)) + -> Seq Scan on prt1_p2 t1_1 + Filter: ((a < 450) AND (b = 0)) + -> Append + -> Index Scan using iprt2_p2_b on prt2_p2 t2 + Index Cond: ((b = a) AND (b > 250)) + -> Bitmap Heap Scan on prt2_p3 t2_1 + Recheck Cond: ((b = a) AND (b > 250)) + -> Bitmap Index Scan on iprt2_p3_b + Index Cond: ((b = a) AND (b > 250)) +(17 rows) + +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1, prt2 t2 WHERE t1.a = t2.b AND t1.a < 450 AND t2.b > 250 AND t1.b = 0 ORDER BY t1.a, t2.b; + a | c | b | c +-----+------+-----+------ + 300 | 0300 | 300 | 0300 +(1 row) + +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a < 450) t1 LEFT JOIN (SELECT * FROM prt2 WHERE b > 250) t2 ON t1.a = t2.b WHERE t1.b = 0 ORDER BY t1.a, t2.b; + QUERY PLAN +----------------------------------------------------------------------------- + Remote Subquery 
Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: a, b + -> Hash Right Join + Hash Cond: (b = a) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Append + -> Seq Scan on prt2_p2 + Filter: (b > 250) + -> Seq Scan on prt2_p3 + Filter: (b > 250) + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Append + -> Seq Scan on prt1_p1 + Filter: ((a < 450) AND (b = 0)) + -> Seq Scan on prt1_p2 + Filter: ((a < 450) AND (b = 0)) +(20 rows) + +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a < 450) t1 LEFT JOIN (SELECT * FROM prt2 WHERE b > 250) t2 ON t1.a = t2.b WHERE t1.b = 0 ORDER BY t1.a, t2.b; + a | c | b | c +-----+------+-----+------ + 0 | 0000 | | + 50 | 0050 | | + 100 | 0100 | | + 150 | 0150 | | + 200 | 0200 | | + 250 | 0250 | | + 300 | 0300 | 300 | 0300 + 350 | 0350 | | + 400 | 0400 | | +(9 rows) + +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a < 450) t1 FULL JOIN (SELECT * FROM prt2 WHERE b > 250) t2 ON t1.a = t2.b WHERE t1.b = 0 OR t2.a = 0 ORDER BY t1.a, t2.b; + QUERY PLAN +----------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: a, b + -> Hash Full Join + Hash Cond: (a = b) + Filter: ((b = 0) OR (a = 0)) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Append + -> Seq Scan on prt1_p1 + Filter: (a < 450) + -> Seq Scan on prt1_p2 + Filter: (a < 450) + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Append + -> Seq Scan on prt2_p2 + Filter: (b > 250) + -> Seq Scan on prt2_p3 + Filter: (b > 250) +(21 rows) + +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a < 450) t1 FULL JOIN (SELECT * FROM prt2 WHERE b > 250) t2 ON t1.a = t2.b WHERE t1.b = 0 OR t2.a = 0 ORDER BY t1.a, t2.b; + a | c | b | c +-----+------+-----+------ + 0 | 0000 | | + 50 | 0050 | | + 100 | 0100 | | + 150 | 0150 | | + 200 | 0200 | | + 250 | 0250 | | + 300 | 0300 | 300 | 0300 + 350 | 0350 | | + 400 | 0400 | | + | | 375 | 0375 + | | 450 | 0450 + | | 525 | 0525 +(12 rows) + +-- Semi-join +EXPLAIN (COSTS OFF) +SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t2.b FROM prt2 t2 WHERE t2.a = 0) AND t1.b = 0 ORDER BY t1.a; + QUERY PLAN +-------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: t1.a + -> Nested Loop + -> Remote Subquery Scan on all (datanode_2) + -> HashAggregate + Group Key: b + -> Remote Subquery Scan on all (datanode_2) + Distribute results by H: b + -> HashAggregate + Group Key: t2.b + -> Append + -> Seq Scan on prt2_p1 t2 + Filter: (a = 0) + -> Seq Scan on prt2_p2 t2_1 + Filter: (a = 0) + -> Seq Scan on prt2_p3 t2_2 + Filter: (a = 0) + -> Append + -> Index Scan using iprt1_p1_a on prt1_p1 t1 + Index Cond: (a = b) + Filter: (b = 0) + -> Index Scan using iprt1_p2_a on prt1_p2 t1_1 + Index Cond: (a = b) + Filter: (b = 0) + -> Index Scan using iprt1_p3_a on prt1_p3 t1_2 + Index Cond: (a = b) + Filter: (b = 0) +(28 rows) + +SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t2.b FROM prt2 t2 WHERE t2.a = 0) AND t1.b = 0 ORDER BY t1.a; + a | b | c +-----+---+------ + 0 | 0 | 0000 + 150 | 0 | 0150 + 300 | 0 | 0300 + 450 | 0 | 0450 +(4 rows) + +-- Anti-join with aggregates +EXPLAIN (COSTS OFF) +SELECT sum(t1.a), avg(t1.a), sum(t1.b), avg(t1.b) FROM prt1 t1 WHERE NOT EXISTS (SELECT 1 FROM 
prt2 t2 WHERE t1.a = t2.b); + QUERY PLAN +----------------------------------------------------------------------------------- + Finalize Aggregate + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Partial Aggregate + -> Hash Anti Join + Hash Cond: (t1.a = b) + -> Append + -> Seq Scan on prt1_p1 t1 + -> Seq Scan on prt1_p2 t1_1 + -> Seq Scan on prt1_p3 t1_2 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on prt2_p1 t2 + -> Seq Scan on prt2_p2 t2_1 + -> Seq Scan on prt2_p3 t2_2 +(15 rows) + +SELECT sum(t1.a), avg(t1.a), sum(t1.b), avg(t1.b) FROM prt1 t1 WHERE NOT EXISTS (SELECT 1 FROM prt2 t2 WHERE t1.a = t2.b); + sum | avg | sum | avg +-------+----------------------+------+--------------------- + 60000 | 300.0000000000000000 | 2400 | 12.0000000000000000 +(1 row) + +-- lateral reference +EXPLAIN (COSTS OFF) +SELECT * FROM prt1 t1 LEFT JOIN LATERAL + (SELECT t2.a AS t2a, t3.a AS t3a, least(t1.a,t2.a,t3.b) FROM prt1 t2 JOIN prt2 t3 ON (t2.a = t3.b)) ss + ON t1.a = ss.t2a WHERE t1.b = 0 ORDER BY t1.a; + QUERY PLAN +-------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Gather Merge + Workers Planned: 1 + -> Sort + Sort Key: a + -> Parallel Nested Loop Left Join + -> Parallel Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Parallel Append + -> Parallel Seq Scan on prt1_p1 t1 + Filter: (b = 0) + -> Parallel Seq Scan on prt1_p2 t1_1 + Filter: (b = 0) + -> Parallel Seq Scan on prt1_p3 t1_2 + Filter: (b = 0) + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Nested Loop + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Index Only Scan using iprt1_p1_a on prt1_p1 t2 + Index Cond: (a = a) + -> Index Only Scan using iprt1_p2_a on prt1_p2 t2_1 + Index Cond: (a = a) + -> Index Only Scan using iprt1_p3_a on prt1_p3 t2_2 + Index Cond: (a = a) + -> Append + -> Index Scan using iprt2_p1_b on prt2_p1 t3 + Index Cond: (b = a) + -> Index Scan using iprt2_p2_b on prt2_p2 t3_1 + Index Cond: (b = a) + -> Bitmap Heap Scan on prt2_p3 t3_2 + Recheck Cond: (b = a) + -> Bitmap Index Scan on iprt2_p3_b + Index Cond: (b = a) +(36 rows) + +SELECT * FROM prt1 t1 LEFT JOIN LATERAL + (SELECT t2.a AS t2a, t3.a AS t3a, least(t1.a,t2.a,t3.b) FROM prt1 t2 JOIN prt2 t3 ON (t2.a = t3.b)) ss + ON t1.a = ss.t2a WHERE t1.b = 0 ORDER BY t1.a; + a | b | c | t2a | t3a | least +-----+---+------+-----+-----+------- + 0 | 0 | 0000 | 0 | 0 | 0 + 50 | 0 | 0050 | | | + 100 | 0 | 0100 | | | + 150 | 0 | 0150 | 150 | 0 | 150 + 200 | 0 | 0200 | | | + 250 | 0 | 0250 | | | + 300 | 0 | 0300 | 300 | 0 | 300 + 350 | 0 | 0350 | | | + 400 | 0 | 0400 | | | + 450 | 0 | 0450 | 450 | 0 | 450 + 500 | 0 | 0500 | | | + 550 | 0 | 0550 | | | +(12 rows) + +EXPLAIN (COSTS OFF) +SELECT t1.a, ss.t2a, ss.t2c FROM prt1 t1 LEFT JOIN LATERAL + (SELECT t2.a AS t2a, t3.a AS t3a, t2.b t2b, t2.c t2c, least(t1.a,t2.a,t3.b) FROM prt1 t2 JOIN prt2 t3 ON (t2.a = t3.b)) ss + ON t1.c = ss.t2c WHERE (t1.b + coalesce(ss.t2b, 0)) = 0 ORDER BY t1.a; + QUERY PLAN +----------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: a + -> Hash Left Join + Hash Cond: ((c)::text = (c)::text) + Filter: ((b + COALESCE(b, 0)) = 0) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results 
by H: c + -> Append + -> Seq Scan on prt1_p1 t1 + -> Seq Scan on prt1_p2 t1_1 + -> Seq Scan on prt1_p3 t1_2 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: c + -> Hash Join + Hash Cond: (t2.a = b) + -> Append + -> Seq Scan on prt1_p1 t2 + -> Seq Scan on prt1_p2 t2_1 + -> Seq Scan on prt1_p3 t2_2 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on prt2_p1 t3 + -> Seq Scan on prt2_p2 t3_1 + -> Seq Scan on prt2_p3 t3_2 +(27 rows) + +SELECT t1.a, ss.t2a, ss.t2c FROM prt1 t1 LEFT JOIN LATERAL + (SELECT t2.a AS t2a, t3.a AS t3a, t2.b t2b, t2.c t2c, least(t1.a,t2.a,t3.a) FROM prt1 t2 JOIN prt2 t3 ON (t2.a = t3.b)) ss + ON t1.c = ss.t2c WHERE (t1.b + coalesce(ss.t2b, 0)) = 0 ORDER BY t1.a; + a | t2a | t2c +-----+-----+------ + 0 | 0 | 0000 + 50 | | + 100 | | + 150 | 150 | 0150 + 200 | | + 250 | | + 300 | 300 | 0300 + 350 | | + 400 | | + 450 | 450 | 0450 + 500 | | + 550 | | +(12 rows) + +-- +-- partitioned by expression +-- +CREATE TABLE prt1_e (a int, b int, c int) PARTITION BY RANGE(((a + b)/2)); +CREATE TABLE prt1_e_p1 PARTITION OF prt1_e FOR VALUES FROM (0) TO (250); +CREATE TABLE prt1_e_p2 PARTITION OF prt1_e FOR VALUES FROM (250) TO (500); +CREATE TABLE prt1_e_p3 PARTITION OF prt1_e FOR VALUES FROM (500) TO (600); +INSERT INTO prt1_e SELECT i, i, i % 25 FROM generate_series(0, 599, 2) i; +CREATE INDEX iprt1_e_p1_ab2 on prt1_e_p1(((a+b)/2)); +CREATE INDEX iprt1_e_p2_ab2 on prt1_e_p2(((a+b)/2)); +CREATE INDEX iprt1_e_p3_ab2 on prt1_e_p3(((a+b)/2)); +ANALYZE prt1_e; +CREATE TABLE prt2_e (a int, b int, c int) PARTITION BY RANGE(((b + a)/2)); +CREATE TABLE prt2_e_p1 PARTITION OF prt2_e FOR VALUES FROM (0) TO (250); +CREATE TABLE prt2_e_p2 PARTITION OF prt2_e FOR VALUES FROM (250) TO (500); +CREATE TABLE prt2_e_p3 PARTITION OF prt2_e FOR VALUES FROM (500) TO (600); +INSERT INTO prt2_e SELECT i, i, i % 25 FROM generate_series(0, 599, 3) i; +ANALYZE prt2_e; +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_e t1, prt2_e t2 WHERE (t1.a + t1.b)/2 = (t2.b + t2.a)/2 AND t1.c = 0 ORDER BY t1.a, t2.b; + QUERY PLAN +----------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: a, t2.b + -> Hash Join + Hash Cond: (((t2.b + t2.a) / 2) = ((a + b) / 2)) + -> Append + -> Seq Scan on prt2_e_p1 t2 + -> Seq Scan on prt2_e_p2 t2_1 + -> Seq Scan on prt2_e_p3 t2_2 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on prt1_e_p1 t1 + Filter: (c = 0) + -> Seq Scan on prt1_e_p2 t1_1 + Filter: (c = 0) + -> Seq Scan on prt1_e_p3 t1_2 + Filter: (c = 0) +(18 rows) + +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_e t1, prt2_e t2 WHERE (t1.a + t1.b)/2 = (t2.b + t2.a)/2 AND t1.c = 0 ORDER BY t1.a, t2.b; + a | c | b | c +-----+---+-----+--- + 0 | 0 | 0 | 0 + 150 | 0 | 150 | 0 + 300 | 0 | 300 | 0 + 450 | 0 | 450 | 0 +(4 rows) + +-- +-- N-way join +-- +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c, t3.a + t3.b, t3.c FROM prt1 t1, prt2 t2, prt1_e t3 WHERE t1.a = t2.b AND t1.a = (t3.a + t3.b)/2 AND t1.b = 0 ORDER BY t1.a, t2.b; + QUERY PLAN +----------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: a + -> Hash Join + Hash Cond: (b = a) + -> Hash Join + Hash Cond: (((t3.a + t3.b) / 2) = b) + -> Append + -> Seq Scan on prt1_e_p1 t3 + -> Seq Scan on prt1_e_p2 t3_1 + -> Seq Scan on prt1_e_p3 t3_2 + -> Hash + 
-> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on prt2_p1 t2 + -> Seq Scan on prt2_p2 t2_1 + -> Seq Scan on prt2_p3 t2_2 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on prt1_p1 t1 + Filter: (b = 0) + -> Seq Scan on prt1_p2 t1_1 + Filter: (b = 0) + -> Seq Scan on prt1_p3 t1_2 + Filter: (b = 0) +(26 rows) + +SELECT t1.a, t1.c, t2.b, t2.c, t3.a + t3.b, t3.c FROM prt1 t1, prt2 t2, prt1_e t3 WHERE t1.a = t2.b AND t1.a = (t3.a + t3.b)/2 AND t1.b = 0 ORDER BY t1.a, t2.b; + a | c | b | c | ?column? | c +-----+------+-----+------+----------+--- + 0 | 0000 | 0 | 0000 | 0 | 0 + 150 | 0150 | 150 | 0150 | 300 | 0 + 300 | 0300 | 300 | 0300 | 600 | 0 + 450 | 0450 | 450 | 0450 | 900 | 0 +(4 rows) + +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c, t3.a + t3.b, t3.c FROM (prt1 t1 LEFT JOIN prt2 t2 ON t1.a = t2.b) LEFT JOIN prt1_e t3 ON (t1.a = (t3.a + t3.b)/2) WHERE t1.b = 0 ORDER BY t1.a, t2.b, t3.a + t3.b; + QUERY PLAN +----------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: a, b, ((a + b)) + -> Hash Right Join + Hash Cond: ((((a + b) / 2)) = a) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: ((a + b) / 2) + -> Result + -> Append + -> Seq Scan on prt1_e_p1 t3 + -> Seq Scan on prt1_e_p2 t3_1 + -> Seq Scan on prt1_e_p3 t3_2 + -> Hash + -> Hash Right Join + Hash Cond: (b = a) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Append + -> Seq Scan on prt2_p1 t2 + -> Seq Scan on prt2_p2 t2_1 + -> Seq Scan on prt2_p3 t2_2 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Append + -> Seq Scan on prt1_p1 t1 + Filter: (b = 0) + -> Seq Scan on prt1_p2 t1_1 + Filter: (b = 0) + -> Seq Scan on prt1_p3 t1_2 + Filter: (b = 0) +(31 rows) + +SELECT t1.a, t1.c, t2.b, t2.c, t3.a + t3.b, t3.c FROM (prt1 t1 LEFT JOIN prt2 t2 ON t1.a = t2.b) LEFT JOIN prt1_e t3 ON (t1.a = (t3.a + t3.b)/2) WHERE t1.b = 0 ORDER BY t1.a, t2.b, t3.a + t3.b; + a | c | b | c | ?column? 
| c +-----+------+-----+------+----------+--- + 0 | 0000 | 0 | 0000 | 0 | 0 + 50 | 0050 | | | 100 | 0 + 100 | 0100 | | | 200 | 0 + 150 | 0150 | 150 | 0150 | 300 | 0 + 200 | 0200 | | | 400 | 0 + 250 | 0250 | | | 500 | 0 + 300 | 0300 | 300 | 0300 | 600 | 0 + 350 | 0350 | | | 700 | 0 + 400 | 0400 | | | 800 | 0 + 450 | 0450 | 450 | 0450 | 900 | 0 + 500 | 0500 | | | 1000 | 0 + 550 | 0550 | | | 1100 | 0 +(12 rows) + +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c, t3.a + t3.b, t3.c FROM (prt1 t1 LEFT JOIN prt2 t2 ON t1.a = t2.b) RIGHT JOIN prt1_e t3 ON (t1.a = (t3.a + t3.b)/2) WHERE t3.c = 0 ORDER BY t1.a, t2.b, t3.a + t3.b; + QUERY PLAN +----------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: a, b, ((a + b)) + -> Hash Right Join + Hash Cond: (a = (((a + b) / 2))) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Hash Right Join + Hash Cond: (b = t1.a) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on prt2_p1 t2 + -> Seq Scan on prt2_p2 t2_1 + -> Seq Scan on prt2_p3 t2_2 + -> Hash + -> Append + -> Seq Scan on prt1_p1 t1 + -> Seq Scan on prt1_p2 t1_1 + -> Seq Scan on prt1_p3 t1_2 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: ((a + b) / 2) + -> Result + -> Append + -> Seq Scan on prt1_e_p1 t3 + Filter: (c = 0) + -> Seq Scan on prt1_e_p2 t3_1 + Filter: (c = 0) + -> Seq Scan on prt1_e_p3 t3_2 + Filter: (c = 0) +(30 rows) + +SELECT t1.a, t1.c, t2.b, t2.c, t3.a + t3.b, t3.c FROM (prt1 t1 LEFT JOIN prt2 t2 ON t1.a = t2.b) RIGHT JOIN prt1_e t3 ON (t1.a = (t3.a + t3.b)/2) WHERE t3.c = 0 ORDER BY t1.a, t2.b, t3.a + t3.b; + a | c | b | c | ?column? 
| c +-----+------+-----+------+----------+--- + 0 | 0000 | 0 | 0000 | 0 | 0 + 50 | 0050 | | | 100 | 0 + 100 | 0100 | | | 200 | 0 + 150 | 0150 | 150 | 0150 | 300 | 0 + 200 | 0200 | | | 400 | 0 + 250 | 0250 | | | 500 | 0 + 300 | 0300 | 300 | 0300 | 600 | 0 + 350 | 0350 | | | 700 | 0 + 400 | 0400 | | | 800 | 0 + 450 | 0450 | 450 | 0450 | 900 | 0 + 500 | 0500 | | | 1000 | 0 + 550 | 0550 | | | 1100 | 0 +(12 rows) + +-- Cases with non-nullable expressions in subquery results; +-- make sure these go to null as expected +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.phv, t2.b, t2.phv, t3.a + t3.b, t3.phv FROM ((SELECT 50 phv, * FROM prt1 WHERE prt1.b = 0) t1 FULL JOIN (SELECT 75 phv, * FROM prt2 WHERE prt2.a = 0) t2 ON (t1.a = t2.b)) FULL JOIN (SELECT 50 phv, * FROM prt1_e WHERE prt1_e.c = 0) t3 ON (t1.a = (t3.a + t3.b)/2) WHERE t1.a = t1.phv OR t2.b = t2.phv OR (t3.a + t3.b)/2 = t3.phv ORDER BY t1.a, t2.b, t3.a + t3.b; + QUERY PLAN +----------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: a, b, ((a + b)) + -> Hash Full Join + Hash Cond: (a = (((a + b) / 2))) + Filter: ((a = (50)) OR (b = (75)) OR (((a + b) / 2) = (50))) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Hash Full Join + Hash Cond: (a = b) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Append + -> Seq Scan on prt1_p1 + Filter: (b = 0) + -> Seq Scan on prt1_p2 + Filter: (b = 0) + -> Seq Scan on prt1_p3 + Filter: (b = 0) + -> Hash + -> Remote Subquery Scan on all (datanode_2) + Distribute results by H: b + -> Append + -> Seq Scan on prt2_p1 + Filter: (a = 0) + -> Seq Scan on prt2_p2 + Filter: (a = 0) + -> Seq Scan on prt2_p3 + Filter: (a = 0) + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: ((a + b) / 2) + -> Result + -> Append + -> Seq Scan on prt1_e_p1 + Filter: (c = 0) + -> Seq Scan on prt1_e_p2 + Filter: (c = 0) + -> Seq Scan on prt1_e_p3 + Filter: (c = 0) +(40 rows) + +SELECT t1.a, t1.phv, t2.b, t2.phv, t3.a + t3.b, t3.phv FROM ((SELECT 50 phv, * FROM prt1 WHERE prt1.b = 0) t1 FULL JOIN (SELECT 75 phv, * FROM prt2 WHERE prt2.a = 0) t2 ON (t1.a = t2.b)) FULL JOIN (SELECT 50 phv, * FROM prt1_e WHERE prt1_e.c = 0) t3 ON (t1.a = (t3.a + t3.b)/2) WHERE t1.a = t1.phv OR t2.b = t2.phv OR (t3.a + t3.b)/2 = t3.phv ORDER BY t1.a, t2.b, t3.a + t3.b; + a | phv | b | phv | ?column? 
| phv +----+-----+----+-----+----------+----- + 50 | 50 | | | 100 | 50 + | | 75 | 75 | | +(2 rows) + +-- Semi-join +EXPLAIN (COSTS OFF) +SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t1.b FROM prt2 t1, prt1_e t2 WHERE t1.a = 0 AND t1.b = (t2.a + t2.b)/2) AND t1.b = 0 ORDER BY t1.a; + QUERY PLAN +--------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Merge Join + Merge Cond: (a = b) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Sort + Sort Key: t1.a + -> Append + -> Seq Scan on prt1_p1 t1 + Filter: (b = 0) + -> Seq Scan on prt1_p2 t1_1 + Filter: (b = 0) + -> Seq Scan on prt1_p3 t1_2 + Filter: (b = 0) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Sort + Sort Key: b + -> HashAggregate + Group Key: b + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> HashAggregate + Group Key: b + -> Nested Loop + -> Remote Subquery Scan on all (datanode_2) + -> Append + -> Seq Scan on prt2_p1 t1_3 + Filter: (a = 0) + -> Seq Scan on prt2_p2 t1_4 + Filter: (a = 0) + -> Seq Scan on prt2_p3 t1_5 + Filter: (a = 0) + -> Append + -> Index Scan using iprt1_e_p1_ab2 on prt1_e_p1 t2 + Index Cond: (((a + b) / 2) = b) + -> Index Scan using iprt1_e_p2_ab2 on prt1_e_p2 t2_1 + Index Cond: (((a + b) / 2) = b) + -> Index Scan using iprt1_e_p3_ab2 on prt1_e_p3 t2_2 + Index Cond: (((a + b) / 2) = b) +(40 rows) + +SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t1.b FROM prt2 t1, prt1_e t2 WHERE t1.a = 0 AND t1.b = (t2.a + t2.b)/2) AND t1.b = 0 ORDER BY t1.a; + a | b | c +-----+---+------ + 0 | 0 | 0000 + 150 | 0 | 0150 + 300 | 0 | 0300 + 450 | 0 | 0450 +(4 rows) + +EXPLAIN (COSTS OFF) +SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t1.b FROM prt2 t1 WHERE t1.b IN (SELECT (t1.a + t1.b)/2 FROM prt1_e t1 WHERE t1.c = 0)) AND t1.b = 0 ORDER BY t1.a; + QUERY PLAN +----------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Merge Semi Join + Merge Cond: (a = b) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Sort + Sort Key: t1.a + -> Append + -> Seq Scan on prt1_p1 t1 + Filter: (b = 0) + -> Seq Scan on prt1_p2 t1_1 + Filter: (b = 0) + -> Seq Scan on prt1_p3 t1_2 + Filter: (b = 0) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Sort + Sort Key: t1_3.b + -> Hash Semi Join + Hash Cond: (t1_3.b = ((a + b) / 2)) + -> Append + -> Seq Scan on prt2_p1 t1_3 + -> Seq Scan on prt2_p2 t1_4 + -> Seq Scan on prt2_p3 t1_5 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on prt1_e_p1 t1_6 + Filter: (c = 0) + -> Seq Scan on prt1_e_p2 t1_7 + Filter: (c = 0) + -> Seq Scan on prt1_e_p3 t1_8 + Filter: (c = 0) +(33 rows) + +SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t1.b FROM prt2 t1 WHERE t1.b IN (SELECT (t1.a + t1.b)/2 FROM prt1_e t1 WHERE t1.c = 0)) AND t1.b = 0 ORDER BY t1.a; + a | b | c +-----+---+------ + 0 | 0 | 0000 + 150 | 0 | 0150 + 300 | 0 | 0300 + 450 | 0 | 0450 +(4 rows) + +-- test merge joins +SET enable_hashjoin TO off; +SET enable_nestloop TO off; +EXPLAIN (COSTS OFF) +SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t1.b FROM prt2 t1 WHERE t1.b IN (SELECT (t1.a + t1.b)/2 FROM prt1_e t1 WHERE t1.c = 0)) AND t1.b = 0 ORDER BY t1.a; + QUERY PLAN 
+----------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Merge Semi Join + Merge Cond: (a = b) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Sort + Sort Key: t1.a + -> Append + -> Seq Scan on prt1_p1 t1 + Filter: (b = 0) + -> Seq Scan on prt1_p2 t1_1 + Filter: (b = 0) + -> Seq Scan on prt1_p3 t1_2 + Filter: (b = 0) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Merge Semi Join + Merge Cond: (t1_3.b = (((a + b) / 2))) + -> Sort + Sort Key: t1_3.b + -> Append + -> Seq Scan on prt2_p1 t1_3 + -> Seq Scan on prt2_p2 t1_4 + -> Seq Scan on prt2_p3 t1_5 + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: (((t1_6.a + t1_6.b) / 2)) + -> Result + -> Append + -> Seq Scan on prt1_e_p1 t1_6 + Filter: (c = 0) + -> Seq Scan on prt1_e_p2 t1_7 + Filter: (c = 0) + -> Seq Scan on prt1_e_p3 t1_8 + Filter: (c = 0) +(35 rows) + +SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t1.b FROM prt2 t1 WHERE t1.b IN (SELECT (t1.a + t1.b)/2 FROM prt1_e t1 WHERE t1.c = 0)) AND t1.b = 0 ORDER BY t1.a; + a | b | c +-----+---+------ + 0 | 0 | 0000 + 150 | 0 | 0150 + 300 | 0 | 0300 + 450 | 0 | 0450 +(4 rows) + +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c, t3.a + t3.b, t3.c FROM (prt1 t1 LEFT JOIN prt2 t2 ON t1.a = t2.b) RIGHT JOIN prt1_e t3 ON (t1.a = (t3.a + t3.b)/2) WHERE t3.c = 0 ORDER BY t1.a, t2.b, t3.a + t3.b; + QUERY PLAN +----------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: a, b, ((a + b)) + -> Merge Right Join + Merge Cond: (a = (((a + b) / 2))) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Merge Left Join + Merge Cond: (t1.a = b) + -> Sort + Sort Key: t1.a + -> Append + -> Seq Scan on prt1_p1 t1 + -> Seq Scan on prt1_p2 t1_1 + -> Seq Scan on prt1_p3 t1_2 + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: t2.b + -> Append + -> Seq Scan on prt2_p1 t2 + -> Seq Scan on prt2_p2 t2_1 + -> Seq Scan on prt2_p3 t2_2 + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: ((a + b) / 2) + -> Sort + Sort Key: (((t3.a + t3.b) / 2)) + -> Result + -> Append + -> Seq Scan on prt1_e_p1 t3 + Filter: (c = 0) + -> Seq Scan on prt1_e_p2 t3_1 + Filter: (c = 0) + -> Seq Scan on prt1_e_p3 t3_2 + Filter: (c = 0) +(36 rows) + +SELECT t1.a, t1.c, t2.b, t2.c, t3.a + t3.b, t3.c FROM (prt1 t1 LEFT JOIN prt2 t2 ON t1.a = t2.b) RIGHT JOIN prt1_e t3 ON (t1.a = (t3.a + t3.b)/2) WHERE t3.c = 0 ORDER BY t1.a, t2.b, t3.a + t3.b; + a | c | b | c | ?column? 
| c +-----+------+-----+------+----------+--- + 0 | 0000 | 0 | 0000 | 0 | 0 + 50 | 0050 | | | 100 | 0 + 100 | 0100 | | | 200 | 0 + 150 | 0150 | 150 | 0150 | 300 | 0 + 200 | 0200 | | | 400 | 0 + 250 | 0250 | | | 500 | 0 + 300 | 0300 | 300 | 0300 | 600 | 0 + 350 | 0350 | | | 700 | 0 + 400 | 0400 | | | 800 | 0 + 450 | 0450 | 450 | 0450 | 900 | 0 + 500 | 0500 | | | 1000 | 0 + 550 | 0550 | | | 1100 | 0 +(12 rows) + +-- MergeAppend on nullable column +EXPLAIN (COSTS OFF) +SELECT t1.a, t2.b FROM (SELECT * FROM prt1 WHERE a < 450) t1 LEFT JOIN (SELECT * FROM prt2 WHERE b > 250) t2 ON t1.a = t2.b WHERE t1.b = 0 ORDER BY t1.a, t2.b; + QUERY PLAN +----------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: a, b + -> Merge Right Join + Merge Cond: (b = a) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Sort + Sort Key: prt2_p2.b + -> Append + -> Seq Scan on prt2_p2 + Filter: (b > 250) + -> Seq Scan on prt2_p3 + Filter: (b > 250) + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Sort + Sort Key: prt1_p1.a + -> Append + -> Seq Scan on prt1_p1 + Filter: ((a < 450) AND (b = 0)) + -> Seq Scan on prt1_p2 + Filter: ((a < 450) AND (b = 0)) +(24 rows) + +SELECT t1.a, t2.b FROM (SELECT * FROM prt1 WHERE a < 450) t1 LEFT JOIN (SELECT * FROM prt2 WHERE b > 250) t2 ON t1.a = t2.b WHERE t1.b = 0 ORDER BY t1.a, t2.b; + a | b +-----+----- + 0 | + 50 | + 100 | + 150 | + 200 | + 250 | + 300 | 300 + 350 | + 400 | +(9 rows) + +RESET enable_hashjoin; +RESET enable_nestloop; +-- +-- partitioned by multiple columns +-- +CREATE TABLE prt1_m (a int, b int, c int) PARTITION BY RANGE(a, ((a + b)/2)); +CREATE TABLE prt1_m_p1 PARTITION OF prt1_m FOR VALUES FROM (0, 0) TO (250, 250); +CREATE TABLE prt1_m_p2 PARTITION OF prt1_m FOR VALUES FROM (250, 250) TO (500, 500); +CREATE TABLE prt1_m_p3 PARTITION OF prt1_m FOR VALUES FROM (500, 500) TO (600, 600); +INSERT INTO prt1_m SELECT i, i, i % 25 FROM generate_series(0, 599, 2) i; +ANALYZE prt1_m; +CREATE TABLE prt2_m (a int, b int, c int) PARTITION BY RANGE(((b + a)/2), b); +CREATE TABLE prt2_m_p1 PARTITION OF prt2_m FOR VALUES FROM (0, 0) TO (250, 250); +CREATE TABLE prt2_m_p2 PARTITION OF prt2_m FOR VALUES FROM (250, 250) TO (500, 500); +CREATE TABLE prt2_m_p3 PARTITION OF prt2_m FOR VALUES FROM (500, 500) TO (600, 600); +INSERT INTO prt2_m SELECT i, i, i % 25 FROM generate_series(0, 599, 3) i; +ANALYZE prt2_m; +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1_m WHERE prt1_m.c = 0) t1 FULL JOIN (SELECT * FROM prt2_m WHERE prt2_m.c = 0) t2 ON (t1.a = (t2.b + t2.a)/2 AND t2.b = (t1.a + t1.b)/2) ORDER BY t1.a, t2.b; + QUERY PLAN +----------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: a, b + -> Hash Full Join + Hash Cond: ((a = (((b + a) / 2))) AND (((a + b) / 2) = b)) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Append + -> Seq Scan on prt1_m_p1 + Filter: (c = 0) + -> Seq Scan on prt1_m_p2 + Filter: (c = 0) + -> Seq Scan on prt1_m_p3 + Filter: (c = 0) + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: ((b + a) / 2) + -> Result + -> Append + -> Seq Scan on prt2_m_p1 + Filter: (c = 0) + -> Seq Scan on prt2_m_p2 + Filter: (c = 0) + -> Seq Scan on prt2_m_p3 + Filter: (c = 0) +(25 rows) + 
+SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1_m WHERE prt1_m.c = 0) t1 FULL JOIN (SELECT * FROM prt2_m WHERE prt2_m.c = 0) t2 ON (t1.a = (t2.b + t2.a)/2 AND t2.b = (t1.a + t1.b)/2) ORDER BY t1.a, t2.b; + a | c | b | c +-----+---+-----+--- + 0 | 0 | 0 | 0 + 50 | 0 | | + 100 | 0 | | + 150 | 0 | 150 | 0 + 200 | 0 | | + 250 | 0 | | + 300 | 0 | 300 | 0 + 350 | 0 | | + 400 | 0 | | + 450 | 0 | 450 | 0 + 500 | 0 | | + 550 | 0 | | + | | 75 | 0 + | | 225 | 0 + | | 375 | 0 + | | 525 | 0 +(16 rows) + +-- +-- tests for list partitioned tables. +-- +CREATE TABLE plt1 (a int, b int, c text) PARTITION BY LIST(c); +CREATE TABLE plt1_p1 PARTITION OF plt1 FOR VALUES IN ('0000', '0003', '0004', '0010'); +CREATE TABLE plt1_p2 PARTITION OF plt1 FOR VALUES IN ('0001', '0005', '0002', '0009'); +CREATE TABLE plt1_p3 PARTITION OF plt1 FOR VALUES IN ('0006', '0007', '0008', '0011'); +INSERT INTO plt1 SELECT i, i, to_char(i/50, 'FM0000') FROM generate_series(0, 599, 2) i; +ANALYZE plt1; +CREATE TABLE plt2 (a int, b int, c text) PARTITION BY LIST(c); +CREATE TABLE plt2_p1 PARTITION OF plt2 FOR VALUES IN ('0000', '0003', '0004', '0010'); +CREATE TABLE plt2_p2 PARTITION OF plt2 FOR VALUES IN ('0001', '0005', '0002', '0009'); +CREATE TABLE plt2_p3 PARTITION OF plt2 FOR VALUES IN ('0006', '0007', '0008', '0011'); +INSERT INTO plt2 SELECT i, i, to_char(i/50, 'FM0000') FROM generate_series(0, 599, 3) i; +ANALYZE plt2; +-- +-- list partitioned by expression +-- +CREATE TABLE plt1_e (a int, b int, c text) PARTITION BY LIST(ltrim(c, 'A')); +CREATE TABLE plt1_e_p1 PARTITION OF plt1_e FOR VALUES IN ('0000', '0003', '0004', '0010'); +CREATE TABLE plt1_e_p2 PARTITION OF plt1_e FOR VALUES IN ('0001', '0005', '0002', '0009'); +CREATE TABLE plt1_e_p3 PARTITION OF plt1_e FOR VALUES IN ('0006', '0007', '0008', '0011'); +INSERT INTO plt1_e SELECT i, i, 'A' || to_char(i/50, 'FM0000') FROM generate_series(0, 599, 2) i; +ANALYZE plt1_e; +-- test partition matching with N-way join +EXPLAIN (COSTS OFF) +SELECT avg(t1.a), avg(t2.b), avg(t3.a + t3.b), t1.c, t2.c, t3.c FROM plt1 t1, plt2 t2, plt1_e t3 WHERE t1.c = t2.c AND ltrim(t3.c, 'A') = t1.c GROUP BY t1.c, t2.c, t3.c ORDER BY t1.c, t2.c, t3.c; + QUERY PLAN +----------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Finalize GroupAggregate + Group Key: c, c, c + -> Sort + Sort Key: c, c + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: c + -> Partial HashAggregate + Group Key: c, c, t3.c + -> Hash Join + Hash Cond: (c = c) + -> Hash Join + Hash Cond: (c = ltrim(t3.c, 'A'::text)) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on plt2_p1 t2 + -> Seq Scan on plt2_p2 t2_1 + -> Seq Scan on plt2_p3 t2_2 + -> Hash + -> Append + -> Seq Scan on plt1_e_p1 t3 + -> Seq Scan on plt1_e_p2 t3_1 + -> Seq Scan on plt1_e_p3 t3_2 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on plt1_p1 t1 + -> Seq Scan on plt1_p2 t1_1 + -> Seq Scan on plt1_p3 t1_2 +(29 rows) + +SELECT avg(t1.a), avg(t2.b), avg(t3.a + t3.b), t1.c, t2.c, t3.c FROM plt1 t1, plt2 t2, plt1_e t3 WHERE t1.c = t2.c AND ltrim(t3.c, 'A') = t1.c GROUP BY t1.c, t2.c, t3.c ORDER BY t1.c, t2.c, t3.c; + avg | avg | avg | c | c | c +----------------------+----------------------+-----------------------+------+------+------- + 24.0000000000000000 | 24.0000000000000000 | 48.0000000000000000 | 0000 | 0000 | A0000 + 74.0000000000000000 | 
75.0000000000000000 | 148.0000000000000000 | 0001 | 0001 | A0001 + 124.0000000000000000 | 124.5000000000000000 | 248.0000000000000000 | 0002 | 0002 | A0002 + 174.0000000000000000 | 174.0000000000000000 | 348.0000000000000000 | 0003 | 0003 | A0003 + 224.0000000000000000 | 225.0000000000000000 | 448.0000000000000000 | 0004 | 0004 | A0004 + 274.0000000000000000 | 274.5000000000000000 | 548.0000000000000000 | 0005 | 0005 | A0005 + 324.0000000000000000 | 324.0000000000000000 | 648.0000000000000000 | 0006 | 0006 | A0006 + 374.0000000000000000 | 375.0000000000000000 | 748.0000000000000000 | 0007 | 0007 | A0007 + 424.0000000000000000 | 424.5000000000000000 | 848.0000000000000000 | 0008 | 0008 | A0008 + 474.0000000000000000 | 474.0000000000000000 | 948.0000000000000000 | 0009 | 0009 | A0009 + 524.0000000000000000 | 525.0000000000000000 | 1048.0000000000000000 | 0010 | 0010 | A0010 + 574.0000000000000000 | 574.5000000000000000 | 1148.0000000000000000 | 0011 | 0011 | A0011 +(12 rows) + +-- joins where one of the relations is proven empty +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1, prt2 t2 WHERE t1.a = t2.b AND t1.a = 1 AND t1.a = 2; + QUERY PLAN +-------------------------- + Result + One-Time Filter: false +(2 rows) + +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a = 1 AND a = 2) t1 LEFT JOIN prt2 t2 ON t1.a = t2.b; + QUERY PLAN +-------------------------- + Result + One-Time Filter: false +(2 rows) + +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a = 1 AND a = 2) t1 RIGHT JOIN prt2 t2 ON t1.a = t2.b, prt1 t3 WHERE t2.b = t3.a; + QUERY PLAN +----------------------------------------------------------------------- + Hash Join + Hash Cond: (a = b) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on prt1_p1 t3 + -> Seq Scan on prt1_p2 t3_1 + -> Seq Scan on prt1_p3 t3_2 + -> Hash + -> Hash Left Join + Hash Cond: (b = a) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on prt2_p1 t2 + -> Seq Scan on prt2_p2 t2_1 + -> Seq Scan on prt2_p3 t2_2 + -> Hash + -> Result + One-Time Filter: false +(18 rows) + +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a = 1 AND a = 2) t1 FULL JOIN prt2 t2 ON t1.a = t2.b WHERE t2.a = 0 ORDER BY t1.a, t2.b; + QUERY PLAN +------------------------------------------------------ + Sort + Sort Key: a, b + -> Hash Left Join + Hash Cond: (b = a) + -> Remote Subquery Scan on all (datanode_2) + -> Append + -> Seq Scan on prt2_p1 t2 + Filter: (a = 0) + -> Seq Scan on prt2_p2 t2_1 + Filter: (a = 0) + -> Seq Scan on prt2_p3 t2_2 + Filter: (a = 0) + -> Hash + -> Result + One-Time Filter: false +(15 rows) + +-- +-- tests for hash partitioned tables. 
+-- +CREATE TABLE pht1 (a int, b int, c text) PARTITION BY HASH(c); +CREATE TABLE pht1_p1 PARTITION OF pht1 FOR VALUES WITH (MODULUS 3, REMAINDER 0); +CREATE TABLE pht1_p2 PARTITION OF pht1 FOR VALUES WITH (MODULUS 3, REMAINDER 1); +CREATE TABLE pht1_p3 PARTITION OF pht1 FOR VALUES WITH (MODULUS 3, REMAINDER 2); +INSERT INTO pht1 SELECT i, i, to_char(i/50, 'FM0000') FROM generate_series(0, 599, 2) i; +ANALYZE pht1; +CREATE TABLE pht2 (a int, b int, c text) PARTITION BY HASH(c); +CREATE TABLE pht2_p1 PARTITION OF pht2 FOR VALUES WITH (MODULUS 3, REMAINDER 0); +CREATE TABLE pht2_p2 PARTITION OF pht2 FOR VALUES WITH (MODULUS 3, REMAINDER 1); +CREATE TABLE pht2_p3 PARTITION OF pht2 FOR VALUES WITH (MODULUS 3, REMAINDER 2); +INSERT INTO pht2 SELECT i, i, to_char(i/50, 'FM0000') FROM generate_series(0, 599, 3) i; +ANALYZE pht2; +-- +-- hash partitioned by expression +-- +CREATE TABLE pht1_e (a int, b int, c text) PARTITION BY HASH(ltrim(c, 'A')); +CREATE TABLE pht1_e_p1 PARTITION OF pht1_e FOR VALUES WITH (MODULUS 3, REMAINDER 0); +CREATE TABLE pht1_e_p2 PARTITION OF pht1_e FOR VALUES WITH (MODULUS 3, REMAINDER 1); +CREATE TABLE pht1_e_p3 PARTITION OF pht1_e FOR VALUES WITH (MODULUS 3, REMAINDER 2); +INSERT INTO pht1_e SELECT i, i, 'A' || to_char(i/50, 'FM0000') FROM generate_series(0, 599, 2) i; +ANALYZE pht1_e; +-- test partition matching with N-way join +EXPLAIN (COSTS OFF) +SELECT avg(t1.a), avg(t2.b), avg(t3.a + t3.b), t1.c, t2.c, t3.c FROM pht1 t1, pht2 t2, pht1_e t3 WHERE t1.c = t2.c AND ltrim(t3.c, 'A') = t1.c GROUP BY t1.c, t2.c, t3.c ORDER BY t1.c, t2.c, t3.c; + QUERY PLAN +----------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Finalize GroupAggregate + Group Key: c, c, c + -> Sort + Sort Key: c, c + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: c + -> Partial HashAggregate + Group Key: c, c, t3.c + -> Hash Join + Hash Cond: (c = c) + -> Hash Join + Hash Cond: (c = ltrim(t3.c, 'A'::text)) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on pht2_p1 t2 + -> Seq Scan on pht2_p2 t2_1 + -> Seq Scan on pht2_p3 t2_2 + -> Hash + -> Append + -> Seq Scan on pht1_e_p1 t3 + -> Seq Scan on pht1_e_p2 t3_1 + -> Seq Scan on pht1_e_p3 t3_2 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on pht1_p1 t1 + -> Seq Scan on pht1_p2 t1_1 + -> Seq Scan on pht1_p3 t1_2 +(29 rows) + +SELECT avg(t1.a), avg(t2.b), avg(t3.a + t3.b), t1.c, t2.c, t3.c FROM pht1 t1, pht2 t2, pht1_e t3 WHERE t1.c = t2.c AND ltrim(t3.c, 'A') = t1.c GROUP BY t1.c, t2.c, t3.c ORDER BY t1.c, t2.c, t3.c; + avg | avg | avg | c | c | c +----------------------+----------------------+-----------------------+------+------+------- + 24.0000000000000000 | 24.0000000000000000 | 48.0000000000000000 | 0000 | 0000 | A0000 + 74.0000000000000000 | 75.0000000000000000 | 148.0000000000000000 | 0001 | 0001 | A0001 + 124.0000000000000000 | 124.5000000000000000 | 248.0000000000000000 | 0002 | 0002 | A0002 + 174.0000000000000000 | 174.0000000000000000 | 348.0000000000000000 | 0003 | 0003 | A0003 + 224.0000000000000000 | 225.0000000000000000 | 448.0000000000000000 | 0004 | 0004 | A0004 + 274.0000000000000000 | 274.5000000000000000 | 548.0000000000000000 | 0005 | 0005 | A0005 + 324.0000000000000000 | 324.0000000000000000 | 648.0000000000000000 | 0006 | 0006 | A0006 + 374.0000000000000000 | 375.0000000000000000 | 748.0000000000000000 | 0007 | 0007 | 
A0007 + 424.0000000000000000 | 424.5000000000000000 | 848.0000000000000000 | 0008 | 0008 | A0008 + 474.0000000000000000 | 474.0000000000000000 | 948.0000000000000000 | 0009 | 0009 | A0009 + 524.0000000000000000 | 525.0000000000000000 | 1048.0000000000000000 | 0010 | 0010 | A0010 + 574.0000000000000000 | 574.5000000000000000 | 1148.0000000000000000 | 0011 | 0011 | A0011 +(12 rows) + +-- +-- multiple levels of partitioning +-- +CREATE TABLE prt1_l (a int, b int, c varchar) PARTITION BY RANGE(a); +CREATE TABLE prt1_l_p1 PARTITION OF prt1_l FOR VALUES FROM (0) TO (250); +CREATE TABLE prt1_l_p2 PARTITION OF prt1_l FOR VALUES FROM (250) TO (500) PARTITION BY LIST (c); +CREATE TABLE prt1_l_p2_p1 PARTITION OF prt1_l_p2 FOR VALUES IN ('0000', '0001'); +CREATE TABLE prt1_l_p2_p2 PARTITION OF prt1_l_p2 FOR VALUES IN ('0002', '0003'); +CREATE TABLE prt1_l_p3 PARTITION OF prt1_l FOR VALUES FROM (500) TO (600) PARTITION BY RANGE (b); +CREATE TABLE prt1_l_p3_p1 PARTITION OF prt1_l_p3 FOR VALUES FROM (0) TO (13); +CREATE TABLE prt1_l_p3_p2 PARTITION OF prt1_l_p3 FOR VALUES FROM (13) TO (25); +INSERT INTO prt1_l SELECT i, i % 25, to_char(i % 4, 'FM0000') FROM generate_series(0, 599, 2) i; +ANALYZE prt1_l; +CREATE TABLE prt2_l (a int, b int, c varchar) PARTITION BY RANGE(b); +CREATE TABLE prt2_l_p1 PARTITION OF prt2_l FOR VALUES FROM (0) TO (250); +CREATE TABLE prt2_l_p2 PARTITION OF prt2_l FOR VALUES FROM (250) TO (500) PARTITION BY LIST (c); +CREATE TABLE prt2_l_p2_p1 PARTITION OF prt2_l_p2 FOR VALUES IN ('0000', '0001'); +CREATE TABLE prt2_l_p2_p2 PARTITION OF prt2_l_p2 FOR VALUES IN ('0002', '0003'); +CREATE TABLE prt2_l_p3 PARTITION OF prt2_l FOR VALUES FROM (500) TO (600) PARTITION BY RANGE (a); +CREATE TABLE prt2_l_p3_p1 PARTITION OF prt2_l_p3 FOR VALUES FROM (0) TO (13); +CREATE TABLE prt2_l_p3_p2 PARTITION OF prt2_l_p3 FOR VALUES FROM (13) TO (25); +INSERT INTO prt2_l SELECT i % 25, i, to_char(i % 4, 'FM0000') FROM generate_series(0, 599, 3) i; +ANALYZE prt2_l; +-- inner join, qual covering only top-level partitions +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_l t1, prt2_l t2 WHERE t1.a = t2.b AND t1.b = 0 ORDER BY t1.a, t2.b; + QUERY PLAN +----------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: a + -> Hash Join + Hash Cond: (t2.b = a) + -> Append + -> Seq Scan on prt2_l_p1 t2 + -> Seq Scan on prt2_l_p2_p1 t2_1 + -> Seq Scan on prt2_l_p2_p2 t2_2 + -> Seq Scan on prt2_l_p3_p1 t2_3 + -> Seq Scan on prt2_l_p3_p2 t2_4 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on prt1_l_p1 t1 + Filter: (b = 0) + -> Seq Scan on prt1_l_p2_p1 t1_1 + Filter: (b = 0) + -> Seq Scan on prt1_l_p2_p2 t1_2 + Filter: (b = 0) + -> Seq Scan on prt1_l_p3_p1 t1_3 + Filter: (b = 0) +(22 rows) + +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_l t1, prt2_l t2 WHERE t1.a = t2.b AND t1.b = 0 ORDER BY t1.a, t2.b; + a | c | b | c +-----+------+-----+------ + 0 | 0000 | 0 | 0000 + 150 | 0002 | 150 | 0002 + 300 | 0000 | 300 | 0000 + 450 | 0002 | 450 | 0002 +(4 rows) + +-- left join +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_l t1 LEFT JOIN prt2_l t2 ON t1.a = t2.b AND t1.c = t2.c WHERE t1.b = 0 ORDER BY t1.a, t2.b; + QUERY PLAN +----------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: a, b + -> Hash Right Join + Hash Cond: ((b = a) AND ((c)::text = (c)::text)) + -> Remote 
Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Append + -> Seq Scan on prt2_l_p1 t2 + -> Seq Scan on prt2_l_p2_p1 t2_1 + -> Seq Scan on prt2_l_p2_p2 t2_2 + -> Seq Scan on prt2_l_p3_p1 t2_3 + -> Seq Scan on prt2_l_p3_p2 t2_4 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Append + -> Seq Scan on prt1_l_p1 t1 + Filter: (b = 0) + -> Seq Scan on prt1_l_p2_p1 t1_1 + Filter: (b = 0) + -> Seq Scan on prt1_l_p2_p2 t1_2 + Filter: (b = 0) + -> Seq Scan on prt1_l_p3_p1 t1_3 + Filter: (b = 0) +(25 rows) + +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_l t1 LEFT JOIN prt2_l t2 ON t1.a = t2.b AND t1.c = t2.c WHERE t1.b = 0 ORDER BY t1.a, t2.b; + a | c | b | c +-----+------+-----+------ + 0 | 0000 | 0 | 0000 + 50 | 0002 | | + 100 | 0000 | | + 150 | 0002 | 150 | 0002 + 200 | 0000 | | + 250 | 0002 | | + 300 | 0000 | 300 | 0000 + 350 | 0002 | | + 400 | 0000 | | + 450 | 0002 | 450 | 0002 + 500 | 0000 | | + 550 | 0002 | | +(12 rows) + +-- right join +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_l t1 RIGHT JOIN prt2_l t2 ON t1.a = t2.b AND t1.c = t2.c WHERE t2.a = 0 ORDER BY t1.a, t2.b; + QUERY PLAN +----------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: a, b + -> Hash Right Join + Hash Cond: ((a = b) AND ((c)::text = (c)::text)) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Append + -> Seq Scan on prt1_l_p1 t1 + -> Seq Scan on prt1_l_p2_p1 t1_1 + -> Seq Scan on prt1_l_p2_p2 t1_2 + -> Seq Scan on prt1_l_p3_p1 t1_3 + -> Seq Scan on prt1_l_p3_p2 t1_4 + -> Hash + -> Remote Subquery Scan on all (datanode_2) + Distribute results by H: b + -> Append + -> Seq Scan on prt2_l_p1 t2 + Filter: (a = 0) + -> Seq Scan on prt2_l_p2_p1 t2_1 + Filter: (a = 0) + -> Seq Scan on prt2_l_p2_p2 t2_2 + Filter: (a = 0) + -> Seq Scan on prt2_l_p3_p1 t2_3 + Filter: (a = 0) +(25 rows) + +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_l t1 RIGHT JOIN prt2_l t2 ON t1.a = t2.b AND t1.c = t2.c WHERE t2.a = 0 ORDER BY t1.a, t2.b; + a | c | b | c +-----+------+-----+------ + 0 | 0000 | 0 | 0000 + 150 | 0002 | 150 | 0002 + 300 | 0000 | 300 | 0000 + 450 | 0002 | 450 | 0002 + | | 75 | 0003 + | | 225 | 0001 + | | 375 | 0003 + | | 525 | 0001 +(8 rows) + +-- full join +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1_l WHERE prt1_l.b = 0) t1 FULL JOIN (SELECT * FROM prt2_l WHERE prt2_l.a = 0) t2 ON (t1.a = t2.b AND t1.c = t2.c) ORDER BY t1.a, t2.b; + QUERY PLAN +----------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: a, b + -> Hash Full Join + Hash Cond: ((a = b) AND ((c)::text = (c)::text)) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Append + -> Seq Scan on prt1_l_p1 + Filter: (b = 0) + -> Seq Scan on prt1_l_p2_p1 + Filter: (b = 0) + -> Seq Scan on prt1_l_p2_p2 + Filter: (b = 0) + -> Seq Scan on prt1_l_p3_p1 + Filter: (b = 0) + -> Hash + -> Remote Subquery Scan on all (datanode_2) + Distribute results by H: b + -> Append + -> Seq Scan on prt2_l_p1 + Filter: (a = 0) + -> Seq Scan on prt2_l_p2_p1 + Filter: (a = 0) + -> Seq Scan on prt2_l_p2_p2 + Filter: (a = 0) + -> Seq Scan on prt2_l_p3_p1 + Filter: (a = 0) +(28 rows) + +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1_l WHERE prt1_l.b = 0) t1 FULL JOIN (SELECT * FROM prt2_l WHERE prt2_l.a = 0) t2 ON (t1.a = 
t2.b AND t1.c = t2.c) ORDER BY t1.a, t2.b; + a | c | b | c +-----+------+-----+------ + 0 | 0000 | 0 | 0000 + 50 | 0002 | | + 100 | 0000 | | + 150 | 0002 | 150 | 0002 + 200 | 0000 | | + 250 | 0002 | | + 300 | 0000 | 300 | 0000 + 350 | 0002 | | + 400 | 0000 | | + 450 | 0002 | 450 | 0002 + 500 | 0000 | | + 550 | 0002 | | + | | 75 | 0003 + | | 225 | 0001 + | | 375 | 0003 + | | 525 | 0001 +(16 rows) + +-- lateral partition-wise join +EXPLAIN (COSTS OFF) +SELECT * FROM prt1_l t1 LEFT JOIN LATERAL + (SELECT t2.a AS t2a, t2.c AS t2c, t2.b AS t2b, t3.b AS t3b, least(t1.a,t2.a,t3.b) FROM prt1_l t2 JOIN prt2_l t3 ON (t2.a = t3.b AND t2.c = t3.c)) ss + ON t1.a = ss.t2a AND t1.c = ss.t2c WHERE t1.b = 0 ORDER BY t1.a; + QUERY PLAN +------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Gather Merge + Workers Planned: 1 + -> Sort + Sort Key: a + -> Parallel Nested Loop Left Join + -> Parallel Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Parallel Append + -> Parallel Seq Scan on prt1_l_p1 t1 + Filter: (b = 0) + -> Parallel Seq Scan on prt1_l_p2_p1 t1_1 + Filter: (b = 0) + -> Parallel Seq Scan on prt1_l_p2_p2 t1_2 + Filter: (b = 0) + -> Parallel Seq Scan on prt1_l_p3_p1 t1_3 + Filter: (b = 0) + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Hash Join + Hash Cond: ((b = t2.a) AND ((c)::text = (t2.c)::text)) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on prt2_l_p1 t3 + -> Seq Scan on prt2_l_p2_p1 t3_1 + -> Seq Scan on prt2_l_p2_p2 t3_2 + -> Seq Scan on prt2_l_p3_p1 t3_3 + -> Seq Scan on prt2_l_p3_p2 t3_4 + -> Hash + -> Append + -> Seq Scan on prt1_l_p1 t2 + Filter: ((a = a) AND ((c)::text = (c)::text)) + -> Seq Scan on prt1_l_p2_p1 t2_1 + Filter: ((a = a) AND ((c)::text = (c)::text)) + -> Seq Scan on prt1_l_p2_p2 t2_2 + Filter: ((a = a) AND ((c)::text = (c)::text)) + -> Seq Scan on prt1_l_p3_p1 t2_3 + Filter: ((a = a) AND ((c)::text = (c)::text)) + -> Seq Scan on prt1_l_p3_p2 t2_4 + Filter: ((a = a) AND ((c)::text = (c)::text)) +(41 rows) + +SELECT * FROM prt1_l t1 LEFT JOIN LATERAL + (SELECT t2.a AS t2a, t2.c AS t2c, t2.b AS t2b, t3.b AS t3b, least(t1.a,t2.a,t3.b) FROM prt1_l t2 JOIN prt2_l t3 ON (t2.a = t3.b AND t2.c = t3.c)) ss + ON t1.a = ss.t2a AND t1.c = ss.t2c WHERE t1.b = 0 ORDER BY t1.a; + a | b | c | t2a | t2c | t2b | t3b | least +-----+---+------+-----+------+-----+-----+------- + 0 | 0 | 0000 | 0 | 0000 | 0 | 0 | 0 + 50 | 0 | 0002 | | | | | + 100 | 0 | 0000 | | | | | + 150 | 0 | 0002 | 150 | 0002 | 0 | 150 | 150 + 200 | 0 | 0000 | | | | | + 250 | 0 | 0002 | | | | | + 300 | 0 | 0000 | 300 | 0000 | 0 | 300 | 300 + 350 | 0 | 0002 | | | | | + 400 | 0 | 0000 | | | | | + 450 | 0 | 0002 | 450 | 0002 | 0 | 450 | 450 + 500 | 0 | 0000 | | | | | + 550 | 0 | 0002 | | | | | +(12 rows) + +-- join with one side empty +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1_l WHERE a = 1 AND a = 2) t1 RIGHT JOIN prt2_l t2 ON t1.a = t2.b AND t1.b = t2.a AND t1.c = t2.c; + QUERY PLAN +---------------------------------------------------------------- + Hash Left Join + Hash Cond: ((b = a) AND (a = b) AND ((c)::text = (c)::text)) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on prt2_l_p1 t2 + -> Seq Scan on prt2_l_p2_p1 t2_1 + -> Seq Scan on prt2_l_p2_p2 t2_2 + -> Seq Scan on prt2_l_p3_p1 t2_3 + -> Seq Scan on 
prt2_l_p3_p2 t2_4 + -> Hash + -> Result + One-Time Filter: false +(12 rows) + +-- +-- negative testcases +-- +CREATE TABLE prt1_n (a int, b int, c varchar) PARTITION BY RANGE(c); +CREATE TABLE prt1_n_p1 PARTITION OF prt1_n FOR VALUES FROM ('0000') TO ('0250'); +CREATE TABLE prt1_n_p2 PARTITION OF prt1_n FOR VALUES FROM ('0250') TO ('0500'); +INSERT INTO prt1_n SELECT i, i, to_char(i, 'FM0000') FROM generate_series(0, 499, 2) i; +ANALYZE prt1_n; +CREATE TABLE prt2_n (a int, b int, c text) PARTITION BY LIST(c); +CREATE TABLE prt2_n_p1 PARTITION OF prt2_n FOR VALUES IN ('0000', '0003', '0004', '0010', '0006', '0007'); +CREATE TABLE prt2_n_p2 PARTITION OF prt2_n FOR VALUES IN ('0001', '0005', '0002', '0009', '0008', '0011'); +INSERT INTO prt2_n SELECT i, i, to_char(i/50, 'FM0000') FROM generate_series(0, 599, 2) i; +ANALYZE prt2_n; +CREATE TABLE prt3_n (a int, b int, c text) PARTITION BY LIST(c); +CREATE TABLE prt3_n_p1 PARTITION OF prt3_n FOR VALUES IN ('0000', '0004', '0006', '0007'); +CREATE TABLE prt3_n_p2 PARTITION OF prt3_n FOR VALUES IN ('0001', '0002', '0008', '0010'); +CREATE TABLE prt3_n_p3 PARTITION OF prt3_n FOR VALUES IN ('0003', '0005', '0009', '0011'); +INSERT INTO prt2_n SELECT i, i, to_char(i/50, 'FM0000') FROM generate_series(0, 599, 2) i; +ANALYZE prt3_n; +CREATE TABLE prt4_n (a int, b int, c text) PARTITION BY RANGE(a); +CREATE TABLE prt4_n_p1 PARTITION OF prt4_n FOR VALUES FROM (0) TO (300); +CREATE TABLE prt4_n_p2 PARTITION OF prt4_n FOR VALUES FROM (300) TO (500); +CREATE TABLE prt4_n_p3 PARTITION OF prt4_n FOR VALUES FROM (500) TO (600); +INSERT INTO prt4_n SELECT i, i, to_char(i, 'FM0000') FROM generate_series(0, 599, 2) i; +ANALYZE prt4_n; +-- partition-wise join can not be applied if the partition ranges differ +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1, prt4_n t2 WHERE t1.a = t2.a; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Hash Join + Hash Cond: (t1.a = t2.a) + -> Append + -> Seq Scan on prt1_p1 t1 + -> Seq Scan on prt1_p2 t1_1 + -> Seq Scan on prt1_p3 t1_2 + -> Hash + -> Append + -> Seq Scan on prt4_n_p1 t2 + -> Seq Scan on prt4_n_p2 t2_1 + -> Seq Scan on prt4_n_p3 t2_2 +(12 rows) + +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1, prt4_n t2, prt2 t3 WHERE t1.a = t2.a and t1.a = t3.b; + QUERY PLAN +----------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Hash Join + Hash Cond: (t2.a = t1.a) + -> Append + -> Seq Scan on prt4_n_p1 t2 + -> Seq Scan on prt4_n_p2 t2_1 + -> Seq Scan on prt4_n_p3 t2_2 + -> Hash + -> Hash Join + Hash Cond: (t1.a = b) + -> Append + -> Seq Scan on prt1_p1 t1 + -> Seq Scan on prt1_p2 t1_1 + -> Seq Scan on prt1_p3 t1_2 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on prt2_p1 t3 + -> Seq Scan on prt2_p2 t3_1 + -> Seq Scan on prt2_p3 t3_2 +(20 rows) + +-- partition-wise join can not be applied if there are no equi-join conditions +-- between partition keys +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1 LEFT JOIN prt2 t2 ON (t1.a < t2.b); + QUERY PLAN +----------------------------------------------------------------- + Nested Loop Left Join + Join Filter: (a < b) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on prt1_p1 t1 + -> Seq Scan on prt1_p2 t1_1 + -> Seq Scan on prt1_p3 t1_2 + -> Materialize + -> Remote Subquery Scan on all 
(datanode_1,datanode_2) + -> Append + -> Seq Scan on prt2_p1 t2 + -> Seq Scan on prt2_p2 t2_1 + -> Seq Scan on prt2_p3 t2_2 +(13 rows) + +-- equi-join with join condition on partial keys does not qualify for +-- partition-wise join +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_m t1, prt2_m t2 WHERE t1.a = (t2.b + t2.a)/2; + QUERY PLAN +----------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Hash Join + Hash Cond: ((((b + a) / 2)) = a) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: ((b + a) / 2) + -> Result + -> Append + -> Seq Scan on prt2_m_p1 t2 + -> Seq Scan on prt2_m_p2 t2_1 + -> Seq Scan on prt2_m_p3 t2_2 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Append + -> Seq Scan on prt1_m_p1 t1 + -> Seq Scan on prt1_m_p2 t1_1 + -> Seq Scan on prt1_m_p3 t1_2 +(17 rows) + +-- equi-join between out-of-order partition key columns does not qualify for +-- partition-wise join +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_m t1 LEFT JOIN prt2_m t2 ON t1.a = t2.b; + QUERY PLAN +----------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Hash Left Join + Hash Cond: (t1.a = b) + -> Append + -> Seq Scan on prt1_m_p1 t1 + -> Seq Scan on prt1_m_p2 t1_1 + -> Seq Scan on prt1_m_p3 t1_2 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on prt2_m_p1 t2 + -> Seq Scan on prt2_m_p2 t2_1 + -> Seq Scan on prt2_m_p3 t2_2 +(13 rows) + +-- equi-join between non-key columns does not qualify for partition-wise join +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_m t1 LEFT JOIN prt2_m t2 ON t1.c = t2.c; + QUERY PLAN +----------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Hash Left Join + Hash Cond: (t1.c = c) + -> Append + -> Seq Scan on prt1_m_p1 t1 + -> Seq Scan on prt1_m_p2 t1_1 + -> Seq Scan on prt1_m_p3 t1_2 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on prt2_m_p1 t2 + -> Seq Scan on prt2_m_p2 t2_1 + -> Seq Scan on prt2_m_p3 t2_2 +(13 rows) + +-- partition-wise join can not be applied between tables with different +-- partition lists +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_n t1 LEFT JOIN prt2_n t2 ON (t1.c = t2.c); + QUERY PLAN +----------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Hash Right Join + Hash Cond: (c = (c)::text) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: c + -> Append + -> Seq Scan on prt2_n_p1 t2 + -> Seq Scan on prt2_n_p2 t2_1 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: c + -> Append + -> Seq Scan on prt1_n_p1 t1 + -> Seq Scan on prt1_n_p2 t1_1 +(14 rows) + +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_n t1 JOIN prt2_n t2 ON (t1.c = t2.c) JOIN plt1 t3 ON (t1.c = t3.c); + QUERY PLAN +----------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Hash Join + Hash Cond: (c = (c)::text) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on plt1_p1 t3 + -> Seq Scan on plt1_p2 t3_1 + -> Seq Scan on plt1_p3 t3_2 + -> Hash + -> Hash Join + Hash Cond: 
(t2.c = (c)::text) + -> Append + -> Seq Scan on prt2_n_p1 t2 + -> Seq Scan on prt2_n_p2 t2_1 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on prt1_n_p1 t1 + -> Seq Scan on prt1_n_p2 t1_1 +(19 rows) + +-- partition-wise join can not be applied for a join between list and range +-- partitioned table +EXPLAIN (COSTS OFF) +SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_n t1 FULL JOIN prt1 t2 ON (t1.c = t2.c); + QUERY PLAN +----------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Hash Full Join + Hash Cond: ((c)::text = (c)::text) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: c + -> Append + -> Seq Scan on prt1_p1 t2 + -> Seq Scan on prt1_p2 t2_1 + -> Seq Scan on prt1_p3 t2_2 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: c + -> Append + -> Seq Scan on prt1_n_p1 t1 + -> Seq Scan on prt1_n_p2 t1_1 +(15 rows) + diff --git a/src/test/regress/expected/subselect_2.out b/src/test/regress/expected/subselect_2.out new file mode 100644 index 00000000..b774cebe --- /dev/null +++ b/src/test/regress/expected/subselect_2.out @@ -0,0 +1,1164 @@ +-- +-- SUBSELECT +-- +SELECT 1 AS one WHERE 1 IN (SELECT 1); + one +----- + 1 +(1 row) + +SELECT 1 AS zero WHERE 1 NOT IN (SELECT 1); + zero +------ +(0 rows) + +SELECT 1 AS zero WHERE 1 IN (SELECT 2); + zero +------ +(0 rows) + +-- Check grammar's handling of extra parens in assorted contexts +SELECT * FROM (SELECT 1 AS x) ss; + x +--- + 1 +(1 row) + +SELECT * FROM ((SELECT 1 AS x)) ss; + x +--- + 1 +(1 row) + +(SELECT 2) UNION SELECT 2; + ?column? +---------- + 2 +(1 row) + +((SELECT 2)) UNION SELECT 2; + ?column? +---------- + 2 +(1 row) + +SELECT ((SELECT 2) UNION SELECT 2); + ?column? +---------- + 2 +(1 row) + +SELECT (((SELECT 2)) UNION SELECT 2); + ?column? 
+---------- + 2 +(1 row) + +SELECT (SELECT ARRAY[1,2,3])[1]; + array +------- + 1 +(1 row) + +SELECT ((SELECT ARRAY[1,2,3]))[2]; + array +------- + 2 +(1 row) + +SELECT (((SELECT ARRAY[1,2,3])))[3]; + array +------- + 3 +(1 row) + +-- Set up some simple test tables +CREATE TABLE SUBSELECT_TBL ( + f1 integer, + f2 integer, + f3 float +); +INSERT INTO SUBSELECT_TBL VALUES (1, 2, 3); +INSERT INTO SUBSELECT_TBL VALUES (2, 3, 4); +INSERT INTO SUBSELECT_TBL VALUES (3, 4, 5); +INSERT INTO SUBSELECT_TBL VALUES (1, 1, 1); +INSERT INTO SUBSELECT_TBL VALUES (2, 2, 2); +INSERT INTO SUBSELECT_TBL VALUES (3, 3, 3); +INSERT INTO SUBSELECT_TBL VALUES (6, 7, 8); +INSERT INTO SUBSELECT_TBL VALUES (8, 9, NULL); +SELECT '' AS eight, * FROM SUBSELECT_TBL ORDER BY f1, f2, f3; + eight | f1 | f2 | f3 +-------+----+----+---- + | 1 | 1 | 1 + | 1 | 2 | 3 + | 2 | 2 | 2 + | 2 | 3 | 4 + | 3 | 3 | 3 + | 3 | 4 | 5 + | 6 | 7 | 8 + | 8 | 9 | +(8 rows) + +-- Uncorrelated subselects +SELECT '' AS two, f1 AS "Constant Select" FROM SUBSELECT_TBL + WHERE f1 IN (SELECT 1) ORDER BY 2; + two | Constant Select +-----+----------------- + | 1 + | 1 +(2 rows) + +SELECT '' AS six, f1 AS "Uncorrelated Field" FROM SUBSELECT_TBL + WHERE f1 IN (SELECT f2 FROM SUBSELECT_TBL) + ORDER BY 2; + six | Uncorrelated Field +-----+-------------------- + | 1 + | 1 + | 2 + | 2 + | 3 + | 3 +(6 rows) + +SELECT '' AS six, f1 AS "Uncorrelated Field" FROM SUBSELECT_TBL + WHERE f1 IN (SELECT f2 FROM SUBSELECT_TBL WHERE + f2 IN (SELECT f1 FROM SUBSELECT_TBL)) + ORDER BY 2; + six | Uncorrelated Field +-----+-------------------- + | 1 + | 1 + | 2 + | 2 + | 3 + | 3 +(6 rows) + +SELECT '' AS three, f1, f2 + FROM SUBSELECT_TBL + WHERE (f1, f2) NOT IN (SELECT f2, CAST(f3 AS int4) FROM SUBSELECT_TBL + WHERE f3 IS NOT NULL) + ORDER BY f1, f2; + three | f1 | f2 +-------+----+---- + | 1 | 2 + | 6 | 7 + | 8 | 9 +(3 rows) + +-- Correlated subselects +SELECT '' AS six, f1 AS "Correlated Field", f2 AS "Second Field" + FROM SUBSELECT_TBL upper + WHERE f1 IN (SELECT f2 FROM SUBSELECT_TBL WHERE f1 = upper.f1) + ORDER BY f1, f2; + six | Correlated Field | Second Field +-----+------------------+-------------- + | 1 | 1 + | 1 | 2 + | 2 | 2 + | 2 | 3 + | 3 | 3 + | 3 | 4 +(6 rows) + +SELECT '' AS six, f1 AS "Correlated Field", f3 AS "Second Field" + FROM SUBSELECT_TBL upper + WHERE f1 IN + (SELECT f2 FROM SUBSELECT_TBL WHERE CAST(upper.f2 AS float) = f3) + ORDER BY 2, 3; + six | Correlated Field | Second Field +-----+------------------+-------------- + | 1 | 1 + | 2 | 2 + | 2 | 4 + | 3 | 3 + | 3 | 5 +(5 rows) + +SELECT '' AS six, f1 AS "Correlated Field", f3 AS "Second Field" + FROM SUBSELECT_TBL upper + WHERE f3 IN (SELECT upper.f1 + f2 FROM SUBSELECT_TBL + WHERE f2 = CAST(f3 AS integer)) + ORDER BY 2, 3; + six | Correlated Field | Second Field +-----+------------------+-------------- + | 1 | 3 + | 2 | 4 + | 3 | 5 + | 6 | 8 +(4 rows) + +SELECT '' AS five, f1 AS "Correlated Field" + FROM SUBSELECT_TBL + WHERE (f1, f2) IN (SELECT f2, CAST(f3 AS int4) FROM SUBSELECT_TBL + WHERE f3 IS NOT NULL) + ORDER BY 2; + five | Correlated Field +------+------------------ + | 1 + | 2 + | 2 + | 3 + | 3 +(5 rows) + +-- +-- Use some existing tables in the regression test +-- +SELECT '' AS eight, ss.f1 AS "Correlated Field", ss.f3 AS "Second Field" + FROM SUBSELECT_TBL ss + WHERE f1 NOT IN (SELECT f1+1 FROM INT4_TBL + WHERE f1 != ss.f1 AND f1 < 2147483647) + ORDER BY 2, 3; + eight | Correlated Field | Second Field +-------+------------------+-------------- + | 2 | 2 + | 2 | 4 + | 3 | 3 + | 3 | 5 + 
| 6 | 8 + | 8 | +(6 rows) + +select q1, float8(count(*)) / (select count(*) from int8_tbl) +from int8_tbl group by q1 order by q1; + q1 | ?column? +------------------+---------- + 123 | 0.4 + 4567890123456789 | 0.6 +(2 rows) + +-- Unspecified-type literals in output columns should resolve as text +SELECT *, pg_typeof(f1) FROM + (SELECT 'foo' AS f1 FROM generate_series(1,3)) ss ORDER BY 1; + f1 | pg_typeof +-----+----------- + foo | text + foo | text + foo | text +(3 rows) + +-- ... unless there's context to suggest differently +explain verbose select '42' union all select '43'; + QUERY PLAN +------------------------------------------------- + Append (cost=0.00..0.04 rows=2 width=32) + -> Result (cost=0.00..0.01 rows=1 width=32) + Output: '42'::text + -> Result (cost=0.00..0.01 rows=1 width=32) + Output: '43'::text +(5 rows) + +explain verbose select '42' union all select 43; + QUERY PLAN +------------------------------------------------ + Append (cost=0.00..0.04 rows=2 width=4) + -> Result (cost=0.00..0.01 rows=1 width=4) + Output: 42 + -> Result (cost=0.00..0.01 rows=1 width=4) + Output: 43 +(5 rows) + +-- check materialization of an initplan reference (bug #14524) +explain (verbose, costs off) +select 1 = all (select (select 1)); + QUERY PLAN +----------------------------------- + Result + Output: (SubPlan 2) + SubPlan 2 + -> Materialize + Output: ($0) + InitPlan 1 (returns $0) + -> Result + Output: 1 + -> Result + Output: $0 +(10 rows) + +select 1 = all (select (select 1)); + ?column? +---------- + t +(1 row) + +-- +-- Check EXISTS simplification with LIMIT +-- +explain (costs off) +select * from int4_tbl o where exists + (select 1 from int4_tbl i where i.f1=o.f1 limit null); + QUERY PLAN +------------------------------------------ + Remote Subquery Scan on all (datanode_1) + -> Hash Semi Join + Hash Cond: (o.f1 = i.f1) + -> Seq Scan on int4_tbl o + -> Hash + -> Seq Scan on int4_tbl i +(6 rows) + +explain (costs off, nodes off) +select * from int4_tbl o where not exists + (select 1 from int4_tbl i where i.f1=o.f1 limit 1); + QUERY PLAN +------------------------------------------ + Remote Subquery Scan on all + -> Hash Anti Join + Hash Cond: (o.f1 = i.f1) + -> Seq Scan on int4_tbl o + -> Hash + -> Seq Scan on int4_tbl i +(6 rows) + +explain (costs off, nodes off) +select * from int4_tbl o where exists + (select 1 from int4_tbl i where i.f1=o.f1 limit 0); + QUERY PLAN +-------------------------------------------------------- + Remote Subquery Scan on all + -> Seq Scan on int4_tbl o + Filter: (SubPlan 1) + SubPlan 1 + -> Limit + -> Remote Subquery Scan on all + -> Limit + -> Seq Scan on int4_tbl i + Filter: (f1 = o.f1) +(9 rows) + +-- +-- Test cases to catch unpleasant interactions between IN-join processing +-- and subquery pullup. +-- +select count(*) from + (select 1 from tenk1 a + where unique1 IN (select hundred from tenk1 b)) ss; + count +------- + 100 +(1 row) + +select count(distinct ss.ten) from + (select ten from tenk1 a + where unique1 IN (select hundred from tenk1 b)) ss; + count +------- + 10 +(1 row) + +select count(*) from + (select 1 from tenk1 a + where unique1 IN (select distinct hundred from tenk1 b)) ss; + count +------- + 100 +(1 row) + +select count(distinct ss.ten) from + (select ten from tenk1 a + where unique1 IN (select distinct hundred from tenk1 b)) ss; + count +------- + 10 +(1 row) + +-- +-- Test cases to check for overenthusiastic optimization of +-- "IN (SELECT DISTINCT ...)" and related cases. Per example from +-- Luca Pireddu and Michael Fuhr. 
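+-- (Aside: the distinction exercised below is that distinct-ing over the pair
+-- (id1, id2) does not make id2 alone duplicate-free (bar holds both (1,1) and
+-- (3,1)), so the IN still needs its own unique-ification step above subquery
+-- s, whereas DISTINCT ON (id2) or GROUP BY id2 already yields at most one row
+-- per id2.)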
+-- +CREATE TEMP TABLE foo (id integer); +CREATE TEMP TABLE bar (id1 integer, id2 integer); +INSERT INTO foo VALUES (1); +INSERT INTO bar VALUES (1, 1); +INSERT INTO bar VALUES (2, 2); +INSERT INTO bar VALUES (3, 1); +-- These cases require an extra level of distinct-ing above subquery s +SELECT * FROM foo WHERE id IN + (SELECT id2 FROM (SELECT DISTINCT id1, id2 FROM bar) AS s); + id +---- + 1 +(1 row) + +SELECT * FROM foo WHERE id IN + (SELECT id2 FROM (SELECT id1,id2 FROM bar GROUP BY id1,id2) AS s); + id +---- + 1 +(1 row) + +SELECT * FROM foo WHERE id IN + (SELECT id2 FROM (SELECT id1, id2 FROM bar UNION + SELECT id1, id2 FROM bar) AS s); + id +---- + 1 +(1 row) + +-- These cases do not +SELECT * FROM foo WHERE id IN + (SELECT id2 FROM (SELECT DISTINCT ON (id2) id1, id2 FROM bar) AS s); + id +---- + 1 +(1 row) + +SELECT * FROM foo WHERE id IN + (SELECT id2 FROM (SELECT id2 FROM bar GROUP BY id2) AS s); + id +---- + 1 +(1 row) + +SELECT * FROM foo WHERE id IN + (SELECT id2 FROM (SELECT id2 FROM bar UNION + SELECT id2 FROM bar) AS s); + id +---- + 1 +(1 row) + +-- +-- Test case to catch problems with multiply nested sub-SELECTs not getting +-- recalculated properly. Per bug report from Didier Moens. +-- +CREATE TABLE orderstest ( + approver_ref integer, + po_ref integer, + ordercanceled boolean +); +INSERT INTO orderstest VALUES (1, 1, false); +INSERT INTO orderstest VALUES (66, 5, false); +INSERT INTO orderstest VALUES (66, 6, false); +INSERT INTO orderstest VALUES (66, 7, false); +INSERT INTO orderstest VALUES (66, 1, true); +INSERT INTO orderstest VALUES (66, 8, false); +INSERT INTO orderstest VALUES (66, 1, false); +INSERT INTO orderstest VALUES (77, 1, false); +INSERT INTO orderstest VALUES (1, 1, false); +INSERT INTO orderstest VALUES (66, 1, false); +INSERT INTO orderstest VALUES (1, 1, false); +CREATE VIEW orders_view AS +SELECT *, +(SELECT CASE + WHEN ord.approver_ref=1 THEN '---' ELSE 'Approved' + END) AS "Approved", +(SELECT CASE + WHEN ord.ordercanceled + THEN 'Canceled' + ELSE + (SELECT CASE + WHEN ord.po_ref=1 + THEN + (SELECT CASE + WHEN ord.approver_ref=1 + THEN '---' + ELSE 'Approved' + END) + ELSE 'PO' + END) +END) AS "Status", +(CASE + WHEN ord.ordercanceled + THEN 'Canceled' + ELSE + (CASE + WHEN ord.po_ref=1 + THEN + (CASE + WHEN ord.approver_ref=1 + THEN '---' + ELSE 'Approved' + END) + ELSE 'PO' + END) +END) AS "Status_OK" +FROM orderstest ord; +SELECT * FROM orders_view +ORDER BY approver_ref, po_ref, ordercanceled; + approver_ref | po_ref | ordercanceled | Approved | Status | Status_OK +--------------+--------+---------------+----------+----------+----------- + 1 | 1 | f | --- | --- | --- + 1 | 1 | f | --- | --- | --- + 1 | 1 | f | --- | --- | --- + 66 | 1 | f | Approved | Approved | Approved + 66 | 1 | f | Approved | Approved | Approved + 66 | 1 | t | Approved | Canceled | Canceled + 66 | 5 | f | Approved | PO | PO + 66 | 6 | f | Approved | PO | PO + 66 | 7 | f | Approved | PO | PO + 66 | 8 | f | Approved | PO | PO + 77 | 1 | f | Approved | Approved | Approved +(11 rows) + +DROP TABLE orderstest cascade; +NOTICE: drop cascades to view orders_view +-- +-- Test cases to catch situations where rule rewriter fails to propagate +-- hasSubLinks flag correctly. Per example from Kyle Bateman. 
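+-- (Aside: the INSERT below routes through shipped_view, so the rule rewrites
+-- it into an INSERT on shipped while the VALUES list still carries a
+-- sub-SELECT on parts; the rewritten query has to stay flagged as containing
+-- sublinks for that sub-SELECT to be planned at all.)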
+-- +create temp table parts ( + partnum text, + cost float8 +); +create temp table shipped ( + ttype char(2), + ordnum int4, + partnum text, + value float8 +); +create temp view shipped_view as + select * from shipped where ttype = 'wt'; +create rule shipped_view_insert as on insert to shipped_view do instead + insert into shipped values('wt', new.ordnum, new.partnum, new.value); +insert into parts (partnum, cost) values (1, 1234.56); +insert into shipped_view (ordnum, partnum, value) + values (0, 1, (select cost from parts where partnum = '1')); +select * from shipped_view; + ttype | ordnum | partnum | value +-------+--------+---------+--------- + wt | 0 | 1 | 1234.56 +(1 row) + +create rule shipped_view_update as on update to shipped_view do instead + update shipped set partnum = new.partnum, value = new.value + where ttype = new.ttype and ordnum = new.ordnum; +update shipped_view set value = 11 + from int4_tbl a join int4_tbl b + on (a.f1 = (select f1 from int4_tbl c where c.f1=b.f1)) + where ordnum = a.f1; +ERROR: could not plan this distributed update +DETAIL: correlated UPDATE or updating distribution column currently not supported in Postgres-XL. +select * from shipped_view; + ttype | ordnum | partnum | value +-------+--------+---------+--------- + wt | 0 | 1 | 1234.56 +(1 row) + +select f1, ss1 as relabel from + (select *, (select sum(f1) from int4_tbl b where f1 >= a.f1) as ss1 + from int4_tbl a) ss + ORDER BY f1, relabel; + f1 | relabel +-------------+------------ + -2147483647 | 0 + -123456 | 2147483647 + 0 | 2147607103 + 123456 | 2147607103 + 2147483647 | 2147483647 +(5 rows) + +-- +-- Test cases involving PARAM_EXEC parameters and min/max index optimizations. +-- Per bug report from David Sanchez i Gregori. +-- +select * from ( + select max(unique1) from tenk1 as a + where exists (select 1 from tenk1 as b where b.thousand = a.unique2) +) ss; + max +------ + 9997 +(1 row) + +select * from ( + select min(unique1) from tenk1 as a + where not exists (select 1 from tenk1 as b where b.unique2 = 10000) +) ss; + min +----- + 0 +(1 row) + +-- +-- Test that an IN implemented using a UniquePath does unique-ification +-- with the right semantics, as per bug #4113. (Unfortunately we have +-- no simple way to ensure that this test case actually chooses that type +-- of plan, but it does in releases 7.4-8.3. Note that an ordering difference +-- here might mean that some other plan type is being used, rendering the test +-- pointless.) 
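+-- (Aside, not part of the captured output: one way to check which plan shape
+-- was actually chosen for the cross-type IN queries below is to run them
+-- under EXPLAIN, for example
+--   explain (costs off)
+--   select * from float_table
+--     where float_col in (select num_col from numeric_table);
+-- and look for a Unique or HashAggregate node doing the unique-ification.)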
+-- +create temp table numeric_table (num_col numeric); +insert into numeric_table values (1), (1.000000000000000000001), (2), (3); +create temp table float_table (float_col float8); +insert into float_table values (1), (2), (3); +select * from float_table + where float_col in (select num_col from numeric_table) + ORDER BY float_col; + float_col +----------- + 1 + 2 + 3 +(3 rows) + +select * from numeric_table + where num_col in (select float_col from float_table) + ORDER BY num_col; + num_col +------------------------- + 1 + 1.000000000000000000001 + 2 + 3 +(4 rows) + +-- +-- Test case for bug #4290: bogus calculation of subplan param sets +-- +create temp table ta (id int primary key, val int); +insert into ta values(1,1); +insert into ta values(2,2); +create temp table tb (id int primary key, aval int); +insert into tb values(1,1); +insert into tb values(2,1); +insert into tb values(3,2); +insert into tb values(4,2); +create temp table tc (id int primary key, aid int); +insert into tc values(1,1); +insert into tc values(2,2); +select + ( select min(tb.id) from tb + where tb.aval = (select ta.val from ta where ta.id = tc.aid) ) as min_tb_id +from tc +ORDER BY min_tb_id; + min_tb_id +----------- + 1 + 3 +(2 rows) + +-- +-- Test case for 8.3 "failed to locate grouping columns" bug +-- +create temp table t1 (f1 numeric(14,0), f2 varchar(30)); +select * from + (select distinct f1, f2, (select f2 from t1 x where x.f1 = up.f1) as fs + from t1 up) ss +group by f1,f2,fs; + f1 | f2 | fs +----+----+---- +(0 rows) + +-- +-- Test case for bug #5514 (mishandling of whole-row Vars in subselects) +-- +create temp table table_a(id integer); +insert into table_a values (42); +create temp view view_a as select * from table_a; +select view_a from view_a; + view_a +-------- + (42) +(1 row) + +select (select view_a) from view_a; + view_a +-------- + (42) +(1 row) + +select (select (select view_a)) from view_a; + view_a +-------- + (42) +(1 row) + +select (select (a.*)::text) from view_a a; + a +------ + (42) +(1 row) + +-- +-- Check that whole-row Vars reading the result of a subselect don't include +-- any junk columns therein +-- +select q from (select max(f1) from int4_tbl group by f1 order by f1) q; + q +--------------- + (-2147483647) + (-123456) + (0) + (123456) + (2147483647) +(5 rows) + +with q as (select max(f1) from int4_tbl group by f1 order by f1) + select q from q; + q +--------------- + (-2147483647) + (-123456) + (0) + (123456) + (2147483647) +(5 rows) + +-- +-- Test case for sublinks pushed down into subselects via join alias expansion +-- +select + (select sq1) as qq1 +from + (select exists(select 1 from int4_tbl where f1 = q2) as sq1, 42 as dummy + from int8_tbl) sq0 + join + int4_tbl i4 on dummy = i4.f1; + qq1 +----- +(0 rows) + +-- +-- Test case for subselect within UPDATE of INSERT...ON CONFLICT DO UPDATE +-- +create temp table upsert(key int4 primary key, val text); +insert into upsert values(1, 'val') on conflict (key) do update set val = 'not seen'; +insert into upsert values(1, 'val') on conflict (key) do update set val = 'seen with subselect ' || (select f1 from int4_tbl where f1 != 0 limit 1)::text; +select * from upsert; + key | val +-----+---------------------------- + 1 | seen with subselect 123456 +(1 row) + +with aa as (select 'int4_tbl' u from int4_tbl limit 1) +insert into upsert values (1, 'x'), (999, 'y') +on conflict (key) do update set val = (select u from aa) +returning *; + key | val +-----+---------- + 1 | int4_tbl + 999 | y +(2 rows) + +-- +-- Test case for 
cross-type partial matching in hashed subplan (bug #7597) +-- +create temp table outer_7597 (f1 int4, f2 int4); +insert into outer_7597 values (0, 0); +insert into outer_7597 values (1, 0); +insert into outer_7597 values (0, null); +insert into outer_7597 values (1, null); +create temp table inner_7597(c1 int8, c2 int8); +insert into inner_7597 values(0, null); +select * from outer_7597 where (f1, f2) not in (select * from inner_7597) order by 1; + f1 | f2 +----+---- + 1 | 0 + 1 | +(2 rows) + +-- +-- Test case for premature memory release during hashing of subplan output +-- +select '1'::text in (select '1'::name union all select '1'::name); + ?column? +---------- + t +(1 row) + +-- +-- Test case for planner bug with nested EXISTS handling +-- +select a.thousand from tenk1 a, tenk1 b +where a.thousand = b.thousand + and exists ( select 1 from tenk1 c where b.hundred = c.hundred + and not exists ( select 1 from tenk1 d + where a.thousand = d.thousand ) ); + thousand +---------- +(0 rows) + +-- +-- Check that nested sub-selects are not pulled up if they contain volatiles +-- +explain (verbose, costs off) + select x, x from + (select (select now()) as x from (values(1),(2)) v(y)) ss; + QUERY PLAN +--------------------------- + Values Scan on "*VALUES*" + Output: $0, $1 + InitPlan 1 (returns $0) + -> Result + Output: now() + InitPlan 2 (returns $1) + -> Result + Output: now() +(8 rows) + +explain (verbose, costs off) + select x, x from + (select (select random()) as x from (values(1),(2)) v(y)) ss; + QUERY PLAN +---------------------------------- + Subquery Scan on ss + Output: ss.x, ss.x + -> Values Scan on "*VALUES*" + Output: $0 + InitPlan 1 (returns $0) + -> Result + Output: random() +(7 rows) + +explain (verbose, costs off) + select x, x from + (select (select now() where y=y) as x from (values(1),(2)) v(y)) ss; + QUERY PLAN +---------------------------------------------------------------------- + Values Scan on "*VALUES*" + Output: (SubPlan 1), (SubPlan 2) + SubPlan 1 + -> Result + Output: now() + One-Time Filter: ("*VALUES*".column1 = "*VALUES*".column1) + SubPlan 2 + -> Result + Output: now() + One-Time Filter: ("*VALUES*".column1 = "*VALUES*".column1) +(10 rows) + +explain (verbose, costs off) + select x, x from + (select (select random() where y=y) as x from (values(1),(2)) v(y)) ss; + QUERY PLAN +---------------------------------------------------------------------------- + Subquery Scan on ss + Output: ss.x, ss.x + -> Values Scan on "*VALUES*" + Output: (SubPlan 1) + SubPlan 1 + -> Result + Output: random() + One-Time Filter: ("*VALUES*".column1 = "*VALUES*".column1) +(8 rows) + +-- +-- Check we behave sanely in corner case of empty SELECT list (bug #8648) +-- +create temp table nocolumns(); +select exists(select * from nocolumns); + exists +-------- + f +(1 row) + +-- +-- Check sane behavior with nested IN SubLinks +-- +explain (verbose, costs off) +select * from int4_tbl where + (case when f1 in (select unique1 from tenk1 a) then f1 else null end) in + (select ten from tenk1 b); + QUERY PLAN +--------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + Output: int4_tbl.f1 + -> Hash Join + Output: int4_tbl.f1 + Inner Unique: true + Hash Cond: (CASE WHEN (hashed SubPlan 1) THEN int4_tbl.f1 ELSE NULL::integer END = b.ten) + -> Seq Scan on public.int4_tbl + Output: int4_tbl.f1 + -> Hash + Output: b.ten + -> HashAggregate + Output: b.ten + Group Key: b.ten + -> Remote Subquery Scan on all 
(datanode_1,datanode_2) + Output: b.ten + Distribute results by H: ten + -> HashAggregate + Output: b.ten + Group Key: b.ten + -> Seq Scan on public.tenk1 b + Output: b.ten + SubPlan 1 + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: a.unique1 + -> Index Only Scan using tenk1_unique1 on public.tenk1 a + Output: a.unique1 +(26 rows) + +select * from int4_tbl where + (case when f1 in (select unique1 from tenk1 a) then f1 else null end) in + (select ten from tenk1 b); + f1 +---- + 0 +(1 row) + +-- +-- Check for incorrect optimization when IN subquery contains a SRF +-- +explain (verbose, costs off) +select * from int4_tbl o where (f1, f1) in + (select f1, generate_series(1,2) / 10 g from int4_tbl i group by f1); + QUERY PLAN +------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1) + Output: o.f1 + -> Nested Loop Semi Join + Output: o.f1 + Join Filter: (o.f1 = "ANY_subquery".f1) + -> Seq Scan on public.int4_tbl o + Output: o.f1 + -> Materialize + Output: "ANY_subquery".f1, "ANY_subquery".g + -> Subquery Scan on "ANY_subquery" + Output: "ANY_subquery".f1, "ANY_subquery".g + Filter: ("ANY_subquery".f1 = "ANY_subquery".g) + -> Result + Output: i.f1, ((generate_series(1, 2)) / 10) + -> ProjectSet + Output: generate_series(1, 2), i.f1 + -> HashAggregate + Output: i.f1 + Group Key: i.f1 + -> Seq Scan on public.int4_tbl i + Output: i.f1 +(21 rows) + +select * from int4_tbl o where (f1, f1) in + (select f1, generate_series(1,2) / 10 g from int4_tbl i group by f1); + f1 +---- + 0 +(1 row) + +-- +-- check for over-optimization of whole-row Var referencing an Append plan +-- +select (select q from + (select 1,2,3 where f1 > 0 + union all + select 4,5,6.0 where f1 <= 0 + ) q ) +from int4_tbl order by 1; + q +----------- + (1,2,3) + (1,2,3) + (4,5,6.0) + (4,5,6.0) + (4,5,6.0) +(5 rows) + +-- +-- Check that volatile quals aren't pushed down past a DISTINCT: +-- nextval() should not be called more than the nominal number of times +-- +create temp sequence ts1; +select * from + (select distinct ten from tenk1) ss + where ten < 10 + nextval('ts1') + order by 1; + ten +----- + 0 + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 +(10 rows) + +select nextval('ts1'); + nextval +--------- + 11 +(1 row) + +SELECT setseed(0); + setseed +--------- + +(1 row) + +-- DROP TABLE IF EXISTS asd ; +CREATE TABLE IF NOT EXISTS asd AS +SELECT clientid::numeric(20), + (clientid / 20 )::integer::numeric(20) as userid, + cts + ((random()* 3600 *24 )||'sec')::interval as cts, + (ARRAY['A','B','C','D','E','F'])[(random()*5+1)::integer] as state, + 0 as dim, + ((ARRAY['Cat','Dog','Duck'])[(clientid / 10 )% 3 +1 ]) ::text as app_name, + ((ARRAY['A','B'])[(clientid / 10 )% 2 +1 ]) ::text as platform + FROM generate_series('2016-01-01'::timestamp,'2016-10-01'::timestamp,interval '15 day') cts , generate_series( 1000,2000,10) clientid , generate_series(1,6) t +; +SELECT dates::timestamp as dates ,B.platform,B.app_name, B.clientid, B.userid, + B.state as state +FROM ( VALUES +('2016.08.30. 08:52:43') ,('2016.08.29. 04:57:12') ,('2016.08.26. 08:15:05') , +('2016.08.24. 11:49:51') ,('2016.08.22. 08:45:29') ,('2016.08.21. 04:53:47') ,('2016.08.20. 
08:44:03') +) AS D (dates) +JOIN +( SELECT DISTINCT clientid FROM asd + WHERE userid=74 ) C ON True +INNER JOIN LATERAL ( + SELECT DISTINCT ON (clientid,app_name,platform,state,dim) x.* + FROM asd x + INNER JOIN (SELECT p.clientid,p.app_name,p.platform , p.state, p.dim , + MAX(p.cts) AS selected_cts + FROM asd p + where cts y; +end$$; +explain (verbose, costs off) +select * from + (select 9 as x, unnest(array[1,2,3,11,12,13]) as u) ss + where tattle(x, 8); + QUERY PLAN +---------------------------------------------------------- + Subquery Scan on ss + Output: x, u + Filter: tattle(ss.x, 8) + -> ProjectSet + Output: 9, unnest('{1,2,3,11,12,13}'::integer[]) + -> Result +(6 rows) + +select * from + (select 9 as x, unnest(array[1,2,3,11,12,13]) as u) ss + where tattle(x, 8); +NOTICE: x = 9, y = 8 +NOTICE: x = 9, y = 8 +NOTICE: x = 9, y = 8 +NOTICE: x = 9, y = 8 +NOTICE: x = 9, y = 8 +NOTICE: x = 9, y = 8 + x | u +---+---- + 9 | 1 + 9 | 2 + 9 | 3 + 9 | 11 + 9 | 12 + 9 | 13 +(6 rows) + +-- if we pretend it's stable, we get different results: +alter function tattle(x int, y int) stable; +explain (verbose, costs off) +select * from + (select 9 as x, unnest(array[1,2,3,11,12,13]) as u) ss + where tattle(x, 8); + QUERY PLAN +---------------------------------------------------- + ProjectSet + Output: 9, unnest('{1,2,3,11,12,13}'::integer[]) + -> Result + One-Time Filter: tattle(9, 8) +(4 rows) + +select * from + (select 9 as x, unnest(array[1,2,3,11,12,13]) as u) ss + where tattle(x, 8); +NOTICE: x = 9, y = 8 + x | u +---+---- + 9 | 1 + 9 | 2 + 9 | 3 + 9 | 11 + 9 | 12 + 9 | 13 +(6 rows) + +-- although even a stable qual should not be pushed down if it references SRF +explain (verbose, costs off) +select * from + (select 9 as x, unnest(array[1,2,3,11,12,13]) as u) ss + where tattle(x, u); + QUERY PLAN +---------------------------------------------------------- + Subquery Scan on ss + Output: x, u + Filter: tattle(ss.x, ss.u) + -> ProjectSet + Output: 9, unnest('{1,2,3,11,12,13}'::integer[]) + -> Result +(6 rows) + +select * from + (select 9 as x, unnest(array[1,2,3,11,12,13]) as u) ss + where tattle(x, u); +NOTICE: x = 9, y = 1 +NOTICE: x = 9, y = 2 +NOTICE: x = 9, y = 3 +NOTICE: x = 9, y = 11 +NOTICE: x = 9, y = 12 +NOTICE: x = 9, y = 13 + x | u +---+--- + 9 | 1 + 9 | 2 + 9 | 3 +(3 rows) + +drop function tattle(x int, y int); diff --git a/src/test/regress/expected/xc_groupby_3.out b/src/test/regress/expected/xc_groupby_3.out new file mode 100644 index 00000000..6344aa6b --- /dev/null +++ b/src/test/regress/expected/xc_groupby_3.out @@ -0,0 +1,7513 @@ +-- this file contains tests for GROUP BY with combinations of following +-- 1. enable_hashagg = on/off (to force the grouping by sorting) +-- 2. distributed or replicated tables across the datanodes +-- If a testcase is added to any of the combinations, please check if it's +-- applicable in other combinations as well. 
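+-- In the plans below, grouped aggregates are expected to split into a Partial
+-- step on the datanodes and a Finalize step above a redistribution on the
+-- grouping key ("Distribute results by H: <key>").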
+-- Since we want to test the plan reduction of GROUP and AGG nodes, disable fast +-- query shipping +set enable_fast_query_shipping to off; +-- Combination 1: enable_hashagg on and distributed tables +set enable_hashagg to on; +-- create required tables and fill them with data +create table xc_groupby_tab1 (val int, val2 int); +create table xc_groupby_tab2 (val int, val2 int); +insert into xc_groupby_tab1 values (1, 1), (2, 1), (3, 1), (2, 2), (6, 2), (4, 3), (1, 3), (6, 3); +insert into xc_groupby_tab2 values (1, 1), (4, 1), (8, 1), (2, 4), (9, 4), (3, 4), (4, 2), (5, 2), (3, 2); +select count(*), sum(val), avg(val), sum(val)::float8/count(*), val2 from xc_groupby_tab1 group by val2 order by 1, 2; + count | sum | avg | ?column? | val2 +-------+-----+--------------------+------------------+------ + 2 | 8 | 4.0000000000000000 | 4 | 2 + 3 | 6 | 2.0000000000000000 | 2 | 1 + 3 | 11 | 3.6666666666666667 | 3.66666666666667 | 3 +(3 rows) + +explain (verbose true, costs false, nodes false) select count(*), sum(val), avg(val), sum(val)::float8/count(*), val2 from xc_groupby_tab1 group by val2 order by 1, 2; + QUERY PLAN +--------------------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: count(*), sum(val), avg(val), ((sum(val))::double precision / (count(*))::double precision), val2 + Sort Key: count(*), sum(xc_groupby_tab1.val) + -> Sort + Output: (count(*)), (sum(val)), (avg(val)), (((sum(val))::double precision / (count(*))::double precision)), val2 + Sort Key: (count(*)), (sum(xc_groupby_tab1.val)) + -> Finalize HashAggregate + Output: count(*), sum(val), avg(val), ((sum(val))::double precision / (count(*))::double precision), val2 + Group Key: xc_groupby_tab1.val2 + -> Remote Subquery Scan on all + Output: val2, PARTIAL count(*), PARTIAL sum(val), PARTIAL avg(val) + Distribute results by H: val2 + -> Partial HashAggregate + Output: val2, PARTIAL count(*), PARTIAL sum(val), PARTIAL avg(val) + Group Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: val, val2 +(17 rows) + +explain (verbose true, costs false, nodes false) select count(*), sum(val), avg(val), sum(val)::float8/count(*), val2 from xc_groupby_tab1 group by val2; + QUERY PLAN +------------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: count(*), sum(val), avg(val), ((sum(val))::double precision / (count(*))::double precision), val2 + -> Finalize HashAggregate + Output: count(*), sum(val), avg(val), ((sum(val))::double precision / (count(*))::double precision), val2 + Group Key: xc_groupby_tab1.val2 + -> Remote Subquery Scan on all + Output: val2, PARTIAL count(*), PARTIAL sum(val), PARTIAL avg(val) + Distribute results by H: val2 + -> Partial HashAggregate + Output: val2, PARTIAL count(*), PARTIAL sum(val), PARTIAL avg(val) + Group Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: val, val2 +(13 rows) + +-- joins and group by +select count(*), sum(xc_groupby_tab1.val * xc_groupby_tab2.val), avg(xc_groupby_tab1.val*xc_groupby_tab2.val), sum(xc_groupby_tab1.val*xc_groupby_tab2.val)::float8/count(*), xc_groupby_tab1.val2, xc_groupby_tab2.val2 from xc_groupby_tab1 full outer join xc_groupby_tab2 on xc_groupby_tab1.val2 = xc_groupby_tab2.val2 group by xc_groupby_tab1.val2, xc_groupby_tab2.val2 order by count(*); + count | sum | avg | ?column? 
| val2 | val2 +-------+-----+---------------------+------------------+------+------ + 3 | | | | | 4 + 3 | | | | 3 | + 6 | 96 | 16.0000000000000000 | 16 | 2 | 2 + 9 | 78 | 8.6666666666666667 | 8.66666666666667 | 1 | 1 +(4 rows) + +explain (verbose true, costs false, nodes false) select count(*), sum(xc_groupby_tab1.val * xc_groupby_tab2.val), avg(xc_groupby_tab1.val*xc_groupby_tab2.val), sum(xc_groupby_tab1.val*xc_groupby_tab2.val)::float8/count(*), xc_groupby_tab1.val2, xc_groupby_tab2.val2 from xc_groupby_tab1 full outer join xc_groupby_tab2 on xc_groupby_tab1.val2 = xc_groupby_tab2.val2 group by xc_groupby_tab1.val2, xc_groupby_tab2.val2; + QUERY PLAN +--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: count(*), sum((xc_groupby_tab1.val * xc_groupby_tab2.val)), avg((xc_groupby_tab1.val * xc_groupby_tab2.val)), ((sum((xc_groupby_tab1.val * xc_groupby_tab2.val)))::double precision / (count(*))::double precision), xc_groupby_tab1.val2, xc_groupby_tab2.val2 + -> Finalize HashAggregate + Output: count(*), sum((xc_groupby_tab1.val * xc_groupby_tab2.val)), avg((xc_groupby_tab1.val * xc_groupby_tab2.val)), ((sum((xc_groupby_tab1.val * xc_groupby_tab2.val)))::double precision / (count(*))::double precision), xc_groupby_tab1.val2, xc_groupby_tab2.val2 + Group Key: xc_groupby_tab1.val2, xc_groupby_tab2.val2 + -> Remote Subquery Scan on all + Output: xc_groupby_tab1.val2, xc_groupby_tab2.val2, PARTIAL count(*), PARTIAL sum((xc_groupby_tab1.val * xc_groupby_tab2.val)), PARTIAL avg((xc_groupby_tab1.val * xc_groupby_tab2.val)) + Distribute results by H: val2 + -> Partial HashAggregate + Output: xc_groupby_tab1.val2, xc_groupby_tab2.val2, PARTIAL count(*), PARTIAL sum((xc_groupby_tab1.val * xc_groupby_tab2.val)), PARTIAL avg((xc_groupby_tab1.val * xc_groupby_tab2.val)) + Group Key: xc_groupby_tab1.val2, xc_groupby_tab2.val2 + -> Hash Full Join + Output: xc_groupby_tab1.val2, xc_groupby_tab2.val2, xc_groupby_tab1.val, xc_groupby_tab2.val + Hash Cond: (xc_groupby_tab1.val2 = xc_groupby_tab2.val2) + -> Remote Subquery Scan on all + Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 + Distribute results by H: val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 + -> Hash + Output: xc_groupby_tab2.val, xc_groupby_tab2.val2 + -> Remote Subquery Scan on all + Output: xc_groupby_tab2.val, xc_groupby_tab2.val2 + Distribute results by H: val2 + -> Seq Scan on public.xc_groupby_tab2 + Output: xc_groupby_tab2.val, xc_groupby_tab2.val2 +(26 rows) + +-- aggregates over aggregates +select sum(y) from (select sum(val) y, val2%2 x from xc_groupby_tab1 group by val2) q1 group by x order by 1; + sum +----- + 8 + 17 +(2 rows) + +explain (verbose true, costs false, nodes false) select sum(y) from (select sum(val) y, val2%2 x from xc_groupby_tab1 group by val2) q1 group by x order by 1; + QUERY PLAN +-------------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: sum(y), x + Sort Key: sum(y) + -> Sort + Output: (sum(y)), x + Sort Key: (sum(y)) + -> Finalize HashAggregate + Output: sum(y), x + Group Key: x + -> Remote Subquery Scan on all + Output: x, PARTIAL sum(y) + Distribute results by H: x + -> Partial HashAggregate + Output: 
((xc_groupby_tab1.val2 % 2)), PARTIAL sum((sum(xc_groupby_tab1.val))) + Group Key: (xc_groupby_tab1.val2 % 2) + -> Finalize HashAggregate + Output: sum(xc_groupby_tab1.val), (xc_groupby_tab1.val2 % 2), xc_groupby_tab1.val2 + Group Key: xc_groupby_tab1.val2 + -> Remote Subquery Scan on all + Output: xc_groupby_tab1.val2, PARTIAL sum(xc_groupby_tab1.val) + Distribute results by H: val2 + -> Partial HashAggregate + Output: xc_groupby_tab1.val2, PARTIAL sum(xc_groupby_tab1.val) + Group Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 +(26 rows) + +explain (verbose true, costs false, nodes false) select sum(y) from (select sum(val) y, val2%2 x from xc_groupby_tab1 group by val2) q1 group by x; + QUERY PLAN +-------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: sum(y), x + -> Finalize HashAggregate + Output: sum(y), x + Group Key: x + -> Remote Subquery Scan on all + Output: x, PARTIAL sum(y) + Distribute results by H: x + -> Partial HashAggregate + Output: ((xc_groupby_tab1.val2 % 2)), PARTIAL sum((sum(xc_groupby_tab1.val))) + Group Key: (xc_groupby_tab1.val2 % 2) + -> Finalize HashAggregate + Output: sum(xc_groupby_tab1.val), (xc_groupby_tab1.val2 % 2), xc_groupby_tab1.val2 + Group Key: xc_groupby_tab1.val2 + -> Remote Subquery Scan on all + Output: xc_groupby_tab1.val2, PARTIAL sum(xc_groupby_tab1.val) + Distribute results by H: val2 + -> Partial HashAggregate + Output: xc_groupby_tab1.val2, PARTIAL sum(xc_groupby_tab1.val) + Group Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 +(22 rows) + +-- group by without aggregate +select val2 from xc_groupby_tab1 group by val2 order by val2; + val2 +------ + 1 + 2 + 3 +(3 rows) + +explain (verbose true, costs false, nodes false) select val2 from xc_groupby_tab1 group by val2 order by val2; + QUERY PLAN +------------------------------------------------------------------ + Remote Subquery Scan on all + Output: val2 + Sort Key: xc_groupby_tab1.val2 + -> Finalize GroupAggregate + Output: val2 + Group Key: xc_groupby_tab1.val2 + -> Sort + Output: val2 + Sort Key: xc_groupby_tab1.val2 + -> Remote Subquery Scan on all + Output: val2 + Distribute results by H: val2 + -> Partial HashAggregate + Output: val2 + Group Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: val, val2 +(17 rows) + +explain (verbose true, costs false, nodes false) select val2 from xc_groupby_tab1 group by val2; + QUERY PLAN +------------------------------------------------------------ + Remote Subquery Scan on all + Output: val2 + -> Finalize HashAggregate + Output: val2 + Group Key: xc_groupby_tab1.val2 + -> Remote Subquery Scan on all + Output: val2 + Distribute results by H: val2 + -> Partial HashAggregate + Output: val2 + Group Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: val, val2 +(13 rows) + +select val + val2 from xc_groupby_tab1 group by val + val2 order by 1; + ?column? 
+---------- + 2 + 3 + 4 + 7 + 8 + 9 +(6 rows) + +explain (verbose true, costs false, nodes false) select val + val2 from xc_groupby_tab1 group by val + val2 order by 1; + QUERY PLAN +----------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: (val + val2) + Sort Key: (xc_groupby_tab1.val + xc_groupby_tab1.val2) + -> Finalize GroupAggregate + Output: ((val + val2)) + Group Key: ((xc_groupby_tab1.val + xc_groupby_tab1.val2)) + -> Sort + Output: ((val + val2)) + Sort Key: ((xc_groupby_tab1.val + xc_groupby_tab1.val2)) + -> Remote Subquery Scan on all + Output: (val + val2) + Distribute results by H: (val + val2) + -> Partial HashAggregate + Output: ((val + val2)) + Group Key: (xc_groupby_tab1.val + xc_groupby_tab1.val2) + -> Seq Scan on public.xc_groupby_tab1 + Output: (val + val2) +(17 rows) + +explain (verbose true, costs false, nodes false) select val + val2 from xc_groupby_tab1 group by val + val2; + QUERY PLAN +----------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: (val + val2) + -> Finalize HashAggregate + Output: ((val + val2)) + Group Key: (xc_groupby_tab1.val + xc_groupby_tab1.val2) + -> Remote Subquery Scan on all + Output: (val + val2) + Distribute results by H: (val + val2) + -> Partial HashAggregate + Output: ((val + val2)) + Group Key: (xc_groupby_tab1.val + xc_groupby_tab1.val2) + -> Seq Scan on public.xc_groupby_tab1 + Output: (val + val2) +(13 rows) + +select val + val2, val, val2 from xc_groupby_tab1 group by val, val2 order by 1, 2, 3; + ?column? | val | val2 +----------+-----+------ + 2 | 1 | 1 + 3 | 2 | 1 + 4 | 1 | 3 + 4 | 2 | 2 + 4 | 3 | 1 + 7 | 4 | 3 + 8 | 6 | 2 + 9 | 6 | 3 +(8 rows) + +explain (verbose true, costs false, nodes false) select val + val2, val, val2 from xc_groupby_tab1 group by val, val2 order by 1, 2; + QUERY PLAN +--------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: (val + val2), val, val2 + Sort Key: (xc_groupby_tab1.val + xc_groupby_tab1.val2), xc_groupby_tab1.val + -> Sort + Output: ((val + val2)), val, val2 + Sort Key: ((xc_groupby_tab1.val + xc_groupby_tab1.val2)), xc_groupby_tab1.val + -> HashAggregate + Output: (val + val2), val, val2 + Group Key: xc_groupby_tab1.val, xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: val, val2 +(11 rows) + +explain (verbose true, costs false, nodes false) select val + val2, val, val2 from xc_groupby_tab1 group by val, val2; + QUERY PLAN +-------------------------------------------------------------- + Remote Subquery Scan on all + Output: (val + val2), val, val2 + -> HashAggregate + Output: (val + val2), val, val2 + Group Key: xc_groupby_tab1.val, xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: val, val2 +(7 rows) + +select xc_groupby_tab1.val + xc_groupby_tab2.val2, xc_groupby_tab1.val, xc_groupby_tab2.val2 from xc_groupby_tab1, xc_groupby_tab2 where xc_groupby_tab1.val = xc_groupby_tab2.val group by xc_groupby_tab1.val, xc_groupby_tab2.val2 order by 1, 2, 3; + ?column? 
| val | val2 +----------+-----+------ + 2 | 1 | 1 + 5 | 3 | 2 + 5 | 4 | 1 + 6 | 2 | 4 + 6 | 4 | 2 + 7 | 3 | 4 +(6 rows) + +explain (verbose true, costs false, nodes false) select xc_groupby_tab1.val + xc_groupby_tab2.val2, xc_groupby_tab1.val, xc_groupby_tab2.val2 from xc_groupby_tab1, xc_groupby_tab2 where xc_groupby_tab1.val = xc_groupby_tab2.val group by xc_groupby_tab1.val, xc_groupby_tab2.val2 order by 1, 2, 3; + QUERY PLAN +--------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2), xc_groupby_tab1.val, xc_groupby_tab2.val2 + Sort Key: (xc_groupby_tab1.val + xc_groupby_tab2.val2), xc_groupby_tab1.val, xc_groupby_tab2.val2 + -> Sort + Output: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)), xc_groupby_tab1.val, xc_groupby_tab2.val2 + Sort Key: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)), xc_groupby_tab1.val, xc_groupby_tab2.val2 + -> HashAggregate + Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2), xc_groupby_tab1.val, xc_groupby_tab2.val2 + Group Key: xc_groupby_tab1.val, xc_groupby_tab2.val2 + -> Merge Join + Output: xc_groupby_tab1.val, xc_groupby_tab2.val2 + Merge Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) + -> Sort + Output: xc_groupby_tab1.val + Sort Key: xc_groupby_tab1.val + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val + -> Sort + Output: xc_groupby_tab2.val2, xc_groupby_tab2.val + Sort Key: xc_groupby_tab2.val + -> Seq Scan on public.xc_groupby_tab2 + Output: xc_groupby_tab2.val2, xc_groupby_tab2.val +(22 rows) + +explain (verbose true, costs false, nodes false) select xc_groupby_tab1.val + xc_groupby_tab2.val2, xc_groupby_tab1.val, xc_groupby_tab2.val2 from xc_groupby_tab1, xc_groupby_tab2 where xc_groupby_tab1.val = xc_groupby_tab2.val group by xc_groupby_tab1.val, xc_groupby_tab2.val2; + QUERY PLAN +--------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2), xc_groupby_tab1.val, xc_groupby_tab2.val2 + -> HashAggregate + Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2), xc_groupby_tab1.val, xc_groupby_tab2.val2 + Group Key: xc_groupby_tab1.val, xc_groupby_tab2.val2 + -> Merge Join + Output: xc_groupby_tab1.val, xc_groupby_tab2.val2 + Merge Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) + -> Sort + Output: xc_groupby_tab1.val + Sort Key: xc_groupby_tab1.val + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val + -> Sort + Output: xc_groupby_tab2.val2, xc_groupby_tab2.val + Sort Key: xc_groupby_tab2.val + -> Seq Scan on public.xc_groupby_tab2 + Output: xc_groupby_tab2.val2, xc_groupby_tab2.val +(18 rows) + +select xc_groupby_tab1.val + xc_groupby_tab2.val2 from xc_groupby_tab1, xc_groupby_tab2 where xc_groupby_tab1.val = xc_groupby_tab2.val group by xc_groupby_tab1.val + xc_groupby_tab2.val2 order by 1; + ?column? 
+---------- + 2 + 5 + 6 + 7 +(4 rows) + +explain (verbose true, costs false, nodes false) select xc_groupby_tab1.val + xc_groupby_tab2.val2 from xc_groupby_tab1, xc_groupby_tab2 where xc_groupby_tab1.val = xc_groupby_tab2.val group by xc_groupby_tab1.val + xc_groupby_tab2.val2 order by 1; + QUERY PLAN +----------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2) + Sort Key: (xc_groupby_tab1.val + xc_groupby_tab2.val2) + -> Finalize GroupAggregate + Output: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + Group Key: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + -> Sort + Output: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + Sort Key: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + -> Remote Subquery Scan on all + Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2) + Distribute results by H: (val + val2) + -> Partial HashAggregate + Output: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + Group Key: (xc_groupby_tab1.val + xc_groupby_tab2.val2) + -> Merge Join + Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2) + Merge Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) + -> Sort + Output: xc_groupby_tab1.val + Sort Key: xc_groupby_tab1.val + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val + -> Sort + Output: xc_groupby_tab2.val2, xc_groupby_tab2.val + Sort Key: xc_groupby_tab2.val + -> Seq Scan on public.xc_groupby_tab2 + Output: xc_groupby_tab2.val2, xc_groupby_tab2.val +(28 rows) + +explain (verbose true, costs false, nodes false) select xc_groupby_tab1.val + xc_groupby_tab2.val2 from xc_groupby_tab1, xc_groupby_tab2 where xc_groupby_tab1.val = xc_groupby_tab2.val group by xc_groupby_tab1.val + xc_groupby_tab2.val2; + QUERY PLAN +----------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2) + -> Finalize HashAggregate + Output: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + Group Key: (xc_groupby_tab1.val + xc_groupby_tab2.val2) + -> Remote Subquery Scan on all + Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2) + Distribute results by H: (val + val2) + -> Partial HashAggregate + Output: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + Group Key: (xc_groupby_tab1.val + xc_groupby_tab2.val2) + -> Merge Join + Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2) + Merge Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) + -> Sort + Output: xc_groupby_tab1.val + Sort Key: xc_groupby_tab1.val + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val + -> Sort + Output: xc_groupby_tab2.val2, xc_groupby_tab2.val + Sort Key: xc_groupby_tab2.val + -> Seq Scan on public.xc_groupby_tab2 + Output: xc_groupby_tab2.val2, xc_groupby_tab2.val +(24 rows) + +-- group by with aggregates in expression +select count(*) + sum(val) + avg(val), val2 from xc_groupby_tab1 group by val2 order by 1; + ?column? 
| val2 +---------------------+------ + 11.0000000000000000 | 1 + 14.0000000000000000 | 2 + 17.6666666666666667 | 3 +(3 rows) + +explain (verbose true, costs false, nodes false) select count(*) + sum(val) + avg(val), val2 from xc_groupby_tab1 group by val2 order by 1; + QUERY PLAN +--------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: (((count(*) + sum(val)))::numeric + avg(val)), val2 + Sort Key: (((count(*) + sum(xc_groupby_tab1.val)))::numeric + avg(xc_groupby_tab1.val)) + -> Sort + Output: ((((count(*) + sum(val)))::numeric + avg(val))), val2 + Sort Key: ((((count(*) + sum(xc_groupby_tab1.val)))::numeric + avg(xc_groupby_tab1.val))) + -> Finalize HashAggregate + Output: (((count(*) + sum(val)))::numeric + avg(val)), val2 + Group Key: xc_groupby_tab1.val2 + -> Remote Subquery Scan on all + Output: val2, PARTIAL count(*), PARTIAL sum(val), PARTIAL avg(val) + Distribute results by H: val2 + -> Partial HashAggregate + Output: val2, PARTIAL count(*), PARTIAL sum(val), PARTIAL avg(val) + Group Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: val, val2 +(17 rows) + +explain (verbose true, costs false, nodes false) select count(*) + sum(val) + avg(val), val2 from xc_groupby_tab1 group by val2; + QUERY PLAN +---------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: (((count(*) + sum(val)))::numeric + avg(val)), val2 + -> Finalize HashAggregate + Output: (((count(*) + sum(val)))::numeric + avg(val)), val2 + Group Key: xc_groupby_tab1.val2 + -> Remote Subquery Scan on all + Output: val2, PARTIAL count(*), PARTIAL sum(val), PARTIAL avg(val) + Distribute results by H: val2 + -> Partial HashAggregate + Output: val2, PARTIAL count(*), PARTIAL sum(val), PARTIAL avg(val) + Group Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: val, val2 +(13 rows) + +-- group by with expressions in group by clause +select sum(val), avg(val), 2 * val2 from xc_groupby_tab1 group by 2 * val2 order by 1; + sum | avg | ?column? 
+-----+--------------------+---------- + 6 | 2.0000000000000000 | 2 + 8 | 4.0000000000000000 | 4 + 11 | 3.6666666666666667 | 6 +(3 rows) + +explain (verbose true, costs false, nodes false) select sum(val), avg(val), 2 * val2 from xc_groupby_tab1 group by 2 * val2 order by 1; + QUERY PLAN +------------------------------------------------------------------------------------ + Remote Subquery Scan on all + Output: sum(val), avg(val), (2 * val2) + Sort Key: sum(xc_groupby_tab1.val) + -> Sort + Output: (sum(val)), (avg(val)), ((2 * val2)) + Sort Key: (sum(xc_groupby_tab1.val)) + -> Finalize HashAggregate + Output: sum(val), avg(val), ((2 * val2)) + Group Key: (2 * xc_groupby_tab1.val2) + -> Remote Subquery Scan on all + Output: (2 * val2), PARTIAL sum(val), PARTIAL avg(val) + Distribute results by H: (2 * val2) + -> Partial HashAggregate + Output: ((2 * val2)), PARTIAL sum(val), PARTIAL avg(val) + Group Key: (2 * xc_groupby_tab1.val2) + -> Seq Scan on public.xc_groupby_tab1 + Output: (2 * val2), val +(17 rows) + +explain (verbose true, costs false, nodes false) select sum(val), avg(val), 2 * val2 from xc_groupby_tab1 group by 2 * val2; + QUERY PLAN +------------------------------------------------------------------------------ + Remote Subquery Scan on all + Output: sum(val), avg(val), (2 * val2) + -> Finalize HashAggregate + Output: sum(val), avg(val), ((2 * val2)) + Group Key: (2 * xc_groupby_tab1.val2) + -> Remote Subquery Scan on all + Output: (2 * val2), PARTIAL sum(val), PARTIAL avg(val) + Distribute results by H: (2 * val2) + -> Partial HashAggregate + Output: ((2 * val2)), PARTIAL sum(val), PARTIAL avg(val) + Group Key: (2 * xc_groupby_tab1.val2) + -> Seq Scan on public.xc_groupby_tab1 + Output: (2 * val2), val +(13 rows) + +drop table xc_groupby_tab1; +drop table xc_groupby_tab2; +-- some tests involving nulls, characters, float type etc. 
+create table xc_groupby_def(a int, b varchar(25)); +insert into xc_groupby_def VALUES (NULL, NULL); +insert into xc_groupby_def VALUES (1, NULL); +insert into xc_groupby_def VALUES (NULL, 'One'); +insert into xc_groupby_def VALUES (2, 'Two'); +insert into xc_groupby_def VALUES (2, 'Two'); +insert into xc_groupby_def VALUES (3, 'Three'); +insert into xc_groupby_def VALUES (4, 'Three'); +insert into xc_groupby_def VALUES (5, 'Three'); +insert into xc_groupby_def VALUES (6, 'Two'); +insert into xc_groupby_def VALUES (7, NULL); +insert into xc_groupby_def VALUES (8, 'Two'); +insert into xc_groupby_def VALUES (9, 'Three'); +insert into xc_groupby_def VALUES (10, 'Three'); +select a,count(a) from xc_groupby_def group by a order by a; + a | count +----+------- + 1 | 1 + 2 | 2 + 3 | 1 + 4 | 1 + 5 | 1 + 6 | 1 + 7 | 1 + 8 | 1 + 9 | 1 + 10 | 1 + | 0 +(11 rows) + +explain (verbose true, costs false, nodes false) select a,count(a) from xc_groupby_def group by a order by a; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all + Output: a, count(a) + Sort Key: xc_groupby_def.a + -> Sort + Output: a, (count(a)) + Sort Key: xc_groupby_def.a + -> HashAggregate + Output: a, count(a) + Group Key: xc_groupby_def.a + -> Seq Scan on public.xc_groupby_def + Output: a, b +(11 rows) + +select avg(a) from xc_groupby_def group by a order by 1; + avg +------------------------ + 1.00000000000000000000 + 2.0000000000000000 + 3.0000000000000000 + 4.0000000000000000 + 5.0000000000000000 + 6.0000000000000000 + 7.0000000000000000 + 8.0000000000000000 + 9.0000000000000000 + 10.0000000000000000 + +(11 rows) + +explain (verbose true, costs false, nodes false) select avg(a) from xc_groupby_def group by a order by 1; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all + Output: avg(a), a + Sort Key: avg(xc_groupby_def.a) + -> Sort + Output: (avg(a)), a + Sort Key: (avg(xc_groupby_def.a)) + -> HashAggregate + Output: avg(a), a + Group Key: xc_groupby_def.a + -> Seq Scan on public.xc_groupby_def + Output: a, b +(11 rows) + +explain (verbose true, costs false, nodes false) select avg(a) from xc_groupby_def group by a; + QUERY PLAN +----------------------------------------------- + Remote Subquery Scan on all + Output: avg(a), a + -> HashAggregate + Output: avg(a), a + Group Key: xc_groupby_def.a + -> Seq Scan on public.xc_groupby_def + Output: a, b +(7 rows) + +select avg(a) from xc_groupby_def group by b order by 1; + avg +-------------------- + 4.0000000000000000 + 4.5000000000000000 + 6.2000000000000000 + +(4 rows) + +explain (verbose true, costs false, nodes false) select avg(a) from xc_groupby_def group by b order by 1; + QUERY PLAN +----------------------------------------------------------------- + Remote Subquery Scan on all + Output: avg(a), b + Sort Key: avg(xc_groupby_def.a) + -> Sort + Output: (avg(a)), b + Sort Key: (avg(xc_groupby_def.a)) + -> Finalize HashAggregate + Output: avg(a), b + Group Key: xc_groupby_def.b + -> Remote Subquery Scan on all + Output: b, PARTIAL avg(a) + Distribute results by H: b + -> Partial HashAggregate + Output: b, PARTIAL avg(a) + Group Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: a, b +(17 rows) + +explain (verbose true, costs false, nodes false) select avg(a) from xc_groupby_def group by b; + QUERY PLAN +----------------------------------------------------------- + Remote Subquery Scan on all + Output: avg(a), b + -> Finalize HashAggregate + Output: avg(a), b + Group Key: 
xc_groupby_def.b + -> Remote Subquery Scan on all + Output: b, PARTIAL avg(a) + Distribute results by H: b + -> Partial HashAggregate + Output: b, PARTIAL avg(a) + Group Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: a, b +(13 rows) + +select sum(a) from xc_groupby_def group by b order by 1; + sum +----- + 8 + 18 + 31 + +(4 rows) + +explain (verbose true, costs false, nodes false) select sum(a) from xc_groupby_def group by b order by 1; + QUERY PLAN +----------------------------------------------------------------- + Remote Subquery Scan on all + Output: sum(a), b + Sort Key: sum(xc_groupby_def.a) + -> Sort + Output: (sum(a)), b + Sort Key: (sum(xc_groupby_def.a)) + -> Finalize HashAggregate + Output: sum(a), b + Group Key: xc_groupby_def.b + -> Remote Subquery Scan on all + Output: b, PARTIAL sum(a) + Distribute results by H: b + -> Partial HashAggregate + Output: b, PARTIAL sum(a) + Group Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: a, b +(17 rows) + +explain (verbose true, costs false, nodes false) select sum(a) from xc_groupby_def group by b; + QUERY PLAN +----------------------------------------------------------- + Remote Subquery Scan on all + Output: sum(a), b + -> Finalize HashAggregate + Output: sum(a), b + Group Key: xc_groupby_def.b + -> Remote Subquery Scan on all + Output: b, PARTIAL sum(a) + Distribute results by H: b + -> Partial HashAggregate + Output: b, PARTIAL sum(a) + Group Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: a, b +(13 rows) + +select count(*) from xc_groupby_def group by b order by 1; + count +------- + 1 + 3 + 4 + 5 +(4 rows) + +explain (verbose true, costs false, nodes false) select count(*) from xc_groupby_def group by b order by 1; + QUERY PLAN +----------------------------------------------------------------- + Remote Subquery Scan on all + Output: count(*), b + Sort Key: count(*) + -> Sort + Output: (count(*)), b + Sort Key: (count(*)) + -> Finalize HashAggregate + Output: count(*), b + Group Key: xc_groupby_def.b + -> Remote Subquery Scan on all + Output: b, PARTIAL count(*) + Distribute results by H: b + -> Partial HashAggregate + Output: b, PARTIAL count(*) + Group Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: a, b +(17 rows) + +explain (verbose true, costs false, nodes false) select count(*) from xc_groupby_def group by b; + QUERY PLAN +----------------------------------------------------------- + Remote Subquery Scan on all + Output: count(*), b + -> Finalize HashAggregate + Output: count(*), b + Group Key: xc_groupby_def.b + -> Remote Subquery Scan on all + Output: b, PARTIAL count(*) + Distribute results by H: b + -> Partial HashAggregate + Output: b, PARTIAL count(*) + Group Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: a, b +(13 rows) + +select count(*) from xc_groupby_def where a is not null group by a order by 1; + count +------- + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 2 +(10 rows) + +explain (verbose true, costs false, nodes false) select count(*) from xc_groupby_def where a is not null group by a order by 1; + QUERY PLAN +------------------------------------------------------------ + Remote Subquery Scan on all + Output: count(*), a + Sort Key: count(*) + -> Sort + Output: (count(*)), a + Sort Key: (count(*)) + -> HashAggregate + Output: count(*), a + Group Key: xc_groupby_def.a + -> Seq Scan on public.xc_groupby_def + Output: a, b + Filter: (xc_groupby_def.a IS NOT NULL) +(12 rows) + +explain (verbose true, costs 
false, nodes false) select count(*) from xc_groupby_def where a is not null group by a; + QUERY PLAN +------------------------------------------------------ + Remote Subquery Scan on all + Output: count(*), a + -> HashAggregate + Output: count(*), a + Group Key: xc_groupby_def.a + -> Seq Scan on public.xc_groupby_def + Output: a, b + Filter: (xc_groupby_def.a IS NOT NULL) +(8 rows) + +select * from (select b from xc_groupby_def group by b) q order by q.b; + b +------- + One + Three + Two + +(4 rows) + +explain (verbose true, costs false, nodes false) select * from (select b from xc_groupby_def group by b) q order by q.b; + QUERY PLAN +---------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: b + Sort Key: b + -> Finalize GroupAggregate + Output: xc_groupby_def.b + Group Key: xc_groupby_def.b + -> Sort + Output: xc_groupby_def.b + Sort Key: xc_groupby_def.b + -> Remote Subquery Scan on all + Output: xc_groupby_def.b + Distribute results by H: b + -> Partial HashAggregate + Output: xc_groupby_def.b + Group Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: xc_groupby_def.a, xc_groupby_def.b +(17 rows) + +select * from (select b,count(b) from xc_groupby_def group by b) q order by q.b; + b | count +-------+------- + One | 1 + Three | 5 + Two | 4 + | 0 +(4 rows) + +explain (verbose true, costs false, nodes false) select * from (select b,count(b) from xc_groupby_def group by b) q order by q.b; + QUERY PLAN +------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: b, count + Sort Key: b + -> Finalize GroupAggregate + Output: xc_groupby_def.b, count(xc_groupby_def.b) + Group Key: xc_groupby_def.b + -> Sort + Output: xc_groupby_def.b, (PARTIAL count(xc_groupby_def.b)) + Sort Key: xc_groupby_def.b + -> Remote Subquery Scan on all + Output: xc_groupby_def.b, PARTIAL count(xc_groupby_def.b) + Distribute results by H: b + -> Partial HashAggregate + Output: xc_groupby_def.b, PARTIAL count(xc_groupby_def.b) + Group Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: xc_groupby_def.a, xc_groupby_def.b +(17 rows) + +select count(*) from xc_groupby_def where b is null group by b order by 1; + count +------- + 3 +(1 row) + +explain (verbose true, costs false, nodes false) select count(*) from xc_groupby_def where b is null group by b; + QUERY PLAN +-------------------------------------------------------------------- + Remote Subquery Scan on all + Output: count(*), b + -> Finalize GroupAggregate + Output: count(*), b + Group Key: xc_groupby_def.b + -> Sort + Output: b, (PARTIAL count(*)) + Sort Key: xc_groupby_def.b + -> Remote Subquery Scan on all + Output: b, PARTIAL count(*) + Distribute results by H: b + -> Partial HashAggregate + Output: b, PARTIAL count(*) + Group Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: a, b + Filter: (xc_groupby_def.b IS NULL) +(17 rows) + +create table xc_groupby_g(a int, b float, c numeric); +insert into xc_groupby_g values(1,2.1,3.2); +insert into xc_groupby_g values(1,2.1,3.2); +insert into xc_groupby_g values(2,2.3,5.2); +select sum(a) from xc_groupby_g group by a; + sum +----- + 2 + 2 +(2 rows) + +explain (verbose true, costs false, nodes false) select sum(a) from xc_groupby_g group by a; + QUERY PLAN +--------------------------------------------- + Remote Subquery Scan on all + Output: sum(a), a + -> HashAggregate + Output: sum(a), a + Group Key: xc_groupby_g.a + -> Seq Scan on 
public.xc_groupby_g + Output: a, b, c +(7 rows) + +select sum(b) from xc_groupby_g group by b order by 1; + sum +----- + 2.3 + 4.2 +(2 rows) + +explain (verbose true, costs false, nodes false) select sum(b) from xc_groupby_g group by b order by 1; + QUERY PLAN +--------------------------------------------------------------- + Remote Subquery Scan on all + Output: sum(b), b + Sort Key: sum(xc_groupby_g.b) + -> Sort + Output: (sum(b)), b + Sort Key: (sum(xc_groupby_g.b)) + -> Finalize HashAggregate + Output: sum(b), b + Group Key: xc_groupby_g.b + -> Remote Subquery Scan on all + Output: b, PARTIAL sum(b) + Distribute results by H: b + -> Partial HashAggregate + Output: b, PARTIAL sum(b) + Group Key: xc_groupby_g.b + -> Seq Scan on public.xc_groupby_g + Output: a, b, c +(17 rows) + +explain (verbose true, costs false, nodes false) select sum(b) from xc_groupby_g group by b; + QUERY PLAN +--------------------------------------------------------- + Remote Subquery Scan on all + Output: sum(b), b + -> Finalize HashAggregate + Output: sum(b), b + Group Key: xc_groupby_g.b + -> Remote Subquery Scan on all + Output: b, PARTIAL sum(b) + Distribute results by H: b + -> Partial HashAggregate + Output: b, PARTIAL sum(b) + Group Key: xc_groupby_g.b + -> Seq Scan on public.xc_groupby_g + Output: a, b, c +(13 rows) + +select sum(c) from xc_groupby_g group by b order by 1; + sum +----- + 5.2 + 6.4 +(2 rows) + +explain (verbose true, costs false, nodes false) select sum(c) from xc_groupby_g group by b order by 1; + QUERY PLAN +--------------------------------------------------------------- + Remote Subquery Scan on all + Output: sum(c), b + Sort Key: sum(xc_groupby_g.c) + -> Sort + Output: (sum(c)), b + Sort Key: (sum(xc_groupby_g.c)) + -> Finalize HashAggregate + Output: sum(c), b + Group Key: xc_groupby_g.b + -> Remote Subquery Scan on all + Output: b, PARTIAL sum(c) + Distribute results by H: b + -> Partial HashAggregate + Output: b, PARTIAL sum(c) + Group Key: xc_groupby_g.b + -> Seq Scan on public.xc_groupby_g + Output: a, b, c +(17 rows) + +explain (verbose true, costs false, nodes false) select sum(c) from xc_groupby_g group by b; + QUERY PLAN +--------------------------------------------------------- + Remote Subquery Scan on all + Output: sum(c), b + -> Finalize HashAggregate + Output: sum(c), b + Group Key: xc_groupby_g.b + -> Remote Subquery Scan on all + Output: b, PARTIAL sum(c) + Distribute results by H: b + -> Partial HashAggregate + Output: b, PARTIAL sum(c) + Group Key: xc_groupby_g.b + -> Seq Scan on public.xc_groupby_g + Output: a, b, c +(13 rows) + +select avg(a) from xc_groupby_g group by b order by 1; + avg +------------------------ + 1.00000000000000000000 + 2.0000000000000000 +(2 rows) + +explain (verbose true, costs false, nodes false) select avg(a) from xc_groupby_g group by b order by 1; + QUERY PLAN +--------------------------------------------------------------- + Remote Subquery Scan on all + Output: avg(a), b + Sort Key: avg(xc_groupby_g.a) + -> Sort + Output: (avg(a)), b + Sort Key: (avg(xc_groupby_g.a)) + -> Finalize HashAggregate + Output: avg(a), b + Group Key: xc_groupby_g.b + -> Remote Subquery Scan on all + Output: b, PARTIAL avg(a) + Distribute results by H: b + -> Partial HashAggregate + Output: b, PARTIAL avg(a) + Group Key: xc_groupby_g.b + -> Seq Scan on public.xc_groupby_g + Output: a, b, c +(17 rows) + +explain (verbose true, costs false, nodes false) select avg(a) from xc_groupby_g group by b; + QUERY PLAN 
+--------------------------------------------------------- + Remote Subquery Scan on all + Output: avg(a), b + -> Finalize HashAggregate + Output: avg(a), b + Group Key: xc_groupby_g.b + -> Remote Subquery Scan on all + Output: b, PARTIAL avg(a) + Distribute results by H: b + -> Partial HashAggregate + Output: b, PARTIAL avg(a) + Group Key: xc_groupby_g.b + -> Seq Scan on public.xc_groupby_g + Output: a, b, c +(13 rows) + +select avg(b) from xc_groupby_g group by c order by 1; + avg +----- + 2.1 + 2.3 +(2 rows) + +explain (verbose true, costs false, nodes false) select avg(b) from xc_groupby_g group by c order by 1; + QUERY PLAN +--------------------------------------------------------------- + Remote Subquery Scan on all + Output: avg(b), c + Sort Key: avg(xc_groupby_g.b) + -> Sort + Output: (avg(b)), c + Sort Key: (avg(xc_groupby_g.b)) + -> Finalize HashAggregate + Output: avg(b), c + Group Key: xc_groupby_g.c + -> Remote Subquery Scan on all + Output: c, PARTIAL avg(b) + Distribute results by H: c + -> Partial HashAggregate + Output: c, PARTIAL avg(b) + Group Key: xc_groupby_g.c + -> Seq Scan on public.xc_groupby_g + Output: a, b, c +(17 rows) + +explain (verbose true, costs false, nodes false) select avg(b) from xc_groupby_g group by c; + QUERY PLAN +--------------------------------------------------------- + Remote Subquery Scan on all + Output: avg(b), c + -> Finalize HashAggregate + Output: avg(b), c + Group Key: xc_groupby_g.c + -> Remote Subquery Scan on all + Output: c, PARTIAL avg(b) + Distribute results by H: c + -> Partial HashAggregate + Output: c, PARTIAL avg(b) + Group Key: xc_groupby_g.c + -> Seq Scan on public.xc_groupby_g + Output: a, b, c +(13 rows) + +select avg(c) from xc_groupby_g group by c order by 1; + avg +-------------------- + 3.2000000000000000 + 5.2000000000000000 +(2 rows) + +explain (verbose true, costs false, nodes false) select avg(c) from xc_groupby_g group by c order by 1; + QUERY PLAN +--------------------------------------------------------------- + Remote Subquery Scan on all + Output: avg(c), c + Sort Key: avg(xc_groupby_g.c) + -> Sort + Output: (avg(c)), c + Sort Key: (avg(xc_groupby_g.c)) + -> Finalize HashAggregate + Output: avg(c), c + Group Key: xc_groupby_g.c + -> Remote Subquery Scan on all + Output: c, PARTIAL avg(c) + Distribute results by H: c + -> Partial HashAggregate + Output: c, PARTIAL avg(c) + Group Key: xc_groupby_g.c + -> Seq Scan on public.xc_groupby_g + Output: a, b, c +(17 rows) + +drop table xc_groupby_def; +drop table xc_groupby_g; +-- Combination 2, enable_hashagg on and replicated tables. +-- repeat the same tests for replicated tables +-- create required tables and fill them with data +create table xc_groupby_tab1 (val int, val2 int) distribute by replication; +create table xc_groupby_tab2 (val int, val2 int) distribute by replication; +insert into xc_groupby_tab1 values (1, 1), (2, 1), (3, 1), (2, 2), (6, 2), (4, 3), (1, 3), (6, 3); +insert into xc_groupby_tab2 values (1, 1), (4, 1), (8, 1), (2, 4), (9, 4), (3, 4), (4, 2), (5, 2), (3, 2); +select count(*), sum(val), avg(val), sum(val)::float8/count(*), val2 from xc_groupby_tab1 group by val2 order by 1, 2, 3; + count | sum | avg | ?column? 
| val2 +-------+-----+--------------------+------------------+------ + 2 | 8 | 4.0000000000000000 | 4 | 2 + 3 | 6 | 2.0000000000000000 | 2 | 1 + 3 | 11 | 3.6666666666666667 | 3.66666666666667 | 3 +(3 rows) + +explain (verbose true, costs false, nodes false) select count(*), sum(val), avg(val), sum(val)::float8/count(*), val2 from xc_groupby_tab1 group by val2 order by 1, 2, 3; + QUERY PLAN +--------------------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: count(*), sum(val), avg(val), ((sum(val))::double precision / (count(*))::double precision), val2 + -> Sort + Output: (count(*)), (sum(val)), (avg(val)), (((sum(val))::double precision / (count(*))::double precision)), val2 + Sort Key: (count(*)), (sum(xc_groupby_tab1.val)), (avg(xc_groupby_tab1.val)) + -> HashAggregate + Output: count(*), sum(val), avg(val), ((sum(val))::double precision / (count(*))::double precision), val2 + Group Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: val, val2 +(10 rows) + +explain (verbose true, costs false, nodes false) select count(*), sum(val), avg(val), sum(val)::float8/count(*), val2 from xc_groupby_tab1 group by val2; + QUERY PLAN +------------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: count(*), sum(val), avg(val), ((sum(val))::double precision / (count(*))::double precision), val2 + -> HashAggregate + Output: count(*), sum(val), avg(val), ((sum(val))::double precision / (count(*))::double precision), val2 + Group Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: val, val2 +(7 rows) + +-- joins and group by +select * from (select count(*), sum(xc_groupby_tab1.val * xc_groupby_tab2.val), avg(xc_groupby_tab1.val*xc_groupby_tab2.val), sum(xc_groupby_tab1.val*xc_groupby_tab2.val)::float8/count(*), xc_groupby_tab1.val2 c1, xc_groupby_tab2.val2 c2 from xc_groupby_tab1 full outer join xc_groupby_tab2 on xc_groupby_tab1.val2 = xc_groupby_tab2.val2 group by xc_groupby_tab1.val2, xc_groupby_tab2.val2) q order by q.c1, q.c2; + count | sum | avg | ?column? 
| c1 | c2 +-------+-----+---------------------+------------------+----+---- + 9 | 78 | 8.6666666666666667 | 8.66666666666667 | 1 | 1 + 6 | 96 | 16.0000000000000000 | 16 | 2 | 2 + 3 | | | | 3 | + 3 | | | | | 4 +(4 rows) + +explain (verbose true, costs false, nodes false) select * from (select count(*), sum(xc_groupby_tab1.val * xc_groupby_tab2.val), avg(xc_groupby_tab1.val*xc_groupby_tab2.val), sum(xc_groupby_tab1.val*xc_groupby_tab2.val)::float8/count(*), xc_groupby_tab1.val2 c1, xc_groupby_tab2.val2 c2 from xc_groupby_tab1 full outer join xc_groupby_tab2 on xc_groupby_tab1.val2 = xc_groupby_tab2.val2 group by xc_groupby_tab1.val2, xc_groupby_tab2.val2) q order by q.c1, q.c2; + QUERY PLAN +--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: count, sum, avg, "?column?", c1, c2 + -> GroupAggregate + Output: count(*), sum((xc_groupby_tab1.val * xc_groupby_tab2.val)), avg((xc_groupby_tab1.val * xc_groupby_tab2.val)), ((sum((xc_groupby_tab1.val * xc_groupby_tab2.val)))::double precision / (count(*))::double precision), xc_groupby_tab1.val2, xc_groupby_tab2.val2 + Group Key: xc_groupby_tab1.val2, xc_groupby_tab2.val2 + -> Sort + Output: xc_groupby_tab1.val2, xc_groupby_tab2.val2, xc_groupby_tab1.val, xc_groupby_tab2.val + Sort Key: xc_groupby_tab1.val2, xc_groupby_tab2.val2 + -> Merge Full Join + Output: xc_groupby_tab1.val2, xc_groupby_tab2.val2, xc_groupby_tab1.val, xc_groupby_tab2.val + Merge Cond: (xc_groupby_tab1.val2 = xc_groupby_tab2.val2) + -> Sort + Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 + Sort Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 + -> Sort + Output: xc_groupby_tab2.val, xc_groupby_tab2.val2 + Sort Key: xc_groupby_tab2.val2 + -> Seq Scan on public.xc_groupby_tab2 + Output: xc_groupby_tab2.val, xc_groupby_tab2.val2 +(21 rows) + +-- aggregates over aggregates +select * from (select sum(y) sum from (select sum(val) y, val2%2 x from xc_groupby_tab1 group by val2) q1 group by x) q order by q.sum; + sum +----- + 8 + 17 +(2 rows) + +explain (verbose true, costs false, nodes false) select * from (select sum(y) sum from (select sum(val) y, val2%2 x from xc_groupby_tab1 group by val2) q1 group by x) q order by q.sum; + QUERY PLAN +-------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: q.sum + -> Sort + Output: q.sum + Sort Key: q.sum + -> Subquery Scan on q + Output: q.sum + -> HashAggregate + Output: sum((sum(xc_groupby_tab1.val))), ((xc_groupby_tab1.val2 % 2)) + Group Key: (xc_groupby_tab1.val2 % 2) + -> HashAggregate + Output: sum(xc_groupby_tab1.val), (xc_groupby_tab1.val2 % 2), xc_groupby_tab1.val2 + Group Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 +(15 rows) + +-- group by without aggregate +select val2 from xc_groupby_tab1 group by val2 order by 1; + val2 +------ + 1 + 2 + 3 +(3 rows) + +explain (verbose true, costs false, nodes false) select val2 from xc_groupby_tab1 group by val2 order by 1; + QUERY PLAN +------------------------------------------------------ + Remote Subquery Scan on all + Output: val2 + -> Sort + Output: val2 + Sort Key: xc_groupby_tab1.val2 + -> 
HashAggregate + Output: val2 + Group Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: val, val2 +(10 rows) + +explain (verbose true, costs false, nodes false) select val2 from xc_groupby_tab1 group by val2; + QUERY PLAN +------------------------------------------------ + Remote Subquery Scan on all + Output: val2 + -> HashAggregate + Output: val2 + Group Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: val, val2 +(7 rows) + +select * from (select val + val2 sum from xc_groupby_tab1 group by val + val2) q order by q.sum; + sum +----- + 2 + 3 + 4 + 7 + 8 + 9 +(6 rows) + +explain (verbose true, costs false, nodes false) select * from (select val + val2 sum from xc_groupby_tab1 group by val + val2) q order by q.sum; + QUERY PLAN +-------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: sum + -> Sort + Output: ((xc_groupby_tab1.val + xc_groupby_tab1.val2)) + Sort Key: ((xc_groupby_tab1.val + xc_groupby_tab1.val2)) + -> HashAggregate + Output: ((xc_groupby_tab1.val + xc_groupby_tab1.val2)) + Group Key: (xc_groupby_tab1.val + xc_groupby_tab1.val2) + -> Seq Scan on public.xc_groupby_tab1 + Output: (xc_groupby_tab1.val + xc_groupby_tab1.val2) +(10 rows) + +select * from (select val + val2, val, val2 from xc_groupby_tab1 group by val, val2) q order by q.val, q.val2; + ?column? | val | val2 +----------+-----+------ + 2 | 1 | 1 + 4 | 1 | 3 + 3 | 2 | 1 + 4 | 2 | 2 + 4 | 3 | 1 + 7 | 4 | 3 + 8 | 6 | 2 + 9 | 6 | 3 +(8 rows) + +explain (verbose true, costs false, nodes false) select * from (select val + val2, val, val2 from xc_groupby_tab1 group by val, val2) q order by q.val, q.val2; + QUERY PLAN +--------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: "?column?", val, val2 + -> Sort + Output: ((xc_groupby_tab1.val + xc_groupby_tab1.val2)), xc_groupby_tab1.val, xc_groupby_tab1.val2 + Sort Key: xc_groupby_tab1.val, xc_groupby_tab1.val2 + -> HashAggregate + Output: (xc_groupby_tab1.val + xc_groupby_tab1.val2), xc_groupby_tab1.val, xc_groupby_tab1.val2 + Group Key: xc_groupby_tab1.val, xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 +(10 rows) + +select * from (select xc_groupby_tab1.val + xc_groupby_tab2.val2, xc_groupby_tab1.val, xc_groupby_tab2.val2 from xc_groupby_tab1, xc_groupby_tab2 where xc_groupby_tab1.val = xc_groupby_tab2.val group by xc_groupby_tab1.val, xc_groupby_tab2.val2) q order by q.val, q.val2; + ?column? 
| val | val2 +----------+-----+------ + 2 | 1 | 1 + 6 | 2 | 4 + 5 | 3 | 2 + 7 | 3 | 4 + 5 | 4 | 1 + 6 | 4 | 2 +(6 rows) + +explain (verbose true, costs false, nodes false) select * from (select xc_groupby_tab1.val + xc_groupby_tab2.val2, xc_groupby_tab1.val, xc_groupby_tab2.val2 from xc_groupby_tab1, xc_groupby_tab2 where xc_groupby_tab1.val = xc_groupby_tab2.val group by xc_groupby_tab1.val, xc_groupby_tab2.val2) q order by q.val, q.val2; + QUERY PLAN +--------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: "?column?", val, val2 + -> Group + Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2), xc_groupby_tab1.val, xc_groupby_tab2.val2 + Group Key: xc_groupby_tab1.val, xc_groupby_tab2.val2 + -> Sort + Output: xc_groupby_tab1.val, xc_groupby_tab2.val2 + Sort Key: xc_groupby_tab1.val, xc_groupby_tab2.val2 + -> Merge Join + Output: xc_groupby_tab1.val, xc_groupby_tab2.val2 + Merge Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) + -> Sort + Output: xc_groupby_tab1.val + Sort Key: xc_groupby_tab1.val + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val + -> Sort + Output: xc_groupby_tab2.val2, xc_groupby_tab2.val + Sort Key: xc_groupby_tab2.val + -> Seq Scan on public.xc_groupby_tab2 + Output: xc_groupby_tab2.val2, xc_groupby_tab2.val +(21 rows) + +select * from (select xc_groupby_tab1.val + xc_groupby_tab2.val2 sum from xc_groupby_tab1, xc_groupby_tab2 where xc_groupby_tab1.val = xc_groupby_tab2.val group by xc_groupby_tab1.val + xc_groupby_tab2.val2) q order by q.sum; + sum +----- + 2 + 5 + 6 + 7 +(4 rows) + +explain (verbose true, costs false, nodes false) select * from (select xc_groupby_tab1.val + xc_groupby_tab2.val2 sum from xc_groupby_tab1, xc_groupby_tab2 where xc_groupby_tab1.val = xc_groupby_tab2.val group by xc_groupby_tab1.val + xc_groupby_tab2.val2) q order by q.sum; + QUERY PLAN +----------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: sum + -> Group + Output: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + Group Key: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + -> Sort + Output: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + Sort Key: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + -> Merge Join + Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2) + Merge Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) + -> Sort + Output: xc_groupby_tab1.val + Sort Key: xc_groupby_tab1.val + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val + -> Sort + Output: xc_groupby_tab2.val2, xc_groupby_tab2.val + Sort Key: xc_groupby_tab2.val + -> Seq Scan on public.xc_groupby_tab2 + Output: xc_groupby_tab2.val2, xc_groupby_tab2.val +(21 rows) + +-- group by with aggregates in expression +select count(*) + sum(val) + avg(val), val2 from xc_groupby_tab1 group by val2 order by val2; + ?column? 
| val2 +---------------------+------ + 11.0000000000000000 | 1 + 14.0000000000000000 | 2 + 17.6666666666666667 | 3 +(3 rows) + +explain (verbose true, costs false, nodes false) select count(*) + sum(val) + avg(val), val2 from xc_groupby_tab1 group by val2 order by val2; + QUERY PLAN +--------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: (((count(*) + sum(val)))::numeric + avg(val)), val2 + -> Sort + Output: ((((count(*) + sum(val)))::numeric + avg(val))), val2 + Sort Key: xc_groupby_tab1.val2 + -> HashAggregate + Output: (((count(*) + sum(val)))::numeric + avg(val)), val2 + Group Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: val, val2 +(10 rows) + +explain (verbose true, costs false, nodes false) select count(*) + sum(val) + avg(val), val2 from xc_groupby_tab1 group by val2; + QUERY PLAN +--------------------------------------------------------------------- + Remote Subquery Scan on all + Output: (((count(*) + sum(val)))::numeric + avg(val)), val2 + -> HashAggregate + Output: (((count(*) + sum(val)))::numeric + avg(val)), val2 + Group Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: val, val2 +(7 rows) + +-- group by with expressions in group by clause +select sum(val), avg(val), 2 * val2 from xc_groupby_tab1 group by 2 * val2 order by 2 * val2; + sum | avg | ?column? +-----+--------------------+---------- + 6 | 2.0000000000000000 | 2 + 8 | 4.0000000000000000 | 4 + 11 | 3.6666666666666667 | 6 +(3 rows) + +explain (verbose true, costs false, nodes false) select sum(val), avg(val), 2 * val2 from xc_groupby_tab1 group by 2 * val2 order by 2 * val2; + QUERY PLAN +-------------------------------------------------------- + Remote Subquery Scan on all + Output: sum(val), avg(val), (2 * val2) + -> Sort + Output: (sum(val)), (avg(val)), ((2 * val2)) + Sort Key: ((2 * xc_groupby_tab1.val2)) + -> HashAggregate + Output: sum(val), avg(val), ((2 * val2)) + Group Key: (2 * xc_groupby_tab1.val2) + -> Seq Scan on public.xc_groupby_tab1 + Output: (2 * val2), val +(10 rows) + +explain (verbose true, costs false, nodes false) select sum(val), avg(val), 2 * val2 from xc_groupby_tab1 group by 2 * val2; + QUERY PLAN +-------------------------------------------------- + Remote Subquery Scan on all + Output: sum(val), avg(val), (2 * val2) + -> HashAggregate + Output: sum(val), avg(val), ((2 * val2)) + Group Key: (2 * xc_groupby_tab1.val2) + -> Seq Scan on public.xc_groupby_tab1 + Output: (2 * val2), val +(7 rows) + +drop table xc_groupby_tab1; +drop table xc_groupby_tab2; +-- some tests involving nulls, characters, float type etc. 
+create table xc_groupby_def(a int, b varchar(25)) distribute by replication; +insert into xc_groupby_def VALUES (NULL, NULL); +insert into xc_groupby_def VALUES (1, NULL); +insert into xc_groupby_def VALUES (NULL, 'One'); +insert into xc_groupby_def VALUES (2, 'Two'); +insert into xc_groupby_def VALUES (2, 'Two'); +insert into xc_groupby_def VALUES (3, 'Three'); +insert into xc_groupby_def VALUES (4, 'Three'); +insert into xc_groupby_def VALUES (5, 'Three'); +insert into xc_groupby_def VALUES (6, 'Two'); +insert into xc_groupby_def VALUES (7, NULL); +insert into xc_groupby_def VALUES (8, 'Two'); +insert into xc_groupby_def VALUES (9, 'Three'); +insert into xc_groupby_def VALUES (10, 'Three'); +select a,count(a) from xc_groupby_def group by a order by a; + a | count +----+------- + 1 | 1 + 2 | 2 + 3 | 1 + 4 | 1 + 5 | 1 + 6 | 1 + 7 | 1 + 8 | 1 + 9 | 1 + 10 | 1 + | 0 +(11 rows) + +explain (verbose true, costs false, nodes false) select a,count(a) from xc_groupby_def group by a order by a; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all + Output: a, count(a) + -> Sort + Output: a, (count(a)) + Sort Key: xc_groupby_def.a + -> HashAggregate + Output: a, count(a) + Group Key: xc_groupby_def.a + -> Seq Scan on public.xc_groupby_def + Output: a, b +(10 rows) + +select avg(a) from xc_groupby_def group by a order by 1; + avg +------------------------ + 1.00000000000000000000 + 2.0000000000000000 + 3.0000000000000000 + 4.0000000000000000 + 5.0000000000000000 + 6.0000000000000000 + 7.0000000000000000 + 8.0000000000000000 + 9.0000000000000000 + 10.0000000000000000 + +(11 rows) + +explain (verbose true, costs false, nodes false) select avg(a) from xc_groupby_def group by a order by 1; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all + Output: avg(a), a + -> Sort + Output: (avg(a)), a + Sort Key: (avg(xc_groupby_def.a)) + -> HashAggregate + Output: avg(a), a + Group Key: xc_groupby_def.a + -> Seq Scan on public.xc_groupby_def + Output: a, b +(10 rows) + +explain (verbose true, costs false, nodes false) select avg(a) from xc_groupby_def group by a; + QUERY PLAN +----------------------------------------------- + Remote Subquery Scan on all + Output: avg(a), a + -> HashAggregate + Output: avg(a), a + Group Key: xc_groupby_def.a + -> Seq Scan on public.xc_groupby_def + Output: a, b +(7 rows) + +select avg(a) from xc_groupby_def group by a order by 1; + avg +------------------------ + 1.00000000000000000000 + 2.0000000000000000 + 3.0000000000000000 + 4.0000000000000000 + 5.0000000000000000 + 6.0000000000000000 + 7.0000000000000000 + 8.0000000000000000 + 9.0000000000000000 + 10.0000000000000000 + +(11 rows) + +explain (verbose true, costs false, nodes false) select avg(a) from xc_groupby_def group by a order by 1; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all + Output: avg(a), a + -> Sort + Output: (avg(a)), a + Sort Key: (avg(xc_groupby_def.a)) + -> HashAggregate + Output: avg(a), a + Group Key: xc_groupby_def.a + -> Seq Scan on public.xc_groupby_def + Output: a, b +(10 rows) + +explain (verbose true, costs false, nodes false) select avg(a) from xc_groupby_def group by a; + QUERY PLAN +----------------------------------------------- + Remote Subquery Scan on all + Output: avg(a), a + -> HashAggregate + Output: avg(a), a + Group Key: xc_groupby_def.a + -> Seq Scan on public.xc_groupby_def + Output: a, b +(7 rows) + +select avg(a) from xc_groupby_def group by b 
order by 1; + avg +-------------------- + 4.0000000000000000 + 4.5000000000000000 + 6.2000000000000000 + +(4 rows) + +explain (verbose true, costs false, nodes false) select avg(a) from xc_groupby_def group by b order by 1; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all + Output: avg(a), b + -> Sort + Output: (avg(a)), b + Sort Key: (avg(xc_groupby_def.a)) + -> HashAggregate + Output: avg(a), b + Group Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: a, b +(10 rows) + +explain (verbose true, costs false, nodes false) select avg(a) from xc_groupby_def group by b; + QUERY PLAN +----------------------------------------------- + Remote Subquery Scan on all + Output: avg(a), b + -> HashAggregate + Output: avg(a), b + Group Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: a, b +(7 rows) + +select sum(a) from xc_groupby_def group by b order by 1; + sum +----- + 8 + 18 + 31 + +(4 rows) + +explain (verbose true, costs false, nodes false) select sum(a) from xc_groupby_def group by b order by 1; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all + Output: sum(a), b + -> Sort + Output: (sum(a)), b + Sort Key: (sum(xc_groupby_def.a)) + -> HashAggregate + Output: sum(a), b + Group Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: a, b +(10 rows) + +explain (verbose true, costs false, nodes false) select sum(a) from xc_groupby_def group by b; + QUERY PLAN +----------------------------------------------- + Remote Subquery Scan on all + Output: sum(a), b + -> HashAggregate + Output: sum(a), b + Group Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: a, b +(7 rows) + +select count(*) from xc_groupby_def group by b order by 1; + count +------- + 1 + 3 + 4 + 5 +(4 rows) + +explain (verbose true, costs false, nodes false) select count(*) from xc_groupby_def group by b order by 1; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all + Output: count(*), b + -> Sort + Output: (count(*)), b + Sort Key: (count(*)) + -> HashAggregate + Output: count(*), b + Group Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: a, b +(10 rows) + +explain (verbose true, costs false, nodes false) select count(*) from xc_groupby_def group by b; + QUERY PLAN +----------------------------------------------- + Remote Subquery Scan on all + Output: count(*), b + -> HashAggregate + Output: count(*), b + Group Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: a, b +(7 rows) + +select count(*) from xc_groupby_def where a is not null group by a order by 1; + count +------- + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 2 +(10 rows) + +explain (verbose true, costs false, nodes false) select count(*) from xc_groupby_def where a is not null group by a order by 1; + QUERY PLAN +------------------------------------------------------------ + Remote Subquery Scan on all + Output: count(*), a + -> Sort + Output: (count(*)), a + Sort Key: (count(*)) + -> HashAggregate + Output: count(*), a + Group Key: xc_groupby_def.a + -> Seq Scan on public.xc_groupby_def + Output: a, b + Filter: (xc_groupby_def.a IS NOT NULL) +(11 rows) + +explain (verbose true, costs false, nodes false) select count(*) from xc_groupby_def where a is not null group by a; + QUERY PLAN +------------------------------------------------------ + Remote Subquery Scan on all + Output: count(*), a + -> HashAggregate + Output: 
count(*), a + Group Key: xc_groupby_def.a + -> Seq Scan on public.xc_groupby_def + Output: a, b + Filter: (xc_groupby_def.a IS NOT NULL) +(8 rows) + +select * from (select b from xc_groupby_def group by b) q order by q.b; + b +------- + One + Three + Two + +(4 rows) + +explain (verbose true, costs false, nodes false) select * from (select b from xc_groupby_def group by b) q order by q.b; + QUERY PLAN +---------------------------------------------------------------- + Remote Subquery Scan on all + Output: b + -> Sort + Output: xc_groupby_def.b + Sort Key: xc_groupby_def.b + -> HashAggregate + Output: xc_groupby_def.b + Group Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: xc_groupby_def.a, xc_groupby_def.b +(10 rows) + +select * from (select b,count(b) from xc_groupby_def group by b) q order by q.b; + b | count +-------+------- + One | 1 + Three | 5 + Two | 4 + | 0 +(4 rows) + +explain (verbose true, costs false, nodes false) select * from (select b,count(b) from xc_groupby_def group by b) q order by q.b; + QUERY PLAN +----------------------------------------------------------------- + Remote Subquery Scan on all + Output: b, count + -> Sort + Output: xc_groupby_def.b, (count(xc_groupby_def.b)) + Sort Key: xc_groupby_def.b + -> HashAggregate + Output: xc_groupby_def.b, count(xc_groupby_def.b) + Group Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: xc_groupby_def.a, xc_groupby_def.b +(10 rows) + +select count(*) from xc_groupby_def where b is null group by b order by 1; + count +------- + 3 +(1 row) + +explain (verbose true, costs false, nodes false) select count(*) from xc_groupby_def where b is null group by b; + QUERY PLAN +-------------------------------------------------------- + Remote Subquery Scan on all + Output: count(*), b + -> GroupAggregate + Output: count(*), b + Group Key: xc_groupby_def.b + -> Sort + Output: b + Sort Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: b + Filter: (xc_groupby_def.b IS NULL) +(11 rows) + +create table xc_groupby_g(a int, b float, c numeric) distribute by replication; +insert into xc_groupby_g values(1,2.1,3.2); +insert into xc_groupby_g values(1,2.1,3.2); +insert into xc_groupby_g values(2,2.3,5.2); +select sum(a) from xc_groupby_g group by a; + sum +----- + 2 + 2 +(2 rows) + +explain (verbose true, costs false, nodes false) select sum(a) from xc_groupby_g group by a; + QUERY PLAN +--------------------------------------------- + Remote Subquery Scan on all + Output: sum(a), a + -> HashAggregate + Output: sum(a), a + Group Key: xc_groupby_g.a + -> Seq Scan on public.xc_groupby_g + Output: a, b, c +(7 rows) + +select sum(b) from xc_groupby_g group by b order by 1; + sum +----- + 2.3 + 4.2 +(2 rows) + +explain (verbose true, costs false, nodes false) select sum(b) from xc_groupby_g group by b order by 1; + QUERY PLAN +--------------------------------------------------- + Remote Subquery Scan on all + Output: sum(b), b + -> Sort + Output: (sum(b)), b + Sort Key: (sum(xc_groupby_g.b)) + -> HashAggregate + Output: sum(b), b + Group Key: xc_groupby_g.b + -> Seq Scan on public.xc_groupby_g + Output: a, b, c +(10 rows) + +explain (verbose true, costs false, nodes false) select sum(b) from xc_groupby_g group by b; + QUERY PLAN +--------------------------------------------- + Remote Subquery Scan on all + Output: sum(b), b + -> HashAggregate + Output: sum(b), b + Group Key: xc_groupby_g.b + -> Seq Scan on public.xc_groupby_g + Output: a, b, c +(7 rows) + +select sum(c) from xc_groupby_g 
group by b order by 1; + sum +----- + 5.2 + 6.4 +(2 rows) + +explain (verbose true, costs false, nodes false) select sum(c) from xc_groupby_g group by b order by 1; + QUERY PLAN +--------------------------------------------------- + Remote Subquery Scan on all + Output: sum(c), b + -> Sort + Output: (sum(c)), b + Sort Key: (sum(xc_groupby_g.c)) + -> HashAggregate + Output: sum(c), b + Group Key: xc_groupby_g.b + -> Seq Scan on public.xc_groupby_g + Output: a, b, c +(10 rows) + +explain (verbose true, costs false, nodes false) select sum(c) from xc_groupby_g group by b; + QUERY PLAN +--------------------------------------------- + Remote Subquery Scan on all + Output: sum(c), b + -> HashAggregate + Output: sum(c), b + Group Key: xc_groupby_g.b + -> Seq Scan on public.xc_groupby_g + Output: a, b, c +(7 rows) + +select avg(a) from xc_groupby_g group by b order by 1; + avg +------------------------ + 1.00000000000000000000 + 2.0000000000000000 +(2 rows) + +explain (verbose true, costs false, nodes false) select avg(a) from xc_groupby_g group by b order by 1; + QUERY PLAN +--------------------------------------------------- + Remote Subquery Scan on all + Output: avg(a), b + -> Sort + Output: (avg(a)), b + Sort Key: (avg(xc_groupby_g.a)) + -> HashAggregate + Output: avg(a), b + Group Key: xc_groupby_g.b + -> Seq Scan on public.xc_groupby_g + Output: a, b, c +(10 rows) + +explain (verbose true, costs false, nodes false) select avg(a) from xc_groupby_g group by b; + QUERY PLAN +--------------------------------------------- + Remote Subquery Scan on all + Output: avg(a), b + -> HashAggregate + Output: avg(a), b + Group Key: xc_groupby_g.b + -> Seq Scan on public.xc_groupby_g + Output: a, b, c +(7 rows) + +select avg(b) from xc_groupby_g group by c order by 1; + avg +----- + 2.1 + 2.3 +(2 rows) + +explain (verbose true, costs false, nodes false) select avg(b) from xc_groupby_g group by c order by 1; + QUERY PLAN +--------------------------------------------------- + Remote Subquery Scan on all + Output: avg(b), c + -> Sort + Output: (avg(b)), c + Sort Key: (avg(xc_groupby_g.b)) + -> HashAggregate + Output: avg(b), c + Group Key: xc_groupby_g.c + -> Seq Scan on public.xc_groupby_g + Output: a, b, c +(10 rows) + +explain (verbose true, costs false, nodes false) select avg(b) from xc_groupby_g group by c; + QUERY PLAN +--------------------------------------------- + Remote Subquery Scan on all + Output: avg(b), c + -> HashAggregate + Output: avg(b), c + Group Key: xc_groupby_g.c + -> Seq Scan on public.xc_groupby_g + Output: a, b, c +(7 rows) + +select avg(c) from xc_groupby_g group by c order by 1; + avg +-------------------- + 3.2000000000000000 + 5.2000000000000000 +(2 rows) + +explain (verbose true, costs false, nodes false) select avg(c) from xc_groupby_g group by c order by 1; + QUERY PLAN +--------------------------------------------------- + Remote Subquery Scan on all + Output: avg(c), c + -> Sort + Output: (avg(c)), c + Sort Key: (avg(xc_groupby_g.c)) + -> HashAggregate + Output: avg(c), c + Group Key: xc_groupby_g.c + -> Seq Scan on public.xc_groupby_g + Output: a, b, c +(10 rows) + +explain (verbose true, costs false, nodes false) select avg(c) from xc_groupby_g group by c; + QUERY PLAN +--------------------------------------------- + Remote Subquery Scan on all + Output: avg(c), c + -> HashAggregate + Output: avg(c), c + Group Key: xc_groupby_g.c + -> Seq Scan on public.xc_groupby_g + Output: a, b, c +(7 rows) + +drop table xc_groupby_def; +drop table xc_groupby_g; +reset enable_hashagg; 
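(Editorial sketch between Combination 2 and Combination 3.) The combinations in this expected output hinge on two planner behaviours: the enable_hashagg GUC chooses between HashAggregate and Sort + GroupAggregate, and tables distributed across datanodes (as the non-replicated tables above are by default) get a Partial/Finalize aggregate pair around a "Distribute results by H" redistribution step, while replicated tables can aggregate in a single pass. A minimal way to reproduce that toggle by hand is sketched below; agg_demo is an illustrative table name assumed for this sketch only and is not created by this patch's tests.

    -- illustrative only: agg_demo is not part of the regression schema;
    -- without a DISTRIBUTE BY clause it is assumed to be distributed
    -- across datanodes, like xc_groupby_tab1/xc_groupby_tab2 above
    create table agg_demo (val int, val2 int);
    insert into agg_demo values (1, 1), (2, 1), (3, 2);

    set enable_hashagg to on;
    explain (verbose true, costs false, nodes false)
        select sum(val) from agg_demo group by val2;
        -- expect Finalize HashAggregate over a Partial HashAggregate,
        -- with results redistributed by val2 between the two phases

    set enable_hashagg to off;
    explain (verbose true, costs false, nodes false)
        select sum(val) from agg_demo group by val2;
        -- expect Sort + Partial/Finalize GroupAggregate instead

    reset enable_hashagg;
    drop table agg_demo;

Comparing the two EXPLAIN outputs against the plans recorded in this file is a quick way to check whether a planner change affects the hash-aggregate path, the sorted-aggregate path, or both.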
+-- Combination 3 enable_hashagg off and distributed tables +set enable_hashagg to off; +-- create required tables and fill them with data +create table xc_groupby_tab1 (val int, val2 int); +create table xc_groupby_tab2 (val int, val2 int); +insert into xc_groupby_tab1 values (1, 1), (2, 1), (3, 1), (2, 2), (6, 2), (4, 3), (1, 3), (6, 3); +insert into xc_groupby_tab2 values (1, 1), (4, 1), (8, 1), (2, 4), (9, 4), (3, 4), (4, 2), (5, 2), (3, 2); +select count(*), sum(val), avg(val), sum(val)::float8/count(*), val2 from xc_groupby_tab1 group by val2 order by 1; + count | sum | avg | ?column? | val2 +-------+-----+--------------------+------------------+------ + 2 | 8 | 4.0000000000000000 | 4 | 2 + 3 | 6 | 2.0000000000000000 | 2 | 1 + 3 | 11 | 3.6666666666666667 | 3.66666666666667 | 3 +(3 rows) + +explain (verbose true, costs false, nodes false) select count(*), sum(val), avg(val), sum(val)::float8/count(*), val2 from xc_groupby_tab1 group by val2; + QUERY PLAN +------------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: count(*), sum(val), avg(val), ((sum(val))::double precision / (count(*))::double precision), val2 + -> Finalize GroupAggregate + Output: count(*), sum(val), avg(val), ((sum(val))::double precision / (count(*))::double precision), val2 + Group Key: xc_groupby_tab1.val2 + -> Sort + Output: val2, (PARTIAL count(*)), (PARTIAL sum(val)), (PARTIAL avg(val)) + Sort Key: xc_groupby_tab1.val2 + -> Remote Subquery Scan on all + Output: val2, PARTIAL count(*), PARTIAL sum(val), PARTIAL avg(val) + Distribute results by H: val2 + -> Partial GroupAggregate + Output: val2, PARTIAL count(*), PARTIAL sum(val), PARTIAL avg(val) + Group Key: xc_groupby_tab1.val2 + -> Sort + Output: val2, val + Sort Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: val2, val +(19 rows) + +-- joins and group by +select count(*), sum(xc_groupby_tab1.val * xc_groupby_tab2.val), avg(xc_groupby_tab1.val*xc_groupby_tab2.val), sum(xc_groupby_tab1.val*xc_groupby_tab2.val)::float8/count(*), xc_groupby_tab1.val2, xc_groupby_tab2.val2 from xc_groupby_tab1 full outer join xc_groupby_tab2 on xc_groupby_tab1.val2 = xc_groupby_tab2.val2 group by xc_groupby_tab1.val2, xc_groupby_tab2.val2 order by count(*); + count | sum | avg | ?column? 
| val2 | val2 +-------+-----+---------------------+------------------+------+------ + 3 | | | | | 4 + 3 | | | | 3 | + 6 | 96 | 16.0000000000000000 | 16 | 2 | 2 + 9 | 78 | 8.6666666666666667 | 8.66666666666667 | 1 | 1 +(4 rows) + +explain (verbose true, costs false, nodes false) select count(*), sum(xc_groupby_tab1.val * xc_groupby_tab2.val), avg(xc_groupby_tab1.val*xc_groupby_tab2.val), sum(xc_groupby_tab1.val*xc_groupby_tab2.val)::float8/count(*), xc_groupby_tab1.val2, xc_groupby_tab2.val2 from xc_groupby_tab1 full outer join xc_groupby_tab2 on xc_groupby_tab1.val2 = xc_groupby_tab2.val2 group by xc_groupby_tab1.val2, xc_groupby_tab2.val2; + QUERY PLAN +--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: count(*), sum((xc_groupby_tab1.val * xc_groupby_tab2.val)), avg((xc_groupby_tab1.val * xc_groupby_tab2.val)), ((sum((xc_groupby_tab1.val * xc_groupby_tab2.val)))::double precision / (count(*))::double precision), xc_groupby_tab1.val2, xc_groupby_tab2.val2 + -> Finalize GroupAggregate + Output: count(*), sum((xc_groupby_tab1.val * xc_groupby_tab2.val)), avg((xc_groupby_tab1.val * xc_groupby_tab2.val)), ((sum((xc_groupby_tab1.val * xc_groupby_tab2.val)))::double precision / (count(*))::double precision), xc_groupby_tab1.val2, xc_groupby_tab2.val2 + Group Key: xc_groupby_tab1.val2, xc_groupby_tab2.val2 + -> Sort + Output: xc_groupby_tab1.val2, xc_groupby_tab2.val2, (PARTIAL count(*)), (PARTIAL sum((xc_groupby_tab1.val * xc_groupby_tab2.val))), (PARTIAL avg((xc_groupby_tab1.val * xc_groupby_tab2.val))) + Sort Key: xc_groupby_tab1.val2, xc_groupby_tab2.val2 + -> Remote Subquery Scan on all + Output: xc_groupby_tab1.val2, xc_groupby_tab2.val2, PARTIAL count(*), PARTIAL sum((xc_groupby_tab1.val * xc_groupby_tab2.val)), PARTIAL avg((xc_groupby_tab1.val * xc_groupby_tab2.val)) + Distribute results by H: val2 + -> Partial GroupAggregate + Output: xc_groupby_tab1.val2, xc_groupby_tab2.val2, PARTIAL count(*), PARTIAL sum((xc_groupby_tab1.val * xc_groupby_tab2.val)), PARTIAL avg((xc_groupby_tab1.val * xc_groupby_tab2.val)) + Group Key: xc_groupby_tab1.val2, xc_groupby_tab2.val2 + -> Sort + Output: xc_groupby_tab1.val2, xc_groupby_tab2.val2, xc_groupby_tab1.val, xc_groupby_tab2.val + Sort Key: xc_groupby_tab1.val2, xc_groupby_tab2.val2 + -> Hash Full Join + Output: xc_groupby_tab1.val2, xc_groupby_tab2.val2, xc_groupby_tab1.val, xc_groupby_tab2.val + Hash Cond: (xc_groupby_tab1.val2 = xc_groupby_tab2.val2) + -> Remote Subquery Scan on all + Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 + Distribute results by H: val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 + -> Hash + Output: xc_groupby_tab2.val, xc_groupby_tab2.val2 + -> Remote Subquery Scan on all + Output: xc_groupby_tab2.val, xc_groupby_tab2.val2 + Distribute results by H: val2 + -> Seq Scan on public.xc_groupby_tab2 + Output: xc_groupby_tab2.val, xc_groupby_tab2.val2 +(32 rows) + +-- aggregates over aggregates +select sum(y) from (select sum(val) y, val2%2 x from xc_groupby_tab1 group by val2) q1 group by x; + sum +----- + 17 + 8 +(2 rows) + +explain (verbose true, costs false, nodes false) select sum(y) from (select sum(val) y, val2%2 x from xc_groupby_tab1 group by val2) q1 group by x; + QUERY PLAN 
+-------------------------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: sum(q1.y), q1.x + -> Finalize GroupAggregate + Output: sum(q1.y), q1.x + Group Key: q1.x + -> Sort + Output: q1.x, (PARTIAL sum(q1.y)) + Sort Key: q1.x + -> Remote Subquery Scan on all + Output: q1.x, PARTIAL sum(q1.y) + Distribute results by H: x + -> Partial GroupAggregate + Output: q1.x, PARTIAL sum(q1.y) + Group Key: q1.x + -> Sort + Output: q1.x, q1.y + Sort Key: q1.x + -> Subquery Scan on q1 + Output: q1.x, q1.y + -> Finalize GroupAggregate + Output: sum(xc_groupby_tab1.val), (xc_groupby_tab1.val2 % 2), xc_groupby_tab1.val2 + Group Key: xc_groupby_tab1.val2 + -> Sort + Output: xc_groupby_tab1.val2, (PARTIAL sum(xc_groupby_tab1.val)) + Sort Key: xc_groupby_tab1.val2 + -> Remote Subquery Scan on all + Output: xc_groupby_tab1.val2, PARTIAL sum(xc_groupby_tab1.val) + Distribute results by H: val2 + -> Partial GroupAggregate + Output: xc_groupby_tab1.val2, PARTIAL sum(xc_groupby_tab1.val) + Group Key: xc_groupby_tab1.val2 + -> Sort + Output: xc_groupby_tab1.val2, xc_groupby_tab1.val + Sort Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val2, xc_groupby_tab1.val +(36 rows) + +-- group by without aggregate +select val2 from xc_groupby_tab1 group by val2 order by 1; + val2 +------ + 1 + 2 + 3 +(3 rows) + +explain (verbose true, costs false, nodes false) select val2 from xc_groupby_tab1 group by val2; + QUERY PLAN +------------------------------------------------------------------------ + Remote Subquery Scan on all + Output: val2 + -> Group + Output: val2 + Group Key: xc_groupby_tab1.val2 + -> Sort + Output: val2 + Sort Key: xc_groupby_tab1.val2 + -> Remote Subquery Scan on all + Output: val2 + Distribute results by H: val2 + -> Group + Output: val2 + Group Key: xc_groupby_tab1.val2 + -> Sort + Output: val2 + Sort Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: val2 +(19 rows) + +select val + val2 from xc_groupby_tab1 group by val + val2 order by 1; + ?column? +---------- + 2 + 3 + 4 + 7 + 8 + 9 +(6 rows) + +explain (verbose true, costs false, nodes false) select val + val2 from xc_groupby_tab1 group by val + val2; + QUERY PLAN +------------------------------------------------------------------------------------------ + Remote Subquery Scan on all + Output: (val + val2) + -> Group + Output: ((val + val2)) + Group Key: ((xc_groupby_tab1.val + xc_groupby_tab1.val2)) + -> Sort + Output: ((val + val2)) + Sort Key: ((xc_groupby_tab1.val + xc_groupby_tab1.val2)) + -> Remote Subquery Scan on all + Output: (val + val2) + Distribute results by H: (val + val2) + -> Group + Output: ((val + val2)) + Group Key: ((xc_groupby_tab1.val + xc_groupby_tab1.val2)) + -> Sort + Output: ((val + val2)) + Sort Key: ((xc_groupby_tab1.val + xc_groupby_tab1.val2)) + -> Seq Scan on public.xc_groupby_tab1 + Output: (val + val2) +(19 rows) + +select val + val2, val, val2 from xc_groupby_tab1 group by val, val2 order by val, val2; + ?column? 
| val | val2 +----------+-----+------ + 2 | 1 | 1 + 4 | 1 | 3 + 3 | 2 | 1 + 4 | 2 | 2 + 4 | 3 | 1 + 7 | 4 | 3 + 8 | 6 | 2 + 9 | 6 | 3 +(8 rows) + +explain (verbose true, costs false, nodes false) select val + val2, val, val2 from xc_groupby_tab1 group by val, val2 order by val, val2; + QUERY PLAN +------------------------------------------------------------------- + Remote Subquery Scan on all + Output: (val + val2), val, val2 + Sort Key: xc_groupby_tab1.val, xc_groupby_tab1.val2 + -> Group + Output: (val + val2), val, val2 + Group Key: xc_groupby_tab1.val, xc_groupby_tab1.val2 + -> Sort + Output: val, val2 + Sort Key: xc_groupby_tab1.val, xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: val, val2 +(11 rows) + +select xc_groupby_tab1.val + xc_groupby_tab2.val2, xc_groupby_tab1.val, xc_groupby_tab2.val2 from xc_groupby_tab1, xc_groupby_tab2 where xc_groupby_tab1.val = xc_groupby_tab2.val group by xc_groupby_tab1.val, xc_groupby_tab2.val2 order by 1; + ?column? | val | val2 +----------+-----+------ + 2 | 1 | 1 + 5 | 3 | 2 + 5 | 4 | 1 + 6 | 4 | 2 + 6 | 2 | 4 + 7 | 3 | 4 +(6 rows) + +explain (verbose true, costs false, nodes false) select xc_groupby_tab1.val + xc_groupby_tab2.val2, xc_groupby_tab1.val, xc_groupby_tab2.val2 from xc_groupby_tab1, xc_groupby_tab2 where xc_groupby_tab1.val = xc_groupby_tab2.val group by xc_groupby_tab1.val, xc_groupby_tab2.val2; + QUERY PLAN +--------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2), xc_groupby_tab1.val, xc_groupby_tab2.val2 + -> Group + Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2), xc_groupby_tab1.val, xc_groupby_tab2.val2 + Group Key: xc_groupby_tab1.val, xc_groupby_tab2.val2 + -> Sort + Output: xc_groupby_tab1.val, xc_groupby_tab2.val2 + Sort Key: xc_groupby_tab1.val, xc_groupby_tab2.val2 + -> Merge Join + Output: xc_groupby_tab1.val, xc_groupby_tab2.val2 + Merge Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) + -> Sort + Output: xc_groupby_tab1.val + Sort Key: xc_groupby_tab1.val + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val + -> Sort + Output: xc_groupby_tab2.val2, xc_groupby_tab2.val + Sort Key: xc_groupby_tab2.val + -> Seq Scan on public.xc_groupby_tab2 + Output: xc_groupby_tab2.val2, xc_groupby_tab2.val +(21 rows) + +select xc_groupby_tab1.val + xc_groupby_tab2.val2 from xc_groupby_tab1, xc_groupby_tab2 where xc_groupby_tab1.val = xc_groupby_tab2.val group by xc_groupby_tab1.val + xc_groupby_tab2.val2 order by 1; + ?column? 
+---------- + 2 + 5 + 6 + 7 +(4 rows) + +explain (verbose true, costs false, nodes false) select xc_groupby_tab1.val + xc_groupby_tab2.val2 from xc_groupby_tab1, xc_groupby_tab2 where xc_groupby_tab1.val = xc_groupby_tab2.val group by xc_groupby_tab1.val + xc_groupby_tab2.val2; + QUERY PLAN +----------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2) + -> Group + Output: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + Group Key: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + -> Sort + Output: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + Sort Key: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + -> Remote Subquery Scan on all + Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2) + Distribute results by H: (val + val2) + -> Group + Output: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + Group Key: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + -> Sort + Output: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + Sort Key: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + -> Merge Join + Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2) + Merge Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) + -> Sort + Output: xc_groupby_tab1.val + Sort Key: xc_groupby_tab1.val + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val + -> Sort + Output: xc_groupby_tab2.val2, xc_groupby_tab2.val + Sort Key: xc_groupby_tab2.val + -> Seq Scan on public.xc_groupby_tab2 + Output: xc_groupby_tab2.val2, xc_groupby_tab2.val +(30 rows) + +-- group by with aggregates in expression +select count(*) + sum(val) + avg(val), val2 from xc_groupby_tab1 group by val2 order by 1; + ?column? | val2 +---------------------+------ + 11.0000000000000000 | 1 + 14.0000000000000000 | 2 + 17.6666666666666667 | 3 +(3 rows) + +explain (verbose true, costs false, nodes false) select count(*) + sum(val) + avg(val), val2 from xc_groupby_tab1 group by val2; + QUERY PLAN +---------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: (((count(*) + sum(val)))::numeric + avg(val)), val2 + -> Finalize GroupAggregate + Output: (((count(*) + sum(val)))::numeric + avg(val)), val2 + Group Key: xc_groupby_tab1.val2 + -> Sort + Output: val2, (PARTIAL count(*)), (PARTIAL sum(val)), (PARTIAL avg(val)) + Sort Key: xc_groupby_tab1.val2 + -> Remote Subquery Scan on all + Output: val2, PARTIAL count(*), PARTIAL sum(val), PARTIAL avg(val) + Distribute results by H: val2 + -> Partial GroupAggregate + Output: val2, PARTIAL count(*), PARTIAL sum(val), PARTIAL avg(val) + Group Key: xc_groupby_tab1.val2 + -> Sort + Output: val2, val + Sort Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: val2, val +(19 rows) + +-- group by with expressions in group by clause +select sum(val), avg(val), 2 * val2 from xc_groupby_tab1 group by 2 * val2 order by 1; + sum | avg | ?column? 
+-----+--------------------+---------- + 6 | 2.0000000000000000 | 2 + 8 | 4.0000000000000000 | 4 + 11 | 3.6666666666666667 | 6 +(3 rows) + +explain (verbose true, costs false, nodes false) select sum(val), avg(val), 2 * val2 from xc_groupby_tab1 group by 2 * val2; + QUERY PLAN +------------------------------------------------------------------------------------ + Remote Subquery Scan on all + Output: sum(val), avg(val), (2 * val2) + -> Finalize GroupAggregate + Output: sum(val), avg(val), ((2 * val2)) + Group Key: ((2 * xc_groupby_tab1.val2)) + -> Sort + Output: ((2 * val2)), (PARTIAL sum(val)), (PARTIAL avg(val)) + Sort Key: ((2 * xc_groupby_tab1.val2)) + -> Remote Subquery Scan on all + Output: (2 * val2), PARTIAL sum(val), PARTIAL avg(val) + Distribute results by H: (2 * val2) + -> Partial GroupAggregate + Output: ((2 * val2)), PARTIAL sum(val), PARTIAL avg(val) + Group Key: ((2 * xc_groupby_tab1.val2)) + -> Sort + Output: ((2 * val2)), val + Sort Key: ((2 * xc_groupby_tab1.val2)) + -> Seq Scan on public.xc_groupby_tab1 + Output: (2 * val2), val +(19 rows) + +drop table xc_groupby_tab1; +drop table xc_groupby_tab2; +-- some tests involving nulls, characters, float type etc. +create table xc_groupby_def(a int, b varchar(25)); +insert into xc_groupby_def VALUES (NULL, NULL); +insert into xc_groupby_def VALUES (1, NULL); +insert into xc_groupby_def VALUES (NULL, 'One'); +insert into xc_groupby_def VALUES (2, 'Two'); +insert into xc_groupby_def VALUES (2, 'Two'); +insert into xc_groupby_def VALUES (3, 'Three'); +insert into xc_groupby_def VALUES (4, 'Three'); +insert into xc_groupby_def VALUES (5, 'Three'); +insert into xc_groupby_def VALUES (6, 'Two'); +insert into xc_groupby_def VALUES (7, NULL); +insert into xc_groupby_def VALUES (8, 'Two'); +insert into xc_groupby_def VALUES (9, 'Three'); +insert into xc_groupby_def VALUES (10, 'Three'); +select a,count(a) from xc_groupby_def group by a order by a; + a | count +----+------- + 1 | 1 + 2 | 2 + 3 | 1 + 4 | 1 + 5 | 1 + 6 | 1 + 7 | 1 + 8 | 1 + 9 | 1 + 10 | 1 + | 0 +(11 rows) + +explain (verbose true, costs false, nodes false) select a,count(a) from xc_groupby_def group by a order by a; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all + Output: a, count(a) + Sort Key: xc_groupby_def.a + -> GroupAggregate + Output: a, count(a) + Group Key: xc_groupby_def.a + -> Sort + Output: a + Sort Key: xc_groupby_def.a + -> Seq Scan on public.xc_groupby_def + Output: a +(11 rows) + +select avg(a) from xc_groupby_def group by a order by 1; + avg +------------------------ + 1.00000000000000000000 + 2.0000000000000000 + 3.0000000000000000 + 4.0000000000000000 + 5.0000000000000000 + 6.0000000000000000 + 7.0000000000000000 + 8.0000000000000000 + 9.0000000000000000 + 10.0000000000000000 + +(11 rows) + +explain (verbose true, costs false, nodes false) select avg(a) from xc_groupby_def group by a order by 1; + QUERY PLAN +----------------------------------------------------------- + Remote Subquery Scan on all + Output: avg(a), a + Sort Key: avg(xc_groupby_def.a) + -> Sort + Output: (avg(a)), a + Sort Key: (avg(xc_groupby_def.a)) + -> GroupAggregate + Output: avg(a), a + Group Key: xc_groupby_def.a + -> Sort + Output: a + Sort Key: xc_groupby_def.a + -> Seq Scan on public.xc_groupby_def + Output: a +(14 rows) + +select avg(a) from xc_groupby_def group by b order by 1; + avg +-------------------- + 4.0000000000000000 + 4.5000000000000000 + 6.2000000000000000 + +(4 rows) + +explain (verbose true, costs false, nodes 
false) select avg(a) from xc_groupby_def group by b; + QUERY PLAN +----------------------------------------------------------------------- + Remote Subquery Scan on all + Output: avg(a), b + -> Finalize GroupAggregate + Output: avg(a), b + Group Key: xc_groupby_def.b + -> Sort + Output: b, (PARTIAL avg(a)) + Sort Key: xc_groupby_def.b + -> Remote Subquery Scan on all + Output: b, PARTIAL avg(a) + Distribute results by H: b + -> Partial GroupAggregate + Output: b, PARTIAL avg(a) + Group Key: xc_groupby_def.b + -> Sort + Output: b, a + Sort Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: b, a +(19 rows) + +select sum(a) from xc_groupby_def group by b order by 1; + sum +----- + 8 + 18 + 31 + +(4 rows) + +explain (verbose true, costs false, nodes false) select sum(a) from xc_groupby_def group by b; + QUERY PLAN +----------------------------------------------------------------------- + Remote Subquery Scan on all + Output: sum(a), b + -> Finalize GroupAggregate + Output: sum(a), b + Group Key: xc_groupby_def.b + -> Sort + Output: b, (PARTIAL sum(a)) + Sort Key: xc_groupby_def.b + -> Remote Subquery Scan on all + Output: b, PARTIAL sum(a) + Distribute results by H: b + -> Partial GroupAggregate + Output: b, PARTIAL sum(a) + Group Key: xc_groupby_def.b + -> Sort + Output: b, a + Sort Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: b, a +(19 rows) + +select count(*) from xc_groupby_def group by b order by 1; + count +------- + 1 + 3 + 4 + 5 +(4 rows) + +explain (verbose true, costs false, nodes false) select count(*) from xc_groupby_def group by b; + QUERY PLAN +----------------------------------------------------------------------- + Remote Subquery Scan on all + Output: count(*), b + -> Finalize GroupAggregate + Output: count(*), b + Group Key: xc_groupby_def.b + -> Sort + Output: b, (PARTIAL count(*)) + Sort Key: xc_groupby_def.b + -> Remote Subquery Scan on all + Output: b, PARTIAL count(*) + Distribute results by H: b + -> Partial GroupAggregate + Output: b, PARTIAL count(*) + Group Key: xc_groupby_def.b + -> Sort + Output: b + Sort Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: b +(19 rows) + +select count(*) from xc_groupby_def where a is not null group by a order by 1; + count +------- + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 2 +(10 rows) + +explain (verbose true, costs false, nodes false) select count(*) from xc_groupby_def where a is not null group by a; + QUERY PLAN +------------------------------------------------------------ + Remote Subquery Scan on all + Output: count(*), a + -> GroupAggregate + Output: count(*), a + Group Key: xc_groupby_def.a + -> Sort + Output: a + Sort Key: xc_groupby_def.a + -> Seq Scan on public.xc_groupby_def + Output: a + Filter: (xc_groupby_def.a IS NOT NULL) +(11 rows) + +select b from xc_groupby_def group by b order by 1; + b +------- + One + Three + Two + +(4 rows) + +explain (verbose true, costs false, nodes false) select b from xc_groupby_def group by b; + QUERY PLAN +----------------------------------------------------------------------- + Remote Subquery Scan on all + Output: b + -> Group + Output: b + Group Key: xc_groupby_def.b + -> Sort + Output: b + Sort Key: xc_groupby_def.b + -> Remote Subquery Scan on all + Output: b + Distribute results by H: b + -> Group + Output: b + Group Key: xc_groupby_def.b + -> Sort + Output: b + Sort Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: b +(19 rows) + +select b,count(b) from xc_groupby_def group by b order by 1; + b | 
count +-------+------- + One | 1 + Three | 5 + Two | 4 + | 0 +(4 rows) + +explain (verbose true, costs false, nodes false) select b,count(b) from xc_groupby_def group by b; + QUERY PLAN +----------------------------------------------------------------------- + Remote Subquery Scan on all + Output: b, count(b) + -> Finalize GroupAggregate + Output: b, count(b) + Group Key: xc_groupby_def.b + -> Sort + Output: b, (PARTIAL count(b)) + Sort Key: xc_groupby_def.b + -> Remote Subquery Scan on all + Output: b, PARTIAL count(b) + Distribute results by H: b + -> Partial GroupAggregate + Output: b, PARTIAL count(b) + Group Key: xc_groupby_def.b + -> Sort + Output: b + Sort Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: b +(19 rows) + +select count(*) from xc_groupby_def where b is null group by b order by 1; + count +------- + 3 +(1 row) + +explain (verbose true, costs false, nodes false) select count(*) from xc_groupby_def where b is null group by b; + QUERY PLAN +-------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: count(*), b + -> Finalize GroupAggregate + Output: count(*), b + Group Key: xc_groupby_def.b + -> Sort + Output: b, (PARTIAL count(*)) + Sort Key: xc_groupby_def.b + -> Remote Subquery Scan on all + Output: b, PARTIAL count(*) + Distribute results by H: b + -> Partial GroupAggregate + Output: b, PARTIAL count(*) + Group Key: xc_groupby_def.b + -> Sort + Output: b + Sort Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: b + Filter: (xc_groupby_def.b IS NULL) +(20 rows) + +create table xc_groupby_g(a int, b float, c numeric); +insert into xc_groupby_g values(1,2.1,3.2); +insert into xc_groupby_g values(1,2.1,3.2); +insert into xc_groupby_g values(2,2.3,5.2); +select sum(a) from xc_groupby_g group by a; + sum +----- + 2 + 2 +(2 rows) + +explain (verbose true, costs false, nodes false) select sum(a) from xc_groupby_g group by a; + QUERY PLAN +--------------------------------------------------- + Remote Subquery Scan on all + Output: sum(a), a + -> GroupAggregate + Output: sum(a), a + Group Key: xc_groupby_g.a + -> Sort + Output: a + Sort Key: xc_groupby_g.a + -> Seq Scan on public.xc_groupby_g + Output: a +(10 rows) + +select sum(b) from xc_groupby_g group by b order by 1; + sum +----- + 2.3 + 4.2 +(2 rows) + +explain (verbose true, costs false, nodes false) select sum(b) from xc_groupby_g group by b; + QUERY PLAN +--------------------------------------------------------------------- + Remote Subquery Scan on all + Output: sum(b), b + -> Finalize GroupAggregate + Output: sum(b), b + Group Key: xc_groupby_g.b + -> Sort + Output: b, (PARTIAL sum(b)) + Sort Key: xc_groupby_g.b + -> Remote Subquery Scan on all + Output: b, PARTIAL sum(b) + Distribute results by H: b + -> Partial GroupAggregate + Output: b, PARTIAL sum(b) + Group Key: xc_groupby_g.b + -> Sort + Output: b + Sort Key: xc_groupby_g.b + -> Seq Scan on public.xc_groupby_g + Output: b +(19 rows) + +select sum(c) from xc_groupby_g group by b order by 1; + sum +----- + 5.2 + 6.4 +(2 rows) + +explain (verbose true, costs false, nodes false) select sum(c) from xc_groupby_g group by b; + QUERY PLAN +--------------------------------------------------------------------- + Remote Subquery Scan on all + Output: sum(c), b + -> Finalize GroupAggregate + Output: sum(c), b + Group Key: xc_groupby_g.b + -> Sort + Output: b, (PARTIAL sum(c)) + Sort Key: xc_groupby_g.b + -> Remote Subquery Scan on all + Output: b, PARTIAL sum(c) + Distribute 
results by H: b + -> Partial GroupAggregate + Output: b, PARTIAL sum(c) + Group Key: xc_groupby_g.b + -> Sort + Output: b, c + Sort Key: xc_groupby_g.b + -> Seq Scan on public.xc_groupby_g + Output: b, c +(19 rows) + +select avg(a) from xc_groupby_g group by b order by 1; + avg +------------------------ + 1.00000000000000000000 + 2.0000000000000000 +(2 rows) + +explain (verbose true, costs false, nodes false) select avg(a) from xc_groupby_g group by b; + QUERY PLAN +--------------------------------------------------------------------- + Remote Subquery Scan on all + Output: avg(a), b + -> Finalize GroupAggregate + Output: avg(a), b + Group Key: xc_groupby_g.b + -> Sort + Output: b, (PARTIAL avg(a)) + Sort Key: xc_groupby_g.b + -> Remote Subquery Scan on all + Output: b, PARTIAL avg(a) + Distribute results by H: b + -> Partial GroupAggregate + Output: b, PARTIAL avg(a) + Group Key: xc_groupby_g.b + -> Sort + Output: b, a + Sort Key: xc_groupby_g.b + -> Seq Scan on public.xc_groupby_g + Output: b, a +(19 rows) + +select avg(b) from xc_groupby_g group by c order by 1; + avg +----- + 2.1 + 2.3 +(2 rows) + +explain (verbose true, costs false, nodes false) select avg(b) from xc_groupby_g group by c; + QUERY PLAN +--------------------------------------------------------------------- + Remote Subquery Scan on all + Output: avg(b), c + -> Finalize GroupAggregate + Output: avg(b), c + Group Key: xc_groupby_g.c + -> Sort + Output: c, (PARTIAL avg(b)) + Sort Key: xc_groupby_g.c + -> Remote Subquery Scan on all + Output: c, PARTIAL avg(b) + Distribute results by H: c + -> Partial GroupAggregate + Output: c, PARTIAL avg(b) + Group Key: xc_groupby_g.c + -> Sort + Output: c, b + Sort Key: xc_groupby_g.c + -> Seq Scan on public.xc_groupby_g + Output: c, b +(19 rows) + +select avg(c) from xc_groupby_g group by c order by 1; + avg +-------------------- + 3.2000000000000000 + 5.2000000000000000 +(2 rows) + +explain (verbose true, costs false, nodes false) select avg(c) from xc_groupby_g group by c; + QUERY PLAN +--------------------------------------------------------------------- + Remote Subquery Scan on all + Output: avg(c), c + -> Finalize GroupAggregate + Output: avg(c), c + Group Key: xc_groupby_g.c + -> Sort + Output: c, (PARTIAL avg(c)) + Sort Key: xc_groupby_g.c + -> Remote Subquery Scan on all + Output: c, PARTIAL avg(c) + Distribute results by H: c + -> Partial GroupAggregate + Output: c, PARTIAL avg(c) + Group Key: xc_groupby_g.c + -> Sort + Output: c + Sort Key: xc_groupby_g.c + -> Seq Scan on public.xc_groupby_g + Output: c +(19 rows) + +drop table xc_groupby_def; +drop table xc_groupby_g; +-- Combination 4 enable_hashagg off and replicated tables. +-- repeat the same tests for replicated tables +-- create required tables and fill them with data +create table xc_groupby_tab1 (val int, val2 int) distribute by replication; +create table xc_groupby_tab2 (val int, val2 int) distribute by replication; +insert into xc_groupby_tab1 values (1, 1), (2, 1), (3, 1), (2, 2), (6, 2), (4, 3), (1, 3), (6, 3); +insert into xc_groupby_tab2 values (1, 1), (4, 1), (8, 1), (2, 4), (9, 4), (3, 4), (4, 2), (5, 2), (3, 2); +select count(*), sum(val), avg(val), sum(val)::float8/count(*), val2 from xc_groupby_tab1 group by val2; + count | sum | avg | ?column? 
| val2 +-------+-----+--------------------+------------------+------ + 3 | 6 | 2.0000000000000000 | 2 | 1 + 2 | 8 | 4.0000000000000000 | 4 | 2 + 3 | 11 | 3.6666666666666667 | 3.66666666666667 | 3 +(3 rows) + +explain (verbose true, costs false, nodes false) select count(*), sum(val), avg(val), sum(val)::float8/count(*), val2 from xc_groupby_tab1 group by val2; + QUERY PLAN +------------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: count(*), sum(val), avg(val), ((sum(val))::double precision / (count(*))::double precision), val2 + -> GroupAggregate + Output: count(*), sum(val), avg(val), ((sum(val))::double precision / (count(*))::double precision), val2 + Group Key: xc_groupby_tab1.val2 + -> Sort + Output: val2, val + Sort Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: val2, val +(10 rows) + +-- joins and group by +select count(*), sum(xc_groupby_tab1.val * xc_groupby_tab2.val), avg(xc_groupby_tab1.val*xc_groupby_tab2.val), sum(xc_groupby_tab1.val*xc_groupby_tab2.val)::float8/count(*), xc_groupby_tab1.val2, xc_groupby_tab2.val2 from xc_groupby_tab1 full outer join xc_groupby_tab2 on xc_groupby_tab1.val2 = xc_groupby_tab2.val2 group by xc_groupby_tab1.val2, xc_groupby_tab2.val2 order by count(*); + count | sum | avg | ?column? | val2 | val2 +-------+-----+---------------------+------------------+------+------ + 3 | | | | 3 | + 3 | | | | | 4 + 6 | 96 | 16.0000000000000000 | 16 | 2 | 2 + 9 | 78 | 8.6666666666666667 | 8.66666666666667 | 1 | 1 +(4 rows) + +explain (verbose true, costs false, nodes false) select count(*), sum(xc_groupby_tab1.val * xc_groupby_tab2.val), avg(xc_groupby_tab1.val*xc_groupby_tab2.val), sum(xc_groupby_tab1.val*xc_groupby_tab2.val)::float8/count(*), xc_groupby_tab1.val2, xc_groupby_tab2.val2 from xc_groupby_tab1 full outer join xc_groupby_tab2 on xc_groupby_tab1.val2 = xc_groupby_tab2.val2 group by xc_groupby_tab1.val2, xc_groupby_tab2.val2; + QUERY PLAN +--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: count(*), sum((xc_groupby_tab1.val * xc_groupby_tab2.val)), avg((xc_groupby_tab1.val * xc_groupby_tab2.val)), ((sum((xc_groupby_tab1.val * xc_groupby_tab2.val)))::double precision / (count(*))::double precision), xc_groupby_tab1.val2, xc_groupby_tab2.val2 + -> GroupAggregate + Output: count(*), sum((xc_groupby_tab1.val * xc_groupby_tab2.val)), avg((xc_groupby_tab1.val * xc_groupby_tab2.val)), ((sum((xc_groupby_tab1.val * xc_groupby_tab2.val)))::double precision / (count(*))::double precision), xc_groupby_tab1.val2, xc_groupby_tab2.val2 + Group Key: xc_groupby_tab1.val2, xc_groupby_tab2.val2 + -> Sort + Output: xc_groupby_tab1.val2, xc_groupby_tab2.val2, xc_groupby_tab1.val, xc_groupby_tab2.val + Sort Key: xc_groupby_tab1.val2, xc_groupby_tab2.val2 + -> Merge Full Join + Output: xc_groupby_tab1.val2, xc_groupby_tab2.val2, xc_groupby_tab1.val, xc_groupby_tab2.val + Merge Cond: (xc_groupby_tab1.val2 = xc_groupby_tab2.val2) + -> Sort + Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 + Sort Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 + -> Sort + Output: xc_groupby_tab2.val, xc_groupby_tab2.val2 + Sort Key: 
xc_groupby_tab2.val2 + -> Seq Scan on public.xc_groupby_tab2 + Output: xc_groupby_tab2.val, xc_groupby_tab2.val2 +(21 rows) + +-- aggregates over aggregates +select sum(y) from (select sum(val) y, val2%2 x from xc_groupby_tab1 group by val2) q1 group by x; + sum +----- + 8 + 17 +(2 rows) + +explain (verbose true, costs false, nodes false) select sum(y) from (select sum(val) y, val2%2 x from xc_groupby_tab1 group by val2) q1 group by x; + QUERY PLAN +-------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: sum(q1.y), q1.x + -> GroupAggregate + Output: sum(q1.y), q1.x + Group Key: q1.x + -> Sort + Output: q1.x, q1.y + Sort Key: q1.x + -> Subquery Scan on q1 + Output: q1.x, q1.y + -> GroupAggregate + Output: sum(xc_groupby_tab1.val), (xc_groupby_tab1.val2 % 2), xc_groupby_tab1.val2 + Group Key: xc_groupby_tab1.val2 + -> Sort + Output: xc_groupby_tab1.val2, xc_groupby_tab1.val + Sort Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val2, xc_groupby_tab1.val +(18 rows) + +-- group by without aggregate +select val2 from xc_groupby_tab1 group by val2 order by 1; + val2 +------ + 1 + 2 + 3 +(3 rows) + +explain (verbose true, costs false, nodes false) select val2 from xc_groupby_tab1 group by val2; + QUERY PLAN +------------------------------------------------------ + Remote Subquery Scan on all + Output: val2 + -> Group + Output: val2 + Group Key: xc_groupby_tab1.val2 + -> Sort + Output: val2 + Sort Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: val2 +(10 rows) + +select val + val2 from xc_groupby_tab1 group by val + val2 order by 1; + ?column? +---------- + 2 + 3 + 4 + 7 + 8 + 9 +(6 rows) + +explain (verbose true, costs false, nodes false) select val + val2 from xc_groupby_tab1 group by val + val2; + QUERY PLAN +------------------------------------------------------------------------ + Remote Subquery Scan on all + Output: (val + val2) + -> Group + Output: ((val + val2)) + Group Key: ((xc_groupby_tab1.val + xc_groupby_tab1.val2)) + -> Sort + Output: ((val + val2)) + Sort Key: ((xc_groupby_tab1.val + xc_groupby_tab1.val2)) + -> Seq Scan on public.xc_groupby_tab1 + Output: (val + val2) +(10 rows) + +select val + val2, val, val2 from xc_groupby_tab1 group by val, val2; + ?column? | val | val2 +----------+-----+------ + 2 | 1 | 1 + 4 | 1 | 3 + 3 | 2 | 1 + 4 | 2 | 2 + 4 | 3 | 1 + 7 | 4 | 3 + 8 | 6 | 2 + 9 | 6 | 3 +(8 rows) + +explain (verbose true, costs false, nodes false) select val + val2, val, val2 from xc_groupby_tab1 group by val, val2; + QUERY PLAN +------------------------------------------------------------------- + Remote Subquery Scan on all + Output: (val + val2), val, val2 + -> Group + Output: (val + val2), val, val2 + Group Key: xc_groupby_tab1.val, xc_groupby_tab1.val2 + -> Sort + Output: val, val2 + Sort Key: xc_groupby_tab1.val, xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: val, val2 +(10 rows) + +select xc_groupby_tab1.val + xc_groupby_tab2.val2, xc_groupby_tab1.val, xc_groupby_tab2.val2 from xc_groupby_tab1, xc_groupby_tab2 where xc_groupby_tab1.val = xc_groupby_tab2.val group by xc_groupby_tab1.val, xc_groupby_tab2.val2 order by 1; + ?column? 
| val | val2 +----------+-----+------ + 2 | 1 | 1 + 5 | 3 | 2 + 5 | 4 | 1 + 6 | 2 | 4 + 6 | 4 | 2 + 7 | 3 | 4 +(6 rows) + +explain (verbose true, costs false, nodes false) select xc_groupby_tab1.val + xc_groupby_tab2.val2, xc_groupby_tab1.val, xc_groupby_tab2.val2 from xc_groupby_tab1, xc_groupby_tab2 where xc_groupby_tab1.val = xc_groupby_tab2.val group by xc_groupby_tab1.val, xc_groupby_tab2.val2; + QUERY PLAN +--------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2), xc_groupby_tab1.val, xc_groupby_tab2.val2 + -> Group + Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2), xc_groupby_tab1.val, xc_groupby_tab2.val2 + Group Key: xc_groupby_tab1.val, xc_groupby_tab2.val2 + -> Sort + Output: xc_groupby_tab1.val, xc_groupby_tab2.val2 + Sort Key: xc_groupby_tab1.val, xc_groupby_tab2.val2 + -> Merge Join + Output: xc_groupby_tab1.val, xc_groupby_tab2.val2 + Merge Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) + -> Sort + Output: xc_groupby_tab1.val + Sort Key: xc_groupby_tab1.val + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val + -> Sort + Output: xc_groupby_tab2.val2, xc_groupby_tab2.val + Sort Key: xc_groupby_tab2.val + -> Seq Scan on public.xc_groupby_tab2 + Output: xc_groupby_tab2.val2, xc_groupby_tab2.val +(21 rows) + +select xc_groupby_tab1.val + xc_groupby_tab2.val2 from xc_groupby_tab1, xc_groupby_tab2 where xc_groupby_tab1.val = xc_groupby_tab2.val group by xc_groupby_tab1.val + xc_groupby_tab2.val2 order by 1; + ?column? +---------- + 2 + 5 + 6 + 7 +(4 rows) + +explain (verbose true, costs false, nodes false) select xc_groupby_tab1.val + xc_groupby_tab2.val2 from xc_groupby_tab1, xc_groupby_tab2 where xc_groupby_tab1.val = xc_groupby_tab2.val group by xc_groupby_tab1.val + xc_groupby_tab2.val2; + QUERY PLAN +----------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2) + -> Group + Output: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + Group Key: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + -> Sort + Output: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + Sort Key: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + -> Merge Join + Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2) + Merge Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) + -> Sort + Output: xc_groupby_tab1.val + Sort Key: xc_groupby_tab1.val + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val + -> Sort + Output: xc_groupby_tab2.val2, xc_groupby_tab2.val + Sort Key: xc_groupby_tab2.val + -> Seq Scan on public.xc_groupby_tab2 + Output: xc_groupby_tab2.val2, xc_groupby_tab2.val +(21 rows) + +-- group by with aggregates in expression +select count(*) + sum(val) + avg(val), val2 from xc_groupby_tab1 group by val2 order by 1; + ?column? 
| val2 +---------------------+------ + 11.0000000000000000 | 1 + 14.0000000000000000 | 2 + 17.6666666666666667 | 3 +(3 rows) + +explain (verbose true, costs false, nodes false) select count(*) + sum(val) + avg(val), val2 from xc_groupby_tab1 group by val2; + QUERY PLAN +--------------------------------------------------------------------- + Remote Subquery Scan on all + Output: (((count(*) + sum(val)))::numeric + avg(val)), val2 + -> GroupAggregate + Output: (((count(*) + sum(val)))::numeric + avg(val)), val2 + Group Key: xc_groupby_tab1.val2 + -> Sort + Output: val2, val + Sort Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: val2, val +(10 rows) + +-- group by with expressions in group by clause +select sum(val), avg(val), 2 * val2 from xc_groupby_tab1 group by 2 * val2 order by 1; + sum | avg | ?column? +-----+--------------------+---------- + 6 | 2.0000000000000000 | 2 + 8 | 4.0000000000000000 | 4 + 11 | 3.6666666666666667 | 6 +(3 rows) + +explain (verbose true, costs false, nodes false) select sum(val), avg(val), 2 * val2 from xc_groupby_tab1 group by 2 * val2; + QUERY PLAN +------------------------------------------------------ + Remote Subquery Scan on all + Output: sum(val), avg(val), (2 * val2) + -> GroupAggregate + Output: sum(val), avg(val), ((2 * val2)) + Group Key: ((2 * xc_groupby_tab1.val2)) + -> Sort + Output: ((2 * val2)), val + Sort Key: ((2 * xc_groupby_tab1.val2)) + -> Seq Scan on public.xc_groupby_tab1 + Output: (2 * val2), val +(10 rows) + +drop table xc_groupby_tab1; +drop table xc_groupby_tab2; +-- some tests involving nulls, characters, float type etc. +create table xc_groupby_def(a int, b varchar(25)) distribute by replication; +insert into xc_groupby_def VALUES (NULL, NULL); +insert into xc_groupby_def VALUES (1, NULL); +insert into xc_groupby_def VALUES (NULL, 'One'); +insert into xc_groupby_def VALUES (2, 'Two'); +insert into xc_groupby_def VALUES (2, 'Two'); +insert into xc_groupby_def VALUES (3, 'Three'); +insert into xc_groupby_def VALUES (4, 'Three'); +insert into xc_groupby_def VALUES (5, 'Three'); +insert into xc_groupby_def VALUES (6, 'Two'); +insert into xc_groupby_def VALUES (7, NULL); +insert into xc_groupby_def VALUES (8, 'Two'); +insert into xc_groupby_def VALUES (9, 'Three'); +insert into xc_groupby_def VALUES (10, 'Three'); +select a,count(a) from xc_groupby_def group by a order by a; + a | count +----+------- + 1 | 1 + 2 | 2 + 3 | 1 + 4 | 1 + 5 | 1 + 6 | 1 + 7 | 1 + 8 | 1 + 9 | 1 + 10 | 1 + | 0 +(11 rows) + +explain (verbose true, costs false, nodes false) select a,count(a) from xc_groupby_def group by a order by a; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all + Output: a, count(a) + -> GroupAggregate + Output: a, count(a) + Group Key: xc_groupby_def.a + -> Sort + Output: a + Sort Key: xc_groupby_def.a + -> Seq Scan on public.xc_groupby_def + Output: a +(10 rows) + +select avg(a) from xc_groupby_def group by a; + avg +------------------------ + 1.00000000000000000000 + 2.0000000000000000 + 3.0000000000000000 + 4.0000000000000000 + 5.0000000000000000 + 6.0000000000000000 + 7.0000000000000000 + 8.0000000000000000 + 9.0000000000000000 + 10.0000000000000000 + +(11 rows) + +explain (verbose true, costs false, nodes false) select avg(a) from xc_groupby_def group by a; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all + Output: avg(a), a + -> GroupAggregate + Output: avg(a), a + Group Key: xc_groupby_def.a + -> Sort + Output: a 
+ Sort Key: xc_groupby_def.a + -> Seq Scan on public.xc_groupby_def + Output: a +(10 rows) + +select avg(a) from xc_groupby_def group by a; + avg +------------------------ + 1.00000000000000000000 + 2.0000000000000000 + 3.0000000000000000 + 4.0000000000000000 + 5.0000000000000000 + 6.0000000000000000 + 7.0000000000000000 + 8.0000000000000000 + 9.0000000000000000 + 10.0000000000000000 + +(11 rows) + +explain (verbose true, costs false, nodes false) select avg(a) from xc_groupby_def group by a; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all + Output: avg(a), a + -> GroupAggregate + Output: avg(a), a + Group Key: xc_groupby_def.a + -> Sort + Output: a + Sort Key: xc_groupby_def.a + -> Seq Scan on public.xc_groupby_def + Output: a +(10 rows) + +select avg(a) from xc_groupby_def group by b order by 1; + avg +-------------------- + 4.0000000000000000 + 4.5000000000000000 + 6.2000000000000000 + +(4 rows) + +explain (verbose true, costs false, nodes false) select avg(a) from xc_groupby_def group by b; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all + Output: avg(a), b + -> GroupAggregate + Output: avg(a), b + Group Key: xc_groupby_def.b + -> Sort + Output: b, a + Sort Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: b, a +(10 rows) + +select sum(a) from xc_groupby_def group by b order by 1; + sum +----- + 8 + 18 + 31 + +(4 rows) + +explain (verbose true, costs false, nodes false) select sum(a) from xc_groupby_def group by b; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all + Output: sum(a), b + -> GroupAggregate + Output: sum(a), b + Group Key: xc_groupby_def.b + -> Sort + Output: b, a + Sort Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: b, a +(10 rows) + +select count(*) from xc_groupby_def group by b order by 1; + count +------- + 1 + 3 + 4 + 5 +(4 rows) + +explain (verbose true, costs false, nodes false) select count(*) from xc_groupby_def group by b; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all + Output: count(*), b + -> GroupAggregate + Output: count(*), b + Group Key: xc_groupby_def.b + -> Sort + Output: b + Sort Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: b +(10 rows) + +select count(*) from xc_groupby_def where a is not null group by a; + count +------- + 1 + 2 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 +(10 rows) + +explain (verbose true, costs false, nodes false) select count(*) from xc_groupby_def where a is not null group by a; + QUERY PLAN +------------------------------------------------------------ + Remote Subquery Scan on all + Output: count(*), a + -> GroupAggregate + Output: count(*), a + Group Key: xc_groupby_def.a + -> Sort + Output: a + Sort Key: xc_groupby_def.a + -> Seq Scan on public.xc_groupby_def + Output: a + Filter: (xc_groupby_def.a IS NOT NULL) +(11 rows) + +select b from xc_groupby_def group by b order by 1; + b +------- + One + Three + Two + +(4 rows) + +explain (verbose true, costs false, nodes false) select b from xc_groupby_def group by b; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all + Output: b + -> Group + Output: b + Group Key: xc_groupby_def.b + -> Sort + Output: b + Sort Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: b +(10 rows) + +select b,count(b) from xc_groupby_def group by b order by 1; + b | count +-------+------- 
+ One | 1 + Three | 5 + Two | 4 + | 0 +(4 rows) + +explain (verbose true, costs false, nodes false) select b,count(b) from xc_groupby_def group by b; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all + Output: b, count(b) + -> GroupAggregate + Output: b, count(b) + Group Key: xc_groupby_def.b + -> Sort + Output: b + Sort Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: b +(10 rows) + +select count(*) from xc_groupby_def where b is null group by b; + count +------- + 3 +(1 row) + +explain (verbose true, costs false, nodes false) select count(*) from xc_groupby_def where b is null group by b; + QUERY PLAN +-------------------------------------------------------- + Remote Subquery Scan on all + Output: count(*), b + -> GroupAggregate + Output: count(*), b + Group Key: xc_groupby_def.b + -> Sort + Output: b + Sort Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: b + Filter: (xc_groupby_def.b IS NULL) +(11 rows) + +create table xc_groupby_g(a int, b float, c numeric) distribute by replication; +insert into xc_groupby_g values(1,2.1,3.2); +insert into xc_groupby_g values(1,2.1,3.2); +insert into xc_groupby_g values(2,2.3,5.2); +select sum(a) from xc_groupby_g group by a; + sum +----- + 2 + 2 +(2 rows) + +explain (verbose true, costs false, nodes false) select sum(a) from xc_groupby_g group by a; + QUERY PLAN +--------------------------------------------------- + Remote Subquery Scan on all + Output: sum(a), a + -> GroupAggregate + Output: sum(a), a + Group Key: xc_groupby_g.a + -> Sort + Output: a + Sort Key: xc_groupby_g.a + -> Seq Scan on public.xc_groupby_g + Output: a +(10 rows) + +select sum(b) from xc_groupby_g group by b; + sum +----- + 4.2 + 2.3 +(2 rows) + +explain (verbose true, costs false, nodes false) select sum(b) from xc_groupby_g group by b; + QUERY PLAN +--------------------------------------------------- + Remote Subquery Scan on all + Output: sum(b), b + -> GroupAggregate + Output: sum(b), b + Group Key: xc_groupby_g.b + -> Sort + Output: b + Sort Key: xc_groupby_g.b + -> Seq Scan on public.xc_groupby_g + Output: b +(10 rows) + +select sum(c) from xc_groupby_g group by b; + sum +----- + 6.4 + 5.2 +(2 rows) + +explain (verbose true, costs false, nodes false) select sum(c) from xc_groupby_g group by b; + QUERY PLAN +--------------------------------------------------- + Remote Subquery Scan on all + Output: sum(c), b + -> GroupAggregate + Output: sum(c), b + Group Key: xc_groupby_g.b + -> Sort + Output: b, c + Sort Key: xc_groupby_g.b + -> Seq Scan on public.xc_groupby_g + Output: b, c +(10 rows) + +select avg(a) from xc_groupby_g group by b; + avg +------------------------ + 1.00000000000000000000 + 2.0000000000000000 +(2 rows) + +explain (verbose true, costs false, nodes false) select avg(a) from xc_groupby_g group by b; + QUERY PLAN +--------------------------------------------------- + Remote Subquery Scan on all + Output: avg(a), b + -> GroupAggregate + Output: avg(a), b + Group Key: xc_groupby_g.b + -> Sort + Output: b, a + Sort Key: xc_groupby_g.b + -> Seq Scan on public.xc_groupby_g + Output: b, a +(10 rows) + +select avg(b) from xc_groupby_g group by c; + avg +----- + 2.1 + 2.3 +(2 rows) + +explain (verbose true, costs false, nodes false) select avg(b) from xc_groupby_g group by c; + QUERY PLAN +--------------------------------------------------- + Remote Subquery Scan on all + Output: avg(b), c + -> GroupAggregate + Output: avg(b), c + Group Key: xc_groupby_g.c + -> Sort + 
Output: c, b + Sort Key: xc_groupby_g.c + -> Seq Scan on public.xc_groupby_g + Output: c, b +(10 rows) + +select avg(c) from xc_groupby_g group by c; + avg +-------------------- + 3.2000000000000000 + 5.2000000000000000 +(2 rows) + +explain (verbose true, costs false, nodes false) select avg(c) from xc_groupby_g group by c; + QUERY PLAN +--------------------------------------------------- + Remote Subquery Scan on all + Output: avg(c), c + -> GroupAggregate + Output: avg(c), c + Group Key: xc_groupby_g.c + -> Sort + Output: c + Sort Key: xc_groupby_g.c + -> Seq Scan on public.xc_groupby_g + Output: c +(10 rows) + +drop table xc_groupby_def; +drop table xc_groupby_g; +reset enable_hashagg; +reset enable_fast_query_shipping; +-- Now repeat all the tests with FQS turned on +set enable_fast_query_shipping to on; +-- Combination 1: enable_hashagg on and distributed tables +set enable_hashagg to on; +-- create required tables and fill them with data +create table xc_groupby_tab1 (val int, val2 int); +create table xc_groupby_tab2 (val int, val2 int); +insert into xc_groupby_tab1 values (1, 1), (2, 1), (3, 1), (2, 2), (6, 2), (4, 3), (1, 3), (6, 3); +insert into xc_groupby_tab2 values (1, 1), (4, 1), (8, 1), (2, 4), (9, 4), (3, 4), (4, 2), (5, 2), (3, 2); +select count(*), sum(val), avg(val), sum(val)::float8/count(*), val2 from xc_groupby_tab1 group by val2 order by 1, 2; + count | sum | avg | ?column? | val2 +-------+-----+--------------------+------------------+------ + 2 | 8 | 4.0000000000000000 | 4 | 2 + 3 | 6 | 2.0000000000000000 | 2 | 1 + 3 | 11 | 3.6666666666666667 | 3.66666666666667 | 3 +(3 rows) + +explain (verbose true, costs false, nodes false) select count(*), sum(val), avg(val), sum(val)::float8/count(*), val2 from xc_groupby_tab1 group by val2 order by 1, 2; + QUERY PLAN +--------------------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: count(*), sum(val), avg(val), ((sum(val))::double precision / (count(*))::double precision), val2 + Sort Key: count(*), sum(xc_groupby_tab1.val) + -> Sort + Output: (count(*)), (sum(val)), (avg(val)), (((sum(val))::double precision / (count(*))::double precision)), val2 + Sort Key: (count(*)), (sum(xc_groupby_tab1.val)) + -> Finalize HashAggregate + Output: count(*), sum(val), avg(val), ((sum(val))::double precision / (count(*))::double precision), val2 + Group Key: xc_groupby_tab1.val2 + -> Remote Subquery Scan on all + Output: val2, PARTIAL count(*), PARTIAL sum(val), PARTIAL avg(val) + Distribute results by H: val2 + -> Partial HashAggregate + Output: val2, PARTIAL count(*), PARTIAL sum(val), PARTIAL avg(val) + Group Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: val, val2 +(17 rows) + +explain (verbose true, costs false, nodes false) select count(*), sum(val), avg(val), sum(val)::float8/count(*), val2 from xc_groupby_tab1 group by val2; + QUERY PLAN +------------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: count(*), sum(val), avg(val), ((sum(val))::double precision / (count(*))::double precision), val2 + -> Finalize HashAggregate + Output: count(*), sum(val), avg(val), ((sum(val))::double precision / (count(*))::double precision), val2 + Group Key: xc_groupby_tab1.val2 + -> Remote Subquery Scan on all + Output: val2, PARTIAL count(*), PARTIAL sum(val), PARTIAL avg(val) + Distribute results by H: val2 + -> Partial 
HashAggregate + Output: val2, PARTIAL count(*), PARTIAL sum(val), PARTIAL avg(val) + Group Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: val, val2 +(13 rows) + +-- joins and group by +select count(*), sum(xc_groupby_tab1.val * xc_groupby_tab2.val), avg(xc_groupby_tab1.val*xc_groupby_tab2.val), sum(xc_groupby_tab1.val*xc_groupby_tab2.val)::float8/count(*), xc_groupby_tab1.val2, xc_groupby_tab2.val2 from xc_groupby_tab1 full outer join xc_groupby_tab2 on xc_groupby_tab1.val2 = xc_groupby_tab2.val2 group by xc_groupby_tab1.val2, xc_groupby_tab2.val2 order by count(*); + count | sum | avg | ?column? | val2 | val2 +-------+-----+---------------------+------------------+------+------ + 3 | | | | | 4 + 3 | | | | 3 | + 6 | 96 | 16.0000000000000000 | 16 | 2 | 2 + 9 | 78 | 8.6666666666666667 | 8.66666666666667 | 1 | 1 +(4 rows) + +explain (verbose true, costs false, nodes false) select count(*), sum(xc_groupby_tab1.val * xc_groupby_tab2.val), avg(xc_groupby_tab1.val*xc_groupby_tab2.val), sum(xc_groupby_tab1.val*xc_groupby_tab2.val)::float8/count(*), xc_groupby_tab1.val2, xc_groupby_tab2.val2 from xc_groupby_tab1 full outer join xc_groupby_tab2 on xc_groupby_tab1.val2 = xc_groupby_tab2.val2 group by xc_groupby_tab1.val2, xc_groupby_tab2.val2; + QUERY PLAN +--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: count(*), sum((xc_groupby_tab1.val * xc_groupby_tab2.val)), avg((xc_groupby_tab1.val * xc_groupby_tab2.val)), ((sum((xc_groupby_tab1.val * xc_groupby_tab2.val)))::double precision / (count(*))::double precision), xc_groupby_tab1.val2, xc_groupby_tab2.val2 + -> Finalize HashAggregate + Output: count(*), sum((xc_groupby_tab1.val * xc_groupby_tab2.val)), avg((xc_groupby_tab1.val * xc_groupby_tab2.val)), ((sum((xc_groupby_tab1.val * xc_groupby_tab2.val)))::double precision / (count(*))::double precision), xc_groupby_tab1.val2, xc_groupby_tab2.val2 + Group Key: xc_groupby_tab1.val2, xc_groupby_tab2.val2 + -> Remote Subquery Scan on all + Output: xc_groupby_tab1.val2, xc_groupby_tab2.val2, PARTIAL count(*), PARTIAL sum((xc_groupby_tab1.val * xc_groupby_tab2.val)), PARTIAL avg((xc_groupby_tab1.val * xc_groupby_tab2.val)) + Distribute results by H: val2 + -> Partial HashAggregate + Output: xc_groupby_tab1.val2, xc_groupby_tab2.val2, PARTIAL count(*), PARTIAL sum((xc_groupby_tab1.val * xc_groupby_tab2.val)), PARTIAL avg((xc_groupby_tab1.val * xc_groupby_tab2.val)) + Group Key: xc_groupby_tab1.val2, xc_groupby_tab2.val2 + -> Hash Full Join + Output: xc_groupby_tab1.val2, xc_groupby_tab2.val2, xc_groupby_tab1.val, xc_groupby_tab2.val + Hash Cond: (xc_groupby_tab1.val2 = xc_groupby_tab2.val2) + -> Remote Subquery Scan on all + Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 + Distribute results by H: val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 + -> Hash + Output: xc_groupby_tab2.val, xc_groupby_tab2.val2 + -> Remote Subquery Scan on all + Output: xc_groupby_tab2.val, xc_groupby_tab2.val2 + Distribute results by H: val2 + -> Seq Scan on public.xc_groupby_tab2 + Output: xc_groupby_tab2.val, xc_groupby_tab2.val2 +(26 rows) + +-- aggregates over aggregates +select sum(y) from (select sum(val) y, val2%2 x from xc_groupby_tab1 group by val2) q1 group by x order by 1; + sum 
+----- + 8 + 17 +(2 rows) + +explain (verbose true, costs false, nodes false) select sum(y) from (select sum(val) y, val2%2 x from xc_groupby_tab1 group by val2) q1 group by x order by 1; + QUERY PLAN +-------------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: sum(y), x + Sort Key: sum(y) + -> Sort + Output: (sum(y)), x + Sort Key: (sum(y)) + -> Finalize HashAggregate + Output: sum(y), x + Group Key: x + -> Remote Subquery Scan on all + Output: x, PARTIAL sum(y) + Distribute results by H: x + -> Partial HashAggregate + Output: ((xc_groupby_tab1.val2 % 2)), PARTIAL sum((sum(xc_groupby_tab1.val))) + Group Key: (xc_groupby_tab1.val2 % 2) + -> Finalize HashAggregate + Output: sum(xc_groupby_tab1.val), (xc_groupby_tab1.val2 % 2), xc_groupby_tab1.val2 + Group Key: xc_groupby_tab1.val2 + -> Remote Subquery Scan on all + Output: xc_groupby_tab1.val2, PARTIAL sum(xc_groupby_tab1.val) + Distribute results by H: val2 + -> Partial HashAggregate + Output: xc_groupby_tab1.val2, PARTIAL sum(xc_groupby_tab1.val) + Group Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 +(26 rows) + +explain (verbose true, costs false, nodes false) select sum(y) from (select sum(val) y, val2%2 x from xc_groupby_tab1 group by val2) q1 group by x; + QUERY PLAN +-------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: sum(y), x + -> Finalize HashAggregate + Output: sum(y), x + Group Key: x + -> Remote Subquery Scan on all + Output: x, PARTIAL sum(y) + Distribute results by H: x + -> Partial HashAggregate + Output: ((xc_groupby_tab1.val2 % 2)), PARTIAL sum((sum(xc_groupby_tab1.val))) + Group Key: (xc_groupby_tab1.val2 % 2) + -> Finalize HashAggregate + Output: sum(xc_groupby_tab1.val), (xc_groupby_tab1.val2 % 2), xc_groupby_tab1.val2 + Group Key: xc_groupby_tab1.val2 + -> Remote Subquery Scan on all + Output: xc_groupby_tab1.val2, PARTIAL sum(xc_groupby_tab1.val) + Distribute results by H: val2 + -> Partial HashAggregate + Output: xc_groupby_tab1.val2, PARTIAL sum(xc_groupby_tab1.val) + Group Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 +(22 rows) + +-- group by without aggregate +select val2 from xc_groupby_tab1 group by val2 order by 1; + val2 +------ + 1 + 2 + 3 +(3 rows) + +explain (verbose true, costs false, nodes false) select val2 from xc_groupby_tab1 group by val2 order by 1; + QUERY PLAN +------------------------------------------------------------------ + Remote Subquery Scan on all + Output: val2 + Sort Key: xc_groupby_tab1.val2 + -> Finalize GroupAggregate + Output: val2 + Group Key: xc_groupby_tab1.val2 + -> Sort + Output: val2 + Sort Key: xc_groupby_tab1.val2 + -> Remote Subquery Scan on all + Output: val2 + Distribute results by H: val2 + -> Partial HashAggregate + Output: val2 + Group Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: val, val2 +(17 rows) + +explain (verbose true, costs false, nodes false) select val2 from xc_groupby_tab1 group by val2; + QUERY PLAN +------------------------------------------------------------ + Remote Subquery Scan on all + Output: val2 + -> Finalize HashAggregate + Output: val2 + Group Key: xc_groupby_tab1.val2 + -> Remote Subquery Scan on all + Output: val2 + Distribute results by H: val2 + -> Partial HashAggregate + 
Output: val2 + Group Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: val, val2 +(13 rows) + +select val + val2 from xc_groupby_tab1 group by val + val2 order by 1; + ?column? +---------- + 2 + 3 + 4 + 7 + 8 + 9 +(6 rows) + +explain (verbose true, costs false, nodes false) select val + val2 from xc_groupby_tab1 group by val + val2 order by 1; + QUERY PLAN +----------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: (val + val2) + Sort Key: (xc_groupby_tab1.val + xc_groupby_tab1.val2) + -> Finalize GroupAggregate + Output: ((val + val2)) + Group Key: ((xc_groupby_tab1.val + xc_groupby_tab1.val2)) + -> Sort + Output: ((val + val2)) + Sort Key: ((xc_groupby_tab1.val + xc_groupby_tab1.val2)) + -> Remote Subquery Scan on all + Output: (val + val2) + Distribute results by H: (val + val2) + -> Partial HashAggregate + Output: ((val + val2)) + Group Key: (xc_groupby_tab1.val + xc_groupby_tab1.val2) + -> Seq Scan on public.xc_groupby_tab1 + Output: (val + val2) +(17 rows) + +explain (verbose true, costs false, nodes false) select val + val2 from xc_groupby_tab1 group by val + val2; + QUERY PLAN +----------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: (val + val2) + -> Finalize HashAggregate + Output: ((val + val2)) + Group Key: (xc_groupby_tab1.val + xc_groupby_tab1.val2) + -> Remote Subquery Scan on all + Output: (val + val2) + Distribute results by H: (val + val2) + -> Partial HashAggregate + Output: ((val + val2)) + Group Key: (xc_groupby_tab1.val + xc_groupby_tab1.val2) + -> Seq Scan on public.xc_groupby_tab1 + Output: (val + val2) +(13 rows) + +select val + val2, val, val2 from xc_groupby_tab1 group by val, val2 order by 1, 2; + ?column? | val | val2 +----------+-----+------ + 2 | 1 | 1 + 3 | 2 | 1 + 4 | 1 | 3 + 4 | 2 | 2 + 4 | 3 | 1 + 7 | 4 | 3 + 8 | 6 | 2 + 9 | 6 | 3 +(8 rows) + +explain (verbose true, costs false, nodes false) select val + val2, val, val2 from xc_groupby_tab1 group by val, val2 order by 1, 2; + QUERY PLAN +--------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: (val + val2), val, val2 + Sort Key: (xc_groupby_tab1.val + xc_groupby_tab1.val2), xc_groupby_tab1.val + -> Sort + Output: ((val + val2)), val, val2 + Sort Key: ((xc_groupby_tab1.val + xc_groupby_tab1.val2)), xc_groupby_tab1.val + -> HashAggregate + Output: (val + val2), val, val2 + Group Key: xc_groupby_tab1.val, xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: val, val2 +(11 rows) + +explain (verbose true, costs false, nodes false) select val + val2, val, val2 from xc_groupby_tab1 group by val, val2; + QUERY PLAN +--------------------------------------------------------------------------------------------------- + Remote Fast Query Execution + Output: (xc_groupby_tab1.val + xc_groupby_tab1.val2), xc_groupby_tab1.val, xc_groupby_tab1.val2 + Remote query: SELECT (val + val2), val, val2 FROM xc_groupby_tab1 GROUP BY val, val2 + -> HashAggregate + Output: (val + val2), val, val2 + Group Key: xc_groupby_tab1.val, xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: val, val2 +(8 rows) + +select xc_groupby_tab1.val + xc_groupby_tab2.val2, xc_groupby_tab1.val, xc_groupby_tab2.val2 from xc_groupby_tab1, xc_groupby_tab2 where xc_groupby_tab1.val = xc_groupby_tab2.val group by xc_groupby_tab1.val, xc_groupby_tab2.val2 order by val, val2; + ?column? 
| val | val2 +----------+-----+------ + 2 | 1 | 1 + 6 | 2 | 4 + 5 | 3 | 2 + 7 | 3 | 4 + 5 | 4 | 1 + 6 | 4 | 2 +(6 rows) + +explain (verbose true, costs false, nodes false) select xc_groupby_tab1.val + xc_groupby_tab2.val2, xc_groupby_tab1.val, xc_groupby_tab2.val2 from xc_groupby_tab1, xc_groupby_tab2 where xc_groupby_tab1.val = xc_groupby_tab2.val group by xc_groupby_tab1.val, xc_groupby_tab2.val2 order by val, val2; + QUERY PLAN +--------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2), xc_groupby_tab1.val, xc_groupby_tab2.val2 + Sort Key: xc_groupby_tab1.val, xc_groupby_tab2.val2 + -> Group + Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2), xc_groupby_tab1.val, xc_groupby_tab2.val2 + Group Key: xc_groupby_tab1.val, xc_groupby_tab2.val2 + -> Sort + Output: xc_groupby_tab1.val, xc_groupby_tab2.val2 + Sort Key: xc_groupby_tab1.val, xc_groupby_tab2.val2 + -> Merge Join + Output: xc_groupby_tab1.val, xc_groupby_tab2.val2 + Merge Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) + -> Sort + Output: xc_groupby_tab1.val + Sort Key: xc_groupby_tab1.val + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val + -> Sort + Output: xc_groupby_tab2.val2, xc_groupby_tab2.val + Sort Key: xc_groupby_tab2.val + -> Seq Scan on public.xc_groupby_tab2 + Output: xc_groupby_tab2.val2, xc_groupby_tab2.val +(22 rows) + +explain (verbose true, costs false, nodes false) select xc_groupby_tab1.val + xc_groupby_tab2.val2, xc_groupby_tab1.val, xc_groupby_tab2.val2 from xc_groupby_tab1, xc_groupby_tab2 where xc_groupby_tab1.val = xc_groupby_tab2.val group by xc_groupby_tab1.val, xc_groupby_tab2.val2; + QUERY PLAN +----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + Remote Fast Query Execution + Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2), xc_groupby_tab1.val, xc_groupby_tab2.val2 + Remote query: SELECT (xc_groupby_tab1.val + xc_groupby_tab2.val2), xc_groupby_tab1.val, xc_groupby_tab2.val2 FROM xc_groupby_tab1, xc_groupby_tab2 WHERE (xc_groupby_tab1.val = xc_groupby_tab2.val) GROUP BY xc_groupby_tab1.val, xc_groupby_tab2.val2 + -> HashAggregate + Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2), xc_groupby_tab1.val, xc_groupby_tab2.val2 + Group Key: xc_groupby_tab1.val, xc_groupby_tab2.val2 + -> Merge Join + Output: xc_groupby_tab1.val, xc_groupby_tab2.val2 + Merge Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) + -> Sort + Output: xc_groupby_tab1.val + Sort Key: xc_groupby_tab1.val + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val + -> Sort + Output: xc_groupby_tab2.val2, xc_groupby_tab2.val + Sort Key: xc_groupby_tab2.val + -> Seq Scan on public.xc_groupby_tab2 + Output: xc_groupby_tab2.val2, xc_groupby_tab2.val +(19 rows) + +select xc_groupby_tab1.val + xc_groupby_tab2.val2 from xc_groupby_tab1, xc_groupby_tab2 where xc_groupby_tab1.val = xc_groupby_tab2.val group by xc_groupby_tab1.val + xc_groupby_tab2.val2 order by 1; + ?column? 
+---------- + 2 + 5 + 6 + 7 +(4 rows) + +explain (verbose true, costs false, nodes false) select xc_groupby_tab1.val + xc_groupby_tab2.val2 from xc_groupby_tab1, xc_groupby_tab2 where xc_groupby_tab1.val = xc_groupby_tab2.val group by xc_groupby_tab1.val + xc_groupby_tab2.val2 order by 1; + QUERY PLAN +----------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2) + Sort Key: (xc_groupby_tab1.val + xc_groupby_tab2.val2) + -> Finalize GroupAggregate + Output: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + Group Key: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + -> Sort + Output: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + Sort Key: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + -> Remote Subquery Scan on all + Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2) + Distribute results by H: (val + val2) + -> Partial HashAggregate + Output: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + Group Key: (xc_groupby_tab1.val + xc_groupby_tab2.val2) + -> Merge Join + Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2) + Merge Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) + -> Sort + Output: xc_groupby_tab1.val + Sort Key: xc_groupby_tab1.val + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val + -> Sort + Output: xc_groupby_tab2.val2, xc_groupby_tab2.val + Sort Key: xc_groupby_tab2.val + -> Seq Scan on public.xc_groupby_tab2 + Output: xc_groupby_tab2.val2, xc_groupby_tab2.val +(28 rows) + +explain (verbose true, costs false, nodes false) select xc_groupby_tab1.val + xc_groupby_tab2.val2 from xc_groupby_tab1, xc_groupby_tab2 where xc_groupby_tab1.val = xc_groupby_tab2.val group by xc_groupby_tab1.val + xc_groupby_tab2.val2; + QUERY PLAN +----------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2) + -> Finalize HashAggregate + Output: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + Group Key: (xc_groupby_tab1.val + xc_groupby_tab2.val2) + -> Remote Subquery Scan on all + Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2) + Distribute results by H: (val + val2) + -> Partial HashAggregate + Output: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + Group Key: (xc_groupby_tab1.val + xc_groupby_tab2.val2) + -> Merge Join + Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2) + Merge Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) + -> Sort + Output: xc_groupby_tab1.val + Sort Key: xc_groupby_tab1.val + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val + -> Sort + Output: xc_groupby_tab2.val2, xc_groupby_tab2.val + Sort Key: xc_groupby_tab2.val + -> Seq Scan on public.xc_groupby_tab2 + Output: xc_groupby_tab2.val2, xc_groupby_tab2.val +(24 rows) + +-- group by with aggregates in expression +select count(*) + sum(val) + avg(val), val2 from xc_groupby_tab1 group by val2 order by val2; + ?column? 
| val2 +---------------------+------ + 11.0000000000000000 | 1 + 14.0000000000000000 | 2 + 17.6666666666666667 | 3 +(3 rows) + +explain (verbose true, costs false, nodes false) select count(*) + sum(val) + avg(val), val2 from xc_groupby_tab1 group by val2 order by val2; + QUERY PLAN +---------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: (((count(*) + sum(val)))::numeric + avg(val)), val2 + Sort Key: xc_groupby_tab1.val2 + -> Finalize GroupAggregate + Output: (((count(*) + sum(val)))::numeric + avg(val)), val2 + Group Key: xc_groupby_tab1.val2 + -> Sort + Output: val2, (PARTIAL count(*)), (PARTIAL sum(val)), (PARTIAL avg(val)) + Sort Key: xc_groupby_tab1.val2 + -> Remote Subquery Scan on all + Output: val2, PARTIAL count(*), PARTIAL sum(val), PARTIAL avg(val) + Distribute results by H: val2 + -> Partial HashAggregate + Output: val2, PARTIAL count(*), PARTIAL sum(val), PARTIAL avg(val) + Group Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: val, val2 +(17 rows) + +explain (verbose true, costs false, nodes false) select count(*) + sum(val) + avg(val), val2 from xc_groupby_tab1 group by val2; + QUERY PLAN +---------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: (((count(*) + sum(val)))::numeric + avg(val)), val2 + -> Finalize HashAggregate + Output: (((count(*) + sum(val)))::numeric + avg(val)), val2 + Group Key: xc_groupby_tab1.val2 + -> Remote Subquery Scan on all + Output: val2, PARTIAL count(*), PARTIAL sum(val), PARTIAL avg(val) + Distribute results by H: val2 + -> Partial HashAggregate + Output: val2, PARTIAL count(*), PARTIAL sum(val), PARTIAL avg(val) + Group Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: val, val2 +(13 rows) + +-- group by with expressions in group by clause +select sum(val), avg(val), 2 * val2 from xc_groupby_tab1 group by 2 * val2 order by 2 * val2; + sum | avg | ?column? 
+-----+--------------------+---------- + 6 | 2.0000000000000000 | 2 + 8 | 4.0000000000000000 | 4 + 11 | 3.6666666666666667 | 6 +(3 rows) + +explain (verbose true, costs false, nodes false) select sum(val), avg(val), 2 * val2 from xc_groupby_tab1 group by 2 * val2 order by 2 * val2; + QUERY PLAN +------------------------------------------------------------------------------------ + Remote Subquery Scan on all + Output: sum(val), avg(val), (2 * val2) + Sort Key: (2 * xc_groupby_tab1.val2) + -> Finalize GroupAggregate + Output: sum(val), avg(val), ((2 * val2)) + Group Key: ((2 * xc_groupby_tab1.val2)) + -> Sort + Output: ((2 * val2)), (PARTIAL sum(val)), (PARTIAL avg(val)) + Sort Key: ((2 * xc_groupby_tab1.val2)) + -> Remote Subquery Scan on all + Output: (2 * val2), PARTIAL sum(val), PARTIAL avg(val) + Distribute results by H: (2 * val2) + -> Partial HashAggregate + Output: ((2 * val2)), PARTIAL sum(val), PARTIAL avg(val) + Group Key: (2 * xc_groupby_tab1.val2) + -> Seq Scan on public.xc_groupby_tab1 + Output: (2 * val2), val +(17 rows) + +explain (verbose true, costs false, nodes false) select sum(val), avg(val), 2 * val2 from xc_groupby_tab1 group by 2 * val2; + QUERY PLAN +------------------------------------------------------------------------------ + Remote Subquery Scan on all + Output: sum(val), avg(val), (2 * val2) + -> Finalize HashAggregate + Output: sum(val), avg(val), ((2 * val2)) + Group Key: (2 * xc_groupby_tab1.val2) + -> Remote Subquery Scan on all + Output: (2 * val2), PARTIAL sum(val), PARTIAL avg(val) + Distribute results by H: (2 * val2) + -> Partial HashAggregate + Output: ((2 * val2)), PARTIAL sum(val), PARTIAL avg(val) + Group Key: (2 * xc_groupby_tab1.val2) + -> Seq Scan on public.xc_groupby_tab1 + Output: (2 * val2), val +(13 rows) + +drop table xc_groupby_tab1; +drop table xc_groupby_tab2; +-- some tests involving nulls, characters, float type etc. 
+create table xc_groupby_def(a int, b varchar(25)); +insert into xc_groupby_def VALUES (NULL, NULL); +insert into xc_groupby_def VALUES (1, NULL); +insert into xc_groupby_def VALUES (NULL, 'One'); +insert into xc_groupby_def VALUES (2, 'Two'); +insert into xc_groupby_def VALUES (2, 'Two'); +insert into xc_groupby_def VALUES (3, 'Three'); +insert into xc_groupby_def VALUES (4, 'Three'); +insert into xc_groupby_def VALUES (5, 'Three'); +insert into xc_groupby_def VALUES (6, 'Two'); +insert into xc_groupby_def VALUES (7, NULL); +insert into xc_groupby_def VALUES (8, 'Two'); +insert into xc_groupby_def VALUES (9, 'Three'); +insert into xc_groupby_def VALUES (10, 'Three'); +select a,count(a) from xc_groupby_def group by a order by a; + a | count +----+------- + 1 | 1 + 2 | 2 + 3 | 1 + 4 | 1 + 5 | 1 + 6 | 1 + 7 | 1 + 8 | 1 + 9 | 1 + 10 | 1 + | 0 +(11 rows) + +explain (verbose true, costs false, nodes false) select a,count(a) from xc_groupby_def group by a order by a; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all + Output: a, count(a) + Sort Key: xc_groupby_def.a + -> Sort + Output: a, (count(a)) + Sort Key: xc_groupby_def.a + -> HashAggregate + Output: a, count(a) + Group Key: xc_groupby_def.a + -> Seq Scan on public.xc_groupby_def + Output: a, b +(11 rows) + +select avg(a) from xc_groupby_def group by a order by 1; + avg +------------------------ + 1.00000000000000000000 + 2.0000000000000000 + 3.0000000000000000 + 4.0000000000000000 + 5.0000000000000000 + 6.0000000000000000 + 7.0000000000000000 + 8.0000000000000000 + 9.0000000000000000 + 10.0000000000000000 + +(11 rows) + +explain (verbose true, costs false, nodes false) select avg(a) from xc_groupby_def group by a order by 1; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all + Output: avg(a), a + Sort Key: avg(xc_groupby_def.a) + -> Sort + Output: (avg(a)), a + Sort Key: (avg(xc_groupby_def.a)) + -> HashAggregate + Output: avg(a), a + Group Key: xc_groupby_def.a + -> Seq Scan on public.xc_groupby_def + Output: a, b +(11 rows) + +explain (verbose true, costs false, nodes false) select avg(a) from xc_groupby_def group by a; + QUERY PLAN +----------------------------------------------- + Remote Subquery Scan on all + Output: avg(a), a + -> HashAggregate + Output: avg(a), a + Group Key: xc_groupby_def.a + -> Seq Scan on public.xc_groupby_def + Output: a, b +(7 rows) + +select avg(a) from xc_groupby_def group by b order by 1; + avg +-------------------- + 4.0000000000000000 + 4.5000000000000000 + 6.2000000000000000 + +(4 rows) + +explain (verbose true, costs false, nodes false) select avg(a) from xc_groupby_def group by b order by 1; + QUERY PLAN +----------------------------------------------------------------- + Remote Subquery Scan on all + Output: avg(a), b + Sort Key: avg(xc_groupby_def.a) + -> Sort + Output: (avg(a)), b + Sort Key: (avg(xc_groupby_def.a)) + -> Finalize HashAggregate + Output: avg(a), b + Group Key: xc_groupby_def.b + -> Remote Subquery Scan on all + Output: b, PARTIAL avg(a) + Distribute results by H: b + -> Partial HashAggregate + Output: b, PARTIAL avg(a) + Group Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: a, b +(17 rows) + +explain (verbose true, costs false, nodes false) select avg(a) from xc_groupby_def group by b; + QUERY PLAN +----------------------------------------------------------- + Remote Subquery Scan on all + Output: avg(a), b + -> Finalize HashAggregate + Output: avg(a), b + Group Key: 
xc_groupby_def.b + -> Remote Subquery Scan on all + Output: b, PARTIAL avg(a) + Distribute results by H: b + -> Partial HashAggregate + Output: b, PARTIAL avg(a) + Group Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: a, b +(13 rows) + +select sum(a) from xc_groupby_def group by b order by 1; + sum +----- + 8 + 18 + 31 + +(4 rows) + +explain (verbose true, costs false, nodes false) select sum(a) from xc_groupby_def group by b order by 1; + QUERY PLAN +----------------------------------------------------------------- + Remote Subquery Scan on all + Output: sum(a), b + Sort Key: sum(xc_groupby_def.a) + -> Sort + Output: (sum(a)), b + Sort Key: (sum(xc_groupby_def.a)) + -> Finalize HashAggregate + Output: sum(a), b + Group Key: xc_groupby_def.b + -> Remote Subquery Scan on all + Output: b, PARTIAL sum(a) + Distribute results by H: b + -> Partial HashAggregate + Output: b, PARTIAL sum(a) + Group Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: a, b +(17 rows) + +explain (verbose true, costs false, nodes false) select sum(a) from xc_groupby_def group by b; + QUERY PLAN +----------------------------------------------------------- + Remote Subquery Scan on all + Output: sum(a), b + -> Finalize HashAggregate + Output: sum(a), b + Group Key: xc_groupby_def.b + -> Remote Subquery Scan on all + Output: b, PARTIAL sum(a) + Distribute results by H: b + -> Partial HashAggregate + Output: b, PARTIAL sum(a) + Group Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: a, b +(13 rows) + +select count(*) from xc_groupby_def group by b order by 1; + count +------- + 1 + 3 + 4 + 5 +(4 rows) + +explain (verbose true, costs false, nodes false) select count(*) from xc_groupby_def group by b order by 1; + QUERY PLAN +----------------------------------------------------------------- + Remote Subquery Scan on all + Output: count(*), b + Sort Key: count(*) + -> Sort + Output: (count(*)), b + Sort Key: (count(*)) + -> Finalize HashAggregate + Output: count(*), b + Group Key: xc_groupby_def.b + -> Remote Subquery Scan on all + Output: b, PARTIAL count(*) + Distribute results by H: b + -> Partial HashAggregate + Output: b, PARTIAL count(*) + Group Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: a, b +(17 rows) + +explain (verbose true, costs false, nodes false) select count(*) from xc_groupby_def group by b; + QUERY PLAN +----------------------------------------------------------- + Remote Subquery Scan on all + Output: count(*), b + -> Finalize HashAggregate + Output: count(*), b + Group Key: xc_groupby_def.b + -> Remote Subquery Scan on all + Output: b, PARTIAL count(*) + Distribute results by H: b + -> Partial HashAggregate + Output: b, PARTIAL count(*) + Group Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: a, b +(13 rows) + +select count(*) from xc_groupby_def where a is not null group by a order by 1; + count +------- + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 2 +(10 rows) + +explain (verbose true, costs false, nodes false) select count(*) from xc_groupby_def where a is not null group by a order by 1; + QUERY PLAN +------------------------------------------------------------ + Remote Subquery Scan on all + Output: count(*), a + Sort Key: count(*) + -> Sort + Output: (count(*)), a + Sort Key: (count(*)) + -> HashAggregate + Output: count(*), a + Group Key: xc_groupby_def.a + -> Seq Scan on public.xc_groupby_def + Output: a, b + Filter: (xc_groupby_def.a IS NOT NULL) +(12 rows) + +explain (verbose true, costs 
false, nodes false) select count(*) from xc_groupby_def where a is not null group by a; + QUERY PLAN +------------------------------------------------------ + Remote Subquery Scan on all + Output: count(*), a + -> HashAggregate + Output: count(*), a + Group Key: xc_groupby_def.a + -> Seq Scan on public.xc_groupby_def + Output: a, b + Filter: (xc_groupby_def.a IS NOT NULL) +(8 rows) + +select * from (select b from xc_groupby_def group by b) q order by q.b; + b +------- + One + Three + Two + +(4 rows) + +explain (verbose true, costs false, nodes false) select * from (select b from xc_groupby_def group by b) q order by q.b; + QUERY PLAN +---------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: b + Sort Key: b + -> Finalize GroupAggregate + Output: xc_groupby_def.b + Group Key: xc_groupby_def.b + -> Sort + Output: xc_groupby_def.b + Sort Key: xc_groupby_def.b + -> Remote Subquery Scan on all + Output: xc_groupby_def.b + Distribute results by H: b + -> Partial HashAggregate + Output: xc_groupby_def.b + Group Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: xc_groupby_def.a, xc_groupby_def.b +(17 rows) + +select * from (select b,count(b) from xc_groupby_def group by b) q order by q.b; + b | count +-------+------- + One | 1 + Three | 5 + Two | 4 + | 0 +(4 rows) + +explain (verbose true, costs false, nodes false) select * from (select b,count(b) from xc_groupby_def group by b) q order by q.b; + QUERY PLAN +------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: b, count + Sort Key: b + -> Finalize GroupAggregate + Output: xc_groupby_def.b, count(xc_groupby_def.b) + Group Key: xc_groupby_def.b + -> Sort + Output: xc_groupby_def.b, (PARTIAL count(xc_groupby_def.b)) + Sort Key: xc_groupby_def.b + -> Remote Subquery Scan on all + Output: xc_groupby_def.b, PARTIAL count(xc_groupby_def.b) + Distribute results by H: b + -> Partial HashAggregate + Output: xc_groupby_def.b, PARTIAL count(xc_groupby_def.b) + Group Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: xc_groupby_def.a, xc_groupby_def.b +(17 rows) + +select count(*) from xc_groupby_def where b is null group by b order by 1; + count +------- + 3 +(1 row) + +explain (verbose true, costs false, nodes false) select count(*) from xc_groupby_def where b is null group by b; + QUERY PLAN +-------------------------------------------------------------------- + Remote Subquery Scan on all + Output: count(*), b + -> Finalize GroupAggregate + Output: count(*), b + Group Key: xc_groupby_def.b + -> Sort + Output: b, (PARTIAL count(*)) + Sort Key: xc_groupby_def.b + -> Remote Subquery Scan on all + Output: b, PARTIAL count(*) + Distribute results by H: b + -> Partial HashAggregate + Output: b, PARTIAL count(*) + Group Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: a, b + Filter: (xc_groupby_def.b IS NULL) +(17 rows) + +create table xc_groupby_g(a int, b float, c numeric); +insert into xc_groupby_g values(1,2.1,3.2); +insert into xc_groupby_g values(1,2.1,3.2); +insert into xc_groupby_g values(2,2.3,5.2); +select sum(a) from xc_groupby_g group by a; + sum +----- + 2 + 2 +(2 rows) + +explain (verbose true, costs false, nodes false) select sum(a) from xc_groupby_g group by a; + QUERY PLAN +--------------------------------------------- + Remote Subquery Scan on all + Output: sum(a), a + -> HashAggregate + Output: sum(a), a + Group Key: xc_groupby_g.a + -> Seq Scan on 
public.xc_groupby_g + Output: a, b, c +(7 rows) + +select sum(b) from xc_groupby_g group by b order by 1; + sum +----- + 2.3 + 4.2 +(2 rows) + +explain (verbose true, costs false, nodes false) select sum(b) from xc_groupby_g group by b order by 1; + QUERY PLAN +--------------------------------------------------------------- + Remote Subquery Scan on all + Output: sum(b), b + Sort Key: sum(xc_groupby_g.b) + -> Sort + Output: (sum(b)), b + Sort Key: (sum(xc_groupby_g.b)) + -> Finalize HashAggregate + Output: sum(b), b + Group Key: xc_groupby_g.b + -> Remote Subquery Scan on all + Output: b, PARTIAL sum(b) + Distribute results by H: b + -> Partial HashAggregate + Output: b, PARTIAL sum(b) + Group Key: xc_groupby_g.b + -> Seq Scan on public.xc_groupby_g + Output: a, b, c +(17 rows) + +explain (verbose true, costs false, nodes false) select sum(b) from xc_groupby_g group by b; + QUERY PLAN +--------------------------------------------------------- + Remote Subquery Scan on all + Output: sum(b), b + -> Finalize HashAggregate + Output: sum(b), b + Group Key: xc_groupby_g.b + -> Remote Subquery Scan on all + Output: b, PARTIAL sum(b) + Distribute results by H: b + -> Partial HashAggregate + Output: b, PARTIAL sum(b) + Group Key: xc_groupby_g.b + -> Seq Scan on public.xc_groupby_g + Output: a, b, c +(13 rows) + +select sum(c) from xc_groupby_g group by b order by 1; + sum +----- + 5.2 + 6.4 +(2 rows) + +explain (verbose true, costs false, nodes false) select sum(c) from xc_groupby_g group by b order by 1; + QUERY PLAN +--------------------------------------------------------------- + Remote Subquery Scan on all + Output: sum(c), b + Sort Key: sum(xc_groupby_g.c) + -> Sort + Output: (sum(c)), b + Sort Key: (sum(xc_groupby_g.c)) + -> Finalize HashAggregate + Output: sum(c), b + Group Key: xc_groupby_g.b + -> Remote Subquery Scan on all + Output: b, PARTIAL sum(c) + Distribute results by H: b + -> Partial HashAggregate + Output: b, PARTIAL sum(c) + Group Key: xc_groupby_g.b + -> Seq Scan on public.xc_groupby_g + Output: a, b, c +(17 rows) + +explain (verbose true, costs false, nodes false) select sum(c) from xc_groupby_g group by b; + QUERY PLAN +--------------------------------------------------------- + Remote Subquery Scan on all + Output: sum(c), b + -> Finalize HashAggregate + Output: sum(c), b + Group Key: xc_groupby_g.b + -> Remote Subquery Scan on all + Output: b, PARTIAL sum(c) + Distribute results by H: b + -> Partial HashAggregate + Output: b, PARTIAL sum(c) + Group Key: xc_groupby_g.b + -> Seq Scan on public.xc_groupby_g + Output: a, b, c +(13 rows) + +select avg(a) from xc_groupby_g group by b order by 1; + avg +------------------------ + 1.00000000000000000000 + 2.0000000000000000 +(2 rows) + +explain (verbose true, costs false, nodes false) select avg(a) from xc_groupby_g group by b order by 1; + QUERY PLAN +--------------------------------------------------------------- + Remote Subquery Scan on all + Output: avg(a), b + Sort Key: avg(xc_groupby_g.a) + -> Sort + Output: (avg(a)), b + Sort Key: (avg(xc_groupby_g.a)) + -> Finalize HashAggregate + Output: avg(a), b + Group Key: xc_groupby_g.b + -> Remote Subquery Scan on all + Output: b, PARTIAL avg(a) + Distribute results by H: b + -> Partial HashAggregate + Output: b, PARTIAL avg(a) + Group Key: xc_groupby_g.b + -> Seq Scan on public.xc_groupby_g + Output: a, b, c +(17 rows) + +explain (verbose true, costs false, nodes false) select avg(a) from xc_groupby_g group by b; + QUERY PLAN 
+--------------------------------------------------------- + Remote Subquery Scan on all + Output: avg(a), b + -> Finalize HashAggregate + Output: avg(a), b + Group Key: xc_groupby_g.b + -> Remote Subquery Scan on all + Output: b, PARTIAL avg(a) + Distribute results by H: b + -> Partial HashAggregate + Output: b, PARTIAL avg(a) + Group Key: xc_groupby_g.b + -> Seq Scan on public.xc_groupby_g + Output: a, b, c +(13 rows) + +select avg(b) from xc_groupby_g group by c order by 1; + avg +----- + 2.1 + 2.3 +(2 rows) + +explain (verbose true, costs false, nodes false) select avg(b) from xc_groupby_g group by c order by 1; + QUERY PLAN +--------------------------------------------------------------- + Remote Subquery Scan on all + Output: avg(b), c + Sort Key: avg(xc_groupby_g.b) + -> Sort + Output: (avg(b)), c + Sort Key: (avg(xc_groupby_g.b)) + -> Finalize HashAggregate + Output: avg(b), c + Group Key: xc_groupby_g.c + -> Remote Subquery Scan on all + Output: c, PARTIAL avg(b) + Distribute results by H: c + -> Partial HashAggregate + Output: c, PARTIAL avg(b) + Group Key: xc_groupby_g.c + -> Seq Scan on public.xc_groupby_g + Output: a, b, c +(17 rows) + +explain (verbose true, costs false, nodes false) select avg(b) from xc_groupby_g group by c; + QUERY PLAN +--------------------------------------------------------- + Remote Subquery Scan on all + Output: avg(b), c + -> Finalize HashAggregate + Output: avg(b), c + Group Key: xc_groupby_g.c + -> Remote Subquery Scan on all + Output: c, PARTIAL avg(b) + Distribute results by H: c + -> Partial HashAggregate + Output: c, PARTIAL avg(b) + Group Key: xc_groupby_g.c + -> Seq Scan on public.xc_groupby_g + Output: a, b, c +(13 rows) + +select avg(c) from xc_groupby_g group by c order by 1; + avg +-------------------- + 3.2000000000000000 + 5.2000000000000000 +(2 rows) + +explain (verbose true, costs false, nodes false) select avg(c) from xc_groupby_g group by c order by 1; + QUERY PLAN +--------------------------------------------------------------- + Remote Subquery Scan on all + Output: avg(c), c + Sort Key: avg(xc_groupby_g.c) + -> Sort + Output: (avg(c)), c + Sort Key: (avg(xc_groupby_g.c)) + -> Finalize HashAggregate + Output: avg(c), c + Group Key: xc_groupby_g.c + -> Remote Subquery Scan on all + Output: c, PARTIAL avg(c) + Distribute results by H: c + -> Partial HashAggregate + Output: c, PARTIAL avg(c) + Group Key: xc_groupby_g.c + -> Seq Scan on public.xc_groupby_g + Output: a, b, c +(17 rows) + +explain (verbose true, costs false, nodes false) select avg(c) from xc_groupby_g group by c; + QUERY PLAN +--------------------------------------------------------- + Remote Subquery Scan on all + Output: avg(c), c + -> Finalize HashAggregate + Output: avg(c), c + Group Key: xc_groupby_g.c + -> Remote Subquery Scan on all + Output: c, PARTIAL avg(c) + Distribute results by H: c + -> Partial HashAggregate + Output: c, PARTIAL avg(c) + Group Key: xc_groupby_g.c + -> Seq Scan on public.xc_groupby_g + Output: a, b, c +(13 rows) + +drop table xc_groupby_def; +drop table xc_groupby_g; +-- Combination 2, enable_hashagg on and replicated tables. 
+-- repeat the same tests for replicated tables +-- create required tables and fill them with data +create table xc_groupby_tab1 (val int, val2 int) distribute by replication; +create table xc_groupby_tab2 (val int, val2 int) distribute by replication; +insert into xc_groupby_tab1 values (1, 1), (2, 1), (3, 1), (2, 2), (6, 2), (4, 3), (1, 3), (6, 3); +insert into xc_groupby_tab2 values (1, 1), (4, 1), (8, 1), (2, 4), (9, 4), (3, 4), (4, 2), (5, 2), (3, 2); +select count(*), sum(val), avg(val), sum(val)::float8/count(*), val2 from xc_groupby_tab1 group by val2 order by 1, 2; + count | sum | avg | ?column? | val2 +-------+-----+--------------------+------------------+------ + 2 | 8 | 4.0000000000000000 | 4 | 2 + 3 | 6 | 2.0000000000000000 | 2 | 1 + 3 | 11 | 3.6666666666666667 | 3.66666666666667 | 3 +(3 rows) + +explain (verbose true, costs false, nodes false) select count(*), sum(val), avg(val), sum(val)::float8/count(*), val2 from xc_groupby_tab1 group by val2 order by 1, 2; + QUERY PLAN +--------------------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: count(*), sum(val), avg(val), ((sum(val))::double precision / (count(*))::double precision), val2 + -> Sort + Output: (count(*)), (sum(val)), (avg(val)), (((sum(val))::double precision / (count(*))::double precision)), val2 + Sort Key: (count(*)), (sum(xc_groupby_tab1.val)) + -> HashAggregate + Output: count(*), sum(val), avg(val), ((sum(val))::double precision / (count(*))::double precision), val2 + Group Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: val, val2 +(10 rows) + +explain (verbose true, costs false, nodes false) select count(*), sum(val), avg(val), sum(val)::float8/count(*), val2 from xc_groupby_tab1 group by val2; + QUERY PLAN +------------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: count(*), sum(val), avg(val), ((sum(val))::double precision / (count(*))::double precision), val2 + -> HashAggregate + Output: count(*), sum(val), avg(val), ((sum(val))::double precision / (count(*))::double precision), val2 + Group Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: val, val2 +(7 rows) + +-- joins and group by +select * from (select count(*), sum(xc_groupby_tab1.val * xc_groupby_tab2.val), avg(xc_groupby_tab1.val*xc_groupby_tab2.val), sum(xc_groupby_tab1.val*xc_groupby_tab2.val)::float8/count(*), xc_groupby_tab1.val2 c1, xc_groupby_tab2.val2 c2 from xc_groupby_tab1 full outer join xc_groupby_tab2 on xc_groupby_tab1.val2 = xc_groupby_tab2.val2 group by xc_groupby_tab1.val2, xc_groupby_tab2.val2) q order by q.c1, q.c2; + count | sum | avg | ?column? 
| c1 | c2 +-------+-----+---------------------+------------------+----+---- + 9 | 78 | 8.6666666666666667 | 8.66666666666667 | 1 | 1 + 6 | 96 | 16.0000000000000000 | 16 | 2 | 2 + 3 | | | | 3 | + 3 | | | | | 4 +(4 rows) + +explain (verbose true, costs false, nodes false) select * from (select count(*), sum(xc_groupby_tab1.val * xc_groupby_tab2.val), avg(xc_groupby_tab1.val*xc_groupby_tab2.val), sum(xc_groupby_tab1.val*xc_groupby_tab2.val)::float8/count(*), xc_groupby_tab1.val2 c1, xc_groupby_tab2.val2 c2 from xc_groupby_tab1 full outer join xc_groupby_tab2 on xc_groupby_tab1.val2 = xc_groupby_tab2.val2 group by xc_groupby_tab1.val2, xc_groupby_tab2.val2) q order by q.c1, q.c2; + QUERY PLAN +--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: count, sum, avg, "?column?", c1, c2 + -> GroupAggregate + Output: count(*), sum((xc_groupby_tab1.val * xc_groupby_tab2.val)), avg((xc_groupby_tab1.val * xc_groupby_tab2.val)), ((sum((xc_groupby_tab1.val * xc_groupby_tab2.val)))::double precision / (count(*))::double precision), xc_groupby_tab1.val2, xc_groupby_tab2.val2 + Group Key: xc_groupby_tab1.val2, xc_groupby_tab2.val2 + -> Sort + Output: xc_groupby_tab1.val2, xc_groupby_tab2.val2, xc_groupby_tab1.val, xc_groupby_tab2.val + Sort Key: xc_groupby_tab1.val2, xc_groupby_tab2.val2 + -> Merge Full Join + Output: xc_groupby_tab1.val2, xc_groupby_tab2.val2, xc_groupby_tab1.val, xc_groupby_tab2.val + Merge Cond: (xc_groupby_tab1.val2 = xc_groupby_tab2.val2) + -> Sort + Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 + Sort Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 + -> Sort + Output: xc_groupby_tab2.val, xc_groupby_tab2.val2 + Sort Key: xc_groupby_tab2.val2 + -> Seq Scan on public.xc_groupby_tab2 + Output: xc_groupby_tab2.val, xc_groupby_tab2.val2 +(21 rows) + +-- aggregates over aggregates +select * from (select sum(y) sum from (select sum(val) y, val2%2 x from xc_groupby_tab1 group by val2) q1 group by x) q order by q.sum; + sum +----- + 8 + 17 +(2 rows) + +explain (verbose true, costs false, nodes false) select * from (select sum(y) sum from (select sum(val) y, val2%2 x from xc_groupby_tab1 group by val2) q1 group by x) q order by q.sum; + QUERY PLAN +-------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: q.sum + -> Sort + Output: q.sum + Sort Key: q.sum + -> Subquery Scan on q + Output: q.sum + -> HashAggregate + Output: sum((sum(xc_groupby_tab1.val))), ((xc_groupby_tab1.val2 % 2)) + Group Key: (xc_groupby_tab1.val2 % 2) + -> HashAggregate + Output: sum(xc_groupby_tab1.val), (xc_groupby_tab1.val2 % 2), xc_groupby_tab1.val2 + Group Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 +(15 rows) + +-- group by without aggregate +select val2 from xc_groupby_tab1 group by val2 order by 1; + val2 +------ + 1 + 2 + 3 +(3 rows) + +explain (verbose true, costs false, nodes false) select val2 from xc_groupby_tab1 group by val2 order by 1; + QUERY PLAN +------------------------------------------------------ + Remote Subquery Scan on all + Output: val2 + -> Sort + Output: val2 + Sort Key: xc_groupby_tab1.val2 + -> 
HashAggregate + Output: val2 + Group Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: val, val2 +(10 rows) + +explain (verbose true, costs false, nodes false) select val2 from xc_groupby_tab1 group by val2; + QUERY PLAN +------------------------------------------------ + Remote Subquery Scan on all + Output: val2 + -> HashAggregate + Output: val2 + Group Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: val, val2 +(7 rows) + +select * from (select val + val2 sum from xc_groupby_tab1 group by val + val2) q order by q.sum; + sum +----- + 2 + 3 + 4 + 7 + 8 + 9 +(6 rows) + +explain (verbose true, costs false, nodes false) select * from (select val + val2 sum from xc_groupby_tab1 group by val + val2) q order by q.sum; + QUERY PLAN +-------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: sum + -> Sort + Output: ((xc_groupby_tab1.val + xc_groupby_tab1.val2)) + Sort Key: ((xc_groupby_tab1.val + xc_groupby_tab1.val2)) + -> HashAggregate + Output: ((xc_groupby_tab1.val + xc_groupby_tab1.val2)) + Group Key: (xc_groupby_tab1.val + xc_groupby_tab1.val2) + -> Seq Scan on public.xc_groupby_tab1 + Output: (xc_groupby_tab1.val + xc_groupby_tab1.val2) +(10 rows) + +select * from (select val + val2, val, val2 from xc_groupby_tab1 group by val, val2) q order by q.val, q.val2; + ?column? | val | val2 +----------+-----+------ + 2 | 1 | 1 + 4 | 1 | 3 + 3 | 2 | 1 + 4 | 2 | 2 + 4 | 3 | 1 + 7 | 4 | 3 + 8 | 6 | 2 + 9 | 6 | 3 +(8 rows) + +explain (verbose true, costs false, nodes false) select * from (select val + val2, val, val2 from xc_groupby_tab1 group by val, val2) q order by q.val, q.val2; + QUERY PLAN +--------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: "?column?", val, val2 + -> Sort + Output: ((xc_groupby_tab1.val + xc_groupby_tab1.val2)), xc_groupby_tab1.val, xc_groupby_tab1.val2 + Sort Key: xc_groupby_tab1.val, xc_groupby_tab1.val2 + -> HashAggregate + Output: (xc_groupby_tab1.val + xc_groupby_tab1.val2), xc_groupby_tab1.val, xc_groupby_tab1.val2 + Group Key: xc_groupby_tab1.val, xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 +(10 rows) + +select * from (select xc_groupby_tab1.val + xc_groupby_tab2.val2, xc_groupby_tab1.val, xc_groupby_tab2.val2 from xc_groupby_tab1, xc_groupby_tab2 where xc_groupby_tab1.val = xc_groupby_tab2.val group by xc_groupby_tab1.val, xc_groupby_tab2.val2) q order by q.val, q.val2; + ?column? 
| val | val2 +----------+-----+------ + 2 | 1 | 1 + 6 | 2 | 4 + 5 | 3 | 2 + 7 | 3 | 4 + 5 | 4 | 1 + 6 | 4 | 2 +(6 rows) + +explain (verbose true, costs false, nodes false) select * from (select xc_groupby_tab1.val + xc_groupby_tab2.val2, xc_groupby_tab1.val, xc_groupby_tab2.val2 from xc_groupby_tab1, xc_groupby_tab2 where xc_groupby_tab1.val = xc_groupby_tab2.val group by xc_groupby_tab1.val, xc_groupby_tab2.val2) q order by q.val, q.val2; + QUERY PLAN +--------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: "?column?", val, val2 + -> Group + Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2), xc_groupby_tab1.val, xc_groupby_tab2.val2 + Group Key: xc_groupby_tab1.val, xc_groupby_tab2.val2 + -> Sort + Output: xc_groupby_tab1.val, xc_groupby_tab2.val2 + Sort Key: xc_groupby_tab1.val, xc_groupby_tab2.val2 + -> Merge Join + Output: xc_groupby_tab1.val, xc_groupby_tab2.val2 + Merge Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) + -> Sort + Output: xc_groupby_tab1.val + Sort Key: xc_groupby_tab1.val + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val + -> Sort + Output: xc_groupby_tab2.val2, xc_groupby_tab2.val + Sort Key: xc_groupby_tab2.val + -> Seq Scan on public.xc_groupby_tab2 + Output: xc_groupby_tab2.val2, xc_groupby_tab2.val +(21 rows) + +select * from (select xc_groupby_tab1.val + xc_groupby_tab2.val2 sum from xc_groupby_tab1, xc_groupby_tab2 where xc_groupby_tab1.val = xc_groupby_tab2.val group by xc_groupby_tab1.val + xc_groupby_tab2.val2) q order by q.sum; + sum +----- + 2 + 5 + 6 + 7 +(4 rows) + +explain (verbose true, costs false, nodes false) select * from (select xc_groupby_tab1.val + xc_groupby_tab2.val2 sum from xc_groupby_tab1, xc_groupby_tab2 where xc_groupby_tab1.val = xc_groupby_tab2.val group by xc_groupby_tab1.val + xc_groupby_tab2.val2) q order by q.sum; + QUERY PLAN +----------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: sum + -> Group + Output: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + Group Key: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + -> Sort + Output: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + Sort Key: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + -> Merge Join + Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2) + Merge Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) + -> Sort + Output: xc_groupby_tab1.val + Sort Key: xc_groupby_tab1.val + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val + -> Sort + Output: xc_groupby_tab2.val2, xc_groupby_tab2.val + Sort Key: xc_groupby_tab2.val + -> Seq Scan on public.xc_groupby_tab2 + Output: xc_groupby_tab2.val2, xc_groupby_tab2.val +(21 rows) + +-- group by with aggregates in expression +select count(*) + sum(val) + avg(val), val2 from xc_groupby_tab1 group by val2 order by 1; + ?column? 
| val2 +---------------------+------ + 11.0000000000000000 | 1 + 14.0000000000000000 | 2 + 17.6666666666666667 | 3 +(3 rows) + +explain (verbose true, costs false, nodes false) select count(*) + sum(val) + avg(val), val2 from xc_groupby_tab1 group by val2 order by 1; + QUERY PLAN +--------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: (((count(*) + sum(val)))::numeric + avg(val)), val2 + -> Sort + Output: ((((count(*) + sum(val)))::numeric + avg(val))), val2 + Sort Key: ((((count(*) + sum(xc_groupby_tab1.val)))::numeric + avg(xc_groupby_tab1.val))) + -> HashAggregate + Output: (((count(*) + sum(val)))::numeric + avg(val)), val2 + Group Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: val, val2 +(10 rows) + +explain (verbose true, costs false, nodes false) select count(*) + sum(val) + avg(val), val2 from xc_groupby_tab1 group by val2; + QUERY PLAN +--------------------------------------------------------------------- + Remote Subquery Scan on all + Output: (((count(*) + sum(val)))::numeric + avg(val)), val2 + -> HashAggregate + Output: (((count(*) + sum(val)))::numeric + avg(val)), val2 + Group Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: val, val2 +(7 rows) + +-- group by with expressions in group by clause +select sum(val), avg(val), 2 * val2 from xc_groupby_tab1 group by 2 * val2 order by 1, 2; + sum | avg | ?column? +-----+--------------------+---------- + 6 | 2.0000000000000000 | 2 + 8 | 4.0000000000000000 | 4 + 11 | 3.6666666666666667 | 6 +(3 rows) + +explain (verbose true, costs false, nodes false) select sum(val), avg(val), 2 * val2 from xc_groupby_tab1 group by 2 * val2 order by 1, 2; + QUERY PLAN +-------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: sum(val), avg(val), (2 * val2) + -> Sort + Output: (sum(val)), (avg(val)), ((2 * val2)) + Sort Key: (sum(xc_groupby_tab1.val)), (avg(xc_groupby_tab1.val)) + -> HashAggregate + Output: sum(val), avg(val), ((2 * val2)) + Group Key: (2 * xc_groupby_tab1.val2) + -> Seq Scan on public.xc_groupby_tab1 + Output: (2 * val2), val +(10 rows) + +explain (verbose true, costs false, nodes false) select sum(val), avg(val), 2 * val2 from xc_groupby_tab1 group by 2 * val2; + QUERY PLAN +-------------------------------------------------- + Remote Subquery Scan on all + Output: sum(val), avg(val), (2 * val2) + -> HashAggregate + Output: sum(val), avg(val), ((2 * val2)) + Group Key: (2 * xc_groupby_tab1.val2) + -> Seq Scan on public.xc_groupby_tab1 + Output: (2 * val2), val +(7 rows) + +drop table xc_groupby_tab1; +drop table xc_groupby_tab2; +-- some tests involving nulls, characters, float type etc. 
+create table xc_groupby_def(a int, b varchar(25)) distribute by replication; +insert into xc_groupby_def VALUES (NULL, NULL); +insert into xc_groupby_def VALUES (1, NULL); +insert into xc_groupby_def VALUES (NULL, 'One'); +insert into xc_groupby_def VALUES (2, 'Two'); +insert into xc_groupby_def VALUES (2, 'Two'); +insert into xc_groupby_def VALUES (3, 'Three'); +insert into xc_groupby_def VALUES (4, 'Three'); +insert into xc_groupby_def VALUES (5, 'Three'); +insert into xc_groupby_def VALUES (6, 'Two'); +insert into xc_groupby_def VALUES (7, NULL); +insert into xc_groupby_def VALUES (8, 'Two'); +insert into xc_groupby_def VALUES (9, 'Three'); +insert into xc_groupby_def VALUES (10, 'Three'); +select a,count(a) from xc_groupby_def group by a order by a; + a | count +----+------- + 1 | 1 + 2 | 2 + 3 | 1 + 4 | 1 + 5 | 1 + 6 | 1 + 7 | 1 + 8 | 1 + 9 | 1 + 10 | 1 + | 0 +(11 rows) + +explain (verbose true, costs false, nodes false) select a,count(a) from xc_groupby_def group by a order by a; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all + Output: a, count(a) + -> Sort + Output: a, (count(a)) + Sort Key: xc_groupby_def.a + -> HashAggregate + Output: a, count(a) + Group Key: xc_groupby_def.a + -> Seq Scan on public.xc_groupby_def + Output: a, b +(10 rows) + +select avg(a) from xc_groupby_def group by a order by a; + avg +------------------------ + 1.00000000000000000000 + 2.0000000000000000 + 3.0000000000000000 + 4.0000000000000000 + 5.0000000000000000 + 6.0000000000000000 + 7.0000000000000000 + 8.0000000000000000 + 9.0000000000000000 + 10.0000000000000000 + +(11 rows) + +explain (verbose true, costs false, nodes false) select avg(a) from xc_groupby_def group by a order by a; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all + Output: avg(a), a + -> Sort + Output: (avg(a)), a + Sort Key: xc_groupby_def.a + -> HashAggregate + Output: avg(a), a + Group Key: xc_groupby_def.a + -> Seq Scan on public.xc_groupby_def + Output: a, b +(10 rows) + +explain (verbose true, costs false, nodes false) select avg(a) from xc_groupby_def group by a; + QUERY PLAN +----------------------------------------------- + Remote Subquery Scan on all + Output: avg(a), a + -> HashAggregate + Output: avg(a), a + Group Key: xc_groupby_def.a + -> Seq Scan on public.xc_groupby_def + Output: a, b +(7 rows) + +select avg(a) from xc_groupby_def group by a order by 1; + avg +------------------------ + 1.00000000000000000000 + 2.0000000000000000 + 3.0000000000000000 + 4.0000000000000000 + 5.0000000000000000 + 6.0000000000000000 + 7.0000000000000000 + 8.0000000000000000 + 9.0000000000000000 + 10.0000000000000000 + +(11 rows) + +explain (verbose true, costs false, nodes false) select avg(a) from xc_groupby_def group by a order by 1; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all + Output: avg(a), a + -> Sort + Output: (avg(a)), a + Sort Key: (avg(xc_groupby_def.a)) + -> HashAggregate + Output: avg(a), a + Group Key: xc_groupby_def.a + -> Seq Scan on public.xc_groupby_def + Output: a, b +(10 rows) + +explain (verbose true, costs false, nodes false) select avg(a) from xc_groupby_def group by a; + QUERY PLAN +----------------------------------------------- + Remote Subquery Scan on all + Output: avg(a), a + -> HashAggregate + Output: avg(a), a + Group Key: xc_groupby_def.a + -> Seq Scan on public.xc_groupby_def + Output: a, b +(7 rows) + +select avg(a) from xc_groupby_def group by b order by 
1; + avg +-------------------- + 4.0000000000000000 + 4.5000000000000000 + 6.2000000000000000 + +(4 rows) + +explain (verbose true, costs false, nodes false) select avg(a) from xc_groupby_def group by b order by 1; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all + Output: avg(a), b + -> Sort + Output: (avg(a)), b + Sort Key: (avg(xc_groupby_def.a)) + -> HashAggregate + Output: avg(a), b + Group Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: a, b +(10 rows) + +explain (verbose true, costs false, nodes false) select avg(a) from xc_groupby_def group by b; + QUERY PLAN +----------------------------------------------- + Remote Subquery Scan on all + Output: avg(a), b + -> HashAggregate + Output: avg(a), b + Group Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: a, b +(7 rows) + +select sum(a) from xc_groupby_def group by b order by 1; + sum +----- + 8 + 18 + 31 + +(4 rows) + +explain (verbose true, costs false, nodes false) select sum(a) from xc_groupby_def group by b order by 1; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all + Output: sum(a), b + -> Sort + Output: (sum(a)), b + Sort Key: (sum(xc_groupby_def.a)) + -> HashAggregate + Output: sum(a), b + Group Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: a, b +(10 rows) + +explain (verbose true, costs false, nodes false) select sum(a) from xc_groupby_def group by b; + QUERY PLAN +----------------------------------------------- + Remote Subquery Scan on all + Output: sum(a), b + -> HashAggregate + Output: sum(a), b + Group Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: a, b +(7 rows) + +select count(*) from xc_groupby_def group by b order by b; + count +------- + 1 + 5 + 4 + 3 +(4 rows) + +explain (verbose true, costs false, nodes false) select count(*) from xc_groupby_def group by b order by b; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all + Output: count(*), b + -> Sort + Output: (count(*)), b + Sort Key: xc_groupby_def.b + -> HashAggregate + Output: count(*), b + Group Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: a, b +(10 rows) + +explain (verbose true, costs false, nodes false) select count(*) from xc_groupby_def group by b; + QUERY PLAN +----------------------------------------------- + Remote Subquery Scan on all + Output: count(*), b + -> HashAggregate + Output: count(*), b + Group Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: a, b +(7 rows) + +select count(*) from xc_groupby_def where a is not null group by a order by 1; + count +------- + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 2 +(10 rows) + +explain (verbose true, costs false, nodes false) select count(*) from xc_groupby_def where a is not null group by a order by 1; + QUERY PLAN +------------------------------------------------------------ + Remote Subquery Scan on all + Output: count(*), a + -> Sort + Output: (count(*)), a + Sort Key: (count(*)) + -> HashAggregate + Output: count(*), a + Group Key: xc_groupby_def.a + -> Seq Scan on public.xc_groupby_def + Output: a, b + Filter: (xc_groupby_def.a IS NOT NULL) +(11 rows) + +explain (verbose true, costs false, nodes false) select count(*) from xc_groupby_def where a is not null group by a; + QUERY PLAN +------------------------------------------------------ + Remote Subquery Scan on all + Output: count(*), a + -> HashAggregate + Output: count(*), 
a + Group Key: xc_groupby_def.a + -> Seq Scan on public.xc_groupby_def + Output: a, b + Filter: (xc_groupby_def.a IS NOT NULL) +(8 rows) + +select * from (select b from xc_groupby_def group by b) q order by q.b; + b +------- + One + Three + Two + +(4 rows) + +explain (verbose true, costs false, nodes false) select * from (select b from xc_groupby_def group by b) q order by q.b; + QUERY PLAN +---------------------------------------------------------------- + Remote Subquery Scan on all + Output: b + -> Sort + Output: xc_groupby_def.b + Sort Key: xc_groupby_def.b + -> HashAggregate + Output: xc_groupby_def.b + Group Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: xc_groupby_def.a, xc_groupby_def.b +(10 rows) + +select * from (select b,count(b) from xc_groupby_def group by b) q order by q.b; + b | count +-------+------- + One | 1 + Three | 5 + Two | 4 + | 0 +(4 rows) + +explain (verbose true, costs false, nodes false) select * from (select b,count(b) from xc_groupby_def group by b) q order by q.b; + QUERY PLAN +----------------------------------------------------------------- + Remote Subquery Scan on all + Output: b, count + -> Sort + Output: xc_groupby_def.b, (count(xc_groupby_def.b)) + Sort Key: xc_groupby_def.b + -> HashAggregate + Output: xc_groupby_def.b, count(xc_groupby_def.b) + Group Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: xc_groupby_def.a, xc_groupby_def.b +(10 rows) + +select count(*) from xc_groupby_def where b is null group by b; + count +------- + 3 +(1 row) + +explain (verbose true, costs false, nodes false) select count(*) from xc_groupby_def where b is null group by b; + QUERY PLAN +-------------------------------------------------------- + Remote Subquery Scan on all + Output: count(*), b + -> GroupAggregate + Output: count(*), b + Group Key: xc_groupby_def.b + -> Sort + Output: b + Sort Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: b + Filter: (xc_groupby_def.b IS NULL) +(11 rows) + +create table xc_groupby_g(a int, b float, c numeric) distribute by replication; +insert into xc_groupby_g values(1,2.1,3.2); +insert into xc_groupby_g values(1,2.1,3.2); +insert into xc_groupby_g values(2,2.3,5.2); +select sum(a) from xc_groupby_g group by a; + sum +----- + 2 + 2 +(2 rows) + +explain (verbose true, costs false, nodes false) select sum(a) from xc_groupby_g group by a; + QUERY PLAN +--------------------------------------------- + Remote Subquery Scan on all + Output: sum(a), a + -> HashAggregate + Output: sum(a), a + Group Key: xc_groupby_g.a + -> Seq Scan on public.xc_groupby_g + Output: a, b, c +(7 rows) + +select sum(b) from xc_groupby_g group by b order by 1; + sum +----- + 2.3 + 4.2 +(2 rows) + +explain (verbose true, costs false, nodes false) select sum(b) from xc_groupby_g group by b order by 1; + QUERY PLAN +--------------------------------------------------- + Remote Subquery Scan on all + Output: sum(b), b + -> Sort + Output: (sum(b)), b + Sort Key: (sum(xc_groupby_g.b)) + -> HashAggregate + Output: sum(b), b + Group Key: xc_groupby_g.b + -> Seq Scan on public.xc_groupby_g + Output: a, b, c +(10 rows) + +explain (verbose true, costs false, nodes false) select sum(b) from xc_groupby_g group by b; + QUERY PLAN +--------------------------------------------- + Remote Subquery Scan on all + Output: sum(b), b + -> HashAggregate + Output: sum(b), b + Group Key: xc_groupby_g.b + -> Seq Scan on public.xc_groupby_g + Output: a, b, c +(7 rows) + +select sum(c) from xc_groupby_g group by b order by 1; + 
sum +----- + 5.2 + 6.4 +(2 rows) + +explain (verbose true, costs false, nodes false) select sum(c) from xc_groupby_g group by b order by 1; + QUERY PLAN +--------------------------------------------------- + Remote Subquery Scan on all + Output: sum(c), b + -> Sort + Output: (sum(c)), b + Sort Key: (sum(xc_groupby_g.c)) + -> HashAggregate + Output: sum(c), b + Group Key: xc_groupby_g.b + -> Seq Scan on public.xc_groupby_g + Output: a, b, c +(10 rows) + +explain (verbose true, costs false, nodes false) select sum(c) from xc_groupby_g group by b; + QUERY PLAN +--------------------------------------------- + Remote Subquery Scan on all + Output: sum(c), b + -> HashAggregate + Output: sum(c), b + Group Key: xc_groupby_g.b + -> Seq Scan on public.xc_groupby_g + Output: a, b, c +(7 rows) + +select avg(a) from xc_groupby_g group by b order by 1; + avg +------------------------ + 1.00000000000000000000 + 2.0000000000000000 +(2 rows) + +explain (verbose true, costs false, nodes false) select avg(a) from xc_groupby_g group by b order by 1; + QUERY PLAN +--------------------------------------------------- + Remote Subquery Scan on all + Output: avg(a), b + -> Sort + Output: (avg(a)), b + Sort Key: (avg(xc_groupby_g.a)) + -> HashAggregate + Output: avg(a), b + Group Key: xc_groupby_g.b + -> Seq Scan on public.xc_groupby_g + Output: a, b, c +(10 rows) + +explain (verbose true, costs false, nodes false) select avg(a) from xc_groupby_g group by b; + QUERY PLAN +--------------------------------------------- + Remote Subquery Scan on all + Output: avg(a), b + -> HashAggregate + Output: avg(a), b + Group Key: xc_groupby_g.b + -> Seq Scan on public.xc_groupby_g + Output: a, b, c +(7 rows) + +select avg(b) from xc_groupby_g group by c order by c; + avg +----- + 2.1 + 2.3 +(2 rows) + +explain (verbose true, costs false, nodes false) select avg(b) from xc_groupby_g group by c order by c; + QUERY PLAN +--------------------------------------------------- + Remote Subquery Scan on all + Output: avg(b), c + -> Sort + Output: (avg(b)), c + Sort Key: xc_groupby_g.c + -> HashAggregate + Output: avg(b), c + Group Key: xc_groupby_g.c + -> Seq Scan on public.xc_groupby_g + Output: a, b, c +(10 rows) + +explain (verbose true, costs false, nodes false) select avg(b) from xc_groupby_g group by c; + QUERY PLAN +--------------------------------------------- + Remote Subquery Scan on all + Output: avg(b), c + -> HashAggregate + Output: avg(b), c + Group Key: xc_groupby_g.c + -> Seq Scan on public.xc_groupby_g + Output: a, b, c +(7 rows) + +select avg(c) from xc_groupby_g group by c order by c; + avg +-------------------- + 3.2000000000000000 + 5.2000000000000000 +(2 rows) + +explain (verbose true, costs false, nodes false) select avg(c) from xc_groupby_g group by c order by c; + QUERY PLAN +--------------------------------------------------- + Remote Subquery Scan on all + Output: avg(c), c + -> Sort + Output: (avg(c)), c + Sort Key: xc_groupby_g.c + -> HashAggregate + Output: avg(c), c + Group Key: xc_groupby_g.c + -> Seq Scan on public.xc_groupby_g + Output: a, b, c +(10 rows) + +explain (verbose true, costs false, nodes false) select avg(c) from xc_groupby_g group by c; + QUERY PLAN +--------------------------------------------- + Remote Subquery Scan on all + Output: avg(c), c + -> HashAggregate + Output: avg(c), c + Group Key: xc_groupby_g.c + -> Seq Scan on public.xc_groupby_g + Output: a, b, c +(7 rows) + +drop table xc_groupby_def; +drop table xc_groupby_g; +reset enable_hashagg; +-- Combination 3 enable_hashagg off 
and distributed tables +set enable_hashagg to off; +-- create required tables and fill them with data +create table xc_groupby_tab1 (val int, val2 int); +create table xc_groupby_tab2 (val int, val2 int); +insert into xc_groupby_tab1 values (1, 1), (2, 1), (3, 1), (2, 2), (6, 2), (4, 3), (1, 3), (6, 3); +insert into xc_groupby_tab2 values (1, 1), (4, 1), (8, 1), (2, 4), (9, 4), (3, 4), (4, 2), (5, 2), (3, 2); +select count(*), sum(val), avg(val), sum(val)::float8/count(*), val2 from xc_groupby_tab1 group by val2 order by 1, 2; + count | sum | avg | ?column? | val2 +-------+-----+--------------------+------------------+------ + 2 | 8 | 4.0000000000000000 | 4 | 2 + 3 | 6 | 2.0000000000000000 | 2 | 1 + 3 | 11 | 3.6666666666666667 | 3.66666666666667 | 3 +(3 rows) + +explain (verbose true, costs false, nodes false) select count(*), sum(val), avg(val), sum(val)::float8/count(*), val2 from xc_groupby_tab1 group by val2 order by 1, 2; + QUERY PLAN +--------------------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: count(*), sum(val), avg(val), ((sum(val))::double precision / (count(*))::double precision), val2 + Sort Key: count(*), sum(xc_groupby_tab1.val) + -> Sort + Output: (count(*)), (sum(val)), (avg(val)), (((sum(val))::double precision / (count(*))::double precision)), val2 + Sort Key: (count(*)), (sum(xc_groupby_tab1.val)) + -> Finalize GroupAggregate + Output: count(*), sum(val), avg(val), ((sum(val))::double precision / (count(*))::double precision), val2 + Group Key: xc_groupby_tab1.val2 + -> Sort + Output: val2, (PARTIAL count(*)), (PARTIAL sum(val)), (PARTIAL avg(val)) + Sort Key: xc_groupby_tab1.val2 + -> Remote Subquery Scan on all + Output: val2, PARTIAL count(*), PARTIAL sum(val), PARTIAL avg(val) + Distribute results by H: val2 + -> Partial GroupAggregate + Output: val2, PARTIAL count(*), PARTIAL sum(val), PARTIAL avg(val) + Group Key: xc_groupby_tab1.val2 + -> Sort + Output: val2, val + Sort Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: val2, val +(23 rows) + +explain (verbose true, costs false, nodes false) select count(*), sum(val), avg(val), sum(val)::float8/count(*), val2 from xc_groupby_tab1 group by val2; + QUERY PLAN +------------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: count(*), sum(val), avg(val), ((sum(val))::double precision / (count(*))::double precision), val2 + -> Finalize GroupAggregate + Output: count(*), sum(val), avg(val), ((sum(val))::double precision / (count(*))::double precision), val2 + Group Key: xc_groupby_tab1.val2 + -> Sort + Output: val2, (PARTIAL count(*)), (PARTIAL sum(val)), (PARTIAL avg(val)) + Sort Key: xc_groupby_tab1.val2 + -> Remote Subquery Scan on all + Output: val2, PARTIAL count(*), PARTIAL sum(val), PARTIAL avg(val) + Distribute results by H: val2 + -> Partial GroupAggregate + Output: val2, PARTIAL count(*), PARTIAL sum(val), PARTIAL avg(val) + Group Key: xc_groupby_tab1.val2 + -> Sort + Output: val2, val + Sort Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: val2, val +(19 rows) + +-- joins and group by +select count(*), sum(xc_groupby_tab1.val * xc_groupby_tab2.val), avg(xc_groupby_tab1.val*xc_groupby_tab2.val), sum(xc_groupby_tab1.val*xc_groupby_tab2.val)::float8/count(*), xc_groupby_tab1.val2, xc_groupby_tab2.val2 from xc_groupby_tab1 full outer join xc_groupby_tab2 on 
xc_groupby_tab1.val2 = xc_groupby_tab2.val2 group by xc_groupby_tab1.val2, xc_groupby_tab2.val2 order by count(*); + count | sum | avg | ?column? | val2 | val2 +-------+-----+---------------------+------------------+------+------ + 3 | | | | | 4 + 3 | | | | 3 | + 6 | 96 | 16.0000000000000000 | 16 | 2 | 2 + 9 | 78 | 8.6666666666666667 | 8.66666666666667 | 1 | 1 +(4 rows) + +explain (verbose true, costs false, nodes false) select count(*), sum(xc_groupby_tab1.val * xc_groupby_tab2.val), avg(xc_groupby_tab1.val*xc_groupby_tab2.val), sum(xc_groupby_tab1.val*xc_groupby_tab2.val)::float8/count(*), xc_groupby_tab1.val2, xc_groupby_tab2.val2 from xc_groupby_tab1 full outer join xc_groupby_tab2 on xc_groupby_tab1.val2 = xc_groupby_tab2.val2 group by xc_groupby_tab1.val2, xc_groupby_tab2.val2; + QUERY PLAN +--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: count(*), sum((xc_groupby_tab1.val * xc_groupby_tab2.val)), avg((xc_groupby_tab1.val * xc_groupby_tab2.val)), ((sum((xc_groupby_tab1.val * xc_groupby_tab2.val)))::double precision / (count(*))::double precision), xc_groupby_tab1.val2, xc_groupby_tab2.val2 + -> Finalize GroupAggregate + Output: count(*), sum((xc_groupby_tab1.val * xc_groupby_tab2.val)), avg((xc_groupby_tab1.val * xc_groupby_tab2.val)), ((sum((xc_groupby_tab1.val * xc_groupby_tab2.val)))::double precision / (count(*))::double precision), xc_groupby_tab1.val2, xc_groupby_tab2.val2 + Group Key: xc_groupby_tab1.val2, xc_groupby_tab2.val2 + -> Sort + Output: xc_groupby_tab1.val2, xc_groupby_tab2.val2, (PARTIAL count(*)), (PARTIAL sum((xc_groupby_tab1.val * xc_groupby_tab2.val))), (PARTIAL avg((xc_groupby_tab1.val * xc_groupby_tab2.val))) + Sort Key: xc_groupby_tab1.val2, xc_groupby_tab2.val2 + -> Remote Subquery Scan on all + Output: xc_groupby_tab1.val2, xc_groupby_tab2.val2, PARTIAL count(*), PARTIAL sum((xc_groupby_tab1.val * xc_groupby_tab2.val)), PARTIAL avg((xc_groupby_tab1.val * xc_groupby_tab2.val)) + Distribute results by H: val2 + -> Partial GroupAggregate + Output: xc_groupby_tab1.val2, xc_groupby_tab2.val2, PARTIAL count(*), PARTIAL sum((xc_groupby_tab1.val * xc_groupby_tab2.val)), PARTIAL avg((xc_groupby_tab1.val * xc_groupby_tab2.val)) + Group Key: xc_groupby_tab1.val2, xc_groupby_tab2.val2 + -> Sort + Output: xc_groupby_tab1.val2, xc_groupby_tab2.val2, xc_groupby_tab1.val, xc_groupby_tab2.val + Sort Key: xc_groupby_tab1.val2, xc_groupby_tab2.val2 + -> Hash Full Join + Output: xc_groupby_tab1.val2, xc_groupby_tab2.val2, xc_groupby_tab1.val, xc_groupby_tab2.val + Hash Cond: (xc_groupby_tab1.val2 = xc_groupby_tab2.val2) + -> Remote Subquery Scan on all + Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 + Distribute results by H: val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 + -> Hash + Output: xc_groupby_tab2.val, xc_groupby_tab2.val2 + -> Remote Subquery Scan on all + Output: xc_groupby_tab2.val, xc_groupby_tab2.val2 + Distribute results by H: val2 + -> Seq Scan on public.xc_groupby_tab2 + Output: xc_groupby_tab2.val, xc_groupby_tab2.val2 +(32 rows) + +-- aggregates over aggregates +select sum(y) from (select sum(val) y, val2%2 x from xc_groupby_tab1 group by val2) q1 group by x; + sum +----- + 17 + 8 +(2 rows) + +explain (verbose true, costs false, nodes false) 
select sum(y) from (select sum(val) y, val2%2 x from xc_groupby_tab1 group by val2) q1 group by x; + QUERY PLAN +-------------------------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: sum(q1.y), q1.x + -> Finalize GroupAggregate + Output: sum(q1.y), q1.x + Group Key: q1.x + -> Sort + Output: q1.x, (PARTIAL sum(q1.y)) + Sort Key: q1.x + -> Remote Subquery Scan on all + Output: q1.x, PARTIAL sum(q1.y) + Distribute results by H: x + -> Partial GroupAggregate + Output: q1.x, PARTIAL sum(q1.y) + Group Key: q1.x + -> Sort + Output: q1.x, q1.y + Sort Key: q1.x + -> Subquery Scan on q1 + Output: q1.x, q1.y + -> Finalize GroupAggregate + Output: sum(xc_groupby_tab1.val), (xc_groupby_tab1.val2 % 2), xc_groupby_tab1.val2 + Group Key: xc_groupby_tab1.val2 + -> Sort + Output: xc_groupby_tab1.val2, (PARTIAL sum(xc_groupby_tab1.val)) + Sort Key: xc_groupby_tab1.val2 + -> Remote Subquery Scan on all + Output: xc_groupby_tab1.val2, PARTIAL sum(xc_groupby_tab1.val) + Distribute results by H: val2 + -> Partial GroupAggregate + Output: xc_groupby_tab1.val2, PARTIAL sum(xc_groupby_tab1.val) + Group Key: xc_groupby_tab1.val2 + -> Sort + Output: xc_groupby_tab1.val2, xc_groupby_tab1.val + Sort Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val2, xc_groupby_tab1.val +(36 rows) + +-- group by without aggregate +select val2 from xc_groupby_tab1 group by val2 order by 1; + val2 +------ + 1 + 2 + 3 +(3 rows) + +explain (verbose true, costs false, nodes false) select val2 from xc_groupby_tab1 group by val2 order by 1; + QUERY PLAN +------------------------------------------------------------------------ + Remote Subquery Scan on all + Output: val2 + Sort Key: xc_groupby_tab1.val2 + -> Group + Output: val2 + Group Key: xc_groupby_tab1.val2 + -> Sort + Output: val2 + Sort Key: xc_groupby_tab1.val2 + -> Remote Subquery Scan on all + Output: val2 + Distribute results by H: val2 + -> Group + Output: val2 + Group Key: xc_groupby_tab1.val2 + -> Sort + Output: val2 + Sort Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: val2 +(20 rows) + +explain (verbose true, costs false, nodes false) select val2 from xc_groupby_tab1 group by val2; + QUERY PLAN +------------------------------------------------------------------------ + Remote Subquery Scan on all + Output: val2 + -> Group + Output: val2 + Group Key: xc_groupby_tab1.val2 + -> Sort + Output: val2 + Sort Key: xc_groupby_tab1.val2 + -> Remote Subquery Scan on all + Output: val2 + Distribute results by H: val2 + -> Group + Output: val2 + Group Key: xc_groupby_tab1.val2 + -> Sort + Output: val2 + Sort Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: val2 +(19 rows) + +select val + val2 from xc_groupby_tab1 group by val + val2 order by 1; + ?column? 
+---------- + 2 + 3 + 4 + 7 + 8 + 9 +(6 rows) + +explain (verbose true, costs false, nodes false) select val + val2 from xc_groupby_tab1 group by val + val2 order by 1; + QUERY PLAN +------------------------------------------------------------------------------------------ + Remote Subquery Scan on all + Output: (val + val2) + Sort Key: (xc_groupby_tab1.val + xc_groupby_tab1.val2) + -> Group + Output: ((val + val2)) + Group Key: ((xc_groupby_tab1.val + xc_groupby_tab1.val2)) + -> Sort + Output: ((val + val2)) + Sort Key: ((xc_groupby_tab1.val + xc_groupby_tab1.val2)) + -> Remote Subquery Scan on all + Output: (val + val2) + Distribute results by H: (val + val2) + -> Group + Output: ((val + val2)) + Group Key: ((xc_groupby_tab1.val + xc_groupby_tab1.val2)) + -> Sort + Output: ((val + val2)) + Sort Key: ((xc_groupby_tab1.val + xc_groupby_tab1.val2)) + -> Seq Scan on public.xc_groupby_tab1 + Output: (val + val2) +(20 rows) + +explain (verbose true, costs false, nodes false) select val + val2 from xc_groupby_tab1 group by val + val2; + QUERY PLAN +------------------------------------------------------------------------------------------ + Remote Subquery Scan on all + Output: (val + val2) + -> Group + Output: ((val + val2)) + Group Key: ((xc_groupby_tab1.val + xc_groupby_tab1.val2)) + -> Sort + Output: ((val + val2)) + Sort Key: ((xc_groupby_tab1.val + xc_groupby_tab1.val2)) + -> Remote Subquery Scan on all + Output: (val + val2) + Distribute results by H: (val + val2) + -> Group + Output: ((val + val2)) + Group Key: ((xc_groupby_tab1.val + xc_groupby_tab1.val2)) + -> Sort + Output: ((val + val2)) + Sort Key: ((xc_groupby_tab1.val + xc_groupby_tab1.val2)) + -> Seq Scan on public.xc_groupby_tab1 + Output: (val + val2) +(19 rows) + +select val + val2, val, val2 from xc_groupby_tab1 group by val, val2 order by 1, 2; + ?column? 
| val | val2 +----------+-----+------ + 2 | 1 | 1 + 3 | 2 | 1 + 4 | 1 | 3 + 4 | 2 | 2 + 4 | 3 | 1 + 7 | 4 | 3 + 8 | 6 | 2 + 9 | 6 | 3 +(8 rows) + +explain (verbose true, costs false, nodes false) select val + val2, val, val2 from xc_groupby_tab1 group by val, val2 order by 1, 2; + QUERY PLAN +--------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: (val + val2), val, val2 + Sort Key: (xc_groupby_tab1.val + xc_groupby_tab1.val2), xc_groupby_tab1.val + -> Sort + Output: ((val + val2)), val, val2 + Sort Key: ((xc_groupby_tab1.val + xc_groupby_tab1.val2)), xc_groupby_tab1.val + -> Group + Output: (val + val2), val, val2 + Group Key: xc_groupby_tab1.val, xc_groupby_tab1.val2 + -> Sort + Output: val, val2 + Sort Key: xc_groupby_tab1.val, xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: val, val2 +(14 rows) + +explain (verbose true, costs false, nodes false) select val + val2, val, val2 from xc_groupby_tab1 group by val, val2; + QUERY PLAN +--------------------------------------------------------------------------------------------------- + Remote Fast Query Execution + Output: (xc_groupby_tab1.val + xc_groupby_tab1.val2), xc_groupby_tab1.val, xc_groupby_tab1.val2 + Remote query: SELECT (val + val2), val, val2 FROM xc_groupby_tab1 GROUP BY val, val2 + -> Group + Output: (val + val2), val, val2 + Group Key: xc_groupby_tab1.val, xc_groupby_tab1.val2 + -> Sort + Output: val, val2 + Sort Key: xc_groupby_tab1.val, xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: val, val2 +(11 rows) + +select xc_groupby_tab1.val + xc_groupby_tab2.val2, xc_groupby_tab1.val, xc_groupby_tab2.val2 from xc_groupby_tab1, xc_groupby_tab2 where xc_groupby_tab1.val = xc_groupby_tab2.val group by xc_groupby_tab1.val, xc_groupby_tab2.val2 order by 1, 2; + ?column? 
| val | val2 +----------+-----+------ + 2 | 1 | 1 + 5 | 3 | 2 + 5 | 4 | 1 + 6 | 2 | 4 + 6 | 4 | 2 + 7 | 3 | 4 +(6 rows) + +explain (verbose true, costs false, nodes false) select xc_groupby_tab1.val + xc_groupby_tab2.val2, xc_groupby_tab1.val, xc_groupby_tab2.val2 from xc_groupby_tab1, xc_groupby_tab2 where xc_groupby_tab1.val = xc_groupby_tab2.val group by xc_groupby_tab1.val, xc_groupby_tab2.val2 order by 1, 2; + QUERY PLAN +--------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2), xc_groupby_tab1.val, xc_groupby_tab2.val2 + Sort Key: (xc_groupby_tab1.val + xc_groupby_tab2.val2), xc_groupby_tab1.val + -> Sort + Output: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)), xc_groupby_tab1.val, xc_groupby_tab2.val2 + Sort Key: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)), xc_groupby_tab1.val + -> Group + Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2), xc_groupby_tab1.val, xc_groupby_tab2.val2 + Group Key: xc_groupby_tab1.val, xc_groupby_tab2.val2 + -> Sort + Output: xc_groupby_tab1.val, xc_groupby_tab2.val2 + Sort Key: xc_groupby_tab1.val, xc_groupby_tab2.val2 + -> Merge Join + Output: xc_groupby_tab1.val, xc_groupby_tab2.val2 + Merge Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) + -> Sort + Output: xc_groupby_tab1.val + Sort Key: xc_groupby_tab1.val + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val + -> Sort + Output: xc_groupby_tab2.val2, xc_groupby_tab2.val + Sort Key: xc_groupby_tab2.val + -> Seq Scan on public.xc_groupby_tab2 + Output: xc_groupby_tab2.val2, xc_groupby_tab2.val +(25 rows) + +explain (verbose true, costs false, nodes false) select xc_groupby_tab1.val + xc_groupby_tab2.val2, xc_groupby_tab1.val, xc_groupby_tab2.val2 from xc_groupby_tab1, xc_groupby_tab2 where xc_groupby_tab1.val = xc_groupby_tab2.val group by xc_groupby_tab1.val, xc_groupby_tab2.val2; + QUERY PLAN +----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + Remote Fast Query Execution + Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2), xc_groupby_tab1.val, xc_groupby_tab2.val2 + Remote query: SELECT (xc_groupby_tab1.val + xc_groupby_tab2.val2), xc_groupby_tab1.val, xc_groupby_tab2.val2 FROM xc_groupby_tab1, xc_groupby_tab2 WHERE (xc_groupby_tab1.val = xc_groupby_tab2.val) GROUP BY xc_groupby_tab1.val, xc_groupby_tab2.val2 + -> Group + Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2), xc_groupby_tab1.val, xc_groupby_tab2.val2 + Group Key: xc_groupby_tab1.val, xc_groupby_tab2.val2 + -> Sort + Output: xc_groupby_tab1.val, xc_groupby_tab2.val2 + Sort Key: xc_groupby_tab1.val, xc_groupby_tab2.val2 + -> Merge Join + Output: xc_groupby_tab1.val, xc_groupby_tab2.val2 + Merge Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) + -> Sort + Output: xc_groupby_tab1.val + Sort Key: xc_groupby_tab1.val + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val + -> Sort + Output: xc_groupby_tab2.val2, xc_groupby_tab2.val + Sort Key: xc_groupby_tab2.val + -> Seq Scan on public.xc_groupby_tab2 + Output: xc_groupby_tab2.val2, xc_groupby_tab2.val +(22 rows) + +select xc_groupby_tab1.val + xc_groupby_tab2.val2 from xc_groupby_tab1, xc_groupby_tab2 where xc_groupby_tab1.val = xc_groupby_tab2.val group by xc_groupby_tab1.val + xc_groupby_tab2.val2 order 
by 1; + ?column? +---------- + 2 + 5 + 6 + 7 +(4 rows) + +explain (verbose true, costs false, nodes false) select xc_groupby_tab1.val + xc_groupby_tab2.val2 from xc_groupby_tab1, xc_groupby_tab2 where xc_groupby_tab1.val = xc_groupby_tab2.val group by xc_groupby_tab1.val + xc_groupby_tab2.val2 order by 1; + QUERY PLAN +----------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2) + Sort Key: (xc_groupby_tab1.val + xc_groupby_tab2.val2) + -> Group + Output: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + Group Key: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + -> Sort + Output: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + Sort Key: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + -> Remote Subquery Scan on all + Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2) + Distribute results by H: (val + val2) + -> Group + Output: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + Group Key: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + -> Sort + Output: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + Sort Key: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + -> Merge Join + Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2) + Merge Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) + -> Sort + Output: xc_groupby_tab1.val + Sort Key: xc_groupby_tab1.val + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val + -> Sort + Output: xc_groupby_tab2.val2, xc_groupby_tab2.val + Sort Key: xc_groupby_tab2.val + -> Seq Scan on public.xc_groupby_tab2 + Output: xc_groupby_tab2.val2, xc_groupby_tab2.val +(31 rows) + +explain (verbose true, costs false, nodes false) select xc_groupby_tab1.val + xc_groupby_tab2.val2 from xc_groupby_tab1, xc_groupby_tab2 where xc_groupby_tab1.val = xc_groupby_tab2.val group by xc_groupby_tab1.val + xc_groupby_tab2.val2; + QUERY PLAN +----------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2) + -> Group + Output: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + Group Key: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + -> Sort + Output: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + Sort Key: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + -> Remote Subquery Scan on all + Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2) + Distribute results by H: (val + val2) + -> Group + Output: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + Group Key: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + -> Sort + Output: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + Sort Key: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + -> Merge Join + Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2) + Merge Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) + -> Sort + Output: xc_groupby_tab1.val + Sort Key: xc_groupby_tab1.val + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val + -> Sort + Output: xc_groupby_tab2.val2, xc_groupby_tab2.val + Sort Key: xc_groupby_tab2.val + -> Seq Scan on public.xc_groupby_tab2 + Output: xc_groupby_tab2.val2, xc_groupby_tab2.val +(30 rows) + +-- group by with aggregates in expression +select count(*) + sum(val) + avg(val), val2 from xc_groupby_tab1 group by val2 order by 1; + ?column? 
| val2 +---------------------+------ + 11.0000000000000000 | 1 + 14.0000000000000000 | 2 + 17.6666666666666667 | 3 +(3 rows) + +explain (verbose true, costs false, nodes false) select count(*) + sum(val) + avg(val), val2 from xc_groupby_tab1 group by val2 order by 1; + QUERY PLAN +---------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: (((count(*) + sum(val)))::numeric + avg(val)), val2 + Sort Key: (((count(*) + sum(xc_groupby_tab1.val)))::numeric + avg(xc_groupby_tab1.val)) + -> Sort + Output: ((((count(*) + sum(val)))::numeric + avg(val))), val2 + Sort Key: ((((count(*) + sum(xc_groupby_tab1.val)))::numeric + avg(xc_groupby_tab1.val))) + -> Finalize GroupAggregate + Output: (((count(*) + sum(val)))::numeric + avg(val)), val2 + Group Key: xc_groupby_tab1.val2 + -> Sort + Output: val2, (PARTIAL count(*)), (PARTIAL sum(val)), (PARTIAL avg(val)) + Sort Key: xc_groupby_tab1.val2 + -> Remote Subquery Scan on all + Output: val2, PARTIAL count(*), PARTIAL sum(val), PARTIAL avg(val) + Distribute results by H: val2 + -> Partial GroupAggregate + Output: val2, PARTIAL count(*), PARTIAL sum(val), PARTIAL avg(val) + Group Key: xc_groupby_tab1.val2 + -> Sort + Output: val2, val + Sort Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: val2, val +(23 rows) + +explain (verbose true, costs false, nodes false) select count(*) + sum(val) + avg(val), val2 from xc_groupby_tab1 group by val2; + QUERY PLAN +---------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: (((count(*) + sum(val)))::numeric + avg(val)), val2 + -> Finalize GroupAggregate + Output: (((count(*) + sum(val)))::numeric + avg(val)), val2 + Group Key: xc_groupby_tab1.val2 + -> Sort + Output: val2, (PARTIAL count(*)), (PARTIAL sum(val)), (PARTIAL avg(val)) + Sort Key: xc_groupby_tab1.val2 + -> Remote Subquery Scan on all + Output: val2, PARTIAL count(*), PARTIAL sum(val), PARTIAL avg(val) + Distribute results by H: val2 + -> Partial GroupAggregate + Output: val2, PARTIAL count(*), PARTIAL sum(val), PARTIAL avg(val) + Group Key: xc_groupby_tab1.val2 + -> Sort + Output: val2, val + Sort Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: val2, val +(19 rows) + +-- group by with expressions in group by clause +select sum(val), avg(val), 2 * val2 from xc_groupby_tab1 group by 2 * val2 order by 3; + sum | avg | ?column? 
+-----+--------------------+---------- + 6 | 2.0000000000000000 | 2 + 8 | 4.0000000000000000 | 4 + 11 | 3.6666666666666667 | 6 +(3 rows) + +explain (verbose true, costs false, nodes false) select sum(val), avg(val), 2 * val2 from xc_groupby_tab1 group by 2 * val2 order by 3; + QUERY PLAN +------------------------------------------------------------------------------------ + Remote Subquery Scan on all + Output: sum(val), avg(val), (2 * val2) + Sort Key: (2 * xc_groupby_tab1.val2) + -> Finalize GroupAggregate + Output: sum(val), avg(val), ((2 * val2)) + Group Key: ((2 * xc_groupby_tab1.val2)) + -> Sort + Output: ((2 * val2)), (PARTIAL sum(val)), (PARTIAL avg(val)) + Sort Key: ((2 * xc_groupby_tab1.val2)) + -> Remote Subquery Scan on all + Output: (2 * val2), PARTIAL sum(val), PARTIAL avg(val) + Distribute results by H: (2 * val2) + -> Partial GroupAggregate + Output: ((2 * val2)), PARTIAL sum(val), PARTIAL avg(val) + Group Key: ((2 * xc_groupby_tab1.val2)) + -> Sort + Output: ((2 * val2)), val + Sort Key: ((2 * xc_groupby_tab1.val2)) + -> Seq Scan on public.xc_groupby_tab1 + Output: (2 * val2), val +(20 rows) + +explain (verbose true, costs false, nodes false) select sum(val), avg(val), 2 * val2 from xc_groupby_tab1 group by 2 * val2; + QUERY PLAN +------------------------------------------------------------------------------------ + Remote Subquery Scan on all + Output: sum(val), avg(val), (2 * val2) + -> Finalize GroupAggregate + Output: sum(val), avg(val), ((2 * val2)) + Group Key: ((2 * xc_groupby_tab1.val2)) + -> Sort + Output: ((2 * val2)), (PARTIAL sum(val)), (PARTIAL avg(val)) + Sort Key: ((2 * xc_groupby_tab1.val2)) + -> Remote Subquery Scan on all + Output: (2 * val2), PARTIAL sum(val), PARTIAL avg(val) + Distribute results by H: (2 * val2) + -> Partial GroupAggregate + Output: ((2 * val2)), PARTIAL sum(val), PARTIAL avg(val) + Group Key: ((2 * xc_groupby_tab1.val2)) + -> Sort + Output: ((2 * val2)), val + Sort Key: ((2 * xc_groupby_tab1.val2)) + -> Seq Scan on public.xc_groupby_tab1 + Output: (2 * val2), val +(19 rows) + +drop table xc_groupby_tab1; +drop table xc_groupby_tab2; +-- some tests involving nulls, characters, float type etc. 
+create table xc_groupby_def(a int, b varchar(25)); +insert into xc_groupby_def VALUES (NULL, NULL); +insert into xc_groupby_def VALUES (1, NULL); +insert into xc_groupby_def VALUES (NULL, 'One'); +insert into xc_groupby_def VALUES (2, 'Two'); +insert into xc_groupby_def VALUES (2, 'Two'); +insert into xc_groupby_def VALUES (3, 'Three'); +insert into xc_groupby_def VALUES (4, 'Three'); +insert into xc_groupby_def VALUES (5, 'Three'); +insert into xc_groupby_def VALUES (6, 'Two'); +insert into xc_groupby_def VALUES (7, NULL); +insert into xc_groupby_def VALUES (8, 'Two'); +insert into xc_groupby_def VALUES (9, 'Three'); +insert into xc_groupby_def VALUES (10, 'Three'); +select a,count(a) from xc_groupby_def group by a order by a; + a | count +----+------- + 1 | 1 + 2 | 2 + 3 | 1 + 4 | 1 + 5 | 1 + 6 | 1 + 7 | 1 + 8 | 1 + 9 | 1 + 10 | 1 + | 0 +(11 rows) + +explain (verbose true, costs false, nodes false) select a,count(a) from xc_groupby_def group by a order by a; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all + Output: a, count(a) + Sort Key: xc_groupby_def.a + -> GroupAggregate + Output: a, count(a) + Group Key: xc_groupby_def.a + -> Sort + Output: a + Sort Key: xc_groupby_def.a + -> Seq Scan on public.xc_groupby_def + Output: a +(11 rows) + +select avg(a) from xc_groupby_def group by a order by 1; + avg +------------------------ + 1.00000000000000000000 + 2.0000000000000000 + 3.0000000000000000 + 4.0000000000000000 + 5.0000000000000000 + 6.0000000000000000 + 7.0000000000000000 + 8.0000000000000000 + 9.0000000000000000 + 10.0000000000000000 + +(11 rows) + +explain (verbose true, costs false, nodes false) select avg(a) from xc_groupby_def group by a order by 1; + QUERY PLAN +----------------------------------------------------------- + Remote Subquery Scan on all + Output: avg(a), a + Sort Key: avg(xc_groupby_def.a) + -> Sort + Output: (avg(a)), a + Sort Key: (avg(xc_groupby_def.a)) + -> GroupAggregate + Output: avg(a), a + Group Key: xc_groupby_def.a + -> Sort + Output: a + Sort Key: xc_groupby_def.a + -> Seq Scan on public.xc_groupby_def + Output: a +(14 rows) + +select avg(a) from xc_groupby_def group by b order by 1; + avg +-------------------- + 4.0000000000000000 + 4.5000000000000000 + 6.2000000000000000 + +(4 rows) + +explain (verbose true, costs false, nodes false) select avg(a) from xc_groupby_def group by b; + QUERY PLAN +----------------------------------------------------------------------- + Remote Subquery Scan on all + Output: avg(a), b + -> Finalize GroupAggregate + Output: avg(a), b + Group Key: xc_groupby_def.b + -> Sort + Output: b, (PARTIAL avg(a)) + Sort Key: xc_groupby_def.b + -> Remote Subquery Scan on all + Output: b, PARTIAL avg(a) + Distribute results by H: b + -> Partial GroupAggregate + Output: b, PARTIAL avg(a) + Group Key: xc_groupby_def.b + -> Sort + Output: b, a + Sort Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: b, a +(19 rows) + +select sum(a) from xc_groupby_def group by b order by 1; + sum +----- + 8 + 18 + 31 + +(4 rows) + +explain (verbose true, costs false, nodes false) select sum(a) from xc_groupby_def group by b; + QUERY PLAN +----------------------------------------------------------------------- + Remote Subquery Scan on all + Output: sum(a), b + -> Finalize GroupAggregate + Output: sum(a), b + Group Key: xc_groupby_def.b + -> Sort + Output: b, (PARTIAL sum(a)) + Sort Key: xc_groupby_def.b + -> Remote Subquery Scan on all + Output: b, PARTIAL sum(a) + Distribute results by 
H: b + -> Partial GroupAggregate + Output: b, PARTIAL sum(a) + Group Key: xc_groupby_def.b + -> Sort + Output: b, a + Sort Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: b, a +(19 rows) + +select count(*) from xc_groupby_def group by b order by 1; + count +------- + 1 + 3 + 4 + 5 +(4 rows) + +explain (verbose true, costs false, nodes false) select count(*) from xc_groupby_def group by b; + QUERY PLAN +----------------------------------------------------------------------- + Remote Subquery Scan on all + Output: count(*), b + -> Finalize GroupAggregate + Output: count(*), b + Group Key: xc_groupby_def.b + -> Sort + Output: b, (PARTIAL count(*)) + Sort Key: xc_groupby_def.b + -> Remote Subquery Scan on all + Output: b, PARTIAL count(*) + Distribute results by H: b + -> Partial GroupAggregate + Output: b, PARTIAL count(*) + Group Key: xc_groupby_def.b + -> Sort + Output: b + Sort Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: b +(19 rows) + +select count(*) from xc_groupby_def where a is not null group by a order by 1; + count +------- + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 2 +(10 rows) + +explain (verbose true, costs false, nodes false) select count(*) from xc_groupby_def where a is not null group by a; + QUERY PLAN +------------------------------------------------------------ + Remote Subquery Scan on all + Output: count(*), a + -> GroupAggregate + Output: count(*), a + Group Key: xc_groupby_def.a + -> Sort + Output: a + Sort Key: xc_groupby_def.a + -> Seq Scan on public.xc_groupby_def + Output: a + Filter: (xc_groupby_def.a IS NOT NULL) +(11 rows) + +select b from xc_groupby_def group by b order by 1; + b +------- + One + Three + Two + +(4 rows) + +explain (verbose true, costs false, nodes false) select b from xc_groupby_def group by b; + QUERY PLAN +----------------------------------------------------------------------- + Remote Subquery Scan on all + Output: b + -> Group + Output: b + Group Key: xc_groupby_def.b + -> Sort + Output: b + Sort Key: xc_groupby_def.b + -> Remote Subquery Scan on all + Output: b + Distribute results by H: b + -> Group + Output: b + Group Key: xc_groupby_def.b + -> Sort + Output: b + Sort Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: b +(19 rows) + +select b,count(b) from xc_groupby_def group by b order by 1; + b | count +-------+------- + One | 1 + Three | 5 + Two | 4 + | 0 +(4 rows) + +explain (verbose true, costs false, nodes false) select b,count(b) from xc_groupby_def group by b; + QUERY PLAN +----------------------------------------------------------------------- + Remote Subquery Scan on all + Output: b, count(b) + -> Finalize GroupAggregate + Output: b, count(b) + Group Key: xc_groupby_def.b + -> Sort + Output: b, (PARTIAL count(b)) + Sort Key: xc_groupby_def.b + -> Remote Subquery Scan on all + Output: b, PARTIAL count(b) + Distribute results by H: b + -> Partial GroupAggregate + Output: b, PARTIAL count(b) + Group Key: xc_groupby_def.b + -> Sort + Output: b + Sort Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: b +(19 rows) + +select count(*) from xc_groupby_def where b is null group by b order by 1; + count +------- + 3 +(1 row) + +explain (verbose true, costs false, nodes false) select count(*) from xc_groupby_def where b is null group by b; + QUERY PLAN +-------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: count(*), b + -> Finalize GroupAggregate + Output: count(*), b + Group Key: 
xc_groupby_def.b + -> Sort + Output: b, (PARTIAL count(*)) + Sort Key: xc_groupby_def.b + -> Remote Subquery Scan on all + Output: b, PARTIAL count(*) + Distribute results by H: b + -> Partial GroupAggregate + Output: b, PARTIAL count(*) + Group Key: xc_groupby_def.b + -> Sort + Output: b + Sort Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: b + Filter: (xc_groupby_def.b IS NULL) +(20 rows) + +create table xc_groupby_g(a int, b float, c numeric); +insert into xc_groupby_g values(1,2.1,3.2); +insert into xc_groupby_g values(1,2.1,3.2); +insert into xc_groupby_g values(2,2.3,5.2); +select sum(a) from xc_groupby_g group by a order by 1; + sum +----- + 2 + 2 +(2 rows) + +explain (verbose true, costs false, nodes false) select sum(a) from xc_groupby_g group by a; + QUERY PLAN +--------------------------------------------------- + Remote Subquery Scan on all + Output: sum(a), a + -> GroupAggregate + Output: sum(a), a + Group Key: xc_groupby_g.a + -> Sort + Output: a + Sort Key: xc_groupby_g.a + -> Seq Scan on public.xc_groupby_g + Output: a +(10 rows) + +select sum(b) from xc_groupby_g group by b order by 1; + sum +----- + 2.3 + 4.2 +(2 rows) + +explain (verbose true, costs false, nodes false) select sum(b) from xc_groupby_g group by b; + QUERY PLAN +--------------------------------------------------------------------- + Remote Subquery Scan on all + Output: sum(b), b + -> Finalize GroupAggregate + Output: sum(b), b + Group Key: xc_groupby_g.b + -> Sort + Output: b, (PARTIAL sum(b)) + Sort Key: xc_groupby_g.b + -> Remote Subquery Scan on all + Output: b, PARTIAL sum(b) + Distribute results by H: b + -> Partial GroupAggregate + Output: b, PARTIAL sum(b) + Group Key: xc_groupby_g.b + -> Sort + Output: b + Sort Key: xc_groupby_g.b + -> Seq Scan on public.xc_groupby_g + Output: b +(19 rows) + +select sum(c) from xc_groupby_g group by b order by 1; + sum +----- + 5.2 + 6.4 +(2 rows) + +explain (verbose true, costs false, nodes false) select sum(c) from xc_groupby_g group by b; + QUERY PLAN +--------------------------------------------------------------------- + Remote Subquery Scan on all + Output: sum(c), b + -> Finalize GroupAggregate + Output: sum(c), b + Group Key: xc_groupby_g.b + -> Sort + Output: b, (PARTIAL sum(c)) + Sort Key: xc_groupby_g.b + -> Remote Subquery Scan on all + Output: b, PARTIAL sum(c) + Distribute results by H: b + -> Partial GroupAggregate + Output: b, PARTIAL sum(c) + Group Key: xc_groupby_g.b + -> Sort + Output: b, c + Sort Key: xc_groupby_g.b + -> Seq Scan on public.xc_groupby_g + Output: b, c +(19 rows) + +select avg(a) from xc_groupby_g group by b order by 1; + avg +------------------------ + 1.00000000000000000000 + 2.0000000000000000 +(2 rows) + +explain (verbose true, costs false, nodes false) select avg(a) from xc_groupby_g group by b; + QUERY PLAN +--------------------------------------------------------------------- + Remote Subquery Scan on all + Output: avg(a), b + -> Finalize GroupAggregate + Output: avg(a), b + Group Key: xc_groupby_g.b + -> Sort + Output: b, (PARTIAL avg(a)) + Sort Key: xc_groupby_g.b + -> Remote Subquery Scan on all + Output: b, PARTIAL avg(a) + Distribute results by H: b + -> Partial GroupAggregate + Output: b, PARTIAL avg(a) + Group Key: xc_groupby_g.b + -> Sort + Output: b, a + Sort Key: xc_groupby_g.b + -> Seq Scan on public.xc_groupby_g + Output: b, a +(19 rows) + +select avg(b) from xc_groupby_g group by c order by 1; + avg +----- + 2.1 + 2.3 +(2 rows) + +explain (verbose true, costs false, nodes false) select 
avg(b) from xc_groupby_g group by c; + QUERY PLAN +--------------------------------------------------------------------- + Remote Subquery Scan on all + Output: avg(b), c + -> Finalize GroupAggregate + Output: avg(b), c + Group Key: xc_groupby_g.c + -> Sort + Output: c, (PARTIAL avg(b)) + Sort Key: xc_groupby_g.c + -> Remote Subquery Scan on all + Output: c, PARTIAL avg(b) + Distribute results by H: c + -> Partial GroupAggregate + Output: c, PARTIAL avg(b) + Group Key: xc_groupby_g.c + -> Sort + Output: c, b + Sort Key: xc_groupby_g.c + -> Seq Scan on public.xc_groupby_g + Output: c, b +(19 rows) + +select avg(c) from xc_groupby_g group by c order by 1; + avg +-------------------- + 3.2000000000000000 + 5.2000000000000000 +(2 rows) + +explain (verbose true, costs false, nodes false) select avg(c) from xc_groupby_g group by c; + QUERY PLAN +--------------------------------------------------------------------- + Remote Subquery Scan on all + Output: avg(c), c + -> Finalize GroupAggregate + Output: avg(c), c + Group Key: xc_groupby_g.c + -> Sort + Output: c, (PARTIAL avg(c)) + Sort Key: xc_groupby_g.c + -> Remote Subquery Scan on all + Output: c, PARTIAL avg(c) + Distribute results by H: c + -> Partial GroupAggregate + Output: c, PARTIAL avg(c) + Group Key: xc_groupby_g.c + -> Sort + Output: c + Sort Key: xc_groupby_g.c + -> Seq Scan on public.xc_groupby_g + Output: c +(19 rows) + +drop table xc_groupby_def; +drop table xc_groupby_g; +-- Combination 4 enable_hashagg off and replicated tables. +-- repeat the same tests for replicated tables +-- create required tables and fill them with data +create table xc_groupby_tab1 (val int, val2 int) distribute by replication; +create table xc_groupby_tab2 (val int, val2 int) distribute by replication; +insert into xc_groupby_tab1 values (1, 1), (2, 1), (3, 1), (2, 2), (6, 2), (4, 3), (1, 3), (6, 3); +insert into xc_groupby_tab2 values (1, 1), (4, 1), (8, 1), (2, 4), (9, 4), (3, 4), (4, 2), (5, 2), (3, 2); +select count(*), sum(val), avg(val), sum(val)::float8/count(*), val2 from xc_groupby_tab1 group by val2; + count | sum | avg | ?column? | val2 +-------+-----+--------------------+------------------+------ + 3 | 6 | 2.0000000000000000 | 2 | 1 + 2 | 8 | 4.0000000000000000 | 4 | 2 + 3 | 11 | 3.6666666666666667 | 3.66666666666667 | 3 +(3 rows) + +explain (verbose true, costs false, nodes false) select count(*), sum(val), avg(val), sum(val)::float8/count(*), val2 from xc_groupby_tab1 group by val2; + QUERY PLAN +------------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: count(*), sum(val), avg(val), ((sum(val))::double precision / (count(*))::double precision), val2 + -> GroupAggregate + Output: count(*), sum(val), avg(val), ((sum(val))::double precision / (count(*))::double precision), val2 + Group Key: xc_groupby_tab1.val2 + -> Sort + Output: val2, val + Sort Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: val2, val +(10 rows) + +-- joins and group by +select count(*), sum(xc_groupby_tab1.val * xc_groupby_tab2.val), avg(xc_groupby_tab1.val*xc_groupby_tab2.val), sum(xc_groupby_tab1.val*xc_groupby_tab2.val)::float8/count(*), xc_groupby_tab1.val2, xc_groupby_tab2.val2 from xc_groupby_tab1 full outer join xc_groupby_tab2 on xc_groupby_tab1.val2 = xc_groupby_tab2.val2 group by xc_groupby_tab1.val2, xc_groupby_tab2.val2 order by count(*); + count | sum | avg | ?column? 
| val2 | val2 +-------+-----+---------------------+------------------+------+------ + 3 | | | | 3 | + 3 | | | | | 4 + 6 | 96 | 16.0000000000000000 | 16 | 2 | 2 + 9 | 78 | 8.6666666666666667 | 8.66666666666667 | 1 | 1 +(4 rows) + +explain (verbose true, costs false, nodes false) select count(*), sum(xc_groupby_tab1.val * xc_groupby_tab2.val), avg(xc_groupby_tab1.val*xc_groupby_tab2.val), sum(xc_groupby_tab1.val*xc_groupby_tab2.val)::float8/count(*), xc_groupby_tab1.val2, xc_groupby_tab2.val2 from xc_groupby_tab1 full outer join xc_groupby_tab2 on xc_groupby_tab1.val2 = xc_groupby_tab2.val2 group by xc_groupby_tab1.val2, xc_groupby_tab2.val2; + QUERY PLAN +--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: count(*), sum((xc_groupby_tab1.val * xc_groupby_tab2.val)), avg((xc_groupby_tab1.val * xc_groupby_tab2.val)), ((sum((xc_groupby_tab1.val * xc_groupby_tab2.val)))::double precision / (count(*))::double precision), xc_groupby_tab1.val2, xc_groupby_tab2.val2 + -> GroupAggregate + Output: count(*), sum((xc_groupby_tab1.val * xc_groupby_tab2.val)), avg((xc_groupby_tab1.val * xc_groupby_tab2.val)), ((sum((xc_groupby_tab1.val * xc_groupby_tab2.val)))::double precision / (count(*))::double precision), xc_groupby_tab1.val2, xc_groupby_tab2.val2 + Group Key: xc_groupby_tab1.val2, xc_groupby_tab2.val2 + -> Sort + Output: xc_groupby_tab1.val2, xc_groupby_tab2.val2, xc_groupby_tab1.val, xc_groupby_tab2.val + Sort Key: xc_groupby_tab1.val2, xc_groupby_tab2.val2 + -> Merge Full Join + Output: xc_groupby_tab1.val2, xc_groupby_tab2.val2, xc_groupby_tab1.val, xc_groupby_tab2.val + Merge Cond: (xc_groupby_tab1.val2 = xc_groupby_tab2.val2) + -> Sort + Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 + Sort Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val, xc_groupby_tab1.val2 + -> Sort + Output: xc_groupby_tab2.val, xc_groupby_tab2.val2 + Sort Key: xc_groupby_tab2.val2 + -> Seq Scan on public.xc_groupby_tab2 + Output: xc_groupby_tab2.val, xc_groupby_tab2.val2 +(21 rows) + +-- aggregates over aggregates +select sum(y) from (select sum(val) y, val2%2 x from xc_groupby_tab1 group by val2) q1 group by x; + sum +----- + 8 + 17 +(2 rows) + +explain (verbose true, costs false, nodes false) select sum(y) from (select sum(val) y, val2%2 x from xc_groupby_tab1 group by val2) q1 group by x; + QUERY PLAN +-------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: sum(q1.y), q1.x + -> GroupAggregate + Output: sum(q1.y), q1.x + Group Key: q1.x + -> Sort + Output: q1.x, q1.y + Sort Key: q1.x + -> Subquery Scan on q1 + Output: q1.x, q1.y + -> GroupAggregate + Output: sum(xc_groupby_tab1.val), (xc_groupby_tab1.val2 % 2), xc_groupby_tab1.val2 + Group Key: xc_groupby_tab1.val2 + -> Sort + Output: xc_groupby_tab1.val2, xc_groupby_tab1.val + Sort Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val2, xc_groupby_tab1.val +(18 rows) + +-- group by without aggregate +select val2 from xc_groupby_tab1 group by val2 order by 1; + val2 +------ + 1 + 2 + 3 +(3 rows) + +explain (verbose true, costs false, nodes false) select val2 from xc_groupby_tab1 group by val2; + QUERY PLAN 
+------------------------------------------------------ + Remote Subquery Scan on all + Output: val2 + -> Group + Output: val2 + Group Key: xc_groupby_tab1.val2 + -> Sort + Output: val2 + Sort Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: val2 +(10 rows) + +select val + val2 from xc_groupby_tab1 group by val + val2 order by 1; + ?column? +---------- + 2 + 3 + 4 + 7 + 8 + 9 +(6 rows) + +explain (verbose true, costs false, nodes false) select val + val2 from xc_groupby_tab1 group by val + val2; + QUERY PLAN +------------------------------------------------------------------------ + Remote Subquery Scan on all + Output: (val + val2) + -> Group + Output: ((val + val2)) + Group Key: ((xc_groupby_tab1.val + xc_groupby_tab1.val2)) + -> Sort + Output: ((val + val2)) + Sort Key: ((xc_groupby_tab1.val + xc_groupby_tab1.val2)) + -> Seq Scan on public.xc_groupby_tab1 + Output: (val + val2) +(10 rows) + +select val + val2, val, val2 from xc_groupby_tab1 group by val, val2; + ?column? | val | val2 +----------+-----+------ + 2 | 1 | 1 + 4 | 1 | 3 + 3 | 2 | 1 + 4 | 2 | 2 + 4 | 3 | 1 + 7 | 4 | 3 + 8 | 6 | 2 + 9 | 6 | 3 +(8 rows) + +explain (verbose true, costs false, nodes false) select val + val2, val, val2 from xc_groupby_tab1 group by val, val2; + QUERY PLAN +------------------------------------------------------------------- + Remote Subquery Scan on all + Output: (val + val2), val, val2 + -> Group + Output: (val + val2), val, val2 + Group Key: xc_groupby_tab1.val, xc_groupby_tab1.val2 + -> Sort + Output: val, val2 + Sort Key: xc_groupby_tab1.val, xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: val, val2 +(10 rows) + +select xc_groupby_tab1.val + xc_groupby_tab2.val2, xc_groupby_tab1.val, xc_groupby_tab2.val2 from xc_groupby_tab1, xc_groupby_tab2 where xc_groupby_tab1.val = xc_groupby_tab2.val group by xc_groupby_tab1.val, xc_groupby_tab2.val2 order by 1; + ?column? | val | val2 +----------+-----+------ + 2 | 1 | 1 + 5 | 3 | 2 + 5 | 4 | 1 + 6 | 2 | 4 + 6 | 4 | 2 + 7 | 3 | 4 +(6 rows) + +explain (verbose true, costs false, nodes false) select xc_groupby_tab1.val + xc_groupby_tab2.val2, xc_groupby_tab1.val, xc_groupby_tab2.val2 from xc_groupby_tab1, xc_groupby_tab2 where xc_groupby_tab1.val = xc_groupby_tab2.val group by xc_groupby_tab1.val, xc_groupby_tab2.val2; + QUERY PLAN +--------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2), xc_groupby_tab1.val, xc_groupby_tab2.val2 + -> Group + Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2), xc_groupby_tab1.val, xc_groupby_tab2.val2 + Group Key: xc_groupby_tab1.val, xc_groupby_tab2.val2 + -> Sort + Output: xc_groupby_tab1.val, xc_groupby_tab2.val2 + Sort Key: xc_groupby_tab1.val, xc_groupby_tab2.val2 + -> Merge Join + Output: xc_groupby_tab1.val, xc_groupby_tab2.val2 + Merge Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) + -> Sort + Output: xc_groupby_tab1.val + Sort Key: xc_groupby_tab1.val + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val + -> Sort + Output: xc_groupby_tab2.val2, xc_groupby_tab2.val + Sort Key: xc_groupby_tab2.val + -> Seq Scan on public.xc_groupby_tab2 + Output: xc_groupby_tab2.val2, xc_groupby_tab2.val +(21 rows) + +select xc_groupby_tab1.val + xc_groupby_tab2.val2 from xc_groupby_tab1, xc_groupby_tab2 where xc_groupby_tab1.val = xc_groupby_tab2.val group by xc_groupby_tab1.val + xc_groupby_tab2.val2 order by 1; + ?column? 
+---------- + 2 + 5 + 6 + 7 +(4 rows) + +explain (verbose true, costs false, nodes false) select xc_groupby_tab1.val + xc_groupby_tab2.val2 from xc_groupby_tab1, xc_groupby_tab2 where xc_groupby_tab1.val = xc_groupby_tab2.val group by xc_groupby_tab1.val + xc_groupby_tab2.val2; + QUERY PLAN +----------------------------------------------------------------------------------- + Remote Subquery Scan on all + Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2) + -> Group + Output: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + Group Key: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + -> Sort + Output: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + Sort Key: ((xc_groupby_tab1.val + xc_groupby_tab2.val2)) + -> Merge Join + Output: (xc_groupby_tab1.val + xc_groupby_tab2.val2) + Merge Cond: (xc_groupby_tab1.val = xc_groupby_tab2.val) + -> Sort + Output: xc_groupby_tab1.val + Sort Key: xc_groupby_tab1.val + -> Seq Scan on public.xc_groupby_tab1 + Output: xc_groupby_tab1.val + -> Sort + Output: xc_groupby_tab2.val2, xc_groupby_tab2.val + Sort Key: xc_groupby_tab2.val + -> Seq Scan on public.xc_groupby_tab2 + Output: xc_groupby_tab2.val2, xc_groupby_tab2.val +(21 rows) + +-- group by with aggregates in expression +select count(*) + sum(val) + avg(val), val2 from xc_groupby_tab1 group by val2 order by 1; + ?column? | val2 +---------------------+------ + 11.0000000000000000 | 1 + 14.0000000000000000 | 2 + 17.6666666666666667 | 3 +(3 rows) + +explain (verbose true, costs false, nodes false) select count(*) + sum(val) + avg(val), val2 from xc_groupby_tab1 group by val2; + QUERY PLAN +--------------------------------------------------------------------- + Remote Subquery Scan on all + Output: (((count(*) + sum(val)))::numeric + avg(val)), val2 + -> GroupAggregate + Output: (((count(*) + sum(val)))::numeric + avg(val)), val2 + Group Key: xc_groupby_tab1.val2 + -> Sort + Output: val2, val + Sort Key: xc_groupby_tab1.val2 + -> Seq Scan on public.xc_groupby_tab1 + Output: val2, val +(10 rows) + +-- group by with expressions in group by clause +select sum(val), avg(val), 2 * val2 from xc_groupby_tab1 group by 2 * val2 order by 1; + sum | avg | ?column? +-----+--------------------+---------- + 6 | 2.0000000000000000 | 2 + 8 | 4.0000000000000000 | 4 + 11 | 3.6666666666666667 | 6 +(3 rows) + +explain (verbose true, costs false, nodes false) select sum(val), avg(val), 2 * val2 from xc_groupby_tab1 group by 2 * val2; + QUERY PLAN +------------------------------------------------------ + Remote Subquery Scan on all + Output: sum(val), avg(val), (2 * val2) + -> GroupAggregate + Output: sum(val), avg(val), ((2 * val2)) + Group Key: ((2 * xc_groupby_tab1.val2)) + -> Sort + Output: ((2 * val2)), val + Sort Key: ((2 * xc_groupby_tab1.val2)) + -> Seq Scan on public.xc_groupby_tab1 + Output: (2 * val2), val +(10 rows) + +drop table xc_groupby_tab1; +drop table xc_groupby_tab2; +-- some tests involving nulls, characters, float type etc. 
+create table xc_groupby_def(a int, b varchar(25)) distribute by replication; +insert into xc_groupby_def VALUES (NULL, NULL); +insert into xc_groupby_def VALUES (1, NULL); +insert into xc_groupby_def VALUES (NULL, 'One'); +insert into xc_groupby_def VALUES (2, 'Two'); +insert into xc_groupby_def VALUES (2, 'Two'); +insert into xc_groupby_def VALUES (3, 'Three'); +insert into xc_groupby_def VALUES (4, 'Three'); +insert into xc_groupby_def VALUES (5, 'Three'); +insert into xc_groupby_def VALUES (6, 'Two'); +insert into xc_groupby_def VALUES (7, NULL); +insert into xc_groupby_def VALUES (8, 'Two'); +insert into xc_groupby_def VALUES (9, 'Three'); +insert into xc_groupby_def VALUES (10, 'Three'); +select a,count(a) from xc_groupby_def group by a order by a; + a | count +----+------- + 1 | 1 + 2 | 2 + 3 | 1 + 4 | 1 + 5 | 1 + 6 | 1 + 7 | 1 + 8 | 1 + 9 | 1 + 10 | 1 + | 0 +(11 rows) + +explain (verbose true, costs false, nodes false) select a,count(a) from xc_groupby_def group by a order by a; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all + Output: a, count(a) + -> GroupAggregate + Output: a, count(a) + Group Key: xc_groupby_def.a + -> Sort + Output: a + Sort Key: xc_groupby_def.a + -> Seq Scan on public.xc_groupby_def + Output: a +(10 rows) + +select avg(a) from xc_groupby_def group by a; + avg +------------------------ + 1.00000000000000000000 + 2.0000000000000000 + 3.0000000000000000 + 4.0000000000000000 + 5.0000000000000000 + 6.0000000000000000 + 7.0000000000000000 + 8.0000000000000000 + 9.0000000000000000 + 10.0000000000000000 + +(11 rows) + +explain (verbose true, costs false, nodes false) select avg(a) from xc_groupby_def group by a; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all + Output: avg(a), a + -> GroupAggregate + Output: avg(a), a + Group Key: xc_groupby_def.a + -> Sort + Output: a + Sort Key: xc_groupby_def.a + -> Seq Scan on public.xc_groupby_def + Output: a +(10 rows) + +select avg(a) from xc_groupby_def group by a; + avg +------------------------ + 1.00000000000000000000 + 2.0000000000000000 + 3.0000000000000000 + 4.0000000000000000 + 5.0000000000000000 + 6.0000000000000000 + 7.0000000000000000 + 8.0000000000000000 + 9.0000000000000000 + 10.0000000000000000 + +(11 rows) + +explain (verbose true, costs false, nodes false) select avg(a) from xc_groupby_def group by a; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all + Output: avg(a), a + -> GroupAggregate + Output: avg(a), a + Group Key: xc_groupby_def.a + -> Sort + Output: a + Sort Key: xc_groupby_def.a + -> Seq Scan on public.xc_groupby_def + Output: a +(10 rows) + +select avg(a) from xc_groupby_def group by b order by 1; + avg +-------------------- + 4.0000000000000000 + 4.5000000000000000 + 6.2000000000000000 + +(4 rows) + +explain (verbose true, costs false, nodes false) select avg(a) from xc_groupby_def group by b; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all + Output: avg(a), b + -> GroupAggregate + Output: avg(a), b + Group Key: xc_groupby_def.b + -> Sort + Output: b, a + Sort Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: b, a +(10 rows) + +select sum(a) from xc_groupby_def group by b order by 1; + sum +----- + 8 + 18 + 31 + +(4 rows) + +explain (verbose true, costs false, nodes false) select sum(a) from xc_groupby_def group by b; + QUERY PLAN 
+----------------------------------------------------- + Remote Subquery Scan on all + Output: sum(a), b + -> GroupAggregate + Output: sum(a), b + Group Key: xc_groupby_def.b + -> Sort + Output: b, a + Sort Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: b, a +(10 rows) + +select count(*) from xc_groupby_def group by b; + count +------- + 1 + 5 + 4 + 3 +(4 rows) + +explain (verbose true, costs false, nodes false) select count(*) from xc_groupby_def group by b; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all + Output: count(*), b + -> GroupAggregate + Output: count(*), b + Group Key: xc_groupby_def.b + -> Sort + Output: b + Sort Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: b +(10 rows) + +select count(*) from xc_groupby_def where a is not null group by a; + count +------- + 1 + 2 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 +(10 rows) + +explain (verbose true, costs false, nodes false) select count(*) from xc_groupby_def where a is not null group by a; + QUERY PLAN +------------------------------------------------------------ + Remote Subquery Scan on all + Output: count(*), a + -> GroupAggregate + Output: count(*), a + Group Key: xc_groupby_def.a + -> Sort + Output: a + Sort Key: xc_groupby_def.a + -> Seq Scan on public.xc_groupby_def + Output: a + Filter: (xc_groupby_def.a IS NOT NULL) +(11 rows) + +select b from xc_groupby_def group by b order by 1; + b +------- + One + Three + Two + +(4 rows) + +explain (verbose true, costs false, nodes false) select b from xc_groupby_def group by b; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all + Output: b + -> Group + Output: b + Group Key: xc_groupby_def.b + -> Sort + Output: b + Sort Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: b +(10 rows) + +select b,count(b) from xc_groupby_def group by b order by 1; + b | count +-------+------- + One | 1 + Three | 5 + Two | 4 + | 0 +(4 rows) + +explain (verbose true, costs false, nodes false) select b,count(b) from xc_groupby_def group by b; + QUERY PLAN +----------------------------------------------------- + Remote Subquery Scan on all + Output: b, count(b) + -> GroupAggregate + Output: b, count(b) + Group Key: xc_groupby_def.b + -> Sort + Output: b + Sort Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: b +(10 rows) + +select count(*) from xc_groupby_def where b is null group by b; + count +------- + 3 +(1 row) + +explain (verbose true, costs false, nodes false) select count(*) from xc_groupby_def where b is null group by b; + QUERY PLAN +-------------------------------------------------------- + Remote Subquery Scan on all + Output: count(*), b + -> GroupAggregate + Output: count(*), b + Group Key: xc_groupby_def.b + -> Sort + Output: b + Sort Key: xc_groupby_def.b + -> Seq Scan on public.xc_groupby_def + Output: b + Filter: (xc_groupby_def.b IS NULL) +(11 rows) + +create table xc_groupby_g(a int, b float, c numeric) distribute by replication; +insert into xc_groupby_g values(1,2.1,3.2); +insert into xc_groupby_g values(1,2.1,3.2); +insert into xc_groupby_g values(2,2.3,5.2); +select sum(a) from xc_groupby_g group by a; + sum +----- + 2 + 2 +(2 rows) + +explain (verbose true, costs false, nodes false) select sum(a) from xc_groupby_g group by a; + QUERY PLAN +--------------------------------------------------- + Remote Subquery Scan on all + Output: sum(a), a + -> GroupAggregate + Output: sum(a), a + Group Key: xc_groupby_g.a 
+ -> Sort + Output: a + Sort Key: xc_groupby_g.a + -> Seq Scan on public.xc_groupby_g + Output: a +(10 rows) + +select sum(b) from xc_groupby_g group by b; + sum +----- + 4.2 + 2.3 +(2 rows) + +explain (verbose true, costs false, nodes false) select sum(b) from xc_groupby_g group by b; + QUERY PLAN +--------------------------------------------------- + Remote Subquery Scan on all + Output: sum(b), b + -> GroupAggregate + Output: sum(b), b + Group Key: xc_groupby_g.b + -> Sort + Output: b + Sort Key: xc_groupby_g.b + -> Seq Scan on public.xc_groupby_g + Output: b +(10 rows) + +select sum(c) from xc_groupby_g group by b; + sum +----- + 6.4 + 5.2 +(2 rows) + +explain (verbose true, costs false, nodes false) select sum(c) from xc_groupby_g group by b; + QUERY PLAN +--------------------------------------------------- + Remote Subquery Scan on all + Output: sum(c), b + -> GroupAggregate + Output: sum(c), b + Group Key: xc_groupby_g.b + -> Sort + Output: b, c + Sort Key: xc_groupby_g.b + -> Seq Scan on public.xc_groupby_g + Output: b, c +(10 rows) + +select avg(a) from xc_groupby_g group by b; + avg +------------------------ + 1.00000000000000000000 + 2.0000000000000000 +(2 rows) + +explain (verbose true, costs false, nodes false) select avg(a) from xc_groupby_g group by b; + QUERY PLAN +--------------------------------------------------- + Remote Subquery Scan on all + Output: avg(a), b + -> GroupAggregate + Output: avg(a), b + Group Key: xc_groupby_g.b + -> Sort + Output: b, a + Sort Key: xc_groupby_g.b + -> Seq Scan on public.xc_groupby_g + Output: b, a +(10 rows) + +select avg(b) from xc_groupby_g group by c; + avg +----- + 2.1 + 2.3 +(2 rows) + +explain (verbose true, costs false, nodes false) select avg(b) from xc_groupby_g group by c; + QUERY PLAN +--------------------------------------------------- + Remote Subquery Scan on all + Output: avg(b), c + -> GroupAggregate + Output: avg(b), c + Group Key: xc_groupby_g.c + -> Sort + Output: c, b + Sort Key: xc_groupby_g.c + -> Seq Scan on public.xc_groupby_g + Output: c, b +(10 rows) + +select avg(c) from xc_groupby_g group by c; + avg +-------------------- + 3.2000000000000000 + 5.2000000000000000 +(2 rows) + +explain (verbose true, costs false, nodes false) select avg(c) from xc_groupby_g group by c; + QUERY PLAN +--------------------------------------------------- + Remote Subquery Scan on all + Output: avg(c), c + -> GroupAggregate + Output: avg(c), c + Group Key: xc_groupby_g.c + -> Sort + Output: c + Sort Key: xc_groupby_g.c + -> Seq Scan on public.xc_groupby_g + Output: c +(10 rows) + +drop table xc_groupby_def; +drop table xc_groupby_g; +reset enable_hashagg; +reset enable_fast_query_shipping; From 47edcde7bc775bd510e59cf411aeb44f754979c9 Mon Sep 17 00:00:00 2001 From: youngxie Date: Tue, 9 Mar 2021 16:19:34 +0800 Subject: [PATCH 309/578] Allow UNIQUE indexes on partitioned tables If we restrict unique constraints on partitioned tables so that they must always include the partition key, then our standard approach to unique indexes already works --- each unique key is forced to exist within a single partition, so enforcing the unique restriction in each index individually is enough to have it enforced globally. Therefore we can implement unique indexes on partitions by simply removing a few restrictions (and adding others.) 
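As a minimal illustration of the rule described above (object names here are purely hypothetical and do not appear in this patch), a unique index on a partitioned table is accepted only when it covers the partition key:

    -- partition key is (a)
    CREATE TABLE parted_uniq (a int, b text) PARTITION BY RANGE (a);
    CREATE TABLE parted_uniq_p1 PARTITION OF parted_uniq
        FOR VALUES FROM (0) TO (100);
    -- allowed: the index includes the partition key column
    CREATE UNIQUE INDEX parted_uniq_a_idx ON parted_uniq (a);
    -- expected to be rejected: partition key column "a" is not included
    CREATE UNIQUE INDEX parted_uniq_b_idx ON parted_uniq (b);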
Discussion: https://postgr.es/m/20171222212921.hi6hg6pem2w2t36z@alvherre.pgsql Discussion: https://postgr.es/m/20171229230607.3iib6b62fn3uaf47@alvherre.pgsql Reviewed-by: Simon Riggs, Jesper Pedersen, Peter Eisentraut, Jaime Casanova, Amit Langote --- src/backend/bootstrap/bootparse.y | 2 + src/backend/catalog/index.c | 50 ++- src/backend/catalog/pg_constraint.c | 77 +++++ src/backend/catalog/toasting.c | 4 +- src/backend/commands/indexcmds.c | 126 +++++++- src/backend/commands/tablecmds.c | 72 ++++- src/backend/parser/analyze.c | 7 + src/backend/parser/parse_utilcmd.c | 33 +- src/backend/tcop/utility.c | 2 + src/bin/pg_dump/t/002_pg_dump.pl | 65 ++++ src/include/catalog/index.h | 5 +- src/include/catalog/pg_constraint_fn.h | 87 +++--- src/include/commands/defrem.h | 1 + src/include/parser/parse_utilcmd.h | 3 +- src/test/regress/expected/alter_table.out | 8 - src/test/regress/expected/create_index.out | 6 + src/test/regress/expected/create_table.out | 12 - src/test/regress/expected/indexing.out | 293 +++++++++++++++++- src/test/regress/expected/insert_conflict.out | 2 +- .../regress/expected/insert_conflict_1.out | 2 +- src/test/regress/output/tablespace.source | 29 +- src/test/regress/sql/alter_table.sql | 2 - src/test/regress/sql/create_index.sql | 6 + src/test/regress/sql/create_table.sql | 8 - src/test/regress/sql/indexing.sql | 171 +++++++++- 25 files changed, 921 insertions(+), 152 deletions(-) diff --git a/src/backend/bootstrap/bootparse.y b/src/backend/bootstrap/bootparse.y index 128b2e6c..e720c618 100644 --- a/src/backend/bootstrap/bootparse.y +++ b/src/backend/bootstrap/bootparse.y @@ -387,6 +387,7 @@ Boot_DeclareIndexStmt: stmt, $4, InvalidOid, + InvalidOid, false, false, false, @@ -433,6 +434,7 @@ Boot_DeclareUniqueIndexStmt: stmt, $5, InvalidOid, + InvalidOid, false, false, false, diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c index 81c91015..89c9a1ea 100644 --- a/src/backend/catalog/index.c +++ b/src/backend/catalog/index.c @@ -717,6 +717,8 @@ UpdateIndexRelation(Oid indexoid, * nonzero to specify a preselected OID. * parentIndexRelid: if creating an index partition, the OID of the * parent index; otherwise InvalidOid. + * parentConstraintId: if creating a constraint on a partition, the OID + * of the constraint in the parent; otherwise InvalidOid. * relFileNode: normally, pass InvalidOid to get new storage. May be * nonzero to attach an existing valid build. * indexInfo: same info executor uses to insert into the index @@ -748,6 +750,7 @@ UpdateIndexRelation(Oid indexoid, * (only if INDEX_CREATE_ADD_CONSTRAINT is set) * allow_system_table_mods: allow table to be a system catalog * is_internal: if true, post creation hook for new index + * constraintId: if not NULL, receives OID of created constraint * * Returns the OID of the created index. 
*/ @@ -756,6 +759,7 @@ index_create(Relation heapRelation, const char *indexRelationName, Oid indexRelationId, Oid parentIndexRelid, + Oid parentConstraintId, Oid relFileNode, IndexInfo *indexInfo, List *indexColNames, @@ -768,7 +772,8 @@ index_create(Relation heapRelation, bits16 flags, bits16 constr_flags, bool allow_system_table_mods, - bool is_internal) + bool is_internal, + Oid *constraintId) { Oid heapRelationId = RelationGetRelid(heapRelation); Relation pg_class; @@ -1015,6 +1020,7 @@ index_create(Relation heapRelation, if ((flags & INDEX_CREATE_ADD_CONSTRAINT) != 0) { char constraintType; + ObjectAddress localaddr; if (isprimary) constraintType = CONSTRAINT_PRIMARY; @@ -1028,14 +1034,17 @@ index_create(Relation heapRelation, constraintType = 0; /* keep compiler quiet */ } - index_constraint_create(heapRelation, + localaddr = index_constraint_create(heapRelation, indexRelationId, + parentConstraintId, indexInfo, indexRelationName, constraintType, constr_flags, allow_system_table_mods, is_internal); + if (constraintId) + *constraintId = localaddr.objectId; } else { @@ -1206,6 +1215,8 @@ index_create(Relation heapRelation, * * heapRelation: table owning the index (must be suitably locked by caller) * indexRelationId: OID of the index + * parentConstraintId: if constraint is on a partition, the OID of the + * constraint in the parent. * indexInfo: same info executor uses to insert into the index * constraintName: what it say (generally, should match name of index) * constraintType: one of CONSTRAINT_PRIMARY, CONSTRAINT_UNIQUE, or @@ -1223,6 +1234,7 @@ index_create(Relation heapRelation, ObjectAddress index_constraint_create(Relation heapRelation, Oid indexRelationId, + Oid parentConstraintId, IndexInfo *indexInfo, const char *constraintName, char constraintType, @@ -1237,6 +1249,9 @@ index_constraint_create(Relation heapRelation, bool deferrable; bool initdeferred; bool mark_as_primary; + bool islocal; + bool noinherit; + int inhcount; deferrable = (constr_flags & INDEX_CONSTR_CREATE_DEFERRABLE) != 0; initdeferred = (constr_flags & INDEX_CONSTR_CREATE_INIT_DEFERRED) != 0; @@ -1271,6 +1286,19 @@ index_constraint_create(Relation heapRelation, deleteDependencyRecordsForClass(RelationRelationId, indexRelationId, RelationRelationId, DEPENDENCY_AUTO); + if (OidIsValid(parentConstraintId)) + { + islocal = false; + inhcount = 1; + noinherit = false; + } + else + { + islocal = true; + inhcount = 0; + noinherit = true; + } + /* * Construct a pg_constraint entry. */ @@ -1298,9 +1326,9 @@ index_constraint_create(Relation heapRelation, NULL, /* no check constraint */ NULL, NULL, - true, /* islocal */ - 0, /* inhcount */ - true, /* noinherit */ + islocal, + inhcount, + noinherit, is_internal); /* @@ -1320,6 +1348,18 @@ index_constraint_create(Relation heapRelation, recordDependencyOn(&myself, &referenced, DEPENDENCY_INTERNAL); /* + * Also, if this is a constraint on a partition, mark it as depending + * on the constraint in the parent. + */ + if (OidIsValid(parentConstraintId)) + { + ObjectAddress parentConstr; + + ObjectAddressSet(parentConstr, ConstraintRelationId, parentConstraintId); + recordDependencyOn(&referenced, &parentConstr, DEPENDENCY_INTERNAL_AUTO); + } + + /* * If the constraint is deferrable, create the deferred uniqueness * checking trigger. (The trigger will be given an internal dependency on * the constraint by CreateTrigger.) 
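The per-partition constraint is tied to its parent's constraint through the DEPENDENCY_INTERNAL_AUTO record added above, so it can only go away together with the parent. A rough sketch of the intended behaviour, reusing the hypothetical objects from the earlier example (exact auto-generated constraint names and error messages may differ):

    ALTER TABLE parted_uniq ADD UNIQUE (a);
    -- each partition receives its own constraint, marked as inherited;
    -- dropping it directly on the partition is expected to be rejected:
    ALTER TABLE parted_uniq_p1 DROP CONSTRAINT parted_uniq_p1_a_key;
    -- (dropping the constraint on the parent instead would remove the
    -- per-partition constraints and their indexes as well)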
diff --git a/src/backend/catalog/pg_constraint.c b/src/backend/catalog/pg_constraint.c index fa6f8bc9..a1f21b1b 100644 --- a/src/backend/catalog/pg_constraint.c +++ b/src/backend/catalog/pg_constraint.c @@ -747,6 +747,43 @@ AlterConstraintNamespaces(Oid ownerId, Oid oldNspId, heap_close(conRel, RowExclusiveLock); } +/* + * ConstraintSetParentConstraint + * Set a partition's constraint as child of its parent table's + * + * This updates the constraint's pg_constraint row to show it as inherited, and + * add a dependency to the parent so that it cannot be removed on its own. + */ +void +ConstraintSetParentConstraint(Oid childConstrId, Oid parentConstrId) +{ + Relation constrRel; + Form_pg_constraint constrForm; + HeapTuple tuple, + newtup; + ObjectAddress depender; + ObjectAddress referenced; + + constrRel = heap_open(ConstraintRelationId, RowExclusiveLock); + tuple = SearchSysCache1(CONSTROID, ObjectIdGetDatum(childConstrId)); + if (!HeapTupleIsValid(tuple)) + elog(ERROR, "cache lookup failed for constraint %u", childConstrId); + newtup = heap_copytuple(tuple); + constrForm = (Form_pg_constraint) GETSTRUCT(newtup); + constrForm->conislocal = false; + constrForm->coninhcount++; + CatalogTupleUpdate(constrRel, &tuple->t_self, newtup); + ReleaseSysCache(tuple); + + ObjectAddressSet(referenced, ConstraintRelationId, parentConstrId); + ObjectAddressSet(depender, ConstraintRelationId, childConstrId); + + recordDependencyOn(&depender, &referenced, DEPENDENCY_INTERNAL_AUTO); + + heap_close(constrRel, RowExclusiveLock); +} + + /* * get_relation_constraint_oid * Find a constraint on the specified relation with the specified name. @@ -805,6 +842,46 @@ get_relation_constraint_oid(Oid relid, const char *conname, bool missing_ok) return conOid; } +/* + * Return the OID of the constraint associated with the given index in the + * given relation; or InvalidOid if no such index is catalogued. + */ +Oid +get_relation_idx_constraint_oid(Oid relationId, Oid indexId) +{ + Relation pg_constraint; + SysScanDesc scan; + ScanKeyData key; + HeapTuple tuple; + Oid constraintId = InvalidOid; + + pg_constraint = heap_open(ConstraintRelationId, AccessShareLock); + + ScanKeyInit(&key, + Anum_pg_constraint_conrelid, + BTEqualStrategyNumber, + F_OIDEQ, + ObjectIdGetDatum(relationId)); + scan = systable_beginscan(pg_constraint, ConstraintRelidIndexId, + true, NULL, 1, &key); + while ((tuple = systable_getnext(scan)) != NULL) + { + Form_pg_constraint constrForm; + + constrForm = (Form_pg_constraint) GETSTRUCT(tuple); + if (constrForm->conindid == indexId) + { + constraintId = HeapTupleGetOid(tuple); + break; + } + } + systable_endscan(scan); + + heap_close(pg_constraint, AccessShareLock); + return constraintId; +} + + /* * get_domain_constraint_oid * Find a constraint on the specified domain with the specified name. 
diff --git a/src/backend/catalog/toasting.c b/src/backend/catalog/toasting.c index a82b2037..95b0564c 100644 --- a/src/backend/catalog/toasting.c +++ b/src/backend/catalog/toasting.c @@ -392,13 +392,13 @@ create_toast_table(Relation rel, Oid toastOid, Oid toastIndexOid, coloptions[1] = 0; index_create(toast_rel, toast_idxname, toastIndexOid, InvalidOid, - InvalidOid, + InvalidOid, InvalidOid, indexInfo, list_make2("chunk_id", "chunk_seq"), BTREE_AM_OID, rel->rd_rel->reltablespace, collationObjectId, classObjectId, coloptions, (Datum) 0, - INDEX_CREATE_IS_PRIMARY, 0, true, true); + INDEX_CREATE_IS_PRIMARY, 0, true, true, NULL); heap_close(toast_rel, NoLock); diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c index 76701f4a..03245150 100644 --- a/src/backend/commands/indexcmds.c +++ b/src/backend/commands/indexcmds.c @@ -27,6 +27,7 @@ #include "catalog/indexing.h" #include "catalog/partition.h" #include "catalog/pg_am.h" +#include "catalog/pg_constraint_fn.h" #include "catalog/pg_inherits.h" #include "catalog/pg_inherits_fn.h" #include "catalog/pg_opclass.h" @@ -309,6 +310,8 @@ CheckIndexCompatible(Oid oldId, * nonzero to specify a preselected OID for the index. * 'parentIndexId': the OID of the parent index; InvalidOid if not the child * of a partitioned index. + * 'parentConstraintId': the OID of the parent constraint; InvalidOid if not + * the child of a constraint (only used when recursing) * 'is_alter_table': this is due to an ALTER rather than a CREATE operation. * 'check_rights': check for CREATE rights in namespace and tablespace. (This * should be true except when ALTER is deleting/recreating an index.) @@ -325,6 +328,7 @@ DefineIndex(Oid relationId, IndexStmt *stmt, Oid indexRelationId, Oid parentIndexId, + Oid parentConstraintId, bool is_alter_table, bool check_rights, bool check_not_in_use, @@ -339,6 +343,7 @@ DefineIndex(Oid relationId, Oid accessMethodId; Oid namespaceId; Oid tablespaceId; + Oid createdConstraintId = InvalidOid; List *indexColNames; Relation rel; Relation indexRelation; @@ -446,20 +451,11 @@ DefineIndex(Oid relationId, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("cannot create index on partitioned table \"%s\" concurrently", RelationGetRelationName(rel)))); - if (stmt->unique) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("cannot create unique index on partitioned table \"%s\"", - RelationGetRelationName(rel)))); if (stmt->excludeOpNames) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("cannot create exclusion constraints on partitioned table \"%s\"", RelationGetRelationName(rel)))); - if (stmt->primary || stmt->isconstraint) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("cannot create constraints on partitioned tables"))); } /* @@ -729,6 +725,85 @@ DefineIndex(Oid relationId, index_check_primary_key(rel, indexInfo, is_alter_table, stmt); /* + * If this table is partitioned and we're creating a unique index or a + * primary key, make sure that the indexed columns are part of the + * partition key. Otherwise it would be possible to violate uniqueness by + * putting values that ought to be unique in different partitions. + * + * We could lift this limitation if we had global indexes, but those have + * their own problems, so this is a useful feature combination. 
+ */ + if (partitioned && (stmt->unique || stmt->primary)) + { + PartitionKey key = rel->rd_partkey; + int i; + + /* + * A partitioned table can have unique indexes, as long as all the + * columns in the partition key appear in the unique key. A + * partition-local index can enforce global uniqueness iff the PK + * value completely determines the partition that a row is in. + * + * Thus, verify that all the columns in the partition key appear + * in the unique key definition. + */ + for (i = 0; i < key->partnatts; i++) + { + bool found = false; + int j; + const char *constraint_type; + + if (stmt->primary) + constraint_type = "PRIMARY KEY"; + else if (stmt->unique) + constraint_type = "UNIQUE"; + else if (stmt->excludeOpNames != NIL) + constraint_type = "EXCLUDE"; + else + { + elog(ERROR, "unknown constraint type"); + constraint_type = NULL; /* keep compiler quiet */ + } + + /* + * It may be possible to support UNIQUE constraints when partition + * keys are expressions, but is it worth it? Give up for now. + */ + if (key->partattrs[i] == 0) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("unsupported %s constraint with partition key definition", + constraint_type), + errdetail("%s constraints cannot be used when partition keys include expressions.", + constraint_type))); + + for (j = 0; j < indexInfo->ii_NumIndexAttrs; j++) + { + if (key->partattrs[i] == indexInfo->ii_KeyAttrNumbers[j]) + { + found = true; + break; + } + } + if (!found) + { + Form_pg_attribute att; + + att = TupleDescAttr(RelationGetDescr(rel), key->partattrs[i] - 1); + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("insufficient columns in %s constraint definition", + constraint_type), + errdetail("%s constraint on table \"%s\" lacks column \"%s\" which is part of the partition key.", + constraint_type, RelationGetRelationName(rel), + NameStr(att->attname)))); + } + } + } + + + + /* * We disallow indexes on system columns other than OID. They would not * necessarily get updated correctly, and they don't seem useful anyway. */ @@ -825,12 +900,14 @@ DefineIndex(Oid relationId, indexRelationId = index_create(rel, indexRelationName, indexRelationId, parentIndexId, + parentConstraintId, stmt->oldNode, indexInfo, indexColNames, accessMethodId, tablespaceId, collationObjectId, classObjectId, coloptions, reloptions, flags, constr_flags, - allowSystemTableMods, !check_rights); + allowSystemTableMods, !check_rights, + &createdConstraintId); ObjectAddressSet(address, RelationRelationId, indexRelationId); @@ -924,16 +1001,40 @@ DefineIndex(Oid relationId, opfamOids, attmap, maplen)) { + Oid cldConstrOid = InvalidOid; + /* - * Found a match. Attach index to parent and we're - * done, but keep lock till commit. + * Found a match. + * + * If this index is being created in the parent + * because of a constraint, then the child needs to + * have a constraint also, so look for one. If there + * is no such constraint, this index is no good, so + * keep looking. */ + if (createdConstraintId != InvalidOid) + { + cldConstrOid = + get_relation_idx_constraint_oid(childRelid, + cldidxid); + if (cldConstrOid == InvalidOid) + { + index_close(cldidx, lockmode); + continue; + } + } + + /* Attach index to parent and we're done. 
*/ IndexSetParentIndex(cldidx, indexRelationId); + if (createdConstraintId != InvalidOid) + ConstraintSetParentConstraint(cldConstrOid, + createdConstraintId); if (!IndexIsValid(cldidx->rd_index)) invalidate_parent = true; found = true; + /* keep lock till commit */ index_close(cldidx, NoLock); break; } @@ -964,6 +1065,7 @@ DefineIndex(Oid relationId, DefineIndex(childRelid, childStmt, InvalidOid, /* no predefined OID */ indexRelationId, /* this is our child */ + createdConstraintId, is_alter_table, check_rights, check_not_in_use, skip_build, quiet); } diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index 989fb062..6f525512 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -1282,17 +1282,20 @@ DefineRelation(CreateStmt *stmt, char relkind, Oid ownerId, Relation idxRel = index_open(lfirst_oid(cell), AccessShareLock); AttrNumber *attmap; IndexStmt *idxstmt; + Oid constraintOid; attmap = convert_tuples_by_name_map(RelationGetDescr(rel), RelationGetDescr(parent), gettext_noop("could not convert row type")); idxstmt = generateClonedIndexStmt(NULL, RelationGetRelid(rel), idxRel, - attmap, RelationGetDescr(rel)->natts); + attmap, RelationGetDescr(rel)->natts, + &constraintOid); DefineIndex(RelationGetRelid(rel), idxstmt, InvalidOid, RelationGetRelid(idxRel), + constraintOid, false, false, false, false, false); index_close(idxRel, AccessShareLock); @@ -8611,6 +8614,7 @@ ATExecAddIndex(AlteredTableInfo *tab, Relation rel, stmt, InvalidOid, /* no predefined OID */ InvalidOid, /* no parent index */ + InvalidOid, /* no parent constraint */ true, /* is_alter_table */ check_rights, false, /* check_not_in_use - we did it already */ @@ -8659,6 +8663,7 @@ ATExecAddIndex(AlteredTableInfo *tab, Relation rel, partidxstmt, InvalidOid, /* no predefined OID */ InvalidOid, /* no parent index */ + InvalidOid, /* no parent constraint */ true, /* is_alter_table */ check_rights, /* check_rights */ false, /* check_not_in_use */ @@ -8727,6 +8732,15 @@ ATExecAddIndexConstraint(AlteredTableInfo *tab, Relation rel, Assert(OidIsValid(index_oid)); Assert(stmt->isconstraint); + /* + * Doing this on partitioned tables is not a simple feature to implement, + * so let's punt for now. 
+ */ + if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("ALTER TABLE / ADD CONSTRAINT USING INDEX is not supported on partitioned tables"))); + indexRel = index_open(index_oid, AccessShareLock); indexName = pstrdup(RelationGetRelationName(indexRel)); @@ -8774,6 +8788,7 @@ ATExecAddIndexConstraint(AlteredTableInfo *tab, Relation rel, address = index_constraint_create(rel, index_oid, + InvalidOid, indexInfo, constraintName, constraintType, @@ -17099,6 +17114,7 @@ AttachPartitionEnsureIndexes(Relation rel, Relation attachrel) IndexInfo *info; AttrNumber *attmap; bool found = false; + Oid constraintOid; /* * Ignore indexes in the partitioned table other than partitioned @@ -17115,6 +17131,7 @@ AttachPartitionEnsureIndexes(Relation rel, Relation attachrel) attmap = convert_tuples_by_name_map(RelationGetDescr(attachrel), RelationGetDescr(rel), gettext_noop("could not convert row type")); + constraintOid = get_relation_idx_constraint_oid(RelationGetRelid(rel), idx); /* * Scan the list of existing indexes in the partition-to-be, and mark @@ -17123,6 +17140,8 @@ AttachPartitionEnsureIndexes(Relation rel, Relation attachrel) */ for (i = 0; i < list_length(attachRelIdxs); i++) { + Oid cldConstrOid = InvalidOid; + /* does this index have a parent? if so, can't use it */ if (has_superclass(RelationGetRelid(attachrelIdxRels[i]))) continue; @@ -17135,8 +17154,26 @@ AttachPartitionEnsureIndexes(Relation rel, Relation attachrel) attmap, RelationGetDescr(rel)->natts)) { + /* + * If this index is being created in the parent because of a + * constraint, then the child needs to have a constraint also, + * so look for one. If there is no such constraint, this + * index is no good, so keep looking. + */ + if (OidIsValid(constraintOid)) + { + cldConstrOid = + get_relation_idx_constraint_oid(RelationGetRelid(attachrel), + RelationGetRelid(attachrelIdxRels[i])); + /* no dice */ + if (!OidIsValid(cldConstrOid)) + continue; + } + /* bingo. */ IndexSetParentIndex(attachrelIdxRels[i], idx); + if (OidIsValid(constraintOid)) + ConstraintSetParentConstraint(cldConstrOid, constraintOid); found = true; break; } @@ -17149,12 +17186,15 @@ AttachPartitionEnsureIndexes(Relation rel, Relation attachrel) if (!found) { IndexStmt *stmt; + Oid constraintOid; stmt = generateClonedIndexStmt(NULL, RelationGetRelid(attachrel), idxRel, attmap, - RelationGetDescr(rel)->natts); + RelationGetDescr(rel)->natts, + &constraintOid); DefineIndex(RelationGetRelid(attachrel), stmt, InvalidOid, RelationGetRelid(idxRel), + constraintOid, true, false, false, false, false); } @@ -17422,6 +17462,8 @@ ATExecAttachPartitionIdx(List **wqueue, Relation parentIdx, RangeVar *name) bool found; int i; PartitionDesc partDesc; + Oid constraintOid, + cldConstrId = InvalidOid; /* * If this partition already has an index attached, refuse the operation. @@ -17477,8 +17519,34 @@ ATExecAttachPartitionIdx(List **wqueue, Relation parentIdx, RangeVar *name) RelationGetRelationName(parentIdx)), errdetail("The index definitions do not match."))); + /* + * If there is a constraint in the parent, make sure there is one + * in the child too. 
+ */ + constraintOid = get_relation_idx_constraint_oid(RelationGetRelid(parentTbl), + RelationGetRelid(parentIdx)); + + if (OidIsValid(constraintOid)) + { + cldConstrId = get_relation_idx_constraint_oid(RelationGetRelid(partTbl), + partIdxId); + if (!OidIsValid(cldConstrId)) + ereport(ERROR, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("cannot attach index \"%s\" as a partition of index \"%s\"", + RelationGetRelationName(partIdx), + RelationGetRelationName(parentIdx)), + errdetail("The index \"%s\" belongs to a constraint in table \"%s\" but no constraint exists for index \"%s\".", + RelationGetRelationName(parentIdx), + RelationGetRelationName(parentTbl), + RelationGetRelationName(partIdx)))); + } + /* All good -- do it */ IndexSetParentIndex(partIdx, RelationGetRelid(parentIdx)); + if (OidIsValid(constraintOid)) + ConstraintSetParentConstraint(cldConstrId, constraintOid); + pfree(attmap); CommandCounterIncrement(); diff --git a/src/backend/parser/analyze.c b/src/backend/parser/analyze.c index 62db0557..cd374523 100644 --- a/src/backend/parser/analyze.c +++ b/src/backend/parser/analyze.c @@ -1477,6 +1477,13 @@ transformOnConflictClause(ParseState *pstate, TargetEntry *te; int attno; + if (targetrel->rd_partdesc) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("%s cannot be applied to partitioned table \"%s\"", + "ON CONFLICT DO UPDATE", + RelationGetRelationName(targetrel)))); + /* * All INSERT expressions have been parsed, get ready for potentially * existing SET statements that need to be processed like an UPDATE. diff --git a/src/backend/parser/parse_utilcmd.c b/src/backend/parser/parse_utilcmd.c index cf1bc20a..5360c6a5 100644 --- a/src/backend/parser/parse_utilcmd.c +++ b/src/backend/parser/parse_utilcmd.c @@ -1091,12 +1091,6 @@ transformColumnDefinition(CreateStmtContext *cxt, ColumnDef *column) errmsg("primary key constraints are not supported on foreign tables"), parser_errposition(cxt->pstate, constraint->location))); - if (cxt->ispartitioned) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("primary key constraints are not supported on partitioned tables"), - parser_errposition(cxt->pstate, - constraint->location))); /* FALL THRU */ case CONSTR_UNIQUE: @@ -1106,12 +1100,6 @@ transformColumnDefinition(CreateStmtContext *cxt, ColumnDef *column) errmsg("unique constraints are not supported on foreign tables"), parser_errposition(cxt->pstate, constraint->location))); - if (cxt->ispartitioned) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("unique constraints are not supported on partitioned tables"), - parser_errposition(cxt->pstate, - constraint->location))); if (constraint->keys == NIL) constraint->keys = list_make1(makeString(column->colname)); cxt->ixconstraints = lappend(cxt->ixconstraints, constraint); @@ -1208,12 +1196,6 @@ transformTableConstraint(CreateStmtContext *cxt, Constraint *constraint) errmsg("primary key constraints are not supported on foreign tables"), parser_errposition(cxt->pstate, constraint->location))); - if (cxt->ispartitioned) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("primary key constraints are not supported on partitioned tables"), - parser_errposition(cxt->pstate, - constraint->location))); cxt->ixconstraints = lappend(cxt->ixconstraints, constraint); break; @@ -1224,12 +1206,6 @@ transformTableConstraint(CreateStmtContext *cxt, Constraint *constraint) errmsg("unique constraints are not supported on foreign tables"), parser_errposition(cxt->pstate, 
constraint->location))); - if (cxt->ispartitioned) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("unique constraints are not supported on partitioned tables"), - parser_errposition(cxt->pstate, - constraint->location))); cxt->ixconstraints = lappend(cxt->ixconstraints, constraint); break; @@ -1631,7 +1607,7 @@ transformTableLikeClause(CreateStmtContext *cxt, TableLikeClause *table_like_cla /* Build CREATE INDEX statement to recreate the parent_index */ index_stmt = generateClonedIndexStmt(cxt->relation, InvalidOid, parent_index, - attmap, tupleDesc->natts); + attmap, tupleDesc->natts, NULL); #ifdef __TBASE__ if(cxt->interval_child) @@ -1724,8 +1700,8 @@ transformOfType(CreateStmtContext *cxt, TypeName *ofTypename) */ IndexStmt * generateClonedIndexStmt(RangeVar *heapRel, Oid heapRelid, Relation source_idx, - const AttrNumber *attmap, int attmap_length) -{// #lizard forgives + const AttrNumber *attmap, int attmap_length, Oid *constraintOid) +{ Oid source_relid = RelationGetRelid(source_idx); Form_pg_attribute *attrs = RelationGetDescr(source_idx)->attrs; HeapTuple ht_idxrel; @@ -1824,6 +1800,9 @@ generateClonedIndexStmt(RangeVar *heapRel, Oid heapRelid, Relation source_idx, HeapTuple ht_constr; Form_pg_constraint conrec; + if (constraintOid) + *constraintOid = constraintId; + ht_constr = SearchSysCache1(CONSTROID, ObjectIdGetDatum(constraintId)); if (!HeapTupleIsValid(ht_constr)) diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c index 1ef4a799..da28b820 100644 --- a/src/backend/tcop/utility.c +++ b/src/backend/tcop/utility.c @@ -3748,6 +3748,7 @@ ProcessUtilitySlow(ParseState *pstate, stmt, InvalidOid, /* no predefined OID */ InvalidOid, /* no parent index */ + InvalidOid, /* no parent constraint */ false, /* is_alter_table */ true, /* check_rights */ true, /* check_not_in_use */ @@ -3818,6 +3819,7 @@ ProcessUtilitySlow(ParseState *pstate, partidxstmt, InvalidOid, /* no predefined OID */ InvalidOid, + InvalidOid, /* no parent constraint */ false, /* is_alter_table */ true, /* check_rights */ true, /* check_not_in_use */ diff --git a/src/bin/pg_dump/t/002_pg_dump.pl b/src/bin/pg_dump/t/002_pg_dump.pl index 360d5954..79ff6e4e 100644 --- a/src/bin/pg_dump/t/002_pg_dump.pl +++ b/src/bin/pg_dump/t/002_pg_dump.pl @@ -5196,6 +5196,40 @@ role => 1, section_pre_data => 1, }, }, + 'ALTER TABLE measurement PRIMARY KEY' => { + all_runs => 1, + catch_all => 'CREATE ... commands', + create_order => 93, + create_sql => 'ALTER TABLE dump_test.measurement ADD PRIMARY KEY (city_id, logdate);', + regexp => qr/^ + \QALTER TABLE ONLY measurement\E \n^\s+ + \QADD CONSTRAINT measurement_pkey PRIMARY KEY (city_id, logdate);\E + /xm, + like => { + binary_upgrade => 1, + clean => 1, + clean_if_exists => 1, + createdb => 1, + defaults => 1, + exclude_test_table => 1, + exclude_test_table_data => 1, + no_blobs => 1, + no_privs => 1, + no_owner => 1, + only_dump_test_schema => 1, + pg_dumpall_dbprivs => 1, + schema_only => 1, + section_post_data => 1, + test_schema_plus_blobs => 1, + with_oids => 1, }, + unlike => { + exclude_dump_test_schema => 1, + only_dump_test_table => 1, + pg_dumpall_globals => 1, + pg_dumpall_globals_clean => 1, + role => 1, + section_pre_data => 1, }, }, + 'CREATE INDEX ... ON measurement_y2006_m2' => { all_runs => 1, catch_all => 'CREATE ... commands', @@ -5258,6 +5292,37 @@ section_pre_data => 1, test_schema_plus_blobs => 1, }, }, + 'ALTER INDEX ... ATTACH PARTITION (primary key)' => { + all_runs => 1, + catch_all => 'CREATE ... 
commands', + regexp => qr/^ + \QALTER INDEX dump_test.measurement_pkey ATTACH PARTITION dump_test_second_schema.measurement_y2006m2_pkey\E + /xm, + like => { + binary_upgrade => 1, + clean => 1, + clean_if_exists => 1, + createdb => 1, + defaults => 1, + exclude_dump_test_schema => 1, + exclude_test_table => 1, + exclude_test_table_data => 1, + no_blobs => 1, + no_privs => 1, + no_owner => 1, + pg_dumpall_dbprivs => 1, + role => 1, + schema_only => 1, + section_post_data => 1, + with_oids => 1, }, + unlike => { + only_dump_test_schema => 1, + only_dump_test_table => 1, + pg_dumpall_globals => 1, + pg_dumpall_globals_clean => 1, + section_pre_data => 1, + test_schema_plus_blobs => 1, }, }, + 'CREATE VIEW test_view' => { all_runs => 1, catch_all => 'CREATE ... commands', diff --git a/src/include/catalog/index.h b/src/include/catalog/index.h index 3a7ed05f..c60ad12f 100644 --- a/src/include/catalog/index.h +++ b/src/include/catalog/index.h @@ -115,6 +115,7 @@ extern Oid index_create(Relation heapRelation, const char *indexRelationName, Oid indexRelationId, Oid parentIndexRelid, + Oid parentConstraintId, Oid relFileNode, IndexInfo *indexInfo, List *indexColNames, @@ -127,7 +128,8 @@ extern Oid index_create(Relation heapRelation, bits16 flags, bits16 constr_flags, bool allow_system_table_mods, - bool is_internal); + bool is_internal, + Oid *constraintId); #define INDEX_CONSTR_CREATE_MARK_AS_PRIMARY (1 << 0) #define INDEX_CONSTR_CREATE_DEFERRABLE (1 << 1) @@ -137,6 +139,7 @@ extern Oid index_create(Relation heapRelation, extern ObjectAddress index_constraint_create(Relation heapRelation, Oid indexRelationId, + Oid parentConstraintId, IndexInfo *indexInfo, const char *constraintName, char constraintType, diff --git a/src/include/catalog/pg_constraint_fn.h b/src/include/catalog/pg_constraint_fn.h index 0086a640..544db00b 100644 --- a/src/include/catalog/pg_constraint_fn.h +++ b/src/include/catalog/pg_constraint_fn.h @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * * pg_constraint_fn.h - * prototypes for functions in catalog/pg_constraint.c + * prototypes for functions in catalog/pg_constraint.c * * * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group @@ -22,61 +22,64 @@ */ typedef enum ConstraintCategory { - CONSTRAINT_RELATION, - CONSTRAINT_DOMAIN, - CONSTRAINT_ASSERTION /* for future expansion */ + CONSTRAINT_RELATION, + CONSTRAINT_DOMAIN, + CONSTRAINT_ASSERTION /* for future expansion */ } ConstraintCategory; extern Oid CreateConstraintEntry(const char *constraintName, - Oid constraintNamespace, - char constraintType, - bool isDeferrable, - bool isDeferred, - bool isValidated, - Oid relId, - const int16 *constraintKey, - int constraintNKeys, - Oid domainId, - Oid indexRelId, - Oid foreignRelId, - const int16 *foreignKey, - const Oid *pfEqOp, - const Oid *ppEqOp, - const Oid *ffEqOp, - int foreignNKeys, - char foreignUpdateType, - char foreignDeleteType, - char foreignMatchType, - const Oid *exclOp, - Node *conExpr, - const char *conBin, - const char *conSrc, - bool conIsLocal, - int conInhCount, - bool conNoInherit, - bool is_internal); + Oid constraintNamespace, + char constraintType, + bool isDeferrable, + bool isDeferred, + bool isValidated, + Oid relId, + const int16 *constraintKey, + int constraintNKeys, + Oid domainId, + Oid indexRelId, + Oid foreignRelId, + const int16 *foreignKey, + const Oid *pfEqOp, + const Oid *ppEqOp, + const Oid *ffEqOp, + int foreignNKeys, + char foreignUpdateType, + char foreignDeleteType, + char 
foreignMatchType, + const Oid *exclOp, + Node *conExpr, + const char *conBin, + const char *conSrc, + bool conIsLocal, + int conInhCount, + bool conNoInherit, + bool is_internal); extern void RemoveConstraintById(Oid conId); extern void RenameConstraintById(Oid conId, const char *newname); extern void SetValidatedConstraintById(Oid conId); extern bool ConstraintNameIsUsed(ConstraintCategory conCat, Oid objId, - Oid objNamespace, const char *conname); + Oid objNamespace, const char *conname); extern char *ChooseConstraintName(const char *name1, const char *name2, - const char *label, Oid namespaceid, - List *others); + const char *label, Oid namespaceid, + List *others); extern void AlterConstraintNamespaces(Oid ownerId, Oid oldNspId, - Oid newNspId, bool isType, ObjectAddresses *objsMoved); -extern Oid get_relation_constraint_oid(Oid relid, const char *conname, bool missing_ok); -extern Oid get_domain_constraint_oid(Oid typid, const char *conname, bool missing_ok); + Oid newNspId, bool isType, ObjectAddresses *objsMoved); +extern void ConstraintSetParentConstraint(Oid childConstrId, + Oid parentConstrId); +extern Oid get_relation_constraint_oid(Oid relid, const char *conname, bool missing_ok); +extern Oid get_domain_constraint_oid(Oid typid, const char *conname, bool missing_ok); +extern Oid get_relation_idx_constraint_oid(Oid relationId, Oid indexId); extern Bitmapset *get_primary_key_attnos(Oid relid, bool deferrableOk, - Oid *constraintOid); + Oid *constraintOid); extern bool check_functional_grouping(Oid relid, - Index varno, Index varlevelsup, - List *grouping_columns, - List **constraintDeps); + Index varno, Index varlevelsup, + List *grouping_columns, + List **constraintDeps); -#endif /* PG_CONSTRAINT_FN_H */ +#endif /* PG_CONSTRAINT_FN_H */ diff --git a/src/include/commands/defrem.h b/src/include/commands/defrem.h index 377f9f94..1d3959b2 100644 --- a/src/include/commands/defrem.h +++ b/src/include/commands/defrem.h @@ -26,6 +26,7 @@ extern ObjectAddress DefineIndex(Oid relationId, IndexStmt *stmt, Oid indexRelationId, Oid parentIndexId, + Oid parentConstraintId, bool is_alter_table, bool check_rights, bool check_not_in_use, diff --git a/src/include/parser/parse_utilcmd.h b/src/include/parser/parse_utilcmd.h index e527a119..6cb25dbc 100644 --- a/src/include/parser/parse_utilcmd.h +++ b/src/include/parser/parse_utilcmd.h @@ -103,6 +103,7 @@ extern PartitionBoundSpec *transformPartitionBound(ParseState *pstate, Relation PartitionBoundSpec *spec); extern IndexStmt *generateClonedIndexStmt(RangeVar *heapRel, Oid heapOid, Relation source_idx, - const AttrNumber *attmap, int attmap_length); + const AttrNumber *attmap, int attmap_length, + Oid *constraintOid); #endif /* PARSE_UTILCMD_H */ diff --git a/src/test/regress/expected/alter_table.out b/src/test/regress/expected/alter_table.out index 737710bc..50f89f13 100644 --- a/src/test/regress/expected/alter_table.out +++ b/src/test/regress/expected/alter_table.out @@ -3181,14 +3181,6 @@ CREATE TABLE partitioned ( a int, b int ) PARTITION BY RANGE (a, (a+b+1)); -ALTER TABLE partitioned ADD UNIQUE (a); -ERROR: unique constraints are not supported on partitioned tables -LINE 1: ALTER TABLE partitioned ADD UNIQUE (a); - ^ -ALTER TABLE partitioned ADD PRIMARY KEY (a); -ERROR: primary key constraints are not supported on partitioned tables -LINE 1: ALTER TABLE partitioned ADD PRIMARY KEY (a); - ^ ALTER TABLE partitioned ADD FOREIGN KEY (a) REFERENCES blah; ERROR: foreign key constraints are not supported on partitioned tables LINE 1: ALTER TABLE 
partitioned ADD FOREIGN KEY (a) REFERENCES blah; diff --git a/src/test/regress/expected/create_index.out b/src/test/regress/expected/create_index.out index dd727bc7..50c501c1 100644 --- a/src/test/regress/expected/create_index.out +++ b/src/test/regress/expected/create_index.out @@ -2701,6 +2701,12 @@ DROP INDEX cwi_replaced_pkey; -- Should fail; a constraint depends on it ERROR: cannot drop index cwi_replaced_pkey because constraint cwi_replaced_pkey on table cwi_test requires it HINT: You can drop constraint cwi_replaced_pkey on table cwi_test instead. DROP TABLE cwi_test; +-- ADD CONSTRAINT USING INDEX is forbidden on partitioned tables +CREATE TABLE cwi_test(a int) PARTITION BY hash (a); +create unique index on cwi_test (a); +alter table cwi_test add primary key using index cwi_test_a_idx ; +ERROR: ALTER TABLE / ADD CONSTRAINT USING INDEX is not supported on partitioned tables +DROP TABLE cwi_test; -- -- Check handling of indexes on system columns -- diff --git a/src/test/regress/expected/create_table.out b/src/test/regress/expected/create_table.out index 4f679633..26d364a5 100644 --- a/src/test/regress/expected/create_table.out +++ b/src/test/regress/expected/create_table.out @@ -279,12 +279,6 @@ CREATE TABLE partitioned ( ) PARTITION BY LIST (a1, a2); -- fail ERROR: cannot use "list" partition strategy with more than one column -- unsupported constraint type for partitioned tables -CREATE TABLE partitioned ( - a int PRIMARY KEY -) PARTITION BY RANGE (a); -ERROR: primary key constraints are not supported on partitioned tables -LINE 2: a int PRIMARY KEY - ^ CREATE TABLE pkrel ( a int PRIMARY KEY ); @@ -295,12 +289,6 @@ ERROR: foreign key constraints are not supported on partitioned tables LINE 2: a int REFERENCES pkrel(a) ^ DROP TABLE pkrel; -CREATE TABLE partitioned ( - a int UNIQUE -) PARTITION BY RANGE (a); -ERROR: unique constraints are not supported on partitioned tables -LINE 2: a int UNIQUE - ^ CREATE TABLE partitioned ( a int, EXCLUDE USING gist (a WITH &&) diff --git a/src/test/regress/expected/indexing.out b/src/test/regress/expected/indexing.out index f996a88d..4cd0596f 100644 --- a/src/test/regress/expected/indexing.out +++ b/src/test/regress/expected/indexing.out @@ -26,8 +26,6 @@ drop table idxpart; -- Some unsupported features create table idxpart (a int, b int, c text) partition by range (a); create table idxpart1 partition of idxpart for values from (0) to (10); -create unique index on idxpart (a); -ERROR: cannot create unique index on partitioned table "idxpart" create index concurrently on idxpart (a); ERROR: PGXC does not support concurrent INDEX yet DETAIL: The feature is not currently supported @@ -759,6 +757,295 @@ select attrelid::regclass, attname, attnum from pg_attribute idxpart_col_keep_idx | col_keep | 1 (7 rows) +drop table idxpart; +-- +-- Constraint-related indexes +-- +-- Verify that it works to add primary key / unique to partitioned tables +create table idxpart (a int primary key, b int) partition by range (a); +\d idxpart + Table "public.idxpart" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | not null | + b | integer | | | +Partition key: RANGE (a) +Indexes: + "idxpart_pkey" PRIMARY KEY, btree (a) +Number of partitions: 0 + +drop table idxpart; +-- but not if you fail to use the full partition key +create table idxpart (a int unique, b int) partition by range (a, b); +ERROR: insufficient columns in UNIQUE constraint definition +DETAIL: UNIQUE constraint on table "idxpart" lacks 
column "b" which is part of the partition key. +create table idxpart (a int, b int unique) partition by range (a, b); +ERROR: insufficient columns in UNIQUE constraint definition +DETAIL: UNIQUE constraint on table "idxpart" lacks column "a" which is part of the partition key. +create table idxpart (a int primary key, b int) partition by range (b, a); +ERROR: insufficient columns in PRIMARY KEY constraint definition +DETAIL: PRIMARY KEY constraint on table "idxpart" lacks column "b" which is part of the partition key. +create table idxpart (a int, b int primary key) partition by range (b, a); +ERROR: insufficient columns in PRIMARY KEY constraint definition +DETAIL: PRIMARY KEY constraint on table "idxpart" lacks column "a" which is part of the partition key. +-- OK if you use them in some other order +create table idxpart (a int, b int, c text, primary key (a, b, c)) partition by range (b, c, a); +drop table idxpart; +-- not other types of index-based constraints +create table idxpart (a int, exclude (a with = )) partition by range (a); +ERROR: exclusion constraints are not supported on partitioned tables +LINE 1: create table idxpart (a int, exclude (a with = )) partition ... + ^ +-- no expressions in partition key for PK/UNIQUE +create table idxpart (a int primary key, b int) partition by range ((b + a)); +ERROR: unsupported PRIMARY KEY constraint with partition key definition +DETAIL: PRIMARY KEY constraints cannot be used when partition keys include expressions. +create table idxpart (a int unique, b int) partition by range ((b + a)); +ERROR: unsupported UNIQUE constraint with partition key definition +DETAIL: UNIQUE constraints cannot be used when partition keys include expressions. +-- use ALTER TABLE to add a primary key +create table idxpart (a int, b int, c text) partition by range (a, b); +alter table idxpart add primary key (a); -- not an incomplete one though +ERROR: insufficient columns in PRIMARY KEY constraint definition +DETAIL: PRIMARY KEY constraint on table "idxpart" lacks column "b" which is part of the partition key. +alter table idxpart add primary key (a, b); -- this works +\d idxpart + Table "public.idxpart" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | not null | + b | integer | | not null | + c | text | | | +Partition key: RANGE (a, b) +Indexes: + "idxpart_pkey" PRIMARY KEY, btree (a, b) +Number of partitions: 0 + +create table idxpart1 partition of idxpart for values from (0, 0) to (1000, 1000); +\d idxpart1 + Table "public.idxpart1" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | not null | + b | integer | | not null | + c | text | | | +Partition of: idxpart FOR VALUES FROM (0, 0) TO (1000, 1000) +Indexes: + "idxpart1_pkey" PRIMARY KEY, btree (a, b) + +drop table idxpart; +-- use ALTER TABLE to add a unique constraint +create table idxpart (a int, b int) partition by range (a, b); +alter table idxpart add unique (a); -- not an incomplete one though +ERROR: insufficient columns in UNIQUE constraint definition +DETAIL: UNIQUE constraint on table "idxpart" lacks column "b" which is part of the partition key. 
+alter table idxpart add unique (b, a); -- this works +\d idxpart + Table "public.idxpart" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | integer | | | +Partition key: RANGE (a, b) +Indexes: + "idxpart_b_a_key" UNIQUE CONSTRAINT, btree (b, a) +Number of partitions: 0 + +drop table idxpart; +-- Exclusion constraints cannot be added +create table idxpart (a int, b int) partition by range (a); +alter table idxpart add exclude (a with =); +ERROR: exclusion constraints are not supported on partitioned tables +LINE 1: alter table idxpart add exclude (a with =); + ^ +drop table idxpart; +-- When (sub)partitions are created, they also contain the constraint +create table idxpart (a int, b int, primary key (a, b)) partition by range (a, b); +create table idxpart1 partition of idxpart for values from (1, 1) to (10, 10); +create table idxpart2 partition of idxpart for values from (10, 10) to (20, 20) + partition by range (b); +create table idxpart21 partition of idxpart2 for values from (10) to (15); +create table idxpart22 partition of idxpart2 for values from (15) to (20); +create table idxpart3 (a int not null, b int not null); +alter table idxpart attach partition idxpart3 for values from (20, 20) to (30, 30); +select conname, contype, conrelid::regclass, conindid::regclass, conkey + from pg_constraint where conrelid::regclass::text like 'idxpart%' + order by conname; + conname | contype | conrelid | conindid | conkey +----------------+---------+-----------+----------------+-------- + idxpart1_pkey | p | idxpart1 | idxpart1_pkey | {1,2} + idxpart21_pkey | p | idxpart21 | idxpart21_pkey | {1,2} + idxpart22_pkey | p | idxpart22 | idxpart22_pkey | {1,2} + idxpart2_pkey | p | idxpart2 | idxpart2_pkey | {1,2} + idxpart3_pkey | p | idxpart3 | idxpart3_pkey | {1,2} + idxpart_pkey | p | idxpart | idxpart_pkey | {1,2} +(6 rows) + +drop table idxpart; +-- Verify that multi-layer partitioning honors the requirement that all +-- columns in the partition key must appear in primary key +create table idxpart (a int, b int, primary key (a)) partition by range (a); +create table idxpart2 partition of idxpart +for values from (0) to (1000) partition by range (b); -- fail +ERROR: insufficient columns in PRIMARY KEY constraint definition +DETAIL: PRIMARY KEY constraint on table "idxpart2" lacks column "b" which is part of the partition key. +drop table idxpart; +-- Multi-layer partitioning works correctly in this case: +create table idxpart (a int, b int, primary key (a, b)) partition by range (a); +create table idxpart2 partition of idxpart for values from (0) to (1000) partition by range (b); +create table idxpart21 partition of idxpart2 for values from (0) to (1000); +select conname, contype, conrelid::regclass, conindid::regclass, conkey + from pg_constraint where conrelid::regclass::text like 'idxpart%' + order by conname; + conname | contype | conrelid | conindid | conkey +----------------+---------+-----------+----------------+-------- + idxpart21_pkey | p | idxpart21 | idxpart21_pkey | {1,2} + idxpart2_pkey | p | idxpart2 | idxpart2_pkey | {1,2} + idxpart_pkey | p | idxpart | idxpart_pkey | {1,2} +(3 rows) + +drop table idxpart; +-- If a partitioned table has a unique/PK constraint, then it's not possible +-- to drop the corresponding constraint in the children; nor it's possible +-- to drop the indexes individually. Dropping the constraint in the parent +-- gets rid of the lot. 
+create table idxpart (i int) partition by hash (i); +create table idxpart0 partition of idxpart (i) for values with (modulus 2, remainder 0); +create table idxpart1 partition of idxpart (i) for values with (modulus 2, remainder 1); +alter table idxpart0 add primary key(i); +alter table idxpart add primary key(i); +select indrelid::regclass, indexrelid::regclass, inhparent::regclass, indisvalid, + conname, conislocal, coninhcount, connoinherit, convalidated + from pg_index idx left join pg_inherits inh on (idx.indexrelid = inh.inhrelid) + left join pg_constraint con on (idx.indexrelid = con.conindid) + where indrelid::regclass::text like 'idxpart%' + order by indexrelid::regclass::text collate "C"; + indrelid | indexrelid | inhparent | indisvalid | conname | conislocal | coninhcount | connoinherit | convalidated +----------+---------------+--------------+------------+---------------+------------+-------------+--------------+-------------- + idxpart0 | idxpart0_pkey | idxpart_pkey | t | idxpart0_pkey | f | 1 | t | t + idxpart1 | idxpart1_pkey | idxpart_pkey | t | idxpart1_pkey | f | 1 | f | t + idxpart | idxpart_pkey | | t | idxpart_pkey | t | 0 | t | t +(3 rows) + +drop index idxpart0_pkey; -- fail +ERROR: cannot drop index idxpart0_pkey because index idxpart_pkey requires it +HINT: You can drop index idxpart_pkey instead. +drop index idxpart1_pkey; -- fail +ERROR: cannot drop index idxpart1_pkey because index idxpart_pkey requires it +HINT: You can drop index idxpart_pkey instead. +alter table idxpart0 drop constraint idxpart0_pkey; -- fail +ERROR: cannot drop inherited constraint "idxpart0_pkey" of relation "idxpart0" +alter table idxpart1 drop constraint idxpart1_pkey; -- fail +ERROR: cannot drop inherited constraint "idxpart1_pkey" of relation "idxpart1" +alter table idxpart drop constraint idxpart_pkey; -- ok +select indrelid::regclass, indexrelid::regclass, inhparent::regclass, indisvalid, + conname, conislocal, coninhcount, connoinherit, convalidated + from pg_index idx left join pg_inherits inh on (idx.indexrelid = inh.inhrelid) + left join pg_constraint con on (idx.indexrelid = con.conindid) + where indrelid::regclass::text like 'idxpart%' + order by indexrelid::regclass::text collate "C"; + indrelid | indexrelid | inhparent | indisvalid | conname | conislocal | coninhcount | connoinherit | convalidated +----------+------------+-----------+------------+---------+------------+-------------+--------------+-------------- +(0 rows) + +drop table idxpart; +-- If a partitioned table has a constraint whose index is not valid, +-- attaching a missing partition makes it valid. 
+create table idxpart (a int) partition by range (a); +create table idxpart0 (like idxpart); +alter table idxpart0 add primary key (a); +alter table idxpart attach partition idxpart0 for values from (0) to (1000); +alter table only idxpart add primary key (a); +select indrelid::regclass, indexrelid::regclass, inhparent::regclass, indisvalid, + conname, conislocal, coninhcount, connoinherit, convalidated + from pg_index idx left join pg_inherits inh on (idx.indexrelid = inh.inhrelid) + left join pg_constraint con on (idx.indexrelid = con.conindid) + where indrelid::regclass::text like 'idxpart%' + order by indexrelid::regclass::text collate "C"; + indrelid | indexrelid | inhparent | indisvalid | conname | conislocal | coninhcount | connoinherit | convalidated +----------+---------------+-----------+------------+---------------+------------+-------------+--------------+-------------- + idxpart0 | idxpart0_pkey | | t | idxpart0_pkey | t | 0 | t | t + idxpart | idxpart_pkey | | f | idxpart_pkey | t | 0 | t | t +(2 rows) + +alter index idxpart_pkey attach partition idxpart0_pkey; +select indrelid::regclass, indexrelid::regclass, inhparent::regclass, indisvalid, + conname, conislocal, coninhcount, connoinherit, convalidated + from pg_index idx left join pg_inherits inh on (idx.indexrelid = inh.inhrelid) + left join pg_constraint con on (idx.indexrelid = con.conindid) + where indrelid::regclass::text like 'idxpart%' + order by indexrelid::regclass::text collate "C"; + indrelid | indexrelid | inhparent | indisvalid | conname | conislocal | coninhcount | connoinherit | convalidated +----------+---------------+--------------+------------+---------------+------------+-------------+--------------+-------------- + idxpart0 | idxpart0_pkey | idxpart_pkey | t | idxpart0_pkey | f | 1 | t | t + idxpart | idxpart_pkey | | t | idxpart_pkey | t | 0 | t | t +(2 rows) + +drop table idxpart; +-- if a partition has a unique index without a constraint, does not attach +-- automatically; creates a new index instead. 
+create table idxpart (a int, b int) partition by range (a); +create table idxpart1 (a int not null, b int); +create unique index on idxpart1 (a); +alter table idxpart add primary key (a); +alter table idxpart attach partition idxpart1 for values from (1) to (1000); +select indrelid::regclass, indexrelid::regclass, inhparent::regclass, indisvalid, + conname, conislocal, coninhcount, connoinherit, convalidated + from pg_index idx left join pg_inherits inh on (idx.indexrelid = inh.inhrelid) + left join pg_constraint con on (idx.indexrelid = con.conindid) + where indrelid::regclass::text like 'idxpart%' + order by indexrelid::regclass::text collate "C"; + indrelid | indexrelid | inhparent | indisvalid | conname | conislocal | coninhcount | connoinherit | convalidated +----------+----------------+--------------+------------+---------------+------------+-------------+--------------+-------------- + idxpart1 | idxpart1_a_idx | | t | | | | | + idxpart1 | idxpart1_pkey | idxpart_pkey | t | idxpart1_pkey | f | 1 | f | t + idxpart | idxpart_pkey | | t | idxpart_pkey | t | 0 | t | t +(3 rows) + +drop table idxpart; +-- Can't attach an index without a corresponding constraint +create table idxpart (a int, b int) partition by range (a); +create table idxpart1 (a int not null, b int); +create unique index on idxpart1 (a); +alter table idxpart attach partition idxpart1 for values from (1) to (1000); +alter table only idxpart add primary key (a); +alter index idxpart_pkey attach partition idxpart1_a_idx; -- fail +ERROR: cannot attach index "idxpart1_a_idx" as a partition of index "idxpart_pkey" +DETAIL: The index "idxpart_pkey" belongs to a constraint in table "idxpart" but no constraint exists for index "idxpart1_a_idx". +drop table idxpart; +-- Test that unique constraints are working +create table idxpart (a int, b text, primary key (a, b)) partition by range (a); +create table idxpart1 partition of idxpart for values from (0) to (100000); +create table idxpart2 (like idxpart); +insert into idxpart2 (a, b) values (572814, 'inserted first'); +create unique index on idxpart (a); +alter table idxpart attach partition idxpart2 for values from (100000) to (1000000); +insert into idxpart values (0, 'zero'), (42, 'life'), (2^16, 'sixteen'); +insert into idxpart select 2^g, format('two to power of %s', g) from generate_series(15, 17) g; +ERROR: duplicate key value violates unique constraint "idxpart1_a_idx" +DETAIL: Key (a)=(65536) already exists. +insert into idxpart values (16, 'sixteen'); +insert into idxpart (b, a) values ('one', 142857), ('two', 285714); +insert into idxpart select a * 2, b || b from idxpart where a between 2^16 and 2^19; +ERROR: duplicate key value violates unique constraint "idxpart2_a_idx" +DETAIL: Key (a)=(285714) already exists. +insert into idxpart values (572814, 'five'); +ERROR: duplicate key value violates unique constraint "idxpart2_a_idx" +DETAIL: Key (a)=(572814) already exists. 
+insert into idxpart values (857142, 'six'); +select tableoid::regclass, * from idxpart order by a; + tableoid | a | b +----------+--------+---------------- + idxpart1 | 0 | zero + idxpart1 | 16 | sixteen + idxpart1 | 42 | life + idxpart1 | 65536 | sixteen + idxpart2 | 142857 | one + idxpart2 | 285714 | two + idxpart2 | 572814 | inserted first + idxpart2 | 857142 | six +(8 rows) + drop table idxpart; -- intentionally leave some objects around create table idxpart (a int) partition by range (a); @@ -771,3 +1058,5 @@ create index on idxpart22 (a); create index on only idxpart2 (a); alter index idxpart2_a_idx attach partition idxpart22_a_idx; create index on idxpart (a); +create table idxpart_another (a int, b int, primary key (a, b)) partition by range (a); +create table idxpart_another_1 partition of idxpart_another for values from (0) to (100); diff --git a/src/test/regress/expected/insert_conflict.out b/src/test/regress/expected/insert_conflict.out index f10974de..dcd26834 100644 --- a/src/test/regress/expected/insert_conflict.out +++ b/src/test/regress/expected/insert_conflict.out @@ -822,7 +822,7 @@ insert into parted_conflict_test values (1, 'a') on conflict do nothing; insert into parted_conflict_test values (1, 'a') on conflict do nothing; -- however, on conflict do update is not supported yet insert into parted_conflict_test values (1) on conflict (b) do update set a = excluded.a; -ERROR: there is no unique or exclusion constraint matching the ON CONFLICT specification +ERROR: ON CONFLICT DO UPDATE cannot be applied to partitioned table "parted_conflict_test" -- but it works OK if we target the partition directly insert into parted_conflict_test_1 values (1) on conflict (b) do update set a = excluded.a; diff --git a/src/test/regress/expected/insert_conflict_1.out b/src/test/regress/expected/insert_conflict_1.out index 40048bfb..5b13fc64 100644 --- a/src/test/regress/expected/insert_conflict_1.out +++ b/src/test/regress/expected/insert_conflict_1.out @@ -843,7 +843,7 @@ ERROR: no partition of relation "parted_conflict_test" found for row DETAIL: Partition key of the failing row contains (a) = (1). -- however, on conflict do update is not supported yet insert into parted_conflict_test values (1) on conflict (b) do update set a = excluded.a; -ERROR: Distributed column or partition column "a" can't be updated in current version +ERROR: ON CONFLICT DO UPDATE cannot be applied to partitioned table "parted_conflict_test" -- but it works OK if we target the partition directly insert into parted_conflict_test_1 values (1) on conflict (b) do update set a = excluded.a; diff --git a/src/test/regress/output/tablespace.source b/src/test/regress/output/tablespace.source index 1a5dc4d1..b78953ac 100644 --- a/src/test/regress/output/tablespace.source +++ b/src/test/regress/output/tablespace.source @@ -132,34 +132,19 @@ Tablespace: "regress_tblspace" -- partitioned rels cannot specify the primary key. These fail: CREATE TABLE testschema.dflt (a int PRIMARY KEY) PARTITION BY LIST (a) TABLESPACE pg_default; -ERROR: primary key constraints are not supported on partitioned tables -LINE 1: CREATE TABLE testschema.dflt (a int PRIMARY KEY) PARTITION B... - ^ +ERROR: cannot specify default tablespace for partitioned relations CREATE TABLE testschema.dflt (a int PRIMARY KEY USING INDEX TABLESPACE pg_default) PARTITION BY LIST (a); -ERROR: primary key constraints are not supported on partitioned tables -LINE 1: CREATE TABLE testschema.dflt (a int PRIMARY KEY USING INDEX ... 
- ^ +ERROR: cannot specify default tablespace for partitioned relation SET default_tablespace TO 'pg_default'; CREATE TABLE testschema.dflt (a int PRIMARY KEY) PARTITION BY LIST (a) TABLESPACE regress_tblspace; -ERROR: primary key constraints are not supported on partitioned tables -LINE 1: CREATE TABLE testschema.dflt (a int PRIMARY KEY) PARTITION B... - ^ +ERROR: cannot specify default tablespace for partitioned relations CREATE TABLE testschema.dflt (a int PRIMARY KEY USING INDEX TABLESPACE regress_tblspace) PARTITION BY LIST (a); -ERROR: primary key constraints are not supported on partitioned tables -LINE 1: CREATE TABLE testschema.dflt (a int PRIMARY KEY USING INDEX ... - ^ +ERROR: cannot specify default tablespace for partitioned relations -- but these work: CREATE TABLE testschema.dflt (a int PRIMARY KEY USING INDEX TABLESPACE regress_tblspace) PARTITION BY LIST (a) TABLESPACE regress_tblspace; -ERROR: primary key constraints are not supported on partitioned tables -LINE 1: CREATE TABLE testschema.dflt (a int PRIMARY KEY USING INDEX ... - ^ SET default_tablespace TO ''; CREATE TABLE testschema.dflt2 (a int PRIMARY KEY) PARTITION BY LIST (a); -ERROR: primary key constraints are not supported on partitioned tables -LINE 1: CREATE TABLE testschema.dflt2 (a int PRIMARY KEY) PARTITION ... - ^ DROP TABLE testschema.dflt, testschema.dflt2; -ERROR: table "dflt" does not exist -- check that default_tablespace doesn't affect ALTER TABLE index rebuilds CREATE TABLE testschema.test_default_tab(id bigint) TABLESPACE regress_tblspace; INSERT INTO testschema.test_default_tab VALUES (1); @@ -277,13 +262,7 @@ INSERT INTO testschema.test_default_tab_p VALUES (1); CREATE INDEX test_index1 on testschema.test_default_tab_p (val); CREATE INDEX test_index2 on testschema.test_default_tab_p (val) TABLESPACE regress_tblspace; ALTER TABLE testschema.test_default_tab_p ADD CONSTRAINT test_index3 PRIMARY KEY (id); -ERROR: primary key constraints are not supported on partitioned tables -LINE 1: ALTER TABLE testschema.test_default_tab_p ADD CONSTRAINT tes... - ^ ALTER TABLE testschema.test_default_tab_p ADD CONSTRAINT test_index4 UNIQUE (id) USING INDEX TABLESPACE regress_tblspace; -ERROR: unique constraints are not supported on partitioned tables -LINE 1: ALTER TABLE testschema.test_default_tab_p ADD CONSTRAINT tes... 
- ^ \d testschema.test_index1 Index "testschema.test_index1" Column | Type | Definition diff --git a/src/test/regress/sql/alter_table.sql b/src/test/regress/sql/alter_table.sql index c0e41d0f..daa8f09d 100644 --- a/src/test/regress/sql/alter_table.sql +++ b/src/test/regress/sql/alter_table.sql @@ -1986,8 +1986,6 @@ CREATE TABLE partitioned ( a int, b int ) PARTITION BY RANGE (a, (a+b+1)); -ALTER TABLE partitioned ADD UNIQUE (a); -ALTER TABLE partitioned ADD PRIMARY KEY (a); ALTER TABLE partitioned ADD FOREIGN KEY (a) REFERENCES blah; ALTER TABLE partitioned ADD EXCLUDE USING gist (a WITH &&); diff --git a/src/test/regress/sql/create_index.sql b/src/test/regress/sql/create_index.sql index ac87a957..14492a24 100644 --- a/src/test/regress/sql/create_index.sql +++ b/src/test/regress/sql/create_index.sql @@ -833,6 +833,12 @@ DROP INDEX cwi_replaced_pkey; -- Should fail; a constraint depends on it DROP TABLE cwi_test; +-- ADD CONSTRAINT USING INDEX is forbidden on partitioned tables +CREATE TABLE cwi_test(a int) PARTITION BY hash (a); +create unique index on cwi_test (a); +alter table cwi_test add primary key using index cwi_test_a_idx ; +DROP TABLE cwi_test; + -- -- Check handling of indexes on system columns -- diff --git a/src/test/regress/sql/create_table.sql b/src/test/regress/sql/create_table.sql index d00a5935..82f1a87b 100644 --- a/src/test/regress/sql/create_table.sql +++ b/src/test/regress/sql/create_table.sql @@ -296,10 +296,6 @@ CREATE TABLE partitioned ( ) PARTITION BY LIST (a1, a2); -- fail -- unsupported constraint type for partitioned tables -CREATE TABLE partitioned ( - a int PRIMARY KEY -) PARTITION BY RANGE (a); - CREATE TABLE pkrel ( a int PRIMARY KEY ); @@ -308,10 +304,6 @@ CREATE TABLE partitioned ( ) PARTITION BY RANGE (a); DROP TABLE pkrel; -CREATE TABLE partitioned ( - a int UNIQUE -) PARTITION BY RANGE (a); - CREATE TABLE partitioned ( a int, EXCLUDE USING gist (a WITH &&) diff --git a/src/test/regress/sql/indexing.sql b/src/test/regress/sql/indexing.sql index 130ee7cc..8f5a33e9 100644 --- a/src/test/regress/sql/indexing.sql +++ b/src/test/regress/sql/indexing.sql @@ -15,7 +15,6 @@ drop table idxpart; -- Some unsupported features create table idxpart (a int, b int, c text) partition by range (a); create table idxpart1 partition of idxpart for values from (0) to (10); -create unique index on idxpart (a); create index concurrently on idxpart (a); drop table idxpart; @@ -378,6 +377,174 @@ select attrelid::regclass, attname, attnum from pg_attribute order by attrelid::regclass, attnum; drop table idxpart; +-- +-- Constraint-related indexes +-- + +-- Verify that it works to add primary key / unique to partitioned tables +create table idxpart (a int primary key, b int) partition by range (a); +\d idxpart +drop table idxpart; + +-- but not if you fail to use the full partition key +create table idxpart (a int unique, b int) partition by range (a, b); +create table idxpart (a int, b int unique) partition by range (a, b); +create table idxpart (a int primary key, b int) partition by range (b, a); +create table idxpart (a int, b int primary key) partition by range (b, a); + +-- OK if you use them in some other order +create table idxpart (a int, b int, c text, primary key (a, b, c)) partition by range (b, c, a); +drop table idxpart; + +-- not other types of index-based constraints +create table idxpart (a int, exclude (a with = )) partition by range (a); + +-- no expressions in partition key for PK/UNIQUE +create table idxpart (a int primary key, b int) partition by range ((b + 
a)); +create table idxpart (a int unique, b int) partition by range ((b + a)); + +-- use ALTER TABLE to add a primary key +create table idxpart (a int, b int, c text) partition by range (a, b); +alter table idxpart add primary key (a); -- not an incomplete one though +alter table idxpart add primary key (a, b); -- this works +\d idxpart +create table idxpart1 partition of idxpart for values from (0, 0) to (1000, 1000); +\d idxpart1 +drop table idxpart; + +-- use ALTER TABLE to add a unique constraint +create table idxpart (a int, b int) partition by range (a, b); +alter table idxpart add unique (a); -- not an incomplete one though +alter table idxpart add unique (b, a); -- this works +\d idxpart +drop table idxpart; + +-- Exclusion constraints cannot be added +create table idxpart (a int, b int) partition by range (a); +alter table idxpart add exclude (a with =); +drop table idxpart; + +-- When (sub)partitions are created, they also contain the constraint +create table idxpart (a int, b int, primary key (a, b)) partition by range (a, b); +create table idxpart1 partition of idxpart for values from (1, 1) to (10, 10); +create table idxpart2 partition of idxpart for values from (10, 10) to (20, 20) + partition by range (b); +create table idxpart21 partition of idxpart2 for values from (10) to (15); +create table idxpart22 partition of idxpart2 for values from (15) to (20); +create table idxpart3 (a int not null, b int not null); +alter table idxpart attach partition idxpart3 for values from (20, 20) to (30, 30); +select conname, contype, conrelid::regclass, conindid::regclass, conkey + from pg_constraint where conrelid::regclass::text like 'idxpart%' + order by conname; +drop table idxpart; + +-- Verify that multi-layer partitioning honors the requirement that all +-- columns in the partition key must appear in primary key +create table idxpart (a int, b int, primary key (a)) partition by range (a); +create table idxpart2 partition of idxpart +for values from (0) to (1000) partition by range (b); -- fail +drop table idxpart; + +-- Multi-layer partitioning works correctly in this case: +create table idxpart (a int, b int, primary key (a, b)) partition by range (a); +create table idxpart2 partition of idxpart for values from (0) to (1000) partition by range (b); +create table idxpart21 partition of idxpart2 for values from (0) to (1000); +select conname, contype, conrelid::regclass, conindid::regclass, conkey + from pg_constraint where conrelid::regclass::text like 'idxpart%' + order by conname; +drop table idxpart; + +-- If a partitioned table has a unique/PK constraint, then it's not possible +-- to drop the corresponding constraint in the children; nor it's possible +-- to drop the indexes individually. Dropping the constraint in the parent +-- gets rid of the lot. 
+create table idxpart (i int) partition by hash (i); +create table idxpart0 partition of idxpart (i) for values with (modulus 2, remainder 0); +create table idxpart1 partition of idxpart (i) for values with (modulus 2, remainder 1); +alter table idxpart0 add primary key(i); +alter table idxpart add primary key(i); +select indrelid::regclass, indexrelid::regclass, inhparent::regclass, indisvalid, + conname, conislocal, coninhcount, connoinherit, convalidated + from pg_index idx left join pg_inherits inh on (idx.indexrelid = inh.inhrelid) + left join pg_constraint con on (idx.indexrelid = con.conindid) + where indrelid::regclass::text like 'idxpart%' + order by indexrelid::regclass::text collate "C"; +drop index idxpart0_pkey; -- fail +drop index idxpart1_pkey; -- fail +alter table idxpart0 drop constraint idxpart0_pkey; -- fail +alter table idxpart1 drop constraint idxpart1_pkey; -- fail +alter table idxpart drop constraint idxpart_pkey; -- ok +select indrelid::regclass, indexrelid::regclass, inhparent::regclass, indisvalid, + conname, conislocal, coninhcount, connoinherit, convalidated + from pg_index idx left join pg_inherits inh on (idx.indexrelid = inh.inhrelid) + left join pg_constraint con on (idx.indexrelid = con.conindid) + where indrelid::regclass::text like 'idxpart%' + order by indexrelid::regclass::text collate "C"; +drop table idxpart; + +-- If a partitioned table has a constraint whose index is not valid, +-- attaching a missing partition makes it valid. +create table idxpart (a int) partition by range (a); +create table idxpart0 (like idxpart); +alter table idxpart0 add primary key (a); +alter table idxpart attach partition idxpart0 for values from (0) to (1000); +alter table only idxpart add primary key (a); +select indrelid::regclass, indexrelid::regclass, inhparent::regclass, indisvalid, + conname, conislocal, coninhcount, connoinherit, convalidated + from pg_index idx left join pg_inherits inh on (idx.indexrelid = inh.inhrelid) + left join pg_constraint con on (idx.indexrelid = con.conindid) + where indrelid::regclass::text like 'idxpart%' + order by indexrelid::regclass::text collate "C"; +alter index idxpart_pkey attach partition idxpart0_pkey; +select indrelid::regclass, indexrelid::regclass, inhparent::regclass, indisvalid, + conname, conislocal, coninhcount, connoinherit, convalidated + from pg_index idx left join pg_inherits inh on (idx.indexrelid = inh.inhrelid) + left join pg_constraint con on (idx.indexrelid = con.conindid) + where indrelid::regclass::text like 'idxpart%' + order by indexrelid::regclass::text collate "C"; +drop table idxpart; + +-- if a partition has a unique index without a constraint, does not attach +-- automatically; creates a new index instead. 
+create table idxpart (a int, b int) partition by range (a); +create table idxpart1 (a int not null, b int); +create unique index on idxpart1 (a); +alter table idxpart add primary key (a); +alter table idxpart attach partition idxpart1 for values from (1) to (1000); +select indrelid::regclass, indexrelid::regclass, inhparent::regclass, indisvalid, + conname, conislocal, coninhcount, connoinherit, convalidated + from pg_index idx left join pg_inherits inh on (idx.indexrelid = inh.inhrelid) + left join pg_constraint con on (idx.indexrelid = con.conindid) + where indrelid::regclass::text like 'idxpart%' + order by indexrelid::regclass::text collate "C"; +drop table idxpart; + +-- Can't attach an index without a corresponding constraint +create table idxpart (a int, b int) partition by range (a); +create table idxpart1 (a int not null, b int); +create unique index on idxpart1 (a); +alter table idxpart attach partition idxpart1 for values from (1) to (1000); +alter table only idxpart add primary key (a); +alter index idxpart_pkey attach partition idxpart1_a_idx; -- fail +drop table idxpart; + +-- Test that unique constraints are working +create table idxpart (a int, b text, primary key (a, b)) partition by range (a); +create table idxpart1 partition of idxpart for values from (0) to (100000); +create table idxpart2 (like idxpart); +insert into idxpart2 (a, b) values (572814, 'inserted first'); +create unique index on idxpart (a); +alter table idxpart attach partition idxpart2 for values from (100000) to (1000000); +insert into idxpart values (0, 'zero'), (42, 'life'), (2^16, 'sixteen'); +insert into idxpart select 2^g, format('two to power of %s', g) from generate_series(15, 17) g; +insert into idxpart values (16, 'sixteen'); +insert into idxpart (b, a) values ('one', 142857), ('two', 285714); +insert into idxpart select a * 2, b || b from idxpart where a between 2^16 and 2^19; +insert into idxpart values (572814, 'five'); +insert into idxpart values (857142, 'six'); +select tableoid::regclass, * from idxpart order by a; +drop table idxpart; + -- intentionally leave some objects around create table idxpart (a int) partition by range (a); create table idxpart1 partition of idxpart for values from (0) to (100); @@ -389,3 +556,5 @@ create index on idxpart22 (a); create index on only idxpart2 (a); alter index idxpart2_a_idx attach partition idxpart22_a_idx; create index on idxpart (a); +create table idxpart_another (a int, b int, primary key (a, b)) partition by range (a); +create table idxpart_another_1 partition of idxpart_another for values from (0) to (100); From e097352eb5e8ed997c83bd3029ff718226870d0d Mon Sep 17 00:00:00 2001 From: youngxie Date: Thu, 18 Mar 2021 12:26:51 +0000 Subject: [PATCH 310/578] Solve the issue of hash index on coordinator. http://tapd.oa.com/10092131/bugtrace/bugs/view?bug_id=1010092131085729301&url_cache_key=d4e1402777dc733479aac463ad1a9d24 (cherry picked from commit 3bb2732c) 74fa9796 Solve the issue of hash index on coordinator. 
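For illustration only (the table and index names below are hypothetical and not part of this change): building a hash index from the coordinator, e.g.

    CREATE TABLE t_hash (id int);
    CREATE INDEX ON t_hash USING hash (id);

previously fed the heap's size estimates into the index build even though the coordinator stores no rows; with this change the coordinator-side build uses zeroed estimates (relpages = reltuples = allvisfrac = 0) before initializing the hash metadata page.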
http://tapd.oa.com/10092131/bugtrace/bugs/view?bug_id=1010092131085729301&url_cache_key=d4e1402777dc733479aac463ad1a9d24 --- src/backend/access/hash/hash.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c index b7e21348..82d6e924 100644 --- a/src/backend/access/hash/hash.c +++ b/src/backend/access/hash/hash.c @@ -120,7 +120,15 @@ hashbuild(Relation heap, Relation index, IndexInfo *indexInfo) RelationGetRelationName(index)); /* Estimate the number of rows currently present in the table */ + if (IS_PGXC_COORDINATOR) + { + /* Coordinator has no data */ + relpages = reltuples = allvisfrac = 0; + } + else + { estimate_rel_size(heap, NULL, &relpages, &reltuples, &allvisfrac); + } /* Initialize the hash index metadata page and initial buckets */ num_buckets = _hash_init(index, reltuples, MAIN_FORKNUM); From 73937edc074f05e41ddb4e4ea78ae012005d085c Mon Sep 17 00:00:00 2001 From: gregsun Date: Tue, 22 Dec 2020 23:27:59 +0800 Subject: [PATCH 311/578] Patch from PostgreSQL - pg_dump keeps finding loop due to partition indexes loop dependency. http://tapd.oa.com/20421696/bugtrace/bugs/view?bug_id=1020421696084240131 commit 8cff4f5348d075e063100071013f00a900c32b0f Author: Tom Lane Date: Tue Aug 28 19:33:04 2018 -0400 Code review for pg_dump's handling of ALTER INDEX ATTACH PARTITION. Ensure the TOC entry is marked with the correct schema, so that its name is as unique as the index's is. Fix the dependencies: we want dependencies from this TOC entry to the two indexes it depends on, and we don't care (at least not for this purpose) what order the indexes are created in. Also, add dependencies on the indexes' underlying tables. Those might seem pointless given the index dependencies, but they are helpful to cue parallel restore to avoid running the ATTACH PARTITION in parallel with other DDL on the same tables. Discussion: https://postgr.es/m/10817.1535494963@sss.pgh.pa.us --- src/bin/pg_dump/common.c | 24 +++++++++++++++++++----- src/bin/pg_dump/pg_dump.c | 5 +++-- 2 files changed, 22 insertions(+), 7 deletions(-) diff --git a/src/bin/pg_dump/common.c b/src/bin/pg_dump/common.c index 0942fa5b..21fdcc52 100644 --- a/src/bin/pg_dump/common.c +++ b/src/bin/pg_dump/common.c @@ -403,17 +403,31 @@ flagInhIndexes(Archive *fout, TableInfo tblinfo[], int numTables) attachinfo[k].dobj.catId.oid = 0; AssignDumpId(&attachinfo[k].dobj); attachinfo[k].dobj.name = pg_strdup(index->dobj.name); + attachinfo[k].dobj.namespace = index->indextable->dobj.namespace; attachinfo[k].parentIdx = parentidx; attachinfo[k].partitionIdx = index; /* - * We want dependencies from parent to partition (so that the - * partition index is created first), and another one from - * attach object to parent (so that the partition index is - * attached once the parent index has been created). + * We must state the DO_INDEX_ATTACH object's dependencies + * explicitly, since it will not match anything in pg_depend. + * + * Give it dependencies on both the partition index and the parent + * index, so that it will not be executed till both of those + * exist. (There's no need to care what order those are created + * in.) + * + * In addition, give it dependencies on the indexes' underlying + * tables. This does nothing of great value so far as serial + * restore ordering goes, but it ensures that a parallel restore + * will not try to run the ATTACH concurrently with other + * operations on those tables. 
*/ - addObjectDependency(&parentidx->dobj, index->dobj.dumpId); + addObjectDependency(&attachinfo[k].dobj, index->dobj.dumpId); addObjectDependency(&attachinfo[k].dobj, parentidx->dobj.dumpId); + addObjectDependency(&attachinfo[k].dobj, + index->indextable->dobj.dumpId); + addObjectDependency(&attachinfo[k].dobj, + parentidx->indextable->dobj.dumpId); k++; } diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c index b11b02ae..89685c9f 100644 --- a/src/bin/pg_dump/pg_dump.c +++ b/src/bin/pg_dump/pg_dump.c @@ -16591,7 +16591,7 @@ dumpIndexAttach(Archive *fout, IndexAttachInfo *attachinfo) { PQExpBuffer q = createPQExpBuffer(); - appendPQExpBuffer(q, "\nALTER INDEX %s ", + appendPQExpBuffer(q, "ALTER INDEX %s ", fmtQualifiedId(fout->remoteVersion, attachinfo->parentIdx->dobj.namespace->dobj.name, attachinfo->parentIdx->dobj.name)); @@ -16602,7 +16602,8 @@ dumpIndexAttach(Archive *fout, IndexAttachInfo *attachinfo) ArchiveEntry(fout, attachinfo->dobj.catId, attachinfo->dobj.dumpId, attachinfo->dobj.name, - NULL, NULL, + attachinfo->dobj.namespace->dobj.name, + NULL, "", false, "INDEX ATTACH", SECTION_POST_DATA, q->data, "", NULL, From 7c3781072d85468101d69278a93e95047619e833 Mon Sep 17 00:00:00 2001 From: youngxie Date: Wed, 28 Apr 2021 17:36:05 +0800 Subject: [PATCH 312/578] Optimize distinct agg. Do distinct on datanodes,then agg can run parallel. http://tapd.oa.com/10092131/bugtrace/bugs/view?bug_id=1010092131086201101&url_cache_key=d4e1402777dc733479aac463ad1a9d24 --- src/backend/executor/nodeAgg.c | 14 +-- src/backend/nodes/copyfuncs.c | 1 + src/backend/nodes/outfuncs.c | 2 + src/backend/nodes/readfuncs.c | 1 + src/backend/optimizer/plan/createplan.c | 7 +- src/backend/optimizer/plan/planner.c | 153 ++++++++++++++++++++++-- src/backend/optimizer/plan/setrefs.c | 1 - src/backend/optimizer/util/pathnode.c | 86 +++++++++++++ src/backend/optimizer/util/tlist.c | 25 ++++ src/backend/utils/misc/guc.c | 15 +++ src/include/nodes/plannodes.h | 1 + src/include/nodes/relation.h | 1 + src/include/optimizer/pathnode.h | 3 + src/include/optimizer/tlist.h | 30 +++-- src/test/regress/expected/sysviews.out | 3 +- 15 files changed, 309 insertions(+), 34 deletions(-) diff --git a/src/backend/executor/nodeAgg.c b/src/backend/executor/nodeAgg.c index 8b1695a6..0d0185c9 100644 --- a/src/backend/executor/nodeAgg.c +++ b/src/backend/executor/nodeAgg.c @@ -4624,6 +4624,7 @@ build_pertrans_for_aggref(AggStatePerTrans pertrans, int numDistinctCols; int naggs; int i; + Agg *agg = (Agg *)aggstate->ss.ps.plan; /* Begin filling in the pertrans data */ pertrans->aggref = aggref; @@ -4785,11 +4786,13 @@ build_pertrans_for_aggref(AggStatePerTrans pertrans, * have a list of SortGroupClause nodes; fish out the data in them and * stick them into arrays. We ignore ORDER BY for an ordered-set agg, * however; the agg's transfn and finalfn are responsible for that. + * Distributed distinct agg does not need distinct in second phase. * * Note that by construction, if there is a DISTINCT clause then the ORDER * BY clause is a prefix of it (see transformDistinctClause). 
*/ - if (AGGKIND_IS_ORDERED_SET(aggref->aggkind)) + if (AGGKIND_IS_ORDERED_SET(aggref->aggkind) + || agg->noDistinct) { sortlist = NIL; numSortCols = numDistinctCols = 0; @@ -4820,12 +4823,6 @@ build_pertrans_for_aggref(AggStatePerTrans pertrans, pertrans->sortslot = ExecInitExtraTupleSlot(estate); ExecSetSlotDescriptor(pertrans->sortslot, pertrans->sortdesc); - /* - * We don't implement DISTINCT or ORDER BY aggs in the HASHED case - * (yet) - */ - Assert(aggstate->aggstrategy != AGG_HASHED && aggstate->aggstrategy != AGG_MIXED); - /* If we have only one input, we need its len/byval info. */ if (numInputs == 1) { @@ -4869,7 +4866,8 @@ build_pertrans_for_aggref(AggStatePerTrans pertrans, Assert(i == numSortCols); } - if (aggref->aggdistinct) + /* Distributed distinct agg does not need distinct in second phase. */ + if (aggref->aggdistinct && !agg->noDistinct) { Assert(numArguments > 0); diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c index 8bc360f1..ed04f1d4 100644 --- a/src/backend/nodes/copyfuncs.c +++ b/src/backend/nodes/copyfuncs.c @@ -1047,6 +1047,7 @@ _copyAgg(const Agg *from) #ifdef __TBASE__ COPY_SCALAR_FIELD(entrySize); COPY_SCALAR_FIELD(hybrid); + COPY_SCALAR_FIELD(noDistinct); #endif return newnode; diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c index 7df4571b..ac7ea190 100644 --- a/src/backend/nodes/outfuncs.c +++ b/src/backend/nodes/outfuncs.c @@ -1330,6 +1330,7 @@ _outAgg(StringInfo str, const Agg *node) #ifdef __TBASE__ WRITE_UINT_FIELD(entrySize); WRITE_BOOL_FIELD(hybrid); + WRITE_BOOL_FIELD(noDistinct); #endif } @@ -3317,6 +3318,7 @@ _outAggPath(StringInfo str, const AggPath *node) #ifdef __TBASE__ WRITE_UINT_FIELD(entrySize); WRITE_BOOL_FIELD(hybrid); + WRITE_BOOL_FIELD(noDistinct); #endif } diff --git a/src/backend/nodes/readfuncs.c b/src/backend/nodes/readfuncs.c index 72e9a6fa..b13796e7 100644 --- a/src/backend/nodes/readfuncs.c +++ b/src/backend/nodes/readfuncs.c @@ -3279,6 +3279,7 @@ _readAgg(void) #ifdef __TBASE__ READ_UINT_FIELD(entrySize); READ_BOOL_FIELD(hybrid); + READ_BOOL_FIELD(noDistinct); #endif READ_DONE(); diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index 706b3340..13c8ec79 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -2331,6 +2331,8 @@ create_agg_plan(PlannerInfo *root, AggPath *best_path) } } } + + plan->noDistinct = best_path->noDistinct; #endif return plan; @@ -6513,7 +6515,9 @@ make_remotesubplan(PlannerInfo *root, { Agg *node = (Agg *)lefttree; - if (node->aggsplit == AGGSPLIT_INITIAL_SERIAL) + /* do not parallel if it's not safe */ + if (node->aggsplit == AGGSPLIT_INITIAL_SERIAL + && lefttree->parallel_safe) { switch(node->aggstrategy) { @@ -7999,6 +8003,7 @@ make_agg(List *tlist, List *qual, #ifdef __TBASE__ node->hybrid = false; node->entrySize = 0; + node->noDistinct = false; #endif plan->qual = qual; plan->targetlist = tlist; diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index 7ee9d475..fc87fcbc 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -79,6 +79,7 @@ create_upper_paths_hook_type create_upper_paths_hook = NULL; #ifdef __TBASE__ bool olap_optimizer = false; +bool enable_distinct_optimizer; #endif /* Expression kind codes for preprocess_expression */ @@ -208,7 +209,7 @@ static PathTarget *make_sort_input_target(PlannerInfo *root, PathTarget *final_target, bool *have_postponed_srfs); static bool 
grouping_distribution_match(PlannerInfo *root, Query *parse, - Path *path, List *clauses); + Path *path, List *clauses, List *targetList); static bool groupingsets_distribution_match(PlannerInfo *root, Query *parse, Path *path); static Path *adjust_path_distribution(PlannerInfo *root, Query *parse, @@ -239,6 +240,9 @@ static bool can_parallel_agg(PlannerInfo *root, RelOptInfo *input_rel, RelOptInfo *grouped_rel, const AggClauseCosts *agg_costs); #ifdef __TBASE__ static Path *adjust_modifytable_subpath(PlannerInfo *root, Query *parse, Path *path); +static bool can_distinct_agg_optimize(PlannerInfo *root, RelOptInfo *input_rel, + RelOptInfo *grouped_rel, PathTarget *pathtarget, + const AggClauseCosts *agg_costs); #endif /***************************************************************************** @@ -4170,6 +4174,7 @@ create_ordinary_grouping_paths(PlannerInfo *root, RelOptInfo *input_rel, bool can_sort; bool try_parallel_aggregation; bool try_distributed_aggregation; + bool try_distributed_distinct_agg_optimize; PathTarget *partial_grouping_target = NULL; ListCell *lc; @@ -4241,6 +4246,7 @@ create_ordinary_grouping_paths(PlannerInfo *root, RelOptInfo *input_rel, { /* Not even parallel-safe. */ try_distributed_aggregation = false; + try_distributed_distinct_agg_optimize = false; } else if (!parse->hasAggs && parse->groupClause == NIL) { @@ -4249,26 +4255,44 @@ create_ordinary_grouping_paths(PlannerInfo *root, RelOptInfo *input_rel, * some aggregates or a grouping clause. */ try_distributed_aggregation = false; + try_distributed_distinct_agg_optimize = false; } else if (parse->groupingSets) { /* We don't know how to do grouping sets in parallel. */ try_distributed_aggregation = false; + try_distributed_distinct_agg_optimize = false; } - else if (agg_costs->hasNonPartial || agg_costs->hasNonSerial) + else if (agg_costs->hasNonSerial) + { + /* Insufficient support for partial mode. */ + try_distributed_aggregation = false; + try_distributed_distinct_agg_optimize = false; + } + else if (agg_costs->hasNonPartial) { /* Insufficient support for partial mode. */ try_distributed_aggregation = false; + /* Ignore by distint agg optimize */ + try_distributed_distinct_agg_optimize = true; } else { /* Everything looks good. */ try_distributed_aggregation = true; + try_distributed_distinct_agg_optimize = true; } /* Whenever parallel aggregation is allowed, distributed should be too. */ Assert(!(try_parallel_aggregation && !try_distributed_aggregation)); + if (try_distributed_distinct_agg_optimize && + !can_distinct_agg_optimize(root, input_rel, grouped_rel, + target ,agg_costs)) + { + try_distributed_distinct_agg_optimize = false; + } + /* * Before generating paths for grouped_rel, we first generate any possible * partial paths; that way, later code can easily consider both parallel @@ -4383,7 +4407,7 @@ create_ordinary_grouping_paths(PlannerInfo *root, RelOptInfo *input_rel, * we know the per-node groupings won't overlap. But here we need to be * more careful. 
*/ - if (try_distributed_aggregation) + if (try_distributed_aggregation || try_distributed_distinct_agg_optimize) { partial_grouping_target = make_partial_grouping_target(root, target, (Node *) parse->havingQual); @@ -4415,7 +4439,10 @@ create_ordinary_grouping_paths(PlannerInfo *root, RelOptInfo *input_rel, AGGSPLIT_FINAL_DESERIAL, &agg_final_costs); } + } + if (try_distributed_aggregation) + { /* Build final XL grouping paths */ if (can_sort) { @@ -4762,6 +4789,57 @@ create_ordinary_grouping_paths(PlannerInfo *root, RelOptInfo *input_rel, } } + if (try_distributed_distinct_agg_optimize) + { + List *groupExprs = NIL; + Aggref *agg = get_optimize_distinct_agg(target); + + groupExprs = get_sortgrouplist_exprs(agg->aggdistinct, agg->args); + + dNumPartialGroups = estimate_num_groups(root, groupExprs, cheapest_path->rows, + NULL); + + foreach (lc, input_rel->pathlist) + { + Path *path = (Path *)lfirst(lc); + + /* check if we need redistribute */ + if (!grouping_distribution_match(root, parse, path, agg->aggdistinct, agg->args)) + { + path = create_redistribute_distinct_agg_path(root, parse, path, agg); + } + + path = (Path *)create_agg_path(root, + grouped_rel, + path, + partial_grouping_target, + AGG_HASHED, + AGGSPLIT_INITIAL_SERIAL, + parse->groupClause, + NULL, + &agg_partial_costs, + dNumPartialGroups); + /* partial is not parallel safe */ + path->parallel_safe = false; + + path = create_remotesubplan_path(root, path, NULL); + + path = (Path *)create_agg_path(root, + grouped_rel, + path, + target, + AGG_HASHED, + AGGSPLIT_FINAL_DESERIAL, + parse->groupClause, + NULL, + &agg_final_costs, + 1); + ((AggPath *)path)->noDistinct = true; + + add_path(grouped_rel, path); + } + } + /* Give a helpful error if we failed to find any implementation */ if (grouped_rel->pathlist == NIL) ereport(ERROR, @@ -5443,7 +5521,7 @@ create_distinct_paths(PlannerInfo *root, * FIXME This could probably benefit from pushing a UNIQUE * to the remote side, and only doing a merge locally. */ - if (!grouping_distribution_match(root, parse, path, parse->distinctClause)) + if (!grouping_distribution_match(root, parse, path, parse->distinctClause, parse->targetList)) path = create_remotesubplan_path(root, path, NULL); add_path(distinct_rel, (Path *) @@ -5474,7 +5552,7 @@ create_distinct_paths(PlannerInfo *root, -1.0); /* In case of grouping / distribution mismatch, inject remote scan. */ - if (!grouping_distribution_match(root, parse, path, parse->distinctClause)) + if (!grouping_distribution_match(root, parse, path, parse->distinctClause, parse->targetList)) path = create_remotesubplan_path(root, path, NULL); add_path(distinct_rel, (Path *) @@ -5520,7 +5598,7 @@ create_distinct_paths(PlannerInfo *root, Path *input_path = cheapest_input_path; /* If needed, inject RemoteSubplan redistributing the data. */ - if (!grouping_distribution_match(root, parse, input_path, parse->distinctClause)) + if (!grouping_distribution_match(root, parse, input_path, parse->distinctClause, parse->targetList)) input_path = create_remotesubplan_path(root, input_path, NULL); /* XXX Maybe we can make this a 2-phase aggregate too? 
*/ @@ -6784,7 +6862,7 @@ plan_cluster_use_sort(Oid tableOid, Oid indexOid) */ static bool grouping_distribution_match(PlannerInfo *root, Query *parse, Path *path, - List *clauses) + List *clauses, List *targetList) { int i; bool matches_key = false; @@ -6792,7 +6870,7 @@ grouping_distribution_match(PlannerInfo *root, Query *parse, Path *path, int numGroupCols = list_length(clauses); AttrNumber *groupColIdx = extract_grouping_cols(clauses, - parse->targetList); + targetList); #ifdef __COLD_HOT__ if (has_cold_hot_table) @@ -6826,7 +6904,7 @@ grouping_distribution_match(PlannerInfo *root, Query *parse, Path *path, */ for (i = 0; i < numGroupCols; i++) { - TargetEntry *te = (TargetEntry *)list_nth(parse->targetList, + TargetEntry *te = (TargetEntry *)list_nth(targetList, groupColIdx[i]-1); if (equal(te->expr, distribution->distributionExpr)) @@ -8248,6 +8326,61 @@ adjust_path_distribution(PlannerInfo *root, Query *parse, Path *path) return path; } +#ifdef __TBASE__ +/* + * can_distinct_agg_optimize + * Check if distinct app is workable. + */ +static bool +can_distinct_agg_optimize(PlannerInfo *root, RelOptInfo *input_rel, + RelOptInfo *grouped_rel, PathTarget *pathtarget, + const AggClauseCosts *agg_costs) +{ + ListCell *lc = NULL; + Query *parse = NULL; + bool meet_distint_agg_clause = false; + + parse = root->parse; + + /* It's no use for 2phase agg on datanode */ + if (!grouped_rel->consider_parallel || input_rel->partial_pathlist == NIL || + !agg_costs->hasOnlyDistinct || agg_costs->hasNonSerial || agg_costs->hasOrder || + parse->groupClause || parse->groupingSets || parse->havingQual || + parse->distinctClause || has_cold_hot_table || !olap_optimizer || !enable_distinct_optimizer || + IS_PGXC_DATANODE) + { + return false; + } + + foreach (lc, pathtarget->exprs) + { + Aggref *aggref = (Aggref *)lfirst(lc); + + if (IsA(aggref, Aggref) && aggref->aggdistinct != NIL) + { + /* only one distinct agg is allowed */ + if(meet_distint_agg_clause) + return false; + + if (list_length(aggref->aggdistinct) != 1 || + list_length(aggref->args) != 1) + { + return false; + } + + /* currently we only support hash agg */ + if (!grouping_is_hashable(aggref->aggdistinct)) + { + return false; + } + meet_distint_agg_clause = true; + } + } + + return meet_distint_agg_clause; +} +#endif + static bool can_push_down_grouping(PlannerInfo *root, Query *parse, Path *path) { @@ -8269,7 +8402,7 @@ can_push_down_grouping(PlannerInfo *root, Query *parse, Path *path) if (parse->groupingSets) return groupingsets_distribution_match(root, parse, path); - return grouping_distribution_match(root, parse, path, parse->groupClause); + return grouping_distribution_match(root, parse, path, parse->groupClause, parse->targetList); } static bool diff --git a/src/backend/optimizer/plan/setrefs.c b/src/backend/optimizer/plan/setrefs.c index 805585b7..d1016beb 100644 --- a/src/backend/optimizer/plan/setrefs.c +++ b/src/backend/optimizer/plan/setrefs.c @@ -1877,7 +1877,6 @@ convert_combining_aggrefs(Node *node, void *context) /* Assert we've not chosen to partial-ize any unsupported cases */ Assert(orig_agg->aggorder == NIL); - Assert(orig_agg->aggdistinct == NIL); /* * Since aggregate calls can't be nested, we needn't recurse into the diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index 4d2a1f32..35bf8b8a 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -3058,6 +3058,92 @@ get_num_connections(int numnodes, int nRemotePlans) return num_connections; 
} + +/* + * redistribute local grouping results among datanodes for + * distinct aggs like count(distinct a) or avg(distinct a)... + * + * Tips: we do not check the agg column's type, directly use that + * as hash column, but some data types are not supported as hash column now, + * maybe some errors. + */ +Path * +create_redistribute_distinct_agg_path(PlannerInfo *root, Query *parse, Path *path, Aggref *agg) +{ + PathTarget *pathtarget = path->pathtarget; + TargetEntry *te = NULL; + Bitmapset *nodes = NULL; + Oid group; + int i; + + te = get_sortgroupclause_tle((SortGroupClause *)linitial(agg->aggdistinct), + agg->args); + + if(te == NULL) + { + elog(ERROR, "Distinct aggref not found in pathtarget."); + } + + if (list_length(groupOids) > 1) + { + groupOids = NULL; + elog(ERROR, "Tables from different groups should not be invloved in one Query."); + } + + if (groupOids) + { + group = linitial_oid(groupOids); + } + else + { + group = InvalidOid; + } + + if (group == InvalidOid) + { + for (i = 0; i < NumDataNodes; i++) + nodes = bms_add_member(nodes, i); + + /* + * FIXING ME! check hash column's data type to satisfity hash locator func + */ + path = redistribute_path(root, + path, + NULL, + LOCATOR_TYPE_HASH, + (Node *)te->expr, + nodes, + NULL); + } + else + { + ListCell *cell; + List *nodelist = GetGroupNodeList(group); + + foreach (cell, nodelist) + { + int nodeid = lfirst_int(cell); + + nodes = bms_add_member(nodes, nodeid); + } + /* + * FIXING ME! check hash column's data type to satisfity hash locator func + */ + path = redistribute_path(root, + path, + NULL, + LOCATOR_TYPE_SHARD, + (Node *)te->expr, + nodes, + NULL); + } + + path->pathkeys = NULL; + path->pathtarget = pathtarget; + + return path; +} + /* * redistribute local grouping results among datanodes, then * get the final grouping results. seems more efficient... diff --git a/src/backend/optimizer/util/tlist.c b/src/backend/optimizer/util/tlist.c index 6fc75bd5..496fd970 100644 --- a/src/backend/optimizer/util/tlist.c +++ b/src/backend/optimizer/util/tlist.c @@ -468,6 +468,31 @@ get_sortgrouplist_exprs(List *sgClauses, List *targetList) return result; } +/* + * get_distinct_agg_sortgroupclause + * Given a pathtarget , acquire distinct clause + * for aggref with distinct. + * Notice: only one distinct agg clause with one col + * is allowed. 
+ */ +Aggref * +get_optimize_distinct_agg(PathTarget *pathtarget) +{ + ListCell *lc = NULL; + + foreach (lc, pathtarget->exprs) + { + Aggref *aggref = (Aggref *)lfirst(lc); + + if (IsA(aggref, Aggref) && aggref->aggdistinct != NIL) + { + Assert(list_length(aggref->aggdistinct) == 1); + return aggref; + } + } + + return NULL; +} /***************************************************************************** * Functions to extract data from a list of SortGroupClauses diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index e7ba54b0..488fd88f 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -201,6 +201,7 @@ extern bool PlpgsqlDebugPrint; /* used for get total size of session */ static int32 g_TotalMemorySize = 0; extern bool enable_parallel_ddl; +extern bool enable_distinct_optimizer; #endif static int GUC_check_errcode_value; @@ -2787,6 +2788,20 @@ static struct config_bool ConfigureNamesBool[] = }, #endif + { + {"enable_distinct_optimizer", PGC_SUSET, CUSTOM_OPTIONS, + gettext_noop("push down distinct to datanodes."), + NULL + }, + &enable_distinct_optimizer + , +#ifdef _PG_REGRESS_ + true, +#else + false, +#endif + NULL, NULL, NULL + }, /* End-of-list marker */ { diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h index 4b3c49d2..c20a6741 100644 --- a/src/include/nodes/plannodes.h +++ b/src/include/nodes/plannodes.h @@ -855,6 +855,7 @@ typedef struct Agg #ifdef __TBASE__ uint32 entrySize; bool hybrid; + bool noDistinct; /* no need of distinct related initialization */ #endif } Agg; diff --git a/src/include/nodes/relation.h b/src/include/nodes/relation.h index e49bc1a0..4b752d16 100644 --- a/src/include/nodes/relation.h +++ b/src/include/nodes/relation.h @@ -1710,6 +1710,7 @@ typedef struct AggPath #ifdef __TBASE__ uint32 entrySize; bool hybrid; + bool noDistinct; /* no need of distinct related initialization */ #endif } AggPath; diff --git a/src/include/optimizer/pathnode.h b/src/include/optimizer/pathnode.h index 4097e568..a3afb1a4 100644 --- a/src/include/optimizer/pathnode.h +++ b/src/include/optimizer/pathnode.h @@ -365,6 +365,9 @@ extern RelOptInfo *build_child_join_rel(PlannerInfo *root, #ifdef __TBASE__ extern Path *create_redistribute_grouping_path(PlannerInfo *root, Query *parse, Path *path); +extern Path *create_redistribute_distinct_agg_path(PlannerInfo *root, + Query *parse, Path *path, + Aggref *agg); extern void contains_remotesubplan(Path *path, int *number, bool *redistribute); extern int replication_level; diff --git a/src/include/optimizer/tlist.h b/src/include/optimizer/tlist.h index 5b9d94b0..076bfbab 100644 --- a/src/include/optimizer/tlist.h +++ b/src/include/optimizer/tlist.h @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * * tlist.h - * prototypes for tlist.c. + * prototypes for tlist.c. 
* * * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group @@ -24,7 +24,7 @@ extern List *add_to_flat_tlist(List *tlist, List *exprs); extern List *get_tlist_exprs(List *tlist, bool includeJunk); -extern int count_nonjunk_tlist_entries(List *tlist); +extern int count_nonjunk_tlist_entries(List *tlist); extern bool tlist_same_exprs(List *tlist1, List *tlist2); @@ -34,18 +34,22 @@ extern bool tlist_same_collations(List *tlist, List *colCollations, bool junkOK) extern void apply_tlist_labeling(List *dest_tlist, List *src_tlist); extern TargetEntry *get_sortgroupref_tle(Index sortref, - List *targetList); + List *targetList); extern TargetEntry *get_sortgroupclause_tle(SortGroupClause *sgClause, - List *targetList); + List *targetList); extern Node *get_sortgroupclause_expr(SortGroupClause *sgClause, - List *targetList); + List *targetList); extern List *get_sortgrouplist_exprs(List *sgClauses, - List *targetList); + List *targetList); + +#ifdef __TBASE__ +extern Aggref *get_optimize_distinct_agg(PathTarget *pathtarget); +#endif extern SortGroupClause *get_sortgroupref_clause(Index sortref, - List *clauses); + List *clauses); extern SortGroupClause *get_sortgroupref_clause_noerr(Index sortref, - List *clauses); + List *clauses); extern Oid *extract_grouping_ops(List *groupClause); extern AttrNumber *extract_grouping_cols(List *groupClause, List *tlist); @@ -57,16 +61,16 @@ extern List *make_tlist_from_pathtarget(PathTarget *target); extern PathTarget *copy_pathtarget(PathTarget *src); extern PathTarget *create_empty_pathtarget(void); extern void add_column_to_pathtarget(PathTarget *target, - Expr *expr, Index sortgroupref); + Expr *expr, Index sortgroupref); extern void add_new_column_to_pathtarget(PathTarget *target, Expr *expr); extern void add_new_columns_to_pathtarget(PathTarget *target, List *exprs); extern void apply_pathtarget_labeling_to_tlist(List *tlist, PathTarget *target); extern void split_pathtarget_at_srfs(PlannerInfo *root, - PathTarget *target, PathTarget *input_target, - List **targets, List **targets_contain_srfs); + PathTarget *target, PathTarget *input_target, + List **targets, List **targets_contain_srfs); /* Convenience macro to get a PathTarget with valid cost/width fields */ #define create_pathtarget(root, tlist) \ - set_pathtarget_cost_width(root, make_pathtarget_from_tlist(tlist)) + set_pathtarget_cost_width(root, make_pathtarget_from_tlist(tlist)) -#endif /* TLIST_H */ +#endif /* TLIST_H */ diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out index 65fd3d80..ca0d242d 100644 --- a/src/test/regress/expected/sysviews.out +++ b/src/test/regress/expected/sysviews.out @@ -91,6 +91,7 @@ select name, setting from pg_settings where name like 'enable%'; enable_crypt_parellel_debug | off enable_data_mask | on enable_datanode_row_triggers | off + enable_distinct_optimizer | on enable_distri_debug | off enable_distri_debug_print | off enable_distri_visibility_print | off @@ -135,7 +136,7 @@ select name, setting from pg_settings where name like 'enable%'; enable_transparent_crypt | on enable_user_authority_force_check | off enable_xlog_mprotect | on -(62 rows) +(63 rows) -- Test that the pg_timezone_names and pg_timezone_abbrevs views are -- more-or-less working. We can't test their contents in any great detail From b2b74898e764807bba6cea78a611dace2d4de3ed Mon Sep 17 00:00:00 2001 From: andrelin Date: Tue, 29 Jun 2021 15:28:15 +0800 Subject: [PATCH 313/578] 1. 
Look deep into subxid array, set max as local_xid on secondary DN 2. Copy local_subxids to parallel workers 3. Force parallel workers to send cid 4. regress tpad: http://tapd.oa.com/TBase_Oracle_Migration/bugtrace/bugs/view/1020421696089290349 --- src/backend/access/transam/varsup.c | 47 +++++++++- src/backend/access/transam/xact.c | 30 +++++++ src/backend/pgxc/pool/pgxcnode.c | 3 +- src/backend/storage/ipc/procarray.c | 19 +++- src/include/access/transam.h | 2 + src/include/storage/procarray.h | 92 ++++++++++---------- src/test/regress/expected/transactions_2.out | 66 ++++++++++++++ src/test/regress/sql/transactions.sql | 51 +++++++++++ 8 files changed, 256 insertions(+), 54 deletions(-) diff --git a/src/backend/access/transam/varsup.c b/src/backend/access/transam/varsup.c index 51749e7c..6fdadcc2 100644 --- a/src/backend/access/transam/varsup.c +++ b/src/backend/access/transam/varsup.c @@ -91,6 +91,11 @@ GetForceXidFromGTM(void) #ifdef __SUPPORT_DISTRIBUTED_TRANSACTION__ static TransactionId local_xid = InvalidTransactionId; +static TransactionId local_subxids[PGPROC_MAX_CACHED_SUBXIDS] = {}; +static int local_nsub; +/* exported information about parallel workers, see xact.c */ +extern int nParallelCurrentXids; +extern TransactionId *ParallelCurrentXids; /* * Set next transaction id to use */ @@ -123,10 +128,10 @@ StoreGlobalXid(const char *globalXid) else if(IsConnFromDatanode()) { - local_xid = GetLocalTransactionId(globalXid); + local_xid = GetLocalTransactionId(globalXid, local_subxids, &local_nsub); if(enable_distri_print) { - elog (LOG, " global xid %s to local xid %d", globalXid, local_xid); + elog (LOG, " global xid %s to local xid %d, %d subxids", globalXid, local_xid, local_nsub); } } @@ -158,21 +163,55 @@ void SetLocalTransactionId(TransactionId xid) } local_xid = xid; + /* if xid is invalid, also need to reset subxid array */ + if (!TransactionIdIsValid(xid)) + { + local_nsub = 0; + } } -TransactionId GetNextTransactionId(void) +TransactionId +GetNextTransactionId(void) { return local_xid; } +int +GetNumSubTransactions(void) +{ + return local_nsub; +} + +TransactionId * +GetSubTransactions(void) +{ + return local_subxids; +} + bool TransactIdIsCurentGlobalTransacId(TransactionId xid) { + int i; + if(enable_distri_print) { elog(LOG, "is current transaction xid %u local xid %d", xid, local_xid); } - return TransactionIdIsValid(local_xid) && TransactionIdEquals(xid, local_xid); + + if (!TransactionIdIsValid(local_xid)) + return false; + + if (TransactionIdEquals(xid, local_xid)) + return true; + + /* check subxids */ + for (i = 0; i < local_nsub; i++) + { + if (TransactionIdEquals(local_subxids[i], xid)) + return true; + } + + return false; } #ifdef __TWO_PHASE_TRANS__ diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index 9632a415..91cd002a 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -6512,7 +6512,14 @@ EstimateTransactionStateSpace(void) * command counter, XID count */ #ifdef __SUPPORT_DISTRIBUTED_TRANSACTION__ + int nsub = GetNumSubTransactions(); nxids++; /* local xid */ + if (nsub > 0) + { + nxids = add_size(nxids, nsub); /* local subxids */ + } + else /* only do for loop below */ + { #endif for (s = CurrentTransactionState; s != NULL; s = s->parent) @@ -6521,6 +6528,9 @@ EstimateTransactionStateSpace(void) nxids = add_size(nxids, 1); nxids = add_size(nxids, s->nChildXids); } +#ifdef __SUPPORT_DISTRIBUTED_TRANSACTION__ + } +#endif nxids = add_size(nxids, nParallelCurrentXids); @@ -6562,6 +6572,7 
@@ SerializeTransactionState(Size maxsize, char *start_address) Size c = 0; TransactionId *workspace; TransactionId *result = (TransactionId *) start_address; + int nsub = 0; result[c++] = (TransactionId) XactIsoLevel; result[c++] = (TransactionId) XactDeferrable; @@ -6588,10 +6599,17 @@ SerializeTransactionState(Size maxsize, char *start_address) return; } + nsub = GetNumSubTransactions(); /* * OK, we need to generate a sorted list of XIDs that our workers should * view as current. First, figure out how many there are. */ + if (nsub > 0) + { + nxids = add_size(nxids, nsub); + } + else + { for (s = CurrentTransactionState; s != NULL; s = s->parent) { if (TransactionIdIsValid(s->transactionId)) @@ -6599,9 +6617,20 @@ SerializeTransactionState(Size maxsize, char *start_address) nxids = add_size(nxids, s->nChildXids); } Assert((c + 1 + nxids) * sizeof(TransactionId) <= maxsize); + } /* Copy them to our scratch space. */ workspace = palloc(nxids * sizeof(TransactionId)); + + if (nsub > 0) + { + TransactionId *subxids = GetSubTransactions(); + memcpy(&workspace[i], subxids, + nsub * sizeof(TransactionId)); + i += nsub; + } + else + { for (s = CurrentTransactionState; s != NULL; s = s->parent) { if (TransactionIdIsValid(s->transactionId)) @@ -6611,6 +6640,7 @@ SerializeTransactionState(Size maxsize, char *start_address) i += s->nChildXids; } Assert(i == nxids); + } /* Sort them. */ qsort(workspace, nxids, sizeof(TransactionId), xidComparator); diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c index 5811f647..b69e928d 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -3088,7 +3088,8 @@ pgxc_node_send_cmd_id(PGXCNodeHandle *handle, CommandId cid) int i32; /* No need to send command ID if its sending flag is not enabled */ - if (!IsSendCommandId()) + /* XXX: parallel worker always send cid */ + if (!IsSendCommandId() && !IsParallelWorker()) { return 0; } diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c index 64481951..35636976 100644 --- a/src/backend/storage/ipc/procarray.c +++ b/src/backend/storage/ipc/procarray.c @@ -1963,7 +1963,7 @@ GetMaxSnapshotSubxidCount(void) } #ifdef __TBASE__ -TransactionId GetLocalTransactionId(const char *globalXid) +TransactionId GetLocalTransactionId(const char *globalXid, TransactionId *subxids, int *nsub) { ProcArrayStruct *arrayP = procArray; @@ -1979,6 +1979,8 @@ TransactionId GetLocalTransactionId(const char *globalXid) int pgprocno = pgprocnos[index]; PGPROC *proc = &allProcs[pgprocno]; volatile PGXACT *pgxact = &allPgXact[pgprocno]; + TransactionId result = InvalidTransactionId; + int nxid; LWLockAcquire(&proc->globalxidLock, LW_SHARED); if (!proc->hasGlobalXid || strcmp(globalXid, proc->globalXid) != 0) @@ -1992,10 +1994,21 @@ TransactionId GetLocalTransactionId(const char *globalXid) continue; } + result = pgxact->xid; + + /* look for max xid in subtrans */ + *nsub = pgxact->nxids; + for (nxid = 0; nxid < pgxact->nxids; nxid++) + { + TransactionId subxid = proc->subxids.xids[nxid]; + subxids[nxid] = subxid; + } + LWLockRelease(&proc->globalxidLock); LWLockRelease(ProcArrayLock); - elog(DEBUG8, "found xid %d for global xid %s", pgxact->xid, globalXid); - return pgxact->xid; + elog(DEBUG8, "found xid %d for global xid %s", result, globalXid); + + return result; } LWLockRelease(ProcArrayLock); diff --git a/src/include/access/transam.h b/src/include/access/transam.h index 0a56dda3..d94c4d26 100644 --- a/src/include/access/transam.h +++ 
b/src/include/access/transam.h @@ -270,6 +270,8 @@ extern TransactionId GetNewTransactionId(bool isSubXact); extern bool TransactIdIsCurentGlobalTransacId(TransactionId xid); extern TransactionId GetNextTransactionId(void); extern void ExtendLogs(TransactionId xid); +extern int GetNumSubTransactions(void); +extern TransactionId *GetSubTransactions(void); #endif extern TransactionId ReadNewTransactionId(void); extern void SetTransactionIdLimit(TransactionId oldest_datfrozenxid, diff --git a/src/include/storage/procarray.h b/src/include/storage/procarray.h index 24f45705..d6607bcf 100644 --- a/src/include/storage/procarray.h +++ b/src/include/storage/procarray.h @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * * procarray.h - * POSTGRES process array definitions. + * POSTGRES process array definitions. * * * Portions Copyright (c) 2012-2014, TransLattice, Inc. @@ -26,8 +26,8 @@ extern int GlobalSnapshotSource; typedef enum GlobalSnapshotSourceType { - GLOBAL_SNAPSHOT_SOURCE_GTM, - GLOBAL_SNAPSHOT_SOURCE_COORDINATOR + GLOBAL_SNAPSHOT_SOURCE_GTM, + GLOBAL_SNAPSHOT_SOURCE_COORDINATOR } GlobalSnapshotSourceType; #endif @@ -38,33 +38,33 @@ typedef enum GlobalSnapshotSourceType * to avoid forcing to include proc.h when including procarray.h. So if you modify * PROC_XXX flags, you need to modify these flags. */ -#define PROCARRAY_VACUUM_FLAG 0x02 /* currently running lazy - * vacuum */ -#define PROCARRAY_ANALYZE_FLAG 0x04 /* currently running - * analyze */ -#define PROCARRAY_LOGICAL_DECODING_FLAG 0x10 /* currently doing logical - * decoding outside xact */ - -#define PROCARRAY_SLOTS_XMIN 0x20 /* replication slot xmin, - * catalog_xmin */ +#define PROCARRAY_VACUUM_FLAG 0x02 /* currently running lazy + * vacuum */ +#define PROCARRAY_ANALYZE_FLAG 0x04 /* currently running + * analyze */ +#define PROCARRAY_LOGICAL_DECODING_FLAG 0x10 /* currently doing logical + * decoding outside xact */ + +#define PROCARRAY_SLOTS_XMIN 0x20 /* replication slot xmin, + * catalog_xmin */ /* * Only flags in PROCARRAY_PROC_FLAGS_MASK are considered when matching * PGXACT->vacuumFlags. Other flags are used for different purposes and * have no corresponding PROC flag equivalent. 
*/ -#define PROCARRAY_PROC_FLAGS_MASK (PROCARRAY_VACUUM_FLAG | \ - PROCARRAY_ANALYZE_FLAG | \ - PROCARRAY_LOGICAL_DECODING_FLAG) +#define PROCARRAY_PROC_FLAGS_MASK (PROCARRAY_VACUUM_FLAG | \ + PROCARRAY_ANALYZE_FLAG | \ + PROCARRAY_LOGICAL_DECODING_FLAG) /* Use the following flags as an input "flags" to GetOldestXmin function */ /* Consider all backends except for logical decoding ones which manage xmin separately */ -#define PROCARRAY_FLAGS_DEFAULT PROCARRAY_LOGICAL_DECODING_FLAG +#define PROCARRAY_FLAGS_DEFAULT PROCARRAY_LOGICAL_DECODING_FLAG /* Ignore vacuum backends */ -#define PROCARRAY_FLAGS_VACUUM PROCARRAY_FLAGS_DEFAULT | PROCARRAY_VACUUM_FLAG +#define PROCARRAY_FLAGS_VACUUM PROCARRAY_FLAGS_DEFAULT | PROCARRAY_VACUUM_FLAG /* Ignore analyze backends */ -#define PROCARRAY_FLAGS_ANALYZE PROCARRAY_FLAGS_DEFAULT | PROCARRAY_ANALYZE_FLAG +#define PROCARRAY_FLAGS_ANALYZE PROCARRAY_FLAGS_DEFAULT | PROCARRAY_ANALYZE_FLAG /* Ignore both vacuum and analyze backends */ -#define PROCARRAY_FLAGS_VACUUM_ANALYZE PROCARRAY_FLAGS_DEFAULT | PROCARRAY_VACUUM_FLAG | PROCARRAY_ANALYZE_FLAG +#define PROCARRAY_FLAGS_VACUUM_ANALYZE PROCARRAY_FLAGS_DEFAULT | PROCARRAY_VACUUM_FLAG | PROCARRAY_ANALYZE_FLAG extern Size ProcArrayShmemSize(void); extern void CreateSharedProcArray(void); @@ -77,17 +77,17 @@ extern void ProcArrayClearTransaction(PGPROC *proc); #ifdef PGXC /* PGXC_DATANODE */ typedef enum { - SNAPSHOT_UNDEFINED, /* Coordinator has not sent snapshot or not yet connected */ - SNAPSHOT_LOCAL, /* Coordinator has instructed Datanode to build up snapshot from the local procarray */ - SNAPSHOT_COORDINATOR, /* Coordinator has sent snapshot data */ - SNAPSHOT_DIRECT /* Datanode obtained directly from GTM */ + SNAPSHOT_UNDEFINED, /* Coordinator has not sent snapshot or not yet connected */ + SNAPSHOT_LOCAL, /* Coordinator has instructed Datanode to build up snapshot from the local procarray */ + SNAPSHOT_COORDINATOR, /* Coordinator has sent snapshot data */ + SNAPSHOT_DIRECT /* Datanode obtained directly from GTM */ } SnapshotSource; extern void SetGlobalTimestamp(GlobalTimestamp gts, SnapshotSource source); #if 0 extern void SetGlobalSnapshotData(TransactionId xmin, TransactionId xmax, int xcnt, - TransactionId *xip, - SnapshotSource source); + TransactionId *xip, + SnapshotSource source); #endif extern void UnsetGlobalSnapshotData(void); extern void ReloadConnInfoOnBackends(bool refresh_only); @@ -95,23 +95,23 @@ extern void ReloadConnInfoOnBackends(bool refresh_only); extern void ProcArrayInitRecovery(TransactionId initializedUptoXID); extern void ProcArrayApplyRecoveryInfo(RunningTransactions running); extern void ProcArrayApplyXidAssignment(TransactionId topxid, - int nsubxids, TransactionId *subxids); + int nsubxids, TransactionId *subxids); extern void RecordKnownAssignedTransactionIds(TransactionId xid); extern void ExpireTreeKnownAssignedTransactionIds(TransactionId xid, - int nsubxids, TransactionId *subxids, - TransactionId max_xid); + int nsubxids, TransactionId *subxids, + TransactionId max_xid); extern void ExpireAllKnownAssignedTransactionIds(void); extern void ExpireOldKnownAssignedTransactionIds(TransactionId xid); -extern int GetMaxSnapshotXidCount(void); -extern int GetMaxSnapshotSubxidCount(void); +extern int GetMaxSnapshotXidCount(void); +extern int GetMaxSnapshotSubxidCount(void); #define GetSnapshotData(snapshot, latest) GetSnapshotData_shard(snapshot, latest, true) extern Snapshot GetSnapshotData_shard(Snapshot snapshot, bool latest, bool need_shardmap); extern bool 
ProcArrayInstallImportedXmin(TransactionId xmin, - VirtualTransactionId *sourcevxid); + VirtualTransactionId *sourcevxid); extern bool ProcArrayInstallRestoredXmin(TransactionId xmin, PGPROC *proc); extern void ProcArrayCheckXminConsistency(TransactionId global_xmin); extern void SetLatestCompletedXid(TransactionId latestCompletedXid); @@ -123,13 +123,13 @@ extern bool TransactionIdIsInProgress(TransactionId xid); extern bool TransactionIdIsPrepared(TransactionId xid, Snapshot snapshot, GlobalTimestamp *prepare_ts); #endif #ifdef __TBASE__ -extern TransactionId GetLocalTransactionId(const char *globalXid); +extern TransactionId GetLocalTransactionId(const char *globalXid, TransactionId *subxids, int *nsub); #endif extern char *GetGlobalTransactionId(const TransactionId pid); extern bool TransactionIdIsActive(TransactionId xid); extern TransactionId GetOldestXmin(Relation rel, int flags); extern TransactionId GetOldestXminInternal(Relation rel, int flags, - bool computeLocal, TransactionId lastGlobalXmin); + bool computeLocal, TransactionId lastGlobalXmin); extern TransactionId GetOldestActiveTransactionId(void); extern TransactionId GetOldestSafeDecodingTransactionId(bool catalogOnly); @@ -138,38 +138,38 @@ extern bool HaveVirtualXIDsDelayingChkpt(VirtualTransactionId *vxids, int nvxids extern PGPROC *BackendPidGetProc(int pid); extern PGPROC *BackendPidGetProcWithLock(int pid); -extern int BackendXidGetPid(TransactionId xid); +extern int BackendXidGetPid(TransactionId xid); extern bool IsBackendPid(int pid); extern VirtualTransactionId *GetCurrentVirtualXIDs(TransactionId limitXmin, - bool excludeXmin0, bool allDbs, int excludeVacuum, - int *nvxids); + bool excludeXmin0, bool allDbs, int excludeVacuum, + int *nvxids); extern VirtualTransactionId *GetConflictingVirtualXIDs(TransactionId limitXmin, Oid dbOid); extern pid_t CancelVirtualTransaction(VirtualTransactionId vxid, ProcSignalReason sigmode); extern bool MinimumActiveBackends(int min); -extern int CountDBBackends(Oid databaseid); -extern int CountDBConnections(Oid databaseid); +extern int CountDBBackends(Oid databaseid); +extern int CountDBConnections(Oid databaseid); extern void CancelDBBackends(Oid databaseid, ProcSignalReason sigmode, bool conflictPending); -extern int CountUserBackends(Oid roleid); +extern int CountUserBackends(Oid roleid); extern bool CountOtherDBBackends(Oid databaseId, - int *nbackends, int *nprepared); + int *nbackends, int *nprepared); extern void XidCacheRemoveRunningXids(TransactionId xid, - int nxids, const TransactionId *xids, - TransactionId latestXid); + int nxids, const TransactionId *xids, + TransactionId latestXid); #ifdef XCP extern void GetGlobalSessionInfo(int pid, Oid *coordId, int *coordPid); -extern int GetFirstBackendId(int *numBackends, int *backends); +extern int GetFirstBackendId(int *numBackends, int *backends); #endif /* XCP */ extern void ProcArraySetReplicationSlotXmin(TransactionId xmin, - TransactionId catalog_xmin, bool already_locked); + TransactionId catalog_xmin, bool already_locked); extern void ProcArrayGetReplicationSlotXmin(TransactionId *xmin, - TransactionId *catalog_xmin); + TransactionId *catalog_xmin); #ifdef __TBASE__ extern RunningTransactions GetCurrentRunningTransaction(void); extern GlobalTimestamp GetLatestCommitTS(void); #endif -#endif /* PROCARRAY_H */ +#endif /* PROCARRAY_H */ diff --git a/src/test/regress/expected/transactions_2.out b/src/test/regress/expected/transactions_2.out index e121bf87..30a34e63 100644 --- a/src/test/regress/expected/transactions_2.out 
+++ b/src/test/regress/expected/transactions_2.out @@ -676,6 +676,72 @@ ERROR: portal "ctt" cannot be run COMMIT; DROP FUNCTION create_temp_tab(); DROP FUNCTION invert(x float8); +-- Test for distributed subtrans in secondary DNs +begin; +savepoint s; +-- create tables in subtransaction +create table t1_trans(f1 int,f2 int); +create table t2_trans(f1 int,f2 int); +create table t3_trans(f1 int,f2 int); +insert into t1_trans values(1,1),(2,2); +insert into t3_trans select * from t1_trans; +insert into t2_trans(f2) select count(1) from t3_trans; +select * from t2_trans; + f1 | f2 +----+---- + | 2 +(1 row) + +abort; +begin; +-- create tables in parent transaction +create table t1_trans(f1 int,f2 int); +create table t2_trans(f1 int,f2 int); +create table t3_trans(f1 int,f2 int); +savepoint s; +insert into t1_trans values(1,1),(2,2); +insert into t3_trans select * from t1_trans; +insert into t2_trans(f2) select count(1) from t3_trans; +select * from t2_trans; + f1 | f2 +----+---- + | 2 +(1 row) + +abort; +-- create tables out of transaction +create table t1_trans(f1 int,f2 int); +create table t2_trans(f1 int,f2 int); +create table t3_trans(f1 int,f2 int); +insert into t1_trans values(1,1),(2,2); +begin; +savepoint s; +insert into t3_trans select * from t1_trans; +insert into t2_trans(f2) select count(1) from t3_trans; +select * from t2_trans; + f1 | f2 +----+---- + | 2 +(1 row) + +abort; +-- test for subtrans in parallel worker +begin; +savepoint s; +set parallel_setup_cost=0; +set parallel_tuple_cost=0; +set min_parallel_table_scan_size=0; +set min_parallel_rows_size=0; +set max_parallel_workers_per_gather=2; +insert into t3_trans select * from t1_trans; +select count(*) from t3_trans join t1_trans using (f2); + count +------- + 2 +(1 row) + +abort; +drop table t1_trans, t2_trans, t3_trans; -- Test for successful cleanup of an aborted transaction at session exit. -- THIS MUST BE THE LAST TEST IN THIS FILE. 
begin; diff --git a/src/test/regress/sql/transactions.sql b/src/test/regress/sql/transactions.sql index e8c1b3c9..80f235e0 100644 --- a/src/test/regress/sql/transactions.sql +++ b/src/test/regress/sql/transactions.sql @@ -459,6 +459,57 @@ COMMIT; DROP FUNCTION create_temp_tab(); DROP FUNCTION invert(x float8); +-- Test for distributed subtrans in secondary DNs +begin; +savepoint s; +-- create tables in subtransaction +create table t1_trans(f1 int,f2 int); +create table t2_trans(f1 int,f2 int); +create table t3_trans(f1 int,f2 int); +insert into t1_trans values(1,1),(2,2); +insert into t3_trans select * from t1_trans; +insert into t2_trans(f2) select count(1) from t3_trans; +select * from t2_trans; +abort; + +begin; +-- create tables in parent transaction +create table t1_trans(f1 int,f2 int); +create table t2_trans(f1 int,f2 int); +create table t3_trans(f1 int,f2 int); +savepoint s; +insert into t1_trans values(1,1),(2,2); +insert into t3_trans select * from t1_trans; +insert into t2_trans(f2) select count(1) from t3_trans; +select * from t2_trans; +abort; + +-- create tables out of transaction +create table t1_trans(f1 int,f2 int); +create table t2_trans(f1 int,f2 int); +create table t3_trans(f1 int,f2 int); +insert into t1_trans values(1,1),(2,2); +begin; +savepoint s; +insert into t3_trans select * from t1_trans; +insert into t2_trans(f2) select count(1) from t3_trans; +select * from t2_trans; +abort; + +-- test for subtrans in parallel worker +begin; +savepoint s; +set parallel_setup_cost=0; +set parallel_tuple_cost=0; +set min_parallel_table_scan_size=0; +set min_parallel_rows_size=0; +set max_parallel_workers_per_gather=2; +insert into t3_trans select * from t1_trans; +select count(*) from t3_trans join t1_trans using (f2); +abort; + +drop table t1_trans, t2_trans, t3_trans; + -- Test for successful cleanup of an aborted transaction at session exit. -- THIS MUST BE THE LAST TEST IN THIS FILE. 
From a85cb1e840a9135ec20f8dfd7ea4513b680e83ec Mon Sep 17 00:00:00 2001 From: sigmalin Date: Tue, 27 Jul 2021 10:21:53 +0800 Subject: [PATCH 314/578] fix launched parallel workers > expected http://tapd.oa.com/TBase_Oracle_Migration/bugtrace/bugs/view/1020421696086983875 (merge request !534) --- src/backend/optimizer/plan/createplan.c | 199 +++++++++++++++++++++++- src/backend/optimizer/plan/planner.c | 1 + src/include/optimizer/planmain.h | 1 + 3 files changed, 200 insertions(+), 1 deletion(-) diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index 13c8ec79..7a00bef7 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -107,6 +107,7 @@ bool mergejoin = false; bool child_of_gather = false; bool enable_group_across_query = false; bool enable_distributed_unique_plan = false; +int min_workers_of_hashjon_gather = PG_INT32_MAX; #endif #ifdef __COLD_HOT__ bool has_cold_hot_table = false; @@ -345,6 +346,7 @@ static int add_sort_column(AttrNumber colIdx, Oid sortOp, Oid coll, static double GetPlanRows(Plan *plan); static bool set_plan_parallel(Plan *plan); static void set_plan_nonparallel(Plan *plan); +static bool contain_hashjon_walker(Plan *node); #endif static RemoteSubplan *find_push_down_plan(Plan *plan, bool force); @@ -6472,7 +6474,7 @@ make_remotesubplan(PlannerInfo *root, heap_parallel_workers = Min(heap_parallel_workers, max_parallel_workers_per_gather); - gather->num_workers = Max(heap_parallel_workers, nWorkers); + gather->num_workers = Min(Max(heap_parallel_workers, nWorkers), min_workers_of_hashjon_gather); } else { @@ -6711,6 +6713,8 @@ make_remotesubplan(PlannerInfo *root, parallel_workers = heap_parallel_workers; parallel_workers = Min(parallel_workers, max_parallel_workers_per_gather); + /* launched parallel workers must less than hashjoin's parallel workers under it */ + parallel_workers = Min(parallel_workers, min_workers_of_hashjon_gather); gather_plan = make_gather(copyObject(gather_left->targetlist), NIL, @@ -6747,6 +6751,7 @@ make_remotesubplan(PlannerInfo *root, } } } + min_workers_of_hashjon_gather = PG_INT32_MAX; #endif if (resultDistribution) @@ -8263,6 +8268,15 @@ make_gather(List *qptlist, node->single_copy = single_copy; node->invisible = false; +#ifdef __TBASE__ + /* + * if there has hashjoin in the lower layer, write down the smallest workers + */ + if (min_workers_of_hashjon_gather > nworkers && contain_hashjon_walker(subplan)) + { + min_workers_of_hashjon_gather = nworkers; + } +#endif return node; } @@ -8928,6 +8942,189 @@ contain_remote_subplan_walker(Node *node, void *context, bool include_cte) return false; } +/* + * check if contain hashjon in the plan + */ +static bool +contain_hashjon_walker(Plan *node) +{ + Plan *plan = node; + + if (!plan) + { + return false; + } + + if (IsA(node, RemoteSubplan) || IsA(node, RemoteQuery) || IsA(plan, Gather)) + { + return false; + } + + if (IsA(node, HashJoin)) + { + return true; + } + + if (IsA(node, SubqueryScan)) + { + SubqueryScan *subquery = (SubqueryScan *)node; + plan = subquery->subplan; + } + + if (IsA(plan, Append)) + { + ListCell *lc; + Append *append = (Append *)plan; + + foreach(lc, append->appendplans) + { + Plan *appendplan = (Plan *)lfirst(lc); + + if (appendplan && contain_hashjon_walker(appendplan)) + { + return true; + } + } + + return false; + } + else if (IsA(plan, MergeAppend)) + { + ListCell *lc; + MergeAppend *mergeappend = (MergeAppend *)plan; + + foreach(lc, mergeappend->mergeplans) + { + Plan 
*mergeappendplan = (Plan *)lfirst(lc); + + if (mergeappendplan && contain_hashjon_walker(mergeappendplan)) + { + return true; + } + } + + return false; + } + + if (outerPlan(plan)) + { + if (contain_hashjon_walker(outerPlan(plan))) + { + return true; + } + } + + if (innerPlan(plan)) + { + if (contain_hashjon_walker(innerPlan(plan))) + { + return true; + } + } + + return false; +} + + +static Plan* +materialize_top_remote_subplan(Plan *node) +{ + Node *plan = (Node *)node; + + if (!plan) + { + return NULL; + } + + if (IsA(node, Material)) + { + return node; + } + + if (IsA(node, RemoteSubplan)) + { + Plan *matplan = (Plan *) make_material(node); + + /* + * We assume the materialize will not spill to disk, and therefore + * charge just cpu_operator_cost per tuple. (Keep this estimate in + * sync with cost_mergejoin.) + */ + copy_plan_costsize(matplan, node); + matplan->total_cost += cpu_operator_cost * matplan->plan_rows; + + return matplan; + } + + if (IsA(node, SubqueryScan)) + { + SubqueryScan *subquery = (SubqueryScan *)node; + plan = (Node *)subquery->subplan; + } + + if (IsA(plan, Append)) + { + ListCell *lc; + Append *append = (Append *)plan; + + foreach(lc, append->appendplans) + { + Plan *appendplan = (Plan *)lfirst(lc); + + if (appendplan) + { + Plan *tmpplan = materialize_top_remote_subplan(appendplan); + if (tmpplan && tmpplan != lfirst(lc)) + { + lfirst(lc) = tmpplan; + } + } + } + + return node; + } + else if (IsA(plan, MergeAppend)) + { + ListCell *lc; + MergeAppend *mergeappend = (MergeAppend *)plan; + + foreach(lc, mergeappend->mergeplans) + { + Plan *mergeappendplan = (Plan *)lfirst(lc); + + if (mergeappendplan) + { + Plan *tmpplan = materialize_top_remote_subplan(mergeappendplan); + if (tmpplan && tmpplan != lfirst(lc)) + { + lfirst(lc) = tmpplan; + } + } + } + + return node; + } + + if (outerPlan(plan)) + { + Plan *tmpplan = materialize_top_remote_subplan(outerPlan(plan)); + if (tmpplan && tmpplan != outerPlan(plan)) + { + outerPlan(plan) = tmpplan; + } + } + + if (innerPlan(plan)) + { + Plan *tmpplan = materialize_top_remote_subplan(innerPlan(plan)); + if (tmpplan && tmpplan != innerPlan(plan)) + { + innerPlan(plan) = tmpplan; + } + } + return node; +} + static void create_remotequery_for_rel(PlannerInfo *root, ModifyTable *mt, RangeTblEntry *res_rel, Index resultRelationIndex, int relcount, CmdType cmdtyp, RelationAccessType accessType, int partindex, diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index fc87fcbc..62a92b5f 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -324,6 +324,7 @@ standard_planner(Query *parse, int cursorOptions, ParamListInfo boundParams) glob->dependsOnRole = false; #ifdef __TBASE__ groupOids = NULL; + min_workers_of_hashjon_gather = PG_INT32_MAX; #endif #ifdef __COLD_HOT__ has_cold_hot_table = false; diff --git a/src/include/optimizer/planmain.h b/src/include/optimizer/planmain.h index 8139e134..4e32be80 100644 --- a/src/include/optimizer/planmain.h +++ b/src/include/optimizer/planmain.h @@ -99,6 +99,7 @@ extern int remote_subplan_depth; extern List *groupOids; extern bool enable_distributed_unique_plan; extern bool has_cold_hot_table; +extern int min_workers_of_hashjon_gather; #define INSERT_TRIGGER "tt_dn_in_" #define UPDATE_TRIGGER "tt_dn_up_" From fee71a4c5a65220e0b19d1aff2372740942d87cd Mon Sep 17 00:00:00 2001 From: sigmalin Date: Wed, 18 Aug 2021 16:19:57 +0800 Subject: [PATCH 315/578] fix latch already owned in parallel mode 
http://tapd.oa.com/TBase_Oracle_Migration/bugtrace/bugs/view?bug_id=1020421696090962107# http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131086984137 (merge request !604) --- src/backend/optimizer/plan/createplan.c | 67 ++++++++++++++++++++----- 1 file changed, 55 insertions(+), 12 deletions(-) diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index 7a00bef7..9444951b 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -346,7 +346,7 @@ static int add_sort_column(AttrNumber colIdx, Oid sortOp, Oid coll, static double GetPlanRows(Plan *plan); static bool set_plan_parallel(Plan *plan); static void set_plan_nonparallel(Plan *plan); -static bool contain_hashjon_walker(Plan *node); +static bool contain_node_walker(Plan *node, NodeTag type, bool search_nonparallel); #endif static RemoteSubplan *find_push_down_plan(Plan *plan, bool force); @@ -1961,6 +1961,7 @@ create_gather_plan(PlannerInfo *root, GatherPath *best_path) Plan *subplan; List *tlist; bool reset = false; + bool contain_nonparallel_hashjoin = false; /* if child_of_gather is false, set child_of_gather true, and reset the value before return */ if (!child_of_gather) @@ -1977,9 +1978,12 @@ create_gather_plan(PlannerInfo *root, GatherPath *best_path) tlist = build_path_tlist(root, &best_path->path); + /* if contain nonparallel hashjoin, set num_workers to 1 */ + contain_nonparallel_hashjoin = contain_node_walker(subplan, T_HashJoin, true); + gather_plan = make_gather(tlist, NIL, - best_path->num_workers, + (contain_nonparallel_hashjoin) ? 1 : best_path->num_workers, best_path->single_copy, subplan); @@ -6458,10 +6462,19 @@ make_remotesubplan(PlannerInfo *root, Gather *gather = (Gather *)lefttree; int nWorkers = gather->num_workers; Plan *leftplan = lefttree->lefttree; + /* if contain nonparallel hashjoin, set num_workers to 1 */ + bool contain_nonparallel_hashjoin = contain_node_walker(leftplan, T_HashJoin, true); + if (contain_nonparallel_hashjoin) + { + gather->num_workers = 1; + } + else + { /* rows estimate is cut down to per data nodes, set it to all nodes for parallel estimate. */ double rows = GetPlanRows(leftplan) * nodes; int heap_parallel_threshold = 0; int heap_parallel_workers = 1; + bool contain_gather = contain_node_walker(leftplan, T_Gather, false); heap_parallel_threshold = Max(min_parallel_rows_size, 1); while (rows >= (heap_parallel_threshold * 3)) @@ -6473,8 +6486,10 @@ make_remotesubplan(PlannerInfo *root, } heap_parallel_workers = Min(heap_parallel_workers, max_parallel_workers_per_gather); - - gather->num_workers = Min(Max(heap_parallel_workers, nWorkers), min_workers_of_hashjon_gather); + heap_parallel_workers = Max(heap_parallel_workers, nWorkers); + /* if contain gather, need compare the workers with min_workers_of_hashjon_gather */ + gather->num_workers = (contain_gather) ? Min(heap_parallel_workers, min_workers_of_hashjon_gather) : heap_parallel_workers; + } } else { @@ -6485,9 +6500,16 @@ make_remotesubplan(PlannerInfo *root, double inner_rows = lefttree->righttree ? lefttree->righttree->plan_rows : 0; double rows = outer_rows > inner_rows ? 
outer_rows : inner_rows; + bool contain_nonparallel_hashjoin = contain_node_walker(lefttree, T_HashJoin, true); bool need_parallel = true; int parallel_workers = 0; + /* if contain nonparallel hashjoin, don't add gather plan */ + if (contain_nonparallel_hashjoin) + { + need_parallel = false; + } + /* only add gather to remote_subplan at top */ if (need_parallel && distributionType == LOCATOR_TYPE_NONE) { @@ -8272,7 +8294,7 @@ make_gather(List *qptlist, /* * if there has hashjoin in the lower layer, write down the smallest workers */ - if (min_workers_of_hashjon_gather > nworkers && contain_hashjon_walker(subplan)) + if (min_workers_of_hashjon_gather > nworkers && contain_node_walker(subplan, T_HashJoin, false)) { min_workers_of_hashjon_gather = nworkers; } @@ -8943,10 +8965,12 @@ contain_remote_subplan_walker(Node *node, void *context, bool include_cte) } /* - * check if contain hashjon in the plan + * check if contain the type node in the plan, only support + * T_HashJoin and T_Gather now + * search_nonparallel only work if type is T_HashJoin */ static bool -contain_hashjon_walker(Plan *node) +contain_node_walker(Plan *node, NodeTag type, bool search_nonparallel) { Plan *plan = node; @@ -8955,15 +8979,34 @@ contain_hashjon_walker(Plan *node) return false; } - if (IsA(node, RemoteSubplan) || IsA(node, RemoteQuery) || IsA(plan, Gather)) + if (IsA(node, RemoteSubplan) || IsA(node, RemoteQuery)) { return false; } + if (type == T_HashJoin) + { if (IsA(node, HashJoin)) { + if (search_nonparallel) + { + /* return if contain non parallel hashjoin */ + HashJoin *join_plan = (HashJoin *) node; + return !join_plan->join.plan.parallel_aware; + } + else + { return true; } + } + } + else if (type == T_Gather) + { + if (IsA(node, Gather)) + { + return true; + } + } if (IsA(node, SubqueryScan)) { @@ -8980,7 +9023,7 @@ contain_hashjon_walker(Plan *node) { Plan *appendplan = (Plan *)lfirst(lc); - if (appendplan && contain_hashjon_walker(appendplan)) + if (appendplan && contain_node_walker(appendplan, type, search_nonparallel)) { return true; } @@ -8997,7 +9040,7 @@ contain_hashjon_walker(Plan *node) { Plan *mergeappendplan = (Plan *)lfirst(lc); - if (mergeappendplan && contain_hashjon_walker(mergeappendplan)) + if (mergeappendplan && contain_node_walker(mergeappendplan, type, search_nonparallel)) { return true; } @@ -9008,7 +9051,7 @@ contain_hashjon_walker(Plan *node) if (outerPlan(plan)) { - if (contain_hashjon_walker(outerPlan(plan))) + if (contain_node_walker(outerPlan(plan), type, search_nonparallel)) { return true; } @@ -9016,7 +9059,7 @@ contain_hashjon_walker(Plan *node) if (innerPlan(plan)) { - if (contain_hashjon_walker(innerPlan(plan))) + if (contain_node_walker(innerPlan(plan), type, search_nonparallel)) { return true; } From 1fe35058c9c28b94b142ecf4476de7e7c02d4a96 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Mon, 26 Jul 2021 15:48:54 +0800 Subject: [PATCH 316/578] fix nestloop error failed to found slot for consumer http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131088591001 --- src/backend/optimizer/plan/createplan.c | 7 ++++-- src/test/regress/expected/gist_1.out | 4 ++-- src/test/regress/expected/join_3.out | 24 +++++++++---------- src/test/regress/expected/partition_prune.out | 8 +++---- src/test/regress/expected/rowsecurity_1.out | 12 ++++------ src/test/regress/expected/subselect.out | 4 +--- 6 files changed, 28 insertions(+), 31 deletions(-) diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index 9444951b..42ffb26e 100644 --- 
a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -346,8 +346,8 @@ static int add_sort_column(AttrNumber colIdx, Oid sortOp, Oid coll, static double GetPlanRows(Plan *plan); static bool set_plan_parallel(Plan *plan); static void set_plan_nonparallel(Plan *plan); +static Plan *materialize_top_remote_subplan(Plan *node); static bool contain_node_walker(Plan *node, NodeTag type, bool search_nonparallel); - #endif static RemoteSubplan *find_push_down_plan(Plan *plan, bool force); @@ -4806,9 +4806,11 @@ create_nestloop_plan(PlannerInfo *root, */ #ifdef __TBASE__ if (!IsA(inner_plan, Material) && contain_remote_subplan_walker((Node*)inner_plan, NULL, true)) + { + inner_plan = materialize_top_remote_subplan(inner_plan); + } #else if (IsA(inner_plan, RemoteSubplan)) -#endif { Plan *matplan = (Plan *) make_material(inner_plan); @@ -4822,6 +4824,7 @@ create_nestloop_plan(PlannerInfo *root, inner_plan = matplan; } +#endif #endif join_plan = make_nestloop(tlist, diff --git a/src/test/regress/expected/gist_1.out b/src/test/regress/expected/gist_1.out index 0653fb98..9c135e9a 100644 --- a/src/test/regress/expected/gist_1.out +++ b/src/test/regress/expected/gist_1.out @@ -131,8 +131,8 @@ cross join lateral Nested Loop -> Remote Subquery Scan on all (datanode_1) -> Values Scan on "*VALUES*" - -> Materialize - -> Limit + -> Limit + -> Materialize -> Remote Subquery Scan on all (datanode_1) -> Limit -> Index Only Scan using gist_tbl_point_index on gist_tbl diff --git a/src/test/regress/expected/join_3.out b/src/test/regress/expected/join_3.out index 4b1d3032..7a70d26a 100644 --- a/src/test/regress/expected/join_3.out +++ b/src/test/regress/expected/join_3.out @@ -3844,10 +3844,10 @@ where t1.f1 = ss.f1; -> Seq Scan on public.int8_tbl i8 Output: i8.q1, i8.q2 Filter: (i8.q2 = 123) - -> Materialize - Output: (i8.q1), t2.f1 -> Limit Output: (i8.q1), t2.f1 + -> Materialize + Output: (i8.q1), t2.f1 -> Remote Subquery Scan on all (datanode_1,datanode_2) Output: i8.q1, t2.f1 -> Limit @@ -3876,7 +3876,7 @@ select * from lateral (select ss1.* from text_tbl t3 limit 1) as ss2 where t1.f1 = ss2.f1; QUERY PLAN ------------------------------------------------------------------------------------ +----------------------------------------------------------------------------- Nested Loop Output: t1.f1, i8.q1, i8.q2, (i8.q1), t2.f1, ((i8.q1)), (t2.f1) Join Filter: (t1.f1 = (t2.f1)) @@ -3889,22 +3889,22 @@ where t1.f1 = ss2.f1; -> Seq Scan on public.int8_tbl i8 Output: i8.q1, i8.q2 Filter: (i8.q2 = 123) - -> Materialize - Output: (i8.q1), t2.f1, ((i8.q1)), (t2.f1) -> Nested Loop Output: (i8.q1), t2.f1, ((i8.q1)), (t2.f1) -> Limit Output: (i8.q1), t2.f1 + -> Materialize + Output: (i8.q1), t2.f1 -> Remote Subquery Scan on all (datanode_1,datanode_2) Output: i8.q1, t2.f1 -> Limit Output: (i8.q1), t2.f1 -> Seq Scan on public.text_tbl t2 Output: i8.q1, t2.f1 - -> Materialize - Output: ((i8.q1)), (t2.f1) -> Limit Output: ((i8.q1)), (t2.f1) + -> Materialize + Output: ((i8.q1)), (t2.f1) -> Remote Subquery Scan on all (datanode_1,datanode_2) Output: (i8.q1), t2.f1 -> Limit @@ -3962,13 +3962,13 @@ where tt1.f1 = ss1.c0; -> Seq Scan on public.text_tbl tt4 Output: tt4.f1 Filter: (tt4.f1 = 'foo'::text) - -> Materialize - Output: ss1.c0 -> Subquery Scan on ss1 Output: ss1.c0 Filter: (ss1.c0 = 'foo'::text) -> Limit Output: (tt4.f1) + -> Materialize + Output: (tt4.f1) -> Remote Subquery Scan on all (datanode_1,datanode_2) Output: tt4.f1 -> Limit @@ -4026,10 +4026,10 @@ where ss1.c2 = 0; Output: i42.f1 
-> Seq Scan on public.int4_tbl i42 Output: i42.f1 - -> Materialize - Output: (i41.f1), (i8.q1), (i8.q2), (i42.f1), (i43.f1), ((42)) -> Limit Output: (i41.f1), (i8.q1), (i8.q2), (i42.f1), (i43.f1), ((42)) + -> Materialize + Output: (i41.f1), (i8.q1), (i8.q2), (i42.f1), (i43.f1), ((42)) -> Remote Subquery Scan on all (datanode_1,datanode_2) Output: i41.f1, i8.q1, i8.q2, i42.f1, i43.f1, (42) -> Limit @@ -4065,9 +4065,9 @@ select * from Nested Loop Left Join Join Filter: ((1) = COALESCE((1))) -> Result - -> Materialize -> Hash Full Join Hash Cond: (a1.unique1 = (1)) + -> Materialize -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Seq Scan on tenk1 a1 -> Hash diff --git a/src/test/regress/expected/partition_prune.out b/src/test/regress/expected/partition_prune.out index 61bbdf23..2b2b13b9 100644 --- a/src/test/regress/expected/partition_prune.out +++ b/src/test/regress/expected/partition_prune.out @@ -1410,8 +1410,8 @@ explain (costs off) select * from mc2p t1, lateral (select count(*) from mc3p t2 Filter: (a = 1) -> Seq Scan on mc2p_default t1_2 Filter: (a = 1) - -> Materialize - -> Finalize Aggregate + -> Finalize Aggregate + -> Materialize -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Partial Aggregate -> Append @@ -1449,8 +1449,8 @@ explain (costs off) select * from mc2p t1, lateral (select count(*) from mc3p t2 Filter: (a = 1) -> Seq Scan on mc2p_default t1_2 Filter: (a = 1) - -> Materialize - -> Finalize Aggregate + -> Finalize Aggregate + -> Materialize -> Remote Subquery Scan on all (datanode_1) -> Partial Aggregate -> Append diff --git a/src/test/regress/expected/rowsecurity_1.out b/src/test/regress/expected/rowsecurity_1.out index 7ea346ae..1e0441a4 100644 --- a/src/test/regress/expected/rowsecurity_1.out +++ b/src/test/regress/expected/rowsecurity_1.out @@ -2074,9 +2074,8 @@ EXPLAIN (COSTS OFF) EXECUTE plancache_test3; -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Seq Scan on z1 Filter: (((a % 2) = 0) AND f_leak(b)) - -> Materialize -> CTE Scan on q -(9 rows) +(8 rows) SET ROLE regress_rls_group1; SELECT * FROM z1 WHERE f_leak(b) order by 1; @@ -2126,9 +2125,8 @@ EXPLAIN (COSTS OFF) EXECUTE plancache_test3; -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Seq Scan on z1 Filter: (((a % 2) = 0) AND f_leak(b)) - -> Materialize -> CTE Scan on q -(9 rows) +(8 rows) SET SESSION AUTHORIZATION regress_rls_carol; SELECT * FROM z1 WHERE f_leak(b) order by 1; @@ -2178,9 +2176,8 @@ EXPLAIN (COSTS OFF) EXECUTE plancache_test3; -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Seq Scan on z1 Filter: (((a % 2) = 1) AND f_leak(b)) - -> Materialize -> CTE Scan on q -(9 rows) +(8 rows) SET ROLE regress_rls_group2; SELECT * FROM z1 WHERE f_leak(b) order by 1; @@ -2230,9 +2227,8 @@ EXPLAIN (COSTS OFF) EXECUTE plancache_test3; -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Seq Scan on z1 Filter: (((a % 2) = 1) AND f_leak(b)) - -> Materialize -> CTE Scan on q -(9 rows) +(8 rows) -- -- Views should follow policy for view owner. 
diff --git a/src/test/regress/expected/subselect.out b/src/test/regress/expected/subselect.out index f38e79c4..8f30b1c9 100644 --- a/src/test/regress/expected/subselect.out +++ b/src/test/regress/expected/subselect.out @@ -2043,11 +2043,9 @@ select * from x; Output: x_1.a -> CTE Scan on z Output: z.a - -> Materialize - Output: z1.a -> CTE Scan on z z1 Output: z1.a -(20 rows) +(18 rows) with recursive x(a) as ((values ('a'), ('b')) From b860205c6ed46e8e218c78687c5eb1e80f6e05fd Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Wed, 15 Dec 2021 16:41:16 +0800 Subject: [PATCH 317/578] fix the number of pg_proc.h columns does not match the error --- contrib/Makefile | 2 +- src/include/catalog/pg_proc.h | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/contrib/Makefile b/contrib/Makefile index 22110f25..43e984e3 100644 --- a/contrib/Makefile +++ b/contrib/Makefile @@ -47,7 +47,7 @@ SUBDIRS = \ spi \ tablefunc \ tbase_gts_tools \ - tbase_memory_tools \ + tbase_memory_tools \ tcn \ test_decoding \ tsm_system_rows \ diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h index 76881d68..324dac39 100644 --- a/src/include/catalog/pg_proc.h +++ b/src/include/catalog/pg_proc.h @@ -1303,7 +1303,7 @@ DATA(insert OID = 1080 ( hashbpchar PGNSP PGUID 12 1 0 0 0 f f f f t f i DESCR("hash"); DATA(insert OID = 1081 ( format_type PGNSP PGUID 12 1 0 0 0 f f f f f f s s 2 0 25 "26 23" _null_ _null_ _null_ _null_ _null_ format_type _null_ _null_ _null_ )); DESCR("format a type oid and atttypmod to canonical SQL"); -DATA(insert OID = 4676 ( hashbpcharextended PGNSP PGUID 12 1 0 0 0 f f f f f t f i s 2 0 20 "1042 20" _null_ _null_ _null_ _null_ _null_ hashbpcharextended _null_ _null_ _null_ )); +DATA(insert OID = 4676 ( hashbpcharextended PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 20 "1042 20" _null_ _null_ _null_ _null_ _null_ hashbpcharextended _null_ _null_ _null_ )); DESCR("hash"); DATA(insert OID = 1084 ( date_in PGNSP PGUID 12 1 0 0 0 f f f f t f s s 1 0 1082 "2275" _null_ _null_ _null_ _null_ _null_ date_in _null_ _null_ _null_ )); DESCR("I/O"); @@ -3261,7 +3261,7 @@ DATA(insert OID = 2039 ( timestamp_hash PGNSP PGUID 12 1 0 0 0 f f f f t f i DESCR("hash"); DATA(insert OID = 2041 ( overlaps PGNSP PGUID 12 1 0 0 0 f f f f f f i s 4 0 16 "1114 1114 1114 1114" _null_ _null_ _null_ _null_ _null_ overlaps_timestamp _null_ _null_ _null_ )); DESCR("intervals overlap?"); -DATA(insert OID = 4680 ( timestamp_hash_extended PGNSP PGUID 12 1 0 0 0 f f f f f t f i s 2 0 20 "1114 20" _null_ _null_ _null_ _null_ _null_ timestamp_hash_extended _null_ _null_ _null_ )); +DATA(insert OID = 4680 ( timestamp_hash_extended PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 20 "1114 20" _null_ _null_ _null_ _null_ _null_ timestamp_hash_extended _null_ _null_ _null_ )); DESCR("hash"); DATA(insert OID = 2042 ( overlaps PGNSP PGUID 14 1 0 0 0 f f f f f f i s 4 0 16 "1114 1186 1114 1186" _null_ _null_ _null_ _null_ _null_ "select ($1, ($1 + $2)) overlaps ($3, ($3 + $4))" _null_ _null_ _null_ )); DESCR("intervals overlap?"); @@ -4782,7 +4782,7 @@ DATA(insert OID = 3515 ( hashenum PGNSP PGUID 12 1 0 0 0 f f f f t f i s DESCR("hash"); DATA(insert OID = 3524 ( enum_smaller PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 3500 "3500 3500" _null_ _null_ _null_ _null_ _null_ enum_smaller _null_ _null_ _null_ )); DESCR("smaller of two"); -DATA(insert OID = 4683 ( hashenumextended PGNSP PGUID 12 1 0 0 0 f f f f f t f i s 2 0 20 "3500 20" _null_ _null_ _null_ _null_ _null_ hashenumextended _null_ _null_ _null_ )); 
+DATA(insert OID = 4683 ( hashenumextended PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 20 "3500 20" _null_ _null_ _null_ _null_ _null_ hashenumextended _null_ _null_ _null_ )); DESCR("hash"); DATA(insert OID = 3525 ( enum_larger PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 3500 "3500 3500" _null_ _null_ _null_ _null_ _null_ enum_larger _null_ _null_ _null_ )); DESCR("larger of two"); From a170d578c402280d19c276dfa01f3c251655b994 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Fri, 22 Oct 2021 15:55:47 +0800 Subject: [PATCH 318/578] fix self-development partition number : http://tapd.oa.com/20421696/bugtrace/bugs/view?bug_id=1020421696093310765&url_cache_key=868a8470a54906f0346329b940daf1c8 (merge request !837) Squash merge branch '5.06_jenny_partitions_number' into 'Tbase_v5.06' fix self-development partition number : http://tapd.oa.com/20421696/bugtrace/bugs/view?bug_id=1020421696093310765&url_cache_key=868a8470a54906f0346329b940daf1c8 --- src/backend/utils/adt/ruleutils.c | 28 ++++++++++++++++++++++++++++ src/bin/psql/describe.c | 4 ++-- src/include/catalog/pg_proc.h | 3 +++ src/include/utils/ruleutils.h | 2 ++ 4 files changed, 35 insertions(+), 2 deletions(-) diff --git a/src/backend/utils/adt/ruleutils.c b/src/backend/utils/adt/ruleutils.c index 8ce8cefe..8fa13f29 100644 --- a/src/backend/utils/adt/ruleutils.c +++ b/src/backend/utils/adt/ruleutils.c @@ -12079,6 +12079,34 @@ RelationGetAllPartitions(Relation rel) } int +GetAllPartitionIntervalCount(Oid parent_oid) +{ + int count = 0; + List *children = NULL; + Relation rel = heap_open(parent_oid, NoLock); + + children = RelationGetAllPartitions(rel); + + if(children) + { + count = children->length; + list_free(children); + } + + heap_close(rel, NoLock); + + return count; +} + +Datum +partitions_number(PG_FUNCTION_ARGS) +{ + Oid parent_oid = PG_GETARG_OID(0); + int ret = GetAllPartitionIntervalCount(parent_oid); + PG_RETURN_INT32(ret); +} + +int RelationGetChildIndex(Relation rel, Oid childoid) { int nparts = 0; diff --git a/src/bin/psql/describe.c b/src/bin/psql/describe.c index ff427084..85318723 100644 --- a/src/bin/psql/describe.c +++ b/src/bin/psql/describe.c @@ -3219,7 +3219,7 @@ describeOneTableDetails(const char *schemaname, if (verbose && pset.sversion >= 90500 && tableinfo.relkind == 'r' && tableinfo.relpartkind == 'p') { printfPQExpBuffer(&buf, - "SELECT 'RANGE(' || a.attname || ')', p.partnparts," + "SELECT 'RANGE(' || a.attname || ')', partitions_number(c.oid)," "p.partdatatype, p.partstartvalue_ts :: date, p.partstartvalue_int," "CASE WHEN p.partinterval_type=5 THEN p.partinterval_int || ' MONTH' " "WHEN p.partinterval_type=4 THEN p.partinterval_int || ' DAY' " @@ -3239,7 +3239,7 @@ describeOneTableDetails(const char *schemaname, { char * partdatatype; const char *part_by = _("Partition By"); - const char *nparts = _("# Of Partitions"); + const char *nparts = _("Partitions number"); const char *start_with = _("Start With"); const char *interv = _("Interval Of Partition"); diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h index 324dac39..8f79ca30 100644 --- a/src/include/catalog/pg_proc.h +++ b/src/include/catalog/pg_proc.h @@ -5720,6 +5720,9 @@ DESCR("get top-most partition root parent"); DATA(insert OID = 4690 ( pg_partition_ancestors PGNSP PGUID 12 1 10 0 0 f f f f t t v s 1 0 2205 "2205" "{2205,2205}" "{i,o}" "{partitionid,relid}" _null_ _null_ pg_partition_ancestors _null_ _null_ _null_ )); DESCR("view ancestors of the partition"); +/* get partition interval children count */ +DATA(insert OID = 
4691 ( partitions_number PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 2205 "2205" _null_ _null_ _null_ _null_ _null_ partitions_number _null_ _null_ _null_ )); +DESCR("get partition interval children count "); DATA(insert OID = 3410 ( pg_extent_info PGNSP PGUID 12 10 20 0 0 f f f f f t v s 1 0 2249 "2205" "{23,16,23,23,23,23,23,23,23}" "{o,o,o,o,o,o,o,o,o}" "{eid,is_occupied,shardid,freespace_cat,hwm,scan_next,scan_prev,alloc_next,alloc_prev}" _null_ _null_ pg_extent_info_oid _null_ _null_ _null_ )); DESCR("get extent info of a relation"); DATA(insert OID = 3411 ( pg_shard_scan_list PGNSP PGUID 12 10 20 0 0 f f f f f t v s 2 0 2249 "2205 23" "{23,16,23,23,23,23}" "{o,o,o,o,o,o}" "{eid,is_occupied,shardid,freespace_cat,hwm,scan_next}" _null_ _null_ pg_shard_scan_list_oid _null_ _null_ _null_ )); diff --git a/src/include/utils/ruleutils.h b/src/include/utils/ruleutils.h index 585bc16e..5dc0e217 100644 --- a/src/include/utils/ruleutils.h +++ b/src/include/utils/ruleutils.h @@ -102,6 +102,8 @@ extern int RelationGetPartitionIdxByValue(Relation rel, Datum value); extern List *RelationGetAllPartitions(Relation rel); +extern int GetAllPartitionIntervalCount(Oid parent_oid); + extern int RelationGetChildIndex(Relation rel, Oid childoid); extern Oid RelationGetPartitionIndex(Relation rel, Oid indexOid, int partidx); From 99635e22edda78fbfd88ea814a92b2c7e4d6fffa Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Tue, 21 Dec 2021 21:25:34 +0800 Subject: [PATCH 319/578] fix regress errors --- contrib/pg_clean/pg_clean.c | 3198 ----------------- src/test/regress/expected/create_table.out | 8 +- src/test/regress/expected/inherit_3.out | 20 +- src/test/regress/expected/join_3.out | 181 +- src/test/regress/expected/limit.out | 2 +- .../regress/expected/partition_join_2.out | 377 +- src/test/regress/expected/rowsecurity_1.out | 8 +- src/test/regress/expected/sanity_check.out | 2 + .../regress/expected/select_parallel_4.out | 110 +- src/test/regress/expected/stats_ext_2.out | 20 +- src/test/regress/expected/subselect.out | 4 +- src/test/regress/expected/sysviews.out | 4 +- 12 files changed, 399 insertions(+), 3535 deletions(-) delete mode 100644 contrib/pg_clean/pg_clean.c diff --git a/contrib/pg_clean/pg_clean.c b/contrib/pg_clean/pg_clean.c deleted file mode 100644 index 08a189f9..00000000 --- a/contrib/pg_clean/pg_clean.c +++ /dev/null @@ -1,3198 +0,0 @@ -#include "postgres.h" -#include "fmgr.h" -#include "funcapi.h" -#include "miscadmin.h" - -#include -#include -#include -#include -#include - -#include "storage/procarray.h" -#include "storage/lwlock.h" -#include "storage/proc.h" -#include "utils/varlena.h" -#include "utils/lsyscache.h" -#include "utils/palloc.h" -#include "utils/builtins.h" - -#include "executor/tuptable.h" -#include "pgxc/execRemote.h" -#include "pgxc/pgxcnode.h" -#include "access/tupdesc.h" -#include "access/htup_details.h" -#include "lib/stringinfo.h" - -#include "access/gtm.h" -#include "datatype/timestamp.h" -#include "access/xact.h" -#include "pgxc/pgxcnode.h" -#include "pgxc/poolmgr.h" -#include "utils/timestamp.h" -#include "catalog/pg_control.h" -#include "commands/dbcommands.h" - -#include "utils/memutils.h" -#include "nodes/memnodes.h" - -#ifdef XCP -#include "catalog/pg_type.h" -#include "catalog/pgxc_node.h" -#include "executor/executor.h" -#include "nodes/makefuncs.h" -#include "utils/snapmgr.h" -#endif -#ifdef PGXC -#include "pgxc/nodemgr.h" -#include "pgxc/pgxc.h" -#endif - -#include "storage/fd.h" -#include "pgstat.h" -#include "access/xact.h" -#include 
"access/twophase.h" -#include "access/hash.h" - -/*hash_create hash_search*/ -#include "utils/hsearch.h" - -#define TWOPHASE_RECORD_DIR "pg_2pc" -int transaction_threshold = 200000; -#define MAXIMUM_CLEAR_FILE 10000 -#define MAXIMUM_OUTPUT_FILE 1000 -#define XIDPREFIX "_$XC$" -#define DEFAULT_CLEAN_TIME_INTERVAL 120000000 -#ifdef __TWO_PHASE_TESTS__ -#define LEAST_CLEAN_TIME_INTERVAL 10000000 /* in pg_clean test_mode should not clean twophase trans prepared in ten seconds or commit in ten seconds */ -#else -#define LEAST_CLEAN_TIME_INTERVAL 60000000 /* should not clean twophase trans prepared in a minite or commit in a minite */ -#endif -GlobalTimestamp clean_time_interval = DEFAULT_CLEAN_TIME_INTERVAL; - - -PG_MODULE_MAGIC; - -#define MAX_GID 50 -#define MAX_DBNAME 64 -#define GET_START_XID "startxid:" -#define GET_COMMIT_TIMESTAMP "global_commit_timestamp:" -#define GET_START_NODE "startnode:" -#define GET_NODE "nodes:" -#define GET_XID "\nxid:" -#define GET_READONLY "readonly" -#define GIDSIZE (200 + 24) -#define MAX_TWOPC_TXN 1000 -#define STRING_BUFF_LEN 1024 - -#define MAX_CMD_LENGTH 120 - -#define XIDFOUND 1 -#define XIDNOTFOUND -1 -#define XIDEXECFAIL -2 - -#define FILEFOUND 1 -#define FILEUNKOWN -1 -#define FILENOTFOUND -2 - -#define INIT(x)\ -do{\ - x = NULL;\ - x##_count = 0;\ - x##_size = 0;\ -}while(0); - -#define RPALLOC(x)\ -do{\ - if (x##_size < x##_count+1)\ - {\ - int temp_size = (x##_size > 0) ? x##_size : 1;\ - if (NULL == x)\ - {\ - x = palloc0(2*temp_size*sizeof(*x));\ - }\ - else\ - {\ - x = repalloc(x, 2*temp_size*sizeof(*x));\ - }\ - x##_size = 2*temp_size;\ - }\ -}while(0); - -#define PALLOC(x, y)\ -do{\ - RPALLOC(x);\ - x[x##_count] = y;\ - x##_count++;\ -}while(0); - -#define RFREE(x)\ -do{\ - if (x##_size > 0)\ - {\ - pfree(x);\ - }\ - x = NULL;\ - x##_count = 0;\ - x##_size = 0;\ -}while(0); - -#define ENUM_TOCHAR_CASE(x) case x: return(#x); - -/*data structures*/ -typedef enum TXN_STATUS -{ - TXN_STATUS_INITIAL = 0, /* Initial */ - TXN_STATUS_PREPARED, - TXN_STATUS_COMMITTED, - TXN_STATUS_ABORTED, - TXN_STATUS_INPROGRESS, - TXN_STATUS_FAILED, /* Error detected while interacting with the node */ - TXN_STATUS_UNKNOWN /* Unknown: Frozen, running, or not started */ -} TXN_STATUS; - - -typedef enum -{ - UNDO = 0, - ABORT, - COMMIT -} OPERATION; - -typedef enum -{ - TWOPHASE_FILE_EXISTS = 0, - TWOPHASE_FILE_NOT_EXISTS, - TWOPHASE_FILE_OLD, - TWOPHASE_FILE_ERROR -}TWOPHASE_FILE_STATUS; - -typedef struct txn_info -{ - char gid[MAX_GID]; - uint32 *xid; /* xid used in prepare */ - TimestampTz *prepare_timestamp; - char *owner; - char *participants; - Oid origcoord; /* Original coordinator who initiated the txn */ - bool after_first_phase; - uint32 startxid; /* xid in Original coordinator */ - bool isorigcoord_part; /* Is original coordinator a - participant? */ - int num_dnparts; /* Number of participant datanodes */ - int num_coordparts; /* Number of participant coordinators */ - int *dnparts; /* Whether a node was participant in the txn */ - int *coordparts; - TXN_STATUS *txn_stat; /* Array for each nodes */ - char *msg; /* Notice message for this txn. 
*/ - GlobalTimestamp global_commit_timestamp; /* get global_commit_timestamp from node once it is committed*/ - - TXN_STATUS global_txn_stat; - OPERATION op; - bool op_issuccess; - bool is_readonly; - bool belong_abnormal_node; -}txn_info; - -typedef struct database_info -{ - struct database_info *next; - char *database_name; - - HTAB *all_txn_info; -#if 0 - txn_info *head_txn_info; - txn_info *last_txn_info; -#endif -} database_info; - -typedef struct -{ - int index; - txn_info **txn; - int txn_count; - int txn_size; - MemoryContext mycontext; -} print_txn_info; - -typedef struct -{ - int index; - int count; - char **gid; - int gid_count; - int gid_size; - char **database; - int database_count; - int database_size; - char **global_status; - int global_status_count; - int global_status_size; - char **status; - int status_count; - int status_size; - MemoryContext mycontext; -} print_status; - -typedef struct -{ - char ***slot; /*slot[i][j] stores value of row i, colum j*/ - int slot_count; /*number of rows*/ - int slot_size; - int attnum; -}TupleTableSlots; - -/*global variable*/ -static Oid *cn_node_list = NULL; -static Oid *dn_node_list = NULL; -static bool *cn_health_map = NULL; -static bool *dn_health_map = NULL; -static int cn_nodes_num = 0; -static int dn_nodes_num = 0; -static int pgxc_clean_node_count = 0; -static Oid my_nodeoid; -static -database_info *head_database_info = NULL; -static -database_info *last_database_info = NULL; -bool execute = false; -int total_twopc_txn = 0; - -TimestampTz current_time; -GlobalTimestamp abnormal_time = InvalidGlobalTimestamp; -char *abnormal_nodename = NULL; -Oid abnormal_nodeoid = InvalidOid; -bool clear_2pc_belong_node = false; - - -/*function list*/ - /*plugin entry function*/ - -static bool check_node_health(Oid node_oid); -static Datum - execute_query_on_single_node(Oid node, const char * query, int attnum, TupleTableSlots * tuples); -void DestroyTxnHash(void); -static void ResetGlobalVariables(void); - -static Oid - getMyNodeoid(void); -static void - getDatabaseList(void); -static char* TTSgetvalue(TupleTableSlots *result, int tup_num, int field_num); -static void DropTupleTableSlots(TupleTableSlots * -Slots); -static void - getTxnInfoOnNodesAll(void); -void getTxnInfoOnNode(Oid node); -void add_txn_info(char * dbname, Oid node_oid, uint32 xid, char * gid, char * owner, - TimestampTz prepared_time, TXN_STATUS status); -TWOPHASE_FILE_STATUS GetTransactionPartNodes(txn_info * txn, Oid node_oid); -static txn_info * - find_txn(char *gid); -txn_info* - make_txn_info(char * dbname, char * gid, char * owner); -database_info* - find_database_info(char *database_name); -database_info* - add_database_info(char *database_name); -int find_node_index(Oid node_oid); -Oid find_node_oid(int node_idx); -void getTxnInfoOnOtherNodesAll(void); -void getTxnInfoOnOtherNodesForDatabase(database_info *database); -void getTxnInfoOnOtherNodes(txn_info *txn); -int Get2PCXidByGid(Oid node_oid, char * gid, uint32 * transactionid); -int Get2PCFile(Oid node_oid, char * gid, uint32 * transactionid); - -char *get2PCInfo(const char *tid); - -void getTxnStatus(txn_info * txn, int node_idx); -void recover2PCForDatabaseAll(void); -void recover2PCForDatabase(database_info * db_info); -#if 0 -static bool - setMaintenanceMode(bool status); -#endif -bool send_query_clean_transaction(PGXCNodeHandle * conn, txn_info * txn, const char * finish_cmd); -bool check_2pc_belong_node(txn_info * txn); -bool check_node_participate(txn_info * txn, int node_idx); - -void recover2PC(txn_info * 
txn); -TXN_STATUS - check_txn_global_status(txn_info *txn); -bool clean_2PC_iscommit(txn_info *txn, bool is_commit, bool is_check); -bool clean_2PC_files(txn_info *txn); -void Init_print_txn_info(print_txn_info *print_txn); -void Init_print_stats_all(print_status *pstatus); -void Init_print_stats(txn_info * txn, char * database, print_status * pstatus); -static const char * - txn_status_to_string(TXN_STATUS status); -static const char * - txn_op_to_string(OPERATION op); -static void - CheckFirstPhase(txn_info *txn); -static void - get_transaction_handles(PGXCNodeAllHandles **pgxc_handles, txn_info *txn); -static void - get_node_handles(PGXCNodeAllHandles ** pgxc_handles, Oid nodeoid); - -Datum pg_clean_execute(PG_FUNCTION_ARGS); -PG_FUNCTION_INFO_V1(pg_clean_execute); -Datum pg_clean_execute(PG_FUNCTION_ARGS) -{ -#ifdef ACCESS_CONTROL_ATTR_NUM -#undef ACCESS_CONTROL_ATTR_NUM -#endif -#define ACCESS_CONTROL_ATTR_NUM 4 - FuncCallContext *funcctx; - HeapTuple tuple; - print_txn_info *print_txn = NULL; - txn_info *temp_txn; - char txn_gid[100]; - char txn_status[100]; - char txn_op[100]; - char txn_op_issuccess[100]; - - Datum values[ACCESS_CONTROL_ATTR_NUM]; - bool nulls[ACCESS_CONTROL_ATTR_NUM]; - - if(!IS_PGXC_COORDINATOR) - { - elog(ERROR, "can only called on coordinator"); - } - - if (SRF_IS_FIRSTCALL()) - { - MemoryContext oldcontext; - TupleDesc tupdesc; - MemoryContext mycontext; - funcctx = SRF_FIRSTCALL_INIT(); - - oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); - - tupdesc = CreateTemplateTupleDesc(ACCESS_CONTROL_ATTR_NUM, false); - TupleDescInitEntry(tupdesc, (AttrNumber) 1, "gid", - TEXTOID, -1, 0); - TupleDescInitEntry(tupdesc, (AttrNumber) 2, "global_transaction_status", - TEXTOID, -1, 0); - TupleDescInitEntry(tupdesc, (AttrNumber) 3, "operation", - TEXTOID, -1, 0); - TupleDescInitEntry(tupdesc, (AttrNumber) 4, "operation_status", - TEXTOID, -1, 0); - funcctx->tuple_desc = BlessTupleDesc(tupdesc); - - funcctx->user_fctx = (print_txn_info *)palloc0(sizeof(print_txn_info)); - print_txn = (print_txn_info *) funcctx->user_fctx; - - - MemoryContextSwitchTo(oldcontext); - mycontext = AllocSetContextCreate(funcctx->multi_call_memory_ctx, - "clean_check", - ALLOCSET_DEFAULT_MINSIZE, - ALLOCSET_DEFAULT_INITSIZE, - ALLOCSET_DEFAULT_MAXSIZE); - oldcontext = MemoryContextSwitchTo(mycontext); - - /*clear Global*/ - ResetGlobalVariables(); - execute = true; - clean_time_interval = PG_GETARG_INT32(0) * 1000000; - if (LEAST_CLEAN_TIME_INTERVAL > clean_time_interval) - { - clean_time_interval = LEAST_CLEAN_TIME_INTERVAL; - } - - /*get node list*/ - PgxcNodeGetOids(&cn_node_list, &dn_node_list, - &cn_nodes_num, &dn_nodes_num, true); - pgxc_clean_node_count = cn_nodes_num + dn_nodes_num; - my_nodeoid = getMyNodeoid(); - cn_health_map = palloc0(cn_nodes_num * sizeof(bool)); - dn_health_map = palloc0(dn_nodes_num * sizeof(bool)); - - /*add my database info*/ - add_database_info(get_database_name(MyDatabaseId)); - - /*get all info of 2PC transactions*/ - getTxnInfoOnNodesAll(); - - /*get txn info on other nodes all*/ - getTxnInfoOnOtherNodesAll(); - - /*recover all 2PC transactions*/ - recover2PCForDatabaseAll(); - - Init_print_txn_info(print_txn); - - print_txn->mycontext = mycontext; - - MemoryContextSwitchTo(oldcontext); - - } - - funcctx = SRF_PERCALL_SETUP(); - print_txn = (print_txn_info *) funcctx->user_fctx; - - if (print_txn->index < print_txn->txn_count) - { - temp_txn = print_txn->txn[print_txn->index]; - strncpy(txn_gid, temp_txn->gid, 100); - strncpy(txn_status, 
txn_status_to_string(temp_txn->global_txn_stat), 100); - strncpy(txn_op, txn_op_to_string(temp_txn->op), 100); - if (temp_txn->op_issuccess) - strncpy(txn_op_issuccess, "success", 100); - else - strncpy(txn_op_issuccess, "fail", 100); - - MemSet(values, 0, sizeof(values)); - MemSet(nulls, 0, sizeof(nulls)); - - values[0] = PointerGetDatum(cstring_to_text(txn_gid)); - values[1] = PointerGetDatum(cstring_to_text(txn_status)); - values[2] = PointerGetDatum(cstring_to_text(txn_op)); - values[3] = PointerGetDatum(cstring_to_text(txn_op_issuccess)); - tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls); - print_txn->index++; - SRF_RETURN_NEXT(funcctx, HeapTupleGetDatum(tuple)); - } - else - { - - //MemoryContextDelete(print_txn->mycontext); - DestroyTxnHash(); - ResetGlobalVariables(); - SRF_RETURN_DONE(funcctx); - } -} - -/* - * clear 2pc after oss detect abnormal node and restart it , - * only clear 2pc belong the abnormal node and before the abnormal time - */ -Datum pg_clean_execute_on_node(PG_FUNCTION_ARGS); -PG_FUNCTION_INFO_V1(pg_clean_execute_on_node); -Datum pg_clean_execute_on_node(PG_FUNCTION_ARGS) -{ -#ifdef ACCESS_CONTROL_ATTR_NUM -#undef ACCESS_CONTROL_ATTR_NUM -#endif -#define ACCESS_CONTROL_ATTR_NUM 4 - FuncCallContext *funcctx; - HeapTuple tuple; - print_txn_info *print_txn = NULL; - txn_info *temp_txn; - char txn_gid[100]; - char txn_status[100]; - char txn_op[100]; - char txn_op_issuccess[100]; - - Datum values[ACCESS_CONTROL_ATTR_NUM]; - bool nulls[ACCESS_CONTROL_ATTR_NUM]; - - if(!IS_PGXC_COORDINATOR) - { - elog(ERROR, "can only called on coordinator"); - } - - if (SRF_IS_FIRSTCALL()) - { - MemoryContext oldcontext; - TupleDesc tupdesc; - MemoryContext mycontext; - funcctx = SRF_FIRSTCALL_INIT(); - - oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); - - tupdesc = CreateTemplateTupleDesc(ACCESS_CONTROL_ATTR_NUM, false); - TupleDescInitEntry(tupdesc, (AttrNumber) 1, "gid", - TEXTOID, -1, 0); - TupleDescInitEntry(tupdesc, (AttrNumber) 2, "global_transaction_status", - TEXTOID, -1, 0); - TupleDescInitEntry(tupdesc, (AttrNumber) 3, "operation", - TEXTOID, -1, 0); - TupleDescInitEntry(tupdesc, (AttrNumber) 4, "operation_status", - TEXTOID, -1, 0); - funcctx->tuple_desc = BlessTupleDesc(tupdesc); - - funcctx->user_fctx = (print_txn_info *)palloc0(sizeof(print_txn_info)); - print_txn = (print_txn_info *) funcctx->user_fctx; - - - MemoryContextSwitchTo(oldcontext); - mycontext = AllocSetContextCreate(funcctx->multi_call_memory_ctx, - "clean_check", - ALLOCSET_DEFAULT_MINSIZE, - ALLOCSET_DEFAULT_INITSIZE, - ALLOCSET_DEFAULT_MAXSIZE); - oldcontext = MemoryContextSwitchTo(mycontext); - - /*clear Global*/ - ResetGlobalVariables(); - execute = true; - clear_2pc_belong_node = true; - - abnormal_nodename = text_to_cstring(PG_GETARG_TEXT_P(0)); - abnormal_nodeoid = get_pgxc_nodeoid(abnormal_nodename); - if (InvalidOid == abnormal_nodeoid) - { - elog(ERROR, "pg_clean_execute_on_node, cannot clear 2pc of invalid nodename '%s'", abnormal_nodename); - } - abnormal_time = PG_GETARG_INT64(1); - current_time = GetCurrentTimestamp(); - if (abnormal_time >= current_time) - { - elog(ERROR, "pg_clean_execute_on_node, abnormal time "INT64_FORMAT" must before current_time "INT64_FORMAT, abnormal_time, current_time); - } - - /*get node list*/ - PgxcNodeGetOids(&cn_node_list, &dn_node_list, - &cn_nodes_num, &dn_nodes_num, true); - pgxc_clean_node_count = cn_nodes_num + dn_nodes_num; - my_nodeoid = getMyNodeoid(); - cn_health_map = palloc0(cn_nodes_num * sizeof(bool)); - 
dn_health_map = palloc0(dn_nodes_num * sizeof(bool)); - - /*add my database info*/ - add_database_info(get_database_name(MyDatabaseId)); - - /*get all info of 2PC transactions*/ - getTxnInfoOnNodesAll(); - - /*get txn info on other nodes all*/ - getTxnInfoOnOtherNodesAll(); - - /*recover all 2PC transactions*/ - recover2PCForDatabaseAll(); - - Init_print_txn_info(print_txn); - - print_txn->mycontext = mycontext; - - MemoryContextSwitchTo(oldcontext); - - } - - funcctx = SRF_PERCALL_SETUP(); - print_txn = (print_txn_info *) funcctx->user_fctx; - - if (print_txn->index < print_txn->txn_count) - { - temp_txn = print_txn->txn[print_txn->index]; - strncpy(txn_gid, temp_txn->gid, 100); - strncpy(txn_status, txn_status_to_string(temp_txn->global_txn_stat), 100); - strncpy(txn_op, txn_op_to_string(temp_txn->op), 100); - if (temp_txn->op_issuccess) - strncpy(txn_op_issuccess, "success", 100); - else - strncpy(txn_op_issuccess, "fail", 100); - - MemSet(values, 0, sizeof(values)); - MemSet(nulls, 0, sizeof(nulls)); - - values[0] = PointerGetDatum(cstring_to_text(txn_gid)); - values[1] = PointerGetDatum(cstring_to_text(txn_status)); - values[2] = PointerGetDatum(cstring_to_text(txn_op)); - values[3] = PointerGetDatum(cstring_to_text(txn_op_issuccess)); - tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls); - print_txn->index++; - SRF_RETURN_NEXT(funcctx, HeapTupleGetDatum(tuple)); - } - else - { - DestroyTxnHash(); - pfree(abnormal_nodename); - ResetGlobalVariables(); - SRF_RETURN_DONE(funcctx); - } -} - - -Datum pg_clean_check_txn(PG_FUNCTION_ARGS); -PG_FUNCTION_INFO_V1(pg_clean_check_txn); -Datum pg_clean_check_txn(PG_FUNCTION_ARGS) -{ -#ifdef ACCESS_CONTROL_ATTR_NUM -#undef ACCESS_CONTROL_ATTR_NUM -#endif -#define ACCESS_CONTROL_ATTR_NUM 4 - FuncCallContext *funcctx; - HeapTuple tuple; - print_status *pstatus = NULL; - - Datum values[ACCESS_CONTROL_ATTR_NUM]; - bool nulls[ACCESS_CONTROL_ATTR_NUM]; - execute = false; - - if(!IS_PGXC_COORDINATOR) - { - elog(ERROR, "can only called on coordinator"); - } - - if (SRF_IS_FIRSTCALL()) - { - MemoryContext oldcontext; - MemoryContext mycontext; - TupleDesc tupdesc; - funcctx = SRF_FIRSTCALL_INIT(); - - oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); - - tupdesc = CreateTemplateTupleDesc(ACCESS_CONTROL_ATTR_NUM, false); - TupleDescInitEntry(tupdesc, (AttrNumber) 1, "gid", - TEXTOID, -1, 0); - TupleDescInitEntry(tupdesc, (AttrNumber) 2, "database", - TEXTOID, -1, 0); - TupleDescInitEntry(tupdesc, (AttrNumber) 3, "global_transaction_status", - TEXTOID, -1, 0); - TupleDescInitEntry(tupdesc, (AttrNumber) 4, "transaction_status_on_allnodes", - TEXTOID, -1, 0); - funcctx->tuple_desc = BlessTupleDesc(tupdesc); - - funcctx->user_fctx = (print_status *)palloc0(sizeof(print_status)); - pstatus = (print_status *) funcctx->user_fctx; - pstatus->index = pstatus->count = 0; - pstatus->gid = NULL; - pstatus->global_status = pstatus->status = (char **)NULL; - pstatus->database = NULL; - pstatus->mycontext = NULL; - - - MemoryContextSwitchTo(oldcontext); - - mycontext = AllocSetContextCreate(funcctx->multi_call_memory_ctx, - "clean_check", - ALLOCSET_DEFAULT_MINSIZE, - ALLOCSET_DEFAULT_INITSIZE, - ALLOCSET_DEFAULT_MAXSIZE); - oldcontext = MemoryContextSwitchTo(mycontext); - - /*clear Global*/ - ResetGlobalVariables(); - - clean_time_interval = PG_GETARG_INT32(0) * 1000000; - if (LEAST_CLEAN_TIME_INTERVAL > clean_time_interval) - { - clean_time_interval = LEAST_CLEAN_TIME_INTERVAL; - } - /*get node list*/ - PgxcNodeGetOids(&cn_node_list, 
&dn_node_list, - &cn_nodes_num, &dn_nodes_num, true); - if (cn_node_list == NULL || dn_node_list == NULL) - elog(ERROR, "pg_clean:fail to get cn_node_list and dn_node_list"); - pgxc_clean_node_count = cn_nodes_num + dn_nodes_num; - my_nodeoid = getMyNodeoid(); - cn_health_map = palloc0(cn_nodes_num * sizeof(bool)); - dn_health_map = palloc0(dn_nodes_num * sizeof(bool)); - - /*get all database info*/ - getDatabaseList(); - - /*get all info of 2PC transactions*/ - getTxnInfoOnNodesAll(); - - /*get txn info on other nodes all*/ - getTxnInfoOnOtherNodesAll(); - - /*recover all 2PC transactions*/ - Init_print_stats_all(pstatus); - - pstatus->mycontext = mycontext; - - MemoryContextSwitchTo(oldcontext); - - } - - funcctx = SRF_PERCALL_SETUP(); - pstatus = (print_status *) funcctx->user_fctx; - - if (pstatus->index < pstatus->count) - { - MemSet(values, 0, sizeof(values)); - MemSet(nulls, 0, sizeof(nulls)); - - values[0] = PointerGetDatum(cstring_to_text(pstatus->gid[pstatus->index])); - values[1] = PointerGetDatum(cstring_to_text(pstatus->database[pstatus->index])); - values[2] = PointerGetDatum(cstring_to_text(pstatus->global_status[pstatus->index])); - values[3] = PointerGetDatum(cstring_to_text(pstatus->status[pstatus->index])); - tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls); - pstatus->index++; - SRF_RETURN_NEXT(funcctx, HeapTupleGetDatum(tuple)); - } - else - { - /* - MemoryContextDelete(pstatus->mycontext); - DropDatabaseInfo(); - */ - DestroyTxnHash(); - ResetGlobalVariables(); - SRF_RETURN_DONE(funcctx); - } -} - -void DestroyTxnHash(void) -{ - database_info *dbinfo = head_database_info; - while (dbinfo) - { - hash_destroy(dbinfo->all_txn_info); - dbinfo = dbinfo->next; - } -} - -static void ResetGlobalVariables(void) -{ - cn_node_list = NULL; - dn_node_list = NULL; - cn_health_map = NULL; - dn_health_map = NULL; - cn_nodes_num = 0; - dn_nodes_num = 0; - pgxc_clean_node_count = 0; - execute = false; - total_twopc_txn = 0; - - head_database_info = last_database_info = NULL; - - current_time = 0; - abnormal_time = InvalidGlobalTimestamp; - abnormal_nodename = NULL; - abnormal_nodeoid = InvalidOid; - clear_2pc_belong_node = false; - -} - -static Oid getMyNodeoid(void) -{ - return get_pgxc_nodeoid(PGXCNodeName); -} - -/* - * execute_query_on_single_node -- execute query on certain node and get results - * input: node oid, execute query, number of attribute in results, results - * return: (Datum) 0 - */ -static Datum -execute_query_on_single_node(Oid node, const char *query, int attnum, TupleTableSlots *tuples) //delete numnodes, delete nodelist, insert node -{ - int ii; - bool issuccess = false; - - /*check health of node*/ - bool ishealthy = check_node_health(node); - -#ifdef XCP - EState *estate; - MemoryContext oldcontext; - RemoteQuery *plan; - RemoteQueryState *pstate; - TupleTableSlot *result = NULL; - Var *dummy; - char ntype = PGXC_NODE_NONE; - - /* - * Make up RemoteQuery plan node - */ - plan = makeNode(RemoteQuery); - plan->combine_type = COMBINE_TYPE_NONE; - plan->exec_nodes = makeNode(ExecNodes); - plan->exec_type = EXEC_ON_NONE; - - plan->exec_nodes->nodeList = lappend_int(plan->exec_nodes->nodeList, - PGXCNodeGetNodeId(node, &ntype)); - if (ntype == PGXC_NODE_NONE) - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Unknown node Oid: %u", node))); - else if (ntype == PGXC_NODE_COORDINATOR) - { - plan->exec_type = EXEC_ON_COORDS; - } - else - { - plan->exec_type = EXEC_ON_DATANODES; - } - - plan->sql_statement = (char *)query; - plan->force_autocommit 
= false; - /* - * We only need the target entry to determine result data type. - * So create dummy even if real expression is a function. - */ - for (ii = 1; ii <= attnum; ii++) - { - dummy = makeVar(1, ii, TEXTOID, 0, InvalidOid, 0); - plan->scan.plan.targetlist = lappend(plan->scan.plan.targetlist, - makeTargetEntry((Expr *) dummy, ii, NULL, false)); - } - /* prepare to execute */ - estate = CreateExecutorState(); - oldcontext = MemoryContextSwitchTo(estate->es_query_cxt); - estate->es_snapshot = GetActiveSnapshot(); - pstate = ExecInitRemoteQuery(plan, estate, 0); - MemoryContextSwitchTo(oldcontext); - - /*execute query on node when node is healthy*/ - INIT(tuples->slot); - tuples->attnum = 0; - if (ishealthy) - { - int i_tuple = 0; - int i_attnum = 0; - issuccess = true; - result = ExecRemoteQuery((PlanState *) pstate); - tuples->attnum = attnum; - while (result != NULL && !TupIsNull(result)) - { - slot_getallattrs(result); - RPALLOC(tuples->slot); - tuples->slot[i_tuple] = (char **) palloc0(attnum * sizeof(char *)); - - for (i_attnum = 0; i_attnum < attnum; i_attnum++) - { - /*if (result->tts_values[i_attnum] != (Datum)0)*/ - if (result->tts_isnull[i_attnum] == false) - { - tuples->slot[i_tuple][i_attnum] = text_to_cstring(DatumGetTextP(result->tts_values[i_attnum])); - } - else - { - tuples->slot[i_tuple][i_attnum] = NULL; - } - } - tuples->slot_count++; - - result = ExecRemoteQuery((PlanState *) pstate); - i_tuple++; - } - } - ExecEndRemoteQuery(pstate); -#endif - return issuccess == true ? (Datum) 1 : (Datum) 0; -} - -static bool check_node_health(Oid node_oid) -{ - int i; - bool ishealthy = false; - - PoolPingNodeRecheck(node_oid); - PgxcNodeGetHealthMap(cn_node_list, dn_node_list, - &cn_nodes_num, &dn_nodes_num, - cn_health_map, dn_health_map); - if (get_pgxc_nodetype(node_oid) == 'C') - { - for (i = 0; i < cn_nodes_num; i++) - { - if (cn_node_list[i] == node_oid) - { - ishealthy = cn_health_map[i]; - } - } - } - else - { - for (i = 0; i < dn_nodes_num; i++) - { - if (dn_node_list[i] == node_oid) - { - ishealthy = dn_health_map[i]; - } - } - } - return ishealthy; -} - -static void getDatabaseList(void) -{ - int i; - TupleTableSlots result_db; - const char *query_db = "select datname::text from pg_database;"; - /*add datname into tail of head_database_info*/ - if (execute_query_on_single_node(my_nodeoid, query_db, 1, &result_db) == (Datum) 1) - { - for (i = 0; i < result_db.slot_count; i++) - { - if (TTSgetvalue(&result_db, i, 0)) - { - add_database_info(TTSgetvalue(&result_db, i, 0)); - } - } - } - else - { - elog(LOG, "pg_clean: failed get database list on node %s", get_pgxc_nodename(my_nodeoid)); - } - DropTupleTableSlots(&result_db); -} - -/* - * TTSgetvalue -- get attribute from TupleTableSlots - * input: result, index of tuple, index of field - * return: attribute result - */ -static char * TTSgetvalue(TupleTableSlots *result, int tup_num, int field_num) -{ - return result->slot[tup_num][field_num]; -} - -static void DropTupleTableSlots(TupleTableSlots * -Slots) -{ - int i; - int j; - for (i = 0; i < Slots->slot_count; i++) - { - if (Slots->slot[i]) - { - for (j = 0; j < Slots->attnum; j++) - { - if (Slots->slot[i][j]) - { - pfree(Slots->slot[i][j]); - } - } - pfree(Slots->slot[i]); - } - } - RFREE(Slots->slot); - Slots->attnum = 0; - return; -} - -static void getTxnInfoOnNodesAll(void) -{ - int i; - current_time = GetCurrentTimestamp(); - /*upload 2PC transaction from CN*/ - for (i = 0; i < cn_nodes_num; i++) - { - if (total_twopc_txn >= MAX_TWOPC_TXN) - return; - 
getTxnInfoOnNode(cn_node_list[i]); - } - - /*upload 2PC transaction from DN*/ - for (i = 0; i < dn_nodes_num; i++) - { - if (total_twopc_txn >= MAX_TWOPC_TXN) - return; - getTxnInfoOnNode(dn_node_list[i]); - } -} - -void getTxnInfoOnNode(Oid node) -{ - int i; - TupleTableSlots result_txn; - Datum execute_res; - char query_execute[1024]; - const char *query_txn_status = "select transaction::text, gid::text, owner::text, database::text, timestamptz_out(prepared)::text " - "from pg_prepared_xacts;"; - const char *query_txn_status_execute = "select transaction::text, gid::text, owner::text, database::text, timestamptz_out(prepared)::text " - "from pg_prepared_xacts where database = '%s';"; - snprintf(query_execute, 1024, query_txn_status_execute, get_database_name(MyDatabaseId)); - - if (execute) - execute_res = execute_query_on_single_node(node, query_execute, 5, &result_txn); - else - execute_res = execute_query_on_single_node(node, query_txn_status, 5, &result_txn); - - if (execute_res == (Datum) 1) - { - for (i = 0; i < result_txn.slot_count; i++) - { - uint32 xid; - char* gid; - char* owner; - char* datname; - TimestampTz prepared_time; - - /*read results from each tuple*/ - xid = strtoul(TTSgetvalue(&result_txn, i, 0), NULL, 10); - gid = TTSgetvalue(&result_txn, i, 1); - owner = TTSgetvalue(&result_txn, i, 2); - datname = TTSgetvalue(&result_txn, i, 3); - prepared_time = DatumGetTimestampTz(DirectFunctionCall3(timestamptz_in, - CStringGetDatum(TTSgetvalue(&result_txn, i, 4)), - ObjectIdGetDatum(InvalidOid), - Int32GetDatum(-1))); - - /*add txn to database*/ - add_txn_info(datname, node, xid, gid, owner, prepared_time, TXN_STATUS_PREPARED); - if (total_twopc_txn >= MAX_TWOPC_TXN) - { - break; - } - } - } - else - { - elog(LOG, "pg_clean: failed get database list on node %s", get_pgxc_nodename(node)); - } - DropTupleTableSlots(&result_txn); -} - -void add_txn_info(char* dbname, Oid node_oid, uint32 xid, char * gid, - char * owner, TimestampTz prepared_time, TXN_STATUS status) -{ - txn_info *txn = NULL; - int nodeidx; - - if ((txn = find_txn(gid)) == NULL) - { - txn = make_txn_info(dbname, gid, owner); - total_twopc_txn++; - if (txn == NULL) - { - /*no more memory*/ - elog(ERROR, "there is no more memory for palloc a 2PC transaction"); - } - } - nodeidx = find_node_index(node_oid); - txn->txn_stat[nodeidx] = status; - txn->xid[nodeidx] = xid; - txn->prepare_timestamp[nodeidx] = prepared_time; - if (nodeidx < cn_nodes_num) - { - txn->coordparts[nodeidx] = 1; - txn->num_coordparts++; - } - else - { - txn->dnparts[nodeidx-cn_nodes_num] = 1; - txn->num_dnparts++; - } - return; -} - -TWOPHASE_FILE_STATUS GetTransactionPartNodes(txn_info *txn, Oid node_oid) -{ - /*get all the participates and initiate to each transactions*/ - TWOPHASE_FILE_STATUS res = TWOPHASE_FILE_NOT_EXISTS; - TupleTableSlots result; - char *partnodes = NULL; - char *startnode = NULL; - char *file_content = NULL; - uint32 startxid = 0; - char *str_startxid = NULL; - char *str_timestamp = NULL; - char *temp = NULL; - Oid temp_nodeoid; - char temp_nodetype; - int temp_nodeidx; - char stmt[1024]; - static const char *STMT_FORM = "select pgxc_get_2pc_file('%s')::text"; - snprintf(stmt, 1024, STMT_FORM, txn->gid, txn->gid, txn->gid, txn->gid); - - if (execute_query_on_single_node(node_oid, stmt, 1, &result) == (Datum) 1) - { - if (result.slot_count && TTSgetvalue(&result, 0, 0)) -#if 0 - TTSgetvalue(&result, 0, 0) && - TTSgetvalue(&result, 0, 1) && - TTSgetvalue(&result, 0, 2)) -#endif - { - file_content = TTSgetvalue(&result, 0, 
0); - - if (!IsXidImplicit(txn->gid) && strstr(file_content, GET_READONLY)) - { - txn->is_readonly = true; - txn->global_txn_stat = TXN_STATUS_COMMITTED; - DropTupleTableSlots(&result); - return TWOPHASE_FILE_EXISTS; - } - startnode = strstr(file_content, GET_START_NODE); - str_startxid = strstr(file_content, GET_START_XID); - partnodes = strstr(file_content, GET_NODE); - temp = strstr(file_content, GET_COMMIT_TIMESTAMP); - - /* get the last global_commit_timestamp */ - while (temp) - { - str_timestamp = temp; - temp += strlen(GET_COMMIT_TIMESTAMP); - temp = strstr(temp, GET_COMMIT_TIMESTAMP); - } - - if (startnode) - { - startnode += strlen(GET_START_NODE); - startnode = strtok(startnode, "\n"); - txn->origcoord = get_pgxc_nodeoid(startnode); - } - - if (str_startxid) - { - str_startxid += strlen(GET_START_XID); - str_startxid = strtok(str_startxid, "\n"); - startxid = strtoul(str_startxid, NULL, 10); - txn->startxid = startxid; - } - - if (partnodes) - { - partnodes += strlen(GET_NODE); - partnodes = strtok(partnodes, "\n"); - txn->participants = (char *) palloc0(strlen(partnodes) + 1); - strncpy(txn->participants, partnodes, strlen(partnodes) + 1); - } - - if (NULL == startnode || NULL == str_startxid) - { - res = TWOPHASE_FILE_OLD; - DropTupleTableSlots(&result); - return res; - } - - if (NULL == partnodes) - { - res = TWOPHASE_FILE_ERROR; - DropTupleTableSlots(&result); - return res; - } - - if (str_timestamp) - { - str_timestamp += strlen(GET_COMMIT_TIMESTAMP); - str_timestamp = strtok(str_timestamp, "\n"); - txn->global_commit_timestamp = strtoull(str_timestamp, NULL, 10); - } - - elog(DEBUG1, "get 2pc txn:%s partnodes in nodename: %s (nodeoid:%u) result: partnodes:%s, startnode:%s, startnodeoid:%u, startxid:%u", - txn->gid, get_pgxc_nodename(node_oid), node_oid, partnodes, startnode, txn->origcoord, startxid); - /* in explicit transaction startnode participate the transaction */ - if (strstr(partnodes, startnode) || !IsXidImplicit(txn->gid)) - { - txn->isorigcoord_part = true; - } - else - { - txn->isorigcoord_part = false; - } - - res = TWOPHASE_FILE_EXISTS; - txn->num_coordparts = 0; - txn->num_dnparts = 0; - temp = strtok(partnodes,", "); - while(temp) - { - /*check node type*/ - temp_nodeoid = get_pgxc_nodeoid(temp); - if (temp_nodeoid == InvalidOid) - { - res = TWOPHASE_FILE_ERROR; - break; - } - temp_nodetype = get_pgxc_nodetype(temp_nodeoid); - temp_nodeidx = find_node_index(temp_nodeoid); - - switch (temp_nodetype) - { - case 'C': - txn->coordparts[temp_nodeidx] = 1; - txn->num_coordparts++; - break; - case 'D': - txn->dnparts[temp_nodeidx-cn_nodes_num] = 1; - txn->num_dnparts++; - break; - default: - elog(ERROR,"nodetype of %s is not 'C' or 'D'", temp); - break; - } - temp = strtok(NULL,", "); - } - } - } - else - { - elog(LOG, "pg_clean: failed get database list on node %s", get_pgxc_nodename(node_oid)); - res = TWOPHASE_FILE_ERROR; - } - DropTupleTableSlots(&result); - return res; -} - -static txn_info *find_txn(char *gid) -{ - bool found; - database_info *cur_db; - txn_info *txn; - - for (cur_db = head_database_info; cur_db; cur_db = cur_db->next) - { -#if 0 - for (cur_txn = cur_db->head_txn_info; cur_txn; cur_txn = cur_txn->next) - { - if (0 == strcmp(cur_txn->gid, gid)) - return cur_txn; - } -#endif - txn = (txn_info *)hash_search(cur_db->all_txn_info, (void *)gid, HASH_FIND, &found); - if (found) - return txn; - } - return NULL; -} - -txn_info* make_txn_info(char* dbname, char* gid, char* owner) -{ - bool found; - txn_info *txn_insert_pos = NULL; - database_info 
*dbinfo; - txn_info *txn; - - dbinfo = add_database_info(dbname); - txn = (txn_info *)palloc0(sizeof(txn_info)); - if (txn == NULL) - return NULL; - //txn->next = NULL; - - //txn->gid = (char *)palloc0(strlen(gid)+1); - strncpy(txn->gid, gid, strlen(gid)+1); - txn->owner = (char *)palloc0(strlen(owner)+1); - strncpy(txn->owner, owner, strlen(owner)+1); - - txn->txn_stat = (TXN_STATUS *)palloc0(sizeof(TXN_STATUS) * pgxc_clean_node_count); - txn->xid = (uint32 *)palloc0(sizeof(uint32) * pgxc_clean_node_count); - txn->prepare_timestamp = (TimestampTz *)palloc0(sizeof(TimestampTz) * pgxc_clean_node_count); - txn->coordparts = (int *)palloc0(cn_nodes_num * sizeof(int)); - - txn->dnparts = (int *)palloc0(dn_nodes_num * sizeof(int)); - if (txn->gid == NULL || txn->owner == NULL || txn->txn_stat == NULL - || txn->xid == NULL || txn->coordparts == NULL || txn->dnparts == NULL || txn->prepare_timestamp == NULL) - { - pfree(txn); - return(NULL); - } - - txn_insert_pos = (txn_info *)hash_search(dbinfo->all_txn_info, - (void *)txn->gid, HASH_ENTER, &found); - if (!found) - memcpy(txn_insert_pos, txn, sizeof(txn_info)); - -#if 0 - if (dbinfo->head_txn_info == NULL) - { - dbinfo->head_txn_info = dbinfo->last_txn_info = txn; - } - else - { - dbinfo->last_txn_info->next = txn; - dbinfo->last_txn_info = txn; - } -#endif - - return txn_insert_pos; -} - -database_info *find_database_info(char *database_name) -{ - database_info *cur_database_info = head_database_info; - - for (;cur_database_info; cur_database_info = cur_database_info->next) - { - if(cur_database_info->database_name && - database_name && - strcmp(cur_database_info->database_name, database_name) == 0) - return(cur_database_info); - } - return(NULL); -} - -database_info *add_database_info(char *database_name) -{ - database_info *rv; - HASHCTL txn_ctl; - char tabname[STRING_BUFF_LEN]; - - if ((rv = find_database_info(database_name)) != NULL) - return rv; /* Already in the list */ - rv = (database_info *)palloc0(sizeof(database_info)); - if (rv == NULL) - return NULL; - rv->next = NULL; - rv->database_name = (char *)palloc0(strlen(database_name) + 1); - strncpy(rv->database_name, database_name, strlen(database_name) + 1); - if (rv->database_name == NULL) - { - pfree(rv); - return NULL; - } -#if 0 - rv->head_txn_info = NULL; - rv->last_txn_info = NULL; -#endif - - snprintf(tabname, STRING_BUFF_LEN, "%s txn info", rv->database_name); - txn_ctl.keysize = MAX_GID; - txn_ctl.entrysize = sizeof(txn_info); - rv->all_txn_info = hash_create(tabname, 64, - &txn_ctl, HASH_ELEM); - if (head_database_info == NULL) - { - head_database_info = last_database_info = rv; - return rv; - } - else - { - last_database_info->next = rv; - last_database_info = rv; - return rv; - } -} - -int find_node_index(Oid node_oid) -{ - int res = -1; - int i; - if (get_pgxc_nodetype(node_oid) == 'C') - { - for (i = 0; i < cn_nodes_num; i++) - { - if (node_oid == cn_node_list[i]) - { - res = i; - break; - } - } - } - else - { - for (i = 0; i < dn_nodes_num; i++) - { - if (node_oid == dn_node_list[i]) - { - res = i+cn_nodes_num; - break; - } - } - } - return res; -} - -Oid find_node_oid(int node_idx) -{ - return (node_idx < cn_nodes_num) ? 
cn_node_list[node_idx] : - dn_node_list[node_idx-cn_nodes_num]; -} - -void getTxnInfoOnOtherNodesAll(void) -{ - database_info *cur_database; - - for (cur_database = head_database_info; cur_database; cur_database = cur_database->next) - { - getTxnInfoOnOtherNodesForDatabase(cur_database); - } -} - -void getTxnInfoOnOtherNodesForDatabase(database_info *database) -{ - txn_info *cur_txn; - HASH_SEQ_STATUS status; - HTAB *txn = database->all_txn_info; - hash_seq_init(&status, txn); - - while ((cur_txn = (txn_info *) hash_seq_search(&status)) != NULL) - { - getTxnInfoOnOtherNodes(cur_txn); - } -#if 0 - for (cur_txn = database->head_txn_info; cur_txn; cur_txn = cur_txn->next) - { - getTxnInfoOnOtherNodes(cur_txn); - } -#endif -} - -void getTxnInfoOnOtherNodes(txn_info *txn) -{ - int ii; - int ret; - char node_type; - TWOPHASE_FILE_STATUS status = TWOPHASE_FILE_NOT_EXISTS; - Oid node_oid; - uint32 transactionid = 0; - char gid[MAX_GID]; - char *ptr = NULL; - - if (IsXidImplicit(txn->gid)) - { - strncpy(gid, txn->gid, strlen(txn->gid)+1); - ptr = strtok(gid, ":"); - ptr = strtok(NULL, ":"); - node_oid = get_pgxc_nodeoid(ptr); - status = GetTransactionPartNodes(txn, node_oid); - } - else - { - for (ii = 0; ii < cn_nodes_num + dn_nodes_num; ii++) - { - if (ii < cn_nodes_num) - { - status = GetTransactionPartNodes(txn, cn_node_list[ii]); - if (TWOPHASE_FILE_EXISTS == status || - TWOPHASE_FILE_OLD == status || - TWOPHASE_FILE_ERROR == status) - { - node_oid = cn_node_list[ii]; - break; - } - } - else - { - status = GetTransactionPartNodes(txn, dn_node_list[ii - cn_nodes_num]); - if (TWOPHASE_FILE_EXISTS == status || - TWOPHASE_FILE_OLD == status || - TWOPHASE_FILE_ERROR == status) - { - node_oid = dn_node_list[ii - cn_nodes_num]; - break; - } - } - } - - /* since there may be explicit readonly twophase transactions */ - if (txn->is_readonly) - { - return; - } - if (TWOPHASE_FILE_EXISTS == status && - InvalidGlobalTimestamp == txn->global_commit_timestamp && - node_oid != txn->origcoord) - { - status = GetTransactionPartNodes(txn, txn->origcoord); - } - - } - - if (TWOPHASE_FILE_EXISTS != status) - { - /* - * if 2pc file not exists in all nodes, the trans did not pass the prepared phase, - * - */ - txn->global_txn_stat = (TWOPHASE_FILE_NOT_EXISTS == status) ? 
- TXN_STATUS_ABORTED : TXN_STATUS_UNKNOWN; - return; - } - - - /* judge the range of global status */ - CheckFirstPhase(txn); - - for (ii = 0; ii < pgxc_clean_node_count; ii++) - { - if (txn->txn_stat[ii] == TXN_STATUS_INITIAL) - { - /*check node ii is 'C' or 'D'*/ - node_oid = find_node_oid(ii); - if (node_oid == txn->origcoord) - continue; - node_type = get_pgxc_nodetype(node_oid); - if (node_type == 'C' && txn->coordparts[ii] != 1) - continue; - if (node_type == 'D' && txn->dnparts[ii - cn_nodes_num] != 1) - continue; - /*check coordparts or dnparts*/ - if (txn->xid[ii] == 0) - { - ret = Get2PCXidByGid(node_oid, txn->gid, &transactionid); - if (ret == XIDFOUND) - { - txn->xid[ii] = transactionid; - if (txn->xid[ii] > 0) - getTxnStatus(txn, ii); - } - else if (ret == XIDNOTFOUND) - { - if (txn->after_first_phase) - txn->txn_stat[ii] = TXN_STATUS_COMMITTED; - } - else - txn->txn_stat[ii] = TXN_STATUS_UNKNOWN; - - } - } - } -} - -/*get xid by gid on node_oid*/ -int Get2PCXidByGid(Oid node_oid, char *gid, uint32 *transactionid) -{ - int ret = XIDFOUND; - TupleTableSlots result; - uint32 xid = 0; - static const char *STMT_FORM = "select pgxc_get_2pc_xid('%s')::text;"; - char stmt[100]; - snprintf(stmt, 100, STMT_FORM, gid); - /*if exist get xid by gid on node_oid*/ - if (execute_query_on_single_node(node_oid, stmt, 1, &result) != (Datum) 0) - { - if (result.slot_count) - { - if (TTSgetvalue(&result, 0, 0)) - { - xid = strtoul(TTSgetvalue(&result, 0, 0), NULL, 10); - *transactionid = xid; - if (xid == 0) - ret = XIDNOTFOUND; - } - else - ret = XIDNOTFOUND; - } - else - ret = XIDNOTFOUND; - } - else - ret = XIDEXECFAIL; - DropTupleTableSlots(&result); - return ret; -} - -int Get2PCFile(Oid node_oid, char * gid, uint32 * transactionid) -{ - int ret = FILEFOUND; - TupleTableSlots result; - static const char *STMT_FORM = "select pgxc_get_2pc_file('%s')::text;"; - char stmt[100]; - snprintf(stmt, 100, STMT_FORM, gid); - /*if exist get xid by gid on node_oid*/ - if (execute_query_on_single_node(node_oid, stmt, 1, &result) != (Datum) 0) - { - if (result.slot_count) - { - if (!TTSgetvalue(&result, 0, 0)) - { - ret = FILENOTFOUND; - } - else - { - ret = FILEFOUND; - } - } - else - ret = FILENOTFOUND; - } - else - ret = FILEUNKOWN; - DropTupleTableSlots(&result); - return ret; -} - - -void getTxnStatus(txn_info *txn, int node_idx) -{ - Oid node_oid; - char stmt[1024]; - char *att1; - TupleTableSlots result; - - static const char *STMT_FORM = "SELECT pgxc_is_committed('%d'::xid)::text"; - snprintf(stmt, 1024, STMT_FORM, txn->xid[node_idx], txn->xid[node_idx]); - - node_oid = find_node_oid(node_idx); - if (0 != execute_query_on_single_node(node_oid, stmt, 1, &result)) - { - att1 = TTSgetvalue(&result, 0, 0); - - if (att1) - { - if (strcmp(att1, "true") == 0) - { - txn->txn_stat[node_idx] = TXN_STATUS_COMMITTED; - } - else - txn->txn_stat[node_idx] = TXN_STATUS_ABORTED; - } - else - { - txn->txn_stat[node_idx] = TXN_STATUS_INITIAL; - } - } - else - txn->txn_stat[node_idx] = TXN_STATUS_UNKNOWN; - DropTupleTableSlots(&result); -} - -char *get2PCInfo(const char *tid) -{ - char *result = NULL; - char *info = NULL; - int size = 0; - File fd = -1; - int ret = -1; - struct stat filestate; - char path[MAXPGPATH]; - - info = get_2pc_info_from_cache(tid); - if (NULL != info) - { - size = strlen(info); - result = (char *)palloc0(size + 1); - memcpy(result, info, size); - return result; - } - - elog(LOG, "try to get 2pc info from disk, tid: %s", tid); - - snprintf(path, MAXPGPATH, TWOPHASE_RECORD_DIR "/%s", tid); - 
if(access(path, F_OK) == 0) - { - if(stat(path, &filestate) == -1) - { - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not get status of file \"%s\"", path))); - } - - size = filestate.st_size; - - if (0 == size) - { - return NULL; - } - - result = (char *)palloc0(size + 1); - - fd = PathNameOpenFile(path, O_RDONLY, S_IRUSR | S_IWUSR); - if (fd < 0) - { - pfree(result); - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not open file \"%s\" for read", path))); - } - - ret = FileRead(fd, result, size, WAIT_EVENT_BUFFILE_READ); - if(ret != size) - { - pfree(result); - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not read file \"%s\"", path))); - } - - FileClose(fd); - return result; - } - - return NULL; -} - -Datum pgxc_get_2pc_file(PG_FUNCTION_ARGS); -PG_FUNCTION_INFO_V1(pgxc_get_2pc_file); -Datum pgxc_get_2pc_file(PG_FUNCTION_ARGS) -{ - char *tid = NULL; - char *result = NULL; - text *t_result = NULL; - - tid = text_to_cstring(PG_GETARG_TEXT_P(0)); - result = get2PCInfo(tid); - if (NULL != result) - { - t_result = cstring_to_text(result); - pfree(result); - return PointerGetDatum(t_result); - } - PG_RETURN_NULL(); -} - - -Datum pgxc_get_2pc_nodes(PG_FUNCTION_ARGS); -PG_FUNCTION_INFO_V1(pgxc_get_2pc_nodes); -Datum pgxc_get_2pc_nodes(PG_FUNCTION_ARGS) -{ - char *tid = NULL; - char *result = NULL; - char *nodename = NULL; - text *t_result = NULL; - - tid = text_to_cstring(PG_GETARG_TEXT_P(0)); - result = get2PCInfo(tid); - if (NULL != result) - { - nodename = strstr(result, GET_NODE); - if (NULL != nodename) - { - nodename += strlen(GET_NODE); - nodename = strtok(nodename, "\n"); - t_result = cstring_to_text(nodename); - pfree(result); - return PointerGetDatum(t_result); - } - } - - PG_RETURN_NULL(); -} - -Datum pgxc_get_2pc_startnode(PG_FUNCTION_ARGS); -PG_FUNCTION_INFO_V1(pgxc_get_2pc_startnode); -Datum pgxc_get_2pc_startnode(PG_FUNCTION_ARGS) -{ - char *tid = NULL; - char *result = NULL; - char *nodename = NULL; - text *t_result = NULL; - - tid = text_to_cstring(PG_GETARG_TEXT_P(0)); - result = get2PCInfo(tid); - if (NULL != result) - { - nodename = strstr(result, GET_START_NODE); - if (NULL != nodename) - { - nodename += strlen(GET_START_NODE); - nodename = strtok(nodename, "\n"); - t_result = cstring_to_text(nodename); - pfree(result); - return PointerGetDatum(t_result); - - } - } - PG_RETURN_NULL(); -} - -Datum pgxc_get_2pc_startxid(PG_FUNCTION_ARGS); -PG_FUNCTION_INFO_V1(pgxc_get_2pc_startxid); -Datum pgxc_get_2pc_startxid(PG_FUNCTION_ARGS) -{ - char *tid = NULL; - char *result = NULL; - char *startxid = NULL; - text *t_result = NULL; - - tid = text_to_cstring(PG_GETARG_TEXT_P(0)); - result = get2PCInfo(tid); - if (NULL != result) - { - startxid = strstr(result, GET_START_XID); - if (NULL != startxid) - { - startxid += strlen(GET_START_XID); - startxid = strtok(startxid, "\n"); - t_result = cstring_to_text(startxid); - pfree(result); - return PointerGetDatum(t_result); - } - } - PG_RETURN_NULL(); -} - - -Datum pgxc_get_2pc_commit_timestamp(PG_FUNCTION_ARGS); -PG_FUNCTION_INFO_V1(pgxc_get_2pc_commit_timestamp); -Datum pgxc_get_2pc_commit_timestamp(PG_FUNCTION_ARGS) -{ - char *tid = NULL; - char *result = NULL; - char *commit_timestamp = NULL; - text *t_result = NULL; - - tid = text_to_cstring(PG_GETARG_TEXT_P(0)); - result = get2PCInfo(tid); - if (NULL != result) - { - commit_timestamp = strstr(result, GET_COMMIT_TIMESTAMP); - if (NULL != commit_timestamp) - { - commit_timestamp += strlen(GET_COMMIT_TIMESTAMP); - commit_timestamp = 
strtok(commit_timestamp, "\n"); - t_result = cstring_to_text(commit_timestamp); - pfree(result); - return PointerGetDatum(t_result); - } - } - PG_RETURN_NULL(); -} - - - -Datum pgxc_get_2pc_xid(PG_FUNCTION_ARGS); -PG_FUNCTION_INFO_V1(pgxc_get_2pc_xid); -Datum pgxc_get_2pc_xid(PG_FUNCTION_ARGS) -{ - char *tid = NULL; - char *result = NULL; - char *str_xid = NULL; - GlobalTransactionId xid; - - tid = text_to_cstring(PG_GETARG_TEXT_P(0)); - result = get2PCInfo(tid); - if (NULL != result) - { - str_xid = strstr(result, GET_XID); - if (NULL != str_xid) - { - str_xid += strlen(GET_XID); - str_xid = strtok(str_xid, "\n"); - xid = strtoul(str_xid, NULL, 10); - pfree(result); - PG_RETURN_UINT32(xid); - } - } - PG_RETURN_NULL(); -} - -Datum pgxc_remove_2pc_records(PG_FUNCTION_ARGS); -PG_FUNCTION_INFO_V1(pgxc_remove_2pc_records); -Datum pgxc_remove_2pc_records(PG_FUNCTION_ARGS) -{ - char *tid = text_to_cstring(PG_GETARG_TEXT_P(0)); - remove_2pc_records(tid, true); - pfree(tid); - PG_RETURN_BOOL(true); -} - -Datum pgxc_clear_2pc_records(PG_FUNCTION_ARGS); -PG_FUNCTION_INFO_V1(pgxc_clear_2pc_records); -Datum pgxc_clear_2pc_records(PG_FUNCTION_ARGS) -{ - MemoryContext oldcontext; - MemoryContext mycontext; - - int i = 0; - int count = 0; - TupleTableSlots *result; - TupleTableSlots clear_result; - const char *query = "select pgxc_get_record_list()::text"; - const char *CLEAR_STMT = "select pgxc_remove_2pc_records('%s')::text"; - char clear_query[100]; - char *twopcfiles = NULL; - char *ptr = NULL; - bool res = true; - - if(!IS_PGXC_COORDINATOR) - { - elog(ERROR, "can only called on coordinator"); - } - - mycontext = AllocSetContextCreate(CurrentMemoryContext, - "clean_check", - ALLOCSET_DEFAULT_MINSIZE, - ALLOCSET_DEFAULT_INITSIZE, - ALLOCSET_DEFAULT_MAXSIZE); - oldcontext = MemoryContextSwitchTo(mycontext); - - ResetGlobalVariables(); -#if 0 - if((dir = opendir(TWOPHASE_RECORD_DIR))) - { - while((ptr = readdir(dir)) != NULL) - { - if (count > 999) - break; - if(strcmp(ptr->d_name,".") == 0 || strcmp(ptr->d_name,"..") == 0) - { - continue; - } - snprintf(path[count], MAX_GID, "/%s", ptr->d_name); - //snprintf(path[count], MAX_GID, "/%s", ptr->d_name); - count++; - } - - closedir(dir); - } -#endif - - /*get node list*/ - PgxcNodeGetOids(&cn_node_list, &dn_node_list, - &cn_nodes_num, &dn_nodes_num, true); - pgxc_clean_node_count = cn_nodes_num + dn_nodes_num; - my_nodeoid = getMyNodeoid(); - cn_health_map = palloc0(cn_nodes_num * sizeof(bool)); - dn_health_map = palloc0(dn_nodes_num * sizeof(bool)); - result = (TupleTableSlots *)palloc0(pgxc_clean_node_count * sizeof(TupleTableSlots)); - - /*collect the 2pc file in nodes*/ - for (i = 0; i < cn_nodes_num; i++) - { - (void) execute_query_on_single_node(cn_node_list[i], query, 1, result+i); - } - - for (i = 0; i < dn_nodes_num; i++) - { - (void) execute_query_on_single_node(dn_node_list[i], query, 1, result+cn_nodes_num+i); - } - /*get all database info*/ - getDatabaseList(); - - /*get all info of 2PC transactions*/ - getTxnInfoOnNodesAll(); -#if 0 - if((dir = opendir(TWOPHASE_RECORD_DIR))) - { - while (i < count) - { - if (!find_txn(path[i])) - { - unlink(path[i]); - WriteClean2pcXlogRec(path[i]); - } - i++; - } - - closedir(dir); - } -#endif - /*delete all rest 2pc file in each nodes*/ - for (i = 0; i < cn_nodes_num; i++) - { - if (0 == result[i].slot_count) - { - continue; - } - if (!(twopcfiles = TTSgetvalue(result+i, 0, 0))) - continue; - ptr = strtok(twopcfiles, ","); - while(ptr) - { - if (count >= MAXIMUM_CLEAR_FILE) - break; - if (!find_txn(ptr)) - 
{ - snprintf(clear_query, 100, CLEAR_STMT, ptr); - if (execute_query_on_single_node(cn_node_list[i], clear_query, 1, &clear_result) == (Datum)0) - res = false; - DropTupleTableSlots(&clear_result); - count++; - } - ptr = strtok(NULL, ","); - } - } - - for (i = 0; i < dn_nodes_num; i++) - { - if (0 == result[cn_nodes_num+i].slot_count) - { - continue; - } - if (!(twopcfiles = TTSgetvalue(result+cn_nodes_num+i, 0, 0))) - continue; - ptr = strtok(twopcfiles, ","); - while(ptr) - { - if (count >= MAXIMUM_CLEAR_FILE) - break; - if (!find_txn(ptr)) - { - snprintf(clear_query, 100, CLEAR_STMT, ptr); - if (execute_query_on_single_node(dn_node_list[i], clear_query, 1, &clear_result) == (Datum)0) - res = false; - DropTupleTableSlots(&clear_result); - count++; - } - ptr = strtok(NULL, ","); - } - } - - for (i = 0; i < pgxc_clean_node_count; i++) - DropTupleTableSlots(result+i); - - DestroyTxnHash(); - ResetGlobalVariables(); - - MemoryContextSwitchTo(oldcontext); - MemoryContextDelete(mycontext); - - - PG_RETURN_BOOL(res); -} - -Datum pgxc_get_record_list(PG_FUNCTION_ARGS); -PG_FUNCTION_INFO_V1(pgxc_get_record_list); -Datum pgxc_get_record_list(PG_FUNCTION_ARGS) -{ - int count = 0; - DIR *dir = NULL; - struct dirent *ptr = NULL; - char *recordList = NULL; - text *t_recordList = NULL; - - /* get from hash table */ - recordList = get_2pc_list_from_cache(&count); - if (count >= MAXIMUM_OUTPUT_FILE) - { - Assert(NULL != recordList); - t_recordList = cstring_to_text(recordList); - return PointerGetDatum(t_recordList); - } - - /* get from disk */ - if(!(dir = opendir(TWOPHASE_RECORD_DIR))) - { - if(NULL == recordList) - { - PG_RETURN_NULL(); - } - - t_recordList = cstring_to_text(recordList); - return PointerGetDatum(t_recordList); - } - - while((ptr = readdir(dir)) != NULL) - { - if(strcmp(ptr->d_name,".") == 0 || strcmp(ptr->d_name,"..") == 0) - { - continue; - } - if (count >= MAXIMUM_OUTPUT_FILE) - { - break; - } - - if(!recordList) - { - recordList = (char *)palloc0(strlen(ptr->d_name) + 1); - sprintf(recordList, "%s", ptr->d_name); - } - else - { - recordList = (char *) repalloc(recordList, - strlen(ptr->d_name) + strlen(recordList) + 2); - sprintf(recordList, "%s,%s", recordList, ptr->d_name); - } - count++; - } - - closedir(dir); - - if(!recordList) - { - PG_RETURN_NULL(); - } - else - { - t_recordList = cstring_to_text(recordList); - return PointerGetDatum(t_recordList); - } -} - -Datum pgxc_commit_on_node(PG_FUNCTION_ARGS); -PG_FUNCTION_INFO_V1(pgxc_commit_on_node); -Datum pgxc_commit_on_node(PG_FUNCTION_ARGS) -{ - /* nodename, gid */ - char *nodename; - Oid nodeoid; - char *gid; - txn_info *txn; - char command[MAX_CMD_LENGTH]; - PGXCNodeHandle **connections = NULL; - int conn_count = 0; - ResponseCombiner combiner; - PGXCNodeAllHandles *pgxc_handles = NULL; - PGXCNodeHandle *conn = NULL; - - /*clear Global*/ - ResetGlobalVariables(); - /*get node list*/ - PgxcNodeGetOids(&cn_node_list, &dn_node_list, - &cn_nodes_num, &dn_nodes_num, true); - if (cn_node_list == NULL || dn_node_list == NULL) - elog(ERROR, "pg_clean:fail to get cn_node_list and dn_node_list"); - pgxc_clean_node_count = cn_nodes_num + dn_nodes_num; - my_nodeoid = getMyNodeoid(); - cn_health_map = palloc0(cn_nodes_num * sizeof(bool)); - dn_health_map = palloc0(dn_nodes_num * sizeof(bool)); - - nodename = text_to_cstring(PG_GETARG_TEXT_P(0)); - gid = text_to_cstring(PG_GETARG_TEXT_P(1)); - nodeoid = get_pgxc_nodeoid(nodename); - if (InvalidOid == nodeoid) - { - elog(ERROR, "Invalid nodename '%s'", nodename); - } - - txn = (txn_info 
*)palloc0(sizeof(txn_info)); - if (txn == NULL) - { - PG_RETURN_BOOL(false); - } - txn->txn_stat = (TXN_STATUS *)palloc0(sizeof(TXN_STATUS) * pgxc_clean_node_count); - txn->xid = (uint32 *)palloc0(sizeof(uint32) * pgxc_clean_node_count); - txn->prepare_timestamp = (TimestampTz *)palloc0(sizeof(TimestampTz) * pgxc_clean_node_count); - txn->coordparts = (int *)palloc0(cn_nodes_num * sizeof(int)); - txn->dnparts = (int *)palloc0(dn_nodes_num * sizeof(int)); - - strncpy(txn->gid, gid, strlen(gid)+1); - getTxnInfoOnOtherNodes(txn); - snprintf(command, MAX_CMD_LENGTH, "commit prepared '%s'", txn->gid); - - - if (InvalidGlobalTimestamp == txn->global_commit_timestamp) - { - if (!txn->is_readonly) - { - elog(ERROR, "in pg_clean, fail to get global_commit_timestamp for transaction '%s' on", gid); - } - else - { - txn->global_commit_timestamp = GetGlobalTimestampGTM(); - } - } - - connections = (PGXCNodeHandle**)palloc(sizeof(PGXCNodeHandle*)); - get_node_handles(&pgxc_handles, nodeoid); - - conn = (PGXC_NODE_COORDINATOR == get_pgxc_nodetype(nodeoid)) ? - pgxc_handles->coord_handles[0] : pgxc_handles->datanode_handles[0]; - if (!send_query_clean_transaction(conn, txn, command)) - { - elog(ERROR, "pg_clean: send query '%s' from '%s' to '%s' failed ", - command, get_pgxc_nodename(my_nodeoid) , nodename); - } - else - { - connections[conn_count++] = conn; - } - /* receive response */ - if (conn_count) - { - InitResponseCombiner(&combiner, conn_count, COMBINE_TYPE_NONE); - if (pgxc_node_receive_responses(conn_count, connections, NULL, &combiner) || - !validate_combiner(&combiner)) - { - if (combiner.errorMessage) - pgxc_node_report_error(&combiner); - else - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Failed to FINISH the transaction on one or more nodes"))); - } - else - CloseCombiner(&combiner); - } - /*clear Global*/ - ResetGlobalVariables(); - clear_handles(); - pfree_pgxc_all_handles(pgxc_handles); - pgxc_handles = NULL; - pfree(connections); - connections = NULL; - - PG_RETURN_BOOL(true); -} - -Datum pgxc_abort_on_node(PG_FUNCTION_ARGS); -PG_FUNCTION_INFO_V1(pgxc_abort_on_node); -Datum pgxc_abort_on_node(PG_FUNCTION_ARGS) -{ - /* nodename, gid */ - char *nodename; - Oid nodeoid; - char *gid; - txn_info *txn; - char command[MAX_CMD_LENGTH]; - PGXCNodeHandle **connections = NULL; - int conn_count = 0; - ResponseCombiner combiner; - PGXCNodeAllHandles *pgxc_handles = NULL; - PGXCNodeHandle *conn = NULL; - - /*clear Global*/ - ResetGlobalVariables(); - /*get node list*/ - PgxcNodeGetOids(&cn_node_list, &dn_node_list, - &cn_nodes_num, &dn_nodes_num, true); - if (cn_node_list == NULL || dn_node_list == NULL) - elog(ERROR, "pg_clean:fail to get cn_node_list and dn_node_list"); - pgxc_clean_node_count = cn_nodes_num + dn_nodes_num; - my_nodeoid = getMyNodeoid(); - cn_health_map = palloc0(cn_nodes_num * sizeof(bool)); - dn_health_map = palloc0(dn_nodes_num * sizeof(bool)); - - nodename = text_to_cstring(PG_GETARG_TEXT_P(0)); - gid = text_to_cstring(PG_GETARG_TEXT_P(1)); - nodeoid = get_pgxc_nodeoid(nodename); - if (InvalidOid == nodeoid) - { - elog(ERROR, "Invalid nodename '%s'", nodename); - } - - txn = (txn_info *)palloc0(sizeof(txn_info)); - if (txn == NULL) - { - PG_RETURN_BOOL(false); - } - txn->txn_stat = (TXN_STATUS *)palloc0(sizeof(TXN_STATUS) * pgxc_clean_node_count); - txn->xid = (uint32 *)palloc0(sizeof(uint32) * pgxc_clean_node_count); - txn->prepare_timestamp = (TimestampTz *)palloc0(sizeof(TimestampTz) * pgxc_clean_node_count); - txn->coordparts = (int *)palloc0(cn_nodes_num 
* sizeof(int)); - txn->dnparts = (int *)palloc0(dn_nodes_num * sizeof(int)); - - strncpy(txn->gid, gid, strlen(gid)+1); - connections = (PGXCNodeHandle**)palloc(sizeof(PGXCNodeHandle*)); - getTxnInfoOnOtherNodes(txn); - snprintf(command, MAX_CMD_LENGTH, "rollback prepared '%s'", txn->gid); -#if 0 - if (!setMaintenanceMode(true)) - { - elog(ERROR, "Error: fail to set maintenance mode on in pg_clean"); - } -#endif - - get_node_handles(&pgxc_handles, nodeoid); - - conn = (PGXC_NODE_COORDINATOR == get_pgxc_nodetype(nodeoid)) ? - pgxc_handles->coord_handles[0] : pgxc_handles->datanode_handles[0]; - if (!send_query_clean_transaction(conn, txn, command)) - { - elog(ERROR, "pg_clean: send query '%s' from '%s' to '%s' failed ", - command, get_pgxc_nodename(my_nodeoid) , nodename); - } - else - { - connections[conn_count++] = conn; - } - /* receive response */ - if (conn_count) - { - InitResponseCombiner(&combiner, conn_count, COMBINE_TYPE_NONE); - if (pgxc_node_receive_responses(conn_count, connections, NULL, &combiner) || - !validate_combiner(&combiner)) - { - if (combiner.errorMessage) - pgxc_node_report_error(&combiner); - else - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Failed to FINISH the transaction on one or more nodes"))); - } - else - CloseCombiner(&combiner); - } - /*clear Global*/ - ResetGlobalVariables(); - clear_handles(); - pfree_pgxc_all_handles(pgxc_handles); - pgxc_handles = NULL; - pfree(connections); - connections = NULL; - - PG_RETURN_BOOL(true); -} - - - -void recover2PCForDatabaseAll(void) -{ - database_info *cur_db = head_database_info; - while (cur_db) - { - recover2PCForDatabase(cur_db); - cur_db = cur_db->next; - } - //clean_old_2PC_files(); -} - -void recover2PCForDatabase(database_info * db_info) -{ - txn_info *cur_txn; - HASH_SEQ_STATUS status; - HTAB *txn = db_info->all_txn_info; - - hash_seq_init(&status, txn); - while ((cur_txn = (txn_info *) hash_seq_search(&status)) != NULL) - { - recover2PC(cur_txn); - } -} - -bool send_query_clean_transaction(PGXCNodeHandle* conn, txn_info *txn, const char *finish_cmd) -{ -#ifdef __TWO_PHASE_TESTS__ - if (PG_CLEAN_SEND_CLEAN <= twophase_exception_case && - PG_CLEAN_SEND_QUERY >= twophase_exception_case) - { - twophase_in = IN_PG_CLEAN; - } -#endif - if (!GlobalTimestampIsValid(txn->global_commit_timestamp) && - TXN_STATUS_COMMITTED == txn->global_txn_stat && - !txn->is_readonly) - return false; - - if (pgxc_node_send_clean(conn)) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("in pg_clean failed to send pg_clean flag for %s PREPARED command", - TXN_STATUS_COMMITTED == txn->global_txn_stat ? "COMMIT" : "ROLLBACK"))); - return false; - } - if (txn->is_readonly && pgxc_node_send_readonly(conn)) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("in pg_clean failed to send readonly flag for %s PREPARED command", - TXN_STATUS_COMMITTED == txn->global_txn_stat ? "COMMIT" : "ROLLBACK"))); - return false; - } - - if (txn->after_first_phase && pgxc_node_send_after_prepare(conn)) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("in pg_clean failed to send after prepare flag for %s PREPARED command", - TXN_STATUS_COMMITTED == txn->global_txn_stat ? 
"COMMIT" : "ROLLBACK"))); - return false; - } - - /* - * only transaction finished in commit prepared/rollback prepared phase send timestamp - * partial prepared transaction has no need to send other information - */ - if (InvalidGlobalTimestamp != txn->global_commit_timestamp && - pgxc_node_send_global_timestamp(conn, txn->global_commit_timestamp)) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("in pg_clean failed to send global committs for %s PREPARED command", - TXN_STATUS_COMMITTED == txn->global_txn_stat ? "COMMIT" : "ROLLBACK"))); - } - if (!txn->is_readonly) - { - if (InvalidOid != txn->origcoord && pgxc_node_send_starter(conn, get_pgxc_nodename(txn->origcoord))) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("in pg_clean failed to send start node for %s PREPARED command", - TXN_STATUS_COMMITTED == txn->global_txn_stat ? "COMMIT" : "ROLLBACK"))); - } - - if (InvalidTransactionId != txn->startxid && pgxc_node_send_startxid(conn, txn->startxid)) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("in pg_clean failed to send start xid for %s PREPARED command", - TXN_STATUS_COMMITTED == txn->global_txn_stat ? "COMMIT" : "ROLLBACK"))); - } - - if (NULL != txn->participants && pgxc_node_send_partnodes(conn, txn->participants)) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("in pg_clean failed to send participants for %s PREPARED command", - TXN_STATUS_COMMITTED == txn->global_txn_stat ? "COMMIT" : "ROLLBACK"))); - } - } - - if (pgxc_node_send_query(conn, finish_cmd)) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("in pg_clean failed to send query for %s PREPARED command", - TXN_STATUS_COMMITTED == txn->global_txn_stat ? "COMMIT" : "ROLLBACK"))); - return false; - } - return true; -} - -bool check_2pc_belong_node(txn_info * txn) -{ - int node_index = 0; - char node_type; - node_index = find_node_index(abnormal_nodeoid); - if (abnormal_nodeoid == txn->origcoord) - { - txn->belong_abnormal_node = true; - return true; - } - node_type = get_pgxc_nodetype(abnormal_nodeoid); - if (node_type == 'C' && txn->coordparts[node_index] == 1) - { - txn->belong_abnormal_node = true; - return true; - } - if (node_type == 'D' && txn->dnparts[node_index - cn_nodes_num] == 1) - { - txn->belong_abnormal_node = true; - return true; - } - txn->belong_abnormal_node = false; - return false; -} - -bool check_node_participate(txn_info * txn, int node_idx) -{ - char node_type = get_pgxc_nodetype(abnormal_nodeoid); - if (PGXC_NODE_COORDINATOR == node_type) - { - return txn->coordparts[node_idx] == 1 ? true : false; - } else if (PGXC_NODE_DATANODE == node_type) - { - return txn->dnparts[node_idx] == 1 ? 
true : false; - } - return false; -} - -void recover2PC(txn_info * txn) -{ - TXN_STATUS txn_stat; - txn_stat = check_txn_global_status(txn); - txn->global_txn_stat = txn_stat; - -#ifdef DEBUG_EXECABORT - txn_stat = TXN_STATUS_ABORTED; -#endif - - switch (txn_stat) - { - case TXN_STATUS_FAILED: - elog(LOG, "cannot recover 2PC transaction %s for TXN_STATUS_FAILED", txn->gid); - txn->op = UNDO; - txn->op_issuccess = true; - break; - - case TXN_STATUS_UNKNOWN: - elog(LOG, "cannot recover 2PC transaction %s for TXN_STATUS_UNKNOWN", txn->gid); - txn->op = UNDO; - txn->op_issuccess = true; - break; - - case TXN_STATUS_PREPARED: - elog(DEBUG1, "2PC recovery of transaction %s not needed for TXN_STATUS_PREPARED", txn->gid); - txn->op = UNDO; - txn->op_issuccess = true; - break; - - case TXN_STATUS_COMMITTED: - if (InvalidOid == txn->origcoord || txn->is_readonly) - { - txn->op = UNDO; - txn->op_issuccess = true; - } - else - { - txn->op = COMMIT; - /* check whether all nodes can commit prepared */ - if (!clean_2PC_iscommit(txn, true, true)) - { - txn->op_issuccess = false; - elog(LOG, "check commit 2PC transaction %s failed", txn->gid); - return; - } - /* send commit prepared to all nodes */ - if (!clean_2PC_iscommit(txn, true, false)) - { - txn->op_issuccess = false; - elog(LOG, "commit 2PC transaction %s failed", txn->gid); - return; - } - txn->op_issuccess = true; - clean_2PC_files(txn); - } - break; - - case TXN_STATUS_ABORTED: - txn->op = ABORT; - /* check whether all nodes can rollback prepared */ - if (!clean_2PC_iscommit(txn, false, true)) - { - txn->op_issuccess = false; - elog(LOG, "check rollback 2PC transaction %s failed", txn->gid); - return; - } - /* send rollback prepared to all nodes */ - if (!clean_2PC_iscommit(txn, false, false)) - { - txn->op_issuccess = false; - elog(LOG, "rollback 2PC transaction %s failed", txn->gid); - return; - } - txn->op_issuccess = true; - clean_2PC_files(txn); - break; - - case TXN_STATUS_INPROGRESS: - elog(DEBUG1, "2PC recovery of transaction %s not needed for TXN_STATUS_INPROGRESS", txn->gid); - txn->op = UNDO; - txn->op_issuccess = true; - break; - - default: - elog(ERROR, "cannot recover 2PC transaction %s for unkown status", txn->gid); - break; - } - return; -} - -TXN_STATUS check_txn_global_status(txn_info *txn) -{ -#define TXN_PREPARED 0x0001 -#define TXN_COMMITTED 0x0002 -#define TXN_ABORTED 0x0004 -#define TXN_UNKNOWN 0x0008 -#define TXN_INITIAL 0x0010 -#define TXN_INPROGRESS 0X0020 - int ii; - int check_flag = 0; - int node_idx = 0; - TimestampTz prepared_time = 0; - TimestampTz time_gap = clean_time_interval; - - if (!IsXidImplicit(txn->gid) && txn->is_readonly) - { - return TXN_STATUS_COMMITTED; - } - if (txn->global_txn_stat == TXN_STATUS_UNKNOWN) - { - check_flag |= TXN_UNKNOWN; - } - if (txn->global_txn_stat == TXN_STATUS_ABORTED) - { - check_flag |= TXN_ABORTED; - } - - /*check dn participates*/ - for (ii = 0; ii < dn_nodes_num; ii++) - { - if (txn->dnparts[ii] == 1) - { - if (txn->txn_stat[ii + cn_nodes_num] == TXN_STATUS_INITIAL) - check_flag |= TXN_INITIAL; - else if (txn->txn_stat[ii + cn_nodes_num] == TXN_STATUS_UNKNOWN) - check_flag |= TXN_UNKNOWN; - else if (txn->txn_stat[ii + cn_nodes_num] == TXN_STATUS_PREPARED) - { - check_flag |= TXN_PREPARED; - prepared_time = txn->prepare_timestamp[ii + cn_nodes_num] > prepared_time ? 
- txn->prepare_timestamp[ii + cn_nodes_num] : prepared_time; - } - else if (txn->txn_stat[ii + cn_nodes_num] == TXN_STATUS_INPROGRESS) - check_flag |= TXN_INPROGRESS; - else if (txn->txn_stat[ii + cn_nodes_num] == TXN_STATUS_COMMITTED) - check_flag |= TXN_COMMITTED; - else if (txn->txn_stat[ii + cn_nodes_num] == TXN_STATUS_ABORTED) - check_flag |= TXN_ABORTED; - else - return TXN_STATUS_FAILED; - } - } - /*check cn participates*/ - for (ii = 0; ii < cn_nodes_num; ii++) - { - if (txn->coordparts[ii] == 1) - { - if (txn->txn_stat[ii] == TXN_STATUS_INITIAL) - check_flag |= TXN_ABORTED; - else if (txn->txn_stat[ii] == TXN_STATUS_UNKNOWN) - check_flag |= TXN_UNKNOWN; - else if (txn->txn_stat[ii] == TXN_STATUS_PREPARED) - { - check_flag |= TXN_PREPARED; - prepared_time = txn->prepare_timestamp[ii] > prepared_time ? - txn->prepare_timestamp[ii] : prepared_time; - } - else if (txn->txn_stat[ii] == TXN_STATUS_INPROGRESS) - check_flag |= TXN_INPROGRESS; - else if (txn->txn_stat[ii] == TXN_STATUS_COMMITTED) - check_flag |= TXN_COMMITTED; - else if (txn->txn_stat[ii] == TXN_STATUS_ABORTED) - check_flag |= TXN_ABORTED; - else - return TXN_STATUS_FAILED; - } - } - - /* - * first check the prepare timestamp of both implicit and explicit trans within the time_gap or not - * if not, check the commit timestamp explicit trans within the time_gap or not - */ -#if 0 - if ((check_flag & TXN_INPROGRESS) || - (IsXidImplicit(txn->gid) && current_time - prepared_time <= time_gap) || - (!IsXidImplicit(txn->gid) && - ((!txn->after_first_phase && current_time - prepared_time <= time_gap) || - (txn->after_first_phase && - (InvalidGlobalTimestamp != commit_time && - current_time - commit_time <= time_gap))))) - { - /* transaction inprogress */ - return TXN_STATUS_INPROGRESS; - } -#endif - if (clear_2pc_belong_node) - { - node_idx = find_node_index(abnormal_nodeoid); - if (!check_2pc_belong_node(txn) || - !check_node_participate(txn, node_idx) || - abnormal_time < txn->prepare_timestamp[node_idx]) - { - return TXN_STATUS_INPROGRESS; - } - } - else - { - if (check_flag & TXN_INPROGRESS ||current_time - prepared_time <= time_gap) - { - /* transaction inprogress */ - return TXN_STATUS_INPROGRESS; - } - } - - - if (!IsXidImplicit(txn->gid) && txn->after_first_phase && (TXN_PREPARED == check_flag)) - { - return TXN_STATUS_PREPARED; - } - - if (check_flag & TXN_UNKNOWN) - return TXN_STATUS_UNKNOWN; - - if ((check_flag & TXN_COMMITTED) && (check_flag & TXN_ABORTED)) - /* Mix of committed and aborted. This should not happen. */ - return TXN_STATUS_UNKNOWN; - - if ((check_flag & TXN_PREPARED) == 0) - /* Should be at least one "prepared statement" in nodes */ - return TXN_STATUS_FAILED; - - if (check_flag & TXN_COMMITTED) - /* Some 2PC transactions are committed. Need to commit others. */ - return TXN_STATUS_COMMITTED; - /* All the transactions remain prepared. No need to recover. 
*/ - return TXN_STATUS_ABORTED; -} - -bool clean_2PC_iscommit(txn_info *txn, bool is_commit, bool is_check) -{ - int ii; - static const char *STMT_FORM = "%s prepared '%s';"; - static const char *STMT_FORM_CHECK = "%s prepared '%s' for check only;"; - char command[MAX_CMD_LENGTH]; - int node_idx; - Oid node_oid; - PGXCNodeHandle **connections = NULL; - int conn_count = 0; - ResponseCombiner combiner; - PGXCNodeAllHandles *pgxc_handles = NULL; - - if (is_commit) - { - if (is_check) - { - snprintf(command, MAX_CMD_LENGTH, STMT_FORM_CHECK, "commit", txn->gid); - } - else - { - snprintf(command, MAX_CMD_LENGTH, STMT_FORM, "commit", txn->gid); - } - } - else - { - if (is_check) - { - snprintf(command, MAX_CMD_LENGTH, STMT_FORM_CHECK, "rollback", txn->gid); - } - else - { - snprintf(command, MAX_CMD_LENGTH, STMT_FORM, "rollback", txn->gid); - } - } - if (is_commit && InvalidGlobalTimestamp == txn->global_commit_timestamp) - { - elog(ERROR, "twophase transaction '%s' has InvalidGlobalCommitTimestamp", txn->gid); - } - - connections = (PGXCNodeHandle**)palloc(sizeof(PGXCNodeHandle*) * (txn->num_dnparts + txn->num_coordparts)); - if (connections == NULL) - { - ereport(ERROR, - (errcode(ERRCODE_OUT_OF_MEMORY), - errmsg("out of memory for connections"))); - } - get_transaction_handles(&pgxc_handles, txn); - //pgxc_handles = get_handles(nodelist, coordlist, false, true); -#ifdef __TWO_PHASE_TESTS__ - if (PG_CLEAN_SEND_CLEAN <= twophase_exception_case && - PG_CLEAN_ELOG_ERROR >= twophase_exception_case) - { - exception_count = 0; - } -#endif - for (ii = 0; ii < pgxc_handles->dn_conn_count; ii++) - { - node_oid = pgxc_handles->datanode_handles[ii]->nodeoid; - node_idx = find_node_index(node_oid); - if (TXN_STATUS_PREPARED != txn->txn_stat[ node_idx]) - { - continue; - } - /*send global timestamp to dn_node_list[ii]*/ - if (!send_query_clean_transaction(pgxc_handles->datanode_handles[ii], txn, command)) - { - elog(LOG, "pg_clean: send query '%s' from '%s' to '%s' failed ", - command, get_pgxc_nodename(my_nodeoid) , pgxc_handles->datanode_handles[ii]->nodename); - return false; - } - else - { - connections[conn_count++] = pgxc_handles->datanode_handles[ii]; -#ifdef __TWO_PHASE_TESTS__ - if (PG_CLEAN_SEND_CLEAN <= twophase_exception_case && - PG_CLEAN_ELOG_ERROR >= twophase_exception_case) - { - exception_count++; - if (1 == exception_count && - PG_CLEAN_ELOG_ERROR == twophase_exception_case) - { - elog(ERROR, "PG_CLEAN_ELOG_ERROR complish"); - } - } -#endif - } - } - - for (ii = 0; ii < pgxc_handles->co_conn_count; ii++) - { - node_oid = pgxc_handles->coord_handles[ii]->nodeoid; - node_idx = find_node_index(node_oid); - if (TXN_STATUS_PREPARED != txn->txn_stat[ node_idx]) - { - continue; - } - /*send global timestamp to dn_node_list[ii]*/ - if (!send_query_clean_transaction(pgxc_handles->coord_handles[ii], txn, command)) - { - elog(LOG, "pg_clean: send query '%s' from '%s' to '%s' failed ", - command, get_pgxc_nodename(my_nodeoid) , pgxc_handles->coord_handles[ii]->nodename); - return false; - } - else - { - connections[conn_count++] = pgxc_handles->coord_handles[ii]; -#ifdef __TWO_PHASE_TESTS__ - if (PG_CLEAN_SEND_CLEAN <= twophase_exception_case && - PG_CLEAN_ELOG_ERROR >= twophase_exception_case) - { - exception_count++; - if (1 == exception_count && - PG_CLEAN_ELOG_ERROR == twophase_exception_case) - { - elog(ERROR, "PG_CLEAN_ELOG_ERROR complish"); - } - } -#endif - } - - } - - /* receive response */ - if (conn_count) - { - InitResponseCombiner(&combiner, conn_count, COMBINE_TYPE_NONE); - if 
(pgxc_node_receive_responses(conn_count, connections, NULL, &combiner) || - !validate_combiner(&combiner)) - { - if (combiner.errorMessage) - pgxc_node_report_error(&combiner); - else - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Failed to FINISH the transaction on one or more nodes"))); - } - else - CloseCombiner(&combiner); - } - if (enable_distri_print) - { - for (ii = 0; ii < conn_count; ii++) - { - if (DN_CONNECTION_STATE_IDLE != connections[ii]->state) - { - elog(WARNING, "IN pg_clean node:%s invalid stauts:%d", connections[ii]->nodename, connections[ii]->state); - } - } - } - conn_count = 0; - clear_handles(); - pfree_pgxc_all_handles(pgxc_handles); - pgxc_handles = NULL; - - /*last commit or rollback on origcoord if it participate this txn, since after commit the 2pc file is deleted on origcoord*/ - if (txn->origcoord != InvalidOid) - { - node_idx = find_node_index(txn->origcoord); - if (txn->coordparts[node_idx] == 1) - { - /*send global timestamp to dn_node_list[ii]*/ - - if (txn->txn_stat[node_idx] == TXN_STATUS_PREPARED) - { - get_node_handles(&pgxc_handles, txn->origcoord); - if (!send_query_clean_transaction(pgxc_handles->coord_handles[0], txn, command)) - { - elog(LOG, "pg_clean: send query '%s' from %s to %s failed ", - command, get_pgxc_nodename(my_nodeoid) , pgxc_handles->coord_handles[0]->nodename); - return false; - } - else - { - connections[conn_count++] = pgxc_handles->coord_handles[0]; - } - } - } - } - - /* receive response */ - if (conn_count) - { - InitResponseCombiner(&combiner, conn_count, COMBINE_TYPE_NONE); - if (pgxc_node_receive_responses(conn_count, connections, NULL, &combiner) || - !validate_combiner(&combiner)) - { - if (combiner.errorMessage) - pgxc_node_report_error(&combiner); - else - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Failed to FINISH the transaction on one or more nodes"))); - } - else - CloseCombiner(&combiner); - } - /*free hash record from gtm*/ - FinishGIDGTM(txn->gid); - - clear_handles(); - pfree_pgxc_all_handles(pgxc_handles); - pgxc_handles = NULL; - pfree(connections); - connections = NULL; - return true; -} - -bool clean_2PC_files(txn_info * txn) -{ - int ii; - TupleTableSlots result; - bool issuccess = true; - static const char *STMT_FORM = "select pgxc_remove_2pc_records('%s')::text"; - char query[MAX_CMD_LENGTH]; - - snprintf(query, MAX_CMD_LENGTH, STMT_FORM, txn->gid); - - for (ii = 0; ii < dn_nodes_num; ii++) - { - if (execute_query_on_single_node(dn_node_list[ii], query, 1, &result) == (Datum) 1) - { - if (TTSgetvalue(&result, 0, 0) == false) - { - elog(LOG, "pg_clean: delete 2PC file failed of transaction %s on node %s", - txn->gid, get_pgxc_nodename(txn->dnparts[ii])); - issuccess = false; - } - } - else - { - elog(LOG, "pg_clean: failed clean 2pc file of transaction %s on node %s", txn->gid, get_pgxc_nodename(dn_node_list[ii])); - issuccess = false; - } - DropTupleTableSlots(&result); - if (!issuccess) - return false; - } - - for (ii = 0; ii < cn_nodes_num; ii++) - { - if (execute_query_on_single_node(cn_node_list[ii], query, 1, &result) == (Datum) 1) - { - if (TTSgetvalue(&result, 0, 0) == false) - { - elog(LOG, "Error:delete 2PC file failed of transaction %s on node %s", - txn->gid, get_pgxc_nodename(txn->coordparts[ii])); - issuccess = false; - } - } - else - { - elog(LOG, "pg_clean: failed clean 2pc file of transaction %s on node %s", txn->gid, get_pgxc_nodename(cn_node_list[ii])); - issuccess = false; - } - DropTupleTableSlots(&result); - if (!issuccess) - return false; - } - 
return true; -} - -void Init_print_txn_info(print_txn_info * print_txn) -{ - database_info *cur_database = head_database_info; - txn_info *cur_txn; - HASH_SEQ_STATUS status; - HTAB *txn; - - print_txn->index = 0; - INIT(print_txn->txn); - - for (; cur_database; cur_database = cur_database->next) - { - txn = cur_database->all_txn_info; - hash_seq_init(&status, txn); - while ((cur_txn = (txn_info *) hash_seq_search(&status)) != NULL) - { - if (clear_2pc_belong_node && !cur_txn->belong_abnormal_node) - { - continue; - } - if (cur_txn->global_txn_stat != TXN_STATUS_INPROGRESS) - PALLOC(print_txn->txn, cur_txn); - } - -#if 0 - cur_txn = cur_database->head_txn_info; - for (; cur_txn; cur_txn = cur_txn->next) - { - if (cur_txn->global_txn_stat != TXN_STATUS_INPROGRESS) - PALLOC(print_txn->txn, cur_txn); - } -#endif - } -} - -void Init_print_stats_all(print_status *pstatus) -{ - database_info *cur_database; - txn_info *cur_txn; - HASH_SEQ_STATUS status; - HTAB *txn; - - pstatus->index = 0; - pstatus->count = 0; - INIT(pstatus->gid); - INIT(pstatus->global_status); - INIT(pstatus->status); - INIT(pstatus->database); - - for (cur_database = head_database_info; cur_database; cur_database = cur_database->next) - { - txn = cur_database->all_txn_info; - hash_seq_init(&status, txn); - while ((cur_txn = (txn_info *) hash_seq_search(&status)) != NULL) - { - cur_txn->global_txn_stat = check_txn_global_status(cur_txn); - if (cur_txn->global_txn_stat != TXN_STATUS_INPROGRESS) - Init_print_stats(cur_txn, cur_database->database_name, pstatus); - } -#if 0 - for (cur_txn = cur_database->head_txn_info; cur_txn; cur_txn = cur_txn->next) - { - cur_txn->global_txn_stat = check_txn_global_status(cur_txn); - if (cur_txn->global_txn_stat != TXN_STATUS_INPROGRESS) - Init_print_stats(cur_txn, cur_database->database_name, pstatus); - } -#endif - } -} - -void Init_print_stats(txn_info *txn, char *database, print_status * pstatus) -{ - int ii; - StringInfoData query; - initStringInfo(&query); - - RPALLOC(pstatus->gid); - RPALLOC(pstatus->global_status); - RPALLOC(pstatus->status); - RPALLOC(pstatus->database); - - pstatus->gid[pstatus->count] = (char *)palloc0(100 * sizeof(char)); - pstatus->database[pstatus->count] = (char *)palloc0(100 * sizeof(char)); - pstatus->global_status[pstatus->count] = (char *)palloc0(100 * sizeof(char)); - - strncpy(pstatus->gid[pstatus->count], txn->gid, 100); - strncpy(pstatus->database[pstatus->count], database, 100); - strncpy(pstatus->global_status[pstatus->count], txn_status_to_string(check_txn_global_status(txn)), 100); - - for (ii = 0; ii < pgxc_clean_node_count; ii++) - { - appendStringInfo(&query, "%-12s:%-15s", get_pgxc_nodename(find_node_oid(ii)), - txn_status_to_string(txn->txn_stat[ii])); - if (ii < pgxc_clean_node_count - 1) - { - appendStringInfoChar(&query, '\n'); - } - } - - pstatus->status[pstatus->count] = (char *)palloc0((strlen(query.data)+1) * sizeof(char)); - strncpy(pstatus->status[pstatus->count], query.data, strlen(query.data)+1); - pstatus->gid_count++; - pstatus->database_count++; - pstatus->global_status_count++; - pstatus->status_count++; - pstatus->count++; -} - -static const char *txn_status_to_string(TXN_STATUS status) -{ - switch (status) - { - ENUM_TOCHAR_CASE(TXN_STATUS_INITIAL) - ENUM_TOCHAR_CASE(TXN_STATUS_UNKNOWN) - ENUM_TOCHAR_CASE(TXN_STATUS_PREPARED) - ENUM_TOCHAR_CASE(TXN_STATUS_COMMITTED) - ENUM_TOCHAR_CASE(TXN_STATUS_ABORTED) - ENUM_TOCHAR_CASE(TXN_STATUS_INPROGRESS) - ENUM_TOCHAR_CASE(TXN_STATUS_FAILED) - } - return NULL; -} - -static const char 
*txn_op_to_string(OPERATION op) -{ - switch (op) - { - ENUM_TOCHAR_CASE(UNDO) - ENUM_TOCHAR_CASE(ABORT) - ENUM_TOCHAR_CASE(COMMIT) - } - return NULL; -} - - -static void -CheckFirstPhase(txn_info *txn) -{ -// int ret; - Oid orignode = txn->origcoord; - uint32 startxid = txn->startxid; -// uint32 transactionid; - int nodeidx; - - /* - * if the twophase trans does not success in prepare phase, the orignode == InvalidOid. - */ - if (InvalidOid == orignode) - { - return; - } - nodeidx = find_node_index(orignode); - if (0 == txn->xid[nodeidx]) - { - txn->xid[nodeidx] = startxid; - } - /* start node participate */ - if (txn->isorigcoord_part) - { - if (0 == txn->coordparts[nodeidx]) - { - txn->coordparts[nodeidx] = 1; - txn->num_coordparts++; - } - if (txn->txn_stat[nodeidx] == TXN_STATUS_INITIAL) - { - /*select * from pgxc_is_committed...*/ - getTxnStatus(txn, nodeidx); - } - if (txn->txn_stat[nodeidx] == TXN_STATUS_PREPARED && txn->global_commit_timestamp != InvalidGlobalTimestamp) - { - txn->after_first_phase = true; - } - } - /* start node node participate */ - else - { -#if 0 - ret = Get2PCFile(orignode, txn->gid, &transactionid); - if (ret == FILENOTFOUND) - txn->after_first_phase = false; - else if (ret == FILEUNKOWN) - txn->global_txn_stat = TXN_STATUS_UNKNOWN; - else if (ret == FILEFOUND && txn->global_commit_timestamp != InvalidGlobalTimestamp) - txn->after_first_phase = true; -#endif - if (txn->global_commit_timestamp != InvalidGlobalTimestamp) - { - txn->after_first_phase = true; - } else { - txn->after_first_phase = false; - } - } -} - -void get_transaction_handles(PGXCNodeAllHandles **pgxc_handles, txn_info *txn) -{ - int dn_index = 0; - int cn_index = 0; - int nodeIndex; - char nodetype; - List *coordlist = NIL; - List *nodelist = NIL; - - while (dn_index < dn_nodes_num) - { - - /* Get node type and index */ - nodetype = PGXC_NODE_NONE; - if (TXN_STATUS_PREPARED != txn->txn_stat[dn_index + cn_nodes_num]) - { - dn_index++; - continue; - } - nodeIndex = PGXCNodeGetNodeIdFromName(get_pgxc_nodename(dn_node_list[dn_index]), &nodetype); - if (nodetype == PGXC_NODE_NONE) - ereport(ERROR, - (errcode(ERRCODE_UNDEFINED_OBJECT), - errmsg("PGXC Node %s: object not defined", - get_pgxc_nodename(dn_node_list[dn_index])))); - - /* Check if node is requested is the self-node or not */ - if (nodetype == PGXC_NODE_DATANODE) - { - nodelist = lappend_int(nodelist, nodeIndex); - } - dn_index++; - - } - - while (cn_index < cn_nodes_num) - { - /* Get node type and index */ - nodetype = PGXC_NODE_NONE; - if (TXN_STATUS_PREPARED != txn->txn_stat[cn_index] || cn_node_list[cn_index] == txn->origcoord) - { - cn_index++; - continue; - } - nodeIndex = PGXCNodeGetNodeIdFromName(get_pgxc_nodename(cn_node_list[cn_index]), &nodetype); - if (nodetype == PGXC_NODE_NONE) - ereport(ERROR, - (errcode(ERRCODE_UNDEFINED_OBJECT), - errmsg("PGXC Node %s: object not defined", - get_pgxc_nodename(cn_node_list[cn_index])))); - - /* Check if node is requested is the self-node or not */ - if (nodetype == PGXC_NODE_COORDINATOR) - { - coordlist = lappend_int(coordlist, nodeIndex); - } - cn_index++; - } - *pgxc_handles = get_handles(nodelist, coordlist, false, true, true); -} - -void get_node_handles(PGXCNodeAllHandles **pgxc_handles, Oid nodeoid) -{ - char nodetype = PGXC_NODE_NONE; - int nodeIndex; - List *coordlist = NIL; - List *nodelist = NIL; - - nodeIndex = PGXCNodeGetNodeIdFromName(get_pgxc_nodename(nodeoid), &nodetype); - if (nodetype == PGXC_NODE_COORDINATOR) - { - coordlist = lappend_int(coordlist, nodeIndex); - } - 
else - { - nodelist = lappend_int(nodelist, nodeIndex); - } - *pgxc_handles = get_handles(nodelist, coordlist, false, true, true); -} - diff --git a/src/test/regress/expected/create_table.out b/src/test/regress/expected/create_table.out index 26d364a5..2cf920b9 100644 --- a/src/test/regress/expected/create_table.out +++ b/src/test/regress/expected/create_table.out @@ -333,7 +333,7 @@ ERROR: cannot use constant expression as partition key DROP FUNCTION const_func(); -- only accept valid partitioning strategy CREATE TABLE partitioned ( - a int + a int ) PARTITION BY MAGIC (a); ERROR: unrecognized partitioning strategy "magic" -- specified column must be present in the table @@ -427,10 +427,10 @@ Partition key: RANGE (a oid_ops, plusone(b), c, d COLLATE "C") Number of partitions: 0 \d+ partitioned2 - Table "public.partitioned2" - Column | Type | Collation | Nullable | Default | Storage | Stats target | Description + Table "public.partitioned2" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description --------+---------+-----------+----------+---------+----------+--------------+------------- - a | integer | | | | plain | | + a | integer | | | | plain | | b | text | | | | extended | | Partition key: RANGE (((a + 1)), substr(b, 1, 5)) Number of partitions: 0 diff --git a/src/test/regress/expected/inherit_3.out b/src/test/regress/expected/inherit_3.out index 251ee257..f845d5fc 100644 --- a/src/test/regress/expected/inherit_3.out +++ b/src/test/regress/expected/inherit_3.out @@ -876,15 +876,13 @@ update mlparted_tab mlp set c = 'xxx' from (select a from some_tab union all select a+1 from some_tab) ss (a) where (mlp.a = ss.a and mlp.b = 'b') or mlp.a = 3; -ERROR: could not plan this distributed update -DETAIL: correlated UPDATE or updating distribution column currently not supported in Postgres-XL. select tableoid::regclass::text as relname, mlparted_tab.* from mlparted_tab order by 1,2; - relname | a | b | c ----------------------+---+---+--- + relname | a | b | c +---------------------+---+---+----- mlparted_tab_part1 | 1 | a | mlparted_tab_part2a | 2 | a | - mlparted_tab_part2b | 2 | b | - mlparted_tab_part3 | 3 | a | + mlparted_tab_part2b | 2 | b | xxx + mlparted_tab_part3 | 3 | a | xxx (4 rows) drop table mlparted_tab; @@ -1022,18 +1020,18 @@ select NULL::derived::base; -- remove redundant conversions. 
explain (verbose on, costs off) select row(i, b)::more_derived::derived::base from more_derived; - QUERY PLAN + QUERY PLAN ---------------------------------------------------------------------------------------------- Remote Fast Query Execution Output: ((ROW(more_derived.i, more_derived.b)::more_derived)::derived)::base Node/s: datanode_1, datanode_2 Remote query: SELECT ((ROW(i, b)::more_derived)::derived)::base AS "row" FROM more_derived -> Seq Scan on public.more_derived - Output: (ROW(i, b)::more_derived)::base + Output: (ROW(i, b)::more_derived)::base (6 rows) explain (verbose on, costs off) select (1, 2)::more_derived::derived::base; - QUERY PLAN + QUERY PLAN ------------------------------------------- Result Output: (ROW(1, 2)::more_derived)::base @@ -2199,7 +2197,7 @@ create table mcrparted3 partition of mcrparted for values from (11, 1, 1) to (20 create table mcrparted4 partition of mcrparted for values from (20, 10, 10) to (20, 20, 20); create table mcrparted5 partition of mcrparted for values from (20, 20, 20) to (maxvalue, maxvalue, maxvalue); explain (costs off) select * from mcrparted where a = 0; -- scans mcrparted0, mcrparted_def - QUERY PLAN + QUERY PLAN --------------------------------------- Remote Fast Query Execution Node/s: datanode_2 @@ -2237,7 +2235,7 @@ explain (costs off) select * from mcrparted where a = 10 and abs(b) = 5; -- scan (9 rows) explain (costs off) select * from mcrparted where abs(b) = 5; -- scans all partitions - QUERY PLAN + QUERY PLAN --------------------------------------- Remote Fast Query Execution Node/s: datanode_1, datanode_2 diff --git a/src/test/regress/expected/join_3.out b/src/test/regress/expected/join_3.out index 7a70d26a..5b7dfb96 100644 --- a/src/test/regress/expected/join_3.out +++ b/src/test/regress/expected/join_3.out @@ -3844,8 +3844,8 @@ where t1.f1 = ss.f1; -> Seq Scan on public.int8_tbl i8 Output: i8.q1, i8.q2 Filter: (i8.q2 = 123) - -> Limit - Output: (i8.q1), t2.f1 + -> Limit + Output: (i8.q1), t2.f1 -> Materialize Output: (i8.q1), t2.f1 -> Remote Subquery Scan on all (datanode_1,datanode_2) @@ -3875,7 +3875,7 @@ select * from lateral (select i8.q1, t2.f1 from text_tbl t2 limit 1) as ss1, lateral (select ss1.* from text_tbl t3 limit 1) as ss2 where t1.f1 = ss2.f1; - QUERY PLAN + QUERY PLAN ----------------------------------------------------------------------------- Nested Loop Output: t1.f1, i8.q1, i8.q2, (i8.q1), t2.f1, ((i8.q1)), (t2.f1) @@ -3889,10 +3889,10 @@ where t1.f1 = ss2.f1; -> Seq Scan on public.int8_tbl i8 Output: i8.q1, i8.q2 Filter: (i8.q2 = 123) - -> Nested Loop - Output: (i8.q1), t2.f1, ((i8.q1)), (t2.f1) - -> Limit - Output: (i8.q1), t2.f1 + -> Nested Loop + Output: (i8.q1), t2.f1, ((i8.q1)), (t2.f1) + -> Limit + Output: (i8.q1), t2.f1 -> Materialize Output: (i8.q1), t2.f1 -> Remote Subquery Scan on all (datanode_1,datanode_2) @@ -3901,16 +3901,16 @@ where t1.f1 = ss2.f1; Output: (i8.q1), t2.f1 -> Seq Scan on public.text_tbl t2 Output: i8.q1, t2.f1 - -> Limit - Output: ((i8.q1)), (t2.f1) + -> Limit + Output: ((i8.q1)), (t2.f1) -> Materialize Output: ((i8.q1)), (t2.f1) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Output: (i8.q1), t2.f1 - -> Limit - Output: ((i8.q1)), (t2.f1) - -> Seq Scan on public.text_tbl t3 - Output: (i8.q1), t2.f1 + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: (i8.q1), t2.f1 + -> Limit + Output: ((i8.q1)), (t2.f1) + -> Seq Scan on public.text_tbl t3 + Output: (i8.q1), t2.f1 (34 rows) select * from @@ -3962,11 +3962,11 @@ where tt1.f1 = ss1.c0; -> Seq 
Scan on public.text_tbl tt4 Output: tt4.f1 Filter: (tt4.f1 = 'foo'::text) - -> Subquery Scan on ss1 - Output: ss1.c0 - Filter: (ss1.c0 = 'foo'::text) - -> Limit - Output: (tt4.f1) + -> Subquery Scan on ss1 + Output: ss1.c0 + Filter: (ss1.c0 = 'foo'::text) + -> Limit + Output: (tt4.f1) -> Materialize Output: (tt4.f1) -> Remote Subquery Scan on all (datanode_1,datanode_2) @@ -4026,8 +4026,8 @@ where ss1.c2 = 0; Output: i42.f1 -> Seq Scan on public.int4_tbl i42 Output: i42.f1 - -> Limit - Output: (i41.f1), (i8.q1), (i8.q2), (i42.f1), (i43.f1), ((42)) + -> Limit + Output: (i41.f1), (i8.q1), (i8.q2), (i42.f1), (i43.f1), ((42)) -> Materialize Output: (i41.f1), (i8.q1), (i8.q2), (i42.f1), (i43.f1), ((42)) -> Remote Subquery Scan on all (datanode_1,datanode_2) @@ -4065,13 +4065,13 @@ select * from Nested Loop Left Join Join Filter: ((1) = COALESCE((1))) -> Result - -> Hash Full Join - Hash Cond: (a1.unique1 = (1)) + -> Hash Full Join + Hash Cond: (a1.unique1 = (1)) -> Materialize -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Seq Scan on tenk1 a1 - -> Hash - -> Result + -> Hash + -> Result (10 rows) select * from @@ -4614,8 +4614,8 @@ select *, (select r from (select q1 as q2) x, (select q2 as r) y) from int8_tbl; ------------------+-------------------+------------------- 123 | 456 | 456 123 | 4567890123456789 | 4567890123456789 - 4567890123456789 | 123 | 123 4567890123456789 | 4567890123456789 | 4567890123456789 + 4567890123456789 | 123 | 123 4567890123456789 | -4567890123456789 | -4567890123456789 (5 rows) @@ -4624,8 +4624,8 @@ select *, (select r from (select q1 as q2) x, lateral (select q2 as r) y) from i ------------------+-------------------+------------------ 123 | 456 | 123 123 | 4567890123456789 | 123 - 4567890123456789 | 123 | 4567890123456789 4567890123456789 | 4567890123456789 | 4567890123456789 + 4567890123456789 | 123 | 4567890123456789 4567890123456789 | -4567890123456789 | 4567890123456789 (5 rows) @@ -4929,13 +4929,13 @@ select * from ------------------+-------------------+------------------+-------------------+------------------+------------------+------------------- 123 | 456 | | | 123 | | 123 | 4567890123456789 | 4567890123456789 | -4567890123456789 | 123 | 4567890123456789 | -4567890123456789 - 123 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 | 4567890123456789 123 | 4567890123456789 | 4567890123456789 | 123 | 123 | 4567890123456789 | 123 - 4567890123456789 | 123 | 123 | 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 - 4567890123456789 | 123 | 123 | 456 | 4567890123456789 | 123 | 456 + 123 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 | 4567890123456789 4567890123456789 | 4567890123456789 | 4567890123456789 | -4567890123456789 | 4567890123456789 | 4567890123456789 | -4567890123456789 - 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 | 4567890123456789 | 123 + 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 + 4567890123456789 | 123 | 123 | 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 + 4567890123456789 | 123 | 123 | 456 | 4567890123456789 | 123 | 456 4567890123456789 | -4567890123456789 | | | 4567890123456789 | | (10 rows) @@ -4946,13 +4946,13 @@ select * from 
------------------+-------------------+------------------+-------------------+------------------+------------------+------------------- 123 | 456 | | | 123 | | 123 | 4567890123456789 | 4567890123456789 | -4567890123456789 | 123 | 4567890123456789 | -4567890123456789 - 123 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 | 4567890123456789 123 | 4567890123456789 | 4567890123456789 | 123 | 123 | 4567890123456789 | 123 - 4567890123456789 | 123 | 123 | 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 - 4567890123456789 | 123 | 123 | 456 | 4567890123456789 | 123 | 456 + 123 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 | 4567890123456789 4567890123456789 | 4567890123456789 | 4567890123456789 | -4567890123456789 | 4567890123456789 | 4567890123456789 | -4567890123456789 - 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 | 4567890123456789 | 123 + 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 + 4567890123456789 | 123 | 123 | 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 + 4567890123456789 | 123 | 123 | 456 | 4567890123456789 | 123 | 456 4567890123456789 | -4567890123456789 | | | 4567890123456789 | | (10 rows) @@ -4965,11 +4965,11 @@ select x.* from 123 | 4567890123456789 123 | 4567890123456789 123 | 4567890123456789 - 4567890123456789 | 123 - 4567890123456789 | 123 4567890123456789 | 4567890123456789 4567890123456789 | 4567890123456789 4567890123456789 | 4567890123456789 + 4567890123456789 | 123 + 4567890123456789 | 123 4567890123456789 | -4567890123456789 (10 rows) @@ -5086,14 +5086,14 @@ select * from q1 | q2 | q1 | q2 | x ------------------+-------------------+------------------+-------------------+------------------ 123 | 456 | | | - 123 | 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 123 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 + 123 | 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 123 | 4567890123456789 | 4567890123456789 | -4567890123456789 | 4567890123456789 - 4567890123456789 | 123 | 123 | 456 | 123 - 4567890123456789 | 123 | 123 | 4567890123456789 | 123 - 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 + 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 4567890123456789 | 4567890123456789 | 4567890123456789 | -4567890123456789 | 4567890123456789 + 4567890123456789 | 123 | 123 | 456 | 123 + 4567890123456789 | 123 | 123 | 4567890123456789 | 123 4567890123456789 | -4567890123456789 | | | (10 rows) @@ -5107,14 +5107,14 @@ select * from q1 | q2 | q1 | q2 | x ------------------+-------------------+------------------+-------------------+------------------ 123 | 456 | | | - 123 | 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 123 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 + 123 | 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 123 | 4567890123456789 | 4567890123456789 | -4567890123456789 | 4567890123456789 - 4567890123456789 | 123 | 123 | 456 | 123 - 4567890123456789 | 123 | 123 | 4567890123456789 | 123 - 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 
4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 + 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 4567890123456789 | 4567890123456789 | 4567890123456789 | -4567890123456789 | 4567890123456789 + 4567890123456789 | 123 | 123 | 456 | 123 + 4567890123456789 | 123 | 123 | 4567890123456789 | 123 4567890123456789 | -4567890123456789 | | | (10 rows) @@ -5284,16 +5284,6 @@ select * from 123 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 123 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 123 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 - 4567890123456789 | 123 | 123 | 123 | 123 - 4567890123456789 | 123 | 123 | 123 | 123 - 4567890123456789 | 123 | 123 | 4567890123456789 | 123 - 4567890123456789 | 123 | 123 | 4567890123456789 | 123 - 4567890123456789 | 123 | 123 | 4567890123456789 | 123 - 4567890123456789 | 123 | 123 | 123 | 123 - 4567890123456789 | 123 | 123 | 123 | 123 - 4567890123456789 | 123 | 123 | 4567890123456789 | 123 - 4567890123456789 | 123 | 123 | 4567890123456789 | 123 - 4567890123456789 | 123 | 123 | 4567890123456789 | 123 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 | 123 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 | 123 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 @@ -5309,6 +5299,16 @@ select * from 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 + 4567890123456789 | 123 | 123 | 123 | 123 + 4567890123456789 | 123 | 123 | 123 | 123 + 4567890123456789 | 123 | 123 | 4567890123456789 | 123 + 4567890123456789 | 123 | 123 | 4567890123456789 | 123 + 4567890123456789 | 123 | 123 | 4567890123456789 | 123 + 4567890123456789 | 123 | 123 | 123 | 123 + 4567890123456789 | 123 | 123 | 123 | 123 + 4567890123456789 | 123 | 123 | 4567890123456789 | 123 + 4567890123456789 | 123 | 123 | 4567890123456789 | 123 + 4567890123456789 | 123 | 123 | 4567890123456789 | 123 4567890123456789 | -4567890123456789 | | | (42 rows) @@ -5523,8 +5523,8 @@ lateral (select * from int8_tbl t1, where t1.q1 = ss.q2) ss0; id | q1 | q2 | q1 | q2 ----+------------------+-------------------+------------------+------------------ - 0 | 4567890123456789 | 123 | 4567890123456789 | 4567890123456789 0 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 + 0 | 4567890123456789 | 123 | 4567890123456789 | 4567890123456789 0 | 4567890123456789 | -4567890123456789 | 4567890123456789 | 4567890123456789 (3 rows) @@ -5647,6 +5647,73 @@ SELECT count(*) FROM testr WHERE NOT EXISTS (SELECT * FROM testh WHERE testr.b = 3000 (1 row) +-- +-- test LATERAL reference propagation down a multi-level inheritance hierarchy +-- produced for a multi-level partitioned table hierarchy. 
+-- +create table pt1 (a int, b int, c varchar) partition by range(a); +create table pt1p1 partition of pt1 for values from (0) to (100) partition by range(b); +create table pt1p2 partition of pt1 for values from (100) to (200); +create table pt1p1p1 partition of pt1p1 for values from (0) to (100); +insert into pt1 values (1, 1, 'x'), (101, 101, 'y'); +create table ut1 (a int, b int, c varchar); +insert into ut1 values (101, 101, 'y'), (2, 2, 'z'); +explain (verbose, costs off) +select t1.b, ss.phv from ut1 t1 left join lateral + (select t2.a as t2a, t3.a t3a, least(t1.a, t2.a, t3.a) phv + from pt1 t2 join ut1 t3 on t2.a = t3.b) ss + on t1.a = ss.t2a order by t1.a; + QUERY PLAN +----------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + Output: t1.b, LEAST(t1.a, a, t3.a), t1.a + Sort Key: t1.a + -> Sort + Output: t1.b, (LEAST(t1.a, a, t3.a)), t1.a + Sort Key: t1.a + -> Nested Loop Left Join + Output: t1.b, (LEAST(t1.a, a, t3.a)), t1.a + -> Seq Scan on public.ut1 t1 + Output: t1.a, t1.b, t1.c + -> Materialize + Output: a, (LEAST(t1.a, a, t3.a)) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: a, LEAST(t1.a, a, t3.a) + Distribute results by H: a + -> Nested Loop + Output: a, LEAST(t1.a, a, t3.a) + Join Filter: (a = t3.b) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: t3.b, t3.a + Distribute results by H: b + -> Seq Scan on public.ut1 t3 + Output: t3.b, t3.a + -> Materialize + Output: a + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: a + Distribute results by H: a + -> Append + -> Seq Scan on public.pt1p1p1 t2 + Output: t2.a + Filter: (t1.a = t2.a) + -> Seq Scan on public.pt1p2 t2_1 + Output: t2_1.a + Filter: (t1.a = t2_1.a) +(35 rows) + +select t1.b, ss.phv from ut1 t1 left join lateral + (select t2.a as t2a, t3.a t3a, least(t1.a, t2.a, t3.a) phv + from pt1 t2 join ut1 t3 on t2.a = t3.b) ss + on t1.a = ss.t2a order by t1.a; + b | phv +-----+----- + 2 | + 101 | 101 +(2 rows) + +drop table pt1; +drop table ut1; -- -- test that foreign key join estimation performs sanely for outer joins -- diff --git a/src/test/regress/expected/limit.out b/src/test/regress/expected/limit.out index 68ff5e10..61a3f53e 100644 --- a/src/test/regress/expected/limit.out +++ b/src/test/regress/expected/limit.out @@ -523,7 +523,7 @@ select sum(tenthous) as s1, sum(tenthous) + random()*0 as s2 -> Remote Subquery Scan on all (datanode_1,datanode_2) Output: thousand, PARTIAL sum(tenthous) Distribute results by H: thousand - -> Partial GroupAggregate + -> Partial HashAggregate Output: thousand, PARTIAL sum(tenthous) Group Key: tenk1.thousand -> Index Only Scan using tenk1_thous_tenthous on public.tenk1 diff --git a/src/test/regress/expected/partition_join_2.out b/src/test/regress/expected/partition_join_2.out index 8a414251..2ae2b8a2 100644 --- a/src/test/regress/expected/partition_join_2.out +++ b/src/test/regress/expected/partition_join_2.out @@ -187,25 +187,25 @@ SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT 50 phv, * FROM prt1 WHERE prt1.b = 0) -- Join with pruned partitions from joining relations EXPLAIN (COSTS OFF) SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1, prt2 t2 WHERE t1.a = t2.b AND t1.a < 450 AND t2.b > 250 AND t1.b = 0 ORDER BY t1.a, t2.b; - QUERY PLAN ------------------------------------------------------------------------ + QUERY PLAN +----------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) 
-> Sort Sort Key: a - -> Nested Loop - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Append - -> Seq Scan on prt1_p1 t1 - Filter: ((a < 450) AND (b = 0)) - -> Seq Scan on prt1_p2 t1_1 - Filter: ((a < 450) AND (b = 0)) + -> Hash Join + Hash Cond: (t2.b = a) -> Append - -> Index Scan using iprt2_p2_b on prt2_p2 t2 - Index Cond: ((b = a) AND (b > 250)) - -> Bitmap Heap Scan on prt2_p3 t2_1 - Recheck Cond: ((b = a) AND (b > 250)) - -> Bitmap Index Scan on iprt2_p3_b - Index Cond: ((b = a) AND (b > 250)) + -> Seq Scan on prt2_p2 t2 + Filter: (b > 250) + -> Seq Scan on prt2_p3 t2_1 + Filter: (b > 250) + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on prt1_p1 t1 + Filter: ((a < 450) AND (b = 0)) + -> Seq Scan on prt1_p2 t1_1 + Filter: ((a < 450) AND (b = 0)) (17 rows) SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1, prt2 t2 WHERE t1.a = t2.b AND t1.a < 450 AND t2.b > 250 AND t1.b = 0 ORDER BY t1.a, t2.b; @@ -304,10 +304,11 @@ SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t2.b FROM prt2 t2 WHERE t2.a = 0) QUERY PLAN -------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) - -> Sort - Sort Key: t1.a - -> Nested Loop - -> Remote Subquery Scan on all (datanode_2) + -> Merge Join + Merge Cond: (b = t1.a) + -> Remote Subquery Scan on all (datanode_2) + -> Sort + Sort Key: b -> HashAggregate Group Key: b -> Remote Subquery Scan on all (datanode_2) @@ -321,15 +322,14 @@ SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t2.b FROM prt2 t2 WHERE t2.a = 0) Filter: (a = 0) -> Seq Scan on prt2_p3 t2_2 Filter: (a = 0) + -> Sort + Sort Key: t1.a -> Append - -> Index Scan using iprt1_p1_a on prt1_p1 t1 - Index Cond: (a = b) + -> Seq Scan on prt1_p1 t1 Filter: (b = 0) - -> Index Scan using iprt1_p2_a on prt1_p2 t1_1 - Index Cond: (a = b) + -> Seq Scan on prt1_p2 t1_1 Filter: (b = 0) - -> Index Scan using iprt1_p3_a on prt1_p3 t1_2 - Index Cond: (a = b) + -> Seq Scan on prt1_p3 t1_2 Filter: (b = 0) (28 rows) @@ -378,24 +378,28 @@ SELECT * FROM prt1 t1 LEFT JOIN LATERAL QUERY PLAN -------------------------------------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) - -> Gather Merge - Workers Planned: 1 - -> Sort - Sort Key: a - -> Parallel Nested Loop Left Join - -> Parallel Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: a + -> Nested Loop Left Join + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Append + -> Seq Scan on prt1_p1 t1 + Filter: (b = 0) + -> Seq Scan on prt1_p2 t1_1 + Filter: (b = 0) + -> Seq Scan on prt1_p3 t1_2 + Filter: (b = 0) + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) Distribute results by H: a - -> Parallel Append - -> Parallel Seq Scan on prt1_p1 t1 - Filter: (b = 0) - -> Parallel Seq Scan on prt1_p2 t1_1 - Filter: (b = 0) - -> Parallel Seq Scan on prt1_p3 t1_2 - Filter: (b = 0) - -> Materialize - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: a - -> Nested Loop + -> Hash Join + Hash Cond: (t3.b = a) + -> Append + -> Index Scan using iprt2_p1_b on prt2_p1 t3 + -> Index Scan using iprt2_p2_b on prt2_p2 t3_1 + -> Index Scan using iprt2_p3_b on prt2_p3 t3_2 + -> Hash -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Append -> Index Only Scan using iprt1_p1_a on prt1_p1 t2 @@ -404,16 +408,7 @@ SELECT * FROM prt1 t1 LEFT JOIN LATERAL Index Cond: (a = a) -> Index Only 
Scan using iprt1_p3_a on prt1_p3 t2_2 Index Cond: (a = a) - -> Append - -> Index Scan using iprt2_p1_b on prt2_p1 t3 - Index Cond: (b = a) - -> Index Scan using iprt2_p2_b on prt2_p2 t3_1 - Index Cond: (b = a) - -> Bitmap Heap Scan on prt2_p3 t3_2 - Recheck Cond: (b = a) - -> Bitmap Index Scan on iprt2_p3_b - Index Cond: (b = a) -(36 rows) +(31 rows) SELECT * FROM prt1 t1 LEFT JOIN LATERAL (SELECT t2.a AS t2a, t3.a AS t3a, least(t1.a,t2.a,t3.b) FROM prt1 t2 JOIN prt2 t3 ON (t2.a = t3.b)) ss @@ -438,8 +433,8 @@ EXPLAIN (COSTS OFF) SELECT t1.a, ss.t2a, ss.t2c FROM prt1 t1 LEFT JOIN LATERAL (SELECT t2.a AS t2a, t3.a AS t3a, t2.b t2b, t2.c t2c, least(t1.a,t2.a,t3.b) FROM prt1 t2 JOIN prt2 t3 ON (t2.a = t3.b)) ss ON t1.c = ss.t2c WHERE (t1.b + coalesce(ss.t2b, 0)) = 0 ORDER BY t1.a; - QUERY PLAN ------------------------------------------------------------------------------------------------ + QUERY PLAN +----------------------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) -> Sort Sort Key: a @@ -456,17 +451,17 @@ SELECT t1.a, ss.t2a, ss.t2c FROM prt1 t1 LEFT JOIN LATERAL -> Remote Subquery Scan on all (datanode_1,datanode_2) Distribute results by H: c -> Hash Join - Hash Cond: (t2.a = b) - -> Append - -> Seq Scan on prt1_p1 t2 - -> Seq Scan on prt1_p2 t2_1 - -> Seq Scan on prt1_p3 t2_2 + Hash Cond: (b = t2.a) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on prt2_p1 t3 + -> Seq Scan on prt2_p2 t3_1 + -> Seq Scan on prt2_p3 t3_2 -> Hash - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Append - -> Seq Scan on prt2_p1 t3 - -> Seq Scan on prt2_p2 t3_1 - -> Seq Scan on prt2_p3 t3_2 + -> Append + -> Seq Scan on prt1_p1 t2 + -> Seq Scan on prt1_p2 t2_1 + -> Seq Scan on prt1_p3 t2_2 (27 rows) SELECT t1.a, ss.t2a, ss.t2c FROM prt1 t1 LEFT JOIN LATERAL @@ -748,8 +743,8 @@ SELECT t1.a, t1.phv, t2.b, t2.phv, t3.a + t3.b, t3.phv FROM ((SELECT 50 phv, * F -- Semi-join EXPLAIN (COSTS OFF) SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t1.b FROM prt2 t1, prt1_e t2 WHERE t1.a = 0 AND t1.b = (t2.a + t2.b)/2) AND t1.b = 0 ORDER BY t1.a; - QUERY PLAN ---------------------------------------------------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------------------------------------------------ Remote Subquery Scan on all (datanode_1,datanode_2) -> Merge Join Merge Cond: (a = b) @@ -774,23 +769,22 @@ SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t1.b FROM prt2 t1, prt1_e t2 WHER Distribute results by H: b -> HashAggregate Group Key: b - -> Nested Loop - -> Remote Subquery Scan on all (datanode_2) - -> Append - -> Seq Scan on prt2_p1 t1_3 - Filter: (a = 0) - -> Seq Scan on prt2_p2 t1_4 - Filter: (a = 0) - -> Seq Scan on prt2_p3 t1_5 - Filter: (a = 0) + -> Hash Join + Hash Cond: (((t2.a + t2.b) / 2) = b) -> Append - -> Index Scan using iprt1_e_p1_ab2 on prt1_e_p1 t2 - Index Cond: (((a + b) / 2) = b) - -> Index Scan using iprt1_e_p2_ab2 on prt1_e_p2 t2_1 - Index Cond: (((a + b) / 2) = b) - -> Index Scan using iprt1_e_p3_ab2 on prt1_e_p3 t2_2 - Index Cond: (((a + b) / 2) = b) -(40 rows) + -> Seq Scan on prt1_e_p1 t2 + -> Seq Scan on prt1_e_p2 t2_1 + -> Seq Scan on prt1_e_p3 t2_2 + -> Hash + -> Remote Subquery Scan on all (datanode_2) + -> Append + -> Seq Scan on prt2_p1 t1_3 + Filter: (a = 0) + -> Seq Scan on prt2_p2 t1_4 + Filter: (a = 0) + -> Seq Scan on prt2_p3 t1_5 + Filter: (a = 0) +(39 rows) SELECT t1.* FROM prt1 t1 WHERE 
t1.a IN (SELECT t1.b FROM prt2 t1, prt1_e t2 WHERE t1.a = 0 AND t1.b = (t2.a + t2.b)/2) AND t1.b = 0 ORDER BY t1.a; a | b | c @@ -1171,26 +1165,26 @@ SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a = 1 AND a = 2) t1 EXPLAIN (COSTS OFF) SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a = 1 AND a = 2) t1 RIGHT JOIN prt2 t2 ON t1.a = t2.b, prt1 t3 WHERE t2.b = t3.a; - QUERY PLAN ------------------------------------------------------------------------ + QUERY PLAN +----------------------------------------------------------------- Hash Join - Hash Cond: (a = b) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Append - -> Seq Scan on prt1_p1 t3 - -> Seq Scan on prt1_p2 t3_1 - -> Seq Scan on prt1_p3 t3_2 + Hash Cond: (b = a) + -> Hash Left Join + Hash Cond: (b = a) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on prt2_p1 t2 + -> Seq Scan on prt2_p2 t2_1 + -> Seq Scan on prt2_p3 t2_2 + -> Hash + -> Result + One-Time Filter: false -> Hash - -> Hash Left Join - Hash Cond: (b = a) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Append - -> Seq Scan on prt2_p1 t2 - -> Seq Scan on prt2_p2 t2_1 - -> Seq Scan on prt2_p3 t2_2 - -> Hash - -> Result - One-Time Filter: false + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on prt1_p1 t3 + -> Seq Scan on prt1_p2 t3_1 + -> Seq Scan on prt1_p3 t3_2 (18 rows) EXPLAIN (COSTS OFF) @@ -1507,50 +1501,48 @@ EXPLAIN (COSTS OFF) SELECT * FROM prt1_l t1 LEFT JOIN LATERAL (SELECT t2.a AS t2a, t2.c AS t2c, t2.b AS t2b, t3.b AS t3b, least(t1.a,t2.a,t3.b) FROM prt1_l t2 JOIN prt2_l t3 ON (t2.a = t3.b AND t2.c = t3.c)) ss ON t1.a = ss.t2a AND t1.c = ss.t2c WHERE t1.b = 0 ORDER BY t1.a; - QUERY PLAN -------------------------------------------------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) - -> Gather Merge - Workers Planned: 1 - -> Sort - Sort Key: a - -> Parallel Nested Loop Left Join - -> Parallel Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: a + -> Nested Loop Left Join + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a + -> Append + -> Seq Scan on prt1_l_p1 t1 + Filter: (b = 0) + -> Seq Scan on prt1_l_p2_p1 t1_1 + Filter: (b = 0) + -> Seq Scan on prt1_l_p2_p2 t1_2 + Filter: (b = 0) + -> Seq Scan on prt1_l_p3_p1 t1_3 + Filter: (b = 0) + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) Distribute results by H: a - -> Parallel Append - -> Parallel Seq Scan on prt1_l_p1 t1 - Filter: (b = 0) - -> Parallel Seq Scan on prt1_l_p2_p1 t1_1 - Filter: (b = 0) - -> Parallel Seq Scan on prt1_l_p2_p2 t1_2 - Filter: (b = 0) - -> Parallel Seq Scan on prt1_l_p3_p1 t1_3 - Filter: (b = 0) - -> Materialize - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: a - -> Hash Join - Hash Cond: ((b = t2.a) AND ((c)::text = (t2.c)::text)) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Append - -> Seq Scan on prt2_l_p1 t3 - -> Seq Scan on prt2_l_p2_p1 t3_1 - -> Seq Scan on prt2_l_p2_p2 t3_2 - -> Seq Scan on prt2_l_p3_p1 t3_3 - -> Seq Scan on prt2_l_p3_p2 t3_4 - -> Hash - -> Append - -> Seq Scan on prt1_l_p1 t2 - Filter: ((a = a) AND ((c)::text = (c)::text)) - -> Seq Scan on prt1_l_p2_p1 t2_1 - Filter: ((a = a) AND ((c)::text = (c)::text)) - -> Seq Scan on prt1_l_p2_p2 t2_2 
- Filter: ((a = a) AND ((c)::text = (c)::text)) - -> Seq Scan on prt1_l_p3_p1 t2_3 - Filter: ((a = a) AND ((c)::text = (c)::text)) - -> Seq Scan on prt1_l_p3_p2 t2_4 - Filter: ((a = a) AND ((c)::text = (c)::text)) -(41 rows) + -> Hash Join + Hash Cond: ((b = t2.a) AND ((c)::text = (t2.c)::text)) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on prt2_l_p1 t3 + -> Seq Scan on prt2_l_p2_p1 t3_1 + -> Seq Scan on prt2_l_p2_p2 t3_2 + -> Seq Scan on prt2_l_p3_p1 t3_3 + -> Seq Scan on prt2_l_p3_p2 t3_4 + -> Hash + -> Append + -> Seq Scan on prt1_l_p1 t2 + Filter: ((a = a) AND ((c)::text = (c)::text)) + -> Seq Scan on prt1_l_p2_p1 t2_1 + Filter: ((a = a) AND ((c)::text = (c)::text)) + -> Seq Scan on prt1_l_p2_p2 t2_2 + Filter: ((a = a) AND ((c)::text = (c)::text)) + -> Seq Scan on prt1_l_p3_p1 t2_3 + Filter: ((a = a) AND ((c)::text = (c)::text)) + -> Seq Scan on prt1_l_p3_p2 t2_4 + Filter: ((a = a) AND ((c)::text = (c)::text)) +(39 rows) SELECT * FROM prt1_l t1 LEFT JOIN LATERAL (SELECT t2.a AS t2a, t2.c AS t2c, t2.b AS t2b, t3.b AS t3b, least(t1.a,t2.a,t3.b) FROM prt1_l t2 JOIN prt2_l t3 ON (t2.a = t3.b AND t2.c = t3.c)) ss @@ -1622,43 +1614,44 @@ SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1, prt4_n t2 WHERE t1.a = t2.a; ----------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) -> Hash Join - Hash Cond: (t1.a = t2.a) + Hash Cond: (t2.a = t1.a) -> Append - -> Seq Scan on prt1_p1 t1 - -> Seq Scan on prt1_p2 t1_1 - -> Seq Scan on prt1_p3 t1_2 + -> Seq Scan on prt4_n_p1 t2 + -> Seq Scan on prt4_n_p2 t2_1 + -> Seq Scan on prt4_n_p3 t2_2 -> Hash -> Append - -> Seq Scan on prt4_n_p1 t2 - -> Seq Scan on prt4_n_p2 t2_1 - -> Seq Scan on prt4_n_p3 t2_2 + -> Seq Scan on prt1_p1 t1 + -> Seq Scan on prt1_p2 t1_1 + -> Seq Scan on prt1_p3 t1_2 (12 rows) EXPLAIN (COSTS OFF) SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1, prt4_n t2, prt2 t3 WHERE t1.a = t2.a and t1.a = t3.b; - QUERY PLAN ------------------------------------------------------------------------------------ + QUERY PLAN +----------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) -> Hash Join - Hash Cond: (t2.a = t1.a) - -> Append - -> Seq Scan on prt4_n_p1 t2 - -> Seq Scan on prt4_n_p2 t2_1 - -> Seq Scan on prt4_n_p3 t2_2 - -> Hash - -> Hash Join - Hash Cond: (t1.a = b) + Hash Cond: (t1.a = b) + -> Hash Join + Hash Cond: (t2.a = t1.a) + -> Append + -> Seq Scan on prt4_n_p1 t2 + -> Seq Scan on prt4_n_p2 t2_1 + -> Seq Scan on prt4_n_p3 t2_2 + -> Hash -> Append -> Seq Scan on prt1_p1 t1 -> Seq Scan on prt1_p2 t1_1 -> Seq Scan on prt1_p3 t1_2 - -> Hash - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Append - -> Seq Scan on prt2_p1 t3 - -> Seq Scan on prt2_p2 t3_1 - -> Seq Scan on prt2_p3 t3_2 -(20 rows) + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Append + -> Seq Scan on prt2_p1 t3 + -> Seq Scan on prt2_p2 t3_1 + -> Seq Scan on prt2_p3 t3_2 +(21 rows) -- partition-wise join can not be applied if there are no equi-join conditions -- between partition keys @@ -1710,41 +1703,41 @@ SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_m t1, prt2_m t2 WHERE t1.a = (t2.b + t2. 
-- partition-wise join EXPLAIN (COSTS OFF) SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_m t1 LEFT JOIN prt2_m t2 ON t1.a = t2.b; - QUERY PLAN ------------------------------------------------------------------------ + QUERY PLAN +----------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) - -> Hash Left Join - Hash Cond: (t1.a = b) - -> Append - -> Seq Scan on prt1_m_p1 t1 - -> Seq Scan on prt1_m_p2 t1_1 - -> Seq Scan on prt1_m_p3 t1_2 + -> Hash Right Join + Hash Cond: (b = t1.a) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on prt2_m_p1 t2 + -> Seq Scan on prt2_m_p2 t2_1 + -> Seq Scan on prt2_m_p3 t2_2 -> Hash - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Append - -> Seq Scan on prt2_m_p1 t2 - -> Seq Scan on prt2_m_p2 t2_1 - -> Seq Scan on prt2_m_p3 t2_2 + -> Append + -> Seq Scan on prt1_m_p1 t1 + -> Seq Scan on prt1_m_p2 t1_1 + -> Seq Scan on prt1_m_p3 t1_2 (13 rows) -- equi-join between non-key columns does not qualify for partition-wise join EXPLAIN (COSTS OFF) SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_m t1 LEFT JOIN prt2_m t2 ON t1.c = t2.c; - QUERY PLAN ------------------------------------------------------------------------ + QUERY PLAN +----------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) - -> Hash Left Join - Hash Cond: (t1.c = c) - -> Append - -> Seq Scan on prt1_m_p1 t1 - -> Seq Scan on prt1_m_p2 t1_1 - -> Seq Scan on prt1_m_p3 t1_2 + -> Hash Right Join + Hash Cond: (c = t1.c) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on prt2_m_p1 t2 + -> Seq Scan on prt2_m_p2 t2_1 + -> Seq Scan on prt2_m_p3 t2_2 -> Hash - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Append - -> Seq Scan on prt2_m_p1 t2 - -> Seq Scan on prt2_m_p2 t2_1 - -> Seq Scan on prt2_m_p3 t2_2 + -> Append + -> Seq Scan on prt1_m_p1 t1 + -> Seq Scan on prt1_m_p2 t1_1 + -> Seq Scan on prt1_m_p3 t1_2 (13 rows) -- partition-wise join can not be applied between tables with different diff --git a/src/test/regress/expected/rowsecurity_1.out b/src/test/regress/expected/rowsecurity_1.out index 1e0441a4..770c320f 100644 --- a/src/test/regress/expected/rowsecurity_1.out +++ b/src/test/regress/expected/rowsecurity_1.out @@ -2074,7 +2074,7 @@ EXPLAIN (COSTS OFF) EXECUTE plancache_test3; -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Seq Scan on z1 Filter: (((a % 2) = 0) AND f_leak(b)) - -> CTE Scan on q + -> CTE Scan on q (8 rows) SET ROLE regress_rls_group1; @@ -2125,7 +2125,7 @@ EXPLAIN (COSTS OFF) EXECUTE plancache_test3; -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Seq Scan on z1 Filter: (((a % 2) = 0) AND f_leak(b)) - -> CTE Scan on q + -> CTE Scan on q (8 rows) SET SESSION AUTHORIZATION regress_rls_carol; @@ -2176,7 +2176,7 @@ EXPLAIN (COSTS OFF) EXECUTE plancache_test3; -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Seq Scan on z1 Filter: (((a % 2) = 1) AND f_leak(b)) - -> CTE Scan on q + -> CTE Scan on q (8 rows) SET ROLE regress_rls_group2; @@ -2227,7 +2227,7 @@ EXPLAIN (COSTS OFF) EXECUTE plancache_test3; -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Seq Scan on z1 Filter: (((a % 2) = 1) AND f_leak(b)) - -> CTE Scan on q + -> CTE Scan on q (8 rows) -- diff --git a/src/test/regress/expected/sanity_check.out b/src/test/regress/expected/sanity_check.out index 20bce908..8bea6498 100644 --- a/src/test/regress/expected/sanity_check.out +++ 
b/src/test/regress/expected/sanity_check.out @@ -39,6 +39,8 @@ date_tbl|f default_tbl|f defaultexpr_tbl|f dept|f +donothingbrtrig_test1|f +donothingbrtrig_test2|f dupindexcols|t e_star|f emp|f diff --git a/src/test/regress/expected/select_parallel_4.out b/src/test/regress/expected/select_parallel_4.out index 93228c2e..f3b81ec8 100644 --- a/src/test/regress/expected/select_parallel_4.out +++ b/src/test/regress/expected/select_parallel_4.out @@ -81,34 +81,34 @@ select length(stringu1) from tenk1 group by length(stringu1); explain (costs off) select stringu1, count(*) from tenk1 group by stringu1 order by stringu1; - QUERY PLAN ------------------------------------------------------------ - Finalize GroupAggregate - Group Key: stringu1 - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Sort - Sort Key: stringu1 - -> Partial HashAggregate - Group Key: stringu1 - -> Gather - Workers Planned: 4 + QUERY PLAN +----------------------------------------------------------------- + Sort + Sort Key: stringu1 + -> Finalize HashAggregate + Group Key: stringu1 + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Gather + Workers Planned: 4 + -> Partial HashAggregate + Group Key: stringu1 -> Parallel Seq Scan on tenk1 (10 rows) explain (costs off) select count(stringu1) as num, (CASE WHEN length(stringu1) > 5 THEN 'LONG' ELSE 'SHORT' END) as islong from tenk1 group by islong order by num; - QUERY PLAN --------------------------------------------------------------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------------------------------------------------------------------------ Sort Sort Key: (count(stringu1)) -> Finalize HashAggregate Group Key: CASE WHEN (length((stringu1)::text) > 5) THEN 'LONG'::text ELSE 'SHORT'::text END -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Partial HashAggregate - Group Key: (CASE WHEN (length((stringu1)::text) > 5) THEN 'LONG'::text ELSE 'SHORT'::text END) - -> Gather - Workers Planned: 4 + -> Gather + Workers Planned: 4 + -> Partial HashAggregate + Group Key: CASE WHEN (length((stringu1)::text) > 5) THEN 'LONG'::text ELSE 'SHORT'::text END -> Parallel Seq Scan on tenk1 (10 rows) @@ -378,8 +378,10 @@ SELECT xc_node_id != 0 FROM t_worker_identifier; (1 row) -- provoke error in worker +SAVEPOINT settings; select stringu1::int2 from tenk1 where unique1 = 1; ERROR: invalid input syntax for integer: "BAAAAA" +ROLLBACK TO SAVEPOINT settings; -- test interaction with set-returning functions SAVEPOINT settings; -- multiple subqueries under a single Gather node @@ -389,16 +391,17 @@ EXPLAIN (COSTS OFF) SELECT unique1 FROM tenk1 WHERE fivethous = tenthous + 1 UNION ALL SELECT unique1 FROM tenk1 WHERE fivethous = tenthous + 1; - QUERY PLAN ----------------------------------------------------- - Gather - Workers Planned: 4 - -> Parallel Append - -> Parallel Seq Scan on tenk1 - Filter: (fivethous = (tenthous + 1)) - -> Parallel Seq Scan on tenk1 tenk1_1 - Filter: (fivethous = (tenthous + 1)) -(7 rows) + QUERY PLAN +---------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Gather + Workers Planned: 4 + -> Append + -> Parallel Seq Scan on tenk1 + Filter: (fivethous = (tenthous + 1)) + -> Parallel Seq Scan on tenk1 tenk1_1 + Filter: (fivethous = (tenthous + 1)) +(8 rows) ROLLBACK TO SAVEPOINT settings; -- can't use multiple subqueries under a single Gather node due to initPlans @@ -409,34 +412,33 @@ UNION ALL SELECT unique1 
FROM tenk1 WHERE fivethous = (SELECT unique2 FROM tenk1 WHERE fivethous = 1 LIMIT 1) ORDER BY 1; - QUERY PLAN --------------------------------------------------------------------- - Sort - Sort Key: tenk1.unique1 - -> Append - -> Gather - Workers Planned: 4 - Params Evaluated: $1 - InitPlan 1 (returns $1) - -> Limit - -> Gather - Workers Planned: 4 - -> Parallel Seq Scan on tenk1 tenk1_2 - Filter: (fivethous = 1) - -> Parallel Seq Scan on tenk1 + QUERY PLAN +-------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Sort + Sort Key: tenk1.unique1 + -> Append + -> Seq Scan on tenk1 + Filter: (fivethous = $0) + InitPlan 1 (returns $0) + -> Limit + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Limit + -> Gather + Workers Planned: 4 + -> Parallel Seq Scan on tenk1 tenk1_2 + Filter: (fivethous = 1) + -> Seq Scan on tenk1 tenk1_1 Filter: (fivethous = $1) - -> Gather - Workers Planned: 4 - Params Evaluated: $3 - InitPlan 2 (returns $3) - -> Limit - -> Gather - Workers Planned: 4 - -> Parallel Seq Scan on tenk1 tenk1_3 - Filter: (fivethous = 1) - -> Parallel Seq Scan on tenk1 tenk1_1 - Filter: (fivethous = $3) -(25 rows) + InitPlan 2 (returns $1) + -> Limit + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Limit + -> Gather + Workers Planned: 4 + -> Parallel Seq Scan on tenk1 tenk1_3 + Filter: (fivethous = 1) +(24 rows) -- test interaction with SRFs SELECT * FROM information_schema.foreign_data_wrapper_options diff --git a/src/test/regress/expected/stats_ext_2.out b/src/test/regress/expected/stats_ext_2.out index e058f176..16b06053 100644 --- a/src/test/regress/expected/stats_ext_2.out +++ b/src/test/regress/expected/stats_ext_2.out @@ -660,7 +660,7 @@ EXPLAIN QUERY PLAN ------------------------------------------------------------------------------------------------- Finalize Aggregate (cost=177.52..177.53 rows=1 width=8) - -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=177.50..177.52 rows=1 width=8) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=177.50..177.52 rows=1 width=0) -> Partial Aggregate (cost=77.50..77.51 rows=1 width=8) -> Seq Scan on subset (cost=0.00..77.50 rows=1 width=0) Filter: ((b = 'prefix_1'::text) AND (c = 1)) @@ -680,9 +680,9 @@ EXPLAIN SELECT count(*) FROM subset WHERE b = 'prefix_1' and c = 1; QUERY PLAN ------------------------------------------------------------------------------------------------- - Finalize Aggregate (cost=177.52..177.53 rows=1 width=8) - -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=177.50..177.52 rows=1 width=8) - -> Partial Aggregate (cost=77.50..77.51 rows=1 width=8) + Finalize Aggregate (cost=177.64..177.65 rows=1 width=8) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=177.62..177.64 rows=1 width=0) + -> Partial Aggregate (cost=77.62..77.64 rows=1 width=8) -> Seq Scan on subset (cost=0.00..77.50 rows=50 width=0) Filter: ((b = 'prefix_1'::text) AND (c = 1)) (5 rows) @@ -698,9 +698,9 @@ EXPLAIN SELECT count(*) FROM subset WHERE b like '%_1' and c = 1; QUERY PLAN ------------------------------------------------------------------------------------------------- - Finalize Aggregate (cost=177.52..177.53 rows=1 width=8) - -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=177.50..177.52 rows=1 width=8) - -> Partial Aggregate (cost=77.50..77.51 rows=1 width=8) + Finalize Aggregate (cost=177.53..177.54 rows=1 width=8) + -> Remote Subquery Scan on all (datanode_1,datanode_2) 
(cost=177.51..177.53 rows=1 width=0) + -> Partial Aggregate (cost=77.51..77.52 rows=1 width=8) -> Seq Scan on subset (cost=0.00..77.50 rows=5 width=0) Filter: ((b ~~ '%_1'::text) AND (c = 1)) (5 rows) @@ -722,9 +722,9 @@ EXPLAIN SELECT count(*) FROM subset WHERE b like '%_1' and c = 1; QUERY PLAN ------------------------------------------------------------------------------------------------- - Finalize Aggregate (cost=177.52..177.53 rows=1 width=8) - -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=177.50..177.52 rows=1 width=8) - -> Partial Aggregate (cost=77.50..77.51 rows=1 width=8) + Finalize Aggregate (cost=177.64..177.65 rows=1 width=8) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=177.62..177.64 rows=1 width=0) + -> Partial Aggregate (cost=77.62..77.64 rows=1 width=8) -> Seq Scan on subset (cost=0.00..77.50 rows=50 width=0) Filter: ((b ~~ '%_1'::text) AND (c = 1)) (5 rows) diff --git a/src/test/regress/expected/subselect.out b/src/test/regress/expected/subselect.out index 8f30b1c9..c68869c3 100644 --- a/src/test/regress/expected/subselect.out +++ b/src/test/regress/expected/subselect.out @@ -2043,8 +2043,8 @@ select * from x; Output: x_1.a -> CTE Scan on z Output: z.a - -> CTE Scan on z z1 - Output: z1.a + -> CTE Scan on z z1 + Output: z1.a (18 rows) with recursive x(a) as diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out index ca0d242d..c3f67c12 100644 --- a/src/test/regress/expected/sysviews.out +++ b/src/test/regress/expected/sysviews.out @@ -91,7 +91,7 @@ select name, setting from pg_settings where name like 'enable%'; enable_crypt_parellel_debug | off enable_data_mask | on enable_datanode_row_triggers | off - enable_distinct_optimizer | on + enable_distinct_optimizer | on enable_distri_debug | off enable_distri_debug_print | off enable_distri_visibility_print | off @@ -136,7 +136,7 @@ select name, setting from pg_settings where name like 'enable%'; enable_transparent_crypt | on enable_user_authority_force_check | off enable_xlog_mprotect | on -(63 rows) +(64 rows) -- Test that the pg_timezone_names and pg_timezone_abbrevs views are -- more-or-less working. 
We can't test their contents in any great detail From 2766f3223d1b6e2ac104837026671a5b617a298b Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Thu, 23 Dec 2021 19:14:34 +0800 Subject: [PATCH 320/578] add contrib pg_clean pg_unlock tbase_subscription --- contrib/Makefile | 3 + contrib/pg_clean/Makefile | 18 + contrib/pg_clean/pg_clean--1.0.sql | 106 + .../pg_clean/pg_clean--unpackaged--1.0.sql | 19 + contrib/pg_clean/pg_clean.c | 3311 +++++++++++++++++ contrib/pg_clean/pg_clean.control | 5 + contrib/pg_clean/test.sh | 171 + contrib/pg_unlock/Makefile | 18 + contrib/pg_unlock/pg_unlock--1.0.sql | 56 + .../pg_unlock/pg_unlock--unpackaged--1.0.sql | 10 + contrib/pg_unlock/pg_unlock.c | 2349 ++++++++++++ contrib/pg_unlock/pg_unlock.control | 5 + contrib/tbase_subscription/Makefile | 19 + .../tbase_subscription--1.0.sql | 36 + .../tbase_subscription--unpackaged--1.0.sql | 4 + .../tbase_subscription/tbase_subscription.c | 26 + .../tbase_subscription.control | 5 + 17 files changed, 6161 insertions(+) create mode 100644 contrib/pg_clean/Makefile create mode 100644 contrib/pg_clean/pg_clean--1.0.sql create mode 100644 contrib/pg_clean/pg_clean--unpackaged--1.0.sql create mode 100644 contrib/pg_clean/pg_clean.c create mode 100644 contrib/pg_clean/pg_clean.control create mode 100644 contrib/pg_clean/test.sh create mode 100644 contrib/pg_unlock/Makefile create mode 100644 contrib/pg_unlock/pg_unlock--1.0.sql create mode 100644 contrib/pg_unlock/pg_unlock--unpackaged--1.0.sql create mode 100644 contrib/pg_unlock/pg_unlock.c create mode 100644 contrib/pg_unlock/pg_unlock.control create mode 100644 contrib/tbase_subscription/Makefile create mode 100644 contrib/tbase_subscription/tbase_subscription--1.0.sql create mode 100644 contrib/tbase_subscription/tbase_subscription--unpackaged--1.0.sql create mode 100644 contrib/tbase_subscription/tbase_subscription.c create mode 100644 contrib/tbase_subscription/tbase_subscription.control diff --git a/contrib/Makefile b/contrib/Makefile index 43e984e3..14745884 100644 --- a/contrib/Makefile +++ b/contrib/Makefile @@ -39,6 +39,8 @@ SUBDIRS = \ pgrowlocks \ pgstattuple \ pgxc_clean \ + pg_clean \ + pg_unlock \ pgxc_ctl \ pgxc_monitor \ pg_visibility \ @@ -55,6 +57,7 @@ SUBDIRS = \ unaccent \ vacuumlo \ stormstats \ + tbase_subscription \ tbase_pooler_stat \ pg_stat_cluster_activity diff --git a/contrib/pg_clean/Makefile b/contrib/pg_clean/Makefile new file mode 100644 index 00000000..9913e074 --- /dev/null +++ b/contrib/pg_clean/Makefile @@ -0,0 +1,18 @@ +# contrib/pg_clean/Makefile + +MODULE_big = pg_clean +OBJS = pg_clean.o + +EXTENSION = pg_clean +DATA = pg_clean--1.0.sql pg_clean--unpackaged--1.0.sql + +ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +else +subdir = contrib/pg_clean +top_builddir = ../.. +include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif \ No newline at end of file diff --git a/contrib/pg_clean/pg_clean--1.0.sql b/contrib/pg_clean/pg_clean--1.0.sql new file mode 100644 index 00000000..e5bbc9ca --- /dev/null +++ b/contrib/pg_clean/pg_clean--1.0.sql @@ -0,0 +1,106 @@ +/* contrib/pg_clean/pg_clean--1.0.sql */ + +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "CREATE EXTENSION pg_clean" to load this file. \quit + +-- Register functions. 
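For orientation, the functions registered below fall into two groups. The first group are the cleanup entry points: pg_clean_check_txn(time_interval) only reports in-doubt two-phase transactions together with their per-node status, pg_clean_execute(time_interval) also resolves them, and pg_clean_execute_on_node(abnormal_nodename, abnormal_time) restricts resolution to transactions that involve one node and predate the given timestamp. A minimal, illustrative coordinator session might look like the sketch below; the interval is the 120-second default from the signatures that follow, and the node name and timestamp are placeholders, not values from this patch.

CREATE EXTENSION pg_clean;
-- Report in-doubt 2PC transactions older than 120 seconds, with their
-- status on every node (gid, database, global status, per-node status).
SELECT * FROM pg_clean_check_txn(120);
-- Resolve them: each row reports the gid, its global status, the
-- operation chosen (commit or abort), and whether that operation succeeded.
SELECT * FROM pg_clean_execute(120);
-- Resolve only transactions involving one node that predate a given
-- TimestampTz value (both arguments here are placeholders).
SELECT * FROM pg_clean_execute_on_node('datanode_1', 694224000000000);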
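The remaining pgxc_* functions registered below are lower-level helpers over the per-node 2PC records: the pgxc_get_2pc_* family reads individual fields of a recorded transaction (participant nodes, start node, start xid, commit timestamp, xid, record file), pgxc_get_record_list, pgxc_remove_2pc_records and pgxc_clear_2pc_records manage the records themselves, and pgxc_commit_on_node/pgxc_abort_on_node finish a single prepared transaction on one node. A sketch of manual inspection and resolution, with a placeholder gid and node name:

-- List the 2PC records kept on this node.
SELECT pgxc_get_record_list();
-- Inspect one recorded transaction (the gid here is a placeholder).
SELECT pgxc_get_2pc_startnode('gid_to_inspect');
SELECT pgxc_get_2pc_nodes('gid_to_inspect');
SELECT pgxc_get_2pc_commit_timestamp('gid_to_inspect');
-- Manually finish it on a single node, then drop its record.
SELECT pgxc_commit_on_node('datanode_1', 'gid_to_inspect');
SELECT pgxc_remove_2pc_records('gid_to_inspect');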
+CREATE FUNCTION pg_clean_execute(IN time_interval integer DEFAULT 120, + OUT gid text, + OUT global_transaction_status text, + OUT operation text, + OUT operation_status text +) +RETURNS SETOF record +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT VOLATILE; + +CREATE FUNCTION pg_clean_execute_on_node(IN abnormal_nodename text, IN abnormal_time bigint, + OUT gid text, + OUT global_transaction_status text, + OUT operation text, + OUT operation_status text +) +RETURNS SETOF record +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT VOLATILE; + + +CREATE FUNCTION pg_clean_check_txn(IN time_interval integer DEFAULT 120, + OUT gid text, + OUT database text, + OUT global_transaction_status text, + OUT transaction_status_on_allnodes text +) +RETURNS SETOF record +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT VOLATILE; + +CREATE FUNCTION pgxc_get_2pc_nodes(gid text) +RETURNS text +AS 'MODULE_PATHNAME' +LANGUAGE C; + +CREATE FUNCTION pgxc_get_2pc_startnode(gid text) +RETURNS text +AS 'MODULE_PATHNAME' +LANGUAGE C; + +CREATE FUNCTION pgxc_get_2pc_startxid(gid text) +RETURNS text +AS 'MODULE_PATHNAME' +LANGUAGE C; + +CREATE FUNCTION pgxc_get_2pc_commit_timestamp(gid text) +RETURNS text +AS 'MODULE_PATHNAME' +LANGUAGE C; + +CREATE FUNCTION pgxc_get_2pc_xid(gid text) +RETURNS integer +AS 'MODULE_PATHNAME' +LANGUAGE C; + +CREATE FUNCTION pgxc_get_2pc_file(gid text) +RETURNS text +AS 'MODULE_PATHNAME' +LANGUAGE C; + +CREATE FUNCTION pgxc_remove_2pc_records(gid text) +RETURNS bool +AS 'MODULE_PATHNAME' +LANGUAGE C; + +CREATE FUNCTION pgxc_clear_2pc_records() +RETURNS bool +AS 'MODULE_PATHNAME' +LANGUAGE C; + +CREATE FUNCTION pgxc_get_record_list() +RETURNS text +AS 'MODULE_PATHNAME' +LANGUAGE C; + +CREATE FUNCTION pgxc_commit_on_node(nodename text, gid text) +RETURNS bool +AS 'MODULE_PATHNAME' +LANGUAGE C; + +CREATE FUNCTION pgxc_abort_on_node(nodename text, gid text) +RETURNS bool +AS 'MODULE_PATHNAME' +LANGUAGE C; + +GRANT ALL ON FUNCTION pg_clean_execute(time_interval integer) TO PUBLIC; +GRANT ALL ON FUNCTION pg_clean_execute_on_node(abnormal_nodename text, abnormal_time bigint) TO PUBLIC; +GRANT ALL ON FUNCTION pg_clean_check_txn(time_interval integer) TO PUBLIC; +GRANT ALL ON FUNCTION pgxc_get_2pc_nodes(gid text) TO PUBLIC; +GRANT ALL ON FUNCTION pgxc_get_2pc_startnode(gid text) TO PUBLIC; +GRANT ALL ON FUNCTION pgxc_get_2pc_startxid(gid text) TO PUBLIC; +GRANT ALL ON FUNCTION pgxc_get_2pc_commit_timestamp(gid text) TO PUBLIC; +GRANT ALL ON FUNCTION pgxc_get_2pc_xid(gid text) TO PUBLIC; +GRANT ALL ON FUNCTION pgxc_get_2pc_file(gid text) TO PUBLIC; +GRANT ALL ON FUNCTION pgxc_remove_2pc_records(gid text) TO PUBLIC; +GRANT ALL ON FUNCTION pgxc_clear_2pc_records() TO PUBLIC; +GRANT ALL ON FUNCTION pgxc_get_record_list() TO PUBLIC; +GRANT ALL ON FUNCTION pgxc_commit_on_node(nodename text, gid text) TO PUBLIC; +GRANT ALL ON FUNCTION pgxc_abort_on_node(nodename text, gid text) TO PUBLIC; diff --git a/contrib/pg_clean/pg_clean--unpackaged--1.0.sql b/contrib/pg_clean/pg_clean--unpackaged--1.0.sql new file mode 100644 index 00000000..a6a67659 --- /dev/null +++ b/contrib/pg_clean/pg_clean--unpackaged--1.0.sql @@ -0,0 +1,19 @@ +/* contrib/pg_clean/pg_clean--unpackaged--1.0.sql */ + +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "CREATE EXTENSION pg_clean" to load this file. 
\quit + +ALTER EXTENSION pg_clean ADD function pg_clean_execute(time_interval integer); +ALTER EXTENSION pg_clean ADD function pg_clean_execute_on_node(abnormal_nodename text, abnormal_time bigint); +ALTER EXTENSION pg_clean ADD function pg_clean_check_txn(time_interval integer); +ALTER EXTENSION pg_clean ADD function pgxc_get_2pc_nodes(gid text); +ALTER EXTENSION pg_clean ADD function pgxc_get_2pc_startnode(gid text); +ALTER EXTENSION pg_clean ADD function pgxc_get_2pc_startxid(gid text); +ALTER EXTENSION pg_clean ADD function pgxc_get_2pc_commit_timestamp(gid text); +ALTER EXTENSION pg_clean ADD function pgxc_get_2pc_xid(gid text); +ALTER EXTENSION pg_clean ADD function pgxc_get_2pc_file(gid text); +ALTER EXTENSION pg_clean ADD function pgxc_remove_2pc_records(gid text); +ALTER EXTENSION pg_clean ADD function pgxc_clear_2pc_records(); +ALTER EXTENSION pg_clean ADD function pgxc_get_record_list(); +ALTER EXTENSION pg_clean ADD function pgxc_commit_on_node(nodename text, gid text); +ALTER EXTENSION pg_clean ADD function pgxc_abort_on_node(nodename text, gid text); diff --git a/contrib/pg_clean/pg_clean.c b/contrib/pg_clean/pg_clean.c new file mode 100644 index 00000000..4dc898ff --- /dev/null +++ b/contrib/pg_clean/pg_clean.c @@ -0,0 +1,3311 @@ +#include "postgres.h" +#include "fmgr.h" +#include "funcapi.h" +#include "miscadmin.h" + +#include +#include +#include +#include +#include + +#include "storage/procarray.h" +#include "storage/lwlock.h" +#include "storage/proc.h" +#include "utils/varlena.h" +#include "utils/lsyscache.h" +#include "utils/palloc.h" +#include "utils/builtins.h" + +#include "executor/tuptable.h" +#include "pgxc/execRemote.h" +#include "pgxc/pgxcnode.h" +#include "access/tupdesc.h" +#include "access/htup_details.h" +#include "lib/stringinfo.h" + +#include "access/gtm.h" +#include "datatype/timestamp.h" +#include "access/xact.h" +#include "pgxc/pgxcnode.h" +#include "pgxc/poolmgr.h" +#include "utils/timestamp.h" +#include "catalog/pg_control.h" +#include "commands/dbcommands.h" + +#include "utils/memutils.h" +#include "nodes/memnodes.h" + +#ifdef XCP +#include "catalog/pg_type.h" +#include "catalog/pgxc_node.h" +#include "executor/executor.h" +#include "nodes/makefuncs.h" +#include "utils/snapmgr.h" +#endif +#ifdef PGXC +#include "pgxc/nodemgr.h" +#include "pgxc/pgxc.h" +#endif + +#include "storage/fd.h" +#include "pgstat.h" +#include "access/xact.h" +#include "access/twophase.h" +#include "access/hash.h" + +/*hash_create hash_search*/ +#include "utils/hsearch.h" + +#define TWOPHASE_RECORD_DIR "pg_2pc" +int transaction_threshold = 200000; +#define MAXIMUM_CLEAR_FILE 10000 +#define MAXIMUM_OUTPUT_FILE 1000 +#define XIDPREFIX "_$XC$" +#define DEFAULT_CLEAN_TIME_INTERVAL 120000000 +#ifdef __TWO_PHASE_TESTS__ +#define LEAST_CLEAN_TIME_INTERVAL 10000000 /* in pg_clean test_mode should not clean twophase trans prepared in ten seconds or commit in ten seconds */ +#else +#define LEAST_CLEAN_TIME_INTERVAL 60000000 /* should not clean twophase trans prepared in a minite or commit in a minite */ +#endif +GlobalTimestamp clean_time_interval = DEFAULT_CLEAN_TIME_INTERVAL; + + +PG_MODULE_MAGIC; + +#define MAX_GID 50 +#define MAX_DBNAME 64 +#define GET_START_XID "startxid:" +#define GET_COMMIT_TIMESTAMP "global_commit_timestamp:" +#define GET_START_NODE "startnode:" +#define GET_NODE "nodes:" +#define GET_XID "\nxid:" +#define GET_READONLY "readonly" +#define GIDSIZE (200 + 24) +#define MAX_TWOPC_TXN 1000 + +#define XIDFOUND 1 +#define XIDNOTFOUND -1 +#define XIDEXECFAIL -2 + +#define 
FILEFOUND 1 +#define FILEUNKOWN -1 +#define FILENOTFOUND -2 + +#define INIT(x)\ +do{\ + x = NULL;\ + x##_count = 0;\ + x##_size = 0;\ +}while(0); + +#define RPALLOC(x)\ +do{\ + if (x##_size < x##_count+1)\ + {\ + int temp_size = (x##_size > 0) ? x##_size : 1;\ + if (NULL == x)\ + {\ + x = palloc0(2*temp_size*sizeof(*x));\ + }\ + else\ + {\ + x = repalloc(x, 2*temp_size*sizeof(*x));\ + }\ + x##_size = 2*temp_size;\ + }\ +}while(0); + +#define PALLOC(x, y)\ +do{\ + RPALLOC(x);\ + x[x##_count] = y;\ + x##_count++;\ +}while(0); + +#define RFREE(x)\ +do{\ + if (x##_size > 0)\ + {\ + pfree(x);\ + }\ + x = NULL;\ + x##_count = 0;\ + x##_size = 0;\ +}while(0); + +#define ENUM_TOCHAR_CASE(x) case x: return(#x); + +/*data structures*/ +typedef enum TXN_STATUS +{ + TXN_STATUS_INITIAL = 0, /* Initial */ + TXN_STATUS_PREPARED, + TXN_STATUS_COMMITTED, + TXN_STATUS_ABORTED, + TXN_STATUS_INPROGRESS, + TXN_STATUS_FAILED, /* Error detected while interacting with the node */ + TXN_STATUS_UNKNOWN /* Unknown: Frozen, running, or not started */ +} TXN_STATUS; + + +typedef enum +{ + UNDO = 0, + ABORT, + COMMIT +} OPERATION; + +typedef enum +{ + TWOPHASE_FILE_EXISTS = 0, + TWOPHASE_FILE_NOT_EXISTS, + TWOPHASE_FILE_OLD, + TWOPHASE_FILE_ERROR +}TWOPHASE_FILE_STATUS; + +typedef struct txn_info +{ + char gid[MAX_GID]; + uint32 *xid; /* xid used in prepare */ + TimestampTz *prepare_timestamp; + char *owner; + char *participants; + Oid origcoord; /* Original coordinator who initiated the txn */ + bool after_first_phase; + uint32 startxid; /* xid in Original coordinator */ + bool isorigcoord_part; /* Is original coordinator a + participant? */ + int num_dnparts; /* Number of participant datanodes */ + int num_coordparts; /* Number of participant coordinators */ + int *dnparts; /* Whether a node was participant in the txn */ + int *coordparts; + TXN_STATUS *txn_stat; /* Array for each nodes */ + char *msg; /* Notice message for this txn. 
*/ + GlobalTimestamp global_commit_timestamp; /* get global_commit_timestamp from node once it is committed*/ + + TXN_STATUS global_txn_stat; + OPERATION op; + bool op_issuccess; + bool is_readonly; + bool belong_abnormal_node; +}txn_info; + +typedef struct database_info +{ + struct database_info *next; + char *database_name; + + HTAB *all_txn_info; +#if 0 + txn_info *head_txn_info; + txn_info *last_txn_info; +#endif +} database_info; + +typedef struct +{ + int index; + txn_info **txn; + int txn_count; + int txn_size; + MemoryContext mycontext; +} print_txn_info; + +typedef struct +{ + int index; + int count; + char **gid; + int gid_count; + int gid_size; + char **database; + int database_count; + int database_size; + char **global_status; + int global_status_count; + int global_status_size; + char **status; + int status_count; + int status_size; + MemoryContext mycontext; +} print_status; + +typedef struct +{ + char ***slot; /*slot[i][j] stores value of row i, colum j*/ + int slot_count; /*number of rows*/ + int slot_size; + int attnum; +}TupleTableSlots; + +/*global variable*/ +static Oid *cn_node_list = NULL; +static Oid *dn_node_list = NULL; +static bool *cn_health_map = NULL; +static bool *dn_health_map = NULL; +static int cn_nodes_num = 0; +static int dn_nodes_num = 0; +static int pgxc_clean_node_count = 0; +static Oid my_nodeoid; +static +database_info *head_database_info = NULL; +static +database_info *last_database_info = NULL; +bool execute = false; +int total_twopc_txn = 0; + +TimestampTz current_time; +GlobalTimestamp abnormal_time = InvalidGlobalTimestamp; +char *abnormal_nodename = NULL; +Oid abnormal_nodeoid = InvalidOid; +bool clear_2pc_belong_node = false; + + +/*function list*/ + /*plugin entry function*/ + +static bool check_node_health(Oid node_oid); +static Datum + execute_query_on_single_node(Oid node, const char * query, int attnum, TupleTableSlots * tuples); +void DestroyTxnHash(void); +static void ResetGlobalVariables(void); + +static Oid + getMyNodeoid(void); +static void + getDatabaseList(void); +static char* TTSgetvalue(TupleTableSlots *result, int tup_num, int field_num); +static void DropTupleTableSlots(TupleTableSlots * +Slots); +static void + getTxnInfoOnNodesAll(void); +void getTxnInfoOnNode(Oid node); +void add_txn_info(char * dbname, Oid node_oid, uint32 xid, char * gid, char * owner, + TimestampTz prepared_time, TXN_STATUS status); +TWOPHASE_FILE_STATUS GetTransactionPartNodes(txn_info * txn, Oid node_oid); +static txn_info * + find_txn(char *gid); +txn_info* + make_txn_info(char * dbname, char * gid, char * owner); +database_info* + find_database_info(char *database_name); +database_info* + add_database_info(char *database_name); +int find_node_index(Oid node_oid); +Oid find_node_oid(int node_idx); +void getTxnInfoOnOtherNodesAll(void); +void getTxnInfoOnOtherNodesForDatabase(database_info *database); +void getTxnInfoOnOtherNodes(txn_info *txn); +int Get2PCXidByGid(Oid node_oid, char * gid, uint32 * transactionid); +int Get2PCFile(Oid node_oid, char * gid, uint32 * transactionid); + +void getTxnStatus(txn_info * txn, int node_idx); +void recover2PCForDatabaseAll(void); +void recover2PCForDatabase(database_info * db_info); +#if 0 +static bool + setMaintenanceMode(bool status); +#endif +bool send_query_clean_transaction(PGXCNodeHandle * conn, txn_info * txn, const char * finish_cmd); +bool check_2pc_belong_node(txn_info * txn); +bool check_node_participate(txn_info * txn, int node_idx); + +void recover2PC(txn_info * txn); +TXN_STATUS + 
check_txn_global_status(txn_info *txn); +bool clean_2PC_iscommit(txn_info *txn, bool iscommit); +bool clean_2PC_files(txn_info *txn); +void Init_print_txn_info(print_txn_info *print_txn); +void Init_print_stats_all(print_status *pstatus); +void Init_print_stats(txn_info * txn, char * database, print_status * pstatus); +static const char * + txn_status_to_string(TXN_STATUS status); +static const char * + txn_op_to_string(OPERATION op); +static void + CheckFirstPhase(txn_info *txn); +static void + get_transaction_handles(PGXCNodeAllHandles **pgxc_handles, txn_info *txn); +static void + get_node_handles(PGXCNodeAllHandles ** pgxc_handles, Oid nodeoid); + +Datum pg_clean_execute(PG_FUNCTION_ARGS); +PG_FUNCTION_INFO_V1(pg_clean_execute); +Datum pg_clean_execute(PG_FUNCTION_ARGS) +{ +#ifdef ACCESS_CONTROL_ATTR_NUM +#undef ACCESS_CONTROL_ATTR_NUM +#endif +#define ACCESS_CONTROL_ATTR_NUM 4 + FuncCallContext *funcctx; + HeapTuple tuple; + print_txn_info *print_txn = NULL; + txn_info *temp_txn; + char txn_gid[100]; + char txn_status[100]; + char txn_op[100]; + char txn_op_issuccess[100]; + + Datum values[ACCESS_CONTROL_ATTR_NUM]; + bool nulls[ACCESS_CONTROL_ATTR_NUM]; + + if(!IS_PGXC_COORDINATOR) + { + elog(ERROR, "can only called on coordinator"); + } + + if (SRF_IS_FIRSTCALL()) + { + MemoryContext oldcontext; + TupleDesc tupdesc; + MemoryContext mycontext; + funcctx = SRF_FIRSTCALL_INIT(); + + oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); + + tupdesc = CreateTemplateTupleDesc(ACCESS_CONTROL_ATTR_NUM, false); + TupleDescInitEntry(tupdesc, (AttrNumber) 1, "gid", + TEXTOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 2, "global_transaction_status", + TEXTOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 3, "operation", + TEXTOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 4, "operation_status", + TEXTOID, -1, 0); + funcctx->tuple_desc = BlessTupleDesc(tupdesc); + + funcctx->user_fctx = (print_txn_info *)palloc0(sizeof(print_txn_info)); + print_txn = (print_txn_info *) funcctx->user_fctx; + + + MemoryContextSwitchTo(oldcontext); + mycontext = AllocSetContextCreate(funcctx->multi_call_memory_ctx, + "clean_check", + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, + ALLOCSET_DEFAULT_MAXSIZE); + oldcontext = MemoryContextSwitchTo(mycontext); + + /*clear Global*/ + ResetGlobalVariables(); + execute = true; + clean_time_interval = PG_GETARG_INT32(0) * 1000000; + if (LEAST_CLEAN_TIME_INTERVAL > clean_time_interval) + { + clean_time_interval = LEAST_CLEAN_TIME_INTERVAL; + } + + /*get node list*/ + PgxcNodeGetOids(&cn_node_list, &dn_node_list, + &cn_nodes_num, &dn_nodes_num, true); + pgxc_clean_node_count = cn_nodes_num + dn_nodes_num; + my_nodeoid = getMyNodeoid(); + cn_health_map = palloc0(cn_nodes_num * sizeof(bool)); + dn_health_map = palloc0(dn_nodes_num * sizeof(bool)); + + /*add my database info*/ + add_database_info(get_database_name(MyDatabaseId)); + + /*get all info of 2PC transactions*/ + getTxnInfoOnNodesAll(); + + /*get txn info on other nodes all*/ + getTxnInfoOnOtherNodesAll(); + + /*recover all 2PC transactions*/ + recover2PCForDatabaseAll(); + + Init_print_txn_info(print_txn); + + print_txn->mycontext = mycontext; + + MemoryContextSwitchTo(oldcontext); + + } + + funcctx = SRF_PERCALL_SETUP(); + print_txn = (print_txn_info *) funcctx->user_fctx; + + if (print_txn->index < print_txn->txn_count) + { + temp_txn = print_txn->txn[print_txn->index]; + strncpy(txn_gid, temp_txn->gid, 100); + strncpy(txn_status, 
txn_status_to_string(temp_txn->global_txn_stat), 100); + strncpy(txn_op, txn_op_to_string(temp_txn->op), 100); + if (temp_txn->op_issuccess) + strncpy(txn_op_issuccess, "success", 100); + else + strncpy(txn_op_issuccess, "fail", 100); + + MemSet(values, 0, sizeof(values)); + MemSet(nulls, 0, sizeof(nulls)); + + values[0] = PointerGetDatum(cstring_to_text(txn_gid)); + values[1] = PointerGetDatum(cstring_to_text(txn_status)); + values[2] = PointerGetDatum(cstring_to_text(txn_op)); + values[3] = PointerGetDatum(cstring_to_text(txn_op_issuccess)); + tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls); + print_txn->index++; + SRF_RETURN_NEXT(funcctx, HeapTupleGetDatum(tuple)); + } + else + { + + //MemoryContextDelete(print_txn->mycontext); + DestroyTxnHash(); + ResetGlobalVariables(); + SRF_RETURN_DONE(funcctx); + } +} + +/* + * clear 2pc after oss detect abnormal node and restart it , + * only clear 2pc belong the abnormal node and before the abnormal time + */ +Datum pg_clean_execute_on_node(PG_FUNCTION_ARGS); +PG_FUNCTION_INFO_V1(pg_clean_execute_on_node); +Datum pg_clean_execute_on_node(PG_FUNCTION_ARGS) +{ +#ifdef ACCESS_CONTROL_ATTR_NUM +#undef ACCESS_CONTROL_ATTR_NUM +#endif +#define ACCESS_CONTROL_ATTR_NUM 4 + FuncCallContext *funcctx; + HeapTuple tuple; + print_txn_info *print_txn = NULL; + txn_info *temp_txn; + char txn_gid[100]; + char txn_status[100]; + char txn_op[100]; + char txn_op_issuccess[100]; + + Datum values[ACCESS_CONTROL_ATTR_NUM]; + bool nulls[ACCESS_CONTROL_ATTR_NUM]; + + if(!IS_PGXC_COORDINATOR) + { + elog(ERROR, "can only called on coordinator"); + } + + if (SRF_IS_FIRSTCALL()) + { + MemoryContext oldcontext; + TupleDesc tupdesc; + MemoryContext mycontext; + funcctx = SRF_FIRSTCALL_INIT(); + + oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); + + tupdesc = CreateTemplateTupleDesc(ACCESS_CONTROL_ATTR_NUM, false); + TupleDescInitEntry(tupdesc, (AttrNumber) 1, "gid", + TEXTOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 2, "global_transaction_status", + TEXTOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 3, "operation", + TEXTOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 4, "operation_status", + TEXTOID, -1, 0); + funcctx->tuple_desc = BlessTupleDesc(tupdesc); + + funcctx->user_fctx = (print_txn_info *)palloc0(sizeof(print_txn_info)); + print_txn = (print_txn_info *) funcctx->user_fctx; + + + MemoryContextSwitchTo(oldcontext); + mycontext = AllocSetContextCreate(funcctx->multi_call_memory_ctx, + "clean_check", + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, + ALLOCSET_DEFAULT_MAXSIZE); + oldcontext = MemoryContextSwitchTo(mycontext); + + /*clear Global*/ + ResetGlobalVariables(); + execute = true; + clear_2pc_belong_node = true; + + abnormal_nodename = text_to_cstring(PG_GETARG_TEXT_P(0)); + abnormal_nodeoid = get_pgxc_nodeoid(abnormal_nodename); + if (InvalidOid == abnormal_nodeoid) + { + elog(ERROR, "pg_clean_execute_on_node, cannot clear 2pc of invalid nodename '%s'", abnormal_nodename); + } + abnormal_time = PG_GETARG_INT64(1); + current_time = GetCurrentTimestamp(); + if (abnormal_time >= current_time) + { + elog(ERROR, "pg_clean_execute_on_node, abnormal time "INT64_FORMAT" must before current_time "INT64_FORMAT, abnormal_time, current_time); + } + + /*get node list*/ + PgxcNodeGetOids(&cn_node_list, &dn_node_list, + &cn_nodes_num, &dn_nodes_num, true); + pgxc_clean_node_count = cn_nodes_num + dn_nodes_num; + my_nodeoid = getMyNodeoid(); + cn_health_map = palloc0(cn_nodes_num * sizeof(bool)); + 
dn_health_map = palloc0(dn_nodes_num * sizeof(bool)); + + /*add my database info*/ + add_database_info(get_database_name(MyDatabaseId)); + + /*get all info of 2PC transactions*/ + getTxnInfoOnNodesAll(); + + /*get txn info on other nodes all*/ + getTxnInfoOnOtherNodesAll(); + + /*recover all 2PC transactions*/ + recover2PCForDatabaseAll(); + + Init_print_txn_info(print_txn); + + print_txn->mycontext = mycontext; + + MemoryContextSwitchTo(oldcontext); + + } + + funcctx = SRF_PERCALL_SETUP(); + print_txn = (print_txn_info *) funcctx->user_fctx; + + if (print_txn->index < print_txn->txn_count) + { + temp_txn = print_txn->txn[print_txn->index]; + strncpy(txn_gid, temp_txn->gid, 100); + strncpy(txn_status, txn_status_to_string(temp_txn->global_txn_stat), 100); + strncpy(txn_op, txn_op_to_string(temp_txn->op), 100); + if (temp_txn->op_issuccess) + strncpy(txn_op_issuccess, "success", 100); + else + strncpy(txn_op_issuccess, "fail", 100); + + MemSet(values, 0, sizeof(values)); + MemSet(nulls, 0, sizeof(nulls)); + + values[0] = PointerGetDatum(cstring_to_text(txn_gid)); + values[1] = PointerGetDatum(cstring_to_text(txn_status)); + values[2] = PointerGetDatum(cstring_to_text(txn_op)); + values[3] = PointerGetDatum(cstring_to_text(txn_op_issuccess)); + tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls); + print_txn->index++; + SRF_RETURN_NEXT(funcctx, HeapTupleGetDatum(tuple)); + } + else + { + DestroyTxnHash(); + pfree(abnormal_nodename); + ResetGlobalVariables(); + SRF_RETURN_DONE(funcctx); + } +} + + +Datum pg_clean_check_txn(PG_FUNCTION_ARGS); +PG_FUNCTION_INFO_V1(pg_clean_check_txn); +Datum pg_clean_check_txn(PG_FUNCTION_ARGS) +{ +#ifdef ACCESS_CONTROL_ATTR_NUM +#undef ACCESS_CONTROL_ATTR_NUM +#endif +#define ACCESS_CONTROL_ATTR_NUM 4 + FuncCallContext *funcctx; + HeapTuple tuple; + print_status *pstatus = NULL; + + Datum values[ACCESS_CONTROL_ATTR_NUM]; + bool nulls[ACCESS_CONTROL_ATTR_NUM]; + execute = false; + + if(!IS_PGXC_COORDINATOR) + { + elog(ERROR, "can only called on coordinator"); + } + + if (SRF_IS_FIRSTCALL()) + { + MemoryContext oldcontext; + MemoryContext mycontext; + TupleDesc tupdesc; + funcctx = SRF_FIRSTCALL_INIT(); + + oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); + + tupdesc = CreateTemplateTupleDesc(ACCESS_CONTROL_ATTR_NUM, false); + TupleDescInitEntry(tupdesc, (AttrNumber) 1, "gid", + TEXTOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 2, "database", + TEXTOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 3, "global_transaction_status", + TEXTOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 4, "transaction_status_on_allnodes", + TEXTOID, -1, 0); + funcctx->tuple_desc = BlessTupleDesc(tupdesc); + + funcctx->user_fctx = (print_status *)palloc0(sizeof(print_status)); + pstatus = (print_status *) funcctx->user_fctx; + pstatus->index = pstatus->count = 0; + pstatus->gid = NULL; + pstatus->global_status = pstatus->status = (char **)NULL; + pstatus->database = NULL; + pstatus->mycontext = NULL; + + + MemoryContextSwitchTo(oldcontext); + + mycontext = AllocSetContextCreate(funcctx->multi_call_memory_ctx, + "clean_check", + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, + ALLOCSET_DEFAULT_MAXSIZE); + oldcontext = MemoryContextSwitchTo(mycontext); + + /*clear Global*/ + ResetGlobalVariables(); + + clean_time_interval = PG_GETARG_INT32(0) * 1000000; + if (LEAST_CLEAN_TIME_INTERVAL > clean_time_interval) + { + clean_time_interval = LEAST_CLEAN_TIME_INTERVAL; + } + /*get node list*/ + PgxcNodeGetOids(&cn_node_list, 
&dn_node_list, + &cn_nodes_num, &dn_nodes_num, true); + if (cn_node_list == NULL || dn_node_list == NULL) + elog(ERROR, "pg_clean:fail to get cn_node_list and dn_node_list"); + pgxc_clean_node_count = cn_nodes_num + dn_nodes_num; + my_nodeoid = getMyNodeoid(); + cn_health_map = palloc0(cn_nodes_num * sizeof(bool)); + dn_health_map = palloc0(dn_nodes_num * sizeof(bool)); + + /*get all database info*/ + getDatabaseList(); + + /*get all info of 2PC transactions*/ + getTxnInfoOnNodesAll(); + + /*get txn info on other nodes all*/ + getTxnInfoOnOtherNodesAll(); + + /*recover all 2PC transactions*/ + Init_print_stats_all(pstatus); + + pstatus->mycontext = mycontext; + + MemoryContextSwitchTo(oldcontext); + + } + + funcctx = SRF_PERCALL_SETUP(); + pstatus = (print_status *) funcctx->user_fctx; + + if (pstatus->index < pstatus->count) + { + MemSet(values, 0, sizeof(values)); + MemSet(nulls, 0, sizeof(nulls)); + + values[0] = PointerGetDatum(cstring_to_text(pstatus->gid[pstatus->index])); + values[1] = PointerGetDatum(cstring_to_text(pstatus->database[pstatus->index])); + values[2] = PointerGetDatum(cstring_to_text(pstatus->global_status[pstatus->index])); + values[3] = PointerGetDatum(cstring_to_text(pstatus->status[pstatus->index])); + tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls); + pstatus->index++; + SRF_RETURN_NEXT(funcctx, HeapTupleGetDatum(tuple)); + } + else + { + /* + MemoryContextDelete(pstatus->mycontext); + DropDatabaseInfo(); + */ + DestroyTxnHash(); + ResetGlobalVariables(); + SRF_RETURN_DONE(funcctx); + } +} + +void DestroyTxnHash(void) +{ + database_info *dbinfo = head_database_info; + while (dbinfo) + { + hash_destroy(dbinfo->all_txn_info); + dbinfo = dbinfo->next; + } +} + +static void ResetGlobalVariables(void) +{ + cn_node_list = NULL; + dn_node_list = NULL; + cn_health_map = NULL; + dn_health_map = NULL; + cn_nodes_num = 0; + dn_nodes_num = 0; + pgxc_clean_node_count = 0; + execute = false; + total_twopc_txn = 0; + + head_database_info = last_database_info = NULL; + + current_time = 0; + abnormal_time = InvalidGlobalTimestamp; + abnormal_nodename = NULL; + abnormal_nodeoid = InvalidOid; + clear_2pc_belong_node = false; + +} + +static Oid getMyNodeoid(void) +{ + return get_pgxc_nodeoid(PGXCNodeName); +} + +/* + * execute_query_on_single_node -- execute query on certain node and get results + * input: node oid, execute query, number of attribute in results, results + * return: (Datum) 0 + */ +static Datum +execute_query_on_single_node(Oid node, const char *query, int attnum, TupleTableSlots *tuples) //delete numnodes, delete nodelist, insert node +{ + int ii; + bool issuccess = false; + + /*check health of node*/ + bool ishealthy = check_node_health(node); + +#ifdef XCP + EState *estate; + MemoryContext oldcontext; + RemoteQuery *plan; + RemoteQueryState *pstate; + TupleTableSlot *result = NULL; + Var *dummy; + char ntype = PGXC_NODE_NONE; + + /* + * Make up RemoteQuery plan node + */ + plan = makeNode(RemoteQuery); + plan->combine_type = COMBINE_TYPE_NONE; + plan->exec_nodes = makeNode(ExecNodes); + plan->exec_type = EXEC_ON_NONE; + + plan->exec_nodes->nodeList = lappend_int(plan->exec_nodes->nodeList, + PGXCNodeGetNodeId(node, &ntype)); + if (ntype == PGXC_NODE_NONE) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Unknown node Oid: %u", node))); + else if (ntype == PGXC_NODE_COORDINATOR) + { + plan->exec_type = EXEC_ON_COORDS; + } + else + { + plan->exec_type = EXEC_ON_DATANODES; + } + + plan->sql_statement = (char *)query; + plan->force_autocommit 
= false; + /* + * We only need the target entry to determine result data type. + * So create dummy even if real expression is a function. + */ + for (ii = 1; ii <= attnum; ii++) + { + dummy = makeVar(1, ii, TEXTOID, 0, InvalidOid, 0); + plan->scan.plan.targetlist = lappend(plan->scan.plan.targetlist, + makeTargetEntry((Expr *) dummy, ii, NULL, false)); + } + /* prepare to execute */ + estate = CreateExecutorState(); + oldcontext = MemoryContextSwitchTo(estate->es_query_cxt); + estate->es_snapshot = GetActiveSnapshot(); + pstate = ExecInitRemoteQuery(plan, estate, 0); + MemoryContextSwitchTo(oldcontext); + + /*execute query on node when node is healthy*/ + INIT(tuples->slot); + tuples->attnum = 0; + if (ishealthy) + { + int i_tuple = 0; + int i_attnum = 0; + issuccess = true; + result = ExecRemoteQuery((PlanState *) pstate); + tuples->attnum = attnum; + while (result != NULL && !TupIsNull(result)) + { + slot_getallattrs(result); + RPALLOC(tuples->slot); + tuples->slot[i_tuple] = (char **) palloc0(attnum * sizeof(char *)); + + for (i_attnum = 0; i_attnum < attnum; i_attnum++) + { + /*if (result->tts_values[i_attnum] != (Datum)0)*/ + if (result->tts_isnull[i_attnum] == false) + { + tuples->slot[i_tuple][i_attnum] = text_to_cstring(DatumGetTextP(result->tts_values[i_attnum])); + } + else + { + tuples->slot[i_tuple][i_attnum] = NULL; + } + } + tuples->slot_count++; + + result = ExecRemoteQuery((PlanState *) pstate); + i_tuple++; + } + } + ExecEndRemoteQuery(pstate); +#endif + return issuccess == true ? (Datum) 1 : (Datum) 0; +} + +static bool check_node_health(Oid node_oid) +{ + int i; + bool ishealthy = false; + + PoolPingNodeRecheck(node_oid); + PgxcNodeGetHealthMap(cn_node_list, dn_node_list, + &cn_nodes_num, &dn_nodes_num, + cn_health_map, dn_health_map); + if (get_pgxc_nodetype(node_oid) == 'C') + { + for (i = 0; i < cn_nodes_num; i++) + { + if (cn_node_list[i] == node_oid) + { + ishealthy = cn_health_map[i]; + } + } + } + else + { + for (i = 0; i < dn_nodes_num; i++) + { + if (dn_node_list[i] == node_oid) + { + ishealthy = dn_health_map[i]; + } + } + } + return ishealthy; +} + +static void getDatabaseList(void) +{ + int i; + TupleTableSlots result_db; + const char *query_db = "select datname::text from pg_database;"; + /*add datname into tail of head_database_info*/ + if (execute_query_on_single_node(my_nodeoid, query_db, 1, &result_db) == (Datum) 1) + { + for (i = 0; i < result_db.slot_count; i++) + { + if (TTSgetvalue(&result_db, i, 0)) + { + add_database_info(TTSgetvalue(&result_db, i, 0)); + } + } + } + else + { + elog(LOG, "pg_clean: failed get database list on node %s", get_pgxc_nodename(my_nodeoid)); + } + DropTupleTableSlots(&result_db); +} + +/* + * TTSgetvalue -- get attribute from TupleTableSlots + * input: result, index of tuple, index of field + * return: attribute result + */ +static char * TTSgetvalue(TupleTableSlots *result, int tup_num, int field_num) +{ + return result->slot[tup_num][field_num]; +} + +static void DropTupleTableSlots(TupleTableSlots * +Slots) +{ + int i; + int j; + for (i = 0; i < Slots->slot_count; i++) + { + if (Slots->slot[i]) + { + for (j = 0; j < Slots->attnum; j++) + { + if (Slots->slot[i][j]) + { + pfree(Slots->slot[i][j]); + } + } + pfree(Slots->slot[i]); + } + } + RFREE(Slots->slot); + Slots->attnum = 0; + return; +} + +static void getTxnInfoOnNodesAll(void) +{ + int i; + current_time = GetCurrentTimestamp(); + /*upload 2PC transaction from CN*/ + for (i = 0; i < cn_nodes_num; i++) + { + if (total_twopc_txn >= MAX_TWOPC_TXN) + return; + 
getTxnInfoOnNode(cn_node_list[i]); + } + + /*upload 2PC transaction from DN*/ + for (i = 0; i < dn_nodes_num; i++) + { + if (total_twopc_txn >= MAX_TWOPC_TXN) + return; + getTxnInfoOnNode(dn_node_list[i]); + } +} + +void getTxnInfoOnNode(Oid node) +{ + int i; + TupleTableSlots result_txn; + Datum execute_res; + char query_execute[1024]; + const char *query_txn_status = "select transaction::text, gid::text, owner::text, database::text, timestamptz_out(prepared)::text " + "from pg_prepared_xacts;"; + const char *query_txn_status_execute = "select transaction::text, gid::text, owner::text, database::text, timestamptz_out(prepared)::text " + "from pg_prepared_xacts where database = '%s';"; + snprintf(query_execute, 1024, query_txn_status_execute, get_database_name(MyDatabaseId)); + + if (execute) + execute_res = execute_query_on_single_node(node, query_execute, 5, &result_txn); + else + execute_res = execute_query_on_single_node(node, query_txn_status, 5, &result_txn); + + if (execute_res == (Datum) 1) + { + for (i = 0; i < result_txn.slot_count; i++) + { + uint32 xid; + char* gid; + char* owner; + char* datname; + TimestampTz prepared_time; + + /*read results from each tuple*/ + xid = strtoul(TTSgetvalue(&result_txn, i, 0), NULL, 10); + gid = TTSgetvalue(&result_txn, i, 1); + owner = TTSgetvalue(&result_txn, i, 2); + datname = TTSgetvalue(&result_txn, i, 3); + prepared_time = DatumGetTimestampTz(DirectFunctionCall3(timestamptz_in, + CStringGetDatum(TTSgetvalue(&result_txn, i, 4)), + ObjectIdGetDatum(InvalidOid), + Int32GetDatum(-1))); + + /*add txn to database*/ + add_txn_info(datname, node, xid, gid, owner, prepared_time, TXN_STATUS_PREPARED); + if (total_twopc_txn >= MAX_TWOPC_TXN) + { + break; + } + } + } + else + { + elog(LOG, "pg_clean: failed get database list on node %s", get_pgxc_nodename(node)); + } + DropTupleTableSlots(&result_txn); +} + +void add_txn_info(char* dbname, Oid node_oid, uint32 xid, char * gid, + char * owner, TimestampTz prepared_time, TXN_STATUS status) +{ + txn_info *txn = NULL; + int nodeidx; + + if ((txn = find_txn(gid)) == NULL) + { + txn = make_txn_info(dbname, gid, owner); + total_twopc_txn++; + if (txn == NULL) + { + /*no more memory*/ + elog(ERROR, "there is no more memory for palloc a 2PC transaction"); + } + } + nodeidx = find_node_index(node_oid); + txn->txn_stat[nodeidx] = status; + txn->xid[nodeidx] = xid; + txn->prepare_timestamp[nodeidx] = prepared_time; + if (nodeidx < cn_nodes_num) + { + txn->coordparts[nodeidx] = 1; + txn->num_coordparts++; + } + else + { + txn->dnparts[nodeidx-cn_nodes_num] = 1; + txn->num_dnparts++; + } + return; +} + +TWOPHASE_FILE_STATUS GetTransactionPartNodes(txn_info *txn, Oid node_oid) +{ + /*get all the participates and initiate to each transactions*/ + TWOPHASE_FILE_STATUS res = TWOPHASE_FILE_NOT_EXISTS; + TupleTableSlots result; + char *partnodes = NULL; + char *startnode = NULL; + char *file_content = NULL; + uint32 startxid = 0; + char *str_startxid = NULL; + char *str_timestamp = NULL; + char *temp = NULL; + Oid temp_nodeoid; + char temp_nodetype; + int temp_nodeidx; + char stmt[1024]; + static const char *STMT_FORM = "select pgxc_get_2pc_file('%s')::text"; + snprintf(stmt, 1024, STMT_FORM, txn->gid, txn->gid, txn->gid, txn->gid); + + if (execute_query_on_single_node(node_oid, stmt, 1, &result) == (Datum) 1) + { + if (result.slot_count && TTSgetvalue(&result, 0, 0)) +#if 0 + TTSgetvalue(&result, 0, 0) && + TTSgetvalue(&result, 0, 1) && + TTSgetvalue(&result, 0, 2)) +#endif + { + file_content = TTSgetvalue(&result, 0, 
0); + + if (!IsXidImplicit(txn->gid) && strstr(file_content, GET_READONLY)) + { + txn->is_readonly = true; + txn->global_txn_stat = TXN_STATUS_COMMITTED; + DropTupleTableSlots(&result); + return TWOPHASE_FILE_EXISTS; + } + startnode = strstr(file_content, GET_START_NODE); + str_startxid = strstr(file_content, GET_START_XID); + partnodes = strstr(file_content, GET_NODE); + temp = strstr(file_content, GET_COMMIT_TIMESTAMP); + + /* get the last global_commit_timestamp */ + while (temp) + { + str_timestamp = temp; + temp += strlen(GET_COMMIT_TIMESTAMP); + temp = strstr(temp, GET_COMMIT_TIMESTAMP); + } + + if (startnode) + { + startnode += strlen(GET_START_NODE); + startnode = strtok(startnode, "\n"); + txn->origcoord = get_pgxc_nodeoid(startnode); + } + + if (str_startxid) + { + str_startxid += strlen(GET_START_XID); + str_startxid = strtok(str_startxid, "\n"); + startxid = strtoul(str_startxid, NULL, 10); + txn->startxid = startxid; + } + + if (partnodes) + { + partnodes += strlen(GET_NODE); + partnodes = strtok(partnodes, "\n"); + txn->participants = (char *) palloc0(strlen(partnodes) + 1); + strncpy(txn->participants, partnodes, strlen(partnodes) + 1); + } + + if (NULL == startnode || NULL == str_startxid) + { + res = TWOPHASE_FILE_OLD; + DropTupleTableSlots(&result); + return res; + } + + if (NULL == partnodes) + { + res = TWOPHASE_FILE_ERROR; + DropTupleTableSlots(&result); + return res; + } + + if (str_timestamp) + { + str_timestamp += strlen(GET_COMMIT_TIMESTAMP); + str_timestamp = strtok(str_timestamp, "\n"); + txn->global_commit_timestamp = strtoull(str_timestamp, NULL, 10); + } + + elog(DEBUG1, "get 2pc txn:%s partnodes in nodename: %s (nodeoid:%u) result: partnodes:%s, startnode:%s, startnodeoid:%u, startxid:%u", + txn->gid, get_pgxc_nodename(node_oid), node_oid, partnodes, startnode, txn->origcoord, startxid); + /* in explicit transaction startnode participate the transaction */ + if (strstr(partnodes, startnode) || !IsXidImplicit(txn->gid)) + { + txn->isorigcoord_part = true; + } + else + { + txn->isorigcoord_part = false; + } + + res = TWOPHASE_FILE_EXISTS; + txn->num_coordparts = 0; + txn->num_dnparts = 0; + temp = strtok(partnodes,", "); + while(temp) + { + /*check node type*/ + temp_nodeoid = get_pgxc_nodeoid(temp); + if (temp_nodeoid == InvalidOid) + { + res = TWOPHASE_FILE_ERROR; + break; + } + temp_nodetype = get_pgxc_nodetype(temp_nodeoid); + temp_nodeidx = find_node_index(temp_nodeoid); + + switch (temp_nodetype) + { + case 'C': + txn->coordparts[temp_nodeidx] = 1; + txn->num_coordparts++; + break; + case 'D': + txn->dnparts[temp_nodeidx-cn_nodes_num] = 1; + txn->num_dnparts++; + break; + default: + elog(ERROR,"nodetype of %s is not 'C' or 'D'", temp); + break; + } + temp = strtok(NULL,", "); + } + } + } + else + { + elog(LOG, "pg_clean: failed get database list on node %s", get_pgxc_nodename(node_oid)); + res = TWOPHASE_FILE_ERROR; + } + DropTupleTableSlots(&result); + return res; +} + +static txn_info *find_txn(char *gid) +{ + bool found; + database_info *cur_db; + txn_info *txn; + + for (cur_db = head_database_info; cur_db; cur_db = cur_db->next) + { +#if 0 + for (cur_txn = cur_db->head_txn_info; cur_txn; cur_txn = cur_txn->next) + { + if (0 == strcmp(cur_txn->gid, gid)) + return cur_txn; + } +#endif + txn = (txn_info *)hash_search(cur_db->all_txn_info, (void *)gid, HASH_FIND, &found); + if (found) + return txn; + } + return NULL; +} + +txn_info* make_txn_info(char* dbname, char* gid, char* owner) +{ + bool found; + txn_info *txn_insert_pos = NULL; + database_info 
*dbinfo; + txn_info *txn; + + dbinfo = add_database_info(dbname); + txn = (txn_info *)palloc0(sizeof(txn_info)); + if (txn == NULL) + return NULL; + //txn->next = NULL; + + //txn->gid = (char *)palloc0(strlen(gid)+1); + strncpy(txn->gid, gid, strlen(gid)+1); + txn->owner = (char *)palloc0(strlen(owner)+1); + strncpy(txn->owner, owner, strlen(owner)+1); + + txn->txn_stat = (TXN_STATUS *)palloc0(sizeof(TXN_STATUS) * pgxc_clean_node_count); + txn->xid = (uint32 *)palloc0(sizeof(uint32) * pgxc_clean_node_count); + txn->prepare_timestamp = (TimestampTz *)palloc0(sizeof(TimestampTz) * pgxc_clean_node_count); + txn->coordparts = (int *)palloc0(cn_nodes_num * sizeof(int)); + + txn->dnparts = (int *)palloc0(dn_nodes_num * sizeof(int)); + if (txn->gid == NULL || txn->owner == NULL || txn->txn_stat == NULL + || txn->xid == NULL || txn->coordparts == NULL || txn->dnparts == NULL || txn->prepare_timestamp == NULL) + { + pfree(txn); + return(NULL); + } + + txn_insert_pos = (txn_info *)hash_search(dbinfo->all_txn_info, + (void *)txn->gid, HASH_ENTER, &found); + if (!found) + memcpy(txn_insert_pos, txn, sizeof(txn_info)); + +#if 0 + if (dbinfo->head_txn_info == NULL) + { + dbinfo->head_txn_info = dbinfo->last_txn_info = txn; + } + else + { + dbinfo->last_txn_info->next = txn; + dbinfo->last_txn_info = txn; + } +#endif + + return txn_insert_pos; +} + +database_info *find_database_info(char *database_name) +{ + database_info *cur_database_info = head_database_info; + + for (;cur_database_info; cur_database_info = cur_database_info->next) + { + if(cur_database_info->database_name && + database_name && + strcmp(cur_database_info->database_name, database_name) == 0) + return(cur_database_info); + } + return(NULL); +} + +database_info *add_database_info(char *database_name) +{ + database_info *rv; + HASHCTL txn_ctl; + char tabname[MAX_GID]; + + if ((rv = find_database_info(database_name)) != NULL) + return rv; /* Already in the list */ + rv = (database_info *)palloc0(sizeof(database_info)); + if (rv == NULL) + return NULL; + rv->next = NULL; + rv->database_name = (char *)palloc0(strlen(database_name) + 1); + strncpy(rv->database_name, database_name, strlen(database_name) + 1); + if (rv->database_name == NULL) + { + pfree(rv); + return NULL; + } +#if 0 + rv->head_txn_info = NULL; + rv->last_txn_info = NULL; +#endif + + snprintf(tabname, 64, "%s txn info", rv->database_name); + txn_ctl.keysize = MAX_GID; + txn_ctl.entrysize = sizeof(txn_info); + rv->all_txn_info = hash_create(tabname, 64, + &txn_ctl, HASH_ELEM); + if (head_database_info == NULL) + { + head_database_info = last_database_info = rv; + return rv; + } + else + { + last_database_info->next = rv; + last_database_info = rv; + return rv; + } +} + +int find_node_index(Oid node_oid) +{ + int res; + int i; + if (get_pgxc_nodetype(node_oid) == 'C') + { + for (i = 0; i < cn_nodes_num; i++) + { + if (node_oid == cn_node_list[i]) + { + res = i; + break; + } + } + } + else + { + for (i = 0; i < dn_nodes_num; i++) + { + if (node_oid == dn_node_list[i]) + { + res = i+cn_nodes_num; + break; + } + } + } + return res; +} + +Oid find_node_oid(int node_idx) +{ + return (node_idx < cn_nodes_num) ? 
cn_node_list[node_idx] : + dn_node_list[node_idx-cn_nodes_num]; +} + +void getTxnInfoOnOtherNodesAll(void) +{ + database_info *cur_database; + + for (cur_database = head_database_info; cur_database; cur_database = cur_database->next) + { + getTxnInfoOnOtherNodesForDatabase(cur_database); + } +} + +void getTxnInfoOnOtherNodesForDatabase(database_info *database) +{ + txn_info *cur_txn; + HASH_SEQ_STATUS status; + HTAB *txn = database->all_txn_info; + hash_seq_init(&status, txn); + + while ((cur_txn = (txn_info *) hash_seq_search(&status)) != NULL) + { + getTxnInfoOnOtherNodes(cur_txn); + } +#if 0 + for (cur_txn = database->head_txn_info; cur_txn; cur_txn = cur_txn->next) + { + getTxnInfoOnOtherNodes(cur_txn); + } +#endif +} + +void getTxnInfoOnOtherNodes(txn_info *txn) +{ + int ii; + int ret; + char node_type; + TWOPHASE_FILE_STATUS status = TWOPHASE_FILE_NOT_EXISTS; + Oid node_oid; + uint32 transactionid = 0; + char gid[MAX_GID]; + char *ptr = NULL; + + if (IsXidImplicit(txn->gid)) + { + strncpy(gid, txn->gid, strlen(txn->gid)+1); + ptr = strtok(gid, ":"); + ptr = strtok(NULL, ":"); + node_oid = get_pgxc_nodeoid(ptr); + status = GetTransactionPartNodes(txn, node_oid); + } + else + { + for (ii = 0; ii < cn_nodes_num + dn_nodes_num; ii++) + { + if (ii < cn_nodes_num) + { + status = GetTransactionPartNodes(txn, cn_node_list[ii]); + if (TWOPHASE_FILE_EXISTS == status || + TWOPHASE_FILE_OLD == status || + TWOPHASE_FILE_ERROR == status) + { + node_oid = cn_node_list[ii]; + break; + } + } + else + { + status = GetTransactionPartNodes(txn, dn_node_list[ii - cn_nodes_num]); + if (TWOPHASE_FILE_EXISTS == status || + TWOPHASE_FILE_OLD == status || + TWOPHASE_FILE_ERROR == status) + { + node_oid = dn_node_list[ii - cn_nodes_num]; + break; + } + } + } + + /* since there may be explicit readonly twophase transactions */ + if (txn->is_readonly) + { + return; + } + if (TWOPHASE_FILE_EXISTS == status && + InvalidGlobalTimestamp == txn->global_commit_timestamp && + node_oid != txn->origcoord) + { + status = GetTransactionPartNodes(txn, txn->origcoord); + } + + } + + if (TWOPHASE_FILE_EXISTS != status) + { + /* + * if 2pc file not exists in all nodes, the trans did not pass the prepared phase, + * + */ + txn->global_txn_stat = (TWOPHASE_FILE_NOT_EXISTS == status) ? 
+ TXN_STATUS_ABORTED : TXN_STATUS_UNKNOWN; + return; + } + + + /* judge the range of global status */ + CheckFirstPhase(txn); + + for (ii = 0; ii < pgxc_clean_node_count; ii++) + { + if (txn->txn_stat[ii] == TXN_STATUS_INITIAL) + { + /*check node ii is 'C' or 'D'*/ + node_oid = find_node_oid(ii); + if (node_oid == txn->origcoord) + continue; + node_type = get_pgxc_nodetype(node_oid); + if (node_type == 'C' && txn->coordparts[ii] != 1) + continue; + if (node_type == 'D' && txn->dnparts[ii - cn_nodes_num] != 1) + continue; + /*check coordparts or dnparts*/ + if (txn->xid[ii] == 0) + { + ret = Get2PCXidByGid(node_oid, txn->gid, &transactionid); + if (ret == XIDFOUND) + { + txn->xid[ii] = transactionid; + if (txn->xid[ii] > 0) + getTxnStatus(txn, ii); + } + else if (ret == XIDNOTFOUND) + { + if (txn->after_first_phase) + txn->txn_stat[ii] = TXN_STATUS_COMMITTED; + } + else + txn->txn_stat[ii] = TXN_STATUS_UNKNOWN; + + } + } + } +} + +/*get xid by gid on node_oid*/ +int Get2PCXidByGid(Oid node_oid, char *gid, uint32 *transactionid) +{ + int ret = XIDFOUND; + TupleTableSlots result; + uint32 xid = 0; + static const char *STMT_FORM = "select pgxc_get_2pc_xid('%s')::text;"; + char stmt[100]; + snprintf(stmt, 100, STMT_FORM, gid); + /*if exist get xid by gid on node_oid*/ + if (execute_query_on_single_node(node_oid, stmt, 1, &result) != (Datum) 0) + { + if (result.slot_count) + { + if (TTSgetvalue(&result, 0, 0)) + { + xid = strtoul(TTSgetvalue(&result, 0, 0), NULL, 10); + *transactionid = xid; + if (xid == 0) + ret = XIDNOTFOUND; + } + else + ret = XIDNOTFOUND; + } + else + ret = XIDNOTFOUND; + } + else + ret = XIDEXECFAIL; + DropTupleTableSlots(&result); + return ret; +} + +int Get2PCFile(Oid node_oid, char * gid, uint32 * transactionid) +{ + int ret = FILEFOUND; + TupleTableSlots result; + static const char *STMT_FORM = "select pgxc_get_2pc_file('%s')::text;"; + char stmt[100]; + snprintf(stmt, 100, STMT_FORM, gid); + /*if exist get xid by gid on node_oid*/ + if (execute_query_on_single_node(node_oid, stmt, 1, &result) != (Datum) 0) + { + if (result.slot_count) + { + if (!TTSgetvalue(&result, 0, 0)) + { + ret = FILENOTFOUND; + } + else + { + ret = FILEFOUND; + } + } + else + ret = FILENOTFOUND; + } + else + ret = FILEUNKOWN; + DropTupleTableSlots(&result); + return ret; +} + + +void getTxnStatus(txn_info *txn, int node_idx) +{ + Oid node_oid; + char stmt[1024]; + char *att1; + TupleTableSlots result; + + static const char *STMT_FORM = "SELECT pgxc_is_committed('%d'::xid)::text"; + snprintf(stmt, 1024, STMT_FORM, txn->xid[node_idx], txn->xid[node_idx]); + + node_oid = find_node_oid(node_idx); + if (0 != execute_query_on_single_node(node_oid, stmt, 1, &result)) + { + att1 = TTSgetvalue(&result, 0, 0); + + if (att1) + { + if (strcmp(att1, "true") == 0) + { + txn->txn_stat[node_idx] = TXN_STATUS_COMMITTED; + } + else + txn->txn_stat[node_idx] = TXN_STATUS_ABORTED; + } + else + { + txn->txn_stat[node_idx] = TXN_STATUS_INITIAL; + } + } + else + txn->txn_stat[node_idx] = TXN_STATUS_UNKNOWN; + DropTupleTableSlots(&result); +} + +Datum pgxc_get_2pc_file(PG_FUNCTION_ARGS); +PG_FUNCTION_INFO_V1(pgxc_get_2pc_file); +Datum pgxc_get_2pc_file(PG_FUNCTION_ARGS) +{ + char *tid; + char path[MAXPGPATH]; + File fd; + int ret; + char *result; + text *t_result = NULL; + struct stat filestate; + off_t fileSize; + + tid = text_to_cstring(PG_GETARG_TEXT_P(0)); + + snprintf(path, MAXPGPATH, TWOPHASE_RECORD_DIR "/%s", tid); + + if(access(path, F_OK) == 0) + { + if(stat(path, &filestate) == -1) + { + ereport(ERROR, + 
(errcode_for_file_access(), + errmsg("could not get status of file \"%s\"", path))); + } + + fileSize = filestate.st_size; + + if (0 == fileSize) + { + PG_RETURN_NULL(); + } + + result = (char *)palloc0(fileSize + 1); + + fd = PathNameOpenFile(path, O_RDONLY, S_IRUSR | S_IWUSR); + if (fd < 0) + { + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open file \"%s\" for read", path))); + } + + ret = FileRead(fd, result, fileSize, WAIT_EVENT_BUFFILE_READ); + + if(ret != fileSize) + { + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read file \"%s\"", path))); + } + + FileClose(fd); + if (result) + { + t_result = cstring_to_text(result); + return PointerGetDatum(t_result); + } + } + PG_RETURN_NULL(); +} + + +Datum pgxc_get_2pc_nodes(PG_FUNCTION_ARGS); +PG_FUNCTION_INFO_V1(pgxc_get_2pc_nodes); +Datum pgxc_get_2pc_nodes(PG_FUNCTION_ARGS) +{ + char *tid; + char path[MAXPGPATH]; + File fd; + int ret; + char *result; + char *nodename; + text *t_result = NULL; + struct stat filestate; + off_t fileSize; + + tid = text_to_cstring(PG_GETARG_TEXT_P(0)); + + snprintf(path, MAXPGPATH, TWOPHASE_RECORD_DIR "/%s", tid); + + if(access(path, F_OK) == 0) + { + if(stat(path, &filestate) == -1) + { + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not get status of file \"%s\"", path))); + } + + fileSize = filestate.st_size; + + result = (char *)palloc0(fileSize + 1); + + fd = PathNameOpenFile(path, O_RDONLY, S_IRUSR | S_IWUSR); + if (fd < 0) + { + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open file \"%s\" for read", path))); + } + + ret = FileRead(fd, result, fileSize, WAIT_EVENT_BUFFILE_READ); + + if(ret != fileSize) + { + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read file \"%s\"", path))); + } + + FileClose(fd); + if (result) + { + nodename = strstr(result, GET_NODE); + if (nodename) + { + nodename += strlen(GET_NODE); + nodename = strtok(nodename, "\n"); + t_result = cstring_to_text(nodename); + return PointerGetDatum(t_result); + } + } + } + PG_RETURN_NULL(); +} + +Datum pgxc_get_2pc_startnode(PG_FUNCTION_ARGS); +PG_FUNCTION_INFO_V1(pgxc_get_2pc_startnode); +Datum pgxc_get_2pc_startnode(PG_FUNCTION_ARGS) +{ + char *tid; + char path[MAXPGPATH]; + File fd; + int ret; + char *result; + char *nodename; + text *t_result = NULL; + struct stat filestate; + off_t fileSize; + + tid = text_to_cstring(PG_GETARG_TEXT_P(0)); + + snprintf(path, MAXPGPATH, TWOPHASE_RECORD_DIR "/%s", tid); + + if(access(path, F_OK) == 0) + { + if(stat(path, &filestate) == -1) + { + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not get status of file \"%s\"", path))); + } + + fileSize = filestate.st_size; + + result = (char *)palloc0(fileSize + 1); + + fd = PathNameOpenFile(path, O_RDONLY, S_IRUSR | S_IWUSR); + if (fd < 0) + { + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open file \"%s\" for read", path))); + } + + ret = FileRead(fd, result, fileSize, WAIT_EVENT_BUFFILE_READ); + + if(ret != fileSize) + { + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read file \"%s\"", path))); + } + + FileClose(fd); + if (result) + { + nodename = strstr(result, GET_START_NODE); + if (nodename) + { + nodename += strlen(GET_START_NODE); + nodename = strtok(nodename, "\n"); + t_result = cstring_to_text(nodename); + return PointerGetDatum(t_result); + } + } + } + PG_RETURN_NULL(); +} + +Datum pgxc_get_2pc_startxid(PG_FUNCTION_ARGS); +PG_FUNCTION_INFO_V1(pgxc_get_2pc_startxid); +Datum 
pgxc_get_2pc_startxid(PG_FUNCTION_ARGS) +{ + char *tid; + char path[MAXPGPATH]; + File fd; + int ret; + char *result; + char *startxid; + text *t_result = NULL; + struct stat filestate; + off_t fileSize; + + tid = text_to_cstring(PG_GETARG_TEXT_P(0)); + + snprintf(path, MAXPGPATH, TWOPHASE_RECORD_DIR "/%s", tid); + + if(access(path, F_OK) == 0) + { + if(stat(path, &filestate) == -1) + { + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not get status of file \"%s\"", path))); + } + + fileSize = filestate.st_size; + + result = (char *)palloc0(fileSize + 1); + + fd = PathNameOpenFile(path, O_RDONLY, S_IRUSR | S_IWUSR); + if (fd < 0) + { + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open file \"%s\" for read", path))); + } + + ret = FileRead(fd, result, fileSize, WAIT_EVENT_BUFFILE_READ); + + if(ret != fileSize) + { + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read file \"%s\"", path))); + } + + FileClose(fd); + if (result) + { + startxid = strstr(result, GET_START_XID); + if (startxid) + { + startxid += strlen(GET_START_XID); + startxid = strtok(startxid, "\n"); + t_result = cstring_to_text(startxid); + return PointerGetDatum(t_result); + } + } + } + PG_RETURN_NULL(); +} + + +Datum pgxc_get_2pc_commit_timestamp(PG_FUNCTION_ARGS); +PG_FUNCTION_INFO_V1(pgxc_get_2pc_commit_timestamp); +Datum pgxc_get_2pc_commit_timestamp(PG_FUNCTION_ARGS) +{ + char *tid; + char path[MAXPGPATH]; + File fd; + int ret; + char *result; + char *commit_timestamp; + text *t_result = NULL; + struct stat filestate; + off_t fileSize; + + tid = text_to_cstring(PG_GETARG_TEXT_P(0)); + + snprintf(path, MAXPGPATH, TWOPHASE_RECORD_DIR "/%s", tid); + + if(access(path, F_OK) == 0) + { + if(stat(path, &filestate) == -1) + { + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not get status of file \"%s\"", path))); + } + + fileSize = filestate.st_size; + + result = (char *)palloc0(fileSize + 1); + + fd = PathNameOpenFile(path, O_RDONLY, S_IRUSR | S_IWUSR); + if (fd < 0) + { + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open file \"%s\" for read", path))); + } + + ret = FileRead(fd, result, fileSize, WAIT_EVENT_BUFFILE_READ); + + if(ret != fileSize) + { + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read file \"%s\"", path))); + } + + FileClose(fd); + if (result) + { + commit_timestamp = strstr(result, GET_COMMIT_TIMESTAMP); + if (commit_timestamp) + { + commit_timestamp += strlen(GET_COMMIT_TIMESTAMP); + commit_timestamp = strtok(commit_timestamp, "\n"); + t_result = cstring_to_text(commit_timestamp); + return PointerGetDatum(t_result); + } + } + } + PG_RETURN_NULL(); +} + + + +Datum pgxc_get_2pc_xid(PG_FUNCTION_ARGS); +PG_FUNCTION_INFO_V1(pgxc_get_2pc_xid); +Datum pgxc_get_2pc_xid(PG_FUNCTION_ARGS) +{ + char *tid; + char path[MAXPGPATH]; + File fd; + int ret; + GlobalTransactionId xid; + char *result; + char *str_xid; + struct stat filestate; + off_t fileSize; + + tid = text_to_cstring(PG_GETARG_TEXT_P(0)); + + snprintf(path, MAXPGPATH, TWOPHASE_RECORD_DIR "/%s", tid); + + if(access(path, F_OK) == 0) + { + if(stat(path, &filestate) == -1) + { + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not get status of file \"%s\"", path))); + } + + fileSize = filestate.st_size; + result = (char *)palloc0(fileSize + 1); + + fd = PathNameOpenFile(path, O_RDONLY, S_IRUSR | S_IWUSR); + if (fd < 0) + { + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open file \"%s\" for read", path))); + } + + + ret = 
FileRead(fd, result, fileSize, WAIT_EVENT_BUFFILE_READ); + + if(ret != fileSize) + { + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read file \"%s\"", path))); + } + + FileClose(fd); + + str_xid = strstr(result, GET_XID); + if (str_xid) + { + str_xid += strlen(GET_XID); + str_xid = strtok(str_xid, "\n"); + xid = strtoul(str_xid, NULL, 10); + PG_RETURN_UINT32(xid); + } + + } + PG_RETURN_NULL(); +} + +Datum pgxc_remove_2pc_records(PG_FUNCTION_ARGS); +PG_FUNCTION_INFO_V1(pgxc_remove_2pc_records); +Datum pgxc_remove_2pc_records(PG_FUNCTION_ARGS) +{ +#define SLEEP_COUNT 1000 + char *tid = NULL; + + tid = text_to_cstring(PG_GETARG_TEXT_P(0)); + + remove_2pc_records(tid, true); + + pfree(tid); + + PG_RETURN_BOOL(true); +} + +Datum pgxc_clear_2pc_records(PG_FUNCTION_ARGS); +PG_FUNCTION_INFO_V1(pgxc_clear_2pc_records); +Datum pgxc_clear_2pc_records(PG_FUNCTION_ARGS) +{ + MemoryContext oldcontext; + MemoryContext mycontext; + + int i = 0; + int count = 0; + TupleTableSlots *result; + TupleTableSlots clear_result; + const char *query = "select pgxc_get_record_list()::text"; + const char *CLEAR_STMT = "select pgxc_remove_2pc_records('%s')::text"; + char clear_query[100]; + char *twopcfiles = NULL; + char *ptr = NULL; + bool res = true; + + if(!IS_PGXC_COORDINATOR) + { + elog(ERROR, "can only called on coordinator"); + } + + mycontext = AllocSetContextCreate(CurrentMemoryContext, + "clean_check", + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, + ALLOCSET_DEFAULT_MAXSIZE); + oldcontext = MemoryContextSwitchTo(mycontext); + + ResetGlobalVariables(); +#if 0 + if((dir = opendir(TWOPHASE_RECORD_DIR))) + { + while((ptr = readdir(dir)) != NULL) + { + if (count > 999) + break; + if(strcmp(ptr->d_name,".") == 0 || strcmp(ptr->d_name,"..") == 0) + { + continue; + } + snprintf(path[count], MAX_GID, "/%s", ptr->d_name); + //snprintf(path[count], MAX_GID, "/%s", ptr->d_name); + count++; + } + + closedir(dir); + } +#endif + + /*get node list*/ + PgxcNodeGetOids(&cn_node_list, &dn_node_list, + &cn_nodes_num, &dn_nodes_num, true); + pgxc_clean_node_count = cn_nodes_num + dn_nodes_num; + my_nodeoid = getMyNodeoid(); + cn_health_map = palloc0(cn_nodes_num * sizeof(bool)); + dn_health_map = palloc0(dn_nodes_num * sizeof(bool)); + result = (TupleTableSlots *)palloc0(pgxc_clean_node_count * sizeof(TupleTableSlots)); + + /*collect the 2pc file in nodes*/ + for (i = 0; i < cn_nodes_num; i++) + { + execute_query_on_single_node(cn_node_list[i], query, 1, result+i); + } + + for (i = 0; i < dn_nodes_num; i++) + { + execute_query_on_single_node(dn_node_list[i], query, 1, result+cn_nodes_num+i); + } + /*get all database info*/ + getDatabaseList(); + + /*get all info of 2PC transactions*/ + getTxnInfoOnNodesAll(); +#if 0 + if((dir = opendir(TWOPHASE_RECORD_DIR))) + { + while (i < count) + { + if (!find_txn(path[i])) + { + unlink(path[i]); + WriteClean2pcXlogRec(path[i]); + } + i++; + } + + closedir(dir); + } +#endif + /*delete all rest 2pc file in each nodes*/ + for (i = 0; i < cn_nodes_num; i++) + { + if (0 == result[i].slot_count) + { + continue; + } + if (!(twopcfiles = TTSgetvalue(result+i, 0, 0))) + continue; + ptr = strtok(twopcfiles, ","); + while(ptr) + { + if (count >= MAXIMUM_CLEAR_FILE) + break; + if (!find_txn(ptr)) + { + snprintf(clear_query, 100, CLEAR_STMT, ptr); + if (execute_query_on_single_node(cn_node_list[i], clear_query, 1, &clear_result) == (Datum)0) + res = false; + DropTupleTableSlots(&clear_result); + count++; + } + ptr = strtok(NULL, ","); + } + } + + for (i = 0; i < 
dn_nodes_num; i++) + { + if (0 == result[cn_nodes_num+i].slot_count) + { + continue; + } + if (!(twopcfiles = TTSgetvalue(result+cn_nodes_num+i, 0, 0))) + continue; + ptr = strtok(twopcfiles, ","); + while(ptr) + { + if (count >= MAXIMUM_CLEAR_FILE) + break; + if (!find_txn(ptr)) + { + snprintf(clear_query, 100, CLEAR_STMT, ptr); + if (execute_query_on_single_node(dn_node_list[i], clear_query, 1, &clear_result) == (Datum)0) + res = false; + DropTupleTableSlots(&clear_result); + count++; + } + ptr = strtok(NULL, ","); + } + } + + for (i = 0; i < pgxc_clean_node_count; i++) + DropTupleTableSlots(result+i); + + DestroyTxnHash(); + ResetGlobalVariables(); + + MemoryContextSwitchTo(oldcontext); + MemoryContextDelete(mycontext); + + + PG_RETURN_BOOL(res); +} + +Datum pgxc_get_record_list(PG_FUNCTION_ARGS); +PG_FUNCTION_INFO_V1(pgxc_get_record_list); +Datum pgxc_get_record_list(PG_FUNCTION_ARGS) +{ + int count = 0; + DIR *dir = NULL; + struct dirent *ptr = NULL; + char *recordList = NULL; + text *t_recordList = NULL; + + if(!(dir = opendir(TWOPHASE_RECORD_DIR))) + { + PG_RETURN_NULL(); + } + + while((ptr = readdir(dir)) != NULL) + { + if(strcmp(ptr->d_name,".") == 0 || strcmp(ptr->d_name,"..") == 0) + { + continue; + } + if (count >= MAXIMUM_OUTPUT_FILE) + break; + + if(!recordList) + { + recordList = (char *)palloc0(strlen(ptr->d_name) + 1); + sprintf(recordList, "%s", ptr->d_name); + } + else + { + recordList = (char *) repalloc(recordList, + strlen(ptr->d_name) + strlen(recordList) + 2); + sprintf(recordList, "%s,%s", recordList, ptr->d_name); + } + count++; + } + + closedir(dir); + + if(!recordList) + { + PG_RETURN_NULL(); + } + else + { + t_recordList = cstring_to_text(recordList); + return PointerGetDatum(t_recordList); + } +} + +Datum pgxc_commit_on_node(PG_FUNCTION_ARGS); +PG_FUNCTION_INFO_V1(pgxc_commit_on_node); +Datum pgxc_commit_on_node(PG_FUNCTION_ARGS) +{ + /* nodename, gid */ + char *nodename; + Oid nodeoid; + char *gid; + txn_info *txn; + char command[100]; + PGXCNodeHandle **connections = NULL; + int conn_count = 0; + ResponseCombiner combiner; + PGXCNodeAllHandles *pgxc_handles = NULL; + PGXCNodeHandle *conn = NULL; + + /*clear Global*/ + ResetGlobalVariables(); + /*get node list*/ + PgxcNodeGetOids(&cn_node_list, &dn_node_list, + &cn_nodes_num, &dn_nodes_num, true); + if (cn_node_list == NULL || dn_node_list == NULL) + elog(ERROR, "pg_clean:fail to get cn_node_list and dn_node_list"); + pgxc_clean_node_count = cn_nodes_num + dn_nodes_num; + my_nodeoid = getMyNodeoid(); + cn_health_map = palloc0(cn_nodes_num * sizeof(bool)); + dn_health_map = palloc0(dn_nodes_num * sizeof(bool)); + + nodename = text_to_cstring(PG_GETARG_TEXT_P(0)); + gid = text_to_cstring(PG_GETARG_TEXT_P(1)); + nodeoid = get_pgxc_nodeoid(nodename); + if (InvalidOid == nodeoid) + { + elog(ERROR, "Invalid nodename '%s'", nodename); + } + + txn = (txn_info *)palloc0(sizeof(txn_info)); + if (txn == NULL) + { + PG_RETURN_BOOL(false); + } + txn->txn_stat = (TXN_STATUS *)palloc0(sizeof(TXN_STATUS) * pgxc_clean_node_count); + txn->xid = (uint32 *)palloc0(sizeof(uint32) * pgxc_clean_node_count); + txn->prepare_timestamp = (TimestampTz *)palloc0(sizeof(TimestampTz) * pgxc_clean_node_count); + txn->coordparts = (int *)palloc0(cn_nodes_num * sizeof(int)); + txn->dnparts = (int *)palloc0(dn_nodes_num * sizeof(int)); + + strncpy(txn->gid, gid, strlen(gid)+1); + getTxnInfoOnOtherNodes(txn); + snprintf(command, 100, "commit prepared '%s'", txn->gid); + + + if (InvalidGlobalTimestamp == txn->global_commit_timestamp) + { + if 
(!txn->is_readonly) + { + elog(ERROR, "in pg_clean, fail to get global_commit_timestamp for transaction '%s' on", gid); + } + else + { + txn->global_commit_timestamp = GetGlobalTimestampGTM(); + } + } + + connections = (PGXCNodeHandle**)palloc(sizeof(PGXCNodeHandle*)); + get_node_handles(&pgxc_handles, nodeoid); + + conn = (PGXC_NODE_COORDINATOR == get_pgxc_nodetype(nodeoid)) ? + pgxc_handles->coord_handles[0] : pgxc_handles->datanode_handles[0]; + if (!send_query_clean_transaction(conn, txn, command)) + { + elog(ERROR, "pg_clean: send query '%s' from '%s' to '%s' failed ", + command, get_pgxc_nodename(my_nodeoid) , nodename); + } + else + { + connections[conn_count++] = conn; + } + /* receive response */ + if (conn_count) + { + InitResponseCombiner(&combiner, conn_count, COMBINE_TYPE_NONE); + if (pgxc_node_receive_responses(conn_count, connections, NULL, &combiner) || + !validate_combiner(&combiner)) + { + if (combiner.errorMessage) + pgxc_node_report_error(&combiner); + else + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to FINISH the transaction on one or more nodes"))); + } + else + CloseCombiner(&combiner); + } + /*clear Global*/ + ResetGlobalVariables(); + clear_handles(); + pfree_pgxc_all_handles(pgxc_handles); + pgxc_handles = NULL; + pfree(connections); + connections = NULL; + + PG_RETURN_BOOL(true); +} + +Datum pgxc_abort_on_node(PG_FUNCTION_ARGS); +PG_FUNCTION_INFO_V1(pgxc_abort_on_node); +Datum pgxc_abort_on_node(PG_FUNCTION_ARGS) +{ + /* nodename, gid */ + char *nodename; + Oid nodeoid; + char *gid; + txn_info *txn; + char command[100]; + PGXCNodeHandle **connections = NULL; + int conn_count = 0; + ResponseCombiner combiner; + PGXCNodeAllHandles *pgxc_handles = NULL; + PGXCNodeHandle *conn = NULL; + + /*clear Global*/ + ResetGlobalVariables(); + /*get node list*/ + PgxcNodeGetOids(&cn_node_list, &dn_node_list, + &cn_nodes_num, &dn_nodes_num, true); + if (cn_node_list == NULL || dn_node_list == NULL) + elog(ERROR, "pg_clean:fail to get cn_node_list and dn_node_list"); + pgxc_clean_node_count = cn_nodes_num + dn_nodes_num; + my_nodeoid = getMyNodeoid(); + cn_health_map = palloc0(cn_nodes_num * sizeof(bool)); + dn_health_map = palloc0(dn_nodes_num * sizeof(bool)); + + nodename = text_to_cstring(PG_GETARG_TEXT_P(0)); + gid = text_to_cstring(PG_GETARG_TEXT_P(1)); + nodeoid = get_pgxc_nodeoid(nodename); + if (InvalidOid == nodeoid) + { + elog(ERROR, "Invalid nodename '%s'", nodename); + } + + txn = (txn_info *)palloc0(sizeof(txn_info)); + if (txn == NULL) + { + PG_RETURN_BOOL(false); + } + txn->txn_stat = (TXN_STATUS *)palloc0(sizeof(TXN_STATUS) * pgxc_clean_node_count); + txn->xid = (uint32 *)palloc0(sizeof(uint32) * pgxc_clean_node_count); + txn->prepare_timestamp = (TimestampTz *)palloc0(sizeof(TimestampTz) * pgxc_clean_node_count); + txn->coordparts = (int *)palloc0(cn_nodes_num * sizeof(int)); + txn->dnparts = (int *)palloc0(dn_nodes_num * sizeof(int)); + + strncpy(txn->gid, gid, strlen(gid)+1); + connections = (PGXCNodeHandle**)palloc(sizeof(PGXCNodeHandle*)); + getTxnInfoOnOtherNodes(txn); + snprintf(command, 100, "rollback prepared '%s'", txn->gid); +#if 0 + if (!setMaintenanceMode(true)) + { + elog(ERROR, "Error: fail to set maintenance mode on in pg_clean"); + } +#endif + + get_node_handles(&pgxc_handles, nodeoid); + + conn = (PGXC_NODE_COORDINATOR == get_pgxc_nodetype(nodeoid)) ? 
+ pgxc_handles->coord_handles[0] : pgxc_handles->datanode_handles[0]; + if (!send_query_clean_transaction(conn, txn, command)) + { + elog(ERROR, "pg_clean: send query '%s' from '%s' to '%s' failed ", + command, get_pgxc_nodename(my_nodeoid) , nodename); + } + else + { + connections[conn_count++] = conn; + } + /* receive response */ + if (conn_count) + { + InitResponseCombiner(&combiner, conn_count, COMBINE_TYPE_NONE); + if (pgxc_node_receive_responses(conn_count, connections, NULL, &combiner) || + !validate_combiner(&combiner)) + { + if (combiner.errorMessage) + pgxc_node_report_error(&combiner); + else + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to FINISH the transaction on one or more nodes"))); + } + else + CloseCombiner(&combiner); + } + /*clear Global*/ + ResetGlobalVariables(); + clear_handles(); + pfree_pgxc_all_handles(pgxc_handles); + pgxc_handles = NULL; + pfree(connections); + connections = NULL; + + PG_RETURN_BOOL(true); +} + + + +void recover2PCForDatabaseAll(void) +{ + database_info *cur_db = head_database_info; + while (cur_db) + { + recover2PCForDatabase(cur_db); + cur_db = cur_db->next; + } + //clean_old_2PC_files(); +} + +void recover2PCForDatabase(database_info * db_info) +{ + txn_info *cur_txn; + HASH_SEQ_STATUS status; + HTAB *txn = db_info->all_txn_info; + + hash_seq_init(&status, txn); + while ((cur_txn = (txn_info *) hash_seq_search(&status)) != NULL) + { + recover2PC(cur_txn); + } +} + +bool send_query_clean_transaction(PGXCNodeHandle* conn, txn_info *txn, const char *finish_cmd) +{ +#ifdef __TWO_PHASE_TESTS__ + if (PG_CLEAN_SEND_CLEAN <= twophase_exception_case && + PG_CLEAN_SEND_QUERY >= twophase_exception_case) + { + twophase_in = IN_PG_CLEAN; + } +#endif + if (!GlobalTimestampIsValid(txn->global_commit_timestamp) && + TXN_STATUS_COMMITTED == txn->global_txn_stat && + !txn->is_readonly) + return false; + + if (pgxc_node_send_clean(conn)) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("in pg_clean failed to send pg_clean flag for %s PREPARED command", + TXN_STATUS_COMMITTED == txn->global_txn_stat ? "COMMIT" : "ROLLBACK"))); + return false; + } + if (txn->is_readonly && pgxc_node_send_readonly(conn)) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("in pg_clean failed to send readonly flag for %s PREPARED command", + TXN_STATUS_COMMITTED == txn->global_txn_stat ? "COMMIT" : "ROLLBACK"))); + return false; + } + + if (txn->after_first_phase && pgxc_node_send_after_prepare(conn)) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("in pg_clean failed to send after prepare flag for %s PREPARED command", + TXN_STATUS_COMMITTED == txn->global_txn_stat ? "COMMIT" : "ROLLBACK"))); + return false; + } + + /* + * only transaction finished in commit prepared/rollback prepared phase send timestamp + * partial prepared transaction has no need to send other information + */ + if (InvalidGlobalTimestamp != txn->global_commit_timestamp && + pgxc_node_send_global_timestamp(conn, txn->global_commit_timestamp)) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("in pg_clean failed to send global committs for %s PREPARED command", + TXN_STATUS_COMMITTED == txn->global_txn_stat ? 
"COMMIT" : "ROLLBACK"))); + } + if (!txn->is_readonly) + { + if (InvalidOid != txn->origcoord && pgxc_node_send_starter(conn, get_pgxc_nodename(txn->origcoord))) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("in pg_clean failed to send start node for %s PREPARED command", + TXN_STATUS_COMMITTED == txn->global_txn_stat ? "COMMIT" : "ROLLBACK"))); + } + + if (InvalidTransactionId != txn->startxid && pgxc_node_send_startxid(conn, txn->startxid)) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("in pg_clean failed to send start xid for %s PREPARED command", + TXN_STATUS_COMMITTED == txn->global_txn_stat ? "COMMIT" : "ROLLBACK"))); + } + + if (NULL != txn->participants && pgxc_node_send_partnodes(conn, txn->participants)) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("in pg_clean failed to send participants for %s PREPARED command", + TXN_STATUS_COMMITTED == txn->global_txn_stat ? "COMMIT" : "ROLLBACK"))); + } + } + + if (pgxc_node_send_query(conn, finish_cmd)) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("in pg_clean failed to send query for %s PREPARED command", + TXN_STATUS_COMMITTED == txn->global_txn_stat ? "COMMIT" : "ROLLBACK"))); + return false; + } + return true; +} + +bool check_2pc_belong_node(txn_info * txn) +{ + int node_index = 0; + char node_type; + node_index = find_node_index(abnormal_nodeoid); + if (abnormal_nodeoid == txn->origcoord) + { + txn->belong_abnormal_node = true; + return true; + } + node_type = get_pgxc_nodetype(abnormal_nodeoid); + if (node_type == 'C' && txn->coordparts[node_index] == 1) + { + txn->belong_abnormal_node = true; + return true; + } + if (node_type == 'D' && txn->dnparts[node_index - cn_nodes_num] == 1) + { + txn->belong_abnormal_node = true; + return true; + } + txn->belong_abnormal_node = false; + return false; +} + +bool check_node_participate(txn_info * txn, int node_idx) +{ + char node_type = get_pgxc_nodetype(abnormal_nodeoid); + if (PGXC_NODE_COORDINATOR == node_type) + { + return txn->coordparts[node_idx] == 1 ? true : false; + } else if (PGXC_NODE_DATANODE == node_type) + { + return txn->dnparts[node_idx] == 1 ? 
true : false; + } + return false; +} + +void recover2PC(txn_info * txn) +{ + TXN_STATUS txn_stat; + txn_stat = check_txn_global_status(txn); + txn->global_txn_stat = txn_stat; + +#ifdef DEBUG_EXECABORT + txn_stat = TXN_STATUS_ABORTED; +#endif + + switch (txn_stat) + { + case TXN_STATUS_FAILED: + elog(LOG, "cannot recover 2PC transaction %s for TXN_STATUS_FAILED", txn->gid); + txn->op = UNDO; + txn->op_issuccess = true; + break; + + case TXN_STATUS_UNKNOWN: + elog(LOG, "cannot recover 2PC transaction %s for TXN_STATUS_UNKNOWN", txn->gid); + txn->op = UNDO; + txn->op_issuccess = true; + break; + + case TXN_STATUS_PREPARED: + elog(DEBUG1, "2PC recovery of transaction %s not needed for TXN_STATUS_PREPARED", txn->gid); + txn->op = UNDO; + txn->op_issuccess = true; + break; + + case TXN_STATUS_COMMITTED: + if (InvalidOid == txn->origcoord || txn->is_readonly) + { + txn->op = UNDO; + txn->op_issuccess = true; + } + else + { + txn->op = COMMIT; + if (!clean_2PC_iscommit(txn, true)) + { + txn->op_issuccess = false; + elog(LOG, "commit 2PC transaction %s failed", txn->gid); + return; + } + txn->op_issuccess = true; + clean_2PC_files(txn); + } + break; + + case TXN_STATUS_ABORTED: + txn->op = ABORT; + if (!clean_2PC_iscommit(txn, false)) + { + txn->op_issuccess = false; + elog(LOG, "rollback 2PC transaction %s failed", txn->gid); + return; + } + txn->op_issuccess = true; + clean_2PC_files(txn); + break; + + case TXN_STATUS_INPROGRESS: + elog(DEBUG1, "2PC recovery of transaction %s not needed for TXN_STATUS_INPROGRESS", txn->gid); + txn->op = UNDO; + txn->op_issuccess = true; + break; + + default: + elog(ERROR, "cannot recover 2PC transaction %s for unkown status", txn->gid); + break; + } + return; +} + +TXN_STATUS check_txn_global_status(txn_info *txn) +{ +#define TXN_PREPARED 0x0001 +#define TXN_COMMITTED 0x0002 +#define TXN_ABORTED 0x0004 +#define TXN_UNKNOWN 0x0008 +#define TXN_INITIAL 0x0010 +#define TXN_INPROGRESS 0X0020 + int ii; + int check_flag = 0; + int node_idx = 0; + TimestampTz prepared_time = 0; + TimestampTz time_gap = clean_time_interval; + + if (!IsXidImplicit(txn->gid) && txn->is_readonly) + { + return TXN_STATUS_COMMITTED; + } + if (txn->global_txn_stat == TXN_STATUS_UNKNOWN) + { + check_flag |= TXN_UNKNOWN; + } + if (txn->global_txn_stat == TXN_STATUS_ABORTED) + { + check_flag |= TXN_ABORTED; + } + + /*check dn participates*/ + for (ii = 0; ii < dn_nodes_num; ii++) + { + if (txn->dnparts[ii] == 1) + { + if (txn->txn_stat[ii + cn_nodes_num] == TXN_STATUS_INITIAL) + check_flag |= TXN_INITIAL; + else if (txn->txn_stat[ii + cn_nodes_num] == TXN_STATUS_UNKNOWN) + check_flag |= TXN_UNKNOWN; + else if (txn->txn_stat[ii + cn_nodes_num] == TXN_STATUS_PREPARED) + { + check_flag |= TXN_PREPARED; + prepared_time = txn->prepare_timestamp[ii + cn_nodes_num] > prepared_time ? 
+ txn->prepare_timestamp[ii + cn_nodes_num] : prepared_time; + } + else if (txn->txn_stat[ii + cn_nodes_num] == TXN_STATUS_INPROGRESS) + check_flag |= TXN_INPROGRESS; + else if (txn->txn_stat[ii + cn_nodes_num] == TXN_STATUS_COMMITTED) + check_flag |= TXN_COMMITTED; + else if (txn->txn_stat[ii + cn_nodes_num] == TXN_STATUS_ABORTED) + check_flag |= TXN_ABORTED; + else + return TXN_STATUS_FAILED; + } + } + /*check cn participates*/ + for (ii = 0; ii < cn_nodes_num; ii++) + { + if (txn->coordparts[ii] == 1) + { + if (txn->txn_stat[ii] == TXN_STATUS_INITIAL) + check_flag |= TXN_ABORTED; + else if (txn->txn_stat[ii] == TXN_STATUS_UNKNOWN) + check_flag |= TXN_UNKNOWN; + else if (txn->txn_stat[ii] == TXN_STATUS_PREPARED) + { + check_flag |= TXN_PREPARED; + prepared_time = txn->prepare_timestamp[ii] > prepared_time ? + txn->prepare_timestamp[ii] : prepared_time; + } + else if (txn->txn_stat[ii] == TXN_STATUS_INPROGRESS) + check_flag |= TXN_INPROGRESS; + else if (txn->txn_stat[ii] == TXN_STATUS_COMMITTED) + check_flag |= TXN_COMMITTED; + else if (txn->txn_stat[ii] == TXN_STATUS_ABORTED) + check_flag |= TXN_ABORTED; + else + return TXN_STATUS_FAILED; + } + } + + /* + * first check the prepare timestamp of both implicit and explicit trans within the time_gap or not + * if not, check the commit timestamp explicit trans within the time_gap or not + */ +#if 0 + if ((check_flag & TXN_INPROGRESS) || + (IsXidImplicit(txn->gid) && current_time - prepared_time <= time_gap) || + (!IsXidImplicit(txn->gid) && + ((!txn->after_first_phase && current_time - prepared_time <= time_gap) || + (txn->after_first_phase && + (InvalidGlobalTimestamp != commit_time && + current_time - commit_time <= time_gap))))) + { + /* transaction inprogress */ + return TXN_STATUS_INPROGRESS; + } +#endif + if (clear_2pc_belong_node) + { + node_idx = find_node_index(abnormal_nodeoid); + if (!check_2pc_belong_node(txn) || + !check_node_participate(txn, node_idx) || + abnormal_time < txn->prepare_timestamp[node_idx]) + { + return TXN_STATUS_INPROGRESS; + } + } + else + { + if (check_flag & TXN_INPROGRESS ||current_time - prepared_time <= time_gap) + { + /* transaction inprogress */ + return TXN_STATUS_INPROGRESS; + } + } + + + if (!IsXidImplicit(txn->gid) && txn->after_first_phase && (TXN_PREPARED == check_flag)) + { + return TXN_STATUS_PREPARED; + } + + if (check_flag & TXN_UNKNOWN) + return TXN_STATUS_UNKNOWN; + + if ((check_flag & TXN_COMMITTED) && (check_flag & TXN_ABORTED)) + /* Mix of committed and aborted. This should not happen. */ + return TXN_STATUS_UNKNOWN; + + if ((check_flag & TXN_PREPARED) == 0) + /* Should be at least one "prepared statement" in nodes */ + return TXN_STATUS_FAILED; + + if (check_flag & TXN_COMMITTED) + /* Some 2PC transactions are committed. Need to commit others. */ + return TXN_STATUS_COMMITTED; + /* All the transactions remain prepared. No need to recover. 
*/ + return TXN_STATUS_ABORTED; +} + +bool clean_2PC_iscommit(txn_info *txn, bool iscommit) +{ + int ii; + static const char *STMT_FORM = "%s prepared '%s';"; + char command[100]; + int node_idx; + Oid node_oid; + PGXCNodeHandle **connections = NULL; + int conn_count = 0; + ResponseCombiner combiner; + PGXCNodeAllHandles *pgxc_handles = NULL; + + if (iscommit) + snprintf(command, 100, STMT_FORM, "commit", txn->gid); + else + snprintf(command, 100, STMT_FORM, "rollback", txn->gid); + if (iscommit && InvalidGlobalTimestamp == txn->global_commit_timestamp) + { + elog(ERROR, "twophase transaction '%s' has InvalidGlobalCommitTimestamp", txn->gid); + } + + connections = (PGXCNodeHandle**)palloc(sizeof(PGXCNodeHandle*) * (txn->num_dnparts + txn->num_coordparts)); + if (connections == NULL) + { + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory for connections"))); + } + get_transaction_handles(&pgxc_handles, txn); + +#ifdef __TWO_PHASE_TESTS__ + if (PG_CLEAN_SEND_CLEAN <= twophase_exception_case && + PG_CLEAN_ELOG_ERROR >= twophase_exception_case) + { + exception_count = 0; + } +#endif + for (ii = 0; ii < pgxc_handles->dn_conn_count; ii++) + { + node_oid = pgxc_handles->datanode_handles[ii]->nodeoid; + node_idx = find_node_index(node_oid); + if (TXN_STATUS_PREPARED != txn->txn_stat[ node_idx]) + { + continue; + } + /*send global timestamp to dn_node_list[ii]*/ + if (!send_query_clean_transaction(pgxc_handles->datanode_handles[ii], txn, command)) + { + elog(LOG, "pg_clean: send query '%s' from '%s' to '%s' failed ", + command, get_pgxc_nodename(my_nodeoid) , pgxc_handles->datanode_handles[ii]->nodename); + return false; + } + else + { + connections[conn_count++] = pgxc_handles->datanode_handles[ii]; +#ifdef __TWO_PHASE_TESTS__ + if (PG_CLEAN_SEND_CLEAN <= twophase_exception_case && + PG_CLEAN_ELOG_ERROR >= twophase_exception_case) + { + exception_count++; + if (1 == exception_count && + PG_CLEAN_ELOG_ERROR == twophase_exception_case) + { + elog(ERROR, "PG_CLEAN_ELOG_ERROR complish"); + } + } +#endif + } + } + + for (ii = 0; ii < pgxc_handles->co_conn_count; ii++) + { + node_oid = pgxc_handles->coord_handles[ii]->nodeoid; + node_idx = find_node_index(node_oid); + if (TXN_STATUS_PREPARED != txn->txn_stat[ node_idx]) + { + continue; + } + /*send global timestamp to dn_node_list[ii]*/ + if (!send_query_clean_transaction(pgxc_handles->coord_handles[ii], txn, command)) + { + elog(LOG, "pg_clean: send query '%s' from '%s' to '%s' failed ", + command, get_pgxc_nodename(my_nodeoid) , pgxc_handles->coord_handles[ii]->nodename); + return false; + } + else + { + connections[conn_count++] = pgxc_handles->coord_handles[ii]; +#ifdef __TWO_PHASE_TESTS__ + if (PG_CLEAN_SEND_CLEAN <= twophase_exception_case && + PG_CLEAN_ELOG_ERROR >= twophase_exception_case) + { + exception_count++; + if (1 == exception_count && + PG_CLEAN_ELOG_ERROR == twophase_exception_case) + { + elog(ERROR, "PG_CLEAN_ELOG_ERROR complish"); + } + } +#endif + } + + } + + /* receive response */ + if (conn_count) + { + InitResponseCombiner(&combiner, conn_count, COMBINE_TYPE_NONE); + if (pgxc_node_receive_responses(conn_count, connections, NULL, &combiner) || + !validate_combiner(&combiner)) + { + if (combiner.errorMessage) + pgxc_node_report_error(&combiner); + else + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to FINISH the transaction on one or more nodes"))); + } + else + CloseCombiner(&combiner); + } + if (enable_distri_print) + { + for (ii = 0; ii < conn_count; ii++) + { + if 
(DN_CONNECTION_STATE_IDLE != connections[ii]->state) + { + elog(WARNING, "IN pg_clean node:%s invalid stauts:%d", connections[ii]->nodename, connections[ii]->state); + } + } + } + conn_count = 0; + clear_handles(); + pfree_pgxc_all_handles(pgxc_handles); + pgxc_handles = NULL; + + /*last commit or rollback on origcoord if it participate this txn, since after commit the 2pc file is deleted on origcoord*/ + if (txn->origcoord != InvalidOid) + { + node_idx = find_node_index(txn->origcoord); + if (txn->coordparts[node_idx] == 1) + { + /*send global timestamp to dn_node_list[ii]*/ + + if (txn->txn_stat[node_idx] == TXN_STATUS_PREPARED) + { + get_node_handles(&pgxc_handles, txn->origcoord); + if (!send_query_clean_transaction(pgxc_handles->coord_handles[0], txn, command)) + { + elog(LOG, "pg_clean: send query '%s' from %s to %s failed ", + command, get_pgxc_nodename(my_nodeoid) , pgxc_handles->coord_handles[0]->nodename); + return false; + } + else + { + connections[conn_count++] = pgxc_handles->coord_handles[0]; + } + } + } + } + + /* receive response */ + if (conn_count) + { + InitResponseCombiner(&combiner, conn_count, COMBINE_TYPE_NONE); + if (pgxc_node_receive_responses(conn_count, connections, NULL, &combiner) || + !validate_combiner(&combiner)) + { + if (combiner.errorMessage) + pgxc_node_report_error(&combiner); + else + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to FINISH the transaction on one or more nodes"))); + } + else + CloseCombiner(&combiner); + } + /*free hash record from gtm*/ + FinishGIDGTM(txn->gid); + + clear_handles(); + pfree_pgxc_all_handles(pgxc_handles); + pgxc_handles = NULL; + pfree(connections); + connections = NULL; + return true; +} + +bool clean_2PC_files(txn_info * txn) +{ + int ii; + TupleTableSlots result; + bool issuccess = true; + static const char *STMT_FORM = "select pgxc_remove_2pc_records('%s')::text"; + char query[100]; + + snprintf(query, 100, STMT_FORM, txn->gid); + + for (ii = 0; ii < dn_nodes_num; ii++) + { + if (execute_query_on_single_node(dn_node_list[ii], query, 1, &result) == (Datum) 1) + { + if (TTSgetvalue(&result, 0, 0) == false) + { + elog(LOG, "pg_clean: delete 2PC file failed of transaction %s on node %s", + txn->gid, get_pgxc_nodename(txn->dnparts[ii])); + issuccess = false; + } + } + else + { + elog(LOG, "pg_clean: failed clean 2pc file of transaction %s on node %s", txn->gid, get_pgxc_nodename(dn_node_list[ii])); + issuccess = false; + } + DropTupleTableSlots(&result); + if (!issuccess) + return false; + } + + for (ii = 0; ii < cn_nodes_num; ii++) + { + if (execute_query_on_single_node(cn_node_list[ii], query, 1, &result) == (Datum) 1) + { + if (TTSgetvalue(&result, 0, 0) == false) + { + elog(LOG, "Error:delete 2PC file failed of transaction %s on node %s", + txn->gid, get_pgxc_nodename(txn->coordparts[ii])); + issuccess = false; + } + } + else + { + elog(LOG, "pg_clean: failed clean 2pc file of transaction %s on node %s", txn->gid, get_pgxc_nodename(cn_node_list[ii])); + issuccess = false; + } + DropTupleTableSlots(&result); + if (!issuccess) + return false; + } + return true; +} + +void Init_print_txn_info(print_txn_info * print_txn) +{ + database_info *cur_database = head_database_info; + txn_info *cur_txn; + HASH_SEQ_STATUS status; + HTAB *txn; + + print_txn->index = 0; + INIT(print_txn->txn); + + for (; cur_database; cur_database = cur_database->next) + { + txn = cur_database->all_txn_info; + hash_seq_init(&status, txn); + while ((cur_txn = (txn_info *) hash_seq_search(&status)) != NULL) + { + if 
(clear_2pc_belong_node && !cur_txn->belong_abnormal_node) + { + continue; + } + if (cur_txn->global_txn_stat != TXN_STATUS_INPROGRESS) + PALLOC(print_txn->txn, cur_txn); + } + +#if 0 + cur_txn = cur_database->head_txn_info; + for (; cur_txn; cur_txn = cur_txn->next) + { + if (cur_txn->global_txn_stat != TXN_STATUS_INPROGRESS) + PALLOC(print_txn->txn, cur_txn); + } +#endif + } +} + +void Init_print_stats_all(print_status *pstatus) +{ + database_info *cur_database; + txn_info *cur_txn; + HASH_SEQ_STATUS status; + HTAB *txn; + + pstatus->index = 0; + pstatus->count = 0; + INIT(pstatus->gid); + INIT(pstatus->global_status); + INIT(pstatus->status); + INIT(pstatus->database); + + for (cur_database = head_database_info; cur_database; cur_database = cur_database->next) + { + txn = cur_database->all_txn_info; + hash_seq_init(&status, txn); + while ((cur_txn = (txn_info *) hash_seq_search(&status)) != NULL) + { + cur_txn->global_txn_stat = check_txn_global_status(cur_txn); + if (cur_txn->global_txn_stat != TXN_STATUS_INPROGRESS) + Init_print_stats(cur_txn, cur_database->database_name, pstatus); + } +#if 0 + for (cur_txn = cur_database->head_txn_info; cur_txn; cur_txn = cur_txn->next) + { + cur_txn->global_txn_stat = check_txn_global_status(cur_txn); + if (cur_txn->global_txn_stat != TXN_STATUS_INPROGRESS) + Init_print_stats(cur_txn, cur_database->database_name, pstatus); + } +#endif + } +} + +void Init_print_stats(txn_info *txn, char *database, print_status * pstatus) +{ + int ii; + StringInfoData query; + initStringInfo(&query); + + RPALLOC(pstatus->gid); + RPALLOC(pstatus->global_status); + RPALLOC(pstatus->status); + RPALLOC(pstatus->database); + + pstatus->gid[pstatus->count] = (char *)palloc0(100 * sizeof(char)); + pstatus->database[pstatus->count] = (char *)palloc0(100 * sizeof(char)); + pstatus->global_status[pstatus->count] = (char *)palloc0(100 * sizeof(char)); + + strncpy(pstatus->gid[pstatus->count], txn->gid, 100); + strncpy(pstatus->database[pstatus->count], database, 100); + strncpy(pstatus->global_status[pstatus->count], txn_status_to_string(check_txn_global_status(txn)), 100); + + for (ii = 0; ii < pgxc_clean_node_count; ii++) + { + appendStringInfo(&query, "%-12s:%-15s", get_pgxc_nodename(find_node_oid(ii)), + txn_status_to_string(txn->txn_stat[ii])); + if (ii < pgxc_clean_node_count - 1) + { + appendStringInfoChar(&query, '\n'); + } + } + + pstatus->status[pstatus->count] = (char *)palloc0((strlen(query.data)+1) * sizeof(char)); + strncpy(pstatus->status[pstatus->count], query.data, strlen(query.data)+1); + pstatus->gid_count++; + pstatus->database_count++; + pstatus->global_status_count++; + pstatus->status_count++; + pstatus->count++; +} + +static const char *txn_status_to_string(TXN_STATUS status) +{ + switch (status) + { + ENUM_TOCHAR_CASE(TXN_STATUS_INITIAL) + ENUM_TOCHAR_CASE(TXN_STATUS_UNKNOWN) + ENUM_TOCHAR_CASE(TXN_STATUS_PREPARED) + ENUM_TOCHAR_CASE(TXN_STATUS_COMMITTED) + ENUM_TOCHAR_CASE(TXN_STATUS_ABORTED) + ENUM_TOCHAR_CASE(TXN_STATUS_INPROGRESS) + ENUM_TOCHAR_CASE(TXN_STATUS_FAILED) + } + return NULL; +} + +static const char *txn_op_to_string(OPERATION op) +{ + switch (op) + { + ENUM_TOCHAR_CASE(UNDO) + ENUM_TOCHAR_CASE(ABORT) + ENUM_TOCHAR_CASE(COMMIT) + } + return NULL; +} + + +static void +CheckFirstPhase(txn_info *txn) +{ +// int ret; + Oid orignode = txn->origcoord; + uint32 startxid = txn->startxid; +// uint32 transactionid; + int nodeidx; + + /* + * if the twophase trans does not success in prepare phase, the orignode == InvalidOid. 
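+	 * In that case the prepare phase did not succeed, so there is no start node to check and the function returns without doing anything.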
+ */ + if (InvalidOid == orignode) + { + return; + } + nodeidx = find_node_index(orignode); + if (0 == txn->xid[nodeidx]) + { + txn->xid[nodeidx] = startxid; + } + /* start node participate */ + if (txn->isorigcoord_part) + { + if (0 == txn->coordparts[nodeidx]) + { + txn->coordparts[nodeidx] = 1; + txn->num_coordparts++; + } + if (txn->txn_stat[nodeidx] == TXN_STATUS_INITIAL) + { + /*select * from pgxc_is_committed...*/ + getTxnStatus(txn, nodeidx); + } + if (txn->txn_stat[nodeidx] == TXN_STATUS_PREPARED && txn->global_commit_timestamp != InvalidGlobalTimestamp) + { + txn->after_first_phase = true; + } + } + /* start node node participate */ + else + { +#if 0 + ret = Get2PCFile(orignode, txn->gid, &transactionid); + if (ret == FILENOTFOUND) + txn->after_first_phase = false; + else if (ret == FILEUNKOWN) + txn->global_txn_stat = TXN_STATUS_UNKNOWN; + else if (ret == FILEFOUND && txn->global_commit_timestamp != InvalidGlobalTimestamp) + txn->after_first_phase = true; +#endif + if (txn->global_commit_timestamp != InvalidGlobalTimestamp) + { + txn->after_first_phase = true; + } else { + txn->after_first_phase = false; + } + } +} + +void get_transaction_handles(PGXCNodeAllHandles **pgxc_handles, txn_info *txn) +{ + int dn_index = 0; + int cn_index = 0; + int nodeIndex; + char nodetype; + List *coordlist = NIL; + List *nodelist = NIL; + + while (dn_index < dn_nodes_num) + { + + /* Get node type and index */ + nodetype = PGXC_NODE_NONE; + if (TXN_STATUS_PREPARED != txn->txn_stat[dn_index + cn_nodes_num]) + { + dn_index++; + continue; + } + nodeIndex = PGXCNodeGetNodeIdFromName(get_pgxc_nodename(dn_node_list[dn_index]), &nodetype); + if (nodetype == PGXC_NODE_NONE) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("PGXC Node %s: object not defined", + get_pgxc_nodename(dn_node_list[dn_index])))); + + /* Check if node is requested is the self-node or not */ + if (nodetype == PGXC_NODE_DATANODE) + { + nodelist = lappend_int(nodelist, nodeIndex); + } + dn_index++; + + } + + while (cn_index < cn_nodes_num) + { + /* Get node type and index */ + nodetype = PGXC_NODE_NONE; + if (TXN_STATUS_PREPARED != txn->txn_stat[cn_index] || cn_node_list[cn_index] == txn->origcoord) + { + cn_index++; + continue; + } + nodeIndex = PGXCNodeGetNodeIdFromName(get_pgxc_nodename(cn_node_list[cn_index]), &nodetype); + if (nodetype == PGXC_NODE_NONE) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("PGXC Node %s: object not defined", + get_pgxc_nodename(cn_node_list[cn_index])))); + + /* Check if node is requested is the self-node or not */ + if (nodetype == PGXC_NODE_COORDINATOR) + { + coordlist = lappend_int(coordlist, nodeIndex); + } + cn_index++; + } + *pgxc_handles = get_handles(nodelist, coordlist, false, true, true); +} + +void get_node_handles(PGXCNodeAllHandles **pgxc_handles, Oid nodeoid) +{ + char nodetype = PGXC_NODE_NONE; + int nodeIndex; + List *coordlist = NIL; + List *nodelist = NIL; + + nodeIndex = PGXCNodeGetNodeIdFromName(get_pgxc_nodename(nodeoid), &nodetype); + if (nodetype == PGXC_NODE_COORDINATOR) + { + coordlist = lappend_int(coordlist, nodeIndex); + } + else + { + nodelist = lappend_int(nodelist, nodeIndex); + } + *pgxc_handles = get_handles(nodelist, coordlist, false, true, true); +} + diff --git a/contrib/pg_clean/pg_clean.control b/contrib/pg_clean/pg_clean.control new file mode 100644 index 00000000..00524ac0 --- /dev/null +++ b/contrib/pg_clean/pg_clean.control @@ -0,0 +1,5 @@ +# 2PC transaction recovering extention +comment = 'tools for clean all the remaining 2PC 
transactions' +default_version = '1.0' +module_pathname = '$libdir/pg_clean' +relocatable = true \ No newline at end of file diff --git a/contrib/pg_clean/test.sh b/contrib/pg_clean/test.sh new file mode 100644 index 00000000..2f56a302 --- /dev/null +++ b/contrib/pg_clean/test.sh @@ -0,0 +1,171 @@ +#!/bin/bash +# +# This script sets up test environment for pgxc_clean. +# Please note that all the prepared transactions are +# partially committed or aborted. +# +# You should configure PGPORT and PGHOST to connect, as +# well as node names for your test environment. +# +# Before you run this script, XC should be up and ready. +# Also, this may try to drop test databases. You may need +# to run CLEAN CONNECTION satement for each coordinator in +# advance. +# + +export PGPORT=52898 +export PGHOST=localhost +sourcedb=postgres + +{ +psql -e postgres < +#include +#include + +#include "storage/procarray.h" +#include "storage/lwlock.h" +#include "storage/proc.h" +#include "utils/varlena.h" +#include "utils/lsyscache.h" +#include "utils/palloc.h" +#include "utils/builtins.h" + +#include "executor/tuptable.h" +#include "pgxc/execRemote.h" +#include "pgxc/pgxcnode.h" +#include "pgxc/poolmgr.h" +#include "access/tupdesc.h" +#include "access/htup_details.h" +#include "lib/stringinfo.h" +#ifdef XCP +#include "catalog/pg_type.h" +#include "catalog/pgxc_node.h" +#include "executor/executor.h" +#include "nodes/makefuncs.h" +#include "utils/snapmgr.h" +#endif +#ifdef PGXC +#include "pgxc/nodemgr.h" +#include "pgxc/pgxc.h" +#endif + +PG_MODULE_MAGIC; + +#define MAX_GID 50 +#define MAX_DBNAME 64 +#define MAX_RELNAME 64 +#define MAX_MODE 30 +#define MAX_DEADLOCK 10000 + +/*macros about space allocation and release*/ +#define INIT(x)\ +do{\ + x = NULL;\ + x##_count = 0;\ + x##_size = 0;\ +}while(0); + +#define RPALLOC(x)\ +do{\ + if (x##_size < x##_count+1)\ + {\ + int temp_size = (x##_size > 0) ? 
x##_size : 1;\ + if (NULL == x)\ + {\ + x = palloc0(2*temp_size*sizeof(*x));\ + }\ + else\ + {\ + x = repalloc(x, 2*temp_size*sizeof(*x));\ + }\ + x##_size = 2*temp_size;\ + }\ +}while(0); + +#define PALLOC(x, y)\ +do{\ + RPALLOC(x);\ + x[x##_count] = y;\ + x##_count++;\ +}while(0); + +#define RFREE(x)\ +do{\ + if (x##_size > 0)\ + {\ + pfree(x);\ + }\ + x = NULL;\ + x##_count = 0;\ + x##_size = 0;\ +}while(0); + +/*data structures*/ + /*about lock*/ +typedef enum +{ + Lockmode_ASL = 0, /*AccessShareLock*/ + Lockmode_RSL, /*RowShareLock*/ + Lockmode_REL, /*RowExclusiveLock*/ + Lockmode_SUEL, /*ShareUpdateExclusiveLock*/ + Lockmode_SL, /*ShareLock*/ + Lockmode_SREL, /*ShareRowExclusiveLock*/ + Lockmode_EL, /*ExclusiveLock*/ + Lockmode_AEL /*AccessExclusiveLock*/ +} MODE; + +typedef enum +{ + Locktype_Relation = 0, + Locktype_Page, + Locktype_Tuple, + Locktype_Transactionid, + Locktype_Object, + Locktype_Userlock, + Locktype_Advisory +} LOCKTYPE; + +typedef struct +{ + LOCKTYPE m_locktype; + char m_dbname[MAX_DBNAME]; + char m_relname[MAX_RELNAME]; + uint32 m_page; + uint16 m_tuple; + MODE m_mode; + bool m_granted; + uint32 m_transactionid; + Oid m_node; + uint32 m_pid; + char * m_query; +} lockinfo; + + /*about deadlock*/ +typedef struct +{ + int* txns; + int txns_count; + int txns_size; + bool killed; +} deadlock; + + /*about transactions*/ +typedef struct +{ + int pre; + int post; +}Edge; + +typedef struct +{ + char ***slot; /*slot[i][j] stores value of row i, colum j*/ + int slot_count; /*number of rows*/ + int slot_size; + int attnum; +}TupleTableSlots; + +typedef struct +{ + char gid[MAX_GID]; /*globla transactionid*/ + uint32 *pid; /*Local pid on each node*/ + int pid_count; + int pid_size; + Oid *node; /*a global transaction corresponding to multiple nodes*/ + int node_count; + int node_size; + Oid initiator; /*node initiating the transaction*/ + lockinfo *hold; /*hold lock list of the transaction*/ + int hold_count; + int hold_size; + lockinfo *wait; /*wait lock list of the transaction*/ + int wait_count; + int wait_size; + bool searched; /*transaction travesal status during deadlock detection*/ + bool alive; /*whether the transaction is killed*/ + int* deadlock; /*belonging deadlocks*/ + int deadlock_count; /*deadlock count of the transaction*/ + int deadlock_size; + Edge* out; + int out_count; + int out_size; + int wait_txn; + char* query; +}transaction; + +typedef struct +{ + int* stack; /*stack during depth-first search*/ + int stack_count; + int stack_size; + int* stackpre; /*stores parents of transactions in stack*/ + int stackpre_count; + int stackpre_size; + int* path; /*extended path in depth-first search*/ + int path_count; + int path_size; + int* txn_exist; /*stores index of trasaction[i] in path, + txn_exist[txnid] = i; (path[i] = txnid or txn_exist[txnid] = -1;)*/ +} deeplist; + + /*about output results*/ +typedef struct +{ + int index; + char **edge; + int edge_count; + int edge_size; + + char **nodes; + int nodes_count; + int nodes_size; + + char **querys; + int querys_count; + int querys_size; +} PrintEdge; + +typedef struct +{ + int index; + char **deadlock; + char **nodename; + char **query; + int deadlock_count; + int *per_size; +} PrintDeadlock; + +typedef struct +{ + int index; + char **txn; + int txn_count; + int txn_size; + + char **cancel_query; + int cancel_query_count; + int cancel_query_size; + + char **nodename; + int nodename_count; + int nodename_size; +} PrintRollbackTxn; + +typedef struct +{ + int index; + PrintRollbackTxn *Ptxns; + int Ptxns_count; + 
int Ptxns_size; +} PrintAllRollbackTxns; + + +/*function list*/ +static void ResetGlobalVariables(void); + + /*plugin entry function*/ +Datum pg_unlock_execute(PG_FUNCTION_ARGS); +PG_FUNCTION_INFO_V1(pg_unlock_execute); + +Datum pg_unlock_check_deadlock(PG_FUNCTION_ARGS); +PG_FUNCTION_INFO_V1(pg_unlock_check_deadlock); + +Datum pg_unlock_check_dependency(PG_FUNCTION_ARGS); +PG_FUNCTION_INFO_V1(pg_unlock_check_dependency); + +Datum pg_unlock_killbypid(PG_FUNCTION_ARGS); +PG_FUNCTION_INFO_V1(pg_unlock_killbypid); + +Datum pg_findgxid(PG_FUNCTION_ARGS); +PG_FUNCTION_INFO_V1(pg_findgxid); + + /*get all the transaction info*/ +static char * TTSgetvalue(TupleTableSlots *result, int tup_num, int field_num); +static void DropTupleTableSlots(TupleTableSlots * Slots); +static Datum execute_on_single_node(Oid node, const char * query, int attnum, TupleTableSlots * tuples); +void GetAllTransInfo(void); +void LoadTransaction(Oid node); +void InitTransaction(int txn_index); +void add_pid_node(int txn_index, uint32 pid, Oid node); +LOCKTYPE + find_locktype(char *locktype); +MODE find_mode(char *mode); + + /*build transaction dependency gragh*/ +void InitAllEdge(void); +void InitEdge(int pre, int post); +bool is_conflict_withtxn(lockinfo *wait, int post_txn); +bool is_conflict_withlock(lockinfo *wait, lockinfo *hold); +bool check_include(lockinfo *wait, lockinfo *hold); +void DropTransaction(int i); +void DropAlltransactions(void); +void DropEdge(int id); + + /*find all deadlocks*/ +void InitDeadlock(void); +void DropDeadlock(deadlock *loop); +void DropAlldeadlocks(void); +void DetectDeadlock(void); +int traverse(deeplist* list); +void path_deadlock(deeplist * list, int start); +void InitDeeplist(deeplist* list); +void DropDeeplist(deeplist* list); +void ClearDeeplist(deeplist* list); + + /*recover all deadlocks*/ +void RecoverDeadlock(void); +void CountDeadlocks(void); +void CountWaitTxn(void); +void SortByDeadlock(int *sort_txnid); +void quiksort(int *sort_txnid, int low, int high); +void KillDeadlockByTxn(int txnid); +bool DeadlockExists(int id); + + /*output results*/ +void InitPrintEdge(PrintEdge *Pedge); +void DropPrintEdge(PrintEdge *Pedge); +void InitPrintDeadlock(PrintDeadlock *Pdeadlock); +void DropPrintDeadlock(PrintDeadlock *Pdeadlock); +void InitPrinttxn(PrintRollbackTxn *Ptxn); +void DropPrinttxn(PrintRollbackTxn *Ptxn); +char *GetGxid(Oid node, uint32 pid); +int check_node_pid(char *nodename, uint32 pid); +bool check_exist_gid(char *gid); +void KillTxn(int txnid); + +/*global variables*/ +static Oid *cn_node_list = NULL; +static Oid *dn_node_list = NULL; +static Oid *sdn_node_list = NULL; +static bool *cn_health_map = NULL; +static bool *dn_health_map = NULL; +static int cn_nodes_num; +static int dn_nodes_num; +static int sdn_nodes_num; + +static transaction * + pgxc_transaction = NULL; /*stores all transactions*/ +static int pgxc_transaction_count = 0; /*transaction count*/ +static int pgxc_transaction_size = 0; /*records capacity of pgxc_transaction*/ +static int **pgxc_edge = NULL; +static deadlock * + pgxc_deadlock = NULL; +static int pgxc_deadlock_count = 0; +static int pgxc_deadlock_size = 0; + +static int m_matrix[8][8] = /*conflict info among lock modes*/ +{ + {0, 0, 0, 0, 0, 0, 0, 1}, + {0, 0, 0, 0, 0, 0, 1, 1}, + {0, 0, 1, 0, 1, 1, 1, 1}, + {0, 0, 0, 1, 1, 1, 1, 1}, + {0, 0, 1, 1, 0, 1, 1, 1}, + {0, 0, 1, 1, 1, 1, 1, 1}, + {0, 1, 1, 1, 1, 1, 1, 1}, + {1, 1, 1, 1, 1, 1, 1, 1} +}; + +static void ResetGlobalVariables(void) +{ + cn_node_list = NULL; + dn_node_list = NULL; + 
sdn_node_list = NULL; + cn_health_map = NULL; + dn_health_map = NULL; + cn_nodes_num = 0; + dn_nodes_num = 0; + sdn_nodes_num = 0; + + pgxc_transaction = NULL; /*stores all transactions*/ + pgxc_transaction_count = 0; /*transaction count*/ + pgxc_transaction_size = 0; /*records capacity of pgxc_transaction*/ + pgxc_edge = NULL; + + pgxc_deadlock = NULL; + pgxc_deadlock_count = 0; + pgxc_deadlock_size = 0; + +} + + +/* + * pg_unlock_execute -- detect and recover deadlocks + * input: no + * output: info of rollback transactions + */ +Datum +pg_unlock_execute(PG_FUNCTION_ARGS) +{ +#ifdef ACCESS_CONTROL_ATTR_NUM +#undef ACCESS_CONTROL_ATTR_NUM +#endif +#define ACCESS_CONTROL_ATTR_NUM 5 + FuncCallContext *funcctx; + PrintAllRollbackTxns *Partxns; + char **rec; + char **nodename; + char **query; + HeapTuple tuple; + + Datum values[ACCESS_CONTROL_ATTR_NUM]; + bool nulls[ACCESS_CONTROL_ATTR_NUM]; + + if(!IS_PGXC_COORDINATOR) + { + elog(ERROR, "can only called on coordinator"); + } + + if (SRF_IS_FIRSTCALL()) + { + MemoryContext oldcontext; + TupleDesc tupdesc; + funcctx = SRF_FIRSTCALL_INIT(); + + oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); + + tupdesc = CreateTemplateTupleDesc(ACCESS_CONTROL_ATTR_NUM, false); + TupleDescInitEntry(tupdesc, (AttrNumber) 1, "executetime", + INT8OID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 2, "txnindex", + INT8OID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 3, "rollbacktxn(ip:port)", + TEXTOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 4, "nodename", + TEXTOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 5, "cancel_query", + TEXTOID, -1, 0); + funcctx->tuple_desc = BlessTupleDesc(tupdesc); + + funcctx->user_fctx = palloc0(sizeof(PrintAllRollbackTxns)); + Partxns = (PrintAllRollbackTxns *)funcctx->user_fctx; + INIT(Partxns->Ptxns); + Partxns->index = 0; + + ResetGlobalVariables(); + /*get node list*/ + PgxcNodeGetOidsExtend(&cn_node_list, &dn_node_list, &sdn_node_list, + &cn_nodes_num, &dn_nodes_num, &sdn_nodes_num, true); + cn_health_map = palloc0(cn_nodes_num * sizeof(bool)); + dn_health_map = palloc0(dn_nodes_num * sizeof(bool)); + do + { + /*get all transaction info and associat it to global xid*/ + GetAllTransInfo(); + if (pgxc_transaction_count == 0) + { + elog(DEBUG1, "pg_unlock: there is no transaction"); + break; + } + + /*build transaction dependency graph*/ + InitAllEdge(); + + /*detect deadlocks*/ + DetectDeadlock(); + if (pgxc_deadlock_count == 0) + { + /*program ends until there is no deadlock*/ + elog(DEBUG1, "pg_unlock: there is no deadlock"); + break; + } + /*recover deadlocks through killing one transaction*/ + RecoverDeadlock(); + + /*record output info*/ + RPALLOC(Partxns->Ptxns); + InitPrinttxn(&(Partxns->Ptxns[Partxns->Ptxns_count])); + if (Partxns->Ptxns[Partxns->Ptxns_count].txn_count > 0) + { + Partxns->Ptxns_count++; + } + DropAlldeadlocks(); + DropAlltransactions(); + }while(true); + MemoryContextSwitchTo(oldcontext); + } + + funcctx = SRF_PERCALL_SETUP(); + Partxns = (PrintAllRollbackTxns *) funcctx->user_fctx; + + if (Partxns->index < Partxns->Ptxns_count) + { + PrintRollbackTxn *temp = &(Partxns->Ptxns[Partxns->index]); + rec = Partxns->Ptxns[Partxns->index].txn; + nodename = Partxns->Ptxns[Partxns->index].nodename; + query = Partxns->Ptxns[Partxns->index].cancel_query; + + while (temp->index < temp->txn_count) + { + MemSet(values, 0, sizeof(values)); + MemSet(nulls, 0, sizeof(nulls)); + + if (temp->index == 0) + { + values[0] = Int32GetDatum(Partxns->index); + } + values[1] = 
Int32GetDatum(temp->index); + values[2] = PointerGetDatum(cstring_to_text(rec[temp->index])); + values[3] = PointerGetDatum(cstring_to_text(nodename[temp->index])); + values[4] = PointerGetDatum(cstring_to_text(query[temp->index])); + tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls); + temp->index++; + if (temp->index < temp->txn_count) + { + SRF_RETURN_NEXT(funcctx, HeapTupleGetDatum(tuple)); + } + } + Partxns->index++; + tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls); + SRF_RETURN_NEXT(funcctx, HeapTupleGetDatum(tuple)); + } + else + { + RFREE(Partxns->Ptxns); + Partxns->index = 0; + DropAlldeadlocks(); + DropAlltransactions(); + pfree(cn_health_map); + pfree(dn_health_map); + if (cn_node_list) + { + pfree(cn_node_list); + cn_nodes_num = 0; + } + if (dn_node_list) + { + pfree(dn_node_list); + dn_nodes_num = 0; + } + if (sdn_node_list) + { + pfree(sdn_node_list); + sdn_nodes_num = 0; + } + SRF_RETURN_DONE(funcctx); + } +} + +/* + * pg_unlock_check_deadlock -- detect deadlocks without recover + * input: no + * output: info of deadlocks + */ +Datum pg_unlock_check_deadlock(PG_FUNCTION_ARGS) +{ +#ifdef ACCESS_CONTROL_ATTR_NUM +#undef ACCESS_CONTROL_ATTR_NUM +#endif +#define ACCESS_CONTROL_ATTR_NUM 4 + FuncCallContext *funcctx; + PrintDeadlock *Pdeadlock; + char **rec; + char **nodes; + char **querys; + HeapTuple tuple; + + Datum values[ACCESS_CONTROL_ATTR_NUM]; + bool nulls[ACCESS_CONTROL_ATTR_NUM]; + + if (SRF_IS_FIRSTCALL()) + { + MemoryContext oldcontext; + TupleDesc tupdesc; + funcctx = SRF_FIRSTCALL_INIT(); + + oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); + + tupdesc = CreateTemplateTupleDesc(ACCESS_CONTROL_ATTR_NUM, false); + TupleDescInitEntry(tupdesc, (AttrNumber) 1, "deadlockid", + INT8OID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 2, "deadlocks", + TEXTOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 3, "nodename", + TEXTOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 4, "query", + TEXTOID, -1, 0); + funcctx->tuple_desc = BlessTupleDesc(tupdesc); + + funcctx->user_fctx = palloc0(sizeof(PrintDeadlock)); + Pdeadlock = (PrintDeadlock*)funcctx->user_fctx; + + ResetGlobalVariables(); + /*get node list*/ + PgxcNodeGetOidsExtend(&cn_node_list, &dn_node_list, &sdn_node_list, + &cn_nodes_num, &dn_nodes_num, &sdn_nodes_num, true); + cn_health_map = palloc0(cn_nodes_num * sizeof(bool)); + dn_health_map = palloc0(dn_nodes_num * sizeof(bool)); + + /*get all transaction info and associat it to global xid*/ + GetAllTransInfo(); + + /*build transaction dependency graph*/ + InitAllEdge(); + + /*detect deadlocks*/ + DetectDeadlock(); + + /*record output info*/ + InitPrintDeadlock(Pdeadlock); + MemoryContextSwitchTo(oldcontext); + } + + funcctx = SRF_PERCALL_SETUP(); + Pdeadlock = (PrintDeadlock *) funcctx->user_fctx; + rec = Pdeadlock->deadlock; + nodes = Pdeadlock->nodename; + querys = Pdeadlock->query; + + if (Pdeadlock->index < Pdeadlock->deadlock_count) + { + MemSet(values, 0, sizeof(values)); + MemSet(nulls, 0, sizeof(nulls)); + + values[0] = Int32GetDatum(Pdeadlock->index); + values[1] = PointerGetDatum(cstring_to_text(rec[Pdeadlock->index])); + values[2] = PointerGetDatum(cstring_to_text(nodes[Pdeadlock->index])); + values[3] = PointerGetDatum(cstring_to_text(querys[Pdeadlock->index])); + tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls); + Pdeadlock->index++; + SRF_RETURN_NEXT(funcctx, HeapTupleGetDatum(tuple)); + } + else + { + DropPrintDeadlock(Pdeadlock); + DropAlldeadlocks(); + DropAlltransactions(); + 
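/* final SRF call: release the node lists and health maps allocated during the first call */ +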
pfree(cn_health_map); + pfree(dn_health_map); + if (cn_node_list) + { + pfree(cn_node_list); + cn_nodes_num = 0; + } + if (dn_node_list) + { + pfree(dn_node_list); + dn_nodes_num = 0; + } + if (sdn_node_list) + { + pfree(sdn_node_list); + sdn_nodes_num = 0; + } + SRF_RETURN_DONE(funcctx); + } +} + +/* + * pg_unlock_check_dependency -- only detect transaction dependency + * input: no + * output: info of transaction dependency + */ +Datum pg_unlock_check_dependency(PG_FUNCTION_ARGS) +{ +#ifdef ACCESS_CONTROL_ATTR_NUM +#undef ACCESS_CONTROL_ATTR_NUM +#endif +#define ACCESS_CONTROL_ATTR_NUM 4 + FuncCallContext *funcctx; + PrintEdge *Pedge; + char **rec; + char **nodes; + char **querys; + HeapTuple tuple; + + Datum values[ACCESS_CONTROL_ATTR_NUM]; + bool nulls[ACCESS_CONTROL_ATTR_NUM]; + + if (SRF_IS_FIRSTCALL()) + { + MemoryContext oldcontext; + TupleDesc tupdesc; + funcctx = SRF_FIRSTCALL_INIT(); + + oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); + + tupdesc = CreateTemplateTupleDesc(ACCESS_CONTROL_ATTR_NUM, false); + TupleDescInitEntry(tupdesc, (AttrNumber) 1, "dependencyid", + INT8OID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 2, "dependency", + TEXTOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 3, "nodename", + TEXTOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 4, "query", + TEXTOID, -1, 0); + funcctx->tuple_desc = BlessTupleDesc(tupdesc); + + funcctx->user_fctx = palloc0(sizeof(PrintEdge)); + Pedge = (PrintEdge*)funcctx->user_fctx; + + ResetGlobalVariables(); + /*get node list*/ + PgxcNodeGetOidsExtend(&cn_node_list, &dn_node_list, &sdn_node_list, + &cn_nodes_num, &dn_nodes_num, &sdn_nodes_num, true); + cn_health_map = palloc0(cn_nodes_num * sizeof(bool)); + dn_health_map = palloc0(dn_nodes_num * sizeof(bool)); + + /*get all transaction info and associat it to global xid*/ + GetAllTransInfo(); + + /*build transaction dependency graph*/ + InitAllEdge(); + + /*record output info*/ + InitPrintEdge(Pedge); + MemoryContextSwitchTo(oldcontext); + } + + funcctx = SRF_PERCALL_SETUP(); + Pedge = (PrintEdge *) funcctx->user_fctx; + rec = Pedge->edge; + nodes = Pedge->nodes; + querys = Pedge->querys; + + if (Pedge->index < Pedge->edge_count) + { + MemSet(values, 0, sizeof(values)); + MemSet(nulls, 0, sizeof(nulls)); + + values[0] = Int32GetDatum(Pedge->index); + values[1] = PointerGetDatum(cstring_to_text(rec[Pedge->index])); + values[2] = PointerGetDatum(cstring_to_text(nodes[Pedge->index])); + values[3] = PointerGetDatum(cstring_to_text(querys[Pedge->index])); + tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls); + Pedge->index++; + SRF_RETURN_NEXT(funcctx, HeapTupleGetDatum(tuple)); + } + else + { + DropPrintEdge(Pedge); + DropAlltransactions(); + pfree(cn_health_map); + pfree(dn_health_map); + if (cn_node_list) + { + pfree(cn_node_list); + cn_nodes_num = 0; + } + if (dn_node_list) + { + pfree(dn_node_list); + dn_nodes_num = 0; + } + if (sdn_node_list) + { + pfree(sdn_node_list); + sdn_nodes_num = 0; + } + SRF_RETURN_DONE(funcctx); + } + +} + +/* + * pg_unlock_killbypid -- kill certain transaction by user + * input: nodename, pid + * output: execute result success of error info + */ +Datum pg_unlock_killbypid(PG_FUNCTION_ARGS) +{ + char *Kstatus; + char *nodename = text_to_cstring(PG_GETARG_TEXT_P(0)); + uint32 kpid = PG_GETARG_UINT32(1); + int size = sizeof(char) * 100; + char gid[MAX_GID]; + text *t_status = NULL; + int txnindex; + + Kstatus = (char *)palloc0(size); + + if(!IS_PGXC_COORDINATOR) + { + elog(ERROR, "can only called on 
coordinator"); + } + + do + { + ResetGlobalVariables(); + /*get node list*/ + PgxcNodeGetOidsExtend(&cn_node_list, &dn_node_list, &sdn_node_list, + &cn_nodes_num, &dn_nodes_num, &sdn_nodes_num, true); + cn_health_map = palloc0(cn_nodes_num * sizeof(bool)); + dn_health_map = palloc0(dn_nodes_num * sizeof(bool)); + + /*get all transaction info and associat it to global xid*/ + GetAllTransInfo(); + + /*find global transaction according to nodename and pid*/ + txnindex = check_node_pid(nodename, kpid); + if (txnindex < 0) + { + snprintf(Kstatus, size, "Fail:error not exists node:%s or pid:%u on node %s", nodename, kpid, nodename); + break; + } + if (get_pgxc_nodetype(get_pgxc_nodeoid(nodename)) != 'C') + { + snprintf(Kstatus, size, "Fail:error node:%s is not coordinator", nodename); + break; + } + memcpy(gid, pgxc_transaction[txnindex].gid, sizeof(gid)); + + /*kill the transaction*/ + KillTxn(txnindex); + DropAlltransactions(); + + /*check whether this transaction is existed*/ + LoadTransaction(get_pgxc_nodeoid(nodename)); + if(!check_exist_gid(gid)) + { + snprintf(Kstatus, size, "Success: pid:%u on node %s is killed", kpid, nodename); + break; + } + else + { + snprintf(Kstatus, size, "Fail:error pid:%u on node %s is not killed", kpid, nodename); + break; + } + }while(0); + DropAlltransactions(); + pfree(nodename); + pfree(cn_health_map); + pfree(dn_health_map); + if (cn_node_list) + { + pfree(cn_node_list); + cn_nodes_num = 0; + } + if (dn_node_list) + { + pfree(dn_node_list); + dn_nodes_num = 0; + } + if (sdn_node_list) + { + pfree(sdn_node_list); + sdn_nodes_num = 0; + } + t_status = cstring_to_text(Kstatus); + pfree(Kstatus); + return PointerGetDatum(t_status); +} + + +/* + * execute_on_single_node -- execute query on certain node and get results + * input: node oid, execute query, number of attribute in results, results + * return: (Datum) 0 + */ +static Datum +execute_on_single_node(Oid node, const char *query, int attnum, TupleTableSlots *tuples) //delete numnodes, delete nodelist, insert node +{ + + int i; + int ii; + Datum datum = (Datum) 0; + bool isnull = false; + int i_tuple; + int i_attnum; + /*check health of node*/ + bool ishealthy; + +#ifdef XCP + EState *estate; + MemoryContext oldcontext; + RemoteQuery *plan; + RemoteQueryState *pstate; + TupleTableSlot *result = NULL; + Var *dummy; + char ntype; +#endif + + + /*get heathy status of query node*/ + PoolPingNodeRecheck(node); + PgxcNodeGetHealthMap(cn_node_list, dn_node_list, &cn_nodes_num, &dn_nodes_num, cn_health_map, dn_health_map); + if (get_pgxc_nodetype(node) == 'C') + { + for (i = 0; i < cn_nodes_num; i++) + { + if (cn_node_list[i] == node) + { + ishealthy = cn_health_map[i]; + } + } + } + else + { + for (i = 0; i < dn_nodes_num; i++) + { + if (dn_node_list[i] == node) + { + ishealthy = dn_health_map[i]; + } + } + } + +#ifdef XCP + /* + * Make up RemoteQuery plan node + */ + plan = makeNode(RemoteQuery); + plan->combine_type = COMBINE_TYPE_NONE; + plan->exec_nodes = makeNode(ExecNodes); + plan->exec_type = EXEC_ON_NONE; + + ntype = PGXC_NODE_NONE; + plan->exec_nodes->nodeList = lappend_int(plan->exec_nodes->nodeList, + PGXCNodeGetNodeId(node, &ntype)); + if (ntype == PGXC_NODE_NONE) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Unknown node Oid: %u", node))); + else if (ntype == PGXC_NODE_COORDINATOR) + { + plan->exec_type = EXEC_ON_COORDS; + } + else + { + plan->exec_type = EXEC_ON_DATANODES; + } + + plan->sql_statement = (char*)query; + plan->force_autocommit = false; + /* + * We only need the target 
entry to determine result data type. + * So create dummy even if real expression is a function. + */ + for (ii = 1; ii <= attnum; ii++) + { + dummy = makeVar(1, ii, TEXTOID, 0, InvalidOid, 0); //TEXTOID?? + plan->scan.plan.targetlist = lappend(plan->scan.plan.targetlist, + makeTargetEntry((Expr *) dummy, ii, NULL, false)); + } + /* prepare to execute */ + estate = CreateExecutorState(); + oldcontext = MemoryContextSwitchTo(estate->es_query_cxt); + estate->es_snapshot = GetActiveSnapshot(); + pstate = ExecInitRemoteQuery(plan, estate, 0); + MemoryContextSwitchTo(oldcontext); + + /*execute query on node when node is healthy*/ + INIT(tuples->slot); + tuples->attnum = 0; + if (ishealthy) + { + result = ExecRemoteQuery((PlanState *) pstate); + tuples->attnum = attnum; + i_tuple = 0; + i_attnum = 0; + while (result != NULL && !TupIsNull(result)) + { + slot_getallattrs(result); + RPALLOC(tuples->slot); + tuples->slot[i_tuple] = (char **) palloc(attnum * sizeof(char *)); + + for (i_attnum = 0; i_attnum < attnum; i_attnum++) + { + if (result->tts_values[i_attnum] != (Datum)0) + { + tuples->slot[i_tuple][i_attnum] = text_to_cstring(DatumGetTextP(result->tts_values[i_attnum])); + } + else + { + tuples->slot[i_tuple][i_attnum] = NULL; + } + } + tuples->slot_count++; + + result = ExecRemoteQuery((PlanState *) pstate); + i_tuple++; + } + } + ExecEndRemoteQuery(pstate); +#else + /* + * Connect to SPI manager + */ + if ((ret = SPI_connect()) < 0) + /* internal error */ + elog(ERROR, "SPI connect failure - returned %d", ret); + + initStringInfo(&buf); + + /* Get pg_***_size function results from all Datanodes */ + nodename = get_pgxc_nodename(node); + + ret = SPI_execute_direct(query, nodename); + spi_tupdesc = SPI_tuptable->tupdesc; + + if (ret != SPI_OK_SELECT) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("failed to execute query '%s' on node '%s'", + query, nodename))); + } + + /* + * The query must always return one row having one column: + */ + Assert(SPI_processed == 1 && spi_tupdesc->natts == 1); + + datum = SPI_getbinval(SPI_tuptable->vals[0], spi_tupdesc, 1, &isnull); + + /* For single node, don't assume the type of datum. It can be bool also. 
*/ + SPI_finish(); +#endif + return (Datum) 0; + if (isnull +#ifdef _MLS_ + && (NULL != result)) +#endif + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Expected datum but got null instead " + "while executing query '%s'", + query))); + PG_RETURN_DATUM(datum); +} + +/* + * GetAllTransInfo -- get all transactions from all nodes and stores them in pgxc_transaction + * input: no + * return: no + */ +void GetAllTransInfo(void) +{ + int i; + for (i = 0; i < cn_nodes_num; i++) + { + LoadTransaction(cn_node_list[i]); + } + for (i = 0; i < dn_nodes_num; i++) + { + LoadTransaction(dn_node_list[i]); + } +} + +/* + * LoadTransaction -- get transactions from certain node and stores them in pgxc_transaction + * input: node oid + * return: no + */ +void LoadTransaction(Oid node) +{ + const char *query_stmt = "select a1.pid::text, a1.locktype::text, a2.datname::text, a2.relname::text, " + "a1.page::text, a1.tuple::text, a1.mode::text, a1.granted::text, a1.transactionid::text, a3.query::text " + "from (select locktype::text, database, relation, page::text, " + "tuple::text, mode::text, granted::text, pid::text, transactionid::text " + "from pg_locks where (locktype = 'relation' or locktype = 'page' or locktype = 'tuple' or locktype = 'transactionid')" + " and (pid is not null))a1 " + "left join " + "(select distinct pg_database.datname::text, pg_class.relname::text, " + "pg_locks.database, pg_locks.relation " + "from pg_database, pg_class, pg_locks, pg_namespace " + "where pg_database.oid = pg_locks.database and pg_class.oid = pg_locks.relation " + "and pg_namespace.oid = pg_class.relnamespace and pg_namespace.nspname " + "not in ('pg_catalog','information_schema'))a2 " + "on a1.database = a2.database and a1.relation = a2.relation " + "left join " + "(select pid::text, query::text from pg_stat_activity)a3 on a1.pid = a3.pid and a3.pid != '%d' " + "where (a1.locktype = 'transactionid' and a1.transactionid is not null)" + " or (a1.locktype != 'transactionid' and a2.datname is not null and a2.relname is not null) order by a1.pid;"; + + char query_txnid[2048]; + + /*stores tuples in result_txnid*/ + TupleTableSlots result_txnid; + int i; + int i_txn; + int ntuples; + uint32 pid; + char *temp = NULL; + char *rel_name = NULL; + char *db_name = NULL; + char *ptr = NULL; + char *gid = NULL; + int nodeid = 0; + lockinfo templock; + + sprintf(query_txnid, query_stmt, MyProcPid); + execute_on_single_node(node, query_txnid, 10, &result_txnid); + if (result_txnid.slot == NULL) + { + elog(DEBUG1, "pg_unlock: there is no transaction on node %s", get_pgxc_nodename(node)); + return; + } + + ntuples = result_txnid.slot_count; + for (i = 0; i < ntuples; i++) + { + pid = strtoul(TTSgetvalue(&result_txnid, i, 0), NULL, 10); + /*get global xid of pid on node*/ + gid = GetGxid(node, pid); + /*select for update apply for transactionid without global xid*/ + if (gid == NULL) + { + continue; + } + + /*check whether the gid is already existed*/ + for (i_txn = 0; i_txn < pgxc_transaction_count; i_txn++) + { + if (strcmp(gid, pgxc_transaction[i_txn].gid) == 0) + { + break; + } + } + + /*insert this new transaction when gid is not find in pgxc_transaction*/ + if (i_txn >= pgxc_transaction_count) + { + RPALLOC(pgxc_transaction); + InitTransaction(pgxc_transaction_count); + memcpy(pgxc_transaction[pgxc_transaction_count].gid, gid, sizeof(char) * MAX_GID); + pgxc_transaction_count++; + i_txn = pgxc_transaction_count-1; + } + add_pid_node(i_txn, pid, node); + ptr = strtok(gid, ":"); + nodeid = atoi(ptr); + 
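/* The gid is assumed to begin with the numeric id of the initiating coordinator: e.g. a gid of the form "3:12589" (hypothetical value) would map to coordinator node id 3. */ +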
pgxc_transaction[i_txn].initiator = get_nodeoid_from_nodeid(nodeid, PGXC_NODE_COORDINATOR); + //pgxc_transaction[i_txn].initiator = get_pgxc_nodeoid(ptr); + pfree(gid); + + /*read lockinfo from result_txnid*/ + templock.m_pid = pid; + templock.m_node = node; + templock.m_locktype = find_locktype(TTSgetvalue(&result_txnid, i, 1)); + + /*we only consider the first four locktypes*/ + if (templock.m_locktype > Locktype_Transactionid) + { + continue; + } + + db_name = TTSgetvalue(&result_txnid, i, 2); + if (db_name) + { + memcpy(templock.m_dbname, db_name, strlen(db_name)+1); + } + else + { + MemSet(templock.m_dbname, 0, sizeof(templock.m_dbname)); + } + rel_name = TTSgetvalue(&result_txnid, i, 3); + if (rel_name) + { + memcpy(templock.m_relname, rel_name, strlen(rel_name)+1); + } + else + { + MemSet(templock.m_relname, 0, sizeof(templock.m_relname)); + } + if (TTSgetvalue(&result_txnid, i, 4) != NULL) + { + templock.m_page = strtoul(TTSgetvalue(&result_txnid, i, 4), NULL, 10); + } + else + { + templock.m_page = 0; + } + if (TTSgetvalue(&result_txnid, i, 5) != NULL) + { + templock.m_tuple = strtoul(TTSgetvalue(&result_txnid, i, 5), NULL, 10); + } + else + { + templock.m_tuple = 0; + } + templock.m_mode = find_mode(TTSgetvalue(&result_txnid, i, 6)); + if (TTSgetvalue(&result_txnid, i, 8) != NULL) + { + templock.m_transactionid = strtoul(TTSgetvalue(&result_txnid, i, 8), NULL, 10); + } + else + { + templock.m_transactionid = 0; + } + temp = TTSgetvalue(&result_txnid, i, 7); + + if (TTSgetvalue(&result_txnid, i, 9)) + { + if (strlen(TTSgetvalue(&result_txnid, i, 9)) <= 1024) + { + templock.m_query = (char *)pstrdup(TTSgetvalue(&result_txnid, i, 9)); + } + else + { + templock.m_query = (char *)palloc0(1025); + strncpy(templock.m_query, TTSgetvalue(&result_txnid, i, 9), 1024); + } + } + else + { + templock.m_query = NULL; + } + /*put templock into transaction hold list or wait list due to granted*/ + if (strcmp(temp, "true") == 0) + { + templock.m_granted = true; + PALLOC(pgxc_transaction[i_txn].hold, templock); + } + else + { + templock.m_granted = false; + PALLOC(pgxc_transaction[i_txn].wait, templock); + } + if (pgxc_transaction[i_txn].initiator == node) + { + if (templock.m_query) + { + pgxc_transaction[i_txn].query = pstrdup(templock.m_query); + } + else + { + pgxc_transaction[i_txn].query = pstrdup("unknown"); + } + } + + } + DropTupleTableSlots(&result_txnid); +} + +/* + * TTSgetvalue -- get attribute from TupleTableSlots + * input: result, index of tuple, index of field + * return: attribute result + */ +static char * TTSgetvalue(TupleTableSlots *result, int tup_num, int field_num) +{ + return result->slot[tup_num][field_num]; +} + +static void DropTupleTableSlots(TupleTableSlots * Slots) +{ + int i; + int j; + for (i = 0; i < Slots->slot_count; i++) + { + if (Slots->slot[i]) + { + for (j = 0; j < Slots->attnum; j++) + { + if (Slots->slot[i][j]) + { + pfree(Slots->slot[i][j]); + } + } + pfree(Slots->slot[i]); + } + } + RFREE(Slots->slot); + Slots->attnum = 0; + return; +} + +void InitTransaction(int txn_index) +{ + transaction *temp; + temp = pgxc_transaction; + if (temp == NULL) + { + elog(LOG, "pg_unlock: error pgxc_transaction is null"); + exit(1); + } + INIT(temp[txn_index].pid); + INIT(temp[txn_index].node); + INIT(temp[txn_index].hold); + INIT(temp[txn_index].wait); + INIT(temp[txn_index].out); + temp[txn_index].searched = false; + temp[txn_index].alive = true; + INIT(temp[txn_index].deadlock); + temp[txn_index].wait_txn = 0; + temp[txn_index].query = NULL; +} + +/* + * add_pid_node -- 
add pid and node to certain transaction + * input: index of transaction, pid, node oid + * return: void + */ +void add_pid_node(int txn_index, uint32 pid, Oid node) +{ + transaction *temp; + temp = pgxc_transaction; + + PALLOC(temp[txn_index].pid, pid); + PALLOC(temp[txn_index].node, node); +} + +LOCKTYPE find_locktype(char * locktype) +{ + LOCKTYPE j; + if (strcmp(locktype, "relation") == 0) + { + j = Locktype_Relation; + } + else if (strcmp(locktype, "page") == 0) + { + j = Locktype_Page; + } + else if (strcmp(locktype, "tuple") == 0) + { + j = Locktype_Tuple; + } + else if (strcmp(locktype, "transactionid") == 0) + { + j = Locktype_Transactionid; + } + else if (strcmp(locktype, "object") == 0) + { + j = Locktype_Object; + } + else if (strcmp(locktype, "userlock") == 0) + { + j = Locktype_Userlock; + } + else if (strcmp(locktype, "advisory") == 0) + { + j = Locktype_Advisory; + } + else + { + elog(LOG, "pg_unlock: unknown locktype: %s", locktype); + exit (1); + } + return j; +} + +MODE find_mode(char *mode) +{ + MODE i; + if (strcmp(mode, "AccessShareLock") == 0) + { + i = Lockmode_ASL; + } + else if (strcmp(mode, "RowShareLock") == 0) + { + i = Lockmode_RSL; + } + else if (strcmp(mode, "RowExclusiveLock") == 0) + { + i = Lockmode_REL; + } + else if (strcmp(mode, "ShareUpdateExclusiveLock") == 0) + { + i = Lockmode_SUEL; + } + else if (strcmp(mode, "ShareLock") == 0) + { + i = Lockmode_SL; + } + else if (strcmp(mode, "ShareRowExclusiveLock") == 0) + { + i = Lockmode_SREL; + } + else if (strcmp(mode, "ExclusiveLock") == 0) + { + i = Lockmode_EL; + } + else if (strcmp(mode, "AccessExclusiveLock") == 0) + { + i = Lockmode_AEL; + } + else + { + elog(LOG, "pg_unlock: unkown lock mode %s", mode); + exit (1); + } + return i; +} + +/* + * InitAllEdge -- build all transaction dependency graph and stores in pgxc_transaction, pgxc_edge + * input: no + * return: no + */ +void InitAllEdge(void) +{ + int i; + int j; + pgxc_edge = (int **)palloc(pgxc_transaction_count * sizeof(int *)); + for (i = 0; i < pgxc_transaction_count; i++) + { + pgxc_edge[i] = (int *)palloc(pgxc_transaction_count * sizeof(int)); + for (j = 0; j < pgxc_transaction_count; j++) + { + pgxc_edge[i][j] = 0; + } + } + + /*search for all edges*/ + for (i = 0; i < pgxc_transaction_count; i++) + { + for (j = 0; j < pgxc_transaction_count; j++) + { + if (i == j) + { + continue; + } + InitEdge(i, j); + } + } +} + +/* + * InitEdge -- build dependency between two transactions and stores it in pgxc_transaction, pgxc_edge + * input: pre transaction index, post transaction index + * return: no + */ +void InitEdge(int pre, int post) +{ + int i; + int out_count; + Edge *out = NULL; + int pre_end = pgxc_transaction[pre].wait_count; + lockinfo *pre_wait = pgxc_transaction[pre].wait; + + for (i = 0; i < pre_end; i++) + { + /*if lock pre_wait[i] conflict with pgxc_transaction[post]*/ + if (is_conflict_withtxn(pre_wait+i, post)) + { + RPALLOC(pgxc_transaction[pre].out); + out = pgxc_transaction[pre].out; + out_count = pgxc_transaction[pre].out_count; + out[out_count].pre = pre; + out[out_count].post = post; + pgxc_transaction[pre].out_count++; + pgxc_edge[pre][post] = 1; + break; + } + } +} + +/* + * is_conflict_withtxn -- build dependency between two transactions and stores it in pgxc_transaction, pgxc_edge + * input: pre transaction index, post transaction index + * return: conflict or not + */ +bool is_conflict_withtxn(lockinfo *wait, int post_txn) +{ + bool conflict = false; + lockinfo *hold = pgxc_transaction[post_txn].hold; + int hold_count = 
pgxc_transaction[post_txn].hold_count; + int i; + for (i = 0; i < hold_count; i++) + { + if (is_conflict_withlock(wait, hold + i)) + { + conflict = true; + break; + } + } + return conflict; +} + +/* + * is_conflict_withlock -- build dependency between two locks + * input: pre lockinfo, post lockinfo + * return: conflict or not + */ +bool is_conflict_withlock(lockinfo *wait, lockinfo *hold) +{ + bool conflict = false; + bool sameobject = true; + + /*locks of same granted will not conflict*/ + if (wait->m_node != hold->m_node || wait->m_granted == hold->m_granted) + { + return conflict; + } + + /*locks of different locktype will not conflict*/ + if ((wait->m_locktype < Locktype_Transactionid) ^ (hold->m_locktype < Locktype_Transactionid)) + { + sameobject = false; + } + + /*check locktype among relation, page and tuple*/ + else if(wait->m_locktype < Locktype_Transactionid && hold->m_locktype < Locktype_Transactionid) + { + if ((strcmp(wait->m_dbname, hold->m_dbname) == 0) && !check_include(wait, hold)) + { + sameobject = false; + } + } + + /*check between transactionid*/ + else if(wait->m_locktype == Locktype_Transactionid && hold->m_locktype == Locktype_Transactionid) + { + if (wait->m_node != hold->m_node || wait->m_transactionid != hold->m_transactionid) + { + sameobject = false; + } + } + + /*check locktype among relation, page and tuple*/ + if (sameobject == true) + { + conflict = (m_matrix[(int)wait->m_mode][(int)hold->m_mode] == 1); + } + return conflict; +} + +bool check_include(lockinfo *wait, lockinfo *hold) +{ + bool include = false; + LOCKTYPE i = wait->m_locktype; + LOCKTYPE j = hold->m_locktype; + int min; + int max; + + if ((i >= Locktype_Transactionid) || (j >= Locktype_Transactionid)) + { + return include; + } + min = i <= j ? i : j; + max = i <= j ? 
j : i; + switch (min) + { + case Locktype_Relation: + if (strcmp(wait->m_relname, hold->m_relname) == 0) + { + include = true; + } + break; + case Locktype_Page: + if (strcmp(wait->m_relname, hold->m_relname) == 0) + { + /*locks in same relation and page or + relation lock and page lock of the same relation*/ + if ((i != j) || (wait->m_page == hold->m_page)) + { + include = true; + } + } + break; + case Locktype_Tuple: + if (strcmp(wait->m_relname, hold->m_relname) == 0) + { + if (max == Locktype_Relation) + { + include = true; + break; + } + if (wait->m_page == hold->m_page) + { + if (max == Locktype_Page) + { + include = true; + break; + } + if (wait->m_tuple == hold->m_tuple) + { + if (max == Locktype_Tuple) + { + include = true; + break; + } + } + } + } + break; + default: + elog(LOG, "pg_unlock: could not match locktype %d to relation, page or tuple", min); + break; + } + return include; +} + +void InitDeadlock(void) +{ + RPALLOC(pgxc_deadlock); + INIT(pgxc_deadlock[pgxc_deadlock_count].txns); + RPALLOC(pgxc_deadlock[pgxc_deadlock_count].txns); + pgxc_deadlock[pgxc_deadlock_count].killed = false; + return; +} + +void DropDeadlock(deadlock *loop) +{ + RFREE(loop->txns); + loop->killed = false; + return; +} + +void DropAlldeadlocks(void) +{ + int i; + for (i = pgxc_deadlock_count - 1; i >= 0; i--) + { + DropDeadlock(pgxc_deadlock+i); + } + RFREE(pgxc_deadlock); +} + +/* + * DetectDeadlock -- detect deadlock according to transaction dependency and store them in pgxc_deadlock + * input: no + * return: no + */ +void DetectDeadlock(void) +{ + int i; + deeplist dfs; + int loop_start; + + InitDeeplist(&dfs); + for (i = 0; i < pgxc_transaction_count; i++) + { + if (pgxc_deadlock_count > MAX_DEADLOCK) + { + break; + } + + /*we can find all the deadlocks that conclude the transaction through tranvers it*/ + if (pgxc_transaction[i].searched == true) + { + continue; + } + else + { + /*push i into stack*/ + PALLOC(dfs.stack, i); + PALLOC(dfs.stackpre, -1); + } + while (dfs.stack_count != 0 ) + { + if (pgxc_deadlock_count > MAX_DEADLOCK) + { + break; + } + /*loop_start indicate whether deadlock exists*/ + loop_start = traverse(&dfs); + if (loop_start > -1) + { + path_deadlock(&dfs, loop_start); + } + } + ClearDeeplist(&dfs); + } + DropDeeplist(&dfs); +} + +/* + * traverse -- traverse according to transaction dependency and store them in list->path + * input: deeplist + * return: index of deadlock start transaction in path + */ +int traverse(deeplist* list) +{ + int res = -1; + + /*pop the last element in stack*/ + int i; + int post; + int start = list->stack[list->stack_count - 1]; + int startpre = list->stackpre[list->stackpre_count - 1]; + + list->stack_count--; + list->stackpre_count--; + pgxc_transaction[start].searched = true; + + /*delete element in path, if the pop element in stack is not its post*/ + if (list->path_count > 0) + { + while(list->path[list->path_count-1] != startpre) + { + list->path_count--; + list->txn_exist[list->path[list->path_count]] = -1; + } + } + + /*push the pop element into path*/ + PALLOC(list->path, start); + list->txn_exist[start] = list->path_count-1; + + /*find all the outedge of the above pop element*/ + for (i = 0; i < pgxc_transaction[start].out_count; i++) + { + post = pgxc_transaction[start].out[i].post; + + /*if the transaction post does not exit in path*/ + if (list->txn_exist[post] < 0) + { + PALLOC(list->stack, post); + PALLOC(list->stackpre, start); + } + /*or return the index of path according to the transaction*/ + else + { + res = 
list->txn_exist[post]; + } + } + return res; +} + +/* + * path_deadlock -- add element in path to pgxc_deadlock + * input: deeplist, index of deadlock start element in path + * return: no + */ +void path_deadlock(deeplist *list, int start) +{ + deadlock *loop = NULL; + int i; + int ii; + int ij; + int total_count = list->path_count - start; + bool isexist = false; + int ii_txns_count; + int ij_txns_count; + + InitDeadlock(); + loop = pgxc_deadlock+pgxc_deadlock_count; + + for (i = start; i < list->path_count; i++) + { + PALLOC(loop->txns, list->path[i]); + } + /*first check whether the deadlock is exits*/ + for (i = 0; i < pgxc_deadlock_count; i++) + { + if (pgxc_deadlock[i].txns_count == total_count) + { + isexist = true; + ii_txns_count = pgxc_deadlock[i].txns_count; + ij_txns_count = loop->txns_count * 2 - 1; + for (ii = 0, ij = 0; ii < ii_txns_count && ij < ij_txns_count;) + { + if (pgxc_deadlock[i].txns[ii] != loop->txns[ij % loop->txns_count]) + { + if (ii == 0 && ij < loop->txns_count) + { + ij++; + } + else + { + /*deadlock not exist*/ + isexist = false; + break; + } + } + else + { + ii++; + ij++; + } + } + if (isexist == true) + { + break; + } + /*deadlock in list[start~path_count-1] is already exist*/ + } + } + + if (isexist == false) + { + pgxc_deadlock_count++; + } + else + { + RFREE(loop->txns); + } + /*if not existed then insert into pgxc_deadlock*/ + return; +} + +void InitDeeplist(deeplist* list) +{ + int i; + INIT(list->stack); + INIT(list->stackpre); + INIT(list->path); + list->txn_exist = (int *)palloc(pgxc_transaction_count * sizeof(int)); + for (i = 0; i < pgxc_transaction_count; i++) + { + list->txn_exist[i] = -1; + } + return; +} + +void ClearDeeplist(deeplist * list) +{ + int i = 0; + list->stack_count = 0; + list->stackpre_count = 0; + list->path_count = 0; + for (i = 0; i < pgxc_transaction_count; i++) + { + list->txn_exist[i] = -1; + } + return; +} + +void DropDeeplist(deeplist * list) +{ + RFREE(list->stack); + RFREE(list->stackpre); + RFREE(list->path); + pfree(list->txn_exist); + list->txn_exist = NULL; + return; +} + +/* + * RecoverDeadlock -- kill at most one transaction in each deadlock + * input: no + * return: no + */ +void RecoverDeadlock(void) +{ + int* sort_txnid = NULL; + if (pgxc_deadlock_count == 0) + { + return; + } + + sort_txnid = (int *)palloc(pgxc_transaction_count * sizeof(int)); + /*Count deadlocks belong to each transactions*/ + CountDeadlocks(); + CountWaitTxn(); + + /*sort transaction index by deadlock count*/ + SortByDeadlock(sort_txnid); + /*first kill transaction with the most deadlocks*/ + KillDeadlockByTxn(sort_txnid[0]); + pfree(sort_txnid); + return; +} + +void CountDeadlocks(void) +{ + int i; + int j; + + for (i = 0; i < pgxc_deadlock_count; i++) + { + for (j = 0; j < pgxc_deadlock[i].txns_count; j++) + { + PALLOC(pgxc_transaction[pgxc_deadlock[i].txns[j]].deadlock, i); + } + } + return; +} + +void SortByDeadlock(int *sort_txnid) +{ + int i; + for (i = 0; i < pgxc_transaction_count; i++) + { + sort_txnid[i] = i; + } + quiksort(sort_txnid, 0, pgxc_transaction_count-1); +} + +void quiksort(int *sort_txnid, int low, int high) +{ + int i = low; + int j = high; + int temp = sort_txnid[i]; + + if( low > high) + { + return ; + } + while(i < j) + { + while(((pgxc_transaction[sort_txnid[j]].deadlock_count + < pgxc_transaction[temp].deadlock_count) + || ((pgxc_transaction[sort_txnid[j]].deadlock_count + == pgxc_transaction[temp].deadlock_count) + && (pgxc_transaction[sort_txnid[j]].wait_txn + <= pgxc_transaction[temp].wait_txn))) + && (i < 
j)) + { + j--; + } + sort_txnid[i] = sort_txnid[j]; + while(((pgxc_transaction[sort_txnid[i]].deadlock_count + > pgxc_transaction[temp].deadlock_count) + || ((pgxc_transaction[sort_txnid[j]].deadlock_count + == pgxc_transaction[temp].deadlock_count) + && (pgxc_transaction[sort_txnid[j]].wait_txn + >= pgxc_transaction[temp].wait_txn))) + && (i < j)) + { + i++; + } + sort_txnid[j]= sort_txnid[i]; + } + sort_txnid[i] = temp; + quiksort(sort_txnid,low,i-1); + quiksort(sort_txnid,j+1,high); +} + + +/* + * KillDeadlockByTxn -- kill certain transaction + * input: transaction index + * return: no + */ +void KillDeadlockByTxn(int txnid) +{ + int i; + transaction *txn = pgxc_transaction; + Oid* node = pgxc_transaction[txnid].node; + uint32* pid = pgxc_transaction[txnid].pid; + char query[500]; + TupleTableSlots result; + + if (DeadlockExists(txnid) == false) + { + return; + } + + txn[txnid].alive = false; + for (i = 0; i < txn[txnid].deadlock_count; i++) + { + pgxc_deadlock[txn[txnid].deadlock[i]].killed = true; + } + + for (i = 0; i < pgxc_transaction[txnid].node_count; i++) + { + snprintf(query, 500,"select pg_cancel_backend(%u);", pid[i]); + execute_on_single_node(node[i], query, 0, &result); + DropTupleTableSlots(&result); + } + return; +} + +bool DeadlockExists(int id) +{ + bool res = false; + transaction *txn = pgxc_transaction; + int i; + for (i = 0; i < txn[id].deadlock_count; i++) + { + if (pgxc_deadlock[txn[id].deadlock[i]].killed == false) + { + res = true; + } + } + return res; +} + +void DropTransaction(int i) +{ + transaction *txn = pgxc_transaction; + + txn[i].gid[0] = '\0'; + txn[i].searched = false; + txn[i].alive = true; + txn[i].wait_txn = 0; + + RFREE(txn[i].pid); + RFREE(txn[i].node); + if (txn[i].hold_size && txn[i].hold->m_query) + { + pfree(txn[i].hold->m_query); + txn[i].hold->m_query = NULL; + } + RFREE(txn[i].hold); + if (txn[i].wait_size && txn[i].wait->m_query) + { + pfree(txn[i].wait->m_query); + txn[i].wait->m_query = NULL; + } + RFREE(txn[i].wait); + RFREE(txn[i].deadlock); + RFREE(txn[i].out); + if (txn[i].query) + { + pfree(txn[i].query); + txn[i].query = NULL; + } +} + +void DropAlltransactions(void) +{ + int i; + + for (i = 0; i < pgxc_transaction_count; i++) + { + DropTransaction(i); + } + + if (pgxc_edge != NULL) + { + for (i = 0; i < pgxc_transaction_count; i++) + { + pfree(pgxc_edge[i]); + } + if (pgxc_transaction_count) + { + pfree(pgxc_edge); + } + pgxc_edge = NULL; + } + + RFREE(pgxc_transaction); +} + +void InitPrintEdge(PrintEdge *Pedge) +{ + int i; + int j; + int index1; + int index2; + int len = 0; + + Pedge->index = 0; + INIT(Pedge->edge); + INIT(Pedge->nodes); + INIT(Pedge->querys); + RPALLOC(Pedge->edge); + RPALLOC(Pedge->nodes); + RPALLOC(Pedge->querys); + + for (i = 0; i < pgxc_transaction_count; i++) + { + for (j = 0; j < pgxc_transaction[i].out_count; j++) + { + RPALLOC(Pedge->edge); + Pedge->edge[Pedge->edge_count] = (char *) palloc(2*MAX_GID*sizeof(char) + 10); + + index1 = pgxc_transaction[i].out[j].pre; + index2 = pgxc_transaction[i].out[j].post; + snprintf(Pedge->edge[Pedge->edge_count], 2*MAX_GID*sizeof(char) + 10, "%s --> %s", + pgxc_transaction[index1].gid, pgxc_transaction[index2].gid); + + RPALLOC(Pedge->nodes); + Pedge->nodes[Pedge->nodes_count] = (char *) palloc(2*NAMEDATALEN*sizeof(char) + 10); + snprintf(Pedge->nodes[Pedge->nodes_count], 2*NAMEDATALEN*sizeof(char) + 10, "%s --> %s", + get_pgxc_nodename(pgxc_transaction[index1].initiator), + get_pgxc_nodename(pgxc_transaction[index2].initiator)); + + RPALLOC(Pedge->querys); + len = 
0; + if (pgxc_transaction[index1].query) + { + len += strlen(pgxc_transaction[index1].query); + } + if (pgxc_transaction[index2].query) + { + len += strlen(pgxc_transaction[index2].query); + } + Pedge->querys[Pedge->querys_count] = (char *) palloc(len+ 10); + snprintf(Pedge->querys[Pedge->querys_count], len+10, "%s --> %s", + pgxc_transaction[index1].query, pgxc_transaction[index2].query); + + Pedge->edge_count++; + Pedge->nodes_count++; + Pedge->querys_count++; + } + } +} + +void DropPrintEdge(PrintEdge *Pedge) +{ + int i; + if (NULL == Pedge) + { + return; + } + for (i = 0; i < Pedge->edge_count; i++) + { + pfree(Pedge->edge[i]); + } + RFREE(Pedge->edge); + + for (i = 0; i < Pedge->nodes_count; i++) + { + pfree(Pedge->nodes[i]); + } + RFREE(Pedge->nodes); + + for (i = 0; i < Pedge->querys_count; i++) + { + pfree(Pedge->querys[i]); + } + RFREE(Pedge->querys); + Pedge->index = 0; + Pedge = NULL; +} + +void InitPrintDeadlock(PrintDeadlock *Pdeadlock) +{ + int i; + int j; + StringInfoData query; + StringInfoData nodename; + StringInfoData deadlock_query; + + Pdeadlock->index = 0; + Pdeadlock->deadlock = NULL; + Pdeadlock->deadlock_count = pgxc_deadlock_count; + Pdeadlock->per_size = (int *)palloc(pgxc_deadlock_count * sizeof(int)); + Pdeadlock->deadlock = (char **)palloc(pgxc_deadlock_count * sizeof(char *)); + Pdeadlock->nodename = (char **)palloc(pgxc_deadlock_count * sizeof(char *)); + Pdeadlock->query = (char **)palloc(pgxc_deadlock_count * sizeof(char *)); + + for (i = 0; i < pgxc_deadlock_count; i++) + { + Pdeadlock->per_size[i] = pgxc_deadlock[i].txns_count*(MAX_GID+10)*sizeof(char); + Pdeadlock->deadlock[i] = (char *) palloc(Pdeadlock->per_size[i]); + Pdeadlock->nodename[i] = (char *) palloc(pgxc_deadlock[i].txns_count * NAMEDATALEN); + + initStringInfo(&query); + initStringInfo(&nodename); + initStringInfo(&deadlock_query); + + for (j = 0; j < pgxc_deadlock[i].txns_count; j++) + { + appendStringInfo(&query, "%-15s(%-15s:%-12d)", pgxc_transaction[pgxc_deadlock[i].txns[j]].gid, + get_pgxc_nodehost(pgxc_transaction[pgxc_deadlock[i].txns[j]].initiator), + get_pgxc_nodeport(pgxc_transaction[pgxc_deadlock[i].txns[j]].initiator)); + appendStringInfo(&nodename, "%s", get_pgxc_nodename(pgxc_transaction[pgxc_deadlock[i].txns[j]].initiator)); + appendStringInfo(&deadlock_query, "%s", pgxc_transaction[pgxc_deadlock[i].txns[j]].query); + if (j < pgxc_deadlock[i].txns_count-1) + { + appendStringInfoChar(&query, '\n'); + appendStringInfoChar(&nodename, '\n'); + appendStringInfoChar(&deadlock_query, '\n'); + } + } + snprintf(Pdeadlock->deadlock[i], Pdeadlock->per_size[i], "%s", query.data); + snprintf(Pdeadlock->nodename[i], pgxc_deadlock[i].txns_count * NAMEDATALEN, "%s", nodename.data); + + Pdeadlock->query[i] = (char *) palloc(deadlock_query.len + 1); + snprintf(Pdeadlock->query[i], deadlock_query.len + 1, "%s", deadlock_query.data); + } +} + +void DropPrintDeadlock(PrintDeadlock *Pdeadlock) +{ + int i; + for (i = 0; i < Pdeadlock->deadlock_count; i++) + { + pfree(Pdeadlock->deadlock[i]); + pfree(Pdeadlock->nodename[i]); + pfree(Pdeadlock->query[i]); + } + pfree(Pdeadlock->deadlock); + pfree(Pdeadlock->nodename); + pfree(Pdeadlock->query); + pfree(Pdeadlock->per_size); + Pdeadlock->deadlock = NULL; + Pdeadlock->nodename = NULL; + Pdeadlock->query = NULL; + Pdeadlock->per_size = NULL; + Pdeadlock->index = 0; + Pdeadlock->deadlock_count = 0; +} + +void InitPrinttxn(PrintRollbackTxn *Ptxn) +{ + int i; + int len; + + Ptxn->index = 0; + INIT(Ptxn->txn); + INIT(Ptxn->nodename); + 
INIT(Ptxn->cancel_query); + + for (i = 0; i < pgxc_transaction_count; i++) + { + if (pgxc_transaction[i].alive == false) + { + RPALLOC(Ptxn->txn); + Ptxn->txn[Ptxn->txn_count] = (char *) palloc((MAX_GID+10) * sizeof(char)); + sprintf(Ptxn->txn[Ptxn->txn_count], "%-15s(%-15s:%-15d)", pgxc_transaction[i].gid, + get_pgxc_nodehost(pgxc_transaction[i].initiator), + get_pgxc_nodeport(pgxc_transaction[i].initiator)); + RPALLOC(Ptxn->nodename); + Ptxn->nodename[Ptxn->nodename_count] = (char *) palloc(NAMEDATALEN); + sprintf(Ptxn->nodename[Ptxn->nodename_count], "%s", get_pgxc_nodename(pgxc_transaction[i].initiator)); + + RPALLOC(Ptxn->cancel_query); + len = 0; + if (pgxc_transaction[i].query) + { + len += strlen(pgxc_transaction[i].query); + Ptxn->cancel_query[Ptxn->cancel_query_count] = (char *) palloc0(len + 1); + sprintf(Ptxn->cancel_query[Ptxn->cancel_query_count], "%s", pgxc_transaction[i].query); + } + else + { + Ptxn->cancel_query[Ptxn->cancel_query_count] = (char *) palloc0(10); + sprintf(Ptxn->cancel_query[Ptxn->cancel_query_count], "unknown"); + } + + Ptxn->txn_count++; + Ptxn->nodename_count++; + Ptxn->cancel_query_count++; + } + } +} + +void DropPrinttxn(PrintRollbackTxn *Ptxn) +{ + int i; + for (i = 0; i < Ptxn->txn_count; i++) + { + pfree(Ptxn->txn[i]); + } + + for (i = 0; i < Ptxn->cancel_query_count; i++) + { + pfree(Ptxn->cancel_query[i]); + } + + for (i = 0; i < Ptxn->nodename_count; i++) + { + pfree(Ptxn->nodename[i]); + } + RFREE(Ptxn->txn); + RFREE(Ptxn->cancel_query); + RFREE(Ptxn->nodename); + Ptxn->index = 0; + Ptxn = NULL; +} + +/* + * GetGxid -- get global xid of certain pid on certain node + * input: node oid, pid + * return: global xid + */ +char *GetGxid(Oid node, uint32 pid) +{ + char *res = NULL; + char *temp = NULL; + TupleTableSlots result; + char query[100]; + + snprintf(query, 100, "select pg_findgxid(%u)", pid); + execute_on_single_node(node, query, 1, &result); + if (result.slot == NULL) + { + elog(LOG, "pg_unlock: could not obtain global transactionid from pid %u on node %s", pid, get_pgxc_nodename(node)); + return res; + } + temp = TTSgetvalue(&result, 0, 0); + if (temp != NULL) + { + res = (char *)palloc(20 * sizeof(char)); + memcpy(res, temp, 20 * sizeof(char)); + } + DropTupleTableSlots(&result); + return res; +} + +/* + * pg_findgxid -- get global xid of certain pid + * input: pid + * return: global xid + */ +Datum pg_findgxid(PG_FUNCTION_ARGS) +{ + uint32 pid = PG_GETARG_UINT32(0); + char *globalXid = GetGlobalTransactionId(pid); + text *t_gxid = NULL; + if (globalXid != NULL) + { + t_gxid = cstring_to_text(globalXid); + return PointerGetDatum(t_gxid); + } + PG_RETURN_NULL(); +} + +/* + * check_node_pid -- check whether certain pid on certain node exists + * input: nodename, pid + * return: exist or not + */ +int check_node_pid(char *nodename, uint32 pid) +{ + int res = -1; + int i; + int j; + for (i = 0; i < pgxc_transaction_count; i++) + { + if (strcmp(get_pgxc_nodename(pgxc_transaction[i].initiator) , nodename) == 0) + { + for (j = 0; j < pgxc_transaction[i].pid_count; j++) + { + if (pid == pgxc_transaction[i].pid[j]) + { + res = i; + } + } + } + } + return res; +} + +/* + * KillTxn -- kill certain transaction + * input: transaction index + * return: no + */ +void KillTxn(int txnid) +{ + int i; + TupleTableSlots result; + char query[500]; + Oid* node = pgxc_transaction[txnid].node; + uint32* pid = pgxc_transaction[txnid].pid; + + for (i = 0; i < pgxc_transaction[txnid].node_count; i++) + { + snprintf(query, 500,"select pg_cancel_backend(%u);", 
pid[i]); + execute_on_single_node(node[i], query, 0, &result); + DropTupleTableSlots(&result); + } + return; +} + +/* + * check_exist_gid -- check whether certain transaction exists + * input: transaction global xid + * return: exist or not + */ +bool check_exist_gid(char *gid) +{ + bool res = false; + int i; + for (i = 0; i < pgxc_transaction_count; i++) + { + if (strcmp(pgxc_transaction[i].gid, gid) == 0) + { + res = true; + } + } + return res; +} + +void CountWaitTxn(void) +{ + int i; + int j; + for (i = 0; i < pgxc_transaction_count; i++) + { + for (j = 0; j < pgxc_transaction_count; j++) + { + if (pgxc_edge[i][j] == 1) + { + pgxc_transaction[j].wait_txn++; + } + } + } +} diff --git a/contrib/pg_unlock/pg_unlock.control b/contrib/pg_unlock/pg_unlock.control new file mode 100644 index 00000000..033558c4 --- /dev/null +++ b/contrib/pg_unlock/pg_unlock.control @@ -0,0 +1,5 @@ +# deadlock detect extention +comment = 'tools for detect and unlock all the deadlocks' +default_version = '1.0' +module_pathname = '$libdir/pg_unlock' +relocatable = true \ No newline at end of file diff --git a/contrib/tbase_subscription/Makefile b/contrib/tbase_subscription/Makefile new file mode 100644 index 00000000..59cb31c6 --- /dev/null +++ b/contrib/tbase_subscription/Makefile @@ -0,0 +1,19 @@ +# contrib/tbase_subscription/Makefile + +MODULE_big = tbase_subscription +OBJS = tbase_subscription.o + +EXTENSION = tbase_subscription +DATA = tbase_subscription--1.0.sql \ + tbase_subscription--unpackaged--1.0.sql + +ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +else +subdir = contrib/tbase_subscription +top_builddir = ../.. +include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif diff --git a/contrib/tbase_subscription/tbase_subscription--1.0.sql b/contrib/tbase_subscription/tbase_subscription--1.0.sql new file mode 100644 index 00000000..02391235 --- /dev/null +++ b/contrib/tbase_subscription/tbase_subscription--1.0.sql @@ -0,0 +1,36 @@ +/* contrib/tbase_subscription/tbase_subscription--1.0.sql */ + +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "CREATE EXTENSION tbase_subscription" to load this file. \quit + +CREATE TABLE tbase_subscription +( + sub_name name, -- Name of TBase subscription created on coordinator + sub_ignore_pk_conflict bool, -- ignore primary key conflict occurs when apply + sub_manual_hot_date text, -- GUC parameter, manual_hot_date + sub_temp_hot_date text, -- GUC parameter, temp_hot_date + sub_temp_cold_date text, -- GUC parameter, temp_cold_date + sub_parallel_number int4, -- Split TBase subscription into multiple parallel tbase-sub-subscriptions + sub_is_all_actived bool -- Whether all parallel tbase-sub-subscriptions are actived. + -- If there are some parallel tbase-sub-subscriptions, + -- other tbase-sub-subscriptions can be activated only after + -- the first tbase-sub-subscription has completed the data COPY. + -- And other tbase-sub-subscriptions can only be activated by + -- the first tbase-sub-subscription. 
+) WITH OIDS; + +CREATE TABLE tbase_subscription_parallel +( + sub_parent oid, -- Oid of parent tbase subsription stored in tbase_subscription above + sub_child oid, -- A TBase subscription may be split into multiple parallel tbase-sub-subscriptions, + -- and each tbase-sub-subscription is recorded in pg_subscription with a given oid + sub_index int4, -- Index of this tbase-sub-subscription in all parallel tbase-sub-subscriptions + sub_active_state bool, -- Whether the current tbase-sub-subscription is activated by the first tbase-sub-subscription, + -- valid only when sub_index > 0 + sub_active_lsn pg_lsn -- The LSN value that was set when the current tbase-sub-subscription was activated by the first + -- tbase-sub-subscription, valid only when sub_index > 0 +) WITH OIDS; + +-- Don't want this to be available to non-superusers. +REVOKE ALL ON TABLE tbase_subscription FROM PUBLIC; +REVOKE ALL ON TABLE tbase_subscription_parallel FROM PUBLIC; diff --git a/contrib/tbase_subscription/tbase_subscription--unpackaged--1.0.sql b/contrib/tbase_subscription/tbase_subscription--unpackaged--1.0.sql new file mode 100644 index 00000000..9b576874 --- /dev/null +++ b/contrib/tbase_subscription/tbase_subscription--unpackaged--1.0.sql @@ -0,0 +1,4 @@ +/* contrib/tbase_subscription/tbase_subscription--unpackaged--1.0.sql */ + +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "CREATE EXTENSION tbase_subscription" to load this file. \quit diff --git a/contrib/tbase_subscription/tbase_subscription.c b/contrib/tbase_subscription/tbase_subscription.c new file mode 100644 index 00000000..33312ad2 --- /dev/null +++ b/contrib/tbase_subscription/tbase_subscription.c @@ -0,0 +1,26 @@ +#include "postgres.h" + +#include +#include + +#include "access/hash.h" +#include "executor/instrument.h" +#include "funcapi.h" +#include "mb/pg_wchar.h" +#include "miscadmin.h" +#include "parser/analyze.h" +#include "parser/parsetree.h" +#include "parser/scanner.h" +#include "parser/scansup.h" +#include "pgstat.h" +#include "storage/fd.h" +#include "storage/ipc.h" +#include "storage/spin.h" +#include "tcop/utility.h" +#include "utils/builtins.h" +#include "access/xact.h" +#include "access/transam.h" +#include "utils/timestamp.h" + +PG_MODULE_MAGIC; + diff --git a/contrib/tbase_subscription/tbase_subscription.control b/contrib/tbase_subscription/tbase_subscription.control new file mode 100644 index 00000000..5a7ae862 --- /dev/null +++ b/contrib/tbase_subscription/tbase_subscription.control @@ -0,0 +1,5 @@ +# tbase_subscription extension +comment = 'support for hot and cold subscriptions and two-way subscriptions' +default_version = '1.0' +module_pathname = '$libdir/tbase_subscription' +relocatable = true From 6343ea17701a471d25fa75959785bc1ebef33f1b Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Sat, 1 Jan 2022 17:51:54 +0800 Subject: [PATCH 321/578] create branch v2.3.0 --- src/backend/utils/adt/version.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/utils/adt/version.c b/src/backend/utils/adt/version.c index 9b7f49d3..2636c448 100644 --- a/src/backend/utils/adt/version.c +++ b/src/backend/utils/adt/version.c @@ -78,7 +78,7 @@ #include "utils/builtins.h" -#define TBASE_VERSION_STR "TBase_V2.0.0_release" +#define TBASE_VERSION_STR "TBase_V2.3.0_release" Datum pgsql_version(PG_FUNCTION_ARGS) From 3747e3c0df82b4d1d45018976e8e1231f32d7f01 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Sat, 1 Jan 2022 17:55:29 +0800 Subject: [PATCH 322/578] master branch --- 
src/backend/utils/adt/version.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/utils/adt/version.c b/src/backend/utils/adt/version.c index 2636c448..b11ef63b 100644 --- a/src/backend/utils/adt/version.c +++ b/src/backend/utils/adt/version.c @@ -78,7 +78,7 @@ #include "utils/builtins.h" -#define TBASE_VERSION_STR "TBase_V2.3.0_release" +#define TBASE_VERSION_STR "TBase_master" Datum pgsql_version(PG_FUNCTION_ARGS) From 20e3f09bc0e8fe8507890b9b5d539506bff4729a Mon Sep 17 00:00:00 2001 From: JennyJennyChen <48546628+JennyJennyChen@users.noreply.github.com> Date: Wed, 19 Jan 2022 16:48:25 +0800 Subject: [PATCH 323/578] fix parallel select hang (#119) https://github.com/Tencent/TBase/issues/108 Co-authored-by: bethding --- src/backend/pgxc/squeue/squeue.c | 47 +++++++++++++++++++------------- 1 file changed, 28 insertions(+), 19 deletions(-) diff --git a/src/backend/pgxc/squeue/squeue.c b/src/backend/pgxc/squeue/squeue.c index a4deed0d..1c6fc478 100644 --- a/src/backend/pgxc/squeue/squeue.c +++ b/src/backend/pgxc/squeue/squeue.c @@ -430,6 +430,7 @@ typedef struct ParallelSendDataQueue size_t send_data_len; size_t write_data_len; bool long_tuple; + bool wait_free_space; DataPumpSndStatus status; /* status of the data sending */ bool stuck; bool last_send; @@ -633,7 +634,7 @@ static void *ParallelSenderThreadMain(void *arg); static void ParallelSenderSendData(ParallelSendThreadControl *threadControl, bool last_send); static bool SendNodeData(ParallelSendNodeControl *node, bool last_send); static char *GetNodeData(ParallelSendDataQueue *buf, uint32 *uiLen, bool *long_tuple); -static uint32 NodeDataSize(ParallelSendDataQueue *buf, bool *long_tuple); +static uint32 NodeDataSize(ParallelSendDataQueue *buf, bool *long_tuple, bool *wait_free_space); static void IncNodeDataOff(ParallelSendDataQueue *buf, uint32 uiLen); static int RawSendNodeData(ParallelSendNodeControl *node, int32 sock, char * data, int32 len, int32 * reason); static int32 SetNodeSocket(void *sndctl, int32 nodeindex, int32 nodeId, int32 socket); @@ -647,7 +648,7 @@ static void SendNodeDataRemote(SharedQueue squeue, ParallelWorkerControl *contro TupleTableSlot *slot, Tuplestorestate **tuplestore, MemoryContext tmpcxt); static bool ParallelSendDataRow(ParallelWorkerControl *control, ParallelSendDataQueue *buf, char *data, size_t len, int32 consumerIdx); static uint32 BufferFreeSpace(ParallelSendDataQueue *buf); -static void SetBufferBorder(ParallelSendDataQueue *buf, bool long_tuple); +static void SetBufferBorderAndWaitFlag(ParallelSendDataQueue *buf, bool long_tuple, bool wait_free_space); static void PutNodeData(ParallelSendDataQueue *buf, char *data, uint32 len); static char *GetBufferWriteOff(ParallelSendDataQueue *buf, uint32 *uiLen); static void IncBufferWriteOff(ParallelSendDataQueue *buf, uint32 uiLen); @@ -6649,6 +6650,7 @@ InitParallelSendSharedData(SharedQueue sq, ParallelSendControl *senderControl, i buffer->send_data_len = 0; buffer->write_data_len = 0; buffer->long_tuple = 0; + buffer->wait_free_space = false; buffer->status = DataPumpSndStatus_no_socket; buffer->stuck = false; buffer->last_send = false; @@ -6956,6 +6958,7 @@ SendNodeData(ParallelSendNodeControl *node, bool last_send) {// #lizard forgives bool should_send = false; bool long_tuple = false; + bool wait_free_space = false; int i = 0; uint32 len = 0; int32 ret = 0; @@ -6992,10 +6995,13 @@ SendNodeData(ParallelSendNodeControl *node, bool last_send) if (!should_send) { - data_size = NodeDataSize(buffer, &long_tuple); + data_size = 
NodeDataSize(buffer, &long_tuple, &wait_free_space); - /* too small to send */ - if (data_size < g_SndBatchSize * 1024) + /* + * If wait_free_space is true, sender thread should send data to free buffer space, + * else wait until data_size reach to batch threshold. + */ + if (!wait_free_space && data_size < g_SndBatchSize * 1024) { node->current_buffer = (node->current_buffer + 1) % node->numParallelWorkers; @@ -7072,7 +7078,7 @@ SendNodeData(ParallelSendNodeControl *node, bool last_send) } /* get left data length */ - len = NodeDataSize(buffer, &long_tuple); + len = NodeDataSize(buffer, &long_tuple, &wait_free_space); if (len == 0) { @@ -7114,7 +7120,8 @@ GetNodeData(ParallelSendDataQueue *buf, uint32 *uiLen, bool *long_tuple) char *data; if (buf) { - if (0 == NodeDataSize(buf, long_tuple)) + bool wait_flag = false; + if (0 == NodeDataSize(buf, long_tuple, &wait_flag)) { *uiLen = 0; return NULL; @@ -7166,7 +7173,7 @@ GetNodeData(ParallelSendDataQueue *buf, uint32 *uiLen, bool *long_tuple) /* Return total data size in buffer */ static uint32 -NodeDataSize(ParallelSendDataQueue *buf, bool *long_tuple) +NodeDataSize(ParallelSendDataQueue *buf, bool *long_tuple, bool *wait_free_space) { uint32 border = 0; uint32 tail = 0; @@ -7177,6 +7184,7 @@ NodeDataSize(ParallelSendDataQueue *buf, bool *long_tuple) tail = buf->bufTail; border = buf->bufBorder; *long_tuple = buf->long_tuple; + *wait_free_space = buf->wait_free_space; spinlock_unlock(&(buf->bufLock)); if (INVALID_BORDER == border) @@ -7882,9 +7890,10 @@ ParallelSendDataRow(ParallelWorkerControl *control, ParallelSendDataQueue *buf, /* no space left, */ if (BufferFreeSpace(buf) < (uint32)tuple_len) { + /* Set flag to notice sender thread send data without waiting batch size threshold */ if (!long_tuple) { - SetBufferBorder(buf, false); + SetBufferBorderAndWaitFlag(buf, false, true); pg_usleep(50L); return false; } @@ -7906,7 +7915,7 @@ ParallelSendDataRow(ParallelWorkerControl *control, ParallelSendDataQueue *buf, /* Data */ PutNodeData(buf, data, len); - SetBufferBorder(buf, false); + SetBufferBorderAndWaitFlag(buf, false, false); } else { @@ -7923,7 +7932,7 @@ ParallelSendDataRow(ParallelWorkerControl *control, ParallelSendDataQueue *buf, /* put message 'D' */ while (data_len < header_len) { - SetBufferBorder(buf, false); + SetBufferBorderAndWaitFlag(buf, false, true); pg_usleep(100L); data_len = BufferFreeSpace(buf); @@ -7962,7 +7971,7 @@ ParallelSendDataRow(ParallelWorkerControl *control, ParallelSendDataQueue *buf, } else { - SetBufferBorder(buf, true); + SetBufferBorderAndWaitFlag(buf, true, true); pg_usleep(100L); if (buf->status == DataPumpSndStatus_error) @@ -7972,7 +7981,7 @@ ParallelSendDataRow(ParallelWorkerControl *control, ParallelSendDataQueue *buf, } } - SetBufferBorder(buf, false); + SetBufferBorderAndWaitFlag(buf, false, false); } buf->ntuples++; @@ -8011,15 +8020,15 @@ BufferFreeSpace(ParallelSendDataQueue *buf) } static void -SetBufferBorder(ParallelSendDataQueue *buf, bool long_tuple) +SetBufferBorderAndWaitFlag(ParallelSendDataQueue *buf, bool long_tuple, bool wait_free_space) { spinlock_lock(&(buf->bufLock)); buf->bufBorder = buf->bufHead; buf->long_tuple = long_tuple; + buf->wait_free_space = wait_free_space; spinlock_unlock(&(buf->bufLock)); } - /* Send data into buffer */ static void PutNodeData(ParallelSendDataQueue *buf, char *data, uint32 len) @@ -8568,7 +8577,7 @@ ParallelFastSendDatarow(ParallelSendDataQueue *buf, TupleTableSlot *slot, void * } else { - SetBufferBorder(buf, true); + 
SetBufferBorderAndWaitFlag(buf, true, true); pg_usleep(50L); if (buf->status == DataPumpSndStatus_error) @@ -8596,11 +8605,11 @@ ParallelFastSendDatarow(ParallelSendDataQueue *buf, TupleTableSlot *slot, void * } } - SetBufferBorder(buf, false); + SetBufferBorderAndWaitFlag(buf, false, false); } else { - SetBufferBorder(buf, false); + SetBufferBorderAndWaitFlag(buf, false, false); } @@ -8628,7 +8637,7 @@ ParallelFastSendDatarow(ParallelSendDataQueue *buf, TupleTableSlot *slot, void * #if 1 /* Not enough space, wakeup sender. */ //ParallelSendWakeupSender(control, buf, nodeindex); - SetBufferBorder(buf, false); + SetBufferBorderAndWaitFlag(buf, false, true); //pg_usleep(50L); #endif return false; From caf14f34f10bb2995d5aac6b67071ce9bb973f8b Mon Sep 17 00:00:00 2001 From: JennyJennyChen <48546628+JennyJennyChen@users.noreply.github.com> Date: Wed, 19 Jan 2022 16:53:07 +0800 Subject: [PATCH 324/578] fix active snapshot null when set_global_snapshot = false (#120) * fix parallel select hang https://github.com/Tencent/TBase/issues/108 * fix active snapshot null when set_global_snapshot = false https://github.com/Tencent/TBase/issues/106 Co-authored-by: bethding --- src/backend/executor/execParallel.c | 9 +++++++++ src/backend/pgxc/pool/execRemote.c | 3 +-- src/include/pgxc/execRemote.h | 1 + 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/src/backend/executor/execParallel.c b/src/backend/executor/execParallel.c index 920bc32e..72cc61df 100644 --- a/src/backend/executor/execParallel.c +++ b/src/backend/executor/execParallel.c @@ -752,6 +752,15 @@ ExecInitParallelPlan(PlanState *planstate, EState *estate, int nworkers) } #endif +#ifdef __TBASE__ + /* set snapshot as needed */ + if (!g_set_global_snapshot && !ActiveSnapshotSet()) + { + SetSnapshot(estate); + } +#endif + + /* Everyone's had a chance to ask for space, so now create the DSM. */ InitializeParallelDSM(pcxt); diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index d105295a..c781abcf 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -150,7 +150,6 @@ pgxc_node_remote_prefinish(char *prepareGID, char *nodestring); static void pgxc_abort_connections(PGXCNodeAllHandles *all_handles); static void pgxc_node_remote_commit(TranscationType txn_type, bool need_release_handle); static void pgxc_node_remote_abort(TranscationType txn_type, bool need_release_handle); -static bool SetSnapshot(EState *state); static int pgxc_node_remote_commit_internal(PGXCNodeAllHandles *handles, TranscationType txn_type); #endif @@ -12943,7 +12942,7 @@ SubTranscation_PreAbort_Remote(void) } -static bool +bool SetSnapshot(EState *state) { bool result = false; diff --git a/src/include/pgxc/execRemote.h b/src/include/pgxc/execRemote.h index 60910919..5b6d46c6 100644 --- a/src/include/pgxc/execRemote.h +++ b/src/include/pgxc/execRemote.h @@ -438,6 +438,7 @@ extern TupleDesc create_tuple_desc(char *msg_body, size_t len); extern void ExecFinishRemoteSubplan(RemoteSubplanState *node); extern void ExecShutdownRemoteSubplan(RemoteSubplanState *node); +extern bool SetSnapshot(EState *state); #endif #ifdef __SUBSCRIPTION__ From b65d457372da7fb94f50844af835962c9091cf56 Mon Sep 17 00:00:00 2001 From: dafoerx Date: Tue, 15 Mar 2022 15:20:38 +0800 Subject: [PATCH 325/578] [BUGFIX] 1.Skip invalid relid in group information check 2.Determine whether tables of different groups are allowed to insert. 
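In short, as read from the new is_table_allowed_insert() check below: an INSERT ... SELECT into a shard table is accepted only when the source table shares the target's cold group, is either in the same group or replicated, and its datanode list overlaps the target's; FROM items that do not resolve to a relid (such as CTE names) are now skipped instead of being opened. A minimal sketch of the allowed case, assuming a cluster where the replication table's nodes cover the shard group (table names here are illustrative, not part of this patch):

    create table src_rep(f1 int, f2 int) distribute by replication;
    create table dst_shard(f1 int, f2 int) distribute by shard(f1);
    insert into src_rep values (1, 1), (2, 2);
    -- allowed: src_rep is replicated and its nodes overlap dst_shard's group
    insert into dst_shard select * from src_rep;
    -- a source shard table whose group shares no datanodes with dst_shard still fails:
    -- ERROR: shard table could not be inserted from any other tables in different group
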
--- src/backend/parser/analyze.c | 80 +++++++++++++++++++------- src/test/regress/expected/insert_1.out | 48 ++++++++++++++++ src/test/regress/sql/insert.sql | 28 +++++++++ 3 files changed, 136 insertions(+), 20 deletions(-) diff --git a/src/backend/parser/analyze.c b/src/backend/parser/analyze.c index fb5e27f1..8d01fc39 100644 --- a/src/backend/parser/analyze.c +++ b/src/backend/parser/analyze.c @@ -575,6 +575,44 @@ transformDeleteStmt(ParseState *pstate, DeleteStmt *stmt) return qry; } +/* + * Determine whether tables of different groups are allowed to insert. + */ +static bool +is_table_allowed_insert(RelationLocInfo *from, RelationLocInfo *to) +{ + List *from_nodelist = from->rl_nodeList; + List *to_nodelist = to->rl_nodeList; + List *diff = NULL; + bool result = false; + + /* necessary check, will never happened. */ + if (from == NULL || to == NULL) + { + elog(ERROR, "is_reptable_allow_insert, invalid params %s:%s", + from ? " " : "from is null", + to ? " " : "to is null"); + } + + /* step1: From table must be replication table. */ + if ( +#ifdef __COLD_HOT__ + (from->coldGroupId != to->coldGroupId) || +#endif + ((from->groupId != to->groupId) && (!IsRelationReplicated(from)))) + { + return false; + } + + /* step2: Data distribution nodes have intersections */ + diff = list_difference_int(to_nodelist, from_nodelist); + + /* stemp3: Insertions are allowed if there is an intersection of data distribution nodes. */ + result = (list_length(diff) != list_length(to_nodelist)); + list_free(diff); + return result; +} + /* * transformInsertStmt - * transform an Insert Statement @@ -704,35 +742,37 @@ transformInsertStmt(ParseState *pstate, InsertStmt *stmt) Query *selectQuery; #ifdef __TBASE__ - /* prevent insert into cold_hot table select ... */ - if (pstate->p_target_relation) - { + /* prevent insert into cold_hot table select ... 
*/ + if (pstate->p_target_relation) + { RelationLocInfo *target_rel_loc_info = pstate->p_target_relation->rd_locator_info; RelationLocInfo *from_rel_loc_info; if (target_rel_loc_info && target_rel_loc_info->locatorType == LOCATOR_TYPE_SHARD) - { + { foreach(lc, selectStmt->fromClause) - { + { Node *node = lfirst(lc); if (IsA(node, RangeVar)) { - Relation rel = heap_openrv((RangeVar *) node, AccessShareLock); - - from_rel_loc_info = rel->rd_locator_info; - if (from_rel_loc_info == NULL || /* from system table */ -#ifdef __COLD_HOT__ - from_rel_loc_info->coldGroupId != target_rel_loc_info->coldGroupId || -#endif - from_rel_loc_info->groupId != target_rel_loc_info->groupId) - { - elog(ERROR, "shard table could not be inserted from any other tables in different group"); + Oid relid = RangeVarGetRelid((RangeVar *) node, NoLock, true); + + if (InvalidOid != relid) + { + Relation rel = heap_open(relid, AccessShareLock); + + from_rel_loc_info = rel->rd_locator_info; + if (!is_table_allowed_insert(from_rel_loc_info, target_rel_loc_info)) + { + elog(ERROR, + "shard table could not be inserted from any other tables in different group"); + } + + heap_close(rel, AccessShareLock); + } } - - heap_close(rel, AccessShareLock); - } - } - } + } + } } #endif diff --git a/src/test/regress/expected/insert_1.out b/src/test/regress/expected/insert_1.out index 592137e9..40dd14ab 100644 --- a/src/test/regress/expected/insert_1.out +++ b/src/test/regress/expected/insert_1.out @@ -728,3 +728,51 @@ insert into returningwrtest values (2, 'foo') returning returningwrtest; (1 row) drop table returningwrtest; +-- check insert into a shard table from a CTE table +create table t1(f1 int,f2 int) distribute by shard(f1); +NOTICE: Replica identity is needed for shard table, please add to this table through "alter table" command. +create table t2(f1 int,f2 int) distribute by shard(f1); +NOTICE: Replica identity is needed for shard table, please add to this table through "alter table" command. +insert into t1 values(1,1); +insert into t1 values(2,2); +with baseInfo as(select * from t1) +insert into t2 select * from baseInfo; +drop table t1; +drop table t2; +-- Determine whether tables of different groups are allowed to insert. +set default_locator_type to shard; +drop table if exists t2; +NOTICE: table "t2" does not exist, skipping +drop table if exists t2_rep; +NOTICE: table "t2_rep" does not exist, skipping +drop table if exists t2_new; +NOTICE: table "t2_new" does not exist, skipping +create table t2(f1 int,f2 int); +NOTICE: Replica identity is needed for shard table, please add to this table through "alter table" command. +create table t2_rep(f1 int,f2 int) distribute by replication; +insert into t2_rep values(1,1),(2,2); +insert into t2 select * from t2_rep; +select count(*) from t2_rep; + count +------- + 2 +(1 row) + +select count(*) from t2; + count +------- + 2 +(1 row) + +create table t2_new as select * from t2_rep; +NOTICE: Replica identity is needed for shard table, please add to this table through "alter table" command. 
+select count(*) from t2_new; + count +------- + 2 +(1 row) + +drop table t2; +drop table t2_rep; +drop table t2_new; +reset default_locator_type; diff --git a/src/test/regress/sql/insert.sql b/src/test/regress/sql/insert.sql index 75d801b9..b121eac0 100644 --- a/src/test/regress/sql/insert.sql +++ b/src/test/regress/sql/insert.sql @@ -407,3 +407,31 @@ alter table returningwrtest2 drop c; alter table returningwrtest attach partition returningwrtest2 for values in (2); insert into returningwrtest values (2, 'foo') returning returningwrtest; drop table returningwrtest; + +-- check insert into a shard table from a CTE table +create table t1(f1 int,f2 int) distribute by shard(f1); +create table t2(f1 int,f2 int) distribute by shard(f1); +insert into t1 values(1,1); +insert into t1 values(2,2); +with baseInfo as(select * from t1) +insert into t2 select * from baseInfo; +drop table t1; +drop table t2; + +-- Determine whether tables of different groups are allowed to insert. +set default_locator_type to shard; +drop table if exists t2; +drop table if exists t2_rep; +drop table if exists t2_new; +create table t2(f1 int,f2 int); +create table t2_rep(f1 int,f2 int) distribute by replication; +insert into t2_rep values(1,1),(2,2); +insert into t2 select * from t2_rep; +select count(*) from t2_rep; +select count(*) from t2; +create table t2_new as select * from t2_rep; +select count(*) from t2_new; +drop table t2; +drop table t2_rep; +drop table t2_new; +reset default_locator_type; \ No newline at end of file From 19ba10741cf8383b6ad18bfc3092c45d7254ebf7 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Thu, 25 Feb 2021 11:40:03 +0800 Subject: [PATCH 326/578] for gtm unix domain socket http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131085360117(merge request !178) --- src/backend/access/transam/gtm.c | 41 ++ src/backend/utils/misc/guc.c | 17 +- src/gtm/client/fe-connect.c | 21 +- src/gtm/client/ip.c | 131 +++++ src/gtm/gtm_ctl/gtm_ctl.c | 17 + src/gtm/libpq/ip.c | 132 +++++ src/gtm/libpq/pqcomm.c | 203 +++++++- src/gtm/main/gtm_opt.c | 736 +++++++++++++++------------- src/gtm/main/main.c | 39 +- src/gtm/proxy/proxy_main.c | 10 +- src/include/gtm/gtm_c.h | 2 +- src/include/gtm/gtm_opt.h | 253 +++++----- src/include/gtm/libpq.h | 24 +- src/include/gtm/pqcomm.h | 29 +- src/include/postmaster/postmaster.h | 29 +- 15 files changed, 1163 insertions(+), 521 deletions(-) diff --git a/src/backend/access/transam/gtm.c b/src/backend/access/transam/gtm.c index 5f4e8218..969311e5 100644 --- a/src/backend/access/transam/gtm.c +++ b/src/backend/access/transam/gtm.c @@ -54,6 +54,7 @@ char *NewGtmHost = NULL; int NewGtmPort = -1; bool g_GTM_skip_catalog = false; +char *gtm_unix_socket_directory = DEFAULT_PGSOCKET_DIR; #endif char *GtmHost = NULL; int GtmPort = 0; @@ -1191,6 +1192,7 @@ InitGTM(void) #ifdef __TBASE__ int try_cnt = 0; const int max_try_cnt = 1; + bool same_host = false; /* * Only re-set gtm info in two cases: @@ -1209,6 +1211,13 @@ InitGTM(void) errmsg("GtmHost and GtmPort are not set"))); return; } + +#ifdef HAVE_UNIX_SOCKETS + if (GtmHost && (strcmp(PGXCNodeHost, GtmHost) == 0) && gtm_unix_socket_file_exists()) + { + same_host = true; + } +#endif #endif try_connect_gtm: @@ -1222,11 +1231,24 @@ InitGTM(void) else if (IS_PGXC_DATANODE) remote_type = GTM_NODE_DATANODE; +#ifdef __TBASE__ + if (same_host) + { + /* Use 60s as connection timeout */ + snprintf(conn_str, CONNECT_STR_LEN, "host=%s port=%d node_name=%s remote_type=%d postmaster=1 connect_timeout=%d", + gtm_unix_socket_directory, GtmPort, 
PGXCNodeName, remote_type, + tcp_keepalives_idle > 0 ? + tcp_keepalives_idle : GtmConnectTimeout); + } + else +#endif + { /* Use 60s as connection timeout */ snprintf(conn_str, CONNECT_STR_LEN, "host=%s port=%d node_name=%s remote_type=%d postmaster=1 connect_timeout=%d", GtmHost, GtmPort, PGXCNodeName, remote_type, tcp_keepalives_idle > 0 ? tcp_keepalives_idle : GtmConnectTimeout); + } /* Log activity of GTM connections */ if(GTMDebugPrint) @@ -1234,11 +1256,24 @@ InitGTM(void) } else { +#ifdef __TBASE__ + if (same_host) + { + /* Use 60s as connection timeout */ + snprintf(conn_str, CONNECT_STR_LEN, "host=%s port=%d node_name=%s connect_timeout=%d", + gtm_unix_socket_directory, GtmPort, PGXCNodeName, + tcp_keepalives_idle > 0 ? + tcp_keepalives_idle : GtmConnectTimeout); + } + else +#endif + { /* Use 60s as connection timeout */ snprintf(conn_str, CONNECT_STR_LEN, "host=%s port=%d node_name=%s connect_timeout=%d", GtmHost, GtmPort, PGXCNodeName, tcp_keepalives_idle > 0 ? tcp_keepalives_idle : GtmConnectTimeout); + } /* Log activity of GTM connections */ if (IsAutoVacuumWorkerProcess() && GTMDebugPrint) @@ -1268,6 +1303,12 @@ InitGTM(void) } CloseGTM(); try_cnt++; + + /* if connect with unix domain socket failed */ + if (same_host) + { + same_host = false; + } goto try_connect_gtm; } else diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 488fd88f..a0db968c 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -5368,7 +5368,22 @@ static struct config_string ConfigureNamesString[] = #endif NULL, NULL, NULL }, - +#ifdef __TBASE__ + { + {"gtm_unix_socket_directory", PGC_POSTMASTER, CONN_AUTH_SETTINGS, + gettext_noop("Sets the directory where GTM Unix-domain sockets used."), + NULL, + GUC_SUPERUSER_ONLY + }, + >m_unix_socket_directory, +#ifdef HAVE_UNIX_SOCKETS + DEFAULT_PGSOCKET_DIR, +#else + "", +#endif + NULL, NULL, NULL + }, +#endif { {"listen_addresses", PGC_POSTMASTER, CONN_AUTH_SETTINGS, gettext_noop("Sets the host name or IP address(es) to listen to."), diff --git a/src/gtm/client/fe-connect.c b/src/gtm/client/fe-connect.c index 97e6e0c5..5f1f167f 100644 --- a/src/gtm/client/fe-connect.c +++ b/src/gtm/client/fe-connect.c @@ -260,7 +260,7 @@ static int connectGTMStart(GTM_Conn *conn) {// #lizard forgives int portnum = 0; - char portstr[128]; + char portstr[MAXGTMPATH]; struct addrinfo *addrs = NULL; struct addrinfo hint; const char *node; @@ -299,10 +299,27 @@ connectGTMStart(GTM_Conn *conn) /* Using pghost, so we have to look-up the hostname */ node = conn->pghost; hint.ai_family = AF_UNSPEC; +#ifdef __TBASE__ +#ifdef HAVE_UNIX_SOCKETS + if (is_absolute_path(conn->pghost)) + { + node = NULL; + hint.ai_family = AF_UNIX; + UNIXSOCK_PATH(portstr, portnum, conn->pghost); + if (strlen(portstr) >= UNIXSOCK_PATH_BUFLEN) + { + appendGTMPQExpBuffer(&conn->errorMessage, + libpq_gettext("Unix-domain socket path \"%s\" is too long (maximum %d bytes)\n"), + portstr, + (int) (UNIXSOCK_PATH_BUFLEN - 1)); + goto connect_errReturn; + } + } +#endif +#endif } else { - /* Without Unix sockets, default to localhost instead */ node = "localhost"; hint.ai_family = AF_UNSPEC; } diff --git a/src/gtm/client/ip.c b/src/gtm/client/ip.c index 73c6ec72..8327a204 100644 --- a/src/gtm/client/ip.c +++ b/src/gtm/client/ip.c @@ -46,6 +46,16 @@ static int range_sockaddr_AF_INET6(const struct sockaddr_in6 * addr, const struct sockaddr_in6 * netmask); #endif +#ifdef HAVE_UNIX_SOCKETS +static int getaddrinfo_unix(const char *path, + const struct addrinfo *hintsp, + 
struct addrinfo **result); + +static int getnameinfo_unix(const struct sockaddr_un *sa, int salen, + char *node, int nodelen, + char *service, int servicelen, + int flags); +#endif /* * gtm_getaddrinfo_all - get address info for Unix, IPv4 and IPv6 sockets @@ -59,6 +69,11 @@ gtm_getaddrinfo_all(const char *hostname, const char *servname, /* not all versions of getaddrinfo() zero *result on failure */ *result = NULL; +#ifdef HAVE_UNIX_SOCKETS + if (hintp->ai_family == AF_UNIX) + return getaddrinfo_unix(servname, hintp, result); +#endif + /* NULL has special meaning to getaddrinfo(). */ rc = getaddrinfo((!hostname || hostname[0] == '\0') ? NULL : hostname, servname, hintp, result); @@ -103,6 +118,14 @@ gtm_getnameinfo_all(const struct sockaddr_storage * addr, int salen, { int rc; +#ifdef HAVE_UNIX_SOCKETS + if (addr && addr->ss_family == AF_UNIX) + rc = getnameinfo_unix((const struct sockaddr_un *) addr, salen, + node, nodelen, + service, servicelen, + flags); + else +#endif rc = getnameinfo((const struct sockaddr *) addr, salen, node, nodelen, service, servicelen, @@ -322,3 +345,111 @@ gtm_promote_v4_to_v6_mask(struct sockaddr_storage * addr) } #endif /* HAVE_IPV6 */ + + +#ifdef HAVE_UNIX_SOCKETS + +/* ------- + * getaddrinfo_unix - get unix socket info using IPv6-compatible API + * + * Bugs: only one addrinfo is set even though hintsp is NULL or + * ai_socktype is 0 + * AI_CANONNAME is not supported. + * ------- + */ +static int +getaddrinfo_unix(const char *path, const struct addrinfo *hintsp, + struct addrinfo **result) +{ + struct addrinfo hints; + struct addrinfo *aip; + struct sockaddr_un *unp; + + *result = NULL; + + MemSet(&hints, 0, sizeof(hints)); + + if (strlen(path) >= sizeof(unp->sun_path)) + return EAI_FAIL; + + if (hintsp == NULL) + { + hints.ai_family = AF_UNIX; + hints.ai_socktype = SOCK_STREAM; + } + else + memcpy(&hints, hintsp, sizeof(hints)); + + if (hints.ai_socktype == 0) + hints.ai_socktype = SOCK_STREAM; + + if (hints.ai_family != AF_UNIX) + { + /* shouldn't have been called */ + return EAI_FAIL; + } + + aip = calloc(1, sizeof(struct addrinfo)); + if (aip == NULL) + return EAI_MEMORY; + + unp = calloc(1, sizeof(struct sockaddr_un)); + if (unp == NULL) + { + free(aip); + return EAI_MEMORY; + } + + aip->ai_family = AF_UNIX; + aip->ai_socktype = hints.ai_socktype; + aip->ai_protocol = hints.ai_protocol; + aip->ai_next = NULL; + aip->ai_canonname = NULL; + *result = aip; + + unp->sun_family = AF_UNIX; + aip->ai_addr = (struct sockaddr *) unp; + aip->ai_addrlen = sizeof(struct sockaddr_un); + + strcpy(unp->sun_path, path); + +#ifdef HAVE_STRUCT_SOCKADDR_STORAGE_SS_LEN + unp->sun_len = sizeof(struct sockaddr_un); +#endif + + return 0; +} + +/* + * Convert an address to a hostname. + */ +static int +getnameinfo_unix(const struct sockaddr_un *sa, int salen, + char *node, int nodelen, + char *service, int servicelen, + int flags) +{ + int ret = -1; + + /* Invalid arguments. 
*/ + if (sa == NULL || sa->sun_family != AF_UNIX || + (node == NULL && service == NULL)) + return EAI_FAIL; + + if (node) + { + ret = snprintf(node, nodelen, "%s", "[local]"); + if (ret == -1 || ret > nodelen) + return EAI_MEMORY; + } + + if (service) + { + ret = snprintf(service, servicelen, "%s", sa->sun_path); + if (ret == -1 || ret > servicelen) + return EAI_MEMORY; + } + + return 0; +} +#endif /* HAVE_UNIX_SOCKETS */ \ No newline at end of file diff --git a/src/gtm/gtm_ctl/gtm_ctl.c b/src/gtm/gtm_ctl/gtm_ctl.c index 3d1cd2f4..bf804b98 100644 --- a/src/gtm/gtm_ctl/gtm_ctl.c +++ b/src/gtm/gtm_ctl/gtm_ctl.c @@ -118,6 +118,8 @@ static char gtmopts_file[MAXPGPATH]; static char pid_file[MAXPGPATH]; static char conf_file[MAXPGPATH]; +void CreateLockFile(const char *filename, const char *refName); +void DeleteLockFile(const char *filename); /* * Write errors to stderr (or by gtm_equal means when stderr is * not available). @@ -1776,3 +1778,18 @@ pg_realloc(void *ptr, size_t size) write_stderr("out of memory\n"); return tmp; } + +/* + * for compile + */ +void +CreateLockFile(const char *filename, const char *refName) +{ + return; +} + +void +DeleteLockFile(const char *filename) +{ + return; +} \ No newline at end of file diff --git a/src/gtm/libpq/ip.c b/src/gtm/libpq/ip.c index 5cf8f041..6e94a5fc 100644 --- a/src/gtm/libpq/ip.c +++ b/src/gtm/libpq/ip.c @@ -46,6 +46,17 @@ static int range_sockaddr_AF_INET6(const struct sockaddr_in6 * addr, const struct sockaddr_in6 * netmask); #endif +#ifdef HAVE_UNIX_SOCKETS +static int getaddrinfo_unix(const char *path, + const struct addrinfo *hintsp, + struct addrinfo **result); + +static int getnameinfo_unix(const struct sockaddr_un *sa, int salen, + char *node, int nodelen, + char *service, int servicelen, + int flags); +#endif + /* * pg_getaddrinfo_all - get address info for Unix, IPv4 and IPv6 sockets @@ -59,6 +70,11 @@ pg_getaddrinfo_all(const char *hostname, const char *servname, /* not all versions of getaddrinfo() zero *result on failure */ *result = NULL; +#ifdef HAVE_UNIX_SOCKETS + if (hintp->ai_family == AF_UNIX) + return getaddrinfo_unix(servname, hintp, result); +#endif + /* NULL has special meaning to getaddrinfo(). */ rc = getaddrinfo((!hostname || hostname[0] == '\0') ? NULL : hostname, servname, hintp, result); @@ -103,6 +119,14 @@ pg_getnameinfo_all(const struct sockaddr_storage * addr, int salen, { int rc; +#ifdef HAVE_UNIX_SOCKETS + if (addr && addr->ss_family == AF_UNIX) + rc = getnameinfo_unix((const struct sockaddr_un *) addr, salen, + node, nodelen, + service, servicelen, + flags); + else +#endif rc = getnameinfo((const struct sockaddr *) addr, salen, node, nodelen, service, servicelen, @@ -322,3 +346,111 @@ pg_promote_v4_to_v6_mask(struct sockaddr_storage * addr) } #endif /* HAVE_IPV6 */ + + +#ifdef HAVE_UNIX_SOCKETS + +/* ------- + * getaddrinfo_unix - get unix socket info using IPv6-compatible API + * + * Bugs: only one addrinfo is set even though hintsp is NULL or + * ai_socktype is 0 + * AI_CANONNAME is not supported. 
+ * ------- + */ +static int +getaddrinfo_unix(const char *path, const struct addrinfo *hintsp, + struct addrinfo **result) +{ + struct addrinfo hints; + struct addrinfo *aip; + struct sockaddr_un *unp; + + *result = NULL; + + MemSet(&hints, 0, sizeof(hints)); + + if (strlen(path) >= sizeof(unp->sun_path)) + return EAI_FAIL; + + if (hintsp == NULL) + { + hints.ai_family = AF_UNIX; + hints.ai_socktype = SOCK_STREAM; + } + else + memcpy(&hints, hintsp, sizeof(hints)); + + if (hints.ai_socktype == 0) + hints.ai_socktype = SOCK_STREAM; + + if (hints.ai_family != AF_UNIX) + { + /* shouldn't have been called */ + return EAI_FAIL; + } + + aip = calloc(1, sizeof(struct addrinfo)); + if (aip == NULL) + return EAI_MEMORY; + + unp = calloc(1, sizeof(struct sockaddr_un)); + if (unp == NULL) + { + free(aip); + return EAI_MEMORY; + } + + aip->ai_family = AF_UNIX; + aip->ai_socktype = hints.ai_socktype; + aip->ai_protocol = hints.ai_protocol; + aip->ai_next = NULL; + aip->ai_canonname = NULL; + *result = aip; + + unp->sun_family = AF_UNIX; + aip->ai_addr = (struct sockaddr *) unp; + aip->ai_addrlen = sizeof(struct sockaddr_un); + + strcpy(unp->sun_path, path); + +#ifdef HAVE_STRUCT_SOCKADDR_STORAGE_SS_LEN + unp->sun_len = sizeof(struct sockaddr_un); +#endif + + return 0; +} + +/* + * Convert an address to a hostname. + */ +static int +getnameinfo_unix(const struct sockaddr_un *sa, int salen, + char *node, int nodelen, + char *service, int servicelen, + int flags) +{ + int ret = -1; + + /* Invalid arguments. */ + if (sa == NULL || sa->sun_family != AF_UNIX || + (node == NULL && service == NULL)) + return EAI_FAIL; + + if (node) + { + ret = snprintf(node, nodelen, "%s", "[local]"); + if (ret == -1 || ret > nodelen) + return EAI_MEMORY; + } + + if (service) + { + ret = snprintf(service, servicelen, "%s", sa->sun_path); + if (ret == -1 || ret > servicelen) + return EAI_MEMORY; + } + + return 0; +} +#endif /* HAVE_UNIX_SOCKETS */ diff --git a/src/gtm/libpq/pqcomm.c b/src/gtm/libpq/pqcomm.c index 3ddb5c91..90a8c93f 100644 --- a/src/gtm/libpq/pqcomm.c +++ b/src/gtm/libpq/pqcomm.c @@ -92,7 +92,6 @@ #include "gtm/libpq-be.h" #include "gtm/elog.h" -#define MAXGTMPATH 256 /* Where the Unix socket file is */ static char sock_path[MAXGTMPATH]; @@ -110,6 +109,20 @@ extern int tcp_keepalives_count; static int internal_putbytes(Port *myport, const char *s, size_t len); static int internal_flush(Port *myport); +#ifdef HAVE_UNIX_SOCKETS +static int Lock_AF_UNIX(char *unixSocketDir, char *unixSocketPath); +static int Setup_AF_UNIX(char *sock_path); +#endif /* HAVE_UNIX_SOCKETS */ + +extern void CreateLockFile(const char *filename, const char *refName); +extern void DeleteLockFile(const char *filename); +extern void RemoveSocketFile(void); +/* + * Configuration options + */ +int unix_socket_permissions = 0777; +char *unix_socket_group = ""; + /* * Streams -- wrapper around Unix socket system calls * @@ -126,9 +139,8 @@ static int internal_flush(Port *myport); * * RETURNS: STATUS_OK or STATUS_ERROR */ - int -StreamServerPort(int family, char *hostName, unsigned short portNumber, +StreamServerPort(int family, char *hostName, unsigned short portNumber, char *unixSocketDir, int ListenSocket[], int MaxListen) {// #lizard forgives int fd, @@ -143,6 +155,8 @@ StreamServerPort(int family, char *hostName, unsigned short portNumber, struct addrinfo hint; int listen_index = 0; int added = 0; + const char *addrDesc; + char addrBuf[NI_MAXHOST]; #if !defined(WIN32) || defined(IPV6_V6ONLY) int one = 1; @@ -154,6 +168,28 @@ 
StreamServerPort(int family, char *hostName, unsigned short portNumber, hint.ai_flags = AI_PASSIVE; hint.ai_socktype = SOCK_STREAM; +#ifdef HAVE_UNIX_SOCKETS + if (family == AF_UNIX) + { + /* + * Create unixSocketPath from portNumber and unixSocketDir and lock + * that file path + */ + UNIXSOCK_PATH(sock_path, portNumber, unixSocketDir); + if (strlen(sock_path) >= UNIXSOCK_PATH_BUFLEN) + { + ereport(LOG, + (errmsg("Unix-domain socket path \"%s\" is too long (maximum %d bytes)", + sock_path, + (int) (UNIXSOCK_PATH_BUFLEN - 1)))); + return STATUS_ERROR; + } + if (Lock_AF_UNIX(unixSocketDir, sock_path) != STATUS_OK) + return STATUS_ERROR; + service = sock_path; + } + else +#endif /* HAVE_UNIX_SOCKETS */ { snprintf(portNumberStr, sizeof(portNumberStr), "%d", portNumber); service = portNumberStr; @@ -210,6 +246,11 @@ StreamServerPort(int family, char *hostName, unsigned short portNumber, case AF_INET6: familyDesc = "IPv6"; break; +#endif +#ifdef HAVE_UNIX_SOCKETS + case AF_UNIX: + familyDesc = "Unix"; + break; #endif default: snprintf(familyDescBuf, sizeof(familyDescBuf), @@ -219,7 +260,22 @@ StreamServerPort(int family, char *hostName, unsigned short portNumber, break; } - if ((fd = socket(addr->ai_family, SOCK_STREAM, 0)) < 0) + /* set up text form of address for log messages */ +#ifdef HAVE_UNIX_SOCKETS + if (addr->ai_family == AF_UNIX) + addrDesc = sock_path; + else +#endif + { + pg_getnameinfo_all((const struct sockaddr_storage *) addr->ai_addr, + addr->ai_addrlen, + addrBuf, sizeof(addrBuf), + NULL, 0, + NI_NUMERICHOST); + addrDesc = addrBuf; + } + + if ((fd = socket(addr->ai_family, SOCK_STREAM, 0)) < 0) { ereport(LOG, (EACCES, @@ -296,6 +352,16 @@ StreamServerPort(int family, char *hostName, unsigned short portNumber, continue; } +#ifdef HAVE_UNIX_SOCKETS + if (addr->ai_family == AF_UNIX) + { + if (Setup_AF_UNIX(service) != STATUS_OK) + { + close(fd); + break; + } + } +#endif #define GTM_MAX_CONNECTIONS 4096 /* @@ -314,6 +380,19 @@ StreamServerPort(int family, char *hostName, unsigned short portNumber, close(fd); continue; } + +#ifdef HAVE_UNIX_SOCKETS + if (addr->ai_family == AF_UNIX) + ereport(LOG, + (errmsg("listening on Unix socket \"%s\"", + addrDesc))); + else +#endif + ereport(LOG, + /* translator: first %s is IPv4 or IPv6 */ + (errmsg("listening on %s address \"%s\", port %d", + familyDesc, addrDesc, (int) portNumber))); + ListenSocket[listen_index] = fd; added++; } @@ -327,6 +406,122 @@ StreamServerPort(int family, char *hostName, unsigned short portNumber, } +/* + * Create a lockfile for the specified Unix socket file. + */ +static void +CreateSocketLockFile(const char *socketfile, const char *socketDir) +{ + char lockfile[MAXPGPATH]; + + snprintf(lockfile, sizeof(lockfile), "%s.lock", socketfile); + CreateLockFile(lockfile, socketDir); +} + +/* + * Remove a lockfile and Unix socket file. + */ +void +RemoveSocketFile(void) +{ + char lockfile[MAXPGPATH]; + + snprintf(lockfile, sizeof(lockfile), "%s.lock", sock_path); + DeleteLockFile(lockfile); + + (void) unlink(sock_path); +} + +#ifdef HAVE_UNIX_SOCKETS + +/* + * Lock_AF_UNIX -- configure unix socket file path + */ +static int +Lock_AF_UNIX(char *unixSocketDir, char *unixSocketPath) +{ + /* + * Grab an interlock file associated with the socket file. + * + * Note: there are two reasons for using a socket lock file, rather than + * trying to interlock directly on the socket itself. First, it's a lot + * more portable, and second, it lets us remove any pre-existing socket + * file without race conditions. 
+ */ + CreateSocketLockFile(unixSocketPath, unixSocketDir); + + /* + * Once we have the interlock, we can safely delete any pre-existing + * socket file to avoid failure at bind() time. + */ + (void) unlink(unixSocketPath); + + return STATUS_OK; +} + + +/* + * Setup_AF_UNIX -- configure unix socket permissions + */ +static int +Setup_AF_UNIX(char *sock_path) +{ + /* + * Fix socket ownership/permission if requested. Note we must do this + * before we listen() to avoid a window where unwanted connections could + * get accepted. + */ + Assert(unix_socket_group); + if (unix_socket_group[0] != '\0') + { +#ifdef WIN32 + elog(WARNING, "configuration item unix_socket_group is not supported on this platform"); +#else + char *endptr; + unsigned long val; + gid_t gid; + + val = strtoul(unix_socket_group, &endptr, 10); + if (*endptr == '\0') + { /* numeric group id */ + gid = val; + } + else + { /* convert group name to id */ + struct group *gr; + + gr = getgrnam(unix_socket_group); + if (!gr) + { + ereport(LOG, + (errmsg("group \"%s\" does not exist", + unix_socket_group))); + return STATUS_ERROR; + } + gid = gr->gr_gid; + } + if (chown(sock_path, -1, gid) == -1) + { + ereport(LOG, + (errmsg("could not set group of file \"%s\": %m", + sock_path))); + return STATUS_ERROR; + } +#endif + } + + if (chmod(sock_path, unix_socket_permissions) == -1) + { + ereport(LOG, + (errmsg("could not set permissions of file \"%s\": %m", + sock_path))); + return STATUS_ERROR; + } + return STATUS_OK; +} +#endif /* HAVE_UNIX_SOCKETS */ + + /* * StreamConnection -- create a new connection with client using * server port. Set port->sock to the FD of the new connection. diff --git a/src/gtm/main/gtm_opt.c b/src/gtm/main/gtm_opt.c index f8c1c2b9..f1a9418d 100644 --- a/src/gtm/main/gtm_opt.c +++ b/src/gtm/main/gtm_opt.c @@ -12,7 +12,7 @@ * Written by Peter Eisentraut . * * IDENTIFICATION - * src/backend/utils/misc/guc.c + * src/backend/utils/misc/guc.c * *-------------------------------------------------------------------- */ @@ -55,10 +55,10 @@ extern int tcp_keepalives_idle; extern int tcp_keepalives_count; extern int tcp_keepalives_interval; extern char *GTMDataDir; -extern int scale_factor_threads; -extern int worker_thread_number; +extern int scale_factor_threads; +extern int worker_thread_number; #ifdef __TBASE__ -extern bool enable_gtm_sequence_debug; +extern bool enable_gtm_sequence_debug; extern int wal_writer_delay; extern int checkpoint_interval; extern char *archive_command; @@ -77,7 +77,9 @@ extern int GTMStartupGTSDelta; extern int GTMGTSFreezeLimit; #endif - +extern char* unix_socket_directory; +extern char* unix_socket_group; +extern int unix_socket_permissions; /* * We have different sets for client and server message level options because @@ -90,8 +92,8 @@ Gtm_Startup_Mode_Options(); /* * GTM option variables that are exported from this module */ -char *data_directory; -char *GTMConfigFileName; +char *data_directory; +char *GTMConfigFileName; /* * Displayable names for context types (enum GtmContext) @@ -117,23 +119,23 @@ Config_Type_Names(); * TO ADD AN OPTION: * * 1. Declare a global variable of type bool, int, double, or char* - * and make use of it. + * and make use of it. * * 2. Decide at what times it's safe to set the option. See guc.h for - * details. + * details. * * 3. Decide on a name, a default value, upper and lower bounds (if - * applicable), etc. + * applicable), etc. * * 4. Add a record below. * * 5. Add it to src/backend/utils/misc/postgresql.conf.sample, if - * appropriate. + * appropriate. 
* * 6. Don't forget to document the option (at least in config.sgml). * * 7. If it's a new GTMOPT_LIST option you must edit pg_dumpall.c to ensure - * it is not single quoted at dump time. + * it is not single quoted at dump time. */ /* @@ -146,205 +148,205 @@ Config_Type_Names(); struct config_bool ConfigureNamesBool[] = { - { - {GTM_OPTNAME_SYNCHRONOUS_BACKUP, GTMC_STARTUP, - gettext_noop("Specifies if backup to GTM-Standby is taken in synchronous manner."), - gettext_noop("Default value is off."), - 0 - }, - &Backup_synchronously, + { + {GTM_OPTNAME_SYNCHRONOUS_BACKUP, GTMC_STARTUP, + gettext_noop("Specifies if backup to GTM-Standby is taken in synchronous manner."), + gettext_noop("Default value is off."), + 0 + }, + &Backup_synchronously, false, NULL, NULL, false, NULL - }, -#ifdef __TBASE__ - { - {GTM_OPTNAME_SYNCHRONOUS_COMMIT, GTMC_SIGHUP, - gettext_noop("enable GTM synchronous commit."), - gettext_noop("Standby must be connected when set."), - 0 - }, - &enable_sync_commit, + }, +#ifdef __TBASE__ + { + {GTM_OPTNAME_SYNCHRONOUS_COMMIT, GTMC_SIGHUP, + gettext_noop("enable GTM synchronous commit."), + gettext_noop("Standby must be connected when set."), + 0 + }, + &enable_sync_commit, false, NULL, NULL, false, NULL - }, - { + }, + { {GTM_OPTNAME_ENABLE_DEBUG, GTMC_STARTUP, - gettext_noop("enable GTM debug print."), - gettext_noop("Default value is off."), - 0 - }, - &enable_gtm_debug, + gettext_noop("enable GTM debug print."), + gettext_noop("Default value is off."), + 0 + }, + &enable_gtm_debug, false, NULL, NULL, false, NULL - }, + }, #ifdef __XLOG__ { {GTM_OPTNAME_ARCHIVE_MODE, GTMC_STARTUP, - gettext_noop("enable archive."), - gettext_noop("Default value is off."), - 0 - }, - &archive_mode, + gettext_noop("enable archive."), + gettext_noop("Default value is off."), + 0 + }, + &archive_mode, false, NULL, NULL, false, NULL - }, - { + }, + { {GTM_OPTNAME_ENABLE_XLOG_DEBUG, GTMC_STARTUP, - gettext_noop("enable GTM xlog debug print."), - gettext_noop("Default value is off."), - 0 - }, - &enalbe_gtm_xlog_debug, + gettext_noop("enable GTM xlog debug print."), + gettext_noop("Default value is off."), + 0 + }, + &enalbe_gtm_xlog_debug, false, NULL, NULL, false, NULL - }, + }, #endif - /* Set it as a GUC only if we are running regression. */ - { + /* Set it as a GUC only if we are running regression. 
*/ + { {GTM_OPTNAME_ENABLE_SEQ_DEBUG, GTMC_STARTUP, - gettext_noop("enable GTM sequence debug."), - gettext_noop("Default value is off."), - 0 - }, - &enable_gtm_sequence_debug, + gettext_noop("enable GTM sequence debug."), + gettext_noop("Default value is off."), + 0 + }, + &enable_gtm_sequence_debug, #ifdef _PG_REGRESS_ true, NULL, NULL, false, NULL #else false, NULL, NULL, false, NULL #endif - }, + }, #endif - { + { {GTM_OPTNAME_CLUSTER_READ_ONLY, GTMC_STARTUP, - gettext_noop("Nodes connected with gtm will be readonly."), - gettext_noop("Default value is off."), - 0 - }, - >MClusterReadOnly, + gettext_noop("Nodes connected with gtm will be readonly."), + gettext_noop("Default value is off."), + 0 + }, + >MClusterReadOnly, false, NULL, NULL, false, NULL - }, + }, - /* End-of-list marker */ - { + /* End-of-list marker */ + { {NULL, 0, NULL, NULL, 0}, NULL, false, NULL, NULL, false, NULL - } + } }; struct config_int ConfigureNamesInt[] = { - { - {GTM_OPTNAME_PORT, GTMC_STARTUP, - gettext_noop("Listen Port of GTM or GTM standby server."), - NULL, - 0 - }, - >MPortNumber, + { + {GTM_OPTNAME_PORT, GTMC_STARTUP, + gettext_noop("Listen Port of GTM or GTM standby server."), + NULL, + 0 + }, + >MPortNumber, 0, 0, INT_MAX, NULL, NULL, - 0, NULL - }, - { + 0, NULL + }, + { {GTM_OPTNAME_ACTIVE_PORT, GTMC_STARTUP, - gettext_noop("GTM server port number when it works as GTM-Standby."), - NULL, - 0 - }, - &active_port, + gettext_noop("GTM server port number when it works as GTM-Standby."), + NULL, + 0 + }, + &active_port, 0, 0, INT_MAX, NULL, NULL, - 0, NULL - }, - { - {GTM_OPTNAME_KEEPALIVES_IDLE, GTMC_STARTUP, - gettext_noop("Sets \"keepalives_idle\" option for the connection to GTM."), - gettext_noop("This option is effective only when it runs as GTM-Standby."), - GTMOPT_UNIT_TIME - }, - &tcp_keepalives_idle, + 0, NULL + }, + { + {GTM_OPTNAME_KEEPALIVES_IDLE, GTMC_STARTUP, + gettext_noop("Sets \"keepalives_idle\" option for the connection to GTM."), + gettext_noop("This option is effective only when it runs as GTM-Standby."), + GTMOPT_UNIT_TIME + }, + &tcp_keepalives_idle, 0, 0, INT_MAX, NULL, NULL, - 0, NULL - }, - { - {GTM_OPTNAME_KEEPALIVES_INTERVAL, GTMC_STARTUP, - gettext_noop("Sets \"keepalives_interval\" option fo the connetion to GTM."), - gettext_noop("This option is effective only when it runs as GTM-Standby."), - GTMOPT_UNIT_TIME - }, - &tcp_keepalives_interval, + 0, NULL + }, + { + {GTM_OPTNAME_KEEPALIVES_INTERVAL, GTMC_STARTUP, + gettext_noop("Sets \"keepalives_interval\" option fo the connetion to GTM."), + gettext_noop("This option is effective only when it runs as GTM-Standby."), + GTMOPT_UNIT_TIME + }, + &tcp_keepalives_interval, 0, 0, INT_MAX, NULL, NULL, - 0, NULL - }, - { - {GTM_OPTNAME_KEEPALIVES_COUNT, GTMC_STARTUP, - gettext_noop("Sets \"keepalives_count\" option to the connection to GTM."), - gettext_noop("This option is effective only when it runs as GTM-Standby."), - 0 - }, - &tcp_keepalives_count, + 0, NULL + }, + { + {GTM_OPTNAME_KEEPALIVES_COUNT, GTMC_STARTUP, + gettext_noop("Sets \"keepalives_count\" option to the connection to GTM."), + gettext_noop("This option is effective only when it runs as GTM-Standby."), + 0 + }, + &tcp_keepalives_count, 0, 0, INT_MAX, NULL, NULL, - 0, NULL - }, - { - { - GTM_OPTNAME_SCALE_FACTOR_THREADS, GTMC_STARTUP, - gettext_noop("The scale factor of the number of worker thread, zero means disabled."), - NULL, - 0 - }, - &scale_factor_threads, + 0, NULL + }, + { + { + GTM_OPTNAME_SCALE_FACTOR_THREADS, GTMC_STARTUP, + gettext_noop("The scale 
factor of the number of worker thread, zero means disabled."), + NULL, + 0 + }, + &scale_factor_threads, 1, 0, INT_MAX, NULL, NULL, - 0, NULL - }, - { - { - GTM_OPTNAME_WORKER_THREADS_NUMBER, GTMC_STARTUP, - gettext_noop("The number of worker thread, zero means disabled."), - NULL, - 0 - }, - &worker_thread_number, + 0, NULL + }, + { + { + GTM_OPTNAME_WORKER_THREADS_NUMBER, GTMC_STARTUP, + gettext_noop("The number of worker thread, zero means disabled."), + NULL, + 0 + }, + &worker_thread_number, 2, 0, INT_MAX, NULL, NULL, - 0, NULL - }, + 0, NULL + }, #ifdef __XLOG__ - { - { + { + { GTM_OPTNAME_WAL_WRITER_DELAY, GTMC_STARTUP, - gettext_noop("Wal_writer will flush xlog every wal_writer_delay ms."), - NULL, - 0 - }, - &wal_writer_delay, + gettext_noop("Wal_writer will flush xlog every wal_writer_delay ms."), + NULL, + 0 + }, + &wal_writer_delay, 100, 10, INT_MAX, NULL, NULL, - 0, NULL - }, - { - { + 0, NULL + }, + { + { GTM_OPTNAME_CHECKPOINT_INTERVAL, GTMC_STARTUP, - gettext_noop("Checkpointer will do checkpoint every checkpoint_interval minute."), - NULL, - 0 - }, - &checkpoint_interval, + gettext_noop("Checkpointer will do checkpoint every checkpoint_interval minute."), + NULL, + 0 + }, + &checkpoint_interval, 30, 1, INT_MAX, NULL, NULL, - 0, NULL - }, - { - { - GTM_OPTNAME_MAX_RESERVED_WAL_NUMBER, GTMC_STARTUP, - gettext_noop("Max number of reserved xlog segments."), - NULL, - 0 - }, - &max_reserved_wal_number, + 0, NULL + }, + { + { + GTM_OPTNAME_MAX_RESERVED_WAL_NUMBER, GTMC_STARTUP, + gettext_noop("Max number of reserved xlog segments."), + NULL, + 0 + }, + &max_reserved_wal_number, 0, 0, INT_MAX, NULL, NULL, - 0, NULL - }, - { - { - GTM_OPTNAME_MAX_WAL_SENDER, GTMC_STARTUP, - gettext_noop("Max number of wal senders."), - NULL, - 0 - }, - &max_wal_sender, + 0, NULL + }, + { + { + GTM_OPTNAME_MAX_WAL_SENDER, GTMC_STARTUP, + gettext_noop("Max number of wal senders."), + NULL, + 0 + }, + &max_wal_sender, 3, 0, 100, NULL, NULL, - 0, NULL - }, + 0, NULL + }, { { GTM_OPTNAME_MAX_WAL_SENDER, GTMC_STARTUP, @@ -357,204 +359,222 @@ struct config_int ConfigureNamesInt[] = 0, NULL }, #endif - { - { - GTM_OPTNAME_GTS_FREEZE_TIME_LIMIT, GTMC_STARTUP, - gettext_noop("refuse to start gtm before GTS has n days left,default 100 years"), - NULL, - 0 - }, - >MGTSFreezeLimit, + { + { + GTM_OPTNAME_GTS_FREEZE_TIME_LIMIT, GTMC_STARTUP, + gettext_noop("refuse to start gtm before GTS has n days left,default 100 years"), + NULL, + 0 + }, + >MGTSFreezeLimit, 365 * 100, 0, INT_MAX, NULL, NULL, - 0, NULL - }, + 0, NULL + }, + { + { + GTM_OPTNAME_STARTUP_GTS_DELTA, GTMC_STARTUP, + gettext_noop("Add -d seconds to GTS when started"), + NULL, + 0 + }, + >MStartupGTSDelta, + 300 , 0, INT_MAX, NULL, NULL, + 0, NULL + }, + { { - GTM_OPTNAME_STARTUP_GTS_DELTA, GTMC_STARTUP, - gettext_noop("Add -d seconds to GTS when started"), + GTM_OPTNAME_UNIX_SOCKET_PERMISSIONS, GTMC_STARTUP, + gettext_noop("Sets the access permissions of the Unix-domain socket." + "Unix-domain sockets use the usual Unix file system " + "permission set. The parameter value is expected " + "to be a numeric mode specification in the form " + "accepted by the chmod and umask system calls. 
" + "(To use the customary octal format the number must " + "start with a 0 (zero).)"), NULL, 0 }, - >MStartupGTSDelta, - 300 , 0, INT_MAX, NULL, NULL, + &unix_socket_permissions, + 0777, 0000, 0777, NULL, NULL, 0, NULL }, - /* End-of-list marker */ - { + /* End-of-list marker */ + { {NULL, 0, NULL, NULL, 0}, NULL, 0, 0, 0, NULL, NULL, 0, NULL - } + } }; struct config_real ConfigureNamesReal[] = { - /* End-of-list marker */ - { + /* End-of-list marker */ + { {NULL, 0, NULL, NULL, 0}, NULL, 0.0, 0.0, 0.0, NULL, NULL, 0.0, NULL - } + } }; struct config_string ConfigureNamesString[] = { - { - {GTM_OPTNAME_DATA_DIR, GTMC_STARTUP, - gettext_noop("Work directory."), - NULL, - 0 - }, - >MDataDir, - NULL, + { + {GTM_OPTNAME_DATA_DIR, GTMC_STARTUP, + gettext_noop("Work directory."), + NULL, + 0 + }, + >MDataDir, + NULL, NULL, NULL, - NULL, - NULL - }, + NULL, + NULL + }, - { + { {GTM_OPTNAME_CONFIG_FILE, GTMC_STARTUP, - gettext_noop("Configuration file name."), - NULL, - 0 - }, - >MConfigFileName, - CONFIG_FILENAME, + gettext_noop("Configuration file name."), + NULL, + 0 + }, + >MConfigFileName, + CONFIG_FILENAME, NULL, NULL, - NULL, - NULL - }, + NULL, + NULL + }, - { - {GTM_OPTNAME_NODENAME, GTMC_STARTUP, - gettext_noop("Name of this GTM/GTM-Standby."), - NULL, - 0 - }, - &NodeName, - "gtm", + { + {GTM_OPTNAME_NODENAME, GTMC_STARTUP, + gettext_noop("Name of this GTM/GTM-Standby."), + NULL, + 0 + }, + &NodeName, + "gtm", NULL, NULL, - NULL, - NULL - }, + NULL, + NULL + }, - { - {GTM_OPTNAME_LISTEN_ADDRESSES, GTMC_STARTUP, - gettext_noop("Listen address."), - NULL, - 0 - }, - &ListenAddresses, - "*", + { + {GTM_OPTNAME_LISTEN_ADDRESSES, GTMC_STARTUP, + gettext_noop("Listen address."), + NULL, + 0 + }, + &ListenAddresses, + "*", NULL, NULL, - NULL, NULL - }, + NULL, NULL + }, - { + { {GTM_OPTNAME_ACTIVE_HOST, GTMC_STARTUP, - gettext_noop("Address of target GTM ACT."), - gettext_noop("This parameter is effective only when it runs as GTM-Standby"), - 0 - }, - &active_addr, - NULL, + gettext_noop("Address of target GTM ACT."), + gettext_noop("This parameter is effective only when it runs as GTM-Standby"), + 0 + }, + &active_addr, + NULL, NULL, NULL, - NULL, NULL - }, + NULL, NULL + }, - { + { {GTM_OPTNAME_LOG_FILE, GTMC_STARTUP, - gettext_noop("Log file name."), - NULL, - 0 - }, - >MLogFile, - "gtm.log", + gettext_noop("Log file name."), + NULL, + 0 + }, + >MLogFile, + "gtm.log", NULL, NULL, - NULL, NULL - }, + NULL, NULL + }, - { + { {GTM_OPTNAME_ERROR_REPORTER, GTMC_STARTUP, - gettext_noop("Command to report various errors."), - NULL, - 0 - }, - &error_reporter, - NULL, + gettext_noop("Command to report various errors."), + NULL, + 0 + }, + &error_reporter, + NULL, NULL, NULL, - NULL, NULL - }, + NULL, NULL + }, - { + { {GTM_OPTNAME_STATUS_READER, GTMC_STARTUP, - gettext_noop("Command to get status of global XC node status."), - gettext_noop("Runs when configuration file is read by SIGHUP"), - 0 - }, - &status_reader, - NULL, + gettext_noop("Command to get status of global XC node status."), + gettext_noop("Runs when configuration file is read by SIGHUP"), + 0 + }, + &status_reader, + NULL, NULL, NULL, - NULL, NULL - }, + NULL, NULL + }, #ifdef __XLOG__ { {GTM_OPTNAME_ARCHIVE_COMMAND, GTMC_STARTUP, - gettext_noop("Archive use this command to backup xlog."), - NULL, - 0 - }, - &archive_command, - NULL, + gettext_noop("Archive use this command to backup xlog."), + NULL, + 0 + }, + &archive_command, + NULL, NULL, NULL, - NULL, NULL - }, + NULL, NULL + }, { - {GTM_OPTNAME_SYNCHRONOUS_STANDBY_NAMES, 
GTMC_SIGHUP, - gettext_noop("to indicate which are synchronous slaves."), - NULL, - 0 - }, - &synchronous_standby_names, + {GTM_OPTNAME_SYNCHRONOUS_STANDBY_NAMES, GTMC_SIGHUP, + gettext_noop("to indicate which are synchronous slaves."), + NULL, + 0 + }, + &synchronous_standby_names, "", NULL, NULL, - NULL, NULL - }, - - { - {GTM_OPTNAME_APPLICATION_NAME, GTMC_STARTUP, - gettext_noop("application name used in sync replication indication"), - NULL, - 0 - }, - &application_name, - "", + NULL, NULL + }, + + { + {GTM_OPTNAME_APPLICATION_NAME, GTMC_STARTUP, + gettext_noop("application name used in sync replication indication"), + NULL, + 0 + }, + &application_name, + "", NULL, NULL, - NULL, NULL - }, - { - {GTM_OPTNAME_RECOVERY_COMMAND, GTMC_STARTUP, - gettext_noop("Point in time recovery,recovery command"), - NULL, - 0 - }, - &recovery_command, - NULL, + NULL, NULL + }, + { + {GTM_OPTNAME_RECOVERY_COMMAND, GTMC_STARTUP, + gettext_noop("Point in time recovery,recovery command"), + NULL, + 0 + }, + &recovery_command, + NULL, NULL, NULL, - NULL, NULL - }, - { - {GTM_OPTNAME_RECOVERY_TARGET_GLOBALTIMESTAMP, GTMC_STARTUP, - gettext_noop("Point in time recovery,recovery timestamp"), - NULL, - 0 - }, - &recovery_target_timestamp, - NULL, + NULL, NULL + }, + { + {GTM_OPTNAME_RECOVERY_TARGET_GLOBALTIMESTAMP, GTMC_STARTUP, + gettext_noop("Point in time recovery,recovery timestamp"), + NULL, + 0 + }, + &recovery_target_timestamp, + NULL, NULL, NULL, - NULL, NULL - }, + NULL, NULL + }, #endif - { + { {GTM_OPTNAME_STARTUP_GTS_SET, GTMC_STARTUP, gettext_noop("Force start GTM with this GTS"), NULL, @@ -564,47 +584,75 @@ struct config_string ConfigureNamesString[] = NULL, NULL, NULL, NULL, NULL + }, + + { + {GTM_OPTNAME_UNIX_SOCKET_DIRECTORY, GTMC_STARTUP, + gettext_noop("Sets the directory where Unix-domain sockets will be created."), + NULL, + 0 + }, + &unix_socket_directory, +#ifdef HAVE_UNIX_SOCKETS + DEFAULT_PGSOCKET_DIR, +#else + "", +#endif + NULL, NULL, + NULL, NULL }, - /* End-of-list marker */ { + {GTM_OPTNAME_UNIX_SOCKET_GROUP, GTMC_STARTUP, + gettext_noop("Sets the owning group of the Unix-domain socket."), + NULL, + 0 + }, + &unix_socket_group, + "", + NULL, NULL, + NULL, NULL + }, + + /* End-of-list marker */ + { {NULL, 0, NULL, NULL}, NULL, NULL, NULL, NULL, NULL, NULL - } + } }; struct config_enum ConfigureNamesEnum[] = { - { + { {GTM_OPTNAME_LOG_MIN_MESSAGES, GTMC_STARTUP, - gettext_noop("Minimum message level to write to the log file."), - NULL, - 0 - }, - &log_min_messages, - WARNING, - server_message_level_options, + gettext_noop("Minimum message level to write to the log file."), + NULL, + 0 + }, + &log_min_messages, + WARNING, + server_message_level_options, NULL,NULL, - WARNING, NULL - }, + WARNING, NULL + }, - { + { {GTM_OPTNAME_STARTUP, GTMC_STARTUP, - gettext_noop("Specifies startup mode, act or standby."), - NULL, - 0 - }, - >M_StandbyMode, - GTM_ACT_MODE, - gtm_startup_mode_options, + gettext_noop("Specifies startup mode, act or standby."), + NULL, + 0 + }, + >M_StandbyMode, + GTM_ACT_MODE, + gtm_startup_mode_options, NULL,NULL, - GTM_ACT_MODE, NULL - }, + GTM_ACT_MODE, NULL + }, - /* End-of-list marker */ - { + /* End-of-list marker */ + { {NULL, 0, NULL, NULL, 0}, NULL, 0, NULL,NULL,NULL, 0, NULL - } + } }; /******** end of options list ********/ @@ -615,12 +663,12 @@ struct config_enum ConfigureNamesEnum[] = struct config_generic **gtm_opt_variables; /* Current number of variables contained in the vector */ -int num_gtm_opt_variables; +int num_gtm_opt_variables; /* Vector 
capacity */ -int size_gtm_opt_variables; +int size_gtm_opt_variables; -bool reporting_enabled; /* TRUE to enable GTMOPT_REPORT */ +bool reporting_enabled; /* TRUE to enable GTMOPT_REPORT */ -int GTMOptUpdateCount = 0; /* Indicates when specific option is updated */ +int GTMOptUpdateCount = 0; /* Indicates when specific option is updated */ diff --git a/src/gtm/main/main.c b/src/gtm/main/main.c index ded1a044..81fa43c6 100644 --- a/src/gtm/main/main.c +++ b/src/gtm/main/main.c @@ -116,6 +116,7 @@ bool GTMClusterReadOnly; char *GTMStartupGTSSet; int GTMGTSFreezeLimit; int GTMStartupGTSDelta; +char *unix_socket_directory = NULL; #endif GTM_MutexLock control_lock; @@ -229,11 +230,12 @@ static void GTM_RegisterPGXCNode(Port *myport, char *PGXCNodeName); static bool CreateOptsFile(int argc, char *argv[]); static void CreateDataDirLockFile(void); -static void CreateLockFile(const char *filename, const char *refName); +void CreateLockFile(const char *filename, const char *refName); static void SetDataDir(void); static void ChangeToDataDir(void); static void checkDataDir(void); -static void DeleteLockFile(const char *filename); +void DeleteLockFile(const char *filename); +extern void RemoveSocketFile(void); static void PromoteToActive(void); #ifndef __XLOG__ static void ProcessSyncStandbyCommand(Port *myport, GTM_MessageType mtype, StringInfo message); @@ -614,7 +616,7 @@ main(int argc, char *argv[]) #endif bool force_xid = false; - int process_thread_num; + int process_thread_num = 0; bool do_basebackup = false; /* * Local variable to hold command line options. @@ -938,10 +940,12 @@ main(int argc, char *argv[]) if (strcmp(ListenAddresses, "*") == 0) status = StreamServerPort(AF_UNSPEC, NULL, (unsigned short) GTMPortNumber, + NULL, ListenSocket, MAXLISTEN); else status = StreamServerPort(AF_UNSPEC, ListenAddresses, (unsigned short) GTMPortNumber, + NULL, ListenSocket, MAXLISTEN); if (status != STATUS_OK) @@ -950,6 +954,25 @@ main(int argc, char *argv[]) ListenAddresses))); } +#ifdef __TBASE__ +#ifdef HAVE_UNIX_SOCKETS + if (unix_socket_directory) + { + status = StreamServerPort(AF_UNIX, NULL, + (unsigned short) GTMPortNumber, + unix_socket_directory, + ListenSocket, MAXLISTEN); + + if (status != STATUS_OK) + { + ereport(FATAL, + (errmsg("could not create Unix-domain socket in directory \"%s\"", + unix_socket_directory))); + } + } +#endif +#endif + /* * check that we have some socket to listen on */ @@ -1658,6 +1681,12 @@ ServerLoop(void) #ifdef __XLOG__ /* Delete pid file */ DeleteLockFile(GTM_PID_FILE); +#endif + +#ifdef __TBASE__ +#ifdef HAVE_UNIX_SOCKETS + RemoveSocketFile(); +#endif #endif elog(LOG, "GTM exits"); exit(1); @@ -4599,7 +4628,7 @@ CreateDataDirLockFile() * amPostmaster is used to determine how to encode the output PID. * isDDLock and refName are used to determine what error message to produce. 
*/ -static void +void CreateLockFile(const char *filename, const char *refName) {// #lizard forgives int fd; @@ -4806,7 +4835,7 @@ CreateOptsFile(int argc, char *argv[]) } /* delete pid file */ -static void +void DeleteLockFile(const char *filename) { if (unlink(filename) < 0) diff --git a/src/gtm/proxy/proxy_main.c b/src/gtm/proxy/proxy_main.c index 7ea36366..5508c06d 100644 --- a/src/gtm/proxy/proxy_main.c +++ b/src/gtm/proxy/proxy_main.c @@ -232,11 +232,11 @@ static void GTMProxy_CommandPending(GTMProxy_ConnectionInfo *conninfo, static bool CreateOptsFile(int argc, char *argv[]); static void CreateDataDirLockFile(void); -static void CreateLockFile(const char *filename, const char *refName); +void CreateLockFile(const char *filename, const char *refName); static void SetDataDir(void); static void ChangeToDataDir(void); static void checkDataDir(void); -static void DeleteLockFile(const char *filename); +void DeleteLockFile(const char *filename); static void RegisterProxy(bool is_reconnect); static void UnregisterProxy(void); static GTM_Conn *ConnectGTM(void); @@ -877,10 +877,12 @@ main(int argc, char *argv[]) if (strcmp(ListenAddresses, "*") == 0) status = StreamServerPort(AF_UNSPEC, NULL, (unsigned short) GTMProxyPortNumber, + NULL, ListenSocket, MAXLISTEN); else status = StreamServerPort(AF_UNSPEC, ListenAddresses, (unsigned short) GTMProxyPortNumber, + NULL, ListenSocket, MAXLISTEN); if (status == STATUS_OK) @@ -2800,7 +2802,7 @@ CreateDataDirLockFile() * amPostmaster is used to determine how to encode the output PID. * isDDLock and refName are used to determine what error message to produce. */ -static void +void CreateLockFile(const char *filename, const char *refName) {// #lizard forgives int fd; @@ -2999,7 +3001,7 @@ CreateOptsFile(int argc, char *argv[]) } /* delete pid file */ -static void +void DeleteLockFile(const char *filename) { if (unlink(filename) < 0) diff --git a/src/include/gtm/gtm_c.h b/src/include/gtm/gtm_c.h index b5a18302..b5257f1a 100644 --- a/src/include/gtm/gtm_c.h +++ b/src/include/gtm/gtm_c.h @@ -393,7 +393,7 @@ typedef enum #define GTM_SYNC_CYCLE (5 * GTM_GTS_ONE_SECOND) #define GTM_SYNC_TIME_LIMIT (60 * GTM_GTS_ONE_SECOND) #define GTM_LOG_COLLECT_CYCLE (5 * GTM_GTS_ONE_SECOND) - +#define MAXGTMPATH 256 #pragma pack() #endif diff --git a/src/include/gtm/gtm_opt.h b/src/include/gtm/gtm_opt.h index 9dc07802..8bd33409 100644 --- a/src/include/gtm/gtm_opt.h +++ b/src/include/gtm/gtm_opt.h @@ -37,7 +37,7 @@ * configuration file, or by client request in the connection startup * packet (e.g., from libpq's PGOPTIONS variable). Furthermore, an * already-started backend will ignore changes to such an option in the - * configuration file. The idea is that these options are fixed for a + * configuration file. The idea is that these options are fixed for a * given backend once it's started, but they can vary across backends. * * SUSET options can be set at postmaster startup, with the SIGHUP @@ -47,10 +47,10 @@ */ typedef enum { - GTMC_DEFAULT, - GTMC_STARTUP, - GTMC_SIGHUP, - GTMC_USERSET + GTMC_DEFAULT, + GTMC_STARTUP, + GTMC_SIGHUP, + GTMC_USERSET } GtmOptContext; /* @@ -69,7 +69,7 @@ typedef enum * * GTMC_S_TEST is used when testing values to be stored as per-database or * per-user defaults ("doit" will always be false, so this never gets stored - * as the actual source of any value). This is an interactive case, but + * as the actual source of any value). 
This is an interactive case, but * it needs its own source value because some assign hooks need to make * different validity checks in this case. * @@ -77,19 +77,19 @@ typedef enum */ typedef enum { - GTMC_S_DEFAULT, /* hard-wired default ("boot_val") */ - GTMC_S_DYNAMIC_DEFAULT, /* default computed during initialization */ - GTMC_S_ENV_VAR, /* postmaster environment variable *//* Not used in GTM */ - GTMC_S_FILE, /* gtm.conf or gtm_proxy.conf */ - GTMC_S_ARGV, /* postmaster command line */ - GTMC_S_DATABASE, /* per-database setting *//* Not used in GTM */ - GTMC_S_USER, /* per-user setting *//* Not used in GTM */ - GTMC_S_DATABASE_USER, /* per-user-and-database setting *//* Not used in GTM */ - GTMC_S_CLIENT, /* from client connection request *//* Not used in GTM */ - GTMC_S_OVERRIDE, /* special case to forcibly set default *//* Not used in GTM */ - GTMC_S_INTERACTIVE, /* dividing line for error reporting *//* Not used in GTM */ - GTMC_S_TEST, /* test per-database or per-user setting *//* Not used in GTM */ - GTMC_S_SESSION /* SET command *//* Not used in GTM */ + GTMC_S_DEFAULT, /* hard-wired default ("boot_val") */ + GTMC_S_DYNAMIC_DEFAULT, /* default computed during initialization */ + GTMC_S_ENV_VAR, /* postmaster environment variable *//* Not used in GTM */ + GTMC_S_FILE, /* gtm.conf or gtm_proxy.conf */ + GTMC_S_ARGV, /* postmaster command line */ + GTMC_S_DATABASE, /* per-database setting *//* Not used in GTM */ + GTMC_S_USER, /* per-user setting *//* Not used in GTM */ + GTMC_S_DATABASE_USER, /* per-user-and-database setting *//* Not used in GTM */ + GTMC_S_CLIENT, /* from client connection request *//* Not used in GTM */ + GTMC_S_OVERRIDE, /* special case to forcibly set default *//* Not used in GTM */ + GTMC_S_INTERACTIVE, /* dividing line for error reporting *//* Not used in GTM */ + GTMC_S_TEST, /* test per-database or per-user setting *//* Not used in GTM */ + GTMC_S_SESSION /* SET command *//* Not used in GTM */ } GtmOptSource; /* @@ -98,19 +98,19 @@ typedef enum */ typedef struct ConfigVariable { - char *name; - char *value; - char *filename; - int sourceline; - struct ConfigVariable *next; + char *name; + char *value; + char *filename; + int sourceline; + struct ConfigVariable *next; } ConfigVariable; extern bool ParseConfigFile(const char *config_file, const char *calling_file, - int depth, int elevel, - ConfigVariable **head_p, ConfigVariable **tail_p); + int depth, int elevel, + ConfigVariable **head_p, ConfigVariable **tail_p); extern bool ParseConfigFp(FILE *fp, const char *config_file, - int depth, int elevel, - ConfigVariable **head_p, ConfigVariable **tail_p); + int depth, int elevel, + ConfigVariable **head_p, ConfigVariable **tail_p); extern void FreeConfigVariables(ConfigVariable *list); /* @@ -120,9 +120,9 @@ extern void FreeConfigVariables(ConfigVariable *list); */ struct config_enum_entry { - const char *name; - int val; - bool hidden; + const char *name; + int val; + bool hidden; }; /* @@ -152,10 +152,10 @@ typedef const char *(*GtmOptShowHook) (void); */ typedef enum { - /* Types of set_config_option actions */ - GTMOPT_ACTION_SET, /* regular SET command */ - GTMOPT_ACTION_LOCAL, /* SET LOCAL command */ - GTMOPT_ACTION_SAVE /* function SET option */ + /* Types of set_config_option actions */ + GTMOPT_ACTION_SET, /* regular SET command */ + GTMOPT_ACTION_LOCAL, /* SET LOCAL command */ + GTMOPT_ACTION_SAVE /* function SET option */ } GtmOptAction; #define GTMOPT_QUALIFIER_SEPARATOR '.' 
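/*
 * A minimal usage sketch for the GTM Unix-domain socket options this patch
 * introduces (unix_socket_directory, unix_socket_group,
 * unix_socket_permissions; their GTM_OPTNAME_* names are defined further
 * down in this header).  Assuming the usual name = value gtm.conf syntax,
 * a GTM instance could be configured along these lines -- the values are
 * illustrative only; the built-in defaults are DEFAULT_PGSOCKET_DIR, an
 * empty group and 0777:
 *
 *     unix_socket_directory = '/tmp'
 *     unix_socket_group = ''
 *     unix_socket_permissions = 0700
 *
 * With a directory set, main.c additionally listens via
 * StreamServerPort(AF_UNIX, ...), and the socket file is created as
 * .s.GTM.<port> under that directory (see the UNIXSOCK_PATH change in
 * pqcomm.h below); RemoveSocketFile() cleans it up on shutdown.
 */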
@@ -163,34 +163,34 @@ typedef enum /* * bit values in "flags" of a GUC variable */ -#define GTMOPT_LIST_INPUT 0x0001 /* input can be list format */ -#define GTMOPT_LIST_QUOTE 0x0002 /* double-quote list elements */ -#define GTMOPT_NO_SHOW_ALL 0x0004 /* exclude from SHOW ALL */ -#define GTMOPT_NO_RESET_ALL 0x0008 /* exclude from RESET ALL */ -#define GTMOPT_REPORT 0x0010 /* auto-report changes to client */ -#define GTMOPT_NOT_IN_SAMPLE 0x0020 /* not in postgresql.conf.sample */ -#define GTMOPT_DISALLOW_IN_FILE 0x0040 /* can't set in postgresql.conf */ -#define GTMOPT_CUSTOM_PLACEHOLDER 0x0080 /* placeholder for custom variable */ -#define GTMOPT_SUPERUSER_ONLY 0x0100 /* show only to superusers */ -#define GTMOPT_IS_NAME 0x0200 /* limit string to NAMEDATALEN-1 */ - -#define GTMOPT_UNIT_KB 0x0400 /* value is in kilobytes */ -#define GTMOPT_UNIT_BLOCKS 0x0800 /* value is in blocks */ -#define GTMOPT_UNIT_XBLOCKS 0x0C00 /* value is in xlog blocks */ -#define GTMOPT_UNIT_MEMORY 0x0C00 /* mask for KB, BLOCKS, XBLOCKS */ - -#define GTMOPT_UNIT_MS 0x1000 /* value is in milliseconds */ -#define GTMOPT_UNIT_S 0x2000 /* value is in seconds */ -#define GTMOPT_UNIT_MIN 0x4000 /* value is in minutes */ -#define GTMOPT_UNIT_TIME 0x7000 /* mask for MS, S, MIN */ - -#define GTMOPT_NOT_WHILE_SEC_REST 0x8000 /* can't set if security restricted */ +#define GTMOPT_LIST_INPUT 0x0001 /* input can be list format */ +#define GTMOPT_LIST_QUOTE 0x0002 /* double-quote list elements */ +#define GTMOPT_NO_SHOW_ALL 0x0004 /* exclude from SHOW ALL */ +#define GTMOPT_NO_RESET_ALL 0x0008 /* exclude from RESET ALL */ +#define GTMOPT_REPORT 0x0010 /* auto-report changes to client */ +#define GTMOPT_NOT_IN_SAMPLE 0x0020 /* not in postgresql.conf.sample */ +#define GTMOPT_DISALLOW_IN_FILE 0x0040 /* can't set in postgresql.conf */ +#define GTMOPT_CUSTOM_PLACEHOLDER 0x0080 /* placeholder for custom variable */ +#define GTMOPT_SUPERUSER_ONLY 0x0100 /* show only to superusers */ +#define GTMOPT_IS_NAME 0x0200 /* limit string to NAMEDATALEN-1 */ + +#define GTMOPT_UNIT_KB 0x0400 /* value is in kilobytes */ +#define GTMOPT_UNIT_BLOCKS 0x0800 /* value is in blocks */ +#define GTMOPT_UNIT_XBLOCKS 0x0C00 /* value is in xlog blocks */ +#define GTMOPT_UNIT_MEMORY 0x0C00 /* mask for KB, BLOCKS, XBLOCKS */ + +#define GTMOPT_UNIT_MS 0x1000 /* value is in milliseconds */ +#define GTMOPT_UNIT_S 0x2000 /* value is in seconds */ +#define GTMOPT_UNIT_MIN 0x4000 /* value is in minutes */ +#define GTMOPT_UNIT_TIME 0x7000 /* mask for MS, S, MIN */ + +#define GTMOPT_NOT_WHILE_SEC_REST 0x8000 /* can't set if security restricted */ /* * Functions exported by gtm_opt.c */ extern void SetConfigOption(const char *name, const char *value, - GtmOptContext context, GtmOptSource source); + GtmOptContext context, GtmOptSource source); extern void EmitWarningsOnPlaceholders(const char *className); @@ -200,17 +200,17 @@ extern bool ProcessConfigFile(GtmOptContext context); extern void InitializeGTMOptions(void); extern bool SelectConfigFiles(const char *userDoption, const char *progname); extern void ResetAllOptions(void); -extern int NewGTMNestLevel(void); +extern int NewGTMNestLevel(void); extern bool parse_int(const char *value, int *result, int flags, - const char **hintmsg); + const char **hintmsg); extern bool parse_real(const char *value, double *result); extern bool set_config_option(const char *name, const char *value, - GtmOptContext context, GtmOptSource source, - bool changeVal); + GtmOptContext context, GtmOptSource source, + bool changeVal); extern 
char *GetConfigOptionByName(const char *name, const char **varname); extern void GetConfigOptionByNum(int varnum, const char **values, bool *noshow); -extern int GetNumConfigOptions(void); +extern int GetNumConfigOptions(void); extern void ParseLongOption(const char *string, char **name, char **value); #ifndef PG_KRB_SRVTAB @@ -223,9 +223,9 @@ extern void ParseLongOption(const char *string, char **name, char **value); /* upper limit for GUC variables measured in kilobytes of memory */ /* note that various places assume the byte size fits in a "long" variable */ #if SIZEOF_SIZE_T > 4 && SIZEOF_LONG > 4 -#define MAX_KILOBYTES INT_MAX +#define MAX_KILOBYTES INT_MAX #else -#define MAX_KILOBYTES (INT_MAX / 1024) +#define MAX_KILOBYTES (INT_MAX / 1024) #endif #ifdef TRACE_SORT @@ -245,20 +245,20 @@ extern volatile sig_atomic_t ConfigReloadPending; */ #define Server_Message_Level_Options()\ static const struct config_enum_entry server_message_level_options[] = {\ - {"debug", DEBUG2, true},\ - {"debug5", DEBUG5, false},\ - {"debug4", DEBUG4, false},\ - {"debug3", DEBUG3, false},\ - {"debug2", DEBUG2, false},\ - {"debug1", DEBUG1, false},\ - {"info", INFO, false},\ - {"notice", NOTICE, false},\ - {"warning", WARNING, false},\ - {"error", ERROR, false},\ - {"log", LOG, false},\ - {"fatal", FATAL, false},\ - {"panic", PANIC, false},\ - {NULL, 0, false}\ + {"debug", DEBUG2, true},\ + {"debug5", DEBUG5, false},\ + {"debug4", DEBUG4, false},\ + {"debug3", DEBUG3, false},\ + {"debug2", DEBUG2, false},\ + {"debug1", DEBUG1, false},\ + {"info", INFO, false},\ + {"notice", NOTICE, false},\ + {"warning", WARNING, false},\ + {"error", ERROR, false},\ + {"log", LOG, false},\ + {"fatal", FATAL, false},\ + {"panic", PANIC, false},\ + {NULL, 0, false}\ } /* @@ -266,9 +266,9 @@ static const struct config_enum_entry server_message_level_options[] = {\ */ #define Gtm_Startup_Mode_Options()\ static const struct config_enum_entry gtm_startup_mode_options[] = {\ - {"act", GTM_ACT_MODE, false},\ - {"standby", GTM_STANDBY_MODE, false},\ - {NULL, 0, false}\ + {"act", GTM_ACT_MODE, false},\ + {"standby", GTM_STANDBY_MODE, false},\ + {NULL, 0, false}\ } /* @@ -279,8 +279,8 @@ static const struct config_enum_entry gtm_startup_mode_options[] = {\ #define gtmOptContext_Names()\ const char *const GtmOptContext_Names[] =\ {\ - /* GTMC_STGARTUP */ "startup",\ - /* GTMC_SIGHUP */ "sighup"\ + /* GTMC_STGARTUP */ "startup",\ + /* GTMC_SIGHUP */ "sighup"\ } /* @@ -291,19 +291,19 @@ const char *const GtmOptContext_Names[] =\ #define gtmOptSource_Names()\ const char *const GtmOptSource_Names[] =\ {\ - /* GTMC_S_DEFAULT */ "default",\ - /* GTMC_S_DYNAMIC_DEFAULT */ "default",\ - /* GTMC_S_ENV_VAR */ "environment variable",\ - /* GTMC_S_FILE */ "configuration file",\ - /* GTMC_S_ARGV */ "command line",\ - /* GTMC_S_DATABASE */ "database",\ - /* GTMC_S_USER */ "user",\ - /* GTMC_S_DATABASE_USER */ "database user",\ - /* GTMC_S_CLIENT */ "client",\ - /* GTMC_S_OVERRIDE */ "override",\ - /* GTMC_S_INTERACTIVE */ "interactive",\ - /* GTMC_S_TEST */ "test",\ - /* GTMC_S_SESSION */ "session"\ + /* GTMC_S_DEFAULT */ "default",\ + /* GTMC_S_DYNAMIC_DEFAULT */ "default",\ + /* GTMC_S_ENV_VAR */ "environment variable",\ + /* GTMC_S_FILE */ "configuration file",\ + /* GTMC_S_ARGV */ "command line",\ + /* GTMC_S_DATABASE */ "database",\ + /* GTMC_S_USER */ "user",\ + /* GTMC_S_DATABASE_USER */ "database user",\ + /* GTMC_S_CLIENT */ "client",\ + /* GTMC_S_OVERRIDE */ "override",\ + /* GTMC_S_INTERACTIVE */ "interactive",\ + /* GTMC_S_TEST */ 
"test",\ + /* GTMC_S_SESSION */ "session"\ } /* @@ -314,11 +314,11 @@ const char *const GtmOptSource_Names[] =\ #define Config_Type_Names()\ const char *const config_type_names[] =\ {\ - /* GTMC_BOOL */ "bool",\ - /* GTMC_INT */ "integer",\ - /* GTMC_REAL */ "real",\ - /* GTMC_STRING */ "string",\ - /* GTMC_ENUM */ "enum"\ + /* GTMC_BOOL */ "bool",\ + /* GTMC_INT */ "integer",\ + /* GTMC_REAL */ "real",\ + /* GTMC_STRING */ "string",\ + /* GTMC_ENUM */ "enum"\ } @@ -328,37 +328,37 @@ const char *const config_type_names[] =\ * This will be used both in *.conf and command line option override. */ -#define GTM_OPTNAME_ACTIVE_HOST "active_host" -#define GTM_OPTNAME_ACTIVE_PORT "active_port" -#define GTM_OPTNAME_CONFIG_FILE "config_file" -#define GTM_OPTNAME_DATA_DIR "data_dir" -#define GTM_OPTNAME_ERROR_REPORTER "error_reporter" +#define GTM_OPTNAME_ACTIVE_HOST "active_host" +#define GTM_OPTNAME_ACTIVE_PORT "active_port" +#define GTM_OPTNAME_CONFIG_FILE "config_file" +#define GTM_OPTNAME_DATA_DIR "data_dir" +#define GTM_OPTNAME_ERROR_REPORTER "error_reporter" #define GTM_OPTNAME_CONNECT_RETRY_INTERVAL "gtm_connect_retry_interval" -#define GTM_OPTNAME_GTM_HOST "gtm_host" -#define GTM_OPTNAME_GTM_PORT "gtm_port" -#define GTM_OPTNAME_KEEPALIVES_IDLE "keepalives_idle" -#define GTM_OPTNAME_KEEPALIVES_INTERVAL "keepalives_interval" -#define GTM_OPTNAME_KEEPALIVES_COUNT "keepalives_count" -#define GTM_OPTNAME_LISTEN_ADDRESSES "listen_addresses" -#define GTM_OPTNAME_LOG_FILE "log_file" -#define GTM_OPTNAME_LOG_MIN_MESSAGES "log_min_messages" -#define GTM_OPTNAME_NODENAME "nodename" -#define GTM_OPTNAME_PORT "port" -#define GTM_OPTNAME_STARTUP "startup" -#define GTM_OPTNAME_STATUS_READER "status_reader" -#define GTM_OPTNAME_SYNCHRONOUS_BACKUP "synchronous_backup" -#define GTM_OPTNAME_WORKER_THREADS "worker_threads" +#define GTM_OPTNAME_GTM_HOST "gtm_host" +#define GTM_OPTNAME_GTM_PORT "gtm_port" +#define GTM_OPTNAME_KEEPALIVES_IDLE "keepalives_idle" +#define GTM_OPTNAME_KEEPALIVES_INTERVAL "keepalives_interval" +#define GTM_OPTNAME_KEEPALIVES_COUNT "keepalives_count" +#define GTM_OPTNAME_LISTEN_ADDRESSES "listen_addresses" +#define GTM_OPTNAME_LOG_FILE "log_file" +#define GTM_OPTNAME_LOG_MIN_MESSAGES "log_min_messages" +#define GTM_OPTNAME_NODENAME "nodename" +#define GTM_OPTNAME_PORT "port" +#define GTM_OPTNAME_STARTUP "startup" +#define GTM_OPTNAME_STATUS_READER "status_reader" +#define GTM_OPTNAME_SYNCHRONOUS_BACKUP "synchronous_backup" +#define GTM_OPTNAME_WORKER_THREADS "worker_threads" #define GTM_OPTNAME_ENABLE_DEBUG "enable_gtm_debug" #define GTM_OPTNAME_ENABLE_SEQ_DEBUG "enable_gtm_sequence_debug" -#define GTM_OPTNAME_SCALE_FACTOR_THREADS "scale_factor_threads" -#define GTM_OPTNAME_WORKER_THREADS_NUMBER "worker_thread_number" +#define GTM_OPTNAME_SCALE_FACTOR_THREADS "scale_factor_threads" +#define GTM_OPTNAME_WORKER_THREADS_NUMBER "worker_thread_number" #define GTM_OPTNAME_GTS_FREEZE_TIME_LIMIT "gtm_freeze_time_limit" #define GTM_OPTNAME_STARTUP_GTS_DELTA "gtm_startup_gts_delta" #define GTM_OPTNAME_STARTUP_GTS_SET "gtm_startup_gts_set" #define GTM_OPTNAME_CLUSTER_READ_ONLY "gtm_cluster_read_only" #ifdef __XLOG__ -#define GTM_OPTNAME_SYNCHRONOUS_COMMIT "synchronous_commit" +#define GTM_OPTNAME_SYNCHRONOUS_COMMIT "synchronous_commit" #define GTM_OPTNAME_WAL_WRITER_DELAY "wal_writer_delay" #define GTM_OPTNAME_CHECKPOINT_INTERVAL "checkpoint_interval" #define GTM_OPTNAME_ARCHIVE_COMMAND "archive_command" @@ -372,8 +372,9 @@ const char *const config_type_names[] =\ #define 
GTM_OPTNAME_RECOVERY_COMMAND "recovery_command" #endif - - +#define GTM_OPTNAME_UNIX_SOCKET_DIRECTORY "unix_socket_directory" +#define GTM_OPTNAME_UNIX_SOCKET_GROUP "unix_socket_group" +#define GTM_OPTNAME_UNIX_SOCKET_PERMISSIONS "unix_socket_permissions" #endif /* GTM_OPT_H */ diff --git a/src/include/gtm/libpq.h b/src/include/gtm/libpq.h index 2a60d8c9..8c2ac2b0 100644 --- a/src/include/gtm/libpq.h +++ b/src/include/gtm/libpq.h @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * * libpq.h - * POSTGRES LIBPQ buffer structure definitions. + * POSTGRES LIBPQ buffer structure definitions. * * * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group @@ -29,19 +29,19 @@ * prototypes for functions in pqcomm.c */ extern int StreamServerPort(int family, char *hostName, - unsigned short portNumber, int ListenSocket[], - int MaxListen); -extern int StreamConnection(int server_fd, Port *port); + unsigned short portNumber, char *unixSocketDir, int ListenSocket[], + int MaxListen); +extern int StreamConnection(int server_fd, Port *port); extern void StreamClose(int sock); extern void TouchSocketFile(void); extern void pq_comm_reset(void); -extern int pq_getbytes(Port *myport, char *s, size_t len); -extern int pq_getstring(Port *myport, StringInfo s); -extern int pq_getmessage(Port *myport, StringInfo s, int maxlen); -extern int pq_getbyte(Port *myport); -extern int pq_peekbyte(Port *myport); -extern int pq_putbytes(Port *myport, const char *s, size_t len); -extern int pq_flush(Port *myport); -extern int pq_putmessage(Port *myport, char msgtype, const char *s, size_t len); +extern int pq_getbytes(Port *myport, char *s, size_t len); +extern int pq_getstring(Port *myport, StringInfo s); +extern int pq_getmessage(Port *myport, StringInfo s, int maxlen); +extern int pq_getbyte(Port *myport); +extern int pq_peekbyte(Port *myport); +extern int pq_putbytes(Port *myport, const char *s, size_t len); +extern int pq_flush(Port *myport); +extern int pq_putmessage(Port *myport, char msgtype, const char *s, size_t len); #endif /* LIBPQ_H */ diff --git a/src/include/gtm/pqcomm.h b/src/include/gtm/pqcomm.h index d98934db..e39a72a2 100644 --- a/src/include/gtm/pqcomm.h +++ b/src/include/gtm/pqcomm.h @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * * pqcomm.h - * Definitions common to frontends and backends. + * Definitions common to frontends and backends. * * NOTE: for historical reasons, this does not correspond to pqcomm.c. * pqcomm.c's routines are declared in libpq.h. @@ -26,24 +26,37 @@ typedef struct { - struct sockaddr_storage addr; - size_t salen; + struct sockaddr_storage addr; + size_t salen; } SockAddr; /* Configure the UNIX socket location for the well known port. */ #define UNIXSOCK_PATH(path, port, sockdir) \ - snprintf(path, sizeof(path), "%s/.s.PGSQL.%d", \ - ((sockdir) && *(sockdir) != '\0') ? (sockdir) : \ - DEFAULT_PGSOCKET_DIR, \ - (port)) + snprintf(path, sizeof(path), "%s/.s.GTM.%d", \ + ((sockdir) && *(sockdir) != '\0') ? (sockdir) : \ + DEFAULT_PGSOCKET_DIR, \ + (port)) /* * In protocol 3.0 and later, the startup packet length is not fixed, but - * we set an arbitrary limit on it anyway. This is just to prevent simple + * we set an arbitrary limit on it anyway. This is just to prevent simple * denial-of-service attacks via sending enough data to run the server * out of memory. 
*/ #define MAX_STARTUP_PACKET_LENGTH 10000 + +/* + * The maximum workable length of a socket path is what will fit into + * struct sockaddr_un. This is usually only 100 or so bytes :-(. + * + * For consistency, always pass a MAXPGPATH-sized buffer to UNIXSOCK_PATH(), + * then complain if the resulting string is >= UNIXSOCK_PATH_BUFLEN bytes. + * (Because the standard API for getaddrinfo doesn't allow it to complain in + * a useful way when the socket pathname is too long, we have to test for + * this explicitly, instead of just letting the subroutine return an error.) + */ +#define UNIXSOCK_PATH_BUFLEN sizeof(((struct sockaddr_un *) NULL)->sun_path) + #endif /* PQCOMM_H */ diff --git a/src/include/postmaster/postmaster.h b/src/include/postmaster/postmaster.h index f32e7db9..0c27047e 100644 --- a/src/include/postmaster/postmaster.h +++ b/src/include/postmaster/postmaster.h @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * * postmaster.h - * Exports from postmaster/postmaster.c. + * Exports from postmaster/postmaster.c. * * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California @@ -15,15 +15,15 @@ /* GUC options */ extern bool EnableSSL; -extern int ReservedBackends; -extern int PostPortNumber; -extern int Unix_socket_permissions; +extern int ReservedBackends; +extern int PostPortNumber; +extern int Unix_socket_permissions; extern char *Unix_socket_group; extern char *Unix_socket_directories; extern char *ListenAddresses; extern bool ClientAuthInProgress; -extern int PreAuthDelay; -extern int AuthenticationTimeout; +extern int PreAuthDelay; +extern int AuthenticationTimeout; extern bool Log_connections; extern bool log_hostname; extern bool enable_bonjour; @@ -37,6 +37,7 @@ extern char *g_BouncerConf; extern bool enable_null_string; extern bool g_concurrently_index; extern bool g_set_global_snapshot; +extern char *gtm_unix_socket_directory; #endif #ifdef __COLD_HOT__ @@ -66,15 +67,15 @@ enum #ifdef WIN32 extern HANDLE PostmasterHandle; #else -extern int postmaster_alive_fds[2]; +extern int postmaster_alive_fds[2]; /* * Constants that represent which of postmaster_alive_fds is held by * postmaster, and which is used in children to check for postmaster death. */ -#define POSTMASTER_FD_WATCH 0 /* used in children to check for - * postmaster death */ -#define POSTMASTER_FD_OWN 1 /* kept open by postmaster only */ +#define POSTMASTER_FD_WATCH 0 /* used in children to check for + * postmaster death */ +#define POSTMASTER_FD_OWN 1 /* kept open by postmaster only */ #endif extern const char *progname; @@ -82,9 +83,9 @@ extern const char *progname; extern void PostmasterMain(int argc, char *argv[]) pg_attribute_noreturn(); extern void ClosePostmasterPorts(bool am_syslogger); -extern int MaxLivePostmasterChildren(void); +extern int MaxLivePostmasterChildren(void); -extern int GetNumShmemAttachedBgworkers(void); +extern int GetNumShmemAttachedBgworkers(void); extern bool PostmasterMarkPIDForWorkerNotify(int); #ifdef EXEC_BACKEND @@ -105,10 +106,10 @@ extern void ShmemBackendArrayAllocation(void); * compute 4*MaxBackends without any overflow check. This is rechecked in the * relevant GUC check hooks and in RegisterBackgroundWorker(). 
*/ -#define MAX_BACKENDS 0x3FFFF +#define MAX_BACKENDS 0x3FFFF #ifdef __TBASE__ extern void PostmasterEnableLogTimeout(void); extern void PostmasterDisableTimeout(void); extern bool PostmasterIsPrimaryAndNormal(void); #endif -#endif /* _POSTMASTER_H */ +#endif /* _POSTMASTER_H */ From 55318242da135554e697df6231e60e25c30cbf2c Mon Sep 17 00:00:00 2001 From: whalesong Date: Wed, 20 Jan 2021 16:21:54 +0800 Subject: [PATCH 327/578] Bugfix: gtm switch cause prepared statement can not use, ID84618929 --- src/backend/access/transam/gtm.c | 85 ++++++++- src/backend/access/transam/xact.c | 11 ++ src/backend/utils/misc/guc.c | 21 +++ src/include/access/gtm.h | 3 + src/include/access/xact.h | 287 +++++++++++++++--------------- 5 files changed, 257 insertions(+), 150 deletions(-) diff --git a/src/backend/access/transam/gtm.c b/src/backend/access/transam/gtm.c index 969311e5..daf77a90 100644 --- a/src/backend/access/transam/gtm.c +++ b/src/backend/access/transam/gtm.c @@ -56,6 +56,10 @@ int NewGtmPort = -1; bool g_GTM_skip_catalog = false; char *gtm_unix_socket_directory = DEFAULT_PGSOCKET_DIR; #endif + +int reconnect_gtm_retry_times = 3; +int reconnect_gtm_retry_interval = 500; + char *GtmHost = NULL; int GtmPort = 0; static int GtmConnectTimeout = 60; @@ -82,6 +86,7 @@ List *g_DropSeqList = NULL; List *g_AlterSeqList = NULL; #define GTM_SEQ_POSTFIX "_$TBASE$_" static void CheckConnection(void); +static void ResetGTMConnection(void); static int GetGTMStoreStatus(GTMStorageStatus *header); static int GetGTMStoreSequence(GTM_StoredSeqInfo **store_seq); static int GetGTMStoreTransaction(GTM_StoredTransactionInfo **store_txn); @@ -1107,6 +1112,10 @@ GetMasterGtmInfo(void) /* If NewGtmHost and NewGtmPort, just use it. */ if (NewGtmHost && NewGtmPort != 0) { + elog(LOG, + "GetMasterGtmInfo: set master gtm info with NewGtmHost:%s NewGtmPort:%d", + NewGtmHost, NewGtmPort); + GtmHost = strdup(NewGtmHost); GtmPort = NewGtmPort; @@ -1114,9 +1123,6 @@ GetMasterGtmInfo(void) NewGtmHost = NULL; NewGtmPort = 0; - elog(LOG, - "GetMasterGtmInfo: set master gtm info with NewGtmHost:%s NewGtmPort:%d", - NewGtmHost, NewGtmPort); return; } @@ -1184,6 +1190,53 @@ CheckConnection(void) } } +static void +ResetGTMConnection(void) +{ + Relation rel; + HeapScanDesc scan; + HeapTuple gtmtup; + Form_pgxc_node nodeForm; + bool found = false; + + CloseGTM(); + ResetGtmInfo(); + + /* + * We must be sure there is no error report, because we may be + * in AbortTransaction now. + * 1.If we are not in a inprogress or commit transaction, we should not open relation. + * 2.If we do not get lock, it is ok to try it next time. 
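+ * On success, the primary GTM's host and port found in pgxc_node are cached
+ * in GtmHost/GtmPort, and InitGTM() below reconnects using them.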
+ */ + if ( (IsTransactionState() || IsTransactionCommit()) && + ConditionalLockRelationOid(PgxcNodeRelationId, AccessShareLock)) + { + rel = relation_open(PgxcNodeRelationId, NoLock); + scan = heap_beginscan_catalog(rel, 0, NULL); + /* Only one record will match */ + while (HeapTupleIsValid(gtmtup = heap_getnext(scan, ForwardScanDirection))) + { + nodeForm = (Form_pgxc_node) GETSTRUCT(gtmtup); + if (PGXC_NODE_GTM == nodeForm->node_type && nodeForm->nodeis_primary) + { + GtmHost = strdup(NameStr(nodeForm->node_host)); + GtmPort = nodeForm->node_port; + found = true; + break; + } + } + heap_endscan(scan); + relation_close(rel, AccessShareLock); + + if (!found) + { + elog(LOG, "can not get master gtm info from pgxc_node"); + } + } + + InitGTM(); +} + void InitGTM(void) {// #lizard forgives @@ -1382,6 +1435,7 @@ GetGlobalTimestampGTM(void) GTM_Timestamp latest_gts = InvalidGlobalTimestamp; struct rusage start_r; struct timeval start_t; + int retry_cnt = 0; if (log_gtm_stats) ResetUsageCommon(&start_r, &start_t); @@ -1400,21 +1454,38 @@ GetGlobalTimestampGTM(void) /* If something went wrong (timeout), try and reset GTM connection * and retry. This is safe at the beginning of a transaction. */ - if (!GlobalTimestampIsValid(gts_result.gts)) + while (!GlobalTimestampIsValid(gts_result.gts) && + retry_cnt < reconnect_gtm_retry_times) { if(GTMDebugPrint) { elog(LOG, "get global timestamp reconnect"); } - CloseGTM(); - InitGTM(); + + ResetGTMConnection(); + retry_cnt++; + + elog(DEBUG5, "reset gtm connection %d times", retry_cnt); + if (conn) { gts_result = get_global_timestamp(conn); + if (GlobalTimestampIsValid(gts_result.gts)) + { + elog(DEBUG5, "retry get global timestamp gts " INT64_FORMAT, + gts_result.gts); + break; + } } else if(GTMDebugPrint) { - elog(LOG, "get global timestamp conn is null after retry"); + elog(LOG, "get global timestamp conn is null after retry %d times", + retry_cnt); + } + + if (retry_cnt < reconnect_gtm_retry_times) + { + pg_usleep(reconnect_gtm_retry_interval * 1000); } } elog(DEBUG7, "get global timestamp gts " INT64_FORMAT, gts_result.gts); diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index 91cd002a..0f80f39c 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -540,6 +540,17 @@ IsTransactionState(void) return (s->state == TRANS_INPROGRESS); } +/* + * IsTransactionCommit + * + * This returns true if transaction state is TRANS_COMMIT + */ +bool +IsTransactionCommit(void) +{ + return (CurrentTransactionState->state == TRANS_COMMIT); +} + /* * IsAbortedTransactionBlockState * diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index a0db968c..6e3a43dc 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -4757,6 +4757,27 @@ static struct config_int ConfigureNamesInt[] = }, #endif + { + {"reconnect_gtm_retry_times", PGC_USERSET, CUSTOM_OPTIONS, + gettext_noop("reconnect gtm retry times"), + NULL + }, + &reconnect_gtm_retry_times, + 3, 0, 100, + NULL, NULL, NULL + }, + + { + {"reconnect_gtm_retry_interval", PGC_USERSET, CUSTOM_OPTIONS, + gettext_noop("reconnect gtm retry interval"), + NULL, + GUC_UNIT_MS + }, + &reconnect_gtm_retry_interval, + 500, 0, 60000, + NULL, NULL, NULL + }, + /* End-of-list marker */ { {NULL, 0, 0, NULL, NULL}, NULL, 0, 0, 0, NULL, NULL, NULL diff --git a/src/include/access/gtm.h b/src/include/access/gtm.h index aa11962c..5da0eb6a 100644 --- a/src/include/access/gtm.h +++ b/src/include/access/gtm.h @@ -107,6 +107,9 @@ extern 
char *NewGtmHost; extern int NewGtmPort; #endif +extern int reconnect_gtm_retry_times; +extern int reconnect_gtm_retry_interval; + extern bool IsGTMConnected(void); extern void InitGTM(void); extern void CloseGTM(void); diff --git a/src/include/access/xact.h b/src/include/access/xact.h index d63c63ed..a06c14d4 100644 --- a/src/include/access/xact.h +++ b/src/include/access/xact.h @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * * xact.h - * postgres transaction system definitions + * postgres transaction system definitions * * * Portions Copyright (c) 2012-2014, TransLattice, Inc. @@ -33,12 +33,12 @@ /* * Xact isolation levels */ -#define XACT_READ_UNCOMMITTED 0 -#define XACT_READ_COMMITTED 1 -#define XACT_REPEATABLE_READ 2 -#define XACT_SERIALIZABLE 3 +#define XACT_READ_UNCOMMITTED 0 +#define XACT_READ_COMMITTED 1 +#define XACT_REPEATABLE_READ 2 +#define XACT_SERIALIZABLE 3 -extern int DefaultXactIsoLevel; +extern int DefaultXactIsoLevel; extern PGDLLIMPORT int XactIsoLevel; /* @@ -68,19 +68,19 @@ extern bool XactDeferrable; typedef enum { - SYNCHRONOUS_COMMIT_OFF, /* asynchronous commit */ - SYNCHRONOUS_COMMIT_LOCAL_FLUSH, /* wait for local flush only */ - SYNCHRONOUS_COMMIT_REMOTE_WRITE, /* wait for local flush and remote - * write */ - SYNCHRONOUS_COMMIT_REMOTE_FLUSH, /* wait for local and remote flush */ - SYNCHRONOUS_COMMIT_REMOTE_APPLY /* wait for local flush and remote apply */ -} SyncCommitLevel; + SYNCHRONOUS_COMMIT_OFF, /* asynchronous commit */ + SYNCHRONOUS_COMMIT_LOCAL_FLUSH, /* wait for local flush only */ + SYNCHRONOUS_COMMIT_REMOTE_WRITE, /* wait for local flush and remote + * write */ + SYNCHRONOUS_COMMIT_REMOTE_FLUSH, /* wait for local and remote flush */ + SYNCHRONOUS_COMMIT_REMOTE_APPLY /* wait for local flush and remote apply */ +} SyncCommitLevel; /* Define the default setting for synchronous_commit */ -#define SYNCHRONOUS_COMMIT_ON SYNCHRONOUS_COMMIT_REMOTE_FLUSH +#define SYNCHRONOUS_COMMIT_ON SYNCHRONOUS_COMMIT_REMOTE_FLUSH /* Synchronous commit level */ -extern int synchronous_commit; +extern int synchronous_commit; /* * Miscellaneous flag bits to record events which occur on the top level @@ -89,48 +89,48 @@ extern int synchronous_commit; * globally accessible, so can be set from anywhere in the code which requires * recording flags. */ -extern int MyXactFlags; +extern int MyXactFlags; /* * XACT_FLAGS_ACCESSEDTEMPREL - set when a temporary relation is accessed. We * don't allow PREPARE TRANSACTION in that case. */ -#define XACT_FLAGS_ACCESSEDTEMPREL (1U << 0) +#define XACT_FLAGS_ACCESSEDTEMPREL (1U << 0) /* * XACT_FLAGS_ACQUIREDACCESSEXCLUSIVELOCK - records whether the top level xact * logged any Access Exclusive Locks. 
*/ -#define XACT_FLAGS_ACQUIREDACCESSEXCLUSIVELOCK (1U << 1) +#define XACT_FLAGS_ACQUIREDACCESSEXCLUSIVELOCK (1U << 1) /* - * start- and end-of-transaction callbacks for dynamically loaded modules + * start- and end-of-transaction callbacks for dynamically loaded modules */ typedef enum { - XACT_EVENT_COMMIT, - XACT_EVENT_PARALLEL_COMMIT, - XACT_EVENT_ABORT, - XACT_EVENT_PARALLEL_ABORT, - XACT_EVENT_PREPARE, - XACT_EVENT_PRE_COMMIT, - XACT_EVENT_PARALLEL_PRE_COMMIT, - XACT_EVENT_PRE_PREPARE + XACT_EVENT_COMMIT, + XACT_EVENT_PARALLEL_COMMIT, + XACT_EVENT_ABORT, + XACT_EVENT_PARALLEL_ABORT, + XACT_EVENT_PREPARE, + XACT_EVENT_PRE_COMMIT, + XACT_EVENT_PARALLEL_PRE_COMMIT, + XACT_EVENT_PRE_PREPARE } XactEvent; typedef void (*XactCallback) (XactEvent event, void *arg); typedef enum { - SUBXACT_EVENT_START_SUB, - SUBXACT_EVENT_COMMIT_SUB, - SUBXACT_EVENT_ABORT_SUB, - SUBXACT_EVENT_PRE_COMMIT_SUB + SUBXACT_EVENT_START_SUB, + SUBXACT_EVENT_COMMIT_SUB, + SUBXACT_EVENT_ABORT_SUB, + SUBXACT_EVENT_PRE_COMMIT_SUB } SubXactEvent; typedef void (*SubXactCallback) (SubXactEvent event, SubTransactionId mySubid, - SubTransactionId parentSubid, void *arg); + SubTransactionId parentSubid, void *arg); #ifdef PGXC /* @@ -138,16 +138,16 @@ typedef void (*SubXactCallback) (SubXactEvent event, SubTransactionId mySubid, */ typedef enum { - GTM_EVENT_COMMIT, - GTM_EVENT_ABORT, - GTM_EVENT_PREPARE + GTM_EVENT_COMMIT, + GTM_EVENT_ABORT, + GTM_EVENT_PREPARE } GTMEvent; typedef void (*GTMCallback) (GTMEvent event, void *arg); #endif /* ---------------- - * transaction-related XLOG entries + * transaction-related XLOG entries * ---------------- */ @@ -155,24 +155,24 @@ typedef void (*GTMCallback) (GTMEvent event, void *arg); * XLOG allows to store some information in high 4 bits of log record xl_info * field. We use 3 for the opcode, and one about an optional flag variable. */ -#define XLOG_XACT_COMMIT 0x00 -#define XLOG_XACT_PREPARE 0x10 -#define XLOG_XACT_ABORT 0x20 -#define XLOG_XACT_COMMIT_PREPARED 0x30 -#define XLOG_XACT_ABORT_PREPARED 0x40 -#define XLOG_XACT_ASSIGNMENT 0x50 +#define XLOG_XACT_COMMIT 0x00 +#define XLOG_XACT_PREPARE 0x10 +#define XLOG_XACT_ABORT 0x20 +#define XLOG_XACT_COMMIT_PREPARED 0x30 +#define XLOG_XACT_ABORT_PREPARED 0x40 +#define XLOG_XACT_ASSIGNMENT 0x50 #ifdef __TBASE__ /* free opcode 0x60 */ -#define XLOG_XACT_ACQUIRE_GTS 0x60 +#define XLOG_XACT_ACQUIRE_GTS 0x60 #endif /* free opcode 0x70 */ /* mask for filtering opcodes out of xl_info */ -#define XLOG_XACT_OPMASK 0x70 +#define XLOG_XACT_OPMASK 0x70 /* does this record have a 'xinfo' field or not */ -#define XLOG_XACT_HAS_INFO 0x80 +#define XLOG_XACT_HAS_INFO 0x80 /* record 2plc file for readonly explicit transaction */ #define XLOG_XACT_RECORD_READONLY 0x90 @@ -180,13 +180,13 @@ typedef void (*GTMCallback) (GTMEvent event, void *arg); * The following flags, stored in xinfo, determine which information is * contained in commit/abort records. 
*/ -#define XACT_XINFO_HAS_DBINFO (1U << 0) -#define XACT_XINFO_HAS_SUBXACTS (1U << 1) -#define XACT_XINFO_HAS_RELFILENODES (1U << 2) -#define XACT_XINFO_HAS_INVALS (1U << 3) -#define XACT_XINFO_HAS_TWOPHASE (1U << 4) -#define XACT_XINFO_HAS_ORIGIN (1U << 5) -#define XACT_XINFO_HAS_AE_LOCKS (1U << 6) +#define XACT_XINFO_HAS_DBINFO (1U << 0) +#define XACT_XINFO_HAS_SUBXACTS (1U << 1) +#define XACT_XINFO_HAS_RELFILENODES (1U << 2) +#define XACT_XINFO_HAS_INVALS (1U << 3) +#define XACT_XINFO_HAS_TWOPHASE (1U << 4) +#define XACT_XINFO_HAS_ORIGIN (1U << 5) +#define XACT_XINFO_HAS_AE_LOCKS (1U << 6) /* * Also stored in xinfo, these indicating a variety of additional actions that @@ -196,24 +196,24 @@ typedef void (*GTMCallback) (GTMEvent event, void *arg); * EOXact... routines which run at the end of the original transaction * completion. */ -#define XACT_COMPLETION_APPLY_FEEDBACK (1U << 29) -#define XACT_COMPLETION_UPDATE_RELCACHE_FILE (1U << 30) -#define XACT_COMPLETION_FORCE_SYNC_COMMIT (1U << 31) +#define XACT_COMPLETION_APPLY_FEEDBACK (1U << 29) +#define XACT_COMPLETION_UPDATE_RELCACHE_FILE (1U << 30) +#define XACT_COMPLETION_FORCE_SYNC_COMMIT (1U << 31) /* Access macros for above flags */ #define XactCompletionApplyFeedback(xinfo) \ - ((xinfo & XACT_COMPLETION_APPLY_FEEDBACK) != 0) + ((xinfo & XACT_COMPLETION_APPLY_FEEDBACK) != 0) #define XactCompletionRelcacheInitFileInval(xinfo) \ - ((xinfo & XACT_COMPLETION_UPDATE_RELCACHE_FILE) != 0) + ((xinfo & XACT_COMPLETION_UPDATE_RELCACHE_FILE) != 0) #define XactCompletionForceSyncCommit(xinfo) \ - ((xinfo & XACT_COMPLETION_FORCE_SYNC_COMMIT) != 0) + ((xinfo & XACT_COMPLETION_FORCE_SYNC_COMMIT) != 0) typedef struct xl_xact_assignment { - TransactionId xtop; /* assigned XID's top-level XID */ - int nsubxacts; /* number of subtransaction XIDs */ - TransactionId xsub[FLEXIBLE_ARRAY_MEMBER]; /* assigned subxids */ + TransactionId xtop; /* assigned XID's top-level XID */ + int nsubxacts; /* number of subtransaction XIDs */ + TransactionId xsub[FLEXIBLE_ARRAY_MEMBER]; /* assigned subxids */ } xl_xact_assignment; #define MinSizeOfXactAssignment offsetof(xl_xact_assignment, xsub) @@ -237,78 +237,78 @@ typedef struct xl_xact_assignment typedef struct xl_xact_xinfo { - /* - * Even though we right now only require 1 byte of space in xinfo we use - * four so following records don't have to care about alignment. Commit - * records can be large, so copying large portions isn't attractive. - */ - uint32 xinfo; + /* + * Even though we right now only require 1 byte of space in xinfo we use + * four so following records don't have to care about alignment. Commit + * records can be large, so copying large portions isn't attractive. 
+ */ + uint32 xinfo; } xl_xact_xinfo; typedef struct xl_xact_dbinfo { - Oid dbId; /* MyDatabaseId */ - Oid tsId; /* MyDatabaseTableSpace */ + Oid dbId; /* MyDatabaseId */ + Oid tsId; /* MyDatabaseTableSpace */ } xl_xact_dbinfo; typedef struct xl_xact_subxacts { - int nsubxacts; /* number of subtransaction XIDs */ - TransactionId subxacts[FLEXIBLE_ARRAY_MEMBER]; + int nsubxacts; /* number of subtransaction XIDs */ + TransactionId subxacts[FLEXIBLE_ARRAY_MEMBER]; } xl_xact_subxacts; #define MinSizeOfXactSubxacts offsetof(xl_xact_subxacts, subxacts) typedef struct xl_xact_relfilenodes { - int nrels; /* number of subtransaction XIDs */ - RelFileNode xnodes[FLEXIBLE_ARRAY_MEMBER]; + int nrels; /* number of subtransaction XIDs */ + RelFileNode xnodes[FLEXIBLE_ARRAY_MEMBER]; } xl_xact_relfilenodes; #define MinSizeOfXactRelfilenodes offsetof(xl_xact_relfilenodes, xnodes) typedef struct xl_xact_invals { - int nmsgs; /* number of shared inval msgs */ - SharedInvalidationMessage msgs[FLEXIBLE_ARRAY_MEMBER]; + int nmsgs; /* number of shared inval msgs */ + SharedInvalidationMessage msgs[FLEXIBLE_ARRAY_MEMBER]; } xl_xact_invals; #define MinSizeOfXactInvals offsetof(xl_xact_invals, msgs) typedef struct xl_xact_twophase { - TransactionId xid; + TransactionId xid; } xl_xact_twophase; typedef struct xl_xact_origin { - XLogRecPtr origin_lsn; - TimestampTz origin_timestamp; + XLogRecPtr origin_lsn; + TimestampTz origin_timestamp; } xl_xact_origin; typedef struct xl_xact_commit { - TimestampTz global_timestamp; /* logical global timestamp */ - TimestampTz xact_time; /* time of commit */ - - /* xl_xact_xinfo follows if XLOG_XACT_HAS_INFO */ - /* xl_xact_dbinfo follows if XINFO_HAS_DBINFO */ - /* xl_xact_subxacts follows if XINFO_HAS_SUBXACT */ - /* xl_xact_relfilenodes follows if XINFO_HAS_RELFILENODES */ - /* xl_xact_invals follows if XINFO_HAS_INVALS */ - /* xl_xact_twophase follows if XINFO_HAS_TWOPHASE */ - /* xl_xact_origin follows if XINFO_HAS_ORIGIN, stored unaligned! */ + TimestampTz global_timestamp; /* logical global timestamp */ + TimestampTz xact_time; /* time of commit */ + + /* xl_xact_xinfo follows if XLOG_XACT_HAS_INFO */ + /* xl_xact_dbinfo follows if XINFO_HAS_DBINFO */ + /* xl_xact_subxacts follows if XINFO_HAS_SUBXACT */ + /* xl_xact_relfilenodes follows if XINFO_HAS_RELFILENODES */ + /* xl_xact_invals follows if XINFO_HAS_INVALS */ + /* xl_xact_twophase follows if XINFO_HAS_TWOPHASE */ + /* xl_xact_origin follows if XINFO_HAS_ORIGIN, stored unaligned! */ } xl_xact_commit; #define MinSizeOfXactCommit (offsetof(xl_xact_commit, xact_time) + sizeof(TimestampTz)) typedef struct xl_xact_abort { - TimestampTz global_timestamp; /* logical global timestamp */ - TimestampTz xact_time; /* time of abort */ - - /* xl_xact_xinfo follows if XLOG_XACT_HAS_INFO */ - /* No db_info required */ - /* xl_xact_subxacts follows if HAS_SUBXACT */ - /* xl_xact_relfilenodes follows if HAS_RELFILENODES */ - /* No invalidation messages needed. */ - /* xl_xact_twophase follows if XINFO_HAS_TWOPHASE */ + TimestampTz global_timestamp; /* logical global timestamp */ + TimestampTz xact_time; /* time of abort */ + + /* xl_xact_xinfo follows if XLOG_XACT_HAS_INFO */ + /* No db_info required */ + /* xl_xact_subxacts follows if HAS_SUBXACT */ + /* xl_xact_relfilenodes follows if HAS_RELFILENODES */ + /* No invalidation messages needed. 
*/ + /* xl_xact_twophase follows if XINFO_HAS_TWOPHASE */ } xl_xact_abort; #define MinSizeOfXactAbort sizeof(xl_xact_abort) @@ -319,48 +319,48 @@ typedef struct xl_xact_abort */ typedef struct xl_xact_parsed_commit { - TimestampTz global_timestamp; /* logical global timestamp */ - TimestampTz xact_time; + TimestampTz global_timestamp; /* logical global timestamp */ + TimestampTz xact_time; - uint32 xinfo; + uint32 xinfo; - Oid dbId; /* MyDatabaseId */ - Oid tsId; /* MyDatabaseTableSpace */ + Oid dbId; /* MyDatabaseId */ + Oid tsId; /* MyDatabaseTableSpace */ - int nsubxacts; - TransactionId *subxacts; + int nsubxacts; + TransactionId *subxacts; - int nrels; - RelFileNode *xnodes; + int nrels; + RelFileNode *xnodes; - int nmsgs; - SharedInvalidationMessage *msgs; + int nmsgs; + SharedInvalidationMessage *msgs; - TransactionId twophase_xid; /* only for 2PC */ + TransactionId twophase_xid; /* only for 2PC */ - XLogRecPtr origin_lsn; - TimestampTz origin_timestamp; + XLogRecPtr origin_lsn; + TimestampTz origin_timestamp; } xl_xact_parsed_commit; typedef struct xl_xact_parsed_abort { - TimestampTz global_timestamp; /* logical global timestamp */ - TimestampTz xact_time; - uint32 xinfo; + TimestampTz global_timestamp; /* logical global timestamp */ + TimestampTz xact_time; + uint32 xinfo; - int nsubxacts; - TransactionId *subxacts; + int nsubxacts; + TransactionId *subxacts; - int nrels; - RelFileNode *xnodes; + int nrels; + RelFileNode *xnodes; - TransactionId twophase_xid; /* only for 2PC */ + TransactionId twophase_xid; /* only for 2PC */ } xl_xact_parsed_abort; #ifdef __TBASE__ typedef struct xl_xact_acquire_gts { - TimestampTz global_timestamp; /* logical global timestamp */ + TimestampTz global_timestamp; /* logical global timestamp */ }xl_xact_acquire_gts; #endif @@ -483,12 +483,12 @@ typedef enum REMOTE_ABORT /* from pgxc_node_remote_abort */ }CurrentOperation; /* record twophase trans operation before receive responses */ -typedef struct ConnTransState /* record twophase trasaction state of each connection*/ +typedef struct ConnTransState /* record twophase trasaction state of each connection*/ { bool is_participant; ConnState conn_state; /* record state of each connection in twophase trans */ - TwoPhaseTransState state; /* state of twophase trans in each connection */ - int handle_idx; /* index of dn_handles or cn_handles */ + TwoPhaseTransState state; /* state of twophase trans in each connection */ + int handle_idx; /* index of dn_handles or cn_handles */ }ConnTransState; typedef struct AllConnNodeInfo @@ -503,14 +503,14 @@ typedef struct LocalTwoPhaseState bool is_start_node; bool is_readonly; /* since explicit transaction can be readonly, need to record readonly in 2pc file */ bool is_after_prepare; /* record whether the transaction pass the whole prepare phase */ - char *gid; /* gid of twophase transaction*/ - TwoPhaseTransState state; /* global twophase state */ - ConnTransState *coord_state; /* each coord participants state */ + char *gid; /* gid of twophase transaction*/ + TwoPhaseTransState state; /* global twophase state */ + ConnTransState *coord_state; /* each coord participants state */ int coord_index; /* index of coord_state */ - ConnTransState *datanode_state; + ConnTransState *datanode_state; int datanode_index; /* index of datanode_state */ bool isprinted; /* is printed in AbortTransaction */ - char start_node_name[NAMEDATALEN]; /* twophase trans startnode */ + char start_node_name[NAMEDATALEN]; /* twophase trans startnode */ TransactionId start_xid; char 
*participants; PGXCNodeAllHandles *handles; /* handles in each phase in twophase trans */ @@ -522,10 +522,11 @@ extern LocalTwoPhaseState g_twophase_state; #endif /* ---------------- - * extern definitions + * extern definitions * ---------------- */ extern bool IsTransactionState(void); +extern bool IsTransactionCommit(void); extern bool IsAbortedTransactionBlockState(void); extern TransactionId GetTopTransactionId(void); extern TransactionId GetTopTransactionIdIfAny(void); @@ -606,7 +607,7 @@ extern void SetCurrentStatementStartTimestamp(void); extern TimestampTz GetCurrentGTMStartTimestamp(void); extern void SetCurrentGTMDeltaTimestamp(TimestampTz timestamp); #endif -extern int GetCurrentTransactionNestLevel(void); +extern int GetCurrentTransactionNestLevel(void); extern bool TransactionIdIsCurrentTransactionId(TransactionId xid); extern void CommandCounterIncrement(void); extern void ForceSyncCommit(void); @@ -671,22 +672,22 @@ extern bool IsPGXCNodeXactDatanodeDirect(void); extern void TransactionRecordXidWait(TransactionId xid); #endif -extern int xactGetCommittedChildren(TransactionId **ptr); +extern int xactGetCommittedChildren(TransactionId **ptr); extern XLogRecPtr XactLogCommitRecord(TimestampTz global_timestamp, - TimestampTz commit_time, - int nsubxacts, TransactionId *subxacts, - int nrels, RelFileNode *rels, - int nmsgs, SharedInvalidationMessage *msgs, - bool relcacheInval, bool forceSync, - int xactflags, - TransactionId twophase_xid); + TimestampTz commit_time, + int nsubxacts, TransactionId *subxacts, + int nrels, RelFileNode *rels, + int nmsgs, SharedInvalidationMessage *msgs, + bool relcacheInval, bool forceSync, + int xactflags, + TransactionId twophase_xid); extern XLogRecPtr XactLogAbortRecord(TimestampTz global_timestamp, - TimestampTz abort_time, - int nsubxacts, TransactionId *subxacts, - int nrels, RelFileNode *rels, - int xactflags, TransactionId twophase_xid); + TimestampTz abort_time, + int nsubxacts, TransactionId *subxacts, + int nrels, RelFileNode *rels, + int xactflags, TransactionId twophase_xid); extern void xact_redo(XLogReaderState *record); /* xactdesc.c */ @@ -701,4 +702,4 @@ extern void EnterParallelMode(void); extern void ExitParallelMode(void); extern bool IsInParallelMode(void); -#endif /* XACT_H */ +#endif /* XACT_H */ From 513207d06e4f48bf85d374fc7ba98697790f9607 Mon Sep 17 00:00:00 2001 From: youngxie Date: Wed, 24 Feb 2021 22:37:43 +0800 Subject: [PATCH 328/578] Fix coredump due to node number changes. http://tapd.oa.com/10092131/bugtrace/bugs/view?bug_id=1010092131084977493&url_cache_key=d4e1402777dc733479aac463ad1a9d24 When a transcation is running, we hold pooler_reload until transaction finishes and rebuild global memory that related with node number after reload. 
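In outline: HandlePoolerReload() no longer rebuilds the node handles from the signal path; it only calls RequestInvalidateRemoteHandles() to set HandlesInvalidatePending, and the main loop in postgres.c picks the flag up via CheckInvalidateRemoteHandles() before reading the next client command. The rebuild itself is guarded so it cannot run mid-transaction. A minimal sketch of that guard (simplified from the DoInvalidateRemoteHandles() hunk in this patch; error handling and the refresh flag are omitted):

    static bool
    DoInvalidateRemoteHandles(void)
    {
        /* Defer the rebuild while anything transactional is in flight. */
        if (InterruptHoldoffCount != 0 ||
            !IsTransactionIdle() ||
            GetGlobalCommitTimestamp() != InvalidGlobalTimestamp)
            return false;               /* HandlesInvalidatePending stays set */

        HOLD_INTERRUPTS();

        /* Rebuilding node handles must run inside a transaction block. */
        StartTransactionCommand();
        InitMultinodeExecutor(true);
        CommitTransactionCommand();

        /* Drop pooled connections so fresh node info is fetched next time. */
        PoolManagerDisconnect();

        HandlesInvalidatePending = false;
        RESUME_INTERRUPTS();
        return false;
    }

Because the node count may have changed by the time the rebuild runs, InitMultinodeExecutor() now also rebuilds the datanode_queries prepared-statement hash (its entry size depends on NumDataNodes) and frees the per-transaction node statistics array; see RebuildDatanodeQueryHashTable() and clean_stat_transaction() below.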
--- src/backend/access/transam/xact.c | 3 - src/backend/commands/prepare.c | 48 +++++++++++++++ src/backend/pgxc/nodemgr/nodemgr.c | 9 +++ src/backend/pgxc/pool/execRemote.c | 15 +++++ src/backend/pgxc/pool/pgxcnode.c | 53 +++++++++++++---- src/backend/pgxc/pool/poolutils.c | 38 +----------- src/backend/tcop/postgres.c | 4 +- src/backend/tcop/utility.c | 10 ++++ src/backend/utils/hash/dynahash.c | 9 +++ src/include/commands/prepare.h | 1 + src/include/pgxc/execRemote.h | 1 + src/include/pgxc/nodemgr.h | 1 + src/include/pgxc/pgxcnode.h | 3 + src/include/utils/hsearch.h | 95 +++++++++++++++--------------- 14 files changed, 187 insertions(+), 103 deletions(-) diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index 0f80f39c..f43288dc 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -8168,9 +8168,6 @@ IsTransactionIdle(void) } } - elog(WARNING,"reload is be processing in transaction. trans state: %d", CurrentTransactionState->state); - elog(WARNING,"reload is be processing in transaction. trans block state: %d", CurrentTransactionState->blockState); - return false; } diff --git a/src/backend/commands/prepare.c b/src/backend/commands/prepare.c index 7cf29f5f..1dc55563 100644 --- a/src/backend/commands/prepare.c +++ b/src/backend/commands/prepare.c @@ -479,6 +479,54 @@ InitQueryHashTable(void) #endif } +/* + * Rebuild query hash table. + */ +void +RebuildDatanodeQueryHashTable(void) +{ + HASHCTL hash_ctl; + HASH_SEQ_STATUS seq; + DatanodeStatement *entry; + DatanodeStatement *entry_tmp; + Size original_entry_size; + HTAB *datanode_queries_tmp = NULL; + + if (!IS_PGXC_COORDINATOR || !datanode_queries) + { + return; + } + + MemSet(&hash_ctl, 0, sizeof(hash_ctl)); + hash_ctl.keysize = NAMEDATALEN; + hash_ctl.entrysize = sizeof(DatanodeStatement) + NumDataNodes * sizeof(int); + + original_entry_size = hash_get_entry_size(datanode_queries); + + /* node number not changed, no need to rebuild */ + if (original_entry_size == hash_ctl.entrysize) + { + return ; + } + + datanode_queries_tmp = hash_create("Datanode Queries", + 64, + &hash_ctl, + HASH_ELEM); + /* walk over cache */ + hash_seq_init(&seq, datanode_queries); + while ((entry = hash_seq_search(&seq)) != NULL) + { + /* Now we can copy the hash table entry */ + entry_tmp = (DatanodeStatement *) hash_search(datanode_queries_tmp, entry->stmt_name, + HASH_ENTER, NULL); + memcpy(entry_tmp, entry, original_entry_size); + } + + hash_destroy(datanode_queries); + datanode_queries = datanode_queries_tmp; +} + #ifdef PGXC /* * Assign the statement name for all the RemoteQueries in the plan tree, so diff --git a/src/backend/pgxc/nodemgr/nodemgr.c b/src/backend/pgxc/nodemgr/nodemgr.c index 570aadee..830b1b8d 100644 --- a/src/backend/pgxc/nodemgr/nodemgr.c +++ b/src/backend/pgxc/nodemgr/nodemgr.c @@ -573,6 +573,15 @@ count_coords_datanodes(Relation rel, int *num_coord, int *num_dns) *num_dns = dnCount; } +/* + * Whether node changes happened + */ +bool +PrimaryNodeNumberChanged(void) +{ + return (*shmemNumCoords + *shmemNumDataNodes != NumCoords + NumDataNodes); +} + /* * PgxcNodeListAndCount * diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index c781abcf..c11ab9d7 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -189,6 +189,21 @@ stat_statement() current_tran_statements++; } +/* + * clean memory related to stat transaction + */ +void +clean_stat_transaction(void) +{ + if(!nodes_per_transaction) + { + return ; 
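The datanode_queries rebuild above is the usual dynahash re-creation idiom; restated as a stand-alone sketch (the helper name and the Min() cap on the copy are illustrative additions, the patch itself copies original_entry_size):

    static void
    rebuild_datanode_queries_sketch(void)
    {
        HASHCTL            ctl;
        HASH_SEQ_STATUS    seq;
        DatanodeStatement *old_entry;
        DatanodeStatement *new_entry;
        HTAB              *new_tab;
        Size               old_size = hash_get_entry_size(datanode_queries);

        MemSet(&ctl, 0, sizeof(ctl));
        ctl.keysize = NAMEDATALEN;
        ctl.entrysize = sizeof(DatanodeStatement) + NumDataNodes * sizeof(int);

        if (old_size == ctl.entrysize)
            return;                     /* node count unchanged, keep the table */

        /* dynahash entries cannot grow in place, so build a new table ... */
        new_tab = hash_create("Datanode Queries", 64, &ctl, HASH_ELEM);

        /* ... and copy every entry over by its statement name */
        hash_seq_init(&seq, datanode_queries);
        while ((old_entry = hash_seq_search(&seq)) != NULL)
        {
            new_entry = hash_search(new_tab, old_entry->stmt_name,
                                    HASH_ENTER, NULL);
            memcpy(new_entry, old_entry, Min(old_size, ctl.entrysize));
        }

        hash_destroy(datanode_queries);
        datanode_queries = new_tab;
    }

Capping the copy at the smaller of the two entry sizes matters if the number of datanodes can shrink: copying the original, larger entry size into the smaller new entry would overrun it.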
+ } + + free(nodes_per_transaction); + nodes_per_transaction = NULL; +} + /* * To collect statistics: count a transaction */ diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c index b69e928d..353f6b7f 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -229,7 +229,6 @@ InitMultinodeExecutor(bool is_force) PGXCNodeHandlesLookupEnt *node_handle_ent = NULL; #endif - /* Free all the existing information first */ if (is_force) pgxc_node_all_free(); @@ -245,6 +244,10 @@ InitMultinodeExecutor(bool is_force) /* Get classified list of node Oids */ PgxcNodeGetOidsExtend(&coOids, &dnOids, &sdnOids, &NumCoords, &NumDataNodes, &NumSlaveDataNodes, true); + /* Process node number related memory */ + RebuildDatanodeQueryHashTable(); + clean_stat_transaction(); + #ifdef XCP /* * Coordinator and datanode handles should be available during all the @@ -3629,12 +3632,6 @@ get_any_handle(List *datanodelist) errmsg("Invalid NULL node list"))); } - if (HandlesInvalidatePending) - if (DoInvalidateRemoteHandles()) - ereport(ERROR, - (errcode(ERRCODE_QUERY_CANCELED), - errmsg("canceling transaction due to cluster configuration reset by administrator command"))); - if (HandlesRefreshPending) if (DoRefreshRemoteHandles()) ereport(ERROR, @@ -3772,12 +3769,6 @@ get_handles(List *datanodelist, List *coordlist, bool is_coord_only_query, bool /* index of the result array */ int i = 0; - if (HandlesInvalidatePending) - if (DoInvalidateRemoteHandles()) - ereport(ERROR, - (errcode(ERRCODE_QUERY_CANCELED), - errmsg("canceling transaction due to cluster configuration reset by administrator command"))); - if (HandlesRefreshPending) if (DoRefreshRemoteHandles()) ereport(ERROR, @@ -5011,6 +5002,21 @@ PoolerMessagesPending(void) return false; } +/* + * Check HandleInvalidatePending flag + */ +void +CheckInvalidateRemoteHandles(void) +{ + if (!HandlesInvalidatePending) + return ; + + if (DoInvalidateRemoteHandles()) + ereport(ERROR, + (errcode(ERRCODE_QUERY_CANCELED), + errmsg("canceling transaction due to cluster configuration reset by administrator command"))); +} + /* * For all handles, mark as they are not in use and discard pending input/output */ @@ -5019,9 +5025,30 @@ DoInvalidateRemoteHandles(void) { bool result = false; + /* + * Not reload until transaction is complete. + * That contain two condition. + * 1. transaction status is idle. + * 2. GlobalCommitTimestamp has to be invalid + * which makes sure we are not in 2pc commit phase. + */ + if (InterruptHoldoffCount || !IsTransactionIdle() || GetGlobalCommitTimestamp() != InvalidGlobalTimestamp) + { + return result; + } + HOLD_INTERRUPTS(); + /* + * Reinitialize session, it updates the shared memory table. + * Initialize XL executor. This must be done inside a transaction block. 
+ */ + StartTransactionCommand(); InitMultinodeExecutor(true); + CommitTransactionCommand(); + + /* Disconnect from the pooler to get new connection infos next time */ + PoolManagerDisconnect(); HandlesInvalidatePending = false; HandlesRefreshPending = false; diff --git a/src/backend/pgxc/pool/poolutils.c b/src/backend/pgxc/pool/poolutils.c index b03a4e2b..0b684619 100644 --- a/src/backend/pgxc/pool/poolutils.c +++ b/src/backend/pgxc/pool/poolutils.c @@ -26,6 +26,7 @@ #include "pgxc/nodemgr.h" #include "pgxc/poolutils.h" #include "pgxc/pgxcnode.h" +#include "pgxc/execRemote.h" #include "access/gtm.h" #include "access/xact.h" #include "catalog/pgxc_node.h" @@ -415,45 +416,8 @@ HandlePoolerReload(void) if (proc_exit_inprogress) return; - if (InterruptHoldoffCount != 0) - return; - -#ifdef __TBASE__ - if (PoolerReloadHoldoffCount) - { - PoolerReloadPending = true; - return; - } - - if (false == IsTransactionIdle()) - { - return; - } - - PoolerReloadPending = false; -#endif - - HOLD_INTERRUPTS(); - - /* - * Reinitialize session, it updates the shared memory table. - * Initialize XL executor. This must be done inside a transaction block. - */ - StartTransactionCommand(); - InitMultinodeExecutor(true); - CommitTransactionCommand(); - - /* Request query cancel, when convenient */ - InterruptPending = true; - QueryCancelPending = true; - - /* Disconnect from the pooler to get new connection infos next time */ - PoolManagerDisconnect(); - /* Prevent using of cached connections to remote nodes */ RequestInvalidateRemoteHandles(); - - RESUME_INTERRUPTS(); } /* diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index 7020932d..48a49f73 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -5264,9 +5264,7 @@ PostgresMain(int argc, char *argv[], #endif #ifdef __TBASE__ - RESUME_POOLER_RELOAD(); - CHECK_FOR_POOLER_RELOAD(); - HOLD_POOLER_RELOAD(); + CheckInvalidateRemoteHandles(); #endif initStringInfo(&input_message); diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c index da28b820..37139a63 100644 --- a/src/backend/tcop/utility.c +++ b/src/backend/tcop/utility.c @@ -1845,7 +1845,17 @@ standard_ProcessUtility(PlannedStmt *pstmt, { /* Is the statement a prohibited one? 
*/ if (!IsStmtAllowedInLockedMode(parsetree, queryString)) + { + /* node number changes with ddl is not allowed */ + if (HandlesInvalidatePending && PrimaryNodeNumberChanged()) + { + ereport(ERROR, + (errcode(ERRCODE_QUERY_CANCELED), + errmsg("canceling transaction due to cluster configuration reset by administrator command"))); + } pgxc_lock_for_utility_stmt(parsetree); + + } } check_xact_readonly(parsetree); diff --git a/src/backend/utils/hash/dynahash.c b/src/backend/utils/hash/dynahash.c index ee65603b..8e62e871 100644 --- a/src/backend/utils/hash/dynahash.c +++ b/src/backend/utils/hash/dynahash.c @@ -1347,6 +1347,15 @@ hash_get_num_entries(HTAB *hashp) return sum; } +/* + * hash_get_entry_size -- get the entry size of a hashtable + */ +Size +hash_get_entry_size(HTAB *hashp) +{ + return hashp->hctl->entrysize; +} + /* * hash_seq_init/_search/_term * Sequentially search through hash table and return diff --git a/src/include/commands/prepare.h b/src/include/commands/prepare.h index 53fbdede..a5d6383e 100644 --- a/src/include/commands/prepare.h +++ b/src/include/commands/prepare.h @@ -146,6 +146,7 @@ extern void PrepareRemoteDMLStatement(bool upsert, char *stmt, char *select_stmt, char *update_stmt); extern void DropRemoteDMLStatement(char *stmt, char *update_stmt); +extern void RebuildDatanodeQueryHashTable(void); #endif #endif /* PREPARE_H */ diff --git a/src/include/pgxc/execRemote.h b/src/include/pgxc/execRemote.h index 5b6d46c6..98d51719 100644 --- a/src/include/pgxc/execRemote.h +++ b/src/include/pgxc/execRemote.h @@ -457,6 +457,7 @@ extern void ClearLocalTwoPhaseState(void); extern char *GetTransStateString(TwoPhaseTransState state); extern char *GetConnStateString(ConnState state); extern void get_partnodes(PGXCNodeAllHandles * handles, StringInfo participants); +extern void clean_stat_transaction(void); #endif #endif diff --git a/src/include/pgxc/nodemgr.h b/src/include/pgxc/nodemgr.h index 0f31d8cc..ad39f5c6 100644 --- a/src/include/pgxc/nodemgr.h +++ b/src/include/pgxc/nodemgr.h @@ -67,6 +67,7 @@ extern void PgxcNodeRemove(DropNodeStmt *stmt); extern void PgxcNodeDnListHealth(List *nodeList, bool *dnhealth); extern bool PgxcNodeUpdateHealth(Oid node, bool status); +extern bool PrimaryNodeNumberChanged(void); /* GUC parameter */ extern bool enable_multi_cluster; extern bool enable_multi_cluster_print; diff --git a/src/include/pgxc/pgxcnode.h b/src/include/pgxc/pgxcnode.h index f15515c3..6643dc36 100644 --- a/src/include/pgxc/pgxcnode.h +++ b/src/include/pgxc/pgxcnode.h @@ -143,6 +143,8 @@ typedef struct PGXCNodeHandle **coord_handles; /* an array of Coordinator handles */ } PGXCNodeAllHandles; +extern volatile bool HandlesInvalidatePending; + extern void InitMultinodeExecutor(bool is_force); extern Oid get_nodeoid_from_nodeid(int nodeid, char node_type); @@ -295,6 +297,7 @@ inline bool is_ddl_leader_cn(char *leader_cn); extern int pgxc_node_send_sessionid(PGXCNodeHandle * handle); extern void SerializeSessionId(Size maxsize, char *start_address); extern void StartParallelWorkerSessionId(char *address); +void CheckInvalidateRemoteHandles(void); #endif #ifdef __AUDIT__ diff --git a/src/include/utils/hsearch.h b/src/include/utils/hsearch.h index 43bbd08c..651b3b59 100644 --- a/src/include/utils/hsearch.h +++ b/src/include/utils/hsearch.h @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * * hsearch.h - * exported definitions for utils/hash/dynahash.c; see notes therein + * exported definitions for utils/hash/dynahash.c; see notes therein * 
* * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group @@ -27,7 +27,7 @@ typedef uint32 (*HashValueFunc) (const void *key, Size keysize); * as key comparison functions.) */ typedef int (*HashCompareFunc) (const void *key1, const void *key2, - Size keysize); + Size keysize); /* * Key copying functions must have this signature. The return value is not @@ -50,8 +50,8 @@ typedef void *(*HashAllocFunc) (Size request); */ typedef struct HASHELEMENT { - struct HASHELEMENT *link; /* link to next entry in same bucket */ - uint32 hashvalue; /* hash function result for this entry */ + struct HASHELEMENT *link; /* link to next entry in same bucket */ + uint32 hashvalue; /* hash function result for this entry */ } HASHELEMENT; /* Hash table header struct is an opaque type known only within dynahash.c */ @@ -64,74 +64,75 @@ typedef struct HTAB HTAB; /* Only those fields indicated by hash_flags need be set */ typedef struct HASHCTL { - long num_partitions; /* # partitions (must be power of 2) */ - long ssize; /* segment size */ - long dsize; /* (initial) directory size */ - long max_dsize; /* limit to dsize if dir size is limited */ - long ffactor; /* fill factor */ - Size keysize; /* hash key length in bytes */ - Size entrysize; /* total user element size in bytes */ - HashValueFunc hash; /* hash function */ - HashCompareFunc match; /* key comparison function */ - HashCopyFunc keycopy; /* key copying function */ - HashAllocFunc alloc; /* memory allocator */ - MemoryContext hcxt; /* memory context to use for allocations */ - HASHHDR *hctl; /* location of header in shared mem */ + long num_partitions; /* # partitions (must be power of 2) */ + long ssize; /* segment size */ + long dsize; /* (initial) directory size */ + long max_dsize; /* limit to dsize if dir size is limited */ + long ffactor; /* fill factor */ + Size keysize; /* hash key length in bytes */ + Size entrysize; /* total user element size in bytes */ + HashValueFunc hash; /* hash function */ + HashCompareFunc match; /* key comparison function */ + HashCopyFunc keycopy; /* key copying function */ + HashAllocFunc alloc; /* memory allocator */ + MemoryContext hcxt; /* memory context to use for allocations */ + HASHHDR *hctl; /* location of header in shared mem */ } HASHCTL; /* Flags to indicate which parameters are supplied */ -#define HASH_PARTITION 0x0001 /* Hashtable is used w/partitioned locking */ -#define HASH_SEGMENT 0x0002 /* Set segment size */ -#define HASH_DIRSIZE 0x0004 /* Set directory size (initial and max) */ -#define HASH_FFACTOR 0x0008 /* Set fill factor */ -#define HASH_ELEM 0x0010 /* Set keysize and entrysize */ -#define HASH_BLOBS 0x0020 /* Select support functions for binary keys */ -#define HASH_FUNCTION 0x0040 /* Set user defined hash function */ -#define HASH_COMPARE 0x0080 /* Set user defined comparison function */ -#define HASH_KEYCOPY 0x0100 /* Set user defined key-copying function */ -#define HASH_ALLOC 0x0200 /* Set memory allocator */ -#define HASH_CONTEXT 0x0400 /* Set memory allocation context */ -#define HASH_SHARED_MEM 0x0800 /* Hashtable is in shared memory */ -#define HASH_ATTACH 0x1000 /* Do not initialize hctl */ -#define HASH_FIXED_SIZE 0x2000 /* Initial size is a hard limit */ +#define HASH_PARTITION 0x0001 /* Hashtable is used w/partitioned locking */ +#define HASH_SEGMENT 0x0002 /* Set segment size */ +#define HASH_DIRSIZE 0x0004 /* Set directory size (initial and max) */ +#define HASH_FFACTOR 0x0008 /* Set fill factor */ +#define HASH_ELEM 0x0010 /* Set keysize and entrysize */ 
+#define HASH_BLOBS 0x0020 /* Select support functions for binary keys */ +#define HASH_FUNCTION 0x0040 /* Set user defined hash function */ +#define HASH_COMPARE 0x0080 /* Set user defined comparison function */ +#define HASH_KEYCOPY 0x0100 /* Set user defined key-copying function */ +#define HASH_ALLOC 0x0200 /* Set memory allocator */ +#define HASH_CONTEXT 0x0400 /* Set memory allocation context */ +#define HASH_SHARED_MEM 0x0800 /* Hashtable is in shared memory */ +#define HASH_ATTACH 0x1000 /* Do not initialize hctl */ +#define HASH_FIXED_SIZE 0x2000 /* Initial size is a hard limit */ /* max_dsize value to indicate expansible directory */ -#define NO_MAX_DSIZE (-1) +#define NO_MAX_DSIZE (-1) /* hash_search operations */ typedef enum { - HASH_FIND, - HASH_ENTER, - HASH_REMOVE, - HASH_ENTER_NULL + HASH_FIND, + HASH_ENTER, + HASH_REMOVE, + HASH_ENTER_NULL } HASHACTION; /* hash_seq status (should be considered an opaque type by callers) */ typedef struct { - HTAB *hashp; - uint32 curBucket; /* index of current bucket */ - HASHELEMENT *curEntry; /* current entry in bucket */ + HTAB *hashp; + uint32 curBucket; /* index of current bucket */ + HASHELEMENT *curEntry; /* current entry in bucket */ } HASH_SEQ_STATUS; /* * prototypes for functions in dynahash.c */ extern HTAB *hash_create(const char *tabname, long nelem, - HASHCTL *info, int flags); + HASHCTL *info, int flags); extern void hash_destroy(HTAB *hashp); extern void hash_stats(const char *where, HTAB *hashp); extern void *hash_search(HTAB *hashp, const void *keyPtr, HASHACTION action, - bool *foundPtr); + bool *foundPtr); extern uint32 get_hash_value(HTAB *hashp, const void *keyPtr); extern void *hash_search_with_hash_value(HTAB *hashp, const void *keyPtr, - uint32 hashvalue, HASHACTION action, - bool *foundPtr); + uint32 hashvalue, HASHACTION action, + bool *foundPtr); extern bool hash_update_hash_key(HTAB *hashp, void *existingEntry, - const void *newKeyPtr); + const void *newKeyPtr); extern long hash_get_num_entries(HTAB *hashp); +extern Size hash_get_entry_size(HTAB *hashp); extern void hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp); extern void *hash_seq_search(HASH_SEQ_STATUS *status); extern void hash_seq_term(HASH_SEQ_STATUS *status); @@ -153,8 +154,8 @@ extern uint32 string_hash(const void *key, Size keysize); extern uint32 tag_hash(const void *key, Size keysize); extern uint32 uint32_hash(const void *key, Size keysize); extern uint32 bitmap_hash(const void *key, Size keysize); -extern int bitmap_match(const void *key1, const void *key2, Size keysize); +extern int bitmap_match(const void *key1, const void *key2, Size keysize); -#define oid_hash uint32_hash /* Remove me eventually */ +#define oid_hash uint32_hash /* Remove me eventually */ -#endif /* HSEARCH_H */ +#endif /* HSEARCH_H */ From 518b65fa4d82542a4247d46ebe1ff8488614b7fa Mon Sep 17 00:00:00 2001 From: yeyukui Date: Thu, 25 Feb 2021 19:17:46 +0800 Subject: [PATCH 329/578] optimize node start slow because crypt table too much --- src/backend/utils/cache/relcryptmap.c | 9 +++++---- src/backend/utils/misc/guc.c | 9 +++++++++ src/backend/utils/misc/mls.c | 4 +++- src/include/utils/relcrypt.h | 1 + 4 files changed, 18 insertions(+), 5 deletions(-) diff --git a/src/backend/utils/cache/relcryptmap.c b/src/backend/utils/cache/relcryptmap.c index c7fb8b63..79eef334 100644 --- a/src/backend/utils/cache/relcryptmap.c +++ b/src/backend/utils/cache/relcryptmap.c @@ -131,8 +131,8 @@ #define REL_CRYPT_MAP_FILENAME "pg_rel_crypt.map" #define REL_CRYPT_MAP_FILEMAGIC 0x952702 /* 
version ID value */ -#define REL_CRYPT_HASHTABLE_MAX_SIZE (1 << 20) -#define REL_CRYPT_HASHTABLE_INIT_SIZE (1 << 11) +#define REL_CRYPT_HASHTABLE_MAX_SIZE ((g_rel_crypt_hash_size > (1 << 20))? g_rel_crypt_hash_size : (1 << 20)) +#define REL_CRYPT_HASHTABLE_INIT_SIZE g_rel_crypt_hash_size #define REL_CRYPT_HASHTABLE_NUM_PARTITIONS 128 @@ -1267,9 +1267,9 @@ static int rel_crypt_hash_key_cmp (const void *key1, const void *key2, Size keys { const RelFileNode *tagPtr1 = key1, *tagPtr2 = key2; - if (tagPtr1->spcNode == tagPtr2->spcNode + if (tagPtr1->relNode == tagPtr2->relNode && tagPtr1->dbNode == tagPtr2->dbNode - && tagPtr1->relNode == tagPtr2->relNode ) + && tagPtr1->spcNode == tagPtr2->spcNode ) return 0; return 1; @@ -1294,6 +1294,7 @@ void rel_crypt_hash_insert(RelFileNode * rnode, AlgoId algo_id, bool write_wal, hashcode, HASH_ENTER, &found); + if (false == found) { relcrypt->algo_id = algo_id; diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 6e3a43dc..ffc6dbd4 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -4488,6 +4488,15 @@ static struct config_int ConfigureNamesInt[] = 32, 4, 64, NULL, NULL, NULL }, + { + {"rel_crypt_hash_size", PGC_POSTMASTER, CUSTOM_OPTIONS, + gettext_noop("Number of rel crypt hash table size, it will influence node start time."), + NULL + }, + &g_rel_crypt_hash_size, + 2048, 2048, INT_MAX, + NULL, NULL, NULL + }, #endif { {"pooler_port", PGC_POSTMASTER, DATA_NODES, diff --git a/src/backend/utils/misc/mls.c b/src/backend/utils/misc/mls.c index e55f36dd..5ff3cf8e 100644 --- a/src/backend/utils/misc/mls.c +++ b/src/backend/utils/misc/mls.c @@ -142,7 +142,7 @@ bool g_enable_data_mask = false; bool g_enable_transparent_crypt = false; bool g_enable_crypt_debug = false; #endif - +int g_rel_crypt_hash_size = 2048; #define MLS_QUERY_STRING_PRUNE_DELIMETER '(' @@ -1619,7 +1619,9 @@ void MlsShmemInit(void) MlsInitFileAccess(); crypt_key_info_load_mapfile(); + elog(LOG, "start rel crypt load mapfile"); rel_crypt_load_mapfile(); + elog(LOG, "end rel crypt load mapfile"); /* after vfd access, rollback all init actions */ MlsCleanFileAccess(); diff --git a/src/include/utils/relcrypt.h b/src/include/utils/relcrypt.h index 399e703b..fc071bad 100644 --- a/src/include/utils/relcrypt.h +++ b/src/include/utils/relcrypt.h @@ -70,6 +70,7 @@ typedef int16 AlgoId; extern bool g_enable_cls; extern bool g_enable_transparent_crypt; extern bool g_enable_crypt_debug; +extern int g_rel_crypt_hash_size; extern int g_checkpoint_crypt_worker; extern int g_checkpoint_crypt_queue_length; From 819d218caaa7d2602a6d65b0de7a481cf5d3def6 Mon Sep 17 00:00:00 2001 From: youngxie Date: Tue, 2 Mar 2021 20:28:45 +0800 Subject: [PATCH 330/578] Perfects database dropping procedure. http://tapd.oa.com/10092131/bugtrace/bugs/view?bug_id=1010092131085665555&url_cache_key=d4e1402777dc733479aac463ad1a9d24 (cherry picked from commit ba4dddc4) 58b03906 Fix comment f51d80ab Fix compile warning. 12d8c778 Perfects database dropping procedure. 
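The drop now happens in two steps. On the coordinator that receives DROP DATABASE, the statement is first re-broadcast as DROP DATABASE PREPARE, a new utility command that runs dropdb_prepare(): it takes the pg_database lock and re-checks every drop constraint (ownership, istemplate, active logical replication slots, other connected backends, subscriptions) on each node without removing anything. Only when that succeeds everywhere does the real dropdb() run. A rough sketch of the coordinator-side dispatch, condensed from the utility.c hunks below (the two snprintf branches of the patch are folded into one; behaviour is the same):

    /* ProcessUtilityPre(): broadcast the validation step first */
    if (IS_PGXC_LOCAL_COORDINATOR && !stmt->prepare)
    {
        snprintf(query, STRINGLENGTH, "DROP DATABASE PREPARE %s%s;",
                 stmt->missing_ok ? "IF EXISTS " : "",
                 quote_identifier(stmt->dbname));
        ExecUtilityStmtOnNodes(parsetree, query, NULL, sentToRemote, false,
                               EXEC_ON_ALL_NODES, false, false);
    }

    /* standard_ProcessUtility(): the PREPARE form only checks and locks */
    if (stmt->prepare)
        dropdb_prepare(stmt->dbname, stmt->missing_ok);
    else
        dropdb(stmt->dbname, stmt->missing_ok);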
http://tapd.oa.com/10092131/bugtrace/bugs/view?bug_id=1010092131085665555&url_cache_key=d4e1402777dc733479aac463ad1a9d24 --- src/backend/commands/dbcommands.c | 125 ++++++++++++++++++++++++++++++ src/backend/nodes/copyfuncs.c | 1 + src/backend/parser/gram.y | 18 +++++ src/backend/tcop/utility.c | 40 +++++++++- src/include/commands/dbcommands.h | 13 ++-- src/include/nodes/parsenodes.h | 1 + 6 files changed, 188 insertions(+), 10 deletions(-) diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c index 6e209fff..ae8c18c1 100644 --- a/src/backend/commands/dbcommands.c +++ b/src/backend/commands/dbcommands.c @@ -815,6 +815,131 @@ createdb_failure_callback(int code, Datum arg) remove_dbtablespaces(fparms->dest_dboid); } +/* + * DROP DATABASE PREPARE + * + * Lock the database and check the constraint in advance. + */ +void +dropdb_prepare(const char *dbname, bool missing_ok) +{ + Oid db_id; + bool db_istemplate; + Relation pgdbrel; + int notherbackends; + int npreparedxacts; + int nslots, + nslots_active; + int nsubscriptions; + + /* + * Look up the target database's OID, and get exclusive lock on it. We + * need this to ensure that no new backend starts up in the target + * database while we are deleting it (see postinit.c), and that no one is + * using it as a CREATE DATABASE template or trying to delete it for + * themselves. + */ + pgdbrel = heap_open(DatabaseRelationId, RowExclusiveLock); + + if (!get_db_info(dbname, AccessExclusiveLock, &db_id, NULL, NULL, + &db_istemplate, NULL, NULL, NULL, NULL, NULL, NULL, NULL)) + { + if (!missing_ok) + { + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_DATABASE), + errmsg("database \"%s\" does not exist", dbname))); + } + else + { + /* Close pg_database, release the lock, since we changed nothing */ + heap_close(pgdbrel, RowExclusiveLock); + ereport(NOTICE, + (errmsg("database \"%s\" does not exist, skipping", + dbname))); + return; + } + } + + /* + * Permission checks + */ + if (!pg_database_ownercheck(db_id, GetUserId())) + aclcheck_error(ACLCHECK_NOT_OWNER, ACL_KIND_DATABASE, + dbname); + + /* DROP hook for the database being removed */ + InvokeObjectDropHook(DatabaseRelationId, db_id, 0); + + /* + * Disallow dropping a DB that is marked istemplate. This is just to + * prevent people from accidentally dropping template0 or template1; they + * can do so if they're really determined ... + */ + if (db_istemplate) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("cannot drop a template database"))); + + /* Obviously can't drop my own database */ + if (db_id == MyDatabaseId) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_IN_USE), + errmsg("cannot drop the currently open database"))); + + /* + * Check whether there are active logical slots that refer to the + * to-be-dropped database. The database lock we are holding prevents the + * creation of new slots using the database or existing slots becoming + * active. + */ + (void) ReplicationSlotsCountDBSlots(db_id, &nslots, &nslots_active); + if (nslots_active) + { + ereport(ERROR, + (errcode(ERRCODE_OBJECT_IN_USE), + errmsg("database \"%s\" is used by an active logical replication slot", + dbname), + errdetail_plural("There is %d active slot", + "There are %d active slots", + nslots_active, nslots_active))); + } + + /* + * Check for other backends in the target database. (Because we hold the + * database lock, no new ones can start after this.) + * + * As in CREATE DATABASE, check this after other error conditions. 
+ */ + if (CountOtherDBBackends(db_id, ¬herbackends, &npreparedxacts)) + { +#ifndef _PG_REGRESS_ + ereport(ERROR, + (errcode(ERRCODE_OBJECT_IN_USE), + errmsg("database \"%s\" is being accessed by other users", + dbname), + errdetail_busy_db(notherbackends, npreparedxacts))); +#else + elog(ERROR, "database \"%s\" is being accessed by other users", dbname); +#endif + } + + /* + * Check if there are subscriptions defined in the target database. + * + * We can't drop them automatically because they might be holding + * resources in other databases/instances. + */ + if ((nsubscriptions = CountDBSubscriptions(db_id)) > 0) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_IN_USE), + errmsg("database \"%s\" is being used by logical replication subscription", + dbname), + errdetail_plural("There is %d subscription.", + "There are %d subscriptions.", + nsubscriptions, nsubscriptions))); + heap_close(pgdbrel, RowExclusiveLock); +} /* * DROP DATABASE diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c index ed04f1d4..6e6e562f 100644 --- a/src/backend/nodes/copyfuncs.c +++ b/src/backend/nodes/copyfuncs.c @@ -4128,6 +4128,7 @@ _copyDropdbStmt(const DropdbStmt *from) COPY_STRING_FIELD(dbname); COPY_SCALAR_FIELD(missing_ok); + COPY_SCALAR_FIELD(prepare); return newnode; } diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y index 4e74a77d..7f8b9e22 100644 --- a/src/backend/parser/gram.y +++ b/src/backend/parser/gram.y @@ -10632,6 +10632,7 @@ DropdbStmt: DROP DATABASE database_name DropdbStmt *n = makeNode(DropdbStmt); n->dbname = $3; n->missing_ok = FALSE; + n->prepare = FALSE; $$ = (Node *)n; } | DROP DATABASE IF_P EXISTS database_name @@ -10639,6 +10640,23 @@ DropdbStmt: DROP DATABASE database_name DropdbStmt *n = makeNode(DropdbStmt); n->dbname = $5; n->missing_ok = TRUE; + n->prepare = FALSE; + $$ = (Node *)n; + } + | DROP DATABASE PREPARE database_name + { + DropdbStmt *n = makeNode(DropdbStmt); + n->dbname = $4; + n->missing_ok = FALSE; + n->prepare = TRUE; + $$ = (Node *)n; + } + | DROP DATABASE PREPARE IF_P EXISTS database_name + { + DropdbStmt *n = makeNode(DropdbStmt); + n->dbname = $6; + n->missing_ok = TRUE; + n->prepare = TRUE; $$ = (Node *)n; } ; diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c index 37139a63..1527823f 100644 --- a/src/backend/tcop/utility.c +++ b/src/backend/tcop/utility.c @@ -643,18 +643,35 @@ ProcessUtilityPre(PlannedStmt *pstmt, break; case T_DropdbStmt: - /* Clean connections before dropping a database on local node */ if (IS_PGXC_LOCAL_COORDINATOR) { DropdbStmt *stmt = (DropdbStmt *) parsetree; - char query[256]; + char query[STRINGLENGTH]; + /* Clean connections before dropping a database on local node */ DropDBCleanConnection(stmt->dbname); /* Clean also remote Coordinators */ - sprintf(query, "CLEAN CONNECTION TO ALL FOR DATABASE %s;", + snprintf(query, STRINGLENGTH, "CLEAN CONNECTION TO ALL FOR DATABASE %s;", quote_identifier(stmt->dbname)); ExecUtilityStmtOnNodes(parsetree, query, NULL, sentToRemote, true, EXEC_ON_ALL_NODES, false, false); + + if (!stmt->prepare) + { + /* Lock database and check the constraints before we actually dropping */ + if (stmt->missing_ok) + { + snprintf(query, STRINGLENGTH, "DROP DATABASE PREPARE IF EXISTS %s;", + quote_identifier(stmt->dbname)); + } + else + { + snprintf(query, STRINGLENGTH, "DROP DATABASE PREPARE %s;", + quote_identifier(stmt->dbname)); + } + ExecUtilityStmtOnNodes(parsetree, query, NULL, sentToRemote, false, + EXEC_ON_ALL_NODES, false, false); + } } break; @@ -2107,12 
+2124,20 @@ standard_ProcessUtility(PlannedStmt *pstmt, { DropdbStmt *stmt = (DropdbStmt *) parsetree; + if (!stmt->prepare) + { /* no event triggers for global objects */ if (IS_PGXC_LOCAL_COORDINATOR) + { PreventTransactionChain(isTopLevel, "DROP DATABASE"); - + } dropdb(stmt->dbname, stmt->missing_ok); } + else + { + dropdb_prepare(stmt->dbname, stmt->missing_ok); + } + } break; /* Query-level asynchronous notification */ @@ -5290,7 +5315,14 @@ CreateCommandTag(Node *parsetree) break; case T_DropdbStmt: + if (((DropdbStmt *) parsetree)->prepare) + { + tag = "DROP DATABASE PREPARE"; + } + else + { tag = "DROP DATABASE"; + } break; case T_NotifyStmt: diff --git a/src/include/commands/dbcommands.h b/src/include/commands/dbcommands.h index 98aa3a6c..cb5844ff 100644 --- a/src/include/commands/dbcommands.h +++ b/src/include/commands/dbcommands.h @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * * dbcommands.h - * Database management commands (create/drop database). + * Database management commands (create/drop database). * * * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group @@ -19,14 +19,15 @@ #include "lib/stringinfo.h" #include "nodes/parsenodes.h" -extern Oid createdb(ParseState *pstate, const CreatedbStmt *stmt); +extern Oid createdb(ParseState *pstate, const CreatedbStmt *stmt); extern void dropdb(const char *dbname, bool missing_ok); +extern void dropdb_prepare(const char *dbname, bool missing_ok); extern ObjectAddress RenameDatabase(const char *oldname, const char *newname); -extern Oid AlterDatabase(ParseState *pstate, AlterDatabaseStmt *stmt, bool isTopLevel); -extern Oid AlterDatabaseSet(AlterDatabaseSetStmt *stmt); +extern Oid AlterDatabase(ParseState *pstate, AlterDatabaseStmt *stmt, bool isTopLevel); +extern Oid AlterDatabaseSet(AlterDatabaseSetStmt *stmt); extern ObjectAddress AlterDatabaseOwner(const char *dbname, Oid newOwnerId); -extern Oid get_database_oid(const char *dbname, bool missingok); +extern Oid get_database_oid(const char *dbname, bool missingok); extern char *get_database_name(Oid dbid); extern void check_encoding_locale_matches(int encoding, const char *collate, const char *ctype); @@ -35,4 +36,4 @@ extern void check_encoding_locale_matches(int encoding, const char *collate, con extern bool IsSetTableSpace(AlterDatabaseStmt *stmt); #endif -#endif /* DBCOMMANDS_H */ +#endif /* DBCOMMANDS_H */ diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h index df2746c9..2c54a436 100644 --- a/src/include/nodes/parsenodes.h +++ b/src/include/nodes/parsenodes.h @@ -3239,6 +3239,7 @@ typedef struct DropdbStmt NodeTag type; char *dbname; /* database to drop */ bool missing_ok; /* skip error if db is missing? 
*/ + bool prepare; /* database drop preparation step */ } DropdbStmt; /* ---------------------- From 2293c35431e0b84989fa285a1359d0e6ac48f05a Mon Sep 17 00:00:00 2001 From: whalesong Date: Fri, 5 Mar 2021 20:28:08 +0800 Subject: [PATCH 331/578] Bugfix: 2pc is rollbacked by pg_clean_execute when waiting for wal sync, ID85257415 (merge request !194) --- contrib/pg_clean/pg_clean.c | 67 +++++++++++++++++++------ doc/src/sgml/ref/commit_prepared.sgml | 11 +++- doc/src/sgml/ref/rollback_prepared.sgml | 11 +++- src/backend/access/transam/twophase.c | 30 +++++++++-- src/backend/parser/gram.y | 14 ++++++ src/backend/tcop/utility.c | 32 ++++++++++++ src/include/access/twophase.h | 2 + src/include/nodes/parsenodes.h | 4 +- 8 files changed, 150 insertions(+), 21 deletions(-) diff --git a/contrib/pg_clean/pg_clean.c b/contrib/pg_clean/pg_clean.c index 4dc898ff..e31394c4 100644 --- a/contrib/pg_clean/pg_clean.c +++ b/contrib/pg_clean/pg_clean.c @@ -84,6 +84,8 @@ PG_MODULE_MAGIC; #define GIDSIZE (200 + 24) #define MAX_TWOPC_TXN 1000 +#define MAX_CMD_LENGTH 120 + #define XIDFOUND 1 #define XIDNOTFOUND -1 #define XIDEXECFAIL -2 @@ -314,7 +316,7 @@ bool check_node_participate(txn_info * txn, int node_idx); void recover2PC(txn_info * txn); TXN_STATUS check_txn_global_status(txn_info *txn); -bool clean_2PC_iscommit(txn_info *txn, bool iscommit); +bool clean_2PC_iscommit(txn_info *txn, bool is_commit, bool is_check); bool clean_2PC_files(txn_info *txn); void Init_print_txn_info(print_txn_info *print_txn); void Init_print_stats_all(print_status *pstatus); @@ -2228,7 +2230,7 @@ Datum pgxc_commit_on_node(PG_FUNCTION_ARGS) Oid nodeoid; char *gid; txn_info *txn; - char command[100]; + char command[MAX_CMD_LENGTH]; PGXCNodeHandle **connections = NULL; int conn_count = 0; ResponseCombiner combiner; @@ -2268,7 +2270,7 @@ Datum pgxc_commit_on_node(PG_FUNCTION_ARGS) strncpy(txn->gid, gid, strlen(gid)+1); getTxnInfoOnOtherNodes(txn); - snprintf(command, 100, "commit prepared '%s'", txn->gid); + snprintf(command, MAX_CMD_LENGTH, "commit prepared '%s'", txn->gid); if (InvalidGlobalTimestamp == txn->global_commit_timestamp) @@ -2334,7 +2336,7 @@ Datum pgxc_abort_on_node(PG_FUNCTION_ARGS) Oid nodeoid; char *gid; txn_info *txn; - char command[100]; + char command[MAX_CMD_LENGTH]; PGXCNodeHandle **connections = NULL; int conn_count = 0; ResponseCombiner combiner; @@ -2375,7 +2377,7 @@ Datum pgxc_abort_on_node(PG_FUNCTION_ARGS) strncpy(txn->gid, gid, strlen(gid)+1); connections = (PGXCNodeHandle**)palloc(sizeof(PGXCNodeHandle*)); getTxnInfoOnOtherNodes(txn); - snprintf(command, 100, "rollback prepared '%s'", txn->gid); + snprintf(command, MAX_CMD_LENGTH, "rollback prepared '%s'", txn->gid); #if 0 if (!setMaintenanceMode(true)) { @@ -2617,7 +2619,15 @@ void recover2PC(txn_info * txn) else { txn->op = COMMIT; - if (!clean_2PC_iscommit(txn, true)) + /* check whether all nodes can commit prepared */ + if (!clean_2PC_iscommit(txn, true, true)) + { + txn->op_issuccess = false; + elog(LOG, "check commit 2PC transaction %s failed", txn->gid); + return; + } + /* send commit prepared to all nodes */ + if (!clean_2PC_iscommit(txn, true, false)) { txn->op_issuccess = false; elog(LOG, "commit 2PC transaction %s failed", txn->gid); @@ -2630,7 +2640,15 @@ void recover2PC(txn_info * txn) case TXN_STATUS_ABORTED: txn->op = ABORT; - if (!clean_2PC_iscommit(txn, false)) + /* check whether all nodes can rollback prepared */ + if (!clean_2PC_iscommit(txn, false, true)) + { + txn->op_issuccess = false; + elog(LOG, "check rollback 2PC transaction %s 
failed", txn->gid); + return; + } + /* send rollback prepared to all nodes */ + if (!clean_2PC_iscommit(txn, false, false)) { txn->op_issuccess = false; elog(LOG, "rollback 2PC transaction %s failed", txn->gid); @@ -2791,11 +2809,12 @@ TXN_STATUS check_txn_global_status(txn_info *txn) return TXN_STATUS_ABORTED; } -bool clean_2PC_iscommit(txn_info *txn, bool iscommit) +bool clean_2PC_iscommit(txn_info *txn, bool is_commit, bool is_check) { int ii; static const char *STMT_FORM = "%s prepared '%s';"; - char command[100]; + static const char *STMT_FORM_CHECK = "%s prepared '%s' for check only;"; + char command[MAX_CMD_LENGTH]; int node_idx; Oid node_oid; PGXCNodeHandle **connections = NULL; @@ -2803,11 +2822,29 @@ bool clean_2PC_iscommit(txn_info *txn, bool iscommit) ResponseCombiner combiner; PGXCNodeAllHandles *pgxc_handles = NULL; - if (iscommit) - snprintf(command, 100, STMT_FORM, "commit", txn->gid); + if (is_commit) + { + if (is_check) + { + snprintf(command, MAX_CMD_LENGTH, STMT_FORM_CHECK, "commit", txn->gid); + } else - snprintf(command, 100, STMT_FORM, "rollback", txn->gid); - if (iscommit && InvalidGlobalTimestamp == txn->global_commit_timestamp) + { + snprintf(command, MAX_CMD_LENGTH, STMT_FORM, "commit", txn->gid); + } + } + else + { + if (is_check) + { + snprintf(command, MAX_CMD_LENGTH, STMT_FORM_CHECK, "rollback", txn->gid); + } + else + { + snprintf(command, MAX_CMD_LENGTH, STMT_FORM, "rollback", txn->gid); + } + } + if (is_commit && InvalidGlobalTimestamp == txn->global_commit_timestamp) { elog(ERROR, "twophase transaction '%s' has InvalidGlobalCommitTimestamp", txn->gid); } @@ -2986,9 +3023,9 @@ bool clean_2PC_files(txn_info * txn) TupleTableSlots result; bool issuccess = true; static const char *STMT_FORM = "select pgxc_remove_2pc_records('%s')::text"; - char query[100]; + char query[MAX_CMD_LENGTH]; - snprintf(query, 100, STMT_FORM, txn->gid); + snprintf(query, MAX_CMD_LENGTH, STMT_FORM, txn->gid); for (ii = 0; ii < dn_nodes_num; ii++) { diff --git a/doc/src/sgml/ref/commit_prepared.sgml b/doc/src/sgml/ref/commit_prepared.sgml index 58438f99..cad0a868 100644 --- a/doc/src/sgml/ref/commit_prepared.sgml +++ b/doc/src/sgml/ref/commit_prepared.sgml @@ -21,7 +21,7 @@ PostgreSQL documentation -COMMIT PREPARED transaction_id +COMMIT PREPARED transaction_id [ FOR CHECK ONLY ] @@ -47,6 +47,15 @@ COMMIT PREPARED transaction_id + + + FOR CHECK ONLY + + + Check whether a prepared transaction can be committed. + + + diff --git a/doc/src/sgml/ref/rollback_prepared.sgml b/doc/src/sgml/ref/rollback_prepared.sgml index 141c77b1..c1835358 100644 --- a/doc/src/sgml/ref/rollback_prepared.sgml +++ b/doc/src/sgml/ref/rollback_prepared.sgml @@ -21,7 +21,7 @@ PostgreSQL documentation -ROLLBACK PREPARED transaction_id +ROLLBACK PREPARED transaction_id [ FOR CHECK ONLY ] @@ -47,6 +47,15 @@ ROLLBACK PREPARED transaction_id + + + FOR CHECK ONLY + + + Check whether a prepared transaction can be rollbacked. + + + diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c index 28197d77..33ce87ad 100644 --- a/src/backend/access/transam/twophase.c +++ b/src/backend/access/transam/twophase.c @@ -149,7 +149,7 @@ static GlobalTransaction LookupGXact(const char *gid, Oid user); static GlobalTransaction -LockGXact(const char *gid, Oid user); +LockGXact(const char *gid, Oid user, bool is_check); @@ -763,7 +763,7 @@ LookupGXact(const char *gid, Oid user) * Locate the prepared transaction and mark it busy for COMMIT or PREPARE. 
*/ static GlobalTransaction -LockGXact(const char *gid, Oid user) +LockGXact(const char *gid, Oid user, bool is_check) {// #lizard forgives int i; @@ -812,9 +812,12 @@ LockGXact(const char *gid, Oid user) errmsg("prepared transaction belongs to another database"), errhint("Connect to the database where the transaction was prepared to finish it."))); + if (!is_check) + { /* OK for me to lock it */ gxact->locking_backend = MyBackendId; MyLockedGxact = gxact; + } LWLockRelease(TwoPhaseStateLock); @@ -1710,6 +1713,27 @@ StandbyTransactionIdIsPrepared(TransactionId xid) return result; } +/* + * CheckPreparedTransactionLock: Check whether the prepared transaction + * can be rollbacked + */ +void +CheckPreparedTransactionLock(const char *gid) +{ + GlobalTransaction gxact = LockGXact(gid, GetUserId(), true); + if (enable_distri_print) + { + if (gxact == NULL) + { + elog(LOG, "prepared gid %s gxact is NULL.", gid); + } + else + { + elog(LOG, "prepared gid %s gxact xid %d.", gid, gxact->xid); + } + } +} + /* * FinishPreparedTransaction: execute COMMIT PREPARED or ROLLBACK PREPARED */ @@ -1821,7 +1845,7 @@ FinishPreparedTransaction(const char *gid, bool isCommit) * Validate the GID, and lock the GXACT to ensure that two backends do not * try to commit the same GID at once. */ - gxact = LockGXact(gid, GetUserId()); + gxact = LockGXact(gid, GetUserId(), false); #ifdef PGXC /* * LockGXact returns NULL if this node does not contain given two-phase diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y index 7f8b9e22..2f34a131 100644 --- a/src/backend/parser/gram.y +++ b/src/backend/parser/gram.y @@ -10370,6 +10370,20 @@ TransactionStmt: n->gid = $3; $$ = (Node *)n; } + | COMMIT PREPARED Sconst FOR CHECK ONLY + { + TransactionStmt *n = makeNode(TransactionStmt); + n->kind = TRANS_STMT_COMMIT_PREPARED_CHECK; + n->gid = $3; + $$ = (Node *)n; + } + | ROLLBACK PREPARED Sconst FOR CHECK ONLY + { + TransactionStmt *n = makeNode(TransactionStmt); + n->kind = TRANS_STMT_ROLLBACK_PREPARED_CHECK; + n->gid = $3; + $$ = (Node *)n; + } ; opt_transaction: WORK {} diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c index 1527823f..29409d44 100644 --- a/src/backend/tcop/utility.c +++ b/src/backend/tcop/utility.c @@ -602,6 +602,18 @@ ProcessUtilityPre(PlannedStmt *pstmt, } break; + case TRANS_STMT_COMMIT_PREPARED_CHECK: + PreventTransactionChain(isTopLevel, "COMMIT PREPARED CHECK"); + PreventCommandDuringRecovery("COMMIT PREPARED CHECK"); + elog(LOG, "COMMIT PREPARED %s FOR CHECK ONLY", stmt->gid); + break; + + case TRANS_STMT_ROLLBACK_PREPARED_CHECK: + PreventTransactionChain(isTopLevel, "ROLLBACK PREPARED CHECK"); + PreventCommandDuringRecovery("ROLLBACK PREPARED CHECK"); + elog(LOG, "ROLLBACK PREPARED %s FOR CHECK ONLY", stmt->gid); + break; + case TRANS_STMT_ROLLBACK: break; @@ -1974,6 +1986,18 @@ standard_ProcessUtility(PlannedStmt *pstmt, FinishPreparedTransaction(stmt->gid, false); break; + case TRANS_STMT_COMMIT_PREPARED_CHECK: + PreventTransactionChain(isTopLevel, "COMMIT PREPARED CHECK"); + PreventCommandDuringRecovery("COMMIT PREPARED CHECK"); + CheckPreparedTransactionLock(stmt->gid); + break; + + case TRANS_STMT_ROLLBACK_PREPARED_CHECK: + PreventTransactionChain(isTopLevel, "ROLLBACK PREPARED CHECK"); + PreventCommandDuringRecovery("ROLLBACK PREPARED CHECK"); + CheckPreparedTransactionLock(stmt->gid); + break; + case TRANS_STMT_ROLLBACK: UserAbortTransactionBlock(); break; @@ -4946,6 +4970,14 @@ CreateCommandTag(Node *parsetree) tag = "ROLLBACK PREPARED"; break; + case 
TRANS_STMT_COMMIT_PREPARED_CHECK: + tag = "COMMIT PREPARED CHECK"; + break; + + case TRANS_STMT_ROLLBACK_PREPARED_CHECK: + tag = "ROLLBACK PREPARED CHECK"; + break; + default: tag = "???"; break; diff --git a/src/include/access/twophase.h b/src/include/access/twophase.h index 4f12b674..8a4831b0 100644 --- a/src/include/access/twophase.h +++ b/src/include/access/twophase.h @@ -122,6 +122,8 @@ extern void CheckPointTwoPhase(XLogRecPtr redo_horizon); extern void FinishPreparedTransaction(const char *gid, bool isCommit); +extern void CheckPreparedTransactionLock(const char *gid); + extern void PrepareRedoAdd(char *buf, XLogRecPtr start_lsn, XLogRecPtr end_lsn); extern void PrepareRedoRemove(TransactionId xid, bool giveWarning); diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h index 2c54a436..57111155 100644 --- a/src/include/nodes/parsenodes.h +++ b/src/include/nodes/parsenodes.h @@ -3110,7 +3110,9 @@ typedef enum TransactionStmtKind TRANS_STMT_ROLLBACK_PREPARED, TRANS_STMT_BEGIN_SUBTXN, TRANS_STMT_ROLLBACK_SUBTXN, - TRANS_STMT_COMMIT_SUBTXN + TRANS_STMT_COMMIT_SUBTXN, + TRANS_STMT_COMMIT_PREPARED_CHECK, + TRANS_STMT_ROLLBACK_PREPARED_CHECK } TransactionStmtKind; typedef struct TransactionStmt From d94f0215a9ed34d3227a04e5c765e1445e0c183d Mon Sep 17 00:00:00 2001 From: bethding Date: Mon, 8 Mar 2021 12:32:37 +0800 Subject: [PATCH 332/578] snyc dynamic shared memory from pg --- src/backend/utils/mmgr/dsa.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/backend/utils/mmgr/dsa.c b/src/backend/utils/mmgr/dsa.c index f7f11c06..9a6d036d 100644 --- a/src/backend/utils/mmgr/dsa.c +++ b/src/backend/utils/mmgr/dsa.c @@ -2255,6 +2255,7 @@ check_for_freed_segments(dsa_area *area) LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE); check_for_freed_segments_locked(area); LWLockRelease(DSA_AREA_LOCK(area)); + area->freed_segment_counter = freed_segment_counter; } } From 76b08dbff66e362b1180e5cff19a23aea56dc79a Mon Sep 17 00:00:00 2001 From: whalesong Date: Sat, 20 Feb 2021 11:56:08 +0800 Subject: [PATCH 333/578] Bugfix: dn core in core FinishPreparedTransaction, ID85239005 (merge request !198) --- src/backend/access/transam/twophase.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c index 33ce87ad..ebe93238 100644 --- a/src/backend/access/transam/twophase.c +++ b/src/backend/access/transam/twophase.c @@ -1911,9 +1911,25 @@ FinishPreparedTransaction(const char *gid, bool isCommit) * to disk if for some reason they have lived for a long time. 
*/ if (gxact->ondisk) + { buf = ReadTwoPhaseFile(xid, true); + if (NULL == buf) + { + ereport(PANIC, + (errcode_for_file_access(), + errmsg("read two-phase file failed, gid: %s", gid))); + } + } else + { XlogReadTwoPhaseData(gxact->prepare_start_lsn, &buf, NULL); + if (NULL == buf) + { + ereport(PANIC, + (errcode_for_file_access(), + errmsg("read two-phase data from xlog failed, gid: %s", gid))); + } + } /* From 331adf0463e14a65d9f77354e4d0d4b7001f5684 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Mon, 8 Mar 2021 16:29:14 +0800 Subject: [PATCH 334/578] fix http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131080533479 (merge request !199) --- src/gtm/main/gtm_standby.c | 959 +++++++++++++++++----------------- src/gtm/main/main.c | 94 +++- src/include/gtm/gtm_standby.h | 7 + 3 files changed, 558 insertions(+), 502 deletions(-) diff --git a/src/gtm/main/gtm_standby.c b/src/gtm/main/gtm_standby.c index 06fa54b1..e84cca5d 100644 --- a/src/gtm/main/gtm_standby.c +++ b/src/gtm/main/gtm_standby.c @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * * gtm_standby.c - * Functionalities of GTM Standby + * Functionalities of GTM Standby * * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California @@ -9,7 +9,7 @@ * * * IDENTIFICATION - * src/gtm/common/gtm_standby.c + * src/gtm/common/gtm_standby.c * *------------------------------------------------------------------------- */ @@ -47,7 +47,7 @@ static char standbyNodeName[NI_MAXHOST]; static int standbyPortNumber; static char *standbyDataDir; extern char *NodeName; -extern int GTMPortNumber; +extern int GTMPortNumber; #ifndef __XLOG__ static GTM_Conn *gtm_standby_connect_to_standby_int(int *report_needed); @@ -64,214 +64,215 @@ extern int GTM_Standby_Connetion_Timeout; int gtm_standby_start_startup(void) { - GTM_ActiveConn = gtm_standby_connectToActiveGTM(); - if (GTM_ActiveConn == NULL || GTMPQstatus(GTM_ActiveConn) != CONNECTION_OK) - { - int save_errno = errno; - if(GTM_ActiveConn) - elog(ERROR, "can not connect to GTM: %s %m", GTMPQerrorMessage(GTM_ActiveConn)); - else - elog(ERROR, "connection is null: %m"); - - errno = save_errno; - if(GTM_ActiveConn) - GTMPQfinish(GTM_ActiveConn); - } - - elog(LOG, "Connection established to the GTM active."); - - return 1; + GTM_ActiveConn = gtm_standby_connectToActiveGTM(); + if (GTM_ActiveConn == NULL || GTMPQstatus(GTM_ActiveConn) != CONNECTION_OK) + { + int save_errno = errno; + if(GTM_ActiveConn) + elog(LOG, "can not connect to GTM: %s %m", GTMPQerrorMessage(GTM_ActiveConn)); + else + elog(LOG, "connection is null: %m"); + + errno = save_errno; + if(GTM_ActiveConn) + GTMPQfinish(GTM_ActiveConn); + return 0; + } + + elog(LOG, "Connection established to the GTM active."); + + return 1; } int gtm_standby_finish_startup(void) { - elog(DEBUG1, "Closing a startup connection..."); + elog(DEBUG1, "Closing a startup connection..."); - GTMPQfinish(GTM_ActiveConn); - GTM_ActiveConn = NULL; + GTMPQfinish(GTM_ActiveConn); + GTM_ActiveConn = NULL; - elog(DEBUG1, "A startup connection closed."); - return 1; + elog(DEBUG1, "A startup connection closed."); + return 1; } int gtm_standby_restore_next_gxid(void) { - GlobalTransactionId next_gxid = InvalidGlobalTransactionId; -#ifdef __TBASE__ - next_gxid = get_next_gxid(GTM_ActiveConn); - GTM_RestoreStoreInfo(next_gxid, true); + GlobalTransactionId next_gxid = InvalidGlobalTransactionId; +#ifdef __TBASE__ + next_gxid = get_next_gxid(GTM_ActiveConn); + 
GTM_RestoreStoreInfo(next_gxid, true); #else - next_gxid = get_next_gxid(GTM_ActiveConn); - GTM_RestoreTxnInfo(NULL, next_gxid, NULL, true); + next_gxid = get_next_gxid(GTM_ActiveConn); + GTM_RestoreTxnInfo(NULL, next_gxid, NULL, true); #endif - elog(DEBUG1, "Restoring the next GXID done."); - return 1; + elog(DEBUG1, "Restoring the next GXID done."); + return 1; } int gtm_standby_restore_sequence(void) { #ifndef __TBASE__ - GTM_SeqInfo *seq_list; - int num_seq; - int i; - - /* - * Restore sequence data. - */ - num_seq = get_sequence_list(GTM_ActiveConn, &seq_list); - - for (i = 0; i < num_seq; i++) - { - GTM_SeqRestore(seq_list[i].gs_key, - seq_list[i].gs_increment_by, - seq_list[i].gs_min_value, - seq_list[i].gs_max_value, - seq_list[i].gs_init_value, - seq_list[i].gs_value, - seq_list[i].gs_state, - seq_list[i].gs_cycle, - seq_list[i].gs_called); - } - - elog(DEBUG1, "Restoring sequences done."); + GTM_SeqInfo *seq_list; + int num_seq; + int i; + + /* + * Restore sequence data. + */ + num_seq = get_sequence_list(GTM_ActiveConn, &seq_list); + + for (i = 0; i < num_seq; i++) + { + GTM_SeqRestore(seq_list[i].gs_key, + seq_list[i].gs_increment_by, + seq_list[i].gs_min_value, + seq_list[i].gs_max_value, + seq_list[i].gs_init_value, + seq_list[i].gs_value, + seq_list[i].gs_state, + seq_list[i].gs_cycle, + seq_list[i].gs_called); + } + + elog(DEBUG1, "Restoring sequences done."); #endif - return 1; + return 1; } int gtm_standby_restore_gxid(void) { #ifndef __TBASE__ - int num_txn; - GTM_Transactions txn; - int i; - /* - * Restore gxid data. - */ - num_txn = get_txn_gxid_list(GTM_ActiveConn, &txn); - - GTM_RWLockAcquire(>MTransactions.gt_XidGenLock, GTM_LOCKMODE_WRITE); - GTM_RWLockAcquire(>MTransactions.gt_TransArrayLock, GTM_LOCKMODE_WRITE); - - GTMTransactions.gt_txn_count = txn.gt_txn_count; - GTMTransactions.gt_gtm_state = txn.gt_gtm_state; - GTMTransactions.gt_nextXid = txn.gt_nextXid; - GTMTransactions.gt_oldestXid = txn.gt_oldestXid; - GTMTransactions.gt_xidVacLimit = txn.gt_xidVacLimit; - GTMTransactions.gt_xidWarnLimit = txn.gt_xidWarnLimit; - GTMTransactions.gt_xidStopLimit = txn.gt_xidStopLimit; - GTMTransactions.gt_xidWrapLimit = txn.gt_xidWrapLimit; - GTMTransactions.gt_latestCompletedXid = txn.gt_latestCompletedXid; - GTMTransactions.gt_recent_global_xmin = txn.gt_recent_global_xmin; - GTMTransactions.gt_lastslot = txn.gt_lastslot; - - for (i = 0; i < num_txn; i++) - { - int handle = txn.gt_transactions_array[i].gti_handle; - - GTMTransactions.gt_transactions_array[handle].gti_handle = txn.gt_transactions_array[i].gti_handle; - - GTMTransactions.gt_transactions_array[handle].gti_client_id = txn.gt_transactions_array[i].gti_client_id; - GTMTransactions.gt_transactions_array[handle].gti_in_use = txn.gt_transactions_array[i].gti_in_use; - GTMTransactions.gt_transactions_array[handle].gti_gxid = txn.gt_transactions_array[i].gti_gxid; - GTMTransactions.gt_transactions_array[handle].gti_state = txn.gt_transactions_array[i].gti_state; - GTMTransactions.gt_transactions_array[handle].gti_xmin = txn.gt_transactions_array[i].gti_xmin; - GTMTransactions.gt_transactions_array[handle].gti_isolevel = txn.gt_transactions_array[i].gti_isolevel; - GTMTransactions.gt_transactions_array[handle].gti_readonly = txn.gt_transactions_array[i].gti_readonly; - GTMTransactions.gt_transactions_array[handle].gti_proxy_client_id = txn.gt_transactions_array[i].gti_proxy_client_id; - - if (txn.gt_transactions_array[i].nodestring == NULL ) - GTMTransactions.gt_transactions_array[handle].nodestring = NULL; - else 
- GTMTransactions.gt_transactions_array[handle].nodestring = txn.gt_transactions_array[i].nodestring; - - /* GID */ - if (txn.gt_transactions_array[i].gti_gid == NULL ) - GTMTransactions.gt_transactions_array[handle].gti_gid = NULL; - else - GTMTransactions.gt_transactions_array[handle].gti_gid = txn.gt_transactions_array[i].gti_gid; - - /* copy GTM_SnapshotData */ - GTMTransactions.gt_transactions_array[handle].gti_current_snapshot.sn_xmin = - txn.gt_transactions_array[i].gti_current_snapshot.sn_xmin; - GTMTransactions.gt_transactions_array[handle].gti_current_snapshot.sn_xmax = - txn.gt_transactions_array[i].gti_current_snapshot.sn_xmax; - GTMTransactions.gt_transactions_array[handle].gti_current_snapshot.sn_xcnt = - txn.gt_transactions_array[i].gti_current_snapshot.sn_xcnt; - GTMTransactions.gt_transactions_array[handle].gti_current_snapshot.sn_xip = - txn.gt_transactions_array[i].gti_current_snapshot.sn_xip; - /* end of copying GTM_SnapshotData */ - - GTMTransactions.gt_transactions_array[handle].gti_snapshot_set = - txn.gt_transactions_array[i].gti_snapshot_set; - GTMTransactions.gt_transactions_array[handle].gti_vacuum = - txn.gt_transactions_array[i].gti_vacuum; - - /* - * Is this correct? Is GTM_TXN_COMMITTED transaction categorized as "open"? - */ - if (GTMTransactions.gt_transactions_array[handle].gti_state != GTM_TXN_ABORTED) - { - GTMTransactions.gt_open_transactions = - gtm_lappend(GTMTransactions.gt_open_transactions, - >MTransactions.gt_transactions_array[handle]); - } - } - - dump_transactions_elog(>MTransactions, num_txn); - - GTM_RWLockRelease(>MTransactions.gt_TransArrayLock); - GTM_RWLockRelease(>MTransactions.gt_XidGenLock); - - elog(DEBUG1, "Restoring %d gxid(s) done.", num_txn); + int num_txn; + GTM_Transactions txn; + int i; + /* + * Restore gxid data. 
+ */ + num_txn = get_txn_gxid_list(GTM_ActiveConn, &txn); + + GTM_RWLockAcquire(>MTransactions.gt_XidGenLock, GTM_LOCKMODE_WRITE); + GTM_RWLockAcquire(>MTransactions.gt_TransArrayLock, GTM_LOCKMODE_WRITE); + + GTMTransactions.gt_txn_count = txn.gt_txn_count; + GTMTransactions.gt_gtm_state = txn.gt_gtm_state; + GTMTransactions.gt_nextXid = txn.gt_nextXid; + GTMTransactions.gt_oldestXid = txn.gt_oldestXid; + GTMTransactions.gt_xidVacLimit = txn.gt_xidVacLimit; + GTMTransactions.gt_xidWarnLimit = txn.gt_xidWarnLimit; + GTMTransactions.gt_xidStopLimit = txn.gt_xidStopLimit; + GTMTransactions.gt_xidWrapLimit = txn.gt_xidWrapLimit; + GTMTransactions.gt_latestCompletedXid = txn.gt_latestCompletedXid; + GTMTransactions.gt_recent_global_xmin = txn.gt_recent_global_xmin; + GTMTransactions.gt_lastslot = txn.gt_lastslot; + + for (i = 0; i < num_txn; i++) + { + int handle = txn.gt_transactions_array[i].gti_handle; + + GTMTransactions.gt_transactions_array[handle].gti_handle = txn.gt_transactions_array[i].gti_handle; + + GTMTransactions.gt_transactions_array[handle].gti_client_id = txn.gt_transactions_array[i].gti_client_id; + GTMTransactions.gt_transactions_array[handle].gti_in_use = txn.gt_transactions_array[i].gti_in_use; + GTMTransactions.gt_transactions_array[handle].gti_gxid = txn.gt_transactions_array[i].gti_gxid; + GTMTransactions.gt_transactions_array[handle].gti_state = txn.gt_transactions_array[i].gti_state; + GTMTransactions.gt_transactions_array[handle].gti_xmin = txn.gt_transactions_array[i].gti_xmin; + GTMTransactions.gt_transactions_array[handle].gti_isolevel = txn.gt_transactions_array[i].gti_isolevel; + GTMTransactions.gt_transactions_array[handle].gti_readonly = txn.gt_transactions_array[i].gti_readonly; + GTMTransactions.gt_transactions_array[handle].gti_proxy_client_id = txn.gt_transactions_array[i].gti_proxy_client_id; + + if (txn.gt_transactions_array[i].nodestring == NULL ) + GTMTransactions.gt_transactions_array[handle].nodestring = NULL; + else + GTMTransactions.gt_transactions_array[handle].nodestring = txn.gt_transactions_array[i].nodestring; + + /* GID */ + if (txn.gt_transactions_array[i].gti_gid == NULL ) + GTMTransactions.gt_transactions_array[handle].gti_gid = NULL; + else + GTMTransactions.gt_transactions_array[handle].gti_gid = txn.gt_transactions_array[i].gti_gid; + + /* copy GTM_SnapshotData */ + GTMTransactions.gt_transactions_array[handle].gti_current_snapshot.sn_xmin = + txn.gt_transactions_array[i].gti_current_snapshot.sn_xmin; + GTMTransactions.gt_transactions_array[handle].gti_current_snapshot.sn_xmax = + txn.gt_transactions_array[i].gti_current_snapshot.sn_xmax; + GTMTransactions.gt_transactions_array[handle].gti_current_snapshot.sn_xcnt = + txn.gt_transactions_array[i].gti_current_snapshot.sn_xcnt; + GTMTransactions.gt_transactions_array[handle].gti_current_snapshot.sn_xip = + txn.gt_transactions_array[i].gti_current_snapshot.sn_xip; + /* end of copying GTM_SnapshotData */ + + GTMTransactions.gt_transactions_array[handle].gti_snapshot_set = + txn.gt_transactions_array[i].gti_snapshot_set; + GTMTransactions.gt_transactions_array[handle].gti_vacuum = + txn.gt_transactions_array[i].gti_vacuum; + + /* + * Is this correct? Is GTM_TXN_COMMITTED transaction categorized as "open"? 
+ */ + if (GTMTransactions.gt_transactions_array[handle].gti_state != GTM_TXN_ABORTED) + { + GTMTransactions.gt_open_transactions = + gtm_lappend(GTMTransactions.gt_open_transactions, + >MTransactions.gt_transactions_array[handle]); + } + } + + dump_transactions_elog(>MTransactions, num_txn); + + GTM_RWLockRelease(>MTransactions.gt_TransArrayLock); + GTM_RWLockRelease(>MTransactions.gt_XidGenLock); + + elog(DEBUG1, "Restoring %d gxid(s) done.", num_txn); #endif - return 1; + return 1; } int gtm_standby_restore_node(void) { - GTM_PGXCNodeInfo *data; - int rc, i; - int num_node; - - elog(LOG, "Copying node information from the GTM active..."); - - data = (GTM_PGXCNodeInfo *) malloc(sizeof(GTM_PGXCNodeInfo) * 128); - memset(data, 0, sizeof(GTM_PGXCNodeInfo) * 128); - - rc = get_node_list(GTM_ActiveConn, data, 128); - if (rc < 0) - { - elog(DEBUG3, "get_node_list() failed."); - rc = 0; - goto finished; - } - - num_node = rc; - - for (i = 0; i < num_node; i++) - { - elog(DEBUG1, "get_node_list: nodetype=%d, nodename=%s, datafolder=%s", - data[i].type, data[i].nodename, data[i].datafolder); - if (Recovery_PGXCNodeRegister(data[i].type, data[i].nodename, data[i].port, - data[i].proxyname, data[i].status, - data[i].ipaddress, data[i].datafolder, true, - -1 /* dummy socket */, false) != 0) - { - rc = 0; - goto finished; - } - } - - elog(LOG, "Copying node information from GTM active done."); - -finished: - free(data); - return rc; + GTM_PGXCNodeInfo *data; + int rc, i; + int num_node; + + elog(LOG, "Copying node information from the GTM active..."); + + data = (GTM_PGXCNodeInfo *) malloc(sizeof(GTM_PGXCNodeInfo) * 128); + memset(data, 0, sizeof(GTM_PGXCNodeInfo) * 128); + + rc = get_node_list(GTM_ActiveConn, data, 128); + if (rc < 0) + { + elog(DEBUG3, "get_node_list() failed."); + rc = 0; + goto finished; + } + + num_node = rc; + + for (i = 0; i < num_node; i++) + { + elog(DEBUG1, "get_node_list: nodetype=%d, nodename=%s, datafolder=%s", + data[i].type, data[i].nodename, data[i].datafolder); + if (Recovery_PGXCNodeRegister(data[i].type, data[i].nodename, data[i].port, + data[i].proxyname, data[i].status, + data[i].ipaddress, data[i].datafolder, true, + -1 /* dummy socket */, false) != 0) + { + rc = 0; + goto finished; + } + } + + elog(LOG, "Copying node information from GTM active done."); + +finished: + free(data); + return rc; } /* @@ -285,54 +286,54 @@ gtm_standby_restore_node(void) int gtm_standby_register_self(const char *node_name, int port, const char *datadir) { - int rc; + int rc; #ifdef __XLOG__ static char *s_node_name = NULL; - static int s_port = 0; - static char *s_datadir = NULL; - - static bool init = false; - - if(init == false) - { - s_node_name = strdup(node_name); - s_datadir = strdup(datadir); - s_port = port; - - init = true; - } - else - { - node_name = s_node_name; - port = s_port; - datadir = s_datadir; - } + static int s_port = 0; + static char *s_datadir = NULL; + + static bool init = false; + + if(init == false) + { + s_node_name = strdup(node_name); + s_datadir = strdup(datadir); + s_port = port; + + init = true; + } + else + { + node_name = s_node_name; + port = s_port; + datadir = s_datadir; + } #endif - elog(DEBUG8, "Registering standby-GTM status..."); + elog(DEBUG8, "Registering standby-GTM status..."); - node_get_local_addr(GTM_ActiveConn, standbyHostName, sizeof(standbyNodeName), &rc); + node_get_local_addr(GTM_ActiveConn, standbyHostName, sizeof(standbyNodeName), &rc); - memset(standbyNodeName, 0, NI_MAXHOST); - strncpy(standbyNodeName, node_name, NI_MAXHOST - 
1); - standbyPortNumber = port; - standbyDataDir= (char *)datadir; - elog(LOG, "register standbyhostname %s, port number %d node name %s datadir %s", - standbyHostName, standbyPortNumber, standbyNodeName, standbyDataDir); - rc = node_register_internal(GTM_ActiveConn, GTM_NODE_GTM, standbyHostName, standbyPortNumber, - standbyNodeName, standbyDataDir, - NODE_DISCONNECTED); - if (rc < 0) - { - elog(DEBUG1, "Failed to register a standby-GTM status."); + memset(standbyNodeName, 0, NI_MAXHOST); + strncpy(standbyNodeName, node_name, NI_MAXHOST - 1); + standbyPortNumber = port; + standbyDataDir= (char *)datadir; + elog(LOG, "register standbyhostname %s, port number %d node name %s datadir %s", + standbyHostName, standbyPortNumber, standbyNodeName, standbyDataDir); + rc = node_register_internal(GTM_ActiveConn, GTM_NODE_GTM, standbyHostName, standbyPortNumber, + standbyNodeName, standbyDataDir, + NODE_DISCONNECTED); + if (rc < 0) + { + elog(DEBUG1, "Failed to register a standby-GTM status."); - return 0; - } + return 0; + } - elog(DEBUG1, "Registering standby-GTM done."); - - return 1; + elog(DEBUG1, "Registering standby-GTM done."); + + return 1; } /* @@ -343,31 +344,31 @@ gtm_standby_register_self(const char *node_name, int port, const char *datadir) int gtm_standby_activate_self(void) { - int rc; - - elog(DEBUG1, "Updating the standby-GTM status to \"CONNECTED\"..."); - - rc = node_unregister(GTM_ActiveConn, GTM_NODE_GTM, standbyNodeName); - if (rc < 0) - { - elog(DEBUG1, "Failed to unregister old standby-GTM status."); - return 0; - } - elog(LOG, "register standbyhostname %s, port number %d node name %s datadir %s", - standbyHostName, standbyPortNumber, standbyNodeName, standbyDataDir); - rc = node_register_internal(GTM_ActiveConn, GTM_NODE_GTM, standbyHostName, standbyPortNumber, - standbyNodeName, standbyDataDir, - NODE_CONNECTED); - - if (rc < 0) - { - elog(DEBUG1, "Failed to register a new standby-GTM status."); - return 0; - } - - elog(DEBUG1, "Updating the standby-GTM status done."); - - return 1; + int rc; + + elog(DEBUG1, "Updating the standby-GTM status to \"CONNECTED\"..."); + + rc = node_unregister(GTM_ActiveConn, GTM_NODE_GTM, standbyNodeName); + if (rc < 0) + { + elog(DEBUG1, "Failed to unregister old standby-GTM status."); + return 0; + } + elog(LOG, "register standbyhostname %s, port number %d node name %s datadir %s", + standbyHostName, standbyPortNumber, standbyNodeName, standbyDataDir); + rc = node_register_internal(GTM_ActiveConn, GTM_NODE_GTM, standbyHostName, standbyPortNumber, + standbyNodeName, standbyDataDir, + NODE_CONNECTED); + + if (rc < 0) + { + elog(DEBUG1, "Failed to register a new standby-GTM status."); + return 0; + } + + elog(DEBUG1, "Updating the standby-GTM status done."); + + return 1; } @@ -380,35 +381,35 @@ gtm_standby_activate_self(void) GTM_PGXCNodeInfo * find_standby_node_info(void) { - GTM_PGXCNodeInfo *node[1024]; - size_t n; - int i; - - n = pgxcnode_find_by_type(GTM_NODE_GTM, node, 1024); - - for (i = 0 ; i < n ; i++) - { - elog(DEBUG8, "pgxcnode_find_by_type: nodename=%s, type=%d, ipaddress=%s, port=%d, status=%d", - node[i]->nodename, - node[i]->type, - node[i]->ipaddress, - node[i]->port, - node[i]->status); - - /* - * Must not try and connect to ourself. That will lead to a deadlock - * - * !!TODO Ideally we should not be registered on the GTM, but when a - * failover happens, the standby may carry forward the node - * registration information previously sent by the original master as a - * backup. 
This needs to be studied further - */ - if (strcmp(node[i]->nodename, NodeName) && - node[i]->status == NODE_CONNECTED) - return node[i]; - } - - return NULL; + GTM_PGXCNodeInfo *node[1024]; + size_t n; + int i; + + n = pgxcnode_find_by_type(GTM_NODE_GTM, node, 1024); + + for (i = 0 ; i < n ; i++) + { + elog(DEBUG8, "pgxcnode_find_by_type: nodename=%s, type=%d, ipaddress=%s, port=%d, status=%d", + node[i]->nodename, + node[i]->type, + node[i]->ipaddress, + node[i]->port, + node[i]->status); + + /* + * Must not try and connect to ourself. That will lead to a deadlock + * + * !!TODO Ideally we should not be registered on the GTM, but when a + * failover happens, the standby may carry forward the node + * registration information previously sent by the original master as a + * backup. This needs to be studied further + */ + if (strcmp(node[i]->nodename, NodeName) && + node[i]->status == NODE_CONNECTED) + return node[i]; + } + + return NULL; } @@ -423,100 +424,100 @@ find_standby_node_info(void) GTM_Conn * gtm_standby_connect_to_standby(void) { - GTM_Conn *conn; - int report; + GTM_Conn *conn; + int report; - conn = gtm_standby_connect_to_standby_int(&report); + conn = gtm_standby_connect_to_standby_int(&report); - return conn; + return conn; } #endif GTM_Conn * gtm_connect_to_standby(GTM_PGXCNodeInfo *n,int timeout) { - GTM_Conn *standby = NULL; - char conn_string[1024]; - - elog(DEBUG8, "GTM standby is active. Going to connect."); - - snprintf(conn_string, sizeof(conn_string), - "host=%s port=%d node_name=%s remote_type=%d connect_timeout=%d", - n->ipaddress, n->port, NodeName, GTM_NODE_GTM, timeout); - - standby = PQconnectGTM(conn_string); - if (standby == NULL || GTMPQstatus(standby) != CONNECTION_OK) - { - int save_errno = errno; - if(standby) - { - elog(LOG, "can not connect to GTM standby: %s %m", GTMPQerrorMessage(standby)); - } - else - { - elog(LOG, "connection is null: %m"); - } - - errno = save_errno; - if(standby) - { - GTMPQfinish(standby); - } - return NULL; - } - - return standby; + GTM_Conn *standby = NULL; + char conn_string[1024]; + + elog(DEBUG8, "GTM standby is active. Going to connect."); + + snprintf(conn_string, sizeof(conn_string), + "host=%s port=%d node_name=%s remote_type=%d connect_timeout=%d", + n->ipaddress, n->port, NodeName, GTM_NODE_GTM, timeout); + + standby = PQconnectGTM(conn_string); + if (standby == NULL || GTMPQstatus(standby) != CONNECTION_OK) + { + int save_errno = errno; + if(standby) + { + elog(LOG, "can not connect to GTM standby: %s %m", GTMPQerrorMessage(standby)); + } + else + { + elog(LOG, "connection is null: %m"); + } + + errno = save_errno; + if(standby) + { + GTMPQfinish(standby); + } + return NULL; + } + + return standby; } #ifndef __XLOG__ static GTM_Conn * gtm_standby_connect_to_standby_int(int *report_needed) { - GTM_Conn *standby = NULL; - GTM_PGXCNodeInfo *n; - char conn_string[1024]; - - *report_needed = 0; - - n = find_standby_node_info(); - if (!n) - { - elog(LOG, "Any GTM standby node not found in registered node(s)."); - return NULL; - } - - elog(DEBUG8, "GTM standby is active. 
Going to connect."); - *report_needed = 1; - - - snprintf(conn_string, sizeof(conn_string), - "host=%s port=%d node_name=%s remote_type=%d connect_timeout=%d", - n->ipaddress, n->port, NodeName, GTM_NODE_GTM, GTM_Standby_Connetion_Timeout); - - standby = PQconnectGTM(conn_string); - if (standby == NULL || GTMPQstatus(standby) != CONNECTION_OK) - { - int save_errno = errno; - if(standby) - { - elog(LOG, "can not connect to GTM standby: %s %m", GTMPQerrorMessage(standby)); - } - else - { - elog(LOG, "connection is null: %m"); - } - - errno = save_errno; - if(standby) - { - GTMPQfinish(standby); - } - return NULL; - } - - elog(DEBUG8, "Connection established with GTM standby. - %p conn %s socket %d", n, conn_string, standby->sock); - - return standby; + GTM_Conn *standby = NULL; + GTM_PGXCNodeInfo *n; + char conn_string[1024]; + + *report_needed = 0; + + n = find_standby_node_info(); + if (!n) + { + elog(LOG, "Any GTM standby node not found in registered node(s)."); + return NULL; + } + + elog(DEBUG8, "GTM standby is active. Going to connect."); + *report_needed = 1; + + + snprintf(conn_string, sizeof(conn_string), + "host=%s port=%d node_name=%s remote_type=%d connect_timeout=%d", + n->ipaddress, n->port, NodeName, GTM_NODE_GTM, GTM_Standby_Connetion_Timeout); + + standby = PQconnectGTM(conn_string); + if (standby == NULL || GTMPQstatus(standby) != CONNECTION_OK) + { + int save_errno = errno; + if(standby) + { + elog(LOG, "can not connect to GTM standby: %s %m", GTMPQerrorMessage(standby)); + } + else + { + elog(LOG, "connection is null: %m"); + } + + errno = save_errno; + if(standby) + { + GTMPQfinish(standby); + } + return NULL; + } + + elog(DEBUG8, "Connection established with GTM standby. - %p conn %s socket %d", n, conn_string, standby->sock); + + return standby; } #endif @@ -524,10 +525,10 @@ gtm_standby_connect_to_standby_int(int *report_needed) void gtm_standby_disconnect_from_standby(GTM_Conn *conn) { - if (Recovery_IsStandby()) - return; + if (Recovery_IsStandby()) + return; - GTMPQfinish(conn); + GTMPQfinish(conn); } @@ -535,28 +536,28 @@ gtm_standby_disconnect_from_standby(GTM_Conn *conn) GTM_Conn * gtm_standby_reconnect_to_standby(GTM_Conn *old_conn, int retry_max) { - GTM_Conn *newconn = NULL; - int report; - int i; + GTM_Conn *newconn = NULL; + int report; + int i; - if (Recovery_IsStandby()) - return NULL; + if (Recovery_IsStandby()) + return NULL; - if (old_conn != NULL) - gtm_standby_disconnect_from_standby(old_conn); + if (old_conn != NULL) + gtm_standby_disconnect_from_standby(old_conn); - for (i = 0; i < retry_max; i++) - { - elog(DEBUG1, "gtm_standby_reconnect_to_standby(): going to re-connect. retry=%d", i); + for (i = 0; i < retry_max; i++) + { + elog(DEBUG1, "gtm_standby_reconnect_to_standby(): going to re-connect. retry=%d", i); - newconn = gtm_standby_connect_to_standby_int(&report); - if (newconn != NULL) - break; + newconn = gtm_standby_connect_to_standby_int(&report); + if (newconn != NULL) + break; - elog(DEBUG1, "gtm_standby_reconnect_to_standby(): re-connect failed. retry=%d", i); - } + elog(DEBUG1, "gtm_standby_reconnect_to_standby(): re-connect failed. retry=%d", i); + } - return newconn; + return newconn; } #endif @@ -567,96 +568,96 @@ gtm_standby_reconnect_to_standby(GTM_Conn *old_conn, int retry_max) bool gtm_standby_check_communication_error(Port *myport, int *retry_count, GTM_Conn *oldconn) { - - /* - * This function may be called without result from standby. 
- */ - if (GetMyConnection(myport)->standby->result - && GetMyConnection(myport)->standby->result->gr_status == GTM_RESULT_COMM_ERROR) - { - if (*retry_count == 0) - { - (*retry_count)++; - - GetMyConnection(myport)->standby = - gtm_standby_reconnect_to_standby(GetMyConnection(myport)->standby, - GTM_STANDBY_RETRY_MAX); - - if (GetMyConnection(myport)->standby) - return true; - } - - elog(DEBUG1, "communication error with standby."); - } - return false; + + /* + * This function may be called without result from standby. + */ + if (GetMyConnection(myport)->standby->result + && GetMyConnection(myport)->standby->result->gr_status == GTM_RESULT_COMM_ERROR) + { + if (*retry_count == 0) + { + (*retry_count)++; + + GetMyConnection(myport)->standby = + gtm_standby_reconnect_to_standby(GetMyConnection(myport)->standby, + GTM_STANDBY_RETRY_MAX); + + if (GetMyConnection(myport)->standby) + return true; + } + + elog(DEBUG1, "communication error with standby."); + } + return false; } #endif int gtm_standby_begin_backup(int64 identifier, int64 lsn, GlobalTimestamp gts) { - int rc = set_begin_backup(GTM_ActiveConn, identifier, lsn, gts); - return (rc ? 0 : 1); + int rc = set_begin_backup(GTM_ActiveConn, identifier, lsn, gts); + return (rc ? 0 : 1); } int gtm_standby_end_backup(void) { - int rc = set_end_backup(GTM_ActiveConn, false); - - return (rc ? 0 : 1); + int rc = set_end_backup(GTM_ActiveConn, false); + + return (rc ? 0 : 1); } int gtm_standby_start_replication(const char *application_name) { - char ip_port[NI_MAXHOST]; + char ip_port[NI_MAXHOST]; int rc = 0; - int i = 0; - int len = 0; + int i = 0; + int len = 0; - if(strlen(application_name) == 0) - { - node_get_local_addr(GTM_ActiveConn, ip_port, NI_MAXHOST, &rc); + if(strlen(application_name) == 0) + { + node_get_local_addr(GTM_ActiveConn, ip_port, NI_MAXHOST, &rc); - len = strlen(ip_port); + len = strlen(ip_port); - snprintf(ip_port + len,NI_MAXHOST - len,":%d",GTMPortNumber); + snprintf(ip_port + len,NI_MAXHOST - len,":%d",GTMPortNumber); - for(i = 0; i < len ; i++) - { - if(ip_port[i] == '_') - ip_port[i] = '.'; - } + for(i = 0; i < len ; i++) + { + if(ip_port[i] == '_') + ip_port[i] = '.'; + } - return set_begin_replication(GTM_ActiveConn,ip_port,NodeName); - } + return set_begin_replication(GTM_ActiveConn,ip_port,NodeName); + } return set_begin_replication(GTM_ActiveConn,application_name,NodeName); } -extern char *NodeName; /* Defined in main.c */ +extern char *NodeName; /* Defined in main.c */ void gtm_standby_finishActiveConn(void) { - - GTM_ActiveConn = gtm_standby_connectToActiveGTM(); - if (GTM_ActiveConn == NULL) - { - elog(DEBUG3, "Error in connection"); - return; - } - elog(DEBUG1, "Connection established to the GTM active."); - - /* Unregister self from Active-GTM */ - node_unregister(GTM_ActiveConn, GTM_NODE_GTM, NodeName); - /* Disconnect form Active */ - GTMPQfinish(GTM_ActiveConn); - + + GTM_ActiveConn = gtm_standby_connectToActiveGTM(); + if (GTM_ActiveConn == NULL) + { + elog(DEBUG3, "Error in connection"); + return; + } + elog(DEBUG1, "Connection established to the GTM active."); + + /* Unregister self from Active-GTM */ + node_unregister(GTM_ActiveConn, GTM_NODE_GTM, NodeName); + /* Disconnect form Active */ + GTMPQfinish(GTM_ActiveConn); + #ifdef __TBASE__ - GTM_ActiveConn = NULL; + GTM_ActiveConn = NULL; #endif } @@ -664,17 +665,17 @@ gtm_standby_finishActiveConn(void) static GTM_Conn * gtm_standby_connectToActiveGTM(void) { - char connect_string[1024]; - int active_port = Recovery_StandbyGetActivePort(); - char 
*active_address = Recovery_StandbyGetActiveAddress(); + char connect_string[1024]; + int active_port = Recovery_StandbyGetActivePort(); + char *active_address = Recovery_StandbyGetActiveAddress(); - /* Need to connect to Active-GTM again here */ - elog(LOG, "Connecting the GTM active on %s:%d...", active_address, active_port); + /* Need to connect to Active-GTM again here */ + elog(LOG, "Connecting the GTM active on %s:%d...", active_address, active_port); - sprintf(connect_string, "host=%s port=%d node_name=%s remote_type=%d", - active_address, active_port, NodeName, GTM_NODE_GTM); + sprintf(connect_string, "host=%s port=%d node_name=%s remote_type=%d", + active_address, active_port, NodeName, GTM_NODE_GTM); - return PQconnectGTM(connect_string); + return PQconnectGTM(connect_string); } #ifdef __TBASE__ /* @@ -682,59 +683,59 @@ gtm_standby_connectToActiveGTM(void) */ int32 GTM_StoreStandbyInitFromMaster(char *data_dir) { - int32 ret = 0; - size_t size = 0; - char *data = NULL; - - if (NULL == data_dir) - { - elog(LOG, "GTM_StoreStandbyInitFromMaster invalid null parameter"); - return GTM_STORE_ERROR; - } - - if (enable_gtm_sequence_debug) - { - elog(LOG, "GTM_StoreStandbyInitFromMaster begin"); - } - - size = (uint32)get_storage_file(GTM_ActiveConn, &data,&XLogCtl->apply,&XLogCtl->thisTimeLineID); - if (-1 == size) - { - elog(LOG, "GTM_StoreStandbyInitFromMaster get_storage_file failed"); - return GTM_STORE_ERROR; - } - - ret = GTM_StoreStandbyInit(data_dir, data, (uint32)size); - if (ret) - { - elog(LOG, "GTM_StoreStandbyInitFromMaster GTM_StoreStandbyInit failed"); - return GTM_STORE_ERROR; - } + int32 ret = 0; + size_t size = 0; + char *data = NULL; + + if (NULL == data_dir) + { + elog(LOG, "GTM_StoreStandbyInitFromMaster invalid null parameter"); + return GTM_STORE_ERROR; + } + + if (enable_gtm_sequence_debug) + { + elog(LOG, "GTM_StoreStandbyInitFromMaster begin"); + } + + size = (uint32)get_storage_file(GTM_ActiveConn, &data,&XLogCtl->apply,&XLogCtl->thisTimeLineID); + if (-1 == size) + { + elog(LOG, "GTM_StoreStandbyInitFromMaster get_storage_file failed"); + return GTM_STORE_ERROR; + } + + ret = GTM_StoreStandbyInit(data_dir, data, (uint32)size); + if (ret) + { + elog(LOG, "GTM_StoreStandbyInitFromMaster GTM_StoreStandbyInit failed"); + return GTM_STORE_ERROR; + } /* we transfer data from the beginning of xlog */ - XLogCtl->LogwrtResult.Write = XLogCtl->LogwrtResult.Flush = XLogCtl->apply - (XLogCtl->apply % GTM_XLOG_SEG_SIZE); - NewXLogFile(GetSegmentNo(XLogCtl->LogwrtResult.Flush)); + XLogCtl->LogwrtResult.Write = XLogCtl->LogwrtResult.Flush = XLogCtl->apply - (XLogCtl->apply % GTM_XLOG_SEG_SIZE); + NewXLogFile(GetSegmentNo(XLogCtl->LogwrtResult.Flush)); ControlData->checkPoint = XLogCtl->apply; ControlData->prevCheckPoint = InvalidXLogRecPtr; - ControlData->thisTimeLineID = XLogCtl->thisTimeLineID; - ControlData->gts = g_GTM_Store_Header->m_next_gts; - ControlData->time = time(NULL); + ControlData->thisTimeLineID = XLogCtl->thisTimeLineID; + ControlData->gts = g_GTM_Store_Header->m_next_gts; + ControlData->time = time(NULL); - ControlDataSync(false); + ControlDataSync(false); AddBackupLabel(GetSegmentNo(XLogCtl->LogwrtResult.Flush)); - elog(LOG,"Get start replication at %X/%X,timeLine: %d", - (uint32_t)(XLogCtl->LogwrtResult.Flush>>32), - (uint32_t)(XLogCtl->LogwrtResult.Flush), - XLogCtl->thisTimeLineID); + elog(LOG,"Get start replication at %X/%X,timeLine: %d", + (uint32_t)(XLogCtl->LogwrtResult.Flush>>32), + (uint32_t)(XLogCtl->LogwrtResult.Flush), + 
XLogCtl->thisTimeLineID); - if (enable_gtm_sequence_debug) - { - elog(LOG, "GTM_StoreStandbyInitFromMaster done"); - } - return GTM_STORE_OK; + if (enable_gtm_sequence_debug) + { + elog(LOG, "GTM_StoreStandbyInitFromMaster done"); + } + return GTM_STORE_OK; } static void diff --git a/src/gtm/main/main.c b/src/gtm/main/main.c index 81fa43c6..e15bcbf6 100644 --- a/src/gtm/main/main.c +++ b/src/gtm/main/main.c @@ -181,6 +181,8 @@ bool isGTM = true; GTM_ThreadID TopMostThreadID; +enum GTM_PromoteStatus promote_status = GTM_PRPMOTE_INIT; +s_lock_t promote_status_lck; /* The socket(s) we're listening to. */ #define MAXLISTEN 64 @@ -1000,6 +1002,38 @@ main(int argc, char *argv[]) Recovery_StandbySetConnInfo(active_addr, active_port); } + SpinLockInit(&promote_status_lck); + promote_status = GTM_PRPMOTE_INIT; + + pqsignal(SIGHUP, GTM_SigleHandler); + pqsignal(SIGKILL, GTM_SigleHandler); + pqsignal(SIGQUIT, GTM_SigleHandler); + pqsignal(SIGTERM, GTM_SigleHandler); + pqsignal(SIGINT, GTM_SigleHandler); + pqsignal(SIGUSR1, GTM_SigleHandler); + pqsignal(SIGPIPE, SIG_IGN); + + pqinitmask(); + + /* + * Establish a connection between the active and standby. + */ + while (Recovery_IsStandby()) + { + if (gtm_standby_start_startup()) + { + elog(LOG, "Standby GTM Startup connection established with active-GTM."); + break; + } + + elog(LOG, "Failed to establish a connection to active-GTM."); + usleep(GTM_GTS_ONE_SECOND); + } + + SpinLockAcquire(&promote_status_lck); + promote_status = GTM_PRPMOTE_CONNED; + SpinLockRelease(&promote_status_lck); + #ifdef __XLOG__ if(access(RECOVERY_CONF_NAME,F_OK) == 0) @@ -1053,20 +1087,6 @@ main(int argc, char *argv[]) #endif - /* - * Establish a connection between the active and standby. - */ - if (Recovery_IsStandby()) - { - - if (!gtm_standby_start_startup()) - { - elog(ERROR, "Failed to establish a connection to active-GTM."); - exit(1); - } - elog(LOG, "Standby GTM Startup connection established with active-GTM."); - } - #ifdef __TBASE__ elog(LOG, "Starting GTM server at (%s:%d) with syn storage", ListenAddresses, GTMPortNumber); #else @@ -1279,15 +1299,21 @@ main(int argc, char *argv[]) if (!CreateOptsFile(argc, argv)) exit(1); - pqsignal(SIGHUP, GTM_SigleHandler); - pqsignal(SIGKILL, GTM_SigleHandler); - pqsignal(SIGQUIT, GTM_SigleHandler); - pqsignal(SIGTERM, GTM_SigleHandler); - pqsignal(SIGINT, GTM_SigleHandler); - pqsignal(SIGUSR1, GTM_SigleHandler); - pqsignal(SIGPIPE, SIG_IGN); + SpinLockAcquire(&promote_status_lck); + /* + * GTM_PRPMOTE_IN_STARTUP is setting in PromoteToActive, + * do CurrentTimeLineID++ here. + */ + if (promote_status == GTM_PRPMOTE_IN_STARTUP) + { + SetCurrentTimeLineID(GetCurrentTimeLineID() + 1); + } - pqinitmask(); + /* + * set promote_status to GTM_PRPMOTE_NORMAL finally + */ + promote_status = GTM_PRPMOTE_NORMAL; + SpinLockRelease(&promote_status_lck); /* * Now, activating a standby GTM... @@ -2942,7 +2968,10 @@ GTM_ThreadWalReceiver(void *argp) sleep(1); - gtm_standby_start_startup(); + if (!gtm_standby_start_startup()) + { + elog(ERROR, "Failed to establish a connection to active-GTM."); + } if (GTM_ActiveConn == NULL || GTMPQstatus(GTM_ActiveConn) != CONNECTION_OK || gtm_standby_register_self(NULL,0,NULL) == 0 || @@ -4868,10 +4897,29 @@ PromoteToActive(void) */ // GTM_SetInitialAndNextClientIdentifierAtPromote(); + SpinLockAcquire(&promote_status_lck); + if (promote_status != GTM_PRPMOTE_INIT && promote_status != GTM_PRPMOTE_NORMAL) + { + elog(LOG, "Promote signal received. 
But not allow to promote, promote status %d", promote_status); + SpinLockRelease(&promote_status_lck); + return; + } + /* * Do promoting things here. + * if promote_status is GTM_PRPMOTE_INIT, should use the CurrentTimeLineID to Recovery or OpenXLogFile, + * keep it's value here */ + if (promote_status == GTM_PRPMOTE_NORMAL) + { SetCurrentTimeLineID(GetCurrentTimeLineID() + 1); + } + else + { + promote_status = GTM_PRPMOTE_IN_STARTUP; + } + SpinLockRelease(&promote_status_lck); + Recovery_StandbySetStandby(false); StartupThreadAfterPromote(); CreateDataDirLockFile(); diff --git a/src/include/gtm/gtm_standby.h b/src/include/gtm/gtm_standby.h index cdc056ef..01a037b7 100644 --- a/src/include/gtm/gtm_standby.h +++ b/src/include/gtm/gtm_standby.h @@ -67,5 +67,12 @@ extern int gtm_standby_start_replication(const char *application_name); #define GTM_ACT_MODE 0 #define GTM_STANDBY_MODE 1 +enum GTM_PromoteStatus +{ + GTM_PRPMOTE_INIT = 0, + GTM_PRPMOTE_IN_STARTUP = 1, + GTM_PRPMOTE_CONNED = 2, + GTM_PRPMOTE_NORMAL = 3, +}; #endif /* GTM_STANDBY_H */ From 1548ccf168042ac2fb4c61d66afcb8695c838e82 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Tue, 9 Mar 2021 10:09:46 +0800 Subject: [PATCH 335/578] fix http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131080533479 --- src/gtm/main/main.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/gtm/main/main.c b/src/gtm/main/main.c index e15bcbf6..63e0951a 100644 --- a/src/gtm/main/main.c +++ b/src/gtm/main/main.c @@ -1031,7 +1031,10 @@ main(int argc, char *argv[]) } SpinLockAcquire(&promote_status_lck); + if (promote_status == GTM_PRPMOTE_INIT) + { promote_status = GTM_PRPMOTE_CONNED; + } SpinLockRelease(&promote_status_lck); #ifdef __XLOG__ From 4d36953e9bb717cb1c33fdd1dd3201f8e20f9819 Mon Sep 17 00:00:00 2001 From: bethding Date: Tue, 9 Mar 2021 10:15:52 +0800 Subject: [PATCH 336/578] remove unsed para --- src/backend/utils/mmgr/dsa.c | 1 - 1 file changed, 1 deletion(-) diff --git a/src/backend/utils/mmgr/dsa.c b/src/backend/utils/mmgr/dsa.c index 9a6d036d..f7f11c06 100644 --- a/src/backend/utils/mmgr/dsa.c +++ b/src/backend/utils/mmgr/dsa.c @@ -2255,7 +2255,6 @@ check_for_freed_segments(dsa_area *area) LWLockAcquire(DSA_AREA_LOCK(area), LW_EXCLUSIVE); check_for_freed_segments_locked(area); LWLockRelease(DSA_AREA_LOCK(area)); - area->freed_segment_counter = freed_segment_counter; } } From c756b84d17ff336a6955288b835587688a0b0f81 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Mon, 15 Mar 2021 16:14:19 +0800 Subject: [PATCH 337/578] fix gtm http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131080533479 (merge request !213) --- src/gtm/main/gtm_standby.c | 1 + src/gtm/main/gtm_txn.c | 42 +++++ src/gtm/main/gtm_xlog.c | 5 +- src/gtm/main/main.c | 354 +++++++++++++++++++++++++++++++------ src/include/gtm/gtm_txn.h | 271 ++++++++++++++-------------- 5 files changed, 479 insertions(+), 194 deletions(-) diff --git a/src/gtm/main/gtm_standby.c b/src/gtm/main/gtm_standby.c index e84cca5d..f441c889 100644 --- a/src/gtm/main/gtm_standby.c +++ b/src/gtm/main/gtm_standby.c @@ -76,6 +76,7 @@ gtm_standby_start_startup(void) errno = save_errno; if(GTM_ActiveConn) GTMPQfinish(GTM_ActiveConn); + GTM_ActiveConn = NULL; return 0; } diff --git a/src/gtm/main/gtm_txn.c b/src/gtm/main/gtm_txn.c index 1a8f83f7..bc91470f 100644 --- a/src/gtm/main/gtm_txn.c +++ b/src/gtm/main/gtm_txn.c @@ -1563,6 +1563,48 @@ ProcessCheckGTMCommand(Port *myport, StringInfo message) pq_endmessage(myport, &buf); pq_flush(myport); } + + +/* + * Check gtm slave status by acquiring gts. 
+ */ +void +ProcessStandbyPreCheckGTMCommand(Port *myport, StringInfo message) +{ + StringInfoData buf; + int is_master = 0; + GTM_Timestamp master_timestamp = InvalidGTS; + int standby_count = 0; + XLogRecPtr flush_ptr; + + /* read timeout message */ + pq_getmsgint(message,sizeof(int)); + pq_getmsgend(message); + + if (myport->remote_type != GTM_NODE_GTM_CTL) + { + /* standby node only handle GTS request from gtm_ctl*/ + elog(ERROR, "check gtm command is supposed to be fired only by gtm or gtm_ctl!!"); + } + + /* get static gts from ControlData */ + GTM_RWLockAcquire(&ControlDataLock,GTM_LOCKMODE_WRITE); + master_timestamp = ControlData->gts; + GTM_RWLockRelease(&ControlDataLock); + + flush_ptr = GetCurrentXLogwrtResult().Flush; + is_master = Recovery_IsStandby(); + + pq_beginmessage(&buf, 'S'); + pq_sendint(&buf, TXN_CHECK_GTM_STATUS_RESULT, 4); + pq_sendbytes(&buf, (char *) &is_master, sizeof(is_master)); + pq_sendbytes(&buf, (char *) &master_timestamp, sizeof(GTM_Timestamp)); + pq_sendint64(&buf, flush_ptr); + pq_sendint(&buf, standby_count, sizeof(int)); + + pq_endmessage(myport, &buf); + pq_flush(myport); +} #endif /* * Process MSG_TXN_BEGIN_GETGXID message diff --git a/src/gtm/main/gtm_xlog.c b/src/gtm/main/gtm_xlog.c index 492fa491..ec3d7c66 100644 --- a/src/gtm/main/gtm_xlog.c +++ b/src/gtm/main/gtm_xlog.c @@ -133,6 +133,7 @@ extern int GTMStartupGTSDelta; static bool g_recovery_finish; static bool *g_GTMStoreDirtyMap; static GTM_MutexLock g_CheckPointLock; +extern enum GTM_PromoteStatus promote_status; XLogCtlData *XLogCtl; XLogSyncStandby *XLogSync; @@ -2344,7 +2345,7 @@ RedoRangeOverwrite(XLogCmdRangerOverWrite *cmd) if(enalbe_gtm_xlog_debug) PrintRedoRangeOverwrite(cmd); - if(Recovery_IsStandby() && recovery_pitr_mode == false) + if(Recovery_IsStandby() && recovery_pitr_mode == false && promote_status == GTM_PRPMOTE_NORMAL) { memcpy(g_GTMStoreMapAddr + cmd->offset,cmd->data,cmd->bytes); @@ -2386,7 +2387,7 @@ RedoCheckPoint(XLogCmdCheckPoint *cmd,XLogRecPtr pos) SetCurrentTimeLineID(cmd->timeline); - if(Recovery_IsStandby() && recovery_pitr_mode == false) + if(Recovery_IsStandby() && recovery_pitr_mode == false && promote_status == GTM_PRPMOTE_NORMAL) DoCheckPoint(false); return sizeof(XLogCmdCheckPoint); diff --git a/src/gtm/main/main.c b/src/gtm/main/main.c index 63e0951a..113cee78 100644 --- a/src/gtm/main/main.c +++ b/src/gtm/main/main.c @@ -140,6 +140,7 @@ GTM_ThreadInfo *g_timer_thread = NULL; GTM_ThreadInfo *g_logcollector_thread = NULL; void *GTM_ThreadLogCollector(void *argp); extern void GTM_ErrorLogCollector(ErrorData *edata, StringInfo buff); +GTM_ThreadInfo *g_standby_pre_server_thread = NULL; #ifdef __XLOG__ GTM_ThreadInfo *g_basebackup_thread = NULL; @@ -245,6 +246,8 @@ static void ProcessSyncStandbyCommand(Port *myport, GTM_MessageType mtype, Strin static void ProcessBarrierCommand(Port *myport, GTM_MessageType mtype, StringInfo message); static int GTMInitConnection(GTM_ConnectionInfo *conninfo); +static void SetNonBlockConnection(GTM_ConnectionInfo *conninfo); +static void gtm_standby_pre_server_loop(const char *data_dir); #ifdef __XLOG__ static void thread_replication_clean(GTM_StandbyReplication *replication); @@ -604,6 +607,26 @@ static bool CheckClockSource(void) #endif +static void GTM_XLogRecoveryIfNeed(const char *data_dir) +{ + Assert(ControlData != NULL); + + switch(ControlData->state) + { + case DB_SHUTDOWNED_IN_RECOVERY: + case DB_SHUTDOWNING: + case DB_STARTUP: + case DB_IN_CRASH_RECOVERY: + case DB_IN_ARCHIVE_RECOVERY: + case DB_IN_PRODUCTION: + 
elog(LOG, "Detect GTM server crash."); + GTM_XLogRecovery(ControlData->checkPoint,data_dir); + break; + case DB_SHUTDOWNED: + break; + } +} + int main(int argc, char *argv[]) {// #lizard forgives @@ -659,6 +682,8 @@ main(int argc, char *argv[]) int util_thread_cnt = 0; isStartUp = true; + SpinLockInit(&promote_status_lck); + promote_status = GTM_PRPMOTE_INIT; /* * At first, initialize options. Also moved something from BaseInit() here. @@ -1002,41 +1027,6 @@ main(int argc, char *argv[]) Recovery_StandbySetConnInfo(active_addr, active_port); } - SpinLockInit(&promote_status_lck); - promote_status = GTM_PRPMOTE_INIT; - - pqsignal(SIGHUP, GTM_SigleHandler); - pqsignal(SIGKILL, GTM_SigleHandler); - pqsignal(SIGQUIT, GTM_SigleHandler); - pqsignal(SIGTERM, GTM_SigleHandler); - pqsignal(SIGINT, GTM_SigleHandler); - pqsignal(SIGUSR1, GTM_SigleHandler); - pqsignal(SIGPIPE, SIG_IGN); - - pqinitmask(); - - /* - * Establish a connection between the active and standby. - */ - while (Recovery_IsStandby()) - { - if (gtm_standby_start_startup()) - { - elog(LOG, "Standby GTM Startup connection established with active-GTM."); - break; - } - - elog(LOG, "Failed to establish a connection to active-GTM."); - usleep(GTM_GTS_ONE_SECOND); - } - - SpinLockAcquire(&promote_status_lck); - if (promote_status == GTM_PRPMOTE_INIT) - { - promote_status = GTM_PRPMOTE_CONNED; - } - SpinLockRelease(&promote_status_lck); - #ifdef __XLOG__ if(access(RECOVERY_CONF_NAME,F_OK) == 0) @@ -1061,24 +1051,61 @@ main(int argc, char *argv[]) if(Recovery_IsStandby() == false) { - Assert(ControlData != NULL); + GTM_XLogRecoveryIfNeed(data_dir); + } + +#endif + +#ifdef __TBASE__ + elog(LOG, "Starting GTM server at (%s:%d) with syn storage", ListenAddresses, GTMPortNumber); +#else + elog(LOG, "Starting GTM server at (%s:%d) -- control file %s", ListenAddresses, GTMPortNumber, GTMControlFile); +#endif + + g_max_lock_number = 6000; + pqsignal(SIGHUP, GTM_SigleHandler); + pqsignal(SIGKILL, GTM_SigleHandler); + pqsignal(SIGQUIT, GTM_SigleHandler); + pqsignal(SIGTERM, GTM_SigleHandler); + pqsignal(SIGINT, GTM_SigleHandler); + pqsignal(SIGUSR1, GTM_SigleHandler); + pqsignal(SIGPIPE, SIG_IGN); - switch(ControlData->state) + pqinitmask(); + + /* + * Establish a connection between the active and standby. + */ + if (Recovery_IsStandby()) + { + if (!gtm_standby_start_startup()) { - case DB_SHUTDOWNED_IN_RECOVERY: - case DB_SHUTDOWNING: - case DB_STARTUP: - case DB_IN_CRASH_RECOVERY: - case DB_IN_ARCHIVE_RECOVERY: - case DB_IN_PRODUCTION: - elog(LOG, "Detect GTM server crash."); - GTM_XLogRecovery(ControlData->checkPoint,data_dir); - break; - case DB_SHUTDOWNED: - break; +#ifdef __TBASE__ + elog(LOG, "Failed to establish a connection to active-GTM."); + + /* + * if failed to establish a connection to active-GTM, just + * retry, but support the check status command. 
+ */ + gtm_standby_pre_server_loop(data_dir); +#else + elog(ERROR, "Failed to establish a connection to active-GTM."); +#endif } + else + { + elog(LOG, "Standby GTM Startup connection established with active-GTM."); + } + } + + SpinLockAcquire(&promote_status_lck); + if (promote_status == GTM_PRPMOTE_INIT) + { + promote_status = GTM_PRPMOTE_CONNED; } + SpinLockRelease(&promote_status_lck); +#ifdef __XLOG__ GTM_XLogFileInit(data_dir); GTM_RWLockAcquire(&ControlDataLock,GTM_LOCKMODE_WRITE); @@ -1090,12 +1117,6 @@ main(int argc, char *argv[]) #endif -#ifdef __TBASE__ - elog(LOG, "Starting GTM server at (%s:%d) with syn storage", ListenAddresses, GTMPortNumber); -#else - elog(LOG, "Starting GTM server at (%s:%d) -- control file %s", ListenAddresses, GTMPortNumber, GTMControlFile); -#endif - /* * Read the last GXID and start from there */ @@ -1109,6 +1130,8 @@ main(int argc, char *argv[]) GlobalTimestamp gts = 0; int max_retry_times = 10; + system("rm -rf gtm_xlog/*"); + bret = GTM_StoreGetSysInfo(&identifier, &lsn, >s); if (!bret) { @@ -1434,8 +1457,6 @@ main(int argc, char *argv[]) process_thread_num = g_max_thread_number < process_thread_num ? g_max_thread_number : process_thread_num; } - g_max_lock_number = 6000; - /* Create GTM threads handling requests */ g_timekeeper_thread = GTM_ThreadCreate(GTM_ThreadTimeKeeper, g_max_lock_number); if (NULL == g_timekeeper_thread) @@ -1521,7 +1542,11 @@ main(int argc, char *argv[]) util_thread_cnt++; } - for(i = 0; i < process_thread_num; i++) + /* + * maybe one GTM_ThreadMain create as g_standby_pre_server_thread before + */ + i = (g_standby_pre_server_thread == NULL) ? 0 : 1; + for(; i < process_thread_num; i++) { elog(DEBUG8, "Create thread %d.\n", i); if (NULL == GTM_ThreadCreate(GTM_ThreadMain, g_max_lock_number)) @@ -1807,6 +1832,206 @@ ServerLoop(void) } } +/* + * add connection into g_standby_pre_server_thread + */ +static int +gtm_add_connection_standby_pre_server(Port *port) +{ + GTM_ConnectionInfo *conninfo = NULL; + struct epoll_event event; + + if (!g_standby_pre_server_thread->thr_epoll_ok) + { + elog(LOG, "g_standby_pre_server_thread epoll not ready."); + return STATUS_ERROR; + } + + conninfo = (GTM_ConnectionInfo *)palloc0(sizeof (GTM_ConnectionInfo)); + conninfo->con_port = port; + conninfo->con_init = false; + port->conn = conninfo; + + /* Set conn to non-blocking mode for epoll wait */ + SetNonBlockConnection(conninfo); + + conninfo->con_thrinfo = g_standby_pre_server_thread; + event.data.ptr = conninfo; + event.events = EPOLLIN | EPOLLERR | EPOLLHUP | EPOLLRDHUP; + if(-1 == epoll_ctl (g_standby_pre_server_thread->thr_efd, EPOLL_CTL_ADD, conninfo->con_port->sock, &event)) + { + elog(LOG, "failed to add socket to epoll"); + return STATUS_ERROR; + } + + return STATUS_OK; +} + +/* + * handle loop before establish a connection to active-GTM + */ +static void +gtm_standby_pre_server_loop(const char *data_dir) +{ + fd_set readmask; + int nSockets; + sigjmp_buf local_sigjmp_buf; + + /* + * recovery here first + */ + GTM_XLogRecoveryIfNeed(data_dir); + + /* + * start GTM_ThreadMain to support get gtm status command + */ + g_standby_pre_server_thread = GTM_ThreadCreate(GTM_ThreadMain, g_max_lock_number); + if (NULL == g_standby_pre_server_thread) + { + elog(LOG, "Failed to create standby_pre_server_thread thread"); + exit(1); + } + + if (sigsetjmp(local_sigjmp_buf, 1) != 0) + { + RWLockCleanUp(); + /* Report the error to the server log */ + EmitErrorReport(NULL); + + /* + * Now return to normal top-level context and clear ErrorContext for + * 
next time. + */ + FlushErrorState(); + } + + /* We can now handle ereport(ERROR) */ + PG_exception_stack = &local_sigjmp_buf; + + nSockets = initMasks(&readmask); + while (Recovery_IsStandby()) + { + fd_set rmask; + int selres; + + /* + * Wait for a connection request to arrive. + * + * We wait at most one minute, to ensure that the other background + * tasks handled below get done even when no requests are arriving. + */ + memcpy((char *) &rmask, (char *) &readmask, sizeof(fd_set)); + + PG_SETMASK(&UnBlockSig); + + /* if timekeeper thread exit, main thread should prepare to exit. */ + if (GTMAbortPending) + { + /* + * XXX We should do a clean shutdown here. For the time being, just + * write the next GXID to be issued in the control file and exit + * gracefully + */ + + elog(LOG, "GTM shutting down."); + + /* + * Tell GTM that we are shutting down so that no new GXIDs are + * issued this point onwards + */ + GTM_SetShuttingDown(); + + GTM_RWLockAcquire(&ControlDataLock,GTM_LOCKMODE_WRITE); + ControlData->state = DB_SHUTDOWNED; + ControlDataSync(false); + GTM_RWLockRelease(&ControlDataLock); + + /* Delete pid file */ + DeleteLockFile(GTM_PID_FILE); +#ifdef HAVE_UNIX_SOCKETS + RemoveSocketFile(); +#endif + elog(LOG, "GTM exits"); + exit(1); + } + + { + /* must set timeout each time; some OSes change it! */ + struct timeval timeout; + + timeout.tv_sec = 5; + timeout.tv_usec = 0; + + selres = select(nSockets, &rmask, NULL, NULL, &timeout); + } + + /* + * Block all signals until we wait again. (This makes it safe for our + * signal handlers to do nontrivial work.) + */ + PG_SETMASK(&BlockSig); + + /* Now check the select() result */ + if (selres < 0) + { + if (errno != EINTR && errno != EWOULDBLOCK) + { + ereport(LOG, + (EACCES, + errmsg("select() failed in main thread: %m"))); + exit(1); + } + } + + /* + * New connection pending on any of our sockets? If so, fork a child + * process to deal with it. + */ + if (selres > 0) + { + int i; + + for (i = 0; i < MAXLISTEN; i++) + { + if (ListenSocket[i] == -1) + { + break; + } + + if (FD_ISSET(ListenSocket[i], &rmask)) + { + Port *port; + + port = ConnCreate(ListenSocket[i]); + if (port) + { + if (gtm_add_connection_standby_pre_server(port) != STATUS_OK) + { + StreamClose(port->sock); + ConnFree(port); + } + } + } + } + } + + /* + * retry establish a connection between the active and standby, + * controlling frequency with select timeout + */ + if (gtm_standby_start_startup()) + { + elog(LOG, "Standby GTM Startup connection established with active-GTM."); + break; + } + elog(LOG, "Failed to establish a connection to active-GTM."); + } + + /* + * clear exception stack here + */ + PG_exception_stack = NULL; +} /* * Initialise the masks for select() for the ports we are listening on. @@ -2967,7 +3192,10 @@ GTM_ThreadWalReceiver(void *argp) goto promote; if(GTM_ActiveConn) + { GTMPQfinish(GTM_ActiveConn); + GTM_ActiveConn = NULL; + } sleep(1); @@ -3619,7 +3847,18 @@ ProcessCommand(Port *myport, StringInfo input_message) * compile option. 
*/ elog(DEBUG1, "mtype = %s (%d).", gtm_util_message_name(mtype), (int)mtype); + #ifdef __TBASE__ + if (promote_status != GTM_PRPMOTE_NORMAL) + { + if (mtype != MSG_CHECK_GTM_STATUS) + { + elog(ERROR, "standby gtm only support get gtm status command before establish a connection to active-GTM or promote, mtype = %s (%d).", gtm_util_message_name(mtype), (int)mtype); + } + + return ProcessStandbyPreCheckGTMCommand(myport, input_message); + } + start_time = getSystemTime(); /* * Get Timestamp does not need to sync with standby @@ -4054,6 +4293,7 @@ GTMAddConnection(Port *port, GTM_Conn *standby) if(-1 == epoll_ctl (thrinfo->thr_efd, EPOLL_CTL_ADD, conninfo->con_port->sock, &event)) { elog(LOG, "failed to add socket to epoll"); + return STATUS_ERROR; } break; } diff --git a/src/include/gtm/gtm_txn.h b/src/include/gtm/gtm_txn.h index b84c97db..0dc754ba 100644 --- a/src/include/gtm/gtm_txn.h +++ b/src/include/gtm/gtm_txn.h @@ -41,11 +41,11 @@ extern void GlobalTransactionIdAbort(GlobalTransactionId transactionId); /* in transam/varsup.c */ extern GlobalTransactionId GTM_GetGlobalTransactionId(GTM_TransactionHandle handle); extern bool GTM_GetGlobalTransactionIdMulti( - GTM_TransactionHandle handle[], - int txn_count, - GlobalTransactionId gxids[], - GTM_TransactionHandle new_handle[], - int *new_txn_count); + GTM_TransactionHandle handle[], + int txn_count, + GlobalTransactionId gxids[], + GTM_TransactionHandle new_handle[], + int *new_txn_count); extern GlobalTransactionId ReadNewGlobalTransactionId(void); extern GlobalTransactionId GTM_GetLatestCompletedXID(void); extern void SetGlobalTransactionIdLimit(GlobalTransactionId oldest_datfrozenxid); @@ -72,118 +72,118 @@ extern void GTM_WriteRestorePointXid(FILE *f); typedef enum GTM_States { - GTM_STARTING, - GTM_RUNNING, - GTM_SHUTTING_DOWN + GTM_STARTING, + GTM_RUNNING, + GTM_SHUTTING_DOWN } GTM_States; /* Global transaction states at the GTM */ typedef enum GTM_TransactionStates { - GTM_TXN_INIT, - GTM_TXN_STARTING, - GTM_TXN_IN_PROGRESS, - GTM_TXN_PREPARE_IN_PROGRESS, - GTM_TXN_PREPARED, - GTM_TXN_COMMIT_IN_PROGRESS, - GTM_TXN_COMMITTED, - GTM_TXN_ABORT_IN_PROGRESS, - GTM_TXN_ABORTED, - GTM_TXN_IMPLICATE_PREPARED + GTM_TXN_INIT, + GTM_TXN_STARTING, + GTM_TXN_IN_PROGRESS, + GTM_TXN_PREPARE_IN_PROGRESS, + GTM_TXN_PREPARED, + GTM_TXN_COMMIT_IN_PROGRESS, + GTM_TXN_COMMITTED, + GTM_TXN_ABORT_IN_PROGRESS, + GTM_TXN_ABORTED, + GTM_TXN_IMPLICATE_PREPARED } GTM_TransactionStates; typedef struct GTM_TransactionInfo { - GTM_TransactionHandle gti_handle; - uint32 gti_client_id; - char gti_global_session_id[GTM_MAX_SESSION_ID_LEN]; - bool gti_in_use; - GlobalTransactionId gti_gxid; - GTM_TransactionStates gti_state; - GlobalTransactionId gti_xmin; - GTM_IsolationLevel gti_isolevel; - bool gti_readonly; - GTMProxy_ConnID gti_proxy_client_id; - char *nodestring; /* List of nodes prepared */ - char *gti_gid; - - GTM_SnapshotData gti_current_snapshot; - bool gti_snapshot_set; - - GTM_RWLock gti_lock; - bool gti_vacuum; - gtm_List *gti_created_seqs; - gtm_List *gti_dropped_seqs; - gtm_List *gti_altered_seqs; + GTM_TransactionHandle gti_handle; + uint32 gti_client_id; + char gti_global_session_id[GTM_MAX_SESSION_ID_LEN]; + bool gti_in_use; + GlobalTransactionId gti_gxid; + GTM_TransactionStates gti_state; + GlobalTransactionId gti_xmin; + GTM_IsolationLevel gti_isolevel; + bool gti_readonly; + GTMProxy_ConnID gti_proxy_client_id; + char *nodestring; /* List of nodes prepared */ + char *gti_gid; + + GTM_SnapshotData gti_current_snapshot; + bool gti_snapshot_set; 
+ + GTM_RWLock gti_lock; + bool gti_vacuum; + gtm_List *gti_created_seqs; + gtm_List *gti_dropped_seqs; + gtm_List *gti_altered_seqs; } GTM_TransactionInfo; -#define GTM_MAX_2PC_NODES 16 +#define GTM_MAX_2PC_NODES 16 /* By default a GID length is limited to 256 bits in PostgreSQL */ -#define GTM_MAX_GID_LEN 256 -#define GTM_MAX_NODESTRING_LEN 1024 -#define GTM_CheckTransactionHandle(x) ((x) >= 0 && (x) < GTM_MAX_GLOBAL_TRANSACTIONS) -#define GTM_IsTransSerializable(x) ((x)->gti_isolevel == GTM_ISOLATION_SERIALIZABLE) +#define GTM_MAX_GID_LEN 256 +#define GTM_MAX_NODESTRING_LEN 1024 +#define GTM_CheckTransactionHandle(x) ((x) >= 0 && (x) < GTM_MAX_GLOBAL_TRANSACTIONS) +#define GTM_IsTransSerializable(x) ((x)->gti_isolevel == GTM_ISOLATION_SERIALIZABLE) #define GTM_MAX_THREADS 512 #define CACHE_LINE_SIZE 64 typedef union rw_lock { - int lock; - char padding[CACHE_LINE_SIZE]; + int lock; + char padding[CACHE_LINE_SIZE]; } RW_lock; typedef struct GTM_Transactions { - uint32 gt_txn_count; - GTM_States gt_gtm_state; - - GTM_RWLock gt_XidGenLock; - - /* - * These fields are protected by XidGenLock - */ - GlobalTransactionId gt_nextXid; /* next XID to assign */ - GlobalTransactionId gt_backedUpXid; /* backed up, restoration point */ - - GlobalTransactionId gt_oldestXid; /* cluster-wide minimum datfrozenxid */ - GlobalTransactionId gt_xidVacLimit; /* start forcing autovacuums here */ - GlobalTransactionId gt_xidWarnLimit; /* start complaining here */ - GlobalTransactionId gt_xidStopLimit; /* refuse to advance nextXid beyond here */ - GlobalTransactionId gt_xidWrapLimit; /* where the world ends */ - - /* - * These fields are protected by TransArrayLock. - */ - GlobalTransactionId gt_latestCompletedXid; /* newest XID that has committed or - * aborted */ - - GlobalTransactionId gt_recent_global_xmin; - - int32 gt_lastslot; - GTM_TransactionInfo gt_transactions_array[GTM_MAX_GLOBAL_TRANSACTIONS]; - gtm_List *gt_open_transactions; - - GTM_RWLock gt_TransArrayLock; - pg_atomic_uint32 gt_global_xid; - - GlobalTimestamp gt_last_cycle; - GlobalTimestamp gt_global_timestamp; - /* For debug purpose */ - GlobalTimestamp gt_last_issue_timestamp; - GlobalTimestamp gt_last_raw_timestamp; - GlobalTimestamp gt_last_last_cycle; - GlobalTimestamp gt_last_global_timestamp; - GlobalTimestamp gt_last_tv_sec; - GlobalTimestamp gt_last_tv_nsec; - pg_atomic_uint64 gt_access_ts_seq; - pg_atomic_uint64 gt_last_access_ts_seq; - - RW_lock gt_in_locking[GTM_MAX_THREADS]; + uint32 gt_txn_count; + GTM_States gt_gtm_state; + + GTM_RWLock gt_XidGenLock; + + /* + * These fields are protected by XidGenLock + */ + GlobalTransactionId gt_nextXid; /* next XID to assign */ + GlobalTransactionId gt_backedUpXid; /* backed up, restoration point */ + + GlobalTransactionId gt_oldestXid; /* cluster-wide minimum datfrozenxid */ + GlobalTransactionId gt_xidVacLimit; /* start forcing autovacuums here */ + GlobalTransactionId gt_xidWarnLimit; /* start complaining here */ + GlobalTransactionId gt_xidStopLimit; /* refuse to advance nextXid beyond here */ + GlobalTransactionId gt_xidWrapLimit; /* where the world ends */ + + /* + * These fields are protected by TransArrayLock. 
+ */ + GlobalTransactionId gt_latestCompletedXid; /* newest XID that has committed or + * aborted */ + + GlobalTransactionId gt_recent_global_xmin; + + int32 gt_lastslot; + GTM_TransactionInfo gt_transactions_array[GTM_MAX_GLOBAL_TRANSACTIONS]; + gtm_List *gt_open_transactions; + + GTM_RWLock gt_TransArrayLock; + pg_atomic_uint32 gt_global_xid; + + GlobalTimestamp gt_last_cycle; + GlobalTimestamp gt_global_timestamp; + /* For debug purpose */ + GlobalTimestamp gt_last_issue_timestamp; + GlobalTimestamp gt_last_raw_timestamp; + GlobalTimestamp gt_last_last_cycle; + GlobalTimestamp gt_last_global_timestamp; + GlobalTimestamp gt_last_tv_sec; + GlobalTimestamp gt_last_tv_nsec; + pg_atomic_uint64 gt_access_ts_seq; + pg_atomic_uint64 gt_last_access_ts_seq; + + RW_lock gt_in_locking[GTM_MAX_THREADS]; } GTM_Transactions; -extern GTM_Transactions GTMTransactions; +extern GTM_Transactions GTMTransactions; /* NOTE: This macro should be used with READ lock held on gt_TransArrayLock! */ -#define GTM_CountOpenTransactions() (gtm_list_length(GTMTransactions.gt_open_transactions)) +#define GTM_CountOpenTransactions() (gtm_list_length(GTMTransactions.gt_open_transactions)) /* * Two hash tables will be maintained to quickly find the @@ -198,53 +198,53 @@ bool GTM_IsGXIDInProgress(GlobalTransactionId gxid); /* Transaction Control */ void GTM_InitTxnManager(void); GTM_TransactionHandle GTM_BeginTransaction(GTM_IsolationLevel isolevel, - bool readonly, - const char *global_sessionid); + bool readonly, + const char *global_sessionid); int GTM_BeginTransactionMulti(GTM_IsolationLevel isolevel[], - bool readonly[], - const char *global_sessionid[], - GTMProxy_ConnID connid[], - int txn_count, - GTM_TransactionHandle txns[]); + bool readonly[], + const char *global_sessionid[], + GTMProxy_ConnID connid[], + int txn_count, + GTM_TransactionHandle txns[]); int GTM_RollbackTransaction(GTM_TransactionHandle txn); int GTM_RollbackTransactionMulti(GTM_TransactionHandle txn[], int txn_count, int status[]); int GTM_RollbackTransactionGXID(GlobalTransactionId gxid); int GTM_CommitTransaction(GTM_TransactionHandle txn, - int waited_xid_count, GlobalTransactionId *waited_xids); + int waited_xid_count, GlobalTransactionId *waited_xids); int GTM_CommitTransactionMulti(GTM_TransactionHandle txn[], int txn_count, - int waited_xid_count, GlobalTransactionId *waited_xids, - int status[]); + int waited_xid_count, GlobalTransactionId *waited_xids, + int status[]); int GTM_CommitTransactionGXID(GlobalTransactionId gxid); int GTM_PrepareTransaction(GTM_TransactionHandle txn); int GTM_StartPreparedTransaction(GTM_TransactionHandle txn, - char *gid, - char *nodestring); + char *gid, + char *nodestring); int -GTM_LogTransaction( GlobalTransactionId gxid, - const char *gid, - const char *nodestring, - int node_count, - int isGlobal, - int isCommit, - GlobalTimestamp prepare_ts, - GlobalTimestamp commit_ts); +GTM_LogTransaction( GlobalTransactionId gxid, + const char *gid, + const char *nodestring, + int node_count, + int isGlobal, + int isCommit, + GlobalTimestamp prepare_ts, + GlobalTimestamp commit_ts); int GTM_LogScan(GlobalTransactionId gxid, - const char *nodestring, - GlobalTimestamp start_ts, - GlobalTimestamp local_start_ts, - GlobalTimestamp local_complete_ts, - int scan_type, - const char *rel_name, - int64 scan_number); + const char *nodestring, + GlobalTimestamp start_ts, + GlobalTimestamp local_start_ts, + GlobalTimestamp local_complete_ts, + int scan_type, + const char *rel_name, + int64 scan_number); int 
GTM_StartPreparedTransactionGXID(GlobalTransactionId gxid, - char *gid, - char *nodestring); + char *gid, + char *nodestring); int GTM_GetGIDData(GTM_TransactionHandle prepared_txn, - GlobalTransactionId *prepared_gxid, - char **nodestring); + GlobalTransactionId *prepared_gxid, + char **nodestring); uint32 GTM_GetAllPrepared(GlobalTransactionId gxids[], uint32 gxidcnt); GTM_TransactionStates GTM_GetStatus(GTM_TransactionHandle txn); GTM_TransactionStates GTM_GetStatusGXID(GlobalTransactionId gxid); @@ -254,9 +254,9 @@ uint32 GTMGetFirstClientIdentifier(void); uint32 GTMGetLastClientIdentifier(void); GTM_Snapshot GTM_GetSnapshotData(GTM_TransactionInfo *my_txninfo, - GTM_Snapshot snapshot); + GTM_Snapshot snapshot); GTM_Snapshot GTM_GetTransactionSnapshot(GTM_TransactionHandle handle[], - int txn_count, int *status); + int txn_count, int *status); void GTM_FreeCachedTransInfo(void); void ProcessBeginTransactionCommand(Port *myport, StringInfo message); @@ -267,11 +267,11 @@ ProcessBkupGlobalTimestamp(Port *myport, StringInfo message); void ProcessBkupBeginTransactionCommand(Port *myport, StringInfo message); void GTM_BkupBeginTransactionMulti(GTM_IsolationLevel *isolevel, - bool *readonly, - const char **global_sessionid, - uint32 *client_id, - GTMProxy_ConnID *connid, - int txn_count); + bool *readonly, + const char **global_sessionid, + uint32 *client_id, + GTMProxy_ConnID *connid, + int txn_count); void ProcessBeginTransactionCommandMulti(Port *myport, StringInfo message); void ProcessBeginTransactionGetGXIDCommand(Port *myport, StringInfo message); @@ -299,11 +299,11 @@ void GTM_WriteRestorePointVersion(FILE *f); void GTM_RestoreStart(FILE *ctlf, struct GTM_RestoreContext *context); void GTM_SaveTxnInfo(FILE *ctlf); void GTM_RestoreTxnInfo(FILE *ctlf, GlobalTransactionId next_gxid, - struct GTM_RestoreContext *context, bool force_xid); + struct GTM_RestoreContext *context, bool force_xid); void GTM_BkupBeginTransaction(GTM_IsolationLevel isolevel, - bool readonly, - const char *global_sessionid, - uint32 client_id); + bool readonly, + const char *global_sessionid, + uint32 client_id); void ProcessBkupBeginTransactionGetGXIDCommand(Port *myport, StringInfo message); void ProcessBkupBeginTransactionGetGXIDCommandMulti(Port *myport, StringInfo message); @@ -326,6 +326,7 @@ extern void ProcessFinishGIDTransactionCommand(Port *myport, StringInfo message) void ProcessGetGTSCommand(Port *myport, StringInfo message); void ProcessGetGTSCommandMulti(Port *myport, StringInfo message); void ProcessCheckGTMCommand(Port *myport, StringInfo message); +void ProcessStandbyPreCheckGTMCommand(Port *myport, StringInfo message); #endif #endif From a3c89d525a0f7cb7ad77fec16b800dae1c20a21b Mon Sep 17 00:00:00 2001 From: youngxie Date: Tue, 16 Mar 2021 16:27:34 +0800 Subject: [PATCH 338/578] Converity and memory problem fix. 
--- contrib/pg_clean/pg_clean.c | 11 +- contrib/pgcrypto/pgp-mpi-internal.c | 397 ++++++++-------- contrib/pgxc_ctl/coord_cmd.c | 2 +- contrib/pgxc_ctl/datanode_cmd.c | 2 +- contrib/pgxc_ctl/variables.c | 572 ++++++++++++------------ src/backend/access/common/printtup.c | 1 + src/backend/access/heap/heapam.c | 3 +- src/backend/access/spgist/spgscan.c | 4 + src/backend/access/transam/gtm.c | 15 +- src/backend/access/transam/twophase.c | 4 +- src/backend/access/transam/xlog.c | 4 +- src/backend/audit/audit_fga.c | 3 +- src/backend/catalog/objectaddress.c | 1 + src/backend/catalog/pgxc_class.c | 6 - src/backend/commands/sequence.c | 4 +- src/backend/commands/statscmds.c | 5 + src/backend/contrib/pgcrypto/internal.c | 2 +- src/backend/executor/execUtils.c | 2 + src/backend/libpq/hba.c | 21 + src/backend/libpq/pqcomm.c | 7 +- src/backend/nodes/bitmapset.c | 2 +- src/backend/optimizer/path/clausesel.c | 2 +- src/backend/optimizer/plan/subselect.c | 2 +- src/backend/optimizer/util/pathnode.c | 12 +- src/backend/parser/analyze.c | 2 +- src/backend/parser/parse_utilcmd.c | 3 + src/backend/pgxc/copy/copyops.c | 3 - src/backend/pgxc/locator/locator.c | 2 + src/backend/pgxc/nodemgr/nodemgr.c | 1 + src/backend/pgxc/plan/planner.c | 4 + src/backend/pgxc/pool/execRemote.c | 44 +- src/backend/pgxc/pool/pgxcnode.c | 14 +- src/backend/pgxc/pool/poolcomm.c | 23 +- src/backend/pgxc/pool/poolmgr.c | 67 ++- src/backend/pgxc/squeue/squeue.c | 4 +- src/backend/replication/slotfuncs.c | 4 +- src/backend/storage/file/fd.c | 2 +- src/backend/tcop/postgres.c | 4 +- src/backend/tcop/pquery.c | 2 +- src/backend/tcop/utility.c | 2 +- src/backend/utils/adt/datetime.c | 4 +- src/backend/utils/adt/jsonb_util.c | 6 +- src/backend/utils/adt/network_gist.c | 2 +- src/backend/utils/adt/oid.c | 2 +- src/backend/utils/adt/timestamp.c | 2 +- src/backend/utils/cache/relcryptmap.c | 2 +- src/backend/utils/misc/datamask.c | 4 +- src/backend/utils/misc/guc.c | 3 + src/backend/utils/misc/mls.c | 8 +- src/backend/utils/misc/relcrypt.c | 4 +- src/backend/utils/mmgr/dsa.c | 1 + src/backend/utils/sort/tuplesort.c | 6 + src/bin/initgtm/initgtm.c | 2 +- src/bin/pg_dump/pg_backup_tar.c | 2 +- src/gtm/client/fe-connect.c | 14 +- src/gtm/client/gtm_client.c | 9 +- src/gtm/common/gtm_opt_handler.c | 13 + src/gtm/gtm_ctl/gtm_ctl.c | 5 +- src/gtm/main/gtm_store.c | 14 +- src/gtm/main/gtm_xlog.c | 7 +- src/gtm/main/main.c | 19 +- src/gtm/proxy/proxy_main.c | 5 +- src/gtm/recovery/register_common.c | 7 +- src/gtm/xlog_test/xlog_reader.c | 10 +- src/include/access/xlog.h | 1 + src/include/audit/audit_fga.h | 2 +- src/include/pgxc/pgxcnode.h | 2 +- src/interfaces/ecpg/ecpglib/execute.c | 5 +- src/interfaces/ecpg/preproc/ecpg.c | 2 +- src/interfaces/libpq/fe-auth.c | 2 - src/interfaces/libpq/fe-connect.c | 4 +- src/interfaces/libpq/fe-protocol2.c | 2 - src/pl/plpgsql/src/pl_exec.c | 1 + src/timezone/localtime.c | 2 - 74 files changed, 796 insertions(+), 643 deletions(-) diff --git a/contrib/pg_clean/pg_clean.c b/contrib/pg_clean/pg_clean.c index e31394c4..8d1514f4 100644 --- a/contrib/pg_clean/pg_clean.c +++ b/contrib/pg_clean/pg_clean.c @@ -83,6 +83,7 @@ PG_MODULE_MAGIC; #define GET_READONLY "readonly" #define GIDSIZE (200 + 24) #define MAX_TWOPC_TXN 1000 +#define STRING_BUFF_LEN 1024 #define MAX_CMD_LENGTH 120 @@ -1302,7 +1303,7 @@ database_info *add_database_info(char *database_name) { database_info *rv; HASHCTL txn_ctl; - char tabname[MAX_GID]; + char tabname[STRING_BUFF_LEN]; if ((rv = find_database_info(database_name)) != NULL) return rv; /* 
Already in the list */ @@ -1322,7 +1323,7 @@ database_info *add_database_info(char *database_name) rv->last_txn_info = NULL; #endif - snprintf(tabname, 64, "%s txn info", rv->database_name); + snprintf(tabname, STRING_BUFF_LEN, "%s txn info", rv->database_name); txn_ctl.keysize = MAX_GID; txn_ctl.entrysize = sizeof(txn_info); rv->all_txn_info = hash_create(tabname, 64, @@ -1342,7 +1343,7 @@ database_info *add_database_info(char *database_name) int find_node_index(Oid node_oid) { - int res; + int res = -1; int i; if (get_pgxc_nodetype(node_oid) == 'C') { @@ -2078,12 +2079,12 @@ Datum pgxc_clear_2pc_records(PG_FUNCTION_ARGS) /*collect the 2pc file in nodes*/ for (i = 0; i < cn_nodes_num; i++) { - execute_query_on_single_node(cn_node_list[i], query, 1, result+i); + (void) execute_query_on_single_node(cn_node_list[i], query, 1, result+i); } for (i = 0; i < dn_nodes_num; i++) { - execute_query_on_single_node(dn_node_list[i], query, 1, result+cn_nodes_num+i); + (void) execute_query_on_single_node(dn_node_list[i], query, 1, result+cn_nodes_num+i); } /*get all database info*/ getDatabaseList(); diff --git a/contrib/pgcrypto/pgp-mpi-internal.c b/contrib/pgcrypto/pgp-mpi-internal.c index 9420d678..e7c8637e 100644 --- a/contrib/pgcrypto/pgp-mpi-internal.c +++ b/contrib/pgcrypto/pgp-mpi-internal.c @@ -1,6 +1,6 @@ /* * pgp-mpi-internal.c - * OpenPGP MPI functions. + * OpenPGP MPI functions. * * Copyright (c) 2005 Marko Kreen * All rights reserved. @@ -9,10 +9,10 @@ * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. + * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -38,19 +38,22 @@ static mpz_t * mp_new() { - mpz_t *mp = mp_int_alloc(); + mpz_t *mp = mp_int_alloc(); - mp_int_init_size(mp, 256); - return mp; + if (mp_int_init_size(mp, 256)) + { + return NULL; + } + return mp; } static void mp_clear_free(mpz_t *a) { - if (!a) - return; - /* fixme: no clear? */ - mp_int_free(a); + if (!a) + return; + /* fixme: no clear? 
*/ + mp_int_free(a); } @@ -58,86 +61,86 @@ static int mp_px_rand(uint32 bits, mpz_t *res) { #ifdef HAVE_STRONG_RANDOM - unsigned bytes = (bits + 7) / 8; - int last_bits = bits & 7; - uint8 *buf; - - buf = px_alloc(bytes); - if (!pg_strong_random((char *) buf, bytes)) - { - px_free(buf); - return PXE_NO_RANDOM; - } - - /* clear unnecessary bits and set last bit to one */ - if (last_bits) - { - buf[0] >>= 8 - last_bits; - buf[0] |= 1 << (last_bits - 1); - } - else - buf[0] |= 1 << 7; - - mp_int_read_unsigned(res, buf, bytes); - - px_free(buf); - - return 0; + unsigned bytes = (bits + 7) / 8; + int last_bits = bits & 7; + uint8 *buf; + + buf = px_alloc(bytes); + if (!pg_strong_random((char *) buf, bytes)) + { + px_free(buf); + return PXE_NO_RANDOM; + } + + /* clear unnecessary bits and set last bit to one */ + if (last_bits) + { + buf[0] >>= 8 - last_bits; + buf[0] |= 1 << (last_bits - 1); + } + else + buf[0] |= 1 << 7; + + mp_int_read_unsigned(res, buf, bytes); + + px_free(buf); + + return 0; #else - return PXE_NO_RANDOM; + return PXE_NO_RANDOM; #endif } static void mp_modmul(mpz_t *a, mpz_t *b, mpz_t *p, mpz_t *res) { - mpz_t *tmp = mp_new(); + mpz_t *tmp = mp_new(); - mp_int_mul(a, b, tmp); - mp_int_mod(tmp, p, res); - mp_clear_free(tmp); + mp_int_mul(a, b, tmp); + mp_int_mod(tmp, p, res); + mp_clear_free(tmp); } static mpz_t * mpi_to_bn(PGP_MPI *n) { - mpz_t *bn = mp_new(); - - mp_int_read_unsigned(bn, n->data, n->bytes); - - if (!bn) - return NULL; - if (mp_int_count_bits(bn) != n->bits) - { - px_debug("mpi_to_bn: bignum conversion failed: mpi=%d, bn=%d", - n->bits, mp_int_count_bits(bn)); - mp_clear_free(bn); - return NULL; - } - return bn; + mpz_t *bn = mp_new(); + + mp_int_read_unsigned(bn, n->data, n->bytes); + + if (!bn) + return NULL; + if (mp_int_count_bits(bn) != n->bits) + { + px_debug("mpi_to_bn: bignum conversion failed: mpi=%d, bn=%d", + n->bits, mp_int_count_bits(bn)); + mp_clear_free(bn); + return NULL; + } + return bn; } static PGP_MPI * bn_to_mpi(mpz_t *bn) { - int res; - PGP_MPI *n; - int bytes; - - res = pgp_mpi_alloc(mp_int_count_bits(bn), &n); - if (res < 0) - return NULL; - - bytes = (mp_int_count_bits(bn) + 7) / 8; - if (bytes != n->bytes) - { - px_debug("bn_to_mpi: bignum conversion failed: bn=%d, mpi=%d", - bytes, n->bytes); - pgp_mpi_free(n); - return NULL; - } - mp_int_to_unsigned(bn, n->data, n->bytes); - return n; + int res; + PGP_MPI *n; + int bytes; + + res = pgp_mpi_alloc(mp_int_count_bits(bn), &n); + if (res < 0) + return NULL; + + bytes = (mp_int_count_bits(bn) + 7) / 8; + if (bytes != n->bytes) + { + px_debug("bn_to_mpi: bignum conversion failed: bn=%d, mpi=%d", + bytes, n->bytes); + pgp_mpi_free(n); + return NULL; + } + mp_int_to_unsigned(bn, n->data, n->bytes); + return n; } /* @@ -158,152 +161,152 @@ bn_to_mpi(mpz_t *bn) static int decide_k_bits(int p_bits) { - if (p_bits <= 5120) - return p_bits / 10 + 160; - else - return (p_bits / 8 + 200) * 3 / 2; + if (p_bits <= 5120) + return p_bits / 10 + 160; + else + return (p_bits / 8 + 200) * 3 / 2; } int pgp_elgamal_encrypt(PGP_PubKey *pk, PGP_MPI *_m, - PGP_MPI **c1_p, PGP_MPI **c2_p) + PGP_MPI **c1_p, PGP_MPI **c2_p) { - int res = PXE_PGP_MATH_FAILED; - int k_bits; - mpz_t *m = mpi_to_bn(_m); - mpz_t *p = mpi_to_bn(pk->pub.elg.p); - mpz_t *g = mpi_to_bn(pk->pub.elg.g); - mpz_t *y = mpi_to_bn(pk->pub.elg.y); - mpz_t *k = mp_new(); - mpz_t *yk = mp_new(); - mpz_t *c1 = mp_new(); - mpz_t *c2 = mp_new(); - - if (!m || !p || !g || !y || !k || !yk || !c1 || !c2) - goto err; - - /* - * generate k - */ - k_bits 
= decide_k_bits(mp_int_count_bits(p)); - res = mp_px_rand(k_bits, k); - if (res < 0) - return res; - - /* - * c1 = g^k c2 = m * y^k - */ - mp_int_exptmod(g, k, p, c1); - mp_int_exptmod(y, k, p, yk); - mp_modmul(m, yk, p, c2); - - /* result */ - *c1_p = bn_to_mpi(c1); - *c2_p = bn_to_mpi(c2); - if (*c1_p && *c2_p) - res = 0; + int res = PXE_PGP_MATH_FAILED; + int k_bits; + mpz_t *m = mpi_to_bn(_m); + mpz_t *p = mpi_to_bn(pk->pub.elg.p); + mpz_t *g = mpi_to_bn(pk->pub.elg.g); + mpz_t *y = mpi_to_bn(pk->pub.elg.y); + mpz_t *k = mp_new(); + mpz_t *yk = mp_new(); + mpz_t *c1 = mp_new(); + mpz_t *c2 = mp_new(); + + if (!m || !p || !g || !y || !k || !yk || !c1 || !c2) + goto err; + + /* + * generate k + */ + k_bits = decide_k_bits(mp_int_count_bits(p)); + res = mp_px_rand(k_bits, k); + if (res < 0) + return res; + + /* + * c1 = g^k c2 = m * y^k + */ + mp_int_exptmod(g, k, p, c1); + mp_int_exptmod(y, k, p, yk); + mp_modmul(m, yk, p, c2); + + /* result */ + *c1_p = bn_to_mpi(c1); + *c2_p = bn_to_mpi(c2); + if (*c1_p && *c2_p) + res = 0; err: - mp_clear_free(c2); - mp_clear_free(c1); - mp_clear_free(yk); - mp_clear_free(k); - mp_clear_free(y); - mp_clear_free(g); - mp_clear_free(p); - mp_clear_free(m); - return res; + mp_clear_free(c2); + mp_clear_free(c1); + mp_clear_free(yk); + mp_clear_free(k); + mp_clear_free(y); + mp_clear_free(g); + mp_clear_free(p); + mp_clear_free(m); + return res; } int pgp_elgamal_decrypt(PGP_PubKey *pk, PGP_MPI *_c1, PGP_MPI *_c2, - PGP_MPI **msg_p) + PGP_MPI **msg_p) { - int res = PXE_PGP_MATH_FAILED; - mpz_t *c1 = mpi_to_bn(_c1); - mpz_t *c2 = mpi_to_bn(_c2); - mpz_t *p = mpi_to_bn(pk->pub.elg.p); - mpz_t *x = mpi_to_bn(pk->sec.elg.x); - mpz_t *c1x = mp_new(); - mpz_t *div = mp_new(); - mpz_t *m = mp_new(); - - if (!c1 || !c2 || !p || !x || !c1x || !div || !m) - goto err; - - /* - * m = c2 / (c1^x) - */ - mp_int_exptmod(c1, x, p, c1x); - mp_int_invmod(c1x, p, div); - mp_modmul(c2, div, p, m); - - /* result */ - *msg_p = bn_to_mpi(m); - if (*msg_p) - res = 0; + int res = PXE_PGP_MATH_FAILED; + mpz_t *c1 = mpi_to_bn(_c1); + mpz_t *c2 = mpi_to_bn(_c2); + mpz_t *p = mpi_to_bn(pk->pub.elg.p); + mpz_t *x = mpi_to_bn(pk->sec.elg.x); + mpz_t *c1x = mp_new(); + mpz_t *div = mp_new(); + mpz_t *m = mp_new(); + + if (!c1 || !c2 || !p || !x || !c1x || !div || !m) + goto err; + + /* + * m = c2 / (c1^x) + */ + mp_int_exptmod(c1, x, p, c1x); + mp_int_invmod(c1x, p, div); + mp_modmul(c2, div, p, m); + + /* result */ + *msg_p = bn_to_mpi(m); + if (*msg_p) + res = 0; err: - mp_clear_free(m); - mp_clear_free(div); - mp_clear_free(c1x); - mp_clear_free(x); - mp_clear_free(p); - mp_clear_free(c2); - mp_clear_free(c1); - return res; + mp_clear_free(m); + mp_clear_free(div); + mp_clear_free(c1x); + mp_clear_free(x); + mp_clear_free(p); + mp_clear_free(c2); + mp_clear_free(c1); + return res; } int pgp_rsa_encrypt(PGP_PubKey *pk, PGP_MPI *_m, PGP_MPI **c_p) { - int res = PXE_PGP_MATH_FAILED; - mpz_t *m = mpi_to_bn(_m); - mpz_t *e = mpi_to_bn(pk->pub.rsa.e); - mpz_t *n = mpi_to_bn(pk->pub.rsa.n); - mpz_t *c = mp_new(); - - if (!m || !e || !n || !c) - goto err; - - /* - * c = m ^ e - */ - mp_int_exptmod(m, e, n, c); - - *c_p = bn_to_mpi(c); - if (*c_p) - res = 0; + int res = PXE_PGP_MATH_FAILED; + mpz_t *m = mpi_to_bn(_m); + mpz_t *e = mpi_to_bn(pk->pub.rsa.e); + mpz_t *n = mpi_to_bn(pk->pub.rsa.n); + mpz_t *c = mp_new(); + + if (!m || !e || !n || !c) + goto err; + + /* + * c = m ^ e + */ + mp_int_exptmod(m, e, n, c); + + *c_p = bn_to_mpi(c); + if (*c_p) + res = 0; err: - mp_clear_free(c); - 
mp_clear_free(n); - mp_clear_free(e); - mp_clear_free(m); - return res; + mp_clear_free(c); + mp_clear_free(n); + mp_clear_free(e); + mp_clear_free(m); + return res; } int pgp_rsa_decrypt(PGP_PubKey *pk, PGP_MPI *_c, PGP_MPI **m_p) { - int res = PXE_PGP_MATH_FAILED; - mpz_t *c = mpi_to_bn(_c); - mpz_t *d = mpi_to_bn(pk->sec.rsa.d); - mpz_t *n = mpi_to_bn(pk->pub.rsa.n); - mpz_t *m = mp_new(); - - if (!m || !d || !n || !c) - goto err; - - /* - * m = c ^ d - */ - mp_int_exptmod(c, d, n, m); - - *m_p = bn_to_mpi(m); - if (*m_p) - res = 0; + int res = PXE_PGP_MATH_FAILED; + mpz_t *c = mpi_to_bn(_c); + mpz_t *d = mpi_to_bn(pk->sec.rsa.d); + mpz_t *n = mpi_to_bn(pk->pub.rsa.n); + mpz_t *m = mp_new(); + + if (!m || !d || !n || !c) + goto err; + + /* + * m = c ^ d + */ + mp_int_exptmod(c, d, n, m); + + *m_p = bn_to_mpi(m); + if (*m_p) + res = 0; err: - mp_clear_free(m); - mp_clear_free(n); - mp_clear_free(d); - mp_clear_free(c); - return res; + mp_clear_free(m); + mp_clear_free(n); + mp_clear_free(d); + mp_clear_free(c); + return res; } diff --git a/contrib/pgxc_ctl/coord_cmd.c b/contrib/pgxc_ctl/coord_cmd.c index 0df40e89..f45eb417 100644 --- a/contrib/pgxc_ctl/coord_cmd.c +++ b/contrib/pgxc_ctl/coord_cmd.c @@ -1715,7 +1715,7 @@ int add_coordinatorSlave(char *name, char *host, int port, int pooler_port, char "# archive_command = 'rsync %%p %s@%s:%s/%%f'\n" "max_wal_senders = %d\n" "# End of Addition\n", - timeStampString(date, MAXPATH), + timeStampString(date, MAXTOKEN+1), sval(VAR_pgxcUser), host, archDir, getDefaultWalSender(TRUE)); pclose(f); diff --git a/contrib/pgxc_ctl/datanode_cmd.c b/contrib/pgxc_ctl/datanode_cmd.c index 8b6326aa..a0d6636e 100644 --- a/contrib/pgxc_ctl/datanode_cmd.c +++ b/contrib/pgxc_ctl/datanode_cmd.c @@ -1436,7 +1436,7 @@ int add_datanodeSlave(char *name, char *host, int port, int pooler, char *dir, fprintf(f, "#================================================\n" "# Additional entry by adding the slave, %s\n", - timeStampString(date, MAXPATH)); + timeStampString(date, MAXTOKEN+1)); for (kk = 0; aval(VAR_datanodePgHbaEntries)[kk]; kk++) { diff --git a/contrib/pgxc_ctl/variables.c b/contrib/pgxc_ctl/variables.c index 9ed61ddb..f8bd8917 100644 --- a/contrib/pgxc_ctl/variables.c +++ b/contrib/pgxc_ctl/variables.c @@ -24,367 +24,367 @@ static void clear_var(pgxc_ctl_var *var); */ static int hash_val(char *name) { - unsigned char *name_u = (unsigned char *)name; - unsigned char v; - - for(v = 0; *name_u; name_u++) - v += *name_u; - return (v%NUM_HASH_BUCKET); + unsigned char *name_u = (unsigned char *)name; + unsigned char v; + + for(v = 0; *name_u; name_u++) + v += *name_u; + return (v%NUM_HASH_BUCKET); } #define LIMIT_TO_DOUBLE 128 #define INCR_OVER_DOUBLE 10 static int next_size(int sz) { - if (sz <= 0) - return 1; - if (sz <= LIMIT_TO_DOUBLE) - return sz*2; - else - return sz + INCR_OVER_DOUBLE; + if (sz <= 0) + return 1; + if (sz <= LIMIT_TO_DOUBLE) + return sz*2; + else + return sz + INCR_OVER_DOUBLE; } void init_var_hash() { - int i; + int i; - for (i = 0; i < NUM_HASH_BUCKET; i++) - { - var_hash[i].el_size = 1; - var_hash[i].el_used = 0; - var_hash[i].el = (pgxc_ctl_var **)Malloc(sizeof(pgxc_ctl_var *)); - var_hash[i].el[0] = NULL; - } + for (i = 0; i < NUM_HASH_BUCKET; i++) + { + var_hash[i].el_size = 1; + var_hash[i].el_used = 0; + var_hash[i].el = (pgxc_ctl_var **)Malloc(sizeof(pgxc_ctl_var *)); + var_hash[i].el[0] = NULL; + } } static void remove_from_hash(pgxc_ctl_var *var) { - int hash_v = hash_val(var->varname); - int ii, jj; + int hash_v = 
hash_val(var->varname); + int ii, jj; - for(ii = 0; var_hash[hash_v].el[ii]; ii++) - { - if (var_hash[hash_v].el[ii] != var) - continue; - else - { - for(jj = ii; var_hash[hash_v].el[jj]; jj++) - var_hash[hash_v].el[jj] = var_hash[hash_v].el[jj + 1]; - var_hash[hash_v].el_used--; - return; - } - } - return; + for(ii = 0; var_hash[hash_v].el[ii]; ii++) + { + if (var_hash[hash_v].el[ii] != var) + continue; + else + { + for(jj = ii; var_hash[hash_v].el[jj]; jj++) + var_hash[hash_v].el[jj] = var_hash[hash_v].el[jj + 1]; + var_hash[hash_v].el_used--; + return; + } + } + return; } void add_var_hash(pgxc_ctl_var *var) { - int hash_v = hash_val(var->varname); - if (var_hash[hash_v].el_used + 1 >= var_hash[hash_v].el_size) - { - var_hash[hash_v].el_size = next_size(var_hash[hash_v].el_size); - var_hash[hash_v].el = (pgxc_ctl_var **)Realloc(var_hash[hash_v].el, sizeof(pgxc_ctl_var *) * var_hash[hash_v].el_size); - } - var_hash[hash_v].el[var_hash[hash_v].el_used++] = var; - var_hash[hash_v].el[var_hash[hash_v].el_used] = NULL; + int hash_v = hash_val(var->varname); + if (var_hash[hash_v].el_used + 1 >= var_hash[hash_v].el_size) + { + var_hash[hash_v].el_size = next_size(var_hash[hash_v].el_size); + var_hash[hash_v].el = (pgxc_ctl_var **)Realloc(var_hash[hash_v].el, sizeof(pgxc_ctl_var *) * var_hash[hash_v].el_size); + } + var_hash[hash_v].el[var_hash[hash_v].el_used++] = var; + var_hash[hash_v].el[var_hash[hash_v].el_used] = NULL; } pgxc_ctl_var *new_var(char *name) { - pgxc_ctl_var *newv; - - if (find_var(name)) - { - elog(ERROR, "ERROR: Variable %s already defined. Check your configuration.\n", name); - return NULL; - } - - newv = (pgxc_ctl_var *)Malloc(sizeof(pgxc_ctl_var)); - if (var_head == NULL) - { - var_head = var_tail = newv; - newv->prev = NULL; - } - else - { - newv->prev = var_tail; - var_tail->next = newv; - var_tail = newv; - } - newv->next = NULL; - newv->varname = Strdup(name); - newv->val_size = 1; - newv->val_used = 0; - newv->val = (char **)Malloc(sizeof(char *)); - newv->val[0] = NULL; - add_var_hash(newv); - return(newv); + pgxc_ctl_var *newv; + + if (find_var(name)) + { + elog(ERROR, "ERROR: Variable %s already defined. 
Check your configuration.\n", name); + return NULL; + } + + newv = (pgxc_ctl_var *)Malloc(sizeof(pgxc_ctl_var)); + if (var_head == NULL) + { + var_head = var_tail = newv; + newv->prev = NULL; + } + else + { + newv->prev = var_tail; + var_tail->next = newv; + var_tail = newv; + } + newv->next = NULL; + newv->varname = Strdup(name); + newv->val_size = 1; + newv->val_used = 0; + newv->val = (char **)Malloc(sizeof(char *)); + newv->val[0] = NULL; + add_var_hash(newv); + return(newv); } void remove_var(pgxc_ctl_var *var) { - if ((var_head == var_tail) && (var_head == var)) - var_head = var_tail = NULL; - else if (var_head == var) - { - var_head = var_head->next; - var_head->prev = NULL; - } - else if (var_tail == var) - { - var_tail->next = NULL; - var_tail = var_tail->prev; - } - else - { - var->prev->next = var->next; - var->next->prev = var->prev; - } - clear_var(var); + if ((var_head == var_tail) && (var_head == var)) + var_head = var_tail = NULL; + else if (var_head == var) + { + var_head = var_head->next; + var_head->prev = NULL; + } + else if (var_tail == var) + { + var_tail->next = NULL; + var_tail = var_tail->prev; + } + else + { + var->prev->next = var->next; + var->next->prev = var->prev; + } + clear_var(var); } static void clear_var(pgxc_ctl_var *var) { - int ii; + int ii; - remove_from_hash(var); - for (ii = 0; var->val[ii]; ii++) - free(var->val[ii]); - free(var->varname); - free(var); - -} + remove_from_hash(var); + for (ii = 0; var->val[ii]; ii++) + free(var->val[ii]); + free(var->varname); + free(var); + +} void add_val(pgxc_ctl_var *var, char *val) { - if (var->val_size <= var->val_used+1) - { - var->val_size = next_size(var->val_size); - var->val = (char **)Realloc(var->val, sizeof(char *)*var->val_size); - } - var->val[var->val_used++] = Strdup(val); - var->val[var->val_used] = NULL; + if (var->val_size <= var->val_used+1) + { + var->val_size = next_size(var->val_size); + var->val = (char **)Realloc(var->val, sizeof(char *)*var->val_size); + } + var->val[var->val_used++] = Strdup(val); + var->val[var->val_used] = NULL; } void add_val_name(char *name, char *val) { - pgxc_ctl_var *var; - if (!(var = find_var(name))) - return; - add_val(var, name); - return; + pgxc_ctl_var *var; + if (!(var = find_var(name))) + return; + add_val(var, name); + return; } pgxc_ctl_var *find_var(char *name) { - pgxc_var_hash *hash = &var_hash[hash_val(name)]; - int i; + pgxc_var_hash *hash = &var_hash[hash_val(name)]; + int i; - for (i = 0; i < hash->el_used; i++) - { - if (strcmp(hash->el[i]->varname, name) == 0) - return hash->el[i]; - } - return NULL; + for (i = 0; i < hash->el_used; i++) + { + if (strcmp(hash->el[i]->varname, name) == 0) + return hash->el[i]; + } + return NULL; } char *sval(char *name) { - pgxc_ctl_var *var = find_var(name); - if (!var) - return NULL; - return var->val[0]; + pgxc_ctl_var *var = find_var(name); + if (!var) + return NULL; + return var->val[0]; } char **aval(char *name) { - pgxc_ctl_var *var = find_var(name); - if (!var) - return NULL; - return var->val; + pgxc_ctl_var *var = find_var(name); + if (!var) + return NULL; + return var->val; } void reset_value(pgxc_ctl_var *var) { - int i; - for (i = 0; var->val[i]; i++) - { - Free (var->val[i]); - var->val[i] = NULL; - } - var->val_used = 0; + int i; + for (i = 0; var->val[i]; i++) + { + Free (var->val[i]); + var->val[i] = NULL; + } + var->val_used = 0; } void assign_val(char *destName, char *srcName) { - pgxc_ctl_var *dest = find_var(destName); - pgxc_ctl_var *src = find_var(srcName); - int ii; + pgxc_ctl_var *dest = 
find_var(destName); + pgxc_ctl_var *src = find_var(srcName); + int ii; - reset_value(dest); - for (ii = 0; ii < src->val_used; ii++) - add_val(dest, src->val[ii]); + reset_value(dest); + for (ii = 0; ii < src->val_used; ii++) + add_val(dest, src->val[ii]); } void assign_sval(char *destName, char *val) { - pgxc_ctl_var *dest = find_var(destName); + pgxc_ctl_var *dest = find_var(destName); - reset_value(dest); - add_val(dest, val); + reset_value(dest); + add_val(dest, val); } void reset_var(char *name) { - confirm_var(name); - reset_value(find_var(name)); + confirm_var(name); + reset_value(find_var(name)); } void reset_var_val(char *name, char *val) { - reset_var(name); - add_val(find_var(name), val); + reset_var(name); + add_val(find_var(name), val); } pgxc_ctl_var *confirm_var(char *name) { - pgxc_ctl_var *rc; - if ((rc = find_var(name))) - return rc; - return new_var(name); + pgxc_ctl_var *rc; + if ((rc = find_var(name))) + return rc; + return new_var(name); } void print_vars(void) { - pgxc_ctl_var *cur; + pgxc_ctl_var *cur; - lockLogFile(); - for(cur = var_head; cur; cur=cur->next) - print_var(cur->varname); - unlockLogFile(); + lockLogFile(); + for(cur = var_head; cur; cur=cur->next) + print_var(cur->varname); + unlockLogFile(); } void print_var(char *vname) { - pgxc_ctl_var *var; - char outBuf[MAXLINE + 1]; - - outBuf[0] = 0; - if ((var = find_var(vname)) == NULL) - { - elog(ERROR, "ERROR: Variable %s not found.\n", vname); - return; - } - else - { - char **curv; - char editbuf[MAXPATH]; - - snprintf(editbuf, MAXPATH, "%s (", vname); - strncat(outBuf, editbuf, MAXLINE); - for (curv=var->val; *curv; curv++) - { - snprintf(editbuf, MAXPATH, " \"%s\" ", *curv); - strncat(outBuf, editbuf, MAXLINE); - } - strncat(outBuf, ")", MAXLINE); - elog(NOTICE, "%s\n", outBuf); - } - + pgxc_ctl_var *var; + char outBuf[MAXLINE + 1]; + + outBuf[0] = 0; + if ((var = find_var(vname)) == NULL) + { + elog(ERROR, "ERROR: Variable %s not found.\n", vname); + return; + } + else + { + char **curv; + char editbuf[MAXPATH]; + + snprintf(editbuf, MAXPATH, "%s (", vname); + strncat(outBuf, editbuf, MAXLINE); + for (curv=var->val; *curv; curv++) + { + snprintf(editbuf, MAXPATH, " \"%s\" ", *curv); + strncat(outBuf, editbuf, MAXLINE); + } + strncat(outBuf, ")", MAXLINE); + elog(NOTICE, "%s\n", outBuf); + } + } void log_var(char *varname) { - if (logFile) - print_var(varname); + if (logFile) + print_var(varname); } int arraySizeName(char *name) { - pgxc_ctl_var *var; + pgxc_ctl_var *var; - if ((var = find_var(name)) == NULL) - return -1; - return(arraySize(var)); + if ((var = find_var(name)) == NULL) + return -1; + return(arraySize(var)); } int arraySize(pgxc_ctl_var *var) { - return var->val_used; + return var->val_used; } char **add_member(char **array, char *val) { - char **rv; - int ii; + char **rv; + int ii; - for (ii = 0; array[ii]; ii++); - rv = Realloc(array, sizeof(char *) * (ii + 2)); - rv[ii] = Strdup(val); - rv[ii+1] = NULL; - return(rv); + for (ii = 0; array[ii]; ii++); + rv = Realloc(array, sizeof(char *) * (ii + 2)); + rv[ii] = Strdup(val); + rv[ii+1] = NULL; + return(rv); } void clean_array(char **array) { - int ii; - if (array) - { - for(ii = 0; array[ii]; ii++) - Free(array[ii]); - Free(array); - } + int ii; + if (array) + { + for(ii = 0; array[ii]; ii++) + Free(array[ii]); + Free(array); + } } void var_assign(char **dest, char *src) { - Free(*dest); - *dest = src; + Free(*dest); + *dest = src; } char *listValue(char *name) { - pgxc_ctl_var *dest; - int ii; - char *buf; + pgxc_ctl_var *dest; + int 
ii; + char *buf; - if ((dest = find_var(name)) == NULL) - return Strdup(""); - buf = Malloc(MAXLINE+1); - buf[0]=0; - for(ii = 0; ii < dest->val_used; ii++) - { - strncat(buf, dest->val[ii], MAXLINE); - strncat(buf, " ", MAXLINE); - } - return buf; + if ((dest = find_var(name)) == NULL) + return Strdup(""); + buf = Malloc(MAXLINE+1); + buf[0]=0; + for(ii = 0; ii < dest->val_used; ii++) + { + strncat(buf, dest->val[ii], MAXLINE); + strncat(buf, " ", MAXLINE); + } + return buf; } int ifExists(char *name, char *value) { - pgxc_ctl_var *var = find_var(name); - int ii; + pgxc_ctl_var *var = find_var(name); + int ii; - if (!var) - return FALSE; - for (ii = 0; ii < var->val_used; ii++) - if (strcmp((var->val)[ii], value) == 0) - return TRUE; - return FALSE; + if (!var) + return FALSE; + for (ii = 0; ii < var->val_used; ii++) + if (strcmp((var->val)[ii], value) == 0) + return TRUE; + return FALSE; } - + int IfExists(char *name, char *value) { - pgxc_ctl_var *var = find_var(name); - int ii; + pgxc_ctl_var *var = find_var(name); + int ii; - if (!var) - return FALSE; - for (ii = 0; ii < var->val_used; ii++) - if (strcasecmp((var->val)[ii], value) == 0) - return TRUE; - return FALSE; + if (!var) + return FALSE; + for (ii = 0; ii < var->val_used; ii++) + if (strcasecmp((var->val)[ii], value) == 0) + return TRUE; + return FALSE; } /* @@ -393,39 +393,39 @@ int IfExists(char *name, char *value) */ int extendVar(char *name, int newSize, char *def_value) { - pgxc_ctl_var *target; - char **old_val; - int old_size; - int ii; - - if ((target = find_var(name)) == NULL) - return -1; - if (def_value == NULL) - def_value = "none"; - - /* - * If the allocated array is not already big enough to store newSize + 1 - * elements, we must extend it newSize + 1 - */ - if (target->val_size <= newSize) - { - old_val = target->val; - old_size = target->val_size; - target->val = Malloc0(sizeof(char *) * (newSize + 1)); - memcpy(target->val, old_val, sizeof(char *) * old_size); - target->val_size = newSize + 1; - Free(old_val); - } - - for (ii = target->val_used; ii < newSize; ii++) - (target->val)[ii] = Strdup(def_value); - - /* Store NULL in the last element to mark the end-of-array */ - (target->val)[newSize] = NULL; - if (target->val_used < newSize) - target->val_used = newSize; - - return 0; + pgxc_ctl_var *target; + char **old_val; + int old_size; + int ii; + + if ((target = find_var(name)) == NULL) + return -1; + if (def_value == NULL) + def_value = "none"; + + /* + * If the allocated array is not already big enough to store newSize + 1 + * elements, we must extend it newSize + 1 + */ + if (target->val_size <= newSize) + { + old_val = target->val; + old_size = target->val_size; + target->val = Malloc0(sizeof(char *) * (newSize + 1)); + memcpy(target->val, old_val, sizeof(char *) * old_size); + target->val_size = newSize + 1; + Free(old_val); + } + + for (ii = target->val_used; ii < newSize; ii++) + (target->val)[ii] = Strdup(def_value); + + /* Store NULL in the last element to mark the end-of-array */ + (target->val)[newSize] = NULL; + if (target->val_used < newSize) + target->val_used = newSize; + + return 0; } @@ -434,40 +434,40 @@ int extendVar(char *name, int newSize, char *def_value) * Returns *val if success, NULL if failed */ void assign_arrayEl_internal(char *name, int idx, char *val, char *pad, - int extend) + int extend) { - pgxc_ctl_var *var = confirm_var(name); + pgxc_ctl_var *var = confirm_var(name); - if (pad == NULL) - pad = "none"; - /* - * Pad if needed - */ - if (extend) - extendVar(name, idx+1, pad); 
- Free(var->val[idx]); - var->val[idx] = Strdup(val); + if (pad == NULL) + pad = "none"; + /* + * Pad if needed + */ + if (extend) + (void) extendVar(name, idx+1, pad); + Free(var->val[idx]); + var->val[idx] = Strdup(val); } void assign_arrayEl(char *name, int idx, char *val, char *pad) { - return assign_arrayEl_internal(name, idx, val, pad, TRUE); + return assign_arrayEl_internal(name, idx, val, pad, TRUE); } void replace_arrayEl(char *name, int idx, char *val, char *pad) { - return assign_arrayEl_internal(name, idx, val, pad, FALSE); + return assign_arrayEl_internal(name, idx, val, pad, FALSE); } int doesExist(char *name, int idx) { - pgxc_ctl_var *var; + pgxc_ctl_var *var; - if (name == NULL) - return 0; - if ((var = find_var(name)) == NULL) - return 0; - if (var->val_used <= idx) - return 0; - return 1; + if (name == NULL) + return 0; + if ((var = find_var(name)) == NULL) + return 0; + if (var->val_used <= idx) + return 0; + return 1; } diff --git a/src/backend/access/common/printtup.c b/src/backend/access/common/printtup.c index dfd64707..c72fb1af 100644 --- a/src/backend/access/common/printtup.c +++ b/src/backend/access/common/printtup.c @@ -449,6 +449,7 @@ printtup(TupleTableSlot *slot, DestReceiver *self) { int len = strlen(outputstr); #ifdef __TBASE__ + int len = strlen(outputstr); if (slot->tts_tupleDescriptor->attrs[i]->atttypid == RECORDOID && self->mydest == DestRemoteExecute) { Oid tupType; diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index b0129032..db17aec9 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -3827,6 +3827,7 @@ heap_delete(Relation relation, ItemPointer tid, if (RelationNeedsWAL(relation)) { xl_heap_delete xlrec; + xl_heap_header xlhdr; XLogRecPtr recptr; /* For logical decode we need combocids to properly decode the catalog */ @@ -3860,8 +3861,6 @@ heap_delete(Relation relation, ItemPointer tid, #ifdef __STORAGE_SCALABLE__ HeapTuple tup; #endif - xl_heap_header xlhdr; - xlhdr.t_infomask2 = old_key_tuple->t_data->t_infomask2; xlhdr.t_infomask = old_key_tuple->t_data->t_infomask; xlhdr.t_hoff = old_key_tuple->t_data->t_hoff; diff --git a/src/backend/access/spgist/spgscan.c b/src/backend/access/spgist/spgscan.c index 9d0dfda6..29519f67 100644 --- a/src/backend/access/spgist/spgscan.c +++ b/src/backend/access/spgist/spgscan.c @@ -590,6 +590,10 @@ storeGettuple(SpGistScanOpaque so, ItemPointer heapPtr, so->recheck[so->nPtrs] = recheck; if (so->want_itup) { + if (so->indexTupDesc->natts != 1) + ereport(ERROR, + (errcode(ERRCODE_DATATYPE_MISMATCH), + errmsg("wrong natts in indexTupDesc."))); /* * Reconstruct index data. We have to copy the datum out of the temp * context anyway, so we may as well create the tuple here. 
diff --git a/src/backend/access/transam/gtm.c b/src/backend/access/transam/gtm.c index daf77a90..1e6c908b 100644 --- a/src/backend/access/transam/gtm.c +++ b/src/backend/access/transam/gtm.c @@ -1430,12 +1430,17 @@ CloseGTM(void) #ifdef __SUPPORT_DISTRIBUTED_TRANSACTION__ GTM_Timestamp GetGlobalTimestampGTM(void) -{// #lizard forgives - Get_GTS_Result gts_result = {InvalidGlobalTimestamp,false}; - GTM_Timestamp latest_gts = InvalidGlobalTimestamp; - struct rusage start_r; - struct timeval start_t; +{ + struct rusage start_r; + struct timeval start_t; int retry_cnt = 0; + Get_GTS_Result gts_result = {InvalidGlobalTimestamp,false}; + GTM_Timestamp latest_gts = InvalidGlobalTimestamp; + + if (!g_set_global_snapshot) + { + return LocalCommitTimestamp; + } if (log_gtm_stats) ResetUsageCommon(&start_r, &start_t); diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c index ebe93238..37a041f6 100644 --- a/src/backend/access/transam/twophase.c +++ b/src/backend/access/transam/twophase.c @@ -3386,7 +3386,7 @@ void record_2pc_commit_timestamp(const char *tid, GlobalTimestamp commit_timesta { XLogBeginInsert(); XLogRegisterData((char *)tid, strlen(tid)+1); - XLogRegisterData((char *)&commit_timestamp, sizeof(GlobalTimestamp) + 1); + XLogRegisterData((char *)&commit_timestamp, sizeof(GlobalTimestamp)); xlogrec = XLogInsert(RM_XLOG_ID, XLOG_RECORD_2PC_TIMESTAMP); /* only start node need to flush and sync XLOG_RECORD_2PC_TIMESTAMP */ if (IS_PGXC_LOCAL_COORDINATOR) @@ -3398,7 +3398,7 @@ void record_2pc_commit_timestamp(const char *tid, GlobalTimestamp commit_timesta if (enable_distri_print) { - read(fd, file_content, 2048);//FileRead(fd, file_content, 2048, WAIT_EVENT_BUFFILE_READ); + (void) read(fd, file_content, 2048);//FileRead(fd, file_content, 2048, WAIT_EVENT_BUFFILE_READ); elog(LOG, "before append 2pc file: %s, file_content: %s", tid, file_content); } diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 405e9bac..18ed9b22 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -10199,11 +10199,11 @@ xlog_redo(XLogReaderState *record) { startnode = temp; memcpy(&startxid, pos, sizeof(TransactionId)); - pos = pos + sizeof(TransactionId) + 1; + pos = pos + sizeof(TransactionId) ; nodestring = pos; pos = pos + strlen(nodestring) + 1; memcpy(&xid, pos, sizeof(TransactionId)); - pos = pos + sizeof(TransactionId) + 1; + pos = pos + sizeof(TransactionId) ; if (IsXidImplicit(gid)) { memcpy(&commit_timestamp, pos, sizeof(GlobalTimestamp)); diff --git a/src/backend/audit/audit_fga.c b/src/backend/audit/audit_fga.c index 06bdbd6e..2327ac00 100644 --- a/src/backend/audit/audit_fga.c +++ b/src/backend/audit/audit_fga.c @@ -431,7 +431,7 @@ exec_policy_funct_on_other_node(char *query_string) { cn_node_list = (Oid *) palloc0(cn_nodes_num * sizeof(Oid)); - PGXCGetCoordOidOthers(&cn_node_list); + PGXCGetCoordOidOthers(cn_node_list); pgxc_execute_on_nodes(cn_nodes_num, cn_node_list, query_string); } } @@ -1822,6 +1822,7 @@ process_fga_trigger(bool timeout) else { elog(LOG, "AUDIT_FGA: cannot connect to db"); + PQfinish(conn); } } } diff --git a/src/backend/catalog/objectaddress.c b/src/backend/catalog/objectaddress.c index 1a45c53b..d085bae1 100644 --- a/src/backend/catalog/objectaddress.c +++ b/src/backend/catalog/objectaddress.c @@ -3570,6 +3570,7 @@ getObjectDescription(const ObjectAddress *object) { appendStringInfoString(&buffer, _("distributed ")); getRelationDescription(&buffer, object->objectId); + break; 
} /* diff --git a/src/backend/catalog/pgxc_class.c b/src/backend/catalog/pgxc_class.c index 8384bccd..ccbc45c3 100644 --- a/src/backend/catalog/pgxc_class.c +++ b/src/backend/catalog/pgxc_class.c @@ -744,12 +744,6 @@ ModifyPgxcClass(PgxcClassModifyType type, PgxcClassModifyData *data) pfree(nodelist); pfree(newtup); } - else - { - heap_endscan(scan); - heap_close(rel,AccessExclusiveLock); - elog(ERROR, "unknow PgxcClassModifyType %d.", type); - } } tup = heap_getnext(scan, ForwardScanDirection); diff --git a/src/backend/commands/sequence.c b/src/backend/commands/sequence.c index 3d522795..3cb9c044 100644 --- a/src/backend/commands/sequence.c +++ b/src/backend/commands/sequence.c @@ -168,7 +168,7 @@ DefineSequence(ParseState *pstate, CreateSeqStmt *seq) List *owned_by; CreateStmt *stmt = makeNode(CreateStmt); Oid seqoid; - ObjectAddress address; + ObjectAddress address = InvalidObjectAddress; Relation rel; HeapTuple tuple; TupleDesc tupDesc; @@ -575,7 +575,7 @@ AlterSequence(ParseState *pstate, AlterSeqStmt *stmt) bool cycle; bool is_restart; #endif - ObjectAddress address; + ObjectAddress address = InvalidObjectAddress; Relation rel; HeapTuple seqtuple; HeapTuple newdatatuple; diff --git a/src/backend/commands/statscmds.c b/src/backend/commands/statscmds.c index 8fefe73b..63ca4812 100644 --- a/src/backend/commands/statscmds.c +++ b/src/backend/commands/statscmds.c @@ -67,7 +67,12 @@ CreateStatistics(CreateStatsStmt *stmt) Oid relid; ObjectAddress parentobject, myself; +#ifdef __TBASE__ + Datum types[3]; /* one for each possible type of statistic */ +#else Datum types[2]; /* one for each possible type of statistic */ +#endif + int ntypes; ArrayType *stxkind; bool build_ndistinct; diff --git a/src/backend/contrib/pgcrypto/internal.c b/src/backend/contrib/pgcrypto/internal.c index ce369693..63d1df30 100644 --- a/src/backend/contrib/pgcrypto/internal.c +++ b/src/backend/contrib/pgcrypto/internal.c @@ -705,7 +705,7 @@ system_reseed(void) check_time = t; /* roll dice */ - px_get_random_bytes(buf, 1); + (void) px_get_random_bytes(buf, 1); skip = buf[0] >= SYSTEM_RESEED_CHANCE; } /* clear 1 byte */ diff --git a/src/backend/executor/execUtils.c b/src/backend/executor/execUtils.c index 79c629be..c6401651 100644 --- a/src/backend/executor/execUtils.c +++ b/src/backend/executor/execUtils.c @@ -1083,6 +1083,7 @@ GetAttributeByName(HeapTupleHeader tuple, const char *attname, bool *isNull) tmptup.t_len = HeapTupleHeaderGetDatumLength(tuple); ItemPointerSetInvalid(&(tmptup.t_self)); tmptup.t_tableOid = InvalidOid; + tmptup.t_xc_node_id = InvalidOid; tmptup.t_data = tuple; result = heap_getattr(&tmptup, @@ -1131,6 +1132,7 @@ GetAttributeByNum(HeapTupleHeader tuple, tmptup.t_len = HeapTupleHeaderGetDatumLength(tuple); ItemPointerSetInvalid(&(tmptup.t_self)); tmptup.t_tableOid = InvalidOid; + tmptup.t_xc_node_id = InvalidOid; tmptup.t_data = tuple; result = heap_getattr(&tmptup, diff --git a/src/backend/libpq/hba.c b/src/backend/libpq/hba.c index 97f886f5..35ad2dc1 100644 --- a/src/backend/libpq/hba.c +++ b/src/backend/libpq/hba.c @@ -2542,38 +2542,59 @@ gethba_options(HbaLine *hba) CStringGetTextDatum(psprintf("ldapbinddn=%s", hba->ldapbinddn)); if (hba->ldapbindpasswd) + { + Assert(noptions < MAX_HBA_OPTIONS); options[noptions++] = CStringGetTextDatum(psprintf("ldapbindpasswd=%s", hba->ldapbindpasswd)); + } if (hba->ldapsearchattribute) + { + Assert(noptions < MAX_HBA_OPTIONS); options[noptions++] = CStringGetTextDatum(psprintf("ldapsearchattribute=%s", hba->ldapsearchattribute)); + } if (hba->ldapscope) + 
{ + Assert(noptions < MAX_HBA_OPTIONS); options[noptions++] = CStringGetTextDatum(psprintf("ldapscope=%d", hba->ldapscope)); } + } if (hba->auth_method == uaRADIUS) { if (hba->radiusservers_s) + { + Assert(noptions < MAX_HBA_OPTIONS); options[noptions++] = CStringGetTextDatum(psprintf("radiusservers=%s", hba->radiusservers_s)); + } if (hba->radiussecrets_s) + { + Assert(noptions < MAX_HBA_OPTIONS); options[noptions++] = CStringGetTextDatum(psprintf("radiussecrets=%s", hba->radiussecrets_s)); + } if (hba->radiusidentifiers_s) + { + Assert(noptions < MAX_HBA_OPTIONS); options[noptions++] = CStringGetTextDatum(psprintf("radiusidentifiers=%s", hba->radiusidentifiers_s)); + } if (hba->radiusports_s) + { + Assert(noptions < MAX_HBA_OPTIONS); options[noptions++] = CStringGetTextDatum(psprintf("radiusports=%s", hba->radiusports_s)); } + } Assert(noptions <= MAX_HBA_OPTIONS); diff --git a/src/backend/libpq/pqcomm.c b/src/backend/libpq/pqcomm.c index ca926c8c..da7c9a50 100644 --- a/src/backend/libpq/pqcomm.c +++ b/src/backend/libpq/pqcomm.c @@ -2024,7 +2024,12 @@ SetSockKeepAlive(int sock) struct tcp_info info; int len = sizeof(info); /* check sock */ - getsockopt(sock, IPPROTO_TCP, TCP_INFO, &info, (socklen_t *)&len); + if (getsockopt(sock, IPPROTO_TCP, TCP_INFO, &info, (socklen_t *)&len) < 0) + { + elog(LOG, "getsockopt(TCP_INFO) failed"); + return; + } + if (info.tcpi_state != TCP_ESTABLISHED) { return; diff --git a/src/backend/nodes/bitmapset.c b/src/backend/nodes/bitmapset.c index f4b56e9f..61b30a35 100644 --- a/src/backend/nodes/bitmapset.c +++ b/src/backend/nodes/bitmapset.c @@ -1246,7 +1246,7 @@ bms_any_member(Bitmapset *a) int member; int random = abs(rand()) % bms_num_members(a); for (member = 0; member < random; member++) - bms_first_member(a); + (void) bms_first_member(a); return bms_first_member(a); } #endif diff --git a/src/backend/optimizer/path/clausesel.c b/src/backend/optimizer/path/clausesel.c index 86fe951b..794a8d81 100644 --- a/src/backend/optimizer/path/clausesel.c +++ b/src/backend/optimizer/path/clausesel.c @@ -938,7 +938,7 @@ clause_selectivity_could_under_estimated(PlannerInfo *root, Path *path) if (is_opclause(clause)) { OpExpr *opclause = (OpExpr *) clause; - char *oprname; + char *oprname = NULL; Oid opno = opclause->opno; HeapTuple opTuple; Form_pg_operator operform; diff --git a/src/backend/optimizer/plan/subselect.c b/src/backend/optimizer/plan/subselect.c index c1583491..13150602 100644 --- a/src/backend/optimizer/plan/subselect.c +++ b/src/backend/optimizer/plan/subselect.c @@ -3580,7 +3580,7 @@ check_or_exist_qual_pullupable(PlannerInfo *root, Node *node) } else if (or_clause(node)) { - return pull_vars_of_level((Node *)lfirst(l), 1) == NIL; + return pull_vars_of_level(node, 1) == NIL; } else { diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index 35bf8b8a..cc19120e 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -3868,7 +3868,6 @@ create_index_path(PlannerInfo *root, List *indexquals, *indexqualcols; #ifdef __COLD_HOT__ - bool or_clause = false; List *quals = NULL; RangeTblEntry *rte = planner_rt_fetch(rel->relid, root); RelationLocInfo *rel_loc_info = GetRelationLocInfo(rte->relid); @@ -3939,7 +3938,7 @@ create_index_path(PlannerInfo *root, } } - if (IS_PGXC_COORDINATOR && !or_clause) + if (IS_PGXC_COORDINATOR) { int count = 0; Distribution *distribution = ((Path *)pathnode)->distribution; @@ -4013,15 +4012,6 @@ create_index_path(PlannerInfo *root, } } } -/* - else if 
(IS_PGXC_COORDINATOR && or_clause && root->parse->commandType == CMD_SELECT) - { - if (rel_loc_info && AttributeNumberIsValid(rel_loc_info->secAttrNum)) - { - add_groups_to_list(false, rte->relid, rel_loc_info, NULL, NULL, NULL); - } - } -*/ #ifdef __COLD_HOT__ if (IS_PGXC_COORDINATOR) { diff --git a/src/backend/parser/analyze.c b/src/backend/parser/analyze.c index efabf774..d0d5c909 100644 --- a/src/backend/parser/analyze.c +++ b/src/backend/parser/analyze.c @@ -326,7 +326,7 @@ transformOptionalSelectInto(ParseState *pstate, Node *parseTree) stmt = stmt->larg; Assert(stmt && IsA(stmt, SelectStmt) &&stmt->larg == NULL); - if ((stmt != NULL) && (stmt->intoClause)) + if (stmt && stmt->intoClause) { CreateTableAsStmt *ctas = makeNode(CreateTableAsStmt); diff --git a/src/backend/parser/parse_utilcmd.c b/src/backend/parser/parse_utilcmd.c index 5360c6a5..d041fdfb 100644 --- a/src/backend/parser/parse_utilcmd.c +++ b/src/backend/parser/parse_utilcmd.c @@ -3314,6 +3314,9 @@ transformAlterTableStmt(Oid relid, AlterTableStmt *stmt, cxt.ispartitioned = (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE); cxt.partbound = NULL; cxt.ofType = false; + cxt.interval_child = false; + cxt.interval_child_idx = -1; + cxt.interval_parentId = InvalidOid; /* * The only subtypes that currently require parse transformation handling diff --git a/src/backend/pgxc/copy/copyops.c b/src/backend/pgxc/copy/copyops.c index aba4aa05..0e9e1b90 100644 --- a/src/backend/pgxc/copy/copyops.c +++ b/src/backend/pgxc/copy/copyops.c @@ -123,9 +123,6 @@ attribute_out_text(StringInfo buf, char *string) c = 'v'; break; default: - /* If it's the delimiter, must backslash it */ - if (c == delimc) - break; /* All ASCII control chars are length 1 */ ptr++; continue; /* fall to end of loop */ diff --git a/src/backend/pgxc/locator/locator.c b/src/backend/pgxc/locator/locator.c index 9ce0b468..20abfd91 100644 --- a/src/backend/pgxc/locator/locator.c +++ b/src/backend/pgxc/locator/locator.c @@ -1767,6 +1767,7 @@ static int locate_shard_insert(Locator *self, Datum value, bool isnull, } else { + Assert(global_index >= 0); local_index = self->nodeindexMap[global_index]; } ((void **) self->results)[0] = ((void **) self->nodeMap)[local_index]; @@ -1893,6 +1894,7 @@ static int locate_shard_select(Locator *self, Datum value, bool isnull, hashvalue = compute_hash(self->dataType, value, LOCATOR_TYPE_SHARD); global_index = GetNodeIndexByHashValue(self->groupid, hashvalue); + Assert(global_index >= 0); switch (self->listType) { diff --git a/src/backend/pgxc/nodemgr/nodemgr.c b/src/backend/pgxc/nodemgr/nodemgr.c index 830b1b8d..65fbcccd 100644 --- a/src/backend/pgxc/nodemgr/nodemgr.c +++ b/src/backend/pgxc/nodemgr/nodemgr.c @@ -36,6 +36,7 @@ #include "catalog/pgxc_shard_map.h" #include "utils/fmgroids.h" #include "catalog/pgxc_class.h" +#include "access/xact.h" #endif #ifdef __TBASE__ diff --git a/src/backend/pgxc/plan/planner.c b/src/backend/pgxc/plan/planner.c index 2692e307..e2d7158f 100644 --- a/src/backend/pgxc/plan/planner.c +++ b/src/backend/pgxc/plan/planner.c @@ -1067,7 +1067,9 @@ pgxc_build_dml_statement(PlannerInfo *root, CmdType cmdtype, ListCell *lc; bool can_use_pk_for_rep_change = false; int16 *indexed_col_numbers = NULL; +#if 0 int index_col_count = 0; +#endif /* Make sure we are dealing with DMLs */ if (cmdtype != CMD_UPDATE && @@ -1328,6 +1330,7 @@ pgxc_build_dml_statement(PlannerInfo *root, CmdType cmdtype, rqplan->rq_param_types[rqplan->rq_num_params++] = INT4OID; } } +#if 0 else { /* @@ -1360,6 +1363,7 @@ 
pgxc_build_dml_statement(PlannerInfo *root, CmdType cmdtype, pkattno, resultRelationIndex, INT4OID, false); } } +#endif query_to_deparse->jointree->quals = (Node *)make_andclause( (List *)query_to_deparse->jointree->quals); } diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index c11ab9d7..8f0e5ab9 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -252,9 +252,11 @@ stat_log() for (i = 0; i < MAX_STATEMENTS_PER_TRAN; i++) elog(DEBUG1, "%d Statements per Transaction: %d (%d%%)", i, statements_per_transaction[i], statements_per_transaction[i] * 100 / total_transactions); - } + elog(DEBUG1, "%d+ Statements per Transaction: %d (%d%%)", MAX_STATEMENTS_PER_TRAN, statements_per_transaction[MAX_STATEMENTS_PER_TRAN], statements_per_transaction[MAX_STATEMENTS_PER_TRAN] * 100 / total_transactions); + } + if (nodes_per_transaction) { int i; @@ -9233,19 +9235,12 @@ ExecRemoteQuery(PlanState *pstate) #ifdef __TBASE__ if (enable_statistic) { - double __tmp__ = ((double)combiner->recv_tuples); - if (__tmp__ != 0) - { - elog(LOG, "FetchTuple: recv_node_count:%d, recv_tuples:%lu, recv_total_time:%ld, avg_time:%lf.", - combiner->recv_node_count, combiner->recv_tuples, combiner->recv_total_time, - ((double)combiner->recv_total_time) / __tmp__); - } - else - { - elog(LOG, "FetchTuple: recv_node_count:%d, recv_tuples:%lu, recv_total_time:%ld, avg_time:--", - combiner->recv_node_count, combiner->recv_tuples, combiner->recv_total_time - ); - } + elog(LOG, "FetchTuple: recv_node_count:%d, recv_tuples:%lu, " + "recv_total_time:%ld, avg_time:%lf.", + combiner->recv_node_count,combiner->recv_tuples, + combiner->recv_total_time, + combiner->recv_tuples ? ((double)combiner->recv_total_time)/ + ((double)combiner->recv_tuples) : -1); } #endif return NULL; @@ -11184,19 +11179,12 @@ ExecRemoteSubplan(PlanState *pstate) #ifdef __TBASE__ if (enable_statistic) { - double __tmp__= (double)combiner->recv_tuples; - if(__tmp__) - { - elog(LOG, "FetchTuple: worker:%d, recv_node_count:%d, recv_tuples:%lu, recv_total_time:%ld, avg_time:%lf.", - ParallelWorkerNumber, combiner->recv_node_count, combiner->recv_tuples, combiner->recv_total_time, - ((double)combiner->recv_total_time) / __tmp__); - } - else - { - elog(LOG, "FetchTuple: worker:%d, recv_node_count:%d, recv_tuples:%lu, recv_total_time:%ld, avg_time:--.", - ParallelWorkerNumber, combiner->recv_node_count, combiner->recv_tuples, combiner->recv_total_time - ); - } + elog(LOG, "FetchTuple: recv_node_count:%d, recv_tuples:%lu, " + "recv_total_time:%ld, avg_time:%lf.", + combiner->recv_node_count,combiner->recv_tuples, + combiner->recv_total_time, + combiner->recv_tuples ? 
((double)combiner->recv_total_time)/ + ((double)combiner->recv_tuples) : -1); } #endif return NULL; @@ -12560,7 +12548,7 @@ is_node_prepared(RemoteQueryState *rstate, int node) { int32 wordindex = 0; int32 wordoffset = 0; - if (node > MAX_NODES_NUMBER) + if (node >= MAX_NODES_NUMBER) { elog(ERROR, "invalid nodeid:%d is bigger than maximum node number of the cluster", node); } diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c index 353f6b7f..c7c630fa 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -1274,9 +1274,6 @@ pgxc_node_read_data(PGXCNodeHandle *conn, bool close_if_error) return EOF; } - if (someread) - return 1; /* got a zero read after successful tries */ - return 0; } @@ -4926,7 +4923,12 @@ PGXCNodeGetTransactionParamStr(void) void pgxc_node_set_query(PGXCNodeHandle *handle, const char *set_query) { - pgxc_node_send_query(handle, set_query); + if (pgxc_node_send_query(handle, set_query) != 0) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to send query %s",set_query))); + } /* * Now read responses until ReadyForQuery. * XXX We may need to handle possible errors here. @@ -5787,7 +5789,7 @@ PGXCNodeTypeString(char node_type) #endif #ifdef __AUDIT_FGA__ -void PGXCGetCoordOidOthers(Oid **nodelist) +void PGXCGetCoordOidOthers(Oid *nodelist) { Oid node_oid; int i; @@ -5798,7 +5800,7 @@ void PGXCGetCoordOidOthers(Oid **nodelist) node_oid = co_handles[i].nodeoid; if (co_handles[PGXCNodeId - 1].nodeoid != node_oid) { - (*nodelist)[j] = node_oid; + nodelist[j] = node_oid; j++; } } diff --git a/src/backend/pgxc/pool/poolcomm.c b/src/backend/pgxc/pool/poolcomm.c index 8a70dffb..c0168b00 100644 --- a/src/backend/pgxc/pool/poolcomm.c +++ b/src/backend/pgxc/pool/poolcomm.c @@ -83,7 +83,10 @@ pool_listen(unsigned short port, const char *unixSocketName) /* bind the name to the descriptor */ if (bind(fd, (struct sockaddr *) & unix_addr, len) < 0) + { + close(fd); return -1; + } /* * Select appropriate accept-queue length limit. 
PG_SOMAXCONN is only @@ -96,7 +99,10 @@ pool_listen(unsigned short port, const char *unixSocketName) /* tell kernel we're a server */ if (listen(fd, maxconn) < 0) + { + close(fd); return -1; + } @@ -165,7 +171,10 @@ pool_connect(unsigned short port, const char *unixSocketName) strlen(unix_addr.sun_path) + 1; if (connect(fd, (struct sockaddr *) & unix_addr, len) < 0) + { + close(fd); return -1; + } return fd; #else @@ -833,14 +842,9 @@ pool_recvfds(PoolPort *port, int *fds, int count) } else if (r == 0) { - if(recved_size == size) - break; - else - { error_no = errno; goto receive_error; } - } recved_size += r; if(recved_size == size) @@ -1159,8 +1163,8 @@ pool_sendres_with_command_id(PoolPort *port, int res, CommandId cmdID, char *err failure: if (buf) { - buf = NULL; free(buf); + buf = NULL; } if (PoolConnectDebugPrint) @@ -1329,9 +1333,6 @@ pool_recvres(PoolPort *port, bool need_log) } else if (r == 0) { - if(recved_size == size) - break; - else goto failure; } @@ -1449,9 +1450,6 @@ pool_recvpids(PoolPort *port, int **pids) } else if (r == 0) { - if(size == recved_size) - break; - else goto failure; } @@ -1476,6 +1474,7 @@ pool_recvpids(PoolPort *port, int **pids) if (n32 == 0) { elog(WARNING, "No transaction to abort"); + free(buf); return 0; } diff --git a/src/backend/pgxc/pool/poolmgr.c b/src/backend/pgxc/pool/poolmgr.c index 63d2c3e9..b9c3bd8c 100644 --- a/src/backend/pgxc/pool/poolmgr.c +++ b/src/backend/pgxc/pool/poolmgr.c @@ -1115,7 +1115,13 @@ char *session_options(void) continue; } - SplitIdentifierString(strdup(value), ',', &value_list); + if (SplitIdentifierString(strdup(value), ',', &value_list) < 0) + { + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("parse session options failed."))); + } + foreach(l, value_list) { char *value = (char *) lfirst(l); @@ -10903,6 +10909,65 @@ handle_session_command(PoolAgent * agent, StringInfo s) } } +static bool +remove_all_agent_references(Oid nodeoid) +{ + int i, j, index; + bool res = true; + + /* + * Identify if it's a coordinator or datanode first + * and get its index + */ + for (i = 0; i < agentCount; i++) + { + bool found = false; + PoolAgent *agent; + + index = agentIndexes[i]; + agent = poolAgents[index]; + + for (j = 0; j < agent->num_dn_connections; j++) + { + if (agent->dn_conn_oids[j] == nodeoid) + { + found = true; + break; + } + } + if (found) + { + PGXCNodePoolSlot *slot = agent->dn_connections[j]; + if (slot) + release_connection(agent->pool, slot, j, agent->dn_conn_oids[j], false, false); + agent->dn_connections[j] = NULL; + } + else + { + for (j = 0; j < agent->num_coord_connections; j++) + { + if (agent->coord_conn_oids[j] == nodeoid) + { + found = true; + break; + } + } + if (found) + { + PGXCNodePoolSlot *slot = agent->coord_connections[j]; + if (slot) + release_connection(agent->pool, slot, j, agent->coord_conn_oids[j], true, true); + agent->coord_connections[j] = NULL; + } + else + { + elog(LOG, "Node not found! 
(%u)", nodeoid); + res = false; + } + } + } + return res; +} /* * refresh_database_pools diff --git a/src/backend/pgxc/squeue/squeue.c b/src/backend/pgxc/squeue/squeue.c index 1c6fc478..a145edc5 100644 --- a/src/backend/pgxc/squeue/squeue.c +++ b/src/backend/pgxc/squeue/squeue.c @@ -5781,7 +5781,7 @@ ExecFastSendDatarow(TupleTableSlot *slot, void *sndctl, int32 nodeindex, MemoryC uint32 remaining_length = 0; MemoryContext savecxt = NULL; - ReserveSpace(node->buffer, tuple_len, &data_offset); + (void) ReserveSpace(node->buffer, tuple_len, &data_offset); remaining_length = tuple_len; /* MsgType */ FillReserveSpace(node->buffer, data_offset, "D", 1); @@ -8167,7 +8167,7 @@ ParallelFastSendDatarow(ParallelSendDataQueue *buf, TupleTableSlot *slot, void * uint32 remaining_length = 0; MemoryContext savecxt = NULL; - ReserveBufferSpace(buf, tuple_len, &data_offset); + (void) ReserveBufferSpace(buf, tuple_len, &data_offset); remaining_length = tuple_len; /* MsgType */ FillReserveBufferSpace(buf, data_offset, "D", 1); diff --git a/src/backend/replication/slotfuncs.c b/src/backend/replication/slotfuncs.c index 3646fd85..96991186 100644 --- a/src/backend/replication/slotfuncs.c +++ b/src/backend/replication/slotfuncs.c @@ -352,7 +352,7 @@ RenameSlot(const char *oldname, const char *newname) Oid get_replication_slot_slotid(const char *slotname, bool missing_ok) { - Oid oid; + Oid oid = InvalidOid; int i = 0; for (i = 0; i < max_replication_slots; i++) @@ -382,7 +382,7 @@ get_replication_slot_slotid(const char *slotname, bool missing_ok) Oid get_replication_slot_dbid(const char *slotname, bool missing_ok) { - Oid oid; + Oid oid = InvalidOid; int i = 0; for (i = 0; i < max_replication_slots; i++) diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c index 18da64a4..67ae7984 100644 --- a/src/backend/storage/file/fd.c +++ b/src/backend/storage/file/fd.c @@ -1584,7 +1584,7 @@ OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError) * just did the same thing. If it doesn't work then we'll bomb out on * the second create attempt, instead. */ - mkdir(tempdirpath, S_IRWXU); + (void) mkdir(tempdirpath, S_IRWXU); file = PathNameOpenFile(tempfilepath, O_RDWR | O_CREAT | O_TRUNC | PG_BINARY, diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index 48a49f73..1ea3d3ea 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -217,7 +217,7 @@ bool explain_stmt = false; #endif #ifdef __AUDIT_FGA__ -char *g_commandTag = NULL; +const char *g_commandTag = NULL; #endif @@ -1303,7 +1303,7 @@ exec_simple_query(const char *query_string) commandTag = CreateCommandTag(parsetree->stmt); #ifdef __AUDIT_FGA__ - g_commandTag = pnstrdup(commandTag, strlen(commandTag)); + g_commandTag = commandTag; #endif #ifdef __TBASE__ diff --git a/src/backend/tcop/pquery.c b/src/backend/tcop/pquery.c index 29c53160..1a2cb2cc 100644 --- a/src/backend/tcop/pquery.c +++ b/src/backend/tcop/pquery.c @@ -1615,7 +1615,7 @@ PortalRun(Portal portal, long count, bool isTopLevel, bool run_once, * saveResourceOwner points to subtransaction's resourceOwner, but ROLLBACK SUBTXN * has already released the resource, so we need to switch to current transaction owner. 
*/ - else if (IS_PGXC_DATANODE && (strcmp(portal->commandTag, "ROLLBACK SUBTXN") == 0)) + else if (IS_PGXC_DATANODE && portal->commandTag && (strcmp(portal->commandTag, "ROLLBACK SUBTXN") == 0)) { CurrentResourceOwner = GetCurrentTransactionResourceOwner(); } diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c index 29409d44..cf727d6f 100644 --- a/src/backend/tcop/utility.c +++ b/src/backend/tcop/utility.c @@ -3134,7 +3134,7 @@ ProcessUtilitySlow(ParseState *pstate, bool isCompleteQuery = (context <= PROCESS_UTILITY_QUERY); bool needCleanup; bool commandCollected = false; - ObjectAddress address; + ObjectAddress address = InvalidObjectAddress; ObjectAddress secondaryObject = InvalidObjectAddress; /* All event trigger calls are done only when isCompleteQuery is true */ diff --git a/src/backend/utils/adt/datetime.c b/src/backend/utils/adt/datetime.c index 02c3455c..6661ab49 100644 --- a/src/backend/utils/adt/datetime.c +++ b/src/backend/utils/adt/datetime.c @@ -435,7 +435,7 @@ GetCurrentDateTime(struct pg_tm *tm) int tz; fsec_t fsec; - timestamp2tm(GetCurrentTransactionStartTimestamp(), &tz, tm, &fsec, + (void) timestamp2tm(GetCurrentTransactionStartTimestamp(), &tz, tm, &fsec, NULL, NULL); /* Note: don't pass NULL tzp to timestamp2tm; affects behavior */ } @@ -451,7 +451,7 @@ GetCurrentTimeUsec(struct pg_tm *tm, fsec_t *fsec, int *tzp) { int tz; - timestamp2tm(GetCurrentTransactionStartTimestamp(), &tz, tm, fsec, + (void) timestamp2tm(GetCurrentTransactionStartTimestamp(), &tz, tm, fsec, NULL, NULL); /* Note: don't pass NULL tzp to timestamp2tm; affects behavior */ if (tzp != NULL) diff --git a/src/backend/utils/adt/jsonb_util.c b/src/backend/utils/adt/jsonb_util.c index 91078189..a3877dff 100644 --- a/src/backend/utils/adt/jsonb_util.c +++ b/src/backend/utils/adt/jsonb_util.c @@ -582,15 +582,15 @@ pushJsonbValueScalar(JsonbParseState **pstate, JsonbIteratorToken seq, (*pstate)->size); break; case WJB_KEY: - Assert(scalarVal->type == jbvString); + Assert(scalarVal && scalarVal->type == jbvString); appendKey(*pstate, scalarVal); break; case WJB_VALUE: - Assert(IsAJsonbScalar(scalarVal)); + Assert(scalarVal && IsAJsonbScalar(scalarVal)); appendValue(*pstate, scalarVal); break; case WJB_ELEM: - Assert(IsAJsonbScalar(scalarVal)); + Assert(scalarVal && IsAJsonbScalar(scalarVal)); appendElement(*pstate, scalarVal); break; case WJB_END_OBJECT: diff --git a/src/backend/utils/adt/network_gist.c b/src/backend/utils/adt/network_gist.c index e7a4a5e3..4edfdec3 100644 --- a/src/backend/utils/adt/network_gist.c +++ b/src/backend/utils/adt/network_gist.c @@ -472,7 +472,7 @@ build_inet_union_key(int family, int minbits, int commonbits, unsigned char *addr) { GistInetKey *result; - + Assert(commonbits >= 0); /* Make sure any unused bits are zeroed. 
*/ result = (GistInetKey *) palloc0(sizeof(GistInetKey)); diff --git a/src/backend/utils/adt/oid.c b/src/backend/utils/adt/oid.c index a4f41db7..8b28d653 100644 --- a/src/backend/utils/adt/oid.c +++ b/src/backend/utils/adt/oid.c @@ -551,7 +551,7 @@ oidvector_append(oidvector *oldoids, Oid newOid) result->ndim = 1; SET_VARSIZE(result, OidVectorSize(oldlen + 1)); - if ((oldoids) && (oldoids->dim1 > 0)) + if (oldoids && oldoids->dim1 > 0) memcpy(result->values, oldoids->values, oldlen * sizeof(Oid)); result->values[result->dim1-1] = newOid; diff --git a/src/backend/utils/adt/timestamp.c b/src/backend/utils/adt/timestamp.c index 70e1125e..aaaf55cf 100644 --- a/src/backend/utils/adt/timestamp.c +++ b/src/backend/utils/adt/timestamp.c @@ -2036,7 +2036,7 @@ SetEpochTimestamp(void) GetEpochTime(tm); /* we don't bother to test for failure ... */ - tm2timestamp(tm, 0, NULL, &dt); + (void) tm2timestamp(tm, 0, NULL, &dt); return dt; } /* SetEpochTimestamp() */ diff --git a/src/backend/utils/cache/relcryptmap.c b/src/backend/utils/cache/relcryptmap.c index 79eef334..fd16e5fb 100644 --- a/src/backend/utils/cache/relcryptmap.c +++ b/src/backend/utils/cache/relcryptmap.c @@ -283,7 +283,7 @@ void cyprt_key_info_hash_init(void) CRYPT_KEY_INFO_HASHTABLE_INIT_SIZE, CRYPT_KEY_INFO_HASHTABLE_MAX_SIZE, &info, - HASH_ELEM | HASH_PARTITION | HASH_COMPARE); + HASH_ELEM | HASH_PARTITION | HASH_COMPARE | HASH_BLOBS); g_crypt_key_info_lock = (CryptKeyInfoLock) ShmemInitStruct("crypt key info lock shmem", MAXALIGN64(sizeof(CryptKeyInfoLockData)), &found); diff --git a/src/backend/utils/misc/datamask.c b/src/backend/utils/misc/datamask.c index edf4db7c..a0101b84 100644 --- a/src/backend/utils/misc/datamask.c +++ b/src/backend/utils/misc/datamask.c @@ -625,10 +625,10 @@ bool datamask_scan_key_contain_mask(ScanState *node) ScanKey ScanKeys; int NumScanKeys; - if(!IsA(node, IndexScanState) && !IsA(node, IndexOnlyScanState)) + if(node == NULL) return false; - if(node == NULL) + if(!IsA(node, IndexScanState) && !IsA(node, IndexOnlyScanState)) return false; if (node->ss_currentRelation && diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index ffc6dbd4..d1bf813f 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -6985,6 +6985,7 @@ SelectConfigFiles(const char *userDoption, const char *progname) strerror(errno)); if (errno == ENOENT) write_stderr("Run initdb or pg_basebackup to initialize a PostgreSQL data directory.\n"); + free(configdir); return false; } @@ -7054,6 +7055,7 @@ SelectConfigFiles(const char *userDoption, const char *progname) "or by the -D invocation option, or by the " "PGDATA environment variable.\n", progname, ConfigFileName); + free(configdir); return false; } @@ -7102,6 +7104,7 @@ SelectConfigFiles(const char *userDoption, const char *progname) "or by the -D invocation option, or by the " "PGDATA environment variable.\n", progname, ConfigFileName); + free(configdir); return false; } SetConfigOption("hba_file", fname, PGC_POSTMASTER, PGC_S_OVERRIDE); diff --git a/src/backend/utils/misc/mls.c b/src/backend/utils/misc/mls.c index 5ff3cf8e..4d7ea96e 100644 --- a/src/backend/utils/misc/mls.c +++ b/src/backend/utils/misc/mls.c @@ -1317,10 +1317,10 @@ void InsertTrsprtCryptPolicyMapTuple(Relation pg_transp_crypt_map_desc, rd_rel = pg_transp_crypt_map_desc->rd_rel; - memset(&NameStr(schemaname), 0, NAMEDATALEN); - memcpy(&NameStr(schemaname), GetSchemaNameByOid(relnamespace), NAMEDATALEN); - memset(&NameStr(spcname), 0, NAMEDATALEN); - memcpy(&NameStr(spcname), 
get_tablespace_name(spaceoid), NAMEDATALEN); + memset(NameStr(schemaname), 0, NAMEDATALEN); + strncpy(NameStr(schemaname), GetSchemaNameByOid(relnamespace), NAMEDATALEN); + memset(NameStr(spcname), 0, NAMEDATALEN); + strncpy(NameStr(spcname), get_tablespace_name(spaceoid), NAMEDATALEN); /* diff --git a/src/backend/utils/misc/relcrypt.c b/src/backend/utils/misc/relcrypt.c index b3d25625..fa69b54d 100644 --- a/src/backend/utils/misc/relcrypt.c +++ b/src/backend/utils/misc/relcrypt.c @@ -2260,7 +2260,7 @@ text * decrypt_procedure(AlgoId algo_id, text * text_src, int context_length) sm4_crypt_ecb(&(cryptkey->sm4_ctx_decrypt), 0, ctx_len, (unsigned char*)VARDATA_ANY(text_src), (unsigned char*)VARDATA_ANY(text_src)); - text_ret = text_src; + text_ret = NULL; } } @@ -2273,7 +2273,7 @@ text * decrypt_procedure(AlgoId algo_id, text * text_src, int context_length) PointerGetDatum(text_src), PointerGetDatum(privatekey), PointerGetDatum(text_src)); - text_ret = text_src; + text_ret = NULL; } else { diff --git a/src/backend/utils/mmgr/dsa.c b/src/backend/utils/mmgr/dsa.c index f7f11c06..ce9fc58a 100644 --- a/src/backend/utils/mmgr/dsa.c +++ b/src/backend/utils/mmgr/dsa.c @@ -1975,6 +1975,7 @@ get_best_segment(dsa_area *area, Size npages) Assert(LWLockHeldByMe(DSA_AREA_LOCK(area))); check_for_freed_segments_locked(area); + Assert(npages > 0); /* * Start searching from the first bin that *might* have enough contiguous diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c index ad5d9988..11e27a41 100644 --- a/src/backend/utils/sort/tuplesort.c +++ b/src/backend/utils/sort/tuplesort.c @@ -3810,6 +3810,8 @@ comparetup_heap(const SortTuple *a, const SortTuple *b, Tuplesortstate *state) ltup.t_data = (HeapTupleHeader) ((char *) a->tuple - MINIMAL_TUPLE_OFFSET); rtup.t_len = ((MinimalTuple) b->tuple)->t_len + MINIMAL_TUPLE_OFFSET; rtup.t_data = (HeapTupleHeader) ((char *) b->tuple - MINIMAL_TUPLE_OFFSET); + ltup.t_tableOid = InvalidOid; + rtup.t_xc_node_id = InvalidOid; tupDesc = state->tupDesc; if (sortKey->abbrev_converter) @@ -3864,6 +3866,9 @@ copytup_heap(Tuplesortstate *state, SortTuple *stup, void *tup) /* set up first-column key value */ htup.t_len = tuple->t_len + MINIMAL_TUPLE_OFFSET; htup.t_data = (HeapTupleHeader) ((char *) tuple - MINIMAL_TUPLE_OFFSET); + htup.t_tableOid = InvalidOid; + htup.t_xc_node_id = InvalidOid; + original = heap_getattr(&htup, state->sortKeys[0].ssup_attno, state->tupDesc, @@ -4017,6 +4022,7 @@ readtup_datanode(Tuplesortstate *state, SortTuple *stup, /* set up first-column key value */ htup.t_len = tuple->t_len + MINIMAL_TUPLE_OFFSET; htup.t_data = (HeapTupleHeader) ((char *) tuple - MINIMAL_TUPLE_OFFSET); + htup.t_tableOid = InvalidOid; stup->datum1 = heap_getattr(&htup, state->sortKeys[0].ssup_attno, state->tupDesc, diff --git a/src/bin/initgtm/initgtm.c b/src/bin/initgtm/initgtm.c index 77b9baeb..204b6b0b 100644 --- a/src/bin/initgtm/initgtm.c +++ b/src/bin/initgtm/initgtm.c @@ -544,7 +544,7 @@ setup_config(void) } writefile(path, conflines); - chmod(path, S_IRUSR | S_IWUSR); + (void) chmod(path, S_IRUSR | S_IWUSR); free(conflines); diff --git a/src/bin/pg_dump/pg_backup_tar.c b/src/bin/pg_dump/pg_backup_tar.c index bab64a56..387370b4 100644 --- a/src/bin/pg_dump/pg_backup_tar.c +++ b/src/bin/pg_dump/pg_backup_tar.c @@ -867,7 +867,7 @@ _CloseArchive(ArchiveHandle *AH) */ th = tarOpen(AH, "restore.sql", 'w'); - tarPrintf(AH, th, "--\n" + (void) tarPrintf(AH, th, "--\n" "-- NOTE:\n" "--\n" "-- File paths need to be edited. 
Search for $$PATH$$ and\n" diff --git a/src/gtm/client/fe-connect.c b/src/gtm/client/fe-connect.c index 5f1f167f..c5a0842e 100644 --- a/src/gtm/client/fe-connect.c +++ b/src/gtm/client/fe-connect.c @@ -635,7 +635,7 @@ GTMPQconnectPoll(GTM_Conn *conn) case CONNECTION_STARTED: { int optval; - size_t optlen = sizeof(optval); + ACCEPT_TYPE_ARG3 optlen = sizeof(optval); /* * Write ready, since we've made it here, so the connection @@ -1052,9 +1052,9 @@ closeGTM_Conn(GTM_Conn *conn) * Force length word for backends may try to read that in a generic * code */ - gtmpqPutMsgStart('X', true, conn); - gtmpqPutMsgEnd(conn); - gtmpqFlush(conn); + (void) gtmpqPutMsgStart('X', true, conn); + (void) gtmpqPutMsgEnd(conn); + (void) gtmpqFlush(conn); } /* @@ -1463,7 +1463,11 @@ GTMSetSockKeepAlive(GTM_Conn *conn, int tcp_keepalives_idle, struct tcp_info info; int len = sizeof(info); /* check sock */ - getsockopt(sock, IPPROTO_TCP, TCP_INFO, &info, (socklen_t *)&len); + if (getsockopt(sock, IPPROTO_TCP, TCP_INFO, &info, (socklen_t *)&len) < 0) + { + return false; + } + if (info.tcpi_state != TCP_ESTABLISHED) { /* No need to set */ diff --git a/src/gtm/client/gtm_client.c b/src/gtm/client/gtm_client.c index c0ef6ab1..8ff8b131 100644 --- a/src/gtm/client/gtm_client.c +++ b/src/gtm/client/gtm_client.c @@ -2828,9 +2828,12 @@ begin_transaction_multi(GTM_Conn *conn, int txn_count, GTM_IsolationLevel *txn_i for (i = 0; i < txn_count; i++) { - gtmpqPutInt(txn_isolation_level[i], sizeof(int), conn); - gtmpqPutc(txn_read_only[i], conn); - gtmpqPutInt(txn_connid[i], sizeof(int), conn); + if (gtmpqPutInt(txn_isolation_level[i], sizeof(int), conn) || + gtmpqPutc(txn_read_only[i], conn) || + gtmpqPutInt(txn_connid[i], sizeof(int), conn)) + { + goto send_failed; + } } /* Finish the message. 
*/ diff --git a/src/gtm/common/gtm_opt_handler.c b/src/gtm/common/gtm_opt_handler.c index a3fea6b4..bb0344a5 100644 --- a/src/gtm/common/gtm_opt_handler.c +++ b/src/gtm/common/gtm_opt_handler.c @@ -1440,6 +1440,7 @@ SelectConfigFiles(const char *userDoption, const char *progname) "You must specify the --config-file or -D invocation " "option or set the PGDATA environment variable.\n", progname); + free(configdir); return false; } @@ -2699,7 +2700,10 @@ set_config_sourcefile(const char *name, char *sourcefile, int sourceline) if (record == NULL) { if (isStartUp) + { write_stderr("unrecognized configuration parameter \"%s\"\n", name); + exit(1); + } else elog(ERROR, "unrecognized configuration parameter \"%s\"", name); } @@ -2752,7 +2756,10 @@ GetConfigOption(const char *name, bool restrict_superuser) if (record == NULL) { if (isStartUp) + { write_stderr("unrecognized configuration parameter \"%s\"\n", name); + exit(1); + } else ereport(ERROR, (0, @@ -2801,7 +2808,10 @@ GetConfigOptionResetString(const char *name) if (record == NULL) { if (isStartUp) + { write_stderr("unrecognized configuration parameter \"%s\"\n", name); + exit(1); + } else ereport(ERROR, (0, @@ -2874,7 +2884,10 @@ GetConfigOptionByName(const char *name, const char **varname) if (record == NULL) { if (isStartUp) + { write_stderr("unrecognized configuration parameter \"%s\"\n", name); + exit(1); + } else ereport(ERROR, (0, diff --git a/src/gtm/gtm_ctl/gtm_ctl.c b/src/gtm/gtm_ctl/gtm_ctl.c index bf804b98..4989ed41 100644 --- a/src/gtm/gtm_ctl/gtm_ctl.c +++ b/src/gtm/gtm_ctl/gtm_ctl.c @@ -567,7 +567,7 @@ kill_zombie() } } - fclose(fp); + pclose(fp); } static void @@ -1097,8 +1097,11 @@ do_status(void) printf("\"-D\" \"%s\"",gtm_data); optlines = readfile(gtmopts_file); if (optlines != NULL) + { for (; *optlines != NULL; optlines++) fputs(*optlines, stdout); + free(optlines); + } } diff --git a/src/gtm/main/gtm_store.c b/src/gtm/main/gtm_store.c index 8208b72a..43a1c9ab 100644 --- a/src/gtm/main/gtm_store.c +++ b/src/gtm/main/gtm_store.c @@ -3448,7 +3448,13 @@ int32 GTM_StoreDropAllSeqInDatabase(GTM_SequenceKey seq_database_key) { elog(LOG, "GTM_StoreDropAllSeqInDatabase drop %s",seq_list[i].gs_key.gsk_key); } - GTM_StoreDropSeq(seq_list[i].gti_store_handle); + + if (GTM_StoreDropSeq(seq_list[i].gti_store_handle)) + { + ereport(LOG, + (ERANGE, + errmsg("GTM_StoreDropSeq fail"))); + } } if (enable_gtm_sequence_debug) @@ -3973,7 +3979,7 @@ ProcessCheckStorageTransactionCommand(Port *myport, StringInfo message) if (error) { memcpy(&txn_list[txn_count].txn, txn_info, sizeof(GTM_StoredTransactionInfo)); - if (error || GTMStorageStatus_CRC_error) + if (error & GTMStorageStatus_CRC_error) { if (need_fix) { @@ -3986,12 +3992,12 @@ ProcessCheckStorageTransactionCommand(Port *myport, StringInfo message) } } - if (error || GTMStorageStatus_freelist_error) + if (error & GTMStorageStatus_freelist_error) { txn_list[txn_count].status |= GTMStorageStatus_freelist_unchanged; } - if (error || GTMStorageStatus_hashtab_error) + if (error & GTMStorageStatus_hashtab_error) { txn_list[txn_count].status |= GTMStorageStatus_hashtab_unchanged; } diff --git a/src/gtm/main/gtm_xlog.c b/src/gtm/main/gtm_xlog.c index ec3d7c66..d38aea68 100644 --- a/src/gtm/main/gtm_xlog.c +++ b/src/gtm/main/gtm_xlog.c @@ -836,7 +836,7 @@ ReadXLogFileToBuffIntern(GTM_XLogSegmentBuff *buff,TimeLineID timeline,XLogSegNo for(;;) { - bytes = read(fd,buff->buff + buff->total_length,GTM_XLOG_SEG_SIZE); + bytes = read(fd,buff->buff + buff->total_length,GTM_XLOG_SEG_SIZE - 
buff->total_length); if(bytes < 0) { @@ -1227,7 +1227,7 @@ GTM_XLogCtlDataInit(void) g_checkpointDirtySize = (uint32 *)palloc(sizeof(uint32) * g_GTMStoreSize); g_checkpointDirtyStart = (uint32 *)palloc(sizeof(uint32) * g_GTMStoreSize); - if(enalbe_gtm_xlog_debug || enalbe_gtm_xlog_debug) + if(enalbe_gtm_xlog_debug || enable_gtm_debug) { elog(LOG,"Read ControlData CurrBytePos to %"PRIu64" PrevBytePos to %"PRIu64"",Insert->CurrBytePos,Insert->PrevBytePos); elog(LOG,"Read ControlData EndOfXLog to %X/%X PreEndOfXLog to %X/%X", @@ -3437,9 +3437,6 @@ XLogWrite(XLogRecPtr req) do { - if(nleft == 0) - break; - errno = 0; written = write(XLogCtl->xlog_fd, XLogCtl->writerBuff + start_pos, nleft); if (written <= 0) diff --git a/src/gtm/main/main.c b/src/gtm/main/main.c index 113cee78..7324ad8b 100644 --- a/src/gtm/main/main.c +++ b/src/gtm/main/main.c @@ -563,7 +563,10 @@ static int CheckTscFeatures(char *cmd) if (file == NULL) return false; - fscanf(file, "%d", &count); + if (fscanf(file, "%d", &count) == EOF) + { + count = 0; + } pclose(file); return count; @@ -2190,6 +2193,7 @@ GTM_ThreadTimeKeeper(void *argp) action.sa_flags = 0; action.sa_handler = GTM_ThreadSigHandler; + sigemptyset(&action.sa_mask); ret = sigaction(SIGQUIT, &action, NULL); if (ret) @@ -2451,7 +2455,7 @@ GTM_ThreadCheckPointer(void *argp) action.sa_flags = 0; action.sa_handler = GTM_ThreadSigHandler; - + sigemptyset(&action.sa_mask); ret = sigaction(SIGQUIT, &action, NULL); if (ret) { @@ -2567,6 +2571,7 @@ GTM_ThreadWalSender(void *argp) action.sa_flags = 0; action.sa_handler = GTM_ThreadSigHandler; + sigemptyset(&action.sa_mask); ret = sigaction(SIGQUIT, &action, NULL); if (ret) @@ -2859,6 +2864,8 @@ GTM_ThreadArchiver(void *argp) action.sa_flags = 0; action.sa_handler = GTM_ThreadSigHandler; + sigemptyset(&action.sa_mask); + ret = sigaction(SIGQUIT, &action, NULL); if (ret) @@ -3081,6 +3088,7 @@ GTM_ThreadWalRedoer(void *argp) action.sa_flags = 0; action.sa_handler = GTM_ThreadSigHandler; + sigemptyset(&action.sa_mask); ret = sigaction(SIGQUIT, &action, NULL); if (ret) @@ -3149,6 +3157,8 @@ GTM_ThreadWalReceiver(void *argp) action.sa_flags = 0; action.sa_handler = GTM_ThreadSigHandler; + sigemptyset(&action.sa_mask); + ret = sigaction(SIGQUIT, &action, NULL); if (ret) @@ -3348,6 +3358,7 @@ GTM_ThreadMain(void *argp) action.sa_flags = 0; action.sa_handler = GTM_ThreadSigHandler; + sigemptyset(&action.sa_mask); ret = sigaction(SIGQUIT, &action, NULL); if (ret) @@ -3611,6 +3622,7 @@ GTM_ThreadBasebackup(void *argp) action.sa_flags = 0; action.sa_handler = GTM_ThreadSigHandler; + sigemptyset(&action.sa_mask); ret = sigaction(SIGQUIT, &action, NULL); if (ret) @@ -3877,7 +3889,7 @@ ProcessCommand(Port *myport, StringInfo input_message) mtype != MSG_CHECK_GTM_STATUS ); - if(GetMyThreadInfo->handle_standby) + if(my_threadinfo->handle_standby) { #ifndef __XLOG__ /* Handle standby connecion staff. 
*/ @@ -4259,6 +4271,7 @@ GTMAddConnection(Port *port, GTM_Conn *standby) if (NULL == GTM_ThreadCreate(GTM_ThreadMain, g_max_lock_number)) { elog(WARNING, "Failed to create gtm thread."); + GTM_RWLockAcquire(>MThreads->gt_lock, GTM_LOCKMODE_READ); break; } GTM_RWLockAcquire(>MThreads->gt_lock, GTM_LOCKMODE_READ); diff --git a/src/gtm/proxy/proxy_main.c b/src/gtm/proxy/proxy_main.c index 5508c06d..279471d3 100644 --- a/src/gtm/proxy/proxy_main.c +++ b/src/gtm/proxy/proxy_main.c @@ -1522,7 +1522,10 @@ GTMProxy_ThreadMain(void *argp) * Make sure everything is on wire now */ Enable_Longjmp(); - gtmpqFlush(thrinfo->thr_gtm_conn); + if (gtmpqFlush(thrinfo->thr_gtm_conn)) + { + elog(ERROR, "Error sending flush message"); + } Disable_Longjmp(); /* diff --git a/src/gtm/recovery/register_common.c b/src/gtm/recovery/register_common.c index a10c60e9..9c79bb33 100644 --- a/src/gtm/recovery/register_common.c +++ b/src/gtm/recovery/register_common.c @@ -598,12 +598,17 @@ Recovery_RecordRegisterInfo(GTM_PGXCNodeInfo *nodeinfo, bool is_register) int ctlfd; int len; + if (nodeinfo == NULL) + { + return; + } + GTM_RWLockAcquire(&RegisterFileLock, GTM_LOCKMODE_WRITE); ctlfd = open(GTMPGXCNodeFile, O_WRONLY | O_CREAT | O_APPEND, S_IRUSR | S_IWUSR); - if (ctlfd == -1 || nodeinfo == NULL) + if (ctlfd == -1) { GTM_RWLockRelease(&RegisterFileLock); return; diff --git a/src/gtm/xlog_test/xlog_reader.c b/src/gtm/xlog_test/xlog_reader.c index 400846f7..3e0d6520 100644 --- a/src/gtm/xlog_test/xlog_reader.c +++ b/src/gtm/xlog_test/xlog_reader.c @@ -915,7 +915,8 @@ void bind_service_threads(void) } /* time keeper thread will not handle any signal, any signal will cause the thread exit. */ -void * +void +* GTM_ThreadTimeKeeper(void *argp) { GTM_ThreadInfo *my_threadinfo = (GTM_ThreadInfo *)argp; @@ -1028,7 +1029,8 @@ GTM_ThreadTimeKeeper(void *argp) /* time keeper thread will not handle any signal, any signal will cause the thread exit. 
*/ -void * +void +* GTM_ThreadTimeBackup(void *argp) { GTM_ThreadInfo *my_threadinfo = (GTM_ThreadInfo *)argp; @@ -1047,6 +1049,7 @@ GTM_ThreadTimeBackup(void *argp) action.sa_flags = 0; action.sa_handler = GTM_ThreadSigHandler; + sigemptyset(&action.sa_mask); ret = sigaction(SIGQUIT, &action, NULL); if (ret) @@ -1174,6 +1177,7 @@ GTM_ThreadCheckPointer(void *argp) action.sa_flags = 0; action.sa_handler = GTM_ThreadSigHandler; + sigemptyset(&action.sa_mask); ret = sigaction(SIGQUIT, &action, NULL); if (ret) @@ -1253,7 +1257,7 @@ GTM_ThreadXLogWriter(void *argp) action.sa_flags = 0; action.sa_handler = GTM_ThreadSigHandler; - + sigemptyset(&action.sa_mask); ret = sigaction(SIGQUIT, &action, NULL); if (ret) { diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index a0db442b..018cecd9 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -244,6 +244,7 @@ extern void CheckXLogRemoved(XLogSegNo segno, TimeLineID tli); extern XLogSegNo XLogGetLastRemovedSegno(void); extern void XLogSetAsyncXactLSN(XLogRecPtr record); extern void XLogSetReplicationSlotMinimumLSN(XLogRecPtr lsn); +extern XLogRecPtr XLogGetReplicationSlotMinimumLSN(void); extern void xlog_redo(XLogReaderState *record); extern void xlog_desc(StringInfo buf, XLogReaderState *record); diff --git a/src/include/audit/audit_fga.h b/src/include/audit/audit_fga.h index d0697439..c60c3549 100644 --- a/src/include/audit/audit_fga.h +++ b/src/include/audit/audit_fga.h @@ -70,7 +70,7 @@ #define AUDIT_TRIGGER_FEEDBACK_LEN 256 extern bool enable_fga; -extern char *g_commandTag; +extern const char *g_commandTag; /* simple list of strings */ diff --git a/src/include/pgxc/pgxcnode.h b/src/include/pgxc/pgxcnode.h index 6643dc36..be08deab 100644 --- a/src/include/pgxc/pgxcnode.h +++ b/src/include/pgxc/pgxcnode.h @@ -305,7 +305,7 @@ extern const char * PGXCNodeTypeString(char node_type); #endif #ifdef __AUDIT_FGA__ -extern void PGXCGetCoordOidOthers(Oid **nodelist); +extern void PGXCGetCoordOidOthers(Oid *nodelist); extern void PGXCGetAllDnOid(Oid *nodelist); #endif diff --git a/src/interfaces/ecpg/ecpglib/execute.c b/src/interfaces/ecpg/ecpglib/execute.c index e5fb8ba0..ced7beb8 100644 --- a/src/interfaces/ecpg/ecpglib/execute.c +++ b/src/interfaces/ecpg/ecpglib/execute.c @@ -299,7 +299,10 @@ ecpg_is_type_an_array(int type, const struct statement *stmt, const struct varia else return (ECPG_ARRAY_ERROR); - ecpg_type_infocache_push(&(stmt->connection->cache_head), type, isarray, stmt->lineno); + if (!ecpg_type_infocache_push(&(stmt->connection->cache_head), type, isarray, stmt->lineno)) + { + return (ECPG_ARRAY_ERROR); + } ecpg_log("ecpg_is_type_an_array on line %d: type (%d); C (%d); array (%s)\n", stmt->lineno, type, var->type, ECPG_IS_ARRAY(isarray) ? 
"yes" : "no"); return isarray; } diff --git a/src/interfaces/ecpg/preproc/ecpg.c b/src/interfaces/ecpg/preproc/ecpg.c index e83faf84..353a7d64 100644 --- a/src/interfaces/ecpg/preproc/ecpg.c +++ b/src/interfaces/ecpg/preproc/ecpg.c @@ -469,7 +469,7 @@ main(int argc, char *const argv[]) */ if (ret_value != 0) { - if (strcmp(output_filename, "-") != 0 && unlink(output_filename) != 0) + if (output_filename && strcmp(output_filename, "-") != 0 && unlink(output_filename) != 0) fprintf(stderr, _("could not remove output file \"%s\"\n"), output_filename); } } diff --git a/src/interfaces/libpq/fe-auth.c b/src/interfaces/libpq/fe-auth.c index 44659d51..e0dbc56a 100644 --- a/src/interfaces/libpq/fe-auth.c +++ b/src/interfaces/libpq/fe-auth.c @@ -606,8 +606,6 @@ pg_SASL_init(PGconn *conn, int payloadlen) oom_error: termPQExpBuffer(&mechanism_buf); - if (initialresponse) - free(initialresponse); printfPQExpBuffer(&conn->errorMessage, libpq_gettext("out of memory\n")); return STATUS_ERROR; diff --git a/src/interfaces/libpq/fe-connect.c b/src/interfaces/libpq/fe-connect.c index e8be4dae..8c1ec04b 100644 --- a/src/interfaces/libpq/fe-connect.c +++ b/src/interfaces/libpq/fe-connect.c @@ -3631,8 +3631,8 @@ sendTerminateConn(PGconn *conn) * Try to send "close connection" message to backend. Ignore any * error. */ - pqPutMsgStart('X', false, conn); - pqPutMsgEnd(conn); + (void) pqPutMsgStart('X', false, conn); + (void) pqPutMsgEnd(conn); (void) pqFlush(conn); } } diff --git a/src/interfaces/libpq/fe-protocol2.c b/src/interfaces/libpq/fe-protocol2.c index 8ab4b5e5..d0d63160 100644 --- a/src/interfaces/libpq/fe-protocol2.c +++ b/src/interfaces/libpq/fe-protocol2.c @@ -1064,8 +1064,6 @@ pqGetErrorNotice2(PGconn *conn, bool isError) return 0; failure: - if (res) - PQclear(res); termPQExpBuffer(&workBuf); return EOF; } diff --git a/src/pl/plpgsql/src/pl_exec.c b/src/pl/plpgsql/src/pl_exec.c index 309956d5..78331d9d 100644 --- a/src/pl/plpgsql/src/pl_exec.c +++ b/src/pl/plpgsql/src/pl_exec.c @@ -6197,6 +6197,7 @@ get_tuple_from_datum(Datum value) ItemPointerSetInvalid(&(tmptup.t_self)); tmptup.t_tableOid = InvalidOid; tmptup.t_data = td; + tmptup.t_xc_node_id = InvalidOid; /* Build a copy and return it */ return heap_copytuple(&tmptup); diff --git a/src/timezone/localtime.c b/src/timezone/localtime.c index 9adc4eab..262ca97a 100644 --- a/src/timezone/localtime.c +++ b/src/timezone/localtime.c @@ -230,8 +230,6 @@ tzloadbody(char const *name, char *canonname, struct state *sp, bool doextend, if (!name) { name = TZDEFAULT; - if (!name) - return EINVAL; } if (name[0] == ':') From b3c94a4c40b835dfbd1232c97ed6cc1e7c640e63 Mon Sep 17 00:00:00 2001 From: youngxie Date: Tue, 16 Mar 2021 21:15:55 +0800 Subject: [PATCH 339/578] Fix ce. 
--- src/backend/access/common/printtup.c | 1 - src/backend/parser/parse_utilcmd.c | 1 - 2 files changed, 2 deletions(-) diff --git a/src/backend/access/common/printtup.c b/src/backend/access/common/printtup.c index c72fb1af..dfd64707 100644 --- a/src/backend/access/common/printtup.c +++ b/src/backend/access/common/printtup.c @@ -449,7 +449,6 @@ printtup(TupleTableSlot *slot, DestReceiver *self) { int len = strlen(outputstr); #ifdef __TBASE__ - int len = strlen(outputstr); if (slot->tts_tupleDescriptor->attrs[i]->atttypid == RECORDOID && self->mydest == DestRemoteExecute) { Oid tupType; diff --git a/src/backend/parser/parse_utilcmd.c b/src/backend/parser/parse_utilcmd.c index d041fdfb..25550493 100644 --- a/src/backend/parser/parse_utilcmd.c +++ b/src/backend/parser/parse_utilcmd.c @@ -3313,7 +3313,6 @@ transformAlterTableStmt(Oid relid, AlterTableStmt *stmt, #endif cxt.ispartitioned = (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE); cxt.partbound = NULL; - cxt.ofType = false; cxt.interval_child = false; cxt.interval_child_idx = -1; cxt.interval_parentId = InvalidOid; From 63a9f3086f047b6be67d63d2a305dc5a2bb6d210 Mon Sep 17 00:00:00 2001 From: bethding Date: Sat, 13 Mar 2021 11:11:41 +0800 Subject: [PATCH 340/578] check interrupts before recv retry --- src/backend/libpq/be-secure.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/backend/libpq/be-secure.c b/src/backend/libpq/be-secure.c index f0006171..ea947f5b 100644 --- a/src/backend/libpq/be-secure.c +++ b/src/backend/libpq/be-secure.c @@ -257,8 +257,10 @@ secure_read(Port *port, void *ptr, size_t len) /* * We'll retry the read. Most likely it will return immediately * because there's still no data available, and we'll wait for the - * socket to become ready again. + * socket to become ready again. But we should check interrupts + * before retry incase of conflict interrupt. 
*/ + CHECK_FOR_INTERRUPTS(); } goto retry; } From 5095f437e8c0fd71c6f91f06523896e5b10db0e1 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Thu, 18 Mar 2021 17:24:49 +0800 Subject: [PATCH 341/578] fix bug in coverity code --- src/backend/libpq/pqcomm.c | 1 - 1 file changed, 1 deletion(-) diff --git a/src/backend/libpq/pqcomm.c b/src/backend/libpq/pqcomm.c index da7c9a50..fb35a142 100644 --- a/src/backend/libpq/pqcomm.c +++ b/src/backend/libpq/pqcomm.c @@ -2026,7 +2026,6 @@ SetSockKeepAlive(int sock) /* check sock */ if (getsockopt(sock, IPPROTO_TCP, TCP_INFO, &info, (socklen_t *)&len) < 0) { - elog(LOG, "getsockopt(TCP_INFO) failed"); return; } From 57dcc31186ce36c7b2c4e2bf80844d76679759a8 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Thu, 18 Mar 2021 19:36:09 +0800 Subject: [PATCH 342/578] fix complie warning --- src/gtm/main/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gtm/main/main.c b/src/gtm/main/main.c index 7324ad8b..ec2b2143 100644 --- a/src/gtm/main/main.c +++ b/src/gtm/main/main.c @@ -610,7 +610,7 @@ static bool CheckClockSource(void) #endif -static void GTM_XLogRecoveryIfNeed(const char *data_dir) +static void GTM_XLogRecoveryIfNeed(char *data_dir) { Assert(ControlData != NULL); From 90b348d4ced58f040c318efb18cc5c0df0302eca Mon Sep 17 00:00:00 2001 From: arrowbowang Date: Fri, 19 Mar 2021 10:54:16 +0800 Subject: [PATCH 343/578] fix ID85357491 for range partition table with big step --- src/backend/utils/adt/ruleutils.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/utils/adt/ruleutils.c b/src/backend/utils/adt/ruleutils.c index 8fa13f29..eb2e5420 100644 --- a/src/backend/utils/adt/ruleutils.c +++ b/src/backend/utils/adt/ruleutils.c @@ -11717,7 +11717,7 @@ find_partidx_by_int(int64 start, int step, int partitions, int gap = -1; int align = -1; - if(value < start || value >= start + step*partitions) + if(value < start || value >= start + ((int64)step)*partitions) { return PARTITION_ROUTER_RESULT_NULL; } From 0c110531600bc86c2afdc73455085aac970e5dbe Mon Sep 17 00:00:00 2001 From: youngxie Date: Fri, 19 Mar 2021 16:31:14 +0800 Subject: [PATCH 344/578] Fix ce --- src/gtm/gtm_ctl/gtm_ctl.c | 1 - 1 file changed, 1 deletion(-) diff --git a/src/gtm/gtm_ctl/gtm_ctl.c b/src/gtm/gtm_ctl/gtm_ctl.c index 4989ed41..2b34ad0d 100644 --- a/src/gtm/gtm_ctl/gtm_ctl.c +++ b/src/gtm/gtm_ctl/gtm_ctl.c @@ -1100,7 +1100,6 @@ do_status(void) { for (; *optlines != NULL; optlines++) fputs(*optlines, stdout); - free(optlines); } } From d09218dca2bc9644d2b1acd5e761702549ce93dc Mon Sep 17 00:00:00 2001 From: jackywpxie Date: Mon, 22 Mar 2021 15:29:45 +0800 Subject: [PATCH 345/578] jacky/bugfix/coredump_Tbase_v5.05.3 (merge request !206) Squash merge branch 'jacky/bugfix/coredump_Tbase_v5.05.3' into 'Tbase_v5.05.3' * rollback to former. * rollback * delete nouse code. * delete nouse code. * refactor * disable nodePool->created * delete nouse code. * m_version * Revert 'debug info.' * Revert 'debug info' * comment * add log info. * add log info. * debug info. * debug info * debug info. * modified accoording to suggestion * modified according to suggestion. * delete nouse code. * time * delete nouse code. * fixed warm invalid slot. 
* fix Conflicts: src/backend/pgxc/pool/poolmgr.c --- src/backend/pgxc/pool/poolmgr.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/backend/pgxc/pool/poolmgr.c b/src/backend/pgxc/pool/poolmgr.c index b9c3bd8c..f33b95f0 100644 --- a/src/backend/pgxc/pool/poolmgr.c +++ b/src/backend/pgxc/pool/poolmgr.c @@ -4725,6 +4725,7 @@ release_connection(DatabasePool *dbPool, PGXCNodePoolSlot *slot, */ if (PoolConnectDebugPrint) { +<<<<<<< HEAD elog(LOG, POOL_MGR_PREFIX"release_connection connection to " "database:%s user:%s " @@ -4733,6 +4734,9 @@ release_connection(DatabasePool *dbPool, PGXCNodePoolSlot *slot, dbPool->database, dbPool->user_name, nodePool->node_name, slot->backend_pid, nodeidx, nodePool->size, nodePool->freeSize); +======= + elog(LOG, POOL_MGR_PREFIX"release_connection connection to node:%s backend_pid:%d nodeidx:%d size:%d freeSize:%d can not find nodepool, just destory it", nodePool->node_name, slot->backend_pid, nodeidx, nodePool->size, nodePool->freeSize); +>>>>>>> jacky/bugfix/coredump_Tbase_v5.05.3 (merge request !206) } destroy_slot(nodeidx, node, slot); return; From 59ce0b55fdf1b23f292d68acfdda130ff8ca8dea Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Thu, 30 Jun 2022 11:01:56 +0800 Subject: [PATCH 346/578] Revert "jacky/bugfix/coredump_Tbase_v5.05.3 (merge request !206)" This reverts commit 16a767922044eef82de654e833469729dc9f019b. --- src/backend/pgxc/pool/poolmgr.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/backend/pgxc/pool/poolmgr.c b/src/backend/pgxc/pool/poolmgr.c index f33b95f0..b9c3bd8c 100644 --- a/src/backend/pgxc/pool/poolmgr.c +++ b/src/backend/pgxc/pool/poolmgr.c @@ -4725,7 +4725,6 @@ release_connection(DatabasePool *dbPool, PGXCNodePoolSlot *slot, */ if (PoolConnectDebugPrint) { -<<<<<<< HEAD elog(LOG, POOL_MGR_PREFIX"release_connection connection to " "database:%s user:%s " @@ -4734,9 +4733,6 @@ release_connection(DatabasePool *dbPool, PGXCNodePoolSlot *slot, dbPool->database, dbPool->user_name, nodePool->node_name, slot->backend_pid, nodeidx, nodePool->size, nodePool->freeSize); -======= - elog(LOG, POOL_MGR_PREFIX"release_connection connection to node:%s backend_pid:%d nodeidx:%d size:%d freeSize:%d can not find nodepool, just destory it", nodePool->node_name, slot->backend_pid, nodeidx, nodePool->size, nodePool->freeSize); ->>>>>>> jacky/bugfix/coredump_Tbase_v5.05.3 (merge request !206) } destroy_slot(nodeidx, node, slot); return; From 467a3da532eb1462a914025cd3acab576b991137 Mon Sep 17 00:00:00 2001 From: ericxwu Date: Thu, 1 Apr 2021 15:44:40 +0800 Subject: [PATCH 347/578] Skip early ExecFinishNode if limit have been pushed down Upper level limit node could skip early ExecFinishNode to save 2~3ms of meaningless communication, since we've already push down the limit. 
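Sketch of the mechanism for reference (illustrative only, not part of the patch; the field name skipEarlyFinish and both call sites are taken from the diff below, everything else is simplified):

    /*
     * Planner side (grouping_planner): when the LIMIT has already been
     * planted below the RemoteSubplan, mark the upper Limit node so it
     * can skip the early-finish round trip to the datanodes.
     */
    path = (Path *) create_limit_path(root, final_rel, path,
                                      parse->limitOffset, parse->limitCount,
                                      offset_est, count_est,
                                      pushDown /* skipEarlyFinish */);

    /*
     * Executor side (ExecLimit): only broadcast the early finish when the
     * LIMIT was not pushed down; otherwise the datanodes stop producing
     * rows on their own and the extra round trip is wasted.
     */
    if (!((Limit *) node->ps.plan)->skipEarlyFinish)
        ExecFinishNode(pstate);
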
--- src/backend/executor/execProcnode.c | 3 +++ src/backend/executor/nodeLimit.c | 1 + src/backend/nodes/copyfuncs.c | 4 +++- src/backend/nodes/outfuncs.c | 6 ++++++ src/backend/nodes/readfuncs.c | 4 +++- src/backend/optimizer/plan/createplan.c | 15 +++++++++++---- src/backend/optimizer/plan/planner.c | 16 ++++++++++++++-- src/backend/optimizer/util/pathnode.c | 4 +++- src/include/nodes/plannodes.h | 3 +++ src/include/nodes/relation.h | 3 +++ src/include/optimizer/pathnode.h | 3 ++- src/include/optimizer/planmain.h | 2 +- 12 files changed, 53 insertions(+), 11 deletions(-) diff --git a/src/backend/executor/execProcnode.c b/src/backend/executor/execProcnode.c index cec4400d..d9ae1aa9 100644 --- a/src/backend/executor/execProcnode.c +++ b/src/backend/executor/execProcnode.c @@ -1075,6 +1075,9 @@ ExecFinishNode(PlanState *node) return; } break; + case T_LimitState: + elog(LOG, "[LIMITSTATE]"); + break; default: break; } diff --git a/src/backend/executor/nodeLimit.c b/src/backend/executor/nodeLimit.c index 10e90910..9849ff29 100644 --- a/src/backend/executor/nodeLimit.c +++ b/src/backend/executor/nodeLimit.c @@ -142,6 +142,7 @@ ExecLimit(PlanState *pstate) elog(LOG, "ExecLimit: pid %d nodeLimit finishing", MyProcPid); } + if (!((Limit *)node->ps.plan)->skipEarlyFinish) ExecFinishNode(pstate); if (g_DataPumpDebug) diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c index 6e6e562f..9ccd69bd 100644 --- a/src/backend/nodes/copyfuncs.c +++ b/src/backend/nodes/copyfuncs.c @@ -1197,7 +1197,9 @@ _copyLimit(const Limit *from) */ COPY_NODE_FIELD(limitOffset); COPY_NODE_FIELD(limitCount); - +#ifdef __TBASE__ + COPY_SCALAR_FIELD(skipEarlyFinish); +#endif return newnode; } diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c index ac7ea190..8266ad33 100644 --- a/src/backend/nodes/outfuncs.c +++ b/src/backend/nodes/outfuncs.c @@ -1679,6 +1679,9 @@ _outLimit(StringInfo str, const Limit *node) WRITE_NODE_FIELD(limitOffset); WRITE_NODE_FIELD(limitCount); +#ifdef __TBASE__ + WRITE_BOOL_FIELD(skipEarlyFinish); +#endif } #ifdef XCP @@ -3454,6 +3457,9 @@ _outLimitPath(StringInfo str, const LimitPath *node) WRITE_NODE_FIELD(subpath); WRITE_NODE_FIELD(limitOffset); WRITE_NODE_FIELD(limitCount); +#ifdef __TBASE__ + WRITE_BOOL_FIELD(skipEarlyFinish); +#endif } static void diff --git a/src/backend/nodes/readfuncs.c b/src/backend/nodes/readfuncs.c index b13796e7..db2b9441 100644 --- a/src/backend/nodes/readfuncs.c +++ b/src/backend/nodes/readfuncs.c @@ -3656,7 +3656,9 @@ _readLimit(void) READ_NODE_FIELD(limitOffset); READ_NODE_FIELD(limitCount); - +#ifdef __TBASE__ + READ_BOOL_FIELD(skipEarlyFinish); +#endif READ_DONE(); } diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index 42ffb26e..367b6766 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -2576,7 +2576,8 @@ create_minmaxagg_plan(PlannerInfo *root, MinMaxAggPath *best_path) plan = (Plan *) make_limit(plan, subparse->limitOffset, subparse->limitCount, - 0, 1); + 0, 1, + false); /* Must apply correct cost/width data to Limit node */ plan->startup_cost = mminfo->path->startup_cost; @@ -2608,7 +2609,8 @@ create_minmaxagg_plan(PlannerInfo *root, MinMaxAggPath *best_path) plan = (Plan *) make_limit(plan, subparse->limitOffset, subparse->limitCount, - 0, 1); + 0, 1, + false); plan->startup_cost = mminfo->path->startup_cost; plan->total_cost = mminfo->pathcost; @@ -3066,7 +3068,8 @@ create_limit_plan(PlannerInfo *root, LimitPath 
*best_path, int flags, plan = make_limit(subplan, best_path->limitOffset, best_path->limitCount, - offset_est, count_est); + offset_est, count_est, + best_path->skipEarlyFinish); copy_generic_path_info(&plan->plan, (Path *) best_path); @@ -8386,7 +8389,7 @@ make_lockrows(Plan *lefttree, List *rowMarks, int epqParam) */ Limit * make_limit(Plan *lefttree, Node *limitOffset, Node *limitCount, - int64 offset_est, int64 count_est) + int64 offset_est, int64 count_est, bool skipEarlyFinish) { Limit *node = makeNode(Limit); Plan *plan = &node->plan; @@ -8399,6 +8402,10 @@ make_limit(Plan *lefttree, Node *limitOffset, Node *limitCount, node->limitOffset = limitOffset; node->limitCount = limitCount; +#ifdef __TBASE__ + node->skipEarlyFinish = skipEarlyFinish; +#endif + return node; } diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index 62a92b5f..de1c8ab4 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -2361,6 +2361,8 @@ grouping_planner(PlannerInfo *root, bool inheritance_update, */ if (limit_needed(parse)) { + bool pushDown = false; + /* If needed, add a LimitPath on top of a RemoteSubplan. */ if (path->distribution) { @@ -2392,7 +2394,16 @@ grouping_planner(PlannerInfo *root, bool inheritance_update, path = (Path *) create_limit_path(root, final_rel, path, NULL, limitCount, /* LIMIT + OFFSET */ - 0, offset_est + count_est); + 0, offset_est + count_est, + false); +#ifdef __TBASE__ + /* + * Upper level limit node could skip early ExecFinishNode to save + * 2~3ms of meaningless communication, since we've already push + * down the limit. + */ + pushDown = true; +#endif } path = create_remotesubplan_path(root, path, NULL); @@ -2401,7 +2412,8 @@ grouping_planner(PlannerInfo *root, bool inheritance_update, path = (Path *) create_limit_path(root, final_rel, path, parse->limitOffset, parse->limitCount, - offset_est, count_est); + offset_est, count_est, + pushDown); } /* diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index cc19120e..032253ed 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -6843,7 +6843,8 @@ LimitPath * create_limit_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath, Node *limitOffset, Node *limitCount, - int64 offset_est, int64 count_est) + int64 offset_est, int64 count_est, + bool skipEarlyFinish) {// #lizard forgives LimitPath *pathnode = makeNode(LimitPath); @@ -6864,6 +6865,7 @@ create_limit_path(PlannerInfo *root, RelOptInfo *rel, pathnode->subpath = subpath; pathnode->limitOffset = limitOffset; pathnode->limitCount = limitCount; + pathnode->skipEarlyFinish = skipEarlyFinish; pathnode->path.distribution = copyObject(subpath->distribution); diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h index c20a6741..03457b86 100644 --- a/src/include/nodes/plannodes.h +++ b/src/include/nodes/plannodes.h @@ -988,6 +988,9 @@ typedef struct Limit Plan plan; Node *limitOffset; /* OFFSET parameter, or NULL if none */ Node *limitCount; /* COUNT parameter, or NULL if none */ +#ifdef __TBASE__ + bool skipEarlyFinish; /* Early ExecFinishNode ? 
*/ +#endif } Limit; diff --git a/src/include/nodes/relation.h b/src/include/nodes/relation.h index 4b752d16..689a392f 100644 --- a/src/include/nodes/relation.h +++ b/src/include/nodes/relation.h @@ -1851,6 +1851,9 @@ typedef struct LimitPath Path *subpath; /* path representing input source */ Node *limitOffset; /* OFFSET parameter, or NULL if none */ Node *limitCount; /* COUNT parameter, or NULL if none */ +#ifdef __TBASE__ + bool skipEarlyFinish; /* Early ExecFinishNode ? */ +#endif } LimitPath; diff --git a/src/include/optimizer/pathnode.h b/src/include/optimizer/pathnode.h index a3afb1a4..505cb463 100644 --- a/src/include/optimizer/pathnode.h +++ b/src/include/optimizer/pathnode.h @@ -308,7 +308,8 @@ extern ModifyTablePath *create_modifytable_path(PlannerInfo *root, extern LimitPath *create_limit_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath, Node *limitOffset, Node *limitCount, - int64 offset_est, int64 count_est); + int64 offset_est, int64 count_est, + bool earlyFinish); extern Path *reparameterize_path(PlannerInfo *root, Path *path, Relids required_outer, diff --git a/src/include/optimizer/planmain.h b/src/include/optimizer/planmain.h index 4e32be80..dc4ca53e 100644 --- a/src/include/optimizer/planmain.h +++ b/src/include/optimizer/planmain.h @@ -140,7 +140,7 @@ extern Agg *make_agg(List *tlist, List *qual, List *groupingSets, List *chain, double dNumGroups, Plan *lefttree); extern Limit *make_limit(Plan *lefttree, Node *limitOffset, Node *limitCount, - int64 offset_est, int64 count_est); + int64 offset_est, int64 count_est, bool skipEarlyFinish); extern RemoteSubplan *make_remotesubplan(PlannerInfo *root, Plan *lefttree, Distribution *resultDistribution, From 317d14b57e445319c250a7141786e011a8b0a773 Mon Sep 17 00:00:00 2001 From: ericxwu Date: Wed, 31 Mar 2021 14:56:53 +0800 Subject: [PATCH 348/578] Remove hasAggs/having/sort/limit restrictions to support more subquery FQS We will walk through all these expressions to check exec_nodes in RTE_SUBQUERY, so these query sub cluases are safe to FQS. --- src/backend/optimizer/util/pgxcship.c | 7 +------ src/test/regress/expected/xc_FQS_2.out | 20 ++++++++++++++++++++ src/test/regress/sql/xc_FQS.sql | 2 ++ 3 files changed, 23 insertions(+), 6 deletions(-) diff --git a/src/backend/optimizer/util/pgxcship.c b/src/backend/optimizer/util/pgxcship.c index 7bff63dd..c07af9a4 100644 --- a/src/backend/optimizer/util/pgxcship.c +++ b/src/backend/optimizer/util/pgxcship.c @@ -1828,16 +1828,11 @@ pgxc_is_simple_subquery(Query *query) * Can't pushdown a subquery involving grouping, aggregation, SRFs, * sorting, limiting, or WITH. 
*/ - if (query->hasAggs || - query->hasWindowFuncs || + if (query->hasWindowFuncs || query->hasTargetSRFs || query->groupClause || query->groupingSets || - query->havingQual || - query->sortClause || query->distinctClause || - query->limitOffset || - query->limitCount || query->hasForUpdate || query->cteList) return false; diff --git a/src/test/regress/expected/xc_FQS_2.out b/src/test/regress/expected/xc_FQS_2.out index 9b35d802..7f9570b4 100644 --- a/src/test/regress/expected/xc_FQS_2.out +++ b/src/test/regress/expected/xc_FQS_2.out @@ -1704,6 +1704,26 @@ select * from subquery_fqs t1 where t1.id = 1 and t1.c = (select max(c) from sub 1 | sz | 2 (3 rows) +explain select * from (select * from subquery_fqs where id = 1 order by c limit 1) where c = 2; + QUERY PLAN +-------------------------------------------------------------------------------------- + Remote Fast Query Execution (cost=0.00..0.00 rows=0 width=0) + Node/s: datanode_1 + -> Subquery Scan on "__Alias_22__" (cost=21.02..21.04 rows=1 width=40) + Filter: ("__Alias_22__".c = 2) + -> Limit (cost=21.02..21.02 rows=1 width=40) + -> Sort (cost=21.02..21.03 rows=4 width=40) + Sort Key: subquery_fqs.c + -> Seq Scan on subquery_fqs (cost=0.00..21.00 rows=4 width=40) + Filter: (id = 1) +(9 rows) + +select * from (select * from subquery_fqs where id = 1 order by c limit 1) where c = 2; + id | a | c +----+----+--- + 1 | gd | 2 +(1 row) + drop table tab1_rr; drop table tab1_hash; drop table tab1_modulo; diff --git a/src/test/regress/sql/xc_FQS.sql b/src/test/regress/sql/xc_FQS.sql index 14721a76..bc99b709 100644 --- a/src/test/regress/sql/xc_FQS.sql +++ b/src/test/regress/sql/xc_FQS.sql @@ -290,6 +290,8 @@ explain select * from subquery_fqs t1 where t1.id = 1 and t1.c = (select c from select * from subquery_fqs t1 where t1.id = 1 and t1.c = (select c from subquery_fqs t2 where t2.id=1 order by c limit 1); explain select * from subquery_fqs t1 where t1.id = 1 and t1.c = (select max(c) from subquery_fqs t2 where t2.id=1); select * from subquery_fqs t1 where t1.id = 1 and t1.c = (select max(c) from subquery_fqs t2 where t2.id=1); +explain select * from (select * from subquery_fqs where id = 1 order by c limit 1) where c = 2; +select * from (select * from subquery_fqs where id = 1 order by c limit 1) where c = 2; drop table tab1_rr; drop table tab1_hash; From 7ce1beece3461b1373ad692c552cc2fc9aa9f1b8 Mon Sep 17 00:00:00 2001 From: youngxie Date: Thu, 1 Apr 2021 16:53:40 +0800 Subject: [PATCH 349/578] Stable function can be pushed down. 
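For reference, the volatility rule this change relies on, as an illustrative sketch (not part of the patch; equivalent to the one-line change to pgxc_is_func_shippable() in the diff below):

    /*
     * A function may be evaluated on the datanodes unless it is volatile.
     * Immutable and stable functions return the same result for the same
     * arguments within a single statement, so shipping them is safe;
     * volatile functions (e.g. random()) must stay on the coordinator.
     */
    switch (func_volatile(funcid))
    {
        case PROVOLATILE_IMMUTABLE:
        case PROVOLATILE_STABLE:
            result = true;
            break;
        case PROVOLATILE_VOLATILE:
        default:
            result = false;
            break;
    }
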
http://tapd.oa.com/10092131/bugtrace/bugs/view?bug_id=1010092131086322927&url_cache_key=d4e1402777dc733479aac463ad1a9d24 --- src/backend/optimizer/util/pgxcship.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/optimizer/util/pgxcship.c b/src/backend/optimizer/util/pgxcship.c index c07af9a4..ec2f0504 100644 --- a/src/backend/optimizer/util/pgxcship.c +++ b/src/backend/optimizer/util/pgxcship.c @@ -2221,7 +2221,7 @@ pgxc_is_func_shippable(Oid funcid) default: { - result = (func_volatile(funcid) == PROVOLATILE_IMMUTABLE); + result = (func_volatile(funcid) != PROVOLATILE_VOLATILE); break; } } From 0cfc0c39aab376c593d50eec77677fd7e86d60c6 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Fri, 2 Apr 2021 10:44:38 +0800 Subject: [PATCH 350/578] fix coverity code --- src/gtm/client/fe-connect.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/gtm/client/fe-connect.c b/src/gtm/client/fe-connect.c index c5a0842e..080af583 100644 --- a/src/gtm/client/fe-connect.c +++ b/src/gtm/client/fe-connect.c @@ -1465,7 +1465,8 @@ GTMSetSockKeepAlive(GTM_Conn *conn, int tcp_keepalives_idle, /* check sock */ if (getsockopt(sock, IPPROTO_TCP, TCP_INFO, &info, (socklen_t *)&len) < 0) { - return false; + /* No need to set */ + return true; } if (info.tcpi_state != TCP_ESTABLISHED) From c7c05327ffb7236baab7420e465544e28a10b398 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Fri, 2 Apr 2021 19:03:48 +0800 Subject: [PATCH 351/578] for gtm log rotation http://tapd.oa.com/pgxz/prong/stories/view/1010092131863477653 --- src/gtm/common/Makefile | 2 +- src/gtm/common/elog.c | 87 +++- src/gtm/common/gtm_opt_handler.c | 4 + src/gtm/common/syslogger.c | 676 +++++++++++++++++++++++++++++++ src/gtm/main/gtm_opt.c | 133 ++++-- src/gtm/main/main.c | 340 +++++++++++++++- src/include/gtm/elog.h | 4 + src/include/gtm/gtm.h | 211 +++++----- src/include/gtm/gtm_c.h | 2 + src/include/gtm/gtm_opt.h | 4 + src/include/gtm/gtm_time.h | 31 +- src/include/gtm/syslogger.h | 98 +++++ 12 files changed, 1420 insertions(+), 172 deletions(-) create mode 100644 src/gtm/common/syslogger.c create mode 100644 src/include/gtm/syslogger.h diff --git a/src/gtm/common/Makefile b/src/gtm/common/Makefile index 43d80dad..47052e32 100644 --- a/src/gtm/common/Makefile +++ b/src/gtm/common/Makefile @@ -23,7 +23,7 @@ LDFLAGS=-L$(top_builddir)/common -L$(top_builddir)/libpq LIBS=-lpthread -lrt OBJS = gtm_opt_handler.o aset.o mcxt.o gtm_utils.o elog.o assert.o stringinfo.o gtm_lock.o \ - gtm_list.o gtm_serialize.o gtm_serialize_debug.o gtm_time.o gtm_gxid.o heap.o datapump.o bloom.o + gtm_list.o gtm_serialize.o gtm_serialize_debug.o gtm_time.o gtm_gxid.o heap.o datapump.o bloom.o syslogger.o all:all-lib diff --git a/src/gtm/common/elog.c b/src/gtm/common/elog.c index 833b9e25..a6c6c66d 100644 --- a/src/gtm/common/elog.c +++ b/src/gtm/common/elog.c @@ -28,6 +28,7 @@ #include "gtm/gtm_ext.h" #include "gtm/libpq.h" #include "gtm/pqformat.h" +#include "gtm/syslogger.h" #undef _ #define _(x) x @@ -42,6 +43,8 @@ char *GTMLogFile = NULL; /* GUC parameters */ int Log_destination = LOG_DESTINATION_STDERR; +int exit_flag = GTM_DEFAULT_EXIT_FLAG; + /* Macro for checking errordata_stack_depth is reasonable */ #define CHECK_STACK_DEPTH() \ do { \ @@ -321,15 +324,15 @@ errfinish(int dummy,...) 
} /* Emit the message to the right places */ - { GTM_ThreadInfo *thrinfo = GetMyThreadInfo; if(thrinfo->thr_conn) { EmitErrorReport(thrinfo->thr_conn->con_port); } - - } + else + { EmitErrorReport(NULL); + } /* Now free up subsidiary data attached to stack entry, and release it */ if (edata->message) @@ -745,6 +748,76 @@ pg_re_throw(void) } +/* + * Send data to the syslogger using the chunked protocol + * + * Note: when there are multiple backends writing into the syslogger pipe, + * it's critical that each write go into the pipe indivisibly, and not + * get interleaved with data from other processes. Fortunately, the POSIX + * spec requires that writes to pipes be atomic so long as they are not + * more than PIPE_BUF bytes long. So we divide long messages into chunks + * that are no more than that length, and send one chunk per write() call. + * The collector process knows how to reassemble the chunks. + * + * Because of the atomic write requirement, there are only two possible + * results from write() here: -1 for failure, or the requested number of + * bytes. There is not really anything we can do about a failure; retry would + * probably be an infinite loop, and we can't even report the error usefully. + * (There is noplace else we could send it!) So we might as well just ignore + * the result from write(). However, on some platforms you get a compiler + * warning from ignoring write()'s result, so do a little dance with casting + * rc to void to shut up the compiler. + */ +static void +write_pipe_chunks(char *data, int len, int dest) +{ + PipeProtoChunk p; + int fd = fileno(stderr); + int rc; + + Assert(len > 0); + + p.proto.nuls[0] = p.proto.nuls[1] = '\0'; + p.proto.pid = (exit_flag == GTM_DEFAULT_EXIT_FLAG) ? (int) MyThreadID : 0; + + /* write all but the last chunk */ + while (len > PIPE_MAX_PAYLOAD) + { + p.proto.is_last = (dest == LOG_DESTINATION_CSVLOG ? 'F' : 'f'); + p.proto.len = PIPE_MAX_PAYLOAD; + memcpy(p.proto.data, data, PIPE_MAX_PAYLOAD); + + rc = write(fd, &p, PIPE_HEADER_SIZE + PIPE_MAX_PAYLOAD); + +#ifdef __TBASE__ + /* if we are interruppted, just return */ + if (EINTR == errno && rc < 0) + { + return; + } +#endif + (void) rc; + data += PIPE_MAX_PAYLOAD; + len -= PIPE_MAX_PAYLOAD; + } + + /* write the last chunk */ + p.proto.is_last = (dest == LOG_DESTINATION_CSVLOG ? 
'T' : 't'); + p.proto.len = len; + memcpy(p.proto.data, data, len); + + rc = write(fd, &p, PIPE_HEADER_SIZE + len); +#ifdef __TBASE__ + /* if we are interruppted, just return */ + if (EINTR == errno && rc < 0) + { + return; + } +#endif + (void) rc; +} + + /* * Initialization of error output file */ @@ -857,9 +930,11 @@ send_message_to_server_log(ErrorData *edata) edata->filename, edata->lineno); } - /* Write to stderr, if enabled */ - if (Log_destination & LOG_DESTINATION_STDERR) - write(fileno(stderr), buf.data, buf.len); + /* If in the syslogger thread, try to write messages direct to file */ + if (GetMyThreadInfo->am_syslogger) + write_syslogger_file(buf.data, buf.len, LOG_DESTINATION_STDERR); + else + write_pipe_chunks(buf.data, buf.len, LOG_DESTINATION_STDERR); if (errlog_collection_func && (buf.len > 0) && ('\0' != buf.data[0])) (*errlog_collection_func) (edata, &buf); diff --git a/src/gtm/common/gtm_opt_handler.c b/src/gtm/common/gtm_opt_handler.c index bb0344a5..569db18d 100644 --- a/src/gtm/common/gtm_opt_handler.c +++ b/src/gtm/common/gtm_opt_handler.c @@ -47,6 +47,7 @@ static int gtm_opt_var_compare(const void *a, const void *b); static void InitializeOneGTMOption(struct config_generic * gconf); static void ReportGTMOption(struct config_generic * record); static char *_ShowOption(struct config_generic * record, bool use_units); +extern void GTM_SendNotifyByte(); /* * Variables to bel fed by specific option definition: gtm_opt.c and gtm_proxy_opt.c @@ -350,6 +351,9 @@ ProcessConfigFile(GtmOptContext context) FreeConfigVariables(head); if (cvc) free(cvc); + + /* notify the syslogger */ + GTM_SendNotifyByte(); return true; cleanup_list: diff --git a/src/gtm/common/syslogger.c b/src/gtm/common/syslogger.c new file mode 100644 index 00000000..844d6218 --- /dev/null +++ b/src/gtm/common/syslogger.c @@ -0,0 +1,676 @@ +/*------------------------------------------------------------------------- + * + * syslogger.c + * + * The system logger (syslogger) catches all + * stderr output from the gtm thread by redirecting to a pipe, and + * writes it to a set of logfiles. It's possible to have size and + * age limits for the logfile configured in gtm.conf. If these limits + * are reached or passed, the current logfile is closed and a new one + * is created (rotated) The logfiles are stored in a subdirectory gtm_log. + * + * Copyright (c) 2021-Present TBase development team, Tencent + * + * IDENTIFICATION + * src/gtm/common/syslogger.c + * + *------------------------------------------------------------------------- + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "gtm/gtm_c.h" +#include "gtm/gtm.h" +#include "gtm/stringinfo.h" +#include "gtm/gtm_list.h" +#include "gtm/palloc.h" +#include "gtm/syslogger.h" +#include "gtm/gtm_time.h" +#include "gtm/elog.h" + +/* + * GUC parameters. Logging_collector cannot be changed after postmaster + * start, but the rest can change at SIGHUP. 
+ */ +bool Logging_collector = false; +int Log_RotationAge = HOURS_PER_DAY * MINS_PER_HOUR; +int Log_RotationSize = 10 * 1024; +char *Log_directory = NULL; +char *Log_filename = "gtm-%Y-%m-%d_%H%M%S.log"; +bool Log_truncate_on_rotation = false; +int Log_file_mode = S_IRUSR | S_IWUSR; + +/* + * Private state + */ +pg_time_t next_rotation_time; +bool rotation_disabled = false; +FILE *gtmlogFile = NULL; +pg_time_t first_syslogger_file_time = 0; +static char *last_file_name = NULL; +bool rotation_requested = false; + +/* + * Buffers for saving partial messages from different backends. + * + * Keep NBUFFER_LISTS lists of these, with the entry for a given source pid + * being in the list numbered (pid % NBUFFER_LISTS), so as to cut down on + * the number of entries we have to examine for any one incoming message. + * There must never be more than one entry for the same source pid. + * + * An inactive buffer is not removed from its list, just held for re-use. + * An inactive buffer has pid == 0 and undefined contents of data. + */ +typedef struct +{ + int32 pid; /* PID of source process */ + StringInfoData data; /* accumulated data, as a StringInfo */ +} save_buffer; + +#define NBUFFER_LISTS 256 +static gtm_List *buffer_lists[NBUFFER_LISTS]; + + +int syslogPipe[2] = {-1, -1}; +int signalPipe[2] = {-1, -1}; + + +void flush_pipe_input(char *logbuffer, int *bytes_in_logbuffer); +static FILE *logfile_open(const char *filename, const char *mode, + bool allow_errors); + +void logfile_rotate(bool time_based_rotation, int size_rotation_for); +static char *logfile_getname(pg_time_t timestamp, const char *suffix); + + +/* -------------------------------- + * pipe protocol handling + * -------------------------------- + */ + +/* + * Process data received through the syslogger pipe. + * + * This routine interprets the log pipe protocol which sends log messages as + * (hopefully atomic) chunks - such chunks are detected and reassembled here. + * + * The protocol has a header that starts with two nul bytes, then has a 16 bit + * length, the pid of the sending process, and a flag to indicate if it is + * the last chunk in a message. Incomplete chunks are saved until we read some + * more, and non-final chunks are accumulated until we get the final chunk. + * + * All of this is to avoid 2 problems: + * . partial messages being written to logfiles (messes rotation), and + * . messages from different backends being interleaved (messages garbled). + * + * Any non-protocol messages are written out directly. These should only come + * from non-PostgreSQL sources, however (e.g. third party libraries writing to + * stderr). + * + * logbuffer is the data input buffer, and *bytes_in_logbuffer is the number + * of bytes present. On exit, any not-yet-eaten data is left-justified in + * logbuffer, and *bytes_in_logbuffer is updated. + */ +void +process_pipe_input(char *logbuffer, int *bytes_in_logbuffer, bool* pipe_eof_seen) +{ + char *cursor = logbuffer; + int count = *bytes_in_logbuffer; + int dest = LOG_DESTINATION_STDERR; + + /* While we have enough for a header, process data... */ + while (count >= (int) (offsetof(PipeProtoHeader, data) + 1)) + { + PipeProtoHeader p; + int chunklen; + + /* Do we have a valid header? 
*/ + memcpy(&p, cursor, offsetof(PipeProtoHeader, data)); + if (p.nuls[0] == '\0' && p.nuls[1] == '\0' && + p.len > 0 && p.len <= PIPE_MAX_PAYLOAD && + (p.is_last == 't' || p.is_last == 'f' || + p.is_last == 'T' || p.is_last == 'F')) + { + gtm_List *buffer_list; + gtm_ListCell *cell; + save_buffer *existing_slot = NULL, + *free_slot = NULL; + StringInfo str; + + chunklen = PIPE_HEADER_SIZE + p.len; + + if (p.pid == 0) + { + *pipe_eof_seen = true; + } + + /* Fall out of loop if we don't have the whole chunk yet */ + if (count < chunklen) + break; + + dest = (p.is_last == 'T' || p.is_last == 'F') ? + LOG_DESTINATION_CSVLOG : LOG_DESTINATION_STDERR; + + /* Locate any existing buffer for this source pid */ + buffer_list = buffer_lists[p.pid % NBUFFER_LISTS]; + gtm_foreach(cell, buffer_list) + { + save_buffer *buf = (save_buffer *) gtm_lfirst(cell); + + if (buf->pid == p.pid) + { + existing_slot = buf; + break; + } + if (buf->pid == 0 && free_slot == NULL) + free_slot = buf; + } + + if (p.is_last == 'f' || p.is_last == 'F') + { + /* + * Save a complete non-final chunk in a per-pid buffer + */ + if (existing_slot != NULL) + { + /* Add chunk to data from preceding chunks */ + str = &(existing_slot->data); + appendBinaryStringInfo(str, + cursor + PIPE_HEADER_SIZE, + p.len); + } + else + { + /* First chunk of message, save in a new buffer */ + if (free_slot == NULL) + { + /* + * Need a free slot, but there isn't one in the list, + * so create a new one and extend the list with it. + */ + free_slot = palloc(sizeof(save_buffer)); + buffer_list = gtm_lappend(buffer_list, free_slot); + buffer_lists[p.pid % NBUFFER_LISTS] = buffer_list; + } + free_slot->pid = p.pid; + str = &(free_slot->data); + initStringInfo(str); + appendBinaryStringInfo(str, + cursor + PIPE_HEADER_SIZE, + p.len); + } + } + else + { + /* + * Final chunk --- add it to anything saved for that pid, and + * either way write the whole thing out. + */ + if (existing_slot != NULL) + { + str = &(existing_slot->data); + appendBinaryStringInfo(str, + cursor + PIPE_HEADER_SIZE, + p.len); + write_syslogger_file(str->data, str->len, dest); + /* Mark the buffer unused, and reclaim string storage */ + existing_slot->pid = 0; + pfree(str->data); + } + else + { + /* The whole message was one chunk, evidently. */ + write_syslogger_file(cursor + PIPE_HEADER_SIZE, p.len, + dest); + } + } + + /* Finished processing this chunk */ + cursor += chunklen; + count -= chunklen; + } + else + { + /* Process non-protocol data */ + + /* + * Look for the start of a protocol header. If found, dump data + * up to there and repeat the loop. Otherwise, dump it all and + * fall out of the loop. (Note: we want to dump it all if at all + * possible, so as to avoid dividing non-protocol messages across + * logfiles. We expect that in many scenarios, a non-protocol + * message will arrive all in one read(), and we want to respect + * the read() boundary if possible.) 
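 *
 * For instance, if a third-party library wrote the five bytes "oops\n"
 * to stderr and a protocol chunk follows them in the buffer, the scan
 * below stops at the first zero byte (the start of that chunk's header),
 * those five bytes are written as-is to the current stderr logfile, and
 * header parsing resumes at the chunk boundary.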
+ */ + for (chunklen = 1; chunklen < count; chunklen++) + { + if (cursor[chunklen] == '\0') + break; + } + /* fall back on the stderr log as the destination */ + write_syslogger_file(cursor, chunklen, LOG_DESTINATION_STDERR); + cursor += chunklen; + count -= chunklen; + } + } + + /* We don't have a full chunk, so left-align what remains in the buffer */ + if (count > 0 && cursor != logbuffer) + memmove(logbuffer, cursor, count); + *bytes_in_logbuffer = count; +} + +/* + * Force out any buffered data + * + * This is currently used only at syslogger shutdown, but could perhaps be + * useful at other times, so it is careful to leave things in a clean state. + */ +void +flush_pipe_input(char *logbuffer, int *bytes_in_logbuffer) +{ + int i; + + /* Dump any incomplete protocol messages */ + for (i = 0; i < NBUFFER_LISTS; i++) + { + gtm_List *list = buffer_lists[i]; + gtm_ListCell *cell; + + gtm_foreach(cell, list) + { + save_buffer *buf = (save_buffer *) gtm_lfirst(cell); + + if (buf->pid != 0) + { + StringInfo str = &(buf->data); + + write_syslogger_file(str->data, str->len, LOG_DESTINATION_STDERR); + /* Mark the buffer unused, and reclaim string storage */ + buf->pid = 0; + pfree(str->data); + } + } + } + + /* + * Force out any remaining pipe data as-is; we don't bother trying to + * remove any protocol headers that may exist in it. + */ + if (*bytes_in_logbuffer > 0) + write_syslogger_file(logbuffer, *bytes_in_logbuffer, LOG_DESTINATION_STDERR); + *bytes_in_logbuffer = 0; +} + + +/* -------------------------------- + * logfile routines + * -------------------------------- + */ + +/* + * Write text to the currently open logfile + * + * This is exported so that elog.c can call it when am_syslogger is true. + * This allows the syslogger process to record elog messages of its own, + * even though its stderr does not point at the syslog pipe. + */ +void +write_syslogger_file(const char *buffer, int count, int destination) +{ + int rc; + + if (destination != LOG_DESTINATION_STDERR) + { + return; + } + + if (gtmlogFile == NULL) + { + write(fileno(stderr), buffer, count); + return; + } + + rc = fwrite(buffer, 1, count, gtmlogFile); + + /* can't use ereport here because of possible recursion */ + if (rc != count) + write_stderr("could not write to log file: %s\n", strerror(errno)); +} + +/* + * Open a new logfile with proper permissions and buffering options. + * + * If allow_errors is true, we just log any open failure and return NULL + * (with errno still correct for the fopen failure). + * Otherwise, errors are treated as fatal. + */ +static FILE * +logfile_open(const char *filename, const char *mode, bool allow_errors) +{ + FILE *fh; + mode_t oumask; + + /* + * Note we do not let Log_file_mode disable IWUSR, since we certainly want + * to be able to write the files ourselves. + */ + oumask = umask((mode_t) ((~(Log_file_mode | S_IWUSR)) & (S_IRWXU | S_IRWXG | S_IRWXO))); + fh = fopen(filename, mode); + umask(oumask); + + if (fh) + { + setvbuf(fh, NULL, PG_IOLBF, 0); + +#ifdef WIN32 + /* use CRLF line endings on Windows */ + _setmode(_fileno(fh), _O_TEXT); +#endif + } + else + { + int save_errno = errno; + + ereport(allow_errors ? 
LOG : FATAL, + (errmsg("could not open log file \"%s\": %m", + filename))); + errno = save_errno; + } + + return fh; +} + +/* + * perform logfile rotation + */ +void +logfile_rotate(bool time_based_rotation, int size_rotation_for) +{ + char *filename; + pg_time_t fntime; + FILE *fh; + + rotation_requested = false; + + /* + * When doing a time-based rotation, invent the new logfile name based on + * the planned rotation time, not current time, to avoid "slippage" in the + * file name when we don't do the rotation immediately. + */ + if (time_based_rotation) + fntime = next_rotation_time; + else + fntime = time(NULL); + filename = logfile_getname(fntime, NULL); + + /* + * Decide whether to overwrite or append. We can overwrite if (a) + * Log_truncate_on_rotation is set, (b) the rotation was triggered by + * elapsed time and not something else, and (c) the computed file name is + * different from what we were previously logging into. + * + * Note: last_file_name should never be NULL here, but if it is, append. + */ + if (time_based_rotation || (size_rotation_for & LOG_DESTINATION_STDERR)) + { + if (Log_truncate_on_rotation && time_based_rotation && + last_file_name != NULL && + strcmp(filename, last_file_name) != 0) + fh = logfile_open(filename, "w", true); + else + fh = logfile_open(filename, "a", true); + + if (!fh) + { + /* + * ENFILE/EMFILE are not too surprising on a busy system; just + * keep using the old file till we manage to get a new one. + * Otherwise, assume something's wrong with Log_directory and stop + * trying to create files. + */ + if (errno != ENFILE && errno != EMFILE) + { + ereport(LOG, + (errmsg("disabling automatic rotation (use SIGHUP to re-enable)"))); + rotation_disabled = true; + } + + if (filename) + pfree(filename); + return; + } + + fclose(gtmlogFile); + gtmlogFile = fh; + + /* instead of pfree'ing filename, remember it for next time */ + if (last_file_name != NULL) + pfree(last_file_name); + last_file_name = filename; + filename = NULL; + } + + if (filename) + pfree(filename); + + set_next_rotation_time(); +} + + +/* + * construct logfile name using timestamp information + * + * If suffix isn't NULL, append it to the name, replacing any ".log" + * that may be in the pattern. + * + * Result is palloc'd.postgresql-%Y-%m-%d_%H%M%S.log + */ +static char * +logfile_getname(pg_time_t timestamp, const char *suffix) +{ + char *filename; + int len; + time_t stamp_time; + struct tm timeinfo; + filename = palloc(MAXPGPATH); + + snprintf(filename, MAXPGPATH, "%s/", Log_directory); + + len = strlen(filename); + + stamp_time = time(NULL); + localtime_r(&stamp_time,&timeinfo); + /* treat Log_filename as a strftime pattern */ + strftime(filename + len, MAXPGPATH - len, Log_filename, + &timeinfo); + + if (suffix != NULL) + { + len = strlen(filename); + if (len > 4 && (strcmp(filename + (len - 4), ".log") == 0)) + len -= 4; + strlcpy(filename + len, suffix, MAXPGPATH - len); + } + + return filename; +} + +/* + * Determine the next planned rotation time, and store in next_rotation_time. + */ +void +set_next_rotation_time(void) +{ + pg_time_t now; + struct tm timeinfo; + int rotinterval; + + /* nothing to do if time-based rotation is disabled */ + if (Log_RotationAge <= 0) + return; + + /* + * The requirements here are to choose the next time > now that is a + * "multiple" of the log rotation interval. "Multiple" can be interpreted + * fairly loosely. In this version we align to log_timezone rather than + * GMT. 
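 *
 * A worked example (numbers are illustrative only): with
 * Log_RotationAge = 60 and a local UTC offset of +08:00, a "now" of
 * 02:17 local time is shifted into local seconds, rounded down to
 * 02:00, advanced one interval to 03:00, and shifted back, so the next
 * rotation always lands on the next local hour boundary regardless of
 * when the previous rotation actually happened.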
+ */ + rotinterval = Log_RotationAge * SECS_PER_MINUTE; /* convert to seconds */ + now = (pg_time_t) time(NULL); + localtime_r(&now,&timeinfo); + now += timeinfo.tm_gmtoff; + now -= now % rotinterval; + now += rotinterval; + now -= timeinfo.tm_gmtoff; + next_rotation_time = now; +} + +/* + * Initialization of error output file + */ +void +GTM_LogFileInit(void) +{ + char *filename; + + /* + * Create log directory if not present; ignore errors + */ + mkdir(Log_directory, S_IRWXU); + + first_syslogger_file_time = time(NULL); + filename = logfile_getname(first_syslogger_file_time, NULL); + + gtmlogFile = logfile_open(filename, "a", false); + + pfree(filename); +} + +/* + * Send one byte to the signal pipe, to wake up syslogger + */ +void +GTM_SendNotifyByte(void) +{ + int rc; + char dummy = 0; + + if (signalPipe[1] == -1) + { + return; + } + +retry: + rc = write(signalPipe[1], &dummy, 1); + if (rc < 0) + { + /* If interrupted by signal, just retry */ + if (errno == EINTR) + goto retry; + + /* + * If the pipe is full, we don't need to retry, the data that's there + * already is enough to wake up WaitLatch. + */ + if (errno == EAGAIN || errno == EWOULDBLOCK) + return; + + /* + * Oops, the write() failed for some other reason. We might be in a + * signal handler, so it's not safe to elog(). We have no choice but + * silently ignore the error. + */ + return; + } +} + +/* + * Read all available data from the signal pipe + */ +void +GTM_drainNotifyBytes(void) +{ + /* + * There shouldn't normally be more than one byte in the pipe, or maybe a + * few bytes if multiple processes run SetLatch at the same instant. + */ + char buf[16]; + int rc; + + if (signalPipe[0] == -1) + { + return; + } + + for (;;) + { + rc = read(signalPipe[0], buf, sizeof(buf)); + if (rc < 0) + { + if (errno == EAGAIN || errno == EWOULDBLOCK) + break; /* the pipe is empty */ + else if (errno == EINTR) + continue; /* retry */ + else + { + elog(LOG, "read() on signalPipe failed: %m"); + break; + } + } + else if (rc == 0) + { + elog(LOG, "unexpected EOF on signalPipe"); + break; + } + else if (rc < sizeof(buf)) + { + /* we successfully drained the pipe; no need to read() again */ + break; + } + /* else buffer wasn't big enough, so read again */ + } +} + +int +GTM_InitSysloggerEpoll(void) +{ + int efd = -1; + struct epoll_event event; + + if (syslogPipe[0] == -1 || signalPipe[0] == -1) + { + return -1; + } + + efd = epoll_create1(0); + if(efd == -1) + { + elog(LOG, "failed to create epoll"); + return -1; + } + + event.data.fd = syslogPipe[0]; + event.events = EPOLLIN | EPOLLERR | EPOLLHUP | EPOLLRDHUP; + if(-1 == epoll_ctl (efd, EPOLL_CTL_ADD, syslogPipe[0], &event)) + { + elog(LOG, "failed to add socket to epoll"); + return -1; + } + + event.data.fd = signalPipe[0]; + event.events = EPOLLIN | EPOLLERR | EPOLLHUP | EPOLLRDHUP; + if(-1 == epoll_ctl (efd, EPOLL_CTL_ADD, signalPipe[0], &event)) + { + elog(LOG, "failed to add socket to epoll"); + return -1; + } + + return efd; +} diff --git a/src/gtm/main/gtm_opt.c b/src/gtm/main/gtm_opt.c index f1a9418d..4029cc8f 100644 --- a/src/gtm/main/gtm_opt.c +++ b/src/gtm/main/gtm_opt.c @@ -30,7 +30,7 @@ #include "gtm/gtm_opt_tables.h" #include "gtm/gtm_opt.h" #include "gtm/gtm_standby.h" - +#include "gtm/gtm_time.h" #define CONFIG_FILENAME "gtm.conf" const char *config_filename = CONFIG_FILENAME; @@ -81,6 +81,11 @@ extern char* unix_socket_directory; extern char* unix_socket_group; extern int unix_socket_permissions; +extern char *Log_filename; +extern int Log_RotationAge; +extern int 
Log_RotationSize; +extern bool Log_truncate_on_rotation; + /* * We have different sets for client and server message level options because * they sort slightly different (see "log" level) @@ -212,15 +217,25 @@ struct config_bool ConfigureNamesBool[] = }, #endif { - {GTM_OPTNAME_CLUSTER_READ_ONLY, GTMC_STARTUP, - gettext_noop("Nodes connected with gtm will be readonly."), - gettext_noop("Default value is off."), - 0 - }, - >MClusterReadOnly, - false, NULL, NULL, false, NULL + {GTM_OPTNAME_CLUSTER_READ_ONLY, GTMC_STARTUP, + gettext_noop("Nodes connected with gtm will be readonly."), + gettext_noop("Default value is off."), + 0 + }, + >MClusterReadOnly, + false, NULL, NULL, false, NULL }, + { + {GTM_OPTNAME_LOG_TRUNCATE_ON_ROTATION, GTMC_SIGHUP, + gettext_noop("Truncate existing log files of same name during log rotation."), + gettext_noop("Default value is off."), + 0 + }, + &Log_truncate_on_rotation, + false, NULL, NULL, false, NULL + }, + /* End-of-list marker */ { {NULL, 0, NULL, NULL, 0}, NULL, false, NULL, NULL, false, NULL @@ -360,44 +375,68 @@ struct config_int ConfigureNamesInt[] = }, #endif { - { - GTM_OPTNAME_GTS_FREEZE_TIME_LIMIT, GTMC_STARTUP, - gettext_noop("refuse to start gtm before GTS has n days left,default 100 years"), - NULL, - 0 - }, - >MGTSFreezeLimit, - 365 * 100, 0, INT_MAX, NULL, NULL, - 0, NULL + { + GTM_OPTNAME_GTS_FREEZE_TIME_LIMIT, GTMC_STARTUP, + gettext_noop("refuse to start gtm before GTS has n days left,default 100 years"), + NULL, + 0 + }, + >MGTSFreezeLimit, + 365 * 100, 0, INT_MAX, NULL, NULL, + 0, NULL }, { - { - GTM_OPTNAME_STARTUP_GTS_DELTA, GTMC_STARTUP, - gettext_noop("Add -d seconds to GTS when started"), - NULL, - 0 - }, - >MStartupGTSDelta, - 300 , 0, INT_MAX, NULL, NULL, - 0, NULL + { + GTM_OPTNAME_STARTUP_GTS_DELTA, GTMC_STARTUP, + gettext_noop("Add -d seconds to GTS when started"), + NULL, + 0 + }, + >MStartupGTSDelta, + 300 , 0, INT_MAX, NULL, NULL, + 0, NULL }, { - { - GTM_OPTNAME_UNIX_SOCKET_PERMISSIONS, GTMC_STARTUP, - gettext_noop("Sets the access permissions of the Unix-domain socket." - "Unix-domain sockets use the usual Unix file system " - "permission set. The parameter value is expected " - "to be a numeric mode specification in the form " - "accepted by the chmod and umask system calls. " - "(To use the customary octal format the number must " - "start with a 0 (zero).)"), - NULL, - 0 - }, - &unix_socket_permissions, - 0777, 0000, 0777, NULL, NULL, - 0, NULL + { + GTM_OPTNAME_UNIX_SOCKET_PERMISSIONS, GTMC_STARTUP, + gettext_noop("Sets the access permissions of the Unix-domain socket." + "Unix-domain sockets use the usual Unix file system " + "permission set. The parameter value is expected " + "to be a numeric mode specification in the form " + "accepted by the chmod and umask system calls. 
" + "(To use the customary octal format the number must " + "start with a 0 (zero).)"), + NULL, + 0 + }, + &unix_socket_permissions, + 0777, 0000, 0777, NULL, NULL, + 0, NULL + }, + + { + { + GTM_OPTNAME_LOG_ROTATION_AGE, GTMC_SIGHUP, + gettext_noop("Automatic log file rotation will occur after N minutes."), + NULL, + 0 + }, + &Log_RotationAge, + HOURS_PER_DAY * MINS_PER_HOUR, 0, INT_MAX / SECS_PER_MINUTE, NULL, NULL, + 0, NULL + }, + + { + { + GTM_OPTNAME_LOG_ROTATION_SIZE, GTMC_SIGHUP, + gettext_noop("Automatic log file rotation will occur after N kilobytes."), + NULL, + 0 + }, + &Log_RotationSize, + 10 * 1024, 0, INT_MAX / 1024, NULL, NULL, + 0, NULL }, /* End-of-list marker */ @@ -492,6 +531,18 @@ struct config_string ConfigureNamesString[] = NULL, NULL }, + { + {GTM_OPTNAME_LOG_FILENAME_PATTERN, GTMC_SIGHUP, + gettext_noop("Sets the file name pattern for log files."), + NULL, + 0 + }, + &Log_filename, + "gtm-%Y-%m-%d_%H%M%S.log", + NULL, NULL, + NULL, NULL + }, + { {GTM_OPTNAME_ERROR_REPORTER, GTMC_STARTUP, gettext_noop("Command to report various errors."), diff --git a/src/gtm/main/main.c b/src/gtm/main/main.c index ec2b2143..7d9563f1 100644 --- a/src/gtm/main/main.c +++ b/src/gtm/main/main.c @@ -59,6 +59,7 @@ #include "gtm/gtm_time.h" #include "gtm/gtm_stat.h" #include "gtm/gtm_stat_error.h" +#include "gtm/syslogger.h" #ifdef __TBASE__ #include "gtm/gtm_store.h" @@ -76,6 +77,7 @@ extern char *optarg; #define GTM_DEFAULT_PORT 6666 #define GTM_PID_FILE "gtm.pid" #define GTM_LOG_FILE "gtm.log" +#define GTM_LOG_FILE_DIR "gtm_log" #define LOOPS_UNTIL_HIBERNATE 50 #define HIBERNATE_FACTOR 25 @@ -138,6 +140,8 @@ GTM_ThreadInfo *g_timekeeper_thread = NULL; GTM_ThreadInfo *g_timebackup_thread = NULL; GTM_ThreadInfo *g_timer_thread = NULL; GTM_ThreadInfo *g_logcollector_thread = NULL; +GTM_ThreadInfo *g_syslogger_thread = NULL; + void *GTM_ThreadLogCollector(void *argp); extern void GTM_ErrorLogCollector(ErrorData *edata, StringInfo buff); GTM_ThreadInfo *g_standby_pre_server_thread = NULL; @@ -202,6 +206,8 @@ void GTM_PortCleanup(Port *con_port); #endif void *GTM_ThreadMain(void *argp); void *GTM_ThreadTimeKeeper(void *argp); +static void *GTM_ThreadSysLogger(void *argp); +static bool GTM_SysLoggerStart(void); #ifdef __XLOG__ void *GTM_ThreadCheckPointer(void *argp); @@ -247,7 +253,7 @@ static void ProcessBarrierCommand(Port *myport, GTM_MessageType mtype, StringInf static int GTMInitConnection(GTM_ConnectionInfo *conninfo); static void SetNonBlockConnection(GTM_ConnectionInfo *conninfo); -static void gtm_standby_pre_server_loop(const char *data_dir); +static void gtm_standby_pre_server_loop(char *data_dir); #ifdef __XLOG__ static void thread_replication_clean(GTM_StandbyReplication *replication); @@ -256,6 +262,8 @@ static void WaitRedoertoExit(void); static void GTMSigHupHandler(void); #endif +void GTM_Exit(void); + /* * One-time initialization. 
It's called immediately after the main process * starts @@ -337,6 +345,8 @@ MainThreadInit() memset(thrinfo->locks_hold, 0x00, sizeof(void*) * g_max_lock_number); #endif + /* thread main is syslogger before syslogger thread create */ + thrinfo->am_syslogger = true; GTM_RWLockInit(&thrinfo->thr_lock); GTM_RWLockAcquire(&thrinfo->thr_lock, GTM_LOCKMODE_WRITE); @@ -387,16 +397,22 @@ BaseInit(char *data_dir) SpinLockInit(&g_last_sync_gts_lock); #endif + if (Log_directory == NULL) + { + Log_directory = (char *) malloc(GTM_MAX_PATH); + sprintf(Log_directory, "%s/%s", GTMDataDir, GTM_LOG_FILE_DIR); + } + if (GTMLogFile == NULL) { GTMLogFile = (char *) malloc(GTM_MAX_PATH); - sprintf(GTMLogFile, "%s/%s", GTMDataDir, GTM_LOG_FILE); + sprintf(GTMLogFile, "%s/%s", Log_directory, GTM_LOG_FILE); } /* Save Node Register File in register.c */ Recovery_SaveRegisterFileName(GTMDataDir); - DebugFileOpen(); + GTM_LogFileInit(); GTM_InitTxnManager(); GTM_InitSeqManager(); @@ -1460,6 +1476,14 @@ main(int argc, char *argv[]) process_thread_num = g_max_thread_number < process_thread_num ? g_max_thread_number : process_thread_num; } + /* start syslogger thread to handle log */ + if (!GTM_SysLoggerStart()) + { + elog(ERROR, "Failed to create syslogger thread."); + exit(1); + } + util_thread_cnt++; + /* Create GTM threads handling requests */ g_timekeeper_thread = GTM_ThreadCreate(GTM_ThreadTimeKeeper, g_max_lock_number); if (NULL == g_timekeeper_thread) @@ -1874,7 +1898,7 @@ gtm_add_connection_standby_pre_server(Port *port) * handle loop before establish a connection to active-GTM */ static void -gtm_standby_pre_server_loop(const char *data_dir) +gtm_standby_pre_server_loop(char *data_dir) { fd_set readmask; int nSockets; @@ -5912,4 +5936,312 @@ void CheckStandbyConnect(GTM_ThreadInfo *my_threadinfo, GTM_ConnectionInfo *conn } #endif + + +/* + * syslogger thread, handle log rotation and write log to logfile. + */ +static void* +GTM_ThreadSysLogger(void *argp) +{ +#define GTM_SYSLOGGER_WAIT_EVENTS 2 + GTM_ThreadInfo *my_threadinfo = (GTM_ThreadInfo *)argp; + sigjmp_buf local_sigjmp_buf; + pg_time_t now = 0; + int efd = -1; + int n = 0; + char logbuffer[READ_BUF_SIZE]; + int bytes_in_logbuffer = 0; + char *currentLogFilename = NULL; + int currentLogRotationAge = 0; + bool got_SIGHUP = false; + bool pipe_eof_seen = false; + sigset_t mask; + struct epoll_event events[GTM_SYSLOGGER_WAIT_EVENTS]; + + my_threadinfo->am_syslogger = true; + + /* ignore signal */ + sigfillset(&mask); + pthread_sigmask(SIG_BLOCK, &mask, NULL); + + elog(DEBUG8, "Starting the syslogger thread"); + + bind_service_threads(); + + efd = GTM_InitSysloggerEpoll(); + if (efd == -1) + { + elog(LOG, "failed to init syslogger epoll"); + exit(1); + } + + MessageContext = AllocSetContextCreate(TopMemoryContext, + "MessageContext", + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, + ALLOCSET_DEFAULT_MAXSIZE, + false); + + /* + * POSTGRES main processing loop begins here + * + * If an exception is encountered, processing resumes here so we abort the + * current transaction and start a new one. + * + * You might wonder why this isn't coded as an infinite loop around a + * PG_TRY construct. The reason is that this is the bottom of the + * exception stack, and so with PG_TRY there would be no exception handler + * in force at all during the CATCH part. By leaving the outermost setjmp + * always active, we have at least some chance of recovering from an error + * during error recovery. 
(If we get into an infinite loop thereby, it + * will soon be stopped by overflow of elog.c's internal state stack.) + */ + if (sigsetjmp(local_sigjmp_buf, 1) != 0) + { +#ifdef __TBASE__ + RWLockCleanUp(); +#endif + EmitErrorReport(NULL); + + /* + * Now return to normal top-level context and clear ErrorContext for + * next time. + */ + MemoryContextSwitchTo(TopMemoryContext); + FlushErrorState(); + } + + /* We can now handle ereport(ERROR) */ + PG_exception_stack = &local_sigjmp_buf; + + MemoryContextSwitchTo(MessageContext); + MemoryContextResetAndDeleteChildren(MessageContext); + + /* remember active logfile parameters */ + currentLogFilename = pstrdup(Log_filename); + currentLogRotationAge = Log_RotationAge; + + /* set next planned rotation time */ + set_next_rotation_time(); + + for(;;) + { + bool time_based_rotation = false; + int size_rotation_for = 0; + long cur_timeout; + int i = 0; + + /* + * Process any requests or signals received recently. + */ + if (got_SIGHUP) + { + got_SIGHUP = false; + + if (strcmp(Log_filename, currentLogFilename) != 0) + { + pfree(currentLogFilename); + currentLogFilename = pstrdup(Log_filename); + rotation_requested = true; + } + + /* + * If rotation time parameter changed, reset next rotation time, + * but don't immediately force a rotation. + */ + if (currentLogRotationAge != Log_RotationAge) + { + currentLogRotationAge = Log_RotationAge; + set_next_rotation_time(); + } + + /* + * If we had a rotation-disabling failure, re-enable rotation + * attempts after SIGHUP, and force one immediately. + */ + if (rotation_disabled) + { + rotation_disabled = false; + rotation_requested = true; + } + } + + if (Log_RotationAge > 0 && !rotation_disabled) + { + /* Do a logfile rotation if it's time */ + now = (pg_time_t) time(NULL); + if (now >= next_rotation_time) + rotation_requested = time_based_rotation = true; + } + + if (!rotation_requested && Log_RotationSize > 0 && !rotation_disabled) + { + /* Do a rotation if file is too big */ + if (ftell(gtmlogFile) >= Log_RotationSize * 1024L) + { + rotation_requested = true; + size_rotation_for |= LOG_DESTINATION_STDERR; + } + } + + if (rotation_requested) + { + /* + * Force rotation when both values are zero. It means the request + * was sent by pg_rotate_logfile. + */ + if (!time_based_rotation && size_rotation_for == 0) + size_rotation_for = LOG_DESTINATION_STDERR; + logfile_rotate(time_based_rotation, size_rotation_for); + } + + /* + * Calculate time till next time-based rotation, so that we don't + * sleep longer than that. We assume the value of "now" obtained + * above is still close enough. Note we can't make this calculation + * until after calling logfile_rotate(), since it will advance + * next_rotation_time. + * + * Also note that we need to beware of overflow in calculation of the + * timeout: with large settings of Log_RotationAge, next_rotation_time + * could be more than INT_MAX msec in the future. In that case we'll + * wait no more than INT_MAX msec, and try again. 
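 *
 * A small worked example: if next_rotation_time is 90 seconds away,
 * cur_timeout below becomes 90 * 1000 = 90000 msec; if it is more than
 * INT_MAX / 1000 seconds away (roughly 24 days), the wait is clamped
 * and the remaining delay is simply recomputed on the next pass through
 * the loop.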
+ */ + if (Log_RotationAge > 0 && !rotation_disabled) + { + pg_time_t delay; + + delay = next_rotation_time - now; + if (delay > 0) + { + if (delay > INT_MAX / 1000) + delay = INT_MAX / 1000; + cur_timeout = delay * 1000L; /* msec */ + } + else + cur_timeout = 0; + } + else + { + cur_timeout = -1L; + } + + /* + * Sleep until there's something to do + */ + n = epoll_wait (efd, events, GTM_SYSLOGGER_WAIT_EVENTS, cur_timeout); + for(i = 0; i < n; i++) + { + if(events[i].events & EPOLLIN) + { + if (events[i].data.fd == signalPipe[0]) + { + got_SIGHUP = true; + GTM_drainNotifyBytes(); + elog(LOG, "Configuration update message received in syslogger thread."); + } + else + { + int bytesRead; + + bytesRead = read(syslogPipe[0], + logbuffer + bytes_in_logbuffer, + sizeof(logbuffer) - bytes_in_logbuffer); + if (bytesRead < 0) + { + if (errno != EINTR) + ereport(LOG, + (errmsg("could not read from logger pipe: %m"))); + } + else if (bytesRead > 0) + { + bytes_in_logbuffer += bytesRead; + process_pipe_input(logbuffer, &bytes_in_logbuffer, &pipe_eof_seen); + continue; + } + else + { + /* + * Zero bytes read when select() is saying read-ready means + * EOF on the pipe: that is, there are no longer any processes + * with the pipe write end open. Therefore, the postmaster + * and all backends are shut down, and we are done. + */ + pipe_eof_seen = true; + } + } + } + } + + if (pipe_eof_seen) + { + elog(LOG, "GTM syslogger exit(%d)", exit_flag); + /* if there's any data left then force it out now */ + flush_pipe_input(logbuffer, &bytes_in_logbuffer); + + exit(exit_flag); + } + } + + return my_threadinfo; +} + +static bool +GTM_SysLoggerStart(void) +{ + if (syslogPipe[0] < 0) + { + if (pipe(syslogPipe) < 0) + ereport(FATAL, + (errmsg("could not create pipe for syslog: %m"))); + } + + if (signalPipe[0] < 0) + { + if (pipe(signalPipe) < 0) + ereport(FATAL, + (errmsg("could not create pipe for signal: %m"))); + } + + /* Create GTM threads handling requests */ + g_syslogger_thread = GTM_ThreadCreate(GTM_ThreadSysLogger, g_max_lock_number); + if (NULL == g_syslogger_thread) + { + return false; + } + + fflush(stdout); + if (dup2(syslogPipe[1], fileno(stdout)) < 0) + ereport(FATAL, + (errmsg("could not redirect stdout: %m"))); + fflush(stderr); + if (dup2(syslogPipe[1], fileno(stderr)) < 0) + ereport(FATAL, + (errmsg("could not redirect stderr: %m"))); + /* Now we are done with the write end of the pipe. 
*/ + close(syslogPipe[1]); + syslogPipe[1] = -1; + + GetMyThreadInfo->am_syslogger = false; + atexit(GTM_Exit); + return true; +} + +/* + * let exit in syslogger + */ +void +GTM_Exit(void) +{ + if (g_syslogger_thread != NULL && !GetMyThreadInfo->am_syslogger) + { + Assert(exit_flag != GTM_DEFAULT_EXIT_FLAG); + + /* notify syslogger to do the exit */ + elog(LOG, "notify syslogger to exit(%d).", exit_flag); + sleep(-1); + } +} + #endif diff --git a/src/include/gtm/elog.h b/src/include/gtm/elog.h index 70387e54..ffdc99c5 100644 --- a/src/include/gtm/elog.h +++ b/src/include/gtm/elog.h @@ -299,6 +299,10 @@ typedef enum #define LOG_DESTINATION_EVENTLOG 4 #define LOG_DESTINATION_CSVLOG 8 +#define GTM_DEFAULT_EXIT_FLAG 1024 +extern int exit_flag; +#define exit(x) (exit)(exit_flag = (x)) + /* Other exported functions */ extern void pg_re_throw(void); extern void DebugFileOpen(void); diff --git a/src/include/gtm/gtm.h b/src/include/gtm/gtm.h index 715d91cb..714587eb 100644 --- a/src/include/gtm/gtm.h +++ b/src/include/gtm/gtm.h @@ -31,60 +31,60 @@ extern char *GTMLogFile; typedef enum GTM_ThreadStatus { - GTM_THREAD_STARTING, - GTM_THREAD_RUNNING, - GTM_THREAD_EXITING, - GTM_THREAD_BACKUP, /* Backup to standby is in progress */ - /* Must be the last */ - GTM_THREAD_INVALID + GTM_THREAD_STARTING, + GTM_THREAD_RUNNING, + GTM_THREAD_EXITING, + GTM_THREAD_BACKUP, /* Backup to standby is in progress */ + /* Must be the last */ + GTM_THREAD_INVALID } GTM_ThreadStatus; struct GTM_ConnectionInfo; #define ERRORDATA_STACK_SIZE 20 -#define GTM_MAX_CONNECTIONS_PER_THREAD 1024 -#define MAX_LOCKS_PER_THREAD 256 -#define GTM_MIN_THREADS 32 /* Provision for minimum threads */ -#define GTM_MAX_THREADS 512 /* Max threads allowed in the GTM */ +#define GTM_MAX_CONNECTIONS_PER_THREAD 1024 +#define MAX_LOCKS_PER_THREAD 256 +#define GTM_MIN_THREADS 32 /* Provision for minimum threads */ +#define GTM_MAX_THREADS 512 /* Max threads allowed in the GTM */ -#define GTM_TIMEOUT -1L -#define GTM_THREAD_FACTOR 1 +#define GTM_TIMEOUT -1L +#define GTM_THREAD_FACTOR 1 typedef int32 GTM_TimerHandle; #define INVALID_TIMER_HANDLE INVALID_STORAGE_HANDLE typedef struct GTM_ThreadInfo { - /* - * Initial few members get includes from gtm_common.h. This is to make sure - * that the GTMProxy_ThreadInfo and GTM_ThreadInfo structure can be - * typecasted to each other and these initial members can be safely - * accessed. If you need a member which should be common to both - * structures, consider adding them to GTM_COMMON_THREAD_INFO - */ - GTM_COMMON_THREAD_INFO - - GTM_ThreadStatus thr_status; - GTM_ConnectionInfo *thr_conn; + /* + * Initial few members get includes from gtm_common.h. This is to make sure + * that the GTMProxy_ThreadInfo and GTM_ThreadInfo structure can be + * typecasted to each other and these initial members can be safely + * accessed. If you need a member which should be common to both + * structures, consider adding them to GTM_COMMON_THREAD_INFO + */ + GTM_COMMON_THREAD_INFO + + GTM_ThreadStatus thr_status; + GTM_ConnectionInfo *thr_conn; #ifndef __XLOG__ - GTM_Conn *standby; + GTM_Conn *standby; #endif - GTM_RWLock thr_lock; /* Used to protect standby connection when new GTM standby registered. */ - - gtm_List *thr_cached_txninfo; - GTM_SnapshotData thr_snapshot; - - /* fields for epoll. */ - int thr_efd; - bool thr_epoll_ok; + GTM_RWLock thr_lock; /* Used to protect standby connection when new GTM standby registered. */ + + gtm_List *thr_cached_txninfo; + GTM_SnapshotData thr_snapshot; + + /* fields for epoll. 
*/ + int thr_efd; + bool thr_epoll_ok; - /* fields for lock track. */ - GTM_RWLock **locks_hold; - int max_lock_number; - int current_number; + /* fields for lock track. */ + GTM_RWLock **locks_hold; + int max_lock_number; + int current_number; - GTM_TimerHandle backup_timer_handle; + GTM_TimerHandle backup_timer_handle; int insert_lock_id; int insert_try_lock_id; @@ -92,38 +92,39 @@ typedef struct GTM_ThreadInfo #ifdef __XLOG__ XLogRegisterBuff *register_buff; time_t last_sync_gts; /* copy of g_last_sync_gts used to detect gts sync timeout */ - GTM_RWLock **write_locks_hold; + GTM_RWLock **write_locks_hold; int *write_counters; - int current_write_number; - bool xlog_inserting; + int current_write_number; + bool xlog_inserting; - XLogWaiter xlog_waiter; + XLogWaiter xlog_waiter; bool handle_standby; #endif GTM_WorkerStatistics *stat_handle; /* statistics hanndle */ DataPumpBuf *datapump_buff; /* log collection buff */ + bool am_syslogger; } GTM_ThreadInfo; typedef struct GTM_Threads { - uint32 gt_thread_count; - uint32 gt_start_thread_count; - uint32 gt_array_size; + uint32 gt_thread_count; + uint32 gt_start_thread_count; + uint32 gt_array_size; #ifndef __XLOG__ - bool gt_standby_ready; + bool gt_standby_ready; #endif - GTM_ThreadInfo **gt_threads; - uint32 gt_starting_client_id; - uint32 gt_next_client_id; - uint32 gt_next_thread; - bool gt_block_new_connection; - GTM_RWLock gt_lock; + GTM_ThreadInfo **gt_threads; + uint32 gt_starting_client_id; + uint32 gt_next_client_id; + uint32 gt_next_thread; + bool gt_block_new_connection; + GTM_RWLock gt_lock; } GTM_Threads; extern GTM_Threads *GTMThreads; typedef struct GTM_RestoreContext { - int version; + int version; } GTM_RestoreContext; int GTM_ThreadAdd(GTM_ThreadInfo *thrinfo); @@ -137,78 +138,78 @@ void GTM_DoForAllOtherThreads(void (* process_routine)(GTM_ThreadInfo *)); void GTM_SetInitialAndNextClientIdentifierAtPromote(void); GTM_ThreadInfo *GTM_ThreadCreate( - void *(* startroutine)(void *), int32 max_lock); + void *(* startroutine)(void *), int32 max_lock); GTM_ThreadInfo * GTM_GetThreadInfo(GTM_ThreadID thrid); #ifdef XCP extern void SaveControlInfo(void); void GTM_RestoreSeqInfo(FILE *ctlf, struct GTM_RestoreContext *context); -#define CONTROL_INTERVAL 50000 +#define CONTROL_INTERVAL 50000 #endif extern void GTM_ConnCleanup(GTM_ConnectionInfo *conn); extern void GTM_RemoveConnection(GTM_ConnectionInfo *conn); #ifdef __TBASE__ -extern bool enable_gtm_sequence_debug; -extern bool enable_gtm_debug; +extern bool enable_gtm_sequence_debug; +extern bool enable_gtm_debug; extern bool enable_sync_commit; extern int warnning_time_cost; #endif /* * pthread keys to get thread specific information */ -extern pthread_key_t threadinfo_key; -extern MemoryContext TopMostMemoryContext; -extern GTM_ThreadID TopMostThreadID; - -#define SetMyThreadInfo(thrinfo) pthread_setspecific(threadinfo_key, (thrinfo)) -#define GetMyThreadInfo ((GTM_ThreadInfo *)pthread_getspecific(threadinfo_key)) -#define GetMyConnection(port) ((GTM_ConnectionInfo *)((port)->conn)) - -#define ThreadId (GetMyThreadInfo->thr_localid) -#define TopMemoryContext (GetMyThreadInfo->thr_thread_context) -#define ThreadTopContext (GetMyThreadInfo->thr_thread_context) -#define MessageContext (GetMyThreadInfo->thr_message_context) -#define CurrentMemoryContext (GetMyThreadInfo->thr_current_context) -#define ErrorContext (GetMyThreadInfo->thr_error_context) -#define errordata (GetMyThreadInfo->thr_error_data) -#define recursion_depth (GetMyThreadInfo->thr_error_recursion_depth) -#define 
errordata_stack_depth (GetMyThreadInfo->thr_error_stack_depth) -#define CritSectionCount (GetMyThreadInfo->thr_criticalsec_count) - -#define PG_exception_stack (GetMyThreadInfo->thr_sigjmp_buf) -#define MyConnection (GetMyThreadInfo->thr_conn) -#define MyPort ((GetMyThreadInfo->thr_conn != NULL) ? \ - GetMyThreadInfo->thr_conn->con_port : \ - NULL) -#define MyThreadID (GetMyThreadInfo->thr_id) -#define IsMainThread() (GetMyThreadInfo->thr_id == TopMostThreadID) - -#define GTM_CachedTransInfo (GetMyThreadInfo->thr_cached_txninfo) -#define GTM_HaveFreeCachedTransInfo() (gtm_list_length(GTM_CachedTransInfo)) - -#define GTM_MAX_CACHED_TRANSINFO 0 -#define GTM_HaveEnoughCachedTransInfo() (gtm_list_length(GTM_CachedTransInfo) >= GTM_MAX_CACHED_TRANSINFO) +extern pthread_key_t threadinfo_key; +extern MemoryContext TopMostMemoryContext; +extern GTM_ThreadID TopMostThreadID; + +#define SetMyThreadInfo(thrinfo) pthread_setspecific(threadinfo_key, (thrinfo)) +#define GetMyThreadInfo ((GTM_ThreadInfo *)pthread_getspecific(threadinfo_key)) +#define GetMyConnection(port) ((GTM_ConnectionInfo *)((port)->conn)) + +#define ThreadId (GetMyThreadInfo->thr_localid) +#define TopMemoryContext (GetMyThreadInfo->thr_thread_context) +#define ThreadTopContext (GetMyThreadInfo->thr_thread_context) +#define MessageContext (GetMyThreadInfo->thr_message_context) +#define CurrentMemoryContext (GetMyThreadInfo->thr_current_context) +#define ErrorContext (GetMyThreadInfo->thr_error_context) +#define errordata (GetMyThreadInfo->thr_error_data) +#define recursion_depth (GetMyThreadInfo->thr_error_recursion_depth) +#define errordata_stack_depth (GetMyThreadInfo->thr_error_stack_depth) +#define CritSectionCount (GetMyThreadInfo->thr_criticalsec_count) + +#define PG_exception_stack (GetMyThreadInfo->thr_sigjmp_buf) +#define MyConnection (GetMyThreadInfo->thr_conn) +#define MyPort ((GetMyThreadInfo->thr_conn != NULL) ? \ + GetMyThreadInfo->thr_conn->con_port : \ + NULL) +#define MyThreadID (GetMyThreadInfo->thr_id) +#define IsMainThread() (GetMyThreadInfo->thr_id == TopMostThreadID) + +#define GTM_CachedTransInfo (GetMyThreadInfo->thr_cached_txninfo) +#define GTM_HaveFreeCachedTransInfo() (gtm_list_length(GTM_CachedTransInfo)) + +#define GTM_MAX_CACHED_TRANSINFO 0 +#define GTM_HaveEnoughCachedTransInfo() (gtm_list_length(GTM_CachedTransInfo) >= GTM_MAX_CACHED_TRANSINFO) #define START_CRIT_SECTION() (CritSectionCount++) #define END_CRIT_SECTION() \ - do { \ - Assert(CritSectionCount > 0); \ - CritSectionCount--; \ - } while(0) - - -#define GTM_CLIENT_ID_EQ(a, b) \ - ((a) == (b)) -#define GTM_CLIENT_ID_LT(a, b) \ - (((int32)((a) - (b)) < 0) ? true : false) -#define GTM_CLIENT_ID_GT(a, b) \ - (!GTM_CLIENT_ID_LT(a, b) && !GTM_CLIENT_ID_EQ(a, b)) -#define GTM_CLIENT_ID_NEXT(a) \ - ((((a) + 1) == UINT32_MAX) ? 1 : ((a) + 1)) - -#define GTM_CONTROL_FILE "gtm.control" + do { \ + Assert(CritSectionCount > 0); \ + CritSectionCount--; \ + } while(0) + + +#define GTM_CLIENT_ID_EQ(a, b) \ + ((a) == (b)) +#define GTM_CLIENT_ID_LT(a, b) \ + (((int32)((a) - (b)) < 0) ? true : false) +#define GTM_CLIENT_ID_GT(a, b) \ + (!GTM_CLIENT_ID_LT(a, b) && !GTM_CLIENT_ID_EQ(a, b)) +#define GTM_CLIENT_ID_NEXT(a) \ + ((((a) + 1) == UINT32_MAX) ? 
1 : ((a) + 1)) + +#define GTM_CONTROL_FILE "gtm.control" #define GTM_CONTROL_FILE_TMP "gtm.control.tmp" -#define GTM_CONTROL_VERSION 20180716 +#define GTM_CONTROL_VERSION 20180716 #endif diff --git a/src/include/gtm/gtm_c.h b/src/include/gtm/gtm_c.h index b5257f1a..ad0be27e 100644 --- a/src/include/gtm/gtm_c.h +++ b/src/include/gtm/gtm_c.h @@ -385,6 +385,8 @@ typedef enum GTMStorageStatus_status_butty }GTMStorageCheckStatus; +typedef int64 pg_time_t; + /* * Add delta 100s by assuming 10 timstamp/us * at which rate the GTM can provide 1000w/s throughput. diff --git a/src/include/gtm/gtm_opt.h b/src/include/gtm/gtm_opt.h index 8bd33409..c0f202d6 100644 --- a/src/include/gtm/gtm_opt.h +++ b/src/include/gtm/gtm_opt.h @@ -341,6 +341,10 @@ const char *const config_type_names[] =\ #define GTM_OPTNAME_KEEPALIVES_COUNT "keepalives_count" #define GTM_OPTNAME_LISTEN_ADDRESSES "listen_addresses" #define GTM_OPTNAME_LOG_FILE "log_file" +#define GTM_OPTNAME_LOG_FILENAME_PATTERN "log_filename_pattern" +#define GTM_OPTNAME_LOG_ROTATION_AGE "log_rotation_age" +#define GTM_OPTNAME_LOG_ROTATION_SIZE "log_rotation_size" +#define GTM_OPTNAME_LOG_TRUNCATE_ON_ROTATION "log_truncate_on_rotation" #define GTM_OPTNAME_LOG_MIN_MESSAGES "log_min_messages" #define GTM_OPTNAME_NODENAME "nodename" #define GTM_OPTNAME_PORT "port" diff --git a/src/include/gtm/gtm_time.h b/src/include/gtm/gtm_time.h index 71c763df..afeb1abd 100644 --- a/src/include/gtm/gtm_time.h +++ b/src/include/gtm/gtm_time.h @@ -16,21 +16,22 @@ #define GTM_TIME_H /* Julian-date equivalents of Day 0 in Unix and GTM reckoning */ -#define UNIX_EPOCH_JDATE 2440588 /* == date2j(1970, 1, 1) */ -#define GTM_EPOCH_JDATE 2451545 /* == date2j(2000, 1, 1) */ +#define UNIX_EPOCH_JDATE 2440588 /* == date2j(1970, 1, 1) */ +#define GTM_EPOCH_JDATE 2451545 /* == date2j(2000, 1, 1) */ -#define SECS_PER_YEAR (36525 * 864) /* avoid floating-point computation */ -#define SECS_PER_DAY 86400 -#define SECS_PER_HOUR 3600 -#define SECS_PER_MINUTE 60 -#define MINS_PER_HOUR 60 +#define SECS_PER_YEAR (36525 * 864) /* avoid floating-point computation */ +#define SECS_PER_DAY 86400 +#define SECS_PER_HOUR 3600 +#define SECS_PER_MINUTE 60 +#define MINS_PER_HOUR 60 +#define HOURS_PER_DAY 24 #ifdef HAVE_INT64_TIMESTAMP -#define USECS_PER_DAY INT64CONST(86400000000) -#define USECS_PER_HOUR INT64CONST(3600000000) -#define USECS_PER_MINUTE INT64CONST(60000000) -#define USECS_PER_SEC INT64CONST(1000000) -#define NSECS_PER_SEC INT64CONST(1000000000) +#define USECS_PER_DAY INT64CONST(86400000000) +#define USECS_PER_HOUR INT64CONST(3600000000) +#define USECS_PER_MINUTE INT64CONST(60000000) +#define USECS_PER_SEC INT64CONST(1000000) +#define NSECS_PER_SEC INT64CONST(1000000000) #endif @@ -43,9 +44,9 @@ extern GlobalTimestamp GTM_TimestampGetMonotonicRawPrecise(GlobalTimestamp *tv_sec, GlobalTimestamp *tv_nsec); void GTM_TimestampDifference(GTM_Timestamp start_time, GTM_Timestamp stop_time, - long *secs, int *microsecs); + long *secs, int *microsecs); bool GTM_TimestampDifferenceExceeds(GTM_Timestamp start_time, - GTM_Timestamp stop_time, - int msec); + GTM_Timestamp stop_time, + int msec); #endif diff --git a/src/include/gtm/syslogger.h b/src/include/gtm/syslogger.h new file mode 100644 index 00000000..3ebd66a0 --- /dev/null +++ b/src/include/gtm/syslogger.h @@ -0,0 +1,98 @@ +/*------------------------------------------------------------------------- + * + * syslogger.h + * Exports from gtm/syslogger.c. 
+ * + * Copyright (c) 2021-Present TBase development team, Tencent + * + * src/include/gtm/syslogger.h + * + *------------------------------------------------------------------------- + */ +#ifndef _SYSLOGGER_H +#define _SYSLOGGER_H + +#include /* for PIPE_BUF */ + + +/* + * Primitive protocol structure for writing to syslogger pipe(s). The idea + * here is to divide long messages into chunks that are not more than + * PIPE_BUF bytes long, which according to POSIX spec must be written into + * the pipe atomically. The pipe reader then uses the protocol headers to + * reassemble the parts of a message into a single string. The reader can + * also cope with non-protocol data coming down the pipe, though we cannot + * guarantee long strings won't get split apart. + * + * We use non-nul bytes in is_last to make the protocol a tiny bit + * more robust against finding a false double nul byte prologue. But + * we still might find it in the len and/or pid bytes unless we're careful. + */ + +#ifdef PIPE_BUF +/* Are there any systems with PIPE_BUF > 64K? Unlikely, but ... */ +#if PIPE_BUF > 65536 +#define PIPE_CHUNK_SIZE 65536 +#else +#define PIPE_CHUNK_SIZE ((int) PIPE_BUF) +#endif +#else /* not defined */ +/* POSIX says the value of PIPE_BUF must be at least 512, so use that */ +#define PIPE_CHUNK_SIZE 512 +#endif + +/* + * We read() into a temp buffer twice as big as a chunk, so that any fragment + * left after processing can be moved down to the front and we'll still have + * room to read a full chunk. + */ +#define READ_BUF_SIZE (2 * PIPE_CHUNK_SIZE) + +typedef struct +{ + char nuls[2]; /* always \0\0 */ + uint16 len; /* size of this chunk (counts data only) */ + int32 pid; /* writer's pid */ + char is_last; /* last chunk of message? 't' or 'f' ('T' or + * 'F' for CSV case) */ + char data[FLEXIBLE_ARRAY_MEMBER]; /* data payload starts here */ +} PipeProtoHeader; + +typedef union +{ + PipeProtoHeader proto; + char filler[PIPE_CHUNK_SIZE]; +} PipeProtoChunk; + +#define PIPE_HEADER_SIZE offsetof(PipeProtoHeader, data) +#define PIPE_MAX_PAYLOAD ((int) (PIPE_CHUNK_SIZE - PIPE_HEADER_SIZE)) + + +/* GUC options */ +extern bool Logging_collector; +extern int Log_RotationAge; +extern int Log_RotationSize; +extern char *Log_directory; +extern char *Log_filename; +extern bool Log_truncate_on_rotation; +extern int Log_file_mode; + +extern int syslogPipe[2]; +extern int signalPipe[2]; +extern bool rotation_disabled; +extern pg_time_t next_rotation_time; +extern pg_time_t first_syslogger_file_time; +extern FILE *gtmlogFile; +extern bool rotation_requested; + +extern int SysLogger_Start(void); +extern void logfile_rotate(bool time_based_rotation, int size_rotation_for); +extern void write_syslogger_file(const char *buffer, int count, int dest); +extern void set_next_rotation_time(void); +extern void process_pipe_input(char *logbuffer, int *bytes_in_logbuffer, bool *pipe_eof_seen); +extern void flush_pipe_input(char *logbuffer, int *bytes_in_logbuffer); +extern void GTM_LogFileInit(void); +extern void GTM_SendNotifyByte(void); +extern void GTM_drainNotifyBytes(void); +extern int GTM_InitSysloggerEpoll(void); +#endif /* _SYSLOGGER_H */ From 2fd1374fddcdbd200464f00b9a689337631e7b2e Mon Sep 17 00:00:00 2001 From: youngxie Date: Wed, 7 Apr 2021 19:47:21 +0800 Subject: [PATCH 352/578] Improve performance with type converter function when insert with multi values. (merge request !267) (cherry picked from commit f223a4f5) 8ba3cbdc Improve performance with type converter function when insert with multi values. 
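(Presumably, then, a multi-row INSERT whose VALUES items are not bare
Params but merely wrap constants in a type-conversion function can now
stay on the fast multi-values path, because such expressions pass the
pgxc_is_expr_shippable() check added below; the exact qualifying cases
are an assumption here, not spelled out in the change.)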
http://tapd.oa.com/10092131/bugtrace/bugs/view?bug_id=1010092131086323123&url_cache_key=d4e1402777dc733479aac463ad1a9d24 --- src/backend/parser/analyze.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/backend/parser/analyze.c b/src/backend/parser/analyze.c index d0d5c909..a5a5e96b 100644 --- a/src/backend/parser/analyze.c +++ b/src/backend/parser/analyze.c @@ -1146,7 +1146,7 @@ transformInsertStmt(ParseState *pstate, InsertStmt *stmt) if (IsExtendedQuery() && qry->isMultiValues && !qry->hasUnshippableTriggers) { /* - * simple insert if all values are params + * simple insert if all values are params or can be pushed down * * if not simple insert, do not transform insert into to copy from */ @@ -1154,7 +1154,8 @@ transformInsertStmt(ParseState *pstate, InsertStmt *stmt) foreach(cell, sublist) { Node *node = (Node *)lfirst(cell); - if (!IsA(node, Param)) + if (!IsA(node, Param) && + !pgxc_is_expr_shippable(node, NULL)) { qry->isMultiValues = false; break; From da91cb2bd70091ed7d40ba9a9631e5113b898d45 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Mon, 12 Apr 2021 10:47:37 +0800 Subject: [PATCH 353/578] fix warning --- src/gtm/common/elog.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gtm/common/elog.c b/src/gtm/common/elog.c index a6c6c66d..597a252e 100644 --- a/src/gtm/common/elog.c +++ b/src/gtm/common/elog.c @@ -290,6 +290,7 @@ errfinish(int dummy,...) {// #lizard forgives ErrorData *edata = &errordata[errordata_stack_depth]; int elevel = edata->elevel; + GTM_ThreadInfo *thrinfo = GetMyThreadInfo; MemoryContext oldcontext; recursion_depth++; @@ -324,7 +325,6 @@ errfinish(int dummy,...) } /* Emit the message to the right places */ - GTM_ThreadInfo *thrinfo = GetMyThreadInfo; if(thrinfo->thr_conn) { EmitErrorReport(thrinfo->thr_conn->con_port); From a172b03288d43fcb7b8d82c52a4561b45f1641fd Mon Sep 17 00:00:00 2001 From: sigmalin Date: Fri, 16 Apr 2021 15:21:08 +0800 Subject: [PATCH 354/578] for http://tapd.oa.com/pgxz/prong/stories/view/1010092131863532547 (merge request !277) --- contrib/pg_unlock/pg_unlock.c | 75 ++++++++++++++++++++++++++--------- 1 file changed, 57 insertions(+), 18 deletions(-) diff --git a/contrib/pg_unlock/pg_unlock.c b/contrib/pg_unlock/pg_unlock.c index f070a071..50401a9f 100644 --- a/contrib/pg_unlock/pg_unlock.c +++ b/contrib/pg_unlock/pg_unlock.c @@ -41,6 +41,7 @@ PG_MODULE_MAGIC; #define MAX_RELNAME 64 #define MAX_MODE 30 #define MAX_DEADLOCK 10000 +#define MAX_DEADLOCK_CHECKLOOP (10) /*macros about space allocation and release*/ #define INIT(x)\ @@ -461,6 +462,11 @@ pg_unlock_execute(PG_FUNCTION_ARGS) if (Partxns->Ptxns[Partxns->Ptxns_count].txn_count > 0) { Partxns->Ptxns_count++; + if (Partxns->Ptxns_count >= MAX_DEADLOCK_CHECKLOOP) + { + /* avoid deadlock all the time */ + break; + } } DropAlldeadlocks(); DropAlltransactions(); @@ -1029,6 +1035,46 @@ void GetAllTransInfo(void) } } +/* + * BinarySearchGid -- Binary search gid in pgxc_transaction + * input: gid + * return: gid pos or insert pos, was gid found + */ +static int +BinarySearchGid(char *gid, bool *found) +{ + int low = 0; + int high = pgxc_transaction_count - 1; + int mid = 0; + int cmp_result = 0; + *found = false; + + while (low <= high) + { + mid = (low + high) / 2; + cmp_result = strcmp(gid, pgxc_transaction[mid].gid); + if (cmp_result == 0) + { + /* gid == pgxc_transaction[mid].gid */ + *found = true; + return mid; + } + else if (cmp_result > 0) + { + /* gid > pgxc_transaction[mid].gid */ + low = mid + 1; + } + else + { + /* gid < 
pgxc_transaction[mid].gid */ + high = mid - 1; + } + } + + /* return insert pos */ + return high + 1; +} + /* * LoadTransaction -- get transactions from certain node and stores them in pgxc_transaction * input: node oid @@ -1037,7 +1083,7 @@ void GetAllTransInfo(void) void LoadTransaction(Oid node) { const char *query_stmt = "select a1.pid::text, a1.locktype::text, a2.datname::text, a2.relname::text, " - "a1.page::text, a1.tuple::text, a1.mode::text, a1.granted::text, a1.transactionid::text, a3.query::text " + "a1.page::text, a1.tuple::text, a1.mode::text, a1.granted::text, a1.transactionid::text, a3.query::text, pg_findgxid(a1.pid::int)::text " "from (select locktype::text, database, relation, page::text, " "tuple::text, mode::text, granted::text, pid::text, transactionid::text " "from pg_locks where (locktype = 'relation' or locktype = 'page' or locktype = 'tuple' or locktype = 'transactionid')" @@ -1070,9 +1116,10 @@ void LoadTransaction(Oid node) char *gid = NULL; int nodeid = 0; lockinfo templock; - + bool found = false; + sprintf(query_txnid, query_stmt, MyProcPid); - execute_on_single_node(node, query_txnid, 10, &result_txnid); + execute_on_single_node(node, query_txnid, 11, &result_txnid); if (result_txnid.slot == NULL) { elog(DEBUG1, "pg_unlock: there is no transaction on node %s", get_pgxc_nodename(node)); @@ -1084,37 +1131,29 @@ void LoadTransaction(Oid node) { pid = strtoul(TTSgetvalue(&result_txnid, i, 0), NULL, 10); /*get global xid of pid on node*/ - gid = GetGxid(node, pid); - /*select for update apply for transactionid without global xid*/ + gid = TTSgetvalue(&result_txnid, i, 10); + /*select for update apply for transactionid without global xid*/ if (gid == NULL) { continue; } /*check whether the gid is already existed*/ - for (i_txn = 0; i_txn < pgxc_transaction_count; i_txn++) - { - if (strcmp(gid, pgxc_transaction[i_txn].gid) == 0) - { - break; - } - } - + i_txn = BinarySearchGid(gid, &found); /*insert this new transaction when gid is not find in pgxc_transaction*/ - if (i_txn >= pgxc_transaction_count) + if (!found) { RPALLOC(pgxc_transaction); - InitTransaction(pgxc_transaction_count); - memcpy(pgxc_transaction[pgxc_transaction_count].gid, gid, sizeof(char) * MAX_GID); + memmove(&pgxc_transaction[i_txn + 1], &pgxc_transaction[i_txn], (pgxc_transaction_count - i_txn) * sizeof(transaction)); + InitTransaction(i_txn); + memcpy(pgxc_transaction[i_txn].gid, gid, sizeof(char) * MAX_GID); pgxc_transaction_count++; - i_txn = pgxc_transaction_count-1; } add_pid_node(i_txn, pid, node); ptr = strtok(gid, ":"); nodeid = atoi(ptr); pgxc_transaction[i_txn].initiator = get_nodeoid_from_nodeid(nodeid, PGXC_NODE_COORDINATOR); //pgxc_transaction[i_txn].initiator = get_pgxc_nodeoid(ptr); - pfree(gid); /*read lockinfo from result_txnid*/ templock.m_pid = pid; From 1782234fe80950b3671c4c1bfd4df8b8e5b0ac36 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Fri, 23 Apr 2021 14:28:05 +0800 Subject: [PATCH 355/578] Fix gtm backup node memory leak problem during xlog redo (merge request !289) http://tapd.oa.com/TBase_C/bugtrace/bugs/view?bug_id=1020385652086953173 --- src/gtm/client/fe-connect.c | 72 +-------- src/gtm/client/fe-protocol.c | 239 +++++++++++++++++++++------ src/gtm/gtm_ctl/gtm_ctl.c | 2 + src/include/gtm/gtm_client.h | 281 ++++++++++++++++---------------- src/include/gtm/gtm_msg.h | 305 ++++++++++++++++++----------------- 5 files changed, 491 insertions(+), 408 deletions(-) diff --git a/src/gtm/client/fe-connect.c b/src/gtm/client/fe-connect.c index 080af583..4a46c9b8 100644 --- 
a/src/gtm/client/fe-connect.c +++ b/src/gtm/client/fe-connect.c @@ -952,75 +952,9 @@ freeGTM_Conn(GTM_Conn *conn) if (conn->result->gr_snapshot.sn_xip) free(conn->result->gr_snapshot.sn_xip); - /* Depending on result type there could be allocated data */ - switch (conn->result->gr_type) - { - case SEQUENCE_INIT_RESULT: - case SEQUENCE_RESET_RESULT: - case SEQUENCE_CLOSE_RESULT: - case SEQUENCE_RENAME_RESULT: - case SEQUENCE_ALTER_RESULT: - case SEQUENCE_SET_VAL_RESULT: - case MSG_DB_SEQUENCE_RENAME_RESULT: - if (conn->result->gr_resdata.grd_seqkey.gsk_key) - free(conn->result->gr_resdata.grd_seqkey.gsk_key); - break; - - case SEQUENCE_GET_NEXT_RESULT: - case SEQUENCE_GET_LAST_RESULT: - if (conn->result->gr_resdata.grd_seq.seqkey.gsk_key) - free(conn->result->gr_resdata.grd_seq.seqkey.gsk_key); - break; - - default: - break; - } - - -#ifdef __TBASE__ - if (conn->result->grd_storage_data.len && conn->result->grd_storage_data.data) - { - free(conn->result->grd_storage_data.data); - conn->result->grd_storage_data.data = NULL; - conn->result->grd_storage_data.len = 0; - } - - if (conn->result->grd_store_seq.count && conn->result->grd_store_seq.seqs) - { - free(conn->result->grd_store_seq.seqs); - conn->result->grd_store_seq.seqs = NULL; - conn->result->grd_store_seq.count = 0; - } - - if (conn->result->grd_store_txn.count && conn->result->grd_store_txn.txns) - { - free(conn->result->grd_store_txn.txns); - conn->result->grd_store_txn.txns = NULL; - conn->result->grd_store_txn.count = 0; - } - - if (conn->result->grd_store_check_seq.count && conn->result->grd_store_check_seq.seqs) - { - free(conn->result->grd_store_check_seq.seqs); - conn->result->grd_store_check_seq.seqs = NULL; - conn->result->grd_store_check_seq.count = 0; - } - - if (conn->result->grd_store_check_txn.count && conn->result->grd_store_check_txn.txns) - { - free(conn->result->grd_store_check_txn.txns); - conn->result->grd_store_check_txn.txns = NULL; - conn->result->grd_store_check_txn.count = 0; - } - - if (conn->result->grd_errlog.len && conn->result->grd_errlog.errlog) - { - free(conn->result->grd_errlog.errlog); - conn->result->grd_errlog.errlog = NULL; - conn->result->grd_errlog.len = 0; - } - -#endif + /* release memory for one-time application */ + gtmpqFreeResultResource(conn->result); + free(conn->result); } #endif diff --git a/src/gtm/client/fe-protocol.c b/src/gtm/client/fe-protocol.c index d545384c..89bedf88 100644 --- a/src/gtm/client/fe-protocol.c +++ b/src/gtm/client/fe-protocol.c @@ -435,7 +435,7 @@ break; { int len = 0; int count = 0; - + memset(&result->gr_resdata.grd_gts, 0, sizeof(result->gr_resdata.grd_gts)); if (gtmpqGetnchar((char *) &result->gr_resdata.grd_gts.node_status, sizeof(int), conn)) @@ -713,14 +713,6 @@ result->gr_status = GTM_RESULT_ERROR; int data_len = 0; char *data_buf = NULL; - /* free result of last call */ - if (result->grd_storage_data.len && result->grd_storage_data.data) - { - free(result->grd_storage_data.data); - result->grd_storage_data.data = NULL; - result->grd_storage_data.len = 0; - } - #ifdef __XLOG__ /* get xlog start pos and timeline */ if (gtmpqGetInt64((int64 *)&result->grd_storage_data.start_pos, conn)) @@ -897,13 +889,6 @@ result->gr_status = GTM_RESULT_ERROR; case MSG_LIST_GTM_STORE_SEQ_RESULT: /* List gtm running sequence info */ { - if (conn->result->grd_store_seq.count && conn->result->grd_store_seq.seqs) - { - free(conn->result->grd_store_seq.seqs); - conn->result->grd_store_seq.seqs = NULL; - conn->result->grd_store_seq.count = 0; - } - if 
(gtmpqGetInt(&conn->result->grd_store_seq.count, sizeof(int32), conn)) { @@ -927,13 +912,6 @@ result->gr_status = GTM_RESULT_ERROR; case MSG_LIST_GTM_TXN_STORE_RESULT: /* List gtm running sequence info */ { - if (conn->result->grd_store_txn.count && conn->result->grd_store_txn.txns) - { - free(conn->result->grd_store_txn.txns); - conn->result->grd_store_txn.txns = NULL; - conn->result->grd_store_txn.count = 0; - } - if (gtmpqGetInt(&conn->result->grd_store_txn.count, sizeof(int32), conn)) { @@ -959,13 +937,6 @@ result->gr_status = GTM_RESULT_ERROR; case MSG_CHECK_GTM_SEQ_STORE_RESULT: /* Check gtm sequence valid info */ { - if (conn->result->grd_store_check_seq.count && conn->result->grd_store_check_seq.seqs) - { - free(conn->result->grd_store_check_seq.seqs); - conn->result->grd_store_check_seq.seqs = NULL; - conn->result->grd_store_check_seq.count = 0; - } - if (gtmpqGetInt(&conn->result->grd_store_check_seq.count, sizeof(int32), conn)) { @@ -990,13 +961,6 @@ result->gr_status = GTM_RESULT_ERROR; case MSG_CHECK_GTM_TXN_STORE_RESULT: /* Check gtm transaction usage info */ { - if (conn->result->grd_store_check_txn.count && conn->result->grd_store_check_txn.txns) - { - free(conn->result->grd_store_check_txn.txns); - conn->result->grd_store_check_txn.txns = NULL; - conn->result->grd_store_check_txn.count = 0; - } - if (gtmpqGetInt(&conn->result->grd_store_check_txn.count, sizeof(int32), conn)) { @@ -1167,7 +1131,7 @@ result->gr_status = GTM_RESULT_ERROR; } if (result->gr_resdata.grd_txn_get_gid_data.nodelen != 0) { - /* Do necessary allocation */ + /* Do necessary allocation, free outside */ result->gr_resdata.grd_txn_get_gid_data.nodestring = (char *) malloc(sizeof(char *) * result->gr_resdata.grd_txn_get_gid_data.nodelen + 1); if (result->gr_resdata.grd_txn_get_gid_data.nodestring == NULL) @@ -1268,6 +1232,8 @@ result->gr_status = GTM_RESULT_ERROR; char *buf = NULL; int buf_size = 8192; + memset(result->gr_resdata.grd_node_list.nodeinfo, 0, sizeof(result->gr_resdata.grd_node_list.nodeinfo)); + if (gtmpqGetInt(&result->gr_resdata.grd_node_list.num_node, sizeof(int32), conn)) { result->gr_status = GTM_RESULT_ERROR; @@ -1286,6 +1252,7 @@ result->gr_status = GTM_RESULT_ERROR; { int size; GTM_PGXCNodeInfo *data = (GTM_PGXCNodeInfo *) malloc(sizeof(GTM_PGXCNodeInfo)); + memset(data, 0, sizeof(GTM_PGXCNodeInfo)); if (gtmpqGetInt(&size, sizeof(int32), conn)) { @@ -1316,6 +1283,26 @@ result->gr_status = GTM_RESULT_ERROR; if (!gtm_deserialize_pgxcnodeinfo(data, buf, size, &conn->errorMessage)) { result->gr_status = GTM_RESULT_ERROR; + if (data->nodename) + { + genFree(data->nodename); + } + if (data->proxyname) + { + genFree(data->proxyname); + } + if (data->ipaddress) + { + genFree(data->ipaddress); + } + if (data->datafolder) + { + genFree(data->datafolder); + } + if (data->sessions) + { + genFree(data->sessions); + } free(data); break; } @@ -1392,6 +1379,8 @@ result->gr_status = GTM_RESULT_ERROR; int offset = 0; int pack_size = 0; int i = 0; + result->gr_resdata.grd_xlog_data.length = 0; + result->gr_resdata.grd_xlog_data.xlog_data = NULL; if (gtmpqGetInt64((int64 *)&result->gr_resdata.grd_xlog_data.flush, conn)) { @@ -1497,17 +1486,14 @@ gtmpqReadSeqKey(GTM_SequenceKey seqkey, GTM_Conn *conn) return 0; } +/* + * release the one-time-applied memory. if the memory design is reused, + * please release it last in freeGTM_Conn + */ void -gtmpqFreeResultData(GTM_Result *result, GTM_PGXCNodeType remote_type) -{// #lizard forgives - /* - * If we are running as a GTM proxy, we don't have anything to do. 
This may - * change though as we add more message types below and some of them may - * need cleanup even at the proxy level - */ - if (remote_type == GTM_NODE_GTM_PROXY) - return; - +gtmpqFreeResultResource(GTM_Result *result) +{ + int i = 0; switch (result->gr_type) { case SEQUENCE_INIT_RESULT: @@ -1546,8 +1532,165 @@ gtmpqFreeResultData(GTM_Result *result, GTM_PGXCNodeType remote_type) * again shortly */ break; + case NODE_UNREGISTER_RESULT: + case NODE_REGISTER_RESULT: + if (result->gr_resdata.grd_node.node_name) + { + free(result->gr_resdata.grd_node.node_name); + result->gr_resdata.grd_node.node_name = NULL; + } + break; + case NODE_LIST_RESULT: + if (result->gr_resdata.grd_node_list.num_node) + { + for (i = 0; i < result->gr_resdata.grd_node_list.num_node; i++) + { + if (result->gr_resdata.grd_node_list.nodeinfo[i]) + { + GTM_PGXCNodeInfo *data = result->gr_resdata.grd_node_list.nodeinfo[i]; + if (data->nodename) + { + genFree(data->nodename); + data->nodename = NULL; + } + if (data->proxyname) + { + genFree(data->proxyname); + data->proxyname = NULL; + } + if (data->ipaddress) + { + genFree(data->ipaddress); + data->ipaddress = NULL; + } + if (data->datafolder) + { + genFree(data->datafolder); + data->datafolder = NULL; + } + if (data->sessions) + { + genFree(data->sessions); + data->sessions = NULL; + } + free(result->gr_resdata.grd_node_list.nodeinfo[i]); + result->gr_resdata.grd_node_list.nodeinfo[i] = NULL; + } + } + result->gr_resdata.grd_node_list.num_node = 0; + } + break; +#ifdef __XLOG__ + case MSG_REPLICATION_CONTENT: + if (result->gr_resdata.grd_xlog_data.length && result->gr_resdata.grd_xlog_data.xlog_data) + { + free(result->gr_resdata.grd_xlog_data.xlog_data); + result->gr_resdata.grd_xlog_data.xlog_data = NULL; + result->gr_resdata.grd_xlog_data.length = 0; + } + break; +#endif +#ifdef __TBASE__ + case TXN_CHECK_GTM_STATUS_RESULT: + if (result->gr_resdata.grd_gts.standby_count > 0 && + result->gr_resdata.grd_gts.standby_count <= GTM_MAX_WALSENDER) + { + if (result->gr_resdata.grd_gts.slave_is_sync) + { + free(result->gr_resdata.grd_gts.slave_is_sync); + result->gr_resdata.grd_gts.slave_is_sync = NULL; + } + if (result->gr_resdata.grd_gts.slave_timestamp) + { + free(result->gr_resdata.grd_gts.slave_timestamp); + result->gr_resdata.grd_gts.slave_timestamp = NULL; + } + + if (result->gr_resdata.grd_gts.slave_flush_ptr) + { + free(result->gr_resdata.grd_gts.slave_flush_ptr); + result->gr_resdata.grd_gts.slave_flush_ptr = NULL; + } + + for (i = 0; i < result->gr_resdata.grd_gts.standby_count; i++) + { + if (result->gr_resdata.grd_gts.application_name[i]) + { + free(result->gr_resdata.grd_gts.application_name[i]); + result->gr_resdata.grd_gts.application_name[i] = NULL; + } + } + + result->gr_resdata.grd_gts.standby_count = 0; + } + break; + case MSG_GET_GTM_ERRORLOG_RESULT: + if (result->grd_errlog.len && result->grd_errlog.errlog) + { + free(result->grd_errlog.errlog); + result->grd_errlog.errlog = NULL; + result->grd_errlog.len = 0; + } + break; + case STORAGE_TRANSFER_RESULT: + /* free result of last call */ + if (result->grd_storage_data.len && result->grd_storage_data.data) + { + free(result->grd_storage_data.data); + result->grd_storage_data.data = NULL; + result->grd_storage_data.len = 0; + } + break; + case MSG_LIST_GTM_STORE_SEQ_RESULT: + if (result->grd_store_seq.count && result->grd_store_seq.seqs) + { + free(result->grd_store_seq.seqs); + result->grd_store_seq.seqs = NULL; + result->grd_store_seq.count = 0; + } + break; + case MSG_LIST_GTM_TXN_STORE_RESULT: + 
if (result->grd_store_txn.count && result->grd_store_txn.txns) + { + free(result->grd_store_txn.txns); + result->grd_store_txn.txns = NULL; + result->grd_store_txn.count = 0; + } + break; + case MSG_CHECK_GTM_SEQ_STORE_RESULT: + if (result->grd_store_check_seq.count && result->grd_store_check_seq.seqs) + { + free(result->grd_store_check_seq.seqs); + result->grd_store_check_seq.seqs = NULL; + result->grd_store_check_seq.count = 0; + } + break; + case MSG_CHECK_GTM_TXN_STORE_RESULT: + if (result->grd_store_check_txn.count && result->grd_store_check_txn.txns) + { + free(result->grd_store_check_txn.txns); + result->grd_store_check_txn.txns = NULL; + result->grd_store_check_txn.count = 0; + } + break; +#endif default: break; } } + +void +gtmpqFreeResultData(GTM_Result *result, GTM_PGXCNodeType remote_type) +{ + + /* + * If we are running as a GTM proxy, we don't have anything to do. This may + * change though as we add more message types below and some of them may + * need cleanup even at the proxy level + */ + if (remote_type == GTM_NODE_GTM_PROXY) + return; + + gtmpqFreeResultResource(result); +} diff --git a/src/gtm/gtm_ctl/gtm_ctl.c b/src/gtm/gtm_ctl/gtm_ctl.c index 2b34ad0d..9aeeb322 100644 --- a/src/gtm/gtm_ctl/gtm_ctl.c +++ b/src/gtm/gtm_ctl/gtm_ctl.c @@ -1050,6 +1050,8 @@ do_status(void) exit(1); } + if (gtm_conn) + disconnect_gtm(gtm_conn); return ; } diff --git a/src/include/gtm/gtm_client.h b/src/include/gtm/gtm_client.h index 2381286a..2ae03ff1 100644 --- a/src/include/gtm/gtm_client.h +++ b/src/include/gtm/gtm_client.h @@ -29,54 +29,54 @@ typedef union GTM_ResultData { - GTM_TransactionHandle grd_txnhandle; /* TXN_BEGIN */ + GTM_TransactionHandle grd_txnhandle; /* TXN_BEGIN */ - bool backup_result; /* BEGIN_BACKUP result */ - struct - { - GlobalTransactionId gxid; - GTM_Timestamp timestamp; - } grd_gxid_tp; /* TXN_BEGIN_GETGXID */ - + bool backup_result; /* BEGIN_BACKUP result */ + struct + { + GlobalTransactionId gxid; + GTM_Timestamp timestamp; + } grd_gxid_tp; /* TXN_BEGIN_GETGXID */ + - struct - { - GTM_Timestamp grd_gts; /* GETGTS or when CHECK_GTM GTS from primary GTM. */ - bool gtm_readonly; /* read only mode for gtm */ - int node_status; /* Master or Slave, 0:master, 1 slave */ + struct + { + GTM_Timestamp grd_gts; /* GETGTS or when CHECK_GTM GTS from primary GTM. */ + bool gtm_readonly; /* read only mode for gtm */ + int node_status; /* Master or Slave, 0:master, 1 slave */ #ifndef __XLOG__ - GTM_Timestamp grd_gts_standby; /* CHECK_GTM, GTS from standby. */ - char standbyhost[MAX_HOSTADDR_LEN]; - char standbyport[MAX_PORT_LEN]; + GTM_Timestamp grd_gts_standby; /* CHECK_GTM, GTS from standby. 
*/ + char standbyhost[MAX_HOSTADDR_LEN]; + char standbyport[MAX_PORT_LEN]; #else XLogRecPtr master_flush; int standby_count; - int *slave_is_sync; + int *slave_is_sync; char *application_name[GTM_MAX_WALSENDER]; - XLogRecPtr *slave_flush_ptr; - GTM_Timestamp *slave_timestamp; + XLogRecPtr *slave_flush_ptr; + GTM_Timestamp *slave_timestamp; #endif - }grd_gts; + }grd_gts; #ifdef __XLOG__ - struct - { - XLogRecPtr flush; - XLogRecPtr write; - XLogRecPtr apply; - } grd_replication; + struct + { + XLogRecPtr flush; + XLogRecPtr write; + XLogRecPtr apply; + } grd_replication; - struct - { - XLogRecPtr pos; - int length; - char* xlog_data; - int reply; - XLogRecPtr flush; - } grd_xlog_data; - + struct + { + XLogRecPtr pos; + int length; + char* xlog_data; + int reply; + XLogRecPtr flush; + } grd_xlog_data; + #endif GlobalTransactionId grd_gxid; /* TXN_PREPARE @@ -198,20 +198,20 @@ typedef union GTM_ResultData typedef struct GTM_Result { - GTM_ResultType gr_type; - int gr_msglen; - int gr_status; - GTM_ProxyMsgHeader gr_proxyhdr; - GTM_ResultData gr_resdata; - -#ifdef __TBASE__ - struct - { - int32 len; - char *data; + GTM_ResultType gr_type; + int gr_msglen; + int gr_status; + GTM_ProxyMsgHeader gr_proxyhdr; + GTM_ResultData gr_resdata; + +#ifdef __TBASE__ + struct + { + int32 len; + char *data; #ifdef __XLOG__ - XLogRecPtr start_pos; - TimeLineID time_line; + XLogRecPtr start_pos; + TimeLineID time_line; #endif } grd_storage_data; /* STORAGE_TRANSFER_RESULT */ int gr_finish_status; /* TXN_FINISH_GID_RESULT result */ @@ -249,24 +249,24 @@ typedef struct GTM_Result } grd_errlog; #endif - /* - * We keep these two items outside the union to avoid repeated malloc/free - * of the xip array. If these items are pushed inside the union, they may - * get overwritten by other members in the union - */ - int gr_xip_size; - GTM_SnapshotData gr_snapshot; - - /* - * Similarly, keep the buffer for proxying data outside the union - */ - char *gr_proxy_data; - int gr_proxy_datalen; + /* + * We keep these two items outside the union to avoid repeated malloc/free + * of the xip array. If these items are pushed inside the union, they may + * get overwritten by other members in the union + */ + int gr_xip_size; + GTM_SnapshotData gr_snapshot; + + /* + * Similarly, keep the buffer for proxying data outside the union + */ + char *gr_proxy_data; + int gr_proxy_datalen; } GTM_Result; typedef struct Get_GTS_Result { - GTM_Timestamp gts; /* GETGTS or when CHECK_GTM GTS from primary GTM. */ + GTM_Timestamp gts; /* GETGTS or when CHECK_GTM GTS from primary GTM. 
*/ bool gtm_readonly; /* read only mode for gtm */ } Get_GTS_Result; @@ -288,19 +288,19 @@ size_t get_sequence_list(GTM_Conn *, GTM_SeqInfo **); * Transaction Management API */ GlobalTransactionId begin_transaction(GTM_Conn *conn, GTM_IsolationLevel isolevel, - const char *global_sessionid, - GTM_Timestamp *timestamp); + const char *global_sessionid, + GTM_Timestamp *timestamp); int bkup_begin_transaction(GTM_Conn *conn, GTM_IsolationLevel isolevel, - bool read_only, const char *global_sessionid, - uint32 client_id, GTM_Timestamp timestamp); + bool read_only, const char *global_sessionid, + uint32 client_id, GTM_Timestamp timestamp); #ifdef __TBASE__ Get_GTS_Result get_global_timestamp(GTM_Conn *conn); #ifdef __XLOG__ int check_gtm_status(GTM_Conn *conn, int *status, GTM_Timestamp *master,XLogRecPtr *master_ptr, - int *standby_count,int **slave_is_sync, GTM_Timestamp **standby , - XLogRecPtr **slave_flush_ptr,char **application_name[GTM_MAX_WALSENDER],int timeout_seconds); + int *standby_count,int **slave_is_sync, GTM_Timestamp **standby , + XLogRecPtr **slave_flush_ptr,char **application_name[GTM_MAX_WALSENDER],int timeout_seconds); #else int check_gtm_status(GTM_Conn *conn, int *status, GTM_Timestamp *master, GTM_Timestamp *standby, char *standbyhost, char *standbyport, int32 buflen); #endif @@ -311,151 +311,151 @@ int get_gtm_errlog(GTM_Conn *conn, int timeout_seconds, char** errlog, int* len) #endif int bkup_begin_transaction_gxid(GTM_Conn *conn, GlobalTransactionId gxid, - GTM_IsolationLevel isolevel, bool read_only, - const char *global_sessionid, - uint32 client_id, GTM_Timestamp timestamp); + GTM_IsolationLevel isolevel, bool read_only, + const char *global_sessionid, + uint32 client_id, GTM_Timestamp timestamp); GlobalTransactionId begin_transaction_autovacuum(GTM_Conn *conn, GTM_IsolationLevel isolevel); int bkup_begin_transaction_autovacuum(GTM_Conn *conn, GlobalTransactionId gxid, - GTM_IsolationLevel isolevel, - uint32 client_id); + GTM_IsolationLevel isolevel, + uint32 client_id); int commit_transaction(GTM_Conn *conn, GlobalTransactionId gxid, - int waited_xid_count, - GlobalTransactionId *waited_xids); + int waited_xid_count, + GlobalTransactionId *waited_xids); int bkup_commit_transaction(GTM_Conn *conn, GlobalTransactionId gxid); int commit_prepared_transaction(GTM_Conn *conn, GlobalTransactionId gxid, - GlobalTransactionId prepared_gxid, - int waited_xid_count, - GlobalTransactionId *waited_xids); + GlobalTransactionId prepared_gxid, + int waited_xid_count, + GlobalTransactionId *waited_xids); int bkup_commit_prepared_transaction(GTM_Conn *conn, GlobalTransactionId gxid, GlobalTransactionId prepared_gxid); int abort_transaction(GTM_Conn *conn, GlobalTransactionId gxid); int bkup_abort_transaction(GTM_Conn *conn, GlobalTransactionId gxid); int start_prepared_transaction(GTM_Conn *conn, GlobalTransactionId gxid, char *gid, - char *nodestring); + char *nodestring); int log_commit_transaction(GTM_Conn *conn, GlobalTransactionId gxid,const char *gid, - const char *nodestring, int node_count, bool isGlobal, bool isCommit, - GlobalTimestamp prepare_ts, GlobalTimestamp commit_ts); + const char *nodestring, int node_count, bool isGlobal, bool isCommit, + GlobalTimestamp prepare_ts, GlobalTimestamp commit_ts); int log_scan_transaction(GTM_Conn *conn, - GlobalTransactionId gxid, - const char *node_string, - GlobalTimestamp start_ts, - GlobalTimestamp local_start_ts, - GlobalTimestamp local_complete_ts, - int scan_type, - const char *rel_name, - int64 scan_number); + 
GlobalTransactionId gxid, + const char *node_string, + GlobalTimestamp start_ts, + GlobalTimestamp local_start_ts, + GlobalTimestamp local_complete_ts, + int scan_type, + const char *rel_name, + int64 scan_number); int backup_start_prepared_transaction(GTM_Conn *conn, GlobalTransactionId gxid, char *gid, - char *nodestring); + char *nodestring); int prepare_transaction(GTM_Conn *conn, GlobalTransactionId gxid); int bkup_prepare_transaction(GTM_Conn *conn, GlobalTransactionId gxid); int get_gid_data(GTM_Conn *conn, GTM_IsolationLevel isolevel, char *gid, - GlobalTransactionId *gxid, - GlobalTransactionId *prepared_gxid, - char **nodestring); + GlobalTransactionId *gxid, + GlobalTransactionId *prepared_gxid, + char **nodestring); /* * Multiple Transaction Management API */ int begin_transaction_multi(GTM_Conn *conn, int txn_count, GTM_IsolationLevel *txn_isolation_level, - bool *txn_read_only, GTMProxy_ConnID *txn_connid, - int *txn_count_out, GlobalTransactionId *gxid_out, GTM_Timestamp *ts_out); + bool *txn_read_only, GTMProxy_ConnID *txn_connid, + int *txn_count_out, GlobalTransactionId *gxid_out, GTM_Timestamp *ts_out); int bkup_begin_transaction_multi(GTM_Conn *conn, int txn_count, - GlobalTransactionId *gxid, GTM_IsolationLevel *isolevel, - bool *read_only, - const char *txn_global_sessionid[], - uint32 *client_id, - GTMProxy_ConnID *txn_connid); + GlobalTransactionId *gxid, GTM_IsolationLevel *isolevel, + bool *read_only, + const char *txn_global_sessionid[], + uint32 *client_id, + GTMProxy_ConnID *txn_connid); int commit_transaction_multi(GTM_Conn *conn, int txn_count, GlobalTransactionId *gxid, - int *txn_count_out, int *status_out); + int *txn_count_out, int *status_out); int bkup_commit_transaction_multi(GTM_Conn *conn, int txn_count, - GlobalTransactionId *gxid); + GlobalTransactionId *gxid); int abort_transaction_multi(GTM_Conn *conn, int txn_count, GlobalTransactionId *gxid, - int *txn_count_out, int *status_out); + int *txn_count_out, int *status_out); int bkup_abort_transaction_multi(GTM_Conn *conn, int txn_count, GlobalTransactionId *gxid); int snapshot_get_multi(GTM_Conn *conn, int txn_count, GlobalTransactionId *gxid, - int *txn_count_out, int *status_out, - GlobalTransactionId *xmin_out, GlobalTransactionId *xmax_out, - GlobalTransactionId *recent_global_xmin_out, int32 *xcnt_out); + int *txn_count_out, int *status_out, + GlobalTransactionId *xmin_out, GlobalTransactionId *xmax_out, + GlobalTransactionId *recent_global_xmin_out, int32 *xcnt_out); /* * Snapshot Management API */ GTM_SnapshotData *get_snapshot(GTM_Conn *conn, GlobalTransactionId gxid, - bool canbe_grouped); + bool canbe_grouped); /* * Node Registering management API */ int node_register(GTM_Conn *conn, - GTM_PGXCNodeType type, - GTM_PGXCNodePort port, - char *node_name, - char *datafolder); + GTM_PGXCNodeType type, + GTM_PGXCNodePort port, + char *node_name, + char *datafolder); int node_register(GTM_Conn *conn, GTM_PGXCNodeType type, GTM_PGXCNodePort port, - char *node_name, char *datafolder); -int node_register_internal(GTM_Conn *conn, GTM_PGXCNodeType type, const char *host, GTM_PGXCNodePort port, char *node_name, - char *datafolder, GTM_PGXCNodeStatus status); + char *node_name, char *datafolder); +int node_register_internal(GTM_Conn *conn, GTM_PGXCNodeType type, const char *host, GTM_PGXCNodePort port, char *node_name, + char *datafolder, GTM_PGXCNodeStatus status); int bkup_node_register_internal(GTM_Conn *conn, GTM_PGXCNodeType type, const char *host, GTM_PGXCNodePort port, - char *node_name, char 
*datafolder, - GTM_PGXCNodeStatus status); + char *node_name, char *datafolder, + GTM_PGXCNodeStatus status); int node_unregister(GTM_Conn *conn, GTM_PGXCNodeType type, const char *node_name); int bkup_node_unregister(GTM_Conn *conn, GTM_PGXCNodeType type, const char * node_name); int backend_disconnect(GTM_Conn *conn, bool is_postmaster, GTM_PGXCNodeType type, char *node_name); char *node_get_local_addr(GTM_Conn *conn, char *buf, size_t buflen, int *rc); int register_session(GTM_Conn *conn, const char *coord_name, int coord_procid, - int coord_backendid); + int coord_backendid); int report_global_xmin(GTM_Conn *conn, const char *node_name, - GTM_PGXCNodeType type, GlobalTransactionId gxid, - GlobalTransactionId *global_xmin, - GlobalTransactionId *latest_completed_xid, - int *errcode); + GTM_PGXCNodeType type, GlobalTransactionId gxid, + GlobalTransactionId *global_xmin, + GlobalTransactionId *latest_completed_xid, + int *errcode); /* * Sequence Management API */ int open_sequence(GTM_Conn *conn, GTM_SequenceKey key, GTM_Sequence increment, - GTM_Sequence minval, GTM_Sequence maxval, - GTM_Sequence startval, bool cycle, - GlobalTransactionId gxid); + GTM_Sequence minval, GTM_Sequence maxval, + GTM_Sequence startval, bool cycle, + GlobalTransactionId gxid); int bkup_open_sequence(GTM_Conn *conn, GTM_SequenceKey key, GTM_Sequence increment, - GTM_Sequence minval, GTM_Sequence maxval, - GTM_Sequence startval, bool cycle, - GlobalTransactionId gxid); + GTM_Sequence minval, GTM_Sequence maxval, + GTM_Sequence startval, bool cycle, + GlobalTransactionId gxid); int alter_sequence(GTM_Conn *conn, GTM_SequenceKey key, GTM_Sequence increment, - GTM_Sequence minval, GTM_Sequence maxval, - GTM_Sequence startval, GTM_Sequence lastval, bool cycle, bool is_restart); + GTM_Sequence minval, GTM_Sequence maxval, + GTM_Sequence startval, GTM_Sequence lastval, bool cycle, bool is_restart); int bkup_alter_sequence(GTM_Conn *conn, GTM_SequenceKey key, GTM_Sequence increment, - GTM_Sequence minval, GTM_Sequence maxval, - GTM_Sequence startval, GTM_Sequence lastval, bool cycle, bool is_restart); + GTM_Sequence minval, GTM_Sequence maxval, + GTM_Sequence startval, GTM_Sequence lastval, bool cycle, bool is_restart); int close_sequence(GTM_Conn *conn, GTM_SequenceKey key, GlobalTransactionId gxid); int bkup_close_sequence(GTM_Conn *conn, GTM_SequenceKey key, GlobalTransactionId gxid); int rename_sequence(GTM_Conn *conn, GTM_SequenceKey key, - GTM_SequenceKey newkey, GlobalTransactionId gxid); + GTM_SequenceKey newkey, GlobalTransactionId gxid); int bkup_rename_sequence(GTM_Conn *conn, GTM_SequenceKey key, - GTM_SequenceKey newkey, GlobalTransactionId gxid); + GTM_SequenceKey newkey, GlobalTransactionId gxid); int get_current(GTM_Conn *conn, GTM_SequenceKey key, - char *coord_name, int coord_procid, GTM_Sequence *result); + char *coord_name, int coord_procid, GTM_Sequence *result); int get_next(GTM_Conn *conn, GTM_SequenceKey key, - char *coord_name, int coord_procid, - GTM_Sequence range, GTM_Sequence *result, GTM_Sequence *rangemax); + char *coord_name, int coord_procid, + GTM_Sequence range, GTM_Sequence *result, GTM_Sequence *rangemax); int bkup_get_next(GTM_Conn *conn, GTM_SequenceKey key, - char *coord_name, int coord_procid, - GTM_Sequence range, GTM_Sequence *result, GTM_Sequence *rangemax); + char *coord_name, int coord_procid, + GTM_Sequence range, GTM_Sequence *result, GTM_Sequence *rangemax); int set_val(GTM_Conn *conn, GTM_SequenceKey key, char *coord_name, - int coord_procid, GTM_Sequence nextval, bool 
iscalled); + int coord_procid, GTM_Sequence nextval, bool iscalled); int bkup_set_val(GTM_Conn *conn, GTM_SequenceKey key, char *coord_name, - int coord_procid, GTM_Sequence nextval, bool iscalled); + int coord_procid, GTM_Sequence nextval, bool iscalled); int reset_sequence(GTM_Conn *conn, GTM_SequenceKey key); int bkup_reset_sequence(GTM_Conn *conn, GTM_SequenceKey key); @@ -499,4 +499,5 @@ int32 check_storage_sequence(GTM_Conn *conn, GTMStorageSequneceStatus **store_se int32 check_storage_transaction(GTM_Conn *conn, GTMStorageTransactionStatus **store_txn, bool need_fix); int rename_db_sequence(GTM_Conn *conn, GTM_SequenceKey key, GTM_SequenceKey newkey, GlobalTransactionId gxid); #endif +void gtmpqFreeResultResource(GTM_Result *result); #endif diff --git a/src/include/gtm/gtm_msg.h b/src/include/gtm/gtm_msg.h index bb66c194..acedc926 100644 --- a/src/include/gtm/gtm_msg.h +++ b/src/include/gtm/gtm_msg.h @@ -22,112 +22,114 @@ */ typedef enum GTM_MessageType { - MSG_TYPE_INVALID, - MSG_SYNC_STANDBY, /* Message to sync woth GTM-Standby */ - MSG_NODE_REGISTER, /* Register a PGXC Node with GTM */ - MSG_BKUP_NODE_REGISTER, /* Backup of MSG_NODE_REGISTER */ - MSG_NODE_UNREGISTER, /* Unregister a PGXC Node with GTM */ - MSG_BKUP_NODE_UNREGISTER, /* Backup of MSG_NODE_UNREGISTER */ - MSG_REGISTER_SESSION, /* Register distributed session with GTM */ - MSG_REPORT_XMIN, /* Report RecentGlobalXmin to GTM */ - MSG_BKUP_REPORT_XMIN, - MSG_NODE_LIST, /* Get node list */ - MSG_NODE_BEGIN_REPLICATION_INIT, - MSG_NODE_END_REPLICATION_INIT, - MSG_BEGIN_BACKUP, /* Start backup by Standby */ - MSG_END_BACKUP, /* End backup preparation by Standby */ - MSG_TXN_BEGIN, /* Start a new transaction */ - MSG_BKUP_TXN_BEGIN, /* Backup of MSG_TXN_BEGIN */ - MSG_BKUP_GLOBAL_TIMESTAMP, /* Backup of the latest issued global timestmap */ - MSG_TXN_BEGIN_GETGXID, /* Start a new transaction and get GXID */ - MSG_BKUP_TXN_BEGIN_GETGXID, /* Backup of MSG_TXN_BEGIN_GETGXID */ - MSG_TXN_BEGIN_GETGXID_MULTI, /* Start multiple new transactions and get GXIDs */ - MSG_BKUP_TXN_BEGIN_GETGXID_MULTI, /* Backup of MSG_TXN_BEGIN_GETGXID_MULTI */ - MSG_TXN_START_PREPARED, /* Begins to prepare a transation for commit */ - MSG_BKUP_TXN_START_PREPARED, /* Backup of MSG_TXN_START_PREPARED */ - MSG_TXN_COMMIT, /* Commit a running or prepared transaction */ - MSG_BKUP_TXN_COMMIT, /* Backup of MSG_TXN_COMMIT */ - MSG_TXN_LOG_GLOBAL_COMMIT, /* Log a committed transaction*/ - MSG_TXN_LOG_COMMIT, /* Log a committed transaction*/ - MSG_TXN_LOG_GLOBAL_SCAN, /* Log a global scan */ - MSG_TXN_LOG_SCAN, /* Log a scan */ - MSG_TXN_COMMIT_MULTI, /* Commit multiple running or prepared transactions */ - MSG_BKUP_TXN_COMMIT_MULTI, /* Bacukp of MSG_TXN_COMMIT_MULTI */ - MSG_TXN_COMMIT_PREPARED, /* Commit a prepared transaction */ - MSG_BKUP_TXN_COMMIT_PREPARED, /* Backup of MSG_TXN_COMMIT_PREPARED */ - MSG_TXN_PREPARE, /* Finish preparing a transaction */ - MSG_BKUP_TXN_PREPARE, /* Backup of MSG_TXN_PREPARE */ - MSG_TXN_ROLLBACK, /* Rollback a transaction */ - MSG_BKUP_TXN_ROLLBACK, /* Backup of MSG_TXN_ROLLBACK */ - MSG_TXN_ROLLBACK_MULTI, /* Rollback multiple transactions */ - MSG_BKUP_TXN_ROLLBACK_MULTI, /* Backup of MSG_TXN_ROLLBACK_MULTI */ - MSG_TXN_GET_GID_DATA, /* Get info associated with a GID, and get a GXID */ - MSG_TXN_GET_GXID, /* Get a GXID for a transaction */ - MSG_BKUP_TXN_GET_GXID, - MSG_TXN_GET_NEXT_GXID, /* Get next GXID */ - MSG_TXN_GXID_LIST, - MSG_SNAPSHOT_GET, /* Get a global snapshot */ - MSG_SNAPSHOT_GET_MULTI, /* Get multiple 
global snapshots */ - MSG_SNAPSHOT_GXID_GET, /* Get GXID and snapshot together */ - MSG_SEQUENCE_INIT, /* Initialize a new global sequence */ - MSG_BKUP_SEQUENCE_INIT, /* Backup of MSG_SEQUENCE_INIT */ - MSG_SEQUENCE_GET_CURRENT,/* Get the current value of sequence */ - MSG_SEQUENCE_GET_NEXT, /* Get the next sequence value of sequence */ - MSG_BKUP_SEQUENCE_GET_NEXT, /* Backup of MSG_SEQUENCE_GET_NEXT */ - MSG_SEQUENCE_GET_LAST, /* Get the last sequence value of sequence */ - MSG_SEQUENCE_SET_VAL, /* Set values for sequence */ - MSG_BKUP_SEQUENCE_SET_VAL, /* Backup of MSG_SEQUENCE_SET_VAL */ - MSG_SEQUENCE_RESET, /* Reset the sequence */ - MSG_BKUP_SEQUENCE_RESET, /* Backup of MSG_SEQUENCE_RESET */ - MSG_SEQUENCE_CLOSE, /* Close a previously inited sequence */ - MSG_BKUP_SEQUENCE_CLOSE, /* Backup of MSG_SEQUENCE_CLOSE */ - MSG_SEQUENCE_RENAME, /* Rename a sequence */ - MSG_BKUP_SEQUENCE_RENAME, /* Backup of MSG_SEQUENCE_RENAME */ - MSG_SEQUENCE_ALTER, /* Alter a sequence */ - MSG_BKUP_SEQUENCE_ALTER, /* Backup of MSG_SEQUENCE_ALTER */ - MSG_SEQUENCE_LIST, /* Get a list of sequences */ - MSG_TXN_GET_STATUS, /* Get status of a given transaction */ - MSG_TXN_GET_ALL_PREPARED, /* Get information about all outstanding - * prepared transactions */ - MSG_TXN_BEGIN_GETGXID_AUTOVACUUM, /* Start a new transaction and get GXID for autovacuum */ - MSG_BKUP_TXN_BEGIN_GETGXID_AUTOVACUUM, /* Backup of MSG_TXN_BEGIN_GETGXID_AUTOVACUUM */ - MSG_DATA_FLUSH, /* flush pending data */ - MSG_BACKEND_DISCONNECT, /* tell GTM that the backend diconnected from the proxy */ - MSG_BARRIER, /* Tell the barrier was issued */ - MSG_BKUP_BARRIER, /* Backup barrier to standby */ + MSG_TYPE_INVALID, + MSG_SYNC_STANDBY, /* Message to sync woth GTM-Standby */ + MSG_NODE_REGISTER, /* Register a PGXC Node with GTM */ + MSG_BKUP_NODE_REGISTER, /* Backup of MSG_NODE_REGISTER */ + MSG_NODE_UNREGISTER, /* Unregister a PGXC Node with GTM */ + MSG_BKUP_NODE_UNREGISTER, /* Backup of MSG_NODE_UNREGISTER */ + MSG_REGISTER_SESSION, /* Register distributed session with GTM */ + MSG_REPORT_XMIN, /* Report RecentGlobalXmin to GTM */ + MSG_BKUP_REPORT_XMIN, + MSG_NODE_LIST, /* Get node list */ + MSG_NODE_BEGIN_REPLICATION_INIT, + MSG_NODE_END_REPLICATION_INIT, + MSG_BEGIN_BACKUP, /* Start backup by Standby */ + MSG_END_BACKUP, /* End backup preparation by Standby */ + MSG_TXN_BEGIN, /* Start a new transaction */ + MSG_BKUP_TXN_BEGIN, /* Backup of MSG_TXN_BEGIN */ + MSG_BKUP_GLOBAL_TIMESTAMP, /* Backup of the latest issued global timestmap */ + MSG_TXN_BEGIN_GETGXID, /* Start a new transaction and get GXID */ + MSG_BKUP_TXN_BEGIN_GETGXID, /* Backup of MSG_TXN_BEGIN_GETGXID */ + MSG_TXN_BEGIN_GETGXID_MULTI, /* Start multiple new transactions and get GXIDs */ + MSG_BKUP_TXN_BEGIN_GETGXID_MULTI, /* Backup of MSG_TXN_BEGIN_GETGXID_MULTI */ + MSG_TXN_START_PREPARED, /* Begins to prepare a transation for commit */ + MSG_BKUP_TXN_START_PREPARED, /* Backup of MSG_TXN_START_PREPARED */ + MSG_TXN_COMMIT, /* Commit a running or prepared transaction */ + MSG_BKUP_TXN_COMMIT, /* Backup of MSG_TXN_COMMIT */ + MSG_TXN_LOG_GLOBAL_COMMIT, /* Log a committed transaction*/ + MSG_TXN_LOG_COMMIT, /* Log a committed transaction*/ + MSG_TXN_LOG_GLOBAL_SCAN, /* Log a global scan */ + MSG_TXN_LOG_SCAN, /* Log a scan */ + MSG_TXN_COMMIT_MULTI, /* Commit multiple running or prepared transactions */ + MSG_BKUP_TXN_COMMIT_MULTI, /* Bacukp of MSG_TXN_COMMIT_MULTI */ + MSG_TXN_COMMIT_PREPARED, /* Commit a prepared transaction */ + MSG_BKUP_TXN_COMMIT_PREPARED, /* Backup 
of MSG_TXN_COMMIT_PREPARED */ + MSG_TXN_PREPARE, /* Finish preparing a transaction */ + MSG_BKUP_TXN_PREPARE, /* Backup of MSG_TXN_PREPARE */ + MSG_TXN_ROLLBACK, /* Rollback a transaction */ + MSG_BKUP_TXN_ROLLBACK, /* Backup of MSG_TXN_ROLLBACK */ + MSG_TXN_ROLLBACK_MULTI, /* Rollback multiple transactions */ + MSG_BKUP_TXN_ROLLBACK_MULTI, /* Backup of MSG_TXN_ROLLBACK_MULTI */ + MSG_TXN_GET_GID_DATA, /* Get info associated with a GID, and get a GXID */ + MSG_TXN_GET_GXID, /* Get a GXID for a transaction */ + MSG_BKUP_TXN_GET_GXID, + MSG_TXN_GET_NEXT_GXID, /* Get next GXID */ + MSG_TXN_GXID_LIST, + MSG_SNAPSHOT_GET, /* Get a global snapshot */ + MSG_SNAPSHOT_GET_MULTI, /* Get multiple global snapshots */ + MSG_SNAPSHOT_GXID_GET, /* Get GXID and snapshot together */ + MSG_SEQUENCE_INIT, /* Initialize a new global sequence */ + MSG_BKUP_SEQUENCE_INIT, /* Backup of MSG_SEQUENCE_INIT */ + MSG_SEQUENCE_GET_CURRENT,/* Get the current value of sequence */ + MSG_SEQUENCE_GET_NEXT, /* Get the next sequence value of sequence */ + MSG_BKUP_SEQUENCE_GET_NEXT, /* Backup of MSG_SEQUENCE_GET_NEXT */ + MSG_SEQUENCE_GET_LAST, /* Get the last sequence value of sequence */ + MSG_SEQUENCE_SET_VAL, /* Set values for sequence */ + MSG_BKUP_SEQUENCE_SET_VAL, /* Backup of MSG_SEQUENCE_SET_VAL */ + MSG_SEQUENCE_RESET, /* Reset the sequence */ + MSG_BKUP_SEQUENCE_RESET, /* Backup of MSG_SEQUENCE_RESET */ + MSG_SEQUENCE_CLOSE, /* Close a previously inited sequence */ + MSG_BKUP_SEQUENCE_CLOSE, /* Backup of MSG_SEQUENCE_CLOSE */ + MSG_SEQUENCE_RENAME, /* Rename a sequence */ + MSG_BKUP_SEQUENCE_RENAME, /* Backup of MSG_SEQUENCE_RENAME */ + MSG_SEQUENCE_ALTER, /* Alter a sequence */ + MSG_BKUP_SEQUENCE_ALTER, /* Backup of MSG_SEQUENCE_ALTER */ + MSG_SEQUENCE_LIST, /* Get a list of sequences */ + MSG_TXN_GET_STATUS, /* Get status of a given transaction */ + MSG_TXN_GET_ALL_PREPARED, /* Get information about all outstanding + * prepared transactions */ + MSG_TXN_BEGIN_GETGXID_AUTOVACUUM, /* Start a new transaction and get GXID for autovacuum */ + MSG_BKUP_TXN_BEGIN_GETGXID_AUTOVACUUM, /* Backup of MSG_TXN_BEGIN_GETGXID_AUTOVACUUM */ + MSG_DATA_FLUSH, /* flush pending data */ + MSG_BACKEND_DISCONNECT, /* tell GTM that the backend diconnected from the proxy */ + MSG_BARRIER, /* Tell the barrier was issued */ + MSG_BKUP_BARRIER, /* Backup barrier to standby */ #ifdef __TBASE__ - /* Gtm storage tags. */ - MSG_GET_STORAGE, /* Backup get storage file */ - MSG_TXN_FINISH_GID, /* Finish gid transaction in GTM */ - MSG_LIST_GTM_STORE, /* List gtm store info */ - MSG_LIST_GTM_STORE_SEQ, /* List gtm running sequence info */ - MSG_LIST_GTM_STORE_TXN, /* List gtm running transaction info */ - MSG_CHECK_GTM_STORE_SEQ, /* Check gtm sequence usage info */ - MSG_CHECK_GTM_STORE_TXN, /* Check gtm transaction usage info */ - MSG_CLEAN_SESSION_SEQ, /* clean up session related seq */ + /* Gtm storage tags. */ + MSG_GET_STORAGE, /* Backup get storage file */ + MSG_TXN_FINISH_GID, /* Finish gid transaction in GTM */ + MSG_LIST_GTM_STORE, /* List gtm store info */ + MSG_LIST_GTM_STORE_SEQ, /* List gtm running sequence info */ + MSG_LIST_GTM_STORE_TXN, /* List gtm running transaction info */ + MSG_CHECK_GTM_STORE_SEQ, /* Check gtm sequence usage info */ + MSG_CHECK_GTM_STORE_TXN, /* Check gtm transaction usage info */ + MSG_CLEAN_SESSION_SEQ, /* clean up session related seq */ - /* Global timestamp tags. */ - MSG_GETGTS, /* Get a global timestamp */ - MSG_GETGTS_MULTI, /* Get multiple global timestamps */ + /* Global timestamp tags. 
*/ + MSG_GETGTS, /* Get a global timestamp */ + MSG_GETGTS_MULTI, /* Get multiple global timestamps */ - MSG_CHECK_GTM_STATUS, /* Get global timestamp from both master and slave gtm. */ + MSG_CHECK_GTM_STATUS, /* Get global timestamp from both master and slave gtm. */ - MSG_DB_SEQUENCE_RENAME, /* Rename all sequence in database*/ - MSG_BKUP_DB_SEQUENCE_RENAME, + MSG_DB_SEQUENCE_RENAME, /* Rename all sequence in database*/ + MSG_BKUP_DB_SEQUENCE_RENAME, #endif #ifdef __XLOG__ - MSG_START_REPLICATION, + MSG_START_REPLICATION, MSG_GET_REPLICATION_STATUS, MSG_GET_REPLICATION_TRANSFER, #endif +#ifdef __TBASE__ MSG_GET_STATISTICS, MSG_GET_ERRORLOG, +#endif - /* - * Must be at the end - */ - MSG_TYPE_COUNT /* A dummmy entry just to count the message types */ + /* + * Must be at the end + */ + MSG_TYPE_COUNT /* A dummmy entry just to count the message types */ } GTM_MessageType; /* @@ -136,67 +138,67 @@ typedef enum GTM_MessageType */ typedef enum GTM_ResultType { - SYNC_STANDBY_RESULT, - NODE_REGISTER_RESULT, - NODE_UNREGISTER_RESULT, - REGISTER_SESSION_RESULT, - REPORT_XMIN_RESULT, - NODE_LIST_RESULT, - NODE_BEGIN_REPLICATION_INIT_RESULT, - NODE_END_REPLICATION_INIT_RESULT, - BEGIN_BACKUP_SUCCEED_RESULT, + SYNC_STANDBY_RESULT, + NODE_REGISTER_RESULT, + NODE_UNREGISTER_RESULT, + REGISTER_SESSION_RESULT, + REPORT_XMIN_RESULT, + NODE_LIST_RESULT, + NODE_BEGIN_REPLICATION_INIT_RESULT, + NODE_END_REPLICATION_INIT_RESULT, + BEGIN_BACKUP_SUCCEED_RESULT, #ifdef __TBASE__ - BEGIN_BACKUP_FAIL_RESULT, + BEGIN_BACKUP_FAIL_RESULT, #endif - END_BACKUP_RESULT, - TXN_BEGIN_RESULT, - TXN_BEGIN_GETGXID_RESULT, - TXN_BEGIN_GETGTS_RESULT, - TXN_BEGIN_GETGXID_MULTI_RESULT, - TXN_BEGIN_GETGTS_MULTI_RESULT, + END_BACKUP_RESULT, + TXN_BEGIN_RESULT, + TXN_BEGIN_GETGXID_RESULT, + TXN_BEGIN_GETGTS_RESULT, + TXN_BEGIN_GETGXID_MULTI_RESULT, + TXN_BEGIN_GETGTS_MULTI_RESULT, #ifdef __TBASE__ - TXN_CHECK_GTM_STATUS_RESULT, + TXN_CHECK_GTM_STATUS_RESULT, #endif - TXN_PREPARE_RESULT, - TXN_START_PREPARED_RESULT, - TXN_LOG_TRANSACTION_RESULT, - TXN_LOG_SCAN_RESULT, - TXN_COMMIT_PREPARED_RESULT, - TXN_COMMIT_RESULT, - TXN_COMMIT_MULTI_RESULT, - TXN_ROLLBACK_RESULT, - TXN_ROLLBACK_MULTI_RESULT, - TXN_GET_GID_DATA_RESULT, - TXN_GET_GXID_RESULT, - TXN_GET_NEXT_GXID_RESULT, - TXN_GXID_LIST_RESULT, - SNAPSHOT_GET_RESULT, - SNAPSHOT_GET_MULTI_RESULT, - SNAPSHOT_GXID_GET_RESULT, - SEQUENCE_INIT_RESULT, - SEQUENCE_GET_CURRENT_RESULT, - SEQUENCE_GET_NEXT_RESULT, - SEQUENCE_GET_LAST_RESULT, - SEQUENCE_SET_VAL_RESULT, - SEQUENCE_RESET_RESULT, - SEQUENCE_CLOSE_RESULT, - SEQUENCE_RENAME_RESULT, - SEQUENCE_ALTER_RESULT, - SEQUENCE_LIST_RESULT, - TXN_GET_STATUS_RESULT, - TXN_GET_ALL_PREPARED_RESULT, - TXN_BEGIN_GETGXID_AUTOVACUUM_RESULT, - BARRIER_RESULT, + TXN_PREPARE_RESULT, + TXN_START_PREPARED_RESULT, + TXN_LOG_TRANSACTION_RESULT, + TXN_LOG_SCAN_RESULT, + TXN_COMMIT_PREPARED_RESULT, + TXN_COMMIT_RESULT, + TXN_COMMIT_MULTI_RESULT, + TXN_ROLLBACK_RESULT, + TXN_ROLLBACK_MULTI_RESULT, + TXN_GET_GID_DATA_RESULT, + TXN_GET_GXID_RESULT, + TXN_GET_NEXT_GXID_RESULT, + TXN_GXID_LIST_RESULT, + SNAPSHOT_GET_RESULT, + SNAPSHOT_GET_MULTI_RESULT, + SNAPSHOT_GXID_GET_RESULT, + SEQUENCE_INIT_RESULT, + SEQUENCE_GET_CURRENT_RESULT, + SEQUENCE_GET_NEXT_RESULT, + SEQUENCE_GET_LAST_RESULT, + SEQUENCE_SET_VAL_RESULT, + SEQUENCE_RESET_RESULT, + SEQUENCE_CLOSE_RESULT, + SEQUENCE_RENAME_RESULT, + SEQUENCE_ALTER_RESULT, + SEQUENCE_LIST_RESULT, + TXN_GET_STATUS_RESULT, + TXN_GET_ALL_PREPARED_RESULT, + TXN_BEGIN_GETGXID_AUTOVACUUM_RESULT, + BARRIER_RESULT, #ifdef 
__TBASE__ - STORAGE_TRANSFER_RESULT, - TXN_FINISH_GID_RESULT, - MSG_LIST_GTM_STORE_RESULT, - MSG_LIST_GTM_STORE_SEQ_RESULT, /* List gtm running sequence info */ - MSG_LIST_GTM_TXN_STORE_RESULT, /* List gtm running transaction info */ - MSG_CHECK_GTM_SEQ_STORE_RESULT, /* Check gtm sequence usage info */ - MSG_CHECK_GTM_TXN_STORE_RESULT, /* Check gtm transaction usage info */ - MSG_CLEAN_SESSION_SEQ_RESULT, - MSG_DB_SEQUENCE_RENAME_RESULT, + STORAGE_TRANSFER_RESULT, + TXN_FINISH_GID_RESULT, + MSG_LIST_GTM_STORE_RESULT, + MSG_LIST_GTM_STORE_SEQ_RESULT, /* List gtm running sequence info */ + MSG_LIST_GTM_TXN_STORE_RESULT, /* List gtm running transaction info */ + MSG_CHECK_GTM_SEQ_STORE_RESULT, /* Check gtm sequence usage info */ + MSG_CHECK_GTM_TXN_STORE_RESULT, /* Check gtm transaction usage info */ + MSG_CLEAN_SESSION_SEQ_RESULT, + MSG_DB_SEQUENCE_RENAME_RESULT, #endif #ifdef __XLOG__ @@ -206,9 +208,10 @@ typedef enum GTM_ResultType MSG_REPLICATION_CONTENT, #endif +#ifdef __TBASE__ MSG_GET_GTM_STATISTICS_RESULT, MSG_GET_GTM_ERRORLOG_RESULT, - +#endif RESULT_TYPE_COUNT } GTM_ResultType; @@ -221,7 +224,7 @@ typedef enum GTM_ResultType */ typedef struct GTM_ProxyMsgHeader { - GTMProxy_ConnID ph_conid; + GTMProxy_ConnID ph_conid; } GTM_ProxyMsgHeader; #endif From 008ba88f444d5b07685979260309609fbf6762bd Mon Sep 17 00:00:00 2001 From: anthonyyan Date: Tue, 27 Apr 2021 14:47:01 +0800 Subject: [PATCH 356/578] avoid handle remote handles when dn_handles or co_handles is NULL(merge request !297) http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131087088693 --- src/backend/pgxc/pool/execRemote.c | 13 ++++++++++++- src/backend/pgxc/pool/pgxcnode.c | 6 ++++++ src/include/pgxc/pgxcnode.h | 2 +- 3 files changed, 19 insertions(+), 2 deletions(-) diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 8f0e5ab9..e02f262f 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -7623,6 +7623,11 @@ PreAbort_Remote(TranscationType txn_type, bool need_release_handle) struct rusage start_r; struct timeval start_t; + if (!is_pgxc_handles_init()) + { + return true; + } + clean_nodes = (PGXCNodeHandle**)palloc(sizeof(PGXCNodeHandle*) * (NumCoords + NumDataNodes)); cancel_dn_list = (int*)palloc(sizeof(int) * NumDataNodes); cancel_co_list = (int*)palloc(sizeof(int) * NumCoords); @@ -12859,8 +12864,14 @@ void SetCurrentHandlesReadonly(void) { int i = 0; PGXCNodeHandle *conn = NULL; - PGXCNodeAllHandles *handles = get_current_handles(); + PGXCNodeAllHandles *handles = NULL; + if (!is_pgxc_handles_init()) + { + return; + } + + handles = get_current_handles(); for (i = 0; i < handles->dn_conn_count; i++) { diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c index c7c630fa..6db7d43b 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -5860,6 +5860,12 @@ is_ddl_leader_cn(char *first_cn) return strcmp(first_cn, PGXCNodeName) == 0; } + +inline bool +is_pgxc_handles_init() +{ + return (dn_handles != NULL && co_handles != NULL); +} #endif /* diff --git a/src/include/pgxc/pgxcnode.h b/src/include/pgxc/pgxcnode.h index be08deab..e5f9c6e1 100644 --- a/src/include/pgxc/pgxcnode.h +++ b/src/include/pgxc/pgxcnode.h @@ -297,7 +297,7 @@ inline bool is_ddl_leader_cn(char *leader_cn); extern int pgxc_node_send_sessionid(PGXCNodeHandle * handle); extern void SerializeSessionId(Size maxsize, char *start_address); extern void StartParallelWorkerSessionId(char *address); -void 
CheckInvalidateRemoteHandles(void); +extern bool is_pgxc_handles_init(void); #endif #ifdef __AUDIT__ From 724c50e44a2398dbbbf81377d45fe654ebf6158c Mon Sep 17 00:00:00 2001 From: arrowbowang Date: Wed, 21 Apr 2021 15:30:08 +0800 Subject: [PATCH 357/578] fix update with returning clause on partitioned table --- src/backend/executor/nodeModifyTable.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c index 3d6b9769..83042aac 100644 --- a/src/backend/executor/nodeModifyTable.c +++ b/src/backend/executor/nodeModifyTable.c @@ -2275,6 +2275,11 @@ ExecModifyTable(PlanState *pstate) { subplanstate = node->partplans[node->part_whichplan]; part_resultRelInfo = resultRelInfo->part_relinfo[node->part_whichplan]; + /* when use update ... returning this fuction will be reentered, + * so the execution should ues the last state of part_resultRelInfo + * */ + junkfilter = resultRelInfo->ri_junkFilter; + estate->es_result_relation_info = part_resultRelInfo; } else { From 46179ceeefb5d3ecd4ce65b069777643b299dc58 Mon Sep 17 00:00:00 2001 From: qiannzhang Date: Mon, 26 Apr 2021 20:55:21 +0800 Subject: [PATCH 358/578] Fix several bugs when pull up or sublinks. 1.Fix if one of two Exists contains sublinks itself and is not related (not pullup). 2.Fix if some qual of one Exists constains only local vars (can pullup). 3.Fix if some qual of one Exists constains only upper vars (not pullup). 4.Fix if some qual of one Exists is too complicated to convert (not pullup). Add regress. http://tapd.oa.com/pgxz/bugtrace/bugs/view?bug_id=1010092131087122695 --- src/backend/optimizer/plan/subselect.c | 30 +++- src/backend/optimizer/prep/prepjointree.c | 2 +- src/test/regress/expected/subselect.out | 192 ++++++++++++++++++++++ src/test/regress/sql/subselect.sql | 97 +++++++++++ 4 files changed, 311 insertions(+), 10 deletions(-) diff --git a/src/backend/optimizer/plan/subselect.c b/src/backend/optimizer/plan/subselect.c index 13150602..61647167 100644 --- a/src/backend/optimizer/plan/subselect.c +++ b/src/backend/optimizer/plan/subselect.c @@ -3029,7 +3029,7 @@ get_or_exist_subquery_targetlist(PlannerInfo *root, Node *node, List **targetLis } if (list_length(new_args) == 1) { - return (Node *)list_head(new_args); + return (Node *)linitial(new_args); } else if (list_length(new_args) == 0) { @@ -3044,7 +3044,7 @@ get_or_exist_subquery_targetlist(PlannerInfo *root, Node *node, List **targetLis Var *var; vars = pull_vars_of_level(node, 0); - + /* only support upper_var = local_var */ Assert(list_length(vars) == 1); *targetList = lappend(*targetList, lfirst(vars->head)); @@ -3584,30 +3584,39 @@ check_or_exist_qual_pullupable(PlannerInfo *root, Node *node) } else { - List *vars = pull_vars_of_level(node, 1); - if (vars == NIL) + bool result = false; + + if (pull_vars_of_level(node, 1) == NIL) return true; + /* If upper_var, only support upper_var = local_var */ + if (pull_vars_of_level(node, 0) == NIL) + return false; if (IsA(node, OpExpr)) { HeapTuple opertup; Form_pg_operator operform; char *oprname; - OpExpr *expr = (OpExpr *)node; + if (list_length(expr->args) != 2 || + !IsA(linitial(expr->args), Var) || + !IsA(llast(expr->args), Var)) + { + return false; + } + opertup = SearchSysCache1(OPEROID, ObjectIdGetDatum(expr->opno)); if (!HeapTupleIsValid(opertup)) return false; operform = (Form_pg_operator)GETSTRUCT(opertup); oprname = NameStr(operform->oprname); - + /* only support simple equal */ + result = (strcmp(oprname, "=") == 0); 
ReleaseSysCache(opertup); - if (strcmp(oprname, "=") == 0 && list_length(expr->args) == 2) - return true; } - return false; + return result; } return true; } @@ -3625,6 +3634,9 @@ bool check_or_exist_sublink_pullupable(PlannerInfo *root, Node *node) if (subselect->cteList) return false; + if (subselect->hasSubLinks) + return false; + if (!simplify_EXISTS_query(root, subselect)) return false; diff --git a/src/backend/optimizer/prep/prepjointree.c b/src/backend/optimizer/prep/prepjointree.c index 93ceb77e..45e03eb6 100644 --- a/src/backend/optimizer/prep/prepjointree.c +++ b/src/backend/optimizer/prep/prepjointree.c @@ -528,7 +528,7 @@ pull_up_or_sublinks_qual_recurse(PlannerInfo *root, Node *node, Node **jtlink, N } else { - BoolExpr *expr = (BoolExpr *)(*or_clause); + BoolExpr *expr = (BoolExpr *)(*orquals); if (expr->boolop == OR_EXPR) { *orquals = (Node *)make_andclause(list_make2(*orquals, diff --git a/src/test/regress/expected/subselect.out b/src/test/regress/expected/subselect.out index c68869c3..32ed8e4f 100644 --- a/src/test/regress/expected/subselect.out +++ b/src/test/regress/expected/subselect.out @@ -2222,3 +2222,195 @@ drop table notin_t1; drop table notin_t2; drop function explain_sq_limit(); drop table sq_limit; +-- pull up or sublinks +set enable_pullup_subquery to on; +create table coltest( + c1 int, + c2 bigint, + c3 int2, + c4 bool, + c5 name, + c6 float4, + c7 float8, + c9 numeric, + c10 text, + c11 char(100), + c12 varchar, + c13 money, + c14 date, + c15 timestamp, + c16 timestamp with time zone, + c17 time, + c18 time with time zone, + c19 interval, + c20 abstime, + c21 reltime, + c22 tinterval, + c23 box, + c24 line, + c25 path, + c26 point, + c27 lseg, + c28 polygon, + c29 circle, + c30 inet, + c31 macaddr +); +explain (costs off) +select subq_2.c3 as c0 +from coltest as subq_2 +where + (EXISTS ( + select subq_2.c5 as c3, sample_2.c9 as c2 + from public.coltest as sample_2 + where subq_2.c6 = sample_2.c6)) + or + (EXISTS ( + select ref_2.c3 as c2 + from public.coltest as sample_3 + left join coltest as ref_2 on (true) + where (EXISTS ( + select sample_3.c1 as c1, ref_3.c11 as c7 + from public.coltest as ref_3 + where ref_3.c6 = sample_3.c6)))) +; + QUERY PLAN +------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on coltest subq_2 + Filter: ((SubPlan 1) OR $1) + InitPlan 2 (returns $1) + -> Nested Loop Left Join + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Nested Loop Semi Join + Join Filter: (sample_3.c6 = ref_3.c6) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: c6 + -> Seq Scan on coltest sample_3 + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: c6 + -> Seq Scan on coltest ref_3 + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on coltest ref_2 + SubPlan 1 + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on coltest sample_2 + Filter: (subq_2.c6 = c6) +(22 rows) + +explain (costs off) +select subq_2.c3 as c0 +from coltest as subq_2 +where + (EXISTS ( + select subq_2.c5 as c3, sample_2.c9 as c2 + from public.coltest as sample_2 + where case when subq_2.c6 is NULL then sample_2.c14 else cast(null as date) end + = sample_2.c14)) + or + (EXISTS ( + select sample_3.c3 as c2 + from public.coltest as sample_3 + where subq_2.c6 = sample_3.c6)) +; + QUERY PLAN 
+-------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on coltest subq_2 + Filter: ((SubPlan 1) OR (SubPlan 2)) + SubPlan 1 + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on coltest sample_2 + Filter: (CASE WHEN (subq_2.c6 IS NULL) THEN c14 ELSE NULL::date END = c14) + SubPlan 2 + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on coltest sample_3 + Filter: (subq_2.c6 = c6) +(11 rows) + +explain (costs off) +select subq_2.c3 as c0 +from coltest as subq_2 +where + (EXISTS ( + select subq_2.c5 as c3, sample_2.c9 as c2 + from public.coltest as sample_2 + where subq_2.c6 = sample_2.c6 and subq_2.c10='a')) + or + (EXISTS ( + select sample_3.c3 as c2 + from public.coltest as sample_3 + where subq_2.c6 = sample_3.c6)) +; + QUERY PLAN +------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on coltest subq_2 + Filter: ((SubPlan 1) OR (SubPlan 2)) + SubPlan 1 + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Result + One-Time Filter: (subq_2.c10 = 'a'::text) + -> Seq Scan on coltest sample_2 + Filter: (subq_2.c6 = c6) + SubPlan 2 + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Seq Scan on coltest sample_3 + Filter: (subq_2.c6 = c6) +(13 rows) + +explain (costs off) +select subq_2.c3 as c0 +from coltest as subq_2 +where + (EXISTS ( + select subq_2.c5 as c3, sample_2.c9 as c2 + from public.coltest as sample_2 + where subq_2.c6 = sample_2.c6 and sample_2.c10='a')) + or + (EXISTS ( + select sample_3.c3 as c2 + from public.coltest as sample_3 + where subq_2.c6 = sample_3.c6)) +; + QUERY PLAN +----------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Hash Right Join + Hash Cond: (fake = subq_2.c6) + Filter: ((fake_1 IS NOT NULL) OR (fake_1 IS NOT NULL)) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: fake + -> Finalize HashAggregate + Group Key: sample_3.c6 + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: c6 + -> Partial HashAggregate + Group Key: sample_3.c6 + -> Seq Scan on coltest sample_3 + -> Hash + -> Hash Left Join + Hash Cond: (subq_2.c6 = fake) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: c6 + -> Seq Scan on coltest subq_2 + -> Hash + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: fake + -> Group + Group Key: sample_2.c6 + -> Sort + Sort Key: sample_2.c6 + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: c6 + -> Group + Group Key: sample_2.c6 + -> Sort + Sort Key: sample_2.c6 + -> Seq Scan on coltest sample_2 + Filter: (c10 = 'a'::text) +(34 rows) + +drop table coltest; diff --git a/src/test/regress/sql/subselect.sql b/src/test/regress/sql/subselect.sql index f17f38f3..17f719d9 100644 --- a/src/test/regress/sql/subselect.sql +++ b/src/test/regress/sql/subselect.sql @@ -904,3 +904,100 @@ drop table notin_t2; drop function explain_sq_limit(); drop table sq_limit; + +-- pull up or sublinks +set enable_pullup_subquery to on; +create table coltest( + c1 int, + c2 bigint, + c3 int2, + c4 bool, + c5 name, + c6 float4, + c7 float8, + c9 numeric, + c10 text, + c11 char(100), + c12 varchar, + c13 money, + c14 date, + c15 timestamp, + c16 timestamp with time zone, + c17 
time, + c18 time with time zone, + c19 interval, + c20 abstime, + c21 reltime, + c22 tinterval, + c23 box, + c24 line, + c25 path, + c26 point, + c27 lseg, + c28 polygon, + c29 circle, + c30 inet, + c31 macaddr +); +explain (costs off) +select subq_2.c3 as c0 +from coltest as subq_2 +where + (EXISTS ( + select subq_2.c5 as c3, sample_2.c9 as c2 + from public.coltest as sample_2 + where subq_2.c6 = sample_2.c6)) + or + (EXISTS ( + select ref_2.c3 as c2 + from public.coltest as sample_3 + left join coltest as ref_2 on (true) + where (EXISTS ( + select sample_3.c1 as c1, ref_3.c11 as c7 + from public.coltest as ref_3 + where ref_3.c6 = sample_3.c6)))) +; +explain (costs off) +select subq_2.c3 as c0 +from coltest as subq_2 +where + (EXISTS ( + select subq_2.c5 as c3, sample_2.c9 as c2 + from public.coltest as sample_2 + where case when subq_2.c6 is NULL then sample_2.c14 else cast(null as date) end + = sample_2.c14)) + or + (EXISTS ( + select sample_3.c3 as c2 + from public.coltest as sample_3 + where subq_2.c6 = sample_3.c6)) +; +explain (costs off) +select subq_2.c3 as c0 +from coltest as subq_2 +where + (EXISTS ( + select subq_2.c5 as c3, sample_2.c9 as c2 + from public.coltest as sample_2 + where subq_2.c6 = sample_2.c6 and subq_2.c10='a')) + or + (EXISTS ( + select sample_3.c3 as c2 + from public.coltest as sample_3 + where subq_2.c6 = sample_3.c6)) +; +explain (costs off) +select subq_2.c3 as c0 +from coltest as subq_2 +where + (EXISTS ( + select subq_2.c5 as c3, sample_2.c9 as c2 + from public.coltest as sample_2 + where subq_2.c6 = sample_2.c6 and sample_2.c10='a')) + or + (EXISTS ( + select sample_3.c3 as c2 + from public.coltest as sample_3 + where subq_2.c6 = sample_3.c6)) +; +drop table coltest; From b529a4eef0d6336fae2db6f2173d70fc97e94428 Mon Sep 17 00:00:00 2001 From: youngxie Date: Tue, 19 Jan 2021 14:56:46 +0800 Subject: [PATCH 359/578] add switch for 2pc recovery file. 
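
The switch is a boolean GUC, enable_2pc_recovery_info, registered under
DEVELOPER_OPTIONS with postmaster context and enabled by default. When it is
turned off, the record_2pc_* and remove_2pc_records paths return early, so no
extra 2pc recovery records are written or removed.

A minimal sketch of disabling it (hypothetical postgresql.conf fragment; as a
postmaster-context GUC it only takes effect at server start):

    # postgresql.conf
    enable_2pc_recovery_info = off    # skip writing extra files for 2pc crash recovery
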
--- src/backend/access/transam/twophase.c | 30 ++++++++++++++++++++++++++ src/backend/utils/misc/guc.c | 9 ++++++++ src/include/access/twophase.h | 4 ++++ src/test/regress/expected/sysviews.out | 1 + 4 files changed, 44 insertions(+) diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c index 37a041f6..2d1cbd4a 100644 --- a/src/backend/access/transam/twophase.c +++ b/src/backend/access/transam/twophase.c @@ -143,6 +143,9 @@ int max_prepared_xacts = 10000; /* We require 2PC */ #else int max_prepared_xacts = 0; #endif +#ifdef __TBASE__ +bool enable_2pc_recovery_info = true; +#endif static GlobalTransaction @@ -3148,6 +3151,12 @@ void record_2pc_redo_remove_gid_xid(TransactionId xid) int i; GlobalTransaction gxact = NULL; bool found = false; + + if(!enable_2pc_recovery_info) + { + return ; + } + for (i = 0; i < TwoPhaseState->numPrepXacts; i++) { gxact = TwoPhaseState->prepXacts[i]; @@ -3185,6 +3194,11 @@ void record_2pc_involved_nodes_xid(const char * tid, XLogRecPtr xlogrec = 0; #endif + if (!enable_2pc_recovery_info) + { + return ; + } + if (enable_distri_print) { elog(LOG, "record twophase txn gid: %s, startnode: %s, participants: %s", tid, startnode, nodestring); @@ -3333,6 +3347,11 @@ void record_2pc_commit_timestamp(const char *tid, GlobalTimestamp commit_timesta GlobalTransaction gxact = NULL; #endif + if (!enable_2pc_recovery_info) + { + return ; + } + if (enable_distri_print) { elog(LOG, "record twophase txn gid: %s, commit_timestamp: %ld", tid, commit_timestamp); @@ -3433,6 +3452,12 @@ void record_2pc_commit_timestamp(const char *tid, GlobalTimestamp commit_timesta void remove_2pc_records(const char * tid, bool record_in_xlog) { char path[MAXPGPATH]; + + if (!enable_2pc_recovery_info) + { + return ; + } + snprintf(path, MAXPGPATH, TWOPHASE_RECORD_DIR "/%s", tid); /* no need to check file exists. 
since when it do not exists , unlink won't success */ @@ -3455,6 +3480,11 @@ void record_2pc_readonly(const char *gid) char path[MAXPGPATH]; char content[10] = "readonly"; + if(!enable_2pc_recovery_info) + { + return ; + } + if (enable_distri_print) { elog(LOG, "record readonly twophase txn gid: %s", gid); diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index d1bf813f..d7c85782 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -2552,6 +2552,15 @@ static struct config_bool ConfigureNamesBool[] = false, NULL, NULL, NULL }, + { + {"enable_2pc_recovery_info", PGC_POSTMASTER, DEVELOPER_OPTIONS, + gettext_noop("write extra file for 2pc crash recovery."), + NULL + }, + &enable_2pc_recovery_info, + true, + NULL, NULL, NULL + }, #endif #ifdef __AUDIT__ diff --git a/src/include/access/twophase.h b/src/include/access/twophase.h index 8a4831b0..cbf83a3e 100644 --- a/src/include/access/twophase.h +++ b/src/include/access/twophase.h @@ -92,6 +92,10 @@ extern int max_prepared_xacts; extern int transaction_threshold; +#ifdef __TBASE__ +extern bool enable_2pc_recovery_info; +#endif + extern Size TwoPhaseShmemSize(void); extern void TwoPhaseShmemInit(void); diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out index c3f67c12..9c939d43 100644 --- a/src/test/regress/expected/sysviews.out +++ b/src/test/regress/expected/sysviews.out @@ -72,6 +72,7 @@ select count(*) >= 0 as ok from pg_prepared_xacts; select name, setting from pg_settings where name like 'enable%'; name | setting -----------------------------------+--------- + enable_2pc_recovery_info | on enable_audit | off enable_audit_warning | off enable_auditlogger_warning | off From e504107243f55360c7a3f5aeb2936ff841b527d5 Mon Sep 17 00:00:00 2001 From: whalesong Date: Wed, 28 Apr 2021 21:36:20 +0800 Subject: [PATCH 360/578] 2pc files opt: add 2pc hash table on shmem (merge request !300) --- contrib/pg_clean/pg_clean.c | 353 +++----- src/backend/access/transam/twophase.c | 1054 +++++++++++++++++++++--- src/backend/access/transam/xlog.c | 28 +- src/backend/storage/ipc/ipci.c | 7 + src/backend/utils/misc/guc.c | 66 ++ src/include/access/twophase.h | 18 + src/include/catalog/pg_control.h | 368 ++++----- src/test/regress/expected/sysviews.out | 6 +- 8 files changed, 1330 insertions(+), 570 deletions(-) diff --git a/contrib/pg_clean/pg_clean.c b/contrib/pg_clean/pg_clean.c index 8d1514f4..459a2fc0 100644 --- a/contrib/pg_clean/pg_clean.c +++ b/contrib/pg_clean/pg_clean.c @@ -303,6 +303,8 @@ void getTxnInfoOnOtherNodes(txn_info *txn); int Get2PCXidByGid(Oid node_oid, char * gid, uint32 * transactionid); int Get2PCFile(Oid node_oid, char * gid, uint32 * transactionid); +char *get2PCInfo(const char *tid); + void getTxnStatus(txn_info * txn, int node_idx); void recover2PCForDatabaseAll(void); void recover2PCForDatabase(database_info * db_info); @@ -1615,23 +1617,28 @@ void getTxnStatus(txn_info *txn, int node_idx) DropTupleTableSlots(&result); } -Datum pgxc_get_2pc_file(PG_FUNCTION_ARGS); -PG_FUNCTION_INFO_V1(pgxc_get_2pc_file); -Datum pgxc_get_2pc_file(PG_FUNCTION_ARGS) +char *get2PCInfo(const char *tid) { - char *tid; - char path[MAXPGPATH]; - File fd; - int ret; - char *result; - text *t_result = NULL; + char *result = NULL; + char *info = NULL; + int size = 0; + File fd = -1; + int ret = -1; struct stat filestate; - off_t fileSize; + char path[MAXPGPATH]; - tid = text_to_cstring(PG_GETARG_TEXT_P(0)); + info = get_2pc_info_from_cache(tid); + if (NULL != info) + { + 
size = strlen(info); + result = (char *)palloc0(size + 1); + memcpy(result, info, size); + return result; + } - snprintf(path, MAXPGPATH, TWOPHASE_RECORD_DIR "/%s", tid); + elog(LOG, "try to get 2pc info from disk, tid: %s", tid); + snprintf(path, MAXPGPATH, TWOPHASE_RECORD_DIR "/%s", tid); if(access(path, F_OK) == 0) { if(stat(path, &filestate) == -1) @@ -1641,39 +1648,56 @@ Datum pgxc_get_2pc_file(PG_FUNCTION_ARGS) errmsg("could not get status of file \"%s\"", path))); } - fileSize = filestate.st_size; + size = filestate.st_size; - if (0 == fileSize) + if (0 == size) { - PG_RETURN_NULL(); + return NULL; } - result = (char *)palloc0(fileSize + 1); + result = (char *)palloc0(size + 1); fd = PathNameOpenFile(path, O_RDONLY, S_IRUSR | S_IWUSR); if (fd < 0) { + pfree(result); ereport(ERROR, (errcode_for_file_access(), errmsg("could not open file \"%s\" for read", path))); } - ret = FileRead(fd, result, fileSize, WAIT_EVENT_BUFFILE_READ); - - if(ret != fileSize) + ret = FileRead(fd, result, size, WAIT_EVENT_BUFFILE_READ); + if(ret != size) { + pfree(result); ereport(ERROR, (errcode_for_file_access(), errmsg("could not read file \"%s\"", path))); } FileClose(fd); - if (result) + return result; + } + + return NULL; +} + +Datum pgxc_get_2pc_file(PG_FUNCTION_ARGS); +PG_FUNCTION_INFO_V1(pgxc_get_2pc_file); +Datum pgxc_get_2pc_file(PG_FUNCTION_ARGS) +{ + char *tid = NULL; + char *result = NULL; + text *t_result = NULL; + + tid = text_to_cstring(PG_GETARG_TEXT_P(0)); + result = get2PCInfo(tid); + if (NULL != result) { t_result = cstring_to_text(result); + pfree(result); return PointerGetDatum(t_result); } - } PG_RETURN_NULL(); } @@ -1682,63 +1706,26 @@ Datum pgxc_get_2pc_nodes(PG_FUNCTION_ARGS); PG_FUNCTION_INFO_V1(pgxc_get_2pc_nodes); Datum pgxc_get_2pc_nodes(PG_FUNCTION_ARGS) { - char *tid; - char path[MAXPGPATH]; - File fd; - int ret; - char *result; - char *nodename; + char *tid = NULL; + char *result = NULL; + char *nodename = NULL; text *t_result = NULL; - struct stat filestate; - off_t fileSize; tid = text_to_cstring(PG_GETARG_TEXT_P(0)); - - snprintf(path, MAXPGPATH, TWOPHASE_RECORD_DIR "/%s", tid); - - if(access(path, F_OK) == 0) - { - if(stat(path, &filestate) == -1) - { - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not get status of file \"%s\"", path))); - } - - fileSize = filestate.st_size; - - result = (char *)palloc0(fileSize + 1); - - fd = PathNameOpenFile(path, O_RDONLY, S_IRUSR | S_IWUSR); - if (fd < 0) - { - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not open file \"%s\" for read", path))); - } - - ret = FileRead(fd, result, fileSize, WAIT_EVENT_BUFFILE_READ); - - if(ret != fileSize) - { - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not read file \"%s\"", path))); - } - - FileClose(fd); - if (result) + result = get2PCInfo(tid); + if (NULL != result) { nodename = strstr(result, GET_NODE); - if (nodename) + if (NULL != nodename) { nodename += strlen(GET_NODE); nodename = strtok(nodename, "\n"); t_result = cstring_to_text(nodename); + pfree(result); return PointerGetDatum(t_result); } } - } + PG_RETURN_NULL(); } @@ -1746,61 +1733,24 @@ Datum pgxc_get_2pc_startnode(PG_FUNCTION_ARGS); PG_FUNCTION_INFO_V1(pgxc_get_2pc_startnode); Datum pgxc_get_2pc_startnode(PG_FUNCTION_ARGS) { - char *tid; - char path[MAXPGPATH]; - File fd; - int ret; - char *result; - char *nodename; + char *tid = NULL; + char *result = NULL; + char *nodename = NULL; text *t_result = NULL; - struct stat filestate; - off_t fileSize; tid = 
text_to_cstring(PG_GETARG_TEXT_P(0)); - - snprintf(path, MAXPGPATH, TWOPHASE_RECORD_DIR "/%s", tid); - - if(access(path, F_OK) == 0) - { - if(stat(path, &filestate) == -1) - { - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not get status of file \"%s\"", path))); - } - - fileSize = filestate.st_size; - - result = (char *)palloc0(fileSize + 1); - - fd = PathNameOpenFile(path, O_RDONLY, S_IRUSR | S_IWUSR); - if (fd < 0) - { - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not open file \"%s\" for read", path))); - } - - ret = FileRead(fd, result, fileSize, WAIT_EVENT_BUFFILE_READ); - - if(ret != fileSize) - { - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not read file \"%s\"", path))); - } - - FileClose(fd); - if (result) + result = get2PCInfo(tid); + if (NULL != result) { nodename = strstr(result, GET_START_NODE); - if (nodename) + if (NULL != nodename) { nodename += strlen(GET_START_NODE); nodename = strtok(nodename, "\n"); t_result = cstring_to_text(nodename); + pfree(result); return PointerGetDatum(t_result); - } + } } PG_RETURN_NULL(); @@ -1810,63 +1760,25 @@ Datum pgxc_get_2pc_startxid(PG_FUNCTION_ARGS); PG_FUNCTION_INFO_V1(pgxc_get_2pc_startxid); Datum pgxc_get_2pc_startxid(PG_FUNCTION_ARGS) { - char *tid; - char path[MAXPGPATH]; - File fd; - int ret; - char *result; - char *startxid; + char *tid = NULL; + char *result = NULL; + char *startxid = NULL; text *t_result = NULL; - struct stat filestate; - off_t fileSize; tid = text_to_cstring(PG_GETARG_TEXT_P(0)); - - snprintf(path, MAXPGPATH, TWOPHASE_RECORD_DIR "/%s", tid); - - if(access(path, F_OK) == 0) - { - if(stat(path, &filestate) == -1) - { - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not get status of file \"%s\"", path))); - } - - fileSize = filestate.st_size; - - result = (char *)palloc0(fileSize + 1); - - fd = PathNameOpenFile(path, O_RDONLY, S_IRUSR | S_IWUSR); - if (fd < 0) - { - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not open file \"%s\" for read", path))); - } - - ret = FileRead(fd, result, fileSize, WAIT_EVENT_BUFFILE_READ); - - if(ret != fileSize) - { - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not read file \"%s\"", path))); - } - - FileClose(fd); - if (result) + result = get2PCInfo(tid); + if (NULL != result) { startxid = strstr(result, GET_START_XID); - if (startxid) + if (NULL != startxid) { startxid += strlen(GET_START_XID); startxid = strtok(startxid, "\n"); t_result = cstring_to_text(startxid); + pfree(result); return PointerGetDatum(t_result); } } - } PG_RETURN_NULL(); } @@ -1875,63 +1787,25 @@ Datum pgxc_get_2pc_commit_timestamp(PG_FUNCTION_ARGS); PG_FUNCTION_INFO_V1(pgxc_get_2pc_commit_timestamp); Datum pgxc_get_2pc_commit_timestamp(PG_FUNCTION_ARGS) { - char *tid; - char path[MAXPGPATH]; - File fd; - int ret; - char *result; - char *commit_timestamp; + char *tid = NULL; + char *result = NULL; + char *commit_timestamp = NULL; text *t_result = NULL; - struct stat filestate; - off_t fileSize; tid = text_to_cstring(PG_GETARG_TEXT_P(0)); - - snprintf(path, MAXPGPATH, TWOPHASE_RECORD_DIR "/%s", tid); - - if(access(path, F_OK) == 0) - { - if(stat(path, &filestate) == -1) - { - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not get status of file \"%s\"", path))); - } - - fileSize = filestate.st_size; - - result = (char *)palloc0(fileSize + 1); - - fd = PathNameOpenFile(path, O_RDONLY, S_IRUSR | S_IWUSR); - if (fd < 0) - { - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not open file 
\"%s\" for read", path))); - } - - ret = FileRead(fd, result, fileSize, WAIT_EVENT_BUFFILE_READ); - - if(ret != fileSize) - { - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not read file \"%s\"", path))); - } - - FileClose(fd); - if (result) + result = get2PCInfo(tid); + if (NULL != result) { commit_timestamp = strstr(result, GET_COMMIT_TIMESTAMP); - if (commit_timestamp) + if (NULL != commit_timestamp) { commit_timestamp += strlen(GET_COMMIT_TIMESTAMP); commit_timestamp = strtok(commit_timestamp, "\n"); t_result = cstring_to_text(commit_timestamp); + pfree(result); return PointerGetDatum(t_result); } } - } PG_RETURN_NULL(); } @@ -1941,61 +1815,24 @@ Datum pgxc_get_2pc_xid(PG_FUNCTION_ARGS); PG_FUNCTION_INFO_V1(pgxc_get_2pc_xid); Datum pgxc_get_2pc_xid(PG_FUNCTION_ARGS) { - char *tid; - char path[MAXPGPATH]; - File fd; - int ret; + char *tid = NULL; + char *result = NULL; + char *str_xid = NULL; GlobalTransactionId xid; - char *result; - char *str_xid; - struct stat filestate; - off_t fileSize; tid = text_to_cstring(PG_GETARG_TEXT_P(0)); - - snprintf(path, MAXPGPATH, TWOPHASE_RECORD_DIR "/%s", tid); - - if(access(path, F_OK) == 0) + result = get2PCInfo(tid); + if (NULL != result) { - if(stat(path, &filestate) == -1) - { - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not get status of file \"%s\"", path))); - } - - fileSize = filestate.st_size; - result = (char *)palloc0(fileSize + 1); - - fd = PathNameOpenFile(path, O_RDONLY, S_IRUSR | S_IWUSR); - if (fd < 0) - { - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not open file \"%s\" for read", path))); - } - - - ret = FileRead(fd, result, fileSize, WAIT_EVENT_BUFFILE_READ); - - if(ret != fileSize) - { - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not read file \"%s\"", path))); - } - - FileClose(fd); - str_xid = strstr(result, GET_XID); - if (str_xid) + if (NULL != str_xid) { str_xid += strlen(GET_XID); str_xid = strtok(str_xid, "\n"); xid = strtoul(str_xid, NULL, 10); + pfree(result); PG_RETURN_UINT32(xid); } - } PG_RETURN_NULL(); } @@ -2004,15 +1841,9 @@ Datum pgxc_remove_2pc_records(PG_FUNCTION_ARGS); PG_FUNCTION_INFO_V1(pgxc_remove_2pc_records); Datum pgxc_remove_2pc_records(PG_FUNCTION_ARGS) { -#define SLEEP_COUNT 1000 - char *tid = NULL; - - tid = text_to_cstring(PG_GETARG_TEXT_P(0)); - + char *tid = text_to_cstring(PG_GETARG_TEXT_P(0)); remove_2pc_records(tid, true); - pfree(tid); - PG_RETURN_BOOL(true); } @@ -2181,11 +2012,27 @@ Datum pgxc_get_record_list(PG_FUNCTION_ARGS) char *recordList = NULL; text *t_recordList = NULL; + /* get from hash table */ + recordList = get_2pc_list_from_cache(&count); + if (count >= MAXIMUM_OUTPUT_FILE) + { + Assert(NULL != recordList); + t_recordList = cstring_to_text(recordList); + return PointerGetDatum(t_recordList); + } + + /* get from disk */ if(!(dir = opendir(TWOPHASE_RECORD_DIR))) { + if(NULL == recordList) + { PG_RETURN_NULL(); } + t_recordList = cstring_to_text(recordList); + return PointerGetDatum(t_recordList); + } + while((ptr = readdir(dir)) != NULL) { if(strcmp(ptr->d_name,".") == 0 || strcmp(ptr->d_name,"..") == 0) @@ -2193,7 +2040,9 @@ Datum pgxc_get_record_list(PG_FUNCTION_ARGS) continue; } if (count >= MAXIMUM_OUTPUT_FILE) + { break; + } if(!recordList) { diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c index 2d1cbd4a..46570383 100644 --- a/src/backend/access/transam/twophase.c +++ b/src/backend/access/transam/twophase.c @@ -130,12 +130,13 @@ #define TWOPHASE_DIR "pg_twophase" 
#define TWOPHASE_RECORD_DIR "pg_2pc" + +#define GET_2PC_FILE_PATH(path, tid) \ + snprintf(path, MAXPGPATH, TWOPHASE_RECORD_DIR "/%s", tid) + int transaction_threshold = 200000; -#define GET_START_XID "startxid:" -#define GET_COMMIT_TIMESTAMP "global_commit_timestamp:" + #define GET_START_NODE "startnode:" -#define GET_NODE "nodes:" -#define GET_XID "xid:" /* GUC variable, can't be changed after startup */ #ifdef PGXC @@ -147,6 +148,38 @@ int max_prepared_xacts = 0; bool enable_2pc_recovery_info = true; #endif +#ifdef __TWO_PHASE_TRANS__ +static HTAB *record_2pc_cache = NULL; + +bool enable_2pc_file_cache = true; +bool enable_2pc_file_check = true; +bool enable_2pc_entry_key_check = true; +bool enable_2pc_entry_trace = false; + +int record_2pc_cache_size = 50000; +int record_2pc_entry_size = 2048; +int record_2pc_partitions = 32; + +#define MAX_OUTPUT_FILE 1000 + +#define MAX_TID_SIZE MAXPGPATH +#define MAX_2PC_INFO_SIZE (record_2pc_entry_size - MAX_TID_SIZE) +#define DFLT_2PC_INFO_SIZE 1024 /* default size */ + +/* hash table entry for 2pc record */ +typedef struct Cache2pcInfo +{ + char key[MAX_TID_SIZE]; /* hash key: tid */ + char info[DFLT_2PC_INFO_SIZE]; + +} Cache2pcInfo; + +inline void +check_entry_key(const char *tid, const char *key, const char *func); + +void +check_2pc_file(const char *tid, const char *info, const char *func); +#endif static GlobalTransaction LookupGXact(const char *gid, Oid user); @@ -2107,6 +2140,12 @@ FinishPreparedTransaction(const char *gid, bool isCommit) { remove_2pc_records(gid, false); } + else + { + /* rename 2pc file when rollback on the current node */ + rename_2pc_records(gid, 0); + } + ClearLocalTwoPhaseState(); if (isCommit) @@ -2263,6 +2302,17 @@ CheckPointTwoPhase(XLogRecPtr redo_horizon) int i; int serialized_xacts = 0; +#ifdef __TWO_PHASE_TRANS__ + File fd = -1; + int ret = 0; + int size = 0; + Cache2pcInfo *entry = NULL; + bool found = false; + char path[MAXPGPATH]; +#endif + + elog(LOG, "[CheckPointTwoPhase] checkpoint: "UINT64_FORMAT, redo_horizon); + if (max_prepared_xacts <= 0) return; /* nothing to do */ @@ -2300,17 +2350,167 @@ CheckPointTwoPhase(XLogRecPtr redo_horizon) char *buf; int len; + /* save to pg_twophase */ XlogReadTwoPhaseData(gxact->prepare_start_lsn, &buf, &len); RecreateTwoPhaseFile(gxact->xid, buf, len); + pfree(buf); + +#ifdef __TWO_PHASE_TRANS__ + /* save to pg_2pc */ + if (NULL != record_2pc_cache) + { + Assert(strlen(gxact->gid) < MAX_TID_SIZE); + entry = (Cache2pcInfo *)hash_search(record_2pc_cache, + gxact->gid, HASH_FIND, &found); + if (found) + { + /* save to file */ + Assert(NULL != entry); + check_entry_key(gxact->gid, entry->key, "CheckPointTwoPhase"); + check_2pc_file(gxact->gid, entry->info, "CheckPointTwoPhase"); + + elog(LOG, "[CheckPointTwoPhase] %s is found " + "in hash table", gxact->gid); + + size = strlen(entry->info); + + memset(path, 0, MAXPGPATH); + GET_2PC_FILE_PATH(path, gxact->gid); + + fd = open(path, O_RDWR | O_CREAT | O_EXCL, S_IRUSR | S_IWUSR); + if (fd < 0) + { + elog(ERROR, "[CheckPointTwoPhase] could not create file %s, " + "errMsg: %s", path, strerror(errno)); + } + + ret = write(fd, entry->info, size); + if(ret != size) + { + close(fd); + elog(ERROR, "[CheckPointTwoPhase] could not write file %s, " + "errMsg: %s, ret: %d, info: %s", + path, strerror(errno), ret, entry->info); + } + close(fd); + + /* remove from hash table */ + entry = (Cache2pcInfo *)hash_search(record_2pc_cache, + gxact->gid, HASH_REMOVE, &found); + if (!found) + { + elog(WARNING, "[CheckPointTwoPhase] %s is not found " 
+ "in hash table when remove it", gxact->gid); + } + else if (enable_2pc_entry_trace) + { + elog(LOG, "[CheckPointTwoPhase] %s is removed " + "from hash table", gxact->gid); + } + } + else + { + elog(LOG, "[CheckPointTwoPhase] %s is not found " + "in hash table", gxact->gid); + } + } +#endif + gxact->ondisk = true; gxact->prepare_start_lsn = InvalidXLogRecPtr; gxact->prepare_end_lsn = InvalidXLogRecPtr; - pfree(buf); serialized_xacts++; } } LWLockRelease(TwoPhaseStateLock); +#ifdef __TWO_PHASE_TRANS__ + /* start node maybe no in prepared xacts */ + if (IS_PGXC_COORDINATOR && NULL != record_2pc_cache) + { + HASH_SEQ_STATUS seq; + Cache2pcInfo *entry = NULL; + char *start_node = NULL; + char info[MAX_2PC_INFO_SIZE]; + + hash_seq_init(&seq, record_2pc_cache); + while ((entry = hash_seq_search(&seq)) != NULL) + { + Assert(NULL != entry); + check_2pc_file(entry->key, entry->info, "CheckPointTwoPhase"); + + elog(LOG, "[CheckPointTwoPhase] key %s is found " + "in hash table", entry->key); + + if (IsXidImplicit(entry->key)) + { + memset(info, 0, MAX_2PC_INFO_SIZE); + memcpy(info, entry->info, strlen(entry->info)); + + start_node = strstr(info, GET_START_NODE); + if (NULL != start_node) + { + start_node += strlen(GET_START_NODE); + start_node = strtok(start_node, "\n"); + + if (0 != strcmp(start_node, PGXCNodeName)) + { + elog(LOG, "[CheckPointTwoPhase] %s start node is not %s", + entry->key, PGXCNodeName); + continue; + } + else + { + elog(LOG, "[CheckPointTwoPhase] %s start node is %s", + entry->key, PGXCNodeName); + } + } + else + { + elog(WARNING, "[CheckPointTwoPhase] %s get start node failed, " + "info: %s", entry->key, entry->info); + } + } + + size = strlen(entry->info); + + memset(path, 0, MAXPGPATH); + GET_2PC_FILE_PATH(path, entry->key); + + fd = open(path, O_RDWR | O_CREAT | O_EXCL, S_IRUSR | S_IWUSR); + if (fd < 0) + { + elog(ERROR, "[CheckPointTwoPhase] could not create file %s, " + "errMsg: %s", path, strerror(errno)); + } + + ret = write(fd, entry->info, size); + if(ret != size) + { + close(fd); + elog(ERROR, "[CheckPointTwoPhase] could not write file %s, " + "errMsg: %s, ret: %d, info: %s", + path, strerror(errno), ret, entry->info); + } + close(fd); + + /* remove from hash table */ + entry = (Cache2pcInfo *)hash_search(record_2pc_cache, + entry->key, HASH_REMOVE, &found); + if (!found) + { + elog(WARNING, "[CheckPointTwoPhase] %s is not found " + "in hash table when remove it", entry->key); + } + else if (enable_2pc_entry_trace) + { + elog(LOG, "[CheckPointTwoPhase] %s is removed " + "from hash table", entry->key); + } + } + } +#endif + /* * Flush unconditionally the parent directory to make any information * durable on disk. Two-phase files could have been removed and those @@ -3146,6 +3346,117 @@ PrepareRedoRemove(TransactionId xid, bool giveWarning) } #ifdef __TWO_PHASE_TRANS__ +/* + * Check the entry key in the hash table is same with tid. + */ +inline void check_entry_key(const char *tid, const char *key, const char *func) +{ + if (!enable_2pc_entry_key_check) + { + return; + } + + if (0 != strcmp(tid, key)) + { + elog(PANIC, "[%s] %s get wrong key: %s", func, tid, key); + } +} + +/* + * Check whether the 2pc file is exist when it is saved in the hash table. 
+ */ +void check_2pc_file(const char *tid, const char *info, const char *func) +{ + if (enable_2pc_file_check) + { + int size = 0; + struct stat filestate; + char path[MAXPGPATH]; + Cache2pcInfo *entry = NULL; + bool found = false; + + Assert (NULL != tid); + Assert (NULL != info); + Assert (NULL != func); + + GET_2PC_FILE_PATH(path, tid); + if (0 != access(path, F_OK)) + { + return; + } + + elog(LOG, "[check_2pc_file][%s] node(%s) found file %s", + func, PGXCNodeName, path); + + if(stat(path, &filestate) == -1) + { + elog(ERROR, "[check_2pc_file][%s] could not get status of file %s", + func, path); + } + + size = filestate.st_size; + + if (0 != size) + { + int ret = 0; + File fd = -1; + char result[size + 1]; + + fd = PathNameOpenFile(path, O_RDONLY, S_IRUSR | S_IWUSR); + if (fd < 0) + { + elog(ERROR, "[check_2pc_file][%s] could not open file %s for read", + func, path); + } + + memset(result, 0, size +1); + ret = FileRead(fd, result, size, WAIT_EVENT_BUFFILE_READ); + if(ret != size) + { + FileClose(fd); + elog(ERROR, "[check_2pc_file][%s] read %s error, ret: %d, size: %d", + func, path, ret, size); + } + FileClose(fd); + + if (0 != strcmp(result, info)) + { + elog(LOG, "[check_2pc_file][%s] file %s result: %s, info: %s", + func, path, result, info); + } + } + else + { + elog(LOG, "[check_2pc_file][%s] get empty file %s, info: %s", + func, path, info); + } + + if (NULL == record_2pc_cache) + { + elog(LOG, "[check_2pc_file][%s] record_2pc_cache is NULL, " + "tid: %s, info: %s", func, tid, info); + return; + } + + entry = (Cache2pcInfo *)hash_search(record_2pc_cache, + tid, HASH_FIND, &found); + if (!found) + { + elog(LOG, "[check_2pc_file][%s] %s is not found " + "in hash table, info: %s", func, tid, info); + return; + } + + Assert (NULL != entry); + + if (0 != strcmp(entry->info, info)) + { + elog(LOG, "[check_2pc_file][%s] %s info change from '%s' to '%s'", + func, tid, info, entry->info); + } + } +} + void record_2pc_redo_remove_gid_xid(TransactionId xid) { int i; @@ -3190,6 +3501,8 @@ void record_2pc_involved_nodes_xid(const char * tid, char path[MAXPGPATH]; off_t fileSize; char *result = NULL; + Cache2pcInfo *entry = NULL; + bool found = false; #ifdef __TWO_PHASE_TESTS__ XLogRecPtr xlogrec = 0; #endif @@ -3199,31 +3512,65 @@ void record_2pc_involved_nodes_xid(const char * tid, return ; } - if (enable_distri_print) + if (enable_distri_print || enable_2pc_entry_trace) { - elog(LOG, "record twophase txn gid: %s, startnode: %s, participants: %s", tid, startnode, nodestring); + elog(LOG, "[record_2pc_involved_nodes_xid] record %s, " + "startnode: %s, participants: %s", + tid, startnode, nodestring); } if (NULL == tid || '\0' == tid[0]) { - elog(ERROR, "record twophase txn GID is empty"); + elog(ERROR, "[record_2pc_involved_nodes_xid] gid is empty"); } if (NULL == startnode || '\0' == startnode[0]) { - elog(PANIC, "record twophase txn gid: %s, startnode is empty", tid); + elog(PANIC, "[record_2pc_involved_nodes_xid] %s startnode is empty", tid); } if (NULL == nodestring || '\0' == nodestring[0]) { - elog(PANIC, "record twophase txn gid: %s, participants is empty", tid); + elog(PANIC, "[record_2pc_involved_nodes_xid] %s participants is empty", tid); } - /* the 2pc dir is already created in initdb */ - snprintf(path, MAXPGPATH, TWOPHASE_RECORD_DIR "/%s", tid); + initStringInfo(&content); + appendStringInfo(&content, "startnode:%s\n", startnode); + appendStringInfo(&content, "startxid:%u\n", startxid); + appendStringInfo(&content, "nodes:%s\n", nodestring); + appendStringInfo(&content, 
"xid:%u\n", xid); + size = content.len; + + Assert(size == strlen(content.data)); /* if in_pg_clean, then check whether the file exists */ if (g_twophase_state.in_pg_clean) { + /* if tid already exists, check content and return */ + if (NULL != record_2pc_cache) + { + Assert(strlen(tid) < MAX_TID_SIZE); + entry = (Cache2pcInfo *)hash_search(record_2pc_cache, tid, HASH_FIND, &found); + if (found) + { + Assert(NULL != entry); + check_entry_key(tid, entry->key, "record_2pc_involved_nodes_xid"); + check_2pc_file(tid, entry->info, "record_2pc_involved_nodes_xid"); + + if (strncmp(entry->info, content.data, size) != 0) + { + elog(ERROR, "[record_2pc_involved_nodes_xid] pg_clean attemp to " + "write %s info conflict, content: %s, info: %s", + tid, content.data, entry->info); + } + + resetStringInfo(&content); + pfree(content.data); + return; + } + } + + GET_2PC_FILE_PATH(path, tid); + /* if file already exists, check content and return */ if (stat(path, &fst) >= 0) { @@ -3235,40 +3582,111 @@ void record_2pc_involved_nodes_xid(const char * tid, { ereport(ERROR, (errcode_for_file_access(), - errmsg("could not open file \"%s\" for read", path))); + errmsg("[record_2pc_involved_nodes_xid] could not " + "open file %s for read", path))); } ret = FileRead(fd, result, fileSize, WAIT_EVENT_BUFFILE_READ); if(ret != fileSize) { + FileClose(fd); ereport(ERROR, (errcode_for_file_access(), - errmsg("could not read file \"%s\"", path))); + errmsg("[record_2pc_involved_nodes_xid] could not " + "read file %s, ret: %d", path, ret))); } - FileClose(fd); - if (result) + + Assert(NULL != result); + + if (strncmp(result, content.data, size) != 0) { - initStringInfo(&content); - appendStringInfo(&content, "startnode:%s\n", startnode); - appendStringInfo(&content, "startxid:%u\n", startxid); - appendStringInfo(&content, "nodes:%s\n", nodestring); - appendStringInfo(&content, "xid:%u\n", xid); - if (strncmp(result, content.data, content.len) != 0) - { - elog(ERROR, "pg_clean attemp to write 2pc file conflict with file '%s', " - "attemp to write startnode: %s, startxid: %u, " - "nodestring: %s, xid: %u", tid, startnode, startxid, nodestring, xid); - } - else - { - resetStringInfo(&content); - pfree(content.data); - return; - } + elog(ERROR, "[record_2pc_involved_nodes_xid] pg_clean attemp to " + "write %s info conflict, content: %s, info: %s", + tid, content.data, result); } + + pfree(result); + + resetStringInfo(&content); + pfree(content.data); + return; } } + if (!RecoveryInProgress()) + { + XLogBeginInsert(); + XLogRegisterData((char *)tid, strlen(tid) + 1); + XLogRegisterData((char *)startnode, strlen(startnode) + 1); + XLogRegisterData((char *)&startxid, sizeof(GlobalTransactionId) + 1); + XLogRegisterData((char *)nodestring, strlen(nodestring) + 1); + XLogRegisterData((char *)&xid, sizeof(GlobalTransactionId) + 1); +#ifdef __TWO_PHASE_TESTS__ + xlogrec = +#endif + XLogInsert(RM_XLOG_ID, XLOG_CREATE_2PC_FILE); +#ifdef __TWO_PHASE_TESTS__ + if (PART_PREPARE_AFTER_RECORD_2PC == twophase_exception_case && + g_twophase_state.is_start_node) + { + XLogFlush(xlogrec); + run_pg_clean = 1; + complish = true; + elog(STOP, "[record_2pc_involved_nodes_xid] twophase exception: " + "simulate kill start node after record 2pc file"); + } +#endif + } + + if (NULL != record_2pc_cache && size < MAX_2PC_INFO_SIZE) + { + Assert(strlen(tid) < MAX_TID_SIZE); + entry = (Cache2pcInfo *)hash_search(record_2pc_cache, + tid, HASH_ENTER_NULL, &found); + if (NULL != entry) + { + check_entry_key(tid, entry->key, "record_2pc_involved_nodes_xid"); 
+ check_2pc_file(tid, entry->info, "record_2pc_involved_nodes_xid"); + + if (found) + { + if (RecoveryInProgress()) + { + elog(LOG, "[record_2pc_involved_nodes_xid] %s is found " + "in hash table in recovery mode", tid); + } + else + { + elog(LOG, "[record_2pc_involved_nodes_xid] %s is found " + "in hash table", tid); + } + } + else if (enable_2pc_entry_trace) + { + elog(LOG, "[record_2pc_involved_nodes_xid] %s is added " + "to hash table", tid); + } + + memset(entry->info, 0, MAX_2PC_INFO_SIZE); + memcpy(entry->info, content.data, size); + + resetStringInfo(&content); + pfree(content.data); + return; + } + else + { + elog(LOG, "[record_2pc_involved_nodes_xid] %s entry is NULL", tid); + } + } + else if (NULL != record_2pc_cache) + { + elog(LOG, "[record_2pc_involved_nodes_xid] %s size: %d, " + "max info size: %d", tid, size, MAX_2PC_INFO_SIZE); + } + + GET_2PC_FILE_PATH(path, tid); + /* * we open 2pc file under the following two different situations: * a. if in recovery mode, @@ -3287,49 +3705,23 @@ void record_2pc_involved_nodes_xid(const char * tid, } if (fd < 0) { - elog(ERROR, "could not create 2pc file \"%s\", errMsg:%s", path, strerror(errno)); + elog(ERROR, "[record_2pc_involved_nodes_xid] could not create file %s, " + "errMsg: %s", path, strerror(errno)); return; } - initStringInfo(&content); - appendStringInfo(&content, "startnode:%s\n", startnode); - appendStringInfo(&content, "startxid:%u\n", startxid); - appendStringInfo(&content, "nodes:%s\n", nodestring); - appendStringInfo(&content, "xid:%u\n", xid); - size = strlen(content.data); ret = FileWrite(fd, content.data, size, WAIT_EVENT_BUFFILE_WRITE); if(ret != size) { - elog(ERROR, "could not write 2pc file \"%s\", errMsg:%s", path, strerror(errno)); + FileClose(fd); + elog(ERROR, "[record_2pc_involved_nodes_xid] could not write file %s, " + "errMsg: %s, ret: %d, content: %s", + path, strerror(errno), ret, content.data); } - resetStringInfo(&content); - pfree(content.data); FileClose(fd); - if (!RecoveryInProgress()) - { - XLogBeginInsert(); - XLogRegisterData((char *)tid, strlen(tid)+1); - XLogRegisterData((char *)startnode, strlen(startnode)+1); - XLogRegisterData((char *)&startxid, sizeof(GlobalTransactionId) + 1); - XLogRegisterData((char *)nodestring, strlen(nodestring)+1); - XLogRegisterData((char *)&xid, sizeof(GlobalTransactionId) + 1); -#ifdef __TWO_PHASE_TESTS__ - xlogrec = -#endif - XLogInsert(RM_XLOG_ID, XLOG_CREATE_2PC_FILE); -#ifdef __TWO_PHASE_TESTS__ - if (PART_PREPARE_AFTER_RECORD_2PC == twophase_exception_case && - g_twophase_state.is_start_node) - { - XLogFlush(xlogrec); - run_pg_clean = 1; - complish = true; - elog(STOP, "twophase exception: simulate kill start node after record 2pc file"); - } -#endif - } - + resetStringInfo(&content); + pfree(content.data); } /* record commit timestamp in 2pc file while twophase trans failed in commit phase in the current node */ @@ -3338,10 +3730,13 @@ void record_2pc_commit_timestamp(const char *tid, GlobalTimestamp commit_timesta char path[MAXPGPATH]; char file_content[2048]; StringInfoData content; - File fd; - int ret; - int size; + File fd = -1; + int ret = 0; + int size = 0; + int new_size = 0; XLogRecPtr xlogrec = 0; + Cache2pcInfo *entry = NULL; + bool found = false; #if 0 int i; GlobalTransaction gxact = NULL; @@ -3352,30 +3747,146 @@ void record_2pc_commit_timestamp(const char *tid, GlobalTimestamp commit_timesta return ; } - if (enable_distri_print) + if (enable_distri_print || enable_2pc_entry_trace) { - elog(LOG, "record twophase txn gid: %s, commit_timestamp: 
%ld", tid, commit_timestamp); + elog(LOG, "[record_2pc_commit_timestamp] %s commit_timestamp: " + INT64_FORMAT, tid, commit_timestamp); } Assert(tid[0] != '\0'); if (InvalidGlobalTimestamp == commit_timestamp && (TWO_PHASE_COMMITTING == g_twophase_state.state || TWO_PHASE_COMMIT_END == g_twophase_state.state)) { - elog(ERROR, "can not commit transaction'%s' on node '%s' with InvalidGlobalTimestamp", tid, PGXCNodeName); + elog(ERROR, "[record_2pc_commit_timestamp] could not commit " + "transaction '%s' on node '%s' with InvalidGlobalTimestamp", + tid, PGXCNodeName); } + if (!RecoveryInProgress()) + { + XLogBeginInsert(); + XLogRegisterData((char *)tid, strlen(tid) + 1); + XLogRegisterData((char *)&commit_timestamp, sizeof(GlobalTimestamp) + 1); + xlogrec = XLogInsert(RM_XLOG_ID, XLOG_RECORD_2PC_TIMESTAMP); + /* only start node need to flush and sync XLOG_RECORD_2PC_TIMESTAMP */ + if (IS_PGXC_LOCAL_COORDINATOR) + { + XLogFlush(xlogrec); + SyncRepWaitForLSN(xlogrec, false); + } + } - /* the 2pc dir is already created in initdb */ - snprintf(path, MAXPGPATH, TWOPHASE_RECORD_DIR "/%s", tid); + initStringInfo(&content); + appendStringInfo(&content, "global_commit_timestamp:"INT64_FORMAT"\n", + commit_timestamp); + size = strlen(content.data); - /* the 2pc file exists already */ - fd = open(path, O_RDWR | O_APPEND, S_IRUSR | S_IWUSR);//PathNameOpenFile(path, O_RDWR | O_APPEND, S_IRUSR | S_IWUSR); + if (NULL != record_2pc_cache) + { + Assert(strlen(tid) < MAX_TID_SIZE); + entry = (Cache2pcInfo *)hash_search(record_2pc_cache, tid, HASH_FIND, &found); + if (found) + { + Assert(NULL != entry); + check_entry_key(tid, entry->key, "record_2pc_commit_timestamp"); + check_2pc_file(tid, entry->info, "record_2pc_commit_timestamp"); + + if (RecoveryInProgress()) + { + elog(LOG, "[record_2pc_commit_timestamp] %s is found " + "in hash table in recovery mode", tid); + } + else if (enable_2pc_entry_trace) + { + elog(LOG, "[record_2pc_commit_timestamp] %s is found " + "in hash table", tid); + } + + new_size = size + strlen(entry->info); + + if (new_size >= MAX_2PC_INFO_SIZE) + { + /* save to file */ + elog(LOG, "[record_2pc_commit_timestamp] %s new size(%d) " + "overflow(%d)", tid, new_size, MAX_2PC_INFO_SIZE); + + GET_2PC_FILE_PATH(path, tid); + + fd = open(path, O_RDWR | O_CREAT | O_EXCL, S_IRUSR | S_IWUSR); if (fd < 0) { - if (enable_distri_print) + if (RecoveryInProgress()) + { + elog(LOG, "[record_2pc_commit_timestamp] could not " + "append timestamp in file %s, errMsg: %s", + path, strerror(errno)); + } + else { - elog(LOG, "cannot open 2pc file %s", tid); + elog(ERROR, "[record_2pc_commit_timestamp] could not " + "append timestamp in file %s, errMsg: %s", + path, strerror(errno)); + } + return; } + + ret = write(fd, entry->info, strlen(entry->info)); + if(ret != new_size) + { + close(fd); + elog(ERROR, "[record_2pc_commit_timestamp] could not write " + "file %s, errMsg: %s, ret: %d, info: %s", + path, strerror(errno), ret, entry->info); + } + ret = write(fd, content.data, size); + if(ret != new_size) + { + close(fd); + elog(ERROR, "[record_2pc_commit_timestamp] could not write " + "file %s, errMsg: %s, ret: %d, info: %s", + path, strerror(errno), ret, content.data); + } + close(fd); + + /* remove from hash table */ + entry = (Cache2pcInfo *)hash_search(record_2pc_cache, + tid, HASH_REMOVE, &found); + if (!found) + { + elog(WARNING, "[record_2pc_commit_timestamp] %s is not found" + "in hash table when remove it", tid); + } + else if (enable_2pc_entry_trace) + { + elog(LOG, "[record_2pc_commit_timestamp] %s 
is removed " + "from hash table", entry->key); + } + + resetStringInfo(&content); + pfree(content.data); + return; + } + + /* save to hash table */ + memcpy(entry->info + strlen(entry->info), content.data, size); + + resetStringInfo(&content); + pfree(content.data); + return; + } + else + { + elog(LOG, "[record_2pc_commit_timestamp] %s is not found " + "in hash table", tid); + } + } + + GET_2PC_FILE_PATH(path, tid); + + /* the 2pc file exists already */ + fd = open(path, O_RDWR | O_APPEND, S_IRUSR | S_IWUSR); + if (fd < 0) + { if (RecoveryInProgress()) { #if 0 @@ -3388,109 +3899,242 @@ void record_2pc_commit_timestamp(const char *tid, GlobalTimestamp commit_timesta } if (0 == strcmp(gxact->gid, tid)) { - elog(ERROR, "in record_2pc_commit_timestamp could not append timestamp in 2pc file \"%s\", errMsg:%s", path, strerror(errno)); + elog(ERROR, "[record_2pc_commit_timestamp] could not " + "append timestamp in file %s, errMsg: %s", + path, strerror(errno)); } } #endif - elog(LOG, "in record_2pc_commit_timestamp could not append timestamp in 2pc file \"%s\", errMsg:%s", path, strerror(errno)); + elog(LOG, "[record_2pc_commit_timestamp] could not open file %s, " + "errMsg: %s", path, strerror(errno)); } else { - elog(ERROR, "in record_2pc_commit_timestamp could not append timestamp in 2pc file \"%s\", errMsg:%s", path, strerror(errno)); + elog(ERROR, "[record_2pc_commit_timestamp] could not open file %s, " + "errMsg: %s", path, strerror(errno)); } return; } - if (!RecoveryInProgress()) - { - XLogBeginInsert(); - XLogRegisterData((char *)tid, strlen(tid)+1); - XLogRegisterData((char *)&commit_timestamp, sizeof(GlobalTimestamp)); - xlogrec = XLogInsert(RM_XLOG_ID, XLOG_RECORD_2PC_TIMESTAMP); - /* only start node need to flush and sync XLOG_RECORD_2PC_TIMESTAMP */ - if (IS_PGXC_LOCAL_COORDINATOR) - { - XLogFlush(xlogrec); - SyncRepWaitForLSN(xlogrec, false); - } - } - if (enable_distri_print) { - (void) read(fd, file_content, 2048);//FileRead(fd, file_content, 2048, WAIT_EVENT_BUFFILE_READ); - elog(LOG, "before append 2pc file: %s, file_content: %s", tid, file_content); + memset(file_content, 0, 2048); + ret = read(fd, file_content, 2048); + elog(LOG, "[record_2pc_commit_timestamp] before append file: %s, " + "file_content: %s, content.data: %s, ret: %d", + path, file_content, content.data, ret); } - initStringInfo(&content); - appendStringInfo(&content, "global_commit_timestamp:"INT64_FORMAT"\n", commit_timestamp); - size = strlen(content.data); - if (enable_distri_print) - { - elog(LOG, "before append 2pc file: %s, content.data: %s", tid, content.data); - } ret = write(fd, content.data, size); if(ret != size) { - if (enable_distri_print) - { - elog(LOG, "cannot append timestamp to 2pc file %s", tid); - } - elog(ERROR, "in could not write 2pc file \"%s\", errMsg:%s", path, strerror(errno)); + close(fd); + elog(ERROR, "[record_2pc_commit_timestamp] could not write file %s, " + "errMsg: %s", path, strerror(errno)); } + if (enable_distri_print) { memset(file_content, 0, 2048); lseek(fd, 0, SEEK_SET); ret = read(fd, file_content, 2048); - elog(LOG, "after append 2pc file: %s, file_content: %s, ret = %d", tid, file_content, ret); + elog(LOG, "[record_2pc_commit_timestamp] after append file: %s, " + "file_content: %s, ret: %d", tid, file_content, ret); } + + close(fd); + resetStringInfo(&content); pfree(content.data); - close(fd); } void remove_2pc_records(const char * tid, bool record_in_xlog) { char path[MAXPGPATH]; + Cache2pcInfo *entry = NULL; + bool found = false; if (!enable_2pc_recovery_info) { 
return ; } - snprintf(path, MAXPGPATH, TWOPHASE_RECORD_DIR "/%s", tid); + if (enable_distri_print || enable_2pc_entry_trace) + { + elog(LOG, "[remove_2pc_records] %s record_in_xlog: %d", + tid, record_in_xlog); + } - /* no need to check file exists. since when it do not exists , unlink won't success */ if (!RecoveryInProgress() && record_in_xlog) { + char *type = "remove"; XLogBeginInsert(); XLogRegisterData((char *)tid, strlen(tid)+1); + XLogRegisterData((char *)type, strlen(type) + 1); XLogInsert(RM_XLOG_ID, XLOG_CLEAN_2PC_FILE); } + + if (NULL != record_2pc_cache) + { + Assert(strlen(tid) < MAX_TID_SIZE); + if (enable_2pc_entry_key_check) + { + entry = (Cache2pcInfo *)hash_search(record_2pc_cache, + tid, HASH_FIND, &found); + if (found) + { + Assert(NULL != entry); + check_entry_key(tid, entry->key, "remove_2pc_records"); + check_2pc_file(tid, entry->info, "remove_2pc_records"); + } + } + entry = (Cache2pcInfo *)hash_search(record_2pc_cache, + tid, HASH_REMOVE, &found); + if (found) + { + Assert(NULL != entry); + if (enable_2pc_entry_trace) + { + elog(LOG, "[remove_2pc_records] %s is removed " + "from hash table", tid); + } + return; + } + } + + GET_2PC_FILE_PATH(path, tid); + + /* + * no need to check file exists. + * since when it do not exists, unlink won't success. + */ if (0 != unlink(path)) { - elog(LOG, "node: %s fail to remove 2pc file: %s", PGXCNodeName, tid); + elog(LOG, "[remove_2pc_records] could not unlink file %s, " + "errMsg: %s", path, strerror(errno)); } } +void rename_2pc_records(const char *tid, TimestampTz timestamp) +{ + char path[MAXPGPATH]; + char new_path[MAXPGPATH]; + Cache2pcInfo *entry = NULL; + bool found = false; + File fd = 0; + int ret = 0; + + if (!enable_2pc_recovery_info) + { + return; + } + + if (enable_distri_print || enable_2pc_entry_trace) + { + elog(LOG, "[rename_2pc_records] %s timestamp: " + INT64_FORMAT, tid, timestamp); + } + + if (0 == timestamp) + { + timestamp = GetCurrentTimestamp(); + } + + if (!RecoveryInProgress()) + { + char *type = "rename"; + XLogBeginInsert(); + XLogRegisterData((char *)tid, strlen(tid) + 1); + XLogRegisterData((char *)type, strlen(type) + 1); + XLogRegisterData((char *)×tamp, sizeof(TimestampTz) + 1); + XLogInsert(RM_XLOG_ID, XLOG_CLEAN_2PC_FILE); + } + + GET_2PC_FILE_PATH(path, tid); + snprintf(new_path, MAXPGPATH, "%s." 
INT64_FORMAT ".rollback", path, timestamp); + + if (NULL != record_2pc_cache) + { + Assert(strlen(tid) < MAX_TID_SIZE); + entry = (Cache2pcInfo *)hash_search(record_2pc_cache, + tid, HASH_FIND, &found); + if (found) + { + Assert(NULL != entry); + check_entry_key(tid, entry->key, "rename_2pc_records"); + check_2pc_file(tid, entry->info, "rename_2pc_records"); + + fd = PathNameOpenFile(new_path, O_RDWR | O_CREAT | O_EXCL, + S_IRUSR | S_IWUSR); + if (fd < 0) + { + elog(ERROR, "[rename_2pc_records] could not create file %s, " + "errMsg: %s", new_path, strerror(errno)); + } + + ret = FileWrite(fd, entry->info, strlen(entry->info), + WAIT_EVENT_BUFFILE_WRITE); + if(ret != strlen(entry->info)) + { + FileClose(fd); + elog(ERROR, "[rename_2pc_records] could not write file %s, " + "errMsg: %s, ret: %d, info: %s", + path, strerror(errno), ret, entry->info); + } + FileClose(fd); + + entry = (Cache2pcInfo *)hash_search(record_2pc_cache, + tid, HASH_REMOVE, &found); + if (!found) + { + elog(ERROR, "[rename_2pc_records] %s is not found " + "in hash table when remove it", tid); + } + else if (enable_2pc_entry_trace) + { + elog(LOG, "[rename_2pc_records] %s is removed " + "from hash table", tid); + } + return; + } + } + + if (0 != access(path, F_OK)) + { + elog(LOG, "[rename_2pc_records] could not access file %s, " + "errMsg: %s", path, strerror(errno)); + return; + } + if (0 != link(path, new_path)) + { + elog(ERROR, "[rename_2pc_records] could not link file %s to %s, " + "errMsg: %s", path, new_path, strerror(errno)); + } + if (0 != unlink(path)) + { + elog(WARNING, "[rename_2pc_records] could not unlink file %s, " + "errMsg: %s", path, strerror(errno)); + } +} + void record_2pc_readonly(const char *gid) { File fd = 0; int ret = 0; char path[MAXPGPATH]; char content[10] = "readonly"; + Cache2pcInfo *entry = NULL; + bool found = false; if(!enable_2pc_recovery_info) { return ; } - if (enable_distri_print) + if (enable_distri_print || enable_2pc_entry_trace) { - elog(LOG, "record readonly twophase txn gid: %s", gid); + elog(LOG, "[record_2pc_readonly] %s is readonly", gid); } - /* the 2pc dir is already created in initdb */ - snprintf(path, MAXPGPATH, TWOPHASE_RECORD_DIR "/%s", gid); if (!RecoveryInProgress()) { @@ -3500,6 +4144,45 @@ void record_2pc_readonly(const char *gid) XLogInsert(RM_XLOG_ID, XLOG_CREATE_2PC_FILE); } + if (NULL != record_2pc_cache) + { + Assert(strlen(gid) < MAX_TID_SIZE); + entry = (Cache2pcInfo *)hash_search(record_2pc_cache, + gid, HASH_ENTER_NULL, &found); + if (NULL != entry) + { + check_entry_key(gid, entry->key, "record_2pc_readonly"); + check_2pc_file(gid, entry->info, "record_2pc_readonly"); + + if (found) + { + if (RecoveryInProgress()) + { + elog(LOG, "[record_2pc_readonly] %s is found " + "in hash table in recovery mode", gid); + } + else + { + elog(LOG, "[record_2pc_readonly] %s is found " + "in hash table", gid); + } + } + else if (enable_2pc_entry_trace) + { + elog(LOG, "[record_2pc_readonly] %s is added " + "to hash table", gid); + } + memcpy(entry->info, content, strlen(content)); + return; + } + else + { + elog(LOG, "[record_2pc_readonly] %s entry is NULL", gid); + } + } + + /* the 2pc dir is already created in initdb */ + GET_2PC_FILE_PATH(path, gid); /* * we open 2pc file under the following two different situations: @@ -3507,7 +4190,8 @@ void record_2pc_readonly(const char *gid) * the existed 2pc file can be trucated and reused. * b. 
if not under recovery progress, * we not allowed the implicit trans gid existed, - * since the xid in startnode should not be truncate if the twophase trans is part commit or part abort. + * since the xid in startnode should not be truncate if the + * twophase trans is part commit or part abort. */ if (RecoveryInProgress()) { @@ -3519,18 +4203,142 @@ void record_2pc_readonly(const char *gid) } if (fd < 0) { - elog(ERROR, "could not create readonly 2pc file \"%s\", errMsg:%s", path, strerror(errno)); + elog(ERROR, "[record_2pc_readonly] could not create file %s, " + "errMsg: %s", path, strerror(errno)); return; } ret = FileWrite(fd, content, strlen(content), WAIT_EVENT_BUFFILE_WRITE); if(ret != strlen(content)) { - elog(ERROR, "could not write 2pc file \"%s\", errMsg:%s", path, strerror(errno)); + FileClose(fd); + elog(ERROR, "[record_2pc_readonly] could not write file %s, " + "errMsg: %s, ret: %d, content: %s", + path, strerror(errno), ret, content); } FileClose(fd); +} +/* + * Get 2pc info from hash table. + */ +char *get_2pc_info_from_cache(const char *tid) +{ + Cache2pcInfo *entry = NULL; + bool found = false; + if (NULL != record_2pc_cache) + { + Assert(strlen(tid) < MAX_TID_SIZE); + entry = (Cache2pcInfo *)hash_search(record_2pc_cache, + tid, HASH_FIND, &found); + if (found) + { + Assert(NULL != entry); + + check_entry_key(tid, entry->key, "get_2pc_info_from_cache"); + + if (enable_2pc_entry_trace) + { + elog(LOG, "[get_2pc_info_from_cache] %s is found " + "in hast table, key: %s, info: %s", + tid, entry->key, entry->info); } -#endif + return entry->info; + } + if (enable_2pc_entry_trace) + { + elog(LOG, "[get_2pc_info_from_cache] %s is not found " + "in hast table", tid); + } + } + return NULL; +} + +/* + * Get 2pc list from hash table. + */ +char *get_2pc_list_from_cache(int *count) +{ + HASH_SEQ_STATUS seq; + Cache2pcInfo *entry = NULL; + char *recordList = NULL; + + if (NULL == record_2pc_cache) + { + return NULL; + } + + hash_seq_init(&seq, record_2pc_cache); + while ((entry = hash_seq_search(&seq)) != NULL) + { + Assert(NULL != entry); + check_2pc_file(entry->key, entry->info, "get_2pc_list_from_cache"); + + if (NULL != count && *count >= MAX_OUTPUT_FILE) + { + break; + } + + if(NULL == recordList) + { + recordList = (char *)palloc0(strlen(entry->key) + 1); + sprintf(recordList, "%s", entry->key); + } + else + { + recordList = (char *) repalloc(recordList, + strlen(entry->key) + strlen(recordList) + 2); + sprintf(recordList, "%s,%s", recordList, entry->key); + } + if (NULL != count) + { + (*count)++; + } + } + + return recordList; +} + +/* + * Initialize 2pc info cache using shared memory hash table. + */ +void +Record2pcCacheInit(void) +{ + HASHCTL info; + int flags = 0; + + if (!enable_2pc_file_cache) + { + record_2pc_cache = NULL; + return; + } + + info.keysize = MAX_TID_SIZE; + info.entrysize = record_2pc_entry_size; + info.num_partitions = record_2pc_partitions; + + flags = HASH_ELEM | HASH_PARTITION; + + record_2pc_cache = ShmemInitHash("Record 2pc Cache", + record_2pc_cache_size/4, record_2pc_cache_size, + &info, flags); +} + +/* + * Return 2pc info cache size. 
+ */ +Size +Record2pcCacheSize(void) +{ + long cache_size = 0; + if (enable_2pc_file_cache) + { + cache_size = (long)record_2pc_cache_size * record_2pc_entry_size; + } + return cache_size; +} + +#endif diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 18ed9b22..7044cd8b 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -10171,15 +10171,27 @@ xlog_redo(XLogReaderState *record) #ifdef __TWO_PHASE_TRANS__ else if (info == XLOG_CLEAN_2PC_FILE) { - char *gid; + char *pos = NULL; + char *gid = NULL; + char *type = NULL; + TimestampTz timestamp = 0; gid = XLogRecGetData(record); + type = gid + strlen(gid) + 1; + pos = type + strlen(type) + 1; + memcpy(×tamp, pos, sizeof(TimestampTz)); + if (0 == strcmp(type, "rename")) + { + rename_2pc_records(gid, timestamp); + } + else + { remove_2pc_records(gid, false); } + } else if (info == XLOG_CREATE_2PC_FILE) { TransactionId xid; TransactionId startxid; - GlobalTimestamp commit_timestamp = 0; char *gid; char *startnode; char *nodestring; @@ -10199,19 +10211,15 @@ xlog_redo(XLogReaderState *record) { startnode = temp; memcpy(&startxid, pos, sizeof(TransactionId)); - pos = pos + sizeof(TransactionId) ; + pos = pos + sizeof(TransactionId) + 1; nodestring = pos; pos = pos + strlen(nodestring) + 1; memcpy(&xid, pos, sizeof(TransactionId)); - pos = pos + sizeof(TransactionId) ; - if (IsXidImplicit(gid)) - { - memcpy(&commit_timestamp, pos, sizeof(GlobalTimestamp)); - } if (enable_distri_print) { - elog(LOG, "xlog redo 2pc file name: '%s', startnode: %s, startxid: %u, nodestring: %s, " - "xid: %u, commit_timestamp:"INT64_FORMAT, gid, startnode, startxid, nodestring, xid, commit_timestamp); + elog(LOG, "xlog redo 2pc file name: '%s', startnode: %s, " + "startxid: %u, nodestring: %s, xid: %u", + gid, startnode, startxid, nodestring, xid); } #ifdef __TWO_PHASE_TESTS__ if (FILE_XLOG_EXISTED == twophase_exception_case) diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c index cdc96d59..3cdb9063 100644 --- a/src/backend/storage/ipc/ipci.c +++ b/src/backend/storage/ipc/ipci.c @@ -306,6 +306,9 @@ CreateSharedMemoryAndSemaphores(bool makePrivate, int port) size = add_size(size, PubStatDataShmemSize(g_PubStatHashSize, g_PubTableStatHashSize)); size = add_size(size, SubStatDataShmemSize(g_SubStatHashSize, g_SubTableStatHashSize)); #endif +#ifdef __TWO_PHASE_TRANS__ + size = add_size(size, Record2pcCacheSize()); +#endif #ifdef __COLD_HOT__ size = add_size(size, DualWriteTableSize()); #endif @@ -482,6 +485,10 @@ CreateSharedMemoryAndSemaphores(bool makePrivate, int port) InitSubStatData(g_SubStatHashSize, g_SubTableStatHashSize); #endif +#ifdef __TWO_PHASE_TRANS__ + Record2pcCacheInit(); +#endif + #ifdef __COLD_HOT__ DualWriteCtlInit(); #endif diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index d7c85782..9c699458 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -2689,6 +2689,45 @@ static struct config_bool ConfigureNamesBool[] = }, #endif +#ifdef __TWO_PHASE_TRANS__ + { + {"enable_2pc_file_cache", PGC_POSTMASTER, CUSTOM_OPTIONS, + gettext_noop("Enable 2PC cache."), + NULL + }, + &enable_2pc_file_cache, + true, + NULL, NULL, NULL + }, + { + {"enable_2pc_file_check", PGC_USERSET, CUSTOM_OPTIONS, + gettext_noop("Enable 2PC file check."), + NULL + }, + &enable_2pc_file_check, + true, + NULL, NULL, NULL + }, + { + {"enable_2pc_entry_key_check", PGC_USERSET, CUSTOM_OPTIONS, + gettext_noop("Enable 2PC entry key check."), + 
NULL + }, + &enable_2pc_entry_key_check, + true, + NULL, NULL, NULL + }, + { + {"enable_2pc_entry_trace", PGC_USERSET, CUSTOM_OPTIONS, + gettext_noop("Enable 2PC entry trace."), + NULL + }, + &enable_2pc_entry_trace, + false, + NULL, NULL, NULL + }, +#endif + #ifdef __TBASE__ { {"enable_lock_account", PGC_SUSET, CUSTOM_OPTIONS, @@ -4744,6 +4783,33 @@ static struct config_int ConfigureNamesInt[] = NULL, NULL, NULL }, +#ifdef __TWO_PHASE_TRANS__ + { + {"record_2pc_cache_size", PGC_POSTMASTER, CUSTOM_OPTIONS, + gettext_noop("2PC info cache size."), + }, + &record_2pc_cache_size, + 50000, 100, INT_MAX, + NULL, NULL, NULL + }, + { + {"record_2pc_entry_size", PGC_POSTMASTER, CUSTOM_OPTIONS, + gettext_noop("2PC info cache entry size."), + }, + &record_2pc_entry_size, + 2048, 1200, INT_MAX, + NULL, NULL, NULL + }, + { + {"record_2pc_partitions", PGC_POSTMASTER, CUSTOM_OPTIONS, + gettext_noop("2PC info cache partition number."), + }, + &record_2pc_partitions, + 32, 1, INT_MAX, + NULL, NULL, NULL + }, +#endif + #ifdef __TBASE__ { {"account_lock_track_count", PGC_POSTMASTER, LOGGING, diff --git a/src/include/access/twophase.h b/src/include/access/twophase.h index cbf83a3e..bd76266f 100644 --- a/src/include/access/twophase.h +++ b/src/include/access/twophase.h @@ -96,6 +96,17 @@ extern int transaction_threshold; extern bool enable_2pc_recovery_info; #endif +#ifdef __TWO_PHASE_TRANS__ +extern bool enable_2pc_file_cache; +extern bool enable_2pc_file_check; +extern bool enable_2pc_entry_key_check; +extern bool enable_2pc_entry_trace; + +extern int record_2pc_cache_size; +extern int record_2pc_entry_size; +extern int record_2pc_partitions; +#endif + extern Size TwoPhaseShmemSize(void); extern void TwoPhaseShmemInit(void); @@ -143,7 +154,14 @@ extern void record_2pc_involved_nodes_xid(const char * tid, GlobalTransactionId xid); extern void record_2pc_commit_timestamp(const char *tid, GlobalTimestamp commit_timestamp); extern void remove_2pc_records(const char *tid, bool record_in_xlog); +extern void rename_2pc_records(const char *tid, TimestampTz timestamp); extern void record_2pc_readonly(const char *gid); + +extern char *get_2pc_info_from_cache(const char *tid); +extern char *get_2pc_list_from_cache(int *count); + +extern void Record2pcCacheInit(void); +extern Size Record2pcCacheSize(void); #endif #endif /* TWOPHASE_H */ diff --git a/src/include/catalog/pg_control.h b/src/include/catalog/pg_control.h index 89a74b0c..8828efe7 100644 --- a/src/include/catalog/pg_control.h +++ b/src/include/catalog/pg_control.h @@ -1,8 +1,8 @@ /*------------------------------------------------------------------------- * * pg_control.h - * The system control file "pg_control" is not a heap relation. - * However, we define it here so that the format is documented. + * The system control file "pg_control" is not a heap relation. + * However, we define it here so that the format is documented. * * * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group @@ -16,15 +16,15 @@ #define PG_CONTROL_H #include "access/xlogdefs.h" -#include "pgtime.h" /* for pg_time_t */ +#include "pgtime.h" /* for pg_time_t */ #include "port/pg_crc32c.h" /* Version identifier for this pg_control format */ -#define PG_CONTROL_VERSION 1002 +#define PG_CONTROL_VERSION 1002 /* Nonce key length, see below */ -#define MOCK_AUTH_NONCE_LEN 32 +#define MOCK_AUTH_NONCE_LEN 32 /* * Body of CheckPoint XLOG records. 
This is declared here because we keep @@ -33,61 +33,61 @@ */ typedef struct CheckPoint { - XLogRecPtr redo; /* next RecPtr available when we began to - * create CheckPoint (i.e. REDO start point) */ - TimeLineID ThisTimeLineID; /* current TLI */ - TimeLineID PrevTimeLineID; /* previous TLI, if this record begins a new - * timeline (equals ThisTimeLineID otherwise) */ - bool fullPageWrites; /* current full_page_writes */ - uint32 nextXidEpoch; /* higher-order bits of nextXid */ - TransactionId nextXid; /* next free XID */ - Oid nextOid; /* next free OID */ - MultiXactId nextMulti; /* next free MultiXactId */ - MultiXactOffset nextMultiOffset; /* next free MultiXact offset */ - TransactionId oldestXid; /* cluster-wide minimum datfrozenxid */ - Oid oldestXidDB; /* database with minimum datfrozenxid */ - MultiXactId oldestMulti; /* cluster-wide minimum datminmxid */ - Oid oldestMultiDB; /* database with minimum datminmxid */ - pg_time_t time; /* time stamp of checkpoint */ - TransactionId oldestCommitTsXid; /* oldest Xid with valid commit - * timestamp */ - TransactionId newestCommitTsXid; /* newest Xid with valid commit - * timestamp */ + XLogRecPtr redo; /* next RecPtr available when we began to + * create CheckPoint (i.e. REDO start point) */ + TimeLineID ThisTimeLineID; /* current TLI */ + TimeLineID PrevTimeLineID; /* previous TLI, if this record begins a new + * timeline (equals ThisTimeLineID otherwise) */ + bool fullPageWrites; /* current full_page_writes */ + uint32 nextXidEpoch; /* higher-order bits of nextXid */ + TransactionId nextXid; /* next free XID */ + Oid nextOid; /* next free OID */ + MultiXactId nextMulti; /* next free MultiXactId */ + MultiXactOffset nextMultiOffset; /* next free MultiXact offset */ + TransactionId oldestXid; /* cluster-wide minimum datfrozenxid */ + Oid oldestXidDB; /* database with minimum datfrozenxid */ + MultiXactId oldestMulti; /* cluster-wide minimum datminmxid */ + Oid oldestMultiDB; /* database with minimum datminmxid */ + pg_time_t time; /* time stamp of checkpoint */ + TransactionId oldestCommitTsXid; /* oldest Xid with valid commit + * timestamp */ + TransactionId newestCommitTsXid; /* newest Xid with valid commit + * timestamp */ #ifdef __SUPPORT_DISTRIBUTED_TRANSACTION__ - GlobalTimestamp latestCommitTs; - GlobalTimestamp latestGTS; + GlobalTimestamp latestCommitTs; + GlobalTimestamp latestGTS; #endif - /* - * Oldest XID still running. This is only needed to initialize hot standby - * mode from an online checkpoint, so we only bother calculating this for - * online checkpoints and only when wal_level is replica. Otherwise it's - * set to InvalidTransactionId. - */ - TransactionId oldestActiveXid; + /* + * Oldest XID still running. This is only needed to initialize hot standby + * mode from an online checkpoint, so we only bother calculating this for + * online checkpoints and only when wal_level is replica. Otherwise it's + * set to InvalidTransactionId. 
+ */ + TransactionId oldestActiveXid; } CheckPoint; /* XLOG info values for XLOG rmgr */ -#define XLOG_CHECKPOINT_SHUTDOWN 0x00 -#define XLOG_CHECKPOINT_ONLINE 0x10 -#define XLOG_NOOP 0x20 -#define XLOG_NEXTOID 0x30 -#define XLOG_SWITCH 0x40 -#define XLOG_BACKUP_END 0x50 -#define XLOG_PARAMETER_CHANGE 0x60 -#define XLOG_RESTORE_POINT 0x70 -#define XLOG_FPW_CHANGE 0x80 -#define XLOG_END_OF_RECOVERY 0x90 -#define XLOG_FPI_FOR_HINT 0xA0 -#define XLOG_FPI 0xB0 +#define XLOG_CHECKPOINT_SHUTDOWN 0x00 +#define XLOG_CHECKPOINT_ONLINE 0x10 +#define XLOG_NOOP 0x20 +#define XLOG_NEXTOID 0x30 +#define XLOG_SWITCH 0x40 +#define XLOG_BACKUP_END 0x50 +#define XLOG_PARAMETER_CHANGE 0x60 +#define XLOG_RESTORE_POINT 0x70 +#define XLOG_FPW_CHANGE 0x80 +#define XLOG_END_OF_RECOVERY 0x90 +#define XLOG_FPI_FOR_HINT 0xA0 +#define XLOG_FPI 0xB0 #ifdef __TBASE__ -#define XLOG_MVCC 0xC0 +#define XLOG_MVCC 0xC0 #endif -/* remove 2pc file while 2pc is cleaned*/ -#define XLOG_CLEAN_2PC_FILE 0XD0 -#define XLOG_CREATE_2PC_FILE 0xE0 +/* remove or rename 2pc file when 2pc is cleaned */ +#define XLOG_CLEAN_2PC_FILE 0XD0 +#define XLOG_CREATE_2PC_FILE 0xE0 #define XLOG_RECORD_2PC_TIMESTAMP 0xF0 /* @@ -96,13 +96,13 @@ typedef struct CheckPoint */ typedef enum DBState { - DB_STARTUP = 0, - DB_SHUTDOWNED, - DB_SHUTDOWNED_IN_RECOVERY, - DB_SHUTDOWNING, - DB_IN_CRASH_RECOVERY, - DB_IN_ARCHIVE_RECOVERY, - DB_IN_PRODUCTION + DB_STARTUP = 0, + DB_SHUTDOWNED, + DB_SHUTDOWNED_IN_RECOVERY, + DB_SHUTDOWNING, + DB_IN_CRASH_RECOVERY, + DB_IN_ARCHIVE_RECOVERY, + DB_IN_PRODUCTION } DBState; /* @@ -111,150 +111,150 @@ typedef enum DBState typedef struct ControlFileData { - /* - * Unique system identifier --- to ensure we match up xlog files with the - * installation that produced them. - */ - uint64 system_identifier; + /* + * Unique system identifier --- to ensure we match up xlog files with the + * installation that produced them. + */ + uint64 system_identifier; - /* - * Version identifier information. Keep these fields at the same offset, - * especially pg_control_version; they won't be real useful if they move - * around. (For historical reasons they must be 8 bytes into the file - * rather than immediately at the front.) - * - * pg_control_version identifies the format of pg_control itself. - * catalog_version_no identifies the format of the system catalogs. - * - * There are additional version identifiers in individual files; for - * example, WAL logs contain per-page magic numbers that can serve as - * version cues for the WAL log. - */ - uint32 pg_control_version; /* PG_CONTROL_VERSION */ - uint32 catalog_version_no; /* see catversion.h */ + /* + * Version identifier information. Keep these fields at the same offset, + * especially pg_control_version; they won't be real useful if they move + * around. (For historical reasons they must be 8 bytes into the file + * rather than immediately at the front.) + * + * pg_control_version identifies the format of pg_control itself. + * catalog_version_no identifies the format of the system catalogs. + * + * There are additional version identifiers in individual files; for + * example, WAL logs contain per-page magic numbers that can serve as + * version cues for the WAL log. 
+ */ + uint32 pg_control_version; /* PG_CONTROL_VERSION */ + uint32 catalog_version_no; /* see catversion.h */ - /* - * System status data - */ - DBState state; /* see enum above */ - pg_time_t time; /* time stamp of last pg_control update */ - XLogRecPtr checkPoint; /* last check point record ptr */ - XLogRecPtr prevCheckPoint; /* previous check point record ptr */ + /* + * System status data + */ + DBState state; /* see enum above */ + pg_time_t time; /* time stamp of last pg_control update */ + XLogRecPtr checkPoint; /* last check point record ptr */ + XLogRecPtr prevCheckPoint; /* previous check point record ptr */ - CheckPoint checkPointCopy; /* copy of last check point record */ + CheckPoint checkPointCopy; /* copy of last check point record */ - XLogRecPtr unloggedLSN; /* current fake LSN value, for unlogged rels */ + XLogRecPtr unloggedLSN; /* current fake LSN value, for unlogged rels */ - /* - * These two values determine the minimum point we must recover up to - * before starting up: - * - * minRecoveryPoint is updated to the latest replayed LSN whenever we - * flush a data change during archive recovery. That guards against - * starting archive recovery, aborting it, and restarting with an earlier - * stop location. If we've already flushed data changes from WAL record X - * to disk, we mustn't start up until we reach X again. Zero when not - * doing archive recovery. - * - * backupStartPoint is the redo pointer of the backup start checkpoint, if - * we are recovering from an online backup and haven't reached the end of - * backup yet. It is reset to zero when the end of backup is reached, and - * we mustn't start up before that. A boolean would suffice otherwise, but - * we use the redo pointer as a cross-check when we see an end-of-backup - * record, to make sure the end-of-backup record corresponds the base - * backup we're recovering from. - * - * backupEndPoint is the backup end location, if we are recovering from an - * online backup which was taken from the standby and haven't reached the - * end of backup yet. It is initialized to the minimum recovery point in - * pg_control which was backed up last. It is reset to zero when the end - * of backup is reached, and we mustn't start up before that. - * - * If backupEndRequired is true, we know for sure that we're restoring - * from a backup, and must see a backup-end record before we can safely - * start up. If it's false, but backupStartPoint is set, a backup_label - * file was found at startup but it may have been a leftover from a stray - * pg_start_backup() call, not accompanied by pg_stop_backup(). - */ - XLogRecPtr minRecoveryPoint; - TimeLineID minRecoveryPointTLI; - XLogRecPtr backupStartPoint; - XLogRecPtr backupEndPoint; - bool backupEndRequired; + /* + * These two values determine the minimum point we must recover up to + * before starting up: + * + * minRecoveryPoint is updated to the latest replayed LSN whenever we + * flush a data change during archive recovery. That guards against + * starting archive recovery, aborting it, and restarting with an earlier + * stop location. If we've already flushed data changes from WAL record X + * to disk, we mustn't start up until we reach X again. Zero when not + * doing archive recovery. + * + * backupStartPoint is the redo pointer of the backup start checkpoint, if + * we are recovering from an online backup and haven't reached the end of + * backup yet. It is reset to zero when the end of backup is reached, and + * we mustn't start up before that. 
A boolean would suffice otherwise, but + * we use the redo pointer as a cross-check when we see an end-of-backup + * record, to make sure the end-of-backup record corresponds the base + * backup we're recovering from. + * + * backupEndPoint is the backup end location, if we are recovering from an + * online backup which was taken from the standby and haven't reached the + * end of backup yet. It is initialized to the minimum recovery point in + * pg_control which was backed up last. It is reset to zero when the end + * of backup is reached, and we mustn't start up before that. + * + * If backupEndRequired is true, we know for sure that we're restoring + * from a backup, and must see a backup-end record before we can safely + * start up. If it's false, but backupStartPoint is set, a backup_label + * file was found at startup but it may have been a leftover from a stray + * pg_start_backup() call, not accompanied by pg_stop_backup(). + */ + XLogRecPtr minRecoveryPoint; + TimeLineID minRecoveryPointTLI; + XLogRecPtr backupStartPoint; + XLogRecPtr backupEndPoint; + bool backupEndRequired; - /* - * Parameter settings that determine if the WAL can be used for archival - * or hot standby. - */ - int wal_level; - bool wal_log_hints; - int MaxConnections; - int max_worker_processes; - int max_prepared_xacts; - int max_locks_per_xact; - bool track_commit_timestamp; + /* + * Parameter settings that determine if the WAL can be used for archival + * or hot standby. + */ + int wal_level; + bool wal_log_hints; + int MaxConnections; + int max_worker_processes; + int max_prepared_xacts; + int max_locks_per_xact; + bool track_commit_timestamp; - /* - * This data is used to check for hardware-architecture compatibility of - * the database and the backend executable. We need not check endianness - * explicitly, since the pg_control version will surely look wrong to a - * machine of different endianness, but we do need to worry about MAXALIGN - * and floating-point format. (Note: storage layout nominally also - * depends on SHORTALIGN and INTALIGN, but in practice these are the same - * on all architectures of interest.) - * - * Testing just one double value is not a very bulletproof test for - * floating-point compatibility, but it will catch most cases. - */ - uint32 maxAlign; /* alignment requirement for tuples */ - double floatFormat; /* constant 1234567.0 */ -#define FLOATFORMAT_VALUE 1234567.0 + /* + * This data is used to check for hardware-architecture compatibility of + * the database and the backend executable. We need not check endianness + * explicitly, since the pg_control version will surely look wrong to a + * machine of different endianness, but we do need to worry about MAXALIGN + * and floating-point format. (Note: storage layout nominally also + * depends on SHORTALIGN and INTALIGN, but in practice these are the same + * on all architectures of interest.) + * + * Testing just one double value is not a very bulletproof test for + * floating-point compatibility, but it will catch most cases. + */ + uint32 maxAlign; /* alignment requirement for tuples */ + double floatFormat; /* constant 1234567.0 */ +#define FLOATFORMAT_VALUE 1234567.0 - /* - * This data is used to make sure that configuration of this database is - * compatible with the backend executable. 
- */ - uint32 blcksz; /* data block size for this DB */ - uint32 relseg_size; /* blocks per segment of large relation */ + /* + * This data is used to make sure that configuration of this database is + * compatible with the backend executable. + */ + uint32 blcksz; /* data block size for this DB */ + uint32 relseg_size; /* blocks per segment of large relation */ - uint32 xlog_blcksz; /* block size within WAL files */ - uint32 xlog_seg_size; /* size of each WAL segment */ + uint32 xlog_blcksz; /* block size within WAL files */ + uint32 xlog_seg_size; /* size of each WAL segment */ - uint32 nameDataLen; /* catalog name field width */ - uint32 indexMaxKeys; /* max number of columns in an index */ + uint32 nameDataLen; /* catalog name field width */ + uint32 indexMaxKeys; /* max number of columns in an index */ - uint32 toast_max_chunk_size; /* chunk size in TOAST tables */ - uint32 loblksize; /* chunk size in pg_largeobject */ + uint32 toast_max_chunk_size; /* chunk size in TOAST tables */ + uint32 loblksize; /* chunk size in pg_largeobject */ - /* flags indicating pass-by-value status of various types */ - bool float4ByVal; /* float4 pass-by-value? */ - bool float8ByVal; /* float8, int8, etc pass-by-value? */ + /* flags indicating pass-by-value status of various types */ + bool float4ByVal; /* float4 pass-by-value? */ + bool float8ByVal; /* float8, int8, etc pass-by-value? */ - /* Are data pages protected by checksums? Zero if no checksum version */ - uint32 data_checksum_version; + /* Are data pages protected by checksums? Zero if no checksum version */ + uint32 data_checksum_version; - /* - * Random nonce, used in authentication requests that need to proceed - * based on values that are cluster-unique, like a SASL exchange that - * failed at an early stage. - */ - char mock_authentication_nonce[MOCK_AUTH_NONCE_LEN]; + /* + * Random nonce, used in authentication requests that need to proceed + * based on values that are cluster-unique, like a SASL exchange that + * failed at an early stage. + */ + char mock_authentication_nonce[MOCK_AUTH_NONCE_LEN]; #ifdef __TBASE__ - /* - * need mvcc if page is all visible? + /* + * need mvcc if page is all visible? */ int32 need_mvcc; - /* reserved */ - int32 reserved_1; - int32 reserved_2; - int32 reserved_3; - int32 reserved_4; - int32 reserved_5; + /* reserved */ + int32 reserved_1; + int32 reserved_2; + int32 reserved_3; + int32 reserved_4; + int32 reserved_5; #endif - /* CRC of all above ... MUST BE LAST! */ - pg_crc32c crc; + /* CRC of all above ... MUST BE LAST! */ + pg_crc32c crc; } ControlFileData; /* @@ -263,7 +263,7 @@ typedef struct ControlFileData * means the active data can't be more than one disk sector, which is 512 * bytes on common hardware. Be very careful about raising this limit. */ -#define PG_CONTROL_MAX_SAFE_SIZE 512 +#define PG_CONTROL_MAX_SAFE_SIZE 512 /* * Physical size of the pg_control file. Note that this is considerably @@ -272,6 +272,6 @@ typedef struct ControlFileData * changes, so that ReadControlFile will deliver a suitable wrong-version * message instead of a read error if it's looking at an incompatible file. 
*/ -#define PG_CONTROL_FILE_SIZE 8192 +#define PG_CONTROL_FILE_SIZE 8192 -#endif /* PG_CONTROL_H */ +#endif /* PG_CONTROL_H */ diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out index 9c939d43..9cfd0f21 100644 --- a/src/test/regress/expected/sysviews.out +++ b/src/test/regress/expected/sysviews.out @@ -72,6 +72,10 @@ select count(*) >= 0 as ok from pg_prepared_xacts; select name, setting from pg_settings where name like 'enable%'; name | setting -----------------------------------+--------- + enable_2pc_entry_key_check | on + enable_2pc_entry_trace | off + enable_2pc_file_cache | on + enable_2pc_file_check | on enable_2pc_recovery_info | on enable_audit | off enable_audit_warning | off @@ -137,7 +141,7 @@ select name, setting from pg_settings where name like 'enable%'; enable_transparent_crypt | on enable_user_authority_force_check | off enable_xlog_mprotect | on -(64 rows) +(67 rows) -- Test that the pg_timezone_names and pg_timezone_abbrevs views are -- more-or-less working. We can't test their contents in any great detail From ed7cbf0202143b594712bece67eba7e06d5f8e13 Mon Sep 17 00:00:00 2001 From: bethding Date: Wed, 28 Apr 2021 17:48:07 +0800 Subject: [PATCH 361/578] fix committed sequenced in gtm be dropped when subtranction abort http://tapd.oa.com/pgxz/bugtrace/bugs/view?bug_id=1010092131087135229 --- src/backend/access/transam/xact.c | 14 ++++ src/test/regress/expected/create_table.out | 75 ++++++++++++++++++++++ src/test/regress/sql/create_table.sql | 55 ++++++++++++++++ 3 files changed, 144 insertions(+) diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index f43288dc..bc765d44 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -6095,6 +6095,20 @@ CommitSubTransaction(void) s->parallelModeLevel = 0; } +#ifdef __TBASE__ + if (s->curTransactionOwner) + { + TransactionId xid = GetCurrentTransactionIdIfAny(); + + if (TransactionIdIsValid(xid)) + { + CheckGTMConnection(); + } + + FinishSeqOp(true); + } +#endif + /* Do the actual "commit", such as it is */ s->state = TRANS_COMMIT; diff --git a/src/test/regress/expected/create_table.out b/src/test/regress/expected/create_table.out index 2cf920b9..f3e08d39 100644 --- a/src/test/regress/expected/create_table.out +++ b/src/test/regress/expected/create_table.out @@ -949,3 +949,78 @@ Distribute By: HASH(a) Location Nodes: ALL DATANODES drop table boolspart; +drop function if exists create_multi_tables1(integer, varchar); +NOTICE: function create_multi_tables1(pg_catalog.int4,pg_catalog.varchar) does not exist, skipping +CREATE OR REPLACE FUNCTION create_multi_tables1(table_num_in integer, table_sql varchar) RETURNS void + LANGUAGE 'plpgsql' + COST 100 + VOLATILE +AS $BODY$ +declare + v_idx integer := 0; + v_strTable varchar :=''; + v_strSql varchar :=''; +begin + while v_idx < table_num_in loop + v_idx = v_idx+1; + v_strTable = CONCAT('simple_metadata_query_', v_idx); + v_strSql = table_sql||' '||v_strTable||'(c1 bigint, c31 smallserial);'; + RAISE NOTICE 'create %', v_strTable; + BEGIN + EXECUTE v_strSql; + EXCEPTION when others then + raise notice 'ERROR: (%)', SQLERRM; + end; + end loop; + + RAISE NOTICE 'finished .....'; +end +$BODY$; +drop function if exists del_multi_table1(varchar); +NOTICE: function del_multi_table1(pg_catalog.varchar) does not exist, skipping +CREATE FUNCTION del_multi_table1(table_sql varchar) RETURNS void AS $$ +DECLARE + tmp VARCHAR(512); +DECLARE names CURSOR FOR + select tablename from pg_tables 
where tablename like 'simple_metadata_query_%'; +BEGIN + FOR stmt IN names LOOP + tmp := table_sql||' '|| quote_ident(stmt.tablename) || ' CASCADE;'; + RAISE NOTICE '%', tmp; + BEGIN + EXECUTE tmp; + EXCEPTION when others then + raise notice 'ERROR: (%)', SQLERRM; + end; + END LOOP; + RAISE NOTICE 'finished .....'; +END +$$ LANGUAGE plpgsql VOLATILE COST 100; +CREATE TABLE simple_metadata_query_3(c1 int, c31 smallserial); +SELECT create_multi_tables1(5, 'create table'); +NOTICE: create simple_metadata_query_1 +NOTICE: create simple_metadata_query_2 +NOTICE: create simple_metadata_query_3 +NOTICE: ERROR: (relation "simple_metadata_query_3" already exists) +NOTICE: create simple_metadata_query_4 +NOTICE: create simple_metadata_query_5 +NOTICE: finished ..... + create_multi_tables1 +---------------------- + +(1 row) + +SELECT del_multi_table1('drop table if exists'); +NOTICE: drop table if exists simple_metadata_query_1 CASCADE; +NOTICE: drop table if exists simple_metadata_query_2 CASCADE; +NOTICE: drop table if exists simple_metadata_query_3 CASCADE; +NOTICE: drop table if exists simple_metadata_query_4 CASCADE; +NOTICE: drop table if exists simple_metadata_query_5 CASCADE; +NOTICE: finished ..... + del_multi_table1 +------------------ + +(1 row) + +DROP FUNCTION create_multi_tables1; +DROP FUNCTION del_multi_table1; diff --git a/src/test/regress/sql/create_table.sql b/src/test/regress/sql/create_table.sql index 82f1a87b..9fc3ae65 100644 --- a/src/test/regress/sql/create_table.sql +++ b/src/test/regress/sql/create_table.sql @@ -746,3 +746,58 @@ create table boolspart_t partition of boolspart for values in (true); create table boolspart_f partition of boolspart for values in (false); \d+ boolspart drop table boolspart; + +drop function if exists create_multi_tables1(integer, varchar); +CREATE OR REPLACE FUNCTION create_multi_tables1(table_num_in integer, table_sql varchar) RETURNS void + LANGUAGE 'plpgsql' + COST 100 + VOLATILE +AS $BODY$ +declare + v_idx integer := 0; + v_strTable varchar :=''; + v_strSql varchar :=''; +begin + while v_idx < table_num_in loop + v_idx = v_idx+1; + v_strTable = CONCAT('simple_metadata_query_', v_idx); + v_strSql = table_sql||' '||v_strTable||'(c1 bigint, c31 smallserial);'; + RAISE NOTICE 'create %', v_strTable; + BEGIN + EXECUTE v_strSql; + EXCEPTION when others then + raise notice 'ERROR: (%)', SQLERRM; + end; + end loop; + + RAISE NOTICE 'finished .....'; +end +$BODY$; + +drop function if exists del_multi_table1(varchar); +CREATE FUNCTION del_multi_table1(table_sql varchar) RETURNS void AS $$ +DECLARE + tmp VARCHAR(512); +DECLARE names CURSOR FOR + select tablename from pg_tables where tablename like 'simple_metadata_query_%'; +BEGIN + FOR stmt IN names LOOP + tmp := table_sql||' '|| quote_ident(stmt.tablename) || ' CASCADE;'; + RAISE NOTICE '%', tmp; + BEGIN + EXECUTE tmp; + EXCEPTION when others then + raise notice 'ERROR: (%)', SQLERRM; + end; + END LOOP; + RAISE NOTICE 'finished .....'; +END +$$ LANGUAGE plpgsql VOLATILE COST 100; + +CREATE TABLE simple_metadata_query_3(c1 int, c31 smallserial); +SELECT create_multi_tables1(5, 'create table'); +SELECT del_multi_table1('drop table if exists'); + +DROP FUNCTION create_multi_tables1; +DROP FUNCTION del_multi_table1; + From a8c5007545aa5dff9f8f10e92308f203cbafb48a Mon Sep 17 00:00:00 2001 From: youngxie Date: Thu, 29 Apr 2021 14:31:29 +0800 Subject: [PATCH 362/578] Fix distinct agg regress. 
--- src/test/regress/expected/sysviews.out | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out index 9cfd0f21..842fabf5 100644 --- a/src/test/regress/expected/sysviews.out +++ b/src/test/regress/expected/sysviews.out @@ -141,7 +141,7 @@ select name, setting from pg_settings where name like 'enable%'; enable_transparent_crypt | on enable_user_authority_force_check | off enable_xlog_mprotect | on -(67 rows) +(68 rows) -- Test that the pg_timezone_names and pg_timezone_abbrevs views are -- more-or-less working. We can't test their contents in any great detail From 2759d7daf70c41b5a7a97a98d5c1194f1040c93c Mon Sep 17 00:00:00 2001 From: whalesong Date: Fri, 30 Apr 2021 14:42:02 +0800 Subject: [PATCH 363/578] 2pc files opt: add 2pc hash table on shmem (merge request 300), code opt --- src/backend/access/transam/twophase.c | 28 +++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c index 46570383..2fc12062 100644 --- a/src/backend/access/transam/twophase.c +++ b/src/backend/access/transam/twophase.c @@ -3539,7 +3539,6 @@ void record_2pc_involved_nodes_xid(const char * tid, appendStringInfo(&content, "nodes:%s\n", nodestring); appendStringInfo(&content, "xid:%u\n", xid); size = content.len; - Assert(size == strlen(content.data)); /* if in_pg_clean, then check whether the file exists */ @@ -3667,8 +3666,7 @@ void record_2pc_involved_nodes_xid(const char * tid, "to hash table", tid); } - memset(entry->info, 0, MAX_2PC_INFO_SIZE); - memcpy(entry->info, content.data, size); + memcpy(entry->info, content.data, size + 1); resetStringInfo(&content); pfree(content.data); @@ -3779,7 +3777,8 @@ void record_2pc_commit_timestamp(const char *tid, GlobalTimestamp commit_timesta initStringInfo(&content); appendStringInfo(&content, "global_commit_timestamp:"INT64_FORMAT"\n", commit_timestamp); - size = strlen(content.data); + size = content.len; + Assert(size == strlen(content.data)); if (NULL != record_2pc_cache) { @@ -3804,8 +3803,16 @@ void record_2pc_commit_timestamp(const char *tid, GlobalTimestamp commit_timesta new_size = size + strlen(entry->info); - if (new_size >= MAX_2PC_INFO_SIZE) + if (new_size < MAX_2PC_INFO_SIZE) { + /* save to hash table */ + memcpy(entry->info + strlen(entry->info), content.data, size + 1); + + resetStringInfo(&content); + pfree(content.data); + return; + } + /* save to file */ elog(LOG, "[record_2pc_commit_timestamp] %s new size(%d) " "overflow(%d)", tid, new_size, MAX_2PC_INFO_SIZE); @@ -3866,14 +3873,6 @@ void record_2pc_commit_timestamp(const char *tid, GlobalTimestamp commit_timesta pfree(content.data); return; } - - /* save to hash table */ - memcpy(entry->info + strlen(entry->info), content.data, size); - - resetStringInfo(&content); - pfree(content.data); - return; - } else { elog(LOG, "[record_2pc_commit_timestamp] %s is not found " @@ -4147,6 +4146,7 @@ void record_2pc_readonly(const char *gid) if (NULL != record_2pc_cache) { Assert(strlen(gid) < MAX_TID_SIZE); + Assert(strlen(content) < MAX_2PC_INFO_SIZE); entry = (Cache2pcInfo *)hash_search(record_2pc_cache, gid, HASH_ENTER_NULL, &found); if (NULL != entry) @@ -4172,7 +4172,7 @@ void record_2pc_readonly(const char *gid) elog(LOG, "[record_2pc_readonly] %s is added " "to hash table", gid); } - memcpy(entry->info, content, strlen(content)); + memcpy(entry->info, content, strlen(content) + 1); return; } else From 
618f7530b441e996a2dd3fa9a8eef07dbe696aad Mon Sep 17 00:00:00 2001 From: bethding Date: Fri, 30 Apr 2021 15:56:27 +0800 Subject: [PATCH 364/578] precheck befor choose sequence name --- src/backend/parser/parse_utilcmd.c | 57 ++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/src/backend/parser/parse_utilcmd.c b/src/backend/parser/parse_utilcmd.c index 25550493..88c6077e 100644 --- a/src/backend/parser/parse_utilcmd.c +++ b/src/backend/parser/parse_utilcmd.c @@ -86,6 +86,7 @@ #ifdef __TBASE__ #include "utils/fmgroids.h" #include "catalog/pgxc_class.h" +#include "utils/inval.h" #endif #ifdef XCP @@ -200,6 +201,8 @@ static Const *transformPartitionBoundValue(ParseState *pstate, A_Const *con, #ifdef __TBASE__ static void transformPartitionBy(ParseState *pstate, ColumnDef *partcol, PartitionBy *partitionby); +static char * ChooseSerialName(const char *relname, const char *colname, + const char *label, Oid namespaceid); #endif /* * transformCreateStmt - @@ -722,6 +725,52 @@ transformCreateStmt(CreateStmt *stmt, const char *queryString) return result; } +#ifdef __TBASE__ +/* + * Check relation exists before choose sequence name, if + * the relation already exists, no need to create sequence + * and relation. + */ +static char * +ChooseSerialName(const char *relname, const char *colname, + const char *label, Oid namespaceid) +{ + int pass = 0; + char modlabel[NAMEDATALEN]; + char *sqname; + Oid seqoid; + + /* try the unmodified label first */ + StrNCpy(modlabel, label, sizeof(modlabel)); + + for (;;) + { + sqname = makeObjectName(relname, colname, modlabel); + + AcceptInvalidationMessages(); + seqoid = get_relname_relid(sqname, namespaceid); + if (OidIsValid(seqoid)) + { + Relation rel = heap_open(seqoid, AccessShareLock); + if (OidIsValid(get_relname_relid(relname, namespaceid))) + { + heap_close(rel, AccessShareLock); + elog(ERROR, "relation \"%s\" already exists", relname); + } + heap_close(rel, AccessShareLock); + + /* found a conflict, so try a new name component */ + pfree(sqname); + snprintf(modlabel, sizeof(modlabel), "%s%d", label, ++pass); + } + else + break; + } + + return sqname; +} +#endif + /* * generateSerialExtraStmts * Generate CREATE SEQUENCE and ALTER SEQUENCE ... 
OWNED BY statements @@ -801,6 +850,14 @@ generateSerialExtraStmts(CreateStmtContext *cxt, ColumnDef *column, RangeVarAdjustRelationPersistence(cxt->relation, snamespaceid); } snamespace = get_namespace_name(snamespaceid); +#ifdef __TBASE__ + if (strcmp("CREATE TABLE", cxt->stmtType) == 0) + sname = ChooseSerialName(cxt->relation->relname, + column->colname, + "seq", + snamespaceid); + else +#endif sname = ChooseRelationName(cxt->relation->relname, column->colname, "seq", From c9b82d01d5028413090f6c80f17e962c5f00e310 Mon Sep 17 00:00:00 2001 From: whalesong Date: Fri, 30 Apr 2021 17:37:11 +0800 Subject: [PATCH 365/578] 2pc files opt: add 2pc hash table on shmem (merge request 300), bugfix --- src/backend/access/transam/twophase.c | 64 ++++++++++++++++----------- 1 file changed, 38 insertions(+), 26 deletions(-) diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c index 2fc12062..78ae69ff 100644 --- a/src/backend/access/transam/twophase.c +++ b/src/backend/access/transam/twophase.c @@ -136,6 +136,8 @@ int transaction_threshold = 200000; +#define FILE_CONTENT_SIZE 2048 + #define GET_START_NODE "startnode:" /* GUC variable, can't be changed after startup */ @@ -3726,7 +3728,7 @@ void record_2pc_involved_nodes_xid(const char * tid, void record_2pc_commit_timestamp(const char *tid, GlobalTimestamp commit_timestamp) {// #lizard forgives char path[MAXPGPATH]; - char file_content[2048]; + char file_content[FILE_CONTENT_SIZE]; StringInfoData content; File fd = -1; int ret = 0; @@ -3819,41 +3821,42 @@ void record_2pc_commit_timestamp(const char *tid, GlobalTimestamp commit_timesta GET_2PC_FILE_PATH(path, tid); - fd = open(path, O_RDWR | O_CREAT | O_EXCL, S_IRUSR | S_IWUSR); - if (fd < 0) - { if (RecoveryInProgress()) { - elog(LOG, "[record_2pc_commit_timestamp] could not " - "append timestamp in file %s, errMsg: %s", - path, strerror(errno)); + fd = PathNameOpenFile(path, O_RDWR | O_TRUNC | O_CREAT, + S_IRUSR | S_IWUSR); } else { + fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL, + S_IRUSR | S_IWUSR); + } + if (fd < 0) + { elog(ERROR, "[record_2pc_commit_timestamp] could not " "append timestamp in file %s, errMsg: %s", path, strerror(errno)); } - return; - } - ret = write(fd, entry->info, strlen(entry->info)); - if(ret != new_size) + ret = FileWrite(fd, entry->info, strlen(entry->info), + WAIT_EVENT_BUFFILE_WRITE); + if(ret != strlen(entry->info)) { - close(fd); + FileClose(fd); elog(ERROR, "[record_2pc_commit_timestamp] could not write " "file %s, errMsg: %s, ret: %d, info: %s", path, strerror(errno), ret, entry->info); } - ret = write(fd, content.data, size); - if(ret != new_size) + ret = FileWrite(fd, content.data, size, + WAIT_EVENT_BUFFILE_WRITE); + if(ret != size) { - close(fd); + FileClose(fd); elog(ERROR, "[record_2pc_commit_timestamp] could not write " "file %s, errMsg: %s, ret: %d, info: %s", path, strerror(errno), ret, content.data); } - close(fd); + FileClose(fd); /* remove from hash table */ entry = (Cache2pcInfo *)hash_search(record_2pc_cache, @@ -3883,7 +3886,7 @@ void record_2pc_commit_timestamp(const char *tid, GlobalTimestamp commit_timesta GET_2PC_FILE_PATH(path, tid); /* the 2pc file exists already */ - fd = open(path, O_RDWR | O_APPEND, S_IRUSR | S_IWUSR); + fd = PathNameOpenFile(path, O_RDWR | O_APPEND, S_IRUSR | S_IWUSR); if (fd < 0) { if (RecoveryInProgress()) @@ -3917,31 +3920,32 @@ void record_2pc_commit_timestamp(const char *tid, GlobalTimestamp commit_timesta if (enable_distri_print) { - memset(file_content, 0, 2048); - ret = 
read(fd, file_content, 2048); + memset(file_content, 0, FILE_CONTENT_SIZE); + ret = FileRead(fd, file_content, FILE_CONTENT_SIZE, WAIT_EVENT_BUFFILE_READ); elog(LOG, "[record_2pc_commit_timestamp] before append file: %s, " "file_content: %s, content.data: %s, ret: %d", path, file_content, content.data, ret); } - ret = write(fd, content.data, size); + ret = FileWrite(fd, content.data, size, WAIT_EVENT_BUFFILE_WRITE); if(ret != size) { - close(fd); + FileClose(fd); elog(ERROR, "[record_2pc_commit_timestamp] could not write file %s, " "errMsg: %s", path, strerror(errno)); } if (enable_distri_print) { - memset(file_content, 0, 2048); - lseek(fd, 0, SEEK_SET); - ret = read(fd, file_content, 2048); + memset(file_content, 0, FILE_CONTENT_SIZE); + FileSeek(fd, 0, SEEK_SET); + ret = FileRead(fd, file_content, FILE_CONTENT_SIZE, WAIT_EVENT_BUFFILE_READ); elog(LOG, "[record_2pc_commit_timestamp] after append file: %s, " - "file_content: %s, ret: %d", tid, file_content, ret); + "file_content: %s, ret: %d", + path, file_content, ret); } - close(fd); + FileClose(fd); resetStringInfo(&content); pfree(content.data); @@ -4063,8 +4067,16 @@ void rename_2pc_records(const char *tid, TimestampTz timestamp) check_entry_key(tid, entry->key, "rename_2pc_records"); check_2pc_file(tid, entry->info, "rename_2pc_records"); + if (RecoveryInProgress()) + { + fd = PathNameOpenFile(new_path, O_RDWR | O_TRUNC | O_CREAT, + S_IRUSR | S_IWUSR); + } + else + { fd = PathNameOpenFile(new_path, O_RDWR | O_CREAT | O_EXCL, S_IRUSR | S_IWUSR); + } if (fd < 0) { elog(ERROR, "[rename_2pc_records] could not create file %s, " From dc3ecb2072bd6768cc69c2dbad4105e4b03544d1 Mon Sep 17 00:00:00 2001 From: bethding Date: Fri, 30 Apr 2021 17:58:59 +0800 Subject: [PATCH 366/578] fix warning --- src/backend/optimizer/util/pgxcship.c | 4 ++++ src/backend/parser/analyze.c | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/backend/optimizer/util/pgxcship.c b/src/backend/optimizer/util/pgxcship.c index ec2f0504..b749e028 100644 --- a/src/backend/optimizer/util/pgxcship.c +++ b/src/backend/optimizer/util/pgxcship.c @@ -158,6 +158,7 @@ static ExecNodes* pgxc_is_group_subquery_shippable(Query *query, Shippability_co static void pgxc_is_rte_subquery_shippable(Node *node, Shippability_context *sc_context); static bool pgxc_is_simple_subquery(Query *subquery); static bool pgxc_FQS_check_subquery_const(Query *query); +static ExecNodes *make_FQS_single_node(); #endif /* * Set the given reason in Shippability_context indicating why the query can not be @@ -1878,6 +1879,9 @@ pgxc_query_contains_only_pg_catalog(List *rtable) return true; } +/* + * Construct ExecNodes for single datanode to fqs + */ static ExecNodes * make_FQS_single_node() { diff --git a/src/backend/parser/analyze.c b/src/backend/parser/analyze.c index a5a5e96b..539cd7c8 100644 --- a/src/backend/parser/analyze.c +++ b/src/backend/parser/analyze.c @@ -1155,7 +1155,7 @@ transformInsertStmt(ParseState *pstate, InsertStmt *stmt) { Node *node = (Node *)lfirst(cell); if (!IsA(node, Param) && - !pgxc_is_expr_shippable(node, NULL)) + !pgxc_is_expr_shippable((Expr*)node, NULL)) { qry->isMultiValues = false; break; From 044e41270182b7616a7cd72060919b8d5b894229 Mon Sep 17 00:00:00 2001 From: challzhang Date: Mon, 3 May 2021 17:48:39 +0800 Subject: [PATCH 367/578] Merge to v2.15.19 from v3. Fix drop sequence will fail when cn drop sequence failed before. 
http://tapd.oa.com/TBase_C/bugtrace/bugs/view/1020385652086462863 --- src/backend/access/transam/gtm.c | 5 +- src/gtm/main/gtm_seq.c | 134 ++++++++++++++++++++++++++++++- src/include/gtm/gtm_seq.h | 125 ++++++++++++++-------------- 3 files changed, 199 insertions(+), 65 deletions(-) diff --git a/src/backend/access/transam/gtm.c b/src/backend/access/transam/gtm.c index 1e6c908b..1ac9e6d2 100644 --- a/src/backend/access/transam/gtm.c +++ b/src/backend/access/transam/gtm.c @@ -84,7 +84,8 @@ typedef struct List *g_CreateSeqList = NULL; List *g_DropSeqList = NULL; List *g_AlterSeqList = NULL; -#define GTM_SEQ_POSTFIX "_$TBASE$_" +/* constant postfix for sequence to avoid same name */ +#define GTM_SEQ_POSTFIX "_$TBASE$_sequence_temp_54312678712612" static void CheckConnection(void); static void ResetGTMConnection(void); static int GetGTMStoreStatus(GTMStorageStatus *header); @@ -156,7 +157,7 @@ void RegisterSeqDrop(char *name, int32 type) if (GTM_SEQ_FULL_NAME == type) { /* Here we can only add postfix for the temp sequence, or drop database will fail. */ - snprintf(temp, GTM_NAME_LEN, "%s_%d_%zu"GTM_SEQ_POSTFIX, name, MyProcPid, tp.tv_usec); + snprintf(temp, GTM_NAME_LEN, "%s"GTM_SEQ_POSTFIX, name); if (RenameSequenceGTM((char *)name, temp)) { elog(ERROR, "Deletion of sequences on database %s failed when backup old seq", name); diff --git a/src/gtm/main/gtm_seq.c b/src/gtm/main/gtm_seq.c index 9941cb30..d103ce2b 100644 --- a/src/gtm/main/gtm_seq.c +++ b/src/gtm/main/gtm_seq.c @@ -633,6 +633,8 @@ int GTM_SeqAlter(GTM_SequenceKey seqkey, int32 ret = 0; #endif GTM_SeqInfo *seqinfo = seq_find_seqinfo(seqkey); + GTM_SequenceKeyData newseqkey; + char *seqkey_copy; #ifdef __TBASE__ if (NULL ==seqinfo) @@ -644,11 +646,30 @@ int GTM_SeqAlter(GTM_SequenceKey seqkey, if (seqinfo == NULL) { - ereport(LOG, + /* Find seqinfo by using GTM_SEQ_POSTFIX seqkey when can not find by seqkey*/ + seqkey_copy = palloc(sizeof(char) * (seqkey->gsk_keylen + strlen(GTM_SEQ_POSTFIX))); + memcpy(seqkey_copy, seqkey->gsk_key, seqkey->gsk_keylen); + newseqkey.gsk_keylen = seqkey->gsk_keylen + strlen(GTM_SEQ_POSTFIX); + newseqkey.gsk_type = seqkey->gsk_type; + newseqkey.gsk_key = strcat(seqkey_copy, GTM_SEQ_POSTFIX); + + seqinfo = seq_find_seqinfo(&newseqkey); +#ifdef __TBASE__ + if (NULL == seqinfo) + { + GTM_FormSeqOfStore(&newseqkey); + seqinfo = seq_find_seqinfo(&newseqkey); + } +#endif + if (NULL == seqinfo) + { + ereport(ERROR, (EINVAL, errmsg("The sequence with the given key does not exist"))); return EINVAL; } + GTM_SeqRename(&newseqkey, seqkey, InvalidGlobalTransactionId); + } GTM_RWLockAcquire(&seqinfo->gs_lock, GTM_LOCKMODE_WRITE); @@ -979,6 +1000,7 @@ GTM_SeqRename(GTM_SequenceKey seqkey, GTM_SequenceKey newseqkey, int32 ret = 0; #endif GTM_SeqInfo *seqinfo = NULL; + GTM_SeqInfo *newseqinfo = NULL; int errcode = 0; MemoryContext oldContext; GTM_SeqAlteredInfo *alterinfo; @@ -995,11 +1017,28 @@ GTM_SeqRename(GTM_SequenceKey seqkey, GTM_SequenceKey newseqkey, /* replace old key by new key */ if (seqinfo == NULL) { + newseqinfo = seq_find_seqinfo(newseqkey); +#ifdef __TBASE__ + if (NULL == seqinfo) + { + GTM_FormSeqOfStore(newseqkey); + newseqinfo = seq_find_seqinfo(newseqkey); + } +#endif + + if(newseqinfo == NULL) + { ereport(LOG, (EINVAL, errmsg("Sequence with the key:%s does not exist", seqkey->gsk_key))); return EINVAL; } + ereport(LOG, + (EEXIST, + errmsg("Sequence with the key:%s has been renamed to %s", seqkey->gsk_key, newseqkey->gsk_key))); + seq_release_seqinfo(newseqinfo); + return 0; + } oldContext = 
MemoryContextSwitchTo(TopMostMemoryContext); alterinfo = (GTM_SeqAlteredInfo *) palloc0(sizeof (GTM_SeqAlteredInfo)); @@ -1044,6 +1083,8 @@ GTM_SeqGetCurrent(GTM_SequenceKey seqkey, char *coord_name, GTM_SeqInfo *seqinfo = NULL; int i; bool found = false; + GTM_SequenceKeyData newseqkey; + char *seqkey_copy = NULL; seqinfo = seq_find_seqinfo(seqkey); #ifdef __TBASE__ @@ -1059,11 +1100,30 @@ GTM_SeqGetCurrent(GTM_SequenceKey seqkey, char *coord_name, if (seqinfo == NULL) { + /* Find seqinfo by using GTM_SEQ_POSTFIX seqkey when can not find by seqkey*/ + seqkey_copy = palloc(sizeof(char) * (seqkey->gsk_keylen + strlen(GTM_SEQ_POSTFIX))); + memcpy(seqkey_copy, seqkey->gsk_key, seqkey->gsk_keylen); + newseqkey.gsk_keylen = seqkey->gsk_keylen + strlen(GTM_SEQ_POSTFIX); + newseqkey.gsk_type = seqkey->gsk_type; + newseqkey.gsk_key = strcat(seqkey_copy, GTM_SEQ_POSTFIX); + + seqinfo = seq_find_seqinfo(&newseqkey); +#ifdef __TBASE__ + if (NULL == seqinfo) + { + GTM_FormSeqOfStore(&newseqkey); + seqinfo = seq_find_seqinfo(&newseqkey); + } +#endif + if (NULL == seqinfo) + { ereport(ERROR, (EINVAL, errmsg("sequence \"%s\" does not exist", seqkey->gsk_key))); return; } + GTM_SeqRename(&newseqkey, seqkey, InvalidGlobalTransactionId); + } GTM_RWLockAcquire(&seqinfo->gs_lock, GTM_LOCKMODE_READ); @@ -1167,6 +1227,8 @@ GTM_SeqSetVal(GTM_SequenceKey seqkey, char *coord_name, int32 ret = 0; GTM_SeqInfo *seqinfo = seq_find_seqinfo(seqkey); #endif + GTM_SequenceKeyData newseqkey; + char *seqkey_copy; #ifdef __TBASE__ if (NULL ==seqinfo) @@ -1178,12 +1240,29 @@ GTM_SeqSetVal(GTM_SequenceKey seqkey, char *coord_name, if (seqinfo == NULL) { + /* Find seqinfo by using GTM_SEQ_POSTFIX seqkey when can not find by seqkey*/ + seqkey_copy = palloc(sizeof(char) * (seqkey->gsk_keylen + strlen(GTM_SEQ_POSTFIX))); + memcpy(seqkey_copy, seqkey->gsk_key, seqkey->gsk_keylen); + newseqkey.gsk_keylen = seqkey->gsk_keylen + strlen(GTM_SEQ_POSTFIX); + newseqkey.gsk_type = seqkey->gsk_type; + newseqkey.gsk_key = strcat(seqkey_copy, GTM_SEQ_POSTFIX); + seqinfo = seq_find_seqinfo(&newseqkey); +#ifdef __TBASE__ + if (NULL == seqinfo) + { + GTM_FormSeqOfStore(&newseqkey); + seqinfo = seq_find_seqinfo(&newseqkey); + } +#endif + if (NULL == seqinfo) + { ereport(LOG, (EINVAL, errmsg("The sequence with the given key does not exist"))); - return EINVAL; } + GTM_SeqRename(&newseqkey, seqkey, InvalidGlobalTransactionId); + } GTM_RWLockAcquire(&seqinfo->gs_lock, GTM_LOCKMODE_WRITE); @@ -1240,6 +1319,8 @@ GTM_SeqGetNext(GTM_SequenceKey seqkey, char *coord_name, char buf[100] = {0}; GTM_Sequence used_count = 0; #endif + GTM_SequenceKeyData newseqkey; + char *seqkey_copy; GTM_SeqInfo *seqinfo = seq_find_seqinfo(seqkey); #ifdef __TBASE__ @@ -1252,11 +1333,30 @@ GTM_SeqGetNext(GTM_SequenceKey seqkey, char *coord_name, if (seqinfo == NULL) { + /* Find seqinfo by using GTM_SEQ_POSTFIX seqkey when can not find by seqkey*/ + seqkey_copy = palloc(sizeof(char) * (seqkey->gsk_keylen + strlen(GTM_SEQ_POSTFIX))); + memcpy(seqkey_copy, seqkey->gsk_key, seqkey->gsk_keylen); + newseqkey.gsk_keylen = seqkey->gsk_keylen + strlen(GTM_SEQ_POSTFIX); + newseqkey.gsk_type = seqkey->gsk_type; + newseqkey.gsk_key = strcat(seqkey_copy, GTM_SEQ_POSTFIX); + + seqinfo = seq_find_seqinfo(&newseqkey); +#ifdef __TBASE__ + if (NULL == seqinfo) + { + GTM_FormSeqOfStore(&newseqkey); + seqinfo = seq_find_seqinfo(&newseqkey); + } +#endif + if (NULL == seqinfo) + { ereport(LOG, (EINVAL, errmsg("The sequence with the given key does not exist"))); return EINVAL; } + 
GTM_SeqRename(&newseqkey, seqkey, InvalidGlobalTransactionId); + } GTM_RWLockAcquire(&seqinfo->gs_lock, GTM_LOCKMODE_WRITE); @@ -1515,6 +1615,8 @@ GTM_SeqReset(GTM_SequenceKey seqkey) #ifdef __TBASE__ int32 ret = 0; #endif + GTM_SequenceKeyData newseqkey; + char *seqkey_copy; GTM_SeqInfo *seqinfo = seq_find_seqinfo(seqkey); #ifdef __TBASE__ @@ -1527,11 +1629,30 @@ GTM_SeqReset(GTM_SequenceKey seqkey) if (seqinfo == NULL) { + /* Find seqinfo by using GTM_SEQ_POSTFIX seqkey when can not find by seqkey*/ + seqkey_copy = palloc(sizeof(char) * (seqkey->gsk_keylen + strlen(GTM_SEQ_POSTFIX))); + memcpy(seqkey_copy, seqkey->gsk_key, seqkey->gsk_keylen); + newseqkey.gsk_keylen = seqkey->gsk_keylen + strlen(GTM_SEQ_POSTFIX); + newseqkey.gsk_type = seqkey->gsk_type; + newseqkey.gsk_key = strcat(seqkey_copy, GTM_SEQ_POSTFIX); + + seqinfo = seq_find_seqinfo(&newseqkey); +#ifdef __TBASE__ + if (NULL == seqinfo) + { + GTM_FormSeqOfStore(&newseqkey); + seqinfo = seq_find_seqinfo(&newseqkey); + } +#endif + if (NULL == seqinfo) + { ereport(LOG, (EINVAL, errmsg("The sequence with the given key does not exist"))); return EINVAL; } + GTM_SeqRename(&newseqkey, seqkey, InvalidGlobalTransactionId); + } GTM_RWLockAcquire(&seqinfo->gs_lock, GTM_LOCKMODE_WRITE); seqinfo->gs_value = seqinfo->gs_backedUpValue = seqinfo->gs_init_value; @@ -1597,6 +1718,7 @@ ProcessSequenceInitCommand(Port *myport, StringInfo message, bool is_backup) MemoryContext oldContext; const char *data; GlobalTransactionId gxid; + char postfix[100]; if (Recovery_IsStandby()) { @@ -1612,6 +1734,14 @@ ProcessSequenceInitCommand(Port *myport, StringInfo message, bool is_backup) seqkey.gsk_keylen = pq_getmsgint(message, sizeof (seqkey.gsk_keylen)); seqkey.gsk_key = (char *)pq_getmsgbytes(message, seqkey.gsk_keylen); + /* Check whether the seqkey contains GTM_SEQ_POSTFIX */ + if (seqkey.gsk_keylen > strlen(GTM_SEQ_POSTFIX)) + { + strncpy(postfix, seqkey.gsk_key + (seqkey.gsk_keylen - strlen(GTM_SEQ_POSTFIX) - 1), strlen(GTM_SEQ_POSTFIX)); + if (!strcmp(postfix, GTM_SEQ_POSTFIX)) + elog(ERROR, "postfix of sequence key can not be _$TBASE$_sequence_temp_54312678712612."); + } + /* * Read various sequence parameters */ diff --git a/src/include/gtm/gtm_seq.h b/src/include/gtm/gtm_seq.h index 14d5918c..d9dd072d 100644 --- a/src/include/gtm/gtm_seq.h +++ b/src/include/gtm/gtm_seq.h @@ -24,84 +24,87 @@ #define SEQ_RESERVE_COUNT 5000 #define SEQ_RESERVE_MIN_GAP 10 #endif +/* constant postfix for sequence to avoid same name */ +#define GTM_SEQ_POSTFIX "_$TBASE$_sequence_temp_54312678712612" +#define SEQ_KEY_LEN 256 typedef struct GTM_SeqLastVal { - char gs_coord_name[SP_NODE_NAME]; - int32 gs_coord_procid; - GTM_Sequence gs_last_value; + char gs_coord_name[SP_NODE_NAME]; + int32 gs_coord_procid; + GTM_Sequence gs_last_value; } GTM_SeqLastVal; typedef struct GTM_SeqInfo { - GTM_SequenceKey gs_key; - GTM_SequenceKey gs_oldkey; - GTM_Sequence gs_value; - GTM_Sequence gs_backedUpValue; - GTM_Sequence gs_init_value; - int32 gs_max_lastvals; - int32 gs_lastval_count; - GTM_SeqLastVal *gs_last_values; - GTM_Sequence gs_increment_by; /* increase step */ - GTM_Sequence gs_min_value; /* min value of the seq */ - GTM_Sequence gs_max_value; /* max value of the seq */ - bool gs_cycle; /* whether we are cycled */ - bool gs_called; - GlobalTransactionId gs_created_gxid; - - int32 gs_ref_count; - int32 gs_state; - GTM_RWLock gs_lock; + GTM_SequenceKey gs_key; + GTM_SequenceKey gs_oldkey; + GTM_Sequence gs_value; + GTM_Sequence gs_backedUpValue; + GTM_Sequence gs_init_value; + 
int32 gs_max_lastvals; + int32 gs_lastval_count; + GTM_SeqLastVal *gs_last_values; + GTM_Sequence gs_increment_by; /* increase step */ + GTM_Sequence gs_min_value; /* min value of the seq */ + GTM_Sequence gs_max_value; /* max value of the seq */ + bool gs_cycle; /* whether we are cycled */ + bool gs_called; + GlobalTransactionId gs_created_gxid; + + int32 gs_ref_count; + int32 gs_state; + GTM_RWLock gs_lock; #ifdef __TBASE__ - bool gs_reserved; /* whether we have reserve value*/ - GTMStorageHandle gs_store_handle; - int32 gs_left_reserve_seq_number; + bool gs_reserved; /* whether we have reserve value*/ + GTMStorageHandle gs_store_handle; + int32 gs_left_reserve_seq_number; #endif } GTM_SeqInfo; -#define SEQ_STATE_ACTIVE 1 -#define SEQ_STATE_DELETED 2 +#define SEQ_STATE_ACTIVE 1 +#define SEQ_STATE_DELETED 2 -#define SEQ_IS_ASCENDING(s) ((s)->gs_increment_by > 0) -#define SEQ_IS_CYCLE(s) ((s)->gs_cycle) -#define SEQ_IS_CALLED(s) ((s)->gs_called) +#define SEQ_IS_ASCENDING(s) ((s)->gs_increment_by > 0) +#define SEQ_IS_CYCLE(s) ((s)->gs_cycle) +#define SEQ_IS_CALLED(s) ((s)->gs_called) -#define SEQ_DEF_MAX_SEQVAL_ASCEND 0x7ffffffffffffffeLL -#define SEQ_DEF_MIN_SEQVAL_ASCEND 0x1 +#define SEQ_DEF_MAX_SEQVAL_ASCEND 0x7ffffffffffffffeLL +#define SEQ_DEF_MIN_SEQVAL_ASCEND 0x1 -#define SEQ_DEF_MAX_SEQVAL_DESCEND -0x1 -#define SEQ_DEF_MIN_SEQVAL_DESCEND -0x7ffffffffffffffeLL +#define SEQ_DEF_MAX_SEQVAL_DESCEND -0x1 +#define SEQ_DEF_MIN_SEQVAL_DESCEND -0x7ffffffffffffffeLL -#define SEQ_MAX_REFCOUNT 1024 +#define SEQ_MAX_REFCOUNT 1024 /* SEQUENCE Management */ void GTM_InitSeqManager(void); int GTM_SeqOpen(GTM_SequenceKey seqkey, - GTM_Sequence increment_by, - GTM_Sequence minval, - GTM_Sequence maxval, - GTM_Sequence startval, - bool cycle, - GlobalTransactionId gxid - ); + GTM_Sequence increment_by, + GTM_Sequence minval, + GTM_Sequence maxval, + GTM_Sequence startval, + bool cycle, + GlobalTransactionId gxid + ); int GTM_SeqAlter(GTM_SequenceKey seqkey, - GTM_Sequence increment_by, - GTM_Sequence minval, - GTM_Sequence maxval, - GTM_Sequence startval, - GTM_Sequence lastval, - bool cycle, - bool is_restart); + GTM_Sequence increment_by, + GTM_Sequence minval, + GTM_Sequence maxval, + GTM_Sequence startval, + GTM_Sequence lastval, + bool cycle, + bool is_restart); int GTM_SeqClose(GTM_SequenceKey seqkey, GlobalTransactionId gxid); int GTM_SeqRename(GTM_SequenceKey seqkey, GTM_SequenceKey newseqkey, - GlobalTransactionId gxid); + GlobalTransactionId gxid); int GTM_SeqGetNext(GTM_SequenceKey seqkey, char *coord_name, - int coord_procid, GTM_Sequence range, - GTM_Sequence *result, GTM_Sequence *rangemax); + int coord_procid, GTM_Sequence range, + GTM_Sequence *result, GTM_Sequence *rangemax); void GTM_SeqGetCurrent(GTM_SequenceKey seqkey, char *coord_name, - int coord_procid, GTM_Sequence *result); + int coord_procid, GTM_Sequence *result); int GTM_SeqSetVal(GTM_SequenceKey seqkey, char *coord_name, - int coord_procid, GTM_Sequence nextval, bool iscalled); + int coord_procid, GTM_Sequence nextval, bool iscalled); int GTM_SeqReset(GTM_SequenceKey seqkey); void ProcessSequenceInitCommand(Port *myport, StringInfo message, bool is_backup); @@ -120,14 +123,14 @@ void ProcessDBSequenceRenameCommand(Port *myport, StringInfo message, bool is_ba void decode_seq_key(char* value, GTM_SequenceKey seqkey); void GTM_SaveSeqInfo(FILE *ctlf); int GTM_SeqRestore(GTM_SequenceKey seqkey, - GTM_Sequence increment_by, - GTM_Sequence minval, - GTM_Sequence maxval, - GTM_Sequence startval, - GTM_Sequence curval, - int32 
state, - bool cycle, - bool called); + GTM_Sequence increment_by, + GTM_Sequence minval, + GTM_Sequence maxval, + GTM_Sequence startval, + GTM_Sequence curval, + int32 state, + bool cycle, + bool called); void GTM_CleanupSeqSession(char *coord_name, int coord_procid); From 91408ee3c7ad6f24e98fac9b3defdc2bcc57b299 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Thu, 6 May 2021 17:27:49 +0800 Subject: [PATCH 368/578] fix gtm_ctl -l logfile http://tapd.oa.com/TBase_C/bugtrace/bugs/view/1020385652086076245 (merge request !313) Squash merge branch 'sigmalin_v2' into 'Tbase_v2.15.19' * fix gtm_ctl -l logfile --- src/gtm/gtm_ctl/gtm_ctl.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/src/gtm/gtm_ctl/gtm_ctl.c b/src/gtm/gtm_ctl/gtm_ctl.c index 9aeeb322..fa26bed5 100644 --- a/src/gtm/gtm_ctl/gtm_ctl.c +++ b/src/gtm/gtm_ctl/gtm_ctl.c @@ -316,8 +316,8 @@ start_gtm(void) snprintf(gtm_startup_gts,MAXPGPATH,"-g %s",startup_gts); if (log_file != NULL) - len = snprintf(cmd, MAXPGPATH - 1, "\"%s\" %s%s -l %s %s &" , - gtm_app_path, gtmdata_opt, gtm_opts, log_file, gtm_startup_gts); + len = snprintf(cmd, MAXPGPATH - 1, "\"%s\" %s%s %s >> \"%s\" 2>&1 &" , + gtm_app_path, gtmdata_opt, gtm_opts, gtm_startup_gts, log_file); else len = snprintf(cmd, MAXPGPATH - 1, "\"%s\" %s%s %s < \"%s\" 2>&1 &" , gtm_app_path, gtmdata_opt, gtm_opts, gtm_startup_gts, DEVNULL); @@ -348,14 +348,6 @@ static int RunAsDaemon(char *cmd) int status; case 0: - /* - * Using fileno(xxx) may encounter trivial error because xxx may - * have been closed at somewhere else and fileno() may fail. - * Its safer to use literal file descriptor here. - */ - close(0); - close(1); - close(2); if ((status = system(cmd)) == -1) /* * Same behavior as /bin/sh could not be From 4e37d605af52c7d433e05398b71e61ca05922b0c Mon Sep 17 00:00:00 2001 From: hanwayjiang Date: Sat, 8 May 2021 15:46:18 +0800 Subject: [PATCH 369/578] =?UTF-8?q?=E3=80=90=E3=80=90=20TBase=E5=86=85?= =?UTF-8?q?=E6=A0=B8=20V2=E3=80=91TBase=20V2=E6=94=AF=E6=8C=81dblink?= =?UTF-8?q?=E6=8F=92=E4=BB=B6=EF=BC=8Cdblink=E6=8F=92=E4=BB=B6=E6=94=AF?= =?UTF-8?q?=E6=8C=81copy=E5=8A=9F=E8=83=BD=E3=80=91=20http://tapd.oa.com/p?= =?UTF-8?q?gxz/prong/stories/view/1010092131864638363=20(merge=20request?= =?UTF-8?q?=20!315)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Squash merge branch 'tbase_v2_dblink' into 'Tbase_v2.15.19' * 【【 TBase内核 V2】TBase V2支持dblink插件,dblink插件支持copy功能】 http://tapd.oa.com/pgxz/prong/stories/view/1010092131864638363 --- contrib/dblink/dblink--1.2.sql | 5 + contrib/dblink/dblink.c | 4904 +++++++++++++++++--------------- src/backend/tcop/utility.c | 20 +- 3 files changed, 2578 insertions(+), 2351 deletions(-) diff --git a/contrib/dblink/dblink--1.2.sql b/contrib/dblink/dblink--1.2.sql index 405eccb0..fabe10fc 100644 --- a/contrib/dblink/dblink--1.2.sql +++ b/contrib/dblink/dblink--1.2.sql @@ -178,6 +178,11 @@ RETURNS int4 AS 'MODULE_PATHNAME', 'dblink_is_busy' LANGUAGE C STRICT PARALLEL RESTRICTED; +CREATE FUNCTION dblink_copy_table(text, text, text, text, text) +RETURNS int4 +AS 'MODULE_PATHNAME', 'dblink_copy_table' +LANGUAGE C STRICT PARALLEL RESTRICTED; + CREATE FUNCTION dblink_get_result(text) RETURNS SETOF record AS 'MODULE_PATHNAME', 'dblink_get_result' diff --git a/contrib/dblink/dblink.c b/contrib/dblink/dblink.c index bb39b2c6..fb1c99ce 100644 --- a/contrib/dblink/dblink.c +++ b/contrib/dblink/dblink.c @@ -44,12 +44,14 @@ #include "catalog/pg_foreign_server.h" #include "catalog/pg_type.h" 
#include "catalog/pg_user_mapping.h" +#include "commands/copy.h" #include "executor/spi.h" #include "foreign/foreign.h" #include "funcapi.h" #include "lib/stringinfo.h" #include "mb/pg_wchar.h" #include "miscadmin.h" +#include "parser/parse_relation.h" #include "parser/scansup.h" #include "utils/acl.h" #include "utils/builtins.h" @@ -65,21 +67,21 @@ PG_MODULE_MAGIC; typedef struct remoteConn { - PGconn *conn; /* Hold the remote connection */ - int openCursorCount; /* The number of open cursors */ - bool newXactForCursor; /* Opened a transaction for a cursor */ + PGconn *conn; /* Hold the remote connection */ + int openCursorCount; /* The number of open cursors */ + bool newXactForCursor; /* Opened a transaction for a cursor */ } remoteConn; typedef struct storeInfo { - FunctionCallInfo fcinfo; - Tuplestorestate *tuplestore; - AttInMetadata *attinmeta; - MemoryContext tmpcontext; - char **cstrs; - /* temp storage for results to avoid leaks on exception */ - PGresult *last_res; - PGresult *cur_res; + FunctionCallInfo fcinfo; + Tuplestorestate *tuplestore; + AttInMetadata *attinmeta; + MemoryContext tmpcontext; + char **cstrs; + /* temp storage for results to avoid leaks on exception */ + PGresult *last_res; + PGresult *cur_res; } storeInfo; /* @@ -88,12 +90,12 @@ typedef struct storeInfo static Datum dblink_record_internal(FunctionCallInfo fcinfo, bool is_async); static void prepTuplestoreResult(FunctionCallInfo fcinfo); static void materializeResult(FunctionCallInfo fcinfo, PGconn *conn, - PGresult *res); + PGresult *res); static void materializeQueryResult(FunctionCallInfo fcinfo, - PGconn *conn, - const char *conname, - const char *sql, - bool fail); + PGconn *conn, + const char *conname, + const char *sql, + bool fail); static PGresult *storeQueryResult(volatile storeInfo *sinfo, PGconn *conn, const char *sql); static void storeRow(volatile storeInfo *sinfo, PGresult *res, bool first); static remoteConn *getConnectionByName(const char *name); @@ -106,22 +108,22 @@ static char *get_sql_insert(Relation rel, int *pkattnums, int pknumatts, char ** static char *get_sql_delete(Relation rel, int *pkattnums, int pknumatts, char **tgt_pkattvals); static char *get_sql_update(Relation rel, int *pkattnums, int pknumatts, char **src_pkattvals, char **tgt_pkattvals); static char *quote_ident_cstr(char *rawstr); -static int get_attnum_pk_pos(int *pkattnums, int pknumatts, int key); +static int get_attnum_pk_pos(int *pkattnums, int pknumatts, int key); static HeapTuple get_tuple_of_interest(Relation rel, int *pkattnums, int pknumatts, char **src_pkattvals); static Relation get_rel_from_relname(text *relname_text, LOCKMODE lockmode, AclMode aclmode); static char *generate_relation_name(Relation rel); static void dblink_connstr_check(const char *connstr); static void dblink_security_check(PGconn *conn, remoteConn *rconn); static void dblink_res_error(PGconn *conn, const char *conname, PGresult *res, - const char *dblink_context_msg, bool fail); + const char *dblink_context_msg, bool fail); static char *get_connect_string(const char *servername); static char *escape_param_str(const char *from); static void validate_pkattnums(Relation rel, - int2vector *pkattnums_arg, int32 pknumatts_arg, - int **pkattnums, int *pknumatts); + int2vector *pkattnums_arg, int32 pknumatts_arg, + int **pkattnums, int *pknumatts); static bool is_valid_dblink_option(const PQconninfoOption *options, - const char *option, Oid context); -static int applyRemoteGucs(PGconn *conn); + const char *option, Oid context); +static int 
applyRemoteGucs(PGconn *conn); static void restoreLocalGucs(int nestlevel); /* Global */ @@ -129,16 +131,16 @@ static remoteConn *pconn = NULL; static HTAB *remoteConnHash = NULL; /* - * Following is list that holds multiple remote connections. - * Calling convention of each dblink function changes to accept - * connection name as the first parameter. The connection list is - * much like ecpg e.g. a mapping between a name and a PGconn object. + * Following is list that holds multiple remote connections. + * Calling convention of each dblink function changes to accept + * connection name as the first parameter. The connection list is + * much like ecpg e.g. a mapping between a name and a PGconn object. */ typedef struct remoteConnHashEnt { - char name[NAMEDATALEN]; - remoteConn *rconn; + char name[NAMEDATALEN]; + remoteConn *rconn; } remoteConnHashEnt; /* initial number of connection hashes */ @@ -147,104 +149,104 @@ typedef struct remoteConnHashEnt static char * xpstrdup(const char *in) { - if (in == NULL) - return NULL; - return pstrdup(in); + if (in == NULL) + return NULL; + return pstrdup(in); } static void pg_attribute_noreturn() dblink_res_internalerror(PGconn *conn, PGresult *res, const char *p2) { - char *msg = pchomp(PQerrorMessage(conn)); + char *msg = pchomp(PQerrorMessage(conn)); - if (res) - PQclear(res); - elog(ERROR, "%s: %s", p2, msg); + if (res) + PQclear(res); + elog(ERROR, "%s: %s", p2, msg); } static void pg_attribute_noreturn() dblink_conn_not_avail(const char *conname) { - if (conname) - ereport(ERROR, - (errcode(ERRCODE_CONNECTION_DOES_NOT_EXIST), - errmsg("connection \"%s\" not available", conname))); - else - ereport(ERROR, - (errcode(ERRCODE_CONNECTION_DOES_NOT_EXIST), - errmsg("connection not available"))); + if (conname) + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_DOES_NOT_EXIST), + errmsg("connection \"%s\" not available", conname))); + else + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_DOES_NOT_EXIST), + errmsg("connection not available"))); } static void dblink_get_conn(char *conname_or_str, - PGconn *volatile *conn_p, char **conname_p, volatile bool *freeconn_p) + PGconn *volatile *conn_p, char **conname_p, volatile bool *freeconn_p) { - remoteConn *rconn = getConnectionByName(conname_or_str); - PGconn *conn; - char *conname; - bool freeconn; - - if (rconn) - { - conn = rconn->conn; - conname = conname_or_str; - freeconn = false; - } - else - { - const char *connstr; - - connstr = get_connect_string(conname_or_str); - if (connstr == NULL) - connstr = conname_or_str; - dblink_connstr_check(connstr); - conn = PQconnectdb(connstr); - if (PQstatus(conn) == CONNECTION_BAD) - { - char *msg = pchomp(PQerrorMessage(conn)); - - PQfinish(conn); - ereport(ERROR, - (errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION), - errmsg("could not establish connection"), - errdetail_internal("%s", msg))); - } - dblink_security_check(conn, rconn); - if (PQclientEncoding(conn) != GetDatabaseEncoding()) - PQsetClientEncoding(conn, GetDatabaseEncodingName()); - freeconn = true; - conname = NULL; - } - - *conn_p = conn; - *conname_p = conname; - *freeconn_p = freeconn; + remoteConn *rconn = getConnectionByName(conname_or_str); + PGconn *conn; + char *conname; + bool freeconn; + + if (rconn) + { + conn = rconn->conn; + conname = conname_or_str; + freeconn = false; + } + else + { + const char *connstr; + + connstr = get_connect_string(conname_or_str); + if (connstr == NULL) + connstr = conname_or_str; + dblink_connstr_check(connstr); + conn = PQconnectdb(connstr); + if 
(PQstatus(conn) == CONNECTION_BAD) + { + char *msg = pchomp(PQerrorMessage(conn)); + + PQfinish(conn); + ereport(ERROR, + (errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION), + errmsg("could not establish connection"), + errdetail_internal("%s", msg))); + } + dblink_security_check(conn, rconn); + if (PQclientEncoding(conn) != GetDatabaseEncoding()) + PQsetClientEncoding(conn, GetDatabaseEncodingName()); + freeconn = true; + conname = NULL; + } + + *conn_p = conn; + *conname_p = conname; + *freeconn_p = freeconn; } static PGconn * dblink_get_named_conn(const char *conname) { - remoteConn *rconn = getConnectionByName(conname); + remoteConn *rconn = getConnectionByName(conname); - if (rconn) - return rconn->conn; + if (rconn) + return rconn->conn; - dblink_conn_not_avail(conname); - return NULL; /* keep compiler quiet */ + dblink_conn_not_avail(conname); + return NULL; /* keep compiler quiet */ } static void dblink_init(void) { - if (!pconn) - { - pconn = (remoteConn *) MemoryContextAlloc(TopMemoryContext, sizeof(remoteConn)); - pconn->conn = NULL; - pconn->openCursorCount = 0; - pconn->newXactForCursor = FALSE; - } + if (!pconn) + { + pconn = (remoteConn *) MemoryContextAlloc(TopMemoryContext, sizeof(remoteConn)); + pconn->conn = NULL; + pconn->openCursorCount = 0; + pconn->newXactForCursor = FALSE; + } } /* @@ -254,69 +256,69 @@ PG_FUNCTION_INFO_V1(dblink_connect); Datum dblink_connect(PG_FUNCTION_ARGS) { - char *conname_or_str = NULL; - char *connstr = NULL; - char *connname = NULL; - char *msg; - PGconn *conn = NULL; - remoteConn *rconn = NULL; - - dblink_init(); - - if (PG_NARGS() == 2) - { - conname_or_str = text_to_cstring(PG_GETARG_TEXT_PP(1)); - connname = text_to_cstring(PG_GETARG_TEXT_PP(0)); - } - else if (PG_NARGS() == 1) - conname_or_str = text_to_cstring(PG_GETARG_TEXT_PP(0)); - - if (connname) - rconn = (remoteConn *) MemoryContextAlloc(TopMemoryContext, - sizeof(remoteConn)); - - /* first check for valid foreign data server */ - connstr = get_connect_string(conname_or_str); - if (connstr == NULL) - connstr = conname_or_str; - - /* check password in connection string if not superuser */ - dblink_connstr_check(connstr); - conn = PQconnectdb(connstr); - - if (PQstatus(conn) == CONNECTION_BAD) - { - msg = pchomp(PQerrorMessage(conn)); - PQfinish(conn); - if (rconn) - pfree(rconn); - - ereport(ERROR, - (errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION), - errmsg("could not establish connection"), - errdetail_internal("%s", msg))); - } - - /* check password actually used if not superuser */ - dblink_security_check(conn, rconn); - - /* attempt to set client encoding to match server encoding, if needed */ - if (PQclientEncoding(conn) != GetDatabaseEncoding()) - PQsetClientEncoding(conn, GetDatabaseEncodingName()); - - if (connname) - { - rconn->conn = conn; - createNewConnection(connname, rconn); - } - else - { - if (pconn->conn) - PQfinish(pconn->conn); - pconn->conn = conn; - } - - PG_RETURN_TEXT_P(cstring_to_text("OK")); + char *conname_or_str = NULL; + char *connstr = NULL; + char *connname = NULL; + char *msg; + PGconn *conn = NULL; + remoteConn *rconn = NULL; + + dblink_init(); + + if (PG_NARGS() == 2) + { + conname_or_str = text_to_cstring(PG_GETARG_TEXT_PP(1)); + connname = text_to_cstring(PG_GETARG_TEXT_PP(0)); + } + else if (PG_NARGS() == 1) + conname_or_str = text_to_cstring(PG_GETARG_TEXT_PP(0)); + + if (connname) + rconn = (remoteConn *) MemoryContextAlloc(TopMemoryContext, + sizeof(remoteConn)); + + /* first check for valid foreign data server */ + 
connstr = get_connect_string(conname_or_str); + if (connstr == NULL) + connstr = conname_or_str; + + /* check password in connection string if not superuser */ + dblink_connstr_check(connstr); + conn = PQconnectdb(connstr); + + if (PQstatus(conn) == CONNECTION_BAD) + { + msg = pchomp(PQerrorMessage(conn)); + PQfinish(conn); + if (rconn) + pfree(rconn); + + ereport(ERROR, + (errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION), + errmsg("could not establish connection"), + errdetail_internal("%s", msg))); + } + + /* check password actually used if not superuser */ + dblink_security_check(conn, rconn); + + /* attempt to set client encoding to match server encoding, if needed */ + if (PQclientEncoding(conn) != GetDatabaseEncoding()) + PQsetClientEncoding(conn, GetDatabaseEncodingName()); + + if (connname) + { + rconn->conn = conn; + createNewConnection(connname, rconn); + } + else + { + if (pconn->conn) + PQfinish(pconn->conn); + pconn->conn = conn; + } + + PG_RETURN_TEXT_P(cstring_to_text("OK")); } /* @@ -326,35 +328,35 @@ PG_FUNCTION_INFO_V1(dblink_disconnect); Datum dblink_disconnect(PG_FUNCTION_ARGS) { - char *conname = NULL; - remoteConn *rconn = NULL; - PGconn *conn = NULL; - - dblink_init(); - - if (PG_NARGS() == 1) - { - conname = text_to_cstring(PG_GETARG_TEXT_PP(0)); - rconn = getConnectionByName(conname); - if (rconn) - conn = rconn->conn; - } - else - conn = pconn->conn; - - if (!conn) - dblink_conn_not_avail(conname); - - PQfinish(conn); - if (rconn) - { - deleteConnection(conname); - pfree(rconn); - } - else - pconn->conn = NULL; - - PG_RETURN_TEXT_P(cstring_to_text("OK")); + char *conname = NULL; + remoteConn *rconn = NULL; + PGconn *conn = NULL; + + dblink_init(); + + if (PG_NARGS() == 1) + { + conname = text_to_cstring(PG_GETARG_TEXT_PP(0)); + rconn = getConnectionByName(conname); + if (rconn) + conn = rconn->conn; + } + else + conn = pconn->conn; + + if (!conn) + dblink_conn_not_avail(conname); + + PQfinish(conn); + if (rconn) + { + deleteConnection(conname); + pfree(rconn); + } + else + pconn->conn = NULL; + + PG_RETURN_TEXT_P(cstring_to_text("OK")); } /* @@ -364,89 +366,89 @@ PG_FUNCTION_INFO_V1(dblink_open); Datum dblink_open(PG_FUNCTION_ARGS) { - PGresult *res = NULL; - PGconn *conn; - char *curname = NULL; - char *sql = NULL; - char *conname = NULL; - StringInfoData buf; - remoteConn *rconn = NULL; - bool fail = true; /* default to backward compatible behavior */ - - dblink_init(); - initStringInfo(&buf); - - if (PG_NARGS() == 2) - { - /* text,text */ - curname = text_to_cstring(PG_GETARG_TEXT_PP(0)); - sql = text_to_cstring(PG_GETARG_TEXT_PP(1)); - rconn = pconn; - } - else if (PG_NARGS() == 3) - { - /* might be text,text,text or text,text,bool */ - if (get_fn_expr_argtype(fcinfo->flinfo, 2) == BOOLOID) - { - curname = text_to_cstring(PG_GETARG_TEXT_PP(0)); - sql = text_to_cstring(PG_GETARG_TEXT_PP(1)); - fail = PG_GETARG_BOOL(2); - rconn = pconn; - } - else - { - conname = text_to_cstring(PG_GETARG_TEXT_PP(0)); - curname = text_to_cstring(PG_GETARG_TEXT_PP(1)); - sql = text_to_cstring(PG_GETARG_TEXT_PP(2)); - rconn = getConnectionByName(conname); - } - } - else if (PG_NARGS() == 4) - { - /* text,text,text,bool */ - conname = text_to_cstring(PG_GETARG_TEXT_PP(0)); - curname = text_to_cstring(PG_GETARG_TEXT_PP(1)); - sql = text_to_cstring(PG_GETARG_TEXT_PP(2)); - fail = PG_GETARG_BOOL(3); - rconn = getConnectionByName(conname); - } - - if (!rconn || !rconn->conn) - dblink_conn_not_avail(conname); - - conn = rconn->conn; - - /* If we are not in a transaction, 
start one */ - if (PQtransactionStatus(conn) == PQTRANS_IDLE) - { - res = PQexec(conn, "BEGIN"); - if (PQresultStatus(res) != PGRES_COMMAND_OK) - dblink_res_internalerror(conn, res, "begin error"); - PQclear(res); - rconn->newXactForCursor = TRUE; - - /* - * Since transaction state was IDLE, we force cursor count to - * initially be 0. This is needed as a previous ABORT might have wiped - * out our transaction without maintaining the cursor count for us. - */ - rconn->openCursorCount = 0; - } - - /* if we started a transaction, increment cursor count */ - if (rconn->newXactForCursor) - (rconn->openCursorCount)++; - - appendStringInfo(&buf, "DECLARE %s CURSOR FOR %s", curname, sql); - res = PQexec(conn, buf.data); - if (!res || PQresultStatus(res) != PGRES_COMMAND_OK) - { - dblink_res_error(conn, conname, res, "could not open cursor", fail); - PG_RETURN_TEXT_P(cstring_to_text("ERROR")); - } - - PQclear(res); - PG_RETURN_TEXT_P(cstring_to_text("OK")); + PGresult *res = NULL; + PGconn *conn; + char *curname = NULL; + char *sql = NULL; + char *conname = NULL; + StringInfoData buf; + remoteConn *rconn = NULL; + bool fail = true; /* default to backward compatible behavior */ + + dblink_init(); + initStringInfo(&buf); + + if (PG_NARGS() == 2) + { + /* text,text */ + curname = text_to_cstring(PG_GETARG_TEXT_PP(0)); + sql = text_to_cstring(PG_GETARG_TEXT_PP(1)); + rconn = pconn; + } + else if (PG_NARGS() == 3) + { + /* might be text,text,text or text,text,bool */ + if (get_fn_expr_argtype(fcinfo->flinfo, 2) == BOOLOID) + { + curname = text_to_cstring(PG_GETARG_TEXT_PP(0)); + sql = text_to_cstring(PG_GETARG_TEXT_PP(1)); + fail = PG_GETARG_BOOL(2); + rconn = pconn; + } + else + { + conname = text_to_cstring(PG_GETARG_TEXT_PP(0)); + curname = text_to_cstring(PG_GETARG_TEXT_PP(1)); + sql = text_to_cstring(PG_GETARG_TEXT_PP(2)); + rconn = getConnectionByName(conname); + } + } + else if (PG_NARGS() == 4) + { + /* text,text,text,bool */ + conname = text_to_cstring(PG_GETARG_TEXT_PP(0)); + curname = text_to_cstring(PG_GETARG_TEXT_PP(1)); + sql = text_to_cstring(PG_GETARG_TEXT_PP(2)); + fail = PG_GETARG_BOOL(3); + rconn = getConnectionByName(conname); + } + + if (!rconn || !rconn->conn) + dblink_conn_not_avail(conname); + + conn = rconn->conn; + + /* If we are not in a transaction, start one */ + if (PQtransactionStatus(conn) == PQTRANS_IDLE) + { + res = PQexec(conn, "BEGIN"); + if (PQresultStatus(res) != PGRES_COMMAND_OK) + dblink_res_internalerror(conn, res, "begin error"); + PQclear(res); + rconn->newXactForCursor = TRUE; + + /* + * Since transaction state was IDLE, we force cursor count to + * initially be 0. This is needed as a previous ABORT might have wiped + * out our transaction without maintaining the cursor count for us. 
+ */ + rconn->openCursorCount = 0; + } + + /* if we started a transaction, increment cursor count */ + if (rconn->newXactForCursor) + (rconn->openCursorCount)++; + + appendStringInfo(&buf, "DECLARE %s CURSOR FOR %s", curname, sql); + res = PQexec(conn, buf.data); + if (!res || PQresultStatus(res) != PGRES_COMMAND_OK) + { + dblink_res_error(conn, conname, res, "could not open cursor", fail); + PG_RETURN_TEXT_P(cstring_to_text("ERROR")); + } + + PQclear(res); + PG_RETURN_TEXT_P(cstring_to_text("OK")); } /* @@ -456,83 +458,83 @@ PG_FUNCTION_INFO_V1(dblink_close); Datum dblink_close(PG_FUNCTION_ARGS) { - PGconn *conn; - PGresult *res = NULL; - char *curname = NULL; - char *conname = NULL; - StringInfoData buf; - remoteConn *rconn = NULL; - bool fail = true; /* default to backward compatible behavior */ - - dblink_init(); - initStringInfo(&buf); - - if (PG_NARGS() == 1) - { - /* text */ - curname = text_to_cstring(PG_GETARG_TEXT_PP(0)); - rconn = pconn; - } - else if (PG_NARGS() == 2) - { - /* might be text,text or text,bool */ - if (get_fn_expr_argtype(fcinfo->flinfo, 1) == BOOLOID) - { - curname = text_to_cstring(PG_GETARG_TEXT_PP(0)); - fail = PG_GETARG_BOOL(1); - rconn = pconn; - } - else - { - conname = text_to_cstring(PG_GETARG_TEXT_PP(0)); - curname = text_to_cstring(PG_GETARG_TEXT_PP(1)); - rconn = getConnectionByName(conname); - } - } - if (PG_NARGS() == 3) - { - /* text,text,bool */ - conname = text_to_cstring(PG_GETARG_TEXT_PP(0)); - curname = text_to_cstring(PG_GETARG_TEXT_PP(1)); - fail = PG_GETARG_BOOL(2); - rconn = getConnectionByName(conname); - } - - if (!rconn || !rconn->conn) - dblink_conn_not_avail(conname); - - conn = rconn->conn; - - appendStringInfo(&buf, "CLOSE %s", curname); - - /* close the cursor */ - res = PQexec(conn, buf.data); - if (!res || PQresultStatus(res) != PGRES_COMMAND_OK) - { - dblink_res_error(conn, conname, res, "could not close cursor", fail); - PG_RETURN_TEXT_P(cstring_to_text("ERROR")); - } - - PQclear(res); - - /* if we started a transaction, decrement cursor count */ - if (rconn->newXactForCursor) - { - (rconn->openCursorCount)--; - - /* if count is zero, commit the transaction */ - if (rconn->openCursorCount == 0) - { - rconn->newXactForCursor = FALSE; - - res = PQexec(conn, "COMMIT"); - if (PQresultStatus(res) != PGRES_COMMAND_OK) - dblink_res_internalerror(conn, res, "commit error"); - PQclear(res); - } - } - - PG_RETURN_TEXT_P(cstring_to_text("OK")); + PGconn *conn; + PGresult *res = NULL; + char *curname = NULL; + char *conname = NULL; + StringInfoData buf; + remoteConn *rconn = NULL; + bool fail = true; /* default to backward compatible behavior */ + + dblink_init(); + initStringInfo(&buf); + + if (PG_NARGS() == 1) + { + /* text */ + curname = text_to_cstring(PG_GETARG_TEXT_PP(0)); + rconn = pconn; + } + else if (PG_NARGS() == 2) + { + /* might be text,text or text,bool */ + if (get_fn_expr_argtype(fcinfo->flinfo, 1) == BOOLOID) + { + curname = text_to_cstring(PG_GETARG_TEXT_PP(0)); + fail = PG_GETARG_BOOL(1); + rconn = pconn; + } + else + { + conname = text_to_cstring(PG_GETARG_TEXT_PP(0)); + curname = text_to_cstring(PG_GETARG_TEXT_PP(1)); + rconn = getConnectionByName(conname); + } + } + if (PG_NARGS() == 3) + { + /* text,text,bool */ + conname = text_to_cstring(PG_GETARG_TEXT_PP(0)); + curname = text_to_cstring(PG_GETARG_TEXT_PP(1)); + fail = PG_GETARG_BOOL(2); + rconn = getConnectionByName(conname); + } + + if (!rconn || !rconn->conn) + dblink_conn_not_avail(conname); + + conn = rconn->conn; + + appendStringInfo(&buf, "CLOSE %s", 
curname); + + /* close the cursor */ + res = PQexec(conn, buf.data); + if (!res || PQresultStatus(res) != PGRES_COMMAND_OK) + { + dblink_res_error(conn, conname, res, "could not close cursor", fail); + PG_RETURN_TEXT_P(cstring_to_text("ERROR")); + } + + PQclear(res); + + /* if we started a transaction, decrement cursor count */ + if (rconn->newXactForCursor) + { + (rconn->openCursorCount)--; + + /* if count is zero, commit the transaction */ + if (rconn->openCursorCount == 0) + { + rconn->newXactForCursor = FALSE; + + res = PQexec(conn, "COMMIT"); + if (PQresultStatus(res) != PGRES_COMMAND_OK) + dblink_res_internalerror(conn, res, "commit error"); + PQclear(res); + } + } + + PG_RETURN_TEXT_P(cstring_to_text("OK")); } /* @@ -542,91 +544,91 @@ PG_FUNCTION_INFO_V1(dblink_fetch); Datum dblink_fetch(PG_FUNCTION_ARGS) { - PGresult *res = NULL; - char *conname = NULL; - remoteConn *rconn = NULL; - PGconn *conn = NULL; - StringInfoData buf; - char *curname = NULL; - int howmany = 0; - bool fail = true; /* default to backward compatible */ - - prepTuplestoreResult(fcinfo); - - dblink_init(); - - if (PG_NARGS() == 4) - { - /* text,text,int,bool */ - conname = text_to_cstring(PG_GETARG_TEXT_PP(0)); - curname = text_to_cstring(PG_GETARG_TEXT_PP(1)); - howmany = PG_GETARG_INT32(2); - fail = PG_GETARG_BOOL(3); - - rconn = getConnectionByName(conname); - if (rconn) - conn = rconn->conn; - } - else if (PG_NARGS() == 3) - { - /* text,text,int or text,int,bool */ - if (get_fn_expr_argtype(fcinfo->flinfo, 2) == BOOLOID) - { - curname = text_to_cstring(PG_GETARG_TEXT_PP(0)); - howmany = PG_GETARG_INT32(1); - fail = PG_GETARG_BOOL(2); - conn = pconn->conn; - } - else - { - conname = text_to_cstring(PG_GETARG_TEXT_PP(0)); - curname = text_to_cstring(PG_GETARG_TEXT_PP(1)); - howmany = PG_GETARG_INT32(2); - - rconn = getConnectionByName(conname); - if (rconn) - conn = rconn->conn; - } - } - else if (PG_NARGS() == 2) - { - /* text,int */ - curname = text_to_cstring(PG_GETARG_TEXT_PP(0)); - howmany = PG_GETARG_INT32(1); - conn = pconn->conn; - } - - if (!conn) - dblink_conn_not_avail(conname); - - initStringInfo(&buf); - appendStringInfo(&buf, "FETCH %d FROM %s", howmany, curname); - - /* - * Try to execute the query. Note that since libpq uses malloc, the - * PGresult will be long-lived even though we are still in a short-lived - * memory context. 
- */ - res = PQexec(conn, buf.data); - if (!res || - (PQresultStatus(res) != PGRES_COMMAND_OK && - PQresultStatus(res) != PGRES_TUPLES_OK)) - { - dblink_res_error(conn, conname, res, - "could not fetch from cursor", fail); - return (Datum) 0; - } - else if (PQresultStatus(res) == PGRES_COMMAND_OK) - { - /* cursor does not exist - closed already or bad name */ - PQclear(res); - ereport(ERROR, - (errcode(ERRCODE_INVALID_CURSOR_NAME), - errmsg("cursor \"%s\" does not exist", curname))); - } - - materializeResult(fcinfo, conn, res); - return (Datum) 0; + PGresult *res = NULL; + char *conname = NULL; + remoteConn *rconn = NULL; + PGconn *conn = NULL; + StringInfoData buf; + char *curname = NULL; + int howmany = 0; + bool fail = true; /* default to backward compatible */ + + prepTuplestoreResult(fcinfo); + + dblink_init(); + + if (PG_NARGS() == 4) + { + /* text,text,int,bool */ + conname = text_to_cstring(PG_GETARG_TEXT_PP(0)); + curname = text_to_cstring(PG_GETARG_TEXT_PP(1)); + howmany = PG_GETARG_INT32(2); + fail = PG_GETARG_BOOL(3); + + rconn = getConnectionByName(conname); + if (rconn) + conn = rconn->conn; + } + else if (PG_NARGS() == 3) + { + /* text,text,int or text,int,bool */ + if (get_fn_expr_argtype(fcinfo->flinfo, 2) == BOOLOID) + { + curname = text_to_cstring(PG_GETARG_TEXT_PP(0)); + howmany = PG_GETARG_INT32(1); + fail = PG_GETARG_BOOL(2); + conn = pconn->conn; + } + else + { + conname = text_to_cstring(PG_GETARG_TEXT_PP(0)); + curname = text_to_cstring(PG_GETARG_TEXT_PP(1)); + howmany = PG_GETARG_INT32(2); + + rconn = getConnectionByName(conname); + if (rconn) + conn = rconn->conn; + } + } + else if (PG_NARGS() == 2) + { + /* text,int */ + curname = text_to_cstring(PG_GETARG_TEXT_PP(0)); + howmany = PG_GETARG_INT32(1); + conn = pconn->conn; + } + + if (!conn) + dblink_conn_not_avail(conname); + + initStringInfo(&buf); + appendStringInfo(&buf, "FETCH %d FROM %s", howmany, curname); + + /* + * Try to execute the query. Note that since libpq uses malloc, the + * PGresult will be long-lived even though we are still in a short-lived + * memory context. 
+ */ + res = PQexec(conn, buf.data); + if (!res || + (PQresultStatus(res) != PGRES_COMMAND_OK && + PQresultStatus(res) != PGRES_TUPLES_OK)) + { + dblink_res_error(conn, conname, res, + "could not fetch from cursor", fail); + return (Datum) 0; + } + else if (PQresultStatus(res) == PGRES_COMMAND_OK) + { + /* cursor does not exist - closed already or bad name */ + PQclear(res); + ereport(ERROR, + (errcode(ERRCODE_INVALID_CURSOR_NAME), + errmsg("cursor \"%s\" does not exist", curname))); + } + + materializeResult(fcinfo, conn, res); + return (Datum) 0; } /* @@ -636,158 +638,382 @@ PG_FUNCTION_INFO_V1(dblink_record); Datum dblink_record(PG_FUNCTION_ARGS) { - return dblink_record_internal(fcinfo, false); + return dblink_record_internal(fcinfo, false); } PG_FUNCTION_INFO_V1(dblink_send_query); Datum dblink_send_query(PG_FUNCTION_ARGS) { - PGconn *conn; - char *sql; - int retval; - - if (PG_NARGS() == 2) - { - conn = dblink_get_named_conn(text_to_cstring(PG_GETARG_TEXT_PP(0))); - sql = text_to_cstring(PG_GETARG_TEXT_PP(1)); - } - else - /* shouldn't happen */ - elog(ERROR, "wrong number of arguments"); - - /* async query send */ - retval = PQsendQuery(conn, sql); - if (retval != 1) - elog(NOTICE, "could not send query: %s", pchomp(PQerrorMessage(conn))); - - PG_RETURN_INT32(retval); + PGconn *conn; + char *sql; + int retval; + + if (PG_NARGS() == 2) + { + conn = dblink_get_named_conn(text_to_cstring(PG_GETARG_TEXT_PP(0))); + sql = text_to_cstring(PG_GETARG_TEXT_PP(1)); + } + else + /* shouldn't happen */ + elog(ERROR, "wrong number of arguments"); + + /* async query send */ + retval = PQsendQuery(conn, sql); + if (retval != 1) + elog(NOTICE, "could not send query: %s", pchomp(PQerrorMessage(conn))); + + PG_RETURN_INT32(retval); } PG_FUNCTION_INFO_V1(dblink_get_result); Datum dblink_get_result(PG_FUNCTION_ARGS) { - return dblink_record_internal(fcinfo, true); + return dblink_record_internal(fcinfo, true); } static Datum dblink_record_internal(FunctionCallInfo fcinfo, bool is_async) { - PGconn *volatile conn = NULL; - volatile bool freeconn = false; - - prepTuplestoreResult(fcinfo); - - dblink_init(); - - PG_TRY(); - { - char *sql = NULL; - char *conname = NULL; - bool fail = true; /* default to backward compatible */ - - if (!is_async) - { - if (PG_NARGS() == 3) - { - /* text,text,bool */ - conname = text_to_cstring(PG_GETARG_TEXT_PP(0)); - sql = text_to_cstring(PG_GETARG_TEXT_PP(1)); - fail = PG_GETARG_BOOL(2); - dblink_get_conn(conname, &conn, &conname, &freeconn); - } - else if (PG_NARGS() == 2) - { - /* text,text or text,bool */ - if (get_fn_expr_argtype(fcinfo->flinfo, 1) == BOOLOID) - { - sql = text_to_cstring(PG_GETARG_TEXT_PP(0)); - fail = PG_GETARG_BOOL(1); - conn = pconn->conn; - } - else - { - conname = text_to_cstring(PG_GETARG_TEXT_PP(0)); - sql = text_to_cstring(PG_GETARG_TEXT_PP(1)); - dblink_get_conn(conname, &conn, &conname, &freeconn); - } - } - else if (PG_NARGS() == 1) - { - /* text */ - conn = pconn->conn; - sql = text_to_cstring(PG_GETARG_TEXT_PP(0)); - } - else - /* shouldn't happen */ - elog(ERROR, "wrong number of arguments"); - } - else /* is_async */ - { - /* get async result */ - conname = text_to_cstring(PG_GETARG_TEXT_PP(0)); - - if (PG_NARGS() == 2) - { - /* text,bool */ - fail = PG_GETARG_BOOL(1); - conn = dblink_get_named_conn(conname); - } - else if (PG_NARGS() == 1) - { - /* text */ - conn = dblink_get_named_conn(conname); - } - else - /* shouldn't happen */ - elog(ERROR, "wrong number of arguments"); - } - - if (!conn) - dblink_conn_not_avail(conname); - - if 
(!is_async) - { - /* synchronous query, use efficient tuple collection method */ - materializeQueryResult(fcinfo, conn, conname, sql, fail); - } - else - { - /* async result retrieval, do it the old way */ - PGresult *res = PQgetResult(conn); - - /* NULL means we're all done with the async results */ - if (res) - { - if (PQresultStatus(res) != PGRES_COMMAND_OK && - PQresultStatus(res) != PGRES_TUPLES_OK) - { - dblink_res_error(conn, conname, res, - "could not execute query", fail); - /* if fail isn't set, we'll return an empty query result */ - } - else - { - materializeResult(fcinfo, conn, res); - } - } - } - } - PG_CATCH(); - { - /* if needed, close the connection to the database */ - if (freeconn) - PQfinish(conn); - PG_RE_THROW(); - } - PG_END_TRY(); - - /* if needed, close the connection to the database */ - if (freeconn) - PQfinish(conn); - - return (Datum) 0; + PGconn *volatile conn = NULL; + volatile bool freeconn = false; + + prepTuplestoreResult(fcinfo); + + dblink_init(); + + PG_TRY(); + { + char *sql = NULL; + char *conname = NULL; + bool fail = true; /* default to backward compatible */ + + if (!is_async) + { + if (PG_NARGS() == 3) + { + /* text,text,bool */ + conname = text_to_cstring(PG_GETARG_TEXT_PP(0)); + sql = text_to_cstring(PG_GETARG_TEXT_PP(1)); + fail = PG_GETARG_BOOL(2); + dblink_get_conn(conname, &conn, &conname, &freeconn); + } + else if (PG_NARGS() == 2) + { + /* text,text or text,bool */ + if (get_fn_expr_argtype(fcinfo->flinfo, 1) == BOOLOID) + { + sql = text_to_cstring(PG_GETARG_TEXT_PP(0)); + fail = PG_GETARG_BOOL(1); + conn = pconn->conn; + } + else + { + conname = text_to_cstring(PG_GETARG_TEXT_PP(0)); + sql = text_to_cstring(PG_GETARG_TEXT_PP(1)); + dblink_get_conn(conname, &conn, &conname, &freeconn); + } + } + else if (PG_NARGS() == 1) + { + /* text */ + conn = pconn->conn; + sql = text_to_cstring(PG_GETARG_TEXT_PP(0)); + } + else + /* shouldn't happen */ + elog(ERROR, "wrong number of arguments"); + } + else /* is_async */ + { + /* get async result */ + conname = text_to_cstring(PG_GETARG_TEXT_PP(0)); + + if (PG_NARGS() == 2) + { + /* text,bool */ + fail = PG_GETARG_BOOL(1); + conn = dblink_get_named_conn(conname); + } + else if (PG_NARGS() == 1) + { + /* text */ + conn = dblink_get_named_conn(conname); + } + else + /* shouldn't happen */ + elog(ERROR, "wrong number of arguments"); + } + + if (!conn) + dblink_conn_not_avail(conname); + + if (!is_async) + { + /* synchronous query, use efficient tuple collection method */ + materializeQueryResult(fcinfo, conn, conname, sql, fail); + } + else + { + /* async result retrieval, do it the old way */ + PGresult *res = PQgetResult(conn); + + /* NULL means we're all done with the async results */ + if (res) + { + if (PQresultStatus(res) != PGRES_COMMAND_OK && + PQresultStatus(res) != PGRES_TUPLES_OK) + { + dblink_res_error(conn, conname, res, + "could not execute query", fail); + /* if fail isn't set, we'll return an empty query result */ + } + else + { + materializeResult(fcinfo, conn, res); + } + } + } + } + PG_CATCH(); + { + /* if needed, close the connection to the database */ + if (freeconn) + PQfinish(conn); + PG_RE_THROW(); + } + PG_END_TRY(); + + /* if needed, close the connection to the database */ + if (freeconn) + PQfinish(conn); + + return (Datum) 0; +} + +static StringInfo copybuf = NULL; +static char *tmp_cbuf = NULL; +static PGconn *copy_conn = NULL; + +static int +receive_copy_data(PGconn *conn, char **buffer) +{ + int rawlen; + + if (tmp_cbuf != NULL) + PQfreemem(tmp_cbuf); + tmp_cbuf = NULL; + + 
/* Try to receive a CopyData message */
+    rawlen = PQgetCopyData(conn, &tmp_cbuf, 0);
+
+    if (rawlen < -1)
+    {
+        if (tmp_cbuf != NULL)
+            PQfreemem(tmp_cbuf);
+        tmp_cbuf = NULL;
+
+        ereport(ERROR,
+                (errmsg("could not receive data from stream: %s",
+                        pchomp(PQerrorMessage(conn)))));
+    }
+
+    /* Return the received message to the caller */
+    *buffer = tmp_cbuf;
+    return rawlen;
+}
+
+static int
+copy_read_data(void *outbuf, int minread, int maxread)
+{
+    int         bytesread = 0;
+    int         avail;
+
+    /* If there is leftover data from a previous read, use it first. */
+    avail = copybuf->len - copybuf->cursor;
+    if (avail)
+    {
+        if (avail > maxread)
+            avail = maxread;
+        memcpy(outbuf, &copybuf->data[copybuf->cursor], avail);
+        copybuf->cursor += avail;
+        maxread -= avail;
+        bytesread += avail;
+    }
+
+    while (maxread > 0 && bytesread < minread)
+    {
+        int         len;
+        char       *buf = NULL;
+
+        for (;;)
+        {
+            /* Try to read the data. */
+            len = receive_copy_data(copy_conn, &buf);
+
+            CHECK_FOR_INTERRUPTS();
+
+            if (len < 0)
+                return bytesread;
+            else
+            {
+                /* Process the data */
+                copybuf->data = buf;
+                copybuf->len = len;
+                copybuf->cursor = 0;
+
+                avail = copybuf->len - copybuf->cursor;
+                if (avail > maxread)
+                    avail = maxread;
+                memcpy(outbuf, &copybuf->data[copybuf->cursor], avail);
+                outbuf = (void *) ((char *) outbuf + avail);
+                copybuf->cursor += avail;
+                maxread -= avail;
+                bytesread += avail;
+            }
+
+            if (maxread <= 0 || bytesread >= minread)
+                return bytesread;
+        }
+    }
+
+    return bytesread;
+}
+
+/*
+ * Return true if rtblname is actually a SELECT query (optionally wrapped in
+ * parentheses) rather than a plain table name.
+ */
+static bool isRemoteTableAsSelect(char * rtblname)
+{
+    char *tmp = rtblname;
+
+    while (tmp != NULL)
+    {
+        if (*tmp == ' ')
+        {
+            ++tmp;
+            continue;
+        }
+
+        if (*tmp == '(' )
+            ++ tmp;
+
+        if (0 == pg_strncasecmp(tmp, "SELECT", 6))
+            return true;
+        else
+            return false;
+    }
+
+    return false;
+}
+
+/*
+ * Copy a remote table to a local table.
+ *
+ * Issue 'COPY ... TO STDOUT' on the remote server and buffer the incoming
+ * data; the local server then uses COPY FROM to load that data into the
+ * target table directly.
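+ *
+ * A minimal usage sketch (schema, table, and connection values below are
+ * placeholders):
+ *
+ *     SELECT dblink_copy_table('public', 'local_tbl',
+ *                              'public', 'remote_tbl',
+ *                              'host=remotehost port=5432 dbname=testdb');
+ *
+ * This loads the contents of public.remote_tbl on the remote server into
+ * the existing local table public.local_tbl.  The last argument is resolved
+ * by dblink_get_conn(), so a named dblink connection or a foreign server
+ * name should work as well as a raw libpq connection string.  The fourth
+ * argument may also be a SELECT query (see isRemoteTableAsSelect()), in
+ * which case the query's result set is copied instead of a whole table.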
+ */ +static void +copyRemoteTableTo(char *nspname, char *tblname, char *rnspname, char *rtblname, + char *connstr) +{ + bool freeconn = false; + char *conname = connstr; + PGconn *conn = NULL; + ParseState *pstate = NULL; + Relation rel = NULL; + + dblink_init(); + + PG_TRY(); + { + StringInfoData cmd; + PGresult *res; + Oid namespaceId; + Oid relId; + CopyState cstate; + + dblink_get_conn(conname, &conn, &conname, &freeconn); + copy_conn = conn; + if (conn == NULL) + elog(ERROR, "failed to connect to remote server"); + + initStringInfo(&cmd); + + /* Send copy statement to remote server */ + if (isRemoteTableAsSelect(rtblname) ) + { + appendStringInfo(&cmd, "COPY (%s) TO STDOUT", rtblname); + } + else + { + appendStringInfo(&cmd, "COPY %s TO STDOUT", + quote_qualified_identifier(rnspname, rtblname)); + } + + if (!PQsendQuery(conn, cmd.data)) + elog(ERROR, "failed to get data stream from remote server"); + + res = PQgetResult(conn); + if (PQresultStatus(res) != PGRES_COPY_OUT) + elog(ERROR, "get bad stream status from remote server"); + + namespaceId = LookupExplicitNamespace(nspname, false); + relId = get_relname_relid(tblname, namespaceId); + rel = heap_open(relId, RowExclusiveLock); + + copybuf = makeStringInfo(); + pstate = make_parsestate(NULL); + addRangeTableEntryForRelation(pstate, rel, NULL, false, false); + + cstate = BeginCopyFrom(pstate, rel, NULL, false, copy_read_data, NULL, NIL); + + (void) CopyFrom(cstate); + EndCopyFrom(cstate); + + relation_close(rel, RowExclusiveLock); + } + PG_CATCH(); + { + if (tmp_cbuf != NULL) + PQfreemem(tmp_cbuf); + tmp_cbuf = NULL; + + if (freeconn) + PQfinish(conn); + PG_RE_THROW(); + } + PG_END_TRY(); + + if (tmp_cbuf != NULL) + PQfreemem(tmp_cbuf); + tmp_cbuf = NULL; + if (freeconn) + PQfinish(conn); +} + +PG_FUNCTION_INFO_V1(dblink_copy_table); +Datum +dblink_copy_table(PG_FUNCTION_ARGS) +{ + char *nspname; + char *tblname; + char *rnspname; + char *rtblname; + char *connstr; + + if (PG_ARGISNULL(0) || PG_ARGISNULL(1) || PG_ARGISNULL(2) || PG_ARGISNULL(3) + || PG_ARGISNULL(4)) + elog(ERROR, "function argument has null values"); + + nspname = text_to_cstring(PG_GETARG_TEXT_PP(0)); + tblname = text_to_cstring(PG_GETARG_TEXT_PP(1)); + rnspname = text_to_cstring(PG_GETARG_TEXT_PP(2)); + rtblname = text_to_cstring(PG_GETARG_TEXT_PP(3)); + connstr = text_to_cstring(PG_GETARG_TEXT_PP(4)); + + copyRemoteTableTo(nspname, tblname, rnspname, rtblname, connstr); + + return (Datum) 0; } /* @@ -799,24 +1025,24 @@ dblink_record_internal(FunctionCallInfo fcinfo, bool is_async) static void prepTuplestoreResult(FunctionCallInfo fcinfo) { - ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; - - /* check to see if query supports us returning a tuplestore */ - if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo)) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("set-valued function called in context that cannot accept a set"))); - if (!(rsinfo->allowedModes & SFRM_Materialize)) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("materialize mode required, but it is not allowed in this context"))); - - /* let the executor know we're sending back a tuplestore */ - rsinfo->returnMode = SFRM_Materialize; - - /* caller must fill these to return a non-empty result */ - rsinfo->setResult = NULL; - rsinfo->setDesc = NULL; + ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; + + /* check to see if query supports us returning a tuplestore */ + if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo)) + ereport(ERROR, + 
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("set-valued function called in context that cannot accept a set"))); + if (!(rsinfo->allowedModes & SFRM_Materialize)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("materialize mode required, but it is not allowed in this context"))); + + /* let the executor know we're sending back a tuplestore */ + rsinfo->returnMode = SFRM_Materialize; + + /* caller must fill these to return a non-empty result */ + rsinfo->setResult = NULL; + rsinfo->setDesc = NULL; } /* @@ -827,139 +1053,139 @@ prepTuplestoreResult(FunctionCallInfo fcinfo) static void materializeResult(FunctionCallInfo fcinfo, PGconn *conn, PGresult *res) { - ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; - - /* prepTuplestoreResult must have been called previously */ - Assert(rsinfo->returnMode == SFRM_Materialize); - - PG_TRY(); - { - TupleDesc tupdesc; - bool is_sql_cmd; - int ntuples; - int nfields; - - if (PQresultStatus(res) == PGRES_COMMAND_OK) - { - is_sql_cmd = true; - - /* - * need a tuple descriptor representing one TEXT column to return - * the command status string as our result tuple - */ - tupdesc = CreateTemplateTupleDesc(1, false); - TupleDescInitEntry(tupdesc, (AttrNumber) 1, "status", - TEXTOID, -1, 0); - ntuples = 1; - nfields = 1; - } - else - { - Assert(PQresultStatus(res) == PGRES_TUPLES_OK); - - is_sql_cmd = false; - - /* get a tuple descriptor for our result type */ - switch (get_call_result_type(fcinfo, NULL, &tupdesc)) - { - case TYPEFUNC_COMPOSITE: - /* success */ - break; - case TYPEFUNC_RECORD: - /* failed to determine actual type of RECORD */ - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("function returning record called in context " - "that cannot accept type record"))); - break; - default: - /* result type isn't composite */ - elog(ERROR, "return type must be a row type"); - break; - } - - /* make sure we have a persistent copy of the tupdesc */ - tupdesc = CreateTupleDescCopy(tupdesc); - ntuples = PQntuples(res); - nfields = PQnfields(res); - } - - /* - * check result and tuple descriptor have the same number of columns - */ - if (nfields != tupdesc->natts) - ereport(ERROR, - (errcode(ERRCODE_DATATYPE_MISMATCH), - errmsg("remote query result rowtype does not match " - "the specified FROM clause rowtype"))); - - if (ntuples > 0) - { - AttInMetadata *attinmeta; - int nestlevel = -1; - Tuplestorestate *tupstore; - MemoryContext oldcontext; - int row; - char **values; - - attinmeta = TupleDescGetAttInMetadata(tupdesc); - - /* Set GUCs to ensure we read GUC-sensitive data types correctly */ - if (!is_sql_cmd) - nestlevel = applyRemoteGucs(conn); - - oldcontext = MemoryContextSwitchTo( - rsinfo->econtext->ecxt_per_query_memory); - tupstore = tuplestore_begin_heap(true, false, work_mem); - rsinfo->setResult = tupstore; - rsinfo->setDesc = tupdesc; - MemoryContextSwitchTo(oldcontext); - - values = (char **) palloc(nfields * sizeof(char *)); - - /* put all tuples into the tuplestore */ - for (row = 0; row < ntuples; row++) - { - HeapTuple tuple; - - if (!is_sql_cmd) - { - int i; - - for (i = 0; i < nfields; i++) - { - if (PQgetisnull(res, row, i)) - values[i] = NULL; - else - values[i] = PQgetvalue(res, row, i); - } - } - else - { - values[0] = PQcmdStatus(res); - } - - /* build the tuple and put it into the tuplestore. 
*/ - tuple = BuildTupleFromCStrings(attinmeta, values); - tuplestore_puttuple(tupstore, tuple); - } - - /* clean up GUC settings, if we changed any */ - restoreLocalGucs(nestlevel); - - /* clean up and return the tuplestore */ - tuplestore_donestoring(tupstore); - } - - PQclear(res); - } - PG_CATCH(); - { - /* be sure to release the libpq result */ - PQclear(res); - PG_RE_THROW(); - } - PG_END_TRY(); + ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; + + /* prepTuplestoreResult must have been called previously */ + Assert(rsinfo->returnMode == SFRM_Materialize); + + PG_TRY(); + { + TupleDesc tupdesc; + bool is_sql_cmd; + int ntuples; + int nfields; + + if (PQresultStatus(res) == PGRES_COMMAND_OK) + { + is_sql_cmd = true; + + /* + * need a tuple descriptor representing one TEXT column to return + * the command status string as our result tuple + */ + tupdesc = CreateTemplateTupleDesc(1, false); + TupleDescInitEntry(tupdesc, (AttrNumber) 1, "status", + TEXTOID, -1, 0); + ntuples = 1; + nfields = 1; + } + else + { + Assert(PQresultStatus(res) == PGRES_TUPLES_OK); + + is_sql_cmd = false; + + /* get a tuple descriptor for our result type */ + switch (get_call_result_type(fcinfo, NULL, &tupdesc)) + { + case TYPEFUNC_COMPOSITE: + /* success */ + break; + case TYPEFUNC_RECORD: + /* failed to determine actual type of RECORD */ + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("function returning record called in context " + "that cannot accept type record"))); + break; + default: + /* result type isn't composite */ + elog(ERROR, "return type must be a row type"); + break; + } + + /* make sure we have a persistent copy of the tupdesc */ + tupdesc = CreateTupleDescCopy(tupdesc); + ntuples = PQntuples(res); + nfields = PQnfields(res); + } + + /* + * check result and tuple descriptor have the same number of columns + */ + if (nfields != tupdesc->natts) + ereport(ERROR, + (errcode(ERRCODE_DATATYPE_MISMATCH), + errmsg("remote query result rowtype does not match " + "the specified FROM clause rowtype"))); + + if (ntuples > 0) + { + AttInMetadata *attinmeta; + int nestlevel = -1; + Tuplestorestate *tupstore; + MemoryContext oldcontext; + int row; + char **values; + + attinmeta = TupleDescGetAttInMetadata(tupdesc); + + /* Set GUCs to ensure we read GUC-sensitive data types correctly */ + if (!is_sql_cmd) + nestlevel = applyRemoteGucs(conn); + + oldcontext = MemoryContextSwitchTo( + rsinfo->econtext->ecxt_per_query_memory); + tupstore = tuplestore_begin_heap(true, false, work_mem); + rsinfo->setResult = tupstore; + rsinfo->setDesc = tupdesc; + MemoryContextSwitchTo(oldcontext); + + values = (char **) palloc(nfields * sizeof(char *)); + + /* put all tuples into the tuplestore */ + for (row = 0; row < ntuples; row++) + { + HeapTuple tuple; + + if (!is_sql_cmd) + { + int i; + + for (i = 0; i < nfields; i++) + { + if (PQgetisnull(res, row, i)) + values[i] = NULL; + else + values[i] = PQgetvalue(res, row, i); + } + } + else + { + values[0] = PQcmdStatus(res); + } + + /* build the tuple and put it into the tuplestore. 
*/ + tuple = BuildTupleFromCStrings(attinmeta, values); + tuplestore_puttuple(tupstore, tuple); + } + + /* clean up GUC settings, if we changed any */ + restoreLocalGucs(nestlevel); + + /* clean up and return the tuplestore */ + tuplestore_donestoring(tupstore); + } + + PQclear(res); + } + PG_CATCH(); + { + /* be sure to release the libpq result */ + PQclear(res); + PG_RE_THROW(); + } + PG_END_TRY(); } /* @@ -972,117 +1198,117 @@ materializeResult(FunctionCallInfo fcinfo, PGconn *conn, PGresult *res) */ static void materializeQueryResult(FunctionCallInfo fcinfo, - PGconn *conn, - const char *conname, - const char *sql, - bool fail) + PGconn *conn, + const char *conname, + const char *sql, + bool fail) { - ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; - PGresult *volatile res = NULL; - volatile storeInfo sinfo; - - /* prepTuplestoreResult must have been called previously */ - Assert(rsinfo->returnMode == SFRM_Materialize); - - /* initialize storeInfo to empty */ - memset((void *) &sinfo, 0, sizeof(sinfo)); - sinfo.fcinfo = fcinfo; - - PG_TRY(); - { - /* Create short-lived memory context for data conversions */ - sinfo.tmpcontext = AllocSetContextCreate(CurrentMemoryContext, - "dblink temporary context", - ALLOCSET_DEFAULT_SIZES); - - /* execute query, collecting any tuples into the tuplestore */ - res = storeQueryResult(&sinfo, conn, sql); - - if (!res || - (PQresultStatus(res) != PGRES_COMMAND_OK && - PQresultStatus(res) != PGRES_TUPLES_OK)) - { - /* - * dblink_res_error will clear the passed PGresult, so we need - * this ugly dance to avoid doing so twice during error exit - */ - PGresult *res1 = res; - - res = NULL; - dblink_res_error(conn, conname, res1, - "could not execute query", fail); - /* if fail isn't set, we'll return an empty query result */ - } - else if (PQresultStatus(res) == PGRES_COMMAND_OK) - { - /* - * storeRow didn't get called, so we need to convert the command - * status string to a tuple manually - */ - TupleDesc tupdesc; - AttInMetadata *attinmeta; - Tuplestorestate *tupstore; - HeapTuple tuple; - char *values[1]; - MemoryContext oldcontext; - - /* - * need a tuple descriptor representing one TEXT column to return - * the command status string as our result tuple - */ - tupdesc = CreateTemplateTupleDesc(1, false); - TupleDescInitEntry(tupdesc, (AttrNumber) 1, "status", - TEXTOID, -1, 0); - attinmeta = TupleDescGetAttInMetadata(tupdesc); - - oldcontext = MemoryContextSwitchTo( - rsinfo->econtext->ecxt_per_query_memory); - tupstore = tuplestore_begin_heap(true, false, work_mem); - rsinfo->setResult = tupstore; - rsinfo->setDesc = tupdesc; - MemoryContextSwitchTo(oldcontext); - - values[0] = PQcmdStatus(res); - - /* build the tuple and put it into the tuplestore. 
*/ - tuple = BuildTupleFromCStrings(attinmeta, values); - tuplestore_puttuple(tupstore, tuple); - - PQclear(res); - res = NULL; - } - else - { - Assert(PQresultStatus(res) == PGRES_TUPLES_OK); - /* storeRow should have created a tuplestore */ - Assert(rsinfo->setResult != NULL); - - PQclear(res); - res = NULL; - } - - /* clean up data conversion short-lived memory context */ - if (sinfo.tmpcontext != NULL) - MemoryContextDelete(sinfo.tmpcontext); - sinfo.tmpcontext = NULL; - - PQclear(sinfo.last_res); - sinfo.last_res = NULL; - PQclear(sinfo.cur_res); - sinfo.cur_res = NULL; - } - PG_CATCH(); - { - /* be sure to release any libpq result we collected */ - PQclear(res); - PQclear(sinfo.last_res); - PQclear(sinfo.cur_res); - /* and clear out any pending data in libpq */ - while ((res = PQgetResult(conn)) != NULL) - PQclear(res); - PG_RE_THROW(); - } - PG_END_TRY(); + ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; + PGresult *volatile res = NULL; + volatile storeInfo sinfo; + + /* prepTuplestoreResult must have been called previously */ + Assert(rsinfo->returnMode == SFRM_Materialize); + + /* initialize storeInfo to empty */ + memset((void *) &sinfo, 0, sizeof(sinfo)); + sinfo.fcinfo = fcinfo; + + PG_TRY(); + { + /* Create short-lived memory context for data conversions */ + sinfo.tmpcontext = AllocSetContextCreate(CurrentMemoryContext, + "dblink temporary context", + ALLOCSET_DEFAULT_SIZES); + + /* execute query, collecting any tuples into the tuplestore */ + res = storeQueryResult(&sinfo, conn, sql); + + if (!res || + (PQresultStatus(res) != PGRES_COMMAND_OK && + PQresultStatus(res) != PGRES_TUPLES_OK)) + { + /* + * dblink_res_error will clear the passed PGresult, so we need + * this ugly dance to avoid doing so twice during error exit + */ + PGresult *res1 = res; + + res = NULL; + dblink_res_error(conn, conname, res1, + "could not execute query", fail); + /* if fail isn't set, we'll return an empty query result */ + } + else if (PQresultStatus(res) == PGRES_COMMAND_OK) + { + /* + * storeRow didn't get called, so we need to convert the command + * status string to a tuple manually + */ + TupleDesc tupdesc; + AttInMetadata *attinmeta; + Tuplestorestate *tupstore; + HeapTuple tuple; + char *values[1]; + MemoryContext oldcontext; + + /* + * need a tuple descriptor representing one TEXT column to return + * the command status string as our result tuple + */ + tupdesc = CreateTemplateTupleDesc(1, false); + TupleDescInitEntry(tupdesc, (AttrNumber) 1, "status", + TEXTOID, -1, 0); + attinmeta = TupleDescGetAttInMetadata(tupdesc); + + oldcontext = MemoryContextSwitchTo( + rsinfo->econtext->ecxt_per_query_memory); + tupstore = tuplestore_begin_heap(true, false, work_mem); + rsinfo->setResult = tupstore; + rsinfo->setDesc = tupdesc; + MemoryContextSwitchTo(oldcontext); + + values[0] = PQcmdStatus(res); + + /* build the tuple and put it into the tuplestore. 
*/ + tuple = BuildTupleFromCStrings(attinmeta, values); + tuplestore_puttuple(tupstore, tuple); + + PQclear(res); + res = NULL; + } + else + { + Assert(PQresultStatus(res) == PGRES_TUPLES_OK); + /* storeRow should have created a tuplestore */ + Assert(rsinfo->setResult != NULL); + + PQclear(res); + res = NULL; + } + + /* clean up data conversion short-lived memory context */ + if (sinfo.tmpcontext != NULL) + MemoryContextDelete(sinfo.tmpcontext); + sinfo.tmpcontext = NULL; + + PQclear(sinfo.last_res); + sinfo.last_res = NULL; + PQclear(sinfo.cur_res); + sinfo.cur_res = NULL; + } + PG_CATCH(); + { + /* be sure to release any libpq result we collected */ + PQclear(res); + PQclear(sinfo.last_res); + PQclear(sinfo.cur_res); + /* and clear out any pending data in libpq */ + while ((res = PQgetResult(conn)) != NULL) + PQclear(res); + PG_RE_THROW(); + } + PG_END_TRY(); } /* @@ -1091,63 +1317,63 @@ materializeQueryResult(FunctionCallInfo fcinfo, static PGresult * storeQueryResult(volatile storeInfo *sinfo, PGconn *conn, const char *sql) { - bool first = true; - int nestlevel = -1; - PGresult *res; - - if (!PQsendQuery(conn, sql)) - elog(ERROR, "could not send query: %s", pchomp(PQerrorMessage(conn))); - - if (!PQsetSingleRowMode(conn)) /* shouldn't fail */ - elog(ERROR, "failed to set single-row mode for dblink query"); - - for (;;) - { - CHECK_FOR_INTERRUPTS(); - - sinfo->cur_res = PQgetResult(conn); - if (!sinfo->cur_res) - break; - - if (PQresultStatus(sinfo->cur_res) == PGRES_SINGLE_TUPLE) - { - /* got one row from possibly-bigger resultset */ - - /* - * Set GUCs to ensure we read GUC-sensitive data types correctly. - * We shouldn't do this until we have a row in hand, to ensure - * libpq has seen any earlier ParameterStatus protocol messages. - */ - if (first && nestlevel < 0) - nestlevel = applyRemoteGucs(conn); - - storeRow(sinfo, sinfo->cur_res, first); - - PQclear(sinfo->cur_res); - sinfo->cur_res = NULL; - first = false; - } - else - { - /* if empty resultset, fill tuplestore header */ - if (first && PQresultStatus(sinfo->cur_res) == PGRES_TUPLES_OK) - storeRow(sinfo, sinfo->cur_res, first); - - /* store completed result at last_res */ - PQclear(sinfo->last_res); - sinfo->last_res = sinfo->cur_res; - sinfo->cur_res = NULL; - first = true; - } - } - - /* clean up GUC settings, if we changed any */ - restoreLocalGucs(nestlevel); - - /* return last_res */ - res = sinfo->last_res; - sinfo->last_res = NULL; - return res; + bool first = true; + int nestlevel = -1; + PGresult *res; + + if (!PQsendQuery(conn, sql)) + elog(ERROR, "could not send query: %s", pchomp(PQerrorMessage(conn))); + + if (!PQsetSingleRowMode(conn)) /* shouldn't fail */ + elog(ERROR, "failed to set single-row mode for dblink query"); + + for (;;) + { + CHECK_FOR_INTERRUPTS(); + + sinfo->cur_res = PQgetResult(conn); + if (!sinfo->cur_res) + break; + + if (PQresultStatus(sinfo->cur_res) == PGRES_SINGLE_TUPLE) + { + /* got one row from possibly-bigger resultset */ + + /* + * Set GUCs to ensure we read GUC-sensitive data types correctly. + * We shouldn't do this until we have a row in hand, to ensure + * libpq has seen any earlier ParameterStatus protocol messages. 
+ */ + if (first && nestlevel < 0) + nestlevel = applyRemoteGucs(conn); + + storeRow(sinfo, sinfo->cur_res, first); + + PQclear(sinfo->cur_res); + sinfo->cur_res = NULL; + first = false; + } + else + { + /* if empty resultset, fill tuplestore header */ + if (first && PQresultStatus(sinfo->cur_res) == PGRES_TUPLES_OK) + storeRow(sinfo, sinfo->cur_res, first); + + /* store completed result at last_res */ + PQclear(sinfo->last_res); + sinfo->last_res = sinfo->cur_res; + sinfo->cur_res = NULL; + first = true; + } + } + + /* clean up GUC settings, if we changed any */ + restoreLocalGucs(nestlevel); + + /* return last_res */ + res = sinfo->last_res; + sinfo->last_res = NULL; + return res; } /* @@ -1159,107 +1385,107 @@ storeQueryResult(volatile storeInfo *sinfo, PGconn *conn, const char *sql) static void storeRow(volatile storeInfo *sinfo, PGresult *res, bool first) { - int nfields = PQnfields(res); - HeapTuple tuple; - int i; - MemoryContext oldcontext; - - if (first) - { - /* Prepare for new result set */ - ReturnSetInfo *rsinfo = (ReturnSetInfo *) sinfo->fcinfo->resultinfo; - TupleDesc tupdesc; - - /* - * It's possible to get more than one result set if the query string - * contained multiple SQL commands. In that case, we follow PQexec's - * traditional behavior of throwing away all but the last result. - */ - if (sinfo->tuplestore) - tuplestore_end(sinfo->tuplestore); - sinfo->tuplestore = NULL; - - /* get a tuple descriptor for our result type */ - switch (get_call_result_type(sinfo->fcinfo, NULL, &tupdesc)) - { - case TYPEFUNC_COMPOSITE: - /* success */ - break; - case TYPEFUNC_RECORD: - /* failed to determine actual type of RECORD */ - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("function returning record called in context " - "that cannot accept type record"))); - break; - default: - /* result type isn't composite */ - elog(ERROR, "return type must be a row type"); - break; - } - - /* make sure we have a persistent copy of the tupdesc */ - tupdesc = CreateTupleDescCopy(tupdesc); - - /* check result and tuple descriptor have the same number of columns */ - if (nfields != tupdesc->natts) - ereport(ERROR, - (errcode(ERRCODE_DATATYPE_MISMATCH), - errmsg("remote query result rowtype does not match " - "the specified FROM clause rowtype"))); - - /* Prepare attinmeta for later data conversions */ - sinfo->attinmeta = TupleDescGetAttInMetadata(tupdesc); - - /* Create a new, empty tuplestore */ - oldcontext = MemoryContextSwitchTo(rsinfo->econtext->ecxt_per_query_memory); - sinfo->tuplestore = tuplestore_begin_heap(true, false, work_mem); - rsinfo->setResult = sinfo->tuplestore; - rsinfo->setDesc = tupdesc; - MemoryContextSwitchTo(oldcontext); - - /* Done if empty resultset */ - if (PQntuples(res) == 0) - return; - - /* - * Set up sufficiently-wide string pointers array; this won't change - * in size so it's easy to preallocate. - */ - if (sinfo->cstrs) - pfree(sinfo->cstrs); - sinfo->cstrs = (char **) palloc(nfields * sizeof(char *)); - } - - /* Should have a single-row result if we get here */ - Assert(PQntuples(res) == 1); - - /* - * Do the following work in a temp context that we reset after each tuple. - * This cleans up not only the data we have direct access to, but any - * cruft the I/O functions might leak. - */ - oldcontext = MemoryContextSwitchTo(sinfo->tmpcontext); - - /* - * Fill cstrs with null-terminated strings of column values. 
- */ - for (i = 0; i < nfields; i++) - { - if (PQgetisnull(res, 0, i)) - sinfo->cstrs[i] = NULL; - else - sinfo->cstrs[i] = PQgetvalue(res, 0, i); - } - - /* Convert row to a tuple, and add it to the tuplestore */ - tuple = BuildTupleFromCStrings(sinfo->attinmeta, sinfo->cstrs); - - tuplestore_puttuple(sinfo->tuplestore, tuple); - - /* Clean up */ - MemoryContextSwitchTo(oldcontext); - MemoryContextReset(sinfo->tmpcontext); + int nfields = PQnfields(res); + HeapTuple tuple; + int i; + MemoryContext oldcontext; + + if (first) + { + /* Prepare for new result set */ + ReturnSetInfo *rsinfo = (ReturnSetInfo *) sinfo->fcinfo->resultinfo; + TupleDesc tupdesc; + + /* + * It's possible to get more than one result set if the query string + * contained multiple SQL commands. In that case, we follow PQexec's + * traditional behavior of throwing away all but the last result. + */ + if (sinfo->tuplestore) + tuplestore_end(sinfo->tuplestore); + sinfo->tuplestore = NULL; + + /* get a tuple descriptor for our result type */ + switch (get_call_result_type(sinfo->fcinfo, NULL, &tupdesc)) + { + case TYPEFUNC_COMPOSITE: + /* success */ + break; + case TYPEFUNC_RECORD: + /* failed to determine actual type of RECORD */ + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("function returning record called in context " + "that cannot accept type record"))); + break; + default: + /* result type isn't composite */ + elog(ERROR, "return type must be a row type"); + break; + } + + /* make sure we have a persistent copy of the tupdesc */ + tupdesc = CreateTupleDescCopy(tupdesc); + + /* check result and tuple descriptor have the same number of columns */ + if (nfields != tupdesc->natts) + ereport(ERROR, + (errcode(ERRCODE_DATATYPE_MISMATCH), + errmsg("remote query result rowtype does not match " + "the specified FROM clause rowtype"))); + + /* Prepare attinmeta for later data conversions */ + sinfo->attinmeta = TupleDescGetAttInMetadata(tupdesc); + + /* Create a new, empty tuplestore */ + oldcontext = MemoryContextSwitchTo(rsinfo->econtext->ecxt_per_query_memory); + sinfo->tuplestore = tuplestore_begin_heap(true, false, work_mem); + rsinfo->setResult = sinfo->tuplestore; + rsinfo->setDesc = tupdesc; + MemoryContextSwitchTo(oldcontext); + + /* Done if empty resultset */ + if (PQntuples(res) == 0) + return; + + /* + * Set up sufficiently-wide string pointers array; this won't change + * in size so it's easy to preallocate. + */ + if (sinfo->cstrs) + pfree(sinfo->cstrs); + sinfo->cstrs = (char **) palloc(nfields * sizeof(char *)); + } + + /* Should have a single-row result if we get here */ + Assert(PQntuples(res) == 1); + + /* + * Do the following work in a temp context that we reset after each tuple. + * This cleans up not only the data we have direct access to, but any + * cruft the I/O functions might leak. + */ + oldcontext = MemoryContextSwitchTo(sinfo->tmpcontext); + + /* + * Fill cstrs with null-terminated strings of column values. 
+ */ + for (i = 0; i < nfields; i++) + { + if (PQgetisnull(res, 0, i)) + sinfo->cstrs[i] = NULL; + else + sinfo->cstrs[i] = PQgetvalue(res, 0, i); + } + + /* Convert row to a tuple, and add it to the tuplestore */ + tuple = BuildTupleFromCStrings(sinfo->attinmeta, sinfo->cstrs); + + tuplestore_puttuple(sinfo->tuplestore, tuple); + + /* Clean up */ + MemoryContextSwitchTo(oldcontext); + MemoryContextReset(sinfo->tmpcontext); } /* @@ -1271,27 +1497,27 @@ PG_FUNCTION_INFO_V1(dblink_get_connections); Datum dblink_get_connections(PG_FUNCTION_ARGS) { - HASH_SEQ_STATUS status; - remoteConnHashEnt *hentry; - ArrayBuildState *astate = NULL; - - if (remoteConnHash) - { - hash_seq_init(&status, remoteConnHash); - while ((hentry = (remoteConnHashEnt *) hash_seq_search(&status)) != NULL) - { - /* stash away current value */ - astate = accumArrayResult(astate, - CStringGetTextDatum(hentry->name), - false, TEXTOID, CurrentMemoryContext); - } - } - - if (astate) - PG_RETURN_ARRAYTYPE_P(makeArrayResult(astate, - CurrentMemoryContext)); - else - PG_RETURN_NULL(); + HASH_SEQ_STATUS status; + remoteConnHashEnt *hentry; + ArrayBuildState *astate = NULL; + + if (remoteConnHash) + { + hash_seq_init(&status, remoteConnHash); + while ((hentry = (remoteConnHashEnt *) hash_seq_search(&status)) != NULL) + { + /* stash away current value */ + astate = accumArrayResult(astate, + CStringGetTextDatum(hentry->name), + false, TEXTOID, CurrentMemoryContext); + } + } + + if (astate) + PG_RETURN_ARRAYTYPE_P(makeArrayResult(astate, + CurrentMemoryContext)); + else + PG_RETURN_NULL(); } /* @@ -1299,53 +1525,53 @@ dblink_get_connections(PG_FUNCTION_ARGS) * * Returns 1 if the connection is busy, 0 otherwise * Params: - * text connection_name - name of the connection to check + * text connection_name - name of the connection to check * */ PG_FUNCTION_INFO_V1(dblink_is_busy); Datum dblink_is_busy(PG_FUNCTION_ARGS) { - PGconn *conn; + PGconn *conn; - dblink_init(); - conn = dblink_get_named_conn(text_to_cstring(PG_GETARG_TEXT_PP(0))); + dblink_init(); + conn = dblink_get_named_conn(text_to_cstring(PG_GETARG_TEXT_PP(0))); - PQconsumeInput(conn); - PG_RETURN_INT32(PQisBusy(conn)); + PQconsumeInput(conn); + PG_RETURN_INT32(PQisBusy(conn)); } /* * Cancels a running request on a connection * * Returns text: - * "OK" if the cancel request has been sent correctly, - * an error message otherwise + * "OK" if the cancel request has been sent correctly, + * an error message otherwise * * Params: - * text connection_name - name of the connection to check + * text connection_name - name of the connection to check * */ PG_FUNCTION_INFO_V1(dblink_cancel_query); Datum dblink_cancel_query(PG_FUNCTION_ARGS) { - int res; - PGconn *conn; - PGcancel *cancel; - char errbuf[256]; - - dblink_init(); - conn = dblink_get_named_conn(text_to_cstring(PG_GETARG_TEXT_PP(0))); - cancel = PQgetCancel(conn); - - res = PQcancel(cancel, errbuf, 256); - PQfreeCancel(cancel); - - if (res == 1) - PG_RETURN_TEXT_P(cstring_to_text("OK")); - else - PG_RETURN_TEXT_P(cstring_to_text(errbuf)); + int res; + PGconn *conn; + PGcancel *cancel; + char errbuf[256]; + + dblink_init(); + conn = dblink_get_named_conn(text_to_cstring(PG_GETARG_TEXT_PP(0))); + cancel = PQgetCancel(conn); + + res = PQcancel(cancel, errbuf, 256); + PQfreeCancel(cancel); + + if (res == 1) + PG_RETURN_TEXT_P(cstring_to_text("OK")); + else + PG_RETURN_TEXT_P(cstring_to_text(errbuf)); } @@ -1353,27 +1579,27 @@ dblink_cancel_query(PG_FUNCTION_ARGS) * Get error message from a connection * * Returns text: - * 
"OK" if no error, an error message otherwise + * "OK" if no error, an error message otherwise * * Params: - * text connection_name - name of the connection to check + * text connection_name - name of the connection to check * */ PG_FUNCTION_INFO_V1(dblink_error_message); Datum dblink_error_message(PG_FUNCTION_ARGS) { - char *msg; - PGconn *conn; + char *msg; + PGconn *conn; - dblink_init(); - conn = dblink_get_named_conn(text_to_cstring(PG_GETARG_TEXT_PP(0))); + dblink_init(); + conn = dblink_get_named_conn(text_to_cstring(PG_GETARG_TEXT_PP(0))); - msg = PQerrorMessage(conn); - if (msg == NULL || msg[0] == '\0') - PG_RETURN_TEXT_P(cstring_to_text("OK")); - else - PG_RETURN_TEXT_P(cstring_to_text(pchomp(msg))); + msg = PQerrorMessage(conn); + if (msg == NULL || msg[0] == '\0') + PG_RETURN_TEXT_P(cstring_to_text("OK")); + else + PG_RETURN_TEXT_P(cstring_to_text(pchomp(msg))); } /* @@ -1383,101 +1609,101 @@ PG_FUNCTION_INFO_V1(dblink_exec); Datum dblink_exec(PG_FUNCTION_ARGS) { - text *volatile sql_cmd_status = NULL; - PGconn *volatile conn = NULL; - volatile bool freeconn = false; - - dblink_init(); - - PG_TRY(); - { - PGresult *res = NULL; - char *sql = NULL; - char *conname = NULL; - bool fail = true; /* default to backward compatible behavior */ - - if (PG_NARGS() == 3) - { - /* must be text,text,bool */ - conname = text_to_cstring(PG_GETARG_TEXT_PP(0)); - sql = text_to_cstring(PG_GETARG_TEXT_PP(1)); - fail = PG_GETARG_BOOL(2); - dblink_get_conn(conname, &conn, &conname, &freeconn); - } - else if (PG_NARGS() == 2) - { - /* might be text,text or text,bool */ - if (get_fn_expr_argtype(fcinfo->flinfo, 1) == BOOLOID) - { - sql = text_to_cstring(PG_GETARG_TEXT_PP(0)); - fail = PG_GETARG_BOOL(1); - conn = pconn->conn; - } - else - { - conname = text_to_cstring(PG_GETARG_TEXT_PP(0)); - sql = text_to_cstring(PG_GETARG_TEXT_PP(1)); - dblink_get_conn(conname, &conn, &conname, &freeconn); - } - } - else if (PG_NARGS() == 1) - { - /* must be single text argument */ - conn = pconn->conn; - sql = text_to_cstring(PG_GETARG_TEXT_PP(0)); - } - else - /* shouldn't happen */ - elog(ERROR, "wrong number of arguments"); - - if (!conn) - dblink_conn_not_avail(conname); - - res = PQexec(conn, sql); - if (!res || - (PQresultStatus(res) != PGRES_COMMAND_OK && - PQresultStatus(res) != PGRES_TUPLES_OK)) - { - dblink_res_error(conn, conname, res, - "could not execute command", fail); - - /* - * and save a copy of the command status string to return as our - * result tuple - */ - sql_cmd_status = cstring_to_text("ERROR"); - } - else if (PQresultStatus(res) == PGRES_COMMAND_OK) - { - /* - * and save a copy of the command status string to return as our - * result tuple - */ - sql_cmd_status = cstring_to_text(PQcmdStatus(res)); - PQclear(res); - } - else - { - PQclear(res); - ereport(ERROR, - (errcode(ERRCODE_S_R_E_PROHIBITED_SQL_STATEMENT_ATTEMPTED), - errmsg("statement returning results not allowed"))); - } - } - PG_CATCH(); - { - /* if needed, close the connection to the database */ - if (freeconn) - PQfinish(conn); - PG_RE_THROW(); - } - PG_END_TRY(); - - /* if needed, close the connection to the database */ - if (freeconn) - PQfinish(conn); - - PG_RETURN_TEXT_P(sql_cmd_status); + text *volatile sql_cmd_status = NULL; + PGconn *volatile conn = NULL; + volatile bool freeconn = false; + + dblink_init(); + + PG_TRY(); + { + PGresult *res = NULL; + char *sql = NULL; + char *conname = NULL; + bool fail = true; /* default to backward compatible behavior */ + + if (PG_NARGS() == 3) + { + /* must be text,text,bool */ + 
conname = text_to_cstring(PG_GETARG_TEXT_PP(0)); + sql = text_to_cstring(PG_GETARG_TEXT_PP(1)); + fail = PG_GETARG_BOOL(2); + dblink_get_conn(conname, &conn, &conname, &freeconn); + } + else if (PG_NARGS() == 2) + { + /* might be text,text or text,bool */ + if (get_fn_expr_argtype(fcinfo->flinfo, 1) == BOOLOID) + { + sql = text_to_cstring(PG_GETARG_TEXT_PP(0)); + fail = PG_GETARG_BOOL(1); + conn = pconn->conn; + } + else + { + conname = text_to_cstring(PG_GETARG_TEXT_PP(0)); + sql = text_to_cstring(PG_GETARG_TEXT_PP(1)); + dblink_get_conn(conname, &conn, &conname, &freeconn); + } + } + else if (PG_NARGS() == 1) + { + /* must be single text argument */ + conn = pconn->conn; + sql = text_to_cstring(PG_GETARG_TEXT_PP(0)); + } + else + /* shouldn't happen */ + elog(ERROR, "wrong number of arguments"); + + if (!conn) + dblink_conn_not_avail(conname); + + res = PQexec(conn, sql); + if (!res || + (PQresultStatus(res) != PGRES_COMMAND_OK && + PQresultStatus(res) != PGRES_TUPLES_OK)) + { + dblink_res_error(conn, conname, res, + "could not execute command", fail); + + /* + * and save a copy of the command status string to return as our + * result tuple + */ + sql_cmd_status = cstring_to_text("ERROR"); + } + else if (PQresultStatus(res) == PGRES_COMMAND_OK) + { + /* + * and save a copy of the command status string to return as our + * result tuple + */ + sql_cmd_status = cstring_to_text(PQcmdStatus(res)); + PQclear(res); + } + else + { + PQclear(res); + ereport(ERROR, + (errcode(ERRCODE_S_R_E_PROHIBITED_SQL_STATEMENT_ATTEMPTED), + errmsg("statement returning results not allowed"))); + } + } + PG_CATCH(); + { + /* if needed, close the connection to the database */ + if (freeconn) + PQfinish(conn); + PG_RE_THROW(); + } + PG_END_TRY(); + + /* if needed, close the connection to the database */ + if (freeconn) + PQfinish(conn); + + PG_RETURN_TEXT_P(sql_cmd_status); } @@ -1491,104 +1717,104 @@ PG_FUNCTION_INFO_V1(dblink_get_pkey); Datum dblink_get_pkey(PG_FUNCTION_ARGS) { - int16 numatts; - char **results; - FuncCallContext *funcctx; - int32 call_cntr; - int32 max_calls; - AttInMetadata *attinmeta; - MemoryContext oldcontext; - - /* stuff done only on the first call of the function */ - if (SRF_IS_FIRSTCALL()) - { - Relation rel; - TupleDesc tupdesc; - - /* create a function context for cross-call persistence */ - funcctx = SRF_FIRSTCALL_INIT(); - - /* - * switch to memory context appropriate for multiple function calls - */ - oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); - - /* open target relation */ - rel = get_rel_from_relname(PG_GETARG_TEXT_PP(0), AccessShareLock, ACL_SELECT); - - /* get the array of attnums */ - results = get_pkey_attnames(rel, &numatts); - - relation_close(rel, AccessShareLock); - - /* - * need a tuple descriptor representing one INT and one TEXT column - */ - tupdesc = CreateTemplateTupleDesc(2, false); - TupleDescInitEntry(tupdesc, (AttrNumber) 1, "position", - INT4OID, -1, 0); - TupleDescInitEntry(tupdesc, (AttrNumber) 2, "colname", - TEXTOID, -1, 0); - - /* - * Generate attribute metadata needed later to produce tuples from raw - * C strings - */ - attinmeta = TupleDescGetAttInMetadata(tupdesc); - funcctx->attinmeta = attinmeta; - - if ((results != NULL) && (numatts > 0)) - { - funcctx->max_calls = numatts; - - /* got results, keep track of them */ - funcctx->user_fctx = results; - } - else - { - /* fast track when no results */ - MemoryContextSwitchTo(oldcontext); - SRF_RETURN_DONE(funcctx); - } - - MemoryContextSwitchTo(oldcontext); - } - - /* stuff done 
on every call of the function */ - funcctx = SRF_PERCALL_SETUP(); - - /* - * initialize per-call variables - */ - call_cntr = funcctx->call_cntr; - max_calls = funcctx->max_calls; - - results = (char **) funcctx->user_fctx; - attinmeta = funcctx->attinmeta; - - if (call_cntr < max_calls) /* do when there is more left to send */ - { - char **values; - HeapTuple tuple; - Datum result; - - values = (char **) palloc(2 * sizeof(char *)); - values[0] = psprintf("%d", call_cntr + 1); - values[1] = results[call_cntr]; - - /* build the tuple */ - tuple = BuildTupleFromCStrings(attinmeta, values); - - /* make the tuple into a datum */ - result = HeapTupleGetDatum(tuple); - - SRF_RETURN_NEXT(funcctx, result); - } - else - { - /* do when there is no more left */ - SRF_RETURN_DONE(funcctx); - } + int16 numatts; + char **results; + FuncCallContext *funcctx; + int32 call_cntr; + int32 max_calls; + AttInMetadata *attinmeta; + MemoryContext oldcontext; + + /* stuff done only on the first call of the function */ + if (SRF_IS_FIRSTCALL()) + { + Relation rel; + TupleDesc tupdesc; + + /* create a function context for cross-call persistence */ + funcctx = SRF_FIRSTCALL_INIT(); + + /* + * switch to memory context appropriate for multiple function calls + */ + oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); + + /* open target relation */ + rel = get_rel_from_relname(PG_GETARG_TEXT_PP(0), AccessShareLock, ACL_SELECT); + + /* get the array of attnums */ + results = get_pkey_attnames(rel, &numatts); + + relation_close(rel, AccessShareLock); + + /* + * need a tuple descriptor representing one INT and one TEXT column + */ + tupdesc = CreateTemplateTupleDesc(2, false); + TupleDescInitEntry(tupdesc, (AttrNumber) 1, "position", + INT4OID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 2, "colname", + TEXTOID, -1, 0); + + /* + * Generate attribute metadata needed later to produce tuples from raw + * C strings + */ + attinmeta = TupleDescGetAttInMetadata(tupdesc); + funcctx->attinmeta = attinmeta; + + if ((results != NULL) && (numatts > 0)) + { + funcctx->max_calls = numatts; + + /* got results, keep track of them */ + funcctx->user_fctx = results; + } + else + { + /* fast track when no results */ + MemoryContextSwitchTo(oldcontext); + SRF_RETURN_DONE(funcctx); + } + + MemoryContextSwitchTo(oldcontext); + } + + /* stuff done on every call of the function */ + funcctx = SRF_PERCALL_SETUP(); + + /* + * initialize per-call variables + */ + call_cntr = funcctx->call_cntr; + max_calls = funcctx->max_calls; + + results = (char **) funcctx->user_fctx; + attinmeta = funcctx->attinmeta; + + if (call_cntr < max_calls) /* do when there is more left to send */ + { + char **values; + HeapTuple tuple; + Datum result; + + values = (char **) palloc(2 * sizeof(char *)); + values[0] = psprintf("%d", call_cntr + 1); + values[1] = results[call_cntr]; + + /* build the tuple */ + tuple = BuildTupleFromCStrings(attinmeta, values); + + /* make the tuple into a datum */ + result = HeapTupleGetDatum(tuple); + + SRF_RETURN_NEXT(funcctx, result); + } + else + { + /* do when there is no more left */ + SRF_RETURN_DONE(funcctx); + } } @@ -1615,75 +1841,75 @@ PG_FUNCTION_INFO_V1(dblink_build_sql_insert); Datum dblink_build_sql_insert(PG_FUNCTION_ARGS) { - text *relname_text = PG_GETARG_TEXT_PP(0); - int2vector *pkattnums_arg = (int2vector *) PG_GETARG_POINTER(1); - int32 pknumatts_arg = PG_GETARG_INT32(2); - ArrayType *src_pkattvals_arry = PG_GETARG_ARRAYTYPE_P(3); - ArrayType *tgt_pkattvals_arry = PG_GETARG_ARRAYTYPE_P(4); - 
Relation rel; - int *pkattnums; - int pknumatts; - char **src_pkattvals; - char **tgt_pkattvals; - int src_nitems; - int tgt_nitems; - char *sql; - - /* - * Open target relation. - */ - rel = get_rel_from_relname(relname_text, AccessShareLock, ACL_SELECT); - - /* - * Process pkattnums argument. - */ - validate_pkattnums(rel, pkattnums_arg, pknumatts_arg, - &pkattnums, &pknumatts); - - /* - * Source array is made up of key values that will be used to locate the - * tuple of interest from the local system. - */ - src_pkattvals = get_text_array_contents(src_pkattvals_arry, &src_nitems); - - /* - * There should be one source array key value for each key attnum - */ - if (src_nitems != pknumatts) - ereport(ERROR, - (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR), - errmsg("source key array length must match number of key " \ - "attributes"))); - - /* - * Target array is made up of key values that will be used to build the - * SQL string for use on the remote system. - */ - tgt_pkattvals = get_text_array_contents(tgt_pkattvals_arry, &tgt_nitems); - - /* - * There should be one target array key value for each key attnum - */ - if (tgt_nitems != pknumatts) - ereport(ERROR, - (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR), - errmsg("target key array length must match number of key " \ - "attributes"))); - - /* - * Prep work is finally done. Go get the SQL string. - */ - sql = get_sql_insert(rel, pkattnums, pknumatts, src_pkattvals, tgt_pkattvals); - - /* - * Now we can close the relation. - */ - relation_close(rel, AccessShareLock); - - /* - * And send it - */ - PG_RETURN_TEXT_P(cstring_to_text(sql)); + text *relname_text = PG_GETARG_TEXT_PP(0); + int2vector *pkattnums_arg = (int2vector *) PG_GETARG_POINTER(1); + int32 pknumatts_arg = PG_GETARG_INT32(2); + ArrayType *src_pkattvals_arry = PG_GETARG_ARRAYTYPE_P(3); + ArrayType *tgt_pkattvals_arry = PG_GETARG_ARRAYTYPE_P(4); + Relation rel; + int *pkattnums; + int pknumatts; + char **src_pkattvals; + char **tgt_pkattvals; + int src_nitems; + int tgt_nitems; + char *sql; + + /* + * Open target relation. + */ + rel = get_rel_from_relname(relname_text, AccessShareLock, ACL_SELECT); + + /* + * Process pkattnums argument. + */ + validate_pkattnums(rel, pkattnums_arg, pknumatts_arg, + &pkattnums, &pknumatts); + + /* + * Source array is made up of key values that will be used to locate the + * tuple of interest from the local system. + */ + src_pkattvals = get_text_array_contents(src_pkattvals_arry, &src_nitems); + + /* + * There should be one source array key value for each key attnum + */ + if (src_nitems != pknumatts) + ereport(ERROR, + (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR), + errmsg("source key array length must match number of key " \ + "attributes"))); + + /* + * Target array is made up of key values that will be used to build the + * SQL string for use on the remote system. + */ + tgt_pkattvals = get_text_array_contents(tgt_pkattvals_arry, &tgt_nitems); + + /* + * There should be one target array key value for each key attnum + */ + if (tgt_nitems != pknumatts) + ereport(ERROR, + (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR), + errmsg("target key array length must match number of key " \ + "attributes"))); + + /* + * Prep work is finally done. Go get the SQL string. + */ + sql = get_sql_insert(rel, pkattnums, pknumatts, src_pkattvals, tgt_pkattvals); + + /* + * Now we can close the relation. 
+ */ + relation_close(rel, AccessShareLock); + + /* + * And send it + */ + PG_RETURN_TEXT_P(cstring_to_text(sql)); } @@ -1706,57 +1932,57 @@ PG_FUNCTION_INFO_V1(dblink_build_sql_delete); Datum dblink_build_sql_delete(PG_FUNCTION_ARGS) { - text *relname_text = PG_GETARG_TEXT_PP(0); - int2vector *pkattnums_arg = (int2vector *) PG_GETARG_POINTER(1); - int32 pknumatts_arg = PG_GETARG_INT32(2); - ArrayType *tgt_pkattvals_arry = PG_GETARG_ARRAYTYPE_P(3); - Relation rel; - int *pkattnums; - int pknumatts; - char **tgt_pkattvals; - int tgt_nitems; - char *sql; - - /* - * Open target relation. - */ - rel = get_rel_from_relname(relname_text, AccessShareLock, ACL_SELECT); - - /* - * Process pkattnums argument. - */ - validate_pkattnums(rel, pkattnums_arg, pknumatts_arg, - &pkattnums, &pknumatts); - - /* - * Target array is made up of key values that will be used to build the - * SQL string for use on the remote system. - */ - tgt_pkattvals = get_text_array_contents(tgt_pkattvals_arry, &tgt_nitems); - - /* - * There should be one target array key value for each key attnum - */ - if (tgt_nitems != pknumatts) - ereport(ERROR, - (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR), - errmsg("target key array length must match number of key " \ - "attributes"))); - - /* - * Prep work is finally done. Go get the SQL string. - */ - sql = get_sql_delete(rel, pkattnums, pknumatts, tgt_pkattvals); - - /* - * Now we can close the relation. - */ - relation_close(rel, AccessShareLock); - - /* - * And send it - */ - PG_RETURN_TEXT_P(cstring_to_text(sql)); + text *relname_text = PG_GETARG_TEXT_PP(0); + int2vector *pkattnums_arg = (int2vector *) PG_GETARG_POINTER(1); + int32 pknumatts_arg = PG_GETARG_INT32(2); + ArrayType *tgt_pkattvals_arry = PG_GETARG_ARRAYTYPE_P(3); + Relation rel; + int *pkattnums; + int pknumatts; + char **tgt_pkattvals; + int tgt_nitems; + char *sql; + + /* + * Open target relation. + */ + rel = get_rel_from_relname(relname_text, AccessShareLock, ACL_SELECT); + + /* + * Process pkattnums argument. + */ + validate_pkattnums(rel, pkattnums_arg, pknumatts_arg, + &pkattnums, &pknumatts); + + /* + * Target array is made up of key values that will be used to build the + * SQL string for use on the remote system. + */ + tgt_pkattvals = get_text_array_contents(tgt_pkattvals_arry, &tgt_nitems); + + /* + * There should be one target array key value for each key attnum + */ + if (tgt_nitems != pknumatts) + ereport(ERROR, + (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR), + errmsg("target key array length must match number of key " \ + "attributes"))); + + /* + * Prep work is finally done. Go get the SQL string. + */ + sql = get_sql_delete(rel, pkattnums, pknumatts, tgt_pkattvals); + + /* + * Now we can close the relation. + */ + relation_close(rel, AccessShareLock); + + /* + * And send it + */ + PG_RETURN_TEXT_P(cstring_to_text(sql)); } @@ -1783,75 +2009,75 @@ PG_FUNCTION_INFO_V1(dblink_build_sql_update); Datum dblink_build_sql_update(PG_FUNCTION_ARGS) { - text *relname_text = PG_GETARG_TEXT_PP(0); - int2vector *pkattnums_arg = (int2vector *) PG_GETARG_POINTER(1); - int32 pknumatts_arg = PG_GETARG_INT32(2); - ArrayType *src_pkattvals_arry = PG_GETARG_ARRAYTYPE_P(3); - ArrayType *tgt_pkattvals_arry = PG_GETARG_ARRAYTYPE_P(4); - Relation rel; - int *pkattnums; - int pknumatts; - char **src_pkattvals; - char **tgt_pkattvals; - int src_nitems; - int tgt_nitems; - char *sql; - - /* - * Open target relation. - */ - rel = get_rel_from_relname(relname_text, AccessShareLock, ACL_SELECT); - - /* - * Process pkattnums argument. 
- */ - validate_pkattnums(rel, pkattnums_arg, pknumatts_arg, - &pkattnums, &pknumatts); - - /* - * Source array is made up of key values that will be used to locate the - * tuple of interest from the local system. - */ - src_pkattvals = get_text_array_contents(src_pkattvals_arry, &src_nitems); - - /* - * There should be one source array key value for each key attnum - */ - if (src_nitems != pknumatts) - ereport(ERROR, - (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR), - errmsg("source key array length must match number of key " \ - "attributes"))); - - /* - * Target array is made up of key values that will be used to build the - * SQL string for use on the remote system. - */ - tgt_pkattvals = get_text_array_contents(tgt_pkattvals_arry, &tgt_nitems); - - /* - * There should be one target array key value for each key attnum - */ - if (tgt_nitems != pknumatts) - ereport(ERROR, - (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR), - errmsg("target key array length must match number of key " \ - "attributes"))); - - /* - * Prep work is finally done. Go get the SQL string. - */ - sql = get_sql_update(rel, pkattnums, pknumatts, src_pkattvals, tgt_pkattvals); - - /* - * Now we can close the relation. - */ - relation_close(rel, AccessShareLock); - - /* - * And send it - */ - PG_RETURN_TEXT_P(cstring_to_text(sql)); + text *relname_text = PG_GETARG_TEXT_PP(0); + int2vector *pkattnums_arg = (int2vector *) PG_GETARG_POINTER(1); + int32 pknumatts_arg = PG_GETARG_INT32(2); + ArrayType *src_pkattvals_arry = PG_GETARG_ARRAYTYPE_P(3); + ArrayType *tgt_pkattvals_arry = PG_GETARG_ARRAYTYPE_P(4); + Relation rel; + int *pkattnums; + int pknumatts; + char **src_pkattvals; + char **tgt_pkattvals; + int src_nitems; + int tgt_nitems; + char *sql; + + /* + * Open target relation. + */ + rel = get_rel_from_relname(relname_text, AccessShareLock, ACL_SELECT); + + /* + * Process pkattnums argument. + */ + validate_pkattnums(rel, pkattnums_arg, pknumatts_arg, + &pkattnums, &pknumatts); + + /* + * Source array is made up of key values that will be used to locate the + * tuple of interest from the local system. + */ + src_pkattvals = get_text_array_contents(src_pkattvals_arry, &src_nitems); + + /* + * There should be one source array key value for each key attnum + */ + if (src_nitems != pknumatts) + ereport(ERROR, + (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR), + errmsg("source key array length must match number of key " \ + "attributes"))); + + /* + * Target array is made up of key values that will be used to build the + * SQL string for use on the remote system. + */ + tgt_pkattvals = get_text_array_contents(tgt_pkattvals_arry, &tgt_nitems); + + /* + * There should be one target array key value for each key attnum + */ + if (tgt_nitems != pknumatts) + ereport(ERROR, + (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR), + errmsg("target key array length must match number of key " \ + "attributes"))); + + /* + * Prep work is finally done. Go get the SQL string. + */ + sql = get_sql_update(rel, pkattnums, pknumatts, src_pkattvals, tgt_pkattvals); + + /* + * Now we can close the relation. 
+ */ + relation_close(rel, AccessShareLock); + + /* + * And send it + */ + PG_RETURN_TEXT_P(cstring_to_text(sql)); } /* @@ -1864,8 +2090,8 @@ PG_FUNCTION_INFO_V1(dblink_current_query); Datum dblink_current_query(PG_FUNCTION_ARGS) { - /* This is now just an alias for the built-in function current_query() */ - PG_RETURN_DATUM(current_query(fcinfo)); + /* This is now just an alias for the built-in function current_query() */ + PG_RETURN_DATUM(current_query(fcinfo)); } /* @@ -1876,77 +2102,77 @@ dblink_current_query(PG_FUNCTION_ARGS) * connection per default. * */ -#define DBLINK_NOTIFY_COLS 3 +#define DBLINK_NOTIFY_COLS 3 PG_FUNCTION_INFO_V1(dblink_get_notify); Datum dblink_get_notify(PG_FUNCTION_ARGS) { - PGconn *conn; - PGnotify *notify; - ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; - TupleDesc tupdesc; - Tuplestorestate *tupstore; - MemoryContext per_query_ctx; - MemoryContext oldcontext; - - prepTuplestoreResult(fcinfo); - - dblink_init(); - if (PG_NARGS() == 1) - conn = dblink_get_named_conn(text_to_cstring(PG_GETARG_TEXT_PP(0))); - else - conn = pconn->conn; - - /* create the tuplestore in per-query memory */ - per_query_ctx = rsinfo->econtext->ecxt_per_query_memory; - oldcontext = MemoryContextSwitchTo(per_query_ctx); - - tupdesc = CreateTemplateTupleDesc(DBLINK_NOTIFY_COLS, false); - TupleDescInitEntry(tupdesc, (AttrNumber) 1, "notify_name", - TEXTOID, -1, 0); - TupleDescInitEntry(tupdesc, (AttrNumber) 2, "be_pid", - INT4OID, -1, 0); - TupleDescInitEntry(tupdesc, (AttrNumber) 3, "extra", - TEXTOID, -1, 0); - - tupstore = tuplestore_begin_heap(true, false, work_mem); - rsinfo->setResult = tupstore; - rsinfo->setDesc = tupdesc; - - MemoryContextSwitchTo(oldcontext); - - PQconsumeInput(conn); - while ((notify = PQnotifies(conn)) != NULL) - { - Datum values[DBLINK_NOTIFY_COLS]; - bool nulls[DBLINK_NOTIFY_COLS]; - - memset(values, 0, sizeof(values)); - memset(nulls, 0, sizeof(nulls)); - - if (notify->relname != NULL) - values[0] = CStringGetTextDatum(notify->relname); - else - nulls[0] = true; - - values[1] = Int32GetDatum(notify->be_pid); - - if (notify->extra != NULL) - values[2] = CStringGetTextDatum(notify->extra); - else - nulls[2] = true; - - tuplestore_putvalues(tupstore, tupdesc, values, nulls); - - PQfreemem(notify); - PQconsumeInput(conn); - } - - /* clean up and return the tuplestore */ - tuplestore_donestoring(tupstore); - - return (Datum) 0; + PGconn *conn; + PGnotify *notify; + ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; + TupleDesc tupdesc; + Tuplestorestate *tupstore; + MemoryContext per_query_ctx; + MemoryContext oldcontext; + + prepTuplestoreResult(fcinfo); + + dblink_init(); + if (PG_NARGS() == 1) + conn = dblink_get_named_conn(text_to_cstring(PG_GETARG_TEXT_PP(0))); + else + conn = pconn->conn; + + /* create the tuplestore in per-query memory */ + per_query_ctx = rsinfo->econtext->ecxt_per_query_memory; + oldcontext = MemoryContextSwitchTo(per_query_ctx); + + tupdesc = CreateTemplateTupleDesc(DBLINK_NOTIFY_COLS, false); + TupleDescInitEntry(tupdesc, (AttrNumber) 1, "notify_name", + TEXTOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 2, "be_pid", + INT4OID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 3, "extra", + TEXTOID, -1, 0); + + tupstore = tuplestore_begin_heap(true, false, work_mem); + rsinfo->setResult = tupstore; + rsinfo->setDesc = tupdesc; + + MemoryContextSwitchTo(oldcontext); + + PQconsumeInput(conn); + while ((notify = PQnotifies(conn)) != NULL) + { + Datum values[DBLINK_NOTIFY_COLS]; + bool 
nulls[DBLINK_NOTIFY_COLS]; + + memset(values, 0, sizeof(values)); + memset(nulls, 0, sizeof(nulls)); + + if (notify->relname != NULL) + values[0] = CStringGetTextDatum(notify->relname); + else + nulls[0] = true; + + values[1] = Int32GetDatum(notify->be_pid); + + if (notify->extra != NULL) + values[2] = CStringGetTextDatum(notify->extra); + else + nulls[2] = true; + + tuplestore_putvalues(tupstore, tupdesc, values, nulls); + + PQfreemem(notify); + PQconsumeInput(conn); + } + + /* clean up and return the tuplestore */ + tuplestore_donestoring(tupstore); + + return (Datum) 0; } /* @@ -1960,61 +2186,61 @@ PG_FUNCTION_INFO_V1(dblink_fdw_validator); Datum dblink_fdw_validator(PG_FUNCTION_ARGS) { - List *options_list = untransformRelOptions(PG_GETARG_DATUM(0)); - Oid context = PG_GETARG_OID(1); - ListCell *cell; - - static const PQconninfoOption *options = NULL; - - /* - * Get list of valid libpq options. - * - * To avoid unnecessary work, we get the list once and use it throughout - * the lifetime of this backend process. We don't need to care about - * memory context issues, because PQconndefaults allocates with malloc. - */ - if (!options) - { - options = PQconndefaults(); - if (!options) /* assume reason for failure is OOM */ - ereport(ERROR, - (errcode(ERRCODE_FDW_OUT_OF_MEMORY), - errmsg("out of memory"), - errdetail("could not get libpq's default connection options"))); - } - - /* Validate each supplied option. */ - foreach(cell, options_list) - { - DefElem *def = (DefElem *) lfirst(cell); - - if (!is_valid_dblink_option(options, def->defname, context)) - { - /* - * Unknown option, or invalid option for the context specified, so - * complain about it. Provide a hint with list of valid options - * for the context. - */ - StringInfoData buf; - const PQconninfoOption *opt; - - initStringInfo(&buf); - for (opt = options; opt->keyword; opt++) - { - if (is_valid_dblink_option(options, opt->keyword, context)) - appendStringInfo(&buf, "%s%s", - (buf.len > 0) ? ", " : "", - opt->keyword); - } - ereport(ERROR, - (errcode(ERRCODE_FDW_OPTION_NAME_NOT_FOUND), - errmsg("invalid option \"%s\"", def->defname), - errhint("Valid options in this context are: %s", - buf.data))); - } - } - - PG_RETURN_VOID(); + List *options_list = untransformRelOptions(PG_GETARG_DATUM(0)); + Oid context = PG_GETARG_OID(1); + ListCell *cell; + + static const PQconninfoOption *options = NULL; + + /* + * Get list of valid libpq options. + * + * To avoid unnecessary work, we get the list once and use it throughout + * the lifetime of this backend process. We don't need to care about + * memory context issues, because PQconndefaults allocates with malloc. + */ + if (!options) + { + options = PQconndefaults(); + if (!options) /* assume reason for failure is OOM */ + ereport(ERROR, + (errcode(ERRCODE_FDW_OUT_OF_MEMORY), + errmsg("out of memory"), + errdetail("could not get libpq's default connection options"))); + } + + /* Validate each supplied option. */ + foreach(cell, options_list) + { + DefElem *def = (DefElem *) lfirst(cell); + + if (!is_valid_dblink_option(options, def->defname, context)) + { + /* + * Unknown option, or invalid option for the context specified, so + * complain about it. Provide a hint with list of valid options + * for the context. + */ + StringInfoData buf; + const PQconninfoOption *opt; + + initStringInfo(&buf); + for (opt = options; opt->keyword; opt++) + { + if (is_valid_dblink_option(options, opt->keyword, context)) + appendStringInfo(&buf, "%s%s", + (buf.len > 0) ? 
", " : "", + opt->keyword); + } + ereport(ERROR, + (errcode(ERRCODE_FDW_OPTION_NAME_NOT_FOUND), + errmsg("invalid option \"%s\"", def->defname), + errhint("Valid options in this context are: %s", + buf.data))); + } + } + + PG_RETURN_VOID(); } @@ -2032,52 +2258,52 @@ dblink_fdw_validator(PG_FUNCTION_ARGS) static char ** get_pkey_attnames(Relation rel, int16 *numatts) { - Relation indexRelation; - ScanKeyData skey; - SysScanDesc scan; - HeapTuple indexTuple; - int i; - char **result = NULL; - TupleDesc tupdesc; - - /* initialize numatts to 0 in case no primary key exists */ - *numatts = 0; - - tupdesc = rel->rd_att; - - /* Prepare to scan pg_index for entries having indrelid = this rel. */ - indexRelation = heap_open(IndexRelationId, AccessShareLock); - ScanKeyInit(&skey, - Anum_pg_index_indrelid, - BTEqualStrategyNumber, F_OIDEQ, - ObjectIdGetDatum(RelationGetRelid(rel))); - - scan = systable_beginscan(indexRelation, IndexIndrelidIndexId, true, - NULL, 1, &skey); - - while (HeapTupleIsValid(indexTuple = systable_getnext(scan))) - { - Form_pg_index index = (Form_pg_index) GETSTRUCT(indexTuple); - - /* we're only interested if it is the primary key */ - if (index->indisprimary) - { - *numatts = index->indnatts; - if (*numatts > 0) - { - result = (char **) palloc(*numatts * sizeof(char *)); - - for (i = 0; i < *numatts; i++) - result[i] = SPI_fname(tupdesc, index->indkey.values[i]); - } - break; - } - } - - systable_endscan(scan); - heap_close(indexRelation, AccessShareLock); - - return result; + Relation indexRelation; + ScanKeyData skey; + SysScanDesc scan; + HeapTuple indexTuple; + int i; + char **result = NULL; + TupleDesc tupdesc; + + /* initialize numatts to 0 in case no primary key exists */ + *numatts = 0; + + tupdesc = rel->rd_att; + + /* Prepare to scan pg_index for entries having indrelid = this rel. 
*/ + indexRelation = heap_open(IndexRelationId, AccessShareLock); + ScanKeyInit(&skey, + Anum_pg_index_indrelid, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(RelationGetRelid(rel))); + + scan = systable_beginscan(indexRelation, IndexIndrelidIndexId, true, + NULL, 1, &skey); + + while (HeapTupleIsValid(indexTuple = systable_getnext(scan))) + { + Form_pg_index index = (Form_pg_index) GETSTRUCT(indexTuple); + + /* we're only interested if it is the primary key */ + if (index->indisprimary) + { + *numatts = index->indnatts; + if (*numatts > 0) + { + result = (char **) palloc(*numatts * sizeof(char *)); + + for (i = 0; i < *numatts; i++) + result[i] = SPI_fname(tupdesc, index->indkey.values[i]); + } + break; + } + } + + systable_endscan(scan); + heap_close(indexRelation, AccessShareLock); + + return result; } /* @@ -2087,255 +2313,255 @@ get_pkey_attnames(Relation rel, int16 *numatts) static char ** get_text_array_contents(ArrayType *array, int *numitems) { - int ndim = ARR_NDIM(array); - int *dims = ARR_DIMS(array); - int nitems; - int16 typlen; - bool typbyval; - char typalign; - char **values; - char *ptr; - bits8 *bitmap; - int bitmask; - int i; - - Assert(ARR_ELEMTYPE(array) == TEXTOID); - - *numitems = nitems = ArrayGetNItems(ndim, dims); - - get_typlenbyvalalign(ARR_ELEMTYPE(array), - &typlen, &typbyval, &typalign); - - values = (char **) palloc(nitems * sizeof(char *)); - - ptr = ARR_DATA_PTR(array); - bitmap = ARR_NULLBITMAP(array); - bitmask = 1; - - for (i = 0; i < nitems; i++) - { - if (bitmap && (*bitmap & bitmask) == 0) - { - values[i] = NULL; - } - else - { - values[i] = TextDatumGetCString(PointerGetDatum(ptr)); - ptr = att_addlength_pointer(ptr, typlen, ptr); - ptr = (char *) att_align_nominal(ptr, typalign); - } - - /* advance bitmap pointer if any */ - if (bitmap) - { - bitmask <<= 1; - if (bitmask == 0x100) - { - bitmap++; - bitmask = 1; - } - } - } - - return values; + int ndim = ARR_NDIM(array); + int *dims = ARR_DIMS(array); + int nitems; + int16 typlen; + bool typbyval; + char typalign; + char **values; + char *ptr; + bits8 *bitmap; + int bitmask; + int i; + + Assert(ARR_ELEMTYPE(array) == TEXTOID); + + *numitems = nitems = ArrayGetNItems(ndim, dims); + + get_typlenbyvalalign(ARR_ELEMTYPE(array), + &typlen, &typbyval, &typalign); + + values = (char **) palloc(nitems * sizeof(char *)); + + ptr = ARR_DATA_PTR(array); + bitmap = ARR_NULLBITMAP(array); + bitmask = 1; + + for (i = 0; i < nitems; i++) + { + if (bitmap && (*bitmap & bitmask) == 0) + { + values[i] = NULL; + } + else + { + values[i] = TextDatumGetCString(PointerGetDatum(ptr)); + ptr = att_addlength_pointer(ptr, typlen, ptr); + ptr = (char *) att_align_nominal(ptr, typalign); + } + + /* advance bitmap pointer if any */ + if (bitmap) + { + bitmask <<= 1; + if (bitmask == 0x100) + { + bitmap++; + bitmask = 1; + } + } + } + + return values; } static char * get_sql_insert(Relation rel, int *pkattnums, int pknumatts, char **src_pkattvals, char **tgt_pkattvals) { - char *relname; - HeapTuple tuple; - TupleDesc tupdesc; - int natts; - StringInfoData buf; - char *val; - int key; - int i; - bool needComma; - - initStringInfo(&buf); - - /* get relation name including any needed schema prefix and quoting */ - relname = generate_relation_name(rel); - - tupdesc = rel->rd_att; - natts = tupdesc->natts; - - tuple = get_tuple_of_interest(rel, pkattnums, pknumatts, src_pkattvals); - if (!tuple) - ereport(ERROR, - (errcode(ERRCODE_CARDINALITY_VIOLATION), - errmsg("source row not found"))); - - appendStringInfo(&buf, "INSERT 
INTO %s(", relname); - - needComma = false; - for (i = 0; i < natts; i++) - { - if (tupdesc->attrs[i]->attisdropped) - continue; - - if (needComma) - appendStringInfoChar(&buf, ','); - - appendStringInfoString(&buf, - quote_ident_cstr(NameStr(tupdesc->attrs[i]->attname))); - needComma = true; - } - - appendStringInfoString(&buf, ") VALUES("); - - /* - * Note: i is physical column number (counting from 0). - */ - needComma = false; - for (i = 0; i < natts; i++) - { - if (tupdesc->attrs[i]->attisdropped) - continue; - - if (needComma) - appendStringInfoChar(&buf, ','); - - key = get_attnum_pk_pos(pkattnums, pknumatts, i); - - if (key >= 0) - val = tgt_pkattvals[key] ? pstrdup(tgt_pkattvals[key]) : NULL; - else - val = SPI_getvalue(tuple, tupdesc, i + 1); - - if (val != NULL) - { - appendStringInfoString(&buf, quote_literal_cstr(val)); - pfree(val); - } - else - appendStringInfoString(&buf, "NULL"); - needComma = true; - } - appendStringInfoChar(&buf, ')'); - - return (buf.data); + char *relname; + HeapTuple tuple; + TupleDesc tupdesc; + int natts; + StringInfoData buf; + char *val; + int key; + int i; + bool needComma; + + initStringInfo(&buf); + + /* get relation name including any needed schema prefix and quoting */ + relname = generate_relation_name(rel); + + tupdesc = rel->rd_att; + natts = tupdesc->natts; + + tuple = get_tuple_of_interest(rel, pkattnums, pknumatts, src_pkattvals); + if (!tuple) + ereport(ERROR, + (errcode(ERRCODE_CARDINALITY_VIOLATION), + errmsg("source row not found"))); + + appendStringInfo(&buf, "INSERT INTO %s(", relname); + + needComma = false; + for (i = 0; i < natts; i++) + { + if (tupdesc->attrs[i]->attisdropped) + continue; + + if (needComma) + appendStringInfoChar(&buf, ','); + + appendStringInfoString(&buf, + quote_ident_cstr(NameStr(tupdesc->attrs[i]->attname))); + needComma = true; + } + + appendStringInfoString(&buf, ") VALUES("); + + /* + * Note: i is physical column number (counting from 0). + */ + needComma = false; + for (i = 0; i < natts; i++) + { + if (tupdesc->attrs[i]->attisdropped) + continue; + + if (needComma) + appendStringInfoChar(&buf, ','); + + key = get_attnum_pk_pos(pkattnums, pknumatts, i); + + if (key >= 0) + val = tgt_pkattvals[key] ? 
pstrdup(tgt_pkattvals[key]) : NULL; + else + val = SPI_getvalue(tuple, tupdesc, i + 1); + + if (val != NULL) + { + appendStringInfoString(&buf, quote_literal_cstr(val)); + pfree(val); + } + else + appendStringInfoString(&buf, "NULL"); + needComma = true; + } + appendStringInfoChar(&buf, ')'); + + return (buf.data); } static char * get_sql_delete(Relation rel, int *pkattnums, int pknumatts, char **tgt_pkattvals) { - char *relname; - TupleDesc tupdesc; - StringInfoData buf; - int i; + char *relname; + TupleDesc tupdesc; + StringInfoData buf; + int i; - initStringInfo(&buf); + initStringInfo(&buf); - /* get relation name including any needed schema prefix and quoting */ - relname = generate_relation_name(rel); + /* get relation name including any needed schema prefix and quoting */ + relname = generate_relation_name(rel); - tupdesc = rel->rd_att; + tupdesc = rel->rd_att; - appendStringInfo(&buf, "DELETE FROM %s WHERE ", relname); - for (i = 0; i < pknumatts; i++) - { - int pkattnum = pkattnums[i]; + appendStringInfo(&buf, "DELETE FROM %s WHERE ", relname); + for (i = 0; i < pknumatts; i++) + { + int pkattnum = pkattnums[i]; - if (i > 0) - appendStringInfoString(&buf, " AND "); + if (i > 0) + appendStringInfoString(&buf, " AND "); - appendStringInfoString(&buf, - quote_ident_cstr(NameStr(tupdesc->attrs[pkattnum]->attname))); + appendStringInfoString(&buf, + quote_ident_cstr(NameStr(tupdesc->attrs[pkattnum]->attname))); - if (tgt_pkattvals[i] != NULL) - appendStringInfo(&buf, " = %s", - quote_literal_cstr(tgt_pkattvals[i])); - else - appendStringInfoString(&buf, " IS NULL"); - } + if (tgt_pkattvals[i] != NULL) + appendStringInfo(&buf, " = %s", + quote_literal_cstr(tgt_pkattvals[i])); + else + appendStringInfoString(&buf, " IS NULL"); + } - return (buf.data); + return (buf.data); } static char * get_sql_update(Relation rel, int *pkattnums, int pknumatts, char **src_pkattvals, char **tgt_pkattvals) { - char *relname; - HeapTuple tuple; - TupleDesc tupdesc; - int natts; - StringInfoData buf; - char *val; - int key; - int i; - bool needComma; - - initStringInfo(&buf); - - /* get relation name including any needed schema prefix and quoting */ - relname = generate_relation_name(rel); - - tupdesc = rel->rd_att; - natts = tupdesc->natts; - - tuple = get_tuple_of_interest(rel, pkattnums, pknumatts, src_pkattvals); - if (!tuple) - ereport(ERROR, - (errcode(ERRCODE_CARDINALITY_VIOLATION), - errmsg("source row not found"))); - - appendStringInfo(&buf, "UPDATE %s SET ", relname); - - /* - * Note: i is physical column number (counting from 0). - */ - needComma = false; - for (i = 0; i < natts; i++) - { - if (tupdesc->attrs[i]->attisdropped) - continue; - - if (needComma) - appendStringInfoString(&buf, ", "); - - appendStringInfo(&buf, "%s = ", - quote_ident_cstr(NameStr(tupdesc->attrs[i]->attname))); - - key = get_attnum_pk_pos(pkattnums, pknumatts, i); - - if (key >= 0) - val = tgt_pkattvals[key] ? 
pstrdup(tgt_pkattvals[key]) : NULL; - else - val = SPI_getvalue(tuple, tupdesc, i + 1); - - if (val != NULL) - { - appendStringInfoString(&buf, quote_literal_cstr(val)); - pfree(val); - } - else - appendStringInfoString(&buf, "NULL"); - needComma = true; - } - - appendStringInfoString(&buf, " WHERE "); - - for (i = 0; i < pknumatts; i++) - { - int pkattnum = pkattnums[i]; - - if (i > 0) - appendStringInfoString(&buf, " AND "); - - appendStringInfoString(&buf, - quote_ident_cstr(NameStr(tupdesc->attrs[pkattnum]->attname))); - - val = tgt_pkattvals[i]; - - if (val != NULL) - appendStringInfo(&buf, " = %s", quote_literal_cstr(val)); - else - appendStringInfoString(&buf, " IS NULL"); - } - - return (buf.data); + char *relname; + HeapTuple tuple; + TupleDesc tupdesc; + int natts; + StringInfoData buf; + char *val; + int key; + int i; + bool needComma; + + initStringInfo(&buf); + + /* get relation name including any needed schema prefix and quoting */ + relname = generate_relation_name(rel); + + tupdesc = rel->rd_att; + natts = tupdesc->natts; + + tuple = get_tuple_of_interest(rel, pkattnums, pknumatts, src_pkattvals); + if (!tuple) + ereport(ERROR, + (errcode(ERRCODE_CARDINALITY_VIOLATION), + errmsg("source row not found"))); + + appendStringInfo(&buf, "UPDATE %s SET ", relname); + + /* + * Note: i is physical column number (counting from 0). + */ + needComma = false; + for (i = 0; i < natts; i++) + { + if (tupdesc->attrs[i]->attisdropped) + continue; + + if (needComma) + appendStringInfoString(&buf, ", "); + + appendStringInfo(&buf, "%s = ", + quote_ident_cstr(NameStr(tupdesc->attrs[i]->attname))); + + key = get_attnum_pk_pos(pkattnums, pknumatts, i); + + if (key >= 0) + val = tgt_pkattvals[key] ? pstrdup(tgt_pkattvals[key]) : NULL; + else + val = SPI_getvalue(tuple, tupdesc, i + 1); + + if (val != NULL) + { + appendStringInfoString(&buf, quote_literal_cstr(val)); + pfree(val); + } + else + appendStringInfoString(&buf, "NULL"); + needComma = true; + } + + appendStringInfoString(&buf, " WHERE "); + + for (i = 0; i < pknumatts; i++) + { + int pkattnum = pkattnums[i]; + + if (i > 0) + appendStringInfoString(&buf, " AND "); + + appendStringInfoString(&buf, + quote_ident_cstr(NameStr(tupdesc->attrs[pkattnum]->attname))); + + val = tgt_pkattvals[i]; + + if (val != NULL) + appendStringInfo(&buf, " = %s", quote_literal_cstr(val)); + else + appendStringInfoString(&buf, " IS NULL"); + } + + return (buf.data); } /* @@ -2345,136 +2571,136 @@ get_sql_update(Relation rel, int *pkattnums, int pknumatts, char **src_pkattvals static char * quote_ident_cstr(char *rawstr) { - text *rawstr_text; - text *result_text; - char *result; + text *rawstr_text; + text *result_text; + char *result; - rawstr_text = cstring_to_text(rawstr); - result_text = DatumGetTextPP(DirectFunctionCall1(quote_ident, - PointerGetDatum(rawstr_text))); - result = text_to_cstring(result_text); + rawstr_text = cstring_to_text(rawstr); + result_text = DatumGetTextPP(DirectFunctionCall1(quote_ident, + PointerGetDatum(rawstr_text))); + result = text_to_cstring(result_text); - return result; + return result; } static int get_attnum_pk_pos(int *pkattnums, int pknumatts, int key) { - int i; + int i; - /* - * Not likely a long list anyway, so just scan for the value - */ - for (i = 0; i < pknumatts; i++) - if (key == pkattnums[i]) - return i; + /* + * Not likely a long list anyway, so just scan for the value + */ + for (i = 0; i < pknumatts; i++) + if (key == pkattnums[i]) + return i; - return -1; + return -1; } static HeapTuple 
get_tuple_of_interest(Relation rel, int *pkattnums, int pknumatts, char **src_pkattvals) { - char *relname; - TupleDesc tupdesc; - int natts; - StringInfoData buf; - int ret; - HeapTuple tuple; - int i; - - /* - * Connect to SPI manager - */ - if ((ret = SPI_connect()) < 0) - /* internal error */ - elog(ERROR, "SPI connect failure - returned %d", ret); - - initStringInfo(&buf); - - /* get relation name including any needed schema prefix and quoting */ - relname = generate_relation_name(rel); - - tupdesc = rel->rd_att; - natts = tupdesc->natts; - - /* - * Build sql statement to look up tuple of interest, ie, the one matching - * src_pkattvals. We used to use "SELECT *" here, but it's simpler to - * generate a result tuple that matches the table's physical structure, - * with NULLs for any dropped columns. Otherwise we have to deal with two - * different tupdescs and everything's very confusing. - */ - appendStringInfoString(&buf, "SELECT "); - - for (i = 0; i < natts; i++) - { - if (i > 0) - appendStringInfoString(&buf, ", "); - - if (tupdesc->attrs[i]->attisdropped) - appendStringInfoString(&buf, "NULL"); - else - appendStringInfoString(&buf, - quote_ident_cstr(NameStr(tupdesc->attrs[i]->attname))); - } - - appendStringInfo(&buf, " FROM %s WHERE ", relname); - - for (i = 0; i < pknumatts; i++) - { - int pkattnum = pkattnums[i]; - - if (i > 0) - appendStringInfoString(&buf, " AND "); - - appendStringInfoString(&buf, - quote_ident_cstr(NameStr(tupdesc->attrs[pkattnum]->attname))); - - if (src_pkattvals[i] != NULL) - appendStringInfo(&buf, " = %s", - quote_literal_cstr(src_pkattvals[i])); - else - appendStringInfoString(&buf, " IS NULL"); - } - - /* - * Retrieve the desired tuple - */ - ret = SPI_exec(buf.data, 0); - pfree(buf.data); - - /* - * Only allow one qualifying tuple - */ - if ((ret == SPI_OK_SELECT) && (SPI_processed > 1)) - ereport(ERROR, - (errcode(ERRCODE_CARDINALITY_VIOLATION), - errmsg("source criteria matched more than one record"))); - - else if (ret == SPI_OK_SELECT && SPI_processed == 1) - { - SPITupleTable *tuptable = SPI_tuptable; - - tuple = SPI_copytuple(tuptable->vals[0]); - SPI_finish(); - - return tuple; - } - else - { - /* - * no qualifying tuples - */ - SPI_finish(); - - return NULL; - } - - /* - * never reached, but keep compiler quiet - */ - return NULL; + char *relname; + TupleDesc tupdesc; + int natts; + StringInfoData buf; + int ret; + HeapTuple tuple; + int i; + + /* + * Connect to SPI manager + */ + if ((ret = SPI_connect()) < 0) + /* internal error */ + elog(ERROR, "SPI connect failure - returned %d", ret); + + initStringInfo(&buf); + + /* get relation name including any needed schema prefix and quoting */ + relname = generate_relation_name(rel); + + tupdesc = rel->rd_att; + natts = tupdesc->natts; + + /* + * Build sql statement to look up tuple of interest, ie, the one matching + * src_pkattvals. We used to use "SELECT *" here, but it's simpler to + * generate a result tuple that matches the table's physical structure, + * with NULLs for any dropped columns. Otherwise we have to deal with two + * different tupdescs and everything's very confusing. 
+ */ + appendStringInfoString(&buf, "SELECT "); + + for (i = 0; i < natts; i++) + { + if (i > 0) + appendStringInfoString(&buf, ", "); + + if (tupdesc->attrs[i]->attisdropped) + appendStringInfoString(&buf, "NULL"); + else + appendStringInfoString(&buf, + quote_ident_cstr(NameStr(tupdesc->attrs[i]->attname))); + } + + appendStringInfo(&buf, " FROM %s WHERE ", relname); + + for (i = 0; i < pknumatts; i++) + { + int pkattnum = pkattnums[i]; + + if (i > 0) + appendStringInfoString(&buf, " AND "); + + appendStringInfoString(&buf, + quote_ident_cstr(NameStr(tupdesc->attrs[pkattnum]->attname))); + + if (src_pkattvals[i] != NULL) + appendStringInfo(&buf, " = %s", + quote_literal_cstr(src_pkattvals[i])); + else + appendStringInfoString(&buf, " IS NULL"); + } + + /* + * Retrieve the desired tuple + */ + ret = SPI_exec(buf.data, 0); + pfree(buf.data); + + /* + * Only allow one qualifying tuple + */ + if ((ret == SPI_OK_SELECT) && (SPI_processed > 1)) + ereport(ERROR, + (errcode(ERRCODE_CARDINALITY_VIOLATION), + errmsg("source criteria matched more than one record"))); + + else if (ret == SPI_OK_SELECT && SPI_processed == 1) + { + SPITupleTable *tuptable = SPI_tuptable; + + tuple = SPI_copytuple(tuptable->vals[0]); + SPI_finish(); + + return tuple; + } + else + { + /* + * no qualifying tuples + */ + SPI_finish(); + + return NULL; + } + + /* + * never reached, but keep compiler quiet + */ + return NULL; } /* @@ -2485,146 +2711,146 @@ get_tuple_of_interest(Relation rel, int *pkattnums, int pknumatts, char **src_pk static Relation get_rel_from_relname(text *relname_text, LOCKMODE lockmode, AclMode aclmode) { - RangeVar *relvar; - Relation rel; - AclResult aclresult; + RangeVar *relvar; + Relation rel; + AclResult aclresult; - relvar = makeRangeVarFromNameList(textToQualifiedNameList(relname_text)); - rel = heap_openrv(relvar, lockmode); + relvar = makeRangeVarFromNameList(textToQualifiedNameList(relname_text)); + rel = heap_openrv(relvar, lockmode); - aclresult = pg_class_aclcheck(RelationGetRelid(rel), GetUserId(), - aclmode); - if (aclresult != ACLCHECK_OK) - aclcheck_error(aclresult, ACL_KIND_CLASS, - RelationGetRelationName(rel)); + aclresult = pg_class_aclcheck(RelationGetRelid(rel), GetUserId(), + aclmode); + if (aclresult != ACLCHECK_OK) + aclcheck_error(aclresult, ACL_KIND_CLASS, + RelationGetRelationName(rel)); - return rel; + return rel; } /* * generate_relation_name - copied from ruleutils.c - * Compute the name to display for a relation + * Compute the name to display for a relation * * The result includes all necessary quoting and schema-prefixing. 
*/ static char * generate_relation_name(Relation rel) { - char *nspname; - char *result; + char *nspname; + char *result; - /* Qualify the name if not visible in search path */ - if (RelationIsVisible(RelationGetRelid(rel))) - nspname = NULL; - else - nspname = get_namespace_name(rel->rd_rel->relnamespace); + /* Qualify the name if not visible in search path */ + if (RelationIsVisible(RelationGetRelid(rel))) + nspname = NULL; + else + nspname = get_namespace_name(rel->rd_rel->relnamespace); - result = quote_qualified_identifier(nspname, RelationGetRelationName(rel)); + result = quote_qualified_identifier(nspname, RelationGetRelationName(rel)); - return result; + return result; } static remoteConn * getConnectionByName(const char *name) { - remoteConnHashEnt *hentry; - char *key; + remoteConnHashEnt *hentry; + char *key; - if (!remoteConnHash) - remoteConnHash = createConnHash(); + if (!remoteConnHash) + remoteConnHash = createConnHash(); - key = pstrdup(name); - truncate_identifier(key, strlen(key), false); - hentry = (remoteConnHashEnt *) hash_search(remoteConnHash, - key, HASH_FIND, NULL); + key = pstrdup(name); + truncate_identifier(key, strlen(key), false); + hentry = (remoteConnHashEnt *) hash_search(remoteConnHash, + key, HASH_FIND, NULL); - if (hentry) - return (hentry->rconn); + if (hentry) + return (hentry->rconn); - return (NULL); + return (NULL); } static HTAB * createConnHash(void) { - HASHCTL ctl; + HASHCTL ctl; - ctl.keysize = NAMEDATALEN; - ctl.entrysize = sizeof(remoteConnHashEnt); + ctl.keysize = NAMEDATALEN; + ctl.entrysize = sizeof(remoteConnHashEnt); - return hash_create("Remote Con hash", NUMCONN, &ctl, HASH_ELEM); + return hash_create("Remote Con hash", NUMCONN, &ctl, HASH_ELEM); } static void createNewConnection(const char *name, remoteConn *rconn) { - remoteConnHashEnt *hentry; - bool found; - char *key; - - if (!remoteConnHash) - remoteConnHash = createConnHash(); - - key = pstrdup(name); - truncate_identifier(key, strlen(key), true); - hentry = (remoteConnHashEnt *) hash_search(remoteConnHash, key, - HASH_ENTER, &found); - - if (found) - { - PQfinish(rconn->conn); - pfree(rconn); - - ereport(ERROR, - (errcode(ERRCODE_DUPLICATE_OBJECT), - errmsg("duplicate connection name"))); - } - - hentry->rconn = rconn; - strlcpy(hentry->name, name, sizeof(hentry->name)); + remoteConnHashEnt *hentry; + bool found; + char *key; + + if (!remoteConnHash) + remoteConnHash = createConnHash(); + + key = pstrdup(name); + truncate_identifier(key, strlen(key), true); + hentry = (remoteConnHashEnt *) hash_search(remoteConnHash, key, + HASH_ENTER, &found); + + if (found) + { + PQfinish(rconn->conn); + pfree(rconn); + + ereport(ERROR, + (errcode(ERRCODE_DUPLICATE_OBJECT), + errmsg("duplicate connection name"))); + } + + hentry->rconn = rconn; + strlcpy(hentry->name, name, sizeof(hentry->name)); } static void deleteConnection(const char *name) { - remoteConnHashEnt *hentry; - bool found; - char *key; + remoteConnHashEnt *hentry; + bool found; + char *key; - if (!remoteConnHash) - remoteConnHash = createConnHash(); + if (!remoteConnHash) + remoteConnHash = createConnHash(); - key = pstrdup(name); - truncate_identifier(key, strlen(key), false); - hentry = (remoteConnHashEnt *) hash_search(remoteConnHash, - key, HASH_REMOVE, &found); + key = pstrdup(name); + truncate_identifier(key, strlen(key), false); + hentry = (remoteConnHashEnt *) hash_search(remoteConnHash, + key, HASH_REMOVE, &found); - if (!hentry) - ereport(ERROR, - (errcode(ERRCODE_UNDEFINED_OBJECT), - errmsg("undefined connection 
name"))); + if (!hentry) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("undefined connection name"))); } static void dblink_security_check(PGconn *conn, remoteConn *rconn) { - if (!superuser()) - { - if (!PQconnectionUsedPassword(conn)) - { - PQfinish(conn); - if (rconn) - pfree(rconn); - - ereport(ERROR, - (errcode(ERRCODE_S_R_E_PROHIBITED_SQL_STATEMENT_ATTEMPTED), - errmsg("password is required"), - errdetail("Non-superuser cannot connect if the server does not request a password."), - errhint("Target server's authentication method must be changed."))); - } - } + if (!superuser()) + { + if (!PQconnectionUsedPassword(conn)) + { + PQfinish(conn); + if (rconn) + pfree(rconn); + + ereport(ERROR, + (errcode(ERRCODE_S_R_E_PROHIBITED_SQL_STATEMENT_ATTEMPTED), + errmsg("password is required"), + errdetail("Non-superuser cannot connect if the server does not request a password."), + errhint("Target server's authentication method must be changed."))); + } + } } /* @@ -2636,96 +2862,96 @@ dblink_security_check(PGconn *conn, remoteConn *rconn) static void dblink_connstr_check(const char *connstr) { - if (!superuser()) - { - PQconninfoOption *options; - PQconninfoOption *option; - bool connstr_gives_password = false; - - options = PQconninfoParse(connstr, NULL); - if (options) - { - for (option = options; option->keyword != NULL; option++) - { - if (strcmp(option->keyword, "password") == 0) - { - if (option->val != NULL && option->val[0] != '\0') - { - connstr_gives_password = true; - break; - } - } - } - PQconninfoFree(options); - } - - if (!connstr_gives_password) - ereport(ERROR, - (errcode(ERRCODE_S_R_E_PROHIBITED_SQL_STATEMENT_ATTEMPTED), - errmsg("password is required"), - errdetail("Non-superusers must provide a password in the connection string."))); - } + if (!superuser()) + { + PQconninfoOption *options; + PQconninfoOption *option; + bool connstr_gives_password = false; + + options = PQconninfoParse(connstr, NULL); + if (options) + { + for (option = options; option->keyword != NULL; option++) + { + if (strcmp(option->keyword, "password") == 0) + { + if (option->val != NULL && option->val[0] != '\0') + { + connstr_gives_password = true; + break; + } + } + } + PQconninfoFree(options); + } + + if (!connstr_gives_password) + ereport(ERROR, + (errcode(ERRCODE_S_R_E_PROHIBITED_SQL_STATEMENT_ATTEMPTED), + errmsg("password is required"), + errdetail("Non-superusers must provide a password in the connection string."))); + } } static void dblink_res_error(PGconn *conn, const char *conname, PGresult *res, - const char *dblink_context_msg, bool fail) + const char *dblink_context_msg, bool fail) { - int level; - char *pg_diag_sqlstate = PQresultErrorField(res, PG_DIAG_SQLSTATE); - char *pg_diag_message_primary = PQresultErrorField(res, PG_DIAG_MESSAGE_PRIMARY); - char *pg_diag_message_detail = PQresultErrorField(res, PG_DIAG_MESSAGE_DETAIL); - char *pg_diag_message_hint = PQresultErrorField(res, PG_DIAG_MESSAGE_HINT); - char *pg_diag_context = PQresultErrorField(res, PG_DIAG_CONTEXT); - int sqlstate; - char *message_primary; - char *message_detail; - char *message_hint; - char *message_context; - const char *dblink_context_conname = "unnamed"; - - if (fail) - level = ERROR; - else - level = NOTICE; - - if (pg_diag_sqlstate) - sqlstate = MAKE_SQLSTATE(pg_diag_sqlstate[0], - pg_diag_sqlstate[1], - pg_diag_sqlstate[2], - pg_diag_sqlstate[3], - pg_diag_sqlstate[4]); - else - sqlstate = ERRCODE_CONNECTION_FAILURE; - - message_primary = xpstrdup(pg_diag_message_primary); - message_detail = 
xpstrdup(pg_diag_message_detail); - message_hint = xpstrdup(pg_diag_message_hint); - message_context = xpstrdup(pg_diag_context); - - /* - * If we don't get a message from the PGresult, try the PGconn. This is - * needed because for connection-level failures, PQexec may just return - * NULL, not a PGresult at all. - */ - if (message_primary == NULL) - message_primary = pchomp(PQerrorMessage(conn)); - - if (res) - PQclear(res); - - if (conname) - dblink_context_conname = conname; - - ereport(level, - (errcode(sqlstate), - message_primary ? errmsg_internal("%s", message_primary) : - errmsg("could not obtain message string for remote error"), - message_detail ? errdetail_internal("%s", message_detail) : 0, - message_hint ? errhint("%s", message_hint) : 0, - message_context ? errcontext("%s", message_context) : 0, - errcontext("Error occurred on dblink connection named \"%s\": %s.", - dblink_context_conname, dblink_context_msg))); + int level; + char *pg_diag_sqlstate = PQresultErrorField(res, PG_DIAG_SQLSTATE); + char *pg_diag_message_primary = PQresultErrorField(res, PG_DIAG_MESSAGE_PRIMARY); + char *pg_diag_message_detail = PQresultErrorField(res, PG_DIAG_MESSAGE_DETAIL); + char *pg_diag_message_hint = PQresultErrorField(res, PG_DIAG_MESSAGE_HINT); + char *pg_diag_context = PQresultErrorField(res, PG_DIAG_CONTEXT); + int sqlstate; + char *message_primary; + char *message_detail; + char *message_hint; + char *message_context; + const char *dblink_context_conname = "unnamed"; + + if (fail) + level = ERROR; + else + level = NOTICE; + + if (pg_diag_sqlstate) + sqlstate = MAKE_SQLSTATE(pg_diag_sqlstate[0], + pg_diag_sqlstate[1], + pg_diag_sqlstate[2], + pg_diag_sqlstate[3], + pg_diag_sqlstate[4]); + else + sqlstate = ERRCODE_CONNECTION_FAILURE; + + message_primary = xpstrdup(pg_diag_message_primary); + message_detail = xpstrdup(pg_diag_message_detail); + message_hint = xpstrdup(pg_diag_message_hint); + message_context = xpstrdup(pg_diag_context); + + /* + * If we don't get a message from the PGresult, try the PGconn. This is + * needed because for connection-level failures, PQexec may just return + * NULL, not a PGresult at all. + */ + if (message_primary == NULL) + message_primary = pchomp(PQerrorMessage(conn)); + + if (res) + PQclear(res); + + if (conname) + dblink_context_conname = conname; + + ereport(level, + (errcode(sqlstate), + message_primary ? errmsg_internal("%s", message_primary) : + errmsg("could not obtain message string for remote error"), + message_detail ? errdetail_internal("%s", message_detail) : 0, + message_hint ? errhint("%s", message_hint) : 0, + message_context ? errcontext("%s", message_context) : 0, + errcontext("Error occurred on dblink connection named \"%s\": %s.", + dblink_context_conname, dblink_context_msg))); } /* @@ -2734,86 +2960,86 @@ dblink_res_error(PGconn *conn, const char *conname, PGresult *res, static char * get_connect_string(const char *servername) { - ForeignServer *foreign_server = NULL; - UserMapping *user_mapping; - ListCell *cell; - StringInfoData buf; - ForeignDataWrapper *fdw; - AclResult aclresult; - char *srvname; - - static const PQconninfoOption *options = NULL; - - initStringInfo(&buf); - - /* - * Get list of valid libpq options. - * - * To avoid unnecessary work, we get the list once and use it throughout - * the lifetime of this backend process. We don't need to care about - * memory context issues, because PQconndefaults allocates with malloc. 
- */ - if (!options) - { - options = PQconndefaults(); - if (!options) /* assume reason for failure is OOM */ - ereport(ERROR, - (errcode(ERRCODE_FDW_OUT_OF_MEMORY), - errmsg("out of memory"), - errdetail("could not get libpq's default connection options"))); - } - - /* first gather the server connstr options */ - srvname = pstrdup(servername); - truncate_identifier(srvname, strlen(srvname), false); - foreign_server = GetForeignServerByName(srvname, true); - - if (foreign_server) - { - Oid serverid = foreign_server->serverid; - Oid fdwid = foreign_server->fdwid; - Oid userid = GetUserId(); - - user_mapping = GetUserMapping(userid, serverid); - fdw = GetForeignDataWrapper(fdwid); - - /* Check permissions, user must have usage on the server. */ - aclresult = pg_foreign_server_aclcheck(serverid, userid, ACL_USAGE); - if (aclresult != ACLCHECK_OK) - aclcheck_error(aclresult, ACL_KIND_FOREIGN_SERVER, foreign_server->servername); - - foreach(cell, fdw->options) - { - DefElem *def = lfirst(cell); - - if (is_valid_dblink_option(options, def->defname, ForeignDataWrapperRelationId)) - appendStringInfo(&buf, "%s='%s' ", def->defname, - escape_param_str(strVal(def->arg))); - } - - foreach(cell, foreign_server->options) - { - DefElem *def = lfirst(cell); - - if (is_valid_dblink_option(options, def->defname, ForeignServerRelationId)) - appendStringInfo(&buf, "%s='%s' ", def->defname, - escape_param_str(strVal(def->arg))); - } - - foreach(cell, user_mapping->options) - { - - DefElem *def = lfirst(cell); - - if (is_valid_dblink_option(options, def->defname, UserMappingRelationId)) - appendStringInfo(&buf, "%s='%s' ", def->defname, - escape_param_str(strVal(def->arg))); - } - - return buf.data; - } - else - return NULL; + ForeignServer *foreign_server = NULL; + UserMapping *user_mapping; + ListCell *cell; + StringInfoData buf; + ForeignDataWrapper *fdw; + AclResult aclresult; + char *srvname; + + static const PQconninfoOption *options = NULL; + + initStringInfo(&buf); + + /* + * Get list of valid libpq options. + * + * To avoid unnecessary work, we get the list once and use it throughout + * the lifetime of this backend process. We don't need to care about + * memory context issues, because PQconndefaults allocates with malloc. + */ + if (!options) + { + options = PQconndefaults(); + if (!options) /* assume reason for failure is OOM */ + ereport(ERROR, + (errcode(ERRCODE_FDW_OUT_OF_MEMORY), + errmsg("out of memory"), + errdetail("could not get libpq's default connection options"))); + } + + /* first gather the server connstr options */ + srvname = pstrdup(servername); + truncate_identifier(srvname, strlen(srvname), false); + foreign_server = GetForeignServerByName(srvname, true); + + if (foreign_server) + { + Oid serverid = foreign_server->serverid; + Oid fdwid = foreign_server->fdwid; + Oid userid = GetUserId(); + + user_mapping = GetUserMapping(userid, serverid); + fdw = GetForeignDataWrapper(fdwid); + + /* Check permissions, user must have usage on the server. 
*/ + aclresult = pg_foreign_server_aclcheck(serverid, userid, ACL_USAGE); + if (aclresult != ACLCHECK_OK) + aclcheck_error(aclresult, ACL_KIND_FOREIGN_SERVER, foreign_server->servername); + + foreach(cell, fdw->options) + { + DefElem *def = lfirst(cell); + + if (is_valid_dblink_option(options, def->defname, ForeignDataWrapperRelationId)) + appendStringInfo(&buf, "%s='%s' ", def->defname, + escape_param_str(strVal(def->arg))); + } + + foreach(cell, foreign_server->options) + { + DefElem *def = lfirst(cell); + + if (is_valid_dblink_option(options, def->defname, ForeignServerRelationId)) + appendStringInfo(&buf, "%s='%s' ", def->defname, + escape_param_str(strVal(def->arg))); + } + + foreach(cell, user_mapping->options) + { + + DefElem *def = lfirst(cell); + + if (is_valid_dblink_option(options, def->defname, UserMappingRelationId)) + appendStringInfo(&buf, "%s='%s' ", def->defname, + escape_param_str(strVal(def->arg))); + } + + return buf.data; + } + else + return NULL; } /* @@ -2824,19 +3050,19 @@ get_connect_string(const char *servername) static char * escape_param_str(const char *str) { - const char *cp; - StringInfoData buf; + const char *cp; + StringInfoData buf; - initStringInfo(&buf); + initStringInfo(&buf); - for (cp = str; *cp; cp++) - { - if (*cp == '\\' || *cp == '\'') - appendStringInfoChar(&buf, '\\'); - appendStringInfoChar(&buf, *cp); - } + for (cp = str; *cp; cp++) + { + if (*cp == '\\' || *cp == '\'') + appendStringInfoChar(&buf, '\\'); + appendStringInfoChar(&buf, *cp); + } - return buf.data; + return buf.data; } /* @@ -2856,58 +3082,58 @@ escape_param_str(const char *str) */ static void validate_pkattnums(Relation rel, - int2vector *pkattnums_arg, int32 pknumatts_arg, - int **pkattnums, int *pknumatts) + int2vector *pkattnums_arg, int32 pknumatts_arg, + int **pkattnums, int *pknumatts) { - TupleDesc tupdesc = rel->rd_att; - int natts = tupdesc->natts; - int i; - - /* Don't take more array elements than there are */ - pknumatts_arg = Min(pknumatts_arg, pkattnums_arg->dim1); - - /* Must have at least one pk attnum selected */ - if (pknumatts_arg <= 0) - ereport(ERROR, - (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("number of key attributes must be > 0"))); - - /* Allocate output array */ - *pkattnums = (int *) palloc(pknumatts_arg * sizeof(int)); - *pknumatts = pknumatts_arg; - - /* Validate attnums and convert to internal form */ - for (i = 0; i < pknumatts_arg; i++) - { - int pkattnum = pkattnums_arg->values[i]; - int lnum; - int j; - - /* Can throw error immediately if out of range */ - if (pkattnum <= 0 || pkattnum > natts) - ereport(ERROR, - (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("invalid attribute number %d", pkattnum))); - - /* Identify which physical column has this logical number */ - lnum = 0; - for (j = 0; j < natts; j++) - { - /* dropped columns don't count */ - if (tupdesc->attrs[j]->attisdropped) - continue; - - if (++lnum == pkattnum) - break; - } - - if (j < natts) - (*pkattnums)[i] = j; - else - ereport(ERROR, - (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("invalid attribute number %d", pkattnum))); - } + TupleDesc tupdesc = rel->rd_att; + int natts = tupdesc->natts; + int i; + + /* Don't take more array elements than there are */ + pknumatts_arg = Min(pknumatts_arg, pkattnums_arg->dim1); + + /* Must have at least one pk attnum selected */ + if (pknumatts_arg <= 0) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("number of key attributes must be > 0"))); + + /* Allocate output array */ + *pkattnums = (int *) 
palloc(pknumatts_arg * sizeof(int)); + *pknumatts = pknumatts_arg; + + /* Validate attnums and convert to internal form */ + for (i = 0; i < pknumatts_arg; i++) + { + int pkattnum = pkattnums_arg->values[i]; + int lnum; + int j; + + /* Can throw error immediately if out of range */ + if (pkattnum <= 0 || pkattnum > natts) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid attribute number %d", pkattnum))); + + /* Identify which physical column has this logical number */ + lnum = 0; + for (j = 0; j < natts; j++) + { + /* dropped columns don't count */ + if (tupdesc->attrs[j]->attisdropped) + continue; + + if (++lnum == pkattnum) + break; + } + + if (j < natts) + (*pkattnums)[i] = j; + else + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid attribute number %d", pkattnum))); + } } /* @@ -2915,11 +3141,11 @@ validate_pkattnums(Relation rel, * * We basically allow whatever libpq thinks is an option, with these * restrictions: - * debug options: disallowed - * "client_encoding": disallowed - * "user": valid only in USER MAPPING options - * secure options (eg password): valid only in USER MAPPING options - * others: valid only in FOREIGN SERVER options + * debug options: disallowed + * "client_encoding": disallowed + * "user": valid only in USER MAPPING options + * secure options (eg password): valid only in USER MAPPING options + * others: valid only in FOREIGN SERVER options * * We disallow client_encoding because it would be overridden anyway via * PQclientEncoding; allowing it to be specified would merely promote @@ -2927,43 +3153,43 @@ validate_pkattnums(Relation rel, */ static bool is_valid_dblink_option(const PQconninfoOption *options, const char *option, - Oid context) + Oid context) { - const PQconninfoOption *opt; - - /* Look up the option in libpq result */ - for (opt = options; opt->keyword; opt++) - { - if (strcmp(opt->keyword, option) == 0) - break; - } - if (opt->keyword == NULL) - return false; - - /* Disallow debug options (particularly "replication") */ - if (strchr(opt->dispchar, 'D')) - return false; - - /* Disallow "client_encoding" */ - if (strcmp(opt->keyword, "client_encoding") == 0) - return false; - - /* - * If the option is "user" or marked secure, it should be specified only - * in USER MAPPING. Others should be specified only in SERVER. - */ - if (strcmp(opt->keyword, "user") == 0 || strchr(opt->dispchar, '*')) - { - if (context != UserMappingRelationId) - return false; - } - else - { - if (context != ForeignServerRelationId) - return false; - } - - return true; + const PQconninfoOption *opt; + + /* Look up the option in libpq result */ + for (opt = options; opt->keyword; opt++) + { + if (strcmp(opt->keyword, option) == 0) + break; + } + if (opt->keyword == NULL) + return false; + + /* Disallow debug options (particularly "replication") */ + if (strchr(opt->dispchar, 'D')) + return false; + + /* Disallow "client_encoding" */ + if (strcmp(opt->keyword, "client_encoding") == 0) + return false; + + /* + * If the option is "user" or marked secure, it should be specified only + * in USER MAPPING. Others should be specified only in SERVER. 
+ */ + if (strcmp(opt->keyword, "user") == 0 || strchr(opt->dispchar, '*')) + { + if (context != UserMappingRelationId) + return false; + } + else + { + if (context != ForeignServerRelationId) + return false; + } + + return true; } /* @@ -2979,50 +3205,50 @@ is_valid_dblink_option(const PQconninfoOption *options, const char *option, static int applyRemoteGucs(PGconn *conn) { - static const char *const GUCsAffectingIO[] = { - "DateStyle", - "IntervalStyle" - }; - - int nestlevel = -1; - int i; - - for (i = 0; i < lengthof(GUCsAffectingIO); i++) - { - const char *gucName = GUCsAffectingIO[i]; - const char *remoteVal = PQparameterStatus(conn, gucName); - const char *localVal; - - /* - * If the remote server is pre-8.4, it won't have IntervalStyle, but - * that's okay because its output format won't be ambiguous. So just - * skip the GUC if we don't get a value for it. (We might eventually - * need more complicated logic with remote-version checks here.) - */ - if (remoteVal == NULL) - continue; - - /* - * Avoid GUC-setting overhead if the remote and local GUCs already - * have the same value. - */ - localVal = GetConfigOption(gucName, false, false); - Assert(localVal != NULL); - - if (strcmp(remoteVal, localVal) == 0) - continue; - - /* Create new GUC nest level if we didn't already */ - if (nestlevel < 0) - nestlevel = NewGUCNestLevel(); - - /* Apply the option (this will throw error on failure) */ - (void) set_config_option(gucName, remoteVal, - PGC_USERSET, PGC_S_SESSION, - GUC_ACTION_SAVE, true, 0, false); - } - - return nestlevel; + static const char *const GUCsAffectingIO[] = { + "DateStyle", + "IntervalStyle" + }; + + int nestlevel = -1; + int i; + + for (i = 0; i < lengthof(GUCsAffectingIO); i++) + { + const char *gucName = GUCsAffectingIO[i]; + const char *remoteVal = PQparameterStatus(conn, gucName); + const char *localVal; + + /* + * If the remote server is pre-8.4, it won't have IntervalStyle, but + * that's okay because its output format won't be ambiguous. So just + * skip the GUC if we don't get a value for it. (We might eventually + * need more complicated logic with remote-version checks here.) + */ + if (remoteVal == NULL) + continue; + + /* + * Avoid GUC-setting overhead if the remote and local GUCs already + * have the same value. 
+ */ + localVal = GetConfigOption(gucName, false, false); + Assert(localVal != NULL); + + if (strcmp(remoteVal, localVal) == 0) + continue; + + /* Create new GUC nest level if we didn't already */ + if (nestlevel < 0) + nestlevel = NewGUCNestLevel(); + + /* Apply the option (this will throw error on failure) */ + (void) set_config_option(gucName, remoteVal, + PGC_USERSET, PGC_S_SESSION, + GUC_ACTION_SAVE, true, 0, false); + } + + return nestlevel; } /* @@ -3031,7 +3257,7 @@ applyRemoteGucs(PGconn *conn) static void restoreLocalGucs(int nestlevel) { - /* Do nothing if no new nestlevel was created */ - if (nestlevel > 0) - AtEOXact_GUC(true, nestlevel); + /* Do nothing if no new nestlevel was created */ + if (nestlevel > 0) + AtEOXact_GUC(true, nestlevel); } diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c index cf727d6f..73736d71 100644 --- a/src/backend/tcop/utility.c +++ b/src/backend/tcop/utility.c @@ -953,35 +953,31 @@ ProcessUtilityPre(PlannedStmt *pstmt, break; case T_CreateFdwStmt: - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("Postgres-XL does not support FOREIGN DATA WRAPPER yet"), - errdetail("The feature is not currently supported"))); + exec_type = EXEC_ON_ALL_NODES; break; case T_AlterFdwStmt: + exec_type = EXEC_ON_ALL_NODES; break; case T_CreateForeignServerStmt: - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("Postgres-XL does not support SERVER yet"), - errdetail("The feature is not currently supported"))); + exec_type = EXEC_ON_ALL_NODES; break; case T_AlterForeignServerStmt: + exec_type = EXEC_ON_ALL_NODES; break; case T_CreateUserMappingStmt: - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("Postgres-XL does not support USER MAPPING yet"), - errdetail("The feature is not currently supported"))); + exec_type = EXEC_ON_ALL_NODES; break; case T_AlterUserMappingStmt: case T_DropUserMappingStmt: + exec_type = EXEC_ON_ALL_NODES; + break; case T_ImportForeignSchemaStmt: + break; case T_CompositeTypeStmt: /* CREATE TYPE (composite) */ case T_CreateEnumStmt: /* CREATE TYPE AS ENUM */ case T_CreateRangeStmt: /* CREATE TYPE AS RANGE */ From 35c5c7526b75d66f73b297fb25fff3c0b5225863 Mon Sep 17 00:00:00 2001 From: yeyukui Date: Tue, 6 Apr 2021 17:34:13 +0800 Subject: [PATCH 370/578] fix coredump about crypt --- src/backend/utils/misc/relcrypt.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/backend/utils/misc/relcrypt.c b/src/backend/utils/misc/relcrypt.c index fa69b54d..954ff6c6 100644 --- a/src/backend/utils/misc/relcrypt.c +++ b/src/backend/utils/misc/relcrypt.c @@ -1609,7 +1609,15 @@ Datum trsprt_crypt_decrypt_one_col_value(TranspCrypt*transp_crypt, if (TRANSP_CRYPT_INVALID_ALGORITHM_ID != transp_crypt->algo_id) { datum_text = decrypt_procedure(transp_crypt->algo_id, DatumGetTextP(inputval), INVALID_CONTEXT_LENGTH); + if (datum_text) + { datum_ret = transparent_crypt_text_get_datum(datum_text, attr); + } + else + { + datum_ret = transparent_crypt_text_get_datum(DatumGetTextP(inputval), attr); + } + return datum_ret; } From 2d5f2ba3cae3e7d8526f8489af44579df90a4f20 Mon Sep 17 00:00:00 2001 From: whalesong Date: Thu, 13 May 2021 19:24:35 +0800 Subject: [PATCH 371/578] 2pc files opt: add 2pc hash table on shmem (merge request 300), bugfix: 2pc file not found --- src/backend/access/transam/twophase.c | 398 +++++++++++++++----------- 1 file changed, 226 insertions(+), 172 deletions(-) diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c index 
78ae69ff..f2fbc7e6 100644 --- a/src/backend/access/transam/twophase.c +++ b/src/backend/access/transam/twophase.c @@ -168,6 +168,8 @@ int record_2pc_partitions = 32; #define MAX_2PC_INFO_SIZE (record_2pc_entry_size - MAX_TID_SIZE) #define DFLT_2PC_INFO_SIZE 1024 /* default size */ +#define MAX_RETRY_TIMES 2 + /* hash table entry for 2pc record */ typedef struct Cache2pcInfo { @@ -179,6 +181,9 @@ typedef struct Cache2pcInfo inline void check_entry_key(const char *tid, const char *key, const char *func); +void +print_record_2pc_cache(const char *func); + void check_2pc_file(const char *tid, const char *info, const char *func); #endif @@ -2303,6 +2308,7 @@ CheckPointTwoPhase(XLogRecPtr redo_horizon) {// #lizard forgives int i; int serialized_xacts = 0; + char *func = "CheckPointTwoPhase"; #ifdef __TWO_PHASE_TRANS__ File fd = -1; @@ -2313,7 +2319,8 @@ CheckPointTwoPhase(XLogRecPtr redo_horizon) char path[MAXPGPATH]; #endif - elog(LOG, "[CheckPointTwoPhase] checkpoint: "UINT64_FORMAT, redo_horizon); + + elog(LOG, "[%s] checkpoint: "UINT64_FORMAT, func, redo_horizon); if (max_prepared_xacts <= 0) return; /* nothing to do */ @@ -2368,11 +2375,10 @@ CheckPointTwoPhase(XLogRecPtr redo_horizon) { /* save to file */ Assert(NULL != entry); - check_entry_key(gxact->gid, entry->key, "CheckPointTwoPhase"); - check_2pc_file(gxact->gid, entry->info, "CheckPointTwoPhase"); + check_entry_key(gxact->gid, entry->key, func); + check_2pc_file(gxact->gid, entry->info, func); - elog(LOG, "[CheckPointTwoPhase] %s is found " - "in hash table", gxact->gid); + elog(LOG, "[%s] %s is found in hash table", func, gxact->gid); size = strlen(entry->info); @@ -2382,38 +2388,54 @@ CheckPointTwoPhase(XLogRecPtr redo_horizon) fd = open(path, O_RDWR | O_CREAT | O_EXCL, S_IRUSR | S_IWUSR); if (fd < 0) { - elog(ERROR, "[CheckPointTwoPhase] could not create file %s, " - "errMsg: %s", path, strerror(errno)); + elog(ERROR, "[%s] could not create file %s, errMsg: %s", + func, path, strerror(errno)); } ret = write(fd, entry->info, size); if(ret != size) { close(fd); - elog(ERROR, "[CheckPointTwoPhase] could not write file %s, " - "errMsg: %s, ret: %d, info: %s", - path, strerror(errno), ret, entry->info); + elog(ERROR, "[%s] could not write file %s, errMsg: %s, " + "ret: %d, info: %s", + func, path, strerror(errno), ret, entry->info); + } + + if (size != strlen(entry->info)) + { + elog(LOG, "[%s] %s size change from %d to %zu, info: %s", + func, gxact->gid, size, strlen(entry->info), entry->info); + + Assert(size < strlen(entry->info)); + ret = write(fd, entry->info + size, strlen(entry->info) - size); + if(ret != strlen(entry->info) - size) + { + close(fd); + elog(ERROR, "[%s] could not write file %s, errMsg: %s, " + "ret: %d, info: %s", + func, path, strerror(errno), ret, entry->info); + } } close(fd); + fsync_fname(path, false); /* remove from hash table */ entry = (Cache2pcInfo *)hash_search(record_2pc_cache, gxact->gid, HASH_REMOVE, &found); if (!found) { - elog(WARNING, "[CheckPointTwoPhase] %s is not found " - "in hash table when remove it", gxact->gid); + elog(WARNING, "[%s] %s is not found in hash table " + "when remove it", func, gxact->gid); } - else if (enable_2pc_entry_trace) + else { - elog(LOG, "[CheckPointTwoPhase] %s is removed " - "from hash table", gxact->gid); + elog(LOG, "[%s] %s is removed from hash table", + func, gxact->gid); } } else { - elog(LOG, "[CheckPointTwoPhase] %s is not found " - "in hash table", gxact->gid); + elog(LOG, "[%s] %s is not found in hash table", func, gxact->gid); } } #endif @@ -2439,10 
+2461,9 @@ CheckPointTwoPhase(XLogRecPtr redo_horizon) while ((entry = hash_seq_search(&seq)) != NULL) { Assert(NULL != entry); - check_2pc_file(entry->key, entry->info, "CheckPointTwoPhase"); + check_2pc_file(entry->key, entry->info, func); - elog(LOG, "[CheckPointTwoPhase] key %s is found " - "in hash table", entry->key); + elog(LOG, "[%s] key %s is found in hash table", func, entry->key); if (IsXidImplicit(entry->key)) { @@ -2457,20 +2478,20 @@ CheckPointTwoPhase(XLogRecPtr redo_horizon) if (0 != strcmp(start_node, PGXCNodeName)) { - elog(LOG, "[CheckPointTwoPhase] %s start node is not %s", - entry->key, PGXCNodeName); + elog(LOG, "[%s] %s start node is not %s", + func, entry->key, PGXCNodeName); continue; } else { - elog(LOG, "[CheckPointTwoPhase] %s start node is %s", - entry->key, PGXCNodeName); + elog(LOG, "[%s] %s start node is %s", + func, entry->key, PGXCNodeName); } } else { - elog(WARNING, "[CheckPointTwoPhase] %s get start node failed, " - "info: %s", entry->key, entry->info); + elog(WARNING, "[%s] %s get start node failed, info: %s", + func, entry->key, entry->info); } } @@ -2482,32 +2503,49 @@ CheckPointTwoPhase(XLogRecPtr redo_horizon) fd = open(path, O_RDWR | O_CREAT | O_EXCL, S_IRUSR | S_IWUSR); if (fd < 0) { - elog(ERROR, "[CheckPointTwoPhase] could not create file %s, " - "errMsg: %s", path, strerror(errno)); + elog(ERROR, "[%s] could not create file %s, errMsg: %s", + func, path, strerror(errno)); } ret = write(fd, entry->info, size); if(ret != size) { close(fd); - elog(ERROR, "[CheckPointTwoPhase] could not write file %s, " - "errMsg: %s, ret: %d, info: %s", - path, strerror(errno), ret, entry->info); + elog(ERROR, "[%s] could not write file %s, errMsg: %s, " + "ret: %d, info: %s", + func, path, strerror(errno), ret, entry->info); } + + if (size != strlen(entry->info)) + { + elog(LOG, "[%s] %s size change from %d to %zu, info: %s", + func, entry->key, size, strlen(entry->info), entry->info); + + Assert(size < strlen(entry->info)); + ret = write(fd, entry->info + size, strlen(entry->info) - size); + if(ret != strlen(entry->info) - size) + { close(fd); + elog(ERROR, "[%s] could not write file %s, errMsg: %s, " + "ret: %d, info: %s", + func, path, strerror(errno), ret, entry->info); + } + } + close(fd); + fsync_fname(path, false); /* remove from hash table */ entry = (Cache2pcInfo *)hash_search(record_2pc_cache, entry->key, HASH_REMOVE, &found); if (!found) { - elog(WARNING, "[CheckPointTwoPhase] %s is not found " - "in hash table when remove it", entry->key); + elog(WARNING, "[%s] %s is not found in hash table " + "when remove it", func, entry->key); } - else if (enable_2pc_entry_trace) + else { - elog(LOG, "[CheckPointTwoPhase] %s is removed " - "from hash table", entry->key); + elog(LOG, "[%s] %s is removed from hash table", + func, entry->key); } } } @@ -3364,6 +3402,22 @@ inline void check_entry_key(const char *tid, const char *key, const char *func) } } +void print_record_2pc_cache(const char *func) +{ + if (NULL != record_2pc_cache) + { + HASH_SEQ_STATUS seq; + Cache2pcInfo *entry = NULL; + + hash_seq_init(&seq, record_2pc_cache); + while ((entry = hash_seq_search(&seq)) != NULL) + { + Assert(NULL != entry); + elog(LOG, "[print_record_2pc_cache][%s] key: %s, info: %s", + func, entry->key, entry->info); + } + } +} /* * Check whether the 2pc file is exist when it is saved in the hash table. 
*/ @@ -3505,6 +3559,8 @@ void record_2pc_involved_nodes_xid(const char * tid, char *result = NULL; Cache2pcInfo *entry = NULL; bool found = false; + char *func = "record_2pc_involved_nodes_xid"; + #ifdef __TWO_PHASE_TESTS__ XLogRecPtr xlogrec = 0; #endif @@ -3516,23 +3572,22 @@ void record_2pc_involved_nodes_xid(const char * tid, if (enable_distri_print || enable_2pc_entry_trace) { - elog(LOG, "[record_2pc_involved_nodes_xid] record %s, " - "startnode: %s, participants: %s", - tid, startnode, nodestring); + elog(LOG, "[%s] record %s, startnode: %s, participants: %s", + func, tid, startnode, nodestring); } if (NULL == tid || '\0' == tid[0]) { - elog(ERROR, "[record_2pc_involved_nodes_xid] gid is empty"); + elog(ERROR, "[%s] gid is empty", func); } if (NULL == startnode || '\0' == startnode[0]) { - elog(PANIC, "[record_2pc_involved_nodes_xid] %s startnode is empty", tid); + elog(PANIC, "[%s] %s startnode is empty", func, tid); } if (NULL == nodestring || '\0' == nodestring[0]) { - elog(PANIC, "[record_2pc_involved_nodes_xid] %s participants is empty", tid); + elog(PANIC, "[%s] %s participants is empty", func, tid); } initStringInfo(&content); @@ -3554,14 +3609,14 @@ void record_2pc_involved_nodes_xid(const char * tid, if (found) { Assert(NULL != entry); - check_entry_key(tid, entry->key, "record_2pc_involved_nodes_xid"); - check_2pc_file(tid, entry->info, "record_2pc_involved_nodes_xid"); + check_entry_key(tid, entry->key, func); + check_2pc_file(tid, entry->info, func); if (strncmp(entry->info, content.data, size) != 0) { - elog(ERROR, "[record_2pc_involved_nodes_xid] pg_clean attemp to " - "write %s info conflict, content: %s, info: %s", - tid, content.data, entry->info); + elog(ERROR, "[%s] pg_clean attemp to write %s info conflict, " + "content: %s, info: %s", + func, tid, content.data, entry->info); } resetStringInfo(&content); @@ -3583,8 +3638,7 @@ void record_2pc_involved_nodes_xid(const char * tid, { ereport(ERROR, (errcode_for_file_access(), - errmsg("[record_2pc_involved_nodes_xid] could not " - "open file %s for read", path))); + errmsg("[%s] could not open file %s for read", func, path))); } ret = FileRead(fd, result, fileSize, WAIT_EVENT_BUFFILE_READ); if(ret != fileSize) @@ -3592,8 +3646,7 @@ void record_2pc_involved_nodes_xid(const char * tid, FileClose(fd); ereport(ERROR, (errcode_for_file_access(), - errmsg("[record_2pc_involved_nodes_xid] could not " - "read file %s, ret: %d", path, ret))); + errmsg("[%s] could not read file %s, ret: %d", func, path, ret))); } FileClose(fd); @@ -3601,9 +3654,9 @@ void record_2pc_involved_nodes_xid(const char * tid, if (strncmp(result, content.data, size) != 0) { - elog(ERROR, "[record_2pc_involved_nodes_xid] pg_clean attemp to " - "write %s info conflict, content: %s, info: %s", - tid, content.data, result); + elog(ERROR, "[%s] pg_clean attemp to write %s info conflict, " + "content: %s, info: %s", + func, tid, content.data, result); } pfree(result); @@ -3633,8 +3686,8 @@ void record_2pc_involved_nodes_xid(const char * tid, XLogFlush(xlogrec); run_pg_clean = 1; complish = true; - elog(STOP, "[record_2pc_involved_nodes_xid] twophase exception: " - "simulate kill start node after record 2pc file"); + elog(STOP, "[%s] twophase exception: simulate kill start node " + "after record 2pc file", func); } #endif } @@ -3646,29 +3699,28 @@ void record_2pc_involved_nodes_xid(const char * tid, tid, HASH_ENTER_NULL, &found); if (NULL != entry) { - check_entry_key(tid, entry->key, "record_2pc_involved_nodes_xid"); - check_2pc_file(tid, entry->info, 
"record_2pc_involved_nodes_xid"); + check_entry_key(tid, entry->key, func); + check_2pc_file(tid, entry->info, func); if (found) { if (RecoveryInProgress()) { - elog(LOG, "[record_2pc_involved_nodes_xid] %s is found " - "in hash table in recovery mode", tid); + elog(LOG, "[%s] %s is found in hash table in recovery mode", + func, tid); } else { - elog(LOG, "[record_2pc_involved_nodes_xid] %s is found " - "in hash table", tid); + elog(LOG, "[%s] %s is found in hash table", func, tid); } } else if (enable_2pc_entry_trace) { - elog(LOG, "[record_2pc_involved_nodes_xid] %s is added " - "to hash table", tid); + elog(LOG, "[%s] %s is added to hash table", func, tid); } memcpy(entry->info, content.data, size + 1); + check_entry_key(tid, entry->key, func); resetStringInfo(&content); pfree(content.data); @@ -3676,13 +3728,13 @@ void record_2pc_involved_nodes_xid(const char * tid, } else { - elog(LOG, "[record_2pc_involved_nodes_xid] %s entry is NULL", tid); + elog(LOG, "[%s] %s entry is NULL", func, tid); } } else if (NULL != record_2pc_cache) { - elog(LOG, "[record_2pc_involved_nodes_xid] %s size: %d, " - "max info size: %d", tid, size, MAX_2PC_INFO_SIZE); + elog(LOG, "[%s] %s size: %d, max info size: %d", + func, tid, size, MAX_2PC_INFO_SIZE); } GET_2PC_FILE_PATH(path, tid); @@ -3705,8 +3757,8 @@ void record_2pc_involved_nodes_xid(const char * tid, } if (fd < 0) { - elog(ERROR, "[record_2pc_involved_nodes_xid] could not create file %s, " - "errMsg: %s", path, strerror(errno)); + elog(ERROR, "[%s] could not create file %s, errMsg: %s", + func, path, strerror(errno)); return; } @@ -3714,9 +3766,8 @@ void record_2pc_involved_nodes_xid(const char * tid, if(ret != size) { FileClose(fd); - elog(ERROR, "[record_2pc_involved_nodes_xid] could not write file %s, " - "errMsg: %s, ret: %d, content: %s", - path, strerror(errno), ret, content.data); + elog(ERROR, "[%s] could not write file %s, errMsg: %s, ret: %d, content: %s", + func, path, strerror(errno), ret, content.data); } FileClose(fd); @@ -3734,13 +3785,11 @@ void record_2pc_commit_timestamp(const char *tid, GlobalTimestamp commit_timesta int ret = 0; int size = 0; int new_size = 0; + int retry_times = 0; XLogRecPtr xlogrec = 0; Cache2pcInfo *entry = NULL; bool found = false; -#if 0 - int i; - GlobalTransaction gxact = NULL; -#endif + char *func = "record_2pc_commit_timestamp"; if (!enable_2pc_recovery_info) { @@ -3749,17 +3798,16 @@ void record_2pc_commit_timestamp(const char *tid, GlobalTimestamp commit_timesta if (enable_distri_print || enable_2pc_entry_trace) { - elog(LOG, "[record_2pc_commit_timestamp] %s commit_timestamp: " - INT64_FORMAT, tid, commit_timestamp); + elog(LOG, "[%s] %s commit_timestamp: "INT64_FORMAT, + func, tid, commit_timestamp); } Assert(tid[0] != '\0'); if (InvalidGlobalTimestamp == commit_timestamp && (TWO_PHASE_COMMITTING == g_twophase_state.state || TWO_PHASE_COMMIT_END == g_twophase_state.state)) { - elog(ERROR, "[record_2pc_commit_timestamp] could not commit " - "transaction '%s' on node '%s' with InvalidGlobalTimestamp", - tid, PGXCNodeName); + elog(ERROR, "[%s] could not commit transaction '%s' on node '%s' " + "with InvalidGlobalTimestamp", func, tid, PGXCNodeName); } if (!RecoveryInProgress()) @@ -3782,25 +3830,24 @@ void record_2pc_commit_timestamp(const char *tid, GlobalTimestamp commit_timesta size = content.len; Assert(size == strlen(content.data)); - if (NULL != record_2pc_cache) + while (NULL != record_2pc_cache && retry_times++ < MAX_RETRY_TIMES) { Assert(strlen(tid) < MAX_TID_SIZE); entry = (Cache2pcInfo 
*)hash_search(record_2pc_cache, tid, HASH_FIND, &found); if (found) { Assert(NULL != entry); - check_entry_key(tid, entry->key, "record_2pc_commit_timestamp"); - check_2pc_file(tid, entry->info, "record_2pc_commit_timestamp"); + check_entry_key(tid, entry->key, func); + check_2pc_file(tid, entry->info, func); if (RecoveryInProgress()) { - elog(LOG, "[record_2pc_commit_timestamp] %s is found " - "in hash table in recovery mode", tid); + elog(LOG, "[%s] %s is found in hash table in recovery mode", + func, tid); } else if (enable_2pc_entry_trace) { - elog(LOG, "[record_2pc_commit_timestamp] %s is found " - "in hash table", tid); + elog(LOG, "[%s] %s is found in hash table", func, tid); } new_size = size + strlen(entry->info); @@ -3809,6 +3856,7 @@ void record_2pc_commit_timestamp(const char *tid, GlobalTimestamp commit_timesta { /* save to hash table */ memcpy(entry->info + strlen(entry->info), content.data, size + 1); + check_entry_key(tid, entry->key, func); resetStringInfo(&content); pfree(content.data); @@ -3816,8 +3864,8 @@ void record_2pc_commit_timestamp(const char *tid, GlobalTimestamp commit_timesta } /* save to file */ - elog(LOG, "[record_2pc_commit_timestamp] %s new size(%d) " - "overflow(%d)", tid, new_size, MAX_2PC_INFO_SIZE); + elog(LOG, "[%s] %s new size(%d) overflow(%d)", + func, tid, new_size, MAX_2PC_INFO_SIZE); GET_2PC_FILE_PATH(path, tid); @@ -3833,9 +3881,8 @@ void record_2pc_commit_timestamp(const char *tid, GlobalTimestamp commit_timesta } if (fd < 0) { - elog(ERROR, "[record_2pc_commit_timestamp] could not " - "append timestamp in file %s, errMsg: %s", - path, strerror(errno)); + elog(ERROR, "[%s] could not append timestamp in file %s, errMsg: %s", + func, path, strerror(errno)); } ret = FileWrite(fd, entry->info, strlen(entry->info), @@ -3843,18 +3890,18 @@ void record_2pc_commit_timestamp(const char *tid, GlobalTimestamp commit_timesta if(ret != strlen(entry->info)) { FileClose(fd); - elog(ERROR, "[record_2pc_commit_timestamp] could not write " - "file %s, errMsg: %s, ret: %d, info: %s", - path, strerror(errno), ret, entry->info); + elog(ERROR, "[%s] could not write file %s, errMsg: %s, " + "ret: %d, info: %s", + func, path, strerror(errno), ret, entry->info); } ret = FileWrite(fd, content.data, size, WAIT_EVENT_BUFFILE_WRITE); if(ret != size) { FileClose(fd); - elog(ERROR, "[record_2pc_commit_timestamp] could not write " - "file %s, errMsg: %s, ret: %d, info: %s", - path, strerror(errno), ret, content.data); + elog(ERROR, "[%s] could not write file %s, errMsg: %s, " + "ret: %d, info: %s", + func, path, strerror(errno), ret, content.data); } FileClose(fd); @@ -3863,24 +3910,32 @@ void record_2pc_commit_timestamp(const char *tid, GlobalTimestamp commit_timesta tid, HASH_REMOVE, &found); if (!found) { - elog(WARNING, "[record_2pc_commit_timestamp] %s is not found" - "in hash table when remove it", tid); + elog(WARNING, "[%s] %s is not found in hash table when remove it", + func, tid); } else if (enable_2pc_entry_trace) { - elog(LOG, "[record_2pc_commit_timestamp] %s is removed " - "from hash table", entry->key); + elog(LOG, "[%s] %s is removed from hash table", func, entry->key); } resetStringInfo(&content); pfree(content.data); return; } - else - { - elog(LOG, "[record_2pc_commit_timestamp] %s is not found " - "in hash table", tid); + + /* not found */ + elog(LOG, "[%s] %s is not found in hash table, retry times: %d", + func, tid, retry_times); + + Assert(NULL == entry); + print_record_2pc_cache(func); + + pg_usleep(5000L); /* sleep 5ms */ } + + if (NULL != 
record_2pc_cache) + { + elog(LOG, "[%s] %s is not found in hash table, get from disk", func, tid); } GET_2PC_FILE_PATH(path, tid); @@ -3892,6 +3947,8 @@ void record_2pc_commit_timestamp(const char *tid, GlobalTimestamp commit_timesta if (RecoveryInProgress()) { #if 0 + int i; + GlobalTransaction gxact = NULL; for (i = 0; i < TwoPhaseState->numPrepXacts; i++) { gxact = TwoPhaseState->prepXacts[i]; @@ -3901,19 +3958,18 @@ void record_2pc_commit_timestamp(const char *tid, GlobalTimestamp commit_timesta } if (0 == strcmp(gxact->gid, tid)) { - elog(ERROR, "[record_2pc_commit_timestamp] could not " - "append timestamp in file %s, errMsg: %s", - path, strerror(errno)); + elog(ERROR, "[%s] could not append timestamp in file %s, " + "errMsg: %s", func, path, strerror(errno)); } } #endif - elog(LOG, "[record_2pc_commit_timestamp] could not open file %s, " - "errMsg: %s", path, strerror(errno)); + elog(LOG, "[%s] could not open file %s, errMsg: %s", + func, path, strerror(errno)); } else { - elog(ERROR, "[record_2pc_commit_timestamp] could not open file %s, " - "errMsg: %s", path, strerror(errno)); + elog(ERROR, "[%s] could not open file %s, errMsg: %s", + func, path, strerror(errno)); } return; } @@ -3922,17 +3978,16 @@ void record_2pc_commit_timestamp(const char *tid, GlobalTimestamp commit_timesta { memset(file_content, 0, FILE_CONTENT_SIZE); ret = FileRead(fd, file_content, FILE_CONTENT_SIZE, WAIT_EVENT_BUFFILE_READ); - elog(LOG, "[record_2pc_commit_timestamp] before append file: %s, " - "file_content: %s, content.data: %s, ret: %d", - path, file_content, content.data, ret); + elog(LOG, "[%s] before append file: %s, file_content: %s, content.data: %s, " + "ret: %d", func, path, file_content, content.data, ret); } ret = FileWrite(fd, content.data, size, WAIT_EVENT_BUFFILE_WRITE); if(ret != size) { FileClose(fd); - elog(ERROR, "[record_2pc_commit_timestamp] could not write file %s, " - "errMsg: %s", path, strerror(errno)); + elog(ERROR, "[%s] could not write file %s, errMsg: %s", + func, path, strerror(errno)); } if (enable_distri_print) @@ -3940,9 +3995,8 @@ void record_2pc_commit_timestamp(const char *tid, GlobalTimestamp commit_timesta memset(file_content, 0, FILE_CONTENT_SIZE); FileSeek(fd, 0, SEEK_SET); ret = FileRead(fd, file_content, FILE_CONTENT_SIZE, WAIT_EVENT_BUFFILE_READ); - elog(LOG, "[record_2pc_commit_timestamp] after append file: %s, " - "file_content: %s, ret: %d", - path, file_content, ret); + elog(LOG, "[%s] after append file: %s, file_content: %s, ret: %d", + func, path, file_content, ret); } FileClose(fd); @@ -3956,6 +4010,7 @@ void remove_2pc_records(const char * tid, bool record_in_xlog) char path[MAXPGPATH]; Cache2pcInfo *entry = NULL; bool found = false; + char *func = "remove_2pc_records"; if (!enable_2pc_recovery_info) { @@ -3964,8 +4019,7 @@ void remove_2pc_records(const char * tid, bool record_in_xlog) if (enable_distri_print || enable_2pc_entry_trace) { - elog(LOG, "[remove_2pc_records] %s record_in_xlog: %d", - tid, record_in_xlog); + elog(LOG, "[%s] %s record_in_xlog: %d", func, tid, record_in_xlog); } if (!RecoveryInProgress() && record_in_xlog) @@ -3987,8 +4041,8 @@ void remove_2pc_records(const char * tid, bool record_in_xlog) if (found) { Assert(NULL != entry); - check_entry_key(tid, entry->key, "remove_2pc_records"); - check_2pc_file(tid, entry->info, "remove_2pc_records"); + check_entry_key(tid, entry->key, func); + check_2pc_file(tid, entry->info, func); } } entry = (Cache2pcInfo *)hash_search(record_2pc_cache, @@ -3998,8 +4052,7 @@ void 
remove_2pc_records(const char * tid, bool record_in_xlog) Assert(NULL != entry); if (enable_2pc_entry_trace) { - elog(LOG, "[remove_2pc_records] %s is removed " - "from hash table", tid); + elog(LOG, "[%s] %s is removed from hash table", func, tid); } return; } @@ -4013,8 +4066,8 @@ void remove_2pc_records(const char * tid, bool record_in_xlog) */ if (0 != unlink(path)) { - elog(LOG, "[remove_2pc_records] could not unlink file %s, " - "errMsg: %s", path, strerror(errno)); + elog(LOG, "[%s] could not unlink file %s, errMsg: %s", + func, path, strerror(errno)); } } @@ -4026,6 +4079,7 @@ void rename_2pc_records(const char *tid, TimestampTz timestamp) bool found = false; File fd = 0; int ret = 0; + char *func = "rename_2pc_records"; if (!enable_2pc_recovery_info) { @@ -4034,8 +4088,7 @@ void rename_2pc_records(const char *tid, TimestampTz timestamp) if (enable_distri_print || enable_2pc_entry_trace) { - elog(LOG, "[rename_2pc_records] %s timestamp: " - INT64_FORMAT, tid, timestamp); + elog(LOG, "[%s] %s timestamp: "INT64_FORMAT, func, tid, timestamp); } if (0 == timestamp) @@ -4064,8 +4117,8 @@ void rename_2pc_records(const char *tid, TimestampTz timestamp) if (found) { Assert(NULL != entry); - check_entry_key(tid, entry->key, "rename_2pc_records"); - check_2pc_file(tid, entry->info, "rename_2pc_records"); + check_entry_key(tid, entry->key, func); + check_2pc_file(tid, entry->info, func); if (RecoveryInProgress()) { @@ -4079,8 +4132,8 @@ void rename_2pc_records(const char *tid, TimestampTz timestamp) } if (fd < 0) { - elog(ERROR, "[rename_2pc_records] could not create file %s, " - "errMsg: %s", new_path, strerror(errno)); + elog(ERROR, "[%s] could not create file %s, errMsg: %s", + func, new_path, strerror(errno)); } ret = FileWrite(fd, entry->info, strlen(entry->info), @@ -4088,9 +4141,9 @@ void rename_2pc_records(const char *tid, TimestampTz timestamp) if(ret != strlen(entry->info)) { FileClose(fd); - elog(ERROR, "[rename_2pc_records] could not write file %s, " - "errMsg: %s, ret: %d, info: %s", - path, strerror(errno), ret, entry->info); + elog(ERROR, "[%s] could not write file %s, errMsg: %s, " + "ret: %d, info: %s", + func, path, strerror(errno), ret, entry->info); } FileClose(fd); @@ -4098,13 +4151,12 @@ void rename_2pc_records(const char *tid, TimestampTz timestamp) tid, HASH_REMOVE, &found); if (!found) { - elog(ERROR, "[rename_2pc_records] %s is not found " - "in hash table when remove it", tid); + elog(ERROR, "[%s] %s is not found in hash table when remove it", + func, tid); } else if (enable_2pc_entry_trace) { - elog(LOG, "[rename_2pc_records] %s is removed " - "from hash table", tid); + elog(LOG, "[%s] %s is removed from hash table", func, tid); } return; } @@ -4112,19 +4164,19 @@ void rename_2pc_records(const char *tid, TimestampTz timestamp) if (0 != access(path, F_OK)) { - elog(LOG, "[rename_2pc_records] could not access file %s, " - "errMsg: %s", path, strerror(errno)); + elog(LOG, "[%s] could not access file %s, errMsg: %s", + func, path, strerror(errno)); return; } if (0 != link(path, new_path)) { - elog(ERROR, "[rename_2pc_records] could not link file %s to %s, " - "errMsg: %s", path, new_path, strerror(errno)); + elog(ERROR, "[%s] could not link file %s to %s, errMsg: %s", + func, path, new_path, strerror(errno)); } if (0 != unlink(path)) { - elog(WARNING, "[rename_2pc_records] could not unlink file %s, " - "errMsg: %s", path, strerror(errno)); + elog(WARNING, "[%s] could not unlink file %s, errMsg: %s", + func, path, strerror(errno)); } } @@ -4136,6 +4188,7 @@ void 
record_2pc_readonly(const char *gid) char content[10] = "readonly"; Cache2pcInfo *entry = NULL; bool found = false; + char *func = "record_2pc_readonly"; if(!enable_2pc_recovery_info) { @@ -4144,7 +4197,7 @@ void record_2pc_readonly(const char *gid) if (enable_distri_print || enable_2pc_entry_trace) { - elog(LOG, "[record_2pc_readonly] %s is readonly", gid); + elog(LOG, "[%s] %s is readonly", func, gid); } if (!RecoveryInProgress()) @@ -4163,33 +4216,34 @@ void record_2pc_readonly(const char *gid) gid, HASH_ENTER_NULL, &found); if (NULL != entry) { - check_entry_key(gid, entry->key, "record_2pc_readonly"); - check_2pc_file(gid, entry->info, "record_2pc_readonly"); + check_entry_key(gid, entry->key, func); + check_2pc_file(gid, entry->info, func); if (found) { if (RecoveryInProgress()) { - elog(LOG, "[record_2pc_readonly] %s is found " - "in hash table in recovery mode", gid); + elog(LOG, "[%s] %s is found in hash table in recovery mode", + func, gid); } else { - elog(LOG, "[record_2pc_readonly] %s is found " - "in hash table", gid); + elog(LOG, "[%s] %s is found in hash table", func, gid); } } else if (enable_2pc_entry_trace) { - elog(LOG, "[record_2pc_readonly] %s is added " - "to hash table", gid); + elog(LOG, "[%s] %s is added to hash table", func, gid); } + memcpy(entry->info, content, strlen(content) + 1); + check_entry_key(gid, entry->key, func); + return; } else { - elog(LOG, "[record_2pc_readonly] %s entry is NULL", gid); + elog(LOG, "[%s] %s entry is NULL", func, gid); } } @@ -4215,8 +4269,8 @@ void record_2pc_readonly(const char *gid) } if (fd < 0) { - elog(ERROR, "[record_2pc_readonly] could not create file %s, " - "errMsg: %s", path, strerror(errno)); + elog(ERROR, "[%s] could not create file %s, errMsg: %s", + func, path, strerror(errno)); return; } @@ -4224,9 +4278,8 @@ void record_2pc_readonly(const char *gid) if(ret != strlen(content)) { FileClose(fd); - elog(ERROR, "[record_2pc_readonly] could not write file %s, " - "errMsg: %s, ret: %d, content: %s", - path, strerror(errno), ret, content); + elog(ERROR, "[%s] could not write file %s, errMsg: %s, ret: %d, content: %s", + func, path, strerror(errno), ret, content); } FileClose(fd); } @@ -4238,6 +4291,8 @@ char *get_2pc_info_from_cache(const char *tid) { Cache2pcInfo *entry = NULL; bool found = false; + char *func = "get_2pc_info_from_cache"; + if (NULL != record_2pc_cache) { Assert(strlen(tid) < MAX_TID_SIZE); @@ -4247,13 +4302,12 @@ char *get_2pc_info_from_cache(const char *tid) { Assert(NULL != entry); - check_entry_key(tid, entry->key, "get_2pc_info_from_cache"); + check_entry_key(tid, entry->key, func); if (enable_2pc_entry_trace) { - elog(LOG, "[get_2pc_info_from_cache] %s is found " - "in hast table, key: %s, info: %s", - tid, entry->key, entry->info); + elog(LOG, "[%s] %s is found in hast table, key: %s, info: %s", + func, tid, entry->key, entry->info); } return entry->info; @@ -4261,8 +4315,7 @@ char *get_2pc_info_from_cache(const char *tid) if (enable_2pc_entry_trace) { - elog(LOG, "[get_2pc_info_from_cache] %s is not found " - "in hast table", tid); + elog(LOG, "[%s] %s is not found in hast table", func, tid); } } return NULL; @@ -4276,6 +4329,7 @@ char *get_2pc_list_from_cache(int *count) HASH_SEQ_STATUS seq; Cache2pcInfo *entry = NULL; char *recordList = NULL; + char *func = "get_2pc_list_from_cache"; if (NULL == record_2pc_cache) { @@ -4286,7 +4340,7 @@ char *get_2pc_list_from_cache(int *count) while ((entry = hash_seq_search(&seq)) != NULL) { Assert(NULL != entry); - check_2pc_file(entry->key, entry->info, 
"get_2pc_list_from_cache"); + check_2pc_file(entry->key, entry->info, func); if (NULL != count && *count >= MAX_OUTPUT_FILE) { @@ -4335,7 +4389,7 @@ Record2pcCacheInit(void) flags = HASH_ELEM | HASH_PARTITION; record_2pc_cache = ShmemInitHash("Record 2pc Cache", - record_2pc_cache_size/4, record_2pc_cache_size, + record_2pc_cache_size/2, record_2pc_cache_size, &info, flags); } From 89d29959851865d6f3d2d28d7af871db8530036f Mon Sep 17 00:00:00 2001 From: hanwayjiang Date: Thu, 13 May 2021 20:16:42 +0800 Subject: [PATCH 372/578] fix regress for http://tapd.oa.com/pgxz/prong/stories/view/1010092131864638363 (merge request !324) Squash merge branch 'tbase_v2_hanway513' into 'Tbase_v2.15.19' * fix regress for http://tapd.oa.com/pgxz/prong/stories/view/1010092131864638363 --- src/test/regress/expected/alter_generic.out | 30 +- src/test/regress/expected/event_trigger.out | 7 +- src/test/regress/expected/foreign_data.out | 1298 +++++++++++------ src/test/regress/expected/object_address.out | 219 ++- src/test/regress/expected/rolenames.out | 191 ++- .../regress/expected/xl_limitations_1.out | 5 +- 6 files changed, 1145 insertions(+), 605 deletions(-) diff --git a/src/test/regress/expected/alter_generic.out b/src/test/regress/expected/alter_generic.out index 2d7998ff..788c5964 100644 --- a/src/test/regress/expected/alter_generic.out +++ b/src/test/regress/expected/alter_generic.out @@ -149,34 +149,28 @@ SELECT n.nspname, c.conname, a.rolname -- Foreign Data Wrapper and Foreign Server -- CREATE FOREIGN DATA WRAPPER alt_fdw1; -ERROR: Postgres-XL does not support FOREIGN DATA WRAPPER yet -DETAIL: The feature is not currently supported CREATE FOREIGN DATA WRAPPER alt_fdw2; -ERROR: Postgres-XL does not support FOREIGN DATA WRAPPER yet -DETAIL: The feature is not currently supported CREATE SERVER alt_fserv1 FOREIGN DATA WRAPPER alt_fdw1; -ERROR: Postgres-XL does not support SERVER yet -DETAIL: The feature is not currently supported CREATE SERVER alt_fserv2 FOREIGN DATA WRAPPER alt_fdw2; -ERROR: Postgres-XL does not support SERVER yet -DETAIL: The feature is not currently supported ALTER FOREIGN DATA WRAPPER alt_fdw1 RENAME TO alt_fdw2; -- failed (name conflict) -ERROR: foreign-data wrapper "alt_fdw1" does not exist +ERROR: foreign-data wrapper "alt_fdw2" already exists ALTER FOREIGN DATA WRAPPER alt_fdw1 RENAME TO alt_fdw3; -- OK -ERROR: foreign-data wrapper "alt_fdw1" does not exist ALTER SERVER alt_fserv1 RENAME TO alt_fserv2; -- failed (name conflict) -ERROR: server "alt_fserv1" does not exist +ERROR: server "alt_fserv2" already exists ALTER SERVER alt_fserv1 RENAME TO alt_fserv3; -- OK -ERROR: server "alt_fserv1" does not exist SELECT fdwname FROM pg_foreign_data_wrapper WHERE fdwname like 'alt_fdw%'; fdwname ---------- -(0 rows) +---------- + alt_fdw2 + alt_fdw3 +(2 rows) SELECT srvname FROM pg_foreign_server WHERE srvname like 'alt_fserv%'; srvname ---------- -(0 rows) +------------ + alt_fserv2 + alt_fserv3 +(2 rows) -- -- Procedural Language @@ -681,9 +675,9 @@ SELECT nspname, prsname --- \set VERBOSITY terse \\ -- suppress cascade details DROP FOREIGN DATA WRAPPER alt_fdw2 CASCADE; -ERROR: foreign-data wrapper "alt_fdw2" does not exist +NOTICE: drop cascades to server alt_fserv2 DROP FOREIGN DATA WRAPPER alt_fdw3 CASCADE; -ERROR: foreign-data wrapper "alt_fdw3" does not exist +NOTICE: drop cascades to server alt_fserv3 DROP LANGUAGE alt_lang2 CASCADE; DROP LANGUAGE alt_lang3 CASCADE; DROP SCHEMA alt_nsp1 CASCADE; diff --git a/src/test/regress/expected/event_trigger.out 
b/src/test/regress/expected/event_trigger.out index 6ff64a59..8c96ae64 100644 --- a/src/test/regress/expected/event_trigger.out +++ b/src/test/regress/expected/event_trigger.out @@ -109,14 +109,8 @@ comment on table event_trigger_fire1 is 'here is a comment'; revoke all on table event_trigger_fire1 from public; drop table event_trigger_fire1; create foreign data wrapper useless; -ERROR: Postgres-XL does not support FOREIGN DATA WRAPPER yet -DETAIL: The feature is not currently supported create server useless_server foreign data wrapper useless; -ERROR: Postgres-XL does not support SERVER yet -DETAIL: The feature is not currently supported create user mapping for regress_evt_user server useless_server; -ERROR: Postgres-XL does not support USER MAPPING yet -DETAIL: The feature is not currently supported alter default privileges for role regress_evt_user revoke delete on tables from regress_evt_user; -- alter owner to non-superuser should fail @@ -139,6 +133,7 @@ ERROR: event trigger "regress_event_trigger" does not exist drop role regress_evt_user; ERROR: role "regress_evt_user" cannot be dropped because some objects depend on it DETAIL: owner of default privileges on new relations belonging to role regress_evt_user +owner of user mapping for regress_evt_user on server useless_server -- cleanup before next test -- these are all OK; the second one should emit a NOTICE drop event trigger if exists regress_event_trigger2; diff --git a/src/test/regress/expected/foreign_data.out b/src/test/regress/expected/foreign_data.out index dec9af7f..26e01f5f 100644 --- a/src/test/regress/expected/foreign_data.out +++ b/src/test/regress/expected/foreign_data.out @@ -14,18 +14,15 @@ CREATE ROLE regress_test_role_super SUPERUSER; CREATE ROLE regress_test_indirect; CREATE ROLE regress_unprivileged_role; CREATE FOREIGN DATA WRAPPER dummy; -ERROR: Postgres-XL does not support FOREIGN DATA WRAPPER yet -DETAIL: The feature is not currently supported COMMENT ON FOREIGN DATA WRAPPER dummy IS 'useless'; -ERROR: foreign-data wrapper "dummy" does not exist CREATE FOREIGN DATA WRAPPER postgresql VALIDATOR postgresql_fdw_validator; -ERROR: Postgres-XL does not support FOREIGN DATA WRAPPER yet -DETAIL: The feature is not currently supported -- At this point we should have 2 built-in wrappers and no servers. 
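The extra DETAIL line added to event_trigger.out above comes from the USER MAPPING that this test can now actually create. A minimal sketch of the cleanup order that new dependency implies, using the test's own object names (the role also still owns default-privilege entries, which is why the plain DROP ROLE keeps failing in the expected output):

    DROP USER MAPPING FOR regress_evt_user SERVER useless_server;
    DROP OWNED BY regress_evt_user;   -- clears the default-privilege entries the role still owns
    DROP ROLE regress_evt_user;
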
SELECT fdwname, fdwhandler::regproc, fdwvalidator::regproc, fdwoptions FROM pg_foreign_data_wrapper ORDER BY 1, 2, 3; fdwname | fdwhandler | fdwvalidator | fdwoptions ----------+------------+--------------+------------ -(0 rows) +------------+------------+--------------------------+------------ + dummy | - | - | + postgresql | - | postgresql_fdw_validator | +(2 rows) SELECT srvname, srvoptions FROM pg_foreign_server; srvname | srvoptions @@ -39,60 +36,58 @@ SELECT * FROM pg_user_mapping; -- CREATE FOREIGN DATA WRAPPER CREATE FOREIGN DATA WRAPPER foo VALIDATOR bar; -- ERROR -ERROR: Postgres-XL does not support FOREIGN DATA WRAPPER yet -DETAIL: The feature is not currently supported +ERROR: function bar(text[], oid) does not exist CREATE FOREIGN DATA WRAPPER foo; -ERROR: Postgres-XL does not support FOREIGN DATA WRAPPER yet -DETAIL: The feature is not currently supported \dew List of foreign-data wrappers Name | Owner | Handler | Validator -------+-------+---------+----------- -(0 rows) +------------+---------------------------+---------+-------------------------- + dummy | regress_foreign_data_user | - | - + foo | regress_foreign_data_user | - | - + postgresql | regress_foreign_data_user | - | postgresql_fdw_validator +(3 rows) CREATE FOREIGN DATA WRAPPER foo; -- duplicate -ERROR: Postgres-XL does not support FOREIGN DATA WRAPPER yet -DETAIL: The feature is not currently supported +ERROR: foreign-data wrapper "foo" already exists DROP FOREIGN DATA WRAPPER foo; -ERROR: foreign-data wrapper "foo" does not exist CREATE FOREIGN DATA WRAPPER foo OPTIONS (testing '1'); -ERROR: Postgres-XL does not support FOREIGN DATA WRAPPER yet -DETAIL: The feature is not currently supported \dew+ List of foreign-data wrappers Name | Owner | Handler | Validator | Access privileges | FDW options | Description -------+-------+---------+-----------+-------------------+-------------+------------- -(0 rows) +------------+---------------------------+---------+--------------------------+-------------------+---------------+------------- + dummy | regress_foreign_data_user | - | - | | | useless + foo | regress_foreign_data_user | - | - | | (testing '1') | + postgresql | regress_foreign_data_user | - | postgresql_fdw_validator | | | +(3 rows) DROP FOREIGN DATA WRAPPER foo; -ERROR: foreign-data wrapper "foo" does not exist CREATE FOREIGN DATA WRAPPER foo OPTIONS (testing '1', testing '2'); -- ERROR -ERROR: Postgres-XL does not support FOREIGN DATA WRAPPER yet -DETAIL: The feature is not currently supported +ERROR: option "testing" provided more than once CREATE FOREIGN DATA WRAPPER foo OPTIONS (testing '1', another '2'); -ERROR: Postgres-XL does not support FOREIGN DATA WRAPPER yet -DETAIL: The feature is not currently supported \dew+ List of foreign-data wrappers Name | Owner | Handler | Validator | Access privileges | FDW options | Description -------+-------+---------+-----------+-------------------+-------------+------------- -(0 rows) +------------+---------------------------+---------+--------------------------+-------------------+----------------------------+------------- + dummy | regress_foreign_data_user | - | - | | | useless + foo | regress_foreign_data_user | - | - | | (testing '1', another '2') | + postgresql | regress_foreign_data_user | - | postgresql_fdw_validator | | | +(3 rows) DROP FOREIGN DATA WRAPPER foo; -ERROR: foreign-data wrapper "foo" does not exist SET ROLE regress_test_role; CREATE FOREIGN DATA WRAPPER foo; -- ERROR -ERROR: Postgres-XL does not support FOREIGN DATA WRAPPER yet -DETAIL: The 
feature is not currently supported +ERROR: permission denied to create foreign-data wrapper "foo" +HINT: Must be superuser to create a foreign-data wrapper. RESET ROLE; CREATE FOREIGN DATA WRAPPER foo VALIDATOR postgresql_fdw_validator; -ERROR: Postgres-XL does not support FOREIGN DATA WRAPPER yet -DETAIL: The feature is not currently supported \dew+ List of foreign-data wrappers Name | Owner | Handler | Validator | Access privileges | FDW options | Description -------+-------+---------+-----------+-------------------+-------------+------------- -(0 rows) +------------+---------------------------+---------+--------------------------+-------------------+-------------+------------- + dummy | regress_foreign_data_user | - | - | | | useless + foo | regress_foreign_data_user | - | postgresql_fdw_validator | | | + postgresql | regress_foreign_data_user | - | postgresql_fdw_validator | | | +(3 rows) -- ALTER FOREIGN DATA WRAPPER ALTER FOREIGN DATA WRAPPER foo; -- ERROR @@ -100,46 +95,53 @@ ERROR: syntax error at or near ";" LINE 1: ALTER FOREIGN DATA WRAPPER foo; ^ ALTER FOREIGN DATA WRAPPER foo VALIDATOR bar; -- ERROR -ERROR: foreign-data wrapper "foo" does not exist +ERROR: function bar(text[], oid) does not exist ALTER FOREIGN DATA WRAPPER foo NO VALIDATOR; -ERROR: foreign-data wrapper "foo" does not exist \dew+ List of foreign-data wrappers Name | Owner | Handler | Validator | Access privileges | FDW options | Description -------+-------+---------+-----------+-------------------+-------------+------------- -(0 rows) +------------+---------------------------+---------+--------------------------+-------------------+-------------+------------- + dummy | regress_foreign_data_user | - | - | | | useless + foo | regress_foreign_data_user | - | - | | | + postgresql | regress_foreign_data_user | - | postgresql_fdw_validator | | | +(3 rows) ALTER FOREIGN DATA WRAPPER foo OPTIONS (a '1', b '2'); -ERROR: foreign-data wrapper "foo" does not exist ALTER FOREIGN DATA WRAPPER foo OPTIONS (SET c '4'); -- ERROR -ERROR: foreign-data wrapper "foo" does not exist +ERROR: option "c" not found ALTER FOREIGN DATA WRAPPER foo OPTIONS (DROP c); -- ERROR -ERROR: foreign-data wrapper "foo" does not exist +ERROR: option "c" not found ALTER FOREIGN DATA WRAPPER foo OPTIONS (ADD x '1', DROP x); -ERROR: foreign-data wrapper "foo" does not exist \dew+ List of foreign-data wrappers Name | Owner | Handler | Validator | Access privileges | FDW options | Description -------+-------+---------+-----------+-------------------+-------------+------------- -(0 rows) +------------+---------------------------+---------+--------------------------+-------------------+----------------+------------- + dummy | regress_foreign_data_user | - | - | | | useless + foo | regress_foreign_data_user | - | - | | (a '1', b '2') | + postgresql | regress_foreign_data_user | - | postgresql_fdw_validator | | | +(3 rows) ALTER FOREIGN DATA WRAPPER foo OPTIONS (DROP a, SET b '3', ADD c '4'); -ERROR: foreign-data wrapper "foo" does not exist \dew+ List of foreign-data wrappers Name | Owner | Handler | Validator | Access privileges | FDW options | Description -------+-------+---------+-----------+-------------------+-------------+------------- -(0 rows) +------------+---------------------------+---------+--------------------------+-------------------+----------------+------------- + dummy | regress_foreign_data_user | - | - | | | useless + foo | regress_foreign_data_user | - | - | | (b '3', c '4') | + postgresql | regress_foreign_data_user | - | 
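The ownership error just above follows from a single rule: a foreign-data wrapper must be owned by a superuser, and only a superuser may reassign it. A minimal sketch of that rule, where the role name fdw_admin is illustrative and not part of the test:

    CREATE ROLE fdw_admin SUPERUSER;
    ALTER FOREIGN DATA WRAPPER foo OWNER TO fdw_admin;         -- accepted: new owner is a superuser
    ALTER FOREIGN DATA WRAPPER foo OWNER TO regress_test_role; -- rejected: new owner is not
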
postgresql_fdw_validator | | | +(3 rows) ALTER FOREIGN DATA WRAPPER foo OPTIONS (a '2'); -ERROR: foreign-data wrapper "foo" does not exist ALTER FOREIGN DATA WRAPPER foo OPTIONS (b '4'); -- ERROR -ERROR: foreign-data wrapper "foo" does not exist +ERROR: option "b" provided more than once \dew+ List of foreign-data wrappers Name | Owner | Handler | Validator | Access privileges | FDW options | Description -------+-------+---------+-----------+-------------------+-------------+------------- -(0 rows) +------------+---------------------------+---------+--------------------------+-------------------+-----------------------+------------- + dummy | regress_foreign_data_user | - | - | | | useless + foo | regress_foreign_data_user | - | - | | (b '3', c '4', a '2') | + postgresql | regress_foreign_data_user | - | postgresql_fdw_validator | | | +(3 rows) SET ROLE regress_test_role; ALTER FOREIGN DATA WRAPPER foo OPTIONS (ADD d '5'); -- ERROR @@ -147,17 +149,19 @@ ERROR: permission denied to alter foreign-data wrapper "foo" HINT: Must be superuser to alter a foreign-data wrapper. SET ROLE regress_test_role_super; ALTER FOREIGN DATA WRAPPER foo OPTIONS (ADD d '5'); -ERROR: foreign-data wrapper "foo" does not exist \dew+ List of foreign-data wrappers Name | Owner | Handler | Validator | Access privileges | FDW options | Description -------+-------+---------+-----------+-------------------+-------------+------------- -(0 rows) +------------+---------------------------+---------+--------------------------+-------------------+------------------------------+------------- + dummy | regress_foreign_data_user | - | - | | | useless + foo | regress_foreign_data_user | - | - | | (b '3', c '4', a '2', d '5') | + postgresql | regress_foreign_data_user | - | postgresql_fdw_validator | | | +(3 rows) ALTER FOREIGN DATA WRAPPER foo OWNER TO regress_test_role; -- ERROR -ERROR: foreign-data wrapper "foo" does not exist +ERROR: permission denied to change owner of foreign-data wrapper "foo" +HINT: The owner of a foreign-data wrapper must be a superuser. 
ALTER FOREIGN DATA WRAPPER foo OWNER TO regress_test_role_super; -ERROR: foreign-data wrapper "foo" does not exist ALTER ROLE regress_test_role_super NOSUPERUSER; SET ROLE regress_test_role_super; ALTER FOREIGN DATA WRAPPER foo OPTIONS (ADD e '6'); -- ERROR @@ -167,19 +171,23 @@ RESET ROLE; \dew+ List of foreign-data wrappers Name | Owner | Handler | Validator | Access privileges | FDW options | Description -------+-------+---------+-----------+-------------------+-------------+------------- -(0 rows) +------------+---------------------------+---------+--------------------------+-------------------+------------------------------+------------- + dummy | regress_foreign_data_user | - | - | | | useless + foo | regress_test_role_super | - | - | | (b '3', c '4', a '2', d '5') | + postgresql | regress_foreign_data_user | - | postgresql_fdw_validator | | | +(3 rows) ALTER FOREIGN DATA WRAPPER foo RENAME TO foo1; -ERROR: foreign-data wrapper "foo" does not exist \dew+ List of foreign-data wrappers Name | Owner | Handler | Validator | Access privileges | FDW options | Description -------+-------+---------+-----------+-------------------+-------------+------------- -(0 rows) +------------+---------------------------+---------+--------------------------+-------------------+------------------------------+------------- + dummy | regress_foreign_data_user | - | - | | | useless + foo1 | regress_test_role_super | - | - | | (b '3', c '4', a '2', d '5') | + postgresql | regress_foreign_data_user | - | postgresql_fdw_validator | | | +(3 rows) ALTER FOREIGN DATA WRAPPER foo1 RENAME TO foo; -ERROR: foreign-data wrapper "foo1" does not exist -- DROP FOREIGN DATA WRAPPER DROP FOREIGN DATA WRAPPER nonexistent; -- ERROR ERROR: foreign-data wrapper "nonexistent" does not exist @@ -188,71 +196,78 @@ NOTICE: foreign-data wrapper "nonexistent" does not exist, skipping \dew+ List of foreign-data wrappers Name | Owner | Handler | Validator | Access privileges | FDW options | Description -------+-------+---------+-----------+-------------------+-------------+------------- -(0 rows) +------------+---------------------------+---------+--------------------------+-------------------+------------------------------+------------- + dummy | regress_foreign_data_user | - | - | | | useless + foo | regress_test_role_super | - | - | | (b '3', c '4', a '2', d '5') | + postgresql | regress_foreign_data_user | - | postgresql_fdw_validator | | | +(3 rows) DROP ROLE regress_test_role_super; -- ERROR +ERROR: role "regress_test_role_super" cannot be dropped because some objects depend on it +DETAIL: owner of foreign-data wrapper foo SET ROLE regress_test_role_super; -ERROR: role "regress_test_role_super" does not exist DROP FOREIGN DATA WRAPPER foo; -ERROR: foreign-data wrapper "foo" does not exist RESET ROLE; DROP ROLE regress_test_role_super; -ERROR: role "regress_test_role_super" does not exist \dew+ List of foreign-data wrappers Name | Owner | Handler | Validator | Access privileges | FDW options | Description -------+-------+---------+-----------+-------------------+-------------+------------- -(0 rows) +------------+---------------------------+---------+--------------------------+-------------------+-------------+------------- + dummy | regress_foreign_data_user | - | - | | | useless + postgresql | regress_foreign_data_user | - | postgresql_fdw_validator | | | +(2 rows) CREATE FOREIGN DATA WRAPPER foo; -ERROR: Postgres-XL does not support FOREIGN DATA WRAPPER yet -DETAIL: The feature is not currently supported CREATE SERVER s1 
FOREIGN DATA WRAPPER foo; -ERROR: Postgres-XL does not support SERVER yet -DETAIL: The feature is not currently supported COMMENT ON SERVER s1 IS 'foreign server'; -ERROR: server "s1" does not exist CREATE USER MAPPING FOR current_user SERVER s1; -ERROR: Postgres-XL does not support USER MAPPING yet -DETAIL: The feature is not currently supported CREATE USER MAPPING FOR current_user SERVER s1; -- ERROR -ERROR: Postgres-XL does not support USER MAPPING yet -DETAIL: The feature is not currently supported +ERROR: user mapping for "regress_foreign_data_user" already exists for server s1 CREATE USER MAPPING IF NOT EXISTS FOR current_user SERVER s1; -- NOTICE -ERROR: Postgres-XL does not support USER MAPPING yet -DETAIL: The feature is not currently supported +NOTICE: user mapping for "regress_foreign_data_user" already exists for server s1, skipping \dew+ List of foreign-data wrappers Name | Owner | Handler | Validator | Access privileges | FDW options | Description -------+-------+---------+-----------+-------------------+-------------+------------- -(0 rows) +------------+---------------------------+---------+--------------------------+-------------------+-------------+------------- + dummy | regress_foreign_data_user | - | - | | | useless + foo | regress_foreign_data_user | - | - | | | + postgresql | regress_foreign_data_user | - | postgresql_fdw_validator | | | +(3 rows) \des+ List of foreign servers Name | Owner | Foreign-data wrapper | Access privileges | Type | Version | FDW options | Description -------+-------+----------------------+-------------------+------+---------+-------------+------------- -(0 rows) +------+---------------------------+----------------------+-------------------+------+---------+-------------+---------------- + s1 | regress_foreign_data_user | foo | | | | | foreign server +(1 row) \deu+ List of user mappings Server | User name | FDW options ---------+-----------+------------- -(0 rows) +--------+---------------------------+------------- + s1 | regress_foreign_data_user | +(1 row) DROP FOREIGN DATA WRAPPER foo; -- ERROR -ERROR: foreign-data wrapper "foo" does not exist +ERROR: cannot drop foreign-data wrapper foo because other objects depend on it +DETAIL: server s1 depends on foreign-data wrapper foo +user mapping for regress_foreign_data_user on server s1 depends on server s1 +HINT: Use DROP ... CASCADE to drop the dependent objects too. 
SET ROLE regress_test_role; DROP FOREIGN DATA WRAPPER foo CASCADE; -- ERROR -ERROR: foreign-data wrapper "foo" does not exist +ERROR: must be owner of foreign-data wrapper foo RESET ROLE; DROP FOREIGN DATA WRAPPER foo CASCADE; -ERROR: foreign-data wrapper "foo" does not exist +NOTICE: drop cascades to 2 other objects +DETAIL: drop cascades to server s1 +drop cascades to user mapping for regress_foreign_data_user on server s1 \dew+ List of foreign-data wrappers Name | Owner | Handler | Validator | Access privileges | FDW options | Description -------+-------+---------+-----------+-------------------+-------------+------------- -(0 rows) +------------+---------------------------+---------+--------------------------+-------------------+-------------+------------- + dummy | regress_foreign_data_user | - | - | | | useless + postgresql | regress_foreign_data_user | - | postgresql_fdw_validator | | | +(2 rows) \des+ List of foreign servers @@ -268,87 +283,84 @@ ERROR: foreign-data wrapper "foo" does not exist -- exercise CREATE SERVER CREATE SERVER s1 FOREIGN DATA WRAPPER foo; -- ERROR -ERROR: Postgres-XL does not support SERVER yet -DETAIL: The feature is not currently supported +ERROR: foreign-data wrapper "foo" does not exist CREATE FOREIGN DATA WRAPPER foo OPTIONS ("test wrapper" 'true'); -ERROR: Postgres-XL does not support FOREIGN DATA WRAPPER yet -DETAIL: The feature is not currently supported CREATE SERVER s1 FOREIGN DATA WRAPPER foo; -ERROR: Postgres-XL does not support SERVER yet -DETAIL: The feature is not currently supported CREATE SERVER s1 FOREIGN DATA WRAPPER foo; -- ERROR -ERROR: Postgres-XL does not support SERVER yet -DETAIL: The feature is not currently supported +ERROR: server "s1" already exists CREATE SERVER IF NOT EXISTS s1 FOREIGN DATA WRAPPER foo; -- No ERROR, just NOTICE -ERROR: Postgres-XL does not support SERVER yet -DETAIL: The feature is not currently supported +NOTICE: server "s1" already exists, skipping CREATE SERVER s2 FOREIGN DATA WRAPPER foo OPTIONS (host 'a', dbname 'b'); -ERROR: Postgres-XL does not support SERVER yet -DETAIL: The feature is not currently supported CREATE SERVER s3 TYPE 'oracle' FOREIGN DATA WRAPPER foo; -ERROR: Postgres-XL does not support SERVER yet -DETAIL: The feature is not currently supported CREATE SERVER s4 TYPE 'oracle' FOREIGN DATA WRAPPER foo OPTIONS (host 'a', dbname 'b'); -ERROR: Postgres-XL does not support SERVER yet -DETAIL: The feature is not currently supported CREATE SERVER s5 VERSION '15.0' FOREIGN DATA WRAPPER foo; -ERROR: Postgres-XL does not support SERVER yet -DETAIL: The feature is not currently supported CREATE SERVER s6 VERSION '16.0' FOREIGN DATA WRAPPER foo OPTIONS (host 'a', dbname 'b'); -ERROR: Postgres-XL does not support SERVER yet -DETAIL: The feature is not currently supported CREATE SERVER s7 TYPE 'oracle' VERSION '17.0' FOREIGN DATA WRAPPER foo OPTIONS (host 'a', dbname 'b'); -ERROR: Postgres-XL does not support SERVER yet -DETAIL: The feature is not currently supported CREATE SERVER s8 FOREIGN DATA WRAPPER postgresql OPTIONS (foo '1'); -- ERROR -ERROR: Postgres-XL does not support SERVER yet -DETAIL: The feature is not currently supported +ERROR: invalid option "foo" +HINT: Valid options in this context are: authtype, service, connect_timeout, dbname, host, hostaddr, port, tty, options, requiressl, sslmode, gsslib CREATE SERVER s8 FOREIGN DATA WRAPPER postgresql OPTIONS (host 'localhost', dbname 's8db'); -ERROR: Postgres-XL does not support SERVER yet -DETAIL: The feature is not currently supported \des+ 
List of foreign servers Name | Owner | Foreign-data wrapper | Access privileges | Type | Version | FDW options | Description -------+-------+----------------------+-------------------+------+---------+-------------+------------- -(0 rows) +------+---------------------------+----------------------+-------------------+--------+---------+-----------------------------------+------------- + s1 | regress_foreign_data_user | foo | | | | | + s2 | regress_foreign_data_user | foo | | | | (host 'a', dbname 'b') | + s3 | regress_foreign_data_user | foo | | oracle | | | + s4 | regress_foreign_data_user | foo | | oracle | | (host 'a', dbname 'b') | + s5 | regress_foreign_data_user | foo | | | 15.0 | | + s6 | regress_foreign_data_user | foo | | | 16.0 | (host 'a', dbname 'b') | + s7 | regress_foreign_data_user | foo | | oracle | 17.0 | (host 'a', dbname 'b') | + s8 | regress_foreign_data_user | postgresql | | | | (host 'localhost', dbname 's8db') | +(8 rows) SET ROLE regress_test_role; CREATE SERVER t1 FOREIGN DATA WRAPPER foo; -- ERROR: no usage on FDW -ERROR: Postgres-XL does not support SERVER yet -DETAIL: The feature is not currently supported +ERROR: permission denied for foreign-data wrapper foo RESET ROLE; GRANT USAGE ON FOREIGN DATA WRAPPER foo TO regress_test_role; -ERROR: foreign-data wrapper "foo" does not exist SET ROLE regress_test_role; CREATE SERVER t1 FOREIGN DATA WRAPPER foo; -ERROR: Postgres-XL does not support SERVER yet -DETAIL: The feature is not currently supported RESET ROLE; \des+ List of foreign servers Name | Owner | Foreign-data wrapper | Access privileges | Type | Version | FDW options | Description -------+-------+----------------------+-------------------+------+---------+-------------+------------- -(0 rows) +------+---------------------------+----------------------+-------------------+--------+---------+-----------------------------------+------------- + s1 | regress_foreign_data_user | foo | | | | | + s2 | regress_foreign_data_user | foo | | | | (host 'a', dbname 'b') | + s3 | regress_foreign_data_user | foo | | oracle | | | + s4 | regress_foreign_data_user | foo | | oracle | | (host 'a', dbname 'b') | + s5 | regress_foreign_data_user | foo | | | 15.0 | | + s6 | regress_foreign_data_user | foo | | | 16.0 | (host 'a', dbname 'b') | + s7 | regress_foreign_data_user | foo | | oracle | 17.0 | (host 'a', dbname 'b') | + s8 | regress_foreign_data_user | postgresql | | | | (host 'localhost', dbname 's8db') | + t1 | regress_test_role | foo | | | | | +(9 rows) REVOKE USAGE ON FOREIGN DATA WRAPPER foo FROM regress_test_role; -ERROR: foreign-data wrapper "foo" does not exist GRANT USAGE ON FOREIGN DATA WRAPPER foo TO regress_test_indirect; -ERROR: foreign-data wrapper "foo" does not exist SET ROLE regress_test_role; CREATE SERVER t2 FOREIGN DATA WRAPPER foo; -- ERROR -ERROR: Postgres-XL does not support SERVER yet -DETAIL: The feature is not currently supported +ERROR: permission denied for foreign-data wrapper foo RESET ROLE; GRANT regress_test_indirect TO regress_test_role; SET ROLE regress_test_role; CREATE SERVER t2 FOREIGN DATA WRAPPER foo; -ERROR: Postgres-XL does not support SERVER yet -DETAIL: The feature is not currently supported \des+ List of foreign servers Name | Owner | Foreign-data wrapper | Access privileges | Type | Version | FDW options | Description -------+-------+----------------------+-------------------+------+---------+-------------+------------- -(0 rows) 
+------+---------------------------+----------------------+-------------------+--------+---------+-----------------------------------+------------- + s1 | regress_foreign_data_user | foo | | | | | + s2 | regress_foreign_data_user | foo | | | | (host 'a', dbname 'b') | + s3 | regress_foreign_data_user | foo | | oracle | | | + s4 | regress_foreign_data_user | foo | | oracle | | (host 'a', dbname 'b') | + s5 | regress_foreign_data_user | foo | | | 15.0 | | + s6 | regress_foreign_data_user | foo | | | 16.0 | (host 'a', dbname 'b') | + s7 | regress_foreign_data_user | foo | | oracle | 17.0 | (host 'a', dbname 'b') | + s8 | regress_foreign_data_user | postgresql | | | | (host 'localhost', dbname 's8db') | + t1 | regress_test_role | foo | | | | | + t2 | regress_test_role | foo | | | | | +(10 rows) RESET ROLE; REVOKE regress_test_indirect FROM regress_test_role; @@ -360,72 +372,97 @@ LINE 1: ALTER SERVER s0; ALTER SERVER s0 OPTIONS (a '1'); -- ERROR ERROR: server "s0" does not exist ALTER SERVER s1 VERSION '1.0' OPTIONS (servername 's1'); -ERROR: server "s1" does not exist ALTER SERVER s2 VERSION '1.1'; -ERROR: server "s2" does not exist ALTER SERVER s3 OPTIONS ("tns name" 'orcl', port '1521'); -ERROR: server "s3" does not exist GRANT USAGE ON FOREIGN SERVER s1 TO regress_test_role; -ERROR: server "s1" does not exist GRANT USAGE ON FOREIGN SERVER s6 TO regress_test_role2 WITH GRANT OPTION; -ERROR: server "s6" does not exist \des+ List of foreign servers Name | Owner | Foreign-data wrapper | Access privileges | Type | Version | FDW options | Description -------+-------+----------------------+-------------------+------+---------+-------------+------------- -(0 rows) +------+---------------------------+----------------------+-------------------------------------------------------+--------+---------+-----------------------------------+------------- + s1 | regress_foreign_data_user | foo | regress_foreign_data_user=U/regress_foreign_data_user+| | 1.0 | (servername 's1') | + | | | regress_test_role=U/regress_foreign_data_user | | | | + s2 | regress_foreign_data_user | foo | | | 1.1 | (host 'a', dbname 'b') | + s3 | regress_foreign_data_user | foo | | oracle | | ("tns name" 'orcl', port '1521') | + s4 | regress_foreign_data_user | foo | | oracle | | (host 'a', dbname 'b') | + s5 | regress_foreign_data_user | foo | | | 15.0 | | + s6 | regress_foreign_data_user | foo | regress_foreign_data_user=U/regress_foreign_data_user+| | 16.0 | (host 'a', dbname 'b') | + | | | regress_test_role2=U*/regress_foreign_data_user | | | | + s7 | regress_foreign_data_user | foo | | oracle | 17.0 | (host 'a', dbname 'b') | + s8 | regress_foreign_data_user | postgresql | | | | (host 'localhost', dbname 's8db') | + t1 | regress_test_role | foo | | | | | + t2 | regress_test_role | foo | | | | | +(10 rows) SET ROLE regress_test_role; ALTER SERVER s1 VERSION '1.1'; -- ERROR -ERROR: server "s1" does not exist +ERROR: must be owner of foreign server s1 ALTER SERVER s1 OWNER TO regress_test_role; -- ERROR -ERROR: server "s1" does not exist +ERROR: must be owner of foreign server s1 RESET ROLE; ALTER SERVER s1 OWNER TO regress_test_role; -ERROR: server "s1" does not exist GRANT regress_test_role2 TO regress_test_role; SET ROLE regress_test_role; ALTER SERVER s1 VERSION '1.1'; -ERROR: server "s1" does not exist ALTER SERVER s1 OWNER TO regress_test_role2; -- ERROR -ERROR: server "s1" does not exist +ERROR: permission denied for foreign-data wrapper foo RESET ROLE; ALTER SERVER s8 OPTIONS (foo '1'); -- ERROR option validation -ERROR: 
server "s8" does not exist +ERROR: invalid option "foo" +HINT: Valid options in this context are: authtype, service, connect_timeout, dbname, host, hostaddr, port, tty, options, requiressl, sslmode, gsslib ALTER SERVER s8 OPTIONS (connect_timeout '30', SET dbname 'db1', DROP host); -ERROR: server "s8" does not exist SET ROLE regress_test_role; ALTER SERVER s1 OWNER TO regress_test_indirect; -- ERROR -ERROR: server "s1" does not exist +ERROR: must be member of role "regress_test_indirect" RESET ROLE; GRANT regress_test_indirect TO regress_test_role; SET ROLE regress_test_role; ALTER SERVER s1 OWNER TO regress_test_indirect; -ERROR: server "s1" does not exist RESET ROLE; GRANT USAGE ON FOREIGN DATA WRAPPER foo TO regress_test_indirect; -ERROR: foreign-data wrapper "foo" does not exist SET ROLE regress_test_role; ALTER SERVER s1 OWNER TO regress_test_indirect; -ERROR: server "s1" does not exist RESET ROLE; DROP ROLE regress_test_indirect; -- ERROR +ERROR: role "regress_test_indirect" cannot be dropped because some objects depend on it +DETAIL: owner of server s1 +privileges for foreign-data wrapper foo \des+ List of foreign servers Name | Owner | Foreign-data wrapper | Access privileges | Type | Version | FDW options | Description -------+-------+----------------------+-------------------+------+---------+-------------+------------- -(0 rows) +------+---------------------------+----------------------+-------------------------------------------------------+--------+---------+--------------------------------------+------------- + s1 | regress_test_indirect | foo | regress_test_indirect=U/regress_test_indirect | | 1.1 | (servername 's1') | + s2 | regress_foreign_data_user | foo | | | 1.1 | (host 'a', dbname 'b') | + s3 | regress_foreign_data_user | foo | | oracle | | ("tns name" 'orcl', port '1521') | + s4 | regress_foreign_data_user | foo | | oracle | | (host 'a', dbname 'b') | + s5 | regress_foreign_data_user | foo | | | 15.0 | | + s6 | regress_foreign_data_user | foo | regress_foreign_data_user=U/regress_foreign_data_user+| | 16.0 | (host 'a', dbname 'b') | + | | | regress_test_role2=U*/regress_foreign_data_user | | | | + s7 | regress_foreign_data_user | foo | | oracle | 17.0 | (host 'a', dbname 'b') | + s8 | regress_foreign_data_user | postgresql | | | | (dbname 'db1', connect_timeout '30') | + t1 | regress_test_role | foo | | | | | + t2 | regress_test_role | foo | | | | | +(10 rows) ALTER SERVER s8 RENAME to s8new; -ERROR: server "s8" does not exist \des+ List of foreign servers Name | Owner | Foreign-data wrapper | Access privileges | Type | Version | FDW options | Description -------+-------+----------------------+-------------------+------+---------+-------------+------------- -(0 rows) +-------+---------------------------+----------------------+-------------------------------------------------------+--------+---------+--------------------------------------+------------- + s1 | regress_test_indirect | foo | regress_test_indirect=U/regress_test_indirect | | 1.1 | (servername 's1') | + s2 | regress_foreign_data_user | foo | | | 1.1 | (host 'a', dbname 'b') | + s3 | regress_foreign_data_user | foo | | oracle | | ("tns name" 'orcl', port '1521') | + s4 | regress_foreign_data_user | foo | | oracle | | (host 'a', dbname 'b') | + s5 | regress_foreign_data_user | foo | | | 15.0 | | + s6 | regress_foreign_data_user | foo | regress_foreign_data_user=U/regress_foreign_data_user+| | 16.0 | (host 'a', dbname 'b') | + | | | regress_test_role2=U*/regress_foreign_data_user | | | | + s7 | 
regress_foreign_data_user | foo | | oracle | 17.0 | (host 'a', dbname 'b') | + s8new | regress_foreign_data_user | postgresql | | | | (dbname 'db1', connect_timeout '30') | + t1 | regress_test_role | foo | | | | | + t2 | regress_test_role | foo | | | | | +(10 rows) ALTER SERVER s8new RENAME to s8; -ERROR: server "s8new" does not exist -- DROP SERVER DROP SERVER nonexistent; -- ERROR ERROR: server "nonexistent" does not exist @@ -434,51 +471,83 @@ NOTICE: server "nonexistent" does not exist, skipping \des List of foreign servers Name | Owner | Foreign-data wrapper -------+-------+---------------------- -(0 rows) +------+---------------------------+---------------------- + s1 | regress_test_indirect | foo + s2 | regress_foreign_data_user | foo + s3 | regress_foreign_data_user | foo + s4 | regress_foreign_data_user | foo + s5 | regress_foreign_data_user | foo + s6 | regress_foreign_data_user | foo + s7 | regress_foreign_data_user | foo + s8 | regress_foreign_data_user | postgresql + t1 | regress_test_role | foo + t2 | regress_test_role | foo +(10 rows) SET ROLE regress_test_role; DROP SERVER s2; -- ERROR -ERROR: server "s2" does not exist +ERROR: must be owner of foreign server s2 DROP SERVER s1; -ERROR: server "s1" does not exist RESET ROLE; \des List of foreign servers Name | Owner | Foreign-data wrapper -------+-------+---------------------- -(0 rows) +------+---------------------------+---------------------- + s2 | regress_foreign_data_user | foo + s3 | regress_foreign_data_user | foo + s4 | regress_foreign_data_user | foo + s5 | regress_foreign_data_user | foo + s6 | regress_foreign_data_user | foo + s7 | regress_foreign_data_user | foo + s8 | regress_foreign_data_user | postgresql + t1 | regress_test_role | foo + t2 | regress_test_role | foo +(9 rows) ALTER SERVER s2 OWNER TO regress_test_role; -ERROR: server "s2" does not exist SET ROLE regress_test_role; DROP SERVER s2; -ERROR: server "s2" does not exist RESET ROLE; \des List of foreign servers Name | Owner | Foreign-data wrapper -------+-------+---------------------- -(0 rows) +------+---------------------------+---------------------- + s3 | regress_foreign_data_user | foo + s4 | regress_foreign_data_user | foo + s5 | regress_foreign_data_user | foo + s6 | regress_foreign_data_user | foo + s7 | regress_foreign_data_user | foo + s8 | regress_foreign_data_user | postgresql + t1 | regress_test_role | foo + t2 | regress_test_role | foo +(8 rows) CREATE USER MAPPING FOR current_user SERVER s3; -ERROR: Postgres-XL does not support USER MAPPING yet -DETAIL: The feature is not currently supported \deu List of user mappings Server | User name ---------+----------- -(0 rows) +--------+--------------------------- + s3 | regress_foreign_data_user +(1 row) DROP SERVER s3; -- ERROR -ERROR: server "s3" does not exist +ERROR: cannot drop server s3 because other objects depend on it +DETAIL: user mapping for regress_foreign_data_user on server s3 depends on server s3 +HINT: Use DROP ... CASCADE to drop the dependent objects too. 
DROP SERVER s3 CASCADE; -ERROR: server "s3" does not exist +NOTICE: drop cascades to user mapping for regress_foreign_data_user on server s3 \des List of foreign servers Name | Owner | Foreign-data wrapper -------+-------+---------------------- -(0 rows) +------+---------------------------+---------------------- + s4 | regress_foreign_data_user | foo + s5 | regress_foreign_data_user | foo + s6 | regress_foreign_data_user | foo + s7 | regress_foreign_data_user | foo + s8 | regress_foreign_data_user | postgresql + t1 | regress_test_role | foo + t2 | regress_test_role | foo +(7 rows) \deu List of user mappings @@ -488,59 +557,44 @@ List of user mappings -- CREATE USER MAPPING CREATE USER MAPPING FOR regress_test_missing_role SERVER s1; -- ERROR -ERROR: Postgres-XL does not support USER MAPPING yet -DETAIL: The feature is not currently supported +ERROR: role "regress_test_missing_role" does not exist CREATE USER MAPPING FOR current_user SERVER s1; -- ERROR -ERROR: Postgres-XL does not support USER MAPPING yet -DETAIL: The feature is not currently supported +ERROR: server "s1" does not exist CREATE USER MAPPING FOR current_user SERVER s4; -ERROR: Postgres-XL does not support USER MAPPING yet -DETAIL: The feature is not currently supported CREATE USER MAPPING FOR user SERVER s4; -- ERROR duplicate -ERROR: Postgres-XL does not support USER MAPPING yet -DETAIL: The feature is not currently supported +ERROR: user mapping for "regress_foreign_data_user" already exists for server s4 CREATE USER MAPPING FOR public SERVER s4 OPTIONS ("this mapping" 'is public'); -ERROR: Postgres-XL does not support USER MAPPING yet -DETAIL: The feature is not currently supported CREATE USER MAPPING FOR user SERVER s8 OPTIONS (username 'test', password 'secret'); -- ERROR -ERROR: Postgres-XL does not support USER MAPPING yet -DETAIL: The feature is not currently supported +ERROR: invalid option "username" +HINT: Valid options in this context are: user, password CREATE USER MAPPING FOR user SERVER s8 OPTIONS (user 'test', password 'secret'); -ERROR: Postgres-XL does not support USER MAPPING yet -DETAIL: The feature is not currently supported ALTER SERVER s5 OWNER TO regress_test_role; -ERROR: server "s5" does not exist ALTER SERVER s6 OWNER TO regress_test_indirect; -ERROR: role "regress_test_indirect" does not exist SET ROLE regress_test_role; CREATE USER MAPPING FOR current_user SERVER s5; -ERROR: Postgres-XL does not support USER MAPPING yet -DETAIL: The feature is not currently supported CREATE USER MAPPING FOR current_user SERVER s6 OPTIONS (username 'test'); -ERROR: Postgres-XL does not support USER MAPPING yet -DETAIL: The feature is not currently supported CREATE USER MAPPING FOR current_user SERVER s7; -- ERROR -ERROR: Postgres-XL does not support USER MAPPING yet -DETAIL: The feature is not currently supported +ERROR: permission denied for foreign server s7 CREATE USER MAPPING FOR public SERVER s8; -- ERROR -ERROR: Postgres-XL does not support USER MAPPING yet -DETAIL: The feature is not currently supported +ERROR: must be owner of foreign server s8 RESET ROLE; ALTER SERVER t1 OWNER TO regress_test_indirect; -ERROR: role "regress_test_indirect" does not exist SET ROLE regress_test_role; CREATE USER MAPPING FOR current_user SERVER t1 OPTIONS (username 'bob', password 'boo'); -ERROR: Postgres-XL does not support USER MAPPING yet -DETAIL: The feature is not currently supported CREATE USER MAPPING FOR public SERVER t1; -ERROR: Postgres-XL does not support USER MAPPING yet -DETAIL: The feature is not currently 
supported RESET ROLE; \deu List of user mappings Server | User name ---------+----------- -(0 rows) +--------+--------------------------- + s4 | public + s4 | regress_foreign_data_user + s5 | regress_test_role + s6 | regress_test_role + s8 | regress_foreign_data_user + t1 | public + t1 | regress_test_role +(7 rows) -- ALTER USER MAPPING ALTER USER MAPPING FOR regress_test_missing_role SERVER s4 OPTIONS (gotcha 'true'); -- ERROR @@ -548,24 +602,29 @@ ERROR: role "regress_test_missing_role" does not exist ALTER USER MAPPING FOR user SERVER ss4 OPTIONS (gotcha 'true'); -- ERROR ERROR: server "ss4" does not exist ALTER USER MAPPING FOR public SERVER s5 OPTIONS (gotcha 'true'); -- ERROR -ERROR: server "s5" does not exist +ERROR: user mapping for "public" does not exist for the server ALTER USER MAPPING FOR current_user SERVER s8 OPTIONS (username 'test'); -- ERROR -ERROR: server "s8" does not exist +ERROR: invalid option "username" +HINT: Valid options in this context are: user, password ALTER USER MAPPING FOR current_user SERVER s8 OPTIONS (DROP user, SET password 'public'); -ERROR: server "s8" does not exist SET ROLE regress_test_role; ALTER USER MAPPING FOR current_user SERVER s5 OPTIONS (ADD modified '1'); -ERROR: server "s5" does not exist ALTER USER MAPPING FOR public SERVER s4 OPTIONS (ADD modified '1'); -- ERROR -ERROR: server "s4" does not exist +ERROR: must be owner of foreign server s4 ALTER USER MAPPING FOR public SERVER t1 OPTIONS (ADD modified '1'); -ERROR: server "t1" does not exist RESET ROLE; \deu+ List of user mappings Server | User name | FDW options ---------+-----------+------------- -(0 rows) +--------+---------------------------+---------------------------------- + s4 | public | ("this mapping" 'is public') + s4 | regress_foreign_data_user | + s5 | regress_test_role | (modified '1') + s6 | regress_test_role | (username 'test') + s8 | regress_foreign_data_user | (password 'public') + t1 | public | (modified '1') + t1 | regress_test_role | (username 'bob', password 'boo') +(7 rows) -- DROP USER MAPPING DROP USER MAPPING FOR regress_test_missing_role SERVER s4; -- ERROR @@ -573,33 +632,36 @@ ERROR: role "regress_test_missing_role" does not exist DROP USER MAPPING FOR user SERVER ss4; ERROR: server "ss4" does not exist DROP USER MAPPING FOR public SERVER s7; -- ERROR -ERROR: server "s7" does not exist +ERROR: user mapping for "public" does not exist for the server DROP USER MAPPING IF EXISTS FOR regress_test_missing_role SERVER s4; NOTICE: role "regress_test_missing_role" does not exist, skipping DROP USER MAPPING IF EXISTS FOR user SERVER ss4; NOTICE: server does not exist, skipping DROP USER MAPPING IF EXISTS FOR public SERVER s7; -NOTICE: server does not exist, skipping +NOTICE: user mapping for "public" does not exist for the server, skipping CREATE USER MAPPING FOR public SERVER s8; -ERROR: Postgres-XL does not support USER MAPPING yet -DETAIL: The feature is not currently supported SET ROLE regress_test_role; DROP USER MAPPING FOR public SERVER s8; -- ERROR -ERROR: server "s8" does not exist +ERROR: must be owner of foreign server s8 RESET ROLE; DROP SERVER s7; -ERROR: server "s7" does not exist \deu List of user mappings Server | User name ---------+----------- -(0 rows) +--------+--------------------------- + s4 | public + s4 | regress_foreign_data_user + s5 | regress_test_role + s6 | regress_test_role + s8 | public + s8 | regress_foreign_data_user + t1 | public + t1 | regress_test_role +(8 rows) -- CREATE FOREIGN TABLE CREATE SCHEMA foreign_schema; CREATE SERVER s0 
FOREIGN DATA WRAPPER dummy; -ERROR: Postgres-XL does not support SERVER yet -DETAIL: The feature is not currently supported CREATE FOREIGN TABLE ft1 (); -- ERROR ERROR: syntax error at or near ";" LINE 1: CREATE FOREIGN TABLE ft1 (); @@ -643,123 +705,132 @@ CREATE FOREIGN TABLE ft1 ( c3 date, CHECK (c3 BETWEEN '1994-01-01'::date AND '1994-01-31'::date) ) SERVER s0 OPTIONS (delimiter ',', quote '"', "be quoted" 'value'); -ERROR: server "s0" does not exist COMMENT ON FOREIGN TABLE ft1 IS 'ft1'; -ERROR: relation "ft1" does not exist COMMENT ON COLUMN ft1.c1 IS 'ft1.c1'; -ERROR: relation "ft1" does not exist \d+ ft1 + Foreign table "public.ft1" + Column | Type | Collation | Nullable | Default | FDW options | Storage | Stats target | Description +--------+---------+-----------+----------+---------+--------------------------------+----------+--------------+------------- + c1 | integer | | not null | | ("param 1" 'val1') | plain | | ft1.c1 + c2 | text | | | | (param2 'val2', param3 'val3') | extended | | + c3 | date | | | | | plain | | +Check constraints: + "ft1_c2_check" CHECK (c2 <> ''::text) + "ft1_c3_check" CHECK (c3 >= '01-01-1994'::date AND c3 <= '01-31-1994'::date) +Server: s0 +FDW options: (delimiter ',', quote '"', "be quoted" 'value') + \det+ List of foreign tables Schema | Table | Server | FDW options | Description ---------+-------+--------+-------------+------------- -(0 rows) +--------+-------+--------+-------------------------------------------------+------------- + public | ft1 | s0 | (delimiter ',', quote '"', "be quoted" 'value') | ft1 +(1 row) CREATE INDEX id_ft1_c2 ON ft1 (c2); -- ERROR -ERROR: relation "ft1" does not exist +ERROR: cannot create index on foreign table "ft1" SELECT * FROM ft1; -- ERROR -ERROR: relation "ft1" does not exist -LINE 1: SELECT * FROM ft1; - ^ +ERROR: foreign-data wrapper "dummy" has no handler EXPLAIN SELECT * FROM ft1; -- ERROR -ERROR: relation "ft1" does not exist -LINE 1: EXPLAIN SELECT * FROM ft1; - ^ +ERROR: foreign-data wrapper "dummy" has no handler -- ALTER FOREIGN TABLE COMMENT ON FOREIGN TABLE ft1 IS 'foreign table'; -ERROR: relation "ft1" does not exist COMMENT ON FOREIGN TABLE ft1 IS NULL; -ERROR: relation "ft1" does not exist COMMENT ON COLUMN ft1.c1 IS 'foreign column'; -ERROR: relation "ft1" does not exist COMMENT ON COLUMN ft1.c1 IS NULL; -ERROR: relation "ft1" does not exist ALTER FOREIGN TABLE ft1 ADD COLUMN c4 integer; -ERROR: relation "ft1" does not exist ALTER FOREIGN TABLE ft1 ADD COLUMN c5 integer DEFAULT 0; -ERROR: relation "ft1" does not exist ALTER FOREIGN TABLE ft1 ADD COLUMN c6 integer; -ERROR: relation "ft1" does not exist ALTER FOREIGN TABLE ft1 ADD COLUMN c7 integer NOT NULL; -ERROR: relation "ft1" does not exist ALTER FOREIGN TABLE ft1 ADD COLUMN c8 integer; -ERROR: relation "ft1" does not exist ALTER FOREIGN TABLE ft1 ADD COLUMN c9 integer; -ERROR: relation "ft1" does not exist ALTER FOREIGN TABLE ft1 ADD COLUMN c10 integer OPTIONS (p1 'v1'); -ERROR: relation "ft1" does not exist ALTER FOREIGN TABLE ft1 ALTER COLUMN c4 SET DEFAULT 0; -ERROR: relation "ft1" does not exist ALTER FOREIGN TABLE ft1 ALTER COLUMN c5 DROP DEFAULT; -ERROR: relation "ft1" does not exist ALTER FOREIGN TABLE ft1 ALTER COLUMN c6 SET NOT NULL; -ERROR: relation "ft1" does not exist ALTER FOREIGN TABLE ft1 ALTER COLUMN c7 DROP NOT NULL; -ERROR: relation "ft1" does not exist ALTER FOREIGN TABLE ft1 ALTER COLUMN c8 TYPE char(10) USING '0'; -- ERROR -ERROR: relation "ft1" does not exist +ERROR: "ft1" is not a table ALTER FOREIGN TABLE ft1 ALTER COLUMN 
c8 TYPE char(10); -ERROR: relation "ft1" does not exist ALTER FOREIGN TABLE ft1 ALTER COLUMN c8 SET DATA TYPE text; -ERROR: relation "ft1" does not exist ALTER FOREIGN TABLE ft1 ALTER COLUMN xmin OPTIONS (ADD p1 'v1'); -- ERROR -ERROR: relation "ft1" does not exist +ERROR: cannot alter system column "xmin" ALTER FOREIGN TABLE ft1 ALTER COLUMN c7 OPTIONS (ADD p1 'v1', ADD p2 'v2'), ALTER COLUMN c8 OPTIONS (ADD p1 'v1', ADD p2 'v2'); -ERROR: relation "ft1" does not exist ALTER FOREIGN TABLE ft1 ALTER COLUMN c8 OPTIONS (SET p2 'V2', DROP p1); -ERROR: relation "ft1" does not exist ALTER FOREIGN TABLE ft1 ALTER COLUMN c1 SET STATISTICS 10000; -ERROR: relation "ft1" does not exist ALTER FOREIGN TABLE ft1 ALTER COLUMN c1 SET (n_distinct = 100); -ERROR: relation "ft1" does not exist ALTER FOREIGN TABLE ft1 ALTER COLUMN c8 SET STATISTICS -1; -ERROR: relation "ft1" does not exist ALTER FOREIGN TABLE ft1 ALTER COLUMN c8 SET STORAGE PLAIN; -ERROR: relation "ft1" does not exist \d+ ft1 + Foreign table "public.ft1" + Column | Type | Collation | Nullable | Default | FDW options | Storage | Stats target | Description +--------+---------+-----------+----------+---------+--------------------------------+----------+--------------+------------- + c1 | integer | | not null | | ("param 1" 'val1') | plain | 10000 | + c2 | text | | | | (param2 'val2', param3 'val3') | extended | | + c3 | date | | | | | plain | | + c4 | integer | | | 0 | | plain | | + c5 | integer | | | | | plain | | + c6 | integer | | not null | | | plain | | + c7 | integer | | | | (p1 'v1', p2 'v2') | plain | | + c8 | text | | | | (p2 'V2') | plain | | + c9 | integer | | | | | plain | | + c10 | integer | | | | (p1 'v1') | plain | | +Check constraints: + "ft1_c2_check" CHECK (c2 <> ''::text) + "ft1_c3_check" CHECK (c3 >= '01-01-1994'::date AND c3 <= '01-31-1994'::date) +Server: s0 +FDW options: (delimiter ',', quote '"', "be quoted" 'value') + -- can't change the column type if it's used elsewhere CREATE TABLE use_ft1_column_type (x ft1); -ERROR: type "ft1" does not exist -LINE 1: CREATE TABLE use_ft1_column_type (x ft1); - ^ ALTER FOREIGN TABLE ft1 ALTER COLUMN c8 SET DATA TYPE integer; -- ERROR -ERROR: relation "ft1" does not exist +ERROR: cannot alter foreign table "ft1" because column "use_ft1_column_type.x" uses its row type DROP TABLE use_ft1_column_type; -ERROR: table "use_ft1_column_type" does not exist ALTER FOREIGN TABLE ft1 ADD PRIMARY KEY (c7); -- ERROR -ERROR: relation "ft1" does not exist +ERROR: primary key constraints are not supported on foreign tables +LINE 1: ALTER FOREIGN TABLE ft1 ADD PRIMARY KEY (c7); + ^ ALTER FOREIGN TABLE ft1 ADD CONSTRAINT ft1_c9_check CHECK (c9 < 0) NOT VALID; -ERROR: relation "ft1" does not exist ALTER FOREIGN TABLE ft1 ALTER CONSTRAINT ft1_c9_check DEFERRABLE; -- ERROR -ERROR: relation "ft1" does not exist +ERROR: "ft1" is not a table ALTER FOREIGN TABLE ft1 DROP CONSTRAINT ft1_c9_check; -ERROR: relation "ft1" does not exist ALTER FOREIGN TABLE ft1 DROP CONSTRAINT no_const; -- ERROR -ERROR: relation "ft1" does not exist +ERROR: constraint "no_const" of relation "ft1" does not exist ALTER FOREIGN TABLE ft1 DROP CONSTRAINT IF EXISTS no_const; -ERROR: relation "ft1" does not exist +NOTICE: constraint "no_const" of relation "ft1" does not exist, skipping ALTER FOREIGN TABLE ft1 SET WITH OIDS; -ERROR: relation "ft1" does not exist ALTER FOREIGN TABLE ft1 OWNER TO regress_test_role; -ERROR: relation "ft1" does not exist ALTER FOREIGN TABLE ft1 OPTIONS (DROP delimiter, SET quote '~', ADD escape '@'); -ERROR: 
relation "ft1" does not exist ALTER FOREIGN TABLE ft1 DROP COLUMN no_column; -- ERROR -ERROR: relation "ft1" does not exist +ERROR: column "no_column" of relation "ft1" does not exist ALTER FOREIGN TABLE ft1 DROP COLUMN IF EXISTS no_column; -ERROR: relation "ft1" does not exist +NOTICE: column "no_column" of relation "ft1" does not exist, skipping ALTER FOREIGN TABLE ft1 DROP COLUMN c9; -ERROR: relation "ft1" does not exist ALTER FOREIGN TABLE ft1 SET SCHEMA foreign_schema; -ERROR: relation "ft1" does not exist ALTER FOREIGN TABLE ft1 SET TABLESPACE ts; -- ERROR ERROR: relation "ft1" does not exist ALTER FOREIGN TABLE foreign_schema.ft1 RENAME c1 TO foreign_column_1; -ERROR: relation "foreign_schema.ft1" does not exist ALTER FOREIGN TABLE foreign_schema.ft1 RENAME TO foreign_table_1; -ERROR: relation "foreign_schema.ft1" does not exist \d foreign_schema.foreign_table_1 + Foreign table "foreign_schema.foreign_table_1" + Column | Type | Collation | Nullable | Default | FDW options +------------------+---------+-----------+----------+---------+-------------------------------- + foreign_column_1 | integer | | not null | | ("param 1" 'val1') + c2 | text | | | | (param2 'val2', param3 'val3') + c3 | date | | | | + c4 | integer | | | 0 | + c5 | integer | | | | + c6 | integer | | not null | | + c7 | integer | | | | (p1 'v1', p2 'v2') + c8 | text | | | | (p2 'V2') + c10 | integer | | | | (p1 'v1') +Check constraints: + "ft1_c2_check" CHECK (c2 <> ''::text) + "ft1_c3_check" CHECK (c3 >= '01-01-1994'::date AND c3 <= '01-31-1994'::date) +Server: s0 +FDW options: (quote '~', "be quoted" 'value', escape '@') + -- alter noexisting table ALTER FOREIGN TABLE IF EXISTS doesnt_exist_ft1 ADD COLUMN c4 integer; NOTICE: relation "doesnt_exist_ft1" does not exist, skipping @@ -807,77 +878,132 @@ NOTICE: relation "doesnt_exist_ft1" does not exist, skipping -- Information schema SELECT * FROM information_schema.foreign_data_wrappers ORDER BY 1, 2; foreign_data_wrapper_catalog | foreign_data_wrapper_name | authorization_identifier | library_name | foreign_data_wrapper_language -------------------------------+---------------------------+--------------------------+--------------+------------------------------- -(0 rows) +------------------------------+---------------------------+---------------------------+--------------+------------------------------- + regression | dummy | regress_foreign_data_user | | c + regression | foo | regress_foreign_data_user | | c + regression | postgresql | regress_foreign_data_user | | c +(3 rows) SELECT * FROM information_schema.foreign_data_wrapper_options ORDER BY 1, 2, 3; foreign_data_wrapper_catalog | foreign_data_wrapper_name | option_name | option_value -------------------------------+---------------------------+-------------+-------------- -(0 rows) +------------------------------+---------------------------+--------------+-------------- + regression | foo | test wrapper | true +(1 row) SELECT * FROM information_schema.foreign_servers ORDER BY 1, 2; foreign_server_catalog | foreign_server_name | foreign_data_wrapper_catalog | foreign_data_wrapper_name | foreign_server_type | foreign_server_version | authorization_identifier -------------------------+---------------------+------------------------------+---------------------------+---------------------+------------------------+-------------------------- -(0 rows) 
+------------------------+---------------------+------------------------------+---------------------------+---------------------+------------------------+--------------------------- + regression | s0 | regression | dummy | | | regress_foreign_data_user + regression | s4 | regression | foo | oracle | | regress_foreign_data_user + regression | s5 | regression | foo | | 15.0 | regress_test_role + regression | s6 | regression | foo | | 16.0 | regress_test_indirect + regression | s8 | regression | postgresql | | | regress_foreign_data_user + regression | t1 | regression | foo | | | regress_test_indirect + regression | t2 | regression | foo | | | regress_test_role +(7 rows) SELECT * FROM information_schema.foreign_server_options ORDER BY 1, 2, 3; foreign_server_catalog | foreign_server_name | option_name | option_value -------------------------+---------------------+-------------+-------------- -(0 rows) +------------------------+---------------------+-----------------+-------------- + regression | s4 | dbname | b + regression | s4 | host | a + regression | s6 | dbname | b + regression | s6 | host | a + regression | s8 | connect_timeout | 30 + regression | s8 | dbname | db1 +(6 rows) SELECT * FROM information_schema.user_mappings ORDER BY lower(authorization_identifier), 2, 3; authorization_identifier | foreign_server_catalog | foreign_server_name ---------------------------+------------------------+--------------------- -(0 rows) +---------------------------+------------------------+--------------------- + PUBLIC | regression | s4 + PUBLIC | regression | s8 + PUBLIC | regression | t1 + regress_foreign_data_user | regression | s4 + regress_foreign_data_user | regression | s8 + regress_test_role | regression | s5 + regress_test_role | regression | s6 + regress_test_role | regression | t1 +(8 rows) SELECT * FROM information_schema.user_mapping_options ORDER BY lower(authorization_identifier), 2, 3, 4; authorization_identifier | foreign_server_catalog | foreign_server_name | option_name | option_value ---------------------------+------------------------+---------------------+-------------+-------------- -(0 rows) +---------------------------+------------------------+---------------------+--------------+-------------- + PUBLIC | regression | s4 | this mapping | is public + PUBLIC | regression | t1 | modified | 1 + regress_foreign_data_user | regression | s8 | password | public + regress_test_role | regression | s5 | modified | 1 + regress_test_role | regression | s6 | username | test + regress_test_role | regression | t1 | password | boo + regress_test_role | regression | t1 | username | bob +(7 rows) SELECT * FROM information_schema.usage_privileges WHERE object_type LIKE 'FOREIGN%' AND object_name IN ('s6', 'foo') ORDER BY 1, 2, 3, 4, 5; grantor | grantee | object_catalog | object_schema | object_name | object_type | privilege_type | is_grantable ----------+---------+----------------+---------------+-------------+-------------+----------------+-------------- -(0 rows) +---------------------------+---------------------------+----------------+---------------+-------------+----------------------+----------------+-------------- + regress_foreign_data_user | regress_foreign_data_user | regression | | foo | FOREIGN DATA WRAPPER | USAGE | YES + regress_foreign_data_user | regress_test_indirect | regression | | foo | FOREIGN DATA WRAPPER | USAGE | NO + regress_test_indirect | regress_test_indirect | regression | | s6 | FOREIGN SERVER | USAGE | YES + regress_test_indirect | regress_test_role2 | regression 
| | s6 | FOREIGN SERVER | USAGE | YES +(4 rows) SELECT * FROM information_schema.role_usage_grants WHERE object_type LIKE 'FOREIGN%' AND object_name IN ('s6', 'foo') ORDER BY 1, 2, 3, 4, 5; grantor | grantee | object_catalog | object_schema | object_name | object_type | privilege_type | is_grantable ----------+---------+----------------+---------------+-------------+-------------+----------------+-------------- -(0 rows) +---------------------------+---------------------------+----------------+---------------+-------------+----------------------+----------------+-------------- + regress_foreign_data_user | regress_foreign_data_user | regression | | foo | FOREIGN DATA WRAPPER | USAGE | YES + regress_foreign_data_user | regress_test_indirect | regression | | foo | FOREIGN DATA WRAPPER | USAGE | NO + regress_test_indirect | regress_test_indirect | regression | | s6 | FOREIGN SERVER | USAGE | YES + regress_test_indirect | regress_test_role2 | regression | | s6 | FOREIGN SERVER | USAGE | YES +(4 rows) SELECT * FROM information_schema.foreign_tables ORDER BY 1, 2, 3; foreign_table_catalog | foreign_table_schema | foreign_table_name | foreign_server_catalog | foreign_server_name -----------------------+----------------------+--------------------+------------------------+--------------------- -(0 rows) + regression | foreign_schema | foreign_table_1 | regression | s0 +(1 row) SELECT * FROM information_schema.foreign_table_options ORDER BY 1, 2, 3, 4; foreign_table_catalog | foreign_table_schema | foreign_table_name | option_name | option_value -----------------------+----------------------+--------------------+-------------+-------------- -(0 rows) + regression | foreign_schema | foreign_table_1 | be quoted | value + regression | foreign_schema | foreign_table_1 | escape | @ + regression | foreign_schema | foreign_table_1 | quote | ~ +(3 rows) SET ROLE regress_test_role; SELECT * FROM information_schema.user_mapping_options ORDER BY 1, 2, 3, 4; authorization_identifier | foreign_server_catalog | foreign_server_name | option_name | option_value --------------------------+------------------------+---------------------+-------------+-------------- -(0 rows) + PUBLIC | regression | t1 | modified | 1 + regress_test_role | regression | s5 | modified | 1 + regress_test_role | regression | s6 | username | test + regress_test_role | regression | t1 | password | boo + regress_test_role | regression | t1 | username | bob +(5 rows) SELECT * FROM information_schema.usage_privileges WHERE object_type LIKE 'FOREIGN%' AND object_name IN ('s6', 'foo') ORDER BY 1, 2, 3, 4, 5; grantor | grantee | object_catalog | object_schema | object_name | object_type | privilege_type | is_grantable ----------+---------+----------------+---------------+-------------+-------------+----------------+-------------- -(0 rows) +---------------------------+-----------------------+----------------+---------------+-------------+----------------------+----------------+-------------- + regress_foreign_data_user | regress_test_indirect | regression | | foo | FOREIGN DATA WRAPPER | USAGE | NO + regress_test_indirect | regress_test_indirect | regression | | s6 | FOREIGN SERVER | USAGE | YES + regress_test_indirect | regress_test_role2 | regression | | s6 | FOREIGN SERVER | USAGE | YES +(3 rows) SELECT * FROM information_schema.role_usage_grants WHERE object_type LIKE 'FOREIGN%' AND object_name IN ('s6', 'foo') ORDER BY 1, 2, 3, 4, 5; grantor | grantee | object_catalog | object_schema | object_name | object_type | privilege_type | is_grantable 
----------+---------+----------------+---------------+-------------+-------------+----------------+-------------- -(0 rows) +---------------------------+-----------------------+----------------+---------------+-------------+----------------------+----------------+-------------- + regress_foreign_data_user | regress_test_indirect | regression | | foo | FOREIGN DATA WRAPPER | USAGE | NO + regress_test_indirect | regress_test_indirect | regression | | s6 | FOREIGN SERVER | USAGE | YES + regress_test_indirect | regress_test_role2 | regression | | s6 | FOREIGN SERVER | USAGE | YES +(3 rows) DROP USER MAPPING FOR current_user SERVER t1; -ERROR: server "t1" does not exist SET ROLE regress_test_role2; SELECT * FROM information_schema.user_mapping_options ORDER BY 1, 2, 3, 4; authorization_identifier | foreign_server_catalog | foreign_server_name | option_name | option_value --------------------------+------------------------+---------------------+-------------+-------------- -(0 rows) + regress_test_role | regression | s6 | username | +(1 row) RESET ROLE; -- has_foreign_data_wrapper_privilege @@ -885,229 +1011,255 @@ SELECT has_foreign_data_wrapper_privilege('regress_test_role', (SELECT oid FROM pg_foreign_data_wrapper WHERE fdwname='foo'), 'USAGE'); has_foreign_data_wrapper_privilege ------------------------------------ - + t (1 row) SELECT has_foreign_data_wrapper_privilege('regress_test_role', 'foo', 'USAGE'); -ERROR: foreign-data wrapper "foo" does not exist + has_foreign_data_wrapper_privilege +------------------------------------ + t +(1 row) + SELECT has_foreign_data_wrapper_privilege( (SELECT oid FROM pg_catalog.pg_roles WHERE rolname='regress_test_role'), (SELECT oid FROM pg_foreign_data_wrapper WHERE fdwname='foo'), 'USAGE'); has_foreign_data_wrapper_privilege ------------------------------------ - + t (1 row) SELECT has_foreign_data_wrapper_privilege( (SELECT oid FROM pg_foreign_data_wrapper WHERE fdwname='foo'), 'USAGE'); has_foreign_data_wrapper_privilege ------------------------------------ - + t (1 row) SELECT has_foreign_data_wrapper_privilege( (SELECT oid FROM pg_catalog.pg_roles WHERE rolname='regress_test_role'), 'foo', 'USAGE'); -ERROR: foreign-data wrapper "foo" does not exist + has_foreign_data_wrapper_privilege +------------------------------------ + t +(1 row) + SELECT has_foreign_data_wrapper_privilege('foo', 'USAGE'); -ERROR: foreign-data wrapper "foo" does not exist + has_foreign_data_wrapper_privilege +------------------------------------ + t +(1 row) + GRANT USAGE ON FOREIGN DATA WRAPPER foo TO regress_test_role; -ERROR: foreign-data wrapper "foo" does not exist SELECT has_foreign_data_wrapper_privilege('regress_test_role', 'foo', 'USAGE'); -ERROR: foreign-data wrapper "foo" does not exist + has_foreign_data_wrapper_privilege +------------------------------------ + t +(1 row) + -- has_server_privilege SELECT has_server_privilege('regress_test_role', (SELECT oid FROM pg_foreign_server WHERE srvname='s8'), 'USAGE'); has_server_privilege ---------------------- - + f (1 row) SELECT has_server_privilege('regress_test_role', 's8', 'USAGE'); -ERROR: server "s8" does not exist + has_server_privilege +---------------------- + f +(1 row) + SELECT has_server_privilege( (SELECT oid FROM pg_catalog.pg_roles WHERE rolname='regress_test_role'), (SELECT oid FROM pg_foreign_server WHERE srvname='s8'), 'USAGE'); has_server_privilege ---------------------- - + f (1 row) SELECT has_server_privilege( (SELECT oid FROM pg_foreign_server WHERE srvname='s8'), 'USAGE'); has_server_privilege 
---------------------- - + t (1 row) SELECT has_server_privilege( (SELECT oid FROM pg_catalog.pg_roles WHERE rolname='regress_test_role'), 's8', 'USAGE'); -ERROR: server "s8" does not exist + has_server_privilege +---------------------- + f +(1 row) + SELECT has_server_privilege('s8', 'USAGE'); -ERROR: server "s8" does not exist + has_server_privilege +---------------------- + t +(1 row) + GRANT USAGE ON FOREIGN SERVER s8 TO regress_test_role; -ERROR: server "s8" does not exist SELECT has_server_privilege('regress_test_role', 's8', 'USAGE'); -ERROR: server "s8" does not exist + has_server_privilege +---------------------- + t +(1 row) + REVOKE USAGE ON FOREIGN SERVER s8 FROM regress_test_role; -ERROR: server "s8" does not exist GRANT USAGE ON FOREIGN SERVER s4 TO regress_test_role; -ERROR: server "s4" does not exist DROP USER MAPPING FOR public SERVER s4; -ERROR: server "s4" does not exist ALTER SERVER s6 OPTIONS (DROP host, DROP dbname); -ERROR: server "s6" does not exist ALTER USER MAPPING FOR regress_test_role SERVER s6 OPTIONS (DROP username); -ERROR: server "s6" does not exist ALTER FOREIGN DATA WRAPPER foo VALIDATOR postgresql_fdw_validator; -ERROR: foreign-data wrapper "foo" does not exist +WARNING: changing the foreign-data wrapper validator can cause the options for dependent objects to become invalid -- Privileges SET ROLE regress_unprivileged_role; CREATE FOREIGN DATA WRAPPER foobar; -- ERROR -ERROR: Postgres-XL does not support FOREIGN DATA WRAPPER yet -DETAIL: The feature is not currently supported +ERROR: permission denied to create foreign-data wrapper "foobar" +HINT: Must be superuser to create a foreign-data wrapper. ALTER FOREIGN DATA WRAPPER foo OPTIONS (gotcha 'true'); -- ERROR ERROR: permission denied to alter foreign-data wrapper "foo" HINT: Must be superuser to alter a foreign-data wrapper. ALTER FOREIGN DATA WRAPPER foo OWNER TO regress_unprivileged_role; -- ERROR -ERROR: foreign-data wrapper "foo" does not exist +ERROR: permission denied to change owner of foreign-data wrapper "foo" +HINT: Must be superuser to change owner of a foreign-data wrapper. 
DROP FOREIGN DATA WRAPPER foo; -- ERROR -ERROR: foreign-data wrapper "foo" does not exist +ERROR: must be owner of foreign-data wrapper foo GRANT USAGE ON FOREIGN DATA WRAPPER foo TO regress_test_role; -- ERROR -ERROR: foreign-data wrapper "foo" does not exist +ERROR: permission denied for foreign-data wrapper foo CREATE SERVER s9 FOREIGN DATA WRAPPER foo; -- ERROR -ERROR: Postgres-XL does not support SERVER yet -DETAIL: The feature is not currently supported +ERROR: permission denied for foreign-data wrapper foo ALTER SERVER s4 VERSION '0.5'; -- ERROR -ERROR: server "s4" does not exist +ERROR: must be owner of foreign server s4 ALTER SERVER s4 OWNER TO regress_unprivileged_role; -- ERROR -ERROR: server "s4" does not exist +ERROR: must be owner of foreign server s4 DROP SERVER s4; -- ERROR -ERROR: server "s4" does not exist +ERROR: must be owner of foreign server s4 GRANT USAGE ON FOREIGN SERVER s4 TO regress_test_role; -- ERROR -ERROR: server "s4" does not exist +ERROR: permission denied for foreign server s4 CREATE USER MAPPING FOR public SERVER s4; -- ERROR -ERROR: Postgres-XL does not support USER MAPPING yet -DETAIL: The feature is not currently supported +ERROR: must be owner of foreign server s4 ALTER USER MAPPING FOR regress_test_role SERVER s6 OPTIONS (gotcha 'true'); -- ERROR -ERROR: server "s6" does not exist +ERROR: must be owner of foreign server s6 DROP USER MAPPING FOR regress_test_role SERVER s6; -- ERROR -ERROR: server "s6" does not exist +ERROR: must be owner of foreign server s6 RESET ROLE; GRANT USAGE ON FOREIGN DATA WRAPPER postgresql TO regress_unprivileged_role; -ERROR: foreign-data wrapper "postgresql" does not exist GRANT USAGE ON FOREIGN DATA WRAPPER foo TO regress_unprivileged_role WITH GRANT OPTION; -ERROR: foreign-data wrapper "foo" does not exist SET ROLE regress_unprivileged_role; CREATE FOREIGN DATA WRAPPER foobar; -- ERROR -ERROR: Postgres-XL does not support FOREIGN DATA WRAPPER yet -DETAIL: The feature is not currently supported +ERROR: permission denied to create foreign-data wrapper "foobar" +HINT: Must be superuser to create a foreign-data wrapper. ALTER FOREIGN DATA WRAPPER foo OPTIONS (gotcha 'true'); -- ERROR ERROR: permission denied to alter foreign-data wrapper "foo" HINT: Must be superuser to alter a foreign-data wrapper. 
DROP FOREIGN DATA WRAPPER foo; -- ERROR -ERROR: foreign-data wrapper "foo" does not exist +ERROR: must be owner of foreign-data wrapper foo GRANT USAGE ON FOREIGN DATA WRAPPER postgresql TO regress_test_role; -- WARNING -ERROR: foreign-data wrapper "postgresql" does not exist +WARNING: no privileges were granted for "postgresql" GRANT USAGE ON FOREIGN DATA WRAPPER foo TO regress_test_role; -ERROR: foreign-data wrapper "foo" does not exist CREATE SERVER s9 FOREIGN DATA WRAPPER postgresql; -ERROR: Postgres-XL does not support SERVER yet -DETAIL: The feature is not currently supported ALTER SERVER s6 VERSION '0.5'; -- ERROR -ERROR: server "s6" does not exist +ERROR: must be owner of foreign server s6 DROP SERVER s6; -- ERROR -ERROR: server "s6" does not exist +ERROR: must be owner of foreign server s6 GRANT USAGE ON FOREIGN SERVER s6 TO regress_test_role; -- ERROR -ERROR: server "s6" does not exist +ERROR: permission denied for foreign server s6 GRANT USAGE ON FOREIGN SERVER s9 TO regress_test_role; -ERROR: server "s9" does not exist CREATE USER MAPPING FOR public SERVER s6; -- ERROR -ERROR: Postgres-XL does not support USER MAPPING yet -DETAIL: The feature is not currently supported +ERROR: must be owner of foreign server s6 CREATE USER MAPPING FOR public SERVER s9; -ERROR: Postgres-XL does not support USER MAPPING yet -DETAIL: The feature is not currently supported ALTER USER MAPPING FOR regress_test_role SERVER s6 OPTIONS (gotcha 'true'); -- ERROR -ERROR: server "s6" does not exist +ERROR: must be owner of foreign server s6 DROP USER MAPPING FOR regress_test_role SERVER s6; -- ERROR -ERROR: server "s6" does not exist +ERROR: must be owner of foreign server s6 RESET ROLE; REVOKE USAGE ON FOREIGN DATA WRAPPER foo FROM regress_unprivileged_role; -- ERROR -ERROR: foreign-data wrapper "foo" does not exist +ERROR: dependent privileges exist +HINT: Use CASCADE to revoke them too. REVOKE USAGE ON FOREIGN DATA WRAPPER foo FROM regress_unprivileged_role CASCADE; -ERROR: foreign-data wrapper "foo" does not exist SET ROLE regress_unprivileged_role; GRANT USAGE ON FOREIGN DATA WRAPPER foo TO regress_test_role; -- ERROR -ERROR: foreign-data wrapper "foo" does not exist +ERROR: permission denied for foreign-data wrapper foo CREATE SERVER s10 FOREIGN DATA WRAPPER foo; -- ERROR -ERROR: Postgres-XL does not support SERVER yet -DETAIL: The feature is not currently supported +ERROR: permission denied for foreign-data wrapper foo ALTER SERVER s9 VERSION '1.1'; -ERROR: server "s9" does not exist GRANT USAGE ON FOREIGN SERVER s9 TO regress_test_role; -ERROR: server "s9" does not exist CREATE USER MAPPING FOR current_user SERVER s9; -ERROR: Postgres-XL does not support USER MAPPING yet -DETAIL: The feature is not currently supported -- We use terse mode to avoid ordering issues in cascade detail output. 
\set VERBOSITY terse DROP SERVER s9 CASCADE; -ERROR: server "s9" does not exist +NOTICE: drop cascades to 2 other objects \set VERBOSITY default RESET ROLE; CREATE SERVER s9 FOREIGN DATA WRAPPER foo; -ERROR: Postgres-XL does not support SERVER yet -DETAIL: The feature is not currently supported GRANT USAGE ON FOREIGN SERVER s9 TO regress_unprivileged_role; -ERROR: server "s9" does not exist SET ROLE regress_unprivileged_role; ALTER SERVER s9 VERSION '1.2'; -- ERROR -ERROR: server "s9" does not exist +ERROR: must be owner of foreign server s9 GRANT USAGE ON FOREIGN SERVER s9 TO regress_test_role; -- WARNING -ERROR: server "s9" does not exist +WARNING: no privileges were granted for "s9" CREATE USER MAPPING FOR current_user SERVER s9; -ERROR: Postgres-XL does not support USER MAPPING yet -DETAIL: The feature is not currently supported DROP SERVER s9 CASCADE; -- ERROR -ERROR: server "s9" does not exist +ERROR: must be owner of foreign server s9 -- Check visibility of user mapping data SET ROLE regress_test_role; CREATE SERVER s10 FOREIGN DATA WRAPPER foo; -ERROR: Postgres-XL does not support SERVER yet -DETAIL: The feature is not currently supported CREATE USER MAPPING FOR public SERVER s10 OPTIONS (user 'secret'); -ERROR: Postgres-XL does not support USER MAPPING yet -DETAIL: The feature is not currently supported CREATE USER MAPPING FOR regress_unprivileged_role SERVER s10 OPTIONS (user 'secret'); -ERROR: Postgres-XL does not support USER MAPPING yet -DETAIL: The feature is not currently supported -- owner of server can see some option fields \deu+ List of user mappings Server | User name | FDW options ---------+-----------+------------- -(0 rows) +--------+---------------------------+------------------- + s10 | public | ("user" 'secret') + s10 | regress_unprivileged_role | + s4 | regress_foreign_data_user | + s5 | regress_test_role | (modified '1') + s6 | regress_test_role | + s8 | public | + s8 | regress_foreign_data_user | + s9 | regress_unprivileged_role | + t1 | public | (modified '1') +(9 rows) RESET ROLE; -- superuser can see all option fields \deu+ List of user mappings Server | User name | FDW options ---------+-----------+------------- -(0 rows) +--------+---------------------------+--------------------- + s10 | public | ("user" 'secret') + s10 | regress_unprivileged_role | ("user" 'secret') + s4 | regress_foreign_data_user | + s5 | regress_test_role | (modified '1') + s6 | regress_test_role | + s8 | public | + s8 | regress_foreign_data_user | (password 'public') + s9 | regress_unprivileged_role | + t1 | public | (modified '1') +(9 rows) -- unprivileged user cannot see any option field SET ROLE regress_unprivileged_role; \deu+ List of user mappings Server | User name | FDW options ---------+-----------+------------- -(0 rows) +--------+---------------------------+------------- + s10 | public | + s10 | regress_unprivileged_role | + s4 | regress_foreign_data_user | + s5 | regress_test_role | + s6 | regress_test_role | + s8 | public | + s8 | regress_foreign_data_user | + s9 | regress_unprivileged_role | + t1 | public | +(9 rows) RESET ROLE; \set VERBOSITY terse DROP SERVER s10 CASCADE; -ERROR: server "s10" does not exist +NOTICE: drop cascades to 2 other objects \set VERBOSITY default -- Triggers CREATE FUNCTION dummy_trigger() RETURNS TRIGGER AS $$ @@ -1154,18 +1306,18 @@ ERROR: Postgres-XL does not support TRIGGER yet DETAIL: The feature is not currently supported ALTER FOREIGN TABLE foreign_schema.foreign_table_1 DISABLE TRIGGER trigtest_before_stmt; -ERROR: relation 
"foreign_schema.foreign_table_1" does not exist +ERROR: trigger "trigtest_before_stmt" for table "foreign_table_1" does not exist ALTER FOREIGN TABLE foreign_schema.foreign_table_1 ENABLE TRIGGER trigtest_before_stmt; -ERROR: relation "foreign_schema.foreign_table_1" does not exist +ERROR: trigger "trigtest_before_stmt" for table "foreign_table_1" does not exist DROP TRIGGER trigtest_before_stmt ON foreign_schema.foreign_table_1; -ERROR: relation "foreign_schema.foreign_table_1" does not exist +ERROR: trigger "trigtest_before_stmt" for table "foreign_table_1" does not exist DROP TRIGGER trigtest_before_row ON foreign_schema.foreign_table_1; -ERROR: relation "foreign_schema.foreign_table_1" does not exist +ERROR: trigger "trigtest_before_row" for table "foreign_table_1" does not exist DROP TRIGGER trigtest_after_stmt ON foreign_schema.foreign_table_1; -ERROR: relation "foreign_schema.foreign_table_1" does not exist +ERROR: trigger "trigtest_after_stmt" for table "foreign_table_1" does not exist DROP TRIGGER trigtest_after_row ON foreign_schema.foreign_table_1; -ERROR: relation "foreign_schema.foreign_table_1" does not exist +ERROR: trigger "trigtest_after_row" for table "foreign_table_1" does not exist DROP FUNCTION dummy_trigger(); -- Table inheritance CREATE TABLE pt1 ( @@ -1175,7 +1327,6 @@ CREATE TABLE pt1 ( ); CREATE FOREIGN TABLE ft2 () INHERITS (pt1) SERVER s0 OPTIONS (delimiter ',', quote '"', "be quoted" 'value'); -ERROR: server "s0" does not exist \d+ pt1 Table "public.pt1" Column | Type | Collation | Nullable | Default | Storage | Stats target | Description @@ -1183,12 +1334,22 @@ ERROR: server "s0" does not exist c1 | integer | | not null | | plain | | c2 | text | | | | extended | | c3 | date | | | | plain | | +Child tables: ft2 Distribute By: HASH(c1) Location Nodes: ALL DATANODES \d+ ft2 + Foreign table "public.ft2" + Column | Type | Collation | Nullable | Default | FDW options | Storage | Stats target | Description +--------+---------+-----------+----------+---------+-------------+----------+--------------+------------- + c1 | integer | | not null | | | plain | | + c2 | text | | | | | extended | | + c3 | date | | | | | plain | | +Server: s0 +FDW options: (delimiter ',', quote '"', "be quoted" 'value') +Inherits: pt1 + DROP FOREIGN TABLE ft2; -ERROR: foreign table "ft2" does not exist \d+ pt1 Table "public.pt1" Column | Type | Collation | Nullable | Default | Storage | Stats target | Description @@ -1204,10 +1365,17 @@ CREATE FOREIGN TABLE ft2 ( c2 text, c3 date ) SERVER s0 OPTIONS (delimiter ',', quote '"', "be quoted" 'value'); -ERROR: server "s0" does not exist \d+ ft2 + Foreign table "public.ft2" + Column | Type | Collation | Nullable | Default | FDW options | Storage | Stats target | Description +--------+---------+-----------+----------+---------+-------------+----------+--------------+------------- + c1 | integer | | not null | | | plain | | + c2 | text | | | | | extended | | + c3 | date | | | | | plain | | +Server: s0 +FDW options: (delimiter ',', quote '"', "be quoted" 'value') + ALTER FOREIGN TABLE ft2 INHERIT pt1; -ERROR: relation "ft2" does not exist \d+ pt1 Table "public.pt1" Column | Type | Collation | Nullable | Default | Storage | Stats target | Description @@ -1215,20 +1383,41 @@ ERROR: relation "ft2" does not exist c1 | integer | | not null | | plain | | c2 | text | | | | extended | | c3 | date | | | | plain | | +Child tables: ft2 Distribute By: HASH(c1) Location Nodes: ALL DATANODES \d+ ft2 + Foreign table "public.ft2" + Column | Type | Collation | Nullable 
| Default | FDW options | Storage | Stats target | Description +--------+---------+-----------+----------+---------+-------------+----------+--------------+------------- + c1 | integer | | not null | | | plain | | + c2 | text | | | | | extended | | + c3 | date | | | | | plain | | +Server: s0 +FDW options: (delimiter ',', quote '"', "be quoted" 'value') +Inherits: pt1 + CREATE TABLE ct3() INHERITS(ft2); -ERROR: relation "ft2" does not exist +ERROR: inherited relation "ft2" is not a table CREATE FOREIGN TABLE ft3 ( c1 integer NOT NULL, c2 text, c3 date ) INHERITS(ft2) SERVER s0; -ERROR: relation "ft2" does not exist +ERROR: inherited relation "ft2" is not a table \d+ ft2 + Foreign table "public.ft2" + Column | Type | Collation | Nullable | Default | FDW options | Storage | Stats target | Description +--------+---------+-----------+----------+---------+-------------+----------+--------------+------------- + c1 | integer | | not null | | | plain | | + c2 | text | | | | | extended | | + c3 | date | | | | | plain | | +Server: s0 +FDW options: (delimiter ',', quote '"', "be quoted" 'value') +Inherits: pt1 + \d+ ct3 \d+ ft3 -- add attributes recursively @@ -1249,10 +1438,26 @@ ALTER TABLE pt1 ADD COLUMN c8 integer; c6 | integer | | | | plain | | c7 | integer | | not null | | plain | | c8 | integer | | | | plain | | +Child tables: ft2 Distribute By: HASH(c1) Location Nodes: ALL DATANODES \d+ ft2 + Foreign table "public.ft2" + Column | Type | Collation | Nullable | Default | FDW options | Storage | Stats target | Description +--------+---------+-----------+----------+---------+-------------+----------+--------------+------------- + c1 | integer | | not null | | | plain | | + c2 | text | | | | | extended | | + c3 | date | | | | | plain | | + c4 | integer | | | | | plain | | + c5 | integer | | | 0 | | plain | | + c6 | integer | | | | | plain | | + c7 | integer | | not null | | | plain | | + c8 | integer | | | | | plain | | +Server: s0 +FDW options: (delimiter ',', quote '"', "be quoted" 'value') +Inherits: pt1 + \d+ ct3 \d+ ft3 -- alter attributes recursively @@ -1261,6 +1466,7 @@ ALTER TABLE pt1 ALTER COLUMN c5 DROP DEFAULT; ALTER TABLE pt1 ALTER COLUMN c6 SET NOT NULL; ALTER TABLE pt1 ALTER COLUMN c7 DROP NOT NULL; ALTER TABLE pt1 ALTER COLUMN c8 TYPE char(10) USING '0'; -- ERROR +ERROR: "ft2" is not a table ALTER TABLE pt1 ALTER COLUMN c8 TYPE char(10); ALTER TABLE pt1 ALTER COLUMN c8 SET DATA TYPE text; ALTER TABLE pt1 ALTER COLUMN c1 SET STATISTICS 10000; @@ -1279,10 +1485,26 @@ ALTER TABLE pt1 ALTER COLUMN c8 SET STORAGE EXTERNAL; c6 | integer | | not null | | plain | | c7 | integer | | | | plain | | c8 | text | | | | external | | +Child tables: ft2 Distribute By: HASH(c1) Location Nodes: ALL DATANODES \d+ ft2 + Foreign table "public.ft2" + Column | Type | Collation | Nullable | Default | FDW options | Storage | Stats target | Description +--------+---------+-----------+----------+---------+-------------+----------+--------------+------------- + c1 | integer | | not null | | | plain | 10000 | + c2 | text | | | | | extended | | + c3 | date | | | | | plain | | + c4 | integer | | | 0 | | plain | | + c5 | integer | | | | | plain | | + c6 | integer | | not null | | | plain | | + c7 | integer | | | | | plain | | + c8 | text | | | | | external | | +Server: s0 +FDW options: (delimiter ',', quote '"', "be quoted" 'value') +Inherits: pt1 + -- drop attributes recursively ALTER TABLE pt1 DROP COLUMN c4; ALTER TABLE pt1 DROP COLUMN c5; @@ -1296,10 +1518,21 @@ ALTER TABLE pt1 DROP COLUMN c8; c1 | integer | | 
not null | | plain | 10000 | c2 | text | | | | extended | | c3 | date | | | | plain | | +Child tables: ft2 Distribute By: HASH(c1) Location Nodes: ALL DATANODES \d+ ft2 + Foreign table "public.ft2" + Column | Type | Collation | Nullable | Default | FDW options | Storage | Stats target | Description +--------+---------+-----------+----------+---------+-------------+----------+--------------+------------- + c1 | integer | | not null | | | plain | 10000 | + c2 | text | | | | | extended | | + c3 | date | | | | | plain | | +Server: s0 +FDW options: (delimiter ',', quote '"', "be quoted" 'value') +Inherits: pt1 + -- add constraints recursively ALTER TABLE pt1 ADD CONSTRAINT pt1chk1 CHECK (c1 > 0) NO INHERIT; ALTER TABLE pt1 ADD CONSTRAINT pt1chk2 CHECK (c2 <> ''); @@ -1325,13 +1558,25 @@ SELECT relname, conname, contype, conislocal, coninhcount, connoinherit Check constraints: "pt1chk1" CHECK (c1 > 0) NO INHERIT "pt1chk2" CHECK (c2 <> ''::text) +Child tables: ft2 Distribute By: HASH(c1) Location Nodes: ALL DATANODES \d+ ft2 + Foreign table "public.ft2" + Column | Type | Collation | Nullable | Default | FDW options | Storage | Stats target | Description +--------+---------+-----------+----------+---------+-------------+----------+--------------+------------- + c1 | integer | | not null | | | plain | 10000 | + c2 | text | | | | | extended | | + c3 | date | | | | | plain | | +Check constraints: + "pt1chk2" CHECK (c2 <> ''::text) +Server: s0 +FDW options: (delimiter ',', quote '"', "be quoted" 'value') +Inherits: pt1 + \set VERBOSITY terse DROP FOREIGN TABLE ft2; -- ERROR -ERROR: foreign table "ft2" does not exist DROP FOREIGN TABLE ft2 CASCADE; ERROR: foreign table "ft2" does not exist \set VERBOSITY default @@ -1340,14 +1585,11 @@ CREATE FOREIGN TABLE ft2 ( c2 text, c3 date ) SERVER s0 OPTIONS (delimiter ',', quote '"', "be quoted" 'value'); -ERROR: server "s0" does not exist -- child must have parent's INHERIT constraints ALTER FOREIGN TABLE ft2 INHERIT pt1; -- ERROR -ERROR: relation "ft2" does not exist +ERROR: child table is missing constraint "pt1chk2" ALTER FOREIGN TABLE ft2 ADD CONSTRAINT pt1chk2 CHECK (c2 <> ''); -ERROR: relation "ft2" does not exist ALTER FOREIGN TABLE ft2 INHERIT pt1; -ERROR: relation "ft2" does not exist -- child does not inherit NO INHERIT constraints \d+ pt1 Table "public.pt1" @@ -1359,10 +1601,23 @@ ERROR: relation "ft2" does not exist Check constraints: "pt1chk1" CHECK (c1 > 0) NO INHERIT "pt1chk2" CHECK (c2 <> ''::text) +Child tables: ft2 Distribute By: HASH(c1) Location Nodes: ALL DATANODES \d+ ft2 + Foreign table "public.ft2" + Column | Type | Collation | Nullable | Default | FDW options | Storage | Stats target | Description +--------+---------+-----------+----------+---------+-------------+----------+--------------+------------- + c1 | integer | | not null | | | plain | | + c2 | text | | | | | extended | | + c3 | date | | | | | plain | | +Check constraints: + "pt1chk2" CHECK (c2 <> ''::text) +Server: s0 +FDW options: (delimiter ',', quote '"', "be quoted" 'value') +Inherits: pt1 + -- drop constraints recursively ALTER TABLE pt1 DROP CONSTRAINT pt1chk1 CASCADE; ALTER TABLE pt1 DROP CONSTRAINT pt1chk2 CASCADE; @@ -1378,10 +1633,24 @@ ALTER TABLE pt1 ADD CONSTRAINT pt1chk3 CHECK (c2 <> '') NOT VALID; c3 | date | | | | plain | | Check constraints: "pt1chk3" CHECK (c2 <> ''::text) NOT VALID +Child tables: ft2 Distribute By: HASH(c1) Location Nodes: ALL DATANODES \d+ ft2 + Foreign table "public.ft2" + Column | Type | Collation | Nullable | Default | FDW options | 
Storage | Stats target | Description +--------+---------+-----------+----------+---------+-------------+----------+--------------+------------- + c1 | integer | | not null | | | plain | | + c2 | text | | | | | extended | | + c3 | date | | | | | plain | | +Check constraints: + "pt1chk2" CHECK (c2 <> ''::text) + "pt1chk3" CHECK (c2 <> ''::text) NOT VALID +Server: s0 +FDW options: (delimiter ',', quote '"', "be quoted" 'value') +Inherits: pt1 + -- VALIDATE CONSTRAINT need do nothing on foreign tables ALTER TABLE pt1 VALIDATE CONSTRAINT pt1chk3; \d+ pt1 @@ -1393,10 +1662,24 @@ ALTER TABLE pt1 VALIDATE CONSTRAINT pt1chk3; c3 | date | | | | plain | | Check constraints: "pt1chk3" CHECK (c2 <> ''::text) +Child tables: ft2 Distribute By: HASH(c1) Location Nodes: ALL DATANODES \d+ ft2 + Foreign table "public.ft2" + Column | Type | Collation | Nullable | Default | FDW options | Storage | Stats target | Description +--------+---------+-----------+----------+---------+-------------+----------+--------------+------------- + c1 | integer | | not null | | | plain | | + c2 | text | | | | | extended | | + c3 | date | | | | | plain | | +Check constraints: + "pt1chk2" CHECK (c2 <> ''::text) + "pt1chk3" CHECK (c2 <> ''::text) +Server: s0 +FDW options: (delimiter ',', quote '"', "be quoted" 'value') +Inherits: pt1 + -- OID system column ALTER TABLE pt1 SET WITH OIDS; \d+ pt1 @@ -1408,13 +1691,28 @@ ALTER TABLE pt1 SET WITH OIDS; c3 | date | | | | plain | | Check constraints: "pt1chk3" CHECK (c2 <> ''::text) +Child tables: ft2 Has OIDs: yes Distribute By: HASH(c1) Location Nodes: ALL DATANODES \d+ ft2 + Foreign table "public.ft2" + Column | Type | Collation | Nullable | Default | FDW options | Storage | Stats target | Description +--------+---------+-----------+----------+---------+-------------+----------+--------------+------------- + c1 | integer | | not null | | | plain | | + c2 | text | | | | | extended | | + c3 | date | | | | | plain | | +Check constraints: + "pt1chk2" CHECK (c2 <> ''::text) + "pt1chk3" CHECK (c2 <> ''::text) +Server: s0 +FDW options: (delimiter ',', quote '"', "be quoted" 'value') +Inherits: pt1 +Has OIDs: yes + ALTER TABLE ft2 SET WITHOUT OIDS; -- ERROR -ERROR: relation "ft2" does not exist +ERROR: cannot drop inherited column "oid" ALTER TABLE pt1 SET WITHOUT OIDS; \d+ pt1 Table "public.pt1" @@ -1425,10 +1723,24 @@ ALTER TABLE pt1 SET WITHOUT OIDS; c3 | date | | | | plain | | Check constraints: "pt1chk3" CHECK (c2 <> ''::text) +Child tables: ft2 Distribute By: HASH(c1) Location Nodes: ALL DATANODES \d+ ft2 + Foreign table "public.ft2" + Column | Type | Collation | Nullable | Default | FDW options | Storage | Stats target | Description +--------+---------+-----------+----------+---------+-------------+----------+--------------+------------- + c1 | integer | | not null | | | plain | | + c2 | text | | | | | extended | | + c3 | date | | | | | plain | | +Check constraints: + "pt1chk2" CHECK (c2 <> ''::text) + "pt1chk3" CHECK (c2 <> ''::text) +Server: s0 +FDW options: (delimiter ',', quote '"', "be quoted" 'value') +Inherits: pt1 + -- changes name of an attribute recursively ALTER TABLE pt1 RENAME COLUMN c1 TO f1; ALTER TABLE pt1 RENAME COLUMN c2 TO f2; @@ -1444,36 +1756,55 @@ ALTER TABLE pt1 RENAME CONSTRAINT pt1chk3 TO f2_check; f3 | date | | | | plain | | Check constraints: "f2_check" CHECK (f2 <> ''::text) +Child tables: ft2 Distribute By: HASH(f1) Location Nodes: ALL DATANODES \d+ ft2 + Foreign table "public.ft2" + Column | Type | Collation | Nullable | Default | FDW options | Storage | 
Stats target | Description +--------+---------+-----------+----------+---------+-------------+----------+--------------+------------- + f1 | integer | | not null | | | plain | | + f2 | text | | | | | extended | | + f3 | date | | | | | plain | | +Check constraints: + "f2_check" CHECK (f2 <> ''::text) + "pt1chk2" CHECK (f2 <> ''::text) +Server: s0 +FDW options: (delimiter ',', quote '"', "be quoted" 'value') +Inherits: pt1 + -- TRUNCATE doesn't work on foreign tables, either directly or recursively TRUNCATE ft2; -- ERROR -ERROR: relation "ft2" does not exist +ERROR: "ft2" is not a table TRUNCATE pt1; -- ERROR +ERROR: "ft2" is not a table DROP TABLE pt1 CASCADE; +NOTICE: drop cascades to foreign table ft2 -- IMPORT FOREIGN SCHEMA IMPORT FOREIGN SCHEMA s1 FROM SERVER s9 INTO public; -- ERROR -ERROR: server "s9" does not exist +ERROR: foreign-data wrapper "foo" has no handler IMPORT FOREIGN SCHEMA s1 LIMIT TO (t1) FROM SERVER s9 INTO public; --ERROR -ERROR: server "s9" does not exist +ERROR: foreign-data wrapper "foo" has no handler IMPORT FOREIGN SCHEMA s1 EXCEPT (t1) FROM SERVER s9 INTO public; -- ERROR -ERROR: server "s9" does not exist +ERROR: foreign-data wrapper "foo" has no handler IMPORT FOREIGN SCHEMA s1 EXCEPT (t1, t2) FROM SERVER s9 INTO public OPTIONS (option1 'value1', option2 'value2'); -- ERROR -ERROR: server "s9" does not exist +ERROR: foreign-data wrapper "foo" has no handler -- DROP FOREIGN TABLE DROP FOREIGN TABLE no_table; -- ERROR ERROR: foreign table "no_table" does not exist DROP FOREIGN TABLE IF EXISTS no_table; NOTICE: foreign table "no_table" does not exist, skipping DROP FOREIGN TABLE foreign_schema.foreign_table_1; -ERROR: foreign table "foreign_table_1" does not exist -- REASSIGN OWNED/DROP OWNED of foreign objects REASSIGN OWNED BY regress_test_role TO regress_test_role2; DROP OWNED BY regress_test_role2; +ERROR: cannot drop desired object(s) because other objects depend on them +DETAIL: user mapping for regress_test_role on server s5 depends on server s5 +HINT: Use DROP ... CASCADE to drop the dependent objects too. 
DROP OWNED BY regress_test_role2 CASCADE; +NOTICE: drop cascades to user mapping for regress_test_role on server s5 -- Foreign partition DDL stuff CREATE TABLE pt2 ( c1 integer NOT NULL, @@ -1482,7 +1813,6 @@ CREATE TABLE pt2 ( ) PARTITION BY LIST (c1); CREATE FOREIGN TABLE pt2_1 PARTITION OF pt2 FOR VALUES IN (1) SERVER s0 OPTIONS (delimiter ',', quote '"', "be quoted" 'value'); -ERROR: server "s0" does not exist \d+ pt2 Table "public.pt2" Column | Type | Collation | Nullable | Default | Storage | Stats target | Description @@ -1491,26 +1821,45 @@ ERROR: server "s0" does not exist c2 | text | | | | extended | | c3 | date | | | | plain | | Partition key: LIST (c1) -Number of partitions: 0 +Partitions: pt2_1 FOR VALUES IN (1) Distribute By: HASH(c1) Location Nodes: ALL DATANODES \d+ pt2_1 + Foreign table "public.pt2_1" + Column | Type | Collation | Nullable | Default | FDW options | Storage | Stats target | Description +--------+---------+-----------+----------+---------+-------------+----------+--------------+------------- + c1 | integer | | not null | | | plain | | + c2 | text | | | | | extended | | + c3 | date | | | | | plain | | +Partition of: pt2 FOR VALUES IN (1) +Partition constraint: ((c1 IS NOT NULL) AND (c1 = ANY (ARRAY[1]))) +Server: s0 +FDW options: (delimiter ',', quote '"', "be quoted" 'value') + -- partition cannot have additional columns DROP FOREIGN TABLE pt2_1; -ERROR: foreign table "pt2_1" does not exist CREATE FOREIGN TABLE pt2_1 ( c1 integer NOT NULL, c2 text, c3 date, c4 char ) SERVER s0 OPTIONS (delimiter ',', quote '"', "be quoted" 'value'); -ERROR: server "s0" does not exist \d+ pt2_1 + Foreign table "public.pt2_1" + Column | Type | Collation | Nullable | Default | FDW options | Storage | Stats target | Description +--------+--------------+-----------+----------+---------+-------------+----------+--------------+------------- + c1 | integer | | not null | | | plain | | + c2 | text | | | | | extended | | + c3 | date | | | | | plain | | + c4 | character(1) | | | | | extended | | +Server: s0 +FDW options: (delimiter ',', quote '"', "be quoted" 'value') + ALTER TABLE pt2 ATTACH PARTITION pt2_1 FOR VALUES IN (1); -- ERROR -ERROR: relation "pt2_1" does not exist +ERROR: table "pt2_1" contains column "c4" not found in parent "pt2" +DETAIL: New partition should contain only the columns present in parent. 
DROP FOREIGN TABLE pt2_1; -ERROR: foreign table "pt2_1" does not exist \d+ pt2 Table "public.pt2" Column | Type | Collation | Nullable | Default | Storage | Stats target | Description @@ -1528,11 +1877,18 @@ CREATE FOREIGN TABLE pt2_1 ( c2 text, c3 date ) SERVER s0 OPTIONS (delimiter ',', quote '"', "be quoted" 'value'); -ERROR: server "s0" does not exist \d+ pt2_1 + Foreign table "public.pt2_1" + Column | Type | Collation | Nullable | Default | FDW options | Storage | Stats target | Description +--------+---------+-----------+----------+---------+-------------+----------+--------------+------------- + c1 | integer | | not null | | | plain | | + c2 | text | | | | | extended | | + c3 | date | | | | | plain | | +Server: s0 +FDW options: (delimiter ',', quote '"', "be quoted" 'value') + -- no attach partition validation occurs for foreign tables ALTER TABLE pt2 ATTACH PARTITION pt2_1 FOR VALUES IN (1); -ERROR: relation "pt2_1" does not exist \d+ pt2 Table "public.pt2" Column | Type | Collation | Nullable | Default | Storage | Stats target | Description @@ -1541,19 +1897,28 @@ ERROR: relation "pt2_1" does not exist c2 | text | | | | extended | | c3 | date | | | | plain | | Partition key: LIST (c1) -Number of partitions: 0 +Partitions: pt2_1 FOR VALUES IN (1) Distribute By: HASH(c1) Location Nodes: ALL DATANODES \d+ pt2_1 + Foreign table "public.pt2_1" + Column | Type | Collation | Nullable | Default | FDW options | Storage | Stats target | Description +--------+---------+-----------+----------+---------+-------------+----------+--------------+------------- + c1 | integer | | not null | | | plain | | + c2 | text | | | | | extended | | + c3 | date | | | | | plain | | +Partition of: pt2 FOR VALUES IN (1) +Partition constraint: ((c1 IS NOT NULL) AND (c1 = ANY (ARRAY[1]))) +Server: s0 +FDW options: (delimiter ',', quote '"', "be quoted" 'value') + -- cannot add column to a partition ALTER TABLE pt2_1 ADD c4 char; -ERROR: relation "pt2_1" does not exist +ERROR: cannot add column to a partition -- ok to have a partition's own constraints though ALTER TABLE pt2_1 ALTER c3 SET NOT NULL; -ERROR: relation "pt2_1" does not exist ALTER TABLE pt2_1 ADD CONSTRAINT p21chk CHECK (c2 <> ''); -ERROR: relation "pt2_1" does not exist \d+ pt2 Table "public.pt2" Column | Type | Collation | Nullable | Default | Storage | Stats target | Description @@ -1562,17 +1927,29 @@ ERROR: relation "pt2_1" does not exist c2 | text | | | | extended | | c3 | date | | | | plain | | Partition key: LIST (c1) -Number of partitions: 0 +Partitions: pt2_1 FOR VALUES IN (1) Distribute By: HASH(c1) Location Nodes: ALL DATANODES \d+ pt2_1 + Foreign table "public.pt2_1" + Column | Type | Collation | Nullable | Default | FDW options | Storage | Stats target | Description +--------+---------+-----------+----------+---------+-------------+----------+--------------+------------- + c1 | integer | | not null | | | plain | | + c2 | text | | | | | extended | | + c3 | date | | not null | | | plain | | +Partition of: pt2 FOR VALUES IN (1) +Partition constraint: ((c1 IS NOT NULL) AND (c1 = ANY (ARRAY[1]))) +Check constraints: + "p21chk" CHECK (c2 <> ''::text) +Server: s0 +FDW options: (delimiter ',', quote '"', "be quoted" 'value') + -- cannot drop inherited NOT NULL constraint from a partition ALTER TABLE pt2_1 ALTER c1 DROP NOT NULL; -ERROR: relation "pt2_1" does not exist +ERROR: column "c1" is marked NOT NULL in parent table -- partition must have parent's constraints ALTER TABLE pt2 DETACH PARTITION pt2_1; -ERROR: relation "pt2_1" does not exist 
ALTER TABLE pt2 ALTER c2 SET NOT NULL; \d+ pt2 Table "public.pt2" @@ -1587,14 +1964,22 @@ Distribute By: HASH(c1) Location Nodes: ALL DATANODES \d+ pt2_1 + Foreign table "public.pt2_1" + Column | Type | Collation | Nullable | Default | FDW options | Storage | Stats target | Description +--------+---------+-----------+----------+---------+-------------+----------+--------------+------------- + c1 | integer | | not null | | | plain | | + c2 | text | | | | | extended | | + c3 | date | | not null | | | plain | | +Check constraints: + "p21chk" CHECK (c2 <> ''::text) +Server: s0 +FDW options: (delimiter ',', quote '"', "be quoted" 'value') + ALTER TABLE pt2 ATTACH PARTITION pt2_1 FOR VALUES IN (1); -- ERROR -ERROR: relation "pt2_1" does not exist +ERROR: column "c2" in child table must be marked NOT NULL ALTER FOREIGN TABLE pt2_1 ALTER c2 SET NOT NULL; -ERROR: relation "pt2_1" does not exist ALTER TABLE pt2 ATTACH PARTITION pt2_1 FOR VALUES IN (1); -ERROR: relation "pt2_1" does not exist ALTER TABLE pt2 DETACH PARTITION pt2_1; -ERROR: relation "pt2_1" does not exist ALTER TABLE pt2 ADD CONSTRAINT pt2chk1 CHECK (c1 > 0); \d+ pt2 Table "public.pt2" @@ -1611,46 +1996,55 @@ Distribute By: HASH(c1) Location Nodes: ALL DATANODES \d+ pt2_1 + Foreign table "public.pt2_1" + Column | Type | Collation | Nullable | Default | FDW options | Storage | Stats target | Description +--------+---------+-----------+----------+---------+-------------+----------+--------------+------------- + c1 | integer | | not null | | | plain | | + c2 | text | | not null | | | extended | | + c3 | date | | not null | | | plain | | +Check constraints: + "p21chk" CHECK (c2 <> ''::text) +Server: s0 +FDW options: (delimiter ',', quote '"', "be quoted" 'value') + ALTER TABLE pt2 ATTACH PARTITION pt2_1 FOR VALUES IN (1); -- ERROR -ERROR: relation "pt2_1" does not exist +ERROR: child table is missing constraint "pt2chk1" ALTER FOREIGN TABLE pt2_1 ADD CONSTRAINT pt2chk1 CHECK (c1 > 0); -ERROR: relation "pt2_1" does not exist ALTER TABLE pt2 ATTACH PARTITION pt2_1 FOR VALUES IN (1); -ERROR: relation "pt2_1" does not exist -- TRUNCATE doesn't work on foreign tables, either directly or recursively TRUNCATE pt2_1; -- ERROR -ERROR: relation "pt2_1" does not exist +ERROR: "pt2_1" is not a table TRUNCATE pt2; -- ERROR +ERROR: "pt2_1" is not a table DROP FOREIGN TABLE pt2_1; -ERROR: foreign table "pt2_1" does not exist DROP TABLE pt2; -- Cleanup DROP SCHEMA foreign_schema CASCADE; DROP ROLE regress_test_role; -- ERROR +ERROR: role "regress_test_role" cannot be dropped because some objects depend on it +DETAIL: privileges for server s4 +privileges for foreign-data wrapper foo +owner of user mapping for regress_test_role on server s6 DROP SERVER t1 CASCADE; -ERROR: server "t1" does not exist +NOTICE: drop cascades to user mapping for public on server t1 DROP USER MAPPING FOR regress_test_role SERVER s6; -ERROR: role "regress_test_role" does not exist \set VERBOSITY terse DROP FOREIGN DATA WRAPPER foo CASCADE; -ERROR: foreign-data wrapper "foo" does not exist +NOTICE: drop cascades to 5 other objects DROP SERVER s8 CASCADE; -ERROR: server "s8" does not exist +NOTICE: drop cascades to 2 other objects \set VERBOSITY default DROP ROLE regress_test_indirect; -ERROR: role "regress_test_indirect" does not exist DROP ROLE regress_test_role; -ERROR: role "regress_test_role" does not exist DROP ROLE regress_unprivileged_role; -- ERROR +ERROR: role "regress_unprivileged_role" cannot be dropped because some objects depend on it +DETAIL: privileges for 
foreign-data wrapper postgresql REVOKE ALL ON FOREIGN DATA WRAPPER postgresql FROM regress_unprivileged_role; -ERROR: foreign-data wrapper "postgresql" does not exist DROP ROLE regress_unprivileged_role; -ERROR: role "regress_unprivileged_role" does not exist DROP ROLE regress_test_role2; DROP FOREIGN DATA WRAPPER postgresql CASCADE; -ERROR: foreign-data wrapper "postgresql" does not exist DROP FOREIGN DATA WRAPPER dummy CASCADE; -ERROR: foreign-data wrapper "dummy" does not exist +NOTICE: drop cascades to server s0 \c DROP ROLE regress_foreign_data_user; -- At this point we should have no wrappers, no servers, and no mappings. diff --git a/src/test/regress/expected/object_address.out b/src/test/regress/expected/object_address.out index 75cc6638..c0a5ceac 100644 --- a/src/test/regress/expected/object_address.out +++ b/src/test/regress/expected/object_address.out @@ -10,11 +10,7 @@ CREATE USER regress_addr_user; CREATE SCHEMA addr_nsp; SET search_path TO 'addr_nsp'; CREATE FOREIGN DATA WRAPPER addr_fdw; -ERROR: Postgres-XL does not support FOREIGN DATA WRAPPER yet -DETAIL: The feature is not currently supported CREATE SERVER addr_fserv FOREIGN DATA WRAPPER addr_fdw; -ERROR: Postgres-XL does not support SERVER yet -DETAIL: The feature is not currently supported CREATE TEXT SEARCH DICTIONARY addr_ts_dict (template=simple); CREATE TEXT SEARCH CONFIGURATION addr_ts_conf (copy=english); CREATE TEXT SEARCH TEMPLATE addr_ts_temp (lexize=dsimple_lexize); @@ -28,7 +24,6 @@ CREATE MATERIALIZED VIEW addr_nsp.genmatview AS SELECT * FROM addr_nsp.gentable; CREATE TYPE addr_nsp.gencomptype AS (a int); CREATE TYPE addr_nsp.genenum AS ENUM ('one', 'two'); CREATE FOREIGN TABLE addr_nsp.genftable (a int) SERVER addr_fserv; -ERROR: server "addr_fserv" does not exist CREATE AGGREGATE addr_nsp.genaggr(int4) (sfunc = int4pl, stype = int4); CREATE DOMAIN addr_nsp.gendomain AS int4 CONSTRAINT domconstr CHECK (value > 0); CREATE FUNCTION addr_nsp.trig() RETURNS TRIGGER LANGUAGE plpgsql AS $$ BEGIN END; $$; @@ -37,21 +32,17 @@ ERROR: Postgres-XL does not support TRIGGER yet DETAIL: The feature is not currently supported CREATE POLICY genpol ON addr_nsp.gentable; CREATE SERVER "integer" FOREIGN DATA WRAPPER addr_fdw; -ERROR: Postgres-XL does not support SERVER yet -DETAIL: The feature is not currently supported CREATE USER MAPPING FOR regress_addr_user SERVER "integer"; -ERROR: Postgres-XL does not support USER MAPPING yet -DETAIL: The feature is not currently supported ALTER DEFAULT PRIVILEGES FOR ROLE regress_addr_user IN SCHEMA public GRANT ALL ON TABLES TO regress_addr_user; ALTER DEFAULT PRIVILEGES FOR ROLE regress_addr_user REVOKE DELETE ON TABLES FROM regress_addr_user; CREATE TRANSFORM FOR int LANGUAGE SQL ( FROM SQL WITH FUNCTION varchar_transform(internal), TO SQL WITH FUNCTION int4recv(internal)); CREATE PUBLICATION addr_pub FOR TABLE addr_nsp.gentable; -ERROR: Postgres-XL does not support CREATE PUBLICATION +ERROR: COORDINATOR does not support CREATE PUBLICATION DETAIL: The feature is not currently supported CREATE SUBSCRIPTION addr_sub CONNECTION '' PUBLICATION bar WITH (connect = false, slot_name = NONE); -ERROR: Postgres-XL does not support CREATE SUBSCRIPTION +ERROR: COORDINATOR only supports CREATE TBASE SUBSCRIPTION DETAIL: The feature is not currently supported CREATE STATISTICS addr_nsp.gentable_stat ON a, b FROM addr_nsp.gentable; -- test some error cases @@ -77,8 +68,12 @@ BEGIN END LOOP; END; $$; -ERROR: Internal subtransactions not supported in Postgres-XL -CONTEXT: PL/pgSQL function 
inline_code_block line 8 during statement block entry +WARNING: error for toast table: unsupported object type "toast table" +WARNING: error for index column: unsupported object type "index column" +WARNING: error for sequence column: unsupported object type "sequence column" +WARNING: error for toast table column: unsupported object type "toast table column" +WARNING: error for view column: unsupported object type "view column" +WARNING: error for materialized view column: unsupported object type "materialized view column" -- miscellaneous other errors select * from pg_get_object_address('operator of access method', '{btree,integer_ops,1}', '{int4,bool}'); ERROR: operator 1 (int4, bool) of operator family integer_ops for access method btree does not exist @@ -121,8 +116,198 @@ BEGIN END LOOP; END; $$; -ERROR: Internal subtransactions not supported in Postgres-XL -CONTEXT: PL/pgSQL function inline_code_block line 24 during statement block entry +WARNING: error for table,{eins},{}: relation "eins" does not exist +WARNING: error for table,{eins},{integer}: relation "eins" does not exist +WARNING: error for table,{addr_nsp,zwei},{}: relation "addr_nsp.zwei" does not exist +WARNING: error for table,{addr_nsp,zwei},{integer}: relation "addr_nsp.zwei" does not exist +WARNING: error for table,{eins,zwei,drei},{}: cross-database references are not implemented: "eins.zwei.drei" +WARNING: error for table,{eins,zwei,drei},{integer}: cross-database references are not implemented: "eins.zwei.drei" +WARNING: error for index,{eins},{}: relation "eins" does not exist +WARNING: error for index,{eins},{integer}: relation "eins" does not exist +WARNING: error for index,{addr_nsp,zwei},{}: relation "addr_nsp.zwei" does not exist +WARNING: error for index,{addr_nsp,zwei},{integer}: relation "addr_nsp.zwei" does not exist +WARNING: error for index,{eins,zwei,drei},{}: cross-database references are not implemented: "eins.zwei.drei" +WARNING: error for index,{eins,zwei,drei},{integer}: cross-database references are not implemented: "eins.zwei.drei" +WARNING: error for sequence,{eins},{}: relation "eins" does not exist +WARNING: error for sequence,{eins},{integer}: relation "eins" does not exist +WARNING: error for sequence,{addr_nsp,zwei},{}: relation "addr_nsp.zwei" does not exist +WARNING: error for sequence,{addr_nsp,zwei},{integer}: relation "addr_nsp.zwei" does not exist +WARNING: error for sequence,{eins,zwei,drei},{}: cross-database references are not implemented: "eins.zwei.drei" +WARNING: error for sequence,{eins,zwei,drei},{integer}: cross-database references are not implemented: "eins.zwei.drei" +WARNING: error for view,{eins},{}: relation "eins" does not exist +WARNING: error for view,{eins},{integer}: relation "eins" does not exist +WARNING: error for view,{addr_nsp,zwei},{}: relation "addr_nsp.zwei" does not exist +WARNING: error for view,{addr_nsp,zwei},{integer}: relation "addr_nsp.zwei" does not exist +WARNING: error for view,{eins,zwei,drei},{}: cross-database references are not implemented: "eins.zwei.drei" +WARNING: error for view,{eins,zwei,drei},{integer}: cross-database references are not implemented: "eins.zwei.drei" +WARNING: error for materialized view,{eins},{}: relation "eins" does not exist +WARNING: error for materialized view,{eins},{integer}: relation "eins" does not exist +WARNING: error for materialized view,{addr_nsp,zwei},{}: relation "addr_nsp.zwei" does not exist +WARNING: error for materialized view,{addr_nsp,zwei},{integer}: relation "addr_nsp.zwei" does not exist +WARNING: 
error for materialized view,{eins,zwei,drei},{}: cross-database references are not implemented: "eins.zwei.drei" +WARNING: error for materialized view,{eins,zwei,drei},{integer}: cross-database references are not implemented: "eins.zwei.drei" +WARNING: error for foreign table,{eins},{}: relation "eins" does not exist +WARNING: error for foreign table,{eins},{integer}: relation "eins" does not exist +WARNING: error for foreign table,{addr_nsp,zwei},{}: relation "addr_nsp.zwei" does not exist +WARNING: error for foreign table,{addr_nsp,zwei},{integer}: relation "addr_nsp.zwei" does not exist +WARNING: error for foreign table,{eins,zwei,drei},{}: cross-database references are not implemented: "eins.zwei.drei" +WARNING: error for foreign table,{eins,zwei,drei},{integer}: cross-database references are not implemented: "eins.zwei.drei" +WARNING: error for table column,{eins},{}: column name must be qualified +WARNING: error for table column,{eins},{integer}: column name must be qualified +WARNING: error for table column,{addr_nsp,zwei},{}: relation "addr_nsp" does not exist +WARNING: error for table column,{addr_nsp,zwei},{integer}: relation "addr_nsp" does not exist +WARNING: error for table column,{eins,zwei,drei},{}: schema "eins" does not exist +WARNING: error for table column,{eins,zwei,drei},{integer}: schema "eins" does not exist +WARNING: error for foreign table column,{eins},{}: column name must be qualified +WARNING: error for foreign table column,{eins},{integer}: column name must be qualified +WARNING: error for foreign table column,{addr_nsp,zwei},{}: relation "addr_nsp" does not exist +WARNING: error for foreign table column,{addr_nsp,zwei},{integer}: relation "addr_nsp" does not exist +WARNING: error for foreign table column,{eins,zwei,drei},{}: schema "eins" does not exist +WARNING: error for foreign table column,{eins,zwei,drei},{integer}: schema "eins" does not exist +WARNING: error for aggregate,{eins},{}: aggregate eins(*) does not exist +WARNING: error for aggregate,{eins},{integer}: aggregate eins(integer) does not exist +WARNING: error for aggregate,{addr_nsp,zwei},{}: aggregate addr_nsp.zwei(*) does not exist +WARNING: error for aggregate,{addr_nsp,zwei},{integer}: aggregate addr_nsp.zwei(integer) does not exist +WARNING: error for aggregate,{eins,zwei,drei},{}: cross-database references are not implemented: eins.zwei.drei +WARNING: error for aggregate,{eins,zwei,drei},{integer}: cross-database references are not implemented: eins.zwei.drei +WARNING: error for function,{eins},{}: function eins() does not exist +WARNING: error for function,{eins},{integer}: function eins(integer) does not exist +WARNING: error for function,{addr_nsp,zwei},{}: function addr_nsp.zwei() does not exist +WARNING: error for function,{addr_nsp,zwei},{integer}: function addr_nsp.zwei(integer) does not exist +WARNING: error for function,{eins,zwei,drei},{}: cross-database references are not implemented: eins.zwei.drei +WARNING: error for function,{eins,zwei,drei},{integer}: cross-database references are not implemented: eins.zwei.drei +WARNING: error for type,{eins},{}: type "eins" does not exist +WARNING: error for type,{eins},{integer}: type "eins" does not exist +WARNING: error for type,{addr_nsp,zwei},{}: name list length must be exactly 1 +WARNING: error for type,{addr_nsp,zwei},{integer}: name list length must be exactly 1 +WARNING: error for type,{eins,zwei,drei},{}: name list length must be exactly 1 +WARNING: error for type,{eins,zwei,drei},{integer}: name list length must be exactly 1 
+WARNING: error for cast,{eins},{}: argument list length must be exactly 1 +WARNING: error for cast,{eins},{integer}: type "eins" does not exist +WARNING: error for cast,{addr_nsp,zwei},{}: name list length must be exactly 1 +WARNING: error for cast,{addr_nsp,zwei},{integer}: name list length must be exactly 1 +WARNING: error for cast,{eins,zwei,drei},{}: name list length must be exactly 1 +WARNING: error for cast,{eins,zwei,drei},{integer}: name list length must be exactly 1 +WARNING: error for table constraint,{eins},{}: must specify relation and object name +WARNING: error for table constraint,{eins},{integer}: must specify relation and object name +WARNING: error for table constraint,{addr_nsp,zwei},{}: relation "addr_nsp" does not exist +WARNING: error for table constraint,{addr_nsp,zwei},{integer}: relation "addr_nsp" does not exist +WARNING: error for table constraint,{eins,zwei,drei},{}: schema "eins" does not exist +WARNING: error for table constraint,{eins,zwei,drei},{integer}: schema "eins" does not exist +WARNING: error for domain constraint,{eins},{}: argument list length must be exactly 1 +WARNING: error for domain constraint,{eins},{integer}: type "eins" does not exist +WARNING: error for domain constraint,{addr_nsp,zwei},{}: name list length must be exactly 1 +WARNING: error for domain constraint,{addr_nsp,zwei},{integer}: name list length must be exactly 1 +WARNING: error for domain constraint,{eins,zwei,drei},{}: name list length must be exactly 1 +WARNING: error for domain constraint,{eins,zwei,drei},{integer}: name list length must be exactly 1 +WARNING: error for conversion,{eins},{}: conversion "eins" does not exist +WARNING: error for conversion,{eins},{integer}: conversion "eins" does not exist +WARNING: error for conversion,{addr_nsp,zwei},{}: conversion "addr_nsp.zwei" does not exist +WARNING: error for conversion,{addr_nsp,zwei},{integer}: conversion "addr_nsp.zwei" does not exist +WARNING: error for conversion,{eins,zwei,drei},{}: cross-database references are not implemented: eins.zwei.drei +WARNING: error for conversion,{eins,zwei,drei},{integer}: cross-database references are not implemented: eins.zwei.drei +WARNING: error for default value,{eins},{}: column name must be qualified +WARNING: error for default value,{eins},{integer}: column name must be qualified +WARNING: error for default value,{addr_nsp,zwei},{}: relation "addr_nsp" does not exist +WARNING: error for default value,{addr_nsp,zwei},{integer}: relation "addr_nsp" does not exist +WARNING: error for default value,{eins,zwei,drei},{}: schema "eins" does not exist +WARNING: error for default value,{eins,zwei,drei},{integer}: schema "eins" does not exist +WARNING: error for operator,{eins},{}: argument list length must be exactly 2 +WARNING: error for operator,{eins},{integer}: argument list length must be exactly 2 +WARNING: error for operator,{addr_nsp,zwei},{}: argument list length must be exactly 2 +WARNING: error for operator,{addr_nsp,zwei},{integer}: argument list length must be exactly 2 +WARNING: error for operator,{eins,zwei,drei},{}: argument list length must be exactly 2 +WARNING: error for operator,{eins,zwei,drei},{integer}: argument list length must be exactly 2 +WARNING: error for operator class,{eins},{}: name list length must be at least 2 +WARNING: error for operator class,{eins},{integer}: name list length must be at least 2 +WARNING: error for operator class,{addr_nsp,zwei},{}: access method "addr_nsp" does not exist +WARNING: error for operator class,{addr_nsp,zwei},{integer}: 
access method "addr_nsp" does not exist +WARNING: error for operator class,{eins,zwei,drei},{}: access method "eins" does not exist +WARNING: error for operator class,{eins,zwei,drei},{integer}: access method "eins" does not exist +WARNING: error for operator family,{eins},{}: name list length must be at least 2 +WARNING: error for operator family,{eins},{integer}: name list length must be at least 2 +WARNING: error for operator family,{addr_nsp,zwei},{}: access method "addr_nsp" does not exist +WARNING: error for operator family,{addr_nsp,zwei},{integer}: access method "addr_nsp" does not exist +WARNING: error for operator family,{eins,zwei,drei},{}: access method "eins" does not exist +WARNING: error for operator family,{eins,zwei,drei},{integer}: access method "eins" does not exist +WARNING: error for rule,{eins},{}: must specify relation and object name +WARNING: error for rule,{eins},{integer}: must specify relation and object name +WARNING: error for rule,{addr_nsp,zwei},{}: relation "addr_nsp" does not exist +WARNING: error for rule,{addr_nsp,zwei},{integer}: relation "addr_nsp" does not exist +WARNING: error for rule,{eins,zwei,drei},{}: schema "eins" does not exist +WARNING: error for rule,{eins,zwei,drei},{integer}: schema "eins" does not exist +WARNING: error for trigger,{eins},{}: must specify relation and object name +WARNING: error for trigger,{eins},{integer}: must specify relation and object name +WARNING: error for trigger,{addr_nsp,zwei},{}: relation "addr_nsp" does not exist +WARNING: error for trigger,{addr_nsp,zwei},{integer}: relation "addr_nsp" does not exist +WARNING: error for trigger,{eins,zwei,drei},{}: schema "eins" does not exist +WARNING: error for trigger,{eins,zwei,drei},{integer}: schema "eins" does not exist +WARNING: error for text search parser,{eins},{}: text search parser "eins" does not exist +WARNING: error for text search parser,{eins},{integer}: text search parser "eins" does not exist +WARNING: error for text search parser,{addr_nsp,zwei},{}: text search parser "addr_nsp.zwei" does not exist +WARNING: error for text search parser,{addr_nsp,zwei},{integer}: text search parser "addr_nsp.zwei" does not exist +WARNING: error for text search parser,{eins,zwei,drei},{}: cross-database references are not implemented: eins.zwei.drei +WARNING: error for text search parser,{eins,zwei,drei},{integer}: cross-database references are not implemented: eins.zwei.drei +WARNING: error for text search dictionary,{eins},{}: text search dictionary "eins" does not exist +WARNING: error for text search dictionary,{eins},{integer}: text search dictionary "eins" does not exist +WARNING: error for text search dictionary,{addr_nsp,zwei},{}: text search dictionary "addr_nsp.zwei" does not exist +WARNING: error for text search dictionary,{addr_nsp,zwei},{integer}: text search dictionary "addr_nsp.zwei" does not exist +WARNING: error for text search dictionary,{eins,zwei,drei},{}: cross-database references are not implemented: eins.zwei.drei +WARNING: error for text search dictionary,{eins,zwei,drei},{integer}: cross-database references are not implemented: eins.zwei.drei +WARNING: error for text search template,{eins},{}: text search template "eins" does not exist +WARNING: error for text search template,{eins},{integer}: text search template "eins" does not exist +WARNING: error for text search template,{addr_nsp,zwei},{}: text search template "addr_nsp.zwei" does not exist +WARNING: error for text search template,{addr_nsp,zwei},{integer}: text search template 
"addr_nsp.zwei" does not exist +WARNING: error for text search template,{eins,zwei,drei},{}: cross-database references are not implemented: eins.zwei.drei +WARNING: error for text search template,{eins,zwei,drei},{integer}: cross-database references are not implemented: eins.zwei.drei +WARNING: error for text search configuration,{eins},{}: text search configuration "eins" does not exist +WARNING: error for text search configuration,{eins},{integer}: text search configuration "eins" does not exist +WARNING: error for text search configuration,{addr_nsp,zwei},{}: text search configuration "addr_nsp.zwei" does not exist +WARNING: error for text search configuration,{addr_nsp,zwei},{integer}: text search configuration "addr_nsp.zwei" does not exist +WARNING: error for text search configuration,{eins,zwei,drei},{}: cross-database references are not implemented: eins.zwei.drei +WARNING: error for text search configuration,{eins,zwei,drei},{integer}: cross-database references are not implemented: eins.zwei.drei +WARNING: error for policy,{eins},{}: must specify relation and object name +WARNING: error for policy,{eins},{integer}: must specify relation and object name +WARNING: error for policy,{addr_nsp,zwei},{}: relation "addr_nsp" does not exist +WARNING: error for policy,{addr_nsp,zwei},{integer}: relation "addr_nsp" does not exist +WARNING: error for policy,{eins,zwei,drei},{}: schema "eins" does not exist +WARNING: error for policy,{eins,zwei,drei},{integer}: schema "eins" does not exist +WARNING: error for user mapping,{eins},{}: argument list length must be exactly 1 +WARNING: error for user mapping,{eins},{integer}: user mapping for user "eins" on server "integer" does not exist +WARNING: error for user mapping,{addr_nsp,zwei},{}: argument list length must be exactly 1 +WARNING: error for user mapping,{addr_nsp,zwei},{integer}: user mapping for user "addr_nsp" on server "integer" does not exist +WARNING: error for user mapping,{eins,zwei,drei},{}: argument list length must be exactly 1 +WARNING: error for user mapping,{eins,zwei,drei},{integer}: user mapping for user "eins" on server "integer" does not exist +WARNING: error for default acl,{eins},{}: argument list length must be exactly 1 +WARNING: error for default acl,{eins},{integer}: unrecognized default ACL object type "i" +WARNING: error for default acl,{addr_nsp,zwei},{}: argument list length must be exactly 1 +WARNING: error for default acl,{addr_nsp,zwei},{integer}: unrecognized default ACL object type "i" +WARNING: error for default acl,{eins,zwei,drei},{}: argument list length must be exactly 1 +WARNING: error for default acl,{eins,zwei,drei},{integer}: unrecognized default ACL object type "i" +WARNING: error for transform,{eins},{}: argument list length must be exactly 1 +WARNING: error for transform,{eins},{integer}: type "eins" does not exist +WARNING: error for transform,{addr_nsp,zwei},{}: name list length must be exactly 1 +WARNING: error for transform,{addr_nsp,zwei},{integer}: name list length must be exactly 1 +WARNING: error for transform,{eins,zwei,drei},{}: name list length must be exactly 1 +WARNING: error for transform,{eins,zwei,drei},{integer}: name list length must be exactly 1 +WARNING: error for operator of access method,{eins},{}: name list length must be at least 3 +WARNING: error for operator of access method,{eins},{integer}: name list length must be at least 3 +WARNING: error for operator of access method,{addr_nsp,zwei},{}: name list length must be at least 3 +WARNING: error for operator of access 
method,{addr_nsp,zwei},{integer}: name list length must be at least 3 +WARNING: error for operator of access method,{eins,zwei,drei},{}: argument list length must be exactly 2 +WARNING: error for operator of access method,{eins,zwei,drei},{integer}: argument list length must be exactly 2 +WARNING: error for function of access method,{eins},{}: name list length must be at least 3 +WARNING: error for function of access method,{eins},{integer}: name list length must be at least 3 +WARNING: error for function of access method,{addr_nsp,zwei},{}: name list length must be at least 3 +WARNING: error for function of access method,{addr_nsp,zwei},{integer}: name list length must be at least 3 +WARNING: error for function of access method,{eins,zwei,drei},{}: argument list length must be exactly 2 +WARNING: error for function of access method,{eins,zwei,drei},{integer}: argument list length must be exactly 2 +WARNING: error for publication relation,{eins},{}: argument list length must be exactly 1 +WARNING: error for publication relation,{eins},{integer}: relation "eins" does not exist +WARNING: error for publication relation,{addr_nsp,zwei},{}: argument list length must be exactly 1 +WARNING: error for publication relation,{addr_nsp,zwei},{integer}: relation "addr_nsp.zwei" does not exist +WARNING: error for publication relation,{eins,zwei,drei},{}: argument list length must be exactly 1 +WARNING: error for publication relation,{eins,zwei,drei},{integer}: cross-database references are not implemented: "eins.zwei.drei" -- these object types cannot be qualified names SELECT pg_get_object_address('language', '{one}', '{}'); ERROR: language "one" does not exist @@ -278,11 +463,11 @@ SELECT (pg_identify_object(addr1.classid, addr1.objid, addr1.objsubid)).*, --- \set VERBOSITY terse \\ -- suppress cascade details DROP FOREIGN DATA WRAPPER addr_fdw CASCADE; -ERROR: foreign-data wrapper "addr_fdw" does not exist +NOTICE: drop cascades to 4 other objects DROP PUBLICATION addr_pub; ERROR: publication "addr_pub" does not exist DROP SUBSCRIPTION addr_sub; -ERROR: subscription "addr_sub" does not exist +ERROR: COORDINATOR only supports DROP TBASE SUBSCRIPTION DROP SCHEMA addr_nsp CASCADE; NOTICE: drop cascades to 12 other objects DROP OWNED BY regress_addr_user; diff --git a/src/test/regress/expected/rolenames.out b/src/test/regress/expected/rolenames.out index 1540568b..dce82f5d 100644 --- a/src/test/regress/expected/rolenames.out +++ b/src/test/regress/expected/rolenames.out @@ -609,59 +609,23 @@ SELECT p.proname, r.rolname -- CREATE USER MAPPING CREATE FOREIGN DATA WRAPPER test_wrapper; -ERROR: Postgres-XL does not support FOREIGN DATA WRAPPER yet -DETAIL: The feature is not currently supported CREATE SERVER sv1 FOREIGN DATA WRAPPER test_wrapper; -ERROR: Postgres-XL does not support SERVER yet -DETAIL: The feature is not currently supported CREATE SERVER sv2 FOREIGN DATA WRAPPER test_wrapper; -ERROR: Postgres-XL does not support SERVER yet -DETAIL: The feature is not currently supported CREATE SERVER sv3 FOREIGN DATA WRAPPER test_wrapper; -ERROR: Postgres-XL does not support SERVER yet -DETAIL: The feature is not currently supported CREATE SERVER sv4 FOREIGN DATA WRAPPER test_wrapper; -ERROR: Postgres-XL does not support SERVER yet -DETAIL: The feature is not currently supported CREATE SERVER sv5 FOREIGN DATA WRAPPER test_wrapper; -ERROR: Postgres-XL does not support SERVER yet -DETAIL: The feature is not currently supported CREATE SERVER sv6 FOREIGN DATA WRAPPER test_wrapper; -ERROR: Postgres-XL does not 
support SERVER yet -DETAIL: The feature is not currently supported CREATE SERVER sv7 FOREIGN DATA WRAPPER test_wrapper; -ERROR: Postgres-XL does not support SERVER yet -DETAIL: The feature is not currently supported CREATE SERVER sv8 FOREIGN DATA WRAPPER test_wrapper; -ERROR: Postgres-XL does not support SERVER yet -DETAIL: The feature is not currently supported CREATE SERVER sv9 FOREIGN DATA WRAPPER test_wrapper; -ERROR: Postgres-XL does not support SERVER yet -DETAIL: The feature is not currently supported CREATE USER MAPPING FOR CURRENT_USER SERVER sv1 OPTIONS (user 'CURRENT_USER'); -ERROR: Postgres-XL does not support USER MAPPING yet -DETAIL: The feature is not currently supported CREATE USER MAPPING FOR "current_user" SERVER sv2 OPTIONS (user '"current_user"'); -ERROR: Postgres-XL does not support USER MAPPING yet -DETAIL: The feature is not currently supported CREATE USER MAPPING FOR USER SERVER sv3 OPTIONS (user 'USER'); -ERROR: Postgres-XL does not support USER MAPPING yet -DETAIL: The feature is not currently supported CREATE USER MAPPING FOR "user" SERVER sv4 OPTIONS (user '"USER"'); -ERROR: Postgres-XL does not support USER MAPPING yet -DETAIL: The feature is not currently supported CREATE USER MAPPING FOR SESSION_USER SERVER sv5 OPTIONS (user 'SESSION_USER'); -ERROR: Postgres-XL does not support USER MAPPING yet -DETAIL: The feature is not currently supported CREATE USER MAPPING FOR PUBLIC SERVER sv6 OPTIONS (user 'PUBLIC'); -ERROR: Postgres-XL does not support USER MAPPING yet -DETAIL: The feature is not currently supported CREATE USER MAPPING FOR "Public" SERVER sv7 OPTIONS (user '"Public"'); -ERROR: Postgres-XL does not support USER MAPPING yet -DETAIL: The feature is not currently supported CREATE USER MAPPING FOR regress_testrolx SERVER sv8 OPTIONS (user 'regress_testrolx'); -ERROR: Postgres-XL does not support USER MAPPING yet -DETAIL: The feature is not currently supported CREATE USER MAPPING FOR CURRENT_ROLE SERVER sv9 OPTIONS (user 'CURRENT_ROLE'); -- error ERROR: syntax error at or near "CURRENT_ROLE" @@ -669,38 +633,37 @@ LINE 1: CREATE USER MAPPING FOR CURRENT_ROLE SERVER sv9 ^ CREATE USER MAPPING FOR nonexistent SERVER sv9 OPTIONS (user 'nonexistent'); -- error; -ERROR: Postgres-XL does not support USER MAPPING yet -DETAIL: The feature is not currently supported +ERROR: role "nonexistent" does not exist SELECT * FROM chkumapping(); - umname | umserver | umoptions ---------+----------+----------- -(0 rows) + umname | umserver | umoptions +------------------+----------+--------------------------- + regress_testrol2 | sv1 | {user=CURRENT_USER} + current_user | sv2 | {"user=\"current_user\""} + regress_testrol2 | sv3 | {user=USER} + user | sv4 | {"user=\"USER\""} + regress_testrol1 | sv5 | {user=SESSION_USER} + | sv6 | {user=PUBLIC} + Public | sv7 | {"user=\"Public\""} + regress_testrolx | sv8 | {user=regress_testrolx} +(8 rows) -- ALTER USER MAPPING ALTER USER MAPPING FOR CURRENT_USER SERVER sv1 OPTIONS (SET user 'CURRENT_USER_alt'); -ERROR: server "sv1" does not exist ALTER USER MAPPING FOR "current_user" SERVER sv2 OPTIONS (SET user '"current_user"_alt'); -ERROR: server "sv2" does not exist ALTER USER MAPPING FOR USER SERVER sv3 OPTIONS (SET user 'USER_alt'); -ERROR: server "sv3" does not exist ALTER USER MAPPING FOR "user" SERVER sv4 OPTIONS (SET user '"user"_alt'); -ERROR: server "sv4" does not exist ALTER USER MAPPING FOR SESSION_USER SERVER sv5 OPTIONS (SET user 'SESSION_USER_alt'); -ERROR: server "sv5" does not exist ALTER USER MAPPING FOR PUBLIC SERVER sv6 
OPTIONS (SET user 'public_alt'); -ERROR: server "sv6" does not exist ALTER USER MAPPING FOR "Public" SERVER sv7 OPTIONS (SET user '"Public"_alt'); -ERROR: server "sv7" does not exist ALTER USER MAPPING FOR regress_testrolx SERVER sv8 OPTIONS (SET user 'regress_testrolx_alt'); -ERROR: server "sv8" does not exist ALTER USER MAPPING FOR CURRENT_ROLE SERVER sv9 OPTIONS (SET user 'CURRENT_ROLE_alt'); ERROR: syntax error at or near "CURRENT_ROLE" @@ -710,27 +673,27 @@ ALTER USER MAPPING FOR nonexistent SERVER sv9 OPTIONS (SET user 'nonexistent_alt'); -- error ERROR: role "nonexistent" does not exist SELECT * FROM chkumapping(); - umname | umserver | umoptions ---------+----------+----------- -(0 rows) + umname | umserver | umoptions +------------------+----------+------------------------------- + regress_testrol2 | sv1 | {user=CURRENT_USER_alt} + current_user | sv2 | {"user=\"current_user\"_alt"} + regress_testrol2 | sv3 | {user=USER_alt} + user | sv4 | {"user=\"user\"_alt"} + regress_testrol1 | sv5 | {user=SESSION_USER_alt} + | sv6 | {user=public_alt} + Public | sv7 | {"user=\"Public\"_alt"} + regress_testrolx | sv8 | {user=regress_testrolx_alt} +(8 rows) -- DROP USER MAPPING DROP USER MAPPING FOR CURRENT_USER SERVER sv1; -ERROR: server "sv1" does not exist DROP USER MAPPING FOR "current_user" SERVER sv2; -ERROR: server "sv2" does not exist DROP USER MAPPING FOR USER SERVER sv3; -ERROR: server "sv3" does not exist DROP USER MAPPING FOR "user" SERVER sv4; -ERROR: server "sv4" does not exist DROP USER MAPPING FOR SESSION_USER SERVER sv5; -ERROR: server "sv5" does not exist DROP USER MAPPING FOR PUBLIC SERVER sv6; -ERROR: server "sv6" does not exist DROP USER MAPPING FOR "Public" SERVER sv7; -ERROR: server "sv7" does not exist DROP USER MAPPING FOR regress_testrolx SERVER sv8; -ERROR: server "sv8" does not exist DROP USER MAPPING FOR CURRENT_ROLE SERVER sv9; -- error ERROR: syntax error at or near "CURRENT_ROLE" LINE 1: DROP USER MAPPING FOR CURRENT_ROLE SERVER sv9; @@ -743,86 +706,98 @@ SELECT * FROM chkumapping(); (0 rows) CREATE USER MAPPING FOR CURRENT_USER SERVER sv1 OPTIONS (user 'CURRENT_USER'); -ERROR: Postgres-XL does not support USER MAPPING yet -DETAIL: The feature is not currently supported CREATE USER MAPPING FOR "current_user" SERVER sv2 OPTIONS (user '"current_user"'); -ERROR: Postgres-XL does not support USER MAPPING yet -DETAIL: The feature is not currently supported CREATE USER MAPPING FOR USER SERVER sv3 OPTIONS (user 'USER'); -ERROR: Postgres-XL does not support USER MAPPING yet -DETAIL: The feature is not currently supported CREATE USER MAPPING FOR "user" SERVER sv4 OPTIONS (user '"USER"'); -ERROR: Postgres-XL does not support USER MAPPING yet -DETAIL: The feature is not currently supported CREATE USER MAPPING FOR SESSION_USER SERVER sv5 OPTIONS (user 'SESSION_USER'); -ERROR: Postgres-XL does not support USER MAPPING yet -DETAIL: The feature is not currently supported CREATE USER MAPPING FOR PUBLIC SERVER sv6 OPTIONS (user 'PUBLIC'); -ERROR: Postgres-XL does not support USER MAPPING yet -DETAIL: The feature is not currently supported CREATE USER MAPPING FOR "Public" SERVER sv7 OPTIONS (user '"Public"'); -ERROR: Postgres-XL does not support USER MAPPING yet -DETAIL: The feature is not currently supported CREATE USER MAPPING FOR regress_testrolx SERVER sv8 OPTIONS (user 'regress_testrolx'); -ERROR: Postgres-XL does not support USER MAPPING yet -DETAIL: The feature is not currently supported SELECT * FROM chkumapping(); - umname | umserver | umoptions 
---------+----------+----------- -(0 rows) + umname | umserver | umoptions +------------------+----------+--------------------------- + regress_testrol2 | sv1 | {user=CURRENT_USER} + current_user | sv2 | {"user=\"current_user\""} + regress_testrol2 | sv3 | {user=USER} + user | sv4 | {"user=\"USER\""} + regress_testrol1 | sv5 | {user=SESSION_USER} + | sv6 | {user=PUBLIC} + Public | sv7 | {"user=\"Public\""} + regress_testrolx | sv8 | {user=regress_testrolx} +(8 rows) -- DROP USER MAPPING IF EXISTS DROP USER MAPPING IF EXISTS FOR CURRENT_USER SERVER sv1; -NOTICE: server does not exist, skipping SELECT * FROM chkumapping(); - umname | umserver | umoptions ---------+----------+----------- -(0 rows) + umname | umserver | umoptions +------------------+----------+--------------------------- + current_user | sv2 | {"user=\"current_user\""} + regress_testrol2 | sv3 | {user=USER} + user | sv4 | {"user=\"USER\""} + regress_testrol1 | sv5 | {user=SESSION_USER} + | sv6 | {user=PUBLIC} + Public | sv7 | {"user=\"Public\""} + regress_testrolx | sv8 | {user=regress_testrolx} +(7 rows) DROP USER MAPPING IF EXISTS FOR "current_user" SERVER sv2; -NOTICE: server does not exist, skipping SELECT * FROM chkumapping(); - umname | umserver | umoptions ---------+----------+----------- -(0 rows) + umname | umserver | umoptions +------------------+----------+------------------------- + regress_testrol2 | sv3 | {user=USER} + user | sv4 | {"user=\"USER\""} + regress_testrol1 | sv5 | {user=SESSION_USER} + | sv6 | {user=PUBLIC} + Public | sv7 | {"user=\"Public\""} + regress_testrolx | sv8 | {user=regress_testrolx} +(6 rows) DROP USER MAPPING IF EXISTS FOR USER SERVER sv3; -NOTICE: server does not exist, skipping SELECT * FROM chkumapping(); - umname | umserver | umoptions ---------+----------+----------- -(0 rows) + umname | umserver | umoptions +------------------+----------+------------------------- + user | sv4 | {"user=\"USER\""} + regress_testrol1 | sv5 | {user=SESSION_USER} + | sv6 | {user=PUBLIC} + Public | sv7 | {"user=\"Public\""} + regress_testrolx | sv8 | {user=regress_testrolx} +(5 rows) DROP USER MAPPING IF EXISTS FOR "user" SERVER sv4; -NOTICE: server does not exist, skipping SELECT * FROM chkumapping(); - umname | umserver | umoptions ---------+----------+----------- -(0 rows) + umname | umserver | umoptions +------------------+----------+------------------------- + regress_testrol1 | sv5 | {user=SESSION_USER} + | sv6 | {user=PUBLIC} + Public | sv7 | {"user=\"Public\""} + regress_testrolx | sv8 | {user=regress_testrolx} +(4 rows) DROP USER MAPPING IF EXISTS FOR SESSION_USER SERVER sv5; -NOTICE: server does not exist, skipping SELECT * FROM chkumapping(); - umname | umserver | umoptions ---------+----------+----------- -(0 rows) + umname | umserver | umoptions +------------------+----------+------------------------- + | sv6 | {user=PUBLIC} + Public | sv7 | {"user=\"Public\""} + regress_testrolx | sv8 | {user=regress_testrolx} +(3 rows) DROP USER MAPPING IF EXISTS FOR PUBLIC SERVER sv6; -NOTICE: server does not exist, skipping SELECT * FROM chkumapping(); - umname | umserver | umoptions ---------+----------+----------- -(0 rows) + umname | umserver | umoptions +------------------+----------+------------------------- + Public | sv7 | {"user=\"Public\""} + regress_testrolx | sv8 | {user=regress_testrolx} +(2 rows) DROP USER MAPPING IF EXISTS FOR "Public" SERVER sv7; -NOTICE: server does not exist, skipping SELECT * FROM chkumapping(); - umname | umserver | umoptions ---------+----------+----------- -(0 rows) + 
umname | umserver | umoptions +------------------+----------+------------------------- + regress_testrolx | sv8 | {user=regress_testrolx} +(1 row) DROP USER MAPPING IF EXISTS FOR regress_testrolx SERVER sv8; -NOTICE: server does not exist, skipping SELECT * FROM chkumapping(); umname | umserver | umoptions --------+----------+----------- diff --git a/src/test/regress/expected/xl_limitations_1.out b/src/test/regress/expected/xl_limitations_1.out index 161cd7b4..b1dfd26c 100644 --- a/src/test/regress/expected/xl_limitations_1.out +++ b/src/test/regress/expected/xl_limitations_1.out @@ -730,12 +730,9 @@ SELECT sum(n) FROM t; --FDWs are not supported CREATE FOREIGN DATA WRAPPER xl_foo; -- ERROR -ERROR: Postgres-XL does not support FOREIGN DATA WRAPPER yet -DETAIL: The feature is not currently supported RESET ROLE; CREATE FOREIGN DATA WRAPPER xl_foo VALIDATOR postgresql_fdw_validator; -ERROR: Postgres-XL does not support FOREIGN DATA WRAPPER yet -DETAIL: The feature is not currently supported +ERROR: foreign-data wrapper "xl_foo" already exists --LISTEN/NOTIFY is not supported. Looks like they are supported now. --We would obviously have issues with LISTEN/NOTIFY if clients are connected to different coordinators. Need to test that manually as it is difficult via regression. --LISTEN notify_async1; From 26fafa56d99ba804d0bf0c21fe3dd6fb9fc9623e Mon Sep 17 00:00:00 2001 From: whalesong Date: Fri, 14 May 2021 16:18:02 +0800 Subject: [PATCH 373/578] 2pc files opt: add 2pc hash table on shmem (merge request 300), bugfix: add retry --- src/backend/access/transam/twophase.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c index f2fbc7e6..bf6f5c1b 100644 --- a/src/backend/access/transam/twophase.c +++ b/src/backend/access/transam/twophase.c @@ -168,7 +168,7 @@ int record_2pc_partitions = 32; #define MAX_2PC_INFO_SIZE (record_2pc_entry_size - MAX_TID_SIZE) #define DFLT_2PC_INFO_SIZE 1024 /* default size */ -#define MAX_RETRY_TIMES 2 +#define MAX_RETRY_TIMES 10 /* hash table entry for 2pc record */ typedef struct Cache2pcInfo @@ -3716,7 +3716,8 @@ void record_2pc_involved_nodes_xid(const char * tid, } else if (enable_2pc_entry_trace) { - elog(LOG, "[%s] %s is added to hash table", func, tid); + elog(LOG, "[%s] %s is added to hash table, entry: %p", + func, tid, entry); } memcpy(entry->info, content.data, size + 1); @@ -3830,6 +3831,8 @@ void record_2pc_commit_timestamp(const char *tid, GlobalTimestamp commit_timesta size = content.len; Assert(size == strlen(content.data)); + GET_2PC_FILE_PATH(path, tid); + while (NULL != record_2pc_cache && retry_times++ < MAX_RETRY_TIMES) { Assert(strlen(tid) < MAX_TID_SIZE); @@ -3930,6 +3933,12 @@ void record_2pc_commit_timestamp(const char *tid, GlobalTimestamp commit_timesta Assert(NULL == entry); print_record_2pc_cache(func); + if (0 == access(path, F_OK)) + { + elog(LOG, "[%s] %s found 2pc file %s", func, tid, path); + break; + } + pg_usleep(5000L); /* sleep 5ms */ } @@ -3938,8 +3947,6 @@ void record_2pc_commit_timestamp(const char *tid, GlobalTimestamp commit_timesta elog(LOG, "[%s] %s is not found in hash table, get from disk", func, tid); } - GET_2PC_FILE_PATH(path, tid); - /* the 2pc file exists already */ fd = PathNameOpenFile(path, O_RDWR | O_APPEND, S_IRUSR | S_IWUSR); if (fd < 0) @@ -3968,7 +3975,7 @@ void record_2pc_commit_timestamp(const char *tid, GlobalTimestamp commit_timesta } else { - elog(ERROR, "[%s] could not open file %s, errMsg: %s", 
+ elog(PANIC, "[%s] could not open file %s, errMsg: %s", func, path, strerror(errno)); } return; From 8b92338d071d6096cc8443fcbf07889734d9f9dd Mon Sep 17 00:00:00 2001 From: whalesong Date: Wed, 19 May 2021 17:37:48 +0800 Subject: [PATCH 374/578] 2pc files opt: add 2pc hash table on shmem (merge request 300), bugfix: rename conflict when startup --- src/backend/access/transam/twophase.c | 40 ++++++++++++++++++++++----- src/backend/utils/misc/guc.c | 4 +-- 2 files changed, 35 insertions(+), 9 deletions(-) diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c index bf6f5c1b..3124e174 100644 --- a/src/backend/access/transam/twophase.c +++ b/src/backend/access/transam/twophase.c @@ -158,7 +158,7 @@ bool enable_2pc_file_check = true; bool enable_2pc_entry_key_check = true; bool enable_2pc_entry_trace = false; -int record_2pc_cache_size = 50000; +int record_2pc_cache_size = 4096; int record_2pc_entry_size = 2048; int record_2pc_partitions = 32; @@ -4127,16 +4127,25 @@ void rename_2pc_records(const char *tid, TimestampTz timestamp) check_entry_key(tid, entry->key, func); check_2pc_file(tid, entry->info, func); + if (0 == access(new_path, F_OK)) + { if (RecoveryInProgress()) { - fd = PathNameOpenFile(new_path, O_RDWR | O_TRUNC | O_CREAT, - S_IRUSR | S_IWUSR); + elog(LOG, "[%s] file %s exist", func, new_path); } else { + elog(WARNING, "[%s] file %s exist", func, new_path); + } + if (0 != unlink(new_path)) + { + elog(ERROR, "[%s] could not unlink file %s, errMsg: %s", + func, new_path, strerror(errno)); + } + } + fd = PathNameOpenFile(new_path, O_RDWR | O_CREAT | O_EXCL, S_IRUSR | S_IWUSR); - } if (fd < 0) { elog(ERROR, "[%s] could not create file %s, errMsg: %s", @@ -4175,6 +4184,23 @@ void rename_2pc_records(const char *tid, TimestampTz timestamp) func, path, strerror(errno)); return; } + if (0 == access(new_path, F_OK)) + { + if (RecoveryInProgress()) + { + elog(LOG, "[%s] file %s exist", func, new_path); + } + else + { + elog(WARNING, "[%s] file %s exist", func, new_path); + } + if (0 != unlink(new_path)) + { + elog(WARNING, "[%s] could not unlink file %s, errMsg: %s", + func, new_path, strerror(errno)); + return; + } + } if (0 != link(path, new_path)) { elog(ERROR, "[%s] could not link file %s to %s, errMsg: %s", @@ -4396,7 +4422,7 @@ Record2pcCacheInit(void) flags = HASH_ELEM | HASH_PARTITION; record_2pc_cache = ShmemInitHash("Record 2pc Cache", - record_2pc_cache_size/2, record_2pc_cache_size, + record_2pc_cache_size, record_2pc_cache_size, &info, flags); } @@ -4406,10 +4432,10 @@ Record2pcCacheInit(void) Size Record2pcCacheSize(void) { - long cache_size = 0; + Size cache_size = 0; if (enable_2pc_file_cache) { - cache_size = (long)record_2pc_cache_size * record_2pc_entry_size; + cache_size = hash_estimate_size(record_2pc_cache_size, record_2pc_entry_size); } return cache_size; } diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 9c699458..5dda04d5 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -4789,7 +4789,7 @@ static struct config_int ConfigureNamesInt[] = gettext_noop("2PC info cache size."), }, &record_2pc_cache_size, - 50000, 100, INT_MAX, + 4096, 1, INT_MAX, NULL, NULL, NULL }, { @@ -4797,7 +4797,7 @@ static struct config_int ConfigureNamesInt[] = gettext_noop("2PC info cache entry size."), }, &record_2pc_entry_size, - 2048, 1200, INT_MAX, + 2048, 1028, INT_MAX, NULL, NULL, NULL }, { From 5c5838201896f8f635af4ad4cbb027df8435da7e Mon Sep 17 00:00:00 2001 From: whalesong Date: Wed, 19 May 
2021 17:48:42 +0800 Subject: [PATCH 375/578] 2pc files opt: add 2pc hash table on shmem (merge request 300), bugfix: rename conflict when startup, opt --- src/backend/utils/misc/guc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 5dda04d5..5066d491 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -4789,7 +4789,7 @@ static struct config_int ConfigureNamesInt[] = gettext_noop("2PC info cache size."), }, &record_2pc_cache_size, - 4096, 1, INT_MAX, + 4096, 32, INT_MAX, NULL, NULL, NULL }, { From 11badf21ed44b64facf64f5e5fc2b44bbada1c56 Mon Sep 17 00:00:00 2001 From: gregsun Date: Tue, 27 Apr 2021 12:40:00 +0800 Subject: [PATCH 376/578] Release sub-transaction also in datanode. http://tapd.oa.com/TBase_Oracle_Migration/bugtrace/bugs/view?bug_id=1020421696087091551 Conflicts: src/test/regress/parallel_schedule --- src/backend/pgxc/pool/execRemote.c | 4 + src/test/regress/expected/pl_bugs.out | 1944 +++++++++++++++++++++++ src/test/regress/parallel_schedule | 3 + src/test/regress/sql/pl_bugs.sql | 2052 +++++++++++++++++++++++++ 4 files changed, 4003 insertions(+) create mode 100644 src/test/regress/expected/pl_bugs.out create mode 100644 src/test/regress/sql/pl_bugs.sql diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index e02f262f..17e6f838 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -12915,7 +12915,11 @@ SubTranscation_PreCommit_Remote(void) ALLOCSET_DEFAULT_SIZES); old = MemoryContextSwitchTo(temp); /* Only local coord can send down commit_subtxn when exec plpgsql */ +#ifdef _PG_ORCL_ + if (InPlpgsqlFunc()) +#else if (InPlpgsqlFunc() && IS_PGXC_LOCAL_COORDINATOR) +#endif { pgxc_node_remote_commit(TXN_TYPE_CommitSubTxn, false); } diff --git a/src/test/regress/expected/pl_bugs.out b/src/test/regress/expected/pl_bugs.out new file mode 100644 index 00000000..0930dd68 --- /dev/null +++ b/src/test/regress/expected/pl_bugs.out @@ -0,0 +1,1944 @@ +CREATE SCHEMA sync; +SET search_path = sync, pg_catalog; +set enable_oracle_compatible to on; +-- +-- Name: func_getlastnetvalue(varchar2, date); Type: FUNCTION; Schema: sync; Owner: gregsun +-- +CREATE FUNCTION func_getlastnetvalue(v_fundcode varchar2, v_cdate date) RETURNS numeric + LANGUAGE plpgsql + AS $$ + declare v_netvalue text; +begin + begin + select p1 + into v_netvalue + from p + limit 1; + exception + when no_data_found then + return 1; + + end; + return 1; +end; + $$; +-- +-- Name: sp_b03_ts_remetrade(varchar2, varchar2, varchar2, varchar2); Type: PROCEDURE; Schema: sync; Owner: gregsun +-- +CREATE function sp_b03_ts_remetrade(p_start_date varchar2, p_work_date varchar2, INOUT err_num varchar2 DEFAULT 0, INOUT err_msg varchar2 DEFAULT NULL::varchar2) + LANGUAGE plpgsql + AS $$ + declare + V_START_DATE DATE; + V_END_DATE DATE; + V_WORK_DATE DATE; + V_SP_NAME VARCHAR(30); + V_TAB_LEVEL VARCHAR(20); + V_LOG_STEP_NO VARCHAR(20); + V_LOG_BEGIN_TIME DATE := SYSDATE; + V_LOG_END_TIME DATE; + V_LOG_ROWCOUNT NUMBER := 0; + V_ELAPSED NUMBER; + V_ALL_ELAPSED NUMBER; + V_STEP_DESC sys_stat_error_log.STEP_DESC%TYPE; +BEGIN + + V_SP_NAME := 'SP_B03_TS_REMETRADE'; + V_TAB_LEVEL := 'B'; + + IF P_START_DATE IS NULL + THEN + RAISE EXCEPTION 'P_START_DATE IS NULL!'; + ELSE + V_START_DATE := TO_DATE(P_START_DATE, 'YYYY-MM-DD'); + END IF; + IF P_WORK_DATE IS NULL + THEN + RAISE EXCEPTION 'P_WORK_DATE IS NULL!'; + ELSE + V_WORK_DATE := TO_DATE(P_WORK_DATE, 
'YYYY-MM-DD'); + END IF; + IF P_WORK_DATE IS NULL + THEN + RAISE EXCEPTION 'P_WORK_DATE IS NULL!'; + ELSE + V_END_DATE := TO_DATE(P_WORK_DATE, 'YYYY-MM-DD'); + END IF; + + + + V_LOG_STEP_NO := 'STEP_01'; + V_STEP_DESC := '清除目标表数据'; + V_LOG_BEGIN_TIME := SYSDATE; + V_LOG_ROWCOUNT := NULL; + CALL SP_PUB_INSERT_LOG_DATE(V_SP_NAME + , + V_TAB_LEVEL + , + V_LOG_STEP_NO + , + V_STEP_DESC + , + V_LOG_BEGIN_TIME + , + V_LOG_END_TIME + , + V_WORK_DATE + , + V_LOG_ROWCOUNT + , + V_ELAPSED + , + V_ALL_ELAPSED); + + CALL SP_PUB_DEL_TB('B03_TS_REMETRADE'); + /*DELETE FROM B03_TS_REMETRADE Y + WHERE Y.ENDDATE >=V_START_DATE;*/ + + GET DIAGNOSTICS V_LOG_ROWCOUNT = ROW_COUNT; + + + + CALL SP_PUB_UPDATE_LOG_DATE(V_SP_NAME + , + V_TAB_LEVEL + , + V_LOG_STEP_NO + , + V_LOG_BEGIN_TIME + , + SYSDATE::DATE + , + V_WORK_DATE + , + V_LOG_ROWCOUNT + , + (SYSDATE - V_LOG_BEGIN_TIME)::NUMERIC + , + V_ALL_ELAPSED); + + V_LOG_STEP_NO := 'STEP_02'; + V_STEP_DESC := '插入目标表B03_TS_REMETRADE'; + V_LOG_BEGIN_TIME := SYSDATE; + V_LOG_ROWCOUNT := NULL; + CALL SP_PUB_INSERT_LOG_DATE(V_SP_NAME, + V_TAB_LEVEL, + V_LOG_STEP_NO, + V_STEP_DESC, + V_LOG_BEGIN_TIME, + V_LOG_END_TIME, + V_WORK_DATE, + V_LOG_ROWCOUNT, + V_ELAPSED, + V_ALL_ELAPSED); + + INSERT INTO B03_TS_REMETRADE + (C_FUNDCODE, + C_FUNDNAME, + C_FUNDACCO, + F_NETVALUE, + C_AGENCYNAME, + C_CUSTNAME, + D_DATE, + D_CDATE, + F_CONFIRMBALANCE, + F_TRADEFARE, + F_CONFIRMSHARES, + F_RELBALANCE, + F_INTEREST, + INFO, + WORK_DATE, + LOAD_DATE) + SELECT A.C_FUNDCODE, + A.C_FUNDNAME, + A.C_FUNDACCO, + A.F_NETVALUE, + A.C_AGENCYNAME, + A.C_CUSTNAME, + A.D_DATE, + A.D_CDATE, + A.F_CONFIRMBALANCE, + A.F_TRADEFARE, + A.F_CONFIRMSHARES, + ABS(NVL(B.F_OCCURBALANCE, A.F_RELBALANCE)) F_RELBALANCE, + A.F_INTEREST, + NVL(DECODE(B.C_BUSINFLAG, + '02', + '申购', + '50', + '申购', + '74', + '申购', + '03', + '赎回'), + DECODE(A.C_BUSINFLAG, + '01', + '认购', + '02', + '申购', + '03', + '赎回', + '53', + '强制赎回', + '50', + '产品成立')) AS INFO, + V_WORK_DATE, + SYSDATE AS LOAD_DATE + FROM (SELECT A.C_FUNDCODE, + C.C_FUNDNAME, + A.C_FUNDACCO, + FUNC_GETLASTNETVALUE(A.C_FUNDCODE, A.D_CDATE) F_NETVALUE, + (SELECT C_AGENCYNAME + FROM S017_TAGENCYINFO + WHERE A.C_AGENCYNO = C_AGENCYNO) C_AGENCYNAME, + B.C_CUSTNAME, + TO_CHAR(A.D_DATE, 'yyyy-mm-dd') D_DATE, + TO_CHAR(A.D_CDATE, 'yyyy-mm-dd') D_CDATE, + DECODE(A.C_BUSINFLAG, + '03', + A.F_CONFIRMBALANCE + A.F_TRADEFARE, + '53', + A.F_CONFIRMBALANCE + A.F_TRADEFARE, + A.F_CONFIRMBALANCE) F_CONFIRMBALANCE, + A.F_TRADEFARE, + A.F_CONFIRMSHARES, + DECODE(A.C_BUSINFLAG, + '03', + A.F_CONFIRMBALANCE, + '53', + A.F_CONFIRMBALANCE, + A.F_CONFIRMBALANCE - A.F_TRADEFARE) F_RELBALANCE, + A.F_INTEREST, + A.C_BUSINFLAG, + A.C_CSERIALNO + FROM (SELECT D_DATE, + C_AGENCYNO, + DECODE(C_BUSINFLAG, + '03', + DECODE(C_IMPROPERREDEEM, + '3', + '100', + '5', + '100', + C_BUSINFLAG), + C_BUSINFLAG) C_BUSINFLAG, + C_FUNDACCO, + D_CDATE, + C_FUNDCODE, + F_CONFIRMBALANCE, + F_CONFIRMSHARES, + C_REQUESTNO, + F_TRADEFARE, + C_TRADEACCO, + F_INTEREST, + C_CSERIALNO, + L_SERIALNO, + L_CONTRACTSERIALNO + FROM S017_TCONFIRM_ALL T3 + UNION + SELECT D_DATE, + C_AGENCYNO, + '02' C_BUSINFLAG, + C_FUNDACCO, + D_LASTDATE AS D_CDATE, + C_FUNDCODE, + F_REINVESTBALANCE F_CONFIRMBALANCE, + F_REALSHARES F_CONFIRMSHARES, + '' C_REQUESTNO, + 0 F_TRADEFARE, + C_TRADEACCO, + 0 F_INTEREST, + C_CSERIALNO, + 0 L_SERIALNO, + L_CONTRACTSERIALNO + FROM S017_TDIVIDENDDETAIL T1 + WHERE T1.C_FLAG = '0') A + LEFT JOIN S017_TACCONET TACN + ON A.C_TRADEACCO = TACN.C_TRADEACCO + LEFT JOIN (SELECT * FROM S017_TACCOINFO WHERE 
C_ACCOUNTTYPE = 'A') X + ON A.C_FUNDACCO = X.C_FUNDACCO + LEFT JOIN S017_TTRUSTCLIENTINFO_ALL B + ON X.C_CUSTNO = B.C_CUSTNO + INNER JOIN S017_TFUNDINFO C + ON A.C_FUNDCODE = C.C_FUNDCODE + ) A + LEFT JOIN (SELECT ST1.D_CDATE, + ST1.C_FUNDCODE, + ST1.F_OCCURBALANCE, + ST1.C_BUSINFLAG, + ST1.C_FUNDACCO, + ST1.C_CSERIALNO + FROM S017_TSHARECURRENTS_ALL ST1 + WHERE ST1.C_BUSINFLAG <> '74' + UNION ALL + SELECT ST2.D_DATE AS D_CDATE, + ST2.C_FUNDCODE, + ST2.F_TOTALPROFIT AS F_OCCURBALANCE, + '74' AS C_BUSINFLAG, + ST2.C_FUNDACCO, + ST2.C_CSERIALNO + FROM S017_TDIVIDENDDETAIL ST2 + WHERE ST2.C_FLAG = '0') B + ON A.C_FUNDCODE = B.C_FUNDCODE + AND A.C_FUNDACCO = B.C_FUNDACCO + AND TO_DATE(A.D_CDATE, 'YYYY-MM-DD') = B.D_CDATE + AND A.C_CSERIALNO = B.C_CSERIALNO; + GET DIAGNOSTICS V_LOG_ROWCOUNT = ROW_COUNT; + + + CALL SP_PUB_UPDATE_LOG_DATE(V_SP_NAME, + V_TAB_LEVEL, + V_LOG_STEP_NO, + V_LOG_BEGIN_TIME, + SYSDATE, + V_WORK_DATE, + V_LOG_ROWCOUNT, + (SYSDATE - V_LOG_BEGIN_TIME)::NUMERIC, + V_ALL_ELAPSED); + ERR_NUM := 0; + ERR_MSG := 'NORMAL,SUCCESSFUL COMPLETION'; +END; + $$; +ERROR: invalid type name "sys_stat_error_log.STEP_DESC%TYPE" +LINE 16: V_STEP_DESC sys_stat_error_log.STEP_DESC%TYPE; + ^ +-- +-- Name: sp_pub_del_tb(varchar2); Type: PROCEDURE; Schema: sync; Owner: gregsun +-- +CREATE PROCEDURE sp_pub_del_tb(p_tab_name varchar2) + LANGUAGE plpgsql + AS $$ + declare n_sql varchar2(4000); +begin + + n_sql := 'truncate table '||p_tab_name; + + execute immediate n_sql; +exception + when no_data_found then null; + when others then raise; +end ; + $$; +ERROR: syntax error at or near "PROCEDURE" +LINE 1: CREATE PROCEDURE sp_pub_del_tb(p_tab_name varchar2) + ^ +-- +-- Name: sp_pub_insert_log_date(varchar2, varchar2, varchar2, varchar2, date, date, date, numeric, numeric, numeric); Type: PROCEDURE; Schema: sync; Owner: gregsun +-- +CREATE PROCEDURE sp_pub_insert_log_date(p_in_proc_name varchar2, p_in_tab_level varchar2, p_in_step_no varchar2, p_in_step_desc varchar2, p_in_begin_time date, p_in_end_time date, p_in_work_date date, p_in_row_num numeric, p_in_elapsed numeric, p_in_all_elapsed numeric) + LANGUAGE plpgsql + AS $$ + declare + BEGIN + INSERT INTO SYNC.SYS_STAT_ERROR_LOG + (PROC_NAME + ,TAB_LEVEL + ,STEP_NO + ,STEP_DESC + ,BEGIN_TIME + ,END_TIME + ,WORKDATE + ,ROW_NUM + ,ELAPSED + ,ALL_ELAPSED) + VALUES + (P_IN_PROC_NAME + ,P_IN_TAB_LEVEL + ,P_IN_STEP_NO + ,P_IN_STEP_DESC + ,P_IN_BEGIN_TIME + ,P_IN_END_TIME + ,P_IN_WORK_DATE + ,P_IN_ROW_NUM + ,P_IN_ELAPSED + ,P_IN_ALL_ELAPSED); + COMMIT; + END ; + $$; +ERROR: syntax error at or near "PROCEDURE" +LINE 1: CREATE PROCEDURE sp_pub_insert_log_date(p_in_proc_name varch... 
+ ^ +-- +-- Name: sp_pub_update_log_date(varchar2, varchar2, varchar2, date, date, date, numeric, numeric, numeric); Type: PROCEDURE; Schema: sync; Owner: gregsun +-- +CREATE PROCEDURE sp_pub_update_log_date(p_in_proc_name varchar2, p_in_tab_level varchar2, p_in_step_no varchar2, p_in_begin_time date, p_in_end_time date, p_in_work_date date, p_in_row_num numeric, p_in_elapsed numeric, p_in_all_elapsed numeric) + LANGUAGE plpgsql + AS $$ BEGIN + UPDATE SYNC.SYS_STAT_ERROR_LOG + SET END_TIME = P_IN_END_TIME + ,ROW_NUM = P_IN_ROW_NUM + ,ELAPSED = P_IN_ELAPSED + ,ALL_ELAPSED = P_IN_ALL_ELAPSED + WHERE PROC_NAME = P_IN_PROC_NAME + AND TAB_LEVEL = P_IN_TAB_LEVEL + AND STEP_NO = P_IN_STEP_NO + AND BEGIN_TIME = P_IN_BEGIN_TIME + AND WORKDATE = P_IN_WORK_DATE; + COMMIT; + END ; + $$; +ERROR: syntax error at or near "PROCEDURE" +LINE 1: CREATE PROCEDURE sp_pub_update_log_date(p_in_proc_name varch... + ^ +SET default_tablespace = ''; +SET default_with_oids = false; +-- +-- Name: b03_ts_remetrade; Type: TABLE; Schema: sync; Owner: gregsun +-- +CREATE TABLE b03_ts_remetrade ( + c_fundcode character varying(500) NOT NULL, + c_fundname character varying(4000), + c_fundacco character varying(30), + f_netvalue numeric(16,2), + c_agencyname character varying(4000), + c_custname character varying(4000), + d_date character varying(100), + d_cdate character varying(100), + f_confirmbalance numeric(16,2), + f_tradefare numeric(16,2), + f_confirmshares numeric(16,2), + f_relbalance numeric(16,2), + f_interest numeric(16,2), + info character varying(500), + work_date timestamp(0) without time zone, + load_date timestamp(0) without time zone +) +DISTRIBUTE BY SHARD (c_fundcode) to GROUP default_group; +NOTICE: Replica identity is needed for shard table, please add to this table through "alter table" command. +-- +-- Name: b03_ts_remetrade_bak; Type: TABLE; Schema: sync; Owner: gregsun +-- +CREATE TABLE b03_ts_remetrade_bak ( + c_fundcode character varying(500) NOT NULL, + c_fundname character varying(4000), + c_fundacco character varying(30), + f_netvalue numeric(16,2), + c_agencyname character varying(4000), + c_custname character varying(4000), + d_date character varying(100), + d_cdate character varying(100), + f_confirmbalance numeric(16,2), + f_tradefare numeric(16,2), + f_confirmshares numeric(16,2), + f_relbalance numeric(16,2), + f_interest numeric(16,2), + info character varying(500), + work_date timestamp(0) without time zone, + load_date timestamp(0) without time zone +) +DISTRIBUTE BY SHARD (c_fundcode) to GROUP default_group; +NOTICE: Replica identity is needed for shard table, please add to this table through "alter table" command. +-- +-- Name: ks0_fund_base_26; Type: TABLE; Schema: sync; Owner: gregsun +-- +CREATE TABLE ks0_fund_base_26 ( + id1 numeric(48,0) NOT NULL, + acc_cd character varying(500) NOT NULL, + tdate timestamp(0) without time zone NOT NULL, + ins_cd character varying(500) NOT NULL, + cost_price_asset numeric(30,8), + pcol character varying(50) +) +DISTRIBUTE BY SHARD (id1) to GROUP default_group; +NOTICE: Replica identity is needed for shard table, please add to this table through "alter table" command. 
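The NOTICE above spells out the follow-up it expects: give the shard table a replica identity via ALTER TABLE. A minimal illustrative sketch (not part of this patch; the index name is invented here, and it assumes the NOT NULL key column id1 of ks0_fund_base_26 can be treated as unique):

-- Hypothetical follow-up, not from pl_bugs.out: add a replica identity to the shard table.
CREATE UNIQUE INDEX ks0_fund_base_26_id1_idx ON ks0_fund_base_26 (id1);
ALTER TABLE ks0_fund_base_26 REPLICA IDENTITY USING INDEX ks0_fund_base_26_id1_idx;
-- Or, if no suitable unique index exists, log the whole row instead:
-- ALTER TABLE ks0_fund_base_26 REPLICA IDENTITY FULL;

Either form supplies the replica identity the NOTICE asks for; which one is appropriate depends on whether id1 is actually unique in the data.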
+-- +-- Name: p; Type: TABLE; Schema: sync; Owner: gregsun +-- +CREATE TABLE p ( + p1 text, + p2 text +) +DISTRIBUTE BY HASH (p1); +-- +-- Name: s017_taccoinfo; Type: TABLE; Schema: sync; Owner: gregsun +-- +CREATE TABLE s017_taccoinfo ( + c_custno character varying(30) NOT NULL, + c_accounttype character(1), + c_fundacco character varying(30), + c_agencyno character(3), + c_netno character varying(30), + c_childnetno character varying(30), + d_opendate timestamp(0) without time zone, + d_lastmodify timestamp(0) without time zone, + c_accostatus character(1), + c_freezecause character(1), + d_backdate timestamp(0) without time zone, + l_changetime numeric(10,0), + d_firstinvest timestamp(0) without time zone, + c_password character varying(100), + c_bourseflag character(1), + c_operator character varying(100), + jy_custid numeric(10,0), + work_date timestamp(0) without time zone +) +DISTRIBUTE BY SHARD (c_custno) to GROUP default_group; +NOTICE: Replica identity is needed for shard table, please add to this table through "alter table" command. +-- +-- Name: s017_tacconet; Type: TABLE; Schema: sync; Owner: gregsun +-- +CREATE TABLE s017_tacconet ( + c_fundacco character varying(30) NOT NULL, + c_agencyno character varying(6), + c_netno character varying(30), + c_tradeacco character varying(100), + c_openflag character varying(2), + c_bonustype character varying(2), + c_bankno character varying(500), + c_bankacco character varying(500), + c_nameinbank character varying(1000), + d_appenddate timestamp(0) without time zone, + c_childnetno character varying(30), + c_tradeaccobak character varying(100), + c_bankname character varying(500), + c_banklinecode character varying(100), + c_channelbankno character varying(30), + c_bankprovincecode character varying(30), + c_bankcityno character varying(30), + sys_id character varying(10), + work_date timestamp(0) without time zone, + load_date timestamp(0) without time zone +) +DISTRIBUTE BY SHARD (c_fundacco) to GROUP default_group; +NOTICE: Replica identity is needed for shard table, please add to this table through "alter table" command. 
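For context on the helper defined near the top of this test file: func_getlastnetvalue reads a single row from p but returns 1 on both the normal path and the no_data_found handler, so within this regression test it effectively always returns 1. A hypothetical smoke test (not part of the expected output; the fund code argument is made up) could look like:

-- Hypothetical check, not from pl_bugs.out: the helper returns 1 whether or not p has rows.
SELECT sync.func_getlastnetvalue('FUND001', current_date) AS before_seed;
INSERT INTO sync.p VALUES ('1.0234', 'seed');
SELECT sync.func_getlastnetvalue('FUND001', current_date) AS after_seed;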
+-- +-- Name: s017_tagencyinfo; Type: TABLE; Schema: sync; Owner: gregsun +-- +CREATE TABLE s017_tagencyinfo ( + c_agencyno character varying(6) NOT NULL, + c_agencyname character varying(1000), + c_fullname character varying(1000), + c_agncyaddress character varying(500), + c_agncyzipcode character varying(30), + c_agncycontact character varying(30), + c_agncyphone character varying(100), + c_agncyfaxno character varying(100), + c_agncymail character varying(100), + c_agncybankno character varying(24), + c_agncybankacco character varying(100), + c_agncybankname character varying(500), + d_agncyregdate timestamp(0) without time zone, + c_agncystatus character varying(2), + d_lastdate timestamp(0) without time zone, + c_agencytype character varying(2), + c_detail character varying(2), + c_right character varying(2), + c_zdcode character varying(30), + l_liquidateredeem numeric(10,0), + l_liquidateallot numeric(10,0), + l_liquidatebonus numeric(10,0), + l_liquidatesub numeric(10,0), + c_sharetypes character varying(30), + f_agio numeric(5,4), + c_ztgonestep character varying(2), + c_preassign character varying(2), + l_cserialno numeric(10,0), + c_comparetype character varying(2), + c_liquidatetype character varying(2), + c_multitradeacco character varying(2), + c_iversion character varying(6), + c_imode character varying(2), + c_changeonstep character varying(2), + f_outagio numeric(5,4), + f_agiohint numeric(5,4), + f_outagiohint numeric(5,4), + c_allotliqtype character varying(2), + c_redeemliqtype character varying(2), + c_centerflag character varying(2), + c_netno character varying(6), + c_littledealtype character varying(2), + c_overtimedeal character varying(2), + d_lastinputtime timestamp(0) without time zone, + f_interestrate numeric(5,4), + c_clearsite character varying(2), + c_isdeal character varying(2), + c_agencyenglishname character varying(100), + l_fundaccono numeric(10,0), + c_rationflag character varying(2), + c_splitflag character varying(2), + c_tacode character varying(30), + c_outdataflag character varying(2), + c_hasindex character varying(2), + c_transferbyadjust character varying(2), + c_sharedetailexptype character varying(2), + c_navexptype character varying(2), + c_ecdmode character varying(2), + c_agencytypedetail character varying(2), + c_advanceshrconfirm character varying(2), + c_ecdversion character varying(2), + c_capmode character varying(2), + c_internetplatform character varying(2), + c_capautoarrive character varying(2), + c_outcapitaldata character varying(30), + c_ecdcheckmode character varying(30), + c_ecddealmode character varying(30), + c_fileimpmode character varying(30), + c_isotc character varying(2), + c_enableecd character varying(30), + c_autoaccotype character varying(30), + c_tncheckmode numeric(10,0), + c_captureidinfo character varying(30), + c_realfreeze character varying(30), + sys_id character varying(10), + work_date timestamp(0) without time zone, + load_date timestamp(0) without time zone +) +DISTRIBUTE BY SHARD (c_agencyno) to GROUP default_group; +NOTICE: Replica identity is needed for shard table, please add to this table through "alter table" command. 
+-- +-- Name: s017_tconfirm_all; Type: TABLE; Schema: sync; Owner: gregsun +-- +CREATE TABLE s017_tconfirm_all ( + c_businflag character(2) NOT NULL, + d_cdate timestamp(0) without time zone, + c_cserialno character varying(100), + d_date timestamp(0) without time zone, + l_serialno numeric(10,0), + c_agencyno character(3), + c_netno character varying(30), + c_fundacco character varying(30), + c_tradeacco character varying(100), + c_fundcode character varying(30), + c_sharetype character(1), + f_confirmbalance numeric(16,2), + f_confirmshares numeric(16,2), + f_tradefare numeric(16,2), + f_tafare numeric(16,2), + f_stamptax numeric(16,2), + f_backfare numeric(16,2), + f_otherfare1 numeric(16,2), + f_interest numeric(16,2), + f_interesttax numeric(16,2), + f_totalfare numeric(16,2), + f_agencyfare numeric(16,2), + f_netvalue numeric(12,4), + f_frozenbalance numeric(16,2), + f_unfrozenbalance numeric(16,2), + c_status character(1), + c_cause character varying(100), + c_taflag character(1), + c_custtype character(1), + c_custno character varying(30), + f_gainbalance numeric(16,2), + f_orifare numeric(16,2), + c_requestendflag character(1), + f_unbalance numeric(16,2), + f_unshares numeric(16,2), + c_reserve character varying(500), + f_interestshare numeric(16,2), + f_chincome numeric(16,2), + f_chshare numeric(16,2), + f_confirmincome numeric(16,2), + f_oritradefare numeric(16,2), + f_oritafare numeric(16,2), + f_oribackfare numeric(16,2), + f_oriotherfare1 numeric(16,2), + c_requestno character varying(100), + f_balance numeric(16,2), + f_shares numeric(16,2), + f_agio numeric(5,4), + f_lastshares numeric(16,2), + f_lastfreezeshare numeric(16,2), + c_othercode character varying(30), + c_otheracco character varying(30), + c_otheragency character(3), + c_othernetno character varying(30), + c_bonustype character(1), + c_foriginalno character varying(500), + c_exceedflag character(1), + c_childnetno character varying(30), + c_othershare character(1), + c_actcode character(3), + c_acceptmode character(1), + c_freezecause character(1), + c_freezeenddate character varying(100), + f_totalbalance numeric(16,2), + f_totalshares numeric(16,2), + c_outbusinflag character(3), + c_protocolno character varying(30), + c_memo character varying(500), + f_registfare numeric(16,2), + f_fundfare numeric(16,2), + f_oriagio numeric(5,4), + c_shareclass character(1), + d_cisdate timestamp(0) without time zone, + c_bourseflag character(1), + c_fundtype character(1), + f_backfareagio numeric(5,4), + c_bankno character varying(30), + c_subfundmethod character varying(30), + c_combcode character varying(30), + f_returnfare numeric(16,2), + c_contractno character varying(100), + c_captype character(1), + l_contractserialno numeric(10,0), + l_othercontractserialno numeric(10,0), + d_exportdate timestamp(0) without time zone, + f_transferfee numeric(16,2), + f_oriconfirmbalance numeric(16,2), + f_extendnetvalue numeric(23,15), + l_remitserialno numeric(10,0), + c_zhxtht character varying(500), + c_improperredeem character(1), + f_untradefare numeric(16,2), + f_untradeinfare numeric(16,2), + f_untradeoutfare numeric(16,2), + c_profitnottransfer character(1), + f_outprofit numeric(9,6), + f_inprofit numeric(9,6), + c_totrustcontractid character varying(500), + d_repurchasedate timestamp(0) without time zone, + f_chengoutbalance numeric(16,2), + c_exporting character(1), + jy_fundid numeric(10,0), + jy_contractbh character varying(100), + jy_custid numeric(10,0), + jy_tocustid numeric(10,0), + jy_fare numeric(16,2), + 
c_trustcontractid character varying(500), + f_taagencyfare numeric(16,2), + f_taregisterfare numeric(16,2), + d_cdate_jy timestamp(0) without time zone, + jy_adjust character(1), + jy_subfundid numeric, + jy_adjust1114 character(1), + jy_cdate timestamp(0) without time zone, + c_bankacco character varying(500), + c_bankname character varying(500), + c_nameinbank character varying(1000), + f_riskcapital numeric(16,2), + f_replenishriskcapital numeric(16,2), + c_fromfundcode character varying(30), + c_fromtrustcontractid character varying(500), + c_trustagencyno character varying(100), + l_rdmschserialno numeric(10,0), + f_redeemprofit numeric(16,2), + f_redeemproyieldrate numeric(13,10), + d_redeemprobigdate timestamp(0) without time zone, + d_redeemproenddate timestamp(0) without time zone, + c_changeownerincomebelong character(1), + l_midremitserialno numeric(10,0), + c_fromtype character(1), + c_iscycinvest character(1), + l_fromserialno numeric(10,0), + l_frominterestconserialno numeric(10,0), + c_changeownerinterest character(1), + c_msgsendflag character(1), + l_sharedelaydays numeric(3,0), + c_istodayconfirm character(1), + f_newincome numeric(16,2), + f_floorincome numeric(10,9), + l_incomeremitserialno numeric(10,0), + c_isnetting character(1), + l_bankserialno numeric(10,0), + c_subfundcode character varying(30), + f_chengoutsum numeric(16,2), + f_chengoutprofit numeric(16,2), + l_confirmtransserialno numeric(10,0), + c_shareadjustgzexpflag character(1), + c_issend character(1), + c_exchangeflag character(1), + yh_date_1112 timestamp(0) without time zone, + l_banktocontractserialno numeric(10,0), + c_payfeetype character(1), + c_tobankno character varying(30), + c_tobankacco character varying(500), + c_tobankname character varying(500), + c_tonameinbank character varying(1000), + c_tobanklinecode character varying(100), + c_tobankprovincecode character varying(30), + c_tobankcityno character varying(30), + l_assetseperateno numeric(10,0), + c_sharecserialno character varying(100), + c_redeemprincipaltype character(1), + work_date timestamp(0) without time zone, + c_businname character varying(100) +) +DISTRIBUTE BY SHARD (c_businflag) to GROUP default_group; +NOTICE: Replica identity is needed for shard table, please add to this table through "alter table" command. 
+-- +-- Name: s017_tdividenddetail; Type: TABLE; Schema: sync; Owner: gregsun +-- +CREATE TABLE s017_tdividenddetail ( + d_cdate timestamp(0) without time zone NOT NULL, + c_cserialno character varying(100), + d_regdate timestamp(0) without time zone, + d_date timestamp(0) without time zone, + c_fundacco character varying(30), + c_tradeacco character varying(100), + c_fundcode character varying(30), + c_sharetype character varying(2), + c_agencyno character varying(6), + c_netno character varying(30), + f_totalshare numeric(16,2), + f_unitprofit numeric(7,4), + f_totalprofit numeric(16,2), + f_tax numeric(16,2), + c_flag character varying(2), + f_realbalance numeric(16,2), + f_reinvestbalance numeric(16,2), + f_realshares numeric(16,2), + f_fare numeric(16,2), + d_lastdate timestamp(0) without time zone, + f_netvalue numeric(7,4), + f_frozenbalance numeric(16,2), + f_frozenshares numeric(16,2), + f_incometax numeric(9,4), + c_reserve character varying(100), + d_requestdate timestamp(0) without time zone, + c_shareclass character varying(30), + l_contractserialno numeric(10,0), + l_specprjserialno numeric(10,0), + f_investadvisorratio numeric(9,8), + f_transferfee numeric(16,2), + l_profitserialno numeric(10,0), + d_exportdate timestamp(0) without time zone, + c_custid character varying(30), + jy_fundid numeric, + jy_subfundid numeric, + jy_custid numeric, + jy_contractbh character varying(100), + jy_profitsn numeric, + jy_profitmoney numeric, + jy_capitalmoney numeric, + jy_adjust character varying(2), + c_reinvestnetvalue character varying(2), + f_transferbalance numeric(16,2), + l_relatedserialno numeric(10,0), + c_printoperator character varying(100), + c_printauditor character varying(100), + sys_id character varying(10), + work_date timestamp(0) without time zone, + load_date timestamp(0) without time zone, + f_remainshares numeric(16,2) +) +DISTRIBUTE BY SHARD (d_cdate) to GROUP default_group; +NOTICE: Replica identity is needed for shard table, please add to this table through "alter table" command. 
+-- +-- Name: s017_tfundday; Type: TABLE; Schema: sync; Owner: gregsun +-- +CREATE TABLE s017_tfundday ( + d_date timestamp(0) without time zone, + d_cdate timestamp(0) without time zone, + c_fundcode varchar2(30), + c_todaystatus varchar2(2), + c_status varchar2(2), + f_netvalue numeric(7,4), + f_lastshares numeric(16,2), + f_lastasset numeric(16,2), + f_asucceed numeric(16,2), + f_rsucceed numeric(16,2), + c_vastflag varchar2(2), + f_encashratio numeric(9,8), + f_changeratio numeric(9,8), + c_excessflag varchar2(2), + f_subscriberatio numeric(9,8), + c_inputpersonnel varchar2(100), + c_checkpersonnel varchar2(100), + f_income numeric(16,2), + f_incomeratio numeric(9,6), + f_unassign numeric(16,2), + f_incomeunit numeric(10,5), + f_totalnetvalue numeric(7,4), + f_servicefare numeric(16,2), + f_assign numeric(16,2), + f_growthrate numeric(9,8), + c_netvalueflag varchar2(2), + f_managefare numeric(16,2), + d_exportdate timestamp(0) without time zone, + c_flag varchar2(2), + f_advisorfee numeric(16,2), + d_auditdate timestamp(0) without time zone, + f_extendnetvalue numeric(23,15), + f_extendtotalnetvalue numeric(23,15), + jy_fundcode varchar2(30), + f_yearincomeratio numeric(9,6), + f_riskcapital numeric(16,2), + f_totalincome numeric(16,2), + f_agencyexpyearincomeration numeric(9,6), + f_agencyexpincomeunit numeric(10,5), + f_agencyexpincomeration numeric(9,6), + f_agencyexpincome numeric(16,2), + c_isspecflag varchar2(2), + c_isasync varchar2(2), + sys_id varchar2(10), + work_date timestamp(0) without time zone, + load_date timestamp(0) without time zone DEFAULT orcl_sysdate() +) +DISTRIBUTE BY HASH (d_date); +-- +-- Name: s017_tfundinfo; Type: TABLE; Schema: sync; Owner: gregsun +-- +CREATE TABLE s017_tfundinfo ( + c_fundcode character varying(30) NOT NULL, + c_fundname character varying(1000), + c_moneytype character varying(6), + c_managername character varying(100), + c_trusteecode character varying(30), + f_parvalue numeric(7,4), + f_issueprice numeric(12,4), + c_trusteeacco character varying(100), + d_issuedate timestamp(0) without time zone, + d_setupdate timestamp(0) without time zone, + f_maxbala numeric(16,2), + f_maxshares numeric(16,2), + f_minbala numeric(16,2), + f_minshares numeric(16,2), + l_elimitday numeric(10,0), + l_slimitday numeric(10,0), + l_alimitday numeric(10,0), + l_mincount numeric(10,0), + l_climitday numeric(10,0), + f_maxallot numeric(9,8), + f_maxredeem numeric(9,8), + c_fundcharacter character varying(500), + c_fundstatus character varying(2), + c_subscribemode character varying(2), + l_timelimit numeric(10,0), + l_subscribeunit numeric(10,0), + c_sharetypes character varying(30), + c_issuetype character varying(2), + f_factcollect numeric(16,2), + d_failuedate timestamp(0) without time zone, + f_allotratio numeric(9,8), + c_feeratiotype1 character varying(2), + c_feeratiotype2 character varying(2), + c_feetype character varying(2), + c_exceedpart character varying(2), + c_bonustype character varying(2), + c_forceredeem character varying(2), + c_interestdealtype character varying(2), + f_redeemfareratio numeric(5,4), + f_changefareratio numeric(5,4), + f_managerfee numeric(7,6), + f_right numeric(5,4), + c_property character varying(2), + d_evendate timestamp(0) without time zone, + f_totalbonus numeric(7,4), + c_changefree character varying(2), + c_reportcode character varying(30), + c_backfarecal character varying(2), + l_moneydate numeric(10,0), + l_netprecision numeric(10,0), + c_corpuscontent character varying(2), + f_corpusratio numeric(5,4), + 
c_farecaltype character varying(2), + l_liquidateallot numeric(10,0), + l_liquidateredeem numeric(10,0), + l_liquidatebonus numeric(10,0), + l_taspecialacco numeric(10,0), + c_fareprecision character varying(2), + d_issueenddate timestamp(0) without time zone, + c_farebelongasset character varying(2), + l_liquidatechange numeric(10,0), + l_liquidatefail numeric(10,0), + l_liquidateend numeric(10,0), + c_sharedetail character varying(2), + c_trusteebankname character varying(500), + c_boursetradeflag character varying(2), + c_fundenglishname character varying(100), + l_bankaccono numeric(10,0), + c_cleanflag character varying(2), + c_precision character varying(2), + c_upgradeflag character varying(2), + c_isdeal character varying(2), + c_farecltprecision character varying(2), + c_balanceprecision character varying(2), + c_shareprecision character varying(2), + c_bonusprecision character varying(2), + c_interestprecision character varying(2), + f_maxallotasset numeric(16,2), + f_maxallotshares numeric(16,2), + c_foreigntrustee character varying(6), + l_tnconfirm numeric(3,0), + c_rationallotstatus character varying(2), + f_trusteefee numeric(7,6), + c_fundacco character varying(30), + c_financetype character varying(2), + l_liquidatechangein numeric(10,0), + c_custname character varying(500), + c_identitytype character varying(2), + c_custtype character varying(2), + c_identityno character varying(100), + c_deductschemecode character varying(30), + c_customermanager character varying(30), + c_templateid character varying(30), + f_pr0 numeric(7,4), + f_deductratio numeric(5,4), + c_farecalculatetype character varying(2), + c_saletype character varying(2), + l_maxcount numeric(10,0), + l_zhallotliqdays numeric(10,0), + l_zhredeemliqdays numeric(10,0), + f_liqasset numeric(16,2), + l_zhallotexpdays numeric(10,0), + l_zhredeemexpdays numeric(10,0), + c_limitmode character varying(2), + c_ordermode character varying(2), + c_acntlmtdealmode character varying(2), + l_informdays numeric(2,0), + c_allowpartredeem character varying(2), + c_fundendmode character varying(2), + f_fundendagio numeric(10,9), + c_minbalalimitisconfirm character varying(2), + c_gradetype character varying(2), + c_qryfreqtype character varying(2), + l_qrydaysltd numeric(2,0), + d_contractenddate timestamp(0) without time zone, + c_useinopenday character varying(2), + c_allotcalinterst character varying(2), + c_fundrisk character varying(2), + c_exitallot character varying(2), + c_subinterestcalc character varying(2), + c_earlyexitredfee character varying(2), + c_navexpfqy character varying(2), + l_navexpday numeric(10,0), + c_isbounded character varying(2), + c_earlyexitfeecalc character varying(2), + c_designdptid character varying(100), + c_fixeddividway character varying(2), + c_trusttype character varying(2), + f_maxnaturalmoney numeric(16,2), + c_projectid character varying(30), + c_trustclass character varying(2), + f_trustscale numeric(16,2), + c_structflag character varying(2), + c_priconveyflag character varying(2), + c_repurchasetype character varying(2), + c_iswholerepurchase character varying(2), + f_repurchaseminbala numeric(16,2), + c_repurchasemainbody character varying(2), + c_canelyrepurchase character varying(2), + c_earlybacktime character varying(2), + c_repurchaseprice character varying(2), + c_premiumpaymenttime character varying(2), + c_liquisource character varying(2), + l_period numeric(3,0), + c_canextensionflag character varying(2), + c_canelyliquidflag character varying(2), + c_trustassetdesc 
character varying(100), + c_returnside character varying(2), + c_returnpaymentway character varying(2), + c_returnbase character varying(2), + c_refepaymentway character varying(2), + c_refeside character varying(2), + c_refebase character varying(2), + f_warnline numeric(5,4), + f_stopline numeric(5,4), + f_collectinterest numeric(11,8), + f_durationinterest numeric(7,4), + f_investadvisorratio numeric(7,6), + c_bonusschema character varying(2), + c_guaranteetype character varying(2), + c_guaranteedesc character varying(100), + c_expectedyieldtype character varying(2), + f_minexpectedyield numeric(12,4), + f_maxexpectedyield numeric(12,4), + c_incomecycletype character varying(2), + f_incomecyclevalue numeric(10,0), + c_subaccotype character varying(2), + c_allotaccotype character varying(2), + c_fundtype character varying(2), + c_cootype character varying(1000), + c_projecttype character varying(2), + c_investdirection character varying(30), + c_investdirectionfractionize character varying(2), + c_industrydetail character varying(1000), + c_initeresttype character varying(2), + c_isextended character varying(2), + d_extenddate timestamp(0) without time zone, + c_dealmanagetype character varying(2), + c_investarea character varying(2), + c_projectcode character varying(1000), + c_fundshortname character varying(500), + c_contractid character varying(500), + c_functype character varying(2), + c_specialbusintype character varying(1000), + c_investindustry character varying(2), + c_managetype character varying(2), + c_area character varying(500), + c_risk character varying(2), + c_iscommitteedisscuss character varying(2), + c_structtype character varying(2), + c_commendplace character varying(2), + l_npmaxcount numeric(5,0), + c_client character varying(100), + c_clientcusttype character varying(2), + c_clientidtype character varying(2), + c_clientidno character varying(100), + c_clientbankname character varying(100), + c_clientaccono character varying(100), + c_clientaddress character varying(500), + c_clientzipcode character varying(30), + c_clientphoneno1 character varying(100), + c_clientphoneno2 character varying(100), + c_clientfax character varying(100), + c_beneficiary character varying(100), + c_collectbankname character varying(500), + c_collectbankno character varying(6), + c_collectaccountname character varying(500), + c_collectbankacco character varying(100), + c_keeperbankname character varying(500), + c_keeperaccountname character varying(500), + c_keeperaccountno character varying(100), + c_keepername character varying(500), + c_keepercorporation character varying(500), + c_keeperaddress character varying(500), + c_keeperzipcode character varying(30), + c_keeperphoneno1 character varying(100), + c_keeperphoneno2 character varying(100), + c_keeperfax character varying(100), + c_incomedistributetype character varying(2), + c_alarmline character varying(1000), + c_stoplossline character varying(1000), + f_investadvisorfee numeric(12,2), + c_investadvisordeduct character varying(1000), + c_capitalacco character varying(500), + c_stockacconame character varying(500), + c_stocksalesdept character varying(500), + c_thirdpartybankno character varying(6), + c_thirdpartybankname character varying(500), + c_thirdpartyacconame character varying(500), + c_thirdpartyaccono character varying(100), + c_investadvisor character varying(500), + c_investadvisorbankno character varying(6), + c_investadvisorbankname character varying(500), + c_investadvisoracconame character varying(500), + 
c_investadvisoraccono character varying(100), + c_investadvisorcorporation character varying(500), + c_investadvisoraddress character varying(500), + c_investadvisorzipcode character varying(30), + c_investadvisorphoneno1 character varying(100), + c_investadvisorphoneno2 character varying(100), + c_investadvisorfax character varying(100), + c_authdelegate character varying(100), + c_loanfinanceparty character varying(500), + c_loanfinancepartycorporation character varying(500), + c_loanfinancepartyaddress character varying(500), + c_loanfinancepartyzipcode character varying(30), + c_loanfinancepartyphoneno1 character varying(100), + c_loanfinancepartyphoneno2 character varying(100), + c_loanfinancepartyfax character varying(100), + c_loaninteresttype character varying(2), + f_loaninterestrate numeric(7,4), + f_loanduration numeric(5,0), + c_loanmanagebank character varying(500), + f_loanmanagefee numeric(9,2), + f_loanfinancecost numeric(9,2), + f_creditattornduration numeric(5,0), + f_creditattorninterestduration numeric(7,4), + f_creditattornprice numeric(12,2), + f_billattornduration numeric(5,0), + f_billattorninterestduration numeric(7,4), + f_billattornprice numeric(12,2), + c_stkincfincparty character varying(1000), + c_stkincfincpartycorporation character varying(500), + c_stkincfincpartyaddress character varying(500), + c_stkincfincpartyzipcode character varying(30), + c_stkincfincpartyphoneno1 character varying(100), + c_stkincfincpartyphoneno2 character varying(100), + c_stkincfincpartyfax character varying(100), + c_stkincincomeannualizedrate numeric(7,4), + c_stkincinteresttype character varying(2), + f_stkincattornprice numeric(12,2), + f_stkincattornduration numeric(5,0), + f_stkincbail numeric(12,2), + f_stkincfinccost numeric(9,2), + c_stkincmemo1 character varying(1000), + c_stkincmemo2 character varying(1000), + c_debtincfincparty character varying(500), + c_debtincfincpartycorporation character varying(500), + c_debtincfincpartyaddress character varying(500), + c_debtincfincpartyzipcode character varying(30), + c_debtincfincpartyphoneno1 character varying(100), + c_debtincfincpartyphoneno2 character varying(100), + c_debtincfincpartyfax character varying(100), + c_debtincincomerate numeric(7,4), + c_debtincinteresttype character varying(2), + f_debtincattornprice numeric(12,2), + f_debtincattornduration numeric(5,0), + f_debtincbail numeric(12,2), + f_debtincfinccost numeric(9,2), + c_debtincmemo1 character varying(1000), + c_othinvfincparty character varying(500), + c_othinvfincpartycorporation character varying(500), + c_othinvfincpartyaddress character varying(500), + c_othinvfincpartyzipcode character varying(30), + c_othinvfincpartyphoneno1 character varying(100), + c_othinvfincpartyphoneno2 character varying(100), + c_othinvfincpartyfax character varying(100), + f_othinvfinccost numeric(9,2), + c_othinvmemo1 character varying(1000), + c_othinvmemo2 character varying(1000), + c_othinvmemo3 character varying(1000), + c_banktrustcoobank character varying(500), + c_banktrustproductname character varying(500), + c_banktrustproductcode character varying(100), + c_banktrustundertakingletter character varying(2), + c_trustgovgovname character varying(500), + c_trustgovprojecttype character varying(1000), + c_trustgovcootype character varying(4), + c_trustgovoptype character varying(4), + c_housecapital character varying(4), + c_houseispe character varying(2), + c_tradetype character varying(2), + c_businesstype character varying(2), + c_trustname character varying(500), + 
c_trustidtype character varying(2), + c_trustidno character varying(100), + d_trustidvaliddate timestamp(0) without time zone, + c_trustbankname character varying(500), + c_trustaccounttype character varying(2), + c_trustnameinbank character varying(100), + c_zhtrustbankname character varying(500), + c_zhtrustbankacco character varying(100), + c_issecmarket character varying(2), + c_fundoperation character varying(2), + c_trustmanager character varying(100), + c_tradeother character varying(4000), + c_watchdog character varying(500), + c_memo character varying(1000), + c_benefittype character varying(2), + c_redeemaccotype character varying(2), + c_bonusaccotype character varying(2), + c_fundendaccotype character varying(2), + c_collectfailaccotype character varying(2), + d_lastmodifydate timestamp(0) without time zone, + c_shareholdlimtype character varying(2), + c_redeemtimelimtype character varying(2), + c_isprincipalrepayment character varying(2), + c_principalrepaymenttype character varying(2), + l_interestyeardays numeric(3,0), + l_incomeyeardays numeric(3,0), + c_capuseprovcode character varying(30), + c_capusecitycode character varying(30), + c_capsourceprovcode character varying(30), + c_banktrustcoobankcode character varying(30), + c_banktrustisbankcap character varying(2), + c_trusteefeedesc character varying(4000), + c_managefeedesc character varying(4000), + c_investfeedesc character varying(4000), + f_investadvisordeductratio numeric(7,6), + c_investdeductdesc character varying(4000), + c_investadvisor2 character varying(500), + f_investadvisorratio2 numeric(7,6), + f_investadvisordeductratio2 numeric(7,6), + c_investfeedesc2 character varying(4000), + c_investdeductdesc2 character varying(4000), + c_investadvisor3 character varying(500), + f_investadvisorratio3 numeric(7,6), + f_investadvisordeductratio3 numeric(7,6), + c_investfeedesc3 character varying(4000), + c_investdeductdesc3 character varying(4000), + c_profitclassdesc character varying(4000), + c_deductratiodesc character varying(4000), + c_redeemfeedesc character varying(4000), + l_defaultprecision numeric(10,0), + c_allotfeeaccotype character varying(2), + c_isposf character varying(2), + c_opendaydesc character varying(4000), + c_actualmanager character varying(100), + c_subindustrydetail character varying(30), + c_isbankleading character varying(2), + c_subprojectcode character varying(500), + c_iscycleinvest character varying(2), + f_liquidationinterest numeric(13,10), + c_liquidationinteresttype character varying(2), + c_isbonusinvestfare character varying(2), + c_subfeeaccotype character varying(2), + c_redeemfeeaccotype character varying(2), + c_fundrptcode character varying(30), + c_ordertype character varying(2), + c_flag character varying(2), + c_allotliqtype character varying(2), + l_sharelimitday numeric(5,0), + c_iseverydayopen character varying(2), + c_tradebynetvalue character varying(2), + c_isstage character varying(2), + c_specbenfitmemo character varying(4000), + d_effectivedate timestamp(0) without time zone, + c_issueendflag character varying(2), + c_resharehasrdmfee character varying(2), + jy_fundcode numeric, + jy_fundid numeric, + jy_subfundid numeric, + jy_dptid numeric, + c_iswealth character varying(2), + c_interestcalctype character varying(2), + c_allotinterestcalctype character varying(2), + c_isriskcapital character varying(2), + c_fundstatus_1225 character varying(2), + c_isincomeeverydaycalc character varying(2), + c_isredeemreturninterest character varying(2), + 
c_isrefundrtninterest character varying(2), + d_estimatedsetupdate timestamp(0) without time zone, + f_estimatedfactcollect numeric(16,2), + c_isfinancialproducts character varying(2), + c_fundredeemtype character varying(2), + c_trademanualinput character varying(2), + f_clientmanageration numeric(7,6), + c_profitclassadjustment character varying(2), + c_mainfundcode character varying(30), + c_contractsealoff character varying(2), + c_permitnextperiod character varying(2), + c_preprofitschematype character varying(2), + c_fundredeemprofit character varying(2), + f_incomeration numeric(9,8), + c_incomecalctype character varying(2), + c_allocateaccoid character varying(30), + c_outfundcode character varying(500), + c_matchprofitclass character varying(30), + l_lastdays numeric(5,0), + c_contractprofitflag character varying(2), + c_agencysaleliqtype character varying(2), + l_delaydays numeric(3,0), + c_profitclassperiod character varying(2), + c_reportshowname character varying(1000), + c_currencyincometype character varying(2), + c_beforeredeemcapital character varying(2), + c_contractversion character varying(30), + c_confirmacceptedflag character varying(2), + c_selectcontract character varying(2), + f_schemainterest numeric(11,8), + c_riskgrade character varying(30), + l_sharedelaydays numeric(3,0), + l_reservationdays numeric(3,0), + c_transfertype character varying(2), + c_schemavoluntarily character varying(2), + l_schemadetaildata numeric(4,0), + c_schemadetailtype character varying(2), + c_iscurrencyconfirm character varying(2), + c_allowmultiaccobank character varying(2), + d_capverif timestamp(0) without time zone, + c_templatetype character varying(12), + c_capitalprecision character varying(2), + c_fundno character varying(100), + c_profittype character varying(2), + d_paydate timestamp(0) without time zone, + d_shelvedate timestamp(0) without time zone, + d_offshelvedate timestamp(0) without time zone, + c_schemabegindatetype character varying(2), + l_schemabegindatedays numeric(3,0), + c_isautoredeem character varying(2), + c_isnettingrequest character varying(2), + c_issuingquotedtype character varying(2), + d_firstdistributedate timestamp(0) without time zone, + c_bonusfrequency character varying(2), + c_interestbigdatetype character varying(2), + c_gzdatatype character varying(2), + f_allotfareratio numeric(5,4), + f_subfareratio numeric(5,4), + c_begindatebeyond character varying(2), + c_profitnotinterest character varying(2), + c_setuplimittype character varying(2), + c_limitredeemtype character varying(2), + c_bonusfrequencytype character varying(2), + c_rfaccotype character varying(2), + c_capitalfee character varying(2), + c_exceedflag character varying(2), + c_enableecd character varying(2), + c_isfixedtrade character varying(2), + c_profitcaltype character varying(2), + f_ominbala numeric(16,2), + f_stepbala numeric(16,2), + c_remittype character varying(30), + c_interestcycle character varying(30), + c_repayguaranteecopy character varying(30), + c_repaytype character varying(30), + c_fundprofitdes character varying(4000), + c_fundinfodes character varying(4000), + c_riskeval character varying(2), + l_maxage numeric(3,0), + l_minage numeric(3,0), + c_fundriskdes character varying(1000), + mig_l_assetid numeric(48,0), + l_faincomedays numeric(10,0), + c_producttype character varying(2), + c_otherbenefitproducttype character varying(2), + c_isotc character varying(2), + c_iseverydayprovision character varying(2), + c_incometogz character varying(2), + 
c_setuptransfundacco character varying(30), + c_issuefeeownerrequired character varying(2), + c_calcinterestbeforeallot character varying(30), + c_islimit300wnature character varying(2), + c_allowoverflow character varying(30), + c_trustfundtype character varying(30), + c_disclose character varying(2), + c_collectaccoid character varying(30), + c_isissuebymarket character varying(2), + c_setupstatus character varying(30), + c_isentitytrust character varying(2), + l_liquidatesub numeric(10,0), + c_incomeassigndesc character varying(4000), + c_keeporgancode character varying(30), + d_defaultbegincacldate timestamp(0) without time zone, + c_zcbborrower character varying(100), + c_zcbborroweridno character varying(100), + c_zcbremittype character varying(100), + c_registcode character varying(100), + c_redeeminvestaccotype character varying(2), + c_bonusinvestaccotype character varying(2), + c_isabsnotopentrade character varying(2), + l_interestdiffdays numeric(5,0), + c_outfundstatus character varying(2), + c_reqsyntype character varying(2), + c_allredeemtype character varying(2), + c_isabsopentrade character varying(2), + c_funddesc character varying(1000), + l_allotliquidays numeric(3,0), + l_subliquidays numeric(3,0), + c_autoupcontractenddaterule character varying(2), + c_fcsubaccotype character varying(2), + c_fcallotaccotype character varying(2), + c_fcredeemaccotype character varying(2), + c_fcbonusaccotype character varying(2), + c_captranslimitflag character varying(30), + c_redeemprincipaltype character varying(2), + c_interestcalcdealtype character varying(30), + c_collectconfirm character varying(30), + d_oldcontractenddate timestamp(0) without time zone, + c_tnvaluation character varying(30), + c_contractendnotify character varying(2), + c_rdmfeebase character varying(30), + c_exceedcfmratio character varying(30), + c_allowallotcustlimittype character varying(2), + c_yeardayscalctype character varying(2), + c_iscompoundinterest character varying(30), + c_dbcfm character varying(30), + c_limitaccountstype character varying(2), + c_cycleinvestrange character varying(2), + c_tncheckmode character varying(2), + c_enableearlyredeem character varying(2), + c_ispurceandredeemset character varying(30), + c_perfpaydealtype character varying(2), + c_allowappend character varying(2), + c_allowredeem character varying(2), + c_inputstatus character varying(2), + c_profitbalanceadjust character varying(2), + c_profitperiodadjust character varying(2), + c_autogeneratecontractid character varying(2), + c_transferneednetting character varying(100), + underwrite character varying(1000), + undertook character varying(1000), + undertake character varying(1000), + c_issmsend character varying(2), + d_contractshortenddate timestamp(0) without time zone, + d_contractlongenddate timestamp(0) without time zone, + c_assetseperatefundcodesrc character varying(30), + f_averageprofit numeric(11,8), + c_currencycontractlimittype character varying(2), + l_profitlastdays numeric(5,0), + l_liquidationlastdays numeric(5,0), + c_arlimitincludeallreq character varying(2), + c_reqfundchange character varying(2), + c_dealnetvaluerule character varying(2), + c_contractdealtype character varying(2), + c_bonusplanbeginday timestamp(0) without time zone, + c_contractbalaupright character varying(2), + c_isneedinterestrate character varying(2), + c_isneedexcessratio character varying(2), + c_riskgraderemark character varying(1000), + c_lossprobability character varying(2), + c_suitcusttype character varying(2), + 
c_createbonusschema character varying(2), + d_closedenddate timestamp(0) without time zone, + c_timelimitunit character varying(30), + c_exceedredeemdealtype character varying(2), + c_profitperiod character varying(2), + l_navgetintervaldays numeric(3,0), + load_date timestamp(0) without time zone, + sys_id character varying(10) DEFAULT 'S017'::character varying, + work_date timestamp(0) without time zone, + c_limittransfertype character varying(1), + c_transaccotype character varying(1), + c_incometaxbase character varying(1), + c_isredeemfareyearcalc character varying(1), + c_otherbenefitinputmode character varying(1), + c_aftdefaultinterestdeducttype character varying(1), + c_allowzerobalanceconfirm character varying(1), + c_incomejoinassign character varying(1), + l_liquidateliqbonus numeric(10,0), + c_predefaultinterestdeducttype character varying(1), + c_worktype character varying(1), + c_defaultinterestadduptype character varying(1), + c_issupportsubmode character varying(1), + f_expectedyield numeric(14,0), + c_recodecode character varying(40), + l_liquidatetransfer numeric(10,0), + c_ispayincometax character varying(1), + c_groupmainfundcode character varying(6), + c_redeemfeesplittype character varying(1), + c_capitalfromcrmorta character varying(1), + c_needcalcdefaultinterest character varying(1), + c_issuercode character varying(10), + l_redeemfareyeardays numeric(10,0), + c_floatyield character varying(30), + l_minriskscore numeric(3,0), + c_islocalmoneytypecollect character varying(1) +) +DISTRIBUTE BY SHARD (c_fundcode) to GROUP default_group; +NOTICE: Replica identity is needed for shard table, please add to this table through "alter table" command. +-- +-- Name: s017_tsharecurrents_all; Type: TABLE; Schema: sync; Owner: gregsun +-- +CREATE TABLE s017_tsharecurrents_all ( + d_cdate timestamp(0) without time zone NOT NULL, + c_cserialno character varying(100), + c_businflag character(2), + d_requestdate timestamp(0) without time zone, + c_requestno character varying(100), + c_custno character varying(30), + c_fundacco character varying(30), + c_tradeacco character varying(100), + c_fundcode character varying(30), + c_sharetype character(1), + c_agencyno character(3), + c_netno character varying(30), + f_occurshares numeric(16,2), + f_occurbalance numeric(16,2), + f_lastshares numeric(16,2), + f_occurfreeze numeric(16,2), + f_lastfreezeshare numeric(16,2), + c_summary character varying(100), + f_gainbalance numeric(16,2), + d_sharevaliddate timestamp(0) without time zone, + c_bonustype character(1), + c_custtype character(1), + c_shareclass character(1), + c_bourseflag character varying(20), + d_exportdate timestamp(0) without time zone, + l_contractserialno numeric(10,0), + c_issend character(1), + c_sendbatch character varying(30), + work_date timestamp(0) without time zone +) +DISTRIBUTE BY SHARD (d_cdate) to GROUP default_group; +NOTICE: Replica identity is needed for shard table, please add to this table through "alter table" command. 
+-- +-- Name: s017_ttrustclientinfo_all; Type: TABLE; Schema: sync; Owner: gregsun +-- +CREATE TABLE s017_ttrustclientinfo_all ( + c_custno character varying(30) NOT NULL, + c_custtype character(1), + c_custname character varying(500), + c_shortname character varying(500), + c_helpcode character varying(30), + c_identitytype character(1), + c_identityno character varying(500), + c_zipcode character varying(30), + c_address character varying(1000), + c_phone character varying(100), + c_faxno character varying(500), + c_mobileno character varying(100), + c_email character varying(500), + c_sex character(1), + c_birthday character varying(30), + c_vocation character(2), + c_education character(2), + c_income character varying(30), + c_contact character varying(100), + c_contype character(1), + c_contno character varying(100), + c_billsendflag character(1), + c_callcenter character(1), + c_internet character(1), + c_secretcode character varying(30), + c_nationality character(3), + c_cityno character varying(30), + c_lawname character varying(100), + c_shacco character varying(30), + c_szacco character varying(30), + c_broker character varying(100), + f_agio numeric(5,4), + c_memo character varying(4000), + c_reserve character varying(500), + c_corpname character varying(100), + c_corptel character varying(100), + c_specialcode character varying(100), + c_actcode character varying(30), + c_billsendpass character(1), + c_addressinvalid character(1), + d_appenddate timestamp(0) without time zone, + d_backdate timestamp(0) without time zone, + c_invalidaddress character varying(500), + c_backreason character varying(500), + c_modifyinfo character(2), + c_riskcontent character varying(4000), + l_querydaysltd numeric(3,0), + c_customermanager character varying(100), + c_custproperty character(1), + c_custclass character(1), + c_custright character varying(4000), + c_daysltdtype character(1), + d_idvaliddate timestamp(0) without time zone, + l_custgroup numeric(10,0), + c_recommender character varying(100), + c_recommendertype character(1), + d_idnovaliddate timestamp(0) without time zone, + c_organcode character(10), + c_othercontact character varying(100), + c_taxregistno character varying(100), + c_taxidentitytype character(1), + c_taxidentityno character varying(100), + d_legalvaliddate timestamp(0) without time zone, + c_shareholder character varying(500), + c_shareholderidtype character(1), + c_shareholderidno character varying(100), + d_holderidvaliddate timestamp(0) without time zone, + c_leader character varying(500), + c_leaderidtype character(1), + c_leaderidno character varying(100), + d_leadervaliddate timestamp(0) without time zone, + c_managercode character varying(100), + c_linemanager character varying(100), + c_clientinfoid character varying(30), + c_provincecode character varying(30), + c_countytown character varying(1000), + c_phone2 character varying(100), + c_clienttype character(1), + c_agencyno character(3), + c_industrydetail character varying(30), + c_isqualifiedcust character(1), + c_industryidentityno character varying(100), + c_lawidentitytype character(1), + c_lawidentityno character varying(100), + d_lawidvaliddate timestamp(0) without time zone, + d_conidvaliddate timestamp(0) without time zone, + c_conisrevmsg character(1), + c_conmobileno character varying(100), + c_conmoaddress character varying(1000), + c_conzipcode character varying(30), + c_conphone1 character varying(100), + c_conphone2 character varying(100), + c_conemail character varying(100), + c_confaxno 
character varying(500), + c_incomsource character varying(500), + c_zhidentityno character varying(500), + c_zhidentitytype character(1), + c_eastcusttype character varying(30), + jy_custid numeric(10,0), + c_idtype201201030 character(1), + c_emcontact character varying(500), + c_emcontactphone character varying(100), + c_instiregaddr character varying(1000), + c_regcusttype character varying(30), + c_riskgrade character varying(30), + c_riskgraderemark character varying(1000), + d_idvaliddatebeg timestamp(0) without time zone, + d_industryidvaliddatebeg timestamp(0) without time zone, + d_industryidvaliddate timestamp(0) without time zone, + c_incomesourceotherdesc character varying(1000), + c_vocationotherdesc character varying(1000), + c_businscope character varying(4000), + d_conidvaliddatebeg timestamp(0) without time zone, + d_lawidvaliddatebeg timestamp(0) without time zone, + c_regmoneytype character(3), + f_regcapital numeric(15,2), + c_orgtype character(2), + c_contrholderno character varying(100), + c_contrholdername character varying(500), + c_contrholderidtype character(2), + c_contrholderidno character varying(500), + d_contrholderidvalidatebeg timestamp(0) without time zone, + d_contrholderidvalidate timestamp(0) without time zone, + c_responpername character varying(500), + c_responperidtype character(2), + c_responperidno character varying(500), + d_responperidvalidatebeg timestamp(0) without time zone, + d_responperidvalidate timestamp(0) without time zone, + c_lawphone character varying(100), + c_contrholderphone character varying(100), + c_responperphone character varying(100), + c_consex character(1), + c_conrelative character varying(500), + l_riskserialno numeric(10,0), + c_convocation character(2), + c_iscustrelated character(1), + c_businlicissuorgan character varying(500), + c_manageridno character varying(500), + c_manageridtype character varying(500), + c_managername character varying(500), + d_companyregdate timestamp(0) without time zone, + c_electronicagreement character(1), + c_householdregno character varying(500), + c_guardianrela character varying(500), + c_guardianname character varying(500), + c_guardianidtype character(1), + c_guardianidno character varying(500), + c_isfranchisingidstry character(1), + c_franchidstrybusinlic character varying(500), + c_workunittype character(2), + c_normalresidaddr character varying(1000), + c_domicile character varying(1000), + c_finainvestyears character(2), + c_parentidtype character(1), + c_parentidno character varying(500), + c_videono character varying(1000), + c_bonustype character(1), + d_retirementdate timestamp(0) without time zone, + c_issendbigcustbill character(1), + c_idaddress character varying(1000), + c_isproinvestor character(1), + c_sendkfflag character(1), + c_sendkfcause character varying(1000), + c_sendsaflag character(1), + c_sendsacause character varying(1000), + c_custrelationchannel character(1), + c_companytype character(1), + c_businlocation character varying(1000), + c_custodian character varying(500), + d_elecsigndate timestamp(0) without time zone, + d_riskinputdate timestamp(0) without time zone, + c_circno character varying(1000), + c_financeindustrydetail character varying(30), + c_outclientinfoid character varying(30), + d_duediligencedate timestamp(0) without time zone, + c_duediligencestatus character(1), + c_inputstatus character(1), + c_address2 character varying(1000), + c_reportcusttype character(1), + c_reportcusttypedetail character varying(30), + c_custsource character 
varying(30), + work_date timestamp(0) without time zone +) +DISTRIBUTE BY SHARD (c_custno) to GROUP default_group; +NOTICE: Replica identity is needed for shard table, please add to this table through "alter table" command. +-- +-- Name: sys_stat_error_log; Type: TABLE; Schema: sync; Owner: gregsun +-- +CREATE TABLE sys_stat_error_log ( + proc_name varchar2(50) NOT NULL, + tab_level varchar2(20), + step_no varchar2(20), + step_desc varchar2(500), + begin_time timestamp(0) without time zone, + end_time timestamp(0) without time zone, + workdate timestamp(0) without time zone, + row_num numeric, + elapsed numeric, + all_elapsed numeric, + sql_code varchar2(20), + sql_errm varchar2(500) +) +DISTRIBUTE BY SHARD (proc_name) to GROUP default_group; +NOTICE: Replica identity is needed for shard table, please add to this table through "alter table" command. +-- +-- Data for Name: b03_ts_remetrade; Type: TABLE DATA; Schema: sync; Owner: gregsun +-- +COPY b03_ts_remetrade (c_fundcode, c_fundname, c_fundacco, f_netvalue, c_agencyname, c_custname, d_date, d_cdate, f_confirmbalance, f_tradefare, f_confirmshares, f_relbalance, f_interest, info, work_date, load_date) FROM stdin; +-- +-- Data for Name: b03_ts_remetrade_bak; Type: TABLE DATA; Schema: sync; Owner: gregsun +-- +COPY b03_ts_remetrade_bak (c_fundcode, c_fundname, c_fundacco, f_netvalue, c_agencyname, c_custname, d_date, d_cdate, f_confirmbalance, f_tradefare, f_confirmshares, f_relbalance, f_interest, info, work_date, load_date) FROM stdin; +-- +-- Data for Name: ks0_fund_base_26; Type: TABLE DATA; Schema: sync; Owner: gregsun +-- +COPY ks0_fund_base_26 (id1, acc_cd, tdate, ins_cd, cost_price_asset, pcol) FROM stdin; +-- +-- Data for Name: p; Type: TABLE DATA; Schema: sync; Owner: gregsun +-- +COPY p (p1, p2) FROM stdin; +-- +-- Data for Name: s017_taccoinfo; Type: TABLE DATA; Schema: sync; Owner: gregsun +-- +COPY s017_taccoinfo (c_custno, c_accounttype, c_fundacco, c_agencyno, c_netno, c_childnetno, d_opendate, d_lastmodify, c_accostatus, c_freezecause, d_backdate, l_changetime, d_firstinvest, c_password, c_bourseflag, c_operator, jy_custid, work_date) FROM stdin; +-- +-- Data for Name: s017_tacconet; Type: TABLE DATA; Schema: sync; Owner: gregsun +-- +COPY s017_tacconet (c_fundacco, c_agencyno, c_netno, c_tradeacco, c_openflag, c_bonustype, c_bankno, c_bankacco, c_nameinbank, d_appenddate, c_childnetno, c_tradeaccobak, c_bankname, c_banklinecode, c_channelbankno, c_bankprovincecode, c_bankcityno, sys_id, work_date, load_date) FROM stdin; +-- +-- Data for Name: s017_tagencyinfo; Type: TABLE DATA; Schema: sync; Owner: gregsun +-- +COPY s017_tagencyinfo (c_agencyno, c_agencyname, c_fullname, c_agncyaddress, c_agncyzipcode, c_agncycontact, c_agncyphone, c_agncyfaxno, c_agncymail, c_agncybankno, c_agncybankacco, c_agncybankname, d_agncyregdate, c_agncystatus, d_lastdate, c_agencytype, c_detail, c_right, c_zdcode, l_liquidateredeem, l_liquidateallot, l_liquidatebonus, l_liquidatesub, c_sharetypes, f_agio, c_ztgonestep, c_preassign, l_cserialno, c_comparetype, c_liquidatetype, c_multitradeacco, c_iversion, c_imode, c_changeonstep, f_outagio, f_agiohint, f_outagiohint, c_allotliqtype, c_redeemliqtype, c_centerflag, c_netno, c_littledealtype, c_overtimedeal, d_lastinputtime, f_interestrate, c_clearsite, c_isdeal, c_agencyenglishname, l_fundaccono, c_rationflag, c_splitflag, c_tacode, c_outdataflag, c_hasindex, c_transferbyadjust, c_sharedetailexptype, c_navexptype, c_ecdmode, c_agencytypedetail, c_advanceshrconfirm, c_ecdversion, c_capmode, 
c_internetplatform, c_capautoarrive, c_outcapitaldata, c_ecdcheckmode, c_ecddealmode, c_fileimpmode, c_isotc, c_enableecd, c_autoaccotype, c_tncheckmode, c_captureidinfo, c_realfreeze, sys_id, work_date, load_date) FROM stdin; +-- +-- Data for Name: s017_tconfirm_all; Type: TABLE DATA; Schema: sync; Owner: gregsun +-- +COPY s017_tconfirm_all (c_businflag, d_cdate, c_cserialno, d_date, l_serialno, c_agencyno, c_netno, c_fundacco, c_tradeacco, c_fundcode, c_sharetype, f_confirmbalance, f_confirmshares, f_tradefare, f_tafare, f_stamptax, f_backfare, f_otherfare1, f_interest, f_interesttax, f_totalfare, f_agencyfare, f_netvalue, f_frozenbalance, f_unfrozenbalance, c_status, c_cause, c_taflag, c_custtype, c_custno, f_gainbalance, f_orifare, c_requestendflag, f_unbalance, f_unshares, c_reserve, f_interestshare, f_chincome, f_chshare, f_confirmincome, f_oritradefare, f_oritafare, f_oribackfare, f_oriotherfare1, c_requestno, f_balance, f_shares, f_agio, f_lastshares, f_lastfreezeshare, c_othercode, c_otheracco, c_otheragency, c_othernetno, c_bonustype, c_foriginalno, c_exceedflag, c_childnetno, c_othershare, c_actcode, c_acceptmode, c_freezecause, c_freezeenddate, f_totalbalance, f_totalshares, c_outbusinflag, c_protocolno, c_memo, f_registfare, f_fundfare, f_oriagio, c_shareclass, d_cisdate, c_bourseflag, c_fundtype, f_backfareagio, c_bankno, c_subfundmethod, c_combcode, f_returnfare, c_contractno, c_captype, l_contractserialno, l_othercontractserialno, d_exportdate, f_transferfee, f_oriconfirmbalance, f_extendnetvalue, l_remitserialno, c_zhxtht, c_improperredeem, f_untradefare, f_untradeinfare, f_untradeoutfare, c_profitnottransfer, f_outprofit, f_inprofit, c_totrustcontractid, d_repurchasedate, f_chengoutbalance, c_exporting, jy_fundid, jy_contractbh, jy_custid, jy_tocustid, jy_fare, c_trustcontractid, f_taagencyfare, f_taregisterfare, d_cdate_jy, jy_adjust, jy_subfundid, jy_adjust1114, jy_cdate, c_bankacco, c_bankname, c_nameinbank, f_riskcapital, f_replenishriskcapital, c_fromfundcode, c_fromtrustcontractid, c_trustagencyno, l_rdmschserialno, f_redeemprofit, f_redeemproyieldrate, d_redeemprobigdate, d_redeemproenddate, c_changeownerincomebelong, l_midremitserialno, c_fromtype, c_iscycinvest, l_fromserialno, l_frominterestconserialno, c_changeownerinterest, c_msgsendflag, l_sharedelaydays, c_istodayconfirm, f_newincome, f_floorincome, l_incomeremitserialno, c_isnetting, l_bankserialno, c_subfundcode, f_chengoutsum, f_chengoutprofit, l_confirmtransserialno, c_shareadjustgzexpflag, c_issend, c_exchangeflag, yh_date_1112, l_banktocontractserialno, c_payfeetype, c_tobankno, c_tobankacco, c_tobankname, c_tonameinbank, c_tobanklinecode, c_tobankprovincecode, c_tobankcityno, l_assetseperateno, c_sharecserialno, c_redeemprincipaltype, work_date, c_businname) FROM stdin; +-- +-- Data for Name: s017_tdividenddetail; Type: TABLE DATA; Schema: sync; Owner: gregsun +-- +COPY s017_tdividenddetail (d_cdate, c_cserialno, d_regdate, d_date, c_fundacco, c_tradeacco, c_fundcode, c_sharetype, c_agencyno, c_netno, f_totalshare, f_unitprofit, f_totalprofit, f_tax, c_flag, f_realbalance, f_reinvestbalance, f_realshares, f_fare, d_lastdate, f_netvalue, f_frozenbalance, f_frozenshares, f_incometax, c_reserve, d_requestdate, c_shareclass, l_contractserialno, l_specprjserialno, f_investadvisorratio, f_transferfee, l_profitserialno, d_exportdate, c_custid, jy_fundid, jy_subfundid, jy_custid, jy_contractbh, jy_profitsn, jy_profitmoney, jy_capitalmoney, jy_adjust, c_reinvestnetvalue, f_transferbalance, l_relatedserialno, 
c_printoperator, c_printauditor, sys_id, work_date, load_date, f_remainshares) FROM stdin; +-- +-- Data for Name: s017_tfundday; Type: TABLE DATA; Schema: sync; Owner: gregsun +-- +COPY s017_tfundday (d_date, d_cdate, c_fundcode, c_todaystatus, c_status, f_netvalue, f_lastshares, f_lastasset, f_asucceed, f_rsucceed, c_vastflag, f_encashratio, f_changeratio, c_excessflag, f_subscriberatio, c_inputpersonnel, c_checkpersonnel, f_income, f_incomeratio, f_unassign, f_incomeunit, f_totalnetvalue, f_servicefare, f_assign, f_growthrate, c_netvalueflag, f_managefare, d_exportdate, c_flag, f_advisorfee, d_auditdate, f_extendnetvalue, f_extendtotalnetvalue, jy_fundcode, f_yearincomeratio, f_riskcapital, f_totalincome, f_agencyexpyearincomeration, f_agencyexpincomeunit, f_agencyexpincomeration, f_agencyexpincome, c_isspecflag, c_isasync, sys_id, work_date, load_date) FROM stdin; +-- +-- Data for Name: s017_tfundinfo; Type: TABLE DATA; Schema: sync; Owner: gregsun +-- +COPY s017_tfundinfo (c_fundcode, c_fundname, c_moneytype, c_managername, c_trusteecode, f_parvalue, f_issueprice, c_trusteeacco, d_issuedate, d_setupdate, f_maxbala, f_maxshares, f_minbala, f_minshares, l_elimitday, l_slimitday, l_alimitday, l_mincount, l_climitday, f_maxallot, f_maxredeem, c_fundcharacter, c_fundstatus, c_subscribemode, l_timelimit, l_subscribeunit, c_sharetypes, c_issuetype, f_factcollect, d_failuedate, f_allotratio, c_feeratiotype1, c_feeratiotype2, c_feetype, c_exceedpart, c_bonustype, c_forceredeem, c_interestdealtype, f_redeemfareratio, f_changefareratio, f_managerfee, f_right, c_property, d_evendate, f_totalbonus, c_changefree, c_reportcode, c_backfarecal, l_moneydate, l_netprecision, c_corpuscontent, f_corpusratio, c_farecaltype, l_liquidateallot, l_liquidateredeem, l_liquidatebonus, l_taspecialacco, c_fareprecision, d_issueenddate, c_farebelongasset, l_liquidatechange, l_liquidatefail, l_liquidateend, c_sharedetail, c_trusteebankname, c_boursetradeflag, c_fundenglishname, l_bankaccono, c_cleanflag, c_precision, c_upgradeflag, c_isdeal, c_farecltprecision, c_balanceprecision, c_shareprecision, c_bonusprecision, c_interestprecision, f_maxallotasset, f_maxallotshares, c_foreigntrustee, l_tnconfirm, c_rationallotstatus, f_trusteefee, c_fundacco, c_financetype, l_liquidatechangein, c_custname, c_identitytype, c_custtype, c_identityno, c_deductschemecode, c_customermanager, c_templateid, f_pr0, f_deductratio, c_farecalculatetype, c_saletype, l_maxcount, l_zhallotliqdays, l_zhredeemliqdays, f_liqasset, l_zhallotexpdays, l_zhredeemexpdays, c_limitmode, c_ordermode, c_acntlmtdealmode, l_informdays, c_allowpartredeem, c_fundendmode, f_fundendagio, c_minbalalimitisconfirm, c_gradetype, c_qryfreqtype, l_qrydaysltd, d_contractenddate, c_useinopenday, c_allotcalinterst, c_fundrisk, c_exitallot, c_subinterestcalc, c_earlyexitredfee, c_navexpfqy, l_navexpday, c_isbounded, c_earlyexitfeecalc, c_designdptid, c_fixeddividway, c_trusttype, f_maxnaturalmoney, c_projectid, c_trustclass, f_trustscale, c_structflag, c_priconveyflag, c_repurchasetype, c_iswholerepurchase, f_repurchaseminbala, c_repurchasemainbody, c_canelyrepurchase, c_earlybacktime, c_repurchaseprice, c_premiumpaymenttime, c_liquisource, l_period, c_canextensionflag, c_canelyliquidflag, c_trustassetdesc, c_returnside, c_returnpaymentway, c_returnbase, c_refepaymentway, c_refeside, c_refebase, f_warnline, f_stopline, f_collectinterest, f_durationinterest, f_investadvisorratio, c_bonusschema, c_guaranteetype, c_guaranteedesc, c_expectedyieldtype, f_minexpectedyield, 
f_maxexpectedyield, c_incomecycletype, f_incomecyclevalue, c_subaccotype, c_allotaccotype, c_fundtype, c_cootype, c_projecttype, c_investdirection, c_investdirectionfractionize, c_industrydetail, c_initeresttype, c_isextended, d_extenddate, c_dealmanagetype, c_investarea, c_projectcode, c_fundshortname, c_contractid, c_functype, c_specialbusintype, c_investindustry, c_managetype, c_area, c_risk, c_iscommitteedisscuss, c_structtype, c_commendplace, l_npmaxcount, c_client, c_clientcusttype, c_clientidtype, c_clientidno, c_clientbankname, c_clientaccono, c_clientaddress, c_clientzipcode, c_clientphoneno1, c_clientphoneno2, c_clientfax, c_beneficiary, c_collectbankname, c_collectbankno, c_collectaccountname, c_collectbankacco, c_keeperbankname, c_keeperaccountname, c_keeperaccountno, c_keepername, c_keepercorporation, c_keeperaddress, c_keeperzipcode, c_keeperphoneno1, c_keeperphoneno2, c_keeperfax, c_incomedistributetype, c_alarmline, c_stoplossline, f_investadvisorfee, c_investadvisordeduct, c_capitalacco, c_stockacconame, c_stocksalesdept, c_thirdpartybankno, c_thirdpartybankname, c_thirdpartyacconame, c_thirdpartyaccono, c_investadvisor, c_investadvisorbankno, c_investadvisorbankname, c_investadvisoracconame, c_investadvisoraccono, c_investadvisorcorporation, c_investadvisoraddress, c_investadvisorzipcode, c_investadvisorphoneno1, c_investadvisorphoneno2, c_investadvisorfax, c_authdelegate, c_loanfinanceparty, c_loanfinancepartycorporation, c_loanfinancepartyaddress, c_loanfinancepartyzipcode, c_loanfinancepartyphoneno1, c_loanfinancepartyphoneno2, c_loanfinancepartyfax, c_loaninteresttype, f_loaninterestrate, f_loanduration, c_loanmanagebank, f_loanmanagefee, f_loanfinancecost, f_creditattornduration, f_creditattorninterestduration, f_creditattornprice, f_billattornduration, f_billattorninterestduration, f_billattornprice, c_stkincfincparty, c_stkincfincpartycorporation, c_stkincfincpartyaddress, c_stkincfincpartyzipcode, c_stkincfincpartyphoneno1, c_stkincfincpartyphoneno2, c_stkincfincpartyfax, c_stkincincomeannualizedrate, c_stkincinteresttype, f_stkincattornprice, f_stkincattornduration, f_stkincbail, f_stkincfinccost, c_stkincmemo1, c_stkincmemo2, c_debtincfincparty, c_debtincfincpartycorporation, c_debtincfincpartyaddress, c_debtincfincpartyzipcode, c_debtincfincpartyphoneno1, c_debtincfincpartyphoneno2, c_debtincfincpartyfax, c_debtincincomerate, c_debtincinteresttype, f_debtincattornprice, f_debtincattornduration, f_debtincbail, f_debtincfinccost, c_debtincmemo1, c_othinvfincparty, c_othinvfincpartycorporation, c_othinvfincpartyaddress, c_othinvfincpartyzipcode, c_othinvfincpartyphoneno1, c_othinvfincpartyphoneno2, c_othinvfincpartyfax, f_othinvfinccost, c_othinvmemo1, c_othinvmemo2, c_othinvmemo3, c_banktrustcoobank, c_banktrustproductname, c_banktrustproductcode, c_banktrustundertakingletter, c_trustgovgovname, c_trustgovprojecttype, c_trustgovcootype, c_trustgovoptype, c_housecapital, c_houseispe, c_tradetype, c_businesstype, c_trustname, c_trustidtype, c_trustidno, d_trustidvaliddate, c_trustbankname, c_trustaccounttype, c_trustnameinbank, c_zhtrustbankname, c_zhtrustbankacco, c_issecmarket, c_fundoperation, c_trustmanager, c_tradeother, c_watchdog, c_memo, c_benefittype, c_redeemaccotype, c_bonusaccotype, c_fundendaccotype, c_collectfailaccotype, d_lastmodifydate, c_shareholdlimtype, c_redeemtimelimtype, c_isprincipalrepayment, c_principalrepaymenttype, l_interestyeardays, l_incomeyeardays, c_capuseprovcode, c_capusecitycode, c_capsourceprovcode, c_banktrustcoobankcode, 
c_banktrustisbankcap, c_trusteefeedesc, c_managefeedesc, c_investfeedesc, f_investadvisordeductratio, c_investdeductdesc, c_investadvisor2, f_investadvisorratio2, f_investadvisordeductratio2, c_investfeedesc2, c_investdeductdesc2, c_investadvisor3, f_investadvisorratio3, f_investadvisordeductratio3, c_investfeedesc3, c_investdeductdesc3, c_profitclassdesc, c_deductratiodesc, c_redeemfeedesc, l_defaultprecision, c_allotfeeaccotype, c_isposf, c_opendaydesc, c_actualmanager, c_subindustrydetail, c_isbankleading, c_subprojectcode, c_iscycleinvest, f_liquidationinterest, c_liquidationinteresttype, c_isbonusinvestfare, c_subfeeaccotype, c_redeemfeeaccotype, c_fundrptcode, c_ordertype, c_flag, c_allotliqtype, l_sharelimitday, c_iseverydayopen, c_tradebynetvalue, c_isstage, c_specbenfitmemo, d_effectivedate, c_issueendflag, c_resharehasrdmfee, jy_fundcode, jy_fundid, jy_subfundid, jy_dptid, c_iswealth, c_interestcalctype, c_allotinterestcalctype, c_isriskcapital, c_fundstatus_1225, c_isincomeeverydaycalc, c_isredeemreturninterest, c_isrefundrtninterest, d_estimatedsetupdate, f_estimatedfactcollect, c_isfinancialproducts, c_fundredeemtype, c_trademanualinput, f_clientmanageration, c_profitclassadjustment, c_mainfundcode, c_contractsealoff, c_permitnextperiod, c_preprofitschematype, c_fundredeemprofit, f_incomeration, c_incomecalctype, c_allocateaccoid, c_outfundcode, c_matchprofitclass, l_lastdays, c_contractprofitflag, c_agencysaleliqtype, l_delaydays, c_profitclassperiod, c_reportshowname, c_currencyincometype, c_beforeredeemcapital, c_contractversion, c_confirmacceptedflag, c_selectcontract, f_schemainterest, c_riskgrade, l_sharedelaydays, l_reservationdays, c_transfertype, c_schemavoluntarily, l_schemadetaildata, c_schemadetailtype, c_iscurrencyconfirm, c_allowmultiaccobank, d_capverif, c_templatetype, c_capitalprecision, c_fundno, c_profittype, d_paydate, d_shelvedate, d_offshelvedate, c_schemabegindatetype, l_schemabegindatedays, c_isautoredeem, c_isnettingrequest, c_issuingquotedtype, d_firstdistributedate, c_bonusfrequency, c_interestbigdatetype, c_gzdatatype, f_allotfareratio, f_subfareratio, c_begindatebeyond, c_profitnotinterest, c_setuplimittype, c_limitredeemtype, c_bonusfrequencytype, c_rfaccotype, c_capitalfee, c_exceedflag, c_enableecd, c_isfixedtrade, c_profitcaltype, f_ominbala, f_stepbala, c_remittype, c_interestcycle, c_repayguaranteecopy, c_repaytype, c_fundprofitdes, c_fundinfodes, c_riskeval, l_maxage, l_minage, c_fundriskdes, mig_l_assetid, l_faincomedays, c_producttype, c_otherbenefitproducttype, c_isotc, c_iseverydayprovision, c_incometogz, c_setuptransfundacco, c_issuefeeownerrequired, c_calcinterestbeforeallot, c_islimit300wnature, c_allowoverflow, c_trustfundtype, c_disclose, c_collectaccoid, c_isissuebymarket, c_setupstatus, c_isentitytrust, l_liquidatesub, c_incomeassigndesc, c_keeporgancode, d_defaultbegincacldate, c_zcbborrower, c_zcbborroweridno, c_zcbremittype, c_registcode, c_redeeminvestaccotype, c_bonusinvestaccotype, c_isabsnotopentrade, l_interestdiffdays, c_outfundstatus, c_reqsyntype, c_allredeemtype, c_isabsopentrade, c_funddesc, l_allotliquidays, l_subliquidays, c_autoupcontractenddaterule, c_fcsubaccotype, c_fcallotaccotype, c_fcredeemaccotype, c_fcbonusaccotype, c_captranslimitflag, c_redeemprincipaltype, c_interestcalcdealtype, c_collectconfirm, d_oldcontractenddate, c_tnvaluation, c_contractendnotify, c_rdmfeebase, c_exceedcfmratio, c_allowallotcustlimittype, c_yeardayscalctype, c_iscompoundinterest, c_dbcfm, c_limitaccountstype, c_cycleinvestrange, 
c_tncheckmode, c_enableearlyredeem, c_ispurceandredeemset, c_perfpaydealtype, c_allowappend, c_allowredeem, c_inputstatus, c_profitbalanceadjust, c_profitperiodadjust, c_autogeneratecontractid, c_transferneednetting, underwrite, undertook, undertake, c_issmsend, d_contractshortenddate, d_contractlongenddate, c_assetseperatefundcodesrc, f_averageprofit, c_currencycontractlimittype, l_profitlastdays, l_liquidationlastdays, c_arlimitincludeallreq, c_reqfundchange, c_dealnetvaluerule, c_contractdealtype, c_bonusplanbeginday, c_contractbalaupright, c_isneedinterestrate, c_isneedexcessratio, c_riskgraderemark, c_lossprobability, c_suitcusttype, c_createbonusschema, d_closedenddate, c_timelimitunit, c_exceedredeemdealtype, c_profitperiod, l_navgetintervaldays, load_date, sys_id, work_date, c_limittransfertype, c_transaccotype, c_incometaxbase, c_isredeemfareyearcalc, c_otherbenefitinputmode, c_aftdefaultinterestdeducttype, c_allowzerobalanceconfirm, c_incomejoinassign, l_liquidateliqbonus, c_predefaultinterestdeducttype, c_worktype, c_defaultinterestadduptype, c_issupportsubmode, f_expectedyield, c_recodecode, l_liquidatetransfer, c_ispayincometax, c_groupmainfundcode, c_redeemfeesplittype, c_capitalfromcrmorta, c_needcalcdefaultinterest, c_issuercode, l_redeemfareyeardays, c_floatyield, l_minriskscore, c_islocalmoneytypecollect) FROM stdin; +-- +-- Data for Name: s017_tsharecurrents_all; Type: TABLE DATA; Schema: sync; Owner: gregsun +-- +COPY s017_tsharecurrents_all (d_cdate, c_cserialno, c_businflag, d_requestdate, c_requestno, c_custno, c_fundacco, c_tradeacco, c_fundcode, c_sharetype, c_agencyno, c_netno, f_occurshares, f_occurbalance, f_lastshares, f_occurfreeze, f_lastfreezeshare, c_summary, f_gainbalance, d_sharevaliddate, c_bonustype, c_custtype, c_shareclass, c_bourseflag, d_exportdate, l_contractserialno, c_issend, c_sendbatch, work_date) FROM stdin; +-- +-- Data for Name: s017_ttrustclientinfo_all; Type: TABLE DATA; Schema: sync; Owner: gregsun +-- +COPY s017_ttrustclientinfo_all (c_custno, c_custtype, c_custname, c_shortname, c_helpcode, c_identitytype, c_identityno, c_zipcode, c_address, c_phone, c_faxno, c_mobileno, c_email, c_sex, c_birthday, c_vocation, c_education, c_income, c_contact, c_contype, c_contno, c_billsendflag, c_callcenter, c_internet, c_secretcode, c_nationality, c_cityno, c_lawname, c_shacco, c_szacco, c_broker, f_agio, c_memo, c_reserve, c_corpname, c_corptel, c_specialcode, c_actcode, c_billsendpass, c_addressinvalid, d_appenddate, d_backdate, c_invalidaddress, c_backreason, c_modifyinfo, c_riskcontent, l_querydaysltd, c_customermanager, c_custproperty, c_custclass, c_custright, c_daysltdtype, d_idvaliddate, l_custgroup, c_recommender, c_recommendertype, d_idnovaliddate, c_organcode, c_othercontact, c_taxregistno, c_taxidentitytype, c_taxidentityno, d_legalvaliddate, c_shareholder, c_shareholderidtype, c_shareholderidno, d_holderidvaliddate, c_leader, c_leaderidtype, c_leaderidno, d_leadervaliddate, c_managercode, c_linemanager, c_clientinfoid, c_provincecode, c_countytown, c_phone2, c_clienttype, c_agencyno, c_industrydetail, c_isqualifiedcust, c_industryidentityno, c_lawidentitytype, c_lawidentityno, d_lawidvaliddate, d_conidvaliddate, c_conisrevmsg, c_conmobileno, c_conmoaddress, c_conzipcode, c_conphone1, c_conphone2, c_conemail, c_confaxno, c_incomsource, c_zhidentityno, c_zhidentitytype, c_eastcusttype, jy_custid, c_idtype201201030, c_emcontact, c_emcontactphone, c_instiregaddr, c_regcusttype, c_riskgrade, c_riskgraderemark, d_idvaliddatebeg, 
d_industryidvaliddatebeg, d_industryidvaliddate, c_incomesourceotherdesc, c_vocationotherdesc, c_businscope, d_conidvaliddatebeg, d_lawidvaliddatebeg, c_regmoneytype, f_regcapital, c_orgtype, c_contrholderno, c_contrholdername, c_contrholderidtype, c_contrholderidno, d_contrholderidvalidatebeg, d_contrholderidvalidate, c_responpername, c_responperidtype, c_responperidno, d_responperidvalidatebeg, d_responperidvalidate, c_lawphone, c_contrholderphone, c_responperphone, c_consex, c_conrelative, l_riskserialno, c_convocation, c_iscustrelated, c_businlicissuorgan, c_manageridno, c_manageridtype, c_managername, d_companyregdate, c_electronicagreement, c_householdregno, c_guardianrela, c_guardianname, c_guardianidtype, c_guardianidno, c_isfranchisingidstry, c_franchidstrybusinlic, c_workunittype, c_normalresidaddr, c_domicile, c_finainvestyears, c_parentidtype, c_parentidno, c_videono, c_bonustype, d_retirementdate, c_issendbigcustbill, c_idaddress, c_isproinvestor, c_sendkfflag, c_sendkfcause, c_sendsaflag, c_sendsacause, c_custrelationchannel, c_companytype, c_businlocation, c_custodian, d_elecsigndate, d_riskinputdate, c_circno, c_financeindustrydetail, c_outclientinfoid, d_duediligencedate, c_duediligencestatus, c_inputstatus, c_address2, c_reportcusttype, c_reportcusttypedetail, c_custsource, work_date) FROM stdin; +-- +-- Data for Name: sys_stat_error_log; Type: TABLE DATA; Schema: sync; Owner: gregsun +-- +COPY sys_stat_error_log (proc_name, tab_level, step_no, step_desc, begin_time, end_time, workdate, row_num, elapsed, all_elapsed, sql_code, sql_errm) FROM stdin; +-- +-- Name: ks0_fund_base_26 pk_ks0_fund_base_26; Type: CONSTRAINT; Schema: sync; Owner: gregsun +-- +ALTER TABLE ONLY ks0_fund_base_26 + ADD CONSTRAINT pk_ks0_fund_base_26 PRIMARY KEY (id1, acc_cd, ins_cd); +-- +-- PostgreSQL database dump complete +-- +create table newtab as + SELECT A.C_FUNDCODE, + A.C_FUNDNAME, + A.C_FUNDACCO, + A.F_NETVALUE, + A.C_AGENCYNAME, + A.C_CUSTNAME, + A.D_DATE, + A.D_CDATE, + A.F_CONFIRMBALANCE, + A.F_TRADEFARE, + A.F_CONFIRMSHARES, + ABS(NVL(B.F_OCCURBALANCE, A.F_RELBALANCE)) F_RELBALANCE, + A.F_INTEREST, + NVL(DECODE(B.C_BUSINFLAG, + '02', + '申购', + '50', + '申购', + '74', + '申购', + '03', + '赎回'), + DECODE(A.C_BUSINFLAG, + '01', + '认购', + '02', + '申购', + '03', + '赎回', + '53', + '强制赎回', + '50', + '产品成立')) AS INFO, + null, + SYSDATE AS LOAD_DATE + FROM (SELECT A.C_FUNDCODE, + C.C_FUNDNAME, + A.C_FUNDACCO, + FUNC_GETLASTNETVALUE(A.C_FUNDCODE, A.D_CDATE::date) F_NETVALUE, + (SELECT C_AGENCYNAME + FROM S017_TAGENCYINFO + WHERE A.C_AGENCYNO = C_AGENCYNO) C_AGENCYNAME, + B.C_CUSTNAME, + TO_CHAR(A.D_DATE, 'yyyy-mm-dd') D_DATE, + TO_CHAR(A.D_CDATE, 'yyyy-mm-dd') D_CDATE, + DECODE(A.C_BUSINFLAG, + '03', + A.F_CONFIRMBALANCE + A.F_TRADEFARE, + '53', + A.F_CONFIRMBALANCE + A.F_TRADEFARE, + A.F_CONFIRMBALANCE) F_CONFIRMBALANCE, + A.F_TRADEFARE, + A.F_CONFIRMSHARES, + DECODE(A.C_BUSINFLAG, + '03', + A.F_CONFIRMBALANCE, + '53', + A.F_CONFIRMBALANCE, + A.F_CONFIRMBALANCE - A.F_TRADEFARE) F_RELBALANCE, + A.F_INTEREST, + A.C_BUSINFLAG, + A.C_CSERIALNO + FROM (SELECT D_DATE, + C_AGENCYNO, + DECODE(C_BUSINFLAG, + '03', + DECODE(C_IMPROPERREDEEM, + '3', + '100', + '5', + '100', + C_BUSINFLAG), + C_BUSINFLAG) C_BUSINFLAG, + C_FUNDACCO, + D_CDATE, + C_FUNDCODE, + F_CONFIRMBALANCE, + F_CONFIRMSHARES, + C_REQUESTNO, + F_TRADEFARE, + C_TRADEACCO, + F_INTEREST, + C_CSERIALNO, + L_SERIALNO, + L_CONTRACTSERIALNO + FROM S017_TCONFIRM_ALL T3 + UNION + SELECT D_DATE, + C_AGENCYNO, + '02' C_BUSINFLAG, + C_FUNDACCO, + D_LASTDATE 
AS D_CDATE, + C_FUNDCODE, + F_REINVESTBALANCE F_CONFIRMBALANCE, + F_REALSHARES F_CONFIRMSHARES, + '' C_REQUESTNO, + 0 F_TRADEFARE, + C_TRADEACCO, + 0 F_INTEREST, + C_CSERIALNO, + 0 L_SERIALNO, + L_CONTRACTSERIALNO + FROM S017_TDIVIDENDDETAIL T1 + /*WHERE T1.C_FLAG = '0'*/) A + LEFT JOIN S017_TACCONET TACN + ON A.C_TRADEACCO = TACN.C_TRADEACCO + LEFT JOIN (SELECT * FROM S017_TACCOINFO WHERE C_ACCOUNTTYPE = 'A') X + ON A.C_FUNDACCO = X.C_FUNDACCO + LEFT JOIN S017_TTRUSTCLIENTINFO_ALL B + ON X.C_CUSTNO = B.C_CUSTNO + INNER JOIN S017_TFUNDINFO C + ON A.C_FUNDCODE = C.C_FUNDCODE + ) A + LEFT JOIN (SELECT ST1.D_CDATE, + ST1.C_FUNDCODE, + ST1.F_OCCURBALANCE, + ST1.C_BUSINFLAG, + ST1.C_FUNDACCO, + ST1.C_CSERIALNO + FROM S017_TSHARECURRENTS_ALL ST1 + -- WHERE ST1.C_BUSINFLAG <> '74' + UNION ALL + SELECT ST2.D_DATE AS D_CDATE, + ST2.C_FUNDCODE, + ST2.F_TOTALPROFIT AS F_OCCURBALANCE, + '74' AS C_BUSINFLAG, + ST2.C_FUNDACCO, + ST2.C_CSERIALNO + FROM S017_TDIVIDENDDETAIL ST2 + -- WHERE ST2.C_FLAG = '0' + ) B + ON A.C_FUNDCODE = B.C_FUNDCODE + /* + AND A.C_FUNDACCO = B.C_FUNDACCO + AND TO_DATE(A.D_CDATE, 'YYYY-MM-DD') = B.D_CDATE + AND A.C_CSERIALNO = B.C_CSERIALNO*/; +DROP SCHEMA sync cascade; +NOTICE: drop cascades to 16 other objects +DETAIL: drop cascades to function func_getlastnetvalue(varchar2,date) +drop cascades to table b03_ts_remetrade +drop cascades to table b03_ts_remetrade_bak +drop cascades to table ks0_fund_base_26 +drop cascades to table p +drop cascades to table s017_taccoinfo +drop cascades to table s017_tacconet +drop cascades to table s017_tagencyinfo +drop cascades to table s017_tconfirm_all +drop cascades to table s017_tdividenddetail +drop cascades to table s017_tfundday +drop cascades to table s017_tfundinfo +drop cascades to table s017_tsharecurrents_all +drop cascades to table s017_ttrustclientinfo_all +drop cascades to table sys_stat_error_log +drop cascades to table newtab diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule index 58485cf1..036a73c3 100644 --- a/src/test/regress/parallel_schedule +++ b/src/test/regress/parallel_schedule @@ -164,3 +164,6 @@ test: xl_primary_key xl_foreign_key xl_distribution_column_types xl_alter_table # This runs TBase specific tests test: tbase_explain + +test: redistribute_custom_types pl_bugs +test: nestloop_by_shard diff --git a/src/test/regress/sql/pl_bugs.sql b/src/test/regress/sql/pl_bugs.sql new file mode 100644 index 00000000..0059dc90 --- /dev/null +++ b/src/test/regress/sql/pl_bugs.sql @@ -0,0 +1,2052 @@ +CREATE SCHEMA sync; + +SET search_path = sync, pg_catalog; +set enable_oracle_compatible to on; + +-- +-- Name: func_getlastnetvalue(varchar2, date); Type: FUNCTION; Schema: sync; Owner: gregsun +-- + +CREATE FUNCTION func_getlastnetvalue(v_fundcode varchar2, v_cdate date) RETURNS numeric + LANGUAGE plpgsql + AS $$ + declare v_netvalue text; +begin + begin + select p1 + into v_netvalue + from p + limit 1; + exception + when no_data_found then + return 1; + + end; + return 1; +end; + $$; + + +-- +-- Name: sp_b03_ts_remetrade(varchar2, varchar2, varchar2, varchar2); Type: PROCEDURE; Schema: sync; Owner: gregsun +-- + +CREATE function sp_b03_ts_remetrade(p_start_date varchar2, p_work_date varchar2, INOUT err_num varchar2 DEFAULT 0, INOUT err_msg varchar2 DEFAULT NULL::varchar2) + LANGUAGE plpgsql + AS $$ + declare + V_START_DATE DATE; + V_END_DATE DATE; + V_WORK_DATE DATE; + V_SP_NAME VARCHAR(30); + V_TAB_LEVEL VARCHAR(20); + V_LOG_STEP_NO VARCHAR(20); + V_LOG_BEGIN_TIME DATE := SYSDATE; + 
V_LOG_END_TIME DATE; + V_LOG_ROWCOUNT NUMBER := 0; + V_ELAPSED NUMBER; + V_ALL_ELAPSED NUMBER; + V_STEP_DESC sys_stat_error_log.STEP_DESC%TYPE; +BEGIN + + V_SP_NAME := 'SP_B03_TS_REMETRADE'; + V_TAB_LEVEL := 'B'; + + IF P_START_DATE IS NULL + THEN + RAISE EXCEPTION 'P_START_DATE IS NULL!'; + ELSE + V_START_DATE := TO_DATE(P_START_DATE, 'YYYY-MM-DD'); + END IF; + IF P_WORK_DATE IS NULL + THEN + RAISE EXCEPTION 'P_WORK_DATE IS NULL!'; + ELSE + V_WORK_DATE := TO_DATE(P_WORK_DATE, 'YYYY-MM-DD'); + END IF; + IF P_WORK_DATE IS NULL + THEN + RAISE EXCEPTION 'P_WORK_DATE IS NULL!'; + ELSE + V_END_DATE := TO_DATE(P_WORK_DATE, 'YYYY-MM-DD'); + END IF; + + + + V_LOG_STEP_NO := 'STEP_01'; + V_STEP_DESC := '清除目标表数据'; + V_LOG_BEGIN_TIME := SYSDATE; + V_LOG_ROWCOUNT := NULL; + CALL SP_PUB_INSERT_LOG_DATE(V_SP_NAME + , + V_TAB_LEVEL + , + V_LOG_STEP_NO + , + V_STEP_DESC + , + V_LOG_BEGIN_TIME + , + V_LOG_END_TIME + , + V_WORK_DATE + , + V_LOG_ROWCOUNT + , + V_ELAPSED + , + V_ALL_ELAPSED); + + CALL SP_PUB_DEL_TB('B03_TS_REMETRADE'); + /*DELETE FROM B03_TS_REMETRADE Y + WHERE Y.ENDDATE >=V_START_DATE;*/ + + GET DIAGNOSTICS V_LOG_ROWCOUNT = ROW_COUNT; + + + + CALL SP_PUB_UPDATE_LOG_DATE(V_SP_NAME + , + V_TAB_LEVEL + , + V_LOG_STEP_NO + , + V_LOG_BEGIN_TIME + , + SYSDATE::DATE + , + V_WORK_DATE + , + V_LOG_ROWCOUNT + , + (SYSDATE - V_LOG_BEGIN_TIME)::NUMERIC + , + V_ALL_ELAPSED); + + V_LOG_STEP_NO := 'STEP_02'; + V_STEP_DESC := '插入目标表B03_TS_REMETRADE'; + V_LOG_BEGIN_TIME := SYSDATE; + V_LOG_ROWCOUNT := NULL; + CALL SP_PUB_INSERT_LOG_DATE(V_SP_NAME, + V_TAB_LEVEL, + V_LOG_STEP_NO, + V_STEP_DESC, + V_LOG_BEGIN_TIME, + V_LOG_END_TIME, + V_WORK_DATE, + V_LOG_ROWCOUNT, + V_ELAPSED, + V_ALL_ELAPSED); + + INSERT INTO B03_TS_REMETRADE + (C_FUNDCODE, + C_FUNDNAME, + C_FUNDACCO, + F_NETVALUE, + C_AGENCYNAME, + C_CUSTNAME, + D_DATE, + D_CDATE, + F_CONFIRMBALANCE, + F_TRADEFARE, + F_CONFIRMSHARES, + F_RELBALANCE, + F_INTEREST, + INFO, + WORK_DATE, + LOAD_DATE) + SELECT A.C_FUNDCODE, + A.C_FUNDNAME, + A.C_FUNDACCO, + A.F_NETVALUE, + A.C_AGENCYNAME, + A.C_CUSTNAME, + A.D_DATE, + A.D_CDATE, + A.F_CONFIRMBALANCE, + A.F_TRADEFARE, + A.F_CONFIRMSHARES, + ABS(NVL(B.F_OCCURBALANCE, A.F_RELBALANCE)) F_RELBALANCE, + A.F_INTEREST, + NVL(DECODE(B.C_BUSINFLAG, + '02', + '申购', + '50', + '申购', + '74', + '申购', + '03', + '赎回'), + DECODE(A.C_BUSINFLAG, + '01', + '认购', + '02', + '申购', + '03', + '赎回', + '53', + '强制赎回', + '50', + '产品成立')) AS INFO, + V_WORK_DATE, + SYSDATE AS LOAD_DATE + FROM (SELECT A.C_FUNDCODE, + C.C_FUNDNAME, + A.C_FUNDACCO, + FUNC_GETLASTNETVALUE(A.C_FUNDCODE, A.D_CDATE) F_NETVALUE, + (SELECT C_AGENCYNAME + FROM S017_TAGENCYINFO + WHERE A.C_AGENCYNO = C_AGENCYNO) C_AGENCYNAME, + B.C_CUSTNAME, + TO_CHAR(A.D_DATE, 'yyyy-mm-dd') D_DATE, + TO_CHAR(A.D_CDATE, 'yyyy-mm-dd') D_CDATE, + DECODE(A.C_BUSINFLAG, + '03', + A.F_CONFIRMBALANCE + A.F_TRADEFARE, + '53', + A.F_CONFIRMBALANCE + A.F_TRADEFARE, + A.F_CONFIRMBALANCE) F_CONFIRMBALANCE, + A.F_TRADEFARE, + A.F_CONFIRMSHARES, + DECODE(A.C_BUSINFLAG, + '03', + A.F_CONFIRMBALANCE, + '53', + A.F_CONFIRMBALANCE, + A.F_CONFIRMBALANCE - A.F_TRADEFARE) F_RELBALANCE, + A.F_INTEREST, + A.C_BUSINFLAG, + A.C_CSERIALNO + FROM (SELECT D_DATE, + C_AGENCYNO, + DECODE(C_BUSINFLAG, + '03', + DECODE(C_IMPROPERREDEEM, + '3', + '100', + '5', + '100', + C_BUSINFLAG), + C_BUSINFLAG) C_BUSINFLAG, + C_FUNDACCO, + D_CDATE, + C_FUNDCODE, + F_CONFIRMBALANCE, + F_CONFIRMSHARES, + C_REQUESTNO, + F_TRADEFARE, + C_TRADEACCO, + F_INTEREST, + C_CSERIALNO, + L_SERIALNO, + L_CONTRACTSERIALNO + FROM 
S017_TCONFIRM_ALL T3 + UNION + SELECT D_DATE, + C_AGENCYNO, + '02' C_BUSINFLAG, + C_FUNDACCO, + D_LASTDATE AS D_CDATE, + C_FUNDCODE, + F_REINVESTBALANCE F_CONFIRMBALANCE, + F_REALSHARES F_CONFIRMSHARES, + '' C_REQUESTNO, + 0 F_TRADEFARE, + C_TRADEACCO, + 0 F_INTEREST, + C_CSERIALNO, + 0 L_SERIALNO, + L_CONTRACTSERIALNO + FROM S017_TDIVIDENDDETAIL T1 + WHERE T1.C_FLAG = '0') A + LEFT JOIN S017_TACCONET TACN + ON A.C_TRADEACCO = TACN.C_TRADEACCO + LEFT JOIN (SELECT * FROM S017_TACCOINFO WHERE C_ACCOUNTTYPE = 'A') X + ON A.C_FUNDACCO = X.C_FUNDACCO + LEFT JOIN S017_TTRUSTCLIENTINFO_ALL B + ON X.C_CUSTNO = B.C_CUSTNO + INNER JOIN S017_TFUNDINFO C + ON A.C_FUNDCODE = C.C_FUNDCODE + ) A + LEFT JOIN (SELECT ST1.D_CDATE, + ST1.C_FUNDCODE, + ST1.F_OCCURBALANCE, + ST1.C_BUSINFLAG, + ST1.C_FUNDACCO, + ST1.C_CSERIALNO + FROM S017_TSHARECURRENTS_ALL ST1 + WHERE ST1.C_BUSINFLAG <> '74' + UNION ALL + SELECT ST2.D_DATE AS D_CDATE, + ST2.C_FUNDCODE, + ST2.F_TOTALPROFIT AS F_OCCURBALANCE, + '74' AS C_BUSINFLAG, + ST2.C_FUNDACCO, + ST2.C_CSERIALNO + FROM S017_TDIVIDENDDETAIL ST2 + WHERE ST2.C_FLAG = '0') B + ON A.C_FUNDCODE = B.C_FUNDCODE + AND A.C_FUNDACCO = B.C_FUNDACCO + AND TO_DATE(A.D_CDATE, 'YYYY-MM-DD') = B.D_CDATE + AND A.C_CSERIALNO = B.C_CSERIALNO; + GET DIAGNOSTICS V_LOG_ROWCOUNT = ROW_COUNT; + + + CALL SP_PUB_UPDATE_LOG_DATE(V_SP_NAME, + V_TAB_LEVEL, + V_LOG_STEP_NO, + V_LOG_BEGIN_TIME, + SYSDATE, + V_WORK_DATE, + V_LOG_ROWCOUNT, + (SYSDATE - V_LOG_BEGIN_TIME)::NUMERIC, + V_ALL_ELAPSED); + ERR_NUM := 0; + ERR_MSG := 'NORMAL,SUCCESSFUL COMPLETION'; +END; + $$; + + +-- +-- Name: sp_pub_del_tb(varchar2); Type: PROCEDURE; Schema: sync; Owner: gregsun +-- + +CREATE PROCEDURE sp_pub_del_tb(p_tab_name varchar2) + LANGUAGE plpgsql + AS $$ + declare n_sql varchar2(4000); +begin + + n_sql := 'truncate table '||p_tab_name; + + execute immediate n_sql; +exception + when no_data_found then null; + when others then raise; +end ; + $$; + + +-- +-- Name: sp_pub_insert_log_date(varchar2, varchar2, varchar2, varchar2, date, date, date, numeric, numeric, numeric); Type: PROCEDURE; Schema: sync; Owner: gregsun +-- + +CREATE PROCEDURE sp_pub_insert_log_date(p_in_proc_name varchar2, p_in_tab_level varchar2, p_in_step_no varchar2, p_in_step_desc varchar2, p_in_begin_time date, p_in_end_time date, p_in_work_date date, p_in_row_num numeric, p_in_elapsed numeric, p_in_all_elapsed numeric) + LANGUAGE plpgsql + AS $$ + declare + BEGIN + INSERT INTO SYNC.SYS_STAT_ERROR_LOG + (PROC_NAME + ,TAB_LEVEL + ,STEP_NO + ,STEP_DESC + ,BEGIN_TIME + ,END_TIME + ,WORKDATE + ,ROW_NUM + ,ELAPSED + ,ALL_ELAPSED) + VALUES + (P_IN_PROC_NAME + ,P_IN_TAB_LEVEL + ,P_IN_STEP_NO + ,P_IN_STEP_DESC + ,P_IN_BEGIN_TIME + ,P_IN_END_TIME + ,P_IN_WORK_DATE + ,P_IN_ROW_NUM + ,P_IN_ELAPSED + ,P_IN_ALL_ELAPSED); + COMMIT; + END ; + $$; + +-- +-- Name: sp_pub_update_log_date(varchar2, varchar2, varchar2, date, date, date, numeric, numeric, numeric); Type: PROCEDURE; Schema: sync; Owner: gregsun +-- + +CREATE PROCEDURE sp_pub_update_log_date(p_in_proc_name varchar2, p_in_tab_level varchar2, p_in_step_no varchar2, p_in_begin_time date, p_in_end_time date, p_in_work_date date, p_in_row_num numeric, p_in_elapsed numeric, p_in_all_elapsed numeric) + LANGUAGE plpgsql + AS $$ BEGIN + UPDATE SYNC.SYS_STAT_ERROR_LOG + SET END_TIME = P_IN_END_TIME + ,ROW_NUM = P_IN_ROW_NUM + ,ELAPSED = P_IN_ELAPSED + ,ALL_ELAPSED = P_IN_ALL_ELAPSED + WHERE PROC_NAME = P_IN_PROC_NAME + AND TAB_LEVEL = P_IN_TAB_LEVEL + AND STEP_NO = P_IN_STEP_NO + AND BEGIN_TIME = P_IN_BEGIN_TIME + AND 
WORKDATE = P_IN_WORK_DATE; + COMMIT; + END ; + $$; + + +SET default_tablespace = ''; + +SET default_with_oids = false; + +-- +-- Name: b03_ts_remetrade; Type: TABLE; Schema: sync; Owner: gregsun +-- + +CREATE TABLE b03_ts_remetrade ( + c_fundcode character varying(500) NOT NULL, + c_fundname character varying(4000), + c_fundacco character varying(30), + f_netvalue numeric(16,2), + c_agencyname character varying(4000), + c_custname character varying(4000), + d_date character varying(100), + d_cdate character varying(100), + f_confirmbalance numeric(16,2), + f_tradefare numeric(16,2), + f_confirmshares numeric(16,2), + f_relbalance numeric(16,2), + f_interest numeric(16,2), + info character varying(500), + work_date timestamp(0) without time zone, + load_date timestamp(0) without time zone +) +DISTRIBUTE BY SHARD (c_fundcode) to GROUP default_group; + + +-- +-- Name: b03_ts_remetrade_bak; Type: TABLE; Schema: sync; Owner: gregsun +-- + +CREATE TABLE b03_ts_remetrade_bak ( + c_fundcode character varying(500) NOT NULL, + c_fundname character varying(4000), + c_fundacco character varying(30), + f_netvalue numeric(16,2), + c_agencyname character varying(4000), + c_custname character varying(4000), + d_date character varying(100), + d_cdate character varying(100), + f_confirmbalance numeric(16,2), + f_tradefare numeric(16,2), + f_confirmshares numeric(16,2), + f_relbalance numeric(16,2), + f_interest numeric(16,2), + info character varying(500), + work_date timestamp(0) without time zone, + load_date timestamp(0) without time zone +) +DISTRIBUTE BY SHARD (c_fundcode) to GROUP default_group; + + +-- +-- Name: ks0_fund_base_26; Type: TABLE; Schema: sync; Owner: gregsun +-- + +CREATE TABLE ks0_fund_base_26 ( + id1 numeric(48,0) NOT NULL, + acc_cd character varying(500) NOT NULL, + tdate timestamp(0) without time zone NOT NULL, + ins_cd character varying(500) NOT NULL, + cost_price_asset numeric(30,8), + pcol character varying(50) +) +DISTRIBUTE BY SHARD (id1) to GROUP default_group; + +-- +-- Name: p; Type: TABLE; Schema: sync; Owner: gregsun +-- + +CREATE TABLE p ( + p1 text, + p2 text +) +DISTRIBUTE BY HASH (p1); + + +-- +-- Name: s017_taccoinfo; Type: TABLE; Schema: sync; Owner: gregsun +-- + +CREATE TABLE s017_taccoinfo ( + c_custno character varying(30) NOT NULL, + c_accounttype character(1), + c_fundacco character varying(30), + c_agencyno character(3), + c_netno character varying(30), + c_childnetno character varying(30), + d_opendate timestamp(0) without time zone, + d_lastmodify timestamp(0) without time zone, + c_accostatus character(1), + c_freezecause character(1), + d_backdate timestamp(0) without time zone, + l_changetime numeric(10,0), + d_firstinvest timestamp(0) without time zone, + c_password character varying(100), + c_bourseflag character(1), + c_operator character varying(100), + jy_custid numeric(10,0), + work_date timestamp(0) without time zone +) +DISTRIBUTE BY SHARD (c_custno) to GROUP default_group; + + +-- +-- Name: s017_tacconet; Type: TABLE; Schema: sync; Owner: gregsun +-- + +CREATE TABLE s017_tacconet ( + c_fundacco character varying(30) NOT NULL, + c_agencyno character varying(6), + c_netno character varying(30), + c_tradeacco character varying(100), + c_openflag character varying(2), + c_bonustype character varying(2), + c_bankno character varying(500), + c_bankacco character varying(500), + c_nameinbank character varying(1000), + d_appenddate timestamp(0) without time zone, + c_childnetno character varying(30), + c_tradeaccobak character varying(100), + c_bankname 
character varying(500), + c_banklinecode character varying(100), + c_channelbankno character varying(30), + c_bankprovincecode character varying(30), + c_bankcityno character varying(30), + sys_id character varying(10), + work_date timestamp(0) without time zone, + load_date timestamp(0) without time zone +) +DISTRIBUTE BY SHARD (c_fundacco) to GROUP default_group; + + +-- +-- Name: s017_tagencyinfo; Type: TABLE; Schema: sync; Owner: gregsun +-- + +CREATE TABLE s017_tagencyinfo ( + c_agencyno character varying(6) NOT NULL, + c_agencyname character varying(1000), + c_fullname character varying(1000), + c_agncyaddress character varying(500), + c_agncyzipcode character varying(30), + c_agncycontact character varying(30), + c_agncyphone character varying(100), + c_agncyfaxno character varying(100), + c_agncymail character varying(100), + c_agncybankno character varying(24), + c_agncybankacco character varying(100), + c_agncybankname character varying(500), + d_agncyregdate timestamp(0) without time zone, + c_agncystatus character varying(2), + d_lastdate timestamp(0) without time zone, + c_agencytype character varying(2), + c_detail character varying(2), + c_right character varying(2), + c_zdcode character varying(30), + l_liquidateredeem numeric(10,0), + l_liquidateallot numeric(10,0), + l_liquidatebonus numeric(10,0), + l_liquidatesub numeric(10,0), + c_sharetypes character varying(30), + f_agio numeric(5,4), + c_ztgonestep character varying(2), + c_preassign character varying(2), + l_cserialno numeric(10,0), + c_comparetype character varying(2), + c_liquidatetype character varying(2), + c_multitradeacco character varying(2), + c_iversion character varying(6), + c_imode character varying(2), + c_changeonstep character varying(2), + f_outagio numeric(5,4), + f_agiohint numeric(5,4), + f_outagiohint numeric(5,4), + c_allotliqtype character varying(2), + c_redeemliqtype character varying(2), + c_centerflag character varying(2), + c_netno character varying(6), + c_littledealtype character varying(2), + c_overtimedeal character varying(2), + d_lastinputtime timestamp(0) without time zone, + f_interestrate numeric(5,4), + c_clearsite character varying(2), + c_isdeal character varying(2), + c_agencyenglishname character varying(100), + l_fundaccono numeric(10,0), + c_rationflag character varying(2), + c_splitflag character varying(2), + c_tacode character varying(30), + c_outdataflag character varying(2), + c_hasindex character varying(2), + c_transferbyadjust character varying(2), + c_sharedetailexptype character varying(2), + c_navexptype character varying(2), + c_ecdmode character varying(2), + c_agencytypedetail character varying(2), + c_advanceshrconfirm character varying(2), + c_ecdversion character varying(2), + c_capmode character varying(2), + c_internetplatform character varying(2), + c_capautoarrive character varying(2), + c_outcapitaldata character varying(30), + c_ecdcheckmode character varying(30), + c_ecddealmode character varying(30), + c_fileimpmode character varying(30), + c_isotc character varying(2), + c_enableecd character varying(30), + c_autoaccotype character varying(30), + c_tncheckmode numeric(10,0), + c_captureidinfo character varying(30), + c_realfreeze character varying(30), + sys_id character varying(10), + work_date timestamp(0) without time zone, + load_date timestamp(0) without time zone +) +DISTRIBUTE BY SHARD (c_agencyno) to GROUP default_group; + + +-- +-- Name: s017_tconfirm_all; Type: TABLE; Schema: sync; Owner: gregsun +-- + +CREATE TABLE s017_tconfirm_all ( + 
c_businflag character(2) NOT NULL, + d_cdate timestamp(0) without time zone, + c_cserialno character varying(100), + d_date timestamp(0) without time zone, + l_serialno numeric(10,0), + c_agencyno character(3), + c_netno character varying(30), + c_fundacco character varying(30), + c_tradeacco character varying(100), + c_fundcode character varying(30), + c_sharetype character(1), + f_confirmbalance numeric(16,2), + f_confirmshares numeric(16,2), + f_tradefare numeric(16,2), + f_tafare numeric(16,2), + f_stamptax numeric(16,2), + f_backfare numeric(16,2), + f_otherfare1 numeric(16,2), + f_interest numeric(16,2), + f_interesttax numeric(16,2), + f_totalfare numeric(16,2), + f_agencyfare numeric(16,2), + f_netvalue numeric(12,4), + f_frozenbalance numeric(16,2), + f_unfrozenbalance numeric(16,2), + c_status character(1), + c_cause character varying(100), + c_taflag character(1), + c_custtype character(1), + c_custno character varying(30), + f_gainbalance numeric(16,2), + f_orifare numeric(16,2), + c_requestendflag character(1), + f_unbalance numeric(16,2), + f_unshares numeric(16,2), + c_reserve character varying(500), + f_interestshare numeric(16,2), + f_chincome numeric(16,2), + f_chshare numeric(16,2), + f_confirmincome numeric(16,2), + f_oritradefare numeric(16,2), + f_oritafare numeric(16,2), + f_oribackfare numeric(16,2), + f_oriotherfare1 numeric(16,2), + c_requestno character varying(100), + f_balance numeric(16,2), + f_shares numeric(16,2), + f_agio numeric(5,4), + f_lastshares numeric(16,2), + f_lastfreezeshare numeric(16,2), + c_othercode character varying(30), + c_otheracco character varying(30), + c_otheragency character(3), + c_othernetno character varying(30), + c_bonustype character(1), + c_foriginalno character varying(500), + c_exceedflag character(1), + c_childnetno character varying(30), + c_othershare character(1), + c_actcode character(3), + c_acceptmode character(1), + c_freezecause character(1), + c_freezeenddate character varying(100), + f_totalbalance numeric(16,2), + f_totalshares numeric(16,2), + c_outbusinflag character(3), + c_protocolno character varying(30), + c_memo character varying(500), + f_registfare numeric(16,2), + f_fundfare numeric(16,2), + f_oriagio numeric(5,4), + c_shareclass character(1), + d_cisdate timestamp(0) without time zone, + c_bourseflag character(1), + c_fundtype character(1), + f_backfareagio numeric(5,4), + c_bankno character varying(30), + c_subfundmethod character varying(30), + c_combcode character varying(30), + f_returnfare numeric(16,2), + c_contractno character varying(100), + c_captype character(1), + l_contractserialno numeric(10,0), + l_othercontractserialno numeric(10,0), + d_exportdate timestamp(0) without time zone, + f_transferfee numeric(16,2), + f_oriconfirmbalance numeric(16,2), + f_extendnetvalue numeric(23,15), + l_remitserialno numeric(10,0), + c_zhxtht character varying(500), + c_improperredeem character(1), + f_untradefare numeric(16,2), + f_untradeinfare numeric(16,2), + f_untradeoutfare numeric(16,2), + c_profitnottransfer character(1), + f_outprofit numeric(9,6), + f_inprofit numeric(9,6), + c_totrustcontractid character varying(500), + d_repurchasedate timestamp(0) without time zone, + f_chengoutbalance numeric(16,2), + c_exporting character(1), + jy_fundid numeric(10,0), + jy_contractbh character varying(100), + jy_custid numeric(10,0), + jy_tocustid numeric(10,0), + jy_fare numeric(16,2), + c_trustcontractid character varying(500), + f_taagencyfare numeric(16,2), + f_taregisterfare numeric(16,2), + d_cdate_jy 
timestamp(0) without time zone, + jy_adjust character(1), + jy_subfundid numeric, + jy_adjust1114 character(1), + jy_cdate timestamp(0) without time zone, + c_bankacco character varying(500), + c_bankname character varying(500), + c_nameinbank character varying(1000), + f_riskcapital numeric(16,2), + f_replenishriskcapital numeric(16,2), + c_fromfundcode character varying(30), + c_fromtrustcontractid character varying(500), + c_trustagencyno character varying(100), + l_rdmschserialno numeric(10,0), + f_redeemprofit numeric(16,2), + f_redeemproyieldrate numeric(13,10), + d_redeemprobigdate timestamp(0) without time zone, + d_redeemproenddate timestamp(0) without time zone, + c_changeownerincomebelong character(1), + l_midremitserialno numeric(10,0), + c_fromtype character(1), + c_iscycinvest character(1), + l_fromserialno numeric(10,0), + l_frominterestconserialno numeric(10,0), + c_changeownerinterest character(1), + c_msgsendflag character(1), + l_sharedelaydays numeric(3,0), + c_istodayconfirm character(1), + f_newincome numeric(16,2), + f_floorincome numeric(10,9), + l_incomeremitserialno numeric(10,0), + c_isnetting character(1), + l_bankserialno numeric(10,0), + c_subfundcode character varying(30), + f_chengoutsum numeric(16,2), + f_chengoutprofit numeric(16,2), + l_confirmtransserialno numeric(10,0), + c_shareadjustgzexpflag character(1), + c_issend character(1), + c_exchangeflag character(1), + yh_date_1112 timestamp(0) without time zone, + l_banktocontractserialno numeric(10,0), + c_payfeetype character(1), + c_tobankno character varying(30), + c_tobankacco character varying(500), + c_tobankname character varying(500), + c_tonameinbank character varying(1000), + c_tobanklinecode character varying(100), + c_tobankprovincecode character varying(30), + c_tobankcityno character varying(30), + l_assetseperateno numeric(10,0), + c_sharecserialno character varying(100), + c_redeemprincipaltype character(1), + work_date timestamp(0) without time zone, + c_businname character varying(100) +) +DISTRIBUTE BY SHARD (c_businflag) to GROUP default_group; + + +-- +-- Name: s017_tdividenddetail; Type: TABLE; Schema: sync; Owner: gregsun +-- + +CREATE TABLE s017_tdividenddetail ( + d_cdate timestamp(0) without time zone NOT NULL, + c_cserialno character varying(100), + d_regdate timestamp(0) without time zone, + d_date timestamp(0) without time zone, + c_fundacco character varying(30), + c_tradeacco character varying(100), + c_fundcode character varying(30), + c_sharetype character varying(2), + c_agencyno character varying(6), + c_netno character varying(30), + f_totalshare numeric(16,2), + f_unitprofit numeric(7,4), + f_totalprofit numeric(16,2), + f_tax numeric(16,2), + c_flag character varying(2), + f_realbalance numeric(16,2), + f_reinvestbalance numeric(16,2), + f_realshares numeric(16,2), + f_fare numeric(16,2), + d_lastdate timestamp(0) without time zone, + f_netvalue numeric(7,4), + f_frozenbalance numeric(16,2), + f_frozenshares numeric(16,2), + f_incometax numeric(9,4), + c_reserve character varying(100), + d_requestdate timestamp(0) without time zone, + c_shareclass character varying(30), + l_contractserialno numeric(10,0), + l_specprjserialno numeric(10,0), + f_investadvisorratio numeric(9,8), + f_transferfee numeric(16,2), + l_profitserialno numeric(10,0), + d_exportdate timestamp(0) without time zone, + c_custid character varying(30), + jy_fundid numeric, + jy_subfundid numeric, + jy_custid numeric, + jy_contractbh character varying(100), + jy_profitsn numeric, + jy_profitmoney 
numeric, + jy_capitalmoney numeric, + jy_adjust character varying(2), + c_reinvestnetvalue character varying(2), + f_transferbalance numeric(16,2), + l_relatedserialno numeric(10,0), + c_printoperator character varying(100), + c_printauditor character varying(100), + sys_id character varying(10), + work_date timestamp(0) without time zone, + load_date timestamp(0) without time zone, + f_remainshares numeric(16,2) +) +DISTRIBUTE BY SHARD (d_cdate) to GROUP default_group; + + +-- +-- Name: s017_tfundday; Type: TABLE; Schema: sync; Owner: gregsun +-- + +CREATE TABLE s017_tfundday ( + d_date timestamp(0) without time zone, + d_cdate timestamp(0) without time zone, + c_fundcode varchar2(30), + c_todaystatus varchar2(2), + c_status varchar2(2), + f_netvalue numeric(7,4), + f_lastshares numeric(16,2), + f_lastasset numeric(16,2), + f_asucceed numeric(16,2), + f_rsucceed numeric(16,2), + c_vastflag varchar2(2), + f_encashratio numeric(9,8), + f_changeratio numeric(9,8), + c_excessflag varchar2(2), + f_subscriberatio numeric(9,8), + c_inputpersonnel varchar2(100), + c_checkpersonnel varchar2(100), + f_income numeric(16,2), + f_incomeratio numeric(9,6), + f_unassign numeric(16,2), + f_incomeunit numeric(10,5), + f_totalnetvalue numeric(7,4), + f_servicefare numeric(16,2), + f_assign numeric(16,2), + f_growthrate numeric(9,8), + c_netvalueflag varchar2(2), + f_managefare numeric(16,2), + d_exportdate timestamp(0) without time zone, + c_flag varchar2(2), + f_advisorfee numeric(16,2), + d_auditdate timestamp(0) without time zone, + f_extendnetvalue numeric(23,15), + f_extendtotalnetvalue numeric(23,15), + jy_fundcode varchar2(30), + f_yearincomeratio numeric(9,6), + f_riskcapital numeric(16,2), + f_totalincome numeric(16,2), + f_agencyexpyearincomeration numeric(9,6), + f_agencyexpincomeunit numeric(10,5), + f_agencyexpincomeration numeric(9,6), + f_agencyexpincome numeric(16,2), + c_isspecflag varchar2(2), + c_isasync varchar2(2), + sys_id varchar2(10), + work_date timestamp(0) without time zone, + load_date timestamp(0) without time zone DEFAULT orcl_sysdate() +) +DISTRIBUTE BY HASH (d_date); + + +-- +-- Name: s017_tfundinfo; Type: TABLE; Schema: sync; Owner: gregsun +-- + +CREATE TABLE s017_tfundinfo ( + c_fundcode character varying(30) NOT NULL, + c_fundname character varying(1000), + c_moneytype character varying(6), + c_managername character varying(100), + c_trusteecode character varying(30), + f_parvalue numeric(7,4), + f_issueprice numeric(12,4), + c_trusteeacco character varying(100), + d_issuedate timestamp(0) without time zone, + d_setupdate timestamp(0) without time zone, + f_maxbala numeric(16,2), + f_maxshares numeric(16,2), + f_minbala numeric(16,2), + f_minshares numeric(16,2), + l_elimitday numeric(10,0), + l_slimitday numeric(10,0), + l_alimitday numeric(10,0), + l_mincount numeric(10,0), + l_climitday numeric(10,0), + f_maxallot numeric(9,8), + f_maxredeem numeric(9,8), + c_fundcharacter character varying(500), + c_fundstatus character varying(2), + c_subscribemode character varying(2), + l_timelimit numeric(10,0), + l_subscribeunit numeric(10,0), + c_sharetypes character varying(30), + c_issuetype character varying(2), + f_factcollect numeric(16,2), + d_failuedate timestamp(0) without time zone, + f_allotratio numeric(9,8), + c_feeratiotype1 character varying(2), + c_feeratiotype2 character varying(2), + c_feetype character varying(2), + c_exceedpart character varying(2), + c_bonustype character varying(2), + c_forceredeem character varying(2), + c_interestdealtype character 
varying(2), + f_redeemfareratio numeric(5,4), + f_changefareratio numeric(5,4), + f_managerfee numeric(7,6), + f_right numeric(5,4), + c_property character varying(2), + d_evendate timestamp(0) without time zone, + f_totalbonus numeric(7,4), + c_changefree character varying(2), + c_reportcode character varying(30), + c_backfarecal character varying(2), + l_moneydate numeric(10,0), + l_netprecision numeric(10,0), + c_corpuscontent character varying(2), + f_corpusratio numeric(5,4), + c_farecaltype character varying(2), + l_liquidateallot numeric(10,0), + l_liquidateredeem numeric(10,0), + l_liquidatebonus numeric(10,0), + l_taspecialacco numeric(10,0), + c_fareprecision character varying(2), + d_issueenddate timestamp(0) without time zone, + c_farebelongasset character varying(2), + l_liquidatechange numeric(10,0), + l_liquidatefail numeric(10,0), + l_liquidateend numeric(10,0), + c_sharedetail character varying(2), + c_trusteebankname character varying(500), + c_boursetradeflag character varying(2), + c_fundenglishname character varying(100), + l_bankaccono numeric(10,0), + c_cleanflag character varying(2), + c_precision character varying(2), + c_upgradeflag character varying(2), + c_isdeal character varying(2), + c_farecltprecision character varying(2), + c_balanceprecision character varying(2), + c_shareprecision character varying(2), + c_bonusprecision character varying(2), + c_interestprecision character varying(2), + f_maxallotasset numeric(16,2), + f_maxallotshares numeric(16,2), + c_foreigntrustee character varying(6), + l_tnconfirm numeric(3,0), + c_rationallotstatus character varying(2), + f_trusteefee numeric(7,6), + c_fundacco character varying(30), + c_financetype character varying(2), + l_liquidatechangein numeric(10,0), + c_custname character varying(500), + c_identitytype character varying(2), + c_custtype character varying(2), + c_identityno character varying(100), + c_deductschemecode character varying(30), + c_customermanager character varying(30), + c_templateid character varying(30), + f_pr0 numeric(7,4), + f_deductratio numeric(5,4), + c_farecalculatetype character varying(2), + c_saletype character varying(2), + l_maxcount numeric(10,0), + l_zhallotliqdays numeric(10,0), + l_zhredeemliqdays numeric(10,0), + f_liqasset numeric(16,2), + l_zhallotexpdays numeric(10,0), + l_zhredeemexpdays numeric(10,0), + c_limitmode character varying(2), + c_ordermode character varying(2), + c_acntlmtdealmode character varying(2), + l_informdays numeric(2,0), + c_allowpartredeem character varying(2), + c_fundendmode character varying(2), + f_fundendagio numeric(10,9), + c_minbalalimitisconfirm character varying(2), + c_gradetype character varying(2), + c_qryfreqtype character varying(2), + l_qrydaysltd numeric(2,0), + d_contractenddate timestamp(0) without time zone, + c_useinopenday character varying(2), + c_allotcalinterst character varying(2), + c_fundrisk character varying(2), + c_exitallot character varying(2), + c_subinterestcalc character varying(2), + c_earlyexitredfee character varying(2), + c_navexpfqy character varying(2), + l_navexpday numeric(10,0), + c_isbounded character varying(2), + c_earlyexitfeecalc character varying(2), + c_designdptid character varying(100), + c_fixeddividway character varying(2), + c_trusttype character varying(2), + f_maxnaturalmoney numeric(16,2), + c_projectid character varying(30), + c_trustclass character varying(2), + f_trustscale numeric(16,2), + c_structflag character varying(2), + c_priconveyflag character varying(2), + c_repurchasetype 
character varying(2), + c_iswholerepurchase character varying(2), + f_repurchaseminbala numeric(16,2), + c_repurchasemainbody character varying(2), + c_canelyrepurchase character varying(2), + c_earlybacktime character varying(2), + c_repurchaseprice character varying(2), + c_premiumpaymenttime character varying(2), + c_liquisource character varying(2), + l_period numeric(3,0), + c_canextensionflag character varying(2), + c_canelyliquidflag character varying(2), + c_trustassetdesc character varying(100), + c_returnside character varying(2), + c_returnpaymentway character varying(2), + c_returnbase character varying(2), + c_refepaymentway character varying(2), + c_refeside character varying(2), + c_refebase character varying(2), + f_warnline numeric(5,4), + f_stopline numeric(5,4), + f_collectinterest numeric(11,8), + f_durationinterest numeric(7,4), + f_investadvisorratio numeric(7,6), + c_bonusschema character varying(2), + c_guaranteetype character varying(2), + c_guaranteedesc character varying(100), + c_expectedyieldtype character varying(2), + f_minexpectedyield numeric(12,4), + f_maxexpectedyield numeric(12,4), + c_incomecycletype character varying(2), + f_incomecyclevalue numeric(10,0), + c_subaccotype character varying(2), + c_allotaccotype character varying(2), + c_fundtype character varying(2), + c_cootype character varying(1000), + c_projecttype character varying(2), + c_investdirection character varying(30), + c_investdirectionfractionize character varying(2), + c_industrydetail character varying(1000), + c_initeresttype character varying(2), + c_isextended character varying(2), + d_extenddate timestamp(0) without time zone, + c_dealmanagetype character varying(2), + c_investarea character varying(2), + c_projectcode character varying(1000), + c_fundshortname character varying(500), + c_contractid character varying(500), + c_functype character varying(2), + c_specialbusintype character varying(1000), + c_investindustry character varying(2), + c_managetype character varying(2), + c_area character varying(500), + c_risk character varying(2), + c_iscommitteedisscuss character varying(2), + c_structtype character varying(2), + c_commendplace character varying(2), + l_npmaxcount numeric(5,0), + c_client character varying(100), + c_clientcusttype character varying(2), + c_clientidtype character varying(2), + c_clientidno character varying(100), + c_clientbankname character varying(100), + c_clientaccono character varying(100), + c_clientaddress character varying(500), + c_clientzipcode character varying(30), + c_clientphoneno1 character varying(100), + c_clientphoneno2 character varying(100), + c_clientfax character varying(100), + c_beneficiary character varying(100), + c_collectbankname character varying(500), + c_collectbankno character varying(6), + c_collectaccountname character varying(500), + c_collectbankacco character varying(100), + c_keeperbankname character varying(500), + c_keeperaccountname character varying(500), + c_keeperaccountno character varying(100), + c_keepername character varying(500), + c_keepercorporation character varying(500), + c_keeperaddress character varying(500), + c_keeperzipcode character varying(30), + c_keeperphoneno1 character varying(100), + c_keeperphoneno2 character varying(100), + c_keeperfax character varying(100), + c_incomedistributetype character varying(2), + c_alarmline character varying(1000), + c_stoplossline character varying(1000), + f_investadvisorfee numeric(12,2), + c_investadvisordeduct character varying(1000), + c_capitalacco 
character varying(500), + c_stockacconame character varying(500), + c_stocksalesdept character varying(500), + c_thirdpartybankno character varying(6), + c_thirdpartybankname character varying(500), + c_thirdpartyacconame character varying(500), + c_thirdpartyaccono character varying(100), + c_investadvisor character varying(500), + c_investadvisorbankno character varying(6), + c_investadvisorbankname character varying(500), + c_investadvisoracconame character varying(500), + c_investadvisoraccono character varying(100), + c_investadvisorcorporation character varying(500), + c_investadvisoraddress character varying(500), + c_investadvisorzipcode character varying(30), + c_investadvisorphoneno1 character varying(100), + c_investadvisorphoneno2 character varying(100), + c_investadvisorfax character varying(100), + c_authdelegate character varying(100), + c_loanfinanceparty character varying(500), + c_loanfinancepartycorporation character varying(500), + c_loanfinancepartyaddress character varying(500), + c_loanfinancepartyzipcode character varying(30), + c_loanfinancepartyphoneno1 character varying(100), + c_loanfinancepartyphoneno2 character varying(100), + c_loanfinancepartyfax character varying(100), + c_loaninteresttype character varying(2), + f_loaninterestrate numeric(7,4), + f_loanduration numeric(5,0), + c_loanmanagebank character varying(500), + f_loanmanagefee numeric(9,2), + f_loanfinancecost numeric(9,2), + f_creditattornduration numeric(5,0), + f_creditattorninterestduration numeric(7,4), + f_creditattornprice numeric(12,2), + f_billattornduration numeric(5,0), + f_billattorninterestduration numeric(7,4), + f_billattornprice numeric(12,2), + c_stkincfincparty character varying(1000), + c_stkincfincpartycorporation character varying(500), + c_stkincfincpartyaddress character varying(500), + c_stkincfincpartyzipcode character varying(30), + c_stkincfincpartyphoneno1 character varying(100), + c_stkincfincpartyphoneno2 character varying(100), + c_stkincfincpartyfax character varying(100), + c_stkincincomeannualizedrate numeric(7,4), + c_stkincinteresttype character varying(2), + f_stkincattornprice numeric(12,2), + f_stkincattornduration numeric(5,0), + f_stkincbail numeric(12,2), + f_stkincfinccost numeric(9,2), + c_stkincmemo1 character varying(1000), + c_stkincmemo2 character varying(1000), + c_debtincfincparty character varying(500), + c_debtincfincpartycorporation character varying(500), + c_debtincfincpartyaddress character varying(500), + c_debtincfincpartyzipcode character varying(30), + c_debtincfincpartyphoneno1 character varying(100), + c_debtincfincpartyphoneno2 character varying(100), + c_debtincfincpartyfax character varying(100), + c_debtincincomerate numeric(7,4), + c_debtincinteresttype character varying(2), + f_debtincattornprice numeric(12,2), + f_debtincattornduration numeric(5,0), + f_debtincbail numeric(12,2), + f_debtincfinccost numeric(9,2), + c_debtincmemo1 character varying(1000), + c_othinvfincparty character varying(500), + c_othinvfincpartycorporation character varying(500), + c_othinvfincpartyaddress character varying(500), + c_othinvfincpartyzipcode character varying(30), + c_othinvfincpartyphoneno1 character varying(100), + c_othinvfincpartyphoneno2 character varying(100), + c_othinvfincpartyfax character varying(100), + f_othinvfinccost numeric(9,2), + c_othinvmemo1 character varying(1000), + c_othinvmemo2 character varying(1000), + c_othinvmemo3 character varying(1000), + c_banktrustcoobank character varying(500), + c_banktrustproductname character 
varying(500), + c_banktrustproductcode character varying(100), + c_banktrustundertakingletter character varying(2), + c_trustgovgovname character varying(500), + c_trustgovprojecttype character varying(1000), + c_trustgovcootype character varying(4), + c_trustgovoptype character varying(4), + c_housecapital character varying(4), + c_houseispe character varying(2), + c_tradetype character varying(2), + c_businesstype character varying(2), + c_trustname character varying(500), + c_trustidtype character varying(2), + c_trustidno character varying(100), + d_trustidvaliddate timestamp(0) without time zone, + c_trustbankname character varying(500), + c_trustaccounttype character varying(2), + c_trustnameinbank character varying(100), + c_zhtrustbankname character varying(500), + c_zhtrustbankacco character varying(100), + c_issecmarket character varying(2), + c_fundoperation character varying(2), + c_trustmanager character varying(100), + c_tradeother character varying(4000), + c_watchdog character varying(500), + c_memo character varying(1000), + c_benefittype character varying(2), + c_redeemaccotype character varying(2), + c_bonusaccotype character varying(2), + c_fundendaccotype character varying(2), + c_collectfailaccotype character varying(2), + d_lastmodifydate timestamp(0) without time zone, + c_shareholdlimtype character varying(2), + c_redeemtimelimtype character varying(2), + c_isprincipalrepayment character varying(2), + c_principalrepaymenttype character varying(2), + l_interestyeardays numeric(3,0), + l_incomeyeardays numeric(3,0), + c_capuseprovcode character varying(30), + c_capusecitycode character varying(30), + c_capsourceprovcode character varying(30), + c_banktrustcoobankcode character varying(30), + c_banktrustisbankcap character varying(2), + c_trusteefeedesc character varying(4000), + c_managefeedesc character varying(4000), + c_investfeedesc character varying(4000), + f_investadvisordeductratio numeric(7,6), + c_investdeductdesc character varying(4000), + c_investadvisor2 character varying(500), + f_investadvisorratio2 numeric(7,6), + f_investadvisordeductratio2 numeric(7,6), + c_investfeedesc2 character varying(4000), + c_investdeductdesc2 character varying(4000), + c_investadvisor3 character varying(500), + f_investadvisorratio3 numeric(7,6), + f_investadvisordeductratio3 numeric(7,6), + c_investfeedesc3 character varying(4000), + c_investdeductdesc3 character varying(4000), + c_profitclassdesc character varying(4000), + c_deductratiodesc character varying(4000), + c_redeemfeedesc character varying(4000), + l_defaultprecision numeric(10,0), + c_allotfeeaccotype character varying(2), + c_isposf character varying(2), + c_opendaydesc character varying(4000), + c_actualmanager character varying(100), + c_subindustrydetail character varying(30), + c_isbankleading character varying(2), + c_subprojectcode character varying(500), + c_iscycleinvest character varying(2), + f_liquidationinterest numeric(13,10), + c_liquidationinteresttype character varying(2), + c_isbonusinvestfare character varying(2), + c_subfeeaccotype character varying(2), + c_redeemfeeaccotype character varying(2), + c_fundrptcode character varying(30), + c_ordertype character varying(2), + c_flag character varying(2), + c_allotliqtype character varying(2), + l_sharelimitday numeric(5,0), + c_iseverydayopen character varying(2), + c_tradebynetvalue character varying(2), + c_isstage character varying(2), + c_specbenfitmemo character varying(4000), + d_effectivedate timestamp(0) without time zone, + 
c_issueendflag character varying(2), + c_resharehasrdmfee character varying(2), + jy_fundcode numeric, + jy_fundid numeric, + jy_subfundid numeric, + jy_dptid numeric, + c_iswealth character varying(2), + c_interestcalctype character varying(2), + c_allotinterestcalctype character varying(2), + c_isriskcapital character varying(2), + c_fundstatus_1225 character varying(2), + c_isincomeeverydaycalc character varying(2), + c_isredeemreturninterest character varying(2), + c_isrefundrtninterest character varying(2), + d_estimatedsetupdate timestamp(0) without time zone, + f_estimatedfactcollect numeric(16,2), + c_isfinancialproducts character varying(2), + c_fundredeemtype character varying(2), + c_trademanualinput character varying(2), + f_clientmanageration numeric(7,6), + c_profitclassadjustment character varying(2), + c_mainfundcode character varying(30), + c_contractsealoff character varying(2), + c_permitnextperiod character varying(2), + c_preprofitschematype character varying(2), + c_fundredeemprofit character varying(2), + f_incomeration numeric(9,8), + c_incomecalctype character varying(2), + c_allocateaccoid character varying(30), + c_outfundcode character varying(500), + c_matchprofitclass character varying(30), + l_lastdays numeric(5,0), + c_contractprofitflag character varying(2), + c_agencysaleliqtype character varying(2), + l_delaydays numeric(3,0), + c_profitclassperiod character varying(2), + c_reportshowname character varying(1000), + c_currencyincometype character varying(2), + c_beforeredeemcapital character varying(2), + c_contractversion character varying(30), + c_confirmacceptedflag character varying(2), + c_selectcontract character varying(2), + f_schemainterest numeric(11,8), + c_riskgrade character varying(30), + l_sharedelaydays numeric(3,0), + l_reservationdays numeric(3,0), + c_transfertype character varying(2), + c_schemavoluntarily character varying(2), + l_schemadetaildata numeric(4,0), + c_schemadetailtype character varying(2), + c_iscurrencyconfirm character varying(2), + c_allowmultiaccobank character varying(2), + d_capverif timestamp(0) without time zone, + c_templatetype character varying(12), + c_capitalprecision character varying(2), + c_fundno character varying(100), + c_profittype character varying(2), + d_paydate timestamp(0) without time zone, + d_shelvedate timestamp(0) without time zone, + d_offshelvedate timestamp(0) without time zone, + c_schemabegindatetype character varying(2), + l_schemabegindatedays numeric(3,0), + c_isautoredeem character varying(2), + c_isnettingrequest character varying(2), + c_issuingquotedtype character varying(2), + d_firstdistributedate timestamp(0) without time zone, + c_bonusfrequency character varying(2), + c_interestbigdatetype character varying(2), + c_gzdatatype character varying(2), + f_allotfareratio numeric(5,4), + f_subfareratio numeric(5,4), + c_begindatebeyond character varying(2), + c_profitnotinterest character varying(2), + c_setuplimittype character varying(2), + c_limitredeemtype character varying(2), + c_bonusfrequencytype character varying(2), + c_rfaccotype character varying(2), + c_capitalfee character varying(2), + c_exceedflag character varying(2), + c_enableecd character varying(2), + c_isfixedtrade character varying(2), + c_profitcaltype character varying(2), + f_ominbala numeric(16,2), + f_stepbala numeric(16,2), + c_remittype character varying(30), + c_interestcycle character varying(30), + c_repayguaranteecopy character varying(30), + c_repaytype character varying(30), + c_fundprofitdes 
character varying(4000), + c_fundinfodes character varying(4000), + c_riskeval character varying(2), + l_maxage numeric(3,0), + l_minage numeric(3,0), + c_fundriskdes character varying(1000), + mig_l_assetid numeric(48,0), + l_faincomedays numeric(10,0), + c_producttype character varying(2), + c_otherbenefitproducttype character varying(2), + c_isotc character varying(2), + c_iseverydayprovision character varying(2), + c_incometogz character varying(2), + c_setuptransfundacco character varying(30), + c_issuefeeownerrequired character varying(2), + c_calcinterestbeforeallot character varying(30), + c_islimit300wnature character varying(2), + c_allowoverflow character varying(30), + c_trustfundtype character varying(30), + c_disclose character varying(2), + c_collectaccoid character varying(30), + c_isissuebymarket character varying(2), + c_setupstatus character varying(30), + c_isentitytrust character varying(2), + l_liquidatesub numeric(10,0), + c_incomeassigndesc character varying(4000), + c_keeporgancode character varying(30), + d_defaultbegincacldate timestamp(0) without time zone, + c_zcbborrower character varying(100), + c_zcbborroweridno character varying(100), + c_zcbremittype character varying(100), + c_registcode character varying(100), + c_redeeminvestaccotype character varying(2), + c_bonusinvestaccotype character varying(2), + c_isabsnotopentrade character varying(2), + l_interestdiffdays numeric(5,0), + c_outfundstatus character varying(2), + c_reqsyntype character varying(2), + c_allredeemtype character varying(2), + c_isabsopentrade character varying(2), + c_funddesc character varying(1000), + l_allotliquidays numeric(3,0), + l_subliquidays numeric(3,0), + c_autoupcontractenddaterule character varying(2), + c_fcsubaccotype character varying(2), + c_fcallotaccotype character varying(2), + c_fcredeemaccotype character varying(2), + c_fcbonusaccotype character varying(2), + c_captranslimitflag character varying(30), + c_redeemprincipaltype character varying(2), + c_interestcalcdealtype character varying(30), + c_collectconfirm character varying(30), + d_oldcontractenddate timestamp(0) without time zone, + c_tnvaluation character varying(30), + c_contractendnotify character varying(2), + c_rdmfeebase character varying(30), + c_exceedcfmratio character varying(30), + c_allowallotcustlimittype character varying(2), + c_yeardayscalctype character varying(2), + c_iscompoundinterest character varying(30), + c_dbcfm character varying(30), + c_limitaccountstype character varying(2), + c_cycleinvestrange character varying(2), + c_tncheckmode character varying(2), + c_enableearlyredeem character varying(2), + c_ispurceandredeemset character varying(30), + c_perfpaydealtype character varying(2), + c_allowappend character varying(2), + c_allowredeem character varying(2), + c_inputstatus character varying(2), + c_profitbalanceadjust character varying(2), + c_profitperiodadjust character varying(2), + c_autogeneratecontractid character varying(2), + c_transferneednetting character varying(100), + underwrite character varying(1000), + undertook character varying(1000), + undertake character varying(1000), + c_issmsend character varying(2), + d_contractshortenddate timestamp(0) without time zone, + d_contractlongenddate timestamp(0) without time zone, + c_assetseperatefundcodesrc character varying(30), + f_averageprofit numeric(11,8), + c_currencycontractlimittype character varying(2), + l_profitlastdays numeric(5,0), + l_liquidationlastdays numeric(5,0), + c_arlimitincludeallreq character 
varying(2), + c_reqfundchange character varying(2), + c_dealnetvaluerule character varying(2), + c_contractdealtype character varying(2), + c_bonusplanbeginday timestamp(0) without time zone, + c_contractbalaupright character varying(2), + c_isneedinterestrate character varying(2), + c_isneedexcessratio character varying(2), + c_riskgraderemark character varying(1000), + c_lossprobability character varying(2), + c_suitcusttype character varying(2), + c_createbonusschema character varying(2), + d_closedenddate timestamp(0) without time zone, + c_timelimitunit character varying(30), + c_exceedredeemdealtype character varying(2), + c_profitperiod character varying(2), + l_navgetintervaldays numeric(3,0), + load_date timestamp(0) without time zone, + sys_id character varying(10) DEFAULT 'S017'::character varying, + work_date timestamp(0) without time zone, + c_limittransfertype character varying(1), + c_transaccotype character varying(1), + c_incometaxbase character varying(1), + c_isredeemfareyearcalc character varying(1), + c_otherbenefitinputmode character varying(1), + c_aftdefaultinterestdeducttype character varying(1), + c_allowzerobalanceconfirm character varying(1), + c_incomejoinassign character varying(1), + l_liquidateliqbonus numeric(10,0), + c_predefaultinterestdeducttype character varying(1), + c_worktype character varying(1), + c_defaultinterestadduptype character varying(1), + c_issupportsubmode character varying(1), + f_expectedyield numeric(14,0), + c_recodecode character varying(40), + l_liquidatetransfer numeric(10,0), + c_ispayincometax character varying(1), + c_groupmainfundcode character varying(6), + c_redeemfeesplittype character varying(1), + c_capitalfromcrmorta character varying(1), + c_needcalcdefaultinterest character varying(1), + c_issuercode character varying(10), + l_redeemfareyeardays numeric(10,0), + c_floatyield character varying(30), + l_minriskscore numeric(3,0), + c_islocalmoneytypecollect character varying(1) +) +DISTRIBUTE BY SHARD (c_fundcode) to GROUP default_group; + + +-- +-- Name: s017_tsharecurrents_all; Type: TABLE; Schema: sync; Owner: gregsun +-- + +CREATE TABLE s017_tsharecurrents_all ( + d_cdate timestamp(0) without time zone NOT NULL, + c_cserialno character varying(100), + c_businflag character(2), + d_requestdate timestamp(0) without time zone, + c_requestno character varying(100), + c_custno character varying(30), + c_fundacco character varying(30), + c_tradeacco character varying(100), + c_fundcode character varying(30), + c_sharetype character(1), + c_agencyno character(3), + c_netno character varying(30), + f_occurshares numeric(16,2), + f_occurbalance numeric(16,2), + f_lastshares numeric(16,2), + f_occurfreeze numeric(16,2), + f_lastfreezeshare numeric(16,2), + c_summary character varying(100), + f_gainbalance numeric(16,2), + d_sharevaliddate timestamp(0) without time zone, + c_bonustype character(1), + c_custtype character(1), + c_shareclass character(1), + c_bourseflag character varying(20), + d_exportdate timestamp(0) without time zone, + l_contractserialno numeric(10,0), + c_issend character(1), + c_sendbatch character varying(30), + work_date timestamp(0) without time zone +) +DISTRIBUTE BY SHARD (d_cdate) to GROUP default_group; + + +-- +-- Name: s017_ttrustclientinfo_all; Type: TABLE; Schema: sync; Owner: gregsun +-- + +CREATE TABLE s017_ttrustclientinfo_all ( + c_custno character varying(30) NOT NULL, + c_custtype character(1), + c_custname character varying(500), + c_shortname character varying(500), + c_helpcode character 
varying(30), + c_identitytype character(1), + c_identityno character varying(500), + c_zipcode character varying(30), + c_address character varying(1000), + c_phone character varying(100), + c_faxno character varying(500), + c_mobileno character varying(100), + c_email character varying(500), + c_sex character(1), + c_birthday character varying(30), + c_vocation character(2), + c_education character(2), + c_income character varying(30), + c_contact character varying(100), + c_contype character(1), + c_contno character varying(100), + c_billsendflag character(1), + c_callcenter character(1), + c_internet character(1), + c_secretcode character varying(30), + c_nationality character(3), + c_cityno character varying(30), + c_lawname character varying(100), + c_shacco character varying(30), + c_szacco character varying(30), + c_broker character varying(100), + f_agio numeric(5,4), + c_memo character varying(4000), + c_reserve character varying(500), + c_corpname character varying(100), + c_corptel character varying(100), + c_specialcode character varying(100), + c_actcode character varying(30), + c_billsendpass character(1), + c_addressinvalid character(1), + d_appenddate timestamp(0) without time zone, + d_backdate timestamp(0) without time zone, + c_invalidaddress character varying(500), + c_backreason character varying(500), + c_modifyinfo character(2), + c_riskcontent character varying(4000), + l_querydaysltd numeric(3,0), + c_customermanager character varying(100), + c_custproperty character(1), + c_custclass character(1), + c_custright character varying(4000), + c_daysltdtype character(1), + d_idvaliddate timestamp(0) without time zone, + l_custgroup numeric(10,0), + c_recommender character varying(100), + c_recommendertype character(1), + d_idnovaliddate timestamp(0) without time zone, + c_organcode character(10), + c_othercontact character varying(100), + c_taxregistno character varying(100), + c_taxidentitytype character(1), + c_taxidentityno character varying(100), + d_legalvaliddate timestamp(0) without time zone, + c_shareholder character varying(500), + c_shareholderidtype character(1), + c_shareholderidno character varying(100), + d_holderidvaliddate timestamp(0) without time zone, + c_leader character varying(500), + c_leaderidtype character(1), + c_leaderidno character varying(100), + d_leadervaliddate timestamp(0) without time zone, + c_managercode character varying(100), + c_linemanager character varying(100), + c_clientinfoid character varying(30), + c_provincecode character varying(30), + c_countytown character varying(1000), + c_phone2 character varying(100), + c_clienttype character(1), + c_agencyno character(3), + c_industrydetail character varying(30), + c_isqualifiedcust character(1), + c_industryidentityno character varying(100), + c_lawidentitytype character(1), + c_lawidentityno character varying(100), + d_lawidvaliddate timestamp(0) without time zone, + d_conidvaliddate timestamp(0) without time zone, + c_conisrevmsg character(1), + c_conmobileno character varying(100), + c_conmoaddress character varying(1000), + c_conzipcode character varying(30), + c_conphone1 character varying(100), + c_conphone2 character varying(100), + c_conemail character varying(100), + c_confaxno character varying(500), + c_incomsource character varying(500), + c_zhidentityno character varying(500), + c_zhidentitytype character(1), + c_eastcusttype character varying(30), + jy_custid numeric(10,0), + c_idtype201201030 character(1), + c_emcontact character varying(500), + c_emcontactphone 
character varying(100), + c_instiregaddr character varying(1000), + c_regcusttype character varying(30), + c_riskgrade character varying(30), + c_riskgraderemark character varying(1000), + d_idvaliddatebeg timestamp(0) without time zone, + d_industryidvaliddatebeg timestamp(0) without time zone, + d_industryidvaliddate timestamp(0) without time zone, + c_incomesourceotherdesc character varying(1000), + c_vocationotherdesc character varying(1000), + c_businscope character varying(4000), + d_conidvaliddatebeg timestamp(0) without time zone, + d_lawidvaliddatebeg timestamp(0) without time zone, + c_regmoneytype character(3), + f_regcapital numeric(15,2), + c_orgtype character(2), + c_contrholderno character varying(100), + c_contrholdername character varying(500), + c_contrholderidtype character(2), + c_contrholderidno character varying(500), + d_contrholderidvalidatebeg timestamp(0) without time zone, + d_contrholderidvalidate timestamp(0) without time zone, + c_responpername character varying(500), + c_responperidtype character(2), + c_responperidno character varying(500), + d_responperidvalidatebeg timestamp(0) without time zone, + d_responperidvalidate timestamp(0) without time zone, + c_lawphone character varying(100), + c_contrholderphone character varying(100), + c_responperphone character varying(100), + c_consex character(1), + c_conrelative character varying(500), + l_riskserialno numeric(10,0), + c_convocation character(2), + c_iscustrelated character(1), + c_businlicissuorgan character varying(500), + c_manageridno character varying(500), + c_manageridtype character varying(500), + c_managername character varying(500), + d_companyregdate timestamp(0) without time zone, + c_electronicagreement character(1), + c_householdregno character varying(500), + c_guardianrela character varying(500), + c_guardianname character varying(500), + c_guardianidtype character(1), + c_guardianidno character varying(500), + c_isfranchisingidstry character(1), + c_franchidstrybusinlic character varying(500), + c_workunittype character(2), + c_normalresidaddr character varying(1000), + c_domicile character varying(1000), + c_finainvestyears character(2), + c_parentidtype character(1), + c_parentidno character varying(500), + c_videono character varying(1000), + c_bonustype character(1), + d_retirementdate timestamp(0) without time zone, + c_issendbigcustbill character(1), + c_idaddress character varying(1000), + c_isproinvestor character(1), + c_sendkfflag character(1), + c_sendkfcause character varying(1000), + c_sendsaflag character(1), + c_sendsacause character varying(1000), + c_custrelationchannel character(1), + c_companytype character(1), + c_businlocation character varying(1000), + c_custodian character varying(500), + d_elecsigndate timestamp(0) without time zone, + d_riskinputdate timestamp(0) without time zone, + c_circno character varying(1000), + c_financeindustrydetail character varying(30), + c_outclientinfoid character varying(30), + d_duediligencedate timestamp(0) without time zone, + c_duediligencestatus character(1), + c_inputstatus character(1), + c_address2 character varying(1000), + c_reportcusttype character(1), + c_reportcusttypedetail character varying(30), + c_custsource character varying(30), + work_date timestamp(0) without time zone +) +DISTRIBUTE BY SHARD (c_custno) to GROUP default_group; + + +-- +-- Name: sys_stat_error_log; Type: TABLE; Schema: sync; Owner: gregsun +-- + +CREATE TABLE sys_stat_error_log ( + proc_name varchar2(50) NOT NULL, + tab_level varchar2(20), + 
step_no varchar2(20), + step_desc varchar2(500), + begin_time timestamp(0) without time zone, + end_time timestamp(0) without time zone, + workdate timestamp(0) without time zone, + row_num numeric, + elapsed numeric, + all_elapsed numeric, + sql_code varchar2(20), + sql_errm varchar2(500) +) +DISTRIBUTE BY SHARD (proc_name) to GROUP default_group; + + +-- +-- Data for Name: b03_ts_remetrade; Type: TABLE DATA; Schema: sync; Owner: gregsun +-- + +COPY b03_ts_remetrade (c_fundcode, c_fundname, c_fundacco, f_netvalue, c_agencyname, c_custname, d_date, d_cdate, f_confirmbalance, f_tradefare, f_confirmshares, f_relbalance, f_interest, info, work_date, load_date) FROM stdin; +\. + + +-- +-- Data for Name: b03_ts_remetrade_bak; Type: TABLE DATA; Schema: sync; Owner: gregsun +-- + +COPY b03_ts_remetrade_bak (c_fundcode, c_fundname, c_fundacco, f_netvalue, c_agencyname, c_custname, d_date, d_cdate, f_confirmbalance, f_tradefare, f_confirmshares, f_relbalance, f_interest, info, work_date, load_date) FROM stdin; +\. + + +-- +-- Data for Name: ks0_fund_base_26; Type: TABLE DATA; Schema: sync; Owner: gregsun +-- + +COPY ks0_fund_base_26 (id1, acc_cd, tdate, ins_cd, cost_price_asset, pcol) FROM stdin; +\. + + +-- +-- Data for Name: p; Type: TABLE DATA; Schema: sync; Owner: gregsun +-- + +COPY p (p1, p2) FROM stdin; +2021-12-12 2021-12-12 +2021-12-13 2021-12-12 +2020-12-13 2021-12-12 +\. + + +-- +-- Data for Name: s017_taccoinfo; Type: TABLE DATA; Schema: sync; Owner: gregsun +-- + +COPY s017_taccoinfo (c_custno, c_accounttype, c_fundacco, c_agencyno, c_netno, c_childnetno, d_opendate, d_lastmodify, c_accostatus, c_freezecause, d_backdate, l_changetime, d_firstinvest, c_password, c_bourseflag, c_operator, jy_custid, work_date) FROM stdin; +\. + + +-- +-- Data for Name: s017_tacconet; Type: TABLE DATA; Schema: sync; Owner: gregsun +-- + +COPY s017_tacconet (c_fundacco, c_agencyno, c_netno, c_tradeacco, c_openflag, c_bonustype, c_bankno, c_bankacco, c_nameinbank, d_appenddate, c_childnetno, c_tradeaccobak, c_bankname, c_banklinecode, c_channelbankno, c_bankprovincecode, c_bankcityno, sys_id, work_date, load_date) FROM stdin; +\. 
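--
-- Note on the distribution clauses used by the tables above: in TBase,
-- DISTRIBUTE BY HASH (col) hash-distributes rows across the datanodes on that
-- column, while DISTRIBUTE BY SHARD (col) to GROUP <group> maps rows to shards
-- owned by the named node group. A minimal sketch of both forms follows
-- (it assumes a TBase cluster in which the default_group node group exists;
-- the table and column names are illustrative only, not part of the fixture):
--
CREATE TABLE distribute_demo_hash  (id integer, payload text) DISTRIBUTE BY HASH (id);
CREATE TABLE distribute_demo_shard (id integer, payload text) DISTRIBUTE BY SHARD (id) to GROUP default_group;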
+ + +-- +-- Data for Name: s017_tagencyinfo; Type: TABLE DATA; Schema: sync; Owner: gregsun +-- + +COPY s017_tagencyinfo (c_agencyno, c_agencyname, c_fullname, c_agncyaddress, c_agncyzipcode, c_agncycontact, c_agncyphone, c_agncyfaxno, c_agncymail, c_agncybankno, c_agncybankacco, c_agncybankname, d_agncyregdate, c_agncystatus, d_lastdate, c_agencytype, c_detail, c_right, c_zdcode, l_liquidateredeem, l_liquidateallot, l_liquidatebonus, l_liquidatesub, c_sharetypes, f_agio, c_ztgonestep, c_preassign, l_cserialno, c_comparetype, c_liquidatetype, c_multitradeacco, c_iversion, c_imode, c_changeonstep, f_outagio, f_agiohint, f_outagiohint, c_allotliqtype, c_redeemliqtype, c_centerflag, c_netno, c_littledealtype, c_overtimedeal, d_lastinputtime, f_interestrate, c_clearsite, c_isdeal, c_agencyenglishname, l_fundaccono, c_rationflag, c_splitflag, c_tacode, c_outdataflag, c_hasindex, c_transferbyadjust, c_sharedetailexptype, c_navexptype, c_ecdmode, c_agencytypedetail, c_advanceshrconfirm, c_ecdversion, c_capmode, c_internetplatform, c_capautoarrive, c_outcapitaldata, c_ecdcheckmode, c_ecddealmode, c_fileimpmode, c_isotc, c_enableecd, c_autoaccotype, c_tncheckmode, c_captureidinfo, c_realfreeze, sys_id, work_date, load_date) FROM stdin; +1 \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N +\. + + +-- +-- Data for Name: s017_tconfirm_all; Type: TABLE DATA; Schema: sync; Owner: gregsun +-- + +COPY s017_tconfirm_all (c_businflag, d_cdate, c_cserialno, d_date, l_serialno, c_agencyno, c_netno, c_fundacco, c_tradeacco, c_fundcode, c_sharetype, f_confirmbalance, f_confirmshares, f_tradefare, f_tafare, f_stamptax, f_backfare, f_otherfare1, f_interest, f_interesttax, f_totalfare, f_agencyfare, f_netvalue, f_frozenbalance, f_unfrozenbalance, c_status, c_cause, c_taflag, c_custtype, c_custno, f_gainbalance, f_orifare, c_requestendflag, f_unbalance, f_unshares, c_reserve, f_interestshare, f_chincome, f_chshare, f_confirmincome, f_oritradefare, f_oritafare, f_oribackfare, f_oriotherfare1, c_requestno, f_balance, f_shares, f_agio, f_lastshares, f_lastfreezeshare, c_othercode, c_otheracco, c_otheragency, c_othernetno, c_bonustype, c_foriginalno, c_exceedflag, c_childnetno, c_othershare, c_actcode, c_acceptmode, c_freezecause, c_freezeenddate, f_totalbalance, f_totalshares, c_outbusinflag, c_protocolno, c_memo, f_registfare, f_fundfare, f_oriagio, c_shareclass, d_cisdate, c_bourseflag, c_fundtype, f_backfareagio, c_bankno, c_subfundmethod, c_combcode, f_returnfare, c_contractno, c_captype, l_contractserialno, l_othercontractserialno, d_exportdate, f_transferfee, f_oriconfirmbalance, f_extendnetvalue, l_remitserialno, c_zhxtht, c_improperredeem, f_untradefare, f_untradeinfare, f_untradeoutfare, c_profitnottransfer, f_outprofit, f_inprofit, c_totrustcontractid, d_repurchasedate, f_chengoutbalance, c_exporting, jy_fundid, jy_contractbh, jy_custid, jy_tocustid, jy_fare, c_trustcontractid, f_taagencyfare, f_taregisterfare, d_cdate_jy, jy_adjust, jy_subfundid, jy_adjust1114, jy_cdate, c_bankacco, c_bankname, c_nameinbank, f_riskcapital, f_replenishriskcapital, c_fromfundcode, c_fromtrustcontractid, c_trustagencyno, l_rdmschserialno, f_redeemprofit, f_redeemproyieldrate, d_redeemprobigdate, d_redeemproenddate, c_changeownerincomebelong, l_midremitserialno, c_fromtype, c_iscycinvest, l_fromserialno, l_frominterestconserialno, c_changeownerinterest, 
c_msgsendflag, l_sharedelaydays, c_istodayconfirm, f_newincome, f_floorincome, l_incomeremitserialno, c_isnetting, l_bankserialno, c_subfundcode, f_chengoutsum, f_chengoutprofit, l_confirmtransserialno, c_shareadjustgzexpflag, c_issend, c_exchangeflag, yh_date_1112, l_banktocontractserialno, c_payfeetype, c_tobankno, c_tobankacco, c_tobankname, c_tonameinbank, c_tobanklinecode, c_tobankprovincecode, c_tobankcityno, l_assetseperateno, c_sharecserialno, c_redeemprincipaltype, work_date, c_businname) FROM stdin; +1 \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N +\. + + +-- +-- Data for Name: s017_tdividenddetail; Type: TABLE DATA; Schema: sync; Owner: gregsun +-- + +COPY s017_tdividenddetail (d_cdate, c_cserialno, d_regdate, d_date, c_fundacco, c_tradeacco, c_fundcode, c_sharetype, c_agencyno, c_netno, f_totalshare, f_unitprofit, f_totalprofit, f_tax, c_flag, f_realbalance, f_reinvestbalance, f_realshares, f_fare, d_lastdate, f_netvalue, f_frozenbalance, f_frozenshares, f_incometax, c_reserve, d_requestdate, c_shareclass, l_contractserialno, l_specprjserialno, f_investadvisorratio, f_transferfee, l_profitserialno, d_exportdate, c_custid, jy_fundid, jy_subfundid, jy_custid, jy_contractbh, jy_profitsn, jy_profitmoney, jy_capitalmoney, jy_adjust, c_reinvestnetvalue, f_transferbalance, l_relatedserialno, c_printoperator, c_printauditor, sys_id, work_date, load_date, f_remainshares) FROM stdin; +2021-04-26 20:34:00 \N \N \N \N \N 2 \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N +\. + + +-- +-- Data for Name: s017_tfundday; Type: TABLE DATA; Schema: sync; Owner: gregsun +-- + +COPY s017_tfundday (d_date, d_cdate, c_fundcode, c_todaystatus, c_status, f_netvalue, f_lastshares, f_lastasset, f_asucceed, f_rsucceed, c_vastflag, f_encashratio, f_changeratio, c_excessflag, f_subscriberatio, c_inputpersonnel, c_checkpersonnel, f_income, f_incomeratio, f_unassign, f_incomeunit, f_totalnetvalue, f_servicefare, f_assign, f_growthrate, c_netvalueflag, f_managefare, d_exportdate, c_flag, f_advisorfee, d_auditdate, f_extendnetvalue, f_extendtotalnetvalue, jy_fundcode, f_yearincomeratio, f_riskcapital, f_totalincome, f_agencyexpyearincomeration, f_agencyexpincomeunit, f_agencyexpincomeration, f_agencyexpincome, c_isspecflag, c_isasync, sys_id, work_date, load_date) FROM stdin; +\. 
+ + +-- +-- Data for Name: s017_tfundinfo; Type: TABLE DATA; Schema: sync; Owner: gregsun +-- + +COPY s017_tfundinfo (c_fundcode, c_fundname, c_moneytype, c_managername, c_trusteecode, f_parvalue, f_issueprice, c_trusteeacco, d_issuedate, d_setupdate, f_maxbala, f_maxshares, f_minbala, f_minshares, l_elimitday, l_slimitday, l_alimitday, l_mincount, l_climitday, f_maxallot, f_maxredeem, c_fundcharacter, c_fundstatus, c_subscribemode, l_timelimit, l_subscribeunit, c_sharetypes, c_issuetype, f_factcollect, d_failuedate, f_allotratio, c_feeratiotype1, c_feeratiotype2, c_feetype, c_exceedpart, c_bonustype, c_forceredeem, c_interestdealtype, f_redeemfareratio, f_changefareratio, f_managerfee, f_right, c_property, d_evendate, f_totalbonus, c_changefree, c_reportcode, c_backfarecal, l_moneydate, l_netprecision, c_corpuscontent, f_corpusratio, c_farecaltype, l_liquidateallot, l_liquidateredeem, l_liquidatebonus, l_taspecialacco, c_fareprecision, d_issueenddate, c_farebelongasset, l_liquidatechange, l_liquidatefail, l_liquidateend, c_sharedetail, c_trusteebankname, c_boursetradeflag, c_fundenglishname, l_bankaccono, c_cleanflag, c_precision, c_upgradeflag, c_isdeal, c_farecltprecision, c_balanceprecision, c_shareprecision, c_bonusprecision, c_interestprecision, f_maxallotasset, f_maxallotshares, c_foreigntrustee, l_tnconfirm, c_rationallotstatus, f_trusteefee, c_fundacco, c_financetype, l_liquidatechangein, c_custname, c_identitytype, c_custtype, c_identityno, c_deductschemecode, c_customermanager, c_templateid, f_pr0, f_deductratio, c_farecalculatetype, c_saletype, l_maxcount, l_zhallotliqdays, l_zhredeemliqdays, f_liqasset, l_zhallotexpdays, l_zhredeemexpdays, c_limitmode, c_ordermode, c_acntlmtdealmode, l_informdays, c_allowpartredeem, c_fundendmode, f_fundendagio, c_minbalalimitisconfirm, c_gradetype, c_qryfreqtype, l_qrydaysltd, d_contractenddate, c_useinopenday, c_allotcalinterst, c_fundrisk, c_exitallot, c_subinterestcalc, c_earlyexitredfee, c_navexpfqy, l_navexpday, c_isbounded, c_earlyexitfeecalc, c_designdptid, c_fixeddividway, c_trusttype, f_maxnaturalmoney, c_projectid, c_trustclass, f_trustscale, c_structflag, c_priconveyflag, c_repurchasetype, c_iswholerepurchase, f_repurchaseminbala, c_repurchasemainbody, c_canelyrepurchase, c_earlybacktime, c_repurchaseprice, c_premiumpaymenttime, c_liquisource, l_period, c_canextensionflag, c_canelyliquidflag, c_trustassetdesc, c_returnside, c_returnpaymentway, c_returnbase, c_refepaymentway, c_refeside, c_refebase, f_warnline, f_stopline, f_collectinterest, f_durationinterest, f_investadvisorratio, c_bonusschema, c_guaranteetype, c_guaranteedesc, c_expectedyieldtype, f_minexpectedyield, f_maxexpectedyield, c_incomecycletype, f_incomecyclevalue, c_subaccotype, c_allotaccotype, c_fundtype, c_cootype, c_projecttype, c_investdirection, c_investdirectionfractionize, c_industrydetail, c_initeresttype, c_isextended, d_extenddate, c_dealmanagetype, c_investarea, c_projectcode, c_fundshortname, c_contractid, c_functype, c_specialbusintype, c_investindustry, c_managetype, c_area, c_risk, c_iscommitteedisscuss, c_structtype, c_commendplace, l_npmaxcount, c_client, c_clientcusttype, c_clientidtype, c_clientidno, c_clientbankname, c_clientaccono, c_clientaddress, c_clientzipcode, c_clientphoneno1, c_clientphoneno2, c_clientfax, c_beneficiary, c_collectbankname, c_collectbankno, c_collectaccountname, c_collectbankacco, c_keeperbankname, c_keeperaccountname, c_keeperaccountno, c_keepername, c_keepercorporation, c_keeperaddress, c_keeperzipcode, c_keeperphoneno1, 
c_keeperphoneno2, c_keeperfax, c_incomedistributetype, c_alarmline, c_stoplossline, f_investadvisorfee, c_investadvisordeduct, c_capitalacco, c_stockacconame, c_stocksalesdept, c_thirdpartybankno, c_thirdpartybankname, c_thirdpartyacconame, c_thirdpartyaccono, c_investadvisor, c_investadvisorbankno, c_investadvisorbankname, c_investadvisoracconame, c_investadvisoraccono, c_investadvisorcorporation, c_investadvisoraddress, c_investadvisorzipcode, c_investadvisorphoneno1, c_investadvisorphoneno2, c_investadvisorfax, c_authdelegate, c_loanfinanceparty, c_loanfinancepartycorporation, c_loanfinancepartyaddress, c_loanfinancepartyzipcode, c_loanfinancepartyphoneno1, c_loanfinancepartyphoneno2, c_loanfinancepartyfax, c_loaninteresttype, f_loaninterestrate, f_loanduration, c_loanmanagebank, f_loanmanagefee, f_loanfinancecost, f_creditattornduration, f_creditattorninterestduration, f_creditattornprice, f_billattornduration, f_billattorninterestduration, f_billattornprice, c_stkincfincparty, c_stkincfincpartycorporation, c_stkincfincpartyaddress, c_stkincfincpartyzipcode, c_stkincfincpartyphoneno1, c_stkincfincpartyphoneno2, c_stkincfincpartyfax, c_stkincincomeannualizedrate, c_stkincinteresttype, f_stkincattornprice, f_stkincattornduration, f_stkincbail, f_stkincfinccost, c_stkincmemo1, c_stkincmemo2, c_debtincfincparty, c_debtincfincpartycorporation, c_debtincfincpartyaddress, c_debtincfincpartyzipcode, c_debtincfincpartyphoneno1, c_debtincfincpartyphoneno2, c_debtincfincpartyfax, c_debtincincomerate, c_debtincinteresttype, f_debtincattornprice, f_debtincattornduration, f_debtincbail, f_debtincfinccost, c_debtincmemo1, c_othinvfincparty, c_othinvfincpartycorporation, c_othinvfincpartyaddress, c_othinvfincpartyzipcode, c_othinvfincpartyphoneno1, c_othinvfincpartyphoneno2, c_othinvfincpartyfax, f_othinvfinccost, c_othinvmemo1, c_othinvmemo2, c_othinvmemo3, c_banktrustcoobank, c_banktrustproductname, c_banktrustproductcode, c_banktrustundertakingletter, c_trustgovgovname, c_trustgovprojecttype, c_trustgovcootype, c_trustgovoptype, c_housecapital, c_houseispe, c_tradetype, c_businesstype, c_trustname, c_trustidtype, c_trustidno, d_trustidvaliddate, c_trustbankname, c_trustaccounttype, c_trustnameinbank, c_zhtrustbankname, c_zhtrustbankacco, c_issecmarket, c_fundoperation, c_trustmanager, c_tradeother, c_watchdog, c_memo, c_benefittype, c_redeemaccotype, c_bonusaccotype, c_fundendaccotype, c_collectfailaccotype, d_lastmodifydate, c_shareholdlimtype, c_redeemtimelimtype, c_isprincipalrepayment, c_principalrepaymenttype, l_interestyeardays, l_incomeyeardays, c_capuseprovcode, c_capusecitycode, c_capsourceprovcode, c_banktrustcoobankcode, c_banktrustisbankcap, c_trusteefeedesc, c_managefeedesc, c_investfeedesc, f_investadvisordeductratio, c_investdeductdesc, c_investadvisor2, f_investadvisorratio2, f_investadvisordeductratio2, c_investfeedesc2, c_investdeductdesc2, c_investadvisor3, f_investadvisorratio3, f_investadvisordeductratio3, c_investfeedesc3, c_investdeductdesc3, c_profitclassdesc, c_deductratiodesc, c_redeemfeedesc, l_defaultprecision, c_allotfeeaccotype, c_isposf, c_opendaydesc, c_actualmanager, c_subindustrydetail, c_isbankleading, c_subprojectcode, c_iscycleinvest, f_liquidationinterest, c_liquidationinteresttype, c_isbonusinvestfare, c_subfeeaccotype, c_redeemfeeaccotype, c_fundrptcode, c_ordertype, c_flag, c_allotliqtype, l_sharelimitday, c_iseverydayopen, c_tradebynetvalue, c_isstage, c_specbenfitmemo, d_effectivedate, c_issueendflag, c_resharehasrdmfee, jy_fundcode, jy_fundid, 
jy_subfundid, jy_dptid, c_iswealth, c_interestcalctype, c_allotinterestcalctype, c_isriskcapital, c_fundstatus_1225, c_isincomeeverydaycalc, c_isredeemreturninterest, c_isrefundrtninterest, d_estimatedsetupdate, f_estimatedfactcollect, c_isfinancialproducts, c_fundredeemtype, c_trademanualinput, f_clientmanageration, c_profitclassadjustment, c_mainfundcode, c_contractsealoff, c_permitnextperiod, c_preprofitschematype, c_fundredeemprofit, f_incomeration, c_incomecalctype, c_allocateaccoid, c_outfundcode, c_matchprofitclass, l_lastdays, c_contractprofitflag, c_agencysaleliqtype, l_delaydays, c_profitclassperiod, c_reportshowname, c_currencyincometype, c_beforeredeemcapital, c_contractversion, c_confirmacceptedflag, c_selectcontract, f_schemainterest, c_riskgrade, l_sharedelaydays, l_reservationdays, c_transfertype, c_schemavoluntarily, l_schemadetaildata, c_schemadetailtype, c_iscurrencyconfirm, c_allowmultiaccobank, d_capverif, c_templatetype, c_capitalprecision, c_fundno, c_profittype, d_paydate, d_shelvedate, d_offshelvedate, c_schemabegindatetype, l_schemabegindatedays, c_isautoredeem, c_isnettingrequest, c_issuingquotedtype, d_firstdistributedate, c_bonusfrequency, c_interestbigdatetype, c_gzdatatype, f_allotfareratio, f_subfareratio, c_begindatebeyond, c_profitnotinterest, c_setuplimittype, c_limitredeemtype, c_bonusfrequencytype, c_rfaccotype, c_capitalfee, c_exceedflag, c_enableecd, c_isfixedtrade, c_profitcaltype, f_ominbala, f_stepbala, c_remittype, c_interestcycle, c_repayguaranteecopy, c_repaytype, c_fundprofitdes, c_fundinfodes, c_riskeval, l_maxage, l_minage, c_fundriskdes, mig_l_assetid, l_faincomedays, c_producttype, c_otherbenefitproducttype, c_isotc, c_iseverydayprovision, c_incometogz, c_setuptransfundacco, c_issuefeeownerrequired, c_calcinterestbeforeallot, c_islimit300wnature, c_allowoverflow, c_trustfundtype, c_disclose, c_collectaccoid, c_isissuebymarket, c_setupstatus, c_isentitytrust, l_liquidatesub, c_incomeassigndesc, c_keeporgancode, d_defaultbegincacldate, c_zcbborrower, c_zcbborroweridno, c_zcbremittype, c_registcode, c_redeeminvestaccotype, c_bonusinvestaccotype, c_isabsnotopentrade, l_interestdiffdays, c_outfundstatus, c_reqsyntype, c_allredeemtype, c_isabsopentrade, c_funddesc, l_allotliquidays, l_subliquidays, c_autoupcontractenddaterule, c_fcsubaccotype, c_fcallotaccotype, c_fcredeemaccotype, c_fcbonusaccotype, c_captranslimitflag, c_redeemprincipaltype, c_interestcalcdealtype, c_collectconfirm, d_oldcontractenddate, c_tnvaluation, c_contractendnotify, c_rdmfeebase, c_exceedcfmratio, c_allowallotcustlimittype, c_yeardayscalctype, c_iscompoundinterest, c_dbcfm, c_limitaccountstype, c_cycleinvestrange, c_tncheckmode, c_enableearlyredeem, c_ispurceandredeemset, c_perfpaydealtype, c_allowappend, c_allowredeem, c_inputstatus, c_profitbalanceadjust, c_profitperiodadjust, c_autogeneratecontractid, c_transferneednetting, underwrite, undertook, undertake, c_issmsend, d_contractshortenddate, d_contractlongenddate, c_assetseperatefundcodesrc, f_averageprofit, c_currencycontractlimittype, l_profitlastdays, l_liquidationlastdays, c_arlimitincludeallreq, c_reqfundchange, c_dealnetvaluerule, c_contractdealtype, c_bonusplanbeginday, c_contractbalaupright, c_isneedinterestrate, c_isneedexcessratio, c_riskgraderemark, c_lossprobability, c_suitcusttype, c_createbonusschema, d_closedenddate, c_timelimitunit, c_exceedredeemdealtype, c_profitperiod, l_navgetintervaldays, load_date, sys_id, work_date, c_limittransfertype, c_transaccotype, c_incometaxbase, c_isredeemfareyearcalc, 
c_otherbenefitinputmode, c_aftdefaultinterestdeducttype, c_allowzerobalanceconfirm, c_incomejoinassign, l_liquidateliqbonus, c_predefaultinterestdeducttype, c_worktype, c_defaultinterestadduptype, c_issupportsubmode, f_expectedyield, c_recodecode, l_liquidatetransfer, c_ispayincometax, c_groupmainfundcode, c_redeemfeesplittype, c_capitalfromcrmorta, c_needcalcdefaultinterest, c_issuercode, l_redeemfareyeardays, c_floatyield, l_minriskscore, c_islocalmoneytypecollect) FROM stdin; +2 \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N S017 \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N +\. + + +-- +-- Data for Name: s017_tsharecurrents_all; Type: TABLE DATA; Schema: sync; Owner: gregsun +-- + +COPY s017_tsharecurrents_all (d_cdate, c_cserialno, c_businflag, d_requestdate, c_requestno, c_custno, c_fundacco, c_tradeacco, c_fundcode, c_sharetype, c_agencyno, c_netno, f_occurshares, f_occurbalance, f_lastshares, f_occurfreeze, f_lastfreezeshare, c_summary, f_gainbalance, d_sharevaliddate, c_bonustype, c_custtype, c_shareclass, c_bourseflag, d_exportdate, l_contractserialno, c_issend, c_sendbatch, work_date) FROM stdin; +\. 
+ + +-- +-- Data for Name: s017_ttrustclientinfo_all; Type: TABLE DATA; Schema: sync; Owner: gregsun +-- + +COPY s017_ttrustclientinfo_all (c_custno, c_custtype, c_custname, c_shortname, c_helpcode, c_identitytype, c_identityno, c_zipcode, c_address, c_phone, c_faxno, c_mobileno, c_email, c_sex, c_birthday, c_vocation, c_education, c_income, c_contact, c_contype, c_contno, c_billsendflag, c_callcenter, c_internet, c_secretcode, c_nationality, c_cityno, c_lawname, c_shacco, c_szacco, c_broker, f_agio, c_memo, c_reserve, c_corpname, c_corptel, c_specialcode, c_actcode, c_billsendpass, c_addressinvalid, d_appenddate, d_backdate, c_invalidaddress, c_backreason, c_modifyinfo, c_riskcontent, l_querydaysltd, c_customermanager, c_custproperty, c_custclass, c_custright, c_daysltdtype, d_idvaliddate, l_custgroup, c_recommender, c_recommendertype, d_idnovaliddate, c_organcode, c_othercontact, c_taxregistno, c_taxidentitytype, c_taxidentityno, d_legalvaliddate, c_shareholder, c_shareholderidtype, c_shareholderidno, d_holderidvaliddate, c_leader, c_leaderidtype, c_leaderidno, d_leadervaliddate, c_managercode, c_linemanager, c_clientinfoid, c_provincecode, c_countytown, c_phone2, c_clienttype, c_agencyno, c_industrydetail, c_isqualifiedcust, c_industryidentityno, c_lawidentitytype, c_lawidentityno, d_lawidvaliddate, d_conidvaliddate, c_conisrevmsg, c_conmobileno, c_conmoaddress, c_conzipcode, c_conphone1, c_conphone2, c_conemail, c_confaxno, c_incomsource, c_zhidentityno, c_zhidentitytype, c_eastcusttype, jy_custid, c_idtype201201030, c_emcontact, c_emcontactphone, c_instiregaddr, c_regcusttype, c_riskgrade, c_riskgraderemark, d_idvaliddatebeg, d_industryidvaliddatebeg, d_industryidvaliddate, c_incomesourceotherdesc, c_vocationotherdesc, c_businscope, d_conidvaliddatebeg, d_lawidvaliddatebeg, c_regmoneytype, f_regcapital, c_orgtype, c_contrholderno, c_contrholdername, c_contrholderidtype, c_contrholderidno, d_contrholderidvalidatebeg, d_contrholderidvalidate, c_responpername, c_responperidtype, c_responperidno, d_responperidvalidatebeg, d_responperidvalidate, c_lawphone, c_contrholderphone, c_responperphone, c_consex, c_conrelative, l_riskserialno, c_convocation, c_iscustrelated, c_businlicissuorgan, c_manageridno, c_manageridtype, c_managername, d_companyregdate, c_electronicagreement, c_householdregno, c_guardianrela, c_guardianname, c_guardianidtype, c_guardianidno, c_isfranchisingidstry, c_franchidstrybusinlic, c_workunittype, c_normalresidaddr, c_domicile, c_finainvestyears, c_parentidtype, c_parentidno, c_videono, c_bonustype, d_retirementdate, c_issendbigcustbill, c_idaddress, c_isproinvestor, c_sendkfflag, c_sendkfcause, c_sendsaflag, c_sendsacause, c_custrelationchannel, c_companytype, c_businlocation, c_custodian, d_elecsigndate, d_riskinputdate, c_circno, c_financeindustrydetail, c_outclientinfoid, d_duediligencedate, c_duediligencestatus, c_inputstatus, c_address2, c_reportcusttype, c_reportcusttypedetail, c_custsource, work_date) FROM stdin; +\. 
+ + +-- +-- Data for Name: sys_stat_error_log; Type: TABLE DATA; Schema: sync; Owner: gregsun +-- + +COPY sys_stat_error_log (proc_name, tab_level, step_no, step_desc, begin_time, end_time, workdate, row_num, elapsed, all_elapsed, sql_code, sql_errm) FROM stdin; +SP_B03_TS_REMETRADE B STEP_01 清除目标表数据 2021-04-26 00:00:00 \N 2021-04-26 00:00:00 \N \N \N \N \N +SP_B03_TS_REMETRADE B STEP_01 清除目标表数据 2021-04-26 00:00:00 \N 2021-04-26 00:00:00 \N \N \N \N \N +SP_B03_TS_REMETRADE B STEP_01 清除目标表数据 2021-04-26 00:00:00 \N 2021-04-26 00:00:00 \N \N \N \N \N +SP_B03_TS_REMETRADE B STEP_01 清除目标表数据 2021-04-26 00:00:00 \N 2021-04-26 00:00:00 \N \N \N \N \N +SP_B03_TS_REMETRADE B STEP_01 清除目标表数据 2021-04-26 00:00:00 \N 2021-04-26 00:00:00 \N \N \N \N \N +SP_B03_TS_REMETRADE B STEP_01 清除目标表数据 2021-04-26 00:00:00 \N 2021-04-26 00:00:00 \N \N \N \N \N +SP_B03_TS_REMETRADE B STEP_01 清除目标表数据 2021-04-26 00:00:00 \N 2021-04-26 00:00:00 \N \N \N \N \N +SP_B03_TS_REMETRADE B STEP_01 清除目标表数据 2021-04-26 00:00:00 \N 2021-04-26 00:00:00 \N \N \N \N \N +SP_B03_TS_REMETRADE B STEP_01 清除目标表数据 2021-04-26 00:00:00 \N 2021-04-26 00:00:00 \N \N \N \N \N +SP_B03_TS_REMETRADE B STEP_01 清除目标表数据 2021-04-26 00:00:00 \N 2021-04-26 00:00:00 \N \N \N \N \N +SP_B03_TS_REMETRADE B STEP_01 清除目标表数据 2021-04-26 00:00:00 \N 2021-04-26 00:00:00 \N \N \N \N \N +SP_B03_TS_REMETRADE B STEP_01 清除目标表数据 2021-04-26 00:00:00 \N 2021-04-26 00:00:00 \N \N \N \N \N +SP_B03_TS_REMETRADE B STEP_01 清除目标表数据 2021-04-26 00:00:00 \N 2021-04-26 00:00:00 \N \N \N \N \N +SP_B03_TS_REMETRADE B STEP_01 清除目标表数据 2021-04-26 00:00:00 \N 2021-04-26 00:00:00 \N \N \N \N \N +SP_B03_TS_REMETRADE B STEP_01 清除目标表数据 2021-04-26 00:00:00 \N 2021-04-26 00:00:00 \N \N \N \N \N +SP_B03_TS_REMETRADE B STEP_01 清除目标表数据 2021-04-26 00:00:00 \N 2021-04-26 00:00:00 \N \N \N \N \N +SP_B03_TS_REMETRADE B STEP_01 清除目标表数据 2021-04-26 00:00:00 \N 2021-04-26 00:00:00 \N \N \N \N \N +SP_B03_TS_REMETRADE B STEP_01 清除目标表数据 2021-04-26 00:00:00 \N 2021-04-26 00:00:00 \N \N \N \N \N +SP_B03_TS_REMETRADE B STEP_01 清除目标表数据 2021-04-26 00:00:00 \N 2021-04-26 00:00:00 \N \N \N \N \N +SP_B03_TS_REMETRADE B STEP_01 清除目标表数据 2021-04-26 00:00:00 \N 2021-04-26 00:00:00 \N \N \N \N \N +SP_B03_TS_REMETRADE B STEP_01 清除目标表数据 2021-04-26 00:00:00 \N 2021-04-26 00:00:00 \N \N \N \N \N +SP_B03_TS_REMETRADE B STEP_01 清除目标表数据 2021-04-26 00:00:00 \N 2021-04-26 00:00:00 \N \N \N \N \N +\. 
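The CREATE TABLE newtab AS SELECT statement further down in this dump leans on TBase's Oracle-compatible functions (NVL, DECODE, SYSDATE). As a minimal illustrative sketch only — assuming these functions follow the usual Oracle semantics, and using a hypothetical table t with made-up column names rather than anything defined in this dump — the Oracle-style expressions map onto standard SQL as follows:

SELECT NVL(f_occurbalance, f_relbalance)                AS bal_oracle,
       COALESCE(f_occurbalance, f_relbalance)           AS bal_standard,   -- same result
       DECODE(c_businflag,
              '03', f_confirmbalance + f_tradefare,
              '53', f_confirmbalance + f_tradefare,
                    f_confirmbalance)                    AS amt_oracle,
       CASE c_businflag
            WHEN '03' THEN f_confirmbalance + f_tradefare
            WHEN '53' THEN f_confirmbalance + f_tradefare
            ELSE f_confirmbalance
       END                                               AS amt_standard   -- same result
  FROM t;

Reading DECODE as a positional CASE and NVL as a two-argument COALESCE makes the large query below considerably easier to follow.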
+ + +-- +-- Name: ks0_fund_base_26 pk_ks0_fund_base_26; Type: CONSTRAINT; Schema: sync; Owner: gregsun +-- + +ALTER TABLE ONLY ks0_fund_base_26 + ADD CONSTRAINT pk_ks0_fund_base_26 PRIMARY KEY (id1, acc_cd, ins_cd); + + +-- +-- PostgreSQL database dump complete +-- + +create table newtab as + SELECT A.C_FUNDCODE, + A.C_FUNDNAME, + A.C_FUNDACCO, + A.F_NETVALUE, + A.C_AGENCYNAME, + A.C_CUSTNAME, + A.D_DATE, + A.D_CDATE, + A.F_CONFIRMBALANCE, + A.F_TRADEFARE, + A.F_CONFIRMSHARES, + ABS(NVL(B.F_OCCURBALANCE, A.F_RELBALANCE)) F_RELBALANCE, + A.F_INTEREST, + NVL(DECODE(B.C_BUSINFLAG, + '02', + '申购', + '50', + '申购', + '74', + '申购', + '03', + '赎回'), + DECODE(A.C_BUSINFLAG, + '01', + '认购', + '02', + '申购', + '03', + '赎回', + '53', + '强制赎回', + '50', + '产品成立')) AS INFO, + null, + SYSDATE AS LOAD_DATE + FROM (SELECT A.C_FUNDCODE, + C.C_FUNDNAME, + A.C_FUNDACCO, + FUNC_GETLASTNETVALUE(A.C_FUNDCODE, A.D_CDATE::date) F_NETVALUE, + (SELECT C_AGENCYNAME + FROM S017_TAGENCYINFO + WHERE A.C_AGENCYNO = C_AGENCYNO) C_AGENCYNAME, + B.C_CUSTNAME, + TO_CHAR(A.D_DATE, 'yyyy-mm-dd') D_DATE, + TO_CHAR(A.D_CDATE, 'yyyy-mm-dd') D_CDATE, + DECODE(A.C_BUSINFLAG, + '03', + A.F_CONFIRMBALANCE + A.F_TRADEFARE, + '53', + A.F_CONFIRMBALANCE + A.F_TRADEFARE, + A.F_CONFIRMBALANCE) F_CONFIRMBALANCE, + A.F_TRADEFARE, + A.F_CONFIRMSHARES, + DECODE(A.C_BUSINFLAG, + '03', + A.F_CONFIRMBALANCE, + '53', + A.F_CONFIRMBALANCE, + A.F_CONFIRMBALANCE - A.F_TRADEFARE) F_RELBALANCE, + A.F_INTEREST, + A.C_BUSINFLAG, + A.C_CSERIALNO + FROM (SELECT D_DATE, + C_AGENCYNO, + DECODE(C_BUSINFLAG, + '03', + DECODE(C_IMPROPERREDEEM, + '3', + '100', + '5', + '100', + C_BUSINFLAG), + C_BUSINFLAG) C_BUSINFLAG, + C_FUNDACCO, + D_CDATE, + C_FUNDCODE, + F_CONFIRMBALANCE, + F_CONFIRMSHARES, + C_REQUESTNO, + F_TRADEFARE, + C_TRADEACCO, + F_INTEREST, + C_CSERIALNO, + L_SERIALNO, + L_CONTRACTSERIALNO + FROM S017_TCONFIRM_ALL T3 + UNION + SELECT D_DATE, + C_AGENCYNO, + '02' C_BUSINFLAG, + C_FUNDACCO, + D_LASTDATE AS D_CDATE, + C_FUNDCODE, + F_REINVESTBALANCE F_CONFIRMBALANCE, + F_REALSHARES F_CONFIRMSHARES, + '' C_REQUESTNO, + 0 F_TRADEFARE, + C_TRADEACCO, + 0 F_INTEREST, + C_CSERIALNO, + 0 L_SERIALNO, + L_CONTRACTSERIALNO + FROM S017_TDIVIDENDDETAIL T1 + /*WHERE T1.C_FLAG = '0'*/) A + LEFT JOIN S017_TACCONET TACN + ON A.C_TRADEACCO = TACN.C_TRADEACCO + LEFT JOIN (SELECT * FROM S017_TACCOINFO WHERE C_ACCOUNTTYPE = 'A') X + ON A.C_FUNDACCO = X.C_FUNDACCO + LEFT JOIN S017_TTRUSTCLIENTINFO_ALL B + ON X.C_CUSTNO = B.C_CUSTNO + INNER JOIN S017_TFUNDINFO C + ON A.C_FUNDCODE = C.C_FUNDCODE + ) A + LEFT JOIN (SELECT ST1.D_CDATE, + ST1.C_FUNDCODE, + ST1.F_OCCURBALANCE, + ST1.C_BUSINFLAG, + ST1.C_FUNDACCO, + ST1.C_CSERIALNO + FROM S017_TSHARECURRENTS_ALL ST1 + -- WHERE ST1.C_BUSINFLAG <> '74' + UNION ALL + SELECT ST2.D_DATE AS D_CDATE, + ST2.C_FUNDCODE, + ST2.F_TOTALPROFIT AS F_OCCURBALANCE, + '74' AS C_BUSINFLAG, + ST2.C_FUNDACCO, + ST2.C_CSERIALNO + FROM S017_TDIVIDENDDETAIL ST2 + -- WHERE ST2.C_FLAG = '0' + ) B + ON A.C_FUNDCODE = B.C_FUNDCODE + /* + AND A.C_FUNDACCO = B.C_FUNDACCO + AND TO_DATE(A.D_CDATE, 'YYYY-MM-DD') = B.D_CDATE + AND A.C_CSERIALNO = B.C_CSERIALNO*/; + +DROP SCHEMA sync cascade; From c9db8471c320b7d65320bf15696e38f050811d38 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Thu, 20 May 2021 11:41:34 +0800 Subject: [PATCH 377/578] fix GTM standby lost when xlog of GTM host is not available http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131087099711 (merge request !333) Squash merge branch 'sigmalin_oracle' into 'Tbase_v5.09' * fix GTM standby lost when 
xlog of GTM host is not available --- src/gtm/client/fe-protocol.c | 11 ++++++++++ src/gtm/main/gtm_xlog.c | 39 ++++++++++++++++++++++++++---------- src/gtm/main/main.c | 8 ++++++++ src/include/gtm/gtm_client.h | 3 ++- src/include/gtm/gtm_xlog.h | 3 ++- 5 files changed, 51 insertions(+), 13 deletions(-) diff --git a/src/gtm/client/fe-protocol.c b/src/gtm/client/fe-protocol.c index 89bedf88..a575510e 100644 --- a/src/gtm/client/fe-protocol.c +++ b/src/gtm/client/fe-protocol.c @@ -1382,6 +1382,17 @@ result->gr_status = GTM_RESULT_ERROR; result->gr_resdata.grd_xlog_data.length = 0; result->gr_resdata.grd_xlog_data.xlog_data = NULL; + if (gtmpqGetInt(&result->gr_resdata.grd_xlog_data.status, sizeof(int), conn)) + { + result->gr_status = GTM_RESULT_ERROR; + break; + } + + if (result->gr_resdata.grd_xlog_data.status != Send_OK) + { + break; + } + if (gtmpqGetInt64((int64 *)&result->gr_resdata.grd_xlog_data.flush, conn)) { result->gr_status = GTM_RESULT_ERROR; diff --git a/src/gtm/main/gtm_xlog.c b/src/gtm/main/gtm_xlog.c index d38aea68..3d46942e 100644 --- a/src/gtm/main/gtm_xlog.c +++ b/src/gtm/main/gtm_xlog.c @@ -816,7 +816,7 @@ GTM_GetReplicationResultIfAny(GTM_StandbyReplication *replication,Port *port) return 1; } -static bool +static int ReadXLogFileToBuffIntern(GTM_XLogSegmentBuff *buff,TimeLineID timeline,XLogSegNo segment_no) { char path[MAXFNAMELEN]; @@ -829,7 +829,7 @@ ReadXLogFileToBuffIntern(GTM_XLogSegmentBuff *buff,TimeLineID timeline,XLogSegNo if(fd == -1) { elog(LOG,"Fail to open xlog %s : %s",path,strerror(errno)); - return false; + return Send_Error; } buff->total_length = 0; @@ -842,7 +842,7 @@ ReadXLogFileToBuffIntern(GTM_XLogSegmentBuff *buff,TimeLineID timeline,XLogSegNo { elog(LOG,"Read xlog file %s fails : %s",path,strerror(errno)); close(fd); - return false; + return Send_Error; } if(bytes == 0) @@ -857,10 +857,10 @@ ReadXLogFileToBuffIntern(GTM_XLogSegmentBuff *buff,TimeLineID timeline,XLogSegNo if(enalbe_gtm_xlog_debug) elog(LOG,"read xlog file %s with bytes %d",path,buff->total_length); - return true; + return Send_OK; } -static bool +static int ReadXLogFileToBuff(GTM_XLogSegmentBuff *buff,TimeLineID timeline,XLogSegNo segment_no) { char path[MAXFNAMELEN]; @@ -881,7 +881,8 @@ ReadXLogFileToBuff(GTM_XLogSegmentBuff *buff,TimeLineID timeline,XLogSegNo segme if(access(path,F_OK) < 0) { elog(LOG,"xlog file %s not found ,that is not support to happen.",path); - return false; + /* need to tell the standby that the required xlog is not available */ + return Send_XlogFile_Not_Found; } if(enalbe_gtm_xlog_debug) @@ -986,6 +987,7 @@ GetXLogFileSize(TimeLineID timeline,XLogSegNo segment_no) static int SendXLogDataFromFileBuff(GTM_StandbyReplication *replication,StringInfo message_buff) { + int ret = Send_OK; GTM_XLogSegmentBuff *local_buff = &replication->xlog_read_buff; XLogSegNo request_segment = GetSegmentNo(replication->send_ptr); @@ -998,8 +1000,9 @@ SendXLogDataFromFileBuff(GTM_StandbyReplication *replication,StringInfo message_ return Send_Data_Not_Found; } - if(ReadXLogFileToBuff(local_buff,replication->time_line,request_segment) == false) - return Send_Error; + ret = ReadXLogFileToBuff(local_buff,replication->time_line,request_segment); + if(ret != Send_OK) + return ret; SendXLogDataFromFileBuffInternal(replication,message_buff); @@ -1070,10 +1073,10 @@ SendXLogContext(GTM_StandbyReplication *replication,Port *port) int bytes; StringInfoData out_message; - initStringInfo(&out_message); - pq_beginmessage(&out_message, 'S'); pq_sendint(&out_message, 
MSG_REPLICATION_CONTENT, 4); + /* send the processing result status to the standby */ + pq_sendint(&out_message, Send_OK, sizeof(int)); pq_sendint64(&out_message, GetReplicationSendRequestPtr(replication)); /* request send reply */ @@ -1081,7 +1084,16 @@ SendXLogContext(GTM_StandbyReplication *replication,Port *port) bytes = SendXLogData(replication,&out_message); - if(bytes == Send_Error) + if (bytes == Send_XlogFile_Not_Found) + { + pfree(out_message.data); + out_message.data = NULL; + pq_beginmessage(&out_message, 'S'); + pq_sendint(&out_message, MSG_REPLICATION_CONTENT, 4); + /* send the processing result status to the standby */ + pq_sendint(&out_message, Send_XlogFile_Not_Found, sizeof(int)); + } + else if(bytes == Send_Error) goto send_fail; pq_endmessage(port,&out_message); @@ -1089,6 +1101,11 @@ SendXLogContext(GTM_StandbyReplication *replication,Port *port) if(pq_flush(port)) goto send_fail; + if (bytes == Send_XlogFile_Not_Found) + { + return false; + } + return true; send_fail: diff --git a/src/gtm/main/main.c b/src/gtm/main/main.c index 7d9563f1..e894a5bc 100644 --- a/src/gtm/main/main.c +++ b/src/gtm/main/main.c @@ -3309,6 +3309,14 @@ GTM_ThreadWalReceiver(void *argp) Assert(res->gr_status == GTM_RESULT_OK); Assert(res->gr_type == MSG_REPLICATION_CONTENT); + if (res->gr_resdata.grd_xlog_data.status != Send_OK) + { + Assert(res->gr_resdata.grd_xlog_data.status == Send_XlogFile_Not_Found); + elog(LOG,"xlog file not found in master, exit now"); + exit(1); + } + + Assert(res->gr_resdata.grd_xlog_data.status == Send_OK); size = res->gr_resdata.grd_xlog_data.length; start_pos = res->gr_resdata.grd_xlog_data.pos; end_pos = start_pos + size; diff --git a/src/include/gtm/gtm_client.h b/src/include/gtm/gtm_client.h index 2ae03ff1..f85e7d13 100644 --- a/src/include/gtm/gtm_client.h +++ b/src/include/gtm/gtm_client.h @@ -75,7 +75,8 @@ typedef union GTM_ResultData char* xlog_data; int reply; XLogRecPtr flush; - } grd_xlog_data; + int status; + } grd_xlog_data; #endif diff --git a/src/include/gtm/gtm_xlog.h b/src/include/gtm/gtm_xlog.h index d2f9c050..986f73c3 100644 --- a/src/include/gtm/gtm_xlog.h +++ b/src/include/gtm/gtm_xlog.h @@ -115,7 +115,8 @@ enum XLogSendResult Send_OK = 1, Send_No_data = 0, Send_Data_Not_Found = -1, - Send_Error = -2 + Send_Error = -2, + Send_XlogFile_Not_Found = -3 }; typedef struct XLogCtlData From 7ed6b5643124e68d8d88870410cff2edf4692ab9 Mon Sep 17 00:00:00 2001 From: whalesong Date: Fri, 21 May 2021 17:23:04 +0800 Subject: [PATCH 378/578] 2pc files opt: add 2pc hash table on shmem (merge request 300), hash table optimize --- src/backend/access/transam/twophase.c | 85 ++++++++++++++++++++++----- src/backend/utils/hash/dynahash.c | 56 +++++++++++++++++- src/backend/utils/misc/guc.c | 9 +++ src/include/access/twophase.h | 1 + src/include/utils/hsearch.h | 2 + 5 files changed, 138 insertions(+), 15 deletions(-) diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c index 3124e174..56d80bf1 100644 --- a/src/backend/access/transam/twophase.c +++ b/src/backend/access/transam/twophase.c @@ -157,6 +157,7 @@ bool enable_2pc_file_cache = true; bool enable_2pc_file_check = true; bool enable_2pc_entry_key_check = true; bool enable_2pc_entry_trace = false; +bool enable_2pc_hash_table_check = true; int record_2pc_cache_size = 4096; int record_2pc_entry_size = 2048; @@ -168,7 +169,8 @@ int record_2pc_partitions = 32; #define MAX_2PC_INFO_SIZE (record_2pc_entry_size - MAX_TID_SIZE) #define DFLT_2PC_INFO_SIZE 1024 /* default size */ 
-#define MAX_RETRY_TIMES 10 +#define HASH_TAB_RETRY_MAX 10 +#define HASH_TAB_RETRY_SLEEP 2000 /* sleep time: 2ms */ /* hash table entry for 2pc record */ typedef struct Cache2pcInfo @@ -2463,10 +2465,15 @@ CheckPointTwoPhase(XLogRecPtr redo_horizon) Assert(NULL != entry); check_2pc_file(entry->key, entry->info, func); - elog(LOG, "[%s] key %s is found in hash table", func, entry->key); + elog(LOG, "[%s] %s is found in hash table seq", func, entry->key); if (IsXidImplicit(entry->key)) { + if (0 == strlen(entry->info)) + { + elog(WARNING, "[%s] %s info length is 0", func, entry->key); + continue; + } memset(info, 0, MAX_2PC_INFO_SIZE); memcpy(info, entry->info, strlen(entry->info)); @@ -2482,12 +2489,10 @@ CheckPointTwoPhase(XLogRecPtr redo_horizon) func, entry->key, PGXCNodeName); continue; } - else - { + elog(LOG, "[%s] %s start node is %s", func, entry->key, PGXCNodeName); } - } else { elog(WARNING, "[%s] %s get start node failed, info: %s", @@ -3565,6 +3570,8 @@ void record_2pc_involved_nodes_xid(const char * tid, XLogRecPtr xlogrec = 0; #endif + enable_hash_table_trace = false; + if (!enable_2pc_recovery_info) { return ; @@ -3711,18 +3718,64 @@ void record_2pc_involved_nodes_xid(const char * tid, } else { - elog(LOG, "[%s] %s is found in hash table", func, tid); + elog(LOG, "[%s] %s is added to hash table, entry: %p, " + "record_2pc_cache: %p, hashvalue: %u", func, tid, entry, + record_2pc_cache, string_hash(tid, MAX_TID_SIZE)); } } - else if (enable_2pc_entry_trace) + else if (enable_2pc_entry_trace || enable_2pc_hash_table_check) { - elog(LOG, "[%s] %s is added to hash table, entry: %p", - func, tid, entry); + elog(LOG, "[%s] %s is added to hash table, entry: %p, " + "record_2pc_cache: %p, hashvalue: %u", func, tid, entry, + record_2pc_cache, string_hash(tid, MAX_TID_SIZE)); } memcpy(entry->info, content.data, size + 1); check_entry_key(tid, entry->key, func); + if (enable_2pc_hash_table_check) + { + int retry_times = 0; + Cache2pcInfo *entry_debug = NULL; + + GET_2PC_FILE_PATH(path, tid); + + while (retry_times++ < HASH_TAB_RETRY_MAX) + { + entry_debug = (Cache2pcInfo *)hash_search(record_2pc_cache, + tid, HASH_FIND, &found); + if (found) + { + Assert(NULL != entry_debug); + check_entry_key(tid, entry_debug->key, func); + break; + } + + /* not found */ + elog(LOG, "[%s] %s is not found in hash table, retry times: %d", + func, tid, retry_times); + + Assert(NULL == entry_debug); + + if (0 == access(path, F_OK)) + { + elog(LOG, "[%s] %s found 2pc file %s", func, tid, path); + break; + } + + print_record_2pc_cache(func); + pg_usleep(HASH_TAB_RETRY_SLEEP); + enable_hash_table_trace = true; + } + + enable_hash_table_trace = false; + + if (retry_times >= HASH_TAB_RETRY_MAX) + { + elog(PANIC, "[%s] %s is not found in hash table", func, tid); + } + } + resetStringInfo(&content); pfree(content.data); return; @@ -3833,7 +3886,7 @@ void record_2pc_commit_timestamp(const char *tid, GlobalTimestamp commit_timesta GET_2PC_FILE_PATH(path, tid); - while (NULL != record_2pc_cache && retry_times++ < MAX_RETRY_TIMES) + while (NULL != record_2pc_cache && retry_times++ < HASH_TAB_RETRY_MAX) { Assert(strlen(tid) < MAX_TID_SIZE); entry = (Cache2pcInfo *)hash_search(record_2pc_cache, tid, HASH_FIND, &found); @@ -3884,7 +3937,7 @@ void record_2pc_commit_timestamp(const char *tid, GlobalTimestamp commit_timesta } if (fd < 0) { - elog(ERROR, "[%s] could not append timestamp in file %s, errMsg: %s", + elog(ERROR, "[%s] could not append timestamp, file %s, errMsg: %s", func, path, strerror(errno)); } @@ -3931,17 
+3984,21 @@ void record_2pc_commit_timestamp(const char *tid, GlobalTimestamp commit_timesta func, tid, retry_times); Assert(NULL == entry); - print_record_2pc_cache(func); - if (0 == access(path, F_OK)) { elog(LOG, "[%s] %s found 2pc file %s", func, tid, path); break; } - pg_usleep(5000L); /* sleep 5ms */ + print_record_2pc_cache(func); + + pg_usleep(HASH_TAB_RETRY_SLEEP); + + enable_hash_table_trace = true; } + enable_hash_table_trace = false; + if (NULL != record_2pc_cache) { elog(LOG, "[%s] %s is not found in hash table, get from disk", func, tid); diff --git a/src/backend/utils/hash/dynahash.c b/src/backend/utils/hash/dynahash.c index 8e62e871..0e751a96 100644 --- a/src/backend/utils/hash/dynahash.c +++ b/src/backend/utils/hash/dynahash.c @@ -115,6 +115,8 @@ /* Number of freelists to be used for a partitioned hash table. */ #define NUM_FREELISTS 32 +bool enable_hash_table_trace = false; + /* A hash bucket is a linked list of HASHELEMENTs */ typedef HASHELEMENT *HASHBUCKET; @@ -926,9 +928,15 @@ hash_search_with_hash_value(HTAB *hashp, long segment_ndx; HASHSEGMENT segp; HASHBUCKET currBucket; + HASHBUCKET *firstBucketPtr; HASHBUCKET *prevBucketPtr; + HASHBUCKET *prevBucketPtrCheck; HashCompareFunc match; + char *func = "hash_search_with_hash_value"; + bool is_trace = (enable_hash_table_trace && + 0 == strcmp(hashp->tabname, "Record 2pc Cache")); + #if HASH_STATISTICS hash_accesses++; hctl->accesses++; @@ -965,12 +973,27 @@ hash_search_with_hash_value(HTAB *hashp, segp = hashp->dir[segment_num]; + if (is_trace) + { + elog(LOG, "[%s] %s hashvalue: %u, freelist_idx: %d, IS_PARTITIONED: %d, " + "bucket: %u, segment_num: %ld, segment_ndx %ld, segp: %p", + func, (char *)keyPtr, hashvalue, freelist_idx, IS_PARTITIONED(hctl), + bucket, segment_num, segment_ndx, segp); + } + if (segp == NULL) hash_corrupted(hashp); - prevBucketPtr = &segp[segment_ndx]; + firstBucketPtr = &segp[segment_ndx]; + prevBucketPtr = firstBucketPtr; currBucket = *prevBucketPtr; + if (is_trace) + { + elog(LOG, "[%s] %s prevBucketPtr: %p, currBucket: %p", + func, (char *)keyPtr, prevBucketPtr, currBucket); + } + /* * Follow collision chain looking for matching key */ @@ -979,9 +1002,19 @@ hash_search_with_hash_value(HTAB *hashp, while (currBucket != NULL) { + if (is_trace) + { + elog(LOG, "[%s] %s currBucket: %p", func, (char *)keyPtr, currBucket); + } if (currBucket->hashvalue == hashvalue && match(ELEMENTKEY(currBucket), keyPtr, keysize) == 0) + { + if (is_trace) + { + elog(LOG, "[%s] %s break currBucket: %p", func, (char *)keyPtr, currBucket); + } break; + } prevBucketPtr = &(currBucket->link); currBucket = *prevBucketPtr; #if HASH_STATISTICS @@ -1065,10 +1098,31 @@ hash_search_with_hash_value(HTAB *hashp, errmsg("out of memory"))); } + prevBucketPtrCheck = prevBucketPtr; + + /* if partitioned, must lock freeList */ + if (IS_PARTITIONED(hctl)) + SpinLockAcquire(&(hctl->freeList[freelist_idx].mutex)); + + prevBucketPtr = firstBucketPtr; + while (*prevBucketPtr != NULL) + { + prevBucketPtr = &((*prevBucketPtr)->link); + } + + if (prevBucketPtr != prevBucketPtrCheck) + { + elog(LOG, "[%s] prevBucketPtr(%p) != prevBucketPtrCheck(%p)", + func, prevBucketPtr, prevBucketPtrCheck); + } + /* link into hashbucket chain */ *prevBucketPtr = currBucket; currBucket->link = NULL; + if (IS_PARTITIONED(hctl)) + SpinLockRelease(&hctl->freeList[freelist_idx].mutex); + /* copy key into record */ currBucket->hashvalue = hashvalue; hashp->keycopy(ELEMENTKEY(currBucket), keyPtr, keysize); diff --git a/src/backend/utils/misc/guc.c 
b/src/backend/utils/misc/guc.c index 5066d491..ec9db352 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -2726,6 +2726,15 @@ static struct config_bool ConfigureNamesBool[] = false, NULL, NULL, NULL }, + { + {"enable_2pc_hash_table_check", PGC_USERSET, CUSTOM_OPTIONS, + gettext_noop("Enable 2PC hash table check."), + NULL + }, + &enable_2pc_hash_table_check, + false, + NULL, NULL, NULL + }, #endif #ifdef __TBASE__ diff --git a/src/include/access/twophase.h b/src/include/access/twophase.h index bd76266f..e0fe09d2 100644 --- a/src/include/access/twophase.h +++ b/src/include/access/twophase.h @@ -101,6 +101,7 @@ extern bool enable_2pc_file_cache; extern bool enable_2pc_file_check; extern bool enable_2pc_entry_key_check; extern bool enable_2pc_entry_trace; +extern bool enable_2pc_hash_table_check; extern int record_2pc_cache_size; extern int record_2pc_entry_size; diff --git a/src/include/utils/hsearch.h b/src/include/utils/hsearch.h index 651b3b59..15c7049a 100644 --- a/src/include/utils/hsearch.h +++ b/src/include/utils/hsearch.h @@ -116,6 +116,8 @@ typedef struct HASHELEMENT *curEntry; /* current entry in bucket */ } HASH_SEQ_STATUS; +extern bool enable_hash_table_trace; + /* * prototypes for functions in dynahash.c */ From bc4b6bcde4a8808f5cc0f9163fdd2b51197c313e Mon Sep 17 00:00:00 2001 From: whalesong Date: Fri, 21 May 2021 18:08:38 +0800 Subject: [PATCH 379/578] 2pc files opt: add 2pc hash table on shmem (merge request 300), hash table optimize, regress fix --- src/test/regress/expected/sysviews.out | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out index 842fabf5..f9926dda 100644 --- a/src/test/regress/expected/sysviews.out +++ b/src/test/regress/expected/sysviews.out @@ -76,6 +76,7 @@ select name, setting from pg_settings where name like 'enable%'; enable_2pc_entry_trace | off enable_2pc_file_cache | on enable_2pc_file_check | on + enable_2pc_hash_table_check | off enable_2pc_recovery_info | on enable_audit | off enable_audit_warning | off @@ -141,7 +142,7 @@ select name, setting from pg_settings where name like 'enable%'; enable_transparent_crypt | on enable_user_authority_force_check | off enable_xlog_mprotect | on -(68 rows) +(69 rows) -- Test that the pg_timezone_names and pg_timezone_abbrevs views are -- more-or-less working. 
We can't test their contents in any great detail From 8ea9290d305748a6561c29301f74a00414cae73e Mon Sep 17 00:00:00 2001 From: whalesong Date: Mon, 31 May 2021 20:15:40 +0800 Subject: [PATCH 380/578] 2pc files opt: add 2pc hash table on shmem (merge request 300), remove hash table debug code --- src/backend/access/transam/twophase.c | 9 ----- src/backend/utils/hash/dynahash.c | 56 +-------------------------- src/include/utils/hsearch.h | 2 - 3 files changed, 1 insertion(+), 66 deletions(-) diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c index 56d80bf1..14935855 100644 --- a/src/backend/access/transam/twophase.c +++ b/src/backend/access/transam/twophase.c @@ -3570,8 +3570,6 @@ void record_2pc_involved_nodes_xid(const char * tid, XLogRecPtr xlogrec = 0; #endif - enable_hash_table_trace = false; - if (!enable_2pc_recovery_info) { return ; @@ -3765,11 +3763,8 @@ void record_2pc_involved_nodes_xid(const char * tid, print_record_2pc_cache(func); pg_usleep(HASH_TAB_RETRY_SLEEP); - enable_hash_table_trace = true; } - enable_hash_table_trace = false; - if (retry_times >= HASH_TAB_RETRY_MAX) { elog(PANIC, "[%s] %s is not found in hash table", func, tid); @@ -3993,12 +3988,8 @@ void record_2pc_commit_timestamp(const char *tid, GlobalTimestamp commit_timesta print_record_2pc_cache(func); pg_usleep(HASH_TAB_RETRY_SLEEP); - - enable_hash_table_trace = true; } - enable_hash_table_trace = false; - if (NULL != record_2pc_cache) { elog(LOG, "[%s] %s is not found in hash table, get from disk", func, tid); diff --git a/src/backend/utils/hash/dynahash.c b/src/backend/utils/hash/dynahash.c index 0e751a96..fc7e8bf3 100644 --- a/src/backend/utils/hash/dynahash.c +++ b/src/backend/utils/hash/dynahash.c @@ -115,8 +115,6 @@ /* Number of freelists to be used for a partitioned hash table. 
*/ #define NUM_FREELISTS 32 -bool enable_hash_table_trace = false; - /* A hash bucket is a linked list of HASHELEMENTs */ typedef HASHELEMENT *HASHBUCKET; @@ -928,15 +926,9 @@ hash_search_with_hash_value(HTAB *hashp, long segment_ndx; HASHSEGMENT segp; HASHBUCKET currBucket; - HASHBUCKET *firstBucketPtr; HASHBUCKET *prevBucketPtr; - HASHBUCKET *prevBucketPtrCheck; HashCompareFunc match; - char *func = "hash_search_with_hash_value"; - bool is_trace = (enable_hash_table_trace && - 0 == strcmp(hashp->tabname, "Record 2pc Cache")); - #if HASH_STATISTICS hash_accesses++; hctl->accesses++; @@ -973,27 +965,12 @@ hash_search_with_hash_value(HTAB *hashp, segp = hashp->dir[segment_num]; - if (is_trace) - { - elog(LOG, "[%s] %s hashvalue: %u, freelist_idx: %d, IS_PARTITIONED: %d, " - "bucket: %u, segment_num: %ld, segment_ndx %ld, segp: %p", - func, (char *)keyPtr, hashvalue, freelist_idx, IS_PARTITIONED(hctl), - bucket, segment_num, segment_ndx, segp); - } - if (segp == NULL) hash_corrupted(hashp); - firstBucketPtr = &segp[segment_ndx]; - prevBucketPtr = firstBucketPtr; + prevBucketPtr = &segp[segment_ndx]; currBucket = *prevBucketPtr; - if (is_trace) - { - elog(LOG, "[%s] %s prevBucketPtr: %p, currBucket: %p", - func, (char *)keyPtr, prevBucketPtr, currBucket); - } - /* * Follow collision chain looking for matching key */ @@ -1002,19 +979,9 @@ hash_search_with_hash_value(HTAB *hashp, while (currBucket != NULL) { - if (is_trace) - { - elog(LOG, "[%s] %s currBucket: %p", func, (char *)keyPtr, currBucket); - } if (currBucket->hashvalue == hashvalue && match(ELEMENTKEY(currBucket), keyPtr, keysize) == 0) - { - if (is_trace) - { - elog(LOG, "[%s] %s break currBucket: %p", func, (char *)keyPtr, currBucket); - } break; - } prevBucketPtr = &(currBucket->link); currBucket = *prevBucketPtr; #if HASH_STATISTICS @@ -1098,31 +1065,10 @@ hash_search_with_hash_value(HTAB *hashp, errmsg("out of memory"))); } - prevBucketPtrCheck = prevBucketPtr; - - /* if partitioned, must lock freeList */ - if (IS_PARTITIONED(hctl)) - SpinLockAcquire(&(hctl->freeList[freelist_idx].mutex)); - - prevBucketPtr = firstBucketPtr; - while (*prevBucketPtr != NULL) - { - prevBucketPtr = &((*prevBucketPtr)->link); - } - - if (prevBucketPtr != prevBucketPtrCheck) - { - elog(LOG, "[%s] prevBucketPtr(%p) != prevBucketPtrCheck(%p)", - func, prevBucketPtr, prevBucketPtrCheck); - } - /* link into hashbucket chain */ *prevBucketPtr = currBucket; currBucket->link = NULL; - if (IS_PARTITIONED(hctl)) - SpinLockRelease(&hctl->freeList[freelist_idx].mutex); - /* copy key into record */ currBucket->hashvalue = hashvalue; hashp->keycopy(ELEMENTKEY(currBucket), keyPtr, keysize); diff --git a/src/include/utils/hsearch.h b/src/include/utils/hsearch.h index 15c7049a..651b3b59 100644 --- a/src/include/utils/hsearch.h +++ b/src/include/utils/hsearch.h @@ -116,8 +116,6 @@ typedef struct HASHELEMENT *curEntry; /* current entry in bucket */ } HASH_SEQ_STATUS; -extern bool enable_hash_table_trace; - /* * prototypes for functions in dynahash.c */ From b46f0c53c128f1cd8df419f901eda90e12f55e4f Mon Sep 17 00:00:00 2001 From: bethding Date: Tue, 1 Jun 2021 17:35:42 +0800 Subject: [PATCH 381/578] release seqinfo before return error http://tapd.oa.com/pgxz/bugtrace/bugs/view?bug_id=1010092131088349973&jump_count=1 --- src/gtm/main/gtm_seq.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/gtm/main/gtm_seq.c b/src/gtm/main/gtm_seq.c index d103ce2b..27e0a3e0 100644 --- a/src/gtm/main/gtm_seq.c +++ b/src/gtm/main/gtm_seq.c @@ -457,6 +457,10 @@ 
GTM_SeqOpen(GTM_SequenceKey seqkey, ereport(LOG, (EEXIST, errmsg("GTM_SeqOpen Sequence with key:%s found in hashtab", seqkey->gsk_key))); + /* + * Release sequence, otherwise the sequence will be busy when be dropped. + */ + seq_release_seqinfo(seqinfo); return EEXIST; } @@ -467,6 +471,10 @@ GTM_SeqOpen(GTM_SequenceKey seqkey, ereport(LOG, (EEXIST, errmsg("GTM_SeqOpen Sequence with key:%s found in store", seqkey->gsk_key))); + /* + * Release sequence, otherwise the sequence will be busy when be dropped. + */ + seq_release_seqinfo(seqinfo); return EEXIST; } #endif From 1915bc1979f759d363529a0521d5d02662dc6ed9 Mon Sep 17 00:00:00 2001 From: bethding Date: Tue, 1 Jun 2021 17:43:48 +0800 Subject: [PATCH 382/578] parallel ddl, leader cn execute firstly --- src/backend/access/transam/xact.c | 23 + src/backend/catalog/dependency.c | 245 +++++- src/backend/catalog/heap.c | 15 +- src/backend/catalog/namespace.c | 2 + src/backend/catalog/objectaddress.c | 27 + src/backend/commands/dbcommands.c | 5 +- src/backend/commands/dropcmds.c | 138 ++- src/backend/commands/sequence.c | 69 +- src/backend/commands/tablecmds.c | 250 ++++-- src/backend/commands/tablespace.c | 67 +- src/backend/commands/user.c | 451 ++++++---- src/backend/commands/view.c | 74 +- src/backend/parser/parse_relation.c | 28 +- src/backend/parser/parse_utilcmd.c | 34 +- src/backend/pgxc/locator/redistrib.c | 4 + src/backend/pgxc/pool/execRemote.c | 608 ++++++------- src/backend/pgxc/pool/pgxcnode.c | 42 +- src/backend/tcop/postgres.c | 1 + src/backend/tcop/utility.c | 1209 +++++++++++++++++++++++--- src/backend/utils/misc/guc.c | 4 + src/include/catalog/dependency.h | 12 + src/include/catalog/objectaddress.h | 4 + src/include/commands/dbcommands.h | 2 +- src/include/commands/defrem.h | 4 +- src/include/commands/sequence.h | 6 + src/include/commands/tablecmds.h | 6 + src/include/commands/tablespace.h | 6 +- src/include/commands/user.h | 23 +- src/include/commands/view.h | 10 +- src/include/nodes/parsenodes.h | 10 + src/include/parser/parse_relation.h | 151 ++-- src/include/parser/parse_utilcmd.h | 11 +- src/include/pgxc/execRemote.h | 28 + src/include/pgxc/pgxcnode.h | 5 +- src/include/tcop/utility.h | 54 +- 35 files changed, 2755 insertions(+), 873 deletions(-) diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index bc765d44..b6881ece 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -98,6 +98,7 @@ #include "pgxc/squeue.h" #include "postmaster/postmaster.h" #include "commands/extension.h" +#include "tcop/utility.h" #endif /* * User-tweakable parameters @@ -4501,6 +4502,9 @@ CommitTransactionCommand(void) } break; } +#ifdef __TBASE__ + leader_cn_executed_ddl = false; +#endif } /* @@ -4626,8 +4630,24 @@ AbortCurrentTransaction(void) * we get ROLLBACK. */ case TBLOCK_SUBINPROGRESS: + { + /* + * In parallel mode, leader cn execute before local cn, so when + * error occured, local cn will send ROLLBACK_SUBTXN to leader + * cn, we deal with subtxn abort there. 
+ */ + if (is_txn_has_parallel_ddl && !IS_PGXC_LOCAL_COORDINATOR) + { + PGXCNodeHandle *leaderCnHandle = NULL; + leaderCnHandle = find_ddl_leader_cn(); + if (is_ddl_leader_cn(leaderCnHandle->nodename)) + { + break; + } + } AbortSubTransaction(); s->blockState = TBLOCK_SUBABORT; + } break; /* @@ -4654,6 +4674,9 @@ AbortCurrentTransaction(void) AbortCurrentTransaction(); break; } +#ifdef __TBASE__ + leader_cn_executed_ddl = false; +#endif } /* diff --git a/src/backend/catalog/dependency.c b/src/backend/catalog/dependency.c index 737e549d..6b6d568b 100644 --- a/src/backend/catalog/dependency.c +++ b/src/backend/catalog/dependency.c @@ -109,6 +109,10 @@ #ifdef _MLS_ #include "utils/relcrypt.h" #endif +#ifdef __TBASE__ +#include "parser/scansup.h" +#include "catalog/catalog.h" +#endif /* * Deletion processing requires additional state for each ObjectAddress that * it's planning to delete. For simplicity and code-sharing we make the @@ -128,7 +132,6 @@ typedef struct #define DEPFLAG_EXTENSION 0x0010 /* reached via extension dependency */ #define DEPFLAG_REVERSE 0x0020 /* reverse internal/extension link */ - /* expansible list of ObjectAddresses */ struct ObjectAddresses { @@ -391,6 +394,246 @@ performDeletion(const ObjectAddress *object, heap_close(depRel, RowExclusiveLock); } +#ifdef __TBASE__ + +/* + * replace all invisible characters with ' ', + * leave no spaces next to ',' or '.' + */ +void +OmitqueryStringSpace(char *queryString) +{ + char *front = queryString; + char *last = queryString; + bool skip = false; + + if (queryString == NULL) + { + return; + } + + /* omit space */ + while (scanner_isspace(*front)) + { + ++front; + } + + while ((*front) != '\0') + { + if(scanner_isspace(*front) && skip == false) + { + while(scanner_isspace(*front)) + { + ++front; + } + + if ((*front) == ',' || (*front) == '.') + { + /* no need space */ + } + else if (last != queryString && (*(last - 1) == ',' || *(last - 1) == '.')) + { + /* no need space */ + } + else + { + /* replace all invisible characters with ' ' */ + *last = ' '; + ++last; + continue; + } + } + + if ((*front) == '\"') + { + skip = (skip == true) ? 
false : true; + *last = *front; + ++front; + } + else + { + *last = *front; + ++front; + } + ++last; + } + *last = '\0'; +} + +/* + * remove object name in query string (replace with ' ') + */ +void +RemoveObjnameInQueryString(char *queryString, char *full_name) +{ + char *ptr = NULL; + char *tmp = NULL; + char *tmpStr = NULL; + char *start_ptr = queryString; + char *end_ptr = queryString + strlen(queryString) - 1; + int len = 0; + + tmpStr = queryString; + len = strlen(full_name); + while ((ptr = strstr(tmpStr, full_name)) != NULL) + { + /* is not independent string, skip */ + if (((ptr - 1) >= start_ptr && *(ptr - 1) != ' ' && (*(ptr - 1) != ',')) || + ((ptr + len) <= end_ptr && *(ptr + len) != ' ' && *(ptr + len) != ',' && *(ptr + len) != ';')) + { + if (((ptr - 1) >= start_ptr && *(ptr - 1) == '\"' && (ptr + len) <= end_ptr && *(ptr + len) == '\"') && + ((ptr - 2) < start_ptr || *(ptr - 2) != '.')) + { + *(ptr - 1) = ' '; + *(ptr + len) = ' '; + } + else + { + tmpStr = ptr + len; + continue; + } + } + + /* replace obj name with ' ' */ + MemSet(ptr, ' ', len); + + /* find the previous ',' */ + tmp = ptr - 1; + while (tmp >= start_ptr && *tmp == ' ') + { + tmp--; + } + + if (tmp >= start_ptr && *tmp == ',') + { + *tmp = ' '; + } + else + { + /* find the following ',' */ + tmp = ptr + len; + while (tmp <= end_ptr && *tmp == ' ') + { + tmp++; + } + + if (tmp <= end_ptr && *tmp == ',') + { + *tmp = ' '; + } + } + + tmpStr = ptr + len; + } +} + +/* + * Like RemoveRelations, implements drop relations. But the function + * only be used for local cn in parallel ddl mode. + */ +void +RemoveRelationsParallelMode(DropStmt *drop, ObjectAddresses* objects, + List *heap_list) +{ + int flags = 0; + int i = 0; + char relkind; + ListCell *lc; + Oid heap_oid; + + /* Determine required relkind */ + relkind = GetRemoveObjectRelkind(drop->removeType); + + if (drop->concurrent) + { + flags |= PERFORM_DELETION_CONCURRENTLY; + } + + /* + * In DROP INDEX, attempt to acquire lock on the parent table before + * locking the index. + */ + foreach(lc, heap_list) + { + heap_oid = lfirst_oid(lc); + if (flags & PERFORM_DELETION_CONCURRENTLY) + LockRelationOid(heap_oid, ShareUpdateExclusiveLock); + else + LockRelationOid(heap_oid, AccessExclusiveLock); + } + + for (i = 0; i < objects->numrefs; i++) + { + const ObjectAddress* thisobj = objects->refs + i; + Oid relOid = thisobj->objectId; + Relation child_rel = NULL; + + AcquireDeletionLock(thisobj, flags); + + /* could not drop child interval partition or its index */ + if (RELKIND_RELATION == relkind) + { + bool report_error = false; + + elog(DEBUG1, "drop table relOid: %u", relOid); + + if (RELKIND_RELATION == relkind) + { + child_rel = heap_open(relOid, NoLock); + } + else + { + child_rel = index_open(relOid, NoLock); + } + + if (RELATION_IS_CHILD(child_rel)) + { + report_error = true; + } + + if (RELKIND_RELATION == relkind) + { + heap_close(child_rel, NoLock); + } + else + { + index_close(child_rel, NoLock); + } + + if (report_error) + { + ; + } + } + } + + performMultipleDeletions(objects, drop->behavior, flags); +} + +/* + * Implements drop one or more objects such as schema/function/type. + * The function only be used for local cn in parallel ddl mode. 
+ */ +void +RemoveObjectsParallelMode(DropStmt *stmt, ObjectAddresses *objects) +{ + int i; + for (i = 0; i < objects->numrefs; i++) + { + const ObjectAddress* thisobj = objects->refs + i; + + if (IsSharedRelation(thisobj->classId)) + LockSharedObject(thisobj->classId, thisobj->objectId, + 0, AccessExclusiveLock); + else + LockDatabaseObject(thisobj->classId, thisobj->objectId, + 0, AccessExclusiveLock); + } + /* Here we really delete them. */ + performMultipleDeletions(objects, stmt->behavior, 0); +} +#endif + /* * performMultipleDeletions: Similar to performDeletion, but act on multiple * objects at once. diff --git a/src/backend/catalog/heap.c b/src/backend/catalog/heap.c index 39a9c235..b79d4be3 100644 --- a/src/backend/catalog/heap.c +++ b/src/backend/catalog/heap.c @@ -116,6 +116,10 @@ #include "catalog/pgxc_key_values.h" #endif +#ifdef __TBASE__ +extern bool enable_parallel_ddl; +#endif + /* Potentially set by pg_upgrade_support functions */ Oid binary_upgrade_next_heap_pg_class_oid = InvalidOid; Oid binary_upgrade_next_toast_pg_class_oid = InvalidOid; @@ -2806,8 +2810,15 @@ heap_drop_with_catalog(Oid relid) * shared-cache-inval notice that will make them update their index lists. */ tuple = SearchSysCache1(RELOID, ObjectIdGetDatum(relid)); - if (!HeapTupleIsValid(tuple)) - elog(ERROR, "cache lookup failed for relation %u", relid); + +#ifdef __TBASE__ + if (enable_parallel_ddl && tuple == NULL) + { + elog(WARNING, "The tuple may have been dropped by parallel ddl"); + return; + } +#endif + if (((Form_pg_class) GETSTRUCT(tuple))->relispartition) { parentOid = get_partition_parent(relid); diff --git a/src/backend/catalog/namespace.c b/src/backend/catalog/namespace.c index 3523874b..53be4dc7 100644 --- a/src/backend/catalog/namespace.c +++ b/src/backend/catalog/namespace.c @@ -385,7 +385,9 @@ RangeVarGetRelidExtended(const RangeVar *relation, LOCKMODE lockmode, if (!OidIsValid(relId)) AcceptInvalidationMessages(); else if (!nowait) + { LockRelationOid(relId, lockmode); + } else if (!ConditionalLockRelationOid(relId, lockmode)) { if (relation->schemaname) diff --git a/src/backend/catalog/objectaddress.c b/src/backend/catalog/objectaddress.c index d085bae1..7a2f0956 100644 --- a/src/backend/catalog/objectaddress.c +++ b/src/backend/catalog/objectaddress.c @@ -1164,6 +1164,33 @@ get_object_address(ObjectType objtype, Node *object, return address; } +#ifdef __TBASE__ +char *GetRemoveObjectName(ObjectType objtype, Node *object) +{ + switch (objtype) + { + case OBJECT_SCHEMA: + { + Value *strVal = (Value *)object; + return strVal(strVal); + } + case OBJECT_TYPE: + { + TypeName *typename = castNode(TypeName, object); + return TypeNameToString(typename); + } + case OBJECT_FUNCTION: + { + ObjectWithArgs *func = castNode(ObjectWithArgs, object); + return NameListToString(func->objname); + } + default: + break; + } + return NULL; +} +#endif + /* * Return an ObjectAddress based on a RangeVar and an object name. 
The * name of the relation identified by the RangeVar is prepended to the diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c index ae8c18c1..070646fd 100644 --- a/src/backend/commands/dbcommands.c +++ b/src/backend/commands/dbcommands.c @@ -944,7 +944,7 @@ dropdb_prepare(const char *dbname, bool missing_ok) /* * DROP DATABASE */ -void +bool dropdb(const char *dbname, bool missing_ok) {// #lizard forgives Oid db_id; @@ -982,7 +982,7 @@ dropdb(const char *dbname, bool missing_ok) ereport(NOTICE, (errmsg("database \"%s\" does not exist, skipping", dbname))); - return; + return false; } } @@ -1156,6 +1156,7 @@ dropdb(const char *dbname, bool missing_ok) #endif } #endif + return true; } diff --git a/src/backend/commands/dropcmds.c b/src/backend/commands/dropcmds.c index 25dcf184..3d3c522a 100644 --- a/src/backend/commands/dropcmds.c +++ b/src/backend/commands/dropcmds.c @@ -29,13 +29,15 @@ #include "utils/syscache.h" #ifdef __TBASE__ #include "utils/rel.h" +#include "catalog/catalog.h" +#include "storage/lmgr.h" #endif #ifdef _MLS_ #include "utils/mls.h" #endif static void does_not_exist_skipping(ObjectType objtype, - Node *object); + Node *object, bool missing_ok); static bool owningrel_does_not_exist_skipping(List *object, const char **msg, char **name); static bool schema_does_not_exist_skipping(List *object, @@ -43,25 +45,19 @@ static bool schema_does_not_exist_skipping(List *object, static bool type_in_list_does_not_exist_skipping(List *typenames, const char **msg, char **name); - /* - * Drop one or more objects. - * - * We don't currently handle all object types here. Relations, for example, - * require special handling, because (for example) indexes have additional - * locking requirements. - * - * We look up all the objects first, and then delete them in a single - * performMultipleDeletions() call. This avoids unnecessary DROP RESTRICT - * errors if there are dependencies between them. + * Check object exists or not before remove. */ -void -RemoveObjects(DropStmt *stmt) -{// #lizard forgives +ObjectAddresses* PreCheckforRemoveObjects(DropStmt *stmt, bool missing_ok, + bool *need_drop, char *query_string, + bool need_unlock) +{ ObjectAddresses *objects; ListCell *cell1; + bool querystring_omit = false; objects = new_object_addresses(); + *need_drop = false; foreach(cell1, stmt->objects) { @@ -84,8 +80,23 @@ RemoveObjects(DropStmt *stmt) */ if (!OidIsValid(address.objectId)) { + char *relation_name = NULL; Assert(stmt->missing_ok); - does_not_exist_skipping(stmt->removeType, object); + + does_not_exist_skipping(stmt->removeType, object, missing_ok); + +#ifdef __TBASE__ + if (query_string) + { + if (!querystring_omit) + { + OmitqueryStringSpace(query_string); + querystring_omit = true; + } + relation_name = GetRemoveObjectName(stmt->removeType, object); + RemoveObjnameInQueryString(query_string, relation_name); + } +#endif continue; } @@ -150,9 +161,36 @@ RemoveObjects(DropStmt *stmt) } } #endif + *need_drop = true; + if (need_unlock) + { + if (IsSharedRelation(address.classId)) + UnlockSharedObject(address.classId, address.objectId, 0, AccessExclusiveLock); + else + UnlockDatabaseObject(address.classId, address.objectId, 0, AccessExclusiveLock); + } + } + return objects; +} +/* + * Drop one or more objects. + * + * We don't currently handle all object types here. Relations, for example, + * require special handling, because (for example) indexes have additional + * locking requirements. 
+ * + * We look up all the objects first, and then delete them in a single + * performMultipleDeletions() call. This avoids unnecessary DROP RESTRICT + * errors if there are dependencies between them. + */ +void +RemoveObjects(DropStmt *stmt, bool missing_ok, bool *need_drop, char *query_string) +{ + ObjectAddresses *objects; - } + objects = PreCheckforRemoveObjects(stmt, missing_ok, need_drop, + query_string, false); /* Here we really delete them. */ performMultipleDeletions(objects, stmt->behavior, 0); @@ -276,16 +314,24 @@ type_in_list_does_not_exist_skipping(List *typenames, const char **msg, * get_object_address() in RemoveObjects would have thrown an ERROR. */ static void -does_not_exist_skipping(ObjectType objtype, Node *object) -{// #lizard forgives +does_not_exist_skipping(ObjectType objtype, Node *object, bool missing_ok) +{ const char *msg = NULL; char *name = NULL; char *args = NULL; + char *missmsg = "skipping"; + int elevel = NOTICE; + + if (!missing_ok) + { + missmsg = "can not skip in parallel ddl mode"; + elevel = ERROR; + } switch (objtype) { case OBJECT_ACCESS_METHOD: - msg = gettext_noop("access method \"%s\" does not exist, skipping"); + msg = gettext_noop("access method \"%s\" does not exist, %s"); name = strVal((Value *) object); break; case OBJECT_TYPE: @@ -295,7 +341,7 @@ does_not_exist_skipping(ObjectType objtype, Node *object) if (!schema_does_not_exist_skipping(typ->names, &msg, &name)) { - msg = gettext_noop("type \"%s\" does not exist, skipping"); + msg = gettext_noop("type \"%s\" does not exist, %s"); name = TypeNameToString(typ); } } @@ -303,58 +349,58 @@ does_not_exist_skipping(ObjectType objtype, Node *object) case OBJECT_COLLATION: if (!schema_does_not_exist_skipping(castNode(List, object), &msg, &name)) { - msg = gettext_noop("collation \"%s\" does not exist, skipping"); + msg = gettext_noop("collation \"%s\" does not exist, %s"); name = NameListToString(castNode(List, object)); } break; case OBJECT_CONVERSION: if (!schema_does_not_exist_skipping(castNode(List, object), &msg, &name)) { - msg = gettext_noop("conversion \"%s\" does not exist, skipping"); + msg = gettext_noop("conversion \"%s\" does not exist, %s"); name = NameListToString(castNode(List, object)); } break; case OBJECT_SCHEMA: - msg = gettext_noop("schema \"%s\" does not exist, skipping"); + msg = gettext_noop("schema \"%s\" does not exist, %s"); name = strVal((Value *) object); break; case OBJECT_STATISTIC_EXT: if (!schema_does_not_exist_skipping(castNode(List, object), &msg, &name)) { - msg = gettext_noop("statistics object \"%s\" does not exist, skipping"); + msg = gettext_noop("statistics object \"%s\" does not exist, %s"); name = NameListToString(castNode(List, object)); } break; case OBJECT_TSPARSER: if (!schema_does_not_exist_skipping(castNode(List, object), &msg, &name)) { - msg = gettext_noop("text search parser \"%s\" does not exist, skipping"); + msg = gettext_noop("text search parser \"%s\" does not exist, %s"); name = NameListToString(castNode(List, object)); } break; case OBJECT_TSDICTIONARY: if (!schema_does_not_exist_skipping(castNode(List, object), &msg, &name)) { - msg = gettext_noop("text search dictionary \"%s\" does not exist, skipping"); + msg = gettext_noop("text search dictionary \"%s\" does not exist, %s"); name = NameListToString(castNode(List, object)); } break; case OBJECT_TSTEMPLATE: if (!schema_does_not_exist_skipping(castNode(List, object), &msg, &name)) { - msg = gettext_noop("text search template \"%s\" does not exist, skipping"); + msg = gettext_noop("text 
search template \"%s\" does not exist, %s"); name = NameListToString(castNode(List, object)); } break; case OBJECT_TSCONFIGURATION: if (!schema_does_not_exist_skipping(castNode(List, object), &msg, &name)) { - msg = gettext_noop("text search configuration \"%s\" does not exist, skipping"); + msg = gettext_noop("text search configuration \"%s\" does not exist, %s"); name = NameListToString(castNode(List, object)); } break; case OBJECT_EXTENSION: - msg = gettext_noop("extension \"%s\" does not exist, skipping"); + msg = gettext_noop("extension \"%s\" does not exist, %s"); name = strVal((Value *) object); break; case OBJECT_FUNCTION: @@ -364,7 +410,7 @@ does_not_exist_skipping(ObjectType objtype, Node *object) if (!schema_does_not_exist_skipping(owa->objname, &msg, &name) && !type_in_list_does_not_exist_skipping(owa->objargs, &msg, &name)) { - msg = gettext_noop("function %s(%s) does not exist, skipping"); + msg = gettext_noop("function %s(%s) does not exist, %s"); name = NameListToString(owa->objname); args = TypeNameListToString(owa->objargs); } @@ -377,7 +423,7 @@ does_not_exist_skipping(ObjectType objtype, Node *object) if (!schema_does_not_exist_skipping(owa->objname, &msg, &name) && !type_in_list_does_not_exist_skipping(owa->objargs, &msg, &name)) { - msg = gettext_noop("aggregate %s(%s) does not exist, skipping"); + msg = gettext_noop("aggregate %s(%s) does not exist, %s"); name = NameListToString(owa->objname); args = TypeNameListToString(owa->objargs); } @@ -390,13 +436,13 @@ does_not_exist_skipping(ObjectType objtype, Node *object) if (!schema_does_not_exist_skipping(owa->objname, &msg, &name) && !type_in_list_does_not_exist_skipping(owa->objargs, &msg, &name)) { - msg = gettext_noop("operator %s does not exist, skipping"); + msg = gettext_noop("operator %s does not exist, %s"); name = NameListToString(owa->objname); } break; } case OBJECT_LANGUAGE: - msg = gettext_noop("language \"%s\" does not exist, skipping"); + msg = gettext_noop("language \"%s\" does not exist, %s"); name = strVal((Value *) object); break; case OBJECT_CAST: @@ -405,7 +451,7 @@ does_not_exist_skipping(ObjectType objtype, Node *object) !type_in_list_does_not_exist_skipping(list_make1(lsecond(castNode(List, object))), &msg, &name)) { /* XXX quote or no quote? 
*/ - msg = gettext_noop("cast from type %s to type %s does not exist, skipping"); + msg = gettext_noop("cast from type %s to type %s does not exist, %s"); name = TypeNameToString(linitial_node(TypeName, castNode(List, object))); args = TypeNameToString(lsecond_node(TypeName, castNode(List, object))); } @@ -414,7 +460,7 @@ does_not_exist_skipping(ObjectType objtype, Node *object) case OBJECT_TRANSFORM: if (!type_in_list_does_not_exist_skipping(list_make1(linitial(castNode(List, object))), &msg, &name)) { - msg = gettext_noop("transform for type %s language \"%s\" does not exist, skipping"); + msg = gettext_noop("transform for type %s language \"%s\" does not exist, %s"); name = TypeNameToString(linitial_node(TypeName, castNode(List, object))); args = strVal(lsecond(castNode(List, object))); } @@ -422,7 +468,7 @@ does_not_exist_skipping(ObjectType objtype, Node *object) case OBJECT_TRIGGER: if (!owningrel_does_not_exist_skipping(castNode(List, object), &msg, &name)) { - msg = gettext_noop("trigger \"%s\" for relation \"%s\" does not exist, skipping"); + msg = gettext_noop("trigger \"%s\" for relation \"%s\" does not exist, %s"); name = strVal(llast(castNode(List, object))); args = NameListToString(list_truncate(list_copy(castNode(List, object)), list_length(castNode(List, object)) - 1)); @@ -431,31 +477,31 @@ does_not_exist_skipping(ObjectType objtype, Node *object) case OBJECT_POLICY: if (!owningrel_does_not_exist_skipping(castNode(List, object), &msg, &name)) { - msg = gettext_noop("policy \"%s\" for relation \"%s\" does not exist, skipping"); + msg = gettext_noop("policy \"%s\" for relation \"%s\" does not exist, %s"); name = strVal(llast(castNode(List, object))); args = NameListToString(list_truncate(list_copy(castNode(List, object)), list_length(castNode(List, object)) - 1)); } break; case OBJECT_EVENT_TRIGGER: - msg = gettext_noop("event trigger \"%s\" does not exist, skipping"); + msg = gettext_noop("event trigger \"%s\" does not exist, %s"); name = strVal((Value *) object); break; case OBJECT_RULE: if (!owningrel_does_not_exist_skipping(castNode(List, object), &msg, &name)) { - msg = gettext_noop("rule \"%s\" for relation \"%s\" does not exist, skipping"); + msg = gettext_noop("rule \"%s\" for relation \"%s\" does not exist, %s"); name = strVal(llast(castNode(List, object))); args = NameListToString(list_truncate(list_copy(castNode(List, object)), list_length(castNode(List, object)) - 1)); } break; case OBJECT_FDW: - msg = gettext_noop("foreign-data wrapper \"%s\" does not exist, skipping"); + msg = gettext_noop("foreign-data wrapper \"%s\" does not exist, %s"); name = strVal((Value *) object); break; case OBJECT_FOREIGN_SERVER: - msg = gettext_noop("server \"%s\" does not exist, skipping"); + msg = gettext_noop("server \"%s\" does not exist, %s"); name = strVal((Value *) object); break; case OBJECT_OPCLASS: @@ -464,7 +510,7 @@ does_not_exist_skipping(ObjectType objtype, Node *object) if (!schema_does_not_exist_skipping(opcname, &msg, &name)) { - msg = gettext_noop("operator class \"%s\" does not exist for access method \"%s\", skipping"); + msg = gettext_noop("operator class \"%s\" does not exist for access method \"%s\", %s"); name = NameListToString(opcname); args = strVal(linitial(castNode(List, object))); } @@ -476,14 +522,14 @@ does_not_exist_skipping(ObjectType objtype, Node *object) if (!schema_does_not_exist_skipping(opfname, &msg, &name)) { - msg = gettext_noop("operator family \"%s\" does not exist for access method \"%s\", skipping"); + msg = gettext_noop("operator family 
\"%s\" does not exist for access method \"%s\", %s"); name = NameListToString(opfname); args = strVal(linitial(castNode(List, object))); } } break; case OBJECT_PUBLICATION: - msg = gettext_noop("publication \"%s\" does not exist, skipping"); + msg = gettext_noop("publication \"%s\" does not exist, %s"); name = strVal((Value *) object); break; default: @@ -492,7 +538,7 @@ does_not_exist_skipping(ObjectType objtype, Node *object) } if (!args) - ereport(NOTICE, (errmsg(msg, name))); + ereport(elevel, (errmsg(msg, name, missmsg))); else - ereport(NOTICE, (errmsg(msg, name, args))); + ereport(elevel, (errmsg(msg, name, args, missmsg))); } diff --git a/src/backend/commands/sequence.c b/src/backend/commands/sequence.c index 3cb9c044..a248f5c0 100644 --- a/src/backend/commands/sequence.c +++ b/src/backend/commands/sequence.c @@ -28,6 +28,9 @@ #include "catalog/dependency.h" #include "catalog/indexing.h" #include "catalog/namespace.h" +#ifdef __TBASE__ +#include "catalog/pg_namespace.h" +#endif #include "catalog/objectaccess.h" #include "catalog/pg_sequence.h" #include "catalog/pg_type.h" @@ -155,13 +158,69 @@ static void process_owned_by(Relation seqrel, List *owned_by, bool for_identity) extern bool g_GTM_skip_catalog; #endif +#ifdef __TBASE__ +extern bool is_txn_has_parallel_ddl; + +/* + * Check sequence exists or not + */ +bool PrecheckDefineSequence(CreateSeqStmt *seq) +{ + Oid seqoid; + Oid nspid; + bool need_send = true; + + if (g_GTM_skip_catalog && IS_PGXC_DATANODE) + { + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("skip_gtm_catalog can not be true on datanode."))); + } + + if (!g_GTM_skip_catalog) + { + /* Unlogged sequences are not implemented -- not clear if useful. */ + if (seq->sequence->relpersistence == RELPERSISTENCE_UNLOGGED) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("unlogged sequences are not supported"))); + + /* + * If if_not_exists was given and a relation with the same name already + * exists, bail out. (Note: we needn't check this when not if_not_exists, + * because DefineRelation will complain anyway.) 
+ */ + if (seq->if_not_exists) + { + nspid = RangeVarGetAndCheckCreationNamespace(seq->sequence, NoLock, + &seqoid); + if (OidIsValid(seqoid)) + { + ereport(NOTICE, + (errcode(ERRCODE_DUPLICATE_TABLE), + errmsg("relation \"%s\" already exists, skipping", + seq->sequence->relname))); + need_send = false; + } + UnlockDatabaseObject(NamespaceRelationId, nspid, 0, + AccessShareLock); + } + } + + return need_send; +} + /* * DefineSequence * Creates a new sequence relation */ ObjectAddress +DefineSequence(ParseState *pstate, CreateSeqStmt *seq, bool exists_ok) +#else +ObjectAddress DefineSequence(ParseState *pstate, CreateSeqStmt *seq) -{// #lizard forgives +#endif +{ FormData_pg_sequence seqform; FormData_pg_sequence_data seqdataform; bool need_seq_rewrite; @@ -214,6 +273,14 @@ DefineSequence(ParseState *pstate, CreateSeqStmt *seq) RangeVarGetAndCheckCreationNamespace(seq->sequence, NoLock, &seqoid); if (OidIsValid(seqoid)) { +#ifdef __TBASE__ + if (!exists_ok) + ereport(ERROR, + (errcode(ERRCODE_DUPLICATE_TABLE), + errmsg("relation \"%s\" already exists", + seq->sequence->relname))); + else +#endif ereport(NOTICE, (errcode(ERRCODE_DUPLICATE_TABLE), errmsg("relation \"%s\" already exists, skipping", diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index 6f525512..eb5b2b6b 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -158,6 +158,9 @@ typedef struct OnCommitItem static List *on_commits = NIL; +#ifdef __TBASE__ +extern bool is_txn_has_parallel_ddl; +#endif /* * State information for ALTER TABLE @@ -1405,70 +1408,6 @@ DropErrorMsgWrongType(const char *relname, char wrongkind, char rightkind) #ifdef __TBASE__ -/* - * replace all invisible characters with ' ', - * leave no spaces next to ',' or '.' - */ -static void -OmitqueryStringSpace(char *queryString) -{ - char *front = queryString; - char *last = queryString; - bool skip = false; - - if (queryString == NULL) - { - return; - } - - /* omit space */ - while (scanner_isspace(*front)) - { - ++front; - } - - while ((*front) != '\0') - { - if(scanner_isspace(*front) && skip == false) - { - while(scanner_isspace(*front)) - { - ++front; - } - - if ((*front) == ',' || (*front) == '.') - { - /* no need space */ - } - else if (last != queryString && (*(last - 1) == ',' || *(last - 1) == '.')) - { - /* no need space */ - } - else - { - /* replace all invisible characters with ' ' */ - *last = ' '; - ++last; - continue; - } - } - - if ((*front) == '\"') - { - skip = (skip == true) ? 
false : true; - *last = *front; - ++front; - } - else - { - *last = *front; - ++front; - } - ++last; - } - *last = '\0'; -} - /* * remove relname in query string (replace with ' ') */ @@ -1543,6 +1482,146 @@ RemoveRelnameInQueryString(char *queryString, RangeVar *rel) } } +char GetRemoveObjectRelkind(ObjectType removeType) +{ + char relkind; + switch (removeType) + { + case OBJECT_TABLE: + relkind = RELKIND_RELATION; + break; + + case OBJECT_INDEX: + relkind = RELKIND_INDEX; + break; + + case OBJECT_SEQUENCE: + relkind = RELKIND_SEQUENCE; + break; + + case OBJECT_VIEW: + relkind = RELKIND_VIEW; + break; + + case OBJECT_MATVIEW: + relkind = RELKIND_MATVIEW; + break; + + case OBJECT_FOREIGN_TABLE: + relkind = RELKIND_FOREIGN_TABLE; + break; + + default: + elog(ERROR, "unrecognized drop object type: %d", + (int)removeType); + relkind = 0; /* keep compiler quiet */ + break; + } + return relkind; +} + +/* + * PreCheckforRemoveRelation + * Check before implementing DROP TABLE, DROP INDEX, DROP SEQUENCE, + * DROP VIEW, DROP FOREIGN TABLE, DROP MATERIALIZED VIEW, return the + * object of existing relations. + */ +ObjectAddresses* PreCheckforRemoveRelation(DropStmt* drop, char* queryString, + bool *needDrop, List **heap_list) +{ + char relkind; + ListCell *cell; + LOCKMODE lockmode = AccessExclusiveLock; + bool querystring_omit = false; + ObjectAddresses* objects = NULL; + + /* DROP CONCURRENTLY uses a weaker lock, and has some restrictions */ + if (drop->concurrent) + { + lockmode = ShareUpdateExclusiveLock; + Assert(drop->removeType == OBJECT_INDEX); + if (list_length(drop->objects) != 1) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("DROP INDEX CONCURRENTLY does not support dropping multiple objects"))); + if (drop->behavior == DROP_CASCADE) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("DROP INDEX CONCURRENTLY does not support CASCADE"))); + } + + /* + * First we identify all the relations, then we delete them in a single + * performMultipleDeletions() call. This is to avoid unwanted DROP + * RESTRICT errors if one of the relations depends on another. + */ + + /* Determine required relkind */ + relkind = GetRemoveObjectRelkind(drop->removeType); + objects = new_object_addresses(); + *needDrop = false; + + foreach (cell, drop->objects) + { + RangeVar *rel = makeRangeVarFromNameList((List *) lfirst(cell)); + Oid relOid; + ObjectAddress obj; + struct DropRelationCallbackState state; + + /* + * These next few steps are a great deal like relation_openrv, but we + * don't bother building a relcache entry since we don't need it. + * + * Check for shared-cache-inval messages before trying to access the + * relation. This is needed to cover the case where the name + * identifies a rel that has been dropped and recreated since the + * start of our transaction: if we don't flush the old syscache entry, + * then we'll latch onto that entry and suffer an error later. + */ + AcceptInvalidationMessages(); + + /* Look up the appropriate relation using namespace search. */ + state.relkind = relkind; + state.heapOid = InvalidOid; + state.partParentOid = InvalidOid; + state.concurrent = drop->concurrent; + + relOid = RangeVarGetRelidExtended(rel, lockmode, true, false, + RangeVarCallbackForDropRelation, + (void*)&state); + /* Not there? 
*/ + if (!OidIsValid(relOid)) + { + DropErrorMsgNonExistent(rel, relkind, drop->missing_ok); + if (!querystring_omit) + { + OmitqueryStringSpace(queryString); + querystring_omit = true; + } + + RemoveRelnameInQueryString(queryString, rel); + continue; + } + + /* OK, we're ready to delete this one */ + obj.classId = RelationRelationId; + obj.objectId = relOid; + obj.objectSubId = 0; + add_exact_object_address(&obj, objects); + *needDrop = true; + + if (OidIsValid(state.heapOid)) + { + LOCKMODE heapLockMode = AccessExclusiveLock; + if (state.concurrent) + heapLockMode = ShareUpdateExclusiveLock; + UnlockRelationOid(state.heapOid, heapLockMode); + *heap_list = list_append_unique_oid(*heap_list, state.heapOid); + } + UnlockRelationOid(relOid, lockmode); + } + return objects; +} #endif /* @@ -1661,7 +1740,12 @@ RemoveRelations(DropStmt *drop) /* Not there? */ if (!OidIsValid(relOid)) { - DropErrorMsgNonExistent(rel, relkind, drop->missing_ok); + bool missing_ok = drop->missing_ok; +#ifdef __TBASE__ + if (IsConnFromCoord() && is_txn_has_parallel_ddl) + missing_ok = false; +#endif + DropErrorMsgNonExistent(rel, relkind, missing_ok); #ifdef __TBASE__ if (!querystring_omit) { @@ -1681,6 +1765,8 @@ RemoveRelations(DropStmt *drop) { bool report_error = false; + elog(LOG, "drop table relOid: %u", relOid); + if (RELKIND_RELATION == relkind) { child_rel = heap_open(relOid, NoLock); @@ -1761,6 +1847,19 @@ RangeVarCallbackForDropRelation(const RangeVar *rel, Oid relOid, Oid oldRelOid, */ if (relOid != oldRelOid && OidIsValid(state->heapOid)) { +#ifdef __TBASE__ + /* + * Unlock index before unlock table, or may cause deadlock + * when drop index and create same index executed concurrently. + */ + if (is_txn_has_parallel_ddl && relkind == RELKIND_INDEX) + { + Assert(OidIsValid(oldRelOid)); + UnlockRelationOid(oldRelOid, heap_lockmode); + elog(LOG, "Unlock index(name:oid):(%s:%u) before unlock table", + rel->relname, oldRelOid); + } +#endif UnlockRelationOid(state->heapOid, heap_lockmode); state->heapOid = InvalidOid; } @@ -1782,7 +1881,16 @@ RangeVarCallbackForDropRelation(const RangeVar *rel, Oid relOid, Oid oldRelOid, tuple = SearchSysCache1(RELOID, ObjectIdGetDatum(relOid)); if (!HeapTupleIsValid(tuple)) + { +#ifdef __TBASE__ + if (is_txn_has_parallel_ddl && !state->concurrent) + { + elog(ERROR, "Can't get valid tuple, relation %s had been invalid" + "by other process in parallel ddl mode", rel->relname); + } +#endif return; /* concurrently dropped, so nothing to do */ + } classform = (Form_pg_class) GETSTRUCT(tuple); is_partition = classform->relispartition; @@ -1830,8 +1938,20 @@ RangeVarCallbackForDropRelation(const RangeVar *rel, Oid relOid, Oid oldRelOid, { state->heapOid = IndexGetRelation(relOid, true); if (OidIsValid(state->heapOid)) + { LockRelationOid(state->heapOid, heap_lockmode); } +#ifdef __TBASE__ + else + { + if (is_txn_has_parallel_ddl && !state->concurrent) + { + elog(ERROR, "Can't get valid tableoid, index %s had been invalid" + "by other process in parallel ddl mode", rel->relname); + } + } +#endif + } /* * Similarly, if the relation is a partition, we must acquire lock on its diff --git a/src/backend/commands/tablespace.c b/src/backend/commands/tablespace.c index 1b208c6c..e88463ac 100644 --- a/src/backend/commands/tablespace.c +++ b/src/backend/commands/tablespace.c @@ -414,14 +414,70 @@ CreateTableSpace(CreateTableSpaceStmt *stmt) #endif /* HAVE_SYMLINK */ } +#ifdef __TBASE__ +bool +PreCheckforDropTableSpace(DropTableSpaceStmt *stmt) +{ +#ifdef HAVE_SYMLINK + char *tablespacename = 
stmt->tablespacename; + HeapScanDesc scandesc; + Relation rel; + HeapTuple tuple; + ScanKeyData entry[1]; + + /* + * Find the target tuple + */ + rel = heap_open(TableSpaceRelationId, RowExclusiveLock); + + ScanKeyInit(&entry[0], + Anum_pg_tablespace_spcname, + BTEqualStrategyNumber, F_NAMEEQ, + CStringGetDatum(tablespacename)); + scandesc = heap_beginscan_catalog(rel, 1, entry); + tuple = heap_getnext(scandesc, ForwardScanDirection); + + if (!HeapTupleIsValid(tuple)) + { + if (!stmt->missing_ok) + { + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("tablespace \"%s\" does not exist", + tablespacename))); + } + else + { + ereport(NOTICE, + (errmsg("tablespace \"%s\" does not exist, skipping", + tablespacename))); + /* XXX I assume I need one or both of these next two calls */ + heap_endscan(scandesc); + heap_close(rel, RowExclusiveLock); + } + return false; + } + + heap_endscan(scandesc); + heap_close(rel, RowExclusiveLock); + +#else /* !HAVE_SYMLINK */ + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("tablespaces are not supported on this platform"))); +#endif + return true; +} +#endif + /* * Drop a table space * * Be careful to check that the tablespace is empty. */ -void -DropTableSpace(DropTableSpaceStmt *stmt) -{// #lizard forgives +bool +DropTableSpace(DropTableSpaceStmt *stmt, bool missing_ok) +{ #ifdef HAVE_SYMLINK char *tablespacename = stmt->tablespacename; HeapScanDesc scandesc; @@ -444,7 +500,7 @@ DropTableSpace(DropTableSpaceStmt *stmt) if (!HeapTupleIsValid(tuple)) { - if (!stmt->missing_ok) + if (!missing_ok) { ereport(ERROR, (errcode(ERRCODE_UNDEFINED_OBJECT), @@ -460,7 +516,7 @@ DropTableSpace(DropTableSpaceStmt *stmt) heap_endscan(scandesc); heap_close(rel, NoLock); } - return; + return false; } tablespaceoid = HeapTupleGetOid(tuple); @@ -573,6 +629,7 @@ DropTableSpace(DropTableSpaceStmt *stmt) (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("tablespaces are not supported on this platform"))); #endif /* HAVE_SYMLINK */ + return true; } diff --git a/src/backend/commands/user.c b/src/backend/commands/user.c index 038119db..e98cd209 100644 --- a/src/backend/commands/user.c +++ b/src/backend/commands/user.c @@ -1073,188 +1073,236 @@ AlterRoleSet(AlterRoleSetStmt *stmt) } -/* - * DROP ROLE - */ -void -DropRole(DropRoleStmt *stmt) -{// #lizard forgives - Relation pg_authid_rel, - pg_auth_members_rel; - ListCell *item; +void DropRoleByTuple(char *role, HeapTuple tuple, Relation pg_authid_rel, + Relation pg_auth_members_rel) +{ + HeapTuple tmp_tuple; + ScanKeyData scankey; + char *detail; + char *detail_log; + SysScanDesc sscan; + Oid roleid; + + roleid = HeapTupleGetOid(tuple); - if (!have_createrole_privilege()) + if (roleid == GetUserId()) ereport(ERROR, - (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), - errmsg("permission denied to drop role"))); + (errcode(ERRCODE_OBJECT_IN_USE), + errmsg("current user cannot be dropped"))); + if (roleid == GetOuterUserId()) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_IN_USE), + errmsg("current user cannot be dropped"))); + if (roleid == GetSessionUserId()) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_IN_USE), + errmsg("session user cannot be dropped"))); /* - * Scan the pg_authid relation to find the Oid of the role(s) to be - * deleted. 
- */ - pg_authid_rel = heap_open(AuthIdRelationId, RowExclusiveLock); - pg_auth_members_rel = heap_open(AuthMemRelationId, RowExclusiveLock); - - foreach(item, stmt->roles) - { - RoleSpec *rolspec = lfirst(item); - char *role; - HeapTuple tuple, - tmp_tuple; - ScanKeyData scankey; - char *detail; - char *detail_log; - SysScanDesc sscan; - Oid roleid; - - if (rolspec->roletype != ROLESPEC_CSTRING) - ereport(ERROR, - (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("cannot use special role specifier in DROP ROLE"))); - role = rolspec->rolename; - - tuple = SearchSysCache1(AUTHNAME, PointerGetDatum(role)); - if (!HeapTupleIsValid(tuple)) - { - if (!stmt->missing_ok) - { - ereport(ERROR, - (errcode(ERRCODE_UNDEFINED_OBJECT), - errmsg("role \"%s\" does not exist", role))); - } - else - { - ereport(NOTICE, - (errmsg("role \"%s\" does not exist, skipping", - role))); - } - - continue; - } - - roleid = HeapTupleGetOid(tuple); - - if (roleid == GetUserId()) - ereport(ERROR, - (errcode(ERRCODE_OBJECT_IN_USE), - errmsg("current user cannot be dropped"))); - if (roleid == GetOuterUserId()) - ereport(ERROR, - (errcode(ERRCODE_OBJECT_IN_USE), - errmsg("current user cannot be dropped"))); - if (roleid == GetSessionUserId()) - ereport(ERROR, - (errcode(ERRCODE_OBJECT_IN_USE), - errmsg("session user cannot be dropped"))); - - /* - * For safety's sake, we allow createrole holders to drop ordinary - * roles but not superuser roles. This is mainly to avoid the - * scenario where you accidentally drop the last superuser. - */ - if (((Form_pg_authid) GETSTRUCT(tuple))->rolsuper && - !superuser()) - ereport(ERROR, - (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), - errmsg("must be superuser to drop superusers"))); - - /* DROP hook for the role being removed */ - InvokeObjectDropHook(AuthIdRelationId, roleid, 0); + * For safety's sake, we allow createrole holders to drop ordinary + * roles but not superuser roles. This is mainly to avoid the + * scenario where you accidentally drop the last superuser. + */ + if (((Form_pg_authid) GETSTRUCT(tuple))->rolsuper && + !superuser()) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("must be superuser to drop superusers"))); - /* - * Lock the role, so nobody can add dependencies to her while we drop - * her. We keep the lock until the end of transaction. - */ - LockSharedObject(AuthIdRelationId, roleid, 0, AccessExclusiveLock); + /* DROP hook for the role being removed */ + InvokeObjectDropHook(AuthIdRelationId, roleid, 0); - /* Check for pg_shdepend entries depending on this role */ - if (checkSharedDependencies(AuthIdRelationId, roleid, - &detail, &detail_log)) - ereport(ERROR, - (errcode(ERRCODE_DEPENDENT_OBJECTS_STILL_EXIST), - errmsg("role \"%s\" cannot be dropped because some objects depend on it", - role), - errdetail_internal("%s", detail), - errdetail_log("%s", detail_log))); + /* + * Lock the role, so nobody can add dependencies to her while we drop + * her. We keep the lock until the end of transaction. 
+ */ + LockSharedObject(AuthIdRelationId, roleid, 0, AccessExclusiveLock); + + /* Check for pg_shdepend entries depending on this role */ + if (checkSharedDependencies(AuthIdRelationId, roleid, + &detail, &detail_log)) + ereport(ERROR, + (errcode(ERRCODE_DEPENDENT_OBJECTS_STILL_EXIST), + errmsg("role \"%s\" cannot be dropped because some objects depend on it", + role), + errdetail_internal("%s", detail), + errdetail_log("%s", detail_log))); #ifdef _MLS_ - if (true == mls_check_role_permission(roleid) || - true == cls_check_user_has_policy(roleid)) - { - elog(ERROR, "could not drop role:%s, cause this role has mls poilcy bound", - role); - } + if (true == mls_check_role_permission(roleid) || + true == cls_check_user_has_policy(roleid)) + { + elog(ERROR, "could not drop role:%s, cause this role has mls poilcy bound", + role); + } #endif - /* - * Remove the role from the pg_authid table - */ - CatalogTupleDelete(pg_authid_rel, &tuple->t_self); - - ReleaseSysCache(tuple); + /* + * Remove the role from the pg_authid table + */ + CatalogTupleDelete(pg_authid_rel, &tuple->t_self); - /* - * Remove role from the pg_auth_members table. We have to remove all - * tuples that show it as either a role or a member. - * - * XXX what about grantor entries? Maybe we should do one heap scan. - */ - ScanKeyInit(&scankey, - Anum_pg_auth_members_roleid, - BTEqualStrategyNumber, F_OIDEQ, - ObjectIdGetDatum(roleid)); + ReleaseSysCache(tuple); - sscan = systable_beginscan(pg_auth_members_rel, AuthMemRoleMemIndexId, - true, NULL, 1, &scankey); + /* + * Remove role from the pg_auth_members table. We have to remove all + * tuples that show it as either a role or a member. + * + * XXX what about grantor entries? Maybe we should do one heap scan. + */ + ScanKeyInit(&scankey, + Anum_pg_auth_members_roleid, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(roleid)); + + sscan = systable_beginscan(pg_auth_members_rel, AuthMemRoleMemIndexId, + true, NULL, 1, &scankey); + + while (HeapTupleIsValid(tmp_tuple = systable_getnext(sscan))) + { + CatalogTupleDelete(pg_auth_members_rel, &tmp_tuple->t_self); + } - while (HeapTupleIsValid(tmp_tuple = systable_getnext(sscan))) - { - CatalogTupleDelete(pg_auth_members_rel, &tmp_tuple->t_self); - } + systable_endscan(sscan); - systable_endscan(sscan); + ScanKeyInit(&scankey, + Anum_pg_auth_members_member, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(roleid)); - ScanKeyInit(&scankey, - Anum_pg_auth_members_member, - BTEqualStrategyNumber, F_OIDEQ, - ObjectIdGetDatum(roleid)); + sscan = systable_beginscan(pg_auth_members_rel, AuthMemMemRoleIndexId, + true, NULL, 1, &scankey); - sscan = systable_beginscan(pg_auth_members_rel, AuthMemMemRoleIndexId, - true, NULL, 1, &scankey); + while (HeapTupleIsValid(tmp_tuple = systable_getnext(sscan))) + { + CatalogTupleDelete(pg_auth_members_rel, &tmp_tuple->t_self); + } - while (HeapTupleIsValid(tmp_tuple = systable_getnext(sscan))) - { - CatalogTupleDelete(pg_auth_members_rel, &tmp_tuple->t_self); - } + systable_endscan(sscan); - systable_endscan(sscan); + /* + * Remove any comments or security labels on this role. + */ + DeleteSharedComments(roleid, AuthIdRelationId); + DeleteSharedSecurityLabel(roleid, AuthIdRelationId); - /* - * Remove any comments or security labels on this role. - */ - DeleteSharedComments(roleid, AuthIdRelationId); - DeleteSharedSecurityLabel(roleid, AuthIdRelationId); + /* + * Remove settings for this role. + */ + DropSetting(InvalidOid, roleid); - /* - * Remove settings for this role. 
- */ - DropSetting(InvalidOid, roleid); + /* + * Advance command counter so that later iterations of this loop will + * see the changes already made. This is essential if, for example, + * we are trying to drop both a role and one of its direct members --- + * we'll get an error if we try to delete the linking pg_auth_members + * tuple twice. (We do not need a CCI between the two delete loops + * above, because it's not allowed for a role to directly contain + * itself.) + */ + CommandCounterIncrement(); + + if (POOL_CONN_RELEASE_SUCCESS != PoolManagerClosePooledConnections(NULL, role)) + { + elog(ERROR, "failed to close pooled connection for role:%s", role); + } +} - /* - * Advance command counter so that later iterations of this loop will - * see the changes already made. This is essential if, for example, - * we are trying to drop both a role and one of its direct members --- - * we'll get an error if we try to delete the linking pg_auth_members - * tuple twice. (We do not need a CCI between the two delete loops - * above, because it's not allowed for a role to directly contain - * itself.) - */ - CommandCounterIncrement(); +#ifdef __TBASE__ +bool PreCheckDropRole(DropRoleStmt *stmt, char *query_string, + List **exist_roles) +{ + Relation pg_authid_rel, + pg_auth_members_rel; + ListCell *item; + bool need_drop = false; + bool querystring_omit = false; + + if (!have_createrole_privilege()) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("permission denied to drop role"))); + + pg_authid_rel = heap_open(AuthIdRelationId, RowExclusiveLock); + pg_auth_members_rel = heap_open(AuthMemRelationId, RowExclusiveLock); + + foreach(item, stmt->roles) + { + RoleSpec *rolspec = lfirst(item); + char *role; + HeapTuple tuple; + + if (rolspec->roletype != ROLESPEC_CSTRING) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("cannot use special role specifier in DROP ROLE"))); + role = rolspec->rolename; + + tuple = SearchSysCache1(AUTHNAME, PointerGetDatum(role)); + if (!HeapTupleIsValid(tuple)) + { + if (!stmt->missing_ok) + { + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("role \"%s\" does not exist", role))); + } + else + { + ereport(NOTICE, + (errmsg("role \"%s\" does not exist, skipping", + role))); + } + + if (query_string) + { + if (!querystring_omit) + { + OmitqueryStringSpace(query_string); + querystring_omit = true; + } + RemoveObjnameInQueryString(query_string, role); + } + + continue; + } + ReleaseSysCache(tuple); + *exist_roles = lappend(*exist_roles, role); + need_drop = true; + } + heap_close(pg_auth_members_rel, RowExclusiveLock); + heap_close(pg_authid_rel, RowExclusiveLock); + return need_drop; +} - if (POOL_CONN_RELEASE_SUCCESS != PoolManagerClosePooledConnections(NULL, role)) - { - elog(ERROR, "failed to close pooled connection for role:%s", role); - } - } +void DropRoleParallelMode(List *role_list) +{ + Relation pg_authid_rel, + pg_auth_members_rel; + ListCell *item; + + /* + * Scan the pg_authid relation to find the Oid of the role(s) to be + * deleted. 
+ */ + pg_authid_rel = heap_open(AuthIdRelationId, RowExclusiveLock); + pg_auth_members_rel = heap_open(AuthMemRelationId, RowExclusiveLock); + + foreach(item, role_list) + { + char *role; + HeapTuple tuple; + + role = lfirst(item); + /* tuple will be release by DropRoleByTuple below */ + tuple = SearchSysCache1(AUTHNAME, PointerGetDatum(role)); + if (!HeapTupleIsValid(tuple)) + { + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("Precheck role \"%s\" existed, but now does not exist", role))); + } + DropRoleByTuple(role, tuple, pg_authid_rel, pg_auth_members_rel); + } /* * Now we can clean up; but keep locks until commit. @@ -1263,6 +1311,87 @@ DropRole(DropRoleStmt *stmt) heap_close(pg_authid_rel, NoLock); } +#endif + +/* + * DROP ROLE + */ +bool +DropRole(DropRoleStmt *stmt, bool missing_ok, char *query_string) +{ + Relation pg_authid_rel, + pg_auth_members_rel; + ListCell *item; + bool querystring_omit = false; + bool need_drop = false; + + if (!have_createrole_privilege()) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("permission denied to drop role"))); + + /* + * Scan the pg_authid relation to find the Oid of the role(s) to be + * deleted. + */ + pg_authid_rel = heap_open(AuthIdRelationId, RowExclusiveLock); + pg_auth_members_rel = heap_open(AuthMemRelationId, RowExclusiveLock); + + foreach(item, stmt->roles) + { + RoleSpec *rolspec = lfirst(item); + HeapTuple tuple; + char *role; + + if (rolspec->roletype != ROLESPEC_CSTRING) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("cannot use special role specifier in DROP ROLE"))); + role = rolspec->rolename; + + tuple = SearchSysCache1(AUTHNAME, PointerGetDatum(role)); + if (!HeapTupleIsValid(tuple)) + { + if (!missing_ok) + { + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("role \"%s\" does not exist", role))); + } + else + { + ereport(NOTICE, + (errmsg("role \"%s\" does not exist, skipping", + role))); + } + + if (query_string) + { + if (!querystring_omit) + { + OmitqueryStringSpace(query_string); + querystring_omit = true; + } + RemoveObjnameInQueryString(query_string, role); + } + + continue; + } + + DropRoleByTuple(role, tuple, pg_authid_rel, pg_auth_members_rel); + + need_drop = true; + } + + /* + * Now we can clean up; but keep locks until commit. + */ + heap_close(pg_auth_members_rel, NoLock); + heap_close(pg_authid_rel, NoLock); + + return need_drop; +} + /* * Rename role */ diff --git a/src/backend/commands/view.c b/src/backend/commands/view.c index b2a9ebc6..30862aa4 100644 --- a/src/backend/commands/view.c +++ b/src/backend/commands/view.c @@ -416,26 +416,18 @@ UpdateRangeTableOfViewParse(Oid viewOid, Query *viewParse) return viewParse; } - /* - * DefineView - * Execute a CREATE VIEW command. + * MakeViewParse + * Run parse analysis to convert the raw parse tree to a Query. Note this + * also acquires sufficient locks on the source table(s). */ -ObjectAddress -DefineView(ViewStmt *stmt, const char *queryString, +Query * +MakeViewParse(ViewStmt* stmt, const char* query_string, int stmt_location, int stmt_len) -{// #lizard forgives +{ + Query *viewParse = NULL; RawStmt *rawstmt; - Query *viewParse; - RangeVar *view; - ListCell *cell; - bool check_option; - ObjectAddress address; - /* - * Run parse analysis to convert the raw parse tree to a Query. Note this - * also acquires sufficient locks on the source table(s). 
- * * Since parse analysis scribbles on its input, copy the raw parse tree; * this ensures we don't corrupt a prepared statement, for example. */ @@ -443,9 +435,59 @@ DefineView(ViewStmt *stmt, const char *queryString, rawstmt->stmt = (Node *) copyObject(stmt->query); rawstmt->stmt_location = stmt_location; rawstmt->stmt_len = stmt_len; + viewParse = parse_analyze(rawstmt, query_string, NULL, 0, NULL); + return viewParse; +} + +#ifdef __TBASE__ +/* + * IsViewTemp + * Check whethe we need a temporary view. + */ +bool +IsViewTemp(ViewStmt* stmt, const char* query_string, + int stmt_location, int stmt_len, + List **relation_list) +{ + Query *viewParse = NULL; + RangeVar *view = NULL; + + + /* don't corrupt original command */ + view = (RangeVar*)copyObject(stmt->view); + viewParse = MakeViewParse(stmt, query_string, stmt_location, stmt_len); - viewParse = parse_analyze(rawstmt, queryString, NULL, 0, NULL); + /* + * If the user didn't explicitly ask for a temporary view, check whether + * we need one implicitly. We allow TEMP to be inserted automatically as + * long as the CREATE command is consistent with that --- no explicit + * schema name. + */ + if (view->relpersistence == RELPERSISTENCE_PERMANENT && + CheckAndGetRelation(viewParse, relation_list)) + { + view->relpersistence = RELPERSISTENCE_TEMP; + } + + return view->relpersistence == RELPERSISTENCE_TEMP; +} +#endif + +/* + * DefineView + * Execute a CREATE VIEW command. + */ +ObjectAddress +DefineView(ViewStmt *stmt, const char *queryString, + int stmt_location, int stmt_len) +{ + Query *viewParse; + RangeVar *view; + ListCell *cell; + bool check_option; + ObjectAddress address; + viewParse = MakeViewParse(stmt, queryString, stmt_location, stmt_len); /* * The grammar should ensure that the result is a single SELECT Query. * However, it doesn't forbid SELECT INTO, so we have to check for that. 
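Note on the commands/view.c hunk above: parse analysis is split out of DefineView into MakeViewParse so that IsViewTemp can run it early, decide whether the view is (or must implicitly become) temporary, and hand back the OIDs of the relations it references. A minimal sketch of how a coordinator-side caller could use it follows; the wrapper name dispatch_create_view and both branch bodies are assumptions for illustration only, not part of this patch.

    /* Illustrative sketch, not part of the patch. */
    static void
    dispatch_create_view(ViewStmt *stmt, const char *queryString,
                         int stmt_location, int stmt_len)
    {
        List *rels = NIL;

        if (IsViewTemp(stmt, queryString, stmt_location, stmt_len, &rels))
        {
            /* Temp (or implicitly temp) view: keep it on this coordinator. */
        }
        else
        {
            /* Permanent view: the statement can be sent to the other nodes. */
        }

        /* rels holds the relation OIDs collected while walking the query. */
        list_free(rels);
    }
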
diff --git a/src/backend/parser/parse_relation.c b/src/backend/parser/parse_relation.c index 24196b21..10b20a9e 100644 --- a/src/backend/parser/parse_relation.c +++ b/src/backend/parser/parse_relation.c @@ -71,6 +71,12 @@ static int specialAttNum(const char *attname); #endif static bool isQueryUsingTempRelation_walker(Node *node, void *context); +#ifdef __TBASE__ +typedef struct +{ + List *related_oids; /* the related tableoid list */ +} ViewRelatedContext; +#endif /* * refnameRangeTblEntry @@ -3425,6 +3431,19 @@ errorMissingColumn(ParseState *pstate, } } +#ifdef __TBASE__ +bool +CheckAndGetRelation(Query *query, List **relation_list) +{ + bool tmp = false; + ViewRelatedContext context; + + context.related_oids = NIL; + tmp = isQueryUsingTempRelation_walker((Node *) query, &context); + *relation_list = context.related_oids; + return tmp; +} +#endif /* * Examine a fully-parsed query, and return TRUE iff any relation underlying @@ -3455,8 +3474,15 @@ isQueryUsingTempRelation_walker(Node *node, void *context) { Relation rel = heap_open(rte->relid, AccessShareLock); char relpersistence = rel->rd_rel->relpersistence; - heap_close(rel, AccessShareLock); +#ifdef __TBASE__ + if (context) + { + ViewRelatedContext *vrContext = (ViewRelatedContext *)context; + vrContext->related_oids = lappend_oid(vrContext->related_oids, + RelationGetRelid(rel)); + } +#endif if (relpersistence == RELPERSISTENCE_TEMP) return true; } diff --git a/src/backend/parser/parse_utilcmd.c b/src/backend/parser/parse_utilcmd.c index 88c6077e..08cb09e6 100644 --- a/src/backend/parser/parse_utilcmd.c +++ b/src/backend/parser/parse_utilcmd.c @@ -218,7 +218,11 @@ static char * ChooseSerialName(const char *relname, const char *colname, * then expand those into multiple IndexStmt blocks. * - thomas 1997-12-02 */ -#ifdef XCP +#ifdef __TBASE__ +List * +transformCreateStmt(CreateStmt *stmt, const char *queryString, + bool autodistribute, Oid *nspaceid, bool existsok) +#elif XCP List * transformCreateStmt(CreateStmt *stmt, const char *queryString, bool autodistribute) @@ -312,16 +316,36 @@ transformCreateStmt(CreateStmt *stmt, const char *queryString) &existing_relid); cancel_parser_errposition_callback(&pcbstate); +#ifdef __TBASE__ + if (nspaceid) + *nspaceid = namespaceid; +#endif + /* * If the relation already exists and the user specified "IF NOT EXISTS", * bail out with a NOTICE. */ if (stmt->if_not_exists && OidIsValid(existing_relid)) { + if (existsok) + { ereport(NOTICE, (errcode(ERRCODE_DUPLICATE_TABLE), errmsg("relation \"%s\" already exists, skipping", stmt->relation->relname))); + } + else + { + /* + * In PARALLEL DDL mode, remote node emit error if relation + * already exists to keep consistency with local cn. + */ + ereport(ERROR, + (errcode(ERRCODE_DUPLICATE_TABLE), + errmsg("relation \"%s\" already exists, skipping", + stmt->relation->relname))); + } + return NIL; } @@ -3317,6 +3341,7 @@ transformAlterTableStmt(Oid relid, AlterTableStmt *stmt, RangeTblEntry *rte; #ifdef __TBASE__ List *createlist = NULL; + List *partlist = NIL; #endif /* * We must not scribble on the passed-in AlterTableStmt, so copy it. 
(This @@ -3550,7 +3575,14 @@ transformAlterTableStmt(Oid relid, AlterTableStmt *stmt, createpart->partbound = NULL; createpart->partspec = NULL; +#ifdef __TBASE__ + partlist = transformCreateStmt(createpart, + queryString, true, + NULL, true); + createlist = list_concat(createlist, partlist); +#else createlist = list_concat(createlist, transformCreateStmt(createpart, queryString, true)); +#endif } } else diff --git a/src/backend/pgxc/locator/redistrib.c b/src/backend/pgxc/locator/redistrib.c index 6a011400..ad13088a 100644 --- a/src/backend/pgxc/locator/redistrib.c +++ b/src/backend/pgxc/locator/redistrib.c @@ -969,7 +969,11 @@ distrib_execute_query(char *sql, bool is_temp, ExecNodes *exec_nodes) /* Redistribution operations only concern Datanodes */ step->exec_type = EXEC_ON_DATANODES; +#ifdef __TBASE__ + ExecRemoteUtility(step, NULL, NON_PARALLEL_DDL); +#else ExecRemoteUtility(step); +#endif pfree(step->sql_statement); pfree(step); diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 17e6f838..1bb82166 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -123,8 +123,7 @@ static bool temp_object_included = false; static abort_callback_type dbcleanup_info = { NULL, NULL }; static int pgxc_node_begin(int conn_count, PGXCNodeHandle ** connections, - GlobalTransactionId gxid, bool need_tran_block, - bool readOnly, char node_type); + GlobalTransactionId gxid, bool need_tran_block, bool readOnly); static PGXCNodeAllHandles *get_exec_connections(RemoteQueryState *planstate, ExecNodes *exec_nodes, @@ -3424,16 +3423,14 @@ is_data_node_ready(PGXCNodeHandle * conn) return false; } - /* * Send BEGIN command to the Datanodes or Coordinators and receive responses. * Also send the GXID for the transaction. */ static int pgxc_node_begin(int conn_count, PGXCNodeHandle **connections, - GlobalTransactionId gxid, bool need_tran_block, - bool readOnly, char node_type) -{// #lizard forgives + GlobalTransactionId gxid, bool need_tran_block, bool readOnly) +{ #define SET_CMD_LENGTH 128 int i; struct timeval *timeout = NULL; @@ -4725,7 +4722,7 @@ pgxc_node_remote_commit(TranscationType txn_type, bool need_release_handle) { int conn_count = 0; - if (!enable_parallel_ddl || !is_txn_has_parallel_ddl) + if (!is_txn_has_parallel_ddl) { /* normal cases */ conn_count = pgxc_node_remote_commit_internal(get_current_handles(), txn_type); @@ -5899,7 +5896,8 @@ DataNodeCopyBegin(RemoteCopyData *rcstate) gxid = GetCurrentTransactionId(); /* Start transaction on connections where it is not started */ - if (pgxc_node_begin(conn_count, connections, gxid, need_tran_block, false, PGXC_NODE_DATANODE)) + + if (pgxc_node_begin(conn_count, connections, gxid, need_tran_block, false)) { ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), @@ -6718,6 +6716,211 @@ pgxc_start_command_on_connection(PGXCNodeHandle *connection, return true; } +/* + * Get snapshot and gxid for remote utility. 
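+ * Fills *gxid and *snapshot for the utility statement carried by "node":
+ * statements that do not need a transaction (e.g. ROLLBACK, SET) skip the
+ * gxid, and on a non-local coordinator the gxid is reset to
+ * InvalidTransactionId, since distributed DDL is dispatched only from the
+ * requesting coordinator.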
+ */ +void +GetGlobInfoForRemoteUtility(RemoteQuery *node, GlobalTransactionId *gxid, + Snapshot *snapshot) +{ + bool utility_need_transcation = true; + +#ifdef __TBASE__ + /* Some DDL such as ROLLBACK, SET does not need transaction */ + utility_need_transcation = + (!ExecDDLWithoutAcquireXid(node->parsetree) && !node->is_set); + + if (utility_need_transcation) +#endif + { + elog(LOG, "[SAVEPOINT] node->sql_statement:%s", node->sql_statement); + *gxid = GetCurrentTransactionId(); + } + + if (ActiveSnapshotSet()) + *snapshot = GetActiveSnapshot(); + +#ifdef __TBASE__ + if (utility_need_transcation) +#endif + { + if (!GlobalTransactionIdIsValid(*gxid)) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to get next transaction ID"))); + } + +#ifdef __SUPPORT_DISTRIBUTED_TRANSACTION__ + if(!IS_PGXC_LOCAL_COORDINATOR) + { + /* + * Distributed DDLs only dispatch from the requested coordinator, thus + * we skip sending gxid to avoid cycling. + * + * Note: except for 'set_config_option'. + */ + *gxid = InvalidTransactionId; + } + +#endif +} + +/* + * Send snapshot/cmdid/query to remote node. + */ +void +SendTxnInfo(RemoteQuery *node, PGXCNodeHandle *conn, + CommandId cid, Snapshot snapshot) +{ + if (conn->state == DN_CONNECTION_STATE_QUERY) + BufferConnection(conn); + if (snapshot && pgxc_node_send_snapshot(conn, snapshot)) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to send snapshot to %s", conn->nodename))); + } + if (pgxc_node_send_cmd_id(conn, cid) < 0) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to send command ID to %s", conn->nodename))); + } + + if (pgxc_node_send_query(conn, node->sql_statement) != 0) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to send command to %s", conn->nodename))); + } +} + +/* + * Check response of remote connection. + */ +bool +CheckRemoteRespond(PGXCNodeHandle *conn, ResponseCombiner *combiner, + int *index, int *conn_count) +{ + int res = handle_response(conn, combiner); + if (res == RESPONSE_EOF) + { + (*index)++; + } + else if (res == RESPONSE_COMPLETE) + { + /* Ignore, wait for ReadyForQuery */ + if (conn->state == DN_CONNECTION_STATE_ERROR_FATAL) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Unexpected FATAL ERROR on Connection to " + "Datanode %s pid %d", + conn->nodename, conn->backend_pid))); + } + } + else if (res == RESPONSE_ERROR) + { + /* Ignore, wait for ReadyForQuery */ + } + else if (res == RESPONSE_READY) + { + if ((*index) < --(*conn_count)) + return true; + } + else if (res == RESPONSE_TUPDESC) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Unexpected response from %s pid %d", + conn->nodename, conn->backend_pid))); + } + else if (res == RESPONSE_DATAROW) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Unexpected response from %s pid %d", + conn->nodename, conn->backend_pid))); + } + return false; +} + +/* + * Receive remote response and chek receive status. + */ +void RemoteReceiveAndCheck(int conn_count, PGXCNodeHandle **conns, + ResponseCombiner *combiner) +{ + /* + * Stop if all commands are completed or we got a data row and + * initialized state node for subsequent invocations + */ + while (conn_count > 0) + { + int i = 0; + bool remote_ready = false; + + /* Wait until one of the connections has data available */ + if (pgxc_node_receive(conn_count, + conns, + NULL)) + { + /* + * Got error + * TODO(Tbase): How do we check the error here? 
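+		 * For now we simply stop polling; any error text collected in the
+		 * combiner is reported later by the caller via pgxc_node_report_error().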
+ */ + break; + } + + while (i < conn_count) + { + PGXCNodeHandle *conn = NULL; + if (remote_ready) + { + conns[i] = conns[conn_count]; + } + conn = conns[i]; + remote_ready = CheckRemoteRespond(conn, combiner, &i, &conn_count); + } + } +} + +#ifdef __TBASE__ +/* + * Send ddl to leader cn, the function only be invoked + * in parallel ddl mode. + */ +void +LeaderCnExecRemoteUtility(RemoteQuery *node, + PGXCNodeHandle *leader_cn_conn, + ResponseCombiner *combiner, + bool need_tran_block, + GlobalTransactionId gxid, + Snapshot snapshot, + CommandId cid) +{ + int cn_cout = 1; + char *init_str = PGXCNodeGetSessionParamStr(); + if (init_str) + { + pgxc_node_set_query(leader_cn_conn, init_str); + } + + SetPlpgsqlTransactionBegin(leader_cn_conn); + if (pgxc_node_begin(cn_cout, &leader_cn_conn, gxid, + need_tran_block, false)) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Could not begin transaction on leader coordinator"))); + } + + /* Send other txn related messages to leader cn */ + SendTxnInfo(node, leader_cn_conn, cid, snapshot); + + RemoteReceiveAndCheck(cn_cout, &leader_cn_conn, combiner); +} + /* * Execute utility statement on multiple Datanodes * It does approximately the same as @@ -6730,8 +6933,11 @@ pgxc_start_command_on_connection(PGXCNodeHandle *connection, * like allocating tuple slots. */ void +ExecRemoteUtility(RemoteQuery *node, PGXCNodeHandle *leader_cn_conn, ParallelDDLRemoteType type) +#else ExecRemoteUtility(RemoteQuery *node) -{// #lizard forgives +#endif +{ RemoteQueryState *remotestate; ResponseCombiner *combiner; bool force_autocommit = node->force_autocommit; @@ -6739,13 +6945,12 @@ ExecRemoteUtility(RemoteQuery *node) GlobalTransactionId gxid = InvalidGlobalTransactionId; Snapshot snapshot = NULL; PGXCNodeAllHandles *pgxc_connections; - int co_conn_count; - int dn_conn_count; + int co_conn_count = 0; + int dn_conn_count = 0; bool need_tran_block; ExecDirectType exec_direct_type = node->exec_direct_type; int i; CommandId cid = GetCurrentCommandId(true); - bool utility_need_transcation = true; if (!force_autocommit) RegisterTransactionLocalNode(true); @@ -6761,6 +6966,13 @@ ExecRemoteUtility(RemoteQuery *node) pgxc_connections = get_exec_connections(NULL, node->exec_nodes, exec_type, exec_direct_type != EXEC_DIRECT_UTILITY); +#ifdef __TBASE__ + if (type == EXCLUED_LEADER_DDL) + { + delete_leadercn_handle(pgxc_connections, leader_cn_conn); + } +#endif + dn_conn_count = pgxc_connections->dn_conn_count; co_conn_count = pgxc_connections->co_conn_count; @@ -6792,172 +7004,31 @@ ExecRemoteUtility(RemoteQuery *node) "transaction block"))); } -#ifdef __TBASE__ - /* Some DDL such as ROLLBACK, SET does not need transaction */ - utility_need_transcation = - (!ExecDDLWithoutAcquireXid(node->parsetree) && !node->is_set); - - if (utility_need_transcation) -#endif - { - elog(LOG, "[SAVEPOINT] node->sql_statement:%s", node->sql_statement); - gxid = GetCurrentTransactionId(); - } - - if (ActiveSnapshotSet()) - snapshot = GetActiveSnapshot(); + GetGlobInfoForRemoteUtility(node, &gxid, &snapshot); #ifdef __TBASE__ - if (utility_need_transcation) -#endif + if (type == ONLY_LEADER_DDL) { - if (!GlobalTransactionIdIsValid(gxid)) - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Failed to get next transaction ID"))); - } - -#ifdef __SUPPORT_DISTRIBUTED_TRANSACTION__ - if(!IS_PGXC_LOCAL_COORDINATOR) - { - /* - * Distributed DDLs only dispatch from the requested coordinator, thus - * we skip sending gxid to avoid cycling. 
- * - * Note: except for 'set_config_option'. - */ - gxid = InvalidTransactionId; - } - -#endif - -#ifdef __TBASE__ - /* Set node begin transaction in plpgsql function for CN/DN */ - for (i = 0; i < dn_conn_count; i++) - { - SetPlpgsqlTransactionBegin(pgxc_connections->datanode_handles[i]); - } - - for (i = 0; i < co_conn_count; i++) - { - SetPlpgsqlTransactionBegin(pgxc_connections->coord_handles[i]); - } -#endif - - /* - * DDL will firstly be executed on coordinators then datanodes - * which will avoid deadlocks in cluster. - * Let us assume that user sql and ddl hold conflict locks, - * then there will be two situations: - * 1. The coordinator is not locked, user sql will see datanodes with no lock. - * 2. The coordinator is locked, user sql will wait for ddl to complete. - * - * Send BEGIN control command to all coordinator nodes - */ - if (pgxc_node_begin(co_conn_count, - pgxc_connections->coord_handles, - gxid, - need_tran_block, - false, - PGXC_NODE_COORDINATOR)) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Could not begin transaction on coordinators"))); - } - - /* Send other txn related messages to coordinator nodes */ - for (i = 0; i < co_conn_count; i++) - { - PGXCNodeHandle *conn = pgxc_connections->coord_handles[i]; - - if (snapshot && pgxc_node_send_snapshot(conn, snapshot)) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Failed to send command to coordinators"))); - } - if (pgxc_node_send_cmd_id(conn, cid) < 0) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Failed to send command ID to Datanodes"))); - } - - if (pgxc_node_send_query(conn, node->sql_statement) != 0) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Failed to send command to coordinators"))); - } + LeaderCnExecRemoteUtility(node, leader_cn_conn, combiner, + need_tran_block, gxid, snapshot, cid); + pfree_pgxc_all_handles(pgxc_connections); + pgxc_node_report_error(combiner); + return; } - - /* - * Stop if all commands are completed or we got a data row and - * initialized state node for subsequent invocations - */ - while (co_conn_count > 0) + else { - int i = 0; - - /* Wait until one of the connections has data available */ - if (pgxc_node_receive(co_conn_count, - pgxc_connections->coord_handles, - NULL)) + /* Set node begin transaction in plpgsql function for CN/DN */ + for (i = 0; i < dn_conn_count; i++) { - /* - * Got error - * TODO(Tbase): How do we check the error here? 
- */ - break; - } - - while (i < co_conn_count) + SetPlpgsqlTransactionBegin(pgxc_connections->datanode_handles[i]); + } + + for (i = 0; i < co_conn_count; i++) { - PGXCNodeHandle *conn = pgxc_connections->coord_handles[i]; - int res = handle_response(conn, combiner); - - if (res == RESPONSE_EOF) - { - i++; - } - else if (res == RESPONSE_COMPLETE) - { - /* Ignore, wait for ReadyForQuery */ - if (conn->state == DN_CONNECTION_STATE_ERROR_FATAL) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Unexpected FATAL ERROR on Connection to " - "Coordinator %s pid %d", - pgxc_connections->coord_handles[i]->nodename, - pgxc_connections->coord_handles[i]->backend_pid))); - } - } - else if (res == RESPONSE_ERROR) - { - /* Ignore, wait for ReadyForQuery */ - } - else if (res == RESPONSE_READY) - { - if (i < --co_conn_count) - pgxc_connections->coord_handles[i] = - pgxc_connections->coord_handles[co_conn_count]; - } - else if (res == RESPONSE_TUPDESC) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Unexpected response from coordinator"))); - } - else if (res == RESPONSE_DATAROW) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Unexpected response from coordinator"))); - } - } - } + SetPlpgsqlTransactionBegin(pgxc_connections->coord_handles[i]); + } + } +#endif /* * DDL will firstly be executed on coordinators then datanodes @@ -6973,8 +7044,7 @@ ExecRemoteUtility(RemoteQuery *node) pgxc_connections->coord_handles, gxid, need_tran_block, - false, - PGXC_NODE_COORDINATOR)) + false)) { ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), @@ -6985,94 +7055,24 @@ ExecRemoteUtility(RemoteQuery *node) for (i = 0; i < co_conn_count; i++) { PGXCNodeHandle *conn = pgxc_connections->coord_handles[i]; - - if (snapshot && pgxc_node_send_snapshot(conn, snapshot)) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Failed to send command to coordinators"))); - } - if (pgxc_node_send_cmd_id(conn, cid) < 0) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Failed to send command ID to Datanodes"))); - } - - if (pgxc_node_send_query(conn, node->sql_statement) != 0) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Failed to send command to coordinators"))); - } + SendTxnInfo(node, conn, cid, snapshot); } /* * Stop if all commands are completed or we got a data row and * initialized state node for subsequent invocations */ - while (co_conn_count > 0) - { - int i = 0; - - /* Wait until one of the connections has data available */ - if (pgxc_node_receive(co_conn_count, + RemoteReceiveAndCheck(co_conn_count, pgxc_connections->coord_handles, - NULL)) - { - /* - * Got error - * TODO(Tbase): How do we check the error here? 
- */ - break; - } + combiner); - while (i < co_conn_count) - { - PGXCNodeHandle *conn = pgxc_connections->coord_handles[i]; - int res = handle_response(conn, combiner); - - if (res == RESPONSE_EOF) - { - i++; - } - else if (res == RESPONSE_COMPLETE) - { - /* Ignore, wait for ReadyForQuery */ - if (conn->state == DN_CONNECTION_STATE_ERROR_FATAL) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Unexpected FATAL ERROR on Connection to " - "Coordinator %s pid %d", - pgxc_connections->coord_handles[i]->nodename, - pgxc_connections->coord_handles[i]->backend_pid))); - } - } - else if (res == RESPONSE_ERROR) - { - /* Ignore, wait for ReadyForQuery */ - } - else if (res == RESPONSE_READY) - { - if (i < --co_conn_count) - pgxc_connections->coord_handles[i] = - pgxc_connections->coord_handles[co_conn_count]; - } - else if (res == RESPONSE_TUPDESC) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Unexpected response from coordinator"))); - } - else if (res == RESPONSE_DATAROW) +#ifdef __TBASE__ + if (LOCAL_PARALLEL_DDL && combiner && combiner->errorMessage) { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Unexpected response from coordinator"))); - } - } + pfree_pgxc_all_handles(pgxc_connections); + pgxc_node_report_error(combiner); } +#endif /* * Send BEGIN control command to all data nodes @@ -7081,8 +7081,7 @@ ExecRemoteUtility(RemoteQuery *node) pgxc_connections->datanode_handles, gxid, need_tran_block, - false, - PGXC_NODE_DATANODE)) + false)) { ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), @@ -7093,98 +7092,12 @@ ExecRemoteUtility(RemoteQuery *node) for (i = 0; i < dn_conn_count; i++) { PGXCNodeHandle *conn = pgxc_connections->datanode_handles[i]; - - if (conn->state == DN_CONNECTION_STATE_QUERY) - BufferConnection(conn); - if (snapshot && pgxc_node_send_snapshot(conn, snapshot)) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Failed to send snapshot to Datanodes"))); - } - if (pgxc_node_send_cmd_id(conn, cid) < 0) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Failed to send command ID to Datanodes"))); - } - - if (pgxc_node_send_query(conn, node->sql_statement) != 0) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Failed to send command to Datanodes"))); - } + SendTxnInfo(node, conn, cid, snapshot); } - - /* Make the same for data nodes */ - while (dn_conn_count > 0) - { - int i = 0; - - /* Wait until one of the connections has data available */ - if (pgxc_node_receive(dn_conn_count, + RemoteReceiveAndCheck(dn_conn_count, pgxc_connections->datanode_handles, - NULL)) - { - /* - * Got error - * TODO(Tbase): How do we check the error here? - */ - break; - } - - /* - * Handle input from the data nodes. We do not expect data nodes - * returning tuples when running utility command. If we got EOF, move - * to the next connection, will receive more data on the next - * iteration. 
- */ - while (i < dn_conn_count) - { - PGXCNodeHandle *conn = pgxc_connections->datanode_handles[i]; - int res = handle_response(conn, combiner); - if (res == RESPONSE_EOF) - { - i++; - } - else if (res == RESPONSE_COMPLETE) - { - /* Ignore, wait for ReadyForQuery */ - if (conn->state == DN_CONNECTION_STATE_ERROR_FATAL) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Unexpected FATAL ERROR on Connection to " - "Datanode %s pid %d", - conn->nodename, conn->backend_pid))); - } - } - else if (res == RESPONSE_ERROR) - { - /* Ignore, wait for ReadyForQuery */ - } - else if (res == RESPONSE_READY) - { - if (i < --dn_conn_count) - pgxc_connections->datanode_handles[i] = - pgxc_connections->datanode_handles[dn_conn_count]; - } - else if (res == RESPONSE_TUPDESC) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Unexpected response from Datanode"))); - } - else if (res == RESPONSE_DATAROW) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Unexpected response from Datanode"))); - } - } - } + combiner); /* * We have processed all responses from nodes and if we have error message @@ -9086,7 +8999,7 @@ ExecRemoteQuery(PlanState *pstate) combiner->current_conn = 0; #endif if (pgxc_node_begin(1, &primaryconnection, gxid, need_tran_block, - step->read_only, PGXC_NODE_DATANODE)) + step->read_only)) ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), errmsg("Could not begin transaction on data node:%s.", @@ -9155,8 +9068,9 @@ ExecRemoteQuery(PlanState *pstate) #ifdef __TBASE__ connections[i]->recv_datarows = 0; #endif + if (pgxc_node_begin(1, &connections[i], gxid, need_tran_block, - step->read_only, PGXC_NODE_DATANODE)) + step->read_only)) ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), errmsg("Could not begin transaction on data node:%s.", @@ -10593,8 +10507,7 @@ ExecFinishInitRemoteSubplan(RemoteSubplanState *node) { PGXCNodeHandle *connection = combiner->connections[i]; - if (pgxc_node_begin(1, &connection, gxid, true, - is_read_only, PGXC_NODE_DATANODE)) + if (pgxc_node_begin(1, &connection, gxid, true, is_read_only)) ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), errmsg("Could not begin transaction on data node:%s.", @@ -12641,8 +12554,7 @@ ExecRemoteDML(ModifyTableState *mtstate, ItemPointer tupleid, HeapTuple oldtuple { gxid = GetCurrentTransactionIdIfAny(); - if (pgxc_node_begin(1, &connections[i], gxid, true, - false, PGXC_NODE_DATANODE)) + if (pgxc_node_begin(1, &connections[i], gxid, true, false)) { elog(ERROR, "Could not begin transaction on datanode in ExecRemoteDML, nodeid:%d.", connections[i]->nodeid); diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c index 6db7d43b..5424a200 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -2701,6 +2701,7 @@ pgxc_node_send_query_internal(PGXCNodeHandle * handle, const char *query, { int strLen; int msgLen; + /* * Its appropriate to send ROLLBACK commands on a failed connection, but * for everything else we expect the connection to be in a sane state @@ -5829,23 +5830,22 @@ PGXCGetAllDnOid(Oid *nodelist) /* * Return the name of ascii-minimized coordinator as ddl leader cn */ -inline char* +PGXCNodeHandle* find_ddl_leader_cn(void) { int i = 0; - char* result = NULL; + char *name = NULL; + PGXCNodeHandle *result = NULL; for (i = 0; i < NumCoords; i++) { - if(result == NULL || strcmp(co_handles[i].nodename, result) < 0) + if(name == NULL || strcmp(co_handles[i].nodename, name) < 0) { - result = co_handles[i].nodename; + name = 
co_handles[i].nodename; + result = &co_handles[i]; } } - if(result) - result = pstrdup(result); - return result; } @@ -5866,6 +5866,34 @@ is_pgxc_handles_init() { return (dn_handles != NULL && co_handles != NULL); } + +/* + * Remove leader_cn_handle from pgxc_connections + */ +void +delete_leadercn_handle(PGXCNodeAllHandles *pgxc_connections, + PGXCNodeHandle* leader_cn_handle) +{ + int co_conn_count = 0; + int i = 0; + + if (!pgxc_connections || !leader_cn_handle) + return; + + co_conn_count = pgxc_connections->co_conn_count; + for (i = 0; i < co_conn_count; i++) + { + if (pgxc_connections->coord_handles[i] == leader_cn_handle) + { + if (i+1 < co_conn_count) + pgxc_connections->coord_handles[i] = pgxc_connections->coord_handles[i+1]; + else + pgxc_connections->coord_handles[i] = NULL; + pgxc_connections->co_conn_count--; + break; + } + } +} #endif /* diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index 1ea3d3ea..a607a515 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -5195,6 +5195,7 @@ PostgresMain(int argc, char *argv[], #ifdef __TBASE__ /* Clear parallel DDL flag */ is_txn_has_parallel_ddl = false; + leader_cn_executed_ddl = false; #endif /* diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c index 73736d71..a984b9e6 100644 --- a/src/backend/tcop/utility.c +++ b/src/backend/tcop/utility.c @@ -17,6 +17,7 @@ *------------------------------------------------------------------------- */ #include "postgres.h" +#include "stdio.h" #include "access/htup_details.h" #include "access/reloptions.h" @@ -104,6 +105,8 @@ #include "utils/ruleutils.h" #include "utils/memutils.h" #include "catalog/index.h" +#include "catalog/pg_namespace.h" +#include "storage/lmgr.h" #endif #ifdef __AUDIT__ @@ -152,8 +155,12 @@ extern bool g_GTM_skip_catalog; bool is_txn_has_parallel_ddl; bool enable_parallel_ddl; +bool leader_cn_executed_ddl; + #endif +static RemoteQueryExecType GetRenameExecType(RenameStmt *stmt, bool *is_temp); + #endif /* Hook for plugins to get control in ProcessUtility() */ @@ -665,10 +672,14 @@ ProcessUtilityPre(PlannedStmt *pstmt, /* Clean also remote Coordinators */ snprintf(query, STRINGLENGTH, "CLEAN CONNECTION TO ALL FOR DATABASE %s;", quote_identifier(stmt->dbname)); + ExecUtilityStmtOnNodes(parsetree, query, NULL, sentToRemote, true, EXEC_ON_ALL_NODES, false, false); - if (!stmt->prepare) + /* + * parallel ddl mode, we send drop db prepare in standard_ProcessUtility + */ + if (!stmt->prepare && !is_txn_has_parallel_ddl) { /* Lock database and check the constraints before we actually dropping */ if (stmt->missing_ok) @@ -779,7 +790,9 @@ ProcessUtilityPre(PlannedStmt *pstmt, #ifdef _MIGRATE_ if(!IsConnFromCoord() && !isRestoreMode && IS_PGXC_COORDINATOR) { - ExecUtilityStmtOnNodes(parsetree, queryString, NULL, sentToRemote, false, EXEC_ON_ALL_NODES, false, false); + ExecUtilityStmtOnNodes(parsetree, queryString, NULL, + sentToRemote, false, EXEC_ON_ALL_NODES, + false, false); } #endif all_done = true; @@ -790,7 +803,9 @@ ProcessUtilityPre(PlannedStmt *pstmt, #ifdef _MIGRATE_ if(!IsConnFromCoord() && IS_PGXC_COORDINATOR) { - ExecUtilityStmtOnNodes(parsetree, queryString, NULL, sentToRemote, false, EXEC_ON_ALL_NODES, false, false); + ExecUtilityStmtOnNodes(parsetree, queryString, NULL, + sentToRemote, false, EXEC_ON_ALL_NODES, + false, false); } #endif all_done = true; @@ -813,7 +828,9 @@ ProcessUtilityPre(PlannedStmt *pstmt, #ifdef _MIGRATE_ if(!IsConnFromCoord() && IS_PGXC_COORDINATOR) { - 
ExecUtilityStmtOnNodes(parsetree, queryString, NULL, sentToRemote, false, EXEC_ON_ALL_NODES, false, false); + ExecUtilityStmtOnNodes(parsetree, queryString, NULL, + sentToRemote, false, EXEC_ON_ALL_NODES, + false, false); } #endif all_done = true; @@ -844,33 +861,10 @@ ProcessUtilityPre(PlannedStmt *pstmt, if (IS_PGXC_LOCAL_COORDINATOR) { - /* - * Get the necessary details about the relation before we - * run ExecRenameStmt locally. Otherwise we may not be able - * to look-up using the old relation name. - */ - if (stmt->relation) - { - /* - * If the table does not exist, don't send the query to - * the remote nodes. The local node will eventually - * report an error, which is then sent back to the - * client. - */ - Oid relid = RangeVarGetRelid(stmt->relation, NoLock, true); - - if (OidIsValid(relid)) - exec_type = ExecUtilityFindNodes(stmt->renameType, - relid, - &is_temp); - else - exec_type = EXEC_ON_NONE; - } - else - exec_type = ExecUtilityFindNodes(stmt->renameType, - InvalidOid, - &is_temp); + exec_type = GetRenameExecType(stmt, &is_temp); #ifdef __TBASE__ + if (LOCAL_PARALLEL_DDL) + exec_type = EXEC_ON_NONE; /* clean connections of the old name first. */ if (OBJECT_DATABASE == stmt->renameType) { @@ -879,8 +873,9 @@ ProcessUtilityPre(PlannedStmt *pstmt, DropDBCleanConnection(stmt->subname); /* Clean also remote nodes */ sprintf(query, "CLEAN CONNECTION TO ALL FOR DATABASE %s;", stmt->subname); - ExecUtilityStmtOnNodes(parsetree, query, NULL, sentToRemote, true, - EXEC_ON_ALL_NODES, false, false); + ExecUtilityStmtOnNodes(parsetree, query, NULL, + sentToRemote, true, EXEC_ON_ALL_NODES, + false, false); } #endif } @@ -899,7 +894,27 @@ ProcessUtilityPre(PlannedStmt *pstmt, * it will cause a deadlock in the cluster at Datanode levels. */ if (!IsConnFromCoord()) + { +#ifdef __TBASE__ + if (LOCAL_PARALLEL_DDL) + { + PGXCNodeHandle* leaderCnHandle = find_ddl_leader_cn(); + RemoteQueryExecType execType = ((RemoteQuery *) parsetree)->exec_type; + if ((execType == EXEC_ON_ALL_NODES || execType == EXEC_ON_COORDS)) + { + if (!is_ddl_leader_cn(leaderCnHandle->nodename)) + Assert(leader_cn_executed_ddl); + } + ExecRemoteUtility((RemoteQuery *) parsetree, + leaderCnHandle, EXCLUED_LEADER_DDL); + } + else + ExecRemoteUtility((RemoteQuery *) parsetree, + NULL, NON_PARALLEL_DDL); +#else ExecRemoteUtility((RemoteQuery *) parsetree); +#endif + } break; case T_CleanConnStmt: @@ -1245,7 +1260,6 @@ ProcessUtilityPre(PlannedStmt *pstmt, ExecUtilityStmtOnNodes(parsetree, queryString, NULL, sentToRemote, auto_commit, exec_type, is_temp, add_context); - return all_done; } @@ -1369,16 +1383,21 @@ ProcessUtilityPost(PlannedStmt *pstmt, add_context = true; exec_type = EXEC_ON_ALL_NODES; break; - + case T_DropdbStmt: + case T_DropRoleStmt: case T_DropTableSpaceStmt: +#ifdef __TBASE__ + if (LOCAL_PARALLEL_DDL) + break; +#endif + exec_type = EXEC_ON_ALL_NODES; + break; case T_AlterTableSpaceOptionsStmt: case T_GrantRoleStmt: case T_AlterDatabaseSetStmt: - case T_DropdbStmt: case T_CreateRoleStmt: case T_AlterRoleStmt: case T_AlterRoleSetStmt: - case T_DropRoleStmt: case T_ReassignOwnedStmt: case T_LockStmt: case T_AlterOwnerStmt: @@ -1658,6 +1677,10 @@ ProcessUtilityPost(PlannedStmt *pstmt, break; case T_CreateSeqStmt: +#ifdef __TBASE__ + if (LOCAL_PARALLEL_DDL) + break; +#endif if (IS_PGXC_LOCAL_COORDINATOR) { CreateSeqStmt *stmt = (CreateSeqStmt *) parsetree; @@ -1781,22 +1804,117 @@ ProcessUtilityPost(PlannedStmt *pstmt, static void parallel_ddl_process(Node *node) { - if (!enable_parallel_ddl || 
!IS_PGXC_LOCAL_COORDINATOR) + /* + * set is_txn_has_parallel_ddl to be false in case of combination command + * that include some type support parallel ddl and some unsupport parallel + * ddl. eg: create extension which include T_CreateFunctionStmt and + * T_CreateOpClassStmt and so on. + */ + if (is_txn_has_parallel_ddl && nodeTag(node) != T_RemoteQuery) + { + is_txn_has_parallel_ddl = false; + } + + if (!enable_parallel_ddl) { return ; } switch (nodeTag(node)) { + case T_AlterTableStmt: + case T_AlterDatabaseStmt: + case T_AlterDatabaseSetStmt: + case T_AlterRoleSetStmt: + break; + case T_AlterOwnerStmt: + { + AlterOwnerStmt *stmt = (AlterOwnerStmt *) node; + switch (stmt->objectType) + { + case OBJECT_DATABASE: + case OBJECT_SCHEMA: + case OBJECT_TABLE: + case OBJECT_FUNCTION: + case OBJECT_TYPE: + break; + default: + return; + } + } + break; + case T_AlterObjectSchemaStmt: + { + AlterObjectSchemaStmt *stmt = (AlterObjectSchemaStmt *) node; + switch (stmt->objectType) + { + case OBJECT_TABLE: + case OBJECT_FUNCTION: + case OBJECT_VIEW: + case OBJECT_TYPE: + break; + default: + return; + } + } + break; + case T_AlterSeqStmt: case T_CreateStmt: case T_CreateForeignTableStmt: case T_CreateTableAsStmt: case T_CreateSchemaStmt: - case T_AlterTableStmt: - case T_DefineStmt: + case T_CreateTableSpaceStmt: + case T_CreatedbStmt: + case T_CreateRoleStmt: + case T_CompositeTypeStmt: + case T_CreateEnumStmt: + case T_CreateRangeStmt: + case T_CreateSeqStmt: + case T_CreateFunctionStmt: + case T_ViewStmt: + case T_DropTableSpaceStmt: + case T_DropdbStmt: + case T_DropRoleStmt: + break; case T_DropStmt: + { + DropStmt *stmt = (DropStmt *)node; + switch (stmt->removeType) + { + case OBJECT_INDEX: + case OBJECT_SEQUENCE: + case OBJECT_TABLE: + case OBJECT_VIEW: + case OBJECT_MATVIEW: + case OBJECT_FOREIGN_TABLE: + case OBJECT_SCHEMA: + case OBJECT_FUNCTION: + case OBJECT_TYPE: + break; + default: + return; + } + } + break; case T_RenameStmt: - case T_TruncateStmt: + { + RenameStmt *stmt = (RenameStmt *)node; + switch (stmt->renameType) + { + case OBJECT_DATABASE: + case OBJECT_SCHEMA: + case OBJECT_ROLE: + case OBJECT_TABLE: + case OBJECT_INDEX: + case OBJECT_VIEW: + case OBJECT_FUNCTION: + case OBJECT_TYPE: + break; + default: + return; + } + } + break; case T_IndexStmt: /* CONCURRENT INDEX is not supported */ if (IsA(node,IndexStmt) && castNode(IndexStmt,node)->concurrent) @@ -1804,6 +1922,9 @@ parallel_ddl_process(Node *node) return ; } break; + case T_TruncateStmt: + case T_ReindexStmt: + break; default: return ; } @@ -2067,16 +2188,72 @@ standard_ProcessUtility(PlannedStmt *pstmt, /* no event triggers for global objects */ if (IS_PGXC_LOCAL_COORDINATOR) PreventTransactionChain(isTopLevel, "CREATE TABLESPACE"); +#ifdef __TBASE__ + /* + * If I am the main execute CN but not Leader CN, + * Notify the Leader CN to create firstly. 
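+			 * (The leader CN is the coordinator with the smallest node name;
+			 * see find_ddl_leader_cn(). Executing the DDL there first gives
+			 * statements issued from different coordinators a single ordering
+			 * point.)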
+ */ + if (!sentToRemote && LOCAL_PARALLEL_DDL) + { + SendLeaderCNUtilityWithContext(queryString, false); + } +#endif CreateTableSpace((CreateTableSpaceStmt *) parsetree); break; case T_DropTableSpaceStmt: - /* no event triggers for global objects */ - /* Allow this to be run inside transaction block on remote nodes */ + { + DropTableSpaceStmt *stmt = (DropTableSpaceStmt *)parsetree; + /* + * no event triggers for global objects + * Allow this to be run inside transaction block on remote nodes + */ if (IS_PGXC_LOCAL_COORDINATOR) PreventTransactionChain(isTopLevel, "DROP TABLESPACE"); - DropTableSpace((DropTableSpaceStmt *) parsetree); +#ifdef __TBASE__ + /* + * If I am the main execute CN but not Leader CN, + * Notify the Leader CN to create firstly. + */ + if (!sentToRemote && LOCAL_PARALLEL_DDL) + { + PGXCNodeHandle *leaderCnHandle = NULL; + leaderCnHandle = find_ddl_leader_cn(); + if (!is_ddl_leader_cn(leaderCnHandle->nodename)) + { + if (PreCheckforDropTableSpace(stmt)) + { + SendLeaderCNUtility(queryString, false); + DropTableSpace(stmt, false); + ExecUtilityStmtOnNodes(parsetree, queryString, + NULL, sentToRemote, false, + EXEC_ON_ALL_NODES, false, + false); + } + } + else if (DropTableSpace(stmt, stmt->missing_ok)) + { + ExecUtilityStmtOnNodes(parsetree, queryString, + NULL, sentToRemote, false, + EXEC_ON_ALL_NODES, false, + false); + } + } + /* From remote cn */ + else if (!IS_PGXC_LOCAL_COORDINATOR && is_txn_has_parallel_ddl) + { + DropTableSpace(stmt, false); + } + /* non parallel ddl mode */ + else + { + DropTableSpace(stmt, stmt->missing_ok); + } +#else + DropTableSpace(stmt, stmt->missing_ok); +#endif + } break; case T_AlterTableSpaceOptionsStmt: @@ -2085,6 +2262,31 @@ standard_ProcessUtility(PlannedStmt *pstmt, break; case T_TruncateStmt: +#ifdef __TBASE__ + /* + * If I am the main execute CN but not Leader CN, + * Notify the Leader CN to create firstly. + */ + if (!sentToRemote && LOCAL_PARALLEL_DDL) + { + bool is_temp = false; + ListCell *cell; + foreach (cell, ((TruncateStmt *) parsetree)->relations) + { + Oid relid; + RangeVar* rel = (RangeVar*)lfirst(cell); + + relid = RangeVarGetRelid(rel, NoLock, false); + + if (IsTempTable(relid)) + { + is_temp = true; + break; + } + } + SendLeaderCNUtility(queryString, is_temp); + } +#endif ExecuteTruncate((TruncateStmt *) parsetree); break; @@ -2127,31 +2329,118 @@ standard_ProcessUtility(PlannedStmt *pstmt, /* no event triggers for global objects */ if (IS_PGXC_LOCAL_COORDINATOR) PreventTransactionChain(isTopLevel, "CREATE DATABASE"); + +#ifdef __TBASE__ + /* + * If I am the main execute CN but not Leader CN, + * Notify the Leader CN to create firstly. + */ + if (!sentToRemote && LOCAL_PARALLEL_DDL) + { + SendLeaderCNUtilityWithContext(queryString, false); + } +#endif + createdb(pstate, (CreatedbStmt *) parsetree); break; case T_AlterDatabaseStmt: +#ifdef __TBASE__ + /* + * If I am the main execute CN but not Leader CN, + * Notify the Leader CN to create firstly. + */ + if (!sentToRemote && LOCAL_PARALLEL_DDL) + { + /* + * If this is not a SET TABLESPACE statement, just propogate + * the cmd as usual. 
+ */ + if (IsSetTableSpace((AlterDatabaseStmt*) parsetree)) + SendLeaderCNUtility(queryString, false); + else + SendLeaderCNUtilityWithContext(queryString, false); + } +#endif /* no event triggers for global objects */ AlterDatabase(pstate, (AlterDatabaseStmt *) parsetree, isTopLevel); break; case T_AlterDatabaseSetStmt: +#ifdef __TBASE__ + /* + * If I am the main execute CN but not Leader CN, + * Notify the Leader CN to create firstly. + */ + if (!sentToRemote && LOCAL_PARALLEL_DDL) + { + SendLeaderCNUtility(queryString, false); + } +#endif /* no event triggers for global objects */ AlterDatabaseSet((AlterDatabaseSetStmt *) parsetree); break; case T_DropdbStmt: { + char prepareQuery[STRINGLENGTH]; DropdbStmt *stmt = (DropdbStmt *) parsetree; - if (!stmt->prepare) { + bool missing_ok = stmt->missing_ok; /* no event triggers for global objects */ if (IS_PGXC_LOCAL_COORDINATOR) { PreventTransactionChain(isTopLevel, "DROP DATABASE"); } - dropdb(stmt->dbname, stmt->missing_ok); + /* + * If I am the main execute CN but not Leader CN, + * Notify the Leader CN to drop firstly. + */ + if (!sentToRemote && LOCAL_PARALLEL_DDL) + { + PGXCNodeHandle *leaderCnHandle = NULL; + Oid db_oid = InvalidOid; + leaderCnHandle = find_ddl_leader_cn(); + + db_oid = get_database_oid(stmt->dbname, missing_ok); + + if (OidIsValid(db_oid)) + { + snprintf(prepareQuery, STRINGLENGTH, "DROP DATABASE PREPARE %s;", + quote_identifier(stmt->dbname)); + if (!is_ddl_leader_cn(leaderCnHandle->nodename)) + SendLeaderCNUtility(prepareQuery, false); + else + dropdb_prepare(stmt->dbname, false); + ExecUtilityStmtOnNodes(parsetree, prepareQuery, + NULL, sentToRemote, false, + EXEC_ON_ALL_NODES, false, + false); + + if (!is_ddl_leader_cn(leaderCnHandle->nodename)) + SendLeaderCNUtility(queryString, false); + } + else + break; + } + /* + * In parallel ddl mode, we only send cmd to remote when + * database exists, so database can not miss when the cmd + * come from remote cn. + */ + if (!IS_PGXC_LOCAL_COORDINATOR && is_txn_has_parallel_ddl) + { + missing_ok = false; + } + + if (dropdb(stmt->dbname, missing_ok) && LOCAL_PARALLEL_DDL) + { + ExecUtilityStmtOnNodes(parsetree, queryString, NULL, + sentToRemote, false, + EXEC_ON_ALL_NODES, false, + false); + } } else { @@ -2301,6 +2590,16 @@ standard_ProcessUtility(PlannedStmt *pstmt, * ******************************** ROLE statements **** */ case T_CreateRoleStmt: +#ifdef __TBASE__ + /* + * If I am the main execute CN but not Leader CN, + * Notify the Leader CN to create firstly. + */ + if (!sentToRemote && LOCAL_PARALLEL_DDL) + { + SendLeaderCNUtility(queryString, false); + } +#endif /* no event triggers for global objects */ CreateRole(pstate, (CreateRoleStmt *) parsetree); break; @@ -2311,13 +2610,29 @@ standard_ProcessUtility(PlannedStmt *pstmt, break; case T_AlterRoleSetStmt: +#ifdef __TBASE__ + /* + * If I am the main execute CN but not Leader CN, + * Notify the Leader CN to create firstly. 
+ */ + if (!sentToRemote && LOCAL_PARALLEL_DDL) + { + SendLeaderCNUtility(queryString, false); + } +#endif /* no event triggers for global objects */ AlterRoleSet((AlterRoleSetStmt *) parsetree); break; case T_DropRoleStmt: + { +#ifdef __TBASE__ + CheckAndDropRole(parsetree, sentToRemote, queryString); +#else /* no event triggers for global objects */ - DropRole((DropRoleStmt *) parsetree); + DropRole(stmt, stmt->missing_ok, NULL); +#endif + } break; case T_ReassignOwnedStmt: @@ -2368,9 +2683,17 @@ standard_ProcessUtility(PlannedStmt *pstmt, switch (stmt->kind) { case REINDEX_OBJECT_INDEX: +#ifdef __TBASE__ + CheckAndSendLeaderCNReindex(sentToRemote, stmt, + queryString); +#endif ReindexIndex(stmt->relation, stmt->options); break; case REINDEX_OBJECT_TABLE: +#ifdef __TBASE__ + CheckAndSendLeaderCNReindex(sentToRemote, stmt, + queryString); +#endif ReindexTable(stmt->relation, stmt->options); break; case REINDEX_OBJECT_SCHEMA: @@ -2437,11 +2760,34 @@ standard_ProcessUtility(PlannedStmt *pstmt, RenameStmt *stmt = (RenameStmt *) parsetree; if (EventTriggerSupportsObjectType(stmt->renameType)) + { ProcessUtilitySlow(pstate, pstmt, queryString, context, params, queryEnv, dest, sentToRemote, completionTag); + } +#ifdef __TBASE__ + else if (LOCAL_PARALLEL_DDL) + { + bool is_temp = false; + PGXCNodeHandle *leaderCnHandle = find_ddl_leader_cn(); + bool is_leader_cn = is_ddl_leader_cn(leaderCnHandle->nodename); + RemoteQueryExecType exec_type = GetRenameExecType(stmt, &is_temp); + /* + * If I am the main execute CN but not Leader CN, + * Notify the Leader CN to create firstly. + */ + if (!is_leader_cn) + { + SendLeaderCNUtility(queryString, is_temp); + } + ExecRenameStmt(stmt); + ExecUtilityStmtOnNodes(parsetree, queryString, NULL, + sentToRemote, false, exec_type, + is_temp, false); + } +#endif else ExecRenameStmt(stmt); @@ -2489,8 +2835,20 @@ standard_ProcessUtility(PlannedStmt *pstmt, sentToRemote, completionTag); else + { +#ifdef __TBASE__ + /* + * If I am the main execute CN but not Leader CN, + * Notify the Leader CN to create firstly. 
+ */ + if (!sentToRemote && LOCAL_PARALLEL_DDL) + { + SendLeaderCNUtility(queryString, false); + } +#endif ExecAlterOwnerStmt(stmt); } + } break; case T_CommentStmt: @@ -2550,7 +2908,9 @@ standard_ProcessUtility(PlannedStmt *pstmt, /* only if am the original session I will revoke other nodes to do the create sharding job */ if(IS_PGXC_COORDINATOR && !IsConnFromCoord()) { - ExecUtilityStmtOnNodes(parsetree, queryString, NULL, sentToRemote, false, EXEC_ON_COORDS, false, false); + ExecUtilityStmtOnNodes(parsetree, queryString, NULL, + sentToRemote, false, EXEC_ON_COORDS, + false, false); execnodes = (ExecNodes *)makeNode(ExecNodes); for(i = 0; i < nodenum; i++) @@ -2566,8 +2926,8 @@ standard_ProcessUtility(PlannedStmt *pstmt, execnodes->nodeList = lappend_int(execnodes->nodeList, nodeIndex[i]); ExecUtilityStmtOnNodes(parsetree, queryString, execnodes, - sentToRemote, false, - EXEC_ON_DATANODES, false, false); + sentToRemote, false, EXEC_ON_DATANODES, + false, false); list_free(execnodes->nodeList); execnodes->nodeList = NIL; } @@ -2622,7 +2982,9 @@ standard_ProcessUtility(PlannedStmt *pstmt, /* Send Move Data Command to All Coordinator, * BUT,it is necessary to add new node to all the Coordinators independently */ - ExecUtilityStmtOnNodes(parsetree, queryString, NULL, sentToRemote, false, EXEC_ON_COORDS, false, false); + ExecUtilityStmtOnNodes(parsetree, queryString, NULL, + sentToRemote, false, EXEC_ON_COORDS, + false, false); /* generate new query string to datanode s*/ switch (stmt->strategy) @@ -2702,8 +3064,8 @@ standard_ProcessUtility(PlannedStmt *pstmt, /* Send Move Data Command to Data Node */ ExecUtilityStmtOnNodes(parsetree, movecmd, execnodes, - sentToRemote, false, - EXEC_ON_DATANODES, false, false); + sentToRemote, false, EXEC_ON_DATANODES, + false, false); pfree(qstring_tonode->data); pfree(qstring_tonode); @@ -2754,7 +3116,9 @@ standard_ProcessUtility(PlannedStmt *pstmt, ExecNodes *execnodes; /* drop remote coord sharding map */ - ExecUtilityStmtOnNodes(parsetree, queryString, NULL, sentToRemote, false, EXEC_ON_COORDS, false, false); + ExecUtilityStmtOnNodes(parsetree, queryString, NULL, + sentToRemote, false, EXEC_ON_COORDS, + false, false); /* drop datanodes sharding map */ GetGroupNodesByNameOrder(group, nodeIndex, &nodenum); @@ -2773,8 +3137,8 @@ standard_ProcessUtility(PlannedStmt *pstmt, execnodes->nodeList = lappend_int(execnodes->nodeList, nodeIndex[i]); } ExecUtilityStmtOnNodes(parsetree, queryString, execnodes, - sentToRemote, false, - EXEC_ON_DATANODES, false, false); + sentToRemote, false, EXEC_ON_DATANODES, + false, false); list_free(execnodes->nodeList); pfree(execnodes); } @@ -2810,7 +3174,9 @@ standard_ProcessUtility(PlannedStmt *pstmt, /* Send cleansharding msg to all other cn and dn */ if (IS_PGXC_LOCAL_COORDINATOR) { - ExecUtilityStmtOnNodes(parsetree, queryString, NULL, sentToRemote, false, EXEC_ON_ALL_NODES, false, false); + ExecUtilityStmtOnNodes(parsetree, queryString, NULL, + sentToRemote, false, EXEC_ON_ALL_NODES, + false, false); } /* Then cleansharding self */ ForceRefreshShardMap(InvalidOid); @@ -2867,11 +3233,13 @@ standard_ProcessUtility(PlannedStmt *pstmt, elog(ERROR, "innel error: datanode %d cannot be found.", tooid); execnodes->nodeList = lappend_int(execnodes->nodeList,toidx); ExecUtilityStmtOnNodes(NULL, "CLEAN SHARDING;", execnodes, - sentToRemote, false, - EXEC_ON_DATANODES, false, false); + sentToRemote, false, EXEC_ON_DATANODES, + false, false); //second clean sharding of all cooridnators - ExecUtilityStmtOnNodes(parsetree, queryString, 
NULL, sentToRemote, false, EXEC_ON_COORDS, false, false); + ExecUtilityStmtOnNodes(parsetree, queryString, NULL, + sentToRemote, false, EXEC_ON_COORDS, + false, false); //and self ForceRefreshShardMap(InvalidOid); @@ -2895,8 +3263,8 @@ standard_ProcessUtility(PlannedStmt *pstmt, execnodes->nodeList = lappend_int(execnodes->nodeList, nodeindex); } ExecUtilityStmtOnNodes(NULL, "CLEAN SHARDING;", execnodes, - sentToRemote, false, - EXEC_ON_DATANODES, false, false); + sentToRemote, false, EXEC_ON_DATANODES, + false, false); //finally clean sharding at from datanode @@ -2909,8 +3277,8 @@ standard_ProcessUtility(PlannedStmt *pstmt, execnodes->nodeList = lappend_int(execnodes->nodeList,fromidx); ExecUtilityStmtOnNodes(NULL, "CLEAN SHARDING;", execnodes, - sentToRemote, false, - EXEC_ON_DATANODES, false, false); + sentToRemote, false, EXEC_ON_DATANODES, + false, false); list_free(execnodes->nodeList); pfree(execnodes); } @@ -3000,7 +3368,9 @@ standard_ProcessUtility(PlannedStmt *pstmt, || (CREATE_KEY_VALUE_EXEC_CN == g_create_key_value_mode)) { /* first tell other coord node to create */ - ExecUtilityStmtOnNodes(parsetree, queryString, NULL, sentToRemote, false, EXEC_ON_COORDS, false, false); + ExecUtilityStmtOnNodes(parsetree, queryString, NULL, + sentToRemote, false, EXEC_ON_COORDS, + false, false); } if ((CREATE_KEY_VALUE_EXEC_ALL == g_create_key_value_mode) @@ -3021,8 +3391,8 @@ standard_ProcessUtility(PlannedStmt *pstmt, execnodes->nodeList = lappend_int(execnodes->nodeList, nodeIndex[i]); ExecUtilityStmtOnNodes(parsetree, queryString, execnodes, - sentToRemote, false, - EXEC_ON_DATANODES, false, false); + sentToRemote, false, EXEC_ON_DATANODES, + false, false); list_free(execnodes->nodeList); execnodes->nodeList = NIL; } @@ -3045,8 +3415,8 @@ standard_ProcessUtility(PlannedStmt *pstmt, execnodes->nodeList = lappend_int(execnodes->nodeList, nodeIndex[i]); ExecUtilityStmtOnNodes(parsetree, queryString, execnodes, - sentToRemote, false, - EXEC_ON_DATANODES, false, false); + sentToRemote, false, EXEC_ON_DATANODES, + false, false); list_free(execnodes->nodeList); execnodes->nodeList = NIL; } @@ -3148,6 +3518,16 @@ ProcessUtilitySlow(ParseState *pstate, * relation and attribute manipulation */ case T_CreateSchemaStmt: +#ifdef __TBASE__ + /* + * If I am the main execute CN but not Leader CN, + * Notify the Leader CN to create firstly. + */ + if (!sentToRemote && LOCAL_PARALLEL_DDL) + { + SendLeaderCNUtility(queryString, false); + } +#endif CreateSchemaCommand((CreateSchemaStmt *) parsetree, queryString, sentToRemote, pstmt->stmt_location, @@ -3172,6 +3552,13 @@ ProcessUtilitySlow(ParseState *pstate, PGXCSubCluster *subcluster = NULL; #endif +#ifdef __TBASE__ + Oid nspaceid; + bool exist_ok = true; + + if (is_txn_has_parallel_ddl && IsConnFromCoord()) + exist_ok = false; + /* Run parse analysis ... */ /* * If sentToRemote is set it is either EXECUTE DIRECT or part @@ -3181,14 +3568,18 @@ ProcessUtilitySlow(ParseState *pstate, * it should explicitly specify distribution. 
*/ stmts = transformCreateStmt((CreateStmt *) parsetree, - queryString, !is_local && !sentToRemote); + queryString, !is_local && !sentToRemote, + &nspaceid, exist_ok); -#ifdef __TBASE__ if (NULL == stmts) { commandCollected = true; break; } + +#else + stmts = transformCreateStmt((CreateStmt *) parsetree, + queryString, !is_local && !sentToRemote); #endif if (IS_PGXC_LOCAL_COORDINATOR) @@ -3278,6 +3669,29 @@ ProcessUtilitySlow(ParseState *pstate, } } } +#ifdef __TBASE__ + /* + * If I am the main execute CN but not Leader CN, + * Notify the Leader CN to create firstly. + */ + if (!sentToRemote && LOCAL_PARALLEL_DDL) + { + PGXCNodeHandle *leader_cn = find_ddl_leader_cn(); + if (!is_ddl_leader_cn(leader_cn->nodename)) + { + /* + * Unlock namespace before send to Leader CN + * in case of concurrent drop schema and create + * schema.xxx dead lock. + */ + UnlockDatabaseObject(NamespaceRelationId, nspaceid, + 0, AccessShareLock); + SendLeaderCNUtility(queryString, is_temp); + LockDatabaseObject(NamespaceRelationId, nspaceid, + 0, AccessShareLock); + } + } +#endif #ifdef __COLD_HOT__ /* Add check overlap remote query on top of query tree */ if (subcluster && distributeby) @@ -3474,8 +3888,9 @@ ProcessUtilitySlow(ParseState *pstate, { if (auditString != NULL) { - ExecUtilityStmtOnNodes(parsetree, auditString, NULL, sentToRemote, true, - EXEC_ON_ALL_NODES, false, false); + ExecUtilityStmtOnNodes(parsetree, auditString, NULL, + sentToRemote, true, EXEC_ON_ALL_NODES, + false, false); } } @@ -3493,8 +3908,9 @@ ProcessUtilitySlow(ParseState *pstate, { if (cleanString != NULL) { - ExecUtilityStmtOnNodes(parsetree, cleanString, NULL, sentToRemote, true, - EXEC_ON_ALL_NODES, false, false); + ExecUtilityStmtOnNodes(parsetree, cleanString, NULL, + sentToRemote, true, EXEC_ON_ALL_NODES, + false, false); } } @@ -3520,6 +3936,36 @@ ProcessUtilitySlow(ParseState *pstate, * permissions. */ lockmode = AlterTableGetLockLevel(atstmt->cmds); +#ifdef __TBASE__ + /* + * If I am the main execute CN but not Leader CN, + * Notify the Leader CN to create firstly. + */ + if (!sentToRemote && LOCAL_PARALLEL_DDL) + { + bool is_temp = false; + PGXCNodeHandle *leaderCnHandle = find_ddl_leader_cn(); + if (!is_ddl_leader_cn(leaderCnHandle->nodename)) + { + relid = RangeVarGetRelid(atstmt->relation, + lockmode, true); + if (OidIsValid(relid)) + { + ExecUtilityFindNodes(atstmt->relkind, + relid, &is_temp); + UnlockRelationOid(relid, lockmode); + SendLeaderCNUtility(queryString, is_temp); + } + else + { + ereport(NOTICE, + (errmsg("relation \"%s\" does not exist, skipping", + atstmt->relation->relname))); + break; + } + } + } +#endif relid = AlterTableLookupRelation(atstmt, lockmode); if (OidIsValid(relid)) @@ -3543,7 +3989,6 @@ ProcessUtilitySlow(ParseState *pstate, exec_type = ExecUtilityFindNodes(atstmt->relkind, relid, &is_temp); - stmts = AddRemoteQueryNode(stmts, queryString, exec_type); } } @@ -3736,12 +4181,44 @@ ProcessUtilitySlow(ParseState *pstate, List *inheritors = NIL; #ifdef __TBASE__ Relation rel = NULL; + bool istemp = false; #endif if (stmt->concurrent) PreventTransactionChain(isTopLevel, "CREATE INDEX CONCURRENTLY"); +#ifdef __TBASE__ + if (!sentToRemote && LOCAL_PARALLEL_DDL) + { + relid = RangeVarGetRelidExtended(stmt->relation, + AccessShareLock, true, + false, NULL, NULL); + if (OidIsValid(relid)) + { + RemoteQueryExecType exectype; + exectype = ExecUtilityFindNodes(OBJECT_INDEX, + relid, &istemp); + + /* + * If I am the main execute CN but not Leader CN, + * Notify the Leader CN to create firstly. 
+ */ + if (exectype == EXEC_ON_ALL_NODES || + exectype == EXEC_ON_COORDS) + { + PGXCNodeHandle *leaderCnHandle; + leaderCnHandle = find_ddl_leader_cn(); + if (!is_ddl_leader_cn(leaderCnHandle->nodename)) + { + UnlockRelationOid(relid, AccessShareLock); + SendLeaderCNUtility(queryString, istemp); + } + } + } + } +#endif + /* * Look up the relation OID just once, right here at the * beginning, so that we don't end up repeating the name @@ -3758,7 +4235,6 @@ ProcessUtilitySlow(ParseState *pstate, false, false, RangeVarCallbackOwnsRelation, NULL); - #if 0 /* could not create index on interval child table directly */ if (OidIsValid(relid)) @@ -3984,7 +4460,11 @@ ProcessUtilitySlow(ParseState *pstate, queryString); /* Send prepare extension msg to all other cn and dn */ extension_query_string = qstring->data; - ExecUtilityStmtOnNodes(parsetree, extension_query_string, NULL, sentToRemote, false, EXEC_ON_ALL_NODES, false, false); + ExecUtilityStmtOnNodes(parsetree, + extension_query_string, + NULL, sentToRemote, false, + EXEC_ON_ALL_NODES, + false, false); /* stage 2 */ ExecuteExtension(pstate, (CreateExtensionStmt *) parsetree); @@ -3994,7 +4474,11 @@ ProcessUtilitySlow(ParseState *pstate, queryString); /* Send execute extension msg to all other cn and dn */ extension_query_string = qstring->data; - ExecUtilityStmtOnNodes(parsetree, extension_query_string, NULL, sentToRemote, false, EXEC_ON_ALL_NODES, false, false); + ExecUtilityStmtOnNodes(parsetree, + extension_query_string, + NULL, sentToRemote, false, + EXEC_ON_ALL_NODES, + false, false); pfree(qstring->data); pfree(qstring); @@ -4063,17 +4547,46 @@ ProcessUtilitySlow(ParseState *pstate, case T_CompositeTypeStmt: /* CREATE TYPE (composite) */ { CompositeTypeStmt *stmt = (CompositeTypeStmt *) parsetree; - +#ifdef __TBASE__ + /* + * If I am the main execute CN but not Leader CN, + * Notify the Leader CN to create firstly. + */ + if (!sentToRemote && LOCAL_PARALLEL_DDL) + { + SendLeaderCNUtility(queryString, false); + } +#endif address = DefineCompositeType(stmt->typevar, stmt->coldeflist); } break; case T_CreateEnumStmt: /* CREATE TYPE AS ENUM */ +#ifdef __TBASE__ + /* + * If I am the main execute CN but not Leader CN, + * Notify the Leader CN to create firstly. + */ + if (!sentToRemote && LOCAL_PARALLEL_DDL) + { + SendLeaderCNUtility(queryString, false); + } +#endif address = DefineEnum((CreateEnumStmt *) parsetree); break; case T_CreateRangeStmt: /* CREATE TYPE AS RANGE */ +#ifdef __TBASE__ + /* + * If I am the main execute CN but not Leader CN, + * Notify the Leader CN to create firstly. + */ + if (!sentToRemote && LOCAL_PARALLEL_DDL) + { + SendLeaderCNUtility(queryString, false); + } +#endif address = DefineRange((CreateRangeStmt *) parsetree); break; @@ -4083,6 +4596,37 @@ ProcessUtilitySlow(ParseState *pstate, case T_ViewStmt: /* CREATE VIEW */ EventTriggerAlterTableStart(parsetree); +#ifdef __TBASE__ + /* + * If I am the main execute CN but not Leader CN, + * Notify the Leader CN to create firstly. 
+ */ + if (!sentToRemote && LOCAL_PARALLEL_DDL) + { + PGXCNodeHandle *leaderCnHandle = NULL; + leaderCnHandle = find_ddl_leader_cn(); + if (!is_ddl_leader_cn(leaderCnHandle->nodename)) + { + List *relation_list = NIL; + ListCell *lc; + bool tmp = IsViewTemp(((ViewStmt*)parsetree), + queryString, + pstmt->stmt_location, + pstmt->stmt_len, + &relation_list); + + /* Unlock before we send to leander cn */ + foreach(lc, relation_list) + { + Oid reloid = lfirst_oid(lc); + UnlockRelationOid(reloid, AccessShareLock); + } + if (!tmp) + SendLeaderCNUtility(queryString, tmp); + + } + } +#endif address = DefineView((ViewStmt *) parsetree, queryString, pstmt->stmt_location, pstmt->stmt_len); EventTriggerCollectSimpleCommand(address, secondaryObject, @@ -4093,6 +4637,16 @@ ProcessUtilitySlow(ParseState *pstate, break; case T_CreateFunctionStmt: /* CREATE FUNCTION */ +#ifdef __TBASE__ + /* + * If I am the main execute CN but not Leader CN, + * Notify the Leader CN to create firstly. + */ + if (!sentToRemote && LOCAL_PARALLEL_DDL) + { + SendLeaderCNUtility(queryString, false); + } +#endif address = CreateFunction(pstate, (CreateFunctionStmt *) parsetree); break; @@ -4104,27 +4658,91 @@ ProcessUtilitySlow(ParseState *pstate, address = DefineRule((RuleStmt *) parsetree, queryString); break; - case T_CreateSeqStmt: - address = DefineSequence(pstate, (CreateSeqStmt *) parsetree); + case T_CreateSeqStmt: +#ifdef __TBASE__ + { + bool need_send = false; + bool is_temp = false; + bool exist_ok = !is_txn_has_parallel_ddl; + CreateSeqStmt *stmt = (CreateSeqStmt *) parsetree; + if (!stmt->is_serial) + { + is_temp = stmt->sequence->relpersistence == RELPERSISTENCE_TEMP; + } + + if (!sentToRemote && LOCAL_PARALLEL_DDL) + { + PGXCNodeHandle *leaderCnHandle = NULL; + need_send = PrecheckDefineSequence(stmt); + leaderCnHandle = find_ddl_leader_cn(); + + if (!need_send) + break; + + /* + * If I am the main execute CN but not Leader CN, + * Notify the Leader CN to create firstly. + */ + if (!is_ddl_leader_cn(leaderCnHandle->nodename)) + { + if (!is_temp && need_send) + SendLeaderCNUtility(queryString, is_temp); + } + } + + address = DefineSequence(pstate, stmt, exist_ok); + + if (is_temp) + { + PoolManagerSetCommand(NULL, 0, POOL_CMD_TEMP, NULL); + } + + if (need_send) + { + RemoteQueryExecType exec_type = + is_temp ? EXEC_ON_DATANODES : EXEC_ON_ALL_NODES; + ExecUtilityStmtOnNodes(parsetree, queryString, NULL, + sentToRemote, false, exec_type, + is_temp, false); + } + } +#else + address = DefineSequence(pstate, (CreateSeqStmt *) parsetree); +#endif + break; + + case T_AlterSeqStmt: #ifdef __TBASE__ + if (!sentToRemote && LOCAL_PARALLEL_DDL) { + AlterSeqStmt *stmt = (AlterSeqStmt *) parsetree; bool is_temp = false; - CreateSeqStmt *stmt = (CreateSeqStmt *) parsetree; - - if (!stmt->is_serial) + PGXCNodeHandle *leaderCnHandle = NULL; + leaderCnHandle = find_ddl_leader_cn(); + /* + * If I am the main execute CN but not Leader CN, + * Notify the Leader CN to create firstly. 
+ */ + if (!is_ddl_leader_cn(leaderCnHandle->nodename)) { - is_temp = stmt->sequence->relpersistence == RELPERSISTENCE_TEMP; + Oid relid = RangeVarGetRelid(stmt->sequence, + NoLock, stmt->missing_ok); + RemoteQueryExecType exec_type = EXEC_ON_NONE; + if (!OidIsValid(relid)) + { + break; } - - if (is_temp) + exec_type = ExecUtilityFindNodes(OBJECT_SEQUENCE, + relid, &is_temp); + if (exec_type == EXEC_ON_ALL_NODES || + exec_type == EXEC_ON_COORDS) { - PoolManagerSetCommand(NULL, 0, POOL_CMD_TEMP, NULL); + SendLeaderCNUtility(queryString, is_temp); } } -#endif - break; - case T_AlterSeqStmt: + } +#endif address = AlterSequence(pstate, (AlterSeqStmt *) parsetree); break; @@ -4133,6 +4751,16 @@ ProcessUtilitySlow(ParseState *pstate, CreateTableAsStmt *stmt = (CreateTableAsStmt *) parsetree; if (IS_PGXC_DATANODE && stmt->relkind == OBJECT_MATVIEW) stmt->into->skipData = true; +#ifdef __TBASE__ + /* + * If I am the main execute CN but not Leader CN, + * Notify the Leader CN to create firstly. + */ + if (!sentToRemote && LOCAL_PARALLEL_DDL) + { + SendLeaderCNUtility(queryString, false); + } +#endif address = ExecCreateTableAs((CreateTableAsStmt *) parsetree, queryString, params, queryEnv, completionTag); @@ -4286,7 +4914,33 @@ ProcessUtilitySlow(ParseState *pstate, break; case T_RenameStmt: - address = ExecRenameStmt((RenameStmt *) parsetree); + { + RenameStmt * stmt = (RenameStmt *) parsetree; +#ifdef __TBASE__ + if (LOCAL_PARALLEL_DDL) + { + bool is_temp = false; + PGXCNodeHandle *leaderCnHandle = find_ddl_leader_cn(); + bool is_leader_cn = is_ddl_leader_cn(leaderCnHandle->nodename); + RemoteQueryExecType exec_type = GetRenameExecType(stmt, &is_temp); + + /* + * If I am the main execute CN but not Leader CN, + * Notify the Leader CN to create firstly. + */ + if (!is_leader_cn) + { + SendLeaderCNUtility(queryString, is_temp); + } + address = ExecRenameStmt(stmt); + ExecUtilityStmtOnNodes(parsetree, queryString, NULL, + sentToRemote, false, exec_type, + is_temp, false); + } + else +#endif + address = ExecRenameStmt(stmt); + } break; case T_AlterObjectDependsStmt: @@ -4296,12 +4950,32 @@ ProcessUtilitySlow(ParseState *pstate, break; case T_AlterObjectSchemaStmt: +#ifdef __TBASE__ + /* + * If I am the main execute CN but not Leader CN, + * Notify the Leader CN to create firstly. + */ + if (!sentToRemote && LOCAL_PARALLEL_DDL) + { + SendLeaderCNUtility(queryString, false); + } +#endif address = ExecAlterObjectSchemaStmt((AlterObjectSchemaStmt *) parsetree, &secondaryObject); break; case T_AlterOwnerStmt: +#ifdef __TBASE__ + /* + * If I am the main execute CN but not Leader CN, + * Notify the Leader CN to create firstly. + */ + if (!sentToRemote && LOCAL_PARALLEL_DDL) + { + SendLeaderCNUtility(queryString, false); + } +#endif address = ExecAlterOwnerStmt((AlterOwnerStmt *) parsetree); break; @@ -4444,6 +5118,122 @@ ProcessUtilitySlow(ParseState *pstate, EventTriggerEndCompleteQuery(); } +#ifdef __TBASE__ +/* + * SendLeaderCNUtility + * For parallel ddl, we execute ddl in leader cn firstly + * to avoid deadlock. + */ +void SendLeaderCNUtility(const char *queryString, + bool temp) +{ + PGXCNodeHandle *leaderCnHandle = NULL; + RemoteQuery *step = NULL; + + leaderCnHandle = find_ddl_leader_cn(); + if (is_ddl_leader_cn(leaderCnHandle->nodename)) + return; + + step = makeNode(RemoteQuery); + step->combine_type = COMBINE_TYPE_SAME; + step->sql_statement = pstrdup(queryString); + step->exec_type = temp ? 
EXEC_ON_NONE : EXEC_ON_COORDS; + step->exec_nodes = NULL; + step->is_temp = temp; + ExecRemoteUtility(step, leaderCnHandle, ONLY_LEADER_DDL); + pfree(step); + + leader_cn_executed_ddl = true; +} + +void SendLeaderCNUtilityWithContext(const char *queryString, + bool temp) +{ + PG_TRY(); + { + SendLeaderCNUtility(queryString, temp); + } + PG_CATCH(); + { + + /* + * Some nodes failed. Add context about what all nodes the query + * failed + */ + ExecNodes* coord_success_nodes = NULL; + ExecNodes* data_success_nodes = NULL; + char* msg_failed_nodes = NULL; + + pgxc_all_success_nodes(&data_success_nodes, &coord_success_nodes, &msg_failed_nodes); + if (msg_failed_nodes != NULL) + errcontext("%s", msg_failed_nodes); + PG_RE_THROW(); + } + PG_END_TRY(); +} + +void CheckAndSendLeaderCNReindex(bool sentToRemote, ReindexStmt *stmt, + const char *queryString) +{ + RemoteQueryExecType exec_type = EXEC_ON_NONE; + PGXCNodeHandle *leaderCnHandle = NULL; + + if (sentToRemote || !LOCAL_PARALLEL_DDL) + return; + + /* + * If I am the main execute CN but not Leader CN, notify the Leader CN + * to reindex firstly. + */ + leaderCnHandle = find_ddl_leader_cn(); + if (!is_ddl_leader_cn(leaderCnHandle->nodename)) + { + bool is_temp = false; + Oid relid = RangeVarGetRelid(stmt->relation, AccessShareLock, false); + if (OidIsValid(relid)) + { + exec_type = ExecUtilityFindNodes(stmt->kind, relid, &is_temp); + UnlockRelationOid(relid, AccessShareLock); + } + if (exec_type == EXEC_ON_ALL_NODES || exec_type == EXEC_ON_COORDS) + { + SendLeaderCNUtility(queryString, is_temp); + } + } +} + +#endif + +static RemoteQueryExecType GetRenameExecType(RenameStmt *stmt, bool *is_temp) +{ + RemoteQueryExecType exec_type = EXEC_ON_NONE; + /* + * Get the necessary details about the relation before we + * run ExecRenameStmt locally. Otherwise we may not be able + * to look-up using the old relation name. + */ + if (stmt->relation) + { + /* + * If the table does not exist, don't send the query to + * the remote nodes. The local node will eventually + * report an error, which is then sent back to the + * client. + */ + Oid relid = RangeVarGetRelid(stmt->relation, + NoLock, true); + if (OidIsValid(relid)) + exec_type = ExecUtilityFindNodes(stmt->renameType, + relid, is_temp); + else + exec_type = EXEC_ON_NONE; + } + else + exec_type = ExecUtilityFindNodes(stmt->renameType, + InvalidOid, is_temp); + return exec_type; +} + /* * Dispatch function for DropStmt */ @@ -4489,32 +5279,77 @@ ExecDropStmt(DropStmt *stmt, bool isTopLevel) #ifdef PGXC { bool is_temp = false; + RemoteQueryExecType exec_type = EXEC_ON_ALL_NODES; #ifdef __TBASE__ - int drop_cnt = 0; char *new_query_string = pstrdup(queryString); + ObjectAddresses *new_objects = NULL; + PGXCNodeHandle *leaderCnHandle = NULL; + bool need_sendto_leadercn = false; #endif - RemoteQueryExecType exec_type = EXEC_ON_ALL_NODES; /* Check restrictions on objects dropped */ DropStmtPreTreatment((DropStmt *) stmt, queryString, sentToRemote, &is_temp, &exec_type); #endif - #ifdef __TBASE__ - drop_cnt = RemoveRelations(stmt, new_query_string); + if (!sentToRemote && LOCAL_PARALLEL_DDL) + { + leaderCnHandle = find_ddl_leader_cn(); + if (!is_ddl_leader_cn(leaderCnHandle->nodename)) + need_sendto_leadercn = true; + } + if (need_sendto_leadercn) + { + /* + * For DROP TABLE/INDEX/VIEW/... IF EXISTS query, only + * notice is emitted, if the referred objects are not + * found. 
In such cases, the atomicity and consistency of
+                 * the query or transaction among local CN and remote nodes
+                 * cannot be guaranteed against concurrent CREATE TABLE/
+                 * INDEX/VIEW/... query.
+                 *
+                 * To ensure such atomicity and consistency, we only refer
+                 * to local CN about the visibility of the objects to be
+                 * deleted and rewrite the query into new_query_string
+                 * without the invisible objects. Later, if the objects in
+                 * new_query_string are not found on remote nodes, which
+                 * should not happen, just ERROR.
+                 */
+                bool need_drop = false;
+                List *heap_list = NIL;
+                new_objects = PreCheckforRemoveRelation(stmt,
+                                                        new_query_string,
+                                                        &need_drop,
+                                                        &heap_list);
+                if (need_drop)
+                {
+                    /*
+                     * If I am the main execute CN but not the Leader CN,
+                     * notify the Leader CN to execute the DROP first.
+                     */
+                    SendLeaderCNUtility(new_query_string, is_temp);
+                    RemoveRelationsParallelMode(stmt, new_objects,
+                                                heap_list);
+                    free_object_addresses(new_objects);
+                }
+                else
+                {
+                    pfree(new_query_string);
+                    free_object_addresses(new_objects);
+                    break;
+                }
+            }
+            else if (RemoveRelations(stmt, new_query_string) == 0)
+            {
+                pfree(new_query_string);
+                break;
+            }
 #else
             RemoveRelations(stmt);
 #endif
 #ifdef PGXC
 #ifdef __TBASE__
-            /* if drop nothing, skip */
-            if (drop_cnt == 0)
-            {
-                pfree(new_query_string);
-                break;
-            }
-
             /* DROP is done depending on the object type and its temporary type */
             if (IS_PGXC_LOCAL_COORDINATOR)
                 ExecUtilityStmtOnNodes(NULL, new_query_string, NULL, sentToRemote, false,
@@ -4529,17 +5364,101 @@ ExecDropStmt(DropStmt *stmt, bool isTopLevel)
             }
 #endif
             break;
+#ifdef __TBASE__
+        case OBJECT_SCHEMA:
+        case OBJECT_FUNCTION:
+        case OBJECT_TYPE:
+            {
+                bool is_temp = false;
+                bool need_drop = false;
+                RemoteQueryExecType exec_type = EXEC_ON_ALL_NODES;
+                ObjectAddresses *new_objects = NULL;
+                PGXCNodeHandle *leaderCnHandle = NULL;
+                bool is_leader_cn = false;
+                char *new_query_string = pstrdup(queryString);
+
+                /* Check restrictions on objects dropped */
+                DropStmtPreTreatment((DropStmt *) stmt, queryString, sentToRemote,
+                                     &is_temp, &exec_type);
+
+                if (!sentToRemote && LOCAL_PARALLEL_DDL)
+                {
+                    leaderCnHandle = find_ddl_leader_cn();
+                    is_leader_cn = is_ddl_leader_cn(leaderCnHandle->nodename);
+                    if (!is_leader_cn)
+                    {
+                        /*
+                         * To ensure such atomicity and consistency, we only refer
+                         * to local CN about the visibility of the objects to be
+                         * deleted and rewrite the query into new_query_string
+                         * without the invisible objects. Later, if the objects in
+                         * new_query_string are not found on remote nodes, which
+                         * should not happen, just ERROR.
+                         */
+                        new_objects = PreCheckforRemoveObjects(stmt,
+                                                               true,
+                                                               &need_drop,
+                                                               new_query_string,
+                                                               true);
+                        if (need_drop)
+                        {
+                            /*
+                             * If I am the main execute CN but not the Leader CN,
+                             * notify the Leader CN to execute the DROP first.
+ */ + SendLeaderCNUtility(new_query_string, is_temp); + RemoveObjectsParallelMode(stmt, new_objects); + free_object_addresses(new_objects); + } + else + { + free_object_addresses(new_objects); + pfree(new_query_string); + break; + } + } + else + { + RemoveObjects(stmt, true, &need_drop, + new_query_string); + if (!need_drop) + { + pfree(new_query_string); + break; + } + } + } + else if (is_txn_has_parallel_ddl) + { + /* parallel ddl mode, from remote cn, can't miss object */ + RemoveObjects(stmt, false, &need_drop, NULL); + } + else + { + /* non parallel ddl mode */ + RemoveObjects(stmt, true, &need_drop, NULL); + } + + if (IS_PGXC_LOCAL_COORDINATOR) + ExecUtilityStmtOnNodes(NULL, new_query_string, NULL, + sentToRemote, false, exec_type, + is_temp, false); + pfree(new_query_string); + } + break; +#endif default: #ifdef PGXC { bool is_temp = false; + bool need_drop = false; RemoteQueryExecType exec_type = EXEC_ON_ALL_NODES; /* Check restrictions on objects dropped */ DropStmtPreTreatment((DropStmt *) stmt, queryString, sentToRemote, &is_temp, &exec_type); #endif - RemoveObjects(stmt); + RemoveObjects(stmt, true, &need_drop, NULL); #ifdef PGXC if (IS_PGXC_LOCAL_COORDINATOR) ExecUtilityStmtOnNodes(NULL, queryString, NULL, sentToRemote, false, @@ -4550,6 +5469,70 @@ ExecDropStmt(DropStmt *stmt, bool isTopLevel) } } +#ifdef __TBASE__ +void +CheckAndDropRole(Node *parsetree, bool sentToRemote, const char *queryString) +{ + DropRoleStmt *stmt = (DropRoleStmt *) parsetree; + char *new_query_string = pstrdup(queryString); + bool need_drop = true; + + if (!sentToRemote && LOCAL_PARALLEL_DDL) + { + PGXCNodeHandle *leaderCnHandle = NULL; + leaderCnHandle = find_ddl_leader_cn(); + + /* + * If I am the main execute CN but not Leader CN, + * Notify the Leader CN to create firstly. + */ + if (!is_ddl_leader_cn(leaderCnHandle->nodename)) + { + List *role_list = NIL; + need_drop = PreCheckDropRole(stmt, new_query_string, &role_list); + if (!need_drop) + { + pfree(new_query_string); + return; + } + SendLeaderCNUtility(new_query_string, false); + DropRoleParallelMode(role_list); + ExecUtilityStmtOnNodes(parsetree, new_query_string, NULL, + sentToRemote, false, + EXEC_ON_ALL_NODES, false, + false); + } + else + { + if (!DropRole(stmt, stmt->missing_ok, new_query_string)) + { + pfree(new_query_string); + return; + } + ExecUtilityStmtOnNodes(parsetree, new_query_string, NULL, + sentToRemote, false, + EXEC_ON_ALL_NODES, false, + false); + } + } + /* From remote cn */ + else if (!IS_PGXC_LOCAL_COORDINATOR && is_txn_has_parallel_ddl) + { + /* + * In parallel ddl mode, we only send cmd to remote when + * database exists, so database can not miss when the cmd + * come from remote cn. 
+ */ + DropRole(stmt, false, NULL); + } + /* Non parallel ddl mode */ + else + { + DropRole(stmt, stmt->missing_ok, NULL); + } + pfree(new_query_string); +} +#endif /* * UtilityReturnsTuples @@ -6421,8 +7404,11 @@ GetCommandLogLevel(Node *parsetree) #ifdef PGXC static void -ExecUtilityStmtOnNodesInternal(Node* parsetree, const char *queryString, ExecNodes *nodes, bool sentToRemote, - bool force_autocommit, RemoteQueryExecType exec_type, bool is_temp) +ExecUtilityStmtOnNodesInternal(Node* parsetree, const char *queryString, + ExecNodes *nodes, bool sentToRemote, + bool force_autocommit, + RemoteQueryExecType exec_type, + bool is_temp) { /* Return if query is launched on no nodes */ if (exec_type == EXEC_ON_NONE) @@ -6449,7 +7435,18 @@ ExecUtilityStmtOnNodesInternal(Node* parsetree, const char *queryString, ExecNod step->force_autocommit = force_autocommit; step->exec_type = exec_type; step->parsetree = parsetree; +#ifdef __TBASE__ + if (LOCAL_PARALLEL_DDL && + (exec_type == EXEC_ON_COORDS || exec_type == EXEC_ON_ALL_NODES)) + { + PGXCNodeHandle* leaderCnHandle = find_ddl_leader_cn(); + ExecRemoteUtility(step, leaderCnHandle, EXCLUED_LEADER_DDL); + } + else + ExecRemoteUtility(step, NULL, NON_PARALLEL_DDL); +#else ExecRemoteUtility(step); +#endif pfree(step->sql_statement); pfree(step); } diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index ec9db352..1f205d34 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -9356,7 +9356,11 @@ set_config_option(const char *name, const char *value, step->force_autocommit = true; step->exec_type = EXEC_ON_CURRENT; step->is_set = true; +#ifdef __TBASE__ + ExecRemoteUtility(step, NULL, NON_PARALLEL_DDL); +#else ExecRemoteUtility(step); +#endif pfree(step); pfree(poolcmd.data); } diff --git a/src/include/catalog/dependency.h b/src/include/catalog/dependency.h index c2c0c9b1..589bbaab 100644 --- a/src/include/catalog/dependency.h +++ b/src/include/catalog/dependency.h @@ -281,6 +281,18 @@ extern void performDeletion(const ObjectAddress *object, extern void performMultipleDeletions(const ObjectAddresses *objects, DropBehavior behavior, int flags); +#ifdef __TBASE__ +extern void RemoveRelationsParallelMode(DropStmt *drop, + ObjectAddresses* objects, + List *heap_list); +extern void RemoveObjectsParallelMode(DropStmt *stmt, ObjectAddresses *objects); +extern void OmitqueryStringSpace(char *queryString); +extern void RemoveObjnameInQueryString(char *queryString, char *full_name); +extern ObjectAddresses* PreCheckforRemoveObjects(DropStmt *stmt, bool missing_ok, + bool *need_drop, char *query_string, + bool need_unlock); +#endif + #ifdef PGXC extern void performRename(const ObjectAddress *object, const char *oldname, diff --git a/src/include/catalog/objectaddress.h b/src/include/catalog/objectaddress.h index 0d80f74c..ec2cd56f 100644 --- a/src/include/catalog/objectaddress.h +++ b/src/include/catalog/objectaddress.h @@ -108,6 +108,10 @@ extern const ObjectAddress InvalidObjectAddress; #define ObjectAddressSet(addr, class_id, object_id) \ ObjectAddressSubSet(addr, class_id, object_id, 0) +#ifdef __TBASE__ +extern char *GetRemoveObjectName(ObjectType objtype, Node *object); +#endif + extern ObjectAddress get_object_address(ObjectType objtype, Node *object, Relation *relp, LOCKMODE lockmode, bool missing_ok); diff --git a/src/include/commands/dbcommands.h b/src/include/commands/dbcommands.h index cb5844ff..539d5ac2 100644 --- a/src/include/commands/dbcommands.h +++ b/src/include/commands/dbcommands.h @@ -20,7 +20,7 
@@ #include "nodes/parsenodes.h" extern Oid createdb(ParseState *pstate, const CreatedbStmt *stmt); -extern void dropdb(const char *dbname, bool missing_ok); +extern bool dropdb(const char *dbname, bool missing_ok); extern void dropdb_prepare(const char *dbname, bool missing_ok); extern ObjectAddress RenameDatabase(const char *oldname, const char *newname); extern Oid AlterDatabase(ParseState *pstate, AlterDatabaseStmt *stmt, bool isTopLevel); diff --git a/src/include/commands/defrem.h b/src/include/commands/defrem.h index 1d3959b2..7b276490 100644 --- a/src/include/commands/defrem.h +++ b/src/include/commands/defrem.h @@ -19,8 +19,8 @@ #include "utils/array.h" /* commands/dropcmds.c */ -extern void RemoveObjects(DropStmt *stmt); - +extern void RemoveObjects(DropStmt *stmt, bool missing_ok, + bool *need_drop, char *query_string); /* commands/indexcmds.c */ extern ObjectAddress DefineIndex(Oid relationId, IndexStmt *stmt, diff --git a/src/include/commands/sequence.h b/src/include/commands/sequence.h index 729d73c4..a9f5ddca 100644 --- a/src/include/commands/sequence.h +++ b/src/include/commands/sequence.h @@ -121,7 +121,13 @@ extern int64 nextval_internal(Oid relid, bool check_permissions); extern Datum nextval(PG_FUNCTION_ARGS); extern List *sequence_options(Oid relid); +#ifdef __TBASE__ +extern ObjectAddress DefineSequence(ParseState *pstate, CreateSeqStmt *seq, + bool exists_ok); +extern bool PrecheckDefineSequence(CreateSeqStmt *seq); +#else extern ObjectAddress DefineSequence(ParseState *pstate, CreateSeqStmt *stmt); +#endif extern ObjectAddress AlterSequence(ParseState *pstate, AlterSeqStmt *stmt); extern void DeleteSequenceTuple(Oid relid); extern void ResetSequence(Oid seq_relid); diff --git a/src/include/commands/tablecmds.h b/src/include/commands/tablecmds.h index ea788476..cdc9eb05 100644 --- a/src/include/commands/tablecmds.h +++ b/src/include/commands/tablecmds.h @@ -26,11 +26,17 @@ extern ObjectAddress DefineRelation(CreateStmt *stmt, char relkind, Oid ownerId, ObjectAddress *typaddress, const char *queryString); #ifdef __TBASE__ +extern ObjectAddresses* PreCheckforRemoveRelation(DropStmt* drop, + char* queryString, + bool *needDrop, + List **heap_list); extern int RemoveRelations(DropStmt *drop, char* queryString); #else extern void RemoveRelations(DropStmt *drop); #endif +extern char GetRemoveObjectRelkind(ObjectType removeType); + extern Oid AlterTableLookupRelation(AlterTableStmt *stmt, LOCKMODE lockmode); extern void AlterTable(Oid relid, LOCKMODE lockmode, AlterTableStmt *stmt); diff --git a/src/include/commands/tablespace.h b/src/include/commands/tablespace.h index 32805ab4..f4ad6b41 100644 --- a/src/include/commands/tablespace.h +++ b/src/include/commands/tablespace.h @@ -43,7 +43,7 @@ typedef struct TableSpaceOpts } TableSpaceOpts; extern Oid CreateTableSpace(CreateTableSpaceStmt *stmt); -extern void DropTableSpace(DropTableSpaceStmt *stmt); +extern bool DropTableSpace(DropTableSpaceStmt *stmt, bool missing_ok); extern ObjectAddress RenameTableSpace(const char *oldname, const char *newname); extern Oid AlterTableSpaceOptions(AlterTableSpaceOptionsStmt *stmt); @@ -63,4 +63,8 @@ extern void tblspc_redo(XLogReaderState *rptr); extern void tblspc_desc(StringInfo buf, XLogReaderState *rptr); extern const char *tblspc_identify(uint8 info); +#ifdef __TBASE__ +extern bool PreCheckforDropTableSpace(DropTableSpaceStmt *stmt); +#endif + #endif /* TABLESPACE_H */ diff --git a/src/include/commands/user.h b/src/include/commands/user.h index 69e9aa46..e172d500 100644 --- 
a/src/include/commands/user.h +++ b/src/include/commands/user.h @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * * user.h - * Commands for manipulating roles (formerly called users). + * Commands for manipulating roles (formerly called users). * * * src/include/commands/user.h @@ -17,21 +17,30 @@ #include "parser/parse_node.h" /* GUC. Is actually of type PasswordType. */ -extern int Password_encryption; +extern int Password_encryption; /* Hook to check passwords in CreateRole() and AlterRole() */ typedef void (*check_password_hook_type) (const char *username, const char *shadow_pass, PasswordType password_type, Datum validuntil_time, bool validuntil_null); extern PGDLLIMPORT check_password_hook_type check_password_hook; -extern Oid CreateRole(ParseState *pstate, CreateRoleStmt *stmt); -extern Oid AlterRole(AlterRoleStmt *stmt); -extern Oid AlterRoleSet(AlterRoleSetStmt *stmt); -extern void DropRole(DropRoleStmt *stmt); +extern Oid CreateRole(ParseState *pstate, CreateRoleStmt *stmt); +extern Oid AlterRole(AlterRoleStmt *stmt); +extern Oid AlterRoleSet(AlterRoleSetStmt *stmt); +extern void DropRoleByTuple(char *role, HeapTuple tuple, + Relation pg_authid_rel, + Relation pg_auth_members_rel); +extern bool DropRole(DropRoleStmt *stmt, bool missing_ok, char *query_string); extern void GrantRole(GrantRoleStmt *stmt); extern ObjectAddress RenameRole(const char *oldname, const char *newname); extern void DropOwnedObjects(DropOwnedStmt *stmt); extern void ReassignOwnedObjects(ReassignOwnedStmt *stmt); extern List *roleSpecsToIds(List *memberNames); -#endif /* USER_H */ +#ifdef __TBASE__ +extern bool PreCheckDropRole(DropRoleStmt *stmt, char *query_string, + List **exist_roles); +extern void DropRoleParallelMode(List *role_list); +#endif + +#endif /* USER_H */ diff --git a/src/include/commands/view.h b/src/include/commands/view.h index 270996b8..facf592c 100644 --- a/src/include/commands/view.h +++ b/src/include/commands/view.h @@ -20,8 +20,14 @@ extern void validateWithCheckOption(char *value); extern ObjectAddress DefineView(ViewStmt *stmt, const char *queryString, - int stmt_location, int stmt_len); + int stmt_location, int stmt_len); extern void StoreViewQuery(Oid viewOid, Query *viewParse, bool replace); -#endif /* VIEW_H */ +extern Query *MakeViewParse(ViewStmt* stmt, const char* query_string, + int stmt_location, int stmt_len); +#ifdef __TBASE__ +extern bool IsViewTemp(ViewStmt* stmt, const char* query_string, + int stmt_location, int stmt_len, List **relation_list); +#endif +#endif /* VIEW_H */ diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h index 57111155..5554ee7b 100644 --- a/src/include/nodes/parsenodes.h +++ b/src/include/nodes/parsenodes.h @@ -66,6 +66,16 @@ typedef enum SortByNulls SORTBY_NULLS_LAST } SortByNulls; +#ifdef __TBASE__ +typedef enum ParallelDDLRemoteType +{ + NON_PARALLEL_DDL, /* non parallel ddl mode, exec_type decides */ + /* execution nodes */ + ONLY_LEADER_DDL, /* only leader cn will execute ddl */ + EXCLUED_LEADER_DDL /* remove leader cn from execution nodes */ +} ParallelDDLRemoteType; +#endif + /* * Grantable rights are encoded so that we can OR them together in a bitmask. 
* The present representation of AclItem limits us to 16 distinct rights, diff --git a/src/include/parser/parse_relation.h b/src/include/parser/parse_relation.h index 896a543a..e22afd54 100644 --- a/src/include/parser/parse_relation.h +++ b/src/include/parser/parse_relation.h @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * * parse_relation.h - * prototypes for parse_relation.c. + * prototypes for parse_relation.c. * * * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group @@ -26,111 +26,114 @@ */ typedef struct { - int distance; /* Weighted distance (lowest so far) */ - RangeTblEntry *rfirst; /* RTE of first */ - AttrNumber first; /* Closest attribute so far */ - RangeTblEntry *rsecond; /* RTE of second */ - AttrNumber second; /* Second closest attribute so far */ + int distance; /* Weighted distance (lowest so far) */ + RangeTblEntry *rfirst; /* RTE of first */ + AttrNumber first; /* Closest attribute so far */ + RangeTblEntry *rsecond; /* RTE of second */ + AttrNumber second; /* Second closest attribute so far */ } FuzzyAttrMatchState; extern RangeTblEntry *refnameRangeTblEntry(ParseState *pstate, - const char *schemaname, - const char *refname, - int location, - int *sublevels_up); + const char *schemaname, + const char *refname, + int location, + int *sublevels_up); extern CommonTableExpr *scanNameSpaceForCTE(ParseState *pstate, - const char *refname, - Index *ctelevelsup); + const char *refname, + Index *ctelevelsup); extern bool scanNameSpaceForENR(ParseState *pstate, const char *refname); extern void checkNameSpaceConflicts(ParseState *pstate, List *namespace1, - List *namespace2); + List *namespace2); extern int RTERangeTablePosn(ParseState *pstate, - RangeTblEntry *rte, - int *sublevels_up); + RangeTblEntry *rte, + int *sublevels_up); extern RangeTblEntry *GetRTEByRangeTablePosn(ParseState *pstate, - int varno, - int sublevels_up); + int varno, + int sublevels_up); extern CommonTableExpr *GetCTEForRTE(ParseState *pstate, RangeTblEntry *rte, - int rtelevelsup); + int rtelevelsup); extern Node *scanRTEForColumn(ParseState *pstate, RangeTblEntry *rte, - char *colname, int location, - int fuzzy_rte_penalty, FuzzyAttrMatchState *fuzzystate); + char *colname, int location, + int fuzzy_rte_penalty, FuzzyAttrMatchState *fuzzystate); extern Node *colNameToVar(ParseState *pstate, char *colname, bool localonly, - int location); + int location); extern void markVarForSelectPriv(ParseState *pstate, Var *var, - RangeTblEntry *rte); + RangeTblEntry *rte); extern Relation parserOpenTable(ParseState *pstate, const RangeVar *relation, - int lockmode); + int lockmode); extern RangeTblEntry *addRangeTableEntry(ParseState *pstate, - RangeVar *relation, - Alias *alias, - bool inh, - bool inFromCl); + RangeVar *relation, + Alias *alias, + bool inh, + bool inFromCl); extern RangeTblEntry *addRangeTableEntryForRelation(ParseState *pstate, - Relation rel, - Alias *alias, - bool inh, - bool inFromCl); + Relation rel, + Alias *alias, + bool inh, + bool inFromCl); extern RangeTblEntry *addRangeTableEntryForSubquery(ParseState *pstate, - Query *subquery, - Alias *alias, - bool lateral, - bool inFromCl); + Query *subquery, + Alias *alias, + bool lateral, + bool inFromCl); extern RangeTblEntry *addRangeTableEntryForFunction(ParseState *pstate, - List *funcnames, - List *funcexprs, - List *coldeflists, - RangeFunction *rangefunc, - bool lateral, - bool inFromCl); + List *funcnames, + List *funcexprs, + List *coldeflists, + RangeFunction *rangefunc, + 
bool lateral, + bool inFromCl); extern RangeTblEntry *addRangeTableEntryForValues(ParseState *pstate, - List *exprs, - List *coltypes, - List *coltypmods, - List *colcollations, - Alias *alias, - bool lateral, - bool inFromCl); + List *exprs, + List *coltypes, + List *coltypmods, + List *colcollations, + Alias *alias, + bool lateral, + bool inFromCl); extern RangeTblEntry *addRangeTableEntryForTableFunc(ParseState *pstate, - TableFunc *tf, - Alias *alias, - bool lateral, - bool inFromCl); + TableFunc *tf, + Alias *alias, + bool lateral, + bool inFromCl); extern RangeTblEntry *addRangeTableEntryForJoin(ParseState *pstate, - List *colnames, - JoinType jointype, - List *aliasvars, - Alias *alias, - bool inFromCl); + List *colnames, + JoinType jointype, + List *aliasvars, + Alias *alias, + bool inFromCl); extern RangeTblEntry *addRangeTableEntryForCTE(ParseState *pstate, - CommonTableExpr *cte, - Index levelsup, - RangeVar *rv, - bool inFromCl); + CommonTableExpr *cte, + Index levelsup, + RangeVar *rv, + bool inFromCl); extern RangeTblEntry *addRangeTableEntryForENR(ParseState *pstate, - RangeVar *rv, - bool inFromCl); + RangeVar *rv, + bool inFromCl); extern bool isLockedRefname(ParseState *pstate, const char *refname); extern void addRTEtoQuery(ParseState *pstate, RangeTblEntry *rte, - bool addToJoinList, - bool addToRelNameSpace, bool addToVarNameSpace); + bool addToJoinList, + bool addToRelNameSpace, bool addToVarNameSpace); extern void errorMissingRTE(ParseState *pstate, RangeVar *relation) pg_attribute_noreturn(); extern void errorMissingColumn(ParseState *pstate, - char *relname, char *colname, int location) pg_attribute_noreturn(); + char *relname, char *colname, int location) pg_attribute_noreturn(); extern void expandRTE(RangeTblEntry *rte, int rtindex, int sublevels_up, - int location, bool include_dropped, - List **colnames, List **colvars); + int location, bool include_dropped, + List **colnames, List **colvars); extern List *expandRelAttrs(ParseState *pstate, RangeTblEntry *rte, - int rtindex, int sublevels_up, int location); -extern int attnameAttNum(Relation rd, const char *attname, bool sysColOK); + int rtindex, int sublevels_up, int location); +extern int attnameAttNum(Relation rd, const char *attname, bool sysColOK); extern Name attnumAttName(Relation rd, int attid); -extern Oid attnumTypeId(Relation rd, int attid); -extern Oid attnumCollationId(Relation rd, int attid); +extern Oid attnumTypeId(Relation rd, int attid); +extern Oid attnumCollationId(Relation rd, int attid); extern bool isQueryUsingTempRelation(Query *query); +#ifdef __TBASE__ +extern bool CheckAndGetRelation(Query *query, List **relation_list); +#endif #ifdef PGXC -extern int specialAttNum(const char *attname); +extern int specialAttNum(const char *attname); #endif -#endif /* PARSE_RELATION_H */ +#endif /* PARSE_RELATION_H */ diff --git a/src/include/parser/parse_utilcmd.h b/src/include/parser/parse_utilcmd.h index 6cb25dbc..b6a0be60 100644 --- a/src/include/parser/parse_utilcmd.h +++ b/src/include/parser/parse_utilcmd.h @@ -82,13 +82,20 @@ extern bool loose_unique_index; #endif -#ifdef XCP -extern bool loose_constraints; +#ifdef __TBASE__ +extern List *transformCreateStmt(CreateStmt *stmt, const char *queryString, + bool autodistribute, Oid *nspaceid, bool existsok); +#elif XCP extern List *transformCreateStmt(CreateStmt *stmt, const char *queryString, bool autodistribute); #else extern List *transformCreateStmt(CreateStmt *stmt, const char *queryString); #endif + +#ifdef XCP +extern bool loose_constraints; 
+#endif + extern List *transformAlterTableStmt(Oid relid, AlterTableStmt *stmt, const char *queryString); extern IndexStmt *transformIndexStmt(Oid relid, IndexStmt *stmt, diff --git a/src/include/pgxc/execRemote.h b/src/include/pgxc/execRemote.h index 98d51719..236979f8 100644 --- a/src/include/pgxc/execRemote.h +++ b/src/include/pgxc/execRemote.h @@ -373,7 +373,13 @@ extern void ExecFinishInitRemoteSubplan(RemoteSubplanState *node); extern TupleTableSlot* ExecRemoteSubplan(PlanState *pstate); extern void ExecEndRemoteSubplan(RemoteSubplanState *node); extern void ExecReScanRemoteSubplan(RemoteSubplanState *node); +#ifdef __TBASE__ +extern void ExecRemoteUtility(RemoteQuery *node, + PGXCNodeHandle *leader_cn_conn, + ParallelDDLRemoteType type); +#else extern void ExecRemoteUtility(RemoteQuery *node); +#endif extern bool is_data_node_ready(PGXCNodeHandle * conn); @@ -439,8 +445,30 @@ extern TupleDesc create_tuple_desc(char *msg_body, size_t len); extern void ExecFinishRemoteSubplan(RemoteSubplanState *node); extern void ExecShutdownRemoteSubplan(RemoteSubplanState *node); extern bool SetSnapshot(EState *state); + +extern void ExecRemoteUtility_ParallelDDLMode(RemoteQuery *node, + PGXCNodeHandle *leader_cn_handle); +extern void LeaderCnExecRemoteUtility(RemoteQuery *node, + PGXCNodeHandle *leader_cn_conn, + ResponseCombiner *combiner, + bool need_tran_block, + GlobalTransactionId gxid, + Snapshot snapshot, + CommandId cid); #endif +extern void GetGlobInfoForRemoteUtility(RemoteQuery *node, + GlobalTransactionId *gxid, + Snapshot *snapshot); +extern void SendTxnInfo(RemoteQuery *node, PGXCNodeHandle *conn, + CommandId cid, Snapshot snapshot); +extern bool CheckRemoteRespond(PGXCNodeHandle *conn, + ResponseCombiner *combiner, + int *index, int *conn_count); +extern void RemoteReceiveAndCheck(int conn_count, + PGXCNodeHandle **conns, + ResponseCombiner *combiner); + #ifdef __SUBSCRIPTION__ extern void pgxc_node_report_error(ResponseCombiner *combiner); extern int pgxc_node_receive_responses(const int conn_count, PGXCNodeHandle ** connections, diff --git a/src/include/pgxc/pgxcnode.h b/src/include/pgxc/pgxcnode.h index e5f9c6e1..22075b68 100644 --- a/src/include/pgxc/pgxcnode.h +++ b/src/include/pgxc/pgxcnode.h @@ -292,12 +292,15 @@ void pgxc_set_coordinator_proc_pid(int proc_pid); int pgxc_get_coordinator_proc_pid(void); void pgxc_set_coordinator_proc_vxid(TransactionId proc_vxid); TransactionId pgxc_get_coordinator_proc_vxid(void); -inline char* find_ddl_leader_cn(void); +PGXCNodeHandle* find_ddl_leader_cn(void); inline bool is_ddl_leader_cn(char *leader_cn); +void CheckInvalidateRemoteHandles(void); extern int pgxc_node_send_sessionid(PGXCNodeHandle * handle); extern void SerializeSessionId(Size maxsize, char *start_address); extern void StartParallelWorkerSessionId(char *address); extern bool is_pgxc_handles_init(void); +void delete_leadercn_handle(PGXCNodeAllHandles *pgxc_connections, + PGXCNodeHandle* leader_cn_handle); #endif #ifdef __AUDIT__ diff --git a/src/include/tcop/utility.h b/src/include/tcop/utility.h index 92605dff..aca694be 100644 --- a/src/include/tcop/utility.h +++ b/src/include/tcop/utility.h @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * * utility.h - * prototypes for utility.c. + * prototypes for utility.c. 
* * * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group @@ -20,33 +20,33 @@ #endif typedef enum { - PROCESS_UTILITY_TOPLEVEL, /* toplevel interactive command */ - PROCESS_UTILITY_QUERY, /* a complete query, but not toplevel */ - PROCESS_UTILITY_SUBCOMMAND /* a portion of a query */ + PROCESS_UTILITY_TOPLEVEL, /* toplevel interactive command */ + PROCESS_UTILITY_QUERY, /* a complete query, but not toplevel */ + PROCESS_UTILITY_SUBCOMMAND /* a portion of a query */ } ProcessUtilityContext; /* Hook for plugins to get control in ProcessUtility() */ typedef void (*ProcessUtility_hook_type) (PlannedStmt *pstmt, - const char *queryString, ProcessUtilityContext context, - ParamListInfo params, - QueryEnvironment *queryEnv, - DestReceiver *dest, - bool sentToRemote, - char *completionTag); + const char *queryString, ProcessUtilityContext context, + ParamListInfo params, + QueryEnvironment *queryEnv, + DestReceiver *dest, + bool sentToRemote, + char *completionTag); extern PGDLLIMPORT ProcessUtility_hook_type ProcessUtility_hook; extern void ProcessUtility(PlannedStmt *pstmt, const char *queryString, - ProcessUtilityContext context, ParamListInfo params, - QueryEnvironment *queryEnv, - DestReceiver *dest, - bool sentToRemote, - char *completionTag); + ProcessUtilityContext context, ParamListInfo params, + QueryEnvironment *queryEnv, + DestReceiver *dest, + bool sentToRemote, + char *completionTag); extern void standard_ProcessUtility(PlannedStmt *pstmt, const char *queryString, - ProcessUtilityContext context, ParamListInfo params, - QueryEnvironment *queryEnv, - DestReceiver *dest, - bool sentToRemote, - char *completionTag); + ProcessUtilityContext context, ParamListInfo params, + QueryEnvironment *queryEnv, + DestReceiver *dest, + bool sentToRemote, + char *completionTag); extern bool UtilityReturnsTuples(Node *parsetree); @@ -71,5 +71,17 @@ extern PGDLLIMPORT ErrcodeHookType g_pfErrcodeHook; extern bool is_txn_has_parallel_ddl; /* Parallel DDL switch */ extern bool enable_parallel_ddl; + +#define LOCAL_PARALLEL_DDL \ + (IS_PGXC_LOCAL_COORDINATOR && is_txn_has_parallel_ddl) +extern void CheckAndDropRole(Node *parsetree, bool sentToRemote, + const char *queryString); +extern void CheckAndSendLeaderCNReindex(bool sentToRemote, ReindexStmt *stmt, + const char *queryString); + +/* Has leader CN executed ddl */ +extern bool leader_cn_executed_ddl; +extern void SendLeaderCNUtility(const char *queryString, bool temp); +extern void SendLeaderCNUtilityWithContext(const char *queryString, bool temp); #endif -#endif /* UTILITY_H */ +#endif /* UTILITY_H */ From 70b06646068e6bcda93623d1fb8d9ef93e3d939b Mon Sep 17 00:00:00 2001 From: andrelin Date: Thu, 3 Jun 2021 11:08:05 +0800 Subject: [PATCH 383/578] Skip generating remote path for single node shard distribution Skip shutdown remote subplan node if execute locally tapd: http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131088416973 --- src/backend/optimizer/plan/planner.c | 6 +- src/backend/pgxc/pool/execRemote.c | 3 +- src/test/regress/expected/tbase_explain.out | 87 ++++++++++++++++++++- src/test/regress/sql/tbase_explain.sql | 13 ++- 4 files changed, 104 insertions(+), 5 deletions(-) diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index de1c8ab4..df9a5333 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -8322,7 +8322,8 @@ adjust_path_distribution(PlannerInfo *root, Query *parse, Path *path) * already have Result path, and if the distribution is one 
of * * a) 'hash' restricted to a single node - * b) 'replicate' without volatile functions in the target list + * b) 'shard' restricted to a single node + * c) 'replicate' without volatile functions in the target list * * In those cases we don't need the RemoteSubplan. * @@ -8330,7 +8331,8 @@ adjust_path_distribution(PlannerInfo *root, Query *parse, Path *path) * See planner.c:2730 in 9.5. */ if (!(IsA(path, ResultPath) && /* FIXME missing (result_plan->lefttree == NULL) condition */ - ((root->distribution->distributionType == 'H' && bms_num_members(root->distribution->restrictNodes) == 1) || + (((root->distribution->distributionType == 'H' || root->distribution->distributionType == 'S') && + bms_num_members(root->distribution->restrictNodes) == 1) || (root->distribution->distributionType == 'R' && !contain_mutable_functions((Node *)parse->targetList))))) path = create_remotesubplan_path(root, path, root->distribution); diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 1bb82166..74bdc28e 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -11161,7 +11161,8 @@ ExecShutdownRemoteSubplan(RemoteSubplanState *node) Plan *plan = ps->plan; EState *estate = ps->state; - if ((node->eflags & EXEC_FLAG_EXPLAIN_ONLY) != 0) + /* do nothing if explain only or execute locally */ + if ((node->eflags & EXEC_FLAG_EXPLAIN_ONLY) != 0 || node->local_exec) return; elog(DEBUG1, "shutdown remote subplan worker %d, plan_node_id %d", ParallelWorkerNumber, plan->plan_node_id); diff --git a/src/test/regress/expected/tbase_explain.out b/src/test/regress/expected/tbase_explain.out index 691d1bb5..d91ef65e 100644 --- a/src/test/regress/expected/tbase_explain.out +++ b/src/test/regress/expected/tbase_explain.out @@ -1,10 +1,95 @@ --explain analyze create table a1(id int, num int, name text); create table a2(id int, num int, name text); +--fqs case +explain (costs off,timing off,summary off,analyze,verbose) insert into a1 values(1,generate_series(1,100),'a'); -insert into a1 values(2,generate_series(1,100),'b'); + QUERY PLAN +----------------------------------------------------------------------------------------------- + Remote Fast Query Execution (actual rows=0 loops=1) + Output: 1, generate_series(1, 100), 'a'::text + Node expr: 1 + Remote query: INSERT INTO a1 (id, num, name) VALUES (1, generate_series(1, 100), 'a'::text) +(4 rows) + +set enable_fast_query_shipping to off; +--insert into single value +explain (costs off,timing off,summary off,analyze,verbose) +insert into a1 values(2,1,'b'); + QUERY PLAN +------------------------------------------------------------------ + Remote Subquery Scan on all (datanode_1) (actual rows=0 loops=1) + -> Insert on public.a1 + DN (actual rows=0..0 loops=1..1) + - datanode_1 (actual rows=0 loops=1) + -> Result + DN (actual rows=1..1 loops=1..1) + - datanode_1 (actual rows=1 loops=1) + Output: 2, 1, 'b'::text +(8 rows) + +--insert with set returning function +explain (costs off,timing off,summary off,analyze,verbose) +insert into a1 values(2,generate_series(2,100),'b'); + QUERY PLAN +------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1) (actual rows=0 loops=1) + -> Insert on public.a1 + DN (actual rows=0..0 loops=1..1) + - datanode_1 (actual rows=0 loops=1) + -> Remote Subquery Scan on local node + DN (actual rows=99..99 loops=1..1) + - datanode_1 (actual rows=99 loops=1) + Output: 2, generate_series(2, 100), 'b'::text + Distribute results by H: 2 
+ -> ProjectSet + DN (actual rows=99..99 loops=1..1) + - datanode_1 (actual rows=99 loops=1) + Output: 2, generate_series(2, 100), 'b'::text + -> Result + DN (actual rows=1..1 loops=1..1) + - datanode_1 (actual rows=1 loops=1) +(16 rows) + +explain (costs off,timing off,summary off,analyze,verbose) insert into a1 values(3,generate_series(1,100),'c'); + QUERY PLAN +------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_2) (actual rows=0 loops=1) + -> Insert on public.a1 + DN (actual rows=0..0 loops=1..1) + - datanode_2 (actual rows=0 loops=1) + -> Remote Subquery Scan on local node + DN (actual rows=100..100 loops=1..1) + - datanode_2 (actual rows=100 loops=1) + Output: 3, generate_series(1, 100), 'c'::text + Distribute results by H: 3 + -> ProjectSet + DN (actual rows=100..100 loops=1..1) + - datanode_2 (actual rows=100 loops=1) + Output: 3, generate_series(1, 100), 'c'::text + -> Result + DN (actual rows=1..1 loops=1..1) + - datanode_2 (actual rows=1 loops=1) +(16 rows) + +explain (costs off,timing off,summary off,analyze,verbose) insert into a2 select * from a1; + QUERY PLAN +----------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) (actual rows=0 loops=1) + -> Insert on public.a2 + DN (actual rows=0..0 loops=1..1) + - datanode_1 (actual rows=0 loops=1) + - datanode_2 (actual rows=0 loops=1) + -> Seq Scan on public.a1 + DN (actual rows=100..200 loops=1..1) + - datanode_1 (actual rows=200 loops=1) + - datanode_2 (actual rows=100 loops=1) + Output: a1.id, a1.num, a1.name +(10 rows) + +reset enable_fast_query_shipping; --normal cases explain (costs off,timing off,summary off,analyze,verbose) select count(*) from a1; diff --git a/src/test/regress/sql/tbase_explain.sql b/src/test/regress/sql/tbase_explain.sql index 7e212bc7..d15c7c2c 100644 --- a/src/test/regress/sql/tbase_explain.sql +++ b/src/test/regress/sql/tbase_explain.sql @@ -1,10 +1,21 @@ --explain analyze create table a1(id int, num int, name text); create table a2(id int, num int, name text); +--fqs case +explain (costs off,timing off,summary off,analyze,verbose) insert into a1 values(1,generate_series(1,100),'a'); -insert into a1 values(2,generate_series(1,100),'b'); +set enable_fast_query_shipping to off; +--insert into single value +explain (costs off,timing off,summary off,analyze,verbose) +insert into a1 values(2,1,'b'); +--insert with set returning function +explain (costs off,timing off,summary off,analyze,verbose) +insert into a1 values(2,generate_series(2,100),'b'); +explain (costs off,timing off,summary off,analyze,verbose) insert into a1 values(3,generate_series(1,100),'c'); +explain (costs off,timing off,summary off,analyze,verbose) insert into a2 select * from a1; +reset enable_fast_query_shipping; --normal cases explain (costs off,timing off,summary off,analyze,verbose) From 54d384f5be8661b7c71cae5f54dd7cbbab92b8f4 Mon Sep 17 00:00:00 2001 From: bethding Date: Fri, 4 Jun 2021 10:07:37 +0800 Subject: [PATCH 384/578] fix rename bug http://tapd.oa.com/pgxz/bugtrace/bugs/view?bug_id=1010092131088349973&jump_count=1 --- src/gtm/main/gtm_seq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gtm/main/gtm_seq.c b/src/gtm/main/gtm_seq.c index 27e0a3e0..c9bb59c8 100644 --- a/src/gtm/main/gtm_seq.c +++ b/src/gtm/main/gtm_seq.c @@ -1027,7 +1027,7 @@ GTM_SeqRename(GTM_SequenceKey seqkey, GTM_SequenceKey newseqkey, { newseqinfo = seq_find_seqinfo(newseqkey); #ifdef __TBASE__ - if (NULL 
== seqinfo) + if (NULL == newseqinfo) { GTM_FormSeqOfStore(newseqkey); newseqinfo = seq_find_seqinfo(newseqkey); From 50c66c9fb5ddcd3f7c7e234f6c4c22797f6e1e4c Mon Sep 17 00:00:00 2001 From: bethding Date: Fri, 4 Jun 2021 16:10:53 +0800 Subject: [PATCH 385/578] fix warning --- src/backend/executor/execMain.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c index 3bc95f7d..d30ee629 100644 --- a/src/backend/executor/execMain.c +++ b/src/backend/executor/execMain.c @@ -56,6 +56,7 @@ #include "mb/pg_wchar.h" #include "miscadmin.h" #include "optimizer/clauses.h" +#include "optimizer/pgxcship.h" #include "parser/parsetree.h" #include "rewrite/rewriteManip.h" #include "storage/bufmgr.h" From 3e805b5786b598c21882e97c8ebb14bc42d179e6 Mon Sep 17 00:00:00 2001 From: whalesong Date: Fri, 4 Jun 2021 21:12:46 +0800 Subject: [PATCH 386/578] Bugfix: run tpcc core after 2pc files opt, ID88129643 (merge request !361) --- src/backend/access/transam/twophase.c | 1194 ++++++++++++------------ src/backend/storage/lmgr/lwlock.c | 7 + src/backend/utils/misc/guc.c | 19 +- src/include/access/twophase.h | 2 - src/include/storage/lwlock.h | 9 +- src/test/regress/expected/sysviews.out | 5 +- 6 files changed, 595 insertions(+), 641 deletions(-) diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c index 14935855..61bc6b50 100644 --- a/src/backend/access/transam/twophase.c +++ b/src/backend/access/transam/twophase.c @@ -138,8 +138,6 @@ int transaction_threshold = 200000; #define FILE_CONTENT_SIZE 2048 -#define GET_START_NODE "startnode:" - /* GUC variable, can't be changed after startup */ #ifdef PGXC int max_prepared_xacts = 10000; /* We require 2PC */ @@ -154,14 +152,12 @@ bool enable_2pc_recovery_info = true; static HTAB *record_2pc_cache = NULL; bool enable_2pc_file_cache = true; -bool enable_2pc_file_check = true; +bool enable_2pc_file_check = false; bool enable_2pc_entry_key_check = true; bool enable_2pc_entry_trace = false; -bool enable_2pc_hash_table_check = true; int record_2pc_cache_size = 4096; int record_2pc_entry_size = 2048; -int record_2pc_partitions = 32; #define MAX_OUTPUT_FILE 1000 @@ -169,8 +165,21 @@ int record_2pc_partitions = 32; #define MAX_2PC_INFO_SIZE (record_2pc_entry_size - MAX_TID_SIZE) #define DFLT_2PC_INFO_SIZE 1024 /* default size */ -#define HASH_TAB_RETRY_MAX 10 -#define HASH_TAB_RETRY_SLEEP 2000 /* sleep time: 2ms */ +uint32 Record2pcCacheHashCode(const char *tid); + +/* + * The 2pc info cache is partitioned to reduce contention. + * To determine which partition lock a given tid requires, compute the tid's + * hash code with Record2pcCacheHashCode(), then apply Cache2pcPartitionLock(). + * NB: NUM_CACHE_2PC_PARTITIONS must be a power of 2! 
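+ * (A power of two keeps the Cache2pcHashPartition() modulo cheap and lets the
+ * low-order hash bits spread entries evenly across the partition locks.)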
+ */ +#define Cache2pcHashPartition(hashcode) \ + ((hashcode) % NUM_CACHE_2PC_PARTITIONS) +#define Cache2pcPartitionLock(hashcode) \ + (&MainLWLockArray[CACHE_2PC_LWLOCK_OFFSET + \ + Cache2pcHashPartition(hashcode)].lock) +#define Cache2pcPartitionLockByIndex(i) \ + (&MainLWLockArray[CACHE_2PC_LWLOCK_OFFSET + (i)].lock) /* hash table entry for 2pc record */ typedef struct Cache2pcInfo @@ -180,14 +189,20 @@ typedef struct Cache2pcInfo } Cache2pcInfo; -inline void -check_entry_key(const char *tid, const char *key, const char *func); +inline void check_entry_key(const char *tid, const char *key); -void -print_record_2pc_cache(const char *func); +bool add_2pc_info(const char *tid, const char *info); + +bool append_2pc_info(const char *tid, const char *info, bool *overflow); + +bool remove_2pc_info(const char *tid); + +bool get_2pc_info(const char *tid, char *info); + +bool save_and_remove_2pc_info(const char *tid); + +void check_2pc_file(const char *tid, const char *info, const char *func); -void -check_2pc_file(const char *tid, const char *info, const char *func); #endif static GlobalTransaction @@ -2310,19 +2325,8 @@ CheckPointTwoPhase(XLogRecPtr redo_horizon) {// #lizard forgives int i; int serialized_xacts = 0; - char *func = "CheckPointTwoPhase"; - -#ifdef __TWO_PHASE_TRANS__ - File fd = -1; - int ret = 0; - int size = 0; - Cache2pcInfo *entry = NULL; - bool found = false; - char path[MAXPGPATH]; -#endif - - elog(LOG, "[%s] checkpoint: "UINT64_FORMAT, func, redo_horizon); + elog(LOG, "[%s] checkpoint: "UINT64_FORMAT, __FUNCTION__, redo_horizon); if (max_prepared_xacts <= 0) return; /* nothing to do */ @@ -2355,97 +2359,41 @@ CheckPointTwoPhase(XLogRecPtr redo_horizon) GlobalTransaction gxact = TwoPhaseState->prepXacts[i]; if ((gxact->valid || gxact->inredo) && - !gxact->ondisk && gxact->prepare_end_lsn <= redo_horizon) { char *buf; int len; + if (!gxact->ondisk) + { /* save to pg_twophase */ XlogReadTwoPhaseData(gxact->prepare_start_lsn, &buf, &len); RecreateTwoPhaseFile(gxact->xid, buf, len); pfree(buf); + gxact->ondisk = true; + gxact->prepare_start_lsn = InvalidXLogRecPtr; + gxact->prepare_end_lsn = InvalidXLogRecPtr; + serialized_xacts++; + } + #ifdef __TWO_PHASE_TRANS__ /* save to pg_2pc */ if (NULL != record_2pc_cache) { Assert(strlen(gxact->gid) < MAX_TID_SIZE); - entry = (Cache2pcInfo *)hash_search(record_2pc_cache, - gxact->gid, HASH_FIND, &found); - if (found) - { - /* save to file */ - Assert(NULL != entry); - check_entry_key(gxact->gid, entry->key, func); - check_2pc_file(gxact->gid, entry->info, func); - - elog(LOG, "[%s] %s is found in hash table", func, gxact->gid); - - size = strlen(entry->info); - - memset(path, 0, MAXPGPATH); - GET_2PC_FILE_PATH(path, gxact->gid); - fd = open(path, O_RDWR | O_CREAT | O_EXCL, S_IRUSR | S_IWUSR); - if (fd < 0) + if (!save_and_remove_2pc_info(gxact->gid)) { - elog(ERROR, "[%s] could not create file %s, errMsg: %s", - func, path, strerror(errno)); - } - - ret = write(fd, entry->info, size); - if(ret != size) - { - close(fd); - elog(ERROR, "[%s] could not write file %s, errMsg: %s, " - "ret: %d, info: %s", - func, path, strerror(errno), ret, entry->info); - } - - if (size != strlen(entry->info)) - { - elog(LOG, "[%s] %s size change from %d to %zu, info: %s", - func, gxact->gid, size, strlen(entry->info), entry->info); - - Assert(size < strlen(entry->info)); - ret = write(fd, entry->info + size, strlen(entry->info) - size); - if(ret != strlen(entry->info) - size) - { - close(fd); - elog(ERROR, "[%s] could not write file %s, errMsg: %s, " - 
"ret: %d, info: %s", - func, path, strerror(errno), ret, entry->info); - } - } - close(fd); - fsync_fname(path, false); - - /* remove from hash table */ - entry = (Cache2pcInfo *)hash_search(record_2pc_cache, - gxact->gid, HASH_REMOVE, &found); - if (!found) - { - elog(WARNING, "[%s] %s is not found in hash table " - "when remove it", func, gxact->gid); - } - else - { - elog(LOG, "[%s] %s is removed from hash table", - func, gxact->gid); - } + elog(LOG, "[%s] %s save to file failed", + __FUNCTION__, gxact->gid); } else { - elog(LOG, "[%s] %s is not found in hash table", func, gxact->gid); + elog(LOG, "[%s] %s is saved to file", __FUNCTION__, gxact->gid); } } #endif - - gxact->ondisk = true; - gxact->prepare_start_lsn = InvalidXLogRecPtr; - gxact->prepare_end_lsn = InvalidXLogRecPtr; - serialized_xacts++; } } LWLockRelease(TwoPhaseStateLock); @@ -2456,101 +2404,71 @@ CheckPointTwoPhase(XLogRecPtr redo_horizon) { HASH_SEQ_STATUS seq; Cache2pcInfo *entry = NULL; - char *start_node = NULL; - char info[MAX_2PC_INFO_SIZE]; + char tid[MAX_TID_SIZE]; + char start_node[MAX_TID_SIZE]; + char *pos = NULL; + int size = 0; + + /* + * set start_node likes ":cn001:" + * use to check whether the tid is started from this node + */ + memset(start_node, 0, MAX_TID_SIZE); + size = strlen(PGXCNodeName); + if (size + 2 >= MAX_TID_SIZE) + { + elog(PANIC, "[%s] node name length(%d) overflow", __FUNCTION__, size); + } + start_node[0] = ':'; + memcpy(start_node + 1, PGXCNodeName, size); + start_node[size + 1] = ':'; hash_seq_init(&seq, record_2pc_cache); while ((entry = hash_seq_search(&seq)) != NULL) { Assert(NULL != entry); - check_2pc_file(entry->key, entry->info, func); - elog(LOG, "[%s] %s is found in hash table seq", func, entry->key); - - if (IsXidImplicit(entry->key)) + size = strlen(entry->key); + Assert(size < MAX_TID_SIZE); + if (0 == size) { - if (0 == strlen(entry->info)) - { - elog(WARNING, "[%s] %s info length is 0", func, entry->key); + elog(LOG, "[%s] entry key is empty", __FUNCTION__); continue; } - memset(info, 0, MAX_2PC_INFO_SIZE); - memcpy(info, entry->info, strlen(entry->info)); - - start_node = strstr(info, GET_START_NODE); - if (NULL != start_node) - { - start_node += strlen(GET_START_NODE); - start_node = strtok(start_node, "\n"); - if (0 != strcmp(start_node, PGXCNodeName)) + memset(tid, 0, MAX_TID_SIZE); + memcpy(tid, entry->key, size + 1); + if (0 == strlen(tid)) { - elog(LOG, "[%s] %s start node is not %s", - func, entry->key, PGXCNodeName); + elog(LOG, "[%s] tid is empty", __FUNCTION__); continue; } + Assert(strlen(tid) < MAX_TID_SIZE); - elog(LOG, "[%s] %s start node is %s", - func, entry->key, PGXCNodeName); - } - else + if (enable_2pc_file_check) { - elog(WARNING, "[%s] %s get start node failed, info: %s", - func, entry->key, entry->info); - } + elog(LOG, "[%s] %s is found in hash table seq", __FUNCTION__, tid); } - size = strlen(entry->info); - - memset(path, 0, MAXPGPATH); - GET_2PC_FILE_PATH(path, entry->key); - - fd = open(path, O_RDWR | O_CREAT | O_EXCL, S_IRUSR | S_IWUSR); - if (fd < 0) + if (IsXidImplicit(tid)) { - elog(ERROR, "[%s] could not create file %s, errMsg: %s", - func, path, strerror(errno)); - } - - ret = write(fd, entry->info, size); - if(ret != size) + pos = strstr(tid, start_node); + if (NULL == pos) { - close(fd); - elog(ERROR, "[%s] could not write file %s, errMsg: %s, " - "ret: %d, info: %s", - func, path, strerror(errno), ret, entry->info); + elog(LOG, "[%s] %s is not on start node", __FUNCTION__, tid); + continue; } - if (size != strlen(entry->info)) - { - 
elog(LOG, "[%s] %s size change from %d to %zu, info: %s", - func, entry->key, size, strlen(entry->info), entry->info); - - Assert(size < strlen(entry->info)); - ret = write(fd, entry->info + size, strlen(entry->info) - size); - if(ret != strlen(entry->info) - size) - { - close(fd); - elog(ERROR, "[%s] could not write file %s, errMsg: %s, " - "ret: %d, info: %s", - func, path, strerror(errno), ret, entry->info); - } + elog(LOG, "[%s] %s is on start node", __FUNCTION__, tid); } - close(fd); - fsync_fname(path, false); - /* remove from hash table */ - entry = (Cache2pcInfo *)hash_search(record_2pc_cache, - entry->key, HASH_REMOVE, &found); - if (!found) + if (!save_and_remove_2pc_info(tid)) { - elog(WARNING, "[%s] %s is not found in hash table " - "when remove it", func, entry->key); + elog(LOG, "[%s] %s save to file failed", __FUNCTION__, tid); } else { - elog(LOG, "[%s] %s is removed from hash table", - func, entry->key); + elog(LOG, "[%s] %s is saved to file", __FUNCTION__, tid); } } } @@ -3392,37 +3310,315 @@ PrepareRedoRemove(TransactionId xid, bool giveWarning) #ifdef __TWO_PHASE_TRANS__ /* - * Check the entry key in the hash table is same with tid. + * check_entry_key: check the entry key in the hash table whether is same with tid. */ -inline void check_entry_key(const char *tid, const char *key, const char *func) +inline void check_entry_key(const char *tid, const char *key) { - if (!enable_2pc_entry_key_check) + if (enable_2pc_entry_key_check) { - return; - } - if (0 != strcmp(tid, key)) { - elog(PANIC, "[%s] %s get wrong key: %s", func, tid, key); + elog(PANIC, "%s(hashvalue: 0x%x) mismatch with %s(hashvalue: 0x%x)", + tid, Record2pcCacheHashCode(tid), key, Record2pcCacheHashCode(key)); + } } } -void print_record_2pc_cache(const char *func) -{ - if (NULL != record_2pc_cache) +/* + * add_2pc_info: add 2pc info to hash table + * return true: add success + * return false: add failed + */ +bool add_2pc_info(const char *tid, const char *info) { - HASH_SEQ_STATUS seq; + bool found = false; Cache2pcInfo *entry = NULL; + uint32 hashvalue = Record2pcCacheHashCode(tid); + LWLock *lock = Cache2pcPartitionLock(hashvalue); - hash_seq_init(&seq, record_2pc_cache); - while ((entry = hash_seq_search(&seq)) != NULL) + Assert(NULL != record_2pc_cache); + Assert(NULL != tid); + Assert(NULL != info); + Assert(strlen(info) < MAX_2PC_INFO_SIZE); + + LWLockAcquire(lock, LW_EXCLUSIVE); + + entry = (Cache2pcInfo *)hash_search_with_hash_value(record_2pc_cache, + tid, hashvalue, HASH_ENTER_NULL, &found); + if (NULL == entry) { + LWLockRelease(lock); + return false; + } + Assert(NULL != entry); - elog(LOG, "[print_record_2pc_cache][%s] key: %s, info: %s", - func, entry->key, entry->info); + check_entry_key(tid, entry->key); + + memcpy(entry->info, info, strlen(info) + 1); + + LWLockRelease(lock); + + if (found) + { + elog(WARNING, "[%s] found %s", __FUNCTION__, tid); + return true; + } + + if (enable_2pc_entry_trace) + { + elog(LOG, "[%s] %s is added to hash table, entry: %p, info: %s", + __FUNCTION__, tid, entry, info); + } + + return true; +} + +/* + * append_2pc_info: append 2pc info to hash table + * return true: append success + * return false: append failed + */ +bool append_2pc_info(const char *tid, const char *info, bool *overflow) + { + bool found = false; + int cur_size = 0; + int app_size = 0; + int new_size = 0; + Cache2pcInfo *entry = NULL; + uint32 hashvalue = Record2pcCacheHashCode(tid); + LWLock *lock = Cache2pcPartitionLock(hashvalue); + + Assert(NULL != record_2pc_cache); + Assert (NULL != 
tid); + Assert (NULL != info); + Assert(NULL != overflow); + Assert(strlen(info) < MAX_2PC_INFO_SIZE); + + *overflow = false; + + LWLockAcquire(lock, LW_EXCLUSIVE); + + entry = (Cache2pcInfo *)hash_search_with_hash_value(record_2pc_cache, + tid, hashvalue, HASH_FIND, &found); + if (!found) + { + /* not found */ + LWLockRelease(lock); + return false; + } + + /* found */ + Assert(NULL != entry); + check_entry_key(tid, entry->key); + + cur_size = strlen(entry->info); + app_size = strlen(info); + new_size = cur_size + app_size; + if (new_size >= MAX_2PC_INFO_SIZE) + { + /* overflow */ + LWLockRelease(lock); + elog(LOG, "[%s] %s new size(%d) overflow(%d)", + __FUNCTION__, tid, new_size, MAX_2PC_INFO_SIZE); + *overflow = true; + return false; + } + + memcpy(entry->info + cur_size, info, app_size + 1); + + Assert(strlen(entry->info) < MAX_2PC_INFO_SIZE); + + LWLockRelease(lock); + + if (enable_2pc_entry_trace) + { + elog(LOG, "[%s] %s is found in hash table", __FUNCTION__, tid); + } + + return true; + } + +/* + * remove_2pc_info: remove 2pc info from hash table + * return true: remove success + * return false: remove failed + */ +bool remove_2pc_info(const char *tid) + { + bool found = false; + Cache2pcInfo *entry = NULL; + uint32 hashvalue = Record2pcCacheHashCode(tid); + LWLock *lock = Cache2pcPartitionLock(hashvalue); + + Assert(NULL != record_2pc_cache); + Assert(NULL != tid); + + LWLockAcquire(lock, LW_EXCLUSIVE); + + entry = (Cache2pcInfo *)hash_search_with_hash_value(record_2pc_cache, + tid, hashvalue, HASH_REMOVE, &found); + + LWLockRelease(lock); + + if (!found) + { + /* not found */ + Assert(NULL == entry); + return false; + } + + /* found */ + Assert (NULL != entry); + + if (enable_2pc_entry_trace) + { + elog(LOG, "[%s] %s is removed from hash table, entry: %p", + __FUNCTION__, tid, entry); + } + + return true; +} + +/* + * get_2pc_info: get 2pc info from hash table + * return true: get success + * return false: get failed + */ +bool get_2pc_info(const char *tid, char *info) +{ + bool found = false; + Cache2pcInfo *entry = NULL; + uint32 hashvalue = Record2pcCacheHashCode(tid); + LWLock *lock = Cache2pcPartitionLock(hashvalue); + + Assert(NULL != record_2pc_cache); + Assert(NULL != tid); + Assert(NULL != info); + + LWLockAcquire(lock, LW_SHARED); + + entry = (Cache2pcInfo *)hash_search_with_hash_value(record_2pc_cache, + tid, hashvalue, HASH_FIND, &found); + if (!found) + { + /* not found */ + LWLockRelease(lock); + Assert(NULL == entry); + return false; + } + + /* found */ + Assert(NULL != entry); + check_entry_key(tid, entry->key); + + Assert(strlen(entry->info) < MAX_2PC_INFO_SIZE); + memcpy(info, entry->info, strlen(entry->info) + 1); + + LWLockRelease(lock); + return true; +} + +/* + * save_and_remove_2pc_info: save 2pc info from hash table to disk file, + * then remove it + * return true: save and remove success + * return false: save and remove failed + */ +bool save_and_remove_2pc_info(const char *tid) +{ + bool found = false; + Cache2pcInfo *entry = NULL; + File fd = -1; + int ret = 0; + int size = 0; + char path[MAXPGPATH]; + uint32 hashvalue = Record2pcCacheHashCode(tid); + LWLock *lock = Cache2pcPartitionLock(hashvalue); + + Assert(NULL != record_2pc_cache); + Assert(NULL != tid); + + memset(path, 0, MAXPGPATH); + GET_2PC_FILE_PATH(path, tid); + + LWLockAcquire(lock, LW_EXCLUSIVE); + + /* get 2pc info */ + entry = (Cache2pcInfo *)hash_search_with_hash_value(record_2pc_cache, + tid, hashvalue, HASH_FIND, &found); + if (!found) + { + /* not found */ + LWLockRelease(lock); + 
Assert(NULL == entry); + return false; + } + + /* found */ + Assert(NULL != entry); + check_entry_key(tid, entry->key); + + Assert(strlen(entry->info) < MAX_2PC_INFO_SIZE); + + if (0 == access(path, F_OK)) + { + /* file exist */ + if (enable_2pc_file_check) + { + elog(LOG, "[%s] found file %s", __FUNCTION__, path); + } + + /* remove file */ + if (0 != unlink(path)) + { + elog(WARNING, "[%s] could not unlink file %s, errMsg: %s", + __FUNCTION__, path, strerror(errno)); } + else + { + elog(LOG, "[%s] unlink file %s", __FUNCTION__, path); + } + } + + /* save to file */ + fd = open(path, O_RDWR | O_CREAT | O_EXCL, S_IRUSR | S_IWUSR); + if (fd < 0) + { + LWLockRelease(lock); + elog(ERROR, "[%s] could not create file %s, errMsg: %s", + __FUNCTION__, path, strerror(errno)); + } + + size = strlen(entry->info); + ret = write(fd, entry->info, size); + if(ret != size) + { + LWLockRelease(lock); + close(fd); + elog(ERROR, "[%s] could not write file %s, errMsg: %s, " + "ret: %d, size: %d, info: %s", + __FUNCTION__, path, strerror(errno), ret, size, entry->info); } + close(fd); + + /* remove 2pc info */ + entry = (Cache2pcInfo *)hash_search_with_hash_value(record_2pc_cache, + tid, hashvalue, HASH_REMOVE, &found); + + LWLockRelease(lock); + + Assert(found); + Assert(NULL != entry); + + if (enable_2pc_entry_trace) + { + elog(LOG, "[%s] %s is removed from hash table, entry: %p", + __FUNCTION__, tid, entry); + } + + return true; } + /* * Check whether the 2pc file is exist when it is saved in the hash table. */ @@ -3433,12 +3629,13 @@ void check_2pc_file(const char *tid, const char *info, const char *func) int size = 0; struct stat filestate; char path[MAXPGPATH]; - Cache2pcInfo *entry = NULL; - bool found = false; + int ret = 0; + File fd = -1; + char result[MAX_2PC_INFO_SIZE]; - Assert (NULL != tid); - Assert (NULL != info); - Assert (NULL != func); + Assert(NULL != tid); + Assert(NULL != info); + Assert(NULL != func); GET_2PC_FILE_PATH(path, tid); if (0 != access(path, F_OK)) @@ -3446,106 +3643,86 @@ void check_2pc_file(const char *tid, const char *info, const char *func) return; } - elog(LOG, "[check_2pc_file][%s] node(%s) found file %s", - func, PGXCNodeName, path); + elog(LOG, "[check_2pc_file][%s] found file %s", func, path); if(stat(path, &filestate) == -1) { - elog(ERROR, "[check_2pc_file][%s] could not get status of file %s", - func, path); + elog(WARNING, "[check_2pc_file][%s] could not stat file %s, info: %s", + func, path, info); + return; } size = filestate.st_size; - if (0 != size) - { - int ret = 0; - File fd = -1; - char result[size + 1]; - - fd = PathNameOpenFile(path, O_RDONLY, S_IRUSR | S_IWUSR); - if (fd < 0) - { - elog(ERROR, "[check_2pc_file][%s] could not open file %s for read", - func, path); - } - - memset(result, 0, size +1); - ret = FileRead(fd, result, size, WAIT_EVENT_BUFFILE_READ); - if(ret != size) - { - FileClose(fd); - elog(ERROR, "[check_2pc_file][%s] read %s error, ret: %d, size: %d", - func, path, ret, size); - } - FileClose(fd); - - if (0 != strcmp(result, info)) - { - elog(LOG, "[check_2pc_file][%s] file %s result: %s, info: %s", - func, path, result, info); - } - } - else + if (0 == size) { - elog(LOG, "[check_2pc_file][%s] get empty file %s, info: %s", + elog(WARNING, "[check_2pc_file][%s] file %s is empty, info: %s", func, path, info); + return; } - if (NULL == record_2pc_cache) + if (size >= MAX_2PC_INFO_SIZE) { - elog(LOG, "[check_2pc_file][%s] record_2pc_cache is NULL, " - "tid: %s, info: %s", func, tid, info); + elog(WARNING, "[check_2pc_file][%s] file %s 
size(%d) overflow(%d)", + func, path, size, MAX_2PC_INFO_SIZE); return; } - entry = (Cache2pcInfo *)hash_search(record_2pc_cache, - tid, HASH_FIND, &found); - if (!found) + fd = PathNameOpenFile(path, O_RDONLY, S_IRUSR | S_IWUSR); + if (fd < 0) { - elog(LOG, "[check_2pc_file][%s] %s is not found " - "in hash table, info: %s", func, tid, info); + elog(WARNING, "[check_2pc_file][%s] could not open file %s, " + "errMsg: %s", func, path, strerror(errno)); return; } - Assert (NULL != entry); + memset(result, 0, size +1); + ret = FileRead(fd, result, size, WAIT_EVENT_BUFFILE_READ); + if(ret != size) + { + FileClose(fd); + elog(WARNING, "[check_2pc_file][%s] could not read file %s, " + "ret: %d, file size: %d", func, path, ret, size); + return; + } + FileClose(fd); - if (0 != strcmp(entry->info, info)) + if (0 != strcmp(result, info)) { - elog(LOG, "[check_2pc_file][%s] %s info change from '%s' to '%s'", - func, tid, info, entry->info); + elog(LOG, "[check_2pc_file][%s] file %s mismatch, " + "result: %s, info: %s", func, path, result, info); } } } void record_2pc_redo_remove_gid_xid(TransactionId xid) { - int i; - GlobalTransaction gxact = NULL; - bool found = false; + int i; + GlobalTransaction gxact = NULL; + bool found = false; if(!enable_2pc_recovery_info) { - return ; + return; } - for (i = 0; i < TwoPhaseState->numPrepXacts; i++) - { - gxact = TwoPhaseState->prepXacts[i]; + for (i = 0; i < TwoPhaseState->numPrepXacts; i++) + { + gxact = TwoPhaseState->prepXacts[i]; - if (gxact->xid == xid) - { - found = true; - break; - } - } + if (gxact->xid == xid) + { + found = true; + break; + } + } - Assert(RecoveryInProgress()); + Assert(RecoveryInProgress()); - if (found) - { - remove_2pc_records(gxact->gid, false); - } + if (found) + { + remove_2pc_records(gxact->gid, false); + } } void record_2pc_involved_nodes_xid(const char * tid, @@ -3553,46 +3730,42 @@ void record_2pc_involved_nodes_xid(const char * tid, GlobalTransactionId startxid, char * nodestring, GlobalTransactionId xid) -{// #lizard forgives - File fd = 0; - int ret = 0; - int size = 0; - StringInfoData content; - struct stat fst; - char path[MAXPGPATH]; - off_t fileSize; - char *result = NULL; - Cache2pcInfo *entry = NULL; - bool found = false; - char *func = "record_2pc_involved_nodes_xid"; +{ + File fd = 0; + int ret = 0; + int size = 0; + StringInfoData content; + struct stat fst; + char path[MAXPGPATH]; + char *result = NULL; #ifdef __TWO_PHASE_TESTS__ - XLogRecPtr xlogrec = 0; + XLogRecPtr xlogrec = 0; #endif - - if (!enable_2pc_recovery_info) - { - return ; - } + + if (!enable_2pc_recovery_info) + { + return; + } if (enable_distri_print || enable_2pc_entry_trace) { elog(LOG, "[%s] record %s, startnode: %s, participants: %s", - func, tid, startnode, nodestring); + __FUNCTION__, tid, startnode, nodestring); } if (NULL == tid || '\0' == tid[0]) { - elog(ERROR, "[%s] gid is empty", func); + elog(ERROR, "[%s] gid is empty", __FUNCTION__); } if (NULL == startnode || '\0' == startnode[0]) { - elog(PANIC, "[%s] %s startnode is empty", func, tid); + elog(PANIC, "[%s] %s startnode is empty", __FUNCTION__, tid); } if (NULL == nodestring || '\0' == nodestring[0]) { - elog(PANIC, "[%s] %s participants is empty", func, tid); + elog(PANIC, "[%s] %s participants is empty", __FUNCTION__, tid); } initStringInfo(&content); @@ -3609,19 +3782,20 @@ void record_2pc_involved_nodes_xid(const char * tid, /* if tid already exists, check content and return */ if (NULL != record_2pc_cache) { + char info[MAX_2PC_INFO_SIZE]; + Assert(strlen(tid) < 
MAX_TID_SIZE); - entry = (Cache2pcInfo *)hash_search(record_2pc_cache, tid, HASH_FIND, &found); - if (found) + + if (get_2pc_info(tid, info)) { - Assert(NULL != entry); - check_entry_key(tid, entry->key, func); - check_2pc_file(tid, entry->info, func); + Assert(strlen(info) < MAX_2PC_INFO_SIZE); + check_2pc_file(tid, info, __FUNCTION__); - if (strncmp(entry->info, content.data, size) != 0) + if (strncmp(info, content.data, size) != 0) { elog(ERROR, "[%s] pg_clean attemp to write %s info conflict, " - "content: %s, info: %s", - func, tid, content.data, entry->info); + "content: %s, info: %s", __FUNCTION__, tid, + content.data, info); } resetStringInfo(&content); @@ -3635,23 +3809,21 @@ void record_2pc_involved_nodes_xid(const char * tid, /* if file already exists, check content and return */ if (stat(path, &fst) >= 0) { - fileSize = fst.st_size; - result = (char *)palloc0(fileSize + 1); + int file_size = fst.st_size; + result = (char *)palloc0(file_size + 1); fd = PathNameOpenFile(path, O_RDONLY, S_IRUSR | S_IWUSR); if (fd < 0) { - ereport(ERROR, - (errcode_for_file_access(), - errmsg("[%s] could not open file %s for read", func, path))); + elog(ERROR, "[%s] could not open file %s, errMsg: %s", + __FUNCTION__, path, strerror(errno)); } - ret = FileRead(fd, result, fileSize, WAIT_EVENT_BUFFILE_READ); - if(ret != fileSize) + ret = FileRead(fd, result, file_size, WAIT_EVENT_BUFFILE_READ); + if(ret != file_size) { FileClose(fd); - ereport(ERROR, - (errcode_for_file_access(), - errmsg("[%s] could not read file %s, ret: %d", func, path, ret))); + elog(ERROR, "[%s] could not read file %s, ret: %d, file_size: %d", + __FUNCTION__, path, ret, file_size); } FileClose(fd); @@ -3661,7 +3833,7 @@ void record_2pc_involved_nodes_xid(const char * tid, { elog(ERROR, "[%s] pg_clean attemp to write %s info conflict, " "content: %s, info: %s", - func, tid, content.data, result); + __FUNCTION__, tid, content.data, result); } pfree(result); @@ -3692,98 +3864,34 @@ void record_2pc_involved_nodes_xid(const char * tid, run_pg_clean = 1; complish = true; elog(STOP, "[%s] twophase exception: simulate kill start node " - "after record 2pc file", func); + "after record 2pc file", __FUNCTION__); } #endif } - if (NULL != record_2pc_cache && size < MAX_2PC_INFO_SIZE) - { - Assert(strlen(tid) < MAX_TID_SIZE); - entry = (Cache2pcInfo *)hash_search(record_2pc_cache, - tid, HASH_ENTER_NULL, &found); - if (NULL != entry) - { - check_entry_key(tid, entry->key, func); - check_2pc_file(tid, entry->info, func); - - if (found) - { - if (RecoveryInProgress()) - { - elog(LOG, "[%s] %s is found in hash table in recovery mode", - func, tid); - } - else - { - elog(LOG, "[%s] %s is added to hash table, entry: %p, " - "record_2pc_cache: %p, hashvalue: %u", func, tid, entry, - record_2pc_cache, string_hash(tid, MAX_TID_SIZE)); - } - } - else if (enable_2pc_entry_trace || enable_2pc_hash_table_check) - { - elog(LOG, "[%s] %s is added to hash table, entry: %p, " - "record_2pc_cache: %p, hashvalue: %u", func, tid, entry, - record_2pc_cache, string_hash(tid, MAX_TID_SIZE)); - } - - memcpy(entry->info, content.data, size + 1); - check_entry_key(tid, entry->key, func); - - if (enable_2pc_hash_table_check) + if (NULL != record_2pc_cache) { - int retry_times = 0; - Cache2pcInfo *entry_debug = NULL; - - GET_2PC_FILE_PATH(path, tid); - - while (retry_times++ < HASH_TAB_RETRY_MAX) + if (size < MAX_2PC_INFO_SIZE) { - entry_debug = (Cache2pcInfo *)hash_search(record_2pc_cache, - tid, HASH_FIND, &found); - if (found) - { - Assert(NULL != entry_debug); - 
check_entry_key(tid, entry_debug->key, func); - break; - } - - /* not found */ - elog(LOG, "[%s] %s is not found in hash table, retry times: %d", - func, tid, retry_times); - - Assert(NULL == entry_debug); - - if (0 == access(path, F_OK)) - { - elog(LOG, "[%s] %s found 2pc file %s", func, tid, path); - break; - } - - print_record_2pc_cache(func); - pg_usleep(HASH_TAB_RETRY_SLEEP); - } + Assert(strlen(tid) < MAX_TID_SIZE); - if (retry_times >= HASH_TAB_RETRY_MAX) + if (add_2pc_info(tid, content.data)) { - elog(PANIC, "[%s] %s is not found in hash table", func, tid); - } - } + check_2pc_file(tid, content.data, __FUNCTION__); resetStringInfo(&content); pfree(content.data); + return; } + + elog(LOG, "[%s] %s add to cache failed", __FUNCTION__, tid); + } else { - elog(LOG, "[%s] %s entry is NULL", func, tid); - } + elog(LOG, "[%s] %s info size(%d) overflow(%d)", + __FUNCTION__, tid, size, MAX_2PC_INFO_SIZE); } - else if (NULL != record_2pc_cache) - { - elog(LOG, "[%s] %s size: %d, max info size: %d", - func, tid, size, MAX_2PC_INFO_SIZE); } GET_2PC_FILE_PATH(path, tid); @@ -3807,7 +3915,7 @@ void record_2pc_involved_nodes_xid(const char * tid, if (fd < 0) { elog(ERROR, "[%s] could not create file %s, errMsg: %s", - func, path, strerror(errno)); + __FUNCTION__, path, strerror(errno)); return; } @@ -3816,7 +3924,7 @@ void record_2pc_involved_nodes_xid(const char * tid, { FileClose(fd); elog(ERROR, "[%s] could not write file %s, errMsg: %s, ret: %d, content: %s", - func, path, strerror(errno), ret, content.data); + __FUNCTION__, path, strerror(errno), ret, content.data); } FileClose(fd); @@ -3833,12 +3941,7 @@ void record_2pc_commit_timestamp(const char *tid, GlobalTimestamp commit_timesta File fd = -1; int ret = 0; int size = 0; - int new_size = 0; - int retry_times = 0; XLogRecPtr xlogrec = 0; - Cache2pcInfo *entry = NULL; - bool found = false; - char *func = "record_2pc_commit_timestamp"; if (!enable_2pc_recovery_info) { @@ -3848,7 +3951,7 @@ void record_2pc_commit_timestamp(const char *tid, GlobalTimestamp commit_timesta if (enable_distri_print || enable_2pc_entry_trace) { elog(LOG, "[%s] %s commit_timestamp: "INT64_FORMAT, - func, tid, commit_timestamp); + __FUNCTION__, tid, commit_timestamp); } Assert(tid[0] != '\0'); if (InvalidGlobalTimestamp == commit_timestamp && @@ -3856,7 +3959,7 @@ void record_2pc_commit_timestamp(const char *tid, GlobalTimestamp commit_timesta TWO_PHASE_COMMIT_END == g_twophase_state.state)) { elog(ERROR, "[%s] could not commit transaction '%s' on node '%s' " - "with InvalidGlobalTimestamp", func, tid, PGXCNodeName); + "with InvalidGlobalTimestamp", __FUNCTION__, tid, PGXCNodeName); } if (!RecoveryInProgress()) @@ -3879,121 +3982,39 @@ void record_2pc_commit_timestamp(const char *tid, GlobalTimestamp commit_timesta size = content.len; Assert(size == strlen(content.data)); - GET_2PC_FILE_PATH(path, tid); - - while (NULL != record_2pc_cache && retry_times++ < HASH_TAB_RETRY_MAX) - { - Assert(strlen(tid) < MAX_TID_SIZE); - entry = (Cache2pcInfo *)hash_search(record_2pc_cache, tid, HASH_FIND, &found); - if (found) - { - Assert(NULL != entry); - check_entry_key(tid, entry->key, func); - check_2pc_file(tid, entry->info, func); - - if (RecoveryInProgress()) - { - elog(LOG, "[%s] %s is found in hash table in recovery mode", - func, tid); - } - else if (enable_2pc_entry_trace) + if (NULL != record_2pc_cache) { - elog(LOG, "[%s] %s is found in hash table", func, tid); - } + bool overflow = false; - new_size = size + strlen(entry->info); + Assert(strlen(tid) < MAX_TID_SIZE); - 
if (new_size < MAX_2PC_INFO_SIZE) + if (append_2pc_info(tid, content.data, &overflow)) { - /* save to hash table */ - memcpy(entry->info + strlen(entry->info), content.data, size + 1); - check_entry_key(tid, entry->key, func); - resetStringInfo(&content); pfree(content.data); return; } - /* save to file */ - elog(LOG, "[%s] %s new size(%d) overflow(%d)", - func, tid, new_size, MAX_2PC_INFO_SIZE); - - GET_2PC_FILE_PATH(path, tid); - - if (RecoveryInProgress()) - { - fd = PathNameOpenFile(path, O_RDWR | O_TRUNC | O_CREAT, - S_IRUSR | S_IWUSR); - } - else - { - fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL, - S_IRUSR | S_IWUSR); - } - if (fd < 0) - { - elog(ERROR, "[%s] could not append timestamp, file %s, errMsg: %s", - func, path, strerror(errno)); - } - - ret = FileWrite(fd, entry->info, strlen(entry->info), - WAIT_EVENT_BUFFILE_WRITE); - if(ret != strlen(entry->info)) - { - FileClose(fd); - elog(ERROR, "[%s] could not write file %s, errMsg: %s, " - "ret: %d, info: %s", - func, path, strerror(errno), ret, entry->info); - } - ret = FileWrite(fd, content.data, size, - WAIT_EVENT_BUFFILE_WRITE); - if(ret != size) + if (overflow) { - FileClose(fd); - elog(ERROR, "[%s] could not write file %s, errMsg: %s, " - "ret: %d, info: %s", - func, path, strerror(errno), ret, content.data); - } - FileClose(fd); + elog(LOG, "[%s] %s is overflow", __FUNCTION__, tid); - /* remove from hash table */ - entry = (Cache2pcInfo *)hash_search(record_2pc_cache, - tid, HASH_REMOVE, &found); - if (!found) + if (save_and_remove_2pc_info(tid)) { - elog(WARNING, "[%s] %s is not found in hash table when remove it", - func, tid); + elog(LOG, "[%s] %s save to file", __FUNCTION__, tid); } - else if (enable_2pc_entry_trace) + else { - elog(LOG, "[%s] %s is removed from hash table", func, entry->key); + elog(LOG, "[%s] %s save to file failed", __FUNCTION__, tid); } - - resetStringInfo(&content); - pfree(content.data); - return; } - - /* not found */ - elog(LOG, "[%s] %s is not found in hash table, retry times: %d", - func, tid, retry_times); - - Assert(NULL == entry); - if (0 == access(path, F_OK)) + else { - elog(LOG, "[%s] %s found 2pc file %s", func, tid, path); - break; + elog(LOG, "[%s] %s is not found in hash table", __FUNCTION__, tid); } - - print_record_2pc_cache(func); - - pg_usleep(HASH_TAB_RETRY_SLEEP); } - if (NULL != record_2pc_cache) - { - elog(LOG, "[%s] %s is not found in hash table, get from disk", func, tid); - } + GET_2PC_FILE_PATH(path, tid); /* the 2pc file exists already */ fd = PathNameOpenFile(path, O_RDWR | O_APPEND, S_IRUSR | S_IWUSR); @@ -4014,17 +4035,17 @@ void record_2pc_commit_timestamp(const char *tid, GlobalTimestamp commit_timesta if (0 == strcmp(gxact->gid, tid)) { elog(ERROR, "[%s] could not append timestamp in file %s, " - "errMsg: %s", func, path, strerror(errno)); + "errMsg: %s", __FUNCTION__, path, strerror(errno)); } } #endif elog(LOG, "[%s] could not open file %s, errMsg: %s", - func, path, strerror(errno)); + __FUNCTION__, path, strerror(errno)); } else { elog(PANIC, "[%s] could not open file %s, errMsg: %s", - func, path, strerror(errno)); + __FUNCTION__, path, strerror(errno)); } return; } @@ -4034,7 +4055,7 @@ void record_2pc_commit_timestamp(const char *tid, GlobalTimestamp commit_timesta memset(file_content, 0, FILE_CONTENT_SIZE); ret = FileRead(fd, file_content, FILE_CONTENT_SIZE, WAIT_EVENT_BUFFILE_READ); elog(LOG, "[%s] before append file: %s, file_content: %s, content.data: %s, " - "ret: %d", func, path, file_content, content.data, ret); + "ret: %d", __FUNCTION__, 
path, file_content, content.data, ret); } ret = FileWrite(fd, content.data, size, WAIT_EVENT_BUFFILE_WRITE); @@ -4042,7 +4063,7 @@ void record_2pc_commit_timestamp(const char *tid, GlobalTimestamp commit_timesta { FileClose(fd); elog(ERROR, "[%s] could not write file %s, errMsg: %s", - func, path, strerror(errno)); + __FUNCTION__, path, strerror(errno)); } if (enable_distri_print) @@ -4051,7 +4072,7 @@ void record_2pc_commit_timestamp(const char *tid, GlobalTimestamp commit_timesta FileSeek(fd, 0, SEEK_SET); ret = FileRead(fd, file_content, FILE_CONTENT_SIZE, WAIT_EVENT_BUFFILE_READ); elog(LOG, "[%s] after append file: %s, file_content: %s, ret: %d", - func, path, file_content, ret); + __FUNCTION__, path, file_content, ret); } FileClose(fd); @@ -4063,9 +4084,6 @@ void record_2pc_commit_timestamp(const char *tid, GlobalTimestamp commit_timesta void remove_2pc_records(const char * tid, bool record_in_xlog) { char path[MAXPGPATH]; - Cache2pcInfo *entry = NULL; - bool found = false; - char *func = "remove_2pc_records"; if (!enable_2pc_recovery_info) { @@ -4074,7 +4092,7 @@ void remove_2pc_records(const char * tid, bool record_in_xlog) if (enable_distri_print || enable_2pc_entry_trace) { - elog(LOG, "[%s] %s record_in_xlog: %d", func, tid, record_in_xlog); + elog(LOG, "[%s] %s record_in_xlog: %d", __FUNCTION__, tid, record_in_xlog); } if (!RecoveryInProgress() && record_in_xlog) @@ -4086,35 +4104,36 @@ void remove_2pc_records(const char * tid, bool record_in_xlog) XLogInsert(RM_XLOG_ID, XLOG_CLEAN_2PC_FILE); } + GET_2PC_FILE_PATH(path, tid); + if (NULL != record_2pc_cache) { Assert(strlen(tid) < MAX_TID_SIZE); + if (enable_2pc_entry_key_check) { - entry = (Cache2pcInfo *)hash_search(record_2pc_cache, - tid, HASH_FIND, &found); - if (found) + char info[MAX_2PC_INFO_SIZE]; + if (get_2pc_info(tid, info)) { - Assert(NULL != entry); - check_entry_key(tid, entry->key, func); - check_2pc_file(tid, entry->info, func); + Assert(strlen(info) < MAX_2PC_INFO_SIZE); + check_2pc_file(tid, info, __FUNCTION__); } } - entry = (Cache2pcInfo *)hash_search(record_2pc_cache, - tid, HASH_REMOVE, &found); - if (found) + + /* remove from hash table */ + if (remove_2pc_info(tid)) { - Assert(NULL != entry); - if (enable_2pc_entry_trace) + if (enable_2pc_file_check) { - elog(LOG, "[%s] %s is removed from hash table", func, tid); + if (0 == access(path, F_OK)) + { + elog(LOG, "[%s] still found file %s", __FUNCTION__, path); + } } return; } } - GET_2PC_FILE_PATH(path, tid); - /* * no need to check file exists. * since when it do not exists, unlink won't success. 
@@ -4122,7 +4141,7 @@ void remove_2pc_records(const char * tid, bool record_in_xlog) if (0 != unlink(path)) { elog(LOG, "[%s] could not unlink file %s, errMsg: %s", - func, path, strerror(errno)); + __FUNCTION__, path, strerror(errno)); } } @@ -4130,11 +4149,6 @@ void rename_2pc_records(const char *tid, TimestampTz timestamp) { char path[MAXPGPATH]; char new_path[MAXPGPATH]; - Cache2pcInfo *entry = NULL; - bool found = false; - File fd = 0; - int ret = 0; - char *func = "rename_2pc_records"; if (!enable_2pc_recovery_info) { @@ -4143,7 +4157,7 @@ void rename_2pc_records(const char *tid, TimestampTz timestamp) if (enable_distri_print || enable_2pc_entry_trace) { - elog(LOG, "[%s] %s timestamp: "INT64_FORMAT, func, tid, timestamp); + elog(LOG, "[%s] %s timestamp: "INT64_FORMAT, __FUNCTION__, tid, timestamp); } if (0 == timestamp) @@ -4161,103 +4175,64 @@ void rename_2pc_records(const char *tid, TimestampTz timestamp) XLogInsert(RM_XLOG_ID, XLOG_CLEAN_2PC_FILE); } - GET_2PC_FILE_PATH(path, tid); - snprintf(new_path, MAXPGPATH, "%s." INT64_FORMAT ".rollback", path, timestamp); - if (NULL != record_2pc_cache) { Assert(strlen(tid) < MAX_TID_SIZE); - entry = (Cache2pcInfo *)hash_search(record_2pc_cache, - tid, HASH_FIND, &found); - if (found) - { - Assert(NULL != entry); - check_entry_key(tid, entry->key, func); - check_2pc_file(tid, entry->info, func); - if (0 == access(new_path, F_OK)) - { - if (RecoveryInProgress()) + if (save_and_remove_2pc_info(tid)) { - elog(LOG, "[%s] file %s exist", func, new_path); + elog(LOG, "[%s] %s save to file", __FUNCTION__, tid); } else { - elog(WARNING, "[%s] file %s exist", func, new_path); - } - if (0 != unlink(new_path)) - { - elog(ERROR, "[%s] could not unlink file %s, errMsg: %s", - func, new_path, strerror(errno)); + elog(LOG, "[%s] %s save to file failed", __FUNCTION__, tid); } } - fd = PathNameOpenFile(new_path, O_RDWR | O_CREAT | O_EXCL, - S_IRUSR | S_IWUSR); - if (fd < 0) - { - elog(ERROR, "[%s] could not create file %s, errMsg: %s", - func, new_path, strerror(errno)); - } + GET_2PC_FILE_PATH(path, tid); + snprintf(new_path, MAXPGPATH, "%s." 
INT64_FORMAT ".rollback", path, timestamp); - ret = FileWrite(fd, entry->info, strlen(entry->info), - WAIT_EVENT_BUFFILE_WRITE); - if(ret != strlen(entry->info)) + if (0 != access(path, F_OK)) { - FileClose(fd); - elog(ERROR, "[%s] could not write file %s, errMsg: %s, " - "ret: %d, info: %s", - func, path, strerror(errno), ret, entry->info); - } - FileClose(fd); - - entry = (Cache2pcInfo *)hash_search(record_2pc_cache, - tid, HASH_REMOVE, &found); - if (!found) + if (RecoveryInProgress()) { - elog(ERROR, "[%s] %s is not found in hash table when remove it", - func, tid); + elog(LOG, "[%s] could not access file %s in recovery mode, errMsg: %s", + __FUNCTION__, path, strerror(errno)); } - else if (enable_2pc_entry_trace) + else { - elog(LOG, "[%s] %s is removed from hash table", func, tid); - } - return; - } + elog(WARNING, "[%s] could not access file %s, errMsg: %s", + __FUNCTION__, path, strerror(errno)); } - if (0 != access(path, F_OK)) - { - elog(LOG, "[%s] could not access file %s, errMsg: %s", - func, path, strerror(errno)); return; } if (0 == access(new_path, F_OK)) { if (RecoveryInProgress()) { - elog(LOG, "[%s] file %s exist", func, new_path); + elog(LOG, "[%s] file %s exist", __FUNCTION__, new_path); } else { - elog(WARNING, "[%s] file %s exist", func, new_path); + elog(WARNING, "[%s] file %s exist", __FUNCTION__, new_path); } if (0 != unlink(new_path)) { elog(WARNING, "[%s] could not unlink file %s, errMsg: %s", - func, new_path, strerror(errno)); + __FUNCTION__, new_path, strerror(errno)); return; } } if (0 != link(path, new_path)) { elog(ERROR, "[%s] could not link file %s to %s, errMsg: %s", - func, path, new_path, strerror(errno)); + __FUNCTION__, path, new_path, strerror(errno)); } if (0 != unlink(path)) { elog(WARNING, "[%s] could not unlink file %s, errMsg: %s", - func, path, strerror(errno)); + __FUNCTION__, path, strerror(errno)); } } @@ -4266,10 +4241,7 @@ void record_2pc_readonly(const char *gid) File fd = 0; int ret = 0; char path[MAXPGPATH]; - char content[10] = "readonly"; - Cache2pcInfo *entry = NULL; - bool found = false; - char *func = "record_2pc_readonly"; + char *content = "readonly"; if(!enable_2pc_recovery_info) { @@ -4278,7 +4250,7 @@ void record_2pc_readonly(const char *gid) if (enable_distri_print || enable_2pc_entry_trace) { - elog(LOG, "[%s] %s is readonly", func, gid); + elog(LOG, "[%s] %s is readonly", __FUNCTION__, gid); } if (!RecoveryInProgress()) @@ -4293,39 +4265,14 @@ void record_2pc_readonly(const char *gid) { Assert(strlen(gid) < MAX_TID_SIZE); Assert(strlen(content) < MAX_2PC_INFO_SIZE); - entry = (Cache2pcInfo *)hash_search(record_2pc_cache, - gid, HASH_ENTER_NULL, &found); - if (NULL != entry) - { - check_entry_key(gid, entry->key, func); - check_2pc_file(gid, entry->info, func); - if (found) - { - if (RecoveryInProgress()) - { - elog(LOG, "[%s] %s is found in hash table in recovery mode", - func, gid); - } - else - { - elog(LOG, "[%s] %s is found in hash table", func, gid); - } - } - else if (enable_2pc_entry_trace) + if (add_2pc_info(gid, content)) { - elog(LOG, "[%s] %s is added to hash table", func, gid); - } - - memcpy(entry->info, content, strlen(content) + 1); - check_entry_key(gid, entry->key, func); - + check_2pc_file(gid, content, __FUNCTION__); return; } - else - { - elog(LOG, "[%s] %s entry is NULL", func, gid); - } + + elog(LOG, "[%s] %s add to cache failed", __FUNCTION__, gid); } /* the 2pc dir is already created in initdb */ @@ -4351,7 +4298,7 @@ void record_2pc_readonly(const char *gid) if (fd < 0) { elog(ERROR, "[%s] could 
not create file %s, errMsg: %s", - func, path, strerror(errno)); + __FUNCTION__, path, strerror(errno)); return; } @@ -4360,7 +4307,7 @@ void record_2pc_readonly(const char *gid) { FileClose(fd); elog(ERROR, "[%s] could not write file %s, errMsg: %s, ret: %d, content: %s", - func, path, strerror(errno), ret, content); + __FUNCTION__, path, strerror(errno), ret, content); } FileClose(fd); } @@ -4370,35 +4317,36 @@ void record_2pc_readonly(const char *gid) */ char *get_2pc_info_from_cache(const char *tid) { - Cache2pcInfo *entry = NULL; - bool found = false; - char *func = "get_2pc_info_from_cache"; + char *info = NULL; - if (NULL != record_2pc_cache) + if (NULL == record_2pc_cache) { + return NULL; + } + Assert(strlen(tid) < MAX_TID_SIZE); - entry = (Cache2pcInfo *)hash_search(record_2pc_cache, - tid, HASH_FIND, &found); - if (found) - { - Assert(NULL != entry); - check_entry_key(tid, entry->key, func); + info = (char *)palloc0(MAX_2PC_INFO_SIZE); + if (get_2pc_info(tid, info)) + { + Assert(strlen(info) < MAX_2PC_INFO_SIZE); + check_2pc_file(tid, info, __FUNCTION__); if (enable_2pc_entry_trace) { - elog(LOG, "[%s] %s is found in hast table, key: %s, info: %s", - func, tid, entry->key, entry->info); + elog(LOG, "[%s] %s is found in hash table", __FUNCTION__, tid); } - return entry->info; + return info; } + pfree(info); + if (enable_2pc_entry_trace) { - elog(LOG, "[%s] %s is not found in hast table", func, tid); - } + elog(LOG, "[%s] %s is not found in hash table", __FUNCTION__, tid); } + return NULL; } @@ -4410,7 +4358,8 @@ char *get_2pc_list_from_cache(int *count) HASH_SEQ_STATUS seq; Cache2pcInfo *entry = NULL; char *recordList = NULL; - char *func = "get_2pc_list_from_cache"; + + Assert(NULL != count); if (NULL == record_2pc_cache) { @@ -4421,12 +4370,7 @@ char *get_2pc_list_from_cache(int *count) while ((entry = hash_seq_search(&seq)) != NULL) { Assert(NULL != entry); - check_2pc_file(entry->key, entry->info, func); - - if (NULL != count && *count >= MAX_OUTPUT_FILE) - { - break; - } + check_2pc_file(entry->key, entry->info, __FUNCTION__); if(NULL == recordList) { @@ -4439,9 +4383,10 @@ char *get_2pc_list_from_cache(int *count) strlen(entry->key) + strlen(recordList) + 2); sprintf(recordList, "%s,%s", recordList, entry->key); } - if (NULL != count) + + if (++(*count) >= MAX_OUTPUT_FILE) { - (*count)++; + break; } } @@ -4465,11 +4410,11 @@ Record2pcCacheInit(void) info.keysize = MAX_TID_SIZE; info.entrysize = record_2pc_entry_size; - info.num_partitions = record_2pc_partitions; + info.num_partitions = NUM_CACHE_2PC_PARTITIONS; flags = HASH_ELEM | HASH_PARTITION; - record_2pc_cache = ShmemInitHash("Record 2pc Cache", + record_2pc_cache = ShmemInitHash("Record 2pc cache", record_2pc_cache_size, record_2pc_cache_size, &info, flags); } @@ -4488,4 +4433,21 @@ Record2pcCacheSize(void) return cache_size; } +/* + * Record2pcCacheHashCode + * Compute the hash code associated with a tid + * + * This must be passed to the lookup/insert/delete routines along with the + * tag. We do it like this because the callers need to know the hash code + * in order to determine which buffer partition to lock, and we don't want + * to do the hash computation twice. 
+ */ +uint32 +Record2pcCacheHashCode(const char *tid) +{ + Assert(NULL != record_2pc_cache); + Assert(NULL != tid); + return get_hash_value(record_2pc_cache, tid); +} + #endif diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c index 342621e3..2a555ebc 100644 --- a/src/backend/storage/lmgr/lwlock.c +++ b/src/backend/storage/lmgr/lwlock.c @@ -457,6 +457,13 @@ InitializeLWLocks(void) for (id = 0; id < NUM_PREDICATELOCK_PARTITIONS; id++, lock++) LWLockInitialize(&lock->lock, LWTRANCHE_PREDICATE_LOCK_MANAGER); + /* Initialize 2pc info cache LWLocks in main array */ + lock = MainLWLockArray + NUM_INDIVIDUAL_LWLOCKS + + NUM_BUFFER_PARTITIONS + NUM_LOCK_PARTITIONS + + NUM_PREDICATELOCK_PARTITIONS; + for (id = 0; id < NUM_CACHE_2PC_PARTITIONS; id++, lock++) + LWLockInitialize(&lock->lock, LWTRANCHE_2PC_INFO_CACHE); + /* Initialize named tranches. */ if (NamedLWLockTrancheRequests > 0) { diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 1f205d34..378f9c7a 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -2705,7 +2705,7 @@ static struct config_bool ConfigureNamesBool[] = NULL }, &enable_2pc_file_check, - true, + false, NULL, NULL, NULL }, { @@ -2726,15 +2726,6 @@ static struct config_bool ConfigureNamesBool[] = false, NULL, NULL, NULL }, - { - {"enable_2pc_hash_table_check", PGC_USERSET, CUSTOM_OPTIONS, - gettext_noop("Enable 2PC hash table check."), - NULL - }, - &enable_2pc_hash_table_check, - false, - NULL, NULL, NULL - }, #endif #ifdef __TBASE__ @@ -4809,14 +4800,6 @@ static struct config_int ConfigureNamesInt[] = 2048, 1028, INT_MAX, NULL, NULL, NULL }, - { - {"record_2pc_partitions", PGC_POSTMASTER, CUSTOM_OPTIONS, - gettext_noop("2PC info cache partition number."), - }, - &record_2pc_partitions, - 32, 1, INT_MAX, - NULL, NULL, NULL - }, #endif #ifdef __TBASE__ diff --git a/src/include/access/twophase.h b/src/include/access/twophase.h index e0fe09d2..06f9685e 100644 --- a/src/include/access/twophase.h +++ b/src/include/access/twophase.h @@ -101,11 +101,9 @@ extern bool enable_2pc_file_cache; extern bool enable_2pc_file_check; extern bool enable_2pc_entry_key_check; extern bool enable_2pc_entry_trace; -extern bool enable_2pc_hash_table_check; extern int record_2pc_cache_size; extern int record_2pc_entry_size; -extern int record_2pc_partitions; #endif extern Size TwoPhaseShmemSize(void); diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h index 4a088f5e..25ee91a8 100644 --- a/src/include/storage/lwlock.h +++ b/src/include/storage/lwlock.h @@ -173,6 +173,9 @@ extern PGDLLIMPORT int NamedLWLockTrancheRequests; /* Number of partitions of the shared buffer mapping hashtable */ #define NUM_BUFFER_PARTITIONS 128 +/* Number of partitions of the 2pc info cache hashtable */ +#define NUM_CACHE_2PC_PARTITIONS 128 + /* Number of partitions the shared lock tables are divided into */ #define LOG2_NUM_LOCK_PARTITIONS 4 #define NUM_LOCK_PARTITIONS (1 << LOG2_NUM_LOCK_PARTITIONS) @@ -187,9 +190,10 @@ extern PGDLLIMPORT int NamedLWLockTrancheRequests; (BUFFER_MAPPING_LWLOCK_OFFSET + NUM_BUFFER_PARTITIONS) #define PREDICATELOCK_MANAGER_LWLOCK_OFFSET \ (LOCK_MANAGER_LWLOCK_OFFSET + NUM_LOCK_PARTITIONS) -#define NUM_FIXED_LWLOCKS \ +#define CACHE_2PC_LWLOCK_OFFSET \ (PREDICATELOCK_MANAGER_LWLOCK_OFFSET + NUM_PREDICATELOCK_PARTITIONS) - +#define NUM_FIXED_LWLOCKS \ + (CACHE_2PC_LWLOCK_OFFSET + NUM_CACHE_2PC_PARTITIONS) typedef enum LWLockMode { LW_EXCLUSIVE, @@ -288,6 +292,7 @@ typedef enum 
BuiltinTrancheIds LWTRANCHE_PARALLEL_WORKER_DSA, #endif LWTRANCHE_TBM, + LWTRANCHE_2PC_INFO_CACHE, LWTRANCHE_FIRST_USER_DEFINED } BuiltinTrancheIds; diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out index f9926dda..e13fdd2a 100644 --- a/src/test/regress/expected/sysviews.out +++ b/src/test/regress/expected/sysviews.out @@ -75,8 +75,7 @@ select name, setting from pg_settings where name like 'enable%'; enable_2pc_entry_key_check | on enable_2pc_entry_trace | off enable_2pc_file_cache | on - enable_2pc_file_check | on - enable_2pc_hash_table_check | off + enable_2pc_file_check | off enable_2pc_recovery_info | on enable_audit | off enable_audit_warning | off @@ -142,7 +141,7 @@ select name, setting from pg_settings where name like 'enable%'; enable_transparent_crypt | on enable_user_authority_force_check | off enable_xlog_mprotect | on -(69 rows) +(68 rows) -- Test that the pg_timezone_names and pg_timezone_abbrevs views are -- more-or-less working. We can't test their contents in any great detail From 3c4d6a7696c2b85c1af5d8db4a6cb9c0a38ba145 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Thu, 3 Jun 2021 11:15:53 +0800 Subject: [PATCH 387/578] fix gtm standby promote when master shutdown http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131087569165 (merge request !388) Squash merge branch 'sigmalin_v2.15.20' into 'Tbase_v2.15.19' * fix gtm standby promote when master shutdown http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131087569165 --- src/gtm/main/gtm_standby.c | 22 +++++++++++++++------- src/gtm/main/main.c | 8 +++++--- src/include/gtm/gtm_standby.h | 2 +- 3 files changed, 21 insertions(+), 11 deletions(-) diff --git a/src/gtm/main/gtm_standby.c b/src/gtm/main/gtm_standby.c index f441c889..ba11b416 100644 --- a/src/gtm/main/gtm_standby.c +++ b/src/gtm/main/gtm_standby.c @@ -52,7 +52,7 @@ extern int GTMPortNumber; #ifndef __XLOG__ static GTM_Conn *gtm_standby_connect_to_standby_int(int *report_needed); #endif -static GTM_Conn *gtm_standby_connectToActiveGTM(void); +static GTM_Conn *gtm_standby_connectToActiveGTM(int timeout); static void AddBackupLabel(uint64 segment_no); /* Defined in main.c */ @@ -62,9 +62,9 @@ extern int GTM_Standby_Connetion_Timeout; int -gtm_standby_start_startup(void) +gtm_standby_start_startup(int timeout) { - GTM_ActiveConn = gtm_standby_connectToActiveGTM(); + GTM_ActiveConn = gtm_standby_connectToActiveGTM(timeout); if (GTM_ActiveConn == NULL || GTMPQstatus(GTM_ActiveConn) != CONNECTION_OK) { int save_errno = errno; @@ -644,7 +644,7 @@ void gtm_standby_finishActiveConn(void) { - GTM_ActiveConn = gtm_standby_connectToActiveGTM(); + GTM_ActiveConn = gtm_standby_connectToActiveGTM(0); if (GTM_ActiveConn == NULL) { elog(DEBUG3, "Error in connection"); @@ -664,7 +664,7 @@ gtm_standby_finishActiveConn(void) } static GTM_Conn * -gtm_standby_connectToActiveGTM(void) +gtm_standby_connectToActiveGTM(int timeout) { char connect_string[1024]; int active_port = Recovery_StandbyGetActivePort(); @@ -673,8 +673,16 @@ gtm_standby_connectToActiveGTM(void) /* Need to connect to Active-GTM again here */ elog(LOG, "Connecting the GTM active on %s:%d...", active_address, active_port); - sprintf(connect_string, "host=%s port=%d node_name=%s remote_type=%d", - active_address, active_port, NodeName, GTM_NODE_GTM); + if (timeout != 0) + { + sprintf(connect_string, "host=%s port=%d node_name=%s remote_type=%d connect_timeout=%d", + active_address, active_port, NodeName, GTM_NODE_GTM, timeout); + } + else + { + sprintf(connect_string, "host=%s 
port=%d node_name=%s remote_type=%d", + active_address, active_port, NodeName, GTM_NODE_GTM); + } return PQconnectGTM(connect_string); } diff --git a/src/gtm/main/main.c b/src/gtm/main/main.c index e894a5bc..93fc1901 100644 --- a/src/gtm/main/main.c +++ b/src/gtm/main/main.c @@ -82,6 +82,8 @@ extern char *optarg; #define LOOPS_UNTIL_HIBERNATE 50 #define HIBERNATE_FACTOR 25 +#define GTM_STARTUP_CONNECT_ACTIVE_TIMEOUT (2) + static char *progname = "gtm"; char *ListenAddresses; int GTMPortNumber; @@ -1097,7 +1099,7 @@ main(int argc, char *argv[]) */ if (Recovery_IsStandby()) { - if (!gtm_standby_start_startup()) + if (!gtm_standby_start_startup(GTM_STARTUP_CONNECT_ACTIVE_TIMEOUT)) { #ifdef __TBASE__ elog(LOG, "Failed to establish a connection to active-GTM."); @@ -2046,7 +2048,7 @@ gtm_standby_pre_server_loop(char *data_dir) * retry establish a connection between the active and standby, * controlling frequency with select timeout */ - if (gtm_standby_start_startup()) + if (gtm_standby_start_startup(GTM_STARTUP_CONNECT_ACTIVE_TIMEOUT)) { elog(LOG, "Standby GTM Startup connection established with active-GTM."); break; @@ -3233,7 +3235,7 @@ GTM_ThreadWalReceiver(void *argp) sleep(1); - if (!gtm_standby_start_startup()) + if (!gtm_standby_start_startup(0)) { elog(ERROR, "Failed to establish a connection to active-GTM."); } diff --git a/src/include/gtm/gtm_standby.h b/src/include/gtm/gtm_standby.h index 01a037b7..406fed05 100644 --- a/src/include/gtm/gtm_standby.h +++ b/src/include/gtm/gtm_standby.h @@ -27,7 +27,7 @@ bool gtm_is_standby(void); void gtm_set_standby(bool standby); void gtm_set_active_conninfo(const char *addr, int port); -int gtm_standby_start_startup(void); +int gtm_standby_start_startup(int timeout); int gtm_standby_finish_startup(void); int gtm_standby_restore_next_gxid(void); From 9368787b4435047425ea52eec6a72559af6c056d Mon Sep 17 00:00:00 2001 From: whalesong Date: Wed, 9 Jun 2021 11:17:50 +0800 Subject: [PATCH 388/578] Bugfix: use extended protocol cause perform bad in some case (merge request !392), ID88518281 http://tapd.oa.com/pgxz/bugtrace/bugs/view?bug_id=1010092131088518281 --- src/backend/pgxc/pool/execRemote.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 74bdc28e..d5f96393 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -8966,9 +8966,7 @@ ExecRemoteQuery(PlanState *pstate) if (step->force_autocommit) need_tran_block = false; else - need_tran_block = (step->statement && step->statement[0] != '\0') || - step->cursor || - node->rqs_num_params || + need_tran_block = step->cursor || (!step->read_only && total_conn_count > 1) || (TransactionBlockStatusCode() == 'T'); From 8f3b934a110320eb7f4f1fcbc27b872d20b11214 Mon Sep 17 00:00:00 2001 From: ceciliasu Date: Fri, 11 Jun 2021 15:27:43 +0800 Subject: [PATCH 389/578] fix bug when accessing temp sequence in a redistribution-plan. 
http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131088413441 --- src/backend/catalog/namespace.c | 9 +++++++++ src/backend/commands/sequence.c | 19 +++++++++++++++++-- src/test/regress/expected/xc_temp.out | 14 ++++++++++++++ src/test/regress/sql/xc_temp.sql | 8 ++++++++ 4 files changed, 48 insertions(+), 2 deletions(-) diff --git a/src/backend/catalog/namespace.c b/src/backend/catalog/namespace.c index 53be4dc7..6af59c1c 100644 --- a/src/backend/catalog/namespace.c +++ b/src/backend/catalog/namespace.c @@ -4108,6 +4108,15 @@ recomputeNamespacePath(void) if (!list_member_oid(oidlist, PG_CATALOG_NAMESPACE)) oidlist = lcons_oid(PG_CATALOG_NAMESPACE, oidlist); +#ifdef __TBASE__ + /* + * If this is secondary backend of a distributed session, check if primary backend + * of the same session has created temporary namespace and wire it up. + */ + if (IsConnFromDatanode() && IS_PGXC_DATANODE && !OidIsValid(myTempNamespace)) + FindTemporaryNamespace(); +#endif + if (OidIsValid(myTempNamespace) && !list_member_oid(oidlist, myTempNamespace)) oidlist = lcons_oid(myTempNamespace, oidlist); diff --git a/src/backend/commands/sequence.c b/src/backend/commands/sequence.c index a248f5c0..254b0d63 100644 --- a/src/backend/commands/sequence.c +++ b/src/backend/commands/sequence.c @@ -1922,7 +1922,18 @@ GetGlobalSeqName(Relation seqrel, const char *new_seqname, const char *new_schem char *seqname, *dbname, *relname; char namespace[NAMEDATALEN * 2]; int charlen; - bool is_temp = seqrel->rd_backend == MyBackendId; + bool is_temp = false; + +#ifdef PGXC + /* + * In case of distributed session use MyFirstBackendId for temp objects. + */ + if (OidIsValid(MyCoordId)) + is_temp = seqrel->rd_backend == MyFirstBackendId; + else +#endif + is_temp = seqrel->rd_backend == MyBackendId; + /* Get all the necessary relation names */ dbname = get_database_name(seqrel->rd_node.dbNode); @@ -1989,7 +2000,11 @@ IsTempSequence(Oid relid) /* open and AccessShareLock sequence */ init_sequence(relid, &elm, &seqrel); - +#ifdef PGXC + if (OidIsValid(MyCoordId)) + res = seqrel->rd_backend == MyFirstBackendId; + else +#endif res = seqrel->rd_backend == MyBackendId; relation_close(seqrel, NoLock); return res; diff --git a/src/test/regress/expected/xc_temp.out b/src/test/regress/expected/xc_temp.out index 6f779cc5..4a3843f6 100644 --- a/src/test/regress/expected/xc_temp.out +++ b/src/test/regress/expected/xc_temp.out @@ -1044,3 +1044,17 @@ CREATE TABLE table_child (like table_parent, b int); ERROR: relation "table_parent" does not exist DROP TABLE table_child; ERROR: table "table_child" does not exist +-- Access temp sequence in redistribution. +CREATE TEMP TABLE IF NOT EXISTS rep_tbl_temp (col_int int, col_bigserial bigserial, constraint pk_p_id_key primary key (col_int)) DISTRIBUTE BY REPLICATION; +INSERT INTO rep_tbl_temp values (0); +INSERT INTO rep_tbl_temp values (1); +INSERT INTO rep_tbl_temp values (2); +SELECT col_int, col_bigserial FROM rep_tbl_temp ORDER BY col_int; + col_int | col_bigserial +---------+--------------- + 0 | 1 + 1 | 2 + 2 | 3 +(3 rows) + +DROP TABLE rep_tbl_temp; diff --git a/src/test/regress/sql/xc_temp.sql b/src/test/regress/sql/xc_temp.sql index 539e1c07..1a8ccbed 100644 --- a/src/test/regress/sql/xc_temp.sql +++ b/src/test/regress/sql/xc_temp.sql @@ -141,3 +141,11 @@ DROP TABLE table_rep,table_hash,table_rb; CREATE TEMP TABLE table_parent (a int); CREATE TABLE table_child (like table_parent, b int); DROP TABLE table_child; + +-- Access temp sequence in redistribution. 
+CREATE TEMP TABLE IF NOT EXISTS rep_tbl_temp (col_int int, col_bigserial bigserial, constraint pk_p_id_key primary key (col_int)) DISTRIBUTE BY REPLICATION; +INSERT INTO rep_tbl_temp values (0); +INSERT INTO rep_tbl_temp values (1); +INSERT INTO rep_tbl_temp values (2); +SELECT col_int, col_bigserial FROM rep_tbl_temp ORDER BY col_int; +DROP TABLE rep_tbl_temp; \ No newline at end of file From f4bd333f3310dfec772e15b194c1bb808b9200a3 Mon Sep 17 00:00:00 2001 From: ceciliasu Date: Fri, 11 Jun 2021 15:59:16 +0800 Subject: [PATCH 390/578] fix review --- src/backend/commands/sequence.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/backend/commands/sequence.c b/src/backend/commands/sequence.c index 254b0d63..35eb5109 100644 --- a/src/backend/commands/sequence.c +++ b/src/backend/commands/sequence.c @@ -1929,10 +1929,10 @@ GetGlobalSeqName(Relation seqrel, const char *new_seqname, const char *new_schem * In case of distributed session use MyFirstBackendId for temp objects. */ if (OidIsValid(MyCoordId)) - is_temp = seqrel->rd_backend == MyFirstBackendId; + is_temp = (seqrel->rd_backend == MyFirstBackendId); else #endif - is_temp = seqrel->rd_backend == MyBackendId; + is_temp = (seqrel->rd_backend == MyBackendId); /* Get all the necessary relation names */ dbname = get_database_name(seqrel->rd_node.dbNode); From 97522c2fc5bf4f03c5c0c0fa531e9eb9f4a28eba Mon Sep 17 00:00:00 2001 From: ceciliasu Date: Fri, 11 Jun 2021 16:00:35 +0800 Subject: [PATCH 391/578] fix review --- src/backend/commands/sequence.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/backend/commands/sequence.c b/src/backend/commands/sequence.c index 35eb5109..07209cc8 100644 --- a/src/backend/commands/sequence.c +++ b/src/backend/commands/sequence.c @@ -2000,9 +2000,10 @@ IsTempSequence(Oid relid) /* open and AccessShareLock sequence */ init_sequence(relid, &elm, &seqrel); + #ifdef PGXC if (OidIsValid(MyCoordId)) - res = seqrel->rd_backend == MyFirstBackendId; + res = (seqrel->rd_backend == MyFirstBackendId); else #endif res = seqrel->rd_backend == MyBackendId; From 4621fe6f8867fd69341b7d1879bda8ffeee2a4f8 Mon Sep 17 00:00:00 2001 From: ceciliasu Date: Fri, 11 Jun 2021 16:06:21 +0800 Subject: [PATCH 392/578] fix review --- src/backend/commands/sequence.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/commands/sequence.c b/src/backend/commands/sequence.c index 07209cc8..c21d7639 100644 --- a/src/backend/commands/sequence.c +++ b/src/backend/commands/sequence.c @@ -2006,7 +2006,7 @@ IsTempSequence(Oid relid) res = (seqrel->rd_backend == MyFirstBackendId); else #endif - res = seqrel->rd_backend == MyBackendId; + res = (seqrel->rd_backend == MyBackendId); relation_close(seqrel, NoLock); return res; } From a2a8e2bd1020a008bfa8c979e7b8f225339ce357 Mon Sep 17 00:00:00 2001 From: andrelin Date: Fri, 11 Jun 2021 20:10:41 +0800 Subject: [PATCH 393/578] Skip invalid relid in group information check this could happen when from list contain CTE tables, but they are okay to join with tapd: http://tapd.oa.com/TBase_Oracle_Migration/bugtrace/bugs/view?bug_id=1020421696088748699 --- src/test/regress/expected/insert.out | 11 +++++++++++ src/test/regress/sql/insert.sql | 2 +- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/src/test/regress/expected/insert.out b/src/test/regress/expected/insert.out index d12e3494..528cd56d 100644 --- a/src/test/regress/expected/insert.out +++ b/src/test/regress/expected/insert.out @@ -975,3 +975,14 @@ insert into 
returningwrtest values (2, 'foo') returning returningwrtest; (1 row) drop table returningwrtest; +-- check insert into a shard table from a CTE table +create table t1(f1 int,f2 int) distribute by shard(f1); +NOTICE: Replica identity is needed for shard table, please add to this table through "alter table" command. +create table t2(f1 int,f2 int) distribute by shard(f1); +NOTICE: Replica identity is needed for shard table, please add to this table through "alter table" command. +insert into t1 values(1,1); +insert into t1 values(2,2); +with baseInfo as(select * from t1) +insert into t2 select * from baseInfo; +drop table t1; +drop table t2; diff --git a/src/test/regress/sql/insert.sql b/src/test/regress/sql/insert.sql index d8f352ab..b9b08d55 100644 --- a/src/test/regress/sql/insert.sql +++ b/src/test/regress/sql/insert.sql @@ -572,4 +572,4 @@ select count(*) from t2_new; drop table t2; drop table t2_rep; drop table t2_new; -reset default_locator_type; \ No newline at end of file +reset default_locator_type; From 09c0fcb2ff705e0953eff4619817306fac6e4394 Mon Sep 17 00:00:00 2001 From: hanwayjiang Date: Wed, 16 Jun 2021 10:30:04 +0800 Subject: [PATCH 394/578] =?UTF-8?q?=E3=80=90=E3=80=90TBase=E3=80=91?= =?UTF-8?q?=E3=80=902.15.19=E3=80=91dblink=E6=8F=92=E4=BB=B6=E7=9A=84DBLIN?= =?UTF-8?q?K=5FCOPY=5FTABLE=E4=B8=8D=E8=83=BD=E8=BF=94=E5=9B=9E=E6=8B=B7?= =?UTF-8?q?=E8=B4=9D=E7=9A=84=E8=A1=8C=E6=95=B0=EF=BC=8C=E5=A2=9E=E5=8A=A0?= =?UTF-8?q?=E8=BF=94=E5=9B=9E=E7=9A=84=E8=A1=8C=E6=95=B0=E3=80=82=E3=80=91?= =?UTF-8?q?http://tapd.oa.com/pgxz/bugtrace/bugs/view=3Fbug=5Fid=3D1010092?= =?UTF-8?q?131088867115=20(merge=20request=20!403)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Squash merge branch 'tbase_v2_hanway_616' into 'Tbase_v2.15.19' * 【【TBase】【2.15.19】dblink插件的DBLINK_COPY_TABLE不能返回拷贝的行数,增加返回的行数。】http://tapd.oa.com/pgxz/bugtrace/bugs/view?bug_id=1010092131088867115 --- contrib/dblink/dblink--1.2.sql | 2 +- contrib/dblink/dblink.c | 23 ++++++++++++++--------- 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/contrib/dblink/dblink--1.2.sql b/contrib/dblink/dblink--1.2.sql index fabe10fc..5def7be5 100644 --- a/contrib/dblink/dblink--1.2.sql +++ b/contrib/dblink/dblink--1.2.sql @@ -179,7 +179,7 @@ AS 'MODULE_PATHNAME', 'dblink_is_busy' LANGUAGE C STRICT PARALLEL RESTRICTED; CREATE FUNCTION dblink_copy_table(text, text, text, text, text) -RETURNS int4 +RETURNS int8 AS 'MODULE_PATHNAME', 'dblink_copy_table' LANGUAGE C STRICT PARALLEL RESTRICTED; diff --git a/contrib/dblink/dblink.c b/contrib/dblink/dblink.c index fb1c99ce..82f0e05d 100644 --- a/contrib/dblink/dblink.c +++ b/contrib/dblink/dblink.c @@ -912,15 +912,16 @@ static bool isRemoteTableAsSelect(char * rtblname) * then local server will use COPY FROM statement to copy data into table * directly. 
*/ -static void +static uint64 copyRemoteTableTo(char *nspname, char *tblname, char *rnspname, char *rtblname, char *connstr) { - bool freeconn = false; - char *conname = connstr; - PGconn *conn = NULL; - ParseState *pstate = NULL; - Relation rel = NULL; + bool freeconn = false; + char *conname = connstr; + PGconn *conn = NULL; + ParseState *pstate; + Relation rel; + uint64 processed = 0; dblink_init(); @@ -967,7 +968,8 @@ copyRemoteTableTo(char *nspname, char *tblname, char *rnspname, char *rtblname, cstate = BeginCopyFrom(pstate, rel, NULL, false, copy_read_data, NULL, NIL); - (void) CopyFrom(cstate); + processed = CopyFrom(cstate); + EndCopyFrom(cstate); relation_close(rel, RowExclusiveLock); @@ -989,12 +991,15 @@ copyRemoteTableTo(char *nspname, char *tblname, char *rnspname, char *rtblname, tmp_cbuf = NULL; if (freeconn) PQfinish(conn); + + return processed; } PG_FUNCTION_INFO_V1(dblink_copy_table); Datum dblink_copy_table(PG_FUNCTION_ARGS) { + uint64 processed = 0; char *nspname; char *tblname; char *rnspname; @@ -1011,9 +1016,9 @@ dblink_copy_table(PG_FUNCTION_ARGS) rtblname = text_to_cstring(PG_GETARG_TEXT_PP(3)); connstr = text_to_cstring(PG_GETARG_TEXT_PP(4)); - copyRemoteTableTo(nspname, tblname, rnspname, rtblname, connstr); + processed = copyRemoteTableTo(nspname, tblname, rnspname, rtblname, connstr); - return (Datum) 0; + PG_RETURN_INT64((int64)processed); } /* From 4be1335dbd3ed0a9e51821fd5aa5453cb4d3c035 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Tue, 22 Jun 2021 21:01:17 +0800 Subject: [PATCH 395/578] fix gtm coredump due to LogCollector free http://tapd.oa.com/TBase_C/bugtrace/bugs/view/1020385652089017999 (merge request !416) Squash merge branch 'sigmalin001' into 'Tbase_v2.15.20' * fix gtm coredump due to LogCollector free http://tapd.oa.com/TBase_C/bugtrace/bugs/view/1020385652089017999 --- src/gtm/main/main.c | 1 - 1 file changed, 1 deletion(-) diff --git a/src/gtm/main/main.c b/src/gtm/main/main.c index 93fc1901..43a9424a 100644 --- a/src/gtm/main/main.c +++ b/src/gtm/main/main.c @@ -2767,7 +2767,6 @@ GTM_ThreadLogCollector(void *argp) GTM_ProcessLogCollection(); } - GTM_DeInitLogCollector(); elog(LOG, "GTM is shutting down, log collector exits!"); return my_threadinfo; } From 305560a00321c5ef7ff5119d1c4a549a143b8dfd Mon Sep 17 00:00:00 2001 From: yeyukui Date: Thu, 24 Jun 2021 10:00:50 +0800 Subject: [PATCH 396/578] fix the crypt table problems (merge request !419) * add function to clean the invalid elem in rel_crypt_hash table * drop rel will delete elem in rel crypt hash, tapd http://tapd.oa.com/10092131/bugtrace/bugs/view?bug_id=1010092131085590771 --- contrib/tbase_mls/tbase_mls.c | 47 +++++ src/backend/access/rmgrdesc/relcryptdesc.c | 8 +- src/backend/access/transam/twophase.c | 7 + src/backend/access/transam/xact.c | 13 ++ src/backend/catalog/storage.c | 9 + src/backend/storage/freespace/emapage.c | 7 + src/backend/storage/freespace/extent_xlog.c | 7 + src/backend/utils/cache/relcryptmap.c | 193 +++++++++++++++++++- src/include/utils/relcryptmap.h | 7 + 9 files changed, 291 insertions(+), 7 deletions(-) create mode 100644 contrib/tbase_mls/tbase_mls.c diff --git a/contrib/tbase_mls/tbase_mls.c b/contrib/tbase_mls/tbase_mls.c new file mode 100644 index 00000000..e2d53b31 --- /dev/null +++ b/contrib/tbase_mls/tbase_mls.c @@ -0,0 +1,47 @@ +#include "postgres.h" + +#include "catalog/catalog.h" +#include "catalog/storage.h" +#include "miscadmin.h" +#include "fmgr.h" +#include "postmaster/bgwriter.h" + +#include "storage/bufmgr.h" +#include "utils/relcrypt.h" 
+#include "utils/relcryptmap.h" +#include "utils/mls.h" + +PG_MODULE_MAGIC; + +PG_FUNCTION_INFO_V1(pg_rel_crypt_hash_clean); + +/* + * Add function to clean the rel_crypt_hash table invalid elem + */ +Datum pg_rel_crypt_hash_clean(PG_FUNCTION_ARGS) +{ + RelCryptEntry *relcrypt; + List *mark_delete = NIL; + ListCell * lc; + + if (!is_mls_user()) + { + elog(ERROR, "execute by mls user please"); + } + + /* set to flush rel crypt map */ + RequestFlushRelcryptMap(); + + /* make rel crypt map for a backup file */ + rel_crypt_write_mapfile(true); + + mark_delete = MarkRelCryptInvalid(); + /* delete the elem one by one */ + foreach(lc, mark_delete) + { + relcrypt = (RelCryptEntry *) lfirst(lc); + rel_crypt_hash_delete(&(relcrypt->relfilenode), true); + } + + PG_RETURN_BOOL(true); +} diff --git a/src/backend/access/rmgrdesc/relcryptdesc.c b/src/backend/access/rmgrdesc/relcryptdesc.c index 09082689..2b05fe81 100644 --- a/src/backend/access/rmgrdesc/relcryptdesc.c +++ b/src/backend/access/rmgrdesc/relcryptdesc.c @@ -94,7 +94,6 @@ void rel_crypt_desc(StringInfo buf, XLogReaderState *record) xlrec->algo_id, xlrec->option, xlrec->keysize); break; } - break; case XLOG_CRYPT_KEY_DELETE: appendStringInfo(buf, "xlog type is comming, info:%u", XLOG_CRYPT_KEY_DELETE); break; @@ -107,8 +106,13 @@ void rel_crypt_desc(StringInfo buf, XLogReaderState *record) break; } case XLOG_REL_CRYPT_DELETE: - appendStringInfo(buf, "xlog type is comming, info:%u", XLOG_REL_CRYPT_DELETE); + { + xl_rel_crypt_delete *xlrec; + xlrec = (xl_rel_crypt_delete *) XLogRecGetData(record); + appendStringInfo(buf, "rel crypt delete, database:%u tablespace:%u relnode:%u, algo_id:%d", + xlrec->rnode.dbNode, xlrec->rnode.spcNode, xlrec->rnode.relNode, xlrec->algo_id); break; + } default: Assert(0); break; diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c index 61bc6b50..a9078bda 100644 --- a/src/backend/access/transam/twophase.c +++ b/src/backend/access/transam/twophase.c @@ -120,6 +120,7 @@ #ifdef __TBASE__ #include "access/gtm.h" #include "utils/timeout.h" +#include "utils/relcryptmap.h" #endif #include "pgxc/execRemote.h" @@ -2080,6 +2081,12 @@ FinishPreparedTransaction(const char *gid, bool isCommit) SMgrRelation srel = smgropen(delrels[i], InvalidBackendId); smgrdounlink(srel, false); +#ifdef _MLS_ + /* + * clean up the rnode infomation in rel crypt hash table + */ + remove_rel_crypt_hash_elem(&(srel->smgr_relcrypt), true); +#endif smgrclose(srel); } diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index b6881ece..3e59c7f4 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -99,6 +99,7 @@ #include "postmaster/postmaster.h" #include "commands/extension.h" #include "tcop/utility.h" +#include "utils/relcryptmap.h" #endif /* * User-tweakable parameters @@ -7412,6 +7413,12 @@ xact_redo_commit(xl_xact_parsed_commit *parsed, for (fork = 0; fork <= MAX_FORKNUM; fork++) XLogDropRelation(parsed->xnodes[i], fork); smgrdounlink(srel, true); +#ifdef _MLS_ + /* + * clean up the rnode infomation in rel crypt hash table + */ + remove_rel_crypt_hash_elem(&(srel->smgr_relcrypt), false); +#endif smgrclose(srel); } } @@ -7537,6 +7544,12 @@ xact_redo_abort(xl_xact_parsed_abort *parsed, TransactionId xid) for (fork = 0; fork <= MAX_FORKNUM; fork++) XLogDropRelation(parsed->xnodes[i], fork); smgrdounlink(srel, true); +#ifdef _MLS_ + /* + * clean up the rnode infomation in rel crypt hash table + */ + 
remove_rel_crypt_hash_elem(&(srel->smgr_relcrypt), false); +#endif smgrclose(srel); } } diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c index a1396e62..b9136469 100644 --- a/src/backend/catalog/storage.c +++ b/src/backend/catalog/storage.c @@ -97,6 +97,7 @@ #include "storage/smgr.h" #include "utils/memutils.h" #include "utils/rel.h" +#include "utils/relcryptmap.h" /* * We keep a list of all relations (represented as RelFileNode values) @@ -466,7 +467,15 @@ smgrDoPendingDeletes(bool isCommit) smgrdounlinkall(srels, nrels, false); for (i = 0; i < nrels; i++) + { +#ifdef _MLS_ + /* + * clean up the rnode infomation in rel crypt hash table + */ + remove_rel_crypt_hash_elem(&(srels[i]->smgr_relcrypt), true); +#endif smgrclose(srels[i]); + } pfree(srels); } diff --git a/src/backend/storage/freespace/emapage.c b/src/backend/storage/freespace/emapage.c index 063d08f3..ade86acd 100644 --- a/src/backend/storage/freespace/emapage.c +++ b/src/backend/storage/freespace/emapage.c @@ -100,6 +100,7 @@ #include "utils/lsyscache.h" #include "funcapi.h" #include "lib/stringinfo.h" +#include "utils/relcryptmap.h" #define ExtentAssertEMEIsFree(eme) ExtentAssert((eme).is_occupied == 0) #define ExtentAssertEMEIsOccup(eme) ExtentAssert((eme).is_occupied == 1) @@ -7078,6 +7079,12 @@ RebuildExtentMap(Relation rel) //TODO: write xlog for truncate extent file RelationOpenSmgr(rel); smgrdounlinkfork(rel->rd_smgr, EXTENT_FORKNUM, false); +#ifdef _MLS_ + /* + * clean up the rnode infomation in rel crypt hash table + */ + remove_rel_crypt_hash_elem(&(rel->rd_smgr->smgr_relcrypt), true); +#endif RelationCloseSmgr(rel); INIT_EXLOG_TRUNCATE(&xlrec); diff --git a/src/backend/storage/freespace/extent_xlog.c b/src/backend/storage/freespace/extent_xlog.c index b08f3f64..1f51a047 100644 --- a/src/backend/storage/freespace/extent_xlog.c +++ b/src/backend/storage/freespace/extent_xlog.c @@ -83,6 +83,7 @@ #include "storage/extentmapping.h" #include "storage/extent_xlog.h" #include "storage/smgr.h" +#include "utils/relcryptmap.h" static void extent_xlog_apply_record(XLogReaderState *record); static void extent_xlog_apply_truncate(XLogReaderState *record); @@ -364,6 +365,12 @@ extent_xlog_apply_truncate(XLogReaderState *record) SMgrRelation reln; reln = smgropen(xlrec->rnode, InvalidBackendId); smgrdounlinkfork(reln, EXTENT_FORKNUM, true); +#ifdef _MLS_ + /* + * clean up the rnode infomation in rel crypt hash table + */ + remove_rel_crypt_hash_elem(&(reln->smgr_relcrypt), false); +#endif smgrclose(reln); } diff --git a/src/backend/utils/cache/relcryptmap.c b/src/backend/utils/cache/relcryptmap.c index fd16e5fb..69fc1a06 100644 --- a/src/backend/utils/cache/relcryptmap.c +++ b/src/backend/utils/cache/relcryptmap.c @@ -113,6 +113,9 @@ #include "utils/relcryptmisc.h" #include "storage/relcryptstorage.h" #include "utils/relcryptmap.h" +#include "catalog/indexing.h" +#include "utils/fmgroids.h" +#include "utils/relfilenodemap.h" #ifdef _MLS_ #include "utils/mls_extension.h" @@ -1212,8 +1215,18 @@ void rel_crypt_redo(XLogReaderState *record) break; } case XLOG_REL_CRYPT_DELETE: - elog(ERROR, "xlog type is comming, info:%u", XLOG_REL_CRYPT_DELETE); + { + xl_rel_crypt_delete *xlrec; + xlrec = (xl_rel_crypt_delete *) XLogRecGetData(record); + if (g_enable_crypt_debug) + { + elog(LOG, "REL_CRYPT_DELETE, redo XLOG_REL_CRYPT_DELETE, relfilenode:%d:%d:%d, algo_id:%d", + xlrec->rnode.dbNode, xlrec->rnode.spcNode, xlrec->rnode.relNode, + xlrec->algo_id); + } + rel_crypt_hash_delete(&(xlrec->rnode), false); break; + } 
default: elog(ERROR, "recrypt redo, unknown info, info:%u", info & XLR_RMGR_INFO_MASK); break; @@ -1275,6 +1288,89 @@ static int rel_crypt_hash_key_cmp (const void *key1, const void *key2, Size keys return 1; } +/* + * this function is used to remove hash elem + * + * if write_wal is true, remove action will write wal + */ +void remove_rel_crypt_hash_elem(RelCrypt relCrypt, bool write_wal) +{ + if (relCrypt != NULL) + { + /* + * if the algo_id is invalid, skip + */ + if (relCrypt->algo_id == TRANSP_CRYPT_INVALID_ALGORITHM_ID) + { + return; + } + /* + * do remove the rnode and algo_id map in rel_crypt_hash table + */ + rel_crypt_hash_delete(&(relCrypt->relfilenode), write_wal); + } +} + +/* + * do delete rel crypt hash elem about a rnode + */ +void rel_crypt_hash_delete(RelFileNode *rnode, bool write_wal) +{ + RelCrypt relCrypt; + bool found = false; + + uint32 hashcode; + int partitionno; + LWLock *partitionLock; + + hashcode = rel_crypt_hash_code(rnode); + partitionno = rel_crypt_hash_partition(hashcode); + partitionLock = rel_crypt_get_partition_lock(partitionno); + + LWLockAcquire(partitionLock, LW_EXCLUSIVE); + + relCrypt = (RelCrypt) hash_search_with_hash_value(g_rel_crypt_hash, + (void *) rnode, + hashcode, + HASH_REMOVE, + &found); + + if (found) + { + /* + * need to flush crypt map in next checkpoint + */ + RequestFlushRelcryptMap(); + } + + /* + * Critical section + */ + if (found && write_wal) + { + xl_rel_crypt_delete xlrec; + XLogRecPtr lsn; + + /* now errors are fatal ... */ + START_CRIT_SECTION(); + + xlrec.rnode = relCrypt->relfilenode; + xlrec.algo_id = relCrypt->algo_id; + + XLogBeginInsert(); + XLogRegisterData((char *) (&xlrec), sizeof(xl_rel_crypt_delete)); + + lsn = XLogInsert(RM_REL_CRYPT_ID, XLOG_REL_CRYPT_DELETE); + + /* As always, WAL must hit the disk before the data update does */ + XLogFlush(lsn); + + END_CRIT_SECTION(); + } + + LWLockRelease(partitionLock); +} + void rel_crypt_hash_insert(RelFileNode * rnode, AlgoId algo_id, bool write_wal, bool in_building_procedure) {// #lizard forgives RelCrypt relcrypt; @@ -1663,8 +1759,13 @@ static void rel_crypt_write_mapfile_post(RelCryptMapFile *map, int element_cnt, return; } -static void rel_crypt_write_mapfile(void) -{// #lizard forgives +/* + * if is_backup is true, it means to backup the pg_rel_crypt.map + * to pg_rel_crypt.map.backup, if is_backup is false, it means + * flush the data to disk + */ +void rel_crypt_write_mapfile(bool is_backup) +{ int loop; int lock_loop; char *mapfilename; @@ -1685,7 +1786,18 @@ static void rel_crypt_write_mapfile(void) mapfilename = palloc0(MAXPGPATH); mapfilename_new = palloc0(MAXPGPATH); + /* + * if backup the file, the filename will be renamed as pg_rel_crypt.map.backup + * else the file named as pg_rel_crypt.map + */ + if (is_backup) + { + snprintf(mapfilename, MAXPGPATH, "%s/%s.backup", "global", REL_CRYPT_MAP_FILENAME); + } + else + { snprintf(mapfilename, MAXPGPATH, "%s/%s", "global", REL_CRYPT_MAP_FILENAME); + } snprintf(mapfilename_new, MAXPGPATH, "%s/%s.%d", "global", REL_CRYPT_MAP_FILENAME, MyProcPid); buffile = BufFileOpen(mapfilename_new, (O_WRONLY|O_CREAT|PG_BINARY), (S_IRUSR|S_IWUSR), true, ERROR); @@ -2016,13 +2128,80 @@ Datum pg_crypt_key_hash_dump(PG_FUNCTION_ARGS) return (Datum) 0; } +/* + * Check the relfilenode exist + */ +bool CheckRelFileNodeExists(RelFileNode *rnode) +{ + Oid relid; + + if (rnode != NULL) + { + relid = RelidByRelfilenode(rnode->spcNode, rnode->relNode); + + if (OidIsValid(relid)) + { + return true; + } + } + + return false; +} + +/* + * 
mark the invalid elem in g_rel_crypt_hash to delete + */ +List * MarkRelCryptInvalid(void) +{ + List * result = NIL; + HASH_SEQ_STATUS status; + int lock_loop = 0; + RelCryptEntry *relcrypt; + bool is_exist = false; + + /* lock all partition lock */ + for (lock_loop = 0; lock_loop < REL_CRYPT_HASHTABLE_NUM_PARTITIONS; lock_loop++) + { + LWLockAcquire(rel_crypt_get_partition_lock(lock_loop), LW_SHARED); + } + + hash_seq_init(&status, g_rel_crypt_hash); + while ((relcrypt = (RelCryptEntry *) hash_seq_search(&status)) != NULL) + { + /* only deal with current database */ + if (relcrypt->relfilenode.dbNode != MyDatabaseId) + { + continue; + } + + is_exist = CheckRelFileNodeExists(&(relcrypt->relfilenode)); + if (!is_exist) + { + elog(DEBUG5, "check relfilenode exist, dbNode:%d, spcNode:%d, relNode:%d", + relcrypt->relfilenode.dbNode, relcrypt->relfilenode.spcNode, relcrypt->relfilenode.relNode); + result = lappend(result, relcrypt); + } + } + + /* release all */ + for (lock_loop = REL_CRYPT_HASHTABLE_NUM_PARTITIONS - 1; lock_loop >= 0; lock_loop--) + { + LWLockRelease(rel_crypt_get_partition_lock(lock_loop)); + } + + return result; +} + +/* + * do checkpoint to flush crypt map file to disk + */ void CheckPointRelCrypt(void) { if (g_enable_crypt_debug) { elog(LOG, "CheckPointRelCrypt check to flush crypt mapfile BEGIN"); } - rel_crypt_write_mapfile(); + rel_crypt_write_mapfile(false); crypt_key_info_write_mapfile(); if (g_enable_crypt_debug) { @@ -2031,13 +2210,16 @@ void CheckPointRelCrypt(void) return; } +/* + * if system in startup state, need to flush crypt map file + */ void StartupReachConsistentState(void) { if (g_enable_crypt_debug) { elog(LOG, "StartupReachConsistentState check to flush crypt mapfile BEGIN"); } - rel_crypt_write_mapfile(); + rel_crypt_write_mapfile(false); crypt_key_info_write_mapfile(); if (g_enable_crypt_debug) { @@ -2046,5 +2228,6 @@ void StartupReachConsistentState(void) return; } + #endif diff --git a/src/include/utils/relcryptmap.h b/src/include/utils/relcryptmap.h index 290ef886..ff434bbe 100644 --- a/src/include/utils/relcryptmap.h +++ b/src/include/utils/relcryptmap.h @@ -95,6 +95,8 @@ typedef struct xl_rel_crypt_insert int algo_id; } xl_rel_crypt_insert; +typedef xl_rel_crypt_insert xl_rel_crypt_delete; + extern void rel_crypt_redo(XLogReaderState *record); extern void rel_crypt_desc(StringInfo buf, XLogReaderState *record); extern const char * rel_crypt_identify(uint8 info); @@ -112,6 +114,11 @@ extern void crypt_key_info_load_default_key(void); extern void crypt_key_info_free(CryptKeyInfo cryptkey); extern CryptKeyInfo crypt_key_info_alloc(int option); extern void rel_crypt_hash_insert(RelFileNode * rnode, AlgoId algo_id, bool write_wal, bool in_building_procedure); +extern void remove_rel_crypt_hash_elem(RelCrypt relCrypt, bool write_wal); +extern void rel_crypt_hash_delete(RelFileNode * rnode, bool write_wal); extern void crypt_key_info_hash_insert(CryptKeyInfo cryptkey_input, bool write_wal, bool in_building_procedure); extern int crypt_key_info_cal_key_size(CryptKeyInfo cryptkey); +extern bool CheckRelFileNodeExists(RelFileNode *rnode); +extern List* MarkRelCryptInvalid(void); +extern void rel_crypt_write_mapfile(bool is_backup); #endif /* RELCRYPT_MAP_H */ From 07353c592a502b44e3492aceba77cf6a39081b1a Mon Sep 17 00:00:00 2001 From: andrelin Date: Fri, 23 Apr 2021 15:05:40 +0800 Subject: [PATCH 397/578] Fix explain of INSERT INTO part table --- src/backend/commands/explain.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git 
a/src/backend/commands/explain.c b/src/backend/commands/explain.c index 2f7ea8e7..d08ccaa8 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -1956,7 +1956,10 @@ ExplainNode(PlanState *planstate, List *ancestors, { case T_ModifyTable: #ifdef __TBASE__ - if(((ModifyTable *) plan)->haspartparent) + /* compatible with make_modifytable */ + if (((ModifyTable *) plan)->haspartparent && + (((ModifyTable *) plan)->operation == CMD_UPDATE || + ((ModifyTable *) plan)->operation == CMD_DELETE)) { ExplainMemberNodes(((ModifyTable *) plan)->partplans, ((ModifyTableState *) planstate)->partplans, From 1f2eab940b8e4642d279ce1445b842cef03db038 Mon Sep 17 00:00:00 2001 From: andrelin Date: Tue, 22 Jun 2021 16:52:11 +0800 Subject: [PATCH 398/578] Give DN a proper session id tapd: http://tapd.oa.com/pgxz/prong/stories/view/1010092131865528291 --- src/backend/pgxc/pool/pgxcnode.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c index 5424a200..7f4a9171 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -394,6 +394,10 @@ InitMultinodeExecutor(bool is_force) MemoryContextSwitchTo(oldcontext); PGXCSessionId[0] = '\0'; + if (IsConnFromApp()) + { + sprintf(PGXCSessionId, "%s_%d_%ld", PGXCNodeName, MyProcPid, GetCurrentTimestamp()); + } if (IS_PGXC_COORDINATOR) { @@ -403,8 +407,6 @@ InitMultinodeExecutor(bool is_force) get_pgxc_nodename(co_handles[count].nodeoid)) == 0) PGXCNodeId = count + 1; } - - sprintf(PGXCSessionId, "%s_%d_%ld", PGXCNodeName, MyProcPid, GetCurrentTimestamp()); } else /* DataNode */ { From 7787433330424817b40cba7488e95f0cc7c54d00 Mon Sep 17 00:00:00 2001 From: whalesong Date: Thu, 24 Jun 2021 17:41:16 +0800 Subject: [PATCH 399/578] bugfix: get error after dn switch when persistent_datanode_connections is on (merge request !418) http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131088924419 --- src/backend/pgxc/pool/pgxcnode.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c index 7f4a9171..dd034e94 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -5487,6 +5487,11 @@ PgxcNodeRefreshBackendHandlesShmem(List *nodes_alter) int nid; PGXCNodeHandle *handle = NULL; + if (PersistentConnections && nodes_alter != NIL) + { + release_handles(true); + } + foreach(lc, nodes_alter) { char ntype = PGXC_NODE_NONE; From b7aac6119f15ea03c017cf6a1557f88c395ee365 Mon Sep 17 00:00:00 2001 From: bethding Date: Fri, 25 Jun 2021 13:59:37 +0800 Subject: [PATCH 400/578] set enable_parallel_ddl on --- src/backend/utils/misc/guc.c | 2 +- src/test/regress/expected/sysviews.out | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 378f9c7a..42619d16 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -2753,7 +2753,7 @@ static struct config_bool ConfigureNamesBool[] = NULL }, &enable_parallel_ddl, - false, + true, NULL, NULL, NULL }, { diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out index e13fdd2a..58642165 100644 --- a/src/test/regress/expected/sysviews.out +++ b/src/test/regress/expected/sysviews.out @@ -121,7 +121,7 @@ select name, setting from pg_settings where name like 'enable%'; enable_nestloop_suppression | off enable_null_string | off enable_oracle_compatible | off - enable_parallel_ddl | off + 
enable_parallel_ddl | on enable_partition_wise_join | off enable_pgbouncer | off enable_plpgsql_debug_print | off From 7bc615ae902fb998af22da21501bf795673c521e Mon Sep 17 00:00:00 2001 From: andrelin Date: Thu, 24 Jun 2021 21:03:15 +0800 Subject: [PATCH 401/578] Allocate a page with enough free space when doing CLUSTER copy tapd: http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131089169139 --- src/backend/access/heap/rewriteheap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c index 2bb4f98b..0e04a8d3 100644 --- a/src/backend/access/heap/rewriteheap.c +++ b/src/backend/access/heap/rewriteheap.c @@ -747,7 +747,7 @@ raw_heap_insert(RewriteState state, HeapTuple tup) #ifdef _SHARDING_ state->rs_buf = RelationGetBufferForTuple_shard(state->rs_new_rel, HeapTupleGetShardId(tup), - BLCKSZ/2, + len, InvalidBuffer, 0, NULL, From 635b1972668ac73160a5f03add1a06b84859d3cd Mon Sep 17 00:00:00 2001 From: sigmalin Date: Fri, 25 Jun 2021 19:51:20 +0800 Subject: [PATCH 402/578] fix seq inconsistency when rename database http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131087148145 (merge request !429) --- src/gtm/main/gtm_store.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/gtm/main/gtm_store.c b/src/gtm/main/gtm_store.c index 43a1c9ab..858c636f 100644 --- a/src/gtm/main/gtm_store.c +++ b/src/gtm/main/gtm_store.c @@ -4302,7 +4302,7 @@ GTMStorageHandle *GTM_StoreGetAllSeqInDatabase(GTM_SequenceKey seq_database_key, GTMStorageHandle bucket_handle = INVALID_STORAGE_HANDLE; GTM_StoredSeqInfo *seq_info = NULL; bool ret = false; - + Assert(seq_database_key->gsk_keylen <= SEQ_KEY_MAX_LENGTH); if (enable_gtm_sequence_debug) { @@ -4326,7 +4326,8 @@ GTMStorageHandle *GTM_StoreGetAllSeqInDatabase(GTM_SequenceKey seq_database_key, { seq_info = GetSeqStore(bucket_handle); - if(strncmp(seq_database_key->gsk_key,seq_info->gs_key.gsk_key,seq_database_key->gsk_keylen - 1) != 0) + if(!(strncmp(seq_database_key->gsk_key,seq_info->gs_key.gsk_key,seq_database_key->gsk_keylen - 1) == 0 && + seq_info->gs_key.gsk_key[seq_database_key->gsk_keylen - 1] == '.')) { bucket_handle = seq_info->gs_next; continue; From eb25d53f3d8d2a07e1ada46548582ddb8b0bf5d6 Mon Sep 17 00:00:00 2001 From: whalesong Date: Thu, 1 Jul 2021 10:50:28 +0800 Subject: [PATCH 403/578] bugfix: prepare regress failed (merge request !440) (cherry picked from commit 817da2ec) 6122f119 bugfix: prepare regress failed --- src/backend/pgxc/pool/execRemote.c | 41 +++++++++++++++++++++++++----- src/backend/pgxc/pool/pgxcnode.c | 6 ----- 2 files changed, 35 insertions(+), 12 deletions(-) diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index d5f96393..7e2ad873 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -3673,12 +3673,6 @@ pgxc_node_remote_cleanup_all(void) return; } - /* Do not cleanup connections if we have prepared statements on nodes */ - if (HaveActiveDatanodeStatements()) - { - return; - } - /* * Send down snapshot followed by DISCARD ALL command. 
*/ @@ -4679,9 +4673,16 @@ pgxc_node_remote_prepare(char *prepareGID, bool localNode, bool implicit) } else { + if (HaveActiveDatanodeStatements()) + { + reset_handles(); + } + else + { release_handles(false); } } + } clear_handles(); @@ -4749,10 +4750,17 @@ pgxc_node_remote_commit(TranscationType txn_type, bool need_release_handle) } else { + if (HaveActiveDatanodeStatements()) + { + reset_handles(); + } + else + { release_handles(false); } } } + } clear_handles(); } @@ -4980,10 +4988,17 @@ pgxc_node_remote_commit(TranscationType txn_type, bool need_release_handle) } else { + if (HaveActiveDatanodeStatements()) + { + reset_handles(); + } + else + { release_handles(false); } } } + } clear_handles(); #endif @@ -5826,9 +5841,16 @@ pgxc_node_remote_abort(TranscationType txn_type, bool need_release_handle) pgxc_node_remote_cleanup_all(); if (need_release_handle) { + if (HaveActiveDatanodeStatements()) + { + reset_handles(); + } + else + { release_handles(false); } } + } clear_handles(); pfree_pgxc_all_handles(handles); @@ -8763,9 +8785,16 @@ pgxc_node_remote_finish(char *prepareGID, bool commit, } else { + if (HaveActiveDatanodeStatements()) + { + reset_handles(); + } + else + { release_handles(false); } } + } clear_handles(); pfree_pgxc_all_handles(pgxc_handles); pfree(finish_cmd); diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c index dd034e94..5b36f087 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -1554,12 +1554,6 @@ reset_handles(void) return; } - /* Do not reset connections if we have prepared statements on nodes */ - if (HaveActiveDatanodeStatements()) - { - return; - } - /* Reset Datanodes handles occupied memory */ for (i = 0; i < NumDataNodes; i++) { From a4a2e7b7d328b3ea866541edf0266baf7b4f6692 Mon Sep 17 00:00:00 2001 From: andrelin Date: Thu, 1 Jul 2021 16:42:12 +0800 Subject: [PATCH 404/578] Not reset global session info when subtrans end http://tapd.oa.com/20421696/bugtrace/bugs/view?bug_id=1020421696089391431&url_cache_key=99b4551652ae6634ef20bbffc9885096 --- src/backend/pgxc/pool/execRemote.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 7e2ad873..492a3ac8 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -3649,13 +3649,17 @@ pgxc_node_begin(int conn_count, PGXCNodeHandle **connections, * specific stuff before releasing them to pool for reuse by other sessions. */ static void -pgxc_node_remote_cleanup_all(void) -{// #lizard forgives +pgxc_node_remote_cleanup_all(bool sub) +{ PGXCNodeAllHandles *handles = get_current_handles(); PGXCNodeHandle *new_connections[handles->co_conn_count + handles->dn_conn_count]; int new_conn_count = 0; int i; - char *resetcmd = "RESET ALL;" + /* if it's called by sub-commit or sub-abort, DO NOT reset global_session */ + char *resetcmd = sub ? 
"RESET ALL;" + "RESET SESSION AUTHORIZATION;" + "RESET transaction_isolation;" : + "RESET ALL;" "RESET SESSION AUTHORIZATION;" "RESET transaction_isolation;" "RESET global_session"; @@ -4665,7 +4669,7 @@ pgxc_node_remote_prepare(char *prepareGID, bool localNode, bool implicit) if (!temp_object_included) { /* Clean up remote sessions */ - pgxc_node_remote_cleanup_all(); + pgxc_node_remote_cleanup_all(false); if (PersistentConnections) { @@ -4737,10 +4741,12 @@ pgxc_node_remote_commit(TranscationType txn_type, bool need_release_handle) stat_transaction(conn_count); + /* do not cleanup remote session for subtrans */ if (!temp_object_included) { /* Clean up remote sessions */ - pgxc_node_remote_cleanup_all(); + pgxc_node_remote_cleanup_all(txn_type == TXN_TYPE_CommitSubTxn || + txn_type == TXN_TYPE_RollbackSubTxn); if (need_release_handle) { @@ -5838,7 +5844,8 @@ pgxc_node_remote_abort(TranscationType txn_type, bool need_release_handle) if (!temp_object_included) { /* Clean up remote sessions */ - pgxc_node_remote_cleanup_all(); + pgxc_node_remote_cleanup_all(txn_type == TXN_TYPE_CommitSubTxn || + txn_type == TXN_TYPE_RollbackSubTxn); if (need_release_handle) { if (HaveActiveDatanodeStatements()) @@ -8778,7 +8785,7 @@ pgxc_node_remote_finish(char *prepareGID, bool commit, if (!temp_object_included) { /* Clean up remote sessions */ - pgxc_node_remote_cleanup_all(); + pgxc_node_remote_cleanup_all(false); if (PersistentConnections) { reset_handles(); From 32340e531ee717eecd253cfe904e21212c790148 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Thu, 1 Jul 2021 21:09:17 +0800 Subject: [PATCH 405/578] fix syslogger coredump in process_pipe_input http://tapd.oa.com/pgxz/bugtrace/bugs/view?bug_id=1010092131089442025 --- src/gtm/common/elog.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gtm/common/elog.c b/src/gtm/common/elog.c index 597a252e..0bc8e0b9 100644 --- a/src/gtm/common/elog.c +++ b/src/gtm/common/elog.c @@ -778,7 +778,7 @@ write_pipe_chunks(char *data, int len, int dest) Assert(len > 0); p.proto.nuls[0] = p.proto.nuls[1] = '\0'; - p.proto.pid = (exit_flag == GTM_DEFAULT_EXIT_FLAG) ? (int) MyThreadID : 0; + p.proto.pid = (exit_flag == GTM_DEFAULT_EXIT_FLAG) ? (int) (ThreadId + 1) : 0; /* write all but the last chunk */ while (len > PIPE_MAX_PAYLOAD) From a749d0b7175bfd320ce16f0c8b9a33b8ec47a66a Mon Sep 17 00:00:00 2001 From: winter Date: Mon, 5 Jul 2021 14:23:24 +0800 Subject: [PATCH 406/578] fix 'could not read block xxxxx ... ' issue after master-slave switch --- src/backend/access/heap/hio.c | 35 ++++++----------------------------- 1 file changed, 6 insertions(+), 29 deletions(-) diff --git a/src/backend/access/heap/hio.c b/src/backend/access/heap/hio.c index e65127f4..78e63235 100644 --- a/src/backend/access/heap/hio.c +++ b/src/backend/access/heap/hio.c @@ -182,7 +182,7 @@ GetVisibilityMapPins(Relation relation, Buffer buffer1, Buffer buffer2, * amount which ramps up as the degree of contention ramps up, but limiting * the result to some sane overall value. 
*/ -static Buffer +static void RelationAddExtraBlocks(Relation relation, BulkInsertState bistate, ShardID sid) {// #lizard forgives Page page; @@ -194,7 +194,6 @@ RelationAddExtraBlocks(Relation relation, BulkInsertState bistate, ShardID sid) Buffer buffer; #ifdef _SHARDING_ - Buffer firstBuffer = InvalidBuffer; if(RelationHasExtent(relation) && !ShardIDIsValid(sid)) { elog(ERROR, "extent-organized relation must extend with shardid."); @@ -275,19 +274,12 @@ RelationAddExtraBlocks(Relation relation, BulkInsertState bistate, ShardID sid) if(bistate) bistate->sid = sid; #endif + UnlockReleaseBuffer(buffer); /* Remember first block number thus added. */ if (firstBlock == InvalidBlockNumber) - { firstBlock = blockNum; - firstBuffer = buffer; - } -#ifdef _SHARDING_ - else - { - UnlockReleaseBuffer(buffer); - } -#endif + /* * Immediately update the bottom level of the FSM. This has a good * chance of making this page visible to other concurrently inserting @@ -308,7 +300,6 @@ RelationAddExtraBlocks(Relation relation, BulkInsertState bistate, ShardID sid) */ UpdateFreeSpaceMap(relation, firstBlock, blockNum, freespace); - return firstBuffer; } #ifdef _SHARDING_ @@ -1056,23 +1047,12 @@ RelationGetBufferForTuple_shard(Relation relation, ShardID sid, Size len, UnlockRelationForExtension(relation, ExclusiveLock); goto loop; } + RelationAddExtraBlocks(relation, bistate, + RelationHasExtent(relation) ? sid : InvalidShardID); } } -#ifdef _SHARDING_ - /* - * We can be certain that locking the otherBuffer first is OK, since it - * must have a lower page number. - */ - if (otherBuffer != InvalidBuffer) - LockBuffer(otherBuffer, BUFFER_LOCK_EXCLUSIVE); - /* Time to bulk-extend. */ - buffer = RelationAddExtraBlocks(relation, bistate, - RelationHasExtent(relation) ? sid : InvalidShardID); -#endif - -#if 0 /* * In addition to whatever extension we performed above, we always add at * least one block to satisfy our own request. @@ -1095,7 +1075,6 @@ RelationGetBufferForTuple_shard(Relation relation, ShardID sid, Size len, * Now acquire lock on the new page. 
*/ LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); -#endif /* * Release the file-extension lock; it's now OK for someone else to extend @@ -1117,14 +1096,12 @@ RelationGetBufferForTuple_shard(Relation relation, ShardID sid, Size len, page = BufferGetPage(buffer); -#if 0 if (!PageIsNew(page)) elog(ERROR, "page %u of relation \"%s\" should be empty but is not", BufferGetBlockNumber(buffer), RelationGetRelationName(relation)); - PageInit(page, BufferGetPageSize(buffer), 0, sid); -#endif + PageInit_shard(page, BufferGetPageSize(buffer), 0, sid, false); if (len > PageGetHeapFreeSpace(page)) { From 751cfdb1c3685339ff3d77f75b5acdc0ca48e64f Mon Sep 17 00:00:00 2001 From: bethding Date: Tue, 13 Jul 2021 17:18:50 +0800 Subject: [PATCH 407/578] fix rename db cause gtm metadata inconsistent http://tapd.oa.com/pgxz/bugtrace/bugs/view?bug_id=1010092131087148145 --- src/backend/access/transam/gtm.c | 2 +- src/gtm/main/gtm_seq.c | 4 ++-- src/test/regress/expected/sequence.out | 19 +++++++++++++++++++ src/test/regress/sql/sequence.sql | 20 ++++++++++++++++++++ 4 files changed, 42 insertions(+), 3 deletions(-) diff --git a/src/backend/access/transam/gtm.c b/src/backend/access/transam/gtm.c index 1ac9e6d2..5f95e859 100644 --- a/src/backend/access/transam/gtm.c +++ b/src/backend/access/transam/gtm.c @@ -118,7 +118,7 @@ void RegisterSeqCreate(char *name, int32 type) old_cxt = MemoryContextSwitchTo(TopMemoryContext); key = (GTM_SequenceKeyData*)palloc(sizeof(GTM_SequenceKeyData)); - key->gsk_keylen = strlen(name); + key->gsk_keylen = strlen(name) + 1; key->gsk_key = pstrdup(name); key->gsk_type = type; g_CreateSeqList = lappend(g_CreateSeqList, key); diff --git a/src/gtm/main/gtm_seq.c b/src/gtm/main/gtm_seq.c index c9bb59c8..9abe9580 100644 --- a/src/gtm/main/gtm_seq.c +++ b/src/gtm/main/gtm_seq.c @@ -3538,11 +3538,11 @@ ProcessDBSequenceRenameCommand(Port *myport, StringInfo message, bool is_backup) GTM_StoreGetSeqKey(handles[i], old_key); temp_seqkey.gsk_key = old_key; - temp_seqkey.gsk_keylen = strnlen(old_key, SEQ_KEY_MAX_LENGTH); + temp_seqkey.gsk_keylen = strnlen(old_key, SEQ_KEY_MAX_LENGTH) + 1; snprintf(new_key, SEQ_KEY_MAX_LENGTH, "%s%s", newseqkey.gsk_key, old_key + strnlen(seqkey.gsk_key, SEQ_KEY_MAX_LENGTH)); temp_newseqkey.gsk_key = new_key; - temp_newseqkey.gsk_keylen = strnlen(new_key, SEQ_KEY_MAX_LENGTH); + temp_newseqkey.gsk_keylen = strnlen(new_key, SEQ_KEY_MAX_LENGTH) + 1; if ((errcode = GTM_SeqRename(&temp_seqkey, &temp_newseqkey, gxid))) { ereport(ERROR, diff --git a/src/test/regress/expected/sequence.out b/src/test/regress/expected/sequence.out index 10c76ead..2eae7bde 100644 --- a/src/test/regress/expected/sequence.out +++ b/src/test/regress/expected/sequence.out @@ -1,6 +1,13 @@ -- -- CREATE SEQUENCE -- +CREATE DATABASE db_seq1; +CREATE DATABASE db_seq2; +\c db_seq1 +CREATE SEQUENCE my_seq; +\c db_seq2 +CREATE SEQUENCE my_seq; +\c regression -- various error cases CREATE UNLOGGED SEQUENCE sequence_testx; ERROR: unlogged sequences are not supported @@ -833,3 +840,15 @@ SELECT nextval('test_seq1'); (1 row) DROP SEQUENCE test_seq1; +-- Test sequece when alter database +ALTER DATABASE db_seq1 RENAME TO db_seq3; +ALTER DATABASE db_seq2 RENAME TO db_seq1; +\c db_seq1 +DROP SEQUENCE my_seq; +CREATE SEQUENCE my_seq; +DROP SEQUENCE my_seq; +\c db_seq3 +DROP SEQUENCE my_seq; +CREATE SEQUENCE my_seq; +DROP SEQUENCE my_seq; +\q \ No newline at end of file diff --git a/src/test/regress/sql/sequence.sql b/src/test/regress/sql/sequence.sql index 3ca98bb3..fda62262 100644 --- 
a/src/test/regress/sql/sequence.sql +++ b/src/test/regress/sql/sequence.sql @@ -2,6 +2,14 @@ -- CREATE SEQUENCE -- +CREATE DATABASE db_seq1; +CREATE DATABASE db_seq2; +\c db_seq1 +CREATE SEQUENCE my_seq; +\c db_seq2 +CREATE SEQUENCE my_seq; +\c regression + -- various error cases CREATE UNLOGGED SEQUENCE sequence_testx; CREATE SEQUENCE sequence_testx INCREMENT BY 0; @@ -414,3 +422,15 @@ SELECT nextval('test_seq1'); SELECT nextval('test_seq1'); DROP SEQUENCE test_seq1; +-- Test sequece when alter database +ALTER DATABASE db_seq1 RENAME TO db_seq3; +ALTER DATABASE db_seq2 RENAME TO db_seq1; +\c db_seq1 +DROP SEQUENCE my_seq; +CREATE SEQUENCE my_seq; +DROP SEQUENCE my_seq; +\c db_seq3 +DROP SEQUENCE my_seq; +CREATE SEQUENCE my_seq; +DROP SEQUENCE my_seq; +\q From 412c4d20f1364e1febc0a8a448514b384bfd1c87 Mon Sep 17 00:00:00 2001 From: andrelin Date: Wed, 21 Jul 2021 17:11:25 +0800 Subject: [PATCH 408/578] fix regress --- src/test/regress/expected/join_3.out | 120 ++++++++---------- .../regress/expected/updatable_views_1.out | 76 ++++------- 2 files changed, 78 insertions(+), 118 deletions(-) diff --git a/src/test/regress/expected/join_3.out b/src/test/regress/expected/join_3.out index 5b7dfb96..16264c50 100644 --- a/src/test/regress/expected/join_3.out +++ b/src/test/regress/expected/join_3.out @@ -5776,13 +5776,13 @@ select * from j1 inner join j2 on j1.id = j2.id; -> Nested Loop Output: j1.id, j2.id Inner Unique: true + Join Filter: (j1.id = j2.id) -> Seq Scan on public.j1 Output: j1.id - -> Bitmap Heap Scan on public.j2 + -> Materialize + Output: j2.id + -> Seq Scan on public.j2 Output: j2.id - Recheck Cond: (j2.id = j1.id) - -> Bitmap Index Scan on j2_pkey - Index Cond: (j2.id = j1.id) (14 rows) -- ensure join is not unique when not an equi-join @@ -5795,17 +5795,15 @@ select * from j1 inner join j2 on j1.id > j2.id; Join Filter: (j1.id > j2.id) -> Remote Subquery Scan on all (datanode_1,datanode_2) Output: j1.id - -> Bitmap Heap Scan on public.j1 + -> Seq Scan on public.j1 Output: j1.id - -> Bitmap Index Scan on j1_pkey -> Materialize Output: j2.id -> Remote Subquery Scan on all (datanode_1,datanode_2) Output: j2.id - -> Bitmap Heap Scan on public.j2 + -> Seq Scan on public.j2 Output: j2.id - -> Bitmap Index Scan on j2_pkey -(15 rows) +(13 rows) -- ensure non-unique rel is not chosen as inner explain (verbose, costs off) @@ -5819,13 +5817,13 @@ select * from j1 inner join j3 on j1.id = j3.id; -> Nested Loop Output: j1.id, j3.id Inner Unique: true + Join Filter: (j1.id = j3.id) -> Seq Scan on public.j3 Output: j3.id - -> Bitmap Heap Scan on public.j1 + -> Materialize + Output: j1.id + -> Seq Scan on public.j1 Output: j1.id - Recheck Cond: (j1.id = j3.id) - -> Bitmap Index Scan on j1_pkey - Index Cond: (j1.id = j3.id) (14 rows) -- ensure left join is marked as unique @@ -5840,13 +5838,13 @@ select * from j1 left join j2 on j1.id = j2.id; -> Nested Loop Left Join Output: j1.id, j2.id Inner Unique: true + Join Filter: (j1.id = j2.id) -> Seq Scan on public.j1 Output: j1.id - -> Bitmap Heap Scan on public.j2 + -> Materialize + Output: j2.id + -> Seq Scan on public.j2 Output: j2.id - Recheck Cond: (j1.id = j2.id) - -> Bitmap Index Scan on j2_pkey - Index Cond: (j1.id = j2.id) (14 rows) -- ensure right join is marked as unique @@ -5859,13 +5857,13 @@ select * from j1 right join j2 on j1.id = j2.id; -> Nested Loop Left Join Output: j1.id, j2.id Inner Unique: true + Join Filter: (j1.id = j2.id) -> Seq Scan on public.j2 Output: j2.id - -> Bitmap Heap Scan on public.j1 + -> Materialize + Output: 
j1.id + -> Seq Scan on public.j1 Output: j1.id - Recheck Cond: (j1.id = j2.id) - -> Bitmap Index Scan on j1_pkey - Index Cond: (j1.id = j2.id) (12 rows) -- ensure full join is marked as unique @@ -5898,17 +5896,15 @@ select * from j1 cross join j2; Output: j1.id, j2.id -> Remote Subquery Scan on all (datanode_1,datanode_2) Output: j1.id - -> Bitmap Heap Scan on public.j1 + -> Seq Scan on public.j1 Output: j1.id - -> Bitmap Index Scan on j1_pkey -> Materialize Output: j2.id -> Remote Subquery Scan on all (datanode_1,datanode_2) Output: j2.id - -> Bitmap Heap Scan on public.j2 + -> Seq Scan on public.j2 Output: j2.id - -> Bitmap Index Scan on j2_pkey -(14 rows) +(12 rows) -- ensure a natural join is marked as unique explain (verbose, costs off) @@ -5922,13 +5918,13 @@ select * from j1 natural join j2; -> Nested Loop Output: j1.id Inner Unique: true + Join Filter: (j1.id = j2.id) -> Seq Scan on public.j1 Output: j1.id - -> Bitmap Heap Scan on public.j2 + -> Materialize + Output: j2.id + -> Seq Scan on public.j2 Output: j2.id - Recheck Cond: (j2.id = j1.id) - -> Bitmap Index Scan on j2_pkey - Index Cond: (j2.id = j1.id) (14 rows) -- ensure a distinct clause allows the inner to become unique @@ -5942,6 +5938,7 @@ inner join (select distinct id from j3) j3 on j1.id = j3.id; -> Nested Loop Output: j1.id, j3.id Inner Unique: true + Join Filter: (j1.id = j3.id) -> Unique Output: j3.id -> Sort @@ -5949,12 +5946,9 @@ inner join (select distinct id from j3) j3 on j1.id = j3.id; Sort Key: j3.id -> Seq Scan on public.j3 Output: j3.id - -> Bitmap Heap Scan on public.j1 + -> Seq Scan on public.j1 Output: j1.id - Recheck Cond: (j1.id = j3.id) - -> Bitmap Index Scan on j1_pkey - Index Cond: (j1.id = j3.id) -(17 rows) +(15 rows) -- ensure group by clause allows the inner to become unique explain (verbose, costs off) @@ -5967,6 +5961,7 @@ inner join (select id from j3 group by id) j3 on j1.id = j3.id; -> Nested Loop Output: j1.id, j3.id Inner Unique: true + Join Filter: (j1.id = j3.id) -> Group Output: j3.id Group Key: j3.id @@ -5975,12 +5970,9 @@ inner join (select id from j3 group by id) j3 on j1.id = j3.id; Sort Key: j3.id -> Seq Scan on public.j3 Output: j3.id - -> Bitmap Heap Scan on public.j1 + -> Seq Scan on public.j1 Output: j1.id - Recheck Cond: (j1.id = j3.id) - -> Bitmap Index Scan on j1_pkey - Index Cond: (j1.id = j3.id) -(18 rows) +(16 rows) drop table j1; drop table j2; @@ -6009,7 +6001,7 @@ inner join j2 on j1.id1 = j2.id1; -> Nested Loop Output: j1.id1, j1.id2, j2.id1, j2.id2 Join Filter: (j1.id1 = j2.id1) - -> Index Only Scan using j2_pkey on public.j2 + -> Seq Scan on public.j2 Output: j2.id1, j2.id2 -> Seq Scan on public.j1 Output: j1.id1, j1.id2 @@ -6028,14 +6020,12 @@ inner join j2 on j1.id1 = j2.id1 and j1.id2 = j2.id2; -> Nested Loop Output: j1.id1, j1.id2, j2.id1, j2.id2 Inner Unique: true - -> Index Only Scan using j2_pkey on public.j2 + Join Filter: ((j1.id1 = j2.id1) AND (j1.id2 = j2.id2)) + -> Seq Scan on public.j2 Output: j2.id1, j2.id2 - -> Bitmap Heap Scan on public.j1 + -> Seq Scan on public.j1 Output: j1.id1, j1.id2 - Recheck Cond: ((j1.id1 = j2.id1) AND (j1.id2 = j2.id2)) - -> Bitmap Index Scan on j1_pkey - Index Cond: ((j1.id1 = j2.id1) AND (j1.id2 = j2.id2)) -(14 rows) +(12 rows) -- ensure we don't detect the join to be unique when quals are not part of the -- join condition @@ -6050,15 +6040,13 @@ inner join j2 on j1.id1 = j2.id1 where j1.id2 = 1; Remote query: SELECT j1.id1, j1.id2, j2.id1, j2.id2 FROM (j1 JOIN j2 ON ((j1.id1 = j2.id1))) WHERE (j1.id2 = 1) -> Nested Loop 
Output: j1.id1, j1.id2, j2.id1, j2.id2 - Inner Unique: true - -> Index Only Scan using j2_pkey on public.j2 - Output: j2.id1, j2.id2 - -> Bitmap Heap Scan on public.j1 + Join Filter: (j1.id1 = j2.id1) + -> Seq Scan on public.j1 Output: j1.id1, j1.id2 - Recheck Cond: ((j1.id1 = j2.id1) AND (j1.id2 = 1)) - -> Bitmap Index Scan on j1_pkey - Index Cond: ((j1.id1 = j2.id1) AND (j1.id2 = 1)) -(14 rows) + Filter: (j1.id2 = 1) + -> Seq Scan on public.j2 + Output: j2.id1, j2.id2 +(12 rows) -- as above, but for left joins. explain (verbose, costs off) @@ -6073,14 +6061,12 @@ left join j2 on j1.id1 = j2.id1 where j1.id2 = 1; -> Nested Loop Left Join Output: j1.id1, j1.id2, j2.id1, j2.id2 Join Filter: (j1.id1 = j2.id1) - -> Bitmap Heap Scan on public.j1 + -> Seq Scan on public.j1 Output: j1.id1, j1.id2 - Recheck Cond: (j1.id2 = 1) - -> Bitmap Index Scan on j1_pkey - Index Cond: (j1.id2 = 1) - -> Index Only Scan using j2_pkey on public.j2 + Filter: (j1.id2 = 1) + -> Seq Scan on public.j2 Output: j2.id1, j2.id2 -(14 rows) +(12 rows) -- validate logic in merge joins which skips mark and restore. -- it should only do this if all quals which were used to detect the unique @@ -6099,13 +6085,11 @@ where j1.id1 % 1000 = 1 and j2.id1 % 1000 = 1; Node/s: datanode_1, datanode_2 -> Nested Loop Join Filter: ((j1.id1 = j2.id1) AND (j1.id2 = j2.id2)) - -> Bitmap Heap Scan on j1 - Recheck Cond: ((id1 % 1000) = 1) - -> Bitmap Index Scan on j1_id1_idx - -> Bitmap Heap Scan on j1 j2 - Recheck Cond: ((id1 % 1000) = 1) - -> Bitmap Index Scan on j1_id1_idx -(10 rows) + -> Seq Scan on j1 + Filter: ((id1 % 1000) = 1) + -> Seq Scan on j1 j2 + Filter: ((id1 % 1000) = 1) +(8 rows) select * from j1 j1 inner join j1 j2 on j1.id1 = j2.id1 and j1.id2 = j2.id2 diff --git a/src/test/regress/expected/updatable_views_1.out b/src/test/regress/expected/updatable_views_1.out index 4c2bfb95..e13b4537 100644 --- a/src/test/regress/expected/updatable_views_1.out +++ b/src/test/regress/expected/updatable_views_1.out @@ -2095,42 +2095,30 @@ UPDATE v1 SET a=100 WHERE snoop(a) AND leakproof(a) AND a < 7 AND a != 6; Update on public.t11 Update on public.t12 Update on public.t111 - -> Bitmap Heap Scan on public.t1 + -> Index Scan using t1_a_idx on public.t1 Output: 100, t1.b, t1.c, t1.ctid, t1.shardid - Recheck Cond: ((t1.a > 5) AND (t1.a < 7)) + Index Cond: ((t1.a > 5) AND (t1.a < 7)) Filter: ((t1.a <> 6) AND (SubPlan 1) AND snoop(t1.a) AND leakproof(t1.a)) - -> Bitmap Index Scan on t1_a_idx - Index Cond: ((t1.a > 5) AND (t1.a < 7)) SubPlan 1 -> Remote Subquery Scan on all (datanode_1) -> Append - -> Bitmap Heap Scan on public.t12 t12_1 - Recheck Cond: (t12_1.a = t1.a) - -> Bitmap Index Scan on t12_a_idx - Index Cond: (t12_1.a = t1.a) - -> Bitmap Heap Scan on public.t111 t111_1 - Recheck Cond: (t111_1.a = t1.a) - -> Bitmap Index Scan on t111_a_idx - Index Cond: (t111_1.a = t1.a) - -> Bitmap Heap Scan on public.t11 + -> Seq Scan on public.t12 t12_1 + Filter: (t12_1.a = t1.a) + -> Seq Scan on public.t111 t111_1 + Filter: (t111_1.a = t1.a) + -> Index Scan using t11_a_idx on public.t11 Output: 100, t11.b, t11.c, t11.d, t11.ctid, t11.shardid - Recheck Cond: ((t11.a > 5) AND (t11.a < 7)) + Index Cond: ((t11.a > 5) AND (t11.a < 7)) Filter: ((t11.a <> 6) AND (SubPlan 1) AND snoop(t11.a) AND leakproof(t11.a)) - -> Bitmap Index Scan on t11_a_idx - Index Cond: ((t11.a > 5) AND (t11.a < 7)) - -> Bitmap Heap Scan on public.t12 + -> Index Scan using t12_a_idx on public.t12 Output: 100, t12.b, t12.c, t12.e, t12.ctid, t12.shardid - Recheck Cond: ((t12.a > 5) AND 
(t12.a < 7)) + Index Cond: ((t12.a > 5) AND (t12.a < 7)) Filter: ((t12.a <> 6) AND (SubPlan 1) AND snoop(t12.a) AND leakproof(t12.a)) - -> Bitmap Index Scan on t12_a_idx - Index Cond: ((t12.a > 5) AND (t12.a < 7)) - -> Bitmap Heap Scan on public.t111 + -> Index Scan using t111_a_idx on public.t111 Output: 100, t111.b, t111.c, t111.d, t111.e, t111.ctid, t111.shardid - Recheck Cond: ((t111.a > 5) AND (t111.a < 7)) + Index Cond: ((t111.a > 5) AND (t111.a < 7)) Filter: ((t111.a <> 6) AND (SubPlan 1) AND snoop(t111.a) AND leakproof(t111.a)) - -> Bitmap Index Scan on t111_a_idx - Index Cond: ((t111.a > 5) AND (t111.a < 7)) -(41 rows) +(29 rows) UPDATE v1 SET a=100 WHERE snoop(a) AND leakproof(a) AND a < 7 AND a != 6; SELECT * FROM v1 WHERE a=100; -- Nothing should have been changed to 100 @@ -2153,42 +2141,30 @@ UPDATE v1 SET a=a+1 WHERE snoop(a) AND leakproof(a) AND a = 8; Update on public.t11 Update on public.t12 Update on public.t111 - -> Bitmap Heap Scan on public.t1 + -> Index Scan using t1_a_idx on public.t1 Output: (t1.a + 1), t1.b, t1.c, t1.ctid, t1.shardid - Recheck Cond: ((t1.a > 5) AND (t1.a = 8)) + Index Cond: ((t1.a > 5) AND (t1.a = 8)) Filter: ((SubPlan 1) AND snoop(t1.a) AND leakproof(t1.a)) - -> Bitmap Index Scan on t1_a_idx - Index Cond: ((t1.a > 5) AND (t1.a = 8)) SubPlan 1 -> Remote Subquery Scan on all (datanode_1) -> Append - -> Bitmap Heap Scan on public.t12 t12_1 - Recheck Cond: (t12_1.a = t1.a) - -> Bitmap Index Scan on t12_a_idx - Index Cond: (t12_1.a = t1.a) - -> Bitmap Heap Scan on public.t111 t111_1 - Recheck Cond: (t111_1.a = t1.a) - -> Bitmap Index Scan on t111_a_idx - Index Cond: (t111_1.a = t1.a) - -> Bitmap Heap Scan on public.t11 + -> Seq Scan on public.t12 t12_1 + Filter: (t12_1.a = t1.a) + -> Seq Scan on public.t111 t111_1 + Filter: (t111_1.a = t1.a) + -> Index Scan using t11_a_idx on public.t11 Output: (t11.a + 1), t11.b, t11.c, t11.d, t11.ctid, t11.shardid - Recheck Cond: ((t11.a > 5) AND (t11.a = 8)) + Index Cond: ((t11.a > 5) AND (t11.a = 8)) Filter: ((SubPlan 1) AND snoop(t11.a) AND leakproof(t11.a)) - -> Bitmap Index Scan on t11_a_idx - Index Cond: ((t11.a > 5) AND (t11.a = 8)) - -> Bitmap Heap Scan on public.t12 + -> Index Scan using t12_a_idx on public.t12 Output: (t12.a + 1), t12.b, t12.c, t12.e, t12.ctid, t12.shardid - Recheck Cond: ((t12.a > 5) AND (t12.a = 8)) + Index Cond: ((t12.a > 5) AND (t12.a = 8)) Filter: ((SubPlan 1) AND snoop(t12.a) AND leakproof(t12.a)) - -> Bitmap Index Scan on t12_a_idx - Index Cond: ((t12.a > 5) AND (t12.a = 8)) - -> Bitmap Heap Scan on public.t111 + -> Index Scan using t111_a_idx on public.t111 Output: (t111.a + 1), t111.b, t111.c, t111.d, t111.e, t111.ctid, t111.shardid - Recheck Cond: ((t111.a > 5) AND (t111.a = 8)) + Index Cond: ((t111.a > 5) AND (t111.a = 8)) Filter: ((SubPlan 1) AND snoop(t111.a) AND leakproof(t111.a)) - -> Bitmap Index Scan on t111_a_idx - Index Cond: ((t111.a > 5) AND (t111.a = 8)) -(41 rows) +(29 rows) UPDATE v1 SET a=a+1 WHERE snoop(a) AND leakproof(a) AND a = 8; SELECT * FROM v1 WHERE b=8; From d14091ecfa4572f918c88129779bb736ff8e5edc Mon Sep 17 00:00:00 2001 From: andrelin Date: Tue, 20 Jul 2021 16:49:10 +0800 Subject: [PATCH 409/578] fix: plan_id should be reduced by 1, reflecting the serial number in the array --- src/backend/nodes/nodeFuncs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/nodes/nodeFuncs.c b/src/backend/nodes/nodeFuncs.c index 8a10e344..e9916037 100644 --- a/src/backend/nodes/nodeFuncs.c +++ b/src/backend/nodes/nodeFuncs.c @@ -3912,7 +3912,7 
@@ plantree_walk_initplans(List *plans, foreach(lc, plans) { Plan *splan = (Plan *) list_nth(subplans, - (lfirst_node(SubPlan, lc))->plan_id); + (lfirst_node(SubPlan, lc))->plan_id - 1); if (walker(splan, context)) return true; From 849322202aca767eb70a9c80f9946086960c9d8a Mon Sep 17 00:00:00 2001 From: youngxie Date: Thu, 29 Jul 2021 17:32:20 +0800 Subject: [PATCH 410/578] Fix duplicate relfilenode with alter command. http://tapd.oa.com/10092131/bugtrace/bugs/view?bug_id=1010092131089349403&url_cache_key=d4e1402777dc733479aac463ad1a9d24 --- src/backend/commands/tablecmds.c | 51 ++++++++++++++++++++++++++++---- src/backend/nodes/copyfuncs.c | 1 + src/include/nodes/parsenodes.h | 1 + 3 files changed, 47 insertions(+), 6 deletions(-) diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index eb5b2b6b..4a44ba49 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -494,7 +494,7 @@ static void ATPostAlterTypeParse(Oid oldId, Oid oldRelId, Oid refRelId, bool rewrite); static void RebuildConstraintComment(AlteredTableInfo *tab, int pass, Oid objid, Relation rel, char *conname); -static void TryReuseIndex(Oid oldId, IndexStmt *stmt); +static void TryReuseIndex(Relation rel, Oid oldId, IndexStmt *stmt); static void TryReuseForeignKey(Oid oldId, Constraint *con); static void change_owner_fix_column_acls(Oid relationOid, Oid oldOwnerId, Oid newOwnerId); @@ -8715,6 +8715,7 @@ ATExecAddIndex(AlteredTableInfo *tab, Relation rel, bool check_rights; bool skip_build; bool quiet; + bool save_oldnode; ObjectAddress address; Assert(IsA(stmt, IndexStmt)); @@ -8725,8 +8726,10 @@ ATExecAddIndex(AlteredTableInfo *tab, Relation rel, /* suppress schema rights check when rebuilding existing index */ check_rights = !is_rebuild; + /* if we're resuing an old node */ + save_oldnode = OidIsValid(stmt->oldNode); /* skip index build if phase 3 will do it or we're reusing an old one */ - skip_build = tab->rewrite > 0 || OidIsValid(stmt->oldNode); + skip_build = tab->rewrite > 0 || save_oldnode; /* suppress notices when rebuilding existing index */ quiet = is_rebuild; @@ -8771,6 +8774,10 @@ ATExecAddIndex(AlteredTableInfo *tab, Relation rel, partidxstmt = (IndexStmt *)copyObject((void*)stmt); partidxstmt->relation->relname = GetPartitionName(RelationGetRelid(rel), i, false); partidxstmt->idxname = GetPartitionName(indexOid, i, true); + if (save_oldnode) + { + partidxstmt->oldNode = list_nth_oid(stmt->partsOldNode, i); + } partOid = get_relname_relid(partidxstmt->relation->relname, RelationGetNamespace(rel)); @@ -8789,6 +8796,19 @@ ATExecAddIndex(AlteredTableInfo *tab, Relation rel, false, /* check_not_in_use */ skip_build, /* skip_build */ quiet); /* quiet */ + /* + * If TryReuseIndex() stashed a relfilenode for us, we used it for the new + * index instead of building from scratch. The DROP of the old edition of + * this index will have scheduled the storage for deletion at commit, so + * cancel that pending deletion. + */ + if (save_oldnode) + { + Relation irel = index_open(addr.objectId, NoLock); + + RelationPreserveStorage(irel->rd_node, true); + index_close(irel, NoLock); + } /* Make dependency entries */ myself.classId = RelationRelationId; @@ -8819,7 +8839,7 @@ ATExecAddIndex(AlteredTableInfo *tab, Relation rel, * this index will have scheduled the storage for deletion at commit, so * cancel that pending deletion. 
*/ - if (OidIsValid(stmt->oldNode)) + if (save_oldnode) { Relation irel = index_open(address.objectId, NoLock); @@ -11912,7 +11932,7 @@ ATPostAlterTypeParse(Oid oldId, Oid oldRelId, Oid refRelId, char *cmd, AlterTableCmd *newcmd; if (!rewrite) - TryReuseIndex(oldId, stmt); + TryReuseIndex(rel, oldId, stmt); stmt->reset_default_tblspc = true; /* keep the index's comment */ stmt->idxcomment = GetComment(oldId, RelationRelationId, 0); @@ -11941,7 +11961,7 @@ ATPostAlterTypeParse(Oid oldId, Oid oldRelId, Oid refRelId, char *cmd, indoid = get_constraint_index(oldId); if (!rewrite) - TryReuseIndex(indoid, indstmt); + TryReuseIndex(rel, indoid, indstmt); /* keep any comment on the index */ indstmt->idxcomment = GetComment(indoid, RelationRelationId, 0); @@ -12028,7 +12048,7 @@ RebuildConstraintComment(AlteredTableInfo *tab, int pass, Oid objid, * for the real analysis, then mutates the IndexStmt based on that verdict. */ static void -TryReuseIndex(Oid oldId, IndexStmt *stmt) +TryReuseIndex(Relation rel, Oid oldId, IndexStmt *stmt) { if (CheckIndexCompatible(oldId, stmt->accessMethod, @@ -12039,6 +12059,25 @@ TryReuseIndex(Oid oldId, IndexStmt *stmt) stmt->oldNode = irel->rd_node.relNode; index_close(irel, NoLock); + + if (RELATION_IS_INTERVAL(rel)) + { + int nParts = 0; + int i = 0; + + nParts = RelationGetNParts(rel); + stmt->partsOldNode = NULL; + + for (i = 0; i < nParts; i++) + { + Relation iprel = index_open(RelationGetPartitionIndex(rel, oldId, i), + NoLock); + + stmt->partsOldNode = lappend_oid(stmt->partsOldNode, + iprel->rd_node.relNode); + index_close(iprel, NoLock); + } + } } } diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c index 9ccd69bd..702eec38 100644 --- a/src/backend/nodes/copyfuncs.c +++ b/src/backend/nodes/copyfuncs.c @@ -3753,6 +3753,7 @@ _copyIndexStmt(const IndexStmt *from) COPY_SCALAR_FIELD(reset_default_tblspc); #ifdef __TBASE__ COPY_SCALAR_FIELD(parentIndexOid); + COPY_NODE_FIELD(partsOldNode); #endif return newnode; } diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h index 5554ee7b..01ab8277 100644 --- a/src/include/nodes/parsenodes.h +++ b/src/include/nodes/parsenodes.h @@ -2903,6 +2903,7 @@ typedef struct IndexStmt #ifdef __TBASE__ /* used for interval partition */ Oid parentIndexOid; + List *partsOldNode; /* like oldNode just for partition tables */ #endif } IndexStmt; From 1e6affb7c02fccae909bd97d22e91a2ecd8c7576 Mon Sep 17 00:00:00 2001 From: challzhang Date: Mon, 2 Aug 2021 15:14:45 +0800 Subject: [PATCH 411/578] Fix tuple does not match the descriptor when executing insert in JDBC --- src/backend/tcop/pquery.c | 8 ++- src/test/regress/expected/insert_1.out | 52 ++++++++------------ src/test/regress/expected/sanity_check_1.out | 1 + src/test/regress/sql/insert.sql | 12 +++++ 4 files changed, 41 insertions(+), 32 deletions(-) diff --git a/src/backend/tcop/pquery.c b/src/backend/tcop/pquery.c index 1a2cb2cc..715d407b 100644 --- a/src/backend/tcop/pquery.c +++ b/src/backend/tcop/pquery.c @@ -188,6 +188,7 @@ ProcessQuery(PlannedStmt *plan, char *completionTag, int instrument) { + int eflags = 0; QueryDesc *queryDesc; /* @@ -206,10 +207,15 @@ ProcessQuery(PlannedStmt *plan, GetActiveSnapshot(), InvalidSnapshot, dest, params, queryEnv, instrument); + if (plan->hasReturning) + { + eflags |= EXEC_FLAG_RETURNING; + } + /* * Call ExecutorStart to prepare the plan for execution */ - ExecutorStart(queryDesc, 0); + ExecutorStart(queryDesc, eflags); /* * Run the plan to completion. 
diff --git a/src/test/regress/expected/insert_1.out b/src/test/regress/expected/insert_1.out index 78de338c..21232dff 100644 --- a/src/test/regress/expected/insert_1.out +++ b/src/test/regress/expected/insert_1.out @@ -875,40 +875,30 @@ with baseInfo as(select * from t1) insert into t2 select * from baseInfo; drop table t1; drop table t2; --- Determine whether tables of different groups are allowed to insert. -set default_locator_type to shard; -drop table if exists t2; -NOTICE: table "t2" does not exist, skipping -drop table if exists t2_rep; -NOTICE: table "t2_rep" does not exist, skipping -drop table if exists t2_new; -NOTICE: table "t2_new" does not exist, skipping -create table t2(f1 int,f2 int); -NOTICE: Replica identity is needed for shard table, please add to this table through "alter table" command. -create table t2_rep(f1 int,f2 int) distribute by replication; -insert into t2_rep values(1,1),(2,2); -insert into t2 select * from t2_rep; -select count(*) from t2_rep; - count -------- - 2 +-- test insert with returning in JDBC +drop table if exists insertwithret; +NOTICE: table "insertwithret" does not exist, skipping +create table insertwithret(a int, b text, c int); +prepare p0(int,text,int) as insert into insertwithret values($1, $2, $3) returning a; +prepare p1(int,text,int) as insert into insertwithret values($1, $2, $3) returning a,b; +prepare p2(int,text,int) as insert into insertwithret values($1, $2, $3) returning c; +prepare p3(int,text,int) as insert into insertwithret values($1, $2, $3); +execute p0(1, 'abc', 1); + a +--- + 1 (1 row) -select count(*) from t2; - count -------- - 2 +execute p1(1, 'abc', 1); + a | b +---+----- + 1 | abc (1 row) -create table t2_new as select * from t2_rep; -NOTICE: Replica identity is needed for shard table, please add to this table through "alter table" command. 
-select count(*) from t2_new; - count -------- - 2 +execute p2(1, 'abc', 1); + c +--- + 1 (1 row) -drop table t2; -drop table t2_rep; -drop table t2_new; -reset default_locator_type; +execute p3(1, 'abc', 1); diff --git a/src/test/regress/expected/sanity_check_1.out b/src/test/regress/expected/sanity_check_1.out index 8b55f563..dd80648c 100644 --- a/src/test/regress/expected/sanity_check_1.out +++ b/src/test/regress/expected/sanity_check_1.out @@ -60,6 +60,7 @@ inet_tbl|t inhf|f inhx|t insert_tbl|f +insertwithret|f int2_tbl|f int4_tbl|f int8_tbl|f diff --git a/src/test/regress/sql/insert.sql b/src/test/regress/sql/insert.sql index b9b08d55..5591b65e 100644 --- a/src/test/regress/sql/insert.sql +++ b/src/test/regress/sql/insert.sql @@ -573,3 +573,15 @@ drop table t2; drop table t2_rep; drop table t2_new; reset default_locator_type; +-- test insert with returning in JDBC +drop table if exists insertwithret; +create table insertwithret(a int, b text, c int); +prepare p0(int,text,int) as insert into insertwithret values($1, $2, $3) returning a; +prepare p1(int,text,int) as insert into insertwithret values($1, $2, $3) returning a,b; +prepare p2(int,text,int) as insert into insertwithret values($1, $2, $3) returning c; +prepare p3(int,text,int) as insert into insertwithret values($1, $2, $3); + +execute p0(1, 'abc', 1); +execute p1(1, 'abc', 1); +execute p2(1, 'abc', 1); +execute p3(1, 'abc', 1); From 95d9ed0e8bc068c7fbaa747abe3d8cf6240a0b65 Mon Sep 17 00:00:00 2001 From: andrelin Date: Wed, 4 Aug 2021 10:53:55 +0800 Subject: [PATCH 412/578] Make cluster_activity visible in the same time as pg_stat_activity Achieved this by adding a hook in pgstat_report_activity tapd: http://tapd.oa.com/20418349/bugtrace/bugs/view?bug_id=1020418349090364325&url_cache_key=adcff0a8af8b863601a2454d42ec091b --- .../pg_stat_cluster_activity.c | 29 +++++++++++++++++-- src/backend/postmaster/pgstat.c | 5 ++++ src/include/pgstat.h | 2 ++ 3 files changed, 34 insertions(+), 2 deletions(-) diff --git a/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c b/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c index 0cc836d3..efe74c95 100644 --- a/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c +++ b/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c @@ -96,6 +96,7 @@ static PgClusterStatus *ClusterStatusArray = NULL; static PgClusterStatus *MyCSEntry = NULL; static shmem_startup_hook_type prev_shmem_startup_hook = NULL; +static pgstat_report_hook_type prev_pgstat_report_hook = NULL; static PortalStart_hook_type prev_PortalStart = NULL; static PortalDrop_hook_type prev_PortalDrop = NULL; static ExecutorStart_hook_type prev_ExecutorStart = NULL; @@ -341,13 +342,34 @@ pgcs_report_common(PgClusterStatus *entry, QueryDesc *desc) /* ---------- * pgcs_report_query_activity + * + * Do nothing but set common field, just enable this cluster entry + * to make it visible in the same time as pg_stat_activity. Hooked + * in pgstat_report_activity, args are redundant. + */ +static void +pgcs_report_query_activity(BackendState state, const char *cmd_str) +{ + volatile PgClusterStatus *entry; + + pgcs_entry_initialize(); + entry = MyCSEntry; + + pgcs_report_common((PgClusterStatus *) entry, NULL); + + if (prev_pgstat_report_hook) + prev_pgstat_report_hook(state, cmd_str); +} + +/* ---------- + * pgcs_report_executor_activity * * Report fileds of per-query referred, hooked as ExecutorStart_hook * report planstate, cursors and common fields. 
* ---------- */ static void -pgcs_report_query_activity(QueryDesc *desc, int eflags) +pgcs_report_executor_activity(QueryDesc *desc, int eflags) { volatile PgClusterStatus *entry; StringInfo planstate_str = NULL; @@ -1076,12 +1098,14 @@ _PG_init(void) */ prev_shmem_startup_hook = shmem_startup_hook; shmem_startup_hook = pgcs_shmem_startup; + prev_pgstat_report_hook = pgstat_report_hook; + pgstat_report_hook = pgcs_report_query_activity; prev_PortalStart = PortalStart_hook; PortalStart_hook = pgcs_report_activity; prev_PortalDrop = PortalDrop_hook; PortalDrop_hook = pgcs_report_activity; prev_ExecutorStart = ExecutorStart_hook; - ExecutorStart_hook = pgcs_report_query_activity; + ExecutorStart_hook = pgcs_report_executor_activity; } /* @@ -1092,6 +1116,7 @@ _PG_fini(void) { /* Uninstall hooks. */ shmem_startup_hook = prev_shmem_startup_hook; + pgstat_report_hook = prev_pgstat_report_hook; PortalStart_hook = prev_PortalStart; PortalDrop_hook = prev_PortalDrop; ExecutorStart_hook = prev_ExecutorStart; diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c index 0d77754a..76d4ff19 100644 --- a/src/backend/postmaster/pgstat.c +++ b/src/backend/postmaster/pgstat.c @@ -143,6 +143,8 @@ char *pgstat_stat_tmpname = NULL; */ PgStat_MsgBgWriter BgWriterStats; +pgstat_report_hook_type pgstat_report_hook = NULL; + /* ---------- * Local data * ---------- @@ -3128,6 +3130,9 @@ pgstat_report_activity(BackendState state, const char *cmd_str) } pgstat_increment_changecount_after(beentry); + + if (pgstat_report_hook) + pgstat_report_hook(state, cmd_str); } /*----------- diff --git a/src/include/pgstat.h b/src/include/pgstat.h index 15dd8b59..2049d855 100644 --- a/src/include/pgstat.h +++ b/src/include/pgstat.h @@ -1132,6 +1132,8 @@ typedef struct PgStat_FunctionCallUsage instr_time f_start; } PgStat_FunctionCallUsage; +typedef void (*pgstat_report_hook_type) (BackendState state, const char *cmd_str); +extern PGDLLIMPORT pgstat_report_hook_type pgstat_report_hook; /* ---------- * GUC parameters From 8bbdbb85d45b414391dcaf7442cfdae829188c4a Mon Sep 17 00:00:00 2001 From: sigmalin Date: Tue, 20 Jul 2021 15:44:20 +0800 Subject: [PATCH 413/578] fix deadlock in BufferConnection http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131088847515 (merge request git status) --- src/backend/pgxc/pool/execRemote.c | 115 ++++++++++++++++++++++++++--- src/include/pgxc/execRemote.h | 4 +- 2 files changed, 108 insertions(+), 11 deletions(-) diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 492a3ac8..19eff0ea 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -1163,8 +1163,8 @@ ValidateAndCloseCombiner(ResponseCombiner *combiner) * connection should be buffered. */ void -BufferConnection(PGXCNodeHandle *conn) -{// #lizard forgives +BufferConnection(PGXCNodeHandle *conn, bool need_prefetch) +{ ResponseCombiner *combiner = conn->combiner; MemoryContext oldcontext; @@ -1375,15 +1375,112 @@ BufferConnection(PGXCNodeHandle *conn) continue; } - /* incomplete message, read more */ if (res == RESPONSE_EOF) { +#ifdef __TBASE__ + if (need_prefetch) + { + /* + * We encountered incomplete message, try to read more. + * Here if we read timeout, then we move to other connections to read, because we + * easily got deadlock if a specific cursor run as producer on two nodes. If we can + * consume data from all all connections, we can break the deadlock loop. 
+ */ + bool bComplete = false; + DNConnectionState state = DN_CONNECTION_STATE_IDLE; + int i = 0; + int ret = 0; + PGXCNodeHandle *save_conn = NULL; + struct timeval timeout; + timeout.tv_sec = 0; + timeout.tv_usec = 1000; + + save_conn = conn; + while (1) + { + conn = save_conn; + state = conn->state; /* Save the connection state. */ + ret = pgxc_node_receive(1, &conn, &timeout); + if (DNStatus_OK == ret) + { + /* We got data, handle it. */ + break; + } + else if (DNStatus_ERR == ret) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to receive more data from data node %u", conn->nodeoid))); + } + else + { + /* Restore the saved state of connection. */ + conn->state = state; + } + + /* Try to read data from other connections. */ + for (i = 0; i < combiner->conn_count; i ++) + { + conn = combiner->connections[i]; + if (save_conn != conn && conn != NULL) + { + /* Save the connection state. */ + state = conn->state; + if (state == DN_CONNECTION_STATE_QUERY) + { + ret = pgxc_node_receive(1, &conn, &timeout); + if (DNStatus_OK == ret) + { + /* We got data, prefetch it. */ + bComplete = PreFetchConnection(conn, i); + if (bComplete) + { + /* Receive Complete on one connection, we need retry to read from current_conn. */ + break; + } + else + { + /* Maybe Suspend or Expired, just move to next connection and read. */ + continue; + } + } + else if (DNStatus_EXPIRED == ret) + { + /* Restore the saved state of connection. */ + conn->state = state; + continue; + } + else + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to receive more data from data node %u", conn->nodeoid))); + } + } + } + } + } + continue; + } + else + { + /* incomplete message, read more */ + if (pgxc_node_receive(1, &conn, NULL)) + { + PGXCNodeSetConnectionState(conn, + DN_CONNECTION_STATE_ERROR_FATAL); + add_error_message(conn, "Failed to fetch from data node"); + } + } +#else + /* incomplete message, read more */ if (pgxc_node_receive(1, &conn, NULL)) { PGXCNodeSetConnectionState(conn, DN_CONNECTION_STATE_ERROR_FATAL); add_error_message(conn, "Failed to fetch from data node"); } +#endif } /* @@ -3464,7 +3561,7 @@ pgxc_node_begin(int conn_count, PGXCNodeHandle **connections, * any bugs reported */ if (connections[i]->state == DN_CONNECTION_STATE_QUERY) - BufferConnection(connections[i]); + BufferConnection(connections[i], false); /* Send global session id */ if (pgxc_node_send_sessionid(connections[i])) @@ -3979,7 +4076,7 @@ pgxc_node_remote_prepare(char *prepareGID, bool localNode, bool implicit) { /* Read in any pending input */ if (conn->state != DN_CONNECTION_STATE_IDLE) - BufferConnection(conn); + BufferConnection(conn, false); if (conn->read_only) { @@ -4857,7 +4954,7 @@ pgxc_node_remote_commit(TranscationType txn_type, bool need_release_handle) /* Read in any pending input */ if (conn->state != DN_CONNECTION_STATE_IDLE) { - BufferConnection(conn); + BufferConnection(conn, false); } #if 0 @@ -5513,7 +5610,7 @@ pgxc_node_remote_abort(TranscationType txn_type, bool need_release_handle) /* Read in any pending input */ if (conn->state != DN_CONNECTION_STATE_IDLE) { - BufferConnection(conn); + BufferConnection(conn, false); } /* @@ -6802,7 +6899,7 @@ SendTxnInfo(RemoteQuery *node, PGXCNodeHandle *conn, CommandId cid, Snapshot snapshot) { if (conn->state == DN_CONNECTION_STATE_QUERY) - BufferConnection(conn); + BufferConnection(conn, false); if (snapshot && pgxc_node_send_snapshot(conn, snapshot)) { ereport(ERROR, @@ -7176,7 +7273,7 @@ ExecCloseRemoteStatement(const char 
*stmt_name, List *nodelist) for (i = 0; i < conn_count; i++) { if (connections[i]->state == DN_CONNECTION_STATE_QUERY) - BufferConnection(connections[i]); + BufferConnection(connections[i], false); if (pgxc_node_send_close(connections[i], true, stmt_name) != 0) { /* diff --git a/src/include/pgxc/execRemote.h b/src/include/pgxc/execRemote.h index 236979f8..c76b946a 100644 --- a/src/include/pgxc/execRemote.h +++ b/src/include/pgxc/execRemote.h @@ -392,7 +392,7 @@ extern void HandleCmdComplete(CmdType commandType, CombineTag *combine, const ch if ((conn)->state == DN_CONNECTION_STATE_QUERY && \ (conn)->combiner && \ (conn)->combiner != (ResponseCombiner *) (node)) \ - BufferConnection(conn); \ + BufferConnection(conn, true); \ (conn)->combiner = (ResponseCombiner *) (node); \ } while(0) @@ -400,7 +400,7 @@ extern TupleTableSlot *FetchTuple(ResponseCombiner *combiner); extern void InitResponseCombiner(ResponseCombiner *combiner, int node_count, CombineType combine_type); extern void CloseCombiner(ResponseCombiner *combiner); -extern void BufferConnection(PGXCNodeHandle *conn); +extern void BufferConnection(PGXCNodeHandle *conn, bool need_prefetch); extern bool PreFetchConnection(PGXCNodeHandle *conn, int32 node_index); extern void ExecRemoteQueryReScan(RemoteQueryState *node, ExprContext *exprCtxt); From 65b808332da7f6e852d4cd347112cf8f86fcb357 Mon Sep 17 00:00:00 2001 From: andrelin Date: Tue, 17 Aug 2021 14:58:43 +0800 Subject: [PATCH 414/578] Support interval table pruning for IN (array) (merge request !598) Squash merge branch 'andrelin/in_pruning' into 'Tbase_v5.06' tapd: http://tapd.oa.com/pgxz/tobject/tobjects/view/10217?system_name=onlinebug --- src/backend/utils/adt/ruleutils.c | 267 ++++++++++++++++++++++-- src/include/catalog/pg_type.h | 3 + src/test/regress/expected/partition.out | 38 ++++ src/test/regress/sql/partition.sql | 19 ++ 4 files changed, 307 insertions(+), 20 deletions(-) diff --git a/src/backend/utils/adt/ruleutils.c b/src/backend/utils/adt/ruleutils.c index eb2e5420..6b2dd38b 100644 --- a/src/backend/utils/adt/ruleutils.c +++ b/src/backend/utils/adt/ruleutils.c @@ -516,6 +516,7 @@ static char *flatten_reloptions(Oid relid); #ifdef __TBASE__ static Bitmapset *pruning_walker(Relation rel, Node *expr); static Bitmapset *pruning_opexpr(Relation rel, OpExpr *expr); +static Bitmapset *pruning_scalar_array_opexpr(Relation rel, ScalarArrayOpExpr *expr); static Bitmapset *get_full_pruning_result(Relation rel); static int get_daysofmonth(int startmonth, int startday, int endmonth, int endday); @@ -12275,6 +12276,9 @@ pruning_walker(Relation rel, Node *expr) } } break; + case T_ScalarArrayOpExpr: + result = pruning_scalar_array_opexpr(rel, (ScalarArrayOpExpr*)expr); + break; default: result = get_full_pruning_result(rel); break; @@ -12283,6 +12287,49 @@ pruning_walker(Relation rel, Node *expr) return result; } +static int +find_partidx_by_const(Datum constvalue, int consttype, Form_pg_partition_interval routerinfo, QulificationType qualtype) +{ + int partidx = -1; /* full as default */ + + switch(consttype) + { + case INT2OID: /* int2 */ + { + int value_int16; + value_int16 = DatumGetInt16(constvalue); + partidx = find_partidx_by_int(routerinfo->partstartvalue_int, routerinfo->partinterval_int, + routerinfo->partnparts, (int64) value_int16, qualtype); + } + break; + case INT4OID: /* int4 */ + { + int value_int32; + value_int32 = DatumGetInt32(constvalue); + partidx = find_partidx_by_int(routerinfo->partstartvalue_int, routerinfo->partinterval_int, + routerinfo->partnparts, 
(int64) value_int32, qualtype); + } + break; + case INT8OID: /* int8 */ + { + partidx = find_partidx_by_int(routerinfo->partstartvalue_int, routerinfo->partinterval_int, + routerinfo->partnparts, DatumGetInt64(constvalue), qualtype); + } + break; + case TIMESTAMPOID: /* timestamp */ + partidx = find_partidx_by_timestamp(routerinfo->partstartvalue_ts, routerinfo->partinterval_int, + routerinfo->partinterval_type, + routerinfo->partnparts, DatumGetTimestamp(constvalue), + qualtype); + break; + default: + elog(WARNING, "unsupported partidx type %d", consttype); + break; + } + + return partidx; +} + static Bitmapset * pruning_opexpr(Relation rel, OpExpr *expr) {// #lizard forgives @@ -12381,34 +12428,202 @@ pruning_opexpr(Relation rel, OpExpr *expr) switch(arg_const->consttype) { case INT2OID: /* int2 */ + case INT4OID: /* int4 */ + case INT8OID: /* int8 */ + case TIMESTAMPOID: /* timestamp */ { - int value_int16; - value_int16 = DatumGetInt16(arg_const->constvalue); - partidx = find_partidx_by_int(routerinfo->partstartvalue_int, routerinfo->partinterval_int, - routerinfo->partnparts, (int64)value_int16, qualtype); + partidx = find_partidx_by_const(arg_const->constvalue, arg_const->consttype, routerinfo, qualtype); } break; - case INT4OID: /* int4 */ + default: + elog(ERROR, "unsupported const type:[%u]", arg_const->consttype); + } + + npart = RelationGetNParts(rel); + if(npart <= 0) { - int value_int32; - value_int32 = DatumGetInt32(arg_const->constvalue); - partidx = find_partidx_by_int(routerinfo->partstartvalue_int, routerinfo->partinterval_int, - routerinfo->partnparts, (int64)value_int32, qualtype); + elog(ERROR, "internal error: pruning_opexpr:partitioned table has no partitions"); + } + + if(partidx == PARTITION_ROUTER_RESULT_FULL) + return get_full_pruning_result(rel); + else if(partidx == PARTITION_ROUTER_RESULT_NULL) + return NULL; + else if(partidx >= 0) + { + char *partname = NULL; + Oid partoid = InvalidOid; + + switch(qualtype) + { + case QULIFICATION_TYPE_LS: + case QULIFICATION_TYPE_LE: + { + int i; + for(i = 0; i <= partidx; i++) + { + partname = GetPartitionName(RelationGetRelid(rel), i, false); + partoid = get_relname_relid(partname, RelationGetNamespace(rel)); + if(partoid) + { + result = bms_add_member(result, i); + } + } } break; - case INT8OID: /* int8 */ + case QULIFICATION_TYPE_EQUAL: { - partidx = find_partidx_by_int(routerinfo->partstartvalue_int, routerinfo->partinterval_int, - routerinfo->partnparts, DatumGetInt64(arg_const->constvalue), qualtype); + partname = GetPartitionName(RelationGetRelid(rel), partidx, false); + partoid = get_relname_relid(partname, RelationGetNamespace(rel)); + if(partoid) + { + result = bms_make_singleton(partidx); + } } break; - case TIMESTAMPOID: /* timestamp */ - partidx = find_partidx_by_timestamp(routerinfo->partstartvalue_ts, routerinfo->partinterval_int, - routerinfo->partinterval_type, - routerinfo->partnparts, DatumGetTimestamp(arg_const->constvalue), qualtype); + case QULIFICATION_TYPE_GE: + case QULIFICATION_TYPE_GT: + { + int i; + for(i = partidx; i < npart; i++) + { + partname = GetPartitionName(RelationGetRelid(rel), i, false); + partoid = get_relname_relid(partname, RelationGetNamespace(rel)); + if(partoid) + { + result = bms_add_member(result, i); + } + } + } break; default: - elog(ERROR, "unsupported const type:[%u]", arg_const->consttype); + //nerver occur + elog(ERROR, "internal error: pruning_opexpr: invalid QulificationType[%d]", qualtype); + } + } + + return result; +} + +static Bitmapset * 
+pruning_scalar_array_opexpr(Relation rel, ScalarArrayOpExpr *expr) +{ + Bitmapset *result = NULL; + char *opname = NULL; + Node *leftarg = NULL; + Node *rightarg = NULL; + Var *arg_var = NULL; + Const *arg_const = NULL; + bool isswap = false; + int npart; + int partidx; + AttrNumber partkey; + QulificationType qualtype = QULIFICATION_TYPE_EQUAL; + Form_pg_partition_interval routerinfo; + ArrayType *arrayval; + int16 elmlen; + bool elmbyval; + char elmalign; + int num_elems; + Datum *elem_values; + int elem_type; + bool *elem_nulls; + int i; + + partkey = RelationGetPartitionColumnIndex(rel); + + if(list_length(expr->args) != 2) + return get_full_pruning_result(rel); + + leftarg = (Node *)list_nth(expr->args,0); + rightarg = (Node *)list_nth(expr->args,1); + + if (IsA(leftarg,Var)) + { + arg_var = (Var *)leftarg; + arg_const = (Const *)rightarg; + } + else if (IsA(rightarg,Var)) + { + arg_var = (Var *)rightarg; + arg_const = (Const *)leftarg; + isswap = true; + } + else + { + return get_full_pruning_result(rel); + } + + if (arg_const == NULL || + (!IsA(arg_const, Const)) || + arg_var->varattno != partkey) + { + return get_full_pruning_result(rel); + } + + opname = get_opname(expr->opno); + + if(strcmp("<",opname) == 0) + { + if(!isswap) + qualtype = QULIFICATION_TYPE_LS; + else + qualtype = QULIFICATION_TYPE_GT; + } + else if(strcmp("<=",opname) == 0) + { + if(!isswap) + qualtype = QULIFICATION_TYPE_LE; + else + qualtype = QULIFICATION_TYPE_GE; + } + else if(strcmp("=",opname) == 0) + { + qualtype = QULIFICATION_TYPE_EQUAL; + } + else if(strcmp(">=",opname) == 0) + { + if(!isswap) + qualtype = QULIFICATION_TYPE_GE; + else + qualtype = QULIFICATION_TYPE_LE; + } + else if(strcmp(">",opname) == 0) + { + if(!isswap) + qualtype = QULIFICATION_TYPE_GT; + else + qualtype = QULIFICATION_TYPE_LS; + } + else + { + /* any other case, get full partitions */ + return get_full_pruning_result(rel); + } + + routerinfo = rel->rd_partitions_info; + + if(!routerinfo) + { + elog(ERROR, "relation[%s] is not a partitioned table", RelationGetRelationName(rel)); + } + + switch(arg_const->consttype) + { + case INT2ARRAYOID: /* int2 */ + elem_type = INT2OID; + break; + case INT4ARRAYOID: /* int4 */ + elem_type = INT4OID; + break; + case INT8ARRAYOID: /* int8 */ + elem_type = INT8OID; + break; + case TIMESTAMPARRAYOID: /* timestamp */ + elem_type = TIMESTAMPOID; + break; + default: + return get_full_pruning_result(rel); } npart = RelationGetNParts(rel); @@ -12417,6 +12632,19 @@ pruning_opexpr(Relation rel, OpExpr *expr) elog(ERROR, "internal error: pruning_opexpr:partitioned table has no partitions"); } + arrayval = DatumGetArrayTypeP(arg_const->constvalue); + /* We could cache this data, but not clear it's worth it */ + get_typlenbyvalalign(ARR_ELEMTYPE(arrayval), + &elmlen, &elmbyval, &elmalign); + deconstruct_array(arrayval, + ARR_ELEMTYPE(arrayval), + elmlen, elmbyval, elmalign, + &elem_values, &elem_nulls, &num_elems); + + for (i = 0; i < num_elems; i++) + { + partidx = find_partidx_by_const(elem_values[i], elem_type, routerinfo, qualtype); + if(partidx == PARTITION_ROUTER_RESULT_FULL) return get_full_pruning_result(rel); else if(partidx == PARTITION_ROUTER_RESULT_NULL) @@ -12431,7 +12659,6 @@ pruning_opexpr(Relation rel, OpExpr *expr) case QULIFICATION_TYPE_LS: case QULIFICATION_TYPE_LE: { - int i; for(i = 0; i <= partidx; i++) { partname = GetPartitionName(RelationGetRelid(rel), i, false); @@ -12449,14 +12676,13 @@ pruning_opexpr(Relation rel, OpExpr *expr) partoid = get_relname_relid(partname, 
RelationGetNamespace(rel)); if(partoid) { - result = bms_make_singleton(partidx); + result = bms_add_member(result, partidx); } } break; case QULIFICATION_TYPE_GE: case QULIFICATION_TYPE_GT: { - int i; for(i = partidx; i < npart; i++) { partname = GetPartitionName(RelationGetRelid(rel), i, false); @@ -12473,6 +12699,7 @@ pruning_opexpr(Relation rel, OpExpr *expr) elog(ERROR, "internal error: pruning_opexpr: invalid QulificationType[%d]", qualtype); } } + } return result; } diff --git a/src/include/catalog/pg_type.h b/src/include/catalog/pg_type.h index e1b73fca..79cdcf48 100644 --- a/src/include/catalog/pg_type.h +++ b/src/include/catalog/pg_type.h @@ -538,6 +538,7 @@ DATA(insert OID = 1013 ( _oidvector PGNSP PGUID -1 f b A f t \054 0 30 0 arr DATA(insert OID = 1014 ( _bpchar PGNSP PGUID -1 f b A f t \054 0 1042 0 array_in array_out array_recv array_send bpchartypmodin bpchartypmodout array_typanalyze i x f 0 -1 0 100 _null_ _null_ _null_ )); DATA(insert OID = 1015 ( _varchar PGNSP PGUID -1 f b A f t \054 0 1043 0 array_in array_out array_recv array_send varchartypmodin varchartypmodout array_typanalyze i x f 0 -1 0 100 _null_ _null_ _null_ )); DATA(insert OID = 1016 ( _int8 PGNSP PGUID -1 f b A f t \054 0 20 0 array_in array_out array_recv array_send - - array_typanalyze d x f 0 -1 0 0 _null_ _null_ _null_ )); +#define INT8ARRAYOID 1016 DATA(insert OID = 1017 ( _point PGNSP PGUID -1 f b A f t \054 0 600 0 array_in array_out array_recv array_send - - array_typanalyze d x f 0 -1 0 0 _null_ _null_ _null_ )); DATA(insert OID = 1018 ( _lseg PGNSP PGUID -1 f b A f t \054 0 601 0 array_in array_out array_recv array_send - - array_typanalyze d x f 0 -1 0 0 _null_ _null_ _null_ )); DATA(insert OID = 1019 ( _path PGNSP PGUID -1 f b A f t \054 0 602 0 array_in array_out array_recv array_send - - array_typanalyze d x f 0 -1 0 0 _null_ _null_ _null_ )); @@ -579,12 +580,14 @@ DATA(insert OID = 1114 ( timestamp PGNSP PGUID 8 FLOAT8PASSBYVAL b D f t DESCR("date and time"); #define TIMESTAMPOID 1114 DATA(insert OID = 1115 ( _timestamp PGNSP PGUID -1 f b A f t \054 0 1114 0 array_in array_out array_recv array_send timestamptypmodin timestamptypmodout array_typanalyze d x f 0 -1 0 0 _null_ _null_ _null_ )); +#define TIMESTAMPARRAYOID 1115 DATA(insert OID = 1182 ( _date PGNSP PGUID -1 f b A f t \054 0 1082 0 array_in array_out array_recv array_send - - array_typanalyze i x f 0 -1 0 0 _null_ _null_ _null_ )); DATA(insert OID = 1183 ( _time PGNSP PGUID -1 f b A f t \054 0 1083 0 array_in array_out array_recv array_send timetypmodin timetypmodout array_typanalyze d x f 0 -1 0 0 _null_ _null_ _null_ )); DATA(insert OID = 1184 ( timestamptz PGNSP PGUID 8 FLOAT8PASSBYVAL b D t t \054 0 0 1185 timestamptz_in timestamptz_out timestamptz_recv timestamptz_send timestamptztypmodin timestamptztypmodout - d p f 0 -1 0 0 _null_ _null_ _null_ )); DESCR("date and time with time zone"); #define TIMESTAMPTZOID 1184 DATA(insert OID = 1185 ( _timestamptz PGNSP PGUID -1 f b A f t \054 0 1184 0 array_in array_out array_recv array_send timestamptztypmodin timestamptztypmodout array_typanalyze d x f 0 -1 0 0 _null_ _null_ _null_ )); +#define TIMESTAMPTZARRAYOID 1185 DATA(insert OID = 1186 ( interval PGNSP PGUID 16 f b T t t \054 0 0 1187 interval_in interval_out interval_recv interval_send intervaltypmodin intervaltypmodout - d p f 0 -1 0 0 _null_ _null_ _null_ )); DESCR("@ , time interval"); #define INTERVALOID 1186 diff --git a/src/test/regress/expected/partition.out b/src/test/regress/expected/partition.out index 
46ec29a3..d63e6d2f 100644 --- a/src/test/regress/expected/partition.out +++ b/src/test/regress/expected/partition.out @@ -978,3 +978,41 @@ truncate table int_drop partition for(1000); ERROR: the value for locating a partition is out of range truncate table int_drop partition for(370); drop table int_drop; +-- IN expr partition pruning +create table t_in_test(a int, b int, c timestamp) +partition by range (c) begin +(timestamp without time zone '2017-09-01 0:0:0') +step (interval '1 month') partitions (12) +distribute by shard (a) +to group default_group; +NOTICE: Replica identity is needed for shard table, please add to this table through "alter table" command. +insert into t_in_test values(1,1,'20170901'); +insert into t_in_test values(2,2,'20171001'); +insert into t_in_test values(3,3,'20171101'); +insert into t_in_test values(3,3,'20171201'); +explain (costs off) select * from t_in_test where c in ('20171001', '20171201'); + QUERY PLAN +---------------------------------------------------------------------------------------------------------------------------- + Remote Fast Query Execution + Node/s: datanode_1, datanode_2 + -> Append + -> Seq Scan on t_in_test (partition sequence: 1, name: t_in_test_part_1) + Filter: (c = ANY ('{"Sun Oct 01 00:00:00 2017","Fri Dec 01 00:00:00 2017"}'::timestamp without time zone[])) + -> Seq Scan on t_in_test (partition sequence: 3, name: t_in_test_part_3) + Filter: (c = ANY ('{"Sun Oct 01 00:00:00 2017","Fri Dec 01 00:00:00 2017"}'::timestamp without time zone[])) +(7 rows) + +set enable_fast_query_shipping to off; +explain (costs off) select * from t_in_test where c in ('20170901', '20171101'); + QUERY PLAN +---------------------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on t_in_test (partition sequence: 0, name: t_in_test_part_0) + Filter: (c = ANY ('{"Fri Sep 01 00:00:00 2017","Wed Nov 01 00:00:00 2017"}'::timestamp without time zone[])) + -> Seq Scan on t_in_test (partition sequence: 2, name: t_in_test_part_2) + Filter: (c = ANY ('{"Fri Sep 01 00:00:00 2017","Wed Nov 01 00:00:00 2017"}'::timestamp without time zone[])) +(6 rows) + +reset enable_fast_query_shipping; +drop table t_in_test; diff --git a/src/test/regress/sql/partition.sql b/src/test/regress/sql/partition.sql index b665cd81..cc2e7dd0 100644 --- a/src/test/regress/sql/partition.sql +++ b/src/test/regress/sql/partition.sql @@ -438,3 +438,22 @@ truncate table int_drop partition for(5); truncate table int_drop partition for(1000); truncate table int_drop partition for(370); drop table int_drop; + +-- IN expr partition pruning +create table t_in_test(a int, b int, c timestamp) +partition by range (c) begin +(timestamp without time zone '2017-09-01 0:0:0') +step (interval '1 month') partitions (12) +distribute by shard (a) +to group default_group; + +insert into t_in_test values(1,1,'20170901'); +insert into t_in_test values(2,2,'20171001'); +insert into t_in_test values(3,3,'20171101'); +insert into t_in_test values(3,3,'20171201'); + +explain (costs off) select * from t_in_test where c in ('20171001', '20171201'); +set enable_fast_query_shipping to off; +explain (costs off) select * from t_in_test where c in ('20170901', '20171101'); +reset enable_fast_query_shipping; +drop table t_in_test; From a4861f6964adc410f25369345fb750c4d23f3a72 Mon Sep 17 00:00:00 2001 From: andrelin Date: Tue, 17 Aug 2021 15:39:06 +0800 Subject: [PATCH 415/578] Replace datid, 
usesysid with datname, usename http://tapd.oa.com/10092131/bugtrace/bugs/view?bug_id=1010092131090773427&url_cache_key=3896fc0d053f1c19ad00a42c3d3ccca6&action_entry_type=bugs --- .../pg_stat_cluster_activity--1.0.sql | 4 ++-- .../pg_stat_cluster_activity.c | 17 +++++++++++++++-- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity--1.0.sql b/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity--1.0.sql index c5514458..72f71480 100644 --- a/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity--1.0.sql +++ b/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity--1.0.sql @@ -15,8 +15,8 @@ CREATE OR REPLACE FUNCTION pg_stat_get_cluster_activity( OUT client_port integer, OUT nodename text, OUT role text, - OUT datid oid, - OUT usesysid oid, + OUT datname text, + OUT usename text, OUT wait_event_type text, OUT wait_event text, OUT state text, diff --git a/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c b/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c index efe74c95..1bc9f489 100644 --- a/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c +++ b/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c @@ -2,6 +2,7 @@ #include "catalog/pg_authid.h" #include "catalog/pg_type.h" +#include "commands/dbcommands.h" #include "commands/explain.h" #include "common/ip.h" #include "fmgr.h" @@ -715,12 +716,24 @@ pg_stat_get_cluster_activity(PG_FUNCTION_ARGS) values[1] = Int32GetDatum(beentry->st_procpid); if (beentry->st_databaseid != InvalidOid) - values[7] = ObjectIdGetDatum(beentry->st_databaseid); + { + char *dbname = get_database_name(beentry->st_databaseid); + if (dbname != NULL) + values[7] = CStringGetTextDatum(dbname); + else + nulls[7] = true; + } else nulls[7] = true; if (beentry->st_userid != InvalidOid) - values[8] = ObjectIdGetDatum(beentry->st_userid); + { + char *usename = GetUserNameFromId(beentry->st_userid, true); + if (usename != NULL) + values[8] = CStringGetTextDatum(usename); + else + nulls[8] = true; + } else nulls[8] = true; From ce7f6f113161083ff35cf7f9fff24b5d04ee1569 Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Fri, 10 Nov 2017 16:50:50 -0500 Subject: [PATCH 416/578] Account for the effect of lossy pages when costing bitmap scans. Dilip Kumar, reviewed by Alexander Kumenkov, Amul Sul, and me. Some final adjustments by me. Discussion: http://postgr.es/m/CAFiTN-sYtqUOXQ4SpuhTv0Z9gD0si3YxZGv_PQAAMX8qbOotcg@mail.gmail.com --- src/backend/nodes/tidbitmap.c | 37 +++++++++++------ src/backend/optimizer/path/costsize.c | 59 ++++++++++++++++++++++----- src/include/nodes/tidbitmap.h | 21 +++++----- 3 files changed, 85 insertions(+), 32 deletions(-) diff --git a/src/backend/nodes/tidbitmap.c b/src/backend/nodes/tidbitmap.c index 73820707..c3f800a5 100644 --- a/src/backend/nodes/tidbitmap.c +++ b/src/backend/nodes/tidbitmap.c @@ -265,7 +265,6 @@ TIDBitmap * tbm_create(long maxbytes, dsa_area *dsa) { TIDBitmap *tbm; - long nbuckets; /* Create the TIDBitmap struct and zero all its fields */ tbm = makeNode(TIDBitmap); @@ -273,17 +272,7 @@ tbm_create(long maxbytes, dsa_area *dsa) tbm->mcxt = CurrentMemoryContext; tbm->status = TBM_EMPTY; - /* - * Estimate number of hashtable entries we can have within maxbytes. This - * estimates the hash cost as sizeof(PagetableEntry), which is good enough - * for our purpose. Also count an extra Pointer per entry for the arrays - * created during iteration readout. 
- */ - nbuckets = maxbytes / - (sizeof(PagetableEntry) + sizeof(Pointer) + sizeof(Pointer)); - nbuckets = Min(nbuckets, INT_MAX - 1); /* safety limit */ - nbuckets = Max(nbuckets, 16); /* sanity limit */ - tbm->maxentries = (int) nbuckets; + tbm->maxentries = (int) tbm_calculate_entries(maxbytes); tbm->lossify_start = 0; tbm->dsa = dsa; tbm->dsapagetable = InvalidDsaPointer; @@ -1546,3 +1535,27 @@ pagetable_free(pagetable_hash *pagetable, void *pointer) tbm->dsapagetableold = InvalidDsaPointer; } } + +/* + * tbm_calculate_entries + * + * Estimate number of hashtable entries we can have within maxbytes. + */ +long +tbm_calculate_entries(double maxbytes) +{ + long nbuckets; + + /* + * Estimate number of hashtable entries we can have within maxbytes. This + * estimates the hash cost as sizeof(PagetableEntry), which is good enough + * for our purpose. Also count an extra Pointer per entry for the arrays + * created during iteration readout. + */ + nbuckets = maxbytes / + (sizeof(PagetableEntry) + sizeof(Pointer) + sizeof(Pointer)); + nbuckets = Min(nbuckets, INT_MAX - 1); /* safety limit */ + nbuckets = Max(nbuckets, 16); /* sanity limit */ + + return nbuckets; +} diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index 18ca6a7d..f8ac09e8 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -5417,6 +5417,8 @@ compute_bitmap_pages(PlannerInfo *root, RelOptInfo *baserel, Path *bitmapqual, double T; double pages_fetched; double tuples_fetched; + double heap_pages; + long maxentries; /* * Fetch total cost of obtaining the bitmap, as well as its total @@ -5431,6 +5433,24 @@ compute_bitmap_pages(PlannerInfo *root, RelOptInfo *baserel, Path *bitmapqual, T = (baserel->pages > 1) ? (double) baserel->pages : 1.0; + /* + * For a single scan, the number of heap pages that need to be fetched is + * the same as the Mackert and Lohman formula for the case T <= b (ie, no + * re-reads needed). + */ + pages_fetched = (2.0 * T * tuples_fetched) / (2.0 * T + tuples_fetched); + + /* + * Calculate the number of pages fetched from the heap. Then based on + * current work_mem estimate get the estimated maxentries in the bitmap. + * (Note that we always do this calculation based on the number of pages + * that would be fetched in a single iteration, even if loop_count > 1. + * That's correct, because only that number of entries will be stored in + * the bitmap at one time.) + */ + heap_pages = Min(pages_fetched, baserel->pages); + maxentries = tbm_calculate_entries(work_mem * 1024L); + if (loop_count > 1) { /* @@ -5445,22 +5465,41 @@ compute_bitmap_pages(PlannerInfo *root, RelOptInfo *baserel, Path *bitmapqual, root); pages_fetched /= loop_count; } - else - { - /* - * For a single scan, the number of heap pages that need to be fetched - * is the same as the Mackert and Lohman formula for the case T <= b - * (ie, no re-reads needed). - */ - pages_fetched = - (2.0 * T * tuples_fetched) / (2.0 * T + tuples_fetched); - } if (pages_fetched >= T) pages_fetched = T; else pages_fetched = ceil(pages_fetched); + if (maxentries < heap_pages) + { + double exact_pages; + double lossy_pages; + + /* + * Crude approximation of the number of lossy pages. Because of the + * way tbm_lossify() is coded, the number of lossy pages increases + * very sharply as soon as we run short of memory; this formula has + * that property and seems to perform adequately in testing, but it's + * possible we could do better somehow. 
+ */ + lossy_pages = Max(0, heap_pages - maxentries / 2); + exact_pages = heap_pages - lossy_pages; + + /* + * If there are lossy pages then recompute the number of tuples + * processed by the bitmap heap node. We assume here that the chance + * of a given tuple coming from an exact page is the same as the + * chance that a given page is exact. This might not be true, but + * it's not clear how we can do any better. + */ + if (lossy_pages > 0) + tuples_fetched = + clamp_row_est(indexSelectivity * + (exact_pages / heap_pages) * baserel->tuples + + (lossy_pages / heap_pages) * baserel->tuples); + } + if (cost) *cost = indexTotalCost; if (tuple) diff --git a/src/include/nodes/tidbitmap.h b/src/include/nodes/tidbitmap.h index fbd75c20..d3ad0a55 100644 --- a/src/include/nodes/tidbitmap.h +++ b/src/include/nodes/tidbitmap.h @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * * tidbitmap.h - * PostgreSQL tuple-id (TID) bitmap package + * PostgreSQL tuple-id (TID) bitmap package * * This module provides bitmap data structures that are spiritually * similar to Bitmapsets, but are specially adapted to store sets of @@ -39,11 +39,11 @@ typedef struct TBMSharedIterator TBMSharedIterator; /* Result structure for tbm_iterate */ typedef struct { - BlockNumber blockno; /* page number containing tuples */ - int ntuples; /* -1 indicates lossy result */ - bool recheck; /* should the tuples be rechecked? */ - /* Note: recheck is always true if ntuples < 0 */ - OffsetNumber offsets[FLEXIBLE_ARRAY_MEMBER]; + BlockNumber blockno; /* page number containing tuples */ + int ntuples; /* -1 indicates lossy result */ + bool recheck; /* should the tuples be rechecked? */ + /* Note: recheck is always true if ntuples < 0 */ + OffsetNumber offsets[FLEXIBLE_ARRAY_MEMBER]; } TBMIterateResult; /* function prototypes in nodes/tidbitmap.c */ @@ -53,8 +53,8 @@ extern void tbm_free(TIDBitmap *tbm); extern void tbm_free_shared_area(dsa_area *dsa, dsa_pointer dp); extern void tbm_add_tuples(TIDBitmap *tbm, - const ItemPointer tids, int ntids, - bool recheck); + const ItemPointer tids, int ntids, + bool recheck); extern void tbm_add_page(TIDBitmap *tbm, BlockNumber pageno); extern void tbm_union(TIDBitmap *a, const TIDBitmap *b); @@ -69,6 +69,7 @@ extern TBMIterateResult *tbm_shared_iterate(TBMSharedIterator *iterator); extern void tbm_end_iterate(TBMIterator *iterator); extern void tbm_end_shared_iterate(TBMSharedIterator *iterator); extern TBMSharedIterator *tbm_attach_shared_iterate(dsa_area *dsa, - dsa_pointer dp); + dsa_pointer dp); +extern long tbm_calculate_entries(double maxbytes); -#endif /* TIDBITMAP_H */ +#endif /* TIDBITMAP_H */ From 4156013532fb015044ada2af00c71c788332d100 Mon Sep 17 00:00:00 2001 From: guanhuawang Date: Tue, 24 Aug 2021 23:04:49 +0800 Subject: [PATCH 417/578] fix an error when perform materialized view concurrently refresh. http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131087752683 --- src/backend/commands/matview.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/backend/commands/matview.c b/src/backend/commands/matview.c index 102e2f36..b158aaa4 100644 --- a/src/backend/commands/matview.c +++ b/src/backend/commands/matview.c @@ -832,9 +832,13 @@ refresh_by_match_merge(Oid matviewOid, Oid tempOid, Oid relowner, /* Analyze the diff table. 
*/ resetStringInfo(&querybuf); - appendStringInfo(&querybuf, "ANALYZE %s", diffname); + /* + * Materialized view is stored on CN, use "(COORDINATOR)" option to force + * vacuum analyzing "diff table" on CN. + */ + appendStringInfo(&querybuf, "ANALYZE (COORDINATOR) %s", diffname); if (SPI_exec(querybuf.data, 0) != SPI_OK_UTILITY) - elog(ERROR, "SPI_exec failed: %s", querybuf.data); + elog(ERROR, "SPI_exec failed: %s", querybuf.data);; OpenMatViewIncrementalMaintenance(); From 8a4e9139a9c0fef1e6d1482b78250e591f72da64 Mon Sep 17 00:00:00 2001 From: whalesong Date: Tue, 7 Sep 2021 10:30:52 +0800 Subject: [PATCH 418/578] 2pc stop opt: add clean 2pc process (merge request 656) --- contrib/pg_clean/pg_clean.c | 134 ++- src/backend/access/transam/gtm.c | 9 + src/backend/access/transam/twophase.c | 2 +- src/backend/access/transam/varsup.c | 2 + src/backend/access/transam/xact.c | 109 ++- src/backend/pgxc/pool/execRemote.c | 22 +- src/backend/postmaster/Makefile | 2 +- src/backend/postmaster/clean2pc.c | 1002 ++++++++++++++++++++++ src/backend/postmaster/pgstat.c | 17 + src/backend/postmaster/postmaster.c | 96 +++ src/backend/storage/ipc/ipci.c | 4 + src/backend/storage/lmgr/lwlocknames.txt | 1 + src/backend/storage/lmgr/proc.c | 22 +- src/backend/utils/init/miscinit.c | 3 +- src/backend/utils/init/postinit.c | 9 +- src/backend/utils/misc/guc.c | 44 +- src/include/access/xact.h | 4 + src/include/pgstat.h | 4 +- src/include/postmaster/clean2pc.h | 43 + src/include/storage/pmsignal.h | 32 +- src/include/storage/proc.h | 2 + src/test/regress/expected/sysviews.out | 4 +- 22 files changed, 1512 insertions(+), 55 deletions(-) create mode 100644 src/backend/postmaster/clean2pc.c create mode 100644 src/include/postmaster/clean2pc.h diff --git a/contrib/pg_clean/pg_clean.c b/contrib/pg_clean/pg_clean.c index 459a2fc0..08375f46 100644 --- a/contrib/pg_clean/pg_clean.c +++ b/contrib/pg_clean/pg_clean.c @@ -63,17 +63,17 @@ int transaction_threshold = 200000; #define MAXIMUM_OUTPUT_FILE 1000 #define XIDPREFIX "_$XC$" #define DEFAULT_CLEAN_TIME_INTERVAL 120000000 -#ifdef __TWO_PHASE_TESTS__ -#define LEAST_CLEAN_TIME_INTERVAL 10000000 /* in pg_clean test_mode should not clean twophase trans prepared in ten seconds or commit in ten seconds */ -#else -#define LEAST_CLEAN_TIME_INTERVAL 60000000 /* should not clean twophase trans prepared in a minite or commit in a minite */ -#endif -GlobalTimestamp clean_time_interval = DEFAULT_CLEAN_TIME_INTERVAL; +#define LEAST_CLEAN_TIME_INTERVAL 1000000 /* should not clean twophase trans prepared in 1s or commit in 1s */ +GlobalTimestamp clean_time_interval = DEFAULT_CLEAN_TIME_INTERVAL; PG_MODULE_MAGIC; -#define MAX_GID 50 +#define MAX_GID 64 + +#define CLEAN_CHECK_TIMES 3 +#define CLEAN_CHECK_INTERVAL 10000 + #define MAX_DBNAME 64 #define GET_START_XID "startxid:" #define GET_COMMIT_TIMESTAMP "global_commit_timestamp:" @@ -2397,6 +2397,7 @@ bool check_2pc_belong_node(txn_info * txn) int node_index = 0; char node_type; node_index = find_node_index(abnormal_nodeoid); + Assert(InvalidOid != abnormal_nodeoid); if (abnormal_nodeoid == txn->origcoord) { txn->belong_abnormal_node = true; @@ -2413,6 +2414,60 @@ bool check_2pc_belong_node(txn_info * txn) txn->belong_abnormal_node = true; return true; } + + if (InvalidOid == txn->origcoord) + { + char *startnode = NULL; + int node_oid = InvalidOid; + char gid[MAX_GID]; + + if (!IsXidImplicit(txn->gid)) + { + txn->belong_abnormal_node = true; + return true; + } + + Assert(IsXidImplicit(txn->gid)); + + /* get start node from gid */ + 
strcpy(gid, txn->gid); + startnode = strtok(gid, ":"); + if (NULL == startnode) + { + elog(WARNING, "get startnode(%s) from gid(%s) failed", + startnode, gid); + txn->belong_abnormal_node = false; + return false; + } + + startnode = strtok(NULL, ":"); + if (NULL == startnode) + { + elog(WARNING, "get startnode(%s) from gid(%s) failed", + startnode, gid); + txn->belong_abnormal_node = false; + return false; + } + + node_oid = get_pgxc_nodeoid(startnode); + if (NULL == startnode) + { + elog(WARNING, "get invalid oid for startnode(%s) from gid(%s)", + startnode, gid); + txn->belong_abnormal_node = false; + return false; + } + + elog(DEBUG5, "get oid(%d) for startnode(%s) from gid(%s)", + node_oid, startnode, gid); + + if (abnormal_nodeoid == node_oid) + { + txn->belong_abnormal_node = true; + return true; + } + } + txn->belong_abnormal_node = false; return false; } @@ -2432,6 +2487,10 @@ bool check_node_participate(txn_info * txn, int node_idx) void recover2PC(txn_info * txn) { + int i = 0; + bool check_ok = false; + MemoryContext current_context = NULL; + ErrorData* edata = NULL; TXN_STATUS txn_stat; txn_stat = check_txn_global_status(txn); txn->global_txn_stat = txn_stat; @@ -2470,12 +2529,40 @@ void recover2PC(txn_info * txn) { txn->op = COMMIT; /* check whether all nodes can commit prepared */ + for (i = 0; i < CLEAN_CHECK_TIMES; i++) + { + check_ok = true; + current_context = CurrentMemoryContext; + PG_TRY(); + { if (!clean_2PC_iscommit(txn, true, true)) { + check_ok = false; + elog(LOG, "check commit 2PC transaction %s failed", + txn->gid); + } + } + PG_CATCH(); + { + (void)MemoryContextSwitchTo(current_context); + edata = CopyErrorData(); + FlushErrorState(); + + check_ok = false; + elog(WARNING, "check commit 2PC transaction %s error: %s", + txn->gid, edata->message); + } + PG_END_TRY(); + + if (!check_ok) + { txn->op_issuccess = false; - elog(LOG, "check commit 2PC transaction %s failed", txn->gid); return; } + + pg_usleep(CLEAN_CHECK_INTERVAL); + } + /* send commit prepared to all nodes */ if (!clean_2PC_iscommit(txn, true, false)) { @@ -2491,12 +2578,40 @@ void recover2PC(txn_info * txn) case TXN_STATUS_ABORTED: txn->op = ABORT; /* check whether all nodes can rollback prepared */ + for (i = 0; i < CLEAN_CHECK_TIMES; i++) + { + check_ok = true; + current_context = CurrentMemoryContext; + PG_TRY(); + { if (!clean_2PC_iscommit(txn, false, true)) { + check_ok = false; + elog(LOG, "check rollback 2PC transaction %s failed", + txn->gid); + } + } + PG_CATCH(); + { + check_ok = false; + (void)MemoryContextSwitchTo(current_context); + edata = CopyErrorData(); + FlushErrorState(); + + elog(WARNING, "check rollback 2PC transaction %s error: %s", + txn->gid, edata->message); + } + PG_END_TRY(); + + if (!check_ok) + { txn->op_issuccess = false; - elog(LOG, "check rollback 2PC transaction %s failed", txn->gid); return; } + + pg_usleep(CLEAN_CHECK_INTERVAL); + } + /* send rollback prepared to all nodes */ if (!clean_2PC_iscommit(txn, false, false)) { @@ -2620,7 +2735,6 @@ TXN_STATUS check_txn_global_status(txn_info *txn) { node_idx = find_node_index(abnormal_nodeoid); if (!check_2pc_belong_node(txn) || - !check_node_participate(txn, node_idx) || abnormal_time < txn->prepare_timestamp[node_idx]) { return TXN_STATUS_INPROGRESS; diff --git a/src/backend/access/transam/gtm.c b/src/backend/access/transam/gtm.c index 5f95e859..5fb8904a 100644 --- a/src/backend/access/transam/gtm.c +++ b/src/backend/access/transam/gtm.c @@ -22,6 +22,7 @@ #include "pgxc/pgxc.h" #include "gtm/gtm_c.h" #include 
"postmaster/autovacuum.h" +#include "postmaster/clean2pc.h" #include "postmaster/clustermon.h" #include "storage/backendid.h" #include "tcop/tcopprot.h" @@ -1336,6 +1337,10 @@ InitGTM(void) elog(LOG, "Autovacuum launcher: connection established to GTM with string %s", conn_str); else if (IsClusterMonitorProcess() && GTMDebugPrint) elog(LOG, "Cluster monitor: connection established to GTM with string %s", conn_str); + else if (IsClean2pcWorker() && GTMDebugPrint) + elog(LOG, "Clean 2pc worker: connection established to GTM with string %s", conn_str); + else if (IsClean2pcLauncher() && GTMDebugPrint) + elog(LOG, "Clean 2pc launcher: connection established to GTM with string %s", conn_str); else if(GTMDebugPrint) elog(LOG, "Postmaster child: connection established to GTM with string %s", conn_str); } @@ -1424,6 +1429,10 @@ CloseGTM(void) elog(DEBUG1, "Autovacuum launcher: connection to GTM closed"); else if (IsClusterMonitorProcess()) elog(DEBUG1, "Cluster monitor: connection to GTM closed"); + else if (IsClean2pcWorker()) + elog(DEBUG1, "Clean 2pc worker: connection to GTM closed"); + else if (IsClean2pcLauncher()) + elog(DEBUG1, "Clean 2pc launcher: connection to GTM closed"); else elog(DEBUG1, "Postmaster child: connection to GTM closed"); } diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c index a9078bda..387cdf73 100644 --- a/src/backend/access/transam/twophase.c +++ b/src/backend/access/transam/twophase.c @@ -4051,7 +4051,7 @@ void record_2pc_commit_timestamp(const char *tid, GlobalTimestamp commit_timesta } else { - elog(PANIC, "[%s] could not open file %s, errMsg: %s", + elog(ERROR, "[%s] could not open file %s, errMsg: %s", __FUNCTION__, path, strerror(errno)); } return; diff --git a/src/backend/access/transam/varsup.c b/src/backend/access/transam/varsup.c index 6fdadcc2..42baa98f 100644 --- a/src/backend/access/transam/varsup.c +++ b/src/backend/access/transam/varsup.c @@ -24,6 +24,7 @@ #include "commands/dbcommands.h" #include "miscadmin.h" #include "postmaster/autovacuum.h" +#include "postmaster/clean2pc.h" #include "storage/pmsignal.h" #include "storage/proc.h" #include "utils/syscache.h" @@ -356,6 +357,7 @@ GetNewTransactionId(bool isSubXact) (!IsConnFromCoord() || IsAutoVacuumWorkerProcess() || IsAutoVacuumLauncherProcess() || + IsAnyClean2pcProcess() || GetForceXidFromGTM() || (IsInitProcessingMode() && IsPostmasterEnvironment))) { diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index 3e59c7f4..ed02dff9 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -32,6 +32,7 @@ #include "pgxc/pause.h" /* PGXC_DATANODE */ #include "postmaster/autovacuum.h" +#include "postmaster/clean2pc.h" #include "libpq/pqformat.h" #include "libpq/libpq.h" #endif @@ -65,6 +66,7 @@ #include "storage/condition_variable.h" #include "storage/fd.h" #include "storage/lmgr.h" +#include "storage/pmsignal.h" #include "storage/predicate.h" #include "storage/proc.h" #include "storage/procarray.h" @@ -101,6 +103,13 @@ #include "tcop/utility.h" #include "utils/relcryptmap.h" #endif + +#ifdef __TWO_PHASE_TESTS__ +#define TWO_PHASE_TEST_NOT_STOP 1 +#define TWO_PHASE_TEST_STOP_DN 2 +#define TWO_PHASE_TEST_STOP_ALL 3 +#endif + /* * User-tweakable parameters */ @@ -395,6 +404,10 @@ static bool XactLocalNodePrepared; static bool XactReadLocalNode; static bool XactWriteLocalNode; +#ifdef __TWO_PHASE_TRANS__ +bool enable_2pc_error_stop = false; +#endif + /* * Some commands want to force synchronous commit. 
*/ @@ -3743,13 +3756,17 @@ AbortTransaction(void) TransactionState s = CurrentTransactionState; TransactionId latestXid; bool is_parallel_worker; + bool can_abort = true; #ifdef __TWO_PHASE_TRANS__ StringInfoData errormsg; - if ( #ifdef __TWO_PHASE_TESTS__ + bool test_stop = (complish && run_pg_clean); +#endif + can_abort = !( +#ifdef __TWO_PHASE_TESTS__ (complish && run_pg_clean) || #endif TWO_PHASE_COMMITTING == g_twophase_state.state || @@ -3758,18 +3775,76 @@ AbortTransaction(void) TWO_PHASE_ABORT_END == g_twophase_state.state || TWO_PHASE_UNKNOW_STATUS == g_twophase_state.state || (TWO_PHASE_PREPARED == g_twophase_state.state && - false == g_twophase_state.is_start_node)) - { + false == g_twophase_state.is_start_node)); + if (!can_abort) + { if (false == g_twophase_state.isprinted) { print_twophase_state(&errormsg, false); + + if (enable_2pc_error_stop) + { elog(STOP, "errormsg in AbortTransaction:\n %s", errormsg.data); } +#ifdef __TWO_PHASE_TESTS__ + else if (test_stop) + { + switch (run_pg_clean) + { + case TWO_PHASE_TEST_NOT_STOP: + break; + case TWO_PHASE_TEST_STOP_DN: + if (IS_PGXC_LOCAL_COORDINATOR) + { + break; + } + case TWO_PHASE_TEST_STOP_ALL: + elog(STOP, "in test, in AbortTransaction:\n %s", errormsg.data); + break; + default: + break; + } + elog(WARNING, "in test, in AbortTransaction:\n %s", errormsg.data); + } +#endif + else + { + elog(WARNING, "errormsg in AbortTransaction:\n %s", errormsg.data); + } + } else { + if (enable_2pc_error_stop) + { elog(STOP, "STOP postmaster in AbortTransaction"); } +#ifdef __TWO_PHASE_TESTS__ + else if (test_stop) + { + switch (run_pg_clean) + { + case TWO_PHASE_TEST_NOT_STOP: + break; + case TWO_PHASE_TEST_STOP_DN: + if (IS_PGXC_LOCAL_COORDINATOR) + { + break; + } + case TWO_PHASE_TEST_STOP_ALL: + elog(STOP, "in test, postmaster in AbortTransaction"); + break; + default: + break; + } + elog(WARNING, "in test, postmaster in AbortTransaction"); + } +#endif + else + { + elog(WARNING, "WARNING postmaster in AbortTransaction"); + } + } } /* print prepare err in pgxc_node_remote_prepare */ @@ -3786,8 +3861,12 @@ AbortTransaction(void) * Cleanup the files created during database/tablespace operations. * This must happen before we release locks, because we want to hold the * locks acquired initially while we cleanup the files. + * If can_abort is false, needn't do DBCleanup, Createdb, movedb, createtablespace e.g. */ + if (can_abort) + { AtEOXact_DBCleanup(false); + } #ifdef __TBASE__ SqueueProducerExit(); @@ -3798,6 +3877,17 @@ AbortTransaction(void) * transaction at the GTM at thr end */ s->topGlobalTransansactionId = s->transactionId; + +#ifdef __TWO_PHASE_TRANS__ + if (IS_PGXC_LOCAL_COORDINATOR && g_twophase_state.state != TWO_PHASE_INITIALTRANS) + { + elog(LOG, "send signal to clean 2pc launcher, gid: %s", g_twophase_state.gid); + SendPostmasterSignal(PMSIGNAL_WAKEN_CLEAN_2PC_TRIGGER); + } +#endif + + if (can_abort) + { /* * Handle remote abort first. 
*/ @@ -3820,13 +3910,11 @@ AbortTransaction(void) FinishPreparedTransaction(savePrepareGID, false); XactLocalNodePrepared = false; } - else - { + } + #ifdef __TWO_PHASE_TRANS__ - g_twophase_state.state = TWO_PHASE_ABORTTED; ClearLocalTwoPhaseState(); #endif - } if(enable_distri_debug && is_distri_report && IS_PGXC_COORDINATOR) { @@ -4003,7 +4091,10 @@ AbortTransaction(void) #endif latestXid = RecordTransactionAbort(false); #ifdef __TBASE__ + if (can_abort) + { FinishSeqOp(false); + } #endif } else @@ -4049,7 +4140,10 @@ AbortTransaction(void) /* See comments in CommitTransaction */ #ifdef XCP + if (can_abort) + { AtEOXact_GlobalTxn(false); + } #endif ResourceOwnerRelease(TopTransactionResourceOwner, @@ -7839,6 +7933,7 @@ IsPGXCNodeXactDatanodeDirect(void) (IsPostmasterEnvironment || !useLocalXid) && IsNormalProcessingMode() && !IsAutoVacuumLauncherProcess() && + !IsClean2pcLauncher() && #ifdef XCP !IsConnFromDatanode() && #endif diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 19eff0ea..1708343b 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -8384,6 +8384,17 @@ FinishRemotePreparedTransaction(char *prepareGID, bool commit) GlobalTransactionId gxid, prepare_gxid; bool prepared_local = false; +#ifdef __TWO_PHASE_TRANS__ + /* + * Since g_twophase_state is cleared after prepare phase, + * g_twophase_state shoud be assigned here + */ + strncpy(g_twophase_state.gid, prepareGID, GIDSIZE); + strncpy(g_twophase_state.start_node_name, PGXCNodeName, NAMEDATALEN); + g_twophase_state.state = TWO_PHASE_PREPARED; + g_twophase_state.is_start_node = true; +#endif + /* * Get the list of nodes involved in this transaction. * @@ -8441,7 +8452,6 @@ FinishRemotePreparedTransaction(char *prepareGID, bool commit) #endif #ifdef __TWO_PHASE_TRANS__ - /* * not allowed user commit residual transaction in xc_maintenance_mode, * since we need commit them in unified timestamp @@ -8450,19 +8460,11 @@ FinishRemotePreparedTransaction(char *prepareGID, bool commit) { elog(ERROR, "can not commit transaction '%s' in xc_maintainence_mode", prepareGID); } - /* - *since g_twophase_state is cleared after prepare phase - *g_twophase_state shoud be assigned here - */ - strncpy(g_twophase_state.gid, prepareGID, GIDSIZE); - strncpy(g_twophase_state.start_node_name, PGXCNodeName, NAMEDATALEN); - g_twophase_state.state = TWO_PHASE_PREPARED; - g_twophase_state.is_start_node = true; + if (nodestring) { strncpy(g_twophase_state.participants, nodestring,((NAMEDATALEN+1) * (TBASE_MAX_DATANODE_NUMBER + TBASE_MAX_COORDINATOR_NUMBER))); } - #endif #ifdef __SUPPORT_DISTRIBUTED_TRANSACTION__ diff --git a/src/backend/postmaster/Makefile b/src/backend/postmaster/Makefile index 56d6a151..2b532af5 100644 --- a/src/backend/postmaster/Makefile +++ b/src/backend/postmaster/Makefile @@ -13,6 +13,6 @@ top_builddir = ../../.. 
include $(top_builddir)/src/Makefile.global OBJS = auditlogger.o autovacuum.o bgworker.o bgwriter.o checkpointer.o clustermon.o \ - fork_process.o pgarch.o pgstat.o postmaster.o startup.o syslogger.o walwriter.o + fork_process.o pgarch.o pgstat.o postmaster.o startup.o syslogger.o walwriter.o clean2pc.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/postmaster/clean2pc.c b/src/backend/postmaster/clean2pc.c new file mode 100644 index 00000000..80ab7103 --- /dev/null +++ b/src/backend/postmaster/clean2pc.c @@ -0,0 +1,1002 @@ +/*------------------------------------------------------------------------- + * + * clean2pc.c + * + * The background clean 2pc processes are added by whalesong. + * They attempt to clean the abnormal 2pc. + * + * Portions Copyright (c) 1996-2021, TDSQL-PG Development Group + * + * + * IDENTIFICATION + * src/backend/postmaster/clean2pc.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/htup_details.h" +#include "catalog/pg_database.h" +#include "catalog/pg_type.h" +#include "commands/dbcommands.h" +#include "executor/executor.h" +#include "libpq/pqsignal.h" +#include "nodes/makefuncs.h" +#include "postmaster/clean2pc.h" +#include "postmaster/fork_process.h" +#include "postmaster/postmaster.h" +#include "pgxc/execRemote.h" +#include "storage/buf_internals.h" +#include "storage/ipc.h" +#include "storage/pmsignal.h" +#include "tcop/tcopprot.h" +#include "utils/builtins.h" +#include "utils/memutils.h" +#include "utils/ps_status.h" +#include "utils/timeout.h" + +#define MAX_GID 64 + +#define SQL_CMD_LEN 1024 +#define MAX_DB_SIZE 100 + +#define DB_TEMPLATE0 "template0" +#define DB_TEMPLATE1 "template1" +#define DB_DEFAULT "postgres" + +typedef enum +{ + Query2pcAttr_gid = 0, + Query2pcAttr_database = 1, + Query2pcAttr_global_status = 2, + Query2pcAttr_status_on_nodes = 3, + Query2pcAttr_butty +} Query2pcAttrEnum; + +bool enable_clean_2pc_launcher = true; + +int auto_clean_2pc_interval = 10; +int auto_clean_2pc_delay = 3; + +static volatile sig_atomic_t got_SIGTERM = false; +static volatile sig_atomic_t got_SIGHUP = false; +static volatile sig_atomic_t got_SIGUSR2 = false; + +/* Flags to tell if we are in an clean 2pc process */ +static bool am_clean_2pc_launcher = false; +static bool am_clean_2pc_worker = false; + +static StringInfo result_str = NULL; + +#ifdef EXEC_BACKEND +static pid_t clean_2pc_launcher_forkexec(void); +static pid_t clean_2pc_worker_forkexec(void); +#endif + +NON_EXEC_STATIC void +Clean2pcLauncherMain(int argc, char *argv[]) pg_attribute_noreturn(); +NON_EXEC_STATIC void +Clean2pcWorkerMain(int argc, char *argv[]) pg_attribute_noreturn(); + +static void start_query_worker(void); +static void start_clean_worker(int count); + +static void do_query_2pc(void); +static void do_clean_2pc(void); + +static void clean_2pc_sigterm_handler(SIGNAL_ARGS); +static void clean_2pc_sighup_handler(SIGNAL_ARGS); +static void clean_2pc_sigusr2_handler(SIGNAL_ARGS); + +static List *get_database_list(void); +static Oid get_default_database(void); + +static void ExitCleanRunning(int status, Datum arg); + +/* struct to keep track of databases in worker */ +typedef struct Clean2pcDBInfo +{ + Oid db_oid; + char *db_name; +} Clean2pcDBInfo; + +typedef struct +{ + bool worker_running; + Oid worker_db; + + int db_count; + Oid db_list[MAX_DB_SIZE]; +} Clean2pcShmemStruct; + +static Clean2pcShmemStruct *Clean2pcShmem = NULL; + +/* + * Main entry point for 2pc clean launcher, to be 
called from the + * postmaster. + */ +int +StartClean2pcLauncher(void) +{ + pid_t clean_2pc_pid = 0; + +#ifdef EXEC_BACKEND + switch ((clean_2pc_pid = clean_2pc_launcher_forkexec())) +#else + switch ((clean_2pc_pid = fork_process())) +#endif + { + case -1: + ereport(LOG, + (errmsg("could not fork 2pc clean launcher: %m"))); + return 0; + +#ifndef EXEC_BACKEND + case 0: + /* in postmaster child ... */ + InitPostmasterChild(); + + /* Close the postmaster's sockets */ + ClosePostmasterPorts(false); + + Clean2pcLauncherMain(0, NULL); + break; +#endif + default: + return (int) clean_2pc_pid; + } + + return 0; +} + +/* + * Main loop for the 2pc clean launcher. + */ +NON_EXEC_STATIC void +Clean2pcLauncherMain(int argc, char *argv[]) +{ + int wait_time = 0; + + am_clean_2pc_launcher = true; + + /* Identify myself via ps */ + init_ps_display("2pc clean launcher", "", "", ""); + + elog(LOG, "2pc clean launcher start"); + + SetProcessingMode(InitProcessing); + + /* + * Set up signal handlers. We operate on databases much like a regular + * backend, so we use the same signal handling. See equivalent code in + * tcop/postgres.c. + */ + pqsignal(SIGHUP, clean_2pc_sighup_handler); + pqsignal(SIGTERM, clean_2pc_sigterm_handler); + pqsignal(SIGINT, StatementCancelHandler); + pqsignal(SIGQUIT, quickdie); + InitializeTimeouts(); /* establishes SIGALRM handler */ + + pqsignal(SIGPIPE, SIG_IGN); + pqsignal(SIGUSR1, procsignal_sigusr1_handler); + pqsignal(SIGUSR2, clean_2pc_sigusr2_handler); + pqsignal(SIGFPE, FloatExceptionHandler); + pqsignal(SIGCHLD, SIG_DFL); + + PG_SETMASK(&UnBlockSig); + + /* Early initialization */ + BaseInit(); + + /* + * Create a per-backend PGPROC struct in shared memory, except in the + * EXEC_BACKEND case where this was done in SubPostmasterMain. We must do + * this before we can use LWLocks (and in the EXEC_BACKEND case we already + * had to do some stuff with LWLocks). + */ +#ifndef EXEC_BACKEND + InitProcess(); +#endif + + InitPostgres(NULL, InvalidOid, NULL, InvalidOid, NULL); + + SetProcessingMode(NormalProcessing); + + LWLockAcquire(Clean2pcLock, LW_EXCLUSIVE); + Clean2pcShmem->worker_running = false; + Clean2pcShmem->db_count = 0; + Clean2pcShmem->worker_db = InvalidOid; + LWLockRelease(Clean2pcLock); + + if (result_str == NULL) + { + MemoryContext oldcontext = MemoryContextSwitchTo(TopMemoryContext); + result_str = makeStringInfo(); + MemoryContextSwitchTo(oldcontext); + } + + wait_time = auto_clean_2pc_delay; + for (;;) + { + pg_usleep(1000000L * wait_time); + + if (got_SIGTERM) + { + elog(LOG, "2pc clean launcher got SIGTERM"); + got_SIGTERM = false; + proc_exit(0); + } + + if (got_SIGHUP) + { + elog(LOG, "2pc clean launcher got SIGHUP"); + got_SIGHUP = false; + ProcessConfigFile(PGC_SIGHUP); + wait_time = auto_clean_2pc_delay; + continue; + } + + if (got_SIGUSR2) + { + elog(LOG, "2pc clean launcher got SIGUSR2"); + got_SIGUSR2 = false; + wait_time = auto_clean_2pc_delay; + continue; + } + + start_query_worker(); + + if (got_SIGTERM || got_SIGHUP || got_SIGUSR2) + { + wait_time = 0; + } + else + { + wait_time = auto_clean_2pc_interval; + } + } +} + +/* + * Main entry point for 2pc clean worker, to be called from the + * postmaster. 
+ */ +int +StartClean2pcWorker(void) +{ + pid_t clean_2pc_pid = 0; + +#ifdef EXEC_BACKEND + switch ((clean_2pc_pid = clean_2pc_worker_forkexec())) +#else + switch ((clean_2pc_pid = fork_process())) +#endif + { + case -1: + ereport(LOG, + (errmsg("could not fork 2pc clean worker: %m"))); + return 0; + +#ifndef EXEC_BACKEND + case 0: + /* in postmaster child ... */ + InitPostmasterChild(); + + /* Close the postmaster's sockets */ + ClosePostmasterPorts(false); + + Clean2pcWorkerMain(0, NULL); + break; +#endif + default: + return (int) clean_2pc_pid; + } + + return 0; +} + +/* + * Main for the 2pc clean worker. + */ +NON_EXEC_STATIC void +Clean2pcWorkerMain(int argc, char *argv[]) +{ + char db_name[NAMEDATALEN]; + Oid db_oid = InvalidOid; + int clean_db_count = 0; + + am_clean_2pc_worker = true; + + on_proc_exit(ExitCleanRunning, 0); + + /* Identify myself via ps */ + init_ps_display("2pc clean worker", "", "", ""); + + elog(LOG, "2pc clean worker start"); + + SetProcessingMode(InitProcessing); + + /* + * Set up signal handlers. We operate on databases much like a regular + * backend, so we use the same signal handling. See equivalent code in + * tcop/postgres.c. + */ + pqsignal(SIGHUP, clean_2pc_sighup_handler); + pqsignal(SIGTERM, clean_2pc_sigterm_handler); + pqsignal(SIGINT, StatementCancelHandler); + pqsignal(SIGQUIT, quickdie); + InitializeTimeouts(); /* establishes SIGALRM handler */ + + pqsignal(SIGPIPE, SIG_IGN); + pqsignal(SIGUSR1, procsignal_sigusr1_handler); + pqsignal(SIGUSR2, clean_2pc_sigusr2_handler); + pqsignal(SIGFPE, FloatExceptionHandler); + pqsignal(SIGCHLD, SIG_DFL); + + PG_SETMASK(&UnBlockSig); + + /* Early initialization */ + BaseInit(); + + /* + * Create a per-backend PGPROC struct in shared memory, except in the + * EXEC_BACKEND case where this was done in SubPostmasterMain. We must do + * this before we can use LWLocks (and in the EXEC_BACKEND case we already + * had to do some stuff with LWLocks). 
+ */ +#ifndef EXEC_BACKEND + InitProcess(); +#endif + + LWLockAcquire(Clean2pcLock, LW_EXCLUSIVE); + + Clean2pcShmem->worker_running = true; + + db_oid = Clean2pcShmem->worker_db; + + Assert(OidIsValid(db_oid)); + + InitPostgres(NULL, db_oid, NULL, InvalidOid, db_name); + + SetProcessingMode(NormalProcessing); + + if (result_str == NULL) + { + MemoryContext oldcontext = MemoryContextSwitchTo(TopMemoryContext); + result_str = makeStringInfo(); + MemoryContextSwitchTo(oldcontext); + } + + if (Clean2pcShmem->db_count == 0) + { + elog(DEBUG5, "query 2pc from db: %s", db_name); + do_query_2pc(); + clean_db_count = Clean2pcShmem->db_count; + } + else + { + elog(LOG, "clean 2pc for db: %s", db_name); + do_clean_2pc(); + } + + Clean2pcShmem->worker_running = false; + + LWLockRelease(Clean2pcLock); + + if (clean_db_count != 0) + { + start_clean_worker(clean_db_count); + } + + /* All done, go away */ + proc_exit(0); +} + +static void +do_query_2pc(void) +{ + int i = 0; + int count_db = 0; + int count_2pc = 0; + MemoryContext oldcontext = NULL; + char query[SQL_CMD_LEN]; + char gid[MAX_GID]; + char *startnode = NULL; + bool is_start_from = true; + Oid db_oid = InvalidOid; + Oid last_db_oid = InvalidOid; + EState *estate = NULL; + RemoteQuery *plan = NULL; + RemoteQueryState *pstate = NULL; + TupleTableSlot *result = NULL; + Var *dummy = NULL; + int attr_num = 4; + static const char *attr_name[] = {"gid", "database", + "global_transaction_status", + "transaction_status_on_allnodes"}; + + Assert(result_str != NULL); + resetStringInfo(result_str); + + snprintf(query, SQL_CMD_LEN, "select * FROM pg_clean_check_txn(%d) " + "order by database limit 1000;", auto_clean_2pc_delay); + + elog(DEBUG2, "node(%d) query: %s", PGXCNodeId, query); + + StartTransactionCommand(); + + plan = makeNode(RemoteQuery); + plan->combine_type = COMBINE_TYPE_NONE; + plan->exec_nodes = makeNode(ExecNodes); + plan->exec_type = EXEC_ON_COORDS; + + plan->exec_nodes->nodeList = lappend_int(plan->exec_nodes->nodeList, PGXCNodeId); + + plan->sql_statement = (char*)query; + plan->force_autocommit = false; + + /* + * We only need the target entry to determine result data type. + * So create dummy even if real expression is a function. 
+ */ + for (i = 1; i <= attr_num; i++) + { + dummy = makeVar(1, i, TEXTOID, 0, InvalidOid, 0); + plan->scan.plan.targetlist = lappend(plan->scan.plan.targetlist, + makeTargetEntry((Expr *) dummy, i, NULL, false)); + } + + InitMultinodeExecutor(false); + + /* prepare to execute */ + estate = CreateExecutorState(); + oldcontext = MemoryContextSwitchTo(estate->es_query_cxt); + estate->es_snapshot = GetActiveSnapshot(); + pstate = ExecInitRemoteQuery(plan, estate, 0); + MemoryContextSwitchTo(oldcontext); + + Clean2pcShmem->db_count = 0; + + result = ExecRemoteQuery((PlanState *) pstate); + + while (result != NULL && !TupIsNull(result)) + { + slot_getallattrs(result); + + is_start_from = true; + count_2pc++; + + for (i = 0; i < attr_num; i++) + { + char *value = text_to_cstring(DatumGetTextP(result->tts_values[i])); + appendStringInfo(result_str, "\t%s: %s", attr_name[i], value); + switch (i) + { + case Query2pcAttr_gid: /* value is gid */ + if (IsXidImplicit(value)) + { + /* get start node from gid */ + startnode = NULL; + + strcpy(gid, value); + startnode = strtok(gid, ":"); + if (NULL == startnode) + { + elog(WARNING, "get startnode(%s) from gid(%s) failed", + startnode, gid); + break; + } + + startnode = strtok(NULL, ":"); + if (NULL == startnode) + { + elog(WARNING, "get startnode(%s) from gid(%s) failed", + startnode, gid); + break; + } + + if (strcmp(startnode, PGXCNodeName) != 0) + { + is_start_from = false; + } + } + break; + case Query2pcAttr_database: /* value is database */ + if (is_start_from) + { + db_oid = get_database_oid(value, true); + if (!OidIsValid(db_oid)) + { + elog(WARNING, "get database(%s) oid failed", value); + } + else if (db_oid != last_db_oid) + { + if (Clean2pcShmem->db_count < MAX_DB_SIZE) + { + Clean2pcShmem->db_list[Clean2pcShmem->db_count++] = db_oid; + } + last_db_oid = db_oid; + count_db++; + } + } + break; + default: + break; + } + } + + appendStringInfo(result_str, "\n"); + + result = ExecRemoteQuery((PlanState *) pstate); + } + + ExecEndRemoteQuery(pstate); + + CommitTransactionCommand(); + + if (count_2pc > 0) + { + Assert(result_str->data != NULL); + elog(LOG, "query remain 2pc count(%d), db count(%d):\n%s", + count_2pc, count_db, result_str->data); + } +} + +static void +do_clean_2pc(void) +{ + int i = 0; + int count = 0; + MemoryContext oldcontext = NULL; + char query[SQL_CMD_LEN]; + EState *estate = NULL; + RemoteQuery *plan = NULL; + RemoteQueryState *pstate = NULL; + TupleTableSlot *result = NULL; + Var *dummy = NULL; + int attr_num = 4; + static const char *attr_name[] = {"gid", "global_transaction_status", + "operation", "operation_status"}; + TimestampTz clean_time = 0; + + Assert(result_str != NULL); + resetStringInfo(result_str); + + clean_time = GetCurrentTimestamp() - USECS_PER_SEC * auto_clean_2pc_delay; + + snprintf(query, SQL_CMD_LEN, "select * FROM pg_clean_execute_on_node('%s', %ld)" + " limit 1000;", PGXCNodeName, clean_time); + + elog(DEBUG2, "node(%d) query: %s", PGXCNodeId, query); + + StartTransactionCommand(); + + plan = makeNode(RemoteQuery); + plan->combine_type = COMBINE_TYPE_NONE; + plan->exec_nodes = makeNode(ExecNodes); + plan->exec_type = EXEC_ON_COORDS; + + plan->exec_nodes->nodeList = lappend_int(plan->exec_nodes->nodeList, PGXCNodeId); + + plan->sql_statement = (char*)query; + plan->force_autocommit = false; + + /* + * We only need the target entry to determine result data type. + * So create dummy even if real expression is a function. 
+ */ + for (i = 1; i <= attr_num; i++) + { + dummy = makeVar(1, i, TEXTOID, 0, InvalidOid, 0); + plan->scan.plan.targetlist = lappend(plan->scan.plan.targetlist, + makeTargetEntry((Expr *) dummy, i, NULL, false)); + } + + InitMultinodeExecutor(false); + + /* prepare to execute */ + estate = CreateExecutorState(); + oldcontext = MemoryContextSwitchTo(estate->es_query_cxt); + estate->es_snapshot = GetActiveSnapshot(); + pstate = ExecInitRemoteQuery(plan, estate, 0); + MemoryContextSwitchTo(oldcontext); + + result = ExecRemoteQuery((PlanState *) pstate); + + while (result != NULL && !TupIsNull(result)) + { + slot_getallattrs(result); + + count++; + + for (i = 0; i < attr_num; i++) + { + char *value = text_to_cstring(DatumGetTextP(result->tts_values[i])); + appendStringInfo(result_str, "\t%s: %s", attr_name[i], value); + } + + appendStringInfo(result_str, "\n"); + + result = ExecRemoteQuery((PlanState *) pstate); + } + + ExecEndRemoteQuery(pstate); + + CommitTransactionCommand(); + + if (count > 0) + { + Assert(NULL != result_str->data); + elog(LOG, "clean 2pc count(%d):\n%s", count, result_str->data); + } +} + +/* SIGTERM: set flag to exit normally */ +static void +clean_2pc_sigterm_handler(SIGNAL_ARGS) +{ + elog(LOG, "SIGTERM: %d", postgres_signal_arg); + got_SIGTERM = true; +} + + +/* SIGHUP: set flag to re-read config file at next convenient time */ +static void +clean_2pc_sighup_handler(SIGNAL_ARGS) +{ + elog(LOG, "SIGHUP: %d", postgres_signal_arg); + got_SIGHUP = true; +} + +/* SIGUSR2: used for notify 2pc abnormal */ +static void +clean_2pc_sigusr2_handler(SIGNAL_ARGS) +{ + elog(LOG, "SIGUSR2: %d", postgres_signal_arg); + got_SIGUSR2 = true; +} + +/* + * IsClean2pcLauncher functions + * Return whether this is a 2pc clean launcher. + */ +bool +IsClean2pcLauncher(void) +{ + return am_clean_2pc_launcher; +} + +/* + * IsClean2pcWorker functions + * Return whether this is a 2pc clean worker. + */ +bool +IsClean2pcWorker(void) +{ + return am_clean_2pc_worker; +} + +/* + * get_database_list + * Return a list of all databases found in pg_database. + * + * The list and associated data is allocated in the caller's memory context, + * which is in charge of ensuring that it's properly cleaned up afterwards. + * + * Note: this is the only function in which the autovacuum launcher uses a + * transaction. Although we aren't attached to any particular database and + * therefore can't access most catalogs, we do have enough infrastructure + * to do a seqscan on pg_database. 
+ */ +static List * +get_database_list(void) +{ + List *dblist = NIL; + Relation rel; + HeapScanDesc scan; + HeapTuple tup; + MemoryContext resultcxt; + + /* This is the context that we will allocate our output data in */ + resultcxt = CurrentMemoryContext; + + StartTransactionCommand(); + + rel = heap_open(DatabaseRelationId, AccessShareLock); + scan = heap_beginscan_catalog(rel, 0, NULL); + + while (HeapTupleIsValid(tup = heap_getnext(scan, ForwardScanDirection))) + { + Form_pg_database pgdatabase = (Form_pg_database) GETSTRUCT(tup); + Clean2pcDBInfo *db_info; + MemoryContext oldcxt; + + oldcxt = MemoryContextSwitchTo(resultcxt); + + db_info = (Clean2pcDBInfo *) palloc(sizeof(Clean2pcDBInfo)); + + db_info->db_oid = HeapTupleGetOid(tup); + db_info->db_name = pstrdup(NameStr(pgdatabase->datname)); + + dblist = lappend(dblist, db_info); + MemoryContextSwitchTo(oldcxt); + } + + heap_endscan(scan); + heap_close(rel, AccessShareLock); + + CommitTransactionCommand(); + + return dblist; +} + +static Oid +get_default_database(void) +{ + Oid default_db = InvalidOid; + Oid template0_db = InvalidOid; + Oid template1_db = InvalidOid; + List *dblist = NULL; + ListCell *cell = NULL; + Clean2pcDBInfo *db_info = NULL; + char *default_db_name = NULL; + + /* Get a list of databases */ + dblist = get_database_list(); + foreach(cell, dblist) + { + db_info = lfirst(cell); + + if (strcmp(db_info->db_name, DB_DEFAULT) == 0) + { + default_db = db_info->db_oid; + default_db_name = db_info->db_name; + break; + } + + if (strcmp(db_info->db_name, DB_TEMPLATE0) == 0) + { + template0_db = db_info->db_oid; + continue; + } + + if (strcmp(db_info->db_name, DB_TEMPLATE1) == 0) + { + template1_db = db_info->db_oid; + continue; + } + + if (!OidIsValid(default_db)) + { + default_db = db_info->db_oid; + default_db_name = db_info->db_name; + } + } + + if (!OidIsValid(default_db)) + { + if (OidIsValid(template1_db)) + { + default_db = template1_db; + default_db_name = DB_TEMPLATE1; + } else if (OidIsValid(template0_db)) + { + default_db = template0_db; + default_db_name = DB_TEMPLATE0; + } + } + + Assert(OidIsValid(default_db)); + + elog(DEBUG2, "get default db: oid(%d), name(%s)", default_db, default_db_name); + + return default_db; +} + +/* + * start query worker to query 2pc + */ +static void +start_query_worker(void) +{ + Oid db_oid = get_default_database(); + if (!OidIsValid(db_oid)) + { + elog(WARNING, "get default database failed"); + return; + } + + Assert(OidIsValid(db_oid)); + + LWLockAcquire(Clean2pcLock, LW_EXCLUSIVE); + + while (Clean2pcShmem->worker_running) + { + LWLockRelease(Clean2pcLock); + + if (got_SIGTERM) + { + proc_exit(0); + } + + pg_usleep(1000000L); /* wait 1s */ + + elog(LOG, "waiting to db(%d)", Clean2pcShmem->worker_db); + + LWLockAcquire(Clean2pcLock, LW_EXCLUSIVE); + } + + Clean2pcShmem->worker_running = true; + Clean2pcShmem->db_count = 0; + Clean2pcShmem->worker_db = db_oid; + + LWLockRelease(Clean2pcLock); + + SendPostmasterSignal(PMSIGNAL_START_CLEAN_2PC_WORKER); + + pg_usleep(1000000L); /* wait 1s */ +} + +/* + * start clean worker to clean 2pc + */ +static void +start_clean_worker(int count) +{ + int i = 0; + + for (i = 0; i < count; i++) + { + LWLockAcquire(Clean2pcLock, LW_EXCLUSIVE); + + while (Clean2pcShmem->worker_running) + { + LWLockRelease(Clean2pcLock); + + if (got_SIGTERM) + { + proc_exit(0); + } + + pg_usleep(1000000L); /* wait 1s */ + + elog(LOG, "waiting to db(%d)", Clean2pcShmem->worker_db); + + LWLockAcquire(Clean2pcLock, LW_EXCLUSIVE); + } + + Clean2pcShmem->worker_db = 
Clean2pcShmem->db_list[i]; + + if (Clean2pcShmem->db_count != count) + { + elog(WARNING, "db_count(%d)!=count(%d)", Clean2pcShmem->db_count, count); + LWLockRelease(Clean2pcLock); + break; + } + + if (!OidIsValid(Clean2pcShmem->worker_db)) + { + elog(WARNING, "get invalid oid, count: %d, i: %d", count, i); + LWLockRelease(Clean2pcLock); + continue; + } + + Clean2pcShmem->worker_running = true; + SendPostmasterSignal(PMSIGNAL_START_CLEAN_2PC_WORKER); + + LWLockRelease(Clean2pcLock); + + pg_usleep(1000000L); /* wait 1s */ + } +} + +/* + * on_proc_exit callback to set worker_running to false + */ +static void +ExitCleanRunning(int status, Datum arg) +{ + if (Clean2pcShmem->worker_running) + { + Clean2pcShmem->worker_running = false; + elog(LOG, "2pc clean worker exit abnormally"); + } + else + { + elog(DEBUG5, "2pc clean worker exit normally"); + } +} + +/* + * Clean2pcShmemSize + * Compute space needed for clean 2pc related shared memory + */ +Size +Clean2pcShmemSize(void) +{ + Size size; + + /* + * Need the fixed struct and the array of WorkerInfoData. + */ + size = sizeof(Clean2pcShmemStruct); + size = MAXALIGN(size); + + return size; +} + +/* + * Clean2pcShmemInit + * Allocate and initialize clean 2pc related shared memory + */ +void +Clean2pcShmemInit(void) +{ + bool found; + Clean2pcShmem = (Clean2pcShmemStruct *) ShmemInitStruct("Clean 2pc Data", + Clean2pcShmemSize(), + &found); +} + +#ifdef EXEC_BACKEND +/* + * forkexec routine for the 2pc clean launcher process. + * + * Format up the arglist, then fork and exec. + */ +static pid_t +clean_2pc_launcher_forkexec(void) +{ + char *av[10]; + int ac = 0; + + av[ac++] = "postgres"; + av[ac++] = "--forkclean2pclauncher"; + av[ac++] = NULL; /* filled in by postmaster_forkexec */ + av[ac] = NULL; + + Assert(ac < lengthof(av)); + + return postmaster_forkexec(ac, av); +} + +/* + * forkexec routine for the 2pc clean worker process. + * + * Format up the arglist, then fork and exec. 
+ */ +static pid_t +clean_2pc_worker_forkexec(void) +{ + char *av[10]; + int ac = 0; + + av[ac++] = "postgres"; + av[ac++] = "--forkclean2pcworker"; + av[ac++] = NULL; /* filled in by postmaster_forkexec */ + av[ac] = NULL; + + Assert(ac < lengthof(av)); + + return postmaster_forkexec(ac, av); +} + +/* + * We need this set from the outside, before InitProcess is called + */ +void +Clean2pcLauncherIAm(void) +{ + am_clean_2pc_launcher = true; +} + +/* + * We need this set from the outside, before InitProcess is called + */ +void +Clean2pcWorkerIAm(void) +{ + am_clean_2pc_worker = true; +} +#endif diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c index 76d4ff19..1286cd1d 100644 --- a/src/backend/postmaster/pgstat.c +++ b/src/backend/postmaster/pgstat.c @@ -49,6 +49,7 @@ #include "miscadmin.h" #include "pg_trace.h" #include "postmaster/autovacuum.h" +#include "postmaster/clean2pc.h" #include "postmaster/fork_process.h" #include "postmaster/postmaster.h" #include "replication/walsender.h" @@ -2902,6 +2903,16 @@ pgstat_bestart(void) /* Autovacuum Worker */ beentry->st_backendType = B_AUTOVAC_WORKER; } + else if (IsClean2pcLauncher()) + { + /* Clean 2pc Launcher */ + beentry->st_backendType = B_CLEAN_2PC_LAUNCHER; + } + else if (IsClean2pcWorker()) + { + /* Clean 2pc Worker */ + beentry->st_backendType = B_CLEAN_2PC_WORKER; + } else if (am_walsender) { /* Wal sender */ @@ -4191,6 +4202,12 @@ pgstat_get_backend_desc(BackendType backendType) case B_PGXL_CLUSTER_MONITOR: backendDesc = "cluster monitor"; break; + case B_CLEAN_2PC_LAUNCHER: + backendDesc = "2pc clean launcher"; + break; + case B_CLEAN_2PC_WORKER: + backendDesc = "2pc clean worker"; + break; } return backendDesc; diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index c3fe228d..10be77cd 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -120,6 +120,7 @@ #include "pgstat.h" #include "postmaster/autovacuum.h" #include "postmaster/bgworker_internals.h" +#include "postmaster/clean2pc.h" #include "postmaster/fork_process.h" #include "postmaster/pgarch.h" #include "postmaster/postmaster.h" @@ -312,6 +313,7 @@ static pid_t StartupPID = 0, WalWriterPID = 0, WalReceiverPID = 0, AutoVacPID = 0, + Clean2pcPID = 0, PgArchPID = 0, PgStatPID = 0, #ifdef __TBASE__ @@ -2025,6 +2027,12 @@ ServerLoop(void) start_autovac_launcher = false; /* signal processed */ } + if (IS_PGXC_COORDINATOR && Clean2pcPID == 0 && + pmState == PM_RUN && enable_clean_2pc_launcher) + { + Clean2pcPID = StartClean2pcLauncher(); + } + /* If we have lost the stats collector, try to start a new one */ if (PgStatPID == 0 && (pmState == PM_RUN || pmState == PM_HOT_STANDBY)) @@ -2861,6 +2869,8 @@ SIGHUP_handler(SIGNAL_ARGS) signal_child(WalReceiverPID, SIGHUP); if (AutoVacPID != 0) signal_child(AutoVacPID, SIGHUP); + if (Clean2pcPID != 0) + signal_child(Clean2pcPID, SIGHUP); if (PgArchPID != 0) signal_child(PgArchPID, SIGHUP); if (SysLoggerPID != 0) @@ -2966,6 +2976,9 @@ pmdie(SIGNAL_ARGS) /* and the autovac launcher too */ if (AutoVacPID != 0) signal_child(AutoVacPID, SIGTERM); + /* and the clean 2pc launcher too */ + if (Clean2pcPID != 0) + signal_child(Clean2pcPID, SIGTERM); /* and the bgwriter too */ if (BgWriterPID != 0) signal_child(BgWriterPID, SIGTERM); @@ -3093,6 +3106,9 @@ pmdie(SIGNAL_ARGS) /* and the autovac launcher too */ if (AutoVacPID != 0) signal_child(AutoVacPID, SIGTERM); + /* and the clean 2pc launcher too */ + if (Clean2pcPID != 0) + 
signal_child(Clean2pcPID, SIGTERM); /* and the walwriter too */ if (WalWriterPID != 0) signal_child(WalWriterPID, SIGTERM); @@ -3272,6 +3288,9 @@ reaper(SIGNAL_ARGS) */ if (!IsBinaryUpgrade && AutoVacuumingActive() && AutoVacPID == 0) AutoVacPID = StartAutoVacLauncher(); + if (IS_PGXC_COORDINATOR && Clean2pcPID == 0 && + pmState == PM_RUN && enable_clean_2pc_launcher) + Clean2pcPID = StartClean2pcLauncher(); if (PgArchStartupAllowed() && PgArchPID == 0) PgArchPID = pgarch_start(); if (PgStatPID == 0) @@ -3431,6 +3450,21 @@ reaper(SIGNAL_ARGS) } /* + * Was it the clean 2pc launcher? Normal exit can be ignored; we'll + * start a new one at the next iteration of the postmaster's main + * loop, if necessary. Any other exit condition is treated as a + * crash. + */ + if (pid == Clean2pcPID) + { + Clean2pcPID = 0; + if (!EXIT_STATUS_0(exitstatus)) + HandleChildCrash(pid, exitstatus, + _("clean 2pc launcher process")); + continue; + } + + /* * Was it the archiver? If so, just try to start a new one; no need * to force reset of the rest of the system. (If fail, we'll try * again in future cycles of the main loop.). Unless we were waiting @@ -4001,6 +4035,18 @@ HandleChildCrash(int pid, int exitstatus, const char *procname) signal_child(AutoVacPID, (SendStop ? SIGSTOP : SIGQUIT)); } + /* Take care of the clean 2pc process too */ + if (pid == Clean2pcPID) + Clean2pcPID = 0; + else if (Clean2pcPID != 0 && take_action) + { + ereport(DEBUG2, + (errmsg_internal("sending %s to process %d", + (SendStop ? "SIGSTOP" : "SIGQUIT"), + (int) Clean2pcPID))); + signal_child(Clean2pcPID, (SendStop ? SIGSTOP : SIGQUIT)); + } + #ifdef PGXC /* Take care of the pool manager too */ if (pid == PgPoolerPID) @@ -4237,6 +4283,7 @@ PostmasterStateMachine(void) (CheckpointerPID == 0 || (!FatalError && Shutdown < ImmediateShutdown)) && WalWriterPID == 0 && + Clean2pcPID == 0 && AutoVacPID == 0) { if (Shutdown >= ImmediateShutdown || FatalError) @@ -4345,6 +4392,7 @@ PostmasterStateMachine(void) Assert(CheckpointerPID == 0); Assert(WalWriterPID == 0); Assert(AutoVacPID == 0); + Assert(Clean2pcPID == 0); /* syslogger is not considered here */ pmState = PM_NO_CHILDREN; } @@ -4558,6 +4606,8 @@ TerminateChildren(int signal) signal_child(WalReceiverPID, signal); if (AutoVacPID != 0) signal_child(AutoVacPID, signal); + if (Clean2pcPID != 0) + signal_child(Clean2pcPID, signal); if (PgArchPID != 0) signal_child(PgArchPID, signal); if (PgStatPID != 0) @@ -5417,6 +5467,12 @@ SubPostmasterMain(int argc, char *argv[]) if (strcmp(argv[1], "--forkavworker") == 0) AutovacuumWorkerIAm(); + /* clean 2pc needs this set before calling InitProcess */ + if (strcmp(argv[1], "--forkclean2pclauncher") == 0) + Clean2pcLauncherIAm(); + if (strcmp(argv[1], "--forkclean2pcworker") == 0) + Clean2pcWorkerIAm(); + /* * Start our win32 signal implementation. 
This has to be done after we * read the backend variables, because we need to pick up the signal pipe @@ -5534,6 +5590,32 @@ SubPostmasterMain(int argc, char *argv[]) AutoVacWorkerMain(argc - 2, argv + 2); /* does not return */ } + if (strcmp(argv[1], "--forkclean2pclauncher") == 0) + { + /* Restore basic shared memory pointers */ + InitShmemAccess(UsedShmemSegAddr); + + /* Need a PGPROC to run CreateSharedMemoryAndSemaphores */ + InitProcess(); + + /* Attach process to shared data structures */ + CreateSharedMemoryAndSemaphores(false, 0); + + Clean2pcLauncherMain(argc - 2, argv + 2); /* does not return */ + } + if (strcmp(argv[1], "--forkclean2pcworker") == 0) + { + /* Restore basic shared memory pointers */ + InitShmemAccess(UsedShmemSegAddr); + + /* Need a PGPROC to run CreateSharedMemoryAndSemaphores */ + InitProcess(); + + /* Attach process to shared data structures */ + CreateSharedMemoryAndSemaphores(false, 0); + + Clean2pcWorkerMain(argc - 2, argv + 2); /* does not return */ + } if (strncmp(argv[1], "--forkbgworker=", 15) == 0) { int shmem_slot; @@ -5782,6 +5864,20 @@ sigusr1_handler(SIGNAL_ARGS) StartAutovacuumWorker(); } + if (CheckPostmasterSignal(PMSIGNAL_WAKEN_CLEAN_2PC_TRIGGER) && + Shutdown == NoShutdown && Clean2pcPID != 0) + { + /* send SIGUSR2 to clean 2pc launcher to trigger clean */ + signal_child(Clean2pcPID, SIGUSR2); + } + + if (CheckPostmasterSignal(PMSIGNAL_START_CLEAN_2PC_WORKER) && + Shutdown == NoShutdown) + { + /* The clean 2pc launcher wants us to start a worker process. */ + StartClean2pcWorker(); + } + if (CheckPostmasterSignal(PMSIGNAL_START_WALRECEIVER)) { /* Startup Process wants us to start the walreceiver process. */ diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c index 3cdb9063..ae1a9029 100644 --- a/src/backend/storage/ipc/ipci.c +++ b/src/backend/storage/ipc/ipci.c @@ -90,6 +90,7 @@ #include "postmaster/clustermon.h" #endif #include "postmaster/autovacuum.h" +#include "postmaster/clean2pc.h" #include "postmaster/clustermon.h" #include "postmaster/bgworker_internals.h" #include "postmaster/bgwriter.h" @@ -245,6 +246,7 @@ CreateSharedMemoryAndSemaphores(bool makePrivate, int port) size = add_size(size, ReplicationOriginShmemSize()); size = add_size(size, WalSndShmemSize()); size = add_size(size, WalRcvShmemSize()); + size = add_size(size, Clean2pcShmemSize()); #ifdef XCP if (IS_PGXC_DATANODE) size = add_size(size, SharedQueueShmemSize()); @@ -421,6 +423,8 @@ CreateSharedMemoryAndSemaphores(bool makePrivate, int port) WalRcvShmemInit(); ApplyLauncherShmemInit(); + Clean2pcShmemInit(); + #ifdef XCP /* * Set up distributed executor's shared queues diff --git a/src/backend/storage/lmgr/lwlocknames.txt b/src/backend/storage/lmgr/lwlocknames.txt index ba4cfcbf..320a55e9 100644 --- a/src/backend/storage/lmgr/lwlocknames.txt +++ b/src/backend/storage/lmgr/lwlocknames.txt @@ -76,4 +76,5 @@ DualWriteLock 58 #ifdef __TBASE__ AnalyzeInfoLock 59 UserAuthLock 60 +Clean2pcLock 61 #endif \ No newline at end of file diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c index 9ae8494f..9288b715 100644 --- a/src/backend/storage/lmgr/proc.c +++ b/src/backend/storage/lmgr/proc.c @@ -42,6 +42,7 @@ #include "miscadmin.h" #include "pgstat.h" #include "postmaster/autovacuum.h" +#include "postmaster/clean2pc.h" #ifdef PGXC #include "pgxc/pgxc.h" #include "pgxc/poolmgr.h" @@ -63,6 +64,10 @@ #include "storage/lock.h" #endif +#define AUTOVAC_LAUNCHER_NUM 1 +#define CLEAN_2PC_LAUNCHER_NUM 1 +#define CLEAN_2PC_WORKER_NUM 3 + /* 
GUC variables */ int DeadlockTimeout = 1000; int StatementTimeout = 0; @@ -187,6 +192,7 @@ InitProcGlobal(void) ProcGlobal->freeProcs = NULL; ProcGlobal->autovacFreeProcs = NULL; ProcGlobal->bgworkerFreeProcs = NULL; + ProcGlobal->clean2pcFreeProcs = NULL; ProcGlobal->startupProc = NULL; ProcGlobal->startupProcPid = 0; ProcGlobal->startupBufferPinWaitBufId = -1; @@ -256,13 +262,21 @@ InitProcGlobal(void) ProcGlobal->freeProcs = &procs[i]; procs[i].procgloballist = &ProcGlobal->freeProcs; } - else if (i < MaxConnections + autovacuum_max_workers + 1) + else if (i < MaxConnections + autovacuum_max_workers + AUTOVAC_LAUNCHER_NUM) { /* PGPROC for AV launcher/worker, add to autovacFreeProcs list */ procs[i].links.next = (SHM_QUEUE *) ProcGlobal->autovacFreeProcs; ProcGlobal->autovacFreeProcs = &procs[i]; procs[i].procgloballist = &ProcGlobal->autovacFreeProcs; } + else if (i < MaxConnections + autovacuum_max_workers + AUTOVAC_LAUNCHER_NUM + + CLEAN_2PC_LAUNCHER_NUM + CLEAN_2PC_WORKER_NUM) + { + /* PGPROC for clean 2pc, add to clean2pcFreeProcs list */ + procs[i].links.next = (SHM_QUEUE *) ProcGlobal->clean2pcFreeProcs; + ProcGlobal->clean2pcFreeProcs = &procs[i]; + procs[i].procgloballist = &ProcGlobal->clean2pcFreeProcs; + } else if (i < MaxBackends) { /* PGPROC for bgworker, add to bgworkerFreeProcs list */ @@ -314,6 +328,8 @@ InitProcess(void) procgloballist = &ProcGlobal->autovacFreeProcs; else if (IsBackgroundWorker) procgloballist = &ProcGlobal->bgworkerFreeProcs; + else if (IsAnyClean2pcProcess()) + procgloballist = &ProcGlobal->clean2pcFreeProcs; else procgloballist = &ProcGlobal->freeProcs; @@ -362,7 +378,7 @@ InitProcess(void) * cleaning up. (XXX autovac launcher currently doesn't participate in * this; it probably should.) */ - if (IsUnderPostmaster && !IsAutoVacuumLauncherProcess()) + if (IsUnderPostmaster && !IsAutoVacuumLauncherProcess() && !IsAnyClean2pcProcess()) MarkPostmasterChildActive(); /* @@ -921,7 +937,7 @@ ProcKill(int code, Datum arg) * way, so tell the postmaster we've cleaned up acceptably well. (XXX * autovac launcher should be included here someday) */ - if (IsUnderPostmaster && !IsAutoVacuumLauncherProcess()) + if (IsUnderPostmaster && !IsAutoVacuumLauncherProcess() && !IsAnyClean2pcProcess()) MarkPostmasterChildInactive(); /* wake autovac launcher if needed -- see comments in FreeWorkerInfo */ diff --git a/src/backend/utils/init/miscinit.c b/src/backend/utils/init/miscinit.c index ba4c192d..460061f3 100644 --- a/src/backend/utils/init/miscinit.c +++ b/src/backend/utils/init/miscinit.c @@ -45,6 +45,7 @@ #endif #include "pgstat.h" #include "postmaster/autovacuum.h" +#include "postmaster/clean2pc.h" #include "postmaster/postmaster.h" #include "storage/fd.h" #include "storage/ipc.h" @@ -612,7 +613,7 @@ InitializeSessionUserIdStandalone(void) * This function should only be called in single-user mode, in autovacuum * workers, and in background workers. 
*/ - AssertState(!IsUnderPostmaster || IsAutoVacuumWorkerProcess() || IsBackgroundWorker); + AssertState(!IsUnderPostmaster || IsAutoVacuumWorkerProcess() || IsBackgroundWorker || IsClean2pcWorker()); /* call only once */ AssertState(!OidIsValid(AuthenticatedUserId)); diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c index bfe9572d..0d529949 100644 --- a/src/backend/utils/init/postinit.c +++ b/src/backend/utils/init/postinit.c @@ -42,6 +42,7 @@ #include "postmaster/clustermon.h" #endif #include "postmaster/autovacuum.h" +#include "postmaster/clean2pc.h" #include "postmaster/clustermon.h" #include "postmaster/postmaster.h" #include "replication/walsender.h" @@ -333,7 +334,7 @@ CheckMyDatabase(const char *name, bool am_superuser) * * We do not enforce them for autovacuum worker processes either. */ - if (IsUnderPostmaster && !IsAutoVacuumWorkerProcess()) + if (IsUnderPostmaster && !IsAutoVacuumWorkerProcess() && !IsClean2pcWorker()) { /* * Check that the database is currently allowing connections. @@ -691,7 +692,7 @@ InitPostgres(const char *in_dbname, Oid dboid, const char *username, before_shmem_exit(ShutdownPostgres, 0); /* The autovacuum launcher is done here */ - if (IsAutoVacuumLauncherProcess() || IsClusterMonitorProcess()) + if (IsAutoVacuumLauncherProcess() || IsClusterMonitorProcess() || IsClean2pcLauncher()) { /* report this backend in the PgBackendStatus array */ pgstat_bestart(); @@ -731,7 +732,7 @@ InitPostgres(const char *in_dbname, Oid dboid, const char *username, * In standalone mode and in autovacuum worker processes, we use a fixed * ID, otherwise we figure it out from the authenticated user name. */ - if (bootstrap || IsAutoVacuumWorkerProcess()) + if (bootstrap || IsAutoVacuumWorkerProcess() || IsClean2pcWorker()) { InitializeSessionUserIdStandalone(); am_superuser = true; @@ -1020,7 +1021,7 @@ InitPostgres(const char *in_dbname, Oid dboid, const char *username, */ RelationCacheInitializePhase3(); #ifdef _MLS_ - if (bootstrap || IsAutoVacuumWorkerProcess() || !IsUnderPostmaster || IsBackgroundWorker) + if (bootstrap || IsAutoVacuumWorkerProcess() || IsClean2pcWorker() || !IsUnderPostmaster || IsBackgroundWorker) { ; } diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 42619d16..56abdb1a 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -85,6 +85,7 @@ #include "postmaster/autovacuum.h" #include "postmaster/bgworker_internals.h" #include "postmaster/bgwriter.h" +#include "postmaster/clean2pc.h" #include "postmaster/postmaster.h" #include "postmaster/syslogger.h" #include "postmaster/walwriter.h" @@ -2726,8 +2727,27 @@ static struct config_bool ConfigureNamesBool[] = false, NULL, NULL, NULL }, + { + {"enable_2pc_error_stop", PGC_USERSET, CUSTOM_OPTIONS, + gettext_noop("Enable 2PC stop when commit prepared error."), + NULL + }, + &enable_2pc_error_stop, + false, + NULL, NULL, NULL + }, #endif + { + {"enable_clean_2pc_launcher", PGC_POSTMASTER, CUSTOM_OPTIONS, + gettext_noop("Enable clean 2PC launcher."), + NULL + }, + &enable_clean_2pc_launcher, + true, + NULL, NULL, NULL + }, + #ifdef __TBASE__ { {"enable_lock_account", PGC_SUSET, CUSTOM_OPTIONS, @@ -4710,7 +4730,7 @@ static struct config_int ConfigureNamesInt[] = NULL }, &run_pg_clean, - 0, 0, 1, + 0, 0, 10, NULL, NULL, NULL }, #endif @@ -4833,6 +4853,28 @@ static struct config_int ConfigureNamesInt[] = }, #endif + { + {"auto_clean_2pc_interval", PGC_USERSET, CUSTOM_OPTIONS, + gettext_noop("auto clean 2pc interval"), + NULL, + 
GUC_UNIT_S + }, + &auto_clean_2pc_interval, + 30, 1, 3600, + NULL, NULL, NULL + }, + + { + {"auto_clean_2pc_delay", PGC_USERSET, CUSTOM_OPTIONS, + gettext_noop("auto clean 2pc delay"), + NULL, + GUC_UNIT_S + }, + &auto_clean_2pc_delay, + 3, 1, 600, + NULL, NULL, NULL + }, + { {"reconnect_gtm_retry_times", PGC_USERSET, CUSTOM_OPTIONS, gettext_noop("reconnect gtm retry times"), diff --git a/src/include/access/xact.h b/src/include/access/xact.h index a06c14d4..0e312662 100644 --- a/src/include/access/xact.h +++ b/src/include/access/xact.h @@ -59,6 +59,10 @@ extern bool XactReadOnly; extern bool GTM_ReadOnly; #endif +#ifdef __TWO_PHASE_TRANS__ +extern bool enable_2pc_error_stop; +#endif + /* * Xact is deferrable -- only meaningful (currently) for read only * SERIALIZABLE transactions diff --git a/src/include/pgstat.h b/src/include/pgstat.h index 2049d855..7976c39b 100644 --- a/src/include/pgstat.h +++ b/src/include/pgstat.h @@ -719,7 +719,9 @@ typedef enum BackendType B_WAL_SENDER, B_WAL_WRITER, B_PGXL_CLUSTER_MONITOR, - B_PGXL_POOLER + B_PGXL_POOLER, + B_CLEAN_2PC_LAUNCHER, + B_CLEAN_2PC_WORKER, } BackendType; diff --git a/src/include/postmaster/clean2pc.h b/src/include/postmaster/clean2pc.h new file mode 100644 index 00000000..1d6df23f --- /dev/null +++ b/src/include/postmaster/clean2pc.h @@ -0,0 +1,43 @@ +/*-------------------------------------------------------------------- + * clean2pc.h + * A clean 2pc process is a process able to clean the abnormal 2pc. + * + * + * Portions Copyright (c) 1996-2021, TDSQL-PG Development Group + * + * IDENTIFICATION + * src/include/postmaster/clean2pc.h + *-------------------------------------------------------------------- + */ +#ifndef CLEAN2PC_H +#define CLEAN2PC_H + +#include "storage/block.h" + +extern bool enable_clean_2pc_launcher; + +extern int auto_clean_2pc_interval; +extern int auto_clean_2pc_delay; + +extern bool IsClean2pcLauncher(void); +extern bool IsClean2pcWorker(void); + +#define IsAnyClean2pcProcess() \ + (IsClean2pcLauncher() || IsClean2pcWorker()) + +extern int StartClean2pcLauncher(void); +extern int StartClean2pcWorker(void); + +#ifdef EXEC_BACKEND +extern void Clean2pcLauncherMain(int argc, char *argv[]) pg_attribute_noreturn(); +extern void Clean2pcWorkerMain(int argc, char *argv[]) pg_attribute_noreturn(); + +extern void Clean2pcLauncherIAm(void); +extern void Clean2pcWorkerIAm(void); +#endif + +/* shared memory stuff */ +extern Size Clean2pcShmemSize(void); +extern void Clean2pcShmemInit(void); + +#endif /* CLEAN2PC_H */ diff --git a/src/include/storage/pmsignal.h b/src/include/storage/pmsignal.h index d186137b..3adcb74a 100644 --- a/src/include/storage/pmsignal.h +++ b/src/include/storage/pmsignal.h @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * * pmsignal.h - * routines for signaling the postmaster from its child processes + * routines for signaling the postmaster from its child processes * * * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group @@ -22,23 +22,25 @@ */ typedef enum { - PMSIGNAL_RECOVERY_STARTED, /* recovery has started */ - PMSIGNAL_BEGIN_HOT_STANDBY, /* begin Hot Standby */ - PMSIGNAL_WAKEN_ARCHIVER, /* send a NOTIFY signal to xlog archiver */ - PMSIGNAL_ROTATE_LOGFILE, /* send SIGUSR1 to syslogger to rotate logfile */ - PMSIGNAL_START_AUTOVAC_LAUNCHER, /* start an autovacuum launcher */ - PMSIGNAL_START_AUTOVAC_WORKER, /* start an autovacuum worker */ - PMSIGNAL_BACKGROUND_WORKER_CHANGE, /* background worker state change */ - 
PMSIGNAL_START_WALRECEIVER, /* start a walreceiver */ - PMSIGNAL_ADVANCE_STATE_MACHINE, /* advance postmaster's state machine */ + PMSIGNAL_RECOVERY_STARTED, /* recovery has started */ + PMSIGNAL_BEGIN_HOT_STANDBY, /* begin Hot Standby */ + PMSIGNAL_WAKEN_ARCHIVER, /* send a NOTIFY signal to xlog archiver */ + PMSIGNAL_ROTATE_LOGFILE, /* send SIGUSR1 to syslogger to rotate logfile */ + PMSIGNAL_START_AUTOVAC_LAUNCHER, /* start an autovacuum launcher */ + PMSIGNAL_START_AUTOVAC_WORKER, /* start an autovacuum worker */ + PMSIGNAL_BACKGROUND_WORKER_CHANGE, /* background worker state change */ + PMSIGNAL_START_WALRECEIVER, /* start a walreceiver */ + PMSIGNAL_ADVANCE_STATE_MACHINE, /* advance postmaster's state machine */ #ifdef __AUDIT__ - PMSIGNAL_ROTATE_AUDIT_LOGFILE, /* send SIGUSR1 to audit logger to rotate logfile */ - PMSIGNAL_WAKEN_AUDIT_LOGGER, /* send SIGUSR2 to audit logger to read audit log */ + PMSIGNAL_ROTATE_AUDIT_LOGFILE, /* send SIGUSR1 to audit logger to rotate logfile */ + PMSIGNAL_WAKEN_AUDIT_LOGGER, /* send SIGUSR2 to audit logger to read audit log */ #endif #ifdef __AUDIT_FGA__ PMSIGNAL_WAKEN_AUDIT_FGA_TRIGGER, /*send SIGUSR1 to audit fga bgworker to trigger function */ #endif - NUM_PMSIGNALS /* Must be last value of enum! */ + PMSIGNAL_WAKEN_CLEAN_2PC_TRIGGER, /* send SIGUSR2 to clean 2pc launcher to trigger clean */ + PMSIGNAL_START_CLEAN_2PC_WORKER, /* start an clean 2pc worker */ + NUM_PMSIGNALS /* Must be last value of enum! */ } PMSignalReason; /* PMSignalData is an opaque struct, details known only within pmsignal.c */ @@ -51,7 +53,7 @@ extern Size PMSignalShmemSize(void); extern void PMSignalShmemInit(void); extern void SendPostmasterSignal(PMSignalReason reason); extern bool CheckPostmasterSignal(PMSignalReason reason); -extern int AssignPostmasterChildSlot(void); +extern int AssignPostmasterChildSlot(void); extern bool ReleasePostmasterChildSlot(int slot); extern bool IsPostmasterChildWalSender(int slot); extern void MarkPostmasterChildActive(void); @@ -59,4 +61,4 @@ extern void MarkPostmasterChildInactive(void); extern void MarkPostmasterChildWalSender(void); extern bool PostmasterIsAlive(void); -#endif /* PMSIGNAL_H */ +#endif /* PMSIGNAL_H */ diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h index 48601659..32dea324 100644 --- a/src/include/storage/proc.h +++ b/src/include/storage/proc.h @@ -331,6 +331,8 @@ typedef struct PROC_HDR PGPROC *autovacFreeProcs; /* Head of list of bgworker free PGPROC structures */ PGPROC *bgworkerFreeProcs; + /* Head of list of clean 2pc process free PGPROC structures */ + PGPROC *clean2pcFreeProcs; /* First pgproc waiting for group XID clear */ pg_atomic_uint32 procArrayGroupFirst; /* WALWriter process's latch */ diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out index 58642165..4c0e6f5c 100644 --- a/src/test/regress/expected/sysviews.out +++ b/src/test/regress/expected/sysviews.out @@ -74,6 +74,7 @@ select name, setting from pg_settings where name like 'enable%'; -----------------------------------+--------- enable_2pc_entry_key_check | on enable_2pc_entry_trace | off + enable_2pc_error_stop | off enable_2pc_file_cache | on enable_2pc_file_check | off enable_2pc_recovery_info | on @@ -83,6 +84,7 @@ select name, setting from pg_settings where name like 'enable%'; enable_bitmapscan | on enable_buffer_mprotect | on enable_check_password | off + enable_clean_2pc_launcher | on enable_clog_mprotect | on enable_cls | on enable_cold_hot_router_print | off @@ -141,7 +143,7 @@ 
select name, setting from pg_settings where name like 'enable%'; enable_transparent_crypt | on enable_user_authority_force_check | off enable_xlog_mprotect | on -(68 rows) +(70 rows) -- Test that the pg_timezone_names and pg_timezone_abbrevs views are -- more-or-less working. We can't test their contents in any great detail From 898d391e52ec1e88446470fdf46e6518fe537e4e Mon Sep 17 00:00:00 2001 From: sigmalin Date: Wed, 25 Aug 2021 17:11:26 +0800 Subject: [PATCH 419/578] fix could not open relation with OID 0 http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131091148837 (merge request !621) --- src/backend/utils/cache/plancache.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/backend/utils/cache/plancache.c b/src/backend/utils/cache/plancache.c index 94579c4e..240a4f9d 100644 --- a/src/backend/utils/cache/plancache.c +++ b/src/backend/utils/cache/plancache.c @@ -2112,6 +2112,11 @@ SetRemoteSubplan(CachedPlanSource *plansource, const char *plan_string) */ PG_TRY(); { + /* + * Check for shared-cache-inval messages before restoring query plan, + * avoid oid conversion and other operations to find old data. + */ + AcceptInvalidationMessages(); set_portable_input(true); rstmt = (RemoteStmt *) stringToNode((char *) plan_string); } From 311302b2c1d4991e8bd14e3f4d1418f1c1cef663 Mon Sep 17 00:00:00 2001 From: whalesong Date: Tue, 12 Oct 2021 17:35:23 +0800 Subject: [PATCH 420/578] 2pc stop opt: add clean 2pc process (merge request 656), automatic test bugfix --- src/backend/postmaster/clean2pc.c | 63 +++++++++++++++++++++++-------- src/backend/utils/misc/guc.c | 11 ++++++ src/include/postmaster/clean2pc.h | 1 + 3 files changed, 59 insertions(+), 16 deletions(-) diff --git a/src/backend/postmaster/clean2pc.c b/src/backend/postmaster/clean2pc.c index 80ab7103..5e0ceaf2 100644 --- a/src/backend/postmaster/clean2pc.c +++ b/src/backend/postmaster/clean2pc.c @@ -58,6 +58,7 @@ bool enable_clean_2pc_launcher = true; int auto_clean_2pc_interval = 10; int auto_clean_2pc_delay = 3; +int auto_clean_2pc_timeout = 0; static volatile sig_atomic_t got_SIGTERM = false; static volatile sig_atomic_t got_SIGHUP = false; @@ -79,11 +80,11 @@ Clean2pcLauncherMain(int argc, char *argv[]) pg_attribute_noreturn(); NON_EXEC_STATIC void Clean2pcWorkerMain(int argc, char *argv[]) pg_attribute_noreturn(); -static void start_query_worker(void); +static void start_query_worker(TimestampTz clean_time); static void start_clean_worker(int count); -static void do_query_2pc(void); -static void do_clean_2pc(void); +static void do_query_2pc(TimestampTz clean_time); +static void do_clean_2pc(TimestampTz clean_time); static void clean_2pc_sigterm_handler(SIGNAL_ARGS); static void clean_2pc_sighup_handler(SIGNAL_ARGS); @@ -103,6 +104,8 @@ typedef struct Clean2pcDBInfo typedef struct { + TimestampTz clean_time; + bool worker_running; Oid worker_db; @@ -157,6 +160,7 @@ NON_EXEC_STATIC void Clean2pcLauncherMain(int argc, char *argv[]) { int wait_time = 0; + TimestampTz clean_time = GetCurrentTimestamp(); am_clean_2pc_launcher = true; @@ -239,13 +243,15 @@ Clean2pcLauncherMain(int argc, char *argv[]) if (got_SIGUSR2) { - elog(LOG, "2pc clean launcher got SIGUSR2"); got_SIGUSR2 = false; + clean_time = GetCurrentTimestamp(); wait_time = auto_clean_2pc_delay; + elog(LOG, "2pc clean launcher got SIGUSR2, clean_time: " + INT64_FORMAT, clean_time); continue; } - start_query_worker(); + start_query_worker(clean_time); if (got_SIGTERM || got_SIGHUP || got_SIGUSR2) { @@ -371,13 +377,13 @@ Clean2pcWorkerMain(int argc, char *argv[]) if 
(Clean2pcShmem->db_count == 0) { elog(DEBUG5, "query 2pc from db: %s", db_name); - do_query_2pc(); + do_query_2pc(Clean2pcShmem->clean_time); clean_db_count = Clean2pcShmem->db_count; } else { elog(LOG, "clean 2pc for db: %s", db_name); - do_clean_2pc(); + do_clean_2pc(Clean2pcShmem->clean_time); } Clean2pcShmem->worker_running = false; @@ -394,7 +400,7 @@ Clean2pcWorkerMain(int argc, char *argv[]) } static void -do_query_2pc(void) +do_query_2pc(TimestampTz clean_time) { int i = 0; int count_db = 0; @@ -412,6 +418,8 @@ do_query_2pc(void) TupleTableSlot *result = NULL; Var *dummy = NULL; int attr_num = 4; + int64 check_time = 0; + TimestampTz curr_time = GetCurrentTimestamp(); static const char *attr_name[] = {"gid", "database", "global_transaction_status", "transaction_status_on_allnodes"}; @@ -419,10 +427,25 @@ do_query_2pc(void) Assert(result_str != NULL); resetStringInfo(result_str); - snprintf(query, SQL_CMD_LEN, "select * FROM pg_clean_check_txn(%d) " - "order by database limit 1000;", auto_clean_2pc_delay); + check_time = (curr_time - clean_time)/USECS_PER_SEC; - elog(DEBUG2, "node(%d) query: %s", PGXCNodeId, query); + if (check_time < 0) + { + elog(WARNING, "Invalid check_time: " INT64_FORMAT + ", curr_time: " INT64_FORMAT ", clean_time: " INT64_FORMAT, + check_time, curr_time, clean_time); + return; + } + + if (check_time > INT32_MAX) + { + check_time = INT32_MAX; + } + + snprintf(query, SQL_CMD_LEN, "select * FROM pg_clean_check_txn(" + INT64_FORMAT ") order by database limit 1000;", check_time); + + elog(DEBUG1, "node(%d) query: %s", PGXCNodeId, query); StartTransactionCommand(); @@ -544,7 +567,7 @@ do_query_2pc(void) } static void -do_clean_2pc(void) +do_clean_2pc(TimestampTz clean_time) { int i = 0; int count = 0; @@ -558,13 +581,10 @@ do_clean_2pc(void) int attr_num = 4; static const char *attr_name[] = {"gid", "global_transaction_status", "operation", "operation_status"}; - TimestampTz clean_time = 0; Assert(result_str != NULL); resetStringInfo(result_str); - clean_time = GetCurrentTimestamp() - USECS_PER_SEC * auto_clean_2pc_delay; - snprintf(query, SQL_CMD_LEN, "select * FROM pg_clean_execute_on_node('%s', %ld)" " limit 1000;", PGXCNodeName, clean_time); @@ -798,7 +818,7 @@ get_default_database(void) * start query worker to query 2pc */ static void -start_query_worker(void) +start_query_worker(TimestampTz clean_time) { Oid db_oid = get_default_database(); if (!OidIsValid(db_oid)) @@ -809,8 +829,19 @@ start_query_worker(void) Assert(OidIsValid(db_oid)); + if (auto_clean_2pc_timeout != 0) + { + TimestampTz curr_time = GetCurrentTimestamp(); + if (curr_time - clean_time > auto_clean_2pc_timeout * USECS_PER_SEC) + { + clean_time = curr_time - auto_clean_2pc_timeout * USECS_PER_SEC; + } + } + LWLockAcquire(Clean2pcLock, LW_EXCLUSIVE); + Clean2pcShmem->clean_time = clean_time; + while (Clean2pcShmem->worker_running) { LWLockRelease(Clean2pcLock); diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 56abdb1a..f1e0d2f7 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -4875,6 +4875,17 @@ static struct config_int ConfigureNamesInt[] = NULL, NULL, NULL }, + { + {"auto_clean_2pc_timeout", PGC_USERSET, CUSTOM_OPTIONS, + gettext_noop("auto clean 2pc timeout"), + NULL, + GUC_UNIT_S + }, + &auto_clean_2pc_timeout, + 0, 0, INT_MAX, + NULL, NULL, NULL + }, + { {"reconnect_gtm_retry_times", PGC_USERSET, CUSTOM_OPTIONS, gettext_noop("reconnect gtm retry times"), diff --git a/src/include/postmaster/clean2pc.h 
b/src/include/postmaster/clean2pc.h index 1d6df23f..2d94442b 100644 --- a/src/include/postmaster/clean2pc.h +++ b/src/include/postmaster/clean2pc.h @@ -18,6 +18,7 @@ extern bool enable_clean_2pc_launcher; extern int auto_clean_2pc_interval; extern int auto_clean_2pc_delay; +extern int auto_clean_2pc_timeout; extern bool IsClean2pcLauncher(void); extern bool IsClean2pcWorker(void); From eb008dcaa3064d7ec67f6016408ace9835163b7b Mon Sep 17 00:00:00 2001 From: bethding Date: Sat, 9 Oct 2021 20:27:13 +0800 Subject: [PATCH 421/578] fix ddl failed in non leader cn http://tapd.oa.com/pgxz/bugtrace/bugs/view?bug_id=1010092131093044435&jump_count=1 --- src/backend/pgxc/pool/pgxcnode.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c index 5b36f087..4d93ec43 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -5877,6 +5877,7 @@ delete_leadercn_handle(PGXCNodeAllHandles *pgxc_connections, { int co_conn_count = 0; int i = 0; + bool find_leader_handle = false; if (!pgxc_connections || !leader_cn_handle) return; @@ -5884,14 +5885,18 @@ delete_leadercn_handle(PGXCNodeAllHandles *pgxc_connections, co_conn_count = pgxc_connections->co_conn_count; for (i = 0; i < co_conn_count; i++) { - if (pgxc_connections->coord_handles[i] == leader_cn_handle) + if (pgxc_connections->coord_handles[i] == leader_cn_handle || find_leader_handle) { if (i+1 < co_conn_count) pgxc_connections->coord_handles[i] = pgxc_connections->coord_handles[i+1]; else pgxc_connections->coord_handles[i] = NULL; + + if (!find_leader_handle) + { pgxc_connections->co_conn_count--; - break; + find_leader_handle = true; + } } } } From b40778324bec0c279e26e808cf46c246e04a1917 Mon Sep 17 00:00:00 2001 From: andrelin Date: Mon, 18 Oct 2021 18:09:18 +0800 Subject: [PATCH 422/578] Revert a wrong code causing concurrent UPDATE of partition table coredump tapd: http://tapd.oa.com/TBase_Oracle_Migration/bugtrace/bugs/view?bug_id=1020421696093297211 --- src/backend/executor/execMain.c | 1 - 1 file changed, 1 deletion(-) diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c index d30ee629..ca60ff43 100644 --- a/src/backend/executor/execMain.c +++ b/src/backend/executor/execMain.c @@ -3677,7 +3677,6 @@ EvalPlanQualEnd(EPQState *epqstate) /* Mark EPQState idle */ epqstate->estate = NULL; - epqstate->parentestate = NULL; epqstate->planstate = NULL; epqstate->origslot = NULL; } From 3fcef588f29556f95e17140605e79910e66f2609 Mon Sep 17 00:00:00 2001 From: youngxie Date: Sun, 24 Oct 2021 08:42:25 +0800 Subject: [PATCH 423/578] save --- src/backend/optimizer/util/pgxcship.c | 4 +- src/backend/pgxc/locator/locator.c | 356 ++++++++++++++++++++++++-- src/include/optimizer/pgxcship.h | 1 + src/test/regress/sql/xc_FQS.sql | 21 +- 4 files changed, 347 insertions(+), 35 deletions(-) diff --git a/src/backend/optimizer/util/pgxcship.c b/src/backend/optimizer/util/pgxcship.c index b749e028..6ad269f9 100644 --- a/src/backend/optimizer/util/pgxcship.c +++ b/src/backend/optimizer/util/pgxcship.c @@ -117,8 +117,6 @@ static bool pgxc_is_func_shippable(Oid funcid); /* Check equijoin conditions on given relations */ static Expr *pgxc_find_dist_equijoin_qual(Relids varnos_1, Relids varnos_2, Oid distcol_type, Node *quals, List *rtable); -/* Merge given execution nodes based on join shippability conditions */ -static ExecNodes *pgxc_merge_exec_nodes(ExecNodes *en1, ExecNodes *en2); /* Check if given Query includes distribution column 
*/ static bool pgxc_query_has_distcolgrouping(Query *query); @@ -2485,7 +2483,7 @@ pgxc_find_dist_equi_nodes(Relids varnos_1, * exec_node corresponds to the JOIN of respective relations. * If both exec_nodes can not be merged, it returns NULL. */ -static ExecNodes * +ExecNodes * pgxc_merge_exec_nodes(ExecNodes *en1, ExecNodes *en2) {// #lizard forgives ExecNodes *merged_en = makeNode(ExecNodes); diff --git a/src/backend/pgxc/locator/locator.c b/src/backend/pgxc/locator/locator.c index 20abfd91..3ed59f90 100644 --- a/src/backend/pgxc/locator/locator.c +++ b/src/backend/pgxc/locator/locator.c @@ -41,6 +41,7 @@ #include "utils/varbit.h" #include "nodes/nodes.h" #include "optimizer/clauses.h" +#include "optimizer/pgxcship.h" #include "parser/parse_coerce.h" #include "pgxc/nodemgr.h" #include "pgxc/locator.h" @@ -51,13 +52,19 @@ #include "catalog/pgxc_node.h" #include "catalog/namespace.h" #include "access/hash.h" + #ifdef XCP + #include "utils/date.h" #include "utils/memutils.h" + #ifdef __COLD_HOT__ + #include "catalog/pgxc_key_values.h" #include "pgxc/shardmap.h" + #endif + /* * Locator details are private */ @@ -73,6 +80,7 @@ struct _Locator Datum secValue, bool secIsNull, #endif bool *hasprimary); + Oid dataType; /* values of that type are passed to locateNodes function */ LocatorListType listType; bool primary; @@ -100,6 +108,7 @@ struct _Locator void *nodeMap; /* map index to node reference according to listType */ void *results; /* array to output results */ }; + #endif #ifdef __COLD_HOT__ @@ -116,38 +125,47 @@ int num_preferred_data_nodes = 0; Oid preferred_data_node[MAX_PREFERRED_NODES]; #ifdef XCP + static int modulo_value_len(Oid dataType); + static int locate_static(Locator *self, Datum value, bool isnull, #ifdef __COLD_HOT__ Datum secValue, bool secIsNull, #endif bool *hasprimary); + static int locate_roundrobin(Locator *self, Datum value, bool isnull, #ifdef __COLD_HOT__ Datum secValue, bool secIsNull, #endif bool *hasprimary); + static int locate_modulo_random(Locator *self, Datum value, bool isnull, #ifdef __COLD_HOT__ Datum secValue, bool secIsNull, #endif bool *hasprimary); + static int locate_hash_insert(Locator *self, Datum value, bool isnull, #ifdef __COLD_HOT__ Datum secValue, bool secIsNull, #endif bool *hasprimary); + static int locate_hash_select(Locator *self, Datum value, bool isnull, #ifdef __COLD_HOT__ Datum secValue, bool secIsNull, #endif bool *hasprimary); + #ifdef _MIGRATE_ + static int locate_shard_insert(Locator *self, Datum value, bool isnull, #ifdef __COLD_HOT__ Datum secValue, bool secIsNull, #endif bool *hasprimary); + static int locate_shard_select(Locator *self, Datum value, bool isnull, #ifdef __COLD_HOT__ Datum secValue, bool secIsNull, @@ -155,22 +173,26 @@ static int locate_shard_select(Locator *self, Datum value, bool isnull, bool *hasprimary); #endif + static int locate_modulo_insert(Locator *self, Datum value, bool isnull, #ifdef __COLD_HOT__ Datum secValue, bool secIsNull, #endif bool *hasprimary); + static int locate_modulo_select(Locator *self, Datum value, bool isnull, #ifdef __COLD_HOT__ Datum secValue, bool secIsNull, #endif bool *hasprimary); + static Expr * pgxc_find_distcol_expr(Index varno, AttrNumber attrNum, Node *quals); #ifdef __COLD_HOT__ + static List * pgxc_find_distcol_exprs(Index varno, AttrNumber attrNum, Node *quals); @@ -189,6 +211,7 @@ static ExecNodes *GetRelationTimeStampRangeNodes(RelationLocInfo *rel_loc_info, static bool IsConstAligned(Oid reloid, Datum constvalue, AttrNumber secAttr); static bool TimeStampRange(Oid 
op); + #endif #endif @@ -218,14 +241,20 @@ GetPreferredReplicationNode(List *relNodes) { if (PGXCNodeGetNodeId(preferred_data_node[cnt_nodes], &nodetype) == lfirst_int(item)) + { nodeid = lfirst_int(item); } + } if (nodeid >= 0) + { break; } + } if (nodeid < 0) + { return list_make1_int(list_nth_int(relNodes, ((unsigned int) random()) % list_length(relNodes))); + } return list_make1_int(nodeid); } @@ -249,15 +278,19 @@ GetAnyDataNode(Bitmapset *nodes) /* OK, found one */ if (bms_is_member(nodeid, nodes)) + { preferred = bms_add_member(preferred, nodeid); } + } /* * If no preferred data nodes or they are not in the desired set, pick up * from the original set. */ if (bms_is_empty(preferred)) + { preferred = bms_copy(nodes); + } /* * Load balance. @@ -269,7 +302,9 @@ GetAnyDataNode(Bitmapset *nodes) /* If there is a single member nothing to balance */ if (nmembers == 1) + { return members[0]; + } /* * In general, the set may contain any number of nodes, and if we save @@ -307,12 +342,15 @@ char *pColName; pColName = GetRelationHashColumn(rel_loc_info); if (pColName == NULL) + { pColName = GetRelationModuloColumn(rel_loc_info); + } return pColName; } #ifdef _MIGRATE_ + /* * IsTypeDistributable * Returns whether the data type is distributable using a column value. @@ -350,10 +388,13 @@ IsTypeDistributable(Oid col_type) || col_type == NVARCHAR2OID #endif ) + { return true; + } return false; } + #endif /* @@ -377,9 +418,13 @@ GetRelationHashColumn(RelationLocInfo * rel_loc_info) char *column_str = NULL; if (rel_loc_info == NULL) + { column_str = NULL; + } else if (rel_loc_info->locatorType != LOCATOR_TYPE_HASH) + { column_str = NULL; + } else { int len = strlen(rel_loc_info->partAttrName); @@ -402,15 +447,21 @@ IsDistColumnForRelId(Oid relid, char *part_col_name) /* if no column is specified, we're done */ if (!part_col_name) + { return false; + } /* if no locator, we're done too */ if (!(rel_loc_info = GetRelationLocInfo(relid))) + { return false; + } /* is the table distributed by column value */ if (!IsRelationDistributedByValue(rel_loc_info)) + { return false; + } /* does the column name match the distribution column */ return !strcmp(part_col_name, rel_loc_info->partAttrName); @@ -438,9 +489,13 @@ GetRelationModuloColumn(RelationLocInfo * rel_loc_info) char *column_str = NULL; if (rel_loc_info == NULL) + { column_str = NULL; + } else if (rel_loc_info->locatorType != LOCATOR_TYPE_MODULO) + { column_str = NULL; + } else { int len = strlen(rel_loc_info->partAttrName); @@ -471,10 +526,14 @@ GetRoundRobinNode(Oid relid) /* Move round robin indicator to next node */ if (rel->rd_locator_info->roundRobinNode->next != NULL) + { rel->rd_locator_info->roundRobinNode = rel->rd_locator_info->roundRobinNode->next; + } else + { /* reset to first one */ rel->rd_locator_info->roundRobinNode = rel->rd_locator_info->rl_nodeList->head; + } relation_close(rel, AccessShareLock); @@ -494,14 +553,18 @@ IsTableDistOnPrimary(RelationLocInfo *rel_loc_info) if (!OidIsValid(primary_data_node) || rel_loc_info == NULL || list_length(rel_loc_info->rl_nodeList = 0)) + { return false; + } foreach(item, rel_loc_info->rl_nodeList) { char ntype = PGXC_NODE_DATANODE; if (PGXCNodeGetNodeId(primary_data_node, &ntype) == lfirst_int(item)) + { return true; } + } return false; } @@ -521,20 +584,28 @@ IsLocatorInfoEqual(RelationLocInfo *rel_loc_info1, RelationLocInfo *rel_loc_info /* Same relation? */ if (rel_loc_info1->relid != rel_loc_info2->relid) + { return false; + } /* Same locator type? 
*/ if (rel_loc_info1->locatorType != rel_loc_info2->locatorType) + { return false; + } /* Same attribute number? */ if (rel_loc_info1->partAttrNum != rel_loc_info2->partAttrNum) + { return false; + } /* Same node list? */ if (list_difference_int(nodeList1, nodeList2) != NIL || list_difference_int(nodeList2, nodeList1) != NIL) + { return false; + } /* Everything is equal */ return true; @@ -592,7 +663,9 @@ GetLocatorType(Oid relid) RelationLocInfo *ret_loc_info = GetRelationLocInfo(relid); if (ret_loc_info != NULL) + { ret = ret_loc_info->locatorType; + } return ret; } @@ -634,8 +707,10 @@ GetAllCoordNodes(void) */ if (i != PGXCNodeId - 1) + { nodeList = lappend_int(nodeList, i); } + } return nodeList; } @@ -727,7 +802,8 @@ RelationBuildLocator(Relation rel) curr_nodeoid = get_pgxc_nodeoid_extend(PGXCNodeName, PGXCMainClusterName); if (InvalidOid == curr_nodeoid) { - elog(ERROR, "no such node:%s on PGXCMainClusterName %s PGXCClustername %s", PGXCNodeName, PGXCMainClusterName, PGXCClusterName); + elog(ERROR, "no such node:%s on PGXCMainClusterName %s PGXCClustername %s", PGXCNodeName, PGXCMainClusterName, + PGXCClusterName); } node_in_group = DatanodeInGroup(&(pgxc_class->nodeoids), curr_nodeoid); @@ -761,7 +837,8 @@ RelationBuildLocator(Relation rel) GetShardNodes(pgxc_class->pcoldgroup, &datanodes, &dn_num, NULL); for(j = 0; j < dn_num; j++) { - relationLocInfo->rl_nodeList = list_append_unique_int(relationLocInfo->rl_nodeList, datanodes[j]); + relationLocInfo->rl_nodeList = list_append_unique_int(relationLocInfo->rl_nodeList, + datanodes[j]); } pfree(datanodes); } @@ -780,7 +857,8 @@ RelationBuildLocator(Relation rel) GetShardNodes(groups[i], &datanodes, &dn_num, NULL); for(j = 0; j < dn_num; j++) { - relationLocInfo->rl_nodeList = list_append_unique_int(relationLocInfo->rl_nodeList, datanodes[j]); + relationLocInfo->rl_nodeList = list_append_unique_int(relationLocInfo->rl_nodeList, + datanodes[j]); } pfree(datanodes); } @@ -849,7 +927,9 @@ GetRelationLocInfo(Oid relid) Assert(rel->rd_isvalid); if (rel->rd_locator_info) + { ret_loc_info = CopyRelationLocInfo(rel->rd_locator_info); + } relation_close(rel, AccessShareLock); @@ -864,7 +944,9 @@ GetRelationLocType(Oid relid) { RelationLocInfo *locinfo = GetRelationLocInfo(relid); if (!locinfo) + { return LOCATOR_TYPE_NONE; + } return locinfo->locatorType; } @@ -885,7 +967,9 @@ CopyRelationLocInfo(RelationLocInfo * src_info) dest_info->locatorType = src_info->locatorType; dest_info->partAttrNum = src_info->partAttrNum; if (src_info->partAttrName) + { dest_info->partAttrName = pstrdup(src_info->partAttrName); + } #ifdef _MIGRATE_ dest_info->groupId = src_info->groupId; #endif @@ -898,7 +982,9 @@ CopyRelationLocInfo(RelationLocInfo * src_info) } #endif if (src_info->rl_nodeList) + { dest_info->rl_nodeList = list_copy(src_info->rl_nodeList); + } /* Note, for round robin, we use the relcache entry */ return dest_info; @@ -914,11 +1000,15 @@ FreeRelationLocInfo(RelationLocInfo *relationLocInfo) if (relationLocInfo) { if (relationLocInfo->partAttrName) + { pfree(relationLocInfo->partAttrName); + } #ifdef __COLD_HOT__ if (relationLocInfo->secAttrName) + { pfree(relationLocInfo->secAttrName); + } #endif list_free(relationLocInfo->rl_nodeList); @@ -937,7 +1027,9 @@ FreeExecNodes(ExecNodes **exec_nodes) /* Nothing to do */ if (!tmp_en) + { return; + } list_free(tmp_en->primarynodelist); list_free(tmp_en->nodeList); pfree(tmp_en); @@ -946,6 +1038,7 @@ FreeExecNodes(ExecNodes **exec_nodes) #ifdef XCP + /* * Determine value length in bytes for specified 
type for a module locator. * Return -1 if module locator is not supported for the type. @@ -1038,6 +1131,7 @@ hash_func_ptr(Oid dataType) } #ifdef _MIGRATE_ + Locator * createLocator(char locatorType, RelationAccessType accessType, Oid dataType, LocatorListType listType, int nodeCount, @@ -1103,8 +1197,7 @@ createLocator(char locatorType, RelationAccessType accessType, int *intptr; nodeMap = palloc(locator->nodeCount * sizeof(int)); intptr = (int *) nodeMap; - foreach(lc, l) - *intptr++ = lfirst_int(lc); + foreach(lc, l) *intptr++ = lfirst_int(lc); locator->listType = LOCATOR_LIST_INT; } else if (IsA(l, OidList)) @@ -1112,8 +1205,7 @@ createLocator(char locatorType, RelationAccessType accessType, Oid *oidptr; nodeMap = palloc(locator->nodeCount * sizeof(Oid)); oidptr = (Oid *) nodeMap; - foreach(lc, l) - *oidptr++ = lfirst_oid(lc); + foreach(lc, l) *oidptr++ = lfirst_oid(lc); locator->listType = LOCATOR_LIST_OID; } else if (IsA(l, List)) @@ -1121,8 +1213,7 @@ createLocator(char locatorType, RelationAccessType accessType, void **voidptr; nodeMap = palloc(locator->nodeCount * sizeof(void *)); voidptr = (void **) nodeMap; - foreach(lc, l) - *voidptr++ = lfirst(lc); + foreach(lc, l) *voidptr++ = lfirst(lc); locator->listType = LOCATOR_LIST_POINTER; } else @@ -1478,7 +1569,9 @@ createLocator(char locatorType, RelationAccessType accessType, } if (result) + { *result = locator->results; + } return locator; } @@ -1493,7 +1586,9 @@ freeLocator(Locator *locator) * do not free it twice */ if (locator->results != locator->nodeMap) + { pfree(locator->results); + } pfree(locator); } @@ -1510,7 +1605,9 @@ locate_static(Locator *self, Datum value, bool isnull, { /* TODO */ if (hasprimary) + { *hasprimary = false; + } return self->nodeCount; } @@ -1527,9 +1624,13 @@ locate_roundrobin(Locator *self, Datum value, bool isnull, {// #lizard forgives /* TODO */ if (hasprimary) + { *hasprimary = false; + } if (++self->roundRobinNode >= self->nodeCount) + { self->roundRobinNode = 0; + } switch (self->listType) { case LOCATOR_LIST_NONE: @@ -1570,7 +1671,9 @@ locate_modulo_random(Locator *self, Datum value, bool isnull, int offset; if (hasprimary) + { *hasprimary = false; + } Assert(self->nodeCount > 0); offset = compute_modulo(abs(rand()), self->nodeCount); @@ -1611,9 +1714,13 @@ locate_hash_insert(Locator *self, Datum value, bool isnull, {// #lizard forgives int index; if (hasprimary) + { *hasprimary = false; + } if (isnull) + { index = 0; + } else { unsigned int hash32; @@ -1645,6 +1752,7 @@ locate_hash_insert(Locator *self, Datum value, bool isnull, } #ifdef _MIGRATE_ + static int locate_shard_insert(Locator *self, Datum value, bool isnull, #ifdef __COLD_HOT__ Datum secValue, bool secIsNull, @@ -1922,6 +2030,7 @@ static int locate_shard_select(Locator *self, Datum value, bool isnull, } } } + #endif @@ -1937,7 +2046,9 @@ locate_hash_select(Locator *self, Datum value, bool isnull, bool *hasprimary) {// #lizard forgives if (hasprimary) + { *hasprimary = false; + } if (isnull) { int i; @@ -2010,23 +2121,37 @@ locate_modulo_insert(Locator *self, Datum value, bool isnull, {// #lizard forgives int index; if (hasprimary) + { *hasprimary = false; + } if (isnull) + { index = 0; + } else { uint64 val; if (self->valuelen == 8) + { val = (uint64) (GET_8_BYTES(value)); + } else if (self->valuelen == 4) + { val = (uint64) (GET_4_BYTES(value)); + } else if (self->valuelen == 2) + { val = (uint64) (GET_2_BYTES(value)); + } else if (self->valuelen == 1) + { val = (uint64) (GET_1_BYTE(value)); + } else + { val = 0; + } index = 
compute_modulo(val, self->nodeCount); } @@ -2065,7 +2190,9 @@ locate_modulo_select(Locator *self, Datum value, bool isnull, bool *hasprimary) {// #lizard forgives if (hasprimary) + { *hasprimary = false; + } if (isnull) { int i; @@ -2100,15 +2227,25 @@ locate_modulo_select(Locator *self, Datum value, bool isnull, int index; if (self->valuelen == 8) + { val = (uint64) (GET_8_BYTES(value)); + } else if (self->valuelen == 4) + { val = (unsigned int) (GET_4_BYTES(value)); + } else if (self->valuelen == 2) + { val = (unsigned int) (GET_2_BYTES(value)); + } else if (self->valuelen == 1) + { val = (unsigned int) (GET_1_BYTE(value)); + } else + { val = 0; + } index = compute_modulo(val, self->nodeCount); @@ -2151,6 +2288,7 @@ GET_NODES(Locator *self, Datum value, bool isnull, } #ifdef __TBASE__ + char getLocatorDisType(Locator *self) { @@ -2179,7 +2317,9 @@ int calcDistReplications(char distributionType, Bitmapset *nodes) { if (!nodes) + { return 1; + } if (IsLocatorReplicated(distributionType) || IsLocatorNone(distributionType)) @@ -2189,6 +2329,7 @@ calcDistReplications(char distributionType, Bitmapset *nodes) return 1; } + #endif void * @@ -2210,6 +2351,7 @@ getLocatorNodeCount(Locator *self) { return self->nodeCount; } + #endif /* @@ -2248,7 +2390,9 @@ GetRelationNodes(RelationLocInfo *rel_loc_info, Datum valueForDistCol, #endif if (rel_loc_info == NULL) + { return NULL; + } if (IsLocatorDistributedByValue(rel_loc_info->locatorType)) @@ -2339,7 +2483,7 @@ GetRelationNodesByQuals(Oid reloid, RelationLocInfo *rel_loc_info, {// #lizard forgives #define ONE_SECOND_DATUM 1000000 Expr *distcol_expr = NULL; - ExecNodes *exec_nodes; + ExecNodes *exec_nodes = NULL; Datum distcol_value; bool distcol_isnull; #ifdef __COLD_HOT__ @@ -2352,6 +2496,15 @@ GetRelationNodesByQuals(Oid reloid, RelationLocInfo *rel_loc_info, Oid distcol_type = InvalidOid; Oid *opArray = NULL; bool *isswapArray = NULL; + Oid disttype; + int32 disttypmod; + + if (enable_distri_debug) + { + int r = 1; + while(r) + ; + } if (dis_qual) { @@ -2365,15 +2518,18 @@ GetRelationNodesByQuals(Oid reloid, RelationLocInfo *rel_loc_info, #endif if (!rel_loc_info) + { return NULL; + } /* * If the table distributed by value, check if we can reduce the Datanodes * by looking at the qualifiers for this relation */ + disttype = get_atttype(reloid, rel_loc_info->partAttrNum); + disttypmod = get_atttypmod(reloid, rel_loc_info->partAttrNum); + if (IsRelationDistributedByValue(rel_loc_info)) { - Oid disttype = get_atttype(reloid, rel_loc_info->partAttrNum); - int32 disttypmod = get_atttypmod(reloid, rel_loc_info->partAttrNum); distcol_expr = pgxc_find_distcol_expr(varno, rel_loc_info->partAttrNum, quals); /* @@ -2382,7 +2538,7 @@ GetRelationNodesByQuals(Oid reloid, RelationLocInfo *rel_loc_info, * will happen in case of inserting that type of expression value as the * distribution column value. */ - if (distcol_expr) + if (distcol_expr && !IsA(distcol_expr, ArrayExpr)) { distcol_expr = (Expr *)coerce_to_target_type(NULL, (Node *)distcol_expr, @@ -2553,7 +2709,9 @@ GetRelationNodesByQuals(Oid reloid, RelationLocInfo *rel_loc_info, if (isswapArray[i]) { /* const <= var */ - minStamp = minStamp ? ((const_expr->constvalue >= minStamp) ? minStamp : const_expr->constvalue) : const_expr->constvalue; + minStamp = minStamp ? ((const_expr->constvalue >= minStamp) ? 
minStamp + : const_expr->constvalue) + : const_expr->constvalue; seccol_type = const_expr->consttype; equal_min = true; } @@ -2577,7 +2735,9 @@ GetRelationNodesByQuals(Oid reloid, RelationLocInfo *rel_loc_info, else { /* var >= const */ - minStamp = minStamp ? ((const_expr->constvalue >= minStamp) ? minStamp : const_expr->constvalue) : const_expr->constvalue; + minStamp = minStamp ? ((const_expr->constvalue >= minStamp) ? minStamp + : const_expr->constvalue) + : const_expr->constvalue; seccol_type = const_expr->consttype; equal_min = true; } @@ -2677,9 +2837,68 @@ GetRelationNodesByQuals(Oid reloid, RelationLocInfo *rel_loc_info, } } #endif + + return GetRelationNodes(rel_loc_info, distcol_value, + distcol_isnull, + seccol_value, seccol_isnull, + relaccess); + } + else if (distcol_expr && IsA(distcol_expr, ArrayExpr) && + rel_loc_info->locatorType == LOCATOR_TYPE_SHARD && !seccol_list) + { + ArrayExpr *arrayExpr = (ArrayExpr *) distcol_expr; + ListCell *lc; + bool success = true; + Const *const_expr; + ExecNodes *temp; + + foreach(lc, arrayExpr->elements) + { + Node *expr = (Node *) lfirst(lc); + + /* convert to distribute column type */ + expr = coerce_to_target_type(NULL, + (Node *) expr, + exprType((Node *) expr), + disttype, disttypmod, + COERCION_ASSIGNMENT, + COERCE_IMPLICIT_CAST, -1); + expr = eval_const_expressions(NULL, + (Node *) expr); + if (!expr || !IsA(expr, Const)) + { + success = false; + break; + } + + const_expr = castNode(Const, expr); + temp = GetRelationNodes(rel_loc_info, const_expr->constvalue, + const_expr->constisnull, + seccol_value, seccol_isnull, + relaccess); + if (!temp) + { + success = false; + break; + } + + if (exec_nodes) + { + exec_nodes->nodeList = list_concat_unique(exec_nodes->nodeList, + temp->nodeList); } else { + exec_nodes = temp; + } + } + + if (success) + { + return exec_nodes; + } + } + distcol_value = (Datum) 0; distcol_isnull = true; #ifdef __TBASE__ @@ -2734,7 +2953,6 @@ GetRelationNodesByQuals(Oid reloid, RelationLocInfo *rel_loc_info, } } #endif - } exec_nodes = GetRelationNodes(rel_loc_info, distcol_value, distcol_isnull, @@ -2754,32 +2972,43 @@ GetRelationDistribColumn(RelationLocInfo *locInfo) { /* No relation, so simply leave */ if (!locInfo) + { return NULL; + } /* No distribution column if relation is not distributed with a key */ if (!IsRelationDistributedByValue(locInfo)) + { return NULL; + } /* Return column name */ return get_attname(locInfo->relid, locInfo->partAttrNum); } #ifdef __COLD_HOT__ + char * GetRelationSecDistribColumn(RelationLocInfo *locInfo) { /* No relation, so simply leave */ if (!locInfo) + { return NULL; + } /* No distribution column if relation is not distributed with a key */ if (!IsRelationDistributedByValue(locInfo)) + { return NULL; + } /* Return column name */ return get_attname(locInfo->relid, locInfo->secAttrNum); } + #endif + /* * pgxc_find_distcol_expr * Search through the quals provided and find out an expression which will give @@ -2802,13 +3031,19 @@ pgxc_find_distcol_expr(Index varno, /* If no quals, no distribution column expression */ if (!quals) + { return NULL; + } /* Convert the qualification into List if it's not already so */ if (!IsA(quals, List)) + { lquals = make_ands_implicit((Expr *)quals); + } else + { lquals = (List *)quals; + } /* * For every ANDed expression, check if that expression is of the form @@ -2817,21 +3052,45 @@ pgxc_find_distcol_expr(Index varno, foreach(qual_cell, lquals) { Expr *qual_expr = (Expr *)lfirst(qual_cell); - OpExpr *op; Expr *lexpr; Expr *rexpr; Var *var_expr; 
Expr *distcol_expr; + Oid opno; + + if (IsA(qual_expr, OpExpr)) + { + OpExpr *op; - if (!IsA(qual_expr, OpExpr)) - continue; op = (OpExpr *)qual_expr; + /* If not a binary operator, it can not be '='. */ if (list_length(op->args) != 2) + { continue; + } lexpr = linitial(op->args); rexpr = lsecond(op->args); + opno = op->opno; + } + else if (IsA(qual_expr, ScalarArrayOpExpr)) + { + ScalarArrayOpExpr *arrayOpExpr = (ScalarArrayOpExpr *) qual_expr; + + if (list_length(arrayOpExpr->args) != 2) + { + continue; + } + + lexpr = linitial(arrayOpExpr->args); + rexpr = lsecond(arrayOpExpr->args); + opno = arrayOpExpr->opno; + } + else + { + continue; + } /* * If either of the operands is a RelabelType, extract the Var in the RelabelType. @@ -2842,9 +3101,13 @@ pgxc_find_distcol_expr(Index varno, * should be shipped to one of the nodes only */ if (IsA(lexpr, RelabelType)) + { lexpr = ((RelabelType*)lexpr)->arg; + } if (IsA(rexpr, RelabelType)) + { rexpr = ((RelabelType*)rexpr)->arg; + } /* * If either of the operands is a Var expression, assume the other @@ -2862,32 +3125,43 @@ pgxc_find_distcol_expr(Index varno, distcol_expr = lexpr; } else + { continue; + } + /* * If Var found is not the distribution column of required relation, * check next qual */ if (var_expr->varno != varno || var_expr->varattno != attrNum) + { continue; + } + /* * If the operator is not an assignment operator, check next * constraint. An operator is an assignment operator if it's * mergejoinable or hashjoinable. Beware that not every assignment * operator is mergejoinable or hashjoinable, so we might leave some - * oportunity. But then we have to rely on the opname which may not + * opportunity. But then we have to rely on the opname which may not * be something we know to be equality operator as well. */ - if (!op_mergejoinable(op->opno, exprType((Node *)lexpr)) && - !op_hashjoinable(op->opno, exprType((Node *)lexpr))) + if (!op_mergejoinable(opno, exprType((Node *) var_expr)) && + !op_hashjoinable(opno, exprType((Node *) var_expr))) + { continue; + } + /* Found the distribution column expression return it */ return distcol_expr; + } /* Exhausted all quals, but no distribution column expression */ return NULL; } #ifdef __COLD_HOT__ + static bool IsConstAligned(Oid reloid, Datum constvalue, AttrNumber secAttr) {// #lizard forgives bool isalign = false; @@ -2974,13 +3248,19 @@ pgxc_find_distcol_exprs(Index varno, /* If no quals, no distribution column expression */ if (!quals) + { return NULL; + } /* Convert the qualification into List if it's not already so */ if (!IsA(quals, List)) + { lquals = make_ands_implicit((Expr *)quals); + } else + { lquals = (List *)quals; + } /* * For every ANDed expression, check if that expression is of the form @@ -3005,11 +3285,15 @@ pgxc_find_distcol_exprs(Index varno, } if (!IsA(qual_expr, OpExpr)) + { continue; + } op = (OpExpr *)qual_expr; /* If not a binary operator, it can not be '='. 
*/ if (list_length(op->args) != 2) + { continue; + } lexpr = linitial(op->args); rexpr = lsecond(op->args); @@ -3023,9 +3307,13 @@ pgxc_find_distcol_exprs(Index varno, * should be shipped to one of the nodes only */ if (IsA(lexpr, RelabelType)) + { lexpr = ((RelabelType*)lexpr)->arg; + } if (IsA(rexpr, RelabelType)) + { rexpr = ((RelabelType*)rexpr)->arg; + } /* * If either of the operands is a Var expression, assume the other @@ -3044,13 +3332,17 @@ pgxc_find_distcol_exprs(Index varno, isswap = true; } else + { continue; + } /* * If Var found is not the distribution column of required relation, * check next qual */ if (var_expr->varno != varno || var_expr->varattno != attrNum) + { continue; + } /* * If the operator is not an assignment operator, check next * constraint. An operator is an assignment operator if it's @@ -3100,7 +3392,9 @@ GetRelationTimeStampRangeNodes(RelationLocInfo *rel_loc_info, ExecNodes *exec_nodes; if (rel_loc_info == NULL) + { return NULL; + } switch (rel_loc_info->locatorType) @@ -3262,7 +3556,8 @@ GetRelationGroupsByQuals(Oid reloid, RelationLocInfo *rel_loc_info, Node *sec_qu if (isswapArray[i]) { /* const <= var */ - minStamp = minStamp ? ((const_expr->constvalue >= minStamp) ? minStamp : const_expr->constvalue) : const_expr->constvalue; + minStamp = minStamp ? ((const_expr->constvalue >= minStamp) ? minStamp : const_expr->constvalue) + : const_expr->constvalue; seccol_type = const_expr->consttype; } else @@ -3283,7 +3578,8 @@ GetRelationGroupsByQuals(Oid reloid, RelationLocInfo *rel_loc_info, Node *sec_qu else { /* var >= const */ - minStamp = minStamp ? ((const_expr->constvalue >= minStamp) ? minStamp : const_expr->constvalue) : const_expr->constvalue; + minStamp = minStamp ? ((const_expr->constvalue >= minStamp) ? minStamp : const_expr->constvalue) + : const_expr->constvalue; seccol_type = const_expr->consttype; } } @@ -3348,16 +3644,19 @@ GetRelationGroupsByQuals(Oid reloid, RelationLocInfo *rel_loc_info, Node *sec_qu List *oids = NULL; if (minStamp && maxStamp) { - if (IsHotData(minStamp, RELATION_ACCESS_READ, partitionStrategy, interval_step, start_timestamp) && IsHotData(maxStamp, RELATION_ACCESS_READ, partitionStrategy, interval_step, start_timestamp)) + if (IsHotData(minStamp, RELATION_ACCESS_READ, partitionStrategy, interval_step, start_timestamp) && + IsHotData(maxStamp, RELATION_ACCESS_READ, partitionStrategy, interval_step, start_timestamp)) { /* all hot data */ oids = lappend_oid(oids, rel_loc_info->groupId); } - else if (!IsHotData(minStamp, RELATION_ACCESS_READ, partitionStrategy, interval_step, start_timestamp) && !IsHotData(maxStamp, RELATION_ACCESS_READ, partitionStrategy, interval_step, start_timestamp)) + else if (!IsHotData(minStamp, RELATION_ACCESS_READ, partitionStrategy, interval_step, start_timestamp) && + !IsHotData(maxStamp, RELATION_ACCESS_READ, partitionStrategy, interval_step, start_timestamp)) { /* all cold data */ oids = lappend_oid(oids, rel_loc_info->coldGroupId); } - else if(!IsHotData(minStamp, RELATION_ACCESS_READ, partitionStrategy, interval_step, start_timestamp) && IsHotData(maxStamp, RELATION_ACCESS_READ, partitionStrategy, interval_step, start_timestamp)) + else if (!IsHotData(minStamp, RELATION_ACCESS_READ, partitionStrategy, interval_step, start_timestamp) && + IsHotData(maxStamp, RELATION_ACCESS_READ, partitionStrategy, interval_step, start_timestamp)) { /* range across cold and hot group */ oids = lappend_oid(oids, rel_loc_info->groupId); @@ -3407,10 +3706,12 @@ GetRelationGroupsByQuals(Oid reloid, RelationLocInfo 
*rel_loc_info, Node *sec_qu return oids; } } + #endif #ifdef _MLS_ extern char* g_default_locator_type; + char get_default_locator_type(void) { if (strlen(g_default_locator_type) == 0) @@ -3440,6 +3741,7 @@ char get_default_locator_type(void) return LOCATOR_TYPE_HASH; } + int get_default_distype(void) { if (strlen(g_default_locator_type) == 0) diff --git a/src/include/optimizer/pgxcship.h b/src/include/optimizer/pgxcship.h index c42f3a04..d29c2b8f 100644 --- a/src/include/optimizer/pgxcship.h +++ b/src/include/optimizer/pgxcship.h @@ -38,5 +38,6 @@ extern bool pgxc_is_trigger_shippable(Trigger *trigger); extern Node *get_var_from_arg(Node *arg); extern bool is_var_distribute_column(Var *var, List *rtable); +extern ExecNodes *pgxc_merge_exec_nodes(ExecNodes *en1, ExecNodes *en2); #endif #endif diff --git a/src/test/regress/sql/xc_FQS.sql b/src/test/regress/sql/xc_FQS.sql index bc99b709..53ce737e 100644 --- a/src/test/regress/sql/xc_FQS.sql +++ b/src/test/regress/sql/xc_FQS.sql @@ -276,7 +276,7 @@ explain (verbose on, costs off) delete from tab1_replicated where val = 7; select * from tab1_replicated where val = 7; -- Constant subquery -create table subquery_fqs(id int, a varchar, c int); +create table subquery_fqs(id int, a varchar, c int) distribute by shard(id); insert into subquery_fqs values(1,'gd', 2); insert into subquery_fqs values(1,'zj', 2); insert into subquery_fqs values(1,'sz', 2); @@ -284,15 +284,26 @@ explain select * from subquery_fqs t join (select 1 id, 'gd' a, 2 c from dual un select * from subquery_fqs t join (select 1 id, 'gd' a, 2 c from dual union select 1 id, 'sz' a, 2 c union select 1 id, 'zj' a, 2 c from dual) t2 ON (t.id = t2.id and t.a = t2.a); -- Support subquery FQS only if subquery distributed on same DN with main query(only 1 DN node) -explain select * from subquery_fqs t1 where t1.id = 1 and t1.c IN (select c from subquery_fqs t2 where t2.id=1); +explain (num_nodes on, verbose on, nodes off, costs off) select * from subquery_fqs t1 where t1.id = 1 and t1.c IN (select c from subquery_fqs t2 where t2.id=1); select * from subquery_fqs t1 where t1.id = 1 and t1.c IN (select c from subquery_fqs t2 where t2.id=1); -explain select * from subquery_fqs t1 where t1.id = 1 and t1.c = (select c from subquery_fqs t2 where t2.id=1 order by c limit 1); +explain (num_nodes on, verbose on, nodes off, costs off) select * from subquery_fqs t1 where t1.id = 1 and t1.c = (select c from subquery_fqs t2 where t2.id=1 order by c limit 1); select * from subquery_fqs t1 where t1.id = 1 and t1.c = (select c from subquery_fqs t2 where t2.id=1 order by c limit 1); -explain select * from subquery_fqs t1 where t1.id = 1 and t1.c = (select max(c) from subquery_fqs t2 where t2.id=1); +explain (num_nodes on, verbose on, nodes off, costs off) select * from subquery_fqs t1 where t1.id = 1 and t1.c = (select max(c) from subquery_fqs t2 where t2.id=1); select * from subquery_fqs t1 where t1.id = 1 and t1.c = (select max(c) from subquery_fqs t2 where t2.id=1); -explain select * from (select * from subquery_fqs where id = 1 order by c limit 1) where c = 2; +explain (num_nodes on, verbose on, nodes off, costs off) select * from (select * from subquery_fqs where id = 1 order by c limit 1) where c = 2; select * from (select * from subquery_fqs where id = 1 order by c limit 1) where c = 2; +set enable_oracle_compatible to true; +explain (num_nodes on, verbose on, nodes off, costs off) select * from subquery_fqs t1 where t1.id in (1 ,1); +explain select * from subquery_fqs t1 where t1.id in (1 ,1); 
+explain select * from subquery_fqs t1 where t1.id in (1 ,1); +explain select * from subquery_fqs t1 where t1.id in (1 ,1); +explain select * from subquery_fqs t1 where t1.id in (1 ,1); +explain select * from subquery_fqs t1 where t1.id in (1 ,1); +explain select * from subquery_fqs t1 where t1.id in (1 ,1); +set enable_oracle_compatible to false; +explain (num_nodes on, verbose on, nodes off, costs off) select * from subquery_fqs t1 where t1.id in (1 ,3); + drop table tab1_rr; drop table tab1_hash; drop table tab1_modulo; From b0348dd847513b6f746812889d707b269351e560 Mon Sep 17 00:00:00 2001 From: youngxie Date: Sun, 24 Oct 2021 09:26:59 +0800 Subject: [PATCH 424/578] final --- src/backend/pgxc/locator/locator.c | 11 +- src/test/regress/expected/xc_FQS.out | 571 +++++++++++++++++++-------- 2 files changed, 405 insertions(+), 177 deletions(-) diff --git a/src/backend/pgxc/locator/locator.c b/src/backend/pgxc/locator/locator.c index 3ed59f90..1c0b541e 100644 --- a/src/backend/pgxc/locator/locator.c +++ b/src/backend/pgxc/locator/locator.c @@ -2499,13 +2499,6 @@ GetRelationNodesByQuals(Oid reloid, RelationLocInfo *rel_loc_info, Oid disttype; int32 disttypmod; - if (enable_distri_debug) - { - int r = 1; - while(r) - ; - } - if (dis_qual) { *dis_qual = NULL; @@ -2843,6 +2836,7 @@ GetRelationNodesByQuals(Oid reloid, RelationLocInfo *rel_loc_info, seccol_value, seccol_isnull, relaccess); } + /* Only for shard table without cold hot seperation */ else if (distcol_expr && IsA(distcol_expr, ArrayExpr) && rel_loc_info->locatorType == LOCATOR_TYPE_SHARD && !seccol_list) { @@ -2884,7 +2878,8 @@ GetRelationNodesByQuals(Oid reloid, RelationLocInfo *rel_loc_info, if (exec_nodes) { - exec_nodes->nodeList = list_concat_unique(exec_nodes->nodeList, + Assert(exec_nodes->baselocatortype == temp->baselocatortype); + exec_nodes->nodeList = list_concat_unique_int(exec_nodes->nodeList, temp->nodeList); } else diff --git a/src/test/regress/expected/xc_FQS.out b/src/test/regress/expected/xc_FQS.out index 6fe94587..f4b88988 100644 --- a/src/test/regress/expected/xc_FQS.out +++ b/src/test/regress/expected/xc_FQS.out @@ -221,21 +221,29 @@ select val, val2 from tab1_rr where val2 = 8 group by val, val2; (1 row) explain (verbose on, nodes off, costs off) select val, val2 from tab1_rr where val2 = 8 group by val, val2; - QUERY PLAN ------------------------------------------------- - Group + QUERY PLAN +------------------------------------------------------------------ + Remote Subquery Scan on all Output: val, val2 - Group Key: tab1_rr.val, tab1_rr.val2 - -> Remote Subquery Scan on all + -> Group Output: val, val2 - Sort Key: tab1_rr.val + Group Key: tab1_rr.val, tab1_rr.val2 -> Sort Output: val, val2 Sort Key: tab1_rr.val - -> Seq Scan on public.tab1_rr + -> Remote Subquery Scan on all Output: val, val2 - Filter: (tab1_rr.val2 = 8) -(12 rows) + Distribute results by H: val + -> Group + Output: val, val2 + Group Key: tab1_rr.val, tab1_rr.val2 + -> Sort + Output: val, val2 + Sort Key: tab1_rr.val + -> Seq Scan on public.tab1_rr + Output: val, val2 + Filter: (tab1_rr.val2 = 8) +(20 rows) -- should not get FQSed because of HAVING clause select sum(val) from tab1_rr where val2 = 2 group by val2 having sum(val) > 1; @@ -245,18 +253,24 @@ select sum(val) from tab1_rr where val2 = 2 group by val2 having sum(val) > 1; (1 row) explain (verbose on, nodes off, costs off) select sum(val) from tab1_rr where val2 = 2 group by val2 having sum(val) > 1; - QUERY PLAN ------------------------------------------- - GroupAggregate + QUERY 
PLAN +------------------------------------------------------ + Remote Subquery Scan on all Output: sum(val), val2 - Group Key: tab1_rr.val2 - Filter: (sum(tab1_rr.val) > 1) - -> Remote Subquery Scan on all - Output: val2, val - -> Seq Scan on public.tab1_rr - Output: val2, val - Filter: (tab1_rr.val2 = 2) -(9 rows) + -> Finalize GroupAggregate + Output: sum(val), val2 + Group Key: tab1_rr.val2 + Filter: (sum(tab1_rr.val) > 1) + -> Remote Subquery Scan on all + Output: val2, PARTIAL sum(val) + Distribute results by H: val2 + -> Partial GroupAggregate + Output: val2, PARTIAL sum(val) + Group Key: tab1_rr.val2 + -> Seq Scan on public.tab1_rr + Output: val, val2 + Filter: (tab1_rr.val2 = 2) +(15 rows) -- tests for node reduction by application of quals, for roundrobin node -- reduction is not applicable. Having query not FQSed because of existence of ORDER BY, @@ -416,14 +430,14 @@ explain (verbose on, nodes off, costs off) select distinct val2 from tab1_rr whe -- DMLs update tab1_rr set val2 = 1000 where val = 7; explain (verbose on, nodes off, costs off) update tab1_rr set val2 = 1000 where val = 7; - QUERY PLAN ----------------------------------------------------------------- + QUERY PLAN +-------------------------------------------------------------------------------- Remote Fast Query Execution - Output: 1000, tab1_rr.xc_node_id, tab1_rr.ctid + Output: tab1_rr.val, 1000, tab1_rr.xc_node_id, tab1_rr.ctid, tab1_rr.shardid Remote query: UPDATE tab1_rr SET val2 = 1000 WHERE (val = 7) -> Update on public.tab1_rr -> Seq Scan on public.tab1_rr - Output: val, 1000, ctid + Output: val, 1000, ctid, shardid Filter: (tab1_rr.val = 7) (7 rows) @@ -436,15 +450,15 @@ select * from tab1_rr where val = 7; delete from tab1_rr where val = 7; explain (verbose on, costs off) delete from tab1_rr where val = 7; - QUERY PLAN ------------------------------------------------------ + QUERY PLAN +------------------------------------------------------------- Remote Fast Query Execution - Output: tab1_rr.xc_node_id, tab1_rr.ctid + Output: tab1_rr.xc_node_id, tab1_rr.ctid, tab1_rr.shardid Node/s: datanode_1, datanode_2 Remote query: DELETE FROM tab1_rr WHERE (val = 7) -> Delete on public.tab1_rr -> Seq Scan on public.tab1_rr - Output: ctid + Output: ctid, shardid Filter: (tab1_rr.val = 7) (8 rows) @@ -660,18 +674,24 @@ select sum(val) from tab1_hash where val2 = 2 group by val2 having sum(val) > 1; (1 row) explain (verbose on, nodes off, costs off) select sum(val) from tab1_hash where val2 = 2 group by val2 having sum(val) > 1; - QUERY PLAN --------------------------------------------- - GroupAggregate + QUERY PLAN +-------------------------------------------------------- + Remote Subquery Scan on all Output: sum(val), val2 - Group Key: tab1_hash.val2 - Filter: (sum(tab1_hash.val) > 1) - -> Remote Subquery Scan on all - Output: val2, val - -> Seq Scan on public.tab1_hash - Output: val2, val - Filter: (tab1_hash.val2 = 2) -(9 rows) + -> Finalize GroupAggregate + Output: sum(val), val2 + Group Key: tab1_hash.val2 + Filter: (sum(tab1_hash.val) > 1) + -> Remote Subquery Scan on all + Output: val2, PARTIAL sum(val) + Distribute results by H: val2 + -> Partial GroupAggregate + Output: val2, PARTIAL sum(val) + Group Key: tab1_hash.val2 + -> Seq Scan on public.tab1_hash + Output: val, val2 + Filter: (tab1_hash.val2 = 2) +(15 rows) -- tests for node reduction by application of quals. Having query FQSed because of -- existence of ORDER BY, implies that nodes got reduced. 
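The expected-output hunks above show aggregates on distributed tables now planned in two phases: a Partial aggregate runs on each datanode, the partial results are redistributed on the grouping key ("Distribute results by H: val2"), and a Finalize aggregate combines them. A minimal sketch of how to reproduce that plan shape, assuming a two-datanode cluster like the one this regression schedule runs against; the agg_demo table below is hypothetical and not part of the patch:

    -- hypothetical table, only to show the Partial/Finalize aggregate shape
    create table agg_demo(val int, val2 int) distribute by shard(val);
    insert into agg_demo select i, i % 4 from generate_series(1, 100) i;
    -- the plan should resemble the expected output above:
    --   Finalize GroupAggregate -> Remote Subquery Scan (distribute by val2)
    --     -> Partial GroupAggregate -> Seq Scan on agg_demo
    explain (verbose on, nodes off, costs off)
    select val2, sum(val) from agg_demo group by val2 having sum(val) > 1;
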
@@ -832,14 +852,14 @@ explain (verbose on, nodes off, costs off, num_nodes on) select distinct val2 fr -- DMLs update tab1_hash set val2 = 1000 where val = 7; explain (verbose on, nodes off, costs off) update tab1_hash set val2 = 1000 where val = 7; - QUERY PLAN ------------------------------------------------------------------- + QUERY PLAN +---------------------------------------------------------------------------------------- Remote Fast Query Execution - Output: 1000, tab1_hash.xc_node_id, tab1_hash.ctid + Output: tab1_hash.val, 1000, tab1_hash.xc_node_id, tab1_hash.ctid, tab1_hash.shardid Remote query: UPDATE tab1_hash SET val2 = 1000 WHERE (val = 7) -> Update on public.tab1_hash -> Seq Scan on public.tab1_hash - Output: val, 1000, ctid + Output: val, 1000, ctid, shardid Filter: (tab1_hash.val = 7) (7 rows) @@ -852,15 +872,15 @@ select * from tab1_hash where val = 7; delete from tab1_hash where val = 7; explain (verbose on, costs off) delete from tab1_hash where val = 7; - QUERY PLAN -------------------------------------------------------- + QUERY PLAN +---------------------------------------------------------------------------------- Remote Fast Query Execution - Output: tab1_hash.xc_node_id, tab1_hash.ctid + Output: tab1_hash.xc_node_id, tab1_hash.ctid, tab1_hash.shardid, tab1_hash.val Node/s: datanode_2 Remote query: DELETE FROM tab1_hash WHERE (val = 7) -> Delete on public.tab1_hash -> Seq Scan on public.tab1_hash - Output: ctid + Output: ctid, shardid Filter: (tab1_hash.val = 7) (8 rows) @@ -1076,18 +1096,24 @@ select sum(val) from tab1_modulo where val2 = 2 group by val2 having sum(val) > (1 row) explain (verbose on, nodes off, costs off) select sum(val) from tab1_modulo where val2 = 2 group by val2 having sum(val) > 1; - QUERY PLAN ----------------------------------------------- - GroupAggregate + QUERY PLAN +---------------------------------------------------------- + Remote Subquery Scan on all Output: sum(val), val2 - Group Key: tab1_modulo.val2 - Filter: (sum(tab1_modulo.val) > 1) - -> Remote Subquery Scan on all - Output: val2, val - -> Seq Scan on public.tab1_modulo - Output: val2, val - Filter: (tab1_modulo.val2 = 2) -(9 rows) + -> Finalize GroupAggregate + Output: sum(val), val2 + Group Key: tab1_modulo.val2 + Filter: (sum(tab1_modulo.val) > 1) + -> Remote Subquery Scan on all + Output: val2, PARTIAL sum(val) + Distribute results by H: val2 + -> Partial GroupAggregate + Output: val2, PARTIAL sum(val) + Group Key: tab1_modulo.val2 + -> Seq Scan on public.tab1_modulo + Output: val, val2 + Filter: (tab1_modulo.val2 = 2) +(15 rows) -- tests for node reduction by application of quals. Having query FQSed because of -- existence of ORDER BY, implies that nodes got reduced. 
@@ -1189,17 +1215,18 @@ select avg(val) from tab1_modulo where val = 7; (1 row) explain (verbose on, nodes off, costs off, num_nodes on) select avg(val) from tab1_modulo where val = 7; - QUERY PLAN -------------------------------------------------------------------------- - Remote Fast Query Execution (primary node count=0, node count=1) - Output: avg(tab1_modulo.val) - Remote query: SELECT avg(val) AS avg FROM tab1_modulo WHERE (val = 7) - -> Aggregate - Output: avg(val) - -> Seq Scan on public.tab1_modulo - Output: val, val2 - Filter: (tab1_modulo.val = 7) -(8 rows) + QUERY PLAN +--------------------------------------------------- + Finalize Aggregate + Output: avg(val) + -> Remote Subquery Scan on all + Output: PARTIAL avg(val) + -> Partial Aggregate + Output: PARTIAL avg(val) + -> Seq Scan on public.tab1_modulo + Output: val, val2 + Filter: (tab1_modulo.val = 7) +(9 rows) select val, val2 from tab1_modulo where val = 7 order by val2; val | val2 @@ -1209,18 +1236,17 @@ select val, val2 from tab1_modulo where val = 7 order by val2; (2 rows) explain (verbose on, nodes off, costs off, num_nodes on) select val, val2 from tab1_modulo where val = 7 order by val2; - QUERY PLAN ---------------------------------------------------------------------------------- - Remote Fast Query Execution (primary node count=0, node count=1) - Output: tab1_modulo.val, tab1_modulo.val2 - Remote query: SELECT val, val2 FROM tab1_modulo WHERE (val = 7) ORDER BY val2 + QUERY PLAN +--------------------------------------------- + Remote Subquery Scan on all + Output: val, val2 -> Sort Output: val, val2 Sort Key: tab1_modulo.val2 -> Seq Scan on public.tab1_modulo Output: val, val2 Filter: (tab1_modulo.val = 7) -(9 rows) +(8 rows) select distinct val2 from tab1_modulo where val = 7; val2 @@ -1230,12 +1256,11 @@ select distinct val2 from tab1_modulo where val = 7; (2 rows) explain (verbose on, nodes off, costs off, num_nodes on) select distinct val2 from tab1_modulo where val = 7; - QUERY PLAN ------------------------------------------------------------------------ - Remote Fast Query Execution (primary node count=0, node count=1) - Output: tab1_modulo.val2 - Remote query: SELECT DISTINCT val2 FROM tab1_modulo WHERE (val = 7) - -> Unique + QUERY PLAN +--------------------------------------------------- + Unique + Output: val2 + -> Remote Subquery Scan on all Output: val2 -> Sort Output: val2 @@ -1243,19 +1268,19 @@ explain (verbose on, nodes off, costs off, num_nodes on) select distinct val2 fr -> Seq Scan on public.tab1_modulo Output: val2 Filter: (tab1_modulo.val = 7) -(11 rows) +(10 rows) -- DMLs update tab1_modulo set val2 = 1000 where val = 7; explain (verbose on, nodes off, costs off) update tab1_modulo set val2 = 1000 where val = 7; - QUERY PLAN --------------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------------------------------------------------ Remote Fast Query Execution - Output: 1000, tab1_modulo.xc_node_id, tab1_modulo.ctid + Output: tab1_modulo.val, 1000, tab1_modulo.xc_node_id, tab1_modulo.ctid, tab1_modulo.shardid Remote query: UPDATE tab1_modulo SET val2 = 1000 WHERE (val = 7) -> Update on public.tab1_modulo -> Seq Scan on public.tab1_modulo - Output: val, 1000, ctid + Output: val, 1000, ctid, shardid Filter: (tab1_modulo.val = 7) (7 rows) @@ -1268,15 +1293,15 @@ select * from tab1_modulo where val = 7; delete from tab1_modulo where val = 7; explain (verbose on, costs off) delete from tab1_modulo where val = 7; - QUERY PLAN 
---------------------------------------------------------- + QUERY PLAN +------------------------------------------------------------------------------------------ Remote Fast Query Execution - Output: tab1_modulo.xc_node_id, tab1_modulo.ctid + Output: tab1_modulo.xc_node_id, tab1_modulo.ctid, tab1_modulo.shardid, tab1_modulo.val Node/s: datanode_2 Remote query: DELETE FROM tab1_modulo WHERE (val = 7) -> Delete on public.tab1_modulo -> Seq Scan on public.tab1_modulo - Output: ctid + Output: ctid, shardid Filter: (tab1_modulo.val = 7) (8 rows) @@ -1310,7 +1335,7 @@ explain (verbose on, nodes off, costs off) insert into tab1_replicated values (9 (6 rows) -- simple select -select * from tab1_replicated; +select * from tab1_replicated order by val; val | val2 -----+------ 1 | 2 @@ -1337,16 +1362,15 @@ select sum(val), avg(val), count(*) from tab1_replicated; (1 row) explain (num_nodes on, verbose on, nodes off, costs off) select sum(val), avg(val), count(*) from tab1_replicated; - QUERY PLAN -------------------------------------------------------------------------------------------------- - Remote Fast Query Execution (primary node count=0, node count=1) - Output: sum(tab1_replicated.val), avg(tab1_replicated.val), count(*) - Remote query: SELECT sum(val) AS sum, avg(val) AS avg, count(*) AS count FROM tab1_replicated + QUERY PLAN +------------------------------------------------ + Remote Subquery Scan on all + Output: sum(val), avg(val), count(*) -> Aggregate Output: sum(val), avg(val), count(*) -> Seq Scan on public.tab1_replicated Output: val, val2 -(7 rows) +(6 rows) select first_value(val) over (partition by val2 order by val) from tab1_replicated; first_value @@ -1359,19 +1383,18 @@ select first_value(val) over (partition by val2 order by val) from tab1_replicat (5 rows) explain (num_nodes on, verbose on, nodes off, costs off) select first_value(val) over (partition by val2 order by val) from tab1_replicated; - QUERY PLAN -------------------------------------------------------------------------------------------------------------------- - Remote Fast Query Execution (primary node count=0, node count=1) - Output: first_value(tab1_replicated.val) OVER (?), tab1_replicated.val, tab1_replicated.val2 - Remote query: SELECT first_value(val) OVER (PARTITION BY val2 ORDER BY val) AS first_value FROM tab1_replicated - -> WindowAgg - Output: first_value(val) OVER (?), val, val2 + QUERY PLAN +------------------------------------------------------------------- + WindowAgg + Output: first_value(val) OVER (?), val, val2 + -> Remote Subquery Scan on all + Output: val, val2 -> Sort Output: val, val2 Sort Key: tab1_replicated.val2, tab1_replicated.val -> Seq Scan on public.tab1_replicated Output: val, val2 -(10 rows) +(9 rows) select * from tab1_replicated where val2 = 2 limit 2; val | val2 @@ -1381,17 +1404,18 @@ select * from tab1_replicated where val2 = 2 limit 2; (2 rows) explain (num_nodes on, verbose on, nodes off, costs off) select * from tab1_replicated where val2 = 2 limit 2; - QUERY PLAN --------------------------------------------------------------------------------- - Remote Fast Query Execution (primary node count=0, node count=1) - Output: tab1_replicated.val, tab1_replicated.val2 - Remote query: SELECT val, val2 FROM tab1_replicated WHERE (val2 = 2) LIMIT 2 - -> Limit + QUERY PLAN +-------------------------------------------------------- + Limit + Output: val, val2 + -> Remote Subquery Scan on all Output: val, val2 - -> Seq Scan on public.tab1_replicated + -> Limit Output: val, val2 - 
Filter: (tab1_replicated.val2 = 2) -(8 rows) + -> Seq Scan on public.tab1_replicated + Output: val, val2 + Filter: (tab1_replicated.val2 = 2) +(9 rows) select * from tab1_replicated where val2 = 4 offset 1; val | val2 @@ -1399,17 +1423,16 @@ select * from tab1_replicated where val2 = 4 offset 1; (0 rows) explain (num_nodes on, verbose on, nodes off, costs off) select * from tab1_replicated where val2 = 4 offset 1; - QUERY PLAN ---------------------------------------------------------------------------------- - Remote Fast Query Execution (primary node count=0, node count=1) - Output: tab1_replicated.val, tab1_replicated.val2 - Remote query: SELECT val, val2 FROM tab1_replicated WHERE (val2 = 4) OFFSET 1 - -> Limit + QUERY PLAN +-------------------------------------------------- + Limit + Output: val, val2 + -> Remote Subquery Scan on all Output: val, val2 -> Seq Scan on public.tab1_replicated Output: val, val2 Filter: (tab1_replicated.val2 = 4) -(8 rows) +(7 rows) select * from tab1_replicated order by val; val | val2 @@ -1422,17 +1445,16 @@ select * from tab1_replicated order by val; (5 rows) explain (num_nodes on, verbose on, nodes off, costs off) select * from tab1_replicated order by val; - QUERY PLAN --------------------------------------------------------------------- - Remote Fast Query Execution (primary node count=0, node count=1) - Output: tab1_replicated.val, tab1_replicated.val2 - Remote query: SELECT val, val2 FROM tab1_replicated ORDER BY val + QUERY PLAN +------------------------------------------------ + Remote Subquery Scan on all + Output: val, val2 -> Sort Output: val, val2 Sort Key: tab1_replicated.val -> Seq Scan on public.tab1_replicated Output: val, val2 -(8 rows) +(7 rows) select distinct val, val2 from tab1_replicated order by 1, 2; val | val2 @@ -1445,11 +1467,10 @@ select distinct val, val2 from tab1_replicated order by 1, 2; (5 rows) explain (num_nodes on, verbose on, nodes off, costs off) select distinct val, val2 from tab1_replicated order by 1, 2; - QUERY PLAN ------------------------------------------------------------------------------------ - Remote Fast Query Execution (primary node count=0, node count=1) - Output: tab1_replicated.val, tab1_replicated.val2 - Remote query: SELECT DISTINCT val, val2 FROM tab1_replicated ORDER BY val, val2 + QUERY PLAN +-------------------------------------------------------------------- + Remote Subquery Scan on all + Output: val, val2 -> Sort Output: val, val2 Sort Key: tab1_replicated.val, tab1_replicated.val2 @@ -1458,20 +1479,19 @@ explain (num_nodes on, verbose on, nodes off, costs off) select distinct val, va Group Key: tab1_replicated.val, tab1_replicated.val2 -> Seq Scan on public.tab1_replicated Output: val, val2 -(11 rows) +(10 rows) explain (num_nodes on, verbose on, nodes off, costs off) select distinct val, val2 from tab1_replicated; - QUERY PLAN ------------------------------------------------------------------- - Remote Fast Query Execution (primary node count=0, node count=1) - Output: tab1_replicated.val, tab1_replicated.val2 - Remote query: SELECT DISTINCT val, val2 FROM tab1_replicated + QUERY PLAN +-------------------------------------------------------------- + Remote Subquery Scan on all + Output: val, val2 -> HashAggregate Output: val, val2 Group Key: tab1_replicated.val, tab1_replicated.val2 -> Seq Scan on public.tab1_replicated Output: val, val2 -(8 rows) +(7 rows) select val, val2 from tab1_replicated group by val, val2 order by 1, 2; val | val2 @@ -1484,11 +1504,10 @@ select val, val2 from 
tab1_replicated group by val, val2 order by 1, 2; (5 rows) explain (num_nodes on, verbose on, nodes off, costs off) select val, val2 from tab1_replicated group by val, val2 order by 1, 2; - QUERY PLAN ---------------------------------------------------------------------------------------------- - Remote Fast Query Execution (primary node count=0, node count=1) - Output: tab1_replicated.val, tab1_replicated.val2 - Remote query: SELECT val, val2 FROM tab1_replicated GROUP BY val, val2 ORDER BY val, val2 + QUERY PLAN +-------------------------------------------------------------------- + Remote Subquery Scan on all + Output: val, val2 -> Sort Output: val, val2 Sort Key: tab1_replicated.val, tab1_replicated.val2 @@ -1497,20 +1516,19 @@ explain (num_nodes on, verbose on, nodes off, costs off) select val, val2 from t Group Key: tab1_replicated.val, tab1_replicated.val2 -> Seq Scan on public.tab1_replicated Output: val, val2 -(11 rows) +(10 rows) explain (num_nodes on, verbose on, nodes off, costs off) select val, val2 from tab1_replicated group by val, val2; - QUERY PLAN --------------------------------------------------------------------------- - Remote Fast Query Execution (primary node count=0, node count=1) - Output: tab1_replicated.val, tab1_replicated.val2 - Remote query: SELECT val, val2 FROM tab1_replicated GROUP BY val, val2 + QUERY PLAN +-------------------------------------------------------------- + Remote Subquery Scan on all + Output: val, val2 -> HashAggregate Output: val, val2 Group Key: tab1_replicated.val, tab1_replicated.val2 -> Seq Scan on public.tab1_replicated Output: val, val2 -(8 rows) +(7 rows) select sum(val) from tab1_replicated group by val2 having sum(val) > 1 order by 1; sum @@ -1522,11 +1540,10 @@ select sum(val) from tab1_replicated group by val2 having sum(val) > 1 order by (4 rows) explain (num_nodes on, verbose on, nodes off, costs off) select sum(val) from tab1_replicated group by val2 having sum(val) > 1 order by 1; - QUERY PLAN ---------------------------------------------------------------------------------------------------------------------- - Remote Fast Query Execution (primary node count=0, node count=1) - Output: sum(tab1_replicated.val), tab1_replicated.val2 - Remote query: SELECT sum(val) AS sum FROM tab1_replicated GROUP BY val2 HAVING (sum(val) > 1) ORDER BY (sum(val)) + QUERY PLAN +------------------------------------------------------ + Remote Subquery Scan on all + Output: sum(val), val2 -> Sort Output: (sum(val)), val2 Sort Key: (sum(tab1_replicated.val)) @@ -1536,33 +1553,32 @@ explain (num_nodes on, verbose on, nodes off, costs off) select sum(val) from ta Filter: (sum(tab1_replicated.val) > 1) -> Seq Scan on public.tab1_replicated Output: val, val2 -(12 rows) +(11 rows) explain (num_nodes on, verbose on, nodes off, costs off) select sum(val) from tab1_replicated group by val2 having sum(val) > 1; - QUERY PLAN -------------------------------------------------------------------------------------------------- - Remote Fast Query Execution (primary node count=0, node count=1) - Output: sum(tab1_replicated.val), tab1_replicated.val2 - Remote query: SELECT sum(val) AS sum FROM tab1_replicated GROUP BY val2 HAVING (sum(val) > 1) + QUERY PLAN +------------------------------------------------ + Remote Subquery Scan on all + Output: sum(val), val2 -> HashAggregate Output: sum(val), val2 Group Key: tab1_replicated.val2 Filter: (sum(tab1_replicated.val) > 1) -> Seq Scan on public.tab1_replicated Output: val, val2 -(9 rows) +(8 rows) -- DMLs update 
tab1_replicated set val2 = 1000 where val = 7; explain (verbose on, nodes off, costs off) update tab1_replicated set val2 = 1000 where val = 7; - QUERY PLAN ------------------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------------------------------------ Remote Fast Query Execution - Output: 1000, tab1_replicated.ctid + Output: tab1_replicated.val, 1000, tab1_replicated.ctid, tab1_replicated.shardid Remote query: UPDATE tab1_replicated SET val2 = 1000 WHERE (val = 7) -> Update on public.tab1_replicated -> Seq Scan on public.tab1_replicated - Output: val, 1000, ctid + Output: val, 1000, ctid, shardid Filter: (tab1_replicated.val = 7) (7 rows) @@ -1577,12 +1593,12 @@ explain (verbose on, costs off) delete from tab1_replicated where val = 7; QUERY PLAN ------------------------------------------------------------- Remote Fast Query Execution - Output: tab1_replicated.ctid + Output: tab1_replicated.ctid, tab1_replicated.shardid Node/s: datanode_1, datanode_2 Remote query: DELETE FROM tab1_replicated WHERE (val = 7) -> Delete on public.tab1_replicated -> Seq Scan on public.tab1_replicated - Output: ctid + Output: ctid, shardid Filter: (tab1_replicated.val = 7) (8 rows) @@ -1591,8 +1607,225 @@ select * from tab1_replicated where val = 7; -----+------ (0 rows) +-- Constant subquery +create table subquery_fqs(id int, a varchar, c int) distribute by shard(id); +NOTICE: Replica identity is needed for shard table, please add to this table through "alter table" command. +insert into subquery_fqs values(1,'gd', 2); +insert into subquery_fqs values(1,'zj', 2); +insert into subquery_fqs values(1,'sz', 2); +explain select * from subquery_fqs t join (select 1 id, 'gd' a, 2 c from dual union select 1 id, 'sz' a, 2 c union select 1 id, 'zj' a, 2 c from dual) t2 ON (t.id = t2.id and t.a = t2.a); + QUERY PLAN +------------------------------------------------------------------------------ + Remote Fast Query Execution (cost=0.00..0.00 rows=0 width=0) + Node/s: datanode_1, datanode_2 + -> Hash Join (cost=0.19..25.60 rows=1 width=80) + Hash Cond: ((t.id = (1)) AND ((t.a)::text = ('gd'::text))) + -> Seq Scan on subquery_fqs t (cost=0.00..18.80 rows=880 width=40) + -> Hash (cost=0.14..0.14 rows=3 width=40) + -> HashAggregate (cost=0.08..0.11 rows=3 width=40) + Group Key: (1), ('gd'::text), (2) + -> Append (cost=0.00..0.06 rows=3 width=40) + -> Result (cost=0.00..0.01 rows=1 width=40) + -> Result (cost=0.00..0.01 rows=1 width=40) + -> Result (cost=0.00..0.01 rows=1 width=40) +(12 rows) + +select * from subquery_fqs t join (select 1 id, 'gd' a, 2 c from dual union select 1 id, 'sz' a, 2 c union select 1 id, 'zj' a, 2 c from dual) t2 ON (t.id = t2.id and t.a = t2.a); + id | a | c | id | a | c +----+----+---+----+----+--- + 1 | gd | 2 | 1 | gd | 2 + 1 | zj | 2 | 1 | zj | 2 + 1 | sz | 2 | 1 | sz | 2 +(3 rows) + +-- Support subquery FQS only if subquery distributed on same DN with main query(only 1 DN node) +explain (num_nodes on, verbose on, nodes off, costs off) select * from subquery_fqs t1 where t1.id = 1 and t1.c IN (select c from subquery_fqs t2 where t2.id=1); + QUERY PLAN +------------------------------------------------------------ + Remote Subquery Scan on all + Output: t1.id, t1.a, t1.c + -> Nested Loop Semi Join + Output: t1.id, t1.a, t1.c + Join Filter: (t1.c = t2.c) + -> Seq Scan on public.subquery_fqs t1 + Output: t1.id, t1.a, t1.c + Filter: (t1.id = 1) + -> Materialize + Output: t2.c + -> Remote Subquery Scan on all + Output: t2.c + 
-> Seq Scan on public.subquery_fqs t2 + Output: t2.c + Filter: (t2.id = 1) +(15 rows) + +select * from subquery_fqs t1 where t1.id = 1 and t1.c IN (select c from subquery_fqs t2 where t2.id=1); + id | a | c +----+----+--- + 1 | gd | 2 + 1 | zj | 2 + 1 | sz | 2 +(3 rows) + +explain (num_nodes on, verbose on, nodes off, costs off) select * from subquery_fqs t1 where t1.id = 1 and t1.c = (select c from subquery_fqs t2 where t2.id=1 order by c limit 1); + QUERY PLAN +------------------------------------------------------------------------------------------------------------------------------------------------------------ + Remote Fast Query Execution (primary node count=0, node count=1) + Output: t1.id, t1.a, t1.c + Remote query: SELECT id, a, c FROM subquery_fqs t1 WHERE ((id = 1) AND (c = (SELECT t2.c FROM subquery_fqs t2 WHERE (t2.id = 1) ORDER BY t2.c LIMIT 1))) + -> Seq Scan on public.subquery_fqs t1 + Output: t1.id, t1.a, t1.c + Filter: ((t1.id = 1) AND (t1.c = $0)) + InitPlan 1 (returns $0) + -> Limit + Output: t2.c + -> Sort + Output: t2.c + Sort Key: t2.c + -> Seq Scan on public.subquery_fqs t2 + Output: t2.c + Filter: (t2.id = 1) +(15 rows) + +select * from subquery_fqs t1 where t1.id = 1 and t1.c = (select c from subquery_fqs t2 where t2.id=1 order by c limit 1); + id | a | c +----+----+--- + 1 | gd | 2 + 1 | zj | 2 + 1 | sz | 2 +(3 rows) + +explain (num_nodes on, verbose on, nodes off, costs off) select * from subquery_fqs t1 where t1.id = 1 and t1.c = (select max(c) from subquery_fqs t2 where t2.id=1); + QUERY PLAN +-------------------------------------------------------------------------------------------------------------------------------------------------- + Remote Fast Query Execution (primary node count=0, node count=1) + Output: t1.id, t1.a, t1.c + Remote query: SELECT id, a, c FROM subquery_fqs t1 WHERE ((id = 1) AND (c = (SELECT max(t2.c) AS max FROM subquery_fqs t2 WHERE (t2.id = 1)))) + -> Seq Scan on public.subquery_fqs t1 + Output: t1.id, t1.a, t1.c + Filter: ((t1.id = 1) AND (t1.c = $0)) + InitPlan 1 (returns $0) + -> Aggregate + Output: max(t2.c) + -> Seq Scan on public.subquery_fqs t2 + Output: t2.id, t2.a, t2.c + Filter: (t2.id = 1) +(12 rows) + +select * from subquery_fqs t1 where t1.id = 1 and t1.c = (select max(c) from subquery_fqs t2 where t2.id=1); + id | a | c +----+----+--- + 1 | gd | 2 + 1 | zj | 2 + 1 | sz | 2 +(3 rows) + +explain (num_nodes on, verbose on, nodes off, costs off) select * from (select * from subquery_fqs where id = 1 order by c limit 1) where c = 2; + QUERY PLAN +---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + Remote Fast Query Execution (primary node count=0, node count=1) + Output: "__Alias_72__".id, "__Alias_72__".a, "__Alias_72__".c + Remote query: SELECT id, a, c FROM (SELECT subquery_fqs.id, subquery_fqs.a, subquery_fqs.c FROM subquery_fqs WHERE (subquery_fqs.id = 1) ORDER BY subquery_fqs.c LIMIT 1) "__Alias_72__" WHERE (c = 2) + -> Subquery Scan on "__Alias_72__" + Output: "__Alias_72__".id, "__Alias_72__".a, "__Alias_72__".c + Filter: ("__Alias_72__".c = 2) + -> Limit + Output: subquery_fqs.id, subquery_fqs.a, subquery_fqs.c + -> Sort + Output: subquery_fqs.id, subquery_fqs.a, subquery_fqs.c + Sort Key: subquery_fqs.c + -> Seq Scan on public.subquery_fqs + Output: subquery_fqs.id, subquery_fqs.a, subquery_fqs.c + Filter: (subquery_fqs.id = 1) +(14 rows) + +select * from (select 
* from subquery_fqs where id = 1 order by c limit 1) where c = 2; + id | a | c +----+----+--- + 1 | gd | 2 +(1 row) + +set enable_oracle_compatible to true; +explain (num_nodes on, verbose on, nodes off, costs off) select * from subquery_fqs t1 where t1.id in (1 ,1); + QUERY PLAN +------------------------------------------------------------------------------------- + Remote Fast Query Execution (primary node count=0, node count=1) + Output: t1.id, t1.a, t1.c + Remote query: SELECT id, a, c FROM subquery_fqs t1 WHERE (id = ANY (ARRAY[1, 1])) + -> Seq Scan on public.subquery_fqs t1 + Output: id, a, c + Filter: (t1.id = ANY ('{1,1}'::integer[])) +(6 rows) + +explain select * from subquery_fqs t1 where t1.id in (1 ,1); + QUERY PLAN +----------------------------------------------------------------------- + Remote Fast Query Execution (cost=0.00..0.00 rows=0 width=0) + Node/s: datanode_1 + -> Seq Scan on subquery_fqs t1 (cost=0.00..21.00 rows=9 width=40) + Filter: (id = ANY ('{1,1}'::integer[])) +(4 rows) + +explain select * from subquery_fqs t1 where t1.id in (1 ,1); + QUERY PLAN +----------------------------------------------------------------------- + Remote Fast Query Execution (cost=0.00..0.00 rows=0 width=0) + Node/s: datanode_1 + -> Seq Scan on subquery_fqs t1 (cost=0.00..21.00 rows=9 width=40) + Filter: (id = ANY ('{1,1}'::integer[])) +(4 rows) + +explain select * from subquery_fqs t1 where t1.id in (1 ,1); + QUERY PLAN +----------------------------------------------------------------------- + Remote Fast Query Execution (cost=0.00..0.00 rows=0 width=0) + Node/s: datanode_1 + -> Seq Scan on subquery_fqs t1 (cost=0.00..21.00 rows=9 width=40) + Filter: (id = ANY ('{1,1}'::integer[])) +(4 rows) + +explain select * from subquery_fqs t1 where t1.id in (1 ,1); + QUERY PLAN +----------------------------------------------------------------------- + Remote Fast Query Execution (cost=0.00..0.00 rows=0 width=0) + Node/s: datanode_1 + -> Seq Scan on subquery_fqs t1 (cost=0.00..21.00 rows=9 width=40) + Filter: (id = ANY ('{1,1}'::integer[])) +(4 rows) + +explain select * from subquery_fqs t1 where t1.id in (1 ,1); + QUERY PLAN +----------------------------------------------------------------------- + Remote Fast Query Execution (cost=0.00..0.00 rows=0 width=0) + Node/s: datanode_1 + -> Seq Scan on subquery_fqs t1 (cost=0.00..21.00 rows=9 width=40) + Filter: (id = ANY ('{1,1}'::integer[])) +(4 rows) + +explain select * from subquery_fqs t1 where t1.id in (1 ,1); + QUERY PLAN +----------------------------------------------------------------------- + Remote Fast Query Execution (cost=0.00..0.00 rows=0 width=0) + Node/s: datanode_1 + -> Seq Scan on subquery_fqs t1 (cost=0.00..21.00 rows=9 width=40) + Filter: (id = ANY ('{1,1}'::integer[])) +(4 rows) + +set enable_oracle_compatible to false; +explain (num_nodes on, verbose on, nodes off, costs off) select * from subquery_fqs t1 where t1.id in (1 ,3); + QUERY PLAN +------------------------------------------------------------------------------------- + Remote Fast Query Execution (primary node count=0, node count=2) + Output: t1.id, t1.a, t1.c + Remote query: SELECT id, a, c FROM subquery_fqs t1 WHERE (id = ANY (ARRAY[1, 3])) + -> Seq Scan on public.subquery_fqs t1 + Output: id, a, c + Filter: (t1.id = ANY ('{1,3}'::integer[])) +(6 rows) + drop table tab1_rr; drop table tab1_hash; drop table tab1_modulo; drop table tab1_replicated; +drop table subquery_fqs; drop function cr_table(varchar, int[], varchar); From e5e3206ae77ae544317fb7f9074a6c77ffafaa23 Mon 
Sep 17 00:00:00 2001 From: youngxie Date: Mon, 25 Oct 2021 17:24:51 +0800 Subject: [PATCH 425/578] Fix varchar --- src/backend/pgxc/locator/locator.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/backend/pgxc/locator/locator.c b/src/backend/pgxc/locator/locator.c index 1c0b541e..0c358c85 100644 --- a/src/backend/pgxc/locator/locator.c +++ b/src/backend/pgxc/locator/locator.c @@ -2525,6 +2525,15 @@ GetRelationNodesByQuals(Oid reloid, RelationLocInfo *rel_loc_info, { distcol_expr = pgxc_find_distcol_expr(varno, rel_loc_info->partAttrNum, quals); + + if (distcol_expr && IsA(distcol_expr, ArrayCoerceExpr) && + IsA(((ArrayCoerceExpr *)distcol_expr)->arg, ArrayExpr)) + { + ArrayCoerceExpr *arrayCoerceExpr = (ArrayCoerceExpr *) distcol_expr; + + distcol_expr = arrayCoerceExpr->arg; + } + /* * If the type of expression used to find the Datanode, is not same as * the distribution column type, try casting it. This is same as what From 8d9bccce5bf50399083253750809be9991b348a3 Mon Sep 17 00:00:00 2001 From: youngxie Date: Tue, 26 Oct 2021 10:34:33 +0800 Subject: [PATCH 426/578] Revert autoformat --- src/backend/pgxc/locator/locator.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/src/backend/pgxc/locator/locator.c b/src/backend/pgxc/locator/locator.c index 0c358c85..b72b65f8 100644 --- a/src/backend/pgxc/locator/locator.c +++ b/src/backend/pgxc/locator/locator.c @@ -1197,7 +1197,10 @@ createLocator(char locatorType, RelationAccessType accessType, int *intptr; nodeMap = palloc(locator->nodeCount * sizeof(int)); intptr = (int *) nodeMap; - foreach(lc, l) *intptr++ = lfirst_int(lc); + foreach(lc, l) + { + *intptr++ = lfirst_int(lc); + } locator->listType = LOCATOR_LIST_INT; } else if (IsA(l, OidList)) @@ -1205,7 +1208,10 @@ createLocator(char locatorType, RelationAccessType accessType, Oid *oidptr; nodeMap = palloc(locator->nodeCount * sizeof(Oid)); oidptr = (Oid *) nodeMap; - foreach(lc, l) *oidptr++ = lfirst_oid(lc); + foreach(lc, l) + { + *oidptr++ = lfirst_oid(lc); + } locator->listType = LOCATOR_LIST_OID; } else if (IsA(l, List)) @@ -1213,7 +1219,10 @@ createLocator(char locatorType, RelationAccessType accessType, void **voidptr; nodeMap = palloc(locator->nodeCount * sizeof(void *)); voidptr = (void **) nodeMap; - foreach(lc, l) *voidptr++ = lfirst(lc); + foreach(lc, l) + { + *voidptr++ = lfirst(lc); + } locator->listType = LOCATOR_LIST_POINTER; } else From 9ec860267adafc16052ec5cf7fc07b822c0f3f25 Mon Sep 17 00:00:00 2001 From: whalesong Date: Mon, 25 Oct 2021 10:37:16 +0800 Subject: [PATCH 427/578] bugfix: consistency check error after cn switch (merge request !846) http://tapd.oa.com/20421696/bugtrace/bugs/view?bug_id=1020421696093416231 (cherry picked from commit 58dfc7f6) fedb6262 bugfix: consistency check error after cn switch --- src/backend/pgxc/pool/pgxcnode.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c index 4d93ec43..d84611ee 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -5481,11 +5481,6 @@ PgxcNodeRefreshBackendHandlesShmem(List *nodes_alter) int nid; PGXCNodeHandle *handle = NULL; - if (PersistentConnections && nodes_alter != NIL) - { - release_handles(true); - } - foreach(lc, nodes_alter) { char ntype = PGXC_NODE_NONE; From 055897bc278d6fbd597d7bb00172ac2f5828dfef Mon Sep 17 00:00:00 2001 From: youngxie Date: Tue, 26 Oct 2021 10:49:30 +0800 Subject: [PATCH 428/578] adjust format --- src/backend/pgxc/locator/locator.c 
| 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/backend/pgxc/locator/locator.c b/src/backend/pgxc/locator/locator.c index b72b65f8..4685ea1e 100644 --- a/src/backend/pgxc/locator/locator.c +++ b/src/backend/pgxc/locator/locator.c @@ -2535,13 +2535,16 @@ GetRelationNodesByQuals(Oid reloid, RelationLocInfo *rel_loc_info, distcol_expr = pgxc_find_distcol_expr(varno, rel_loc_info->partAttrNum, quals); - if (distcol_expr && IsA(distcol_expr, ArrayCoerceExpr) && - IsA(((ArrayCoerceExpr *)distcol_expr)->arg, ArrayExpr)) + /* Remove ArrayCoerceExpr at first */ + if (distcol_expr && IsA(distcol_expr, ArrayCoerceExpr)) { - ArrayCoerceExpr *arrayCoerceExpr = (ArrayCoerceExpr *) distcol_expr; + ArrayCoerceExpr *arrayCoerceExpr = castNode(ArrayCoerceExpr, distcol_expr); + if (arrayCoerceExpr->arg && IsA(arrayCoerceExpr->arg, ArrayExpr)) + { distcol_expr = arrayCoerceExpr->arg; } + } /* * If the type of expression used to find the Datanode, is not same as From 30bd59a349ada2f2fac6ab32975918bbf567afe0 Mon Sep 17 00:00:00 2001 From: youngxie Date: Tue, 26 Oct 2021 10:51:56 +0800 Subject: [PATCH 429/578] revert useless modification. --- src/backend/optimizer/util/pgxcship.c | 4 +++- src/include/optimizer/pgxcship.h | 1 - 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/backend/optimizer/util/pgxcship.c b/src/backend/optimizer/util/pgxcship.c index 6ad269f9..b749e028 100644 --- a/src/backend/optimizer/util/pgxcship.c +++ b/src/backend/optimizer/util/pgxcship.c @@ -117,6 +117,8 @@ static bool pgxc_is_func_shippable(Oid funcid); /* Check equijoin conditions on given relations */ static Expr *pgxc_find_dist_equijoin_qual(Relids varnos_1, Relids varnos_2, Oid distcol_type, Node *quals, List *rtable); +/* Merge given execution nodes based on join shippability conditions */ +static ExecNodes *pgxc_merge_exec_nodes(ExecNodes *en1, ExecNodes *en2); /* Check if given Query includes distribution column */ static bool pgxc_query_has_distcolgrouping(Query *query); @@ -2483,7 +2485,7 @@ pgxc_find_dist_equi_nodes(Relids varnos_1, * exec_node corresponds to the JOIN of respective relations. * If both exec_nodes can not be merged, it returns NULL. 
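
The locator.c change in patches 425 and 428 makes GetRelationNodesByQuals() look through an ArrayCoerceExpr that wraps an ArrayExpr, so quals such as `id IN (1, 1)` can still be pruned to a single datanode (as the subquery_fqs plans earlier show). A toy, self-contained sketch of that unwrapping step; the NodeTag enum and Node struct here are stand-ins, not the real parse-node types:

#include <stdio.h>

typedef enum { T_ArrayExpr, T_ArrayCoerceExpr, T_Other } NodeTag;

typedef struct Node
{
    NodeTag      tag;
    struct Node *arg;   /* wrapped expression, if any */
} Node;

/* If the distribution-column expression is a coercion over an array
 * literal, look through the coercion so pruning can see the array. */
static Node *
strip_array_coercion(Node *expr)
{
    if (expr != NULL && expr->tag == T_ArrayCoerceExpr &&
        expr->arg != NULL && expr->arg->tag == T_ArrayExpr)
        return expr->arg;
    return expr;
}

int main(void)
{
    Node arr    = { T_ArrayExpr, NULL };
    Node coerce = { T_ArrayCoerceExpr, &arr };

    printf("%s\n", strip_array_coercion(&coerce) == &arr ? "unwrapped" : "unchanged");
    return 0;
}
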
*/ -ExecNodes * +static ExecNodes * pgxc_merge_exec_nodes(ExecNodes *en1, ExecNodes *en2) {// #lizard forgives ExecNodes *merged_en = makeNode(ExecNodes); diff --git a/src/include/optimizer/pgxcship.h b/src/include/optimizer/pgxcship.h index d29c2b8f..c42f3a04 100644 --- a/src/include/optimizer/pgxcship.h +++ b/src/include/optimizer/pgxcship.h @@ -38,6 +38,5 @@ extern bool pgxc_is_trigger_shippable(Trigger *trigger); extern Node *get_var_from_arg(Node *arg); extern bool is_var_distribute_column(Var *var, List *rtable); -extern ExecNodes *pgxc_merge_exec_nodes(ExecNodes *en1, ExecNodes *en2); #endif #endif From 32f1f9cf00d76de36770dc783f03f7db4100dae3 Mon Sep 17 00:00:00 2001 From: whalesong Date: Wed, 27 Oct 2021 18:00:30 +0800 Subject: [PATCH 430/578] 2pc stop opt: server time diff opt (merge request !849), http://tapd.oa.com/10092131/bugtrace/bugs/view?bug_id=1010092131093399717&url_cache_key=80da2c20fd930784041c74db66ffd4d6&action_entry_type=bugs --- contrib/pg_clean/pg_clean.c | 157 ++++++++++++++++++++++++++---- src/backend/postmaster/clean2pc.c | 66 ++++++++++--- src/backend/utils/misc/guc.c | 17 +++- src/include/postmaster/clean2pc.h | 1 + 4 files changed, 204 insertions(+), 37 deletions(-) diff --git a/contrib/pg_clean/pg_clean.c b/contrib/pg_clean/pg_clean.c index 08375f46..0b2f6f98 100644 --- a/contrib/pg_clean/pg_clean.c +++ b/contrib/pg_clean/pg_clean.c @@ -62,17 +62,21 @@ int transaction_threshold = 200000; #define MAXIMUM_CLEAR_FILE 10000 #define MAXIMUM_OUTPUT_FILE 1000 #define XIDPREFIX "_$XC$" -#define DEFAULT_CLEAN_TIME_INTERVAL 120000000 -#define LEAST_CLEAN_TIME_INTERVAL 1000000 /* should not clean twophase trans prepared in 1s or commit in 1s */ +#define DEFAULT_CLEAN_TIME_INTERVAL 120 +#define LEAST_CLEAN_TIME_INTERVAL 3 /* should not clean twophase trans prepared in 3s */ +#define LEAST_CHECK_TIME_INTERVAL 1 /* should not check twophase trans prepared in 1s */ -GlobalTimestamp clean_time_interval = DEFAULT_CLEAN_TIME_INTERVAL; +GlobalTimestamp clean_time_interval = DEFAULT_CLEAN_TIME_INTERVAL * USECS_PER_SEC; PG_MODULE_MAGIC; #define MAX_GID 64 -#define CLEAN_CHECK_TIMES 3 -#define CLEAN_CHECK_INTERVAL 10000 +#define CLEAN_CHECK_TIMES_DEFAULT 3 +#define CLEAN_CHECK_INTERVAL_DEFAULT 100000 + +#define CLEAN_NODE_CHECK_TIMES 5 +#define CLEAN_NODE_CHECK_INTERVAL 500000 #define MAX_DBNAME 64 #define GET_START_XID "startxid:" @@ -316,6 +320,8 @@ bool send_query_clean_transaction(PGXCNodeHandle * conn, txn_info * txn, const c bool check_2pc_belong_node(txn_info * txn); bool check_node_participate(txn_info * txn, int node_idx); +bool check_2pc_start_from_node(txn_info *txn); + void recover2PC(txn_info * txn); TXN_STATUS check_txn_global_status(txn_info *txn); @@ -395,11 +401,15 @@ Datum pg_clean_execute(PG_FUNCTION_ARGS) /*clear Global*/ ResetGlobalVariables(); execute = true; - clean_time_interval = PG_GETARG_INT32(0) * 1000000; + + clean_time_interval = PG_GETARG_INT32(0); if (LEAST_CLEAN_TIME_INTERVAL > clean_time_interval) { + elog(WARNING, "least clean time interval is %ds", + LEAST_CLEAN_TIME_INTERVAL); clean_time_interval = LEAST_CLEAN_TIME_INTERVAL; } + clean_time_interval *= USECS_PER_SEC; /*get node list*/ PgxcNodeGetOids(&cn_node_list, &dn_node_list, @@ -538,9 +548,11 @@ Datum pg_clean_execute_on_node(PG_FUNCTION_ARGS) } abnormal_time = PG_GETARG_INT64(1); current_time = GetCurrentTimestamp(); - if (abnormal_time >= current_time) + if (abnormal_time >= current_time - LEAST_CLEAN_TIME_INTERVAL * USECS_PER_SEC) { - elog(ERROR, "pg_clean_execute_on_node, abnormal 
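
pg_clean now takes its intervals in seconds, enforces the LEAST_* floor, and only then converts to microseconds with USECS_PER_SEC. A minimal standalone sketch of that clamp-and-convert step; the constants are taken from the patch, the helper name is illustrative:

#include <stdint.h>
#include <stdio.h>

typedef int64_t GlobalTimestamp;

#define USECS_PER_SEC             1000000
#define LEAST_CLEAN_TIME_INTERVAL 3          /* seconds, as in the patch */

/* Clamp a user-supplied interval (seconds) and convert it to microseconds,
 * mirroring the handling in pg_clean_execute(). */
static GlobalTimestamp
clamp_clean_interval(int seconds)
{
    if (seconds < LEAST_CLEAN_TIME_INTERVAL)
        seconds = LEAST_CLEAN_TIME_INTERVAL;
    return (GlobalTimestamp) seconds * USECS_PER_SEC;
}

int main(void)
{
    /* An argument of 1s is raised to the 3s floor -> 3000000 usecs. */
    printf("%lld\n", (long long) clamp_clean_interval(1));
    return 0;
}
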
time "INT64_FORMAT" must before current_time "INT64_FORMAT, abnormal_time, current_time); + elog(ERROR, "pg_clean_execute_on_node, least clean time interval is %ds, " + "abnormal time: " INT64_FORMAT ", current_time: " INT64_FORMAT, + LEAST_CLEAN_TIME_INTERVAL, abnormal_time, current_time); } /*get node list*/ @@ -668,11 +680,15 @@ Datum pg_clean_check_txn(PG_FUNCTION_ARGS) /*clear Global*/ ResetGlobalVariables(); - clean_time_interval = PG_GETARG_INT32(0) * 1000000; - if (LEAST_CLEAN_TIME_INTERVAL > clean_time_interval) + clean_time_interval = PG_GETARG_INT32(0); + if (LEAST_CHECK_TIME_INTERVAL > clean_time_interval) { - clean_time_interval = LEAST_CLEAN_TIME_INTERVAL; + elog(WARNING, "least check time interval is %ds", + LEAST_CHECK_TIME_INTERVAL); + clean_time_interval = LEAST_CHECK_TIME_INTERVAL; } + clean_time_interval *= USECS_PER_SEC; + /*get node list*/ PgxcNodeGetOids(&cn_node_list, &dn_node_list, &cn_nodes_num, &dn_nodes_num, true); @@ -1636,7 +1652,7 @@ char *get2PCInfo(const char *tid) return result; } - elog(LOG, "try to get 2pc info from disk, tid: %s", tid); + elog(DEBUG1, "try to get 2pc info from disk, tid: %s", tid); snprintf(path, MAXPGPATH, TWOPHASE_RECORD_DIR "/%s", tid); if(access(path, F_OK) == 0) @@ -2489,12 +2505,20 @@ void recover2PC(txn_info * txn) { int i = 0; bool check_ok = false; + int check_times = CLEAN_CHECK_TIMES_DEFAULT; + int check_interval = CLEAN_CHECK_INTERVAL_DEFAULT; MemoryContext current_context = NULL; ErrorData* edata = NULL; TXN_STATUS txn_stat; txn_stat = check_txn_global_status(txn); txn->global_txn_stat = txn_stat; + if (clear_2pc_belong_node) + { + check_times = CLEAN_NODE_CHECK_TIMES; + check_interval = CLEAN_NODE_CHECK_INTERVAL; + } + #ifdef DEBUG_EXECABORT txn_stat = TXN_STATUS_ABORTED; #endif @@ -2529,7 +2553,7 @@ void recover2PC(txn_info * txn) { txn->op = COMMIT; /* check whether all nodes can commit prepared */ - for (i = 0; i < CLEAN_CHECK_TIMES; i++) + for (i = 0; i < check_times; i++) { check_ok = true; current_context = CurrentMemoryContext; @@ -2560,7 +2584,7 @@ void recover2PC(txn_info * txn) return; } - pg_usleep(CLEAN_CHECK_INTERVAL); + pg_usleep(check_interval); } /* send commit prepared to all nodes */ @@ -2578,7 +2602,7 @@ void recover2PC(txn_info * txn) case TXN_STATUS_ABORTED: txn->op = ABORT; /* check whether all nodes can rollback prepared */ - for (i = 0; i < CLEAN_CHECK_TIMES; i++) + for (i = 0; i < check_times; i++) { check_ok = true; current_context = CurrentMemoryContext; @@ -2609,7 +2633,7 @@ void recover2PC(txn_info * txn) return; } - pg_usleep(CLEAN_CHECK_INTERVAL); + pg_usleep(check_interval); } /* send rollback prepared to all nodes */ @@ -2733,10 +2757,39 @@ TXN_STATUS check_txn_global_status(txn_info *txn) #endif if (clear_2pc_belong_node) { + if (!check_2pc_belong_node(txn)) + { + return TXN_STATUS_INPROGRESS; + } + + if (!check_2pc_start_from_node(txn)) + { + return TXN_STATUS_INPROGRESS; + } + node_idx = find_node_index(abnormal_nodeoid); - if (!check_2pc_belong_node(txn) || - abnormal_time < txn->prepare_timestamp[node_idx]) + if (node_idx >= 0) + { + if (abnormal_time < txn->prepare_timestamp[node_idx]) + { + elog(WARNING, "gid: %s, abnormal time: " INT64_FORMAT + ", prepare timestamp[%d]: " INT64_FORMAT, txn->gid, + abnormal_time, node_idx, txn->prepare_timestamp[node_idx]); + + return TXN_STATUS_INPROGRESS; + } + } + else + { + elog(WARNING, "gid: %s, node_idx: %d", txn->gid, node_idx); + } + + if (abnormal_time < prepared_time) { + elog(WARNING, "gid: %s, abnormal time: " INT64_FORMAT + ", prepared 
time: " INT64_FORMAT, txn->gid, + abnormal_time, prepared_time); + return TXN_STATUS_INPROGRESS; } } @@ -3310,3 +3363,71 @@ void get_node_handles(PGXCNodeAllHandles **pgxc_handles, Oid nodeoid) *pgxc_handles = get_handles(nodelist, coordlist, false, true, true); } + +bool check_2pc_start_from_node(txn_info *txn) +{ + char node_type; + + Assert(InvalidOid != abnormal_nodeoid); + + if (abnormal_nodeoid == txn->origcoord) + { + return true; + } + + node_type = get_pgxc_nodetype(abnormal_nodeoid); + if (node_type == 'D') + { + return false; + } + + if (InvalidOid == txn->origcoord) + { + char *startnode = NULL; + int node_oid = InvalidOid; + char gid[MAX_GID]; + + if (!IsXidImplicit(txn->gid)) + { + return true; + } + + Assert(IsXidImplicit(txn->gid)); + + /* get start node from gid */ + strcpy(gid, txn->gid); + startnode = strtok(gid, ":"); + if (NULL == startnode) + { + elog(WARNING, "get startnode(%s) from gid(%s) failed", + startnode, gid); + return false; + } + + startnode = strtok(NULL, ":"); + if (NULL == startnode) + { + elog(WARNING, "get startnode(%s) from gid(%s) failed", + startnode, gid); + return false; + } + + node_oid = get_pgxc_nodeoid(startnode); + if (NULL == startnode) + { + elog(WARNING, "get invalid oid for startnode(%s) from gid(%s)", + startnode, gid); + return false; + } + + elog(DEBUG1, "get oid(%d) for startnode(%s) from gid(%s)", + node_oid, startnode, gid); + + if (abnormal_nodeoid == node_oid) + { + return true; + } + } + + return false; +} diff --git a/src/backend/postmaster/clean2pc.c b/src/backend/postmaster/clean2pc.c index 5e0ceaf2..b7f8fa02 100644 --- a/src/backend/postmaster/clean2pc.c +++ b/src/backend/postmaster/clean2pc.c @@ -32,6 +32,7 @@ #include "storage/pmsignal.h" #include "tcop/tcopprot.h" #include "utils/builtins.h" +#include "utils/lsyscache.h" #include "utils/memutils.h" #include "utils/ps_status.h" #include "utils/timeout.h" @@ -56,9 +57,10 @@ typedef enum bool enable_clean_2pc_launcher = true; -int auto_clean_2pc_interval = 10; -int auto_clean_2pc_delay = 3; -int auto_clean_2pc_timeout = 0; +int auto_clean_2pc_interval = 30; +int auto_clean_2pc_delay = 10; +int auto_clean_2pc_timeout = 300; +int auto_clean_2pc_max_check_time = 300; static volatile sig_atomic_t got_SIGTERM = false; static volatile sig_atomic_t got_SIGHUP = false; @@ -420,6 +422,9 @@ do_query_2pc(TimestampTz clean_time) int attr_num = 4; int64 check_time = 0; TimestampTz curr_time = GetCurrentTimestamp(); + Oid node_oid = 0; + char node_type = PGXC_NODE_COORDINATOR; + int node_index = 0; static const char *attr_name[] = {"gid", "database", "global_transaction_status", "transaction_status_on_allnodes"}; @@ -442,19 +447,37 @@ do_query_2pc(TimestampTz clean_time) check_time = INT32_MAX; } + if (auto_clean_2pc_max_check_time != 0) + { + if (check_time > auto_clean_2pc_max_check_time) + { + check_time = auto_clean_2pc_max_check_time; + } + } + snprintf(query, SQL_CMD_LEN, "select * FROM pg_clean_check_txn(" INT64_FORMAT ") order by database limit 1000;", check_time); - elog(DEBUG1, "node(%d) query: %s", PGXCNodeId, query); - StartTransactionCommand(); + InitMultinodeExecutor(false); + + node_oid = get_pgxc_nodeoid(PGXCNodeName); + if (!OidIsValid(node_oid)) + { + elog(ERROR, "get node(%s) oid failed", PGXCNodeName); + return; + } + node_index = PGXCNodeGetNodeId(node_oid, &node_type); + + elog(DEBUG1, "node(%d) query: %s", node_index, query); + plan = makeNode(RemoteQuery); plan->combine_type = COMBINE_TYPE_NONE; plan->exec_nodes = makeNode(ExecNodes); plan->exec_type = 
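
check_2pc_start_from_node() recovers the originating node by taking the second ':'-separated token of an implicit GID. A standalone sketch of that tokenisation; the GID value below is purely hypothetical:

#include <stdio.h>
#include <string.h>

/* Return the second ':'-separated token of gid (modified in place),
 * the position check_2pc_start_from_node() treats as the start node. */
static char *
gid_start_node(char *gid)
{
    if (strtok(gid, ":") == NULL)
        return NULL;
    return strtok(NULL, ":");
}

int main(void)
{
    char  gid[] = "_$XC$100123:cn0001:200456";   /* hypothetical implicit GID */
    char *node  = gid_start_node(gid);

    printf("start node: %s\n", node ? node : "(none)");
    return 0;
}
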
EXEC_ON_COORDS; - plan->exec_nodes->nodeList = lappend_int(plan->exec_nodes->nodeList, PGXCNodeId); + plan->exec_nodes->nodeList = lappend_int(plan->exec_nodes->nodeList, node_index); plan->sql_statement = (char*)query; plan->force_autocommit = false; @@ -470,8 +493,6 @@ do_query_2pc(TimestampTz clean_time) makeTargetEntry((Expr *) dummy, i, NULL, false)); } - InitMultinodeExecutor(false); - /* prepare to execute */ estate = CreateExecutorState(); oldcontext = MemoryContextSwitchTo(estate->es_query_cxt); @@ -561,8 +582,9 @@ do_query_2pc(TimestampTz clean_time) if (count_2pc > 0) { Assert(result_str->data != NULL); - elog(LOG, "query remain 2pc count(%d), db count(%d):\n%s", - count_2pc, count_db, result_str->data); + elog(LOG, "query remain 2pc count(%d), db count(%d), sql: %s", + count_2pc, count_db, query); + elog(DEBUG1, "remain 2pc:\n%s", result_str->data); } } @@ -579,6 +601,9 @@ do_clean_2pc(TimestampTz clean_time) TupleTableSlot *result = NULL; Var *dummy = NULL; int attr_num = 4; + Oid node_oid = 0; + char node_type = PGXC_NODE_COORDINATOR; + int node_index = 0; static const char *attr_name[] = {"gid", "global_transaction_status", "operation", "operation_status"}; @@ -588,16 +613,26 @@ do_clean_2pc(TimestampTz clean_time) snprintf(query, SQL_CMD_LEN, "select * FROM pg_clean_execute_on_node('%s', %ld)" " limit 1000;", PGXCNodeName, clean_time); - elog(DEBUG2, "node(%d) query: %s", PGXCNodeId, query); - StartTransactionCommand(); + InitMultinodeExecutor(false); + + node_oid = get_pgxc_nodeoid(PGXCNodeName); + if (!OidIsValid(node_oid)) + { + elog(ERROR, "get node(%s) oid failed", PGXCNodeName); + return; + } + node_index = PGXCNodeGetNodeId(node_oid, &node_type); + + elog(DEBUG1, "node(%d) query: %s", node_index, query); + plan = makeNode(RemoteQuery); plan->combine_type = COMBINE_TYPE_NONE; plan->exec_nodes = makeNode(ExecNodes); plan->exec_type = EXEC_ON_COORDS; - plan->exec_nodes->nodeList = lappend_int(plan->exec_nodes->nodeList, PGXCNodeId); + plan->exec_nodes->nodeList = lappend_int(plan->exec_nodes->nodeList, node_index); plan->sql_statement = (char*)query; plan->force_autocommit = false; @@ -613,8 +648,6 @@ do_clean_2pc(TimestampTz clean_time) makeTargetEntry((Expr *) dummy, i, NULL, false)); } - InitMultinodeExecutor(false); - /* prepare to execute */ estate = CreateExecutorState(); oldcontext = MemoryContextSwitchTo(estate->es_query_cxt); @@ -648,7 +681,8 @@ do_clean_2pc(TimestampTz clean_time) if (count > 0) { Assert(NULL != result_str->data); - elog(LOG, "clean 2pc count(%d):\n%s", count, result_str->data); + elog(LOG, "clean 2pc count(%d), sql: %s", count, query); + elog(LOG, "clean 2pc:\n%s", result_str->data); } } diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index f1e0d2f7..fd1b4720 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -4860,7 +4860,7 @@ static struct config_int ConfigureNamesInt[] = GUC_UNIT_S }, &auto_clean_2pc_interval, - 30, 1, 3600, + 30, 10, INT_MAX, NULL, NULL, NULL }, @@ -4871,7 +4871,7 @@ static struct config_int ConfigureNamesInt[] = GUC_UNIT_S }, &auto_clean_2pc_delay, - 3, 1, 600, + 10, 3, INT_MAX, NULL, NULL, NULL }, @@ -4882,7 +4882,18 @@ static struct config_int ConfigureNamesInt[] = GUC_UNIT_S }, &auto_clean_2pc_timeout, - 0, 0, INT_MAX, + 300, 0, INT_MAX, + NULL, NULL, NULL + }, + + { + {"auto_clean_2pc_max_check_time", PGC_USERSET, CUSTOM_OPTIONS, + gettext_noop("auto clean 2pc max check time"), + NULL, + GUC_UNIT_S + }, + &auto_clean_2pc_max_check_time, + 300, 0, INT_MAX, NULL, 
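
The check window that do_query_2pc() passes to pg_clean_check_txn() is now bounded twice: by INT32_MAX and, when the new auto_clean_2pc_max_check_time GUC is non-zero, by that setting. A condensed standalone sketch of the capping; the helper name is illustrative and units are seconds, as in the patch:

#include <stdint.h>
#include <stdio.h>

/* Cap the 2PC check window the way do_query_2pc() does. */
static int64_t
cap_check_time(int64_t check_time, int max_check_time)
{
    if (check_time > INT32_MAX)
        check_time = INT32_MAX;
    if (max_check_time != 0 && check_time > max_check_time)
        check_time = max_check_time;
    return check_time;
}

int main(void)
{
    /* A 100000s window is capped to the 1200s maximum -> prints 1200. */
    printf("%lld\n", (long long) cap_check_time(100000, 1200));
    return 0;
}
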
NULL, NULL }, diff --git a/src/include/postmaster/clean2pc.h b/src/include/postmaster/clean2pc.h index 2d94442b..4754ab63 100644 --- a/src/include/postmaster/clean2pc.h +++ b/src/include/postmaster/clean2pc.h @@ -19,6 +19,7 @@ extern bool enable_clean_2pc_launcher; extern int auto_clean_2pc_interval; extern int auto_clean_2pc_delay; extern int auto_clean_2pc_timeout; +extern int auto_clean_2pc_max_check_time; extern bool IsClean2pcLauncher(void); extern bool IsClean2pcWorker(void); From 4ae15b63748d282a8fc9802387c4d8e3852109c8 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Mon, 1 Nov 2021 10:52:39 +0800 Subject: [PATCH 431/578] fix PGXCNodeSendShowQuery coredump http://tapd.oa.com/pgxz/bugtrace/bugs/view?bug_id=1010092131093712025 --- src/backend/pgxc/pool/pgxcnode.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c index d84611ee..9e645974 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -5559,8 +5559,7 @@ PGXCNodeSendShowQuery(NODE_CONNECTION *conn, const char *sql_command) resStatus = PQresultStatus(result); if (resStatus == PGRES_TUPLES_OK || resStatus == PGRES_COMMAND_OK) { - /* ignore unit */ - snprintf(number, result->tuples[0][0].len, "%s", PQgetvalue(result, 0, 0)); + snprintf(number, 128, "%s", PQgetvalue(result, 0, 0)); } PQclear(result); From b32ab5e1c4a6f3f0f0106fc8bbe1bdc9cb32d0be Mon Sep 17 00:00:00 2001 From: whalesong Date: Wed, 3 Nov 2021 15:13:19 +0800 Subject: [PATCH 432/578] auto clean 2pc guc configuration default value optimize --- src/backend/postmaster/clean2pc.c | 8 ++++---- src/backend/utils/misc/guc.c | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/backend/postmaster/clean2pc.c b/src/backend/postmaster/clean2pc.c index b7f8fa02..def81c95 100644 --- a/src/backend/postmaster/clean2pc.c +++ b/src/backend/postmaster/clean2pc.c @@ -57,10 +57,10 @@ typedef enum bool enable_clean_2pc_launcher = true; -int auto_clean_2pc_interval = 30; -int auto_clean_2pc_delay = 10; -int auto_clean_2pc_timeout = 300; -int auto_clean_2pc_max_check_time = 300; +int auto_clean_2pc_interval = 60; +int auto_clean_2pc_delay = 300; +int auto_clean_2pc_timeout = 1200; +int auto_clean_2pc_max_check_time = 1200; static volatile sig_atomic_t got_SIGTERM = false; static volatile sig_atomic_t got_SIGHUP = false; diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index fd1b4720..af8cef6e 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -4860,7 +4860,7 @@ static struct config_int ConfigureNamesInt[] = GUC_UNIT_S }, &auto_clean_2pc_interval, - 30, 10, INT_MAX, + 60, 10, INT_MAX, NULL, NULL, NULL }, @@ -4871,7 +4871,7 @@ static struct config_int ConfigureNamesInt[] = GUC_UNIT_S }, &auto_clean_2pc_delay, - 10, 3, INT_MAX, + 300, 3, INT_MAX, NULL, NULL, NULL }, @@ -4882,7 +4882,7 @@ static struct config_int ConfigureNamesInt[] = GUC_UNIT_S }, &auto_clean_2pc_timeout, - 300, 0, INT_MAX, + 1200, 0, INT_MAX, NULL, NULL, NULL }, @@ -4893,7 +4893,7 @@ static struct config_int ConfigureNamesInt[] = GUC_UNIT_S }, &auto_clean_2pc_max_check_time, - 300, 0, INT_MAX, + 1200, 0, INT_MAX, NULL, NULL, NULL }, From 37f66eaa879fb32038cb6555729c0309b5a490ca Mon Sep 17 00:00:00 2001 From: whalesong Date: Wed, 8 Sep 2021 16:35:53 +0800 Subject: [PATCH 433/578] Bugfix: procedure error, ID90798511 (merge request git push origin Tbase_v2.15.19.4) (cherry picked from commit e29ea02b) 69eeac67 bugfix: procedure error, add 
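
The PGXCNodeSendShowQuery() fix above replaces a size argument taken from the result tuple with the destination capacity. A standalone reminder of the rule the fix relies on: snprintf's size argument must describe the destination buffer, never the source string (the value below is made up):

#include <stdio.h>

int main(void)
{
    char        number[128];
    const char *value = "1048576kB";   /* stands in for PQgetvalue(result, 0, 0) */

    /* Size by the destination, so the copy can never overrun `number`. */
    snprintf(number, sizeof(number), "%s", value);
    printf("%s\n", number);
    return 0;
}
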
regress test cases, ID90798511 7ea215ed bugfix: procedure error, ID90798511 --- src/backend/access/transam/varsup.c | 28 ++++++++++++++++++++++++---- src/backend/storage/ipc/procarray.c | 5 ++++- src/include/storage/procarray.h | 3 ++- 3 files changed, 30 insertions(+), 6 deletions(-) diff --git a/src/backend/access/transam/varsup.c b/src/backend/access/transam/varsup.c index 42baa98f..e4733f82 100644 --- a/src/backend/access/transam/varsup.c +++ b/src/backend/access/transam/varsup.c @@ -93,7 +93,8 @@ GetForceXidFromGTM(void) #ifdef __SUPPORT_DISTRIBUTED_TRANSACTION__ static TransactionId local_xid = InvalidTransactionId; static TransactionId local_subxids[PGPROC_MAX_CACHED_SUBXIDS] = {}; -static int local_nsub; +static int local_nsub = 0; +static bool local_overflowed = false; /* exported information about parallel workers, see xact.c */ extern int nParallelCurrentXids; extern TransactionId *ParallelCurrentXids; @@ -129,7 +130,8 @@ StoreGlobalXid(const char *globalXid) else if(IsConnFromDatanode()) { - local_xid = GetLocalTransactionId(globalXid, local_subxids, &local_nsub); + local_xid = GetLocalTransactionId(globalXid, + local_subxids, &local_nsub, &local_overflowed); if(enable_distri_print) { elog (LOG, " global xid %s to local xid %d, %d subxids", globalXid, local_xid, local_nsub); @@ -192,8 +194,6 @@ GetSubTransactions(void) bool TransactIdIsCurentGlobalTransacId(TransactionId xid) { - int i; - if(enable_distri_print) { elog(LOG, "is current transaction xid %u local xid %d", xid, local_xid); @@ -205,12 +205,32 @@ TransactIdIsCurentGlobalTransacId(TransactionId xid) if (TransactionIdEquals(xid, local_xid)) return true; + if (!local_overflowed) + { /* check subxids */ + int i; for (i = 0; i < local_nsub; i++) { if (TransactionIdEquals(local_subxids[i], xid)) return true; } + } + else + { + TransactionId topxid = SubTransGetTopmostTransaction(xid); + Assert(local_nsub == PGPROC_MAX_CACHED_SUBXIDS); + if(enable_distri_print) + { + elog(LOG, "subtransaction overflowed: xid=%d, topxid=%d, local_xid=%d", + xid, topxid, local_xid); + } + + if (!TransactionIdIsValid(topxid)) + return false; + + if (TransactionIdEquals(topxid, local_xid)) + return true; + } return false; } diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c index 35636976..45a8304e 100644 --- a/src/backend/storage/ipc/procarray.c +++ b/src/backend/storage/ipc/procarray.c @@ -1963,7 +1963,8 @@ GetMaxSnapshotSubxidCount(void) } #ifdef __TBASE__ -TransactionId GetLocalTransactionId(const char *globalXid, TransactionId *subxids, int *nsub) +TransactionId GetLocalTransactionId(const char *globalXid, + TransactionId *subxids, int *nsub, bool *overflowed) { ProcArrayStruct *arrayP = procArray; @@ -1996,6 +1997,8 @@ TransactionId GetLocalTransactionId(const char *globalXid, TransactionId *subxid result = pgxact->xid; + *overflowed = pgxact->overflowed; + /* look for max xid in subtrans */ *nsub = pgxact->nxids; for (nxid = 0; nxid < pgxact->nxids; nxid++) diff --git a/src/include/storage/procarray.h b/src/include/storage/procarray.h index d6607bcf..3b84d623 100644 --- a/src/include/storage/procarray.h +++ b/src/include/storage/procarray.h @@ -123,7 +123,8 @@ extern bool TransactionIdIsInProgress(TransactionId xid); extern bool TransactionIdIsPrepared(TransactionId xid, Snapshot snapshot, GlobalTimestamp *prepare_ts); #endif #ifdef __TBASE__ -extern TransactionId GetLocalTransactionId(const char *globalXid, TransactionId *subxids, int *nsub); +extern TransactionId GetLocalTransactionId(const char 
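
With the overflowed flag carried back from GetLocalTransactionId(), TransactIdIsCurentGlobalTransacId() scans the cached subxid array only while the cache is complete, and otherwise resolves the xid to its topmost parent. A toy, self-contained model of that lookup order; the types and the parent-lookup callback are stand-ins, not the real backend API:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef uint32_t TransactionId;
#define InvalidTransactionId ((TransactionId) 0)

/* Stand-in for SubTransGetTopmostTransaction(). */
typedef TransactionId (*topmost_fn)(TransactionId xid);

static bool
is_current_global_xid(TransactionId xid, TransactionId top_xid,
                      const TransactionId *subxids, int nsub,
                      bool overflowed, topmost_fn topmost)
{
    if (xid == InvalidTransactionId || top_xid == InvalidTransactionId)
        return false;
    if (xid == top_xid)
        return true;

    if (!overflowed)
    {
        /* The cached subxid array is complete, so a scan is enough. */
        for (int i = 0; i < nsub; i++)
            if (subxids[i] == xid)
                return true;
        return false;
    }

    /* Cache overflowed: map the xid to its topmost parent instead. */
    xid = topmost(xid);
    return xid != InvalidTransactionId && xid == top_xid;
}

static TransactionId
fake_topmost(TransactionId xid)
{
    (void) xid;
    return 100;              /* pretend every subxid rolls up to xid 100 */
}

int main(void)
{
    TransactionId subs[] = { 101, 102 };

    /* xid 250 is not cached, but its topmost parent matches -> prints 1. */
    printf("%d\n", is_current_global_xid(250, 100, subs, 2, true, fake_topmost));
    return 0;
}
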
*globalXid, + TransactionId *subxids, int *nsub, bool *overflowed); #endif extern char *GetGlobalTransactionId(const TransactionId pid); extern bool TransactionIdIsActive(TransactionId xid); From 0ab1363e95844d07a15caa368bd0151aecd7fc22 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cningxpeng=E2=80=9D?= <“ningxpeng@tencent.com”> Date: Fri, 20 Aug 2021 17:05:53 +0800 Subject: [PATCH 434/578] [BUGFIX] Subtransaction commits should not reset session information (cherry-pick from ffca2f98b83e7375c001cf685c61aabef6f0638c) --- src/backend/pgxc/pool/execRemote.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 1708343b..6ae97233 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -4839,14 +4839,12 @@ pgxc_node_remote_commit(TranscationType txn_type, bool need_release_handle) stat_transaction(conn_count); /* do not cleanup remote session for subtrans */ - if (!temp_object_included) + if (!temp_object_included && need_release_handle) { /* Clean up remote sessions */ pgxc_node_remote_cleanup_all(txn_type == TXN_TYPE_CommitSubTxn || txn_type == TXN_TYPE_RollbackSubTxn); - if (need_release_handle) - { if (PersistentConnections) { reset_handles(); @@ -4863,7 +4861,6 @@ pgxc_node_remote_commit(TranscationType txn_type, bool need_release_handle) } } } - } clear_handles(); } @@ -5938,13 +5935,12 @@ pgxc_node_remote_abort(TranscationType txn_type, bool need_release_handle) * certain issues for aborted transactions, we drop the connections. * Revisit and fix the issue */ - if (!temp_object_included) + if (!temp_object_included && need_release_handle) { /* Clean up remote sessions */ pgxc_node_remote_cleanup_all(txn_type == TXN_TYPE_CommitSubTxn || txn_type == TXN_TYPE_RollbackSubTxn); - if (need_release_handle) - { + if (HaveActiveDatanodeStatements()) { reset_handles(); @@ -5954,7 +5950,6 @@ pgxc_node_remote_abort(TranscationType txn_type, bool need_release_handle) release_handles(false); } } - } clear_handles(); pfree_pgxc_all_handles(handles); From 045939ad5a7835066915d328ddbe9f1f0e76128a Mon Sep 17 00:00:00 2001 From: ningxpeng Date: Tue, 17 Aug 2021 11:13:42 +0800 Subject: [PATCH 435/578] [BUGFIX] snapshot still active in CTAS mode --- src/backend/rewrite/rewriteHandler.c | 5 ++- src/backend/tcop/pquery.c | 14 +++++++ src/backend/utils/time/snapmgr.c | 26 +++++++++++- src/include/utils/snapmgr.h | 62 ++++++++++++++++------------ 4 files changed, 78 insertions(+), 29 deletions(-) diff --git a/src/backend/rewrite/rewriteHandler.c b/src/backend/rewrite/rewriteHandler.c index 8df9fe35..b2f6df8f 100644 --- a/src/backend/rewrite/rewriteHandler.c +++ b/src/backend/rewrite/rewriteHandler.c @@ -4112,8 +4112,11 @@ QueryRewriteCTAS(Query *parsetree) ProcessUtility(wrapper, cquery.data, PROCESS_UTILITY_QUERY, NULL, NULL, NULL, false, NULL); - PopActiveSnapshot(); + /* Use new snapshot for insert and update the snapshot status. 
*/ + if (ActiveSnapshotSet()) + PopActiveSnapshot(); PushActiveSnapshot(GetTransactionSnapshot()); + UpdateActiveSnapshotStatus(S_FOR_CTAS); /* diff --git a/src/backend/tcop/pquery.c b/src/backend/tcop/pquery.c index 715d407b..295dc2a2 100644 --- a/src/backend/tcop/pquery.c +++ b/src/backend/tcop/pquery.c @@ -2018,6 +2018,20 @@ PortalRunUtility(Portal portal, PlannedStmt *pstmt, if (snapshot != NULL && ActiveSnapshotSet() && snapshot == GetActiveSnapshot()) PopActiveSnapshot(); + else + { + /* Clear snapshots created in process QueryRewriteCTAS */ + while (ActiveSnapshotSet()) + { + if (S_FOR_CTAS == GetActiveSnapshotStatus() || + snapshot == GetActiveSnapshot()) + { + PopActiveSnapshot(); + continue; + } + break; + } + } } /* diff --git a/src/backend/utils/time/snapmgr.c b/src/backend/utils/time/snapmgr.c index 0f4158e4..d7da6a59 100644 --- a/src/backend/utils/time/snapmgr.c +++ b/src/backend/utils/time/snapmgr.c @@ -190,6 +190,7 @@ typedef struct ActiveSnapshotElt { Snapshot as_snap; int as_level; + SnapshotStatus status; struct ActiveSnapshotElt *as_next; } ActiveSnapshotElt; @@ -895,6 +896,7 @@ PushActiveSnapshot(Snapshot snap) newactive->as_next = ActiveSnapshot; newactive->as_level = GetCurrentTransactionNestLevel(); + newactive->status = S_DEFAULT; newactive->as_snap->active_count++; @@ -957,6 +959,22 @@ UpdateActiveSnapshotCommandId(void) #endif } +void +UpdateActiveSnapshotStatus(SnapshotStatus new_status) +{ + Assert(ActiveSnapshot != NULL); + + ActiveSnapshot->status = new_status; +} + +SnapshotStatus +GetActiveSnapshotStatus(void) +{ + Assert(ActiveSnapshot != NULL); + + return ActiveSnapshot->status; +} + /* * PopActiveSnapshot * @@ -1288,10 +1306,16 @@ AtEOXact_Snapshot(bool isCommit, bool resetXmin) elog(WARNING, "registered snapshots seem to remain after cleanup"); /* complain about unpopped active snapshots */ - for (active = ActiveSnapshot; active != NULL; active = active->as_next) + for (active = ActiveSnapshot; active != NULL && active->status != S_FOR_CTAS; active = active->as_next) + { elog(WARNING, "snapshot %p still active", active); } + /* Resources to clean up, pop all active snapshots */ + while (ActiveSnapshotSet()) + PopActiveSnapshot(); + } + /* * And reset our state. We don't need to free the memory explicitly -- * it'll go away with TopTransactionContext. diff --git a/src/include/utils/snapmgr.h b/src/include/utils/snapmgr.h index e1054705..110ed378 100644 --- a/src/include/utils/snapmgr.h +++ b/src/include/utils/snapmgr.h @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * * snapmgr.h - * POSTGRES snapshot manager + * POSTGRES snapshot manager * * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California @@ -34,13 +34,19 @@ #define OLD_SNAPSHOT_PADDING_ENTRIES 10 #define OLD_SNAPSHOT_TIME_MAP_ENTRIES (old_snapshot_threshold + OLD_SNAPSHOT_PADDING_ENTRIES) +typedef enum SnapshotStatus +{ + S_DEFAULT, + S_FOR_CTAS /* After creating a table, obtain a new snapshot in the QueryRewriteCTAS process */ +} SnapshotStatus; + /* * Common definition of relation properties that allow early pruning/vacuuming * when old_snapshot_threshold >= 0. 
*/ #define RelationAllowsEarlyPruning(rel) \ ( \ - RelationNeedsWAL(rel) \ + RelationNeedsWAL(rel) \ && !IsCatalogRelation(rel) \ && !RelationIsAccessibleInLogicalDecoding(rel) \ && !RelationHasUnloggedIndex(rel) \ @@ -66,7 +72,7 @@ extern TransactionId RecentGlobalDataXmin; extern GlobalTimestamp RecentCommitTs; extern GlobalTimestamp RecentDataTs; -extern int vacuum_delta; +extern int vacuum_delta; extern bool vacuum_debug_print; @@ -88,6 +94,8 @@ extern void InvalidateCatalogSnapshotConditionally(void); extern void PushActiveSnapshot(Snapshot snapshot); extern void PushCopiedSnapshot(Snapshot snapshot); extern void UpdateActiveSnapshotCommandId(void); +void UpdateActiveSnapshotStatus(SnapshotStatus new_status); +SnapshotStatus GetActiveSnapshotStatus(void); extern void PopActiveSnapshot(void); extern Snapshot GetActiveSnapshot(void); extern bool ActiveSnapshotSet(void); @@ -106,9 +114,9 @@ extern bool XactHasExportedSnapshots(void); extern void DeleteAllExportedSnapshotFiles(void); extern bool ThereAreNoPriorRegisteredSnapshots(void); extern TransactionId TransactionIdLimitedForOldSnapshots(TransactionId recentXmin, - Relation relation); + Relation relation); extern void MaintainOldSnapshotTimeMapping(TimestampTz whenTaken, - TransactionId xmin); + TransactionId xmin); extern char *ExportSnapshot(Snapshot snapshot); @@ -142,28 +150,28 @@ extern bool LookupPreparedXid(TransactionId xid, GlobalTimestamp *prepare_timest static inline bool TestForOldTimestamp(GlobalTimestamp currentTimestamp, GlobalTimestamp oldestTimestamp) { - - if(IsInitProcessingMode()) - { - return true; - } - - if(CommitTimestampIsLocal(currentTimestamp)) - { - return true; - } - - if(currentTimestamp < oldestTimestamp) - { - elog(DEBUG12, "test for old time true ts " INT64_FORMAT " recent " INT64_FORMAT, currentTimestamp, oldestTimestamp); - return true; - } - else - { - elog(DEBUG12, "test for old time false ts " INT64_FORMAT " recent " INT64_FORMAT, currentTimestamp, oldestTimestamp); - return false; - } + + if(IsInitProcessingMode()) + { + return true; + } + + if(CommitTimestampIsLocal(currentTimestamp)) + { + return true; + } + + if(currentTimestamp < oldestTimestamp) + { + elog(DEBUG12, "test for old time true ts " INT64_FORMAT " recent " INT64_FORMAT, currentTimestamp, oldestTimestamp); + return true; + } + else + { + elog(DEBUG12, "test for old time false ts " INT64_FORMAT " recent " INT64_FORMAT, currentTimestamp, oldestTimestamp); + return false; + } } -#endif /* SNAPMGR_H */ +#endif /* SNAPMGR_H */ From 261e4744b01445c008e68435e4b9f7c6ed6dc295 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cningxpeng=E2=80=9D?= <“ningxpeng@tencent.com”> Date: Tue, 7 Sep 2021 15:35:59 +0800 Subject: [PATCH 436/578] [BUGFIX] The original snapshot status must be maintained in CTAS process snapshot replacement --- src/backend/rewrite/rewriteHandler.c | 15 +++++++++++++++ src/backend/utils/time/snapmgr.c | 14 ++++++++++++++ src/include/utils/snapmgr.h | 2 ++ 3 files changed, 31 insertions(+) diff --git a/src/backend/rewrite/rewriteHandler.c b/src/backend/rewrite/rewriteHandler.c index b2f6df8f..35261bee 100644 --- a/src/backend/rewrite/rewriteHandler.c +++ b/src/backend/rewrite/rewriteHandler.c @@ -3970,6 +3970,8 @@ QueryRewriteCTAS(Query *parsetree) CreateTableAsStmt *stmt; IntoClause *into; ListCell *lc; + const int InvalidLevel = -1; + int old_level = InvalidLevel; if (parsetree->commandType != CMD_UTILITY || !IsA(parsetree->utilityStmt, CreateTableAsStmt)) @@ -4114,12 +4116,25 @@ QueryRewriteCTAS(Query *parsetree) /* Use 
new snapshot for insert and update the snapshot status. */ if (ActiveSnapshotSet()) + { + old_level = GetActiveSnapshotLevel(); PopActiveSnapshot(); + } + PushActiveSnapshot(GetTransactionSnapshot()); UpdateActiveSnapshotStatus(S_FOR_CTAS); /* + * Only snapshot replacement is performed to prevent abnormal snapshot clearing caused by sub-transactions. + * Active snapshots set by this subtransaction will be cleared. + */ + if (old_level != InvalidLevel) + { + SetActiveSnapshotLevel(old_level); + } + + /* * Now fold the CTAS statement into an INSERT INTO statement. The * utility is no more required. */ diff --git a/src/backend/utils/time/snapmgr.c b/src/backend/utils/time/snapmgr.c index d7da6a59..71a2f0a3 100644 --- a/src/backend/utils/time/snapmgr.c +++ b/src/backend/utils/time/snapmgr.c @@ -975,6 +975,20 @@ GetActiveSnapshotStatus(void) return ActiveSnapshot->status; } +int +GetActiveSnapshotLevel(void) +{ + Assert(ActiveSnapshot != NULL); + return ActiveSnapshot->as_level; +} + +void +SetActiveSnapshotLevel(int level) +{ + Assert(ActiveSnapshot != NULL); + ActiveSnapshot->as_level = level; +} + /* * PopActiveSnapshot * diff --git a/src/include/utils/snapmgr.h b/src/include/utils/snapmgr.h index 110ed378..896a9ff9 100644 --- a/src/include/utils/snapmgr.h +++ b/src/include/utils/snapmgr.h @@ -96,6 +96,8 @@ extern void PushCopiedSnapshot(Snapshot snapshot); extern void UpdateActiveSnapshotCommandId(void); void UpdateActiveSnapshotStatus(SnapshotStatus new_status); SnapshotStatus GetActiveSnapshotStatus(void); +extern int GetActiveSnapshotLevel(void); +extern void SetActiveSnapshotLevel(int level); extern void PopActiveSnapshot(void); extern Snapshot GetActiveSnapshot(void); extern bool ActiveSnapshotSet(void); From b027d47a1c451bb14f7f6cfd7c7b704afa892fdb Mon Sep 17 00:00:00 2001 From: sigmalin Date: Thu, 4 Nov 2021 15:46:27 +0800 Subject: [PATCH 437/578] fix probabilistic error could not open relation with OID 0 http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131093496729 (merge request !882) Squash merge branch 'sigmalin_v2' into 'Tbase_v2.15.19.4' fix probabilistic error could not open relation with OID 0 http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131093496729 TAPD: --bug=093496729 --- src/backend/access/transam/xact.c | 2 +- src/backend/pgxc/pool/poolmgr.c | 19 +++++++++++++++++++ src/backend/utils/misc/guc.c | 12 +++++++++--- src/include/utils/guc.h | 5 +++++ 4 files changed, 34 insertions(+), 4 deletions(-) diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index ed02dff9..1d255395 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -757,7 +757,7 @@ AssignGlobalXidInternal(void) globalXidVersion++; if(enable_distri_print) { - elog(LOG, "assign global xid %s prono %d seq " UINT64_FORMAT UINT64_FORMAT, + elog(LOG, "assign global xid %s prono %d seq " UINT64_FORMAT" "UINT64_FORMAT, str.data, MyProc->pgprocno, seq, globalXidVersion); } diff --git a/src/backend/pgxc/pool/poolmgr.c b/src/backend/pgxc/pool/poolmgr.c index b9c3bd8c..2db4397e 100644 --- a/src/backend/pgxc/pool/poolmgr.c +++ b/src/backend/pgxc/pool/poolmgr.c @@ -67,6 +67,9 @@ #include "port.h" #include #include +#ifdef __TBASE__ +#include "access/xlog.h" +#endif /* the mini use conut of a connection */ #define MINI_USE_COUNT 10 @@ -409,6 +412,9 @@ static void pooler_subthread_write_log(int elevel, int lineno, const char *filen #define MAX_THREAD_LOG_PIPE_LEN (2 * 1024) /* length of thread log pipe */ #define DEFAULT_LOG_BUF_LEN (1024) /* length of 
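
Patches 435 and 436 tag each active-snapshot stack entry with a status (S_FOR_CTAS) and let QueryRewriteCTAS keep the replaced entry's nesting level when it swaps in a fresh snapshot. A deliberately simplified, self-contained model of such a tagged stack; none of these structs are the real snapmgr ones:

#include <stdio.h>
#include <stdlib.h>

typedef enum { S_DEFAULT, S_FOR_CTAS } SnapStatus;

typedef struct SnapElt
{
    int             level;    /* transaction nesting level */
    SnapStatus      status;   /* why the snapshot was pushed */
    struct SnapElt *next;
} SnapElt;

static SnapElt *active = NULL;

static void
push_snapshot(int level)
{
    SnapElt *e = malloc(sizeof(SnapElt));

    if (e == NULL)
        abort();
    e->level  = level;
    e->status = S_DEFAULT;
    e->next   = active;
    active    = e;
}

static void
pop_snapshot(void)
{
    SnapElt *e = active;

    active = e->next;
    free(e);
}

int main(void)
{
    int old_level;

    /* Mimic QueryRewriteCTAS: remember the replaced entry's level, push a
     * fresh snapshot, tag it S_FOR_CTAS, and restore the old level so a
     * subtransaction cannot discard it too early. */
    push_snapshot(1);
    old_level = active->level;
    pop_snapshot();

    push_snapshot(2);
    active->status = S_FOR_CTAS;
    active->level  = old_level;

    printf("level=%d status=%d\n", active->level, active->status);
    pop_snapshot();
    return 0;
}
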
thread log length */ PGPipe *g_ThreadLogQueue = NULL; +#ifdef __TBASE__ +bool g_allow_distri_query_on_standby_node = false; +#endif static inline void RebuildAgentIndex(void); @@ -1830,6 +1836,19 @@ PoolManagerGetConnections(List *datanodelist, List *coordlist, bool raise_error, int j = 0; +#ifdef __TBASE__ + /* + * if it is the standby node of the main plane, the distributed query will be connected to + * the main data node, and the standby cn may generate the same global xid as the main cn, + * so disable the distributed query of the standby node on the main plane + */ + if (g_allow_distri_query_on_standby_node == false && + IsPGXCMainCluster && RecoveryInProgress()) + { + elog(ERROR, "can't do distributed query because it is the main plane standby node."); + } +#endif + HOLD_POOLER_RELOAD(); if (poolHandle == NULL) diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index af8cef6e..8b9a9fe9 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -2501,9 +2501,15 @@ static struct config_bool ConfigureNamesBool[] = false, NULL, NULL, NULL }, - - - + { + {"allow_distri_query_on_standby_node", PGC_POSTMASTER, CUSTOM_OPTIONS, + gettext_noop("allow distributed query on main plane standby node"), + NULL + }, + &g_allow_distri_query_on_standby_node, + false, + NULL, NULL, NULL + }, { {"enable_committs_print", PGC_SUSET, CUSTOM_OPTIONS, gettext_noop("enable commit ts debug print"), diff --git a/src/include/utils/guc.h b/src/include/utils/guc.h index 7ae45b95..95342821 100644 --- a/src/include/utils/guc.h +++ b/src/include/utils/guc.h @@ -312,6 +312,11 @@ extern bool g_allow_dml_on_datanode; extern bool g_allow_force_ddl; extern bool trace_extent; #endif + +#ifdef __TBASE__ +extern bool g_allow_distri_query_on_standby_node; +#endif + #ifdef XCP extern char *global_session_string; #endif From 9ce9464560c4a0ec6846c979fe13695ad789884e Mon Sep 17 00:00:00 2001 From: sigmalin Date: Fri, 13 Aug 2021 10:28:31 +0800 Subject: [PATCH 438/578] add 2pc protection fix http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131089315277 (merge request git status) --- src/backend/pgxc/pool/execRemote.c | 79 ++++++++++--- src/backend/pgxc/pool/pgxcnode.c | 177 +++++++++++++++++++++++++++++ src/include/pgxc/pgxcnode.h | 7 ++ 3 files changed, 245 insertions(+), 18 deletions(-) diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 6ae97233..28af8f31 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -2549,6 +2549,14 @@ FetchTuple(ResponseCombiner *combiner) } else if (res == RESPONSE_COMPLETE) { + if (conn->state == DN_CONNECTION_STATE_ERROR_FATAL) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Unexpected FATAL ERROR on Connection to Datanode %s pid %d", + conn->nodename, conn->backend_pid))); + } + /* * In case of Simple Query Protocol we should receive ReadyForQuery * before removing connection from the list. 
In case of Extended @@ -2656,13 +2664,6 @@ FetchTuple(ResponseCombiner *combiner) return NULL; } } - else if (conn->state == DN_CONNECTION_STATE_ERROR_FATAL) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Unexpected FATAL ERROR on Connection to Datanode %s pid %d", - conn->nodename, conn->backend_pid))); - } } else if (res == RESPONSE_ERROR) { @@ -3683,6 +3684,8 @@ pgxc_node_begin(int conn_count, PGXCNodeHandle **connections, elog(DEBUG5, "pgxc_node_begin send %s to node %s, pid:%d", cmd, connections[i]->nodename, connections[i]->backend_pid); new_connections[new_count++] = connections[i]; + /* if send begin, register current connection */ + register_transaction_handles(connections[i]); } } @@ -3938,7 +3941,8 @@ pgxc_node_remote_prepare(char *prepareGID, bool localNode, bool implicit) ResponseCombiner combiner; PGXCNodeHandle **connections = NULL; int conn_count = 0; - PGXCNodeAllHandles *handles = get_current_handles(); + /* get current transaction handles that we register when pgxc_node_begin */ + PGXCNodeAllHandles *handles = get_current_txn_handles(); #ifdef __SUPPORT_DISTRIBUTED_TRANSACTION__ GlobalTimestamp global_prepare_ts = InvalidGlobalTimestamp; #endif @@ -4071,7 +4075,10 @@ pgxc_node_remote_prepare(char *prepareGID, bool localNode, bool implicit) * Skip empty slots */ if (conn->sock == NO_SOCKET) - continue; + { + elog(ERROR, "pgxc_node_remote_prepare, remote node %s's connection handle is invalid, backend_pid: %d", + conn->nodename, conn->backend_pid); + } else if (conn->transaction_status == 'T') { /* Read in any pending input */ @@ -4277,7 +4284,10 @@ pgxc_node_remote_prepare(char *prepareGID, bool localNode, bool implicit) * Skip empty slots */ if (conn->sock == NO_SOCKET) - continue; + { + elog(ERROR, "pgxc_node_remote_prepare, remote node %s's connection handle is invalid, backend_pid: %d", + conn->nodename, conn->backend_pid); + } else if (conn->transaction_status == 'T') { if (conn->read_only) @@ -5489,7 +5499,8 @@ void get_partnodes(PGXCNodeAllHandles * handles, StringInfo participants) conn = handles->datanode_handles[i]; if (conn->sock == NO_SOCKET) { - continue; + elog(ERROR, "get_partnodes, remote node %s's connection handle is invalid, backend_pid: %d", + conn->nodename, conn->backend_pid); } else if (conn->transaction_status == 'T') { @@ -5499,6 +5510,11 @@ void get_partnodes(PGXCNodeAllHandles * handles, StringInfo participants) appendStringInfo(participants, "%s,", conn->nodename); } } + else if (conn->transaction_status == 'E') + { + elog(ERROR, "get_partnodes, remote node %s is in error state, backend_pid: %d", + conn->nodename, conn->backend_pid); + } } for (i = 0; i < handles->co_conn_count; i++) @@ -5506,7 +5522,8 @@ void get_partnodes(PGXCNodeAllHandles * handles, StringInfo participants) conn = handles->coord_handles[i]; if (conn->sock == NO_SOCKET) { - continue; + elog(ERROR, "get_partnodes, remote node %s's connection handle is invalid, backend_pid: %d", + conn->nodename, conn->backend_pid); } else if (conn->transaction_status == 'T') { @@ -5516,6 +5533,11 @@ void get_partnodes(PGXCNodeAllHandles * handles, StringInfo participants) appendStringInfo(participants, "%s,", conn->nodename); } } + else if (conn->transaction_status == 'E') + { + elog(ERROR, "get_partnodes, remote node %s is in error state, backend_pid: %d", + conn->nodename, conn->backend_pid); + } } if (is_readonly && !IsXidImplicit(gid)) { @@ -7524,6 +7546,7 @@ void AtEOXact_Remote(void) { PGXCNodeResetParams(true); + reset_transaction_handles(); } /* @@ -8066,6 +8089,7 
@@ PostPrepare_Remote(char *prepareGID, bool implicit) if (log_gtm_stats) ShowUsageCommon("PostPrepare_Remote", &start_r, &start_t); #endif + reset_transaction_handles(); } /* @@ -8133,8 +8157,8 @@ IsTwoPhaseCommitRequired(bool localWrite) elog(ERROR, "IsTwoPhaseCommitRequired, Found %d sock fatal handles exist", sock_fatal_count); } #endif - - handles = get_current_handles(); + /* get current transaction handles that we register when pgxc_node_begin */ + handles = get_current_txn_handles(); for (i = 0; i < handles->dn_conn_count; i++) { PGXCNodeHandle *conn = handles->datanode_handles[i]; @@ -8143,8 +8167,12 @@ IsTwoPhaseCommitRequired(bool localWrite) elog(DEBUG5, "IsTwoPhaseCommitRequired, conn->nodename=%s, conn->sock=%d, conn->read_only=%d, conn->transaction_status=%c", conn->nodename, conn->sock, conn->read_only, conn->transaction_status); #endif - if (conn->sock != NO_SOCKET && !conn->read_only && - conn->transaction_status == 'T') + if (conn->sock == NO_SOCKET) + { + elog(ERROR, "IsTwoPhaseCommitRequired, remote node %s's connection handle is invalid, backend_pid: %d", + conn->nodename, conn->backend_pid); + } + else if (!conn->read_only && conn->transaction_status == 'T') { if (found) { @@ -8156,6 +8184,11 @@ IsTwoPhaseCommitRequired(bool localWrite) found = true; /* first found */ } } + else if (conn->transaction_status == 'E') + { + elog(ERROR, "IsTwoPhaseCommitRequired, remote node %s is in error state, backend_pid: %d", + conn->nodename, conn->backend_pid); + } } for (i = 0; i < handles->co_conn_count; i++) { @@ -8165,8 +8198,12 @@ IsTwoPhaseCommitRequired(bool localWrite) elog(DEBUG5, "IsTwoPhaseCommitRequired, conn->nodename=%s, conn->sock=%d, conn->read_only=%d, conn->transaction_status=%c", conn->nodename, conn->sock, conn->read_only, conn->transaction_status); #endif - if (conn->sock != NO_SOCKET && !conn->read_only && - conn->transaction_status == 'T') + if (conn->sock == NO_SOCKET) + { + elog(ERROR, "IsTwoPhaseCommitRequired, remote node %s's connection handle is invalid, backend_pid: %d", + conn->nodename, conn->backend_pid); + } + else if (!conn->read_only && conn->transaction_status == 'T') { if (found) { @@ -8178,6 +8215,11 @@ IsTwoPhaseCommitRequired(bool localWrite) found = true; /* first found */ } } + else if (conn->transaction_status == 'E') + { + elog(ERROR, "IsTwoPhaseCommitRequired, remote node %s is in error state, backend_pid: %d", + conn->nodename, conn->backend_pid); + } } pfree_pgxc_all_handles(handles); @@ -8898,6 +8940,7 @@ pgxc_node_remote_finish(char *prepareGID, bool commit, } clear_handles(); pfree_pgxc_all_handles(pgxc_handles); + reset_transaction_handles(); pfree(finish_cmd); #ifdef __TWO_PHASE_TRANS__ diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c index 9e645974..8968700d 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -98,6 +98,8 @@ static PGXCNodeHandle *sdn_handles = NULL; */ static PGXCNodeHandle *co_handles = NULL; +PGXCNodeAllHandles *current_transaction_handles = NULL; + #ifdef __TBASE__ /* Hash key: nodeoid value: index in dn_handles or co_handles */ static HTAB *node_handles_hash = NULL; @@ -164,6 +166,8 @@ static void PGXCNodeHandleError(PGXCNodeHandle *handle, char *msg_body, int len) static PGXCNodeAllHandles * get_empty_handles(void); static void get_current_dn_handles_internal(PGXCNodeAllHandles *result); static void get_current_cn_handles_internal(PGXCNodeAllHandles *result); +static void get_current_txn_dn_handles_internal(PGXCNodeAllHandles *result); 
+static void get_current_txn_cn_handles_internal(PGXCNodeAllHandles *result); #endif /* @@ -324,6 +328,7 @@ InitMultinodeExecutor(bool is_force) "node_handles_hash enter primary datanode nodeoid: %d", node_handle_ent->nodeoid); } + dn_handles[count].node_type = PGXC_NODE_DATANODE; #endif } @@ -354,6 +359,7 @@ InitMultinodeExecutor(bool is_force) "node_handles_hash enter slave datanode nodeoid: %d", node_handle_ent->nodeoid); } + sdn_handles[count].node_type = PGXC_NODE_SLAVEDATANODE; #endif } @@ -383,6 +389,7 @@ InitMultinodeExecutor(bool is_force) "node_handles_hash enter coordinator nodeoid: %d", node_handle_ent->nodeoid); } + co_handles[count].node_type = PGXC_NODE_COORDINATOR; #endif } @@ -427,6 +434,8 @@ InitMultinodeExecutor(bool is_force) #ifdef __TBASE__ if(strcmp(PGXCMainClusterName, PGXCClusterName) == 0) IsPGXCMainCluster = true; + + init_transaction_handles(); #endif } @@ -4165,7 +4174,16 @@ get_current_handles(void) } #ifdef __TBASE__ +/* get current transaction handles that register in pgxc_node_begin */ +PGXCNodeAllHandles * +get_current_txn_handles(void) +{ + PGXCNodeAllHandles *result = get_empty_handles(); + get_current_txn_cn_handles_internal(result); + get_current_txn_dn_handles_internal(result); + return result; +} PGXCNodeAllHandles * get_current_cn_handles(void) @@ -4211,6 +4229,35 @@ get_current_dn_handles_internal(PGXCNodeAllHandles *result) } } +/* get current transaction dn handles that register in pgxc_node_begin */ +static void +get_current_txn_dn_handles_internal(PGXCNodeAllHandles *result) +{ + int i; + int count = 0; + + if (current_transaction_handles == NULL || current_transaction_handles->dn_conn_count == 0) + { + return; + } + + count = current_transaction_handles->dn_conn_count; + result->datanode_handles = (PGXCNodeHandle **) + palloc(count * sizeof(PGXCNodeHandle *)); + if (!result->datanode_handles) + { + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + } + + result->dn_conn_count = 0; + for (i = 0; i < count; i++) + { + result->datanode_handles[result->dn_conn_count++] = current_transaction_handles->datanode_handles[i]; + } +} + static void get_current_cn_handles_internal(PGXCNodeAllHandles *result) { @@ -4237,6 +4284,35 @@ get_current_cn_handles_internal(PGXCNodeAllHandles *result) } } +/* get current transaction cn handles that register in pgxc_node_begin */ +static void +get_current_txn_cn_handles_internal(PGXCNodeAllHandles *result) +{ + int i; + int count = 0; + + if (current_transaction_handles == NULL || current_transaction_handles->co_conn_count == 0) + { + return; + } + + count = current_transaction_handles->co_conn_count; + result->coord_handles = (PGXCNodeHandle **) + palloc(count * sizeof(PGXCNodeHandle *)); + if (!result->coord_handles) + { + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + } + + result->co_conn_count = 0; + for (i = 0; i < count; i++) + { + result->coord_handles[result->co_conn_count++] = current_transaction_handles->coord_handles[i]; + } +} + PGXCNodeAllHandles * get_sock_fatal_handles(void) { @@ -4290,6 +4366,107 @@ get_sock_fatal_handles(void) return result; } + +/* + * init current transaction handles for connections + */ +void +init_transaction_handles(void) +{ + MemoryContext oldcontext; + oldcontext = MemoryContextSwitchTo(TopMemoryContext); + if (current_transaction_handles == NULL) + { + current_transaction_handles = (PGXCNodeAllHandles *) palloc0(sizeof(PGXCNodeAllHandles)); + } + + current_transaction_handles->primary_handle = NULL; + + 
current_transaction_handles->dn_conn_count = 0; + if (current_transaction_handles->datanode_handles == NULL) + { + current_transaction_handles->datanode_handles = (PGXCNodeHandle **) palloc(NumDataNodes * sizeof(PGXCNodeHandle *)); + } + else + { + current_transaction_handles->datanode_handles = (PGXCNodeHandle **) repalloc(current_transaction_handles->datanode_handles, NumDataNodes * sizeof(PGXCNodeHandle *)); + } + + current_transaction_handles->co_conn_count = 0; + if (current_transaction_handles->coord_handles == NULL) + { + current_transaction_handles->coord_handles = (PGXCNodeHandle **) palloc(NumCoords * sizeof(PGXCNodeHandle *)); + } + else + { + current_transaction_handles->coord_handles = (PGXCNodeHandle **) repalloc(current_transaction_handles->coord_handles, NumCoords * sizeof(PGXCNodeHandle *)); + } + MemoryContextSwitchTo(oldcontext); + return; +} + +/* + * reset current transaction handles + */ +void +reset_transaction_handles(void) +{ + if (current_transaction_handles == NULL) + { + return; + } + + current_transaction_handles->dn_conn_count = 0; + current_transaction_handles->co_conn_count = 0; + return; +} + +/* + * register current transaction handle to current_transaction_handles + */ +void +register_transaction_handles(PGXCNodeHandle* handle) +{ + int i = 0; + char node_type = handle->node_type; + + if (!IS_PGXC_LOCAL_COORDINATOR) + { + return; + } + + Assert (current_transaction_handles != NULL); + + if (node_type == PGXC_NODE_DATANODE) + { + for (i = 0; i < current_transaction_handles->dn_conn_count; i++) + { + if (current_transaction_handles->datanode_handles[i] == handle) + { + return; + } + } + current_transaction_handles->datanode_handles[current_transaction_handles->dn_conn_count++] = handle; + Assert(current_transaction_handles->dn_conn_count <= NumDataNodes); + } + else if (node_type == PGXC_NODE_COORDINATOR) + { + for (i = 0; i < current_transaction_handles->co_conn_count; i++) + { + if (current_transaction_handles->coord_handles[i] == handle) + { + return; + } + } + current_transaction_handles->coord_handles[current_transaction_handles->co_conn_count++] = handle; + Assert(current_transaction_handles->co_conn_count <= NumCoords); + } + else + { + elog(ERROR, "invalid node_type %c in register_transaction_handles", node_type); + } +} + #endif /* Free PGXCNodeAllHandles structure */ diff --git a/src/include/pgxc/pgxcnode.h b/src/include/pgxc/pgxcnode.h index 22075b68..8d51d1dc 100644 --- a/src/include/pgxc/pgxcnode.h +++ b/src/include/pgxc/pgxcnode.h @@ -129,6 +129,7 @@ struct pgxc_node_handle long recv_datarows; bool plpgsql_need_begin_sub_txn; bool plpgsql_need_begin_txn; + char node_type; #endif }; typedef struct pgxc_node_handle PGXCNodeHandle; @@ -143,6 +144,8 @@ typedef struct PGXCNodeHandle **coord_handles; /* an array of Coordinator handles */ } PGXCNodeAllHandles; +extern PGXCNodeAllHandles *current_transaction_handles; + extern volatile bool HandlesInvalidatePending; extern void InitMultinodeExecutor(bool is_force); @@ -178,9 +181,13 @@ extern PGXCNodeAllHandles *get_handles(List *datanodelist, List *coordlist, extern PGXCNodeAllHandles *get_current_handles(void); #ifdef __TBASE__ +extern PGXCNodeAllHandles *get_current_txn_handles(void); extern PGXCNodeAllHandles *get_current_cn_handles(void); extern PGXCNodeAllHandles *get_current_dn_handles(void); extern PGXCNodeAllHandles * get_sock_fatal_handles(void); +extern void init_transaction_handles(void); +extern void reset_transaction_handles(void); +extern void register_transaction_handles(PGXCNodeHandle* 
handle); #endif extern void pfree_pgxc_all_handles(PGXCNodeAllHandles *handles); From a9fd7bd7dc9d050bf3a782ba57c426d710954db7 Mon Sep 17 00:00:00 2001 From: andrelin Date: Thu, 11 Nov 2021 17:24:20 +0800 Subject: [PATCH 439/578] Fix concurrent update when existing initplan http://tapd.oa.com/TBase_Oracle_Migration/bugtrace/bugs/view?bug_id=1020421696094082541 --- src/backend/executor/execMain.c | 24 +++++------ src/backend/pgxc/pool/execRemote.c | 20 +++++++-- src/test/regress/expected/insert.out | 63 ++++++++++++++++++++++++++++ src/test/regress/sql/insert.sql | 41 ++++++++++++++++++ 4 files changed, 131 insertions(+), 17 deletions(-) diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c index ca60ff43..aa69e7a3 100644 --- a/src/backend/executor/execMain.c +++ b/src/backend/executor/execMain.c @@ -3188,12 +3188,7 @@ EvalPlanQualInit(EPQState *epqstate, EState *estate, epqstate->planstate = NULL; epqstate->origslot = NULL; /* ... and remember data that EvalPlanQualBegin will need */ - epqstate->plan = copyObject(subplan); - /* Reset cursor name of remote subplans if any */ - ResetRemoteSubplanCursor(epqstate->plan, - (estate->es_plannedstmt ? - estate->es_plannedstmt->subplans : NULL), - "epq"); + epqstate->plan = subplan; epqstate->arowMarks = auxrowmarks; epqstate->epqParam = epqParam; } @@ -3209,12 +3204,7 @@ EvalPlanQualSetPlan(EPQState *epqstate, Plan *subplan, List *auxrowmarks) /* If we have a live EPQ query, shut it down */ EvalPlanQualEnd(epqstate); /* And set/change the plan pointer */ - epqstate->plan = copyObject(subplan); - /* Reset cursor name of remote subplans if any */ - ResetRemoteSubplanCursor(epqstate->plan, - (epqstate->parentestate->es_plannedstmt ? - epqstate->parentestate->es_plannedstmt->subplans : NULL), - "epq"); + epqstate->plan = subplan; /* The rowmarks depend on the plan, too */ epqstate->arowMarks = auxrowmarks; } @@ -3448,7 +3438,7 @@ EvalPlanQualBegin(EPQState *epqstate, EState *parentestate) if (estate == NULL) { /* First time through, so create a child EState */ - EvalPlanQualStart(epqstate, parentestate, epqstate->plan); + EvalPlanQualStart(epqstate, parentestate, copyObject(epqstate->plan)); } else { @@ -3522,9 +3512,15 @@ EvalPlanQualStart(EPQState *epqstate, EState *parentestate, Plan *planTree) estate->es_snapshot = parentestate->es_snapshot; estate->es_crosscheck_snapshot = parentestate->es_crosscheck_snapshot; estate->es_range_table = parentestate->es_range_table; - estate->es_plannedstmt = parentestate->es_plannedstmt; + estate->es_plannedstmt = copyObject(parentestate->es_plannedstmt); estate->es_junkFilter = parentestate->es_junkFilter; estate->es_output_cid = parentestate->es_output_cid; + + ResetRemoteSubplanCursor(planTree, + (estate->es_plannedstmt ? 
+ estate->es_plannedstmt->subplans : NULL), + "epq"); + if (parentestate->es_num_result_relations > 0) { int numResultRelations = parentestate->es_num_result_relations; diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 28af8f31..31ee6014 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -10869,8 +10869,22 @@ encode_epqcontext(PlanState *planstate, char **result) for (i = 0; i < ntuples; i++) { - ItemPointerData tid = estate->es_epqTuple[i]->t_self; - int rtidx = i + 1; + ItemPointerData tid; + int16 rtidx; + int nodeid; + + if (estate->es_epqTuple[i] == NULL) + { + memset(&tid, 0, sizeof(ItemPointerData)); + rtidx = 0; + nodeid = 0; + } + else + { + tid = estate->es_epqTuple[i]->t_self; + rtidx = i + 1; + nodeid = estate->es_epqTuple[i]->t_xc_node_id; + } n16 = htons(rtidx); appendBinaryStringInfo(&buf, (char *) &n16, 2); @@ -10880,7 +10894,7 @@ encode_epqcontext(PlanState *planstate, char **result) appendBinaryStringInfo(&buf, (char *) &n16, 2); n16 = htons(tid.ip_posid); appendBinaryStringInfo(&buf, (char *) &n16, 2); - n32 = htonl(estate->es_epqTuple[i]->t_xc_node_id); + n32 = htonl(nodeid); appendBinaryStringInfo(&buf, (char *) &n32, 4); } diff --git a/src/test/regress/expected/insert.out b/src/test/regress/expected/insert.out index 528cd56d..97d3c276 100644 --- a/src/test/regress/expected/insert.out +++ b/src/test/regress/expected/insert.out @@ -986,3 +986,66 @@ with baseInfo as(select * from t1) insert into t2 select * from baseInfo; drop table t1; drop table t2; +-- test insert with returning in JDBC +drop table if exists insertwithret; +NOTICE: table "insertwithret" does not exist, skipping +create table insertwithret(a int, b text, c int); +prepare p0(int,text,int) as insert into insertwithret values($1, $2, $3) returning a; +prepare p1(int,text,int) as insert into insertwithret values($1, $2, $3) returning a,b; +prepare p2(int,text,int) as insert into insertwithret values($1, $2, $3) returning c; +prepare p3(int,text,int) as insert into insertwithret values($1, $2, $3); +execute p0(1, 'abc', 1); + a +--- + 1 +(1 row) + +execute p1(1, 'abc', 1); + a | b +---+----- + 1 | abc +(1 row) + +execute p2(1, 'abc', 1); + c +--- + 1 +(1 row) + +execute p3(1, 'abc', 1); +-- test complex INSERT +CREATE TABLE ods_time_record ( + id character(8), + mintime character varying(50), + describe character varying(50), + systemtime timestamp(6) without time zone DEFAULT orcl_sysdate(), + remarks character varying(255), + total numeric(255,0) +) +DISTRIBUTE BY SHARD (id) to GROUP default_group; +NOTICE: Replica identity is needed for shard table, please add to this table through "alter table" command. +CREATE TABLE ods_today_st_river_r ( + stcd character(8) NOT NULL, + tm timestamp(6) without time zone NOT NULL, + z numeric(7,3), + q numeric(9,3), + xsa numeric(9,3), + xsavv numeric(5,3), + xsmxv numeric(5,3), + flwchrcd character(1), + wptn character(1), + msqmt character(1), + msamt character(1), + msvmt character(1), + moditime timestamp(6) without time zone +) +DISTRIBUTE BY SHARD (stcd) to GROUP default_group; +NOTICE: Replica identity is needed for shard table, please add to this table through "alter table" command. 
+COPY ods_time_record (id, mintime, describe, systemtime, remarks, total) FROM stdin; +COPY ods_today_st_river_r (stcd, tm, z, q, xsa, xsavv, xsmxv, flwchrcd, wptn, msqmt, msamt, msvmt, moditime) FROM stdin; +ALTER TABLE ONLY ods_today_st_river_r +ADD CONSTRAINT ods_today_st_river_r_pkey PRIMARY KEY (tm, stcd); +insert into ods_time_record ("id",mintime,"describe",remarks,total) +select '1' as "id",max(moditime) as mintime ,'河道' as "describe" ,'st_river_r' as remarks, +(select count(1) from ods_today_st_river_r ) as total from ods_today_st_river_r; +drop table ods_today_st_river_r, ods_time_record; diff --git a/src/test/regress/sql/insert.sql b/src/test/regress/sql/insert.sql index 5591b65e..be52ef93 100644 --- a/src/test/regress/sql/insert.sql +++ b/src/test/regress/sql/insert.sql @@ -585,3 +585,44 @@ execute p0(1, 'abc', 1); execute p1(1, 'abc', 1); execute p2(1, 'abc', 1); execute p3(1, 'abc', 1); + +-- test complex INSERT +CREATE TABLE ods_time_record ( + id character(8), + mintime character varying(50), + describe character varying(50), + systemtime timestamp(6) without time zone DEFAULT orcl_sysdate(), + remarks character varying(255), + total numeric(255,0) +) +DISTRIBUTE BY SHARD (id) to GROUP default_group; +CREATE TABLE ods_today_st_river_r ( + stcd character(8) NOT NULL, + tm timestamp(6) without time zone NOT NULL, + z numeric(7,3), + q numeric(9,3), + xsa numeric(9,3), + xsavv numeric(5,3), + xsmxv numeric(5,3), + flwchrcd character(1), + wptn character(1), + msqmt character(1), + msamt character(1), + msvmt character(1), + moditime timestamp(6) without time zone +) +DISTRIBUTE BY SHARD (stcd) to GROUP default_group; +COPY ods_time_record (id, mintime, describe, systemtime, remarks, total) FROM stdin; +1 2021-11-04 00:00:00 st_river_r 2021-11-04 00:00:00 河道水情表 0 +1 2021-11-04 00:00:00 st_river_r 2021-11-04 00:00:00 河道水情表 0 +\. +COPY ods_today_st_river_r (stcd, tm, z, q, xsa, xsavv, xsmxv, flwchrcd, wptn, msqmt, msamt, msvmt, moditime) FROM stdin; +30702300 2021-11-11 12:30:00 96.710 1.150 \N \N 0.000 \N 5 1 \N \N 2021-11-11 12:31:54 +41400990 2021-11-11 12:25:00 1.020 \N \N \N 0.000 \N 6 \N \N \N 2021-11-11 12:31:54 +\. +ALTER TABLE ONLY ods_today_st_river_r +ADD CONSTRAINT ods_today_st_river_r_pkey PRIMARY KEY (tm, stcd); +insert into ods_time_record ("id",mintime,"describe",remarks,total) +select '1' as "id",max(moditime) as mintime ,'河道' as "describe" ,'st_river_r' as remarks, +(select count(1) from ods_today_st_river_r ) as total from ods_today_st_river_r; +drop table ods_today_st_river_r, ods_time_record; \ No newline at end of file From 984f5104c01bcb6547169ba72dbc9e19673eb8a5 Mon Sep 17 00:00:00 2001 From: ceciliasu Date: Fri, 12 Nov 2021 15:19:02 +0800 Subject: [PATCH 440/578] Fix a bug in DecodeMultiInsert, which caused the shardID in tuples decoded from MULTI_INSERT xlog always be 0. 
(merge request !918) TAPD: --story=869170029 --- src/backend/replication/logical/decode.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/backend/replication/logical/decode.c b/src/backend/replication/logical/decode.c index f3577766..fa3a078e 100644 --- a/src/backend/replication/logical/decode.c +++ b/src/backend/replication/logical/decode.c @@ -1381,6 +1381,9 @@ DecodeMultiInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) header->t_infomask = xlhdr->t_infomask; header->t_infomask2 = xlhdr->t_infomask2; header->t_hoff = xlhdr->t_hoff; +#ifdef __STORAGE_SCALABLE__ + header->t_shardid = xlhdr->t_shardid; +#endif } /* From 75a5da40ac8725b22a952ab341866f9513ad902f Mon Sep 17 00:00:00 2001 From: sigmalin Date: Wed, 10 Nov 2021 17:05:11 +0800 Subject: [PATCH 441/578] fix function scan cache lookup failed for type http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131093147751 (merge request !906) Squash merge branch 'sigmalin001' into 'Tbase_v2.15.19.4' fix function scan cache lookup failed for type http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131093147751 --- src/backend/nodes/outfuncs.c | 7 +++++++ src/backend/nodes/readfuncs.c | 7 +++++++ 2 files changed, 14 insertions(+) diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c index 8266ad33..86320cfb 100644 --- a/src/backend/nodes/outfuncs.c +++ b/src/backend/nodes/outfuncs.c @@ -4538,7 +4538,14 @@ _outRangeTblFunction(StringInfo str, const RangeTblFunction *node) WRITE_NODE_FIELD(funcexpr); WRITE_INT_FIELD(funccolcount); WRITE_NODE_FIELD(funccolnames); + if (portable_output) + { + WRITE_TYPID_LIST_FIELD(funccoltypes); + } + else + { WRITE_NODE_FIELD(funccoltypes); + } WRITE_NODE_FIELD(funccoltypmods); WRITE_NODE_FIELD(funccolcollations); WRITE_BITMAPSET_FIELD(funcparams); diff --git a/src/backend/nodes/readfuncs.c b/src/backend/nodes/readfuncs.c index db2b9441..33d51a1e 100644 --- a/src/backend/nodes/readfuncs.c +++ b/src/backend/nodes/readfuncs.c @@ -2231,7 +2231,14 @@ _readRangeTblFunction(void) READ_NODE_FIELD(funcexpr); READ_INT_FIELD(funccolcount); READ_NODE_FIELD(funccolnames); + if (portable_input) + { + READ_TYPID_LIST_FIELD(funccoltypes); + } + else + { READ_NODE_FIELD(funccoltypes); + } READ_NODE_FIELD(funccoltypmods); READ_NODE_FIELD(funccolcollations); READ_BITMAPSET_FIELD(funcparams); From bc3231c6269ba713ea64c5fae722a009b1287450 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Fri, 12 Nov 2021 16:25:59 +0800 Subject: [PATCH 442/578] disable PgxcNodeRefresh fix http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131094064259 (merge request !919) Squash merge branch 'sigmalin_v2' into 'Tbase_v2.15.19.3' disable PgxcNodeRefresh fix http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131094064259 --- src/backend/pgxc/pool/poolutils.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/backend/pgxc/pool/poolutils.c b/src/backend/pgxc/pool/poolutils.c index 0b684619..e69fae07 100644 --- a/src/backend/pgxc/pool/poolutils.c +++ b/src/backend/pgxc/pool/poolutils.c @@ -111,9 +111,11 @@ pgxc_pool_reload(PG_FUNCTION_ARGS) if (PgxcNodeRefresh()) PG_RETURN_BOOL(true); #endif +#if 0 + /* TODO: disable node refresh now, consider the handle fd state and enable refresh later */ /* Always send reload msg to pooler */ PgxcNodeRefresh(); - +#endif /* Session is being reloaded, drop prepared and temporary objects */ DropAllPreparedStatements(); From 2a3c5f3e93b5553efae20514911e71857eaf1528 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Tue, 13 Jul 2021 15:45:30 +0800 Subject: [PATCH 443/578] fix 
dblink error prepared statement http://tapd.oa.com/TBase_Oracle_Migration/bugtrace/bugs/view?bug_id=1020421696089823793 --- src/backend/tcop/postgres.c | 83 ++++++++++++++++++++++++++++++++++++- 1 file changed, 82 insertions(+), 1 deletion(-) diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index a607a515..6e88c1f3 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -1138,6 +1138,76 @@ pg_plan_queries(List *querytrees, int cursorOptions, ParamListInfo boundParams) return stmt_list; } +/* + * get myself query string from original query string, + * if the query string contain multi stmt + */ +static char* +get_myself_query_string(char* query_string, char** out_query_string) +{ + char *string_delimeter = NULL; + char *myself_query_string = NULL; + int myself_query_string_len = 0; + int pos = 0; + bool in_quotation = false; + int query_string_len = 0; + + if (query_string && query_string[0] != '\0') + { + /* skip space and redundant ';' */ + while (*query_string != '\0') + { + if (ch_is_space(*query_string) || *query_string == ';') + { + query_string++; + } + else + { + break; + } + } + + if (*query_string == '\0') + { + *out_query_string = NULL; + return NULL; + } + + /* find ';' in query string, be careful of '\'' */ + query_string_len = strlen(query_string); + for (pos = 0; pos < query_string_len; pos++) + { + if (query_string[pos] == '\'') + { + in_quotation = (in_quotation) ? false : true; + } + + if (query_string[pos] == ';' && !in_quotation) + { + string_delimeter = &query_string[pos]; + break; + } + } + + if (string_delimeter == NULL) + { + myself_query_string = query_string; + query_string = NULL; + } + else + { + myself_query_string_len = string_delimeter - query_string; + myself_query_string = palloc(myself_query_string_len + 1); + memcpy(myself_query_string, query_string, myself_query_string_len); + myself_query_string[myself_query_string_len] = '\0'; + + query_string = string_delimeter + 1; + } + } + + *out_query_string = myself_query_string; + return query_string; +} /* * exec_simple_query @@ -1156,6 +1226,7 @@ exec_simple_query(const char *query_string) bool isTopLevel; char msec_str[32]; bool multiCommands = false; + char *query_string_tmp = NULL; /* * Report query to various monitoring facilities. @@ -1227,6 +1298,8 @@ exec_simple_query(const char *query_string) errmsg("COMMIT or ROLLBACK " "in multi-statement queries not allowed"))); } + + query_string_tmp = (char*) query_string; } /* @@ -1284,6 +1357,14 @@ exec_simple_query(const char *query_string) Portal portal; DestReceiver *receiver; int16 format; + char *myself_query_string = NULL; + + if (query_string_tmp && query_string_tmp[0] != '\0') + { + /* get this portal's query when has multi parse tree */ + query_string_tmp = get_myself_query_string(query_string_tmp, &myself_query_string); + } + #ifdef PGXC /* @@ -1446,7 +1527,7 @@ exec_simple_query(const char *query_string) */ PortalDefineQuery(portal, NULL, - query_string, + (myself_query_string) ? 
myself_query_string : query_string, commandTag, plantree_list, NULL); From 59f71a2ac33925d3977b8fe1609a843758c6d9d3 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Mon, 15 Nov 2021 16:14:09 +0800 Subject: [PATCH 444/578] fix compile error, cherry-pick from 693dda04e738c8a1c8aa9943d38d9367fb8f6a35 --- src/backend/tcop/postgres.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index 6e88c1f3..f7bb987a 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -1138,6 +1138,19 @@ pg_plan_queries(List *querytrees, int cursorOptions, ParamListInfo boundParams) return stmt_list; } +static bool +ch_is_space(char ch) +{ + if (ch == ' ' || ch == '\n' || ch == '\t' || ch == '\r' || ch == '\f') + { + return true; + } + else + { + return false; + } +} + /* * get myself query string from original query string, * if the query string contain multi stmt From a69038032c9d21d3031769698c85d063605eab63 Mon Sep 17 00:00:00 2001 From: andrelin Date: Tue, 2 Nov 2021 16:33:30 +0800 Subject: [PATCH 445/578] Set index valid after DNs have finished index creation when creating an index concurrently tapd: http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131092311553 --- src/backend/commands/indexcmds.c | 40 ++++++++++------ src/backend/tcop/utility.c | 81 ++++++++++++++++++++++++++++---- src/include/catalog/index.h | 1 + 3 files changed, 98 insertions(+), 24 deletions(-) diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c index 03245150..3d5530b3 100644 --- a/src/backend/commands/indexcmds.c +++ b/src/backend/commands/indexcmds.c @@ -1341,17 +1341,35 @@ DefineIndex(Oid relationId, * Index can now be marked valid -- update its pg_index entry */ #ifdef __TBASE__ - rel = heap_open(relationId, NoLock); - - if (!RELATION_IS_INTERVAL(rel)) + /* + * The local coordinator sets this after the command has been sent to the + * DNs and other CNs; see ProcessUtilityPost. + */ + if (!IS_PGXC_LOCAL_COORDINATOR) { #endif - index_set_state_flags(indexRelationId, INDEX_CREATE_SET_VALID); + IndexCreateSetValid(indexRelationId, heaprelid.relId); #ifdef __TBASE__ } - - heap_close(rel, NoLock); #endif + + /* + * Last thing to do is release the session-level lock on the parent table. + */ + UnlockRelationIdForSession(&heaprelid, ShareUpdateExclusiveLock); + + return address; +} + +/* + * Set the index as valid in pg_index; called after the third phase of + * concurrent index creation. Remember to call it on the CN AFTER the DNs do. + */ +void +IndexCreateSetValid(Oid index, Oid rel) +{ + index_set_state_flags(index, INDEX_CREATE_SET_VALID); + /* * The pg_index update will cause backends (including this one) to update * relcache entries for the index itself, but we should also send a @@ -1360,14 +1378,8 @@ DefineIndex(Oid relationId, * would be useful. (Note that our earlier commits did not create reasons * to replan; so relcache flush on the index itself was sufficient.) */ - CacheInvalidateRelcacheByRelid(heaprelid.relId); - - /* - * Last thing to do is release the session-level lock on the parent table.
- */ - UnlockRelationIdForSession(&heaprelid, ShareUpdateExclusiveLock); - - return address; + if (OidIsValid(rel)) + CacheInvalidateRelcacheByRelid(rel); } diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c index a984b9e6..c90f1759 100644 --- a/src/backend/tcop/utility.c +++ b/src/backend/tcop/utility.c @@ -1648,6 +1648,9 @@ ProcessUtilityPost(PlannedStmt *pstmt, auto_commit = stmt->concurrent; if (stmt->isconstraint) exec_type = EXEC_ON_NONE; + + if (exec_type == EXEC_ON_ALL_NODES && stmt->concurrent) + exec_type = EXEC_ON_DATANODES; } break; @@ -1793,8 +1796,76 @@ ProcessUtilityPost(PlannedStmt *pstmt, #endif if (IS_PGXC_LOCAL_COORDINATOR) + { ExecUtilityStmtOnNodes(parsetree, queryString, NULL, sentToRemote, auto_commit, exec_type, is_temp, add_context); + + if (IsA(parsetree, IndexStmt) && + ((IndexStmt *) parsetree)->concurrent) + { + /* + * When we get here, all DN have done with index creation, time to set index + * valid on CN. + */ + IndexStmt *stmt = (IndexStmt *) parsetree; + Oid indexid = InvalidOid; + Relation rel = relation_openrv_extended(stmt->relation, NoLock, true); + + /* exec_type can't be EXEC_ON_ALL_NODES, as changed in "switch case" above */ + Assert(exec_type != EXEC_ON_ALL_NODES); + + if (rel == NULL) + { + /* + * Failed to get enough message from stmt, have to guess a namespace. + * This should not happen but ... + */ + indexid = RelnameGetRelid(stmt->idxname); + CommitTransactionCommand(); + StartTransactionCommand(); + + IndexCreateSetValid(indexid, InvalidOid); + } + else + { + Oid relid = RelationGetRelid(rel); + Oid namespace = RelationGetNamespace(rel); + int nParts = 0; + int i; + Oid child_index; + Oid child_rel; + + indexid = get_relname_relid(stmt->idxname, namespace); + + if (rel != NULL && RELATION_IS_INTERVAL(rel)) + nParts = RelationGetNParts(rel); + relation_close(rel, NoLock); + + CommitTransactionCommand(); + StartTransactionCommand(); + IndexCreateSetValid(indexid, relid); + + /* if there are interval partitions, do the same thing */ + for (i = 0; i < nParts; i++) + { + child_index = get_relname_relid(GetPartitionName(indexid, i, true), namespace); + child_rel = get_relname_relid(GetPartitionName(relid, i, false), namespace); + + IndexCreateSetValid(child_index, child_rel); + } + + /* + * Notice: community version of partition table is not allow to build + * index concurrently, so don't bother here. + */ + } + + /* finally, tell other CN to create an index */ + if (exec_type != EXEC_ON_NONE) + ExecUtilityStmtOnNodes(parsetree, queryString, NULL, sentToRemote, auto_commit, + EXEC_ON_COORDS, is_temp, add_context); + } + } } #ifdef __TBASE__ @@ -4374,16 +4445,6 @@ ProcessUtilitySlow(ParseState *pstate, } MemoryContextDelete(temp); - - if (stmt->concurrent) - { - /* - * Commit this transaction to make the indisready update visible. 
- */ - CommitTransactionCommand(); - StartTransactionCommand(); - index_set_state_flags(indexOid, INDEX_CREATE_SET_VALID); - } } else if (RELATION_IS_CHILD(rel)) { diff --git a/src/include/catalog/index.h b/src/include/catalog/index.h index c60ad12f..d37be02e 100644 --- a/src/include/catalog/index.h +++ b/src/include/catalog/index.h @@ -211,5 +211,6 @@ extern bool index_is_interval(Oid indexId); #endif extern void IndexSetParentIndex(Relation idx, Oid parentOid); +extern void IndexCreateSetValid(Oid index, Oid rel); #endif /* INDEX_H */ From d4cb6fb47532bffd4f466e54b9fc8ecfe6bce340 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Mon, 22 Nov 2021 11:46:53 +0800 Subject: [PATCH 446/578] fix error msg when reset handles fix http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131094179541 (merge request !938) Squash merge branch 'sigmalin002' into 'Tbase_v2.15.19.3' fix error msg when reset handles fix http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131094179541 TAPD: --bug=094179541 --- src/backend/pgxc/pool/execRemote.c | 18 ++++++++-- src/backend/pgxc/pool/pgxcnode.c | 57 ++++++++++++++++++++++-------- src/include/pgxc/pgxcnode.h | 2 +- 3 files changed, 59 insertions(+), 18 deletions(-) diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 31ee6014..20cab418 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -3733,7 +3733,15 @@ pgxc_node_begin(int conn_count, PGXCNodeHandle **connections, { for (i = 0; i < new_count; i++) { - pgxc_node_set_query(new_connections[i], init_str); + if (pgxc_node_set_query(new_connections[i], init_str)) + { + /* + * print log here and return eof indicates execution failure + */ + elog(LOG, "pgxc_node_begin send %s to node %s, pid:%d failed", init_str, + new_connections[i]->nodename, new_connections[i]->backend_pid); + return EOF; + } elog(DEBUG5, "pgxc_node_begin send %s to node %s, pid:%d", init_str, new_connections[i]->nodename, new_connections[i]->backend_pid); } @@ -7046,7 +7054,13 @@ LeaderCnExecRemoteUtility(RemoteQuery *node, char *init_str = PGXCNodeGetSessionParamStr(); if (init_str) { - pgxc_node_set_query(leader_cn_conn, init_str); + if (pgxc_node_set_query(leader_cn_conn, init_str)) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("pgxc_node_set_query send %s to node %s, pid:%d failed", init_str, + leader_cn_conn->nodename, leader_cn_conn->backend_pid))); + } } SetPlpgsqlTransactionBegin(leader_cn_conn); diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c index 8968700d..23225be6 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -149,7 +149,7 @@ static bool DoRefreshRemoteHandles(void); #ifdef XCP static void pgxc_node_init(PGXCNodeHandle *handle, int sock, - bool global_session, int pid); + bool global_session, int pid, bool is_reset_handle); #else static void pgxc_node_init(PGXCNodeHandle *handle, int sock); #endif @@ -667,8 +667,8 @@ pgxc_node_all_free(void) * Structure stores state info and I/O buffers */ static void -pgxc_node_init(PGXCNodeHandle *handle, int sock, bool global_session, int pid) -{// #lizard forgives +pgxc_node_init(PGXCNodeHandle *handle, int sock, bool global_session, int pid, bool is_reset_handle) +{ char *init_str; handle->sock = sock; @@ -701,9 +701,20 @@ pgxc_node_init(PGXCNodeHandle *handle, int sock, bool global_session, int pid) if (global_session) { init_str = PGXCNodeGetSessionParamStr(); - if (init_str) + if (init_str && pgxc_node_set_query(handle, init_str)) + 
{ + if (is_reset_handle) + { + /* if it is a reset handle, do not throw error, just set handle as error state */ + PGXCNodeSetConnectionState(handle, DN_CONNECTION_STATE_ERROR_FATAL); + elog(WARNING, "pgxc_node_set_query send %s to node %s, pid:%d failed", init_str, + handle->nodename, handle->backend_pid); + } + else { - pgxc_node_set_query(handle, init_str); + elog(ERROR, "pgxc_node_set_query send %s to node %s, pid:%d failed", init_str, + handle->nodename, handle->backend_pid); + } } } @@ -1546,6 +1557,7 @@ release_handles(bool force) /* * Reset all Datanode and Coordinator connections occupied memory. + * TODO: fix implicit transaction do not commit on dn and remove reset_handles */ void reset_handles(void) @@ -1570,7 +1582,7 @@ reset_handles(void) if (handle->sock != NO_SOCKET) { - pgxc_node_init(handle, handle->sock, true, handle->backend_pid); + pgxc_node_init(handle, handle->sock, true, handle->backend_pid, true); } } @@ -1580,7 +1592,7 @@ reset_handles(void) if (handle->sock != NO_SOCKET) { - pgxc_node_init(handle, handle->sock, true, handle->backend_pid); + pgxc_node_init(handle, handle->sock, true, handle->backend_pid, true); } } @@ -1593,10 +1605,16 @@ reset_handles(void) if (handle->sock != NO_SOCKET) { - pgxc_node_init(handle, handle->sock, true, handle->backend_pid); + pgxc_node_init(handle, handle->sock, true, handle->backend_pid, true); } } } + + if (validate_handles()) + { + elog(LOG, "found bad remote node connections, force release handles now"); + release_handles(true); + } } /* @@ -3727,7 +3745,7 @@ get_any_handle(List *datanodelist) node_handle = &dn_handles[node]; - pgxc_node_init(node_handle, fds[0], true, pids[0]); + pgxc_node_init(node_handle, fds[0], true, pids[0], false); datanode_count++; elog(DEBUG1, "Established a connection with datanode \"%s\"," @@ -4003,7 +4021,7 @@ get_handles(List *datanodelist, List *coordlist, bool is_coord_only_query, bool continue; } - pgxc_node_init(node_handle, fdsock, is_global_session, be_pid); + pgxc_node_init(node_handle, fdsock, is_global_session, be_pid, false); dn_handles[node] = *node_handle; datanode_count++; @@ -4068,7 +4086,7 @@ get_handles(List *datanodelist, List *coordlist, bool is_coord_only_query, bool continue; } - pgxc_node_init(node_handle, fdsock, is_global_session, be_pid); + pgxc_node_init(node_handle, fdsock, is_global_session, be_pid, false); co_handles[node] = *node_handle; coord_count++; @@ -5094,14 +5112,18 @@ PGXCNodeGetTransactionParamStr(void) /* * Send down specified query, read and discard all responses until ReadyForQuery */ -void +int pgxc_node_set_query(PGXCNodeHandle *handle, const char *set_query) { if (pgxc_node_send_query(handle, set_query) != 0) { - ereport(ERROR, + /* + * print log only and decide whether to throw an error at the place where it is called + */ + ereport(LOG, (errcode(ERRCODE_INTERNAL_ERROR), errmsg("Failed to send query %s",set_query))); + return EOF; } /* * Now read responses until ReadyForQuery. 
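With this change pgxc_node_set_query no longer raises an error itself; it logs the failure and returns EOF, and each call site chooses the severity. A minimal sketch of the intended caller pattern, using only names that appear in the hunks above (the wrapper function itself is illustrative, not part of the patch):

static void
send_session_params(PGXCNodeHandle *handle, const char *init_str)
{
	/* pgxc_node_set_query now returns 0 on success and EOF on failure */
	if (pgxc_node_set_query(handle, init_str))
	{
		/* the caller decides: mark the connection bad, then error out or only warn */
		PGXCNodeSetConnectionState(handle, DN_CONNECTION_STATE_ERROR_FATAL);
		elog(ERROR, "failed to send session parameters to node %s, pid %d",
			 handle->nodename, handle->backend_pid);
	}
}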
@@ -5142,8 +5164,11 @@ pgxc_node_set_query(PGXCNodeHandle *handle, const char *set_query) { PGXCNodeHandleError(handle, msg, msglen); PGXCNodeSetConnectionState(handle, DN_CONNECTION_STATE_ERROR_FATAL); - elog(ERROR,"pgxc_node_set_query: %s",handle->error); - break; + /* + * print log only and decide whether to throw an error at the place where it is called + */ + elog(LOG,"pgxc_node_set_query: %s",handle->error); + return EOF; } if (msgtype == 'Z') /* ReadyForQuery */ @@ -5154,6 +5179,8 @@ pgxc_node_set_query(PGXCNodeHandle *handle, const char *set_query) break; } } + + return 0; } diff --git a/src/include/pgxc/pgxcnode.h b/src/include/pgxc/pgxcnode.h index 8d51d1dc..91db953a 100644 --- a/src/include/pgxc/pgxcnode.h +++ b/src/include/pgxc/pgxcnode.h @@ -281,7 +281,7 @@ extern void PGXCNodeSetParam(bool local, const char *name, const char *value, extern void PGXCNodeResetParams(bool only_local); extern char *PGXCNodeGetSessionParamStr(void); extern char *PGXCNodeGetTransactionParamStr(void); -extern void pgxc_node_set_query(PGXCNodeHandle *handle, const char *set_query); +extern int pgxc_node_set_query(PGXCNodeHandle *handle, const char *set_query); extern void RequestInvalidateRemoteHandles(void); extern void RequestRefreshRemoteHandles(void); extern bool PoolerMessagesPending(void); From 7149d0b4c2f486b6994358b6365b70fcbaf78f93 Mon Sep 17 00:00:00 2001 From: challzhang Date: Thu, 2 Dec 2021 20:02:51 +0800 Subject: [PATCH 447/578] cn analyze ignore toast fields in tuples gathered from dn --- src/backend/commands/analyze.c | 58 ++++++++++++++++++++++++++++++++-- 1 file changed, 56 insertions(+), 2 deletions(-) diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c index 48cf8d22..d342a1ac 100644 --- a/src/backend/commands/analyze.c +++ b/src/backend/commands/analyze.c @@ -1258,6 +1258,7 @@ acquire_sample_rows(Relation onerel, int elevel, { ItemId itemid; HeapTupleData targtuple; + HeapTuple newTuple = &targtuple; bool sample_it = false; itemid = PageGetItemId(targpage, targoffset); @@ -1351,6 +1352,59 @@ acquire_sample_rows(Relation onerel, int elevel, if (sample_it) { /* + * If connection is from Coordinator on datanodes, we discard TOAST fields in sample, + * which will lighten the load of memory usage on coordinator. + */ + if (IS_PGXC_DATANODE && IsConnFromCoord()) + { + Datum *values; + bool *nulls; + TupleDesc tupdesc = NULL; + int nattrs; + Form_pg_attribute *attrs; + int i; + + tupdesc = RelationGetDescr(onerel); + nattrs = tupdesc->natts; + attrs = tupdesc->attrs; + + values = (Datum *) palloc0(nattrs * sizeof(Datum)); + nulls = (bool *) palloc0(nattrs * sizeof(bool)); + + heap_deform_tuple(&targtuple, tupdesc, values, nulls); + + for (i = 0; i < nattrs; i++) + { + if (!attrs[i]->attbyval && attrs[i]->attlen == -1) + { + /* varlena */ + Pointer val = DatumGetPointer(values[i]); + if (val == NULL || VARATT_IS_EXTERNAL(val) || VARATT_IS_COMPRESSED(val)) + { + nulls[i] = true; + } + } + } + + newTuple = heap_form_tuple(tupdesc, values, nulls); + + pfree(values); + pfree(nulls); + + /* + * copy the identification info of the old tuple: t_ctid, t_self, and OID + * (if any) + */ + newTuple->t_data->t_ctid = targtuple.t_data->t_ctid; + newTuple->t_self = targtuple.t_self; + newTuple->t_tableOid = targtuple.t_tableOid; +#ifdef PGXC + newTuple->t_xc_node_id = targtuple.t_xc_node_id; +#endif + if (tupdesc->tdhasoid) + HeapTupleSetOid(newTuple, HeapTupleGetOid(&targtuple)); + } + /* * The first targrows sample rows are simply copied into the * reservoir. 
Then we start replacing tuples in the sample * until we reach the end of the relation. This algorithm is @@ -1363,7 +1417,7 @@ acquire_sample_rows(Relation onerel, int elevel, * the relation we're done. */ if (numrows < targrows) - rows[numrows++] = heap_copytuple(&targtuple); + rows[numrows++] = heap_copytuple(newTuple); else { /* @@ -1385,7 +1439,7 @@ acquire_sample_rows(Relation onerel, int elevel, Assert(k >= 0 && k < targrows); heap_freetuple(rows[k]); - rows[k] = heap_copytuple(&targtuple); + rows[k] = heap_copytuple(newTuple); } rowstoskip -= 1; From a6b238cb1236a6e9b9a8a9fb73a2f424a42043d0 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Wed, 10 Nov 2021 15:12:38 +0800 Subject: [PATCH 448/578] fix g_commandTag coredump /an:tgit_woa_pro/ts:tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131094029153 (merge request !902) Squash merge branch 'sigmalin_v2oid' into 'Tbase_v2.15.19.4' fix g_commandTag coredump /an:tgit_woa_pro/ts:tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131094029153 TAPD: --bug=094029153 --- src/backend/tcop/postgres.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index f7bb987a..5c80dfb5 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -2452,6 +2452,17 @@ exec_bind_message(StringInfo input_message) /* Copy the plan's query string into the portal */ query_string = pstrdup(psrc->query_string); +#ifdef __AUDIT_FGA__ + if (portal && portal->commandTag) + { + g_commandTag = pnstrdup(portal->commandTag, strlen(portal->commandTag)); + } + else + { + g_commandTag = NULL; + } +#endif + /* Likewise make a copy of the statement name, unless it's unnamed */ if (stmt_name[0]) saved_stmt_name = pstrdup(stmt_name); From 3abf57fd2ff4fdc7dc37bb30ced676a3112e0f49 Mon Sep 17 00:00:00 2001 From: andrelin Date: Tue, 14 Dec 2021 15:13:11 +0800 Subject: [PATCH 449/578] Prune interval partition table before calculate total_table_pages (merge request !1018) Squash merge branch 'andrelin/partition_prune' into 'Tbase_v2.15.19.4' Prune interval partition table before calculate total_table_pages this affects cost evaluation of indexscan of interval partition table tapd: http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131094930123 --- src/backend/optimizer/path/allpaths.c | 95 +++++++++++++++++++++++++++ src/backend/optimizer/path/costsize.c | 55 ---------------- src/backend/optimizer/plan/planmain.c | 2 + src/include/optimizer/paths.h | 3 + 4 files changed, 100 insertions(+), 55 deletions(-) diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c index 40bd2cf0..b5ddbfcd 100644 --- a/src/backend/optimizer/path/allpaths.c +++ b/src/backend/optimizer/path/allpaths.c @@ -47,12 +47,14 @@ #include "parser/parsetree.h" #include "partitioning/partprune.h" #include "pgxc/nodemgr.h" +#include "storage/lmgr.h" #ifdef PGXC #include "nodes/makefuncs.h" #include "miscadmin.h" #endif /* PGXC */ #include "rewrite/rewriteManip.h" #include "utils/lsyscache.h" +#include "utils/ruleutils.h" /* results of subquery_is_pushdown_safe */ @@ -3855,3 +3857,96 @@ debug_print_rel(PlannerInfo *root, RelOptInfo *rel) } #endif /* OPTIMIZER_DEBUG */ + +/* + * Prune children of interval partition table by qual, this happens + * before path generation phase, and adjust rel->pages and rel->tuples + * for a better cost evaluation. 
+ */ +void +prune_interval_base_rel(PlannerInfo *root) +{ + Index rti; + + for (rti = 1; rti < root->simple_rel_array_size; rti++) + { + RelOptInfo *rel = root->simple_rel_array[rti]; + + if (rel == NULL) + continue; + + Assert(rel->relid == rti); /* sanity check on array */ + + if (IS_DUMMY_REL(rel)) + continue; + + if (IS_SIMPLE_REL(rel) && rel->intervalparent && !rel->isdefault) + { + RangeTblEntry *rte; + Relation relation; + Oid partoid = InvalidOid; + Bitmapset *tmpset; + + rte = rt_fetch(rel->relid, root->parse->rtable); + relation = heap_open(rte->relid, AccessShareLock); + + /* pruning by qual */ + rel->childs = RelationGetPartitionsByQuals(relation, rel->baserestrictinfo); + +#ifdef __COLD_HOT__ + /* only datanode and SELECT command need to prune hot data */ + if (CMD_SELECT == root->parse->commandType && g_EnableDualWrite && IS_PGXC_DATANODE) + { + /* prune hot data */ + PruneHotData(RelationGetRelid(relation), rel->childs); + } +#endif + + tmpset = bms_copy(rel->childs); + + if (bms_num_members(tmpset) == 1) + { + Relids *attr_needed = rel->attr_needed; + int32 *attr_widths = rel->attr_widths; + rel->estimate_partidx = bms_first_member(tmpset); + partoid = RelationGetPartition(relation, rel->estimate_partidx, false); + + /* degrate from parent to a child of parent */ + rte->relid = partoid; + rel->intervalparent = false; + rel->isdefault = false; + rel->estimate_partidx = -1; + rel->indexlist = NULL; + LockRelationOid(partoid, AccessShareLock); + get_relation_info(root, partoid, false, rel); + rel->attr_needed = attr_needed; + rel->attr_widths = attr_widths; + check_index_predicates(root, rel); + + bms_free(rel->childs); + rel->childs = NULL; + } + else + { + int i; + Relation child; + + rel->pages = 0; + rel->tuples = 0; + + while ((i = bms_first_member(tmpset)) >= 0) + { + partoid = RelationGetPartition(relation, i, false); + + child = heap_open(partoid, AccessShareLock); + rel->pages += child->rd_rel->relpages; + rel->tuples += child->rd_rel->reltuples; + heap_close(child, AccessShareLock); + } + } + + bms_free(tmpset); + heap_close(relation, AccessShareLock); + } + } +} diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index f8ac09e8..5f129790 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -4226,61 +4226,6 @@ set_baserel_size_estimates(PlannerInfo *root, RelOptInfo *rel) /* Should only be applied to base relations */ Assert(rel->relid > 0); -#ifdef __TBASE__ - if(rel->intervalparent && !rel->isdefault) - { - RangeTblEntry *rte; - Relation relation; - - rte = rt_fetch(rel->relid, root->parse->rtable); - relation = heap_open(rte->relid, AccessShareLock); - - //pruning - rel->childs = RelationGetPartitionsByQuals(relation, rel->baserestrictinfo); - -#ifdef __COLD_HOT__ - /* only datanode and SELECT command need to prune hot data */ - if (CMD_SELECT == root->parse->commandType && g_EnableDualWrite && IS_PGXC_DATANODE) - { - /* prune hot data */ - PruneHotData(RelationGetRelid(relation), rel->childs); - } -#endif - - if(bms_num_members(rel->childs) == 1) - { - Oid partoid = InvalidOid; - Relids *attr_needed = rel->attr_needed; - int32 *attr_widths = rel->attr_widths; - Bitmapset * bmscopy = bms_copy(rel->childs); - rel->estimate_partidx = bms_first_member(bmscopy); - partoid = RelationGetPartition(relation, rel->estimate_partidx, false); - - //degrate from parent to a child of parent - rte->relid = partoid; - rel->intervalparent = false; - rel->isdefault = false; - if(rel->childs) - { - 
bms_free(rel->childs); - rel->childs = NULL; - } - rel->estimate_partidx = -1; - rel->indexlist = NULL; - LockRelationOid(partoid,AccessShareLock); - get_relation_info(root, partoid, false, rel); - rel->attr_needed = attr_needed; - rel->attr_widths = attr_widths; - check_index_predicates(root, rel); - //UnlockRelationOid(partoid,AccessShareLock); - - bms_free(bmscopy); - } - - heap_close(relation, AccessShareLock); - } -#endif - nrows = rel->tuples * clauselist_selectivity(root, rel->baserestrictinfo, diff --git a/src/backend/optimizer/plan/planmain.c b/src/backend/optimizer/plan/planmain.c index a1a689ee..c04e838b 100644 --- a/src/backend/optimizer/plan/planmain.c +++ b/src/backend/optimizer/plan/planmain.c @@ -225,6 +225,8 @@ query_planner(PlannerInfo *root, List *tlist, */ extract_restriction_or_clauses(root); + prune_interval_base_rel(root); + /* * We should now have size estimates for every actual table involved in * the query, and we also know which if any have been deleted from the diff --git a/src/include/optimizer/paths.h b/src/include/optimizer/paths.h index 48d6f994..ebeca134 100644 --- a/src/include/optimizer/paths.h +++ b/src/include/optimizer/paths.h @@ -236,4 +236,7 @@ extern PathKey *make_canonical_pathkey(PlannerInfo *root, #ifdef __TBASE__ extern double path_count_datanodes(Path *path); #endif + +extern void prune_interval_base_rel(PlannerInfo *root); + #endif /* PATHS_H */ From cf935e79c136d380f230f9faaab2862ce0894090 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Wed, 8 Dec 2021 21:15:59 +0800 Subject: [PATCH 450/578] fix: tpcc transaction inconsistent. resolve solution: release all handles when handle->transaction_status == 'E' in function pgxc_node_remote_abort (merge request !1009) Squash merge branch 'Tbase_v5.06_tpcc_inconsistent' into 'Tbase_v5.06' fix: tpcc transaction inconsistent. 
resolve solution: release all handles when handle->transaction_status == 'E' in function pgxc_node_remote_abort Signed-off-by: JennyJennyChen --- src/backend/pgxc/pool/execRemote.c | 8 ++++++-- src/backend/pgxc/pool/pgxcnode.c | 21 +++++++++++++++++++++ 2 files changed, 27 insertions(+), 2 deletions(-) diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 20cab418..1d519162 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -3390,6 +3390,8 @@ handle_response(PGXCNodeHandle *conn, ResponseCombiner *combiner) conn->transaction_status = msg[0]; PGXCNodeSetConnectionState(conn, DN_CONNECTION_STATE_IDLE); conn->combiner = NULL; + + elog(DEBUG5, "remote_node %s remote_pid %d, conn->transaction_status %c", conn->nodename, conn->backend_pid, conn->transaction_status); #ifdef DN_CONNECTION_DEBUG conn->have_row_desc = false; #endif @@ -5661,6 +5663,7 @@ pgxc_node_remote_abort(TranscationType txn_type, bool need_release_handle) /* Read responses from these */ sync_connections[sync_conn_count++] = conn; result = EOF; + elog(DEBUG5, "send SYNC command to CN nodename %s, backend_pid %d", conn->nodename, conn->backend_pid); } } } @@ -5695,6 +5698,7 @@ pgxc_node_remote_abort(TranscationType txn_type, bool need_release_handle) /* Read responses from these */ sync_connections[sync_conn_count++] = conn; result = EOF; + elog(DEBUG5, "send SYNC command to DN nodename %s, backend_pid %d", conn->nodename, conn->backend_pid); } } } @@ -5934,13 +5938,13 @@ pgxc_node_remote_abort(TranscationType txn_type, bool need_release_handle) { ereport(LOG, (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Failed to send SYNC to on one or more nodes errmsg:%s", combiner.errorMessage))); + errmsg("Failed to send ROLLBACK to on one or more nodes errmsg:%s", combiner.errorMessage))); } else { ereport(LOG, (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Failed to send SYNC to on one or more nodes"))); + errmsg("Failed to send ROLLBACK to on one or more nodes"))); } } CloseCombiner(&combiner); diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c index 23225be6..11932767 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -1648,6 +1648,13 @@ validate_handles(void) return true; } } + + if(handle->transaction_status == 'E') + { + elog(LOG, "Remote node \"%s\", running with pid %d transaction_status %c is bad", + handle->nodename, handle->backend_pid, handle->transaction_status); + return true; + } } } @@ -1674,6 +1681,13 @@ } } + + if(handle->transaction_status == 'E') + { + elog(LOG, "Remote node \"%s\", running with pid %d transaction_status %c is bad", + handle->nodename, handle->backend_pid, handle->transaction_status); + return true; + } } } @@ -1701,6 +1715,13 @@ return true; } } + + if(handle->transaction_status == 'E') + { + elog(LOG, "Remote node \"%s\", running with pid %d transaction_status %c is bad", + handle->nodename, handle->backend_pid, handle->transaction_status); + return true; + } } } } From bd290dc41b114ab64596c7915136c5f5eac96b9a Mon Sep 17 00:00:00 2001 From: whalesong Date: Thu, 16 Dec 2021 16:03:11 +0800 Subject: [PATCH 451/578] bugfix: get gts error, get an earlier one (merge request !1023) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Related bug report: http://tapd.oa.com/20421696/bugtrace/bugs/view?bug_id=1020421696095181275 Problem description: while running TPCC tests with frequent consistency checks, data inconsistencies were occasionally detected. Root cause:
On the last retry of fetching the GTS, if a timeout occurs the GTM connection is not released, so the next GTS request may read the reply left over from the previous one and obtain an earlier, smaller GTS; this affects tuple visibility checks, exposes the wrong tuple version, and makes the consistency check fail. Solution: on the last retry of fetching the GTS, if the GTS obtained is still invalid, reset the GTM connection so that leftover messages cannot misalign subsequent message handling and produce a wrong GTS. (cherry picked from commit dc583306) 4b61a83b bugfix: get gts error, get a earlier one --- src/backend/access/transam/gtm.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/backend/access/transam/gtm.c b/src/backend/access/transam/gtm.c index 5fb8904a..8b11b530 100644 --- a/src/backend/access/transam/gtm.c +++ b/src/backend/access/transam/gtm.c @@ -1505,6 +1505,14 @@ GetGlobalTimestampGTM(void) } elog(DEBUG7, "get global timestamp gts " INT64_FORMAT, gts_result.gts); + if (retry_cnt >= reconnect_gtm_retry_times && + !GlobalTimestampIsValid(gts_result.gts)) + { + elog(WARNING, "retry %d times, get a invalid global timestamp, " + "ResetGTMConnection", retry_cnt); + ResetGTMConnection(); + } + if (log_gtm_stats) ShowUsageCommon("BeginTranGTM", &start_r, &start_t); From 62fdfaaebb44014744e0029cc41a0043bb77cc5b Mon Sep 17 00:00:00 2001 From: whalesong Date: Tue, 21 Dec 2021 11:05:44 +0800 Subject: [PATCH 452/578] bugfix: fix gts bug again, get an earlier one (merge request !1031) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix again the problem that misaligned GTS messages cause an earlier GTS to be returned; after the previous change, self-testing found one more code path with the problem, which is fixed here. Previous change: https://git.woa.com/Tbase/PG-XL-v10/merge_requests/1023 (cherry picked from commit 778c5873) 8af73c72 bugfix: fix gts bug again, get a earlier one --- src/backend/access/transam/gtm.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/backend/access/transam/gtm.c b/src/backend/access/transam/gtm.c index 8b11b530..03d76457 100644 --- a/src/backend/access/transam/gtm.c +++ b/src/backend/access/transam/gtm.c @@ -1505,8 +1505,7 @@ GetGlobalTimestampGTM(void) } elog(DEBUG7, "get global timestamp gts " INT64_FORMAT, gts_result.gts); - if (retry_cnt >= reconnect_gtm_retry_times && - !GlobalTimestampIsValid(gts_result.gts)) + if (!GlobalTimestampIsValid(gts_result.gts)) { elog(WARNING, "retry %d times, get a invalid global timestamp, " "ResetGTMConnection", retry_cnt); From 440b2557c34a76433eb8ead912d092902a905d64 Mon Sep 17 00:00:00 2001 From: arrowbowang Date: Tue, 21 Dec 2021 14:09:12 +0800 Subject: [PATCH 453/578] fix: 2pc stops when the first distributed transaction is prepared successfully on all DNs and then rolled back on the CN due to an error; the second transaction, which aborts on a DN due to an error during execution, then also stops because the global g_twophase_state still holds the first transaction's status --- src/backend/access/transam/xact.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index 1d255395..33cb58c9 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -2707,7 +2707,11 @@ StartTransaction(void) * progress" */ s->state = TRANS_INPROGRESS; - + if (g_twophase_state.state != TWO_PHASE_INITIALTRANS) + { + ClearLocalTwoPhaseState(); + elog(WARNING, "clear g_twophase_state when start transaction") + } ShowTransactionState("StartTransaction"); } From 0f3fcad11f3b4b2ab5c132ab75746c9a8304c447 Mon Sep 17 00:00:00 2001 From: whalesong Date: Wed, 1 Dec 2021 11:36:27 +0800 Subject: [PATCH 454/578] bugfix: rollback slower than commit, tpcc performance optimization (merge request !965) (cherry picked from commit cc4a42d3) 0fc32b96 bugfix: rollback slow than commit 2 b63fe1f1 bugfix: rollback slow than commit --- src/backend/pgxc/pool/execRemote.c | 9 ---------
src/backend/pgxc/pool/pgxcnode.c | 19 ++++++++++++++++--- 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 1d519162..45bac135 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -7903,11 +7903,6 @@ PreAbort_Remote(TranscationType txn_type, bool need_release_handle) } } - -#if PGXC_CANCEL_DELAY > 0 - pg_usleep(PGXC_CANCEL_DELAY * 1000); -#endif - /* * Now read and discard any data from the connections found "dirty" */ @@ -11765,10 +11760,6 @@ void pgxc_abort_connections(PGXCNodeAllHandles *all_handles) { break; } - /* Sleep a while. */ -#if PGXC_CANCEL_DELAY > 0 - pg_usleep(PGXC_CANCEL_DELAY * 1000); -#endif } } } diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c index 11932767..78f1a024 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -2696,6 +2696,7 @@ pgxc_node_flush_read(PGXCNodeHandle *handle) {// #lizard forgives bool is_ready= false; int read_result; + int wait_time = 1; if (handle == NULL) { @@ -2704,9 +2705,6 @@ pgxc_node_flush_read(PGXCNodeHandle *handle) while(true) { -#if PGXC_CANCEL_DELAY > 0 - pg_usleep(PGXC_CANCEL_DELAY * 1000); -#endif /* consume all data */ while (HAS_MESSAGE_BUFFERED(handle)) { @@ -2733,6 +2731,21 @@ pgxc_node_flush_read(PGXCNodeHandle *handle) elog(LOG, "pgxc_node_flush_read node:%s read failure.", handle->nodename); break; } + + if (PGXC_CANCEL_DELAY > 0) + { + elog(DEBUG5, "pgxc_node_flush_read sleep %dus", wait_time); + pg_usleep(wait_time); + + if (wait_time < PGXC_CANCEL_DELAY) + { + wait_time *= 2; + } + if (wait_time > PGXC_CANCEL_DELAY) + { + wait_time = PGXC_CANCEL_DELAY; + } + } } } From 2415002449689880f4b10aaf176d7f487eddde57 Mon Sep 17 00:00:00 2001 From: arrowbowang Date: Tue, 21 Dec 2021 17:40:02 +0800 Subject: [PATCH 455/578] fix: compile error --- src/backend/access/transam/xact.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index 33cb58c9..6c393b81 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -2710,7 +2710,7 @@ StartTransaction(void) if (g_twophase_state.state != TWO_PHASE_INITIALTRANS) { ClearLocalTwoPhaseState(); - elog(WARNING, "clear g_twophase_state when start transaction") + elog(WARNING, "clear g_twophase_state when start transaction"); } ShowTransactionState("StartTransaction"); } From a15f23bba9cc18ec802db8bbd45414b0d71e75ae Mon Sep 17 00:00:00 2001 From: guanhuawang Date: Fri, 17 Dec 2021 15:25:09 +0800 Subject: [PATCH 456/578] Fix coredump caused by int128 instructions http://tapd.oa.com/20421696/prong/stories/view/1020421696870813487 --- config/c-compiler.m4 | 41 +++++++++++++++++++++++--- configure | 69 ++++++++++++++++++++++++++++++++++++++++---- src/include/c.h | 1 - 3 files changed, 100 insertions(+), 11 deletions(-) diff --git a/config/c-compiler.m4 b/config/c-compiler.m4 index 8d9844ab..cb35429c 100644 --- a/config/c-compiler.m4 +++ b/config/c-compiler.m4 @@ -108,29 +108,62 @@ AC_DEFUN([PGAC_TYPE_128BIT_INT], [AC_CACHE_CHECK([for __int128], [pgac_cv__128bit_int], [AC_LINK_IFELSE([AC_LANG_PROGRAM([ /* + * We don't actually run this test, just link it to verify that any support + * functions needed for __int128 are present. + * * These are globals to discourage the compiler from folding all the * arithmetic tests down to compile-time constants. 
We do not have * convenient support for 64bit literals at this point... + * convenient support for 128bit literals at this point... */ __int128 a = 48828125; -__int128 b = 97656255; +__int128 b = 97656250; ],[ __int128 c,d; a = (a << 12) + 1; /* 200000000001 */ b = (b << 12) + 5; /* 400000000005 */ -/* use the most relevant arithmetic ops */ +/* try the most relevant arithmetic ops */ c = a * b; d = (c + b) / b; -/* return different values, to prevent optimizations */ +/* must use the results, else compiler may optimize arithmetic away */ if (d != a+1) - return 0; return 1; ])], [pgac_cv__128bit_int=yes], [pgac_cv__128bit_int=no])]) if test x"$pgac_cv__128bit_int" = xyes ; then + # Use of non-default alignment with __int128 tickles bugs in some compilers. + # If not cross-compiling, we can test for bugs and disable use of __int128 + # with buggy compilers. If cross-compiling, hope for the best. + # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=83925 + AC_CACHE_CHECK([for __int128 alignment bug], [pgac_cv__128bit_int_bug], + [AC_RUN_IFELSE([AC_LANG_PROGRAM([ +/* This must match the corresponding code in c.h: */ +#if defined(__GNUC__) || defined(__SUNPRO_C) || defined(__IBMC__) +#define pg_attribute_aligned(a) __attribute__((aligned(a))) +#endif +typedef __int128 int128a +#if defined(pg_attribute_aligned) +pg_attribute_aligned(8) +#endif +; +int128a holder; +void pass_by_val(void *buffer, int128a par) { holder = par; } +],[ +long int i64 = 97656225L << 12; +int128a q; +pass_by_val(main, (int128a) i64); +q = (int128a) i64; +if (q != holder) + return 1; +])], + [pgac_cv__128bit_int_bug=ok], + [pgac_cv__128bit_int_bug=broken], + [pgac_cv__128bit_int_bug="assuming ok"])]) + if test x"$pgac_cv__128bit_int_bug" != xbroken ; then AC_DEFINE(PG_INT128_TYPE, __int128, [Define to the name of a signed 128-bit integer type.]) AC_CHECK_ALIGNOF(PG_INT128_TYPE) + fi fi])# PGAC_TYPE_128BIT_INT diff --git a/configure b/configure index 26843895..9b92963c 100755 --- a/configure +++ b/configure @@ -15058,12 +15058,15 @@ else /* end confdefs.h. */ /* + * We don't actually run this test, just link it to verify that any support + * functions needed for __int128 are present. + * * These are globals to discourage the compiler from folding all the * arithmetic tests down to compile-time constants. We do not have - * convenient support for 64bit literals at this point... + + * convenient support for 128bit literals at this point... */ __int128 a = 48828125; -__int128 b = 97656255; +__int128 b = 97656250; int main () @@ -15072,12 +15075,11 @@ main () __int128 c,d; a = (a << 12) + 1; /* 200000000001 */ b = (b << 12) + 5; /* 400000000005 */ -/* use the most relevant arithmetic ops */ +/* try the most relevant arithmetic ops */ c = a * b; d = (c + b) / b; -/* return different values, to prevent optimizations */ +/* must use the results, else compiler may optimize arithmetic away */ if (d != a+1) - return 0; return 1; ; @@ -15095,6 +15097,61 @@ fi { $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv__128bit_int" >&5 $as_echo "$pgac_cv__128bit_int" >&6; } if test x"$pgac_cv__128bit_int" = xyes ; then + # Use of non-default alignment with __int128 tickles bugs in some compilers. + # If not cross-compiling, we can test for bugs and disable use of __int128 + # with buggy compilers. If cross-compiling, hope for the best. + # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=83925 + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for __int128 alignment bug" >&5 +$as_echo_n "checking for __int128 alignment bug... 
" >&6; } +if ${pgac_cv__128bit_int_bug+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test "$cross_compiling" = yes; then : + pgac_cv__128bit_int_bug="assuming ok" +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +/* This must match the corresponding code in c.h: */ +#if defined(__GNUC__) || defined(__SUNPRO_C) || defined(__IBMC__) +#define pg_attribute_aligned(a) __attribute__((aligned(a))) +#endif +typedef __int128 int128a +#if defined(pg_attribute_aligned) +pg_attribute_aligned(8) +#endif +; +int128a holder; +void pass_by_val(void *buffer, int128a par) { holder = par; } + +int +main () +{ + +long int i64 = 97656225L << 12; +int128a q; +pass_by_val(main, (int128a) i64); +q = (int128a) i64; +if (q != holder) + return 1; + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_run "$LINENO"; then : + pgac_cv__128bit_int_bug=ok +else + pgac_cv__128bit_int_bug=broken +fi +rm -f core *.core core.conftest.* gmon.out bb.out conftest$ac_exeext \ + conftest.$ac_objext conftest.beam conftest.$ac_ext +fi + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv__128bit_int_bug" >&5 +$as_echo "$pgac_cv__128bit_int_bug" >&6; } + if test x"$pgac_cv__128bit_int_bug" != xbroken ; then $as_echo "#define PG_INT128_TYPE __int128" >>confdefs.h @@ -15132,7 +15189,7 @@ cat >>confdefs.h <<_ACEOF #define ALIGNOF_PG_INT128_TYPE $ac_cv_alignof_PG_INT128_TYPE _ACEOF - + fi fi # Check for various atomic operations now that we have checked how to declare diff --git a/src/include/c.h b/src/include/c.h index f2c1d8c2..7a6ab8e2 100644 --- a/src/include/c.h +++ b/src/include/c.h @@ -377,7 +377,6 @@ typedef unsigned long long int uint64; /* * 128-bit signed and unsigned integers - * There currently is only limited support for such types. * E.g. 128bit literals and snprintf are not supported; but math is. * Also, because we exclude such types when choosing MAXIMUM_ALIGNOF, From 25d5f199de8673417735488aa1d720c1b77ef664 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Fri, 10 Dec 2021 14:48:38 +0800 Subject: [PATCH 457/578] fix deadlock by pg_blocking_pids http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131093597037 (merge request !1011) Squash merge branch 'sigmalin_v2' into 'Tbase_v2.15.19.4' fix deadlock by pg_blocking_pids http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131093597037 TAPD: --bug=093597037 --- src/backend/storage/lmgr/lock.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/backend/storage/lmgr/lock.c b/src/backend/storage/lmgr/lock.c index 815124f6..f1570bb3 100644 --- a/src/backend/storage/lmgr/lock.c +++ b/src/backend/storage/lmgr/lock.c @@ -3784,6 +3784,13 @@ GetBlockerStatusData(int blocked_pid) data->waiter_pids = (int *) palloc(sizeof(int) * data->maxpids); /* + * Acquire lock on the entire shared lock data structure. See notes + * in GetLockStatusData(). + */ + for (i = 0; i < NUM_LOCK_PARTITIONS; i++) + LWLockAcquire(LockHashPartitionLockByIndex(i), LW_SHARED); + + /* * In order to search the ProcArray for blocked_pid and assume that that * entry won't immediately disappear under us, we must hold ProcArrayLock. * In addition, to examine the lock grouping fields of any other backend, @@ -3801,13 +3808,6 @@ GetBlockerStatusData(int blocked_pid) /* Nothing to do if it's gone */ if (proc != NULL) { - /* - * Acquire lock on the entire shared lock data structure. See notes - * in GetLockStatusData(). 
- */ - for (i = 0; i < NUM_LOCK_PARTITIONS; i++) - LWLockAcquire(LockHashPartitionLockByIndex(i), LW_SHARED); - if (proc->lockGroupLeader == NULL) { /* Easy case, proc is not a lock group member */ @@ -3827,17 +3827,17 @@ GetBlockerStatusData(int blocked_pid) } } + Assert(data->nprocs <= data->maxprocs); + } + + LWLockRelease(ProcArrayLock); + /* * And release locks. See notes in GetLockStatusData(). */ for (i = NUM_LOCK_PARTITIONS; --i >= 0;) LWLockRelease(LockHashPartitionLockByIndex(i)); - Assert(data->nprocs <= data->maxprocs); - } - - LWLockRelease(ProcArrayLock); - return data; } From 8add4a85dd3acefafac925071a3197a2deae98b5 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Wed, 22 Dec 2021 20:10:23 +0800 Subject: [PATCH 458/578] fix coredump in explain http://tapd.oa.com/TBase_Oracle_Migration/bugtrace/bugs/view?bug_id=1020421696095449735 (merge request !1039) Squash merge branch 'sigmalin_v5' into 'Tbase_v5.06.2' fix coredump in explain http://tapd.oa.com/TBase_Oracle_Migration/bugtrace/bugs/view?bug_id=1020421696095449735 TAPD: --bug=095449735 (cherry picked from commit a0c794fe) 53eb92ee fix coredump in explain http://tapd.oa.com/TBase_Oracle_Migration/bugtrace/bugs/view?bug_id=1020421696095449735 --- src/backend/commands/explain.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index d08ccaa8..3ae93589 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -4063,7 +4063,7 @@ ExplainRemoteQuery(RemoteQuery *plan, PlanState *planstate, List *ancestors, Exp step->exec_type = EXEC_ON_DATANODES; dummy = makeVar(1, 1, TEXTOID, -1, InvalidOid, 0); - plan->scan.plan.targetlist = lappend(plan->scan.plan.targetlist, + step->scan.plan.targetlist = lappend(step->scan.plan.targetlist, makeTargetEntry((Expr *) dummy, 1, "QUERY PLAN", false)); estate = planstate->state; From 7ac5e8988e9967486cb9484f1ebe7b142b3f77db Mon Sep 17 00:00:00 2001 From: jadenchi Date: Thu, 23 Dec 2021 19:52:59 +0800 Subject: [PATCH 459/578] fix drop database failed caused by internal connection remaining. fix http://tapd.oa.com/TencentDB_for_TBase/prong/stories/view/1020418349870883157 --- src/backend/pgxc/locator/locator.c | 19 +++++++++++++++++++ src/backend/pgxc/pool/poolutils.c | 4 ++-- src/backend/tcop/utility.c | 7 +++++++ src/include/pgxc/locator.h | 1 + 4 files changed, 29 insertions(+), 2 deletions(-) diff --git a/src/backend/pgxc/locator/locator.c b/src/backend/pgxc/locator/locator.c index 4685ea1e..1fa85051 100644 --- a/src/backend/pgxc/locator/locator.c +++ b/src/backend/pgxc/locator/locator.c @@ -715,6 +715,25 @@ GetAllCoordNodes(void) return nodeList; } +/* + * Return a list of all Coordinators. + * Including local Coordinator. + * This is used to clean up pooler connections. 
+ */ +List * +GetEntireCoordNodes(void) +{ + int i; + List *nodeList = NIL; + + for (i = 0; i < NumCoords; i++) + { + nodeList = lappend_int(nodeList, i); + } + + return nodeList; +} + static bool DatanodeInGroup(oidvector* nodeoids, Oid nodeoid) { diff --git a/src/backend/pgxc/pool/poolutils.c b/src/backend/pgxc/pool/poolutils.c index e69fae07..ab95f7e2 100644 --- a/src/backend/pgxc/pool/poolutils.c +++ b/src/backend/pgxc/pool/poolutils.c @@ -354,7 +354,7 @@ CleanConnection(CleanConnStmt *stmt) dn_list = stmt_nodes; else { - co_list = GetAllCoordNodes(); + co_list = GetEntireCoordNodes(); dn_list = GetAllDataNodes(); } @@ -388,7 +388,7 @@ CleanConnection(CleanConnStmt *stmt) void DropDBCleanConnection(char *dbname) { - List *co_list = GetAllCoordNodes(); + List *co_list = GetEntireCoordNodes(); List *dn_list = GetAllDataNodes(); /* Check permissions for this database */ diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c index c90f1759..d58ae205 100644 --- a/src/backend/tcop/utility.c +++ b/src/backend/tcop/utility.c @@ -2455,6 +2455,7 @@ standard_ProcessUtility(PlannedStmt *pstmt, case T_DropdbStmt: { char prepareQuery[STRINGLENGTH]; + char query[STRINGLENGTH]; DropdbStmt *stmt = (DropdbStmt *) parsetree; if (!stmt->prepare) { @@ -2478,10 +2479,16 @@ standard_ProcessUtility(PlannedStmt *pstmt, if (OidIsValid(db_oid)) { + snprintf(query, STRINGLENGTH, "CLEAN CONNECTION TO ALL FOR DATABASE %s;", + quote_identifier(stmt->dbname)); + snprintf(prepareQuery, STRINGLENGTH, "DROP DATABASE PREPARE %s;", quote_identifier(stmt->dbname)); if (!is_ddl_leader_cn(leaderCnHandle->nodename)) + { + SendLeaderCNUtility(query, false); SendLeaderCNUtility(prepareQuery, false); + } else dropdb_prepare(stmt->dbname, false); ExecUtilityStmtOnNodes(parsetree, prepareQuery, diff --git a/src/include/pgxc/locator.h b/src/include/pgxc/locator.h index 30926928..d5c9d543 100644 --- a/src/include/pgxc/locator.h +++ b/src/include/pgxc/locator.h @@ -225,6 +225,7 @@ extern ExecNodes *GetRelationNodesByQuals(Oid reloid, extern bool IsTypeHashDistributable(Oid col_type); extern List *GetAllDataNodes(void); extern List *GetAllCoordNodes(void); +extern List *GetEntireCoordNodes(void); extern int GetAnyDataNode(Bitmapset *nodes); extern void RelationBuildLocator(Relation rel); extern void FreeRelationLocInfo(RelationLocInfo *relationLocInfo); From cdf1442996a134486d1149ef952b3bcf266b20fb Mon Sep 17 00:00:00 2001 From: andrelin Date: Fri, 24 Dec 2021 12:29:46 +0800 Subject: [PATCH 460/578] Prevent SQL injection in pg_stat_cluster_activity extension --- .../pg_stat_cluster_activity.c | 63 +++++++++++++------ 1 file changed, 44 insertions(+), 19 deletions(-) diff --git a/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c b/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c index 1bc9f489..74af0249 100644 --- a/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c +++ b/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c @@ -154,6 +154,41 @@ PG_FUNCTION_INFO_V1(pg_signal_session); PG_FUNCTION_INFO_V1(pg_terminate_session); PG_FUNCTION_INFO_V1(pg_cancel_session); + +static ParamListInfo +EvaluateSessionIDParam(const char *sessionid) +{ + int num_params = 1; + ParamListInfo paramLI = (ParamListInfo) + palloc0(offsetof(ParamListInfoData, params) + + num_params * sizeof(ParamExternData)); + + ParamExternData *prm; + + /* we have static list of params, so no hooks needed */ + paramLI->paramFetch = NULL; + paramLI->paramFetchArg = NULL; + paramLI->parserSetup = NULL; + 
paramLI->parserSetupArg = NULL; + paramLI->numParams = num_params; + paramLI->paramMask = NULL; + + prm = ¶mLI->params[0]; + prm->ptype = TEXTOID; + prm->pflags = PARAM_FLAG_CONST; + if (sessionid != NULL) + { + prm->value = CStringGetTextDatum(sessionid); + prm->isnull = false; + } + else + { + prm->isnull = true; + } + + return paramLI; +} + /* * walk through planstate tree and gets cursors it contains in * RemoteSubplan node, formed as a single string delimited each @@ -529,7 +564,7 @@ pgstat_fetch_stat_local_csentry(int beid) * ---------- */ static void -pg_stat_get_remote_activity(const char *sessionid, bool coordonly, Tuplestorestate *tupstore) +pg_stat_get_remote_activity(const char *sessionid, bool coordonly, Tuplestorestate *tupstore, TupleDesc tupdesc) { #define QUERY_LEN 1024 char query[QUERY_LEN]; @@ -545,10 +580,7 @@ pg_stat_get_remote_activity(const char *sessionid, bool coordonly, Tuplestoresta * Here we call pg_stat_get_cluster_activity in remote with args: * coordonly = false, localonly = true, to prevent recursive calls in remote nodes. */ - if (sessionid == NULL) - snprintf(query, QUERY_LEN, "select * from pg_stat_get_cluster_activity(NULL, false, true)"); - else - snprintf(query, QUERY_LEN, "select * from pg_stat_get_cluster_activity('%s', false, true)", sessionid); + snprintf(query, QUERY_LEN, "select * from pg_stat_get_cluster_activity($1, false, true)"); plan = makeNode(RemoteQuery); plan->combine_type = COMBINE_TYPE_NONE; @@ -569,22 +601,13 @@ pg_stat_get_remote_activity(const char *sessionid, bool coordonly, Tuplestoresta plan->exec_type = EXEC_ON_COORDS; } - /* - * We only need the target entry to determine result data type. - * So create dummy even if real expression is a function. - */ - for (i = 1; i <= PG_STAT_GET_ClUSTER_ACTIVITY_COLS; i++) - { - dummy = makeVar(1, i, TEXTOID, 0, InvalidOid, 0); - plan->scan.plan.targetlist = lappend(plan->scan.plan.targetlist, - makeTargetEntry((Expr *) dummy, i, NULL, false)); - } - /* prepare to execute */ estate = CreateExecutorState(); oldcontext = MemoryContextSwitchTo(estate->es_query_cxt); estate->es_snapshot = GetActiveSnapshot(); + estate->es_param_list_info = EvaluateSessionIDParam(sessionid); pstate = ExecInitRemoteQuery(plan, estate, 0); + ExecAssignResultType((PlanState *) pstate, tupdesc); MemoryContextSwitchTo(oldcontext); result = ExecRemoteQuery((PlanState *) pstate); @@ -598,7 +621,7 @@ pg_stat_get_remote_activity(const char *sessionid, bool coordonly, Tuplestoresta } ExecEndRemoteQuery(pstate); - return; + FreeExecutorState(estate); } /* ---------- @@ -660,7 +683,7 @@ pg_stat_get_cluster_activity(PG_FUNCTION_ARGS) /* dispatch query to remote if needed */ if (!localonly && IS_PGXC_COORDINATOR) - pg_stat_get_remote_activity(sessionid, coordonly, tupstore); + pg_stat_get_remote_activity(sessionid, coordonly, tupstore, tupdesc); /* 1-based index */ for (curr_backend = 1; curr_backend <= num_backends; curr_backend++) @@ -948,7 +971,7 @@ pgcs_signal_session_remote(const char *sessionid, int signal) Var *dummy; TupleTableSlot *result = NULL; - snprintf(query, QUERY_LEN, "select pg_signal_session('%s', %d, true)", sessionid, signal); + snprintf(query, QUERY_LEN, "select pg_signal_session($1, %d, true)", signal); plan = makeNode(RemoteQuery); plan->combine_type = COMBINE_TYPE_NONE; @@ -973,6 +996,7 @@ pgcs_signal_session_remote(const char *sessionid, int signal) estate = CreateExecutorState(); oldcontext = MemoryContextSwitchTo(estate->es_query_cxt); estate->es_snapshot = GetActiveSnapshot(); + 
estate->es_param_list_info = EvaluateSessionIDParam(sessionid); pstate = ExecInitRemoteQuery(plan, estate, 0); MemoryContextSwitchTo(oldcontext); @@ -984,6 +1008,7 @@ pgcs_signal_session_remote(const char *sessionid, int signal) return false; } + FreeExecutorState(estate); return true; } From ce845350f1e3e0ac0ce0f654d4521e0be9bccee0 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Thu, 14 Oct 2021 11:58:06 +0800 Subject: [PATCH 461/578] fix deadlock between pgxc_connections_cleanup and SharedQueueFinish http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131093155193 (merge request !800) --- src/backend/pgxc/squeue/squeue.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/backend/pgxc/squeue/squeue.c b/src/backend/pgxc/squeue/squeue.c index a145edc5..5744e22e 100644 --- a/src/backend/pgxc/squeue/squeue.c +++ b/src/backend/pgxc/squeue/squeue.c @@ -2868,6 +2868,14 @@ SharedQueueFinish(SharedQueue squeue, TupleDesc tupDesc, //LWLockRelease(sqsync->sqs_consumer_sync[i].cs_lwlock); } + /* + * Check sq_error status to avoid endless loop here + */ + if (squeue->sq_error) + { + elog(ERROR, "SharedQueueFinish: shared_queue %s error because of query-cancel.", squeue->sq_key); + } + if (unfinish_tuplestore) { pg_usleep(1000L); From 891fedf6bcdf70c26bdce4269829f79124bc099b Mon Sep 17 00:00:00 2001 From: arrowbowang Date: Wed, 26 May 2021 14:50:16 +0800 Subject: [PATCH 462/578] fix getmissingattr core after ALTER TABLE partabc ALTER COLUMN name DROP DEFAULT http://tapd.oa.com/10092131/bugtrace/bugs/view/1010092131087999713 --- src/backend/utils/cache/relcache.c | 2 +- src/test/regress/expected/fast_default.out | 64 +++++++++++----------- 2 files changed, 33 insertions(+), 33 deletions(-) diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c index 55943dff..23a41d00 100644 --- a/src/backend/utils/cache/relcache.c +++ b/src/backend/utils/cache/relcache.c @@ -637,6 +637,7 @@ RelationBuildTupleDesc(Relation relation) attrdef[ndef].adnum = attnum; attrdef[ndef].adbin = NULL; ndef++; + } #ifdef _MLS_ /* Likewise for a missing value */ @@ -693,7 +694,6 @@ RelationBuildTupleDesc(Relation relation) } } #endif - } need--; if (need == 0) break; diff --git a/src/test/regress/expected/fast_default.out b/src/test/regress/expected/fast_default.out index d390a452..bf00c540 100644 --- a/src/test/regress/expected/fast_default.out +++ b/src/test/regress/expected/fast_default.out @@ -131,28 +131,28 @@ SELECT pk, c_int, c_bpchar, c_text, c_date, c_timestamp, FROM T ORDER BY pk; pk | c_int | c_bpchar | c_text | c_date | c_timestamp | c_timestamp_null | c_array | c_small | c_small_null | c_big | c_num | c_time | c_interval | c_hugetext_origdef | c_hugetext_newdef ----+-------+----------+--------+------------+--------------------------+--------------------------+--------------------------+---------+--------------+-------------------+-------------------+----------+------------+--------------------+------------------- - 1 | 1 | | | | | | | | | | | | | | - 2 | 1 | | | | | | | | | | | | | | - 3 | 2 | hello | | | | | | | | | | | | | - 4 | 2 | hello | | | | | | | | | | | | | - 5 | 2 | dog | world | | | | | | | | | | | | - 6 | 2 | dog | world | | | | | | | | | | | | - 7 | 2 | dog | cat | 06-02-2016 | | | | | | | | | | | - 8 | 2 | dog | cat | 06-02-2016 | | | | | | | | | | | - 9 | 2 | dog | cat | 01-01-2010 | Thu Sep 01 12:00:00 2016 | | | | | | | | | | - 10 | 2 | dog | cat | 01-01-2010 | Thu Sep 01 12:00:00 2016 | | | | | | | | | | - 11 | 2 | dog | cat | 01-01-2010 | Thu Dec 31 11:12:13 1970 | 
Thu Sep 29 12:00:00 2016 | {This,is,the,real,world} | | | | | | | | - 12 | 2 | dog | cat | 01-01-2010 | Thu Dec 31 11:12:13 1970 | Thu Sep 29 12:00:00 2016 | {This,is,the,real,world} | | | | | | | | - 13 | 2 | dog | cat | 01-01-2010 | Thu Dec 31 11:12:13 1970 | Thu Sep 29 12:00:00 2016 | {This,is,no,fantasy} | -5 | | | | | | | - 14 | 2 | dog | cat | 01-01-2010 | Thu Dec 31 11:12:13 1970 | Thu Sep 29 12:00:00 2016 | {This,is,no,fantasy} | -5 | | | | | | | - 15 | 2 | dog | cat | 01-01-2010 | Thu Dec 31 11:12:13 1970 | Thu Sep 29 12:00:00 2016 | {This,is,no,fantasy} | 9 | 13 | 180000000000018 | | | | | - 16 | 2 | dog | cat | 01-01-2010 | Thu Dec 31 11:12:13 1970 | Thu Sep 29 12:00:00 2016 | {This,is,no,fantasy} | 9 | 13 | 180000000000018 | | | | | - 17 | 2 | dog | cat | 01-01-2010 | Thu Dec 31 11:12:13 1970 | Thu Sep 29 12:00:00 2016 | {This,is,no,fantasy} | 9 | 13 | -9999999999999999 | 1.00000000001 | | | | - 18 | 2 | dog | cat | 01-01-2010 | Thu Dec 31 11:12:13 1970 | Thu Sep 29 12:00:00 2016 | {This,is,no,fantasy} | 9 | 13 | -9999999999999999 | 1.00000000001 | | | | - 19 | 2 | dog | cat | 01-01-2010 | Thu Dec 31 11:12:13 1970 | Thu Sep 29 12:00:00 2016 | {This,is,no,fantasy} | 9 | 13 | -9999999999999999 | 2.000000000000002 | 12:00:00 | | | - 20 | 2 | dog | cat | 01-01-2010 | Thu Dec 31 11:12:13 1970 | Thu Sep 29 12:00:00 2016 | {This,is,no,fantasy} | 9 | 13 | -9999999999999999 | 2.000000000000002 | 12:00:00 | | | - 21 | 2 | dog | cat | 01-01-2010 | Thu Dec 31 11:12:13 1970 | Thu Sep 29 12:00:00 2016 | {This,is,no,fantasy} | 9 | 13 | -9999999999999999 | 2.000000000000002 | 23:59:59 | @ 1 day | | - 22 | 2 | dog | cat | 01-01-2010 | Thu Dec 31 11:12:13 1970 | Thu Sep 29 12:00:00 2016 | {This,is,no,fantasy} | 9 | 13 | -9999999999999999 | 2.000000000000002 | 23:59:59 | @ 1 day | | + 1 | 1 | hello | world | 06-02-2016 | Thu Sep 01 12:00:00 2016 | | {This,is,the,real,world} | -5 | | 180000000000018 | 1.00000000001 | 12:00:00 | @ 1 day | t | f + 2 | 1 | hello | world | 06-02-2016 | Thu Sep 01 12:00:00 2016 | | {This,is,the,real,world} | -5 | | 180000000000018 | 1.00000000001 | 12:00:00 | @ 1 day | t | f + 3 | 2 | hello | world | 06-02-2016 | Thu Sep 01 12:00:00 2016 | | {This,is,the,real,world} | -5 | | 180000000000018 | 1.00000000001 | 12:00:00 | @ 1 day | t | f + 4 | 2 | hello | world | 06-02-2016 | Thu Sep 01 12:00:00 2016 | | {This,is,the,real,world} | -5 | | 180000000000018 | 1.00000000001 | 12:00:00 | @ 1 day | t | f + 5 | 2 | dog | world | 06-02-2016 | Thu Sep 01 12:00:00 2016 | | {This,is,the,real,world} | -5 | | 180000000000018 | 1.00000000001 | 12:00:00 | @ 1 day | t | f + 6 | 2 | dog | world | 06-02-2016 | Thu Sep 01 12:00:00 2016 | | {This,is,the,real,world} | -5 | | 180000000000018 | 1.00000000001 | 12:00:00 | @ 1 day | t | f + 7 | 2 | dog | cat | 06-02-2016 | Thu Sep 01 12:00:00 2016 | | {This,is,the,real,world} | -5 | | 180000000000018 | 1.00000000001 | 12:00:00 | @ 1 day | t | f + 8 | 2 | dog | cat | 06-02-2016 | Thu Sep 01 12:00:00 2016 | | {This,is,the,real,world} | -5 | | 180000000000018 | 1.00000000001 | 12:00:00 | @ 1 day | t | f + 9 | 2 | dog | cat | 01-01-2010 | Thu Sep 01 12:00:00 2016 | | {This,is,the,real,world} | -5 | | 180000000000018 | 1.00000000001 | 12:00:00 | @ 1 day | t | f + 10 | 2 | dog | cat | 01-01-2010 | Thu Sep 01 12:00:00 2016 | | {This,is,the,real,world} | -5 | | 180000000000018 | 1.00000000001 | 12:00:00 | @ 1 day | t | f + 11 | 2 | dog | cat | 01-01-2010 | Thu Dec 31 11:12:13 1970 | Thu Sep 29 12:00:00 2016 | {This,is,the,real,world} | -5 | | 
180000000000018 | 1.00000000001 | 12:00:00 | @ 1 day | t | f + 12 | 2 | dog | cat | 01-01-2010 | Thu Dec 31 11:12:13 1970 | Thu Sep 29 12:00:00 2016 | {This,is,the,real,world} | -5 | | 180000000000018 | 1.00000000001 | 12:00:00 | @ 1 day | t | f + 13 | 2 | dog | cat | 01-01-2010 | Thu Dec 31 11:12:13 1970 | Thu Sep 29 12:00:00 2016 | {This,is,no,fantasy} | -5 | | 180000000000018 | 1.00000000001 | 12:00:00 | @ 1 day | t | f + 14 | 2 | dog | cat | 01-01-2010 | Thu Dec 31 11:12:13 1970 | Thu Sep 29 12:00:00 2016 | {This,is,no,fantasy} | -5 | | 180000000000018 | 1.00000000001 | 12:00:00 | @ 1 day | t | f + 15 | 2 | dog | cat | 01-01-2010 | Thu Dec 31 11:12:13 1970 | Thu Sep 29 12:00:00 2016 | {This,is,no,fantasy} | 9 | 13 | 180000000000018 | 1.00000000001 | 12:00:00 | @ 1 day | t | f + 16 | 2 | dog | cat | 01-01-2010 | Thu Dec 31 11:12:13 1970 | Thu Sep 29 12:00:00 2016 | {This,is,no,fantasy} | 9 | 13 | 180000000000018 | 1.00000000001 | 12:00:00 | @ 1 day | t | f + 17 | 2 | dog | cat | 01-01-2010 | Thu Dec 31 11:12:13 1970 | Thu Sep 29 12:00:00 2016 | {This,is,no,fantasy} | 9 | 13 | -9999999999999999 | 1.00000000001 | 12:00:00 | @ 1 day | t | f + 18 | 2 | dog | cat | 01-01-2010 | Thu Dec 31 11:12:13 1970 | Thu Sep 29 12:00:00 2016 | {This,is,no,fantasy} | 9 | 13 | -9999999999999999 | 1.00000000001 | 12:00:00 | @ 1 day | t | f + 19 | 2 | dog | cat | 01-01-2010 | Thu Dec 31 11:12:13 1970 | Thu Sep 29 12:00:00 2016 | {This,is,no,fantasy} | 9 | 13 | -9999999999999999 | 2.000000000000002 | 12:00:00 | @ 1 day | t | f + 20 | 2 | dog | cat | 01-01-2010 | Thu Dec 31 11:12:13 1970 | Thu Sep 29 12:00:00 2016 | {This,is,no,fantasy} | 9 | 13 | -9999999999999999 | 2.000000000000002 | 12:00:00 | @ 1 day | t | f + 21 | 2 | dog | cat | 01-01-2010 | Thu Dec 31 11:12:13 1970 | Thu Sep 29 12:00:00 2016 | {This,is,no,fantasy} | 9 | 13 | -9999999999999999 | 2.000000000000002 | 23:59:59 | @ 1 day | t | f + 22 | 2 | dog | cat | 01-01-2010 | Thu Dec 31 11:12:13 1970 | Thu Sep 29 12:00:00 2016 | {This,is,no,fantasy} | 9 | 13 | -9999999999999999 | 2.000000000000002 | 23:59:59 | @ 1 day | t | f 23 | 2 | dog | cat | 01-01-2010 | Thu Dec 31 11:12:13 1970 | Thu Sep 29 12:00:00 2016 | {This,is,no,fantasy} | 9 | 13 | -9999999999999999 | 2.000000000000002 | 23:59:59 | @ 3 hours | t | f 24 | 2 | dog | cat | 01-01-2010 | Thu Dec 31 11:12:13 1970 | Thu Sep 29 12:00:00 2016 | {This,is,no,fantasy} | 9 | 13 | -9999999999999999 | 2.000000000000002 | 23:59:59 | @ 3 hours | t | f 25 | 2 | dog | cat | 01-01-2010 | Thu Dec 31 11:12:13 1970 | Thu Sep 29 12:00:00 2016 | {This,is,no,fantasy} | 9 | 13 | -9999999999999999 | 2.000000000000002 | 23:59:59 | | f | t @@ -226,16 +226,16 @@ INSERT INTO T VALUES (15), (16); SELECT * FROM T order by 1; pk | c_int | c_bpchar | c_text | c_date | c_timestamp | c_array ----+-------+----------+--------------+------------+--------------------------+------------------------------- - 1 | 6 | | | | | - 2 | 6 | | | | | - 3 | 8 | abcd | | | | - 4 | 8 | abcd | | | | - 5 | 8 | abc | abcdef | | | - 6 | 8 | abc | abcdef | | | - 7 | 8 | abc | abcdefghijkl | 06-12-2016 | | - 8 | 8 | abc | abcdefghijkl | 06-12-2016 | | - 9 | 8 | abc | abcdefghijkl | 12-28-2009 | Sun Sep 11 00:00:00 2016 | - 10 | 8 | abc | abcdefghijkl | 12-28-2009 | Sun Sep 11 00:00:00 2016 | + 1 | 6 | abcd | abcdef | 06-12-2016 | Sun Sep 11 00:00:00 2016 | {This,is,abcd,the,real,world} + 2 | 6 | abcd | abcdef | 06-12-2016 | Sun Sep 11 00:00:00 2016 | {This,is,abcd,the,real,world} + 3 | 8 | abcd | abcdef | 06-12-2016 | Sun Sep 11 00:00:00 2016 | 
{This,is,abcd,the,real,world} + 4 | 8 | abcd | abcdef | 06-12-2016 | Sun Sep 11 00:00:00 2016 | {This,is,abcd,the,real,world} + 5 | 8 | abc | abcdef | 06-12-2016 | Sun Sep 11 00:00:00 2016 | {This,is,abcd,the,real,world} + 6 | 8 | abc | abcdef | 06-12-2016 | Sun Sep 11 00:00:00 2016 | {This,is,abcd,the,real,world} + 7 | 8 | abc | abcdefghijkl | 06-12-2016 | Sun Sep 11 00:00:00 2016 | {This,is,abcd,the,real,world} + 8 | 8 | abc | abcdefghijkl | 06-12-2016 | Sun Sep 11 00:00:00 2016 | {This,is,abcd,the,real,world} + 9 | 8 | abc | abcdefghijkl | 12-28-2009 | Sun Sep 11 00:00:00 2016 | {This,is,abcd,the,real,world} + 10 | 8 | abc | abcdefghijkl | 12-28-2009 | Sun Sep 11 00:00:00 2016 | {This,is,abcd,the,real,world} 11 | 8 | abc | abcdefghijkl | 12-28-2009 | Sat Jan 30 00:00:00 1971 | {This,is,abcd,the,real,world} 12 | 8 | abc | abcdefghijkl | 12-28-2009 | Sat Jan 30 00:00:00 1971 | {This,is,abcd,the,real,world} 13 | | abc | abcdefghijkl | 12-28-2009 | Sat Jan 30 00:00:00 1971 | {This,is,a,fantasy} From 192de069c410854af11f526bd6da263aaa0f3df4 Mon Sep 17 00:00:00 2001 From: andrelin Date: Mon, 27 Dec 2021 16:44:20 +0800 Subject: [PATCH 463/578] Remove IS_PGXC_DATANODE constrains in epqcontext deparse of exec_bind_message tapd: http://tapd.oa.com/pgxz/bugtrace/bugs/view?bug_id=1010092131095603881 --- contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c | 2 -- src/backend/tcop/postgres.c | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c b/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c index 74af0249..625c4394 100644 --- a/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c +++ b/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c @@ -568,12 +568,10 @@ pg_stat_get_remote_activity(const char *sessionid, bool coordonly, Tuplestoresta { #define QUERY_LEN 1024 char query[QUERY_LEN]; - int i; EState *estate; MemoryContext oldcontext; RemoteQuery *plan; RemoteQueryState *pstate; - Var *dummy; TupleTableSlot *result = NULL; /* diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index 5c80dfb5..735035de 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -2800,7 +2800,7 @@ exec_bind_message(StringInfo input_message) } /* Get epq context, only datanodes need them */ - if (IS_PGXC_DATANODE && (IsConnFromCoord() || IsConnFromDatanode())) + if (IsConnFromCoord() || IsConnFromDatanode()) { num_epq_tuple = pq_getmsgint(input_message, 2); if (num_epq_tuple > 0) From 31f0f877f657b29ff54d40737ac049ac5f1aa037 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Wed, 8 Dec 2021 15:44:07 +0800 Subject: [PATCH 464/578] fix duplicate binding because of unique overflow http://tapd.oa.com/TBase_Oracle_Migration/bugtrace/bugs/view?bug_id=1020421696094794629&jump_count=1 (merge request !1004) Squash merge branch 'sigmalin_v5' into 'Tbase_v5.06' fix duplicate binding because of unique overflow http://tapd.oa.com/TBase_Oracle_Migration/bugtrace/bugs/view?bug_id=1020421696094794629&jump_count=1 TAPD: --bug=094794629 (cherry picked from commit c3df25c4) 4064c9fe fix duplicate binding because of unique overflow http://tapd.oa.com/TBase_Oracle_Migration/bugtrace/bugs/view?bug_id=1020421696094794629&jump_count=1 --- src/backend/commands/explain.c | 2 +- src/backend/nodes/outfuncs.c | 6 ++++- src/backend/nodes/readfuncs.c | 8 +++++- src/backend/pgxc/pool/execRemote.c | 39 ++++++++++++++++-------------- src/backend/pgxc/squeue/squeue.c | 6 ++--- src/backend/tcop/pquery.c | 2 +- 
src/include/pgxc/execRemote.h | 2 +- src/include/pgxc/planner.h | 2 +- 8 files changed, 40 insertions(+), 27 deletions(-) diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index 3ae93589..5f0ec309 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -1244,7 +1244,7 @@ ExplainNode(PlanState *planstate, List *ancestors, if (rsubplan->cursor) { if (rsubplan->unique) - snprintf(cursor, NAMEDATALEN, "%s_%d", rsubplan->cursor, rsubplan->unique); + snprintf(cursor, NAMEDATALEN, "%s_"INT64_FORMAT, rsubplan->cursor, rsubplan->unique); else strncpy(cursor, rsubplan->cursor, NAMEDATALEN); } diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c index 86320cfb..f6b2295d 100644 --- a/src/backend/nodes/outfuncs.c +++ b/src/backend/nodes/outfuncs.c @@ -99,6 +99,10 @@ static void outChar(StringInfo str, char c); #define WRITE_UINT_FIELD(fldname) \ appendStringInfo(str, " :" CppAsString(fldname) " %u", node->fldname) +/* Write an int64 field (anything written as ":fldname %d") */ +#define WRITE_INT64_FIELD(fldname) \ + appendStringInfo(str, " :" CppAsString(fldname) " "INT64_FORMAT, node->fldname) + #ifdef XCP /* Only allow output OIDs in not portable mode */ #define WRITE_OID_FIELD(fldname) \ @@ -1700,7 +1704,7 @@ _outRemoteSubplan(StringInfo str, const RemoteSubplan *node) WRITE_BOOL_FIELD(execOnAll); WRITE_NODE_FIELD(sort); WRITE_STRING_FIELD(cursor); - WRITE_INT_FIELD(unique); + WRITE_INT64_FIELD(unique); WRITE_BOOL_FIELD(parallelWorkerSendTuple); WRITE_BITMAPSET_FIELD(initPlanParams); diff --git a/src/backend/nodes/readfuncs.c b/src/backend/nodes/readfuncs.c index 33d51a1e..96f4ca05 100644 --- a/src/backend/nodes/readfuncs.c +++ b/src/backend/nodes/readfuncs.c @@ -116,6 +116,12 @@ set_portable_input(bool value) token = pg_strtok(&length); /* get field value */ \ local_node->fldname = atoui(token) +/* Read an integer field (anything written as ":fldname %d") */ +#define READ_INT64_FIELD(fldname) \ + token = pg_strtok(&length); /* skip :fldname */ \ + token = pg_strtok(&length); /* get field value */ \ + local_node->fldname = atoll(token) + #ifdef XCP /* Read a long integer field (anything written as ":fldname %ld") */ #define READ_LONG_FIELD(fldname) \ @@ -3812,7 +3818,7 @@ _readRemoteSubplan(void) READ_BOOL_FIELD(execOnAll); READ_NODE_FIELD(sort); READ_STRING_FIELD(cursor); - READ_INT_FIELD(unique); + READ_INT64_FIELD(unique); READ_BOOL_FIELD(parallelWorkerSendTuple); READ_BITMAPSET_FIELD(initPlanParams); diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 45bac135..4da43b60 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -9706,8 +9706,8 @@ ExecEndRemoteQuery(RemoteQueryState *node) * take writable copy of the plan tree. 
*/ void -RemoteSubplanMakeUnique(Node *plan, int unique) -{// #lizard forgives +RemoteSubplanMakeUnique(Node *plan, int unique, int pid) +{ if (plan == NULL) return; @@ -9716,7 +9716,7 @@ RemoteSubplanMakeUnique(Node *plan, int unique) ListCell *lc; foreach(lc, (List *) plan) { - RemoteSubplanMakeUnique(lfirst(lc), unique); + RemoteSubplanMakeUnique(lfirst(lc), unique, pid); } return; } @@ -9726,34 +9726,37 @@ RemoteSubplanMakeUnique(Node *plan, int unique) */ if (IsA(plan, RemoteSubplan)) { - int old = ((RemoteSubplan *)plan)->unique; - ((RemoteSubplan *)plan)->unique = old * MAX_NODES_NUMBER + unique; + /* + * add node information and pid to make it unique + */ + ((RemoteSubplan *)plan)->unique = ((int64)unique << 32) | pid; } + /* Otherwise it is a Plan descendant */ - RemoteSubplanMakeUnique((Node *) ((Plan *) plan)->lefttree, unique); - RemoteSubplanMakeUnique((Node *) ((Plan *) plan)->righttree, unique); + RemoteSubplanMakeUnique((Node *) ((Plan *) plan)->lefttree, unique, pid); + RemoteSubplanMakeUnique((Node *) ((Plan *) plan)->righttree, unique, pid); /* Tranform special cases */ switch (nodeTag(plan)) { case T_Append: RemoteSubplanMakeUnique((Node *) ((Append *) plan)->appendplans, - unique); + unique, pid); break; case T_MergeAppend: RemoteSubplanMakeUnique((Node *) ((MergeAppend *) plan)->mergeplans, - unique); + unique, pid); break; case T_BitmapAnd: RemoteSubplanMakeUnique((Node *) ((BitmapAnd *) plan)->bitmapplans, - unique); + unique, pid); break; case T_BitmapOr: RemoteSubplanMakeUnique((Node *) ((BitmapOr *) plan)->bitmapplans, - unique); + unique, pid); break; case T_SubqueryScan: RemoteSubplanMakeUnique((Node *) ((SubqueryScan *) plan)->subplan, - unique); + unique, pid); break; default: break; @@ -10290,7 +10293,7 @@ ExecInitRemoteSubplan(RemoteSubplan *node, EState *estate, int eflags) * traverse the subtree and change SharedQueue name to make it * unique. 
*/ - RemoteSubplanMakeUnique((Node *) outerPlan(node), PGXCNodeId); + RemoteSubplanMakeUnique((Node *) outerPlan(node), PGXCNodeId - 1, MyProcPid); elog(DEBUG3, "RemoteSubplanMakeUnique for LOCATOR_TYPE_NONE unique: %d, cursor: %s", PGXCNodeId, node->cursor); } @@ -10577,7 +10580,7 @@ ExecFinishInitRemoteSubplan(RemoteSubplanState *node) Assert(plan->cursor); if (plan->unique) - snprintf(cursor, NAMEDATALEN, "%s_%d", plan->cursor, plan->unique); + snprintf(cursor, NAMEDATALEN, "%s_"INT64_FORMAT, plan->cursor, plan->unique); else strncpy(cursor, plan->cursor, NAMEDATALEN); @@ -10988,7 +10991,7 @@ ExecRemoteSubplan(PlanState *pstate) { fetch = PGXLRemoteFetchSize; if (plan->unique) - snprintf(cursor, NAMEDATALEN, "%s_%d", plan->cursor, plan->unique); + snprintf(cursor, NAMEDATALEN, "%s_"INT64_FORMAT, plan->cursor, plan->unique); else strncpy(cursor, plan->cursor, NAMEDATALEN); } @@ -11411,7 +11414,7 @@ ExecFinishRemoteSubplan(RemoteSubplanState *node) if (plan->cursor) { if (plan->unique) - snprintf(cursor, NAMEDATALEN, "%s_%d", plan->cursor, plan->unique); + snprintf(cursor, NAMEDATALEN, "%s_"INT64_FORMAT, plan->cursor, plan->unique); else strncpy(cursor, plan->cursor, NAMEDATALEN); } @@ -11496,7 +11499,7 @@ ExecDisconnectRemoteSubplan(RemoteSubplanState *node) if (plan->cursor) { if (plan->unique) - snprintf(cursor, NAMEDATALEN, "%s_%d", plan->cursor, plan->unique); + snprintf(cursor, NAMEDATALEN, "%s_"INT64_FORMAT, plan->cursor, plan->unique); else strncpy(cursor, plan->cursor, NAMEDATALEN); } @@ -11888,7 +11891,7 @@ ExecEndRemoteSubplan(RemoteSubplanState *node) if (plan->cursor) { if (plan->unique) - snprintf(cursor, NAMEDATALEN, "%s_%d", plan->cursor, plan->unique); + snprintf(cursor, NAMEDATALEN, "%s_"INT64_FORMAT, plan->cursor, plan->unique); else strncpy(cursor, plan->cursor, NAMEDATALEN); } diff --git a/src/backend/pgxc/squeue/squeue.c b/src/backend/pgxc/squeue/squeue.c index 5744e22e..19387c66 100644 --- a/src/backend/pgxc/squeue/squeue.c +++ b/src/backend/pgxc/squeue/squeue.c @@ -1082,8 +1082,8 @@ SharedQueueBind(const char *sqname, List *consNodes, #endif Assert(consMap); - elog(DEBUG1, "Bind node %s to squeue of step %s as a producer", - PGXC_PARENT_NODE, sqname); + elog(DEBUG1, "Bind node %s to squeue of step %s as a producer, parentPGXCNode %s, parentPGXCPid %d", + PGXC_PARENT_NODE, sqname, parentPGXCNode, parentPGXCPid); /* Initialize the shared queue */ sq->sq_pid = MyProcPid; @@ -1328,7 +1328,7 @@ SharedQueueBind(const char *sqname, List *consNodes, elog(DEBUG1, "SQueue %s has a bound producer from node %d, pid %d", sqname, sq->sq_nodeid, sq->sq_pid); - elog(DEBUG1, "Bind node %s to SQueue %s as a consumer %d", PGXC_PARENT_NODE, sqname, sq->sq_pid); + elog(DEBUG1, "Bind node %s to SQueue %s as a consumer %d, parentPGXCNode %s, parentPGXCPid %d", PGXC_PARENT_NODE, sqname, sq->sq_pid, parentPGXCNode, parentPGXCPid); /* Sanity checks */ Assert(myindex); diff --git a/src/backend/tcop/pquery.c b/src/backend/tcop/pquery.c index 295dc2a2..bb11de0b 100644 --- a/src/backend/tcop/pquery.c +++ b/src/backend/tcop/pquery.c @@ -812,7 +812,7 @@ PortalStart(Portal portal, ParamListInfo params, */ RemoteSubplanMakeUnique( (Node *) queryDesc->plannedstmt->planTree, - PGXC_PARENT_NODE_ID); + PGXC_PARENT_NODE_ID, parentPGXCPid); elog(DEBUG3, "RemoteSubplanMakeUnique for PARAM_EXEC unique: %d, portal: %s", PGXC_PARENT_NODE_ID, portal->name); diff --git a/src/include/pgxc/execRemote.h b/src/include/pgxc/execRemote.h index c76b946a..3ff1953b 100644 --- a/src/include/pgxc/execRemote.h +++ 
b/src/include/pgxc/execRemote.h @@ -367,7 +367,7 @@ extern RemoteQueryState *ExecInitRemoteQuery(RemoteQuery *node, EState *estate, extern TupleTableSlot* ExecRemoteQuery(PlanState *pstate); extern void ExecReScanRemoteQuery(RemoteQueryState *node); extern void ExecEndRemoteQuery(RemoteQueryState *step); -extern void RemoteSubplanMakeUnique(Node *plan, int unique); +extern void RemoteSubplanMakeUnique(Node *plan, int unique, int pid); extern RemoteSubplanState *ExecInitRemoteSubplan(RemoteSubplan *node, EState *estate, int eflags); extern void ExecFinishInitRemoteSubplan(RemoteSubplanState *node); extern TupleTableSlot* ExecRemoteSubplan(PlanState *pstate); diff --git a/src/include/pgxc/planner.h b/src/include/pgxc/planner.h index 9e16886d..2acef598 100644 --- a/src/include/pgxc/planner.h +++ b/src/include/pgxc/planner.h @@ -247,7 +247,7 @@ typedef struct bool execOnAll; SimpleSort *sort; char *cursor; - int unique; + int64 unique; #ifdef __TBASE__ /* * if gather is under remotesubplan, parallel worker can send tuples From 031a594970d2a515e4d83d80f6781e2edc415d4c Mon Sep 17 00:00:00 2001 From: arrowbowang Date: Wed, 8 Dec 2021 22:32:24 +0800 Subject: [PATCH 465/578] fix: compile warning --- contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c b/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c index 625c4394..8518ae8a 100644 --- a/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c +++ b/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c @@ -204,7 +204,7 @@ cursorCollectWalker(PlanState *planstate, StringInfo str) { appendStringInfoString(str, plan->cursor); if (plan->unique) - appendStringInfo(str, "_%d", plan->unique); + appendStringInfo(str, "_"INT64_FORMAT, plan->unique); /* add a space as delimiter */ appendStringInfoString(str, " "); } From f8bcda3411bba47d58a210e5c6154ca74f19b102 Mon Sep 17 00:00:00 2001 From: arrowbowang Date: Tue, 28 Dec 2021 18:04:13 +0800 Subject: [PATCH 466/578] fix: change log level to LOG --- src/backend/access/transam/xact.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index 6c393b81..c3f87e43 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -2710,7 +2710,7 @@ StartTransaction(void) if (g_twophase_state.state != TWO_PHASE_INITIALTRANS) { ClearLocalTwoPhaseState(); - elog(WARNING, "clear g_twophase_state when start transaction"); + elog(LOG, "clear g_twophase_state when start transaction"); } ShowTransactionState("StartTransaction"); } From fcc044c55d3e9bc71c22503d1cf30916fe175dcc Mon Sep 17 00:00:00 2001 From: arrowbowang Date: Tue, 28 Dec 2021 18:08:36 +0800 Subject: [PATCH 467/578] fix: remove log --- src/backend/access/transam/xact.c | 1 - 1 file changed, 1 deletion(-) diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index c3f87e43..88a9e05c 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -2710,7 +2710,6 @@ StartTransaction(void) if (g_twophase_state.state != TWO_PHASE_INITIALTRANS) { ClearLocalTwoPhaseState(); - elog(LOG, "clear g_twophase_state when start transaction"); } ShowTransactionState("StartTransaction"); } From 9b3f0cad1cdb00d0ed4d7d5e1cac27ddf06fc8ae Mon Sep 17 00:00:00 2001 From: arrowbowang Date: Mon, 1 Nov 2021 14:59:52 +0800 Subject: [PATCH 468/578] fix: pgxc_abort_connections 
send too many sync msg --- src/backend/pgxc/pool/execRemote.c | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 4da43b60..b2d52c44 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -11600,6 +11600,7 @@ void pgxc_abort_connections(PGXCNodeAllHandles *all_handles) int ret = false; int i = 0; bool need_loop_check = false; + bool need_sync = true; if (all_handles) { @@ -11688,11 +11689,16 @@ void pgxc_abort_connections(PGXCNodeAllHandles *all_handles) if (handle->state != DN_CONNECTION_STATE_IDLE || !node_ready_for_query(handle) || pgxc_node_is_data_enqueued(handle)) { elog(DEBUG1, "pgxc_abort_connections recheck node:%s not ready for query, status:%d, sync", handle->nodename, handle->state); - ret = pgxc_node_send_sync(handle); - if (!ret) + + if (need_sync) { - need_loop_check = true; + ret = pgxc_node_send_sync(handle); + if (ret != 0) + elog(WARNING, "pgxc_abort_connections failed to send sync to node %s", handle->nodename); } + + need_loop_check = true; + if (proc_exit_inprogress) { handle->state = DN_CONNECTION_STATE_IDLE; @@ -11729,11 +11735,16 @@ void pgxc_abort_connections(PGXCNodeAllHandles *all_handles) if (handle->state != DN_CONNECTION_STATE_IDLE || !node_ready_for_query(handle) || pgxc_node_is_data_enqueued(handle)) { elog(DEBUG1, "pgxc_abort_connections recheck node:%s not ready for query, status:%d, sync", handle->nodename, handle->state); - ret = pgxc_node_send_sync(handle); - if (!ret) + + if (need_sync) { - need_loop_check = true; + ret = pgxc_node_send_sync(handle); + if (ret != 0) + elog(WARNING, "pgxc_abort_connections failed to send sync to node %s", handle->nodename); } + + need_loop_check = true; + if (proc_exit_inprogress) { handle->state = DN_CONNECTION_STATE_IDLE; @@ -11758,6 +11769,7 @@ void pgxc_abort_connections(PGXCNodeAllHandles *all_handles) } } + need_sync = false; /* no need to recheck, break the loop. 
*/ if (!need_loop_check) { From 54e88388bbe904cb64c6e9e48bfb3d9cf4c89a3c Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Wed, 16 Sep 2020 13:02:57 +0800 Subject: [PATCH 469/578] fix bug that the rollback does not take effect when the stored procedure is called --- src/backend/access/transam/xact.c | 8 +- src/backend/pgxc/pool/execRemote.c | 2 +- src/backend/tcop/postgres.c | 2 + src/backend/utils/error/elog.c | 2 +- src/include/pgxc/execRemote.h | 4 +- .../src/expected/plpgsql_transaction.out | 625 ++++++++++++++++++ src/pl/plpgsql/src/pl_exec.c | 4 +- .../plpgsql/src/sql/plpgsql_transaction.sql | 537 +++++++++++++++ 8 files changed, 1175 insertions(+), 9 deletions(-) create mode 100644 src/pl/plpgsql/src/expected/plpgsql_transaction.out create mode 100644 src/pl/plpgsql/src/sql/plpgsql_transaction.sql diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index 88a9e05c..8efd0ff2 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -171,7 +171,7 @@ bool g_allow_force_ddl = false; #endif #ifdef __TBASE__ -extern PGDLLIMPORT bool g_in_plpgsql_exec_fun; +extern PGDLLIMPORT int g_in_plpgsql_exec_fun; extern bool PlpgsqlDebugPrint; #endif @@ -4184,7 +4184,7 @@ AbortTransaction(void) #endif #ifdef __TBASE__ - SetExitPlpgsqlFunc(); + //SetExitPlpgsqlFunc(); SetExitCreateExtension(); SetCurrentHandlesReadonly(); AtEOXact_Global(); @@ -8248,12 +8248,12 @@ void SetTopXactNeedBeginTxn(void) void SetEnterPlpgsqlFunc(void) { - g_in_plpgsql_exec_fun = true; + g_in_plpgsql_exec_fun = g_in_plpgsql_exec_fun + 1; } void SetExitPlpgsqlFunc(void) { - g_in_plpgsql_exec_fun = false; + g_in_plpgsql_exec_fun = g_in_plpgsql_exec_fun - 1; } diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index b2d52c44..b9f9c4a2 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -82,7 +82,7 @@ int PGXLRemoteFetchSize; #ifdef __TBASE__ -bool g_in_plpgsql_exec_fun = false; +int g_in_plpgsql_exec_fun = 0; bool PlpgsqlDebugPrint = false; bool need_global_snapshot = false; diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index 735035de..7735fd18 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -5248,6 +5248,8 @@ PostgresMain(int argc, char *argv[], end_query_requested = false; Executor_done = false; + g_in_plpgsql_exec_fun = 0; + ClearQueryAnalyzeInfo(); #endif diff --git a/src/backend/utils/error/elog.c b/src/backend/utils/error/elog.c index ca77995a..c03635cb 100644 --- a/src/backend/utils/error/elog.c +++ b/src/backend/utils/error/elog.c @@ -112,7 +112,7 @@ sigjmp_buf *PG_exception_stack = NULL; extern bool redirection_done; #ifdef __TBASE__ -extern PGDLLIMPORT bool g_in_plpgsql_exec_fun; +extern PGDLLIMPORT int g_in_plpgsql_exec_fun; #endif #ifdef __TBASE__ diff --git a/src/include/pgxc/execRemote.h b/src/include/pgxc/execRemote.h index 3ff1953b..7047d510 100644 --- a/src/include/pgxc/execRemote.h +++ b/src/include/pgxc/execRemote.h @@ -338,8 +338,8 @@ typedef enum extern int PGXLRemoteFetchSize; -#if __TBASE__ -extern PGDLLIMPORT bool g_in_plpgsql_exec_fun; +#ifdef __TBASE__ +extern PGDLLIMPORT int g_in_plpgsql_exec_fun; #endif diff --git a/src/pl/plpgsql/src/expected/plpgsql_transaction.out b/src/pl/plpgsql/src/expected/plpgsql_transaction.out new file mode 100644 index 00000000..d5fecb16 --- /dev/null +++ b/src/pl/plpgsql/src/expected/plpgsql_transaction.out @@ -0,0 +1,625 @@ +CREATE TABLE test1 (a int, b text); +CREATE PROCEDURE 
transaction_test1(x int, y text) +LANGUAGE plpgsql +AS $$ +BEGIN + FOR i IN 0..x LOOP + INSERT INTO test1 (a, b) VALUES (i, y); + IF i % 2 = 0 THEN + COMMIT; + ELSE + ROLLBACK; + END IF; + END LOOP; +END +$$; +CALL transaction_test1(9, 'foo'); +SELECT * FROM test1 ORDER BY a; + a | b +---+----- + 0 | foo + 2 | foo + 4 | foo + 6 | foo + 8 | foo +(5 rows) + +TRUNCATE test1; +DO +LANGUAGE plpgsql +$$ +BEGIN + FOR i IN 0..9 LOOP + INSERT INTO test1 (a) VALUES (i); + IF i % 2 = 0 THEN + COMMIT; + ELSE + ROLLBACK; + END IF; + END LOOP; +END +$$; +SELECT * FROM test1 ORDER BY a; + a | b +---+--- + 0 | + 2 | + 4 | + 6 | + 8 | +(5 rows) + +-- transaction commands not allowed when called in transaction block +START TRANSACTION; +CALL transaction_test1(9, 'error'); +ERROR: invalid transaction termination +CONTEXT: PL/pgSQL function transaction_test1(integer,text) line 6 at COMMIT +COMMIT; +START TRANSACTION; +DO LANGUAGE plpgsql $$ BEGIN COMMIT; END $$; +ERROR: invalid transaction termination +CONTEXT: PL/pgSQL function inline_code_block line 1 at COMMIT +COMMIT; +TRUNCATE test1; +-- not allowed in a function +CREATE FUNCTION transaction_test2() RETURNS int +LANGUAGE plpgsql +AS $$ +BEGIN + FOR i IN 0..9 LOOP + INSERT INTO test1 (a) VALUES (i); + IF i % 2 = 0 THEN + COMMIT; + ELSE + ROLLBACK; + END IF; + END LOOP; + RETURN 1; +END +$$; +SELECT transaction_test2(); +ERROR: invalid transaction termination +CONTEXT: PL/pgSQL function transaction_test2() line 6 at COMMIT +SELECT * FROM test1; + a | b +---+--- +(0 rows) + +-- also not allowed if procedure is called from a function +CREATE FUNCTION transaction_test3() RETURNS int +LANGUAGE plpgsql +AS $$ +BEGIN + CALL transaction_test1(9, 'error'); + RETURN 1; +END; +$$; +SELECT transaction_test3(); +ERROR: invalid transaction termination +CONTEXT: PL/pgSQL function transaction_test1(integer,text) line 6 at COMMIT +SQL statement "CALL transaction_test1(9, 'error')" +PL/pgSQL function transaction_test3() line 3 at CALL +SELECT * FROM test1; + a | b +---+--- +(0 rows) + +-- DO block inside function +CREATE FUNCTION transaction_test4() RETURNS int +LANGUAGE plpgsql +AS $$ +BEGIN + EXECUTE 'DO LANGUAGE plpgsql $x$ BEGIN COMMIT; END $x$'; + RETURN 1; +END; +$$; +SELECT transaction_test4(); +ERROR: invalid transaction termination +CONTEXT: PL/pgSQL function inline_code_block line 1 at COMMIT +SQL statement "DO LANGUAGE plpgsql $x$ BEGIN COMMIT; END $x$" +PL/pgSQL function transaction_test4() line 3 at EXECUTE +-- proconfig settings currently disallow transaction statements +CREATE PROCEDURE transaction_test5() +LANGUAGE plpgsql +SET work_mem = 555 +AS $$ +BEGIN + COMMIT; +END; +$$; +CALL transaction_test5(); +ERROR: invalid transaction termination +CONTEXT: PL/pgSQL function transaction_test5() line 3 at COMMIT +-- SECURITY DEFINER currently disallow transaction statements +CREATE PROCEDURE transaction_test5b() +LANGUAGE plpgsql +SECURITY DEFINER +AS $$ +BEGIN + COMMIT; +END; +$$; +CALL transaction_test5b(); +ERROR: invalid transaction termination +CONTEXT: PL/pgSQL function transaction_test5b() line 3 at COMMIT +TRUNCATE test1; +-- nested procedure calls +CREATE PROCEDURE transaction_test6(c text) +LANGUAGE plpgsql +AS $$ +BEGIN + CALL transaction_test1(9, c); +END; +$$; +CALL transaction_test6('bar'); +SELECT * FROM test1 ORDER BY a; + a | b +---+----- + 0 | bar + 2 | bar + 4 | bar + 6 | bar + 8 | bar +(5 rows) + +TRUNCATE test1; +CREATE PROCEDURE transaction_test7() +LANGUAGE plpgsql +AS $$ +BEGIN + DO 'BEGIN CALL transaction_test1(9, $x$baz$x$); END;'; +END; 
+$$; +CALL transaction_test7(); +SELECT * FROM test1 ORDER BY a; + a | b +---+----- + 0 | baz + 2 | baz + 4 | baz + 6 | baz + 8 | baz +(5 rows) + +CREATE PROCEDURE transaction_test8() +LANGUAGE plpgsql +AS $$ +BEGIN + EXECUTE 'CALL transaction_test1(10, $x$baz$x$)'; +END; +$$; +CALL transaction_test8(); +ERROR: invalid transaction termination +CONTEXT: PL/pgSQL function transaction_test1(integer,text) line 6 at COMMIT +SQL statement "CALL transaction_test1(10, $x$baz$x$)" +PL/pgSQL function transaction_test8() line 3 at EXECUTE +-- commit inside cursor loop +CREATE TABLE test2 (x int); +INSERT INTO test2 VALUES (0), (1), (2), (3), (4); +TRUNCATE test1; +DO LANGUAGE plpgsql $$ +DECLARE + r RECORD; +BEGIN + FOR r IN SELECT * FROM test2 ORDER BY x LOOP + INSERT INTO test1 (a) VALUES (r.x); + COMMIT; + END LOOP; +END; +$$; +SELECT * FROM test1 ORDER BY a; + a | b +---+--- + 0 | + 1 | + 2 | + 3 | + 4 | +(5 rows) + +-- check that this doesn't leak a holdable portal +SELECT * FROM pg_cursors; + name | statement | is_holdable | is_binary | is_scrollable | creation_time +------+-----------+-------------+-----------+---------------+--------------- +(0 rows) + +-- error in cursor loop with commit +TRUNCATE test1; +DO LANGUAGE plpgsql $$ +DECLARE + r RECORD; +BEGIN + FOR r IN SELECT * FROM test2 ORDER BY x LOOP + INSERT INTO test1 (a) VALUES (12/(r.x-2)); + COMMIT; + END LOOP; +END; +$$; +ERROR: division by zero +CONTEXT: SQL statement "INSERT INTO test1 (a) VALUES (12/(r.x-2))" +PL/pgSQL function inline_code_block line 6 at SQL statement +SELECT * FROM test1; + a | b +-----+--- + -6 | + -12 | +(2 rows) + +SELECT * FROM pg_cursors; + name | statement | is_holdable | is_binary | is_scrollable | creation_time +------+-----------+-------------+-----------+---------------+--------------- +(0 rows) + +-- rollback inside cursor loop +TRUNCATE test1; +DO LANGUAGE plpgsql $$ +DECLARE + r RECORD; +BEGIN + FOR r IN SELECT * FROM test2 ORDER BY x LOOP + INSERT INTO test1 (a) VALUES (r.x); + ROLLBACK; + END LOOP; +END; +$$; +SELECT * FROM test1; + a | b +---+--- +(0 rows) + +SELECT * FROM pg_cursors; + name | statement | is_holdable | is_binary | is_scrollable | creation_time +------+-----------+-------------+-----------+---------------+--------------- +(0 rows) + +-- first commit then rollback inside cursor loop +TRUNCATE test1; +DO LANGUAGE plpgsql $$ +DECLARE + r RECORD; +BEGIN + FOR r IN SELECT * FROM test2 ORDER BY x LOOP + INSERT INTO test1 (a) VALUES (r.x); + IF r.x % 2 = 0 THEN + COMMIT; + ELSE + ROLLBACK; + END IF; + END LOOP; +END; +$$; +SELECT * FROM test1 ORDER BY a; + a | b +---+--- + 0 | + 2 | + 4 | +(3 rows) + +SELECT * FROM pg_cursors; + name | statement | is_holdable | is_binary | is_scrollable | creation_time +------+-----------+-------------+-----------+---------------+--------------- +(0 rows) + +-- rollback inside cursor loop +TRUNCATE test1; +DO LANGUAGE plpgsql $$ +DECLARE + r RECORD; +BEGIN + FOR r IN UPDATE test2 SET x = x * 2 RETURNING x LOOP + INSERT INTO test1 (a) VALUES (r.x); + ROLLBACK; + END LOOP; +END; +$$; +ERROR: Distributed column or partition column "x" can't be updated in current version +CONTEXT: SQL statement "UPDATE test2 SET x = x * 2 RETURNING x" +PL/pgSQL function inline_code_block line 5 at FOR over SELECT rows +SELECT * FROM test1; + a | b +---+--- +(0 rows) + +SELECT * FROM test2 ORDER BY x; + x +--- + 0 + 1 + 2 + 3 + 4 +(5 rows) + +SELECT * FROM pg_cursors; + name | statement | is_holdable | is_binary | is_scrollable | creation_time 
+------+-----------+-------------+-----------+---------------+--------------- +(0 rows) + +-- commit inside block with exception handler +TRUNCATE test1; +DO LANGUAGE plpgsql $$ +BEGIN + BEGIN + INSERT INTO test1 (a) VALUES (1); + COMMIT; + INSERT INTO test1 (a) VALUES (1/0); + COMMIT; + EXCEPTION + WHEN division_by_zero THEN + RAISE NOTICE 'caught division_by_zero'; + END; +END; +$$; +ERROR: cannot commit while a subtransaction is active +CONTEXT: PL/pgSQL function inline_code_block line 5 at COMMIT +SELECT * FROM test1; + a | b +---+--- +(0 rows) + +-- rollback inside block with exception handler +TRUNCATE test1; +DO LANGUAGE plpgsql $$ +BEGIN + BEGIN + INSERT INTO test1 (a) VALUES (1); + ROLLBACK; + INSERT INTO test1 (a) VALUES (1/0); + ROLLBACK; + EXCEPTION + WHEN division_by_zero THEN + RAISE NOTICE 'caught division_by_zero'; + END; +END; +$$; +ERROR: cannot roll back while a subtransaction is active +CONTEXT: PL/pgSQL function inline_code_block line 5 at ROLLBACK +SELECT * FROM test1; + a | b +---+--- +(0 rows) + +-- COMMIT failures +DO LANGUAGE plpgsql $$ +BEGIN + CREATE TABLE test3 (y int UNIQUE DEFERRABLE INITIALLY DEFERRED); + COMMIT; + INSERT INTO test3 (y) VALUES (1); + COMMIT; + INSERT INTO test3 (y) VALUES (1); + INSERT INTO test3 (y) VALUES (2); + COMMIT; + INSERT INTO test3 (y) VALUES (3); -- won't get here +END; +$$; +ERROR: duplicate key value violates unique constraint "test3_y_key" +DETAIL: Key (y)=(1) already exists. +CONTEXT: PL/pgSQL function inline_code_block line 9 at COMMIT +SELECT * FROM test3; + y +--- + 1 +(1 row) + +-- failure while trying to persist a cursor across a transaction (bug #15703) +CREATE PROCEDURE cursor_fail_during_commit() + LANGUAGE plpgsql +AS $$ + DECLARE id int; + BEGIN + FOR id IN SELECT 1/(x-1000) FROM generate_series(1,1000) x LOOP + INSERT INTO test1 VALUES(id); + COMMIT; + END LOOP; + END; +$$; +TRUNCATE test1; +CALL cursor_fail_during_commit(); +ERROR: division by zero +CONTEXT: PL/pgSQL function cursor_fail_during_commit() line 6 at COMMIT +-- note that error occurs during first COMMIT, hence nothing is in test1 +SELECT count(*) FROM test1; + count +------- + 0 +(1 row) + +CREATE PROCEDURE cursor_fail_during_rollback() + LANGUAGE plpgsql +AS $$ + DECLARE id int; + BEGIN + FOR id IN SELECT 1/(x-1000) FROM generate_series(1,1000) x LOOP + INSERT INTO test1 VALUES(id); + ROLLBACK; + END LOOP; + END; +$$; +TRUNCATE test1; +CALL cursor_fail_during_rollback(); +ERROR: division by zero +CONTEXT: PL/pgSQL function cursor_fail_during_rollback() line 6 at ROLLBACK +SELECT count(*) FROM test1; + count +------- + 0 +(1 row) + +-- SET TRANSACTION +DO LANGUAGE plpgsql $$ +BEGIN + PERFORM 1; + RAISE INFO '%', current_setting('transaction_isolation'); + COMMIT; + SET TRANSACTION ISOLATION LEVEL REPEATABLE READ; + PERFORM 1; + RAISE INFO '%', current_setting('transaction_isolation'); + COMMIT; + SET TRANSACTION ISOLATION LEVEL REPEATABLE READ; + RESET TRANSACTION ISOLATION LEVEL; + PERFORM 1; + RAISE INFO '%', current_setting('transaction_isolation'); + COMMIT; +END; +$$; +INFO: read committed +INFO: repeatable read +INFO: read committed +-- error cases +DO LANGUAGE plpgsql $$ +BEGIN + SET TRANSACTION ISOLATION LEVEL REPEATABLE READ; +END; +$$; +ERROR: SET TRANSACTION ISOLATION LEVEL must be called before any query +CONTEXT: SQL statement "SET TRANSACTION ISOLATION LEVEL REPEATABLE READ" +PL/pgSQL function inline_code_block line 3 at SET +DO LANGUAGE plpgsql $$ +BEGIN + SAVEPOINT foo; +END; +$$; +ERROR: unsupported transaction command in PL/pgSQL 
+CONTEXT: PL/pgSQL function inline_code_block line 3 at SQL statement +DO LANGUAGE plpgsql $$ +BEGIN + EXECUTE 'COMMIT'; +END; +$$; +ERROR: EXECUTE of transaction commands is not implemented +CONTEXT: PL/pgSQL function inline_code_block line 3 at EXECUTE +-- snapshot handling test +TRUNCATE test2; +CREATE PROCEDURE transaction_test9() +LANGUAGE SQL +AS $$ +INSERT INTO test2 VALUES (42); +$$; +DO LANGUAGE plpgsql $$ +BEGIN + ROLLBACK; + CALL transaction_test9(); +END +$$; +SELECT * FROM test2; + x +---- + 42 +(1 row) + +-- Test transaction in procedure with output parameters. This uses a +-- different portal strategy and different code paths in pquery.c. +CREATE PROCEDURE transaction_test10a(INOUT x int) +LANGUAGE plpgsql +AS $$ +BEGIN + x := x + 1; + COMMIT; +END; +$$; +CALL transaction_test10a(10); + x +---- + 11 +(1 row) + +CREATE PROCEDURE transaction_test10b(INOUT x int) +LANGUAGE plpgsql +AS $$ +BEGIN + x := x - 1; + ROLLBACK; +END; +$$; +CALL transaction_test10b(10); + x +--- + 9 +(1 row) + +-- transaction timestamp vs. statement timestamp +CREATE PROCEDURE transaction_test11() +LANGUAGE plpgsql +AS $$ +DECLARE + s1 timestamp with time zone; + s2 timestamp with time zone; + s3 timestamp with time zone; + t1 timestamp with time zone; + t2 timestamp with time zone; + t3 timestamp with time zone; +BEGIN + s1 := statement_timestamp(); + t1 := transaction_timestamp(); + ASSERT s1 = t1; + PERFORM pg_sleep(0.001); + COMMIT; + s2 := statement_timestamp(); + t2 := transaction_timestamp(); + ASSERT s2 = s1; + ASSERT t2 > t1; + PERFORM pg_sleep(0.001); + ROLLBACK; + s3 := statement_timestamp(); + t3 := transaction_timestamp(); + ASSERT s3 = s1; + ASSERT t3 > t2; +END; +$$; +CALL transaction_test11(); +create table test(id int); +create procedure transaction_test12() as $$ +begin + insert into test values(1); + commit; +end; +$$ language plpgsql; +create procedure transaction_test13() as $$ +begin + insert into test values(100); + rollback; +end; +$$ language plpgsql; +create procedure transaction_test14() as $$ +begin + call transaction_test12(); + insert into test values(100); + rollback; +end; +$$ language plpgsql; +do $$ +begin + call transaction_test12(); + call transaction_test13(); + insert into test values(2); + rollback; + insert into test values(3); + commit; +end; +$$ language plpgsql; +select * from test order by 1; + id +---- + 1 + 3 +(2 rows) + +delete from test; +do $$ +begin + call transaction_test14(); + insert into test values(200); + rollback; +end; +$$ language plpgsql; +select * from test order by 1; + id +---- + 1 +(1 row) + +DROP TABLE test1; +DROP TABLE test2; +DROP TABLE test3; +DROP TABLE test; diff --git a/src/pl/plpgsql/src/pl_exec.c b/src/pl/plpgsql/src/pl_exec.c index 78331d9d..dd6cffb0 100644 --- a/src/pl/plpgsql/src/pl_exec.c +++ b/src/pl/plpgsql/src/pl_exec.c @@ -122,7 +122,7 @@ static SimpleEcontextStackEntry *simple_econtext_stack = NULL; MemoryContextAllocZero(get_eval_mcontext(estate), sz) #ifdef __TBASE__ -extern bool PGDLLIMPORT g_in_plpgsql_exec_fun; +extern int PGDLLIMPORT g_in_plpgsql_exec_fun; extern bool PGDLLIMPORT PlpgsqlDebugPrint; #endif @@ -1472,6 +1472,8 @@ exec_stmt_block(PLpgSQL_execstate *estate, PLpgSQL_stmt_block *block) /* Restore stmt_mcontext stack and release the error data */ pop_stmt_mcontext(estate); MemoryContextReset(stmt_mcontext); + + SetExitPlpgsqlFunc(); } PG_END_TRY(); diff --git a/src/pl/plpgsql/src/sql/plpgsql_transaction.sql b/src/pl/plpgsql/src/sql/plpgsql_transaction.sql new file mode 100644 index 00000000..827e0eba --- 
/dev/null +++ b/src/pl/plpgsql/src/sql/plpgsql_transaction.sql @@ -0,0 +1,537 @@ +CREATE TABLE test1 (a int, b text); + + +CREATE PROCEDURE transaction_test1(x int, y text) +LANGUAGE plpgsql +AS $$ +BEGIN + FOR i IN 0..x LOOP + INSERT INTO test1 (a, b) VALUES (i, y); + IF i % 2 = 0 THEN + COMMIT; + ELSE + ROLLBACK; + END IF; + END LOOP; +END +$$; + +CALL transaction_test1(9, 'foo'); + +SELECT * FROM test1 ORDER BY a; + + +TRUNCATE test1; + +DO +LANGUAGE plpgsql +$$ +BEGIN + FOR i IN 0..9 LOOP + INSERT INTO test1 (a) VALUES (i); + IF i % 2 = 0 THEN + COMMIT; + ELSE + ROLLBACK; + END IF; + END LOOP; +END +$$; + +SELECT * FROM test1 ORDER BY a; + + +-- transaction commands not allowed when called in transaction block +START TRANSACTION; +CALL transaction_test1(9, 'error'); +COMMIT; + +START TRANSACTION; +DO LANGUAGE plpgsql $$ BEGIN COMMIT; END $$; +COMMIT; + + +TRUNCATE test1; + +-- not allowed in a function +CREATE FUNCTION transaction_test2() RETURNS int +LANGUAGE plpgsql +AS $$ +BEGIN + FOR i IN 0..9 LOOP + INSERT INTO test1 (a) VALUES (i); + IF i % 2 = 0 THEN + COMMIT; + ELSE + ROLLBACK; + END IF; + END LOOP; + RETURN 1; +END +$$; + +SELECT transaction_test2(); + +SELECT * FROM test1; + + +-- also not allowed if procedure is called from a function +CREATE FUNCTION transaction_test3() RETURNS int +LANGUAGE plpgsql +AS $$ +BEGIN + CALL transaction_test1(9, 'error'); + RETURN 1; +END; +$$; + +SELECT transaction_test3(); + +SELECT * FROM test1; + + +-- DO block inside function +CREATE FUNCTION transaction_test4() RETURNS int +LANGUAGE plpgsql +AS $$ +BEGIN + EXECUTE 'DO LANGUAGE plpgsql $x$ BEGIN COMMIT; END $x$'; + RETURN 1; +END; +$$; + +SELECT transaction_test4(); + + +-- proconfig settings currently disallow transaction statements +CREATE PROCEDURE transaction_test5() +LANGUAGE plpgsql +SET work_mem = 555 +AS $$ +BEGIN + COMMIT; +END; +$$; + +CALL transaction_test5(); + + +-- SECURITY DEFINER currently disallow transaction statements +CREATE PROCEDURE transaction_test5b() +LANGUAGE plpgsql +SECURITY DEFINER +AS $$ +BEGIN + COMMIT; +END; +$$; + +CALL transaction_test5b(); + + +TRUNCATE test1; + +-- nested procedure calls +CREATE PROCEDURE transaction_test6(c text) +LANGUAGE plpgsql +AS $$ +BEGIN + CALL transaction_test1(9, c); +END; +$$; + +CALL transaction_test6('bar'); + +SELECT * FROM test1 ORDER BY a; + +TRUNCATE test1; + +CREATE PROCEDURE transaction_test7() +LANGUAGE plpgsql +AS $$ +BEGIN + DO 'BEGIN CALL transaction_test1(9, $x$baz$x$); END;'; +END; +$$; + +CALL transaction_test7(); + +SELECT * FROM test1 ORDER BY a; + +CREATE PROCEDURE transaction_test8() +LANGUAGE plpgsql +AS $$ +BEGIN + EXECUTE 'CALL transaction_test1(10, $x$baz$x$)'; +END; +$$; + +CALL transaction_test8(); + + +-- commit inside cursor loop +CREATE TABLE test2 (x int); +INSERT INTO test2 VALUES (0), (1), (2), (3), (4); + +TRUNCATE test1; + +DO LANGUAGE plpgsql $$ +DECLARE + r RECORD; +BEGIN + FOR r IN SELECT * FROM test2 ORDER BY x LOOP + INSERT INTO test1 (a) VALUES (r.x); + COMMIT; + END LOOP; +END; +$$; + +SELECT * FROM test1 ORDER BY a; + +-- check that this doesn't leak a holdable portal +SELECT * FROM pg_cursors; + + +-- error in cursor loop with commit +TRUNCATE test1; + +DO LANGUAGE plpgsql $$ +DECLARE + r RECORD; +BEGIN + FOR r IN SELECT * FROM test2 ORDER BY x LOOP + INSERT INTO test1 (a) VALUES (12/(r.x-2)); + COMMIT; + END LOOP; +END; +$$; + +SELECT * FROM test1; + +SELECT * FROM pg_cursors; + + +-- rollback inside cursor loop +TRUNCATE test1; + +DO LANGUAGE plpgsql $$ +DECLARE + r RECORD; +BEGIN + 
FOR r IN SELECT * FROM test2 ORDER BY x LOOP + INSERT INTO test1 (a) VALUES (r.x); + ROLLBACK; + END LOOP; +END; +$$; + +SELECT * FROM test1; + +SELECT * FROM pg_cursors; + + +-- first commit then rollback inside cursor loop +TRUNCATE test1; + +DO LANGUAGE plpgsql $$ +DECLARE + r RECORD; +BEGIN + FOR r IN SELECT * FROM test2 ORDER BY x LOOP + INSERT INTO test1 (a) VALUES (r.x); + IF r.x % 2 = 0 THEN + COMMIT; + ELSE + ROLLBACK; + END IF; + END LOOP; +END; +$$; + +SELECT * FROM test1 ORDER BY a; + +SELECT * FROM pg_cursors; + + +-- rollback inside cursor loop +TRUNCATE test1; + +DO LANGUAGE plpgsql $$ +DECLARE + r RECORD; +BEGIN + FOR r IN UPDATE test2 SET x = x * 2 RETURNING x LOOP + INSERT INTO test1 (a) VALUES (r.x); + ROLLBACK; + END LOOP; +END; +$$; + +SELECT * FROM test1; +SELECT * FROM test2 ORDER BY x; + +SELECT * FROM pg_cursors; + + +-- commit inside block with exception handler +TRUNCATE test1; + +DO LANGUAGE plpgsql $$ +BEGIN + BEGIN + INSERT INTO test1 (a) VALUES (1); + COMMIT; + INSERT INTO test1 (a) VALUES (1/0); + COMMIT; + EXCEPTION + WHEN division_by_zero THEN + RAISE NOTICE 'caught division_by_zero'; + END; +END; +$$; + +SELECT * FROM test1; + + +-- rollback inside block with exception handler +TRUNCATE test1; + +DO LANGUAGE plpgsql $$ +BEGIN + BEGIN + INSERT INTO test1 (a) VALUES (1); + ROLLBACK; + INSERT INTO test1 (a) VALUES (1/0); + ROLLBACK; + EXCEPTION + WHEN division_by_zero THEN + RAISE NOTICE 'caught division_by_zero'; + END; +END; +$$; + +SELECT * FROM test1; + + +-- COMMIT failures +DO LANGUAGE plpgsql $$ +BEGIN + CREATE TABLE test3 (y int UNIQUE DEFERRABLE INITIALLY DEFERRED); + COMMIT; + INSERT INTO test3 (y) VALUES (1); + COMMIT; + INSERT INTO test3 (y) VALUES (1); + INSERT INTO test3 (y) VALUES (2); + COMMIT; + INSERT INTO test3 (y) VALUES (3); -- won't get here +END; +$$; + +SELECT * FROM test3; + +-- failure while trying to persist a cursor across a transaction (bug #15703) +CREATE PROCEDURE cursor_fail_during_commit() + LANGUAGE plpgsql +AS $$ + DECLARE id int; + BEGIN + FOR id IN SELECT 1/(x-1000) FROM generate_series(1,1000) x LOOP + INSERT INTO test1 VALUES(id); + COMMIT; + END LOOP; + END; +$$; + +TRUNCATE test1; + +CALL cursor_fail_during_commit(); + +-- note that error occurs during first COMMIT, hence nothing is in test1 +SELECT count(*) FROM test1; + +CREATE PROCEDURE cursor_fail_during_rollback() + LANGUAGE plpgsql +AS $$ + DECLARE id int; + BEGIN + FOR id IN SELECT 1/(x-1000) FROM generate_series(1,1000) x LOOP + INSERT INTO test1 VALUES(id); + ROLLBACK; + END LOOP; + END; +$$; + +TRUNCATE test1; + +CALL cursor_fail_during_rollback(); + +SELECT count(*) FROM test1; + + +-- SET TRANSACTION +DO LANGUAGE plpgsql $$ +BEGIN + PERFORM 1; + RAISE INFO '%', current_setting('transaction_isolation'); + COMMIT; + SET TRANSACTION ISOLATION LEVEL REPEATABLE READ; + PERFORM 1; + RAISE INFO '%', current_setting('transaction_isolation'); + COMMIT; + SET TRANSACTION ISOLATION LEVEL REPEATABLE READ; + RESET TRANSACTION ISOLATION LEVEL; + PERFORM 1; + RAISE INFO '%', current_setting('transaction_isolation'); + COMMIT; +END; +$$; + +-- error cases +DO LANGUAGE plpgsql $$ +BEGIN + SET TRANSACTION ISOLATION LEVEL REPEATABLE READ; +END; +$$; + +DO LANGUAGE plpgsql $$ +BEGIN + SAVEPOINT foo; +END; +$$; + +DO LANGUAGE plpgsql $$ +BEGIN + EXECUTE 'COMMIT'; +END; +$$; + + +-- snapshot handling test +TRUNCATE test2; + +CREATE PROCEDURE transaction_test9() +LANGUAGE SQL +AS $$ +INSERT INTO test2 VALUES (42); +$$; + +DO LANGUAGE plpgsql $$ +BEGIN + ROLLBACK; + CALL 
transaction_test9(); +END +$$; + +SELECT * FROM test2; + + +-- Test transaction in procedure with output parameters. This uses a +-- different portal strategy and different code paths in pquery.c. +CREATE PROCEDURE transaction_test10a(INOUT x int) +LANGUAGE plpgsql +AS $$ +BEGIN + x := x + 1; + COMMIT; +END; +$$; + +CALL transaction_test10a(10); + +CREATE PROCEDURE transaction_test10b(INOUT x int) +LANGUAGE plpgsql +AS $$ +BEGIN + x := x - 1; + ROLLBACK; +END; +$$; + +CALL transaction_test10b(10); + + +-- transaction timestamp vs. statement timestamp +CREATE PROCEDURE transaction_test11() +LANGUAGE plpgsql +AS $$ +DECLARE + s1 timestamp with time zone; + s2 timestamp with time zone; + s3 timestamp with time zone; + t1 timestamp with time zone; + t2 timestamp with time zone; + t3 timestamp with time zone; +BEGIN + s1 := statement_timestamp(); + t1 := transaction_timestamp(); + ASSERT s1 = t1; + PERFORM pg_sleep(0.001); + COMMIT; + s2 := statement_timestamp(); + t2 := transaction_timestamp(); + ASSERT s2 = s1; + ASSERT t2 > t1; + PERFORM pg_sleep(0.001); + ROLLBACK; + s3 := statement_timestamp(); + t3 := transaction_timestamp(); + ASSERT s3 = s1; + ASSERT t3 > t2; +END; +$$; + +CALL transaction_test11(); + +create table test(id int); + +create procedure transaction_test12() as $$ +begin + insert into test values(1); + commit; +end; +$$ language plpgsql; + +create procedure transaction_test13() as $$ +begin + insert into test values(100); + rollback; +end; +$$ language plpgsql; + +create procedure transaction_test14() as $$ +begin + call transaction_test12(); + insert into test values(100); + rollback; +end; +$$ language plpgsql; + +do $$ +begin + call transaction_test12(); + call transaction_test13(); + insert into test values(2); + rollback; + insert into test values(3); + commit; +end; +$$ language plpgsql; + +select * from test order by 1; + +delete from test; + +do $$ +begin + call transaction_test14(); + insert into test values(200); + rollback; +end; +$$ language plpgsql; + +select * from test order by 1; + +DROP TABLE test1; +DROP TABLE test2; +DROP TABLE test3; +DROP TABLE test; From 4e65b9e7919e4a36051ab8843e645416e0e13bd1 Mon Sep 17 00:00:00 2001 From: bethding Date: Thu, 30 Dec 2021 20:34:14 +0800 Subject: [PATCH 470/578] fix prepare for fqs insert http://tapd.oa.com/10092131/bugtrace/bugs/view?bug_id=1010092131095655877&jumpfrom=RTX --- src/backend/pgxc/pool/execRemote.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index b9f9c4a2..c543fc1f 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -6826,11 +6826,13 @@ pgxc_start_command_on_connection(PGXCNodeHandle *connection, * exist */ if (exec_nodes && exec_nodes->need_rewrite == true) prepared = false; - else if (step->statement) + if (step->statement) prepared = ActivateDatanodeStatementOnNode(step->statement, PGXCNodeGetNodeId(connection->nodeoid, &nodetype)); + if (prepared && exec_nodes && exec_nodes->need_rewrite == true) + prepared = false; /* * execute and fetch rows only if they will be consumed From 960dbcd831141ba03f6cbc8e380dbecd4d5be377 Mon Sep 17 00:00:00 2001 From: whalesong Date: Wed, 5 Jan 2022 15:03:20 +0800 Subject: [PATCH 471/578] bugfix: rollback slow than commit, tpcc performce optimize(merge request 965), too many logs, adjust log level --- src/backend/pgxc/pool/pgxcnode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/backend/pgxc/pool/pgxcnode.c 
b/src/backend/pgxc/pool/pgxcnode.c index 78f1a024..f006dc41 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -2720,7 +2720,7 @@ pgxc_node_flush_read(PGXCNodeHandle *handle) /* break, only if the connection is ready for query. */ if (is_ready) { - elog(LOG, "pgxc_node_flush_read node:%s ready for query.", handle->nodename); + elog(DEBUG1, "pgxc_node_flush_read node:%s ready for query.", handle->nodename); break; } @@ -2728,7 +2728,7 @@ pgxc_node_flush_read(PGXCNodeHandle *handle) read_result = pgxc_node_read_data(handle, true); if (read_result <= 0) { - elog(LOG, "pgxc_node_flush_read node:%s read failure.", handle->nodename); + elog(DEBUG1, "pgxc_node_flush_read node:%s read failure.", handle->nodename); break; } From e87543794bf8bba933d9424e14ae0a51572dfb3e Mon Sep 17 00:00:00 2001 From: jadenchi Date: Wed, 5 Jan 2022 21:28:37 +0800 Subject: [PATCH 472/578] fix rename database failed caused by leader cn's connection remaining, same as drop database. fix http://tapd.oa.com/TencentDB_for_TBase/prong/stories/view/1020418349870883157 --- src/backend/tcop/utility.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c index d58ae205..fcdc89f5 100644 --- a/src/backend/tcop/utility.c +++ b/src/backend/tcop/utility.c @@ -2455,7 +2455,7 @@ standard_ProcessUtility(PlannedStmt *pstmt, case T_DropdbStmt: { char prepareQuery[STRINGLENGTH]; - char query[STRINGLENGTH]; + char cleanQuery[STRINGLENGTH]; DropdbStmt *stmt = (DropdbStmt *) parsetree; if (!stmt->prepare) { @@ -2479,14 +2479,14 @@ standard_ProcessUtility(PlannedStmt *pstmt, if (OidIsValid(db_oid)) { - snprintf(query, STRINGLENGTH, "CLEAN CONNECTION TO ALL FOR DATABASE %s;", + snprintf(cleanQuery, STRINGLENGTH, "CLEAN CONNECTION TO ALL FOR DATABASE %s;", quote_identifier(stmt->dbname)); snprintf(prepareQuery, STRINGLENGTH, "DROP DATABASE PREPARE %s;", quote_identifier(stmt->dbname)); if (!is_ddl_leader_cn(leaderCnHandle->nodename)) { - SendLeaderCNUtility(query, false); + SendLeaderCNUtility(cleanQuery, false); SendLeaderCNUtility(prepareQuery, false); } else @@ -2858,6 +2858,13 @@ standard_ProcessUtility(PlannedStmt *pstmt, */ if (!is_leader_cn) { + if (OBJECT_DATABASE == stmt->renameType) { + char cleanQuery[STRINGLENGTH]; + snprintf(cleanQuery, STRINGLENGTH, "CLEAN CONNECTION TO ALL FOR DATABASE %s;", + quote_identifier(stmt->subname)); + SendLeaderCNUtility(cleanQuery, false); + } + SendLeaderCNUtility(queryString, is_temp); } ExecRenameStmt(stmt); From 3a34ec77cb90e59649411dbc9f112ea6b1f6f0d4 Mon Sep 17 00:00:00 2001 From: jadenchi Date: Thu, 6 Jan 2022 10:37:31 +0800 Subject: [PATCH 473/578] modify line feed style --- src/backend/tcop/utility.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c index fcdc89f5..4f753680 100644 --- a/src/backend/tcop/utility.c +++ b/src/backend/tcop/utility.c @@ -2858,7 +2858,8 @@ standard_ProcessUtility(PlannedStmt *pstmt, */ if (!is_leader_cn) { - if (OBJECT_DATABASE == stmt->renameType) { + if (OBJECT_DATABASE == stmt->renameType) + { char cleanQuery[STRINGLENGTH]; snprintf(cleanQuery, STRINGLENGTH, "CLEAN CONNECTION TO ALL FOR DATABASE %s;", quote_identifier(stmt->subname)); From 7d3f95c0f160a9e9ed76be00b9ef2440f1c3b78d Mon Sep 17 00:00:00 2001 From: whalesong Date: Thu, 6 Jan 2022 11:48:22 +0800 Subject: [PATCH 474/578] bugfix: uos testing core in getTxnInfoOnNode (merge request !1071) 
http://tapd.oa.com/TBase_Oracle_Migration/bugtrace/bugs/view?bug_id=1020421696095841513&jump_count=1 (cherry picked from commit fcdbc285) 417a33be bugfix: uos testing core in getTxnInfoOnNode --- contrib/pg_clean/pg_clean.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/contrib/pg_clean/pg_clean.c b/contrib/pg_clean/pg_clean.c index 0b2f6f98..68d916a5 100644 --- a/contrib/pg_clean/pg_clean.c +++ b/contrib/pg_clean/pg_clean.c @@ -1033,6 +1033,21 @@ void getTxnInfoOnNode(Oid node) ObjectIdGetDatum(InvalidOid), Int32GetDatum(-1))); + if (gid == NULL) + { + elog(ERROR, "node(%d) gid is null, xid: %d", node, xid); + } + else if (owner == NULL) + { + elog(ERROR, "node(%d) owner is null, xid: %d, gid: %s", + node, xid, gid); + } + else if (datname == NULL) + { + elog(ERROR, "node(%d) db name is null, xid: %d, gid: %s, owner: %s", + node, xid, gid, owner); + } + /*add txn to database*/ add_txn_info(datname, node, xid, gid, owner, prepared_time, TXN_STATUS_PREPARED); if (total_twopc_txn >= MAX_TWOPC_TXN) From 73aa3b85f4645d6166378b97abdfb9ea7db0a7bc Mon Sep 17 00:00:00 2001 From: whalesong Date: Sat, 11 Dec 2021 16:24:43 +0800 Subject: [PATCH 475/578] arm compile error fix: inline func check_entry_key error --- src/backend/access/transam/twophase.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c index 387cdf73..15ddc9f3 100644 --- a/src/backend/access/transam/twophase.c +++ b/src/backend/access/transam/twophase.c @@ -190,7 +190,7 @@ typedef struct Cache2pcInfo } Cache2pcInfo; -inline void check_entry_key(const char *tid, const char *key); +static inline void check_entry_key(const char *tid, const char *key); bool add_2pc_info(const char *tid, const char *info); @@ -3319,7 +3319,7 @@ PrepareRedoRemove(TransactionId xid, bool giveWarning) /* * check_entry_key: check the entry key in the hash table whether is same with tid. 
*/ -inline void check_entry_key(const char *tid, const char *key) +static inline void check_entry_key(const char *tid, const char *key) { if (enable_2pc_entry_key_check) { From 9246bef0698a61a3ded01da9cf3bd9540ad430a0 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Wed, 5 Jan 2022 16:52:51 +0800 Subject: [PATCH 476/578] fix modify the system parameters on slave cn http://tapd.oa.com/TBase_Oracle_Migration/bugtrace/bugs/view/1020421696095817651 TAPD: --bug=095817651 (cherry picked from commit be98dc09) 5647a757 add fix a4b1d061 fix modify the system parameters on slave cn http://tapd.oa.com/TBase_Oracle_Migration/bugtrace/bugs/view/1020421696095817651 --- src/backend/pgxc/pool/poolmgr.c | 3 +-- src/backend/tcop/utility.c | 4 +++- src/include/pgxc/pgxc.h | 2 ++ 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/src/backend/pgxc/pool/poolmgr.c b/src/backend/pgxc/pool/poolmgr.c index 2db4397e..6e4f3283 100644 --- a/src/backend/pgxc/pool/poolmgr.c +++ b/src/backend/pgxc/pool/poolmgr.c @@ -1842,8 +1842,7 @@ PoolManagerGetConnections(List *datanodelist, List *coordlist, bool raise_error, * the main data node, and the standby cn may generate the same global xid as the main cn, * so disable the distributed query of the standby node on the main plane */ - if (g_allow_distri_query_on_standby_node == false && - IsPGXCMainCluster && RecoveryInProgress()) + if (g_allow_distri_query_on_standby_node == false && IS_PGXC_MAINCLUSTER_SLAVENODE) { elog(ERROR, "can't do distributed query because it is the main plane standby node."); } diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c index 4f753680..799b83a2 100644 --- a/src/backend/tcop/utility.c +++ b/src/backend/tcop/utility.c @@ -7799,7 +7799,9 @@ IsStmtAllowedInLockedMode(Node *parsetree, const char *queryString) case T_LockNodeStmt: #endif return ALLOW; - + case T_AlterSystemStmt: + /* allow if it's main cluster slave */ + return (IS_PGXC_MAINCLUSTER_SLAVENODE) ? 
ALLOW : DISALLOW; default: return DISALLOW; } diff --git a/src/include/pgxc/pgxc.h b/src/include/pgxc/pgxc.h index 6c5abcf4..687be6c8 100644 --- a/src/include/pgxc/pgxc.h +++ b/src/include/pgxc/pgxc.h @@ -120,6 +120,8 @@ extern Datum xc_lockForBackupKey2; (IS_PGXC_COORDINATOR && !IsConnFromCoord()) #define IS_PGXC_REMOTE_COORDINATOR \ (IS_PGXC_COORDINATOR && IsConnFromCoord()) +#define IS_PGXC_MAINCLUSTER_SLAVENODE \ + (IsPGXCMainCluster && RecoveryInProgress()) #define PGXC_PARENT_NODE parentPGXCNode #define PGXC_PARENT_NODE_ID parentPGXCNodeId From 3c007cdc499676120d47456e2a4222f30558534f Mon Sep 17 00:00:00 2001 From: sigmalin Date: Tue, 11 Jan 2022 11:16:02 +0800 Subject: [PATCH 477/578] fix core when ExecCloseRemoteStatement http://tapd.oa.com/10092131/bugtrace/bugs/view?bug_id=1010092131095972651 --- src/backend/pgxc/pool/execRemote.c | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index c543fc1f..798f91af 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -7289,9 +7289,9 @@ PGXCNodeCleanAndRelease(int code, Datum arg) stat_log(); } -void -ExecCloseRemoteStatement(const char *stmt_name, List *nodelist) -{// #lizard forgives +static void +ExecCloseRemoteStatementInternal(const char *stmt_name, List *nodelist) +{ PGXCNodeAllHandles *all_handles; PGXCNodeHandle **connections; ResponseCombiner combiner; @@ -7379,6 +7379,28 @@ ExecCloseRemoteStatement(const char *stmt_name, List *nodelist) pfree_pgxc_all_handles(all_handles); } +/* + * close remote statement needs to be inside a transaction so that syscache can be accessed + */ +void +ExecCloseRemoteStatement(const char *stmt_name, List *nodelist) +{ + bool need_abort = false; + + if (IsTransactionIdle()) + { + StartTransactionCommand(); + need_abort = true; + } + + ExecCloseRemoteStatementInternal(stmt_name, nodelist); + + if (need_abort) + { + AbortCurrentTransaction(); + } +} + /* * DataNodeCopyInBinaryForAll * From 0dd416baffc0fdc7f93c82c204092e326a793ce6 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Tue, 18 Jan 2022 15:20:24 +0800 Subject: [PATCH 478/578] fix gtm core when get_node_list http://tapd.woa.com/pgxz/bugtrace/bugs/view/1010092131096263845 (merge request !1106) Squash merge branch 'sigmalin_v2' into 'Tbase_v2.15.19.5' fix gtm core when get_node_list http://tapd.woa.com/pgxz/bugtrace/bugs/view/1010092131096263845 TAPD: --bug=096263845 --- src/gtm/client/gtm_client.c | 85 +++++++++++++++++++++++++++++++++++-- 1 file changed, 82 insertions(+), 3 deletions(-) diff --git a/src/gtm/client/gtm_client.c b/src/gtm/client/gtm_client.c index 8ff8b131..f9cdbd65 100644 --- a/src/gtm/client/gtm_client.c +++ b/src/gtm/client/gtm_client.c @@ -179,7 +179,13 @@ begin_replication_initial_sync(GTM_Conn *conn) goto receive_failed; if (res->gr_status == GTM_RESULT_OK) + { Assert(res->gr_type == NODE_BEGIN_REPLICATION_INIT_RESULT); + if (res->gr_type != NODE_BEGIN_REPLICATION_INIT_RESULT) + { + elog(ERROR, "res->gr_type %d not match, expected %d.", res->gr_type, NODE_BEGIN_REPLICATION_INIT_RESULT); + } + } else return 0; @@ -226,7 +232,13 @@ end_replication_initial_sync(GTM_Conn *conn) goto receive_failed; if (res->gr_status == GTM_RESULT_OK) + { Assert(res->gr_type == NODE_END_REPLICATION_INIT_RESULT); + if (res->gr_type != NODE_END_REPLICATION_INIT_RESULT) + { + elog(ERROR, "res->gr_type %d not match, expected %d.", res->gr_type, NODE_END_REPLICATION_INIT_RESULT); + } + } return 1; @@ 
-270,6 +282,15 @@ get_node_list(GTM_Conn *conn, GTM_PGXCNodeInfo *data, size_t maxlen) if ((res = GTMPQgetResult(conn)) == NULL) goto receive_failed; + if (res->gr_status == GTM_RESULT_OK) + { + Assert(res->gr_type == NODE_LIST_RESULT); + if (res->gr_type != NODE_LIST_RESULT) + { + elog(ERROR, "res->gr_type %d not match, expected %d.", res->gr_type, NODE_LIST_RESULT); + } + } + /* * Do something here. */ @@ -287,9 +308,6 @@ get_node_list(GTM_Conn *conn, GTM_PGXCNodeInfo *data, size_t maxlen) memcpy(&data[i], res->gr_resdata.grd_node_list.nodeinfo[i], sizeof(GTM_PGXCNodeInfo)); } - if (res->gr_status == GTM_RESULT_OK) - Assert(res->gr_type == NODE_LIST_RESULT); - return num_node; receive_failed: @@ -337,7 +355,13 @@ get_next_gxid(GTM_Conn *conn) next_gxid = res->gr_resdata.grd_next_gxid; if (res->gr_status == GTM_RESULT_OK) + { Assert(res->gr_type == TXN_GET_NEXT_GXID_RESULT); + if (res->gr_type != TXN_GET_NEXT_GXID_RESULT) + { + elog(ERROR, "res->gr_type %d not match, expected %d.", res->gr_type, TXN_GET_NEXT_GXID_RESULT); + } + } /* FIXME: should be a number of gxids */ return next_gxid; @@ -382,7 +406,13 @@ get_txn_gxid_list(GTM_Conn *conn, GTM_Transactions *txn) goto receive_failed; if (res->gr_status == GTM_RESULT_OK) + { Assert(res->gr_type == TXN_GXID_LIST_RESULT); + if (res->gr_type != TXN_GXID_LIST_RESULT) + { + elog(ERROR, "res->gr_type %d not match, expected %d.", res->gr_type, TXN_GXID_LIST_RESULT); + } + } txn_count = gtm_deserialize_transactions(txn, res->gr_resdata.grd_txn_gid_list.ptr, @@ -431,7 +461,13 @@ get_sequence_list(GTM_Conn *conn, GTM_SeqInfo **seq_list) goto receive_failed; if (res->gr_status == GTM_RESULT_OK) + { Assert(res->gr_type == SEQUENCE_LIST_RESULT); + if (res->gr_type != SEQUENCE_LIST_RESULT) + { + elog(ERROR, "res->gr_type %d not match, expected %d.", res->gr_type, SEQUENCE_LIST_RESULT); + } + } *seq_list = res->gr_resdata.grd_seq_list.seq; @@ -951,6 +987,10 @@ commit_transaction_internal(GTM_Conn *conn, GlobalTransactionId gxid, { Assert(res->gr_type == TXN_COMMIT_RESULT); Assert(res->gr_resdata.grd_gxid == gxid); + if (res->gr_type != TXN_COMMIT_RESULT) + { + elog(ERROR, "res->gr_type %d not match, expected %d.", res->gr_type, TXN_COMMIT_RESULT); + } if (waited_xid_count > 0) { @@ -1064,6 +1104,11 @@ commit_prepared_transaction_internal(GTM_Conn *conn, { Assert(res->gr_type == TXN_COMMIT_PREPARED_RESULT); Assert(res->gr_resdata.grd_gxid == gxid); + if (res->gr_type != TXN_COMMIT_PREPARED_RESULT) + { + elog(ERROR, "res->gr_type %d not match, expected %d.", res->gr_type, TXN_COMMIT_PREPARED_RESULT); + } + if (waited_xid_count > 0) { if (res->gr_resdata.grd_eof_txn.status == STATUS_DELAYED) @@ -1138,6 +1183,10 @@ abort_transaction_internal(GTM_Conn *conn, GlobalTransactionId gxid, bool is_bac { Assert(res->gr_type == TXN_ROLLBACK_RESULT); Assert(res->gr_resdata.grd_gxid == gxid); + if (res->gr_type != TXN_ROLLBACK_RESULT) + { + elog(ERROR, "res->gr_type %d not match, expected %d.", res->gr_type, TXN_ROLLBACK_RESULT); + } } return res->gr_status; @@ -1211,6 +1260,10 @@ start_prepared_transaction_internal(GTM_Conn *conn, GlobalTransactionId gxid, ch { Assert(res->gr_type == TXN_START_PREPARED_RESULT); Assert(res->gr_resdata.grd_gxid == gxid); + if (res->gr_type != TXN_START_PREPARED_RESULT) + { + elog(ERROR, "res->gr_type %d not match, expected %d.", res->gr_type, TXN_START_PREPARED_RESULT); + } } return res->gr_status; @@ -1311,6 +1364,10 @@ log_commit_transaction_internal(GTM_Conn *conn, { Assert(res->gr_type == TXN_LOG_TRANSACTION_RESULT); 
Assert(res->gr_resdata.grd_gxid == gxid);
+        if (res->gr_type != TXN_LOG_TRANSACTION_RESULT)
+        {
+            elog(ERROR, "res->gr_type %d not match, expected %d.", res->gr_type, TXN_LOG_TRANSACTION_RESULT);
+        }
     }
     return res->gr_status;
@@ -1398,6 +1455,10 @@ log_scan_transaction_internal(GTM_Conn *conn,
     {
         Assert(res->gr_type == TXN_LOG_SCAN_RESULT);
         Assert(res->gr_resdata.grd_gxid == gxid);
+        if (res->gr_type != TXN_LOG_SCAN_RESULT)
+        {
+            elog(ERROR, "res->gr_type %d not match, expected %d.", res->gr_type, TXN_LOG_SCAN_RESULT);
+        }
     }
     return res->gr_status;
@@ -1458,6 +1519,10 @@ prepare_transaction_internal(GTM_Conn *conn, GlobalTransactionId gxid, bool is_b
     {
         Assert(res->gr_type == TXN_PREPARE_RESULT);
         Assert(res->gr_resdata.grd_gxid == gxid);
+        if (res->gr_type != TXN_PREPARE_RESULT)
+        {
+            elog(ERROR, "res->gr_type %d not match, expected %d.", res->gr_type, TXN_PREPARE_RESULT);
+        }
     }
     return res->gr_status;
@@ -1651,7 +1716,13 @@ get_storage_file(GTM_Conn *conn, char **data)
         goto receive_failed;
     if (res->gr_status == GTM_RESULT_OK)
+    {
         Assert(res->gr_type == STORAGE_TRANSFER_RESULT);
+        if (res->gr_type != STORAGE_TRANSFER_RESULT)
+        {
+            elog(ERROR, "res->gr_type %d not match, expected %d.", res->gr_type, STORAGE_TRANSFER_RESULT);
+        }
+    }
 #ifdef __XLOG__
     *start_pos = res->grd_storage_data.start_pos;
@@ -1921,6 +1992,10 @@ get_snapshot(GTM_Conn *conn, GlobalTransactionId gxid, bool canbe_grouped)
     if (res->gr_status == GTM_RESULT_OK)
     {
         Assert(res->gr_type == res_type);
+        if (res->gr_type != res_type)
+        {
+            elog(ERROR, "res->gr_type %d not match, expected %d.", res->gr_type, res_type);
+        }
         /*
          * !!FIXME - The following assertion fails when snapshots are requested
          * in non-grouping mode. We did some investigations and it appears that
@@ -2691,6 +2766,10 @@ static int node_register_worker(GTM_Conn *conn,
     {
         Assert(res->gr_resdata.grd_node.type == type);
         Assert((strcmp(res->gr_resdata.grd_node.node_name,node_name) == 0));
+        if (res->gr_type != NODE_REGISTER_RESULT)
+        {
+            elog(ERROR, "res->gr_type %d not match, expected %d.", res->gr_type, NODE_REGISTER_RESULT);
+        }
     }
     return res->gr_status;

From c8724ed8012eb5d8f1523f37db74013caad58413 Mon Sep 17 00:00:00 2001
From: guanhuawang
Date: Tue, 18 Jan 2022 17:35:16 +0800
Subject: [PATCH 479/578] Fix stddev_samp error caused by numeric_poly_combine (merge request !1108)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Squash merge branch 'winter_tbase_v2.15.19.5_fixstddev' into 'Tbase_v2.15.19.5'

Fix stddev_samp error caused by numeric_poly_combine

The function's source and destination arguments were swapped. This is an old
PostgreSQL bug, fixed upstream in 2018; it was exposed here after applying the
patch that disables int128 support.
---
 src/backend/access/transam/twophase.c | 2 +-
 src/backend/utils/adt/numeric.c | 4 ++--
 src/bin/pg_rewind/copy_fetch.c | 10 ++++----
 src/test/regress/expected/aggregates_1.out | 28 ++++++++++++++++++++++
 src/test/regress/sql/aggregates.sql | 18 ++++++++++++++
 5 files changed, 54 insertions(+), 8 deletions(-)

diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c
index 15ddc9f3..e78f9c53 100644
--- a/src/backend/access/transam/twophase.c
+++ b/src/backend/access/transam/twophase.c
@@ -190,7 +190,7 @@ typedef struct Cache2pcInfo
 } Cache2pcInfo;
-static inline void check_entry_key(const char *tid, const char *key);
+static void check_entry_key(const char *tid, const char *key);
 bool add_2pc_info(const char *tid, const char *info);
diff --git a/src/backend/utils/adt/numeric.c b/src/backend/utils/adt/numeric.c
index d159c430..1fd3fbb6 100644
--- a/src/backend/utils/adt/numeric.c
+++ 
b/src/backend/utils/adt/numeric.c @@ -4104,8 +4104,8 @@ numeric_poly_combine(PG_FUNCTION_ARGS) state1->sumX = state2->sumX; state1->sumX2 = state2->sumX2; #else - accum_sum_copy(&state2->sumX, &state1->sumX); - accum_sum_copy(&state2->sumX2, &state1->sumX2); + accum_sum_copy(&state1->sumX, &state2->sumX); + accum_sum_copy(&state1->sumX2, &state2->sumX2); #endif MemoryContextSwitchTo(old_context); diff --git a/src/bin/pg_rewind/copy_fetch.c b/src/bin/pg_rewind/copy_fetch.c index 7696a6fa..48a206d7 100644 --- a/src/bin/pg_rewind/copy_fetch.c +++ b/src/bin/pg_rewind/copy_fetch.c @@ -156,8 +156,8 @@ recurse_dir(const char *datadir, const char *parentpath, * If 'trunc' is true, any existing file with the same name is truncated. */ static void -rewind_copy_file_range(const char *path, off_t begin, off_t end, bool trunc) -{// #lizard forgives +tbase_copy_file_range(const char *path, off_t begin, off_t end, bool trunc) +{ char buf[BLCKSZ]; char srcpath[MAXPGPATH]; int srcfd; @@ -222,7 +222,7 @@ copy_executeFileMap(filemap_t *map) break; case FILE_ACTION_COPY: - rewind_copy_file_range(entry->path, 0, entry->newsize, true); + tbase_copy_file_range(entry->path, 0, entry->newsize, true); break; case FILE_ACTION_TRUNCATE: @@ -230,7 +230,7 @@ copy_executeFileMap(filemap_t *map) break; case FILE_ACTION_COPY_TAIL: - rewind_copy_file_range(entry->path, entry->oldsize, entry->newsize, false); + tbase_copy_file_range(entry->path, entry->oldsize, entry->newsize, false); break; case FILE_ACTION_CREATE: @@ -257,7 +257,7 @@ execute_pagemap(datapagemap_t *pagemap, const char *path) while (datapagemap_next(iter, &blkno)) { offset = blkno * BLCKSZ; - rewind_copy_file_range(path, offset, offset + BLCKSZ, false); + tbase_copy_file_range(path, offset, offset + BLCKSZ, false); /* Ok, this block has now been copied from new data dir to old */ } pg_free(iter); diff --git a/src/test/regress/expected/aggregates_1.out b/src/test/regress/expected/aggregates_1.out index 9602196b..2bfcbb7f 100644 --- a/src/test/regress/expected/aggregates_1.out +++ b/src/test/regress/expected/aggregates_1.out @@ -2000,3 +2000,31 @@ select my_sum(one),my_half_sum(one) from (values(1),(2),(3),(4)) t(one); (1 row) rollback; + -- test coverage for aggregate combine/serial/deserial functions + BEGIN ISOLATION LEVEL REPEATABLE READ; + SET parallel_setup_cost = 0; + SET parallel_tuple_cost = 0; + SET min_parallel_table_scan_size = 0; + SET max_parallel_workers_per_gather = 4; + SET enable_indexonlyscan = off; + -- variance(int4) covers numeric_poly_combine + -- sum(int8) covers int8_avg_combine + EXPLAIN (COSTS OFF) + SELECT variance(unique1::int4), sum(unique1::int8) FROM tenk1; + QUERY PLAN + -------------------------------------------------------------------- + Parallel Finalize Aggregate + -> Parallel Remote Subquery Scan on all (datanode_1,datanode_2) + -> Gather + Workers Planned: 4 + -> Partial Aggregate + -> Parallel Seq Scan on tenk1 + (6 rows) + + SELECT variance(unique1::int4), sum(unique1::int8) FROM tenk1; + variance | sum + ----------------------+---------- + 8334166.666666666667 | 49995000 + (1 row) + + ROLLBACK; \ No newline at end of file diff --git a/src/test/regress/sql/aggregates.sql b/src/test/regress/sql/aggregates.sql index 11d9db70..b9e0b2b4 100644 --- a/src/test/regress/sql/aggregates.sql +++ b/src/test/regress/sql/aggregates.sql @@ -844,3 +844,21 @@ create aggregate my_half_sum(int4) select my_sum(one),my_half_sum(one) from (values(1),(2),(3),(4)) t(one); rollback; + +-- test coverage for aggregate combine/serial/deserial 
functions +BEGIN ISOLATION LEVEL REPEATABLE READ; + +SET parallel_setup_cost = 0; +SET parallel_tuple_cost = 0; +SET min_parallel_table_scan_size = 0; +SET max_parallel_workers_per_gather = 4; +SET enable_indexonlyscan = off; + +-- variance(int4) covers numeric_poly_combine +-- sum(int8) covers int8_avg_combine +EXPLAIN (COSTS OFF) + SELECT variance(unique1::int4), sum(unique1::int8) FROM tenk1; + +SELECT variance(unique1::int4), sum(unique1::int8) FROM tenk1; + +ROLLBACK; From 84f0b9cb913c4dce4d69df6de88f7649f1e166a1 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Tue, 18 Jan 2022 20:15:14 +0800 Subject: [PATCH 480/578] fix gtm core when get_node_list http://tapd.woa.com/pgxz/bugtrace/bugs/view/1010092131096263845 --- src/gtm/client/gtm_client.c | 48 ++++++++++++++++++++++++------------- 1 file changed, 32 insertions(+), 16 deletions(-) diff --git a/src/gtm/client/gtm_client.c b/src/gtm/client/gtm_client.c index f9cdbd65..27232498 100644 --- a/src/gtm/client/gtm_client.c +++ b/src/gtm/client/gtm_client.c @@ -183,7 +183,8 @@ begin_replication_initial_sync(GTM_Conn *conn) Assert(res->gr_type == NODE_BEGIN_REPLICATION_INIT_RESULT); if (res->gr_type != NODE_BEGIN_REPLICATION_INIT_RESULT) { - elog(ERROR, "res->gr_type %d not match, expected %d.", res->gr_type, NODE_BEGIN_REPLICATION_INIT_RESULT); + fprintf(stderr, "res->gr_type %d not match, expected %d.\n", res->gr_type, NODE_BEGIN_REPLICATION_INIT_RESULT); + goto receive_failed; } } else @@ -236,7 +237,8 @@ end_replication_initial_sync(GTM_Conn *conn) Assert(res->gr_type == NODE_END_REPLICATION_INIT_RESULT); if (res->gr_type != NODE_END_REPLICATION_INIT_RESULT) { - elog(ERROR, "res->gr_type %d not match, expected %d.", res->gr_type, NODE_END_REPLICATION_INIT_RESULT); + fprintf(stderr, "res->gr_type %d not match, expected %d.\n", res->gr_type, NODE_END_REPLICATION_INIT_RESULT); + goto receive_failed; } } @@ -287,7 +289,8 @@ get_node_list(GTM_Conn *conn, GTM_PGXCNodeInfo *data, size_t maxlen) Assert(res->gr_type == NODE_LIST_RESULT); if (res->gr_type != NODE_LIST_RESULT) { - elog(ERROR, "res->gr_type %d not match, expected %d.", res->gr_type, NODE_LIST_RESULT); + fprintf(stderr, "res->gr_type %d not match, expected %d.\n", res->gr_type, NODE_LIST_RESULT); + goto receive_failed; } } @@ -359,7 +362,8 @@ get_next_gxid(GTM_Conn *conn) Assert(res->gr_type == TXN_GET_NEXT_GXID_RESULT); if (res->gr_type != TXN_GET_NEXT_GXID_RESULT) { - elog(ERROR, "res->gr_type %d not match, expected %d.", res->gr_type, TXN_GET_NEXT_GXID_RESULT); + fprintf(stderr,"res->gr_type %d not match, expected %d.\n", res->gr_type, TXN_GET_NEXT_GXID_RESULT); + goto receive_failed; } } @@ -410,7 +414,8 @@ get_txn_gxid_list(GTM_Conn *conn, GTM_Transactions *txn) Assert(res->gr_type == TXN_GXID_LIST_RESULT); if (res->gr_type != TXN_GXID_LIST_RESULT) { - elog(ERROR, "res->gr_type %d not match, expected %d.", res->gr_type, TXN_GXID_LIST_RESULT); + fprintf(stderr, "res->gr_type %d not match, expected %d.\n", res->gr_type, TXN_GXID_LIST_RESULT); + goto receive_failed; } } @@ -465,7 +470,8 @@ get_sequence_list(GTM_Conn *conn, GTM_SeqInfo **seq_list) Assert(res->gr_type == SEQUENCE_LIST_RESULT); if (res->gr_type != SEQUENCE_LIST_RESULT) { - elog(ERROR, "res->gr_type %d not match, expected %d.", res->gr_type, SEQUENCE_LIST_RESULT); + fprintf(stderr, "res->gr_type %d not match, expected %d.\n", res->gr_type, SEQUENCE_LIST_RESULT); + goto receive_failed; } } @@ -989,7 +995,8 @@ commit_transaction_internal(GTM_Conn *conn, GlobalTransactionId gxid, Assert(res->gr_resdata.grd_gxid == gxid); if 
(res->gr_type != TXN_COMMIT_RESULT) { - elog(ERROR, "res->gr_type %d not match, expected %d.", res->gr_type, TXN_COMMIT_RESULT); + fprintf(stderr, "res->gr_type %d not match, expected %d.\n", res->gr_type, TXN_COMMIT_RESULT); + goto receive_failed; } if (waited_xid_count > 0) @@ -1106,7 +1113,8 @@ commit_prepared_transaction_internal(GTM_Conn *conn, Assert(res->gr_resdata.grd_gxid == gxid); if (res->gr_type != TXN_COMMIT_PREPARED_RESULT) { - elog(ERROR, "res->gr_type %d not match, expected %d.", res->gr_type, TXN_COMMIT_PREPARED_RESULT); + fprintf(stderr, "res->gr_type %d not match, expected %d.\n", res->gr_type, TXN_COMMIT_PREPARED_RESULT); + goto receive_failed; } if (waited_xid_count > 0) @@ -1185,7 +1193,8 @@ abort_transaction_internal(GTM_Conn *conn, GlobalTransactionId gxid, bool is_bac Assert(res->gr_resdata.grd_gxid == gxid); if (res->gr_type != TXN_ROLLBACK_RESULT) { - elog(ERROR, "res->gr_type %d not match, expected %d.", res->gr_type, TXN_ROLLBACK_RESULT); + fprintf(stderr, "res->gr_type %d not match, expected %d.\n", res->gr_type, TXN_ROLLBACK_RESULT); + goto receive_failed; } } @@ -1262,7 +1271,8 @@ start_prepared_transaction_internal(GTM_Conn *conn, GlobalTransactionId gxid, ch Assert(res->gr_resdata.grd_gxid == gxid); if (res->gr_type != TXN_START_PREPARED_RESULT) { - elog(ERROR, "res->gr_type %d not match, expected %d.", res->gr_type, TXN_START_PREPARED_RESULT); + fprintf(stderr, "res->gr_type %d not match, expected %d.\n", res->gr_type, TXN_START_PREPARED_RESULT); + goto receive_failed; } } @@ -1366,7 +1376,8 @@ log_commit_transaction_internal(GTM_Conn *conn, Assert(res->gr_resdata.grd_gxid == gxid); if (res->gr_type != TXN_LOG_TRANSACTION_RESULT) { - elog(ERROR, "res->gr_type %d not match, expected %d.", res->gr_type, TXN_LOG_TRANSACTION_RESULT); + fprintf(stderr, "res->gr_type %d not match, expected %d.\n", res->gr_type, TXN_LOG_TRANSACTION_RESULT); + goto receive_failed; } } @@ -1457,7 +1468,8 @@ log_scan_transaction_internal(GTM_Conn *conn, Assert(res->gr_resdata.grd_gxid == gxid); if (res->gr_type != TXN_LOG_SCAN_RESULT) { - elog(ERROR, "res->gr_type %d not match, expected %d.", res->gr_type, TXN_LOG_SCAN_RESULT); + fprintf(stderr, "res->gr_type %d not match, expected %d.\n", res->gr_type, TXN_LOG_SCAN_RESULT); + goto receive_failed; } } @@ -1521,7 +1533,8 @@ prepare_transaction_internal(GTM_Conn *conn, GlobalTransactionId gxid, bool is_b Assert(res->gr_resdata.grd_gxid == gxid); if (res->gr_type != TXN_PREPARE_RESULT) { - elog(ERROR, "res->gr_type %d not match, expected %d.", res->gr_type, TXN_PREPARE_RESULT); + fprintf(stderr, "res->gr_type %d not match, expected %d.\n", res->gr_type, TXN_PREPARE_RESULT); + goto receive_failed; } } @@ -1720,7 +1733,8 @@ get_storage_file(GTM_Conn *conn, char **data) Assert(res->gr_type == STORAGE_TRANSFER_RESULT); if (res->gr_type != STORAGE_TRANSFER_RESULT) { - elog(ERROR, "res->gr_type %d not match, expected %d.", res->gr_type, STORAGE_TRANSFER_RESULT); + fprintf(stderr, "res->gr_type %d not match, expected %d.\n", res->gr_type, STORAGE_TRANSFER_RESULT); + goto receive_failed; } } @@ -1994,7 +2008,8 @@ get_snapshot(GTM_Conn *conn, GlobalTransactionId gxid, bool canbe_grouped) Assert(res->gr_type == res_type); if (res->gr_type != res_type) { - elog(ERROR, "res->gr_type %d not match, expected %d.", res->gr_type, res_type); + fprintf(stderr, "res->gr_type %d not match, expected %d.\n", res->gr_type, res_type); + goto receive_failed; } /* * !!FIXME - The following assertion fails when snapshots are requested @@ -2768,7 +2783,8 @@ 
static int node_register_worker(GTM_Conn *conn, Assert((strcmp(res->gr_resdata.grd_node.node_name,node_name) == 0)); if (res->gr_type != NODE_REGISTER_RESULT) { - elog(ERROR, "res->gr_type %d not match, expected %d.", res->gr_type, NODE_REGISTER_RESULT); + fprintf(stderr, "res->gr_type %d not match, expected %d.\n", res->gr_type, NODE_REGISTER_RESULT); + goto receive_failed; } } From 4df0b852896cbe342e2dd510db26339a4e2e4a83 Mon Sep 17 00:00:00 2001 From: andrelin Date: Wed, 19 Jan 2022 12:06:33 +0800 Subject: [PATCH 481/578] Remote DML with dropped column tapd: http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131096344771 --- src/backend/pgxc/plan/planner.c | 21 +++++++++++++++++++-- src/backend/pgxc/pool/execRemote.c | 5 +++-- src/test/regress/expected/alter_table_3.out | 19 +++++++++++++++++++ src/test/regress/sql/alter_table.sql | 19 ++++++++++++++++++- 4 files changed, 59 insertions(+), 5 deletions(-) diff --git a/src/backend/pgxc/plan/planner.c b/src/backend/pgxc/plan/planner.c index e2d7158f..9be80f2f 100644 --- a/src/backend/pgxc/plan/planner.c +++ b/src/backend/pgxc/plan/planner.c @@ -844,7 +844,10 @@ pgxc_build_upsert_statement(PlannerInfo *root, CmdType cmdtype, /* Make sure the column has not been dropped */ if (get_rte_attribute_is_dropped(res_rel, col_att)) + { + rqplan->rq_param_types[rqplan->rq_num_params++] = InvalidOid; continue; + } type = exprType((Node *) tle->expr); @@ -948,13 +951,17 @@ pgxc_build_upsert_statement(PlannerInfo *root, CmdType cmdtype, natts = get_relnatts(res_rel->relid); - rqplan->su_param_types = (Oid *)palloc(natts * sizeof(Oid)); + /* natts + 1(xc_node_id) + 1(ctid) */ + rqplan->su_param_types = (Oid *)palloc((natts + 2) * sizeof(Oid)); for (attnum = 1; attnum <= natts; attnum++) { /* Make sure the column has not been dropped */ if (get_rte_attribute_is_dropped(res_rel, attnum)) + { + rqplan->rq_param_types[rqplan->rq_num_params++] = InvalidOid; continue; + } type = get_atttype(res_rel->relid, attnum); pgxc_add_param_as_tle(query_to_deparse, attnum, @@ -1212,7 +1219,10 @@ pgxc_build_dml_statement(PlannerInfo *root, CmdType cmdtype, { /* Make sure the column has not been dropped */ if (get_rte_attribute_is_dropped(res_rel, col_att)) + { + rqplan->rq_param_types[rqplan->rq_num_params++] = InvalidOid; continue; + } /* * Create the param to be used for VALUES caluse ($1, $2 ...) @@ -1254,14 +1264,21 @@ pgxc_build_dml_statement(PlannerInfo *root, CmdType cmdtype, Oid type; int natts = get_relnatts(res_rel->relid); int attnum; + int appendix = 0; - rqplan->rq_param_types = (Oid *)palloc(natts * sizeof(Oid)); + /* count origin attrs and ctid, nodeid */ + appendix += node_id_found ? 1 : 0; + appendix += ctid_found ? 
1 : 0; + rqplan->rq_param_types = (Oid *)palloc((natts + appendix) * sizeof(Oid)); for (attnum = 1; attnum <= natts; attnum++) { /* Make sure the column has not been dropped */ if (get_rte_attribute_is_dropped(res_rel, attnum)) + { + rqplan->rq_param_types[rqplan->rq_num_params++] = InvalidOid; continue; + } type = get_atttype(res_rel->relid, attnum); pgxc_add_param_as_tle(query_to_deparse, attnum, diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 798f91af..73cfbde5 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -12504,6 +12504,7 @@ SetDataRowParams(ModifyTableState *mtstate, RemoteQueryState *node, TupleTableSl int numatts = tdesc->natts; ResponseCombiner *combiner = (ResponseCombiner *) node; RemoteQuery *step = (RemoteQuery *) combiner->ss.ps.plan; + Oid *param_types = step->rq_param_types; Form_pg_attribute att; Oid typeOutput; bool typIsVarlena; @@ -12532,7 +12533,7 @@ SetDataRowParams(ModifyTableState *mtstate, RemoteQueryState *node, TupleTableSl uint32 n32; Assert(attindex < numparams); - if (dataSlot->tts_isnull[attindex]) + if (dataSlot->tts_isnull[attindex] || !OidIsValid(param_types[attindex])) { n32 = htonl(-1); appendBinaryStringInfo(&buf, (char *) &n32, 4); @@ -12627,7 +12628,7 @@ SetDataRowParams(ModifyTableState *mtstate, RemoteQueryState *node, TupleTableSl uint32 n32; Assert(attindex < numparams); - if (dataSlot->tts_isnull[attindex]) + if (dataSlot->tts_isnull[attindex] || !OidIsValid(param_types[attindex])) { n32 = htonl(-1); appendBinaryStringInfo(&buf, (char *) &n32, 4); diff --git a/src/test/regress/expected/alter_table_3.out b/src/test/regress/expected/alter_table_3.out index 50bc6605..d6a33ebf 100644 --- a/src/test/regress/expected/alter_table_3.out +++ b/src/test/regress/expected/alter_table_3.out @@ -3684,3 +3684,22 @@ alter table at_test_sql_partop attach partition at_test_sql_partop_1 for values drop table at_test_sql_partop; drop operator class at_test_sql_partop using btree; drop function at_test_sql_partop; +-- remote dml with dropped column +create table dropped_col_remote_dml (a int, b int, c int) distribute by shard(a); +NOTICE: Replica identity is needed for shard table, please add to this table through "alter table" command. 
+insert into dropped_col_remote_dml values(1,1,1); +create or replace function dropped_col_remote_dml_func() returns trigger as +$$ +begin + raise notice 'this is a test'; + return new; +end; +$$ + language plpgsql; +create trigger tga after update on dropped_col_remote_dml for each row +execute PROCEDURE dropped_col_remote_dml_func(); +alter table dropped_col_remote_dml drop column c; +update dropped_col_remote_dml set b = 2; +NOTICE: this is a test +drop table dropped_col_remote_dml cascade; +drop function dropped_col_remote_dml_func; diff --git a/src/test/regress/sql/alter_table.sql b/src/test/regress/sql/alter_table.sql index daa8f09d..42a9bbfe 100644 --- a/src/test/regress/sql/alter_table.sql +++ b/src/test/regress/sql/alter_table.sql @@ -2545,4 +2545,21 @@ create table at_test_sql_partop_1 (a int); alter table at_test_sql_partop attach partition at_test_sql_partop_1 for values from (0) to (10); drop table at_test_sql_partop; drop operator class at_test_sql_partop using btree; -drop function at_test_sql_partop; \ No newline at end of file +drop function at_test_sql_partop; +-- remote dml with dropped column +create table dropped_col_remote_dml (a int, b int, c int) distribute by shard(a); +insert into dropped_col_remote_dml values(1,1,1); +create or replace function dropped_col_remote_dml_func() returns trigger as +$$ +begin + raise notice 'this is a test'; + return new; +end; +$$ + language plpgsql; +create trigger tga after update on dropped_col_remote_dml for each row +execute PROCEDURE dropped_col_remote_dml_func(); +alter table dropped_col_remote_dml drop column c; +update dropped_col_remote_dml set b = 2; +drop table dropped_col_remote_dml cascade; +drop function dropped_col_remote_dml_func; From 238bdcb9cd79945b9495eeff49b3b63647ffe594 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Fri, 21 Jan 2022 19:28:21 +0800 Subject: [PATCH 482/578] fix gtm seq bug when create databse or drop databse http://tapd.woa.com/10092131/bugtrace/bugs/view?bug_id=1010092131096383437&jump_count=1 and http://tapd.woa.com/pgxz/bugtrace/bugs/view/1010092131087562597 (merge request !1132) Squash merge branch 'sigmalin_v2_tmp' into 'Tbase_v2.15.19.5' fix gtm seq bug when create databse or drop databse http://tapd.woa.com/10092131/bugtrace/bugs/view?bug_id=1010092131096383437&jump_count=1 and http://tapd.woa.com/pgxz/bugtrace/bugs/view/1010092131087562597 TAPD: --bug=096383437 TAPD: --bug=087562597 --- src/backend/access/transam/gtm.c | 18 +++++ src/backend/commands/dbcommands.c | 18 +++++ src/gtm/client/fe-protocol.c | 2 + src/gtm/client/gtm_client.c | 44 ++++++++++ src/gtm/common/gtm_utils.c | 7 ++ src/gtm/main/gtm_seq.c | 106 +++++++++++++++++++++++++ src/gtm/main/gtm_store.c | 20 ++++- src/gtm/main/main.c | 4 + src/include/access/gtm.h | 1 + src/include/gtm/gtm_c.h | 10 +++ src/include/gtm/gtm_client.h | 2 + src/include/gtm/gtm_msg.h | 2 + src/include/gtm/gtm_seq.h | 1 + src/include/gtm/gtm_store.h | 1 + src/test/regress/expected/sequence.out | 29 +++++++ src/test/regress/sql/sequence.sql | 32 ++++++++ 16 files changed, 296 insertions(+), 1 deletion(-) diff --git a/src/backend/access/transam/gtm.c b/src/backend/access/transam/gtm.c index 03d76457..ded4fdfb 100644 --- a/src/backend/access/transam/gtm.c +++ b/src/backend/access/transam/gtm.c @@ -2134,6 +2134,24 @@ RenameSequenceGTM(char *seqname, const char *newseqname) return conn ? 
rename_sequence(conn, &seqkey, &newseqkey, GetTopTransactionId()) : -1; } + +/* + * Copy the database sequences from src database + */ +int +CopyDataBaseSequenceGTM(char *src_dbname, char *dest_dbname) +{ + GTM_SequenceKeyData src_seqkey, dest_seqkey; + CheckConnection(); + src_seqkey.gsk_keylen = strlen(src_dbname) + 1; + src_seqkey.gsk_key = src_dbname; + + dest_seqkey.gsk_keylen = strlen(dest_dbname) + 1; + dest_seqkey.gsk_key = (char *) dest_dbname; + return conn ? copy_database_sequence(conn, &src_seqkey, &dest_seqkey, + GetTopTransactionId()) : -1; +} + /* * Register Given Node * Connection for registering is just used once then closed diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c index 070646fd..b25eed3c 100644 --- a/src/backend/commands/dbcommands.c +++ b/src/backend/commands/dbcommands.c @@ -717,6 +717,24 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) sizeof(fparms.dest_dboid)); #endif + if (IS_PGXC_LOCAL_COORDINATOR) + { + /* + * If we use another database as the template database, and there are + * sequences in the template database, we need to create the sequences + * from template database in gtm as well, it's safe because the source + * database can't being accessed by other now. + */ + RegisterSeqCreate(dbname, GTM_SEQ_DB_NAME); + + if (CopyDataBaseSequenceGTM((char*)dbtemplate, dbname) < 0) + { + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_FAILURE), + errmsg("GTM error, could not create sequences for database %s from %s", dbname, dbtemplate))); + } + } + return dboid; } diff --git a/src/gtm/client/fe-protocol.c b/src/gtm/client/fe-protocol.c index a575510e..903e6268 100644 --- a/src/gtm/client/fe-protocol.c +++ b/src/gtm/client/fe-protocol.c @@ -679,6 +679,7 @@ break; case SEQUENCE_RESET_RESULT: case SEQUENCE_CLOSE_RESULT: case SEQUENCE_RENAME_RESULT: + case SEQUENCE_COPY_RESULT: case SEQUENCE_ALTER_RESULT: case SEQUENCE_SET_VAL_RESULT: case MSG_DB_SEQUENCE_RENAME_RESULT: @@ -1511,6 +1512,7 @@ gtmpqFreeResultResource(GTM_Result *result) case SEQUENCE_RESET_RESULT: case SEQUENCE_CLOSE_RESULT: case SEQUENCE_RENAME_RESULT: + case SEQUENCE_COPY_RESULT: case SEQUENCE_ALTER_RESULT: case SEQUENCE_SET_VAL_RESULT: case MSG_DB_SEQUENCE_RENAME_RESULT: diff --git a/src/gtm/client/gtm_client.c b/src/gtm/client/gtm_client.c index 27232498..1677dfc4 100644 --- a/src/gtm/client/gtm_client.c +++ b/src/gtm/client/gtm_client.c @@ -2304,6 +2304,50 @@ rename_sequence_internal(GTM_Conn *conn, GTM_SequenceKey key, GTM_SequenceKey ne return -1; } +/* + * Copy the database sequences from src database + */ +int +copy_database_sequence(GTM_Conn *conn, GTM_SequenceKey src_key, GTM_SequenceKey dest_key, + GlobalTransactionId gxid) +{ + GTM_Result *res = NULL; + time_t finish_time; + + /* Start the message. */ + if (gtmpqPutMsgStart('C', true, conn) || + gtmpqPutInt(MSG_SEQUENCE_COPY, sizeof (GTM_MessageType), conn) || + gtmpqPutInt(src_key->gsk_keylen, 4, conn) || + gtmpqPutnchar(src_key->gsk_key, src_key->gsk_keylen, conn)|| + gtmpqPutInt(dest_key->gsk_keylen, 4, conn) || + gtmpqPutnchar(dest_key->gsk_key, dest_key->gsk_keylen, conn) || + gtmpqPutnchar((char *)&gxid, sizeof (GlobalTransactionId), conn)) + goto send_failed; + + /* Finish the message. */ + if (gtmpqPutMsgEnd(conn)) + goto send_failed; + + /* Flush to ensure backend gets it. 
*/ + if (gtmpqFlush(conn)) + goto send_failed; + + finish_time = time(NULL) + CLIENT_GTM_TIMEOUT; + if (gtmpqWaitTimed(true, false, conn, finish_time) || + gtmpqReadData(conn) < 0) + goto receive_failed; + + if ((res = GTMPQgetResult(conn)) == NULL) + goto receive_failed; + + return res->gr_status; + +receive_failed: +send_failed: + conn->result = makeEmptyResultIfIsNull(conn->result); + conn->result->gr_status = GTM_RESULT_COMM_ERROR; + return -1; +} /* * Request from GTM current value of the specified sequence in the specified diff --git a/src/gtm/common/gtm_utils.c b/src/gtm/common/gtm_utils.c index 79eb782b..0aa1aabd 100644 --- a/src/gtm/common/gtm_utils.c +++ b/src/gtm/common/gtm_utils.c @@ -121,6 +121,10 @@ static struct enum_name message_name_tab[] = {MSG_GET_REPLICATION_STATUS,"MSG_GET_REPLICATION_STATUS"}, {MSG_GET_REPLICATION_TRANSFER,"MSG_GET_REPLICATION_TRANSFER"}, #endif + {MSG_GET_STATISTICS, "MSG_GET_STATISTICS"}, + {MSG_GET_ERRORLOG, "MSG_GET_ERRORLOG"}, + {MSG_SEQUENCE_COPY, "MSG_SEQUENCE_COPY"}, + {-1, NULL} }; @@ -174,6 +178,9 @@ static struct enum_name result_name_tab[] = {TXN_FINISH_GID_RESULT, "TXN_FINISH_GID_RESULT"}, {MSG_DB_SEQUENCE_RENAME_RESULT, "DB_SEQUENCE_RENAME_RESULT"}, #endif + {MSG_GET_GTM_STATISTICS_RESULT, "MSG_GET_GTM_STATISTICS_RESULT"}, + {MSG_GET_GTM_ERRORLOG_RESULT, "MSG_GET_GTM_ERRORLOG_RESULT"}, + {SEQUENCE_COPY_RESULT, "SEQUENCE_COPY_RESULT"}, {-1, NULL} }; diff --git a/src/gtm/main/gtm_seq.c b/src/gtm/main/gtm_seq.c index 9abe9580..640f5bcd 100644 --- a/src/gtm/main/gtm_seq.c +++ b/src/gtm/main/gtm_seq.c @@ -3619,4 +3619,110 @@ ProcessDBSequenceRenameCommand(Port *myport, StringInfo message, bool is_backup) /* FIXME: need to check errors */ } +/* + * Process MSG_SEQUENCE_COPY message. + */ +void +ProcessCopyDataBaseSequenceCommand(Port *myport, StringInfo message) +{ + GTM_SequenceKeyData src_seqkey, dest_seqkey; + StringInfoData buf; + int errcode; + MemoryContext oldContext; + const char *data; + GlobalTransactionId gxid; + GTMStorageHandle *handles = NULL; + int32 i = 0; + int32 count = 0; + + if (Recovery_IsStandby()) + { + if (myport->remote_type != GTM_NODE_GTM) + { + elog(ERROR, "gtm standby can't provide sequence to datanodes or coordinators."); + } + } + + /* get src database name */ + src_seqkey.gsk_keylen = pq_getmsgint(message, sizeof (src_seqkey.gsk_keylen)); + src_seqkey.gsk_key = (char *)pq_getmsgbytes(message, src_seqkey.gsk_keylen); + + /* get dest database name */ + dest_seqkey.gsk_keylen = pq_getmsgint(message, sizeof (dest_seqkey.gsk_keylen)); + dest_seqkey.gsk_key = (char *)pq_getmsgbytes(message, dest_seqkey.gsk_keylen); + + data = pq_getmsgbytes(message, sizeof (gxid)); + if (data == NULL) + ereport(ERROR, + (EPROTO, + errmsg("Message does not contain valid GXID"))); + memcpy(&gxid, data, sizeof (gxid)); + + + /* + * As when creating a sequence, we must use the TopMostMemoryContext + * because the sequence information is not bound to a thread and + * can outlive any of the thread specific contextes. 
+ */ + oldContext = MemoryContextSwitchTo(TopMostMemoryContext); + + handles = GTM_StoreGetAllSeqInDatabase(&src_seqkey, &count); + if (handles) + { + for (i = 0; i < count; i++) + { + GTM_SeqCreateInfo create_info; + GTM_SequenceKeyData newseqkey; + char new_key[SEQ_KEY_MAX_LENGTH]; + + GTM_StoreGetSeqCreateInfo(handles[i], &create_info); + /* generate new sequence key name in dest database */ + newseqkey.gsk_keylen = strlen(create_info.seqkey) - strlen(src_seqkey.gsk_key) + strlen(dest_seqkey.gsk_key) + 1; + if (newseqkey.gsk_keylen > SEQ_KEY_MAX_LENGTH) + { + ereport(ERROR, + (errcode, + errmsg("sequence:%s is too long to copy to database %s", create_info.seqkey, dest_seqkey.gsk_key))); + } + snprintf(new_key, SEQ_KEY_MAX_LENGTH, "%s%s", dest_seqkey.gsk_key, create_info.seqkey + strlen(src_seqkey.gsk_key)); + newseqkey.gsk_key = new_key; + + errcode = GTM_SeqOpen(&newseqkey, create_info.increment_by, create_info.minval, create_info.maxval, create_info.startval, + create_info.cycle, gxid); + if (errcode) + { + ereport(ERROR, + (errcode, + errmsg("Failed to create new sequence:%s for:%s", newseqkey.gsk_key,strerror(errcode)))); + } + } + pfree(handles); + } + + MemoryContextSwitchTo(oldContext); + + pq_getmsgend(message); + + BeforeReplyToClientXLogTrigger(); + + /* Send a SUCCESS message back to the client */ + pq_beginmessage(&buf, 'S'); + pq_sendint(&buf, SEQUENCE_COPY_RESULT, 4); + if (myport->remote_type == GTM_NODE_GTM_PROXY) + { + GTM_ProxyMsgHeader proxyhdr; + proxyhdr.ph_conid = myport->conn_id; + pq_sendbytes(&buf, (char *)&proxyhdr, sizeof (GTM_ProxyMsgHeader)); + } + pq_sendint(&buf, dest_seqkey.gsk_keylen, 4); + pq_sendbytes(&buf, dest_seqkey.gsk_key, dest_seqkey.gsk_keylen); + pq_endmessage(myport, &buf); + + if (myport->remote_type != GTM_NODE_GTM_PROXY) + { + pq_flush(myport); + } + +} + #endif diff --git a/src/gtm/main/gtm_store.c b/src/gtm/main/gtm_store.c index 858c636f..5e0e5ea9 100644 --- a/src/gtm/main/gtm_store.c +++ b/src/gtm/main/gtm_store.c @@ -3401,7 +3401,8 @@ int32 GTM_StoreDropAllSeqInDatabase(GTM_SequenceKey seq_database_key) { seq_info = GetSeqStore(bucket_handle); - if(strncmp(seq_database_key->gsk_key,seq_info->gs_key.gsk_key,seq_database_key->gsk_keylen - 1) != 0) + if(!(strncmp(seq_database_key->gsk_key,seq_info->gs_key.gsk_key,seq_database_key->gsk_keylen - 1) == 0 && + seq_info->gs_key.gsk_key[seq_database_key->gsk_keylen - 1] == '.')) { bucket_handle = seq_info->gs_next; continue; @@ -4387,3 +4388,20 @@ void GTM_StoreGetSeqKey(GTMStorageHandle handle, char *key) seq_info = GetSeqStore(handle); snprintf(key, SEQ_KEY_MAX_LENGTH, "%s", seq_info->gs_key.gsk_key); } + +/* + * get seq create info + */ +void GTM_StoreGetSeqCreateInfo(GTMStorageHandle handle, GTM_SeqCreateInfo *create_info) +{ + GTM_StoredSeqInfo *seq_info = NULL; + + seq_info = GetSeqStore(handle); + snprintf(create_info->seqkey, SEQ_KEY_MAX_LENGTH, "%s", seq_info->gs_key.gsk_key); + create_info->increment_by = seq_info->gs_increment_by; + create_info->minval = seq_info->gs_min_value; + create_info->maxval = seq_info->gs_max_value; + /* get gs_value as new sequence's startval */ + create_info->startval = seq_info->gs_value; + create_info->cycle = seq_info->gs_cycle; +} diff --git a/src/gtm/main/main.c b/src/gtm/main/main.c index 43a9424a..9b6deb6a 100644 --- a/src/gtm/main/main.c +++ b/src/gtm/main/main.c @@ -4035,6 +4035,7 @@ ProcessCommand(Port *myport, StringInfo input_message) #ifdef __TBASE__ case MSG_DB_SEQUENCE_RENAME: case MSG_BKUP_DB_SEQUENCE_RENAME: + case MSG_SEQUENCE_COPY: 
#endif ProcessSequenceCommand(myport, mtype, input_message); break; @@ -4797,6 +4798,9 @@ ProcessSequenceCommand(Port *myport, GTM_MessageType mtype, StringInfo message) ProcessSequenceCleanCommand(myport, message, false); break; + case MSG_SEQUENCE_COPY: + ProcessCopyDataBaseSequenceCommand(myport, message); + break; default: Assert(0); /* Shouldn't come here.. keep compiler quite */ } diff --git a/src/include/access/gtm.h b/src/include/access/gtm.h index 5da0eb6a..4df9fc1e 100644 --- a/src/include/access/gtm.h +++ b/src/include/access/gtm.h @@ -170,6 +170,7 @@ extern int AlterSequenceGTM(char *seqname, GTM_Sequence increment, GTM_Sequence lastval, bool cycle, bool is_restart); extern int DropSequenceGTM(char *name, GTM_SequenceKeyType type); extern int RenameSequenceGTM(char *seqname, const char *newseqname); +extern int CopyDataBaseSequenceGTM(char *src_dbname, char *dest_dbname); extern void CleanGTMSeq(void); /* Barrier */ extern int ReportBarrierGTM(const char *barrier_id); diff --git a/src/include/gtm/gtm_c.h b/src/include/gtm/gtm_c.h index ad0be27e..5abc747b 100644 --- a/src/include/gtm/gtm_c.h +++ b/src/include/gtm/gtm_c.h @@ -269,6 +269,16 @@ typedef struct GTM_StoredSeqInfo pg_crc32c gs_crc; /* crc check value */ }GTM_StoredSeqInfo; +typedef struct GTM_SeqCreateInfo +{ + char seqkey[SEQ_KEY_MAX_LENGTH]; + GTM_Sequence increment_by; + GTM_Sequence minval; + GTM_Sequence maxval; + GTM_Sequence startval; + bool cycle; +} GTM_SeqCreateInfo; + typedef struct GTM_StoredTransactionInfo { char gti_gid[GTM_MAX_SESSION_ID_LEN]; diff --git a/src/include/gtm/gtm_client.h b/src/include/gtm/gtm_client.h index f85e7d13..82effb81 100644 --- a/src/include/gtm/gtm_client.h +++ b/src/include/gtm/gtm_client.h @@ -443,6 +443,8 @@ int close_sequence(GTM_Conn *conn, GTM_SequenceKey key, GlobalTransactionId gxid int bkup_close_sequence(GTM_Conn *conn, GTM_SequenceKey key, GlobalTransactionId gxid); int rename_sequence(GTM_Conn *conn, GTM_SequenceKey key, GTM_SequenceKey newkey, GlobalTransactionId gxid); +int copy_database_sequence(GTM_Conn *conn, GTM_SequenceKey key, GTM_SequenceKey newkey, + GlobalTransactionId gxid); int bkup_rename_sequence(GTM_Conn *conn, GTM_SequenceKey key, GTM_SequenceKey newkey, GlobalTransactionId gxid); int get_current(GTM_Conn *conn, GTM_SequenceKey key, diff --git a/src/include/gtm/gtm_msg.h b/src/include/gtm/gtm_msg.h index acedc926..7ed4a0ac 100644 --- a/src/include/gtm/gtm_msg.h +++ b/src/include/gtm/gtm_msg.h @@ -125,6 +125,7 @@ typedef enum GTM_MessageType MSG_GET_STATISTICS, MSG_GET_ERRORLOG, #endif + MSG_SEQUENCE_COPY, /* * Must be at the end @@ -212,6 +213,7 @@ typedef enum GTM_ResultType MSG_GET_GTM_STATISTICS_RESULT, MSG_GET_GTM_ERRORLOG_RESULT, #endif + SEQUENCE_COPY_RESULT, RESULT_TYPE_COUNT } GTM_ResultType; diff --git a/src/include/gtm/gtm_seq.h b/src/include/gtm/gtm_seq.h index d9dd072d..a54e33b4 100644 --- a/src/include/gtm/gtm_seq.h +++ b/src/include/gtm/gtm_seq.h @@ -119,6 +119,7 @@ void ProcessSequenceAlterCommand(Port *myport, StringInfo message, bool is_backu void ProcessSequenceListCommand(Port *myport, StringInfo message); void ProcessSequenceCleanCommand(Port *myport, StringInfo message, bool is_backup); void ProcessDBSequenceRenameCommand(Port *myport, StringInfo message, bool is_backup); +void ProcessCopyDataBaseSequenceCommand(Port *myport, StringInfo message); void decode_seq_key(char* value, GTM_SequenceKey seqkey); void GTM_SaveSeqInfo(FILE *ctlf); diff --git a/src/include/gtm/gtm_store.h b/src/include/gtm/gtm_store.h index 5dfe1cac..81205f8b 
100644 --- a/src/include/gtm/gtm_store.h +++ b/src/include/gtm/gtm_store.h @@ -175,4 +175,5 @@ extern bool GTM_StoreGetSysInfo(int64 *identifier, int64 *lsn, GlobalTimestamp * extern void GTM_PrintControlHeader(void); extern GTMStorageHandle *GTM_StoreGetAllSeqInDatabase(GTM_SequenceKey seq_database_key, int32 *number); extern void GTM_StoreGetSeqKey(GTMStorageHandle handle, char *key); +extern void GTM_StoreGetSeqCreateInfo(GTMStorageHandle handle, GTM_SeqCreateInfo *seq_info); #endif diff --git a/src/test/regress/expected/sequence.out b/src/test/regress/expected/sequence.out index 2eae7bde..d510c4f0 100644 --- a/src/test/regress/expected/sequence.out +++ b/src/test/regress/expected/sequence.out @@ -851,4 +851,33 @@ DROP SEQUENCE my_seq; DROP SEQUENCE my_seq; CREATE SEQUENCE my_seq; DROP SEQUENCE my_seq; +-- Test sequece when drop database +\c db_seq1 +create table t1(f1 serial,f2 int); +create table t2(f1 serial,f2 int); +create table t3(f1 serial,f2 int); +insert into t1(f2) values(1); +insert into t2(f2) values(2); +insert into t3(f2) values(3); +create database db_seq1_bak; +\c db_seq1_bak +create table t1(f1 serial,f2 int); +create table t2(f1 serial,f2 int); +create table t3(f1 serial,f2 int); +insert into t1(f2) values(1); +insert into t2(f2) values(2); +insert into t3(f2) values(3); +drop database db_seq1; +\c db_seq1_bak +insert into t1(f2) values(4); +insert into t2(f2) values(5); +insert into t3(f2) values(6); +select gsk_key from pg_list_storage_sequence() where gsk_key like '%db_seq1_bak.%'; + gsk_key +------------------------------ + db_seq1_bak.public.t1_f1_seq + db_seq1_bak.public.t2_f1_seq + db_seq1_bak.public.t3_f1_seq +(3 rows) + \q \ No newline at end of file diff --git a/src/test/regress/sql/sequence.sql b/src/test/regress/sql/sequence.sql index fda62262..a0f8180d 100644 --- a/src/test/regress/sql/sequence.sql +++ b/src/test/regress/sql/sequence.sql @@ -433,4 +433,36 @@ DROP SEQUENCE my_seq; DROP SEQUENCE my_seq; CREATE SEQUENCE my_seq; DROP SEQUENCE my_seq; + +-- Test sequece when drop database +\c db_seq1 +create table t1(f1 serial,f2 int); +create table t2(f1 serial,f2 int); +create table t3(f1 serial,f2 int); +insert into t1(f2) values(1); +insert into t2(f2) values(2); +insert into t3(f2) values(3); +create database db_seq1_bak; + +\c db_seq1_bak +create table t1(f1 serial,f2 int); +create table t2(f1 serial,f2 int); +create table t3(f1 serial,f2 int); +insert into t1(f2) values(1); +insert into t2(f2) values(2); +insert into t3(f2) values(3); +drop database db_seq1; + +\c db_seq1_bak +insert into t1(f2) values(4); +insert into t2(f2) values(5); +insert into t3(f2) values(6); +select gsk_key from pg_list_storage_sequence() where gsk_key like '%db_seq1_bak.%'; \q +<<<<<<< HEAD +======= + + + + +>>>>>>> 85b5350be... fix gtm seq bug when create databse or drop databse http://tapd.woa.com/10092131/bugtrace/bugs/view?bug_id=1010092131096383437&jump_count=1 and http://tapd.woa.com/pgxz/bugtrace/bugs/view/1010092131087562597 (merge request !1132) From 74d47f2d56af2fa7d23901dc3eb8e3155cbe7f77 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cningxpeng=E2=80=9D?= <“ningxpeng@tencent.com”> Date: Tue, 25 Jan 2022 16:00:13 +0800 Subject: [PATCH 483/578] [BUGFIX] Kill node, fix abort hang problem.Check the return value is ok. 
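The core of this fix is that pgxc_node_flush_read() now reports whether the
socket drained cleanly (returns 0) or the peer connection is broken (returns
EOF), and abort processing checks that result instead of looping forever on a
dead datanode. A minimal sketch of the calling pattern introduced below
(abort_drain_handle() is a hypothetical wrapper added only for illustration;
the handle fields and helper names are the ones used in the diff):

    /* Sketch: drain one handle during abort, giving up on a broken link. */
    static void
    abort_drain_handle(PGXCNodeHandle *handle)
    {
        int read_status = pgxc_node_flush_read(handle);

        if (read_status == EOF || read_status < 0)
        {
            /* Link is broken: mark the connection fatal and stop using it. */
            handle->state = DN_CONNECTION_STATE_ERROR_FATAL;
            add_error_message(handle, "unexpected EOF on datanode connection.");
            return;
        }

        /* Buffers drained; the handle can go back to the idle state. */
        handle->state = DN_CONNECTION_STATE_IDLE;
    }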
--- src/backend/pgxc/pool/execRemote.c | 25 +++++++++++++++++++++++-- src/backend/pgxc/pool/pgxcnode.c | 18 +++++++++++++++--- src/include/pgxc/pgxcnode.h | 2 +- 3 files changed, 39 insertions(+), 6 deletions(-) diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 73cfbde5..1340649b 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -11625,6 +11625,7 @@ void pgxc_abort_connections(PGXCNodeAllHandles *all_handles) int i = 0; bool need_loop_check = false; bool need_sync = true; + int read_status; if (all_handles) { @@ -11642,7 +11643,17 @@ void pgxc_abort_connections(PGXCNodeAllHandles *all_handles) elog(DEBUG1, "pgxc_abort_connections node:%s not ready for query, status:%d", handle->nodename, handle->state); if (handle->sock != NO_SOCKET) { - pgxc_node_flush_read(handle); + read_status = pgxc_node_flush_read(handle); + if (read_status == EOF || read_status < 0) + { + /* Can not read - no more actions, just discard connection */ + handle->state = DN_CONNECTION_STATE_ERROR_FATAL; + add_error_message(handle, "unexpected EOF on datanode connection."); + elog(LOG, "unexpected EOF on node:%s pid:%d, read_status:%d, EOF:%d", + handle->nodename, handle->backend_pid, read_status, EOF); + return; + } + handle->state = DN_CONNECTION_STATE_IDLE; } /* Clear any previous error messages */ @@ -11679,7 +11690,17 @@ void pgxc_abort_connections(PGXCNodeAllHandles *all_handles) if (handle->state != DN_CONNECTION_STATE_IDLE || !node_ready_for_query(handle) || pgxc_node_is_data_enqueued(handle)) { elog(DEBUG1, "pgxc_abort_connections node:%s not ready for query, status:%d", handle->nodename, handle->state); - pgxc_node_flush_read(handle); + read_status = pgxc_node_flush_read(handle); + if (read_status == EOF || read_status < 0) + { + /* Can not read - no more actions, just discard connection */ + handle->state = DN_CONNECTION_STATE_ERROR_FATAL; + add_error_message(handle, "unexpected EOF on datanode connection."); + elog(LOG, "unexpected EOF on node:%s pid:%d, read_status:%d, EOF:%d", + handle->nodename, handle->backend_pid, read_status, EOF); + return; + } + handle->state = DN_CONNECTION_STATE_IDLE; /* Clear any previous error messages */ diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c index f006dc41..173e06eb 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -2690,8 +2690,10 @@ pgxc_node_flush(PGXCNodeHandle *handle) /* * This method won't return until network buffer is empty or error occurs * To ensure all data in network buffers is read and wasted + * + * There are only two possible returns. Return 0 is ok, return is an EOF error when the link is broken. */ -void +int pgxc_node_flush_read(PGXCNodeHandle *handle) {// #lizard forgives bool is_ready= false; @@ -2700,7 +2702,7 @@ pgxc_node_flush_read(PGXCNodeHandle *handle) if (handle == NULL) { - return; + return 0; } while(true) @@ -2726,11 +2728,19 @@ pgxc_node_flush_read(PGXCNodeHandle *handle) /* break, only if the connection is broken. 
*/ read_result = pgxc_node_read_data(handle, true); - if (read_result <= 0) + + /* If no data can be received, the normal break returns success */ + if (read_result == 0) { elog(DEBUG1, "pgxc_node_flush_read node:%s read failure.", handle->nodename); break; } + /* If the link breaks, an EOF error is returned */ + else if (read_result == EOF || read_result < 0) + { + elog(LOG, "pgxc_node_flush_read unexpected EOF on node:%s", handle->nodename); + return EOF; + } if (PGXC_CANCEL_DELAY > 0) { @@ -2747,6 +2757,8 @@ pgxc_node_flush_read(PGXCNodeHandle *handle) } } } + + return 0; } /* diff --git a/src/include/pgxc/pgxcnode.h b/src/include/pgxc/pgxcnode.h index 91db953a..71f8fa40 100644 --- a/src/include/pgxc/pgxcnode.h +++ b/src/include/pgxc/pgxcnode.h @@ -268,7 +268,7 @@ extern int pgxc_node_is_data_enqueued(PGXCNodeHandle *conn); extern int send_some(PGXCNodeHandle * handle, int len); extern int pgxc_node_flush(PGXCNodeHandle *handle); -extern void pgxc_node_flush_read(PGXCNodeHandle *handle); +extern int pgxc_node_flush_read(PGXCNodeHandle *handle); extern char get_message(PGXCNodeHandle *conn, int *len, char **msg); From dbec10999485ea5972ba644e04e836e06d1dfbbd Mon Sep 17 00:00:00 2001 From: bethding Date: Wed, 9 Feb 2022 16:10:21 +0800 Subject: [PATCH 484/578] fix parallel gather core http://tapd.woa.com/10092131/bugtrace/bugs/view?bug_id=1010092131096652841&url_cache_key=from_url_iteration_list_74a055bf3a26e3712c2da14069948f4c&action_entry_type=bugs --- src/backend/executor/nodeGather.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/backend/executor/nodeGather.c b/src/backend/executor/nodeGather.c index 55686429..52991767 100644 --- a/src/backend/executor/nodeGather.c +++ b/src/backend/executor/nodeGather.c @@ -592,6 +592,10 @@ ExecFinishGather(PlanState *pstate) TupleTableSlot *slot = NULL; GatherState *node = castNode(GatherState, pstate); + /* If there if no pei, no need to set status, no need to read data from workers. 
*/ + if (!node->pei) + return; + (*node->pei->executor_done) = true; if (g_DataPumpDebug) From ea9232db8dc670fcc82e2a33ba869b9815e5ae33 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cningxpeng=E2=80=9D?= <“ningxpeng@tencent.com”> Date: Tue, 31 Aug 2021 14:29:46 +0800 Subject: [PATCH 485/578] [BUGFIX] trigger support subtransaction --- src/backend/pgxc/pool/execRemote.c | 2 +- src/pl/plpgsql/src/pl_exec.c | 4 ++ src/test/regress/expected/triggers_1.out | 49 ++++++++++++++++++++++++ src/test/regress/sql/triggers.sql | 40 +++++++++++++++++++ 4 files changed, 94 insertions(+), 1 deletion(-) diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 1340649b..9a0912e8 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -3644,7 +3644,7 @@ pgxc_node_begin(int conn_count, PGXCNodeHandle **connections, if ('T' != connections[i]->transaction_status) { elog(PANIC, "[PLPGSQL] pgxc_node_begin need_begin_sub_txn wrong" - "transaction_status"); + "transaction_status[%c]", connections[i]->transaction_status); } } diff --git a/src/pl/plpgsql/src/pl_exec.c b/src/pl/plpgsql/src/pl_exec.c index dd6cffb0..3dfc4ffb 100644 --- a/src/pl/plpgsql/src/pl_exec.c +++ b/src/pl/plpgsql/src/pl_exec.c @@ -1313,6 +1313,7 @@ exec_stmt_block(PLpgSQL_execstate *estate, PLpgSQL_stmt_block *block) */ stmt_mcontext = get_stmt_mcontext(estate); + SetEnterPlpgsqlFunc(); BeginInternalSubTransaction(NULL); /* Want to run statements inside function's memory context */ MemoryContextSwitchTo(oldcontext); @@ -1468,6 +1469,8 @@ exec_stmt_block(PLpgSQL_execstate *estate, PLpgSQL_stmt_block *block) /* If no match found, re-throw the error */ if (e == NULL) ReThrowError(edata); + else + FreeErrorData(edata); /* Restore stmt_mcontext stack and release the error data */ pop_stmt_mcontext(estate); @@ -1477,6 +1480,7 @@ exec_stmt_block(PLpgSQL_execstate *estate, PLpgSQL_stmt_block *block) } PG_END_TRY(); + SetExitPlpgsqlFunc(); Assert(save_cur_error == estate->cur_error); } else diff --git a/src/test/regress/expected/triggers_1.out b/src/test/regress/expected/triggers_1.out index ab6838dc..2eae1b48 100644 --- a/src/test/regress/expected/triggers_1.out +++ b/src/test/regress/expected/triggers_1.out @@ -2120,3 +2120,52 @@ drop table my_table; drop function dump_insert(); drop function dump_update(); drop function dump_delete(); +-- trigger support subtransaction +drop table if exists tb1 cascade; +NOTICE: table "tb1" does not exist, skipping +drop table if exists tb3 cascade; +NOTICE: table "tb3" does not exist, skipping +drop function if exists fun_fbjfyj(); +NOTICE: function fun_fbjfyj() does not exist, skipping +create table tb1(a int, b int, c1 varchar(50), c2 varchar(50) COLLATE "pg_catalog"."default", primary key(c1)); +create table tb3( + a int, + d1 varchar(18) COLLATE "pg_catalog"."default", + d2 varchar(600) COLLATE "pg_catalog"."default" +); +CREATE OR REPLACE FUNCTION fun_fbjfyj() + RETURNS trigger AS $BODY$ + DECLARE + TF integer :=0; + BEGIN + begin + select NVL2(MAX(a), '1', '0') INTO TF from tb1 where a = 7; + IF TF = '1' THEN RETURN new; END IF; + new.d1 := '11'; + new.d2 := '111'; + insert into tb1 values(12, 12, new.d1, new.d2); + end; + RETURN new; +exception + when others then + return new; + END +$BODY$ +LANGUAGE plpgsql VOLATILE +COST 100; +create trigger tb3_insert after insert on tb3 +FOR EACH ROW +EXECUTE PROCEDURE fun_fbjfyj(); +ERROR: Postgres-XL does not support TRIGGER yet +DETAIL: The feature is not currently supported +insert into tb3 
values(1, '11', '111'), (2, '22', '222'), (3,'33','333'); +insert into tb3 values(1, '11', '111'), (2, '22', '222'), (3,'33','333'); +select count(*) from tb3; + count +------- + 6 +(1 row) + +drop table tb1 cascade; +drop table tb3 cascade; +drop function fun_fbjfyj(); diff --git a/src/test/regress/sql/triggers.sql b/src/test/regress/sql/triggers.sql index 95b11791..89b019f7 100644 --- a/src/test/regress/sql/triggers.sql +++ b/src/test/regress/sql/triggers.sql @@ -1781,3 +1781,43 @@ drop table my_table; drop function dump_insert(); drop function dump_update(); drop function dump_delete(); + +-- trigger support subtransaction +drop table if exists tb1 cascade; +drop table if exists tb3 cascade; +drop function if exists fun_fbjfyj(); +create table tb1(a int, b int, c1 varchar(50), c2 varchar(50) COLLATE "pg_catalog"."default", primary key(c1)); +create table tb3( + a int, + d1 varchar(18) COLLATE "pg_catalog"."default", + d2 varchar(600) COLLATE "pg_catalog"."default" +); +CREATE OR REPLACE FUNCTION fun_fbjfyj() + RETURNS trigger AS $BODY$ + DECLARE + TF integer :=0; + BEGIN + begin + select NVL2(MAX(a), '1', '0') INTO TF from tb1 where a = 7; + IF TF = '1' THEN RETURN new; END IF; + new.d1 := '11'; + new.d2 := '111'; + insert into tb1 values(12, 12, new.d1, new.d2); + end; + RETURN new; +exception + when others then + return new; + END +$BODY$ +LANGUAGE plpgsql VOLATILE +COST 100; +create trigger tb3_insert after insert on tb3 +FOR EACH ROW +EXECUTE PROCEDURE fun_fbjfyj(); +insert into tb3 values(1, '11', '111'), (2, '22', '222'), (3,'33','333'); +insert into tb3 values(1, '11', '111'), (2, '22', '222'), (3,'33','333'); +select count(*) from tb3; +drop table tb1 cascade; +drop table tb3 cascade; +drop function fun_fbjfyj(); From 4f92c74a90e0506a751446f7d721c200c00fc57d Mon Sep 17 00:00:00 2001 From: ericxwu Date: Mon, 25 Oct 2021 11:52:37 +0800 Subject: [PATCH 486/578] Two fixes of trigger with cursor and exception block 1. Fix the remote sub txn begin logic when remote conn does not begen main txn yet. 2. Portal of cursor have been dropped when sub-txn(begined for execption case) release/rollback, so we just skip the close statment in expetion block. 
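The second fix reduces to tolerating a missing portal while an exception
handler is running: RollbackAndReleaseCurrentSubTransaction() has already
dropped any portal that was opened inside the aborted block, so CLOSE should
become a no-op there instead of raising "cursor does not exist". A rough
sketch of the guard added to exec_stmt_close() (handle_exceptions is the
execution-state flag introduced by this patch; the SPI calls are the existing
ones in pl_exec.c):

    portal = SPI_cursor_find(curname);
    if (portal == NULL)
    {
        /*
         * During exception cleanup the sub-transaction rollback may already
         * have destroyed the portal; treat CLOSE as a no-op in that case.
         */
        if (estate->handle_exceptions)
            return PLPGSQL_RC_OK;

        ereport(ERROR,
                (errcode(ERRCODE_UNDEFINED_CURSOR),
                 errmsg("cursor \"%s\" does not exist", curname)));
    }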
http://tapd.oa.com/20421696/bugtrace/bugs/view?bug_id=1020421696092502857 --- src/backend/pgxc/pool/execRemote.c | 3 +- src/pl/plpgsql/src/pl_exec.c | 20 +- src/pl/plpgsql/src/plpgsql.h | 1099 ++++++++++++++-------------- 3 files changed, 568 insertions(+), 554 deletions(-) diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 9a0912e8..dcb98c13 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -3603,8 +3603,7 @@ pgxc_node_begin(int conn_count, PGXCNodeHandle **connections, need_send_begin = true; } - if (connections[i]->plpgsql_need_begin_txn && - connections[i]->plpgsql_need_begin_sub_txn && + if (connections[i]->plpgsql_need_begin_sub_txn && 'I' == connections[i]->transaction_status) { need_send_begin = true; diff --git a/src/pl/plpgsql/src/pl_exec.c b/src/pl/plpgsql/src/pl_exec.c index 3dfc4ffb..5c232317 100644 --- a/src/pl/plpgsql/src/pl_exec.c +++ b/src/pl/plpgsql/src/pl_exec.c @@ -1318,6 +1318,8 @@ exec_stmt_block(PLpgSQL_execstate *estate, PLpgSQL_stmt_block *block) /* Want to run statements inside function's memory context */ MemoryContextSwitchTo(oldcontext); + estate->handle_exceptions = false; + PG_TRY(); { /* @@ -1374,7 +1376,6 @@ exec_stmt_block(PLpgSQL_execstate *estate, PLpgSQL_stmt_block *block) { ErrorData *edata; ListCell *e; - SetEnterPlpgsqlFunc(); estate->err_text = gettext_noop("during exception cleanup"); @@ -1383,6 +1384,9 @@ exec_stmt_block(PLpgSQL_execstate *estate, PLpgSQL_stmt_block *block) edata = CopyErrorData(); FlushErrorState(); + /* Mark handling exceptions */ + estate->handle_exceptions = true; + /* Abort the inner transaction */ RollbackAndReleaseCurrentSubTransaction(); MemoryContextSwitchTo(oldcontext); @@ -1455,6 +1459,8 @@ exec_stmt_block(PLpgSQL_execstate *estate, PLpgSQL_stmt_block *block) rc = exec_stmts(estate, exception->action); + estate->handle_exceptions = false; + break; } } @@ -1468,15 +1474,16 @@ exec_stmt_block(PLpgSQL_execstate *estate, PLpgSQL_stmt_block *block) /* If no match found, re-throw the error */ if (e == NULL) + { + SetExitPlpgsqlFunc(); ReThrowError(edata); + } else FreeErrorData(edata); /* Restore stmt_mcontext stack and release the error data */ pop_stmt_mcontext(estate); MemoryContextReset(stmt_mcontext); - - SetExitPlpgsqlFunc(); } PG_END_TRY(); @@ -3450,6 +3457,8 @@ plpgsql_estate_setup(PLpgSQL_execstate *estate, estate->cur_error = NULL; estate->tuple_store = NULL; + estate->handle_exceptions = false; + if (rsi) { estate->tuple_store_cxt = rsi->econtext->ecxt_per_query_memory; @@ -4382,9 +4391,14 @@ exec_stmt_close(PLpgSQL_execstate *estate, PLpgSQL_stmt_close *stmt) portal = SPI_cursor_find(curname); if (portal == NULL) + { + if (estate->handle_exceptions) + return PLPGSQL_RC_OK; + ereport(ERROR, (errcode(ERRCODE_UNDEFINED_CURSOR), errmsg("cursor \"%s\" does not exist", curname))); + } /* ---------- * And close it. 
diff --git a/src/pl/plpgsql/src/plpgsql.h b/src/pl/plpgsql/src/plpgsql.h index 3a810ca2..825a7d5e 100644 --- a/src/pl/plpgsql/src/plpgsql.h +++ b/src/pl/plpgsql/src/plpgsql.h @@ -1,14 +1,14 @@ /*------------------------------------------------------------------------- * - * plpgsql.h - Definitions for the PL/pgSQL - * procedural language + * plpgsql.h - Definitions for the PL/pgSQL + * procedural language * * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * * IDENTIFICATION - * src/pl/plpgsql/src/plpgsql.h + * src/pl/plpgsql/src/plpgsql.h * *------------------------------------------------------------------------- */ @@ -37,10 +37,10 @@ */ typedef enum PLpgSQL_nsitem_type { - PLPGSQL_NSTYPE_LABEL, - PLPGSQL_NSTYPE_VAR, - PLPGSQL_NSTYPE_ROW, - PLPGSQL_NSTYPE_REC + PLPGSQL_NSTYPE_LABEL, + PLPGSQL_NSTYPE_VAR, + PLPGSQL_NSTYPE_ROW, + PLPGSQL_NSTYPE_REC } PLpgSQL_nsitem_type; /* @@ -48,9 +48,9 @@ typedef enum PLpgSQL_nsitem_type */ typedef enum PLpgSQL_label_type { - PLPGSQL_LABEL_BLOCK, /* DECLARE/BEGIN block */ - PLPGSQL_LABEL_LOOP, /* looping construct */ - PLPGSQL_LABEL_OTHER /* anything else */ + PLPGSQL_LABEL_BLOCK, /* DECLARE/BEGIN block */ + PLPGSQL_LABEL_LOOP, /* looping construct */ + PLPGSQL_LABEL_OTHER /* anything else */ } PLpgSQL_label_type; /* @@ -58,12 +58,12 @@ typedef enum PLpgSQL_label_type */ typedef enum PLpgSQL_datum_type { - PLPGSQL_DTYPE_VAR, - PLPGSQL_DTYPE_ROW, - PLPGSQL_DTYPE_REC, - PLPGSQL_DTYPE_RECFIELD, - PLPGSQL_DTYPE_ARRAYELEM, - PLPGSQL_DTYPE_EXPR + PLPGSQL_DTYPE_VAR, + PLPGSQL_DTYPE_ROW, + PLPGSQL_DTYPE_REC, + PLPGSQL_DTYPE_RECFIELD, + PLPGSQL_DTYPE_ARRAYELEM, + PLPGSQL_DTYPE_EXPR } PLpgSQL_datum_type; /* @@ -71,10 +71,10 @@ typedef enum PLpgSQL_datum_type */ typedef enum PLpgSQL_type_type { - PLPGSQL_TTYPE_SCALAR, /* scalar types and domains */ - PLPGSQL_TTYPE_ROW, /* composite types */ - PLPGSQL_TTYPE_REC, /* RECORD pseudotype */ - PLPGSQL_TTYPE_PSEUDO /* other pseudotypes */ + PLPGSQL_TTYPE_SCALAR, /* scalar types and domains */ + PLPGSQL_TTYPE_ROW, /* composite types */ + PLPGSQL_TTYPE_REC, /* RECORD pseudotype */ + PLPGSQL_TTYPE_PSEUDO /* other pseudotypes */ } PLpgSQL_type_type; /* @@ -82,30 +82,30 @@ typedef enum PLpgSQL_type_type */ typedef enum PLpgSQL_stmt_type { - PLPGSQL_STMT_BLOCK, - PLPGSQL_STMT_ASSIGN, - PLPGSQL_STMT_IF, - PLPGSQL_STMT_CASE, - PLPGSQL_STMT_LOOP, - PLPGSQL_STMT_WHILE, - PLPGSQL_STMT_FORI, - PLPGSQL_STMT_FORS, - PLPGSQL_STMT_FORC, - PLPGSQL_STMT_FOREACH_A, - PLPGSQL_STMT_EXIT, - PLPGSQL_STMT_RETURN, - PLPGSQL_STMT_RETURN_NEXT, - PLPGSQL_STMT_RETURN_QUERY, - PLPGSQL_STMT_RAISE, - PLPGSQL_STMT_ASSERT, - PLPGSQL_STMT_EXECSQL, - PLPGSQL_STMT_DYNEXECUTE, - PLPGSQL_STMT_DYNFORS, - PLPGSQL_STMT_GETDIAG, - PLPGSQL_STMT_OPEN, - PLPGSQL_STMT_FETCH, - PLPGSQL_STMT_CLOSE, - PLPGSQL_STMT_PERFORM + PLPGSQL_STMT_BLOCK, + PLPGSQL_STMT_ASSIGN, + PLPGSQL_STMT_IF, + PLPGSQL_STMT_CASE, + PLPGSQL_STMT_LOOP, + PLPGSQL_STMT_WHILE, + PLPGSQL_STMT_FORI, + PLPGSQL_STMT_FORS, + PLPGSQL_STMT_FORC, + PLPGSQL_STMT_FOREACH_A, + PLPGSQL_STMT_EXIT, + PLPGSQL_STMT_RETURN, + PLPGSQL_STMT_RETURN_NEXT, + PLPGSQL_STMT_RETURN_QUERY, + PLPGSQL_STMT_RAISE, + PLPGSQL_STMT_ASSERT, + PLPGSQL_STMT_EXECSQL, + PLPGSQL_STMT_DYNEXECUTE, + PLPGSQL_STMT_DYNFORS, + PLPGSQL_STMT_GETDIAG, + PLPGSQL_STMT_OPEN, + PLPGSQL_STMT_FETCH, + PLPGSQL_STMT_CLOSE, + PLPGSQL_STMT_PERFORM } PLpgSQL_stmt_type; /* @@ -113,10 +113,10 @@ typedef enum PLpgSQL_stmt_type */ enum { - PLPGSQL_RC_OK, - 
PLPGSQL_RC_EXIT, - PLPGSQL_RC_RETURN, - PLPGSQL_RC_CONTINUE + PLPGSQL_RC_OK, + PLPGSQL_RC_EXIT, + PLPGSQL_RC_RETURN, + PLPGSQL_RC_CONTINUE }; /* @@ -124,19 +124,19 @@ enum */ typedef enum PLpgSQL_getdiag_kind { - PLPGSQL_GETDIAG_ROW_COUNT, - PLPGSQL_GETDIAG_RESULT_OID, - PLPGSQL_GETDIAG_CONTEXT, - PLPGSQL_GETDIAG_ERROR_CONTEXT, - PLPGSQL_GETDIAG_ERROR_DETAIL, - PLPGSQL_GETDIAG_ERROR_HINT, - PLPGSQL_GETDIAG_RETURNED_SQLSTATE, - PLPGSQL_GETDIAG_COLUMN_NAME, - PLPGSQL_GETDIAG_CONSTRAINT_NAME, - PLPGSQL_GETDIAG_DATATYPE_NAME, - PLPGSQL_GETDIAG_MESSAGE_TEXT, - PLPGSQL_GETDIAG_TABLE_NAME, - PLPGSQL_GETDIAG_SCHEMA_NAME + PLPGSQL_GETDIAG_ROW_COUNT, + PLPGSQL_GETDIAG_RESULT_OID, + PLPGSQL_GETDIAG_CONTEXT, + PLPGSQL_GETDIAG_ERROR_CONTEXT, + PLPGSQL_GETDIAG_ERROR_DETAIL, + PLPGSQL_GETDIAG_ERROR_HINT, + PLPGSQL_GETDIAG_RETURNED_SQLSTATE, + PLPGSQL_GETDIAG_COLUMN_NAME, + PLPGSQL_GETDIAG_CONSTRAINT_NAME, + PLPGSQL_GETDIAG_DATATYPE_NAME, + PLPGSQL_GETDIAG_MESSAGE_TEXT, + PLPGSQL_GETDIAG_TABLE_NAME, + PLPGSQL_GETDIAG_SCHEMA_NAME } PLpgSQL_getdiag_kind; /* @@ -144,15 +144,15 @@ typedef enum PLpgSQL_getdiag_kind */ typedef enum PLpgSQL_raise_option_type { - PLPGSQL_RAISEOPTION_ERRCODE, - PLPGSQL_RAISEOPTION_MESSAGE, - PLPGSQL_RAISEOPTION_DETAIL, - PLPGSQL_RAISEOPTION_HINT, - PLPGSQL_RAISEOPTION_COLUMN, - PLPGSQL_RAISEOPTION_CONSTRAINT, - PLPGSQL_RAISEOPTION_DATATYPE, - PLPGSQL_RAISEOPTION_TABLE, - PLPGSQL_RAISEOPTION_SCHEMA + PLPGSQL_RAISEOPTION_ERRCODE, + PLPGSQL_RAISEOPTION_MESSAGE, + PLPGSQL_RAISEOPTION_DETAIL, + PLPGSQL_RAISEOPTION_HINT, + PLPGSQL_RAISEOPTION_COLUMN, + PLPGSQL_RAISEOPTION_CONSTRAINT, + PLPGSQL_RAISEOPTION_DATATYPE, + PLPGSQL_RAISEOPTION_TABLE, + PLPGSQL_RAISEOPTION_SCHEMA } PLpgSQL_raise_option_type; /* @@ -160,9 +160,9 @@ typedef enum PLpgSQL_raise_option_type */ typedef enum PLpgSQL_resolve_option { - PLPGSQL_RESOLVE_ERROR, /* throw error if ambiguous */ - PLPGSQL_RESOLVE_VARIABLE, /* prefer plpgsql var to table column */ - PLPGSQL_RESOLVE_COLUMN /* prefer table column to plpgsql var */ + PLPGSQL_RESOLVE_ERROR, /* throw error if ambiguous */ + PLPGSQL_RESOLVE_VARIABLE, /* prefer plpgsql var to table column */ + PLPGSQL_RESOLVE_COLUMN /* prefer table column to plpgsql var */ } PLpgSQL_resolve_option; @@ -175,16 +175,16 @@ typedef enum PLpgSQL_resolve_option */ typedef struct PLpgSQL_type { - char *typname; /* (simple) name of the type */ - Oid typoid; /* OID of the data type */ - PLpgSQL_type_type ttype; /* PLPGSQL_TTYPE_ code */ - int16 typlen; /* stuff copied from its pg_type entry */ - bool typbyval; - char typtype; - Oid typrelid; - Oid collation; /* from pg_type, but can be overridden */ - bool typisarray; /* is "true" array, or domain over one */ - int32 atttypmod; /* typmod (taken from someplace else) */ + char *typname; /* (simple) name of the type */ + Oid typoid; /* OID of the data type */ + PLpgSQL_type_type ttype; /* PLPGSQL_TTYPE_ code */ + int16 typlen; /* stuff copied from its pg_type entry */ + bool typbyval; + char typtype; + Oid typrelid; + Oid collation; /* from pg_type, but can be overridden */ + bool typisarray; /* is "true" array, or domain over one */ + int32 atttypmod; /* typmod (taken from someplace else) */ } PLpgSQL_type; /* @@ -195,8 +195,8 @@ typedef struct PLpgSQL_type */ typedef struct PLpgSQL_datum { - PLpgSQL_datum_type dtype; - int dno; + PLpgSQL_datum_type dtype; + int dno; } PLpgSQL_datum; /* @@ -207,10 +207,10 @@ typedef struct PLpgSQL_datum */ typedef struct PLpgSQL_variable { - PLpgSQL_datum_type dtype; - int dno; - char *refname; - int lineno; + 
PLpgSQL_datum_type dtype; + int dno; + char *refname; + int lineno; } PLpgSQL_variable; /* @@ -218,34 +218,34 @@ typedef struct PLpgSQL_variable */ typedef struct PLpgSQL_expr { - PLpgSQL_datum_type dtype; - int dno; - char *query; - SPIPlanPtr plan; - Bitmapset *paramnos; /* all dnos referenced by this query */ - int rwparam; /* dno of read/write param, or -1 if none */ - - /* function containing this expr (not set until we first parse query) */ - struct PLpgSQL_function *func; - - /* namespace chain visible to this expr */ - struct PLpgSQL_nsitem *ns; - - /* fields for "simple expression" fast-path execution: */ - Expr *expr_simple_expr; /* NULL means not a simple expr */ - int expr_simple_generation; /* plancache generation we checked */ - Oid expr_simple_type; /* result type Oid, if simple */ - int32 expr_simple_typmod; /* result typmod, if simple */ - - /* - * if expr is simple AND prepared in current transaction, - * expr_simple_state and expr_simple_in_use are valid. Test validity by - * seeing if expr_simple_lxid matches current LXID. (If not, - * expr_simple_state probably points at garbage!) - */ - ExprState *expr_simple_state; /* eval tree for expr_simple_expr */ - bool expr_simple_in_use; /* true if eval tree is active */ - LocalTransactionId expr_simple_lxid; + PLpgSQL_datum_type dtype; + int dno; + char *query; + SPIPlanPtr plan; + Bitmapset *paramnos; /* all dnos referenced by this query */ + int rwparam; /* dno of read/write param, or -1 if none */ + + /* function containing this expr (not set until we first parse query) */ + struct PLpgSQL_function *func; + + /* namespace chain visible to this expr */ + struct PLpgSQL_nsitem *ns; + + /* fields for "simple expression" fast-path execution: */ + Expr *expr_simple_expr; /* NULL means not a simple expr */ + int expr_simple_generation; /* plancache generation we checked */ + Oid expr_simple_type; /* result type Oid, if simple */ + int32 expr_simple_typmod; /* result typmod, if simple */ + + /* + * if expr is simple AND prepared in current transaction, + * expr_simple_state and expr_simple_in_use are valid. Test validity by + * seeing if expr_simple_lxid matches current LXID. (If not, + * expr_simple_state probably points at garbage!) + */ + ExprState *expr_simple_state; /* eval tree for expr_simple_expr */ + bool expr_simple_in_use; /* true if eval tree is active */ + LocalTransactionId expr_simple_lxid; } PLpgSQL_expr; /* @@ -253,22 +253,22 @@ typedef struct PLpgSQL_expr */ typedef struct PLpgSQL_var { - PLpgSQL_datum_type dtype; - int dno; - char *refname; - int lineno; - - PLpgSQL_type *datatype; - int isconst; - int notnull; - PLpgSQL_expr *default_val; - PLpgSQL_expr *cursor_explicit_expr; - int cursor_explicit_argrow; - int cursor_options; - - Datum value; - bool isnull; - bool freeval; + PLpgSQL_datum_type dtype; + int dno; + char *refname; + int lineno; + + PLpgSQL_type *datatype; + int isconst; + int notnull; + PLpgSQL_expr *default_val; + PLpgSQL_expr *cursor_explicit_expr; + int cursor_explicit_argrow; + int cursor_options; + + Datum value; + bool isnull; + bool freeval; } PLpgSQL_var; /* @@ -276,22 +276,22 @@ typedef struct PLpgSQL_var */ typedef struct PLpgSQL_row { - PLpgSQL_datum_type dtype; - int dno; - char *refname; - int lineno; - - /* Note: TupleDesc is only set up for named rowtypes, else it is NULL. 
*/ - TupleDesc rowtupdesc; - - /* - * Note: if the underlying rowtype contains a dropped column, the - * corresponding fieldnames[] entry will be NULL, and there is no - * corresponding var (varnos[] will be -1). - */ - int nfields; - char **fieldnames; - int *varnos; + PLpgSQL_datum_type dtype; + int dno; + char *refname; + int lineno; + + /* Note: TupleDesc is only set up for named rowtypes, else it is NULL. */ + TupleDesc rowtupdesc; + + /* + * Note: if the underlying rowtype contains a dropped column, the + * corresponding fieldnames[] entry will be NULL, and there is no + * corresponding var (varnos[] will be -1). + */ + int nfields; + char **fieldnames; + int *varnos; } PLpgSQL_row; /* @@ -299,15 +299,15 @@ typedef struct PLpgSQL_row */ typedef struct PLpgSQL_rec { - PLpgSQL_datum_type dtype; - int dno; - char *refname; - int lineno; - - HeapTuple tup; - TupleDesc tupdesc; - bool freetup; - bool freetupdesc; + PLpgSQL_datum_type dtype; + int dno; + char *refname; + int lineno; + + HeapTuple tup; + TupleDesc tupdesc; + bool freetup; + bool freetupdesc; } PLpgSQL_rec; /* @@ -315,10 +315,10 @@ typedef struct PLpgSQL_rec */ typedef struct PLpgSQL_recfield { - PLpgSQL_datum_type dtype; - int dno; - char *fieldname; - int recparentno; /* dno of parent record */ + PLpgSQL_datum_type dtype; + int dno; + char *fieldname; + int recparentno; /* dno of parent record */ } PLpgSQL_recfield; /* @@ -326,21 +326,21 @@ typedef struct PLpgSQL_recfield */ typedef struct PLpgSQL_arrayelem { - PLpgSQL_datum_type dtype; - int dno; - PLpgSQL_expr *subscript; - int arrayparentno; /* dno of parent array variable */ - - /* Remaining fields are cached info about the array variable's type */ - Oid parenttypoid; /* type of array variable; 0 if not yet set */ - int32 parenttypmod; /* typmod of array variable */ - Oid arraytypoid; /* OID of actual array type */ - int32 arraytypmod; /* typmod of array (and its elements too) */ - int16 arraytyplen; /* typlen of array type */ - Oid elemtypoid; /* OID of array element type */ - int16 elemtyplen; /* typlen of element type */ - bool elemtypbyval; /* element type is pass-by-value? */ - char elemtypalign; /* typalign of element type */ + PLpgSQL_datum_type dtype; + int dno; + PLpgSQL_expr *subscript; + int arrayparentno; /* dno of parent array variable */ + + /* Remaining fields are cached info about the array variable's type */ + Oid parenttypoid; /* type of array variable; 0 if not yet set */ + int32 parenttypmod; /* typmod of array variable */ + Oid arraytypoid; /* OID of actual array type */ + int32 arraytypmod; /* typmod of array (and its elements too) */ + int16 arraytyplen; /* typlen of array type */ + Oid elemtypoid; /* OID of array element type */ + int16 elemtyplen; /* typlen of element type */ + bool elemtypbyval; /* element type is pass-by-value? */ + char elemtypalign; /* typalign of element type */ } PLpgSQL_arrayelem; /* @@ -348,15 +348,15 @@ typedef struct PLpgSQL_arrayelem */ typedef struct PLpgSQL_nsitem { - PLpgSQL_nsitem_type itemtype; - - /* - * For labels, itemno is a value of enum PLpgSQL_label_type. For other - * itemtypes, itemno is the associated PLpgSQL_datum's dno. - */ - int itemno; - struct PLpgSQL_nsitem *prev; - char name[FLEXIBLE_ARRAY_MEMBER]; /* nul-terminated string */ + PLpgSQL_nsitem_type itemtype; + + /* + * For labels, itemno is a value of enum PLpgSQL_label_type. For other + * itemtypes, itemno is the associated PLpgSQL_datum's dno. 
+ */ + int itemno; + struct PLpgSQL_nsitem *prev; + char name[FLEXIBLE_ARRAY_MEMBER]; /* nul-terminated string */ } PLpgSQL_nsitem; /* @@ -364,8 +364,8 @@ typedef struct PLpgSQL_nsitem */ typedef struct PLpgSQL_stmt { - PLpgSQL_stmt_type cmd_type; - int lineno; + PLpgSQL_stmt_type cmd_type; + int lineno; } PLpgSQL_stmt; /* @@ -373,9 +373,9 @@ typedef struct PLpgSQL_stmt */ typedef struct PLpgSQL_condition { - int sqlerrstate; /* SQLSTATE code */ - char *condname; /* condition name (for debugging) */ - struct PLpgSQL_condition *next; + int sqlerrstate; /* SQLSTATE code */ + char *condname; /* condition name (for debugging) */ + struct PLpgSQL_condition *next; } PLpgSQL_condition; /* @@ -383,9 +383,9 @@ typedef struct PLpgSQL_condition */ typedef struct PLpgSQL_exception_block { - int sqlstate_varno; - int sqlerrm_varno; - List *exc_list; /* List of WHEN clauses */ + int sqlstate_varno; + int sqlerrm_varno; + List *exc_list; /* List of WHEN clauses */ } PLpgSQL_exception_block; /* @@ -393,9 +393,9 @@ typedef struct PLpgSQL_exception_block */ typedef struct PLpgSQL_exception { - int lineno; - PLpgSQL_condition *conditions; - List *action; /* List of statements */ + int lineno; + PLpgSQL_condition *conditions; + List *action; /* List of statements */ } PLpgSQL_exception; /* @@ -403,13 +403,13 @@ typedef struct PLpgSQL_exception */ typedef struct PLpgSQL_stmt_block { - PLpgSQL_stmt_type cmd_type; - int lineno; - char *label; - List *body; /* List of statements */ - int n_initvars; - int *initvarnos; - PLpgSQL_exception_block *exceptions; + PLpgSQL_stmt_type cmd_type; + int lineno; + char *label; + List *body; /* List of statements */ + int n_initvars; + int *initvarnos; + PLpgSQL_exception_block *exceptions; } PLpgSQL_stmt_block; /* @@ -417,10 +417,10 @@ typedef struct PLpgSQL_stmt_block */ typedef struct PLpgSQL_stmt_assign { - PLpgSQL_stmt_type cmd_type; - int lineno; - int varno; - PLpgSQL_expr *expr; + PLpgSQL_stmt_type cmd_type; + int lineno; + int varno; + PLpgSQL_expr *expr; } PLpgSQL_stmt_assign; /* @@ -428,9 +428,9 @@ typedef struct PLpgSQL_stmt_assign */ typedef struct PLpgSQL_stmt_perform { - PLpgSQL_stmt_type cmd_type; - int lineno; - PLpgSQL_expr *expr; + PLpgSQL_stmt_type cmd_type; + int lineno; + PLpgSQL_expr *expr; } PLpgSQL_stmt_perform; /* @@ -438,8 +438,8 @@ typedef struct PLpgSQL_stmt_perform */ typedef struct PLpgSQL_diag_item { - PLpgSQL_getdiag_kind kind; /* id for diagnostic value desired */ - int target; /* where to assign it */ + PLpgSQL_getdiag_kind kind; /* id for diagnostic value desired */ + int target; /* where to assign it */ } PLpgSQL_diag_item; /* @@ -447,10 +447,10 @@ typedef struct PLpgSQL_diag_item */ typedef struct PLpgSQL_stmt_getdiag { - PLpgSQL_stmt_type cmd_type; - int lineno; - bool is_stacked; /* STACKED or CURRENT diagnostics area? */ - List *diag_items; /* List of PLpgSQL_diag_item */ + PLpgSQL_stmt_type cmd_type; + int lineno; + bool is_stacked; /* STACKED or CURRENT diagnostics area? 
*/ + List *diag_items; /* List of PLpgSQL_diag_item */ } PLpgSQL_stmt_getdiag; /* @@ -458,12 +458,12 @@ typedef struct PLpgSQL_stmt_getdiag */ typedef struct PLpgSQL_stmt_if { - PLpgSQL_stmt_type cmd_type; - int lineno; - PLpgSQL_expr *cond; /* boolean expression for THEN */ - List *then_body; /* List of statements */ - List *elsif_list; /* List of PLpgSQL_if_elsif structs */ - List *else_body; /* List of statements */ + PLpgSQL_stmt_type cmd_type; + int lineno; + PLpgSQL_expr *cond; /* boolean expression for THEN */ + List *then_body; /* List of statements */ + List *elsif_list; /* List of PLpgSQL_if_elsif structs */ + List *else_body; /* List of statements */ } PLpgSQL_stmt_if; /* @@ -471,9 +471,9 @@ typedef struct PLpgSQL_stmt_if */ typedef struct PLpgSQL_if_elsif { - int lineno; - PLpgSQL_expr *cond; /* boolean expression for this case */ - List *stmts; /* List of statements */ + int lineno; + PLpgSQL_expr *cond; /* boolean expression for this case */ + List *stmts; /* List of statements */ } PLpgSQL_if_elsif; /* @@ -481,13 +481,13 @@ typedef struct PLpgSQL_if_elsif */ typedef struct PLpgSQL_stmt_case { - PLpgSQL_stmt_type cmd_type; - int lineno; - PLpgSQL_expr *t_expr; /* test expression, or NULL if none */ - int t_varno; /* var to store test expression value into */ - List *case_when_list; /* List of PLpgSQL_case_when structs */ - bool have_else; /* flag needed because list could be empty */ - List *else_stmts; /* List of statements */ + PLpgSQL_stmt_type cmd_type; + int lineno; + PLpgSQL_expr *t_expr; /* test expression, or NULL if none */ + int t_varno; /* var to store test expression value into */ + List *case_when_list; /* List of PLpgSQL_case_when structs */ + bool have_else; /* flag needed because list could be empty */ + List *else_stmts; /* List of statements */ } PLpgSQL_stmt_case; /* @@ -495,9 +495,9 @@ typedef struct PLpgSQL_stmt_case */ typedef struct PLpgSQL_case_when { - int lineno; - PLpgSQL_expr *expr; /* boolean expression for this case */ - List *stmts; /* List of statements */ + int lineno; + PLpgSQL_expr *expr; /* boolean expression for this case */ + List *stmts; /* List of statements */ } PLpgSQL_case_when; /* @@ -505,10 +505,10 @@ typedef struct PLpgSQL_case_when */ typedef struct PLpgSQL_stmt_loop { - PLpgSQL_stmt_type cmd_type; - int lineno; - char *label; - List *body; /* List of statements */ + PLpgSQL_stmt_type cmd_type; + int lineno; + char *label; + List *body; /* List of statements */ } PLpgSQL_stmt_loop; /* @@ -516,11 +516,11 @@ typedef struct PLpgSQL_stmt_loop */ typedef struct PLpgSQL_stmt_while { - PLpgSQL_stmt_type cmd_type; - int lineno; - char *label; - PLpgSQL_expr *cond; - List *body; /* List of statements */ + PLpgSQL_stmt_type cmd_type; + int lineno; + char *label; + PLpgSQL_expr *cond; + List *body; /* List of statements */ } PLpgSQL_stmt_while; /* @@ -528,15 +528,15 @@ typedef struct PLpgSQL_stmt_while */ typedef struct PLpgSQL_stmt_fori { - PLpgSQL_stmt_type cmd_type; - int lineno; - char *label; - PLpgSQL_var *var; - PLpgSQL_expr *lower; - PLpgSQL_expr *upper; - PLpgSQL_expr *step; /* NULL means default (ie, BY 1) */ - int reverse; - List *body; /* List of statements */ + PLpgSQL_stmt_type cmd_type; + int lineno; + char *label; + PLpgSQL_var *var; + PLpgSQL_expr *lower; + PLpgSQL_expr *upper; + PLpgSQL_expr *step; /* NULL means default (ie, BY 1) */ + int reverse; + List *body; /* List of statements */ } PLpgSQL_stmt_fori; /* @@ -546,12 +546,12 @@ typedef struct PLpgSQL_stmt_fori */ typedef struct PLpgSQL_stmt_forq { - PLpgSQL_stmt_type 
cmd_type; - int lineno; - char *label; - PLpgSQL_rec *rec; - PLpgSQL_row *row; - List *body; /* List of statements */ + PLpgSQL_stmt_type cmd_type; + int lineno; + char *label; + PLpgSQL_rec *rec; + PLpgSQL_row *row; + List *body; /* List of statements */ } PLpgSQL_stmt_forq; /* @@ -559,14 +559,14 @@ typedef struct PLpgSQL_stmt_forq */ typedef struct PLpgSQL_stmt_fors { - PLpgSQL_stmt_type cmd_type; - int lineno; - char *label; - PLpgSQL_rec *rec; - PLpgSQL_row *row; - List *body; /* List of statements */ - /* end of fields that must match PLpgSQL_stmt_forq */ - PLpgSQL_expr *query; + PLpgSQL_stmt_type cmd_type; + int lineno; + char *label; + PLpgSQL_rec *rec; + PLpgSQL_row *row; + List *body; /* List of statements */ + /* end of fields that must match PLpgSQL_stmt_forq */ + PLpgSQL_expr *query; } PLpgSQL_stmt_fors; /* @@ -574,15 +574,15 @@ typedef struct PLpgSQL_stmt_fors */ typedef struct PLpgSQL_stmt_forc { - PLpgSQL_stmt_type cmd_type; - int lineno; - char *label; - PLpgSQL_rec *rec; - PLpgSQL_row *row; - List *body; /* List of statements */ - /* end of fields that must match PLpgSQL_stmt_forq */ - int curvar; - PLpgSQL_expr *argquery; /* cursor arguments if any */ + PLpgSQL_stmt_type cmd_type; + int lineno; + char *label; + PLpgSQL_rec *rec; + PLpgSQL_row *row; + List *body; /* List of statements */ + /* end of fields that must match PLpgSQL_stmt_forq */ + int curvar; + PLpgSQL_expr *argquery; /* cursor arguments if any */ } PLpgSQL_stmt_forc; /* @@ -590,15 +590,15 @@ typedef struct PLpgSQL_stmt_forc */ typedef struct PLpgSQL_stmt_dynfors { - PLpgSQL_stmt_type cmd_type; - int lineno; - char *label; - PLpgSQL_rec *rec; - PLpgSQL_row *row; - List *body; /* List of statements */ - /* end of fields that must match PLpgSQL_stmt_forq */ - PLpgSQL_expr *query; - List *params; /* USING expressions */ + PLpgSQL_stmt_type cmd_type; + int lineno; + char *label; + PLpgSQL_rec *rec; + PLpgSQL_row *row; + List *body; /* List of statements */ + /* end of fields that must match PLpgSQL_stmt_forq */ + PLpgSQL_expr *query; + List *params; /* USING expressions */ } PLpgSQL_stmt_dynfors; /* @@ -606,13 +606,13 @@ typedef struct PLpgSQL_stmt_dynfors */ typedef struct PLpgSQL_stmt_foreach_a { - PLpgSQL_stmt_type cmd_type; - int lineno; - char *label; - int varno; /* loop target variable */ - int slice; /* slice dimension, or 0 */ - PLpgSQL_expr *expr; /* array expression */ - List *body; /* List of statements */ + PLpgSQL_stmt_type cmd_type; + int lineno; + char *label; + int varno; /* loop target variable */ + int slice; /* slice dimension, or 0 */ + PLpgSQL_expr *expr; /* array expression */ + List *body; /* List of statements */ } PLpgSQL_stmt_foreach_a; /* @@ -620,15 +620,15 @@ typedef struct PLpgSQL_stmt_foreach_a */ typedef struct PLpgSQL_stmt_open { - PLpgSQL_stmt_type cmd_type; - int lineno; - int curvar; - int cursor_options; - PLpgSQL_row *returntype; - PLpgSQL_expr *argquery; - PLpgSQL_expr *query; - PLpgSQL_expr *dynquery; - List *params; /* USING expressions */ + PLpgSQL_stmt_type cmd_type; + int lineno; + int curvar; + int cursor_options; + PLpgSQL_row *returntype; + PLpgSQL_expr *argquery; + PLpgSQL_expr *query; + PLpgSQL_expr *dynquery; + List *params; /* USING expressions */ } PLpgSQL_stmt_open; /* @@ -636,16 +636,16 @@ typedef struct PLpgSQL_stmt_open */ typedef struct PLpgSQL_stmt_fetch { - PLpgSQL_stmt_type cmd_type; - int lineno; - PLpgSQL_rec *rec; /* target, as record or row */ - PLpgSQL_row *row; - int curvar; /* cursor variable to fetch from */ - FetchDirection direction; /* fetch 
direction */ - long how_many; /* count, if constant (expr is NULL) */ - PLpgSQL_expr *expr; /* count, if expression */ - bool is_move; /* is this a fetch or move? */ - bool returns_multiple_rows; /* can return more than one row? */ + PLpgSQL_stmt_type cmd_type; + int lineno; + PLpgSQL_rec *rec; /* target, as record or row */ + PLpgSQL_row *row; + int curvar; /* cursor variable to fetch from */ + FetchDirection direction; /* fetch direction */ + long how_many; /* count, if constant (expr is NULL) */ + PLpgSQL_expr *expr; /* count, if expression */ + bool is_move; /* is this a fetch or move? */ + bool returns_multiple_rows; /* can return more than one row? */ } PLpgSQL_stmt_fetch; /* @@ -653,9 +653,9 @@ typedef struct PLpgSQL_stmt_fetch */ typedef struct PLpgSQL_stmt_close { - PLpgSQL_stmt_type cmd_type; - int lineno; - int curvar; + PLpgSQL_stmt_type cmd_type; + int lineno; + int curvar; } PLpgSQL_stmt_close; /* @@ -663,11 +663,11 @@ typedef struct PLpgSQL_stmt_close */ typedef struct PLpgSQL_stmt_exit { - PLpgSQL_stmt_type cmd_type; - int lineno; - bool is_exit; /* Is this an exit or a continue? */ - char *label; /* NULL if it's an unlabelled EXIT/CONTINUE */ - PLpgSQL_expr *cond; + PLpgSQL_stmt_type cmd_type; + int lineno; + bool is_exit; /* Is this an exit or a continue? */ + char *label; /* NULL if it's an unlabelled EXIT/CONTINUE */ + PLpgSQL_expr *cond; } PLpgSQL_stmt_exit; /* @@ -675,10 +675,10 @@ typedef struct PLpgSQL_stmt_exit */ typedef struct PLpgSQL_stmt_return { - PLpgSQL_stmt_type cmd_type; - int lineno; - PLpgSQL_expr *expr; - int retvarno; + PLpgSQL_stmt_type cmd_type; + int lineno; + PLpgSQL_expr *expr; + int retvarno; } PLpgSQL_stmt_return; /* @@ -686,10 +686,10 @@ typedef struct PLpgSQL_stmt_return */ typedef struct PLpgSQL_stmt_return_next { - PLpgSQL_stmt_type cmd_type; - int lineno; - PLpgSQL_expr *expr; - int retvarno; + PLpgSQL_stmt_type cmd_type; + int lineno; + PLpgSQL_expr *expr; + int retvarno; } PLpgSQL_stmt_return_next; /* @@ -697,11 +697,11 @@ typedef struct PLpgSQL_stmt_return_next */ typedef struct PLpgSQL_stmt_return_query { - PLpgSQL_stmt_type cmd_type; - int lineno; - PLpgSQL_expr *query; /* if static query */ - PLpgSQL_expr *dynquery; /* if dynamic query (RETURN QUERY EXECUTE) */ - List *params; /* USING arguments for dynamic query */ + PLpgSQL_stmt_type cmd_type; + int lineno; + PLpgSQL_expr *query; /* if static query */ + PLpgSQL_expr *dynquery; /* if dynamic query (RETURN QUERY EXECUTE) */ + List *params; /* USING arguments for dynamic query */ } PLpgSQL_stmt_return_query; /* @@ -709,13 +709,13 @@ typedef struct PLpgSQL_stmt_return_query */ typedef struct PLpgSQL_stmt_raise { - PLpgSQL_stmt_type cmd_type; - int lineno; - int elog_level; - char *condname; /* condition name, SQLSTATE, or NULL */ - char *message; /* old-style message format literal, or NULL */ - List *params; /* list of expressions for old-style message */ - List *options; /* list of PLpgSQL_raise_option */ + PLpgSQL_stmt_type cmd_type; + int lineno; + int elog_level; + char *condname; /* condition name, SQLSTATE, or NULL */ + char *message; /* old-style message format literal, or NULL */ + List *params; /* list of expressions for old-style message */ + List *options; /* list of PLpgSQL_raise_option */ } PLpgSQL_stmt_raise; /* @@ -723,8 +723,8 @@ typedef struct PLpgSQL_stmt_raise */ typedef struct PLpgSQL_raise_option { - PLpgSQL_raise_option_type opt_type; - PLpgSQL_expr *expr; + PLpgSQL_raise_option_type opt_type; + PLpgSQL_expr *expr; } PLpgSQL_raise_option; /* @@ -732,10 +732,10 @@ 
typedef struct PLpgSQL_raise_option */ typedef struct PLpgSQL_stmt_assert { - PLpgSQL_stmt_type cmd_type; - int lineno; - PLpgSQL_expr *cond; - PLpgSQL_expr *message; + PLpgSQL_stmt_type cmd_type; + int lineno; + PLpgSQL_expr *cond; + PLpgSQL_expr *message; } PLpgSQL_stmt_assert; /* @@ -743,15 +743,15 @@ typedef struct PLpgSQL_stmt_assert */ typedef struct PLpgSQL_stmt_execsql { - PLpgSQL_stmt_type cmd_type; - int lineno; - PLpgSQL_expr *sqlstmt; - bool mod_stmt; /* is the stmt INSERT/UPDATE/DELETE? Note: - * mod_stmt is set when we plan the query */ - bool into; /* INTO supplied? */ - bool strict; /* INTO STRICT flag */ - PLpgSQL_rec *rec; /* INTO target, if record */ - PLpgSQL_row *row; /* INTO target, if row */ + PLpgSQL_stmt_type cmd_type; + int lineno; + PLpgSQL_expr *sqlstmt; + bool mod_stmt; /* is the stmt INSERT/UPDATE/DELETE? Note: + * mod_stmt is set when we plan the query */ + bool into; /* INTO supplied? */ + bool strict; /* INTO STRICT flag */ + PLpgSQL_rec *rec; /* INTO target, if record */ + PLpgSQL_row *row; /* INTO target, if row */ } PLpgSQL_stmt_execsql; /* @@ -759,14 +759,14 @@ typedef struct PLpgSQL_stmt_execsql */ typedef struct PLpgSQL_stmt_dynexecute { - PLpgSQL_stmt_type cmd_type; - int lineno; - PLpgSQL_expr *query; /* string expression */ - bool into; /* INTO supplied? */ - bool strict; /* INTO STRICT flag */ - PLpgSQL_rec *rec; /* INTO target, if record */ - PLpgSQL_row *row; /* INTO target, if row */ - List *params; /* USING expressions */ + PLpgSQL_stmt_type cmd_type; + int lineno; + PLpgSQL_expr *query; /* string expression */ + bool into; /* INTO supplied? */ + bool strict; /* INTO STRICT flag */ + PLpgSQL_rec *rec; /* INTO target, if record */ + PLpgSQL_row *row; /* INTO target, if row */ + List *params; /* USING expressions */ } PLpgSQL_stmt_dynexecute; /* @@ -774,32 +774,32 @@ typedef struct PLpgSQL_stmt_dynexecute */ typedef struct PLpgSQL_func_hashkey { - Oid funcOid; - - bool isTrigger; /* true if called as a trigger */ - - /* be careful that pad bytes in this struct get zeroed! */ - - /* - * For a trigger function, the OID of the trigger is part of the hash key - * --- we want to compile the trigger function separately for each trigger - * it is used with, in case the rowtype or transition table names are - * different. Zero if not called as a trigger. - */ - Oid trigOid; - - /* - * We must include the input collation as part of the hash key too, - * because we have to generate different plans (with different Param - * collations) for different collation settings. - */ - Oid inputCollation; - - /* - * We include actual argument types in the hash key to support polymorphic - * PLpgSQL functions. Be careful that extra positions are zeroed! - */ - Oid argtypes[FUNC_MAX_ARGS]; + Oid funcOid; + + bool isTrigger; /* true if called as a trigger */ + + /* be careful that pad bytes in this struct get zeroed! */ + + /* + * For a trigger function, the OID of the trigger is part of the hash key + * --- we want to compile the trigger function separately for each trigger + * it is used with, in case the rowtype or transition table names are + * different. Zero if not called as a trigger. + */ + Oid trigOid; + + /* + * We must include the input collation as part of the hash key too, + * because we have to generate different plans (with different Param + * collations) for different collation settings. + */ + Oid inputCollation; + + /* + * We include actual argument types in the hash key to support polymorphic + * PLpgSQL functions. 
Be careful that extra positions are zeroed! + */ + Oid argtypes[FUNC_MAX_ARGS]; } PLpgSQL_func_hashkey; /* @@ -807,9 +807,9 @@ typedef struct PLpgSQL_func_hashkey */ typedef enum PLpgSQL_trigtype { - PLPGSQL_DML_TRIGGER, - PLPGSQL_EVENT_TRIGGER, - PLPGSQL_NOT_TRIGGER + PLPGSQL_DML_TRIGGER, + PLPGSQL_EVENT_TRIGGER, + PLPGSQL_NOT_TRIGGER } PLpgSQL_trigtype; /* @@ -817,62 +817,62 @@ typedef enum PLpgSQL_trigtype */ typedef struct PLpgSQL_function { - char *fn_signature; - Oid fn_oid; - TransactionId fn_xmin; - ItemPointerData fn_tid; - PLpgSQL_trigtype fn_is_trigger; - Oid fn_input_collation; - PLpgSQL_func_hashkey *fn_hashkey; /* back-link to hashtable key */ - MemoryContext fn_cxt; - - Oid fn_rettype; - int fn_rettyplen; - bool fn_retbyval; - bool fn_retistuple; - bool fn_retset; - bool fn_readonly; - - int fn_nargs; - int fn_argvarnos[FUNC_MAX_ARGS]; - int out_param_varno; - int found_varno; - int new_varno; - int old_varno; - int tg_name_varno; - int tg_when_varno; - int tg_level_varno; - int tg_op_varno; - int tg_relid_varno; - int tg_relname_varno; - int tg_table_name_varno; - int tg_table_schema_varno; - int tg_nargs_varno; - int tg_argv_varno; - - /* for event triggers */ - int tg_event_varno; - int tg_tag_varno; - - PLpgSQL_resolve_option resolve_option; - - bool print_strict_params; - - /* extra checks */ - int extra_warnings; - int extra_errors; - - /* the datums representing the function's local variables */ - int ndatums; - PLpgSQL_datum **datums; - Bitmapset *resettable_datums; /* dnos of non-simple vars */ - - /* function body parsetree */ - PLpgSQL_stmt_block *action; - - /* these fields change when the function is used */ - struct PLpgSQL_execstate *cur_estate; - unsigned long use_count; + char *fn_signature; + Oid fn_oid; + TransactionId fn_xmin; + ItemPointerData fn_tid; + PLpgSQL_trigtype fn_is_trigger; + Oid fn_input_collation; + PLpgSQL_func_hashkey *fn_hashkey; /* back-link to hashtable key */ + MemoryContext fn_cxt; + + Oid fn_rettype; + int fn_rettyplen; + bool fn_retbyval; + bool fn_retistuple; + bool fn_retset; + bool fn_readonly; + + int fn_nargs; + int fn_argvarnos[FUNC_MAX_ARGS]; + int out_param_varno; + int found_varno; + int new_varno; + int old_varno; + int tg_name_varno; + int tg_when_varno; + int tg_level_varno; + int tg_op_varno; + int tg_relid_varno; + int tg_relname_varno; + int tg_table_name_varno; + int tg_table_schema_varno; + int tg_nargs_varno; + int tg_argv_varno; + + /* for event triggers */ + int tg_event_varno; + int tg_tag_varno; + + PLpgSQL_resolve_option resolve_option; + + bool print_strict_params; + + /* extra checks */ + int extra_warnings; + int extra_errors; + + /* the datums representing the function's local variables */ + int ndatums; + PLpgSQL_datum **datums; + Bitmapset *resettable_datums; /* dnos of non-simple vars */ + + /* function body parsetree */ + PLpgSQL_stmt_block *action; + + /* these fields change when the function is used */ + struct PLpgSQL_execstate *cur_estate; + unsigned long use_count; } PLpgSQL_function; /* @@ -880,59 +880,60 @@ typedef struct PLpgSQL_function */ typedef struct PLpgSQL_execstate { - PLpgSQL_function *func; /* function being executed */ + PLpgSQL_function *func; /* function being executed */ - Datum retval; - bool retisnull; - Oid rettype; /* type of current retval */ + Datum retval; + bool retisnull; + Oid rettype; /* type of current retval */ - Oid fn_rettype; /* info about declared function rettype */ - bool retistuple; - bool retisset; + Oid fn_rettype; /* info about declared function rettype */ + 
bool retistuple; + bool retisset; - bool readonly_func; + bool readonly_func; - TupleDesc rettupdesc; - char *exitlabel; /* the "target" label of the current EXIT or - * CONTINUE stmt, if any */ - ErrorData *cur_error; /* current exception handler's error */ + TupleDesc rettupdesc; + char *exitlabel; /* the "target" label of the current EXIT or + * CONTINUE stmt, if any */ + ErrorData *cur_error; /* current exception handler's error */ - Tuplestorestate *tuple_store; /* SRFs accumulate results here */ - MemoryContext tuple_store_cxt; - ResourceOwner tuple_store_owner; - ReturnSetInfo *rsi; + Tuplestorestate *tuple_store; /* SRFs accumulate results here */ + MemoryContext tuple_store_cxt; + ResourceOwner tuple_store_owner; + ReturnSetInfo *rsi; - /* the datums representing the function's local variables */ - int found_varno; - int ndatums; - PLpgSQL_datum **datums; + /* the datums representing the function's local variables */ + int found_varno; + int ndatums; + PLpgSQL_datum **datums; - /* we pass datums[i] to the executor, when needed, in paramLI->params[i] */ - ParamListInfo paramLI; - bool params_dirty; /* T if any resettable datum has been passed */ + /* we pass datums[i] to the executor, when needed, in paramLI->params[i] */ + ParamListInfo paramLI; + bool params_dirty; /* T if any resettable datum has been passed */ - /* EState to use for "simple" expression evaluation */ - EState *simple_eval_estate; + /* EState to use for "simple" expression evaluation */ + EState *simple_eval_estate; - /* lookup table to use for executing type casts */ - HTAB *cast_hash; - MemoryContext cast_hash_context; + /* lookup table to use for executing type casts */ + HTAB *cast_hash; + MemoryContext cast_hash_context; - /* memory context for statement-lifespan temporary values */ - MemoryContext stmt_mcontext; /* current stmt context, or NULL if none */ - MemoryContext stmt_mcontext_parent; /* parent of current context */ + /* memory context for statement-lifespan temporary values */ + MemoryContext stmt_mcontext; /* current stmt context, or NULL if none */ + MemoryContext stmt_mcontext_parent; /* parent of current context */ - /* temporary state for results from evaluation of query or expr */ - SPITupleTable *eval_tuptable; - uint64 eval_processed; - Oid eval_lastoid; - ExprContext *eval_econtext; /* for executing simple expressions */ + /* temporary state for results from evaluation of query or expr */ + SPITupleTable *eval_tuptable; + uint64 eval_processed; + Oid eval_lastoid; + ExprContext *eval_econtext; /* for executing simple expressions */ - /* status information for error context reporting */ - PLpgSQL_stmt *err_stmt; /* current stmt */ - const char *err_text; /* additional state info */ + /* status information for error context reporting */ + PLpgSQL_stmt *err_stmt; /* current stmt */ + const char *err_text; /* additional state info */ - void *plugin_info; /* reserved for use by optional plugin */ + void *plugin_info; /* reserved for use by optional plugin */ + bool handle_exceptions; } PLpgSQL_execstate; /* @@ -967,17 +968,17 @@ typedef struct PLpgSQL_execstate */ typedef struct PLpgSQL_plugin { - /* Function pointers set up by the plugin */ - void (*func_setup) (PLpgSQL_execstate *estate, PLpgSQL_function *func); - void (*func_beg) (PLpgSQL_execstate *estate, PLpgSQL_function *func); - void (*func_end) (PLpgSQL_execstate *estate, PLpgSQL_function *func); - void (*stmt_beg) (PLpgSQL_execstate *estate, PLpgSQL_stmt *stmt); - void (*stmt_end) (PLpgSQL_execstate *estate, PLpgSQL_stmt *stmt); - - 
/* Function pointers set by PL/pgSQL itself */ - void (*error_callback) (void *arg); - void (*assign_expr) (PLpgSQL_execstate *estate, PLpgSQL_datum *target, - PLpgSQL_expr *expr); + /* Function pointers set up by the plugin */ + void (*func_setup) (PLpgSQL_execstate *estate, PLpgSQL_function *func); + void (*func_beg) (PLpgSQL_execstate *estate, PLpgSQL_function *func); + void (*func_end) (PLpgSQL_execstate *estate, PLpgSQL_function *func); + void (*stmt_beg) (PLpgSQL_execstate *estate, PLpgSQL_stmt *stmt); + void (*stmt_end) (PLpgSQL_execstate *estate, PLpgSQL_stmt *stmt); + + /* Function pointers set by PL/pgSQL itself */ + void (*error_callback) (void *arg); + void (*assign_expr) (PLpgSQL_execstate *estate, PLpgSQL_datum *target, + PLpgSQL_expr *expr); } PLpgSQL_plugin; /* @@ -986,21 +987,21 @@ typedef struct PLpgSQL_plugin typedef struct PLword { - char *ident; /* palloc'd converted identifier */ - bool quoted; /* Was it double-quoted? */ + char *ident; /* palloc'd converted identifier */ + bool quoted; /* Was it double-quoted? */ } PLword; typedef struct PLcword { - List *idents; /* composite identifiers (list of String) */ + List *idents; /* composite identifiers (list of String) */ } PLcword; typedef struct PLwdatum { - PLpgSQL_datum *datum; /* referenced variable */ - char *ident; /* valid if simple name */ - bool quoted; - List *idents; /* valid if composite name */ + PLpgSQL_datum *datum; /* referenced variable */ + char *ident; /* valid if simple name */ + bool quoted; + List *idents; /* valid if composite name */ } PLwdatum; /********************************************************************** @@ -1009,33 +1010,33 @@ typedef struct PLwdatum typedef enum { - IDENTIFIER_LOOKUP_NORMAL, /* normal processing of var names */ - IDENTIFIER_LOOKUP_DECLARE, /* In DECLARE --- don't look up names */ - IDENTIFIER_LOOKUP_EXPR /* In SQL expression --- special case */ + IDENTIFIER_LOOKUP_NORMAL, /* normal processing of var names */ + IDENTIFIER_LOOKUP_DECLARE, /* In DECLARE --- don't look up names */ + IDENTIFIER_LOOKUP_EXPR /* In SQL expression --- special case */ } IdentifierLookup; extern IdentifierLookup plpgsql_IdentifierLookup; -extern int plpgsql_variable_conflict; +extern int plpgsql_variable_conflict; extern bool plpgsql_print_strict_params; extern bool plpgsql_check_asserts; /* extra compile-time checks */ -#define PLPGSQL_XCHECK_NONE 0 -#define PLPGSQL_XCHECK_SHADOWVAR 1 -#define PLPGSQL_XCHECK_ALL ((int) ~0) +#define PLPGSQL_XCHECK_NONE 0 +#define PLPGSQL_XCHECK_SHADOWVAR 1 +#define PLPGSQL_XCHECK_ALL ((int) ~0) -extern int plpgsql_extra_warnings; -extern int plpgsql_extra_errors; +extern int plpgsql_extra_warnings; +extern int plpgsql_extra_errors; extern bool plpgsql_check_syntax; extern bool plpgsql_DumpExecTree; extern PLpgSQL_stmt_block *plpgsql_parse_result; -extern int plpgsql_nDatums; +extern int plpgsql_nDatums; extern PLpgSQL_datum **plpgsql_Datums; extern char *plpgsql_error_funcname; @@ -1053,32 +1054,32 @@ extern PLpgSQL_plugin **plpgsql_plugin_ptr; * Functions in pl_comp.c */ extern PLpgSQL_function *plpgsql_compile(FunctionCallInfo fcinfo, - bool forValidator); + bool forValidator); extern PLpgSQL_function *plpgsql_compile_inline(char *proc_source); extern void plpgsql_parser_setup(struct ParseState *pstate, - PLpgSQL_expr *expr); + PLpgSQL_expr *expr); extern bool plpgsql_parse_word(char *word1, const char *yytxt, - PLwdatum *wdatum, PLword *word); + PLwdatum *wdatum, PLword *word); extern bool plpgsql_parse_dblword(char *word1, char *word2, - PLwdatum *wdatum, 
PLcword *cword); + PLwdatum *wdatum, PLcword *cword); extern bool plpgsql_parse_tripword(char *word1, char *word2, char *word3, - PLwdatum *wdatum, PLcword *cword); + PLwdatum *wdatum, PLcword *cword); extern PLpgSQL_type *plpgsql_parse_wordtype(char *ident); extern PLpgSQL_type *plpgsql_parse_cwordtype(List *idents); extern PLpgSQL_type *plpgsql_parse_wordrowtype(char *ident); extern PLpgSQL_type *plpgsql_parse_cwordrowtype(List *idents); extern PLpgSQL_type *plpgsql_build_datatype(Oid typeOid, int32 typmod, - Oid collation); + Oid collation); extern PLpgSQL_variable *plpgsql_build_variable(const char *refname, int lineno, - PLpgSQL_type *dtype, - bool add2namespace); + PLpgSQL_type *dtype, + bool add2namespace); extern PLpgSQL_rec *plpgsql_build_record(const char *refname, int lineno, - bool add2namespace); + bool add2namespace); extern int plpgsql_recognize_err_condition(const char *condname, - bool allow_sqlstate); + bool allow_sqlstate); extern PLpgSQL_condition *plpgsql_parse_err_condition(char *condname); extern void plpgsql_adddatum(PLpgSQL_datum *new); -extern int plpgsql_add_initdatums(int **varnos); +extern int plpgsql_add_initdatums(int **varnos); extern void plpgsql_HashTableInit(void); /* @@ -1090,35 +1091,35 @@ extern void _PG_init(void); * Functions in pl_exec.c */ extern Datum plpgsql_exec_function(PLpgSQL_function *func, - FunctionCallInfo fcinfo, - EState *simple_eval_estate); + FunctionCallInfo fcinfo, + EState *simple_eval_estate); extern HeapTuple plpgsql_exec_trigger(PLpgSQL_function *func, - TriggerData *trigdata); + TriggerData *trigdata); extern void plpgsql_exec_event_trigger(PLpgSQL_function *func, - EventTriggerData *trigdata); + EventTriggerData *trigdata); extern void plpgsql_xact_cb(XactEvent event, void *arg); extern void plpgsql_subxact_cb(SubXactEvent event, SubTransactionId mySubid, - SubTransactionId parentSubid, void *arg); + SubTransactionId parentSubid, void *arg); extern Oid plpgsql_exec_get_datum_type(PLpgSQL_execstate *estate, - PLpgSQL_datum *datum); + PLpgSQL_datum *datum); extern void plpgsql_exec_get_datum_type_info(PLpgSQL_execstate *estate, - PLpgSQL_datum *datum, - Oid *typeid, int32 *typmod, Oid *collation); + PLpgSQL_datum *datum, + Oid *typeid, int32 *typmod, Oid *collation); /* * Functions for namespace handling in pl_funcs.c */ extern void plpgsql_ns_init(void); extern void plpgsql_ns_push(const char *label, - PLpgSQL_label_type label_type); + PLpgSQL_label_type label_type); extern void plpgsql_ns_pop(void); extern PLpgSQL_nsitem *plpgsql_ns_top(void); extern void plpgsql_ns_additem(PLpgSQL_nsitem_type itemtype, int itemno, const char *name); extern PLpgSQL_nsitem *plpgsql_ns_lookup(PLpgSQL_nsitem *ns_cur, bool localmode, - const char *name1, const char *name2, - const char *name3, int *names_used); + const char *name1, const char *name2, + const char *name3, int *names_used); extern PLpgSQL_nsitem *plpgsql_ns_lookup_label(PLpgSQL_nsitem *ns_cur, - const char *name); + const char *name); extern PLpgSQL_nsitem *plpgsql_ns_find_nearest_loop(PLpgSQL_nsitem *ns_cur); /* @@ -1132,25 +1133,25 @@ extern void plpgsql_dumptree(PLpgSQL_function *func); /* * Scanner functions in pl_scanner.c */ -extern int plpgsql_base_yylex(void); -extern int plpgsql_yylex(void); +extern int plpgsql_base_yylex(void); +extern int plpgsql_yylex(void); extern void plpgsql_push_back_token(int token); extern bool plpgsql_token_is_unreserved_keyword(int token); extern void plpgsql_append_source_text(StringInfo buf, - int startlocation, int endlocation); -extern int 
plpgsql_peek(void); + int startlocation, int endlocation); +extern int plpgsql_peek(void); extern void plpgsql_peek2(int *tok1_p, int *tok2_p, int *tok1_loc, - int *tok2_loc); -extern int plpgsql_scanner_errposition(int location); + int *tok2_loc); +extern int plpgsql_scanner_errposition(int location); extern void plpgsql_yyerror(const char *message) pg_attribute_noreturn(); -extern int plpgsql_location_to_lineno(int location); -extern int plpgsql_latest_lineno(void); +extern int plpgsql_location_to_lineno(int location); +extern int plpgsql_latest_lineno(void); extern void plpgsql_scanner_init(const char *str); extern void plpgsql_scanner_finish(void); /* * Externs in gram.y */ -extern int plpgsql_yyparse(void); +extern int plpgsql_yyparse(void); -#endif /* PLPGSQL_H */ +#endif /* PLPGSQL_H */ From 61a02c6fc16088ffaf2ac65934a38e78b2d035f5 Mon Sep 17 00:00:00 2001 From: andrelin Date: Fri, 18 Feb 2022 14:41:31 +0800 Subject: [PATCH 487/578] Split the role settings in pg_stat_cluster_activity to prevent the executor from overwriting the results tapd: http://tapd.woa.com/TBase_Oracle_Migration/bugtrace/bugs/view?bug_id=1020421696096872133 --- .../pg_stat_cluster_activity.c | 25 +++++++++++++------ 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c b/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c index 8518ae8a..2b36fe39 100644 --- a/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c +++ b/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c @@ -321,18 +321,27 @@ pgcs_entry_initialize(void) * * Report common fileds of cluster backend status activity, * called by pgcs_report_query_activity and pgcs_report_activity. - * report role, sqname, also if this backend become consumer, remove - * previous planstate and cursor. * ---------- */ static void -pgcs_report_common(PgClusterStatus *entry, QueryDesc *desc) +pgcs_report_common(PgClusterStatus *entry) { strncpy((char *) entry->sessionid, PGXCSessionId, NAMEDATALEN); entry->sqdone = false; entry->valid = true; - +} + +/* ---------- + * pgcs_report_role + * + * Report role, sqname, also if this backend become consumer, remove + * previous planstate and cursor. 
+ * ---------- + */ +static void +pgcs_report_role(PgClusterStatus *entry, QueryDesc *desc) +{ /* fields need queryDesc */ if (IS_PGXC_DATANODE) { @@ -391,7 +400,7 @@ pgcs_report_query_activity(BackendState state, const char *cmd_str) pgcs_entry_initialize(); entry = MyCSEntry; - pgcs_report_common((PgClusterStatus *) entry, NULL); + pgcs_report_common((PgClusterStatus *) entry); if (prev_pgstat_report_hook) prev_pgstat_report_hook(state, cmd_str); @@ -468,7 +477,8 @@ pgcs_report_executor_activity(QueryDesc *desc, int eflags) if (cursors != NULL && cursors->len > 0) memcpy((char *) entry->cursors, cursors->data, Min(cursors->len + 1, NAMEDATALEN * 64)); - pgcs_report_common((PgClusterStatus *) entry, desc); + pgcs_report_common((PgClusterStatus *) entry); + pgcs_report_role((PgClusterStatus *) entry, desc); increment_changecount_after(entry); } @@ -501,7 +511,8 @@ pgcs_report_activity(Portal portal) increment_changecount_before(entry); strncpy((char *) entry->portal, portal->name, NAMEDATALEN); - pgcs_report_common((PgClusterStatus *) entry, desc); + pgcs_report_common((PgClusterStatus *) entry); + pgcs_report_role((PgClusterStatus *) entry, desc); increment_changecount_after(entry); } From 0384d593f766a171c0a5391ca362ab3beffdae7c Mon Sep 17 00:00:00 2001 From: youngxie Date: Fri, 16 Jul 2021 17:01:22 +0800 Subject: [PATCH 488/578] Fix connection amplification due to remote scan of replicate table. http://tapd.oa.com/pgxz/bugtrace/bugs/view?bug_id=1010092131089932161 (merge request !491) --- src/backend/optimizer/util/pathnode.c | 34 +++++++++++++-------------- src/test/regress/sql/sequence.sql | 7 ------ 2 files changed, 17 insertions(+), 24 deletions(-) diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index 032253ed..5c1a3ca5 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -1657,23 +1657,6 @@ set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) if (innerd == NULL && outerd == NULL) return NIL; #ifdef __TBASE__ - /* - * DML may need to push down to datanodes, for example: - * DELETE FROM - * geocode_settings as gc - * USING geocode_settings_default AS gf - * WHERE - * gf.name = gc.name and gf.setting = gc.setting; - * prefer_olap means pulling query up to coordinator node, in case data - * re-distribute in TPC-C test case. - * - * TODO: We need to automatically determine whether we need to pull it up, - * but not using GUC. - */ - if(!prefer_olap && false == dml) - { - goto pull_up; - } /* * If outer or inner subpaths are distributed by shard and they do not exist @@ -1802,6 +1785,23 @@ set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) return alternate; } + /* + * DML may need to push down to datanodes, for example: + * DELETE FROM + * geocode_settings as gc + * USING geocode_settings_default AS gf + * WHERE + * gf.name = gc.name and gf.setting = gc.setting; + * prefer_olap means pulling query up to coordinator node, in case data + * re-distribute in TPC-C test case. + * + * TODO: We need to automatically determine whether we need to pull it up, + * but not using GUC. 
+ */ + if(!prefer_olap && false == dml) + { + goto pull_up; + } restrictClauses = list_copy(pathnode->joinrestrictinfo); restrictClauses = list_concat(restrictClauses, diff --git a/src/test/regress/sql/sequence.sql b/src/test/regress/sql/sequence.sql index a0f8180d..67c91ef8 100644 --- a/src/test/regress/sql/sequence.sql +++ b/src/test/regress/sql/sequence.sql @@ -459,10 +459,3 @@ insert into t2(f2) values(5); insert into t3(f2) values(6); select gsk_key from pg_list_storage_sequence() where gsk_key like '%db_seq1_bak.%'; \q -<<<<<<< HEAD -======= - - - - ->>>>>>> 85b5350be... fix gtm seq bug when create databse or drop databse http://tapd.woa.com/10092131/bugtrace/bugs/view?bug_id=1010092131096383437&jump_count=1 and http://tapd.woa.com/pgxz/bugtrace/bugs/view/1010092131087562597 (merge request !1132) From aac34ee921b8e61dc55c57e31aed1853673ea7a6 Mon Sep 17 00:00:00 2001 From: arrowbowang Date: Fri, 1 Apr 2022 11:24:49 +0800 Subject: [PATCH 489/578] Revert "fix error msg when reset handles fix http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131094179541 (merge request !938) " This reverts commit eb35163c5928605e9520c6fd7cba1fca1923d51a. --- src/backend/pgxc/pool/execRemote.c | 18 ++-------- src/backend/pgxc/pool/pgxcnode.c | 55 ++++++++---------------------- src/include/pgxc/pgxcnode.h | 2 +- 3 files changed, 17 insertions(+), 58 deletions(-) diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index dcb98c13..297f66bf 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -3734,15 +3734,7 @@ pgxc_node_begin(int conn_count, PGXCNodeHandle **connections, { for (i = 0; i < new_count; i++) { - if (pgxc_node_set_query(new_connections[i], init_str)) - { - /* - * print log here and return eof indicates execution failure - */ - elog(LOG, "pgxc_node_begin send %s to node %s, pid:%d failed", init_str, - new_connections[i]->nodename, new_connections[i]->backend_pid); - return EOF; - } + pgxc_node_set_query(new_connections[i], init_str); elog(DEBUG5, "pgxc_node_begin send %s to node %s, pid:%d", init_str, new_connections[i]->nodename, new_connections[i]->backend_pid); } @@ -7059,13 +7051,7 @@ LeaderCnExecRemoteUtility(RemoteQuery *node, char *init_str = PGXCNodeGetSessionParamStr(); if (init_str) { - if (pgxc_node_set_query(leader_cn_conn, init_str)) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("pgxc_node_set_query send %s to node %s, pid:%d failed", init_str, - leader_cn_conn->nodename, leader_cn_conn->backend_pid))); - } + pgxc_node_set_query(leader_cn_conn, init_str); } SetPlpgsqlTransactionBegin(leader_cn_conn); diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c index 173e06eb..890f9715 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -149,7 +149,7 @@ static bool DoRefreshRemoteHandles(void); #ifdef XCP static void pgxc_node_init(PGXCNodeHandle *handle, int sock, - bool global_session, int pid, bool is_reset_handle); + bool global_session, int pid); #else static void pgxc_node_init(PGXCNodeHandle *handle, int sock); #endif @@ -667,7 +667,7 @@ pgxc_node_all_free(void) * Structure stores state info and I/O buffers */ static void -pgxc_node_init(PGXCNodeHandle *handle, int sock, bool global_session, int pid, bool is_reset_handle) +pgxc_node_init(PGXCNodeHandle *handle, int sock, bool global_session, int pid) { char *init_str; @@ -701,20 +701,9 @@ pgxc_node_init(PGXCNodeHandle *handle, int sock, bool global_session, int pid, b if 
(global_session) { init_str = PGXCNodeGetSessionParamStr(); - if (init_str && pgxc_node_set_query(handle, init_str)) - { - if (is_reset_handle) - { - /* if it is a reset handle, do not throw error, just set handle as error state */ - PGXCNodeSetConnectionState(handle, DN_CONNECTION_STATE_ERROR_FATAL); - elog(WARNING, "pgxc_node_set_query send %s to node %s, pid:%d failed", init_str, - handle->nodename, handle->backend_pid); - } - else + if (init_str) { - elog(ERROR, "pgxc_node_set_query send %s to node %s, pid:%d failed", init_str, - handle->nodename, handle->backend_pid); - } + pgxc_node_set_query(handle, init_str); } } @@ -1557,7 +1546,6 @@ release_handles(bool force) /* * Reset all Datanode and Coordinator connections occupied memory. - * TODO: fix implicit transaction do not commit on dn and remove reset_handles */ void reset_handles(void) @@ -1582,7 +1570,7 @@ reset_handles(void) if (handle->sock != NO_SOCKET) { - pgxc_node_init(handle, handle->sock, true, handle->backend_pid, true); + pgxc_node_init(handle, handle->sock, true, handle->backend_pid); } } @@ -1592,7 +1580,7 @@ reset_handles(void) if (handle->sock != NO_SOCKET) { - pgxc_node_init(handle, handle->sock, true, handle->backend_pid, true); + pgxc_node_init(handle, handle->sock, true, handle->backend_pid); } } @@ -1605,16 +1593,10 @@ reset_handles(void) if (handle->sock != NO_SOCKET) { - pgxc_node_init(handle, handle->sock, true, handle->backend_pid, true); + pgxc_node_init(handle, handle->sock, true, handle->backend_pid); } } } - - if (validate_handles()) - { - elog(LOG, "found bad remote node connections, force release handles now"); - release_handles(true); - } } /* @@ -3791,7 +3773,7 @@ get_any_handle(List *datanodelist) node_handle = &dn_handles[node]; - pgxc_node_init(node_handle, fds[0], true, pids[0], false); + pgxc_node_init(node_handle, fds[0], true, pids[0]); datanode_count++; elog(DEBUG1, "Established a connection with datanode \"%s\"," @@ -4067,7 +4049,7 @@ get_handles(List *datanodelist, List *coordlist, bool is_coord_only_query, bool continue; } - pgxc_node_init(node_handle, fdsock, is_global_session, be_pid, false); + pgxc_node_init(node_handle, fdsock, is_global_session, be_pid); dn_handles[node] = *node_handle; datanode_count++; @@ -4132,7 +4114,7 @@ get_handles(List *datanodelist, List *coordlist, bool is_coord_only_query, bool continue; } - pgxc_node_init(node_handle, fdsock, is_global_session, be_pid, false); + pgxc_node_init(node_handle, fdsock, is_global_session, be_pid); co_handles[node] = *node_handle; coord_count++; @@ -5158,18 +5140,14 @@ PGXCNodeGetTransactionParamStr(void) /* * Send down specified query, read and discard all responses until ReadyForQuery */ -int +void pgxc_node_set_query(PGXCNodeHandle *handle, const char *set_query) { if (pgxc_node_send_query(handle, set_query) != 0) { - /* - * print log only and decide whether to throw an error at the place where it is called - */ - ereport(LOG, + ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), errmsg("Failed to send query %s",set_query))); - return EOF; } /* * Now read responses until ReadyForQuery. 
@@ -5210,11 +5188,8 @@ pgxc_node_set_query(PGXCNodeHandle *handle, const char *set_query) { PGXCNodeHandleError(handle, msg, msglen); PGXCNodeSetConnectionState(handle, DN_CONNECTION_STATE_ERROR_FATAL); - /* - * print log only and decide whether to throw an error at the place where it is called - */ - elog(LOG,"pgxc_node_set_query: %s",handle->error); - return EOF; + elog(ERROR,"pgxc_node_set_query: %s",handle->error); + break; } if (msgtype == 'Z') /* ReadyForQuery */ @@ -5225,8 +5200,6 @@ pgxc_node_set_query(PGXCNodeHandle *handle, const char *set_query) break; } } - - return 0; } diff --git a/src/include/pgxc/pgxcnode.h b/src/include/pgxc/pgxcnode.h index 71f8fa40..402fb28c 100644 --- a/src/include/pgxc/pgxcnode.h +++ b/src/include/pgxc/pgxcnode.h @@ -281,7 +281,7 @@ extern void PGXCNodeSetParam(bool local, const char *name, const char *value, extern void PGXCNodeResetParams(bool only_local); extern char *PGXCNodeGetSessionParamStr(void); extern char *PGXCNodeGetTransactionParamStr(void); -extern int pgxc_node_set_query(PGXCNodeHandle *handle, const char *set_query); +extern void pgxc_node_set_query(PGXCNodeHandle *handle, const char *set_query); extern void RequestInvalidateRemoteHandles(void); extern void RequestRefreshRemoteHandles(void); extern bool PoolerMessagesPending(void); From d2a76cfe88f2ee50cebfd9836eaea800e72023fc Mon Sep 17 00:00:00 2001 From: arrowbowang Date: Fri, 1 Apr 2022 11:25:02 +0800 Subject: [PATCH 490/578] Revert "[BUGFIX] Subtransaction commits should not reset session information" This reverts commit 98634f5bec4daf68503344f3b251ee30b5cd9bbf. --- src/backend/pgxc/pool/execRemote.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 297f66bf..74c414a4 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -4850,12 +4850,14 @@ pgxc_node_remote_commit(TranscationType txn_type, bool need_release_handle) stat_transaction(conn_count); /* do not cleanup remote session for subtrans */ - if (!temp_object_included && need_release_handle) + if (!temp_object_included) { /* Clean up remote sessions */ pgxc_node_remote_cleanup_all(txn_type == TXN_TYPE_CommitSubTxn || txn_type == TXN_TYPE_RollbackSubTxn); + if (need_release_handle) + { if (PersistentConnections) { reset_handles(); @@ -4872,6 +4874,7 @@ pgxc_node_remote_commit(TranscationType txn_type, bool need_release_handle) } } } + } clear_handles(); } @@ -5960,12 +5963,13 @@ pgxc_node_remote_abort(TranscationType txn_type, bool need_release_handle) * certain issues for aborted transactions, we drop the connections. * Revisit and fix the issue */ - if (!temp_object_included && need_release_handle) + if (!temp_object_included) { /* Clean up remote sessions */ pgxc_node_remote_cleanup_all(txn_type == TXN_TYPE_CommitSubTxn || txn_type == TXN_TYPE_RollbackSubTxn); - + if (need_release_handle) + { if (HaveActiveDatanodeStatements()) { reset_handles(); @@ -5975,6 +5979,7 @@ pgxc_node_remote_abort(TranscationType txn_type, bool need_release_handle) release_handles(false); } } + } clear_handles(); pfree_pgxc_all_handles(handles); From 406e57884537dc721cd89516cf4720b681addca4 Mon Sep 17 00:00:00 2001 From: arrowbowang Date: Fri, 1 Apr 2022 11:25:13 +0800 Subject: [PATCH 491/578] Revert "bugfix: prepare regress failed (merge request !440)" This reverts commit 56cd97f98d95100917ff7468cf3a6dee83a9d30a. 
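Note: the reverts and re-fixes in this stretch of the series (patches 489 through 496) all turn on how remote session state is handled when a plpgsql EXCEPTION block opens and then aborts a subtransaction. A minimal sketch of the scenario, reusing the t_abort table and TimeZone setting from the regression test that patch 495 adds (the DO block itself is illustrative and not part of any patch):

    SET timezone TO 'PRC';
    DO $$
    BEGIN
        BEGIN
            INSERT INTO t_abort VALUES (1);
            RAISE EXCEPTION 'force the inner block to abort';
        EXCEPTION WHEN OTHERS THEN
            NULL;   -- the subtransaction is rolled back here
        END;
        -- Session-level settings such as timezone should still read 'PRC' on the
        -- datanodes at this point; resetting them during the sub-commit/sub-abort
        -- is the behaviour these patches go back and forth on.
    END $$;

Whether pgxc_node_remote_cleanup_all() sends RESET ALL / RESET global_session at that point, and whether reset_handles() or release_handles() is used afterwards, is exactly what the hunks in patches 489 through 496 rearrange.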
--- src/backend/pgxc/pool/execRemote.c | 41 +++++------------------------- src/backend/pgxc/pool/pgxcnode.c | 6 +++++ 2 files changed, 12 insertions(+), 35 deletions(-) diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 74c414a4..e5dfad24 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -3778,6 +3778,12 @@ pgxc_node_remote_cleanup_all(bool sub) return; } + /* Do not cleanup connections if we have prepared statements on nodes */ + if (HaveActiveDatanodeStatements()) + { + return; + } + /* * Send down snapshot followed by DISCARD ALL command. */ @@ -4785,16 +4791,9 @@ pgxc_node_remote_prepare(char *prepareGID, bool localNode, bool implicit) } else { - if (HaveActiveDatanodeStatements()) - { - reset_handles(); - } - else - { release_handles(false); } } - } clear_handles(); @@ -4864,17 +4863,10 @@ pgxc_node_remote_commit(TranscationType txn_type, bool need_release_handle) } else { - if (HaveActiveDatanodeStatements()) - { - reset_handles(); - } - else - { release_handles(false); } } } - } clear_handles(); } @@ -5102,17 +5094,10 @@ pgxc_node_remote_commit(TranscationType txn_type, bool need_release_handle) } else { - if (HaveActiveDatanodeStatements()) - { - reset_handles(); - } - else - { release_handles(false); } } } - } clear_handles(); #endif @@ -5970,16 +5955,9 @@ pgxc_node_remote_abort(TranscationType txn_type, bool need_release_handle) txn_type == TXN_TYPE_RollbackSubTxn); if (need_release_handle) { - if (HaveActiveDatanodeStatements()) - { - reset_handles(); - } - else - { release_handles(false); } } - } clear_handles(); pfree_pgxc_all_handles(handles); @@ -8955,16 +8933,9 @@ pgxc_node_remote_finish(char *prepareGID, bool commit, } else { - if (HaveActiveDatanodeStatements()) - { - reset_handles(); - } - else - { release_handles(false); } } - } clear_handles(); pfree_pgxc_all_handles(pgxc_handles); reset_transaction_handles(); diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c index 890f9715..17e271dd 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -1563,6 +1563,12 @@ reset_handles(void) return; } + /* Do not reset connections if we have prepared statements on nodes */ + if (HaveActiveDatanodeStatements()) + { + return; + } + /* Reset Datanodes handles occupied memory */ for (i = 0; i < NumDataNodes; i++) { From 9da75bb3a9e3b3cdba4a4599eac47bce8422ea68 Mon Sep 17 00:00:00 2001 From: arrowbowang Date: Fri, 2 Jul 2021 11:47:08 +0800 Subject: [PATCH 492/578] fix: implicit transaction do not commit on dn --- src/backend/pgxc/pool/execRemote.c | 104 ++++++++++++++++------------- src/backend/pgxc/pool/pgxcnode.c | 47 ++++++++++++- src/backend/tcop/postgres.c | 5 ++ src/include/pgxc/pgxcnode.h | 2 + 4 files changed, 111 insertions(+), 47 deletions(-) diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index e5dfad24..0d1fa08e 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -4780,20 +4780,12 @@ pgxc_node_remote_prepare(char *prepareGID, bool localNode, bool implicit) else elog(ERROR, "failed to PREPARE transaction on one or more nodes"); - if (!temp_object_included) + if (!temp_object_included && !PersistentConnections) { /* Clean up remote sessions */ pgxc_node_remote_cleanup_all(false); - - if (PersistentConnections) - { - reset_handles(); - } - else - { release_handles(false); } - } clear_handles(); @@ -4848,23 +4840,24 @@ pgxc_node_remote_commit(TranscationType 
txn_type, bool need_release_handle) stat_transaction(conn_count); - /* do not cleanup remote session for subtrans */ - if (!temp_object_included) + if (need_release_handle) + { + if (!temp_object_included && !PersistentConnections) { /* Clean up remote sessions */ pgxc_node_remote_cleanup_all(txn_type == TXN_TYPE_CommitSubTxn || txn_type == TXN_TYPE_RollbackSubTxn); - - if (need_release_handle) - { - if (PersistentConnections) - { - reset_handles(); + release_handles(false); + } } else { - release_handles(false); - } + /* in subtxn, we just cleanup the connections. not release the handles. */ + if (!temp_object_included && !PersistentConnections) + { + /* Clean up remote sessions without release handles. */ + pgxc_node_remote_cleanup_all(txn_type == TXN_TYPE_CommitSubTxn || + txn_type == TXN_TYPE_RollbackSubTxn); } } @@ -5081,21 +5074,23 @@ pgxc_node_remote_commit(TranscationType txn_type, bool need_release_handle) #ifndef __TBASE__ stat_transaction(conn_count); - if (!temp_object_included) - { - /* Clean up remote sessions */ - pgxc_node_remote_cleanup_all(); if (need_release_handle) { - if (PersistentConnections) + if (!temp_object_included && !PersistentConnections) { - reset_handles(); + /* Clean up remote sessions */ + pgxc_node_remote_cleanup_all(); + release_handles(false); + } } else { - release_handles(false); - } + /* in subtxn, we just cleanup the connections. not release the handles. */ + if (!temp_object_included && !PersistentConnections) + { + /* Clean up remote sessions without release handles. */ + pgxc_node_remote_cleanup_all(); } } @@ -5940,25 +5935,26 @@ pgxc_node_remote_abort(TranscationType txn_type, bool need_release_handle) } #endif - /* - * Drop the connections to ensure aborts are handled properly. - * - * XXX We should really be consulting PersistentConnections parameter and - * keep the connections if its set. But as a short term measure, to address - * certain issues for aborted transactions, we drop the connections. - * Revisit and fix the issue - */ + if (need_release_handle) + { if (!temp_object_included) { /* Clean up remote sessions */ pgxc_node_remote_cleanup_all(txn_type == TXN_TYPE_CommitSubTxn || txn_type == TXN_TYPE_RollbackSubTxn); - if (need_release_handle) - { release_handles(false); } } - + else + { + /* in subtxn, we just cleanup the connections. not release the handles. */ + if (!temp_object_included) + { + /* Clean up remote sessions */ + pgxc_node_remote_cleanup_all(txn_type == TXN_TYPE_CommitSubTxn || + txn_type == TXN_TYPE_RollbackSubTxn); + } + } clear_handles(); pfree_pgxc_all_handles(handles); @@ -7959,6 +7955,29 @@ PreAbort_Remote(TranscationType txn_type, bool need_release_handle) pgxc_node_remote_abort(txn_type, need_release_handle); + /* + * Drop the connections to ensure aborts are handled properly. + * + * XXX We should really be consulting PersistentConnections parameter and + * keep the connections if its set. But as a short term measure, to address + * certain issues for aborted transactions, we drop the connections. 
+ * Revisit and fix the issue + */ + elog(DEBUG5, "temp_object_included %d", temp_object_included); + /* cleanup and release handles is already done in pgxc_node_remote_abort */ +#if 0 + if (release_handle) + { + if (!temp_object_included) + { + /* Clean up remote sessions */ + pgxc_node_remote_cleanup_all(); + release_handles(); + } + } + + clear_handles(); +#endif pfree_pgxc_all_handles(all_handles); if (log_gtm_stats) @@ -8923,19 +8942,12 @@ pgxc_node_remote_finish(char *prepareGID, bool commit, } #endif - if (!temp_object_included) + if (!temp_object_included && !PersistentConnections) { /* Clean up remote sessions */ pgxc_node_remote_cleanup_all(false); - if (PersistentConnections) - { - reset_handles(); - } - else - { release_handles(false); } - } clear_handles(); pfree_pgxc_all_handles(pgxc_handles); reset_transaction_handles(); diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c index 17e271dd..445a27c1 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -2571,6 +2571,51 @@ pgxc_node_send_sync(PGXCNodeHandle * handle) return pgxc_node_flush(handle); } + +/* + * Send SYNC message down to the Datanode + */ +int +pgxc_node_send_my_sync(PGXCNodeHandle * handle) +{ + /* size */ + int msgLen = 4; + + /* msgType + msgLen */ + if (ensure_out_buffer_capacity(handle->outEnd + 1 + msgLen, handle) != 0) + { + add_error_message(handle, "out of memory"); + return EOF; + } + + handle->outBuffer[handle->outEnd++] = 'L'; + /* size */ + msgLen = htonl(msgLen); + memcpy(handle->outBuffer + handle->outEnd, &msgLen, 4); + handle->outEnd += 4; + + handle->in_extended_query = false; + handle->needSync = false; + + msgLen = 4; + /* msgType + msgLen */ + if (ensure_out_buffer_capacity(handle->outEnd + 1 + msgLen, handle) != 0) + { + add_error_message(handle, "out of memory"); + return EOF; + } + + handle->outBuffer[handle->outEnd++] = 'H'; + /* size */ + msgLen = htonl(msgLen); + memcpy(handle->outBuffer + handle->outEnd, &msgLen, 4); + handle->outEnd += 4; + + handle->in_extended_query = true; + + return pgxc_node_flush(handle); +} + #ifdef __SUBSCRIPTION__ /* * Send logical apply message down to the Datanode @@ -2633,7 +2678,7 @@ pgxc_node_send_query_extended(PGXCNodeHandle *handle, const char *query, if (fetch_size >= 0) if (pgxc_node_send_execute(handle, portal, fetch_size)) return EOF; - if (pgxc_node_send_flush(handle)) + if (pgxc_node_send_my_sync(handle)) return EOF; return 0; diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index 7735fd18..01332ba6 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -629,6 +629,7 @@ SocketBackend(StringInfo inBuf) errmsg("invalid frontend message type %d", qtype))); break; + case 'L': case 'S': /* sync */ /* stop any active skip-till-Sync */ ignore_till_sync = false; @@ -5803,6 +5804,10 @@ PostgresMain(int argc, char *argv[], send_ready_for_query = true; break; + case 'L': /* sync */ + pq_getmsgend(&input_message); + finish_xact_command(); + break; #ifdef __TBASE__ case 'N': { diff --git a/src/include/pgxc/pgxcnode.h b/src/include/pgxc/pgxcnode.h index 402fb28c..adbc8f6e 100644 --- a/src/include/pgxc/pgxcnode.h +++ b/src/include/pgxc/pgxcnode.h @@ -213,6 +213,8 @@ extern int pgxc_node_send_execute(PGXCNodeHandle * handle, const char *portal, i extern int pgxc_node_send_close(PGXCNodeHandle * handle, bool is_statement, const char *name); extern int pgxc_node_send_sync(PGXCNodeHandle * handle); +extern int pgxc_node_send_my_sync(PGXCNodeHandle * handle); 
+ #ifdef __SUBSCRIPTION__ extern int pgxc_node_send_apply(PGXCNodeHandle * handle, char * buf, int len, bool ignore_pk_conflict); #endif From 68a21af8b90493a83559c498d3834931c9ba234a Mon Sep 17 00:00:00 2001 From: arrowbowang Date: Mon, 28 Jun 2021 16:05:55 +0800 Subject: [PATCH 493/578] perf: add parse_snapshot to decrease gtm request --- src/backend/tcop/postgres.c | 10 ++++++---- src/backend/utils/cache/plancache.c | 4 ++-- src/backend/utils/misc/guc.c | 9 +++++++++ src/include/utils/guc.h | 1 + 4 files changed, 18 insertions(+), 6 deletions(-) diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index 01332ba6..b314aa97 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -211,6 +211,8 @@ static char *remotePrepareGID = NULL; /* for error code contrib */ bool g_is_in_init_phase = false; +bool g_parse_snapshot = true; + bool IsNormalPostgres = false; bool explain_stmt = false; @@ -1443,7 +1445,7 @@ exec_simple_query(const char *query_string) /* * Set up a snapshot if parse analysis/planning will need one. */ - if (analyze_requires_snapshot(parsetree)) + if (analyze_requires_snapshot(parsetree) && g_parse_snapshot) { #ifdef __TBASE__ /* use local snapshot instead of global if told so */ @@ -1918,7 +1920,7 @@ exec_parse_message(const char *query_string, /* string to execute */ /* * Set up a snapshot if parse analysis will need one. */ - if (analyze_requires_snapshot(raw_parse_tree)) + if (analyze_requires_snapshot(raw_parse_tree) && g_parse_snapshot) { #ifdef __TBASE__ /* use local snapshot instead of global if told so */ @@ -2477,9 +2479,9 @@ exec_bind_message(StringInfo input_message) * snapshot active till we're done, so that plancache.c doesn't have to * take new ones. */ - if (numParams > 0 || + if ((numParams > 0 || (psrc->raw_parse_tree && - analyze_requires_snapshot(psrc->raw_parse_tree))) + analyze_requires_snapshot(psrc->raw_parse_tree))) && g_parse_snapshot) { #ifdef __TBASE__ /* use local snapshot instead of global if told so */ diff --git a/src/backend/utils/cache/plancache.c b/src/backend/utils/cache/plancache.c index 240a4f9d..1ad79655 100644 --- a/src/backend/utils/cache/plancache.c +++ b/src/backend/utils/cache/plancache.c @@ -991,9 +991,9 @@ BuildCachedPlan(CachedPlanSource *plansource, List *qlist, * for planning. But if it isn't, and we need one, install one. 
*/ snapshot_set = false; - if (!ActiveSnapshotSet() && + if ((!ActiveSnapshotSet() && plansource->raw_parse_tree && - analyze_requires_snapshot(plansource->raw_parse_tree)) + analyze_requires_snapshot(plansource->raw_parse_tree)) && g_parse_snapshot) { PushActiveSnapshot(GetTransactionSnapshot()); snapshot_set = true; diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 8b9a9fe9..7539eb21 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -2359,6 +2359,15 @@ static struct config_bool ConfigureNamesBool[] = NULL, NULL, NULL }, #endif + { + {"parse_snapshot", PGC_USERSET, CUSTOM_OPTIONS, + gettext_noop("allow forced ddl of inconsistent metadata"), + NULL + }, + &g_parse_snapshot, + true, + NULL, NULL, NULL + }, #ifdef _SHARDING_ { {"allow_dml_on_datanode", PGC_USERSET, CUSTOM_OPTIONS, diff --git a/src/include/utils/guc.h b/src/include/utils/guc.h index 95342821..179abb61 100644 --- a/src/include/utils/guc.h +++ b/src/include/utils/guc.h @@ -310,6 +310,7 @@ extern int tcp_keepalives_count; #ifdef _SHARDING_ extern bool g_allow_dml_on_datanode; extern bool g_allow_force_ddl; +extern bool g_parse_snapshot; extern bool trace_extent; #endif From 3bbe2f1ac440045b08fb348a8550f8c56e2124a8 Mon Sep 17 00:00:00 2001 From: arrowbowang Date: Wed, 21 Jul 2021 17:16:49 +0800 Subject: [PATCH 494/578] fix comments of mr --- src/backend/pgxc/pool/execRemote.c | 14 ++------------ src/backend/tcop/postgres.c | 8 ++++---- src/backend/utils/cache/plancache.c | 2 +- src/backend/utils/misc/guc.c | 6 +++--- src/include/utils/guc.h | 2 +- 5 files changed, 11 insertions(+), 21 deletions(-) diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 0d1fa08e..9e306cef 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -4840,24 +4840,14 @@ pgxc_node_remote_commit(TranscationType txn_type, bool need_release_handle) stat_transaction(conn_count); - if (need_release_handle) - { if (!temp_object_included && !PersistentConnections) { /* Clean up remote sessions */ pgxc_node_remote_cleanup_all(txn_type == TXN_TYPE_CommitSubTxn || txn_type == TXN_TYPE_RollbackSubTxn); - release_handles(false); - } - } - else - { - /* in subtxn, we just cleanup the connections. not release the handles. */ - if (!temp_object_included && !PersistentConnections) + if (need_release_handle) { - /* Clean up remote sessions without release handles. */ - pgxc_node_remote_cleanup_all(txn_type == TXN_TYPE_CommitSubTxn || - txn_type == TXN_TYPE_RollbackSubTxn); + release_handles(false); } } diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index b314aa97..126bae58 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -211,7 +211,7 @@ static char *remotePrepareGID = NULL; /* for error code contrib */ bool g_is_in_init_phase = false; -bool g_parse_snapshot = true; +bool g_snapshot_for_analyze = true; bool IsNormalPostgres = false; @@ -1445,7 +1445,7 @@ exec_simple_query(const char *query_string) /* * Set up a snapshot if parse analysis/planning will need one. */ - if (analyze_requires_snapshot(parsetree) && g_parse_snapshot) + if (analyze_requires_snapshot(parsetree) && g_snapshot_for_analyze) { #ifdef __TBASE__ /* use local snapshot instead of global if told so */ @@ -1920,7 +1920,7 @@ exec_parse_message(const char *query_string, /* string to execute */ /* * Set up a snapshot if parse analysis will need one. 
*/ - if (analyze_requires_snapshot(raw_parse_tree) && g_parse_snapshot) + if (analyze_requires_snapshot(raw_parse_tree) && g_snapshot_for_analyze) { #ifdef __TBASE__ /* use local snapshot instead of global if told so */ @@ -2481,7 +2481,7 @@ exec_bind_message(StringInfo input_message) */ if ((numParams > 0 || (psrc->raw_parse_tree && - analyze_requires_snapshot(psrc->raw_parse_tree))) && g_parse_snapshot) + analyze_requires_snapshot(psrc->raw_parse_tree))) && g_snapshot_for_analyze) { #ifdef __TBASE__ /* use local snapshot instead of global if told so */ diff --git a/src/backend/utils/cache/plancache.c b/src/backend/utils/cache/plancache.c index 1ad79655..b69aa7cb 100644 --- a/src/backend/utils/cache/plancache.c +++ b/src/backend/utils/cache/plancache.c @@ -993,7 +993,7 @@ BuildCachedPlan(CachedPlanSource *plansource, List *qlist, snapshot_set = false; if ((!ActiveSnapshotSet() && plansource->raw_parse_tree && - analyze_requires_snapshot(plansource->raw_parse_tree)) && g_parse_snapshot) + analyze_requires_snapshot(plansource->raw_parse_tree)) && g_snapshot_for_analyze) { PushActiveSnapshot(GetTransactionSnapshot()); snapshot_set = true; diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 7539eb21..27901832 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -2360,11 +2360,11 @@ static struct config_bool ConfigureNamesBool[] = }, #endif { - {"parse_snapshot", PGC_USERSET, CUSTOM_OPTIONS, - gettext_noop("allow forced ddl of inconsistent metadata"), + {"snapshot_for_analyze", PGC_USERSET, CUSTOM_OPTIONS, + gettext_noop("enable/disable get snapshot for analyze and rewrite"), NULL }, - &g_parse_snapshot, + &g_snapshot_for_analyze, true, NULL, NULL, NULL }, diff --git a/src/include/utils/guc.h b/src/include/utils/guc.h index 179abb61..2634e983 100644 --- a/src/include/utils/guc.h +++ b/src/include/utils/guc.h @@ -310,7 +310,7 @@ extern int tcp_keepalives_count; #ifdef _SHARDING_ extern bool g_allow_dml_on_datanode; extern bool g_allow_force_ddl; -extern bool g_parse_snapshot; +extern bool g_snapshot_for_analyze; extern bool trace_extent; #endif From 82aa7606f8a69564a37e5805743d481a62621879 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cningxpeng=E2=80=9D?= <“ningxpeng@tencent.com”> Date: Fri, 20 Aug 2021 17:05:53 +0800 Subject: [PATCH 495/578] [BUGFIX] Subtransaction commits should not reset session information --- src/backend/pgxc/pool/execRemote.c | 23 +---- .../regress/expected/xc_create_function.out | 92 +++++++++++++++++++ src/test/regress/sql/xc_create_function.sql | 66 +++++++++++++ 3 files changed, 163 insertions(+), 18 deletions(-) diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 9e306cef..a2e403ce 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -4840,16 +4840,14 @@ pgxc_node_remote_commit(TranscationType txn_type, bool need_release_handle) stat_transaction(conn_count); - if (!temp_object_included && !PersistentConnections) + /* do not cleanup remote session for subtrans */ + if (!temp_object_included && !PersistentConnections && need_release_handle) { /* Clean up remote sessions */ pgxc_node_remote_cleanup_all(txn_type == TXN_TYPE_CommitSubTxn || txn_type == TXN_TYPE_RollbackSubTxn); - if (need_release_handle) - { release_handles(false); } - } clear_handles(); } @@ -5925,26 +5923,15 @@ pgxc_node_remote_abort(TranscationType txn_type, bool need_release_handle) } #endif - if (need_release_handle) - { - if (!temp_object_included) + /* do not 
cleanup remote session for subtrans */ + if (!temp_object_included && need_release_handle) { /* Clean up remote sessions */ pgxc_node_remote_cleanup_all(txn_type == TXN_TYPE_CommitSubTxn || txn_type == TXN_TYPE_RollbackSubTxn); release_handles(false); } - } - else - { - /* in subtxn, we just cleanup the connections. not release the handles. */ - if (!temp_object_included) - { - /* Clean up remote sessions */ - pgxc_node_remote_cleanup_all(txn_type == TXN_TYPE_CommitSubTxn || - txn_type == TXN_TYPE_RollbackSubTxn); - } - } + clear_handles(); pfree_pgxc_all_handles(handles); diff --git a/src/test/regress/expected/xc_create_function.out b/src/test/regress/expected/xc_create_function.out index ff83c0a7..a3520bca 100644 --- a/src/test/regress/expected/xc_create_function.out +++ b/src/test/regress/expected/xc_create_function.out @@ -175,3 +175,95 @@ BEGIN str = 'execute direct on (' || node_name || ') $$ ' || query || ' $$' ; execute str; END $D$ language plpgsql; +-- subtransaction guc check +drop table if exists t_abort; +NOTICE: table "t_abort" does not exist, skipping +create table t_abort(a int); +insert into t_abort select generate_series(1,20); +select count(*) from t_abort; + count +------- + 20 +(1 row) + +Reset TimeZone; +show TimeZone; + TimeZone +---------- + PST8PDT +(1 row) + +set TimeZone to 'PRC'; +create or replace procedure subtransaction_guc_check() +as +$$ +declare + names refcursor; + results1 refcursor; + results2 refcursor; + results3 refcursor; + guc_result varchar; + node_names varchar; + node varchar :=''; + cmd1 varchar; + cmd2 varchar; +BEGIN + open names for execute 'select node_name from pgxc_node where node_type=''D'' limit 1'; + fetch names into node_names; + cmd1 := 'execute direct on(' || node_names || ') ''select setting from pg_settings where name=''''TimeZone'''''''; + BEGIN + raise notice '%',cmd1; + open results1 for EXECUTE cmd1; + fetch results1 into guc_result; + raise notice 'TimeZone = %',guc_result; + EXCEPTION when others then + raise notice 'ERROR: (%)', SQLERRM; + close results1; + end; + + BEGIN + raise notice '%',cmd1; + open results2 for EXECUTE cmd1; + fetch results2 into guc_result; + raise notice 'TimeZone = %',guc_result; + cmd2 := 'select a from t_abort'; + EXECUTE cmd2; + EXCEPTION when others then + raise notice 'ERROR: (%)', SQLERRM; + close results2; + Rollback; + end; + + -- check twice, shoud be same. 
+ raise notice '%',cmd1; + open results3 for EXECUTE cmd1; + fetch results3 into guc_result; + raise notice 'TimeZone = %',guc_result; + EXCEPTION when others then + raise notice 'ERROR: (%)', SQLERRM; + close results3; + close names; +end; +$$ +language plpgsql; +call subtransaction_guc_check(); +NOTICE: execute direct on(datanode_1) 'select setting from pg_settings where name=''TimeZone''' +NOTICE: TimeZone = PRC +NOTICE: execute direct on(datanode_1) 'select setting from pg_settings where name=''TimeZone''' +NOTICE: TimeZone = PRC +NOTICE: execute direct on(datanode_1) 'select setting from pg_settings where name=''TimeZone''' +NOTICE: TimeZone = PRC +Show TimeZone; + TimeZone +---------- + PRC +(1 row) + +Reset TimeZone; +Show TimeZone; + TimeZone +---------- + PST8PDT +(1 row) + +drop table t_abort; diff --git a/src/test/regress/sql/xc_create_function.sql b/src/test/regress/sql/xc_create_function.sql index 02f750ea..272b035c 100644 --- a/src/test/regress/sql/xc_create_function.sql +++ b/src/test/regress/sql/xc_create_function.sql @@ -180,3 +180,69 @@ BEGIN str = 'execute direct on (' || node_name || ') $$ ' || query || ' $$' ; execute str; END $D$ language plpgsql; + +-- subtransaction guc check +drop table if exists t_abort; +create table t_abort(a int); +insert into t_abort select generate_series(1,20); +select count(*) from t_abort; +Reset TimeZone; +show TimeZone; +set TimeZone to 'PRC'; +create or replace procedure subtransaction_guc_check() +as +$$ +declare + names refcursor; + results1 refcursor; + results2 refcursor; + results3 refcursor; + guc_result varchar; + node_names varchar; + node varchar :=''; + cmd1 varchar; + cmd2 varchar; +BEGIN + open names for execute 'select node_name from pgxc_node where node_type=''D'' limit 1'; + fetch names into node_names; + cmd1 := 'execute direct on(' || node_names || ') ''select setting from pg_settings where name=''''TimeZone'''''''; + BEGIN + raise notice '%',cmd1; + open results1 for EXECUTE cmd1; + fetch results1 into guc_result; + raise notice 'TimeZone = %',guc_result; + EXCEPTION when others then + raise notice 'ERROR: (%)', SQLERRM; + close results1; + end; + + BEGIN + raise notice '%',cmd1; + open results2 for EXECUTE cmd1; + fetch results2 into guc_result; + raise notice 'TimeZone = %',guc_result; + cmd2 := 'select a from t_abort'; + EXECUTE cmd2; + EXCEPTION when others then + raise notice 'ERROR: (%)', SQLERRM; + close results2; + Rollback; + end; + + -- check twice, shoud be same. 
+ raise notice '%',cmd1; + open results3 for EXECUTE cmd1; + fetch results3 into guc_result; + raise notice 'TimeZone = %',guc_result; + EXCEPTION when others then + raise notice 'ERROR: (%)', SQLERRM; + close results3; + close names; +end; +$$ +language plpgsql; +call subtransaction_guc_check(); +Show TimeZone; +Reset TimeZone; +Show TimeZone; +drop table t_abort; \ No newline at end of file From 316e806d6e887e41984524cb44eed25879e22431 Mon Sep 17 00:00:00 2001 From: ningxpeng Date: Fri, 20 Aug 2021 20:22:22 +0800 Subject: [PATCH 496/578] [revert] Not reset global session info when subtrans end --- src/backend/pgxc/pool/execRemote.c | 33 +++++++----------------------- 1 file changed, 7 insertions(+), 26 deletions(-) diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index a2e403ce..71e4c53b 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -3750,17 +3750,14 @@ pgxc_node_begin(int conn_count, PGXCNodeHandle **connections, * specific stuff before releasing them to pool for reuse by other sessions. */ static void -pgxc_node_remote_cleanup_all(bool sub) +pgxc_node_remote_cleanup_all(void) { PGXCNodeAllHandles *handles = get_current_handles(); PGXCNodeHandle *new_connections[handles->co_conn_count + handles->dn_conn_count]; int new_conn_count = 0; int i; /* if it's called by sub-commit or sub-abort, DO NOT reset global_session */ - char *resetcmd = sub ? "RESET ALL;" - "RESET SESSION AUTHORIZATION;" - "RESET transaction_isolation;" : - "RESET ALL;" + char *resetcmd = "RESET ALL;" "RESET SESSION AUTHORIZATION;" "RESET transaction_isolation;" "RESET global_session"; @@ -4783,7 +4780,7 @@ pgxc_node_remote_prepare(char *prepareGID, bool localNode, bool implicit) if (!temp_object_included && !PersistentConnections) { /* Clean up remote sessions */ - pgxc_node_remote_cleanup_all(false); + pgxc_node_remote_cleanup_all(); release_handles(false); } @@ -4840,12 +4837,10 @@ pgxc_node_remote_commit(TranscationType txn_type, bool need_release_handle) stat_transaction(conn_count); - /* do not cleanup remote session for subtrans */ if (!temp_object_included && !PersistentConnections && need_release_handle) { /* Clean up remote sessions */ - pgxc_node_remote_cleanup_all(txn_type == TXN_TYPE_CommitSubTxn || - txn_type == TXN_TYPE_RollbackSubTxn); + pgxc_node_remote_cleanup_all(); release_handles(false); } @@ -5062,25 +5057,12 @@ pgxc_node_remote_commit(TranscationType txn_type, bool need_release_handle) #ifndef __TBASE__ stat_transaction(conn_count); - - if (need_release_handle) - { - if (!temp_object_included && !PersistentConnections) + if (!temp_object_included && !PersistentConnections && need_release_handle) { /* Clean up remote sessions */ pgxc_node_remote_cleanup_all(); release_handles(false); } - } - else - { - /* in subtxn, we just cleanup the connections. not release the handles. */ - if (!temp_object_included && !PersistentConnections) - { - /* Clean up remote sessions without release handles. 
*/ - pgxc_node_remote_cleanup_all(); - } - } clear_handles(); #endif @@ -5927,8 +5909,7 @@ pgxc_node_remote_abort(TranscationType txn_type, bool need_release_handle) if (!temp_object_included && need_release_handle) { /* Clean up remote sessions */ - pgxc_node_remote_cleanup_all(txn_type == TXN_TYPE_CommitSubTxn || - txn_type == TXN_TYPE_RollbackSubTxn); + pgxc_node_remote_cleanup_all(); release_handles(false); } @@ -8922,7 +8903,7 @@ pgxc_node_remote_finish(char *prepareGID, bool commit, if (!temp_object_included && !PersistentConnections) { /* Clean up remote sessions */ - pgxc_node_remote_cleanup_all(false); + pgxc_node_remote_cleanup_all(); release_handles(false); } clear_handles(); From a45701511c3bc21f32c8c424cf89c87d35aed719 Mon Sep 17 00:00:00 2001 From: arrowbowang Date: Fri, 1 Apr 2022 15:24:29 +0800 Subject: [PATCH 497/578] fix regrerss: remove the function not supported by v2 --- .../regress/expected/xc_create_function.out | 92 ------------------- src/test/regress/sql/xc_create_function.sql | 68 +------------- 2 files changed, 1 insertion(+), 159 deletions(-) diff --git a/src/test/regress/expected/xc_create_function.out b/src/test/regress/expected/xc_create_function.out index a3520bca..ff83c0a7 100644 --- a/src/test/regress/expected/xc_create_function.out +++ b/src/test/regress/expected/xc_create_function.out @@ -175,95 +175,3 @@ BEGIN str = 'execute direct on (' || node_name || ') $$ ' || query || ' $$' ; execute str; END $D$ language plpgsql; --- subtransaction guc check -drop table if exists t_abort; -NOTICE: table "t_abort" does not exist, skipping -create table t_abort(a int); -insert into t_abort select generate_series(1,20); -select count(*) from t_abort; - count -------- - 20 -(1 row) - -Reset TimeZone; -show TimeZone; - TimeZone ----------- - PST8PDT -(1 row) - -set TimeZone to 'PRC'; -create or replace procedure subtransaction_guc_check() -as -$$ -declare - names refcursor; - results1 refcursor; - results2 refcursor; - results3 refcursor; - guc_result varchar; - node_names varchar; - node varchar :=''; - cmd1 varchar; - cmd2 varchar; -BEGIN - open names for execute 'select node_name from pgxc_node where node_type=''D'' limit 1'; - fetch names into node_names; - cmd1 := 'execute direct on(' || node_names || ') ''select setting from pg_settings where name=''''TimeZone'''''''; - BEGIN - raise notice '%',cmd1; - open results1 for EXECUTE cmd1; - fetch results1 into guc_result; - raise notice 'TimeZone = %',guc_result; - EXCEPTION when others then - raise notice 'ERROR: (%)', SQLERRM; - close results1; - end; - - BEGIN - raise notice '%',cmd1; - open results2 for EXECUTE cmd1; - fetch results2 into guc_result; - raise notice 'TimeZone = %',guc_result; - cmd2 := 'select a from t_abort'; - EXECUTE cmd2; - EXCEPTION when others then - raise notice 'ERROR: (%)', SQLERRM; - close results2; - Rollback; - end; - - -- check twice, shoud be same. 
- raise notice '%',cmd1; - open results3 for EXECUTE cmd1; - fetch results3 into guc_result; - raise notice 'TimeZone = %',guc_result; - EXCEPTION when others then - raise notice 'ERROR: (%)', SQLERRM; - close results3; - close names; -end; -$$ -language plpgsql; -call subtransaction_guc_check(); -NOTICE: execute direct on(datanode_1) 'select setting from pg_settings where name=''TimeZone''' -NOTICE: TimeZone = PRC -NOTICE: execute direct on(datanode_1) 'select setting from pg_settings where name=''TimeZone''' -NOTICE: TimeZone = PRC -NOTICE: execute direct on(datanode_1) 'select setting from pg_settings where name=''TimeZone''' -NOTICE: TimeZone = PRC -Show TimeZone; - TimeZone ----------- - PRC -(1 row) - -Reset TimeZone; -Show TimeZone; - TimeZone ----------- - PST8PDT -(1 row) - -drop table t_abort; diff --git a/src/test/regress/sql/xc_create_function.sql b/src/test/regress/sql/xc_create_function.sql index 272b035c..7000b6ba 100644 --- a/src/test/regress/sql/xc_create_function.sql +++ b/src/test/regress/sql/xc_create_function.sql @@ -179,70 +179,4 @@ BEGIN node_name = get_xc_node_name(nodenum); str = 'execute direct on (' || node_name || ') $$ ' || query || ' $$' ; execute str; -END $D$ language plpgsql; - --- subtransaction guc check -drop table if exists t_abort; -create table t_abort(a int); -insert into t_abort select generate_series(1,20); -select count(*) from t_abort; -Reset TimeZone; -show TimeZone; -set TimeZone to 'PRC'; -create or replace procedure subtransaction_guc_check() -as -$$ -declare - names refcursor; - results1 refcursor; - results2 refcursor; - results3 refcursor; - guc_result varchar; - node_names varchar; - node varchar :=''; - cmd1 varchar; - cmd2 varchar; -BEGIN - open names for execute 'select node_name from pgxc_node where node_type=''D'' limit 1'; - fetch names into node_names; - cmd1 := 'execute direct on(' || node_names || ') ''select setting from pg_settings where name=''''TimeZone'''''''; - BEGIN - raise notice '%',cmd1; - open results1 for EXECUTE cmd1; - fetch results1 into guc_result; - raise notice 'TimeZone = %',guc_result; - EXCEPTION when others then - raise notice 'ERROR: (%)', SQLERRM; - close results1; - end; - - BEGIN - raise notice '%',cmd1; - open results2 for EXECUTE cmd1; - fetch results2 into guc_result; - raise notice 'TimeZone = %',guc_result; - cmd2 := 'select a from t_abort'; - EXECUTE cmd2; - EXCEPTION when others then - raise notice 'ERROR: (%)', SQLERRM; - close results2; - Rollback; - end; - - -- check twice, shoud be same. 
- raise notice '%',cmd1; - open results3 for EXECUTE cmd1; - fetch results3 into guc_result; - raise notice 'TimeZone = %',guc_result; - EXCEPTION when others then - raise notice 'ERROR: (%)', SQLERRM; - close results3; - close names; -end; -$$ -language plpgsql; -call subtransaction_guc_check(); -Show TimeZone; -Reset TimeZone; -Show TimeZone; -drop table t_abort; \ No newline at end of file +END $D$ language plpgsql; \ No newline at end of file From 617790375b27f995ce185f0e75395469ae038567 Mon Sep 17 00:00:00 2001 From: arrowbowang Date: Fri, 1 Apr 2022 15:34:23 +0800 Subject: [PATCH 498/578] delete function reset_handles not used --- src/backend/pgxc/pool/pgxcnode.c | 61 -------------------------------- src/include/pgxc/pgxcnode.h | 1 - 2 files changed, 62 deletions(-) diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c index 445a27c1..e4767e9d 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -1544,67 +1544,6 @@ release_handles(bool force) slavedatanode_count = 0; } -/* - * Reset all Datanode and Coordinator connections occupied memory. - */ -void -reset_handles(void) -{ - int i; - - /* don't reset connection if holding a cluster lock */ - if (cluster_ex_lock_held) - { - return; - } - - if (datanode_count == 0 && coord_count == 0 && slavedatanode_count == 0) - { - return; - } - - /* Do not reset connections if we have prepared statements on nodes */ - if (HaveActiveDatanodeStatements()) - { - return; - } - - /* Reset Datanodes handles occupied memory */ - for (i = 0; i < NumDataNodes; i++) - { - PGXCNodeHandle *handle = &dn_handles[i]; - - if (handle->sock != NO_SOCKET) - { - pgxc_node_init(handle, handle->sock, true, handle->backend_pid); - } - } - - for (i = 0; i < NumSlaveDataNodes; i++) - { - PGXCNodeHandle *handle = &sdn_handles[i]; - - if (handle->sock != NO_SOCKET) - { - pgxc_node_init(handle, handle->sock, true, handle->backend_pid); - } - } - - if (IS_PGXC_COORDINATOR) - { - /* Collect Coordinator handles */ - for (i = 0; i < NumCoords; i++) - { - PGXCNodeHandle *handle = &co_handles[i]; - - if (handle->sock != NO_SOCKET) - { - pgxc_node_init(handle, handle->sock, true, handle->backend_pid); - } - } - } -} - /* * Check whether there bad connections to remote nodes when abort transactions. 
*/ diff --git a/src/include/pgxc/pgxcnode.h b/src/include/pgxc/pgxcnode.h index adbc8f6e..f0e7c269 100644 --- a/src/include/pgxc/pgxcnode.h +++ b/src/include/pgxc/pgxcnode.h @@ -192,7 +192,6 @@ extern void register_transaction_handles(PGXCNodeHandle* handle); extern void pfree_pgxc_all_handles(PGXCNodeAllHandles *handles); extern void release_handles(bool force); -extern void reset_handles(void); extern void clear_handles(void); extern int get_transaction_nodes(PGXCNodeHandle ** connections, From b3be6035e2d1f03ae0a15697556937e550d01241 Mon Sep 17 00:00:00 2001 From: youngxie Date: Thu, 7 Apr 2022 22:56:04 +0800 Subject: [PATCH 499/578] Increase LOG2_NUM_LOCK_PARTITIONS to avoid heavy lock contention --- src/include/storage/lwlock.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h index 25ee91a8..ae936afc 100644 --- a/src/include/storage/lwlock.h +++ b/src/include/storage/lwlock.h @@ -177,7 +177,7 @@ extern PGDLLIMPORT int NamedLWLockTrancheRequests; #define NUM_CACHE_2PC_PARTITIONS 128 /* Number of partitions the shared lock tables are divided into */ -#define LOG2_NUM_LOCK_PARTITIONS 4 +#define LOG2_NUM_LOCK_PARTITIONS 8 #define NUM_LOCK_PARTITIONS (1 << LOG2_NUM_LOCK_PARTITIONS) /* Number of partitions the shared predicate lock tables are divided into */ From fd9cce4a3178a342470cda6591ef3273bde44d46 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Mon, 7 Dec 2020 21:30:45 +0800 Subject: [PATCH 500/578] compute_hash: support custom types, fix http://tapd.oa.com/my_worktable?source_user=1001433276&workspace_id=20421696&workitem_type=bug&workitem_id=1020421696083670743#&filter_close=true --- src/backend/access/hash/hashfunc.c | 36 +++++ .../expected/redistribute_custom_types.out | 132 ++++++++++++++++++ .../regress/sql/redistribute_custom_types.sql | 65 +++++++++ 3 files changed, 233 insertions(+) create mode 100644 src/test/regress/expected/redistribute_custom_types.out create mode 100644 src/test/regress/sql/redistribute_custom_types.sql diff --git a/src/backend/access/hash/hashfunc.c b/src/backend/access/hash/hashfunc.c index f4959255..16fcbbfa 100644 --- a/src/backend/access/hash/hashfunc.c +++ b/src/backend/access/hash/hashfunc.c @@ -37,6 +37,10 @@ #include "utils/nabstime.h" #endif +#ifdef __TBASE__ +#include "utils/lsyscache.h" +#endif + /* * Datatype-specific hash functions. * @@ -300,6 +304,34 @@ hashvarlena(PG_FUNCTION_ARGS) return result; } +#ifdef __TBASE__ +static Datum +hashcustomtype(PG_FUNCTION_ARGS) +{ + Oid type = PG_GETARG_OID(0); + Datum value = PG_GETARG_DATUM(1); + Oid typsend; + bool typisvarlena; + bytea *outputbytes; + Datum result; + + /* + * Convert the column value to binary + */ + getTypeBinaryOutputInfo(type, &typsend, &typisvarlena); + + outputbytes = OidSendFunctionCall(typsend, value); + + /* + * Compute hash + */ + result = hash_any((unsigned char *) VARDATA(outputbytes), + VARSIZE(outputbytes) - VARHDRSZ); + + pfree(outputbytes); + return result; +} +#endif Datum hashvarlenaextended(PG_FUNCTION_ARGS) @@ -1043,6 +1075,10 @@ compute_hash(Oid type, Datum value, char locator) return DirectFunctionCall1(jsonb_hash, value); #endif default: +#ifdef __TBASE__ + if (locator == LOCATOR_TYPE_SHARD) + return DirectFunctionCall2(hashcustomtype, type, value); +#endif ereport(ERROR,(errmsg("Unhandled datatype:%d for modulo or hash distribution in compute_hash", type))); } /* Control should not come here.
*/ diff --git a/src/test/regress/expected/redistribute_custom_types.out b/src/test/regress/expected/redistribute_custom_types.out new file mode 100644 index 00000000..2ae77a18 --- /dev/null +++ b/src/test/regress/expected/redistribute_custom_types.out @@ -0,0 +1,132 @@ +-- +-- redistribute custom types +-- +-- enum type +drop table if exists enum_test; +NOTICE: table "enum_test" does not exist, skipping +drop type if exists enumtype; +NOTICE: type "enumtype" does not exist, skipping +create type enumtype AS enum ('Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'); +create table enum_test(a int, b enumtype) distribute by shard(a); +NOTICE: Replica identity is needed for shard table, please add to this table through "alter table" command. +insert into enum_test(a,b) values(1,'Mon'); +insert into enum_test(a,b) values(2,'Tue'); +insert into enum_test(a,b) values(3,'Wed'); +insert into enum_test(a,b) values(4,'Thu'); +insert into enum_test(a,b) values(5,'Fri'); +insert into enum_test(a,b) values(6,'Sat'); +insert into enum_test(a,b) values(7,'Sun'); +explain select count(*) from enum_test where a < 100 group by b; + QUERY PLAN +--------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) (cost=132.87..134.74 rows=187 width=12) + -> Finalize HashAggregate (cost=132.87..134.74 rows=187 width=12) + Group Key: b + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=129.12..131.93 rows=187 width=0) + Distribute results by S: b + -> Partial HashAggregate (cost=29.12..31.00 rows=187 width=12) + Group Key: b + -> Seq Scan on enum_test (cost=0.00..26.88 rows=450 width=4) + Filter: (a < 100) +(9 rows) + +select count(*) from enum_test where a < 100 group by b; + count +------- + 1 + 1 + 1 + 1 + 1 + 1 + 1 +(7 rows) + +-- composite type +drop table if exists comptype_test; +NOTICE: table "comptype_test" does not exist, skipping +drop type if exists comptype; +NOTICE: type "comptype" does not exist, skipping +create type comptype as (f1 int, f2 int); +create table comptype_test(a int, b comptype) distribute by shard(a); +NOTICE: Replica identity is needed for shard table, please add to this table through "alter table" command. 
+insert into comptype_test(a,b) values(1,(1,2)); +insert into comptype_test(a,b) values(2,(2,3)); +insert into comptype_test(a,b) values(3,(3,4)); +insert into comptype_test(a,b) values(4,(4,5)); +insert into comptype_test(a,b) values(5,(5,6)); +insert into comptype_test(a,b) values(6,(6,7)); +explain select count(*) from comptype_test where a < 100 group by b; + QUERY PLAN +--------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) (cost=145.27..147.81 rows=169 width=40) + -> Finalize GroupAggregate (cost=145.27..147.81 rows=169 width=40) + Group Key: b + -> Sort (cost=145.27..145.70 rows=169 width=0) + Sort Key: b + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=134.18..139.02 rows=169 width=0) + Distribute results by S: b + -> Partial GroupAggregate (cost=34.18..38.17 rows=169 width=40) + Group Key: b + -> Sort (cost=34.18..34.95 rows=307 width=32) + Sort Key: b + -> Seq Scan on comptype_test (cost=0.00..21.50 rows=307 width=32) + Filter: (a < 100) +(13 rows) + +select count(*) from comptype_test where a < 100 group by b; + count +------- + 1 + 1 + 1 + 1 + 1 + 1 +(6 rows) + +-- domain type +drop table if exists domaintype_test; +NOTICE: table "domaintype_test" does not exist, skipping +drop domain if exists domaintype; +NOTICE: type "domaintype" does not exist, skipping +create domain domaintype as int check(value < 100); +create table domaintype_test(a int, b domaintype) distribute by shard(a); +NOTICE: Replica identity is needed for shard table, please add to this table through "alter table" command. +insert into domaintype_test(a,b) values(1,1); +insert into domaintype_test(a,b) values(2,2); +insert into domaintype_test(a,b) values(3,3); +insert into domaintype_test(a,b) values(4,4); +insert into domaintype_test(a,b) values(5,5); +insert into domaintype_test(a,b) values(6,6); +explain select count(*) from domaintype_test where a < 100 group by b; + QUERY PLAN +--------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) (cost=132.87..134.74 rows=187 width=12) + -> Finalize HashAggregate (cost=132.87..134.74 rows=187 width=12) + Group Key: b + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=129.12..131.93 rows=187 width=0) + Distribute results by S: b + -> Partial HashAggregate (cost=29.12..31.00 rows=187 width=12) + Group Key: b + -> Seq Scan on domaintype_test (cost=0.00..26.88 rows=450 width=4) + Filter: (a < 100) +(9 rows) + +select count(*) from domaintype_test where a < 100 group by b; + count +------- + 1 + 1 + 1 + 1 + 1 + 1 +(6 rows) + +drop table enum_test; +drop table comptype_test; +drop table domaintype_test; +drop type enumtype; +drop type comptype; +drop type domaintype; diff --git a/src/test/regress/sql/redistribute_custom_types.sql b/src/test/regress/sql/redistribute_custom_types.sql new file mode 100644 index 00000000..ca392b2f --- /dev/null +++ b/src/test/regress/sql/redistribute_custom_types.sql @@ -0,0 +1,65 @@ +-- +-- redistribute custom types +-- + +-- enum type +drop table if exists enum_test; +drop type if exists enumtype; + +create type enumtype AS enum ('Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'); +create table enum_test(a int, b enumtype) distribute by shard(a); + +insert into enum_test(a,b) values(1,'Mon'); +insert into enum_test(a,b) values(2,'Tue'); +insert into enum_test(a,b) values(3,'Wed'); +insert into enum_test(a,b) 
values(4,'Thu'); +insert into enum_test(a,b) values(5,'Fri'); +insert into enum_test(a,b) values(6,'Sat'); +insert into enum_test(a,b) values(7,'Sun'); + +explain select count(*) from enum_test where a < 100 group by b; +select count(*) from enum_test where a < 100 group by b; + + +-- composite type +drop table if exists comptype_test; +drop type if exists comptype; + +create type comptype as (f1 int, f2 int); +create table comptype_test(a int, b comptype) distribute by shard(a); + +insert into comptype_test(a,b) values(1,(1,2)); +insert into comptype_test(a,b) values(2,(2,3)); +insert into comptype_test(a,b) values(3,(3,4)); +insert into comptype_test(a,b) values(4,(4,5)); +insert into comptype_test(a,b) values(5,(5,6)); +insert into comptype_test(a,b) values(6,(6,7)); + +explain select count(*) from comptype_test where a < 100 group by b; +select count(*) from comptype_test where a < 100 group by b; + + +-- domain type +drop table if exists domaintype_test; +drop domain if exists domaintype; + +create domain domaintype as int check(value < 100); +create table domaintype_test(a int, b domaintype) distribute by shard(a); + +insert into domaintype_test(a,b) values(1,1); +insert into domaintype_test(a,b) values(2,2); +insert into domaintype_test(a,b) values(3,3); +insert into domaintype_test(a,b) values(4,4); +insert into domaintype_test(a,b) values(5,5); +insert into domaintype_test(a,b) values(6,6); + +explain select count(*) from domaintype_test where a < 100 group by b; +select count(*) from domaintype_test where a < 100 group by b; + +drop table enum_test; +drop table comptype_test; +drop table domaintype_test; + +drop type enumtype; +drop type comptype; +drop type domaintype; \ No newline at end of file From e1bb8b4181aca79e3f2c95460e5007b3c4031568 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Fri, 5 Feb 2021 17:08:07 +0800 Subject: [PATCH 501/578] add nestloop regress test --- .../regress/expected/nestloop_by_shard.out | 343 ++++++++++++++++++ src/test/regress/sql/nestloop_by_shard.sql | 191 ++++++++++ 2 files changed, 534 insertions(+) create mode 100644 src/test/regress/expected/nestloop_by_shard.out create mode 100644 src/test/regress/sql/nestloop_by_shard.sql diff --git a/src/test/regress/expected/nestloop_by_shard.out b/src/test/regress/expected/nestloop_by_shard.out new file mode 100644 index 00000000..da851318 --- /dev/null +++ b/src/test/regress/expected/nestloop_by_shard.out @@ -0,0 +1,343 @@ +-- test nestloop by shard +drop table if exists int8_tbl_s; +NOTICE: table "int8_tbl_s" does not exist, skipping +drop table if exists int4_tbl_s; +NOTICE: table "int4_tbl_s" does not exist, skipping +drop table if exists tenk1_s; +NOTICE: table "tenk1_s" does not exist, skipping +drop table if exists onek_s; +NOTICE: table "onek_s" does not exist, skipping +CREATE TABLE int8_tbl_s(q1 int8, q2 int8) distribute by shard(q1); +NOTICE: Replica identity is needed for shard table, please add to this table through "alter table" command. +INSERT INTO int8_tbl_s VALUES('123','456'); +INSERT INTO int8_tbl_s VALUES('123','4567890123456789'); +INSERT INTO int8_tbl_s VALUES('4567890123456789','123'); +INSERT INTO int8_tbl_s VALUES(+4567890123456789,'4567890123456789'); +INSERT INTO int8_tbl_s VALUES('+4567890123456789','-4567890123456789'); +CREATE TABLE int4_tbl_s(f1 int4) distribute by shard(f1); +NOTICE: Replica identity is needed for shard table, please add to this table through "alter table" command. 
+INSERT INTO int4_tbl_s(f1) VALUES ('0'); +INSERT INTO int4_tbl_s(f1) VALUES ('123456'); +INSERT INTO int4_tbl_s(f1) VALUES ('-123456'); +CREATE TABLE tenk1_s ( + unique1 int4, + unique2 int4, + two int4, + four int4, + ten int4, + twenty int4, + hundred int4, + thousand int4, + twothousand int4, + fivethous int4, + tenthous int4, + odd int4, + even int4, + stringu1 name, + stringu2 name, + string4 name +) distribute by shard(unique1); +NOTICE: Replica identity is needed for shard table, please add to this table through "alter table" command. +CREATE INDEX unique1_s ON tenk1_s USING btree(unique1 int4_ops); +CREATE INDEX unique2_s ON tenk1_s USING btree(unique2 int4_ops); +CREATE INDEX hundred_s ON tenk1_s USING btree(hundred int4_ops); +CREATE INDEX thous_tenthous_s ON tenk1_s (thousand, tenthous); +COPY tenk1_s FROM '/home/tbase/PG-XL-v10/src/test/regress/data/tenk.data'; +CREATE TABLE onek_s ( + unique1 int4, + unique2 int4, + two int4, + four int4, + ten int4, + twenty int4, + hundred int4, + thousand int4, + twothousand int4, + fivethous int4, + tenthous int4, + odd int4, + even int4, + stringu1 name, + stringu2 name, + string4 name +) distribute by shard(unique1); +NOTICE: Replica identity is needed for shard table, please add to this table through "alter table" command. +CREATE INDEX onek_unique1_s ON onek_s USING btree(unique1 int4_ops); +CREATE INDEX onek_unique2_s ON onek_s USING btree(unique2 int4_ops); +CREATE INDEX onek_hundred_s ON onek_s USING btree(hundred int4_ops); +CREATE INDEX onek_stringu1_s ON onek_s USING btree(stringu1 name_ops); +COPY onek_s FROM '/home/tbase/PG-XL-v10/src/test/regress/data/onek.data'; +set enable_hashjoin=off; +set enable_mergejoin=off; +set enable_nestloop=on; +explain (num_nodes off, nodes off, costs off) +select * from tenk1_s t1 left join + (tenk1_s t2 join tenk1_s t3 on t2.thousand = t3.unique2) + on t1.hundred = t2.hundred and t1.ten = t3.ten +where t1.unique1 = 1; + QUERY PLAN +------------------------------------------------------------------------------------ + Remote Subquery Scan on all + -> Nested Loop Left Join + -> Remote Subquery Scan on all + Distribute results by S: hundred + -> Seq Scan on tenk1_s t1 + Filter: (unique1 = 1) + -> Materialize + -> Remote Subquery Scan on all + Distribute results by S: hundred + -> Nested Loop + Join Filter: (t1.ten = t3.ten) + -> Remote Subquery Scan on all + Distribute results by S: thousand + -> Index Scan using hundred_s on tenk1_s t2 + Index Cond: (t1.hundred = hundred) + -> Materialize + -> Remote Subquery Scan on all + Distribute results by S: unique2 + -> Index Scan using unique2_s on tenk1_s t3 + Index Cond: (unique2 = t2.thousand) +(20 rows) + +--select * from tenk1_s t1 left join +-- (tenk1_s t2 join tenk1_s t3 on t2.thousand = t3.unique2) +-- on t1.hundred = t2.hundred and t1.ten = t3.ten +--where t1.unique1 = 1; +explain (num_nodes off, nodes off, costs off) +select * from tenk1_s t1 left join + (tenk1_s t2 join tenk1_s t3 on t2.thousand = t3.unique2) + on t1.hundred = t2.hundred and t1.ten + t2.ten = t3.ten +where t1.unique1 = 1; + QUERY PLAN +------------------------------------------------------------------------------------ + Remote Subquery Scan on all + -> Nested Loop Left Join + -> Remote Subquery Scan on all + Distribute results by S: hundred + -> Seq Scan on tenk1_s t1 + Filter: (unique1 = 1) + -> Materialize + -> Remote Subquery Scan on all + Distribute results by S: hundred + -> Nested Loop + Join Filter: ((t1.ten + t2.ten) = t3.ten) + -> Remote Subquery Scan on all + 
Distribute results by S: thousand + -> Index Scan using hundred_s on tenk1_s t2 + Index Cond: (t1.hundred = hundred) + -> Materialize + -> Remote Subquery Scan on all + Distribute results by S: unique2 + -> Index Scan using unique2_s on tenk1_s t3 + Index Cond: (unique2 = t2.thousand) +(20 rows) + +select * from tenk1_s t1 left join + (tenk1_s t2 join tenk1_s t3 on t2.thousand = t3.unique2) + on t1.hundred = t2.hundred and t1.ten + t2.ten = t3.ten +where t1.unique1 = 1; + unique1 | unique2 | two | four | ten | twenty | hundred | thousand | twothousand | fivethous | tenthous | odd | even | stringu1 | stringu2 | string4 | unique1 | unique2 | two | four | ten | twenty | hundred | thousand | twothousand | fivethous | tenthous | odd | even | stringu1 | stringu2 | string4 | unique1 | unique2 | two | four | ten | twenty | hundred | thousand | twothousand | fivethous | tenthous | odd | even | stringu1 | stringu2 | string4 +---------+---------+-----+------+-----+--------+---------+----------+-------------+-----------+----------+-----+------+----------+----------+---------+---------+---------+-----+------+-----+--------+---------+----------+-------------+-----------+----------+-----+------+----------+----------+---------+---------+---------+-----+------+-----+--------+---------+----------+-------------+-----------+----------+-----+------+----------+----------+--------- + 1 | 2838 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 2 | 3 | BAAAAA | EFEAAA | OOOOxx | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | +(1 row) + +explain (num_nodes off, nodes off, costs off) +select * from +( + select unique1, q1, coalesce(unique1, -1) + q1 as fault + from int8_tbl_s left join tenk1_s on (q2 = unique2) +) ss +where fault = 122 +order by fault; + QUERY PLAN +------------------------------------------------------------------------------------ + Remote Subquery Scan on all + -> Nested Loop Left Join + Join Filter: (int8_tbl_s.q2 = tenk1_s.unique2) + Filter: ((COALESCE(tenk1_s.unique1, '-1'::integer) + int8_tbl_s.q1) = 122) + -> Seq Scan on int8_tbl_s + -> Materialize + -> Remote Subquery Scan on all + -> Seq Scan on tenk1_s +(8 rows) + +select * from +( + select unique1, q1, coalesce(unique1, -1) + q1 as fault + from int8_tbl_s left join tenk1_s on (q2 = unique2) +) ss +where fault = 122 +order by fault; + unique1 | q1 | fault +---------+-----+------- + | 123 | 122 +(1 row) + +explain (num_nodes off, nodes off, costs off) +select q1, unique2, thousand, hundred + from int8_tbl_s a left join tenk1_s b on q1 = unique2 + where coalesce(thousand,123) = q1 and q1 = coalesce(hundred,123); + QUERY PLAN +-------------------------------------------------------------------------------------------- + Remote Subquery Scan on all + -> Nested Loop Left Join + Join Filter: (a.q1 = b.unique2) + Filter: ((COALESCE(b.thousand, 123) = a.q1) AND (a.q1 = COALESCE(b.hundred, 123))) + -> Seq Scan on int8_tbl_s a + -> Materialize + -> Remote Subquery Scan on all + Distribute results by S: COALESCE(thousand, 123) + -> Seq Scan on tenk1_s b +(9 rows) + + +select q1, unique2, thousand, hundred + from int8_tbl_s a left join tenk1_s b on q1 = unique2 + where coalesce(thousand,123) = q1 and q1 = coalesce(hundred,123); + q1 | unique2 | thousand | hundred +----+---------+----------+--------- +(0 rows) + + +explain (num_nodes off, nodes off, costs off) +select f1, unique2, case when unique2 is null then f1 else 0 end + from int4_tbl_s a left join tenk1_s b on f1 = unique2 + where (case when unique2 is null then f1 else 0 end) = 0; + QUERY 
PLAN +-------------------------------------------------------------------------- + Remote Subquery Scan on all + -> Nested Loop Left Join + Join Filter: (a.f1 = b.unique2) + Filter: (CASE WHEN (b.unique2 IS NULL) THEN a.f1 ELSE 0 END = 0) + -> Seq Scan on int4_tbl_s a + -> Materialize + -> Remote Subquery Scan on all + Distribute results by S: unique2 + -> Seq Scan on tenk1_s b +(9 rows) + +select f1, unique2, case when unique2 is null then f1 else 0 end + from int4_tbl_s a left join tenk1_s b on f1 = unique2 + where (case when unique2 is null then f1 else 0 end) = 0; + f1 | unique2 | case +----+---------+------ + 0 | 0 | 0 +(1 row) + + +explain (verbose, costs off) +select foo1.join_key as foo1_id, foo3.join_key AS foo3_id, bug_field from + (values (0),(1)) foo1(join_key) +left join + (select join_key, bug_field from + (select ss1.join_key, ss1.bug_field from + (select f1 as join_key, 666 as bug_field from int4_tbl_s i1) ss1 + ) foo2 + left join + (select unique2 as join_key from tenk1_s i2) ss2 + using (join_key) + ) foo3 +using (join_key); + QUERY PLAN +----------------------------------------------------------------------------------- + Nested Loop Left Join + Output: "*VALUES*".column1, i1.f1, (666) + Join Filter: ("*VALUES*".column1 = i1.f1) + -> Values Scan on "*VALUES*" + Output: "*VALUES*".column1 + -> Materialize + Output: i1.f1, (666) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: i1.f1, 666 + -> Nested Loop Left Join + Output: i1.f1, 666 + Join Filter: (i1.f1 = i2.unique2) + -> Seq Scan on public.int4_tbl_s i1 + Output: i1.f1 + -> Materialize + Output: i2.unique2 + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: i2.unique2 + Distribute results by S: unique2 + -> Seq Scan on public.tenk1_s i2 + Output: i2.unique2 +(21 rows) + + +select foo1.join_key as foo1_id, foo3.join_key AS foo3_id, bug_field from + (values (0),(1)) foo1(join_key) +left join + (select join_key, bug_field from + (select ss1.join_key, ss1.bug_field from + (select f1 as join_key, 666 as bug_field from int4_tbl_s i1) ss1 + ) foo2 + left join + (select unique2 as join_key from tenk1_s i2) ss2 + using (join_key) + ) foo3 +using (join_key); + foo1_id | foo3_id | bug_field +---------+---------+----------- + 0 | 0 | 666 + 1 | | +(2 rows) + +explain (verbose, costs off) +select t1.unique1, t2.hundred +from onek_s t1, tenk1_s t2 +where exists (select 1 from tenk1_s t3 + where t3.thousand = t1.unique1 and t3.tenthous = t2.hundred) + and t1.unique1 < 1; + QUERY PLAN +------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) + Output: t1.unique1, t2.hundred + -> Nested Loop + Output: t1.unique1, t2.hundred + Join Filter: (t3.tenthous = t2.hundred) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: t2.hundred + Distribute results by S: hundred + -> Seq Scan on public.tenk1_s t2 + Output: t2.hundred + -> Materialize + Output: t1.unique1, t3.tenthous + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: t1.unique1, t3.tenthous + Distribute results by S: tenthous + -> Nested Loop + Output: t1.unique1, t3.tenthous + Join Filter: (t1.unique1 = t3.thousand) + -> HashAggregate + Output: t3.thousand, t3.tenthous + Group Key: t3.thousand, t3.tenthous + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: t3.unique1, 
t3.unique2, t3.two, t3.four, t3.ten, t3.twenty, t3.hundred, t3.thousand, t3.twothousand, t3.fivethous, t3.tenthous, t3.odd, t3.even, t3.stringu1, t3.stringu2, t3.string4 + Distribute results by S: thousand + -> HashAggregate + Output: t3.unique1, t3.unique2, t3.two, t3.four, t3.ten, t3.twenty, t3.hundred, t3.thousand, t3.twothousand, t3.fivethous, t3.tenthous, t3.odd, t3.even, t3.stringu1, t3.stringu2, t3.string4 + Group Key: t3.thousand, t3.tenthous + -> Seq Scan on public.tenk1_s t3 + Output: t3.unique1, t3.unique2, t3.two, t3.four, t3.ten, t3.twenty, t3.hundred, t3.thousand, t3.twothousand, t3.fivethous, t3.tenthous, t3.odd, t3.even, t3.stringu1, t3.stringu2, t3.string4 + -> Materialize + Output: t1.unique1 + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: t1.unique1 + -> Seq Scan on public.onek_s t1 + Output: t1.unique1 + Filter: (t1.unique1 < 1) +(36 rows) + +--select t1.unique1, t2.hundred +--from onek_s t1, tenk1_s t2 +--where exists (select 1 from tenk1_s t3 +-- where t3.thousand = t1.unique1 and t3.tenthous = t2.hundred) +-- and t1.unique1 < 1; + +reset enable_nestloop; +reset enable_hashjoin; +reset enable_mergejoin; +drop table int8_tbl_s; +drop table int4_tbl_s; +drop table tenk1_s; +drop table onek_s; diff --git a/src/test/regress/sql/nestloop_by_shard.sql b/src/test/regress/sql/nestloop_by_shard.sql new file mode 100644 index 00000000..eb7868b8 --- /dev/null +++ b/src/test/regress/sql/nestloop_by_shard.sql @@ -0,0 +1,191 @@ + +-- test nestloop by shard + +drop table if exists int8_tbl_s; +drop table if exists int4_tbl_s; +drop table if exists tenk1_s; +drop table if exists onek_s; + +CREATE TABLE int8_tbl_s(q1 int8, q2 int8) distribute by shard(q1); +INSERT INTO int8_tbl_s VALUES('123','456'); +INSERT INTO int8_tbl_s VALUES('123','4567890123456789'); +INSERT INTO int8_tbl_s VALUES('4567890123456789','123'); +INSERT INTO int8_tbl_s VALUES(+4567890123456789,'4567890123456789'); +INSERT INTO int8_tbl_s VALUES('+4567890123456789','-4567890123456789'); + +CREATE TABLE int4_tbl_s(f1 int4) distribute by shard(f1); +INSERT INTO int4_tbl_s(f1) VALUES ('0'); +INSERT INTO int4_tbl_s(f1) VALUES ('123456'); +INSERT INTO int4_tbl_s(f1) VALUES ('-123456'); + + +CREATE TABLE tenk1_s ( + unique1 int4, + unique2 int4, + two int4, + four int4, + ten int4, + twenty int4, + hundred int4, + thousand int4, + twothousand int4, + fivethous int4, + tenthous int4, + odd int4, + even int4, + stringu1 name, + stringu2 name, + string4 name +) distribute by shard(unique1); + +CREATE INDEX unique1_s ON tenk1_s USING btree(unique1 int4_ops); +CREATE INDEX unique2_s ON tenk1_s USING btree(unique2 int4_ops); +CREATE INDEX hundred_s ON tenk1_s USING btree(hundred int4_ops); +CREATE INDEX thous_tenthous_s ON tenk1_s (thousand, tenthous); + +COPY tenk1_s FROM '/home/tbase/PG-XL-v10/src/test/regress/data/tenk.data'; + +CREATE TABLE onek_s ( + unique1 int4, + unique2 int4, + two int4, + four int4, + ten int4, + twenty int4, + hundred int4, + thousand int4, + twothousand int4, + fivethous int4, + tenthous int4, + odd int4, + even int4, + stringu1 name, + stringu2 name, + string4 name +) distribute by shard(unique1); + + +CREATE INDEX onek_unique1_s ON onek_s USING btree(unique1 int4_ops); +CREATE INDEX onek_unique2_s ON onek_s USING btree(unique2 int4_ops); +CREATE INDEX onek_hundred_s ON onek_s USING btree(hundred int4_ops); +CREATE INDEX onek_stringu1_s ON onek_s USING btree(stringu1 name_ops); + +COPY onek_s FROM '/home/tbase/PG-XL-v10/src/test/regress/data/onek.data'; + + +set 
enable_hashjoin=off; +set enable_mergejoin=off; +set enable_nestloop=on; + + +explain (num_nodes off, nodes off, costs off) +select * from tenk1_s t1 left join + (tenk1_s t2 join tenk1_s t3 on t2.thousand = t3.unique2) + on t1.hundred = t2.hundred and t1.ten = t3.ten +where t1.unique1 = 1; + +--select * from tenk1_s t1 left join +-- (tenk1_s t2 join tenk1_s t3 on t2.thousand = t3.unique2) +-- on t1.hundred = t2.hundred and t1.ten = t3.ten +--where t1.unique1 = 1; + +explain (num_nodes off, nodes off, costs off) +select * from tenk1_s t1 left join + (tenk1_s t2 join tenk1_s t3 on t2.thousand = t3.unique2) + on t1.hundred = t2.hundred and t1.ten + t2.ten = t3.ten +where t1.unique1 = 1; + +select * from tenk1_s t1 left join + (tenk1_s t2 join tenk1_s t3 on t2.thousand = t3.unique2) + on t1.hundred = t2.hundred and t1.ten + t2.ten = t3.ten +where t1.unique1 = 1; + + +explain (num_nodes off, nodes off, costs off) +select * from +( + select unique1, q1, coalesce(unique1, -1) + q1 as fault + from int8_tbl_s left join tenk1_s on (q2 = unique2) +) ss +where fault = 122 +order by fault; + +select * from +( + select unique1, q1, coalesce(unique1, -1) + q1 as fault + from int8_tbl_s left join tenk1_s on (q2 = unique2) +) ss +where fault = 122 +order by fault; + + +explain (num_nodes off, nodes off, costs off) +select q1, unique2, thousand, hundred + from int8_tbl_s a left join tenk1_s b on q1 = unique2 + where coalesce(thousand,123) = q1 and q1 = coalesce(hundred,123); + +select q1, unique2, thousand, hundred + from int8_tbl_s a left join tenk1_s b on q1 = unique2 + where coalesce(thousand,123) = q1 and q1 = coalesce(hundred,123); + + +explain (num_nodes off, nodes off, costs off) +select f1, unique2, case when unique2 is null then f1 else 0 end + from int4_tbl_s a left join tenk1_s b on f1 = unique2 + where (case when unique2 is null then f1 else 0 end) = 0; + +select f1, unique2, case when unique2 is null then f1 else 0 end + from int4_tbl_s a left join tenk1_s b on f1 = unique2 + where (case when unique2 is null then f1 else 0 end) = 0; + + +explain (verbose, costs off) +select foo1.join_key as foo1_id, foo3.join_key AS foo3_id, bug_field from + (values (0),(1)) foo1(join_key) +left join + (select join_key, bug_field from + (select ss1.join_key, ss1.bug_field from + (select f1 as join_key, 666 as bug_field from int4_tbl_s i1) ss1 + ) foo2 + left join + (select unique2 as join_key from tenk1_s i2) ss2 + using (join_key) + ) foo3 +using (join_key); + +select foo1.join_key as foo1_id, foo3.join_key AS foo3_id, bug_field from + (values (0),(1)) foo1(join_key) +left join + (select join_key, bug_field from + (select ss1.join_key, ss1.bug_field from + (select f1 as join_key, 666 as bug_field from int4_tbl_s i1) ss1 + ) foo2 + left join + (select unique2 as join_key from tenk1_s i2) ss2 + using (join_key) + ) foo3 +using (join_key); + + +explain (verbose, costs off) +select t1.unique1, t2.hundred +from onek_s t1, tenk1_s t2 +where exists (select 1 from tenk1_s t3 + where t3.thousand = t1.unique1 and t3.tenthous = t2.hundred) + and t1.unique1 < 1; + +--select t1.unique1, t2.hundred +--from onek_s t1, tenk1_s t2 +--where exists (select 1 from tenk1_s t3 +-- where t3.thousand = t1.unique1 and t3.tenthous = t2.hundred) +-- and t1.unique1 < 1; + + +reset enable_nestloop; +reset enable_hashjoin; +reset enable_mergejoin; + +drop table int8_tbl_s; +drop table int4_tbl_s; +drop table tenk1_s; +drop table onek_s; From 7e47ee022aad2b981008826aa56b1138a0711563 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Mon, 4 
Jul 2022 17:38:41 +0800 Subject: [PATCH 502/578] fix regress error --- src/backend/access/transam/gtm.c | 32 ++++ src/backend/pgxc/pool/poolmgr.c | 60 ------ src/include/access/xlog.h | 1 - src/test/regress/expected/aggregates_1.out | 56 +++--- src/test/regress/expected/alter_generic.out | 4 +- src/test/regress/expected/alter_table.out | 19 ++ src/test/regress/expected/foreign_data.out | 162 ++++++++--------- src/test/regress/expected/insert.out | 37 ++++ src/test/regress/expected/join_3.out | 100 +++++----- .../regress/expected/partition_join_2.out | 172 ++++++++---------- src/test/regress/expected/sanity_check.out | 1 + src/test/regress/expected/sequence.out | 2 +- src/test/regress/expected/stats_ext_2.out | 32 ++-- src/test/regress/expected/sysviews.out | 2 +- src/test/regress/expected/tbase_explain.out | 10 +- 15 files changed, 350 insertions(+), 340 deletions(-) diff --git a/src/backend/access/transam/gtm.c b/src/backend/access/transam/gtm.c index ded4fdfb..474610be 100644 --- a/src/backend/access/transam/gtm.c +++ b/src/backend/access/transam/gtm.c @@ -24,6 +24,7 @@ #include "postmaster/autovacuum.h" #include "postmaster/clean2pc.h" #include "postmaster/clustermon.h" +#include "postmaster/postmaster.h" #include "storage/backendid.h" #include "tcop/tcopprot.h" #include "utils/guc.h" @@ -1239,6 +1240,37 @@ ResetGTMConnection(void) InitGTM(); } +#ifdef HAVE_UNIX_SOCKETS +/* + * gtm_unix_socket_file_exists() + * + * Checks whether the gtm unix domain socket file exists. + */ +static bool +gtm_unix_socket_file_exists(void) +{ + char path[MAXGTMPATH]; + char lockfile[MAXPGPATH]; + int fd; + + UNIXSOCK_PATH(path, GtmPort, gtm_unix_socket_directory); + snprintf(lockfile, sizeof(lockfile), "%s.lock", path); + + if ((fd = open(lockfile, O_RDONLY, 0)) < 0) + { + /* ENOTDIR means we will throw a more useful error later */ + if (errno != ENOENT && errno != ENOTDIR) + elog(LOG, "could not open file \"%s\" for reading: %s\n", + lockfile, strerror(errno)); + + return false; + } + + close(fd); + return true; +} +#endif + void InitGTM(void) {// #lizard forgives diff --git a/src/backend/pgxc/pool/poolmgr.c b/src/backend/pgxc/pool/poolmgr.c index 6e4f3283..756b2198 100644 --- a/src/backend/pgxc/pool/poolmgr.c +++ b/src/backend/pgxc/pool/poolmgr.c @@ -10927,66 +10927,6 @@ handle_session_command(PoolAgent * agent, StringInfo s) } } -static bool -remove_all_agent_references(Oid nodeoid) -{ - int i, j, index; - bool res = true; - - /* - * Identify if it's a coordinator or datanode first - * and get its index - */ - for (i = 0; i < agentCount; i++) - { - bool found = false; - PoolAgent *agent; - - index = agentIndexes[i]; - agent = poolAgents[index]; - - for (j = 0; j < agent->num_dn_connections; j++) - { - if (agent->dn_conn_oids[j] == nodeoid) - { - found = true; - break; - } - } - if (found) - { - PGXCNodePoolSlot *slot = agent->dn_connections[j]; - if (slot) - release_connection(agent->pool, slot, j, agent->dn_conn_oids[j], false, false); - agent->dn_connections[j] = NULL; - } - else - { - for (j = 0; j < agent->num_coord_connections; j++) - { - if (agent->coord_conn_oids[j] == nodeoid) - { - found = true; - break; - } - } - if (found) - { - PGXCNodePoolSlot *slot = agent->coord_connections[j]; - if (slot) - release_connection(agent->pool, slot, j, agent->coord_conn_oids[j], true, true); - agent->coord_connections[j] = NULL; - } - else - { - elog(LOG, "Node not found! 
(%u)", nodeoid); - res = false; - } - } - } - return res; -} - /* * refresh_database_pools * refresh information for all database pools diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index 018cecd9..a0db442b 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -244,7 +244,6 @@ extern void CheckXLogRemoved(XLogSegNo segno, TimeLineID tli); extern XLogSegNo XLogGetLastRemovedSegno(void); extern void XLogSetAsyncXactLSN(XLogRecPtr record); extern void XLogSetReplicationSlotMinimumLSN(XLogRecPtr lsn); -extern XLogRecPtr XLogGetReplicationSlotMinimumLSN(void); extern void xlog_redo(XLogReaderState *record); extern void xlog_desc(StringInfo buf, XLogReaderState *record); diff --git a/src/test/regress/expected/aggregates_1.out b/src/test/regress/expected/aggregates_1.out index 2bfcbb7f..89f1e157 100644 --- a/src/test/regress/expected/aggregates_1.out +++ b/src/test/regress/expected/aggregates_1.out @@ -2000,31 +2000,31 @@ select my_sum(one),my_half_sum(one) from (values(1),(2),(3),(4)) t(one); (1 row) rollback; - -- test coverage for aggregate combine/serial/deserial functions - BEGIN ISOLATION LEVEL REPEATABLE READ; - SET parallel_setup_cost = 0; - SET parallel_tuple_cost = 0; - SET min_parallel_table_scan_size = 0; - SET max_parallel_workers_per_gather = 4; - SET enable_indexonlyscan = off; - -- variance(int4) covers numeric_poly_combine - -- sum(int8) covers int8_avg_combine - EXPLAIN (COSTS OFF) - SELECT variance(unique1::int4), sum(unique1::int8) FROM tenk1; - QUERY PLAN - -------------------------------------------------------------------- - Parallel Finalize Aggregate - -> Parallel Remote Subquery Scan on all (datanode_1,datanode_2) - -> Gather - Workers Planned: 4 - -> Partial Aggregate - -> Parallel Seq Scan on tenk1 - (6 rows) - - SELECT variance(unique1::int4), sum(unique1::int8) FROM tenk1; - variance | sum - ----------------------+---------- - 8334166.666666666667 | 49995000 - (1 row) - - ROLLBACK; \ No newline at end of file +-- test coverage for aggregate combine/serial/deserial functions +BEGIN ISOLATION LEVEL REPEATABLE READ; +SET parallel_setup_cost = 0; +SET parallel_tuple_cost = 0; +SET min_parallel_table_scan_size = 0; +SET max_parallel_workers_per_gather = 4; +SET enable_indexonlyscan = off; +-- variance(int4) covers numeric_poly_combine +-- sum(int8) covers int8_avg_combine +EXPLAIN (COSTS OFF) + SELECT variance(unique1::int4), sum(unique1::int8) FROM tenk1; + QUERY PLAN +-------------------------------------------------------------------- + Parallel Finalize Aggregate + -> Parallel Remote Subquery Scan on all (datanode_1,datanode_2) + -> Gather + Workers Planned: 4 + -> Partial Aggregate + -> Parallel Seq Scan on tenk1 +(6 rows) + +SELECT variance(unique1::int4), sum(unique1::int8) FROM tenk1; + variance | sum +----------------------+---------- + 8334166.666666666667 | 49995000 +(1 row) + +ROLLBACK; diff --git a/src/test/regress/expected/alter_generic.out b/src/test/regress/expected/alter_generic.out index 788c5964..767c09be 100644 --- a/src/test/regress/expected/alter_generic.out +++ b/src/test/regress/expected/alter_generic.out @@ -159,14 +159,14 @@ ALTER SERVER alt_fserv1 RENAME TO alt_fserv2; -- failed (name conflict) ERROR: server "alt_fserv2" already exists ALTER SERVER alt_fserv1 RENAME TO alt_fserv3; -- OK SELECT fdwname FROM pg_foreign_data_wrapper WHERE fdwname like 'alt_fdw%'; - fdwname + fdwname ---------- alt_fdw2 alt_fdw3 (2 rows) SELECT srvname FROM pg_foreign_server WHERE srvname like 'alt_fserv%'; - srvname + 
srvname ------------ alt_fserv2 alt_fserv3 diff --git a/src/test/regress/expected/alter_table.out b/src/test/regress/expected/alter_table.out index 50f89f13..a1ef7dc2 100644 --- a/src/test/regress/expected/alter_table.out +++ b/src/test/regress/expected/alter_table.out @@ -3812,3 +3812,22 @@ alter table at_test_sql_partop attach partition at_test_sql_partop_1 for values drop table at_test_sql_partop; drop operator class at_test_sql_partop using btree; drop function at_test_sql_partop; +-- remote dml with dropped column +create table dropped_col_remote_dml (a int, b int, c int) distribute by shard(a); +NOTICE: Replica identity is needed for shard table, please add to this table through "alter table" command. +insert into dropped_col_remote_dml values(1,1,1); +create or replace function dropped_col_remote_dml_func() returns trigger as +$$ +begin + raise notice 'this is a test'; + return new; +end; +$$ + language plpgsql; +create trigger tga after update on dropped_col_remote_dml for each row +execute PROCEDURE dropped_col_remote_dml_func(); +alter table dropped_col_remote_dml drop column c; +update dropped_col_remote_dml set b = 2; +NOTICE: this is a test +drop table dropped_col_remote_dml cascade; +drop function dropped_col_remote_dml_func; diff --git a/src/test/regress/expected/foreign_data.out b/src/test/regress/expected/foreign_data.out index 26e01f5f..9aa94459 100644 --- a/src/test/regress/expected/foreign_data.out +++ b/src/test/regress/expected/foreign_data.out @@ -18,7 +18,7 @@ COMMENT ON FOREIGN DATA WRAPPER dummy IS 'useless'; CREATE FOREIGN DATA WRAPPER postgresql VALIDATOR postgresql_fdw_validator; -- At this point we should have 2 built-in wrappers and no servers. SELECT fdwname, fdwhandler::regproc, fdwvalidator::regproc, fdwoptions FROM pg_foreign_data_wrapper ORDER BY 1, 2, 3; - fdwname | fdwhandler | fdwvalidator | fdwoptions + fdwname | fdwhandler | fdwvalidator | fdwoptions ------------+------------+--------------------------+------------ dummy | - | - | postgresql | - | postgresql_fdw_validator | @@ -39,8 +39,8 @@ CREATE FOREIGN DATA WRAPPER foo VALIDATOR bar; -- ERROR ERROR: function bar(text[], oid) does not exist CREATE FOREIGN DATA WRAPPER foo; \dew - List of foreign-data wrappers - Name | Owner | Handler | Validator + List of foreign-data wrappers + Name | Owner | Handler | Validator ------------+---------------------------+---------+-------------------------- dummy | regress_foreign_data_user | - | - foo | regress_foreign_data_user | - | - @@ -52,8 +52,8 @@ ERROR: foreign-data wrapper "foo" already exists DROP FOREIGN DATA WRAPPER foo; CREATE FOREIGN DATA WRAPPER foo OPTIONS (testing '1'); \dew+ - List of foreign-data wrappers - Name | Owner | Handler | Validator | Access privileges | FDW options | Description + List of foreign-data wrappers + Name | Owner | Handler | Validator | Access privileges | FDW options | Description ------------+---------------------------+---------+--------------------------+-------------------+---------------+------------- dummy | regress_foreign_data_user | - | - | | | useless foo | regress_foreign_data_user | - | - | | (testing '1') | @@ -65,8 +65,8 @@ CREATE FOREIGN DATA WRAPPER foo OPTIONS (testing '1', testing '2'); -- ERROR ERROR: option "testing" provided more than once CREATE FOREIGN DATA WRAPPER foo OPTIONS (testing '1', another '2'); \dew+ - List of foreign-data wrappers - Name | Owner | Handler | Validator | Access privileges | FDW options | Description + List of foreign-data wrappers + Name | Owner | Handler | Validator | 
Access privileges | FDW options | Description ------------+---------------------------+---------+--------------------------+-------------------+----------------------------+------------- dummy | regress_foreign_data_user | - | - | | | useless foo | regress_foreign_data_user | - | - | | (testing '1', another '2') | @@ -81,8 +81,8 @@ HINT: Must be superuser to create a foreign-data wrapper. RESET ROLE; CREATE FOREIGN DATA WRAPPER foo VALIDATOR postgresql_fdw_validator; \dew+ - List of foreign-data wrappers - Name | Owner | Handler | Validator | Access privileges | FDW options | Description + List of foreign-data wrappers + Name | Owner | Handler | Validator | Access privileges | FDW options | Description ------------+---------------------------+---------+--------------------------+-------------------+-------------+------------- dummy | regress_foreign_data_user | - | - | | | useless foo | regress_foreign_data_user | - | postgresql_fdw_validator | | | @@ -98,8 +98,8 @@ ALTER FOREIGN DATA WRAPPER foo VALIDATOR bar; -- ERROR ERROR: function bar(text[], oid) does not exist ALTER FOREIGN DATA WRAPPER foo NO VALIDATOR; \dew+ - List of foreign-data wrappers - Name | Owner | Handler | Validator | Access privileges | FDW options | Description + List of foreign-data wrappers + Name | Owner | Handler | Validator | Access privileges | FDW options | Description ------------+---------------------------+---------+--------------------------+-------------------+-------------+------------- dummy | regress_foreign_data_user | - | - | | | useless foo | regress_foreign_data_user | - | - | | | @@ -113,8 +113,8 @@ ALTER FOREIGN DATA WRAPPER foo OPTIONS (DROP c); -- ERROR ERROR: option "c" not found ALTER FOREIGN DATA WRAPPER foo OPTIONS (ADD x '1', DROP x); \dew+ - List of foreign-data wrappers - Name | Owner | Handler | Validator | Access privileges | FDW options | Description + List of foreign-data wrappers + Name | Owner | Handler | Validator | Access privileges | FDW options | Description ------------+---------------------------+---------+--------------------------+-------------------+----------------+------------- dummy | regress_foreign_data_user | - | - | | | useless foo | regress_foreign_data_user | - | - | | (a '1', b '2') | @@ -123,8 +123,8 @@ ALTER FOREIGN DATA WRAPPER foo OPTIONS (ADD x '1', DROP x); ALTER FOREIGN DATA WRAPPER foo OPTIONS (DROP a, SET b '3', ADD c '4'); \dew+ - List of foreign-data wrappers - Name | Owner | Handler | Validator | Access privileges | FDW options | Description + List of foreign-data wrappers + Name | Owner | Handler | Validator | Access privileges | FDW options | Description ------------+---------------------------+---------+--------------------------+-------------------+----------------+------------- dummy | regress_foreign_data_user | - | - | | | useless foo | regress_foreign_data_user | - | - | | (b '3', c '4') | @@ -135,8 +135,8 @@ ALTER FOREIGN DATA WRAPPER foo OPTIONS (a '2'); ALTER FOREIGN DATA WRAPPER foo OPTIONS (b '4'); -- ERROR ERROR: option "b" provided more than once \dew+ - List of foreign-data wrappers - Name | Owner | Handler | Validator | Access privileges | FDW options | Description + List of foreign-data wrappers + Name | Owner | Handler | Validator | Access privileges | FDW options | Description ------------+---------------------------+---------+--------------------------+-------------------+-----------------------+------------- dummy | regress_foreign_data_user | - | - | | | useless foo | regress_foreign_data_user | - | - | | (b '3', c '4', a '2') | 
@@ -150,8 +150,8 @@ HINT: Must be superuser to alter a foreign-data wrapper. SET ROLE regress_test_role_super; ALTER FOREIGN DATA WRAPPER foo OPTIONS (ADD d '5'); \dew+ - List of foreign-data wrappers - Name | Owner | Handler | Validator | Access privileges | FDW options | Description + List of foreign-data wrappers + Name | Owner | Handler | Validator | Access privileges | FDW options | Description ------------+---------------------------+---------+--------------------------+-------------------+------------------------------+------------- dummy | regress_foreign_data_user | - | - | | | useless foo | regress_foreign_data_user | - | - | | (b '3', c '4', a '2', d '5') | @@ -169,8 +169,8 @@ ERROR: permission denied to alter foreign-data wrapper "foo" HINT: Must be superuser to alter a foreign-data wrapper. RESET ROLE; \dew+ - List of foreign-data wrappers - Name | Owner | Handler | Validator | Access privileges | FDW options | Description + List of foreign-data wrappers + Name | Owner | Handler | Validator | Access privileges | FDW options | Description ------------+---------------------------+---------+--------------------------+-------------------+------------------------------+------------- dummy | regress_foreign_data_user | - | - | | | useless foo | regress_test_role_super | - | - | | (b '3', c '4', a '2', d '5') | @@ -179,8 +179,8 @@ RESET ROLE; ALTER FOREIGN DATA WRAPPER foo RENAME TO foo1; \dew+ - List of foreign-data wrappers - Name | Owner | Handler | Validator | Access privileges | FDW options | Description + List of foreign-data wrappers + Name | Owner | Handler | Validator | Access privileges | FDW options | Description ------------+---------------------------+---------+--------------------------+-------------------+------------------------------+------------- dummy | regress_foreign_data_user | - | - | | | useless foo1 | regress_test_role_super | - | - | | (b '3', c '4', a '2', d '5') | @@ -194,8 +194,8 @@ ERROR: foreign-data wrapper "nonexistent" does not exist DROP FOREIGN DATA WRAPPER IF EXISTS nonexistent; NOTICE: foreign-data wrapper "nonexistent" does not exist, skipping \dew+ - List of foreign-data wrappers - Name | Owner | Handler | Validator | Access privileges | FDW options | Description + List of foreign-data wrappers + Name | Owner | Handler | Validator | Access privileges | FDW options | Description ------------+---------------------------+---------+--------------------------+-------------------+------------------------------+------------- dummy | regress_foreign_data_user | - | - | | | useless foo | regress_test_role_super | - | - | | (b '3', c '4', a '2', d '5') | @@ -210,8 +210,8 @@ DROP FOREIGN DATA WRAPPER foo; RESET ROLE; DROP ROLE regress_test_role_super; \dew+ - List of foreign-data wrappers - Name | Owner | Handler | Validator | Access privileges | FDW options | Description + List of foreign-data wrappers + Name | Owner | Handler | Validator | Access privileges | FDW options | Description ------------+---------------------------+---------+--------------------------+-------------------+-------------+------------- dummy | regress_foreign_data_user | - | - | | | useless postgresql | regress_foreign_data_user | - | postgresql_fdw_validator | | | @@ -226,8 +226,8 @@ ERROR: user mapping for "regress_foreign_data_user" already exists for server s CREATE USER MAPPING IF NOT EXISTS FOR current_user SERVER s1; -- NOTICE NOTICE: user mapping for "regress_foreign_data_user" already exists for server s1, skipping \dew+ - List of foreign-data wrappers - Name | Owner | 
Handler | Validator | Access privileges | FDW options | Description + List of foreign-data wrappers + Name | Owner | Handler | Validator | Access privileges | FDW options | Description ------------+---------------------------+---------+--------------------------+-------------------+-------------+------------- dummy | regress_foreign_data_user | - | - | | | useless foo | regress_foreign_data_user | - | - | | | @@ -235,15 +235,15 @@ NOTICE: user mapping for "regress_foreign_data_user" already exists for server (3 rows) \des+ - List of foreign servers - Name | Owner | Foreign-data wrapper | Access privileges | Type | Version | FDW options | Description + List of foreign servers + Name | Owner | Foreign-data wrapper | Access privileges | Type | Version | FDW options | Description ------+---------------------------+----------------------+-------------------+------+---------+-------------+---------------- s1 | regress_foreign_data_user | foo | | | | | foreign server (1 row) \deu+ - List of user mappings - Server | User name | FDW options + List of user mappings + Server | User name | FDW options --------+---------------------------+------------- s1 | regress_foreign_data_user | (1 row) @@ -262,8 +262,8 @@ NOTICE: drop cascades to 2 other objects DETAIL: drop cascades to server s1 drop cascades to user mapping for regress_foreign_data_user on server s1 \dew+ - List of foreign-data wrappers - Name | Owner | Handler | Validator | Access privileges | FDW options | Description + List of foreign-data wrappers + Name | Owner | Handler | Validator | Access privileges | FDW options | Description ------------+---------------------------+---------+--------------------------+-------------------+-------------+------------- dummy | regress_foreign_data_user | - | - | | | useless postgresql | regress_foreign_data_user | - | postgresql_fdw_validator | | | @@ -301,8 +301,8 @@ ERROR: invalid option "foo" HINT: Valid options in this context are: authtype, service, connect_timeout, dbname, host, hostaddr, port, tty, options, requiressl, sslmode, gsslib CREATE SERVER s8 FOREIGN DATA WRAPPER postgresql OPTIONS (host 'localhost', dbname 's8db'); \des+ - List of foreign servers - Name | Owner | Foreign-data wrapper | Access privileges | Type | Version | FDW options | Description + List of foreign servers + Name | Owner | Foreign-data wrapper | Access privileges | Type | Version | FDW options | Description ------+---------------------------+----------------------+-------------------+--------+---------+-----------------------------------+------------- s1 | regress_foreign_data_user | foo | | | | | s2 | regress_foreign_data_user | foo | | | | (host 'a', dbname 'b') | @@ -323,8 +323,8 @@ SET ROLE regress_test_role; CREATE SERVER t1 FOREIGN DATA WRAPPER foo; RESET ROLE; \des+ - List of foreign servers - Name | Owner | Foreign-data wrapper | Access privileges | Type | Version | FDW options | Description + List of foreign servers + Name | Owner | Foreign-data wrapper | Access privileges | Type | Version | FDW options | Description ------+---------------------------+----------------------+-------------------+--------+---------+-----------------------------------+------------- s1 | regress_foreign_data_user | foo | | | | | s2 | regress_foreign_data_user | foo | | | | (host 'a', dbname 'b') | @@ -347,8 +347,8 @@ GRANT regress_test_indirect TO regress_test_role; SET ROLE regress_test_role; CREATE SERVER t2 FOREIGN DATA WRAPPER foo; \des+ - List of foreign servers - Name | Owner | Foreign-data wrapper | Access privileges | Type | 
Version | FDW options | Description + List of foreign servers + Name | Owner | Foreign-data wrapper | Access privileges | Type | Version | FDW options | Description ------+---------------------------+----------------------+-------------------+--------+---------+-----------------------------------+------------- s1 | regress_foreign_data_user | foo | | | | | s2 | regress_foreign_data_user | foo | | | | (host 'a', dbname 'b') | @@ -377,8 +377,8 @@ ALTER SERVER s3 OPTIONS ("tns name" 'orcl', port '1521'); GRANT USAGE ON FOREIGN SERVER s1 TO regress_test_role; GRANT USAGE ON FOREIGN SERVER s6 TO regress_test_role2 WITH GRANT OPTION; \des+ - List of foreign servers - Name | Owner | Foreign-data wrapper | Access privileges | Type | Version | FDW options | Description + List of foreign servers + Name | Owner | Foreign-data wrapper | Access privileges | Type | Version | FDW options | Description ------+---------------------------+----------------------+-------------------------------------------------------+--------+---------+-----------------------------------+------------- s1 | regress_foreign_data_user | foo | regress_foreign_data_user=U/regress_foreign_data_user+| | 1.0 | (servername 's1') | | | | regress_test_role=U/regress_foreign_data_user | | | | @@ -428,8 +428,8 @@ ERROR: role "regress_test_indirect" cannot be dropped because some objects depe DETAIL: owner of server s1 privileges for foreign-data wrapper foo \des+ - List of foreign servers - Name | Owner | Foreign-data wrapper | Access privileges | Type | Version | FDW options | Description + List of foreign servers + Name | Owner | Foreign-data wrapper | Access privileges | Type | Version | FDW options | Description ------+---------------------------+----------------------+-------------------------------------------------------+--------+---------+--------------------------------------+------------- s1 | regress_test_indirect | foo | regress_test_indirect=U/regress_test_indirect | | 1.1 | (servername 's1') | s2 | regress_foreign_data_user | foo | | | 1.1 | (host 'a', dbname 'b') | @@ -446,8 +446,8 @@ privileges for foreign-data wrapper foo ALTER SERVER s8 RENAME to s8new; \des+ - List of foreign servers - Name | Owner | Foreign-data wrapper | Access privileges | Type | Version | FDW options | Description + List of foreign servers + Name | Owner | Foreign-data wrapper | Access privileges | Type | Version | FDW options | Description -------+---------------------------+----------------------+-------------------------------------------------------+--------+---------+--------------------------------------+------------- s1 | regress_test_indirect | foo | regress_test_indirect=U/regress_test_indirect | | 1.1 | (servername 's1') | s2 | regress_foreign_data_user | foo | | | 1.1 | (host 'a', dbname 'b') | @@ -469,8 +469,8 @@ ERROR: server "nonexistent" does not exist DROP SERVER IF EXISTS nonexistent; NOTICE: server "nonexistent" does not exist, skipping \des - List of foreign servers - Name | Owner | Foreign-data wrapper + List of foreign servers + Name | Owner | Foreign-data wrapper ------+---------------------------+---------------------- s1 | regress_test_indirect | foo s2 | regress_foreign_data_user | foo @@ -490,8 +490,8 @@ ERROR: must be owner of foreign server s2 DROP SERVER s1; RESET ROLE; \des - List of foreign servers - Name | Owner | Foreign-data wrapper + List of foreign servers + Name | Owner | Foreign-data wrapper ------+---------------------------+---------------------- s2 | regress_foreign_data_user | foo s3 | 
regress_foreign_data_user | foo @@ -509,8 +509,8 @@ SET ROLE regress_test_role; DROP SERVER s2; RESET ROLE; \des - List of foreign servers - Name | Owner | Foreign-data wrapper + List of foreign servers + Name | Owner | Foreign-data wrapper ------+---------------------------+---------------------- s3 | regress_foreign_data_user | foo s4 | regress_foreign_data_user | foo @@ -524,8 +524,8 @@ RESET ROLE; CREATE USER MAPPING FOR current_user SERVER s3; \deu -List of user mappings - Server | User name + List of user mappings + Server | User name --------+--------------------------- s3 | regress_foreign_data_user (1 row) @@ -537,8 +537,8 @@ HINT: Use DROP ... CASCADE to drop the dependent objects too. DROP SERVER s3 CASCADE; NOTICE: drop cascades to user mapping for regress_foreign_data_user on server s3 \des - List of foreign servers - Name | Owner | Foreign-data wrapper + List of foreign servers + Name | Owner | Foreign-data wrapper ------+---------------------------+---------------------- s4 | regress_foreign_data_user | foo s5 | regress_foreign_data_user | foo @@ -584,8 +584,8 @@ CREATE USER MAPPING FOR current_user SERVER t1 OPTIONS (username 'bob', password CREATE USER MAPPING FOR public SERVER t1; RESET ROLE; \deu -List of user mappings - Server | User name + List of user mappings + Server | User name --------+--------------------------- s4 | public s4 | regress_foreign_data_user @@ -614,8 +614,8 @@ ERROR: must be owner of foreign server s4 ALTER USER MAPPING FOR public SERVER t1 OPTIONS (ADD modified '1'); RESET ROLE; \deu+ - List of user mappings - Server | User name | FDW options + List of user mappings + Server | User name | FDW options --------+---------------------------+---------------------------------- s4 | public | ("this mapping" 'is public') s4 | regress_foreign_data_user | @@ -646,8 +646,8 @@ ERROR: must be owner of foreign server s8 RESET ROLE; DROP SERVER s7; \deu -List of user mappings - Server | User name + List of user mappings + Server | User name --------+--------------------------- s4 | public s4 | regress_foreign_data_user @@ -721,8 +721,8 @@ Server: s0 FDW options: (delimiter ',', quote '"', "be quoted" 'value') \det+ - List of foreign tables - Schema | Table | Server | FDW options | Description + List of foreign tables + Schema | Table | Server | FDW options | Description --------+-------+--------+-------------------------------------------------+------------- public | ft1 | s0 | (delimiter ',', quote '"', "be quoted" 'value') | ft1 (1 row) @@ -877,7 +877,7 @@ ALTER FOREIGN TABLE IF EXISTS doesnt_exist_ft1 RENAME TO foreign_table_1; NOTICE: relation "doesnt_exist_ft1" does not exist, skipping -- Information schema SELECT * FROM information_schema.foreign_data_wrappers ORDER BY 1, 2; - foreign_data_wrapper_catalog | foreign_data_wrapper_name | authorization_identifier | library_name | foreign_data_wrapper_language + foreign_data_wrapper_catalog | foreign_data_wrapper_name | authorization_identifier | library_name | foreign_data_wrapper_language ------------------------------+---------------------------+---------------------------+--------------+------------------------------- regression | dummy | regress_foreign_data_user | | c regression | foo | regress_foreign_data_user | | c @@ -885,13 +885,13 @@ SELECT * FROM information_schema.foreign_data_wrappers ORDER BY 1, 2; (3 rows) SELECT * FROM information_schema.foreign_data_wrapper_options ORDER BY 1, 2, 3; - foreign_data_wrapper_catalog | foreign_data_wrapper_name | option_name | option_value + 
foreign_data_wrapper_catalog | foreign_data_wrapper_name | option_name | option_value ------------------------------+---------------------------+--------------+-------------- regression | foo | test wrapper | true (1 row) SELECT * FROM information_schema.foreign_servers ORDER BY 1, 2; - foreign_server_catalog | foreign_server_name | foreign_data_wrapper_catalog | foreign_data_wrapper_name | foreign_server_type | foreign_server_version | authorization_identifier + foreign_server_catalog | foreign_server_name | foreign_data_wrapper_catalog | foreign_data_wrapper_name | foreign_server_type | foreign_server_version | authorization_identifier ------------------------+---------------------+------------------------------+---------------------------+---------------------+------------------------+--------------------------- regression | s0 | regression | dummy | | | regress_foreign_data_user regression | s4 | regression | foo | oracle | | regress_foreign_data_user @@ -903,7 +903,7 @@ SELECT * FROM information_schema.foreign_servers ORDER BY 1, 2; (7 rows) SELECT * FROM information_schema.foreign_server_options ORDER BY 1, 2, 3; - foreign_server_catalog | foreign_server_name | option_name | option_value + foreign_server_catalog | foreign_server_name | option_name | option_value ------------------------+---------------------+-----------------+-------------- regression | s4 | dbname | b regression | s4 | host | a @@ -914,7 +914,7 @@ SELECT * FROM information_schema.foreign_server_options ORDER BY 1, 2, 3; (6 rows) SELECT * FROM information_schema.user_mappings ORDER BY lower(authorization_identifier), 2, 3; - authorization_identifier | foreign_server_catalog | foreign_server_name + authorization_identifier | foreign_server_catalog | foreign_server_name ---------------------------+------------------------+--------------------- PUBLIC | regression | s4 PUBLIC | regression | s8 @@ -927,7 +927,7 @@ SELECT * FROM information_schema.user_mappings ORDER BY lower(authorization_iden (8 rows) SELECT * FROM information_schema.user_mapping_options ORDER BY lower(authorization_identifier), 2, 3, 4; - authorization_identifier | foreign_server_catalog | foreign_server_name | option_name | option_value + authorization_identifier | foreign_server_catalog | foreign_server_name | option_name | option_value ---------------------------+------------------------+---------------------+--------------+-------------- PUBLIC | regression | s4 | this mapping | is public PUBLIC | regression | t1 | modified | 1 @@ -939,7 +939,7 @@ SELECT * FROM information_schema.user_mapping_options ORDER BY lower(authorizati (7 rows) SELECT * FROM information_schema.usage_privileges WHERE object_type LIKE 'FOREIGN%' AND object_name IN ('s6', 'foo') ORDER BY 1, 2, 3, 4, 5; - grantor | grantee | object_catalog | object_schema | object_name | object_type | privilege_type | is_grantable + grantor | grantee | object_catalog | object_schema | object_name | object_type | privilege_type | is_grantable ---------------------------+---------------------------+----------------+---------------+-------------+----------------------+----------------+-------------- regress_foreign_data_user | regress_foreign_data_user | regression | | foo | FOREIGN DATA WRAPPER | USAGE | YES regress_foreign_data_user | regress_test_indirect | regression | | foo | FOREIGN DATA WRAPPER | USAGE | NO @@ -948,7 +948,7 @@ SELECT * FROM information_schema.usage_privileges WHERE object_type LIKE 'FOREIG (4 rows) SELECT * FROM information_schema.role_usage_grants WHERE object_type LIKE 
'FOREIGN%' AND object_name IN ('s6', 'foo') ORDER BY 1, 2, 3, 4, 5; - grantor | grantee | object_catalog | object_schema | object_name | object_type | privilege_type | is_grantable + grantor | grantee | object_catalog | object_schema | object_name | object_type | privilege_type | is_grantable ---------------------------+---------------------------+----------------+---------------+-------------+----------------------+----------------+-------------- regress_foreign_data_user | regress_foreign_data_user | regression | | foo | FOREIGN DATA WRAPPER | USAGE | YES regress_foreign_data_user | regress_test_indirect | regression | | foo | FOREIGN DATA WRAPPER | USAGE | NO @@ -982,7 +982,7 @@ SELECT * FROM information_schema.user_mapping_options ORDER BY 1, 2, 3, 4; (5 rows) SELECT * FROM information_schema.usage_privileges WHERE object_type LIKE 'FOREIGN%' AND object_name IN ('s6', 'foo') ORDER BY 1, 2, 3, 4, 5; - grantor | grantee | object_catalog | object_schema | object_name | object_type | privilege_type | is_grantable + grantor | grantee | object_catalog | object_schema | object_name | object_type | privilege_type | is_grantable ---------------------------+-----------------------+----------------+---------------+-------------+----------------------+----------------+-------------- regress_foreign_data_user | regress_test_indirect | regression | | foo | FOREIGN DATA WRAPPER | USAGE | NO regress_test_indirect | regress_test_indirect | regression | | s6 | FOREIGN SERVER | USAGE | YES @@ -990,7 +990,7 @@ SELECT * FROM information_schema.usage_privileges WHERE object_type LIKE 'FOREIG (3 rows) SELECT * FROM information_schema.role_usage_grants WHERE object_type LIKE 'FOREIGN%' AND object_name IN ('s6', 'foo') ORDER BY 1, 2, 3, 4, 5; - grantor | grantee | object_catalog | object_schema | object_name | object_type | privilege_type | is_grantable + grantor | grantee | object_catalog | object_schema | object_name | object_type | privilege_type | is_grantable ---------------------------+-----------------------+----------------+---------------+-------------+----------------------+----------------+-------------- regress_foreign_data_user | regress_test_indirect | regression | | foo | FOREIGN DATA WRAPPER | USAGE | NO regress_test_indirect | regress_test_indirect | regression | | s6 | FOREIGN SERVER | USAGE | YES @@ -1208,8 +1208,8 @@ CREATE USER MAPPING FOR public SERVER s10 OPTIONS (user 'secret'); CREATE USER MAPPING FOR regress_unprivileged_role SERVER s10 OPTIONS (user 'secret'); -- owner of server can see some option fields \deu+ - List of user mappings - Server | User name | FDW options + List of user mappings + Server | User name | FDW options --------+---------------------------+------------------- s10 | public | ("user" 'secret') s10 | regress_unprivileged_role | @@ -1225,8 +1225,8 @@ CREATE USER MAPPING FOR regress_unprivileged_role SERVER s10 OPTIONS (user 'secr RESET ROLE; -- superuser can see all option fields \deu+ - List of user mappings - Server | User name | FDW options + List of user mappings + Server | User name | FDW options --------+---------------------------+--------------------- s10 | public | ("user" 'secret') s10 | regress_unprivileged_role | ("user" 'secret') @@ -1242,8 +1242,8 @@ RESET ROLE; -- unprivileged user cannot see any option field SET ROLE regress_unprivileged_role; \deu+ - List of user mappings - Server | User name | FDW options + List of user mappings + Server | User name | FDW options --------+---------------------------+------------- s10 | public | s10 | 
regress_unprivileged_role | diff --git a/src/test/regress/expected/insert.out b/src/test/regress/expected/insert.out index 97d3c276..7aa37928 100644 --- a/src/test/regress/expected/insert.out +++ b/src/test/regress/expected/insert.out @@ -986,6 +986,43 @@ with baseInfo as(select * from t1) insert into t2 select * from baseInfo; drop table t1; drop table t2; +-- Determine whether tables of different groups are allowed to insert. +set default_locator_type to shard; +drop table if exists t2; +NOTICE: table "t2" does not exist, skipping +drop table if exists t2_rep; +NOTICE: table "t2_rep" does not exist, skipping +drop table if exists t2_new; +NOTICE: table "t2_new" does not exist, skipping +create table t2(f1 int,f2 int); +NOTICE: Replica identity is needed for shard table, please add to this table through "alter table" command. +create table t2_rep(f1 int,f2 int) distribute by replication; +insert into t2_rep values(1,1),(2,2); +insert into t2 select * from t2_rep; +select count(*) from t2_rep; + count +------- + 2 +(1 row) + +select count(*) from t2; + count +------- + 2 +(1 row) + +create table t2_new as select * from t2_rep; +NOTICE: Replica identity is needed for shard table, please add to this table through "alter table" command. +select count(*) from t2_new; + count +------- + 2 +(1 row) + +drop table t2; +drop table t2_rep; +drop table t2_new; +reset default_locator_type; -- test insert with returning in JDBC drop table if exists insertwithret; NOTICE: table "insertwithret" does not exist, skipping diff --git a/src/test/regress/expected/join_3.out b/src/test/regress/expected/join_3.out index 16264c50..1225d2ce 100644 --- a/src/test/regress/expected/join_3.out +++ b/src/test/regress/expected/join_3.out @@ -4614,8 +4614,8 @@ select *, (select r from (select q1 as q2) x, (select q2 as r) y) from int8_tbl; ------------------+-------------------+------------------- 123 | 456 | 456 123 | 4567890123456789 | 4567890123456789 - 4567890123456789 | 4567890123456789 | 4567890123456789 4567890123456789 | 123 | 123 + 4567890123456789 | 4567890123456789 | 4567890123456789 4567890123456789 | -4567890123456789 | -4567890123456789 (5 rows) @@ -4624,8 +4624,8 @@ select *, (select r from (select q1 as q2) x, lateral (select q2 as r) y) from i ------------------+-------------------+------------------ 123 | 456 | 123 123 | 4567890123456789 | 123 - 4567890123456789 | 4567890123456789 | 4567890123456789 4567890123456789 | 123 | 4567890123456789 + 4567890123456789 | 4567890123456789 | 4567890123456789 4567890123456789 | -4567890123456789 | 4567890123456789 (5 rows) @@ -4929,13 +4929,13 @@ select * from ------------------+-------------------+------------------+-------------------+------------------+------------------+------------------- 123 | 456 | | | 123 | | 123 | 4567890123456789 | 4567890123456789 | -4567890123456789 | 123 | 4567890123456789 | -4567890123456789 - 123 | 4567890123456789 | 4567890123456789 | 123 | 123 | 4567890123456789 | 123 123 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 | 4567890123456789 - 4567890123456789 | 4567890123456789 | 4567890123456789 | -4567890123456789 | 4567890123456789 | 4567890123456789 | -4567890123456789 - 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 | 4567890123456789 | 123 - 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 + 123 | 4567890123456789 | 4567890123456789 | 123 | 123 | 4567890123456789 | 123 
4567890123456789 | 123 | 123 | 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 4567890123456789 | 123 | 123 | 456 | 4567890123456789 | 123 | 456 + 4567890123456789 | 4567890123456789 | 4567890123456789 | -4567890123456789 | 4567890123456789 | 4567890123456789 | -4567890123456789 + 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 + 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 | 4567890123456789 | 123 4567890123456789 | -4567890123456789 | | | 4567890123456789 | | (10 rows) @@ -4946,13 +4946,13 @@ select * from ------------------+-------------------+------------------+-------------------+------------------+------------------+------------------- 123 | 456 | | | 123 | | 123 | 4567890123456789 | 4567890123456789 | -4567890123456789 | 123 | 4567890123456789 | -4567890123456789 - 123 | 4567890123456789 | 4567890123456789 | 123 | 123 | 4567890123456789 | 123 123 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 | 4567890123456789 - 4567890123456789 | 4567890123456789 | 4567890123456789 | -4567890123456789 | 4567890123456789 | 4567890123456789 | -4567890123456789 - 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 | 4567890123456789 | 123 - 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 + 123 | 4567890123456789 | 4567890123456789 | 123 | 123 | 4567890123456789 | 123 4567890123456789 | 123 | 123 | 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 4567890123456789 | 123 | 123 | 456 | 4567890123456789 | 123 | 456 + 4567890123456789 | 4567890123456789 | 4567890123456789 | -4567890123456789 | 4567890123456789 | 4567890123456789 | -4567890123456789 + 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 + 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 | 4567890123456789 | 123 4567890123456789 | -4567890123456789 | | | 4567890123456789 | | (10 rows) @@ -4965,11 +4965,11 @@ select x.* from 123 | 4567890123456789 123 | 4567890123456789 123 | 4567890123456789 + 4567890123456789 | 123 + 4567890123456789 | 123 4567890123456789 | 4567890123456789 4567890123456789 | 4567890123456789 4567890123456789 | 4567890123456789 - 4567890123456789 | 123 - 4567890123456789 | 123 4567890123456789 | -4567890123456789 (10 rows) @@ -5086,14 +5086,14 @@ select * from q1 | q2 | q1 | q2 | x ------------------+-------------------+------------------+-------------------+------------------ 123 | 456 | | | - 123 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 123 | 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 + 123 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 123 | 4567890123456789 | 4567890123456789 | -4567890123456789 | 4567890123456789 - 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 - 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 - 4567890123456789 | 4567890123456789 | 4567890123456789 | -4567890123456789 | 4567890123456789 4567890123456789 | 123 | 123 | 456 | 123 4567890123456789 | 123 | 123 | 4567890123456789 | 123 + 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 + 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 
+ 4567890123456789 | 4567890123456789 | 4567890123456789 | -4567890123456789 | 4567890123456789 4567890123456789 | -4567890123456789 | | | (10 rows) @@ -5107,14 +5107,14 @@ select * from q1 | q2 | q1 | q2 | x ------------------+-------------------+------------------+-------------------+------------------ 123 | 456 | | | - 123 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 123 | 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 + 123 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 123 | 4567890123456789 | 4567890123456789 | -4567890123456789 | 4567890123456789 - 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 - 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 - 4567890123456789 | 4567890123456789 | 4567890123456789 | -4567890123456789 | 4567890123456789 4567890123456789 | 123 | 123 | 456 | 123 4567890123456789 | 123 | 123 | 4567890123456789 | 123 + 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 | 4567890123456789 + 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 + 4567890123456789 | 4567890123456789 | 4567890123456789 | -4567890123456789 | 4567890123456789 4567890123456789 | -4567890123456789 | | | (10 rows) @@ -5284,6 +5284,16 @@ select * from 123 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 123 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 123 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 + 4567890123456789 | 123 | 123 | 123 | 123 + 4567890123456789 | 123 | 123 | 123 | 123 + 4567890123456789 | 123 | 123 | 4567890123456789 | 123 + 4567890123456789 | 123 | 123 | 4567890123456789 | 123 + 4567890123456789 | 123 | 123 | 4567890123456789 | 123 + 4567890123456789 | 123 | 123 | 123 | 123 + 4567890123456789 | 123 | 123 | 123 | 123 + 4567890123456789 | 123 | 123 | 4567890123456789 | 123 + 4567890123456789 | 123 | 123 | 4567890123456789 | 123 + 4567890123456789 | 123 | 123 | 4567890123456789 | 123 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 | 123 4567890123456789 | 4567890123456789 | 4567890123456789 | 123 | 123 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 @@ -5299,16 +5309,6 @@ select * from 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 - 4567890123456789 | 123 | 123 | 123 | 123 - 4567890123456789 | 123 | 123 | 123 | 123 - 4567890123456789 | 123 | 123 | 4567890123456789 | 123 - 4567890123456789 | 123 | 123 | 4567890123456789 | 123 - 4567890123456789 | 123 | 123 | 4567890123456789 | 123 - 4567890123456789 | 123 | 123 | 123 | 123 - 4567890123456789 | 123 | 123 | 123 | 123 - 4567890123456789 | 123 | 123 | 4567890123456789 | 123 - 4567890123456789 | 123 | 123 | 4567890123456789 | 123 - 4567890123456789 | 123 | 123 | 4567890123456789 | 123 4567890123456789 | -4567890123456789 | | | (42 rows) @@ -5523,8 +5523,8 @@ lateral (select * from int8_tbl t1, where t1.q1 = ss.q2) ss0; id | q1 | q2 | q1 | q2 ----+------------------+-------------------+------------------+------------------ - 0 | 4567890123456789 | 4567890123456789 | 4567890123456789 | 4567890123456789 0 | 4567890123456789 | 123 | 4567890123456789 | 4567890123456789 + 0 | 4567890123456789 | 4567890123456789 
| 4567890123456789 | 4567890123456789 0 | 4567890123456789 | -4567890123456789 | 4567890123456789 | 4567890123456789 (3 rows) @@ -5782,7 +5782,7 @@ select * from j1 inner join j2 on j1.id = j2.id; -> Materialize Output: j2.id -> Seq Scan on public.j2 - Output: j2.id + Output: j2.id (14 rows) -- ensure join is not unique when not an equi-join @@ -5823,7 +5823,7 @@ select * from j1 inner join j3 on j1.id = j3.id; -> Materialize Output: j1.id -> Seq Scan on public.j1 - Output: j1.id + Output: j1.id (14 rows) -- ensure left join is marked as unique @@ -5844,7 +5844,7 @@ select * from j1 left join j2 on j1.id = j2.id; -> Materialize Output: j2.id -> Seq Scan on public.j2 - Output: j2.id + Output: j2.id (14 rows) -- ensure right join is marked as unique @@ -5863,7 +5863,7 @@ select * from j1 right join j2 on j1.id = j2.id; -> Materialize Output: j1.id -> Seq Scan on public.j1 - Output: j1.id + Output: j1.id (12 rows) -- ensure full join is marked as unique @@ -5924,7 +5924,7 @@ select * from j1 natural join j2; -> Materialize Output: j2.id -> Seq Scan on public.j2 - Output: j2.id + Output: j2.id (14 rows) -- ensure a distinct clause allows the inner to become unique @@ -6645,18 +6645,18 @@ explain select t3.b from nestloop_suppression1 t1, nestloop_suppression2 t2, nes where t1.b=2 and t1.c=3 and t1.d like 'char%' and t1.a=t2.a and t3.b>t2.a; QUERY PLAN ------------------------------------------------------------------------------------------------------------------------------ - Nested Loop (cost=200.16..402.39 rows=33 width=4) + Nested Loop (cost=200.16..371.39 rows=33 width=4) Join Filter: (t3.b > t2.a) - -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=100.16..280.69 rows=1 width=4) - -> Nested Loop (cost=0.16..180.68 rows=1 width=4) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=100.16..268.69 rows=1 width=4) + -> Nested Loop (cost=0.16..168.68 rows=1 width=4) Join Filter: (t1.a = t2.a) -> Index Scan using idx_nestloop_suppression1_b on nestloop_suppression1 t1 (cost=0.16..8.18 rows=1 width=4) Index Cond: (b = 2) Filter: (((d)::text ~~ 'char%'::text) AND (c = 3)) - -> Seq Scan on nestloop_suppression2 t2 (cost=0.00..110.00 rows=5000 width=4) - -> Materialize (cost=100.00..121.08 rows=50 width=4) - -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=100.00..120.95 rows=50 width=4) - -> Seq Scan on nestloop_suppression3 t3 (cost=0.00..20.50 rows=50 width=4) + -> Seq Scan on nestloop_suppression2 t2 (cost=0.00..98.00 rows=5000 width=4) + -> Materialize (cost=100.00..102.08 rows=50 width=4) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=100.00..101.95 rows=50 width=4) + -> Seq Scan on nestloop_suppression3 t3 (cost=0.00..1.50 rows=50 width=4) (12 rows) set enable_nestloop_suppression = true; @@ -6664,19 +6664,19 @@ explain select t3.b from nestloop_suppression1 t1, nestloop_suppression2 t2, nes where t1.b=2 and t1.c=3 and t1.d like 'char%' and t1.a=t2.a and t3.b>t2.a; QUERY PLAN ------------------------------------------------------------------------------------------------------------------------------------ - Nested Loop (cost=200.16..414.89 rows=33 width=4) + Nested Loop (cost=200.16..383.89 rows=33 width=4) Join Filter: (t3.b > t2.a) - -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=100.16..293.19 rows=1 width=4) - -> Nested Loop (cost=0.16..193.19 rows=1 width=4) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=100.16..281.19 rows=1 width=4) + -> Nested Loop (cost=0.16..181.19 rows=1 width=4) Join Filter: (t1.a 
= t2.a) - -> Seq Scan on nestloop_suppression2 t2 (cost=0.00..110.00 rows=5000 width=4) + -> Seq Scan on nestloop_suppression2 t2 (cost=0.00..98.00 rows=5000 width=4) -> Materialize (cost=0.16..8.19 rows=1 width=4) -> Index Scan using idx_nestloop_suppression1_b on nestloop_suppression1 t1 (cost=0.16..8.18 rows=1 width=4) Index Cond: (b = 2) Filter: (((d)::text ~~ 'char%'::text) AND (c = 3)) - -> Materialize (cost=100.00..121.08 rows=50 width=4) - -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=100.00..120.95 rows=50 width=4) - -> Seq Scan on nestloop_suppression3 t3 (cost=0.00..20.50 rows=50 width=4) + -> Materialize (cost=100.00..102.08 rows=50 width=4) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=100.00..101.95 rows=50 width=4) + -> Seq Scan on nestloop_suppression3 t3 (cost=0.00..1.50 rows=50 width=4) (13 rows) drop table nestloop_suppression1; diff --git a/src/test/regress/expected/partition_join_2.out b/src/test/regress/expected/partition_join_2.out index 2ae2b8a2..d2435f12 100644 --- a/src/test/regress/expected/partition_join_2.out +++ b/src/test/regress/expected/partition_join_2.out @@ -396,8 +396,8 @@ SELECT * FROM prt1 t1 LEFT JOIN LATERAL -> Hash Join Hash Cond: (t3.b = a) -> Append - -> Index Scan using iprt2_p1_b on prt2_p1 t3 - -> Index Scan using iprt2_p2_b on prt2_p2 t3_1 + -> Seq Scan on prt2_p1 t3 + -> Seq Scan on prt2_p2 t3_1 -> Index Scan using iprt2_p3_b on prt2_p3 t3_2 -> Hash -> Remote Subquery Scan on all (datanode_1,datanode_2) @@ -743,48 +743,38 @@ SELECT t1.a, t1.phv, t2.b, t2.phv, t3.a + t3.b, t3.phv FROM ((SELECT 50 phv, * F -- Semi-join EXPLAIN (COSTS OFF) SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t1.b FROM prt2 t1, prt1_e t2 WHERE t1.a = 0 AND t1.b = (t2.a + t2.b)/2) AND t1.b = 0 ORDER BY t1.a; - QUERY PLAN ------------------------------------------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------------------------------ Remote Subquery Scan on all (datanode_1,datanode_2) - -> Merge Join - Merge Cond: (a = b) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: a - -> Sort - Sort Key: t1.a - -> Append - -> Seq Scan on prt1_p1 t1 - Filter: (b = 0) - -> Seq Scan on prt1_p2 t1_1 - Filter: (b = 0) - -> Seq Scan on prt1_p3 t1_2 - Filter: (b = 0) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: b - -> Sort - Sort Key: b - -> HashAggregate - Group Key: b - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: b - -> HashAggregate - Group Key: b - -> Hash Join - Hash Cond: (((t2.a + t2.b) / 2) = b) - -> Append - -> Seq Scan on prt1_e_p1 t2 - -> Seq Scan on prt1_e_p2 t2_1 - -> Seq Scan on prt1_e_p3 t2_2 - -> Hash - -> Remote Subquery Scan on all (datanode_2) - -> Append - -> Seq Scan on prt2_p1 t1_3 - Filter: (a = 0) - -> Seq Scan on prt2_p2 t1_4 - Filter: (a = 0) - -> Seq Scan on prt2_p3 t1_5 - Filter: (a = 0) -(39 rows) + -> Nested Loop Semi Join + Join Filter: (t1.a = b) + -> Merge Append + Sort Key: t1.a + -> Index Scan using iprt1_p1_a on prt1_p1 t1 + Filter: (b = 0) + -> Index Scan using iprt1_p2_a on prt1_p2 t1_1 + Filter: (b = 0) + -> Index Scan using iprt1_p3_a on prt1_p3 t1_2 + Filter: (b = 0) + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b + -> Hash Join + Hash Cond: (((t2.a + t2.b) / 2) = b) + -> Append + -> Seq Scan on prt1_e_p1 t2 + -> Seq Scan on prt1_e_p2 t2_1 + -> Seq Scan on prt1_e_p3 t2_2 + -> 
Hash + -> Remote Subquery Scan on all (datanode_2) + -> Append + -> Seq Scan on prt2_p1 t1_3 + Filter: (a = 0) + -> Seq Scan on prt2_p2 t1_4 + Filter: (a = 0) + -> Seq Scan on prt2_p3 t1_5 + Filter: (a = 0) +(29 rows) SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t1.b FROM prt2 t1, prt1_e t2 WHERE t1.a = 0 AND t1.b = (t2.a + t2.b)/2) AND t1.b = 0 ORDER BY t1.a; a | b | c @@ -800,23 +790,19 @@ SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t1.b FROM prt2 t1 WHERE t1.b IN ( QUERY PLAN ----------------------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) - -> Merge Semi Join - Merge Cond: (a = b) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: a - -> Sort - Sort Key: t1.a - -> Append - -> Seq Scan on prt1_p1 t1 - Filter: (b = 0) - -> Seq Scan on prt1_p2 t1_1 - Filter: (b = 0) - -> Seq Scan on prt1_p3 t1_2 - Filter: (b = 0) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: b - -> Sort - Sort Key: t1_3.b + -> Nested Loop Semi Join + Join Filter: (t1.a = b) + -> Merge Append + Sort Key: t1.a + -> Index Scan using iprt1_p1_a on prt1_p1 t1 + Filter: (b = 0) + -> Index Scan using iprt1_p2_a on prt1_p2 t1_1 + Filter: (b = 0) + -> Index Scan using iprt1_p3_a on prt1_p3 t1_2 + Filter: (b = 0) + -> Materialize + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: b -> Hash Semi Join Hash Cond: (t1_3.b = ((a + b) / 2)) -> Append @@ -832,7 +818,7 @@ SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t1.b FROM prt2 t1 WHERE t1.b IN ( Filter: (c = 0) -> Seq Scan on prt1_e_p3 t1_8 Filter: (c = 0) -(33 rows) +(29 rows) SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t1.b FROM prt2 t1 WHERE t1.b IN (SELECT (t1.a + t1.b)/2 FROM prt1_e t1 WHERE t1.c = 0)) AND t1.b = 0 ORDER BY t1.a; a | b | c @@ -1165,26 +1151,26 @@ SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a = 1 AND a = 2) t1 EXPLAIN (COSTS OFF) SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a = 1 AND a = 2) t1 RIGHT JOIN prt2 t2 ON t1.a = t2.b, prt1 t3 WHERE t2.b = t3.a; - QUERY PLAN ------------------------------------------------------------------ - Hash Join + QUERY PLAN +----------------------------------------------------------------------- + Hash Left Join Hash Cond: (b = a) - -> Hash Left Join - Hash Cond: (b = a) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Append - -> Seq Scan on prt2_p1 t2 - -> Seq Scan on prt2_p2 t2_1 - -> Seq Scan on prt2_p3 t2_2 - -> Hash - -> Result - One-Time Filter: false + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Hash Join + Hash Cond: (a = t2.b) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on prt1_p1 t3 + -> Seq Scan on prt1_p2 t3_1 + -> Seq Scan on prt1_p3 t3_2 + -> Hash + -> Append + -> Seq Scan on prt2_p1 t2 + -> Seq Scan on prt2_p2 t2_1 + -> Seq Scan on prt2_p3 t2_2 -> Hash - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Append - -> Seq Scan on prt1_p1 t3 - -> Seq Scan on prt1_p2 t3_1 - -> Seq Scan on prt1_p3 t3_2 + -> Result + One-Time Filter: false (18 rows) EXPLAIN (COSTS OFF) @@ -1764,27 +1750,27 @@ SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_n t1 LEFT JOIN prt2_n t2 ON (t1.c = t2.c EXPLAIN (COSTS OFF) SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_n t1 JOIN prt2_n t2 ON (t1.c = t2.c) JOIN plt1 t3 ON (t1.c = t3.c); - QUERY PLAN ------------------------------------------------------------------------------------ + QUERY PLAN 
+----------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) -> Hash Join - Hash Cond: (c = (c)::text) + Hash Cond: (c = (t1.c)::text) -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Append - -> Seq Scan on plt1_p1 t3 - -> Seq Scan on plt1_p2 t3_1 - -> Seq Scan on plt1_p3 t3_2 + -> Seq Scan on prt2_n_p1 t2 + -> Seq Scan on prt2_n_p2 t2_1 -> Hash -> Hash Join - Hash Cond: (t2.c = (c)::text) - -> Append - -> Seq Scan on prt2_n_p1 t2 - -> Seq Scan on prt2_n_p2 t2_1 + Hash Cond: (c = (t1.c)::text) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Append + -> Seq Scan on plt1_p1 t3 + -> Seq Scan on plt1_p2 t3_1 + -> Seq Scan on plt1_p3 t3_2 -> Hash - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Append - -> Seq Scan on prt1_n_p1 t1 - -> Seq Scan on prt1_n_p2 t1_1 + -> Append + -> Seq Scan on prt1_n_p1 t1 + -> Seq Scan on prt1_n_p2 t1_1 (19 rows) -- partition-wise join can not be applied for a join between list and range diff --git a/src/test/regress/expected/sanity_check.out b/src/test/regress/expected/sanity_check.out index 8bea6498..82b95aa8 100644 --- a/src/test/regress/expected/sanity_check.out +++ b/src/test/regress/expected/sanity_check.out @@ -60,6 +60,7 @@ inet_tbl|t inhf|f inhx|t insert_tbl|f +insertwithret|f int2_tbl|f int4_tbl|f int8_tbl|f diff --git a/src/test/regress/expected/sequence.out b/src/test/regress/expected/sequence.out index d510c4f0..c830a7f7 100644 --- a/src/test/regress/expected/sequence.out +++ b/src/test/regress/expected/sequence.out @@ -880,4 +880,4 @@ select gsk_key from pg_list_storage_sequence() where gsk_key like '%db_seq1_bak. db_seq1_bak.public.t3_f1_seq (3 rows) -\q \ No newline at end of file +\q diff --git a/src/test/regress/expected/stats_ext_2.out b/src/test/regress/expected/stats_ext_2.out index 16b06053..3a412ef4 100644 --- a/src/test/regress/expected/stats_ext_2.out +++ b/src/test/regress/expected/stats_ext_2.out @@ -659,10 +659,10 @@ EXPLAIN SELECT count(*) FROM subset WHERE b = 'prefix_1' and c = 1; QUERY PLAN ------------------------------------------------------------------------------------------------- - Finalize Aggregate (cost=177.52..177.53 rows=1 width=8) - -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=177.50..177.52 rows=1 width=0) - -> Partial Aggregate (cost=77.50..77.51 rows=1 width=8) - -> Seq Scan on subset (cost=0.00..77.50 rows=1 width=0) + Finalize Aggregate (cost=163.52..163.53 rows=1 width=8) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=163.50..163.52 rows=1 width=0) + -> Partial Aggregate (cost=63.50..63.51 rows=1 width=8) + -> Seq Scan on subset (cost=0.00..63.50 rows=1 width=0) Filter: ((b = 'prefix_1'::text) AND (c = 1)) (5 rows) @@ -680,10 +680,10 @@ EXPLAIN SELECT count(*) FROM subset WHERE b = 'prefix_1' and c = 1; QUERY PLAN ------------------------------------------------------------------------------------------------- - Finalize Aggregate (cost=177.64..177.65 rows=1 width=8) - -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=177.62..177.64 rows=1 width=0) - -> Partial Aggregate (cost=77.62..77.64 rows=1 width=8) - -> Seq Scan on subset (cost=0.00..77.50 rows=50 width=0) + Finalize Aggregate (cost=163.64..163.65 rows=1 width=8) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=163.62..163.64 rows=1 width=0) + -> Partial Aggregate (cost=63.62..63.63 rows=1 width=8) + -> Seq Scan on subset (cost=0.00..63.50 rows=50 width=0) Filter: ((b = 'prefix_1'::text) AND (c 
= 1)) (5 rows) @@ -698,10 +698,10 @@ EXPLAIN SELECT count(*) FROM subset WHERE b like '%_1' and c = 1; QUERY PLAN ------------------------------------------------------------------------------------------------- - Finalize Aggregate (cost=177.53..177.54 rows=1 width=8) - -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=177.51..177.53 rows=1 width=0) - -> Partial Aggregate (cost=77.51..77.52 rows=1 width=8) - -> Seq Scan on subset (cost=0.00..77.50 rows=5 width=0) + Finalize Aggregate (cost=163.53..163.54 rows=1 width=8) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=163.51..163.53 rows=1 width=0) + -> Partial Aggregate (cost=63.51..63.52 rows=1 width=8) + -> Seq Scan on subset (cost=0.00..63.50 rows=5 width=0) Filter: ((b ~~ '%_1'::text) AND (c = 1)) (5 rows) @@ -722,10 +722,10 @@ EXPLAIN SELECT count(*) FROM subset WHERE b like '%_1' and c = 1; QUERY PLAN ------------------------------------------------------------------------------------------------- - Finalize Aggregate (cost=177.64..177.65 rows=1 width=8) - -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=177.62..177.64 rows=1 width=0) - -> Partial Aggregate (cost=77.62..77.64 rows=1 width=8) - -> Seq Scan on subset (cost=0.00..77.50 rows=50 width=0) + Finalize Aggregate (cost=163.64..163.65 rows=1 width=8) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=163.62..163.64 rows=1 width=0) + -> Partial Aggregate (cost=63.62..63.63 rows=1 width=8) + -> Seq Scan on subset (cost=0.00..63.50 rows=50 width=0) Filter: ((b ~~ '%_1'::text) AND (c = 1)) (5 rows) diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out index 4c0e6f5c..d2150f63 100644 --- a/src/test/regress/expected/sysviews.out +++ b/src/test/regress/expected/sysviews.out @@ -143,7 +143,7 @@ select name, setting from pg_settings where name like 'enable%'; enable_transparent_crypt | on enable_user_authority_force_check | off enable_xlog_mprotect | on -(70 rows) +(71 rows) -- Test that the pg_timezone_names and pg_timezone_abbrevs views are -- more-or-less working. 
We can't test their contents in any great detail diff --git a/src/test/regress/expected/tbase_explain.out b/src/test/regress/expected/tbase_explain.out index d91ef65e..8b56b4d6 100644 --- a/src/test/regress/expected/tbase_explain.out +++ b/src/test/regress/expected/tbase_explain.out @@ -377,17 +377,13 @@ select * from a1 where num >= (select count(*) from a2 where name='c') limit 1; -> Remote Subquery Scan on all (datanode_1,datanode_2) (actual rows=1 loops=1) Output: a1.id, a1.num, a1.name -> Limit - DN (actual rows=1..1 loops=1..1) - - datanode_1 (actual rows=1 loops=1) - - datanode_2 (actual rows=1 loops=1) + DN (never executed) Output: a1.id, a1.num, a1.name -> Seq Scan on public.a1 - DN (actual rows=1..1 loops=1..1) - - datanode_1 (actual rows=1 loops=1) - - datanode_2 (actual rows=1 loops=1) + DN (never executed) Output: a1.id, a1.num, a1.name Filter: (a1.num >= $0) -(31 rows) +(27 rows) explain (costs off,timing off,summary off,analyze,verbose) select count(*) from a1 group by name having count(*) = (select count(*) from a2 where name='a'); From e6a1711b773ca857646d84b1d24a6cd755f62840 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Mon, 4 Jul 2022 19:24:54 +0800 Subject: [PATCH 503/578] fix regress error 2 --- .../expected/redistribute_custom_types.out | 46 +++++++++---------- src/test/regress/parallel_schedule | 1 - 2 files changed, 23 insertions(+), 24 deletions(-) diff --git a/src/test/regress/expected/redistribute_custom_types.out b/src/test/regress/expected/redistribute_custom_types.out index 2ae77a18..24d9ece5 100644 --- a/src/test/regress/expected/redistribute_custom_types.out +++ b/src/test/regress/expected/redistribute_custom_types.out @@ -17,16 +17,16 @@ insert into enum_test(a,b) values(5,'Fri'); insert into enum_test(a,b) values(6,'Sat'); insert into enum_test(a,b) values(7,'Sun'); explain select count(*) from enum_test where a < 100 group by b; - QUERY PLAN ---------------------------------------------------------------------------------------------------------- - Remote Subquery Scan on all (datanode_1,datanode_2) (cost=132.87..134.74 rows=187 width=12) - -> Finalize HashAggregate (cost=132.87..134.74 rows=187 width=12) + QUERY PLAN +-------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) (cost=116.44..117.38 rows=94 width=12) + -> Finalize HashAggregate (cost=116.44..117.38 rows=94 width=12) Group Key: b - -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=129.12..131.93 rows=187 width=0) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=114.56..115.97 rows=94 width=0) Distribute results by S: b - -> Partial HashAggregate (cost=29.12..31.00 rows=187 width=12) + -> Partial HashAggregate (cost=14.56..15.50 rows=94 width=12) Group Key: b - -> Seq Scan on enum_test (cost=0.00..26.88 rows=450 width=4) + -> Seq Scan on enum_test (cost=0.00..13.44 rows=225 width=4) Filter: (a < 100) (9 rows) @@ -57,20 +57,20 @@ insert into comptype_test(a,b) values(4,(4,5)); insert into comptype_test(a,b) values(5,(5,6)); insert into comptype_test(a,b) values(6,(6,7)); explain select count(*) from comptype_test where a < 100 group by b; - QUERY PLAN ---------------------------------------------------------------------------------------------------------------- - Remote Subquery Scan on all (datanode_1,datanode_2) (cost=145.27..147.81 rows=169 width=40) - -> Finalize GroupAggregate (cost=145.27..147.81 rows=169 width=40) + QUERY PLAN 
+-------------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) (cost=121.50..122.77 rows=85 width=40) + -> Finalize GroupAggregate (cost=121.50..122.77 rows=85 width=40) Group Key: b - -> Sort (cost=145.27..145.70 rows=169 width=0) + -> Sort (cost=121.50..121.71 rows=85 width=0) Sort Key: b - -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=134.18..139.02 rows=169 width=0) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=116.35..118.78 rows=85 width=0) Distribute results by S: b - -> Partial GroupAggregate (cost=34.18..38.17 rows=169 width=40) + -> Partial GroupAggregate (cost=16.35..18.35 rows=85 width=40) Group Key: b - -> Sort (cost=34.18..34.95 rows=307 width=32) + -> Sort (cost=16.35..16.73 rows=154 width=32) Sort Key: b - -> Seq Scan on comptype_test (cost=0.00..21.50 rows=307 width=32) + -> Seq Scan on comptype_test (cost=0.00..10.75 rows=154 width=32) Filter: (a < 100) (13 rows) @@ -100,16 +100,16 @@ insert into domaintype_test(a,b) values(4,4); insert into domaintype_test(a,b) values(5,5); insert into domaintype_test(a,b) values(6,6); explain select count(*) from domaintype_test where a < 100 group by b; - QUERY PLAN ---------------------------------------------------------------------------------------------------------- - Remote Subquery Scan on all (datanode_1,datanode_2) (cost=132.87..134.74 rows=187 width=12) - -> Finalize HashAggregate (cost=132.87..134.74 rows=187 width=12) + QUERY PLAN +-------------------------------------------------------------------------------------------------------- + Remote Subquery Scan on all (datanode_1,datanode_2) (cost=116.44..117.38 rows=94 width=12) + -> Finalize HashAggregate (cost=116.44..117.38 rows=94 width=12) Group Key: b - -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=129.12..131.93 rows=187 width=0) + -> Remote Subquery Scan on all (datanode_1,datanode_2) (cost=114.56..115.97 rows=94 width=0) Distribute results by S: b - -> Partial HashAggregate (cost=29.12..31.00 rows=187 width=12) + -> Partial HashAggregate (cost=14.56..15.50 rows=94 width=12) Group Key: b - -> Seq Scan on domaintype_test (cost=0.00..26.88 rows=450 width=4) + -> Seq Scan on domaintype_test (cost=0.00..13.44 rows=225 width=4) Filter: (a < 100) (9 rows) diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule index 036a73c3..782b692b 100644 --- a/src/test/regress/parallel_schedule +++ b/src/test/regress/parallel_schedule @@ -166,4 +166,3 @@ test: xl_primary_key xl_foreign_key xl_distribution_column_types xl_alter_table test: tbase_explain test: redistribute_custom_types pl_bugs -test: nestloop_by_shard From 1fc49a746f8845fdb884252fc1b60250c7b3eaf4 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Mon, 4 Jul 2022 19:34:29 +0800 Subject: [PATCH 504/578] update TBASE_VERSION_STR to TBase_V2.4.0_release --- src/backend/utils/adt/version.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/utils/adt/version.c b/src/backend/utils/adt/version.c index b11ef63b..2cf52a05 100644 --- a/src/backend/utils/adt/version.c +++ b/src/backend/utils/adt/version.c @@ -78,7 +78,7 @@ #include "utils/builtins.h" -#define TBASE_VERSION_STR "TBase_master" +#define TBASE_VERSION_STR "TBase_V2.4.0_release" Datum pgsql_version(PG_FUNCTION_ARGS) From ad01198a1c8f5799928d2c42167bac02672bab19 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Tue, 5 Jul 2022 14:41:06 +0800 Subject: [PATCH 505/578] 
add v2.4.0-release note --- v2.4.0-release-note.txt | 56 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) create mode 100644 v2.4.0-release-note.txt diff --git a/v2.4.0-release-note.txt b/v2.4.0-release-note.txt new file mode 100644 index 00000000..3e8ec6d4 --- /dev/null +++ b/v2.4.0-release-note.txt @@ -0,0 +1,56 @@ +V2.4.0-release 版本主要修改集中在: +1、分布式计算性能提升: +1)hasAggs/having/sort/limit/Stable function/ 优化下推到DN上执行,性能提升近百倍 +2)FQS查询下推能力增强:分布key计算返回单条结果时进行sql下推,性能提升约20倍 +3)分布式进程ProcLock 分段锁优化,减少锁冲突,执行效率提升5倍左右 +4)GTM、CN、DN 同台机器部署时,通信使用unix domain socket,性能提升30% +5)执行计划优化:重分布的节点数为1时跳过remote算子,精简执行计划 +6)复制表下推DN策略调整:连接数过多时数据上拉到CN节点,连接数少时下推DN,节省网络资源 +7)优化全局 sequence 获取性能提升约20% + + +2、2PC事务优化: +1)死锁检查性能优化:a)批量获取gxid; b)遍历查找gxid修改为二分查找;c)增加try轮次限制 +2)创建内存hash表,减少2pc磁盘文件句柄的创建 +3)增加2PC cleaner进程自动清理功能 +4)隐式事务DN不自动提交,避免出现部分提交的现象 +5)2PC添加保护模式,杜绝部分提交的情况 +6)优化GTS获取流程,保证每次获取到最新的结果 +7)drop database 增加prepare过程,确保不会出现节点失败残留 + + +3、高可用能力加强: +1)业务正常运行时允许添加DN节点 +2)GTM 主备切换时,2PC事务可以正常进行 +3)GTM 主备切换能力增强,主备同时crash时,备机起来可以自主发起生主操作 +4)GTM高可用优化,在备机恢复时如果xlog与主机相差太大时直接采用重做备机的方式 +5)GTM备机高可用优化,添加备连接主超时时间 + + +4、易用性增强: +1)支持读写分离的读平面修改系统参数,加强了用户对系统的控制力 +2)增加 pooler 多线程日志功能,方便用户进行问题分析 +3)GTM 日志优化,日志汇聚一个文件,并自动拆分活跃日志,方便用户随时查看 +4)支持 gtm_ctl -l 指定日志文件路径启动,方便用户对日志进行管理 +5)全局session视图优化 使用usename,datname,替换oid,展现信息更加直观,可读性强 + + +5、新特性支持: +1)dblink支持copy功能,批量query功能 +2)自研分区表剪枝,添加IN语法剪枝,提前加速剪枝 +3)解除存储过程中累计事务最大为64个的限制 +4)支持带有数据shuffle的并发更新能力 +5)数据shuffle支持用户自定义函数 +6)允许删除带有分布式外键约束的主表的列 +7)全局session安全加固,防止SQL注入 + + +6、已知问题修复: +1)存储过程/触发器中死锁、子事务回滚异常问题解决 +2)全局session活跃视图死锁问题解决 +3)分布式网络通信死锁问题解决 +4)扩展协议带有return 的insert语句不能返回数据问题修复 +5)物化视图并发刷新问题解决 +6)复杂sql并发更新报错问题解决 +7)存在多层并行gather算子时查询异常问题解决 +8)GTM 备机内存泄露问题解决 \ No newline at end of file From 2b65bb7760fb50dc877909737eb235fee6a381ac Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Tue, 5 Jul 2022 14:52:42 +0800 Subject: [PATCH 506/578] update TBASE_VERSION_STR --- src/backend/utils/adt/version.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/utils/adt/version.c b/src/backend/utils/adt/version.c index 2cf52a05..b11ef63b 100644 --- a/src/backend/utils/adt/version.c +++ b/src/backend/utils/adt/version.c @@ -78,7 +78,7 @@ #include "utils/builtins.h" -#define TBASE_VERSION_STR "TBase_V2.4.0_release" +#define TBASE_VERSION_STR "TBase_master" Datum pgsql_version(PG_FUNCTION_ARGS) From 4887e3a62a4d72ab7b04ad538cfcfb8bbf9cc9c7 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Wed, 13 Jul 2022 15:22:28 +0800 Subject: [PATCH 507/578] add v2.4.0-release note --- v2.4.0-release-note.txt | 63 +++++++++-------------------------------- 1 file changed, 14 insertions(+), 49 deletions(-) diff --git a/v2.4.0-release-note.txt b/v2.4.0-release-note.txt index 3e8ec6d4..b3df61df 100644 --- a/v2.4.0-release-note.txt +++ b/v2.4.0-release-note.txt @@ -1,56 +1,21 @@ V2.4.0-release 版本主要修改集中在: 1、分布式计算性能提升: -1)hasAggs/having/sort/limit/Stable function/ 优化下推到DN上执行,性能提升近百倍 -2)FQS查询下推能力增强:分布key计算返回单条结果时进行sql下推,性能提升约20倍 -3)分布式进程ProcLock 分段锁优化,减少锁冲突,执行效率提升5倍左右 -4)GTM、CN、DN 同台机器部署时,通信使用unix domain socket,性能提升30% -5)执行计划优化:重分布的节点数为1时跳过remote算子,精简执行计划 -6)复制表下推DN策略调整:连接数过多时数据上拉到CN节点,连接数少时下推DN,节省网络资源 -7)优化全局 sequence 获取性能提升约20% - +● hasAggs/having/sort/limit/Stable function/ 优化下推到DN上执行,性能提升近百倍。 +● 分布式进程ProcLock分段锁优化,减少锁冲突,执行效率提升约5倍。 +● 执行计划优化:重分布的节点数为1时跳过remote算子,精简执行计划。 2、2PC事务优化: -1)死锁检查性能优化:a)批量获取gxid; b)遍历查找gxid修改为二分查找;c)增加try轮次限制 -2)创建内存hash表,减少2pc磁盘文件句柄的创建 -3)增加2PC cleaner进程自动清理功能 -4)隐式事务DN不自动提交,避免出现部分提交的现象 -5)2PC添加保护模式,杜绝部分提交的情况 -6)优化GTS获取流程,保证每次获取到最新的结果 -7)drop 
database 增加prepare过程,确保不会出现节点失败残留 - - -3、高可用能力加强: -1)业务正常运行时允许添加DN节点 -2)GTM 主备切换时,2PC事务可以正常进行 -3)GTM 主备切换能力增强,主备同时crash时,备机起来可以自主发起生主操作 -4)GTM高可用优化,在备机恢复时如果xlog与主机相差太大时直接采用重做备机的方式 -5)GTM备机高可用优化,添加备连接主超时时间 - - -4、易用性增强: -1)支持读写分离的读平面修改系统参数,加强了用户对系统的控制力 -2)增加 pooler 多线程日志功能,方便用户进行问题分析 -3)GTM 日志优化,日志汇聚一个文件,并自动拆分活跃日志,方便用户随时查看 -4)支持 gtm_ctl -l 指定日志文件路径启动,方便用户对日志进行管理 -5)全局session视图优化 使用usename,datname,替换oid,展现信息更加直观,可读性强 - +● 死锁检查性能优化:批量获取gxid;遍历查找gxid修改为二分查找;增加try轮次限制。 +● 2PC添加保护模式,杜绝部分提交的情况。 +● drop database 增加prepare过程,确保不会出现节点失败残留。 -5、新特性支持: -1)dblink支持copy功能,批量query功能 -2)自研分区表剪枝,添加IN语法剪枝,提前加速剪枝 -3)解除存储过程中累计事务最大为64个的限制 -4)支持带有数据shuffle的并发更新能力 -5)数据shuffle支持用户自定义函数 -6)允许删除带有分布式外键约束的主表的列 -7)全局session安全加固,防止SQL注入 +3、易用性增强: +● 增加pooler多线程日志功能,方便用户进行问题分析。 +● GTM日志优化,日志汇聚一个文件,并自动拆分活跃日志,方便用户随时查看。 +● 支持gtm_ctl -l指定日志文件路径启动,方便用户对日志进行管理。 +4、新特性支持: +● 自研分区表剪枝,添加IN语法剪枝,提前加速剪枝。 +● 数据shuffle支持用户自定义函数。 +● 允许删除带有分布式外键约束的主表的列。 -6、已知问题修复: -1)存储过程/触发器中死锁、子事务回滚异常问题解决 -2)全局session活跃视图死锁问题解决 -3)分布式网络通信死锁问题解决 -4)扩展协议带有return 的insert语句不能返回数据问题修复 -5)物化视图并发刷新问题解决 -6)复杂sql并发更新报错问题解决 -7)存在多层并行gather算子时查询异常问题解决 -8)GTM 备机内存泄露问题解决 \ No newline at end of file From de97b2f2bba5b8d865140387349dcd3dbec9ef96 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Thu, 22 Mar 2018 15:47:29 -0400 Subject: [PATCH 508/578] Sync up our various ways of estimating pg_class.reltuples. VACUUM thought that reltuples represents the total number of tuples in the relation, while ANALYZE counted only live tuples. This can cause "flapping" in the value when background vacuums and analyzes happen separately. The planner's use of reltuples essentially assumes that it's the count of live (visible) tuples, so let's standardize on having it mean live tuples. Another issue is that the definition of "live tuple" isn't totally clear; what should be done with INSERT_IN_PROGRESS or DELETE_IN_PROGRESS tuples? ANALYZE's choices in this regard are made on the assumption that if the originating transaction commits at all, it will happen after ANALYZE finishes, so we should ignore the effects of the in-progress transaction --- unless it is our own transaction, and then we should count it. Let's propagate this definition into VACUUM, too. Likewise propagate this definition into CREATE INDEX, and into contrib/pgstattuple's pgstattuple_approx() function. 
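To make the counting rule above concrete, here is a minimal illustrative sketch (not code from this patch; the helper name is invented, though the enum values and header macros are the standard PostgreSQL ones) of how a tuple would be classified as "live" for reltuples purposes, assuming HeapTupleSatisfiesVacuum() has already been run on it:

/*
 * Illustrative sketch only -- not part of this patch.  Decide whether a
 * tuple counts toward reltuples under the rule described above:
 * in-progress inserts and deletes are ignored unless they were made by
 * our own transaction.
 */
static bool
tuple_counts_as_live(HTSV_Result state, HeapTuple tuple)
{
	switch (state)
	{
		case HEAPTUPLE_LIVE:
			return true;

		case HEAPTUPLE_DEAD:
		case HEAPTUPLE_RECENTLY_DEAD:
			return false;

		case HEAPTUPLE_INSERT_IN_PROGRESS:
			/* count the insert only if it is our own */
			return TransactionIdIsCurrentTransactionId(
				HeapTupleHeaderGetXmin(tuple->t_data));

		case HEAPTUPLE_DELETE_IN_PROGRESS:
			/* ignore the delete unless it is our own */
			return !TransactionIdIsCurrentTransactionId(
				HeapTupleHeaderGetUpdateXid(tuple->t_data));

		default:
			return false;
	}
}

VACUUM, ANALYZE, CREATE INDEX and pgstattuple_approx() each embed an equivalent of this decision inline rather than calling a shared helper.
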
Tomas Vondra, reviewed by Haribabu Kommi, some corrections by me Discussion: https://postgr.es/m/16db4468-edfa-830a-f921-39a50498e77e@2ndquadrant.com --- contrib/pgstattuple/pgstatapprox.c | 444 +++++++++++++++-------------- doc/src/sgml/catalogs.sgml | 4 +- src/backend/catalog/index.c | 52 +++- src/backend/commands/vacuum.c | 6 +- src/backend/commands/vacuumlazy.c | 84 ++++-- 5 files changed, 334 insertions(+), 256 deletions(-) diff --git a/contrib/pgstattuple/pgstatapprox.c b/contrib/pgstattuple/pgstatapprox.c index 13ce7f99..21bda0e7 100644 --- a/contrib/pgstattuple/pgstatapprox.c +++ b/contrib/pgstattuple/pgstatapprox.c @@ -1,12 +1,12 @@ /*------------------------------------------------------------------------- * * pgstatapprox.c - * Bloat estimation functions + * Bloat estimation functions * * Copyright (c) 2014-2017, PostgreSQL Global Development Group * * IDENTIFICATION - * contrib/pgstattuple/pgstatapprox.c + * contrib/pgstattuple/pgstatapprox.c * *------------------------------------------------------------------------- */ @@ -31,20 +31,20 @@ PG_FUNCTION_INFO_V1(pgstattuple_approx); PG_FUNCTION_INFO_V1(pgstattuple_approx_v1_5); -Datum pgstattuple_approx_internal(Oid relid, FunctionCallInfo fcinfo); +Datum pgstattuple_approx_internal(Oid relid, FunctionCallInfo fcinfo); typedef struct output_type { - uint64 table_len; - uint64 scanned_percent; - uint64 tuple_count; - uint64 tuple_len; - double tuple_percent; - uint64 dead_tuple_count; - uint64 dead_tuple_len; - double dead_tuple_percent; - uint64 free_space; - double free_percent; + uint64 table_len; + uint64 scanned_percent; + uint64 tuple_count; + uint64 tuple_len; + double tuple_percent; + uint64 dead_tuple_count; + uint64 dead_tuple_len; + double dead_tuple_percent; + uint64 free_space; + double free_percent; } output_type; #define NUM_OUTPUT_COLUMNS 10 @@ -62,147 +62,153 @@ typedef struct output_type static void statapprox_heap(Relation rel, output_type *stat) { - BlockNumber scanned, - nblocks, - blkno; - Buffer vmbuffer = InvalidBuffer; - BufferAccessStrategy bstrategy; - TransactionId OldestXmin; - uint64 misc_count = 0; - - OldestXmin = GetOldestXmin(rel, PROCARRAY_FLAGS_VACUUM); - bstrategy = GetAccessStrategy(BAS_BULKREAD); - - nblocks = RelationGetNumberOfBlocks(rel); - scanned = 0; - - for (blkno = 0; blkno < nblocks; blkno++) - { - Buffer buf; - Page page; - OffsetNumber offnum, - maxoff; - Size freespace; - - CHECK_FOR_INTERRUPTS(); - - /* - * If the page has only visible tuples, then we can find out the free - * space from the FSM and move on. - */ - if (VM_ALL_VISIBLE(rel, blkno, &vmbuffer)) - { - freespace = GetRecordedFreeSpace(rel, blkno); - stat->tuple_len += BLCKSZ - freespace; - stat->free_space += freespace; - continue; - } - - buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, - RBM_NORMAL, bstrategy); - - LockBuffer(buf, BUFFER_LOCK_SHARE); - - page = BufferGetPage(buf); - - /* - * It's not safe to call PageGetHeapFreeSpace() on new pages, so we - * treat them as being free space for our purposes. - */ - if (!PageIsNew(page)) - stat->free_space += PageGetHeapFreeSpace(page); - else - stat->free_space += BLCKSZ - SizeOfPageHeaderData; - - if (PageIsNew(page) || PageIsEmpty(page)) - { - UnlockReleaseBuffer(buf); - continue; - } - - scanned++; - - /* - * Look at each tuple on the page and decide whether it's live or - * dead, then count it and its size. Unlike lazy_scan_heap, we can - * afford to ignore problems and special cases. 
- */ - maxoff = PageGetMaxOffsetNumber(page); - - for (offnum = FirstOffsetNumber; - offnum <= maxoff; - offnum = OffsetNumberNext(offnum)) - { - ItemId itemid; - HeapTupleData tuple; - - itemid = PageGetItemId(page, offnum); - - if (!ItemIdIsUsed(itemid) || ItemIdIsRedirected(itemid) || - ItemIdIsDead(itemid)) - { - continue; - } - - Assert(ItemIdIsNormal(itemid)); - - ItemPointerSet(&(tuple.t_self), blkno, offnum); - - tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid); - tuple.t_len = ItemIdGetLength(itemid); - tuple.t_tableOid = RelationGetRelid(rel); - - /* - * We count live and dead tuples, but we also need to add up - * others in order to feed vac_estimate_reltuples. - */ - switch (HeapTupleSatisfiesVacuum(&tuple, OldestXmin, buf)) - { - case HEAPTUPLE_RECENTLY_DEAD: - misc_count++; - /* Fall through */ - case HEAPTUPLE_DEAD: - stat->dead_tuple_len += tuple.t_len; - stat->dead_tuple_count++; - break; - case HEAPTUPLE_LIVE: - stat->tuple_len += tuple.t_len; - stat->tuple_count++; - break; - case HEAPTUPLE_INSERT_IN_PROGRESS: - case HEAPTUPLE_DELETE_IN_PROGRESS: - misc_count++; - break; - default: - elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result"); - break; - } - } - - UnlockReleaseBuffer(buf); - } - - stat->table_len = (uint64) nblocks * BLCKSZ; - - stat->tuple_count = vac_estimate_reltuples(rel, false, nblocks, scanned, - stat->tuple_count + misc_count); - - /* - * Calculate percentages if the relation has one or more pages. - */ - if (nblocks != 0) - { - stat->scanned_percent = 100 * scanned / nblocks; - stat->tuple_percent = 100.0 * stat->tuple_len / stat->table_len; - stat->dead_tuple_percent = 100.0 * stat->dead_tuple_len / stat->table_len; - stat->free_percent = 100.0 * stat->free_space / stat->table_len; - } - - if (BufferIsValid(vmbuffer)) - { - ReleaseBuffer(vmbuffer); - vmbuffer = InvalidBuffer; - } + BlockNumber scanned, + nblocks, + blkno; + Buffer vmbuffer = InvalidBuffer; + BufferAccessStrategy bstrategy; + TransactionId OldestXmin; + + OldestXmin = GetOldestXmin(rel, PROCARRAY_FLAGS_VACUUM); + bstrategy = GetAccessStrategy(BAS_BULKREAD); + + nblocks = RelationGetNumberOfBlocks(rel); + scanned = 0; + + for (blkno = 0; blkno < nblocks; blkno++) + { + Buffer buf; + Page page; + OffsetNumber offnum, + maxoff; + Size freespace; + + CHECK_FOR_INTERRUPTS(); + + /* + * If the page has only visible tuples, then we can find out the free + * space from the FSM and move on. + */ + if (VM_ALL_VISIBLE(rel, blkno, &vmbuffer)) + { + freespace = GetRecordedFreeSpace(rel, blkno); + stat->tuple_len += BLCKSZ - freespace; + stat->free_space += freespace; + continue; + } + + buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, + RBM_NORMAL, bstrategy); + + LockBuffer(buf, BUFFER_LOCK_SHARE); + + page = BufferGetPage(buf); + + /* + * It's not safe to call PageGetHeapFreeSpace() on new pages, so we + * treat them as being free space for our purposes. + */ + if (!PageIsNew(page)) + stat->free_space += PageGetHeapFreeSpace(page); + else + stat->free_space += BLCKSZ - SizeOfPageHeaderData; + + /* We may count the page as scanned even if it's new/empty */ + scanned++; + + if (PageIsNew(page) || PageIsEmpty(page)) + { + UnlockReleaseBuffer(buf); + continue; + } + + /* + * Look at each tuple on the page and decide whether it's live or + * dead, then count it and its size. Unlike lazy_scan_heap, we can + * afford to ignore problems and special cases. 
+ */ + maxoff = PageGetMaxOffsetNumber(page); + + for (offnum = FirstOffsetNumber; + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) + { + ItemId itemid; + HeapTupleData tuple; + + itemid = PageGetItemId(page, offnum); + + if (!ItemIdIsUsed(itemid) || ItemIdIsRedirected(itemid) || + ItemIdIsDead(itemid)) + { + continue; + } + + Assert(ItemIdIsNormal(itemid)); + + ItemPointerSet(&(tuple.t_self), blkno, offnum); + + tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid); + tuple.t_len = ItemIdGetLength(itemid); + tuple.t_tableOid = RelationGetRelid(rel); + + /* + * We follow VACUUM's lead in counting INSERT_IN_PROGRESS tuples + * as "dead" while DELETE_IN_PROGRESS tuples are "live". We don't + * bother distinguishing tuples inserted/deleted by our own + * transaction. + */ + switch (HeapTupleSatisfiesVacuum(&tuple, OldestXmin, buf)) + { + case HEAPTUPLE_LIVE: + case HEAPTUPLE_DELETE_IN_PROGRESS: + stat->tuple_len += tuple.t_len; + stat->tuple_count++; + break; + case HEAPTUPLE_DEAD: + case HEAPTUPLE_RECENTLY_DEAD: + case HEAPTUPLE_INSERT_IN_PROGRESS: + stat->dead_tuple_len += tuple.t_len; + stat->dead_tuple_count++; + break; + default: + elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result"); + break; + } + } + + UnlockReleaseBuffer(buf); + } + + stat->table_len = (uint64) nblocks * BLCKSZ; + + /* + * We don't know how many tuples are in the pages we didn't scan, so + * extrapolate the live-tuple count to the whole table in the same way + * that VACUUM does. (Like VACUUM, we're not taking a random sample, so + * just extrapolating linearly seems unsafe.) There should be no dead + * tuples in all-visible pages, so no correction is needed for that, and + * we already accounted for the space in those pages, too. + */ + stat->tuple_count = vac_estimate_reltuples(rel, false, nblocks, scanned, + stat->tuple_count); + + /* + * Calculate percentages if the relation has one or more pages. 
+ */ + if (nblocks != 0) + { + stat->scanned_percent = 100 * scanned / nblocks; + stat->tuple_percent = 100.0 * stat->tuple_len / stat->table_len; + stat->dead_tuple_percent = 100.0 * stat->dead_tuple_len / stat->table_len; + stat->free_percent = 100.0 * stat->free_space / stat->table_len; + } + + if (BufferIsValid(vmbuffer)) + { + ReleaseBuffer(vmbuffer); + vmbuffer = InvalidBuffer; + } } /* @@ -215,14 +221,14 @@ statapprox_heap(Relation rel, output_type *stat) Datum pgstattuple_approx(PG_FUNCTION_ARGS) { - Oid relid = PG_GETARG_OID(0); + Oid relid = PG_GETARG_OID(0); - if (!superuser()) - ereport(ERROR, - (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), - (errmsg("must be superuser to use pgstattuple functions")))); + if (!superuser()) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + (errmsg("must be superuser to use pgstattuple functions")))); - PG_RETURN_DATUM(pgstattuple_approx_internal(relid, fcinfo)); + PG_RETURN_DATUM(pgstattuple_approx_internal(relid, fcinfo)); } /* @@ -235,69 +241,69 @@ pgstattuple_approx(PG_FUNCTION_ARGS) Datum pgstattuple_approx_v1_5(PG_FUNCTION_ARGS) { - Oid relid = PG_GETARG_OID(0); + Oid relid = PG_GETARG_OID(0); - PG_RETURN_DATUM(pgstattuple_approx_internal(relid, fcinfo)); + PG_RETURN_DATUM(pgstattuple_approx_internal(relid, fcinfo)); } Datum pgstattuple_approx_internal(Oid relid, FunctionCallInfo fcinfo) { - Relation rel; - output_type stat = {0}; - TupleDesc tupdesc; - bool nulls[NUM_OUTPUT_COLUMNS]; - Datum values[NUM_OUTPUT_COLUMNS]; - HeapTuple ret; - int i = 0; - - if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) - elog(ERROR, "return type must be a row type"); - - if (tupdesc->natts != NUM_OUTPUT_COLUMNS) - elog(ERROR, "incorrect number of output arguments"); - - rel = relation_open(relid, AccessShareLock); - - /* - * Reject attempts to read non-local temporary relations; we would be - * likely to get wrong data since we have no visibility into the owning - * session's local buffers. - */ - if (RELATION_IS_OTHER_TEMP(rel)) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("cannot access temporary tables of other sessions"))); - - /* - * We support only ordinary relations and materialised views, because we - * depend on the visibility map and free space map for our estimates about - * unscanned pages. 
- */ - if (!(rel->rd_rel->relkind == RELKIND_RELATION || - rel->rd_rel->relkind == RELKIND_MATVIEW)) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("\"%s\" is not a table or materialized view", - RelationGetRelationName(rel)))); - - statapprox_heap(rel, &stat); - - relation_close(rel, AccessShareLock); - - memset(nulls, 0, sizeof(nulls)); - - values[i++] = Int64GetDatum(stat.table_len); - values[i++] = Float8GetDatum(stat.scanned_percent); - values[i++] = Int64GetDatum(stat.tuple_count); - values[i++] = Int64GetDatum(stat.tuple_len); - values[i++] = Float8GetDatum(stat.tuple_percent); - values[i++] = Int64GetDatum(stat.dead_tuple_count); - values[i++] = Int64GetDatum(stat.dead_tuple_len); - values[i++] = Float8GetDatum(stat.dead_tuple_percent); - values[i++] = Int64GetDatum(stat.free_space); - values[i++] = Float8GetDatum(stat.free_percent); - - ret = heap_form_tuple(tupdesc, values, nulls); - return HeapTupleGetDatum(ret); + Relation rel; + output_type stat = {0}; + TupleDesc tupdesc; + bool nulls[NUM_OUTPUT_COLUMNS]; + Datum values[NUM_OUTPUT_COLUMNS]; + HeapTuple ret; + int i = 0; + + if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) + elog(ERROR, "return type must be a row type"); + + if (tupdesc->natts != NUM_OUTPUT_COLUMNS) + elog(ERROR, "incorrect number of output arguments"); + + rel = relation_open(relid, AccessShareLock); + + /* + * Reject attempts to read non-local temporary relations; we would be + * likely to get wrong data since we have no visibility into the owning + * session's local buffers. + */ + if (RELATION_IS_OTHER_TEMP(rel)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot access temporary tables of other sessions"))); + + /* + * We support only ordinary relations and materialised views, because we + * depend on the visibility map and free space map for our estimates about + * unscanned pages. + */ + if (!(rel->rd_rel->relkind == RELKIND_RELATION || + rel->rd_rel->relkind == RELKIND_MATVIEW)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("\"%s\" is not a table or materialized view", + RelationGetRelationName(rel)))); + + statapprox_heap(rel, &stat); + + relation_close(rel, AccessShareLock); + + memset(nulls, 0, sizeof(nulls)); + + values[i++] = Int64GetDatum(stat.table_len); + values[i++] = Float8GetDatum(stat.scanned_percent); + values[i++] = Int64GetDatum(stat.tuple_count); + values[i++] = Int64GetDatum(stat.tuple_len); + values[i++] = Float8GetDatum(stat.tuple_percent); + values[i++] = Int64GetDatum(stat.dead_tuple_count); + values[i++] = Int64GetDatum(stat.dead_tuple_len); + values[i++] = Float8GetDatum(stat.dead_tuple_percent); + values[i++] = Int64GetDatum(stat.free_space); + values[i++] = Float8GetDatum(stat.free_percent); + + ret = heap_form_tuple(tupdesc, values, nulls); + return HeapTupleGetDatum(ret); } diff --git a/doc/src/sgml/catalogs.sgml b/doc/src/sgml/catalogs.sgml index 399f8275..9f11a50f 100644 --- a/doc/src/sgml/catalogs.sgml +++ b/doc/src/sgml/catalogs.sgml @@ -1752,8 +1752,8 @@ SCRAM-SHA-256$<iteration count>:<salt>< float4 - Number of rows in the table. This is only an estimate used by the - planner. It is updated by VACUUM, + Number of live rows in the table. This is only an estimate used by + the planner. It is updated by VACUUM, ANALYZE, and a few DDL commands such as CREATE INDEX. 
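The estimation step that this patch makes consistent across VACUUM, ANALYZE, CREATE INDEX, and pgstattuple_approx() amounts to extrapolating the live-tuple density observed on the scanned pages over the pages that were not scanned. The standalone C sketch below only illustrates that idea; the names and rounding are simplified and it is not the actual vac_estimate_reltuples() implementation.

#include <math.h>
#include <stdio.h>

/*
 * Illustration only: extrapolate the live tuples counted on the scanned
 * pages to the whole table, using the previously recorded tuple density
 * (old_rel_tuples / old_rel_pages) for the unscanned part.
 */
static double
estimate_live_tuples(double old_rel_pages, double old_rel_tuples,
                     double total_pages, double scanned_pages,
                     double scanned_live_tuples)
{
    double old_density;
    double unscanned_pages;

    if (scanned_pages >= total_pages)
        return scanned_live_tuples;     /* whole table was scanned */

    old_density = (old_rel_pages > 0) ? old_rel_tuples / old_rel_pages : 0.0;
    unscanned_pages = total_pages - scanned_pages;

    return floor(old_density * unscanned_pages + scanned_live_tuples + 0.5);
}

int
main(void)
{
    /* 1000-page table, previously ~100 tuples/page, 200 pages scanned now */
    printf("%.0f\n", estimate_live_tuples(1000, 100000, 1000, 200, 19000));
    return 0;
}

With these inputs the estimate is 99000: the 19000 live tuples actually counted plus 800 unscanned pages at the old density of 100 tuples per page.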
diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c index 89c9a1ea..b1c18dec 100644 --- a/src/backend/catalog/index.c +++ b/src/backend/catalog/index.c @@ -2420,12 +2420,12 @@ index_build(Relation heapRelation, * things to add it to the new index. After we return, the AM's index * build procedure does whatever cleanup it needs. * - * The total count of heap tuples is returned. This is for updating pg_class - * statistics. (It's annoying not to be able to do that here, but we want - * to merge that update with others; see index_update_stats.) Note that the - * index AM itself must keep track of the number of index tuples; we don't do - * so here because the AM might reject some of the tuples for its own reasons, - * such as being unable to store NULLs. + * The total count of live heap tuples is returned. This is for updating + * pg_class statistics. (It's annoying not to be able to do that here, but we + * want to merge that update with others; see index_update_stats.) Note that + * the index AM itself must keep track of the number of index tuples; we don't + * do so here because the AM might reject some of the tuples for its own + * reasons, such as being unable to store NULLs. * * A side effect is to set indexInfo->ii_BrokenHotChain to true if we detect * any potentially broken HOT chains. Currently, we set this if there are @@ -2455,8 +2455,8 @@ IndexBuildHeapScan(Relation heapRelation, * to scan cannot be done when requesting syncscan. * * When "anyvisible" mode is requested, all tuples visible to any transaction - * are considered, including those inserted or deleted by transactions that are - * still in progress. + * are indexed and counted as live, including those inserted or deleted by + * transactions that are still in progress. */ double IndexBuildHeapRangeScan(Relation heapRelation, @@ -2628,6 +2628,12 @@ IndexBuildHeapRangeScan(Relation heapRelation, */ LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE); + /* + * The criteria for counting a tuple as live in this block need to + * match what analyze.c's acquire_sample_rows() does, otherwise + * CREATE INDEX and ANALYZE may produce wildly different reltuples + * values, e.g. when there are many recently-dead tuples. + */ switch (HeapTupleSatisfiesVacuum(heapTuple, OldestXmin, scan->rs_cbuf)) { @@ -2640,6 +2646,8 @@ IndexBuildHeapRangeScan(Relation heapRelation, /* Normal case, index and unique-check it */ indexIt = true; tupleIsAlive = true; + /* Count it as live, too */ + reltuples += 1; break; case HEAPTUPLE_RECENTLY_DEAD: @@ -2653,6 +2661,9 @@ IndexBuildHeapRangeScan(Relation heapRelation, * the live tuple at the end of the HOT-chain. Since this * breaks semantics for pre-existing snapshots, mark the * index as unusable for them. + * + * We don't count recently-dead tuples in reltuples, even + * if we index them; see acquire_sample_rows(). */ if (HeapTupleIsHotUpdated(heapTuple)) { @@ -2675,6 +2686,7 @@ IndexBuildHeapRangeScan(Relation heapRelation, { indexIt = true; tupleIsAlive = true; + reltuples += 1; break; } @@ -2712,6 +2724,15 @@ IndexBuildHeapRangeScan(Relation heapRelation, goto recheck; } } + else + { + /* + * For consistency with acquire_sample_rows(), count + * HEAPTUPLE_INSERT_IN_PROGRESS tuples as live only + * when inserted by our own transaction. 
+ */ + reltuples += 1; + } /* * We must index such tuples, since if the index build @@ -2731,6 +2752,7 @@ IndexBuildHeapRangeScan(Relation heapRelation, { indexIt = true; tupleIsAlive = false; + reltuples += 1; break; } @@ -2774,6 +2796,14 @@ IndexBuildHeapRangeScan(Relation heapRelation, * the same as a RECENTLY_DEAD tuple. */ indexIt = true; + + /* + * Count HEAPTUPLE_DELETE_IN_PROGRESS tuples as live, + * if they were not deleted by the current + * transaction. That's what acquire_sample_rows() + * does, and we want the behavior to be consistent. + */ + reltuples += 1; } else if (HeapTupleIsHotUpdated(heapTuple)) { @@ -2791,8 +2821,8 @@ IndexBuildHeapRangeScan(Relation heapRelation, { /* * It's a regular tuple deleted by our own xact. Index - * it but don't check for uniqueness, the same as a - * RECENTLY_DEAD tuple. + * it, but don't check for uniqueness nor count in + * reltuples, the same as a RECENTLY_DEAD tuple. */ indexIt = true; } @@ -2816,8 +2846,6 @@ IndexBuildHeapRangeScan(Relation heapRelation, tupleIsAlive = true; } - reltuples += 1; - MemoryContextReset(econtext->ecxt_per_tuple_memory); /* Set up for predicate or expression evaluation */ diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c index efb5aade..3c337815 100644 --- a/src/backend/commands/vacuum.c +++ b/src/backend/commands/vacuum.c @@ -707,7 +707,8 @@ vacuum_set_xid_limits(Relation rel, * we take the old value of pg_class.reltuples as a measurement of the * tuple density in the unscanned pages. * - * This routine is shared by VACUUM and ANALYZE. + * Note: scanned_tuples should count only *live* tuples, since + * pg_class.reltuples is defined that way. */ double vac_estimate_reltuples(Relation relation, bool is_analyze, @@ -807,6 +808,9 @@ vac_estimate_reltuples(Relation relation, bool is_analyze, * transaction. This is OK since postponing the flag maintenance is * always allowable. * + * Note: num_tuples should count only *live* tuples, since + * pg_class.reltuples is defined that way. + * * This routine is shared by VACUUM and ANALYZE. 
*/ void diff --git a/src/backend/commands/vacuumlazy.c b/src/backend/commands/vacuumlazy.c index 4796152a..90dfe91f 100644 --- a/src/backend/commands/vacuumlazy.c +++ b/src/backend/commands/vacuumlazy.c @@ -123,9 +123,9 @@ typedef struct LVRelStats BlockNumber pinskipped_pages; /* # of pages we skipped due to a pin */ BlockNumber frozenskipped_pages; /* # of frozen pages we skipped */ BlockNumber tupcount_pages; /* pages whose tuples we counted */ - double scanned_tuples; /* counts only tuples on tupcount_pages */ - double old_rel_tuples; /* previous value of pg_class.reltuples */ + double old_live_tuples; /* previous value of pg_class.reltuples */ double new_rel_tuples; /* new estimated total # of tuples */ + double new_live_tuples; /* new estimated total # of live tuples */ double new_dead_tuples; /* new estimated total # of dead tuples */ BlockNumber pages_removed; double tuples_deleted; @@ -316,7 +316,6 @@ lazy_vacuum_rel(Relation onerel, int options, VacuumParams *params, TransactionId xidFullScanLimit; MultiXactId mxactFullScanLimit; BlockNumber new_rel_pages; - double new_rel_tuples; BlockNumber new_rel_allvisible; double new_live_tuples; TransactionId new_frozen_xid; @@ -374,7 +373,7 @@ lazy_vacuum_rel(Relation onerel, int options, VacuumParams *params, vacrelstats = (LVRelStats *) palloc0(sizeof(LVRelStats)); vacrelstats->old_rel_pages = onerel->rd_rel->relpages; - vacrelstats->old_rel_tuples = onerel->rd_rel->reltuples; + vacrelstats->old_live_tuples = onerel->rd_rel->reltuples; vacrelstats->num_index_scans = 0; vacrelstats->pages_removed = 0; vacrelstats->lock_waiter_detected = false; @@ -451,11 +450,11 @@ lazy_vacuum_rel(Relation onerel, int options, VacuumParams *params, * since then we don't know for certain that all tuples have a newer xmin. 
*/ new_rel_pages = vacrelstats->rel_pages; - new_rel_tuples = vacrelstats->new_rel_tuples; + new_live_tuples = vacrelstats->new_live_tuples; if (vacrelstats->tupcount_pages == 0 && new_rel_pages > 0) { new_rel_pages = vacrelstats->old_rel_pages; - new_rel_tuples = vacrelstats->old_rel_tuples; + new_live_tuples = vacrelstats->old_live_tuples; } visibilitymap_count(onerel, &new_rel_allvisible, NULL); @@ -467,7 +466,7 @@ lazy_vacuum_rel(Relation onerel, int options, VacuumParams *params, vac_update_relstats(onerel, new_rel_pages, - new_rel_tuples, + new_live_tuples, new_rel_allvisible, vacrelstats->hasindex, new_frozen_xid, @@ -475,10 +474,6 @@ lazy_vacuum_rel(Relation onerel, int options, VacuumParams *params, false); /* report results to the stats collector, too */ - new_live_tuples = new_rel_tuples - vacrelstats->new_dead_tuples; - if (new_live_tuples < 0) - new_live_tuples = 0; /* just in case */ - pgstat_report_vacuum(RelationGetRelid(onerel), onerel->rd_rel->relisshared, new_live_tuples, @@ -604,10 +599,11 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats, char *relname; BlockNumber empty_pages, vacuumed_pages; - double num_tuples, - tups_vacuumed, - nkeep, - nunused; + double num_tuples, /* total number of nonremovable tuples */ + live_tuples, /* live tuples (reltuples estimate) */ + tups_vacuumed, /* tuples cleaned up by vacuum */ + nkeep, /* dead-but-not-removable tuples */ + nunused; /* unused item pointers */ IndexBulkDeleteResult **indstats; int i; PGRUsage ru0; @@ -632,7 +628,7 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats, relname))); empty_pages = vacuumed_pages = 0; - num_tuples = tups_vacuumed = nkeep = nunused = 0; + num_tuples = live_tuples = tups_vacuumed = nkeep = nunused = 0; indstats = (IndexBulkDeleteResult **) palloc0(nindexes * sizeof(IndexBulkDeleteResult *)); @@ -1131,6 +1127,17 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats, tupgone = false; + /* + * The criteria for counting a tuple as live in this block need to + * match what analyze.c's acquire_sample_rows() does, otherwise + * VACUUM and ANALYZE may produce wildly different reltuples + * values, e.g. when there are many recently-dead tuples. + * + * The logic here is a bit simpler than acquire_sample_rows(), as + * VACUUM can't run inside a transaction block, which makes some + * cases impossible (e.g. in-progress insert from the same + * transaction). + */ switch (HeapTupleSatisfiesVacuum(&tuple, OldestXmin, buf)) { case HEAPTUPLE_DEAD: @@ -1164,6 +1171,12 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats, elog(WARNING, "relation \"%s\" TID %u/%u: OID is invalid", relname, blkno, offnum); + /* + * Count it as live. Not only is this natural, but it's + * also what acquire_sample_rows() does. + */ + live_tuples += 1; + /* * Is the tuple definitely visible to all transactions? * @@ -1235,12 +1248,29 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats, all_visible = false; break; case HEAPTUPLE_INSERT_IN_PROGRESS: - /* This is an expected case during concurrent vacuum */ + + /* + * This is an expected case during concurrent vacuum. + * + * We do not count these rows as live, because we expect + * the inserting transaction to update the counters at + * commit, and we assume that will happen only after we + * report our results. This assumption is a bit shaky, + * but it is what acquire_sample_rows() does, so be + * consistent. 
+ */ all_visible = false; break; case HEAPTUPLE_DELETE_IN_PROGRESS: /* This is an expected case during concurrent vacuum */ all_visible = false; + + /* + * Count such rows as live. As above, we assume the + * deleting transaction will commit and update the + * counters after we report. + */ + live_tuples += 1; break; default: elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result"); @@ -1448,15 +1478,18 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats, pfree(frozen); /* save stats for use later */ - vacrelstats->scanned_tuples = num_tuples; vacrelstats->tuples_deleted = tups_vacuumed; vacrelstats->new_dead_tuples = nkeep; /* now we can compute the new value for pg_class.reltuples */ - vacrelstats->new_rel_tuples = vac_estimate_reltuples(onerel, false, + vacrelstats->new_live_tuples = vac_estimate_reltuples(onerel, false, nblocks, vacrelstats->tupcount_pages, - num_tuples); + live_tuples); + + /* also compute total number of surviving heap entries */ + vacrelstats->new_rel_tuples = + vacrelstats->new_live_tuples + vacrelstats->new_dead_tuples; /* * Release any remaining pin on visibility map page. @@ -1801,7 +1834,8 @@ lazy_vacuum_index(Relation indrel, ivinfo.analyze_only = false; ivinfo.estimated_count = true; ivinfo.message_level = elevel; - ivinfo.num_heap_tuples = vacrelstats->old_rel_tuples; + /* We can only provide an approximate value of num_heap_tuples here */ + ivinfo.num_heap_tuples = vacrelstats->old_live_tuples; ivinfo.strategy = vac_strategy; /* Do bulk deletion */ @@ -1832,6 +1866,12 @@ lazy_cleanup_index(Relation indrel, ivinfo.analyze_only = false; ivinfo.estimated_count = (vacrelstats->tupcount_pages < vacrelstats->rel_pages); ivinfo.message_level = elevel; + + /* + * Now we can provide a better estimate of total number of surviving + * tuples (we assume indexes are more interested in that than in the + * number of nominally live tuples). + */ ivinfo.num_heap_tuples = vacrelstats->new_rel_tuples; ivinfo.strategy = vac_strategy; @@ -2458,7 +2498,7 @@ truncate_extent_tuples(Relation onerel, vacrelstats = (LVRelStats *) palloc0(sizeof(LVRelStats)); vacrelstats->old_rel_pages = onerel->rd_rel->relpages; - vacrelstats->old_rel_tuples = onerel->rd_rel->reltuples; + vacrelstats->old_live_tuples = onerel->rd_rel->reltuples; vacrelstats->num_index_scans = 0; vacrelstats->pages_removed = 0; vacrelstats->lock_waiter_detected = false; From 35a4988b243925baf13042a6fc374f66ef738a0d Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Tue, 8 May 2018 00:20:19 -0400 Subject: [PATCH 509/578] Count heap tuples in non-SnapshotAny path in IndexBuildHeapRangeScan(). Brown-paper-bag bug in commit 7c91a0364: when we rearranged the placement of "reltuples += 1" statements, we missed including one in this code path. The net effect of that was that CREATE INDEX CONCURRENTLY would set the table's pg_class.reltuples to zero, as would index builds done during bootstrap mode. (It seems like parallel index builds ought to fail similarly, but they don't, perhaps because reltuples is computed in some other way. You certainly couldn't figure that out from the abysmally underdocumented parallelism code in this area.) I was led to this by wondering why initdb seemed to have slowed down as a result of 7c91a0364, as is evident in the buildfarm's timing history. The reason is that every system catalog with indexes had pg_class.reltuples = 0 after bootstrap, causing the planner to make some terrible choices for queries in the post-bootstrap steps. 
On my workstation, this fix causes the runtime of "initdb -N" to drop from ~2.0 sec to ~1.4 sec, which is almost though not quite back to where it was in v10. That's not much of a deal for production use perhaps, but it makes a noticeable difference for buildfarm and "make check-world" runs, which do a lot of initdbs. --- src/backend/catalog/index.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c index b1c18dec..d3cc3775 100644 --- a/src/backend/catalog/index.c +++ b/src/backend/catalog/index.c @@ -2844,6 +2844,7 @@ IndexBuildHeapRangeScan(Relation heapRelation, { /* heap_getnext did the time qual check */ tupleIsAlive = true; + reltuples += 1; } MemoryContextReset(econtext->ecxt_per_tuple_memory); From fb97cdce3163324a261569538f198d61660ef301 Mon Sep 17 00:00:00 2001 From: Michael Paquier Date: Wed, 12 Sep 2018 06:46:01 +0900 Subject: [PATCH 510/578] Parse more strictly integer parameters from connection strings in libpq The following parameters have been parsed in lossy ways when specified in a connection string processed by libpq: - connect_timeout - keepalives - keepalives_count - keepalives_idle - keepalives_interval - port Overflowing values or the presence of incorrect characters were not properly checked, leading to libpq trying to use such values and fail with unhelpful error messages. This commit hardens the parsing of those parameters so as it is possible to find easily incorrect values. Author: Fabien Coelho Reviewed-by: Peter Eisentraut, Michael Paquier Discussion: https://postgr.es/m/alpine.DEB.2.21.1808171206180.20841@lancre --- src/interfaces/libpq/fe-connect.c | 59 ++++++++++++++++++++++++++----- 1 file changed, 51 insertions(+), 8 deletions(-) diff --git a/src/interfaces/libpq/fe-connect.c b/src/interfaces/libpq/fe-connect.c index 8c1ec04b..9bcefd3e 100644 --- a/src/interfaces/libpq/fe-connect.c +++ b/src/interfaces/libpq/fe-connect.c @@ -1597,6 +1597,34 @@ useKeepalives(PGconn *conn) return val != 0 ? 1 : 0; } +/* + * Parse and try to interpret "value" as an integer value, and if successful, + * store it in *result, complaining if there is any trailing garbage or an + * overflow. + */ +static bool +parse_int_param(const char *value, int *result, PGconn *conn, + const char *context) +{ + char *end; + long numval; + + *result = 0; + + errno = 0; + numval = strtol(value, &end, 10); + if (errno == 0 && *end == '\0' && numval == (int) numval) + { + *result = numval; + return true; + } + + appendPQExpBuffer(&conn->errorMessage, + libpq_gettext("invalid integer value \"%s\" for keyword \"%s\"\n"), + value, context); + return false; +} + #ifndef WIN32 /* * Set the keepalive idle timer. 
@@ -1609,7 +1637,9 @@ setKeepalivesIdle(PGconn *conn) if (conn->keepalives_idle == NULL) return 1; - idle = atoi(conn->keepalives_idle); + if (!parse_int_param(conn->keepalives_idle, &idle, conn, + "keepalives_idle")) + return 0; if (idle < 0) idle = 0; @@ -1641,7 +1671,9 @@ setKeepalivesInterval(PGconn *conn) if (conn->keepalives_interval == NULL) return 1; - interval = atoi(conn->keepalives_interval); + if (!parse_int_param(conn->keepalives_interval, &interval, conn, + "keepalives_interval")) + return 0; if (interval < 0) interval = 0; @@ -1674,7 +1706,9 @@ setKeepalivesCount(PGconn *conn) if (conn->keepalives_count == NULL) return 1; - count = atoi(conn->keepalives_count); + if (!parse_int_param(conn->keepalives_count, &count, conn, + "keepalives_count")) + return 0; if (count < 0) count = 0; @@ -1708,13 +1742,17 @@ setKeepalivesWin32(PGconn *conn) int idle = 0; int interval = 0; - if (conn->keepalives_idle) - idle = atoi(conn->keepalives_idle); + if (conn->keepalives_idle && + !parse_int_param(conn->keepalives_idle, &idle, conn, + "keepalives_idle")) + return 0; if (idle <= 0) idle = 2 * 60 * 60; /* 2 hours = default */ - if (conn->keepalives_interval) - interval = atoi(conn->keepalives_interval); + if (conn->keepalives_interval && + !parse_int_param(conn->keepalives_interval, &interval, conn, + "keepalives_interval")) + return 0; if (interval <= 0) interval = 1; /* 1 second = default */ @@ -1918,7 +1956,10 @@ connectDBComplete(PGconn *conn) */ if (conn->connect_timeout != NULL) { - timeout = atoi(conn->connect_timeout); + if (!parse_int_param(conn->connect_timeout, &timeout, conn, + "connect_timeout")) + return 0; + if (timeout > 0) { /* @@ -1929,6 +1970,8 @@ connectDBComplete(PGconn *conn) /* calculate the finish time based on start + timeout */ finish_time = time(NULL) + timeout; } + else /* negative means 0 */ + timeout = 0; } for (;;) From 75b7561403ab32b49463257b7895f323542ffabf Mon Sep 17 00:00:00 2001 From: Michael Paquier Date: Sat, 6 Apr 2019 15:23:37 +0900 Subject: [PATCH 511/578] Add support TCP user timeout in libpq and the backend server Similarly to the set of parameters for keepalive, a connection parameter for libpq is added as well as a backend GUC, called tcp_user_timeout. Increasing the TCP user timeout is useful to allow a connection to survive extended periods without end-to-end connection, and decreasing it allows application to fail faster. By default, the parameter is 0, which makes the connection use the system default, and follows a logic close to the keepalive parameters in its handling. When connecting through a Unix-socket domain, the parameters have no effect. 
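On the client side, tcp_user_timeout joins the existing keepalive options as an ordinary libpq connection parameter, so it can be supplied in the connection string (and, with the previous patch, malformed integer values are now rejected up front). A minimal sketch follows; the host and database names are placeholders and the timeout values are arbitrary examples.

#include <stdio.h>
#include <libpq-fe.h>

int
main(void)
{
    /* Placeholder host/dbname; the timeout values are just examples. */
    const char *conninfo =
        "host=db.example.com dbname=postgres "
        "connect_timeout=5 "
        "keepalives=1 keepalives_idle=30 keepalives_interval=10 keepalives_count=3 "
        "tcp_user_timeout=10000";       /* milliseconds */

    PGconn *conn = PQconnectdb(conninfo);

    if (PQstatus(conn) != CONNECTION_OK)
        fprintf(stderr, "connection failed: %s", PQerrorMessage(conn));

    PQfinish(conn);
    return 0;
}

As with the keepalive settings, tcp_user_timeout is ignored for Unix-domain socket connections and has no effect on platforms without TCP_USER_TIMEOUT support.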
Author: Ryohei Nagaura Reviewed-by: Fabien Coelho, Robert Haas, Kyotaro Horiguchi, Kirk Jamison, Mikalai Keida, Takayuki Tsunakawa, Andrei Yahorau Discussion: https://postgr.es/m/EDA4195584F5064680D8130B1CA91C45367328@G01JPEXMBYT04 --- .../postgres_fdw/expected/postgres_fdw.out | 1 + contrib/postgres_fdw/sql/postgres_fdw.sql | 1 + doc/src/sgml/config.sgml | 25 +++++++ doc/src/sgml/libpq.sgml | 14 ++++ src/backend/libpq/pqcomm.c | 72 +++++++++++++++++++ src/backend/utils/misc/guc.c | 31 ++++++++ src/backend/utils/misc/postgresql.conf.sample | 11 +++ src/include/libpq/libpq-be.h | 6 +- src/include/utils/guc.h | 2 + src/interfaces/libpq/fe-connect.c | 43 +++++++++++ src/interfaces/libpq/libpq-int.h | 1 + 11 files changed, 206 insertions(+), 1 deletion(-) diff --git a/contrib/postgres_fdw/expected/postgres_fdw.out b/contrib/postgres_fdw/expected/postgres_fdw.out index 09aee7c5..25f0967c 100644 --- a/contrib/postgres_fdw/expected/postgres_fdw.out +++ b/contrib/postgres_fdw/expected/postgres_fdw.out @@ -153,6 +153,7 @@ ALTER SERVER testserver1 OPTIONS ( keepalives 'value', keepalives_idle 'value', keepalives_interval 'value', + tcp_user_timeout 'value', -- requiressl 'value', sslcompression 'value', sslmode 'value', diff --git a/contrib/postgres_fdw/sql/postgres_fdw.sql b/contrib/postgres_fdw/sql/postgres_fdw.sql index 471bceae..bf923c0e 100644 --- a/contrib/postgres_fdw/sql/postgres_fdw.sql +++ b/contrib/postgres_fdw/sql/postgres_fdw.sql @@ -166,6 +166,7 @@ ALTER SERVER testserver1 OPTIONS ( keepalives 'value', keepalives_idle 'value', keepalives_interval 'value', + tcp_user_timeout 'value', -- requiressl 'value', sslcompression 'value', sslmode 'value', diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index 939ddd85..ed2368fa 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -935,6 +935,31 @@ include_dir 'conf.d' + + tcp_user_timeout (integer) + + tcp_user_timeout configuration parameter + + + + + Specifies the number of milliseconds that transmitted data may + remain unacknowledged before a connection is forcibly closed. + A value of 0 uses the system default. + This parameter is supported only on systems that support + TCP_USER_TIMEOUT; on other systems, it must be zero. + In sessions connected via a Unix-domain socket, this parameter is + ignored and always reads as zero. + + + + This parameter is not supported on Windows and on Linux version + 2.6.36 or older. + + + + + diff --git a/doc/src/sgml/libpq.sgml b/doc/src/sgml/libpq.sgml index ebee3afa..dfa8c5f8 100644 --- a/doc/src/sgml/libpq.sgml +++ b/doc/src/sgml/libpq.sgml @@ -1211,6 +1211,20 @@ postgresql://%2Fvar%2Flib%2Fpostgresql/dbname + + tcp_user_timeout + + + Controls the number of milliseconds that transmitted data may + remain unacknowledged before a connection is forcibly closed. + A value of zero uses the system default. This parameter is + ignored for connections made via a Unix-domain socket. + It is only supported on systems where TCP_USER_TIMEOUT + is available; on other systems, it has no effect. 
+ + + + tty diff --git a/src/backend/libpq/pqcomm.c b/src/backend/libpq/pqcomm.c index fb35a142..bd089ae0 100644 --- a/src/backend/libpq/pqcomm.c +++ b/src/backend/libpq/pqcomm.c @@ -908,6 +908,7 @@ StreamConnection(pgsocket server_fd, Port *port) (void) pq_setkeepalivesidle(tcp_keepalives_idle, port); (void) pq_setkeepalivesinterval(tcp_keepalives_interval, port); (void) pq_setkeepalivescount(tcp_keepalives_count, port); + (void) pq_settcpusertimeout(tcp_user_timeout, port); } return STATUS_OK; @@ -2070,4 +2071,75 @@ SetSockKeepAlive(int sock) { elog(LOG, "SetSockKeepAlive setsockopt(TCP_USER_TIMEOUT) failed: %m"); } + +int +pq_gettcpusertimeout(Port *port) +{ +#ifdef TCP_USER_TIMEOUT + if (port == NULL || IS_AF_UNIX(port->laddr.addr.ss_family)) + return 0; + + if (port->tcp_user_timeout != 0) + return port->tcp_user_timeout; + + if (port->default_tcp_user_timeout == 0) + { + ACCEPT_TYPE_ARG3 size = sizeof(port->default_tcp_user_timeout); + + if (getsockopt(port->sock, IPPROTO_TCP, TCP_USER_TIMEOUT, + (char *) &port->default_tcp_user_timeout, + &size) < 0) + { + elog(LOG, "getsockopt(%s) failed: %m", "TCP_USER_TIMEOUT"); + port->default_tcp_user_timeout = -1; /* don't know */ + } + } + + return port->default_tcp_user_timeout; +#else + return 0; +#endif +} + +int +pq_settcpusertimeout(int timeout, Port *port) +{ + if (port == NULL || IS_AF_UNIX(port->laddr.addr.ss_family)) + return STATUS_OK; + +#ifdef TCP_USER_TIMEOUT + if (timeout == port->tcp_user_timeout) + return STATUS_OK; + + if (port->default_tcp_user_timeout <= 0) + { + if (pq_gettcpusertimeout(port) < 0) + { + if (timeout == 0) + return STATUS_OK; /* default is set but unknown */ + else + return STATUS_ERROR; + } + } + + if (timeout == 0) + timeout = port->default_tcp_user_timeout; + + if (setsockopt(port->sock, IPPROTO_TCP, TCP_USER_TIMEOUT, + (char *) &timeout, sizeof(timeout)) < 0) + { + elog(LOG, "setsockopt(%s) failed: %m", "TCP_USER_TIMEOUT"); + return STATUS_ERROR; + } + + port->tcp_user_timeout = timeout; +#else + if (timeout != 0) + { + elog(LOG, "setsockopt(%s) not supported", "TCP_USER_TIMEOUT"); + return STATUS_ERROR; + } +#endif + + return STATUS_OK; } \ No newline at end of file diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 27901832..b53a3c57 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -260,9 +260,11 @@ static const char *show_archive_command(void); static void assign_tcp_keepalives_idle(int newval, void *extra); static void assign_tcp_keepalives_interval(int newval, void *extra); static void assign_tcp_keepalives_count(int newval, void *extra); +static void assign_tcp_user_timeout(int newval, void *extra); static const char *show_tcp_keepalives_idle(void); static const char *show_tcp_keepalives_interval(void); static const char *show_tcp_keepalives_count(void); +static const char *show_tcp_user_timeout(void); static bool check_maxconnections(int *newval, void **extra, GucSource source); static bool check_max_worker_processes(int *newval, void **extra, GucSource source); static bool check_autovacuum_max_workers(int *newval, void **extra, GucSource source); @@ -672,6 +674,7 @@ char *nls_sort_locale = NULL; int tcp_keepalives_idle; int tcp_keepalives_interval; int tcp_keepalives_count; +int tcp_user_timeout; /* * SSL renegotiation was been removed in PostgreSQL 9.5, but we tolerate it @@ -4952,6 +4955,17 @@ static struct config_uint ConfigureNamesUInt[] = NULL, NULL, NULL }, + { + {"tcp_user_timeout", PGC_USERSET, CLIENT_CONN_OTHER, + 
gettext_noop("TCP user timeout."), + gettext_noop("A value of 0 uses the system default."), + GUC_UNIT_MS + }, + &tcp_user_timeout, + 0, 0, INT_MAX, + NULL, assign_tcp_user_timeout, show_tcp_user_timeout + }, + /* End-of-list marker */ { {NULL, 0, 0, NULL, NULL}, NULL, 0, 0, 0, NULL, NULL, NULL @@ -13442,6 +13456,23 @@ show_tcp_keepalives_count(void) return nbuf; } +static void +assign_tcp_user_timeout(int newval, void *extra) +{ + /* See comments in assign_tcp_keepalives_idle */ + (void) pq_settcpusertimeout(newval, MyProcPort); +} + +static const char * +show_tcp_user_timeout(void) +{ + /* See comments in assign_tcp_keepalives_idle */ + static char nbuf[16]; + + snprintf(nbuf, sizeof(nbuf), "%d", pq_gettcpusertimeout(MyProcPort)); + return nbuf; +} + static bool check_maxconnections(int *newval, void **extra, GucSource source) { diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index 5ef4e565..edc6af3f 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -75,6 +75,17 @@ # - Security and Authentication - +# - TCP settings - +# see "man 7 tcp" for details + +#tcp_keepalives_idle = 0 # TCP_KEEPIDLE, in seconds; + # 0 selects the system default +#tcp_keepalives_interval = 0 # TCP_KEEPINTVL, in seconds; + # 0 selects the system default +#tcp_keepalives_count = 0 # TCP_KEEPCNT; + # 0 selects the system default +#tcp_user_timeout = 0 # TCP_USER_TIMEOUT, in milliseconds; + # 0 selects the system default #authentication_timeout = 1min # 1s-600s #ssl = off #ssl_ciphers = 'HIGH:MEDIUM:+3DES:!aNULL' # allowed SSL ciphers diff --git a/src/include/libpq/libpq-be.h b/src/include/libpq/libpq-be.h index 474d9690..00737906 100644 --- a/src/include/libpq/libpq-be.h +++ b/src/include/libpq/libpq-be.h @@ -215,7 +215,7 @@ typedef struct Port TimestampTz SessionStartTime; /* backend start time */ /* - * TCP keepalive settings. + * TCP keepalive and user timeout settings. * * default values are 0 if AF_UNIX or not yet known; current values are 0 * if AF_UNIX or using the default. 
Also, -1 in a default value means we @@ -224,9 +224,11 @@ typedef struct Port int default_keepalives_idle; int default_keepalives_interval; int default_keepalives_count; + int default_tcp_user_timeout; int keepalives_idle; int keepalives_interval; int keepalives_count; + int tcp_user_timeout; #if defined(ENABLE_GSS) || defined(ENABLE_SSPI) @@ -282,10 +284,12 @@ extern ProtocolVersion FrontendProtocol; extern int pq_getkeepalivesidle(Port *port); extern int pq_getkeepalivesinterval(Port *port); extern int pq_getkeepalivescount(Port *port); +extern int pq_gettcpusertimeout(Port *port); extern int pq_setkeepalivesidle(int idle, Port *port); extern int pq_setkeepalivesinterval(int interval, Port *port); extern int pq_setkeepalivescount(int count, Port *port); +extern int pq_settcpusertimeout(int timeout, Port *port); extern void SetSockKeepAlive(int sock); diff --git a/src/include/utils/guc.h b/src/include/utils/guc.h index 2634e983..c3353b72 100644 --- a/src/include/utils/guc.h +++ b/src/include/utils/guc.h @@ -307,6 +307,8 @@ extern char *nls_sort_locale; extern int tcp_keepalives_idle; extern int tcp_keepalives_interval; extern int tcp_keepalives_count; +extern int tcp_user_timeout; + #ifdef _SHARDING_ extern bool g_allow_dml_on_datanode; extern bool g_allow_force_ddl; diff --git a/src/interfaces/libpq/fe-connect.c b/src/interfaces/libpq/fe-connect.c index 9bcefd3e..5cf94b2f 100644 --- a/src/interfaces/libpq/fe-connect.c +++ b/src/interfaces/libpq/fe-connect.c @@ -322,6 +322,10 @@ static const internalPQconninfoOption PQconninfoOptions[] = { "TCP-Keepalives-Count", "", 10, /* strlen(INT32_MAX) == 10 */ offsetof(struct pg_conn, keepalives_count)}, + {"tcp_user_timeout", NULL, NULL, NULL, + "TCP-User-Timeout", "", 10, /* strlen(INT32_MAX) == 10 */ + offsetof(struct pg_conn, pgtcp_user_timeout)}, + /* * ssl options are allowed even without client SSL support because the * client can still handle SSL modes "disable" and "allow". Other @@ -1781,6 +1785,41 @@ setKeepalivesWin32(PGconn *conn) #endif /* SIO_KEEPALIVE_VALS */ #endif /* WIN32 */ +/* + * Set the TCP user timeout. + */ +static int +setTCPUserTimeout(PGconn *conn) +{ + int timeout; + + if (conn->pgtcp_user_timeout == NULL) + return 1; + + if (!parse_int_param(conn->pgtcp_user_timeout, &timeout, conn, + "tcp_user_timeout")) + return 0; + + if (timeout < 0) + timeout = 0; + +#ifdef TCP_USER_TIMEOUT + if (setsockopt(conn->sock, IPPROTO_TCP, TCP_USER_TIMEOUT, + (char *) &timeout, sizeof(timeout)) < 0) + { + char sebuf[256]; + + appendPQExpBuffer(&conn->errorMessage, + libpq_gettext("setsockopt(%s) failed: %s\n"), + "TCP_USER_TIMEOUT", + SOCK_STRERROR(SOCK_ERRNO, sebuf, sizeof(sebuf))); + return 0; + } +#endif + + return 1; +} + /* ---------- * connectDBStart - * Begin the process of making a connection to the backend. 
@@ -2302,6 +2341,8 @@ PQconnectPoll(PGconn *conn) err = 1; #endif /* SIO_KEEPALIVE_VALS */ #endif /* WIN32 */ + else if (!setTCPUserTimeout(conn)) + err = 1; if (err) { @@ -3561,6 +3602,8 @@ freePGconn(PGconn *conn) free(conn->pgtty); if (conn->connect_timeout) free(conn->connect_timeout); + if (conn->pgtcp_user_timeout) + free(conn->pgtcp_user_timeout); if (conn->pgoptions) free(conn->pgoptions); if (conn->appname) diff --git a/src/interfaces/libpq/libpq-int.h b/src/interfaces/libpq/libpq-int.h index 4a3c071f..a51f3b7b 100644 --- a/src/interfaces/libpq/libpq-int.h +++ b/src/interfaces/libpq/libpq-int.h @@ -398,6 +398,7 @@ struct pg_conn char *pgtty; /* tty on which the backend messages is * displayed (OBSOLETE, NOT USED) */ char *connect_timeout; /* connection timeout (numeric string) */ + char *pgtcp_user_timeout; /* tcp user timeout (numeric string) */ char *client_encoding_initial; /* encoding to use */ char *pgoptions; /* options to start the backend with */ char *appname; /* application name */ From 6ecd3b1bca35b6ea546acd504adca27d7196f650 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Tue, 18 Jan 2022 14:02:43 -0500 Subject: [PATCH 512/578] Make PQcancel use the PGconn's tcp_user_timeout and keepalives settings. If connectivity to the server has been lost or become flaky, the user might well try to send a query cancel. It's highly annoying if PQcancel hangs up in such a case, but that's exactly what's likely to happen. To ameliorate this problem, apply the PGconn's tcp_user_timeout and keepalives settings to the TCP connection used to send the cancel. This should be safe on Unix machines, since POSIX specifies that setsockopt() is async-signal-safe. We are guessing that WSAIoctl(SIO_KEEPALIVE_VALS) is similarly safe on Windows. (Note that at least in psql and our other frontend programs, there's no safety issue involved anyway, since we run PQcancel in its own thread rather than in a signal handler.) Most of the value here comes from the expectation that tcp_user_timeout will be applied as a connection timeout. That appears to happen on Linux, even though its tcp(7) man page claims differently. The keepalive options probably won't help much, but as long as we can apply them for not much code, we might as well. Jelte Fennema, reviewed by Fujii Masao and myself Discussion: https://postgr.es/m/AM5PR83MB017870DE81FC84D5E21E9D1EF7AA9@AM5PR83MB0178.EURPRD83.prod.outlook.com --- doc/src/sgml/libpq.sgml | 4 +- src/interfaces/libpq/fe-connect.c | 251 +++++++++++++++++++++++------- src/interfaces/libpq/libpq-int.h | 7 + 3 files changed, 205 insertions(+), 57 deletions(-) diff --git a/doc/src/sgml/libpq.sgml b/doc/src/sgml/libpq.sgml index dfa8c5f8..1331c0d8 100644 --- a/doc/src/sgml/libpq.sgml +++ b/doc/src/sgml/libpq.sgml @@ -4965,8 +4965,8 @@ int PQrequestCancel(PGconn *conn); PGconn object, and in case of failure stores the error message in the PGconn object (whence it can be retrieved by PQerrorMessage). Although - the functionality is the same, this approach creates hazards for - multiple-thread programs and signal handlers, since it is possible + the functionality is the same, this approach is not safe within + multiple-thread programs or signal handlers, since it is possible that overwriting the PGconn's error message will mess up the operation currently in progress on the connection. 
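As the documentation change above notes, PQcancel() (unlike PQrequestCancel()) is safe to use from a signal handler or another thread. A minimal usage sketch under that model follows; the helper names are illustrative and not part of libpq. With this patch, the PGcancel object obtained from PQgetCancel() also carries the connection's keepalive and tcp_user_timeout settings, so a cancel sent over a dead link can time out instead of hanging, at least where tcp_user_timeout is honored.

#include <signal.h>
#include <string.h>
#include <unistd.h>
#include <libpq-fe.h>

static PGcancel *volatile cancel_obj = NULL;

/* Call once the connection has been established (illustrative helper). */
void
prepare_cancel(PGconn *conn)
{
    /* The PGcancel inherits the connection's timeout-related settings. */
    cancel_obj = PQgetCancel(conn);
}

static void
handle_sigint(int signo)
{
    char errbuf[256];

    (void) signo;
    /* PQcancel() is documented as safe to call from a signal handler. */
    if (cancel_obj != NULL && !PQcancel(cancel_obj, errbuf, sizeof(errbuf)))
        (void) write(STDERR_FILENO, errbuf, strlen(errbuf));
}

void
install_cancel_handler(void)
{
    signal(SIGINT, handle_sigint);
}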
diff --git a/src/interfaces/libpq/fe-connect.c b/src/interfaces/libpq/fe-connect.c index 5cf94b2f..1b7e3fe7 100644 --- a/src/interfaces/libpq/fe-connect.c +++ b/src/interfaces/libpq/fe-connect.c @@ -1737,26 +1737,17 @@ setKeepalivesCount(PGconn *conn) /* * Enable keepalives and set the keepalive values on Win32, * where they are always set in one batch. + * + * CAUTION: This needs to be signal safe, since it's used by PQcancel. */ static int -setKeepalivesWin32(PGconn *conn) +setKeepalivesWin32(pgsocket sock, int idle, int interval) { struct tcp_keepalive ka; DWORD retsize; - int idle = 0; - int interval = 0; - if (conn->keepalives_idle && - !parse_int_param(conn->keepalives_idle, &idle, conn, - "keepalives_idle")) - return 0; if (idle <= 0) idle = 2 * 60 * 60; /* 2 hours = default */ - - if (conn->keepalives_interval && - !parse_int_param(conn->keepalives_interval, &interval, conn, - "keepalives_interval")) - return 0; if (interval <= 0) interval = 1; /* 1 second = default */ @@ -1764,7 +1755,7 @@ setKeepalivesWin32(PGconn *conn) ka.keepalivetime = idle * 1000; ka.keepaliveinterval = interval * 1000; - if (WSAIoctl(conn->sock, + if (WSAIoctl(sock, SIO_KEEPALIVE_VALS, (LPVOID) &ka, sizeof(ka), @@ -1774,6 +1765,26 @@ setKeepalivesWin32(PGconn *conn) NULL, NULL) != 0) + return 0; + return 1; +} + +static int +prepKeepalivesWin32(PGconn *conn) +{ + int idle = -1; + int interval = -1; + + if (conn->keepalives_idle && + !parse_int_param(conn->keepalives_idle, &idle, conn, + "keepalives_idle")) + return 0; + if (conn->keepalives_interval && + !parse_int_param(conn->keepalives_interval, &interval, conn, + "keepalives_interval")) + return 0; + + if (!setKeepalivesWin32(conn->sock, idle, interval)) { appendPQExpBuffer(&conn->errorMessage, libpq_gettext("WSAIoctl(SIO_KEEPALIVE_VALS) failed: %ui\n"), @@ -2337,7 +2348,7 @@ PQconnectPoll(PGconn *conn) err = 1; #else /* WIN32 */ #ifdef SIO_KEEPALIVE_VALS - else if (!setKeepalivesWin32(conn)) + else if (!prepKeepalivesWin32(conn)) err = 1; #endif /* SIO_KEEPALIVE_VALS */ #endif /* WIN32 */ @@ -3923,8 +3934,53 @@ PQgetCancel(PGconn *conn) memcpy(&cancel->raddr, &conn->raddr, sizeof(SockAddr)); cancel->be_pid = conn->be_pid; cancel->be_key = conn->be_key; + /* We use -1 to indicate an unset connection option */ + cancel->pgtcp_user_timeout = -1; + cancel->keepalives = -1; + cancel->keepalives_idle = -1; + cancel->keepalives_interval = -1; + cancel->keepalives_count = -1; + if (conn->pgtcp_user_timeout != NULL) + { + if (!parse_int_param(conn->pgtcp_user_timeout, + &cancel->pgtcp_user_timeout, + conn, "tcp_user_timeout")) + goto fail; + } + if (conn->keepalives != NULL) + { + if (!parse_int_param(conn->keepalives, + &cancel->keepalives, + conn, "keepalives")) + goto fail; + } + if (conn->keepalives_idle != NULL) + { + if (!parse_int_param(conn->keepalives_idle, + &cancel->keepalives_idle, + conn, "keepalives_idle")) + goto fail; + } + if (conn->keepalives_interval != NULL) + { + if (!parse_int_param(conn->keepalives_interval, + &cancel->keepalives_interval, + conn, "keepalives_interval")) + goto fail; + } + if (conn->keepalives_count != NULL) + { + if (!parse_int_param(conn->keepalives_count, + &cancel->keepalives_count, + conn, "keepalives_count")) + goto fail; + } return cancel; + +fail: + free(cancel); + return NULL; } /* PQfreeCancel: free a cancel structure */ @@ -3937,14 +3993,36 @@ PQfreeCancel(PGcancel *cancel) /* - * PQcancel and PQrequestCancel: attempt to request cancellation of the - * current operation. 
+ * Sets an integer socket option on a TCP socket, if the provided value is + * not negative. Returns false if setsockopt fails for some reason. + * + * CAUTION: This needs to be signal safe, since it's used by PQcancel. + */ +#if defined(TCP_USER_TIMEOUT) || !defined(WIN32) +static bool +optional_setsockopt(int fd, int protoid, int optid, int value) +{ + if (value < 0) + return true; + if (setsockopt(fd, protoid, optid, (char *) &value, sizeof(value)) < 0) + return false; + return true; +} +#endif + + +/* + * PQcancel: request query cancel * * The return value is TRUE if the cancel request was successfully * dispatched, FALSE if not (in which case an error message is available). * Note: successful dispatch is no guarantee that there will be any effect at * the backend. The application must read the operation result as usual. * + * On failure, an error message is stored in *errbuf, which must be of size + * errbufsize (recommended size is 256 bytes). *errbuf is not changed on + * success return. + * * CAUTION: we want this routine to be safely callable from a signal handler * (for example, an application might want to call it in a SIGINT handler). * This means we cannot use any C library routine that might be non-reentrant. @@ -3952,14 +4030,10 @@ PQfreeCancel(PGcancel *cancel) * just as dangerous. We avoid sprintf here for that reason. Building up * error messages with strcpy/strcat is tedious but should be quite safe. * We also save/restore errno in case the signal handler support doesn't. - * - * internal_cancel() is an internal helper function to make code-sharing - * between the two versions of the cancel function possible. */ -static int -internal_cancel(SockAddr *raddr, int be_pid, int be_key, - char *errbuf, int errbufsize) -{// #lizard forgives +int +PQcancel(PGcancel *cancel, char *errbuf, int errbufsize) +{ int save_errno = SOCK_ERRNO; pgsocket tmpsock = PGINVALID_SOCKET; char sebuf[256]; @@ -3970,18 +4044,98 @@ internal_cancel(SockAddr *raddr, int be_pid, int be_key, CancelRequestPacket cp; } crp; + if (!cancel) + { + strlcpy(errbuf, "PQcancel() -- no cancel object supplied", errbufsize); + /* strlcpy probably doesn't change errno, but be paranoid */ + SOCK_ERRNO_SET(save_errno); + return false; + } + /* * We need to open a temporary connection to the postmaster. Do this with * only kernel calls. */ - if ((tmpsock = socket(raddr->addr.ss_family, SOCK_STREAM, 0)) == PGINVALID_SOCKET) + if ((tmpsock = socket(cancel->raddr.addr.ss_family, SOCK_STREAM, 0)) == PGINVALID_SOCKET) { strlcpy(errbuf, "PQcancel() -- socket() failed: ", errbufsize); goto cancel_errReturn; } + + /* + * Since this connection will only be used to send a single packet of + * data, we don't need NODELAY. We also don't set the socket to + * nonblocking mode, because the API definition of PQcancel requires the + * cancel to be sent in a blocking way. + * + * We do set socket options related to keepalives and other TCP timeouts. + * This ensures that this function does not block indefinitely when + * reasonable keepalive and timeout settings have been provided. 
+ */ + if (!IS_AF_UNIX(cancel->raddr.addr.ss_family) && + cancel->keepalives != 0) + { +#ifndef WIN32 + if (!optional_setsockopt(tmpsock, SOL_SOCKET, SO_KEEPALIVE, 1)) + { + strlcpy(errbuf, "PQcancel() -- setsockopt(SO_KEEPALIVE) failed: ", errbufsize); + goto cancel_errReturn; + } + +#ifdef PG_TCP_KEEPALIVE_IDLE + if (!optional_setsockopt(tmpsock, IPPROTO_TCP, PG_TCP_KEEPALIVE_IDLE, + cancel->keepalives_idle)) + { + strlcpy(errbuf, "PQcancel() -- setsockopt(" PG_TCP_KEEPALIVE_IDLE_STR ") failed: ", errbufsize); + goto cancel_errReturn; + } +#endif + +#ifdef TCP_KEEPINTVL + if (!optional_setsockopt(tmpsock, IPPROTO_TCP, TCP_KEEPINTVL, + cancel->keepalives_interval)) + { + strlcpy(errbuf, "PQcancel() -- setsockopt(TCP_KEEPINTVL) failed: ", errbufsize); + goto cancel_errReturn; + } +#endif + +#ifdef TCP_KEEPCNT + if (!optional_setsockopt(tmpsock, IPPROTO_TCP, TCP_KEEPCNT, + cancel->keepalives_count)) + { + strlcpy(errbuf, "PQcancel() -- setsockopt(TCP_KEEPCNT) failed: ", errbufsize); + goto cancel_errReturn; + } +#endif + +#else /* WIN32 */ + +#ifdef SIO_KEEPALIVE_VALS + if (!setKeepalivesWin32(tmpsock, + cancel->keepalives_idle, + cancel->keepalives_interval)) + { + strlcpy(errbuf, "PQcancel() -- WSAIoctl(SIO_KEEPALIVE_VALS) failed: ", errbufsize); + goto cancel_errReturn; + } +#endif /* SIO_KEEPALIVE_VALS */ +#endif /* WIN32 */ + + /* TCP_USER_TIMEOUT works the same way on Unix and Windows */ +#ifdef TCP_USER_TIMEOUT + if (!optional_setsockopt(tmpsock, IPPROTO_TCP, TCP_USER_TIMEOUT, + cancel->pgtcp_user_timeout)) + { + strlcpy(errbuf, "PQcancel() -- setsockopt(TCP_USER_TIMEOUT) failed: ", errbufsize); + goto cancel_errReturn; + } +#endif + } + retry3: - if (connect(tmpsock, (struct sockaddr *) &raddr->addr, - raddr->salen) < 0) + if (connect(tmpsock, (struct sockaddr *) &cancel->raddr.addr, + cancel->raddr.salen) < 0) { if (SOCK_ERRNO == EINTR) /* Interrupted system call - we'll just try again */ @@ -3990,16 +4144,12 @@ internal_cancel(SockAddr *raddr, int be_pid, int be_key, goto cancel_errReturn; } - /* - * We needn't set nonblocking I/O or NODELAY options here. - */ - /* Create and send the cancel request packet. */ crp.packetlen = htonl((uint32) sizeof(crp)); crp.cp.cancelRequestCode = (MsgType) htonl(CANCEL_REQUEST_CODE); - crp.cp.backendPID = htonl(be_pid); - crp.cp.cancelAuthCode = htonl(be_key); + crp.cp.backendPID = htonl(cancel->be_pid); + crp.cp.cancelAuthCode = htonl(cancel->be_key); retry4: if (send(tmpsock, (char *) &crp, sizeof(crp), 0) != (int) sizeof(crp)) @@ -4149,27 +4299,6 @@ internal_end_query(SockAddr *raddr, int be_pid, int be_key, #endif -/* - * PQcancel: request query cancel - * - * Returns TRUE if able to send the cancel request, FALSE if not. - * - * On failure, an error message is stored in *errbuf, which must be of size - * errbufsize (recommended size is 256 bytes). *errbuf is not changed on - * success return. 
- */ -int -PQcancel(PGcancel *cancel, char *errbuf, int errbufsize) -{ - if (!cancel) - { - strlcpy(errbuf, "PQcancel() -- no cancel object supplied", errbufsize); - return FALSE; - } - - return internal_cancel(&cancel->raddr, cancel->be_pid, cancel->be_key, - errbuf, errbufsize); -} #ifdef __TBASE__ int @@ -4203,6 +4332,7 @@ int PQrequestCancel(PGconn *conn) { int r; + PGcancel *cancel; /* Check we have an open connection */ if (!conn) @@ -4218,8 +4348,19 @@ PQrequestCancel(PGconn *conn) return FALSE; } - r = internal_cancel(&conn->raddr, conn->be_pid, conn->be_key, - conn->errorMessage.data, conn->errorMessage.maxlen); + cancel = PQgetCancel(conn); + if (cancel) + { + r = PQcancel(cancel, conn->errorMessage.data, + conn->errorMessage.maxlen); + PQfreeCancel(cancel); + } + else + { + strlcpy(conn->errorMessage.data, "out of memory", + conn->errorMessage.maxlen); + r = false; + } if (!r) conn->errorMessage.len = strlen(conn->errorMessage.data); diff --git a/src/interfaces/libpq/libpq-int.h b/src/interfaces/libpq/libpq-int.h index a51f3b7b..ad150bf7 100644 --- a/src/interfaces/libpq/libpq-int.h +++ b/src/interfaces/libpq/libpq-int.h @@ -576,6 +576,13 @@ struct pg_cancel SockAddr raddr; /* Remote address */ int be_pid; /* PID of backend --- needed for cancels */ int be_key; /* key of backend --- needed for cancels */ + int pgtcp_user_timeout; /* tcp user timeout */ + int keepalives; /* use TCP keepalives? */ + int keepalives_idle; /* time between TCP keepalives */ + int keepalives_interval; /* time between TCP keepalive + * retransmits */ + int keepalives_count; /* maximum number of TCP keepalive + * retransmits */ }; From 54fb76aa54441fa425cec2ee622259e79d53226a Mon Sep 17 00:00:00 2001 From: sigmalin Date: Tue, 1 Mar 2022 21:34:13 +0800 Subject: [PATCH 513/578] fix hang when pqcancel http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131097019641 --- src/backend/libpq/pqcomm.c | 1 + src/backend/pgxc/pool/poolmgr.c | 54 +++++++++++++++++++++++++++++++++ src/backend/utils/misc/guc.c | 22 ++++++-------- 3 files changed, 65 insertions(+), 12 deletions(-) diff --git a/src/backend/libpq/pqcomm.c b/src/backend/libpq/pqcomm.c index bd089ae0..db3b1ea1 100644 --- a/src/backend/libpq/pqcomm.c +++ b/src/backend/libpq/pqcomm.c @@ -2071,6 +2071,7 @@ SetSockKeepAlive(int sock) { elog(LOG, "SetSockKeepAlive setsockopt(TCP_USER_TIMEOUT) failed: %m"); } +} int pq_gettcpusertimeout(Port *port) diff --git a/src/backend/pgxc/pool/poolmgr.c b/src/backend/pgxc/pool/poolmgr.c index 756b2198..72d5786e 100644 --- a/src/backend/pgxc/pool/poolmgr.c +++ b/src/backend/pgxc/pool/poolmgr.c @@ -7418,6 +7418,57 @@ connect_pools(void) } } +/* + * Set cancel socket keepalive and user_timeout. + * We can use this to detect the broken connection quickly. + * see SetSockKeepAlive + */ +static void +set_cancel_conn_keepalive(PGcancel *cancelConn) +{ + uint32 user_timeout = UINT32_MAX / 1000 < tcp_keepalives_idle ? 
+ 0 : tcp_keepalives_idle * (uint32) 1000; + + if (cancelConn == NULL) + { + return; + } + + /* + * If the connection did not use the connection option + * set the option here + * */ + if (cancelConn->keepalives == -1) + { + /* use TCP keepalives */ + cancelConn->keepalives = 1; + + if (tcp_keepalives_idle > 0) + { + /* time between TCP keepalives */ + cancelConn->keepalives_idle = tcp_keepalives_idle; + } + + if (tcp_keepalives_interval > 0) + { + /*time between TCP keepalive retransmits */ + cancelConn->keepalives_interval = tcp_keepalives_interval; + } + + if (tcp_keepalives_count > 0) + { + /* maximum number of TCP keepalive retransmits */ + cancelConn->keepalives_count = tcp_keepalives_count; + } + } + + if (cancelConn->pgtcp_user_timeout == -1 && user_timeout > 0) + { + /* tcp user timeout */ + cancelConn->pgtcp_user_timeout = user_timeout; + } +} + static bool preconnect_and_warm(DatabasePool *dbPool) {// #lizard forgives @@ -7521,6 +7572,7 @@ preconnect_and_warm(DatabasePool *dbPool) slot->xc_cancelConn = (NODE_CANCEL *) PQgetCancel((PGconn *)slot->conn); SetSockKeepAlive(((PGconn *)slot->conn)->sock); + set_cancel_conn_keepalive((PGcancel *)slot->xc_cancelConn); /* Increase count of pool size */ nodePool->slot[nodePool->freeSize] = slot; @@ -7628,6 +7680,7 @@ void *pooler_async_connection_management_thread(void *arg) slot->xc_cancelConn = (NODE_CANCEL *) PQgetCancel((PGconn *)slot->conn); slot->bwarmed = false; SetSockKeepAlive(((PGconn *)slot->conn)->sock); + set_cancel_conn_keepalive((PGcancel *)slot->xc_cancelConn); } break; } @@ -7910,6 +7963,7 @@ void *pooler_sync_remote_operator_thread(void *arg) slot->xc_cancelConn = (NODE_CANCEL *) PQgetCancel((PGconn *)slot->conn); slot->bwarmed = false; SetSockKeepAlive(((PGconn *)slot->conn)->sock); + set_cancel_conn_keepalive((PGcancel *)slot->xc_cancelConn); /* set the time flags */ slot->released = time(NULL); diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index b53a3c57..dbccb8f6 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -4298,7 +4298,16 @@ static struct config_int ConfigureNamesInt[] = 0, 0, INT_MAX, NULL, assign_tcp_keepalives_count, show_tcp_keepalives_count }, - + { + {"tcp_user_timeout", PGC_USERSET, CLIENT_CONN_OTHER, + gettext_noop("TCP user timeout."), + gettext_noop("A value of 0 uses the system default."), + GUC_UNIT_MS + }, + &tcp_user_timeout, + 0, 0, INT_MAX, + NULL, assign_tcp_user_timeout, show_tcp_user_timeout + }, { {"gin_fuzzy_search_limit", PGC_USERSET, CLIENT_CONN_OTHER, gettext_noop("Sets the maximum allowed result for exact search by GIN."), @@ -4955,17 +4964,6 @@ static struct config_uint ConfigureNamesUInt[] = NULL, NULL, NULL }, - { - {"tcp_user_timeout", PGC_USERSET, CLIENT_CONN_OTHER, - gettext_noop("TCP user timeout."), - gettext_noop("A value of 0 uses the system default."), - GUC_UNIT_MS - }, - &tcp_user_timeout, - 0, 0, INT_MAX, - NULL, assign_tcp_user_timeout, show_tcp_user_timeout - }, - /* End-of-list marker */ { {NULL, 0, 0, NULL, NULL}, NULL, 0, 0, 0, NULL, NULL, NULL From 8cb8cf654b937bef3768a8a48fd0c4a073707201 Mon Sep 17 00:00:00 2001 From: aslanxli Date: Thu, 17 Mar 2022 19:14:19 +0800 Subject: [PATCH 514/578] When analyzing a interval partitioned table, the sub-table not be locked when make the oids listd. Therefore, when serially analyzing sub-tables, if the sub-table is droped before processing, the analysis process will be interrupted. 
The fix is to use try_relation_open instead of relation_open when processing sub-tables, and skip if the opening fails. --- src/backend/commands/analyze.c | 47 +++++-------------------------- src/backend/commands/vacuum.c | 17 +++++------ src/backend/utils/adt/ruleutils.c | 24 ++++++++++++++++ src/include/utils/ruleutils.h | 5 +++- 4 files changed, 44 insertions(+), 49 deletions(-) diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c index d342a1ac..edd33ef1 100644 --- a/src/backend/commands/analyze.c +++ b/src/backend/commands/analyze.c @@ -170,34 +170,6 @@ analyze_rel(Oid relid, RangeVar *relation, int options, int elevel; AcquireSampleRowsFunc acquirefunc = NULL; BlockNumber relpages = 0; -#ifdef __TBASE__ - List *childs = NULL; - Oid child; - ListCell *lc; - - if(!IsAutoVacuumWorkerProcess()) - { - onerel = relation_open(relid, NoLock); - - if(RELATION_IS_INTERVAL(onerel)) - { - childs = RelationGetAllPartitions(onerel); - foreach(lc, childs) - { - child = lfirst_oid(lc); - analyze_rel(child, relation, options, params, va_cols, in_outer_xact, - bstrategy); - } - if (childs) - pfree(childs); - childs = NULL; - CommandCounterIncrement(); - } - - relation_close(onerel, NoLock); - onerel = NULL; - } -#endif /* Select logging level */ if (options & VACOPT_VERBOSE) @@ -1549,7 +1521,7 @@ acquire_inherited_sample_rows(Relation onerel, int elevel, */ if (RELATION_IS_INTERVAL(onerel)) { - tableOIDs = RelationGetAllPartitions(onerel); + tableOIDs = RelationGetAllPartitionsWithLock(onerel, AccessShareLock); } else { @@ -1562,8 +1534,9 @@ acquire_inherited_sample_rows(Relation onerel, int elevel, * child but no longer does. In that case, we can clear the * relhassubclass field so as not to make the same mistake again later. * (This is safe because we hold ShareUpdateExclusiveLock.) 
+ * */ - if (list_length(tableOIDs) < 2) + if (list_length(tableOIDs) < 2 && !(list_length(tableOIDs) == 1 && RELATION_IS_INTERVAL(onerel))) { /* CCI because we already updated the pg_class row in this command */ CommandCounterIncrement(); @@ -1594,14 +1567,8 @@ acquire_inherited_sample_rows(Relation onerel, int elevel, BlockNumber relpages = 0; /* We already got the needed lock */ - if (RELATION_IS_INTERVAL(onerel)) - { - childrel = heap_open(childOID, AccessShareLock); - } - else - { childrel = heap_open(childOID, NoLock); - } + /* Ignore if temp table of another backend */ if (RELATION_IS_OTHER_TEMP(childrel)) @@ -4878,12 +4845,12 @@ get_rel_pages_visiblepages(Relation onerel, if (RELATION_IS_INTERVAL(onerel)) { - childs = RelationGetAllPartitions(onerel); + childs = RelationGetAllPartitionsWithLock(onerel, AccessShareLock); } else { childs = - find_all_inheritors(RelationGetRelid(onerel), NoLock, NULL); + find_all_inheritors(RelationGetRelid(onerel), AccessShareLock, NULL); } *pages = 0; @@ -4896,7 +4863,7 @@ get_rel_pages_visiblepages(Relation onerel, BlockNumber visible; /* We already got the needed lock */ - childrel = heap_open(childOID, AccessShareLock); + childrel = heap_open(childOID, NoLock); /* Ignore if temp table of another backend */ if (RELATION_IS_OTHER_TEMP(childrel)) diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c index 3c337815..7f6c4e18 100644 --- a/src/backend/commands/vacuum.c +++ b/src/backend/commands/vacuum.c @@ -456,6 +456,15 @@ get_rel_oids(Oid relid, const RangeVar *vacrel) if (include_parts) oid_list = list_concat(oid_list, find_all_inheritors(relid, NoLock, NULL)); + else if (!IsAutoVacuumWorkerProcess() && + classForm->relpartkind == RELPARTKIND_PARENT) + { + Relation p_rel; + p_rel = relation_open(relid, NoLock); + oid_list = lappend_oid(oid_list, relid); + oid_list = list_concat(oid_list, RelationGetAllPartitions(p_rel)); + relation_close(p_rel, NoLock); + } else oid_list = lappend_oid(oid_list, relid); MemoryContextSwitchTo(oldcontext); @@ -1266,14 +1275,6 @@ vacuum_rel(Oid relid, RangeVar *relation, int options, VacuumParams *params) Oid save_userid; int save_sec_context; int save_nestlevel; -#ifdef __TBASE__ - bool part_vacuum_result = true; - List *childs = NULL; - List *new_childs = NULL; - Oid child; - ListCell *lc; - MemoryContext oldmctx; -#endif Assert(params != NULL); diff --git a/src/backend/utils/adt/ruleutils.c b/src/backend/utils/adt/ruleutils.c index 6b2dd38b..80623c91 100644 --- a/src/backend/utils/adt/ruleutils.c +++ b/src/backend/utils/adt/ruleutils.c @@ -93,6 +93,7 @@ #include "postmaster/postmaster.h" #endif +#include "storage/lmgr.h" /* ---------- * Pretty formatting constants * ---------- @@ -12050,6 +12051,12 @@ RelationGetPartitionByValue(Relation rel, Const *value) List * RelationGetAllPartitions(Relation rel) +{ + return RelationGetAllPartitionsWithLock(rel, NoLock); +} + +List * +RelationGetAllPartitionsWithLock(Relation rel, LOCKMODE lockmode) { int nparts = 0; char *partname = NULL; @@ -12072,7 +12079,24 @@ RelationGetAllPartitions(Relation rel) { continue; } + if (lockmode != NoLock) + { + /* Get the lock to synchronize against concurrent drop */ + LockRelationOid(partoid, lockmode); + /* + * Now that we have the lock, double-check to see if the relation + * really exists or not. If not, assume it was dropped while we + * waited to acquire lock, and ignore it. 
+ */ + if (!SearchSysCacheExists1(RELOID, ObjectIdGetDatum(partoid))) + { + /* Release useless lock */ + UnlockRelationOid(partoid, lockmode); + /* And ignore this relation */ + continue; + } + } result = lappend_oid(result, partoid); } diff --git a/src/include/utils/ruleutils.h b/src/include/utils/ruleutils.h index 5dc0e217..db0106ea 100644 --- a/src/include/utils/ruleutils.h +++ b/src/include/utils/ruleutils.h @@ -76,7 +76,8 @@ #include "nodes/nodes.h" #include "nodes/parsenodes.h" #include "nodes/pg_list.h" - +#include "nodes/relation.h" +#include "storage/lockdefs.h" extern char *pg_get_indexdef_string(Oid indexrelid); extern char *pg_get_indexdef_columns(Oid indexrelid, bool pretty); @@ -101,6 +102,8 @@ extern char * GetPartitionName(Oid parentrelid, int partidx, bool isindex); extern int RelationGetPartitionIdxByValue(Relation rel, Datum value); extern List *RelationGetAllPartitions(Relation rel); +extern List *RelationGetAllPartitionsWithLock(Relation rel, LOCKMODE lockmode); +extern int GetAllPartitionIntervalCount(Oid parent_oid); extern int GetAllPartitionIntervalCount(Oid parent_oid); From 03fe6cadb50c582b6e327cb3c12f7de2ac48f627 Mon Sep 17 00:00:00 2001 From: aslanxli Date: Thu, 17 Mar 2022 19:59:36 +0800 Subject: [PATCH 515/578] When analyzing a interval partitioned table, the sub-table not be locked when make the oids listd. Therefore, when serially analyzing sub-tables, if the sub-table is droped before processing, the analysis process will be interrupted. The fix is to use try_relation_open instead of relation_open when processing sub-tables, and skip if the opening fails. --- src/backend/commands/analyze.c | 33 ++++++++++++++++++++ src/backend/commands/vacuum.c | 55 ++++++++++++++++++++-------------- 2 files changed, 65 insertions(+), 23 deletions(-) diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c index edd33ef1..679c521a 100644 --- a/src/backend/commands/analyze.c +++ b/src/backend/commands/analyze.c @@ -170,6 +170,39 @@ analyze_rel(Oid relid, RangeVar *relation, int options, int elevel; AcquireSampleRowsFunc acquirefunc = NULL; BlockNumber relpages = 0; +#ifdef __TBASE__ + List *childs = NULL; + Oid child; + ListCell *lc; + if (!IsAutoVacuumWorkerProcess()) + { + onerel = try_relation_open(relid, NoLock); + if(!onerel) + return; + + if (RELATION_IS_INTERVAL(onerel)) + { + childs = RelationGetAllPartitions(onerel); + foreach (lc, childs) + { + child = lfirst_oid(lc); + analyze_rel(child, + relation, + options, + params, + va_cols, + in_outer_xact, + bstrategy); + } + if (childs) + pfree(childs); + childs = NULL; + CommandCounterIncrement(); + } + relation_close(onerel, NoLock); + onerel = NULL; + } +#endif /* Select logging level */ if (options & VACOPT_VERBOSE) diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c index 7f6c4e18..f9c3f3dc 100644 --- a/src/backend/commands/vacuum.c +++ b/src/backend/commands/vacuum.c @@ -456,15 +456,6 @@ get_rel_oids(Oid relid, const RangeVar *vacrel) if (include_parts) oid_list = list_concat(oid_list, find_all_inheritors(relid, NoLock, NULL)); - else if (!IsAutoVacuumWorkerProcess() && - classForm->relpartkind == RELPARTKIND_PARENT) - { - Relation p_rel; - p_rel = relation_open(relid, NoLock); - oid_list = lappend_oid(oid_list, relid); - oid_list = list_concat(oid_list, RelationGetAllPartitions(p_rel)); - relation_close(p_rel, NoLock); - } else oid_list = lappend_oid(oid_list, relid); MemoryContextSwitchTo(oldcontext); @@ -500,6 +491,9 @@ get_rel_oids(Oid relid, const RangeVar *vacrel) if 
(classForm->relpartkind == RELPARTKIND_CHILD) continue; + if (classForm->relpartkind == RELPARTKIND_CHILD) + continue; + /* Make a relation list entry for this guy */ oldcontext = MemoryContextSwitchTo(vac_context); oid_list = lappend_oid(oid_list, HeapTupleGetOid(tuple)); @@ -1275,6 +1269,15 @@ vacuum_rel(Oid relid, RangeVar *relation, int options, VacuumParams *params) Oid save_userid; int save_sec_context; int save_nestlevel; +#ifdef __TBASE__ + bool part_vacuum_result = true; + List *childs = NULL; + List *new_childs = NULL; + Oid child; + ListCell *lc; + MemoryContext oldmctx; +#endif + Assert(params != NULL); @@ -1285,19 +1288,25 @@ vacuum_rel(Oid relid, RangeVar *relation, int options, VacuumParams *params) /* functions in indexes may want a snapshot set */ PushActiveSnapshot(GetLocalTransactionSnapshot()); - onerel = relation_open(relid, NoLock); + onerel = try_relation_open(relid, NoLock); + if (!onerel) + { + PopActiveSnapshot(); + CommitTransactionCommand(); + return false; + } - if(RELATION_IS_INTERVAL(onerel)) + if (RELATION_IS_INTERVAL(onerel)) { - childs = RelationGetAllPartitions(onerel); + childs = RelationGetAllPartitions(onerel); - oldmctx = MemoryContextSwitchTo(vac_context); + oldmctx = MemoryContextSwitchTo(vac_context); new_childs = list_copy(childs); MemoryContextSwitchTo(oldmctx); - if (childs) - pfree(childs); - childs = NULL; + if (childs) + pfree(childs); + childs = NULL; onerelid = onerel->rd_lockInfo.lockRelId; LockRelationIdForSession(&onerelid, RowExclusiveLock); } @@ -1307,23 +1316,23 @@ vacuum_rel(Oid relid, RangeVar *relation, int options, VacuumParams *params) PopActiveSnapshot(); CommitTransactionCommand(); - - if(new_childs) + + if (new_childs) { - foreach(lc, new_childs) + foreach (lc, new_childs) { - child = lfirst_oid(lc); + child = lfirst_oid(lc); part_vacuum_result = vacuum_rel(child, relation, options, params); } UnlockRelationIdForSession(&onerelid, RowExclusiveLock); pfree(new_childs); - if(!part_vacuum_result) + if (!part_vacuum_result) { return false; - } + } } - } + } #endif /* Begin a transaction for vacuuming this relation */ From f11a813eaf8cab1f4890a09d2b198a5f44630e25 Mon Sep 17 00:00:00 2001 From: aslanxli Date: Fri, 18 Mar 2022 12:26:01 +0800 Subject: [PATCH 516/578] When analyzing a interval partitioned table, the sub-table not be locked when make the oids listd. Therefore, when serially analyzing sub-tables, if the sub-table is droped before processing, the analysis process will be interrupted. The fix is to use try_relation_open instead of relation_open when processing sub-tables, and skip if the opening fails. 
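
The pattern the fix relies on, sketched in isolation with a hypothetical helper (the actual changes to analyze_rel() and vacuum_rel() follow below): try_relation_open() returns NULL instead of raising an error when the relation no longer exists, so a child partition dropped after the OID list was built can simply be skipped.

#include "postgres.h"
#include "access/heapam.h"
#include "storage/lockdefs.h"

static void
process_child_if_still_there(Oid childOid)
{
	/* NULL means the child was dropped after the OID list was built */
	Relation	childrel = try_relation_open(childOid, AccessShareLock);

	if (childrel == NULL)
		return;

	/* ... analyze or vacuum the child here ... */

	relation_close(childrel, AccessShareLock);
}
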
--- src/backend/commands/analyze.c | 7 +++++-- src/backend/commands/vacuum.c | 4 ++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c index 679c521a..ace0cc24 100644 --- a/src/backend/commands/analyze.c +++ b/src/backend/commands/analyze.c @@ -176,13 +176,15 @@ analyze_rel(Oid relid, RangeVar *relation, int options, ListCell *lc; if (!IsAutoVacuumWorkerProcess()) { - onerel = try_relation_open(relid, NoLock); + onerel = try_relation_open(relid, AccessShareLock); if(!onerel) return; if (RELATION_IS_INTERVAL(onerel)) { childs = RelationGetAllPartitions(onerel); + /* no need maintain parent lock,unlock and close */ + relation_close(onerel, AccessShareLock); foreach (lc, childs) { child = lfirst_oid(lc); @@ -199,7 +201,8 @@ analyze_rel(Oid relid, RangeVar *relation, int options, childs = NULL; CommandCounterIncrement(); } - relation_close(onerel, NoLock); + else + relation_close(onerel, AccessShareLock); onerel = NULL; } #endif diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c index f9c3f3dc..93c91f68 100644 --- a/src/backend/commands/vacuum.c +++ b/src/backend/commands/vacuum.c @@ -1288,7 +1288,7 @@ vacuum_rel(Oid relid, RangeVar *relation, int options, VacuumParams *params) /* functions in indexes may want a snapshot set */ PushActiveSnapshot(GetLocalTransactionSnapshot()); - onerel = try_relation_open(relid, NoLock); + onerel = try_relation_open(relid, AccessShareLock); if (!onerel) { PopActiveSnapshot(); @@ -1311,7 +1311,7 @@ vacuum_rel(Oid relid, RangeVar *relation, int options, VacuumParams *params) LockRelationIdForSession(&onerelid, RowExclusiveLock); } - relation_close(onerel, NoLock); + relation_close(onerel, AccessShareLock); onerel = NULL; PopActiveSnapshot(); From fc15164aaadd9e4e1d745ca06525994194852bb8 Mon Sep 17 00:00:00 2001 From: aslanxli Date: Fri, 18 Mar 2022 15:34:53 +0800 Subject: [PATCH 517/578] When analyzing a interval partitioned table, the sub-table not be locked when make the oids listd. Therefore, when serially analyzing sub-tables, if the sub-table is droped before processing, the analysis process will be interrupted. The fix is to use try_relation_open instead of relation_open when processing sub-tables, and skip if the opening fails. --- src/backend/commands/analyze.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c index ace0cc24..2549b543 100644 --- a/src/backend/commands/analyze.c +++ b/src/backend/commands/analyze.c @@ -1570,12 +1570,18 @@ acquire_inherited_sample_rows(Relation onerel, int elevel, * child but no longer does. In that case, we can clear the * relhassubclass field so as not to make the same mistake again later. * (This is safe because we hold ShareUpdateExclusiveLock.) - * + * No need to deal with the parent table of interval partitioned table, so tableOIDs + * only carry children table oids. 
*/ if (list_length(tableOIDs) < 2 && !(list_length(tableOIDs) == 1 && RELATION_IS_INTERVAL(onerel))) { /* CCI because we already updated the pg_class row in this command */ CommandCounterIncrement(); + /* + * the interval partitioned table has nothing to do with attribute named + * relhassubclass + */ + if(!RELATION_IS_INTERVAL(onerel)) SetRelationHasSubclass(RelationGetRelid(onerel), false); ereport(elevel, (errmsg("skipping analyze of \"%s.%s\" inheritance tree --- this inheritance tree contains no child tables", From c7bb8fd0cedc0573f6adaa6147ebf9e1461d661c Mon Sep 17 00:00:00 2001 From: aslanxli Date: Mon, 21 Mar 2022 17:19:29 +0800 Subject: [PATCH 518/578] When analyzing a interval partitioned table, the sub-table not be locked when make the oids listd. Therefore, when serially analyzing sub-tables, if the sub-table is droped before processing, the analysis process will be interrupted. The fix is to use try_relation_open instead of relation_open when processing sub-tables, and skip if the opening fails. --- src/backend/commands/vacuumlazy.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/backend/commands/vacuumlazy.c b/src/backend/commands/vacuumlazy.c index 90dfe91f..70687960 100644 --- a/src/backend/commands/vacuumlazy.c +++ b/src/backend/commands/vacuumlazy.c @@ -213,7 +213,7 @@ lazy_vacuum_interval_rel(Relation onerel, VacuumParams *params) &multiXactCutoff, NULL); } - childs = RelationGetAllPartitions(onerel); + childs = RelationGetAllPartitionsWithLock(onerel, AccessShareLock); foreach (lc, childs) { @@ -222,7 +222,7 @@ lazy_vacuum_interval_rel(Relation onerel, VacuumParams *params) PgStat_StatTabEntry *tabentry; /* We already got the needed lock */ - childrel = heap_open(childOID, AccessShareLock); + childrel = heap_open(childOID, NoLock); /* Ignore if temp table of another backend */ if (RELATION_IS_OTHER_TEMP(childrel)) From 6846def21385aa794885a59ce81f4fbef7ea4046 Mon Sep 17 00:00:00 2001 From: ericxwu Date: Wed, 15 Sep 2021 16:23:13 +0800 Subject: [PATCH 519/578] Fix plantree_walk_initplans bug that missing one input parm of walker (merge request !701) http://tapd.oa.com/20421696/bugtrace/bugs/view?bug_id=1020421696092259197 --- src/backend/nodes/nodeFuncs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/nodes/nodeFuncs.c b/src/backend/nodes/nodeFuncs.c index e9916037..9bec67b0 100644 --- a/src/backend/nodes/nodeFuncs.c +++ b/src/backend/nodes/nodeFuncs.c @@ -3914,7 +3914,7 @@ plantree_walk_initplans(List *plans, Plan *splan = (Plan *) list_nth(subplans, (lfirst_node(SubPlan, lc))->plan_id - 1); - if (walker(splan, context)) + if (walker(splan, subplans, context)) return true; } From 4b9dcbd259c97729ae345a991668d03d8ffc32c6 Mon Sep 17 00:00:00 2001 From: whalesong Date: Tue, 12 Apr 2022 11:55:03 +0800 Subject: [PATCH 520/578] Bugfix: report "prepared statement XXX does not exist" errors all the time after cn switch (merge request !1244), http://tapd.oa.com/20421696/bugtrace/bugs/view?bug_id=1020421696098282911 (cherry picked from commit dbc2ef63) 774a3df1 bugfix: report prepared statement XXX does not exist errors all the time after cn switch, http://tapd.oa.com/20421696/bugtrace/bugs/view?bug_id=1020421696098282911 --- src/backend/pgxc/pool/pgxcnode.c | 42 ++++++++++++++++---------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c index e4767e9d..c19325a9 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ 
b/src/backend/pgxc/pool/pgxcnode.c @@ -4033,27 +4033,6 @@ get_handles(List *datanodelist, List *coordlist, bool is_coord_only_query, bool node_handle = &dn_handles[node]; - if (be_pid == 0 && !raise_error) - { - PGXCNodeSetConnectionState(node_handle, DN_CONNECTION_STATE_ERROR_FATAL); - continue; - } - - pgxc_node_init(node_handle, fdsock, is_global_session, be_pid); - dn_handles[node] = *node_handle; - datanode_count++; - - elog(DEBUG1, "Established a connection with datanode \"%s\"," - "remote backend PID %d, socket fd %d, global session %c", - node_handle->nodename, (int) be_pid, fdsock, - is_global_session ? 'T' : 'F'); -#ifdef _PG_REGRESS_ - elog(LOG, "Established a connection with datanode \"%s\"," - "remote backend PID %d, socket fd %d, global session %c", - node_handle->nodename, (int) be_pid, fdsock, - is_global_session ? 'T' : 'F'); -#endif - if (IS_PGXC_COORDINATOR) { char nodetype = PGXC_NODE_DATANODE; @@ -4078,6 +4057,27 @@ get_handles(List *datanodelist, List *coordlist, bool is_coord_only_query, bool "oid %d, type %c, max nodes %d", node_handle->nodename, nodeidx, node_handle->nodeoid, nodetype, NumDataNodes); } + + if (be_pid == 0 && !raise_error) + { + PGXCNodeSetConnectionState(node_handle, DN_CONNECTION_STATE_ERROR_FATAL); + continue; + } + + pgxc_node_init(node_handle, fdsock, is_global_session, be_pid); + dn_handles[node] = *node_handle; + datanode_count++; + + elog(DEBUG1, "Established a connection with datanode \"%s\"," + "remote backend PID %d, socket fd %d, global session %c", + node_handle->nodename, (int) be_pid, fdsock, + is_global_session ? 'T' : 'F'); +#ifdef _PG_REGRESS_ + elog(LOG, "Established a connection with datanode \"%s\"," + "remote backend PID %d, socket fd %d, global session %c", + node_handle->nodename, (int) be_pid, fdsock, + is_global_session ? 'T' : 'F'); +#endif } } /* Initialisation for Coordinators */ From fb5cdc4931b580fed244cd8e9a0bc0854d474b62 Mon Sep 17 00:00:00 2001 From: arrowbowang Date: Tue, 12 Apr 2022 14:20:20 +0800 Subject: [PATCH 521/578] fix: when use exetended protocol change pg_stat_activity stat to idle after the sql finished on dn --- src/backend/tcop/postgres.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index 126bae58..c04d65ed 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -4833,6 +4833,7 @@ PostgresMain(int argc, char *argv[], StringInfoData input_message; sigjmp_buf local_sigjmp_buf; volatile bool send_ready_for_query = true; + volatile bool need_report_activity = false; bool disable_idle_in_transaction_timeout = false; #ifdef PGXC /* PGXC_DATANODE */ @@ -5410,7 +5411,7 @@ PostgresMain(int argc, char *argv[], * uncommitted updates (that confuses autovacuum). The notification * processor wants a call too, if we are not in a transaction block. 
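
For context, a small libpq sketch (not from this patch; the table and statement names are made up) of the extended-protocol flow this fix targets: PQprepare()/PQexecPrepared() drive Parse/Bind/Execute and end each cycle with a Sync, and with the change below the datanode reports itself idle in pg_stat_activity once that Sync has been processed.

#include "libpq-fe.h"

static void
run_prepared(PGconn *conn)
{
	const char *params[1] = {"42"};
	PGresult   *res;

	res = PQprepare(conn, "get_row", "SELECT * FROM t WHERE id = $1", 1, NULL);
	PQclear(res);

	res = PQexecPrepared(conn, "get_row", 1, params, NULL, NULL, 0);
	/* ... consume the rows ... */
	PQclear(res);

	/* after the Sync that ends this cycle, the datanode backend goes idle */
}
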
*/ - if (send_ready_for_query) + if (send_ready_for_query || need_report_activity) { if (IsAbortedTransactionBlockState()) { @@ -5447,6 +5448,7 @@ PostgresMain(int argc, char *argv[], pgstat_report_activity(STATE_IDLE, NULL); } + if(send_ready_for_query) ReadyForQuery(whereToSendOutput); #ifdef XCP @@ -5469,6 +5471,7 @@ PostgresMain(int argc, char *argv[], #endif send_ready_for_query = false; + need_report_activity = false; } /* @@ -5809,6 +5812,7 @@ PostgresMain(int argc, char *argv[], case 'L': /* sync */ pq_getmsgend(&input_message); finish_xact_command(); + need_report_activity = true; break; #ifdef __TBASE__ case 'N': From 4991a508465cb8d420e53f83c31f5ab6a88b0c0b Mon Sep 17 00:00:00 2001 From: sigmalin Date: Sat, 2 Apr 2022 16:11:07 +0800 Subject: [PATCH 522/578] fix core when create node http://tapd.woa.com/TBase_Oracle_Migration/bugtrace/bugs/view?bug_id=1020421696098208891&jump_count=1 (merge request !1237) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Squash merge branch 'sigmalin002' into 'Tbase_v5.06.2' 问题:多个stmt的语句,在exec_simple_query进行了拆分。插件这里(pg_stat_log,pg_stat_statements中pgsl_store和pgss_store)还是按照原始的query_string去计算目标的query位置,可能会导致内存访问的问题,从而core。 修复:原来exec_simple_query中拆分sql是自己去匹配分号,修改为根据stmt中的query_location来进行拆分,拆分后的sql作为portal的sourceText,stmt里面的query_location置为0 TAPD: --bug=098208891 --- src/backend/tcop/postgres.c | 88 ++++++++++++------------------------- 1 file changed, 27 insertions(+), 61 deletions(-) diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index c04d65ed..37819150 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -1159,70 +1159,43 @@ ch_is_space(char ch) * if the query string contain multi stmt */ static char* -get_myself_query_string(char* query_string, char** out_query_string) +get_myself_query_string(const char* query_string, RawStmt *parsetree) { - char *string_delimeter = NULL; - char *myself_query_string = NULL; - int myself_query_string_len = 0; - int pos = 0; - bool in_quotation = false; - int query_string_len = 0; - - if (query_string && query_string[0] != '\0') - { - /* skip space and redundant ';' */ - while (*query_string != '\0') + static StringInfo myself_query_string = NULL; + int query_location; + int query_len; + MemoryContext oldcontext; + + if (parsetree->stmt_location >= 0) { - if (ch_is_space(*query_string) || *query_string == ';') - { - query_string++; + Assert(parsetree->stmt_location <= strlen(query_string)); + query_location = parsetree->stmt_location; + /* Length of 0 (or -1) means "rest of string" */ + query_len = (parsetree->stmt_len <= 0) ? strlen(query_string) : parsetree->stmt_len; + /* update the location */ + parsetree->stmt_location = 0; } else { - break; - } - } - - if (*query_string == '\0') - { - *out_query_string = NULL; - return NULL; - } - - /* find ';' in query string, be careful of '\'' */ - query_string_len = strlen(query_string); - for (pos = 0; pos < query_string_len; pos++) - { - if (query_string[pos] == '\'') - { - in_quotation = (in_quotation) ? 
false : true; + /* If query location is unknown, distrust query_len as well */ + query_location = 0; + query_len = strlen(query_string); } - if (query_string[pos] == ';' && !in_quotation) + oldcontext = MemoryContextSwitchTo(TopMemoryContext); + if (myself_query_string == NULL) { - string_delimeter = &query_string[pos]; - break; - } - } - - if (string_delimeter == NULL) - { - myself_query_string = query_string; - query_string = NULL; + myself_query_string = makeStringInfo(); } else { - myself_query_string_len = string_delimeter - query_string; - myself_query_string = palloc(myself_query_string_len + 1); - memcpy(myself_query_string, query_string, myself_query_string_len); - myself_query_string[myself_query_string_len] = '\0'; - - query_string = string_delimeter + 1; - } + resetStringInfo(myself_query_string); } - *out_query_string = myself_query_string; - return query_string; + appendBinaryStringInfo(myself_query_string, query_string + query_location, query_len); + MemoryContextSwitchTo(oldcontext); + + return myself_query_string->data; } /* @@ -1242,7 +1215,6 @@ exec_simple_query(const char *query_string) bool isTopLevel; char msec_str[32]; bool multiCommands = false; - char *query_string_tmp = NULL; /* * Report query to various monitoring facilities. @@ -1314,8 +1286,6 @@ exec_simple_query(const char *query_string) errmsg("COMMIT or ROLLBACK " "in multi-statement queries not allowed"))); } - - query_string_tmp = (char*) query_string; } /* @@ -1373,13 +1343,9 @@ exec_simple_query(const char *query_string) Portal portal; DestReceiver *receiver; int16 format; - char *myself_query_string = NULL; - - if (query_string_tmp && query_string_tmp[0] != '\0') - { /* get this portal's query when has multi parse tree */ - query_string_tmp = get_myself_query_string(query_string_tmp, &myself_query_string); - } + const char *myself_query_string = isTopLevel ? debug_query_string : + (const char *)get_myself_query_string(debug_query_string, parsetree); #ifdef PGXC @@ -1543,7 +1509,7 @@ exec_simple_query(const char *query_string) */ PortalDefineQuery(portal, NULL, - (myself_query_string) ? 
myself_query_string : query_string, + myself_query_string, commandTag, plantree_list, NULL); From 20e9ddda00b8ddaf82d74416ae2937db80fe7c55 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Thu, 7 Apr 2022 16:50:47 +0800 Subject: [PATCH 523/578] fix bug in PgxcNodeAlter --- src/backend/pgxc/nodemgr/nodemgr.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/backend/pgxc/nodemgr/nodemgr.c b/src/backend/pgxc/nodemgr/nodemgr.c index 65fbcccd..a906e3dd 100644 --- a/src/backend/pgxc/nodemgr/nodemgr.c +++ b/src/backend/pgxc/nodemgr/nodemgr.c @@ -1461,8 +1461,6 @@ PgxcNodeAlter(AlterNodeStmt *stmt) /* Check that node exists */ if (!OidIsValid(nodeOid)) { - nodeOid = get_pgxc_nodeoid_extend(node_name, PGXCDefaultClusterName); - if (!OidIsValid(nodeOid)) ereport(ERROR, (errcode(ERRCODE_UNDEFINED_OBJECT), errmsg("PGXC Node %s: object not defined", From 580a9ea2dc1a7063e346900db01412229cd4d355 Mon Sep 17 00:00:00 2001 From: andrelin Date: Fri, 25 Mar 2022 14:18:07 +0800 Subject: [PATCH 524/578] free planstate related memory in pg_stat_cluster_activity by a memory context tapd: http://tapd.woa.com/TEG_TBase/bugtrace/bugs/view/1020423208097794079 --- .../pg_stat_cluster_activity.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c b/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c index 2b36fe39..58c989f1 100644 --- a/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c +++ b/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c @@ -20,6 +20,7 @@ #include "storage/shmem.h" #include "utils/builtins.h" #include "utils/guc.h" +#include "utils/memutils.h" #include "utils/portal.h" #include "utils/snapmgr.h" #include "utils/timestamp.h" @@ -104,6 +105,8 @@ static ExecutorStart_hook_type prev_ExecutorStart = NULL; static bool pgcs_enable_planstate; /* whether to show planstate in result sets */ +MemoryContext PGCSMemoryContext = NULL; + /* * Macros to load and store st_changecount with the memory barriers. * @@ -419,6 +422,8 @@ pgcs_report_executor_activity(QueryDesc *desc, int eflags) volatile PgClusterStatus *entry; StringInfo planstate_str = NULL; StringInfo cursors = NULL; + ExplainState *es = NULL; + MemoryContext oldctx; if (prev_ExecutorStart) prev_ExecutorStart(desc, eflags); @@ -440,12 +445,14 @@ pgcs_report_executor_activity(QueryDesc *desc, int eflags) return; } + oldctx = MemoryContextSwitchTo(PGCSMemoryContext); + if (desc->planstate != NULL) { /* make planstate text tree if enabled */ if (pgcs_enable_planstate) { - ExplainState *es = NewExplainState(); + es = NewExplainState(); es->costs = false; /* we don't want plan->targetlist been changed */ @@ -470,6 +477,9 @@ pgcs_report_executor_activity(QueryDesc *desc, int eflags) cursorCollectWalker(desc->planstate, cursors); } + MemoryContextSwitchTo(oldctx); + MemoryContextResetAndDeleteChildren(PGCSMemoryContext); + increment_changecount_before(entry); if (planstate_str != NULL && planstate_str->len > 0) @@ -1140,6 +1150,9 @@ _PG_init(void) */ RequestAddinShmemSpace(pgcs_memsize()); + PGCSMemoryContext = AllocSetContextCreate(TopMemoryContext, + "pg_stat_cluster_activity planstate", + ALLOCSET_DEFAULT_SIZES); /* * Install hooks. 
*/ From ed6567e4cdefdad5903d97fd23b111e64c574715 Mon Sep 17 00:00:00 2001 From: andrelin Date: Thu, 14 Apr 2022 10:05:31 +0800 Subject: [PATCH 525/578] Reset PGCSMemoryContext properly in pg_stat_cluster_activity --- contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c b/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c index 58c989f1..4e721ead 100644 --- a/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c +++ b/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c @@ -477,9 +477,6 @@ pgcs_report_executor_activity(QueryDesc *desc, int eflags) cursorCollectWalker(desc->planstate, cursors); } - MemoryContextSwitchTo(oldctx); - MemoryContextResetAndDeleteChildren(PGCSMemoryContext); - increment_changecount_before(entry); if (planstate_str != NULL && planstate_str->len > 0) @@ -491,6 +488,9 @@ pgcs_report_executor_activity(QueryDesc *desc, int eflags) pgcs_report_role((PgClusterStatus *) entry, desc); increment_changecount_after(entry); + + MemoryContextSwitchTo(oldctx); + MemoryContextResetAndDeleteChildren(PGCSMemoryContext); } /* ---------- From 6855a6dfe611a11229d5aae9780d9bc7139e61a1 Mon Sep 17 00:00:00 2001 From: andrelin Date: Thu, 14 Apr 2022 15:24:00 +0800 Subject: [PATCH 526/578] Squash merge branch 'andrelin/Tbase_v5.06.2' into 'Tbase_v5.06.2' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit revert了历史提交,用desc->estate->es_query_cxt来保证是每个query独立的context 测试了之前单进程跑的内存测试用例,带着插件跑tpcc都OK了 --- .../pg_stat_cluster_activity.c | 20 ++++++++----------- 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c b/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c index 4e721ead..4bd82758 100644 --- a/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c +++ b/contrib/pg_stat_cluster_activity/pg_stat_cluster_activity.c @@ -20,7 +20,6 @@ #include "storage/shmem.h" #include "utils/builtins.h" #include "utils/guc.h" -#include "utils/memutils.h" #include "utils/portal.h" #include "utils/snapmgr.h" #include "utils/timestamp.h" @@ -105,8 +104,6 @@ static ExecutorStart_hook_type prev_ExecutorStart = NULL; static bool pgcs_enable_planstate; /* whether to show planstate in result sets */ -MemoryContext PGCSMemoryContext = NULL; - /* * Macros to load and store st_changecount with the memory barriers. * @@ -422,8 +419,7 @@ pgcs_report_executor_activity(QueryDesc *desc, int eflags) volatile PgClusterStatus *entry; StringInfo planstate_str = NULL; StringInfo cursors = NULL; - ExplainState *es = NULL; - MemoryContext oldctx; + MemoryContext oldcxt; if (prev_ExecutorStart) prev_ExecutorStart(desc, eflags); @@ -445,14 +441,18 @@ pgcs_report_executor_activity(QueryDesc *desc, int eflags) return; } - oldctx = MemoryContextSwitchTo(PGCSMemoryContext); + /* + * Make sure we operate in the per-query context, so any cruft will be + * discarded later during ExecutorEnd. estate should be set by standard_ExecutorStart. 
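
As a point of reference, the general shape of this approach for an ExecutorStart hook, sketched independently of the extension's actual code: chain to the previous hook (or standard_ExecutorStart), then allocate all scratch data in the per-query context so that ExecutorEnd releases it automatically.

#include "postgres.h"
#include "executor/executor.h"

static ExecutorStart_hook_type prev_start_hook = NULL;

static void
my_executor_start(QueryDesc *queryDesc, int eflags)
{
	MemoryContext oldcxt;

	if (prev_start_hook)
		prev_start_hook(queryDesc, eflags);
	else
		standard_ExecutorStart(queryDesc, eflags);

	/* es_query_cxt exists once standard_ExecutorStart has run */
	oldcxt = MemoryContextSwitchTo(queryDesc->estate->es_query_cxt);

	/* palloc() plan-state text, cursor lists, etc. here */

	MemoryContextSwitchTo(oldcxt);
}

/* in _PG_init():  prev_start_hook = ExecutorStart_hook;
 *                 ExecutorStart_hook = my_executor_start;           */
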
+ */ + oldcxt = MemoryContextSwitchTo(desc->estate->es_query_cxt); if (desc->planstate != NULL) { /* make planstate text tree if enabled */ if (pgcs_enable_planstate) { - es = NewExplainState(); + ExplainState *es = NewExplainState(); es->costs = false; /* we don't want plan->targetlist been changed */ @@ -489,8 +489,7 @@ pgcs_report_executor_activity(QueryDesc *desc, int eflags) increment_changecount_after(entry); - MemoryContextSwitchTo(oldctx); - MemoryContextResetAndDeleteChildren(PGCSMemoryContext); + MemoryContextSwitchTo(oldcxt); } /* ---------- @@ -1150,9 +1149,6 @@ _PG_init(void) */ RequestAddinShmemSpace(pgcs_memsize()); - PGCSMemoryContext = AllocSetContextCreate(TopMemoryContext, - "pg_stat_cluster_activity planstate", - ALLOCSET_DEFAULT_SIZES); /* * Install hooks. */ From 20f6359700f6f9ed2be7a25e5bff1300a544aec7 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Thu, 10 Feb 2022 11:23:25 +0800 Subject: [PATCH 527/578] fix size calculation error in SharedQueueShmemSize http://tapd.woa.com/20421696/bugtrace/bugs/view?bug_id=1020421696096562981 --- src/backend/pgxc/squeue/squeue.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/backend/pgxc/squeue/squeue.c b/src/backend/pgxc/squeue/squeue.c index 19387c66..61289161 100644 --- a/src/backend/pgxc/squeue/squeue.c +++ b/src/backend/pgxc/squeue/squeue.c @@ -772,15 +772,20 @@ SharedQueueShmemSize(void) { Size sqs_size; + /* Shared Queues Sync */ sqs_size = mul_size(NUM_SQUEUES, SQUEUE_SYNC_SIZE); + /* Shared Queue Locks */ + sqs_size = add_size(sqs_size, mul_size((NUM_SQUEUES * (TBASE_MAX_DATANODE_NUMBER)), sizeof(LWLockPadded))); #ifdef __TBASE__ if (g_UseDataPump) { + /* Disconnect Consumers */ sqs_size = add_size(sqs_size, hash_estimate_size(NUM_SQUEUES, sizeof(DisConsumer))); } #endif + /* Shared Queues */ if(g_UseDataPump) return add_size(sqs_size, hash_estimate_size(NUM_SQUEUES, SQUEUE_HDR_SIZE(TBASE_MAX_DATANODE_NUMBER))); else From 77a12b5587b5ea8feb6d725114e647feea7db79d Mon Sep 17 00:00:00 2001 From: aslanxli Date: Mon, 7 Mar 2022 09:22:35 +0800 Subject: [PATCH 528/578] support sync statistic to other coordinator node when execute analyze with sync check pick commit: 7b23c7da --- src/backend/commands/analyze.c | 572 +++++++++++++++++++++++++++- src/backend/commands/vacuum.c | 16 +- src/backend/nodes/copyfuncs.c | 15 + src/backend/nodes/equalfuncs.c | 13 + src/backend/parser/gram.y | 43 ++- src/backend/postmaster/autovacuum.c | 3 +- src/backend/tcop/utility.c | 161 +++++++- src/include/commands/vacuum.h | 22 +- src/include/nodes/nodes.h | 1 + src/include/nodes/parsenodes.h | 9 + src/include/parser/kwlist.h | 1 + 11 files changed, 823 insertions(+), 33 deletions(-) diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c index 2549b543..f0f92a5b 100644 --- a/src/backend/commands/analyze.c +++ b/src/backend/commands/analyze.c @@ -121,7 +121,7 @@ static BufferAccessStrategy vac_strategy; static void do_analyze_rel(Relation onerel, int options, VacuumParams *params, List *va_cols, AcquireSampleRowsFunc acquirefunc, BlockNumber relpages, - bool inh, bool in_outer_xact, int elevel); + bool inh, bool in_outer_xact, int elevel, AnalyzeSyncOpt *syncOpt); static void compute_index_stats(Relation onerel, double totalrows, AnlIndexData *indexdata, int nindexes, HeapTuple *rows, int numrows, @@ -139,6 +139,14 @@ static void update_attstats(Oid relid, bool inh, int natts, VacAttrStats **vacattrstats); static Datum std_fetch_func(VacAttrStatsP stats, int rownum, bool *isNull); static Datum 
ind_fetch_func(VacAttrStatsP stats, int rownum, bool *isNull); +static void analyze_rel_sync(Relation onerel, + bool inh, + int attr_cnt, + VacAttrStats **vacattrstats, + int nindexes, + Relation *indexes, + AnlIndexData *indexdata, + AnalyzeSyncOpt *syncOpt); #ifdef XCP static void analyze_rel_coordinator(Relation onerel, bool inh, int attr_cnt, @@ -162,10 +170,15 @@ static int acquire_coordinator_sample_rows(Relation onerel, int elevel, * analyze_rel() -- analyze one relation */ void -analyze_rel(Oid relid, RangeVar *relation, int options, - VacuumParams *params, List *va_cols, bool in_outer_xact, - BufferAccessStrategy bstrategy) -{// #lizard forgives +analyze_rel(Oid relid, + RangeVar *relation, + int options, + VacuumParams *params, + List *va_cols, + bool in_outer_xact, + BufferAccessStrategy bstrategy, + AnalyzeSyncOpt *syncOpt) +{ Relation onerel; int elevel; AcquireSampleRowsFunc acquirefunc = NULL; @@ -194,7 +207,8 @@ analyze_rel(Oid relid, RangeVar *relation, int options, params, va_cols, in_outer_xact, - bstrategy); + bstrategy, + syncOpt); } if (childs) pfree(childs); @@ -381,14 +395,14 @@ analyze_rel(Oid relid, RangeVar *relation, int options, */ if (onerel->rd_rel->relkind != RELKIND_PARTITIONED_TABLE) do_analyze_rel(onerel, options, params, va_cols, acquirefunc, - relpages, false, in_outer_xact, elevel); + relpages, false, in_outer_xact, elevel, syncOpt); /* * If there are child tables, do recursive ANALYZE. */ if (onerel->rd_rel->relhassubclass) do_analyze_rel(onerel, options, params, va_cols, acquirefunc, relpages, - true, in_outer_xact, elevel); + true, in_outer_xact, elevel, syncOpt); /* * Close source relation now, but keep lock so that no one deletes it @@ -415,11 +429,17 @@ analyze_rel(Oid relid, RangeVar *relation, int options, * appropriate acquirefunc for each child table. */ static void -do_analyze_rel(Relation onerel, int options, VacuumParams *params, - List *va_cols, AcquireSampleRowsFunc acquirefunc, - BlockNumber relpages, bool inh, bool in_outer_xact, - int elevel) -{// #lizard forgives +do_analyze_rel(Relation onerel, + int options, + VacuumParams *params, + List *va_cols, + AcquireSampleRowsFunc acquirefunc, + BlockNumber relpages, + bool inh, + bool in_outer_xact, + int elevel, + AnalyzeSyncOpt *syncOpt) +{ int attr_cnt, tcnt, i, @@ -609,6 +629,24 @@ do_analyze_rel(Relation onerel, int options, VacuumParams *params, onerel->rd_locator_info && !RELATION_IS_COORDINATOR_LOCAL(onerel)); + /* + * Sync statistics if this session is connected to other remote Coordinator. + * When receiving sync commands directly from the client, we also sync statistics. + */ + if (iscoordinator && IsConnFromCoord() && + (syncOpt != NULL && syncOpt->is_sync_from == true)) + { + elog(INFO, "SYNC statistic"); + analyze_rel_sync(onerel, + inh, + attr_cnt, + vacattrstats, + nindexes, + Irel, + indexdata, + syncOpt); + goto cleanup; + } #ifdef XCP #ifdef __TBASE__ if (!enable_sampling_analyze && iscoordinator) @@ -5310,3 +5348,511 @@ acquire_coordinator_sample_rows(Relation onerel, int elevel, #endif + + +/* + * coord_collect_simple_stats + * Collect simple stats for a relation (pg_statistic contents). + * + * Collects statistics from the datanodes, and then keeps the one of the + * received statistics for each attribute (the first one we receive, but + * it's mostly random). + * + * XXX We do not try to build statistics covering data fro all the nodes, + * either by collecting fresh sample of rows or merging the statistics + * somehow. 
The current approach is very simple and cheap, but may have + * negative impact on estimate accuracy as the stats only covers data + * from a single node, and we may end up with stats from different node + * for each attribute. + */ +static void +coord_collect_stats(Relation onerel, bool inh, int attr_cnt, + VacAttrStats **vacattrstats, AnalyzeSyncOpt *syncOpt) +{ + char *nspname; + char *relname; + /* Fields to run query to read statistics from data nodes */ + StringInfoData query; + EState *estate; + MemoryContext oldcontext; + RemoteQuery *step; + RemoteQueryState *node; + TupleTableSlot *result; + int i; + /* Number of data nodes from which attribute statistics are received. */ + int *numnodes; + int reltuples; + int relpages; + int relallvisible; + bool relhasindex; + ListCell *lc; + int nodeIdx; + ExecNodes *execnodes = (ExecNodes *)makeNode(ExecNodes); + /* Get the relation identifier */ + relname = RelationGetRelationName(onerel); + nspname = get_namespace_name(RelationGetNamespace(onerel)); + + /* Make up query string */ + initStringInfo(&query); + /* Generic statistic fields */ + appendStringInfoString(&query, + "SELECT s.staattnum, " + "c.reltuples, " + "c.relpages," + "c.relallvisible," + "c.relhasindex," + "s.stanullfrac, " + "s.stawidth, " + "s.stadistinct"); + /* Detailed statistic slots */ + for (i = 1; i <= STATISTIC_NUM_SLOTS; i++) + appendStringInfo(&query, ", s.stakind%d" + ", o%d.oprname" + ", no%d.nspname" + ", t%dl.typname" + ", nt%dl.nspname" + ", t%dr.typname" + ", nt%dr.nspname" + ", s.stanumbers%d" + ", s.stavalues%d", + i, i, i, i, i, i, i, i, i); + + /* Common part of FROM clause */ + appendStringInfoString(&query, " FROM pg_statistic s JOIN pg_class c " + " ON s.starelid = c.oid " + "JOIN pg_namespace nc " + " ON c.relnamespace = nc.oid "); + /* Info about involved operations */ + for (i = 1; i <= STATISTIC_NUM_SLOTS; i++) + appendStringInfo(&query, "LEFT JOIN (pg_operator o%d " + " JOIN pg_namespace no%d " + " ON o%d.oprnamespace = no%d.oid " + " JOIN pg_type t%dl " + " ON o%d.oprleft = t%dl.oid " + " JOIN pg_namespace nt%dl " + " ON t%dl.typnamespace = nt%dl.oid " + " JOIN pg_type t%dr " + " ON o%d.oprright = t%dr.oid " + " JOIN pg_namespace nt%dr " + " ON t%dr.typnamespace = nt%dr.oid) " + " ON s.staop%d = o%d.oid ", + i, i, i, i, i, i, i, i, i, + i, i, i, i, i, i, i, i, i); + appendStringInfo(&query, "WHERE nc.nspname = '%s' " + "AND c.relname = '%s'", + nspname, relname); + + /* Build up RemoteQuery */ + execnodes->accesstype = RELATION_ACCESS_READ; + execnodes->baselocatortype = LOCATOR_TYPE_SHARD; /* not used */ + execnodes->en_expr = NULL; + execnodes->en_relid = InvalidOid; + execnodes->primarynodelist = NIL; + + foreach (lc, syncOpt->nodes) + { + char node_type = PGXC_NODE_COORDINATOR; + nodeIdx = PGXCNodeGetNodeIdFromName(strVal(lfirst(lc)), &node_type); + execnodes->nodeList = lappend_int(execnodes->nodeList, nodeIdx); + } + step = makeNode(RemoteQuery); + step->combine_type = COMBINE_TYPE_NONE; + step->exec_nodes = execnodes; + step->sql_statement = query.data; + step->force_autocommit = true; + step->exec_type = EXEC_ON_COORDS; + + /* Add targetlist entries */ + step->scan.plan.targetlist = lappend(step->scan.plan.targetlist, + make_relation_tle(StatisticRelationId, + "pg_statistic", + "staattnum")); + step->scan.plan.targetlist = lappend(step->scan.plan.targetlist, + make_relation_tle(RelationRelationId, + "pg_class", + "reltuples")); + step->scan.plan.targetlist = + lappend(step->scan.plan.targetlist, + make_relation_tle(RelationRelationId, 
"pg_class", "relpages")); + step->scan.plan.targetlist = + lappend(step->scan.plan.targetlist, + make_relation_tle(RelationRelationId, "pg_class", "relallvisible")); + step->scan.plan.targetlist = + lappend(step->scan.plan.targetlist, + make_relation_tle(RelationRelationId, "pg_class", "relhasindex")); + step->scan.plan.targetlist = lappend(step->scan.plan.targetlist, + make_relation_tle(StatisticRelationId, + "pg_statistic", + "stanullfrac")); + step->scan.plan.targetlist = lappend(step->scan.plan.targetlist, + make_relation_tle(StatisticRelationId, + "pg_statistic", + "stawidth")); + step->scan.plan.targetlist = lappend(step->scan.plan.targetlist, + make_relation_tle(StatisticRelationId, + "pg_statistic", + "stadistinct")); + for (i = 1; i <= STATISTIC_NUM_SLOTS; i++) + { + /* 16 characters would be enough */ + char colname[16]; + + sprintf(colname, "stakind%d", i); + step->scan.plan.targetlist = lappend(step->scan.plan.targetlist, + make_relation_tle(StatisticRelationId, + "pg_statistic", + colname)); + + step->scan.plan.targetlist = lappend(step->scan.plan.targetlist, + make_relation_tle(OperatorRelationId, + "pg_operator", + "oprname")); + step->scan.plan.targetlist = lappend(step->scan.plan.targetlist, + make_relation_tle(NamespaceRelationId, + "pg_namespace", + "nspname")); + step->scan.plan.targetlist = lappend(step->scan.plan.targetlist, + make_relation_tle(TypeRelationId, + "pg_type", + "typname")); + step->scan.plan.targetlist = lappend(step->scan.plan.targetlist, + make_relation_tle(NamespaceRelationId, + "pg_namespace", + "nspname")); + step->scan.plan.targetlist = lappend(step->scan.plan.targetlist, + make_relation_tle(TypeRelationId, + "pg_type", + "typname")); + step->scan.plan.targetlist = lappend(step->scan.plan.targetlist, + make_relation_tle(NamespaceRelationId, + "pg_namespace", + "nspname")); + + sprintf(colname, "stanumbers%d", i); + step->scan.plan.targetlist = lappend(step->scan.plan.targetlist, + make_relation_tle(StatisticRelationId, + "pg_statistic", + colname)); + + sprintf(colname, "stavalues%d", i); + step->scan.plan.targetlist = lappend(step->scan.plan.targetlist, + make_relation_tle(StatisticRelationId, + "pg_statistic", + colname)); + } + /* Execute query on the data nodes */ + estate = CreateExecutorState(); + + oldcontext = MemoryContextSwitchTo(estate->es_query_cxt); + + /* + * Take a fresh snapshot so that we see the effects of the ANALYZE command + * on the datanode. 
That command is run in auto-commit mode hence just + * bumping up the command ID is not good enough + */ + /* PushActiveSnapshot(GetLocalTransactionSnapshot()); */ + estate->es_snapshot = GetActiveSnapshot(); + + node = ExecInitRemoteQuery(step, estate, 0); + MemoryContextSwitchTo(oldcontext); + + /* get ready to combine results */ + numnodes = (int *) palloc(attr_cnt * sizeof(int)); + for (i = 0; i < attr_cnt; i++) + numnodes[i] = 0; + + result = ExecRemoteQuery((PlanState *) node); + /* PopActiveSnapshot(); */ + while (result != NULL && !TupIsNull(result)) + { + Datum value; + bool isnull; + int colnum = 1; + int16 attnum; + float4 nullfrac; + int32 width; + float4 distinct; + VacAttrStats *stats = NULL; + + + /* Process statistics from the data node */ + value = slot_getattr(result, colnum++, &isnull); /* staattnum */ + attnum = DatumGetInt16(value); + for (i = 0; i < attr_cnt; i++) + if (vacattrstats[i]->attr->attnum == attnum) + { + stats = vacattrstats[i]; + stats->stats_valid = true; + numnodes[i]++; + break; + } + + value = slot_getattr(result, colnum++, &isnull); /* reltuples */ + reltuples = DatumGetFloat4(value); + + value = slot_getattr(result, colnum++, &isnull); /* relpages */ + relpages = DatumGetInt32(value); + + value = slot_getattr(result, colnum++, &isnull); /* relallvisible */ + relallvisible = DatumGetInt32(value); + + value = slot_getattr(result, colnum++, &isnull); /* relhasindex */ + relhasindex = DatumGetBool(value); + + if (stats) + { + value = slot_getattr(result, colnum++, &isnull); /* stanullfrac */ + nullfrac = DatumGetFloat4(value); + stats->stanullfrac = nullfrac; + + value = slot_getattr(result, colnum++, &isnull); /* stawidth */ + width = DatumGetInt32(value); + stats->stawidth = width; + + value = slot_getattr(result, colnum++, &isnull); /* stadistinct */ + distinct = DatumGetFloat4(value); + stats->stadistinct = distinct; + + /* Detailed statistics */ + for (i = 1; i <= STATISTIC_NUM_SLOTS; i++) + { + int16 kind; + float4 *numbers; + Datum *values; + int nnumbers, nvalues; + int k; + + value = slot_getattr(result, colnum++, &isnull); /* kind */ + kind = DatumGetInt16(value); + + if (kind == 0) + { + /* + * Empty slot - skip next 8 fields: 6 fields of the + * operation identifier and two data fields (numbers and + * values) + */ + colnum += 8; + continue; + } + else + { + Oid oprid; + + /* Get operator */ + value = slot_getattr(result, colnum++, &isnull); /* oprname */ + if (isnull) + { + /* + * Operator is not specified for that kind, skip remaining + * fields to lookup the operator + */ + oprid = InvalidOid; + colnum += 5; /* skip operation nsp and types */ + } + else + { + char *oprname; + char *oprnspname; + Oid ltypid, rtypid; + char *ltypname, + *rtypname; + char *ltypnspname, + *rtypnspname; + oprname = DatumGetCString(value); + value = slot_getattr(result, colnum++, &isnull); /* oprnspname */ + oprnspname = DatumGetCString(value); + /* Get left operand data type */ + value = slot_getattr(result, colnum++, &isnull); /* typname */ + ltypname = DatumGetCString(value); + value = slot_getattr(result, colnum++, &isnull); /* typnspname */ + ltypnspname = DatumGetCString(value); + ltypid = get_typname_typid(ltypname, + get_namespaceid(ltypnspname)); + /* Get right operand data type */ + value = slot_getattr(result, colnum++, &isnull); /* typname */ + rtypname = DatumGetCString(value); + value = slot_getattr(result, colnum++, &isnull); /* typnspname */ + rtypnspname = DatumGetCString(value); + rtypid = get_typname_typid(rtypname, + 
get_namespaceid(rtypnspname)); + /* lookup operator */ + oprid = get_operid(oprname, ltypid, rtypid, + get_namespaceid(oprnspname)); + } + /* + * Look up a statistics slot. If there is an entry of the + * same kind already, leave it, assuming the statistics + * is approximately the same on all nodes, so values from + * one node are representing entire relation well. + * If empty slot is found store values here. If no more + * slots skip remaining values. + */ + for (k = 0; k < STATISTIC_NUM_SLOTS; k++) + { + if (stats->stakind[k] == 0 || + (stats->stakind[k] == kind && stats->staop[k] == oprid)) + break; + } + + if (k >= STATISTIC_NUM_SLOTS) + { + /* No empty slots */ + break; + } + + /* + * If it is an existing slot which has numbers or values + * continue to the next set. If slot exists but without + * numbers and values, try to acquire them now + */ + if (stats->stakind[k] != 0 && (stats->numnumbers[k] > 0 || + stats->numvalues[k] > 0)) + { + colnum += 2; /* skip numbers and values */ + continue; + } + + /* + * Initialize slot + */ + stats->stakind[k] = kind; + stats->staop[k] = oprid; + stats->numnumbers[k] = 0; + stats->stanumbers[k] = NULL; + stats->numvalues[k] = 0; + stats->stavalues[k] = NULL; + stats->statypid[k] = InvalidOid; + stats->statyplen[k] = -1; + stats->statypalign[k] = 'i'; + stats->statypbyval[k] = true; + } + + + /* get numbers */ + value = slot_getattr(result, colnum++, &isnull); /* numbers */ + if (!isnull) + { + ArrayType *arry = DatumGetArrayTypeP(value); + + /* + * We expect the array to be a 1-D float4 array; verify that. We don't + * need to use deconstruct_array() since the array data is just going + * to look like a C array of float4 values. + */ + nnumbers = ARR_DIMS(arry)[0]; + if (ARR_NDIM(arry) != 1 || nnumbers <= 0 || + ARR_HASNULL(arry) || + ARR_ELEMTYPE(arry) != FLOAT4OID) + elog(ERROR, "stanumbers is not a 1-D float4 array"); + numbers = (float4 *) palloc(nnumbers * sizeof(float4)); + memcpy(numbers, ARR_DATA_PTR(arry), + nnumbers * sizeof(float4)); + + /* + * Free arry if it's a detoasted copy. + */ + if ((Pointer) arry != DatumGetPointer(value)) + pfree(arry); + + stats->numnumbers[k] = nnumbers; + stats->stanumbers[k] = numbers; + } + /* get values */ + value = slot_getattr(result, colnum++, &isnull); /* values */ + if (!isnull) + { + int j; + ArrayType *arry; + int16 elmlen; + bool elmbyval; + char elmalign; + arry = DatumGetArrayTypeP(value); + /* We could cache this data, but not clear it's worth it */ + get_typlenbyvalalign(ARR_ELEMTYPE(arry), + &elmlen, &elmbyval, &elmalign); + /* Deconstruct array into Datum elements; NULLs not expected */ + deconstruct_array(arry, + ARR_ELEMTYPE(arry), + elmlen, elmbyval, elmalign, + &values, NULL, &nvalues); + + /* + * If the element type is pass-by-reference, we now have a bunch of + * Datums that are pointers into the syscache value. Copy them to + * avoid problems if syscache decides to drop the entry. + */ + if (!elmbyval) + { + for (j = 0; j < nvalues; j++) + values[j] = datumCopy(values[j], elmbyval, elmlen); + } + + /* + * Free statarray if it's a detoasted copy. 
+ */ + if ((Pointer) arry != DatumGetPointer(value)) + pfree(arry); + + stats->numvalues[k] = nvalues; + stats->stavalues[k] = values; + /* store details about values data type */ + stats->statypid[k] = ARR_ELEMTYPE(arry); + stats->statyplen[k] = elmlen; + stats->statypalign[k] = elmalign; + stats->statypbyval[k] = elmbyval; + } + } + } + + /* fetch next */ + result = ExecRemoteQuery((PlanState *) node); + } + ExecEndRemoteQuery(node); + + /* for (i = 0; i < attr_cnt; i++) */ + /* { */ + /* VacAttrStats *stats = vacattrstats[i]; */ + + /* if (numnodes[i] > 0) */ + /* { */ + /* stats->stanullfrac /= numnodes[i]; */ + /* stats->stawidth /= numnodes[i]; */ + /* stats->stadistinct /= numnodes[i]; */ + /* } */ + /* } */ + update_attstats(RelationGetRelid(onerel), + inh, + attr_cnt, + vacattrstats, + RelationGetRelPersistence(onerel)); + vac_update_relstats(onerel, + relpages, + reltuples, + relallvisible, + relhasindex, + InvalidTransactionId, + InvalidMultiXactId, + false); +} + +static void +analyze_rel_sync(Relation onerel, bool inh, int attr_cnt, + VacAttrStats **vacattrstats, int nindexes, + Relation *indexes, AnlIndexData *indexdata, AnalyzeSyncOpt *syncOpt) +{ + + int i; + /* collect and fit simple statistics (pg_statistic) for the relation */ + coord_collect_stats(onerel, inh, attr_cnt, vacattrstats, syncOpt); + + /* collect and fit simple statistics (pg_statistic) for all indexes */ + for (i = 0; i < nindexes; i++) + coord_collect_stats(indexes[i], + false, + indexdata[i].attr_cnt, + indexdata[i].vacattrstats, syncOpt); + + /* extended statistics (pg_statistic) for the relation */ + /* coord_collect_extended_stats(onerel, attr_cnt); */ +} diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c index 93c91f68..f001e056 100644 --- a/src/backend/commands/vacuum.c +++ b/src/backend/commands/vacuum.c @@ -145,7 +145,7 @@ ExecVacuum(VacuumStmt *vacstmt, bool isTopLevel) /* Now go through the common routine */ vacuum(vacstmt->options, vacstmt->relation, InvalidOid, ¶ms, - vacstmt->va_cols, NULL, isTopLevel); + vacstmt->va_cols, NULL, isTopLevel, vacstmt->sync_option); } /* @@ -171,9 +171,15 @@ ExecVacuum(VacuumStmt *vacstmt, bool isTopLevel) * memory context that will not disappear at transaction commit. 
*/ void -vacuum(int options, RangeVar *relation, Oid relid, VacuumParams *params, - List *va_cols, BufferAccessStrategy bstrategy, bool isTopLevel) -{// #lizard forgives +vacuum(int options, + RangeVar *relation, + Oid relid, + VacuumParams *params, + List *va_cols, + BufferAccessStrategy bstrategy, + bool isTopLevel, + AnalyzeSyncOpt *syncOpt) +{ const char *stmttype; volatile bool in_outer_xact, use_own_xacts; @@ -344,7 +350,7 @@ vacuum(int options, RangeVar *relation, Oid relid, VacuumParams *params, } analyze_rel(relid, relation, options, params, - va_cols, in_outer_xact, vac_strategy); + va_cols, in_outer_xact, vac_strategy, syncOpt); if (use_own_xacts) { diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c index 702eec38..10a1d424 100644 --- a/src/backend/nodes/copyfuncs.c +++ b/src/backend/nodes/copyfuncs.c @@ -4144,6 +4144,18 @@ _copyVacuumStmt(const VacuumStmt *from) COPY_SCALAR_FIELD(options); COPY_NODE_FIELD(relation); COPY_NODE_FIELD(va_cols); + COPY_NODE_FIELD(sync_option); + + return newnode; +} + +static AnalyzeSyncOpt * +_copyAnalyzeSyncOpt(const AnalyzeSyncOpt *from) +{ + AnalyzeSyncOpt *newnode = makeNode(AnalyzeSyncOpt); + + COPY_SCALAR_FIELD(is_sync_from); + COPY_NODE_FIELD(nodes); return newnode; } @@ -5914,6 +5926,9 @@ copyObjectImpl(const void *from) case T_VacuumStmt: retval = _copyVacuumStmt(from); break; + case T_AnalyzeSyncOpt: + retval = _copyAnalyzeSyncOpt(from); + break; #ifdef _SHARDING_ case T_VacuumShardStmt: retval = _copyVacuumShardStmt(from); diff --git a/src/backend/nodes/equalfuncs.c b/src/backend/nodes/equalfuncs.c index 7bbe8255..87934c2d 100644 --- a/src/backend/nodes/equalfuncs.c +++ b/src/backend/nodes/equalfuncs.c @@ -1694,6 +1694,16 @@ _equalVacuumStmt(const VacuumStmt *a, const VacuumStmt *b) COMPARE_SCALAR_FIELD(options); COMPARE_NODE_FIELD(relation); COMPARE_NODE_FIELD(va_cols); + COMPARE_NODE_FIELD(sync_option); + + return true; +} + +static bool +_equalAnalyzeSyncOpt(const AnalyzeSyncOpt *a, const AnalyzeSyncOpt *b) +{ + COMPARE_SCALAR_FIELD(is_sync_from); + COMPARE_NODE_FIELD(nodes); return true; } @@ -3592,6 +3602,9 @@ equal(const void *a, const void *b) case T_VacuumStmt: retval = _equalVacuumStmt(a, b); break; + case T_AnalyzeSyncOpt: + retval = _equalAnalyzeSyncOpt(a, b); + break; #ifdef _SHARDING_ case T_VacuumShardStmt: retval = _equalVacuumShardStmt(a, b); diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y index 2f34a131..7efd70e5 100644 --- a/src/backend/parser/gram.y +++ b/src/backend/parser/gram.y @@ -263,6 +263,7 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); RoleSpec *rolespec; PartitionForExpr *partfor; PartitionBy *partby; + AnalyzeSyncOpt *analyze_sync_opt; } %type stmt schema_stmt @@ -634,6 +635,8 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); %type audit_stmt audit_obj_type opt_when_success_or_not success_or_not /* __AUDIT__ END */ +/* AYALYZE */ +%type analyze_sync_option /* * Non-keyword token types. These are hard-wired into the "flex" lexer. 
* They must be listed first so that their numeric codes do not depend on @@ -730,7 +733,7 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); SERIALIZABLE SERVER SESSION SESSION_USER SESSIONTIMEZONE SET SETS SETOF SHARDING SHARE SHOW SIMILAR SIMPLE SKIP SLOT SMALLINT SNAPSHOT SOME SQL_P STABLE STANDALONE_P START STATEMENT STATISTICS STDIN STDOUT STEP STORAGE STRICT_P STRIP_P - SUBSCRIPTION SUBSTRING SUCCESSFUL SYMMETRIC SYSDATE SYSID SYSTEM_P SYSTIMESTAMP + SUBSCRIPTION SUBSTRING SUCCESSFUL SYMMETRIC SYNC SYSDATE SYSID SYSTEM_P SYSTIMESTAMP TABLE TABLES TABLESAMPLE TABLESPACE TBASE_P TEMP TEMPLATE TEMPORARY TEXT_P THEN TIME TIMESTAMP TO TRAILING TRANSACTION TRANSFORM TREAT TRIGGER TRIM TRUE_P @@ -11060,7 +11063,7 @@ vacuum_option_elem: ; AnalyzeStmt: - analyze_keyword opt_verbose + analyze_keyword opt_verbose analyze_sync_option { VacuumStmt *n = makeNode(VacuumStmt); n->options = VACOPT_ANALYZE; @@ -11068,9 +11071,10 @@ AnalyzeStmt: n->options |= VACOPT_VERBOSE; n->relation = NULL; n->va_cols = NIL; + n->sync_option = $3; $$ = (Node *)n; } - | analyze_keyword opt_verbose qualified_name opt_name_list + | analyze_keyword opt_verbose qualified_name opt_name_list analyze_sync_option { VacuumStmt *n = makeNode(VacuumStmt); n->options = VACOPT_ANALYZE; @@ -11078,22 +11082,25 @@ AnalyzeStmt: n->options |= VACOPT_VERBOSE; n->relation = $3; n->va_cols = $4; + n->sync_option = $5; $$ = (Node *)n; } - | analyze_keyword '(' analyze_option_list ')' + | analyze_keyword '(' analyze_option_list ')' analyze_sync_option { VacuumStmt *n = makeNode(VacuumStmt); n->options = VACOPT_ANALYZE | $3; n->relation = NULL; n->va_cols = NIL; + n->sync_option = $5; $$ = (Node *)n; } - | analyze_keyword '(' analyze_option_list ')' qualified_name opt_name_list + | analyze_keyword '(' analyze_option_list ')' qualified_name opt_name_list analyze_sync_option { VacuumStmt *n = makeNode(VacuumStmt); n->options = VACOPT_ANALYZE | $3; n->relation = $5; n->va_cols = $6; + n->sync_option = $7; $$ = (Node *)n; } ; @@ -11103,6 +11110,31 @@ analyze_keyword: | ANALYSE /* British */ {} ; +analyze_sync_option : +/* SYNC + { + AnalyzeSyncOpt *n = makeNode(AnalyzeSyncOpt); + n->is_sync_from = false; + n->nodes = NIL; + $$ = (Node *)n; + } + |*/ SYNC TO pgxcnode_list + { + AnalyzeSyncOpt *n = makeNode(AnalyzeSyncOpt); + n->is_sync_from = false; + n->nodes = $3; + $$ = n; + } + | SYNC FROM pgxcnode_list + { + AnalyzeSyncOpt *n = makeNode(AnalyzeSyncOpt); + n->is_sync_from = true; + n->nodes = $3; + $$ = n; + } + | /*EMPTY*/ { $$ = NULL; } + ; + opt_verbose: VERBOSE { $$ = TRUE; } | /*EMPTY*/ { $$ = FALSE; } @@ -16883,6 +16915,7 @@ unreserved_keyword: | STRICT_P | STRIP_P | SUBSCRIPTION + | SYNC | SYSID | SYSTEM_P | TABLES diff --git a/src/backend/postmaster/autovacuum.c b/src/backend/postmaster/autovacuum.c index f05013fa..f7f9904e 100644 --- a/src/backend/postmaster/autovacuum.c +++ b/src/backend/postmaster/autovacuum.c @@ -3203,8 +3203,9 @@ autovacuum_do_vac_analyze(autovac_table *tab, BufferAccessStrategy bstrategy) /* Let pgstat know what we're doing */ autovac_report_activity(tab); + /* no need sync for auto vacuum and/or analyze*/ vacuum(tab->at_vacoptions, &rangevar, tab->at_relid, &tab->at_params, NIL, - bstrategy, true); + bstrategy, true, NULL); } /* diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c index 799b83a2..6521598b 100644 --- a/src/backend/tcop/utility.c +++ b/src/backend/tcop/utility.c @@ -711,8 +711,47 @@ ProcessUtilityPre(PlannedStmt *pstmt, VacuumStmt *stmt = 
(VacuumStmt *) parsetree; /* we choose to allow this during "read only" transactions */ - PreventCommandDuringRecovery((stmt->options & VACOPT_VACUUM) ? - "VACUUM" : "ANALYZE"); + PreventCommandDuringRecovery((stmt->options & VACOPT_VACUUM) ? "VACUUM" + : "ANALYZE"); + /* When statement is emit by the coordinating node, the statement is not + * rewritten, we adapt it here */ + if (IsConnFromCoord() && IS_PGXC_COORDINATOR && + (stmt->options & VACOPT_ANALYZE) && stmt->sync_option) + { + stmt->sync_option->is_sync_from = true; + list_free_deep(stmt->sync_option->nodes); + stmt->sync_option->nodes = NIL; + stmt->sync_option->nodes = list_make1(makeString(parentPGXCNode)); + } + if (!IsConnFromCoord() && IS_PGXC_COORDINATOR && stmt->sync_option && + stmt->sync_option->nodes != NIL) + { + const ListCell *cell; + char node_type = PGXC_NODE_COORDINATOR; + foreach (cell, stmt->sync_option->nodes) + { + if (0 == strcmp(strVal(lfirst(cell)), PGXCNodeName)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("Can not sync to/from local!"))); + + PGXCNodeGetNodeIdFromName(strVal(lfirst(cell)), &node_type); + if (node_type == PGXC_NODE_NONE) + { + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("can not find coordinator %s!", + strVal(lfirst(cell))))); + } + if (node_type != PGXC_NODE_COORDINATOR) + { + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("node %s is not coordinator!", + strVal(lfirst(cell))))); + } + } + } /* * We have to run the command on nodes before Coordinator because * vacuum() pops active snapshot and we can not send it to nodes @@ -1275,6 +1314,7 @@ ProcessUtilityPost(PlannedStmt *pstmt, bool auto_commit = false; bool add_context = false; RemoteQueryExecType exec_type = EXEC_ON_NONE; + ExecNodes *exec_nodes = NULL; /* * auto_commit and is_temp is initialised to false and changed if required. 
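
For illustration, the extended syntax wired up by the grammar and ProcessUtilityPre hunks above can be exercised as sketched below; coordinator names cn1/cn2 and table t1 are hypothetical, used only to show the intended flow.

-- On coordinator cn1 (hypothetical name): analyze t1 locally, then have the
-- resulting statistics propagated to coordinator cn2.
ANALYZE t1 SYNC TO cn2;

-- When cn2 receives the statement forwarded from cn1, the ProcessUtilityPre
-- hunk above rewrites sync_option into the pull form, i.e. as if the client
-- had issued on cn2:
ANALYZE t1 SYNC FROM cn1;
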
@@ -1313,7 +1353,122 @@ ProcessUtilityPost(PlannedStmt *pstmt, case T_NotifyStmt: case T_ListenStmt: case T_UnlistenStmt: + break; case T_VacuumStmt: + { + VacuumStmt *vstmt = (VacuumStmt *)parsetree; + if (!IsConnFromCoord() && IS_PGXC_COORDINATOR && + (vstmt->options & VACOPT_ANALYZE) && vstmt->sync_option) + { + exec_type = EXEC_ON_COORDS; + if (vstmt->sync_option->nodes) + { + ListCell *lc; + int nodeIdx; + exec_nodes = (ExecNodes *)makeNode(ExecNodes); + exec_nodes->accesstype = RELATION_ACCESS_INSERT; + exec_nodes->baselocatortype = LOCATOR_TYPE_SHARD; /* not used */ + exec_nodes->en_expr = NULL; + exec_nodes->en_relid = InvalidOid; + exec_nodes->primarynodelist = NIL; + + foreach (lc, vstmt->sync_option->nodes) + { + char node_type = PGXC_NODE_COORDINATOR; + nodeIdx = + PGXCNodeGetNodeIdFromName(strVal(lfirst(lc)), &node_type); + /* Assert(nodeIdx > 0 && nodeIdx < NumDataNodes); */ + /* if(node_type != PGXC_NODE_COORDINATOR){ */ + /* ereport(ERROR, */ + /* (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), */ + /* errmsg("node %s is not coordinator!", + * strVal(lfirst(lc))))); */ + /* } */ + /* already check/rewrite in pre, just add it */ + exec_nodes->nodeList = lappend_int(exec_nodes->nodeList, nodeIdx); + } + } + PopActiveSnapshot(); + CommitTransactionCommand(); + StartTransactionCommand(); + } + /* if (vstmt->options & VACOPT_ANALYZE && vstmt->sync_option != NULL && */ + /* vstmt->sync_option->is_sync_from != true) */ + /* { */ + /* StringInfo queryStr = makeStringInfo(); */ + /* appendStringInfo(queryStr, "ANALYZE (COORDINATOR"); */ + /* if (vstmt->options & VACOPT_VERBOSE) */ + /* { */ + /* appendStringInfoString(queryStr, " ,VERBOSE"); */ + /* } */ + /* appendStringInfoChar(queryStr, ')'); */ + /* if (vstmt->relation) */ + /* appendStringInfo(queryStr, " %s", RangeVarGetName(vstmt->relation)); + */ + /* if (vstmt->va_cols) */ + /* { */ + /* ListCell *lc; */ + /* bool comma = false; */ + /* appendStringInfoString(queryStr, " ("); */ + /* foreach (lc, vstmt->va_cols) */ + /* { */ + /* if (comma) */ + /* comma = true; */ + /* else */ + /* appendStringInfoChar(queryStr, ','); */ + /* appendStringInfoString(queryStr, strVal(lfirst(lc))); */ + /* } */ + /* appendStringInfoChar(queryStr, ')'); */ + /* } */ + + /* appendStringInfo(queryStr, " SYNC FROM %s", PGXCNodeName); */ + /* PopActiveSnapshot(); */ + /* CommitTransactionCommand(); */ + /* StartTransactionCommand(); */ + /* if (vstmt->sync_option->nodes) */ + /* { */ + /* ExecNodes *execnodes; */ + /* ListCell *lc; */ + /* int nodeIdx; */ + /* execnodes = (ExecNodes *)makeNode(ExecNodes); */ + /* execnodes->accesstype = RELATION_ACCESS_INSERT; */ + /* execnodes->baselocatortype = LOCATOR_TYPE_SHARD; /\* not used *\/ */ + /* execnodes->en_expr = NULL; */ + /* execnodes->en_relid = InvalidOid; */ + /* execnodes->primarynodelist = NIL; */ + + /* foreach(lc, vstmt->sync_option->nodes){ */ + /* char node_type = PGXC_NODE_COORDINATOR; */ + /* nodeIdx = */ + /* PGXCNodeGetNodeIdFromName(strVal(lfirst(lc)), &node_type); */ + /* Assert(nodeIdx > 0 && nodeIdx < NumDataNodes); */ + /* execnodes->nodeList = lappend_int(execnodes->nodeList, nodeIdx); + */ + /* } */ + /* ExecUtilityStmtOnNodes(parsetree, */ + /* queryStr->data, */ + /* execnodes, */ + /* sentToRemote, */ + /* false, */ + /* EXEC_ON_COORDS, */ + /* false, */ + /* false); */ + /* list_free(execnodes->nodeList); */ + /* } */ + /* else */ + /* ExecUtilityStmtOnNodes(parsetree, */ + /* queryStr->data, */ + /* NULL, */ + /* sentToRemote, */ + /* auto_commit, */ + /* EXEC_ON_COORDS, */ + /* 
false, */ + /* false); */ + /* pfree(queryStr->data); */ + /* pfree(queryStr); */ + /* } */ + break; + } #ifdef _SHARDING_ case T_VacuumShardStmt: #endif @@ -1797,7 +1952,7 @@ ProcessUtilityPost(PlannedStmt *pstmt, if (IS_PGXC_LOCAL_COORDINATOR) { - ExecUtilityStmtOnNodes(parsetree, queryString, NULL, sentToRemote, auto_commit, + ExecUtilityStmtOnNodes(parsetree, queryString, exec_nodes, sentToRemote, auto_commit, exec_type, is_temp, add_context); if (IsA(parsetree, IndexStmt) && diff --git a/src/include/commands/vacuum.h b/src/include/commands/vacuum.h index cd79ba61..9da04880 100644 --- a/src/include/commands/vacuum.h +++ b/src/include/commands/vacuum.h @@ -288,9 +288,14 @@ typedef struct /* in commands/vacuum.c */ extern void ExecVacuum(VacuumStmt *vacstmt, bool isTopLevel); -extern void vacuum(int options, RangeVar *relation, Oid relid, - VacuumParams *params, List *va_cols, - BufferAccessStrategy bstrategy, bool isTopLevel); +extern void vacuum(int options, + RangeVar *relation, + Oid relid, + VacuumParams *params, + List *va_cols, + BufferAccessStrategy bstrategy, + bool isTopLevel, + AnalyzeSyncOpt *syncOpt); extern void vac_open_indexes(Relation relation, LOCKMODE lockmode, int *nindexes, Relation **Irel); extern void vac_close_indexes(int nindexes, Relation *Irel, LOCKMODE lockmode); @@ -338,9 +343,14 @@ extern void ExecVacuumShard(VacuumShardStmt *stmt); #endif /* in commands/analyze.c */ -extern void analyze_rel(Oid relid, RangeVar *relation, int options, - VacuumParams *params, List *va_cols, bool in_outer_xact, - BufferAccessStrategy bstrategy); +extern void analyze_rel(Oid relid, + RangeVar *relation, + int options, + VacuumParams *params, + List *va_cols, + bool in_outer_xact, + BufferAccessStrategy bstrategy, + AnalyzeSyncOpt *syncOpt); extern bool std_typanalyze(VacAttrStats *stats); /* in utils/misc/sampling.c --- duplicate of declarations in utils/sampling.h */ diff --git a/src/include/nodes/nodes.h b/src/include/nodes/nodes.h index 2f585807..227af23f 100644 --- a/src/include/nodes/nodes.h +++ b/src/include/nodes/nodes.h @@ -586,6 +586,7 @@ typedef enum NodeTag #ifdef _MLS_ ,T_SyncBufIdInfo /* in bufmgr.c*/ #endif + ,T_AnalyzeSyncOpt } NodeTag; /* diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h index 01ab8277..4dc323ab 100644 --- a/src/include/nodes/parsenodes.h +++ b/src/include/nodes/parsenodes.h @@ -3301,12 +3301,21 @@ typedef enum VacuumOption */ } VacuumOption; +typedef struct AnalyzeSyncOpt +{ + NodeTag type; + bool is_sync_from; /* false: sync to other CN node; true: sync from node identified by + node_name */ + List *nodes; /* node list for sync to/from */ +} AnalyzeSyncOpt; + typedef struct VacuumStmt { NodeTag type; int options; /* OR of VacuumOption flags */ RangeVar *relation; /* single table to process, or NULL */ List *va_cols; /* list of column names, or NIL for all */ + AnalyzeSyncOpt *sync_option; /* Sync statistics to/from other nodes, or NULL */ } VacuumStmt; #ifdef _SHARDING_ diff --git a/src/include/parser/kwlist.h b/src/include/parser/kwlist.h index dc44c414..d77a2e68 100644 --- a/src/include/parser/kwlist.h +++ b/src/include/parser/kwlist.h @@ -477,6 +477,7 @@ PG_KEYWORD("substring", SUBSTRING, COL_NAME_KEYWORD) PG_KEYWORD("successful", SUCCESSFUL, RESERVED_KEYWORD) #endif PG_KEYWORD("symmetric", SYMMETRIC, RESERVED_KEYWORD) +PG_KEYWORD("sync", SYNC, UNRESERVED_KEYWORD) #ifdef _PG_ORCL_ PG_KEYWORD("sysdate", SYSDATE, RESERVED_KEYWORD) #endif From 69079e0bd7e1b902930776c460a824b33abfef57 Mon Sep 17 00:00:00 2001 From: 
aslanxli Date: Mon, 7 Mar 2022 16:29:55 +0800 Subject: [PATCH 529/578] Added the feature: statistics synchronization.The ANALYZE syntax has been extended cherry pick: 9510c58d e47d6f98 808f8a6b 8a5348ac 378af856 60882fef --- src/backend/commands/analyze.c | 586 ++++++++++++++++++++------------- src/backend/parser/gram.y | 10 +- src/backend/tcop/utility.c | 113 +------ 3 files changed, 371 insertions(+), 338 deletions(-) diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c index f0f92a5b..d558c5a3 100644 --- a/src/backend/commands/analyze.c +++ b/src/backend/commands/analyze.c @@ -5349,64 +5349,174 @@ acquire_coordinator_sample_rows(Relation onerel, int elevel, #endif +static RemoteQuery * +init_sync_remotequery(AnalyzeSyncOpt *syncOpt, char **cnname) +{ + RemoteQuery *step; + ListCell *lc; + int nodeIdx; + ExecNodes *execnodes = (ExecNodes *)makeNode(ExecNodes); + char node_type = PGXC_NODE_COORDINATOR; + execnodes->accesstype = RELATION_ACCESS_READ; + execnodes->baselocatortype = LOCATOR_TYPE_SHARD; /* not used */ + execnodes->en_expr = NULL; + execnodes->en_relid = InvalidOid; + execnodes->primarynodelist = NIL; + + lc = list_head(syncOpt->nodes); + *cnname = strVal(lfirst(lc)); + nodeIdx = PGXCNodeGetNodeIdFromName(*cnname, &node_type); + Assert(node_type == PGXC_NODE_COORDINATOR); + execnodes->nodeList = lappend_int(execnodes->nodeList, nodeIdx); + + step = makeNode(RemoteQuery); + step->combine_type = COMBINE_TYPE_NONE; + step->exec_nodes = execnodes; + step->exec_type = EXEC_ON_COORDS; + return step; +} /* - * coord_collect_simple_stats - * Collect simple stats for a relation (pg_statistic contents). - * - * Collects statistics from the datanodes, and then keeps the one of the - * received statistics for each attribute (the first one we receive, but - * it's mostly random). - * - * XXX We do not try to build statistics covering data fro all the nodes, - * either by collecting fresh sample of rows or merging the statistics - * somehow. The current approach is very simple and cheap, but may have - * negative impact on estimate accuracy as the stats only covers data - * from a single node, and we may end up with stats from different node - * for each attribute. + * coord_sync_rel_stats + * sync relation stats from the coordinator node specified by syncOpt. */ static void -coord_collect_stats(Relation onerel, bool inh, int attr_cnt, - VacAttrStats **vacattrstats, AnalyzeSyncOpt *syncOpt) +coord_sync_rel_stats(Relation onerel, AnalyzeSyncOpt *syncOpt) { char *nspname; char *relname; - /* Fields to run query to read statistics from data nodes */ + char *cnname; + /* Fields to run query to read statistics from coordinator nodes */ StringInfoData query; EState *estate; MemoryContext oldcontext; RemoteQuery *step; RemoteQueryState *node; TupleTableSlot *result; - int i; - /* Number of data nodes from which attribute statistics are received. 
*/ - int *numnodes; int reltuples; int relpages; int relallvisible; bool relhasindex; - ListCell *lc; - int nodeIdx; - ExecNodes *execnodes = (ExecNodes *)makeNode(ExecNodes); /* Get the relation identifier */ relname = RelationGetRelationName(onerel); nspname = get_namespace_name(RelationGetNamespace(onerel)); /* Make up query string */ initStringInfo(&query); - /* Generic statistic fields */ - appendStringInfoString(&query, - "SELECT s.staattnum, " + appendStringInfo(&query, + "SELECT " "c.reltuples, " "c.relpages," "c.relallvisible," - "c.relhasindex," + "c.relhasindex" + " FROM pg_class c JOIN pg_namespace nc on c.relnamespace = " + "nc.oid WHERE nc.nspname = '%s' and c.relname = '%s'", + nspname, + relname); + + /* Build up RemoteQuery */ + step = init_sync_remotequery(syncOpt, &cnname); + step->sql_statement = query.data; + + /* Add targetlist entries */ + step->scan.plan.targetlist = + lappend(step->scan.plan.targetlist, + make_relation_tle(RelationRelationId, "pg_class", "reltuples")); + step->scan.plan.targetlist = + lappend(step->scan.plan.targetlist, + make_relation_tle(RelationRelationId, "pg_class", "relpages")); + step->scan.plan.targetlist = + lappend(step->scan.plan.targetlist, + make_relation_tle(RelationRelationId, "pg_class", "relallvisible")); + step->scan.plan.targetlist = + lappend(step->scan.plan.targetlist, + make_relation_tle(RelationRelationId, "pg_class", "relhasindex")); + /* Execute query on the data nodes */ + estate = CreateExecutorState(); + + oldcontext = MemoryContextSwitchTo(estate->es_query_cxt); + node = ExecInitRemoteQuery(step, estate, 0); + MemoryContextSwitchTo(oldcontext); + + result = ExecRemoteQuery((PlanState *)node); + if (result != NULL && !TupIsNull(result)) + { + Datum value; + bool isnull; + int colnum = 1; + + /* Process statistics */ + value = slot_getattr(result, colnum++, &isnull); /* reltuple */ + reltuples = DatumGetFloat4(value); + + value = slot_getattr(result, colnum++, &isnull); /* relpages */ + relpages = DatumGetInt32(value); + + value = slot_getattr(result, colnum++, &isnull); /* relallvisible */ + relallvisible = DatumGetInt32(value); + + value = slot_getattr(result, colnum++, &isnull); /* relhasindex */ + relhasindex = DatumGetBool(value); + + vac_update_relstats(onerel, + relpages, + reltuples, + relallvisible, + relhasindex, + InvalidTransactionId, + InvalidMultiXactId, + false); + } + else + { + ereport(WARNING, + (errcode(ERRCODE_UNDEFINED_TABLE), + errmsg("Relation \"%s\" does not exist in coordinator %s", + relname, + cnname))); + } + ExecEndRemoteQuery(node); +} + +/* + * coord_sync_col_stats + * sync column stats from the coordinator node specified by syncOpt. 
+ */ +static void +coord_sync_col_stats(Relation onerel, + bool inh, + int attr_cnt, + VacAttrStats **vacattrstats, + AnalyzeSyncOpt *syncOpt) +{ + char *nspname; + char *relname; + char *cnname; + /* Fields to run query to read statistics from coordinator nodes */ + StringInfoData query; + EState *estate; + MemoryContext oldcontext; + RemoteQuery *step; + RemoteQueryState *node; + TupleTableSlot *result; + int i; + + /* Get the relation identifier */ + relname = RelationGetRelationName(onerel); + nspname = get_namespace_name(RelationGetNamespace(onerel)); + + /* Make up query string */ + initStringInfo(&query); + /* Generic statistic fields */ + appendStringInfoString(&query, + "SELECT s.staattnum, " "s.stanullfrac, " "s.stawidth, " "s.stadistinct"); /* Detailed statistic slots */ for (i = 1; i <= STATISTIC_NUM_SLOTS; i++) - appendStringInfo(&query, ", s.stakind%d" + appendStringInfo(&query, + ", s.stakind%d" ", o%d.oprname" ", no%d.nspname" ", t%dl.typname" @@ -5418,13 +5528,15 @@ coord_collect_stats(Relation onerel, bool inh, int attr_cnt, i, i, i, i, i, i, i, i, i); /* Common part of FROM clause */ - appendStringInfoString(&query, " FROM pg_statistic s JOIN pg_class c " + appendStringInfoString(&query, + " FROM pg_statistic s JOIN pg_class c " " ON s.starelid = c.oid " "JOIN pg_namespace nc " " ON c.relnamespace = nc.oid "); /* Info about involved operations */ for (i = 1; i <= STATISTIC_NUM_SLOTS; i++) - appendStringInfo(&query, "LEFT JOIN (pg_operator o%d " + appendStringInfo(&query, + "LEFT JOIN (pg_operator o%d " " JOIN pg_namespace no%d " " ON o%d.oprnamespace = no%d.oid " " JOIN pg_type t%dl " @@ -5436,133 +5548,77 @@ coord_collect_stats(Relation onerel, bool inh, int attr_cnt, " JOIN pg_namespace nt%dr " " ON t%dr.typnamespace = nt%dr.oid) " " ON s.staop%d = o%d.oid ", - i, i, i, i, i, i, i, i, i, - i, i, i, i, i, i, i, i, i); - appendStringInfo(&query, "WHERE nc.nspname = '%s' " + i, i, i, i, i, i, i, i, i, i, i, i, i, i, i, i, i, i); + appendStringInfo(&query, + "WHERE nc.nspname = '%s' " "AND c.relname = '%s'", - nspname, relname); + nspname, + relname); /* Build up RemoteQuery */ - execnodes->accesstype = RELATION_ACCESS_READ; - execnodes->baselocatortype = LOCATOR_TYPE_SHARD; /* not used */ - execnodes->en_expr = NULL; - execnodes->en_relid = InvalidOid; - execnodes->primarynodelist = NIL; - - foreach (lc, syncOpt->nodes) - { - char node_type = PGXC_NODE_COORDINATOR; - nodeIdx = PGXCNodeGetNodeIdFromName(strVal(lfirst(lc)), &node_type); - execnodes->nodeList = lappend_int(execnodes->nodeList, nodeIdx); - } - step = makeNode(RemoteQuery); - step->combine_type = COMBINE_TYPE_NONE; - step->exec_nodes = execnodes; + step = init_sync_remotequery(syncOpt, &cnname); step->sql_statement = query.data; - step->force_autocommit = true; - step->exec_type = EXEC_ON_COORDS; /* Add targetlist entries */ - step->scan.plan.targetlist = lappend(step->scan.plan.targetlist, - make_relation_tle(StatisticRelationId, - "pg_statistic", - "staattnum")); - step->scan.plan.targetlist = lappend(step->scan.plan.targetlist, - make_relation_tle(RelationRelationId, - "pg_class", - "reltuples")); step->scan.plan.targetlist = lappend(step->scan.plan.targetlist, - make_relation_tle(RelationRelationId, "pg_class", "relpages")); + make_relation_tle(StatisticRelationId, "pg_statistic", "staattnum")); step->scan.plan.targetlist = lappend(step->scan.plan.targetlist, - make_relation_tle(RelationRelationId, "pg_class", "relallvisible")); + make_relation_tle(StatisticRelationId, "pg_statistic", "stanullfrac")); 
step->scan.plan.targetlist = lappend(step->scan.plan.targetlist, - make_relation_tle(RelationRelationId, "pg_class", "relhasindex")); - step->scan.plan.targetlist = lappend(step->scan.plan.targetlist, - make_relation_tle(StatisticRelationId, - "pg_statistic", - "stanullfrac")); - step->scan.plan.targetlist = lappend(step->scan.plan.targetlist, - make_relation_tle(StatisticRelationId, - "pg_statistic", - "stawidth")); - step->scan.plan.targetlist = lappend(step->scan.plan.targetlist, - make_relation_tle(StatisticRelationId, - "pg_statistic", - "stadistinct")); + make_relation_tle(StatisticRelationId, "pg_statistic", "stawidth")); + step->scan.plan.targetlist = + lappend(step->scan.plan.targetlist, + make_relation_tle(StatisticRelationId, "pg_statistic", "stadistinct")); for (i = 1; i <= STATISTIC_NUM_SLOTS; i++) { /* 16 characters would be enough */ char colname[16]; sprintf(colname, "stakind%d", i); - step->scan.plan.targetlist = lappend(step->scan.plan.targetlist, - make_relation_tle(StatisticRelationId, - "pg_statistic", - colname)); - - step->scan.plan.targetlist = lappend(step->scan.plan.targetlist, - make_relation_tle(OperatorRelationId, - "pg_operator", - "oprname")); - step->scan.plan.targetlist = lappend(step->scan.plan.targetlist, - make_relation_tle(NamespaceRelationId, - "pg_namespace", - "nspname")); - step->scan.plan.targetlist = lappend(step->scan.plan.targetlist, - make_relation_tle(TypeRelationId, - "pg_type", - "typname")); - step->scan.plan.targetlist = lappend(step->scan.plan.targetlist, - make_relation_tle(NamespaceRelationId, - "pg_namespace", - "nspname")); - step->scan.plan.targetlist = lappend(step->scan.plan.targetlist, - make_relation_tle(TypeRelationId, - "pg_type", - "typname")); - step->scan.plan.targetlist = lappend(step->scan.plan.targetlist, - make_relation_tle(NamespaceRelationId, - "pg_namespace", - "nspname")); + step->scan.plan.targetlist = + lappend(step->scan.plan.targetlist, + make_relation_tle(StatisticRelationId, "pg_statistic", colname)); + + step->scan.plan.targetlist = + lappend(step->scan.plan.targetlist, + make_relation_tle(OperatorRelationId, "pg_operator", "oprname")); + step->scan.plan.targetlist = + lappend(step->scan.plan.targetlist, + make_relation_tle(NamespaceRelationId, "pg_namespace", "nspname")); + step->scan.plan.targetlist = + lappend(step->scan.plan.targetlist, + make_relation_tle(TypeRelationId, "pg_type", "typname")); + step->scan.plan.targetlist = + lappend(step->scan.plan.targetlist, + make_relation_tle(NamespaceRelationId, "pg_namespace", "nspname")); + step->scan.plan.targetlist = + lappend(step->scan.plan.targetlist, + make_relation_tle(TypeRelationId, "pg_type", "typname")); + step->scan.plan.targetlist = + lappend(step->scan.plan.targetlist, + make_relation_tle(NamespaceRelationId, "pg_namespace", "nspname")); sprintf(colname, "stanumbers%d", i); - step->scan.plan.targetlist = lappend(step->scan.plan.targetlist, - make_relation_tle(StatisticRelationId, - "pg_statistic", - colname)); + step->scan.plan.targetlist = + lappend(step->scan.plan.targetlist, + make_relation_tle(StatisticRelationId, "pg_statistic", colname)); sprintf(colname, "stavalues%d", i); - step->scan.plan.targetlist = lappend(step->scan.plan.targetlist, - make_relation_tle(StatisticRelationId, - "pg_statistic", - colname)); + step->scan.plan.targetlist = + lappend(step->scan.plan.targetlist, + make_relation_tle(StatisticRelationId, "pg_statistic", colname)); } /* Execute query on the data nodes */ estate = CreateExecutorState(); oldcontext = 
MemoryContextSwitchTo(estate->es_query_cxt); - - /* - * Take a fresh snapshot so that we see the effects of the ANALYZE command - * on the datanode. That command is run in auto-commit mode hence just - * bumping up the command ID is not good enough - */ - /* PushActiveSnapshot(GetLocalTransactionSnapshot()); */ - estate->es_snapshot = GetActiveSnapshot(); - node = ExecInitRemoteQuery(step, estate, 0); MemoryContextSwitchTo(oldcontext); - /* get ready to combine results */ - numnodes = (int *) palloc(attr_cnt * sizeof(int)); - for (i = 0; i < attr_cnt; i++) - numnodes[i] = 0; - result = ExecRemoteQuery((PlanState *) node); - /* PopActiveSnapshot(); */ while (result != NULL && !TupIsNull(result)) { Datum value; @@ -5574,7 +5630,6 @@ coord_collect_stats(Relation onerel, bool inh, int attr_cnt, float4 distinct; VacAttrStats *stats = NULL; - /* Process statistics from the data node */ value = slot_getattr(result, colnum++, &isnull); /* staattnum */ attnum = DatumGetInt16(value); @@ -5583,22 +5638,9 @@ coord_collect_stats(Relation onerel, bool inh, int attr_cnt, { stats = vacattrstats[i]; stats->stats_valid = true; - numnodes[i]++; break; } - value = slot_getattr(result, colnum++, &isnull); /* reltuples */ - reltuples = DatumGetFloat4(value); - - value = slot_getattr(result, colnum++, &isnull); /* relpages */ - relpages = DatumGetInt32(value); - - value = slot_getattr(result, colnum++, &isnull); /* relallvisible */ - relallvisible = DatumGetInt32(value); - - value = slot_getattr(result, colnum++, &isnull); /* relhasindex */ - relhasindex = DatumGetBool(value); - if (stats) { value = slot_getattr(result, colnum++, &isnull); /* stanullfrac */ @@ -5614,13 +5656,12 @@ coord_collect_stats(Relation onerel, bool inh, int attr_cnt, stats->stadistinct = distinct; /* Detailed statistics */ - for (i = 1; i <= STATISTIC_NUM_SLOTS; i++) + for (i = 0; i < STATISTIC_NUM_SLOTS; i++) { int16 kind; float4 *numbers; Datum *values; int nnumbers, nvalues; - int k; value = slot_getattr(result, colnum++, &isnull); /* kind */ kind = DatumGetInt16(value); @@ -5655,10 +5696,8 @@ coord_collect_stats(Relation onerel, bool inh, int attr_cnt, char *oprname; char *oprnspname; Oid ltypid, rtypid; - char *ltypname, - *rtypname; - char *ltypnspname, - *rtypnspname; + char *ltypname, *rtypname; + char *ltypnspname, *rtypnspname; oprname = DatumGetCString(value); value = slot_getattr(result, colnum++, &isnull); /* oprnspname */ oprnspname = DatumGetCString(value); @@ -5667,68 +5706,37 @@ coord_collect_stats(Relation onerel, bool inh, int attr_cnt, ltypname = DatumGetCString(value); value = slot_getattr(result, colnum++, &isnull); /* typnspname */ ltypnspname = DatumGetCString(value); - ltypid = get_typname_typid(ltypname, - get_namespaceid(ltypnspname)); + ltypid = + get_typname_typid(ltypname, get_namespaceid(ltypnspname)); /* Get right operand data type */ value = slot_getattr(result, colnum++, &isnull); /* typname */ rtypname = DatumGetCString(value); value = slot_getattr(result, colnum++, &isnull); /* typnspname */ rtypnspname = DatumGetCString(value); - rtypid = get_typname_typid(rtypname, - get_namespaceid(rtypnspname)); + rtypid = + get_typname_typid(rtypname, get_namespaceid(rtypnspname)); /* lookup operator */ - oprid = get_operid(oprname, ltypid, rtypid, + oprid = get_operid(oprname, + ltypid, + rtypid, get_namespaceid(oprnspname)); } - /* - * Look up a statistics slot. 
If there is an entry of the - * same kind already, leave it, assuming the statistics - * is approximately the same on all nodes, so values from - * one node are representing entire relation well. - * If empty slot is found store values here. If no more - * slots skip remaining values. - */ - for (k = 0; k < STATISTIC_NUM_SLOTS; k++) - { - if (stats->stakind[k] == 0 || - (stats->stakind[k] == kind && stats->staop[k] == oprid)) - break; - } - - if (k >= STATISTIC_NUM_SLOTS) - { - /* No empty slots */ - break; - } - - /* - * If it is an existing slot which has numbers or values - * continue to the next set. If slot exists but without - * numbers and values, try to acquire them now - */ - if (stats->stakind[k] != 0 && (stats->numnumbers[k] > 0 || - stats->numvalues[k] > 0)) - { - colnum += 2; /* skip numbers and values */ - continue; - } /* * Initialize slot */ - stats->stakind[k] = kind; - stats->staop[k] = oprid; - stats->numnumbers[k] = 0; - stats->stanumbers[k] = NULL; - stats->numvalues[k] = 0; - stats->stavalues[k] = NULL; - stats->statypid[k] = InvalidOid; - stats->statyplen[k] = -1; - stats->statypalign[k] = 'i'; - stats->statypbyval[k] = true; + stats->stakind[i] = kind; + stats->staop[i] = oprid; + stats->numnumbers[i] = 0; + stats->stanumbers[i] = NULL; + stats->numvalues[i] = 0; + stats->stavalues[i] = NULL; + stats->statypid[i] = InvalidOid; + stats->statyplen[i] = -1; + stats->statypalign[i] = 'i'; + stats->statypbyval[i] = true; } - /* get numbers */ value = slot_getattr(result, colnum++, &isnull); /* numbers */ if (!isnull) @@ -5741,13 +5749,11 @@ coord_collect_stats(Relation onerel, bool inh, int attr_cnt, * to look like a C array of float4 values. */ nnumbers = ARR_DIMS(arry)[0]; - if (ARR_NDIM(arry) != 1 || nnumbers <= 0 || - ARR_HASNULL(arry) || + if (ARR_NDIM(arry) != 1 || nnumbers <= 0 || ARR_HASNULL(arry) || ARR_ELEMTYPE(arry) != FLOAT4OID) elog(ERROR, "stanumbers is not a 1-D float4 array"); numbers = (float4 *) palloc(nnumbers * sizeof(float4)); - memcpy(numbers, ARR_DATA_PTR(arry), - nnumbers * sizeof(float4)); + memcpy(numbers, ARR_DATA_PTR(arry), nnumbers * sizeof(float4)); /* * Free arry if it's a detoasted copy. 
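
For orientation while reading the remaining hunks of coord_sync_col_stats(): the tuples decoded here come from a catalog query built against the source coordinator. A simplified, hand-written equivalent is sketched below, assuming a placeholder schema public and relation my_table; only the first statistics slot is spelled out, and the operand-type joins the real query adds for each staopN are omitted.

-- Simplified sketch of the pg_statistic query assembled by
-- coord_sync_col_stats(); the real statement repeats the slot columns for
-- every STATISTIC_NUM_SLOTS.
SELECT s.staattnum,
       s.stanullfrac,
       s.stawidth,
       s.stadistinct,
       s.stakind1,
       o1.oprname,
       no1.nspname AS oprnspname,
       s.stanumbers1,
       s.stavalues1
  FROM pg_statistic s
  JOIN pg_class c      ON s.starelid = c.oid
  JOIN pg_namespace nc ON c.relnamespace = nc.oid
  LEFT JOIN (pg_operator o1
             JOIN pg_namespace no1 ON o1.oprnamespace = no1.oid)
         ON s.staop1 = o1.oid
 WHERE nc.nspname = 'public'      -- placeholder schema
   AND c.relname = 'my_table';    -- placeholder relation
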
@@ -5755,8 +5761,8 @@ coord_collect_stats(Relation onerel, bool inh, int attr_cnt, if ((Pointer) arry != DatumGetPointer(value)) pfree(arry); - stats->numnumbers[k] = nnumbers; - stats->stanumbers[k] = numbers; + stats->numnumbers[i] = nnumbers; + stats->stanumbers[i] = numbers; } /* get values */ value = slot_getattr(result, colnum++, &isnull); /* values */ @@ -5770,12 +5776,18 @@ coord_collect_stats(Relation onerel, bool inh, int attr_cnt, arry = DatumGetArrayTypeP(value); /* We could cache this data, but not clear it's worth it */ get_typlenbyvalalign(ARR_ELEMTYPE(arry), - &elmlen, &elmbyval, &elmalign); + &elmlen, + &elmbyval, + &elmalign); /* Deconstruct array into Datum elements; NULLs not expected */ deconstruct_array(arry, ARR_ELEMTYPE(arry), - elmlen, elmbyval, elmalign, - &values, NULL, &nvalues); + elmlen, + elmbyval, + elmalign, + &values, + NULL, + &nvalues); /* * If the element type is pass-by-reference, we now have a bunch of @@ -5794,13 +5806,13 @@ coord_collect_stats(Relation onerel, bool inh, int attr_cnt, if ((Pointer) arry != DatumGetPointer(value)) pfree(arry); - stats->numvalues[k] = nvalues; - stats->stavalues[k] = values; + stats->numvalues[i] = nvalues; + stats->stavalues[i] = values; /* store details about values data type */ - stats->statypid[k] = ARR_ELEMTYPE(arry); - stats->statyplen[k] = elmlen; - stats->statypalign[k] = elmalign; - stats->statypbyval[k] = elmbyval; + stats->statypid[i] = ARR_ELEMTYPE(arry); + stats->statyplen[i] = elmlen; + stats->statypalign[i] = elmalign; + stats->statypbyval[i] = elmbyval; } } } @@ -5810,49 +5822,151 @@ coord_collect_stats(Relation onerel, bool inh, int attr_cnt, } ExecEndRemoteQuery(node); - /* for (i = 0; i < attr_cnt; i++) */ - /* { */ - /* VacAttrStats *stats = vacattrstats[i]; */ - - /* if (numnodes[i] > 0) */ - /* { */ - /* stats->stanullfrac /= numnodes[i]; */ - /* stats->stawidth /= numnodes[i]; */ - /* stats->stadistinct /= numnodes[i]; */ - /* } */ - /* } */ update_attstats(RelationGetRelid(onerel), inh, attr_cnt, vacattrstats, RelationGetRelPersistence(onerel)); - vac_update_relstats(onerel, - relpages, - reltuples, - relallvisible, - relhasindex, - InvalidTransactionId, - InvalidMultiXactId, - false); } +/* + * coord_collect_extended_stats + * sync extended stats for a relation (pg_statistic_ext contents). + * + * Sync statistics from the coordinator node specified by syncOpt. + * + */ static void -analyze_rel_sync(Relation onerel, bool inh, int attr_cnt, - VacAttrStats **vacattrstats, int nindexes, - Relation *indexes, AnlIndexData *indexdata, AnalyzeSyncOpt *syncOpt) +coord_sync_extended_stats(Relation onerel, int attr_cnt, AnalyzeSyncOpt *syncOpt) { + char *nspname; + char *relname; + char *cnname; + /* Fields to run query to read statistics from data nodes */ + StringInfoData query; + EState *estate; + MemoryContext oldcontext; + RemoteQuery *step; + RemoteQueryState *node; + TupleTableSlot *result; + int i; + /* Number of data nodes from which attribute statistics are received. 
*/ + int *numnodes; + /* Get the relation identifier */ + relname = RelationGetRelationName(onerel); + nspname = get_namespace_name(RelationGetNamespace(onerel)); + + initStringInfo(&query); + + appendStringInfo(&query, + "SELECT ns.nspname, " + "stxname, " + "stxndistinct::bytea AS stxndistinct, " + "stxdependencies::bytea AS stxdependencies " + " FROM pg_statistic_ext s JOIN pg_class c " + " ON s.stxrelid = c.oid " + "JOIN pg_namespace nc " + " ON c.relnamespace = nc.oid " + "JOIN pg_namespace ns " + " ON s.stxnamespace = ns.oid " + "WHERE nc.nspname = '%s' AND c.relname = '%s'", + nspname, + relname); + + /* Build up RemoteQuery */ + step = init_sync_remotequery(syncOpt, &cnname); + step->sql_statement = query.data; + + /* Add targetlist entries */ + step->scan.plan.targetlist = + lappend(step->scan.plan.targetlist, + make_relation_tle(NamespaceRelationId, "pg_namespace", "nspname")); + + step->scan.plan.targetlist = + lappend(step->scan.plan.targetlist, + make_relation_tle(StatisticExtRelationId, "pg_statistic_ext", "stxname")); + + step->scan.plan.targetlist = lappend( + step->scan.plan.targetlist, + make_relation_tle(StatisticExtRelationId, "pg_statistic_ext", "stxndistinct")); + + step->scan.plan.targetlist = lappend( + step->scan.plan.targetlist, + make_relation_tle(StatisticExtRelationId, "pg_statistic_ext", "stxdependencies")); + + /* Execute query on the data nodes */ + estate = CreateExecutorState(); + oldcontext = MemoryContextSwitchTo(estate->es_query_cxt); + node = ExecInitRemoteQuery(step, estate, 0); + MemoryContextSwitchTo(oldcontext); + + /* get ready to combine results */ + numnodes = (int *)palloc(attr_cnt * sizeof(int)); + for (i = 0; i < attr_cnt; i++) + numnodes[i] = 0; + + result = ExecRemoteQuery((PlanState *)node); + + while (result != NULL && !TupIsNull(result)) + { + Datum value; + bool isnull; + Name nspname; + Name stxname; + bytea *stxndistinct = NULL; + bytea *stxdependencies = NULL; + + /* Process statistics from the data node */ + value = slot_getattr(result, 1, &isnull); /* nspname */ + nspname = DatumGetName(value); + + value = slot_getattr(result, 2, &isnull); /* stxname */ + stxname = DatumGetName(value); + + value = slot_getattr(result, 3, &isnull); /* stxndistinct */ + if (!isnull) + stxndistinct = DatumGetByteaP(value); + + value = slot_getattr(result, 4, &isnull); /* stxdependencies */ + if (!isnull) + stxdependencies = DatumGetByteaP(value); + + update_ext_stats(nspname, stxname, stxndistinct, stxdependencies); + + /* fetch stats from next node */ + result = ExecRemoteQuery((PlanState *)node); + } + ExecEndRemoteQuery(node); +} + +static void +analyze_rel_sync(Relation onerel, + bool inh, + int attr_cnt, + VacAttrStats **vacattrstats, + int nindexes, + Relation *indexes, + AnlIndexData *indexdata, + AnalyzeSyncOpt *syncOpt) +{ int i; - /* collect and fit simple statistics (pg_statistic) for the relation */ - coord_collect_stats(onerel, inh, attr_cnt, vacattrstats, syncOpt); + /* sync statistics for the relation */ + coord_sync_rel_stats(onerel, syncOpt); + /* sync column statistics (pg_statistic) for the relation */ + coord_sync_col_stats(onerel, inh, attr_cnt, vacattrstats, syncOpt); - /* collect and fit simple statistics (pg_statistic) for all indexes */ + /* sync simple statistics (pg_statistic) for all indexes */ for (i = 0; i < nindexes; i++) - coord_collect_stats(indexes[i], + { + coord_sync_rel_stats(indexes[i], syncOpt); + coord_sync_col_stats(indexes[i], false, indexdata[i].attr_cnt, - indexdata[i].vacattrstats, syncOpt); + 
indexdata[i].vacattrstats, + syncOpt); + } /* extended statistics (pg_statistic) for the relation */ - /* coord_collect_extended_stats(onerel, attr_cnt); */ + coord_sync_extended_stats(onerel, attr_cnt, syncOpt); } diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y index 7efd70e5..45aef61f 100644 --- a/src/backend/parser/gram.y +++ b/src/backend/parser/gram.y @@ -11111,25 +11111,25 @@ analyze_keyword: ; analyze_sync_option : -/* SYNC + SYNC ALL { AnalyzeSyncOpt *n = makeNode(AnalyzeSyncOpt); n->is_sync_from = false; n->nodes = NIL; - $$ = (Node *)n; + $$ = n; } - |*/ SYNC TO pgxcnode_list + | SYNC TO pgxcnode_list { AnalyzeSyncOpt *n = makeNode(AnalyzeSyncOpt); n->is_sync_from = false; n->nodes = $3; $$ = n; } - | SYNC FROM pgxcnode_list + | SYNC FROM pgxcnode_name { AnalyzeSyncOpt *n = makeNode(AnalyzeSyncOpt); n->is_sync_from = true; - n->nodes = $3; + n->nodes = list_make1(makeString($3)); $$ = n; } | /*EMPTY*/ { $$ = NULL; } diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c index 6521598b..8240f0d6 100644 --- a/src/backend/tcop/utility.c +++ b/src/backend/tcop/utility.c @@ -713,16 +713,6 @@ ProcessUtilityPre(PlannedStmt *pstmt, /* we choose to allow this during "read only" transactions */ PreventCommandDuringRecovery((stmt->options & VACOPT_VACUUM) ? "VACUUM" : "ANALYZE"); - /* When statement is emit by the coordinating node, the statement is not - * rewritten, we adapt it here */ - if (IsConnFromCoord() && IS_PGXC_COORDINATOR && - (stmt->options & VACOPT_ANALYZE) && stmt->sync_option) - { - stmt->sync_option->is_sync_from = true; - list_free_deep(stmt->sync_option->nodes); - stmt->sync_option->nodes = NIL; - stmt->sync_option->nodes = list_make1(makeString(parentPGXCNode)); - } if (!IsConnFromCoord() && IS_PGXC_COORDINATOR && stmt->sync_option && stmt->sync_option->nodes != NIL) { @@ -740,10 +730,10 @@ ProcessUtilityPre(PlannedStmt *pstmt, { ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("can not find coordinator %s!", + errmsg("Can not find coordinator %s!", strVal(lfirst(cell))))); } - if (node_type != PGXC_NODE_COORDINATOR) + else if (node_type != PGXC_NODE_COORDINATOR) { ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), @@ -752,11 +742,22 @@ ProcessUtilityPre(PlannedStmt *pstmt, } } } + + /* When statement is emit by the coordinating node, the statement is not + * rewritten, adapt it here */ + if (IsConnFromCoord() && IS_PGXC_COORDINATOR && + (stmt->options & VACOPT_ANALYZE) && stmt->sync_option) + { + stmt->sync_option->is_sync_from = true; + list_free_deep(stmt->sync_option->nodes); + stmt->sync_option->nodes = NIL; + stmt->sync_option->nodes = list_make1(makeString(parentPGXCNode)); + } /* - * We have to run the command on nodes before Coordinator because + * Not SYNC command, We have to run the command on nodes before Coordinator because * vacuum() pops active snapshot and we can not send it to nodes */ - if (!(stmt->options & VACOPT_COORDINATOR)) + else if (!(stmt->options & VACOPT_COORDINATOR)) exec_type = EXEC_ON_DATANODES; auto_commit = true; } @@ -1357,6 +1358,7 @@ ProcessUtilityPost(PlannedStmt *pstmt, case T_VacuumStmt: { VacuumStmt *vstmt = (VacuumStmt *)parsetree; + /* Send synchronization statements to other coordinator nodes */ if (!IsConnFromCoord() && IS_PGXC_COORDINATOR && (vstmt->options & VACOPT_ANALYZE) && vstmt->sync_option) { @@ -1377,14 +1379,6 @@ ProcessUtilityPost(PlannedStmt *pstmt, char node_type = PGXC_NODE_COORDINATOR; nodeIdx = PGXCNodeGetNodeIdFromName(strVal(lfirst(lc)), &node_type); - /* 
Assert(nodeIdx > 0 && nodeIdx < NumDataNodes); */ - /* if(node_type != PGXC_NODE_COORDINATOR){ */ - /* ereport(ERROR, */ - /* (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), */ - /* errmsg("node %s is not coordinator!", - * strVal(lfirst(lc))))); */ - /* } */ - /* already check/rewrite in pre, just add it */ exec_nodes->nodeList = lappend_int(exec_nodes->nodeList, nodeIdx); } } @@ -1392,81 +1386,6 @@ ProcessUtilityPost(PlannedStmt *pstmt, CommitTransactionCommand(); StartTransactionCommand(); } - /* if (vstmt->options & VACOPT_ANALYZE && vstmt->sync_option != NULL && */ - /* vstmt->sync_option->is_sync_from != true) */ - /* { */ - /* StringInfo queryStr = makeStringInfo(); */ - /* appendStringInfo(queryStr, "ANALYZE (COORDINATOR"); */ - /* if (vstmt->options & VACOPT_VERBOSE) */ - /* { */ - /* appendStringInfoString(queryStr, " ,VERBOSE"); */ - /* } */ - /* appendStringInfoChar(queryStr, ')'); */ - /* if (vstmt->relation) */ - /* appendStringInfo(queryStr, " %s", RangeVarGetName(vstmt->relation)); - */ - /* if (vstmt->va_cols) */ - /* { */ - /* ListCell *lc; */ - /* bool comma = false; */ - /* appendStringInfoString(queryStr, " ("); */ - /* foreach (lc, vstmt->va_cols) */ - /* { */ - /* if (comma) */ - /* comma = true; */ - /* else */ - /* appendStringInfoChar(queryStr, ','); */ - /* appendStringInfoString(queryStr, strVal(lfirst(lc))); */ - /* } */ - /* appendStringInfoChar(queryStr, ')'); */ - /* } */ - - /* appendStringInfo(queryStr, " SYNC FROM %s", PGXCNodeName); */ - /* PopActiveSnapshot(); */ - /* CommitTransactionCommand(); */ - /* StartTransactionCommand(); */ - /* if (vstmt->sync_option->nodes) */ - /* { */ - /* ExecNodes *execnodes; */ - /* ListCell *lc; */ - /* int nodeIdx; */ - /* execnodes = (ExecNodes *)makeNode(ExecNodes); */ - /* execnodes->accesstype = RELATION_ACCESS_INSERT; */ - /* execnodes->baselocatortype = LOCATOR_TYPE_SHARD; /\* not used *\/ */ - /* execnodes->en_expr = NULL; */ - /* execnodes->en_relid = InvalidOid; */ - /* execnodes->primarynodelist = NIL; */ - - /* foreach(lc, vstmt->sync_option->nodes){ */ - /* char node_type = PGXC_NODE_COORDINATOR; */ - /* nodeIdx = */ - /* PGXCNodeGetNodeIdFromName(strVal(lfirst(lc)), &node_type); */ - /* Assert(nodeIdx > 0 && nodeIdx < NumDataNodes); */ - /* execnodes->nodeList = lappend_int(execnodes->nodeList, nodeIdx); - */ - /* } */ - /* ExecUtilityStmtOnNodes(parsetree, */ - /* queryStr->data, */ - /* execnodes, */ - /* sentToRemote, */ - /* false, */ - /* EXEC_ON_COORDS, */ - /* false, */ - /* false); */ - /* list_free(execnodes->nodeList); */ - /* } */ - /* else */ - /* ExecUtilityStmtOnNodes(parsetree, */ - /* queryStr->data, */ - /* NULL, */ - /* sentToRemote, */ - /* auto_commit, */ - /* EXEC_ON_COORDS, */ - /* false, */ - /* false); */ - /* pfree(queryStr->data); */ - /* pfree(queryStr); */ - /* } */ break; } #ifdef _SHARDING_ From 1ddbbef3301e20505bb21ca30564be8a5ef2fcac Mon Sep 17 00:00:00 2001 From: aslanxli Date: Tue, 8 Mar 2022 08:53:48 +0800 Subject: [PATCH 530/578] format error info --- src/backend/tcop/utility.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c index 8240f0d6..b8d32cbb 100644 --- a/src/backend/tcop/utility.c +++ b/src/backend/tcop/utility.c @@ -721,24 +721,16 @@ ProcessUtilityPre(PlannedStmt *pstmt, foreach (cell, stmt->sync_option->nodes) { if (0 == strcmp(strVal(lfirst(cell)), PGXCNodeName)) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("Can not sync to/from local!"))); + elog(ERROR, 
"Can not sync to/from local!"); PGXCNodeGetNodeIdFromName(strVal(lfirst(cell)), &node_type); if (node_type == PGXC_NODE_NONE) { - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("Can not find coordinator %s!", - strVal(lfirst(cell))))); + elog(ERROR, "Can not find coordinator %s!", strVal(lfirst(cell))); } else if (node_type != PGXC_NODE_COORDINATOR) { - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("node %s is not coordinator!", - strVal(lfirst(cell))))); + elog(ERROR, "node %s is not coordinator!", strVal(lfirst(cell))); } } } From 2895697a8aff288bda95b822ff458a63152fa4de Mon Sep 17 00:00:00 2001 From: aslanxli Date: Tue, 8 Mar 2022 09:52:58 +0800 Subject: [PATCH 531/578] "analyze" and "analyze sync" behave the same: sync statistics by default --- src/backend/parser/gram.y | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y index 45aef61f..15f0cc65 100644 --- a/src/backend/parser/gram.y +++ b/src/backend/parser/gram.y @@ -11132,7 +11132,13 @@ analyze_sync_option : n->nodes = list_make1(makeString($3)); $$ = n; } - | /*EMPTY*/ { $$ = NULL; } + | /*EMPTY*/ + { + AnalyzeSyncOpt *n = makeNode(AnalyzeSyncOpt); + n->is_sync_from = false; + n->nodes = NIL; + $$ = n; + } ; opt_verbose: From 17339b16f9b83089b0f232a3e0be1840a65628cc Mon Sep 17 00:00:00 2001 From: aslanxli Date: Mon, 14 Mar 2022 09:01:22 +0800 Subject: [PATCH 532/578] format comment --- src/backend/tcop/utility.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c index b8d32cbb..16e95deb 100644 --- a/src/backend/tcop/utility.c +++ b/src/backend/tcop/utility.c @@ -735,8 +735,10 @@ ProcessUtilityPre(PlannedStmt *pstmt, } } - /* When statement is emit by the coordinating node, the statement is not - * rewritten, adapt it here */ + /* + * When statement is emit by the coordinating node, the statement is not + * rewritten, adapt it here + */ if (IsConnFromCoord() && IS_PGXC_COORDINATOR && (stmt->options & VACOPT_ANALYZE) && stmt->sync_option) { From 761bbda286a7f69641ae5b3cf55d6e85b2e3794d Mon Sep 17 00:00:00 2001 From: aslanxli Date: Tue, 22 Mar 2022 17:00:20 +0800 Subject: [PATCH 533/578] Added the feature: statistics synchronization.The ANALYZE syntax has been extended support sync vacuum statistics for relation cherry-pick: 9510c58d e47d6f98 808f8a6b 8a5348ac 378af856 60882fef --- src/backend/commands/analyze.c | 24 +++-- src/backend/commands/vacuum.c | 156 +++++++++++++++++++++++++++++---- src/backend/nodes/copyfuncs.c | 10 +-- src/backend/nodes/equalfuncs.c | 6 +- src/backend/parser/gram.y | 20 +++-- src/backend/tcop/utility.c | 27 ++++-- src/backend/utils/adt/dbsize.c | 3 +- src/include/commands/vacuum.h | 9 +- src/include/nodes/parsenodes.h | 6 +- 9 files changed, 202 insertions(+), 59 deletions(-) diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c index d558c5a3..901b0a60 100644 --- a/src/backend/commands/analyze.c +++ b/src/backend/commands/analyze.c @@ -121,7 +121,7 @@ static BufferAccessStrategy vac_strategy; static void do_analyze_rel(Relation onerel, int options, VacuumParams *params, List *va_cols, AcquireSampleRowsFunc acquirefunc, BlockNumber relpages, - bool inh, bool in_outer_xact, int elevel, AnalyzeSyncOpt *syncOpt); + bool inh, bool in_outer_xact, int elevel, StatSyncOpt *syncOpt); static void compute_index_stats(Relation onerel, double totalrows, AnlIndexData *indexdata, int nindexes, HeapTuple 
*rows, int numrows, @@ -146,7 +146,7 @@ static void analyze_rel_sync(Relation onerel, int nindexes, Relation *indexes, AnlIndexData *indexdata, - AnalyzeSyncOpt *syncOpt); + StatSyncOpt *syncOpt); #ifdef XCP static void analyze_rel_coordinator(Relation onerel, bool inh, int attr_cnt, @@ -177,7 +177,7 @@ analyze_rel(Oid relid, List *va_cols, bool in_outer_xact, BufferAccessStrategy bstrategy, - AnalyzeSyncOpt *syncOpt) + StatSyncOpt *syncOpt) { Relation onerel; int elevel; @@ -438,7 +438,7 @@ do_analyze_rel(Relation onerel, bool inh, bool in_outer_xact, int elevel, - AnalyzeSyncOpt *syncOpt) + StatSyncOpt *syncOpt) { int attr_cnt, tcnt, @@ -633,10 +633,8 @@ do_analyze_rel(Relation onerel, * Sync statistics if this session is connected to other remote Coordinator. * When receiving sync commands directly from the client, we also sync statistics. */ - if (iscoordinator && IsConnFromCoord() && - (syncOpt != NULL && syncOpt->is_sync_from == true)) + if (iscoordinator && (syncOpt != NULL && syncOpt->is_sync_from == true)) { - elog(INFO, "SYNC statistic"); analyze_rel_sync(onerel, inh, attr_cnt, @@ -5349,8 +5347,8 @@ acquire_coordinator_sample_rows(Relation onerel, int elevel, #endif -static RemoteQuery * -init_sync_remotequery(AnalyzeSyncOpt *syncOpt, char **cnname) +RemoteQuery * +init_sync_remotequery(StatSyncOpt *syncOpt, char **cnname) { RemoteQuery *step; ListCell *lc; @@ -5381,7 +5379,7 @@ init_sync_remotequery(AnalyzeSyncOpt *syncOpt, char **cnname) * sync relation stats from the coordinator node specified by syncOpt. */ static void -coord_sync_rel_stats(Relation onerel, AnalyzeSyncOpt *syncOpt) +coord_sync_rel_stats(Relation onerel, StatSyncOpt *syncOpt) { char *nspname; char *relname; @@ -5487,7 +5485,7 @@ coord_sync_col_stats(Relation onerel, bool inh, int attr_cnt, VacAttrStats **vacattrstats, - AnalyzeSyncOpt *syncOpt) + StatSyncOpt *syncOpt) { char *nspname; char *relname; @@ -5837,7 +5835,7 @@ coord_sync_col_stats(Relation onerel, * */ static void -coord_sync_extended_stats(Relation onerel, int attr_cnt, AnalyzeSyncOpt *syncOpt) +coord_sync_extended_stats(Relation onerel, int attr_cnt, StatSyncOpt *syncOpt) { char *nspname; char *relname; @@ -5948,7 +5946,7 @@ analyze_rel_sync(Relation onerel, int nindexes, Relation *indexes, AnlIndexData *indexdata, - AnalyzeSyncOpt *syncOpt) + StatSyncOpt *syncOpt) { int i; /* sync statistics for the relation */ diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c index f001e056..78cc13ef 100644 --- a/src/backend/commands/vacuum.c +++ b/src/backend/commands/vacuum.c @@ -98,7 +98,7 @@ static void vac_truncate_clog(TransactionId frozenXID, TransactionId lastSaneFrozenXid, MultiXactId lastSaneMinMulti); static bool vacuum_rel(Oid relid, RangeVar *relation, int options, - VacuumParams *params); + VacuumParams *params, StatSyncOpt *syncOpt); /* * Primary entry point for manual VACUUM and ANALYZE commands @@ -178,7 +178,7 @@ vacuum(int options, List *va_cols, BufferAccessStrategy bstrategy, bool isTopLevel, - AnalyzeSyncOpt *syncOpt) + StatSyncOpt *syncOpt) { const char *stmttype; volatile bool in_outer_xact, @@ -332,7 +332,7 @@ vacuum(int options, if (options & VACOPT_VACUUM) { - if (!vacuum_rel(relid, relation, options, params)) + if (!vacuum_rel(relid, relation, options, params, syncOpt)) continue; } @@ -1266,8 +1266,8 @@ vac_truncate_clog(TransactionId frozenXID, * At entry and exit, we are not inside a transaction. 
*/ static bool -vacuum_rel(Oid relid, RangeVar *relation, int options, VacuumParams *params) -{// #lizard forgives +vacuum_rel(Oid relid, RangeVar *relation, int options, VacuumParams *params, StatSyncOpt *syncOpt) +{ LOCKMODE lmode; Relation onerel; LockRelId onerelid; @@ -1328,7 +1328,7 @@ vacuum_rel(Oid relid, RangeVar *relation, int options, VacuumParams *params) foreach (lc, new_childs) { child = lfirst_oid(lc); - part_vacuum_result = vacuum_rel(child, relation, options, params); + part_vacuum_result = vacuum_rel(child, relation, options, params, syncOpt); } UnlockRelationIdForSession(&onerelid, RowExclusiveLock); pfree(new_childs); @@ -1554,7 +1554,7 @@ vacuum_rel(Oid relid, RangeVar *relation, int options, VacuumParams *params) */ if (toast_relid != InvalidOid) { - vacuum_rel(toast_relid, relation, options, params); + vacuum_rel(toast_relid, relation, options, params, syncOpt); } /* @@ -1574,7 +1574,7 @@ vacuum_rel(Oid relid, RangeVar *relation, int options, VacuumParams *params) */ if (IS_PGXC_COORDINATOR && onerel->rd_locator_info) { - vacuum_rel_coordinator(onerel, true, params); + vacuum_rel_coordinator(onerel, true, params, syncOpt); } else #endif @@ -1618,7 +1618,7 @@ vacuum_rel(Oid relid, RangeVar *relation, int options, VacuumParams *params) * totally unimportant for toast relations. */ if (toast_relid != InvalidOid) - vacuum_rel(toast_relid, relation, options, params); + vacuum_rel(toast_relid, relation, options, params, syncOpt); /* * Now release the session-level lock on the master table. @@ -1928,13 +1928,112 @@ get_remote_relstat(char *nspname, char *relname, bool replicated, } +/* + * Get relation statistics from coordinator node specified by syncOpt + */ +static void +sync_remote_relstat(char *nspname, char *relname, bool replicated, + int32 *pages, int32 *allvisiblepages, + float4 *tuples, TransactionId *frozenXid, StatSyncOpt *syncOpt) +{ + char *cnname; + StringInfoData query; + EState *estate; + MemoryContext oldcontext; + RemoteQuery *step; + RemoteQueryState *node; + TupleTableSlot *result; + + /* Make up query string */ + initStringInfo(&query); + appendStringInfo(&query, "SELECT c.relpages, " + "c.reltuples, " + "c.relallvisible, " + "c.relfrozenxid " + "FROM pg_class c JOIN pg_namespace n " + "ON c.relnamespace = n.oid " + "WHERE n.nspname = '%s' " + "AND c.relname = '%s'", + nspname, relname); + + /* Build up RemoteQuery */ + step = init_sync_remotequery(syncOpt, &cnname); + step->sql_statement = query.data; + step->force_autocommit = true; + + + /* Add targetlist entries */ + step->scan.plan.targetlist = lappend(step->scan.plan.targetlist, + make_relation_tle(RelationRelationId, + "pg_class", + "relpages")); + step->scan.plan.targetlist = lappend(step->scan.plan.targetlist, + make_relation_tle(RelationRelationId, + "pg_class", + "reltuples")); + step->scan.plan.targetlist = lappend(step->scan.plan.targetlist, + make_relation_tle(RelationRelationId, + "pg_class", + "relallvisible")); + step->scan.plan.targetlist = lappend(step->scan.plan.targetlist, + make_relation_tle(RelationRelationId, + "pg_class", + "relfrozenxid")); + + /* Execute query on the data nodes */ + estate = CreateExecutorState(); + oldcontext = MemoryContextSwitchTo(estate->es_query_cxt); + + node = ExecInitRemoteQuery(step, estate, 0); + MemoryContextSwitchTo(oldcontext); + /* get ready to combine results */ + *pages = 0; + *allvisiblepages = 0; + *tuples = 0.0; + *frozenXid = InvalidTransactionId; + + result = ExecRemoteQuery((PlanState *) node); + if (result != NULL && !TupIsNull(result)) 
+ { + Datum value; + bool isnull; + /* Process statistics from the data node */ + value = slot_getattr(result, 1, &isnull); /* relpages */ + if (!isnull) + { + *pages = DatumGetInt32(value); + } + value = slot_getattr(result, 2, &isnull); /* reltuples */ + if (!isnull) + { + *tuples = DatumGetFloat4(value); + } + value = slot_getattr(result, 3, &isnull); /* relallvisible */ + if (!isnull) + { + *allvisiblepages = DatumGetInt32(value); + } + value = slot_getattr(result, 4, &isnull); /* relfrozenxid */ + if (!isnull) + { + TransactionId xid = DatumGetTransactionId(value); + if (TransactionIdIsValid(xid)) + { + *frozenXid = xid; + } + } + } + ExecEndRemoteQuery(node); +} + + /* * Coordinator does not contain any data, so we never need to vacuum relations. * This function only updates optimizer statistics based on info from the * data nodes. */ void -vacuum_rel_coordinator(Relation onerel, bool is_outer, VacuumParams *params) +vacuum_rel_coordinator(Relation onerel, bool is_outer, VacuumParams *params, StatSyncOpt *syncOpt) { char *nspname; char *relname; @@ -1945,7 +2044,8 @@ vacuum_rel_coordinator(Relation onerel, bool is_outer, VacuumParams *params) TransactionId min_frozenxid; bool hasindex; bool replicated; - int rel_nodes; + int rel_nodes = 0; + bool isSync = false; #ifdef __TBASE__ TransactionId oldestXmin = InvalidTransactionId; TransactionId freezeLimit = InvalidTransactionId; @@ -1976,10 +2076,23 @@ vacuum_rel_coordinator(Relation onerel, bool is_outer, VacuumParams *params) * Get stats from the remote nodes. Function returns the number of nodes * returning correct stats. */ + if (syncOpt != NULL && syncOpt->is_sync_from == true && + !RELATION_IS_COORDINATOR_LOCAL(onerel)) + { + sync_remote_relstat(nspname, + relname, + replicated, + &num_pages, + &num_allvisible_pages, + &num_tuples, + &min_frozenxid, + syncOpt); + isSync = true; + }else rel_nodes = get_remote_relstat(nspname, relname, replicated, &num_pages, &num_allvisible_pages, &num_tuples, &min_frozenxid); - if (rel_nodes > 0) + if (rel_nodes > 0 || isSync) { int nindexes; Relation *Irel; @@ -1998,22 +2111,33 @@ vacuum_rel_coordinator(Relation onerel, bool is_outer, VacuumParams *params) int32 idx_pages, idx_allvisible_pages; float4 idx_tuples; TransactionId idx_frozenxid; - int idx_nodes; + int idx_nodes = 0; /* Get the index identifier */ relname = RelationGetRelationName(Irel[i]); nspname = get_namespace_name(RelationGetNamespace(Irel[i])); /* Index is replicated if parent relation is replicated */ + if(isSync) + { + sync_remote_relstat(nspname, + relname, + replicated, + &idx_pages, + &idx_allvisible_pages, + &idx_tuples, + &idx_frozenxid, + syncOpt); + }else idx_nodes = get_remote_relstat(nspname, relname, replicated, &idx_pages, &idx_allvisible_pages, &idx_tuples, &idx_frozenxid); - if (idx_nodes > 0) + if (idx_nodes > 0 || isSync) { /* * Do not update the frozenxid if information was not from * all the expected nodes. */ - if (idx_nodes < nodes) + if (idx_nodes < nodes && !isSync) { idx_frozenxid = InvalidTransactionId; } @@ -2038,7 +2162,7 @@ vacuum_rel_coordinator(Relation onerel, bool is_outer, VacuumParams *params) * Do not update the frozenxid if information was not from all * the expected nodes. 
*/ - if (rel_nodes < nodes) + if (rel_nodes < nodes && !isSync) { min_frozenxid = InvalidTransactionId; } diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c index 10a1d424..876c407e 100644 --- a/src/backend/nodes/copyfuncs.c +++ b/src/backend/nodes/copyfuncs.c @@ -4149,10 +4149,10 @@ _copyVacuumStmt(const VacuumStmt *from) return newnode; } -static AnalyzeSyncOpt * -_copyAnalyzeSyncOpt(const AnalyzeSyncOpt *from) +static StatSyncOpt * +_copyStatSyncOpt(const StatSyncOpt *from) { - AnalyzeSyncOpt *newnode = makeNode(AnalyzeSyncOpt); + StatSyncOpt *newnode = makeNode(StatSyncOpt); COPY_SCALAR_FIELD(is_sync_from); COPY_NODE_FIELD(nodes); @@ -5926,8 +5926,8 @@ copyObjectImpl(const void *from) case T_VacuumStmt: retval = _copyVacuumStmt(from); break; - case T_AnalyzeSyncOpt: - retval = _copyAnalyzeSyncOpt(from); + case T_StatSyncOpt: + retval = _copyStatSyncOpt(from); break; #ifdef _SHARDING_ case T_VacuumShardStmt: diff --git a/src/backend/nodes/equalfuncs.c b/src/backend/nodes/equalfuncs.c index 87934c2d..f5f2bc77 100644 --- a/src/backend/nodes/equalfuncs.c +++ b/src/backend/nodes/equalfuncs.c @@ -1700,7 +1700,7 @@ _equalVacuumStmt(const VacuumStmt *a, const VacuumStmt *b) } static bool -_equalAnalyzeSyncOpt(const AnalyzeSyncOpt *a, const AnalyzeSyncOpt *b) +_equalStatSyncOpt(const StatSyncOpt *a, const StatSyncOpt *b) { COMPARE_SCALAR_FIELD(is_sync_from); COMPARE_NODE_FIELD(nodes); @@ -3602,8 +3602,8 @@ equal(const void *a, const void *b) case T_VacuumStmt: retval = _equalVacuumStmt(a, b); break; - case T_AnalyzeSyncOpt: - retval = _equalAnalyzeSyncOpt(a, b); + case T_StatSyncOpt: + retval = _equalStatSyncOpt(a, b); break; #ifdef _SHARDING_ case T_VacuumShardStmt: diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y index 15f0cc65..dad866bf 100644 --- a/src/backend/parser/gram.y +++ b/src/backend/parser/gram.y @@ -10969,7 +10969,7 @@ cluster_index_specification: * *****************************************************************************/ -VacuumStmt: VACUUM opt_full opt_freeze opt_verbose +VacuumStmt: VACUUM opt_full opt_freeze opt_verbose analyze_sync_option { VacuumStmt *n = makeNode(VacuumStmt); n->options = VACOPT_VACUUM; @@ -10981,9 +10981,10 @@ VacuumStmt: VACUUM opt_full opt_freeze opt_verbose n->options |= VACOPT_VERBOSE; n->relation = NULL; n->va_cols = NIL; + n->sync_option = $5; $$ = (Node *)n; } - | VACUUM opt_full opt_freeze opt_verbose qualified_name + | VACUUM opt_full opt_freeze opt_verbose qualified_name analyze_sync_option { VacuumStmt *n = makeNode(VacuumStmt); n->options = VACOPT_VACUUM; @@ -10995,6 +10996,7 @@ VacuumStmt: VACUUM opt_full opt_freeze opt_verbose n->options |= VACOPT_VERBOSE; n->relation = $5; n->va_cols = NIL; + n->sync_option = $6; $$ = (Node *)n; } | VACUUM opt_full opt_freeze opt_verbose AnalyzeStmt @@ -11009,15 +11011,16 @@ VacuumStmt: VACUUM opt_full opt_freeze opt_verbose n->options |= VACOPT_VERBOSE; $$ = (Node *)n; } - | VACUUM '(' vacuum_option_list ')' + | VACUUM '(' vacuum_option_list ')' analyze_sync_option { VacuumStmt *n = makeNode(VacuumStmt); n->options = VACOPT_VACUUM | $3; n->relation = NULL; n->va_cols = NIL; + n->sync_option = $5; $$ = (Node *) n; } - | VACUUM '(' vacuum_option_list ')' qualified_name opt_name_list + | VACUUM '(' vacuum_option_list ')' qualified_name opt_name_list analyze_sync_option { VacuumStmt *n = makeNode(VacuumStmt); n->options = VACOPT_VACUUM | $3; @@ -11025,6 +11028,7 @@ VacuumStmt: VACUUM opt_full opt_freeze opt_verbose n->va_cols = $6; if (n->va_cols != NIL) /* implies 
analyze */ n->options |= VACOPT_ANALYZE; + n->sync_option = $7; $$ = (Node *) n; } /* _SHARDING_ BEGIN */ @@ -11113,28 +11117,28 @@ analyze_keyword: analyze_sync_option : SYNC ALL { - AnalyzeSyncOpt *n = makeNode(AnalyzeSyncOpt); + StatSyncOpt *n = makeNode(StatSyncOpt); n->is_sync_from = false; n->nodes = NIL; $$ = n; } | SYNC TO pgxcnode_list { - AnalyzeSyncOpt *n = makeNode(AnalyzeSyncOpt); + StatSyncOpt *n = makeNode(StatSyncOpt); n->is_sync_from = false; n->nodes = $3; $$ = n; } | SYNC FROM pgxcnode_name { - AnalyzeSyncOpt *n = makeNode(AnalyzeSyncOpt); + StatSyncOpt *n = makeNode(StatSyncOpt); n->is_sync_from = true; n->nodes = list_make1(makeString($3)); $$ = n; } | /*EMPTY*/ { - AnalyzeSyncOpt *n = makeNode(AnalyzeSyncOpt); + StatSyncOpt *n = makeNode(StatSyncOpt); n->is_sync_from = false; n->nodes = NIL; $$ = n; diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c index 16e95deb..87be1ef4 100644 --- a/src/backend/tcop/utility.c +++ b/src/backend/tcop/utility.c @@ -739,8 +739,7 @@ ProcessUtilityPre(PlannedStmt *pstmt, * When statement is emit by the coordinating node, the statement is not * rewritten, adapt it here */ - if (IsConnFromCoord() && IS_PGXC_COORDINATOR && - (stmt->options & VACOPT_ANALYZE) && stmt->sync_option) + if (IsConnFromCoord() && IS_PGXC_COORDINATOR && stmt->sync_option) { stmt->sync_option->is_sync_from = true; list_free_deep(stmt->sync_option->nodes); @@ -748,10 +747,10 @@ ProcessUtilityPre(PlannedStmt *pstmt, stmt->sync_option->nodes = list_make1(makeString(parentPGXCNode)); } /* - * Not SYNC command, We have to run the command on nodes before Coordinator because + * If it is not a SYNC FROM command, We have to run the command on nodes before Coordinator because * vacuum() pops active snapshot and we can not send it to nodes */ - else if (!(stmt->options & VACOPT_COORDINATOR)) + else if (!(stmt->options & VACOPT_COORDINATOR) && !(stmt->sync_option && stmt->sync_option->is_sync_from == true)) exec_type = EXEC_ON_DATANODES; auto_commit = true; } @@ -1352,9 +1351,21 @@ ProcessUtilityPost(PlannedStmt *pstmt, case T_VacuumStmt: { VacuumStmt *vstmt = (VacuumStmt *)parsetree; - /* Send synchronization statements to other coordinator nodes */ + if (vstmt->relation != NULL) + { + Relation rel = + relation_openrv_extended(vstmt->relation, NoLock, true, false); + if (rel && rel->rd_rel->relpersistence == RELPERSISTENCE_TEMP) + { + relation_close(rel, NoLock); + break; + } + if (rel) + relation_close(rel, NoLock); + } if (!IsConnFromCoord() && IS_PGXC_COORDINATOR && - (vstmt->options & VACOPT_ANALYZE) && vstmt->sync_option) + !IsInTransactionChain(context == PROCESS_UTILITY_TOPLEVEL) && + vstmt->sync_option) { exec_type = EXEC_ON_COORDS; if (vstmt->sync_option->nodes) @@ -1376,10 +1387,14 @@ ProcessUtilityPost(PlannedStmt *pstmt, exec_nodes->nodeList = lappend_int(exec_nodes->nodeList, nodeIdx); } } + if (ActiveSnapshotSet()) + { PopActiveSnapshot(); + } CommitTransactionCommand(); StartTransactionCommand(); } + auto_commit = true; break; } #ifdef _SHARDING_ diff --git a/src/backend/utils/adt/dbsize.c b/src/backend/utils/adt/dbsize.c index 17005175..eed6bb8b 100644 --- a/src/backend/utils/adt/dbsize.c +++ b/src/backend/utils/adt/dbsize.c @@ -567,9 +567,10 @@ pg_relation_size(PG_FUNCTION_ARGS) partoid = lfirst_oid(lc); child_rel = try_relation_open(partoid, AccessShareLock); + /* skip calculate size of child not exists */ if (NULL == child_rel) { - PG_RETURN_NULL(); + continue; } size += calculate_relation_size(&(child_rel->rd_node), child_rel->rd_backend, 
forkname_to_number(text_to_cstring(forkName)), NULL); diff --git a/src/include/commands/vacuum.h b/src/include/commands/vacuum.h index 9da04880..47859b0b 100644 --- a/src/include/commands/vacuum.h +++ b/src/include/commands/vacuum.h @@ -84,7 +84,7 @@ #include "storage/lock.h" #include "storage/relfilenode.h" #include "utils/relcache.h" - +#include "pgxc/planner.h" /*---------- * ANALYZE builds one of these structs for each attribute (column) that is @@ -295,7 +295,7 @@ extern void vacuum(int options, List *va_cols, BufferAccessStrategy bstrategy, bool isTopLevel, - AnalyzeSyncOpt *syncOpt); + StatSyncOpt *syncOpt); extern void vac_open_indexes(Relation relation, LOCKMODE lockmode, int *nindexes, Relation **Irel); extern void vac_close_indexes(int nindexes, Relation *Irel, LOCKMODE lockmode); @@ -323,7 +323,7 @@ extern void vacuum_set_xid_limits(Relation rel, extern void vac_update_datfrozenxid(void); extern void vacuum_delay_point(void); #ifdef XCP -extern void vacuum_rel_coordinator(Relation onerel, bool is_outer, VacuumParams *params); +extern void vacuum_rel_coordinator(Relation onerel, bool is_outer, VacuumParams *params, StatSyncOpt *syncOpt); TargetEntry *make_relation_tle(Oid reloid, const char *relname, const char *column); #endif @@ -350,13 +350,14 @@ extern void analyze_rel(Oid relid, List *va_cols, bool in_outer_xact, BufferAccessStrategy bstrategy, - AnalyzeSyncOpt *syncOpt); + StatSyncOpt *syncOpt); extern bool std_typanalyze(VacAttrStats *stats); /* in utils/misc/sampling.c --- duplicate of declarations in utils/sampling.h */ extern double anl_random_fract(void); extern double anl_init_selection_state(int n); extern double anl_get_next_S(double t, int n, double *stateptr); +extern RemoteQuery *init_sync_remotequery(StatSyncOpt *syncOpt, char **cnname); #ifdef __TBASE__ extern Size QueryAnalyzeInfoShmemSize(void); diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h index 4dc323ab..e8ac3d54 100644 --- a/src/include/nodes/parsenodes.h +++ b/src/include/nodes/parsenodes.h @@ -3301,13 +3301,13 @@ typedef enum VacuumOption */ } VacuumOption; -typedef struct AnalyzeSyncOpt +typedef struct StatSyncOpt { NodeTag type; bool is_sync_from; /* false: sync to other CN node; true: sync from node identified by node_name */ List *nodes; /* node list for sync to/from */ -} AnalyzeSyncOpt; +} StatSyncOpt; typedef struct VacuumStmt { @@ -3315,7 +3315,7 @@ typedef struct VacuumStmt int options; /* OR of VacuumOption flags */ RangeVar *relation; /* single table to process, or NULL */ List *va_cols; /* list of column names, or NIL for all */ - AnalyzeSyncOpt *sync_option; /* Sync statistics to/from other nodes, or NULL */ + StatSyncOpt *sync_option; /* Sync statistics to/from other nodes, or NULL */ } VacuumStmt; #ifdef _SHARDING_ From 529c1a75c2ae89dbf8992bdc935bd168c1566d09 Mon Sep 17 00:00:00 2001 From: aslanxli Date: Wed, 20 Apr 2022 17:36:42 +0800 Subject: [PATCH 534/578] Added the feature: statistics synchronization.The ANALYZE syntax has been extended free unused estate in coord_sync_rel_stats/coord_sync_col_stats/coord_sync_extended_stats/sync_remote_relstat --- src/backend/commands/analyze.c | 3 +++ src/backend/commands/vacuum.c | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c index 901b0a60..31b7cfbf 100644 --- a/src/backend/commands/analyze.c +++ b/src/backend/commands/analyze.c @@ -5474,6 +5474,7 @@ coord_sync_rel_stats(Relation onerel, StatSyncOpt *syncOpt) cnname))); } 
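The gram.y changes above attach analyze_sync_option to the plain VACUUM productions as well, so statistics synchronization can be requested directly from SQL. A minimal usage sketch follows; the table name my_table and the coordinator name cn002 are illustrative, and the behaviour described in the comments is inferred from the ProcessUtilityPre/ProcessUtilityPost handling and the sync_remote_relstat() path in this patch series:

-- gather statistics locally, then forward the statement to every other coordinator
VACUUM (ANALYZE) my_table SYNC ALL;

-- forward only to the listed coordinator(s)
VACUUM (ANALYZE) my_table SYNC TO cn002;

-- do not recompute: copy relpages/reltuples/relallvisible/relfrozenxid
-- for my_table from coordinator cn002 (the SYNC FROM path)
VACUUM (ANALYZE) my_table SYNC FROM cn002;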
ExecEndRemoteQuery(node); + FreeExecutorState(estate); } /* @@ -5819,6 +5820,7 @@ coord_sync_col_stats(Relation onerel, result = ExecRemoteQuery((PlanState *) node); } ExecEndRemoteQuery(node); + FreeExecutorState(estate); update_attstats(RelationGetRelid(onerel), inh, @@ -5936,6 +5938,7 @@ coord_sync_extended_stats(Relation onerel, int attr_cnt, StatSyncOpt *syncOpt) result = ExecRemoteQuery((PlanState *)node); } ExecEndRemoteQuery(node); + FreeExecutorState(estate); } static void diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c index 78cc13ef..217b82ef 100644 --- a/src/backend/commands/vacuum.c +++ b/src/backend/commands/vacuum.c @@ -2024,9 +2024,9 @@ sync_remote_relstat(char *nspname, char *relname, bool replicated, } } ExecEndRemoteQuery(node); + FreeExecutorState(estate); } - /* * Coordinator does not contain any data, so we never need to vacuum relations. * This function only updates optimizer statistics based on info from the From d637e4e55ea6989b19d1efe8c7d2d3e49e69b154 Mon Sep 17 00:00:00 2001 From: aslanxli Date: Thu, 21 Apr 2022 17:38:19 +0800 Subject: [PATCH 535/578] When analyzing a interval partitioned table, the sub-table not be locked when make the oids listd. Therefore, when serially analyzing sub-tables, if the sub-table is droped before processing, the analysis process will be interrupted. The fix is to use try_relation_open instead of relation_open when processing sub-tables, and skip if the opening fails. --- src/backend/commands/vacuum.c | 1 - src/backend/utils/adt/ruleutils.c | 143 ++++++++++++++---------------- src/include/utils/ruleutils.h | 2 - 3 files changed, 68 insertions(+), 78 deletions(-) diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c index 217b82ef..5dfa41e0 100644 --- a/src/backend/commands/vacuum.c +++ b/src/backend/commands/vacuum.c @@ -1340,7 +1340,6 @@ vacuum_rel(Oid relid, RangeVar *relation, int options, VacuumParams *params, Sta } } #endif - /* Begin a transaction for vacuuming this relation */ StartTransactionCommand(); diff --git a/src/backend/utils/adt/ruleutils.c b/src/backend/utils/adt/ruleutils.c index 80623c91..5ff8c9e7 100644 --- a/src/backend/utils/adt/ruleutils.c +++ b/src/backend/utils/adt/ruleutils.c @@ -92,7 +92,6 @@ #ifdef __COLD_HOT__ #include "postmaster/postmaster.h" #endif - #include "storage/lmgr.h" /* ---------- * Pretty formatting constants @@ -361,7 +360,7 @@ static void decompile_column_index_array(Datum column_index_array, Oid relId, static char *pg_get_ruledef_worker(Oid ruleoid, int prettyFlags); static char *pg_get_indexdef_worker(Oid indexrelid, int colno, const Oid *excludeOps, - bool attrsOnly, bool showTblSpc, bool inherits, + bool attrsOnly, bool showTblSpc, bool inherits, int prettyFlags, bool missing_ok); static char *pg_get_statisticsobj_worker(Oid statextid, bool missing_ok); static char *pg_get_partkeydef_worker(Oid relid, int prettyFlags, @@ -1144,7 +1143,7 @@ pg_get_indexdef(PG_FUNCTION_ARGS) prettyFlags = PRETTYFLAG_INDENT; - res = pg_get_indexdef_worker(indexrelid, 0, NULL, false, false, false, + res = pg_get_indexdef_worker(indexrelid, 0, NULL, false, false, false, prettyFlags, true); if (res == NULL) @@ -1165,7 +1164,7 @@ pg_get_indexdef_ext(PG_FUNCTION_ARGS) prettyFlags = pretty ? 
PRETTYFLAG_PAREN | PRETTYFLAG_INDENT : PRETTYFLAG_INDENT; res = pg_get_indexdef_worker(indexrelid, colno, NULL, colno != 0, false, - false, prettyFlags, true); + false, prettyFlags, true); if (res == NULL) PG_RETURN_NULL(); @@ -1181,7 +1180,7 @@ pg_get_indexdef_ext(PG_FUNCTION_ARGS) char * pg_get_indexdef_string(Oid indexrelid) { - return pg_get_indexdef_worker(indexrelid, 0, NULL, false, true, true, 0, false); + return pg_get_indexdef_worker(indexrelid, 0, NULL, false, true, true, 0, false); } /* Internal version that just reports the column definitions */ @@ -1191,7 +1190,7 @@ pg_get_indexdef_columns(Oid indexrelid, bool pretty) int prettyFlags; prettyFlags = pretty ? PRETTYFLAG_PAREN | PRETTYFLAG_INDENT : PRETTYFLAG_INDENT; - return pg_get_indexdef_worker(indexrelid, 0, NULL, true, false, false, + return pg_get_indexdef_worker(indexrelid, 0, NULL, true, false, false, prettyFlags, false); } @@ -1204,7 +1203,7 @@ pg_get_indexdef_columns(Oid indexrelid, bool pretty) static char * pg_get_indexdef_worker(Oid indexrelid, int colno, const Oid *excludeOps, - bool attrsOnly, bool showTblSpc, bool inherits, + bool attrsOnly, bool showTblSpc, bool inherits, int prettyFlags, bool missing_ok) {// #lizard forgives /* might want a separate isConstraint parameter later */ @@ -1320,11 +1319,11 @@ pg_get_indexdef_worker(Oid indexrelid, int colno, if (!attrsOnly) { if (!isConstraint) - appendStringInfo(&buf, "CREATE %sINDEX %s ON %s%s USING %s (", + appendStringInfo(&buf, "CREATE %sINDEX %s ON %s%s USING %s (", idxrec->indisunique ? "UNIQUE " : "", quote_identifier(NameStr(idxrelrec->relname)), - idxrelrec->relkind == RELKIND_PARTITIONED_INDEX - && !inherits ? "ONLY " : "", + idxrelrec->relkind == RELKIND_PARTITIONED_INDEX + && !inherits ? "ONLY " : "", generate_relation_name(indrelid, NIL), quote_identifier(NameStr(amrec->amname))); else /* currently, must be EXCLUDE constraint */ @@ -1476,14 +1475,14 @@ pg_get_indexdef_worker(Oid indexrelid, int colno, Oid tblspc; tblspc = get_rel_tablespace(indexrelid); - if (OidIsValid(tblspc)) - { + if (OidIsValid(tblspc)) + { if (isConstraint) appendStringInfoString(&buf, " USING INDEX"); appendStringInfo(&buf, " TABLESPACE %s", quote_identifier(get_tablespace_name(tblspc))); } - } + } /* * If it's a partial index, decompile and append the predicate @@ -1650,7 +1649,7 @@ pg_get_statisticsobj_worker(Oid statextid, bool missing_ok) * * Returns the partition key specification, ie, the following: * - * PARTITION BY { RANGE | LIST | HASH } (column opt_collation opt_opclass [, ...]) + * PARTITION BY { RANGE | LIST | HASH } (column opt_collation opt_opclass [, ...]) */ Datum pg_get_partkeydef(PG_FUNCTION_ARGS) @@ -1754,10 +1753,10 @@ pg_get_partkeydef_worker(Oid relid, int prettyFlags, switch (form->partstrat) { - case PARTITION_STRATEGY_HASH: - if (!attrsOnly) - appendStringInfo(&buf, "HASH"); - break; + case PARTITION_STRATEGY_HASH: + if (!attrsOnly) + appendStringInfo(&buf, "HASH"); + break; case PARTITION_STRATEGY_LIST: if (!attrsOnly) appendStringInfo(&buf, "LIST"); @@ -1854,7 +1853,7 @@ pg_get_partition_constraintdef(PG_FUNCTION_ARGS) constr_expr = get_partition_qual_relid(relationId); - /* Quick exit if no partition constraint */ + /* Quick exit if no partition constraint */ if (constr_expr == NULL) PG_RETURN_NULL(); @@ -2131,12 +2130,12 @@ pg_get_constraintdef_worker(Oid constraintId, bool fullCommand, pfree(options); } - /* - * Print the tablespace, unless it's the database default. 
- * This is to help ALTER TABLE usage of this facility, - * which needs this behavior to recreate exact catalog - * state. - */ + /* + * Print the tablespace, unless it's the database default. + * This is to help ALTER TABLE usage of this facility, + * which needs this behavior to recreate exact catalog + * state. + */ tblspc = get_rel_tablespace(indexId); if (OidIsValid(tblspc)) appendStringInfo(&buf, " USING INDEX TABLESPACE %s", @@ -2241,7 +2240,7 @@ pg_get_constraintdef_worker(Oid constraintId, bool fullCommand, operators, false, false, - false, + false, prettyFlags, false)); break; @@ -9387,23 +9386,23 @@ get_rule_expr(Node *node, deparse_context *context, ListCell *cell; char *sep; - if (spec->is_default) - { - appendStringInfoString(buf, "DEFAULT"); - break; - } - + if (spec->is_default) + { + appendStringInfoString(buf, "DEFAULT"); + break; + } + switch (spec->strategy) { - case PARTITION_STRATEGY_HASH: - Assert(spec->modulus > 0 && spec->remainder >= 0); - Assert(spec->modulus > spec->remainder); - - appendStringInfoString(buf, "FOR VALUES"); - appendStringInfo(buf, " WITH (modulus %d, remainder %d)", - spec->modulus, spec->remainder); - break; - + case PARTITION_STRATEGY_HASH: + Assert(spec->modulus > 0 && spec->remainder >= 0); + Assert(spec->modulus > spec->remainder); + + appendStringInfoString(buf, "FOR VALUES"); + appendStringInfo(buf, " WITH (modulus %d, remainder %d)", + spec->modulus, spec->remainder); + break; + case PARTITION_STRATEGY_LIST: Assert(spec->listdatums != NIL); @@ -12063,18 +12062,14 @@ RelationGetAllPartitionsWithLock(Relation rel, LOCKMODE lockmode) Oid partoid = InvalidOid; int partidx = 0; List * result = NULL; - nparts = RelationGetNParts(rel); - for(partidx = 0; partidx < nparts; partidx++) { partname = GetPartitionName(RelationGetRelid(rel), partidx, false); partoid = get_relname_relid(partname, RelationGetNamespace(rel)); - if(partname) pfree(partname); partname = NULL; - if (InvalidOid == partoid) { continue; @@ -12083,7 +12078,6 @@ RelationGetAllPartitionsWithLock(Relation rel, LOCKMODE lockmode) { /* Get the lock to synchronize against concurrent drop */ LockRelationOid(partoid, lockmode); - /* * Now that we have the lock, double-check to see if the relation * really exists or not. 
If not, assume it was dropped while we @@ -12099,39 +12093,38 @@ RelationGetAllPartitionsWithLock(Relation rel, LOCKMODE lockmode) } result = lappend_oid(result, partoid); } - return result; } int -GetAllPartitionIntervalCount(Oid parent_oid) -{ - int count = 0; - List *children = NULL; - Relation rel = heap_open(parent_oid, NoLock); - - children = RelationGetAllPartitions(rel); - - if(children) - { - count = children->length; - list_free(children); - } - - heap_close(rel, NoLock); - - return count; -} - -Datum -partitions_number(PG_FUNCTION_ARGS) -{ - Oid parent_oid = PG_GETARG_OID(0); - int ret = GetAllPartitionIntervalCount(parent_oid); - PG_RETURN_INT32(ret); -} - -int +GetAllPartitionIntervalCount(Oid parent_oid) +{ + int count = 0; + List *children = NULL; + Relation rel = heap_open(parent_oid, NoLock); + + children = RelationGetAllPartitions(rel); + + if(children) + { + count = children->length; + list_free(children); + } + + heap_close(rel, NoLock); + + return count; +} + +Datum +partitions_number(PG_FUNCTION_ARGS) +{ + Oid parent_oid = PG_GETARG_OID(0); + int ret = GetAllPartitionIntervalCount(parent_oid); + PG_RETURN_INT32(ret); +} + +int RelationGetChildIndex(Relation rel, Oid childoid) { int nparts = 0; diff --git a/src/include/utils/ruleutils.h b/src/include/utils/ruleutils.h index db0106ea..fd7d7ae4 100644 --- a/src/include/utils/ruleutils.h +++ b/src/include/utils/ruleutils.h @@ -105,8 +105,6 @@ extern List *RelationGetAllPartitions(Relation rel); extern List *RelationGetAllPartitionsWithLock(Relation rel, LOCKMODE lockmode); extern int GetAllPartitionIntervalCount(Oid parent_oid); -extern int GetAllPartitionIntervalCount(Oid parent_oid); - extern int RelationGetChildIndex(Relation rel, Oid childoid); extern Oid RelationGetPartitionIndex(Relation rel, Oid indexOid, int partidx); From 107470a78db5ae0050cc6d4197ec1dc521aecf41 Mon Sep 17 00:00:00 2001 From: jadenchi Date: Tue, 12 Apr 2022 16:36:20 +0800 Subject: [PATCH 536/578] fix: http://tapd.woa.com/20421696/bugtrace/bugs/view?bug_id=1020421696097145423&url_cache_key=from_url_bug_query_list_ebb900eeeb806309840478207fdf43ae --- src/backend/parser/parse_utilcmd.c | 37 ++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/src/backend/parser/parse_utilcmd.c b/src/backend/parser/parse_utilcmd.c index 08cb09e6..559aee3d 100644 --- a/src/backend/parser/parse_utilcmd.c +++ b/src/backend/parser/parse_utilcmd.c @@ -3312,6 +3312,9 @@ transformRuleStmt(RuleStmt *stmt, const char *queryString, } +/* check the year is leak year or common year */ +#define is_leak_year(year) ((year % 100 != 0 && year % 4 == 0) || (year % 400 == 0)) + /* * transformAlterTableStmt - * parse analysis for ALTER TABLE @@ -3472,6 +3475,10 @@ transformAlterTableStmt(Oid relid, AlterTableStmt *stmt, int newnparts; Oid groupId; + struct pg_tm start_time; + fsec_t start_sec; + int gap = 0; + existnparts = RelationGetNParts(rel); newnparts = ((AddDropPartitions*)cmd->def)->nparts; @@ -3480,6 +3487,36 @@ transformAlterTableStmt(Oid relid, AlterTableStmt *stmt, elog(ERROR, "number of partitions to add cannot be negative or zero"); } + /* + * Self-developed partition table compatibility processing + */ + Form_pg_partition_interval routerinfo = NULL; + routerinfo = rel->rd_partitions_info; + + if (routerinfo->partdatatype == TIMESTAMPOID) + { + /* timestamp convert to posix struct */ + if(timestamp2tm(routerinfo->partstartvalue_ts, NULL, &start_time, &start_sec, NULL, NULL) != 0) + ereport(ERROR, + (errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE), + 
errmsg("timestamp out of range"))); + + if (routerinfo->partinterval_type == IntervalType_Day && + !is_leak_year(start_time.tm_year) && start_time.tm_mon <= 2 && start_time.tm_mday <= 28) + { + if (start_time.tm_mon < 2) + gap = (31 - start_time.tm_mday) + 28 + 1; + else + gap = 28 - start_time.tm_mday + 1; + + if (gap >= existnparts && gap <= newnparts + existnparts) + { + newnparts++; + ((AddDropPartitions*)cmd->def)->nparts = newnparts; + } + } + } + if(newnparts + existnparts > MAX_NUM_INTERVAL_PARTITIONS) { elog(ERROR, "one table only have %d partitions at most", MAX_NUM_INTERVAL_PARTITIONS); From 4b3f78fb54308864fc09f1ded0293ba8ca926c5a Mon Sep 17 00:00:00 2001 From: jadenchi Date: Wed, 13 Apr 2022 11:35:36 +0800 Subject: [PATCH 537/578] fix: http://tapd.woa.com/20421696/bugtrace/bugs/view?bug_id=1020421696097145423&url_cache_key=from_url_bug_query_list_ebb900eeeb806309840478207fdf43ae, modify some codes --- src/backend/parser/parse_utilcmd.c | 54 ++++++++++++++++++------------ src/backend/utils/adt/ruleutils.c | 33 ++++++++++++++++++ src/include/utils/ruleutils.h | 1 + 3 files changed, 66 insertions(+), 22 deletions(-) diff --git a/src/backend/parser/parse_utilcmd.c b/src/backend/parser/parse_utilcmd.c index 559aee3d..3c2edaa0 100644 --- a/src/backend/parser/parse_utilcmd.c +++ b/src/backend/parser/parse_utilcmd.c @@ -3313,7 +3313,7 @@ transformRuleStmt(RuleStmt *stmt, const char *queryString, /* check the year is leak year or common year */ -#define is_leak_year(year) ((year % 100 != 0 && year % 4 == 0) || (year % 400 == 0)) +#define is_leap_year(year) ((year % 100 != 0 && year % 4 == 0) || (year % 400 == 0)) /* * transformAlterTableStmt - @@ -3472,12 +3472,16 @@ transformAlterTableStmt(Oid relid, AlterTableStmt *stmt, { int existnparts; int partidx; + int realPartidx; int newnparts; + int realNewnparts; Oid groupId; struct pg_tm start_time; fsec_t start_sec; - int gap = 0; + int year; + int mon; + int day; existnparts = RelationGetNParts(rel); newnparts = ((AddDropPartitions*)cmd->def)->nparts; @@ -3487,43 +3491,47 @@ transformAlterTableStmt(Oid relid, AlterTableStmt *stmt, elog(ERROR, "number of partitions to add cannot be negative or zero"); } + if(newnparts + existnparts > MAX_NUM_INTERVAL_PARTITIONS) + { + elog(ERROR, "one table only have %d partitions at most", MAX_NUM_INTERVAL_PARTITIONS); + } + /* * Self-developed partition table compatibility processing */ Form_pg_partition_interval routerinfo = NULL; routerinfo = rel->rd_partitions_info; - if (routerinfo->partdatatype == TIMESTAMPOID) - { /* timestamp convert to posix struct */ if(timestamp2tm(routerinfo->partstartvalue_ts, NULL, &start_time, &start_sec, NULL, NULL) != 0) ereport(ERROR, (errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE), errmsg("timestamp out of range"))); - if (routerinfo->partinterval_type == IntervalType_Day && - !is_leak_year(start_time.tm_year) && start_time.tm_mon <= 2 && start_time.tm_mday <= 28) - { - if (start_time.tm_mon < 2) - gap = (31 - start_time.tm_mday) + 28 + 1; - else - gap = 28 - start_time.tm_mday + 1; + year = start_time.tm_year; + mon = start_time.tm_mon; + day = start_time.tm_mday; - if (gap >= existnparts && gap <= newnparts + existnparts) + realPartidx = existnparts; + realNewnparts = newnparts; + + for(partidx = existnparts; partidx < existnparts + newnparts; partidx++) { - newnparts++; - ((AddDropPartitions*)cmd->def)->nparts = newnparts; - } - } - } + /* + * for compatible with the calculation of the normal time of the self-developed partition table + */ + if 
(routerinfo->partdatatype == TIMESTAMPOID && !is_leap_year(year) && routerinfo->partinterval_type == IntervalType_Day) + { + calculate_time(&year, &mon, &day, 1, IntervalType_Day, false); - if(newnparts + existnparts > MAX_NUM_INTERVAL_PARTITIONS) + if (mon == 2 && day == 28) { - elog(ERROR, "one table only have %d partitions at most", MAX_NUM_INTERVAL_PARTITIONS); + partidx--; + realNewnparts++; + ((AddDropPartitions*)cmd->def)->nparts = realNewnparts; + } } - for(partidx = existnparts; partidx < existnparts + newnparts; partidx++) - { TableLikeClause *likeclause = makeNode(TableLikeClause); CreateStmt * createpart = makeNode(CreateStmt); createpart->relation = copyObject((void *) stmt->relation); @@ -3535,7 +3543,7 @@ transformAlterTableStmt(Oid relid, AlterTableStmt *stmt, createpart->tableElts = lappend(createpart->tableElts, likeclause); createpart->interval_child = true; - createpart->interval_child_idx = partidx; + createpart->interval_child_idx = realPartidx; createpart->interval_parentId = RelationGetRelid(rel); @@ -3620,6 +3628,8 @@ transformAlterTableStmt(Oid relid, AlterTableStmt *stmt, #else createlist = list_concat(createlist, transformCreateStmt(createpart, queryString, true)); #endif + + realPartidx++; } } else diff --git a/src/backend/utils/adt/ruleutils.c b/src/backend/utils/adt/ruleutils.c index 5ff8c9e7..fa4fcd89 100644 --- a/src/backend/utils/adt/ruleutils.c +++ b/src/backend/utils/adt/ruleutils.c @@ -119,6 +119,7 @@ #ifdef __TBASE__ static int daysofmonth[13] = {0,31,29,31,30,31,30,31,31,30,31,30,31}; +static int daysofmonth_common_year[13] = {0,31,28,31,30,31,30,31,31,30,31,30,31}; static struct pg_tm g_partition_base_time = { 0, 0, @@ -13254,4 +13255,36 @@ is_first_day_from_start(int step, int steptype, struct pg_tm *start_time, struct return result; } + +/* + * base on a time, add step days + */ +void +calculate_time(int *year, int *mon, int *day, int step, int steptype, bool is_leap_year) +{ + int monDays; + + if (!is_leap_year) + monDays = daysofmonth_common_year[*mon]; + else + monDays = daysofmonth[*year]; + + /* partition by one day */ + if (step == 1 && steptype == IntervalType_Day) + { + if (*day == monDays) + { + *day = 1; + if (*mon < 12) + (*mon)++; + else + { + *mon = 1; + (*year)++; + } + } + else + (*day)++; + } +} #endif diff --git a/src/include/utils/ruleutils.h b/src/include/utils/ruleutils.h index fd7d7ae4..db582a18 100644 --- a/src/include/utils/ruleutils.h +++ b/src/include/utils/ruleutils.h @@ -95,6 +95,7 @@ extern List *select_rtable_names_for_explain(List *rtable, Bitmapset *rels_used); extern char *generate_collation_name(Oid collid); extern char *get_range_partbound_string(List *bound_datums); +extern void calculate_time(int *year, int *mon, int *day, int step, int steptype, bool is_leap_year); #ifdef __TBASE__ extern char * GetPartitionName(Oid parentrelid, int partidx, bool isindex); From 1e42ab850e81fb9b55e870208e8b4fcd3148567f Mon Sep 17 00:00:00 2001 From: jadenchi Date: Wed, 13 Apr 2022 21:27:48 +0800 Subject: [PATCH 538/578] add regress for 'fix common year partition' --- src/test/regress/expected/partition.out | 14 ++++++++++++++ src/test/regress/sql/partition.sql | 14 ++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/src/test/regress/expected/partition.out b/src/test/regress/expected/partition.out index d63e6d2f..331f0ead 100644 --- a/src/test/regress/expected/partition.out +++ b/src/test/regress/expected/partition.out @@ -1016,3 +1016,17 @@ explain (costs off) select * from t_in_test where c in ('20170901', '20171101'); 
reset enable_fast_query_shipping; drop table t_in_test; +-- for February of common year timestamp partition, add sub table should be ok +create table t_time_range (a int, b int, c timestamp) +partition by range (c) begin +(timestamp without time zone '2022-02-27 0:0:0') +step (interval '1 day') partitions (2) +distribute by shard(a) +to group default_group; +NOTICE: Replica identity is needed for shard table, please add to this table through "alter table" command. +insert into t_time_range values(1, 1, '2022-02-28'); +insert into t_time_range values(1, 1, '2022-03-1'); +ERROR: value to inserted execeed range of partitioned table +ALTER TABLE t_time_range ADD PARTITIONS 1; +insert into t_time_range values(1, 1, '2022-03-1'); +drop table t_time_range; diff --git a/src/test/regress/sql/partition.sql b/src/test/regress/sql/partition.sql index cc2e7dd0..564995be 100644 --- a/src/test/regress/sql/partition.sql +++ b/src/test/regress/sql/partition.sql @@ -457,3 +457,17 @@ set enable_fast_query_shipping to off; explain (costs off) select * from t_in_test where c in ('20170901', '20171101'); reset enable_fast_query_shipping; drop table t_in_test; + +-- for February of common year timestamp partition, add sub table should be ok +create table t_time_range (a int, b int, c timestamp) +partition by range (c) begin +(timestamp without time zone '2022-02-27 0:0:0') +step (interval '1 day') partitions (2) +distribute by shard(a) +to group default_group; + +insert into t_time_range values(1, 1, '2022-02-28'); +insert into t_time_range values(1, 1, '2022-03-1'); +ALTER TABLE t_time_range ADD PARTITIONS 1; +insert into t_time_range values(1, 1, '2022-03-1'); +drop table t_time_range; From 16ae553cfb1adbe9eb420096e7a4d0cc526dce5d Mon Sep 17 00:00:00 2001 From: jadenchi Date: Fri, 15 Apr 2022 11:21:20 +0800 Subject: [PATCH 539/578] Modification some time calculation for AT_AddPartitions --- src/backend/parser/parse_utilcmd.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/backend/parser/parse_utilcmd.c b/src/backend/parser/parse_utilcmd.c index 3c2edaa0..80f45171 100644 --- a/src/backend/parser/parse_utilcmd.c +++ b/src/backend/parser/parse_utilcmd.c @@ -3512,6 +3512,12 @@ transformAlterTableStmt(Oid relid, AlterTableStmt *stmt, mon = start_time.tm_mon; day = start_time.tm_mday; + if(routerinfo->partdatatype == TIMESTAMPOID && !is_leap_year(year) && routerinfo->partinterval_type == IntervalType_Day) + { + for(partidx = 1; partidx < existnparts; partidx++) + calculate_time(&year, &mon, &day, 1, IntervalType_Day, false); + } + realPartidx = existnparts; realNewnparts = newnparts; @@ -3524,7 +3530,7 @@ transformAlterTableStmt(Oid relid, AlterTableStmt *stmt, { calculate_time(&year, &mon, &day, 1, IntervalType_Day, false); - if (mon == 2 && day == 28) + if(mon == 3 && day == 1) { partidx--; realNewnparts++; From 51d57f24a6a8b40bbcf2c188237955f6121885bc Mon Sep 17 00:00:00 2001 From: jadenchi Date: Mon, 18 Apr 2022 16:35:14 +0800 Subject: [PATCH 540/578] add some regress for common/leap year partition --- src/test/regress/expected/partition.out | 36 ++++++++++++++++++++++-- src/test/regress/sql/partition.sql | 37 +++++++++++++++++++++++-- 2 files changed, 69 insertions(+), 4 deletions(-) diff --git a/src/test/regress/expected/partition.out b/src/test/regress/expected/partition.out index 331f0ead..af67ce24 100644 --- a/src/test/regress/expected/partition.out +++ b/src/test/regress/expected/partition.out @@ -1019,8 +1019,8 @@ drop table t_in_test; -- for February of common year timestamp 
partition, add sub table should be ok create table t_time_range (a int, b int, c timestamp) partition by range (c) begin -(timestamp without time zone '2022-02-27 0:0:0') -step (interval '1 day') partitions (2) +(timestamp without time zone '2022-02-26 0:0:0') +step (interval '1 day') partitions (3) distribute by shard(a) to group default_group; NOTICE: Replica identity is needed for shard table, please add to this table through "alter table" command. @@ -1030,3 +1030,35 @@ ERROR: value to inserted execeed range of partitioned table ALTER TABLE t_time_range ADD PARTITIONS 1; insert into t_time_range values(1, 1, '2022-03-1'); drop table t_time_range; +create table t_time_range (a int, b int, c timestamp) +partition by range (c) begin +(timestamp without time zone '2022-02-26 0:0:0') +step (interval '1 day') partitions (1) +distribute by shard(a) +to group default_group; +NOTICE: Replica identity is needed for shard table, please add to this table through "alter table" command. +insert into t_time_range values(1, 1, '2022-02-26'); +ALTER TABLE t_time_range ADD PARTITIONS 2; +insert into t_time_range values(1, 1, '2022-02-28'); +insert into t_time_range values(1, 1, '2022-03-1'); +ERROR: value to inserted execeed range of partitioned table +ALTER TABLE t_time_range ADD PARTITIONS 1; +insert into t_time_range values(1, 1, '2022-03-1'); +drop table t_time_range; +-- for February of leap year timestamp partition, add sub table should be ok +create table t_time_range (a int, b int, c timestamp) +partition by range (c) begin +(timestamp without time zone '2020-02-26 0:0:0') +step (interval '1 day') partitions (3) +distribute by shard(a) +to group default_group; +NOTICE: Replica identity is needed for shard table, please add to this table through "alter table" command. 
+insert into t_time_range values(1, 1, '2020-02-26'); +insert into t_time_range values(1, 1, '2020-02-27'); +insert into t_time_range values(1, 1, '2020-02-28'); +insert into t_time_range values(1, 1, '2020-02-29'); +ERROR: value to inserted execeed range of partitioned table +ALTER TABLE t_time_range ADD PARTITIONS 2; +insert into t_time_range values(1, 1, '2020-02-29'); +insert into t_time_range values(1, 1, '2020-03-01'); +drop table t_time_range; diff --git a/src/test/regress/sql/partition.sql b/src/test/regress/sql/partition.sql index 564995be..43c95f97 100644 --- a/src/test/regress/sql/partition.sql +++ b/src/test/regress/sql/partition.sql @@ -461,8 +461,8 @@ drop table t_in_test; -- for February of common year timestamp partition, add sub table should be ok create table t_time_range (a int, b int, c timestamp) partition by range (c) begin -(timestamp without time zone '2022-02-27 0:0:0') -step (interval '1 day') partitions (2) +(timestamp without time zone '2022-02-26 0:0:0') +step (interval '1 day') partitions (3) distribute by shard(a) to group default_group; @@ -471,3 +471,36 @@ insert into t_time_range values(1, 1, '2022-03-1'); ALTER TABLE t_time_range ADD PARTITIONS 1; insert into t_time_range values(1, 1, '2022-03-1'); drop table t_time_range; + +create table t_time_range (a int, b int, c timestamp) +partition by range (c) begin +(timestamp without time zone '2022-02-26 0:0:0') +step (interval '1 day') partitions (1) +distribute by shard(a) +to group default_group; + +insert into t_time_range values(1, 1, '2022-02-26'); +ALTER TABLE t_time_range ADD PARTITIONS 2; +insert into t_time_range values(1, 1, '2022-02-28'); +insert into t_time_range values(1, 1, '2022-03-1'); +ALTER TABLE t_time_range ADD PARTITIONS 1; +insert into t_time_range values(1, 1, '2022-03-1'); +drop table t_time_range; + +-- for February of leap year timestamp partition, add sub table should be ok +create table t_time_range (a int, b int, c timestamp) +partition by range (c) begin +(timestamp without time zone '2020-02-26 0:0:0') +step (interval '1 day') partitions (3) +distribute by shard(a) +to group default_group; + +insert into t_time_range values(1, 1, '2020-02-26'); +insert into t_time_range values(1, 1, '2020-02-27'); +insert into t_time_range values(1, 1, '2020-02-28'); +insert into t_time_range values(1, 1, '2020-02-29'); +ALTER TABLE t_time_range ADD PARTITIONS 2; +insert into t_time_range values(1, 1, '2020-02-29'); +insert into t_time_range values(1, 1, '2020-03-01'); +drop table t_time_range; + From c38ab6bcf3875c345617fb56262ec15dad6e4a2b Mon Sep 17 00:00:00 2001 From: jadenchi Date: Tue, 19 Apr 2022 15:54:26 +0800 Subject: [PATCH 541/578] modify function name calculate_time to add_day_calculation --- src/backend/parser/parse_utilcmd.c | 4 ++-- src/backend/utils/adt/ruleutils.c | 2 +- src/include/utils/ruleutils.h | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/backend/parser/parse_utilcmd.c b/src/backend/parser/parse_utilcmd.c index 80f45171..2be306b7 100644 --- a/src/backend/parser/parse_utilcmd.c +++ b/src/backend/parser/parse_utilcmd.c @@ -3515,7 +3515,7 @@ transformAlterTableStmt(Oid relid, AlterTableStmt *stmt, if(routerinfo->partdatatype == TIMESTAMPOID && !is_leap_year(year) && routerinfo->partinterval_type == IntervalType_Day) { for(partidx = 1; partidx < existnparts; partidx++) - calculate_time(&year, &mon, &day, 1, IntervalType_Day, false); + add_day_calculation(&year, &mon, &day, 1, IntervalType_Day, false); } realPartidx = existnparts; @@ -3528,7 +3528,7 
@@ transformAlterTableStmt(Oid relid, AlterTableStmt *stmt, */ if (routerinfo->partdatatype == TIMESTAMPOID && !is_leap_year(year) && routerinfo->partinterval_type == IntervalType_Day) { - calculate_time(&year, &mon, &day, 1, IntervalType_Day, false); + add_day_calculation(&year, &mon, &day, 1, IntervalType_Day, false); if(mon == 3 && day == 1) { diff --git a/src/backend/utils/adt/ruleutils.c b/src/backend/utils/adt/ruleutils.c index fa4fcd89..ef5eb6ca 100644 --- a/src/backend/utils/adt/ruleutils.c +++ b/src/backend/utils/adt/ruleutils.c @@ -13260,7 +13260,7 @@ is_first_day_from_start(int step, int steptype, struct pg_tm *start_time, struct * base on a time, add step days */ void -calculate_time(int *year, int *mon, int *day, int step, int steptype, bool is_leap_year) +add_day_calculation(int *year, int *mon, int *day, int step, int steptype, bool is_leap_year) { int monDays; diff --git a/src/include/utils/ruleutils.h b/src/include/utils/ruleutils.h index db582a18..03d502f9 100644 --- a/src/include/utils/ruleutils.h +++ b/src/include/utils/ruleutils.h @@ -95,7 +95,7 @@ extern List *select_rtable_names_for_explain(List *rtable, Bitmapset *rels_used); extern char *generate_collation_name(Oid collid); extern char *get_range_partbound_string(List *bound_datums); -extern void calculate_time(int *year, int *mon, int *day, int step, int steptype, bool is_leap_year); +extern void add_day_calculation(int *year, int *mon, int *day, int step, int steptype, bool is_leap_year); #ifdef __TBASE__ extern char * GetPartitionName(Oid parentrelid, int partidx, bool isindex); From 0bbd7846eb3447030f9393a8ea0921c5ac704d8b Mon Sep 17 00:00:00 2001 From: jadenchi Date: Fri, 22 Apr 2022 15:18:49 +0800 Subject: [PATCH 542/578] fix add_day_calculation get daysofmonth by mon --- src/backend/utils/adt/ruleutils.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/utils/adt/ruleutils.c b/src/backend/utils/adt/ruleutils.c index ef5eb6ca..7d16c0f6 100644 --- a/src/backend/utils/adt/ruleutils.c +++ b/src/backend/utils/adt/ruleutils.c @@ -13267,7 +13267,7 @@ add_day_calculation(int *year, int *mon, int *day, int step, int steptype, bool if (!is_leap_year) monDays = daysofmonth_common_year[*mon]; else - monDays = daysofmonth[*year]; + monDays = daysofmonth[*mon]; /* partition by one day */ if (step == 1 && steptype == IntervalType_Day) From c96c7115f74309724d9fc728e8a1d80026339f64 Mon Sep 17 00:00:00 2001 From: whalesong Date: Fri, 15 Apr 2022 14:43:44 +0800 Subject: [PATCH 543/578] bugfix: server time different cause 2pc clean error (merge request 1170), http://tapd.oa.com/20421696/bugtrace/bugs/view?bug_id=1020421696096815567 --- contrib/pg_clean/pg_clean--1.0.sql | 6 + .../pg_clean/pg_clean--unpackaged--1.0.sql | 1 + contrib/pg_clean/pg_clean.c | 934 ++++++++++++++---- src/backend/access/transam/twophase.c | 51 +- src/backend/access/transam/xlog.c | 42 +- src/backend/pgxc/pool/execRemote.c | 49 +- src/backend/postmaster/clean2pc.c | 45 +- src/backend/utils/misc/guc.c | 20 +- src/include/access/twophase.h | 4 + 9 files changed, 910 insertions(+), 242 deletions(-) diff --git a/contrib/pg_clean/pg_clean--1.0.sql b/contrib/pg_clean/pg_clean--1.0.sql index e5bbc9ca..be8623f7 100644 --- a/contrib/pg_clean/pg_clean--1.0.sql +++ b/contrib/pg_clean/pg_clean--1.0.sql @@ -50,6 +50,11 @@ RETURNS text AS 'MODULE_PATHNAME' LANGUAGE C; +CREATE FUNCTION pgxc_get_2pc_prepare_timestamp(gid text) +RETURNS text +AS 'MODULE_PATHNAME' +LANGUAGE C; + CREATE FUNCTION pgxc_get_2pc_commit_timestamp(gid text) RETURNS 
text AS 'MODULE_PATHNAME' @@ -96,6 +101,7 @@ GRANT ALL ON FUNCTION pg_clean_check_txn(time_interval integer) TO PUBLIC; GRANT ALL ON FUNCTION pgxc_get_2pc_nodes(gid text) TO PUBLIC; GRANT ALL ON FUNCTION pgxc_get_2pc_startnode(gid text) TO PUBLIC; GRANT ALL ON FUNCTION pgxc_get_2pc_startxid(gid text) TO PUBLIC; +GRANT ALL ON FUNCTION pgxc_get_2pc_prepare_timestamp(gid text) TO PUBLIC; GRANT ALL ON FUNCTION pgxc_get_2pc_commit_timestamp(gid text) TO PUBLIC; GRANT ALL ON FUNCTION pgxc_get_2pc_xid(gid text) TO PUBLIC; GRANT ALL ON FUNCTION pgxc_get_2pc_file(gid text) TO PUBLIC; diff --git a/contrib/pg_clean/pg_clean--unpackaged--1.0.sql b/contrib/pg_clean/pg_clean--unpackaged--1.0.sql index a6a67659..d173a607 100644 --- a/contrib/pg_clean/pg_clean--unpackaged--1.0.sql +++ b/contrib/pg_clean/pg_clean--unpackaged--1.0.sql @@ -9,6 +9,7 @@ ALTER EXTENSION pg_clean ADD function pg_clean_check_txn(time_interval integer); ALTER EXTENSION pg_clean ADD function pgxc_get_2pc_nodes(gid text); ALTER EXTENSION pg_clean ADD function pgxc_get_2pc_startnode(gid text); ALTER EXTENSION pg_clean ADD function pgxc_get_2pc_startxid(gid text); +ALTER EXTENSION pg_clean ADD function pgxc_get_2pc_prepare_timestamp(gid text); ALTER EXTENSION pg_clean ADD function pgxc_get_2pc_commit_timestamp(gid text); ALTER EXTENSION pg_clean ADD function pgxc_get_2pc_xid(gid text); ALTER EXTENSION pg_clean ADD function pgxc_get_2pc_file(gid text); diff --git a/contrib/pg_clean/pg_clean.c b/contrib/pg_clean/pg_clean.c index 68d916a5..4ee21911 100644 --- a/contrib/pg_clean/pg_clean.c +++ b/contrib/pg_clean/pg_clean.c @@ -63,8 +63,14 @@ int transaction_threshold = 200000; #define MAXIMUM_OUTPUT_FILE 1000 #define XIDPREFIX "_$XC$" #define DEFAULT_CLEAN_TIME_INTERVAL 120 -#define LEAST_CLEAN_TIME_INTERVAL 3 /* should not clean twophase trans prepared in 3s */ + +#ifdef __TWO_PHASE_TESTS__ +#define LEAST_CLEAN_TIME_INTERVAL 1 /* should not clean twophase trans prepared in 1s */ #define LEAST_CHECK_TIME_INTERVAL 1 /* should not check twophase trans prepared in 1s */ +#else +#define LEAST_CLEAN_TIME_INTERVAL 10 /* should not clean twophase trans prepared in 10s */ +#define LEAST_CHECK_TIME_INTERVAL 3 /* should not check twophase trans prepared in 3s */ +#endif GlobalTimestamp clean_time_interval = DEFAULT_CLEAN_TIME_INTERVAL * USECS_PER_SEC; @@ -72,19 +78,15 @@ PG_MODULE_MAGIC; #define MAX_GID 64 -#define CLEAN_CHECK_TIMES_DEFAULT 3 -#define CLEAN_CHECK_INTERVAL_DEFAULT 100000 - -#define CLEAN_NODE_CHECK_TIMES 5 -#define CLEAN_NODE_CHECK_INTERVAL 500000 - #define MAX_DBNAME 64 #define GET_START_XID "startxid:" +#define GET_PREPARE_TIMESTAMP "global_prepare_timestamp:" #define GET_COMMIT_TIMESTAMP "global_commit_timestamp:" #define GET_START_NODE "startnode:" #define GET_NODE "nodes:" #define GET_XID "\nxid:" #define GET_READONLY "readonly" +#define ROLLBACK_POSTFIX ".rollback" /* 2pc file postfix when the 2pc is rollbacked */ #define GIDSIZE (200 + 24) #define MAX_TWOPC_TXN 1000 #define STRING_BUFF_LEN 1024 @@ -190,6 +192,7 @@ typedef struct txn_info TXN_STATUS *txn_stat; /* Array for each nodes */ char *msg; /* Notice message for this txn. 
*/ GlobalTimestamp global_commit_timestamp; /* get global_commit_timestamp from node once it is committed*/ + GlobalTimestamp global_prepare_timestamp; /* get global_prepare_timestamp from node once it is prepared*/ TXN_STATUS global_txn_stat; OPERATION op; @@ -262,8 +265,10 @@ database_info *last_database_info = NULL; bool execute = false; int total_twopc_txn = 0; -TimestampTz current_time; -GlobalTimestamp abnormal_time = InvalidGlobalTimestamp; +TimestampTz current_time = 0; +TimestampTz abnormal_time = 0; +GlobalTimestamp current_gts = InvalidGlobalTimestamp; /* use to save current gts */ +GlobalTimestamp abnormal_gts = InvalidGlobalTimestamp; /* use to save abnormal gts, clean 2PCs which prepare gts less than abnormal gts */ char *abnormal_nodename = NULL; Oid abnormal_nodeoid = InvalidOid; bool clear_2pc_belong_node = false; @@ -341,6 +346,14 @@ static void static void get_node_handles(PGXCNodeAllHandles ** pgxc_handles, Oid nodeoid); +uint32 get_start_xid_from_gid(char *gid); +char *get_start_node_from_gid(char *gid); +Oid get_start_node_oid_from_gid(char *gid); + +bool is_xid_running_on_node(uint32 xid, Oid node_oid); +bool is_gid_start_xid_running(char *gid); +bool is_txn_start_xid_running(txn_info *txn); + Datum pg_clean_execute(PG_FUNCTION_ARGS); PG_FUNCTION_INFO_V1(pg_clean_execute); Datum pg_clean_execute(PG_FUNCTION_ARGS) @@ -494,6 +507,7 @@ Datum pg_clean_execute_on_node(PG_FUNCTION_ARGS) char txn_status[100]; char txn_op[100]; char txn_op_issuccess[100]; + int64 time_gap = 0; Datum values[ACCESS_CONTROL_ATTR_NUM]; bool nulls[ACCESS_CONTROL_ATTR_NUM]; @@ -540,21 +554,36 @@ Datum pg_clean_execute_on_node(PG_FUNCTION_ARGS) execute = true; clear_2pc_belong_node = true; + if (0 == PG_GETARG_DATUM(0)) + { + elog(ERROR, "pg_clean_execute_on_node: node name is empty"); + } abnormal_nodename = text_to_cstring(PG_GETARG_TEXT_P(0)); abnormal_nodeoid = get_pgxc_nodeoid(abnormal_nodename); if (InvalidOid == abnormal_nodeoid) { - elog(ERROR, "pg_clean_execute_on_node, cannot clear 2pc of invalid nodename '%s'", abnormal_nodename); + elog(ERROR, "pg_clean_execute_on_node, cannot clear 2pc of " + "invalid nodename '%s'", abnormal_nodename); } abnormal_time = PG_GETARG_INT64(1); current_time = GetCurrentTimestamp(); - if (abnormal_time >= current_time - LEAST_CLEAN_TIME_INTERVAL * USECS_PER_SEC) + time_gap = current_time - abnormal_time; + if (time_gap < LEAST_CLEAN_TIME_INTERVAL * USECS_PER_SEC) { - elog(ERROR, "pg_clean_execute_on_node, least clean time interval is %ds, " - "abnormal time: " INT64_FORMAT ", current_time: " INT64_FORMAT, + /*time gap less than LEAST_CLEAN_TIME_INTERVAL, can not clean*/ + elog(ERROR, "pg_clean_execute_on_node, least clean interval is %ds, " + "abnormal time: " INT64_FORMAT ", current time: " INT64_FORMAT, LEAST_CLEAN_TIME_INTERVAL, abnormal_time, current_time); } + current_gts = GetGlobalTimestampGTM(); + if (!GlobalTimestampIsValid(current_gts)) + { + /*get invalid gts, can not clean*/ + elog(ERROR, "pg_clean_execute_on_node, get invalid gts"); + } + abnormal_gts = current_gts - time_gap; + /*get node list*/ PgxcNodeGetOids(&cn_node_list, &dn_node_list, &cn_nodes_num, &dn_nodes_num, true); @@ -770,7 +799,9 @@ static void ResetGlobalVariables(void) head_database_info = last_database_info = NULL; current_time = 0; - abnormal_time = InvalidGlobalTimestamp; + abnormal_time = 0; + current_gts = InvalidGlobalTimestamp; + abnormal_gts = InvalidGlobalTimestamp; abnormal_nodename = NULL; abnormal_nodeoid = InvalidOid; clear_2pc_belong_node = false; @@ -922,7 +953,7 
@@ static void getDatabaseList(void) { int i; TupleTableSlots result_db; - const char *query_db = "select datname::text from pg_database;"; + const char *query_db = "select datname::text from pg_catalog.pg_database"; /*add datname into tail of head_database_info*/ if (execute_query_on_single_node(my_nodeoid, query_db, 1, &result_db) == (Datum) 1) { @@ -979,6 +1010,12 @@ static void getTxnInfoOnNodesAll(void) { int i; current_time = GetCurrentTimestamp(); + current_gts = GetGlobalTimestampGTM(); + if (!GlobalTimestampIsValid(current_gts)) + { + /*get invalid gts, get txn info error*/ + elog(ERROR, "getTxnInfoOnNodesAll, get invalid gts"); + } /*upload 2PC transaction from CN*/ for (i = 0; i < cn_nodes_num; i++) { @@ -1002,10 +1039,12 @@ void getTxnInfoOnNode(Oid node) TupleTableSlots result_txn; Datum execute_res; char query_execute[1024]; - const char *query_txn_status = "select transaction::text, gid::text, owner::text, database::text, timestamptz_out(prepared)::text " - "from pg_prepared_xacts;"; - const char *query_txn_status_execute = "select transaction::text, gid::text, owner::text, database::text, timestamptz_out(prepared)::text " - "from pg_prepared_xacts where database = '%s';"; + const char *query_txn_status = "select transaction::text, gid::text, " + "owner::text, database::text, timestamptz_out(prepared)::text " + "from pg_catalog.pg_prepared_xacts"; + const char *query_txn_status_execute = "select transaction::text, gid::text, " + "owner::text, database::text, timestamptz_out(prepared)::text " + "from pg_catalog.pg_prepared_xacts where database = '%s'"; snprintf(query_execute, 1024, query_txn_status_execute, get_database_name(MyDatabaseId)); if (execute) @@ -1106,6 +1145,7 @@ TWOPHASE_FILE_STATUS GetTransactionPartNodes(txn_info *txn, Oid node_oid) char *file_content = NULL; uint32 startxid = 0; char *str_startxid = NULL; + char *str_prepare_gts = NULL; char *str_timestamp = NULL; char *temp = NULL; Oid temp_nodeoid; @@ -1113,7 +1153,7 @@ TWOPHASE_FILE_STATUS GetTransactionPartNodes(txn_info *txn, Oid node_oid) int temp_nodeidx; char stmt[1024]; static const char *STMT_FORM = "select pgxc_get_2pc_file('%s')::text"; - snprintf(stmt, 1024, STMT_FORM, txn->gid, txn->gid, txn->gid, txn->gid); + snprintf(stmt, 1024, STMT_FORM, txn->gid); if (execute_query_on_single_node(node_oid, stmt, 1, &result) == (Datum) 1) { @@ -1126,6 +1166,12 @@ TWOPHASE_FILE_STATUS GetTransactionPartNodes(txn_info *txn, Oid node_oid) { file_content = TTSgetvalue(&result, 0, 0); + if (strlen(file_content) == 0) + { + elog(LOG, "gid: %s, 2pc file is not exist", txn->gid); + return TWOPHASE_FILE_NOT_EXISTS; + } + if (!IsXidImplicit(txn->gid) && strstr(file_content, GET_READONLY)) { txn->is_readonly = true; @@ -1135,6 +1181,7 @@ TWOPHASE_FILE_STATUS GetTransactionPartNodes(txn_info *txn, Oid node_oid) } startnode = strstr(file_content, GET_START_NODE); str_startxid = strstr(file_content, GET_START_XID); + str_prepare_gts = strstr(file_content, GET_PREPARE_TIMESTAMP); partnodes = strstr(file_content, GET_NODE); temp = strstr(file_content, GET_COMMIT_TIMESTAMP); @@ -1146,6 +1193,7 @@ TWOPHASE_FILE_STATUS GetTransactionPartNodes(txn_info *txn, Oid node_oid) temp = strstr(temp, GET_COMMIT_TIMESTAMP); } + /* get start node name */ if (startnode) { startnode += strlen(GET_START_NODE); @@ -1153,6 +1201,7 @@ TWOPHASE_FILE_STATUS GetTransactionPartNodes(txn_info *txn, Oid node_oid) txn->origcoord = get_pgxc_nodeoid(startnode); } + /* get start xid */ if (str_startxid) { str_startxid += strlen(GET_START_XID); @@ -1161,6 
+1210,7 @@ TWOPHASE_FILE_STATUS GetTransactionPartNodes(txn_info *txn, Oid node_oid) txn->startxid = startxid; } + /* get participated nodes */ if (partnodes) { partnodes += strlen(GET_NODE); @@ -1183,15 +1233,37 @@ TWOPHASE_FILE_STATUS GetTransactionPartNodes(txn_info *txn, Oid node_oid) return res; } + /* get prepare gts */ + if (str_prepare_gts) + { + str_prepare_gts += strlen(GET_PREPARE_TIMESTAMP); + str_prepare_gts = strtok(str_prepare_gts, "\n"); + txn->global_prepare_timestamp = strtoull(str_prepare_gts, NULL, 10); + } + else + { + txn->global_prepare_timestamp = InvalidGlobalTimestamp; + } + + /* get commit gts */ if (str_timestamp) { str_timestamp += strlen(GET_COMMIT_TIMESTAMP); str_timestamp = strtok(str_timestamp, "\n"); txn->global_commit_timestamp = strtoull(str_timestamp, NULL, 10); } + else + { + txn->global_commit_timestamp = InvalidGlobalTimestamp; + } + + elog(DEBUG1, "get 2pc txn: %s, partnodes in nodename: %s(nodeoid:%u), " + "partnodes: (%s), startnode: %s(startnodeoid: %u), startxid: %u, " + "global_prepare_timestamp: %ld, global_commit_timestamp: %ld", + txn->gid, get_pgxc_nodename(node_oid), node_oid, + partnodes, startnode, txn->origcoord, startxid, + txn->global_prepare_timestamp, txn->global_commit_timestamp); - elog(DEBUG1, "get 2pc txn:%s partnodes in nodename: %s (nodeoid:%u) result: partnodes:%s, startnode:%s, startnodeoid:%u, startxid:%u", - txn->gid, get_pgxc_nodename(node_oid), node_oid, partnodes, startnode, txn->origcoord, startxid); /* in explicit transaction startnode participate the transaction */ if (strstr(partnodes, startnode) || !IsXidImplicit(txn->gid)) { @@ -1457,7 +1529,8 @@ void getTxnInfoOnOtherNodes(txn_info *txn) node_oid = get_pgxc_nodeoid(ptr); status = GetTransactionPartNodes(txn, node_oid); } - else + + if (status == TWOPHASE_FILE_NOT_EXISTS) { for (ii = 0; ii < cn_nodes_num + dn_nodes_num; ii++) { @@ -1622,7 +1695,7 @@ void getTxnStatus(txn_info *txn, int node_idx) TupleTableSlots result; static const char *STMT_FORM = "SELECT pgxc_is_committed('%d'::xid)::text"; - snprintf(stmt, 1024, STMT_FORM, txn->xid[node_idx], txn->xid[node_idx]); + snprintf(stmt, 1024, STMT_FORM, txn->xid[node_idx]); node_oid = find_node_oid(node_idx); if (0 != execute_query_on_single_node(node_oid, stmt, 1, &result)) @@ -1713,6 +1786,10 @@ char *get2PCInfo(const char *tid) return NULL; } +/* + * pgxc_get_2pc_file + * Get 2pc file content + */ Datum pgxc_get_2pc_file(PG_FUNCTION_ARGS); PG_FUNCTION_INFO_V1(pgxc_get_2pc_file); Datum pgxc_get_2pc_file(PG_FUNCTION_ARGS) @@ -1721,6 +1798,10 @@ Datum pgxc_get_2pc_file(PG_FUNCTION_ARGS) char *result = NULL; text *t_result = NULL; + if (0 == PG_GETARG_DATUM(0)) + { + elog(ERROR, "2PC gid is empty"); + } tid = text_to_cstring(PG_GETARG_TEXT_P(0)); result = get2PCInfo(tid); if (NULL != result) @@ -1732,7 +1813,10 @@ Datum pgxc_get_2pc_file(PG_FUNCTION_ARGS) PG_RETURN_NULL(); } - +/* + * pgxc_get_2pc_nodes + * Get 2pc participants + */ Datum pgxc_get_2pc_nodes(PG_FUNCTION_ARGS); PG_FUNCTION_INFO_V1(pgxc_get_2pc_nodes); Datum pgxc_get_2pc_nodes(PG_FUNCTION_ARGS) @@ -1742,6 +1826,10 @@ Datum pgxc_get_2pc_nodes(PG_FUNCTION_ARGS) char *nodename = NULL; text *t_result = NULL; + if (0 == PG_GETARG_DATUM(0)) + { + elog(ERROR, "2PC gid is empty"); + } tid = text_to_cstring(PG_GETARG_TEXT_P(0)); result = get2PCInfo(tid); if (NULL != result) @@ -1756,10 +1844,13 @@ Datum pgxc_get_2pc_nodes(PG_FUNCTION_ARGS) return PointerGetDatum(t_result); } } - PG_RETURN_NULL(); } +/* + * pgxc_get_2pc_startnode + * Get 2pc start node + */ Datum 
pgxc_get_2pc_startnode(PG_FUNCTION_ARGS); PG_FUNCTION_INFO_V1(pgxc_get_2pc_startnode); Datum pgxc_get_2pc_startnode(PG_FUNCTION_ARGS) @@ -1769,6 +1860,10 @@ Datum pgxc_get_2pc_startnode(PG_FUNCTION_ARGS) char *nodename = NULL; text *t_result = NULL; + if (0 == PG_GETARG_DATUM(0)) + { + elog(ERROR, "2PC gid is empty"); + } tid = text_to_cstring(PG_GETARG_TEXT_P(0)); result = get2PCInfo(tid); if (NULL != result) @@ -1787,6 +1882,10 @@ Datum pgxc_get_2pc_startnode(PG_FUNCTION_ARGS) PG_RETURN_NULL(); } +/* + * pgxc_get_2pc_startxid + * Get 2pc start xid + */ Datum pgxc_get_2pc_startxid(PG_FUNCTION_ARGS); PG_FUNCTION_INFO_V1(pgxc_get_2pc_startxid); Datum pgxc_get_2pc_startxid(PG_FUNCTION_ARGS) @@ -1796,6 +1895,10 @@ Datum pgxc_get_2pc_startxid(PG_FUNCTION_ARGS) char *startxid = NULL; text *t_result = NULL; + if (0 == PG_GETARG_DATUM(0)) + { + elog(ERROR, "2PC gid is empty"); + } tid = text_to_cstring(PG_GETARG_TEXT_P(0)); result = get2PCInfo(tid); if (NULL != result) @@ -1813,7 +1916,44 @@ Datum pgxc_get_2pc_startxid(PG_FUNCTION_ARGS) PG_RETURN_NULL(); } +/* + * pgxc_get_2pc_prepare_timestamp + * Get 2pc prepare timestamp + */ +Datum pgxc_get_2pc_prepare_timestamp(PG_FUNCTION_ARGS); +PG_FUNCTION_INFO_V1(pgxc_get_2pc_prepare_timestamp); +Datum pgxc_get_2pc_prepare_timestamp(PG_FUNCTION_ARGS) +{ + char *tid = NULL; + char *result = NULL; + char *prepare_timestamp = NULL; + text *t_result = NULL; + + if (0 == PG_GETARG_DATUM(0)) + { + elog(ERROR, "2PC gid is empty"); + } + tid = text_to_cstring(PG_GETARG_TEXT_P(0)); + result = get2PCInfo(tid); + if (NULL != result) + { + prepare_timestamp = strstr(result, GET_PREPARE_TIMESTAMP); + if (NULL != prepare_timestamp) + { + prepare_timestamp += strlen(GET_PREPARE_TIMESTAMP); + prepare_timestamp = strtok(prepare_timestamp, "\n"); + t_result = cstring_to_text(prepare_timestamp); + pfree(result); + return PointerGetDatum(t_result); + } + } + PG_RETURN_NULL(); +} +/* + * pgxc_get_2pc_commit_timestamp + * Get 2pc commit timestamp + */ Datum pgxc_get_2pc_commit_timestamp(PG_FUNCTION_ARGS); PG_FUNCTION_INFO_V1(pgxc_get_2pc_commit_timestamp); Datum pgxc_get_2pc_commit_timestamp(PG_FUNCTION_ARGS) @@ -1823,6 +1963,10 @@ Datum pgxc_get_2pc_commit_timestamp(PG_FUNCTION_ARGS) char *commit_timestamp = NULL; text *t_result = NULL; + if (0 == PG_GETARG_DATUM(0)) + { + elog(ERROR, "2PC gid is empty"); + } tid = text_to_cstring(PG_GETARG_TEXT_P(0)); result = get2PCInfo(tid); if (NULL != result) @@ -1840,17 +1984,23 @@ Datum pgxc_get_2pc_commit_timestamp(PG_FUNCTION_ARGS) PG_RETURN_NULL(); } - - +/* + * pgxc_get_2pc_xid + * Get 2pc local xid + */ Datum pgxc_get_2pc_xid(PG_FUNCTION_ARGS); PG_FUNCTION_INFO_V1(pgxc_get_2pc_xid); Datum pgxc_get_2pc_xid(PG_FUNCTION_ARGS) { + GlobalTransactionId xid; char *tid = NULL; char *result = NULL; char *str_xid = NULL; - GlobalTransactionId xid; + if (0 == PG_GETARG_DATUM(0)) + { + elog(ERROR, "2PC gid is empty"); + } tid = text_to_cstring(PG_GETARG_TEXT_P(0)); result = get2PCInfo(tid); if (NULL != result) @@ -1868,16 +2018,31 @@ Datum pgxc_get_2pc_xid(PG_FUNCTION_ARGS) PG_RETURN_NULL(); } +/* + * pgxc_remove_2pc_records + * Remove a 2pc file + */ Datum pgxc_remove_2pc_records(PG_FUNCTION_ARGS); PG_FUNCTION_INFO_V1(pgxc_remove_2pc_records); Datum pgxc_remove_2pc_records(PG_FUNCTION_ARGS) { - char *tid = text_to_cstring(PG_GETARG_TEXT_P(0)); + char *tid = NULL; + + if (0 == PG_GETARG_DATUM(0)) + { + elog(ERROR, "2PC gid is empty"); + } + tid = text_to_cstring(PG_GETARG_TEXT_P(0)); + remove_2pc_records(tid, true); pfree(tid); 
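+	/* the 2pc record file has been removed above; report success to the caller */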
PG_RETURN_BOOL(true); } +/* + * pgxc_clear_2pc_records + * Clear all 2pc files which are not running + */ Datum pgxc_clear_2pc_records(PG_FUNCTION_ARGS); PG_FUNCTION_INFO_V1(pgxc_clear_2pc_records); Datum pgxc_clear_2pc_records(PG_FUNCTION_ARGS) @@ -1901,6 +2066,8 @@ Datum pgxc_clear_2pc_records(PG_FUNCTION_ARGS) elog(ERROR, "can only called on coordinator"); } + elog(LOG, "clear 2pc files"); + mycontext = AllocSetContextCreate(CurrentMemoryContext, "clean_check", ALLOCSET_DEFAULT_MINSIZE, @@ -1909,25 +2076,6 @@ Datum pgxc_clear_2pc_records(PG_FUNCTION_ARGS) oldcontext = MemoryContextSwitchTo(mycontext); ResetGlobalVariables(); -#if 0 - if((dir = opendir(TWOPHASE_RECORD_DIR))) - { - while((ptr = readdir(dir)) != NULL) - { - if (count > 999) - break; - if(strcmp(ptr->d_name,".") == 0 || strcmp(ptr->d_name,"..") == 0) - { - continue; - } - snprintf(path[count], MAX_GID, "/%s", ptr->d_name); - //snprintf(path[count], MAX_GID, "/%s", ptr->d_name); - count++; - } - - closedir(dir); - } -#endif /*get node list*/ PgxcNodeGetOids(&cn_node_list, &dn_node_list, @@ -1948,28 +2096,14 @@ Datum pgxc_clear_2pc_records(PG_FUNCTION_ARGS) { (void) execute_query_on_single_node(dn_node_list[i], query, 1, result+cn_nodes_num+i); } + /*get all database info*/ getDatabaseList(); /*get all info of 2PC transactions*/ getTxnInfoOnNodesAll(); -#if 0 - if((dir = opendir(TWOPHASE_RECORD_DIR))) - { - while (i < count) - { - if (!find_txn(path[i])) - { - unlink(path[i]); - WriteClean2pcXlogRec(path[i]); - } - i++; - } - closedir(dir); - } -#endif - /*delete all rest 2pc file in each nodes*/ + /*delete all rest 2pc files in each cn*/ for (i = 0; i < cn_nodes_num; i++) { if (0 == result[i].slot_count) @@ -1977,24 +2111,54 @@ Datum pgxc_clear_2pc_records(PG_FUNCTION_ARGS) continue; } if (!(twopcfiles = TTSgetvalue(result+i, 0, 0))) + { continue; + } + + /*iterate through all 2pc files, delete rest ones*/ ptr = strtok(twopcfiles, ","); - while(ptr) + for (;ptr != NULL; ptr = strtok(NULL, ",")) { if (count >= MAXIMUM_CLEAR_FILE) + { break; - if (!find_txn(ptr)) + } + + /*whether 2pc is running?*/ + if (find_txn(ptr)) + { + /*2pc is running, do not delete its file*/ + continue; + } + + /*whether 2pc is rollbacked?*/ + if (strstr(ptr, ROLLBACK_POSTFIX) == NULL) { + /*2pc is not rollbacked*/ + + /*whether 2pc start xid transaction is running?*/ + if (is_gid_start_xid_running(ptr)) + { + /*2pc start xid transaction is running, do not delete its file*/ + elog(LOG, "2PC '%s' is running", ptr); + continue; + } + } + + /*2pc is not running, delete its file*/ snprintf(clear_query, 100, CLEAR_STMT, ptr); - if (execute_query_on_single_node(cn_node_list[i], clear_query, 1, &clear_result) == (Datum)0) + elog(LOG, "clear 2pc file: %s", ptr); + if (execute_query_on_single_node(cn_node_list[i], + clear_query, 1, &clear_result) == (Datum)0) + { res = false; + } DropTupleTableSlots(&clear_result); count++; } - ptr = strtok(NULL, ","); - } } + /*delete all rest 2pc files in each dn*/ for (i = 0; i < dn_nodes_num; i++) { if (0 == result[cn_nodes_num+i].slot_count) @@ -2002,22 +2166,51 @@ Datum pgxc_clear_2pc_records(PG_FUNCTION_ARGS) continue; } if (!(twopcfiles = TTSgetvalue(result+cn_nodes_num+i, 0, 0))) + { continue; + } + + /*iterate through all 2pc files, delete rest ones*/ ptr = strtok(twopcfiles, ","); - while(ptr) + for (;ptr != NULL; ptr = strtok(NULL, ",")) { if (count >= MAXIMUM_CLEAR_FILE) + { break; - if (!find_txn(ptr)) + } + + /*whether 2pc is running?*/ + if (find_txn(ptr)) + { + /*2pc is running, do not delete its file*/ + 
continue; + } + + /*whether 2pc is rollbacked?*/ + if (strstr(ptr, ROLLBACK_POSTFIX) == NULL) + { + /*2pc is not rollbacked*/ + + /*whether 2pc start xid transaction is running?*/ + if (is_gid_start_xid_running(ptr)) { + /*2pc start xid transaction is running, do not delete its file*/ + elog(LOG, "2PC '%s' is running", ptr); + continue; + } + } + + /*2pc is not running, delete its file*/ snprintf(clear_query, 100, CLEAR_STMT, ptr); - if (execute_query_on_single_node(dn_node_list[i], clear_query, 1, &clear_result) == (Datum)0) + elog(LOG, "clear 2pc file: %s", ptr); + if (execute_query_on_single_node(dn_node_list[i], + clear_query, 1, &clear_result) == (Datum)0) + { res = false; + } DropTupleTableSlots(&clear_result); count++; } - ptr = strtok(NULL, ","); - } } for (i = 0; i < pgxc_clean_node_count; i++) @@ -2033,6 +2226,10 @@ Datum pgxc_clear_2pc_records(PG_FUNCTION_ARGS) PG_RETURN_BOOL(res); } +/* + * pgxc_get_record_list + * Get 2pc files list + */ Datum pgxc_get_record_list(PG_FUNCTION_ARGS); PG_FUNCTION_INFO_V1(pgxc_get_record_list); Datum pgxc_get_record_list(PG_FUNCTION_ARGS) @@ -2047,7 +2244,11 @@ Datum pgxc_get_record_list(PG_FUNCTION_ARGS) recordList = get_2pc_list_from_cache(&count); if (count >= MAXIMUM_OUTPUT_FILE) { - Assert(NULL != recordList); + if (NULL == recordList) + { + elog(PANIC, "recordList is NULL"); + } + t_recordList = cstring_to_text(recordList); return PointerGetDatum(t_recordList); } @@ -2130,8 +2331,18 @@ Datum pgxc_commit_on_node(PG_FUNCTION_ARGS) cn_health_map = palloc0(cn_nodes_num * sizeof(bool)); dn_health_map = palloc0(dn_nodes_num * sizeof(bool)); + if (0 == PG_GETARG_DATUM(0)) + { + elog(ERROR, "pgxc_commit_on_node: node name is empty"); + } nodename = text_to_cstring(PG_GETARG_TEXT_P(0)); + + if (0 == PG_GETARG_DATUM(1)) + { + elog(ERROR, "pgxc_commit_on_node: gid is empty"); + } gid = text_to_cstring(PG_GETARG_TEXT_P(1)); + nodeoid = get_pgxc_nodeoid(nodename); if (InvalidOid == nodeoid) { @@ -2163,6 +2374,10 @@ Datum pgxc_commit_on_node(PG_FUNCTION_ARGS) else { txn->global_commit_timestamp = GetGlobalTimestampGTM(); + if (!GlobalTimestampIsValid(current_gts)) + { + elog(ERROR, "pgxc_commit_on_node, get invalid gts"); + } } } @@ -2236,8 +2451,18 @@ Datum pgxc_abort_on_node(PG_FUNCTION_ARGS) cn_health_map = palloc0(cn_nodes_num * sizeof(bool)); dn_health_map = palloc0(dn_nodes_num * sizeof(bool)); + if (0 == PG_GETARG_DATUM(0)) + { + elog(ERROR, "pgxc_abort_on_node: node name is empty"); + } nodename = text_to_cstring(PG_GETARG_TEXT_P(0)); + + if (0 == PG_GETARG_DATUM(1)) + { + elog(ERROR, "pgxc_abort_on_node: gid is empty"); + } gid = text_to_cstring(PG_GETARG_TEXT_P(1)); + nodeoid = get_pgxc_nodeoid(nodename); if (InvalidOid == nodeoid) { @@ -2403,6 +2628,15 @@ bool send_query_clean_transaction(PGXCNodeHandle* conn, txn_info *txn, const cha TXN_STATUS_COMMITTED == txn->global_txn_stat ? "COMMIT" : "ROLLBACK"))); } + if (InvalidGlobalTimestamp != txn->global_prepare_timestamp && + pgxc_node_send_prepare_timestamp(conn, txn->global_prepare_timestamp)) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("in pg_clean failed to send prepare timestamp for %s PREPARED command", + TXN_STATUS_COMMITTED == txn->global_txn_stat ? 
"COMMIT" : "ROLLBACK"))); + } + if (NULL != txn->participants && pgxc_node_send_partnodes(conn, txn->participants)) { ereport(ERROR, @@ -2428,7 +2662,13 @@ bool check_2pc_belong_node(txn_info * txn) int node_index = 0; char node_type; node_index = find_node_index(abnormal_nodeoid); - Assert(InvalidOid != abnormal_nodeoid); + + /* abnormal node oid must be valid here */ + if (InvalidOid == abnormal_nodeoid) + { + elog(PANIC, "abnormal_nodeoid is invalid"); + } + if (abnormal_nodeoid == txn->origcoord) { txn->belong_abnormal_node = true; @@ -2448,9 +2688,7 @@ bool check_2pc_belong_node(txn_info * txn) if (InvalidOid == txn->origcoord) { - char *startnode = NULL; int node_oid = InvalidOid; - char gid[MAX_GID]; if (!IsXidImplicit(txn->gid)) { @@ -2458,39 +2696,16 @@ bool check_2pc_belong_node(txn_info * txn) return true; } - Assert(IsXidImplicit(txn->gid)); - - /* get start node from gid */ - strcpy(gid, txn->gid); - startnode = strtok(gid, ":"); - if (NULL == startnode) - { - elog(WARNING, "get startnode(%s) from gid(%s) failed", - startnode, gid); - txn->belong_abnormal_node = false; - return false; - } - - startnode = strtok(NULL, ":"); - if (NULL == startnode) + /* Get start node oid from gid */ + node_oid = get_start_node_oid_from_gid(txn->gid); + if (node_oid == InvalidOid) { - elog(WARNING, "get startnode(%s) from gid(%s) failed", - startnode, gid); + elog(WARNING, "Get invalid start node oid from gid(%s)", txn->gid); txn->belong_abnormal_node = false; return false; } - node_oid = get_pgxc_nodeoid(startnode); - if (NULL == startnode) - { - elog(WARNING, "get invalid oid for startnode(%s) from gid(%s)", - startnode, gid); - txn->belong_abnormal_node = false; - return false; - } - - elog(DEBUG5, "get oid(%d) for startnode(%s) from gid(%s)", - node_oid, startnode, gid); + elog(DEBUG1, "Get start node oid(%d) from gid(%s)", node_oid, txn->gid); if (abnormal_nodeoid == node_oid) { @@ -2518,22 +2733,13 @@ bool check_node_participate(txn_info * txn, int node_idx) void recover2PC(txn_info * txn) { - int i = 0; - bool check_ok = false; - int check_times = CLEAN_CHECK_TIMES_DEFAULT; - int check_interval = CLEAN_CHECK_INTERVAL_DEFAULT; + bool is_running = true; MemoryContext current_context = NULL; ErrorData* edata = NULL; TXN_STATUS txn_stat; txn_stat = check_txn_global_status(txn); txn->global_txn_stat = txn_stat; - if (clear_2pc_belong_node) - { - check_times = CLEAN_NODE_CHECK_TIMES; - check_interval = CLEAN_NODE_CHECK_INTERVAL; - } - #ifdef DEBUG_EXECABORT txn_stat = TXN_STATUS_ABORTED; #endif @@ -2567,46 +2773,59 @@ void recover2PC(txn_info * txn) else { txn->op = COMMIT; - /* check whether all nodes can commit prepared */ - for (i = 0; i < check_times; i++) + + /* check whether the 2pc start xid is 0 */ + if (txn->startxid == 0 && IsXidImplicit(txn->gid)) + { + elog(WARNING, "Commit 2PC '%s' start xid is 0", txn->gid); + txn->op_issuccess = false; + return; + } + + /* check whether the 2pc start xid is still running on start node */ + if (is_txn_start_xid_running(txn)) { - check_ok = true; + elog(WARNING, "Commit 2PC '%s' start xid %d is running", + txn->gid, txn->startxid); + txn->op_issuccess = false; + return; + } + + /* check whether the 2pc is still running on participants */ + is_running = false; current_context = CurrentMemoryContext; PG_TRY(); { if (!clean_2PC_iscommit(txn, true, true)) { - check_ok = false; - elog(LOG, "check commit 2PC transaction %s failed", - txn->gid); + is_running = true; + elog(WARNING, "Commit 2PC '%s' check failed", txn->gid); } } PG_CATCH(); { + is_running 
= true; (void)MemoryContextSwitchTo(current_context); edata = CopyErrorData(); FlushErrorState(); - check_ok = false; - elog(WARNING, "check commit 2PC transaction %s error: %s", + elog(WARNING, "Commit 2PC '%s' is running, error: %s", txn->gid, edata->message); } PG_END_TRY(); - if (!check_ok) + /* 2pc is still running, do not try to clean */ + if (is_running) { txn->op_issuccess = false; return; } - pg_usleep(check_interval); - } - /* send commit prepared to all nodes */ if (!clean_2PC_iscommit(txn, true, false)) { txn->op_issuccess = false; - elog(LOG, "commit 2PC transaction %s failed", txn->gid); + elog(WARNING, "Commit 2PC '%s' failed", txn->gid); return; } txn->op_issuccess = true; @@ -2616,46 +2835,57 @@ void recover2PC(txn_info * txn) case TXN_STATUS_ABORTED: txn->op = ABORT; - /* check whether all nodes can rollback prepared */ - for (i = 0; i < check_times; i++) + + /* check whether the 2pc start xid is 0 */ + if (txn->startxid == 0 && IsXidImplicit(txn->gid)) { - check_ok = true; + elog(WARNING, "Rollback 2PC '%s' start xid is 0", txn->gid); + } + + /* check whether the 2pc start xid is still running on start node */ + if (is_txn_start_xid_running(txn)) + { + elog(WARNING, "Rollback 2PC '%s' start xid %d is running", + txn->gid, txn->startxid); + txn->op_issuccess = false; + return; + } + + /* check whether the 2pc is still running on participants */ + is_running = false; current_context = CurrentMemoryContext; PG_TRY(); { if (!clean_2PC_iscommit(txn, false, true)) { - check_ok = false; - elog(LOG, "check rollback 2PC transaction %s failed", - txn->gid); + is_running = true; + elog(WARNING, "Rollback 2PC '%s' check failed", txn->gid); } } PG_CATCH(); { - check_ok = false; + is_running = true; (void)MemoryContextSwitchTo(current_context); edata = CopyErrorData(); FlushErrorState(); - elog(WARNING, "check rollback 2PC transaction %s error: %s", + elog(WARNING, "Rollback 2PC '%s' is running, error: %s", txn->gid, edata->message); } PG_END_TRY(); - if (!check_ok) + /* 2pc is still running, do not try to clean */ + if (is_running) { txn->op_issuccess = false; return; } - pg_usleep(check_interval); - } - /* send rollback prepared to all nodes */ if (!clean_2PC_iscommit(txn, false, false)) { txn->op_issuccess = false; - elog(LOG, "rollback 2PC transaction %s failed", txn->gid); + elog(WARNING, "Rollback 2PC '%s' failed", txn->gid); return; } txn->op_issuccess = true; @@ -2685,7 +2915,6 @@ TXN_STATUS check_txn_global_status(txn_info *txn) #define TXN_INPROGRESS 0X0020 int ii; int check_flag = 0; - int node_idx = 0; TimestampTz prepared_time = 0; TimestampTz time_gap = clean_time_interval; @@ -2770,43 +2999,124 @@ TXN_STATUS check_txn_global_status(txn_info *txn) return TXN_STATUS_INPROGRESS; } #endif - if (clear_2pc_belong_node) + + /* start xid is 0, maybe at the beginning of the 2pc */ + if (txn->startxid == 0) { - if (!check_2pc_belong_node(txn)) + /* prepare timestamp must be invalid */ + if (GlobalTimestampIsValid(txn->global_prepare_timestamp)) { - return TXN_STATUS_INPROGRESS; + elog(PANIC, "gid: %s, start xid is 0, global_prepare_timestamp: %ld", + txn->gid, txn->global_prepare_timestamp); } - if (!check_2pc_start_from_node(txn)) + elog(DEBUG2, "2PC '%s' start xid is 0", txn->gid); + + if (check_flag & TXN_INPROGRESS + || current_time - prepared_time <= time_gap) { + /* inprogress or less than time gap, do not clean it */ + elog(LOG, "2PC '%s' start xid is 0, inprogress, " + "current_time: %ld, prepared_time: %ld, " + "time_gap: %ld, time_diff: %ld", + txn->gid, current_time, 
prepared_time, + time_gap, current_time - prepared_time); + return TXN_STATUS_INPROGRESS; } + else + { + /* otherwise, abort it */ + elog(WARNING, "2PC '%s' start xid is 0, " + "current_time: %ld, prepared_time: %ld, " + "time_gap: %ld, time_diff: %ld", + txn->gid, current_time, prepared_time, + time_gap, current_time - prepared_time); + + return TXN_STATUS_ABORTED; + } + } - node_idx = find_node_index(abnormal_nodeoid); - if (node_idx >= 0) + /* use for upgrade from old version, no prepare timestamp in old version */ + if (!GlobalTimestampIsValid(txn->global_prepare_timestamp)) { - if (abnormal_time < txn->prepare_timestamp[node_idx]) + elog(WARNING, "gid: %s, start xid is %d, global_prepare_timestamp " + "is invalid", txn->gid, txn->startxid); + + if (check_flag & TXN_INPROGRESS + || current_time - prepared_time <= time_gap) { - elog(WARNING, "gid: %s, abnormal time: " INT64_FORMAT - ", prepare timestamp[%d]: " INT64_FORMAT, txn->gid, - abnormal_time, node_idx, txn->prepare_timestamp[node_idx]); + /* inprogress or less than time gap, do not clean it */ + elog(WARNING, "gid: %s, start xid is %d, inprogress, " + "current_time: %ld, prepared_time: %ld, " + "time_gap: %ld, time_diff: %ld", + txn->gid, txn->startxid, current_time, prepared_time, + time_gap, current_time - prepared_time); return TXN_STATUS_INPROGRESS; } + else + { + /* otherwise, set prepare timestamp */ + if (clear_2pc_belong_node) + { + txn->global_prepare_timestamp = abnormal_gts; } else { - elog(WARNING, "gid: %s, node_idx: %d", txn->gid, node_idx); + txn->global_prepare_timestamp = current_gts - time_gap; + } + + elog(WARNING, "gid: %s, start xid is %d, " + "current_time: %ld, prepared_time: %ld, " + "time_gap: %ld, time_diff: %ld, " + "set global_prepare_timestamp: %ld", + txn->gid, txn->startxid, current_time, prepared_time, + time_gap, current_time - prepared_time, + txn->global_prepare_timestamp); + } + } + + if (clear_2pc_belong_node) + { + if (!check_2pc_belong_node(txn)) + { + return TXN_STATUS_INPROGRESS; + } + + if (!check_2pc_start_from_node(txn)) + { + return TXN_STATUS_INPROGRESS; + } + + /* abnormal gts must be valid */ + if (!GlobalTimestampIsValid(abnormal_gts)) + { + elog(PANIC, "gid: %s, abnormal_gts is invalid gts", txn->gid); } - if (abnormal_time < prepared_time) + /* abnormal gts less than prepare gts, do not clean it */ + if (abnormal_gts < txn->global_prepare_timestamp) { - elog(WARNING, "gid: %s, abnormal time: " INT64_FORMAT - ", prepared time: " INT64_FORMAT, txn->gid, - abnormal_time, prepared_time); + elog(LOG, "gid: %s, abnormal gts: " INT64_FORMAT + ", prepare gts: " INT64_FORMAT, txn->gid, + abnormal_gts, txn->global_prepare_timestamp); return TXN_STATUS_INPROGRESS; } + + if (GlobalTimestampIsValid(txn->global_commit_timestamp)) + { + /* abnormal gts less than commit gts, do not clean it */ + if (abnormal_gts < txn->global_commit_timestamp) + { + elog(LOG, "gid: %s, abnormal gts: " INT64_FORMAT + ", commit gts: " INT64_FORMAT, txn->gid, + abnormal_gts, txn->global_commit_timestamp); + + return TXN_STATUS_INPROGRESS; + } + } } else { @@ -2815,8 +3125,36 @@ TXN_STATUS check_txn_global_status(txn_info *txn) /* transaction inprogress */ return TXN_STATUS_INPROGRESS; } + + /* current gts must be valid */ + if (!GlobalTimestampIsValid(current_gts)) + { + elog(PANIC, "gid: %s, current_gts is invalid gts", txn->gid); + } + + /* 2pc prepare gts gap less than time gap, do not clean it */ + if (current_gts - txn->global_prepare_timestamp < time_gap) + { + elog(LOG, "gid: %s, current gts: " INT64_FORMAT 
+ ", prepare gts: " INT64_FORMAT ", time gap: " INT64_FORMAT, + txn->gid, current_gts, txn->global_prepare_timestamp, time_gap); + + return TXN_STATUS_INPROGRESS; } + if (GlobalTimestampIsValid(txn->global_commit_timestamp)) + { + /* 2pc commit gts gap less than time gap, do not clean it */ + if (current_gts - txn->global_commit_timestamp <= time_gap) + { + elog(LOG, "gid: %s, current gts: " INT64_FORMAT + ", commit gts: " INT64_FORMAT ", time gap: " INT64_FORMAT, + txn->gid, current_gts, txn->global_commit_timestamp, time_gap); + + return TXN_STATUS_INPROGRESS; + } + } + } if (!IsXidImplicit(txn->gid) && txn->after_first_phase && (TXN_PREPARED == check_flag)) { @@ -2837,6 +3175,21 @@ TXN_STATUS check_txn_global_status(txn_info *txn) if (check_flag & TXN_COMMITTED) /* Some 2PC transactions are committed. Need to commit others. */ return TXN_STATUS_COMMITTED; + + /* If 2PC commit gts is valid, must commit it. */ + if (GlobalTimestampIsValid(txn->global_commit_timestamp)) + { + elog(LOG, "'%s' global_commit_timestamp: %ld", + txn->gid, txn->global_commit_timestamp); + + if (!(check_flag & TXN_PREPARED)) + { + elog(PANIC, "gid: %s, check_flag: %d", txn->gid, check_flag); + } + + return TXN_STATUS_COMMITTED; + } + /* All the transactions remain prepared. No need to recover. */ return TXN_STATUS_ABORTED; } @@ -2901,6 +3254,11 @@ bool clean_2PC_iscommit(txn_info *txn, bool is_commit, bool is_check) { node_oid = pgxc_handles->datanode_handles[ii]->nodeoid; node_idx = find_node_index(node_oid); + if (node_idx < 0 || node_idx >= cn_nodes_num + dn_nodes_num) + { + elog(PANIC, "gid: %s, node_idx(%d) is invalid", txn->gid, node_idx); + } + if (TXN_STATUS_PREPARED != txn->txn_stat[ node_idx]) { continue; @@ -2934,6 +3292,11 @@ bool clean_2PC_iscommit(txn_info *txn, bool is_commit, bool is_check) { node_oid = pgxc_handles->coord_handles[ii]->nodeoid; node_idx = find_node_index(node_oid); + if (node_idx < 0 || node_idx >= cn_nodes_num + dn_nodes_num) + { + elog(PANIC, "gid: %s, node_idx(%d) is invalid", txn->gid, node_idx); + } + if (TXN_STATUS_PREPARED != txn->txn_stat[ node_idx]) { continue; @@ -2961,7 +3324,6 @@ bool clean_2PC_iscommit(txn_info *txn, bool is_commit, bool is_check) } #endif } - } /* receive response */ @@ -3000,10 +3362,14 @@ bool clean_2PC_iscommit(txn_info *txn, bool is_commit, bool is_check) if (txn->origcoord != InvalidOid) { node_idx = find_node_index(txn->origcoord); + if (node_idx < 0 || node_idx >= cn_nodes_num + dn_nodes_num) + { + elog(PANIC, "gid: %s, node_idx(%d) is invalid", txn->gid, node_idx); + } + if (txn->coordparts[node_idx] == 1) { /*send global timestamp to dn_node_list[ii]*/ - if (txn->txn_stat[node_idx] == TXN_STATUS_PREPARED) { get_node_handles(&pgxc_handles, txn->origcoord); @@ -3072,7 +3438,8 @@ bool clean_2PC_files(txn_info * txn) } else { - elog(LOG, "pg_clean: failed clean 2pc file of transaction %s on node %s", txn->gid, get_pgxc_nodename(dn_node_list[ii])); + elog(LOG, "pg_clean: failed clean 2pc file of transaction %s on node %s", + txn->gid, get_pgxc_nodename(dn_node_list[ii])); issuccess = false; } DropTupleTableSlots(&result); @@ -3086,14 +3453,15 @@ bool clean_2PC_files(txn_info * txn) { if (TTSgetvalue(&result, 0, 0) == false) { - elog(LOG, "Error:delete 2PC file failed of transaction %s on node %s", + elog(LOG, "pg_clean: delete 2PC file failed of transaction %s on node %s", txn->gid, get_pgxc_nodename(txn->coordparts[ii])); issuccess = false; } } else { - elog(LOG, "pg_clean: failed clean 2pc file of transaction %s on node %s", txn->gid, 
get_pgxc_nodename(cn_node_list[ii])); + elog(LOG, "pg_clean: failed clean 2pc file of transaction %s on node %s", + txn->gid, get_pgxc_nodename(cn_node_list[ii])); issuccess = false; } DropTupleTableSlots(&result); @@ -3378,12 +3746,14 @@ void get_node_handles(PGXCNodeAllHandles **pgxc_handles, Oid nodeoid) *pgxc_handles = get_handles(nodelist, coordlist, false, true, true); } - bool check_2pc_start_from_node(txn_info *txn) { char node_type; - Assert(InvalidOid != abnormal_nodeoid); + if (InvalidOid == abnormal_nodeoid) + { + elog(PANIC, "gid: %s, abnormal_nodeoid is invalid", txn->gid); + } if (abnormal_nodeoid == txn->origcoord) { @@ -3398,51 +3768,239 @@ bool check_2pc_start_from_node(txn_info *txn) if (InvalidOid == txn->origcoord) { - char *startnode = NULL; int node_oid = InvalidOid; - char gid[MAX_GID]; if (!IsXidImplicit(txn->gid)) { return true; } - Assert(IsXidImplicit(txn->gid)); - - /* get start node from gid */ - strcpy(gid, txn->gid); - startnode = strtok(gid, ":"); - if (NULL == startnode) + /* Get start node oid from gid */ + node_oid = get_start_node_oid_from_gid(txn->gid); + if (InvalidOid == node_oid) { - elog(WARNING, "get startnode(%s) from gid(%s) failed", - startnode, gid); + elog(WARNING, "Get invalid start node oid from gid(%s)", txn->gid); return false; } - startnode = strtok(NULL, ":"); - if (NULL == startnode) + elog(DEBUG1, "Get start node oid(%d) from gid(%s)", node_oid, txn->gid); + + if (abnormal_nodeoid == node_oid) { - elog(WARNING, "get startnode(%s) from gid(%s) failed", - startnode, gid); + return true; + } + } + return false; } - node_oid = get_pgxc_nodeoid(startnode); - if (NULL == startnode) +/* + * get_start_node_from_gid + * Get start node name from gid + * gid: 2pc gid + */ +char *get_start_node_from_gid(char *gid) { - elog(WARNING, "get invalid oid for startnode(%s) from gid(%s)", - startnode, gid); - return false; + char *str_start_node = NULL; + + if (!IsXidImplicit(gid)) + { + elog(WARNING, "2PC '%s' is not implicit", gid); + return NULL; + } + + /* Get start node name from gid */ + str_start_node = strtok(gid, ":"); + if (str_start_node == NULL) + { + elog(WARNING, "Get start node from gid(%s) failed", gid); + return NULL; } - elog(DEBUG1, "get oid(%d) for startnode(%s) from gid(%s)", - node_oid, startnode, gid); + str_start_node = strtok(NULL, ":"); + if (str_start_node == NULL) + { + elog(WARNING, "Get start node from gid(%s) failed", gid); + return NULL; + } - if (abnormal_nodeoid == node_oid) + return str_start_node; +} + +/* + * get_start_node_oid_from_gid + * Get start node oid from gid + * gid: 2pc gid + */ +Oid get_start_node_oid_from_gid(char *gid) +{ + Oid start_node_oid = 0; + char *str_start_node = NULL; + char gid_buf[MAX_GID]; + + /* Get start node oid from gid */ + strcpy(gid_buf, gid); + str_start_node = get_start_node_from_gid(gid_buf); + if (str_start_node == NULL) + { + elog(WARNING, "Get start node from gid(%s) failed", gid); + return 0; + } + + elog(LOG, "Get start node(%s) from gid(%s)", str_start_node, gid); + + start_node_oid = get_pgxc_nodeoid(str_start_node); + if (start_node_oid == InvalidOid) + { + elog(WARNING, "Get invalid oid for start node(%s) from gid(%s)", + str_start_node, gid); + return 0; + } + + return start_node_oid; +} + +/* + * get_start_xid_from_gid + * Get start xid from gid + * gid: 2pc gid + */ +uint32 get_start_xid_from_gid(char *gid) +{ + uint32 start_xid = 0; + char *str_start_xid = NULL; + char gid_buf[MAX_GID]; + + if (!IsXidImplicit(gid)) + { + elog(WARNING, "2PC '%s' is not implicit", gid); 
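+        /* only implicit gids embed a start xid after XIDPREFIX, so there is nothing to parse */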
+ return 0; + } + + /* Get start xid from gid */ + strcpy(gid_buf, gid); + str_start_xid = gid_buf + strlen(XIDPREFIX); + str_start_xid = strtok(str_start_xid, ":"); + start_xid = strtoul(str_start_xid, NULL, 10); + if (start_xid == 0) { + elog(WARNING, "Get start xid from gid(%s) failed", gid); + return 0; + } + + return start_xid; +} + +/* + * is_xid_running_on_node + * Whether the transaction with the xid is still running on the node + * xid: transaction id + * node_oid: node oid + */ +bool is_xid_running_on_node(uint32 xid, Oid node_oid) +{ + bool is_running = true; + + Datum execute_res; + TupleTableSlots result; + char command[MAX_CMD_LENGTH]; + + if (xid == 0 || node_oid == InvalidOid) + { + elog(PANIC, "2PC xid: %d, node oid: %d", xid, node_oid); return true; } + + snprintf(command, MAX_CMD_LENGTH, "select pid::text, backend_xid::text " + "from pg_catalog.pg_stat_activity where backend_xid=%d", xid); + + execute_res = execute_query_on_single_node(node_oid, command, 2, &result); + if (execute_res == (Datum) 1) + { + if (result.slot_count == 0) + { + is_running = false; + } + else + { + is_running = true; + + if (result.slot_count != 1) + { + elog(PANIC, "Get %d resules for xid: %d", result.slot_count, xid); + } + } + } + else + { + elog(WARNING, "pg_clean: Faile to query xid %d on node %s", + xid, get_pgxc_nodename(node_oid)); + is_running = true; + } + DropTupleTableSlots(&result); + + return is_running; } +/* + * is_gid_start_xid_running + * Whether the transaction with the start xid is still running on start node + * gid: 2pc gid + */ +bool is_gid_start_xid_running(char *gid) +{ + uint32 start_xid = 0; + Oid start_node_oid = InvalidOid; + + if (!IsXidImplicit(gid)) + { + elog(LOG, "Explicit 2PC '%s'", gid); + return true; + } + + /* Get start xid from gid */ + start_xid = get_start_xid_from_gid(gid); + if (start_xid == 0) + { + elog(ERROR, "Get start xid from gid(%s) failed", gid); + return true; + } + + elog(LOG, "Get start xid(%d) from gid(%s)", start_xid, gid); + + /* Get start node oid from gid */ + start_node_oid = get_start_node_oid_from_gid(gid); + if (start_node_oid == InvalidOid) + { + elog(WARNING, "Get invalid start node oid from gid(%s)", gid); return false; } + + elog(LOG, "Get start node oid(%d) from gid(%s)", start_node_oid, gid); + + return is_xid_running_on_node(start_xid, start_node_oid); +} + +/* + * is_txn_start_xid_running + * Whether the transaction with the start xid is still running on start node + * txn: 2pc transaction info + */ +bool is_txn_start_xid_running(txn_info *txn) +{ + if (txn->startxid != 0) + { + Assert(txn->origcoord != InvalidOid); + return is_xid_running_on_node(txn->startxid, txn->origcoord); + } + + Assert(txn->origcoord == InvalidOid); + + if (!IsXidImplicit(txn->gid)) + { + elog(LOG, "Explicit 2PC '%s' start xid is %d", txn->gid, txn->startxid); + return false; + } + + return is_gid_start_xid_running(txn->gid); +} diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c index e78f9c53..ea188961 100644 --- a/src/backend/access/transam/twophase.c +++ b/src/backend/access/transam/twophase.c @@ -2392,12 +2392,11 @@ CheckPointTwoPhase(XLogRecPtr redo_horizon) if (!save_and_remove_2pc_info(gxact->gid)) { - elog(LOG, "[%s] %s save to file failed", - __FUNCTION__, gxact->gid); + elog(DEBUG1, "checkpoint: %s save to file failed", gxact->gid); } else { - elog(LOG, "[%s] %s is saved to file", __FUNCTION__, gxact->gid); + elog(LOG, "checkpoint: %s is saved to file", gxact->gid); } } #endif @@ -3741,10 +3740,12 @@ 
void record_2pc_involved_nodes_xid(const char * tid, File fd = 0; int ret = 0; int size = 0; + int pg_clean_check_size = 0; StringInfoData content; struct stat fst; char path[MAXPGPATH]; char *result = NULL; + GlobalTimestamp prepare_gts = InvalidGlobalTimestamp; #ifdef __TWO_PHASE_TESTS__ XLogRecPtr xlogrec = 0; @@ -3755,6 +3756,18 @@ void record_2pc_involved_nodes_xid(const char * tid, return; } + prepare_gts = GetGlobalPrepareTimestamp(); + if (!GlobalTimestampIsValid(prepare_gts)) + { + elog(WARNING, "prepare gts is invalid"); + prepare_gts = GetGlobalTimestampGTM(); + if (!GlobalTimestampIsValid(prepare_gts)) + { + elog(ERROR, "get gts for prepare is invalid"); + } + SetGlobalPrepareTimestamp(prepare_gts); + } + if (enable_distri_print || enable_2pc_entry_trace) { elog(LOG, "[%s] record %s, startnode: %s, participants: %s", @@ -3780,6 +3793,10 @@ void record_2pc_involved_nodes_xid(const char * tid, appendStringInfo(&content, "startxid:%u\n", startxid); appendStringInfo(&content, "nodes:%s\n", nodestring); appendStringInfo(&content, "xid:%u\n", xid); + pg_clean_check_size = content.len; + Assert(pg_clean_check_size == strlen(content.data)); + + appendStringInfo(&content, "global_prepare_timestamp:%ld\n", prepare_gts); size = content.len; Assert(size == strlen(content.data)); @@ -3798,11 +3815,10 @@ void record_2pc_involved_nodes_xid(const char * tid, Assert(strlen(info) < MAX_2PC_INFO_SIZE); check_2pc_file(tid, info, __FUNCTION__); - if (strncmp(info, content.data, size) != 0) + if (pg_strncasecmp(info, content.data, pg_clean_check_size) != 0) { - elog(ERROR, "[%s] pg_clean attemp to write %s info conflict, " - "content: %s, info: %s", __FUNCTION__, tid, - content.data, info); + elog(ERROR, "pg_clean attemp to write %s info conflict, " + "content: %s, info: %s", tid, content.data, info); } resetStringInfo(&content); @@ -3836,11 +3852,10 @@ void record_2pc_involved_nodes_xid(const char * tid, Assert(NULL != result); - if (strncmp(result, content.data, size) != 0) + if (pg_strncasecmp(result, content.data, pg_clean_check_size) != 0) { - elog(ERROR, "[%s] pg_clean attemp to write %s info conflict, " - "content: %s, info: %s", - __FUNCTION__, tid, content.data, result); + elog(ERROR, "pg_clean attemp to write %s info conflict, " + "content: %s, info: %s", tid, content.data, result); } pfree(result); @@ -3853,12 +3868,16 @@ void record_2pc_involved_nodes_xid(const char * tid, if (!RecoveryInProgress()) { + char *fmt_v2 = XLOG_FMT_2PC_V2; XLogBeginInsert(); XLogRegisterData((char *)tid, strlen(tid) + 1); + XLogRegisterData((char *)fmt_v2, strlen(fmt_v2) + 1); XLogRegisterData((char *)startnode, strlen(startnode) + 1); - XLogRegisterData((char *)&startxid, sizeof(GlobalTransactionId) + 1); + XLogRegisterData((char *)&startxid, sizeof(GlobalTransactionId)); XLogRegisterData((char *)nodestring, strlen(nodestring) + 1); - XLogRegisterData((char *)&xid, sizeof(GlobalTransactionId) + 1); + XLogRegisterData((char *)&xid, sizeof(GlobalTransactionId)); + XLogRegisterData((char *)&prepare_gts, sizeof(GlobalTimestamp)); + #ifdef __TWO_PHASE_TESTS__ xlogrec = #endif @@ -3973,7 +3992,7 @@ void record_2pc_commit_timestamp(const char *tid, GlobalTimestamp commit_timesta { XLogBeginInsert(); XLogRegisterData((char *)tid, strlen(tid) + 1); - XLogRegisterData((char *)&commit_timestamp, sizeof(GlobalTimestamp) + 1); + XLogRegisterData((char *)&commit_timestamp, sizeof(GlobalTimestamp)); xlogrec = XLogInsert(RM_XLOG_ID, XLOG_RECORD_2PC_TIMESTAMP); /* only start node need to flush and sync 
XLOG_RECORD_2PC_TIMESTAMP */ if (IS_PGXC_LOCAL_COORDINATOR) @@ -4178,7 +4197,7 @@ void rename_2pc_records(const char *tid, TimestampTz timestamp) XLogBeginInsert(); XLogRegisterData((char *)tid, strlen(tid) + 1); XLogRegisterData((char *)type, strlen(type) + 1); - XLogRegisterData((char *)×tamp, sizeof(TimestampTz) + 1); + XLogRegisterData((char *)×tamp, sizeof(TimestampTz)); XLogInsert(RM_XLOG_ID, XLOG_CLEAN_2PC_FILE); } @@ -4388,7 +4407,7 @@ char *get_2pc_list_from_cache(int *count) { recordList = (char *) repalloc(recordList, strlen(entry->key) + strlen(recordList) + 2); - sprintf(recordList, "%s,%s", recordList, entry->key); + sprintf(recordList + strlen(recordList), ",%s", entry->key); } if (++(*count) >= MAX_OUTPUT_FILE) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 7044cd8b..99cc62f3 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -10177,10 +10177,10 @@ xlog_redo(XLogReaderState *record) TimestampTz timestamp = 0; gid = XLogRecGetData(record); type = gid + strlen(gid) + 1; - pos = type + strlen(type) + 1; - memcpy(×tamp, pos, sizeof(TimestampTz)); if (0 == strcmp(type, "rename")) { + pos = type + strlen(type) + 1; + memcpy(×tamp, pos, sizeof(TimestampTz)); rename_2pc_records(gid, timestamp); } else @@ -10192,11 +10192,13 @@ xlog_redo(XLogReaderState *record) { TransactionId xid; TransactionId startxid; + GlobalTimestamp prepare_gts = InvalidGlobalTimestamp; + char *fmt_v2 = XLOG_FMT_2PC_V2; char *gid; char *startnode; char *nodestring; char *pos; - char *temp; + char *type; #ifdef __TWO_PHASE_TESTS__ TransactionId old_shem_nextxid = ShmemVariableCache->nextXid; #endif @@ -10204,27 +10206,48 @@ xlog_redo(XLogReaderState *record) gid = XLogRecGetData(record); pos = gid + strlen(gid) +1; /* if the transaction is readonly */ - temp = pos; - pos = pos + strlen(temp) +1; + type = pos; + pos = pos + strlen(type) + 1; - if (0 != strcmp(temp, "readonly")) + if (0 != strcmp(type, "readonly")) { - startnode = temp; + if (0 == strcmp(type, fmt_v2)) + { + startnode = pos; + pos = pos + strlen(startnode) + 1; + memcpy(&startxid, pos, sizeof(TransactionId)); + pos = pos + sizeof(TransactionId); + nodestring = pos; + pos = pos + strlen(nodestring) + 1; + memcpy(&xid, pos, sizeof(TransactionId)); + pos = pos + sizeof(TransactionId); + memcpy(&prepare_gts, pos, sizeof(GlobalTimestamp)); + pos = pos + sizeof(GlobalTimestamp); + } + else + { + /* compatible with old format */ + startnode = type; memcpy(&startxid, pos, sizeof(TransactionId)); pos = pos + sizeof(TransactionId) + 1; nodestring = pos; pos = pos + strlen(nodestring) + 1; memcpy(&xid, pos, sizeof(TransactionId)); + pos = pos + sizeof(TransactionId) + 1; + } + if (enable_distri_print) { elog(LOG, "xlog redo 2pc file name: '%s', startnode: %s, " - "startxid: %u, nodestring: %s, xid: %u", - gid, startnode, startxid, nodestring, xid); + "startxid: %u, prepare_gts: %ld, nodestring: %s, xid: %u", + gid, startnode, startxid, prepare_gts, nodestring, xid); } + #ifdef __TWO_PHASE_TESTS__ if (FILE_XLOG_EXISTED == twophase_exception_case) { elog(LOG, "FILE_XLOG_EXISTED complish"); + SetGlobalPrepareTimestamp(prepare_gts); record_2pc_involved_nodes_xid(gid, startnode, startxid, nodestring, xid); } #endif @@ -10248,6 +10271,7 @@ xlog_redo(XLogReaderState *record) LWLockRelease(XidGenLock); } + SetGlobalPrepareTimestamp(prepare_gts); record_2pc_involved_nodes_xid(gid, startnode, startxid, nodestring, xid); } else diff --git a/src/backend/pgxc/pool/execRemote.c 
b/src/backend/pgxc/pool/execRemote.c index 71e4c53b..2cedb37c 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -3974,11 +3974,10 @@ pgxc_node_remote_prepare(char *prepareGID, bool localNode, bool implicit) #endif #ifdef __SUPPORT_DISTRIBUTED_TRANSACTION__ - if(implicit) - { if(enable_distri_print) { - elog(LOG, "prepare remote transaction xid %d gid %s", GetTopTransactionIdIfAny(), prepareGID); + elog(LOG, "prepare remote transaction xid %d gid %s", + GetTopTransactionIdIfAny(), prepareGID); } global_prepare_ts = GetGlobalTimestampGTM(); @@ -3988,17 +3987,19 @@ pgxc_node_remote_prepare(char *prepareGID, bool localNode, bool implicit) global_prepare_ts = 0; } #endif - if(!GlobalTimestampIsValid(global_prepare_ts)){ + + if (!GlobalTimestampIsValid(global_prepare_ts)) + { ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), errmsg("failed to get global timestamp for PREPARED command"))); } if(enable_distri_print) { - elog(LOG, "prepare phase get global prepare timestamp gid %s, time " INT64_FORMAT, prepareGID, global_prepare_ts); + elog(LOG, "prepare phase get global prepare timestamp gid %s, time " + INT64_FORMAT, prepareGID, global_prepare_ts); } SetGlobalPrepareTimestamp(global_prepare_ts); - } #endif #ifdef __TWO_PHASE_TRANS__ @@ -4093,19 +4094,18 @@ pgxc_node_remote_prepare(char *prepareGID, bool localNode, bool implicit) { #ifdef __SUPPORT_DISTRIBUTED_TRANSACTION__ - if(implicit) - { if(enable_distri_print) { - elog(LOG, "send prepare timestamp for xid %d gid %s prepare ts " INT64_FORMAT,GetTopTransactionIdIfAny(), + elog(LOG, "send prepare timestamp for xid %d gid %s prepare ts " + INT64_FORMAT, GetTopTransactionIdIfAny(), prepareGID, global_prepare_ts); } if (pgxc_node_send_prepare_timestamp(conn, global_prepare_ts)) { ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("failed to send global prepare committs for PREPARED command"))); - } + errmsg("failed to send global prepare committs for " + "PREPARED command"))); } #endif /* Send down prepare command */ @@ -4139,11 +4139,10 @@ pgxc_node_remote_prepare(char *prepareGID, bool localNode, bool implicit) #endif #ifdef __SUPPORT_DISTRIBUTED_TRANSACTION__ - if(implicit) - { if(enable_distri_print) { - elog(LOG, "send prepare timestamp for xid %d gid %s prepare ts " INT64_FORMAT,GetTopTransactionIdIfAny(), + elog(LOG, "send prepare timestamp for xid %d gid %s prepare ts " + INT64_FORMAT, GetTopTransactionIdIfAny(), prepareGID, global_prepare_ts); } if (pgxc_node_send_prepare_timestamp(conn, global_prepare_ts)) @@ -4157,8 +4156,8 @@ pgxc_node_remote_prepare(char *prepareGID, bool localNode, bool implicit) #endif ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("failed to send global prepare committs for PREPARED command"))); - } + errmsg("failed to send global prepare committs for " + "PREPARED command"))); } #endif @@ -4297,19 +4296,18 @@ pgxc_node_remote_prepare(char *prepareGID, bool localNode, bool implicit) if (conn->read_only) { #ifdef __SUPPORT_DISTRIBUTED_TRANSACTION__ - if(implicit) - { if(enable_distri_print) { - elog(LOG, "send prepare timestamp for xid %d gid %s prepare ts " INT64_FORMAT,GetTopTransactionIdIfAny(), + elog(LOG, "send prepare timestamp for xid %d gid %s prepare ts " + INT64_FORMAT,GetTopTransactionIdIfAny(), prepareGID, global_prepare_ts); } if (pgxc_node_send_prepare_timestamp(conn, global_prepare_ts)) { ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("failed to send global prepare committs for PREPARED command"))); - } + errmsg("failed to send 
global prepare committs for " + "PREPARED command"))); } #endif /* Send down prepare command */ @@ -4340,11 +4338,10 @@ pgxc_node_remote_prepare(char *prepareGID, bool localNode, bool implicit) #endif #ifdef __SUPPORT_DISTRIBUTED_TRANSACTION__ - if(implicit) - { if(enable_distri_print) { - elog(LOG, "send prepare timestamp for xid %d gid %s prepare ts " INT64_FORMAT,GetTopTransactionIdIfAny(), + elog(LOG, "send prepare timestamp for xid %d gid %s prepare ts " + INT64_FORMAT,GetTopTransactionIdIfAny(), prepareGID, global_prepare_ts); } if (pgxc_node_send_prepare_timestamp(conn, global_prepare_ts)) @@ -4358,8 +4355,8 @@ pgxc_node_remote_prepare(char *prepareGID, bool localNode, bool implicit) #endif ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("failed to send global prepare committs for PREPARED command"))); - } + errmsg("failed to send global prepare committs for " + "PREPARED command"))); } #endif diff --git a/src/backend/postmaster/clean2pc.c b/src/backend/postmaster/clean2pc.c index def81c95..c1e3a31f 100644 --- a/src/backend/postmaster/clean2pc.c +++ b/src/backend/postmaster/clean2pc.c @@ -17,6 +17,7 @@ #include "postgres.h" #include "access/htup_details.h" +#include "catalog/namespace.h" #include "catalog/pg_database.h" #include "catalog/pg_type.h" #include "commands/dbcommands.h" @@ -58,7 +59,7 @@ typedef enum bool enable_clean_2pc_launcher = true; int auto_clean_2pc_interval = 60; -int auto_clean_2pc_delay = 300; +int auto_clean_2pc_delay = 60; int auto_clean_2pc_timeout = 1200; int auto_clean_2pc_max_check_time = 1200; @@ -88,6 +89,8 @@ static void start_clean_worker(int count); static void do_query_2pc(TimestampTz clean_time); static void do_clean_2pc(TimestampTz clean_time); +static bool check_pg_clean_extension(void); + static void clean_2pc_sigterm_handler(SIGNAL_ARGS); static void clean_2pc_sighup_handler(SIGNAL_ARGS); static void clean_2pc_sigusr2_handler(SIGNAL_ARGS); @@ -432,6 +435,12 @@ do_query_2pc(TimestampTz clean_time) Assert(result_str != NULL); resetStringInfo(result_str); + if (!check_pg_clean_extension()) + { + elog(WARNING, "create extension pg_clean please"); + return; + } + check_time = (curr_time - clean_time)/USECS_PER_SEC; if (check_time < 0) @@ -686,6 +695,40 @@ do_clean_2pc(TimestampTz clean_time) } } +/* + * check if pg_clean_check_txn funciton exist + */ +static bool +check_pg_clean_extension(void) +{ + bool res = false; + List *names = NULL; + FuncCandidateList clist = NULL; + char *fuc_name = "pg_clean_check_txn"; + + StartTransactionCommand(); + + /* + * Parse the name into components and see if it matches any pg_proc + * entries in the current search path. 
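+	 * A single unambiguous candidate means the extension that provides
+	 * pg_clean_check_txn is installed; zero or multiple candidates are
+	 * treated as the extension being unavailable.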
+ */ + names = list_make1(makeString(fuc_name)); + clist = FuncnameGetCandidates(names, -1, NIL, false, false, true); + + if (clist == NULL || clist->next != NULL) + { + res = false; + } + else + { + res = true; + } + + CommitTransactionCommand(); + + return res; +} + /* SIGTERM: set flag to exit normally */ static void clean_2pc_sigterm_handler(SIGNAL_ARGS) diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index dbccb8f6..dc1d39ed 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -4887,7 +4887,11 @@ static struct config_int ConfigureNamesInt[] = GUC_UNIT_S }, &auto_clean_2pc_interval, - 60, 10, INT_MAX, +#ifdef __TWO_PHASE_TESTS__ + 60, 0, INT_MAX, +#else + 60, 30, INT_MAX, +#endif NULL, NULL, NULL }, @@ -4898,7 +4902,11 @@ static struct config_int ConfigureNamesInt[] = GUC_UNIT_S }, &auto_clean_2pc_delay, - 300, 3, INT_MAX, +#ifdef __TWO_PHASE_TESTS__ + 60, 0, INT_MAX, +#else + 60, 30, INT_MAX, +#endif NULL, NULL, NULL }, @@ -4909,7 +4917,11 @@ static struct config_int ConfigureNamesInt[] = GUC_UNIT_S }, &auto_clean_2pc_timeout, +#ifdef __TWO_PHASE_TESTS__ 1200, 0, INT_MAX, +#else + 1200, 30, INT_MAX, +#endif NULL, NULL, NULL }, @@ -4920,7 +4932,11 @@ static struct config_int ConfigureNamesInt[] = GUC_UNIT_S }, &auto_clean_2pc_max_check_time, +#ifdef __TWO_PHASE_TESTS__ 1200, 0, INT_MAX, +#else + 1200, 30, INT_MAX, +#endif NULL, NULL, NULL }, diff --git a/src/include/access/twophase.h b/src/include/access/twophase.h index 06f9685e..132f19d8 100644 --- a/src/include/access/twophase.h +++ b/src/include/access/twophase.h @@ -81,6 +81,10 @@ #include "gtm/gtm_c.h" #define GIDSIZE (200 + 24) + +/* 2pc xlog v2 add prepare timestamp */ +#define XLOG_FMT_2PC_V2 "fmt_v2" + /* * GlobalTransactionData is defined in twophase.c; other places have no * business knowing the internal definition. From db0d112324d582dec1d5f1a464b3d38f7cd173d4 Mon Sep 17 00:00:00 2001 From: whalesong Date: Fri, 22 Apr 2022 14:16:30 +0800 Subject: [PATCH 544/578] support wal sender proxy on cn (merge request 1183), http://tapd.woa.com/20421696/prong/stories/view/1020421696872688189 --- src/backend/access/common/printtup.c | 2 +- src/backend/pgxc/pool/execRemote.c | 323 ++++++++++++++++++++- src/backend/pgxc/pool/pgxcnode.c | 66 +++++ src/backend/postmaster/pgstat.c | 8 + src/backend/postmaster/postmaster.c | 39 +++ src/backend/replication/walsender.c | 3 + src/backend/tcop/postgres.c | 279 ++++++++++++++++++ src/backend/utils/misc/guc.c | 8 + src/backend/utils/misc/ps_status.c | 23 ++ src/include/pgstat.h | 1 + src/include/pgxc/execRemote.h | 3 + src/include/pgxc/pgxc.h | 2 + src/include/pgxc/pgxcnode.h | 8 + src/include/postgres.h | 419 ++++++++++++++------------- src/include/replication/walsender.h | 43 +-- src/include/utils/ps_status.h | 6 +- 16 files changed, 1004 insertions(+), 229 deletions(-) diff --git a/src/backend/access/common/printtup.c b/src/backend/access/common/printtup.c index dfd64707..3c12980a 100644 --- a/src/backend/access/common/printtup.c +++ b/src/backend/access/common/printtup.c @@ -228,7 +228,7 @@ SendRowDescriptionMessage(TupleDesc typeinfo, List *targetlist, int16 *formats) * Send the type name from a Postgres-XC backend node. * This preserves from OID inconsistencies as architecture is shared nothing. 
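+	 * When the coordinator connection is only the wal sender proxy relaying
+	 * a raw datanode stream, the translation is skipped (see IsConnFromProxy).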
*/ - if (IsConnFromCoord()) + if (IsConnFromCoord() && !IsConnFromProxy()) { char *typename; typename = get_typenamespace_typename(atttypid); diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 2cedb37c..7ad4a8be 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -30,6 +30,7 @@ #include "executor/executor.h" #include "gtm/gtm_c.h" #include "libpq/libpq.h" +#include "libpq/pqformat.h" #include "miscadmin.h" #include "pgxc/execRemote.h" #include "tcop/tcopprot.h" @@ -156,6 +157,7 @@ static void pgxc_connections_cleanup(ResponseCombiner *combiner); static bool determine_param_types(Plan *plan, struct find_params_context *context); +static int handle_reply_msg_on_proxy(PGXCNodeHandle *conn); #define REMOVE_CURR_CONN(combiner) \ if ((combiner)->current_conn < --((combiner)->conn_count)) \ @@ -3026,7 +3028,17 @@ pgxc_node_receive_responses(const int conn_count, PGXCNodeHandle ** connections, while (i < count) { int32 nbytes = 0; - int result = handle_response(to_receive[i], combiner); + int result = 0; + + if (am_proxy_for_dn) + { + result = handle_response_on_proxy(to_receive[i], combiner); + } + else + { + result = handle_response(to_receive[i], combiner); + } + #ifdef __TBASE__ #ifdef _PG_REGRESS_ elog(LOG, "Received response %d on connection to node %s", @@ -13090,4 +13102,313 @@ SetSnapshot(EState *state) return result; } + +/* + * Reveive dn message on proxy. + * Forward the dn message to client and forward the client reply message to dn. + */ +int pgxc_node_receive_on_proxy(PGXCNodeHandle *handle) +{ + int result = 0; + ResponseCombiner combiner; + + struct timeval timeout; + timeout.tv_sec = 1; + timeout.tv_usec = 0; + + MemSet(&combiner, 0, sizeof(ResponseCombiner)); + + InitResponseCombiner(&combiner, 1, COMBINE_TYPE_NONE); + + /* Receive responses */ + result = pgxc_node_receive_responses(1, &handle, &timeout, &combiner); + if (result != 0) + { + elog(LOG, "Proxy receive responses result is %d", result); + return result; + } + + CloseCombiner(&combiner); + return result; +} + +/* + * Handle reply message on proxy. + * Forward the client reply message to dn. + */ +int handle_reply_msg_on_proxy(PGXCNodeHandle *conn) +{ + int ret = 0; + unsigned char firstchar; + StringInfoData msg; + + Assert(IS_PGXC_COORDINATOR); + + initStringInfo(&msg); + + for (;;) + { + pq_startmsgread(); + ret = pq_getbyte_if_available(&firstchar); + if (ret < 0) + { + /* Unexpected error or EOF */ + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("unexpected EOF on proxy for %s", proxy_for_dn))); + } + + if (ret == 0) + { + /* No data available without blocking */ + pq_endmsgread(); + break; + } + + /* Read the message contents */ + if (pq_getmessage(&msg, 0)) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("unexpected EOF on proxy for %s", proxy_for_dn))); + } + + elog(DEBUG2, "%s proxy firstchar is %c(%d), reply message length: %d", + proxy_for_dn, firstchar, firstchar, msg.len); + + ret = pgxc_node_send_on_proxy(conn, firstchar, &msg); + if (ret != 0) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("%s proxy send reply message error: %d", + proxy_for_dn, ret))); + } + + /* Handle the very limited subset of commands expected in this phase */ + switch (firstchar) + { + /* + * 'd' means a client reply message. + */ + case 'd': + break; + + /* + * 'c' means the client requested to finish streaming. 
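+             * Once the stream is marked closed, the proxy keeps draining the
+             * datanode side but stops forwarding further 'd' (copy data) messages.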
+ */ + case 'c': + elog(LOG, "%s proxy: reply message type %c(%d), " + "the client requested to finish streaming", + proxy_for_dn, firstchar, firstchar); + + /* When replicate stream is closed, set stream_closed to true */ + conn->stream_closed = true; + + break; + + /* + * 'X' means the client is closing down the socket. + */ + case 'X': + elog(LOG, "%s proxy: reply message type %c(%d), " + "the client is closing down the socket", + proxy_for_dn, firstchar, firstchar); + + proc_exit(0); + + default: + elog(FATAL, "%s proxy: unexpected message type %c(%d), length: %d", + proxy_for_dn, firstchar, firstchar, msg.len); + break; + } + } + + return ret; +} + +/* + * Read next message from the connection and update + * connection state accordingly on the proxy + * If we are in an error state we just consume the messages, and do not proxy + * Long term, we should look into cancelling executing statements + * and closing the connections. + * It returns if states need to be handled + * Return values: + * RESPONSE_EOF - need to receive more data for the connection + * RESPONSE_READY - got ReadyForQuery + * RESPONSE_COMPLETE - done with the connection, but not yet ready for query. + * Also this result is output in case of error + * RESPONSE_TUPLEDESC - got tuple description + * RESPONSE_DATAROW - got data row + */ +int handle_response_on_proxy(PGXCNodeHandle *conn, ResponseCombiner *combiner) +{ + char *msg; + int msg_len; + char msg_type; + int ret = 0; + StringInfoData buf; + + /* proxy must be cn */ + Assert(IS_PGXC_COORDINATOR); + + /* proxy must be not in extended query */ + Assert(!conn->in_extended_query); + Assert(!combiner->extended_query); + + for (;;) + { + /* + * If we are in the process of shutting down, we + * may be rolling back, and the buffer may contain other messages. + * We want to avoid a procarray exception + * as well as an error stack overflow. + */ + if (proc_exit_inprogress) + { + PGXCNodeSetConnectionState(conn, DN_CONNECTION_STATE_ERROR_FATAL); + } + + /* + * Don't read from from the connection if there is a fatal error. + * We still return RESPONSE_COMPLETE, not RESPONSE_ERROR, since + * Handling of RESPONSE_ERROR assumes sending SYNC message, but + * State DN_CONNECTION_STATE_ERROR_FATAL indicates connection is + * not usable. 
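+	 * Returning RESPONSE_COMPLETE lets the caller stop reading from this
+	 * connection without attempting to resynchronize it.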
+ */ + if (conn->state == DN_CONNECTION_STATE_ERROR_FATAL) + { + return RESPONSE_COMPLETE; + } + + ret = handle_reply_msg_on_proxy(conn); + if (ret != 0) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Handle reply message on proxy for %s error: %d", + proxy_for_dn, ret))); + } + + /* No data available, exit */ + if (!HAS_MESSAGE_BUFFERED(conn)) + return RESPONSE_EOF; + + Assert(conn->combiner == combiner || conn->combiner == NULL); + + msg_type = get_message(conn, &msg_len, &msg); + elog(DEBUG1, "handle_response_on_proxy - received message %c, node %s, " + "current_state %d", msg_type, conn->nodename, conn->state); + + /* + * Add some protection code when receiving a messy message, + * close the connection, and throw error + */ + if (msg_len < 0) + { + PGXCNodeSetConnectionState(conn, DN_CONNECTION_STATE_ERROR_FATAL); + + elog(LOG, "handle_response_on_proxy, fatal_conn=%p, " + "fatal_conn->nodename=%s, fatal_conn->sock=%d, " + "fatal_conn->read_only=%d, fatal_conn->transaction_status=%c, " + "fatal_conn->sock_fatal_occurred=%d, conn->backend_pid=%d, " + "fatal_conn->error=%s", conn, conn->nodename, conn->sock, + conn->read_only, conn->transaction_status, + conn->sock_fatal_occurred, conn->backend_pid, conn->error); + + closesocket(conn->sock); + conn->sock = NO_SOCKET; + conn->sock_fatal_occurred = true; + + elog(LOG, "Received messy message from node:%s host:%s port:%d pid:%d, " + "inBuffer:%p inSize:%lu inStart:%lu inEnd:%lu inCursor:%lu " + "msg_len:%d, This probably means the remote node terminated " + "abnormally before or while processing the request.", + conn->nodename, conn->nodehost, conn->nodeport, conn->backend_pid, + conn->inBuffer, conn->inSize, conn->inStart, conn->inEnd, + conn->inCursor, msg_len); + + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Proxy: handle_response_on_proxy - received message " + "length %d, type %c, node %s, current_state %d", + msg_len, msg_type, conn->nodename, conn->state))); + } + + if (msg_type == '\0') + { + /* Not enough data in the buffer */ + return RESPONSE_EOF; + } + + if (conn->stream_closed && msg_type == 'd') + { + /* When replicate stream is closed, skip 'd' message */ + elog(DEBUG1, "Proxy: handle_response_on_proxy - received message " + "type %c, length %d, node %s, current_state %d, remote pid %d, skip", + msg_type, msg_len, conn->nodename, conn->state, conn->backend_pid); + continue;; + } + + conn->last_command = msg_type; + + elog(DEBUG1, "Proxy: handle_response_on_proxy - received message " + "type %c, length %d, node %s, current_state %d, remote pid %d", + msg_type, msg_len, conn->nodename, conn->state, conn->backend_pid); + + /* Send message to client */ + pq_beginmessage(&buf, msg_type); + pq_sendbytes(&buf, msg, msg_len); + pq_endmessage(&buf); + pq_flush(); + + switch (msg_type) + { + case 'c': /* CopyToCommandComplete */ + break; + + case 'C': /* CommandComplete */ + conn->combiner = NULL; + PGXCNodeSetConnectionState(conn, DN_CONNECTION_STATE_IDLE); + return RESPONSE_COMPLETE; + + case 'E': /* ErrorResponse */ + HandleError(combiner, msg, msg_len, conn); + add_error_message_from_combiner(conn, combiner); + + combiner->errorNode = conn->nodename; + combiner->backend_pid = conn->backend_pid; + return RESPONSE_ERROR; + + case 'Z': /* ReadyForQuery */ + conn->transaction_status = msg[0]; + PGXCNodeSetConnectionState(conn, DN_CONNECTION_STATE_IDLE); + conn->combiner = NULL; + return RESPONSE_READY; + + case 'T': /* RowDescription */ + return RESPONSE_TUPDESC; + + case 'D': /* DataRow */ + return 
RESPONSE_DATAROW; + + case 'd': /* CopyOutDataRow */ + PGXCNodeSetConnectionState(conn, DN_CONNECTION_STATE_COPY_OUT); + break; + + case 'W': /* CopyBothResponse */ + /* Get a CopyBothResponse message when start streaming */ + break; + + default: + elog(DEBUG1, "Proxy received message type: %c", msg_type); + break; + } + } + + /* Never happen, but keep compiler quiet */ + return RESPONSE_EOF; +} + #endif diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c index c19325a9..84259600 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -2595,6 +2595,72 @@ pgxc_node_send_apply(PGXCNodeHandle * handle, char * buf, int len, bool ignore_p } #endif +/* + * Send message to dn + */ +int +pgxc_node_send_on_proxy(PGXCNodeHandle *handle, int firstchar, StringInfo inBuf) +{ + /* size + len */ + int msgLen = 4 + inBuf->len; + + /* msgType + msgLen */ + if (ensure_out_buffer_capacity(handle->outEnd + 1 + msgLen, handle) != 0) + { + add_error_message(handle, "out of memory"); + return EOF; + } + + /* msg type */ + handle->outBuffer[handle->outEnd++] = firstchar; + + /* size */ + msgLen = htonl(msgLen); + memcpy(handle->outBuffer + handle->outEnd, &msgLen, 4); + handle->outEnd += 4; + + /* msg data */ + memcpy(handle->outBuffer + handle->outEnd, inBuf->data, inBuf->len); + handle->outEnd += inBuf->len; + + PGXCNodeSetConnectionState(handle, DN_CONNECTION_STATE_QUERY); + handle->in_extended_query = false; + + return pgxc_node_flush(handle); +} + +/* + * Send proxy configuration to dn + */ +int +pgxc_node_send_proxy_flag(PGXCNodeHandle *handle, int flag) +{ + /* size + flag */ + int msgLen = 4 + sizeof(int); + + /* msgType + msgLen */ + if (ensure_out_buffer_capacity(handle->outEnd + 1 + msgLen, handle) != 0) + { + add_error_message(handle, "out of memory"); + return EOF; + } + + /* msg type */ + handle->outBuffer[handle->outEnd++] = 'w'; + + /* size */ + msgLen = htonl(msgLen); + memcpy(handle->outBuffer + handle->outEnd, &msgLen, 4); + handle->outEnd += 4; + + /* flag */ + flag = htonl(flag); + memcpy(handle->outBuffer + handle->outEnd, &flag, sizeof(int)); + handle->outEnd += sizeof(int); + + return pgxc_node_flush(handle); +} + /* * Send series of Extended Query protocol messages to the data node */ diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c index 1286cd1d..cfeea974 100644 --- a/src/backend/postmaster/pgstat.c +++ b/src/backend/postmaster/pgstat.c @@ -2913,6 +2913,11 @@ pgstat_bestart(void) /* Clean 2pc Worker */ beentry->st_backendType = B_CLEAN_2PC_WORKER; } + else if (am_proxy_for_dn) + { + /* Proxy for dn */ + beentry->st_backendType = B_PROXY_FOR_DN; + } else if (am_walsender) { /* Wal sender */ @@ -4208,6 +4213,9 @@ pgstat_get_backend_desc(BackendType backendType) case B_CLEAN_2PC_WORKER: backendDesc = "2pc clean worker"; break; + case B_PROXY_FOR_DN: + backendDesc = "proxy for dn"; + break; } return backendDesc; diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index 10be77cd..7d6d230b 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -162,6 +162,8 @@ #include "audit/audit_fga.h" #endif +#define PS_DISPLAY_MAX_LENGTH 256 /* process display max length */ + /* * Possible types of a backend. 
Beyond being the possible bkend_type values in * struct bkend, these are OR-able request flag bits for SignalSomeChildren() @@ -2387,6 +2389,20 @@ ProcessStartupPacket(Port *port, bool SSLdone) valptr), errhint("Valid values are: \"false\", 0, \"true\", 1, \"database\"."))); } + else if (strcmp(nameptr, "proxy_for_dn") == 0) + { + if (!IS_PGXC_COORDINATOR) + { + ereport(FATAL, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("parameter \"%s\" only support on cn", nameptr))); + } + + elog(LOG, "Proxy for dn: %s", valptr); + + am_proxy_for_dn = true; + proxy_for_dn = pstrdup(valptr); + } else { /* Assume it's a generic GUC option */ @@ -4940,12 +4956,35 @@ BackendInitialize(Port *port) * as dbname to init_ps_display(). XXX: should add a new variant of * init_ps_display() to avoid abusing the parameters like this. */ + if (am_proxy_for_dn) + { + char proxy_display[PS_DISPLAY_MAX_LENGTH]; if (am_walsender) + { + snprintf(proxy_display, PS_DISPLAY_MAX_LENGTH, + "wal sender proxy for %s", proxy_for_dn); + } + else + { + snprintf(proxy_display, PS_DISPLAY_MAX_LENGTH, + "proxy for %s", proxy_for_dn); + } + init_ps_display(proxy_display, port->user_name, remote_ps_data, + update_process_title ? "authentication" : ""); + } + else + { + if (am_walsender) + { init_ps_display("wal sender process", port->user_name, remote_ps_data, update_process_title ? "authentication" : ""); + } else + { init_ps_display(port->user_name, port->database_name, remote_ps_data, update_process_title ? "authentication" : ""); + } + } /* * Disable the timeout, and prevent SIGTERM/SIGQUIT again. diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c index 4b46d9c8..464cfcd9 100644 --- a/src/backend/replication/walsender.c +++ b/src/backend/replication/walsender.c @@ -3327,7 +3327,10 @@ WalSndSignals(void) pqsignal(SIGINT, StatementCancelHandler); /* query cancel */ pqsignal(SIGTERM, die); /* request shutdown */ pqsignal(SIGQUIT, quickdie); /* hard crash time */ + if (!IsConnFromProxy()) + { InitializeTimeouts(); /* establishes SIGALRM handler */ + } pqsignal(SIGPIPE, SIG_IGN); pqsignal(SIGUSR1, procsignal_sigusr1_handler); pqsignal(SIGUSR2, WalSndLastCycleHandler); /* request a last cycle and diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index 37819150..b075a7e8 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -124,6 +124,10 @@ #include "replication/worker_internal.h" #endif +char *proxy_for_dn = NULL; /* Proxy for which dn? */ +bool am_proxy_for_dn = false; /* Am I a proxy for dn? */ +bool am_conn_from_proxy = false; /* Am I connected from proxy? */ + extern int optind; /* ---------------- @@ -250,6 +254,13 @@ static void replace_null_with_blank(char *src, int length); static bool NeedResourceOwner(const char *stmt_name); #endif +static PGXCNodeHandle * +get_handle_on_proxy(void); +static PGXCNodeHandle * +handle_request_msg_on_proxy(PGXCNodeHandle *conn, int firstchar, StringInfo input_msg); +void +set_flag_from_proxy(int flag, const char *username); + #ifdef __COLD_HOT__ /* * Release memory alloc in TopMemoryContext and only used in single Session. 
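Note on the proxy handshake introduced by this patch: pgxc_node_send_proxy_flag() (above) frames a single frontend-protocol 'w' message whose payload is one int32 of flag bits, and the 'w' case added to PostgresMain() further below reads that int32 and calls set_flag_from_proxy(). The following is a minimal encoding sketch of that framing, for reference only; it is not part of the patch, the helper name is invented, and it assumes the caller provides a buffer of at least nine bytes.

    /*
     * Sketch of the 'w' (proxy flag) message framing used by
     * pgxc_node_send_proxy_flag() and decoded by the new 'w' case in
     * PostgresMain().  Wire layout:
     *
     *   byte  1   message type, always 'w'
     *   int32 4   length in network byte order, counting itself: 4 + 4 = 8
     *   int32 4   flag bits (FLAG_AM_WALSENDER | FLAG_AM_DB_WALSENDER),
     *             network byte order
     */
    #include <stdint.h>
    #include <string.h>
    #include <arpa/inet.h>

    static size_t
    encode_proxy_flag_msg(char *buf, int32_t flag)
    {
        int32_t len = htonl(4 + (int32_t) sizeof(int32_t)); /* self + payload */
        int32_t val = htonl(flag);

        buf[0] = 'w';
        memcpy(buf + 1, &len, sizeof(len));
        memcpy(buf + 1 + sizeof(len), &val, sizeof(val));
        return 1 + sizeof(len) + sizeof(val);   /* 9 bytes total */
    }
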
@@ -654,6 +665,7 @@ SocketBackend(StringInfo inBuf) (errcode(ERRCODE_PROTOCOL_VIOLATION), errmsg("invalid frontend message type %d", qtype))); break; + case 'w': /* Set connected by proxy */ #ifdef PGXC /* PGXC_DATANODE */ #ifdef __TBASE__ case 'N': @@ -4802,6 +4814,8 @@ PostgresMain(int argc, char *argv[], volatile bool need_report_activity = false; bool disable_idle_in_transaction_timeout = false; + PGXCNodeHandle *proxy_conn = NULL; + #ifdef PGXC /* PGXC_DATANODE */ /* Snapshot info */ TransactionId xmin PG_USED_FOR_ASSERTS_ONLY; @@ -5513,6 +5527,12 @@ PostgresMain(int argc, char *argv[], } #endif /* XCP */ + if (am_proxy_for_dn) + { + proxy_conn = handle_request_msg_on_proxy(proxy_conn, firstchar, &input_message); + continue; + } + switch (firstchar) { case 'Q': /* simple query */ @@ -6127,6 +6147,18 @@ PostgresMain(int argc, char *argv[], } break; #endif + case 'w': /* Set connected by proxy */ + { + int flag = 0; + + Assert(input_message.len == 4); + + flag = pq_getmsgint(&input_message, 4); + pq_getmsgend(&input_message); + + set_flag_from_proxy(flag, username); + } + break; default: ereport(FATAL, (errcode(ERRCODE_PROTOCOL_VIOLATION), @@ -6403,4 +6435,251 @@ IsExtendedQuery(void) { return doing_extended_query_message; } + +/* + * Get a dn connection on proxy + */ +PGXCNodeHandle * +get_handle_on_proxy(void) +{ + PGXCNodeHandle *conn = NULL; + char node_type = PGXC_NODE_DATANODE; + Oid node_oid = InvalidOid; + int node_id = -1; + int flag = 0; + PGXCNodeAllHandles *handles = NULL; + List *dnList = NIL; + int ret = 0; + + Assert(IS_PGXC_COORDINATOR); + + /* Get dn oid */ + StartTransactionCommand(); + InitMultinodeExecutor(false); + node_oid = get_pgxc_nodeoid(proxy_for_dn); + CommitTransactionCommand(); + + if (node_oid == InvalidOid) + { + ereport(FATAL, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("Unknow dn: %s, oid is invalid", proxy_for_dn))); + } + + /* Get dn id */ + node_id = PGXCNodeGetNodeId(node_oid, &node_type); + if (node_id == -1) + { + ereport(FATAL, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("Unknow dn: %s, oid: %d, id: -1", proxy_for_dn, node_oid))); + } + + elog(LOG, "Proxy for dn %s, node oid %d, node id %d", + proxy_for_dn, node_oid, node_id); + + /* Get dn connection */ + dnList = lappend_int(dnList, node_id); + Assert(list_length(dnList) == 1); + handles = get_handles(dnList, NIL, false, false, true); + if (handles == NULL) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Get connections failed for %s", proxy_for_dn))); + + } + if (handles->dn_conn_count == 0) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Get 0 connection for %s", proxy_for_dn))); + } + + Assert(handles->co_conn_count == 0); + Assert(handles->dn_conn_count == 1); + + conn = handles->datanode_handles[0]; + Assert(conn != NULL); + + pfree_pgxc_all_handles(handles); + handles = NULL; + + /* Set dn process */ + if (am_walsender) + { + flag |= FLAG_AM_WALSENDER; + if (am_db_walsender) + { + flag |= FLAG_AM_DB_WALSENDER; + } + } + ret = pgxc_node_send_proxy_flag(conn, flag); + if (ret != 0) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Proxy send flag to %s error: %d", proxy_for_dn, ret))); + } + + return conn; +} + +/* + * Forward client request command to dn and receive response + */ +PGXCNodeHandle * +handle_request_msg_on_proxy(PGXCNodeHandle *conn, int firstchar, StringInfo input_msg) +{ + int ret = 0; + + Assert(IS_PGXC_COORDINATOR); + + if (conn == NULL) + { + conn = get_handle_on_proxy(); + } + + Assert(conn != 
NULL); + + /* Before query, replicate stream is not closed, set stream_closed to false */ + conn->stream_closed = false; + + if (firstchar == 'Q') + { + const char *query_string = pq_getmsgstring(input_msg); + pq_getmsgend(input_msg); + debug_query_string = query_string; + } + + elog(DEBUG1, "Proxy: firstchar is %c(%d)", firstchar, firstchar); + + /* Send message */ + ret = pgxc_node_send_on_proxy(conn, firstchar, input_msg); + if (ret != 0) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Proxy send request to %s error: %d", proxy_for_dn, ret))); + } + + switch (firstchar) + { + /* + * 'X' means that the frontend is closing down the socket. EOF + * means unexpected loss of frontend connection. Either way, + * perform normal shutdown. + */ + case 'X': + case EOF: + /* + * Reset whereToSendOutput to prevent ereport from attempting + * to send any more messages to client. + */ + if (whereToSendOutput == DestRemote) + { + elog(LOG, "Set whereToSendOutput from %d to %d", + whereToSendOutput, DestNone); + whereToSendOutput = DestNone; + } + + /* Destroy the dn connection on proxy */ + PoolManagerDisconnect(); + + /* + * NOTE: if you are tempted to add more code here, DON'T! + * Whatever you had in mind to do should be set up as an + * on_proc_exit or on_shmem_exit callback, instead. Otherwise + * it will fail to be called during other backend-shutdown + * scenarios. + */ + proc_exit(0); + + default: + break; + } + + /* Receive message */ + ret = pgxc_node_receive_on_proxy(conn); + if (ret != 0) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Proxy receive from %s error: %d", proxy_for_dn, ret))); + } + + debug_query_string = NULL; + + return conn; +} + +/* + * Set flag from proxy + */ +void +set_flag_from_proxy(int flag, const char *username) +{ + if (am_conn_from_proxy) + { + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_EXCEPTION), + errmsg("It is connected from proxy already"))); + } + + am_conn_from_proxy = true; + + elog(LOG, "It is connected from proxy"); + + if (am_walsender) + { + ereport(ERROR, + (errcode(ERRCODE_CONNECTION_EXCEPTION), + errmsg("It is a wal sender already"))); + } + + if (flag & FLAG_AM_WALSENDER) + { + am_walsender = true; + if (flag & FLAG_AM_DB_WALSENDER) + { + am_db_walsender = true; + } + } + + elog(LOG, "Set wal sender: am_walsender(%d), am_db_walsender(%d)", + am_walsender, am_db_walsender); + + if (am_walsender) + { + int fixed_len = 0; + const char *fixed = get_ps_display_fixed(&fixed_len); + char fixed_buf[fixed_len + 1]; + char *display = NULL; + + if (fixed_len != 0) + { + Assert (fixed != NULL); + + snprintf(fixed_buf, fixed_len, "%s", fixed); + fixed_buf[fixed_len] = '\0'; + + display = strstr(fixed_buf, username); + Assert (display != NULL); + + init_ps_display("wal sender used by proxy", display, "", ""); + } + else + { + elog(WARNING, "Get ps display fixed length is 0"); + + init_ps_display("wal sender used by proxy", "", "", ""); + } + + IsNormalPostgres = false; + + WalSndSignals(); + InitWalSender(); + } +} + #endif diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index dc1d39ed..3f2e046a 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -7321,6 +7321,14 @@ ResetAllOptions(void) {// #lizard forgives int i; + if (am_walsender) + { + /* never be here */ + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("RESET ALL is forbidden on wal sender"))); + } + for (i = 0; i < num_guc_variables; i++) { struct config_generic *gconf = guc_variables[i]; diff --git 
a/src/backend/utils/misc/ps_status.c b/src/backend/utils/misc/ps_status.c index 06f6c857..51f668d1 100644 --- a/src/backend/utils/misc/ps_status.c +++ b/src/backend/utils/misc/ps_status.c @@ -417,3 +417,26 @@ get_ps_display(int *displen) return ps_buffer + ps_buffer_fixed_size; } + +/* + * Returns the fixed part in the ps display, in case someone needs + * it. Note that only the fixed part is returned. + * The string will not be null-terminated, so return the effective + * length into *fixlen. + */ +const char * +get_ps_display_fixed(int *fixlen) +{ +#ifdef PS_USE_CLOBBER_ARGV + /* If ps_buffer is a pointer, it might still be null */ + if (!ps_buffer) + { + *fixlen = 0; + return ""; + } +#endif + + *fixlen = (int) ps_buffer_fixed_size; + + return ps_buffer; +} diff --git a/src/include/pgstat.h b/src/include/pgstat.h index 7976c39b..6c4c5886 100644 --- a/src/include/pgstat.h +++ b/src/include/pgstat.h @@ -722,6 +722,7 @@ typedef enum BackendType B_PGXL_POOLER, B_CLEAN_2PC_LAUNCHER, B_CLEAN_2PC_WORKER, + B_PROXY_FOR_DN, } BackendType; diff --git a/src/include/pgxc/execRemote.h b/src/include/pgxc/execRemote.h index 7047d510..baa30f65 100644 --- a/src/include/pgxc/execRemote.h +++ b/src/include/pgxc/execRemote.h @@ -384,6 +384,7 @@ extern void ExecRemoteUtility(RemoteQuery *node); extern bool is_data_node_ready(PGXCNodeHandle * conn); extern int handle_response(PGXCNodeHandle *conn, ResponseCombiner *combiner); +extern int handle_response_on_proxy(PGXCNodeHandle *conn, ResponseCombiner *combiner); extern void HandleCmdComplete(CmdType commandType, CombineTag *combine, const char *msg_body, size_t len); @@ -476,6 +477,8 @@ extern int pgxc_node_receive_responses(const int conn_count, PGXCNodeHandle ** c extern bool validate_combiner(ResponseCombiner *combiner); #endif +extern int pgxc_node_receive_on_proxy(PGXCNodeHandle *handle); + #ifdef __TWO_PHASE_TRANS__ extern char *get_nodelist(char * prepareGID, bool localNode, bool implicit); extern void InitLocalTwoPhaseState(void); diff --git a/src/include/pgxc/pgxc.h b/src/include/pgxc/pgxc.h index 687be6c8..370882dd 100644 --- a/src/include/pgxc/pgxc.h +++ b/src/include/pgxc/pgxc.h @@ -134,6 +134,8 @@ extern Datum xc_lockForBackupKey2; #define IsConnFromGtm() (remoteConnType == REMOTE_CONN_GTM) #define IsConnFromGtmProxy() (remoteConnType == REMOTE_CONN_GTM_PROXY) +#define IsConnFromProxy() (am_conn_from_proxy) + /* key pair to be used as object id while using advisory lock for backup */ #define XC_LOCK_FOR_BACKUP_KEY_1 0xFFFF #define XC_LOCK_FOR_BACKUP_KEY_2 0xFFFF diff --git a/src/include/pgxc/pgxcnode.h b/src/include/pgxc/pgxcnode.h index f0e7c269..d69aa7f1 100644 --- a/src/include/pgxc/pgxcnode.h +++ b/src/include/pgxc/pgxcnode.h @@ -123,6 +123,9 @@ struct pgxc_node_handle bool in_extended_query; bool needSync; /* set when error and extend query. */ + + bool stream_closed; /* Whether replicate stream is closed on proxy? */ + #ifdef __TBASE__ bool sock_fatal_occurred; /*Network failure occurred, and sock descriptor was closed */ char last_command; /*last command we processed. 
*/ @@ -217,6 +220,11 @@ extern int pgxc_node_send_my_sync(PGXCNodeHandle * handle); #ifdef __SUBSCRIPTION__ extern int pgxc_node_send_apply(PGXCNodeHandle * handle, char * buf, int len, bool ignore_pk_conflict); #endif + +extern int pgxc_node_send_proxy_flag(PGXCNodeHandle *handle, int flag); +extern int pgxc_node_send_on_proxy(PGXCNodeHandle *handle, int firstchar, + StringInfo inBuf); + #ifdef __TBASE__ extern int pgxc_node_send_disconnect(PGXCNodeHandle * handle, char *cursor, int cons); #endif diff --git a/src/include/postgres.h b/src/include/postgres.h index 2074389f..c35967b0 100644 --- a/src/include/postgres.h +++ b/src/include/postgres.h @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * * postgres.h - * Primary include file for PostgreSQL server .c files + * Primary include file for PostgreSQL server .c files * * This should be the first file included by PostgreSQL backend modules. * Client-side code should include postgres_fe.h instead. @@ -17,28 +17,28 @@ */ /* *---------------------------------------------------------------- - * TABLE OF CONTENTS + * TABLE OF CONTENTS * - * When adding stuff to this file, please try to put stuff - * into the relevant section, or add new sections as appropriate. + * When adding stuff to this file, please try to put stuff + * into the relevant section, or add new sections as appropriate. * - * section description - * ------- ------------------------------------------------ - * 1) variable-length datatypes (TOAST support) - * 2) datum type + support macros - * 3) exception handling backend support + * section description + * ------- ------------------------------------------------ + * 1) variable-length datatypes (TOAST support) + * 2) datum type + support macros + * 3) exception handling backend support * - * NOTES + * NOTES * - * In general, this file should contain declarations that are widely needed - * in the backend environment, but are of no interest outside the backend. + * In general, this file should contain declarations that are widely needed + * in the backend environment, but are of no interest outside the backend. * - * Simple type definitions live in c.h, where they are shared with - * postgres_fe.h. We do that since those type definitions are needed by - * frontend modules that want to deal with binary data transmission to or - * from the backend. Type definitions in this file should be for - * representations that never escape the backend, such as Datum or - * TOASTed varlena objects. + * Simple type definitions live in c.h, where they are shared with + * postgres_fe.h. We do that since those type definitions are needed by + * frontend modules that want to deal with binary data transmission to or + * from the backend. Type definitions in this file should be for + * representations that never escape the backend, such as Datum or + * TOASTed varlena objects. * *---------------------------------------------------------------- */ @@ -71,8 +71,13 @@ #define EXTENT_FIRST_BLOCKNUMBER(eid) ((eid)*PAGES_PER_EXTENTS) #endif +extern char *proxy_for_dn; /* Proxy for which dn? */ +extern bool am_proxy_for_dn; /* Am I a proxy for dn? */ +extern bool am_conn_from_proxy; /* Am I connected from proxy? 
*/ + + /* ---------------------------------------------------------------- - * Section 1: variable-length datatypes (TOAST support) + * Section 1: variable-length datatypes (TOAST support) * ---------------------------------------------------------------- */ @@ -90,11 +95,11 @@ */ typedef struct varatt_external { - int32 va_rawsize; /* Original data size (includes header) */ - int32 va_extsize; /* External saved size (doesn't) */ - Oid va_valueid; /* Unique ID of value within TOAST table */ - Oid va_toastrelid; /* RelID of TOAST table containing it */ -} varatt_external; + int32 va_rawsize; /* Original data size (includes header) */ + int32 va_extsize; /* External saved size (doesn't) */ + Oid va_valueid; /* Unique ID of value within TOAST table */ + Oid va_toastrelid; /* RelID of TOAST table containing it */ +} varatt_external; /* * struct varatt_indirect is a "TOAST pointer" representing an out-of-line @@ -107,8 +112,8 @@ typedef struct varatt_external */ typedef struct varatt_indirect { - struct varlena *pointer; /* Pointer to in-memory varlena */ -} varatt_indirect; + struct varlena *pointer; /* Pointer to in-memory varlena */ +} varatt_indirect; /* * struct varatt_expanded is a "TOAST pointer" representing an out-of-line @@ -124,7 +129,7 @@ typedef struct ExpandedObjectHeader ExpandedObjectHeader; typedef struct varatt_expanded { - ExpandedObjectHeader *eohptr; + ExpandedObjectHeader *eohptr; } varatt_expanded; /* @@ -134,21 +139,21 @@ typedef struct varatt_expanded */ typedef enum vartag_external { - VARTAG_INDIRECT = 1, - VARTAG_EXPANDED_RO = 2, - VARTAG_EXPANDED_RW = 3, - VARTAG_ONDISK = 18 + VARTAG_INDIRECT = 1, + VARTAG_EXPANDED_RO = 2, + VARTAG_EXPANDED_RW = 3, + VARTAG_ONDISK = 18 } vartag_external; /* this test relies on the specific tag values above */ #define VARTAG_IS_EXPANDED(tag) \ - (((tag) & ~1) == VARTAG_EXPANDED_RO) + (((tag) & ~1) == VARTAG_EXPANDED_RO) #define VARTAG_SIZE(tag) \ - ((tag) == VARTAG_INDIRECT ? sizeof(varatt_indirect) : \ - VARTAG_IS_EXPANDED(tag) ? sizeof(varatt_expanded) : \ - (tag) == VARTAG_ONDISK ? sizeof(varatt_external) : \ - TrapMacro(true, "unrecognized TOAST vartag")) + ((tag) == VARTAG_INDIRECT ? sizeof(varatt_indirect) : \ + VARTAG_IS_EXPANDED(tag) ? sizeof(varatt_expanded) : \ + (tag) == VARTAG_ONDISK ? 
sizeof(varatt_external) : \ + TrapMacro(true, "unrecognized TOAST vartag")) /* * These structs describe the header of a varlena object that may have been @@ -161,31 +166,31 @@ typedef enum vartag_external */ typedef union { - struct /* Normal varlena (4-byte length) */ - { - uint32 va_header; - char va_data[FLEXIBLE_ARRAY_MEMBER]; - } va_4byte; - struct /* Compressed-in-line format */ - { - uint32 va_header; - uint32 va_rawsize; /* Original data size (excludes header) */ - char va_data[FLEXIBLE_ARRAY_MEMBER]; /* Compressed data */ - } va_compressed; + struct /* Normal varlena (4-byte length) */ + { + uint32 va_header; + char va_data[FLEXIBLE_ARRAY_MEMBER]; + } va_4byte; + struct /* Compressed-in-line format */ + { + uint32 va_header; + uint32 va_rawsize; /* Original data size (excludes header) */ + char va_data[FLEXIBLE_ARRAY_MEMBER]; /* Compressed data */ + } va_compressed; } varattrib_4b; typedef struct { - uint8 va_header; - char va_data[FLEXIBLE_ARRAY_MEMBER]; /* Data begins here */ + uint8 va_header; + char va_data[FLEXIBLE_ARRAY_MEMBER]; /* Data begins here */ } varattrib_1b; /* TOAST pointers are a subset of varattrib_1b with an identifying tag byte */ typedef struct { - uint8 va_header; /* Always 0x80 or 0x01 */ - uint8 va_tag; /* Type of datum */ - char va_data[FLEXIBLE_ARRAY_MEMBER]; /* Type-specific data */ + uint8 va_header; /* Always 0x80 or 0x01 */ + uint8 va_tag; /* Type of datum */ + char va_data[FLEXIBLE_ARRAY_MEMBER]; /* Type-specific data */ } varattrib_1b_e; /* @@ -226,86 +231,86 @@ typedef struct #ifdef WORDS_BIGENDIAN #define VARATT_IS_4B(PTR) \ - ((((varattrib_1b *) (PTR))->va_header & 0x80) == 0x00) + ((((varattrib_1b *) (PTR))->va_header & 0x80) == 0x00) #define VARATT_IS_4B_U(PTR) \ - ((((varattrib_1b *) (PTR))->va_header & 0xC0) == 0x00) + ((((varattrib_1b *) (PTR))->va_header & 0xC0) == 0x00) #define VARATT_IS_4B_C(PTR) \ - ((((varattrib_1b *) (PTR))->va_header & 0xC0) == 0x40) + ((((varattrib_1b *) (PTR))->va_header & 0xC0) == 0x40) #define VARATT_IS_1B(PTR) \ - ((((varattrib_1b *) (PTR))->va_header & 0x80) == 0x80) + ((((varattrib_1b *) (PTR))->va_header & 0x80) == 0x80) #define VARATT_IS_1B_E(PTR) \ - ((((varattrib_1b *) (PTR))->va_header) == 0x80) + ((((varattrib_1b *) (PTR))->va_header) == 0x80) #define VARATT_NOT_PAD_BYTE(PTR) \ - (*((uint8 *) (PTR)) != 0) + (*((uint8 *) (PTR)) != 0) /* VARSIZE_4B() should only be used on known-aligned data */ #define VARSIZE_4B(PTR) \ - (((varattrib_4b *) (PTR))->va_4byte.va_header & 0x3FFFFFFF) + (((varattrib_4b *) (PTR))->va_4byte.va_header & 0x3FFFFFFF) #define VARSIZE_1B(PTR) \ - (((varattrib_1b *) (PTR))->va_header & 0x7F) + (((varattrib_1b *) (PTR))->va_header & 0x7F) #define VARTAG_1B_E(PTR) \ - (((varattrib_1b_e *) (PTR))->va_tag) + (((varattrib_1b_e *) (PTR))->va_tag) #define SET_VARSIZE_4B(PTR,len) \ - (((varattrib_4b *) (PTR))->va_4byte.va_header = (len) & 0x3FFFFFFF) + (((varattrib_4b *) (PTR))->va_4byte.va_header = (len) & 0x3FFFFFFF) #define SET_VARSIZE_4B_C(PTR,len) \ - (((varattrib_4b *) (PTR))->va_4byte.va_header = ((len) & 0x3FFFFFFF) | 0x40000000) + (((varattrib_4b *) (PTR))->va_4byte.va_header = ((len) & 0x3FFFFFFF) | 0x40000000) #define SET_VARSIZE_1B(PTR,len) \ - (((varattrib_1b *) (PTR))->va_header = (len) | 0x80) + (((varattrib_1b *) (PTR))->va_header = (len) | 0x80) #define SET_VARTAG_1B_E(PTR,tag) \ - (((varattrib_1b_e *) (PTR))->va_header = 0x80, \ - ((varattrib_1b_e *) (PTR))->va_tag = (tag)) -#else /* !WORDS_BIGENDIAN */ + (((varattrib_1b_e *) (PTR))->va_header = 0x80, \ + ((varattrib_1b_e *) 
(PTR))->va_tag = (tag)) +#else /* !WORDS_BIGENDIAN */ #define VARATT_IS_4B(PTR) \ - ((((varattrib_1b *) (PTR))->va_header & 0x01) == 0x00) + ((((varattrib_1b *) (PTR))->va_header & 0x01) == 0x00) #define VARATT_IS_4B_U(PTR) \ - ((((varattrib_1b *) (PTR))->va_header & 0x03) == 0x00) + ((((varattrib_1b *) (PTR))->va_header & 0x03) == 0x00) #define VARATT_IS_4B_C(PTR) \ - ((((varattrib_1b *) (PTR))->va_header & 0x03) == 0x02) + ((((varattrib_1b *) (PTR))->va_header & 0x03) == 0x02) #define VARATT_IS_1B(PTR) \ - ((((varattrib_1b *) (PTR))->va_header & 0x01) == 0x01) + ((((varattrib_1b *) (PTR))->va_header & 0x01) == 0x01) #define VARATT_IS_1B_E(PTR) \ - ((((varattrib_1b *) (PTR))->va_header) == 0x01) + ((((varattrib_1b *) (PTR))->va_header) == 0x01) #define VARATT_NOT_PAD_BYTE(PTR) \ - (*((uint8 *) (PTR)) != 0) + (*((uint8 *) (PTR)) != 0) /* VARSIZE_4B() should only be used on known-aligned data */ #define VARSIZE_4B(PTR) \ - ((((varattrib_4b *) (PTR))->va_4byte.va_header >> 2) & 0x3FFFFFFF) + ((((varattrib_4b *) (PTR))->va_4byte.va_header >> 2) & 0x3FFFFFFF) #define VARSIZE_1B(PTR) \ - ((((varattrib_1b *) (PTR))->va_header >> 1) & 0x7F) + ((((varattrib_1b *) (PTR))->va_header >> 1) & 0x7F) #define VARTAG_1B_E(PTR) \ - (((varattrib_1b_e *) (PTR))->va_tag) + (((varattrib_1b_e *) (PTR))->va_tag) #define SET_VARSIZE_4B(PTR,len) \ - (((varattrib_4b *) (PTR))->va_4byte.va_header = (((uint32) (len)) << 2)) + (((varattrib_4b *) (PTR))->va_4byte.va_header = (((uint32) (len)) << 2)) #define SET_VARSIZE_4B_C(PTR,len) \ - (((varattrib_4b *) (PTR))->va_4byte.va_header = (((uint32) (len)) << 2) | 0x02) + (((varattrib_4b *) (PTR))->va_4byte.va_header = (((uint32) (len)) << 2) | 0x02) #define SET_VARSIZE_1B(PTR,len) \ - (((varattrib_1b *) (PTR))->va_header = (((uint8) (len)) << 1) | 0x01) + (((varattrib_1b *) (PTR))->va_header = (((uint8) (len)) << 1) | 0x01) #define SET_VARTAG_1B_E(PTR,tag) \ - (((varattrib_1b_e *) (PTR))->va_header = 0x01, \ - ((varattrib_1b_e *) (PTR))->va_tag = (tag)) -#endif /* WORDS_BIGENDIAN */ + (((varattrib_1b_e *) (PTR))->va_header = 0x01, \ + ((varattrib_1b_e *) (PTR))->va_tag = (tag)) +#endif /* WORDS_BIGENDIAN */ -#define VARHDRSZ_SHORT offsetof(varattrib_1b, va_data) -#define VARATT_SHORT_MAX 0x7F +#define VARHDRSZ_SHORT offsetof(varattrib_1b, va_data) +#define VARATT_SHORT_MAX 0x7F #define VARATT_CAN_MAKE_SHORT(PTR) \ - (VARATT_IS_4B_U(PTR) && \ - (VARSIZE(PTR) - VARHDRSZ + VARHDRSZ_SHORT) <= VARATT_SHORT_MAX) + (VARATT_IS_4B_U(PTR) && \ + (VARSIZE(PTR) - VARHDRSZ + VARHDRSZ_SHORT) <= VARATT_SHORT_MAX) #define VARATT_CONVERTED_SHORT_SIZE(PTR) \ - (VARSIZE(PTR) - VARHDRSZ + VARHDRSZ_SHORT) + (VARSIZE(PTR) - VARHDRSZ + VARHDRSZ_SHORT) -#define VARHDRSZ_EXTERNAL offsetof(varattrib_1b_e, va_data) +#define VARHDRSZ_EXTERNAL offsetof(varattrib_1b_e, va_data) -#define VARDATA_4B(PTR) (((varattrib_4b *) (PTR))->va_4byte.va_data) -#define VARDATA_4B_C(PTR) (((varattrib_4b *) (PTR))->va_compressed.va_data) -#define VARDATA_1B(PTR) (((varattrib_1b *) (PTR))->va_data) -#define VARDATA_1B_E(PTR) (((varattrib_1b_e *) (PTR))->va_data) +#define VARDATA_4B(PTR) (((varattrib_4b *) (PTR))->va_4byte.va_data) +#define VARDATA_4B_C(PTR) (((varattrib_4b *) (PTR))->va_compressed.va_data) +#define VARDATA_1B(PTR) (((varattrib_1b *) (PTR))->va_data) +#define VARDATA_1B_E(PTR) (((varattrib_1b_e *) (PTR))->va_data) #define VARRAWSIZE_4B_C(PTR) \ - (((varattrib_4b *) (PTR))->va_compressed.va_rawsize) + (((varattrib_4b *) (PTR))->va_compressed.va_rawsize) /* Externally visible macros */ @@ -323,66 +328,66 
@@ typedef struct * Other macros here should usually be used only by tuple assembly/disassembly * code and code that specifically wants to work with still-toasted Datums. */ -#define VARDATA(PTR) VARDATA_4B(PTR) -#define VARSIZE(PTR) VARSIZE_4B(PTR) +#define VARDATA(PTR) VARDATA_4B(PTR) +#define VARSIZE(PTR) VARSIZE_4B(PTR) -#define VARSIZE_SHORT(PTR) VARSIZE_1B(PTR) -#define VARDATA_SHORT(PTR) VARDATA_1B(PTR) +#define VARSIZE_SHORT(PTR) VARSIZE_1B(PTR) +#define VARDATA_SHORT(PTR) VARDATA_1B(PTR) -#define VARTAG_EXTERNAL(PTR) VARTAG_1B_E(PTR) -#define VARSIZE_EXTERNAL(PTR) (VARHDRSZ_EXTERNAL + VARTAG_SIZE(VARTAG_EXTERNAL(PTR))) -#define VARDATA_EXTERNAL(PTR) VARDATA_1B_E(PTR) +#define VARTAG_EXTERNAL(PTR) VARTAG_1B_E(PTR) +#define VARSIZE_EXTERNAL(PTR) (VARHDRSZ_EXTERNAL + VARTAG_SIZE(VARTAG_EXTERNAL(PTR))) +#define VARDATA_EXTERNAL(PTR) VARDATA_1B_E(PTR) -#define VARATT_IS_COMPRESSED(PTR) VARATT_IS_4B_C(PTR) -#define VARATT_IS_EXTERNAL(PTR) VARATT_IS_1B_E(PTR) +#define VARATT_IS_COMPRESSED(PTR) VARATT_IS_4B_C(PTR) +#define VARATT_IS_EXTERNAL(PTR) VARATT_IS_1B_E(PTR) #define VARATT_IS_EXTERNAL_ONDISK(PTR) \ - (VARATT_IS_EXTERNAL(PTR) && VARTAG_EXTERNAL(PTR) == VARTAG_ONDISK) + (VARATT_IS_EXTERNAL(PTR) && VARTAG_EXTERNAL(PTR) == VARTAG_ONDISK) #define VARATT_IS_EXTERNAL_INDIRECT(PTR) \ - (VARATT_IS_EXTERNAL(PTR) && VARTAG_EXTERNAL(PTR) == VARTAG_INDIRECT) + (VARATT_IS_EXTERNAL(PTR) && VARTAG_EXTERNAL(PTR) == VARTAG_INDIRECT) #define VARATT_IS_EXTERNAL_EXPANDED_RO(PTR) \ - (VARATT_IS_EXTERNAL(PTR) && VARTAG_EXTERNAL(PTR) == VARTAG_EXPANDED_RO) + (VARATT_IS_EXTERNAL(PTR) && VARTAG_EXTERNAL(PTR) == VARTAG_EXPANDED_RO) #define VARATT_IS_EXTERNAL_EXPANDED_RW(PTR) \ - (VARATT_IS_EXTERNAL(PTR) && VARTAG_EXTERNAL(PTR) == VARTAG_EXPANDED_RW) + (VARATT_IS_EXTERNAL(PTR) && VARTAG_EXTERNAL(PTR) == VARTAG_EXPANDED_RW) #define VARATT_IS_EXTERNAL_EXPANDED(PTR) \ - (VARATT_IS_EXTERNAL(PTR) && VARTAG_IS_EXPANDED(VARTAG_EXTERNAL(PTR))) -#define VARATT_IS_SHORT(PTR) VARATT_IS_1B(PTR) -#define VARATT_IS_EXTENDED(PTR) (!VARATT_IS_4B_U(PTR)) + (VARATT_IS_EXTERNAL(PTR) && VARTAG_IS_EXPANDED(VARTAG_EXTERNAL(PTR))) +#define VARATT_IS_SHORT(PTR) VARATT_IS_1B(PTR) +#define VARATT_IS_EXTENDED(PTR) (!VARATT_IS_4B_U(PTR)) -#define SET_VARSIZE(PTR, len) SET_VARSIZE_4B(PTR, len) -#define SET_VARSIZE_SHORT(PTR, len) SET_VARSIZE_1B(PTR, len) -#define SET_VARSIZE_COMPRESSED(PTR, len) SET_VARSIZE_4B_C(PTR, len) +#define SET_VARSIZE(PTR, len) SET_VARSIZE_4B(PTR, len) +#define SET_VARSIZE_SHORT(PTR, len) SET_VARSIZE_1B(PTR, len) +#define SET_VARSIZE_COMPRESSED(PTR, len) SET_VARSIZE_4B_C(PTR, len) -#define SET_VARTAG_EXTERNAL(PTR, tag) SET_VARTAG_1B_E(PTR, tag) +#define SET_VARTAG_EXTERNAL(PTR, tag) SET_VARTAG_1B_E(PTR, tag) #define VARSIZE_ANY(PTR) \ - (VARATT_IS_1B_E(PTR) ? VARSIZE_EXTERNAL(PTR) : \ - (VARATT_IS_1B(PTR) ? VARSIZE_1B(PTR) : \ - VARSIZE_4B(PTR))) + (VARATT_IS_1B_E(PTR) ? VARSIZE_EXTERNAL(PTR) : \ + (VARATT_IS_1B(PTR) ? VARSIZE_1B(PTR) : \ + VARSIZE_4B(PTR))) /* Size of a varlena data, excluding header */ #define VARSIZE_ANY_EXHDR(PTR) \ - (VARATT_IS_1B_E(PTR) ? VARSIZE_EXTERNAL(PTR)-VARHDRSZ_EXTERNAL : \ - (VARATT_IS_1B(PTR) ? VARSIZE_1B(PTR)-VARHDRSZ_SHORT : \ - VARSIZE_4B(PTR)-VARHDRSZ)) + (VARATT_IS_1B_E(PTR) ? VARSIZE_EXTERNAL(PTR)-VARHDRSZ_EXTERNAL : \ + (VARATT_IS_1B(PTR) ? 
VARSIZE_1B(PTR)-VARHDRSZ_SHORT : \ + VARSIZE_4B(PTR)-VARHDRSZ)) /* caution: this will not work on an external or compressed-in-line Datum */ /* caution: this will return a possibly unaligned pointer */ #define VARDATA_ANY(PTR) \ - (VARATT_IS_1B(PTR) ? VARDATA_1B(PTR) : VARDATA_4B(PTR)) + (VARATT_IS_1B(PTR) ? VARDATA_1B(PTR) : VARDATA_4B(PTR)) /* ---------------------------------------------------------------- - * Section 2: datum type + support macros + * Section 2: datum type + support macros * ---------------------------------------------------------------- */ /* * Port Notes: - * Postgres makes the following assumptions about datatype sizes: + * Postgres makes the following assumptions about datatype sizes: * - * sizeof(Datum) == sizeof(void *) == 4 or 8 - * sizeof(char) == 1 - * sizeof(short) == 2 + * sizeof(Datum) == sizeof(void *) == 4 or 8 + * sizeof(char) == 1 + * sizeof(short) == 2 * * When a type narrower than Datum is stored in a Datum, we place it in the * low-order bits and are careful that the DatumGetXXX macro for it discards @@ -398,26 +403,26 @@ typedef uintptr_t Datum; typedef Datum *DatumPtr; -#define GET_1_BYTE(datum) (((Datum) (datum)) & 0x000000ff) -#define GET_2_BYTES(datum) (((Datum) (datum)) & 0x0000ffff) -#define GET_4_BYTES(datum) (((Datum) (datum)) & 0xffffffff) +#define GET_1_BYTE(datum) (((Datum) (datum)) & 0x000000ff) +#define GET_2_BYTES(datum) (((Datum) (datum)) & 0x0000ffff) +#define GET_4_BYTES(datum) (((Datum) (datum)) & 0xffffffff) #if SIZEOF_DATUM == 8 -#define GET_8_BYTES(datum) ((Datum) (datum)) +#define GET_8_BYTES(datum) ((Datum) (datum)) #endif -#define SET_1_BYTE(value) (((Datum) (value)) & 0x000000ff) -#define SET_2_BYTES(value) (((Datum) (value)) & 0x0000ffff) -#define SET_4_BYTES(value) (((Datum) (value)) & 0xffffffff) +#define SET_1_BYTE(value) (((Datum) (value)) & 0x000000ff) +#define SET_2_BYTES(value) (((Datum) (value)) & 0x0000ffff) +#define SET_4_BYTES(value) (((Datum) (value)) & 0xffffffff) #if SIZEOF_DATUM == 8 -#define SET_8_BYTES(value) ((Datum) (value)) +#define SET_8_BYTES(value) ((Datum) (value)) #endif #ifdef XCP -#define CONTROL_INTERVAL 50000 +#define CONTROL_INTERVAL 50000 #endif /* * DatumGetBool - * Returns boolean value of a datum. + * Returns boolean value of a datum. * * Note: any nonzero value will be considered TRUE, but we ignore bits to * the left of the width of bool, per comment above. @@ -427,7 +432,7 @@ typedef Datum *DatumPtr; /* * BoolGetDatum - * Returns datum representation for a boolean. + * Returns datum representation for a boolean. * * Note: any nonzero value will be considered TRUE. */ @@ -436,161 +441,161 @@ typedef Datum *DatumPtr; /* * DatumGetChar - * Returns character value of a datum. + * Returns character value of a datum. */ #define DatumGetChar(X) ((char) GET_1_BYTE(X)) /* * CharGetDatum - * Returns datum representation for a character. + * Returns datum representation for a character. */ #define CharGetDatum(X) ((Datum) SET_1_BYTE(X)) /* * Int8GetDatum - * Returns datum representation for an 8-bit integer. + * Returns datum representation for an 8-bit integer. */ #define Int8GetDatum(X) ((Datum) SET_1_BYTE(X)) /* * DatumGetUInt8 - * Returns 8-bit unsigned integer value of a datum. + * Returns 8-bit unsigned integer value of a datum. */ #define DatumGetUInt8(X) ((uint8) GET_1_BYTE(X)) /* * UInt8GetDatum - * Returns datum representation for an 8-bit unsigned integer. + * Returns datum representation for an 8-bit unsigned integer. 
*/ #define UInt8GetDatum(X) ((Datum) SET_1_BYTE(X)) /* * DatumGetInt16 - * Returns 16-bit integer value of a datum. + * Returns 16-bit integer value of a datum. */ #define DatumGetInt16(X) ((int16) GET_2_BYTES(X)) /* * Int16GetDatum - * Returns datum representation for a 16-bit integer. + * Returns datum representation for a 16-bit integer. */ #define Int16GetDatum(X) ((Datum) SET_2_BYTES(X)) /* * DatumGetUInt16 - * Returns 16-bit unsigned integer value of a datum. + * Returns 16-bit unsigned integer value of a datum. */ #define DatumGetUInt16(X) ((uint16) GET_2_BYTES(X)) /* * UInt16GetDatum - * Returns datum representation for a 16-bit unsigned integer. + * Returns datum representation for a 16-bit unsigned integer. */ #define UInt16GetDatum(X) ((Datum) SET_2_BYTES(X)) /* * DatumGetInt32 - * Returns 32-bit integer value of a datum. + * Returns 32-bit integer value of a datum. */ #define DatumGetInt32(X) ((int32) GET_4_BYTES(X)) /* * Int32GetDatum - * Returns datum representation for a 32-bit integer. + * Returns datum representation for a 32-bit integer. */ #define Int32GetDatum(X) ((Datum) SET_4_BYTES(X)) /* * DatumGetUInt32 - * Returns 32-bit unsigned integer value of a datum. + * Returns 32-bit unsigned integer value of a datum. */ #define DatumGetUInt32(X) ((uint32) GET_4_BYTES(X)) /* * UInt32GetDatum - * Returns datum representation for a 32-bit unsigned integer. + * Returns datum representation for a 32-bit unsigned integer. */ #define UInt32GetDatum(X) ((Datum) SET_4_BYTES(X)) /* * DatumGetObjectId - * Returns object identifier value of a datum. + * Returns object identifier value of a datum. */ #define DatumGetObjectId(X) ((Oid) GET_4_BYTES(X)) /* * ObjectIdGetDatum - * Returns datum representation for an object identifier. + * Returns datum representation for an object identifier. */ #define ObjectIdGetDatum(X) ((Datum) SET_4_BYTES(X)) /* * DatumGetTransactionId - * Returns transaction identifier value of a datum. + * Returns transaction identifier value of a datum. */ #define DatumGetTransactionId(X) ((TransactionId) GET_4_BYTES(X)) /* * TransactionIdGetDatum - * Returns datum representation for a transaction identifier. + * Returns datum representation for a transaction identifier. */ #define TransactionIdGetDatum(X) ((Datum) SET_4_BYTES((X))) /* * MultiXactIdGetDatum - * Returns datum representation for a multixact identifier. + * Returns datum representation for a multixact identifier. */ #define MultiXactIdGetDatum(X) ((Datum) SET_4_BYTES((X))) /* * DatumGetCommandId - * Returns command identifier value of a datum. + * Returns command identifier value of a datum. */ #define DatumGetCommandId(X) ((CommandId) GET_4_BYTES(X)) /* * CommandIdGetDatum - * Returns datum representation for a command identifier. + * Returns datum representation for a command identifier. */ #define CommandIdGetDatum(X) ((Datum) SET_4_BYTES(X)) /* * DatumGetPointer - * Returns pointer value of a datum. + * Returns pointer value of a datum. */ #define DatumGetPointer(X) ((Pointer) (X)) /* * PointerGetDatum - * Returns datum representation for a pointer. + * Returns datum representation for a pointer. */ #define PointerGetDatum(X) ((Datum) (X)) /* * DatumGetCString - * Returns C string (null-terminated string) value of a datum. + * Returns C string (null-terminated string) value of a datum. * * Note: C string is not a full-fledged Postgres type at present, * but type input functions use this conversion for their inputs. 
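The comments above document the scalar Datum conversion macros. As a quick reference, a round trip through a pass-by-value type and a pointer type looks like the sketch below; it assumes a backend translation unit and the values and names are made up.

    /* Minimal illustration of the Datum conversion macros documented above.
     * Pass-by-value types are packed into the Datum itself; pointer types
     * store the pointer unchanged. */
    #include "postgres.h"

    static void
    datum_roundtrip_example(void)
    {
        Datum   d;
        int32   i;
        char   *s;

        d = Int32GetDatum(42);          /* pack an int32 into a Datum */
        i = DatumGetInt32(d);           /* ... and unpack it: i == 42 */

        d = CStringGetDatum("hello");   /* store the pointer as-is */
        s = DatumGetCString(d);         /* s points at the same "hello" */

        (void) i;
        (void) s;
    }
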
@@ -600,7 +605,7 @@ typedef Datum *DatumPtr; /* * CStringGetDatum - * Returns datum representation for a C string (null-terminated string). + * Returns datum representation for a C string (null-terminated string). * * Note: C string is not a full-fledged Postgres type at present, * but type output functions use this conversion for their outputs. @@ -612,14 +617,14 @@ typedef Datum *DatumPtr; /* * DatumGetName - * Returns name value of a datum. + * Returns name value of a datum. */ #define DatumGetName(X) ((Name) DatumGetPointer(X)) /* * NameGetDatum - * Returns datum representation for a name. + * Returns datum representation for a name. * * Note: Name is pass-by-reference; caller must ensure the pointed-to * value has adequate lifetime. @@ -629,7 +634,7 @@ typedef Datum *DatumPtr; /* * DatumGetInt64 - * Returns 64-bit integer value of a datum. + * Returns 64-bit integer value of a datum. * * Note: this macro hides whether int64 is pass by value or by reference. */ @@ -642,7 +647,7 @@ typedef Datum *DatumPtr; /* * Int64GetDatum - * Returns datum representation for a 64-bit integer. + * Returns datum representation for a 64-bit integer. * * Note: if int64 is pass by reference, this function returns a reference * to palloc'd space. @@ -656,7 +661,7 @@ extern Datum Int64GetDatum(int64 X); /* * DatumGetUInt64 - * Returns 64-bit unsigned integer value of a datum. + * Returns 64-bit unsigned integer value of a datum. * * Note: this macro hides whether int64 is pass by value or by reference. */ @@ -669,7 +674,7 @@ extern Datum Int64GetDatum(int64 X); /* * UInt64GetDatum - * Returns datum representation for a 64-bit unsigned integer. + * Returns datum representation for a 64-bit unsigned integer. * * Note: if int64 is pass by reference, this function returns a reference * to palloc'd space. @@ -691,7 +696,7 @@ extern Datum Int64GetDatum(int64 X); /* * DatumGetFloat4 - * Returns 4-byte floating point value of a datum. + * Returns 4-byte floating point value of a datum. * * Note: this macro hides whether float4 is pass by value or by reference. */ @@ -700,14 +705,14 @@ extern Datum Int64GetDatum(int64 X); static inline float4 DatumGetFloat4(Datum X) { - union - { - int32 value; - float4 retval; - } myunion; - - myunion.value = DatumGetInt32(X); - return myunion.retval; + union + { + int32 value; + float4 retval; + } myunion; + + myunion.value = DatumGetInt32(X); + return myunion.retval; } #else #define DatumGetFloat4(X) (* ((float4 *) DatumGetPointer(X))) @@ -715,7 +720,7 @@ DatumGetFloat4(Datum X) /* * Float4GetDatum - * Returns datum representation for a 4-byte floating point number. + * Returns datum representation for a 4-byte floating point number. * * Note: if float4 is pass by reference, this function returns a reference * to palloc'd space. @@ -724,14 +729,14 @@ DatumGetFloat4(Datum X) static inline Datum Float4GetDatum(float4 X) { - union - { - float4 value; - int32 retval; - } myunion; - - myunion.value = X; - return Int32GetDatum(myunion.retval); + union + { + float4 value; + int32 retval; + } myunion; + + myunion.value = X; + return Int32GetDatum(myunion.retval); } #else extern Datum Float4GetDatum(float4 X); @@ -739,7 +744,7 @@ extern Datum Float4GetDatum(float4 X); /* * DatumGetFloat8 - * Returns 8-byte floating point value of a datum. + * Returns 8-byte floating point value of a datum. * * Note: this macro hides whether float8 is pass by value or by reference. 
*/ @@ -748,14 +753,14 @@ extern Datum Float4GetDatum(float4 X); static inline float8 DatumGetFloat8(Datum X) { - union - { - int64 value; - float8 retval; - } myunion; - - myunion.value = DatumGetInt64(X); - return myunion.retval; + union + { + int64 value; + float8 retval; + } myunion; + + myunion.value = DatumGetInt64(X); + return myunion.retval; } #else #define DatumGetFloat8(X) (* ((float8 *) DatumGetPointer(X))) @@ -763,7 +768,7 @@ DatumGetFloat8(Datum X) /* * Float8GetDatum - * Returns datum representation for an 8-byte floating point number. + * Returns datum representation for an 8-byte floating point number. * * Note: if float8 is pass by reference, this function returns a reference * to palloc'd space. @@ -773,14 +778,14 @@ DatumGetFloat8(Datum X) static inline Datum Float8GetDatum(float8 X) { - union - { - float8 value; - int64 retval; - } myunion; - - myunion.value = X; - return Int64GetDatum(myunion.retval); + union + { + float8 value; + int64 retval; + } myunion; + + myunion.value = X; + return Int64GetDatum(myunion.retval); } #else extern Datum Float8GetDatum(float8 X); @@ -817,7 +822,7 @@ extern Datum Float8GetDatum(float8 X); /* ---------------------------------------------------------------- - * Section 3: exception handling backend support + * Section 3: exception handling backend support * ---------------------------------------------------------------- */ @@ -827,14 +832,14 @@ extern Datum Float8GetDatum(float8 X); * ExceptionalCondition must be present even when assertions are not enabled. */ extern void ExceptionalCondition(const char *conditionName, - const char *errorType, - const char *fileName, int lineNumber) pg_attribute_noreturn(); + const char *errorType, + const char *fileName, int lineNumber) pg_attribute_noreturn(); extern void ResetUsageCommon(struct rusage *save_r, struct timeval *save_t); extern void ResetUsage(void); extern void ShowUsageCommon(const char *title, struct rusage *save_r, struct - timeval *save_t); + timeval *save_t); #ifdef __TBASE__ #define CLEAR_BIT(data, bit) data = (~(1 << (bit)) & (data)) #define SET_BIT(data, bit) data = ((1 << (bit)) | (data)) @@ -843,4 +848,4 @@ extern void ShowUsageCommon(const char *title, struct rusage *save_r, struct /* for error code */ extern bool g_is_in_init_phase; #endif -#endif /* POSTGRES_H */ +#endif /* POSTGRES_H */ diff --git a/src/include/replication/walsender.h b/src/include/replication/walsender.h index 7c88a772..a3b7876c 100644 --- a/src/include/replication/walsender.h +++ b/src/include/replication/walsender.h @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * * walsender.h - * Exports from replication/walsender.c. + * Exports from replication/walsender.c. * * Portions Copyright (c) 2010-2017, PostgreSQL Global Development Group * @@ -16,14 +16,21 @@ #include "fmgr.h" +#define FLAG_AM_WALSENDER 0x01 /* Flag to set am_walsender(Am I a walsender process?) */ +#define FLAG_AM_DB_WALSENDER 0x02 /* Flag to set am_db_walsender(Am I a + walsender process and connected to + a database? + Yes: used for logical replicate. + No: used for physical replicate. */ + /* * What to do with a snapshot in create replication slot command. 
*/ typedef enum { - CRS_EXPORT_SNAPSHOT, - CRS_NOEXPORT_SNAPSHOT, - CRS_USE_SNAPSHOT + CRS_EXPORT_SNAPSHOT, + CRS_NOEXPORT_SNAPSHOT, + CRS_USE_SNAPSHOT } CRSSnapshotAction; /* global state */ @@ -33,8 +40,8 @@ extern bool am_db_walsender; extern bool wake_wal_senders; /* user-settable parameters */ -extern int max_wal_senders; -extern int wal_sender_timeout; +extern int max_wal_senders; +extern int wal_sender_timeout; extern bool log_replication_commands; extern void InitWalSender(void); @@ -56,20 +63,20 @@ extern void WalSndRqstFileReload(void); * while holding contended locks. */ #define WalSndWakeupRequest() \ - do { wake_wal_senders = true; } while (0) + do { wake_wal_senders = true; } while (0) /* * wakeup walsenders if there is work to be done */ -#define WalSndWakeupProcessRequests() \ - do \ - { \ - if (wake_wal_senders) \ - { \ - wake_wal_senders = false; \ - if (max_wal_senders > 0) \ - WalSndWakeup(); \ - } \ - } while (0) +#define WalSndWakeupProcessRequests() \ + do \ + { \ + if (wake_wal_senders) \ + { \ + wake_wal_senders = false; \ + if (max_wal_senders > 0) \ + WalSndWakeup(); \ + } \ + } while (0) -#endif /* _WALSENDER_H */ +#endif /* _WALSENDER_H */ diff --git a/src/include/utils/ps_status.h b/src/include/utils/ps_status.h index ea26cfab..097474c5 100644 --- a/src/include/utils/ps_status.h +++ b/src/include/utils/ps_status.h @@ -17,10 +17,12 @@ extern bool update_process_title; extern char **save_ps_display_args(int argc, char **argv); extern void init_ps_display(const char *username, const char *dbname, - const char *host_info, const char *initial_str); + const char *host_info, const char *initial_str); extern void set_ps_display(const char *activity, bool force); extern const char *get_ps_display(int *displen); -#endif /* PS_STATUS_H */ +extern const char *get_ps_display_fixed(int *displen); + +#endif /* PS_STATUS_H */ From 0b24a5adeb3f42f34e1c579877670dc756eea222 Mon Sep 17 00:00:00 2001 From: aslanxli Date: Sun, 24 Apr 2022 10:14:09 +0800 Subject: [PATCH 545/578] fix multi-values insert error: Failing row contains (null, null). 
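The failing case is a multi-row VALUES insert whose cells arrive as bind parameters through SPI on a coordinator; the "Failing row contains (null, null)" detail in the subject suggests the bound values were lost on the way to the insert-into-COPY rewrite. A rough reproduction sketch follows; table, column names and values are invented for illustration and SPI error checking is omitted.

    /* Rough reproduction sketch of the failing scenario: a parameterized
     * multi-row INSERT driven through SPI on a coordinator.  Table, column
     * names and values are invented. */
    #include "postgres.h"
    #include "executor/spi.h"
    #include "catalog/pg_type.h"
    #include "utils/builtins.h"

    static void
    multi_values_insert_example(void)
    {
        Oid     argtypes[4] = {INT4OID, TEXTOID, INT4OID, TEXTOID};
        Datum   values[4];

        values[0] = Int32GetDatum(1);
        values[1] = CStringGetTextDatum("a");
        values[2] = Int32GetDatum(2);
        values[3] = CStringGetTextDatum("b");

        SPI_connect();
        /* Two VALUES rows, every cell supplied as a parameter. */
        SPI_execute_with_args(
            "INSERT INTO t_demo(id, val) VALUES ($1, $2), ($3, $4)",
            4, argtypes, values, NULL, false, 0);
        SPI_finish();
    }

The fix below threads the literal VALUES data (data_list) into the cached plan source and, in GetCachedPlan(), renders bound parameters with their type output functions before handing the rows to the COPY FROM path.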
TAPD:http://tapd.woa.com/pgxz/bugtrace/bugs/view?bug_id=1010092131098492799&jump_count=1 --- src/backend/executor/spi.c | 37 +++++++++- src/backend/nodes/copyfuncs.c | 16 ++++- src/backend/parser/analyze.c | 16 ++++- src/backend/utils/cache/plancache.c | 101 ++++++++++++++++++++++++++-- 4 files changed, 159 insertions(+), 11 deletions(-) diff --git a/src/backend/executor/spi.c b/src/backend/executor/spi.c index 808c75f8..fa81c2b1 100644 --- a/src/backend/executor/spi.c +++ b/src/backend/executor/spi.c @@ -34,7 +34,7 @@ #include "utils/snapmgr.h" #include "utils/syscache.h" #include "utils/typcache.h" - +#include "parser/analyze.h" uint64 SPI_processed = 0; Oid SPI_lastoid = InvalidOid; @@ -1881,6 +1881,41 @@ _SPI_pgxc_prepare_plan(const char *src, List *src_parsetree, SPIPlanPtr plan) _SPI_current->queryEnv); } + if (unlikely(IS_PGXC_COORDINATOR && list_length(stmt_list) == 1 + && IsA(parsetree->stmt, InsertStmt))) + { + Query *parse = (Query *)linitial(stmt_list); + /* + * set insert_into when we get multi-values insert, not + * often happen + */ + if (unlikely(parse->isMultiValues && !parse->hasUnshippableTriggers)) + { + MemoryContext old_ctx; + InsertStmt *iStmt = (InsertStmt*)parsetree->stmt; + InsertStmt *pStmt = (InsertStmt*)plansource->raw_parse_tree->stmt; + int colIdx = 0; + int rowIdx = 0; + + plansource->insert_into = true; + old_ctx = MemoryContextSwitchTo(plansource->context); + if (iStmt->data_list != NULL) + { + pStmt->data_list = (char ***)palloc(sizeof(char **) * iStmt->ndatarows); + for (rowIdx = 0; rowIdx < iStmt->ndatarows; rowIdx++) + { + pStmt->data_list[rowIdx] = (char **)palloc( + sizeof(char *) * iStmt->ninsert_columns); + for (colIdx = 0; colIdx < iStmt->ninsert_columns; colIdx++) + pStmt->data_list[rowIdx][colIdx] = pstrdup(iStmt->data_list[rowIdx][colIdx]); + } + } + pStmt->ndatarows = iStmt->ndatarows; + pStmt->ninsert_columns = iStmt->ninsert_columns; + MemoryContextSwitchTo(old_ctx); + } + } + /* Finish filling in the CachedPlanSource */ CompleteCachedPlan(plansource, stmt_list, diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c index 876c407e..5bc4e05c 100644 --- a/src/backend/nodes/copyfuncs.c +++ b/src/backend/nodes/copyfuncs.c @@ -3303,6 +3303,8 @@ static InsertStmt * _copyInsertStmt(const InsertStmt *from) { InsertStmt *newnode = makeNode(InsertStmt); + int colIdx = 0; + int rowIdx = 0; COPY_NODE_FIELD(relation); COPY_NODE_FIELD(cols); @@ -3313,8 +3315,20 @@ _copyInsertStmt(const InsertStmt *from) COPY_SCALAR_FIELD(override); #ifdef __TBASE__ COPY_SCALAR_FIELD(ninsert_columns); + if(from->data_list != NULL) + { + newnode->data_list = + (char ***)palloc(sizeof(char **) * from->ndatarows); + for (rowIdx = 0; rowIdx < from->ndatarows; rowIdx++) { + newnode->data_list[rowIdx] = + (char **)palloc(sizeof(char *) * from->ninsert_columns); + for (colIdx = 0; colIdx < from->ninsert_columns; colIdx++) + newnode->data_list[rowIdx][colIdx] = + pstrdup(from->data_list[rowIdx][colIdx]); + } + } + COPY_SCALAR_FIELD(ndatarows); #endif - return newnode; } diff --git a/src/backend/parser/analyze.c b/src/backend/parser/analyze.c index 539cd7c8..6e2cf055 100644 --- a/src/backend/parser/analyze.c +++ b/src/backend/parser/analyze.c @@ -875,6 +875,7 @@ transformInsertStmt(ParseState *pstate, InsertStmt *stmt) List *colcollations = NIL; int sublist_length = -1; bool lateral = false; + bool all_params = true; Assert(selectStmt->intoClause == NULL); @@ -1005,11 +1006,12 @@ transformInsertStmt(ParseState *pstate, InsertStmt *stmt) switch(nodeTag(v)) { case 
T_A_Const: + all_params = false; break; case T_TypeCast: { TypeCast *cast = (TypeCast *)v; - + all_params = false; if (IsA(cast->arg, A_Const)) { v = (A_Const *)cast->arg; @@ -1020,6 +1022,9 @@ transformInsertStmt(ParseState *pstate, InsertStmt *stmt) } break; } + case T_ParamRef: + copy_from = all_params; + break; default: { copy_from = false; @@ -1034,7 +1039,9 @@ transformInsertStmt(ParseState *pstate, InsertStmt *stmt) } index++; - + if(all_params){ + continue; + } /* A_Const */ switch(v->val.type) { @@ -1092,7 +1099,7 @@ transformInsertStmt(ParseState *pstate, InsertStmt *stmt) if (copy_from) { - if (ndatarows != column_index) + if (ndatarows != column_index && !all_params) { elog(ERROR, "datarow count mismatched, expected %d, result %d", ndatarows, column_index); @@ -1100,7 +1107,10 @@ transformInsertStmt(ParseState *pstate, InsertStmt *stmt) qry->copy_filename = palloc(MAXPGPATH); snprintf(qry->copy_filename, MAXPGPATH, "%s", "Insert_into to Copy_from(Simple Protocl)"); stmt->ndatarows = ndatarows; + if(!all_params) stmt->data_list = data_list; + else + goto TRANSFORM_VALUELISTS; } else { diff --git a/src/backend/utils/cache/plancache.c b/src/backend/utils/cache/plancache.c index b69aa7cb..39915901 100644 --- a/src/backend/utils/cache/plancache.c +++ b/src/backend/utils/cache/plancache.c @@ -71,6 +71,7 @@ #include "utils/rls.h" #include "utils/snapmgr.h" #include "utils/syscache.h" +#include "utils/lsyscache.h" #ifdef PGXC #include "commands/prepare.h" #include "pgxc/execRemote.h" @@ -956,7 +957,7 @@ BuildCachedPlan(CachedPlanSource *plansource, List *qlist, MemoryContext plan_context; MemoryContext oldcxt = CurrentMemoryContext; ListCell *lc; - + char ***data_list = NULL; /* * Normally the querytree should be valid already, but if it's not, * rebuild it. @@ -1004,6 +1005,18 @@ BuildCachedPlan(CachedPlanSource *plansource, List *qlist, */ plist = pg_plan_queries(qlist, plansource->cursor_options, boundParams); + /* + * When get the cached multi-values insert plan, we transform insert to copyfrom plan + */ + if (plansource->insert_into && plansource->raw_parse_tree != NULL + && IsA(plansource->raw_parse_tree->stmt, InsertStmt)) + { + InsertStmt *iStmt = + (InsertStmt *)plansource->raw_parse_tree->stmt; + Query *query = (Query*) linitial(qlist); + bool suc = false; + plist=transformInsertValuesIntoCopyFrom(NULL, iStmt, &suc, query->copy_filename, query); + } /* Release snapshot if we got one */ if (snapshot_set) PopActiveSnapshot(); @@ -1024,12 +1037,17 @@ BuildCachedPlan(CachedPlanSource *plansource, List *qlist, * Copy plan into the new context. 
*/ MemoryContextSwitchTo(plan_context); - + /* + * when we got a CopyStmt tansformed from multi values InsertStmt, + * no need copy data_list, we set later + */ plist = copyObject(plist); } else plan_context = CurrentMemoryContext; + + #ifdef PGXC /* * If this plansource belongs to a named prepared statement, store the stmt @@ -1355,7 +1373,78 @@ GetCachedPlan(CachedPlanSource *plansource, ParamListInfo boundParams, } Assert(plan != NULL); + if (plansource->insert_into && plansource->raw_parse_tree != NULL && + IsA(plansource->raw_parse_tree->stmt, InsertStmt)) { + MemoryContext old_top; + InsertStmt *iStmt = (InsertStmt *)plansource->raw_parse_tree->stmt; + char ***data_list = NULL; + PlannedStmt *planstmt = (PlannedStmt *)linitial(plan->stmt_list); + CopyStmt *copyStmt = (CopyStmt *)planstmt->utilityStmt; + /* + * we got parameters passed in, need trans them into data_list in + * InsertStmt, then trans the insertStmt to copyStmt + */ + if (boundParams != NULL) + { + int colCnt = iStmt->ninsert_columns; + int i = 0; + char *valStr = NULL; + int colIdx = 0; + int rowIdx = 0; + + if (colCnt == 0 || boundParams->numParams == 0 || + boundParams->numParams % colCnt != 0) + plansource->insert_into = false; + + old_top = MemoryContextSwitchTo(TopTransactionContext); + data_list = (char ***)palloc0(sizeof(char **) * + (boundParams->numParams / colCnt)); + for (i = 0; i < (boundParams->numParams / colCnt); i++) + { + data_list[i] = (char **)palloc0(sizeof(char *) * colCnt); + } + for (i = 0; i < boundParams->numParams; i++) + { + Oid typOutput; + bool typIsVarlena; + Datum value; + Oid ptype = boundParams->params[i].ptype; + getTypeOutputInfo(ptype, &typOutput, &typIsVarlena); + + if(typIsVarlena) + { + value = PointerGetDatum(PG_DETOAST_DATUM(boundParams->params[i].value)); + } + else + { + value = boundParams->params[i].value; + } + + if (boundParams->params[i].isnull) + data_list[rowIdx][colIdx++] = NULL; + else { + valStr = OidOutputFunctionCall(typOutput, value); + data_list[rowIdx][colIdx++] = pstrdup(valStr); + } + if (colIdx >= colCnt) + { + colIdx = 0; + rowIdx++; + } + } + copyStmt->data_list = data_list; + copyStmt->ndatarows = rowIdx; + copyStmt->ncolumns = colCnt; + MemoryContextSwitchTo(old_top); + } + else if(iStmt->data_list != NULL) + { + copyStmt->data_list = iStmt->data_list; + copyStmt->ndatarows = iStmt->ndatarows; + copyStmt->ncolumns = iStmt->ninsert_columns; + } + } /* Flag the plan as in use by caller */ if (useResOwner) ResourceOwnerEnlargePlanCacheRefs(CurrentResourceOwner); @@ -1364,10 +1453,10 @@ GetCachedPlan(CachedPlanSource *plansource, ParamListInfo boundParams, ResourceOwnerRememberPlanCacheRef(CurrentResourceOwner, plan); /* - * Saved plans should be under CacheMemoryContext so they will not go away - * until their reference count goes to zero. In the generic-plan cases we - * already took care of that, but for a custom plan, do it as soon as we - * have created a reference-counted link. + * Saved plans should be under CacheMemoryContext so they will not go + * away until their reference count goes to zero. In the generic-plan + * cases we already took care of that, but for a custom plan, do it as + * soon as we have created a reference-counted link. */ if (customplan && plansource->is_saved) { From 7a91cc7edd9747643999b1dd2e1a897ece0cf9a8 Mon Sep 17 00:00:00 2001 From: aslanxli Date: Sun, 24 Apr 2022 15:40:59 +0800 Subject: [PATCH 546/578] fix multi-values insert error: Failing row contains (null, null). 
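For reference, the data_list that this follow-up factors into _SPI_multi_insert_rewrite() is a per-row array of per-column text cells (ndatarows by ninsert_columns). A sketch of its shape, with invented values:

    /* Sketch of the InsertStmt data_list layout that _SPI_multi_insert_rewrite()
     * deep-copies into the plan source: ndatarows entries, each holding
     * ninsert_columns nul-terminated text cells.  Values are invented. */
    #include "postgres.h"

    static char ***
    build_demo_data_list(int *ndatarows, int *ninsert_columns)
    {
        char ***data_list;
        int     row;

        *ndatarows = 2;
        *ninsert_columns = 2;

        data_list = (char ***) palloc(sizeof(char **) * *ndatarows);
        for (row = 0; row < *ndatarows; row++)
            data_list[row] = (char **) palloc(sizeof(char *) * *ninsert_columns);

        data_list[0][0] = pstrdup("1");
        data_list[0][1] = pstrdup("a");
        data_list[1][0] = pstrdup("2");
        data_list[1][1] = pstrdup("b");

        return data_list;   /* [row][column] -> textual cell value */
    }

This patch also backs out the bind-parameter path added in the previous commit: when the VALUES rows contain ParamRefs the statement is left as a normal insert, as noted by the "TODO: now we don't support param" comment in _SPI_execute_plan() below.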
TAPD:http://tapd.woa.com/pgxz/bugtrace/bugs/view?bug_id=1010092131098492799&jump_count=1 --- src/backend/executor/spi.c | 104 ++++++++++++++++++---------- src/backend/parser/analyze.c | 15 +--- src/backend/utils/cache/plancache.c | 99 ++------------------------ 3 files changed, 77 insertions(+), 141 deletions(-) diff --git a/src/backend/executor/spi.c b/src/backend/executor/spi.c index fa81c2b1..510f1fcb 100644 --- a/src/backend/executor/spi.c +++ b/src/backend/executor/spi.c @@ -53,6 +53,10 @@ static Portal SPI_cursor_open_internal(const char *name, SPIPlanPtr plan, static void _SPI_pgxc_prepare_plan(const char *src, List *src_parsetree, SPIPlanPtr plan); #endif +static void _SPI_multi_insert_rewrite( + CachedPlanSource *plansource, RawStmt *parsetree, + List *stmtList); + static void _SPI_prepare_plan(const char *src, SPIPlanPtr plan); static void _SPI_prepare_oneshot_plan(const char *src, SPIPlanPtr plan); @@ -1881,40 +1885,7 @@ _SPI_pgxc_prepare_plan(const char *src, List *src_parsetree, SPIPlanPtr plan) _SPI_current->queryEnv); } - if (unlikely(IS_PGXC_COORDINATOR && list_length(stmt_list) == 1 - && IsA(parsetree->stmt, InsertStmt))) - { - Query *parse = (Query *)linitial(stmt_list); - /* - * set insert_into when we get multi-values insert, not - * often happen - */ - if (unlikely(parse->isMultiValues && !parse->hasUnshippableTriggers)) - { - MemoryContext old_ctx; - InsertStmt *iStmt = (InsertStmt*)parsetree->stmt; - InsertStmt *pStmt = (InsertStmt*)plansource->raw_parse_tree->stmt; - int colIdx = 0; - int rowIdx = 0; - - plansource->insert_into = true; - old_ctx = MemoryContextSwitchTo(plansource->context); - if (iStmt->data_list != NULL) - { - pStmt->data_list = (char ***)palloc(sizeof(char **) * iStmt->ndatarows); - for (rowIdx = 0; rowIdx < iStmt->ndatarows; rowIdx++) - { - pStmt->data_list[rowIdx] = (char **)palloc( - sizeof(char *) * iStmt->ninsert_columns); - for (colIdx = 0; colIdx < iStmt->ninsert_columns; colIdx++) - pStmt->data_list[rowIdx][colIdx] = pstrdup(iStmt->data_list[rowIdx][colIdx]); - } - } - pStmt->ndatarows = iStmt->ndatarows; - pStmt->ninsert_columns = iStmt->ninsert_columns; - MemoryContextSwitchTo(old_ctx); - } - } + _SPI_multi_insert_rewrite(plansource, parsetree, stmt_list); /* Finish filling in the CachedPlanSource */ CompleteCachedPlan(plansource, @@ -2114,7 +2085,7 @@ _SPI_execute_plan(SPIPlanPtr plan, ParamListInfo paramLI, plan->nargs, _SPI_current->queryEnv); } - + _SPI_multi_insert_rewrite(plansource, parsetree, stmt_list); /* Finish filling in the CachedPlanSource */ CompleteCachedPlan(plansource, stmt_list, @@ -2132,6 +2103,25 @@ _SPI_execute_plan(SPIPlanPtr plan, ParamListInfo paramLI, * plan, the refcount must be backed by the CurrentResourceOwner. 
*/ cplan = GetCachedPlan(plansource, paramLI, plan->saved, _SPI_current->queryEnv); + /* + * TODO: now we don't support param, if multi values contains paramref, do not + * transform to CopyStmt, refactor later + */ + if (plansource->insert_into && plansource->raw_parse_tree != NULL && + IsA(plansource->raw_parse_tree->stmt, InsertStmt)) + { + bool suc; + InsertStmt *iStmt = (InsertStmt *) plansource->raw_parse_tree->stmt; + PlannedStmt *pStmt = (PlannedStmt *) linitial(cplan->stmt_list); + Query *query = (Query*) linitial(plansource->query_list); + if (!(pStmt->utilityStmt && IsA(pStmt->utilityStmt, CopyStmt)) && iStmt->data_list != NULL) + { + MemoryContext old_ctx; + old_ctx = MemoryContextSwitchTo(plansource->context); + cplan->stmt_list = transformInsertValuesIntoCopyFrom(NULL, iStmt, &suc, query->copy_filename, query); + MemoryContextSwitchTo(old_ctx); + } + } stmt_list = cplan->stmt_list; /* @@ -2889,3 +2879,47 @@ SPI_register_trigger_data(TriggerData *tdata) return SPI_OK_TD_REGISTER; } + +/* + * _SPI_multi_insert_rewrite + * If current stmt is a multi-line insert statement, copy the + * datalist to the raw_parse_tree in plansource and set plansource->insert_into + */ +static void _SPI_multi_insert_rewrite(CachedPlanSource *plansource, + RawStmt *parsetree, List *stmtList) +{ + if (IS_PGXC_COORDINATOR && list_length(stmtList) == 1 + && IsA(parsetree->stmt, InsertStmt)) + { + Query *parse = (Query *)linitial(stmtList); + /* + * set insert_into when we get multi-values insert, not + * often happen + */ + if (unlikely(parse->isMultiValues && !parse->hasUnshippableTriggers)) + { + MemoryContext old_ctx; + InsertStmt *iStmt = (InsertStmt*)parsetree->stmt; + InsertStmt *pStmt = (InsertStmt*)plansource->raw_parse_tree->stmt; + int colIdx = 0; + int rowIdx = 0; + + plansource->insert_into = true; + old_ctx = MemoryContextSwitchTo(plansource->context); + if (iStmt->data_list != NULL) + { + pStmt->data_list = (char ***)palloc(sizeof(char **) * iStmt->ndatarows); + for (rowIdx = 0; rowIdx < iStmt->ndatarows; rowIdx++) + { + pStmt->data_list[rowIdx] = (char **)palloc( + sizeof(char *) * iStmt->ninsert_columns); + for (colIdx = 0; colIdx < iStmt->ninsert_columns; colIdx++) + pStmt->data_list[rowIdx][colIdx] = pstrdup(iStmt->data_list[rowIdx][colIdx]); + } + } + pStmt->ndatarows = iStmt->ndatarows; + pStmt->ninsert_columns = iStmt->ninsert_columns; + MemoryContextSwitchTo(old_ctx); + } + } +} diff --git a/src/backend/parser/analyze.c b/src/backend/parser/analyze.c index 6e2cf055..f5a26ad3 100644 --- a/src/backend/parser/analyze.c +++ b/src/backend/parser/analyze.c @@ -648,6 +648,7 @@ transformInsertStmt(ParseState *pstate, InsertStmt *stmt) qry->isSingleValues = false; qry->isMultiValues = false; stmt->ninsert_columns = 0; + qry->copy_filename = NULL; #endif /* process the WITH clause independently of all else */ @@ -875,7 +876,6 @@ transformInsertStmt(ParseState *pstate, InsertStmt *stmt) List *colcollations = NIL; int sublist_length = -1; bool lateral = false; - bool all_params = true; Assert(selectStmt->intoClause == NULL); @@ -1006,12 +1006,10 @@ transformInsertStmt(ParseState *pstate, InsertStmt *stmt) switch(nodeTag(v)) { case T_A_Const: - all_params = false; break; case T_TypeCast: { TypeCast *cast = (TypeCast *)v; - all_params = false; if (IsA(cast->arg, A_Const)) { v = (A_Const *)cast->arg; @@ -1022,9 +1020,6 @@ transformInsertStmt(ParseState *pstate, InsertStmt *stmt) } break; } - case T_ParamRef: - copy_from = all_params; - break; default: { copy_from = false; @@ -1039,9 +1034,6 @@ 
transformInsertStmt(ParseState *pstate, InsertStmt *stmt) } index++; - if(all_params){ - continue; - } /* A_Const */ switch(v->val.type) { @@ -1099,7 +1091,7 @@ transformInsertStmt(ParseState *pstate, InsertStmt *stmt) if (copy_from) { - if (ndatarows != column_index && !all_params) + if (ndatarows != column_index) { elog(ERROR, "datarow count mismatched, expected %d, result %d", ndatarows, column_index); @@ -1107,10 +1099,7 @@ transformInsertStmt(ParseState *pstate, InsertStmt *stmt) qry->copy_filename = palloc(MAXPGPATH); snprintf(qry->copy_filename, MAXPGPATH, "%s", "Insert_into to Copy_from(Simple Protocl)"); stmt->ndatarows = ndatarows; - if(!all_params) stmt->data_list = data_list; - else - goto TRANSFORM_VALUELISTS; } else { diff --git a/src/backend/utils/cache/plancache.c b/src/backend/utils/cache/plancache.c index 39915901..ac2037a4 100644 --- a/src/backend/utils/cache/plancache.c +++ b/src/backend/utils/cache/plancache.c @@ -71,7 +71,6 @@ #include "utils/rls.h" #include "utils/snapmgr.h" #include "utils/syscache.h" -#include "utils/lsyscache.h" #ifdef PGXC #include "commands/prepare.h" #include "pgxc/execRemote.h" @@ -957,7 +956,7 @@ BuildCachedPlan(CachedPlanSource *plansource, List *qlist, MemoryContext plan_context; MemoryContext oldcxt = CurrentMemoryContext; ListCell *lc; - char ***data_list = NULL; + /* * Normally the querytree should be valid already, but if it's not, * rebuild it. @@ -1005,18 +1004,6 @@ BuildCachedPlan(CachedPlanSource *plansource, List *qlist, */ plist = pg_plan_queries(qlist, plansource->cursor_options, boundParams); - /* - * When get the cached multi-values insert plan, we transform insert to copyfrom plan - */ - if (plansource->insert_into && plansource->raw_parse_tree != NULL - && IsA(plansource->raw_parse_tree->stmt, InsertStmt)) - { - InsertStmt *iStmt = - (InsertStmt *)plansource->raw_parse_tree->stmt; - Query *query = (Query*) linitial(qlist); - bool suc = false; - plist=transformInsertValuesIntoCopyFrom(NULL, iStmt, &suc, query->copy_filename, query); - } /* Release snapshot if we got one */ if (snapshot_set) PopActiveSnapshot(); @@ -1037,10 +1024,7 @@ BuildCachedPlan(CachedPlanSource *plansource, List *qlist, * Copy plan into the new context. 
*/ MemoryContextSwitchTo(plan_context); - /* - * when we got a CopyStmt tansformed from multi values InsertStmt, - * no need copy data_list, we set later - */ + plist = copyObject(plist); } else @@ -1373,78 +1357,7 @@ GetCachedPlan(CachedPlanSource *plansource, ParamListInfo boundParams, } Assert(plan != NULL); - if (plansource->insert_into && plansource->raw_parse_tree != NULL && - IsA(plansource->raw_parse_tree->stmt, InsertStmt)) { - MemoryContext old_top; - InsertStmt *iStmt = (InsertStmt *)plansource->raw_parse_tree->stmt; - char ***data_list = NULL; - PlannedStmt *planstmt = (PlannedStmt *)linitial(plan->stmt_list); - CopyStmt *copyStmt = (CopyStmt *)planstmt->utilityStmt; - /* - * we got parameters passed in, need trans them into data_list in - * InsertStmt, then trans the insertStmt to copyStmt - */ - if (boundParams != NULL) - { - int colCnt = iStmt->ninsert_columns; - int i = 0; - char *valStr = NULL; - int colIdx = 0; - int rowIdx = 0; - - if (colCnt == 0 || boundParams->numParams == 0 || - boundParams->numParams % colCnt != 0) - plansource->insert_into = false; - - old_top = MemoryContextSwitchTo(TopTransactionContext); - data_list = (char ***)palloc0(sizeof(char **) * - (boundParams->numParams / colCnt)); - for (i = 0; i < (boundParams->numParams / colCnt); i++) - { - data_list[i] = (char **)palloc0(sizeof(char *) * colCnt); - } - for (i = 0; i < boundParams->numParams; i++) - { - Oid typOutput; - bool typIsVarlena; - Datum value; - Oid ptype = boundParams->params[i].ptype; - getTypeOutputInfo(ptype, &typOutput, &typIsVarlena); - - if(typIsVarlena) - { - value = PointerGetDatum(PG_DETOAST_DATUM(boundParams->params[i].value)); - } - else - { - value = boundParams->params[i].value; - } - - if (boundParams->params[i].isnull) - data_list[rowIdx][colIdx++] = NULL; - else { - valStr = OidOutputFunctionCall(typOutput, value); - data_list[rowIdx][colIdx++] = pstrdup(valStr); - } - if (colIdx >= colCnt) - { - colIdx = 0; - rowIdx++; - } - } - copyStmt->data_list = data_list; - copyStmt->ndatarows = rowIdx; - copyStmt->ncolumns = colCnt; - MemoryContextSwitchTo(old_top); - } - else if(iStmt->data_list != NULL) - { - copyStmt->data_list = iStmt->data_list; - copyStmt->ndatarows = iStmt->ndatarows; - copyStmt->ncolumns = iStmt->ninsert_columns; - } - } /* Flag the plan as in use by caller */ if (useResOwner) ResourceOwnerEnlargePlanCacheRefs(CurrentResourceOwner); @@ -1453,10 +1366,10 @@ GetCachedPlan(CachedPlanSource *plansource, ParamListInfo boundParams, ResourceOwnerRememberPlanCacheRef(CurrentResourceOwner, plan); /* - * Saved plans should be under CacheMemoryContext so they will not go - * away until their reference count goes to zero. In the generic-plan - * cases we already took care of that, but for a custom plan, do it as - * soon as we have created a reference-counted link. + * Saved plans should be under CacheMemoryContext so they will not go away + * until their reference count goes to zero. In the generic-plan cases we + * already took care of that, but for a custom plan, do it as soon as we + * have created a reference-counted link. */ if (customplan && plansource->is_saved) { From fba17235e072054fb8f820760c8f04ee08f18005 Mon Sep 17 00:00:00 2001 From: whalesong Date: Sun, 24 Apr 2022 16:35:29 +0800 Subject: [PATCH 547/578] Revert "bugfix: server time different cause 2pc clean error (merge request 1170), http://tapd.oa.com/20421696/bugtrace/bugs/view?bug_id=1020421696096815567" This reverts commit f442a37db1fe4f279b63b421a3a9f3305eb6c636. 
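Concretely, the revert drops the GTS-based bookkeeping that commit introduced (the global_prepare_timestamp field in the 2pc file and xlog record, pgxc_get_2pc_prepare_timestamp(), the start-xid liveness checks) and returns pg_clean to comparing server-local timestamps when deciding whether a prepared transaction may be cleaned. A minimal sketch of the restored check in check_txn_global_status(), condensed from the fields as they appear after the revert:

    /* restored behaviour: compare the caller-supplied abnormal time with the
     * node-local prepare timestamp recorded for this transaction */
    node_idx = find_node_index(abnormal_nodeoid);
    if (node_idx >= 0 &&
        abnormal_time < txn->prepare_timestamp[node_idx])
        return TXN_STATUS_INPROGRESS;   /* prepared after the failure point, do not clean */

Because abnormal_time and prepare_timestamp are wall-clock values taken on different servers, this comparison is again sensitive to clock skew between nodes, which is the condition the reverted commit had addressed.
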
--- contrib/pg_clean/pg_clean--1.0.sql | 6 - .../pg_clean/pg_clean--unpackaged--1.0.sql | 1 - contrib/pg_clean/pg_clean.c | 934 ++++-------------- src/backend/access/transam/twophase.c | 51 +- src/backend/access/transam/xlog.c | 42 +- src/backend/pgxc/pool/execRemote.c | 49 +- src/backend/postmaster/clean2pc.c | 45 +- src/backend/utils/misc/guc.c | 20 +- src/include/access/twophase.h | 4 - 9 files changed, 242 insertions(+), 910 deletions(-) diff --git a/contrib/pg_clean/pg_clean--1.0.sql b/contrib/pg_clean/pg_clean--1.0.sql index be8623f7..e5bbc9ca 100644 --- a/contrib/pg_clean/pg_clean--1.0.sql +++ b/contrib/pg_clean/pg_clean--1.0.sql @@ -50,11 +50,6 @@ RETURNS text AS 'MODULE_PATHNAME' LANGUAGE C; -CREATE FUNCTION pgxc_get_2pc_prepare_timestamp(gid text) -RETURNS text -AS 'MODULE_PATHNAME' -LANGUAGE C; - CREATE FUNCTION pgxc_get_2pc_commit_timestamp(gid text) RETURNS text AS 'MODULE_PATHNAME' @@ -101,7 +96,6 @@ GRANT ALL ON FUNCTION pg_clean_check_txn(time_interval integer) TO PUBLIC; GRANT ALL ON FUNCTION pgxc_get_2pc_nodes(gid text) TO PUBLIC; GRANT ALL ON FUNCTION pgxc_get_2pc_startnode(gid text) TO PUBLIC; GRANT ALL ON FUNCTION pgxc_get_2pc_startxid(gid text) TO PUBLIC; -GRANT ALL ON FUNCTION pgxc_get_2pc_prepare_timestamp(gid text) TO PUBLIC; GRANT ALL ON FUNCTION pgxc_get_2pc_commit_timestamp(gid text) TO PUBLIC; GRANT ALL ON FUNCTION pgxc_get_2pc_xid(gid text) TO PUBLIC; GRANT ALL ON FUNCTION pgxc_get_2pc_file(gid text) TO PUBLIC; diff --git a/contrib/pg_clean/pg_clean--unpackaged--1.0.sql b/contrib/pg_clean/pg_clean--unpackaged--1.0.sql index d173a607..a6a67659 100644 --- a/contrib/pg_clean/pg_clean--unpackaged--1.0.sql +++ b/contrib/pg_clean/pg_clean--unpackaged--1.0.sql @@ -9,7 +9,6 @@ ALTER EXTENSION pg_clean ADD function pg_clean_check_txn(time_interval integer); ALTER EXTENSION pg_clean ADD function pgxc_get_2pc_nodes(gid text); ALTER EXTENSION pg_clean ADD function pgxc_get_2pc_startnode(gid text); ALTER EXTENSION pg_clean ADD function pgxc_get_2pc_startxid(gid text); -ALTER EXTENSION pg_clean ADD function pgxc_get_2pc_prepare_timestamp(gid text); ALTER EXTENSION pg_clean ADD function pgxc_get_2pc_commit_timestamp(gid text); ALTER EXTENSION pg_clean ADD function pgxc_get_2pc_xid(gid text); ALTER EXTENSION pg_clean ADD function pgxc_get_2pc_file(gid text); diff --git a/contrib/pg_clean/pg_clean.c b/contrib/pg_clean/pg_clean.c index 4ee21911..5a20456e 100644 --- a/contrib/pg_clean/pg_clean.c +++ b/contrib/pg_clean/pg_clean.c @@ -63,14 +63,8 @@ int transaction_threshold = 200000; #define MAXIMUM_OUTPUT_FILE 1000 #define XIDPREFIX "_$XC$" #define DEFAULT_CLEAN_TIME_INTERVAL 120 - -#ifdef __TWO_PHASE_TESTS__ -#define LEAST_CLEAN_TIME_INTERVAL 1 /* should not clean twophase trans prepared in 1s */ +#define LEAST_CLEAN_TIME_INTERVAL 3 /* should not clean twophase trans prepared in 3s */ #define LEAST_CHECK_TIME_INTERVAL 1 /* should not check twophase trans prepared in 1s */ -#else -#define LEAST_CLEAN_TIME_INTERVAL 10 /* should not clean twophase trans prepared in 10s */ -#define LEAST_CHECK_TIME_INTERVAL 3 /* should not check twophase trans prepared in 3s */ -#endif GlobalTimestamp clean_time_interval = DEFAULT_CLEAN_TIME_INTERVAL * USECS_PER_SEC; @@ -78,15 +72,19 @@ PG_MODULE_MAGIC; #define MAX_GID 64 +#define CLEAN_CHECK_TIMES_DEFAULT 3 +#define CLEAN_CHECK_INTERVAL_DEFAULT 100000 + +#define CLEAN_NODE_CHECK_TIMES 5 +#define CLEAN_NODE_CHECK_INTERVAL 500000 + #define MAX_DBNAME 64 #define GET_START_XID "startxid:" -#define GET_PREPARE_TIMESTAMP "global_prepare_timestamp:" 
#define GET_COMMIT_TIMESTAMP "global_commit_timestamp:" #define GET_START_NODE "startnode:" #define GET_NODE "nodes:" #define GET_XID "\nxid:" #define GET_READONLY "readonly" -#define ROLLBACK_POSTFIX ".rollback" /* 2pc file postfix when the 2pc is rollbacked */ #define GIDSIZE (200 + 24) #define MAX_TWOPC_TXN 1000 #define STRING_BUFF_LEN 1024 @@ -192,7 +190,6 @@ typedef struct txn_info TXN_STATUS *txn_stat; /* Array for each nodes */ char *msg; /* Notice message for this txn. */ GlobalTimestamp global_commit_timestamp; /* get global_commit_timestamp from node once it is committed*/ - GlobalTimestamp global_prepare_timestamp; /* get global_prepare_timestamp from node once it is prepared*/ TXN_STATUS global_txn_stat; OPERATION op; @@ -265,10 +262,8 @@ database_info *last_database_info = NULL; bool execute = false; int total_twopc_txn = 0; -TimestampTz current_time = 0; -TimestampTz abnormal_time = 0; -GlobalTimestamp current_gts = InvalidGlobalTimestamp; /* use to save current gts */ -GlobalTimestamp abnormal_gts = InvalidGlobalTimestamp; /* use to save abnormal gts, clean 2PCs which prepare gts less than abnormal gts */ +TimestampTz current_time; +GlobalTimestamp abnormal_time = InvalidGlobalTimestamp; char *abnormal_nodename = NULL; Oid abnormal_nodeoid = InvalidOid; bool clear_2pc_belong_node = false; @@ -346,14 +341,6 @@ static void static void get_node_handles(PGXCNodeAllHandles ** pgxc_handles, Oid nodeoid); -uint32 get_start_xid_from_gid(char *gid); -char *get_start_node_from_gid(char *gid); -Oid get_start_node_oid_from_gid(char *gid); - -bool is_xid_running_on_node(uint32 xid, Oid node_oid); -bool is_gid_start_xid_running(char *gid); -bool is_txn_start_xid_running(txn_info *txn); - Datum pg_clean_execute(PG_FUNCTION_ARGS); PG_FUNCTION_INFO_V1(pg_clean_execute); Datum pg_clean_execute(PG_FUNCTION_ARGS) @@ -507,7 +494,6 @@ Datum pg_clean_execute_on_node(PG_FUNCTION_ARGS) char txn_status[100]; char txn_op[100]; char txn_op_issuccess[100]; - int64 time_gap = 0; Datum values[ACCESS_CONTROL_ATTR_NUM]; bool nulls[ACCESS_CONTROL_ATTR_NUM]; @@ -554,36 +540,21 @@ Datum pg_clean_execute_on_node(PG_FUNCTION_ARGS) execute = true; clear_2pc_belong_node = true; - if (0 == PG_GETARG_DATUM(0)) - { - elog(ERROR, "pg_clean_execute_on_node: node name is empty"); - } abnormal_nodename = text_to_cstring(PG_GETARG_TEXT_P(0)); abnormal_nodeoid = get_pgxc_nodeoid(abnormal_nodename); if (InvalidOid == abnormal_nodeoid) { - elog(ERROR, "pg_clean_execute_on_node, cannot clear 2pc of " - "invalid nodename '%s'", abnormal_nodename); + elog(ERROR, "pg_clean_execute_on_node, cannot clear 2pc of invalid nodename '%s'", abnormal_nodename); } abnormal_time = PG_GETARG_INT64(1); current_time = GetCurrentTimestamp(); - time_gap = current_time - abnormal_time; - if (time_gap < LEAST_CLEAN_TIME_INTERVAL * USECS_PER_SEC) + if (abnormal_time >= current_time - LEAST_CLEAN_TIME_INTERVAL * USECS_PER_SEC) { - /*time gap less than LEAST_CLEAN_TIME_INTERVAL, can not clean*/ - elog(ERROR, "pg_clean_execute_on_node, least clean interval is %ds, " - "abnormal time: " INT64_FORMAT ", current time: " INT64_FORMAT, + elog(ERROR, "pg_clean_execute_on_node, least clean time interval is %ds, " + "abnormal time: " INT64_FORMAT ", current_time: " INT64_FORMAT, LEAST_CLEAN_TIME_INTERVAL, abnormal_time, current_time); } - current_gts = GetGlobalTimestampGTM(); - if (!GlobalTimestampIsValid(current_gts)) - { - /*get invalid gts, can not clean*/ - elog(ERROR, "pg_clean_execute_on_node, get invalid gts"); - } - abnormal_gts = current_gts - 
time_gap; - /*get node list*/ PgxcNodeGetOids(&cn_node_list, &dn_node_list, &cn_nodes_num, &dn_nodes_num, true); @@ -799,9 +770,7 @@ static void ResetGlobalVariables(void) head_database_info = last_database_info = NULL; current_time = 0; - abnormal_time = 0; - current_gts = InvalidGlobalTimestamp; - abnormal_gts = InvalidGlobalTimestamp; + abnormal_time = InvalidGlobalTimestamp; abnormal_nodename = NULL; abnormal_nodeoid = InvalidOid; clear_2pc_belong_node = false; @@ -953,7 +922,7 @@ static void getDatabaseList(void) { int i; TupleTableSlots result_db; - const char *query_db = "select datname::text from pg_catalog.pg_database"; + const char *query_db = "select datname::text from pg_database;"; /*add datname into tail of head_database_info*/ if (execute_query_on_single_node(my_nodeoid, query_db, 1, &result_db) == (Datum) 1) { @@ -1010,12 +979,6 @@ static void getTxnInfoOnNodesAll(void) { int i; current_time = GetCurrentTimestamp(); - current_gts = GetGlobalTimestampGTM(); - if (!GlobalTimestampIsValid(current_gts)) - { - /*get invalid gts, get txn info error*/ - elog(ERROR, "getTxnInfoOnNodesAll, get invalid gts"); - } /*upload 2PC transaction from CN*/ for (i = 0; i < cn_nodes_num; i++) { @@ -1039,12 +1002,10 @@ void getTxnInfoOnNode(Oid node) TupleTableSlots result_txn; Datum execute_res; char query_execute[1024]; - const char *query_txn_status = "select transaction::text, gid::text, " - "owner::text, database::text, timestamptz_out(prepared)::text " - "from pg_catalog.pg_prepared_xacts"; - const char *query_txn_status_execute = "select transaction::text, gid::text, " - "owner::text, database::text, timestamptz_out(prepared)::text " - "from pg_catalog.pg_prepared_xacts where database = '%s'"; + const char *query_txn_status = "select transaction::text, gid::text, owner::text, database::text, timestamptz_out(prepared)::text " + "from pg_prepared_xacts;"; + const char *query_txn_status_execute = "select transaction::text, gid::text, owner::text, database::text, timestamptz_out(prepared)::text " + "from pg_prepared_xacts where database = '%s';"; snprintf(query_execute, 1024, query_txn_status_execute, get_database_name(MyDatabaseId)); if (execute) @@ -1145,7 +1106,6 @@ TWOPHASE_FILE_STATUS GetTransactionPartNodes(txn_info *txn, Oid node_oid) char *file_content = NULL; uint32 startxid = 0; char *str_startxid = NULL; - char *str_prepare_gts = NULL; char *str_timestamp = NULL; char *temp = NULL; Oid temp_nodeoid; @@ -1153,7 +1113,7 @@ TWOPHASE_FILE_STATUS GetTransactionPartNodes(txn_info *txn, Oid node_oid) int temp_nodeidx; char stmt[1024]; static const char *STMT_FORM = "select pgxc_get_2pc_file('%s')::text"; - snprintf(stmt, 1024, STMT_FORM, txn->gid); + snprintf(stmt, 1024, STMT_FORM, txn->gid, txn->gid, txn->gid, txn->gid); if (execute_query_on_single_node(node_oid, stmt, 1, &result) == (Datum) 1) { @@ -1166,12 +1126,6 @@ TWOPHASE_FILE_STATUS GetTransactionPartNodes(txn_info *txn, Oid node_oid) { file_content = TTSgetvalue(&result, 0, 0); - if (strlen(file_content) == 0) - { - elog(LOG, "gid: %s, 2pc file is not exist", txn->gid); - return TWOPHASE_FILE_NOT_EXISTS; - } - if (!IsXidImplicit(txn->gid) && strstr(file_content, GET_READONLY)) { txn->is_readonly = true; @@ -1181,7 +1135,6 @@ TWOPHASE_FILE_STATUS GetTransactionPartNodes(txn_info *txn, Oid node_oid) } startnode = strstr(file_content, GET_START_NODE); str_startxid = strstr(file_content, GET_START_XID); - str_prepare_gts = strstr(file_content, GET_PREPARE_TIMESTAMP); partnodes = strstr(file_content, GET_NODE); temp = 
strstr(file_content, GET_COMMIT_TIMESTAMP); @@ -1193,7 +1146,6 @@ TWOPHASE_FILE_STATUS GetTransactionPartNodes(txn_info *txn, Oid node_oid) temp = strstr(temp, GET_COMMIT_TIMESTAMP); } - /* get start node name */ if (startnode) { startnode += strlen(GET_START_NODE); @@ -1201,7 +1153,6 @@ TWOPHASE_FILE_STATUS GetTransactionPartNodes(txn_info *txn, Oid node_oid) txn->origcoord = get_pgxc_nodeoid(startnode); } - /* get start xid */ if (str_startxid) { str_startxid += strlen(GET_START_XID); @@ -1210,7 +1161,6 @@ TWOPHASE_FILE_STATUS GetTransactionPartNodes(txn_info *txn, Oid node_oid) txn->startxid = startxid; } - /* get participated nodes */ if (partnodes) { partnodes += strlen(GET_NODE); @@ -1233,37 +1183,15 @@ TWOPHASE_FILE_STATUS GetTransactionPartNodes(txn_info *txn, Oid node_oid) return res; } - /* get prepare gts */ - if (str_prepare_gts) - { - str_prepare_gts += strlen(GET_PREPARE_TIMESTAMP); - str_prepare_gts = strtok(str_prepare_gts, "\n"); - txn->global_prepare_timestamp = strtoull(str_prepare_gts, NULL, 10); - } - else - { - txn->global_prepare_timestamp = InvalidGlobalTimestamp; - } - - /* get commit gts */ if (str_timestamp) { str_timestamp += strlen(GET_COMMIT_TIMESTAMP); str_timestamp = strtok(str_timestamp, "\n"); txn->global_commit_timestamp = strtoull(str_timestamp, NULL, 10); } - else - { - txn->global_commit_timestamp = InvalidGlobalTimestamp; - } - - elog(DEBUG1, "get 2pc txn: %s, partnodes in nodename: %s(nodeoid:%u), " - "partnodes: (%s), startnode: %s(startnodeoid: %u), startxid: %u, " - "global_prepare_timestamp: %ld, global_commit_timestamp: %ld", - txn->gid, get_pgxc_nodename(node_oid), node_oid, - partnodes, startnode, txn->origcoord, startxid, - txn->global_prepare_timestamp, txn->global_commit_timestamp); + elog(DEBUG1, "get 2pc txn:%s partnodes in nodename: %s (nodeoid:%u) result: partnodes:%s, startnode:%s, startnodeoid:%u, startxid:%u", + txn->gid, get_pgxc_nodename(node_oid), node_oid, partnodes, startnode, txn->origcoord, startxid); /* in explicit transaction startnode participate the transaction */ if (strstr(partnodes, startnode) || !IsXidImplicit(txn->gid)) { @@ -1529,8 +1457,7 @@ void getTxnInfoOnOtherNodes(txn_info *txn) node_oid = get_pgxc_nodeoid(ptr); status = GetTransactionPartNodes(txn, node_oid); } - - if (status == TWOPHASE_FILE_NOT_EXISTS) + else { for (ii = 0; ii < cn_nodes_num + dn_nodes_num; ii++) { @@ -1695,7 +1622,7 @@ void getTxnStatus(txn_info *txn, int node_idx) TupleTableSlots result; static const char *STMT_FORM = "SELECT pgxc_is_committed('%d'::xid)::text"; - snprintf(stmt, 1024, STMT_FORM, txn->xid[node_idx]); + snprintf(stmt, 1024, STMT_FORM, txn->xid[node_idx], txn->xid[node_idx]); node_oid = find_node_oid(node_idx); if (0 != execute_query_on_single_node(node_oid, stmt, 1, &result)) @@ -1786,10 +1713,6 @@ char *get2PCInfo(const char *tid) return NULL; } -/* - * pgxc_get_2pc_file - * Get 2pc file content - */ Datum pgxc_get_2pc_file(PG_FUNCTION_ARGS); PG_FUNCTION_INFO_V1(pgxc_get_2pc_file); Datum pgxc_get_2pc_file(PG_FUNCTION_ARGS) @@ -1798,10 +1721,6 @@ Datum pgxc_get_2pc_file(PG_FUNCTION_ARGS) char *result = NULL; text *t_result = NULL; - if (0 == PG_GETARG_DATUM(0)) - { - elog(ERROR, "2PC gid is empty"); - } tid = text_to_cstring(PG_GETARG_TEXT_P(0)); result = get2PCInfo(tid); if (NULL != result) @@ -1813,10 +1732,7 @@ Datum pgxc_get_2pc_file(PG_FUNCTION_ARGS) PG_RETURN_NULL(); } -/* - * pgxc_get_2pc_nodes - * Get 2pc participants - */ + Datum pgxc_get_2pc_nodes(PG_FUNCTION_ARGS); PG_FUNCTION_INFO_V1(pgxc_get_2pc_nodes); Datum 
pgxc_get_2pc_nodes(PG_FUNCTION_ARGS) @@ -1826,10 +1742,6 @@ Datum pgxc_get_2pc_nodes(PG_FUNCTION_ARGS) char *nodename = NULL; text *t_result = NULL; - if (0 == PG_GETARG_DATUM(0)) - { - elog(ERROR, "2PC gid is empty"); - } tid = text_to_cstring(PG_GETARG_TEXT_P(0)); result = get2PCInfo(tid); if (NULL != result) @@ -1844,13 +1756,10 @@ Datum pgxc_get_2pc_nodes(PG_FUNCTION_ARGS) return PointerGetDatum(t_result); } } + PG_RETURN_NULL(); } -/* - * pgxc_get_2pc_startnode - * Get 2pc start node - */ Datum pgxc_get_2pc_startnode(PG_FUNCTION_ARGS); PG_FUNCTION_INFO_V1(pgxc_get_2pc_startnode); Datum pgxc_get_2pc_startnode(PG_FUNCTION_ARGS) @@ -1860,10 +1769,6 @@ Datum pgxc_get_2pc_startnode(PG_FUNCTION_ARGS) char *nodename = NULL; text *t_result = NULL; - if (0 == PG_GETARG_DATUM(0)) - { - elog(ERROR, "2PC gid is empty"); - } tid = text_to_cstring(PG_GETARG_TEXT_P(0)); result = get2PCInfo(tid); if (NULL != result) @@ -1882,10 +1787,6 @@ Datum pgxc_get_2pc_startnode(PG_FUNCTION_ARGS) PG_RETURN_NULL(); } -/* - * pgxc_get_2pc_startxid - * Get 2pc start xid - */ Datum pgxc_get_2pc_startxid(PG_FUNCTION_ARGS); PG_FUNCTION_INFO_V1(pgxc_get_2pc_startxid); Datum pgxc_get_2pc_startxid(PG_FUNCTION_ARGS) @@ -1895,10 +1796,6 @@ Datum pgxc_get_2pc_startxid(PG_FUNCTION_ARGS) char *startxid = NULL; text *t_result = NULL; - if (0 == PG_GETARG_DATUM(0)) - { - elog(ERROR, "2PC gid is empty"); - } tid = text_to_cstring(PG_GETARG_TEXT_P(0)); result = get2PCInfo(tid); if (NULL != result) @@ -1916,44 +1813,7 @@ Datum pgxc_get_2pc_startxid(PG_FUNCTION_ARGS) PG_RETURN_NULL(); } -/* - * pgxc_get_2pc_prepare_timestamp - * Get 2pc prepare timestamp - */ -Datum pgxc_get_2pc_prepare_timestamp(PG_FUNCTION_ARGS); -PG_FUNCTION_INFO_V1(pgxc_get_2pc_prepare_timestamp); -Datum pgxc_get_2pc_prepare_timestamp(PG_FUNCTION_ARGS) -{ - char *tid = NULL; - char *result = NULL; - char *prepare_timestamp = NULL; - text *t_result = NULL; - - if (0 == PG_GETARG_DATUM(0)) - { - elog(ERROR, "2PC gid is empty"); - } - tid = text_to_cstring(PG_GETARG_TEXT_P(0)); - result = get2PCInfo(tid); - if (NULL != result) - { - prepare_timestamp = strstr(result, GET_PREPARE_TIMESTAMP); - if (NULL != prepare_timestamp) - { - prepare_timestamp += strlen(GET_PREPARE_TIMESTAMP); - prepare_timestamp = strtok(prepare_timestamp, "\n"); - t_result = cstring_to_text(prepare_timestamp); - pfree(result); - return PointerGetDatum(t_result); - } - } - PG_RETURN_NULL(); -} -/* - * pgxc_get_2pc_commit_timestamp - * Get 2pc commit timestamp - */ Datum pgxc_get_2pc_commit_timestamp(PG_FUNCTION_ARGS); PG_FUNCTION_INFO_V1(pgxc_get_2pc_commit_timestamp); Datum pgxc_get_2pc_commit_timestamp(PG_FUNCTION_ARGS) @@ -1963,10 +1823,6 @@ Datum pgxc_get_2pc_commit_timestamp(PG_FUNCTION_ARGS) char *commit_timestamp = NULL; text *t_result = NULL; - if (0 == PG_GETARG_DATUM(0)) - { - elog(ERROR, "2PC gid is empty"); - } tid = text_to_cstring(PG_GETARG_TEXT_P(0)); result = get2PCInfo(tid); if (NULL != result) @@ -1984,23 +1840,17 @@ Datum pgxc_get_2pc_commit_timestamp(PG_FUNCTION_ARGS) PG_RETURN_NULL(); } -/* - * pgxc_get_2pc_xid - * Get 2pc local xid - */ + + Datum pgxc_get_2pc_xid(PG_FUNCTION_ARGS); PG_FUNCTION_INFO_V1(pgxc_get_2pc_xid); Datum pgxc_get_2pc_xid(PG_FUNCTION_ARGS) { - GlobalTransactionId xid; char *tid = NULL; char *result = NULL; char *str_xid = NULL; + GlobalTransactionId xid; - if (0 == PG_GETARG_DATUM(0)) - { - elog(ERROR, "2PC gid is empty"); - } tid = text_to_cstring(PG_GETARG_TEXT_P(0)); result = get2PCInfo(tid); if (NULL != result) @@ -2018,31 +1868,16 @@ Datum 
pgxc_get_2pc_xid(PG_FUNCTION_ARGS) PG_RETURN_NULL(); } -/* - * pgxc_remove_2pc_records - * Remove a 2pc file - */ Datum pgxc_remove_2pc_records(PG_FUNCTION_ARGS); PG_FUNCTION_INFO_V1(pgxc_remove_2pc_records); Datum pgxc_remove_2pc_records(PG_FUNCTION_ARGS) { - char *tid = NULL; - - if (0 == PG_GETARG_DATUM(0)) - { - elog(ERROR, "2PC gid is empty"); - } - tid = text_to_cstring(PG_GETARG_TEXT_P(0)); - + char *tid = text_to_cstring(PG_GETARG_TEXT_P(0)); remove_2pc_records(tid, true); pfree(tid); PG_RETURN_BOOL(true); } -/* - * pgxc_clear_2pc_records - * Clear all 2pc files which are not running - */ Datum pgxc_clear_2pc_records(PG_FUNCTION_ARGS); PG_FUNCTION_INFO_V1(pgxc_clear_2pc_records); Datum pgxc_clear_2pc_records(PG_FUNCTION_ARGS) @@ -2066,8 +1901,6 @@ Datum pgxc_clear_2pc_records(PG_FUNCTION_ARGS) elog(ERROR, "can only called on coordinator"); } - elog(LOG, "clear 2pc files"); - mycontext = AllocSetContextCreate(CurrentMemoryContext, "clean_check", ALLOCSET_DEFAULT_MINSIZE, @@ -2076,6 +1909,25 @@ Datum pgxc_clear_2pc_records(PG_FUNCTION_ARGS) oldcontext = MemoryContextSwitchTo(mycontext); ResetGlobalVariables(); +#if 0 + if((dir = opendir(TWOPHASE_RECORD_DIR))) + { + while((ptr = readdir(dir)) != NULL) + { + if (count > 999) + break; + if(strcmp(ptr->d_name,".") == 0 || strcmp(ptr->d_name,"..") == 0) + { + continue; + } + snprintf(path[count], MAX_GID, "/%s", ptr->d_name); + //snprintf(path[count], MAX_GID, "/%s", ptr->d_name); + count++; + } + + closedir(dir); + } +#endif /*get node list*/ PgxcNodeGetOids(&cn_node_list, &dn_node_list, @@ -2096,14 +1948,28 @@ Datum pgxc_clear_2pc_records(PG_FUNCTION_ARGS) { (void) execute_query_on_single_node(dn_node_list[i], query, 1, result+cn_nodes_num+i); } - /*get all database info*/ getDatabaseList(); /*get all info of 2PC transactions*/ getTxnInfoOnNodesAll(); +#if 0 + if((dir = opendir(TWOPHASE_RECORD_DIR))) + { + while (i < count) + { + if (!find_txn(path[i])) + { + unlink(path[i]); + WriteClean2pcXlogRec(path[i]); + } + i++; + } - /*delete all rest 2pc files in each cn*/ + closedir(dir); + } +#endif + /*delete all rest 2pc file in each nodes*/ for (i = 0; i < cn_nodes_num; i++) { if (0 == result[i].slot_count) @@ -2111,54 +1977,24 @@ Datum pgxc_clear_2pc_records(PG_FUNCTION_ARGS) continue; } if (!(twopcfiles = TTSgetvalue(result+i, 0, 0))) - { continue; - } - - /*iterate through all 2pc files, delete rest ones*/ ptr = strtok(twopcfiles, ","); - for (;ptr != NULL; ptr = strtok(NULL, ",")) + while(ptr) { if (count >= MAXIMUM_CLEAR_FILE) - { break; - } - - /*whether 2pc is running?*/ - if (find_txn(ptr)) - { - /*2pc is running, do not delete its file*/ - continue; - } - - /*whether 2pc is rollbacked?*/ - if (strstr(ptr, ROLLBACK_POSTFIX) == NULL) + if (!find_txn(ptr)) { - /*2pc is not rollbacked*/ - - /*whether 2pc start xid transaction is running?*/ - if (is_gid_start_xid_running(ptr)) - { - /*2pc start xid transaction is running, do not delete its file*/ - elog(LOG, "2PC '%s' is running", ptr); - continue; - } - } - - /*2pc is not running, delete its file*/ snprintf(clear_query, 100, CLEAR_STMT, ptr); - elog(LOG, "clear 2pc file: %s", ptr); - if (execute_query_on_single_node(cn_node_list[i], - clear_query, 1, &clear_result) == (Datum)0) - { + if (execute_query_on_single_node(cn_node_list[i], clear_query, 1, &clear_result) == (Datum)0) res = false; - } DropTupleTableSlots(&clear_result); count++; } + ptr = strtok(NULL, ","); + } } - /*delete all rest 2pc files in each dn*/ for (i = 0; i < dn_nodes_num; i++) { if (0 == 
result[cn_nodes_num+i].slot_count) @@ -2166,51 +2002,22 @@ Datum pgxc_clear_2pc_records(PG_FUNCTION_ARGS) continue; } if (!(twopcfiles = TTSgetvalue(result+cn_nodes_num+i, 0, 0))) - { continue; - } - - /*iterate through all 2pc files, delete rest ones*/ ptr = strtok(twopcfiles, ","); - for (;ptr != NULL; ptr = strtok(NULL, ",")) + while(ptr) { if (count >= MAXIMUM_CLEAR_FILE) - { break; - } - - /*whether 2pc is running?*/ - if (find_txn(ptr)) - { - /*2pc is running, do not delete its file*/ - continue; - } - - /*whether 2pc is rollbacked?*/ - if (strstr(ptr, ROLLBACK_POSTFIX) == NULL) - { - /*2pc is not rollbacked*/ - - /*whether 2pc start xid transaction is running?*/ - if (is_gid_start_xid_running(ptr)) + if (!find_txn(ptr)) { - /*2pc start xid transaction is running, do not delete its file*/ - elog(LOG, "2PC '%s' is running", ptr); - continue; - } - } - - /*2pc is not running, delete its file*/ snprintf(clear_query, 100, CLEAR_STMT, ptr); - elog(LOG, "clear 2pc file: %s", ptr); - if (execute_query_on_single_node(dn_node_list[i], - clear_query, 1, &clear_result) == (Datum)0) - { + if (execute_query_on_single_node(dn_node_list[i], clear_query, 1, &clear_result) == (Datum)0) res = false; - } DropTupleTableSlots(&clear_result); count++; } + ptr = strtok(NULL, ","); + } } for (i = 0; i < pgxc_clean_node_count; i++) @@ -2226,10 +2033,6 @@ Datum pgxc_clear_2pc_records(PG_FUNCTION_ARGS) PG_RETURN_BOOL(res); } -/* - * pgxc_get_record_list - * Get 2pc files list - */ Datum pgxc_get_record_list(PG_FUNCTION_ARGS); PG_FUNCTION_INFO_V1(pgxc_get_record_list); Datum pgxc_get_record_list(PG_FUNCTION_ARGS) @@ -2244,11 +2047,7 @@ Datum pgxc_get_record_list(PG_FUNCTION_ARGS) recordList = get_2pc_list_from_cache(&count); if (count >= MAXIMUM_OUTPUT_FILE) { - if (NULL == recordList) - { - elog(PANIC, "recordList is NULL"); - } - + Assert(NULL != recordList); t_recordList = cstring_to_text(recordList); return PointerGetDatum(t_recordList); } @@ -2331,18 +2130,8 @@ Datum pgxc_commit_on_node(PG_FUNCTION_ARGS) cn_health_map = palloc0(cn_nodes_num * sizeof(bool)); dn_health_map = palloc0(dn_nodes_num * sizeof(bool)); - if (0 == PG_GETARG_DATUM(0)) - { - elog(ERROR, "pgxc_commit_on_node: node name is empty"); - } nodename = text_to_cstring(PG_GETARG_TEXT_P(0)); - - if (0 == PG_GETARG_DATUM(1)) - { - elog(ERROR, "pgxc_commit_on_node: gid is empty"); - } gid = text_to_cstring(PG_GETARG_TEXT_P(1)); - nodeoid = get_pgxc_nodeoid(nodename); if (InvalidOid == nodeoid) { @@ -2374,10 +2163,6 @@ Datum pgxc_commit_on_node(PG_FUNCTION_ARGS) else { txn->global_commit_timestamp = GetGlobalTimestampGTM(); - if (!GlobalTimestampIsValid(current_gts)) - { - elog(ERROR, "pgxc_commit_on_node, get invalid gts"); - } } } @@ -2451,18 +2236,8 @@ Datum pgxc_abort_on_node(PG_FUNCTION_ARGS) cn_health_map = palloc0(cn_nodes_num * sizeof(bool)); dn_health_map = palloc0(dn_nodes_num * sizeof(bool)); - if (0 == PG_GETARG_DATUM(0)) - { - elog(ERROR, "pgxc_abort_on_node: node name is empty"); - } nodename = text_to_cstring(PG_GETARG_TEXT_P(0)); - - if (0 == PG_GETARG_DATUM(1)) - { - elog(ERROR, "pgxc_abort_on_node: gid is empty"); - } gid = text_to_cstring(PG_GETARG_TEXT_P(1)); - nodeoid = get_pgxc_nodeoid(nodename); if (InvalidOid == nodeoid) { @@ -2628,15 +2403,6 @@ bool send_query_clean_transaction(PGXCNodeHandle* conn, txn_info *txn, const cha TXN_STATUS_COMMITTED == txn->global_txn_stat ? 
"COMMIT" : "ROLLBACK"))); } - if (InvalidGlobalTimestamp != txn->global_prepare_timestamp && - pgxc_node_send_prepare_timestamp(conn, txn->global_prepare_timestamp)) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("in pg_clean failed to send prepare timestamp for %s PREPARED command", - TXN_STATUS_COMMITTED == txn->global_txn_stat ? "COMMIT" : "ROLLBACK"))); - } - if (NULL != txn->participants && pgxc_node_send_partnodes(conn, txn->participants)) { ereport(ERROR, @@ -2662,13 +2428,7 @@ bool check_2pc_belong_node(txn_info * txn) int node_index = 0; char node_type; node_index = find_node_index(abnormal_nodeoid); - - /* abnormal node oid must be valid here */ - if (InvalidOid == abnormal_nodeoid) - { - elog(PANIC, "abnormal_nodeoid is invalid"); - } - + Assert(InvalidOid != abnormal_nodeoid); if (abnormal_nodeoid == txn->origcoord) { txn->belong_abnormal_node = true; @@ -2688,7 +2448,9 @@ bool check_2pc_belong_node(txn_info * txn) if (InvalidOid == txn->origcoord) { + char *startnode = NULL; int node_oid = InvalidOid; + char gid[MAX_GID]; if (!IsXidImplicit(txn->gid)) { @@ -2696,16 +2458,39 @@ bool check_2pc_belong_node(txn_info * txn) return true; } - /* Get start node oid from gid */ - node_oid = get_start_node_oid_from_gid(txn->gid); - if (node_oid == InvalidOid) + Assert(IsXidImplicit(txn->gid)); + + /* get start node from gid */ + strcpy(gid, txn->gid); + startnode = strtok(gid, ":"); + if (NULL == startnode) + { + elog(WARNING, "get startnode(%s) from gid(%s) failed", + startnode, gid); + txn->belong_abnormal_node = false; + return false; + } + + startnode = strtok(NULL, ":"); + if (NULL == startnode) { - elog(WARNING, "Get invalid start node oid from gid(%s)", txn->gid); + elog(WARNING, "get startnode(%s) from gid(%s) failed", + startnode, gid); txn->belong_abnormal_node = false; return false; } - elog(DEBUG1, "Get start node oid(%d) from gid(%s)", node_oid, txn->gid); + node_oid = get_pgxc_nodeoid(startnode); + if (NULL == startnode) + { + elog(WARNING, "get invalid oid for startnode(%s) from gid(%s)", + startnode, gid); + txn->belong_abnormal_node = false; + return false; + } + + elog(DEBUG5, "get oid(%d) for startnode(%s) from gid(%s)", + node_oid, startnode, gid); if (abnormal_nodeoid == node_oid) { @@ -2733,13 +2518,22 @@ bool check_node_participate(txn_info * txn, int node_idx) void recover2PC(txn_info * txn) { - bool is_running = true; + int i = 0; + bool check_ok = false; + int check_times = CLEAN_CHECK_TIMES_DEFAULT; + int check_interval = CLEAN_CHECK_INTERVAL_DEFAULT; MemoryContext current_context = NULL; ErrorData* edata = NULL; TXN_STATUS txn_stat; txn_stat = check_txn_global_status(txn); txn->global_txn_stat = txn_stat; + if (clear_2pc_belong_node) + { + check_times = CLEAN_NODE_CHECK_TIMES; + check_interval = CLEAN_NODE_CHECK_INTERVAL; + } + #ifdef DEBUG_EXECABORT txn_stat = TXN_STATUS_ABORTED; #endif @@ -2773,59 +2567,46 @@ void recover2PC(txn_info * txn) else { txn->op = COMMIT; - - /* check whether the 2pc start xid is 0 */ - if (txn->startxid == 0 && IsXidImplicit(txn->gid)) - { - elog(WARNING, "Commit 2PC '%s' start xid is 0", txn->gid); - txn->op_issuccess = false; - return; - } - - /* check whether the 2pc start xid is still running on start node */ - if (is_txn_start_xid_running(txn)) + /* check whether all nodes can commit prepared */ + for (i = 0; i < check_times; i++) { - elog(WARNING, "Commit 2PC '%s' start xid %d is running", - txn->gid, txn->startxid); - txn->op_issuccess = false; - return; - } - - /* check whether the 2pc is still running 
on participants */ - is_running = false; + check_ok = true; current_context = CurrentMemoryContext; PG_TRY(); { if (!clean_2PC_iscommit(txn, true, true)) { - is_running = true; - elog(WARNING, "Commit 2PC '%s' check failed", txn->gid); + check_ok = false; + elog(LOG, "check commit 2PC transaction %s failed", + txn->gid); } } PG_CATCH(); { - is_running = true; (void)MemoryContextSwitchTo(current_context); edata = CopyErrorData(); FlushErrorState(); - elog(WARNING, "Commit 2PC '%s' is running, error: %s", + check_ok = false; + elog(WARNING, "check commit 2PC transaction %s error: %s", txn->gid, edata->message); } PG_END_TRY(); - /* 2pc is still running, do not try to clean */ - if (is_running) + if (!check_ok) { txn->op_issuccess = false; return; } + pg_usleep(check_interval); + } + /* send commit prepared to all nodes */ if (!clean_2PC_iscommit(txn, true, false)) { txn->op_issuccess = false; - elog(WARNING, "Commit 2PC '%s' failed", txn->gid); + elog(LOG, "commit 2PC transaction %s failed", txn->gid); return; } txn->op_issuccess = true; @@ -2835,57 +2616,46 @@ void recover2PC(txn_info * txn) case TXN_STATUS_ABORTED: txn->op = ABORT; - - /* check whether the 2pc start xid is 0 */ - if (txn->startxid == 0 && IsXidImplicit(txn->gid)) + /* check whether all nodes can rollback prepared */ + for (i = 0; i < check_times; i++) { - elog(WARNING, "Rollback 2PC '%s' start xid is 0", txn->gid); - } - - /* check whether the 2pc start xid is still running on start node */ - if (is_txn_start_xid_running(txn)) - { - elog(WARNING, "Rollback 2PC '%s' start xid %d is running", - txn->gid, txn->startxid); - txn->op_issuccess = false; - return; - } - - /* check whether the 2pc is still running on participants */ - is_running = false; + check_ok = true; current_context = CurrentMemoryContext; PG_TRY(); { if (!clean_2PC_iscommit(txn, false, true)) { - is_running = true; - elog(WARNING, "Rollback 2PC '%s' check failed", txn->gid); + check_ok = false; + elog(LOG, "check rollback 2PC transaction %s failed", + txn->gid); } } PG_CATCH(); { - is_running = true; + check_ok = false; (void)MemoryContextSwitchTo(current_context); edata = CopyErrorData(); FlushErrorState(); - elog(WARNING, "Rollback 2PC '%s' is running, error: %s", + elog(WARNING, "check rollback 2PC transaction %s error: %s", txn->gid, edata->message); } PG_END_TRY(); - /* 2pc is still running, do not try to clean */ - if (is_running) + if (!check_ok) { txn->op_issuccess = false; return; } + pg_usleep(check_interval); + } + /* send rollback prepared to all nodes */ if (!clean_2PC_iscommit(txn, false, false)) { txn->op_issuccess = false; - elog(WARNING, "Rollback 2PC '%s' failed", txn->gid); + elog(LOG, "rollback 2PC transaction %s failed", txn->gid); return; } txn->op_issuccess = true; @@ -2915,6 +2685,7 @@ TXN_STATUS check_txn_global_status(txn_info *txn) #define TXN_INPROGRESS 0X0020 int ii; int check_flag = 0; + int node_idx = 0; TimestampTz prepared_time = 0; TimestampTz time_gap = clean_time_interval; @@ -2999,84 +2770,6 @@ TXN_STATUS check_txn_global_status(txn_info *txn) return TXN_STATUS_INPROGRESS; } #endif - - /* start xid is 0, maybe at the beginning of the 2pc */ - if (txn->startxid == 0) - { - /* prepare timestamp must be invalid */ - if (GlobalTimestampIsValid(txn->global_prepare_timestamp)) - { - elog(PANIC, "gid: %s, start xid is 0, global_prepare_timestamp: %ld", - txn->gid, txn->global_prepare_timestamp); - } - - elog(DEBUG2, "2PC '%s' start xid is 0", txn->gid); - - if (check_flag & TXN_INPROGRESS - || current_time - prepared_time <= 
time_gap) - { - /* inprogress or less than time gap, do not clean it */ - elog(LOG, "2PC '%s' start xid is 0, inprogress, " - "current_time: %ld, prepared_time: %ld, " - "time_gap: %ld, time_diff: %ld", - txn->gid, current_time, prepared_time, - time_gap, current_time - prepared_time); - - return TXN_STATUS_INPROGRESS; - } - else - { - /* otherwise, abort it */ - elog(WARNING, "2PC '%s' start xid is 0, " - "current_time: %ld, prepared_time: %ld, " - "time_gap: %ld, time_diff: %ld", - txn->gid, current_time, prepared_time, - time_gap, current_time - prepared_time); - - return TXN_STATUS_ABORTED; - } - } - - /* use for upgrade from old version, no prepare timestamp in old version */ - if (!GlobalTimestampIsValid(txn->global_prepare_timestamp)) - { - elog(WARNING, "gid: %s, start xid is %d, global_prepare_timestamp " - "is invalid", txn->gid, txn->startxid); - - if (check_flag & TXN_INPROGRESS - || current_time - prepared_time <= time_gap) - { - /* inprogress or less than time gap, do not clean it */ - elog(WARNING, "gid: %s, start xid is %d, inprogress, " - "current_time: %ld, prepared_time: %ld, " - "time_gap: %ld, time_diff: %ld", - txn->gid, txn->startxid, current_time, prepared_time, - time_gap, current_time - prepared_time); - - return TXN_STATUS_INPROGRESS; - } - else - { - /* otherwise, set prepare timestamp */ - if (clear_2pc_belong_node) - { - txn->global_prepare_timestamp = abnormal_gts; - } - else - { - txn->global_prepare_timestamp = current_gts - time_gap; - } - - elog(WARNING, "gid: %s, start xid is %d, " - "current_time: %ld, prepared_time: %ld, " - "time_gap: %ld, time_diff: %ld, " - "set global_prepare_timestamp: %ld", - txn->gid, txn->startxid, current_time, prepared_time, - time_gap, current_time - prepared_time, - txn->global_prepare_timestamp); - } - } - if (clear_2pc_belong_node) { if (!check_2pc_belong_node(txn)) @@ -3089,35 +2782,32 @@ TXN_STATUS check_txn_global_status(txn_info *txn) return TXN_STATUS_INPROGRESS; } - /* abnormal gts must be valid */ - if (!GlobalTimestampIsValid(abnormal_gts)) + node_idx = find_node_index(abnormal_nodeoid); + if (node_idx >= 0) { - elog(PANIC, "gid: %s, abnormal_gts is invalid gts", txn->gid); - } - - /* abnormal gts less than prepare gts, do not clean it */ - if (abnormal_gts < txn->global_prepare_timestamp) + if (abnormal_time < txn->prepare_timestamp[node_idx]) { - elog(LOG, "gid: %s, abnormal gts: " INT64_FORMAT - ", prepare gts: " INT64_FORMAT, txn->gid, - abnormal_gts, txn->global_prepare_timestamp); + elog(WARNING, "gid: %s, abnormal time: " INT64_FORMAT + ", prepare timestamp[%d]: " INT64_FORMAT, txn->gid, + abnormal_time, node_idx, txn->prepare_timestamp[node_idx]); return TXN_STATUS_INPROGRESS; } - - if (GlobalTimestampIsValid(txn->global_commit_timestamp)) + } + else { - /* abnormal gts less than commit gts, do not clean it */ - if (abnormal_gts < txn->global_commit_timestamp) + elog(WARNING, "gid: %s, node_idx: %d", txn->gid, node_idx); + } + + if (abnormal_time < prepared_time) { - elog(LOG, "gid: %s, abnormal gts: " INT64_FORMAT - ", commit gts: " INT64_FORMAT, txn->gid, - abnormal_gts, txn->global_commit_timestamp); + elog(WARNING, "gid: %s, abnormal time: " INT64_FORMAT + ", prepared time: " INT64_FORMAT, txn->gid, + abnormal_time, prepared_time); return TXN_STATUS_INPROGRESS; } } - } else { if (check_flag & TXN_INPROGRESS ||current_time - prepared_time <= time_gap) @@ -3125,36 +2815,8 @@ TXN_STATUS check_txn_global_status(txn_info *txn) /* transaction inprogress */ return TXN_STATUS_INPROGRESS; } - - /* current gts 
must be valid */ - if (!GlobalTimestampIsValid(current_gts)) - { - elog(PANIC, "gid: %s, current_gts is invalid gts", txn->gid); - } - - /* 2pc prepare gts gap less than time gap, do not clean it */ - if (current_gts - txn->global_prepare_timestamp < time_gap) - { - elog(LOG, "gid: %s, current gts: " INT64_FORMAT - ", prepare gts: " INT64_FORMAT ", time gap: " INT64_FORMAT, - txn->gid, current_gts, txn->global_prepare_timestamp, time_gap); - - return TXN_STATUS_INPROGRESS; } - if (GlobalTimestampIsValid(txn->global_commit_timestamp)) - { - /* 2pc commit gts gap less than time gap, do not clean it */ - if (current_gts - txn->global_commit_timestamp <= time_gap) - { - elog(LOG, "gid: %s, current gts: " INT64_FORMAT - ", commit gts: " INT64_FORMAT ", time gap: " INT64_FORMAT, - txn->gid, current_gts, txn->global_commit_timestamp, time_gap); - - return TXN_STATUS_INPROGRESS; - } - } - } if (!IsXidImplicit(txn->gid) && txn->after_first_phase && (TXN_PREPARED == check_flag)) { @@ -3175,21 +2837,6 @@ TXN_STATUS check_txn_global_status(txn_info *txn) if (check_flag & TXN_COMMITTED) /* Some 2PC transactions are committed. Need to commit others. */ return TXN_STATUS_COMMITTED; - - /* If 2PC commit gts is valid, must commit it. */ - if (GlobalTimestampIsValid(txn->global_commit_timestamp)) - { - elog(LOG, "'%s' global_commit_timestamp: %ld", - txn->gid, txn->global_commit_timestamp); - - if (!(check_flag & TXN_PREPARED)) - { - elog(PANIC, "gid: %s, check_flag: %d", txn->gid, check_flag); - } - - return TXN_STATUS_COMMITTED; - } - /* All the transactions remain prepared. No need to recover. */ return TXN_STATUS_ABORTED; } @@ -3254,11 +2901,6 @@ bool clean_2PC_iscommit(txn_info *txn, bool is_commit, bool is_check) { node_oid = pgxc_handles->datanode_handles[ii]->nodeoid; node_idx = find_node_index(node_oid); - if (node_idx < 0 || node_idx >= cn_nodes_num + dn_nodes_num) - { - elog(PANIC, "gid: %s, node_idx(%d) is invalid", txn->gid, node_idx); - } - if (TXN_STATUS_PREPARED != txn->txn_stat[ node_idx]) { continue; @@ -3292,11 +2934,6 @@ bool clean_2PC_iscommit(txn_info *txn, bool is_commit, bool is_check) { node_oid = pgxc_handles->coord_handles[ii]->nodeoid; node_idx = find_node_index(node_oid); - if (node_idx < 0 || node_idx >= cn_nodes_num + dn_nodes_num) - { - elog(PANIC, "gid: %s, node_idx(%d) is invalid", txn->gid, node_idx); - } - if (TXN_STATUS_PREPARED != txn->txn_stat[ node_idx]) { continue; @@ -3324,6 +2961,7 @@ bool clean_2PC_iscommit(txn_info *txn, bool is_commit, bool is_check) } #endif } + } /* receive response */ @@ -3362,14 +3000,10 @@ bool clean_2PC_iscommit(txn_info *txn, bool is_commit, bool is_check) if (txn->origcoord != InvalidOid) { node_idx = find_node_index(txn->origcoord); - if (node_idx < 0 || node_idx >= cn_nodes_num + dn_nodes_num) - { - elog(PANIC, "gid: %s, node_idx(%d) is invalid", txn->gid, node_idx); - } - if (txn->coordparts[node_idx] == 1) { /*send global timestamp to dn_node_list[ii]*/ + if (txn->txn_stat[node_idx] == TXN_STATUS_PREPARED) { get_node_handles(&pgxc_handles, txn->origcoord); @@ -3438,8 +3072,7 @@ bool clean_2PC_files(txn_info * txn) } else { - elog(LOG, "pg_clean: failed clean 2pc file of transaction %s on node %s", - txn->gid, get_pgxc_nodename(dn_node_list[ii])); + elog(LOG, "pg_clean: failed clean 2pc file of transaction %s on node %s", txn->gid, get_pgxc_nodename(dn_node_list[ii])); issuccess = false; } DropTupleTableSlots(&result); @@ -3453,15 +3086,14 @@ bool clean_2PC_files(txn_info * txn) { if (TTSgetvalue(&result, 0, 0) == false) { - elog(LOG, 
"pg_clean: delete 2PC file failed of transaction %s on node %s", + elog(LOG, "Error:delete 2PC file failed of transaction %s on node %s", txn->gid, get_pgxc_nodename(txn->coordparts[ii])); issuccess = false; } } else { - elog(LOG, "pg_clean: failed clean 2pc file of transaction %s on node %s", - txn->gid, get_pgxc_nodename(cn_node_list[ii])); + elog(LOG, "pg_clean: failed clean 2pc file of transaction %s on node %s", txn->gid, get_pgxc_nodename(cn_node_list[ii])); issuccess = false; } DropTupleTableSlots(&result); @@ -3746,14 +3378,12 @@ void get_node_handles(PGXCNodeAllHandles **pgxc_handles, Oid nodeoid) *pgxc_handles = get_handles(nodelist, coordlist, false, true, true); } + bool check_2pc_start_from_node(txn_info *txn) { char node_type; - if (InvalidOid == abnormal_nodeoid) - { - elog(PANIC, "gid: %s, abnormal_nodeoid is invalid", txn->gid); - } + Assert(InvalidOid != abnormal_nodeoid); if (abnormal_nodeoid == txn->origcoord) { @@ -3768,239 +3398,51 @@ bool check_2pc_start_from_node(txn_info *txn) if (InvalidOid == txn->origcoord) { + char *startnode = NULL; int node_oid = InvalidOid; + char gid[MAX_GID]; if (!IsXidImplicit(txn->gid)) { return true; } - /* Get start node oid from gid */ - node_oid = get_start_node_oid_from_gid(txn->gid); - if (InvalidOid == node_oid) - { - elog(WARNING, "Get invalid start node oid from gid(%s)", txn->gid); - return false; - } - - elog(DEBUG1, "Get start node oid(%d) from gid(%s)", node_oid, txn->gid); + Assert(IsXidImplicit(txn->gid)); - if (abnormal_nodeoid == node_oid) + /* get start node from gid */ + strcpy(gid, txn->gid); + startnode = strtok(gid, ":"); + if (NULL == startnode) { - return true; - } - } - + elog(WARNING, "get startnode(%s) from gid(%s) failed", + startnode, gid); return false; } -/* - * get_start_node_from_gid - * Get start node name from gid - * gid: 2pc gid - */ -char *get_start_node_from_gid(char *gid) - { - char *str_start_node = NULL; - - if (!IsXidImplicit(gid)) - { - elog(WARNING, "2PC '%s' is not implicit", gid); - return NULL; - } - - /* Get start node name from gid */ - str_start_node = strtok(gid, ":"); - if (str_start_node == NULL) - { - elog(WARNING, "Get start node from gid(%s) failed", gid); - return NULL; - } - - str_start_node = strtok(NULL, ":"); - if (str_start_node == NULL) - { - elog(WARNING, "Get start node from gid(%s) failed", gid); - return NULL; - } - - return str_start_node; -} - -/* - * get_start_node_oid_from_gid - * Get start node oid from gid - * gid: 2pc gid - */ -Oid get_start_node_oid_from_gid(char *gid) -{ - Oid start_node_oid = 0; - char *str_start_node = NULL; - char gid_buf[MAX_GID]; - - /* Get start node oid from gid */ - strcpy(gid_buf, gid); - str_start_node = get_start_node_from_gid(gid_buf); - if (str_start_node == NULL) - { - elog(WARNING, "Get start node from gid(%s) failed", gid); - return 0; - } - - elog(LOG, "Get start node(%s) from gid(%s)", str_start_node, gid); - - start_node_oid = get_pgxc_nodeoid(str_start_node); - if (start_node_oid == InvalidOid) - { - elog(WARNING, "Get invalid oid for start node(%s) from gid(%s)", - str_start_node, gid); - return 0; - } - - return start_node_oid; -} - -/* - * get_start_xid_from_gid - * Get start xid from gid - * gid: 2pc gid - */ -uint32 get_start_xid_from_gid(char *gid) -{ - uint32 start_xid = 0; - char *str_start_xid = NULL; - char gid_buf[MAX_GID]; - - if (!IsXidImplicit(gid)) - { - elog(WARNING, "2PC '%s' is not implicit", gid); - return 0; - } - - /* Get start xid from gid */ - strcpy(gid_buf, gid); - str_start_xid = gid_buf + 
strlen(XIDPREFIX); - str_start_xid = strtok(str_start_xid, ":"); - start_xid = strtoul(str_start_xid, NULL, 10); - if (start_xid == 0) - { - elog(WARNING, "Get start xid from gid(%s) failed", gid); - return 0; - } - - return start_xid; -} - -/* - * is_xid_running_on_node - * Whether the transaction with the xid is still running on the node - * xid: transaction id - * node_oid: node oid - */ -bool is_xid_running_on_node(uint32 xid, Oid node_oid) + startnode = strtok(NULL, ":"); + if (NULL == startnode) { - bool is_running = true; - - Datum execute_res; - TupleTableSlots result; - char command[MAX_CMD_LENGTH]; - - if (xid == 0 || node_oid == InvalidOid) - { - elog(PANIC, "2PC xid: %d, node oid: %d", xid, node_oid); - return true; - } - - snprintf(command, MAX_CMD_LENGTH, "select pid::text, backend_xid::text " - "from pg_catalog.pg_stat_activity where backend_xid=%d", xid); - - execute_res = execute_query_on_single_node(node_oid, command, 2, &result); - if (execute_res == (Datum) 1) - { - if (result.slot_count == 0) - { - is_running = false; + elog(WARNING, "get startnode(%s) from gid(%s) failed", + startnode, gid); + return false; } - else - { - is_running = true; - if (result.slot_count != 1) + node_oid = get_pgxc_nodeoid(startnode); + if (NULL == startnode) { - elog(PANIC, "Get %d resules for xid: %d", result.slot_count, xid); - } - } - } - else - { - elog(WARNING, "pg_clean: Faile to query xid %d on node %s", - xid, get_pgxc_nodename(node_oid)); - is_running = true; + elog(WARNING, "get invalid oid for startnode(%s) from gid(%s)", + startnode, gid); + return false; } - DropTupleTableSlots(&result); - return is_running; - } + elog(DEBUG1, "get oid(%d) for startnode(%s) from gid(%s)", + node_oid, startnode, gid); -/* - * is_gid_start_xid_running - * Whether the transaction with the start xid is still running on start node - * gid: 2pc gid - */ -bool is_gid_start_xid_running(char *gid) -{ - uint32 start_xid = 0; - Oid start_node_oid = InvalidOid; - - if (!IsXidImplicit(gid)) + if (abnormal_nodeoid == node_oid) { - elog(LOG, "Explicit 2PC '%s'", gid); return true; } - - /* Get start xid from gid */ - start_xid = get_start_xid_from_gid(gid); - if (start_xid == 0) - { - elog(ERROR, "Get start xid from gid(%s) failed", gid); - return true; } - elog(LOG, "Get start xid(%d) from gid(%s)", start_xid, gid); - - /* Get start node oid from gid */ - start_node_oid = get_start_node_oid_from_gid(gid); - if (start_node_oid == InvalidOid) - { - elog(WARNING, "Get invalid start node oid from gid(%s)", gid); return false; } - - elog(LOG, "Get start node oid(%d) from gid(%s)", start_node_oid, gid); - - return is_xid_running_on_node(start_xid, start_node_oid); -} - -/* - * is_txn_start_xid_running - * Whether the transaction with the start xid is still running on start node - * txn: 2pc transaction info - */ -bool is_txn_start_xid_running(txn_info *txn) -{ - if (txn->startxid != 0) - { - Assert(txn->origcoord != InvalidOid); - return is_xid_running_on_node(txn->startxid, txn->origcoord); - } - - Assert(txn->origcoord == InvalidOid); - - if (!IsXidImplicit(txn->gid)) - { - elog(LOG, "Explicit 2PC '%s' start xid is %d", txn->gid, txn->startxid); - return false; - } - - return is_gid_start_xid_running(txn->gid); -} diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c index ea188961..e78f9c53 100644 --- a/src/backend/access/transam/twophase.c +++ b/src/backend/access/transam/twophase.c @@ -2392,11 +2392,12 @@ CheckPointTwoPhase(XLogRecPtr redo_horizon) if 
(!save_and_remove_2pc_info(gxact->gid)) { - elog(DEBUG1, "checkpoint: %s save to file failed", gxact->gid); + elog(LOG, "[%s] %s save to file failed", + __FUNCTION__, gxact->gid); } else { - elog(LOG, "checkpoint: %s is saved to file", gxact->gid); + elog(LOG, "[%s] %s is saved to file", __FUNCTION__, gxact->gid); } } #endif @@ -3740,12 +3741,10 @@ void record_2pc_involved_nodes_xid(const char * tid, File fd = 0; int ret = 0; int size = 0; - int pg_clean_check_size = 0; StringInfoData content; struct stat fst; char path[MAXPGPATH]; char *result = NULL; - GlobalTimestamp prepare_gts = InvalidGlobalTimestamp; #ifdef __TWO_PHASE_TESTS__ XLogRecPtr xlogrec = 0; @@ -3756,18 +3755,6 @@ void record_2pc_involved_nodes_xid(const char * tid, return; } - prepare_gts = GetGlobalPrepareTimestamp(); - if (!GlobalTimestampIsValid(prepare_gts)) - { - elog(WARNING, "prepare gts is invalid"); - prepare_gts = GetGlobalTimestampGTM(); - if (!GlobalTimestampIsValid(prepare_gts)) - { - elog(ERROR, "get gts for prepare is invalid"); - } - SetGlobalPrepareTimestamp(prepare_gts); - } - if (enable_distri_print || enable_2pc_entry_trace) { elog(LOG, "[%s] record %s, startnode: %s, participants: %s", @@ -3793,10 +3780,6 @@ void record_2pc_involved_nodes_xid(const char * tid, appendStringInfo(&content, "startxid:%u\n", startxid); appendStringInfo(&content, "nodes:%s\n", nodestring); appendStringInfo(&content, "xid:%u\n", xid); - pg_clean_check_size = content.len; - Assert(pg_clean_check_size == strlen(content.data)); - - appendStringInfo(&content, "global_prepare_timestamp:%ld\n", prepare_gts); size = content.len; Assert(size == strlen(content.data)); @@ -3815,10 +3798,11 @@ void record_2pc_involved_nodes_xid(const char * tid, Assert(strlen(info) < MAX_2PC_INFO_SIZE); check_2pc_file(tid, info, __FUNCTION__); - if (pg_strncasecmp(info, content.data, pg_clean_check_size) != 0) + if (strncmp(info, content.data, size) != 0) { - elog(ERROR, "pg_clean attemp to write %s info conflict, " - "content: %s, info: %s", tid, content.data, info); + elog(ERROR, "[%s] pg_clean attemp to write %s info conflict, " + "content: %s, info: %s", __FUNCTION__, tid, + content.data, info); } resetStringInfo(&content); @@ -3852,10 +3836,11 @@ void record_2pc_involved_nodes_xid(const char * tid, Assert(NULL != result); - if (pg_strncasecmp(result, content.data, pg_clean_check_size) != 0) + if (strncmp(result, content.data, size) != 0) { - elog(ERROR, "pg_clean attemp to write %s info conflict, " - "content: %s, info: %s", tid, content.data, result); + elog(ERROR, "[%s] pg_clean attemp to write %s info conflict, " + "content: %s, info: %s", + __FUNCTION__, tid, content.data, result); } pfree(result); @@ -3868,16 +3853,12 @@ void record_2pc_involved_nodes_xid(const char * tid, if (!RecoveryInProgress()) { - char *fmt_v2 = XLOG_FMT_2PC_V2; XLogBeginInsert(); XLogRegisterData((char *)tid, strlen(tid) + 1); - XLogRegisterData((char *)fmt_v2, strlen(fmt_v2) + 1); XLogRegisterData((char *)startnode, strlen(startnode) + 1); - XLogRegisterData((char *)&startxid, sizeof(GlobalTransactionId)); + XLogRegisterData((char *)&startxid, sizeof(GlobalTransactionId) + 1); XLogRegisterData((char *)nodestring, strlen(nodestring) + 1); - XLogRegisterData((char *)&xid, sizeof(GlobalTransactionId)); - XLogRegisterData((char *)&prepare_gts, sizeof(GlobalTimestamp)); - + XLogRegisterData((char *)&xid, sizeof(GlobalTransactionId) + 1); #ifdef __TWO_PHASE_TESTS__ xlogrec = #endif @@ -3992,7 +3973,7 @@ void record_2pc_commit_timestamp(const char *tid, GlobalTimestamp 
commit_timesta { XLogBeginInsert(); XLogRegisterData((char *)tid, strlen(tid) + 1); - XLogRegisterData((char *)&commit_timestamp, sizeof(GlobalTimestamp)); + XLogRegisterData((char *)&commit_timestamp, sizeof(GlobalTimestamp) + 1); xlogrec = XLogInsert(RM_XLOG_ID, XLOG_RECORD_2PC_TIMESTAMP); /* only start node need to flush and sync XLOG_RECORD_2PC_TIMESTAMP */ if (IS_PGXC_LOCAL_COORDINATOR) @@ -4197,7 +4178,7 @@ void rename_2pc_records(const char *tid, TimestampTz timestamp) XLogBeginInsert(); XLogRegisterData((char *)tid, strlen(tid) + 1); XLogRegisterData((char *)type, strlen(type) + 1); - XLogRegisterData((char *)×tamp, sizeof(TimestampTz)); + XLogRegisterData((char *)×tamp, sizeof(TimestampTz) + 1); XLogInsert(RM_XLOG_ID, XLOG_CLEAN_2PC_FILE); } @@ -4407,7 +4388,7 @@ char *get_2pc_list_from_cache(int *count) { recordList = (char *) repalloc(recordList, strlen(entry->key) + strlen(recordList) + 2); - sprintf(recordList + strlen(recordList), ",%s", entry->key); + sprintf(recordList, "%s,%s", recordList, entry->key); } if (++(*count) >= MAX_OUTPUT_FILE) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 99cc62f3..bf528c0d 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -10177,10 +10177,10 @@ xlog_redo(XLogReaderState *record) TimestampTz timestamp = 0; gid = XLogRecGetData(record); type = gid + strlen(gid) + 1; - if (0 == strcmp(type, "rename")) - { pos = type + strlen(type) + 1; memcpy(×tamp, pos, sizeof(TimestampTz)); + if (0 == strcmp(type, "rename")) + { rename_2pc_records(gid, timestamp); } else @@ -10192,13 +10192,11 @@ xlog_redo(XLogReaderState *record) { TransactionId xid; TransactionId startxid; - GlobalTimestamp prepare_gts = InvalidGlobalTimestamp; - char *fmt_v2 = XLOG_FMT_2PC_V2; char *gid; char *startnode; char *nodestring; char *pos; - char *type; + char *temp; #ifdef __TWO_PHASE_TESTS__ TransactionId old_shem_nextxid = ShmemVariableCache->nextXid; #endif @@ -10206,48 +10204,27 @@ xlog_redo(XLogReaderState *record) gid = XLogRecGetData(record); pos = gid + strlen(gid) +1; /* if the transaction is readonly */ - type = pos; - pos = pos + strlen(type) + 1; + temp = pos; + pos = pos + strlen(temp) + 1; - if (0 != strcmp(type, "readonly")) + if (0 != strcmp(temp, "readonly")) { - if (0 == strcmp(type, fmt_v2)) - { - startnode = pos; - pos = pos + strlen(startnode) + 1; - memcpy(&startxid, pos, sizeof(TransactionId)); - pos = pos + sizeof(TransactionId); - nodestring = pos; - pos = pos + strlen(nodestring) + 1; - memcpy(&xid, pos, sizeof(TransactionId)); - pos = pos + sizeof(TransactionId); - memcpy(&prepare_gts, pos, sizeof(GlobalTimestamp)); - pos = pos + sizeof(GlobalTimestamp); - } - else - { - /* compatible with old format */ - startnode = type; + startnode = temp; memcpy(&startxid, pos, sizeof(TransactionId)); pos = pos + sizeof(TransactionId) + 1; nodestring = pos; pos = pos + strlen(nodestring) + 1; memcpy(&xid, pos, sizeof(TransactionId)); - pos = pos + sizeof(TransactionId) + 1; - } - if (enable_distri_print) { elog(LOG, "xlog redo 2pc file name: '%s', startnode: %s, " - "startxid: %u, prepare_gts: %ld, nodestring: %s, xid: %u", - gid, startnode, startxid, prepare_gts, nodestring, xid); + "startxid: %u, nodestring: %s, xid: %u", + gid, startnode, startxid, nodestring, xid); } - #ifdef __TWO_PHASE_TESTS__ if (FILE_XLOG_EXISTED == twophase_exception_case) { elog(LOG, "FILE_XLOG_EXISTED complish"); - SetGlobalPrepareTimestamp(prepare_gts); record_2pc_involved_nodes_xid(gid, startnode, 
startxid, nodestring, xid); } #endif @@ -10271,7 +10248,6 @@ xlog_redo(XLogReaderState *record) LWLockRelease(XidGenLock); } - SetGlobalPrepareTimestamp(prepare_gts); record_2pc_involved_nodes_xid(gid, startnode, startxid, nodestring, xid); } else diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 7ad4a8be..c3ee221a 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -3986,10 +3986,11 @@ pgxc_node_remote_prepare(char *prepareGID, bool localNode, bool implicit) #endif #ifdef __SUPPORT_DISTRIBUTED_TRANSACTION__ + if(implicit) + { if(enable_distri_print) { - elog(LOG, "prepare remote transaction xid %d gid %s", - GetTopTransactionIdIfAny(), prepareGID); + elog(LOG, "prepare remote transaction xid %d gid %s", GetTopTransactionIdIfAny(), prepareGID); } global_prepare_ts = GetGlobalTimestampGTM(); @@ -3999,19 +4000,17 @@ pgxc_node_remote_prepare(char *prepareGID, bool localNode, bool implicit) global_prepare_ts = 0; } #endif - - if (!GlobalTimestampIsValid(global_prepare_ts)) - { + if(!GlobalTimestampIsValid(global_prepare_ts)){ ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), errmsg("failed to get global timestamp for PREPARED command"))); } if(enable_distri_print) { - elog(LOG, "prepare phase get global prepare timestamp gid %s, time " - INT64_FORMAT, prepareGID, global_prepare_ts); + elog(LOG, "prepare phase get global prepare timestamp gid %s, time " INT64_FORMAT, prepareGID, global_prepare_ts); } SetGlobalPrepareTimestamp(global_prepare_ts); + } #endif #ifdef __TWO_PHASE_TRANS__ @@ -4106,18 +4105,19 @@ pgxc_node_remote_prepare(char *prepareGID, bool localNode, bool implicit) { #ifdef __SUPPORT_DISTRIBUTED_TRANSACTION__ + if(implicit) + { if(enable_distri_print) { - elog(LOG, "send prepare timestamp for xid %d gid %s prepare ts " - INT64_FORMAT, GetTopTransactionIdIfAny(), + elog(LOG, "send prepare timestamp for xid %d gid %s prepare ts " INT64_FORMAT,GetTopTransactionIdIfAny(), prepareGID, global_prepare_ts); } if (pgxc_node_send_prepare_timestamp(conn, global_prepare_ts)) { ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("failed to send global prepare committs for " - "PREPARED command"))); + errmsg("failed to send global prepare committs for PREPARED command"))); + } } #endif /* Send down prepare command */ @@ -4151,10 +4151,11 @@ pgxc_node_remote_prepare(char *prepareGID, bool localNode, bool implicit) #endif #ifdef __SUPPORT_DISTRIBUTED_TRANSACTION__ + if(implicit) + { if(enable_distri_print) { - elog(LOG, "send prepare timestamp for xid %d gid %s prepare ts " - INT64_FORMAT, GetTopTransactionIdIfAny(), + elog(LOG, "send prepare timestamp for xid %d gid %s prepare ts " INT64_FORMAT,GetTopTransactionIdIfAny(), prepareGID, global_prepare_ts); } if (pgxc_node_send_prepare_timestamp(conn, global_prepare_ts)) @@ -4168,8 +4169,8 @@ pgxc_node_remote_prepare(char *prepareGID, bool localNode, bool implicit) #endif ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("failed to send global prepare committs for " - "PREPARED command"))); + errmsg("failed to send global prepare committs for PREPARED command"))); + } } #endif @@ -4308,18 +4309,19 @@ pgxc_node_remote_prepare(char *prepareGID, bool localNode, bool implicit) if (conn->read_only) { #ifdef __SUPPORT_DISTRIBUTED_TRANSACTION__ + if(implicit) + { if(enable_distri_print) { - elog(LOG, "send prepare timestamp for xid %d gid %s prepare ts " - INT64_FORMAT,GetTopTransactionIdIfAny(), + elog(LOG, "send prepare timestamp for xid %d gid %s prepare ts " 
INT64_FORMAT,GetTopTransactionIdIfAny(), prepareGID, global_prepare_ts); } if (pgxc_node_send_prepare_timestamp(conn, global_prepare_ts)) { ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("failed to send global prepare committs for " - "PREPARED command"))); + errmsg("failed to send global prepare committs for PREPARED command"))); + } } #endif /* Send down prepare command */ @@ -4350,10 +4352,11 @@ pgxc_node_remote_prepare(char *prepareGID, bool localNode, bool implicit) #endif #ifdef __SUPPORT_DISTRIBUTED_TRANSACTION__ + if(implicit) + { if(enable_distri_print) { - elog(LOG, "send prepare timestamp for xid %d gid %s prepare ts " - INT64_FORMAT,GetTopTransactionIdIfAny(), + elog(LOG, "send prepare timestamp for xid %d gid %s prepare ts " INT64_FORMAT,GetTopTransactionIdIfAny(), prepareGID, global_prepare_ts); } if (pgxc_node_send_prepare_timestamp(conn, global_prepare_ts)) @@ -4367,8 +4370,8 @@ pgxc_node_remote_prepare(char *prepareGID, bool localNode, bool implicit) #endif ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("failed to send global prepare committs for " - "PREPARED command"))); + errmsg("failed to send global prepare committs for PREPARED command"))); + } } #endif diff --git a/src/backend/postmaster/clean2pc.c b/src/backend/postmaster/clean2pc.c index c1e3a31f..def81c95 100644 --- a/src/backend/postmaster/clean2pc.c +++ b/src/backend/postmaster/clean2pc.c @@ -17,7 +17,6 @@ #include "postgres.h" #include "access/htup_details.h" -#include "catalog/namespace.h" #include "catalog/pg_database.h" #include "catalog/pg_type.h" #include "commands/dbcommands.h" @@ -59,7 +58,7 @@ typedef enum bool enable_clean_2pc_launcher = true; int auto_clean_2pc_interval = 60; -int auto_clean_2pc_delay = 60; +int auto_clean_2pc_delay = 300; int auto_clean_2pc_timeout = 1200; int auto_clean_2pc_max_check_time = 1200; @@ -89,8 +88,6 @@ static void start_clean_worker(int count); static void do_query_2pc(TimestampTz clean_time); static void do_clean_2pc(TimestampTz clean_time); -static bool check_pg_clean_extension(void); - static void clean_2pc_sigterm_handler(SIGNAL_ARGS); static void clean_2pc_sighup_handler(SIGNAL_ARGS); static void clean_2pc_sigusr2_handler(SIGNAL_ARGS); @@ -435,12 +432,6 @@ do_query_2pc(TimestampTz clean_time) Assert(result_str != NULL); resetStringInfo(result_str); - if (!check_pg_clean_extension()) - { - elog(WARNING, "create extension pg_clean please"); - return; - } - check_time = (curr_time - clean_time)/USECS_PER_SEC; if (check_time < 0) @@ -695,40 +686,6 @@ do_clean_2pc(TimestampTz clean_time) } } -/* - * check if pg_clean_check_txn funciton exist - */ -static bool -check_pg_clean_extension(void) -{ - bool res = false; - List *names = NULL; - FuncCandidateList clist = NULL; - char *fuc_name = "pg_clean_check_txn"; - - StartTransactionCommand(); - - /* - * Parse the name into components and see if it matches any pg_proc - * entries in the current search path. 
- */ - names = list_make1(makeString(fuc_name)); - clist = FuncnameGetCandidates(names, -1, NIL, false, false, true); - - if (clist == NULL || clist->next != NULL) - { - res = false; - } - else - { - res = true; - } - - CommitTransactionCommand(); - - return res; -} - /* SIGTERM: set flag to exit normally */ static void clean_2pc_sigterm_handler(SIGNAL_ARGS) diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 3f2e046a..8b7af537 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -4887,11 +4887,7 @@ static struct config_int ConfigureNamesInt[] = GUC_UNIT_S }, &auto_clean_2pc_interval, -#ifdef __TWO_PHASE_TESTS__ - 60, 0, INT_MAX, -#else - 60, 30, INT_MAX, -#endif + 60, 10, INT_MAX, NULL, NULL, NULL }, @@ -4902,11 +4898,7 @@ static struct config_int ConfigureNamesInt[] = GUC_UNIT_S }, &auto_clean_2pc_delay, -#ifdef __TWO_PHASE_TESTS__ - 60, 0, INT_MAX, -#else - 60, 30, INT_MAX, -#endif + 300, 3, INT_MAX, NULL, NULL, NULL }, @@ -4917,11 +4909,7 @@ static struct config_int ConfigureNamesInt[] = GUC_UNIT_S }, &auto_clean_2pc_timeout, -#ifdef __TWO_PHASE_TESTS__ 1200, 0, INT_MAX, -#else - 1200, 30, INT_MAX, -#endif NULL, NULL, NULL }, @@ -4932,11 +4920,7 @@ static struct config_int ConfigureNamesInt[] = GUC_UNIT_S }, &auto_clean_2pc_max_check_time, -#ifdef __TWO_PHASE_TESTS__ 1200, 0, INT_MAX, -#else - 1200, 30, INT_MAX, -#endif NULL, NULL, NULL }, diff --git a/src/include/access/twophase.h b/src/include/access/twophase.h index 132f19d8..06f9685e 100644 --- a/src/include/access/twophase.h +++ b/src/include/access/twophase.h @@ -81,10 +81,6 @@ #include "gtm/gtm_c.h" #define GIDSIZE (200 + 24) - -/* 2pc xlog v2 add prepare timestamp */ -#define XLOG_FMT_2PC_V2 "fmt_v2" - /* * GlobalTransactionData is defined in twophase.c; other places have no * business knowing the internal definition. 
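The guc.c hunk above also retunes the 2PC auto-clean settings: auto_clean_2pc_delay now defaults to 300 s, and the lower bounds are relaxed (interval >= 10 s, delay >= 3 s, timeout and max_check_time >= 0). A minimal tuning sketch follows; the parameter names and bounds come from the patch, while the chosen values and the assumption that the settings are reloadable (not postmaster-only) are illustrative only:

-- Hedged sketch: adjust the auto-clean parameters changed in the guc.c hunk above.
-- Values are examples, not recommendations; we assume these GUCs accept reload.
ALTER SYSTEM SET auto_clean_2pc_interval = 60;        -- seconds between scans (minimum now 10)
ALTER SYSTEM SET auto_clean_2pc_delay = 300;          -- new default: wait 300 s before cleaning (minimum now 3)
ALTER SYSTEM SET auto_clean_2pc_timeout = 1200;
ALTER SYSTEM SET auto_clean_2pc_max_check_time = 1200;
SELECT pg_reload_conf();
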
From c3495ad823068e6ec6f8f39fa05f5c3948083b62 Mon Sep 17 00:00:00 2001 From: andrelin Date: Fri, 22 Apr 2022 16:15:29 +0800 Subject: [PATCH 548/578] pull up to cn if has user defined functions of plpgsql code sync from 5.06.1.1 Author: arrowbowang --- src/backend/commands/functioncmds.c | 32 +++++++++++- src/backend/nodes/copyfuncs.c | 1 + src/backend/optimizer/util/clauses.c | 39 +++++++++++++- src/backend/optimizer/util/pathnode.c | 38 +++++++------- src/backend/parser/analyze.c | 1 + src/backend/parser/gram.y | 11 +++- src/backend/parser/parse_func.c | 34 ++++++++++++ src/backend/utils/adt/ruleutils.c | 2 + src/backend/utils/cache/lsyscache.c | 36 +++++++++++++ src/bin/pg_dump/pg_dump.c | 15 ++++++ src/include/catalog/pg_proc.h | 2 +- src/include/nodes/parsenodes.h | 1 + src/include/nodes/relation.h | 2 + src/include/optimizer/clauses.h | 1 + src/include/parser/kwlist.h | 1 + src/include/parser/parse_func.h | 58 +++++++++++---------- src/include/parser/parse_node.h | 1 + src/include/utils/lsyscache.h | 2 + src/test/regress/expected/privileges.out | 2 +- src/test/regress/expected/rowsecurity_1.out | 2 +- src/test/regress/expected/select_views.out | 2 +- src/test/regress/expected/union_1.out | 34 ++++++++++-- src/test/regress/sql/privileges.sql | 2 +- src/test/regress/sql/rowsecurity.sql | 2 +- src/test/regress/sql/select_views.sql | 2 +- src/test/regress/sql/union.sql | 11 +++- 26 files changed, 273 insertions(+), 61 deletions(-) diff --git a/src/backend/commands/functioncmds.c b/src/backend/commands/functioncmds.c index a9bba8e3..a0cd8d3d 100644 --- a/src/backend/commands/functioncmds.c +++ b/src/backend/commands/functioncmds.c @@ -456,6 +456,7 @@ compute_common_attribute(ParseState *pstate, DefElem **strict_item, DefElem **security_item, DefElem **leakproof_item, + DefElem **pushdow_item, List **set_items, DefElem **cost_item, DefElem **rows_item, @@ -489,6 +490,13 @@ compute_common_attribute(ParseState *pstate, *leakproof_item = defel; } + else if (strcmp(defel->defname, "pushdown") == 0) + { + if (*pushdow_item) + goto duplicate_error; + + *pushdow_item = defel; + } else if (strcmp(defel->defname, "set") == 0) { *set_items = lappend(*set_items, defel->arg); @@ -612,6 +620,7 @@ compute_attributes_sql_style(ParseState *pstate, bool *strict_p, bool *security_definer, bool *leakproof_p, + bool *pushable_p, ArrayType **proconfig, float4 *procost, float4 *prorows, @@ -626,6 +635,7 @@ compute_attributes_sql_style(ParseState *pstate, DefElem *strict_item = NULL; DefElem *security_item = NULL; DefElem *leakproof_item = NULL; + DefElem *pushdown_item = NULL; List *set_items = NIL; DefElem *cost_item = NULL; DefElem *rows_item = NULL; @@ -677,6 +687,7 @@ compute_attributes_sql_style(ParseState *pstate, &strict_item, &security_item, &leakproof_item, + &pushdown_item, &set_items, &cost_item, &rows_item, @@ -724,6 +735,8 @@ compute_attributes_sql_style(ParseState *pstate, *security_definer = intVal(security_item->arg); if (leakproof_item) *leakproof_p = intVal(leakproof_item->arg); + if (pushdown_item) + *pushable_p = intVal(pushdown_item->arg); if (set_items) *proconfig = update_proconfig_value(NULL, set_items); if (cost_item) @@ -883,7 +896,8 @@ CreateFunction(ParseState *pstate, CreateFunctionStmt *stmt) bool isWindowFunc, isStrict, security, - isLeakProof; + isLeakProof, + isPushdown; char volatility; ArrayType *proconfig; float4 procost; @@ -908,6 +922,7 @@ CreateFunction(ParseState *pstate, CreateFunctionStmt *stmt) isStrict = false; security = false; isLeakProof = false; + isPushdown = 
false; volatility = PROVOLATILE_VOLATILE; proconfig = NULL; procost = -1; /* indicates not set */ @@ -919,7 +934,7 @@ CreateFunction(ParseState *pstate, CreateFunctionStmt *stmt) stmt->options, &as_clause, &language, &transformDefElem, &isWindowFunc, &volatility, - &isStrict, &security, &isLeakProof, + &isStrict, &security, &isLeakProof, &isPushdown, &proconfig, &procost, &prorows, ¶llel); /* Look up the language and validate permissions */ @@ -1064,6 +1079,9 @@ CreateFunction(ParseState *pstate, CreateFunctionStmt *stmt) else procost = 100; } + if(isPushdown) + procost = -procost; + if (prorows < 0) { if (returnsSet) @@ -1174,6 +1192,7 @@ AlterFunction(ParseState *pstate, AlterFunctionStmt *stmt) DefElem *strict_item = NULL; DefElem *security_def_item = NULL; DefElem *leakproof_item = NULL; + DefElem *pushdown_item = NULL; List *set_items = NIL; DefElem *cost_item = NULL; DefElem *rows_item = NULL; @@ -1212,6 +1231,7 @@ AlterFunction(ParseState *pstate, AlterFunctionStmt *stmt) &strict_item, &security_def_item, &leakproof_item, + &pushdown_item, &set_items, &cost_item, &rows_item, @@ -1241,6 +1261,14 @@ AlterFunction(ParseState *pstate, AlterFunctionStmt *stmt) (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("COST must be positive"))); } + if (pushdown_item) + { + bool pushdown = intVal(pushdown_item->arg); + if (pushdown && procForm->procost > 0) + procForm->procost = -procForm->procost; + if ((!pushdown) && procForm->procost < 0) + procForm->procost = -procForm->procost; + } if (rows_item) { procForm->prorows = defGetNumeric(rows_item); diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c index 5bc4e05c..e2e6b7fc 100644 --- a/src/backend/nodes/copyfuncs.c +++ b/src/backend/nodes/copyfuncs.c @@ -3260,6 +3260,7 @@ _copyQuery(const Query *from) COPY_SCALAR_FIELD(isSingleValues); COPY_SCALAR_FIELD(isMultiValues); COPY_SCALAR_FIELD(hasUnshippableTriggers); + COPY_SCALAR_FIELD(hasCoordFuncs); COPY_STRING_FIELD(copy_filename); #endif COPY_NODE_FIELD(cteList); diff --git a/src/backend/optimizer/util/clauses.c b/src/backend/optimizer/util/clauses.c index ef96602f..f2c9cc1e 100644 --- a/src/backend/optimizer/util/clauses.c +++ b/src/backend/optimizer/util/clauses.c @@ -5367,4 +5367,41 @@ replace_eval_sql_value_function(Node *node) return expression_tree_mutator(node, replace_eval_sql_value_function, NULL); } -#endif \ No newline at end of file +#endif +/***************************************************************************** + * Check clauses for pull-up-ed user defined functions + *****************************************************************************/ + +static bool +contain_user_defined_functions_checker(Oid func_id, void *context) +{ + return func_is_pullup(func_id); +} + +static bool +contain_check_functions_walker(Node *node, bool (*checker)()) +{ + if (node == NULL) + return false; + + if (check_functions_in_node(node, checker, + NULL)) + return true; + + /* Recurse to check arguments */ + if (IsA(node, Query)) + { + /* Recurse into subselects */ + return query_tree_walker((Query *) node, + contain_check_functions_walker, + checker, 0); + } + return expression_tree_walker(node, contain_check_functions_walker, + checker); +} + +bool +contain_user_defined_functions(Node *clause) +{ + return contain_check_functions_walker(clause, &contain_user_defined_functions_checker); +} diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index 5c1a3ca5..f3d1adb2 100644 --- a/src/backend/optimizer/util/pathnode.c +++ 
b/src/backend/optimizer/util/pathnode.c @@ -1657,6 +1657,26 @@ set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) if (innerd == NULL && outerd == NULL) return NIL; #ifdef __TBASE__ + /* + * DML may need to push down to datanodes, for example: + * DELETE FROM + * geocode_settings as gc + * USING geocode_settings_default AS gf + * WHERE + * gf.name = gc.name and gf.setting = gc.setting; + * prefer_olap means pulling query up to coordinator node, in case data + * re-distribute in TPC-C test case. + * + * TODO: We need to automatically determine whether we need to pull it up, + * but not using GUC. + */ + if(!dml && + (!prefer_olap || + (root->parse && + root->parse->hasCoordFuncs))) + { + goto pull_up; + } /* * If outer or inner subpaths are distributed by shard and they do not exist @@ -1785,24 +1805,6 @@ set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) return alternate; } - /* - * DML may need to push down to datanodes, for example: - * DELETE FROM - * geocode_settings as gc - * USING geocode_settings_default AS gf - * WHERE - * gf.name = gc.name and gf.setting = gc.setting; - * prefer_olap means pulling query up to coordinator node, in case data - * re-distribute in TPC-C test case. - * - * TODO: We need to automatically determine whether we need to pull it up, - * but not using GUC. - */ - if(!prefer_olap && false == dml) - { - goto pull_up; - } - restrictClauses = list_copy(pathnode->joinrestrictinfo); restrictClauses = list_concat(restrictClauses, pathnode->movedrestrictinfo); diff --git a/src/backend/parser/analyze.c b/src/backend/parser/analyze.c index f5a26ad3..c68d7b06 100644 --- a/src/backend/parser/analyze.c +++ b/src/backend/parser/analyze.c @@ -447,6 +447,7 @@ transformStmt(ParseState *pstate, Node *parseTree) /* Mark as original query until we learn differently */ result->querySource = QSRC_ORIGINAL; result->canSetTag = true; + result->hasCoordFuncs = pstate->p_hasCoordFuncs; return result; } diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y index dad866bf..756b4bad 100644 --- a/src/backend/parser/gram.y +++ b/src/backend/parser/gram.y @@ -720,7 +720,7 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); PARALLEL PARSER PARTIAL PARTITION PARTITIONS PASSING PASSWORD PAUSE PLACING PLANS POLICY POSITION PRECEDING PRECISION PREFERRED PRESERVE PREPARE PREPARED PRIMARY - PRIOR PRIVILEGES PROCEDURAL PROCEDURE PROGRAM PUBLICATION + PRIOR PRIVILEGES PROCEDURAL PROCEDURE PROGRAM PUBLICATION PUSHDOWN QUOTE @@ -8287,6 +8287,14 @@ common_func_opt_item: { $$ = makeDefElem("leakproof", (Node *)makeInteger(FALSE), @1); } + | PUSHDOWN + { + $$ = makeDefElem("pushdown", (Node *)makeInteger(TRUE), @1); + } + | NOT PUSHDOWN + { + $$ = makeDefElem("pushdown", (Node *)makeInteger(FALSE), @1); + } | COST NumericOnly { $$ = makeDefElem("cost", (Node *)$2, @1); @@ -16853,6 +16861,7 @@ unreserved_keyword: | PROCEDURE | PROGRAM | PUBLICATION + | PUSHDOWN | QUOTE /* PGXC_BEGIN */ | RANDOMLY diff --git a/src/backend/parser/parse_func.c b/src/backend/parser/parse_func.c index 8778a4b7..ac9fc9c0 100644 --- a/src/backend/parser/parse_func.c +++ b/src/backend/parser/parse_func.c @@ -18,10 +18,12 @@ #include "catalog/pg_aggregate.h" #include "catalog/pg_proc.h" #include "catalog/pg_type.h" +#include "commands/proclang.h" #include "funcapi.h" #include "lib/stringinfo.h" #include "nodes/makefuncs.h" #include "nodes/nodeFuncs.h" +#include "optimizer/clauses.h" #include "parser/parse_agg.h" #include "parser/parse_clause.h" #include 
"parser/parse_coerce.h" @@ -253,6 +255,8 @@ ParseFuncOrColumn(ParseState *pstate, List *funcname, List *fargs, cancel_parser_errposition_callback(&pcbstate); + pstate->p_hasCoordFuncs = func_is_pullup(funcid); + if (fdresult == FUNCDETAIL_COERCION) { /* @@ -2257,3 +2261,33 @@ check_srf_call_placement(ParseState *pstate, Node *last_srf, int location) ParseExprKindName(pstate->p_expr_kind)), parser_errposition(pstate, location))); } + +bool +func_is_pullup(Oid func_id) +{ + char *name = NULL; + if (func_id >= FirstNormalObjectId) + { + Oid func_lang_oid; + Oid plpgsql_oid; + float cost; + + /* + * A set returning function is not supposed to be in targetlist + * so ignore it. + */ + if (get_func_retset(func_id)) + return false; + + /* A stable function surely can be pushed down to DN */ + if (func_volatile(func_id) == PROVOLATILE_STABLE) + return false; + + func_lang_oid = get_func_lang(func_id); + plpgsql_oid = get_language_oid("plpgsql", true); + cost = get_func_cost_with_sign(func_id); + if (func_lang_oid == plpgsql_oid && cost >= 0) + return true; + } + return false; +} diff --git a/src/backend/utils/adt/ruleutils.c b/src/backend/utils/adt/ruleutils.c index 7d16c0f6..e7a6bdd9 100644 --- a/src/backend/utils/adt/ruleutils.c +++ b/src/backend/utils/adt/ruleutils.c @@ -2613,6 +2613,8 @@ pg_get_functiondef(PG_FUNCTION_ARGS) appendStringInfoString(&buf, " SECURITY DEFINER"); if (proc->proleakproof) appendStringInfoString(&buf, " LEAKPROOF"); + if (proc->procost < 0) + appendStringInfoString(&buf, " PUSHDOWN"); /* This code for the default cost and rows should match functioncmds.c */ if (proc->prolang == INTERNALlanguageId || diff --git a/src/backend/utils/cache/lsyscache.c b/src/backend/utils/cache/lsyscache.c index 9061c0ed..33005a7e 100644 --- a/src/backend/utils/cache/lsyscache.c +++ b/src/backend/utils/cache/lsyscache.c @@ -1766,6 +1766,23 @@ get_func_cost(Oid funcid) elog(ERROR, "cache lookup failed for function %u", funcid); result = ((Form_pg_proc) GETSTRUCT(tp))->procost; + if (result < 0) + result = -result; + ReleaseSysCache(tp); + return result; +} + +float4 +get_func_cost_with_sign(Oid funcid) +{ + HeapTuple tp; + float4 result; + + tp = SearchSysCache1(PROCOID, ObjectIdGetDatum(funcid)); + if (!HeapTupleIsValid(tp)) + elog(ERROR, "cache lookup failed for function %u", funcid); + + result = ((Form_pg_proc) GETSTRUCT(tp))->procost; ReleaseSysCache(tp); return result; } @@ -1789,6 +1806,25 @@ get_func_rows(Oid funcid) return result; } +Oid +get_func_lang(Oid funcid) +{ + HeapTuple tp; + + tp = SearchSysCache1(PROCOID, ObjectIdGetDatum(funcid)); + if (HeapTupleIsValid(tp)) + { + Form_pg_proc functup = (Form_pg_proc) GETSTRUCT(tp); + Oid result; + + result = functup->prolang; + ReleaseSysCache(tp); + return result; + } + else + return InvalidOid; +} + /* ---------- RELATION CACHE ---------- */ /* diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c index 89685c9f..f987ae1d 100644 --- a/src/bin/pg_dump/pg_dump.c +++ b/src/bin/pg_dump/pg_dump.c @@ -12020,6 +12020,21 @@ dumpFunc(Archive *fout, FuncInfo *finfo) * break backwards-compatibility of the dump without need. Keep this code * in sync with the defaults in functioncmds.c. 
*/ + if(procost[0] == '-') + { + char* temp; + int len; + + appendPQExpBufferStr(q, " PUSHDOWN"); + len = strlen(procost); + temp = pg_malloc(len + 1); + strcpy(temp, procost+1); + temp[len-1] = '\0'; + strcpy(procost, temp); + procost[len-1] = 0; + pg_free(temp); + temp = NULL; + } if (strcmp(procost, "0") != 0) { if (strcmp(lanname, "internal") == 0 || strcmp(lanname, "c") == 0) diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h index 8f79ca30..1eb1b97f 100644 --- a/src/include/catalog/pg_proc.h +++ b/src/include/catalog/pg_proc.h @@ -100,7 +100,7 @@ CATALOG(pg_proc,1255) BKI_BOOTSTRAP BKI_ROWTYPE_OID(81) BKI_SCHEMA_MACRO Oid pronamespace; /* OID of namespace containing this proc */ Oid proowner; /* procedure owner */ Oid prolang; /* OID of pg_language entry */ - float4 procost; /* estimated execution cost */ + float4 procost; /* estimated execution cost, the negtive number means the function can be pushed down*/ float4 prorows; /* estimated # of rows out (if proretset) */ Oid provariadic; /* element type of variadic array, or 0 */ regproc protransform; /* transforms calls to it during planning */ diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h index e8ac3d54..f6e83887 100644 --- a/src/include/nodes/parsenodes.h +++ b/src/include/nodes/parsenodes.h @@ -162,6 +162,7 @@ typedef struct Query * only used for DML. Will be set at the plan phase * in shippability check. */ + bool hasCoordFuncs; char *copy_filename; /* fake filename for copy from */ Bitmapset *conflict_cols; #endif diff --git a/src/include/nodes/relation.h b/src/include/nodes/relation.h index 689a392f..0736ab79 100644 --- a/src/include/nodes/relation.h +++ b/src/include/nodes/relation.h @@ -225,6 +225,8 @@ typedef struct PlannerGlobal bool parallelModeNeeded; /* parallel mode actually required? 
*/ + bool hasCoordFuncs; + char maxParallelHazard; /* worst PROPARALLEL hazard level */ } PlannerGlobal; diff --git a/src/include/optimizer/clauses.h b/src/include/optimizer/clauses.h index e55c6033..52918fd6 100644 --- a/src/include/optimizer/clauses.h +++ b/src/include/optimizer/clauses.h @@ -95,4 +95,5 @@ extern Node *replace_distribkey_func(Node *node); extern Node *replace_eval_sql_value_function(Node *node); +extern bool contain_user_defined_functions(Node *clause); #endif /* CLAUSES_H */ diff --git a/src/include/parser/kwlist.h b/src/include/parser/kwlist.h index d77a2e68..8d54cc60 100644 --- a/src/include/parser/kwlist.h +++ b/src/include/parser/kwlist.h @@ -382,6 +382,7 @@ PG_KEYWORD("procedural", PROCEDURAL, UNRESERVED_KEYWORD) PG_KEYWORD("procedure", PROCEDURE, UNRESERVED_KEYWORD) PG_KEYWORD("program", PROGRAM, UNRESERVED_KEYWORD) PG_KEYWORD("publication", PUBLICATION, UNRESERVED_KEYWORD) +PG_KEYWORD("pushdown", PUSHDOWN, UNRESERVED_KEYWORD) PG_KEYWORD("quote", QUOTE, UNRESERVED_KEYWORD) #ifdef PGXC PG_KEYWORD("randomly", RANDOMLY, UNRESERVED_KEYWORD) diff --git a/src/include/parser/parse_func.h b/src/include/parser/parse_func.h index 4b8697fe..c6199410 100644 --- a/src/include/parser/parse_func.h +++ b/src/include/parser/parse_func.h @@ -21,54 +21,56 @@ /* Result codes for func_get_detail */ typedef enum { - FUNCDETAIL_NOTFOUND, /* no matching function */ - FUNCDETAIL_MULTIPLE, /* too many matching functions */ - FUNCDETAIL_NORMAL, /* found a matching regular function */ - FUNCDETAIL_AGGREGATE, /* found a matching aggregate function */ - FUNCDETAIL_WINDOWFUNC, /* found a matching window function */ - FUNCDETAIL_COERCION /* it's a type coercion request */ + FUNCDETAIL_NOTFOUND, /* no matching function */ + FUNCDETAIL_MULTIPLE, /* too many matching functions */ + FUNCDETAIL_NORMAL, /* found a matching regular function */ + FUNCDETAIL_AGGREGATE, /* found a matching aggregate function */ + FUNCDETAIL_WINDOWFUNC, /* found a matching window function */ + FUNCDETAIL_COERCION /* it's a type coercion request */ } FuncDetailCode; extern Node *ParseFuncOrColumn(ParseState *pstate, List *funcname, List *fargs, - Node *last_srf, FuncCall *fn, int location); + Node *last_srf, FuncCall *fn, int location); extern FuncDetailCode func_get_detail(List *funcname, - List *fargs, List *fargnames, - int nargs, Oid *argtypes, - bool expand_variadic, bool expand_defaults, - Oid *funcid, Oid *rettype, - bool *retset, int *nvargs, Oid *vatype, - Oid **true_typeids, List **argdefaults); + List *fargs, List *fargnames, + int nargs, Oid *argtypes, + bool expand_variadic, bool expand_defaults, + Oid *funcid, Oid *rettype, + bool *retset, int *nvargs, Oid *vatype, + Oid **true_typeids, List **argdefaults); extern int func_match_argtypes(int nargs, - Oid *input_typeids, - FuncCandidateList raw_candidates, - FuncCandidateList *candidates); + Oid *input_typeids, + FuncCandidateList raw_candidates, + FuncCandidateList *candidates); extern FuncCandidateList func_select_candidate(int nargs, - Oid *input_typeids, - FuncCandidateList candidates); + Oid *input_typeids, + FuncCandidateList candidates); extern void make_fn_arguments(ParseState *pstate, - List *fargs, - Oid *actual_arg_types, - Oid *declared_arg_types); + List *fargs, + Oid *actual_arg_types, + Oid *declared_arg_types); extern const char *funcname_signature_string(const char *funcname, int nargs, - List *argnames, const Oid *argtypes); + List *argnames, const Oid *argtypes); extern const char *func_signature_string(List *funcname, int nargs, - List 
*argnames, const Oid *argtypes); + List *argnames, const Oid *argtypes); extern Oid LookupFuncName(List *funcname, int nargs, const Oid *argtypes, - bool noError); + bool noError); extern Oid LookupFuncWithArgs(ObjectWithArgs *func, - bool noError); + bool noError); extern Oid LookupAggWithArgs(ObjectWithArgs *agg, - bool noError); + bool noError); extern void check_srf_call_placement(ParseState *pstate, Node *last_srf, - int location); + int location); extern void check_pg_get_expr_args(ParseState *pstate, Oid fnoid, List *args); -#endif /* PARSE_FUNC_H */ + +extern bool func_is_pullup(Oid func_id); +#endif /* PARSE_FUNC_H */ diff --git a/src/include/parser/parse_node.h b/src/include/parser/parse_node.h index 0f0490d6..5ae643ce 100644 --- a/src/include/parser/parse_node.h +++ b/src/include/parser/parse_node.h @@ -260,6 +260,7 @@ struct ParseState bool p_hasTargetSRFs; bool p_hasSubLinks; bool p_hasModifyingCTE; + bool p_hasCoordFuncs; Node *p_last_srf; /* most recent set-returning func/op found */ diff --git a/src/include/utils/lsyscache.h b/src/include/utils/lsyscache.h index e94c510b..6ad2a50f 100644 --- a/src/include/utils/lsyscache.h +++ b/src/include/utils/lsyscache.h @@ -132,7 +132,9 @@ extern char func_volatile(Oid funcid); extern char func_parallel(Oid funcid); extern bool get_func_leakproof(Oid funcid); extern float4 get_func_cost(Oid funcid); +extern float4 get_func_cost_with_sign(Oid funcid); extern float4 get_func_rows(Oid funcid); +extern Oid get_func_lang(Oid funcid); extern Oid get_relname_relid(const char *relname, Oid relnamespace); #ifdef PGXC extern int get_relnatts(Oid relid); diff --git a/src/test/regress/expected/privileges.out b/src/test/regress/expected/privileges.out index ccf6aba3..d7454603 100644 --- a/src/test/regress/expected/privileges.out +++ b/src/test/regress/expected/privileges.out @@ -194,7 +194,7 @@ CREATE INDEX ON atest12 (abs(a)); VACUUM ANALYZE atest12; CREATE FUNCTION leak(integer,integer) RETURNS boolean AS $$begin return $1 < $2; end$$ - LANGUAGE plpgsql immutable; + LANGUAGE plpgsql immutable pushdown; CREATE OPERATOR <<< (procedure = leak, leftarg = integer, rightarg = integer, restrict = scalarltsel); -- view with leaky operator diff --git a/src/test/regress/expected/rowsecurity_1.out b/src/test/regress/expected/rowsecurity_1.out index 770c320f..2370da75 100644 --- a/src/test/regress/expected/rowsecurity_1.out +++ b/src/test/regress/expected/rowsecurity_1.out @@ -28,7 +28,7 @@ GRANT ALL ON SCHEMA regress_rls_schema to public; SET search_path = regress_rls_schema; -- setup of malicious function CREATE OR REPLACE FUNCTION f_leak(text) RETURNS bool - COST 0.0000001 LANGUAGE plpgsql + COST 0.0000001 LANGUAGE plpgsql pushdown AS 'BEGIN RAISE NOTICE ''f_leak => %'', $1; RETURN true; END'; GRANT EXECUTE ON FUNCTION f_leak(text) TO public; -- BASIC Row-Level Security Scenario diff --git a/src/test/regress/expected/select_views.out b/src/test/regress/expected/select_views.out index 2406dabc..9abe04ed 100644 --- a/src/test/regress/expected/select_views.out +++ b/src/test/regress/expected/select_views.out @@ -1252,7 +1252,7 @@ SELECT * FROM toyemp WHERE name = 'sharon'; -- CREATE ROLE regress_alice; CREATE FUNCTION f_leak (text) - RETURNS bool LANGUAGE 'plpgsql' COST 0.0000001 + RETURNS bool LANGUAGE 'plpgsql' COST 0.0000001 pushdown AS 'BEGIN RAISE NOTICE ''f_leak => %'', $1; RETURN true; END'; CREATE TABLE customer ( cid int primary key, diff --git a/src/test/regress/expected/union_1.out b/src/test/regress/expected/union_1.out index 41c0c7fa..08670b47 
100644 --- a/src/test/regress/expected/union_1.out +++ b/src/test/regress/expected/union_1.out @@ -822,15 +822,15 @@ ORDER BY x; -- Test proper handling of parameterized appendrel paths when the -- potential join qual is expensive create function expensivefunc(int) returns int -language plpgsql immutable strict cost 10000 +language plpgsql immutable strict pushdown cost 10000 as $$begin return $1; end$$; create temp table t3 as select generate_series(-1000,1000) as x; create index t3i on t3 (expensivefunc(x)); analyze t3; explain (num_nodes off, nodes off, costs off) select * from - (select * from t3 a union all select * from t3 b) ss - join int4_tbl on f1 = expensivefunc(x); + (select * from t3 a union all select * from t3 b) ss + join int4_tbl on f1 = expensivefunc(x); QUERY PLAN ------------------------------------------------------------------ Remote Subquery Scan on all @@ -843,6 +843,34 @@ select * from Index Cond: (expensivefunc(x) = int4_tbl.f1) (8 rows) +select * from + (select * from t3 a union all select * from t3 b) ss + join int4_tbl on f1 = expensivefunc(x); + x | f1 +---+---- + 0 | 0 + 0 | 0 +(2 rows) + +alter function expensivefunc not pushdown; +explain (num_nodes off, nodes off, costs off) +select * from + (select * from t3 a union all select * from t3 b) ss + join int4_tbl on f1 = expensivefunc(x); + QUERY PLAN +------------------------------------------------------------------------ + Nested Loop + -> Remote Subquery Scan on all + -> Seq Scan on int4_tbl + -> Materialize + -> Remote Subquery Scan on all + -> Append + -> Index Scan using t3i on t3 a + Index Cond: (expensivefunc(x) = int4_tbl.f1) + -> Index Scan using t3i on t3 b + Index Cond: (expensivefunc(x) = int4_tbl.f1) +(10 rows) + select * from (select * from t3 a union all select * from t3 b) ss join int4_tbl on f1 = expensivefunc(x); diff --git a/src/test/regress/sql/privileges.sql b/src/test/regress/sql/privileges.sql index 09122394..a6b92c2f 100644 --- a/src/test/regress/sql/privileges.sql +++ b/src/test/regress/sql/privileges.sql @@ -140,7 +140,7 @@ VACUUM ANALYZE atest12; CREATE FUNCTION leak(integer,integer) RETURNS boolean AS $$begin return $1 < $2; end$$ - LANGUAGE plpgsql immutable; + LANGUAGE plpgsql immutable pushdown; CREATE OPERATOR <<< (procedure = leak, leftarg = integer, rightarg = integer, restrict = scalarltsel); diff --git a/src/test/regress/sql/rowsecurity.sql b/src/test/regress/sql/rowsecurity.sql index 4ed98e68..9e5609e9 100644 --- a/src/test/regress/sql/rowsecurity.sql +++ b/src/test/regress/sql/rowsecurity.sql @@ -37,7 +37,7 @@ SET search_path = regress_rls_schema; -- setup of malicious function CREATE OR REPLACE FUNCTION f_leak(text) RETURNS bool - COST 0.0000001 LANGUAGE plpgsql + COST 0.0000001 LANGUAGE plpgsql pushdown AS 'BEGIN RAISE NOTICE ''f_leak => %'', $1; RETURN true; END'; GRANT EXECUTE ON FUNCTION f_leak(text) TO public; diff --git a/src/test/regress/sql/select_views.sql b/src/test/regress/sql/select_views.sql index 1b175469..a6820358 100644 --- a/src/test/regress/sql/select_views.sql +++ b/src/test/regress/sql/select_views.sql @@ -15,7 +15,7 @@ SELECT * FROM toyemp WHERE name = 'sharon'; CREATE ROLE regress_alice; CREATE FUNCTION f_leak (text) - RETURNS bool LANGUAGE 'plpgsql' COST 0.0000001 + RETURNS bool LANGUAGE 'plpgsql' COST 0.0000001 pushdown AS 'BEGIN RAISE NOTICE ''f_leak => %'', $1; RETURN true; END'; CREATE TABLE customer ( diff --git a/src/test/regress/sql/union.sql b/src/test/regress/sql/union.sql index bf51c9a5..58bc43a7 100644 --- 
a/src/test/regress/sql/union.sql +++ b/src/test/regress/sql/union.sql @@ -341,13 +341,22 @@ ORDER BY x; -- Test proper handling of parameterized appendrel paths when the -- potential join qual is expensive create function expensivefunc(int) returns int -language plpgsql immutable strict cost 10000 +language plpgsql immutable strict pushdown cost 10000 as $$begin return $1; end$$; create temp table t3 as select generate_series(-1000,1000) as x; create index t3i on t3 (expensivefunc(x)); analyze t3; +explain (num_nodes off, nodes off, costs off) +select * from + (select * from t3 a union all select * from t3 b) ss + join int4_tbl on f1 = expensivefunc(x); +select * from + (select * from t3 a union all select * from t3 b) ss + join int4_tbl on f1 = expensivefunc(x); + +alter function expensivefunc not pushdown; explain (num_nodes off, nodes off, costs off) select * from (select * from t3 a union all select * from t3 b) ss From 22758055448aed22b9189d14646930edbec74da5 Mon Sep 17 00:00:00 2001 From: andrelin Date: Wed, 1 Dec 2021 12:03:54 +0800 Subject: [PATCH 549/578] Support for creating a Result node to do specific qualifications Before this, a Result node can only do simple qualification like "Select 1 < 2; --true" or acting as a "gating" node, see create_gating_plan. This commit add a QualPath and create_qual_path to eventually create a Result node that able to do like plan: Result Filter(a < b) -> Remote Subquery Scan on all: -> Seqscan on t output: a, b It seems useless that qualification can performed just after scanning a. But it comes helpful if we have something computed AFTER a remote subplan collecting tuples, such as "rownum" expr and UDF that processed on CN. --- src/backend/executor/nodeResult.c | 409 +++++++++++++----------- src/backend/nodes/outfuncs.c | 14 + src/backend/optimizer/plan/createplan.c | 56 +++- src/backend/optimizer/util/pathnode.c | 28 ++ src/include/nodes/execnodes.h | 1 + src/include/nodes/nodes.h | 1 + src/include/nodes/relation.h | 7 + src/include/optimizer/pathnode.h | 1 + 8 files changed, 308 insertions(+), 209 deletions(-) diff --git a/src/backend/executor/nodeResult.c b/src/backend/executor/nodeResult.c index 905e4f1f..0269d6d3 100644 --- a/src/backend/executor/nodeResult.c +++ b/src/backend/executor/nodeResult.c @@ -1,44 +1,44 @@ /*------------------------------------------------------------------------- * * nodeResult.c - * support for constant nodes needing special code. + * support for constant nodes needing special code. * * DESCRIPTION * - * Result nodes are used in queries where no relations are scanned. - * Examples of such queries are: + * Result nodes are used in queries where no relations are scanned. + * Examples of such queries are: * - * select 1 * 2 + * select 1 * 2 * - * insert into emp values ('mike', 15000) + * insert into emp values ('mike', 15000) * - * (Remember that in an INSERT or UPDATE, we need a plan tree that - * generates the new rows.) + * (Remember that in an INSERT or UPDATE, we need a plan tree that + * generates the new rows.) 
* - * Result nodes are also used to optimise queries with constant - * qualifications (ie, quals that do not depend on the scanned data), - * such as: + * Result nodes are also used to optimise queries with constant + * qualifications (ie, quals that do not depend on the scanned data), + * such as: * - * select * from emp where 2 > 1 + * select * from emp where 2 > 1 * - * In this case, the plan generated is + * In this case, the plan generated is * - * Result (with 2 > 1 qual) - * / - * SeqScan (emp.*) + * Result (with 2 > 1 qual) + * / + * SeqScan (emp.*) * - * At runtime, the Result node evaluates the constant qual once, - * which is shown by EXPLAIN as a One-Time Filter. If it's - * false, we can return an empty result set without running the - * controlled plan at all. If it's true, we run the controlled - * plan normally and pass back the results. + * At runtime, the Result node evaluates the constant qual once, + * which is shown by EXPLAIN as a One-Time Filter. If it's + * false, we can return an empty result set without running the + * controlled plan at all. If it's true, we run the controlled + * plan normally and pass back the results. * * * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * src/backend/executor/nodeResult.c + * src/backend/executor/nodeResult.c * *------------------------------------------------------------------------- */ @@ -48,230 +48,249 @@ #include "executor/executor.h" #include "executor/nodeResult.h" #include "miscadmin.h" +#include "optimizer/clauses.h" #include "utils/memutils.h" /* ---------------------------------------------------------------- - * ExecResult(node) + * ExecResult(node) * - * returns the tuples from the outer plan which satisfy the - * qualification clause. Since result nodes with right - * subtrees are never planned, we ignore the right subtree - * entirely (for now).. -cim 10/7/89 + * returns the tuples from the outer plan which satisfy the + * qualification clause. Since result nodes with right + * subtrees are never planned, we ignore the right subtree + * entirely (for now).. -cim 10/7/89 * - * The qualification containing only constant clauses are - * checked first before any processing is done. It always returns - * 'nil' if the constant qualification is not satisfied. + * The qualification containing only constant clauses are + * checked first before any processing is done. It always returns + * 'nil' if the constant qualification is not satisfied. * ---------------------------------------------------------------- */ static TupleTableSlot * ExecResult(PlanState *pstate) { - ResultState *node = castNode(ResultState, pstate); - TupleTableSlot *outerTupleSlot; - PlanState *outerPlan; - ExprContext *econtext; - - CHECK_FOR_INTERRUPTS(); - - econtext = node->ps.ps_ExprContext; - - /* - * check constant qualifications like (2 > 1), if not already done - */ - if (node->rs_checkqual) - { - bool qualResult = ExecQual(node->resconstantqual, econtext); - - node->rs_checkqual = false; - if (!qualResult) - { - node->rs_done = true; - return NULL; - } - } - - /* - * Reset per-tuple memory context to free any expression evaluation - * storage allocated in the previous tuple cycle. - */ - ResetExprContext(econtext); - - /* - * if rs_done is true then it means that we were asked to return a - * constant tuple and we already did the last time ExecResult() was - * called, OR that we failed the constant qual check. 
Either way, now we - * are through. - */ - while (!node->rs_done) - { - outerPlan = outerPlanState(node); - - if (outerPlan != NULL) - { - /* - * retrieve tuples from the outer plan until there are no more. - */ - outerTupleSlot = ExecProcNode(outerPlan); - - if (TupIsNull(outerTupleSlot)) - return NULL; - - /* - * prepare to compute projection expressions, which will expect to - * access the input tuples as varno OUTER. - */ - econtext->ecxt_outertuple = outerTupleSlot; - } - else - { - /* - * if we don't have an outer plan, then we are just generating the - * results from a constant target list. Do it only once. - */ - node->rs_done = true; - } - - /* form the result tuple using ExecProject(), and return it */ - return ExecProject(node->ps.ps_ProjInfo); - } - - return NULL; + ResultState *node = castNode(ResultState, pstate); + ExprState *qual = node->ps.qual; + TupleTableSlot *outerTupleSlot; + PlanState *outerPlan; + ExprContext *econtext; + + CHECK_FOR_INTERRUPTS(); + + econtext = node->ps.ps_ExprContext; + + /* + * check constant qualifications like (2 > 1), if not already done + */ + if (node->rs_checkqual) + { + bool qualResult = ExecQual(node->resconstantqual, econtext); + + node->rs_checkqual = false; + if (!qualResult) + { + node->rs_done = true; + return NULL; + } + } + + /* + * Reset per-tuple memory context to free any expression evaluation + * storage allocated in the previous tuple cycle. + */ + ResetExprContext(econtext); + + /* + * if rs_done is true then it means that we were asked to return a + * constant tuple and we already did the last time ExecResult() was + * called, OR that we failed the constant qual check. Either way, now we + * are through. + */ + while (!node->rs_done) + { + outerPlan = outerPlanState(node); + + if (outerPlan != NULL) + { + /* + * retrieve tuples from the outer plan until there are no more. + */ + outerTupleSlot = ExecProcNode(outerPlan); + + if (TupIsNull(outerTupleSlot)) + return NULL; + + if (qual) + { + econtext->ecxt_outertuple = outerTupleSlot; + econtext->ecxt_scantuple = outerTupleSlot; + + if (!ExecQual(qual, econtext)) + { + if (node->rs_fail_return) + return NULL; + else + continue; + } + + ResetExprContext(econtext); + } + + /* + * prepare to compute projection expressions, which will expect to + * access the input tuples as varno OUTER. + */ + econtext->ecxt_outertuple = outerTupleSlot; + } + else + { + /* + * if we don't have an outer plan, then we are just generating the + * results from a constant target list. Do it only once. 
+ */ + node->rs_done = true; + } + + /* form the result tuple using ExecProject(), and return it */ + return ExecProject(node->ps.ps_ProjInfo); + } + + return NULL; } /* ---------------------------------------------------------------- - * ExecResultMarkPos + * ExecResultMarkPos * ---------------------------------------------------------------- */ void ExecResultMarkPos(ResultState *node) { - PlanState *outerPlan = outerPlanState(node); + PlanState *outerPlan = outerPlanState(node); - if (outerPlan != NULL) - ExecMarkPos(outerPlan); - else - elog(DEBUG2, "Result nodes do not support mark/restore"); + if (outerPlan != NULL) + ExecMarkPos(outerPlan); + else + elog(DEBUG2, "Result nodes do not support mark/restore"); } /* ---------------------------------------------------------------- - * ExecResultRestrPos + * ExecResultRestrPos * ---------------------------------------------------------------- */ void ExecResultRestrPos(ResultState *node) { - PlanState *outerPlan = outerPlanState(node); + PlanState *outerPlan = outerPlanState(node); - if (outerPlan != NULL) - ExecRestrPos(outerPlan); - else - elog(ERROR, "Result nodes do not support mark/restore"); + if (outerPlan != NULL) + ExecRestrPos(outerPlan); + else + elog(ERROR, "Result nodes do not support mark/restore"); } /* ---------------------------------------------------------------- - * ExecInitResult + * ExecInitResult * - * Creates the run-time state information for the result node - * produced by the planner and initializes outer relations - * (child nodes). + * Creates the run-time state information for the result node + * produced by the planner and initializes outer relations + * (child nodes). * ---------------------------------------------------------------- */ ResultState * ExecInitResult(Result *node, EState *estate, int eflags) { - ResultState *resstate; - - /* check for unsupported flags */ - Assert(!(eflags & (EXEC_FLAG_MARK | EXEC_FLAG_BACKWARD)) || - outerPlan(node) != NULL); - - /* - * create state structure - */ - resstate = makeNode(ResultState); - resstate->ps.plan = (Plan *) node; - resstate->ps.state = estate; - resstate->ps.ExecProcNode = ExecResult; - - resstate->rs_done = false; - resstate->rs_checkqual = (node->resconstantqual == NULL) ? false : true; - - /* - * Miscellaneous initialization - * - * create expression context for node - */ - ExecAssignExprContext(estate, &resstate->ps); - - /* - * tuple table initialization - */ - ExecInitResultTupleSlot(estate, &resstate->ps); - - /* - * initialize child expressions - */ - resstate->ps.qual = - ExecInitQual(node->plan.qual, (PlanState *) resstate); - resstate->resconstantqual = - ExecInitQual((List *) node->resconstantqual, (PlanState *) resstate); - - /* - * initialize child nodes - */ - outerPlanState(resstate) = ExecInitNode(outerPlan(node), estate, eflags); - - /* - * we don't use inner plan - */ - Assert(innerPlan(node) == NULL); - - /* - * initialize tuple type and projection info - */ - ExecAssignResultTypeFromTL(&resstate->ps); - ExecAssignProjectionInfo(&resstate->ps, NULL); - - return resstate; + ResultState *resstate; + + /* check for unsupported flags */ + Assert(!(eflags & (EXEC_FLAG_MARK | EXEC_FLAG_BACKWARD)) || + outerPlan(node) != NULL); + + /* + * create state structure + */ + resstate = makeNode(ResultState); + resstate->ps.plan = (Plan *) node; + resstate->ps.state = estate; + resstate->ps.ExecProcNode = ExecResult; + + resstate->rs_done = false; + resstate->rs_checkqual = (node->resconstantqual == NULL) ? 
false : true; + resstate->rs_fail_return = contain_rownum_fetch((Node *) node->plan.qual); + + /* + * Miscellaneous initialization + * + * create expression context for node + */ + ExecAssignExprContext(estate, &resstate->ps); + + /* + * tuple table initialization + */ + ExecInitResultTupleSlot(estate, &resstate->ps); + + /* + * initialize child expressions + */ + resstate->ps.qual = + ExecInitQual(node->plan.qual, (PlanState *) resstate); + resstate->resconstantqual = + ExecInitQual((List *) node->resconstantqual, (PlanState *) resstate); + + /* + * initialize child nodes + */ + outerPlanState(resstate) = ExecInitNode(outerPlan(node), estate, eflags); + + /* + * we don't use inner plan + */ + Assert(innerPlan(node) == NULL); + + /* + * initialize tuple type and projection info + */ + ExecAssignResultTypeFromTL(&resstate->ps); + ExecAssignProjectionInfo(&resstate->ps, NULL); + + return resstate; } /* ---------------------------------------------------------------- - * ExecEndResult + * ExecEndResult * - * frees up storage allocated through C routines + * frees up storage allocated through C routines * ---------------------------------------------------------------- */ void ExecEndResult(ResultState *node) { - /* - * Free the exprcontext - */ - ExecFreeExprContext(&node->ps); - - /* - * clean out the tuple table - */ - ExecClearTuple(node->ps.ps_ResultTupleSlot); - - /* - * shut down subplans - */ - ExecEndNode(outerPlanState(node)); + /* + * Free the exprcontext + */ + ExecFreeExprContext(&node->ps); + + /* + * clean out the tuple table + */ + ExecClearTuple(node->ps.ps_ResultTupleSlot); + + /* + * shut down subplans + */ + ExecEndNode(outerPlanState(node)); } void ExecReScanResult(ResultState *node) { - node->rs_done = false; - node->rs_checkqual = (node->resconstantqual == NULL) ? false : true; - - /* - * If chgParam of subnode is not null then plan will be re-scanned by - * first ExecProcNode. - */ - if (node->ps.lefttree && - node->ps.lefttree->chgParam == NULL) - ExecReScan(node->ps.lefttree); + node->rs_done = false; + node->rs_checkqual = (node->resconstantqual == NULL) ? false : true; + + /* + * If chgParam of subnode is not null then plan will be re-scanned by + * first ExecProcNode. 
+ */ + if (node->ps.lefttree && + node->ps.lefttree->chgParam == NULL) + ExecReScan(node->ps.lefttree); } diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c index f6b2295d..5a4602f5 100644 --- a/src/backend/nodes/outfuncs.c +++ b/src/backend/nodes/outfuncs.c @@ -3255,6 +3255,17 @@ _outGatherPath(StringInfo str, const GatherPath *node) WRITE_INT_FIELD(num_workers); } +static void +_outQualPath(StringInfo str, const QualPath *node) +{ + WRITE_NODE_TYPE("QUALPATH"); + + _outPathInfo(str, (const Path *) node); + + WRITE_NODE_FIELD(subpath); + WRITE_NODE_FIELD(quals); +} + static void _outProjectionPath(StringInfo str, const ProjectionPath *node) { @@ -5502,6 +5513,9 @@ outNode(StringInfo str, const void *obj) case T_ResultPath: _outResultPath(str, obj); break; + case T_QualPath: + _outQualPath(str, obj); + break; case T_MaterialPath: _outMaterialPath(str, obj); break; diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index 367b6766..08273bf9 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -132,6 +132,7 @@ static RemoteSubplan *create_remotescan_plan(PlannerInfo *root, RemoteSubPath *best_path); //static char *get_internal_cursor(void); #endif +static Result *create_qual_plan(PlannerInfo *root, QualPath *best_path); static ProjectSet *create_project_set_plan(PlannerInfo *root, ProjectSetPath *best_path); static Material *create_material_plan(PlannerInfo *root, MaterialPath *best_path, int flags); @@ -325,7 +326,7 @@ static SetOp *make_setop(SetOpCmd cmd, SetOpStrategy strategy, Plan *lefttree, List *distinctList, AttrNumber flagColIdx, int firstFlag, long numGroups); static LockRows *make_lockrows(Plan *lefttree, List *rowMarks, int epqParam); -static Result *make_result(List *tlist, Node *resconstantqual, Plan *subplan); +static Result *make_result(List *tlist, Node *resconstantqual, Plan *subplan, List *qual); static ProjectSet *make_project_set(List *tlist, Plan *subplan); static ModifyTable *make_modifytable(PlannerInfo *root, CmdType operation, bool canSetTag, @@ -479,6 +480,10 @@ create_plan_recurse(PlannerInfo *root, Path *best_path, int flags) plan = (Plan *) create_minmaxagg_plan(root, (MinMaxAggPath *) best_path); } + else if (IsA(best_path, QualPath)) + { + plan = (Plan *) create_qual_plan(root, (QualPath *) best_path); + } else { Assert(IsA(best_path, ResultPath)); @@ -1048,7 +1053,7 @@ create_scan_plan(PlannerInfo *root, Path *best_path, int flags) /* if (need_projection) { - plan = (Plan *)make_result(outtlist, NULL, plan); + plan = (Plan *)make_result(outtlist, NULL, plan, NULL); plan->parallel_aware = best_path->parallel_aware; } */ @@ -1259,7 +1264,7 @@ create_gating_plan(PlannerInfo *root, Path *path, Plan *plan, */ gplan = (Plan *) make_result(build_path_tlist(root, path), (Node *) gating_quals, - plan); + plan, NULL); /* * Notice that we don't change cost or size estimates when doing gating. 
@@ -1374,7 +1379,7 @@ create_append_plan(PlannerInfo *root, AppendPath *best_path) plan = (Plan *) make_result(tlist, (Node *) list_make1(makeBoolConst(false, false)), - NULL); + NULL, NULL); copy_generic_path_info(plan, (Path *) best_path); @@ -1545,7 +1550,29 @@ create_result_plan(PlannerInfo *root, ResultPath *best_path) /* best_path->quals is just bare clauses */ quals = order_qual_clauses(root, best_path->quals); - plan = make_result(tlist, (Node *) quals, NULL); + plan = make_result(tlist, (Node *) quals, NULL, NULL); + + copy_generic_path_info(&plan->plan, (Path *) best_path); + + return plan; +} + +static Result * +create_qual_plan(PlannerInfo *root, QualPath *best_path) +{ + Result *plan; + Plan *subplan; + List *tlist; + List *quals; + + subplan = create_plan_recurse(root, best_path->subpath, 0); + + tlist = build_path_tlist(root, &best_path->path); + + /* best_path->quals is just bare clauses */ + quals = order_qual_clauses(root, best_path->quals); + + plan = make_result(tlist, NULL, subplan, quals); copy_generic_path_info(&plan->plan, (Path *) best_path); @@ -2142,7 +2169,7 @@ create_projection_plan(PlannerInfo *root, ProjectionPath *best_path) else { /* We need a Result node */ - plan = (Plan *) make_result(tlist, NULL, subplan); + plan = (Plan *) make_result(tlist, NULL, subplan, NULL); copy_generic_path_info(plan, (Path *) best_path); } @@ -2166,7 +2193,7 @@ inject_projection_plan(Plan *subplan, List *tlist, bool parallel_safe) { Plan *plan; - plan = (Plan *) make_result(tlist, NULL, subplan); + plan = (Plan *) make_result(tlist, NULL, subplan, NULL); /* * In principle, we should charge tlist eval cost plus cpu_per_tuple per @@ -2626,7 +2653,7 @@ create_minmaxagg_plan(PlannerInfo *root, MinMaxAggPath *best_path) /* Generate the output plan --- basically just a Result */ tlist = build_path_tlist(root, &best_path->path); - plan = make_result(tlist, (Node *) best_path->quals, NULL); + plan = make_result(tlist, (Node *) best_path->quals, NULL, NULL); copy_generic_path_info(&plan->plan, (Path *) best_path); @@ -6842,7 +6869,7 @@ make_remotesubplan(PlannerInfo *root, { List *newtlist = list_copy(leftchild->targetlist); newtlist = lappend(newtlist, newtle); - leftchild = (Plan *) make_result(newtlist, NULL, leftchild); + leftchild = (Plan *) make_result(newtlist, NULL, leftchild, NULL); lefttree->lefttree = leftchild; } } @@ -6853,7 +6880,7 @@ make_remotesubplan(PlannerInfo *root, /* Use Result node to calculate expression */ List *newtlist = list_copy(lefttree->targetlist); newtlist = lappend(newtlist, newtle); - lefttree = (Plan *) make_result(newtlist, NULL, lefttree); + lefttree = (Plan *) make_result(newtlist, NULL, lefttree, NULL); } node->distributionKey = newtle->resno; @@ -7071,7 +7098,7 @@ make_remotesubplan(PlannerInfo *root, { /* copy needed so we don't modify input's tlist below */ tlist = copyObject(tlist); - lefttree = (Plan *) make_result(tlist, NULL, lefttree); + lefttree = (Plan *) make_result(tlist, NULL, lefttree, NULL); } /* @@ -8416,13 +8443,14 @@ make_limit(Plan *lefttree, Node *limitOffset, Node *limitCount, static Result * make_result(List *tlist, Node *resconstantqual, - Plan *subplan) -{// #lizard forgives + Plan *subplan, + List *qual) +{ Result *node = makeNode(Result); Plan *plan = &node->plan; plan->targetlist = tlist; - plan->qual = NIL; + plan->qual = qual; plan->lefttree = subplan; plan->righttree = NULL; node->resconstantqual = resconstantqual; diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index 
f3d1adb2..9b03a3d9 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -4649,6 +4649,34 @@ create_merge_append_path(PlannerInfo *root, return pathnode; } +QualPath * +create_qual_path(PlannerInfo *root, Path *subpath, List *quals) +{ + QualPath *pathnode = makeNode(QualPath); + RelOptInfo *rel = subpath->parent; + QualCost qual_cost; + Cost run_cost; + + cost_qual_eval(&qual_cost, quals, root); + + pathnode->path.pathtype = T_Result; + pathnode->path.parent = rel; + pathnode->path.pathtarget = subpath->pathtarget; + pathnode->path.parallel_safe = rel->consider_parallel; + + pathnode->quals = quals; + pathnode->subpath = subpath; + + pathnode->path.rows = subpath->rows; + run_cost = subpath->total_cost - subpath->startup_cost; + run_cost += (cpu_operator_cost + qual_cost.per_tuple) * pathnode->path.rows; + + pathnode->path.startup_cost = subpath->startup_cost + qual_cost.startup; + pathnode->path.total_cost = subpath->total_cost + run_cost; + + return pathnode; +} + /* * create_result_path * Creates a path representing a Result-and-nothing-else plan. diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index ddb99ddf..b12d1e31 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -1018,6 +1018,7 @@ typedef struct ResultState ExprState *resconstantqual; bool rs_done; /* are we done? */ bool rs_checkqual; /* do we need to check the qual? */ + bool rs_fail_return; /* should return after failing qual? */ } ResultState; /* ---------------- diff --git a/src/include/nodes/nodes.h b/src/include/nodes/nodes.h index 227af23f..854f36a4 100644 --- a/src/include/nodes/nodes.h +++ b/src/include/nodes/nodes.h @@ -304,6 +304,7 @@ typedef enum NodeTag T_AppendPath, T_MergeAppendPath, T_ResultPath, + T_QualPath, T_MaterialPath, T_UniquePath, T_GatherPath, diff --git a/src/include/nodes/relation.h b/src/include/nodes/relation.h index 0736ab79..a0f11da0 100644 --- a/src/include/nodes/relation.h +++ b/src/include/nodes/relation.h @@ -1639,6 +1639,13 @@ typedef struct ProjectionPath bool dummypp; /* true if no separate Result is needed */ } ProjectionPath; +typedef struct QualPath +{ + Path path; + Path *subpath; + List *quals; +} QualPath; + /* * ProjectSetPath represents evaluation of a targetlist that includes * set-returning function(s), which will need to be implemented by a diff --git a/src/include/optimizer/pathnode.h b/src/include/optimizer/pathnode.h index 505cb463..e1fe0a4f 100644 --- a/src/include/optimizer/pathnode.h +++ b/src/include/optimizer/pathnode.h @@ -133,6 +133,7 @@ extern MergeAppendPath *create_merge_append_path(PlannerInfo *root, List *pathkeys, Relids required_outer, List *partitioned_rels); +extern QualPath *create_qual_path(PlannerInfo *root, Path *subpath, List *quals); extern ResultPath *create_result_path(PlannerInfo *root, RelOptInfo *rel, PathTarget *target, List *resconstantqual); extern MaterialPath *create_material_path(RelOptInfo *rel, Path *subpath); From 76644d8823ce645f726fd49d5985bb3cc85f2d9b Mon Sep 17 00:00:00 2001 From: andrelin Date: Fri, 22 Apr 2022 16:48:17 +0800 Subject: [PATCH 550/578] Core changes of cn-udf implement Change the pathtarget by make_udf_input_target apply appropriate remote path, projection path and qual path before sort and grouping. 
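
As an illustration only (not one of the hunks below), the per-path handling this adds to grouping_planner() is roughly the following sketch. The helper names (create_remotesubplan_path, apply_projection_to_path, create_qual_path, cn_process_target, udf_quals) are the ones introduced or used by this series; cn_udf()/cn_udf2() further down stand for any hypothetical function that contain_user_defined_functions() classifies as coordinator-only:

    /*
     * Sketch: when parse->hasCoordFuncs is set, gather every distributed
     * path up to the coordinator, project the target that still contains
     * the CN-only UDF expressions, then filter with the collected
     * udf_quals through a QualPath (a Result node carrying plan->qual).
     */
    foreach(lc, current_rel->pathlist)
    {
        Path   *path = (Path *) lfirst(lc);

        /* must collect tuples on the CN before evaluating CN-only UDFs */
        if (path->distribution != NULL)
            path = create_remotesubplan_path(root, path, NULL);

        /* re-attach the UDF expressions stripped by make_udf_input_target */
        path = apply_projection_to_path(root, current_rel,
                                        path, cn_process_target);

        /* evaluate the CN-only quals last, on the coordinator */
        if (root->udf_quals != NIL)
            path = (Path *) create_qual_path(root, path, root->udf_quals);

        lfirst(lc) = path;
    }
    set_cheapest(current_rel);

For a query such as SELECT a, cn_udf(b) FROM t WHERE cn_udf2(c), make_udf_input_target strips the UDF expressions so the datanodes only ship a, b and c; cn_udf(b) and the qual cn_udf2(c) are then evaluated on the coordinator by the projection and QualPath steps shown above.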
--- src/backend/executor/nodeResult.c | 6 +- src/backend/optimizer/path/allpaths.c | 3 + src/backend/optimizer/plan/createplan.c | 16 +++ src/backend/optimizer/plan/initsplan.c | 20 ++++ src/backend/optimizer/plan/planner.c | 136 ++++++++++++++++++++++++ src/include/nodes/execnodes.h | 1 - src/include/nodes/relation.h | 1 + 7 files changed, 177 insertions(+), 6 deletions(-) diff --git a/src/backend/executor/nodeResult.c b/src/backend/executor/nodeResult.c index 0269d6d3..82aaa846 100644 --- a/src/backend/executor/nodeResult.c +++ b/src/backend/executor/nodeResult.c @@ -126,10 +126,7 @@ ExecResult(PlanState *pstate) if (!ExecQual(qual, econtext)) { - if (node->rs_fail_return) - return NULL; - else - continue; + continue; } ResetExprContext(econtext); @@ -214,7 +211,6 @@ ExecInitResult(Result *node, EState *estate, int eflags) resstate->rs_done = false; resstate->rs_checkqual = (node->resconstantqual == NULL) ? false : true; - resstate->rs_fail_return = contain_rownum_fetch((Node *) node->plan.qual); /* * Miscellaneous initialization diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c index b5ddbfcd..1b6b71bb 100644 --- a/src/backend/optimizer/path/allpaths.c +++ b/src/backend/optimizer/path/allpaths.c @@ -2038,7 +2038,10 @@ set_subquery_pathlist(PlannerInfo *root, RelOptInfo *rel, /* * The upper query might not use all the subquery's output columns; if * not, we can simplify. + * + * but if upper query have cn-udf, don't try it. */ + if (!root->udf_quals) remove_unused_subquery_outputs(subquery, rel); /* diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index 08273bf9..f674826f 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -1131,6 +1131,13 @@ use_physical_tlist(PlannerInfo *root, Path *path, int flags) return false; /* + * if we got cn-udf or rownum expr, return false to use + * pathtarget to generate tlist. + */ + if (root->parse && root->parse->hasCoordFuncs) + return false; + + /* * We can do this for real relation scans, subquery scans, function scans, * tablefunc scans, values scans, and CTE scans (but not for, eg, joins). */ @@ -8456,6 +8463,14 @@ make_result(List *tlist, node->resconstantqual = resconstantqual; #ifdef XCP + /* + * Do not consider pushing down node if this node is make to process any + * project or qual that contain rownum or cn-udf. + */ + if (contain_user_defined_functions((Node *) tlist) || + contain_user_defined_functions((Node *) qual)) + return node; + if (subplan) { /* @@ -8828,6 +8843,7 @@ is_projection_capable_path(Path *path) case T_ModifyTable: case T_MergeAppend: case T_RecursiveUnion: + case T_RemoteSubplan: return false; case T_Append: diff --git a/src/backend/optimizer/plan/initsplan.c b/src/backend/optimizer/plan/initsplan.c index ab0972d5..92ddfc0f 100644 --- a/src/backend/optimizer/plan/initsplan.c +++ b/src/backend/optimizer/plan/initsplan.c @@ -1740,6 +1740,8 @@ distribute_qual_to_rels(PlannerInfo *root, Node *clause, Relids nullable_relids; RestrictInfo *restrictinfo; + bool contain_udf = contain_user_defined_functions((Node *) clause); + /* * Retrieve all relids mentioned within the clause. 
*/ @@ -2092,9 +2094,27 @@ distribute_qual_to_rels(PlannerInfo *root, Node *clause, } } + if (root->parse && root->parse->commandType == CMD_SELECT && contain_udf) + { + List *quals_var; + + /* clause contain cn-udf, don't distribute it to rels, collect it */ + root->udf_quals = lappend(root->udf_quals, restrictinfo->clause); + + /* cn-udf quals will not distribute to rels, but vars must be added */ + quals_var = pull_var_clause((Node *) root->udf_quals, + PVC_RECURSE_AGGREGATES | + PVC_RECURSE_WINDOWFUNCS | + PVC_INCLUDE_PLACEHOLDERS); + + add_vars_to_targetlist(root, quals_var, bms_make_singleton(0), false); + } + else + { /* No EC special case applies, so push it into the clause lists */ distribute_restrictinfo_to_rels(root, restrictinfo); } +} /* * check_outerjoin_delay diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index df9a5333..090b3588 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -1758,6 +1758,72 @@ inheritance_planner(PlannerInfo *root) SS_assign_special_param(root))); } +/* + * Like make_rownum_input_target, exclude any udf expr from origin_target, + * only those udf that need to execute on CN will be considered, check + * function: contain_user_defined_functions. + */ +static PathTarget * +make_udf_input_target(PlannerInfo *root, PathTarget *origin_target) +{ + PathTarget *input_target = create_empty_pathtarget(); + Query *parse = root->parse; + List *udf_cols = NIL; + List *udf_vars = NIL; + int i; + ListCell *lc; + + i = 0; + foreach(lc, origin_target->exprs) + { + Expr *expr = (Expr *) lfirst(lc); + Index sgref = get_pathtarget_sortgroupref(origin_target, i); + + if (!contain_user_defined_functions((Node *) expr)) + { + add_column_to_pathtarget(input_target, expr, sgref); + } + else + { + /* + * Non-cn-udf column, so just remember the expression for later + * call to pull_var_clause. + */ + udf_cols = lappend(udf_cols, expr); + } + + i++; + } + + /* + * TODO: having cn-udf expr. + */ + if (parse->havingQual) + udf_cols = lappend(udf_cols, parse->havingQual); + + udf_cols = list_concat(udf_cols, list_copy(root->udf_quals)); + + /* + * Pull out all the Vars mentioned in non-cn-udf cols, and + * add them to the input target if not already present. Note this + * includes Vars used in resjunk items, so we are covering the needs of + * ORDER BY and window specifications. Vars used within Aggrefs and + * WindowFuncs will be pulled out here, too. + */ + udf_vars = pull_var_clause((Node *) udf_cols, + PVC_RECURSE_AGGREGATES | + PVC_RECURSE_WINDOWFUNCS | + PVC_INCLUDE_PLACEHOLDERS); + add_new_columns_to_pathtarget(input_target, udf_vars); + + /* clean up cruft */ + list_free(udf_vars); + list_free(udf_cols); + + /* XXX this causes some redundant cost calculation ... */ + return set_pathtarget_cost_width(root, input_target); +} + /*-------------------- * grouping_planner * Perform planning steps related to grouping, aggregation, etc. 
@@ -1906,6 +1972,7 @@ grouping_planner(PlannerInfo *root, bool inheritance_update, List *scanjoin_targets; List *scanjoin_targets_contain_srfs; bool scanjoin_target_parallel_safe; + PathTarget *cn_process_target; /* including rownum_target */ bool have_grouping; AggClauseCosts agg_costs; WindowFuncLists *wflists = NULL; @@ -2096,6 +2163,39 @@ grouping_planner(PlannerInfo *root, bool inheritance_update, } /* + * In postgresql, vars in qual didn't count into targetlist as junk, + * since they are evaluated just after scan happened, but a qual with + * rownum expr or cn-udf will be evaluated after collecting tuple + * to CN, so we need to pull out vars from them. + * + * This is a bit ugly doing things here, but root->rownum_quals and + * root->udf_quals are determined after query_planner, and targetlist + * is determined way before that. + */ + if (root->udf_quals) + { + List *quals_var = pull_var_clause((Node *) root->udf_quals, + PVC_RECURSE_AGGREGATES | + PVC_RECURSE_WINDOWFUNCS | + PVC_INCLUDE_PLACEHOLDERS); + + /* copy to make other targets clean */ + if (scanjoin_target == grouping_target) + scanjoin_target = copy_pathtarget(scanjoin_target); + + foreach(lc, quals_var) + { + if (!list_member(scanjoin_target->exprs, lfirst_node(Var, lc))) + add_column_to_pathtarget(scanjoin_target, (Expr *) lfirst_node(Var, lc), 0); + } + } + + cn_process_target = scanjoin_target; + /* exclude cn-udf from scanjoin_target */ + if (parse->hasCoordFuncs) + scanjoin_target = make_udf_input_target(root, scanjoin_target); + + /* * If there are any SRFs in the targetlist, we must separate each of * these PathTargets into SRF-computing and SRF-free targets. Replace * each of the named targets with a SRF-free version, and remember the @@ -2235,6 +2335,42 @@ grouping_planner(PlannerInfo *root, bool inheritance_update, root->upper_targets[UPPERREL_WINDOW] = sort_input_target; root->upper_targets[UPPERREL_GROUP_AGG] = grouping_target; + if (parse->hasCoordFuncs) + { + Path *path; + + foreach(lc, current_rel->pathlist) + { + path = (Path *) lfirst(lc); + + /* must collect tuple to cn for further processing */ + if (path->distribution != NULL) + path = create_remotesubplan_path(root, path, NULL); + + /* add other projection step, currently it's only cn-udf */ + path = apply_projection_to_path(root, current_rel, + path, cn_process_target); + + /* then evaluate other qual on CN, currently it's only cn-udf */ + if (root->udf_quals != NIL) + path = (Path *) create_qual_path(root, path, root->udf_quals); + + /* apply final target if no grouping and no post-pone projection */ + if (!have_grouping && final_target == sort_input_target && !activeWindows) + path = apply_projection_to_path(root, current_rel, + path, final_target); + + lfirst(lc) = path; + } + + set_cheapest(current_rel); + } + else if (root->udf_quals != NIL) + { + /* cn-quals found but no cn-target specified, should not happen but raise an error */ + elog(ERROR, "remote qualification must exist in target list"); + } + /* * If we have grouping and/or aggregation, consider ways to implement * that. We build a new upperrel representing the output of this diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index b12d1e31..ddb99ddf 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -1018,7 +1018,6 @@ typedef struct ResultState ExprState *resconstantqual; bool rs_done; /* are we done? */ bool rs_checkqual; /* do we need to check the qual? */ - bool rs_fail_return; /* should return after failing qual? 
*/ } ResultState; /* ---------------- diff --git a/src/include/nodes/relation.h b/src/include/nodes/relation.h index a0f11da0..2be2556e 100644 --- a/src/include/nodes/relation.h +++ b/src/include/nodes/relation.h @@ -433,6 +433,7 @@ typedef struct PlannerInfo bool haspart_tobe_modify; Index partrelindex; Bitmapset *partpruning; + List *udf_quals; /* quals that contain CN-udf */ #endif #endif } PlannerInfo; From 9e05e5e9c002681596e46d8ae9c2ecf9266d9682 Mon Sep 17 00:00:00 2001 From: andrelin Date: Fri, 22 Apr 2022 18:37:48 +0800 Subject: [PATCH 551/578] Write and read Query struct in a smart way for upgrade --- src/backend/nodes/equalfuncs.c | 1 + src/backend/nodes/outfuncs.c | 1 + src/backend/nodes/readfuncs.c | 13 ++++++++++++- 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/src/backend/nodes/equalfuncs.c b/src/backend/nodes/equalfuncs.c index f5f2bc77..aff497b5 100644 --- a/src/backend/nodes/equalfuncs.c +++ b/src/backend/nodes/equalfuncs.c @@ -986,6 +986,7 @@ _equalQuery(const Query *a, const Query *b) COMPARE_SCALAR_FIELD(hasModifyingCTE); COMPARE_SCALAR_FIELD(hasForUpdate); COMPARE_SCALAR_FIELD(hasRowSecurity); + COMPARE_SCALAR_FIELD(hasCoordFuncs); COMPARE_NODE_FIELD(cteList); COMPARE_NODE_FIELD(rtable); COMPARE_NODE_FIELD(jointree); diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c index 5a4602f5..c7892f52 100644 --- a/src/backend/nodes/outfuncs.c +++ b/src/backend/nodes/outfuncs.c @@ -4310,6 +4310,7 @@ _outQuery(StringInfo str, const Query *node) WRITE_BOOL_FIELD(hasModifyingCTE); WRITE_BOOL_FIELD(hasForUpdate); WRITE_BOOL_FIELD(hasRowSecurity); + WRITE_BOOL_FIELD(hasCoordFuncs); WRITE_NODE_FIELD(cteList); WRITE_NODE_FIELD(rtable); WRITE_NODE_FIELD(jointree); diff --git a/src/backend/nodes/readfuncs.c b/src/backend/nodes/readfuncs.c index 96f4ca05..28cd7dbd 100644 --- a/src/backend/nodes/readfuncs.c +++ b/src/backend/nodes/readfuncs.c @@ -572,7 +572,18 @@ _readQuery(void) READ_BOOL_FIELD(hasModifyingCTE); READ_BOOL_FIELD(hasForUpdate); READ_BOOL_FIELD(hasRowSecurity); - READ_NODE_FIELD(cteList); + token = pg_strtok(&length); /* get :fldname hasRowSecurity or cteList */ + if (strncmp(nullable_string(token, length), ":hasCoordFuncs", length) == 0) + { + token = pg_strtok(&length); /* get field value */ + local_node->hasCoordFuncs = strtobool(token); + token = pg_strtok(&length); /* skip :fldname cteList */ + } + else + { + local_node->hasCoordFuncs = false; + } + local_node->cteList = nodeRead(NULL, 0); READ_NODE_FIELD(rtable); READ_NODE_FIELD(jointree); READ_NODE_FIELD(targetList); From d6f9378aa09232d7406eed85cd05caf033236d77 Mon Sep 17 00:00:00 2001 From: andrelin Date: Sun, 24 Apr 2022 15:19:09 +0800 Subject: [PATCH 552/578] Ban pull-up functinos in DML on DN --- src/backend/optimizer/path/allpaths.c | 11 +++++++++++ src/backend/parser/analyze.c | 10 ++++++++++ 2 files changed, 21 insertions(+) diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c index 1b6b71bb..61769e22 100644 --- a/src/backend/optimizer/path/allpaths.c +++ b/src/backend/optimizer/path/allpaths.c @@ -1951,6 +1951,17 @@ set_subquery_pathlist(PlannerInfo *root, RelOptInfo *rel, RelOptInfo *sub_final_rel; ListCell *lc; + if (subquery->hasCoordFuncs && + (parse->commandType == CMD_UPDATE || + parse->commandType == CMD_INSERT || + parse->commandType == CMD_DELETE)) + { + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("DML has a subquery contains a function runs on CN"), + errhint("You might need to push that function down 
to DN."))); + } + /* * Must copy the Query so that planning doesn't mess up the RTE contents * (really really need to fix the planner to not scribble on its input, diff --git a/src/backend/parser/analyze.c b/src/backend/parser/analyze.c index c68d7b06..a720c1fc 100644 --- a/src/backend/parser/analyze.c +++ b/src/backend/parser/analyze.c @@ -448,6 +448,16 @@ transformStmt(ParseState *pstate, Node *parseTree) result->querySource = QSRC_ORIGINAL; result->canSetTag = true; result->hasCoordFuncs = pstate->p_hasCoordFuncs; + if (result->hasCoordFuncs && + (result->commandType == CMD_UPDATE || + result->commandType == CMD_INSERT || + result->commandType == CMD_DELETE)) + { + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("DML contains a function runs on CN which is not supported"), + errhint("You might need to push that function down to DN."))); + } return result; } From a8bd6e852c9805374188c7946550c6e90ef4bdbc Mon Sep 17 00:00:00 2001 From: andrelin Date: Sun, 24 Apr 2022 17:04:18 +0800 Subject: [PATCH 553/578] sync regress --- src/test/regress/expected/plpgsql_1.out | 24 +++--- .../regress/expected/select_parallel_4.out | 22 +++--- src/test/regress/expected/subselect.out | 6 +- src/test/regress/expected/transactions.out | 6 +- .../regress/expected/updatable_views_1.out | 78 ++++++++++++++++--- src/test/regress/expected/xc_remote.out | 2 +- src/test/regress/sql/transactions.sql | 4 +- src/test/regress/sql/updatable_views.sql | 28 +++++++ src/test/regress/sql/xc_remote.sql | 2 +- 9 files changed, 134 insertions(+), 38 deletions(-) diff --git a/src/test/regress/expected/plpgsql_1.out b/src/test/regress/expected/plpgsql_1.out index 4efbac5f..20f49e11 100644 --- a/src/test/regress/expected/plpgsql_1.out +++ b/src/test/regress/expected/plpgsql_1.out @@ -1548,26 +1548,26 @@ update PSlot set slotlink = 'HS.base.hub1.1' where slotname = 'PS.base.b2'; -- -- PGXCTODO: This is failing due to issue 3522907, complicated SELECT queries in plpgsql functions select * from PField_v1 where pfname = 'PF0_1' order by slotname; - pfname | slotname | backside | patch ---------+----------------------+----------------------------+----------------- - PF0_1 | PS.base.a1 | WS.001.1a in room 001 -> - | PS.base.ta1 -> - PF0_1 | PS.base.a2 | | - - PF0_1 | PS.base.a3 | WS.001.2a in room 001 -> - | PS.base.ta2 -> + pfname | slotname | backside | patch +--------+----------------------+----------------------------+------------------ + PF0_1 | PS.base.a1 | WS.001.1a in room 001 -> - | PS.base.ta1 -> - + PF0_1 | PS.base.a2 | WS.001.1b in room 001 -> - | - + PF0_1 | PS.base.a3 | WS.001.2a in room 001 -> - | PS.base.ta2 -> - PF0_1 | PS.base.a4 | - | - PF0_1 | PS.base.a5 | - | - PF0_1 | PS.base.a6 | - | - - PF0_1 | PS.base.b1 | | PS.base.ta5 -> - PF0_1 | PS.base.b2 | | - PF0_1 | PS.base.b3 | | PS.base.tb2 -> - PF0_1 | PS.base.b4 | | - + PF0_1 | PS.base.b1 | WS.002.1a in room 002 -> - | PS.base.ta5 -> - + PF0_1 | PS.base.b2 | WS.002.1b in room 002 -> - | + PF0_1 | PS.base.b3 | WS.002.2a in room 002 -> - | PS.base.tb2 -> - + PF0_1 | PS.base.b4 | WS.002.2b in room 002 -> - | - PF0_1 | PS.base.b5 | WS.002.3a in room 002 -> - | - - PF0_1 | PS.base.b6 | | - + PF0_1 | PS.base.b6 | WS.002.3b in room 002 -> - | - PF0_1 | PS.base.c1 | WS.003.1a in room 003 -> - | - PF0_1 | PS.base.c2 | WS.003.1b in room 003 -> - | - PF0_1 | PS.base.c3 | WS.003.2a in room 003 -> - | - - PF0_1 | PS.base.c4 | | - + PF0_1 | PS.base.c4 | WS.003.2b in room 003 -> - | - PF0_1 | PS.base.c5 | WS.003.3a in room 003 -> - | - - PF0_1 | PS.base.c6 | | - 
+ PF0_1 | PS.base.c6 | WS.003.3b in room 003 -> - | - (18 rows) select * from PField_v1 where pfname = 'PF0_2' order by slotname; diff --git a/src/test/regress/expected/select_parallel_4.out b/src/test/regress/expected/select_parallel_4.out index f3b81ec8..b57f5248 100644 --- a/src/test/regress/expected/select_parallel_4.out +++ b/src/test/regress/expected/select_parallel_4.out @@ -43,20 +43,21 @@ explain (verbose, costs off) select parallel_restricted(unique1) from tenk1 where stringu1 = 'GRAAAA' order by 1; QUERY PLAN ---------------------------------------------------------------- - Remote Subquery Scan on all (datanode_1,datanode_2) - Output: parallel_restricted(unique1) - Sort Key: parallel_restricted(tenk1.unique1) - -> Sort +--------------------------------------------------------------------- + Sort Output: (parallel_restricted(unique1)) Sort Key: (parallel_restricted(tenk1.unique1)) - -> Gather + -> Result Output: parallel_restricted(unique1) + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Output: unique1 + -> Gather + Output: unique1 Workers Planned: 4 -> Parallel Seq Scan on public.tenk1 Output: unique1 Filter: (tenk1.stringu1 = 'GRAAAA'::name) -(12 rows) +(13 rows) -- test parallel plan when group by expression is in target list. explain (costs off) @@ -125,14 +126,15 @@ explain (costs off) select sum(parallel_restricted(unique1)) from tenk1 group by(parallel_restricted(unique1)); QUERY PLAN -------------------------------------------------------------------------- +----------------------------------------------------------------- HashAggregate Group Key: parallel_restricted(unique1) + -> Result -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Gather Workers Planned: 4 - -> Parallel Index Only Scan using tenk1_unique1 on tenk1 -(6 rows) + -> Parallel Seq Scan on tenk1 +(7 rows) -- test parallel plans for queries containing un-correlated subplans. alter table tenk2 set (parallel_workers = 0); diff --git a/src/test/regress/expected/subselect.out b/src/test/regress/expected/subselect.out index 32ed8e4f..876fd5c8 100644 --- a/src/test/regress/expected/subselect.out +++ b/src/test/regress/expected/subselect.out @@ -1081,9 +1081,9 @@ select * from where tattle(x, 8); QUERY PLAN ---------------------------------------------------------- - Subquery Scan on ss - Output: x, u - Filter: tattle(ss.x, 8) + Result + Output: (9), (unnest('{1,2,3,11,12,13}'::integer[])) + Filter: tattle((9), 8) -> ProjectSet Output: 9, unnest('{1,2,3,11,12,13}'::integer[]) -> Result diff --git a/src/test/regress/expected/transactions.out b/src/test/regress/expected/transactions.out index 659d5a7c..4b5c54d5 100644 --- a/src/test/regress/expected/transactions.out +++ b/src/test/regress/expected/transactions.out @@ -582,6 +582,10 @@ end$$ language plpgsql volatile; create table revalidate_bug (c float8 unique); insert into revalidate_bug values (1); insert into revalidate_bug values (inverse(0)); +ERROR: DML contains a function runs on CN which is not supported +HINT: You might need to push that function down to DN. 
+alter function inverse(int) pushdown; +insert into revalidate_bug values (inverse(0)); drop table revalidate_bug; drop function inverse(int); -- verify that cursors created during an aborted subtransaction are @@ -633,7 +637,7 @@ fetch from foo; abort; -- Test for proper cleanup after a failure in a cursor portal -- that was created in an outer subtransaction -CREATE FUNCTION invert(x float8) RETURNS float8 LANGUAGE plpgsql AS +CREATE FUNCTION invert(x float8) RETURNS float8 pushdown LANGUAGE plpgsql AS $$ begin return 1/x; end $$; CREATE FUNCTION create_temp_tab() RETURNS text LANGUAGE plpgsql AS $$ diff --git a/src/test/regress/expected/updatable_views_1.out b/src/test/regress/expected/updatable_views_1.out index e13b4537..ae85f4e2 100644 --- a/src/test/regress/expected/updatable_views_1.out +++ b/src/test/regress/expected/updatable_views_1.out @@ -1829,6 +1829,8 @@ END; $$ LANGUAGE plpgsql STRICT IMMUTABLE LEAKPROOF; SELECT * FROM rw_view1 WHERE snoop(person); +NOTICE: snooped value: Tom +NOTICE: snooped value: Harry person -------- Tom @@ -1836,7 +1838,15 @@ SELECT * FROM rw_view1 WHERE snoop(person); (2 rows) UPDATE rw_view1 SET person=person WHERE snoop(person); +ERROR: DML contains a function runs on CN which is not supported +HINT: You might need to push that function down to DN. DELETE FROM rw_view1 WHERE NOT snoop(person); +ERROR: DML contains a function runs on CN which is not supported +HINT: You might need to push that function down to DN. +ALTER FUNCTION snoop(anyelement) pushdown; +UPDATE rw_view1 SET person=person WHERE snoop(person); +DELETE FROM rw_view1 WHERE NOT snoop(person); +ALTER FUNCTION snoop(anyelement) not pushdown; ALTER VIEW rw_view1 SET (security_barrier = true); SELECT table_name, is_insertable_into FROM information_schema.tables @@ -1864,12 +1874,21 @@ SELECT table_name, column_name, is_updatable (1 row) SELECT * FROM rw_view1 WHERE snoop(person); +NOTICE: snooped value: Tom +NOTICE: snooped value: Harry person -------- Tom Harry (2 rows) +UPDATE rw_view1 SET person=person WHERE snoop(person); +ERROR: DML contains a function runs on CN which is not supported +HINT: You might need to push that function down to DN. +DELETE FROM rw_view1 WHERE NOT snoop(person); +ERROR: DML contains a function runs on CN which is not supported +HINT: You might need to push that function down to DN. +ALTER FUNCTION snoop(anyelement) pushdown; UPDATE rw_view1 SET person=person WHERE snoop(person); DELETE FROM rw_view1 WHERE NOT snoop(person); EXPLAIN (costs off, nodes off) SELECT * FROM rw_view1 WHERE snoop(person); @@ -1900,6 +1919,7 @@ EXPLAIN (costs off, nodes off) DELETE FROM rw_view1 WHERE NOT snoop(person); Filter: ((visibility = 'public'::text) AND (NOT snoop(person))) (4 rows) +ALTER FUNCTION snoop(anyelement) not pushdown; -- security barrier view on top of security barrier view CREATE VIEW rw_view2 WITH (security_barrier = true) AS SELECT * FROM rw_view1 WHERE snoop(person); @@ -1929,20 +1949,31 @@ SELECT table_name, column_name, is_updatable (1 row) SELECT * FROM rw_view2 WHERE snoop(person); +NOTICE: snooped value: Tom +NOTICE: snooped value: Tom +NOTICE: snooped value: Harry +NOTICE: snooped value: Harry person -------- Tom Harry (2 rows) +UPDATE rw_view2 SET person=person WHERE snoop(person); +ERROR: DML contains a function runs on CN which is not supported +HINT: You might need to push that function down to DN. 
+DELETE FROM rw_view2 WHERE NOT snoop(person); +ERROR: DML contains a function runs on CN which is not supported +HINT: You might need to push that function down to DN. +ALTER FUNCTION snoop(anyelement) pushdown; UPDATE rw_view2 SET person=person WHERE snoop(person); DELETE FROM rw_view2 WHERE NOT snoop(person); EXPLAIN (costs off, nodes off) SELECT * FROM rw_view2 WHERE snoop(person); QUERY PLAN ----------------------------------------------------------- - Remote Subquery Scan on all - -> Subquery Scan on rw_view2 - Filter: snoop(rw_view2.person) + Subquery Scan on rw_view2 + Filter: snoop(rw_view2.person) + -> Remote Subquery Scan on all -> Subquery Scan on rw_view1 Filter: snoop(rw_view1.person) -> Seq Scan on base_tbl @@ -1967,6 +1998,7 @@ EXPLAIN (costs off, nodes off) DELETE FROM rw_view2 WHERE NOT snoop(person); Filter: ((visibility = 'public'::text) AND snoop(person) AND (NOT snoop(person))) (4 rows) +ALTER FUNCTION snoop(anyelement) not pushdown; DROP TABLE base_tbl CASCADE; NOTICE: drop cascades to 2 other objects DETAIL: drop cascades to view rw_view1 @@ -1989,6 +2021,13 @@ SELECT * FROM rw_view1; 1 | Row 1 (1 row) +EXPLAIN (costs off, nodes off) DELETE FROM rw_view1 WHERE id = 1 AND snoop(data); +ERROR: DML contains a function runs on CN which is not supported +HINT: You might need to push that function down to DN. +DELETE FROM rw_view1 WHERE id = 1 AND snoop(data); +ERROR: DML contains a function runs on CN which is not supported +HINT: You might need to push that function down to DN. +ALTER FUNCTION snoop(anyelement) pushdown; EXPLAIN (costs off, nodes off) DELETE FROM rw_view1 WHERE id = 1 AND snoop(data); QUERY PLAN ------------------------------------------------------------------------- @@ -2003,6 +2042,7 @@ EXPLAIN (costs off, nodes off) DELETE FROM rw_view1 WHERE id = 1 AND snoop(data) (8 rows) DELETE FROM rw_view1 WHERE id = 1 AND snoop(data); +ALTER FUNCTION snoop(anyelement) not pushdown; EXPLAIN (costs off, nodes off) INSERT INTO rw_view1 VALUES (2, 'New row 2'); QUERY PLAN ----------------------------------------------------------------------- @@ -2085,6 +2125,14 @@ SELECT * FROM v1 WHERE a=8; 8 | 8 | t111 | t11d (4 rows) +EXPLAIN (VERBOSE, COSTS OFF) +UPDATE v1 SET a=100 WHERE snoop(a) AND leakproof(a) AND a < 7 AND a != 6; +ERROR: DML contains a function runs on CN which is not supported +HINT: You might need to push that function down to DN. +UPDATE v1 SET a=100 WHERE snoop(a) AND leakproof(a) AND a < 7 AND a != 6; +ERROR: DML contains a function runs on CN which is not supported +HINT: You might need to push that function down to DN. +ALTER FUNCTION leakproof(anyelement) pushdown; EXPLAIN (VERBOSE, COSTS OFF) UPDATE v1 SET a=100 WHERE snoop(a) AND leakproof(a) AND a < 7 AND a != 6; QUERY PLAN @@ -2121,6 +2169,7 @@ UPDATE v1 SET a=100 WHERE snoop(a) AND leakproof(a) AND a < 7 AND a != 6; (29 rows) UPDATE v1 SET a=100 WHERE snoop(a) AND leakproof(a) AND a < 7 AND a != 6; +ALTER FUNCTION leakproof(anyelement) not pushdown; SELECT * FROM v1 WHERE a=100; -- Nothing should have been changed to 100 a | b | c | d ---+---+---+--- @@ -2131,6 +2180,15 @@ SELECT * FROM t1 WHERE a=100; -- Nothing should have been changed to 100 ---+---+--- (0 rows) +EXPLAIN (VERBOSE, COSTS OFF) +UPDATE v1 SET a=a+1 WHERE snoop(a) AND leakproof(a) AND a = 8; +ERROR: DML contains a function runs on CN which is not supported +HINT: You might need to push that function down to DN. 
+UPDATE v1 SET a=a+1 WHERE snoop(a) AND leakproof(a) AND a = 8; +ERROR: DML contains a function runs on CN which is not supported +HINT: You might need to push that function down to DN. +ALTER FUNCTION leakproof(anyelement) pushdown; +ALTER FUNCTION snoop(anyelement) pushdown; EXPLAIN (VERBOSE, COSTS OFF) UPDATE v1 SET a=a+1 WHERE snoop(a) AND leakproof(a) AND a = 8; QUERY PLAN @@ -2168,15 +2226,17 @@ UPDATE v1 SET a=a+1 WHERE snoop(a) AND leakproof(a) AND a = 8; UPDATE v1 SET a=a+1 WHERE snoop(a) AND leakproof(a) AND a = 8; SELECT * FROM v1 WHERE b=8; - a | b | c | d ----+---+------+------ - 9 | 8 | t1 | t11d - 9 | 8 | t11 | t11d - 9 | 8 | t12 | t11d - 9 | 8 | t111 | t11d + a | b | c | d +---+---+------+------- + 9 | 8 | t1 | t111d + 9 | 8 | t11 | t111d + 9 | 8 | t12 | t111d + 9 | 8 | t111 | t111d (4 rows) DELETE FROM v1 WHERE snoop(a) AND leakproof(a); -- should not delete everything, just where a>5 +ALTER FUNCTION leakproof(anyelement) not pushdown; +ALTER FUNCTION snoop(anyelement) not pushdown; TABLE t1; -- verify all a<=5 are intact a | b | c ---+---+------ diff --git a/src/test/regress/expected/xc_remote.out b/src/test/regress/expected/xc_remote.out index 16b7075b..9e01a5e6 100644 --- a/src/test/regress/expected/xc_remote.out +++ b/src/test/regress/expected/xc_remote.out @@ -366,7 +366,7 @@ CREATE TABLE xcrem_employee (EMPNO CHAR(6) NOT NULL, FIRSTNAME VARCHAR(12) NOT N INSERT INTO xcrem_employee (EMPNO,FIRSTNAME,MIDINIT,LASTNAME,WORKDEPT,PHONENO,HIREDATE,JOB,EDLEVEL,SEX,BIRTHDATE,SALARY,BONUS,COMM) VALUES( '000180','MARILYN', 'S', 'SCOUTTEN', 'D11', '1682','1973-07-07','DESIGNER', 17, 'F', '1949-02-21', 21340.00,500,1707); create table xcrem_temptable as select * from xcrem_employee; -create or replace function volatile_func(id int) returns int as +create or replace function volatile_func(id int) returns int pushdown as $$begin return 3;end $$ language plpgsql; \set EXP 'explain (verbose true, costs false, nodes false)' \set SEL 'select empno, edlevel, lastname, salary, bonus from xcrem_employee order by empno' diff --git a/src/test/regress/sql/transactions.sql b/src/test/regress/sql/transactions.sql index 80f235e0..b9f52f16 100644 --- a/src/test/regress/sql/transactions.sql +++ b/src/test/regress/sql/transactions.sql @@ -388,6 +388,8 @@ end$$ language plpgsql volatile; create table revalidate_bug (c float8 unique); insert into revalidate_bug values (1); insert into revalidate_bug values (inverse(0)); +alter function inverse(int) pushdown; +insert into revalidate_bug values (inverse(0)); drop table revalidate_bug; drop function inverse(int); @@ -431,7 +433,7 @@ abort; -- Test for proper cleanup after a failure in a cursor portal -- that was created in an outer subtransaction -CREATE FUNCTION invert(x float8) RETURNS float8 LANGUAGE plpgsql AS +CREATE FUNCTION invert(x float8) RETURNS float8 pushdown LANGUAGE plpgsql AS $$ begin return 1/x; end $$; CREATE FUNCTION create_temp_tab() RETURNS text diff --git a/src/test/regress/sql/updatable_views.sql b/src/test/regress/sql/updatable_views.sql index 6aa951ca..f0c092c8 100644 --- a/src/test/regress/sql/updatable_views.sql +++ b/src/test/regress/sql/updatable_views.sql @@ -886,6 +886,10 @@ LANGUAGE plpgsql STRICT IMMUTABLE LEAKPROOF; SELECT * FROM rw_view1 WHERE snoop(person); UPDATE rw_view1 SET person=person WHERE snoop(person); DELETE FROM rw_view1 WHERE NOT snoop(person); +ALTER FUNCTION snoop(anyelement) pushdown; +UPDATE rw_view1 SET person=person WHERE snoop(person); +DELETE FROM rw_view1 WHERE NOT snoop(person); +ALTER FUNCTION 
snoop(anyelement) not pushdown; ALTER VIEW rw_view1 SET (security_barrier = true); @@ -905,10 +909,14 @@ SELECT table_name, column_name, is_updatable SELECT * FROM rw_view1 WHERE snoop(person); UPDATE rw_view1 SET person=person WHERE snoop(person); DELETE FROM rw_view1 WHERE NOT snoop(person); +ALTER FUNCTION snoop(anyelement) pushdown; +UPDATE rw_view1 SET person=person WHERE snoop(person); +DELETE FROM rw_view1 WHERE NOT snoop(person); EXPLAIN (costs off, nodes off) SELECT * FROM rw_view1 WHERE snoop(person); EXPLAIN (costs off, nodes off) UPDATE rw_view1 SET person=person WHERE snoop(person); EXPLAIN (costs off, nodes off) DELETE FROM rw_view1 WHERE NOT snoop(person); +ALTER FUNCTION snoop(anyelement) not pushdown; -- security barrier view on top of security barrier view @@ -931,10 +939,14 @@ SELECT table_name, column_name, is_updatable SELECT * FROM rw_view2 WHERE snoop(person); UPDATE rw_view2 SET person=person WHERE snoop(person); DELETE FROM rw_view2 WHERE NOT snoop(person); +ALTER FUNCTION snoop(anyelement) pushdown; +UPDATE rw_view2 SET person=person WHERE snoop(person); +DELETE FROM rw_view2 WHERE NOT snoop(person); EXPLAIN (costs off, nodes off) SELECT * FROM rw_view2 WHERE snoop(person); EXPLAIN (costs off, nodes off) UPDATE rw_view2 SET person=person WHERE snoop(person); EXPLAIN (costs off, nodes off) DELETE FROM rw_view2 WHERE NOT snoop(person); +ALTER FUNCTION snoop(anyelement) not pushdown; DROP TABLE base_tbl CASCADE; @@ -959,6 +971,10 @@ SELECT * FROM rw_view1; EXPLAIN (costs off, nodes off) DELETE FROM rw_view1 WHERE id = 1 AND snoop(data); DELETE FROM rw_view1 WHERE id = 1 AND snoop(data); +ALTER FUNCTION snoop(anyelement) pushdown; +EXPLAIN (costs off, nodes off) DELETE FROM rw_view1 WHERE id = 1 AND snoop(data); +DELETE FROM rw_view1 WHERE id = 1 AND snoop(data); +ALTER FUNCTION snoop(anyelement) not pushdown; EXPLAIN (costs off, nodes off) INSERT INTO rw_view1 VALUES (2, 'New row 2'); INSERT INTO rw_view1 VALUES (2, 'New row 2'); @@ -1003,10 +1019,20 @@ SELECT * FROM v1 WHERE a=8; EXPLAIN (VERBOSE, COSTS OFF) UPDATE v1 SET a=100 WHERE snoop(a) AND leakproof(a) AND a < 7 AND a != 6; UPDATE v1 SET a=100 WHERE snoop(a) AND leakproof(a) AND a < 7 AND a != 6; +ALTER FUNCTION leakproof(anyelement) pushdown; +EXPLAIN (VERBOSE, COSTS OFF) +UPDATE v1 SET a=100 WHERE snoop(a) AND leakproof(a) AND a < 7 AND a != 6; +UPDATE v1 SET a=100 WHERE snoop(a) AND leakproof(a) AND a < 7 AND a != 6; +ALTER FUNCTION leakproof(anyelement) not pushdown; SELECT * FROM v1 WHERE a=100; -- Nothing should have been changed to 100 SELECT * FROM t1 WHERE a=100; -- Nothing should have been changed to 100 +EXPLAIN (VERBOSE, COSTS OFF) +UPDATE v1 SET a=a+1 WHERE snoop(a) AND leakproof(a) AND a = 8; +UPDATE v1 SET a=a+1 WHERE snoop(a) AND leakproof(a) AND a = 8; +ALTER FUNCTION leakproof(anyelement) pushdown; +ALTER FUNCTION snoop(anyelement) pushdown; EXPLAIN (VERBOSE, COSTS OFF) UPDATE v1 SET a=a+1 WHERE snoop(a) AND leakproof(a) AND a = 8; UPDATE v1 SET a=a+1 WHERE snoop(a) AND leakproof(a) AND a = 8; @@ -1014,6 +1040,8 @@ UPDATE v1 SET a=a+1 WHERE snoop(a) AND leakproof(a) AND a = 8; SELECT * FROM v1 WHERE b=8; DELETE FROM v1 WHERE snoop(a) AND leakproof(a); -- should not delete everything, just where a>5 +ALTER FUNCTION leakproof(anyelement) not pushdown; +ALTER FUNCTION snoop(anyelement) not pushdown; TABLE t1; -- verify all a<=5 are intact diff --git a/src/test/regress/sql/xc_remote.sql b/src/test/regress/sql/xc_remote.sql index edd73ac6..39ef7ecf 100644 --- a/src/test/regress/sql/xc_remote.sql 
+++ b/src/test/regress/sql/xc_remote.sql @@ -185,7 +185,7 @@ CREATE TABLE xcrem_employee (EMPNO CHAR(6) NOT NULL, FIRSTNAME VARCHAR(12) NOT N create table xcrem_temptable as select * from xcrem_employee; -create or replace function volatile_func(id int) returns int as +create or replace function volatile_func(id int) returns int pushdown as $$begin return 3;end $$ language plpgsql; \set EXP 'explain (verbose true, costs false, nodes false)' From 86603cd8a9b781755a91add6af96b1cc4418ed96 Mon Sep 17 00:00:00 2001 From: andrelin Date: Thu, 31 Mar 2022 16:06:43 +0800 Subject: [PATCH 554/578] Consider restricted node number in cost module --- src/backend/optimizer/util/pathnode.c | 8 +++++++- src/test/regress/expected/nestloop_by_shard.out | 8 ++++---- src/test/regress/expected/xc_FQS_2.out | 12 ++++++------ 3 files changed, 17 insertions(+), 11 deletions(-) diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index 9b03a3d9..ed05ea1e 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -7414,7 +7414,13 @@ path_count_datanodes(Path *path) (path->distribution->distributionType == LOCATOR_TYPE_SHARD || path->distribution->distributionType == LOCATOR_TYPE_HASH)) { - double nodes = bms_num_members(path->distribution->nodes); + double nodes; + + nodes = bms_num_members(path->distribution->restrictNodes); + if (nodes > 0) + return nodes; + + nodes = bms_num_members(path->distribution->nodes); if (nodes > 0) return nodes; } diff --git a/src/test/regress/expected/nestloop_by_shard.out b/src/test/regress/expected/nestloop_by_shard.out index da851318..47ebc061 100644 --- a/src/test/regress/expected/nestloop_by_shard.out +++ b/src/test/regress/expected/nestloop_by_shard.out @@ -81,8 +81,8 @@ where t1.unique1 = 1; -> Nested Loop Left Join -> Remote Subquery Scan on all Distribute results by S: hundred - -> Seq Scan on tenk1_s t1 - Filter: (unique1 = 1) + -> Index Scan using unique1_s on tenk1_s t1 + Index Cond: (unique1 = 1) -> Materialize -> Remote Subquery Scan on all Distribute results by S: hundred @@ -114,8 +114,8 @@ where t1.unique1 = 1; -> Nested Loop Left Join -> Remote Subquery Scan on all Distribute results by S: hundred - -> Seq Scan on tenk1_s t1 - Filter: (unique1 = 1) + -> Index Scan using unique1_s on tenk1_s t1 + Index Cond: (unique1 = 1) -> Materialize -> Remote Subquery Scan on all Distribute results by S: hundred diff --git a/src/test/regress/expected/xc_FQS_2.out b/src/test/regress/expected/xc_FQS_2.out index 7f9570b4..1089bd60 100644 --- a/src/test/regress/expected/xc_FQS_2.out +++ b/src/test/regress/expected/xc_FQS_2.out @@ -1641,14 +1641,14 @@ select * from subquery_fqs t join (select 1 id, 'gd' a, 2 c from dual union sele explain select * from subquery_fqs t1 where t1.id = 1 and t1.c IN (select c from subquery_fqs t2 where t2.id=1); QUERY PLAN -------------------------------------------------------------------------------------------------- - Remote Subquery Scan on all (datanode_1,datanode_2) (cost=100.00..121.16 rows=1 width=40) - -> Nested Loop Semi Join (cost=100.00..121.16 rows=1 width=40) + Remote Subquery Scan on all (datanode_1,datanode_2) (cost=100.00..142.55 rows=1 width=40) + -> Nested Loop Semi Join (cost=100.00..142.55 rows=1 width=40) Join Filter: (t1.c = t2.c) - -> Seq Scan on subquery_fqs t1 (cost=0.00..10.50 rows=2 width=40) + -> Seq Scan on subquery_fqs t1 (cost=0.00..21.00 rows=4 width=40) Filter: (id = 1) - -> Materialize (cost=100.00..110.55 rows=4 width=4) - -> Remote Subquery 
Scan on all (datanode_1) (cost=100.00..110.53 rows=4 width=4) - -> Seq Scan on subquery_fqs t2 (cost=0.00..10.50 rows=2 width=4) + -> Materialize (cost=100.00..121.09 rows=8 width=4) + -> Remote Subquery Scan on all (datanode_1) (cost=100.00..121.05 rows=8 width=4) + -> Seq Scan on subquery_fqs t2 (cost=0.00..21.00 rows=4 width=4) Filter: (id = 1) (9 rows) From 2183c18380f0e2324c1368d9c1439485d71a40d2 Mon Sep 17 00:00:00 2001 From: aslanxli Date: Sun, 24 Apr 2022 17:43:58 +0800 Subject: [PATCH 555/578] fix multi-values insert error: Failing row contains (null, null). TAPD:http://tapd.woa.com/pgxz/bugtrace/bugs/view?bug_id=1010092131098492799&jump_count=1 add regress test case --- src/backend/commands/copy.c | 3 ++- src/backend/executor/spi.c | 4 ++-- src/pl/plpgsql/src/pl_exec.c | 6 ++++++ src/test/regress/expected/plpgsql_1.out | 25 +++++++++++++++++++++++++ src/test/regress/sql/plpgsql.sql | 18 ++++++++++++++++++ 5 files changed, 53 insertions(+), 3 deletions(-) diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c index 8bf02419..5600bbf2 100644 --- a/src/backend/commands/copy.c +++ b/src/backend/commands/copy.c @@ -5039,7 +5039,8 @@ CopyReadLine(CopyState cstate) cstate->line_buf.data[cstate->line_buf.len] = '\0'; break; case EOL_UNKNOWN: - /* shouldn't get here */ + /* shouldn't get here except we are transform from insert */ + if (!cstate->internal_mode) Assert(false); break; } diff --git a/src/backend/executor/spi.c b/src/backend/executor/spi.c index 510f1fcb..416f3d3a 100644 --- a/src/backend/executor/spi.c +++ b/src/backend/executor/spi.c @@ -2107,7 +2107,7 @@ _SPI_execute_plan(SPIPlanPtr plan, ParamListInfo paramLI, * TODO: now we don't support param, if multi values contains paramref, do not * transform to CopyStmt, refactor later */ - if (plansource->insert_into && plansource->raw_parse_tree != NULL && + if (g_transform_insert_to_copy && plansource->insert_into && plansource->raw_parse_tree != NULL && IsA(plansource->raw_parse_tree->stmt, InsertStmt)) { bool suc; @@ -2896,7 +2896,7 @@ static void _SPI_multi_insert_rewrite(CachedPlanSource *plansource, * set insert_into when we get multi-values insert, not * often happen */ - if (unlikely(parse->isMultiValues && !parse->hasUnshippableTriggers)) + if (unlikely(g_transform_insert_to_copy && parse->isMultiValues && !parse->hasUnshippableTriggers)) { MemoryContext old_ctx; InsertStmt *iStmt = (InsertStmt*)parsetree->stmt; diff --git a/src/pl/plpgsql/src/pl_exec.c b/src/pl/plpgsql/src/pl_exec.c index 5c232317..74eb8d76 100644 --- a/src/pl/plpgsql/src/pl_exec.c +++ b/src/pl/plpgsql/src/pl_exec.c @@ -29,6 +29,7 @@ #include "optimizer/planner.h" #include "parser/parse_coerce.h" #include "parser/scansup.h" +#include "parser/analyze.h" #include "storage/proc.h" #include "tcop/tcopprot.h" #include "utils/array.h" @@ -3689,6 +3690,11 @@ exec_stmt_execsql(PLpgSQL_execstate *estate, q->commandType == CMD_UPDATE || q->commandType == CMD_DELETE) stmt->mod_stmt = true; + + /* when transform insert to copy, reset mod_stmt */ + if (g_transform_insert_to_copy && q->commandType == CMD_INSERT && + q->isMultiValues && !q->hasUnshippableTriggers) + stmt->mod_stmt = false; /* PGXCTODO: Support a better parameter interface for XC with DMLs */ if #ifdef XCP diff --git a/src/test/regress/expected/plpgsql_1.out b/src/test/regress/expected/plpgsql_1.out index 20f49e11..a686a7e8 100644 --- a/src/test/regress/expected/plpgsql_1.out +++ b/src/test/regress/expected/plpgsql_1.out @@ -6184,3 +6184,28 @@ SELECT * FROM list_partitioned_table() AS t; 
2 (2 rows) +set transform_insert_to_copy to on; +create table multi_itb(f1 int,f2 int); +create or replace function insert_mul () returns text as +$$ +begin + insert into multi_itb values(1,1),(2,2); + return 'ok'; +end; +$$ +language plpgsql; +select insert_mul(); + insert_mul +------------ + ok +(1 row) + +select * from multi_itb order by f1; + f1 | f2 +----+---- + 1 | 1 + 2 | 2 +(2 rows) + +drop table multi_itb; +set transform_insert_to_copy to off; diff --git a/src/test/regress/sql/plpgsql.sql b/src/test/regress/sql/plpgsql.sql index a614da36..4a7c4ab3 100644 --- a/src/test/regress/sql/plpgsql.sql +++ b/src/test/regress/sql/plpgsql.sql @@ -4921,3 +4921,21 @@ BEGIN END; $$ LANGUAGE plpgsql; SELECT * FROM list_partitioned_table() AS t; + +set transform_insert_to_copy to on; + +create table multi_itb(f1 int,f2 int); + +create or replace function insert_mul () returns text as +$$ +begin + insert into multi_itb values(1,1),(2,2); + return 'ok'; +end; +$$ +language plpgsql; + +select insert_mul(); +select * from multi_itb order by f1; +drop table multi_itb; +set transform_insert_to_copy to off; From 5533f0ae1d02d73bc88718fdf0f16f787aa5c964 Mon Sep 17 00:00:00 2001 From: aslanxli Date: Sun, 24 Apr 2022 20:30:29 +0800 Subject: [PATCH 556/578] fix multi-values insert error: Failing row contains (null, null). TAPD:http://tapd.woa.com/pgxz/bugtrace/bugs/view?bug_id=1010092131098492799&jump_count=1 reformat code --- src/backend/nodes/copyfuncs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c index e2e6b7fc..3cc64b1d 100644 --- a/src/backend/nodes/copyfuncs.c +++ b/src/backend/nodes/copyfuncs.c @@ -3320,7 +3320,8 @@ _copyInsertStmt(const InsertStmt *from) { newnode->data_list = (char ***)palloc(sizeof(char **) * from->ndatarows); - for (rowIdx = 0; rowIdx < from->ndatarows; rowIdx++) { + for (rowIdx = 0; rowIdx < from->ndatarows; rowIdx++) + { newnode->data_list[rowIdx] = (char **)palloc(sizeof(char *) * from->ninsert_columns); for (colIdx = 0; colIdx < from->ninsert_columns; colIdx++) From 54b0b4741989bf0758bcdec1e80e961c22eef46f Mon Sep 17 00:00:00 2001 From: andrelin Date: Mon, 25 Apr 2022 11:46:47 +0800 Subject: [PATCH 557/578] fix 2 warnings --- src/backend/nodes/readfuncs.c | 3 ++- src/backend/parser/parse_func.c | 1 - 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/backend/nodes/readfuncs.c b/src/backend/nodes/readfuncs.c index 28cd7dbd..2730b6f3 100644 --- a/src/backend/nodes/readfuncs.c +++ b/src/backend/nodes/readfuncs.c @@ -573,7 +573,8 @@ _readQuery(void) READ_BOOL_FIELD(hasForUpdate); READ_BOOL_FIELD(hasRowSecurity); token = pg_strtok(&length); /* get :fldname hasRowSecurity or cteList */ - if (strncmp(nullable_string(token, length), ":hasCoordFuncs", length) == 0) + Assert(length != 0); + if (strncmp(debackslash(token, length), ":hasCoordFuncs", length) == 0) { token = pg_strtok(&length); /* get field value */ local_node->hasCoordFuncs = strtobool(token); diff --git a/src/backend/parser/parse_func.c b/src/backend/parser/parse_func.c index ac9fc9c0..b5d44478 100644 --- a/src/backend/parser/parse_func.c +++ b/src/backend/parser/parse_func.c @@ -2265,7 +2265,6 @@ check_srf_call_placement(ParseState *pstate, Node *last_srf, int location) bool func_is_pullup(Oid func_id) { - char *name = NULL; if (func_id >= FirstNormalObjectId) { Oid func_lang_oid; From ca57eab6c2e1468a741bf0c812e1e4a7766f235c Mon Sep 17 00:00:00 2001 From: aslanxli Date: Mon, 25 Apr 2022 12:00:28 +0800 Subject: [PATCH 
558/578] fix multi-values insert error: Failing row contains (null, null). TAPD:http://tapd.woa.com/pgxz/bugtrace/bugs/view?bug_id=1010092131098492799&jump_count=1 Check NULL in data_list, when copy it from one InsertStmt to another. And add test case for this scenario. --- src/backend/executor/spi.c | 5 +++++ src/backend/nodes/copyfuncs.c | 5 +++++ src/test/regress/expected/plpgsql_1.out | 22 ++++++++++++++++++++++ src/test/regress/sql/plpgsql.sql | 12 ++++++++++++ 4 files changed, 44 insertions(+) diff --git a/src/backend/executor/spi.c b/src/backend/executor/spi.c index 416f3d3a..dbad0299 100644 --- a/src/backend/executor/spi.c +++ b/src/backend/executor/spi.c @@ -2914,9 +2914,14 @@ static void _SPI_multi_insert_rewrite(CachedPlanSource *plansource, pStmt->data_list[rowIdx] = (char **)palloc( sizeof(char *) * iStmt->ninsert_columns); for (colIdx = 0; colIdx < iStmt->ninsert_columns; colIdx++) + { + if (iStmt->data_list[rowIdx][colIdx] == NULL) + pStmt->data_list[rowIdx][colIdx] = NULL; + else pStmt->data_list[rowIdx][colIdx] = pstrdup(iStmt->data_list[rowIdx][colIdx]); } } + } pStmt->ndatarows = iStmt->ndatarows; pStmt->ninsert_columns = iStmt->ninsert_columns; MemoryContextSwitchTo(old_ctx); diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c index 3cc64b1d..475cb4ee 100644 --- a/src/backend/nodes/copyfuncs.c +++ b/src/backend/nodes/copyfuncs.c @@ -3325,10 +3325,15 @@ _copyInsertStmt(const InsertStmt *from) newnode->data_list[rowIdx] = (char **)palloc(sizeof(char *) * from->ninsert_columns); for (colIdx = 0; colIdx < from->ninsert_columns; colIdx++) + { + if(from->data_list[rowIdx][colIdx] == NULL) + newnode->data_list[rowIdx][colIdx] = NULL; + else newnode->data_list[rowIdx][colIdx] = pstrdup(from->data_list[rowIdx][colIdx]); } } + } COPY_SCALAR_FIELD(ndatarows); #endif return newnode; diff --git a/src/test/regress/expected/plpgsql_1.out b/src/test/regress/expected/plpgsql_1.out index a686a7e8..3ae9212f 100644 --- a/src/test/regress/expected/plpgsql_1.out +++ b/src/test/regress/expected/plpgsql_1.out @@ -6194,6 +6194,14 @@ begin end; $$ language plpgsql; +create or replace function insert_mul_null () returns text as +$$ +begin + insert into multi_itb values(1,null),(2,null); + return 'ok'; +end; +$$ +language plpgsql; select insert_mul(); insert_mul ------------ @@ -6207,5 +6215,19 @@ select * from multi_itb order by f1; 2 | 2 (2 rows) +truncate multi_itb; +select insert_mul_null(); + insert_mul_null +----------------- + ok +(1 row) + +select * from multi_itb order by f1; + f1 | f2 +----+---- + 1 | + 2 | +(2 rows) + drop table multi_itb; set transform_insert_to_copy to off; diff --git a/src/test/regress/sql/plpgsql.sql b/src/test/regress/sql/plpgsql.sql index 4a7c4ab3..3f0a9757 100644 --- a/src/test/regress/sql/plpgsql.sql +++ b/src/test/regress/sql/plpgsql.sql @@ -4935,7 +4935,19 @@ end; $$ language plpgsql; +create or replace function insert_mul_null () returns text as +$$ +begin + insert into multi_itb values(1,null),(2,null); + return 'ok'; +end; +$$ +language plpgsql; + select insert_mul(); select * from multi_itb order by f1; +truncate multi_itb; +select insert_mul_null(); +select * from multi_itb order by f1; drop table multi_itb; set transform_insert_to_copy to off; From 397be63b070ca7b849f18b3024f072dc8005e0d6 Mon Sep 17 00:00:00 2001 From: andrelin Date: Mon, 25 Apr 2022 12:49:24 +0800 Subject: [PATCH 559/578] Fix bug of setting pstate->p_hasCoordFuncs and sync regress --- src/backend/parser/parse_func.c | 2 +- src/test/regress/expected/pl_bugs.out | 2 
+- src/test/regress/expected/polymorphism.out | 5 +++++ src/test/regress/sql/pl_bugs.sql | 2 +- src/test/regress/sql/updatable_views.sql | 2 ++ 5 files changed, 10 insertions(+), 3 deletions(-) diff --git a/src/backend/parser/parse_func.c b/src/backend/parser/parse_func.c index b5d44478..dd7e8a00 100644 --- a/src/backend/parser/parse_func.c +++ b/src/backend/parser/parse_func.c @@ -255,7 +255,7 @@ ParseFuncOrColumn(ParseState *pstate, List *funcname, List *fargs, cancel_parser_errposition_callback(&pcbstate); - pstate->p_hasCoordFuncs = func_is_pullup(funcid); + pstate->p_hasCoordFuncs = pstate->p_hasCoordFuncs ? true : func_is_pullup(funcid); if (fdresult == FUNCDETAIL_COERCION) { diff --git a/src/test/regress/expected/pl_bugs.out b/src/test/regress/expected/pl_bugs.out index 0930dd68..dea24fcd 100644 --- a/src/test/regress/expected/pl_bugs.out +++ b/src/test/regress/expected/pl_bugs.out @@ -4,7 +4,7 @@ set enable_oracle_compatible to on; -- -- Name: func_getlastnetvalue(varchar2, date); Type: FUNCTION; Schema: sync; Owner: gregsun -- -CREATE FUNCTION func_getlastnetvalue(v_fundcode varchar2, v_cdate date) RETURNS numeric +CREATE FUNCTION func_getlastnetvalue(v_fundcode varchar2, v_cdate date) RETURNS numeric pushdown LANGUAGE plpgsql AS $$ declare v_netvalue text; diff --git a/src/test/regress/expected/polymorphism.out b/src/test/regress/expected/polymorphism.out index b37872fa..d3a86fb7 100644 --- a/src/test/regress/expected/polymorphism.out +++ b/src/test/regress/expected/polymorphism.out @@ -554,6 +554,11 @@ select case when $1 then $2 else $3 end $$ language sql; -- Note this would fail with integer overflow, never mind wrong bleat() output, -- if the CASE expression were not successfully inlined select f1, sql_if(f1 > 0, bleat(f1), bleat(f1 + 1)) from (select * from int4_tbl order by f1) q order by 1, 2; +NOTICE: bleat -2147483646 +NOTICE: bleat -123455 +NOTICE: bleat 1 +NOTICE: bleat 123456 +NOTICE: bleat 2147483647 f1 | sql_if -------------+------------- -2147483647 | -2147483646 diff --git a/src/test/regress/sql/pl_bugs.sql b/src/test/regress/sql/pl_bugs.sql index 0059dc90..0ac392e0 100644 --- a/src/test/regress/sql/pl_bugs.sql +++ b/src/test/regress/sql/pl_bugs.sql @@ -7,7 +7,7 @@ set enable_oracle_compatible to on; -- Name: func_getlastnetvalue(varchar2, date); Type: FUNCTION; Schema: sync; Owner: gregsun -- -CREATE FUNCTION func_getlastnetvalue(v_fundcode varchar2, v_cdate date) RETURNS numeric +CREATE FUNCTION func_getlastnetvalue(v_fundcode varchar2, v_cdate date) RETURNS numeric pushdown LANGUAGE plpgsql AS $$ declare v_netvalue text; diff --git a/src/test/regress/sql/updatable_views.sql b/src/test/regress/sql/updatable_views.sql index f0c092c8..6c984268 100644 --- a/src/test/regress/sql/updatable_views.sql +++ b/src/test/regress/sql/updatable_views.sql @@ -1020,10 +1020,12 @@ EXPLAIN (VERBOSE, COSTS OFF) UPDATE v1 SET a=100 WHERE snoop(a) AND leakproof(a) AND a < 7 AND a != 6; UPDATE v1 SET a=100 WHERE snoop(a) AND leakproof(a) AND a < 7 AND a != 6; ALTER FUNCTION leakproof(anyelement) pushdown; +ALTER FUNCTION snoop(anyelement) pushdown; EXPLAIN (VERBOSE, COSTS OFF) UPDATE v1 SET a=100 WHERE snoop(a) AND leakproof(a) AND a < 7 AND a != 6; UPDATE v1 SET a=100 WHERE snoop(a) AND leakproof(a) AND a < 7 AND a != 6; ALTER FUNCTION leakproof(anyelement) not pushdown; +ALTER FUNCTION snoop(anyelement) not pushdown; SELECT * FROM v1 WHERE a=100; -- Nothing should have been changed to 100 SELECT * FROM t1 WHERE a=100; -- Nothing should have been changed to 100 From 
fb04c62519acf4e42e7e8075d4d0d8397ab156f6 Mon Sep 17 00:00:00 2001 From: whalesong Date: Tue, 26 Apr 2022 10:14:03 +0800 Subject: [PATCH 560/578] Revert "support wal sender proxy on cn (merge request 1183), http://tapd.woa.com/20421696/prong/stories/view/1020421696872688189" This reverts commit 39bf77a82a18fabcb6a5e3f4911c06ea4d9b3559. --- src/backend/access/common/printtup.c | 2 +- src/backend/pgxc/pool/execRemote.c | 323 +-------------------------- src/backend/pgxc/pool/pgxcnode.c | 66 ------ src/backend/postmaster/pgstat.c | 8 - src/backend/postmaster/postmaster.c | 39 ---- src/backend/replication/walsender.c | 3 - src/backend/tcop/postgres.c | 278 ----------------------- src/backend/utils/misc/guc.c | 8 - src/backend/utils/misc/ps_status.c | 23 -- src/include/pgstat.h | 1 - src/include/pgxc/execRemote.h | 3 - src/include/pgxc/pgxc.h | 2 - src/include/pgxc/pgxcnode.h | 8 - src/include/postgres.h | 5 - src/include/replication/walsender.h | 7 - src/include/utils/ps_status.h | 2 - 16 files changed, 2 insertions(+), 776 deletions(-) diff --git a/src/backend/access/common/printtup.c b/src/backend/access/common/printtup.c index 3c12980a..fa66df73 100644 --- a/src/backend/access/common/printtup.c +++ b/src/backend/access/common/printtup.c @@ -228,7 +228,7 @@ SendRowDescriptionMessage(TupleDesc typeinfo, List *targetlist, int16 *formats) * Send the type name from a Postgres-XC backend node. * This preserves from OID inconsistencies as architecture is shared nothing. */ - if (IsConnFromCoord() && !IsConnFromProxy()) + if (IsConnFromCoord()) { char *typename; typename = get_typenamespace_typename(atttypid); diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index c3ee221a..c37ac46e 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -30,7 +30,6 @@ #include "executor/executor.h" #include "gtm/gtm_c.h" #include "libpq/libpq.h" -#include "libpq/pqformat.h" #include "miscadmin.h" #include "pgxc/execRemote.h" #include "tcop/tcopprot.h" @@ -157,7 +156,6 @@ static void pgxc_connections_cleanup(ResponseCombiner *combiner); static bool determine_param_types(Plan *plan, struct find_params_context *context); -static int handle_reply_msg_on_proxy(PGXCNodeHandle *conn); #define REMOVE_CURR_CONN(combiner) \ if ((combiner)->current_conn < --((combiner)->conn_count)) \ @@ -3028,17 +3026,7 @@ pgxc_node_receive_responses(const int conn_count, PGXCNodeHandle ** connections, while (i < count) { int32 nbytes = 0; - int result = 0; - - if (am_proxy_for_dn) - { - result = handle_response_on_proxy(to_receive[i], combiner); - } - else - { - result = handle_response(to_receive[i], combiner); - } - + int result = handle_response(to_receive[i], combiner); #ifdef __TBASE__ #ifdef _PG_REGRESS_ elog(LOG, "Received response %d on connection to node %s", @@ -13105,313 +13093,4 @@ SetSnapshot(EState *state) return result; } - -/* - * Reveive dn message on proxy. - * Forward the dn message to client and forward the client reply message to dn. 
- */ -int pgxc_node_receive_on_proxy(PGXCNodeHandle *handle) -{ - int result = 0; - ResponseCombiner combiner; - - struct timeval timeout; - timeout.tv_sec = 1; - timeout.tv_usec = 0; - - MemSet(&combiner, 0, sizeof(ResponseCombiner)); - - InitResponseCombiner(&combiner, 1, COMBINE_TYPE_NONE); - - /* Receive responses */ - result = pgxc_node_receive_responses(1, &handle, &timeout, &combiner); - if (result != 0) - { - elog(LOG, "Proxy receive responses result is %d", result); - return result; - } - - CloseCombiner(&combiner); - return result; -} - -/* - * Handle reply message on proxy. - * Forward the client reply message to dn. - */ -int handle_reply_msg_on_proxy(PGXCNodeHandle *conn) -{ - int ret = 0; - unsigned char firstchar; - StringInfoData msg; - - Assert(IS_PGXC_COORDINATOR); - - initStringInfo(&msg); - - for (;;) - { - pq_startmsgread(); - ret = pq_getbyte_if_available(&firstchar); - if (ret < 0) - { - /* Unexpected error or EOF */ - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("unexpected EOF on proxy for %s", proxy_for_dn))); - } - - if (ret == 0) - { - /* No data available without blocking */ - pq_endmsgread(); - break; - } - - /* Read the message contents */ - if (pq_getmessage(&msg, 0)) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("unexpected EOF on proxy for %s", proxy_for_dn))); - } - - elog(DEBUG2, "%s proxy firstchar is %c(%d), reply message length: %d", - proxy_for_dn, firstchar, firstchar, msg.len); - - ret = pgxc_node_send_on_proxy(conn, firstchar, &msg); - if (ret != 0) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("%s proxy send reply message error: %d", - proxy_for_dn, ret))); - } - - /* Handle the very limited subset of commands expected in this phase */ - switch (firstchar) - { - /* - * 'd' means a client reply message. - */ - case 'd': - break; - - /* - * 'c' means the client requested to finish streaming. - */ - case 'c': - elog(LOG, "%s proxy: reply message type %c(%d), " - "the client requested to finish streaming", - proxy_for_dn, firstchar, firstchar); - - /* When replicate stream is closed, set stream_closed to true */ - conn->stream_closed = true; - - break; - - /* - * 'X' means the client is closing down the socket. - */ - case 'X': - elog(LOG, "%s proxy: reply message type %c(%d), " - "the client is closing down the socket", - proxy_for_dn, firstchar, firstchar); - - proc_exit(0); - - default: - elog(FATAL, "%s proxy: unexpected message type %c(%d), length: %d", - proxy_for_dn, firstchar, firstchar, msg.len); - break; - } - } - - return ret; -} - -/* - * Read next message from the connection and update - * connection state accordingly on the proxy - * If we are in an error state we just consume the messages, and do not proxy - * Long term, we should look into cancelling executing statements - * and closing the connections. - * It returns if states need to be handled - * Return values: - * RESPONSE_EOF - need to receive more data for the connection - * RESPONSE_READY - got ReadyForQuery - * RESPONSE_COMPLETE - done with the connection, but not yet ready for query. 
- * Also this result is output in case of error - * RESPONSE_TUPLEDESC - got tuple description - * RESPONSE_DATAROW - got data row - */ -int handle_response_on_proxy(PGXCNodeHandle *conn, ResponseCombiner *combiner) -{ - char *msg; - int msg_len; - char msg_type; - int ret = 0; - StringInfoData buf; - - /* proxy must be cn */ - Assert(IS_PGXC_COORDINATOR); - - /* proxy must be not in extended query */ - Assert(!conn->in_extended_query); - Assert(!combiner->extended_query); - - for (;;) - { - /* - * If we are in the process of shutting down, we - * may be rolling back, and the buffer may contain other messages. - * We want to avoid a procarray exception - * as well as an error stack overflow. - */ - if (proc_exit_inprogress) - { - PGXCNodeSetConnectionState(conn, DN_CONNECTION_STATE_ERROR_FATAL); - } - - /* - * Don't read from from the connection if there is a fatal error. - * We still return RESPONSE_COMPLETE, not RESPONSE_ERROR, since - * Handling of RESPONSE_ERROR assumes sending SYNC message, but - * State DN_CONNECTION_STATE_ERROR_FATAL indicates connection is - * not usable. - */ - if (conn->state == DN_CONNECTION_STATE_ERROR_FATAL) - { - return RESPONSE_COMPLETE; - } - - ret = handle_reply_msg_on_proxy(conn); - if (ret != 0) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Handle reply message on proxy for %s error: %d", - proxy_for_dn, ret))); - } - - /* No data available, exit */ - if (!HAS_MESSAGE_BUFFERED(conn)) - return RESPONSE_EOF; - - Assert(conn->combiner == combiner || conn->combiner == NULL); - - msg_type = get_message(conn, &msg_len, &msg); - elog(DEBUG1, "handle_response_on_proxy - received message %c, node %s, " - "current_state %d", msg_type, conn->nodename, conn->state); - - /* - * Add some protection code when receiving a messy message, - * close the connection, and throw error - */ - if (msg_len < 0) - { - PGXCNodeSetConnectionState(conn, DN_CONNECTION_STATE_ERROR_FATAL); - - elog(LOG, "handle_response_on_proxy, fatal_conn=%p, " - "fatal_conn->nodename=%s, fatal_conn->sock=%d, " - "fatal_conn->read_only=%d, fatal_conn->transaction_status=%c, " - "fatal_conn->sock_fatal_occurred=%d, conn->backend_pid=%d, " - "fatal_conn->error=%s", conn, conn->nodename, conn->sock, - conn->read_only, conn->transaction_status, - conn->sock_fatal_occurred, conn->backend_pid, conn->error); - - closesocket(conn->sock); - conn->sock = NO_SOCKET; - conn->sock_fatal_occurred = true; - - elog(LOG, "Received messy message from node:%s host:%s port:%d pid:%d, " - "inBuffer:%p inSize:%lu inStart:%lu inEnd:%lu inCursor:%lu " - "msg_len:%d, This probably means the remote node terminated " - "abnormally before or while processing the request.", - conn->nodename, conn->nodehost, conn->nodeport, conn->backend_pid, - conn->inBuffer, conn->inSize, conn->inStart, conn->inEnd, - conn->inCursor, msg_len); - - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Proxy: handle_response_on_proxy - received message " - "length %d, type %c, node %s, current_state %d", - msg_len, msg_type, conn->nodename, conn->state))); - } - - if (msg_type == '\0') - { - /* Not enough data in the buffer */ - return RESPONSE_EOF; - } - - if (conn->stream_closed && msg_type == 'd') - { - /* When replicate stream is closed, skip 'd' message */ - elog(DEBUG1, "Proxy: handle_response_on_proxy - received message " - "type %c, length %d, node %s, current_state %d, remote pid %d, skip", - msg_type, msg_len, conn->nodename, conn->state, conn->backend_pid); - continue;; - } - - conn->last_command = msg_type; 
- - elog(DEBUG1, "Proxy: handle_response_on_proxy - received message " - "type %c, length %d, node %s, current_state %d, remote pid %d", - msg_type, msg_len, conn->nodename, conn->state, conn->backend_pid); - - /* Send message to client */ - pq_beginmessage(&buf, msg_type); - pq_sendbytes(&buf, msg, msg_len); - pq_endmessage(&buf); - pq_flush(); - - switch (msg_type) - { - case 'c': /* CopyToCommandComplete */ - break; - - case 'C': /* CommandComplete */ - conn->combiner = NULL; - PGXCNodeSetConnectionState(conn, DN_CONNECTION_STATE_IDLE); - return RESPONSE_COMPLETE; - - case 'E': /* ErrorResponse */ - HandleError(combiner, msg, msg_len, conn); - add_error_message_from_combiner(conn, combiner); - - combiner->errorNode = conn->nodename; - combiner->backend_pid = conn->backend_pid; - return RESPONSE_ERROR; - - case 'Z': /* ReadyForQuery */ - conn->transaction_status = msg[0]; - PGXCNodeSetConnectionState(conn, DN_CONNECTION_STATE_IDLE); - conn->combiner = NULL; - return RESPONSE_READY; - - case 'T': /* RowDescription */ - return RESPONSE_TUPDESC; - - case 'D': /* DataRow */ - return RESPONSE_DATAROW; - - case 'd': /* CopyOutDataRow */ - PGXCNodeSetConnectionState(conn, DN_CONNECTION_STATE_COPY_OUT); - break; - - case 'W': /* CopyBothResponse */ - /* Get a CopyBothResponse message when start streaming */ - break; - - default: - elog(DEBUG1, "Proxy received message type: %c", msg_type); - break; - } - } - - /* Never happen, but keep compiler quiet */ - return RESPONSE_EOF; -} - #endif diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c index 84259600..c19325a9 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -2595,72 +2595,6 @@ pgxc_node_send_apply(PGXCNodeHandle * handle, char * buf, int len, bool ignore_p } #endif -/* - * Send message to dn - */ -int -pgxc_node_send_on_proxy(PGXCNodeHandle *handle, int firstchar, StringInfo inBuf) -{ - /* size + len */ - int msgLen = 4 + inBuf->len; - - /* msgType + msgLen */ - if (ensure_out_buffer_capacity(handle->outEnd + 1 + msgLen, handle) != 0) - { - add_error_message(handle, "out of memory"); - return EOF; - } - - /* msg type */ - handle->outBuffer[handle->outEnd++] = firstchar; - - /* size */ - msgLen = htonl(msgLen); - memcpy(handle->outBuffer + handle->outEnd, &msgLen, 4); - handle->outEnd += 4; - - /* msg data */ - memcpy(handle->outBuffer + handle->outEnd, inBuf->data, inBuf->len); - handle->outEnd += inBuf->len; - - PGXCNodeSetConnectionState(handle, DN_CONNECTION_STATE_QUERY); - handle->in_extended_query = false; - - return pgxc_node_flush(handle); -} - -/* - * Send proxy configuration to dn - */ -int -pgxc_node_send_proxy_flag(PGXCNodeHandle *handle, int flag) -{ - /* size + flag */ - int msgLen = 4 + sizeof(int); - - /* msgType + msgLen */ - if (ensure_out_buffer_capacity(handle->outEnd + 1 + msgLen, handle) != 0) - { - add_error_message(handle, "out of memory"); - return EOF; - } - - /* msg type */ - handle->outBuffer[handle->outEnd++] = 'w'; - - /* size */ - msgLen = htonl(msgLen); - memcpy(handle->outBuffer + handle->outEnd, &msgLen, 4); - handle->outEnd += 4; - - /* flag */ - flag = htonl(flag); - memcpy(handle->outBuffer + handle->outEnd, &flag, sizeof(int)); - handle->outEnd += sizeof(int); - - return pgxc_node_flush(handle); -} - /* * Send series of Extended Query protocol messages to the data node */ diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c index cfeea974..1286cd1d 100644 --- a/src/backend/postmaster/pgstat.c +++ 
b/src/backend/postmaster/pgstat.c @@ -2913,11 +2913,6 @@ pgstat_bestart(void) /* Clean 2pc Worker */ beentry->st_backendType = B_CLEAN_2PC_WORKER; } - else if (am_proxy_for_dn) - { - /* Proxy for dn */ - beentry->st_backendType = B_PROXY_FOR_DN; - } else if (am_walsender) { /* Wal sender */ @@ -4213,9 +4208,6 @@ pgstat_get_backend_desc(BackendType backendType) case B_CLEAN_2PC_WORKER: backendDesc = "2pc clean worker"; break; - case B_PROXY_FOR_DN: - backendDesc = "proxy for dn"; - break; } return backendDesc; diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index 7d6d230b..10be77cd 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -162,8 +162,6 @@ #include "audit/audit_fga.h" #endif -#define PS_DISPLAY_MAX_LENGTH 256 /* process display max length */ - /* * Possible types of a backend. Beyond being the possible bkend_type values in * struct bkend, these are OR-able request flag bits for SignalSomeChildren() @@ -2389,20 +2387,6 @@ ProcessStartupPacket(Port *port, bool SSLdone) valptr), errhint("Valid values are: \"false\", 0, \"true\", 1, \"database\"."))); } - else if (strcmp(nameptr, "proxy_for_dn") == 0) - { - if (!IS_PGXC_COORDINATOR) - { - ereport(FATAL, - (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("parameter \"%s\" only support on cn", nameptr))); - } - - elog(LOG, "Proxy for dn: %s", valptr); - - am_proxy_for_dn = true; - proxy_for_dn = pstrdup(valptr); - } else { /* Assume it's a generic GUC option */ @@ -4956,35 +4940,12 @@ BackendInitialize(Port *port) * as dbname to init_ps_display(). XXX: should add a new variant of * init_ps_display() to avoid abusing the parameters like this. */ - if (am_proxy_for_dn) - { - char proxy_display[PS_DISPLAY_MAX_LENGTH]; if (am_walsender) - { - snprintf(proxy_display, PS_DISPLAY_MAX_LENGTH, - "wal sender proxy for %s", proxy_for_dn); - } - else - { - snprintf(proxy_display, PS_DISPLAY_MAX_LENGTH, - "proxy for %s", proxy_for_dn); - } - init_ps_display(proxy_display, port->user_name, remote_ps_data, - update_process_title ? "authentication" : ""); - } - else - { - if (am_walsender) - { init_ps_display("wal sender process", port->user_name, remote_ps_data, update_process_title ? "authentication" : ""); - } else - { init_ps_display(port->user_name, port->database_name, remote_ps_data, update_process_title ? "authentication" : ""); - } - } /* * Disable the timeout, and prevent SIGTERM/SIGQUIT again. diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c index 464cfcd9..4b46d9c8 100644 --- a/src/backend/replication/walsender.c +++ b/src/backend/replication/walsender.c @@ -3327,10 +3327,7 @@ WalSndSignals(void) pqsignal(SIGINT, StatementCancelHandler); /* query cancel */ pqsignal(SIGTERM, die); /* request shutdown */ pqsignal(SIGQUIT, quickdie); /* hard crash time */ - if (!IsConnFromProxy()) - { InitializeTimeouts(); /* establishes SIGALRM handler */ - } pqsignal(SIGPIPE, SIG_IGN); pqsignal(SIGUSR1, procsignal_sigusr1_handler); pqsignal(SIGUSR2, WalSndLastCycleHandler); /* request a last cycle and diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index b075a7e8..abab8378 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -124,9 +124,6 @@ #include "replication/worker_internal.h" #endif -char *proxy_for_dn = NULL; /* Proxy for which dn? */ -bool am_proxy_for_dn = false; /* Am I a proxy for dn? */ -bool am_conn_from_proxy = false; /* Am I connected from proxy? 
*/ extern int optind; @@ -254,13 +251,6 @@ static void replace_null_with_blank(char *src, int length); static bool NeedResourceOwner(const char *stmt_name); #endif -static PGXCNodeHandle * -get_handle_on_proxy(void); -static PGXCNodeHandle * -handle_request_msg_on_proxy(PGXCNodeHandle *conn, int firstchar, StringInfo input_msg); -void -set_flag_from_proxy(int flag, const char *username); - #ifdef __COLD_HOT__ /* * Release memory alloc in TopMemoryContext and only used in single Session. @@ -665,7 +655,6 @@ SocketBackend(StringInfo inBuf) (errcode(ERRCODE_PROTOCOL_VIOLATION), errmsg("invalid frontend message type %d", qtype))); break; - case 'w': /* Set connected by proxy */ #ifdef PGXC /* PGXC_DATANODE */ #ifdef __TBASE__ case 'N': @@ -4814,8 +4803,6 @@ PostgresMain(int argc, char *argv[], volatile bool need_report_activity = false; bool disable_idle_in_transaction_timeout = false; - PGXCNodeHandle *proxy_conn = NULL; - #ifdef PGXC /* PGXC_DATANODE */ /* Snapshot info */ TransactionId xmin PG_USED_FOR_ASSERTS_ONLY; @@ -5527,12 +5514,6 @@ PostgresMain(int argc, char *argv[], } #endif /* XCP */ - if (am_proxy_for_dn) - { - proxy_conn = handle_request_msg_on_proxy(proxy_conn, firstchar, &input_message); - continue; - } - switch (firstchar) { case 'Q': /* simple query */ @@ -6147,18 +6128,6 @@ PostgresMain(int argc, char *argv[], } break; #endif - case 'w': /* Set connected by proxy */ - { - int flag = 0; - - Assert(input_message.len == 4); - - flag = pq_getmsgint(&input_message, 4); - pq_getmsgend(&input_message); - - set_flag_from_proxy(flag, username); - } - break; default: ereport(FATAL, (errcode(ERRCODE_PROTOCOL_VIOLATION), @@ -6435,251 +6404,4 @@ IsExtendedQuery(void) { return doing_extended_query_message; } - -/* - * Get a dn connection on proxy - */ -PGXCNodeHandle * -get_handle_on_proxy(void) -{ - PGXCNodeHandle *conn = NULL; - char node_type = PGXC_NODE_DATANODE; - Oid node_oid = InvalidOid; - int node_id = -1; - int flag = 0; - PGXCNodeAllHandles *handles = NULL; - List *dnList = NIL; - int ret = 0; - - Assert(IS_PGXC_COORDINATOR); - - /* Get dn oid */ - StartTransactionCommand(); - InitMultinodeExecutor(false); - node_oid = get_pgxc_nodeoid(proxy_for_dn); - CommitTransactionCommand(); - - if (node_oid == InvalidOid) - { - ereport(FATAL, - (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("Unknow dn: %s, oid is invalid", proxy_for_dn))); - } - - /* Get dn id */ - node_id = PGXCNodeGetNodeId(node_oid, &node_type); - if (node_id == -1) - { - ereport(FATAL, - (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("Unknow dn: %s, oid: %d, id: -1", proxy_for_dn, node_oid))); - } - - elog(LOG, "Proxy for dn %s, node oid %d, node id %d", - proxy_for_dn, node_oid, node_id); - - /* Get dn connection */ - dnList = lappend_int(dnList, node_id); - Assert(list_length(dnList) == 1); - handles = get_handles(dnList, NIL, false, false, true); - if (handles == NULL) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Get connections failed for %s", proxy_for_dn))); - - } - if (handles->dn_conn_count == 0) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Get 0 connection for %s", proxy_for_dn))); - } - - Assert(handles->co_conn_count == 0); - Assert(handles->dn_conn_count == 1); - - conn = handles->datanode_handles[0]; - Assert(conn != NULL); - - pfree_pgxc_all_handles(handles); - handles = NULL; - - /* Set dn process */ - if (am_walsender) - { - flag |= FLAG_AM_WALSENDER; - if (am_db_walsender) - { - flag |= FLAG_AM_DB_WALSENDER; - } - } - ret = 
pgxc_node_send_proxy_flag(conn, flag); - if (ret != 0) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Proxy send flag to %s error: %d", proxy_for_dn, ret))); - } - - return conn; -} - -/* - * Forward client request command to dn and receive response - */ -PGXCNodeHandle * -handle_request_msg_on_proxy(PGXCNodeHandle *conn, int firstchar, StringInfo input_msg) -{ - int ret = 0; - - Assert(IS_PGXC_COORDINATOR); - - if (conn == NULL) - { - conn = get_handle_on_proxy(); - } - - Assert(conn != NULL); - - /* Before query, replicate stream is not closed, set stream_closed to false */ - conn->stream_closed = false; - - if (firstchar == 'Q') - { - const char *query_string = pq_getmsgstring(input_msg); - pq_getmsgend(input_msg); - debug_query_string = query_string; - } - - elog(DEBUG1, "Proxy: firstchar is %c(%d)", firstchar, firstchar); - - /* Send message */ - ret = pgxc_node_send_on_proxy(conn, firstchar, input_msg); - if (ret != 0) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Proxy send request to %s error: %d", proxy_for_dn, ret))); - } - - switch (firstchar) - { - /* - * 'X' means that the frontend is closing down the socket. EOF - * means unexpected loss of frontend connection. Either way, - * perform normal shutdown. - */ - case 'X': - case EOF: - /* - * Reset whereToSendOutput to prevent ereport from attempting - * to send any more messages to client. - */ - if (whereToSendOutput == DestRemote) - { - elog(LOG, "Set whereToSendOutput from %d to %d", - whereToSendOutput, DestNone); - whereToSendOutput = DestNone; - } - - /* Destroy the dn connection on proxy */ - PoolManagerDisconnect(); - - /* - * NOTE: if you are tempted to add more code here, DON'T! - * Whatever you had in mind to do should be set up as an - * on_proc_exit or on_shmem_exit callback, instead. Otherwise - * it will fail to be called during other backend-shutdown - * scenarios. 
- */ - proc_exit(0); - - default: - break; - } - - /* Receive message */ - ret = pgxc_node_receive_on_proxy(conn); - if (ret != 0) - { - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Proxy receive from %s error: %d", proxy_for_dn, ret))); - } - - debug_query_string = NULL; - - return conn; -} - -/* - * Set flag from proxy - */ -void -set_flag_from_proxy(int flag, const char *username) -{ - if (am_conn_from_proxy) - { - ereport(ERROR, - (errcode(ERRCODE_CONNECTION_EXCEPTION), - errmsg("It is connected from proxy already"))); - } - - am_conn_from_proxy = true; - - elog(LOG, "It is connected from proxy"); - - if (am_walsender) - { - ereport(ERROR, - (errcode(ERRCODE_CONNECTION_EXCEPTION), - errmsg("It is a wal sender already"))); - } - - if (flag & FLAG_AM_WALSENDER) - { - am_walsender = true; - if (flag & FLAG_AM_DB_WALSENDER) - { - am_db_walsender = true; - } - } - - elog(LOG, "Set wal sender: am_walsender(%d), am_db_walsender(%d)", - am_walsender, am_db_walsender); - - if (am_walsender) - { - int fixed_len = 0; - const char *fixed = get_ps_display_fixed(&fixed_len); - char fixed_buf[fixed_len + 1]; - char *display = NULL; - - if (fixed_len != 0) - { - Assert (fixed != NULL); - - snprintf(fixed_buf, fixed_len, "%s", fixed); - fixed_buf[fixed_len] = '\0'; - - display = strstr(fixed_buf, username); - Assert (display != NULL); - - init_ps_display("wal sender used by proxy", display, "", ""); - } - else - { - elog(WARNING, "Get ps display fixed length is 0"); - - init_ps_display("wal sender used by proxy", "", "", ""); - } - - IsNormalPostgres = false; - - WalSndSignals(); - InitWalSender(); - } -} - #endif diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 8b7af537..dbccb8f6 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -7305,14 +7305,6 @@ ResetAllOptions(void) {// #lizard forgives int i; - if (am_walsender) - { - /* never be here */ - ereport(ERROR, - (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("RESET ALL is forbidden on wal sender"))); - } - for (i = 0; i < num_guc_variables; i++) { struct config_generic *gconf = guc_variables[i]; diff --git a/src/backend/utils/misc/ps_status.c b/src/backend/utils/misc/ps_status.c index 51f668d1..06f6c857 100644 --- a/src/backend/utils/misc/ps_status.c +++ b/src/backend/utils/misc/ps_status.c @@ -417,26 +417,3 @@ get_ps_display(int *displen) return ps_buffer + ps_buffer_fixed_size; } - -/* - * Returns the fixed part in the ps display, in case someone needs - * it. Note that only the fixed part is returned. - * The string will not be null-terminated, so return the effective - * length into *fixlen. 
- */ -const char * -get_ps_display_fixed(int *fixlen) -{ -#ifdef PS_USE_CLOBBER_ARGV - /* If ps_buffer is a pointer, it might still be null */ - if (!ps_buffer) - { - *fixlen = 0; - return ""; - } -#endif - - *fixlen = (int) ps_buffer_fixed_size; - - return ps_buffer; -} diff --git a/src/include/pgstat.h b/src/include/pgstat.h index 6c4c5886..7976c39b 100644 --- a/src/include/pgstat.h +++ b/src/include/pgstat.h @@ -722,7 +722,6 @@ typedef enum BackendType B_PGXL_POOLER, B_CLEAN_2PC_LAUNCHER, B_CLEAN_2PC_WORKER, - B_PROXY_FOR_DN, } BackendType; diff --git a/src/include/pgxc/execRemote.h b/src/include/pgxc/execRemote.h index baa30f65..7047d510 100644 --- a/src/include/pgxc/execRemote.h +++ b/src/include/pgxc/execRemote.h @@ -384,7 +384,6 @@ extern void ExecRemoteUtility(RemoteQuery *node); extern bool is_data_node_ready(PGXCNodeHandle * conn); extern int handle_response(PGXCNodeHandle *conn, ResponseCombiner *combiner); -extern int handle_response_on_proxy(PGXCNodeHandle *conn, ResponseCombiner *combiner); extern void HandleCmdComplete(CmdType commandType, CombineTag *combine, const char *msg_body, size_t len); @@ -477,8 +476,6 @@ extern int pgxc_node_receive_responses(const int conn_count, PGXCNodeHandle ** c extern bool validate_combiner(ResponseCombiner *combiner); #endif -extern int pgxc_node_receive_on_proxy(PGXCNodeHandle *handle); - #ifdef __TWO_PHASE_TRANS__ extern char *get_nodelist(char * prepareGID, bool localNode, bool implicit); extern void InitLocalTwoPhaseState(void); diff --git a/src/include/pgxc/pgxc.h b/src/include/pgxc/pgxc.h index 370882dd..687be6c8 100644 --- a/src/include/pgxc/pgxc.h +++ b/src/include/pgxc/pgxc.h @@ -134,8 +134,6 @@ extern Datum xc_lockForBackupKey2; #define IsConnFromGtm() (remoteConnType == REMOTE_CONN_GTM) #define IsConnFromGtmProxy() (remoteConnType == REMOTE_CONN_GTM_PROXY) -#define IsConnFromProxy() (am_conn_from_proxy) - /* key pair to be used as object id while using advisory lock for backup */ #define XC_LOCK_FOR_BACKUP_KEY_1 0xFFFF #define XC_LOCK_FOR_BACKUP_KEY_2 0xFFFF diff --git a/src/include/pgxc/pgxcnode.h b/src/include/pgxc/pgxcnode.h index d69aa7f1..f0e7c269 100644 --- a/src/include/pgxc/pgxcnode.h +++ b/src/include/pgxc/pgxcnode.h @@ -123,9 +123,6 @@ struct pgxc_node_handle bool in_extended_query; bool needSync; /* set when error and extend query. */ - - bool stream_closed; /* Whether replicate stream is closed on proxy? */ - #ifdef __TBASE__ bool sock_fatal_occurred; /*Network failure occurred, and sock descriptor was closed */ char last_command; /*last command we processed. */ @@ -220,11 +217,6 @@ extern int pgxc_node_send_my_sync(PGXCNodeHandle * handle); #ifdef __SUBSCRIPTION__ extern int pgxc_node_send_apply(PGXCNodeHandle * handle, char * buf, int len, bool ignore_pk_conflict); #endif - -extern int pgxc_node_send_proxy_flag(PGXCNodeHandle *handle, int flag); -extern int pgxc_node_send_on_proxy(PGXCNodeHandle *handle, int firstchar, - StringInfo inBuf); - #ifdef __TBASE__ extern int pgxc_node_send_disconnect(PGXCNodeHandle * handle, char *cursor, int cons); #endif diff --git a/src/include/postgres.h b/src/include/postgres.h index c35967b0..bee66144 100644 --- a/src/include/postgres.h +++ b/src/include/postgres.h @@ -71,11 +71,6 @@ #define EXTENT_FIRST_BLOCKNUMBER(eid) ((eid)*PAGES_PER_EXTENTS) #endif -extern char *proxy_for_dn; /* Proxy for which dn? */ -extern bool am_proxy_for_dn; /* Am I a proxy for dn? */ -extern bool am_conn_from_proxy; /* Am I connected from proxy? 
*/ - - /* ---------------------------------------------------------------- * Section 1: variable-length datatypes (TOAST support) * ---------------------------------------------------------------- diff --git a/src/include/replication/walsender.h b/src/include/replication/walsender.h index a3b7876c..1f20db82 100644 --- a/src/include/replication/walsender.h +++ b/src/include/replication/walsender.h @@ -16,13 +16,6 @@ #include "fmgr.h" -#define FLAG_AM_WALSENDER 0x01 /* Flag to set am_walsender(Am I a walsender process?) */ -#define FLAG_AM_DB_WALSENDER 0x02 /* Flag to set am_db_walsender(Am I a - walsender process and connected to - a database? - Yes: used for logical replicate. - No: used for physical replicate. */ - /* * What to do with a snapshot in create replication slot command. */ diff --git a/src/include/utils/ps_status.h b/src/include/utils/ps_status.h index 097474c5..2ba5a0ea 100644 --- a/src/include/utils/ps_status.h +++ b/src/include/utils/ps_status.h @@ -23,6 +23,4 @@ extern void set_ps_display(const char *activity, bool force); extern const char *get_ps_display(int *displen); -extern const char *get_ps_display_fixed(int *displen); - #endif /* PS_STATUS_H */ From a7e94a89f47fea1588d6a457d1dbdc27ec21d45b Mon Sep 17 00:00:00 2001 From: Michael Paquier Date: Sat, 6 Apr 2019 15:23:37 +0900 Subject: [PATCH 561/578] Add support TCP user timeout in libpq and the backend server Similarly to the set of parameters for keepalive, a connection parameter for libpq is added as well as a backend GUC, called tcp_user_timeout. Increasing the TCP user timeout is useful to allow a connection to survive extended periods without end-to-end connection, and decreasing it allows application to fail faster. By default, the parameter is 0, which makes the connection use the system default, and follows a logic close to the keepalive parameters in its handling. When connecting through a Unix-socket domain, the parameters have no effect. 
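As a usage sketch only (not part of the diff below, and assuming the new setting behaves like other userset parameters), the timeout can be exercised from SQL; the value is stored in milliseconds (GUC_UNIT_MS) and 0 keeps the operating system default:

    SET tcp_user_timeout = '10s';   -- accepts time units; stored as 10000 ms
    SHOW tcp_user_timeout;          -- reports the effective setting
    RESET tcp_user_timeout;         -- back to 0, i.e. the system default

On the libpq side the same keyword is accepted as a connection-string option (for example tcp_user_timeout=10000), per the description above; it has no effect over Unix-domain sockets.
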
Author: Ryohei Nagaura Reviewed-by: Fabien Coelho, Robert Haas, Kyotaro Horiguchi, Kirk Jamison, Mikalai Keida, Takayuki Tsunakawa, Andrei Yahorau Discussion: https://postgr.es/m/EDA4195584F5064680D8130B1CA91C45367328@G01JPEXMBYT04 --- src/backend/utils/misc/guc.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index dbccb8f6..e38bcbba 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -4964,6 +4964,17 @@ static struct config_uint ConfigureNamesUInt[] = NULL, NULL, NULL }, + { + {"tcp_user_timeout", PGC_USERSET, CLIENT_CONN_OTHER, + gettext_noop("TCP user timeout."), + gettext_noop("A value of 0 uses the system default."), + GUC_UNIT_MS + }, + &tcp_user_timeout, + 0, 0, INT_MAX, + NULL, assign_tcp_user_timeout, show_tcp_user_timeout + }, + /* End-of-list marker */ { {NULL, 0, 0, NULL, NULL}, NULL, 0, 0, 0, NULL, NULL, NULL From 29754107356f9d38197728e9f8a9368fbcb69e2d Mon Sep 17 00:00:00 2001 From: sigmalin Date: Tue, 1 Mar 2022 21:34:13 +0800 Subject: [PATCH 562/578] fix hang when pqcancel http://tapd.oa.com/pgxz/bugtrace/bugs/view/1010092131097019641 --- src/backend/libpq/pqcomm.c | 1 + src/backend/utils/misc/guc.c | 11 ----------- 2 files changed, 1 insertion(+), 11 deletions(-) diff --git a/src/backend/libpq/pqcomm.c b/src/backend/libpq/pqcomm.c index db3b1ea1..132b85b2 100644 --- a/src/backend/libpq/pqcomm.c +++ b/src/backend/libpq/pqcomm.c @@ -2072,6 +2072,7 @@ SetSockKeepAlive(int sock) elog(LOG, "SetSockKeepAlive setsockopt(TCP_USER_TIMEOUT) failed: %m"); } } +} int pq_gettcpusertimeout(Port *port) diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index e38bcbba..dbccb8f6 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -4964,17 +4964,6 @@ static struct config_uint ConfigureNamesUInt[] = NULL, NULL, NULL }, - { - {"tcp_user_timeout", PGC_USERSET, CLIENT_CONN_OTHER, - gettext_noop("TCP user timeout."), - gettext_noop("A value of 0 uses the system default."), - GUC_UNIT_MS - }, - &tcp_user_timeout, - 0, 0, INT_MAX, - NULL, assign_tcp_user_timeout, show_tcp_user_timeout - }, - /* End-of-list marker */ { {NULL, 0, 0, NULL, NULL}, NULL, 0, 0, 0, NULL, NULL, NULL From 00f576f088e8d8286e4a1661d515e393f6431822 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Wed, 16 Mar 2022 16:40:15 +0800 Subject: [PATCH 563/578] fix pooler bug http://tapd.woa.com/pgxz/bugtrace/bugs/view?bug_id=1010092131099016045 --- src/backend/pgxc/pool/poolmgr.c | 127 ++++++++++--------- src/include/pgxc/poolmgr.h | 215 ++++++++++++++++---------------- 2 files changed, 179 insertions(+), 163 deletions(-) diff --git a/src/backend/pgxc/pool/poolmgr.c b/src/backend/pgxc/pool/poolmgr.c index 72d5786e..28b0d3c6 100644 --- a/src/backend/pgxc/pool/poolmgr.c +++ b/src/backend/pgxc/pool/poolmgr.c @@ -209,7 +209,7 @@ typedef struct Oid nodeoid; /* Node Oid related to this pool */ char *connstr; /* palloc memory, need free */ - time_t m_version; /* version of node pool */ + int64 m_version; /* version of node pool */ int32 size; /* total pool size */ int32 validSize; /* valid data element number */ bool failed; @@ -513,14 +513,14 @@ static int agent_acquire_connections(PoolAgent *agent, List *datanodelist, List static int send_local_commands(PoolAgent *agent, List *datanodelist, List *coordlist); static int cancel_query_on_connections(PoolAgent *agent, List *datanodelist, List *coordlist, int signal); static PGXCNodePoolSlot *acquire_connection(DatabasePool *dbPool, 
PGXCNodePool **pool,int32 nodeidx, Oid node, bool bCoord); -static void agent_release_connections(PoolAgent *agent, bool force_destroy); +static void agent_release_connections(PoolAgent *agent, bool force_destroy, bool sync); static void agent_return_connections(PoolAgent *agent); static bool agent_reset_session(PoolAgent *agent); static void release_connection(DatabasePool *dbPool, PGXCNodePoolSlot *slot, - int32 nodeidx, Oid node, bool force_destroy, bool bCoord); -static void destroy_slot_ex(int32 nodeidx, Oid node, PGXCNodePoolSlot *slot, char *file, int32 line); -#define destroy_slot(nodeidx, node, slot) destroy_slot_ex(nodeidx, node, slot, __FILE__, __LINE__) + int32 nodeidx, Oid node, bool force_destroy, bool bCoord, bool sync); +static void destroy_slot_ex(int32 nodeidx, Oid node, PGXCNodePoolSlot *slot, bool sync, char *file, int32 line); +#define destroy_slot(nodeidx, node, slot, sync) destroy_slot_ex(nodeidx, node, slot, sync, __FILE__, __LINE__) static void close_slot(int32 nodeidx, Oid node, PGXCNodePoolSlot *slot); @@ -572,7 +572,7 @@ static void *pooler_async_utility_thread(void *arg); static void *pooler_async_connection_management_thread(void *arg); static void *pooler_sync_remote_operator_thread(void *arg); -static bool pooler_async_build_connection(DatabasePool *pool, time_t pool_version, int32 nodeidx, Oid node, +static bool pooler_async_build_connection(DatabasePool *pool, int64 pool_version, int32 nodeidx, Oid node, int32 size, char *connStr, bool bCoord); static BitmapMgr *BmpMgrCreate(uint32 objnum); static int BmpMgrAlloc(BitmapMgr *mgr); @@ -1623,7 +1623,7 @@ agent_init(PoolAgent *agent, const char *database, const char *user_name, /* disconnect if we are still connected */ if (agent->pool) { - agent_release_connections(agent, false); + agent_release_connections(agent, false, false); } oldcontext = MemoryContextSwitchTo(agent->mcxt); @@ -1697,7 +1697,10 @@ agent_destroy(PoolAgent *agent) if (bsync) { - agent_release_connections(agent, true); + /* + * if temporary objects used for this pool session, release using synchronization + */ + agent_release_connections(agent, true, agent->is_temp); } } @@ -1758,7 +1761,7 @@ destroy_pend_agent(PoolAgent *agent) */ if (bsync) { - agent_release_connections(agent, true); + agent_release_connections(agent, true, false); } } @@ -2257,7 +2260,7 @@ agent_handle_input(PoolAgent * agent, StringInfo s) { elog(LOG, POOL_MGR_PREFIX"receive command %c from agent:%d. 
destory=%d", qtype, agent->pid, destroy); } - agent_release_connections(agent, destroy); + agent_release_connections(agent, destroy, false); } break; @@ -3968,8 +3971,8 @@ PoolManagerCancelQuery(int dn_count, int* dn_list, int co_count, int* co_list, i * Release connections for Datanodes and Coordinators */ static void -agent_release_connections(PoolAgent *agent, bool force_destroy) -{// #lizard forgives +agent_release_connections(PoolAgent *agent, bool force_destroy, bool sync) +{ MemoryContext oldcontext; int i; @@ -4033,7 +4036,7 @@ agent_release_connections(PoolAgent *agent, bool force_destroy) { elog(LOG, POOL_MGR_PREFIX"++++agent_release_connections pid:%d release slot_seq:%d nodename:%s backend_pid:%d++++", agent->pid, slot->seqnum, slot->node_name, slot->backend_pid); } - release_connection(agent->pool, slot, i, agent->dn_conn_oids[i], force_destroy, false); + release_connection(agent->pool, slot, i, agent->dn_conn_oids[i], force_destroy, false, sync); } agent->dn_connections[i] = NULL; } @@ -4053,7 +4056,7 @@ agent_release_connections(PoolAgent *agent, bool force_destroy) { elog(LOG, POOL_MGR_PREFIX"++++agent_release_connections pid:%d release slot_seq:%d nodename:%s backend_pid:%d++++", agent->pid, slot->seqnum, slot->node_name, slot->backend_pid); } - release_connection(agent->pool, slot, i, agent->coord_conn_oids[i], force_destroy, true); + release_connection(agent->pool, slot, i, agent->coord_conn_oids[i], force_destroy, true, sync); } agent->coord_connections[i] = NULL; } @@ -4126,7 +4129,7 @@ agent_return_connections(PoolAgent *agent) { elog(LOG, POOL_MGR_PREFIX"++++agent_return_connections pid:%d release slot_seq:%d++++", agent->pid, slot->seqnum); } - release_connection(agent->pool, slot, i, agent->dn_conn_oids[i], false, false); + release_connection(agent->pool, slot, i, agent->dn_conn_oids[i], false, false, false); } agent->dn_connections[i] = NULL; } @@ -4146,7 +4149,7 @@ agent_return_connections(PoolAgent *agent) { elog(LOG, POOL_MGR_PREFIX"++++agent_return_connections pid:%d release slot_seq:%d++++", agent->pid, slot->seqnum); } - release_connection(agent->pool, slot, i, agent->coord_conn_oids[i], false, true); + release_connection(agent->pool, slot, i, agent->coord_conn_oids[i], false, true, false); } agent->coord_connections[i] = NULL; } @@ -4216,7 +4219,7 @@ agent_reset_session(PoolAgent *agent) { elog(LOG, POOL_MGR_PREFIX"++++agent_reset_session pid:%d release slot_seq:%d++++", agent->pid, slot->seqnum); } - release_connection(agent->pool, slot, i, agent->dn_conn_oids[i], false, false); + release_connection(agent->pool, slot, i, agent->dn_conn_oids[i], false, false, false); agent->dn_connections[i] = NULL; } @@ -4258,7 +4261,7 @@ agent_reset_session(PoolAgent *agent) elog(LOG, POOL_MGR_PREFIX"++++agent_reset_session pid:%d release slot_seq:%d++++", agent->pid, slot->seqnum); } agent->coord_connections[i] = NULL; - release_connection(agent->pool, slot, i, agent->coord_conn_oids[i], false, false); + release_connection(agent->pool, slot, i, agent->coord_conn_oids[i], false, false, false); } else @@ -4422,6 +4425,7 @@ create_database_pool(const char *database, const char *user_name, const char *pg databasePool->bneed_warm = false; databasePool->bneed_precreate = false; databasePool->bneed_pool = need_pool; + databasePool->version = 0; return databasePool; } @@ -4457,7 +4461,7 @@ reload_database_pools(PoolAgent *agent) * Release node connections if any held. 
It is not guaranteed client session * does the same so don't ever try to return them to pool and reuse */ - agent_release_connections(agent, true); + agent_release_connections(agent, true, false); /* before destory nodepool, just wait for all async task is done */ bsucceed = pooler_wait_for_async_task_done(); @@ -4518,10 +4522,10 @@ reload_database_pools(PoolAgent *agent) destroy_node_pool_free_slots(nodePool); /* increase the node pool version */ - nodePool->m_version = time(NULL); + nodePool->m_version = databasePool->version++; elog(LOG, POOL_MGR_PREFIX"nodePool:%s has been changed, " "size:%d, freeSize:%d, reload_database_pools: nodePools " - "of node (%u, %s) has increased version %lu.", + "of node (%u, %s) has increased version "INT64_FORMAT, nodePool->connstr, nodePool->size, nodePool->freeSize, nodePool->nodeoid, nodePool->node_name, nodePool->m_version); @@ -4672,7 +4676,7 @@ acquire_connection(DatabasePool *dbPool, PGXCNodePool **pool,int32 nodeidx, Oid elog(WARNING, POOL_MGR_PREFIX"connection to node %u contains invalid fd:%d", node, fd); } } - destroy_slot(nodeidx, node, slot); + destroy_slot(nodeidx, node, slot, false); slot = NULL; /* Decrement current max pool size */ @@ -4721,8 +4725,8 @@ acquire_connection(DatabasePool *dbPool, PGXCNodePool **pool,int32 nodeidx, Oid */ static void release_connection(DatabasePool *dbPool, PGXCNodePoolSlot *slot, - int32 nodeidx, Oid node, bool force_destroy, bool bCoord) -{// #lizard forgives + int32 nodeidx, Oid node, bool force_destroy, bool bCoord, bool sync) +{ PGXCNodePool *nodePool; time_t now; @@ -4752,7 +4756,7 @@ release_connection(DatabasePool *dbPool, PGXCNodePoolSlot *slot, nodePool->node_name, slot->backend_pid, nodeidx, nodePool->size, nodePool->freeSize); } - destroy_slot(nodeidx, node, slot); + destroy_slot(nodeidx, node, slot, sync); return; } @@ -4788,7 +4792,7 @@ release_connection(DatabasePool *dbPool, PGXCNodePoolSlot *slot, nodePool->node_name, slot->backend_pid, nodeidx, agentCount, nodePool->size, nodePool->freeSize, nodePool->m_version, slot->m_version); } - destroy_slot(nodeidx, node, slot); + destroy_slot(nodeidx, node, slot, sync); return; } @@ -4911,7 +4915,7 @@ release_connection(DatabasePool *dbPool, PGXCNodePoolSlot *slot, nodePool->node_name, slot->backend_pid, nodeidx, nodePool->size, nodePool->freeSize); } - destroy_slot(nodeidx, node, slot); + destroy_slot(nodeidx, node, slot, sync); /* Decrease pool size */ DecreasePoolerSize(nodePool,__FILE__, __LINE__); @@ -4989,7 +4993,7 @@ grow_pool(DatabasePool *dbPool, int32 nodeidx, Oid node, bool bCoord) snprintf(nodePool->node_name, NAMEDATALEN, "%s", name_str); MemoryContextSwitchTo(oldcontext); - nodePool->m_version = time(NULL); + nodePool->m_version = dbPool->version++; elog(LOG, "grow_pool: nodePools of node (%u, %s) is created.", nodePool->nodeoid, nodePool->node_name); @@ -5038,8 +5042,8 @@ grow_pool(DatabasePool *dbPool, int32 nodeidx, Oid node, bool bCoord) * Destroy pool slot, including slot itself. 
*/ static void -destroy_slot_ex(int32 nodeidx, Oid node, PGXCNodePoolSlot *slot, char *file, int32 line) -{// #lizard forgives +destroy_slot_ex(int32 nodeidx, Oid node, PGXCNodePoolSlot *slot, bool sync, char *file, int32 line) +{ int32 threadid = 0; uint64 pipeput_loops = 0; PGXCPoolConnectReq *connReq; @@ -5070,6 +5074,16 @@ destroy_slot_ex(int32 nodeidx, Oid node, PGXCNodePoolSlot *slot, char *file, int elog(LOG, POOL_MGR_PREFIX"destroy_slot invalid slot status, null pointer conn:%p xc_cancelConn:%p", slot->conn, slot->xc_cancelConn); } + if (sync) + { + /* release now if sync */ + PQfreeCancel((PGcancel *)slot->xc_cancelConn); + PGXCNodeClose(slot->conn); + slot->bdestoryed = true; + pfree(slot); + return; + } + /* if no free pipe line avaliable, just do it sync */ threadid = pooler_async_task_pick_thread(&g_PoolConnControl, nodeidx); if (-1 == threadid) @@ -5228,7 +5242,7 @@ destroy_node_pool(PGXCNodePool *node_pool) nodeidx = get_node_index_by_nodeoid(node_pool->nodeoid); for (i = 0; i < node_pool->freeSize; i++) { - destroy_slot(nodeidx, node_pool->nodeoid, node_pool->slot[i]); + destroy_slot(nodeidx, node_pool->nodeoid, node_pool->slot[i], false); } pfree(node_pool->slot); node_pool->size -= node_pool->freeSize; @@ -5252,7 +5266,7 @@ destroy_node_pool_free_slots(PGXCNodePool *node_pool) if (PoolConnectDebugPrint) { elog(LOG, - POOL_MGR_PREFIX"About to destroy slots of node pool %s, node_pool version:%lu " + POOL_MGR_PREFIX"About to destroy slots of node pool %s, node_pool version:"INT64_FORMAT "agentCount is %d current size is %d, freeSize is %d, %d connections are in use", node_pool->connstr, node_pool->m_version, agentCount, node_pool->size, node_pool->freeSize, node_pool->size - node_pool->freeSize); @@ -5264,7 +5278,7 @@ destroy_node_pool_free_slots(PGXCNodePool *node_pool) nodeidx = get_node_index_by_nodeoid(node_pool->nodeoid); for (i = 0; i < node_pool->freeSize; i++) { - destroy_slot(nodeidx, node_pool->nodeoid, node_pool->slot[i]); + destroy_slot(nodeidx, node_pool->nodeoid, node_pool->slot[i], false); node_pool->slot[i] = NULL; } node_pool->size -= node_pool->freeSize; @@ -5786,7 +5800,7 @@ clean_connection(List *node_discard, const char *database, const char *user_name nodeidx = get_node_index_by_nodeoid(nodePool->nodeoid); for (i = 0; i < nodePool->freeSize; i++) { - destroy_slot(nodeidx, nodePool->nodeoid, nodePool->slot[i]); + destroy_slot(nodeidx, nodePool->nodeoid, nodePool->slot[i], false); nodePool->slot[i] = NULL; } } @@ -5965,7 +5979,7 @@ shrink_pool(DatabasePool *pool) nodePool->size, nodePool->freeSize); } /* connection is idle for long, close it */ - destroy_slot(nodeidx, nodePool->nodeoid, slot); + destroy_slot(nodeidx, nodePool->nodeoid, slot, false); /* reduce pool size and total number of connections */ DecreasePoolerFreesize(nodePool,__FILE__,__LINE__); @@ -6304,7 +6318,7 @@ static void pooler_handle_sync_response_queue(void) { if (connRsp->slot->conn) { - destroy_slot(connRsp->nodeindex, connRsp->nodepool->nodeoid, connRsp->slot); + destroy_slot(connRsp->nodeindex, connRsp->nodepool->nodeoid, connRsp->slot, false); } else { @@ -6335,7 +6349,7 @@ static void pooler_handle_sync_response_queue(void) /* Force to close the connection. 
*/ if (slot) { - release_connection(connRsp->agent->pool, slot, connRsp->nodeindex, nodeoid, true, connRsp->bCoord); + release_connection(connRsp->agent->pool, slot, connRsp->nodeindex, nodeoid, true, connRsp->bCoord, false); } } pfree(connRsp); @@ -6388,7 +6402,7 @@ static void pooler_handle_sync_response_queue(void) /* Force to close the connection. */ if (slot) { - release_connection(connRsp->agent->pool, slot, connRsp->nodeindex, nodeOid, true, connRsp->bCoord); + release_connection(connRsp->agent->pool, slot, connRsp->nodeindex, nodeOid, true, connRsp->bCoord, false); } else { @@ -6602,7 +6616,7 @@ static void pooler_handle_sync_response_queue(void) /* Force to close the connection. */ if (slot) { - release_connection(connRsp->agent->pool, slot, connRsp->nodeindex, nodeoid, true, connRsp->bCoord); + release_connection(connRsp->agent->pool, slot, connRsp->nodeindex, nodeoid, true, connRsp->bCoord, false); } } else if (connRsp->error_flag) @@ -6709,7 +6723,7 @@ static void pooler_sync_connections_to_nodepool(void) } /* time to close the connection */ - destroy_slot(asyncInfo->nodeindex, asyncInfo->node, asyncInfo->slot); + destroy_slot(asyncInfo->nodeindex, asyncInfo->node, asyncInfo->slot, false); if (nodePool) { /* Decrement pool size */ @@ -6753,7 +6767,7 @@ static void pooler_sync_connections_to_nodepool(void) nodePool->coord = false; /* in this case, only datanode */ nodePool->nwarming = 0; nodePool->nquery = 0; - nodePool->m_version = time(NULL); + nodePool->m_version = asyncInfo->dbPool->version++; name_str = get_node_name_by_nodeoid(asyncInfo->node); if (NULL == name_str) @@ -6769,7 +6783,7 @@ static void pooler_sync_connections_to_nodepool(void) if (COMMAND_CONNECTION_WARM == asyncInfo->cmd && false == asyncInfo->slot->bwarmed) { nodeidx = get_node_index_by_nodeoid(asyncInfo->node); - destroy_slot(nodeidx, asyncInfo->node, asyncInfo->slot); + destroy_slot(nodeidx, asyncInfo->node, asyncInfo->slot, false); /* Decrease pool size */ DecreasePoolerSize(nodePool,__FILE__, __LINE__); @@ -6803,13 +6817,15 @@ static void pooler_sync_connections_to_nodepool(void) { elog(LOG, POOL_MGR_PREFIX"destory connection to node:%u " "nodeidx:%d nodepool size:%d freeSize:%d for unmatch " - "version, slot->m_version:%lu, nodePool->m_version:%lu", + "version, slot->m_version:"INT64_FORMAT", nodePool->m_version:"INT64_FORMAT, asyncInfo->node, nodeidx, nodePool->size, nodePool->freeSize, asyncInfo->slot->m_version, nodePool->m_version); } nodeidx = get_node_index_by_nodeoid(asyncInfo->node); - destroy_slot(nodeidx, asyncInfo->node, asyncInfo->slot); + + destroy_slot(nodeidx, asyncInfo->node, asyncInfo->slot, false); + break; } @@ -6922,8 +6938,7 @@ static void pooler_sync_connections_to_nodepool(void) errmsg(POOL_MGR_PREFIX"get node %u name failed", connRsp->nodeoid))); } snprintf(nodePool->node_name, NAMEDATALEN, "%s", name_str); - - nodePool->m_version = now; + nodePool->m_version = connRsp->dbPool->version++; elog(LOG, "pooler_sync_connections_to_nodepool: nodePools of " "node (%u, %s) is created.", @@ -6973,12 +6988,12 @@ static void pooler_sync_connections_to_nodepool(void) } else { - destroy_slot(connRsp->nodeindex, connRsp->nodeoid, slot); + destroy_slot(connRsp->nodeindex, connRsp->nodeoid, slot, false); if (PoolConnectDebugPrint) { elog(LOG, POOL_MGR_PREFIX"destroy slot poolsize:%d, " "freeSize:%d, node:%u, MaxPoolSize:%d, " - "connRsp->m_version:%lu, nodePool->m_version:%lu", + "connRsp->m_version:"INT64_FORMAT", nodePool->m_version:"INT64_FORMAT, nodePool->size, nodePool->freeSize, 
nodePool->nodeoid, MaxPoolSize, connRsp->m_version, nodePool->m_version); @@ -7171,7 +7186,7 @@ static void pooler_async_ping_node(Oid node) /* async batch connection build */ -static bool pooler_async_build_connection(DatabasePool *pool, time_t pool_version, int32 nodeidx, Oid node, int32 size, char *connStr, bool bCoord) +static bool pooler_async_build_connection(DatabasePool *pool, int64 pool_version, int32 nodeidx, Oid node, int32 size, char *connStr, bool bCoord) { int32 threadid; uint64 pipeput_loops = 0; @@ -7533,7 +7548,7 @@ preconnect_and_warm(DatabasePool *dbPool) } snprintf(nodePool->node_name, NAMEDATALEN, "%s", name_str); - nodePool->m_version = time(NULL); + nodePool->m_version = dbPool->version++; elog(LOG, "preconnect_and_warm: nodePools of node (%u, %s) is created.", nodePool->nodeoid, nodePool->node_name); @@ -7564,7 +7579,7 @@ preconnect_and_warm(DatabasePool *dbPool) (errcode(ERRCODE_CONNECTION_FAILURE), errmsg(POOL_MGR_PREFIX"failed to connect to Datanode:[%s],errmsg[%s]", nodePool->connstr, PQerrorMessage((PGconn*)(slot->conn))))); nodeidx = get_node_index_by_nodeoid(nodePool->nodeoid); - destroy_slot(nodeidx, nodePool->nodeoid, slot); + destroy_slot(nodeidx, nodePool->nodeoid, slot, false); pfree((void*)dnOids); pfree((void*)success); return false; @@ -9097,7 +9112,7 @@ static inline bool dispatch_reset_request(PGXCASyncTaskCtl *taskControl, { elog(LOG, POOL_MGR_PREFIX"++++dispatch_reset_request pid:%d release slot_seq:%d++++", agent->pid, slot->seqnum); } - release_connection(agent->pool, slot, nodeindex, node, false, bCoord); + release_connection(agent->pool, slot, nodeindex, node, false, bCoord, false); } } return ret; @@ -10154,7 +10169,7 @@ static void print_pooler_slot(PGXCNodePoolSlot *slot) } else { - elog(LOG, "slot=%p bwarmed=%d usecount=%d refcount=%d m_version=%lu pid=%d seqnum=%d " + elog(LOG, "slot=%p bwarmed=%d usecount=%d refcount=%d m_version="INT64_FORMAT" pid=%d seqnum=%d " "bdestoryed=%d file=%s lineno=%d node_name=%s backend_pid=%d", slot, slot->bwarmed, slot->usecount, slot->refcount,slot->m_version,slot->pid,slot->seqnum, @@ -11071,10 +11086,10 @@ refresh_database_pools(PoolAgent *agent) destroy_node_pool_free_slots(nodePool); /* increase the node pool version */ - nodePool->m_version = time(NULL); + nodePool->m_version = databasePool->version++; elog(LOG, "refresh_database_pools: Found an altered node (%u %s) " - "size %d freesize %d increased m_version %lu" + "size %d freesize %d increased m_version "INT64_FORMAT "connstr_chk=%s, nodePool->connstr=%s", nodePool->nodeoid, nodePool->node_name, nodePool->size, nodePool->freeSize, nodePool->m_version, @@ -11381,7 +11396,7 @@ handle_close_pooled_connections(PoolAgent * agent, StringInfo s) destroy_node_pool_free_slots(nodePool); /* increase the node pool version */ - nodePool->m_version = time(NULL); + nodePool->m_version = databasePool->version++; } } diff --git a/src/include/pgxc/poolmgr.h b/src/include/pgxc/poolmgr.h index e5cee0b6..43ad4d65 100644 --- a/src/include/pgxc/poolmgr.h +++ b/src/include/pgxc/poolmgr.h @@ -2,7 +2,7 @@ * * poolmgr.h * - * Definitions for the Datanode connection pool. + * Definitions for the Datanode connection pool. 
* * * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group @@ -56,16 +56,16 @@ */ typedef enum { - POOL_CMD_TEMP, /* Temporary object flag */ - POOL_CMD_LOCAL_SET, /* Local SET flag, current transaction block only */ - POOL_CMD_GLOBAL_SET /* Global SET flag */ + POOL_CMD_TEMP, /* Temporary object flag */ + POOL_CMD_LOCAL_SET, /* Local SET flag, current transaction block only */ + POOL_CMD_GLOBAL_SET /* Global SET flag */ } PoolCommandType; #ifdef __TBASE__ typedef enum { - SIGNAL_SIGINT = 0, - SIGNAL_SIGUSR2 = 1 + SIGNAL_SIGINT = 0, + SIGNAL_SIGUSR2 = 1 } SignalType; /* @@ -74,17 +74,17 @@ typedef enum */ typedef enum { - POOL_ERR_NONE, - POOL_ERR_GET_CONNECTIONS_POOLER_LOCKED, - POOL_ERR_GET_CONNECTIONS_TASK_NOT_DONE, - POOL_ERR_GET_CONNECTIONS_DISPATCH_FAILED, - POOL_ERR_GET_CONNECTIONS_INVALID_ARGUMENT, - POOL_ERR_GET_CONNECTIONS_OOM, - POOL_ERR_GET_CONNECTIONS_CONNECTION_BAD, - POOL_ERR_CANCEL_TASK_NOT_DONE, - POOL_ERR_CANCEL_DISPATCH_FAILED, - POOL_ERR_CANCEL_SEND_FAILED, - NUMBER_POOL_ERRS + POOL_ERR_NONE, + POOL_ERR_GET_CONNECTIONS_POOLER_LOCKED, + POOL_ERR_GET_CONNECTIONS_TASK_NOT_DONE, + POOL_ERR_GET_CONNECTIONS_DISPATCH_FAILED, + POOL_ERR_GET_CONNECTIONS_INVALID_ARGUMENT, + POOL_ERR_GET_CONNECTIONS_OOM, + POOL_ERR_GET_CONNECTIONS_CONNECTION_BAD, + POOL_ERR_CANCEL_TASK_NOT_DONE, + POOL_ERR_CANCEL_DISPATCH_FAILED, + POOL_ERR_CANCEL_SEND_FAILED, + NUMBER_POOL_ERRS } PoolErrorCode; #define PoolErrIsValid(err) ((bool) (err > POOL_ERR_NONE && err < NUMBER_POOL_ERRS)) @@ -93,90 +93,91 @@ typedef enum /* Connection pool entry */ typedef struct { - /* stamp elements */ - time_t released;/* timestamp when the connection last time release */ - time_t checked; /* timestamp when the connection last time check */ - time_t created; /* timestamp when the connection created */ - bool bwarmed; - - int32 usecount; - NODE_CONNECTION *conn; - NODE_CANCEL *xc_cancelConn; - - /* trace info */ - int32 refcount; /* reference count */ - time_t m_version; /* version of node slot */ - int32 pid; /* agent pid that contains the slot */ - int32 seqnum; /* slot seqnum for the slot, unique for one slot */ - bool bdestoryed; /* used to show whether we are destoryed */ - char *file; /* file where destroy the slot */ - int32 lineno; /* lineno where destroy the slot */ - char *node_name; /* connection node name , pointer to datanode_pool node_name, no memory allocated*/ - int32 backend_pid;/* backend pid of remote connection */ + /* stamp elements */ + time_t released;/* timestamp when the connection last time release */ + time_t checked; /* timestamp when the connection last time check */ + time_t created; /* timestamp when the connection created */ + bool bwarmed; + + int32 usecount; + NODE_CONNECTION *conn; + NODE_CANCEL *xc_cancelConn; + + /* trace info */ + int32 refcount; /* reference count */ + int64 m_version; /* version of node slot */ + int32 pid; /* agent pid that contains the slot */ + int32 seqnum; /* slot seqnum for the slot, unique for one slot */ + bool bdestoryed; /* used to show whether we are destoryed */ + char *file; /* file where destroy the slot */ + int32 lineno; /* lineno where destroy the slot */ + char *node_name; /* connection node name , pointer to datanode_pool node_name, no memory allocated*/ + int32 backend_pid;/* backend pid of remote connection */ } PGXCNodePoolSlot; /* Pool of connections to specified pgxc node */ typedef struct { - Oid nodeoid; /* Node Oid related to this pool */ - bool coord; /* whether am I coordinator */ - bool asyncInProgress;/* whether am in 
asyn building */ - char *connstr; - int nwarming; /* connection number warming in progress */ - int nquery; /* connection number query memory size in progress */ - int freeSize; /* available connections */ - int size; /* total pool size */ - - char node_name[NAMEDATALEN]; /* name of the node.*/ - time_t m_version; /* version of node pool */ - PGXCNodePoolSlot **slot; + Oid nodeoid; /* Node Oid related to this pool */ + bool coord; /* whether am I coordinator */ + bool asyncInProgress;/* whether am in asyn building */ + char *connstr; + int nwarming; /* connection number warming in progress */ + int nquery; /* connection number query memory size in progress */ + int freeSize; /* available connections */ + int size; /* total pool size */ + + char node_name[NAMEDATALEN]; /* name of the node.*/ + int64 m_version; /* version of node pool */ + PGXCNodePoolSlot **slot; } PGXCNodePool; /* All pools for specified database */ typedef struct databasepool { - char *database; - char *user_name; - char *pgoptions; /* Connection options */ - HTAB *nodePools; /* Hashtable of PGXCNodePool, one entry for each - * Coordinator or DataNode */ - time_t oldest_idle; - bool bneed_warm; - bool bneed_precreate; - bool bneed_pool; /* check whether need connect pool */ - MemoryContext mcxt; - struct databasepool *next; /* Reference to next to organize linked list */ + char *database; + char *user_name; + char *pgoptions; /* Connection options */ + HTAB *nodePools; /* Hashtable of PGXCNodePool, one entry for each + * Coordinator or DataNode */ + time_t oldest_idle; + bool bneed_warm; + bool bneed_precreate; + bool bneed_pool; /* check whether need connect pool */ + int64 version; /* used to generate node_pool's version */ + MemoryContext mcxt; + struct databasepool *next; /* Reference to next to organize linked list */ } DatabasePool; #define PGXC_POOL_ERROR_MSG_LEN 512 typedef struct PGXCASyncTaskCtl { - slock_t m_lock; /* common lock */ - int32 m_status; /* PoolAyncCtlStaus */ - int32 m_mumber_total; - int32 m_number_done; - - /* acquire connections */ - int32 *m_result; /* fd array */ - int32 *m_pidresult; /* pid array */ - List *m_datanodelist; - List *m_coordlist; - int32 m_number_succeed; - - /* set local command */ - int32 m_res; - - /* set command */ - char *m_command; - int32 m_total; - int32 m_succeed; - - /* last command for 'g' and 's' */ - CommandId m_max_command_id; - - /* errmsg and error status. */ + slock_t m_lock; /* common lock */ + int32 m_status; /* PoolAyncCtlStaus */ + int32 m_mumber_total; + int32 m_number_done; + + /* acquire connections */ + int32 *m_result; /* fd array */ + int32 *m_pidresult; /* pid array */ + List *m_datanodelist; + List *m_coordlist; + int32 m_number_succeed; + + /* set local command */ + int32 m_res; + + /* set command */ + char *m_command; + int32 m_total; + int32 m_succeed; + + /* last command for 'g' and 's' */ + CommandId m_max_command_id; + + /* errmsg and error status. 
*/ bool m_missing_ok; - int32 m_error_offset; - char m_error_msg[PGXC_POOL_ERROR_MSG_LEN]; + int32 m_error_offset; + char m_error_msg[PGXC_POOL_ERROR_MSG_LEN]; }PGXCASyncTaskCtl; @@ -222,8 +223,8 @@ typedef struct /* Handle to the pool manager (Session's side) */ typedef struct { - /* communication channel */ - PoolPort port; + /* communication channel */ + PoolPort port; } PoolHandle; typedef struct PoolerCmdStatistics @@ -245,39 +246,39 @@ typedef struct PoolerCmdStatistics #define POOLER_ERROR_MSG_LEN 256 -extern int MinPoolSize; -extern int MaxPoolSize; -extern int InitPoolSize; -extern int MinFreeSize; +extern int MinPoolSize; +extern int MaxPoolSize; +extern int InitPoolSize; +extern int MinFreeSize; -extern int PoolerPort; -extern int PoolConnKeepAlive; -extern int PoolMaintenanceTimeout; +extern int PoolerPort; +extern int PoolConnKeepAlive; +extern int PoolMaintenanceTimeout; extern bool PersistentConnections; extern char *g_PoolerWarmBufferInfo; extern char *g_unpooled_database; extern char *g_unpooled_user; -extern int PoolSizeCheckGap; -extern int PoolConnMaxLifetime; -extern int PoolMaxMemoryLimit; -extern int PoolConnectTimeOut; +extern int PoolSizeCheckGap; +extern int PoolConnMaxLifetime; +extern int PoolMaxMemoryLimit; +extern int PoolConnectTimeOut; extern int PoolScaleFactor; extern int PoolDNSetTimeout; extern int PoolCheckSlotTimeout; extern int PoolPrintStatTimeout; -extern bool PoolConnectDebugPrint; +extern bool PoolConnectDebugPrint; extern bool PoolSubThreadLogPrint; /* Status inquiry functions */ extern void PGXCPoolerProcessIam(void); extern bool IsPGXCPoolerProcess(void); /* Initialize internal structures */ -extern int PoolManagerInit(void); +extern int PoolManagerInit(void); /* Destroy internal structures */ -extern int PoolManagerDestroy(void); +extern int PoolManagerDestroy(void); /* * Get handle to pool manager. 
This function should be called just before @@ -308,8 +309,8 @@ extern char *session_options(void); * initialize respective connection pool */ extern void PoolManagerConnect(PoolHandle *handle, - const char *database, const char *user_name, - char *pgoptions); + const char *database, const char *user_name, + char *pgoptions); /* * Reconnect to pool manager @@ -327,7 +328,7 @@ extern void PoolManagerReconnect(void); #define POOL_SET_COMMAND_NONE 0 extern int PoolManagerSetCommand(PGXCNodeHandle **connections, int32 count, PoolCommandType command_type, - const char *set_command); + const char *set_command); /* Get pooled connections */ extern int *PoolManagerGetConnections(List *datanodelist, List *coordlist, bool raise_error, int **pids); @@ -342,7 +343,7 @@ extern bool PoolManagerCheckConnectionInfo(void); extern void PoolManagerReloadConnectionInfo(void); /* Send Abort signal to transactions being run */ -extern int PoolManagerAbortTransactions(char *dbname, char *username, int **proc_pids); +extern int PoolManagerAbortTransactions(char *dbname, char *username, int **proc_pids); /* Return connections back to the pool, for both Coordinator and Datanode connections */ extern void PoolManagerReleaseConnections(bool force); @@ -364,7 +365,7 @@ extern void PoolAsyncPingNodes(void); extern void PoolPingNodes(void); extern void PoolPingNodeRecheck(Oid nodeoid); extern bool check_persistent_connections(bool *newval, void **extra, - GucSource source); + GucSource source); /* Refresh connection data in pooler and drop connections of altered nodes in pooler */ extern int PoolManagerRefreshConnectionInfo(void); From 7b3dcdeb119bf29eb88bf29efc7308248e1c9002 Mon Sep 17 00:00:00 2001 From: andrelin Date: Tue, 26 Apr 2022 20:06:41 +0800 Subject: [PATCH 564/578] Fix bug of transparent decryption on a col tapd: http://tapd.woa.com/pgxz/bugtrace/bugs/view/1010092131098985873 --- src/backend/utils/misc/relcrypt.c | 7 +++-- src/test/regress/expected/mls_check.out | 31 +++++++++++++++++++ .../regress/expected/updatable_views_1.out | 2 ++ src/test/regress/sql/mls_check.sql | 13 ++++++++ 4 files changed, 51 insertions(+), 2 deletions(-) diff --git a/src/backend/utils/misc/relcrypt.c b/src/backend/utils/misc/relcrypt.c index 954ff6c6..ca65b3b4 100644 --- a/src/backend/utils/misc/relcrypt.c +++ b/src/backend/utils/misc/relcrypt.c @@ -1605,17 +1605,20 @@ Datum trsprt_crypt_decrypt_one_col_value(TranspCrypt*transp_crypt, { Datum datum_ret; text * datum_text; + text * input_text; if (TRANSP_CRYPT_INVALID_ALGORITHM_ID != transp_crypt->algo_id) { - datum_text = decrypt_procedure(transp_crypt->algo_id, DatumGetTextP(inputval), INVALID_CONTEXT_LENGTH); + input_text = DatumGetTextP(inputval); + + datum_text = decrypt_procedure(transp_crypt->algo_id, input_text, INVALID_CONTEXT_LENGTH); if (datum_text) { datum_ret = transparent_crypt_text_get_datum(datum_text, attr); } else { - datum_ret = transparent_crypt_text_get_datum(DatumGetTextP(inputval), attr); + datum_ret = transparent_crypt_text_get_datum(input_text, attr); } return datum_ret; diff --git a/src/test/regress/expected/mls_check.out b/src/test/regress/expected/mls_check.out index fd8a30b5..129a101f 100644 --- a/src/test/regress/expected/mls_check.out +++ b/src/test/regress/expected/mls_check.out @@ -2781,6 +2781,37 @@ select MLS_TRANSPARENT_CRYPT_ALGORITHM_UNBIND_SCHEMA('crypt_schema_sm4'); \c - godlike alter schema crypt_schema_sm4 rename to crypt_schema_sm5; drop schema crypt_schema_sm5; +create table tbl_col_sm4(normala int, normalb int, encrypted varchar) 
distribute by shard(normala); +NOTICE: Replica identity is needed for shard table, please add to this table through "alter table" command. +\c - mls_admin +select MLS_TRANSPARENT_CRYPT_ALGORITHM_BIND_TABLE('public', 'tbl_col_sm4', 'encrypted', 4); + mls_transparent_crypt_algorithm_bind_table +-------------------------------------------- + t +(1 row) + +\c - godlike +insert into tbl_col_sm4 values(1, 11, '1111dfa11'); +insert into tbl_col_sm4 values(2, 22, repeat('a', 16)); +insert into tbl_col_sm4 values(3, 33, 'dsfanle1={ntwkqweg-dibjf"sdfaw21(){{()"wjqtoij2j 199'); +select * from tbl_col_sm4 order by 1; + normala | normalb | encrypted +---------+---------+------------------------------------------------------ + 1 | 11 | 1111dfa11 + 2 | 22 | aaaaaaaaaaaaaaaa + 3 | 33 | dsfanle1={ntwkqweg-dibjf"sdfaw21(){{()"wjqtoij2j 199 +(3 rows) + +truncate tbl_col_sm4; +\c - mls_admin +select MLS_TRANSPARENT_CRYPT_ALGORITHM_UNBIND_TABLE('public', 'tbl_col_sm4', 'encrypted'); + mls_transparent_crypt_algorithm_unbind_table +---------------------------------------------- + t +(1 row) + +\c - godlike +drop table tbl_col_sm4; --case rename tables in crypted schema \c - godlike create schema crypt_schema_sm66; diff --git a/src/test/regress/expected/updatable_views_1.out b/src/test/regress/expected/updatable_views_1.out index ae85f4e2..df12b4be 100644 --- a/src/test/regress/expected/updatable_views_1.out +++ b/src/test/regress/expected/updatable_views_1.out @@ -2133,6 +2133,7 @@ UPDATE v1 SET a=100 WHERE snoop(a) AND leakproof(a) AND a < 7 AND a != 6; ERROR: DML contains a function runs on CN which is not supported HINT: You might need to push that function down to DN. ALTER FUNCTION leakproof(anyelement) pushdown; +ALTER FUNCTION snoop(anyelement) pushdown; EXPLAIN (VERBOSE, COSTS OFF) UPDATE v1 SET a=100 WHERE snoop(a) AND leakproof(a) AND a < 7 AND a != 6; QUERY PLAN @@ -2170,6 +2171,7 @@ UPDATE v1 SET a=100 WHERE snoop(a) AND leakproof(a) AND a < 7 AND a != 6; UPDATE v1 SET a=100 WHERE snoop(a) AND leakproof(a) AND a < 7 AND a != 6; ALTER FUNCTION leakproof(anyelement) not pushdown; +ALTER FUNCTION snoop(anyelement) not pushdown; SELECT * FROM v1 WHERE a=100; -- Nothing should have been changed to 100 a | b | c | d ---+---+---+--- diff --git a/src/test/regress/sql/mls_check.sql b/src/test/regress/sql/mls_check.sql index 0b96a0c6..c1eaaeb9 100644 --- a/src/test/regress/sql/mls_check.sql +++ b/src/test/regress/sql/mls_check.sql @@ -973,6 +973,19 @@ select MLS_TRANSPARENT_CRYPT_ALGORITHM_UNBIND_SCHEMA('crypt_schema_sm4'); \c - godlike alter schema crypt_schema_sm4 rename to crypt_schema_sm5; drop schema crypt_schema_sm5; +create table tbl_col_sm4(normala int, normalb int, encrypted varchar) distribute by shard(normala); +\c - mls_admin +select MLS_TRANSPARENT_CRYPT_ALGORITHM_BIND_TABLE('public', 'tbl_col_sm4', 'encrypted', 4); +\c - godlike +insert into tbl_col_sm4 values(1, 11, '1111dfa11'); +insert into tbl_col_sm4 values(2, 22, repeat('a', 16)); +insert into tbl_col_sm4 values(3, 33, 'dsfanle1={ntwkqweg-dibjf"sdfaw21(){{()"wjqtoij2j 199'); +select * from tbl_col_sm4 order by 1; +truncate tbl_col_sm4; +\c - mls_admin +select MLS_TRANSPARENT_CRYPT_ALGORITHM_UNBIND_TABLE('public', 'tbl_col_sm4', 'encrypted'); +\c - godlike +drop table tbl_col_sm4; --case rename tables in crypted schema \c - godlike From 15d669556eb36e4fa4d15dfaa1e62c39c888f207 Mon Sep 17 00:00:00 2001 From: sigmalin Date: Mon, 9 May 2022 15:37:36 +0800 Subject: [PATCH 565/578] fix compile and warning --- src/backend/parser/parse_utilcmd.c | 
7 ++++--- src/backend/tcop/postgres.c | 13 ------------- 2 files changed, 4 insertions(+), 16 deletions(-) diff --git a/src/backend/parser/parse_utilcmd.c b/src/backend/parser/parse_utilcmd.c index 2be306b7..87901b13 100644 --- a/src/backend/parser/parse_utilcmd.c +++ b/src/backend/parser/parse_utilcmd.c @@ -3482,6 +3482,7 @@ transformAlterTableStmt(Oid relid, AlterTableStmt *stmt, int year; int mon; int day; + Form_pg_partition_interval routerinfo = NULL; existnparts = RelationGetNParts(rel); newnparts = ((AddDropPartitions*)cmd->def)->nparts; @@ -3499,7 +3500,6 @@ transformAlterTableStmt(Oid relid, AlterTableStmt *stmt, /* * Self-developed partition table compatibility processing */ - Form_pg_partition_interval routerinfo = NULL; routerinfo = rel->rd_partitions_info; /* timestamp convert to posix struct */ @@ -3523,6 +3523,9 @@ transformAlterTableStmt(Oid relid, AlterTableStmt *stmt, for(partidx = existnparts; partidx < existnparts + newnparts; partidx++) { + TableLikeClause *likeclause = makeNode(TableLikeClause); + CreateStmt * createpart = makeNode(CreateStmt); + /* * for compatible with the calculation of the normal time of the self-developed partition table */ @@ -3538,8 +3541,6 @@ transformAlterTableStmt(Oid relid, AlterTableStmt *stmt, } } - TableLikeClause *likeclause = makeNode(TableLikeClause); - CreateStmt * createpart = makeNode(CreateStmt); createpart->relation = copyObject((void *) stmt->relation); createpart->relation->schemaname = get_namespace_name(RelationGetNamespace(rel)); //createpart->relation->relname = GetPartitionName(RelationGetRelid(rel), partidx, false); diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index abab8378..77992099 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -1142,19 +1142,6 @@ pg_plan_queries(List *querytrees, int cursorOptions, ParamListInfo boundParams) return stmt_list; } -static bool -ch_is_space(char ch) -{ - if (ch == ' ' || ch == '\n' || ch == '\t' || ch == '\r' || ch == '\f') - { - return true; - } - else - { - return false; - } -} - /* * get myself query string from original query string, * if the query string contain multi stmt From 63403784f1d23baba0b5845f052a920b4aadc449 Mon Sep 17 00:00:00 2001 From: ceciliasu Date: Fri, 8 Apr 2022 16:19:30 +0800 Subject: [PATCH 566/578] Ignore member of the extensions in function pg_get_publication_tables (merge request !1235) TAPD: --bug=098374847 http://tapd.woa.com/20421696/bugtrace/bugs/view?bug_id=1020421696098374847 --- src/backend/catalog/pg_publication.c | 153 ++++++++++++++++++++++++++- 1 file changed, 152 insertions(+), 1 deletion(-) diff --git a/src/backend/catalog/pg_publication.c b/src/backend/catalog/pg_publication.c index 438a2a74..4d21340b 100644 --- a/src/backend/catalog/pg_publication.c +++ b/src/backend/catalog/pg_publication.c @@ -93,6 +93,8 @@ #include "catalog/pg_type.h" #include "catalog/pg_publication.h" #include "catalog/pg_publication_rel.h" +#include "catalog/pg_depend.h" +#include "catalog/pg_extension.h" #include "utils/array.h" #include "utils/builtins.h" @@ -110,6 +112,145 @@ #include "replication/logicalrelation.h" #endif +typedef struct +{ + Oid tableoid; + Oid oid; +} CatalogId; + +/* This is an array of object identities. */ +static CatalogId *extmembers; +static int numextmembers; + +#define oidcmp(x,y) ( ((x) < (y) ? -1 : ((x) > (y)) ? 1 : 0) ) + +/* + * qsort comparator for CatalogId. 
+ */
+static int
+CatalogIdCompare(const void *p1, const void *p2)
+{
+	const CatalogId *obj1 = (const CatalogId *) p1;
+	const CatalogId *obj2 = (const CatalogId *) p2;
+	int			cmpval;
+
+	/*
+	 * Compare OID first since it's usually unique, whereas there will only be
+	 * a few distinct values of tableoid.
+	 */
+	cmpval = oidcmp(obj1->oid, obj2->oid);
+	if (cmpval == 0)
+		cmpval = oidcmp(obj1->tableoid, obj2->tableoid);
+	return cmpval;
+}
+
+/*
+ * setExtensionMembership
+ *	  accept and save data about which objects belong to extensions
+ */
+static void
+setExtensionMembership(CatalogId *extmems, int nextmems)
+{
+	/* Sort array in preparation for binary searches */
+	if (nextmems > 1)
+		qsort((void *) extmems, nextmems, sizeof(CatalogId),
+			  CatalogIdCompare);
+	/* And save */
+	extmembers = extmems;
+	numextmembers = nextmems;
+}
+
+/*
+ * getExtensionMembership --- obtain extension membership data
+ *
+ * We need to identify objects that are extension members as soon as they're
+ * loaded, so that we can correctly determine whether they should be treated
+ * as publishable. Generally speaking, extension member objects are marked as
+ * *not* publishable.
+ */
+static void
+getExtensionMembership(void)
+{
+	CatalogId  *extmembers;
+	Relation	depRel;
+	SysScanDesc depScan;
+	HeapTuple	depTup;
+	int			maxObjs = 32;
+	int			nextmembers = 0;
+
+	extmembers = (CatalogId *) palloc0(maxObjs * sizeof(CatalogId));
+
+	depRel = heap_open(DependRelationId, AccessShareLock);
+	depScan = systable_beginscan(depRel, DependReferenceIndexId, true, NULL, 0, NULL);
+	while (HeapTupleIsValid(depTup = systable_getnext(depScan)))
+	{
+		/*
+		 * We scan pg_depend to find those relations (RelationRelationId)
+		 * that depend on the given extension type.
+		 * (We assume we can ignore refobjsubid for a type.)
+		 */
+		Form_pg_depend pg_depend = (Form_pg_depend) GETSTRUCT(depTup);
+		if (pg_depend->refclassid != ExtensionRelationId
+			|| pg_depend->deptype != DEPENDENCY_EXTENSION
+			|| pg_depend->classid != RelationRelationId)
+			continue;
+
+		if (nextmembers >= maxObjs)
+		{
+			maxObjs *= 2;
+			extmembers = (CatalogId *) repalloc(extmembers, maxObjs * sizeof(CatalogId));
+		}
+		extmembers[nextmembers].tableoid = pg_depend->classid;
+		extmembers[nextmembers].oid = pg_depend->objid;
+		nextmembers++;
+	}
+
+	systable_endscan(depScan);
+	relation_close(depRel, AccessShareLock);
+
+	/* Remember the data for use later */
+	setExtensionMembership(extmembers, nextmembers);
+}
+
+/*
+ * IsCatalogIdExtensionMember
+ *	  return whether the specified catalog ID depends on some extension.
+ */
+static bool
+IsCatalogIdExtensionMember(CatalogId catalogId)
+{
+	CatalogId  *low;
+	CatalogId  *high;
+
+	/*
+	 * We could use bsearch() here, but the notational cruft of calling
+	 * bsearch is nearly as bad as doing it ourselves; and the generalized
+	 * bsearch function is noticeably slower as well.
+	 */
+	if (numextmembers <= 0)
+		return false;
+
+	low = extmembers;
+	high = extmembers + (numextmembers - 1);
+	while (low <= high)
+	{
+		CatalogId  *middle;
+		int			difference;
+
+		middle = low + (high - low) / 2;
+		/* comparison must match CatalogIdCompare, above */
+		difference = oidcmp(middle->oid, catalogId.oid);
+		if (difference == 0)
+			difference = oidcmp(middle->tableoid, catalogId.tableoid);
+		if (difference == 0)
+			return true;
+		else if (difference < 0)
+			low = middle + 1;
+		else
+			high = middle - 1;
+	}
+	return false;
+}
+
 /*
  * Check if relation can be in given publication and throws appropriate
  * error if not.
@@ -416,6 +557,8 @@ GetAllTablesPublicationRelations(void) HeapTuple tuple; List *result = NIL; + getExtensionMembership(); + classRel = heap_open(RelationRelationId, AccessShareLock); ScanKeyInit(&key[0], @@ -427,16 +570,24 @@ GetAllTablesPublicationRelations(void) while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL) { + CatalogId pub_rel; Oid relid = HeapTupleGetOid(tuple); Form_pg_class relForm = (Form_pg_class) GETSTRUCT(tuple); - if (is_publishable_class(relid, relForm)) + pub_rel.tableoid = RelationRelationId; + pub_rel.oid = relid; + + if (is_publishable_class(relid, relForm) + && !IsCatalogIdExtensionMember(pub_rel)) result = lappend_oid(result, relid); } heap_endscan(scan); heap_close(classRel, AccessShareLock); + if (extmembers) + pfree(extmembers); + return result; } From 4b94902b46804245eeb5fdf85c7d03d0ff9719a7 Mon Sep 17 00:00:00 2001 From: andrelin Date: Tue, 29 Mar 2022 14:08:57 +0800 Subject: [PATCH 567/578] Code sync from pg11: 372102b81dd0096764b712deffab00732f3c9d80 background: The MLS related code uses the "expand_tuple" function of PG11, and guesses it is to deal with encrypted columns. The EPQ context distribution also uses this logic to call "expand_tuple", but this function has been fixed in the PG follow-up: The t_self of the new tuple is also set. Without this fix, the t_self of the expanded tuple will be an illegal value, affecting subsequent use --- src/backend/access/common/heaptuple.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/access/common/heaptuple.c b/src/backend/access/common/heaptuple.c index d858d75b..a8e14398 100644 --- a/src/backend/access/common/heaptuple.c +++ b/src/backend/access/common/heaptuple.c @@ -2236,7 +2236,7 @@ expand_tuple(HeapTuple *targetHeapTuple, = (HeapTupleHeader) ((char *) *targetHeapTuple + HEAPTUPLESIZE); (*targetHeapTuple)->t_len = len; (*targetHeapTuple)->t_tableOid = sourceTuple->t_tableOid; - ItemPointerSetInvalid(&((*targetHeapTuple)->t_self)); + (*targetHeapTuple)->t_self = sourceTuple->t_self; targetTHeader->t_infomask = sourceTHeader->t_infomask; targetTHeader->t_hoff = hoff; From 93c2a2a782381a7f802aa075b57421ef71aabab2 Mon Sep 17 00:00:00 2001 From: andrelin Date: Thu, 31 Mar 2022 16:05:39 +0800 Subject: [PATCH 568/578] Should check oldrel's distribution type when ALTER since newrel would be a local temp table with no distribution --- src/backend/commands/tablecmds.c | 2 +- src/test/regress/expected/alter_table.out | 25 +++++++++++++++++++++++ src/test/regress/sql/alter_table.sql | 11 ++++++++++ 3 files changed, 37 insertions(+), 1 deletion(-) diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index 4a44ba49..6aaa9c30 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -6409,7 +6409,7 @@ ATRewriteTable(AlteredTableInfo *tab, Oid OIDNewHeap, LOCKMODE lockmode) * since the per-tuple memory context will be reset shortly. 
*/ #ifdef _SHARDING_ - if(newrel && RelationIsSharded(newrel)) + if (RelationIsSharded(oldrel)) tuple = heap_form_tuple_plain(newTupDesc, values,isnull, diskey, secdiskey, RelationGetRelid(newrel)); else #endif diff --git a/src/test/regress/expected/alter_table.out b/src/test/regress/expected/alter_table.out index a1ef7dc2..4daa2206 100644 --- a/src/test/regress/expected/alter_table.out +++ b/src/test/regress/expected/alter_table.out @@ -3831,3 +3831,28 @@ update dropped_col_remote_dml set b = 2; NOTICE: this is a test drop table dropped_col_remote_dml cascade; drop function dropped_col_remote_dml_func; +-- add column with default values and check shardid +create table t_default_shardid(a int, b int) distribute by shard(a); +NOTICE: Replica identity is needed for shard table, please add to this table through "alter table" command. +insert into t_default_shardid values(1,1), (2,2), (3,3); +select shardid, a, b from t_default_shardid order by 1; + shardid | a | b +---------+---+--- + 105 | 3 | 3 + 2234 | 1 | 1 + 3318 | 2 | 2 +(3 rows) + +create sequence s_default_shardid; +alter table t_default_shardid add column c int default nextval('s_default_shardid'); +-- shardid should not change +select shardid, a, b from t_default_shardid order by 1; + shardid | a | b +---------+---+--- + 105 | 3 | 3 + 2234 | 1 | 1 + 3318 | 2 | 2 +(3 rows) + +drop table t_default_shardid; +drop sequence s_default_shardid; diff --git a/src/test/regress/sql/alter_table.sql b/src/test/regress/sql/alter_table.sql index 42a9bbfe..48dee67c 100644 --- a/src/test/regress/sql/alter_table.sql +++ b/src/test/regress/sql/alter_table.sql @@ -2563,3 +2563,14 @@ alter table dropped_col_remote_dml drop column c; update dropped_col_remote_dml set b = 2; drop table dropped_col_remote_dml cascade; drop function dropped_col_remote_dml_func; + +-- add column with default values and check shardid +create table t_default_shardid(a int, b int) distribute by shard(a); +insert into t_default_shardid values(1,1), (2,2), (3,3); +select shardid, a, b from t_default_shardid order by 1; +create sequence s_default_shardid; +alter table t_default_shardid add column c int default nextval('s_default_shardid'); +-- shardid should not change +select shardid, a, b from t_default_shardid order by 1; +drop table t_default_shardid; +drop sequence s_default_shardid; From b86a845b85a476736ed9eb747dd9851ae1325445 Mon Sep 17 00:00:00 2001 From: andrelin Date: Thu, 31 Mar 2022 16:06:43 +0800 Subject: [PATCH 569/578] Consider restricted node number in cost module --- src/test/regress/expected/alter_table_3.out | 25 +++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/src/test/regress/expected/alter_table_3.out b/src/test/regress/expected/alter_table_3.out index d6a33ebf..0153fa37 100644 --- a/src/test/regress/expected/alter_table_3.out +++ b/src/test/regress/expected/alter_table_3.out @@ -3703,3 +3703,28 @@ update dropped_col_remote_dml set b = 2; NOTICE: this is a test drop table dropped_col_remote_dml cascade; drop function dropped_col_remote_dml_func; +-- add column with default values and check shardid +create table t_default_shardid(a int, b int) distribute by shard(a); +NOTICE: Replica identity is needed for shard table, please add to this table through "alter table" command. 
+insert into t_default_shardid values(1,1), (2,2), (3,3);
+select shardid, a, b from t_default_shardid order by 1;
+ shardid | a | b 
+---------+---+---
+     105 | 3 | 3
+    2234 | 1 | 1
+    3318 | 2 | 2
+(3 rows)
+
+create sequence s_default_shardid;
+alter table t_default_shardid add column c int default nextval('s_default_shardid');
+-- shardid should not change
+select shardid, a, b from t_default_shardid order by 1;
+ shardid | a | b 
+---------+---+---
+     105 | 3 | 3
+    2234 | 1 | 1
+    3318 | 2 | 2
+(3 rows)
+
+drop table t_default_shardid;
+drop sequence s_default_shardid;

From 6d6ba4885d2dacefa1d18d07e1ddc132caa41751 Mon Sep 17 00:00:00 2001
From: aslanxli
Date: Wed, 11 May 2022 14:36:59 +0800
Subject: [PATCH 570/578] FIX
 http://tapd.woa.com/pgxz/bugtrace/bugs/view?bug_id=1010092131099359801&jump_count=1

To sync and update local statistics, the coordinator queries the
pg_statistic tables on datanodes and other coordinators, but these are
not selectable by PUBLIC. Previously we only disabled the SELECT
permission check when a query referring to pg_statistic was parsed on a
datanode; now we also disable the SELECT permission check when the
connection comes from another coordinator.
---
 src/backend/parser/parse_relation.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/backend/parser/parse_relation.c b/src/backend/parser/parse_relation.c
index 10b20a9e..373c6c85 100644
--- a/src/backend/parser/parse_relation.c
+++ b/src/backend/parser/parse_relation.c
@@ -949,7 +949,7 @@ markRTEForSelectPriv(ParseState *pstate, RangeTblEntry *rte,
 	 * have arbitrary query parsed on datanode is EXECUTE DIRECT, it is only
 	 * available for superuser.
 	 */
-	if (IS_PGXC_DATANODE && rte->relid == StatisticRelationId)
+	if ((IS_PGXC_DATANODE || IsConnFromCoord()) && rte->relid == StatisticRelationId)
 		rte->requiredPerms = 0;
 	else
 #endif

From e4137ffab2954f65bddeac70bdedfeac3d3db9d0 Mon Sep 17 00:00:00 2001
From: sigmalin
Date: Wed, 11 May 2022 19:41:36 +0800
Subject: [PATCH 571/578] fix gtm_ctl exit bug
 http://tapd.woa.com/pgxz/bugtrace/bugs/view/1010092131098965677 (merge
 request !1306)

Squash merge branch 'sigmalin005' into 'Tbase_v2.15.19.8'

fix gtm_ctl exit bug  http://tapd.woa.com/pgxz/bugtrace/bugs/view/1010092131098965677

TAPD: --bug=098965677
---
 src/gtm/gtm_ctl/gtm_ctl.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/gtm/gtm_ctl/gtm_ctl.c b/src/gtm/gtm_ctl/gtm_ctl.c
index fa26bed5..8aea4576 100644
--- a/src/gtm/gtm_ctl/gtm_ctl.c
+++ b/src/gtm/gtm_ctl/gtm_ctl.c
@@ -488,6 +488,9 @@ test_gtm_connection()
 			GTMPQfinish(conn);
 			print_msg(".");
 			sleep(1); /* 1 sec */
+			/* if the GTM process is not alive, exit directly */
+			if (!gtm_is_alive(get_pgpid()))
+				break;
 		}
 	}

From 7c5577548ae53022a4a852f39c5e65ca2006589a Mon Sep 17 00:00:00 2001
From: sigmalin
Date: Wed, 11 May 2022 16:33:34 +0800
Subject: [PATCH 572/578] fix bug when CreateSenderThread fails
 https://zhiyan.woa.com/requirement/1162/bug/4982#/bug?story_tab=info&wsn=164&wtype=bug

git cherry-pick 8a3aff89
---
 src/backend/pgxc/squeue/squeue.c | 10 +++++-----
 src/include/pgxc/squeue.h        |  1 +
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/src/backend/pgxc/squeue/squeue.c b/src/backend/pgxc/squeue/squeue.c
index 61289161..4d681ac9 100644
--- a/src/backend/pgxc/squeue/squeue.c
+++ b/src/backend/pgxc/squeue/squeue.c
@@ -4101,9 +4101,6 @@ DataPumpSender BuildDataPumpSenderControl(SharedQueue sq)
 			end = sender_control->node_num;
 		}
 		InitDataPumpThreadControl(&sender_control->thread_control[i], sender_control->nodes, base, end, sender_control->node_num);
-
-		/* Set
running status for the thread. not running now */ - sender_control->thread_control[i].thread_running = false; } /* set sqname and max connection */ @@ -4832,6 +4829,7 @@ void DataPumpCleanThread(DataPumpSenderControl *sender) { int32 threadid = 0; DataPumpThreadControl *thread = NULL; + bool *send_quit = (bool *)palloc0(sizeof(bool) * sender->thread_num); for (threadid = 0; threadid < sender->thread_num; threadid ++) { @@ -4841,18 +4839,20 @@ void DataPumpCleanThread(DataPumpSenderControl *sender) { thread->thread_need_quit = true; ThreadSemaUp(&thread->send_sem); + send_quit[threadid] = true; } } for (threadid = 0; threadid < sender->thread_num; threadid ++) { + if (send_quit[threadid]) + { thread = &sender->thread_control[threadid]; /* Wait for sender to quit. */ - if (thread->thread_need_quit) - { ThreadSemaDown(&thread->quit_sem); } } + pfree(send_quit); ConvertDone(&sender->convert_control); } diff --git a/src/include/pgxc/squeue.h b/src/include/pgxc/squeue.h index 3f0a6408..fc020c9f 100644 --- a/src/include/pgxc/squeue.h +++ b/src/include/pgxc/squeue.h @@ -114,6 +114,7 @@ typedef enum typedef enum { + ConvertInit, ConvertRunning, ConvertListenError, ConvertAcceptError, From be29047e38cfd8cabe921022d9a58865753f2ef4 Mon Sep 17 00:00:00 2001 From: andrelin Date: Wed, 21 Apr 2021 19:32:33 +0800 Subject: [PATCH 573/578] Support join tables from different group on DN * sending shard route map to lower nodes * Add a guc to constrain group where to execute join op tapd: http://tapd.oa.com/TBase_Oracle_Migration/bugtrace/bugs/view/1020421696086892879 --- src/backend/commands/portalcmds.c | 3 + src/backend/optimizer/util/pathnode.c | 77 ++++++++++++------------ src/backend/parser/analyze.c | 31 ++-------- src/backend/pgxc/nodemgr/groupmgr.c | 3 +- src/backend/pgxc/pool/execRemote.c | 34 +++++++++-- src/backend/pgxc/pool/pgxcnode.c | 32 ++++++++-- src/backend/pgxc/shard/shardmap.c | 87 +++++++++++++++++++++++++++ src/backend/postmaster/postmaster.c | 2 + src/backend/tcop/postgres.c | 8 +++ src/backend/utils/misc/guc.c | 46 ++++++++++++++ src/include/optimizer/pathnode.h | 3 + src/include/pgxc/pgxc.h | 3 + src/include/pgxc/pgxcnode.h | 2 +- src/include/pgxc/shardmap.h | 5 ++ 14 files changed, 260 insertions(+), 76 deletions(-) diff --git a/src/backend/commands/portalcmds.c b/src/backend/commands/portalcmds.c index 4bea0943..cdd42fa5 100644 --- a/src/backend/commands/portalcmds.c +++ b/src/backend/commands/portalcmds.c @@ -362,6 +362,9 @@ PortalCleanup(Portal portal) /* If cleanup fails below prevent double cleanup */ portal->queryDesc = NULL; + /* invalidate remote shard map info no matter producer or consumer */ + InvalidRemoteShardmap(); + /* * If portal is producing it has an executor which should be * shut down diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index ed05ea1e..5143ac3f 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -50,6 +50,7 @@ #include "optimizer/pgxcship.h" #include "pgxc/groupmgr.h" #include "pgxc/pgxcnode.h" +#include "utils/memutils.h" #endif #ifdef _MIGRATE_ @@ -68,6 +69,11 @@ bool restrict_query = false; /* Support fast query shipping for subquery */ bool enable_subquery_shipping = false; +/* join will happen in these nodes forcibly */ +char *g_constrain_group; /* the GUC variable */ +static Bitmapset *constrainNodes = NULL; +#define BMS_EQUAL_CONSTRAINT(bms) (bms_is_empty(constrainNodes) || bms_equal(constrainNodes, (bms))) + #define REPLICATION_FACTOR 0.8 #endif @@ -1678,28 
+1684,6 @@ set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) goto pull_up; } - /* - * If outer or inner subpaths are distributed by shard and they do not exist - * in same node set, which means we may need to redistribute tuples to data - * nodes which use different router map to producer. - * We don't support that, so pull it up to CN to accomplish the join. - * - * TODO: - * 1. if the join is "REPLICATION join SHARD", and node set of SHARD table - * is subset of REPLICATION table, no need to pull up. - * 2. find out which side of this join needs to dispatch, and only decide - * whether to pull up by the distributionType of another side subpath. - * 3. pass target router map to another group maybe ? thus nothing need to - * pull up to CN. - */ - if (innerd && outerd && - (outerd->distributionType == LOCATOR_TYPE_SHARD || - (innerd->distributionType == LOCATOR_TYPE_SHARD)) && - !bms_equal(outerd->nodes, innerd->nodes)) - { - goto pull_up; - } - /* * the join of cold-hot tables must be pulled up to CN until we find a way * to determine whether this join occurs in a specific group. @@ -1818,7 +1802,8 @@ set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) innerd->distributionType == outerd->distributionType && innerd->distributionExpr && outerd->distributionExpr && - bms_equal(innerd->nodes, outerd->nodes)) + bms_equal(innerd->nodes, outerd->nodes) && + BMS_EQUAL_CONSTRAINT(innerd->nodes)) { ListCell *lc; @@ -2245,7 +2230,7 @@ set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) */ cost_qual_eval_node(&cost, (Node *) ri, root); - if (outerd->distributionExpr) + if (outerd->distributionExpr && BMS_EQUAL_CONSTRAINT(outerd->nodes)) { #ifdef __TBASE__ /* @@ -2294,7 +2279,7 @@ set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) continue; } } - if (innerd->distributionExpr) + if (innerd->distributionExpr && BMS_EQUAL_CONSTRAINT(innerd->nodes)) { #ifdef __TBASE__ /* For UPDATE/DELETE, make sure inner rel does not need to distribute */ @@ -2453,26 +2438,14 @@ set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) /* If we redistribute both parts do join on all nodes ... */ if (new_inner_key && new_outer_key) { + if (bms_is_empty(constrainNodes)) + { int i; for (i = 0; i < NumDataNodes; i++) nodes = bms_add_member(nodes, i); #ifdef __TBASE__ - /* - * We end up here that we don't have replication table and whether - * 1. we have no shard table at both sides OR - * 2. we have shard table but spread in same node set - * so check distribution type and decide what's next. - */ - if (innerd->distributionType == LOCATOR_TYPE_SHARD || - outerd->distributionType == LOCATOR_TYPE_SHARD) - { - /* must be same node set, just copy */ - Assert(bms_equal(innerd->nodes, innerd->nodes)); - nodes = bms_copy(outerd->nodes); - } - /* check if we can distribute by shard */ - else if (OidIsValid(group)) + if (OidIsValid(group)) { int node_index; int32 dn_num; @@ -2527,6 +2500,13 @@ set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode) } #endif } + else + { + nodes = bms_copy(constrainNodes); + replicate_inner = false; + replicate_outer = false; + } + } /* * ... if we do only one of them redistribute it on the same nodes * as other. 
@@ -7427,4 +7407,21 @@ path_count_datanodes(Path *path) return 1; } + +void +assign_constrain_nodes(List *node_list) +{ + MemoryContext oldctx = MemoryContextSwitchTo(TopMemoryContext); + ListCell *lc; + + bms_free(constrainNodes); + constrainNodes = NULL; + + foreach(lc, node_list) + { + constrainNodes = bms_add_member(constrainNodes, lfirst_int(lc)); + } + + MemoryContextSwitchTo(oldctx); +} #endif diff --git a/src/backend/parser/analyze.c b/src/backend/parser/analyze.c index a720c1fc..9c460dd4 100644 --- a/src/backend/parser/analyze.c +++ b/src/backend/parser/analyze.c @@ -753,36 +753,17 @@ transformInsertStmt(ParseState *pstate, InsertStmt *stmt) ParseState *sub_pstate = make_parsestate(pstate); Query *selectQuery; -#ifdef __TBASE__ +#ifdef __COLD_HOT__ /* prevent insert into cold_hot table select ... */ if (pstate->p_target_relation) { - RelationLocInfo *target_rel_loc_info = pstate->p_target_relation->rd_locator_info; - RelationLocInfo *from_rel_loc_info; - - if (target_rel_loc_info && target_rel_loc_info->locatorType == LOCATOR_TYPE_SHARD) + RelationLocInfo *rel_loc_info = pstate->p_target_relation->rd_locator_info; + if (rel_loc_info) { - foreach(lc, selectStmt->fromClause) + if (AttributeNumberIsValid(rel_loc_info->secAttrNum) + || OidIsValid(rel_loc_info->coldGroupId)) { - Node *node = lfirst(lc); - if (IsA(node, RangeVar)) - { - Oid relid = RangeVarGetRelid((RangeVar *) node, NoLock, true); - - if (InvalidOid != relid) - { - Relation rel = heap_open(relid, AccessShareLock); - - from_rel_loc_info = rel->rd_locator_info; - if (!is_table_allowed_insert(from_rel_loc_info, target_rel_loc_info)) - { - elog(ERROR, - "shard table could not be inserted from any other tables in different group"); - } - - heap_close(rel, AccessShareLock); - } - } + elog(ERROR, "table in cold-hot group or key-value group could not join with other tables."); } } } diff --git a/src/backend/pgxc/nodemgr/groupmgr.c b/src/backend/pgxc/nodemgr/groupmgr.c index be034bed..e5de7d81 100644 --- a/src/backend/pgxc/nodemgr/groupmgr.c +++ b/src/backend/pgxc/nodemgr/groupmgr.c @@ -678,7 +678,8 @@ GetMyGroupName(void) return NULL; } -char* GetGroupNameByNode(Oid nodeoid) +char * +GetGroupNameByNode(Oid nodeoid) { Relation relation; SysScanDesc scan; diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index c37ac46e..4986c4a7 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -60,12 +60,13 @@ #include "catalog/pgxc_class.h" #ifdef __TBASE__ #include "commands/explain_dist.h" -#include "pgxc/squeue.h" #include "executor/execParallel.h" -#include "postmaster/postmaster.h" #include "executor/nodeModifyTable.h" -#include "utils/syscache.h" #include "nodes/print.h" +#include "optimizer/pathnode.h" +#include "pgxc/squeue.h" +#include "postmaster/postmaster.h" +#include "utils/syscache.h" #endif /* * We do not want it too long, when query is terminating abnormally we just @@ -10941,6 +10942,9 @@ ExecRemoteSubplan(PlanState *pstate) OidIsValid(primary_data_node) && combiner->conn_count > 1 && !g_UseDataPump); char cursor[NAMEDATALEN]; +#ifdef __TBASE__ + StringInfo shardmap = NULL; +#endif if (plan->cursor) { @@ -11000,6 +11004,26 @@ ExecRemoteSubplan(PlanState *pstate) if (estate->es_epqTuple != NULL) epqctxlen = encode_epqcontext(&combiner->ss.ps, &epqctxdata); +#ifdef __TBASE__ + /* + * consider whether to distribute shard map info + * we do that when: + * 1. this is a DN node + * 2. plan distribution is by shard + * 3. 
target of distribution is not in our group + */ + if (IS_PGXC_DATANODE && node->execNodes != NIL && + plan->distributionType == LOCATOR_TYPE_SHARD) + { + ListCell *cell; + + foreach(cell, node->execNodes) + { + if (!list_member_int(PGXCGroupNodeList, lfirst_int(cell))) + shardmap = SerializeShardmap(); + } + } +#endif /* * The subplan being rescanned, need to restore connections and * re-bind the portal @@ -11039,7 +11063,7 @@ ExecRemoteSubplan(PlanState *pstate) /* rebind */ pgxc_node_send_bind(conn, combiner->cursor, combiner->cursor, - paramlen, paramdata, epqctxlen, epqctxdata); + paramlen, paramdata, epqctxlen, epqctxdata, shardmap); if (enable_statistic) { elog(LOG, "Bind Message:pid:%d,remote_pid:%d,remote_ip:%s,remote_port:%d,fd:%d,cursor:%s", @@ -11128,7 +11152,7 @@ ExecRemoteSubplan(PlanState *pstate) /* bind */ pgxc_node_send_bind(conn, cursor, cursor, paramlen, paramdata, - epqctxlen, epqctxdata); + epqctxlen, epqctxdata, shardmap); if (enable_statistic) { diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c index c19325a9..d755e5be 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -70,6 +70,7 @@ #include "catalog/pg_authid.h" #endif #ifdef __TBASE__ +#include "pgxc/groupmgr.h" #include "postmaster/postmaster.h" #endif @@ -398,6 +399,16 @@ InitMultinodeExecutor(bool is_force) slavedatanode_count = 0; PGXCNodeId = 0; + if (IS_PGXC_DATANODE) + { + if (PGXCGroupNodeList != NIL) + { + list_free(PGXCGroupNodeList); + PGXCGroupNodeList = NIL; + } + PGXCGroupNodeList = GetGroupNodeList(GetMyGroupOid()); + } + MemoryContextSwitchTo(oldcontext); PGXCSessionId[0] = '\0'; @@ -2231,7 +2242,7 @@ pgxc_node_send_plan(PGXCNodeHandle * handle, const char *statement, int pgxc_node_send_bind(PGXCNodeHandle * handle, const char *portal, const char *statement, int paramlen, const char *params, - int epqctxlen, const char *epqctx) + int epqctxlen, const char *epqctx, StringInfo shardmap) { int pnameLen; int stmtLen; @@ -2240,6 +2251,7 @@ pgxc_node_send_bind(PGXCNodeHandle * handle, const char *portal, int paramOutLen; int epqCtxLen; int msgLen; + int shardMapLen; /* Invalid connection state, return error */ if (handle->state != DN_CONNECTION_STATE_IDLE) @@ -2257,8 +2269,11 @@ pgxc_node_send_bind(PGXCNodeHandle * handle, const char *portal, paramOutLen = 2; /* size of epq context, 2 if not epq */ epqCtxLen = epqctxlen ? epqctxlen : 2; - /* size + pnameLen + stmtLen + parameters */ - msgLen = 4 + pnameLen + stmtLen + paramCodeLen + paramValueLen + paramOutLen + epqCtxLen; + /* size of shard map information */ + shardMapLen = shardmap ? 
shardmap->len + 1 : 1; + /* size + pnameLen + stmtLen + parameters + epqctx + shardmap */ + msgLen = 4 + pnameLen + stmtLen + paramCodeLen + paramValueLen + + paramOutLen + epqCtxLen + shardMapLen; /* msgType + msgLen */ if (ensure_out_buffer_capacity(handle->outEnd + 1 + msgLen, handle) != 0) @@ -2317,6 +2332,15 @@ pgxc_node_send_bind(PGXCNodeHandle * handle, const char *portal, handle->outBuffer[handle->outEnd++] = 0; } + /* shard map info */ + if (shardmap && shardMapLen > 1) + { + memcpy(handle->outBuffer + handle->outEnd, shardmap->data, shardMapLen); + handle->outEnd += shardMapLen; + } + else + handle->outBuffer[handle->outEnd++] = '\0'; + handle->in_extended_query = true; return 0; } @@ -2609,7 +2633,7 @@ pgxc_node_send_query_extended(PGXCNodeHandle *handle, const char *query, if (query) if (pgxc_node_send_parse(handle, statement, query, num_params, param_types)) return EOF; - if (pgxc_node_send_bind(handle, portal, statement, paramlen, params, 0, NULL)) + if (pgxc_node_send_bind(handle, portal, statement, paramlen, params, 0, NULL, NULL)) return EOF; if (send_describe) if (pgxc_node_send_describe(handle, false, portal)) diff --git a/src/backend/pgxc/shard/shardmap.c b/src/backend/pgxc/shard/shardmap.c index 6583be1c..29f9a946 100644 --- a/src/backend/pgxc/shard/shardmap.c +++ b/src/backend/pgxc/shard/shardmap.c @@ -173,6 +173,10 @@ static HTAB *g_GroupHashTab = NULL; /*For DN*/ static ShardNodeGroupInfo_DN *g_GroupShardingMgr_DN = NULL; +/* For local DN received from parent node */ +static bool g_ShardMapValid = false; +static ShardMapItemDef g_ShardMap[SHARD_MAP_GROUP_NUM]; + /* used for datanodes */ Bitmapset *g_DatanodeShardgroupBitmap = NULL; @@ -1315,6 +1319,9 @@ int32 GetNodeIndexByHashValue(Oid group, long hashvalue) } shardIdx = abs(hashvalue) % (g_GroupShardingMgr_DN->members->shmemNumShards); + if (g_ShardMapValid) + shardgroup = &g_ShardMap[shardIdx]; + else shardgroup = &g_GroupShardingMgr_DN->members->shmemshardmap[shardIdx]; nodeIdx = shardgroup->nodeindex; } @@ -5718,4 +5725,84 @@ List* GetShardMapRangeList(Oid group, Oid coldgroup, Oid relation, Oid type, Dat return list; } +/* serialize shard map info for dispatching to lower DNs */ +StringInfo +SerializeShardmap(void) +{ + GroupShardInfo *info; + StringInfo data; + int i; + + if (!IS_PGXC_DATANODE) + elog(ERROR, "shouldn't try to serialize group shard info on CN"); + + info = g_GroupShardingMgr_DN->members; + data = makeStringInfo(); + + appendStringInfo(data, "%d", info->shmemNumShards); + for (i = 0; i < info->shmemNumShards; i++) + { + appendStringInfo(data, ",%d", + info->shmemshardmap[i].nodeindex); + } + + return data; +} + +/* + * Deserialize shard map info into g_ShardMap, these information + * comes from parent DN and will replace local info for distribution + * across multi groups. + */ +void +DeserializeShardmap(const char *data) +{ + char *tmp_head = (char *) data; + char *tmp_pos; + int num_shards, i; + + num_shards = (int) strtod(tmp_head, &tmp_pos); + tmp_head = tmp_pos + 1; + + if (num_shards != SHARD_MAP_SHARD_NUM) + { + /* + * for now num_shards should always be SHARD_MAP_GROUP_NUM + * since SHARD_MAP_SHARD_NUM == EXTENSION_SHARD_MAP_SHARD_NUM + * but maybe it will change someday, error out to avoid more + * critical error. 
+ */ + elog(ERROR, "deserializing invalid num of shard map, %d", num_shards); + } + + for (i = 0; i < num_shards; i++) + { + g_ShardMap[i].shardgroupid = i; + g_ShardMap[i].nodeindex = (int) strtod(tmp_head, &tmp_pos); + tmp_head = tmp_pos + 1; + } + + /* enable remote shard map info */ + g_ShardMapValid = true; +} + +/* g_ShardMap is a static array, simply disable it by another static bool */ +void +InvalidRemoteShardmap(void) +{ + g_ShardMapValid = false; +} + +/* + * return group oid of this node in + * return invalid if it's not in a group or it's a CN. + */ +Oid +GetMyGroupOid(void) +{ + if (IS_PGXC_DATANODE) + return g_GroupShardingMgr_DN->members->group; + else + return InvalidOid; +} #endif diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index 10be77cd..74c941db 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -470,6 +470,8 @@ bool IsPGXCMainCluster = false; int PGXCNodeId = 0; #ifdef __TBASE__ char PGXCSessionId[NAMEDATALEN]; +int PGXCLevelId = -1; +List *PGXCGroupNodeList = NIL; #endif /* * When a particular node starts up, store the node identifier in this variable diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index 77992099..e6afe664 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -2225,6 +2225,7 @@ exec_bind_message(StringInfo input_message) int column_index; int index; char ***data_list = NULL; + const char *shard_map; MemoryContext old_top; #endif @@ -2756,6 +2757,8 @@ exec_bind_message(StringInfo input_message) rformats[i] = pq_getmsgint(input_message, 2); } + InvalidRemoteShardmap(); + /* Get epq context, only datanodes need them */ if (IsConnFromCoord() || IsConnFromDatanode()) { @@ -2779,6 +2782,11 @@ exec_bind_message(StringInfo input_message) portal->epqContext->nodeid[i] = pq_getmsgint(input_message, 4); } } + + /* Get shard map info */ + shard_map = pq_getmsgstring(input_message); + if (shard_map[0] != '\0') + DeserializeShardmap(shard_map); } pq_getmsgend(input_message); diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index dbccb8f6..16fd75c7 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -125,6 +125,8 @@ #include "tcop/pquery.h" #include "optimizer/plancat.h" #include "parser/analyze.h" +#include "pgxc/groupmgr.h" +#include "utils/lsyscache.h" #endif #ifdef __AUDIT__ @@ -294,6 +296,9 @@ static void strreplace_all(char *str, char *needle, char *replacement); #ifdef __TBASE__ static bool set_warm_shared_buffer(bool *newval, void **extra, GucSource source); static const char *show_total_memorysize(void); + +static bool check_constrain_group(char **newval, void **extra, GucSource source); +static void assign_constrain_group(const char *newval, void *extra); #endif #ifdef __COLD_HOT__ static void assign_cold_hot_partition_type(const char *newval, void *extra); @@ -5871,6 +5876,16 @@ static struct config_string ConfigureNamesString[] = "mls_admin", NULL, NULL, NULL }, + { + {"join_constrain_group", PGC_USERSET, CUSTOM_OPTIONS, + gettext_noop("name of the group that join execute in, " + "any data that not in this group will be redistributed"), + NULL + }, + &g_constrain_group, + "", + check_constrain_group, assign_constrain_group, NULL + }, #endif #ifdef _PG_ORCL_ { @@ -13737,6 +13752,37 @@ show_total_memorysize(void) snprintf(buf, sizeof(buf), "%dM", size); return buf; } + +static bool +check_constrain_group(char **newval, void **extra, GucSource source) +{ + char *group_name = 
NULL; + if (!IsUnderPostmaster) + return true; + + if ((*newval)[0] == '\0') + return true; + + group_name = pstrdup(*newval); + return get_pgxc_groupoid(group_name) != InvalidOid; +} + +static void +assign_constrain_group(const char *newval, void *extra) +{ + char *group_name = NULL; + if (!IsUnderPostmaster) + return; + + if (newval[0] == '\0') + { + assign_constrain_nodes(NIL); + return; + } + + group_name = pstrdup(newval); + assign_constrain_nodes(GetGroupNodeList(get_pgxc_groupoid(group_name))); +} #endif #ifdef __COLD_HOT__ static void diff --git a/src/include/optimizer/pathnode.h b/src/include/optimizer/pathnode.h index e1fe0a4f..59dddbe6 100644 --- a/src/include/optimizer/pathnode.h +++ b/src/include/optimizer/pathnode.h @@ -372,10 +372,13 @@ extern Path *create_redistribute_distinct_agg_path(PlannerInfo *root, Aggref *agg); extern void contains_remotesubplan(Path *path, int *number, bool *redistribute); +extern void assign_constrain_nodes(List *node_list); + extern int replication_level; extern bool restrict_query; extern bool enable_subquery_shipping; +extern char *g_constrain_group; #endif #endif /* PATHNODE_H */ diff --git a/src/include/pgxc/pgxc.h b/src/include/pgxc/pgxc.h index 687be6c8..264bfbed 100644 --- a/src/include/pgxc/pgxc.h +++ b/src/include/pgxc/pgxc.h @@ -76,6 +76,7 @@ #define PGXC_H #include "postgres.h" +#include "nodes/pg_list.h" extern bool isPGXCCoordinator; extern bool isPGXCDataNode; @@ -107,6 +108,8 @@ extern char *PGXCMainClusterName; extern char *PGXCDefaultClusterName; #ifdef __TBASE__ extern char PGXCSessionId[NAMEDATALEN]; +extern int PGXCLevelId; +extern List *PGXCGroupNodeList; #endif diff --git a/src/include/pgxc/pgxcnode.h b/src/include/pgxc/pgxcnode.h index f0e7c269..13757e15 100644 --- a/src/include/pgxc/pgxcnode.h +++ b/src/include/pgxc/pgxcnode.h @@ -222,7 +222,7 @@ extern int pgxc_node_send_disconnect(PGXCNodeHandle * handle, char *cursor, int #endif extern int pgxc_node_send_bind(PGXCNodeHandle * handle, const char *portal, const char *statement, int paramlen, const char *params, - int eqpctxlen, const char *epqctx); + int eqpctxlen, const char *epqctx, StringInfo shardmap); extern int pgxc_node_send_parse(PGXCNodeHandle * handle, const char* statement, const char *query, short num_params, Oid *param_types); extern int pgxc_node_send_flush(PGXCNodeHandle * handle); diff --git a/src/include/pgxc/shardmap.h b/src/include/pgxc/shardmap.h index c62e3144..0674185d 100644 --- a/src/include/pgxc/shardmap.h +++ b/src/include/pgxc/shardmap.h @@ -234,6 +234,11 @@ extern bool ScanNeedExecute(Relation rel); extern List* GetShardMapRangeList(Oid group, Oid coldgroup, Oid relation, Oid type, Datum dvalue, AttrNumber secAttr, Oid secType, Datum minValue, Datum maxValue, bool equalMin, bool equalMax, RelationAccessType accessType); + +extern StringInfo SerializeShardmap(void); +extern void DeserializeShardmap(const char *data); +extern void InvalidRemoteShardmap(void); +extern Oid GetMyGroupOid(void); #endif #endif /*_SHARDMAP_H_*/ From 6024988daeb53b9a38c29add75b9947f13ee6a12 Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Thu, 9 Mar 2023 19:35:39 +0800 Subject: [PATCH 574/578] fix compile error --- src/backend/commands/analyze.c | 5 ++--- src/backend/libpq/pqcomm.c | 1 - src/backend/parser/gram.y | 2 +- src/backend/tcop/utility.c | 2 +- src/include/nodes/nodes.h | 2 +- 5 files changed, 5 insertions(+), 7 deletions(-) diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c index 31b7cfbf..49c60295 100644 --- 
a/src/backend/commands/analyze.c
+++ b/src/backend/commands/analyze.c
@@ -655,7 +655,7 @@ do_analyze_rel(Relation onerel,
 	/*
 	 * Fetch relation statistics from remote nodes and update
 	 */
-	vacuum_rel_coordinator(onerel, in_outer_xact, params);
+	vacuum_rel_coordinator(onerel, in_outer_xact, params, NULL);
 
 	/*
 	 * Fetch attribute statistics from remote nodes.
@@ -5825,8 +5825,7 @@ coord_sync_col_stats(Relation onerel,
 		update_attstats(RelationGetRelid(onerel),
 						inh,
 						attr_cnt,
-						vacattrstats,
-						RelationGetRelPersistence(onerel));
+						vacattrstats);
 }
 
 /*
diff --git a/src/backend/libpq/pqcomm.c b/src/backend/libpq/pqcomm.c
index 132b85b2..db3b1ea1 100644
--- a/src/backend/libpq/pqcomm.c
+++ b/src/backend/libpq/pqcomm.c
@@ -2072,7 +2072,6 @@ SetSockKeepAlive(int sock)
 			elog(LOG, "SetSockKeepAlive setsockopt(TCP_USER_TIMEOUT) failed: %m");
 		}
 	}
-}
 
 int
 pq_gettcpusertimeout(Port *port)
diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y
index 756b4bad..401ffcd6 100644
--- a/src/backend/parser/gram.y
+++ b/src/backend/parser/gram.y
@@ -263,7 +263,7 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query);
 	RoleSpec   *rolespec;
 	PartitionForExpr *partfor;
 	PartitionBy *partby;
-	AnalyzeSyncOpt *analyze_sync_opt;
+	StatSyncOpt *analyze_sync_opt;
 }
 
 %type stmt schema_stmt
diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c
index 87be1ef4..c1293401 100644
--- a/src/backend/tcop/utility.c
+++ b/src/backend/tcop/utility.c
@@ -1354,7 +1354,7 @@ ProcessUtilityPost(PlannedStmt *pstmt,
 		if (vstmt->relation != NULL)
 		{
 			Relation rel =
-				relation_openrv_extended(vstmt->relation, NoLock, true, false);
+				relation_openrv_extended(vstmt->relation, NoLock, true);
 			if (rel && rel->rd_rel->relpersistence == RELPERSISTENCE_TEMP)
 			{
 				relation_close(rel, NoLock);
diff --git a/src/include/nodes/nodes.h b/src/include/nodes/nodes.h
index 854f36a4..d4daa350 100644
--- a/src/include/nodes/nodes.h
+++ b/src/include/nodes/nodes.h
@@ -587,7 +587,7 @@ typedef enum NodeTag
 #ifdef _MLS_
 	,T_SyncBufIdInfo /* in bufmgr.c*/
 #endif
-	,T_AnalyzeSyncOpt
+	,T_StatSyncOpt
 } NodeTag;
 
 /*

From 0ff366d6825cc0a40fc29b5f0847dc4246a104a5 Mon Sep 17 00:00:00 2001
From: JennyJennyChen
Date: Thu, 9 Mar 2023 19:51:39 +0800
Subject: [PATCH 575/578] adjust regress expected

---
 src/test/regress/expected/join_3.out          | 75 ++++---
 src/test/regress/expected/limit.out           |  8 +-
 src/test/regress/expected/partition_index.out | 18 +-
 .../regress/expected/partition_join_2.out     | 188 +++++++++---------
 src/test/regress/expected/rules.out           |  3 +
 .../regress/expected/select_parallel_4.out    | 38 ++--
 src/test/regress/expected/subselect.out       |  2 +-
 src/test/regress/expected/transactions_2.out  |  6 +-
 8 files changed, 169 insertions(+), 169 deletions(-)

diff --git a/src/test/regress/expected/join_3.out b/src/test/regress/expected/join_3.out
index 1225d2ce..94914044 100644
---
@@ -3336,7 +3336,7 @@ select b.unique1 from Join Filter: (b.unique1 = 42) -> Remote Subquery Scan on all Distribute results by H: 42 - -> Index Only Scan using tenk1_thous_tenthous on tenk1 c + -> Seq Scan on tenk1 c -> Hash -> Remote Subquery Scan on all Distribute results by H: unique1 @@ -3441,19 +3441,17 @@ select f1, unique2, case when unique2 is null then f1 else 0 end QUERY PLAN -------------------------------------------------------------------------- Remote Subquery Scan on all - -> Merge Right Join - Merge Cond: (b.unique2 = a.f1) + -> Hash Right Join + Hash Cond: (b.unique2 = a.f1) Filter: (CASE WHEN (b.unique2 IS NULL) THEN a.f1 ELSE 0 END = 0) -> Remote Subquery Scan on all Distribute results by H: unique2 - -> Index Only Scan using tenk1_unique2 on tenk1 b - -> Materialize + -> Seq Scan on tenk1 b + -> Hash -> Remote Subquery Scan on all Distribute results by H: f1 - -> Sort - Sort Key: a.f1 - -> Seq Scan on int4_tbl a -(13 rows) + -> Seq Scan on int4_tbl a +(11 rows) select f1, unique2, case when unique2 is null then f1 else 0 end from int4_tbl a left join tenk1 b on f1 = unique2 @@ -3512,32 +3510,28 @@ left join using (join_key) ) foo3 using (join_key); - QUERY PLAN --------------------------------------------------------------------------------- + QUERY PLAN +----------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) Output: "*VALUES*".column1, i1.f1, 666 -> Hash Right Join Output: "*VALUES*".column1, i1.f1, (666) Hash Cond: (i1.f1 = "*VALUES*".column1) - -> Merge Right Join + -> Hash Right Join Output: i1.f1, 666 - Merge Cond: (i2.unique2 = i1.f1) + Hash Cond: (i2.unique2 = i1.f1) -> Remote Subquery Scan on all (datanode_1,datanode_2) Output: i2.unique2 Distribute results by H: unique2 - Sort Key: i2.unique2 - -> Index Only Scan using tenk1_unique2 on public.tenk1 i2 + -> Seq Scan on public.tenk1 i2 Output: i2.unique2 - -> Materialize + -> Hash Output: i1.f1 -> Remote Subquery Scan on all (datanode_1) Output: i1.f1 Distribute results by H: f1 - -> Sort + -> Seq Scan on public.int4_tbl i1 Output: i1.f1 - Sort Key: i1.f1 - -> Seq Scan on public.int4_tbl i1 - Output: i1.f1 -> Hash Output: "*VALUES*".column1 -> Remote Subquery Scan on all (datanode_1) @@ -3545,7 +3539,7 @@ using (join_key); Distribute results by H: column1 -> Values Scan on "*VALUES*" Output: "*VALUES*".column1 -(31 rows) +(27 rows) select foo1.join_key as foo1_id, foo3.join_key AS foo3_id, bug_field from (values (0),(1)) foo1(join_key) @@ -4702,18 +4696,19 @@ select * from generate_series(100,200) g, explain (num_nodes off, nodes off, costs off) select count(*) from tenk1 a, tenk1 b join lateral (values(a.unique1)) ss(x) on b.unique2 = ss.x; - QUERY PLAN ------------------------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------------ Finalize Aggregate -> Remote Subquery Scan on all -> Partial Aggregate - -> Merge Join - Merge Cond: (b.unique2 = a.unique1) + -> Hash Join + Hash Cond: (b.unique2 = a.unique1) -> Remote Subquery Scan on all Distribute results by H: unique2 - -> Index Only Scan using tenk1_unique2 on tenk1 b - -> Index Only Scan using tenk1_unique1 on tenk1 a -(9 rows) + -> Seq Scan on tenk1 b + -> Hash + -> Seq Scan on tenk1 a +(10 rows) select count(*) from tenk1 a, tenk1 b join lateral (values(a.unique1)) ss(x) on b.unique2 = ss.x; @@ -4726,18 +4721,18 @@ select count(*) from tenk1 a, explain (num_nodes off, nodes off, costs off) select count(*) from tenk1 
a, tenk1 b join lateral (values(a.unique1),(-1)) ss(x) on b.unique2 = ss.x; - QUERY PLAN ------------------------------------------------------------------------- + QUERY PLAN +----------------------------------------------------- Aggregate -> Hash Join Hash Cond: ("*VALUES*".column1 = b.unique2) -> Nested Loop -> Remote Subquery Scan on all - -> Index Only Scan using tenk1_unique1 on tenk1 a + -> Seq Scan on tenk1 a -> Values Scan on "*VALUES*" -> Hash -> Remote Subquery Scan on all - -> Index Only Scan using tenk1_unique2 on tenk1 b + -> Seq Scan on tenk1 b (10 rows) select count(*) from tenk1 a, @@ -6113,8 +6108,8 @@ from onek t1, tenk1 t2 where exists (select 1 from tenk1 t3 where t3.thousand = t1.unique1 and t3.tenthous = t2.hundred) and t1.unique1 < 1; - QUERY PLAN ---------------------------------------------------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) Output: t1.unique1, t2.hundred -> Nested Loop @@ -6129,13 +6124,13 @@ where exists (select 1 from tenk1 t3 Output: t3.thousand, t3.tenthous Group Key: t3.thousand, t3.tenthous -> Remote Subquery Scan on all (datanode_1,datanode_2) - Output: t3.thousand, t3.tenthous + Output: t3.unique1, t3.unique2, t3.two, t3.four, t3.ten, t3.twenty, t3.hundred, t3.thousand, t3.twothousand, t3.fivethous, t3.tenthous, t3.odd, t3.even, t3.stringu1, t3.stringu2, t3.string4 Distribute results by H: thousand -> HashAggregate - Output: t3.thousand, t3.tenthous + Output: t3.unique1, t3.unique2, t3.two, t3.four, t3.ten, t3.twenty, t3.hundred, t3.thousand, t3.twothousand, t3.fivethous, t3.tenthous, t3.odd, t3.even, t3.stringu1, t3.stringu2, t3.string4 Group Key: t3.thousand, t3.tenthous - -> Index Only Scan using tenk1_thous_tenthous on public.tenk1 t3 - Output: t3.thousand, t3.tenthous + -> Seq Scan on public.tenk1 t3 + Output: t3.unique1, t3.unique2, t3.two, t3.four, t3.ten, t3.twenty, t3.hundred, t3.thousand, t3.twothousand, t3.fivethous, t3.tenthous, t3.odd, t3.even, t3.stringu1, t3.stringu2, t3.string4 -> Hash Output: t1.unique1 -> Remote Subquery Scan on all (datanode_1,datanode_2) diff --git a/src/test/regress/expected/limit.out b/src/test/regress/expected/limit.out index 61a3f53e..1da03844 100644 --- a/src/test/regress/expected/limit.out +++ b/src/test/regress/expected/limit.out @@ -503,8 +503,8 @@ order by s2 desc; explain (verbose, costs off) select sum(tenthous) as s1, sum(tenthous) + random()*0 as s2 from tenk1 group by thousand order by thousand limit 3; - QUERY PLAN ------------------------------------------------------------------------------------------------------------------------------------ + QUERY PLAN +------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- Limit Output: (sum(tenthous)), (((sum(tenthous))::double precision + (random() * '0'::double precision))), thousand -> Remote Subquery Scan on all (datanode_1,datanode_2) @@ -526,8 +526,8 @@ select sum(tenthous) as s1, sum(tenthous) + random()*0 as s2 -> Partial HashAggregate Output: thousand, PARTIAL sum(tenthous) Group Key: tenk1.thousand - -> Index Only Scan using tenk1_thous_tenthous on public.tenk1 - Output: thousand, tenthous + -> 
Seq Scan on public.tenk1 + Output: unique1, unique2, two, four, ten, twenty, hundred, thousand, twothousand, fivethous, tenthous, odd, even, stringu1, stringu2, string4 (23 rows) select sum(tenthous) as s1, sum(tenthous) + random()*0 as s2 diff --git a/src/test/regress/expected/partition_index.out b/src/test/regress/expected/partition_index.out index afe95aab..bf476902 100644 --- a/src/test/regress/expected/partition_index.out +++ b/src/test/regress/expected/partition_index.out @@ -83,8 +83,8 @@ select c1,c2 from t_day_1 where c2 < timestamp without time zone '2015-09-04' or 1 | Tue Sep 01 13:11:00 2015 2 | Wed Sep 02 13:11:00 2015 3 | Wed Sep 02 13:11:00 2015 - 7 | Thu Sep 03 13:11:00 2015 4 | Thu Sep 03 13:11:00 2015 + 7 | Thu Sep 03 13:11:00 2015 (5 rows) select c1,c2 from t_day_1 where c2 < timestamp without time zone '2015-09-04' order by c2 desc limit 5; @@ -103,8 +103,8 @@ select shardid,c2 from t_day_1 where c2 < timestamp without time zone '2015-09-0 2234 | Tue Sep 01 13:11:00 2015 3318 | Wed Sep 02 13:11:00 2015 105 | Wed Sep 02 13:11:00 2015 - 1025 | Thu Sep 03 13:11:00 2015 213 | Thu Sep 03 13:11:00 2015 + 1025 | Thu Sep 03 13:11:00 2015 (5 rows) select shardid,c1+c3 from t_day_1 where c2 < timestamp without time zone '2015-09-04' order by c2 limit 5; @@ -113,8 +113,8 @@ select shardid,c1+c3 from t_day_1 where c2 < timestamp without time zone '2015-0 2234 | 2 3318 | 3 105 | 4 - 1025 | 8 213 | 5 + 1025 | 8 (5 rows) select shardid,c1+c3 from t_day_1 where c2 < timestamp without time zone '2015-09-04' order by c3,c2 limit 5; @@ -221,8 +221,8 @@ select c1,c2 from t_day_7 where c2 < timestamp without time zone '2015-09-20' or 1 | Tue Sep 01 13:11:00 2015 2 | Tue Sep 08 13:11:00 2015 3 | Tue Sep 08 13:11:00 2015 - 7 | Tue Sep 15 13:11:00 2015 4 | Tue Sep 15 13:11:00 2015 + 7 | Tue Sep 15 13:11:00 2015 (5 rows) select c1,c2 from t_day_7 where c2 < timestamp without time zone '2015-09-20' order by c2 desc limit 5; @@ -241,8 +241,8 @@ select shardid,c2 from t_day_7 where c2 < timestamp without time zone '2015-09-2 2234 | Tue Sep 01 13:11:00 2015 3318 | Tue Sep 08 13:11:00 2015 105 | Tue Sep 08 13:11:00 2015 - 1025 | Tue Sep 15 13:11:00 2015 213 | Tue Sep 15 13:11:00 2015 + 1025 | Tue Sep 15 13:11:00 2015 (5 rows) select shardid,c1+c3 from t_day_7 where c2 < timestamp without time zone '2015-09-20' order by c2 limit 5; @@ -251,8 +251,8 @@ select shardid,c1+c3 from t_day_7 where c2 < timestamp without time zone '2015-0 2234 | 2 3318 | 3 105 | 4 - 1025 | 8 213 | 5 + 1025 | 8 (5 rows) select shardid,c1+c3 from t_day_7 where c2 < timestamp without time zone '2015-09-20' order by c3,c2 limit 5; @@ -388,10 +388,10 @@ select c1,c2 from t_month_3 where c2 < timestamp without time zone '2016-02-01' select c1,c2 from t_month_3 where c2 < timestamp without time zone '2016-02-01' order by c2 desc limit 5; c1 | c2 ----+-------------------------- - 17 | Fri Jan 01 13:11:00 2016 - 19 | Fri Jan 01 13:11:00 2016 21 | Fri Jan 01 13:11:00 2016 23 | Fri Jan 01 13:11:00 2016 + 17 | Fri Jan 01 13:11:00 2016 + 19 | Fri Jan 01 13:11:00 2016 26 | Fri Jan 01 13:11:00 2016 (5 rows) @@ -421,8 +421,8 @@ select shardid,c1+c3 from t_month_3 where c2 < timestamp without time zone '2016 2234 | 2 3318 | 3 105 | 4 - 213 | 5 1025 | 8 + 213 | 5 (5 rows) select shardid,c1 from t_month_3 where c2 < timestamp without time zone '2016-02-01' and mod(c1,2) = 1 order by c1 desc limit 5; diff --git a/src/test/regress/expected/partition_join_2.out b/src/test/regress/expected/partition_join_2.out index d2435f12..c8622909 100644 --- 
a/src/test/regress/expected/partition_join_2.out +++ b/src/test/regress/expected/partition_join_2.out @@ -148,33 +148,33 @@ SELECT t1.a, t1.c, t2.b, t2.c FROM prt1 t1 RIGHT JOIN prt2 t2 ON t1.a = t2.b WHE -- full outer join, with placeholder vars EXPLAIN (COSTS OFF) SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT 50 phv, * FROM prt1 WHERE prt1.b = 0) t1 FULL JOIN (SELECT 75 phv, * FROM prt2 WHERE prt2.a = 0) t2 ON (t1.a = t2.b) WHERE t1.phv = t1.a OR t2.phv = t2.b ORDER BY t1.a, t2.b; - QUERY PLAN ------------------------------------------------------------------------ + QUERY PLAN +----------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) -> Sort Sort Key: a, b -> Hash Full Join - Hash Cond: (a = b) + Hash Cond: (b = a) Filter: (((50) = a) OR ((75) = b)) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: a + -> Remote Subquery Scan on all (datanode_2) + Distribute results by H: b -> Append - -> Seq Scan on prt1_p1 - Filter: (b = 0) - -> Seq Scan on prt1_p2 - Filter: (b = 0) - -> Seq Scan on prt1_p3 - Filter: (b = 0) + -> Seq Scan on prt2_p1 + Filter: (a = 0) + -> Seq Scan on prt2_p2 + Filter: (a = 0) + -> Seq Scan on prt2_p3 + Filter: (a = 0) -> Hash - -> Remote Subquery Scan on all (datanode_2) - Distribute results by H: b + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a -> Append - -> Seq Scan on prt2_p1 - Filter: (a = 0) - -> Seq Scan on prt2_p2 - Filter: (a = 0) - -> Seq Scan on prt2_p3 - Filter: (a = 0) + -> Seq Scan on prt1_p1 + Filter: (b = 0) + -> Seq Scan on prt1_p2 + Filter: (b = 0) + -> Seq Scan on prt1_p3 + Filter: (b = 0) (25 rows) SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT 50 phv, * FROM prt1 WHERE prt1.b = 0) t1 FULL JOIN (SELECT 75 phv, * FROM prt2 WHERE prt2.a = 0) t2 ON (t1.a = t2.b) WHERE t1.phv = t1.a OR t2.phv = t2.b ORDER BY t1.a, t2.b; @@ -301,27 +301,11 @@ SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1 WHERE a < 450) t1 FULL JO -- Semi-join EXPLAIN (COSTS OFF) SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t2.b FROM prt2 t2 WHERE t2.a = 0) AND t1.b = 0 ORDER BY t1.a; - QUERY PLAN --------------------------------------------------------------------------- + QUERY PLAN +-------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) - -> Merge Join - Merge Cond: (b = t1.a) - -> Remote Subquery Scan on all (datanode_2) - -> Sort - Sort Key: b - -> HashAggregate - Group Key: b - -> Remote Subquery Scan on all (datanode_2) - Distribute results by H: b - -> HashAggregate - Group Key: t2.b - -> Append - -> Seq Scan on prt2_p1 t2 - Filter: (a = 0) - -> Seq Scan on prt2_p2 t2_1 - Filter: (a = 0) - -> Seq Scan on prt2_p3 t2_2 - Filter: (a = 0) + -> Merge Semi Join + Merge Cond: (t1.a = b) -> Sort Sort Key: t1.a -> Append @@ -331,7 +315,17 @@ SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t2.b FROM prt2 t2 WHERE t2.a = 0) Filter: (b = 0) -> Seq Scan on prt1_p3 t1_2 Filter: (b = 0) -(28 rows) + -> Remote Subquery Scan on all (datanode_2) + -> Sort + Sort Key: t2.b + -> Append + -> Seq Scan on prt2_p1 t2 + Filter: (a = 0) + -> Seq Scan on prt2_p2 t2_1 + Filter: (a = 0) + -> Seq Scan on prt2_p3 t2_2 + Filter: (a = 0) +(22 rows) SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t2.b FROM prt2 t2 WHERE t2.a = 0) AND t1.b = 0 ORDER BY t1.a; a | b | c @@ -689,8 +683,8 @@ SELECT t1.a, t1.c, t2.b, t2.c, t3.a + t3.b, t3.c FROM (prt1 t1 LEFT JOIN prt2 t2 -- make sure these go to null as expected EXPLAIN 
(COSTS OFF) SELECT t1.a, t1.phv, t2.b, t2.phv, t3.a + t3.b, t3.phv FROM ((SELECT 50 phv, * FROM prt1 WHERE prt1.b = 0) t1 FULL JOIN (SELECT 75 phv, * FROM prt2 WHERE prt2.a = 0) t2 ON (t1.a = t2.b)) FULL JOIN (SELECT 50 phv, * FROM prt1_e WHERE prt1_e.c = 0) t3 ON (t1.a = (t3.a + t3.b)/2) WHERE t1.a = t1.phv OR t2.b = t2.phv OR (t3.a + t3.b)/2 = t3.phv ORDER BY t1.a, t2.b, t3.a + t3.b; - QUERY PLAN ------------------------------------------------------------------------------------ + QUERY PLAN +----------------------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) -> Sort Sort Key: a, b, ((a + b)) @@ -700,26 +694,26 @@ SELECT t1.a, t1.phv, t2.b, t2.phv, t3.a + t3.b, t3.phv FROM ((SELECT 50 phv, * F -> Remote Subquery Scan on all (datanode_1,datanode_2) Distribute results by H: a -> Hash Full Join - Hash Cond: (a = b) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: a + Hash Cond: (b = a) + -> Remote Subquery Scan on all (datanode_2) + Distribute results by H: b -> Append - -> Seq Scan on prt1_p1 - Filter: (b = 0) - -> Seq Scan on prt1_p2 - Filter: (b = 0) - -> Seq Scan on prt1_p3 - Filter: (b = 0) + -> Seq Scan on prt2_p1 + Filter: (a = 0) + -> Seq Scan on prt2_p2 + Filter: (a = 0) + -> Seq Scan on prt2_p3 + Filter: (a = 0) -> Hash - -> Remote Subquery Scan on all (datanode_2) - Distribute results by H: b + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a -> Append - -> Seq Scan on prt2_p1 - Filter: (a = 0) - -> Seq Scan on prt2_p2 - Filter: (a = 0) - -> Seq Scan on prt2_p3 - Filter: (a = 0) + -> Seq Scan on prt1_p1 + Filter: (b = 0) + -> Seq Scan on prt1_p2 + Filter: (b = 0) + -> Seq Scan on prt1_p3 + Filter: (b = 0) -> Hash -> Remote Subquery Scan on all (datanode_1,datanode_2) Distribute results by H: ((a + b) / 2) @@ -743,8 +737,8 @@ SELECT t1.a, t1.phv, t2.b, t2.phv, t3.a + t3.b, t3.phv FROM ((SELECT 50 phv, * F -- Semi-join EXPLAIN (COSTS OFF) SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t1.b FROM prt2 t1, prt1_e t2 WHERE t1.a = 0 AND t1.b = (t2.a + t2.b)/2) AND t1.b = 0 ORDER BY t1.a; - QUERY PLAN ------------------------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------------------------ Remote Subquery Scan on all (datanode_1,datanode_2) -> Nested Loop Semi Join Join Filter: (t1.a = b) @@ -760,20 +754,20 @@ SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t1.b FROM prt2 t1, prt1_e t2 WHER -> Remote Subquery Scan on all (datanode_1,datanode_2) Distribute results by H: b -> Hash Join - Hash Cond: (((t2.a + t2.b) / 2) = b) - -> Append - -> Seq Scan on prt1_e_p1 t2 - -> Seq Scan on prt1_e_p2 t2_1 - -> Seq Scan on prt1_e_p3 t2_2 + Hash Cond: (b = ((t2.a + t2.b) / 2)) + -> Remote Subquery Scan on all (datanode_2) + -> Append + -> Seq Scan on prt2_p1 t1_3 + Filter: (a = 0) + -> Seq Scan on prt2_p2 t1_4 + Filter: (a = 0) + -> Seq Scan on prt2_p3 t1_5 + Filter: (a = 0) -> Hash - -> Remote Subquery Scan on all (datanode_2) - -> Append - -> Seq Scan on prt2_p1 t1_3 - Filter: (a = 0) - -> Seq Scan on prt2_p2 t1_4 - Filter: (a = 0) - -> Seq Scan on prt2_p3 t1_5 - Filter: (a = 0) + -> Append + -> Seq Scan on prt1_e_p1 t2 + -> Seq Scan on prt1_e_p2 t2_1 + -> Seq Scan on prt1_e_p3 t2_2 (29 rows) SELECT t1.* FROM prt1 t1 WHERE t1.a IN (SELECT t1.b FROM prt2 t1, prt1_e t2 WHERE t1.a = 0 AND t1.b = (t2.a + t2.b)/2) AND t1.b = 0 ORDER BY t1.a; @@ -1429,36 +1423,36 @@ SELECT t1.a, t1.c, 
t2.b, t2.c FROM prt1_l t1 RIGHT JOIN prt2_l t2 ON t1.a = t2.b -- full join EXPLAIN (COSTS OFF) SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1_l WHERE prt1_l.b = 0) t1 FULL JOIN (SELECT * FROM prt2_l WHERE prt2_l.a = 0) t2 ON (t1.a = t2.b AND t1.c = t2.c) ORDER BY t1.a, t2.b; - QUERY PLAN ------------------------------------------------------------------------ + QUERY PLAN +----------------------------------------------------------------------------- Remote Subquery Scan on all (datanode_1,datanode_2) -> Sort Sort Key: a, b -> Hash Full Join - Hash Cond: ((a = b) AND ((c)::text = (c)::text)) - -> Remote Subquery Scan on all (datanode_1,datanode_2) - Distribute results by H: a + Hash Cond: ((b = a) AND ((c)::text = (c)::text)) + -> Remote Subquery Scan on all (datanode_2) + Distribute results by H: b -> Append - -> Seq Scan on prt1_l_p1 - Filter: (b = 0) - -> Seq Scan on prt1_l_p2_p1 - Filter: (b = 0) - -> Seq Scan on prt1_l_p2_p2 - Filter: (b = 0) - -> Seq Scan on prt1_l_p3_p1 - Filter: (b = 0) + -> Seq Scan on prt2_l_p1 + Filter: (a = 0) + -> Seq Scan on prt2_l_p2_p1 + Filter: (a = 0) + -> Seq Scan on prt2_l_p2_p2 + Filter: (a = 0) + -> Seq Scan on prt2_l_p3_p1 + Filter: (a = 0) -> Hash - -> Remote Subquery Scan on all (datanode_2) - Distribute results by H: b + -> Remote Subquery Scan on all (datanode_1,datanode_2) + Distribute results by H: a -> Append - -> Seq Scan on prt2_l_p1 - Filter: (a = 0) - -> Seq Scan on prt2_l_p2_p1 - Filter: (a = 0) - -> Seq Scan on prt2_l_p2_p2 - Filter: (a = 0) - -> Seq Scan on prt2_l_p3_p1 - Filter: (a = 0) + -> Seq Scan on prt1_l_p1 + Filter: (b = 0) + -> Seq Scan on prt1_l_p2_p1 + Filter: (b = 0) + -> Seq Scan on prt1_l_p2_p2 + Filter: (b = 0) + -> Seq Scan on prt1_l_p3_p1 + Filter: (b = 0) (28 rows) SELECT t1.a, t1.c, t2.b, t2.c FROM (SELECT * FROM prt1_l WHERE prt1_l.b = 0) t1 FULL JOIN (SELECT * FROM prt2_l WHERE prt2_l.a = 0) t2 ON (t1.a = t2.b AND t1.c = t2.c) ORDER BY t1.a, t2.b; diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out index ba5666ef..3c4adea9 100644 --- a/src/test/regress/expected/rules.out +++ b/src/test/regress/expected/rules.out @@ -2397,6 +2397,9 @@ toyemp| SELECT emp.name, emp.location, (12 * emp.salary) AS annualsal FROM emp; +zv1| SELECT zt1.f1, + 'dummy'::text AS junk + FROM pg_temp_31.zt1; SELECT tablename, rulename, definition FROM pg_rules ORDER BY tablename, rulename; pg_settings|pg_settings_n|CREATE RULE pg_settings_n AS diff --git a/src/test/regress/expected/select_parallel_4.out b/src/test/regress/expected/select_parallel_4.out index b57f5248..990ac6ec 100644 --- a/src/test/regress/expected/select_parallel_4.out +++ b/src/test/regress/expected/select_parallel_4.out @@ -42,21 +42,21 @@ alter table tenk1 set (parallel_workers = 4); explain (verbose, costs off) select parallel_restricted(unique1) from tenk1 where stringu1 = 'GRAAAA' order by 1; - QUERY PLAN + QUERY PLAN --------------------------------------------------------------------- Sort - Output: (parallel_restricted(unique1)) - Sort Key: (parallel_restricted(tenk1.unique1)) + Output: (parallel_restricted(unique1)) + Sort Key: (parallel_restricted(tenk1.unique1)) -> Result - Output: parallel_restricted(unique1) + Output: parallel_restricted(unique1) -> Remote Subquery Scan on all (datanode_1,datanode_2) Output: unique1 -> Gather Output: unique1 - Workers Planned: 4 - -> Parallel Seq Scan on public.tenk1 - Output: unique1 - Filter: (tenk1.stringu1 = 'GRAAAA'::name) + Workers Planned: 4 + -> Parallel Seq Scan on public.tenk1 + 
Output: unique1 + Filter: (tenk1.stringu1 = 'GRAAAA'::name) (13 rows) -- test parallel plan when group by expression is in target list. @@ -125,14 +125,14 @@ select count(stringu1) as num, (CASE WHEN length(stringu1) > 5 THEN 'LONG' ELSE explain (costs off) select sum(parallel_restricted(unique1)) from tenk1 group by(parallel_restricted(unique1)); - QUERY PLAN + QUERY PLAN ----------------------------------------------------------------- HashAggregate Group Key: parallel_restricted(unique1) -> Result - -> Remote Subquery Scan on all (datanode_1,datanode_2) - -> Gather - Workers Planned: 4 + -> Remote Subquery Scan on all (datanode_1,datanode_2) + -> Gather + Workers Planned: 4 -> Parallel Seq Scan on tenk1 (7 rows) @@ -279,8 +279,8 @@ set enable_hashjoin to off; set enable_nestloop to off; explain (costs off) select count(*) from tenk1, tenk2 where tenk1.unique1 = tenk2.unique1; - QUERY PLAN -------------------------------------------------------------------------------------- + QUERY PLAN +----------------------------------------------------------------------- Finalize Aggregate -> Remote Subquery Scan on all (datanode_1,datanode_2) -> Gather @@ -288,9 +288,13 @@ explain (costs off) -> Partial Aggregate -> Parallel Merge Join Merge Cond: (tenk1.unique1 = tenk2.unique1) - -> Parallel Index Only Scan using tenk1_unique1 on tenk1 - -> Index Only Scan using tenk2_unique1 on tenk2 -(9 rows) + -> Sort + Sort Key: tenk1.unique1 + -> Parallel Seq Scan on tenk1 + -> Sort + Sort Key: tenk2.unique1 + -> Seq Scan on tenk2 +(13 rows) select count(*) from tenk1, tenk2 where tenk1.unique1 = tenk2.unique1; count diff --git a/src/test/regress/expected/subselect.out b/src/test/regress/expected/subselect.out index 876fd5c8..c841f7c9 100644 --- a/src/test/regress/expected/subselect.out +++ b/src/test/regress/expected/subselect.out @@ -885,7 +885,7 @@ select * from int4_tbl where SubPlan 1 -> Remote Subquery Scan on all (datanode_1,datanode_2) Output: a.unique1 - -> Index Only Scan using tenk1_unique1 on public.tenk1 a + -> Seq Scan on public.tenk1 a Output: a.unique1 (26 rows) diff --git a/src/test/regress/expected/transactions_2.out b/src/test/regress/expected/transactions_2.out index 30a34e63..12f40682 100644 --- a/src/test/regress/expected/transactions_2.out +++ b/src/test/regress/expected/transactions_2.out @@ -582,6 +582,10 @@ end$$ language plpgsql volatile; create table revalidate_bug (c float8 unique); insert into revalidate_bug values (1); insert into revalidate_bug values (inverse(0)); +ERROR: DML contains a function runs on CN which is not supported +HINT: You might need to push that function down to DN. 
+alter function inverse(int) pushdown; +insert into revalidate_bug values (inverse(0)); drop table revalidate_bug; drop function inverse(int); -- verify that cursors created during an aborted subtransaction are @@ -633,7 +637,7 @@ fetch from foo; abort; -- Test for proper cleanup after a failure in a cursor portal -- that was created in an outer subtransaction -CREATE FUNCTION invert(x float8) RETURNS float8 LANGUAGE plpgsql AS +CREATE FUNCTION invert(x float8) RETURNS float8 pushdown LANGUAGE plpgsql AS $$ begin return 1/x; end $$; CREATE FUNCTION create_temp_tab() RETURNS text LANGUAGE plpgsql AS $$ From 7cf7f8afbcab7290538ad5e65893561710be3dfa Mon Sep 17 00:00:00 2001 From: JennyJennyChen Date: Thu, 9 Mar 2023 20:02:41 +0800 Subject: [PATCH 576/578] add v2.5.0-release-note --- v2.5.0-release-note.txt | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 v2.5.0-release-note.txt diff --git a/v2.5.0-release-note.txt b/v2.5.0-release-note.txt new file mode 100644 index 00000000..e6ae92c6 --- /dev/null +++ b/v2.5.0-release-note.txt @@ -0,0 +1,15 @@ +The V2.5.0-release changes focus mainly on: +1. Performance optimization +When using the extended protocol or executing inside a function, multi-row inserts are rewritten as copy, improving write performance by tens of times +Improved cost estimation accuracy, raising execution plan performance by more than 2x +Optimized memory usage of pg_stat_cluster_activity + + +2. Feature enhancements +Support joining tables from different resource-isolated nodegroups +Support subscribing to DN WAL logs from a CN +Support synchronizing analyze statistics to other CNs, keeping statistics consistent across CNs +Added libpq TCP timeout settings +Improved error messages to make problem analysis easier for users + +3. Fixes for other known bugs From 0451d9d9e610297c86c091b405692c972492e1e1 Mon Sep 17 00:00:00 2001 From: runewrz <32592054+runewrz@users.noreply.github.com> Date: Thu, 26 Sep 2024 17:22:01 +0800 Subject: [PATCH 577/578] fix strcmp issue (#151) --- src/backend/pgxc/pool/pgxcnode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) mode change 100644 => 100755 src/backend/pgxc/pool/pgxcnode.c diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c old mode 100644 new mode 100755 index d755e5be..ace82b0e --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -497,7 +497,7 @@ PGXCNodeConnStr(char *host, int port, char *dbname, * remote type can be Coordinator, Datanode or application. */ #ifdef _MLS_ - if (strcmp(user, MLS_USER) == 0 || strcmp(user, AUDIT_USER)) + if (strcmp(user, MLS_USER) == 0 || strcmp(user, AUDIT_USER) == 0) { if (same_host) { From 128d47502476f84392a4ac54603efe007d063285 Mon Sep 17 00:00:00 2001 From: ryanrzwu Date: Mon, 7 Jul 2025 20:58:20 +0800 Subject: [PATCH 578/578] Fix company name.
--- COPYRIGHT | 4 ++-- LICENSE.txt | 6 ++---- contrib/btree_gin/btree_gin.c | 4 ++-- contrib/pg_visibility/pg_visibility.c | 4 ++-- contrib/pgxc_ctl/coord_cmd.h | 4 ++-- contrib/pgxc_ctl/varnames.h | 4 ++-- contrib/pgxc_monitor/pgxc_monitor.c | 4 ++-- src/backend/access/common/bufmask.c | 4 ++-- src/backend/access/common/reloptions.c | 4 ++-- src/backend/access/hash/hash_xlog.c | 4 ++-- src/backend/access/heap/visibilitymap.c | 4 ++-- src/backend/access/index/indexam.c | 4 ++-- src/backend/access/rmgrdesc/extentdesc.c | 6 ++---- src/backend/access/rmgrdesc/relcryptdesc.c | 5 ++--- src/backend/access/rmgrdesc/replslotdesc.c | 4 ++-- src/backend/access/rmgrdesc/smgrdesc.c | 4 ++-- src/backend/access/rmgrdesc/xactdesc.c | 4 ++-- src/backend/access/rmgrdesc/xlogdesc.c | 4 ++-- src/backend/access/transam/lru.c | 4 ++-- src/backend/access/transam/rmgr.c | 4 ++-- src/backend/audit/audit.c | 6 ++---- src/backend/audit/audit_fga.c | 10 ++++------ src/backend/bootstrap/bootparse.y | 4 ++-- src/backend/bootstrap/bootstrap.c | 4 ++-- src/backend/catalog/catalog.c | 4 ++-- src/backend/catalog/pg_partition_interval.c | 5 ++--- src/backend/catalog/pg_publication.c | 4 ++-- src/backend/catalog/pgxc_key_values.c | 5 ++--- src/backend/catalog/pgxc_shard_map.c | 5 ++--- src/backend/catalog/storage.c | 4 ++-- src/backend/catalog/toasting.c | 4 ++-- src/backend/commands/event_trigger.c | 4 ++-- src/backend/commands/opclasscmds.c | 4 ++-- src/backend/commands/portalcmds.c | 4 ++-- src/backend/commands/schemacmds.c | 5 ++--- src/backend/contrib/pgcrypto/blf.c | 4 ++-- src/backend/contrib/pgcrypto/crypt-blowfish.c | 4 ++-- src/backend/contrib/pgcrypto/crypt-des.c | 4 ++-- src/backend/contrib/pgcrypto/crypt-gensalt.c | 4 ++-- src/backend/contrib/pgcrypto/crypt-md5.c | 4 ++-- src/backend/contrib/pgcrypto/fortuna.c | 4 ++-- src/backend/contrib/pgcrypto/imath.c | 4 ++-- src/backend/contrib/pgcrypto/internal-sha2.c | 4 ++-- src/backend/contrib/pgcrypto/internal.c | 4 ++-- src/backend/contrib/pgcrypto/mbuf.c | 4 ++-- src/backend/contrib/pgcrypto/md5.c | 4 ++-- src/backend/contrib/pgcrypto/openssl.c | 4 ++-- src/backend/contrib/pgcrypto/pgcrypto.c | 6 ++---- src/backend/contrib/pgcrypto/pgp-armor.c | 4 ++-- src/backend/contrib/pgcrypto/pgp-cfb.c | 4 ++-- src/backend/contrib/pgcrypto/pgp-compress.c | 4 ++-- src/backend/contrib/pgcrypto/pgp-decrypt.c | 4 ++-- src/backend/contrib/pgcrypto/pgp-encrypt.c | 4 ++-- src/backend/contrib/pgcrypto/pgp-info.c | 4 ++-- .../contrib/pgcrypto/pgp-mpi-internal.c | 4 ++-- src/backend/contrib/pgcrypto/pgp-mpi-openssl.c | 4 ++-- src/backend/contrib/pgcrypto/pgp-mpi.c | 4 ++-- src/backend/contrib/pgcrypto/pgp-pgsql.c | 4 ++-- src/backend/contrib/pgcrypto/pgp-pubdec.c | 4 ++-- src/backend/contrib/pgcrypto/pgp-pubenc.c | 4 ++-- src/backend/contrib/pgcrypto/pgp-pubkey.c | 4 ++-- src/backend/contrib/pgcrypto/pgp-s2k.c | 4 ++-- src/backend/contrib/pgcrypto/pgp.c | 4 ++-- src/backend/contrib/pgcrypto/px-crypt.c | 4 ++-- src/backend/contrib/pgcrypto/px-hmac.c | 4 ++-- src/backend/contrib/pgcrypto/px.c | 4 ++-- src/backend/contrib/pgcrypto/random.c | 4 ++-- src/backend/contrib/pgcrypto/rijndael.c | 4 ++-- src/backend/contrib/pgcrypto/sha1.c | 4 ++-- src/backend/contrib/pgcrypto/sha2.c | 4 ++-- src/backend/contrib/sm/sm4.c | 4 ++-- src/backend/executor/execAmi.c | 4 ++-- src/backend/executor/execIndexing.c | 4 ++-- src/backend/executor/execUtils.c | 4 ++-- src/backend/executor/nodeBitmapAnd.c | 4 ++-- src/backend/executor/nodeBitmapIndexscan.c | 4 ++-- src/backend/executor/nodeBitmapOr.c | 4 ++-- 
src/backend/executor/nodeSubplan.c | 4 ++-- src/backend/libpq/be-fsstubs.c | 4 ++-- src/backend/libpq/be-secure.c | 4 ++-- src/backend/libpq/hba.c | 4 ++-- src/backend/libpq/pqcomm.c | 6 +++--- src/backend/nodes/bitmapset.c | 4 ++-- src/backend/nodes/makefuncs.c | 4 ++-- src/backend/optimizer/plan/subselect.c | 4 ++-- src/backend/optimizer/prep/prepjointree.c | 4 ++-- src/backend/optimizer/util/tlist.c | 4 ++-- src/backend/oracle/charpad.c | 4 ++-- src/backend/oracle/convert.c | 5 ++--- src/backend/oracle/datefce.c | 4 ++-- src/backend/oracle/others.c | 5 ++--- src/backend/oracle/plvstr.c | 6 ++---- src/backend/parser/analyze.c | 4 ++-- src/backend/parser/parse_clause.c | 4 ++-- src/backend/parser/parse_expr.c | 4 ++-- src/backend/parser/parse_oper.c | 4 ++-- src/backend/pgxc/nodemgr/groupmgr.c | 5 ++--- src/backend/pgxc/shard/shard_vacuum.c | 7 +++---- src/backend/pgxc/shard/shardbarrier.c | 5 ++--- src/backend/pgxc/shard/shardmap.c | 4 ++-- src/backend/postmaster/auditlogger.c | 4 ++-- src/backend/postmaster/bgworker.c | 4 ++-- src/backend/replication/logical/decode.c | 4 ++-- src/backend/replication/logical/relation.c | 4 ++-- .../replication/logical/reorderbuffer.c | 4 ++-- src/backend/replication/repl_gram.y | 4 ++-- src/backend/replication/repl_scanner.l | 4 ++-- src/backend/replication/syncrep_scanner.l | 4 ++-- src/backend/replication/walreceiver.c | 4 ++-- src/backend/storage/buffer/freelist.c | 5 ++--- src/backend/storage/file/buffile.c | 5 ++--- src/backend/storage/file/fd.c | 4 ++-- src/backend/storage/freespace/emapage.c | 5 ++--- src/backend/storage/freespace/extent_xlog.c | 6 ++---- src/backend/storage/ipc/ipc.c | 4 ++-- src/backend/storage/ipc/ipci.c | 4 ++-- src/backend/storage/ipc/procsignal.c | 4 ++-- src/backend/storage/lmgr/lmgr.c | 4 ++-- src/backend/storage/lmgr/lock.c | 5 ++--- src/backend/storage/lmgr/nodelock.c | 4 ++-- src/backend/storage/smgr/md.c | 4 ++-- src/backend/tcop/dest.c | 4 ++-- src/backend/utils/adt/datetime.c | 4 ++-- src/backend/utils/adt/format_type.c | 4 ++-- src/backend/utils/adt/formatting.c | 4 ++-- src/backend/utils/adt/json.c | 4 ++-- src/backend/utils/adt/misc.c | 4 ++-- src/backend/utils/adt/oid.c | 4 ++-- src/backend/utils/adt/selfuncs.c | 4 ++-- src/backend/utils/adt/varchar.c | 5 ++--- src/backend/utils/adt/varlena.c | 4 ++-- src/backend/utils/adt/version.c | 5 ++--- src/backend/utils/adt/xml.c | 4 ++-- src/backend/utils/cache/inval.c | 5 ++--- src/backend/utils/cache/relcryptmap.c | 5 ++--- src/backend/utils/init/globals.c | 4 ++-- src/backend/utils/misc/cls.c | 4 ++-- src/backend/utils/misc/datamask.c | 5 ++--- src/backend/utils/misc/mls.c | 5 ++--- src/backend/utils/misc/relcrypt.c | 4 ++-- src/backend/utils/misc/timeout.c | 5 ++--- src/backend/utils/mmgr/aset.c | 4 ++-- src/backend/utils/mmgr/mcxt.c | 4 ++-- src/backend/utils/resowner/resowner.c | 6 ++---- src/backend/utils/sort/tuplestore.c | 4 ++-- src/bin/confmod/conf.c | 6 ++---- src/bin/confmod/conf.h | 4 ++-- src/bin/confmod/confmod.c | 5 ++--- src/bin/confmod/log.c | 4 ++-- src/bin/confmod/log.h | 4 ++-- src/bin/confmod/stree.c | 6 ++---- src/bin/confmod/stree.h | 4 ++-- src/bin/confmod/util.c | 4 ++-- src/bin/confmod/util.h | 4 ++-- src/bin/confmod/var.c | 5 ++--- src/bin/confmod/var.h | 4 ++-- src/bin/pg_basebackup/pg_basebackup.c | 4 ++-- src/bin/pg_controldata/pg_controldata.c | 4 ++-- src/bin/pg_ctl/pg_ctl.c | 4 ++-- src/bin/pg_dump/compress_io.h | 4 ++-- src/bin/pg_dump/pg_backup_archiver.h | 4 ++-- src/bin/pg_dump/pg_dump.h | 4 ++-- src/bin/pg_dump/pg_dump_security.c | 
6 ++---- src/bin/pg_upgrade/exec.c | 4 ++-- src/bin/psql/common.c | 5 ++--- src/common/relpath.c | 4 ++-- src/gtm/common/gtm_time.c | 4 ++-- src/gtm/common/heap.c | 6 ++---- src/gtm/main/gtm_backup.c | 4 ++-- src/gtm/main/gtm_store.c | 4 ++-- src/gtm/main/gtm_xlog.c | 5 ++--- src/gtm/main/replication.c | 4 ++-- src/gtm/path/path.c | 4 ++-- src/gtm/proxy/proxy_main.c | 4 ++-- src/gtm/xlog_test/xlog_reader.c | 8 ++++---- src/gtm/xlog_test/xlog_test.c | 13 ++++++++----- src/include/access/gtm.h | 4 ++-- src/include/access/heapam_xlog.h | 4 ++-- src/include/access/lru.h | 4 ++-- src/include/access/parallel.h | 4 ++-- src/include/access/printtup.h | 4 ++-- src/include/access/relcryptaccess.h | 4 ++-- src/include/access/relscan.h | 4 ++-- src/include/access/replslotdesc.h | 4 ++-- src/include/access/rmgrlist.h | 4 ++-- src/include/access/transam.h | 4 ++-- src/include/access/tupdesc_details.h | 4 ++-- src/include/access/twophase.h | 4 ++-- src/include/access/visibilitymap.h | 4 ++-- src/include/access/xlogreader.h | 4 ++-- src/include/access/xlogrecord.h | 4 ++-- src/include/audit/audit.h | 5 ++--- src/include/audit/audit_fga.h | 7 +++---- src/include/c.h | 4 ++-- src/include/catalog/audit/pg_audit_d.h | 6 ++---- src/include/catalog/audit/pg_audit_fga.h | 6 ++---- src/include/catalog/audit/pg_audit_o.h | 6 ++---- src/include/catalog/audit/pg_audit_s.h | 6 ++---- src/include/catalog/audit/pg_audit_u.h | 6 ++---- src/include/catalog/catalog.h | 4 ++-- src/include/catalog/dependency.h | 4 ++-- src/include/catalog/index.h | 4 ++-- src/include/catalog/mls/pg_cls_compartment.h | 4 ++-- src/include/catalog/mls/pg_cls_group.h | 4 ++-- src/include/catalog/mls/pg_cls_label.h | 4 ++-- src/include/catalog/mls/pg_cls_level.h | 4 ++-- src/include/catalog/mls/pg_cls_policy.h | 6 ++---- src/include/catalog/mls/pg_cls_table.h | 4 ++-- src/include/catalog/mls/pg_cls_user.h | 4 ++-- src/include/catalog/mls/pg_data_mask_map.h | 4 ++-- src/include/catalog/mls/pg_data_mask_user.h | 4 ++-- src/include/catalog/namespace.h | 4 ++-- src/include/catalog/objectaddress.h | 4 ++-- src/include/catalog/pg_audit.h | 4 ++-- src/include/catalog/pg_authid.h | 4 ++-- src/include/catalog/pg_cast.h | 4 ++-- src/include/catalog/pg_mls.h | 4 ++-- src/include/catalog/pg_namespace.h | 4 ++-- src/include/catalog/pg_operator.h | 4 ++-- src/include/catalog/pg_partition_interval.h | 4 ++-- src/include/catalog/pg_proc.h | 4 ++-- src/include/catalog/pg_publication.h | 4 ++-- src/include/catalog/pg_publication_shard.h | 5 ++--- src/include/catalog/pg_subscription_shard.h | 6 ++---- src/include/catalog/pg_subscription_table.h | 5 ++--- src/include/catalog/pg_type.h | 4 ++-- src/include/catalog/pgxc_key_values.h | 6 +++--- src/include/catalog/pgxc_shard_map.h | 4 ++-- src/include/catalog/storage_xlog.h | 4 ++-- src/include/commands/cluster.h | 4 ++-- src/include/commands/prepare.h | 4 ++-- src/include/commands/publicationcmds.h | 4 ++-- src/include/commands/relcryptcommand.h | 4 ++-- src/include/commands/schemacmds.h | 4 ++-- src/include/commands/sequence.h | 4 ++-- src/include/commands/subscriptioncmds.h | 4 ++-- src/include/commands/vacuum.h | 4 ++-- src/include/contrib/pgcrypto/blf.h | 4 ++-- src/include/contrib/pgcrypto/fortuna.h | 4 ++-- src/include/contrib/pgcrypto/imath.h | 4 ++-- src/include/contrib/pgcrypto/mbuf.h | 4 ++-- src/include/contrib/pgcrypto/md5.h | 4 ++-- src/include/contrib/pgcrypto/pgcrypto.h | 4 ++-- src/include/contrib/pgcrypto/pgp.h | 6 ++---- src/include/contrib/pgcrypto/px-crypt.h | 4 ++-- 
src/include/contrib/pgcrypto/px.h | 4 ++-- src/include/contrib/pgcrypto/rijndael.h | 4 ++-- src/include/contrib/pgcrypto/sha1.h | 4 ++-- src/include/contrib/pgcrypto/sha2.h | 4 ++-- src/include/contrib/sm/sm4.h | 4 ++-- src/include/executor/execdesc.h | 4 ++-- src/include/executor/hashjoin.h | 4 ++-- src/include/executor/tqueue.h | 4 ++-- src/include/gtm/elog.h | 4 ++-- src/include/gtm/gtm_checkpoint.h | 4 ++-- src/include/gtm/gtm_conn.h | 4 ++-- src/include/gtm/gtm_gxid.h | 4 ++-- src/include/gtm/gtm_lock.h | 4 ++-- src/include/gtm/gtm_store.h | 4 ++-- src/include/gtm/gtm_xlog.h | 5 ++--- src/include/gtm/gtm_xlog_internal.h | 4 ++-- src/include/gtm/heap.h | 5 ++--- src/include/gtm/libpq-be.h | 4 ++-- src/include/gtm/register.h | 4 ++-- src/include/libpq/auth.h | 4 ++-- src/include/libpq/libpq-be.h | 4 ++-- src/include/libpq/pqcomm.h | 4 ++-- src/include/miscadmin.h | 4 ++-- src/include/nodes/bitmapset.h | 4 ++-- src/include/nodes/makefuncs.h | 4 ++-- src/include/nodes/nodeFuncs.h | 4 ++-- src/include/nodes/relation.h | 4 ++-- src/include/optimizer/pathnode.h | 4 ++-- src/include/optimizer/plancat.h | 4 ++-- src/include/optimizer/planmain.h | 4 ++-- src/include/optimizer/planner.h | 4 ++-- src/include/optimizer/subselect.h | 4 ++-- src/include/optimizer/var.h | 4 ++-- src/include/oracle/oracle.h | 4 ++-- src/include/parser/analyze.h | 4 ++-- src/include/parser/parse_node.h | 4 ++-- src/include/parser/parse_utilcmd.h | 4 ++-- src/include/parser/parser.h | 4 ++-- src/include/pg_config_manual.h | 4 ++-- src/include/pgxc/groupmgr.h | 4 ++-- src/include/pgxc/pgxc.h | 4 ++-- src/include/pgxc/planner.h | 4 ++-- src/include/pgxc/shard_vacuum.h | 4 ++-- src/include/pgxc/shardmap.h | 4 ++-- src/include/postgres_ext.h | 4 ++-- src/include/postmaster/auditlogger.h | 4 ++-- src/include/postmaster/bgworker.h | 4 ++-- src/include/postmaster/bgwriter.h | 4 ++-- src/include/postmaster/pgarch.h | 4 ++-- src/include/replication/decode.h | 4 ++-- src/include/replication/logical_statistic.h | 5 ++--- src/include/replication/logicallauncher.h | 4 ++-- src/include/replication/logicalrelation.h | 4 ++-- src/include/replication/walreceiver.h | 4 ++-- src/include/replication/worker_internal.h | 4 ++-- src/include/storage/buf_internals.h | 4 ++-- src/include/storage/buffile.h | 4 ++-- src/include/storage/extent_xlog.h | 4 ++-- src/include/storage/extentmapping.h | 4 ++-- src/include/storage/lmgr.h | 4 ++-- src/include/storage/lwlock.h | 4 ++-- src/include/storage/nodelock.h | 4 ++-- src/include/storage/proc.h | 4 ++-- src/include/storage/procsignal.h | 4 ++-- src/include/storage/relcryptstorage.h | 4 ++-- src/include/storage/relfilenode.h | 4 ++-- src/include/tcop/dest.h | 4 ++-- src/include/tcop/pquery.h | 4 ++-- src/include/tcop/tcopprot.h | 4 ++-- src/include/utils/builtins.h | 4 ++-- src/include/utils/cls.h | 4 ++-- src/include/utils/datamask.h | 4 ++-- src/include/utils/elog.h | 4 ++-- src/include/utils/guc_tables.h | 4 ++-- src/include/utils/inval.h | 4 ++-- src/include/utils/memutils.h | 4 ++-- src/include/utils/mls.h | 4 ++-- src/include/utils/mls_extension.h | 4 ++-- src/include/utils/pg_locale.h | 4 ++-- src/include/utils/plancache.h | 4 ++-- src/include/utils/portal.h | 4 ++-- src/include/utils/relcrypt.h | 4 ++-- src/include/utils/relcryptcache.h | 4 ++-- src/include/utils/relcryptmap.h | 4 ++-- src/include/utils/relcryptmisc.h | 4 ++-- src/include/utils/resowner_private.h | 4 ++-- src/include/utils/ruleutils.h | 4 ++-- src/include/utils/snapshot.h | 4 ++-- src/include/utils/syscache.h | 4 ++-- 
src/include/utils/timeout.h | 4 ++-- src/include/utils/tqual.h | 4 ++-- src/include/utils/tuplestore.h | 4 ++-- src/interfaces/libpq/fe-connect.c | 4 ++-- src/interfaces/libpq/fe-protocol3.c | 4 ++-- src/interfaces/libpq/libpq-fe.h | 4 ++-- src/interfaces/libpq/libpq-int.h | 4 ++-- src/pl/plperl/plperl_helpers.h | 4 ++-- src/test/isolation/isolation_test.conf | 18 ++---------------- 343 files changed, 700 insertions(+), 783 deletions(-) diff --git a/COPYRIGHT b/COPYRIGHT index 26cb400c..fc3cef0e 100644 --- a/COPYRIGHT +++ b/COPYRIGHT @@ -2,7 +2,7 @@ TBase Cluster Database Management System Tencent is pleased to support the open source community by making TBase available. -Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +Copyright (C) 2019 Tencent. All rights reserved. TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. @@ -48,7 +48,7 @@ Redistribution and use in source and binary forms, with or without modification, 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. -3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without +3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/LICENSE.txt b/LICENSE.txt index d4589fbe..487e0176 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -1,6 +1,6 @@ Tencent is pleased to support the open source community by making TBase available. -Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +Copyright (C) 2019 Tencent. All rights reserved. TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. @@ -46,7 +46,7 @@ Redistribution and use in source and binary forms, with or without modification, 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. -3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without +3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -55,5 +55,3 @@ BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUEN GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- - diff --git a/contrib/btree_gin/btree_gin.c b/contrib/btree_gin/btree_gin.c index 9b56b3c2..313df90d 100644 --- a/contrib/btree_gin/btree_gin.c +++ b/contrib/btree_gin/btree_gin.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/contrib/pg_visibility/pg_visibility.c b/contrib/pg_visibility/pg_visibility.c index 8b9836e9..74af3424 100644 --- a/contrib/pg_visibility/pg_visibility.c +++ b/contrib/pg_visibility/pg_visibility.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/contrib/pgxc_ctl/coord_cmd.h b/contrib/pgxc_ctl/coord_cmd.h index 79a71dd9..f889c443 100644 --- a/contrib/pgxc_ctl/coord_cmd.h +++ b/contrib/pgxc_ctl/coord_cmd.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. 
Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/contrib/pgxc_ctl/varnames.h b/contrib/pgxc_ctl/varnames.h index 61fa33fe..601b11a0 100644 --- a/contrib/pgxc_ctl/varnames.h +++ b/contrib/pgxc_ctl/varnames.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/contrib/pgxc_monitor/pgxc_monitor.c b/contrib/pgxc_monitor/pgxc_monitor.c index 83f831f5..6604da2d 100644 --- a/contrib/pgxc_monitor/pgxc_monitor.c +++ b/contrib/pgxc_monitor/pgxc_monitor.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/access/common/bufmask.c b/src/backend/access/common/bufmask.c index 20f3e4ba..28a3d264 100644 --- a/src/backend/access/common/bufmask.c +++ b/src/backend/access/common/bufmask.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/access/common/reloptions.c b/src/backend/access/common/reloptions.c index f3602fb6..e422b27f 100644 --- a/src/backend/access/common/reloptions.c +++ b/src/backend/access/common/reloptions.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/access/hash/hash_xlog.c b/src/backend/access/hash/hash_xlog.c index f8dec838..977d4b27 100644 --- a/src/backend/access/hash/hash_xlog.c +++ b/src/backend/access/hash/hash_xlog.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c index d0a83854..b66f2473 100644 --- a/src/backend/access/heap/visibilitymap.c +++ b/src/backend/access/heap/visibilitymap.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c index 931f71cc..0675dad8 100644 --- a/src/backend/access/index/indexam.c +++ b/src/backend/access/index/indexam.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/access/rmgrdesc/extentdesc.c b/src/backend/access/rmgrdesc/extentdesc.c index 6071d8c5..329b3900 100644 --- a/src/backend/access/rmgrdesc/extentdesc.c +++ b/src/backend/access/rmgrdesc/extentdesc.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. 
Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -403,5 +403,3 @@ extent_identify(uint8 info) return "Extent ERROR"; } } - - diff --git a/src/backend/access/rmgrdesc/relcryptdesc.c b/src/backend/access/rmgrdesc/relcryptdesc.c index 2b05fe81..11f904a8 100644 --- a/src/backend/access/rmgrdesc/relcryptdesc.c +++ b/src/backend/access/rmgrdesc/relcryptdesc.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -146,4 +146,3 @@ const char * rel_crypt_identify(uint8 info) return id; } - diff --git a/src/backend/access/rmgrdesc/replslotdesc.c b/src/backend/access/rmgrdesc/replslotdesc.c index 0d71eb8b..9b2e6169 100644 --- a/src/backend/access/rmgrdesc/replslotdesc.c +++ b/src/backend/access/rmgrdesc/replslotdesc.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/access/rmgrdesc/smgrdesc.c b/src/backend/access/rmgrdesc/smgrdesc.c index 3fc546ed..c03f4c78 100644 --- a/src/backend/access/rmgrdesc/smgrdesc.c +++ b/src/backend/access/rmgrdesc/smgrdesc.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. 
* - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/access/rmgrdesc/xactdesc.c b/src/backend/access/rmgrdesc/xactdesc.c index 450e2594..832d6a30 100644 --- a/src/backend/access/rmgrdesc/xactdesc.c +++ b/src/backend/access/rmgrdesc/xactdesc.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/access/rmgrdesc/xlogdesc.c b/src/backend/access/rmgrdesc/xlogdesc.c index 1246fdd5..0101d24c 100644 --- a/src/backend/access/rmgrdesc/xlogdesc.c +++ b/src/backend/access/rmgrdesc/xlogdesc.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/access/transam/lru.c b/src/backend/access/transam/lru.c index 0c772617..68009bd0 100644 --- a/src/backend/access/transam/lru.c +++ b/src/backend/access/transam/lru.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/access/transam/rmgr.c b/src/backend/access/transam/rmgr.c index 5e1848d4..145d3e73 100644 --- a/src/backend/access/transam/rmgr.c +++ b/src/backend/access/transam/rmgr.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/audit/audit.c b/src/backend/audit/audit.c index 7185796a..5ea922ee 100644 --- a/src/backend/audit/audit.c +++ b/src/backend/audit/audit.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. 
Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -8975,5 +8975,3 @@ static void audit_hit_process_result_info(bool is_success) #ifdef Audit_004_For_Log #endif - - diff --git a/src/backend/audit/audit_fga.c b/src/backend/audit/audit_fga.c index 2327ac00..2c998b76 100644 --- a/src/backend/audit/audit_fga.c +++ b/src/backend/audit/audit_fga.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -431,7 +431,7 @@ exec_policy_funct_on_other_node(char *query_string) { cn_node_list = (Oid *) palloc0(cn_nodes_num * sizeof(Oid)); - PGXCGetCoordOidOthers(cn_node_list); + PGXCGetCoordOidOthers(cn_node_list); pgxc_execute_on_nodes(cn_nodes_num, cn_node_list, query_string); } } @@ -1822,7 +1822,7 @@ process_fga_trigger(bool timeout) else { elog(LOG, "AUDIT_FGA: cannot connect to db"); - PQfinish(conn); + PQfinish(conn); } } } @@ -1976,5 +1976,3 @@ write_trigger_handle_to_shmem(Oid func_oid) return ; } - - diff --git a/src/backend/bootstrap/bootparse.y b/src/backend/bootstrap/bootparse.y index e720c618..5800c275 100644 --- a/src/backend/bootstrap/bootparse.y +++ b/src/backend/bootstrap/bootparse.y @@ -2,7 +2,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -48,7 +48,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/bootstrap/bootstrap.c b/src/backend/bootstrap/bootstrap.c index 5a1ace81..760cd82e 100644 --- a/src/backend/bootstrap/bootstrap.c +++ b/src/backend/bootstrap/bootstrap.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/catalog/catalog.c b/src/backend/catalog/catalog.c index 203f31c2..2ba9dee1 100644 --- a/src/backend/catalog/catalog.c +++ b/src/backend/catalog/catalog.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/catalog/pg_partition_interval.c b/src/backend/catalog/pg_partition_interval.c index 8ca204e5..db6ccd90 100644 --- a/src/backend/catalog/pg_partition_interval.c +++ b/src/backend/catalog/pg_partition_interval.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. 
Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -271,4 +271,3 @@ ModifyPartitionStartValue(Oid relid, int64 startval) ReleaseSysCache(tup); heap_close(rel,RowExclusiveLock); } - diff --git a/src/backend/catalog/pg_publication.c b/src/backend/catalog/pg_publication.c index 4d21340b..ab9f7d37 100644 --- a/src/backend/catalog/pg_publication.c +++ b/src/backend/catalog/pg_publication.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/catalog/pgxc_key_values.c b/src/backend/catalog/pgxc_key_values.c index 6af66d75..b78d05dc 100644 --- a/src/backend/catalog/pgxc_key_values.c +++ b/src/backend/catalog/pgxc_key_values.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -613,4 +613,3 @@ bool IsKeyValues(Oid db, Oid rel, char *value) CStringGetDatum(value), 0); } - diff --git a/src/backend/catalog/pgxc_shard_map.c b/src/backend/catalog/pgxc_shard_map.c index 64ef2f5c..2f79b4a6 100644 --- a/src/backend/catalog/pgxc_shard_map.c +++ b/src/backend/catalog/pgxc_shard_map.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -336,4 +336,3 @@ void DropShardMap_Node(Oid group) RegisterInvalidShmemShardMap(group, ShardOpType_drop); } - diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c index b9136469..5e3d424c 100644 --- a/src/backend/catalog/storage.c +++ b/src/backend/catalog/storage.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/catalog/toasting.c b/src/backend/catalog/toasting.c index 95b0564c..336727e8 100644 --- a/src/backend/catalog/toasting.c +++ b/src/backend/catalog/toasting.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/commands/event_trigger.c b/src/backend/commands/event_trigger.c index 742e23d4..8ced270c 100644 --- a/src/backend/commands/event_trigger.c +++ b/src/backend/commands/event_trigger.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/commands/opclasscmds.c b/src/backend/commands/opclasscmds.c index 6e0f12b7..acb248ea 100644 --- a/src/backend/commands/opclasscmds.c +++ b/src/backend/commands/opclasscmds.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/commands/portalcmds.c b/src/backend/commands/portalcmds.c index cdd42fa5..dfc941fa 100644 --- a/src/backend/commands/portalcmds.c +++ b/src/backend/commands/portalcmds.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. 
* - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/commands/schemacmds.c b/src/backend/commands/schemacmds.c index 3ef2e6a1..8b41446c 100644 --- a/src/backend/commands/schemacmds.c +++ b/src/backend/commands/schemacmds.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -562,4 +562,3 @@ char * GetSchemaNameByOid(Oid schemaOid) } #endif - diff --git a/src/backend/contrib/pgcrypto/blf.c b/src/backend/contrib/pgcrypto/blf.c index 598b65f6..efac09b9 100644 --- a/src/backend/contrib/pgcrypto/blf.c +++ b/src/backend/contrib/pgcrypto/blf.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/contrib/pgcrypto/crypt-blowfish.c b/src/backend/contrib/pgcrypto/crypt-blowfish.c index f856f8b2..60a86848 100644 --- a/src/backend/contrib/pgcrypto/crypt-blowfish.c +++ b/src/backend/contrib/pgcrypto/crypt-blowfish.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/contrib/pgcrypto/crypt-des.c b/src/backend/contrib/pgcrypto/crypt-des.c index 20d366b9..a9ae0b0b 100644 --- a/src/backend/contrib/pgcrypto/crypt-des.c +++ b/src/backend/contrib/pgcrypto/crypt-des.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/contrib/pgcrypto/crypt-gensalt.c b/src/backend/contrib/pgcrypto/crypt-gensalt.c index 95916c47..e13c5e2e 100644 --- a/src/backend/contrib/pgcrypto/crypt-gensalt.c +++ b/src/backend/contrib/pgcrypto/crypt-gensalt.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/contrib/pgcrypto/crypt-md5.c b/src/backend/contrib/pgcrypto/crypt-md5.c index b4f58219..b6a6792d 100644 --- a/src/backend/contrib/pgcrypto/crypt-md5.c +++ b/src/backend/contrib/pgcrypto/crypt-md5.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/contrib/pgcrypto/fortuna.c b/src/backend/contrib/pgcrypto/fortuna.c index eb6bf895..b237f1fd 100644 --- a/src/backend/contrib/pgcrypto/fortuna.c +++ b/src/backend/contrib/pgcrypto/fortuna.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/contrib/pgcrypto/imath.c b/src/backend/contrib/pgcrypto/imath.c index 09c1f899..db31d9c3 100644 --- a/src/backend/contrib/pgcrypto/imath.c +++ b/src/backend/contrib/pgcrypto/imath.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/contrib/pgcrypto/internal-sha2.c b/src/backend/contrib/pgcrypto/internal-sha2.c index 10a1e979..370b2a76 100644 --- a/src/backend/contrib/pgcrypto/internal-sha2.c +++ b/src/backend/contrib/pgcrypto/internal-sha2.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/contrib/pgcrypto/internal.c b/src/backend/contrib/pgcrypto/internal.c index 63d1df30..d41c831e 100644 --- a/src/backend/contrib/pgcrypto/internal.c +++ b/src/backend/contrib/pgcrypto/internal.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. 
Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/contrib/pgcrypto/mbuf.c b/src/backend/contrib/pgcrypto/mbuf.c index 985887a3..ad632f8c 100644 --- a/src/backend/contrib/pgcrypto/mbuf.c +++ b/src/backend/contrib/pgcrypto/mbuf.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/contrib/pgcrypto/md5.c b/src/backend/contrib/pgcrypto/md5.c index 965a480d..a0cdf5d9 100644 --- a/src/backend/contrib/pgcrypto/md5.c +++ b/src/backend/contrib/pgcrypto/md5.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/contrib/pgcrypto/openssl.c b/src/backend/contrib/pgcrypto/openssl.c index 451f9e27..4482a5a0 100644 --- a/src/backend/contrib/pgcrypto/openssl.c +++ b/src/backend/contrib/pgcrypto/openssl.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. 
* * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/contrib/pgcrypto/pgcrypto.c b/src/backend/contrib/pgcrypto/pgcrypto.c index fa5d1962..c93f92bc 100644 --- a/src/backend/contrib/pgcrypto/pgcrypto.c +++ b/src/backend/contrib/pgcrypto/pgcrypto.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -576,5 +576,3 @@ void * crypt_memset(void *s, int c, size_t n) { return memset(s,c,n); } - - diff --git a/src/backend/contrib/pgcrypto/pgp-armor.c b/src/backend/contrib/pgcrypto/pgp-armor.c index c3203f88..827ae206 100644 --- a/src/backend/contrib/pgcrypto/pgp-armor.c +++ b/src/backend/contrib/pgcrypto/pgp-armor.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/contrib/pgcrypto/pgp-cfb.c b/src/backend/contrib/pgcrypto/pgp-cfb.c index 7caa7697..7be68343 100644 --- a/src/backend/contrib/pgcrypto/pgp-cfb.c +++ b/src/backend/contrib/pgcrypto/pgp-cfb.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/contrib/pgcrypto/pgp-compress.c b/src/backend/contrib/pgcrypto/pgp-compress.c index 5a9955f1..2568b948 100644 --- a/src/backend/contrib/pgcrypto/pgp-compress.c +++ b/src/backend/contrib/pgcrypto/pgp-compress.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/contrib/pgcrypto/pgp-decrypt.c b/src/backend/contrib/pgcrypto/pgp-decrypt.c index 1bf73a89..af441918 100644 --- a/src/backend/contrib/pgcrypto/pgp-decrypt.c +++ b/src/backend/contrib/pgcrypto/pgp-decrypt.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. 
Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/contrib/pgcrypto/pgp-encrypt.c b/src/backend/contrib/pgcrypto/pgp-encrypt.c index 88ccde0d..4f10306f 100644 --- a/src/backend/contrib/pgcrypto/pgp-encrypt.c +++ b/src/backend/contrib/pgcrypto/pgp-encrypt.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/contrib/pgcrypto/pgp-info.c b/src/backend/contrib/pgcrypto/pgp-info.c index 14128959..cd01c140 100644 --- a/src/backend/contrib/pgcrypto/pgp-info.c +++ b/src/backend/contrib/pgcrypto/pgp-info.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/contrib/pgcrypto/pgp-mpi-internal.c b/src/backend/contrib/pgcrypto/pgp-mpi-internal.c index a09b3d0f..91689436 100644 --- a/src/backend/contrib/pgcrypto/pgp-mpi-internal.c +++ b/src/backend/contrib/pgcrypto/pgp-mpi-internal.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. 
* * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/contrib/pgcrypto/pgp-mpi-openssl.c b/src/backend/contrib/pgcrypto/pgp-mpi-openssl.c index f13f1254..6c1fda11 100644 --- a/src/backend/contrib/pgcrypto/pgp-mpi-openssl.c +++ b/src/backend/contrib/pgcrypto/pgp-mpi-openssl.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/contrib/pgcrypto/pgp-mpi.c b/src/backend/contrib/pgcrypto/pgp-mpi.c index 69edf48b..2b2831fa 100644 --- a/src/backend/contrib/pgcrypto/pgp-mpi.c +++ b/src/backend/contrib/pgcrypto/pgp-mpi.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/contrib/pgcrypto/pgp-pgsql.c b/src/backend/contrib/pgcrypto/pgp-pgsql.c index ef00e276..1180bc16 100644 --- a/src/backend/contrib/pgcrypto/pgp-pgsql.c +++ b/src/backend/contrib/pgcrypto/pgp-pgsql.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/contrib/pgcrypto/pgp-pubdec.c b/src/backend/contrib/pgcrypto/pgp-pubdec.c index 94d2a200..cc4ac0cd 100644 --- a/src/backend/contrib/pgcrypto/pgp-pubdec.c +++ b/src/backend/contrib/pgcrypto/pgp-pubdec.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/contrib/pgcrypto/pgp-pubenc.c b/src/backend/contrib/pgcrypto/pgp-pubenc.c index 424fe7e3..273c5d9a 100644 --- a/src/backend/contrib/pgcrypto/pgp-pubenc.c +++ b/src/backend/contrib/pgcrypto/pgp-pubenc.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. 
Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/contrib/pgcrypto/pgp-pubkey.c b/src/backend/contrib/pgcrypto/pgp-pubkey.c index ef35d515..6b5a31a3 100644 --- a/src/backend/contrib/pgcrypto/pgp-pubkey.c +++ b/src/backend/contrib/pgcrypto/pgp-pubkey.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/contrib/pgcrypto/pgp-s2k.c b/src/backend/contrib/pgcrypto/pgp-s2k.c index d0ee95cf..7f9b5b70 100644 --- a/src/backend/contrib/pgcrypto/pgp-s2k.c +++ b/src/backend/contrib/pgcrypto/pgp-s2k.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/contrib/pgcrypto/pgp.c b/src/backend/contrib/pgcrypto/pgp.c index 9ab0aa30..f5a969f1 100644 --- a/src/backend/contrib/pgcrypto/pgp.c +++ b/src/backend/contrib/pgcrypto/pgp.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. 
* * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/contrib/pgcrypto/px-crypt.c b/src/backend/contrib/pgcrypto/px-crypt.c index 74fdc92d..4825f3f8 100644 --- a/src/backend/contrib/pgcrypto/px-crypt.c +++ b/src/backend/contrib/pgcrypto/px-crypt.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/contrib/pgcrypto/px-hmac.c b/src/backend/contrib/pgcrypto/px-hmac.c index 6bdc23f3..bd249290 100644 --- a/src/backend/contrib/pgcrypto/px-hmac.c +++ b/src/backend/contrib/pgcrypto/px-hmac.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/contrib/pgcrypto/px.c b/src/backend/contrib/pgcrypto/px.c index 45505eca..a29e4be3 100644 --- a/src/backend/contrib/pgcrypto/px.c +++ b/src/backend/contrib/pgcrypto/px.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/contrib/pgcrypto/random.c b/src/backend/contrib/pgcrypto/random.c index c6fdc789..039e769f 100644 --- a/src/backend/contrib/pgcrypto/random.c +++ b/src/backend/contrib/pgcrypto/random.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/contrib/pgcrypto/rijndael.c b/src/backend/contrib/pgcrypto/rijndael.c index 82dee497..653f1613 100644 --- a/src/backend/contrib/pgcrypto/rijndael.c +++ b/src/backend/contrib/pgcrypto/rijndael.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. 
Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/contrib/pgcrypto/sha1.c b/src/backend/contrib/pgcrypto/sha1.c index cb563ae7..18ed0a32 100644 --- a/src/backend/contrib/pgcrypto/sha1.c +++ b/src/backend/contrib/pgcrypto/sha1.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/contrib/pgcrypto/sha2.c b/src/backend/contrib/pgcrypto/sha2.c index 2829714a..c10bfc95 100644 --- a/src/backend/contrib/pgcrypto/sha2.c +++ b/src/backend/contrib/pgcrypto/sha2.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/contrib/sm/sm4.c b/src/backend/contrib/sm/sm4.c index 857acd7d..2acb3c20 100644 --- a/src/backend/contrib/sm/sm4.c +++ b/src/backend/contrib/sm/sm4.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. 
* @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/executor/execAmi.c b/src/backend/executor/execAmi.c index 1e819521..44651753 100644 --- a/src/backend/executor/execAmi.c +++ b/src/backend/executor/execAmi.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/executor/execIndexing.c b/src/backend/executor/execIndexing.c index ba54f4cf..983c0a78 100644 --- a/src/backend/executor/execIndexing.c +++ b/src/backend/executor/execIndexing.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/executor/execUtils.c b/src/backend/executor/execUtils.c index c6401651..f7fca4aa 100644 --- a/src/backend/executor/execUtils.c +++ b/src/backend/executor/execUtils.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. 
* - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/executor/nodeBitmapAnd.c b/src/backend/executor/nodeBitmapAnd.c index ae2be3d2..d6882a88 100644 --- a/src/backend/executor/nodeBitmapAnd.c +++ b/src/backend/executor/nodeBitmapAnd.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/executor/nodeBitmapIndexscan.c b/src/backend/executor/nodeBitmapIndexscan.c index 08549075..1ab9dcd9 100644 --- a/src/backend/executor/nodeBitmapIndexscan.c +++ b/src/backend/executor/nodeBitmapIndexscan.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/executor/nodeBitmapOr.c b/src/backend/executor/nodeBitmapOr.c index 6c662006..5bcc770b 100644 --- a/src/backend/executor/nodeBitmapOr.c +++ b/src/backend/executor/nodeBitmapOr.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/executor/nodeSubplan.c b/src/backend/executor/nodeSubplan.c index 6df349ab..98e37494 100644 --- a/src/backend/executor/nodeSubplan.c +++ b/src/backend/executor/nodeSubplan.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/libpq/be-fsstubs.c b/src/backend/libpq/be-fsstubs.c index 3f52bfa4..128bb410 100644 --- a/src/backend/libpq/be-fsstubs.c +++ b/src/backend/libpq/be-fsstubs.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. 
Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/libpq/be-secure.c b/src/backend/libpq/be-secure.c index ea947f5b..59fb602c 100644 --- a/src/backend/libpq/be-secure.c +++ b/src/backend/libpq/be-secure.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/libpq/hba.c b/src/backend/libpq/hba.c index 35ad2dc1..7c0a4b24 100644 --- a/src/backend/libpq/hba.c +++ b/src/backend/libpq/hba.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/libpq/pqcomm.c b/src/backend/libpq/pqcomm.c index db3b1ea1..9b62a647 100644 --- a/src/backend/libpq/pqcomm.c +++ b/src/backend/libpq/pqcomm.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
- * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without
+ * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without
 * specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
@@ -2143,4 +2143,4 @@ pq_settcpusertimeout(int timeout, Port *port)
 #endif

 	return STATUS_OK;
-}
\ No newline at end of file
+}
diff --git a/src/backend/nodes/bitmapset.c b/src/backend/nodes/bitmapset.c
index 61b30a35..66778548 100644
--- a/src/backend/nodes/bitmapset.c
+++ b/src/backend/nodes/bitmapset.c
@@ -1,7 +1,7 @@
 /*
 * Tencent is pleased to support the open source community by making TBase available.
 *
- * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+ * Copyright (C) 2019 Tencent. All rights reserved.
 *
 * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below.
 *
@@ -47,7 +47,7 @@
 * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
- * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without
+ * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without
 * specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
diff --git a/src/backend/nodes/makefuncs.c b/src/backend/nodes/makefuncs.c
index 30c49729..a5e24898 100644
--- a/src/backend/nodes/makefuncs.c
+++ b/src/backend/nodes/makefuncs.c
@@ -1,7 +1,7 @@
 /*
 * Tencent is pleased to support the open source community by making TBase available.
 *
- * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+ * Copyright (C) 2019 Tencent. All rights reserved.
 *
 * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below.
 *
@@ -47,7 +47,7 @@
 * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
- * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without
+ * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without
 * specific prior written permission.
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/optimizer/plan/subselect.c b/src/backend/optimizer/plan/subselect.c index 61647167..248040d0 100644 --- a/src/backend/optimizer/plan/subselect.c +++ b/src/backend/optimizer/plan/subselect.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/optimizer/prep/prepjointree.c b/src/backend/optimizer/prep/prepjointree.c index 45e03eb6..9be1cb2b 100644 --- a/src/backend/optimizer/prep/prepjointree.c +++ b/src/backend/optimizer/prep/prepjointree.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/optimizer/util/tlist.c b/src/backend/optimizer/util/tlist.c index 496fd970..4c1c7bc8 100644 --- a/src/backend/optimizer/util/tlist.c +++ b/src/backend/optimizer/util/tlist.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. 
Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/oracle/charpad.c b/src/backend/oracle/charpad.c index 20c656e5..3d913f28 100644 --- a/src/backend/oracle/charpad.c +++ b/src/backend/oracle/charpad.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/oracle/convert.c b/src/backend/oracle/convert.c index 89602a4c..8fd69209 100644 --- a/src/backend/oracle/convert.c +++ b/src/backend/oracle/convert.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -658,4 +658,3 @@ orcl_to_single_byte(PG_FUNCTION_ARGS) PG_RETURN_TEXT_P(dst); } - diff --git a/src/backend/oracle/datefce.c b/src/backend/oracle/datefce.c index 034fcf3f..f394320c 100644 --- a/src/backend/oracle/datefce.c +++ b/src/backend/oracle/datefce.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. 
* * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/oracle/others.c b/src/backend/oracle/others.c index e27c9a95..6f1ebd33 100644 --- a/src/backend/oracle/others.c +++ b/src/backend/oracle/others.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -631,4 +631,3 @@ ORCL_DECODE_FOR(date); ORCL_DECODE_FOR(time); ORCL_DECODE_FOR(timestamp); ORCL_DECODE_FOR(timestamptz); - diff --git a/src/backend/oracle/plvstr.c b/src/backend/oracle/plvstr.c index 9f4b3596..b5999653 100644 --- a/src/backend/oracle/plvstr.c +++ b/src/backend/oracle/plvstr.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -285,5 +285,3 @@ orcl_instr4(PG_FUNCTION_ARGS) PG_RETURN_INT32(orcl_instr(arg1, arg2, arg3, arg4)); } - - diff --git a/src/backend/parser/analyze.c b/src/backend/parser/analyze.c index 9c460dd4..f4155215 100644 --- a/src/backend/parser/analyze.c +++ b/src/backend/parser/analyze.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/parser/parse_clause.c b/src/backend/parser/parse_clause.c index e91f59dd..df8de7ca 100644 --- a/src/backend/parser/parse_clause.c +++ b/src/backend/parser/parse_clause.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/parser/parse_expr.c b/src/backend/parser/parse_expr.c index 8de92ed6..b1143fad 100644 --- a/src/backend/parser/parse_expr.c +++ b/src/backend/parser/parse_expr.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. 
Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without
+ * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without
 * specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
diff --git a/src/backend/parser/parse_oper.c b/src/backend/parser/parse_oper.c
index 4f9f4949..2dbd35fd 100644
--- a/src/backend/parser/parse_oper.c
+++ b/src/backend/parser/parse_oper.c
@@ -1,7 +1,7 @@
 /*
 * Tencent is pleased to support the open source community by making TBase available.
 *
- * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+ * Copyright (C) 2019 Tencent. All rights reserved.
 *
 * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below.
 *
@@ -47,7 +47,7 @@
 * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
- * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without
+ * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without
 * specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
diff --git a/src/backend/pgxc/nodemgr/groupmgr.c b/src/backend/pgxc/nodemgr/groupmgr.c
index e5de7d81..7a36c5cb 100644
--- a/src/backend/pgxc/nodemgr/groupmgr.c
+++ b/src/backend/pgxc/nodemgr/groupmgr.c
@@ -1,7 +1,7 @@
 /*
 * Tencent is pleased to support the open source community by making TBase available.
 *
- * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+ * Copyright (C) 2019 Tencent. All rights reserved.
 *
 * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below.
 *
@@ -47,7 +47,7 @@
 * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
- * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without
+ * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without
 * specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
@@ -723,4 +723,3 @@ GetGroupNameByNode(Oid nodeoid)
 
 #endif
 
-
diff --git a/src/backend/pgxc/shard/shard_vacuum.c b/src/backend/pgxc/shard/shard_vacuum.c
index 2f933972..905c81c8 100644
--- a/src/backend/pgxc/shard/shard_vacuum.c
+++ b/src/backend/pgxc/shard/shard_vacuum.c
@@ -1,7 +1,7 @@
 /*
 * Tencent is pleased to support the open source community by making TBase available.
 *
- * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+ * Copyright (C) 2019 Tencent. All rights reserved.
 *
 * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below.
 *
@@ -47,7 +47,7 @@
 * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
- * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without
+ * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without
 * specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
@@ -449,7 +449,7 @@ int64 vacuum_shard_internal(Relation rel, Bitmapset *to_vacuum, Snapshot vacuum_
         if(to_delete)
         {
             tuples++;
-            if(tuples > 2000)
+            if(tuples > 2000)
             {
                 tuples = 0;
                 pg_usleep(sleep_interval * 1000);
@@ -574,4 +574,3 @@ List * GetShardRelations_NoChild(bool is_contain_replic)
     return result;
 }
 
-
diff --git a/src/backend/pgxc/shard/shardbarrier.c b/src/backend/pgxc/shard/shardbarrier.c
index 3219e3e1..9c761bb4 100644
--- a/src/backend/pgxc/shard/shardbarrier.c
+++ b/src/backend/pgxc/shard/shardbarrier.c
@@ -1,7 +1,7 @@
 /*
 * Tencent is pleased to support the open source community by making TBase available.
 *
- * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+ * Copyright (C) 2019 Tencent. All rights reserved.
 *
 * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below.
 *
@@ -47,7 +47,7 @@
 * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
- * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without
+ * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without
 * specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
@@ -382,4 +382,3 @@ Datum pg_stat_barrier_shards(PG_FUNCTION_ARGS)
     result = HeapTupleGetDatum(tuple);
     SRF_RETURN_NEXT(funcctx, result);
 }
-
diff --git a/src/backend/pgxc/shard/shardmap.c b/src/backend/pgxc/shard/shardmap.c
index 29f9a946..655ed921 100644
--- a/src/backend/pgxc/shard/shardmap.c
+++ b/src/backend/pgxc/shard/shardmap.c
@@ -1,7 +1,7 @@
 /*
 * Tencent is pleased to support the open source community by making TBase available.
 *
- * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+ * Copyright (C) 2019 Tencent. All rights reserved.
 *
 * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below.
 *
@@ -47,7 +47,7 @@
 * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
- * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without
+ * 3.
Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/postmaster/auditlogger.c b/src/backend/postmaster/auditlogger.c index 5aea9c14..81151f67 100644 --- a/src/backend/postmaster/auditlogger.c +++ b/src/backend/postmaster/auditlogger.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/postmaster/bgworker.c b/src/backend/postmaster/bgworker.c index 84682f1b..56b86ebe 100644 --- a/src/backend/postmaster/bgworker.c +++ b/src/backend/postmaster/bgworker.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/replication/logical/decode.c b/src/backend/replication/logical/decode.c index fa3a078e..8c15abd9 100644 --- a/src/backend/replication/logical/decode.c +++ b/src/backend/replication/logical/decode.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/replication/logical/relation.c b/src/backend/replication/logical/relation.c index ec8f6ac4..f7ceb93e 100644 --- a/src/backend/replication/logical/relation.c +++ b/src/backend/replication/logical/relation.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/replication/logical/reorderbuffer.c b/src/backend/replication/logical/reorderbuffer.c index 4fbc6120..ec5f82ef 100644 --- a/src/backend/replication/logical/reorderbuffer.c +++ b/src/backend/replication/logical/reorderbuffer.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/replication/repl_gram.y b/src/backend/replication/repl_gram.y index e812c5dd..299786f1 100644 --- a/src/backend/replication/repl_gram.y +++ b/src/backend/replication/repl_gram.y @@ -2,7 +2,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -48,7 +48,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/replication/repl_scanner.l b/src/backend/replication/repl_scanner.l index 93596914..7c8f30b4 100644 --- a/src/backend/replication/repl_scanner.l +++ b/src/backend/replication/repl_scanner.l @@ -2,7 +2,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -48,7 +48,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/replication/syncrep_scanner.l b/src/backend/replication/syncrep_scanner.l index 64fe25ac..8a5bf473 100644 --- a/src/backend/replication/syncrep_scanner.l +++ b/src/backend/replication/syncrep_scanner.l @@ -2,7 +2,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -48,7 +48,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. 
Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/replication/walreceiver.c b/src/backend/replication/walreceiver.c index ec3c962f..39a7434b 100644 --- a/src/backend/replication/walreceiver.c +++ b/src/backend/replication/walreceiver.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c index 2d5b5b95..306fb027 100644 --- a/src/backend/storage/buffer/freelist.c +++ b/src/backend/storage/buffer/freelist.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -771,4 +771,3 @@ GetAccessStrategy_part(int npart) return strategy; } #endif - diff --git a/src/backend/storage/file/buffile.c b/src/backend/storage/file/buffile.c index 20190dc7..6e81a3b8 100644 --- a/src/backend/storage/file/buffile.c +++ b/src/backend/storage/file/buffile.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. 
* * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -843,4 +843,3 @@ BufFile * BufFileOpen(char* fileName, int fileFlags, int fileMode, bool interXac } #endif - diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c index 67ae7984..26165089 100644 --- a/src/backend/storage/file/fd.c +++ b/src/backend/storage/file/fd.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/storage/freespace/emapage.c b/src/backend/storage/freespace/emapage.c index ade86acd..ea1ca701 100644 --- a/src/backend/storage/freespace/emapage.c +++ b/src/backend/storage/freespace/emapage.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -7437,4 +7437,3 @@ Datum pg_check_extent(PG_FUNCTION_ARGS) SRF_RETURN_DONE(funcctx); } - diff --git a/src/backend/storage/freespace/extent_xlog.c b/src/backend/storage/freespace/extent_xlog.c index 1f51a047..5fb5047a 100644 --- a/src/backend/storage/freespace/extent_xlog.c +++ b/src/backend/storage/freespace/extent_xlog.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -570,5 +570,3 @@ static void extend_heap(RelFileNode rnode, xl_extent_seteme *xlogrec) RelationExtendHeapForRedo(rnode, xlogrec->extentid, xlogrec->eme.shardid); } - - diff --git a/src/backend/storage/ipc/ipc.c b/src/backend/storage/ipc/ipc.c index 2b3a74d8..eed31a12 100644 --- a/src/backend/storage/ipc/ipc.c +++ b/src/backend/storage/ipc/ipc.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c index ae1a9029..4c17851c 100644 --- a/src/backend/storage/ipc/ipci.c +++ b/src/backend/storage/ipc/ipci.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/storage/ipc/procsignal.c b/src/backend/storage/ipc/procsignal.c index a709ead2..b39bc165 100644 --- a/src/backend/storage/ipc/procsignal.c +++ b/src/backend/storage/ipc/procsignal.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/storage/lmgr/lmgr.c b/src/backend/storage/lmgr/lmgr.c index 75cf2ab6..41a4f68b 100644 --- a/src/backend/storage/lmgr/lmgr.c +++ b/src/backend/storage/lmgr/lmgr.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/storage/lmgr/lock.c b/src/backend/storage/lmgr/lock.c index f1570bb3..2a08199b 100644 --- a/src/backend/storage/lmgr/lock.c +++ b/src/backend/storage/lmgr/lock.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. 
* - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -4638,4 +4638,3 @@ void StillHoldlock(void) } } #endif - diff --git a/src/backend/storage/lmgr/nodelock.c b/src/backend/storage/lmgr/nodelock.c index 060e0a0b..08a16ba5 100644 --- a/src/backend/storage/lmgr/nodelock.c +++ b/src/backend/storage/lmgr/nodelock.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c index 2269e8e9..3d265c9e 100644 --- a/src/backend/storage/smgr/md.c +++ b/src/backend/storage/smgr/md.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/tcop/dest.c b/src/backend/tcop/dest.c index 95e361a3..e5c20b27 100644 --- a/src/backend/tcop/dest.c +++ b/src/backend/tcop/dest.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/utils/adt/datetime.c b/src/backend/utils/adt/datetime.c index 6661ab49..3a0dc618 100644 --- a/src/backend/utils/adt/datetime.c +++ b/src/backend/utils/adt/datetime.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/utils/adt/format_type.c b/src/backend/utils/adt/format_type.c index a202942a..4e6ee08f 100644 --- a/src/backend/utils/adt/format_type.c +++ b/src/backend/utils/adt/format_type.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. 
Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/utils/adt/formatting.c b/src/backend/utils/adt/formatting.c index 5938bf2d..28bd464e 100644 --- a/src/backend/utils/adt/formatting.c +++ b/src/backend/utils/adt/formatting.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/utils/adt/json.c b/src/backend/utils/adt/json.c index e0fed865..34ece726 100644 --- a/src/backend/utils/adt/json.c +++ b/src/backend/utils/adt/json.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/utils/adt/misc.c b/src/backend/utils/adt/misc.c index 3cb719ee..3c4b9d65 100644 --- a/src/backend/utils/adt/misc.c +++ b/src/backend/utils/adt/misc.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/utils/adt/oid.c b/src/backend/utils/adt/oid.c index 8b28d653..598e23d5 100644 --- a/src/backend/utils/adt/oid.c +++ b/src/backend/utils/adt/oid.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index 80a4ce72..63c834bf 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/utils/adt/varchar.c b/src/backend/utils/adt/varchar.c index c60b452b..a32ef514 100644 --- a/src/backend/utils/adt/varchar.c +++ b/src/backend/utils/adt/varchar.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. 
All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -1519,4 +1519,3 @@ nvarchar2(PG_FUNCTION_ARGS) } #endif - diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c index 51d53bab..9c5795f5 100644 --- a/src/backend/utils/adt/varlena.c +++ b/src/backend/utils/adt/varlena.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/utils/adt/version.c b/src/backend/utils/adt/version.c index b11ef63b..5b6276a0 100644 --- a/src/backend/utils/adt/version.c +++ b/src/backend/utils/adt/version.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -93,4 +93,3 @@ tbase_version(PG_FUNCTION_ARGS) PG_RETURN_TEXT_P(cstring_to_text(TBASE_VERSION_STR)); } #endif - diff --git a/src/backend/utils/adt/xml.c b/src/backend/utils/adt/xml.c index 88552709..f94b8c70 100644 --- a/src/backend/utils/adt/xml.c +++ b/src/backend/utils/adt/xml.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/utils/cache/inval.c b/src/backend/utils/cache/inval.c index 59f529a8..ef5f194c 100644 --- a/src/backend/utils/cache/inval.c +++ b/src/backend/utils/cache/inval.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -1686,4 +1686,3 @@ void MlsRegisterRelcacheInvalidation(Oid dbId, Oid relId) } #endif - diff --git a/src/backend/utils/cache/relcryptmap.c b/src/backend/utils/cache/relcryptmap.c index 69fc1a06..63c43bfd 100644 --- a/src/backend/utils/cache/relcryptmap.c +++ b/src/backend/utils/cache/relcryptmap.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -2230,4 +2230,3 @@ void StartupReachConsistentState(void) #endif - diff --git a/src/backend/utils/init/globals.c b/src/backend/utils/init/globals.c index 2f466b9b..ab9558e8 100644 --- a/src/backend/utils/init/globals.c +++ b/src/backend/utils/init/globals.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/utils/misc/cls.c b/src/backend/utils/misc/cls.c index 916968ad..28e5f7a2 100644 --- a/src/backend/utils/misc/cls.c +++ b/src/backend/utils/misc/cls.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/utils/misc/datamask.c b/src/backend/utils/misc/datamask.c index a0101b84..321d6186 100644 --- a/src/backend/utils/misc/datamask.c +++ b/src/backend/utils/misc/datamask.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -1331,4 +1331,3 @@ bool datamask_check_column_in_expr(Node * node, void * context) #endif - diff --git a/src/backend/utils/misc/mls.c b/src/backend/utils/misc/mls.c index 4d7ea96e..60f3441d 100644 --- a/src/backend/utils/misc/mls.c +++ b/src/backend/utils/misc/mls.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -1858,4 +1858,3 @@ void check_tbase_mls_extension(void) errmsg("This operation is not allowed until the extension \"%s\" is installed.", MLS_EXTENSION_NAME))); } - diff --git a/src/backend/utils/misc/relcrypt.c b/src/backend/utils/misc/relcrypt.c index ca65b3b4..3f3f7335 100644 --- a/src/backend/utils/misc/relcrypt.c +++ b/src/backend/utils/misc/relcrypt.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/utils/misc/timeout.c b/src/backend/utils/misc/timeout.c index 8fd4212a..dd51de22 100644 --- a/src/backend/utils/misc/timeout.c +++ b/src/backend/utils/misc/timeout.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -742,4 +742,3 @@ disable_timeout_safely(void) disable_timeout(STATEMENT_TIMEOUT, false); } #endif - diff --git a/src/backend/utils/mmgr/aset.c b/src/backend/utils/mmgr/aset.c index 426191db..959a6505 100644 --- a/src/backend/utils/mmgr/aset.c +++ b/src/backend/utils/mmgr/aset.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/utils/mmgr/mcxt.c b/src/backend/utils/mmgr/mcxt.c index 34b4827b..a05ebd3f 100644 --- a/src/backend/utils/mmgr/mcxt.c +++ b/src/backend/utils/mmgr/mcxt.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/backend/utils/resowner/resowner.c b/src/backend/utils/resowner/resowner.c index 05ef66d8..333442c8 100644 --- a/src/backend/utils/resowner/resowner.c +++ b/src/backend/utils/resowner/resowner.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -1505,5 +1505,3 @@ uint32 GetResourceArrayLastidx(void) #endif - - diff --git a/src/backend/utils/sort/tuplestore.c b/src/backend/utils/sort/tuplestore.c index 02d0696d..595a34eb 100644 --- a/src/backend/utils/sort/tuplestore.c +++ b/src/backend/utils/sort/tuplestore.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. 
Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/bin/confmod/conf.c b/src/bin/confmod/conf.c index 8b779b0e..29dab18c 100644 --- a/src/bin/confmod/conf.c +++ b/src/bin/confmod/conf.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -167,5 +167,3 @@ read_vars(FILE *conf, const char * del) return lineno; } - - diff --git a/src/bin/confmod/conf.h b/src/bin/confmod/conf.h index 95bc5262..71bf56cc 100644 --- a/src/bin/confmod/conf.h +++ b/src/bin/confmod/conf.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/bin/confmod/confmod.c b/src/bin/confmod/confmod.c index d8a15eb2..3e0fc80e 100644 --- a/src/bin/confmod/confmod.c +++ b/src/bin/confmod/confmod.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -353,4 +353,3 @@ int main(int argc, char *argv[]) return 0; } - diff --git a/src/bin/confmod/log.c b/src/bin/confmod/log.c index 891b31b9..e04fbf6e 100644 --- a/src/bin/confmod/log.c +++ b/src/bin/confmod/log.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/bin/confmod/log.h b/src/bin/confmod/log.h index ae095bb3..078c4fda 100644 --- a/src/bin/confmod/log.h +++ b/src/bin/confmod/log.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/bin/confmod/stree.c b/src/bin/confmod/stree.c index cac9af32..c35b19ed 100644 --- a/src/bin/confmod/stree.c +++ b/src/bin/confmod/stree.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. 
+ * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -124,5 +124,3 @@ stree_pre_traverse(stree * root, void (*traverse)(void *)) stree_pre_traverse(root->right, traverse); } } - - diff --git a/src/bin/confmod/stree.h b/src/bin/confmod/stree.h index caa5bdfb..0ee16a20 100644 --- a/src/bin/confmod/stree.h +++ b/src/bin/confmod/stree.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/bin/confmod/util.c b/src/bin/confmod/util.c index dda4c4cb..d96a29a2 100644 --- a/src/bin/confmod/util.c +++ b/src/bin/confmod/util.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/bin/confmod/util.h b/src/bin/confmod/util.h index 3588481c..5adae595 100644 --- a/src/bin/confmod/util.h +++ b/src/bin/confmod/util.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/bin/confmod/var.c b/src/bin/confmod/var.c index 073c785b..6f827fda 100644 --- a/src/bin/confmod/var.c +++ b/src/bin/confmod/var.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -209,4 +209,3 @@ var_hash_2_stree() return root; } - diff --git a/src/bin/confmod/var.h b/src/bin/confmod/var.h index 053d0635..602fa44b 100644 --- a/src/bin/confmod/var.h +++ b/src/bin/confmod/var.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. 
Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/bin/pg_basebackup/pg_basebackup.c b/src/bin/pg_basebackup/pg_basebackup.c index 91af5b93..5317584f 100644 --- a/src/bin/pg_basebackup/pg_basebackup.c +++ b/src/bin/pg_basebackup/pg_basebackup.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/bin/pg_controldata/pg_controldata.c b/src/bin/pg_controldata/pg_controldata.c index 5c22ea00..1c00288c 100644 --- a/src/bin/pg_controldata/pg_controldata.c +++ b/src/bin/pg_controldata/pg_controldata.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/bin/pg_ctl/pg_ctl.c b/src/bin/pg_ctl/pg_ctl.c index 3079adb4..170fa3d3 100644 --- a/src/bin/pg_ctl/pg_ctl.c +++ b/src/bin/pg_ctl/pg_ctl.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/bin/pg_dump/compress_io.h b/src/bin/pg_dump/compress_io.h index 3fee84ef..2dd5a86a 100644 --- a/src/bin/pg_dump/compress_io.h +++ b/src/bin/pg_dump/compress_io.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/bin/pg_dump/pg_backup_archiver.h b/src/bin/pg_dump/pg_backup_archiver.h index d3139c75..cd1ac12f 100644 --- a/src/bin/pg_dump/pg_backup_archiver.h +++ b/src/bin/pg_dump/pg_backup_archiver.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/bin/pg_dump/pg_dump.h b/src/bin/pg_dump/pg_dump.h index 133a66a9..c8da4a4e 100644 --- a/src/bin/pg_dump/pg_dump.h +++ b/src/bin/pg_dump/pg_dump.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. 
All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/bin/pg_dump/pg_dump_security.c b/src/bin/pg_dump/pg_dump_security.c index dd7d5024..a9431897 100644 --- a/src/bin/pg_dump/pg_dump_security.c +++ b/src/bin/pg_dump/pg_dump_security.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -834,5 +834,3 @@ static void dump_pg_transparent_crypt_policy_map(PGconn *conn) destroyPQExpBuffer(query); return; } - - diff --git a/src/bin/pg_upgrade/exec.c b/src/bin/pg_upgrade/exec.c index 9199369c..b3d67cd5 100644 --- a/src/bin/pg_upgrade/exec.c +++ b/src/bin/pg_upgrade/exec.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/bin/psql/common.c b/src/bin/psql/common.c index ff401aa4..9991898f 100644 --- a/src/bin/psql/common.c +++ b/src/bin/psql/common.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -2269,4 +2269,3 @@ bool is_mls_or_audit_user_front(void) return false; } #endif - diff --git a/src/common/relpath.c b/src/common/relpath.c index a9abdcd7..08ddedec 100644 --- a/src/common/relpath.c +++ b/src/common/relpath.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/gtm/common/gtm_time.c b/src/gtm/common/gtm_time.c index ee451bbb..5b449b4e 100644 --- a/src/gtm/common/gtm_time.c +++ b/src/gtm/common/gtm_time.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. 
Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/gtm/common/heap.c b/src/gtm/common/heap.c index f9168d38..1f6b2be8 100644 --- a/src/gtm/common/heap.c +++ b/src/gtm/common/heap.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -456,5 +456,3 @@ void heap_foreach(heap* h, void (*func)(void*,void*)) { func(entry->key, entry->value); } } - - diff --git a/src/gtm/main/gtm_backup.c b/src/gtm/main/gtm_backup.c index 66d0ba87..50d60772 100644 --- a/src/gtm/main/gtm_backup.c +++ b/src/gtm/main/gtm_backup.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/gtm/main/gtm_store.c b/src/gtm/main/gtm_store.c index 5e0e5ea9..9ee1a245 100644 --- a/src/gtm/main/gtm_store.c +++ b/src/gtm/main/gtm_store.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. 
* @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/gtm/main/gtm_xlog.c b/src/gtm/main/gtm_xlog.c index 3d46942e..4a305a30 100644 --- a/src/gtm/main/gtm_xlog.c +++ b/src/gtm/main/gtm_xlog.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -4462,4 +4462,3 @@ load_sync_structures(void) load_syncconfig(); load_xlogsync(); } - diff --git a/src/gtm/main/replication.c b/src/gtm/main/replication.c index 723d4f40..faba0bd7 100644 --- a/src/gtm/main/replication.c +++ b/src/gtm/main/replication.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/gtm/path/path.c b/src/gtm/path/path.c index 5444cbc3..38d82e2e 100644 --- a/src/gtm/path/path.c +++ b/src/gtm/path/path.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. 
* - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/gtm/proxy/proxy_main.c b/src/gtm/proxy/proxy_main.c index 279471d3..fc4e2e49 100644 --- a/src/gtm/proxy/proxy_main.c +++ b/src/gtm/proxy/proxy_main.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/gtm/xlog_test/xlog_reader.c b/src/gtm/xlog_test/xlog_reader.c index 3e0d6520..4471425f 100644 --- a/src/gtm/xlog_test/xlog_reader.c +++ b/src/gtm/xlog_test/xlog_reader.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -3325,7 +3325,8 @@ void GTM_TimerRun(void) } } - void * + void +* GTM_TimerThread(void *argp) { GTM_ThreadInfo *thrinfo = (GTM_ThreadInfo *)argp; @@ -3598,4 +3599,3 @@ main(int argc, char *argv[]) Read_XLogRecovery(argv[1],seg,0); return 0; } - diff --git a/src/gtm/xlog_test/xlog_test.c b/src/gtm/xlog_test/xlog_test.c index 2972fd9d..6b756d38 100644 --- a/src/gtm/xlog_test/xlog_test.c +++ b/src/gtm/xlog_test/xlog_test.c @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -1508,7 +1508,8 @@ void bind_service_threads(void) } /* time keeper thread will not handle any signal, any signal will cause the thread exit. */ -void * +void +* GTM_ThreadTimeKeeper(void *argp) { GTM_ThreadInfo *my_threadinfo = (GTM_ThreadInfo *)argp; @@ -1621,7 +1622,8 @@ GTM_ThreadTimeKeeper(void *argp) /* time keeper thread will not handle any signal, any signal will cause the thread exit. */ -void * +void +* GTM_ThreadTimeBackup(void *argp) { GTM_ThreadInfo *my_threadinfo = (GTM_ThreadInfo *)argp; @@ -3925,7 +3927,8 @@ void GTM_TimerRun(void) } } - void * + void +* GTM_TimerThread(void *argp) { GTM_ThreadInfo *thrinfo = (GTM_ThreadInfo *)argp; diff --git a/src/include/access/gtm.h b/src/include/access/gtm.h index 4df9fc1e..3fda8250 100644 --- a/src/include/access/gtm.h +++ b/src/include/access/gtm.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/access/heapam_xlog.h b/src/include/access/heapam_xlog.h index 0a76bb5a..8ae2326f 100644 --- a/src/include/access/heapam_xlog.h +++ b/src/include/access/heapam_xlog.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/access/lru.h b/src/include/access/lru.h index 7d9980a3..4053e99b 100644 --- a/src/include/access/lru.h +++ b/src/include/access/lru.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/access/parallel.h b/src/include/access/parallel.h index 2e258ca4..b9f51485 100644 --- a/src/include/access/parallel.h +++ b/src/include/access/parallel.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. 
Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/access/printtup.h b/src/include/access/printtup.h index 0b395eb9..0af5c09d 100644 --- a/src/include/access/printtup.h +++ b/src/include/access/printtup.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/access/relcryptaccess.h b/src/include/access/relcryptaccess.h index 511f948d..7d31750d 100644 --- a/src/include/access/relcryptaccess.h +++ b/src/include/access/relcryptaccess.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h index 79e8cab7..af98fdf8 100644 --- a/src/include/access/relscan.h +++ b/src/include/access/relscan.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/access/replslotdesc.h b/src/include/access/replslotdesc.h index 4f6ad1e7..9fe16de4 100644 --- a/src/include/access/replslotdesc.h +++ b/src/include/access/replslotdesc.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/access/rmgrlist.h b/src/include/access/rmgrlist.h index 3321c275..bd4f49a1 100644 --- a/src/include/access/rmgrlist.h +++ b/src/include/access/rmgrlist.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/access/transam.h b/src/include/access/transam.h index d94c4d26..cbbaf4b3 100644 --- a/src/include/access/transam.h +++ b/src/include/access/transam.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. 
All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/access/tupdesc_details.h b/src/include/access/tupdesc_details.h index 84f6225a..e712d721 100644 --- a/src/include/access/tupdesc_details.h +++ b/src/include/access/tupdesc_details.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/access/twophase.h b/src/include/access/twophase.h index 06f9685e..9d7a06ba 100644 --- a/src/include/access/twophase.h +++ b/src/include/access/twophase.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/access/visibilitymap.h b/src/include/access/visibilitymap.h index dda3c7a4..7f916fa2 100644 --- a/src/include/access/visibilitymap.h +++ b/src/include/access/visibilitymap.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/access/xlogreader.h b/src/include/access/xlogreader.h index 55656d15..88d47831 100644 --- a/src/include/access/xlogreader.h +++ b/src/include/access/xlogreader.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/access/xlogrecord.h b/src/include/access/xlogrecord.h index 909518f7..55594331 100644 --- a/src/include/access/xlogrecord.h +++ b/src/include/access/xlogrecord.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. 
Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/audit/audit.h b/src/include/audit/audit.h index a3828441..a2eb5e80 100644 --- a/src/include/audit/audit.h +++ b/src/include/audit/audit.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -279,4 +279,3 @@ extern bool audituser(void); extern bool audituser_arg(Oid roleid); #endif - diff --git a/src/include/audit/audit_fga.h b/src/include/audit/audit_fga.h index c60c3549..f17336d0 100644 --- a/src/include/audit/audit_fga.h +++ b/src/include/audit/audit_fga.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -70,7 +70,7 @@ #define AUDIT_TRIGGER_FEEDBACK_LEN 256 extern bool enable_fga; -extern const char *g_commandTag; +extern const char *g_commandTag; /* simple list of strings */ @@ -142,4 +142,3 @@ extern void write_trigger_handle_to_shmem(Oid func); #endif /*AUDIT_FGA_H*/ - diff --git a/src/include/c.h b/src/include/c.h index 7a6ab8e2..9a4d2e4e 100644 --- a/src/include/c.h +++ b/src/include/c.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. 
* - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/catalog/audit/pg_audit_d.h b/src/include/catalog/audit/pg_audit_d.h index e7d9bfe9..12488366 100644 --- a/src/include/catalog/audit/pg_audit_d.h +++ b/src/include/catalog/audit/pg_audit_d.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -90,5 +90,3 @@ typedef FormData_audit_obj_def_opts *Form_audit_obj_def_opts; /* DATA(insert OID = 5104 (10 4 n f)); */ #endif /* PGXC_AUDIT_DEFAULT_H */ - - diff --git a/src/include/catalog/audit/pg_audit_fga.h b/src/include/catalog/audit/pg_audit_fga.h index 23764839..f281fa42 100644 --- a/src/include/catalog/audit/pg_audit_fga.h +++ b/src/include/catalog/audit/pg_audit_fga.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -104,5 +104,3 @@ typedef FormData_audit_fga_conf *Form_audit_fga_conf; #define Anum_audit_fga_conf_audit_column_opts 13 #endif /* PGXC_AUDIT_FGA_H */ - - diff --git a/src/include/catalog/audit/pg_audit_o.h b/src/include/catalog/audit/pg_audit_o.h index 18b4b206..f10399cc 100644 --- a/src/include/catalog/audit/pg_audit_o.h +++ b/src/include/catalog/audit/pg_audit_o.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -98,5 +98,3 @@ typedef FormData_audit_obj_conf *Form_audit_obj_conf; /* DATA(insert OID = 5111 ( 4200 2617 95 0 6 n f)); */ #endif /* PGXC_AUDIT_OBJ_H */ - - diff --git a/src/include/catalog/audit/pg_audit_s.h b/src/include/catalog/audit/pg_audit_s.h index f5aafe30..ca476406 100644 --- a/src/include/catalog/audit/pg_audit_s.h +++ b/src/include/catalog/audit/pg_audit_s.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -93,5 +93,3 @@ typedef FormData_audit_stmt_conf *Form_audit_stmt_conf; /* DATA(insert OID = 5119 ( 4200 6 n f)); */ #endif /* PGXC_AUDIT_STMT_H */ - - diff --git a/src/include/catalog/audit/pg_audit_u.h b/src/include/catalog/audit/pg_audit_u.h index fc171782..164abea2 100644 --- a/src/include/catalog/audit/pg_audit_u.h +++ b/src/include/catalog/audit/pg_audit_u.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. 
* - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -98,5 +98,3 @@ typedef FormData_audit_user_conf *Form_audit_user_conf; /* DATA(insert OID = 5130 ( 4200 3377 6 n f)); */ #endif /* PGXC_AUDIT_USER_H */ - - diff --git a/src/include/catalog/catalog.h b/src/include/catalog/catalog.h index 1394114e..542ca147 100644 --- a/src/include/catalog/catalog.h +++ b/src/include/catalog/catalog.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/catalog/dependency.h b/src/include/catalog/dependency.h index 589bbaab..70fdc4e0 100644 --- a/src/include/catalog/dependency.h +++ b/src/include/catalog/dependency.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/catalog/index.h b/src/include/catalog/index.h index d37be02e..abf5041f 100644 --- a/src/include/catalog/index.h +++ b/src/include/catalog/index.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/catalog/mls/pg_cls_compartment.h b/src/include/catalog/mls/pg_cls_compartment.h index 356c72f1..888b1d1c 100644 --- a/src/include/catalog/mls/pg_cls_compartment.h +++ b/src/include/catalog/mls/pg_cls_compartment.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/catalog/mls/pg_cls_group.h b/src/include/catalog/mls/pg_cls_group.h index 136681ec..b0a776cc 100644 --- a/src/include/catalog/mls/pg_cls_group.h +++ b/src/include/catalog/mls/pg_cls_group.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. 
Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/catalog/mls/pg_cls_label.h b/src/include/catalog/mls/pg_cls_label.h index 7b788917..f07b41c8 100644 --- a/src/include/catalog/mls/pg_cls_label.h +++ b/src/include/catalog/mls/pg_cls_label.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/catalog/mls/pg_cls_level.h b/src/include/catalog/mls/pg_cls_level.h index 5ddaf7d9..956e360a 100644 --- a/src/include/catalog/mls/pg_cls_level.h +++ b/src/include/catalog/mls/pg_cls_level.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/catalog/mls/pg_cls_policy.h b/src/include/catalog/mls/pg_cls_policy.h index 189d6d0e..bf5a30d4 100644 --- a/src/include/catalog/mls/pg_cls_policy.h +++ b/src/include/catalog/mls/pg_cls_policy.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. 
* * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -83,5 +83,3 @@ typedef FormData_cls_policy * Form_pg_cls_policy; #define Anum_pg_cls_policy_reloption 4 #endif /* PG_CLS_POLICY_H */ - - diff --git a/src/include/catalog/mls/pg_cls_table.h b/src/include/catalog/mls/pg_cls_table.h index 46a5c4a5..7933869c 100644 --- a/src/include/catalog/mls/pg_cls_table.h +++ b/src/include/catalog/mls/pg_cls_table.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/catalog/mls/pg_cls_user.h b/src/include/catalog/mls/pg_cls_user.h index dda745c1..144b2395 100644 --- a/src/include/catalog/mls/pg_cls_user.h +++ b/src/include/catalog/mls/pg_cls_user.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/catalog/mls/pg_data_mask_map.h b/src/include/catalog/mls/pg_data_mask_map.h index ec6132dc..1e49dc26 100644 --- a/src/include/catalog/mls/pg_data_mask_map.h +++ b/src/include/catalog/mls/pg_data_mask_map.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/catalog/mls/pg_data_mask_user.h b/src/include/catalog/mls/pg_data_mask_user.h index e703d58f..664979cf 100644 --- a/src/include/catalog/mls/pg_data_mask_user.h +++ b/src/include/catalog/mls/pg_data_mask_user.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/catalog/namespace.h b/src/include/catalog/namespace.h index 16ef82ae..ef656a77 100644 --- a/src/include/catalog/namespace.h +++ b/src/include/catalog/namespace.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. 
Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/catalog/objectaddress.h b/src/include/catalog/objectaddress.h index ec2cd56f..8fbafdb1 100644 --- a/src/include/catalog/objectaddress.h +++ b/src/include/catalog/objectaddress.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/catalog/pg_audit.h b/src/include/catalog/pg_audit.h index 54c58959..c1ac3d76 100644 --- a/src/include/catalog/pg_audit.h +++ b/src/include/catalog/pg_audit.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/catalog/pg_authid.h b/src/include/catalog/pg_authid.h index 5d9a9b26..ad4ddad1 100644 --- a/src/include/catalog/pg_authid.h +++ b/src/include/catalog/pg_authid.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/catalog/pg_cast.h b/src/include/catalog/pg_cast.h index 2bcfc409..6495aec1 100644 --- a/src/include/catalog/pg_cast.h +++ b/src/include/catalog/pg_cast.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/catalog/pg_mls.h b/src/include/catalog/pg_mls.h index b2c670fb..3af65945 100644 --- a/src/include/catalog/pg_mls.h +++ b/src/include/catalog/pg_mls.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/catalog/pg_namespace.h b/src/include/catalog/pg_namespace.h index 91ef4b66..b4577e66 100644 --- a/src/include/catalog/pg_namespace.h +++ b/src/include/catalog/pg_namespace.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. 
All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/catalog/pg_operator.h b/src/include/catalog/pg_operator.h index 2b3bf872..20316d65 100644 --- a/src/include/catalog/pg_operator.h +++ b/src/include/catalog/pg_operator.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/catalog/pg_partition_interval.h b/src/include/catalog/pg_partition_interval.h index 4b10f876..d65eb686 100644 --- a/src/include/catalog/pg_partition_interval.h +++ b/src/include/catalog/pg_partition_interval.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h index 1eb1b97f..f5ebe168 100644 --- a/src/include/catalog/pg_proc.h +++ b/src/include/catalog/pg_proc.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/catalog/pg_publication.h b/src/include/catalog/pg_publication.h index 4e3a8241..fea62666 100644 --- a/src/include/catalog/pg_publication.h +++ b/src/include/catalog/pg_publication.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/catalog/pg_publication_shard.h b/src/include/catalog/pg_publication_shard.h index b42e86d4..17ea8b51 100644 --- a/src/include/catalog/pg_publication_shard.h +++ b/src/include/catalog/pg_publication_shard.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. 
Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -111,4 +111,3 @@ typedef FormData_pg_publication_shard *Form_pg_publication_shard; #define Anum_pg_publication_shard_prshardid 2 #endif /* PG_PUBLICATION_SHARD_H */ - diff --git a/src/include/catalog/pg_subscription_shard.h b/src/include/catalog/pg_subscription_shard.h index cb73b335..819ae03e 100644 --- a/src/include/catalog/pg_subscription_shard.h +++ b/src/include/catalog/pg_subscription_shard.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -102,5 +102,3 @@ typedef FormData_pg_subscription_shard *Form_pg_subscription_shard; #define Anum_pg_subscription_shard_pubname 3 #endif /* PG_SUBSCRIPTION_SHARD_H */ - - diff --git a/src/include/catalog/pg_subscription_table.h b/src/include/catalog/pg_subscription_table.h index c5a53621..80ae633d 100644 --- a/src/include/catalog/pg_subscription_table.h +++ b/src/include/catalog/pg_subscription_table.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -102,4 +102,3 @@ typedef FormData_pg_subscription_table *Form_pg_subscription_table; #define Anum_pg_subscription_table_pubname 3 #endif /* PG_SUBSCRIPTION_TABLE_H */ - diff --git a/src/include/catalog/pg_type.h b/src/include/catalog/pg_type.h index 79cdcf48..97e2c573 100644 --- a/src/include/catalog/pg_type.h +++ b/src/include/catalog/pg_type.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/catalog/pgxc_key_values.h b/src/include/catalog/pgxc_key_values.h index 982507f0..01bff97e 100644 --- a/src/include/catalog/pgxc_key_values.h +++ b/src/include/catalog/pgxc_key_values.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -102,4 +102,4 @@ extern char *BuildRelationCheckoverlapsStr(DistributeBy *distributeby, PGXCSubCl extern Oid GetKeyValuesGroup(Oid db, Oid rel, char *value, Oid *coldgroup); -extern bool IsKeyValues(Oid db, Oid rel, char *value); \ No newline at end of file +extern bool IsKeyValues(Oid db, Oid rel, char *value); diff --git a/src/include/catalog/pgxc_shard_map.h b/src/include/catalog/pgxc_shard_map.h index 721c237c..8429b1fe 100644 --- a/src/include/catalog/pgxc_shard_map.h +++ b/src/include/catalog/pgxc_shard_map.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. 
+ * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/catalog/storage_xlog.h b/src/include/catalog/storage_xlog.h index eace4c4e..0ec10fa3 100644 --- a/src/include/catalog/storage_xlog.h +++ b/src/include/catalog/storage_xlog.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/commands/cluster.h b/src/include/commands/cluster.h index c3da4cec..f7e9e845 100644 --- a/src/include/commands/cluster.h +++ b/src/include/commands/cluster.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/commands/prepare.h b/src/include/commands/prepare.h index a5d6383e..a3433532 100644 --- a/src/include/commands/prepare.h +++ b/src/include/commands/prepare.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/commands/publicationcmds.h b/src/include/commands/publicationcmds.h index 16a0d0f9..f11c030c 100644 --- a/src/include/commands/publicationcmds.h +++ b/src/include/commands/publicationcmds.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/commands/relcryptcommand.h b/src/include/commands/relcryptcommand.h index a34abdb5..bbdf6bc4 100644 --- a/src/include/commands/relcryptcommand.h +++ b/src/include/commands/relcryptcommand.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. 
Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/commands/schemacmds.h b/src/include/commands/schemacmds.h index 16eb00a5..5acc6fb2 100644 --- a/src/include/commands/schemacmds.h +++ b/src/include/commands/schemacmds.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/commands/sequence.h b/src/include/commands/sequence.h index a9f5ddca..e0c0a554 100644 --- a/src/include/commands/sequence.h +++ b/src/include/commands/sequence.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/commands/subscriptioncmds.h b/src/include/commands/subscriptioncmds.h index b1f9ab20..a0ce73db 100644 --- a/src/include/commands/subscriptioncmds.h +++ b/src/include/commands/subscriptioncmds.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. 
* * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/commands/vacuum.h b/src/include/commands/vacuum.h index 47859b0b..b7af5023 100644 --- a/src/include/commands/vacuum.h +++ b/src/include/commands/vacuum.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/contrib/pgcrypto/blf.h b/src/include/contrib/pgcrypto/blf.h index 71345083..938fc4bf 100644 --- a/src/include/contrib/pgcrypto/blf.h +++ b/src/include/contrib/pgcrypto/blf.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/contrib/pgcrypto/fortuna.h b/src/include/contrib/pgcrypto/fortuna.h index 61a2ea6b..b1b812a5 100644 --- a/src/include/contrib/pgcrypto/fortuna.h +++ b/src/include/contrib/pgcrypto/fortuna.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/contrib/pgcrypto/imath.h b/src/include/contrib/pgcrypto/imath.h index 0695eed8..3d3a1456 100644 --- a/src/include/contrib/pgcrypto/imath.h +++ b/src/include/contrib/pgcrypto/imath.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/contrib/pgcrypto/mbuf.h b/src/include/contrib/pgcrypto/mbuf.h index ad0768f0..92eb8fb9 100644 --- a/src/include/contrib/pgcrypto/mbuf.h +++ b/src/include/contrib/pgcrypto/mbuf.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. 
Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/contrib/pgcrypto/md5.h b/src/include/contrib/pgcrypto/md5.h index 9fbc95d7..aefa9725 100644 --- a/src/include/contrib/pgcrypto/md5.h +++ b/src/include/contrib/pgcrypto/md5.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/contrib/pgcrypto/pgcrypto.h b/src/include/contrib/pgcrypto/pgcrypto.h index 68bb8ead..377b69b9 100644 --- a/src/include/contrib/pgcrypto/pgcrypto.h +++ b/src/include/contrib/pgcrypto/pgcrypto.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/contrib/pgcrypto/pgp.h b/src/include/contrib/pgcrypto/pgp.h index afaffaa1..615f7f96 100644 --- a/src/include/contrib/pgcrypto/pgp.h +++ b/src/include/contrib/pgcrypto/pgp.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. 
* * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -393,5 +393,3 @@ decrypt_internal(int is_pubenc, int need_text, text *data, extern bytea * encrypt_internal(int is_pubenc, int is_text, text *data, text *key, text *args); - - diff --git a/src/include/contrib/pgcrypto/px-crypt.h b/src/include/contrib/pgcrypto/px-crypt.h index 401eeeec..a6a2919d 100644 --- a/src/include/contrib/pgcrypto/px-crypt.h +++ b/src/include/contrib/pgcrypto/px-crypt.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/contrib/pgcrypto/px.h b/src/include/contrib/pgcrypto/px.h index afd7c44a..a6829357 100644 --- a/src/include/contrib/pgcrypto/px.h +++ b/src/include/contrib/pgcrypto/px.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/contrib/pgcrypto/rijndael.h b/src/include/contrib/pgcrypto/rijndael.h index d876b1b2..df23fbd3 100644 --- a/src/include/contrib/pgcrypto/rijndael.h +++ b/src/include/contrib/pgcrypto/rijndael.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/contrib/pgcrypto/sha1.h b/src/include/contrib/pgcrypto/sha1.h index c1f207e9..df33c32b 100644 --- a/src/include/contrib/pgcrypto/sha1.h +++ b/src/include/contrib/pgcrypto/sha1.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/contrib/pgcrypto/sha2.h b/src/include/contrib/pgcrypto/sha2.h index 20d03790..25ccca2d 100644 --- a/src/include/contrib/pgcrypto/sha2.h +++ b/src/include/contrib/pgcrypto/sha2.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. 
Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/contrib/sm/sm4.h b/src/include/contrib/sm/sm4.h index b9bb3693..37cc84d3 100644 --- a/src/include/contrib/sm/sm4.h +++ b/src/include/contrib/sm/sm4.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/executor/execdesc.h b/src/include/executor/execdesc.h index 00e26823..b9929034 100644 --- a/src/include/executor/execdesc.h +++ b/src/include/executor/execdesc.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/executor/hashjoin.h b/src/include/executor/hashjoin.h index efafc8da..f65f8f3a 100644 --- a/src/include/executor/hashjoin.h +++ b/src/include/executor/hashjoin.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/executor/tqueue.h b/src/include/executor/tqueue.h index d4d12a8e..989e6914 100644 --- a/src/include/executor/tqueue.h +++ b/src/include/executor/tqueue.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/gtm/elog.h b/src/include/gtm/elog.h index ffdc99c5..52545c27 100644 --- a/src/include/gtm/elog.h +++ b/src/include/gtm/elog.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/gtm/gtm_checkpoint.h b/src/include/gtm/gtm_checkpoint.h index 5b37c8d6..7cf7992c 100644 --- a/src/include/gtm/gtm_checkpoint.h +++ b/src/include/gtm/gtm_checkpoint.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. 
+ * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/gtm/gtm_conn.h b/src/include/gtm/gtm_conn.h index 6709b2b1..8e257db7 100644 --- a/src/include/gtm/gtm_conn.h +++ b/src/include/gtm/gtm_conn.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/gtm/gtm_gxid.h b/src/include/gtm/gtm_gxid.h index 978286e3..4461b0a3 100644 --- a/src/include/gtm/gtm_gxid.h +++ b/src/include/gtm/gtm_gxid.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/gtm/gtm_lock.h b/src/include/gtm/gtm_lock.h index a5aa93a4..15fd6194 100644 --- a/src/include/gtm/gtm_lock.h +++ b/src/include/gtm/gtm_lock.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/gtm/gtm_store.h b/src/include/gtm/gtm_store.h index 81205f8b..2e84f1a0 100644 --- a/src/include/gtm/gtm_store.h +++ b/src/include/gtm/gtm_store.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/gtm/gtm_xlog.h b/src/include/gtm/gtm_xlog.h index 986f73c3..76ebb36e 100644 --- a/src/include/gtm/gtm_xlog.h +++ b/src/include/gtm/gtm_xlog.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. 
Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -291,4 +291,3 @@ extern void RegisterNewSyncStandby(GTM_StandbyReplication *replication); extern void RemoveSyncStandby(GTM_StandbyReplication *replication); extern void load_sync_structures(void); #endif /* GTM_XLOG_H */ - diff --git a/src/include/gtm/gtm_xlog_internal.h b/src/include/gtm/gtm_xlog_internal.h index 78353939..e8ee1ef8 100644 --- a/src/include/gtm/gtm_xlog_internal.h +++ b/src/include/gtm/gtm_xlog_internal.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/gtm/heap.h b/src/include/gtm/heap.h index d2370cca..340cd217 100644 --- a/src/include/gtm/heap.h +++ b/src/include/gtm/heap.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -143,4 +143,3 @@ void heap_foreach(heap* h, void (*func)(void*,void*)); void heap_destroy(heap* h); #endif - diff --git a/src/include/gtm/libpq-be.h b/src/include/gtm/libpq-be.h index e8243e34..a24c58cd 100644 --- a/src/include/gtm/libpq-be.h +++ b/src/include/gtm/libpq-be.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. 
All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/gtm/register.h b/src/include/gtm/register.h index 0100096d..57216a2f 100644 --- a/src/include/gtm/register.h +++ b/src/include/gtm/register.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/libpq/auth.h b/src/include/libpq/auth.h index bc6ef2f6..04cee3b6 100644 --- a/src/include/libpq/auth.h +++ b/src/include/libpq/auth.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/libpq/libpq-be.h b/src/include/libpq/libpq-be.h index 00737906..e40b9623 100644 --- a/src/include/libpq/libpq-be.h +++ b/src/include/libpq/libpq-be.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/libpq/pqcomm.h b/src/include/libpq/pqcomm.h index c41c0bbf..914c8a66 100644 --- a/src/include/libpq/pqcomm.h +++ b/src/include/libpq/pqcomm.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h index 0d948295..65ca053e 100644 --- a/src/include/miscadmin.h +++ b/src/include/miscadmin.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. 
Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/nodes/bitmapset.h b/src/include/nodes/bitmapset.h index fc101c8f..7e39d00d 100644 --- a/src/include/nodes/bitmapset.h +++ b/src/include/nodes/bitmapset.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/nodes/makefuncs.h b/src/include/nodes/makefuncs.h index 6b1997ea..92cdb50a 100644 --- a/src/include/nodes/makefuncs.h +++ b/src/include/nodes/makefuncs.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/nodes/nodeFuncs.h b/src/include/nodes/nodeFuncs.h index cfb41c3c..36d4965d 100644 --- a/src/include/nodes/nodeFuncs.h +++ b/src/include/nodes/nodeFuncs.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/nodes/relation.h b/src/include/nodes/relation.h index 2be2556e..73e6c150 100644 --- a/src/include/nodes/relation.h +++ b/src/include/nodes/relation.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/optimizer/pathnode.h b/src/include/optimizer/pathnode.h index 59dddbe6..c74130c6 100644 --- a/src/include/optimizer/pathnode.h +++ b/src/include/optimizer/pathnode.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/optimizer/plancat.h b/src/include/optimizer/plancat.h index dc0eb282..21fcc48d 100644 --- a/src/include/optimizer/plancat.h +++ b/src/include/optimizer/plancat.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. 
All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/optimizer/planmain.h b/src/include/optimizer/planmain.h index dc4ca53e..dae7b638 100644 --- a/src/include/optimizer/planmain.h +++ b/src/include/optimizer/planmain.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/optimizer/planner.h b/src/include/optimizer/planner.h index 2e47c1e3..6b5f4781 100644 --- a/src/include/optimizer/planner.h +++ b/src/include/optimizer/planner.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/optimizer/subselect.h b/src/include/optimizer/subselect.h index bfba85b3..ef3265f6 100644 --- a/src/include/optimizer/subselect.h +++ b/src/include/optimizer/subselect.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/optimizer/var.h b/src/include/optimizer/var.h index 5cbb90d2..a762f1fa 100644 --- a/src/include/optimizer/var.h +++ b/src/include/optimizer/var.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/oracle/oracle.h b/src/include/oracle/oracle.h index 207a6cba..2e7e37f9 100644 --- a/src/include/oracle/oracle.h +++ b/src/include/oracle/oracle.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. 
Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/parser/analyze.h b/src/include/parser/analyze.h index 1efb963c..0bba2711 100644 --- a/src/include/parser/analyze.h +++ b/src/include/parser/analyze.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/parser/parse_node.h b/src/include/parser/parse_node.h index 5ae643ce..8bc4609c 100644 --- a/src/include/parser/parse_node.h +++ b/src/include/parser/parse_node.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/parser/parse_utilcmd.h b/src/include/parser/parse_utilcmd.h index b6a0be60..4c5ba439 100644 --- a/src/include/parser/parse_utilcmd.h +++ b/src/include/parser/parse_utilcmd.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/parser/parser.h b/src/include/parser/parser.h index e4da6bbf..bcfa1ad3 100644 --- a/src/include/parser/parser.h +++ b/src/include/parser/parser.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/pg_config_manual.h b/src/include/pg_config_manual.h index 8d110b88..e0d7f306 100644 --- a/src/include/pg_config_manual.h +++ b/src/include/pg_config_manual.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/pgxc/groupmgr.h b/src/include/pgxc/groupmgr.h index d27d6c2b..a5be4a63 100644 --- a/src/include/pgxc/groupmgr.h +++ b/src/include/pgxc/groupmgr.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. 
+ * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/pgxc/pgxc.h b/src/include/pgxc/pgxc.h index 264bfbed..0ae12298 100644 --- a/src/include/pgxc/pgxc.h +++ b/src/include/pgxc/pgxc.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/pgxc/planner.h b/src/include/pgxc/planner.h index 2acef598..fbbac577 100644 --- a/src/include/pgxc/planner.h +++ b/src/include/pgxc/planner.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/pgxc/shard_vacuum.h b/src/include/pgxc/shard_vacuum.h index eac39965..7559b8d4 100644 --- a/src/include/pgxc/shard_vacuum.h +++ b/src/include/pgxc/shard_vacuum.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/pgxc/shardmap.h b/src/include/pgxc/shardmap.h index 0674185d..8ad6b88e 100644 --- a/src/include/pgxc/shardmap.h +++ b/src/include/pgxc/shardmap.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/postgres_ext.h b/src/include/postgres_ext.h index d02f346c..4d3bed00 100644 --- a/src/include/postgres_ext.h +++ b/src/include/postgres_ext.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. 
Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/postmaster/auditlogger.h b/src/include/postmaster/auditlogger.h index 2ec37bc1..0087b854 100644 --- a/src/include/postmaster/auditlogger.h +++ b/src/include/postmaster/auditlogger.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/postmaster/bgworker.h b/src/include/postmaster/bgworker.h index c862bec5..f54138eb 100644 --- a/src/include/postmaster/bgworker.h +++ b/src/include/postmaster/bgworker.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/postmaster/bgwriter.h b/src/include/postmaster/bgwriter.h index a9530074..69ed2fef 100644 --- a/src/include/postmaster/bgwriter.h +++ b/src/include/postmaster/bgwriter.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/postmaster/pgarch.h b/src/include/postmaster/pgarch.h index 7bc5eec1..682efcc1 100644 --- a/src/include/postmaster/pgarch.h +++ b/src/include/postmaster/pgarch.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/replication/decode.h b/src/include/replication/decode.h index e6a4211b..8e3d49b6 100644 --- a/src/include/replication/decode.h +++ b/src/include/replication/decode.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/replication/logical_statistic.h b/src/include/replication/logical_statistic.h index db94ca6d..8e63192a 100644 --- a/src/include/replication/logical_statistic.h +++ b/src/include/replication/logical_statistic.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. 
* - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, @@ -137,4 +137,3 @@ extern Datum tbase_remove_pubtable_stat(PG_FUNCTION_ARGS); extern Datum tbase_remove_sub_stat(PG_FUNCTION_ARGS); extern Datum tbase_remove_subtable_stat(PG_FUNCTION_ARGS); #endif /* PG_SUBSCRIPTION_STATISTIC_H */ - diff --git a/src/include/replication/logicallauncher.h b/src/include/replication/logicallauncher.h index 60a23a95..c885867f 100644 --- a/src/include/replication/logicallauncher.h +++ b/src/include/replication/logicallauncher.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/replication/logicalrelation.h b/src/include/replication/logicalrelation.h index 0c6af1cf..fdf5b629 100644 --- a/src/include/replication/logicalrelation.h +++ b/src/include/replication/logicalrelation.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. 
Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/replication/walreceiver.h b/src/include/replication/walreceiver.h index d50bc882..3935cf95 100644 --- a/src/include/replication/walreceiver.h +++ b/src/include/replication/walreceiver.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/replication/worker_internal.h b/src/include/replication/worker_internal.h index 5673c572..0def2474 100644 --- a/src/include/replication/worker_internal.h +++ b/src/include/replication/worker_internal.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h index 6eae2368..3979c675 100644 --- a/src/include/storage/buf_internals.h +++ b/src/include/storage/buf_internals.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/storage/buffile.h b/src/include/storage/buffile.h index 05f6ec03..e7cdfe00 100644 --- a/src/include/storage/buffile.h +++ b/src/include/storage/buffile.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/storage/extent_xlog.h b/src/include/storage/extent_xlog.h index 93d6d8a6..c4e1b66c 100644 --- a/src/include/storage/extent_xlog.h +++ b/src/include/storage/extent_xlog.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/storage/extentmapping.h b/src/include/storage/extentmapping.h index 13d4017c..f401957d 100644 --- a/src/include/storage/extentmapping.h +++ b/src/include/storage/extentmapping.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. 
* - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/storage/lmgr.h b/src/include/storage/lmgr.h index 1924df26..3cc900ca 100644 --- a/src/include/storage/lmgr.h +++ b/src/include/storage/lmgr.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h index ae936afc..55a3a7de 100644 --- a/src/include/storage/lwlock.h +++ b/src/include/storage/lwlock.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/storage/nodelock.h b/src/include/storage/nodelock.h index 6a87d592..cb66cc0c 100644 --- a/src/include/storage/nodelock.h +++ b/src/include/storage/nodelock.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h index 32dea324..ee30fe95 100644 --- a/src/include/storage/proc.h +++ b/src/include/storage/proc.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/storage/procsignal.h b/src/include/storage/procsignal.h index 96064c66..e9baa497 100644 --- a/src/include/storage/procsignal.h +++ b/src/include/storage/procsignal.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. 
Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/storage/relcryptstorage.h b/src/include/storage/relcryptstorage.h index 19e4eae2..c310316e 100644 --- a/src/include/storage/relcryptstorage.h +++ b/src/include/storage/relcryptstorage.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/storage/relfilenode.h b/src/include/storage/relfilenode.h index b1095952..799fd31a 100644 --- a/src/include/storage/relfilenode.h +++ b/src/include/storage/relfilenode.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/tcop/dest.h b/src/include/tcop/dest.h index 4e72e4a2..2cc43d88 100644 --- a/src/include/tcop/dest.h +++ b/src/include/tcop/dest.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/tcop/pquery.h b/src/include/tcop/pquery.h index 1c920257..5401d969 100644 --- a/src/include/tcop/pquery.h +++ b/src/include/tcop/pquery.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/tcop/tcopprot.h b/src/include/tcop/tcopprot.h index 65bbd737..ccc140db 100644 --- a/src/include/tcop/tcopprot.h +++ b/src/include/tcop/tcopprot.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/utils/builtins.h b/src/include/utils/builtins.h index 4218d9d2..228da956 100644 --- a/src/include/utils/builtins.h +++ b/src/include/utils/builtins.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. 
All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/utils/cls.h b/src/include/utils/cls.h index 2d753eec..75709956 100644 --- a/src/include/utils/cls.h +++ b/src/include/utils/cls.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/utils/datamask.h b/src/include/utils/datamask.h index 64a4a48c..ca19cfc9 100644 --- a/src/include/utils/datamask.h +++ b/src/include/utils/datamask.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/utils/elog.h b/src/include/utils/elog.h index ef6b381f..7f170aca 100644 --- a/src/include/utils/elog.h +++ b/src/include/utils/elog.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/utils/guc_tables.h b/src/include/utils/guc_tables.h index 2a890162..747ff219 100644 --- a/src/include/utils/guc_tables.h +++ b/src/include/utils/guc_tables.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/utils/inval.h b/src/include/utils/inval.h index b4ef58b1..45dbeadc 100644 --- a/src/include/utils/inval.h +++ b/src/include/utils/inval.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. 
Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/utils/memutils.h b/src/include/utils/memutils.h index 74404bf1..10d904ff 100644 --- a/src/include/utils/memutils.h +++ b/src/include/utils/memutils.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/utils/mls.h b/src/include/utils/mls.h index 67cd004b..c9b23731 100644 --- a/src/include/utils/mls.h +++ b/src/include/utils/mls.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/utils/mls_extension.h b/src/include/utils/mls_extension.h index d633b596..817adafa 100644 --- a/src/include/utils/mls_extension.h +++ b/src/include/utils/mls_extension.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/utils/pg_locale.h b/src/include/utils/pg_locale.h index deffc0d2..37c37faf 100644 --- a/src/include/utils/pg_locale.h +++ b/src/include/utils/pg_locale.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/utils/plancache.h b/src/include/utils/plancache.h index e04b03d8..0b6e53fa 100644 --- a/src/include/utils/plancache.h +++ b/src/include/utils/plancache.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/utils/portal.h b/src/include/utils/portal.h index d662e3f6..ec184420 100644 --- a/src/include/utils/portal.h +++ b/src/include/utils/portal.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. 
+ * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/utils/relcrypt.h b/src/include/utils/relcrypt.h index fc071bad..cd847b88 100644 --- a/src/include/utils/relcrypt.h +++ b/src/include/utils/relcrypt.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/utils/relcryptcache.h b/src/include/utils/relcryptcache.h index 7b8ac22b..737f4791 100644 --- a/src/include/utils/relcryptcache.h +++ b/src/include/utils/relcryptcache.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/utils/relcryptmap.h b/src/include/utils/relcryptmap.h index ff434bbe..6451659c 100644 --- a/src/include/utils/relcryptmap.h +++ b/src/include/utils/relcryptmap.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/utils/relcryptmisc.h b/src/include/utils/relcryptmisc.h index 354beb42..1ecafb08 100644 --- a/src/include/utils/relcryptmisc.h +++ b/src/include/utils/relcryptmisc.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without + * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, diff --git a/src/include/utils/resowner_private.h b/src/include/utils/resowner_private.h index 34996bfa..8990610a 100644 --- a/src/include/utils/resowner_private.h +++ b/src/include/utils/resowner_private.h @@ -1,7 +1,7 @@ /* * Tencent is pleased to support the open source community by making TBase available. * - * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * Copyright (C) 2019 Tencent. All rights reserved. * * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below. * @@ -47,7 +47,7 @@ * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - * 3. 
Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without
+ * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without
 * specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
diff --git a/src/include/utils/ruleutils.h b/src/include/utils/ruleutils.h
index 03d502f9..85ad751c 100644
--- a/src/include/utils/ruleutils.h
+++ b/src/include/utils/ruleutils.h
@@ -1,7 +1,7 @@
 /*
 * Tencent is pleased to support the open source community by making TBase available.
 *
- * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+ * Copyright (C) 2019 Tencent. All rights reserved.
 *
 * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below.
 *
@@ -47,7 +47,7 @@
 * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
- * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without
+ * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without
 * specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
diff --git a/src/include/utils/snapshot.h b/src/include/utils/snapshot.h
index 4ba6f96b..5bfa289c 100644
--- a/src/include/utils/snapshot.h
+++ b/src/include/utils/snapshot.h
@@ -1,7 +1,7 @@
 /*
 * Tencent is pleased to support the open source community by making TBase available.
 *
- * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+ * Copyright (C) 2019 Tencent. All rights reserved.
 *
 * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below.
 *
@@ -47,7 +47,7 @@
 * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
- * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without
+ * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without
 * specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
diff --git a/src/include/utils/syscache.h b/src/include/utils/syscache.h
index 9ab96811..65a8c960 100644
--- a/src/include/utils/syscache.h
+++ b/src/include/utils/syscache.h
@@ -1,7 +1,7 @@
 /*
 * Tencent is pleased to support the open source community by making TBase available.
 *
- * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+ * Copyright (C) 2019 Tencent. All rights reserved.
 *
 * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below.
 *
@@ -47,7 +47,7 @@
 * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
- * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without
+ * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without
 * specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
diff --git a/src/include/utils/timeout.h b/src/include/utils/timeout.h
index ebf2d15f..5186eaf1 100644
--- a/src/include/utils/timeout.h
+++ b/src/include/utils/timeout.h
@@ -1,7 +1,7 @@
 /*
 * Tencent is pleased to support the open source community by making TBase available.
 *
- * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+ * Copyright (C) 2019 Tencent. All rights reserved.
 *
 * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below.
 *
@@ -47,7 +47,7 @@
 * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
- * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without
+ * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without
 * specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
diff --git a/src/include/utils/tqual.h b/src/include/utils/tqual.h
index 70c67bb7..f4db7bda 100644
--- a/src/include/utils/tqual.h
+++ b/src/include/utils/tqual.h
@@ -1,7 +1,7 @@
 /*
 * Tencent is pleased to support the open source community by making TBase available.
 *
- * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+ * Copyright (C) 2019 Tencent. All rights reserved.
 *
 * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below.
 *
@@ -47,7 +47,7 @@
 * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
- * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without
+ * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without
 * specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
diff --git a/src/include/utils/tuplestore.h b/src/include/utils/tuplestore.h
index ec59731a..d4782980 100644
--- a/src/include/utils/tuplestore.h
+++ b/src/include/utils/tuplestore.h
@@ -1,7 +1,7 @@
 /*
 * Tencent is pleased to support the open source community by making TBase available.
 *
- * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+ * Copyright (C) 2019 Tencent. All rights reserved.
 *
 * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below.
 *
@@ -47,7 +47,7 @@
 * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
- * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without
+ * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without
 * specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
diff --git a/src/interfaces/libpq/fe-connect.c b/src/interfaces/libpq/fe-connect.c
index 1b7e3fe7..1dd1c662 100644
--- a/src/interfaces/libpq/fe-connect.c
+++ b/src/interfaces/libpq/fe-connect.c
@@ -1,7 +1,7 @@
 /*
 * Tencent is pleased to support the open source community by making TBase available.
 *
- * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+ * Copyright (C) 2019 Tencent. All rights reserved.
 *
 * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below.
 *
@@ -47,7 +47,7 @@
 * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
- * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without
+ * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without
 * specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
diff --git a/src/interfaces/libpq/fe-protocol3.c b/src/interfaces/libpq/fe-protocol3.c
index 10f873e0..62d1c323 100644
--- a/src/interfaces/libpq/fe-protocol3.c
+++ b/src/interfaces/libpq/fe-protocol3.c
@@ -1,7 +1,7 @@
 /*
 * Tencent is pleased to support the open source community by making TBase available.
 *
- * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+ * Copyright (C) 2019 Tencent. All rights reserved.
 *
 * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below.
 *
@@ -47,7 +47,7 @@
 * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
- * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without
+ * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without
 * specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
diff --git a/src/interfaces/libpq/libpq-fe.h b/src/interfaces/libpq/libpq-fe.h
index e0cd17c0..ac477ed0 100644
--- a/src/interfaces/libpq/libpq-fe.h
+++ b/src/interfaces/libpq/libpq-fe.h
@@ -1,7 +1,7 @@
 /*
 * Tencent is pleased to support the open source community by making TBase available.
 *
- * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+ * Copyright (C) 2019 Tencent. All rights reserved.
 *
 * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below.
 *
@@ -47,7 +47,7 @@
 * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
- * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without
+ * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without
 * specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
diff --git a/src/interfaces/libpq/libpq-int.h b/src/interfaces/libpq/libpq-int.h
index ad150bf7..a2f3532d 100644
--- a/src/interfaces/libpq/libpq-int.h
+++ b/src/interfaces/libpq/libpq-int.h
@@ -1,7 +1,7 @@
 /*
 * Tencent is pleased to support the open source community by making TBase available.
 *
- * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+ * Copyright (C) 2019 Tencent. All rights reserved.
 *
 * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below.
 *
@@ -47,7 +47,7 @@
 * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
- * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without
+ * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without
 * specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
diff --git a/src/pl/plperl/plperl_helpers.h b/src/pl/plperl/plperl_helpers.h
index 020538f9..591c1c01 100644
--- a/src/pl/plperl/plperl_helpers.h
+++ b/src/pl/plperl/plperl_helpers.h
@@ -1,7 +1,7 @@
 /*
 * Tencent is pleased to support the open source community by making TBase available.
 *
- * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+ * Copyright (C) 2019 Tencent. All rights reserved.
 *
 * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below.
 *
@@ -47,7 +47,7 @@
 * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
- * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without
+ * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without
 * specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
diff --git a/src/test/isolation/isolation_test.conf b/src/test/isolation/isolation_test.conf
index 49eaa259..89ea1281 100644
--- a/src/test/isolation/isolation_test.conf
+++ b/src/test/isolation/isolation_test.conf
@@ -1,7 +1,7 @@
 /*
 * Tencent is pleased to support the open source community by making TBase available.
 *
- * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+ * Copyright (C) 2019 Tencent. All rights reserved.
 *
 * TBase is licensed under the BSD 3-Clause License, except for the third-party component listed below.
 *
@@ -47,7 +47,7 @@
 * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
- * 3. Neither the name of THL A29 Limited nor the names of its contributors may be used to endorse or promote products derived from this software without
+ * 3. Neither the name of Tencent nor the names of its contributors may be used to endorse or promote products derived from this software without
 * specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
@@ -64,17 +64,3 @@ diskey3=3
 connect:dbname=postgres hostaddr=172.0.0.1 port=30001 user=qguo
 connect:dbname=postgres hostaddr=172.0.0.1 port=30002 user=qguo
 connect:dbname=postgres hostaddr=172.0.0.1 port=30003 user=qguo
-
-
-
-
-
-
-
-
-
-
-
-
-
-